OpenAI Vector Rally Track (#517)
Add a Rally track for OpenAI vector benchmarking, based on the cohere_vector track.
Mikep86 authored Nov 20, 2023
1 parent df4f214 commit 99e9115
Showing 12 changed files with 487 additions and 0 deletions.
56 changes: 56 additions & 0 deletions openai_vector/README.md
@@ -0,0 +1,56 @@
## OpenAI vector track

This track benchmarks the [NQ dataset](https://huggingface.co/datasets/BeIR/nq) enriched with embeddings generated using OpenAI's [`text-embedding-ada-002` model](https://openai.com/blog/new-and-improved-embedding-model).

### Generating the document dataset

To rebuild the document dataset:

1. Install Python dependencies listed in `_tools/requirements.txt`
2. Download the raw corpus dataset [from here](https://rally-tracks.elastic.co/openai_vector/raw_data/corpus/nq_openai-text-embedding-ada-002_corpus_dataset.arrow)
3. Run `./_tools/parse_documents.py <raw_corpus_dataset_path>`

This will build the document dataset files in the `openai-documents` directory.
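
The raw corpus is an Arrow streaming file; a quick way to sanity-check it before parsing is with `pyarrow` (a minimal sketch, assuming the downloaded file sits in the current directory — `parse_documents.py` below reads the `_id`, `title`, `text`, and `embedding` columns):

```python
import pyarrow as pa

# Inspect the raw corpus dataset before parsing (Arrow streaming format,
# read the same way as in _tools/parse_documents.py).
with pa.memory_map("nq_openai-text-embedding-ada-002_corpus_dataset.arrow", "rb") as source:
    table = pa.ipc.open_stream(source).read_all()

print(table.num_rows)      # total documents available
print(table.schema.names)  # expect: _id, title, text, embedding
```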

### Example Document

```json
{
  "docid": "doc0",
  "title": "Minority interest",
  "text": "In accounting, minority interest (or non-controlling interest) is the portion of a subsidiary corporation's stock that is not owned by the parent corporation. The magnitude of the minority interest in the subsidiary company is generally less than 50% of outstanding shares, or the corporation would generally cease to be a subsidiary of the parent.[1]",
  "emb": [-0.01128644309937954, -0.02616020105779171, 0.012801663018763065, ...]
}
```
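
Each line of the generated `.json.bz2` files holds one such document; the `emb` array above is truncated, while the real field carries the model's full 1536-dimension embedding (matching `dims: 1536` in the index mappings below).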

### Generating the queries

To rebuild the `queries.json.bz2` file:

1. Install Python dependencies listed in `_tools/requirements.txt`
2. Download the raw queries dataset [from here](https://rally-tracks.elastic.co/openai_vector/raw_data/queries/nq_openai-text-embedding-ada-002_queries_dataset.arrow)
3. Run `./_tools/parse_queries.py <raw_queries_dataset_path>`
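
This writes `queries.json.bz2`, in which each line is a single query embedding serialized as a bare JSON array. A minimal sketch to sanity-check the output (assuming the file is in the current directory):

```python
import bz2
import json

# Each line of queries.json.bz2 is one query embedding as a JSON array.
with bz2.open("queries.json.bz2", "rt") as f:
    first_query = json.loads(f.readline())

print(len(first_query))  # expect 1536 dimensions for text-embedding-ada-002
```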

### Parameters

With Rally 0.8.0+, this track accepts the following parameters via `--track-params`:

- `initial_indexing_bulk_size` (default: 500)
- `initial_indexing_bulk_warmup` (default: 40)
- `initial_indexing_bulk_indexing_clients` (default: 5)
- `initial_indexing_ingest_percentage` (default: 100)
- `parallel_indexing_bulk_size` (default: 500)
- `parallel_indexing_bulk_clients` (default: 1)
- `parallel_indexing_ingest_percentage` (default: 100)
- `parallel_indexing_time_period` (default: 1800)
- `parallel_indexing_bulk_target_throughput` (default: 1)
- `parallel_indexing_search_clients` (default: 3)
- `parallel_indexing_search_target_throughput` (default: 100)
- `post_ingest_sleep` (default: false): Whether to pause after ingest and prior to subsequent operations.
- `post_ingest_sleep_duration` (default: 30): Sleep duration in seconds.
- `standalone_search_clients` (default: 8)
- `standalone_search_iterations` (default: 10000)
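
For example, to override a couple of these defaults on the command line (illustrative invocation; adjust the track name and connection options to your setup): `esrally race --track=openai_vector --track-params="initial_indexing_bulk_size:1000,standalone_search_iterations:1000"`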

### License

We use the same license for the data as the original data: [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
95 changes: 95 additions & 0 deletions openai_vector/_tools/parse_documents.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import bz2
import json
import os
import sys

import pyarrow as pa

OUTPUT_DIR: str = "openai-documents"
INITIAL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-initial-indexing.json.bz2"
PARALLEL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-parallel-indexing.json.bz2"
DEFAULT_MAX_INITIAL_INDEXING_DOCS: int = -1
DEFAULT_MAX_PARALLEL_INDEXING_DOCS: int = 100_000
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_documents(input_file_path: str, max_initial_indexing_docs: int, max_parallel_indexing_docs: int):
    if max_parallel_indexing_docs < 0:
        raise ValueError("max_parallel_indexing_docs must be >= 0")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pa.memory_map(input_file_path, "rb") as source:
        doc_table = pa.ipc.open_stream(source).read_all()

        if max_initial_indexing_docs < 0:
            # Create as many initial indexing docs as possible while still meeting parallel indexing docs requirements
            initial_indexing_docs = max(0, doc_table.num_rows - max_parallel_indexing_docs)
        else:
            initial_indexing_docs = min(doc_table.num_rows, max_initial_indexing_docs)

        parallel_indexing_docs = min(doc_table.num_rows - initial_indexing_docs, max_parallel_indexing_docs)

        parse_documents(doc_table, initial_indexing_docs, 0, INITIAL_INDEXING_DOCS_FILENAME)
        parse_documents(doc_table, parallel_indexing_docs, initial_indexing_docs, PARALLEL_INDEXING_DOCS_FILENAME)


def parse_documents(doc_table: pa.Table, doc_count: int, table_offset: int, output_filename: str):
    output_file_path = os.path.join(OUTPUT_DIR, output_filename)
    print(f"Writing {doc_count} documents to {output_file_path}")

    with bz2.open(output_file_path, "wt") as output_file:
        if doc_count <= 0:
            # Return here so we always create the output file
            return

        doc_table_sliced = doc_table.slice(offset=table_offset, length=doc_count)

        docs_written = 0
        progress_bar(docs_written, doc_count)

        for record_batch in doc_table_sliced.to_batches(max_chunksize=PROGRESS_EVERY):
            docid_col = record_batch.column("_id")
            title_col = record_batch.column("title")
            text_col = record_batch.column("text")
            emb_col = record_batch.column("embedding")
            for docid, title, text, emb in zip(docid_col, title_col, text_col, emb_col):
                output_file.write(
                    json.dumps(
                        {"docid": docid.as_py(), "title": title.as_py(), "text": text.as_py(), "emb": emb.as_py()}, ensure_ascii=True
                    )
                )
                output_file.write("\n")

            docs_written += record_batch.num_rows
            progress_bar(docs_written, doc_count)

    # Print newline so that progress bar is not overwritten by next print statement
    print()


def parse_arguments():
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input_file_path> [<max_initial_indexing_docs> <max_parallel_indexing_docs>]")
        exit(1)

    if len(sys.argv) == 2:
        return (sys.argv[1], DEFAULT_MAX_INITIAL_INDEXING_DOCS, DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) == 3:
        return (sys.argv[1], int(sys.argv[2]), DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) >= 4:
        return (sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))


if __name__ == "__main__":
    input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs = parse_arguments()
    output_documents(input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs)
32 changes: 32 additions & 0 deletions openai_vector/_tools/parse_queries.py
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
import bz2
import json
import sys
import typing

import pyarrow as pa

BATCH_SIZE: int = 1000
QUERY_COLUMN: str = "embedding"
OUTPUT_FILENAME: str = "queries.json.bz2"


def output_queries(input_filename: str, queries_file: typing.TextIO):
    with pa.memory_map(input_filename, "rb") as source:
        query_table = pa.ipc.open_stream(source).read_all()
        for record_batch in query_table.to_batches(max_chunksize=BATCH_SIZE):
            query_list = record_batch.column(QUERY_COLUMN)
            for query in query_list:
                queries_file.write(json.dumps(query.as_py()))
                queries_file.write("\n")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} <input_file_path>".format(sys.argv[0]))
        exit(1)

    input_filename = sys.argv[1]

    with bz2.open(OUTPUT_FILENAME, "wt") as queries_file:
        output_queries(input_filename, queries_file)
1 change: 1 addition & 0 deletions openai_vector/_tools/requirements.txt
@@ -0,0 +1 @@
pyarrow
102 changes: 102 additions & 0 deletions openai_vector/challenges/default.json
@@ -0,0 +1,102 @@
{
  "name": "index-and-search",
  "description": "",
  "default": true,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-index"
      }
    },
    {
      "name": "create-index",
      "operation": "create-index"
    },
    {
      "name": "check-cluster-health",
      "operation": "check-cluster-health"
    },
    {
      "name": "initial-documents-indexing",
      "operation": "initial-documents-indexing",
      "warmup-time-period": {{ initial_indexing_bulk_warmup | default(40) | int }},
      "clients": {{ initial_indexing_bulk_indexing_clients | default(5) | int }}
    },
    {
      "name": "refresh-after-index",
      "operation": {
        "operation-type": "refresh",
        "request-timeout": 1000,
        "include-in-reporting": true
      }
    },
    {
      "name": "wait-until-merges-finish-after-index",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    },
    {# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
    {
      "name": "post-ingest-sleep",
      "operation": {
        "operation-type": "sleep",
        "duration": {{ post_ingest_sleep_duration|default(30) }}
      }
    },
    {%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
    {
      "name": "standalone-search-knn-10-100-single-client",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-knn-search-100-1000-single-client",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-search-knn-10-100-multiple-clients",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-search-knn-100-1000-multiple-clients",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "parallel": {
        "tasks": [
          {
            "name": "parallel-documents-indexing-bulk",
            "operation": "parallel-documents-indexing",
            "clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
            "target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
          },
          {
            "name": "parallel-documents-indexing-search-knn-10-100",
            "operation": "knn-search-10-100",
            "clients": {{ parallel_indexing_search_clients | default(3) | int }},
            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
            "target-throughput": {{ parallel_indexing_search_target_throughput | default(100) | int }}
          }
        ]
      }
    }
  ]
}
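
In outline, the default challenge recreates the index, bulk-loads the initial corpus, refreshes and waits for merges to finish (optionally sleeping afterwards), runs standalone kNN searches with single and multiple clients at k=10/num_candidates=100 and k=100/num_candidates=1000, and finally exercises bulk indexing and kNN search in parallel.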
4 changes: 4 additions & 0 deletions openai_vector/files.txt
@@ -0,0 +1,4 @@
open_ai_corpus-initial-indexing.json.bz2
open_ai_corpus-initial-indexing-1k.json.bz2
open_ai_corpus-parallel-indexing.json.bz2
open_ai_corpus-parallel-indexing-1k.json.bz2
26 changes: 26 additions & 0 deletions openai_vector/index-vectors-only-mapping.json
@@ -0,0 +1,26 @@
{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": ["vec", "vex", "vem"],
    {% endif %}
    "index.number_of_shards": {{number_of_shards | default(1)}},
    "index.number_of_replicas": {{number_of_replicas | default(0)}}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "dynamic": false,
    "_source": {
      "enabled": false
    },
    "properties": {
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
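
The mapping uses `dot_product` similarity, which Elasticsearch only permits for unit-length vectors; `text-embedding-ada-002` embeddings are already normalized, so dot product matches cosine similarity while being cheaper to compute. A minimal sketch of a check one could run over parsed documents (hypothetical helper, not part of the track):

```python
import math

# dot_product similarity requires unit-length vectors; ada-002 embeddings
# already satisfy this, so no re-normalization is needed before indexing.
def is_unit_length(emb: list[float], tol: float = 1e-3) -> bool:
    return math.isclose(math.sqrt(sum(x * x for x in emb)), 1.0, abs_tol=tol)
```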
31 changes: 31 additions & 0 deletions openai_vector/index-vectors-with-text-mapping.json
@@ -0,0 +1,31 @@
{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": ["vec", "vex", "vem"],
    {% endif %}
    "index.number_of_shards": {{number_of_shards | default(1)}},
    "index.number_of_replicas": {{number_of_replicas | default(0)}}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "properties": {
      "docid": {
        "type": "keyword"
      },
      "title": {
        "type": "text"
      },
      "text": {
        "type": "text"
      },
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
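
Compared with `index-vectors-only-mapping.json`, this mapping leaves `_source` and dynamic mapping enabled and indexes the `docid`, `title`, and `text` fields alongside the vector, trading index size for the ability to retrieve and search the text.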
41 changes: 41 additions & 0 deletions openai_vector/operations/default.json
@@ -0,0 +1,41 @@
{
  "name": "create-index",
  "operation-type": "create-index",
  "settings": {{index_settings | default({}) | tojson}}
},
{
  "name": "check-cluster-health",
  "operation-type": "cluster-health",
  "request-params": {
    "wait_for_status": "green"
  },
  "retry-until-success": true
},
{
  "name": "initial-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-initial-indexing",
  "bulk-size": {{initial_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{initial_indexing_ingest_percentage | default(100)}}
},
{
  "name": "parallel-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-parallel-indexing",
  "bulk-size": {{parallel_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{parallel_indexing_ingest_percentage | default(100)}}
},
{
  "name": "knn-search-10-100",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 10,
  "num-candidates": 100
},
{
  "name": "knn-search-100-1000",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 100,
  "num-candidates": 1000
}
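
The two kNN operations draw query vectors through the track's `knn-param-source`. A hedged sketch of the request shape they issue, using the Elasticsearch Python client (the index name and query vector are illustrative placeholders, not the track's actual values):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local cluster

# Roughly what knn-search-10-100 runs: a kNN search over the `emb` field
# with k=10 and num_candidates=100; real query vectors come from queries.json.bz2.
query_vector = [0.0] * 1536  # placeholder; a real query is an ada-002 embedding

resp = es.search(
    index="openai_vector",  # hypothetical index name
    knn={
        "field": "emb",
        "query_vector": query_vector,
        "k": 10,
        "num_candidates": 100,
    },
)
print(resp["hits"]["hits"])
```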
Binary file added openai_vector/queries.json.bz2