Add a Rally track for OpenAI vector benchmarking. Based on the cohere_vector track.
Showing 12 changed files with 487 additions and 0 deletions.
## OpenAI vector track

This track benchmarks the [NQ dataset](https://huggingface.co/datasets/BeIR/nq) enriched with embeddings generated using OpenAI's [`text-embedding-ada-002` model](https://openai.com/blog/new-and-improved-embedding-model).
### Generating the document dataset

To rebuild the document dataset:

1. Install the Python dependencies listed in `_tools/requirements.txt`
2. Download the raw corpus dataset [from here](https://rally-tracks.elastic.co/openai_vector/raw_data/corpus/nq_openai-text-embedding-ada-002_corpus_dataset.arrow)
3. Run `./_tools/parse_documents.py <raw_corpus_dataset_path>`

This will build the document dataset files in the `openai-documents` directory.
### Example Document

```json
{
  "docid": "doc0",
  "title": "Minority interest",
  "text": "In accounting, minority interest (or non-controlling interest) is the portion of a subsidiary corporation's stock that is not owned by the parent corporation. The magnitude of the minority interest in the subsidiary company is generally less than 50% of outstanding shares, or the corporation would generally cease to be a subsidiary of the parent.[1]",
  "emb": [-0.01128644309937954, -0.02616020105779171, 0.012801663018763065, ...]
}
```

The `emb` field holds the document's 1536-dimensional `text-embedding-ada-002` embedding (truncated above).
### Generating the queries

To rebuild the `queries.json.bz2` file:

1. Install the Python dependencies listed in `_tools/requirements.txt`
2. Download the raw queries dataset [from here](https://rally-tracks.elastic.co/openai_vector/raw_data/queries/nq_openai-text-embedding-ada-002_queries_dataset.arrow)
3. Run `./_tools/parse_queries.py <raw_queries_dataset_path>`
### Parameters

This track accepts the following parameters with Rally 0.8.0+ using `--track-params` (an example invocation follows the list):

- `initial_indexing_bulk_size` (default: 500)
- `initial_indexing_bulk_warmup` (default: 40)
- `initial_indexing_bulk_indexing_clients` (default: 5)
- `initial_indexing_ingest_percentage` (default: 100)
- `parallel_indexing_bulk_size` (default: 500)
- `parallel_indexing_bulk_clients` (default: 1)
- `parallel_indexing_ingest_percentage` (default: 100)
- `parallel_indexing_time_period` (default: 1800)
- `parallel_indexing_bulk_target_throughput` (default: 1)
- `parallel_indexing_search_clients` (default: 3)
- `parallel_indexing_search_target_throughput` (default: 100)
- `post_ingest_sleep` (default: false): whether to pause after ingest and prior to subsequent operations
- `post_ingest_sleep_duration` (default: 30): sleep duration in seconds
- `standalone_search_clients` (default: 8)
- `standalone_search_iterations` (default: 10000)
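For example, overriding a few defaults on the command line (an illustrative invocation; the parameter values are arbitrary and connection flags such as target hosts are omitted):

```sh
esrally race --track=openai_vector \
  --track-params="initial_indexing_bulk_size:1000,standalone_search_clients:4"
```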
### License

We use the same license for the data as the original data: [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
`_tools/parse_documents.py`:
```python
#!/usr/bin/env python3
# Splits the raw NQ corpus (an Arrow IPC stream) into the bulk-indexing
# corpus files used by this track.
import bz2
import json
import os
import sys

import pyarrow as pa

OUTPUT_DIR: str = "openai-documents"
INITIAL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-initial-indexing.json.bz2"
PARALLEL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-parallel-indexing.json.bz2"
DEFAULT_MAX_INITIAL_INDEXING_DOCS: int = -1
DEFAULT_MAX_PARALLEL_INDEXING_DOCS: int = 100_000
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_documents(input_file_path: str, max_initial_indexing_docs: int, max_parallel_indexing_docs: int):
    if max_parallel_indexing_docs < 0:
        raise ValueError("max_parallel_indexing_docs must be >= 0")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pa.memory_map(input_file_path, "rb") as source:
        doc_table = pa.ipc.open_stream(source).read_all()

    if max_initial_indexing_docs < 0:
        # Create as many initial indexing docs as possible while still meeting parallel indexing docs requirements
        initial_indexing_docs = max(0, doc_table.num_rows - max_parallel_indexing_docs)
    else:
        initial_indexing_docs = min(doc_table.num_rows, max_initial_indexing_docs)

    parallel_indexing_docs = min(doc_table.num_rows - initial_indexing_docs, max_parallel_indexing_docs)

    parse_documents(doc_table, initial_indexing_docs, 0, INITIAL_INDEXING_DOCS_FILENAME)
    parse_documents(doc_table, parallel_indexing_docs, initial_indexing_docs, PARALLEL_INDEXING_DOCS_FILENAME)


def parse_documents(doc_table: pa.Table, doc_count: int, table_offset: int, output_filename: str):
    output_file_path = os.path.join(OUTPUT_DIR, output_filename)
    print(f"Writing {doc_count} documents to {output_file_path}")

    with bz2.open(output_file_path, "wt") as output_file:
        if doc_count <= 0:
            # Return here so we always create the output file
            return

        doc_table_sliced = doc_table.slice(offset=table_offset, length=doc_count)

        docs_written = 0
        progress_bar(docs_written, doc_count)

        # Emit one JSON document per line (the bulk-ingest format Rally expects)
        for record_batch in doc_table_sliced.to_batches(max_chunksize=PROGRESS_EVERY):
            docid_col = record_batch.column("_id")
            title_col = record_batch.column("title")
            text_col = record_batch.column("text")
            emb_col = record_batch.column("embedding")
            for docid, title, text, emb in zip(docid_col, title_col, text_col, emb_col):
                output_file.write(
                    json.dumps(
                        {"docid": docid.as_py(), "title": title.as_py(), "text": text.as_py(), "emb": emb.as_py()}, ensure_ascii=True
                    )
                )
                output_file.write("\n")

            docs_written += record_batch.num_rows
            progress_bar(docs_written, doc_count)

    # Print newline so that progress bar is not overwritten by next print statement
    print()


def parse_arguments():
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input_file_path> [<max_initial_indexing_docs> <max_parallel_indexing_docs>]")
        exit(1)

    if len(sys.argv) == 2:
        return (sys.argv[1], DEFAULT_MAX_INITIAL_INDEXING_DOCS, DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) == 3:
        return (sys.argv[1], int(sys.argv[2]), DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    else:
        return (sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))


if __name__ == "__main__":
    input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs = parse_arguments()
    output_documents(input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs)
```
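For example, to cap the initial-indexing corpus at 1,000,000 documents while keeping the default 100,000 parallel-indexing documents (an illustrative invocation; the input filename matches the README download):

```sh
./_tools/parse_documents.py nq_openai-text-embedding-ada-002_corpus_dataset.arrow 1000000 100000
```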
`_tools/parse_queries.py`:
```python
#!/usr/bin/env python3
# Extracts the query embeddings from the raw NQ queries dataset
# (an Arrow IPC stream) into queries.json.bz2.
import bz2
import json
import sys
import typing

import pyarrow as pa

BATCH_SIZE: int = 1000
QUERY_COLUMN: str = "embedding"
OUTPUT_FILENAME: str = "queries.json.bz2"


def output_queries(input_filename: str, queries_file: typing.TextIO):
    with pa.memory_map(input_filename, "rb") as source:
        query_table = pa.ipc.open_stream(source).read_all()
        for record_batch in query_table.to_batches(max_chunksize=BATCH_SIZE):
            query_list = record_batch.column(QUERY_COLUMN)
            for query in query_list:
                queries_file.write(json.dumps(query.as_py()))
                queries_file.write("\n")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} <input_file_path>".format(sys.argv[0]))
        exit(1)

    input_filename = sys.argv[1]

    with bz2.open(OUTPUT_FILENAME, "wt") as queries_file:
        output_queries(input_filename, queries_file)
```
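Each line of the resulting `queries.json.bz2` is a single query embedding serialized as a bare JSON array, along the lines of (truncated, illustrative values):

```json
[-0.0112, 0.0261, 0.0128]
```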
`_tools/requirements.txt`:
```
pyarrow
```
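A sketch of installing the dependencies (standard pip usage):

```sh
python3 -m pip install -r _tools/requirements.txt
```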
The `index-and-search` challenge definition (a Jinja2 template). The schedule deletes and recreates the index, bulk-indexes the initial corpus, refreshes and waits for merges to finish, optionally sleeps, runs standalone kNN searches at two `k`/`num_candidates` settings, and finishes with bulk indexing and kNN search running in parallel:
```json
{
  "name": "index-and-search",
  "description": "",
  "default": true,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-index"
      }
    },
    {
      "name": "create-index",
      "operation": "create-index"
    },
    {
      "name": "check-cluster-health",
      "operation": "check-cluster-health"
    },
    {
      "name": "initial-documents-indexing",
      "operation": "initial-documents-indexing",
      "warmup-time-period": {{ initial_indexing_bulk_warmup | default(40) | int }},
      "clients": {{ initial_indexing_bulk_indexing_clients | default(5) | int }}
    },
    {
      "name": "refresh-after-index",
      "operation": {
        "operation-type": "refresh",
        "request-timeout": 1000,
        "include-in-reporting": true
      }
    },
    {
      "name": "wait-until-merges-finish-after-index",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    },
    {# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
    {
      "name": "post-ingest-sleep",
      "operation": {
        "operation-type": "sleep",
        "duration": {{ post_ingest_sleep_duration | default(30) }}
      }
    },
    {%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
    {
      "name": "standalone-search-knn-10-100-single-client",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-knn-search-100-1000-single-client",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-search-knn-10-100-multiple-clients",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-search-knn-100-1000-multiple-clients",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "parallel": {
        "tasks": [
          {
            "name": "parallel-documents-indexing-bulk",
            "operation": "parallel-documents-indexing",
            "clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
            "target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
          },
          {
            "name": "parallel-documents-indexing-search-knn-10-100",
            "operation": "knn-search-10-100",
            "clients": {{ parallel_indexing_search_clients | default(3) | int }},
            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
            "target-throughput": {{ parallel_indexing_search_target_throughput | default(100) | int }}
          }
        ]
      }
    }
  ]
}
```
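By default the parallel phase throttles bulk indexing to one bulk request per second and kNN search to 100 requests per second (Rally's `target-throughput` is expressed in operations per second), so the reported search latencies reflect a fixed background-indexing load.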
A list of the generated corpus data files, including the `-1k` test-mode variants (likely a `.gitignore` for locally built data):
```
open_ai_corpus-initial-indexing.json.bz2
open_ai_corpus-initial-indexing-1k.json.bz2
open_ai_corpus-parallel-indexing.json.bz2
open_ai_corpus-parallel-indexing-1k.json.bz2
```
Index mapping for a vector-only index, with dynamic mappings and `_source` disabled so that only the `emb` vector field is indexed:
```json
{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": ["vec", "vex", "vem"],
    {% endif %}
    "index.number_of_shards": {{ number_of_shards | default(1) }},
    "index.number_of_replicas": {{ number_of_replicas | default(0) }}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "dynamic": false,
    "_source": {
      "enabled": false
    },
    "properties": {
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
```
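A note on the mapping: Elasticsearch's `dot_product` similarity requires vectors normalized to unit length. `text-embedding-ada-002` embeddings are normalized, so `dot_product` is safe here and avoids the extra normalization work that `cosine` would do at search time.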
Index mapping for the full documents, which additionally indexes the `docid`, `title`, and `text` fields:
```json
{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": ["vec", "vex", "vem"],
    {% endif %}
    "index.number_of_shards": {{ number_of_shards | default(1) }},
    "index.number_of_replicas": {{ number_of_replicas | default(0) }}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "properties": {
      "docid": {
        "type": "keyword"
      },
      "title": {
        "type": "text"
      },
      "text": {
        "type": "text"
      },
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
```
The operation definitions referenced by the challenge schedule:
```json
{
  "name": "create-index",
  "operation-type": "create-index",
  "settings": {{ index_settings | default({}) | tojson }}
},
{
  "name": "check-cluster-health",
  "operation-type": "cluster-health",
  "request-params": {
    "wait_for_status": "green"
  },
  "retry-until-success": true
},
{
  "name": "initial-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-initial-indexing",
  "bulk-size": {{ initial_indexing_bulk_size | default(500) }},
  "ingest-percentage": {{ initial_indexing_ingest_percentage | default(100) }}
},
{
  "name": "parallel-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-parallel-indexing",
  "bulk-size": {{ parallel_indexing_bulk_size | default(500) }},
  "ingest-percentage": {{ parallel_indexing_ingest_percentage | default(100) }}
},
{
  "name": "knn-search-10-100",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 10,
  "num-candidates": 100
},
{
  "name": "knn-search-100-1000",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 100,
  "num-candidates": 1000
}
```
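The `knn-param-source` is custom parameter-source code registered by the track (not included in this excerpt). Assuming it issues standard Elasticsearch kNN searches against the `emb` field, each `knn-search-10-100` request body would look roughly like this (query vector truncated, values illustrative):

```json
{
  "knn": {
    "field": "emb",
    "query_vector": [-0.0112, 0.0261, 0.0128],
    "k": 10,
    "num_candidates": 100
  }
}
```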