OpenAI Vector Rally Track (#517)
Add a Rally track for OpenAI vector benchmarking, based on the cohere_vector track.
Mikep86 authored Nov 20, 2023
1 parent df4f214 commit 99e9115
Showing 12 changed files with 487 additions and 0 deletions.
56 changes: 56 additions & 0 deletions openai_vector/README.md
@@ -0,0 +1,56 @@
## OpenAI vector track

This track benchmarks the [NQ dataset](https://huggingface.co/datasets/BeIR/nq) enriched with embeddings generated using OpenAI's [`text-embedding-ada-002` model](https://openai.com/blog/new-and-improved-embedding-model).

### Generating the document dataset

To rebuild the document dataset:

1. Install Python dependencies listed in `_tools/requirements.txt`
2. Download the raw corpus dataset [from here](https://rally-tracks.elastic.co/openai_vector/raw_data/corpus/nq_openai-text-embedding-ada-002_corpus_dataset.arrow)
3. Run `./_tools/parse_documents.py <raw_corpus_dataset_path>`

This will build the document dataset files in the `openai-documents` directory.
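
The raw corpus is an Arrow streaming file; a quick way to sanity-check it before parsing is with `pyarrow` (a minimal sketch, assuming the downloaded file sits in the current directory — `parse_documents.py` below reads the `_id`, `title`, `text`, and `embedding` columns):

```python
import pyarrow as pa

# Inspect the raw corpus dataset before parsing (Arrow streaming format,
# read the same way as in _tools/parse_documents.py).
with pa.memory_map("nq_openai-text-embedding-ada-002_corpus_dataset.arrow", "rb") as source:
    table = pa.ipc.open_stream(source).read_all()

print(table.num_rows)      # total documents available
print(table.schema.names)  # expect: _id, title, text, embedding
```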

### Example Document

```json
{
  "docid": "doc0",
  "title": "Minority interest",
  "text": "In accounting, minority interest (or non-controlling interest) is the portion of a subsidiary corporation's stock that is not owned by the parent corporation. The magnitude of the minority interest in the subsidiary company is generally less than 50% of outstanding shares, or the corporation would generally cease to be a subsidiary of the parent.[1]",
  "emb": [-0.01128644309937954, -0.02616020105779171, 0.012801663018763065, ...]
}
```
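
Each line of the generated `.json.bz2` files holds one such document; the `emb` array above is truncated, while the real field carries the model's full 1536-dimension embedding (matching `dims: 1536` in the index mappings below).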

### Generating the queries

To rebuild the `queries.json.bz2` file:

1. Install Python dependencies listed in `_tools/requirements.txt`
2. Download the raw queries dataset [from here](https://rally-tracks.elastic.co/openai_vector/raw_data/queries/nq_openai-text-embedding-ada-002_queries_dataset.arrow)
3. Run `./_tools/parse_queries.py <raw_queries_dataset_path>`
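
This writes `queries.json.bz2`, in which each line is a single query embedding serialized as a bare JSON array. A minimal sketch to sanity-check the output (assuming the file is in the current directory):

```python
import bz2
import json

# Each line of queries.json.bz2 is one query embedding as a JSON array.
with bz2.open("queries.json.bz2", "rt") as f:
    first_query = json.loads(f.readline())

print(len(first_query))  # expect 1536 dimensions for text-embedding-ada-002
```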

### Parameters

With Rally 0.8.0+, this track accepts the following parameters via `--track-params`:

- `initial_indexing_bulk_size` (default: 500)
- `initial_indexing_bulk_warmup` (default: 40)
- `initial_indexing_bulk_indexing_clients` (default: 5)
- `initial_indexing_ingest_percentage` (default: 100)
- `parallel_indexing_bulk_size` (default: 500)
- `parallel_indexing_bulk_clients` (default: 1)
- `parallel_indexing_ingest_percentage` (default: 100)
- `parallel_indexing_time_period` (default: 1800)
- `parallel_indexing_bulk_target_throughput` (default: 1)
- `parallel_indexing_search_clients` (default: 3)
- `parallel_indexing_search_target_throughput` (default: 100)
- `post_ingest_sleep` (default: false): Whether to pause after ingest and prior to subsequent operations.
- `post_ingest_sleep_duration` (default: 30): Sleep duration in seconds.
- `standalone_search_clients` (default: 8)
- `standalone_search_iterations` (default: 10000)
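
For example, to override a couple of these defaults on the command line (illustrative invocation; adjust the track name and connection options to your setup): `esrally race --track=openai_vector --track-params="initial_indexing_bulk_size:1000,standalone_search_iterations:1000"`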

### License

We use the same license for the data as the original data: [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
95 changes: 95 additions & 0 deletions openai_vector/_tools/parse_documents.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import bz2
import json
import os
import sys

import pyarrow as pa

OUTPUT_DIR: str = "openai-documents"
INITIAL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-initial-indexing.json.bz2"
PARALLEL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-parallel-indexing.json.bz2"
DEFAULT_MAX_INITIAL_INDEXING_DOCS: int = -1
DEFAULT_MAX_PARALLEL_INDEXING_DOCS: int = 100_000
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_documents(input_file_path: str, max_initial_indexing_docs: int, max_parallel_indexing_docs: int):
    if max_parallel_indexing_docs < 0:
        raise ValueError("max_parallel_indexing_docs must be >= 0")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pa.memory_map(input_file_path, "rb") as source:
        doc_table = pa.ipc.open_stream(source).read_all()

        if max_initial_indexing_docs < 0:
            # Create as many initial indexing docs as possible while still meeting parallel indexing docs requirements
            initial_indexing_docs = max(0, doc_table.num_rows - max_parallel_indexing_docs)
        else:
            initial_indexing_docs = min(doc_table.num_rows, max_initial_indexing_docs)

        parallel_indexing_docs = min(doc_table.num_rows - initial_indexing_docs, max_parallel_indexing_docs)

        parse_documents(doc_table, initial_indexing_docs, 0, INITIAL_INDEXING_DOCS_FILENAME)
        parse_documents(doc_table, parallel_indexing_docs, initial_indexing_docs, PARALLEL_INDEXING_DOCS_FILENAME)


def parse_documents(doc_table: pa.Table, doc_count: int, table_offset: int, output_filename: str):
    output_file_path = os.path.join(OUTPUT_DIR, output_filename)
    print(f"Writing {doc_count} documents to {output_file_path}")

    with bz2.open(output_file_path, "wt") as output_file:
        if doc_count <= 0:
            # Return here so we always create the output file
            return

        doc_table_sliced = doc_table.slice(offset=table_offset, length=doc_count)

        docs_written = 0
        progress_bar(docs_written, doc_count)

        for record_batch in doc_table_sliced.to_batches(max_chunksize=PROGRESS_EVERY):
            docid_col = record_batch.column("_id")
            title_col = record_batch.column("title")
            text_col = record_batch.column("text")
            emb_col = record_batch.column("embedding")
            for docid, title, text, emb in zip(docid_col, title_col, text_col, emb_col):
                output_file.write(
                    json.dumps(
                        {"docid": docid.as_py(), "title": title.as_py(), "text": text.as_py(), "emb": emb.as_py()}, ensure_ascii=True
                    )
                )
                output_file.write("\n")

            docs_written += record_batch.num_rows
            progress_bar(docs_written, doc_count)

    # Print newline so that progress bar is not overwritten by next print statement
    print()


def parse_arguments():
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input_file_path> [<max_initial_indexing_docs> <max_parallel_indexing_docs>]")
        exit(1)

    if len(sys.argv) == 2:
        return (sys.argv[1], DEFAULT_MAX_INITIAL_INDEXING_DOCS, DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) == 3:
        return (sys.argv[1], int(sys.argv[2]), DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) >= 4:
        return (sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))


if __name__ == "__main__":
    input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs = parse_arguments()
    output_documents(input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs)
32 changes: 32 additions & 0 deletions openai_vector/_tools/parse_queries.py
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
import bz2
import json
import sys
import typing

import pyarrow as pa

BATCH_SIZE: int = 1000
QUERY_COLUMN: str = "embedding"
OUTPUT_FILENAME: str = "queries.json.bz2"


def output_queries(input_filename: str, queries_file: typing.TextIO):
    with pa.memory_map(input_filename, "rb") as source:
        query_table = pa.ipc.open_stream(source).read_all()
        for record_batch in query_table.to_batches(max_chunksize=BATCH_SIZE):
            query_list = record_batch.column(QUERY_COLUMN)
            for query in query_list:
                queries_file.write(json.dumps(query.as_py()))
                queries_file.write("\n")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} <input_file_path>".format(sys.argv[0]))
        exit(1)

    input_filename = sys.argv[1]

    with bz2.open(OUTPUT_FILENAME, "wt") as queries_file:
        output_queries(input_filename, queries_file)
1 change: 1 addition & 0 deletions openai_vector/_tools/requirements.txt
@@ -0,0 +1 @@
pyarrow
102 changes: 102 additions & 0 deletions openai_vector/challenges/default.json
@@ -0,0 +1,102 @@
{
  "name": "index-and-search",
  "description": "",
  "default": true,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-index"
      }
    },
    {
      "name": "create-index",
      "operation": "create-index"
    },
    {
      "name": "check-cluster-health",
      "operation": "check-cluster-health"
    },
    {
      "name": "initial-documents-indexing",
      "operation": "initial-documents-indexing",
      "warmup-time-period": {{ initial_indexing_bulk_warmup | default(40) | int }},
      "clients": {{ initial_indexing_bulk_indexing_clients | default(5) | int }}
    },
    {
      "name": "refresh-after-index",
      "operation": {
        "operation-type": "refresh",
        "request-timeout": 1000,
        "include-in-reporting": true
      }
    },
    {
      "name": "wait-until-merges-finish-after-index",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    },
    {# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
    {
      "name": "post-ingest-sleep",
      "operation": {
        "operation-type": "sleep",
        "duration": {{ post_ingest_sleep_duration|default(30) }}
      }
    },
    {%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
    {
      "name": "standalone-search-knn-10-100-single-client",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-knn-search-100-1000-single-client",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-search-knn-10-100-multiple-clients",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "name": "standalone-search-knn-100-1000-multiple-clients",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    },
    {
      "parallel": {
        "tasks": [
          {
            "name": "parallel-documents-indexing-bulk",
            "operation": "parallel-documents-indexing",
            "clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
            "target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
          },
          {
            "name": "parallel-documents-indexing-search-knn-10-100",
            "operation": "knn-search-10-100",
            "clients": {{ parallel_indexing_search_clients | default(3) | int }},
            "time-period": {{ parallel_indexing_time_period | default(1800) | int }},
            "target-throughput": {{ parallel_indexing_search_target_throughput | default(100) | int }}
          }
        ]
      }
    }
  ]
}
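
In outline, the default challenge recreates the index, bulk-loads the initial corpus, refreshes and waits for merges to finish (optionally sleeping afterwards), runs standalone kNN searches with single and multiple clients at k=10/num_candidates=100 and k=100/num_candidates=1000, and finally exercises bulk indexing and kNN search in parallel.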
4 changes: 4 additions & 0 deletions openai_vector/files.txt
@@ -0,0 +1,4 @@
open_ai_corpus-initial-indexing.json.bz2
open_ai_corpus-initial-indexing-1k.json.bz2
open_ai_corpus-parallel-indexing.json.bz2
open_ai_corpus-parallel-indexing-1k.json.bz2
26 changes: 26 additions & 0 deletions openai_vector/index-vectors-only-mapping.json
@@ -0,0 +1,26 @@
{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": ["vec", "vex", "vem"],
    {% endif %}
    "index.number_of_shards": {{number_of_shards | default(1)}},
    "index.number_of_replicas": {{number_of_replicas | default(0)}}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "dynamic": false,
    "_source": {
      "enabled": false
    },
    "properties": {
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
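
The mapping uses `dot_product` similarity, which Elasticsearch only permits for unit-length vectors; `text-embedding-ada-002` embeddings are already normalized, so dot product matches cosine similarity while being cheaper to compute. A minimal sketch of a check one could run over parsed documents (hypothetical helper, not part of the track):

```python
import math

# dot_product similarity requires unit-length vectors; ada-002 embeddings
# already satisfy this, so no re-normalization is needed before indexing.
def is_unit_length(emb: list[float], tol: float = 1e-3) -> bool:
    return math.isclose(math.sqrt(sum(x * x for x in emb)), 1.0, abs_tol=tol)
```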
31 changes: 31 additions & 0 deletions openai_vector/index-vectors-with-text-mapping.json
@@ -0,0 +1,31 @@
{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": ["vec", "vex", "vem"],
    {% endif %}
    "index.number_of_shards": {{number_of_shards | default(1)}},
    "index.number_of_replicas": {{number_of_replicas | default(0)}}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "properties": {
      "docid": {
        "type": "keyword"
      },
      "title": {
        "type": "text"
      },
      "text": {
        "type": "text"
      },
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
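
Compared with `index-vectors-only-mapping.json`, this mapping leaves `_source` and dynamic mapping enabled and indexes the `docid`, `title`, and `text` fields alongside the vector, trading index size for the ability to retrieve and search the text.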
41 changes: 41 additions & 0 deletions openai_vector/operations/default.json
@@ -0,0 +1,41 @@
{
  "name": "create-index",
  "operation-type": "create-index",
  "settings": {{index_settings | default({}) | tojson}}
},
{
  "name": "check-cluster-health",
  "operation-type": "cluster-health",
  "request-params": {
    "wait_for_status": "green"
  },
  "retry-until-success": true
},
{
  "name": "initial-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-initial-indexing",
  "bulk-size": {{initial_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{initial_indexing_ingest_percentage | default(100)}}
},
{
  "name": "parallel-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-parallel-indexing",
  "bulk-size": {{parallel_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{parallel_indexing_ingest_percentage | default(100)}}
},
{
  "name": "knn-search-10-100",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 10,
  "num-candidates": 100
},
{
  "name": "knn-search-100-1000",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 100,
  "num-candidates": 1000
}
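
The two kNN operations draw query vectors through the track's `knn-param-source`. A hedged sketch of the request shape they issue, using the Elasticsearch Python client (the index name and query vector are illustrative placeholders, not the track's actual values):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local cluster

# Roughly what knn-search-10-100 runs: a kNN search over the `emb` field
# with k=10 and num_candidates=100; real query vectors come from queries.json.bz2.
query_vector = [0.0] * 1536  # placeholder; a real query is an ada-002 embedding

resp = es.search(
    index="openai_vector",  # hypothetical index name
    knn={
        "field": "emb",
        "query_vector": query_vector,
        "k": 10,
        "num_candidates": 100,
    },
)
print(resp["hits"]["hits"])
```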
Binary file added openai_vector/queries.json.bz2