From c7d2dc61ea2ba0a9f25299f99ba2d285592b6bf8 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Thu, 16 Jan 2025 11:52:00 -0800 Subject: [PATCH 01/14] wildbench simplify chat input to be compatible with all models, add curator model (doesn't work as is) --- .../WildBench/eval_instruct.py | 7 +- eval/chat_benchmarks/curator_lm.py | 164 ++++++++++++++++++ eval/eval.py | 1 + pyproject.toml | 1 + 4 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 eval/chat_benchmarks/curator_lm.py diff --git a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py index 100e6d68..4d44aa97 100644 --- a/eval/chat_benchmarks/WildBench/eval_instruct.py +++ b/eval/chat_benchmarks/WildBench/eval_instruct.py @@ -166,8 +166,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: # Load data id_strs, chat_history, extracted_chats, metadata = self.load_dataset() + # Remove extra fields that might not be compatible with some models + simplified_extracted_chats = [ + [{"role": c["role"], "content": c["content"]} for c in chat] for chat in extracted_chats + ] + # Prepare model inputs - model_inputs = [model.apply_chat_template(chat) for chat in extracted_chats] + model_inputs = [model.apply_chat_template(chat) for chat in simplified_extracted_chats] # Create temporary directory temp_dir_obj = tempfile.TemporaryDirectory() diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py new file mode 100644 index 00000000..e2dca791 --- /dev/null +++ b/eval/chat_benchmarks/curator_lm.py @@ -0,0 +1,164 @@ +from typing import List, Dict, Any, Optional, Union, Tuple +import json + +from bespokelabs import curator +from datasets import Dataset + +from lm_eval.api.model import TemplateLM +from lm_eval.models.api_models import JsonChatStr +from lm_eval.api.registry import register_model +from lm_eval.api.instance import Instance +from lm_eval.models.utils import handle_stop_sequences + + +class prompter(curator.LLM): + def prompt(self, row): + return row["messages"] + + def parse(self, row, response): + return {"response": response} + + +@register_model("curator") +class CuratorAPIModel(TemplateLM): + def __init__( + self, + model: str = None, + pretrained: str = None, + max_length: Optional[int] = 2048, + num_concurrent: int = 1, + max_retries: int = 3, + timeout: int = 300, + tokenized_requests: bool = False, + **kwargs, + ): + super().__init__() + if tokenized_requests: + raise NotImplementedError("Tokenized requests not implemented for curator.") + self.tokenized_requests = False + self.model_name = model or pretrained + self.max_length = max_length + self.num_concurrent = num_concurrent + self.max_retries = max_retries + self.timeout = timeout + self.llm = None + self.gen_kwargs = {} + self._max_gen_toks = 2048 + self.eos = None + + def _create_payload( + self, + messages: Union[List[List[int]], List[dict], List[str], str], + *, + generate: bool = False, + gen_kwargs: Optional[dict] = None, + eos=None, + **kwargs, + ) -> dict: + assert generate, "Curator only supports generation." 
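# Hedged sketch of how the prompter subclass defined earlier in this file is
# expected to be driven: a datasets.Dataset with a "messages" column is handed
# to the curator LLM object, prompt() pulls the chat out of each row, and
# parse() wraps the raw completion as {"response": ...}. The call semantics of
# curator.LLM are taken on trust from the bespokelabs-curator dependency added
# in this patch, so the invocation itself is left commented out.
from datasets import Dataset

chats = [
    [{"role": "user", "content": "Name three Broadway actors."}],
    [{"role": "user", "content": "How did US states get their names?"}],
]
ds = Dataset.from_dict({"messages": chats})
# llm = prompter(model_name="gpt-4o-mini-2024-07-18", generation_params={"temperature": 0})
# responses = llm(ds)["response"]  # one completion string per input chat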
+ # Create the payload for the API request + if self.llm is None: + self.gen_kwargs = gen_kwargs.copy() + self.eos = eos + max_tokens = gen_kwargs.get("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.get("temperature", 0) + stop = handle_stop_sequences(gen_kwargs.get("until", None), eos) + gen_kwargs = { + "max_completion_tokens": max_tokens, + "temperature": temperature, + "stop": stop, + } + self.llm = prompter(model_name=self.model_name, generation_params=gen_kwargs) + else: + assert self.gen_kwargs == gen_kwargs, "Generation parameters must be the same for all requests in curator" + assert self.eos == eos, "EOS must be the same for all requests in curator" + return messages + + def create_message( + self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False + ) -> Union[List[List[int]], List[dict], List[str], str]: + # Convert messages to the format expected by the API + if isinstance(messages, list) and all(isinstance(m, JsonChatStr) for m in messages): + return [Dataset.from_dict({"messages": json.loads(m.prompt)}) for m in messages] + return messages + + @staticmethod + def parse_logprobs( + outputs: Union[Any, List[Any]], tokens: List[List[int]] = None, ctxlen: List[int] = None, **kwargs + ) -> List[Tuple[float, bool]]: + # Implement log probability parsing logic + raise NotImplementedError("Log probability parsing not implemented.") + logprobs = [] + for output in outputs: + # Assuming output has a structure that includes log probabilities + logprob = output.get("logprob", 0.0) # Replace with actual key + is_greedy = output.get("is_greedy", False) # Replace with actual key + logprobs.append((logprob, is_greedy)) + return logprobs + + @staticmethod + def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]: + # Parse the generated outputs from the API + return [output["response"] for output in outputs] + + @property + def tokenizer_name(self) -> str: + return self.model_name + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> Union[str, JsonChatStr]: + # Convert chat history to the required format + return JsonChatStr(json.dumps(chat_history)) + + def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], **kwargs) -> Optional[dict]: + payload = self._create_payload(self.create_message(messages), **kwargs) + response = self.llm(payload) + return response + + def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: + raise NotImplementedError("Log likelihood tokens not implemented for curator.") + results = [] + for context, continuation in requests: + # Assuming the model can compute log likelihoods + response = self.model_call([context, continuation]) + logprob = response.get("logprob", 0.0) # Replace with actual key + is_greedy = response.get("is_greedy", False) # Replace with actual key + results.append((logprob, is_greedy)) + return results + + @property + def eot_token_id(self) -> Optional[int]: + # Assuming the model has a specific end-of-text token ID + return self.llm.eot_token_id # Replace with actual method to get EOT token ID + + def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -> List[str]: + # Tokenize contexts if required + if self.tokenized_requests: + raise NotImplementedError("Tokenized requests not implemented for curator.") + + # Extract contexts and generation kwargs from the Instance objects + contexts = [req.args[0] for req in requests] + gen_kwargs = [req.args[1] for req in requests] + + # Assert all 
gen_kwargs are the same + assert all( + gen_kwargs[0] == gkw for gkw in gen_kwargs + ), "Generation parameters must be the same for all requests in curator" + + payload = self._create_payload(self.create_message(contexts), generate=True, gen_kwargs=gen_kwargs[0]) + breakpoint() + response = self.llm(payload) + breakpoint() + return response + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: + raise NotImplementedError("Log likelihood rolling not implemented for curator.") + loglikelihoods = [] + for context in requests: + response = self.model_call(context) + loglikelihood = response.get("loglikelihood", 0.0) # Replace with actual key + loglikelihoods.append(loglikelihood) + return loglikelihoods + + def tok_encode(self, string: str, **kwargs) -> List[int]: + raise NotImplementedError("Token encoding not implemented for curator.") + return self.llm.tokenizer.encode(string) # Replace with actual method to tokenize diff --git a/eval/eval.py b/eval/eval.py index ca3deb31..9816134c 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -23,6 +23,7 @@ import lm_eval.api.task import lm_eval.models +from eval.chat_benchmarks.curator_lm import CuratorAPIModel # register curator model from eval.task import TaskManager as InstructTaskManager from eval.eval_tracker import DCEvaluationTracker diff --git a/pyproject.toml b/pyproject.toml index 824c1156..294246c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,6 +104,7 @@ dependencies = [ "requests>=2.28", "websocket", "aiofiles", + "bespokelabs-curator", # Database "sqlalchemy", From 32b8a4bea4752a18aa3e53b84eeebf8b96b7f59a Mon Sep 17 00:00:00 2001 From: Ryan Marten Date: Thu, 16 Jan 2025 14:02:48 -0800 Subject: [PATCH 02/14] fixed input dataset creation --- .../baseline_gpt4_1106_preview.csv | 2 + eval/chat_benchmarks/curator_lm.py | 18 +- .../annotations.json | 164 ++++++++++++++++++ .../leaderboard.csv | 2 + 4 files changed, 179 insertions(+), 7 deletions(-) create mode 100644 eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv create mode 100644 results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json create mode 100644 results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv new file mode 100644 index 00000000..e5984c89 --- /dev/null +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -0,0 +1,2 @@ +,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float) +"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False",-1.8366974017900992,0.7446564084265157,-7.2599824829397441 diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index e2dca791..1abc0fb9 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -9,6 +9,7 @@ from lm_eval.api.registry 
import register_model from lm_eval.api.instance import Instance from lm_eval.models.utils import handle_stop_sequences +import os class prompter(curator.LLM): @@ -46,6 +47,9 @@ def __init__( self._max_gen_toks = 2048 self.eos = None + # Disable cache since it is not necessary + os.environ["CURATOR_DISABLE_CACHE"] = "true" + def _create_payload( self, messages: Union[List[List[int]], List[dict], List[str], str], @@ -79,8 +83,9 @@ def create_message( ) -> Union[List[List[int]], List[dict], List[str], str]: # Convert messages to the format expected by the API if isinstance(messages, list) and all(isinstance(m, JsonChatStr) for m in messages): - return [Dataset.from_dict({"messages": json.loads(m.prompt)}) for m in messages] - return messages + return Dataset.from_dict({"messages": [json.loads(m.prompt) for m in messages]}) + else: + raise ValueError("Messages must be a list of JsonChatStr objects") @staticmethod def parse_logprobs( @@ -111,7 +116,7 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> Union[str, def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], **kwargs) -> Optional[dict]: payload = self._create_payload(self.create_message(messages), **kwargs) - response = self.llm(payload) + response = self.llm(payload)["response"] return response def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: @@ -144,10 +149,9 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) - gen_kwargs[0] == gkw for gkw in gen_kwargs ), "Generation parameters must be the same for all requests in curator" - payload = self._create_payload(self.create_message(contexts), generate=True, gen_kwargs=gen_kwargs[0]) - breakpoint() - response = self.llm(payload) - breakpoint() + contexts_dataset = self.create_message(contexts) + payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0]) + response = self.llm(payload)["response"] return response def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: diff --git a/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json b/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json new file mode 100644 index 00000000..e1cc9b24 --- /dev/null +++ b/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json @@ -0,0 +1,164 @@ +[ + { + "instruction":"What are the names of some famous actors that started their careers on Broadway?", + "output_1":"Several famous actors started their careers on Broadway before making it big in film and television. Here are a few notable examples:\n\n1. Sarah Jessica Parker - Before she was Carrie Bradshaw on \"Sex and the City,\" Sarah Jessica Parker was a Broadway star, having appeared in productions like \"Annie\" as a child.\n\n2. Meryl Streep - Meryl Streep's early career included Broadway productions such as \"Trelawny of the 'Wells'\" and \"A Memory of Two Mondays \/ 27 Wagons Full of Cotton.\"\n\n3. Hugh Jackman - Hugh Jackman won a Tony Award for his role in \"The Boy from Oz\" and has been known for his stage work as well as his film career.\n\n4. 
Sutton Foster - Known for her television role in \"Younger,\" Sutton Foster is also a Broadway legend with leading roles in shows like \"Thoroughly Modern Millie\" and \"Anything Goes.\"\n\n5. Kristen Bell - Before she was the voice of Anna in \"Frozen\" or the star of \"The Good Place,\" Kristen Bell appeared in Broadway's \"The Adventures of Tom Sawyer\" and \"The Crucible.\"\n\n6. Audra McDonald - Audra McDonald is a renowned Broadway actress with a record-breaking number of Tony Awards. She's starred in \"Ragtime,\" \"Carousel,\" \"Master Class,\" and more.\n\n7. Nathan Lane - Nathan Lane is a Broadway veteran known for his roles in \"The Producers,\" \"A Funny Thing Happened on the Way to the Forum,\" and \"Angels in America.\"\n\n8. Idina Menzel - Before \"Frozen\" and \"Wicked\" made her a household name, Idina Menzel started on Broadway in shows like \"Rent\" and \"Hair.\"\n\n9. Lin-Manuel Miranda - Before \"Hamilton\" and \"In the Heights\" became huge hits, Lin-Manuel Miranda was performing on Broadway, eventually becoming a celebrated writer and actor.\n\n10. Lea Michele - Prior to her role on \"Glee,\" Lea Michele was a young Broadway actress in shows like \"Les Mis\u00e9rables,\" \"Ragtime,\" and \"Spring Awakening.\"\n\nThese actors are just a few examples of the many performers who have transitioned from the Broadway stage to broader fame in the entertainment industry. Broadway often serves as a proving ground for talent, and many actors continue to return to the stage throughout their careers.", + "generator_1":"gpt4_1106_preview", + "dataset":"helpful_base", + "datasplit":"eval", + "generator_2":"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False", + "output_2":"Here are some notable actors who began their careers on Broadway:\n\n1. Hugh Jackman\n2. Bernadette Peters\n3. Idina Menzel\n4. Nathan Lane\n5. Audra McDonald\n6. Lin-Manuel Miranda\n7. Sutton Foster\n8. Patti LuPone\n9. Kristin Chenoweth\n10. Mandy Patinkin\n11. Viola Davis\n12. Neil Patrick Harris\n13. Bette Midler\n14. John Travolta\n15. Chita Rivera\n\nThese actors often started in musical theater or stage plays before transitioning to film and television careers.", + "annotator":"weighted_alpaca_eval_gpt-4o-mini-2024-07-18", + "preference":1.0000000013, + "preference_version":"alpaca_eval==0.6.5", + "preference_date":"2025-01-16T14:01:18.848043", + "preference_time_per_example":0.6654965878, + "preference_price_per_example":0.00013425, + "preference_raw_completion":{ + "finish_reason":"length", + "index":0, + "logprobs":{ + "content":[ + { + "token":"M", + "bytes":[ + 77 + ], + "logprob":0.0, + "top_logprobs":[ + { + "token":"M", + "bytes":[ + 77 + ], + "logprob":0.0 + }, + { + "token":"m", + "bytes":[ + 109 + ], + "logprob":-20.5 + }, + { + "token":" M", + "bytes":[ + 32, + 77 + ], + "logprob":-22.375 + }, + { + "token":"\u041c", + "bytes":[ + 208, + 156 + ], + "logprob":-23.625 + }, + { + "token":"\"M", + "bytes":[ + 34, + 77 + ], + "logprob":-25.75 + } + ] + } + ], + "refusal":null + }, + "message":{ + "content":"M", + "refusal":null, + "role":"assistant", + "audio":null, + "function_call":null, + "tool_calls":null + }, + "text":"M", + "total_tokens":895.0 + } + }, + { + "instruction":"How did US states get their names?", + "output_1":"The names of U.S. states are derived from a variety of languages and sources, including indigenous American languages, European languages (primarily English, Spanish, and French), as well as other historical influences. 
Here is a brief overview of how some states got their names:\n\n1. **Indigenous American Languages**: Many state names come from Native American words or names of tribes, often from the languages of the indigenous peoples who lived in or near those areas.\n - **Alabama**: Possibly derived from the Choctaw language, meaning \"thicket clearers.\"\n - **Connecticut**: From a Mohegan-Pequot word meaning \"long tidal river.\"\n - **Massachusetts**: Named after the Massachusett tribe; the name means \"at or about the great hill.\"\n\n2. **Spanish**: Some state names come from Spanish explorers and settlers.\n - **California**: Possibly named after a mythical island in a Spanish novel, \"Las sergas de Esplandi\u00e1n\" by Garci Rodr\u00edguez de Montalvo.\n - **Colorado**: Named after the Colorado River, which in turn was named for its red-colored silt, from Spanish \"colorado\" meaning \"colored red.\"\n - **Florida**: Named by Spanish explorer Juan Ponce de Le\u00f3n in 1513, \"Pascua Florida\" refers to the \"feast of the flowers\" (Easter) and the verdant landscape.\n\n3. **French**: Some states have names derived from French explorers and settlers or from French interpretations of Native American words.\n - **Louisiana**: Named in honor of King Louis XIV of France.\n - **Vermont**: From the French \"vert mont,\" meaning \"green mountain.\"\n\n4. **English**: Many states have names that are either directly taken from places in England or are English translations or adaptations of Native American names.\n - **New York**: Named after the Duke of York (later King James II of England), when the English took it from the Dutch who called it New Amsterdam.\n - **Virginia**: Named after Queen Elizabeth I of England, the \"Virgin Queen.\"\n\n5. **Other European Languages**: A few state names have origins in other European languages due to the influence of various European settlers and explorers.\n - **Illinois**: French adaptation of an Algonquin word meaning \"tribe of superior men.\"\n - **Minnesota**: From the Dakota Sioux word \"mnisota,\" meaning \"cloudy water\" or \"sky-tinted water.\"\n\n6. **Latin or Greek**: Some names are derived from Latin or Greek words or roots.\n - **Pennsylvania**: Named by William Penn after his father and means \"Penn's woods\" in Latin.\n\n7. **Place Names and Royalty**: Some states are named after monarchs or other places.\n - **Georgia**: Named in honor of King George II of Great Britain.\n - **Maryland**: Named in honor of Queen Henrietta Maria, the wife of King Charles I of England.\n\n8. **Combination of Languages and Influences**: Some state names are a blend of influences from different languages or are adaptations that have transformed over time due to usage and pronunciation.\n - **Arkansas**: Named after the French plural of a Quapaw word \"akakaze\" meaning \"land of downriver people.\"\n\n9. 
**Geographical Features**: Some states are named after prominent geographical features.\n - **Indiana**: Meaning \"land of the Indians.\"\n - **Tennessee**: Likely derived from a Cherokee village name \"Tanasi.\"\n\nKeep in mind this is not an exhaustive list, and many of the origins can be more complex, with different theories about their etymology.", + "generator_1":"gpt4_1106_preview", + "dataset":"helpful_base", + "datasplit":"eval", + "generator_2":"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False", + "output_2":"The origins of US state names are diverse, with influences from Native American languages, European explorers, historical figures, and geographical features. Here are some examples:\n\n1. Native American origins:\n- Alabama: From Choctaw words meaning \"thicket clearers\"\n- Kansas: From Kansa Native American tribe\n- Michigan: From Ojibwe word meaning \"large water\"\n\n2. European languages:\n- California: From a Spanish novel\n- Florida: Spanish for \"flowery\"\n- Vermont: French for \"green mountain\"\n\n3. English royalty\/leaders:\n- Maryland: Named after Queen Henrietta Maria\n- Virginia: Named after Queen Elizabeth I (Virgin Queen)\n- Georgia: Named after King George II\n\n4. Geographical features:\n- Colorado: Spanish for \"colored red\"\n- Nevada: Spanish for \"snow-covered\"\n- Montana: Spanish for \"mountainous\"\n\n5. Historical figures:\n- Washington: Named after George Washington\n- Pennsylvania: Named after William Penn\n- Louisiana: Named after King Louis XIV of France\n\n6. Descriptive terms:\n- New Hampshire: After Hampshire, England\n- New Mexico: Similar to Mexico\n- New York: After York, England\n\nEach state's name has a unique story reflecting its historical and cultural background.", + "annotator":"weighted_alpaca_eval_gpt-4o-mini-2024-07-18", + "preference":1.0000000001, + "preference_version":"alpaca_eval==0.6.5", + "preference_date":"2025-01-16T14:01:18.848043", + "preference_time_per_example":0.6654965878, + "preference_price_per_example":0.0001968, + "preference_raw_completion":{ + "finish_reason":"length", + "index":0, + "logprobs":{ + "content":[ + { + "token":"M", + "bytes":[ + 77 + ], + "logprob":0.0, + "top_logprobs":[ + { + "token":"M", + "bytes":[ + 77 + ], + "logprob":0.0 + }, + { + "token":" M", + "bytes":[ + 32, + 77 + ], + "logprob":-22.875 + }, + { + "token":"m", + "bytes":[ + 109 + ], + "logprob":-23.5 + }, + { + "token":"\u041c", + "bytes":[ + 208, + 156 + ], + "logprob":-23.625 + }, + { + "token":"\"M", + "bytes":[ + 34, + 77 + ], + "logprob":-26.375 + } + ] + } + ], + "refusal":null + }, + "message":{ + "content":"M", + "refusal":null, + "role":"assistant", + "audio":null, + "function_call":null, + "tool_calls":null + }, + "text":"M", + "total_tokens":1312.0 + } + } +] \ No newline at end of file diff --git a/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv b/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv new file mode 100644 index 00000000..05a189cd --- /dev/null +++ b/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv @@ -0,0 +1,2 @@ +,win_rate,standard_error,mode,avg_length,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,length_controlled_winrate,lc_standard_error 
+"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False",6.561972076823963e-08,5.939556624312557e-08,community,816,0,2,0,2,0.0,0.06771025040792734,0.002552381715452862 From bfa48e01f4c6c7177fd4ff5f64abd383acf8e5ca Mon Sep 17 00:00:00 2001 From: Ryan Marten Date: Thu, 16 Jan 2025 14:04:20 -0800 Subject: [PATCH 03/14] .gitignore --- .gitignore | 1 + .../annotations.json | 164 ------------------ .../leaderboard.csv | 2 - 3 files changed, 1 insertion(+), 166 deletions(-) delete mode 100644 results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json delete mode 100644 results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv diff --git a/.gitignore b/.gitignore index d865858f..cb913079 100644 --- a/.gitignore +++ b/.gitignore @@ -225,3 +225,4 @@ eval/chat_benchmarks/MixEval/mix_eval/data/test/ eval/chat_benchmarks/MixEval/mix_eval/data/model_responses eval/chat_benchmarks/MixEval/mix_eval/eval_scripts eval/chat_benchmarks/MixEval/results +results/ \ No newline at end of file diff --git a/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json b/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json deleted file mode 100644 index e1cc9b24..00000000 --- a/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/annotations.json +++ /dev/null @@ -1,164 +0,0 @@ -[ - { - "instruction":"What are the names of some famous actors that started their careers on Broadway?", - "output_1":"Several famous actors started their careers on Broadway before making it big in film and television. Here are a few notable examples:\n\n1. Sarah Jessica Parker - Before she was Carrie Bradshaw on \"Sex and the City,\" Sarah Jessica Parker was a Broadway star, having appeared in productions like \"Annie\" as a child.\n\n2. Meryl Streep - Meryl Streep's early career included Broadway productions such as \"Trelawny of the 'Wells'\" and \"A Memory of Two Mondays \/ 27 Wagons Full of Cotton.\"\n\n3. Hugh Jackman - Hugh Jackman won a Tony Award for his role in \"The Boy from Oz\" and has been known for his stage work as well as his film career.\n\n4. Sutton Foster - Known for her television role in \"Younger,\" Sutton Foster is also a Broadway legend with leading roles in shows like \"Thoroughly Modern Millie\" and \"Anything Goes.\"\n\n5. Kristen Bell - Before she was the voice of Anna in \"Frozen\" or the star of \"The Good Place,\" Kristen Bell appeared in Broadway's \"The Adventures of Tom Sawyer\" and \"The Crucible.\"\n\n6. Audra McDonald - Audra McDonald is a renowned Broadway actress with a record-breaking number of Tony Awards. She's starred in \"Ragtime,\" \"Carousel,\" \"Master Class,\" and more.\n\n7. Nathan Lane - Nathan Lane is a Broadway veteran known for his roles in \"The Producers,\" \"A Funny Thing Happened on the Way to the Forum,\" and \"Angels in America.\"\n\n8. Idina Menzel - Before \"Frozen\" and \"Wicked\" made her a household name, Idina Menzel started on Broadway in shows like \"Rent\" and \"Hair.\"\n\n9. 
Lin-Manuel Miranda - Before \"Hamilton\" and \"In the Heights\" became huge hits, Lin-Manuel Miranda was performing on Broadway, eventually becoming a celebrated writer and actor.\n\n10. Lea Michele - Prior to her role on \"Glee,\" Lea Michele was a young Broadway actress in shows like \"Les Mis\u00e9rables,\" \"Ragtime,\" and \"Spring Awakening.\"\n\nThese actors are just a few examples of the many performers who have transitioned from the Broadway stage to broader fame in the entertainment industry. Broadway often serves as a proving ground for talent, and many actors continue to return to the stage throughout their careers.", - "generator_1":"gpt4_1106_preview", - "dataset":"helpful_base", - "datasplit":"eval", - "generator_2":"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False", - "output_2":"Here are some notable actors who began their careers on Broadway:\n\n1. Hugh Jackman\n2. Bernadette Peters\n3. Idina Menzel\n4. Nathan Lane\n5. Audra McDonald\n6. Lin-Manuel Miranda\n7. Sutton Foster\n8. Patti LuPone\n9. Kristin Chenoweth\n10. Mandy Patinkin\n11. Viola Davis\n12. Neil Patrick Harris\n13. Bette Midler\n14. John Travolta\n15. Chita Rivera\n\nThese actors often started in musical theater or stage plays before transitioning to film and television careers.", - "annotator":"weighted_alpaca_eval_gpt-4o-mini-2024-07-18", - "preference":1.0000000013, - "preference_version":"alpaca_eval==0.6.5", - "preference_date":"2025-01-16T14:01:18.848043", - "preference_time_per_example":0.6654965878, - "preference_price_per_example":0.00013425, - "preference_raw_completion":{ - "finish_reason":"length", - "index":0, - "logprobs":{ - "content":[ - { - "token":"M", - "bytes":[ - 77 - ], - "logprob":0.0, - "top_logprobs":[ - { - "token":"M", - "bytes":[ - 77 - ], - "logprob":0.0 - }, - { - "token":"m", - "bytes":[ - 109 - ], - "logprob":-20.5 - }, - { - "token":" M", - "bytes":[ - 32, - 77 - ], - "logprob":-22.375 - }, - { - "token":"\u041c", - "bytes":[ - 208, - 156 - ], - "logprob":-23.625 - }, - { - "token":"\"M", - "bytes":[ - 34, - 77 - ], - "logprob":-25.75 - } - ] - } - ], - "refusal":null - }, - "message":{ - "content":"M", - "refusal":null, - "role":"assistant", - "audio":null, - "function_call":null, - "tool_calls":null - }, - "text":"M", - "total_tokens":895.0 - } - }, - { - "instruction":"How did US states get their names?", - "output_1":"The names of U.S. states are derived from a variety of languages and sources, including indigenous American languages, European languages (primarily English, Spanish, and French), as well as other historical influences. Here is a brief overview of how some states got their names:\n\n1. **Indigenous American Languages**: Many state names come from Native American words or names of tribes, often from the languages of the indigenous peoples who lived in or near those areas.\n - **Alabama**: Possibly derived from the Choctaw language, meaning \"thicket clearers.\"\n - **Connecticut**: From a Mohegan-Pequot word meaning \"long tidal river.\"\n - **Massachusetts**: Named after the Massachusett tribe; the name means \"at or about the great hill.\"\n\n2. 
**Spanish**: Some state names come from Spanish explorers and settlers.\n - **California**: Possibly named after a mythical island in a Spanish novel, \"Las sergas de Esplandi\u00e1n\" by Garci Rodr\u00edguez de Montalvo.\n - **Colorado**: Named after the Colorado River, which in turn was named for its red-colored silt, from Spanish \"colorado\" meaning \"colored red.\"\n - **Florida**: Named by Spanish explorer Juan Ponce de Le\u00f3n in 1513, \"Pascua Florida\" refers to the \"feast of the flowers\" (Easter) and the verdant landscape.\n\n3. **French**: Some states have names derived from French explorers and settlers or from French interpretations of Native American words.\n - **Louisiana**: Named in honor of King Louis XIV of France.\n - **Vermont**: From the French \"vert mont,\" meaning \"green mountain.\"\n\n4. **English**: Many states have names that are either directly taken from places in England or are English translations or adaptations of Native American names.\n - **New York**: Named after the Duke of York (later King James II of England), when the English took it from the Dutch who called it New Amsterdam.\n - **Virginia**: Named after Queen Elizabeth I of England, the \"Virgin Queen.\"\n\n5. **Other European Languages**: A few state names have origins in other European languages due to the influence of various European settlers and explorers.\n - **Illinois**: French adaptation of an Algonquin word meaning \"tribe of superior men.\"\n - **Minnesota**: From the Dakota Sioux word \"mnisota,\" meaning \"cloudy water\" or \"sky-tinted water.\"\n\n6. **Latin or Greek**: Some names are derived from Latin or Greek words or roots.\n - **Pennsylvania**: Named by William Penn after his father and means \"Penn's woods\" in Latin.\n\n7. **Place Names and Royalty**: Some states are named after monarchs or other places.\n - **Georgia**: Named in honor of King George II of Great Britain.\n - **Maryland**: Named in honor of Queen Henrietta Maria, the wife of King Charles I of England.\n\n8. **Combination of Languages and Influences**: Some state names are a blend of influences from different languages or are adaptations that have transformed over time due to usage and pronunciation.\n - **Arkansas**: Named after the French plural of a Quapaw word \"akakaze\" meaning \"land of downriver people.\"\n\n9. **Geographical Features**: Some states are named after prominent geographical features.\n - **Indiana**: Meaning \"land of the Indians.\"\n - **Tennessee**: Likely derived from a Cherokee village name \"Tanasi.\"\n\nKeep in mind this is not an exhaustive list, and many of the origins can be more complex, with different theories about their etymology.", - "generator_1":"gpt4_1106_preview", - "dataset":"helpful_base", - "datasplit":"eval", - "generator_2":"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False", - "output_2":"The origins of US state names are diverse, with influences from Native American languages, European explorers, historical figures, and geographical features. Here are some examples:\n\n1. Native American origins:\n- Alabama: From Choctaw words meaning \"thicket clearers\"\n- Kansas: From Kansa Native American tribe\n- Michigan: From Ojibwe word meaning \"large water\"\n\n2. European languages:\n- California: From a Spanish novel\n- Florida: Spanish for \"flowery\"\n- Vermont: French for \"green mountain\"\n\n3. 
English royalty\/leaders:\n- Maryland: Named after Queen Henrietta Maria\n- Virginia: Named after Queen Elizabeth I (Virgin Queen)\n- Georgia: Named after King George II\n\n4. Geographical features:\n- Colorado: Spanish for \"colored red\"\n- Nevada: Spanish for \"snow-covered\"\n- Montana: Spanish for \"mountainous\"\n\n5. Historical figures:\n- Washington: Named after George Washington\n- Pennsylvania: Named after William Penn\n- Louisiana: Named after King Louis XIV of France\n\n6. Descriptive terms:\n- New Hampshire: After Hampshire, England\n- New Mexico: Similar to Mexico\n- New York: After York, England\n\nEach state's name has a unique story reflecting its historical and cultural background.", - "annotator":"weighted_alpaca_eval_gpt-4o-mini-2024-07-18", - "preference":1.0000000001, - "preference_version":"alpaca_eval==0.6.5", - "preference_date":"2025-01-16T14:01:18.848043", - "preference_time_per_example":0.6654965878, - "preference_price_per_example":0.0001968, - "preference_raw_completion":{ - "finish_reason":"length", - "index":0, - "logprobs":{ - "content":[ - { - "token":"M", - "bytes":[ - 77 - ], - "logprob":0.0, - "top_logprobs":[ - { - "token":"M", - "bytes":[ - 77 - ], - "logprob":0.0 - }, - { - "token":" M", - "bytes":[ - 32, - 77 - ], - "logprob":-22.875 - }, - { - "token":"m", - "bytes":[ - 109 - ], - "logprob":-23.5 - }, - { - "token":"\u041c", - "bytes":[ - 208, - 156 - ], - "logprob":-23.625 - }, - { - "token":"\"M", - "bytes":[ - 34, - 77 - ], - "logprob":-26.375 - } - ] - } - ], - "refusal":null - }, - "message":{ - "content":"M", - "refusal":null, - "role":"assistant", - "audio":null, - "function_call":null, - "tool_calls":null - }, - "text":"M", - "total_tokens":1312.0 - } - } -] \ No newline at end of file diff --git a/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv b/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv deleted file mode 100644 index 05a189cd..00000000 --- a/results/model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/leaderboard.csv +++ /dev/null @@ -1,2 +0,0 @@ -,win_rate,standard_error,mode,avg_length,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,length_controlled_winrate,lc_standard_error -"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False",6.561972076823963e-08,5.939556624312557e-08,community,816,0,2,0,2,0.0,0.06771025040792734,0.002552381715452862 From 4a4ebb569295bef1b19e04b20bcca908a3dd1815 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Thu, 16 Jan 2025 14:25:05 -0800 Subject: [PATCH 04/14] Ryan's fixes --- eval/chat_benchmarks/curator_lm.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index e2dca791..af9cfebc 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -33,6 +33,7 @@ def __init__( **kwargs, ): super().__init__() + # os.environ["CURATOR_DISABLE_CACHE"] = "true" if tokenized_requests: raise NotImplementedError("Tokenized requests not implemented for curator.") self.tokenized_requests = False @@ -79,7 +80,7 @@ def create_message( ) -> Union[List[List[int]], List[dict], List[str], str]: # Convert messages to the format expected by the API if isinstance(messages, 
list) and all(isinstance(m, JsonChatStr) for m in messages): - return [Dataset.from_dict({"messages": json.loads(m.prompt)}) for m in messages] + return Dataset.from_dict({"messages": [json.loads(m.prompt) for m in messages]}) return messages @staticmethod @@ -111,7 +112,7 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> Union[str, def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], **kwargs) -> Optional[dict]: payload = self._create_payload(self.create_message(messages), **kwargs) - response = self.llm(payload) + response = self.llm(payload)["response"] return response def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: @@ -145,9 +146,7 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) - ), "Generation parameters must be the same for all requests in curator" payload = self._create_payload(self.create_message(contexts), generate=True, gen_kwargs=gen_kwargs[0]) - breakpoint() - response = self.llm(payload) - breakpoint() + response = self.llm(payload)["response"] return response def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: From d9ca64bfd61c1c88c1a7405794bf3c61f4722530 Mon Sep 17 00:00:00 2001 From: ryanmarten Date: Mon, 20 Jan 2025 23:12:17 -0800 Subject: [PATCH 05/14] finish length and input interface fixes --- eval/chat_benchmarks/curator_lm.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index 1abc0fb9..680034e7 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -1,15 +1,14 @@ -from typing import List, Dict, Any, Optional, Union, Tuple import json +import os +from typing import Any, Dict, List, Optional, Tuple, Union from bespokelabs import curator from datasets import Dataset - +from lm_eval.api.instance import Instance from lm_eval.api.model import TemplateLM -from lm_eval.models.api_models import JsonChatStr from lm_eval.api.registry import register_model -from lm_eval.api.instance import Instance +from lm_eval.models.api_models import JsonChatStr from lm_eval.models.utils import handle_stop_sequences -import os class prompter(curator.LLM): @@ -72,7 +71,12 @@ def _create_payload( "temperature": temperature, "stop": stop, } - self.llm = prompter(model_name=self.model_name, generation_params=gen_kwargs) + backend_kwargs = { + "invalid_finish_reasons": [ + "content_filter" + ], # So it doesn't retry on `length` finish reason, but retries on "content_filter" + } + self.llm = prompter(model_name=self.model_name, generation_params=gen_kwargs, backend_params=backend_kwargs) else: assert self.gen_kwargs == gen_kwargs, "Generation parameters must be the same for all requests in curator" assert self.eos == eos, "EOS must be the same for all requests in curator" @@ -83,7 +87,7 @@ def create_message( ) -> Union[List[List[int]], List[dict], List[str], str]: # Convert messages to the format expected by the API if isinstance(messages, list) and all(isinstance(m, JsonChatStr) for m in messages): - return Dataset.from_dict({"messages": [json.loads(m.prompt) for m in messages]}) + return [json.loads(m.prompt) for m in messages] else: raise ValueError("Messages must be a list of JsonChatStr objects") From 3fec4c45add16331967dd6c04237c2fda412b5ce Mon Sep 17 00:00:00 2001 From: Ryan Marten Date: Tue, 21 Jan 2025 00:10:31 -0800 Subject: [PATCH 06/14] test functions now --- 
.../baseline_gpt4_1106_preview.csv | 1 + eval/chat_benchmarks/curator_lm.py | 12 +++--------- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv index e5984c89..25881b4d 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -1,2 +1,3 @@ ,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float) "model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False",-1.8366974017900992,0.7446564084265157,-7.2599824829397441 +"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False,batch_size=1",-1.8366973986291528,0.7446564081282206,-7.2603061644307445 diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index 680034e7..edc392f9 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -11,14 +11,6 @@ from lm_eval.models.utils import handle_stop_sequences -class prompter(curator.LLM): - def prompt(self, row): - return row["messages"] - - def parse(self, row, response): - return {"response": response} - - @register_model("curator") class CuratorAPIModel(TemplateLM): def __init__( @@ -76,7 +68,9 @@ def _create_payload( "content_filter" ], # So it doesn't retry on `length` finish reason, but retries on "content_filter" } - self.llm = prompter(model_name=self.model_name, generation_params=gen_kwargs, backend_params=backend_kwargs) + self.llm = curator.LLM( + model_name=self.model_name, generation_params=gen_kwargs, backend_params=backend_kwargs + ) else: assert self.gen_kwargs == gen_kwargs, "Generation parameters must be the same for all requests in curator" assert self.eos == eos, "EOS must be the same for all requests in curator" diff --git a/pyproject.toml b/pyproject.toml index 11e05d56..95e906e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,7 @@ dependencies = [ "requests>=2.28", "websocket", "aiofiles", - "bespokelabs-curator", + "bespokelabs-curator @ git+https://github.com/bespokelabsai/curator.git@main", # Database "sqlalchemy", From 4f04018f80ebfa37aab386c99a042a9be73fe3f3 Mon Sep 17 00:00:00 2001 From: Ryan Marten Date: Tue, 21 Jan 2025 00:50:27 -0800 Subject: [PATCH 07/14] undo --- .../length_controlled_v1/baseline_gpt4_1106_preview.csv | 2 +- eval/chat_benchmarks/curator_lm.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv index 25881b4d..ba6a3d39 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ 
b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -1,3 +1,3 @@ ,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float) "model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False",-1.8366974017900992,0.7446564084265157,-7.2599824829397441 -"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False,batch_size=1",-1.8366973986291528,0.7446564081282206,-7.2603061644307445 +"model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False,batch_size=1",-2.2123505700477266,0.4430662251338714,0.0602887165933287 diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index edc392f9..ee8fa571 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -68,6 +68,13 @@ def _create_payload( "content_filter" ], # So it doesn't retry on `length` finish reason, but retries on "content_filter" } + # hard code for now but should be passed in through model_args + if self.model_name == "gemini/gemini-1.5-flash": + backend_kwargs["max_requests_per_minute"] = 2_000 + backend_kwargs["max_tokens_per_minute"] = 4_000_000 + elif self.model_name == "gemini/gemini-1.5-pro": + backend_kwargs["max_requests_per_minute"] = 1_000 + backend_kwargs["max_tokens_per_minute"] = 4_000_000 self.llm = curator.LLM( model_name=self.model_name, generation_params=gen_kwargs, backend_params=backend_kwargs ) From 2ec5a576ea4de8c499fcc573ea73fcec1edf4744 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Tue, 21 Jan 2025 15:12:18 -0800 Subject: [PATCH 08/14] some clean-up --- .../baseline_gpt4_1106_preview.csv | 1 + eval/chat_benchmarks/curator_lm.py | 43 +++++++++++++++---- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv index e5984c89..66091daa 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -1,2 +1,3 @@ ,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float) "model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False",-1.8366974017900992,0.7446564084265157,-7.2599824829397441 +"model_curator_model_args_pretrained=gpt-4o-mini-2024-07-18,tokenized_requests=False",-1.7052487285212568,0.3381921818177004,0.5963566226936441 diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index c248bd03..e0aa32a7 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -12,7 +12,7 @@ import os -class prompter(curator.LLM): +class CuratorPrompter(curator.LLM): def prompt(self, row): return row["messages"] @@ -27,26 +27,44 @@ def __init__( model: str = None, pretrained: str = None, max_length: Optional[int] = 2048, - num_concurrent: int = 1, - max_retries: int = 3, + max_retries: int = 10, timeout: int = 300, tokenized_requests: bool = 
False, + max_requests_per_minute: int = None, + max_tokens_per_minute: int = None, + seconds_to_pause_on_rate_limit: int = None, **kwargs, ): super().__init__() + + self.model_name = model or pretrained + + if "gemini" in self.model_name: + max_requests_per_minute = max_requests_per_minute or 2000 + max_tokens_per_minute = max_tokens_per_minute or 4_000_000 + elif "claude" in self.model_name: + max_requests_per_minute = max_requests_per_minute or 2000 + max_tokens_per_minute = max_tokens_per_minute or 80_000 + os.environ["CURATOR_DISABLE_CACHE"] = "true" if tokenized_requests: raise NotImplementedError("Tokenized requests not implemented for curator.") self.tokenized_requests = False - self.model_name = model or pretrained self.max_length = max_length - self.num_concurrent = num_concurrent - self.max_retries = max_retries - self.timeout = timeout self.llm = None self.gen_kwargs = {} self._max_gen_toks = 2048 self.eos = None + self.backend_params = kwargs + self.backend_params["require_all_responses"] = False + self.backend_params["request_timeout"] = timeout + self.backend_params["max_retries"] = max_retries + if max_requests_per_minute is not None: + self.backend_params["max_requests_per_minute"] = max_requests_per_minute + if max_tokens_per_minute is not None: + self.backend_params["max_tokens_per_minute"] = max_tokens_per_minute + if seconds_to_pause_on_rate_limit is not None: + self.backend_params["seconds_to_pause_on_rate_limit"] = seconds_to_pause_on_rate_limit # Disable cache since it is not necessary os.environ["CURATOR_DISABLE_CACHE"] = "true" @@ -58,6 +76,7 @@ def _create_payload( generate: bool = False, gen_kwargs: Optional[dict] = None, eos=None, + backend_params=None, **kwargs, ) -> dict: assert generate, "Curator only supports generation." 
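# Hedged recap of the backend_params assembled by the __init__ hunk above for a
# Gemini model name, using the defaults this patch introduces (whether curator
# accepts every key in exactly this form depends on the installed
# bespokelabs-curator version).
example_backend_params = {
    "require_all_responses": False,
    "request_timeout": 300,          # from the `timeout` argument
    "max_retries": 10,               # from the `max_retries` argument
    "max_requests_per_minute": 2000,       # Gemini default above
    "max_tokens_per_minute": 4_000_000,    # Gemini default above
}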
@@ -73,7 +92,11 @@ def _create_payload( "temperature": temperature, "stop": stop, } - self.llm = prompter(model_name=self.model_name, generation_params=gen_kwargs) + self.llm = CuratorPrompter( + model_name=self.model_name, + generation_params=gen_kwargs, + backend_params=backend_params, + ) else: assert self.gen_kwargs == gen_kwargs, "Generation parameters must be the same for all requests in curator" assert self.eos == eos, "EOS must be the same for all requests in curator" @@ -151,7 +174,9 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) - ), "Generation parameters must be the same for all requests in curator" contexts_dataset = self.create_message(contexts) - payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0]) + payload = self._create_payload( + contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0], backend_params=self.backend_params + ) response = self.llm(payload)["response"] return response From edd766711de8b0073f5fb95de23db302415c4b27 Mon Sep 17 00:00:00 2001 From: ryanmarten Date: Tue, 21 Jan 2025 15:44:35 -0800 Subject: [PATCH 09/14] bump evalchemy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 95e906e6..9fe8360a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,7 @@ dependencies = [ "requests>=2.28", "websocket", "aiofiles", - "bespokelabs-curator @ git+https://github.com/bespokelabsai/curator.git@main", + "bespokelabs-curator>=0.16.0", # Database "sqlalchemy", From bda927f33ccd6217a9f6a97bb279bbac5089fa70 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Tue, 21 Jan 2025 17:06:18 -0800 Subject: [PATCH 10/14] In CuratorAPIModel: allow to overwirte the default requests limits for claude and gemini, remove ability to pass arbitrary backend_params as it mostly crashes, uses max_retries and timeout parameters --- eval/chat_benchmarks/curator_lm.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index 32793f6d..d6e9db2a 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -46,7 +46,7 @@ def __init__( self.gen_kwargs = {} self._max_gen_toks = 2048 self.eos = None - self.backend_params = kwargs + self.backend_params = {} self.backend_params["require_all_responses"] = False self.backend_params["request_timeout"] = timeout self.backend_params["max_retries"] = max_retries @@ -67,13 +67,11 @@ def _create_payload( generate: bool = False, gen_kwargs: Optional[dict] = None, eos=None, - backend_params=None, **kwargs, ) -> dict: assert generate, "Curator only supports generation." 
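# Hedged usage sketch for the override path this patch enables: the
# per-provider rate-limit defaults can be replaced with explicit constructor
# (or --model_args) values. The gemini-1.5-flash-8b numbers mirror the
# leaderboard row added later in this series; the model identifier itself is
# illustrative and the constructor only stores settings, it does not call the API.
from eval.chat_benchmarks.curator_lm import CuratorAPIModel

lm = CuratorAPIModel(
    pretrained="gemini/gemini-1.5-flash-8b",
    tokenized_requests=False,
    max_requests_per_minute=10_000,
    max_tokens_per_minute=10_000_000,
)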
         # Create the payload for the API request
         if self.llm is None:
-            self.gen_kwargs = gen_kwargs.copy()
             self.eos = eos
             max_tokens = gen_kwargs.get("max_gen_toks", self._max_gen_toks)
             temperature = gen_kwargs.get("temperature", 0)
@@ -83,18 +81,14 @@ def _create_payload(
                 "temperature": temperature,
                 "stop": stop,
             }
+            self.gen_kwargs = gen_kwargs.copy()
             backend_kwargs = {
                 "invalid_finish_reasons": [
                     "content_filter"
                 ],  # So it doesn't retry on `length` finish reason, but retries on "content_filter"
             }
-            # hard code for now but should be passed in through model_args
-            if self.model_name == "gemini/gemini-1.5-flash":
-                backend_kwargs["max_requests_per_minute"] = 2_000
-                backend_kwargs["max_tokens_per_minute"] = 4_000_000
-            elif self.model_name == "gemini/gemini-1.5-pro":
-                backend_kwargs["max_requests_per_minute"] = 1_000
-                backend_kwargs["max_tokens_per_minute"] = 4_000_000
+            backend_kwargs.update(self.backend_params)
+
             self.llm = curator.LLM(
                 model_name=self.model_name, generation_params=gen_kwargs, backend_params=backend_kwargs
             )
@@ -175,9 +169,7 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -
         ), "Generation parameters must be the same for all requests in curator"

         contexts_dataset = self.create_message(contexts)
-        payload = self._create_payload(
-            contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0], backend_params=self.backend_params
-        )
+        payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0])
         response = self.llm(payload)["response"]

         return response

From 1849ff8fc5d30d9f687cf863bd9d7c084d4e00ad Mon Sep 17 00:00:00 2001
From: Jean Mercat
Date: Tue, 21 Jan 2025 17:30:10 -0800
Subject: [PATCH 11/14] don't exclude args

---
 eval/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval/eval.py b/eval/eval.py
index 259bea62..b15e77fb 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -35,7 +35,7 @@ def setup_custom_parser():
     Create a custom argument parser that extends lm-eval-harness parser.
     """
     parser = setup_parser()
-    db_group = parser.add_mutually_exclusive_group()
+    db_group = parser.add_argument_group("database")
     db_group.add_argument("--model_id", type=str, default=None, help="Model UUID for direct database tracking")

From 78fbd39f782efc77e3743b5a676645dc3bd43d3c Mon Sep 17 00:00:00 2001
From: Jean Mercat
Date: Wed, 22 Jan 2025 11:05:55 -0800
Subject: [PATCH 12/14] Handle change of gen params needed for multi-turn

---
 .../baseline_gpt4_1106_preview.csv | 2 +
 eval/chat_benchmarks/curator_lm.py | 47 ++++++++++---------
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv
index ba6a3d39..16fd27f6 100644
--- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv
+++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv
@@ -1,3 +1,5 @@
 ,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float)
 "model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False",-1.8366974017900992,0.7446564084265157,-7.2599824829397441
 "model_curator_model_args_pretrained=claude-3-5-haiku-20241022,tokenized_requests=False,batch_size=1",-2.2123505700477266,0.4430662251338714,0.0602887165933287
+"model_curator_model_args_pretrained=gemini__gemini-1.5-flash,tokenized_requests=False,batch_size=1",-2.0652896814524828,0.2909775910083418,0.0988106449800108
+"model_curator_model_args_pretrained=gemini__gemini-1.5-flash-8b,tokenized_requests=False,max_requests_per_minute=10000,max_tokens_per_minute=10000000,batch_size=1",-2.3695718018361474,0.3369055355846819,-0.6677889603940012
diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py
index d6e9db2a..9233dfa2 100644
--- a/eval/chat_benchmarks/curator_lm.py
+++ b/eval/chat_benchmarks/curator_lm.py
@@ -46,10 +46,14 @@ def __init__(
         self.gen_kwargs = {}
         self._max_gen_toks = 2048
         self.eos = None
-        self.backend_params = {}
-        self.backend_params["require_all_responses"] = False
-        self.backend_params["request_timeout"] = timeout
-        self.backend_params["max_retries"] = max_retries
+        self.backend_params = {
+            "invalid_finish_reasons": [
+                "content_filter"
+            ],  # So it doesn't retry on `length` finish reason, but retries on "content_filter"
+            "require_all_responses": False,
+            "request_timeout": timeout,
+            "max_retries": max_retries,
+        }
         if max_requests_per_minute is not None:
             self.backend_params["max_requests_per_minute"] = max_requests_per_minute
         if max_tokens_per_minute is not None:
@@ -71,30 +75,29 @@ def _create_payload(
     ) -> dict:
         assert generate, "Curator only supports generation."
         # Create the payload for the API request
+        max_tokens = gen_kwargs.get("max_gen_toks", self._max_gen_toks)
+        temperature = gen_kwargs.get("temperature", 0)
+        stop = handle_stop_sequences(gen_kwargs.get("until", None), eos)
+        gen_kwargs = {
+            "max_completion_tokens": max_tokens,
+            "temperature": temperature,
+            "stop": stop,
+        }
         if self.llm is None:
             self.eos = eos
-            max_tokens = gen_kwargs.get("max_gen_toks", self._max_gen_toks)
-            temperature = gen_kwargs.get("temperature", 0)
-            stop = handle_stop_sequences(gen_kwargs.get("until", None), eos)
-            gen_kwargs = {
-                "max_completion_tokens": max_tokens,
-                "temperature": temperature,
-                "stop": stop,
-            }
             self.gen_kwargs = gen_kwargs.copy()
-            backend_kwargs = {
-                "invalid_finish_reasons": [
-                    "content_filter"
-                ],  # So it doesn't retry on `length` finish reason, but retries on "content_filter"
-            }
-            backend_kwargs.update(self.backend_params)
-
             self.llm = curator.LLM(
-                model_name=self.model_name, generation_params=gen_kwargs, backend_params=backend_kwargs
+                model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params
            )
         else:
-            assert self.gen_kwargs == gen_kwargs, "Generation parameters must be the same for all requests in curator"
-            assert self.eos == eos, "EOS must be the same for all requests in curator"
+            if self.gen_kwargs != gen_kwargs:
+                print(
+                    "Recreating curator LLM with new generation parameters, make sure this doesn't happen at every request"
+                )
+                self.llm = curator.LLM(
+                    model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params
+                )
+                self.gen_kwargs = gen_kwargs.copy()
         return messages

From f83bb6baae466fc6c88f925327fe6ef7604798bb Mon Sep 17 00:00:00 2001
From: Jean Mercat
Date: Wed, 22 Jan 2025 13:36:19 -0800
Subject: [PATCH 13/14] [wip] MTBench with curator is failing

---
 eval/chat_benchmarks/curator_lm.py | 47 ++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py
index 9233dfa2..1d39de4c 100644
--- a/eval/chat_benchmarks/curator_lm.py
+++ b/eval/chat_benchmarks/curator_lm.py
@@ -1,5 +1,6 @@
 import json
 import os
+import time
 from typing import Any, Dict, List, Optional, Tuple, Union

 from bespokelabs import curator
@@ -37,7 +38,6 @@ def __init__(

         max_requests_per_minute = max_requests_per_minute or 2000
         max_tokens_per_minute = max_tokens_per_minute or 80_000
-        os.environ["CURATOR_DISABLE_CACHE"] = "true"
         if tokenized_requests:
             raise NotImplementedError("Tokenized requests not implemented for curator.")
         self.tokenized_requests = False
@@ -87,17 +87,20 @@ def _create_payload(
             self.eos = eos
             self.gen_kwargs = gen_kwargs.copy()
             self.llm = curator.LLM(
-                model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params
+                model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
             )
         else:
             if self.gen_kwargs != gen_kwargs:
                 print(
                     "Recreating curator LLM with new generation parameters, make sure this doesn't happen at every request"
                 )
+                # Sleep to make sure the previous request is finished
+                time.sleep(10)
+                self.gen_kwargs = gen_kwargs.copy()
+                breakpoint()
                 self.llm = curator.LLM(
-                    model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params
+                    model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
                 )
-                self.gen_kwargs = gen_kwargs.copy()
         return messages

     def create_message(
@@ -157,6 +160,19 @@ def eot_token_id(self) -> Optional[int]:
         # Assuming the model has a specific end-of-text token ID
         return self.llm.eot_token_id  # Replace with actual method to get EOT token ID

+    def _group_gen_kwargs(self, gen_kwargs: List[dict]) -> List[Tuple[List[int], dict]]:
+        """
+        Group identical generation parameters together and return a list of indices that share the same generation parameters
+        """
+        ordered_set_gen_kwargs = []
+        for gkw in gen_kwargs:
+            if gkw not in ordered_set_gen_kwargs:
+                ordered_set_gen_kwargs.append(gkw)
+        return [
+            [i for i, gkw in enumerate(gen_kwargs) if gkw == ordered_set_gen_kwargs[j]]
+            for j in range(len(ordered_set_gen_kwargs))
+        ], ordered_set_gen_kwargs
+
     def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -> List[str]:
         # Tokenize contexts if required
         if self.tokenized_requests:
@@ -166,14 +182,21 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -
         contexts = [req.args[0] for req in requests]
         gen_kwargs = [req.args[1] for req in requests]

-        # Assert all gen_kwargs are the same
-        assert all(
-            gen_kwargs[0] == gkw for gkw in gen_kwargs
-        ), "Generation parameters must be the same for all requests in curator"
-
-        contexts_dataset = self.create_message(contexts)
-        payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0])
-        response = self.llm(payload)["response"]
+        list_indices, gen_kwargs = self._group_gen_kwargs(gen_kwargs)
+
+        # Curator needs a new object for each set of gen_kwargs
+        # Generate responses for each group of generation parameters
+        response_list = []
+        for indices, gkw in zip(list_indices, gen_kwargs):
+            contexts_dataset = self.create_message([contexts[i] for i in indices])
+            payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gkw)
+            response_list.append(self.llm(payload)["response"])
+
+        # Re-order responses to match the original request order
+        response = [None] * len(requests)
+        for i, indices in enumerate(list_indices):
+            for idx in indices:
+                response[idx] = response_list[i]

         return response

     def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]:

From e31b2cedceecb6368b7731ca81f193848cf36413 Mon Sep 17 00:00:00 2001
From: Jean Mercat
Date: Wed, 22 Jan 2025 13:48:14 -0800
Subject: [PATCH 14/14] remove breakpoint, MTBench still failing

---
 eval/chat_benchmarks/curator_lm.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py
index 1d39de4c..62e83f15 100644
--- a/eval/chat_benchmarks/curator_lm.py
+++ b/eval/chat_benchmarks/curator_lm.py
@@ -44,7 +44,6 @@ def __init__(
         self.max_length = max_length
         self.llm = None
         self.gen_kwargs = {}
-        self._max_gen_toks = 2048
         self.eos = None
         self.backend_params = {
             "invalid_finish_reasons": [
@@ -75,7 +74,7 @@ def _create_payload(
     ) -> dict:
         assert generate, "Curator only supports generation."
         # Create the payload for the API request
-        max_tokens = gen_kwargs.get("max_gen_toks", self._max_gen_toks)
+        max_tokens = gen_kwargs.get("max_gen_toks", self.max_length)
         temperature = gen_kwargs.get("temperature", 0)
         stop = handle_stop_sequences(gen_kwargs.get("until", None), eos)
         gen_kwargs = {
@@ -94,10 +93,7 @@ def _create_payload(
                 print(
                     "Recreating curator LLM with new generation parameters, make sure this doesn't happen at every request"
                 )
-                # Sleep to make sure the previous request is finished
-                time.sleep(10)
                 self.gen_kwargs = gen_kwargs.copy()
-                breakpoint()
                 self.llm = curator.LLM(
                     model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
                 )
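
PATCH 11/14 swaps parser.add_mutually_exclusive_group() for parser.add_argument_group("database") in setup_custom_parser(). The practical difference: an argument group only affects how the flags are listed in --help, while a mutually exclusive group rejects invocations that combine its flags. A minimal, self-contained sketch of that behaviour; --run_tag is a made-up second flag used purely for illustration and does not exist in the repo:

    import argparse

    parser = argparse.ArgumentParser()

    # add_argument_group only groups the flags under a "database" heading in --help;
    # any combination of the grouped flags is accepted.
    db_group = parser.add_argument_group("database")
    db_group.add_argument("--model_id", type=str, default=None, help="Model UUID for direct database tracking")
    db_group.add_argument("--run_tag", type=str, default=None, help="Illustrative second flag (not in the repo)")

    args = parser.parse_args(["--model_id", "1234", "--run_tag", "nightly"])
    print(args.model_id, args.run_tag)  # both values are accepted together

    # With parser.add_mutually_exclusive_group() instead, the same invocation would
    # exit with "argument --run_tag: not allowed with argument --model_id".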
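
The first diff above drops the hard-coded Gemini rate limits in favour of backend_kwargs.update(self.backend_params), and the CSV rows added in PATCH 12/14 show the limits travelling through the model_args string instead (max_requests_per_minute=10000,max_tokens_per_minute=10000000). A rough sketch of how those keyword arguments reach curator; the import path is an assumption (repo root on sys.path), and the numbers are the values that used to be hard-coded for gemini/gemini-1.5-flash, shown only as examples:

    # Sketch only: mirrors the __init__/backend_params wiring in curator_lm.py.
    from eval.chat_benchmarks.curator_lm import CuratorAPIModel

    lm = CuratorAPIModel(
        pretrained="gemini/gemini-1.5-flash",
        tokenized_requests=False,
        max_requests_per_minute=2_000,      # previously hard-coded for this model
        max_tokens_per_minute=4_000_000,    # previously hard-coded for this model
    )

    # Both limits are copied into lm.backend_params and forwarded to
    # curator.LLM(..., backend_params=...) the first time _create_payload runs.
    print(lm.backend_params["max_requests_per_minute"])  # 2000
    print(lm.backend_params["max_tokens_per_minute"])    # 4000000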
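
PATCH 13/14 replaces the "all gen_kwargs must be identical" assertion with _group_gen_kwargs, which batches requests that share a generation configuration so a curator.LLM object is rebuilt once per distinct configuration rather than once per request (the multi-turn case PATCH 12/14 targets). A standalone re-implementation of the grouping idea with a small worked example; the sample kwargs are illustrative only:

    from typing import List, Tuple

    def group_gen_kwargs(gen_kwargs: List[dict]) -> Tuple[List[List[int]], List[dict]]:
        """Return (index groups, distinct kwargs in first-seen order)."""
        distinct: List[dict] = []
        for gkw in gen_kwargs:
            if gkw not in distinct:
                distinct.append(gkw)
        groups = [[i for i, gkw in enumerate(gen_kwargs) if gkw == d] for d in distinct]
        return groups, distinct

    gen_kwargs = [
        {"max_gen_toks": 1024, "temperature": 0.7},  # turn 1 of request A
        {"max_gen_toks": 1024, "temperature": 0.7},  # turn 1 of request B
        {"max_gen_toks": 512, "temperature": 0.0},   # a follow-up turn with different params
    ]
    groups, distinct = group_gen_kwargs(gen_kwargs)
    print(groups)    # [[0, 1], [2]]
    print(distinct)  # [{'max_gen_toks': 1024, 'temperature': 0.7}, {'max_gen_toks': 512, 'temperature': 0.0}]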
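
As of PATCH 14/14, MTBench is still reported as failing. One visible suspect in the final generate_until is the re-ordering step: response[idx] = response_list[i] stores the whole per-group list of responses in every slot of that group rather than the individual response for that request. A hedged sketch of a corrected re-assembly, assuming curator returns responses in the same order as the prompts it was given within a group:

    from typing import List, Optional

    def reorder(list_indices: List[List[int]], response_list: List[List[str]], num_requests: int) -> List[Optional[str]]:
        """Scatter per-group responses back to their original request positions."""
        responses: List[Optional[str]] = [None] * num_requests
        for group_responses, indices in zip(response_list, list_indices):
            for position_in_group, original_index in enumerate(indices):
                responses[original_index] = group_responses[position_in_group]
        return responses

    # Two groups: requests 0 and 2 shared one configuration, request 1 used another.
    print(reorder([[0, 2], [1]], [["a0", "a2"], ["b1"]], 3))  # ['a0', 'b1', 'a2']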