Decrease memory usage at `import dspy` (stanfordnlp#7915)

* init
* remove pandas
* revert unrelated
* formatting
chenmoneygithub · Mar 6, 2025 · c0b79b0
1 parent a2fc6a1
commit c0b79b0
Show file tree

Hide file tree

Showing 7 changed files with 41 additions and 22 deletions.
diff --git a/dspy/clients/lm_local.py b/dspy/clients/lm_local.py
@@ -1,18 +1,17 @@
 import datetime
 import logging
 import random
-import requests
 import socket
 import string
 import subprocess
-import time
 import threading
+import time
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
-from datasets import Dataset
-from typing import Any, Dict, List, Optional
-from dspy.clients.provider import TrainingJob, Provider
+import requests
+
+from dspy.clients.provider import Provider, TrainingJob
 from dspy.clients.utils_finetune import TrainDataFormat, save_data
-from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from dspy.clients.lm import LM
@@ -29,7 +28,7 @@ def __init__(self):
     @staticmethod
     def launch(lm: "LM", launch_kwargs: Optional[Dict[str, Any]] = None):
         try:
-            import sglang # noqa: F401
+            import sglang  # noqa: F401
         except ImportError:
             raise ImportError(
                 "For local model launching, please install sglang by running "
@@ -229,6 +228,8 @@ def train_sft_locally(model_name, train_data, train_kwargs):
         train_kwargs["max_seq_length"] = 4096
         logger.info(f"The 'train_kwargs' parameter didn't include a 'max_seq_length', defaulting to {train_kwargs['max_seq_length']}")
 
+    from datasets import Dataset
+
     hf_dataset = Dataset.from_list(train_data)
     def tokenize_function(example):
         return encode_sft_example(example, tokenizer, train_kwargs["max_seq_length"])

diff --git a/dspy/clients/utils_finetune.py b/dspy/clients/utils_finetune.py
@@ -3,7 +3,6 @@
 from typing import Any, Dict, List, Optional
 
 import ujson
-from datasets.fingerprint import Hasher
 
 import dspy
 from dspy.adapters.base import Adapter
@@ -47,6 +46,8 @@ def write_lines(file_path, data):
 def save_data(
     data: List[Dict[str, Any]],
 ) -> str:
+    from datasets.fingerprint import Hasher
+
     # Assign a unique name to the file based on the data hash
     hash = Hasher.hash(data)
     file_name = f"{hash}.jsonl"

diff --git a/dspy/datasets/dataloader.py b/dspy/datasets/dataloader.py
@@ -1,13 +1,13 @@
 import random
 from collections.abc import Mapping
-from typing import List, Tuple, Union
-
-import pandas as pd
-from datasets import load_dataset
+from typing import TYPE_CHECKING, List, Tuple, Union
 
 import dspy
 from dspy.datasets.dataset import Dataset
 
+if TYPE_CHECKING:
+    import pandas as pd
+
 
 class DataLoader(Dataset):
     def __init__(self):
@@ -27,6 +27,8 @@ def from_huggingface(
         if not isinstance(input_keys, tuple):
             raise ValueError("Invalid input keys provided. Please provide a tuple of input keys.")
 
+        from datasets import load_dataset
+
         dataset = load_dataset(dataset_name, *args, **kwargs)
 
         if isinstance(dataset, list) and isinstance(kwargs["split"], list):
@@ -59,6 +61,8 @@ def from_huggingface(
                 ]
 
     def from_csv(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
+        from datasets import load_dataset
+
         dataset = load_dataset("csv", data_files=file_path)["train"]
 
         if not fields:
@@ -68,7 +72,7 @@ def from_csv(self, file_path: str, fields: List[str] = None, input_keys: Tuple[s
 
     def from_pandas(
         self,
-        df: pd.DataFrame,
+        df: "pd.DataFrame",
         fields: list[str] = None,
         input_keys: tuple[str] = (),
     ) -> list[dspy.Example]:
@@ -80,6 +84,8 @@ def from_pandas(
         ]
 
     def from_json(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
+        from datasets import load_dataset
+
         dataset = load_dataset("json", data_files=file_path)["train"]
 
         if not fields:
@@ -88,6 +94,8 @@ def from_json(self, file_path: str, fields: List[str] = None, input_keys: Tuple[
         return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]
 
     def from_parquet(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
+        from datasets import load_dataset
+
         dataset = load_dataset("parquet", data_files=file_path)["train"]
 
         if not fields:

diff --git a/dspy/datasets/gsm8k.py b/dspy/datasets/gsm8k.py
@@ -1,13 +1,14 @@
 import random
 
 import tqdm
-from datasets import load_dataset
 
 
 class GSM8K:
     def __init__(self):
         self.do_shuffle = False
 
+        from datasets import load_dataset
+
         dataset = load_dataset("gsm8k", "main")
 
         hf_official_train = dataset["train"]

diff --git a/dspy/datasets/hotpotqa.py b/dspy/datasets/hotpotqa.py
@@ -1,7 +1,5 @@
 import random
 
-from datasets import load_dataset
-
 from dspy.datasets.dataset import Dataset
 
 
@@ -20,6 +18,8 @@ def __init__(
             "Dev must be all hard to match official dev, but training can be flexible."
         )
 
+        from datasets import load_dataset
+
         hf_official_train = load_dataset("hotpot_qa", "fullwiki", split="train", trust_remote_code=True)
         hf_official_dev = load_dataset("hotpot_qa", "fullwiki", split="validation", trust_remote_code=True)
 

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
@@ -1,8 +1,10 @@
 import logging
 import types
-from typing import Any, Callable, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, List, Optional
+
+if TYPE_CHECKING:
+    import pandas as pd
 
-import pandas as pd
 import tqdm
 
 import dspy
@@ -183,6 +185,7 @@ def prediction_is_dictlike(prediction):
         ]
 
 
+        import pandas as pd
         # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0)
         result_df = pd.DataFrame(data)
         result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell)
@@ -252,7 +255,7 @@ def truncate_cell(content) -> str:
     return content
 
 
-def stylize_metric_name(df: pd.DataFrame, metric_name: str) -> pd.DataFrame:
+def stylize_metric_name(df: "pd.DataFrame", metric_name: str) -> "pd.DataFrame":
     """
     Stylize the cell contents of a pandas DataFrame corresponding to the specified metric name.
 
@@ -265,12 +268,14 @@ def stylize_metric_name(df: pd.DataFrame, metric_name: str) -> pd.DataFrame:
     return df
 
 
-def display_dataframe(df: pd.DataFrame):
+def display_dataframe(df: "pd.DataFrame"):
     """
     Display the specified Pandas DataFrame in the console.
 
     :param df: The Pandas DataFrame to display.
     """
+    import pandas as pd
+
     if is_in_ipython_notebook_environment():
         display(configure_dataframe_for_ipython_notebook_display(df))
     else:
@@ -281,8 +286,10 @@ def display_dataframe(df: pd.DataFrame):
             print(df)
 
 
-def configure_dataframe_for_ipython_notebook_display(df: pd.DataFrame) -> pd.DataFrame:
+def configure_dataframe_for_ipython_notebook_display(df: "pd.DataFrame") -> "pd.DataFrame":
     """Set various pandas display options for DataFrame in an IPython notebook environment."""
+    import pandas as pd
+
     pd.options.display.max_colwidth = 70
     return df
 

diff --git a/dspy/experimental/synthesizer/synthesizer.py b/dspy/experimental/synthesizer/synthesizer.py
@@ -2,7 +2,6 @@
 from collections.abc import Mapping
 from typing import List, Optional, Union
 
-from datasets import Dataset
 from rich import print as rprint
 from tqdm import tqdm, trange
 
@@ -241,6 +240,8 @@ def generate(
     def export(self, data: List[dspy.Example], path: str, mode: str = None, **kwargs):
         extension = mode or path.split(".")[-1]
 
+        from datasets import Dataset
+
         dataset = Dataset.from_list(
             [example.toDict() for example in data],
         )