Skip to content

Commit

Permalink
Decrease memory usage when importing dspy (stanfordnlp#7915)
Browse files — browse the repository at this point in the history
* init

* remove pandas

* revert unrelated changes

* formatting
  • Loading branch information
chenmoneygithub authored Mar 6, 2025
1 parent a2fc6a1 commit c0b79b0
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 22 deletions.
15 changes: 8 additions & 7 deletions dspy/clients/lm_local.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import datetime
import logging
import random
import requests
import socket
import string
import subprocess
import time
import threading
import time
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from datasets import Dataset
from typing import Any, Dict, List, Optional
from dspy.clients.provider import TrainingJob, Provider
import requests

from dspy.clients.provider import Provider, TrainingJob
from dspy.clients.utils_finetune import TrainDataFormat, save_data
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from dspy.clients.lm import LM
Expand All @@ -29,7 +28,7 @@ def __init__(self):
@staticmethod
def launch(lm: "LM", launch_kwargs: Optional[Dict[str, Any]] = None):
try:
import sglang # noqa: F401
import sglang # noqa: F401
except ImportError:
raise ImportError(
"For local model launching, please install sglang by running "
Expand Down Expand Up @@ -229,6 +228,8 @@ def train_sft_locally(model_name, train_data, train_kwargs):
train_kwargs["max_seq_length"] = 4096
logger.info(f"The 'train_kwargs' parameter didn't include a 'max_seq_length', defaulting to {train_kwargs['max_seq_length']}")

from datasets import Dataset

hf_dataset = Dataset.from_list(train_data)
def tokenize_function(example):
return encode_sft_example(example, tokenizer, train_kwargs["max_seq_length"])
Expand Down
3 changes: 2 additions & 1 deletion dspy/clients/utils_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import Any, Dict, List, Optional

import ujson
from datasets.fingerprint import Hasher

import dspy
from dspy.adapters.base import Adapter
Expand Down Expand Up @@ -47,6 +46,8 @@ def write_lines(file_path, data):
def save_data(
data: List[Dict[str, Any]],
) -> str:
from datasets.fingerprint import Hasher

# Assign a unique name to the file based on the data hash
hash = Hasher.hash(data)
file_name = f"{hash}.jsonl"
Expand Down
18 changes: 13 additions & 5 deletions dspy/datasets/dataloader.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import random
from collections.abc import Mapping
from typing import List, Tuple, Union

import pandas as pd
from datasets import load_dataset
from typing import TYPE_CHECKING, List, Tuple, Union

import dspy
from dspy.datasets.dataset import Dataset

if TYPE_CHECKING:
import pandas as pd


class DataLoader(Dataset):
def __init__(self):
Expand All @@ -27,6 +27,8 @@ def from_huggingface(
if not isinstance(input_keys, tuple):
raise ValueError("Invalid input keys provided. Please provide a tuple of input keys.")

from datasets import load_dataset

dataset = load_dataset(dataset_name, *args, **kwargs)

if isinstance(dataset, list) and isinstance(kwargs["split"], list):
Expand Down Expand Up @@ -59,6 +61,8 @@ def from_huggingface(
]

def from_csv(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files=file_path)["train"]

if not fields:
Expand All @@ -68,7 +72,7 @@ def from_csv(self, file_path: str, fields: List[str] = None, input_keys: Tuple[s

def from_pandas(
self,
df: pd.DataFrame,
df: "pd.DataFrame",
fields: list[str] = None,
input_keys: tuple[str] = (),
) -> list[dspy.Example]:
Expand All @@ -80,6 +84,8 @@ def from_pandas(
]

def from_json(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
from datasets import load_dataset

dataset = load_dataset("json", data_files=file_path)["train"]

if not fields:
Expand All @@ -88,6 +94,8 @@ def from_json(self, file_path: str, fields: List[str] = None, input_keys: Tuple[
return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset]

def from_parquet(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
from datasets import load_dataset

dataset = load_dataset("parquet", data_files=file_path)["train"]

if not fields:
Expand Down
3 changes: 2 additions & 1 deletion dspy/datasets/gsm8k.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import random

import tqdm
from datasets import load_dataset


class GSM8K:
def __init__(self):
self.do_shuffle = False

from datasets import load_dataset

dataset = load_dataset("gsm8k", "main")

hf_official_train = dataset["train"]
Expand Down
4 changes: 2 additions & 2 deletions dspy/datasets/hotpotqa.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import random

from datasets import load_dataset

from dspy.datasets.dataset import Dataset


Expand All @@ -20,6 +18,8 @@ def __init__(
"Dev must be all hard to match official dev, but training can be flexible."
)

from datasets import load_dataset

hf_official_train = load_dataset("hotpot_qa", "fullwiki", split="train", trust_remote_code=True)
hf_official_dev = load_dataset("hotpot_qa", "fullwiki", split="validation", trust_remote_code=True)

Expand Down
17 changes: 12 additions & 5 deletions dspy/evaluate/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import logging
import types
from typing import Any, Callable, List, Optional
from typing import TYPE_CHECKING, Any, Callable, List, Optional

if TYPE_CHECKING:
import pandas as pd

import pandas as pd
import tqdm

import dspy
Expand Down Expand Up @@ -183,6 +185,7 @@ def prediction_is_dictlike(prediction):
]


import pandas as pd
# Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0)
result_df = pd.DataFrame(data)
result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell)
Expand Down Expand Up @@ -252,7 +255,7 @@ def truncate_cell(content) -> str:
return content


def stylize_metric_name(df: pd.DataFrame, metric_name: str) -> pd.DataFrame:
def stylize_metric_name(df: "pd.DataFrame", metric_name: str) -> "pd.DataFrame":
"""
Stylize the cell contents of a pandas DataFrame corresponding to the specified metric name.
Expand All @@ -265,12 +268,14 @@ def stylize_metric_name(df: pd.DataFrame, metric_name: str) -> pd.DataFrame:
return df


def display_dataframe(df: pd.DataFrame):
def display_dataframe(df: "pd.DataFrame"):
"""
Display the specified Pandas DataFrame in the console.
:param df: The Pandas DataFrame to display.
"""
import pandas as pd

if is_in_ipython_notebook_environment():
display(configure_dataframe_for_ipython_notebook_display(df))
else:
Expand All @@ -281,8 +286,10 @@ def display_dataframe(df: pd.DataFrame):
print(df)


def configure_dataframe_for_ipython_notebook_display(df: pd.DataFrame) -> pd.DataFrame:
def configure_dataframe_for_ipython_notebook_display(df: "pd.DataFrame") -> "pd.DataFrame":
"""Set various pandas display options for DataFrame in an IPython notebook environment."""
import pandas as pd

pd.options.display.max_colwidth = 70
return df

Expand Down
3 changes: 2 additions & 1 deletion dspy/experimental/synthesizer/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from collections.abc import Mapping
from typing import List, Optional, Union

from datasets import Dataset
from rich import print as rprint
from tqdm import tqdm, trange

Expand Down Expand Up @@ -241,6 +240,8 @@ def generate(
def export(self, data: List[dspy.Example], path: str, mode: str = None, **kwargs):
extension = mode or path.split(".")[-1]

from datasets import Dataset

dataset = Dataset.from_list(
[example.toDict() for example in data],
)
Expand Down

0 comments on commit c0b79b0

Please sign in to comment.