diff --git a/README.md b/README.md index e7a702285..412b35343 100644 --- a/README.md +++ b/README.md @@ -175,14 +175,15 @@ pip install -v -e .[tools] # install a subset of tools dependencies The dependency options are listed below: -| Tag | Description | -|--------------|----------------------------------------------------------------------------------------------| +| Tag | Description | +|------------------|----------------------------------------------------------------------------------------------| | `.` or `.[mini]` | Install minimal dependencies for basic Data-Juicer. | -| `.[all]` | Install all optional dependencies (including minimal dependencies and all of the following). | -| `.[sci]` | Install all dependencies for all OPs. | -| `.[dist]` | Install dependencies for distributed data processing. (Experimental) | -| `.[dev]` | Install dependencies for developing the package as contributors. | -| `.[tools]` | Install dependencies for dedicated tools, such as quality classifiers. | +| `.[all]` | Install all optional dependencies (including minimal dependencies and all of the following). | +| `.[sci]` | Install all dependencies for all OPs. | +| `.[sandbox]` | Install basic dependencies for the sandbox laboratory. | +| `.[dist]` | Install dependencies for distributed data processing. (Experimental) | +| `.[dev]` | Install dependencies for developing the package as contributors. | +| `.[tools]` | Install dependencies for dedicated tools, such as quality classifiers. 
| ### Using pip diff --git a/README_ZH.md b/README_ZH.md index 04f8dc70a..c50e6655f 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -158,14 +158,15 @@ pip install -v -e .[tools] # 安装部分工具库的依赖 依赖选项如下表所示: -| 标签 | 描述 | -|--------------|------------------------------| +| 标签 | 描述 | +|------------------|------------------------------| | `.` 或者 `.[mini]` | 安装支持 Data-Juicer 基础功能的最小依赖项 | -| `.[all]` | 安装所有可选依赖项(包括最小依赖项以及下面所有依赖项) | -| `.[sci]` | 安装所有算子的全量依赖 | -| `.[dist]` | 安装以分布式方式进行数据处理的依赖(实验性功能) | -| `.[dev]` | 安装作为贡献者开发 Data-Juicer 所需的依赖项 | -| `.[tools]` | 安装专用工具库(如质量分类器)所需的依赖项 | +| `.[all]` | 安装所有可选依赖项(包括最小依赖项以及下面所有依赖项) | +| `.[sci]` | 安装所有算子的全量依赖 | +| `.[sandbox]` | 安装沙盒实验室的基础依赖 | +| `.[dist]` | 安装以分布式方式进行数据处理的依赖(实验性功能) | +| `.[dev]` | 安装作为贡献者开发 Data-Juicer 所需的依赖项 | +| `.[tools]` | 安装专用工具库(如质量分类器)所需的依赖项 | ### 使用 pip 安装 diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 2ca191c66..50ccc1014 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -13,14 +13,21 @@ OP_NAME = 'image_deduplicator' with AvailabilityChecking(['imagededup'], OP_NAME): - from imagededup.methods import AHash, DHash, PHash, WHash + import imagededup # noqa: F401 - HASH_METHOD = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} + + def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash + + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -40,10 +47,10 @@ def __init__(self, method: str = 'phash', *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) - if method not in HASH_METHOD.keys(): + if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. 
' - f'Can only be one of {HASH_METHOD.keys()}.') - self.hasher = HASH_METHOD[method]() + f'Can only be one of {HASH_METHOD}.') + self.hasher = get_hash_method(method)() def compute_hash(self, sample, context=False): # check if it's computed already diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index a95cd3baa..10530c48b 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -11,14 +11,21 @@ OP_NAME = 'ray_image_deduplicator' with AvailabilityChecking(['imagededup'], OP_NAME): - from imagededup.methods import AHash, DHash, PHash, WHash + import imagededup # noqa: F401 - HASH_METHOD = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} + + def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash + + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -46,10 +53,10 @@ def __init__(self, redis_port=redis_port, *args, **kwargs) - if method not in HASH_METHOD.keys(): + if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. 
' - f'Can only be one of {HASH_METHOD.keys()}.') - self.hasher = HASH_METHOD[method]() + f'Can only be one of {HASH_METHOD}.') + self.hasher = get_hash_method(method)() def calculate_hash(self, sample, context=False): if self.image_key not in sample or not sample[self.image_key]: diff --git a/docs/Sandbox-ZH.md b/docs/Sandbox-ZH.md index 9292a3083..6357b8117 100644 --- a/docs/Sandbox-ZH.md +++ b/docs/Sandbox-ZH.md @@ -4,6 +4,19 @@ 用户在沙盒中,除了Data-Juicer基础的数据优化与数据菜谱微调功能外,还可以便捷地使用数据洞察与分析、沙盒模型训练与评测、基于数据和模型反馈优化数据菜谱等可配置组件,共同组成完整的一站式数据-模型研发流水线。 ## 快速上手 +### 依赖准备 +在使用沙盒实验室前,你可能需要使用如下命令安装沙盒相关的第三方依赖: +```shell +pip install -v -e .[sandbox] + +# 或者直接安装全量依赖 +pip install -v -e .[all] +``` + +**注意**:一些沙盒的依赖还需要额外的领域依赖。例如,如果用户想要在沙盒中训练一个 ModelScope 平台的NLP模型,那可能需要为 `modelscope` 库 +安装额外的 `nlp` 领域依赖(参考其[安装文档](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85) )。 +因此如果使用沙盒过程中,这些第三方依赖抛出了一些"未找到模块(Module-Not-Found)"的报错时,用户需要先检查这些库的文档以寻求帮助。 + ### 准备沙盒配置文件 沙盒的主配置文件除了Data-Juicer的配置文件外,还包括了若干额外的参数用于指定沙盒流水线中可能会运行的模型训练、推理、评测等步骤的配置信息,完整的额外参数可参考 [config_all.yaml](https://github.com/modelscope/data-juicer/blob/main/configs/config_all.yaml) 中的“for sandbox or hpo”部分参数。一个sandbox的配置文件示例可参考`configs/demo/sandbox/sandbox.yaml`: ```yaml diff --git a/docs/Sandbox.md b/docs/Sandbox.md index 8e1d8e622..2eda98095 100644 --- a/docs/Sandbox.md +++ b/docs/Sandbox.md @@ -4,6 +4,19 @@ In Data-Juicer, the data sandbox laboratory provides users with the best practic In addition to the basic data optimization and recipe refinement features offered by Data-Juicer, users can seamlessly use configurable components such as data probe and analysis, model training and evaluation, and data and model feedback-based recipe refinement to form a complete one-stop data-model research and development pipeline. 
## Quick Start +### Requirements +Before using the sandbox, you might need to install sandbox-related third-party dependencies by running the command below: +```shell +pip install -v -e .[sandbox] + +# or install all dependencies +pip install -v -e .[all] +``` + +**NOTICE**: some sandbox-related dependencies require extra domain dependencies. For example, if you want to train an NLP model from ModelScope +in the sandbox, you might need to install the extra `nlp` dependencies for the `modelscope` library (see the [installation docs](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)). +So if these third-party libraries raise Module-Not-Found errors when running the sandbox, check their docs first. + ### Prepare Configuration Files for Sandbox The configuration file of the sandbox includes several additional parameters in addition to the configuration of Data-Juicer. These parameters are used to specify the configuration information for model training, inference, evaluation, and other steps that may run in the sandbox pipeline. For the complete set of additional parameters, please refer to the "for sandbox or hpo" section in the [config_all.yaml](https://github.com/modelscope/data-juicer/blob/main/configs/config_all.yaml). 
An example of a sandbox configuration file can be found in `configs/demo/sandbox/sandbox.yaml`: ```yaml diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index b273872b0..c162fb21d 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -1,7 +1,7 @@ fsspec==2023.5.0 pyarrow<=12.0.0 pandas==2.0.3 -datasets==2.11.0 +datasets==2.18.0 av soundfile librosa diff --git a/environments/sandbox_requires.txt b/environments/sandbox_requires.txt new file mode 100644 index 000000000..b059d146a --- /dev/null +++ b/environments/sandbox_requires.txt @@ -0,0 +1,3 @@ +wandb +# modelscope-related +modelscope diff --git a/environments/science_requires.txt b/environments/science_requires.txt index 4faa330c8..e848ea5ba 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -1,3 +1,5 @@ +torch>=1.11.0 +torchaudio easyocr fasttext-wheel kenlm @@ -16,8 +18,6 @@ accelerate tiktoken opencc==1.1.6 imagededup -torch -torchaudio dlib spacy-pkuseg==0.0.32 diffusers diff --git a/setup.py b/setup.py index 0cf944927..18c78b368 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,8 @@ def get_install_requirements(require_f_paths, env_dir='environments'): 'tools': get_install_requirements( ['preprocess_requires.txt', 'quality_classifier_requires.txt']), + 'sandbox': + get_install_requirements(['sandbox_requires.txt']), } extra_requires['all'] = [v for v in extra_requires.values()] diff --git a/tools/sandbox_starter.py b/tools/sandbox_starter.py index 7b8298b50..e1f8d91a4 100644 --- a/tools/sandbox_starter.py +++ b/tools/sandbox_starter.py @@ -1,9 +1,10 @@ import json import yaml +from jsonargparse import dict_to_namespace from loguru import logger -from data_juicer.config import dict_to_namespace, init_configs +from data_juicer.config import init_configs from data_juicer.core.sandbox.pipelines import SandBoxExecutor