modelscope · HYLcool · Aug 29, 2024 · Jul 4, 2024 · Jul 5, 2024 · Jul 11, 2024
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,4 @@ dist
 .idea/
 wandb/
 __pycache__
+.vscode/
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -62,8 +62,29 @@ process:
   - clean_copyright_mapper:                                 # remove copyright comments.
   - expand_macro_mapper:                                    # expand macro definitions in Latex text.
   - extract_qa_mapper:                                      # mapper to extract question and answer pair from text.
-      hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'
+      hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'           # model name on huggingface to extract question and answer pair.
+      pattern: null                                           # regular expression pattern to search for within text.
+      qa_format: 'chatml'                                     # Output format of question and answer pair.
+      enable_vllm: true                                       # Whether to use vllm for inference acceleration.
+      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
+      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
+      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
+      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - fix_unicode_mapper:                                     # fix unicode errors in text.
+  - generate_instruction_mapper:                            # generate new instruction text data.
+      hf_model: 'Qwen/Qwen-7B-Chat'                           # model name on huggingface to generate instruction.
+      seed_file: 'demos/data/demo-dataset-chatml.jsonl'       # Seed file as instruction samples to generate new instructions, chatml format.
+      instruct_num: 3                                         # the number of generated samples.
+      similarity_threshold: 0.7                               # the similarity score threshold between the generated samples and the seed samples.Range from 0 to 1. Samples with similarity score less than this threshold will be kept.
+      prompt_template: null                                   # Prompt template for generate samples. Please make sure the template contains "{augmented_data}", which corresponds to the augmented samples.
+      qa_pair_template: null                                  # Prompt template for generate question and answer pair description. Please make sure the template contains two "{}" to format question and answer. Default: '【问题】\n{}\n【回答】\n{}\n'.
+      example_template: null                                  # Prompt template for generate examples. Please make sure the template contains "{qa_pairs}", which corresponds to the question and answer pair description generated by param `qa_pair_template`.
+      qa_extraction_pattern: null                             # Regular expression pattern for parsing question and answer from model response.
+      enable_vllm: true                                       # Whether to use vllm for inference acceleration.
+      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
+      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
+      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
+      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - image_blur_mapper:                                      # mapper to blur images.
       p: 0.2                                                  # probability of the image being blured
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
@@ -123,6 +144,13 @@ process:
       delete_random_char: false                               # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
       swap_random_char: false                                 # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
       replace_equivalent_num: false                           # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
+  - optimize_instruction_mapper:                            # optimize instruction.
+      hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine'        # model name on huggingface to optimize instruction
+      enable_vllm: true                                       # whether to use vllm for inference acceleration.
+      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
+      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
+      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
+      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - punctuation_normalization_mapper:                       # normalize unicode punctuations to English punctuations.
   - remove_bibliography_mapper:                             # remove bibliography from Latex text.
   - remove_comments_mapper:                                 # remove comments from Latex text, code, etc.

diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py
@@ -86,6 +86,13 @@ def init_configs(args=None):
         help='Path to datasets with optional weights(0.0-1.0), 1.0 as '
         'default. Accepted format:<w1> dataset1-path <w2> dataset2-path '
         '<w3> dataset3-path ...')
+    parser.add_argument(
+        '--generated_dataset_config',
+        type=Dict,
+        default=None,
+        help='Configuration used to create a dataset. '
+        'The dataset will be created from this configuration if provided. '
+        'It must contain the `type` field to specify the dataset name.')
     parser.add_argument(
         '--export_path',
         type=str,
@@ -371,7 +378,7 @@ def init_setup_from_cfg(cfg):
                  redirect=cfg.executor_type == 'default')
 
     # check and get dataset dir
-    if os.path.exists(cfg.dataset_path):
+    if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path):
         cfg.dataset_path = os.path.abspath(cfg.dataset_path)
         if os.path.isdir(cfg.dataset_path):
             cfg.dataset_dir = cfg.dataset_path

diff --git a/data_juicer/core/executor.py b/data_juicer/core/executor.py
@@ -49,6 +49,7 @@ def __init__(self, cfg=None):
         # setup formatter
         logger.info('Setting up data formatter...')
         self.formatter = load_formatter(self.cfg.dataset_path,
+                                        self.cfg.generated_dataset_config,
                                         self.cfg.text_keys, self.cfg.suffixes,
                                         self.cfg.add_suffix)
 

diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py
@@ -47,7 +47,17 @@ def run(self, load_data_np=None):
         """
         # 1. load data
         logger.info('Loading dataset with Ray...')
-        dataset = rd.read_json(self.cfg.dataset_path)
+
+        if self.cfg.get('generated_dataset_config', None):
+            generated_dataset_config = self.cfg.generated_dataset_config
+            assert isinstance(generated_dataset_config,
+                              dict) and 'type' in generated_dataset_config
+            args = generated_dataset_config.copy()
+            obj_name = args.pop('type')
+            from data_juicer.format.formatter import FORMATTERS
+            dataset = FORMATTERS.modules[obj_name](**args).load_dataset()
+        else:
+            dataset = rd.read_json(self.cfg.dataset_path)
 
         # convert all the path in dataset to absolute path
         dataset = RayDataset(dataset, self.cfg.dataset_path, self.cfg)

diff --git a/data_juicer/format/__init__.py b/data_juicer/format/__init__.py
@@ -1,6 +1,8 @@
-from . import (csv_formatter, json_formatter, mixture_formatter,
-               parquet_formatter, text_formatter, tsv_formatter)
+from . import (csv_formatter, empty_formatter, json_formatter,
+               mixture_formatter, parquet_formatter, text_formatter,
+               tsv_formatter)
 from .csv_formatter import CsvFormatter
+from .empty_formatter import EmptyFormatter, RayEmptyFormatter
 from .formatter import LocalFormatter, RemoteFormatter
 from .json_formatter import JsonFormatter
 from .load import load_formatter
@@ -12,5 +14,5 @@
 __all__ = [
     'load_formatter', 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter',
     'TextFormatter', 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter',
-    'MixtureFormatter'
+    'MixtureFormatter', 'EmptyFormatter', 'RayEmptyFormatter'
 ]
diff --git a/data_juicer/format/empty_formatter.py b/data_juicer/format/empty_formatter.py
@@ -0,0 +1,84 @@
+from typing import List
+
+import pandas as pd
+import ray
+from datasets import Dataset, Features, Value
+
+from .formatter import FORMATTERS, BaseFormatter
+
+
+@FORMATTERS.register_module()
+class EmptyFormatter(BaseFormatter):
+    """
+    The class is used to create empty data.
+    """
+    SUFFIXES = []
+
+    def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
+        """
+        Initialization method.
+
+        :param length: The empty dataset length.
+        :param feature_keys: feature key name list.
+        """
+        self.length = length
+        self.feature_keys = feature_keys
+        if isinstance(self.feature_keys, str):
+            self.feature_keys = [self.feature_keys]
+
+    @property
+    def null_value(self):
+        return None
+
+    def load_dataset(self, *args, **kwargs):
+        data_dict = {}
+        features = Features()
+
+        for key in self.feature_keys:
+            features.update({key: Value('string')})
+            data_dict.update(
+                {key: [self.null_value for _ in range(self.length)]})
+
+        empty_dataset = Dataset.from_dict(data_dict, features=features)
+
+        from data_juicer.core.data import NestedDataset
+        empty_dataset = NestedDataset(empty_dataset)
+
+        return empty_dataset
+
+
+@FORMATTERS.register_module()
+class RayEmptyFormatter(BaseFormatter):
+    """
+    The class is used to create empty data for ray.
+    """
+    SUFFIXES = []
+
+    def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
+        """
+        Initialization method.
+
+        :param length: The empty dataset length.
+        :param feature_keys: feature key name list.
+        """
+        self.length = length
+        self.feature_keys = feature_keys
+        if isinstance(self.feature_keys, str):
+            self.feature_keys = [self.feature_keys]
+
+    @property
+    def null_value(self):
+        return {}
+
+    def load_dataset(self, *args, **kwargs):
+        if len(self.feature_keys):
+            df = pd.DataFrame({
+                col: [self.null_value for _ in range(self.length)]
+                for col in self.feature_keys
+            })
+        else:
+            df = pd.DataFrame([self.null_value for _ in range(self.length)])
+
+        empty_dataset = ray.data.from_pandas(df)
+
+        return empty_dataset
diff --git a/data_juicer/format/load.py b/data_juicer/format/load.py
@@ -3,6 +3,7 @@
 
 
 def load_formatter(dataset_path,
+                   generated_dataset_config=None,
                    text_keys=None,
                    suffixes=[],
                    add_suffix=False,
@@ -12,13 +13,26 @@ def load_formatter(dataset_path,
     weight(default 1.0) according to their formats.
 
     :param dataset_path: path to a dataset file or a dataset directory
+    :param generated_dataset_config: Configuration used to create a dataset.
+        The dataset will be created from this configuration if provided.
+        It must contain the `type` field to specify the dataset name.
     :param text_keys: key names of field that stores sample text.
         Default: None
     :param suffixes: files with specified suffixes to be processed.
     :param add_suffix: whether to add the file suffix to dataset meta
         info
     :return: a dataset formatter.
     """
+    if generated_dataset_config:
+        assert isinstance(generated_dataset_config,
+                          dict) and 'type' in generated_dataset_config
+        args = generated_dataset_config.copy()
+        obj_name = args.pop('type')
+        args.update(kwargs)
+
+        from .formatter import FORMATTERS
+        return FORMATTERS.modules[obj_name](**args)
+
     formatter = MixtureFormatter(dataset_path=dataset_path,
                                  text_keys=text_keys,
                                  suffixes=suffixes,

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
@@ -2,10 +2,11 @@
 from . import (audio_ffmpeg_wrapped_mapper, chinese_convert_mapper,
                clean_copyright_mapper, clean_email_mapper, clean_html_mapper,
                clean_ip_mapper, clean_links_mapper, expand_macro_mapper,
-               extract_qa_mapper, fix_unicode_mapper, image_blur_mapper,
+               extract_qa_mapper, fix_unicode_mapper,
+               generate_instruction_mapper, image_blur_mapper,
                image_captioning_from_gpt4v_mapper, image_captioning_mapper,
                image_diffusion_mapper, image_face_blur_mapper,
-               nlpaug_en_mapper, nlpcda_zh_mapper,
+               nlpaug_en_mapper, nlpcda_zh_mapper, optimize_instruction_mapper,
                punctuation_normalization_mapper, remove_bibliography_mapper,
                remove_comments_mapper, remove_header_mapper,
                remove_long_words_mapper, remove_non_chinese_character_mapper,
@@ -34,13 +35,15 @@
 from .expand_macro_mapper import ExpandMacroMapper
 from .extract_qa_mapper import ExtractQAMapper
 from .fix_unicode_mapper import FixUnicodeMapper
+from .generate_instruction_mapper import GenerateInstructionMapper
 from .image_blur_mapper import ImageBlurMapper
 from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper
 from .image_captioning_mapper import ImageCaptioningMapper
 from .image_diffusion_mapper import ImageDiffusionMapper
 from .image_face_blur_mapper import ImageFaceBlurMapper
 from .nlpaug_en_mapper import NlpaugEnMapper
 from .nlpcda_zh_mapper import NlpcdaZhMapper
+from .optimize_instruction_mapper import OptimizeInstructionMapper
 from .punctuation_normalization_mapper import PunctuationNormalizationMapper
 from .remove_bibliography_mapper import RemoveBibliographyMapper
 from .remove_comments_mapper import RemoveCommentsMapper
@@ -92,6 +95,7 @@
     'VideoFFmpegWrappedMapper',
     'ChineseConvertMapper',
     'NlpcdaZhMapper',
+    'OptimizeInstructionMapper',
     'ImageBlurMapper',
     'CleanCopyrightMapper',
     'RemoveNonChineseCharacterlMapper',
@@ -108,6 +112,7 @@
     'RemoveWordsWithIncorrectSubstringsMapper',
     'VideoCaptioningFromVideoMapper',
     'VideoCaptioningFromSummarizerMapper',
+    'GenerateInstructionMapper',
     'FixUnicodeMapper',
     'NlpaugEnMapper',
     'VideoCaptioningFromFramesMapper',
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,3 +14,4 @@ dist @@
     .idea/
     wandb/
     __pycache__
+    .vscode/