Fix (#572)

modelscope · Feb 11, 2025 · b91683b · b91683b
1 parent bd12da3
commit b91683b
Show file tree

Hide file tree

Showing 24 changed files with 48 additions and 48 deletions.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -167,7 +167,7 @@ process:
       system_prompt_template: null                            # System prompt template for the task. Need to be specified by given entity and attribute.
       input_template: null                                    # Template for building the model input.
       attr_pattern_template: null                             # Pattern for parsing the attribute from output. Need to be specified by given attribute.
-      demo_pattern: null                                      # Pattern for parsing the demonstraction from output to support the attribute.
+      demo_pattern: null                                      # Pattern for parsing the demonstration from output to support the attribute.
       try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
       drop_text: false                                        # Whether to drop the text in the output.
       model_params: {}                                        # Parameters for initializing the API model.
@@ -265,10 +265,10 @@ process:
       sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
       mem_required: '31GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_blur_mapper:                                      # mapper to blur images.
-      p: 0.2                                                  # probability of the image being blured
+      p: 0.2                                                  # probability of the image being blurred
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
       radius: 2                                               # radius of blur kernel
-  - image_captioning_from_gpt4v_mapper:                     # generate samples whose texts are generated based on gpt-4-visison and the image
+  - image_captioning_from_gpt4v_mapper:                     # generate samples whose texts are generated based on gpt-4-vision and the image
       mode: 'description'                                     # mode of text generated from images, can be one of ['resoning', 'description', 'conversation', 'custom']
       api_key: ''                                             # the API key to authenticate the request
       max_token: 500                                          # the maximum number of tokens to generate. Default is 500.
@@ -370,29 +370,29 @@ process:
       lambda_str: ''                                          # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
       batched: False                                          # A boolean indicating whether to process input data in batches.
   - query_intent_detection_mapper:                          # Mapper to predict user's Intent label in query.
-      hf_model: 'bespin-global/klue-roberta-small-3i4k-intent-classification'     # Hugginface model ID to predict intent label.
+      hf_model: 'bespin-global/klue-roberta-small-3i4k-intent-classification'     # Huggingface model ID to predict intent label.
       zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en'         # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
       model_params: {}                                        # model param for hf_model.
       zh_to_en_model_params: {}                               # model param for zh_to_en_hf_model.
       label_key: 'query_intent_label'                         # The key name in the meta field to store the output label. It is 'query_intent_label' in default.
       score_key: 'query_intent_label_score'                   # The key name in the meta field to store the corresponding label score. It is 'query_intent_label_score' in default.
   - query_sentiment_detection_mapper:                       # Mapper to predict user's sentiment label ('negative', 'neutral' and 'positive') in query.
-      hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'     # Hugginface model ID to predict sentiment label.
+      hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'     # Huggingface model ID to predict sentiment label.
       zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en'         # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
       model_params: {}                                        # model param for hf_model.
       zh_to_en_model_params: {}                               # model param for zh_to_en_hf_model.
       label_key: 'query_sentiment_label'                      # The key name in the meta field to store the output label. It is 'query_sentiment_label' in default.
       score_key: 'query_sentiment_label_score'                # The key name in the meta field to store the corresponding label score. It is 'query_sentiment_label_score' in default.
   - query_topic_detection_mapper:                           # Mapper to predict user's topic label in query.
-      hf_model: 'dstefa/roberta-base_topic_classification_nyt_news'     # Hugginface model ID to predict topic label.
+      hf_model: 'dstefa/roberta-base_topic_classification_nyt_news'     # Huggingface model ID to predict topic label.
       zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en'         # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
       model_params: {}                                        # model param for hf_model.
       zh_to_en_model_params: {}                               # model param for zh_to_en_hf_model.
       label_key: 'query_topic_label'                          # The key name in the meta field to store the output label. It is 'query_topic_label' in default.
       score_key: 'query_topic_label_score'                    # The key name in the meta field to store the corresponding label score. It is 'query_topic_label_score' in default.
   - relation_identity_mapper:                               # identify the relation between two entities in the text.
       api_model: 'gpt-4o'                                     # API model name.
-      source_entity: '孙悟空'                                  # The source entity of the relation to be dentified.
+      source_entity: '孙悟空'                                  # The source entity of the relation to be identified.
       target_entity: '猪八戒'                                  # The target entity of the relation to be identified.
       output_key: 'role_relation'                             # The output key in the meta field in the samples. It is 'role_relation' in default.
       api_endpoint: null                                      # URL endpoint for the API.
@@ -453,7 +453,7 @@ process:
       max_len: 2000                                           # Split text into multi texts with this max len if not None.
       split_pattern: '\n\n'                                   # Make sure split in this pattern if it is not None and force cut if the length exceeds max_len.
       overlap_len: 200                                        # Overlap length of the split texts if not split in the split pattern.
-      tokenizer: 'gpt-4o'                                     # The tokenizer name of Hugging Face tokenizers. The text length will be calculate as the token num if it is offerd. Otherwise, the text length equals to string length.
+      tokenizer: 'gpt-4o'                                     # The tokenizer name of Hugging Face tokenizers. The text length will be calculated as the token num if it is offered. Otherwise, the text length equals the string length.
       trust_remote_code: True                                 # for loading huggingface model.
   - video_captioning_from_audio_mapper:                     # caption a video according to its audio streams based on Qwen-Audio model
       keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
@@ -511,7 +511,7 @@ process:
       roi_type: ratio                                         # the roi string type. When the type is 'pixel', (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is 'ratio', the coordinates are normalized by widths and heights.
       roi_key: null                                           # the key name of fields in samples to store roi_strings for each sample. It's used for set different rois for different samples.
       frame_num: 10                                           # the number of frames to be extracted uniformly from the video to detect the pixels of watermark.
-      min_frame_threshold: 7                                  # a coodination is considered as the location of a watermark pixel when it is a watermark pixel in no less min_frame_threshold frames.
+      min_frame_threshold: 7                                  # a coordinate is considered as the location of a watermark pixel when it is a watermark pixel in no less than min_frame_threshold frames.
       detection_method: pixel_value                           # the method to detect the pixels of watermark. If it is 'pixel_value', we consider the distribution of pixel value in each frame. If it is 'pixel_diversity', we will consider the pixel diversity in different frames.
   - video_resize_aspect_ratio_mapper:                       # resize videos aspect ratios of videos (a fraction of width by height, r=w/h) to a specified range
       min_ratio: 9/21                                         # the minimum aspect ratio to enforce videos with an aspect ratio below `min_ratio` will be resized to match this minimum ratio. The ratio should be provided as a string in the format "9:21" or "9/21".
@@ -845,7 +845,7 @@ process:
       ignore_pattern: null                                    # whether to ignore sub-strings with specific pattern when computing simhash.
       tokenizer_model: null                                   # path for the sentencepiece model, used for sentencepiece tokenization.
       union_find_parallel_num: 'auto'                         # number of parallel workers for union-find algorithm. Default it's 'auto', and it will be determined by half of the number of CPUs.
-      union_threshold: 256                                    # threshold for minhash values group to perform union-find algorightm.
+      union_threshold: 256                                    # threshold for minhash values group to perform union-find algorithm.
       max_pending_edge_buffer_task: 20                        # max number of pending edge buffer ray tasks.
       num_edge_buffer_task_returns: 10                        # number of edge buffer tasks for `ray.wait` to return.
       max_pending_filter_tasks: 20                            # max number of pending filter ray tasks.
@@ -919,7 +919,7 @@ process:
   - most_relavant_entities_aggregator:                      # Extract entities closely related to a given entity from some texts, and sort them in descending order of importance.
       api_model: 'gpt-4o'                                     # API model name.
       entity: '孙悟空'                                         # The given entity.
-      query_entity_type: '人物'                                # The type of queried relavant entities.
+      query_entity_type: '人物'                                # The type of queried relevant entities.
       input_key: 'event_description'                          # The input key in the meta field of the samples. It is "event_description" in default.
       output_key: 'most_relavant_entities'                    # The output key in the aggregation field of the samples. It is "most_relavant_entities" in default.
       max_token_num: null                                     # The max token num of the total tokens of the sub documents. Without limitation if it is None.

diff --git a/configs/demo/bench/vbench_eval.yaml b/configs/demo/bench/vbench_eval.yaml
@@ -12,7 +12,7 @@ result_dir: ./outputs/demo-bench/eval_results
 # Give a name for this eval
 eval_name: mini_test
 
-# If true, load the required model for VBench from the cache path of evironment parameter VBENCH_CACHE_DIR
+# If true, load the required model for VBench from the cache path of environment parameter VBENCH_CACHE_DIR
 load_ckpt_from_local: false
 
 # The dimensions considered in this eval.

diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py
@@ -32,8 +32,8 @@ def __init__(self,
             Objects are independent if their number of edges in the dependency
             tree is below this parameter.
         :param any_or_all: keep this sample with 'any' or 'all' strategy.
-            'any': keep this sample if any objet is dependent. 'all': keep this
-            sample only if all images are dependent.
+            'any': keep this sample if any object is dependent. 'all': keep
+            this sample only if all objects are dependent.
         """
         super().__init__(*args, **kwargs)
         # '--no-deps' do not update numpy
@@ -74,7 +74,7 @@ def compute_stats_single(self, sample, context=False):
             if obj.dep_ != 'ROOT':
                 entity_to_dependency_nums[obj] += 1
         for token in doc:
-            # the punctation mark such as ',', '.'
+            # the punctuation mark such as ',', '.'
             if token.pos_ == 'PUNCT':
                 continue
 

diff --git a/data_juicer/ops/mapper/image_blur_mapper.py b/data_juicer/ops/mapper/image_blur_mapper.py
@@ -27,7 +27,7 @@ def __init__(self,
         """
         Initialization method.
 
-        :param p: Probability of the image being blured.
+        :param p: Probability of the image being blurred.
         :param blur_type: Type of blur kernel, including
             ['mean', 'box', 'gaussian'].
         :param radius: Radius of blur kernel.

diff --git a/data_juicer/ops/mapper/query_intent_detection_mapper.py b/data_juicer/ops/mapper/query_intent_detection_mapper.py
@@ -75,9 +75,9 @@ def process_batched(self, samples, rank=None):
         queries = samples[self.query_key]
 
         if self.zh_to_en_model_key is not None:
-            translater, _ = get_model(self.zh_to_en_model_key, rank,
+            translator, _ = get_model(self.zh_to_en_model_key, rank,
                                       self.use_cuda())
-            results = translater(queries)
+            results = translator(queries)
             queries = [item['translation_text'] for item in results]
 
         classifier, _ = get_model(self.model_key, rank, self.use_cuda())

diff --git a/data_juicer/ops/mapper/query_sentiment_detection_mapper.py b/data_juicer/ops/mapper/query_sentiment_detection_mapper.py
@@ -78,9 +78,9 @@ def process_batched(self, samples, rank=None):
         queries = samples[self.query_key]
 
         if self.zh_to_en_model_key is not None:
-            translater, _ = get_model(self.zh_to_en_model_key, rank,
+            translator, _ = get_model(self.zh_to_en_model_key, rank,
                                       self.use_cuda())
-            results = translater(queries)
+            results = translator(queries)
             queries = [item['translation_text'] for item in results]
 
         classifier, _ = get_model(self.model_key, rank, self.use_cuda())

diff --git a/data_juicer/ops/mapper/query_topic_detection_mapper.py b/data_juicer/ops/mapper/query_topic_detection_mapper.py
@@ -77,9 +77,9 @@ def process_batched(self, samples, rank=None):
         queries = samples[self.query_key]
 
         if self.zh_to_en_model_key is not None:
-            translater, _ = get_model(self.zh_to_en_model_key, rank,
+            translator, _ = get_model(self.zh_to_en_model_key, rank,
                                       self.use_cuda())
-            results = translater(queries)
+            results = translator(queries)
             queries = [item['translation_text'] for item in results]
 
         classifier, _ = get_model(self.model_key, rank, self.use_cuda())

diff --git a/data_juicer/ops/mapper/text_chunk_mapper.py b/data_juicer/ops/mapper/text_chunk_mapper.py
@@ -35,10 +35,10 @@ def __init__(self,
         :param overlap_len: Overlap length of the split texts if not split in
             the split pattern.
         :param tokenizer: The tokenizer name of Hugging Face tokenizers.
-            The text length will be calculate as the token num if it is offerd.
-            Otherwise, the text length equals to string length. Support
-            tiktoken tokenizer (such as gpt-4o), dashscope tokenizer (such as
-            qwen2.5-72b-instruct) and huggingface tokenizer.
+            The text length will be calculated as the token num if it is
+            offered. Otherwise, the text length equals the string length.
+            Support tiktoken tokenizer (such as gpt-4o), dashscope tokenizer (
+            such as qwen2.5-72b-instruct) and huggingface tokenizer.
         :param trust_remote_code: for loading huggingface model
         :param args: extra args
         :param kwargs: extra args

diff --git a/data_juicer/utils/common_utils.py b/data_juicer/utils/common_utils.py
@@ -47,7 +47,7 @@ def nested_access(data, path, digit_allowed=True):
     :param path: A dot-separated string representing the path to access.
                     This can include numeric indices when accessing list
                     elements.
-    :param digit_allowed: Allow transfering string to digit.
+    :param digit_allowed: Allow transferring string to digit.
     :return: The value located at the specified path, or raises a KeyError
                 or IndexError if the path does not exist.
     """

diff --git a/data_juicer/utils/compress.py b/data_juicer/utils/compress.py
@@ -16,7 +16,7 @@
 
 class FileLock(HF_FileLock):
     """
-    File lock for compresssion or decompression, and
+    File lock for compression or decompression, and
     remove lock file automatically.
     """
 

diff --git a/scripts/dlc/run_on_dlc.sh b/scripts/dlc/run_on_dlc.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# paremeters
+# parameters
 datajuicer_path= # path to data-juicer
 config_path= # path to config file
 

diff --git a/tools/converter/convert_gpt_to_transformers.py b/tools/converter/convert_gpt_to_transformers.py
@@ -188,7 +188,7 @@ def transformers_to_megatron_fix_query_key_value_ordering(
         param, checkpoint_version, num_splits, num_heads, hidden_size):
     """
     Permutes layout of param tensor to the one compatible with respective
-    NVIDIA Megatron-LM chekpoint versions. Input is
+    NVIDIA Megatron-LM checkpoint versions. Input is
     [num_splits * num_heads * hidden_size, :] and output is
     [num_heads * hidden_size * num_splits, :] for version 1.0 and
     [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If
@@ -311,8 +311,8 @@ def convert_checkpoint_from_megatron_to_transformers(args):
             'Megatron-LM checkpoint does not contain arguments. This utility '
             'only supports Megatron-LM checkpoints containing all the '
             'architecture, the tensor and pipeline model parallel size from '
-            'the checkpoint insead of user having to manually specify all the '
-            'details. Please save Megatron-LM checkpoint along with all the '
+            'the checkpoint instead of user having to manually specify all the'
+            ' details. Please save Megatron-LM checkpoint along with all the '
             'megatron arguments to use this utility.')
 
     # Create Transformers GPT2 config from Megatron-LM arguments

diff --git a/tools/evaluator/README.md b/tools/evaluator/README.md
@@ -69,7 +69,7 @@ auto_eval:
     merge_path: <str>      # configuration for gpt2 tokenizer type, path to merge file
     tokenizer_path: <str>  # configuration for sentencepiece tokenizer type, path to model file
     max_tokens: <int>      # max tokens to generate in inference
-    token_per_iteration: <float> # billions tokens per iteraion
+    token_per_iteration: <float> # billions tokens per iteration
   helm:
     helm_spec_template_path: <str> # path of helm spec template file, default is tools/evaluator/config/helm_spec_template.conf
     helm_output_path: <str>  # path of helm output dir

diff --git a/tools/evaluator/config/evaluator_example.yaml b/tools/evaluator/config/evaluator_example.yaml
@@ -10,7 +10,7 @@ auto_eval:
     vocab_path: <path to vocab file>
     merge_path: <path to merge file>
     max_tokens: <max tokens in inference>
-    token_per_iteration: <billions tokens per iteraion>
+    token_per_iteration: <billions tokens per iteration>
     # tokenizer_path:
     # log_path: <default is cache_path/megatron.log>
   helm: