Skip to content

Commit

Permalink
Fix (#572)
Browse files Browse the repository at this point in the history
  • Loading branch information
co63oc authored Feb 11, 2025
1 parent bd12da3 commit b91683b
Show file tree
Hide file tree
Showing 24 changed files with 48 additions and 48 deletions.
22 changes: 11 additions & 11 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ process:
system_prompt_template: null # System prompt template for the task. Need to be specified by given entity and attribute.
input_template: null # Template for building the model input.
attr_pattern_template: null # Pattern for parsing the attribute from output. Need to be specified by given attribute.
demo_pattern: null # Pattern for parsing the demonstraction from output to support the attribute.
demo_pattern: null # Pattern for parsing the demonstration from output to support the attribute.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
drop_text: false # Whether to drop the text in the output.
model_params: {} # Parameters for initializing the API model.
Expand Down Expand Up @@ -265,10 +265,10 @@ process:
sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
mem_required: '31GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_blur_mapper: # mapper to blur images.
p: 0.2 # probability of the image being blured
p: 0.2 # probability of the image being blurred
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- image_captioning_from_gpt4v_mapper: # generate samples whose texts are generated based on gpt-4-visison and the image
- image_captioning_from_gpt4v_mapper: # generate samples whose texts are generated based on gpt-4-vision and the image
mode: 'description' # mode of text generated from images, can be one of ['resoning', 'description', 'conversation', 'custom']
api_key: '' # the API key to authenticate the request
max_token: 500 # the maximum number of tokens to generate. Default is 500.
Expand Down Expand Up @@ -370,29 +370,29 @@ process:
lambda_str: '' # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
batched: False # A boolean indicating whether to process input data in batches.
- query_intent_detection_mapper: # Mapper to predict user's Intent label in query.
hf_model: 'bespin-global/klue-roberta-small-3i4k-intent-classification' # Hugginface model ID to predict intent label.
hf_model: 'bespin-global/klue-roberta-small-3i4k-intent-classification' # Huggingface model ID to predict intent label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model params for zh_to_en_hf_model.
label_key: 'query_intent_label' # The key name in the meta field to store the output label. It is 'query_intent_label' in default.
score_key: 'query_intent_label_score' # The key name in the meta field to store the corresponding label score. It is 'query_intent_label_score' in default.
- query_sentiment_detection_mapper: # Mapper to predict user's sentiment label ('negative', 'neutral' and 'positive') in query.
hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis' # Hugginface model ID to predict sentiment label.
hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis' # Huggingface model ID to predict sentiment label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model params for zh_to_en_hf_model.
label_key: 'query_sentiment_label' # The key name in the meta field to store the output label. It is 'query_sentiment_label' in default.
score_key: 'query_sentiment_label_score' # The key name in the meta field to store the corresponding label score. It is 'query_sentiment_label_score' in default.
- query_topic_detection_mapper: # Mapper to predict user's topic label in query.
hf_model: 'dstefa/roberta-base_topic_classification_nyt_news' # Hugginface model ID to predict topic label.
hf_model: 'dstefa/roberta-base_topic_classification_nyt_news' # Huggingface model ID to predict topic label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model params for zh_to_en_hf_model.
label_key: 'query_topic_label' # The key name in the meta field to store the output label. It is 'query_topic_label' in default.
score_key: 'query_topic_label_score' # The key name in the meta field to store the corresponding label score. It is 'query_topic_label_score' in default.
- relation_identity_mapper: # identify the relation between two entities in the text.
api_model: 'gpt-4o' # API model name.
source_entity: '孙悟空' # The source entity of the relation to be dentified.
source_entity: '孙悟空' # The source entity of the relation to be identified.
target_entity: '猪八戒' # The target entity of the relation to be identified.
output_key: 'role_relation' # The output key in the meta field in the samples. It is 'role_relation' in default.
api_endpoint: null # URL endpoint for the API.
Expand Down Expand Up @@ -453,7 +453,7 @@ process:
max_len: 2000 # Split text into multi texts with this max len if not None.
split_pattern: '\n\n' # Make sure split in this pattern if it is not None and force cut if the length exceeds max_len.
overlap_len: 200 # Overlap length of the split texts if not split in the split pattern.
tokenizer: 'gpt-4o' # The tokenizer name of Hugging Face tokenizers. The text length will be calculate as the token num if it is offerd. Otherwise, the text length equals to string length.
tokenizer: 'gpt-4o' # The tokenizer name of Hugging Face tokenizers. The text length will be calculated as the token num if it is offered. Otherwise, the text length equals the string length.
trust_remote_code: True # for loading huggingface model.
- video_captioning_from_audio_mapper: # caption a video according to its audio streams based on Qwen-Audio model
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
Expand Down Expand Up @@ -511,7 +511,7 @@ process:
roi_type: ratio # the roi string type. When the type is 'pixel', (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is 'ratio', the coordinates are normalized by widths and heights.
roi_key: null # the key name of fields in samples to store roi_strings for each sample. It's used for set different rois for different samples.
frame_num: 10 # the number of frames to be extracted uniformly from the video to detect the pixels of watermark.
min_frame_threshold: 7 # a coodination is considered as the location of a watermark pixel when it is a watermark pixel in no less min_frame_threshold frames.
min_frame_threshold: 7 # a coordinate is considered as the location of a watermark pixel when it is a watermark pixel in no fewer than min_frame_threshold frames.
detection_method: pixel_value # the method to detect the pixels of watermark. If it is 'pixel_value', we consider the distribution of pixel value in each frame. If it is 'pixel_diversity', we will consider the pixel diversity in different frames.
- video_resize_aspect_ratio_mapper: # resize the aspect ratios of videos (a fraction of width by height, r=w/h) to a specified range
min_ratio: 9/21 # the minimum aspect ratio to enforce. Videos with an aspect ratio below `min_ratio` will be resized to match this minimum ratio. The ratio should be provided as a string in the format "9:21" or "9/21".
Expand Down Expand Up @@ -845,7 +845,7 @@ process:
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
tokenizer_model: null # path for the sentencepiece model, used for sentencepiece tokenization.
union_find_parallel_num: 'auto' # number of parallel workers for union-find algorithm. It's 'auto' in default, and in that case it will be determined by half of the number of CPUs.
union_threshold: 256 # threshold for minhash values group to perform union-find algorightm.
union_threshold: 256 # threshold for minhash values group to perform union-find algorithm.
max_pending_edge_buffer_task: 20 # max number of pending edge buffer ray tasks.
num_edge_buffer_task_returns: 10 # number of edge buffer tasks for `ray.wait` to return.
max_pending_filter_tasks: 20 # max number of pending filter ray tasks.
Expand Down Expand Up @@ -919,7 +919,7 @@ process:
- most_relavant_entities_aggregator: # Extract entities closely related to a given entity from some texts, and sort them in descending order of importance.
api_model: 'gpt-4o' # API model name.
entity: '孙悟空' # The given entity.
query_entity_type: '人物' # The type of queried relavant entities.
query_entity_type: '人物' # The type of queried relevant entities.
input_key: 'event_description' # The input key in the meta field of the samples. It is "event_description" in default.
output_key: 'most_relavant_entities' # The output key in the aggregation field of the samples. It is "most_relavant_entities" in default.
max_token_num: null # The max token num of the total tokens of the sub documents. Without limitation if it is None.
Expand Down
2 changes: 1 addition & 1 deletion configs/demo/bench/vbench_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ result_dir: ./outputs/demo-bench/eval_results
# Give a name for this eval
eval_name: mini_test

# If true, load the required model for VBench from the cache path of evironment parameter VBENCH_CACHE_DIR
# If true, load the required model for VBench from the cache path of environment parameter VBENCH_CACHE_DIR
load_ckpt_from_local: false

# The dimensions considered in this eval.
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/filter/text_entity_dependency_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def __init__(self,
Objects are independent if their number of edges in the dependency
tree is below this parameter.
:param any_or_all: keep this sample with 'any' or 'all' strategy.
'any': keep this sample if any objet is dependent. 'all': keep this
sample only if all images are dependent.
'any': keep this sample if any object is dependent. 'all': keep
this sample only if all objects are dependent.
"""
super().__init__(*args, **kwargs)
# '--no-deps' do not update numpy
Expand Down Expand Up @@ -74,7 +74,7 @@ def compute_stats_single(self, sample, context=False):
if obj.dep_ != 'ROOT':
entity_to_dependency_nums[obj] += 1
for token in doc:
# the punctation mark such as ',', '.'
# the punctuation mark such as ',', '.'
if token.pos_ == 'PUNCT':
continue

Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/image_blur_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self,
"""
Initialization method.
:param p: Probability of the image being blured.
:param p: Probability of the image being blurred.
:param blur_type: Type of blur kernel, including
['mean', 'box', 'gaussian'].
:param radius: Radius of blur kernel.
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/query_intent_detection_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ def process_batched(self, samples, rank=None):
queries = samples[self.query_key]

if self.zh_to_en_model_key is not None:
translater, _ = get_model(self.zh_to_en_model_key, rank,
translator, _ = get_model(self.zh_to_en_model_key, rank,
self.use_cuda())
results = translater(queries)
results = translator(queries)
queries = [item['translation_text'] for item in results]

classifier, _ = get_model(self.model_key, rank, self.use_cuda())
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/query_sentiment_detection_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,9 @@ def process_batched(self, samples, rank=None):
queries = samples[self.query_key]

if self.zh_to_en_model_key is not None:
translater, _ = get_model(self.zh_to_en_model_key, rank,
translator, _ = get_model(self.zh_to_en_model_key, rank,
self.use_cuda())
results = translater(queries)
results = translator(queries)
queries = [item['translation_text'] for item in results]

classifier, _ = get_model(self.model_key, rank, self.use_cuda())
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/query_topic_detection_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ def process_batched(self, samples, rank=None):
queries = samples[self.query_key]

if self.zh_to_en_model_key is not None:
translater, _ = get_model(self.zh_to_en_model_key, rank,
translator, _ = get_model(self.zh_to_en_model_key, rank,
self.use_cuda())
results = translater(queries)
results = translator(queries)
queries = [item['translation_text'] for item in results]

classifier, _ = get_model(self.model_key, rank, self.use_cuda())
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/mapper/text_chunk_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def __init__(self,
:param overlap_len: Overlap length of the split texts if not split in
the split pattern.
:param tokenizer: The tokenizer name of Hugging Face tokenizers.
The text length will be calculate as the token num if it is offerd.
Otherwise, the text length equals to string length. Support
tiktoken tokenizer (such as gpt-4o), dashscope tokenizer (such as
qwen2.5-72b-instruct) and huggingface tokenizer.
The text length will be calculated as the token num if it is
offered. Otherwise, the text length equals the string length.
Support tiktoken tokenizer (such as gpt-4o), dashscope tokenizer (
such as qwen2.5-72b-instruct) and huggingface tokenizer.
:param trust_remote_code: for loading huggingface model
:param args: extra args
:param kwargs: extra args
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def nested_access(data, path, digit_allowed=True):
:param path: A dot-separated string representing the path to access.
This can include numeric indices when accessing list
elements.
:param digit_allowed: Allow transfering string to digit.
:param digit_allowed: Allow transferring string to digit.
:return: The value located at the specified path, or raises a KeyError
or IndexError if the path does not exist.
"""
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

class FileLock(HF_FileLock):
"""
File lock for compresssion or decompression, and
File lock for compression or decompression, and
remove lock file automatically.
"""

Expand Down
2 changes: 1 addition & 1 deletion scripts/dlc/run_on_dlc.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# paremeters
# parameters
datajuicer_path= # path to data-juicer
config_path= # path to config file

Expand Down
6 changes: 3 additions & 3 deletions tools/converter/convert_gpt_to_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def transformers_to_megatron_fix_query_key_value_ordering(
param, checkpoint_version, num_splits, num_heads, hidden_size):
"""
Permutes layout of param tensor to the one compatible with respective
NVIDIA Megatron-LM chekpoint versions. Input is
NVIDIA Megatron-LM checkpoint versions. Input is
[num_splits * num_heads * hidden_size, :] and output is
[num_heads * hidden_size * num_splits, :] for version 1.0 and
[num_heads * num_splits * hidden_size, :] for version 2.0 and later. If
Expand Down Expand Up @@ -311,8 +311,8 @@ def convert_checkpoint_from_megatron_to_transformers(args):
'Megatron-LM checkpoint does not contain arguments. This utility '
'only supports Megatron-LM checkpoints containing all the '
'architecture, the tensor and pipeline model parallel size from '
'the checkpoint insead of user having to manually specify all the '
'details. Please save Megatron-LM checkpoint along with all the '
'the checkpoint instead of user having to manually specify all the'
' details. Please save Megatron-LM checkpoint along with all the '
'megatron arguments to use this utility.')

# Create Transformers GPT2 config from Megatron-LM arguments
Expand Down
2 changes: 1 addition & 1 deletion tools/evaluator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ auto_eval:
merge_path: <str> # configuration for gpt2 tokenizer type, path to merge file
tokenizer_path: <str> # configuration for sentencepiece tokenizer type, path to model file
max_tokens: <int> # max tokens to generate in inference
token_per_iteration: <float> # billions tokens per iteraion
token_per_iteration: <float> # billions of tokens per iteration
helm:
helm_spec_template_path: <str> # path of helm spec template file, default is tools/evaluator/config/helm_spec_template.conf
helm_output_path: <str> # path of helm output dir
Expand Down
2 changes: 1 addition & 1 deletion tools/evaluator/config/evaluator_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ auto_eval:
vocab_path: <path to vocab file>
merge_path: <path to merge file>
max_tokens: <max tokens in inference>
token_per_iteration: <billions tokens per iteraion>
token_per_iteration: <billions tokens per iteration>
# tokenizer_path:
# log_path: <default is cache_path/megatron.log>
helm:
Expand Down
Loading

0 comments on commit b91683b

Please sign in to comment.