From a8f12ddad8ee8fbbec332ff5d0994c5537dfeef6 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 7 Feb 2025 09:45:05 -0800 Subject: [PATCH] Add API docs and make modules use base class Signed-off-by: Ryan Wolf --- docs/user-guide/api/filters.rst | 8 +++++ docs/user-guide/api/misc.rst | 6 ++++ docs/user-guide/api/modifiers.rst | 19 +++++++++++ nemo_curator/filters/heuristic_filter.py | 4 +-- nemo_curator/modifiers/markdown_remover.py | 2 +- nemo_curator/modifiers/slicer.py | 14 +++++--- .../services/huggingface_formatter.py | 33 ------------------- nemo_curator/synthetic/nemotron_cc.py | 7 ++-- 8 files changed, 50 insertions(+), 43 deletions(-) delete mode 100644 nemo_curator/services/huggingface_formatter.py diff --git a/docs/user-guide/api/filters.rst b/docs/user-guide/api/filters.rst index 55b78ed7b..24678b73e 100644 --- a/docs/user-guide/api/filters.rst +++ b/docs/user-guide/api/filters.rst @@ -152,6 +152,14 @@ Heuristic Filters :members: :member-order: bysource +.. autoclass:: nemo_curator.filters.TokenCountFilter + :members: + :member-order: bysource + +.. autoclass:: nemo_curator.filters.SubstringFilter + :members: + :member-order: bysource + ------------------------------ Code Filters ------------------------------ diff --git a/docs/user-guide/api/misc.rst b/docs/user-guide/api/misc.rst index b4785f022..9872cb858 100644 --- a/docs/user-guide/api/misc.rst +++ b/docs/user-guide/api/misc.rst @@ -15,3 +15,9 @@ Miscellaneous .. autoclass:: nemo_curator.Shuffle :members: + +.. autoclass:: nemo_curator.DocumentSplitter + :members: + +.. autoclass:: nemo_curator.DocumentJoiner + :members: diff --git a/docs/user-guide/api/modifiers.rst b/docs/user-guide/api/modifiers.rst index 6e5f506ed..252803a24 100644 --- a/docs/user-guide/api/modifiers.rst +++ b/docs/user-guide/api/modifiers.rst @@ -32,3 +32,22 @@ Modifiers .. autoclass:: nemo_curator.modifiers.PiiModifier :members: + +.. autoclass:: nemo_curator.modifiers.LineRemover + :members: + +.. autoclass:: nemo_curator.modifiers.MarkdownRemover + :members: + +.. autoclass:: nemo_curator.modifiers.NewlineNormalizer + :members: + +.. autoclass:: nemo_curator.modifiers.UrlRemover + :members: + +.. autoclass:: nemo_curator.modifiers.Slicer + :members: + +.. autoclass:: nemo_curator.modifiers.QuotationRemover + :members: + diff --git a/nemo_curator/filters/heuristic_filter.py b/nemo_curator/filters/heuristic_filter.py index 182a76672..26617bd60 100644 --- a/nemo_curator/filters/heuristic_filter.py +++ b/nemo_curator/filters/heuristic_filter.py @@ -693,11 +693,11 @@ def __init__(self, tokenizer: AutoTokenizer, min_tokens=0, max_tokens=float("inf self._max_tokens = max_tokens self._name = "token_count" - def score_document(self, text): + def score_document(self, text: str) -> int: tokens = self._tokenizer.encode(text) return len(tokens) - def keep_document(self, score): + def keep_document(self, score: int) -> bool: return self._min_tokens <= score <= self._max_tokens diff --git a/nemo_curator/modifiers/markdown_remover.py b/nemo_curator/modifiers/markdown_remover.py index cda29cd5f..be060fd48 100644 --- a/nemo_curator/modifiers/markdown_remover.py +++ b/nemo_curator/modifiers/markdown_remover.py @@ -24,7 +24,7 @@ class MarkdownRemover(DocumentModifier): """ - Removes Markdown formatting in a document including bold, italic, and URL text. + Removes Markdown formatting in a document including bold, italic, underline, and URL text. """ def __init__(self): diff --git a/nemo_curator/modifiers/slicer.py b/nemo_curator/modifiers/slicer.py index 9ce68a6e6..d88070388 100644 --- a/nemo_curator/modifiers/slicer.py +++ b/nemo_curator/modifiers/slicer.py @@ -18,7 +18,7 @@ class Slicer(DocumentModifier): """ - Slices a document based on indices or strings + Slices a document based on indices or strings. """ def __init__( @@ -31,10 +31,14 @@ def __init__( ): """ Args: - left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive). If the provided value is a str, slice the string from the first occurence of this substring. - right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive). If the provided value is a str, slice the string to the last occurence of this substring. - include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the slicing result. Defaults to False. - include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the slicing result. Defaults to False. + left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive). + If the provided value is a str, slice the string from the first occurence of this substring. + right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive). + If the provided value is a str, slice the string to the last occurence of this substring. + include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the + slicing result. Defaults to False. + include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the + slicing result. Defaults to False. strip (bool): If True, strip the resulting string. """ super().__init__() diff --git a/nemo_curator/services/huggingface_formatter.py b/nemo_curator/services/huggingface_formatter.py deleted file mode 100644 index 728d1aa73..000000000 --- a/nemo_curator/services/huggingface_formatter.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import List - -from transformers import AutoTokenizer - -from nemo_curator.services import ConversationFormatter - - -class HuggingFaceFormatter(ConversationFormatter): - """ - A formatter that uses a Hugging Face tokenizer to format a conversation. - """ - - def __init__(self, tokenizer: AutoTokenizer) -> None: - self.tokenizer = tokenizer - - def format_conversation(self, conversation: List[dict]) -> str: - """ - Format a conversation between a user, assistant, and potentially system into a string. - """ - return self.tokenizer.apply_chat_template(conversation, tokenize=False) diff --git a/nemo_curator/synthetic/nemotron_cc.py b/nemo_curator/synthetic/nemotron_cc.py index 565302cb4..0850eb94a 100644 --- a/nemo_curator/synthetic/nemotron_cc.py +++ b/nemo_curator/synthetic/nemotron_cc.py @@ -17,6 +17,7 @@ from transformers import AutoTokenizer +from nemo_curator import BaseModule from nemo_curator.datasets import DocumentDataset from nemo_curator.services import LLMClient from nemo_curator.synthetic.prompts import ( @@ -198,7 +199,7 @@ def generate_knowledge_list( ) -class NemotronCCDiverseQAPostprocessor: +class NemotronCCDiverseQAPostprocessor(BaseModule): """ Postprocesses the output of the Nemotron-CC Diverse QA generation pipeline. This postprocessor will sample a random number of QA pairs up to max_num_pairs. @@ -226,6 +227,7 @@ def __init__( max_num_pairs (int): The maximum number of QA pairs to sample. prefix (str): The prefix of the response from the LLM. """ + super().__init__(input_backend="pandas") self.tokenizer = tokenizer self.text_field = text_field self.response_field = response_field @@ -287,7 +289,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: # Although this could be implemented as a DocumentModifier, # I have kept it separate to match the other postprocessors. -class NemotronCCKnowledgeListPostprocessor: +class NemotronCCKnowledgeListPostprocessor(BaseModule): """ Processes and cleans the output generated by the Nemotron-CC Knowledge List pipeline. @@ -300,6 +302,7 @@ class NemotronCCKnowledgeListPostprocessor: """ def __init__(self, text_field: str = "text") -> None: + super().__init__(input_backend="pandas") self.text_field = text_field def _postprocess_llm_response(self, text: str) -> str: