Skip to content

Commit

Permalink
Add API docs and make modules use base class
Browse files Browse the repository at this point in the history
Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
  • Loading branch information
ryantwolf committed Feb 7, 2025
1 parent 396a7f6 commit a8f12dd
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 43 deletions.
8 changes: 8 additions & 0 deletions docs/user-guide/api/filters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,14 @@ Heuristic Filters
:members:
:member-order: bysource

.. autoclass:: nemo_curator.filters.TokenCountFilter
:members:
:member-order: bysource

.. autoclass:: nemo_curator.filters.SubstringFilter
:members:
:member-order: bysource

------------------------------
Code Filters
------------------------------
Expand Down
6 changes: 6 additions & 0 deletions docs/user-guide/api/misc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ Miscellaneous

.. autoclass:: nemo_curator.Shuffle
:members:

.. autoclass:: nemo_curator.DocumentSplitter
:members:

.. autoclass:: nemo_curator.DocumentJoiner
:members:
19 changes: 19 additions & 0 deletions docs/user-guide/api/modifiers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,22 @@ Modifiers

.. autoclass:: nemo_curator.modifiers.PiiModifier
:members:

.. autoclass:: nemo_curator.modifiers.LineRemover
:members:

.. autoclass:: nemo_curator.modifiers.MarkdownRemover
:members:

.. autoclass:: nemo_curator.modifiers.NewlineNormalizer
:members:

.. autoclass:: nemo_curator.modifiers.UrlRemover
:members:

.. autoclass:: nemo_curator.modifiers.Slicer
:members:

.. autoclass:: nemo_curator.modifiers.QuotationRemover
:members:

4 changes: 2 additions & 2 deletions nemo_curator/filters/heuristic_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,11 +693,11 @@ def __init__(self, tokenizer: AutoTokenizer, min_tokens=0, max_tokens=float("inf
self._max_tokens = max_tokens
self._name = "token_count"

def score_document(self, text):
def score_document(self, text: str) -> int:
tokens = self._tokenizer.encode(text)
return len(tokens)

def keep_document(self, score):
def keep_document(self, score: int) -> bool:
return self._min_tokens <= score <= self._max_tokens


Expand Down
2 changes: 1 addition & 1 deletion nemo_curator/modifiers/markdown_remover.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

class MarkdownRemover(DocumentModifier):
"""
Removes Markdown formatting in a document including bold, italic, and URL text.
Removes Markdown formatting in a document including bold, italic, underline, and URL text.
"""

def __init__(self):
Expand Down
14 changes: 9 additions & 5 deletions nemo_curator/modifiers/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

class Slicer(DocumentModifier):
"""
Slices a document based on indices or strings
Slices a document based on indices or strings.
"""

def __init__(
Expand All @@ -31,10 +31,14 @@ def __init__(
):
"""
Args:
left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive). If the provided value is a str, slice the string from the first occurence of this substring.
right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive). If the provided value is a str, slice the string to the last occurence of this substring.
include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the slicing result. Defaults to False.
include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the slicing result. Defaults to False.
left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive).
If the provided value is a str, slice the string from the first occurrence of this substring.
right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive).
If the provided value is a str, slice the string to the last occurrence of this substring.
include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the
slicing result. Defaults to False.
include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the
slicing result. Defaults to False.
strip (bool): If True, strip the resulting string.
"""
super().__init__()
Expand Down
33 changes: 0 additions & 33 deletions nemo_curator/services/huggingface_formatter.py

This file was deleted.

7 changes: 5 additions & 2 deletions nemo_curator/synthetic/nemotron_cc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from transformers import AutoTokenizer

from nemo_curator import BaseModule
from nemo_curator.datasets import DocumentDataset
from nemo_curator.services import LLMClient
from nemo_curator.synthetic.prompts import (
Expand Down Expand Up @@ -198,7 +199,7 @@ def generate_knowledge_list(
)


class NemotronCCDiverseQAPostprocessor:
class NemotronCCDiverseQAPostprocessor(BaseModule):
"""
Postprocesses the output of the Nemotron-CC Diverse QA generation pipeline.
This postprocessor will sample a random number of QA pairs up to max_num_pairs.
Expand Down Expand Up @@ -226,6 +227,7 @@ def __init__(
max_num_pairs (int): The maximum number of QA pairs to sample.
prefix (str): The prefix of the response from the LLM.
"""
super().__init__(input_backend="pandas")
self.tokenizer = tokenizer
self.text_field = text_field
self.response_field = response_field
Expand Down Expand Up @@ -287,7 +289,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:

# Although this could be implemented as a DocumentModifier,
# I have kept it separate to match the other postprocessors.
class NemotronCCKnowledgeListPostprocessor:
class NemotronCCKnowledgeListPostprocessor(BaseModule):
"""
Processes and cleans the output generated by the Nemotron-CC Knowledge List pipeline.
Expand All @@ -300,6 +302,7 @@ class NemotronCCKnowledgeListPostprocessor:
"""

def __init__(self, text_field: str = "text") -> None:
super().__init__(input_backend="pandas")
self.text_field = text_field

def _postprocess_llm_response(self, text: str) -> str:
Expand Down

0 comments on commit a8f12dd

Please sign in to comment.