Add API docs and make modules use base class

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
NVIDIA · Feb 7, 2025 · a8f12dd · a8f12dd
1 parent 396a7f6
commit a8f12dd
Show file tree

Hide file tree

Showing 8 changed files with 50 additions and 43 deletions.
diff --git a/docs/user-guide/api/filters.rst b/docs/user-guide/api/filters.rst
@@ -152,6 +152,14 @@ Heuristic Filters
     :members:
     :member-order: bysource
 
+.. autoclass:: nemo_curator.filters.TokenCountFilter
+    :members:
+    :member-order: bysource
+
+.. autoclass:: nemo_curator.filters.SubstringFilter
+    :members:
+    :member-order: bysource
+
 ------------------------------
 Code Filters
 ------------------------------

diff --git a/docs/user-guide/api/misc.rst b/docs/user-guide/api/misc.rst
@@ -15,3 +15,9 @@ Miscellaneous
 
 .. autoclass:: nemo_curator.Shuffle
     :members:
+
+.. autoclass:: nemo_curator.DocumentSplitter
+    :members:
+
+.. autoclass:: nemo_curator.DocumentJoiner
+    :members:
diff --git a/docs/user-guide/api/modifiers.rst b/docs/user-guide/api/modifiers.rst
@@ -32,3 +32,22 @@ Modifiers
 
 .. autoclass:: nemo_curator.modifiers.PiiModifier
     :members:
+
+.. autoclass:: nemo_curator.modifiers.LineRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.MarkdownRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.NewlineNormalizer
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.UrlRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.Slicer
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.QuotationRemover
+    :members:
+
diff --git a/nemo_curator/filters/heuristic_filter.py b/nemo_curator/filters/heuristic_filter.py
@@ -693,11 +693,11 @@ def __init__(self, tokenizer: AutoTokenizer, min_tokens=0, max_tokens=float("inf
         self._max_tokens = max_tokens
         self._name = "token_count"
 
-    def score_document(self, text):
+    def score_document(self, text: str) -> int:
         tokens = self._tokenizer.encode(text)
         return len(tokens)
 
-    def keep_document(self, score):
+    def keep_document(self, score: int) -> bool:
         return self._min_tokens <= score <= self._max_tokens
 
 

diff --git a/nemo_curator/modifiers/markdown_remover.py b/nemo_curator/modifiers/markdown_remover.py
@@ -24,7 +24,7 @@
 
 class MarkdownRemover(DocumentModifier):
     """
-    Removes Markdown formatting in a document including bold, italic, and URL text.
+    Removes Markdown formatting in a document including bold, italic, underline, and URL text.
     """
 
     def __init__(self):

diff --git a/nemo_curator/modifiers/slicer.py b/nemo_curator/modifiers/slicer.py
@@ -18,7 +18,7 @@
 
 class Slicer(DocumentModifier):
     """
-    Slices a document based on indices or strings
+    Slices a document based on indices or strings.
     """
 
     def __init__(
@@ -31,10 +31,14 @@ def __init__(
     ):
         """
         Args:
-            left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive). If the provided value is a str, slice the string from the first occurence of this substring.
-            right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive). If the provided value is a str, slice the string to the last occurence of this substring.
-            include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the slicing result. Defaults to False.
-            include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the slicing result. Defaults to False.
+            left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive).
+                If the provided value is a str, slice the string from the first occurrence of this substring.
+            right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive).
+                If the provided value is a str, slice the string to the last occurrence of this substring.
+            include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the
+                slicing result. Defaults to False.
+            include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the
+                slicing result. Defaults to False.
             strip (bool): If True, strip the resulting string.
         """
         super().__init__()

diff --git a/nemo_curator/services/huggingface_formatter.py b/nemo_curator/services/huggingface_formatter.py
diff --git a/nemo_curator/synthetic/nemotron_cc.py b/nemo_curator/synthetic/nemotron_cc.py
@@ -17,6 +17,7 @@
 
 from transformers import AutoTokenizer
 
+from nemo_curator import BaseModule
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.services import LLMClient
 from nemo_curator.synthetic.prompts import (
@@ -198,7 +199,7 @@ def generate_knowledge_list(
         )
 
 
-class NemotronCCDiverseQAPostprocessor:
+class NemotronCCDiverseQAPostprocessor(BaseModule):
     """
     Postprocesses the output of the Nemotron-CC Diverse QA generation pipeline.
     This postprocessor will sample a random number of QA pairs up to max_num_pairs.
@@ -226,6 +227,7 @@ def __init__(
             max_num_pairs (int): The maximum number of QA pairs to sample.
             prefix (str): The prefix of the response from the LLM.
         """
+        super().__init__(input_backend="pandas")
         self.tokenizer = tokenizer
         self.text_field = text_field
         self.response_field = response_field
@@ -287,7 +289,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
 
 # Although this could be implemented as a DocumentModifier,
 # I have kept it separate to match the other postprocessors.
-class NemotronCCKnowledgeListPostprocessor:
+class NemotronCCKnowledgeListPostprocessor(BaseModule):
     """
     Processes and cleans the output generated by the Nemotron-CC Knowledge List pipeline.
 
@@ -300,6 +302,7 @@ class NemotronCCKnowledgeListPostprocessor:
     """
 
     def __init__(self, text_field: str = "text") -> None:
+        super().__init__(input_backend="pandas")
         self.text_field = text_field
 
     def _postprocess_llm_response(self, text: str) -> str: