From a8f12ddad8ee8fbbec332ff5d0994c5537dfeef6 Mon Sep 17 00:00:00 2001
From: Ryan Wolf <rywolf@nvidia.com>
Date: Fri, 7 Feb 2025 09:45:05 -0800
Subject: [PATCH] Add API docs and make modules use base class

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
---
 docs/user-guide/api/filters.rst               |  8 +++++
 docs/user-guide/api/misc.rst                  |  6 ++++
 docs/user-guide/api/modifiers.rst             | 19 +++++++++++
 nemo_curator/filters/heuristic_filter.py      |  4 +--
 nemo_curator/modifiers/markdown_remover.py    |  2 +-
 nemo_curator/modifiers/slicer.py              | 14 +++++---
 .../services/huggingface_formatter.py         | 33 -------------------
 nemo_curator/synthetic/nemotron_cc.py         |  7 ++--
 8 files changed, 50 insertions(+), 43 deletions(-)
 delete mode 100644 nemo_curator/services/huggingface_formatter.py

diff --git a/docs/user-guide/api/filters.rst b/docs/user-guide/api/filters.rst
index 55b78ed7b..24678b73e 100644
--- a/docs/user-guide/api/filters.rst
+++ b/docs/user-guide/api/filters.rst
@@ -152,6 +152,14 @@ Heuristic Filters
     :members:
     :member-order: bysource
 
+.. autoclass:: nemo_curator.filters.TokenCountFilter
+    :members:
+    :member-order: bysource
+
+.. autoclass:: nemo_curator.filters.SubstringFilter
+    :members:
+    :member-order: bysource
+
 ------------------------------
 Code Filters
 ------------------------------
diff --git a/docs/user-guide/api/misc.rst b/docs/user-guide/api/misc.rst
index b4785f022..9872cb858 100644
--- a/docs/user-guide/api/misc.rst
+++ b/docs/user-guide/api/misc.rst
@@ -15,3 +15,9 @@ Miscellaneous
 
 .. autoclass:: nemo_curator.Shuffle
     :members:
+
+.. autoclass:: nemo_curator.DocumentSplitter
+    :members:
+
+.. autoclass:: nemo_curator.DocumentJoiner
+    :members:
diff --git a/docs/user-guide/api/modifiers.rst b/docs/user-guide/api/modifiers.rst
index 6e5f506ed..252803a24 100644
--- a/docs/user-guide/api/modifiers.rst
+++ b/docs/user-guide/api/modifiers.rst
@@ -32,3 +32,22 @@ Modifiers
 
 .. autoclass:: nemo_curator.modifiers.PiiModifier
     :members:
+
+.. autoclass:: nemo_curator.modifiers.LineRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.MarkdownRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.NewlineNormalizer
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.UrlRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.Slicer
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.QuotationRemover
+    :members:
+
diff --git a/nemo_curator/filters/heuristic_filter.py b/nemo_curator/filters/heuristic_filter.py
index 182a76672..26617bd60 100644
--- a/nemo_curator/filters/heuristic_filter.py
+++ b/nemo_curator/filters/heuristic_filter.py
@@ -693,11 +693,11 @@ def __init__(self, tokenizer: AutoTokenizer, min_tokens=0, max_tokens=float("inf
         self._max_tokens = max_tokens
         self._name = "token_count"
 
-    def score_document(self, text):
+    def score_document(self, text: str) -> int:
         tokens = self._tokenizer.encode(text)
         return len(tokens)
 
-    def keep_document(self, score):
+    def keep_document(self, score: int) -> bool:
         return self._min_tokens <= score <= self._max_tokens
 
 
diff --git a/nemo_curator/modifiers/markdown_remover.py b/nemo_curator/modifiers/markdown_remover.py
index cda29cd5f..be060fd48 100644
--- a/nemo_curator/modifiers/markdown_remover.py
+++ b/nemo_curator/modifiers/markdown_remover.py
@@ -24,7 +24,7 @@
 
 class MarkdownRemover(DocumentModifier):
     """
-    Removes Markdown formatting in a document including bold, italic, and URL text.
+    Removes Markdown formatting in a document including bold, italic, underline, and URL text.
     """
 
     def __init__(self):
diff --git a/nemo_curator/modifiers/slicer.py b/nemo_curator/modifiers/slicer.py
index 9ce68a6e6..d88070388 100644
--- a/nemo_curator/modifiers/slicer.py
+++ b/nemo_curator/modifiers/slicer.py
@@ -18,7 +18,7 @@
 
 class Slicer(DocumentModifier):
     """
-    Slices a document based on indices or strings
+    Slices a document based on indices or strings.
     """
 
     def __init__(
@@ -31,10 +31,14 @@ def __init__(
     ):
         """
         Args:
-            left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive). If the provided value is a str, slice the string from the first occurence of this substring.
-            right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive). If the provided value is a str, slice the string to the last occurence of this substring.
-            include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the slicing result. Defaults to False.
-            include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the slicing result. Defaults to False.
+            left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive).
+                If the provided value is a str, slice the string from the first occurrence of this substring.
+            right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive).
+                If the provided value is a str, slice the string to the last occurrence of this substring.
+            include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the
+                slicing result. Defaults to False.
+            include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the
+                slicing result. Defaults to False.
             strip (bool): If True, strip the resulting string.
         """
         super().__init__()
diff --git a/nemo_curator/services/huggingface_formatter.py b/nemo_curator/services/huggingface_formatter.py
deleted file mode 100644
index 728d1aa73..000000000
--- a/nemo_curator/services/huggingface_formatter.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List
-
-from transformers import AutoTokenizer
-
-from nemo_curator.services import ConversationFormatter
-
-
-class HuggingFaceFormatter(ConversationFormatter):
-    """
-    A formatter that uses a Hugging Face tokenizer to format a conversation.
-    """
-
-    def __init__(self, tokenizer: AutoTokenizer) -> None:
-        self.tokenizer = tokenizer
-
-    def format_conversation(self, conversation: List[dict]) -> str:
-        """
-        Format a conversation between a user, assistant, and potentially system into a string.
-        """
-        return self.tokenizer.apply_chat_template(conversation, tokenize=False)
diff --git a/nemo_curator/synthetic/nemotron_cc.py b/nemo_curator/synthetic/nemotron_cc.py
index 565302cb4..0850eb94a 100644
--- a/nemo_curator/synthetic/nemotron_cc.py
+++ b/nemo_curator/synthetic/nemotron_cc.py
@@ -17,6 +17,7 @@
 
 from transformers import AutoTokenizer
 
+from nemo_curator import BaseModule
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.services import LLMClient
 from nemo_curator.synthetic.prompts import (
@@ -198,7 +199,7 @@ def generate_knowledge_list(
         )
 
 
-class NemotronCCDiverseQAPostprocessor:
+class NemotronCCDiverseQAPostprocessor(BaseModule):
     """
     Postprocesses the output of the Nemotron-CC Diverse QA generation pipeline.
     This postprocessor will sample a random number of QA pairs up to max_num_pairs.
@@ -226,6 +227,7 @@ def __init__(
             max_num_pairs (int): The maximum number of QA pairs to sample.
             prefix (str): The prefix of the response from the LLM.
         """
+        super().__init__(input_backend="pandas")
         self.tokenizer = tokenizer
         self.text_field = text_field
         self.response_field = response_field
@@ -287,7 +289,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
 
 # Although this could be implemented as a DocumentModifier,
 # I have kept it separate to match the other postprocessors.
-class NemotronCCKnowledgeListPostprocessor:
+class NemotronCCKnowledgeListPostprocessor(BaseModule):
     """
     Processes and cleans the output generated by the Nemotron-CC Knowledge List pipeline.
 
@@ -300,6 +302,7 @@ class NemotronCCKnowledgeListPostprocessor:
     """
 
     def __init__(self, text_field: str = "text") -> None:
+        super().__init__(input_backend="pandas")
         self.text_field = text_field
 
     def _postprocess_llm_response(self, text: str) -> str: