Fix lang id example

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
NVIDIA · Apr 19, 2024 · e9885d2 · e9885d2
1 parent c78ad21
commit e9885d2
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 1 deletion.
diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages_and_fix_unicode.py
@@ -60,7 +60,7 @@ def main(args):
 
     # Remove the language score
     filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(
-        lambda score: score[1]
+        lambda score: score[1], meta=(None, str)
     )
 
     # Split the dataset by language

diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import dask
 import fasttext
 import numpy as np
 import pandas as pd
@@ -75,6 +76,11 @@ def __init__(self, model_path=None, min_langid_score=0.3):
         self._cutoff = min_langid_score
         self._name = "lang_id"
 
+        # Without this option, Dask automatically converts the
+        # list-typed score to a string.
+        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
+        dask.config.set({"dataframe.convert-string": False})
+
     @batched
     def score_document(self, df):
         model_attr = f"{self._name}_{self._model_path}"