Skip to content

Commit

Permalink
Fix lang id example
Browse files (browse the repository at this point in the history)
Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
  • Loading branch information
ryantwolf committed Apr 19, 2024
1 parent c78ad21 commit e9885d2
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
2 changes: 1 addition & 1 deletion examples/identify_languages_and_fix_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def main(args):

# Remove the language score
filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(
- lambda score: score[1]
+ lambda score: score[1], meta=(None, str)
)

# Split the dataset by language
Expand Down
6 changes: 6 additions & 0 deletions nemo_curator/filters/classifier_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import dask
import fasttext
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -75,6 +76,11 @@ def __init__(self, model_path=None, min_langid_score=0.3):
self._cutoff = min_langid_score
self._name = "lang_id"

# Dask will automatically convert the list score type
# to a string without this option.
# See https://github.com/NVIDIA/NeMo-Curator/issues/33
dask.config.set({"dataframe.convert-string": False})

@batched
def score_document(self, df):
model_attr = f"{self._name}_{self._model_path}"
Expand Down

0 comments on commit e9885d2

Please sign in to comment.