def _count_taigi_words(poj: str) -> int:
    """Return the number of syllables in the first variant of a POJ entry.

    Only the text before the first ``/`` is counted.  Syllables are
    separated by runs of hyphens and/or whitespace, so doubled hyphens and
    leading or trailing spaces do not inflate the count.  In pattern
    entries written with ``+`` (for example ``"tû-khì + N + í-gōa"``), the
    ``+`` signs and bare single-letter ASCII uppercase placeholders such as
    ``N`` are not counted.

    Args:
        poj: The entry text; may be empty.

    Returns:
        The syllable count, ``0`` for an empty or whitespace-only entry.

    Examples:
        >>> _count_taigi_words("")
        0
        >>> _count_taigi_words("thǹg")
        1
        >>> _count_taigi_words("thǹg-bō")
        2
        >>> _count_taigi_words("thǹg-chhiah-kha")
        3
        >>> _count_taigi_words("thn̂g-chhò͘-pâi-kut")
        4
        >>> _count_taigi_words("àm-hông-lián-ong-eng")
        5
        >>> _count_taigi_words("Tâi-oân-tài-bùn-siông-chhú")
        6
        >>> _count_taigi_words("lêng-géng-bo̍k-ia̍p-kài-khak-thâng")
        7
        >>> _count_taigi_words("tû-khì + N + í-gōa")
        4
        >>> _count_taigi_words("tû-liáu + N + í-gōa")
        4
    """
    # Alternative spellings follow a "/"; only the first one is measured.
    first_variant = poj.split("/", 1)[0]

    segments = first_variant.split("+")
    # A lone capital letter is only a placeholder inside a "+" pattern;
    # without "+" it is counted like any other token.
    is_pattern = len(segments) > 1

    total = 0
    for raw_segment in segments:
        segment = raw_segment.strip()
        is_placeholder = (
            is_pattern
            and len(segment) == 1
            and segment.isascii()
            and segment.isupper()
        )
        if is_placeholder:
            continue
        # str.split() with no separator collapses separator runs and drops
        # empty tokens, so "a--b" and " a-b " both yield two tokens.
        tokens = segment.replace("-", " ").split()
        # Skip tokens that are pure punctuation.
        total += sum(
            1 for token in tokens if any(ch.isalnum() for ch in token)
        )
    return total


def add_pd_columns(tw_csv: pd.DataFrame) -> pd.DataFrame:
    """Add the derived syllable-count column to the dictionary table.

    Stores, under ``NUM_WORDS_COL``, the result of ``_count_taigi_words``
    applied to each value of the ``"PojUnicode"`` column, cast to ``int``.
    The counts are never negative.

    Args:
        tw_csv: Table with a ``"PojUnicode"`` column of strings.  It is
            modified in place.

    Returns:
        The same ``tw_csv`` object, with the extra column added.
    """
    syllable_counts = tw_csv["PojUnicode"].apply(_count_taigi_words)
    tw_csv[NUM_WORDS_COL] = syllable_counts.astype(int)
    return tw_csv