Merge pull request #93 from placeTW/test-taigi

Test taigi + other test changes
placeTW · Aug 1, 2024 · d0bad54
2 parents fbee0dc + 4eb0f0d
commit d0bad54
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 2 deletions.
diff --git a/.github/workflows/run_test.yml b/.github/workflows/run_test.yml
@@ -37,7 +37,7 @@ jobs:
         run: poetry install
       - name: Test with pytest  
         run: |  
-            poetry run python -m pytest tests/ -v -s  
+            poetry run python -m pytest tests/ -s --durations=5
     #   - name: Generate Coverage Report  
     #     run: |  
     #       coverage report -m
diff --git a/commands/taiwanese/read_embree_csv.py b/commands/taiwanese/read_embree_csv.py
@@ -38,9 +38,16 @@ def read_embree_csv_raw(
     df = df.fillna("") # replace all NaN with empty string
     return df
 
+def _count_taigi_words(poj: str) -> int:
+    if poj == "": return 0
+    first_section = poj.split("/")[0]
+    # replace spaces with hyphens
+    first_section = first_section.replace(" ", "-")
+    return first_section.split("/")[0].count("-")+1
+
 def add_pd_columns(tw_csv: pd.DataFrame) -> pd.DataFrame:
     # add the number of hyphen-separated words in the first "/"-separated variant of each entry
-    tw_csv[NUM_WORDS_COL] = tw_csv["PojUnicode"].apply(lambda x: x.split("/")[0].count("-")+1)
+    tw_csv[NUM_WORDS_COL] = tw_csv["PojUnicode"].apply(_count_taigi_words).astype(int)
     return tw_csv
 
 TW_EMBREE_CSV_PATH = Path(__file__).parent / "ChhoeTaigi_EmbreeTaiengSutian.csv"

diff --git a/tests/test_taigi.py b/tests/test_taigi.py
@@ -0,0 +1,40 @@
+import pytest
+from commands.taiwanese.read_embree_csv import TW_EMBREE_CSV, NUM_WORDS_COL, _count_taigi_words
+
+def test_num_words_col():
+    """Test the integrity of the NUM_WORDS_COL column."""
+    # assert that the column exists
+    assert NUM_WORDS_COL in TW_EMBREE_CSV.columns
+    # assert it contains only integers
+    assert TW_EMBREE_CSV[NUM_WORDS_COL].dtype == int
+    # assert that there are no negative values
+    assert (TW_EMBREE_CSV[NUM_WORDS_COL] >= 0).all()
+
+@pytest.mark.parametrize(
+    "poj,expected", 
+    [
+        ("", 0), # edge case, should not exist in the dataset
+        ("thǹg", 1),
+        ("thǹg-bō", 2),
+        ("thǹg-chhiah-kha", 3),
+        ("thn̂g-chhò͘-pâi-kut", 4),
+        ("àm-hông-lián-ong-eng", 5),
+        ("Tâi-oân-tài-bùn-siông-chhú", 6),
+        ("lêng-géng-bo̍k-ia̍p-kài-khak-thâng", 7),
+    ])
+def test__count_taigi_words(poj: str, expected: str):
+    """Test the _count_taigi_words function."""
+    assert _count_taigi_words(poj) == expected
+
+# These cases are known to be problematic; we record them now but skip them for the time being.
+@pytest.mark.skip(reason="These cases are problematic")
+@pytest.mark.parametrize(
+    "poj,expected", 
+    [
+        ("tû-khì + N + í-gōa", 4), # row 34426
+        ("tû-liáu + N + í-gōa", 4), # row 34428
+    ]
+)
+def test__count_taigi_words_problematic(poj: str, expected: str):
+    """Test the _count_taigi_words function."""
+    assert _count_taigi_words(poj) == expected