Skip to content

Commit

Permalink
Merge pull request #93 from placeTW/test-taigi
Browse files Browse the repository at this point in the history
Test taigi + other test changes
  • Loading branch information
brownsugar-bobamilktea authored Aug 1, 2024
2 parents fbee0dc + 4eb0f0d commit d0bad54
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
run: poetry install
- name: Test with pytest
run: |
poetry run python -m pytest tests/ -v -s
poetry run python -m pytest tests/ -s --durations=5
# - name: Generate Coverage Report
# run: |
# coverage report -m
9 changes: 8 additions & 1 deletion commands/taiwanese/read_embree_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,16 @@ def read_embree_csv_raw(
df = df.fillna("") # replace all NaN with empty string
return df

def _count_taigi_words(poj: str) -> int:
if poj == "": return 0
first_section = poj.split("/")[0]
# replace spaces with hyphens
first_section = first_section.replace(" ", "-")
return first_section.split("/")[0].count("-")+1

def add_pd_columns(tw_csv: pd.DataFrame) -> pd.DataFrame:
    """Add derived columns to the dictionary DataFrame.

    Currently adds ``NUM_WORDS_COL``: the word count of each row's
    ``PojUnicode`` value, as computed by ``_count_taigi_words``.

    Args:
        tw_csv: DataFrame with a ``PojUnicode`` column of strings.

    Returns:
        The same DataFrame object, modified in place with the new column.
    """
    # Compute the count once with the shared helper. The earlier inline lambda
    # assignment was immediately overwritten and is dropped here.
    word_counts = tw_csv["PojUnicode"].apply(_count_taigi_words)
    # Force an integer dtype so downstream checks can rely on it.
    tw_csv[NUM_WORDS_COL] = word_counts.astype(int)
    return tw_csv

# Location of the dictionary CSV, resolved relative to this module's directory.
# NOTE(review): presumably the ChhoeTaigi export of Embree's dictionary, judging by the file name — confirm.
TW_EMBREE_CSV_PATH = Path(__file__).parent / "ChhoeTaigi_EmbreeTaiengSutian.csv"
Expand Down
40 changes: 40 additions & 0 deletions tests/test_taigi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pytest
from commands.taiwanese.read_embree_csv import TW_EMBREE_CSV, NUM_WORDS_COL, _count_taigi_words

def test_num_words_col():
    """Check that the loaded dataset carries a sane NUM_WORDS_COL column."""
    # The column must have been added when the CSV was loaded.
    assert NUM_WORDS_COL in TW_EMBREE_CSV.columns

    word_counts = TW_EMBREE_CSV[NUM_WORDS_COL]
    # Every entry is stored as an integer ...
    assert word_counts.dtype == int
    # ... and a word count can never be negative.
    assert (word_counts >= 0).all()

@pytest.mark.parametrize(
    "poj,expected",
    [
        ("", 0),  # edge case, should not exist in the dataset
        ("thǹg", 1),
        ("thǹg-bō", 2),
        ("thǹg-chhiah-kha", 3),
        ("thn̂g-chhò͘-pâi-kut", 4),
        ("àm-hông-lián-ong-eng", 5),
        ("Tâi-oân-tài-bùn-siông-chhú", 6),
        ("lêng-géng-bo̍k-ia̍p-kài-khak-thâng", 7),
    ],
)
def test__count_taigi_words(poj: str, expected: int) -> None:
    """Check _count_taigi_words on plain hyphenated entries of 0 to 7 words."""
    assert _count_taigi_words(poj) == expected

# These cases are known to be problematic: they are recorded here now, but skipped until they can be handled.
@pytest.mark.skip(reason="These cases are problematic")
@pytest.mark.parametrize(
    "poj,expected",
    [
        ("tû-khì + N + í-gōa", 4),  # row 34426
        ("tû-liáu + N + í-gōa", 4),  # row 34428
    ],
)
def test__count_taigi_words_problematic(poj: str, expected: int) -> None:
    """Record the desired counts for pattern entries containing "+ N +".

    The whole test is skipped via the ``skip`` marker; the expected values
    describe the intended result for these entries.
    """
    assert _count_taigi_words(poj) == expected

0 comments on commit d0bad54

Please sign in to comment.