Commit a68d170

bug fix + test

Your Name committed Jul 11, 2024
1 parent a279acb · commit a68d170
Showing 10 changed files with 213 additions and 131 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -44,7 +44,7 @@ gîrodey xałî řeşte; gwêt le neẍmey tuyûre?

Arabic script into the Latin script suggested by Dr. Feryad Fazil Omar:
```python
->>> print(asosoft.Ar2LaF("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"))
+>>> print(asosoft.Ar2LaFeryad("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"))
gîrodey xaḻî ṟeşte; gwêt le nex̱mey tuyûre?
```

@@ -144,7 +144,7 @@ Trim starting and ending white spaces (including zero width spaces) of line,
### Replace Html Entities
`ReplaceHtmlEntity` replaces HTML entities with single Unicode characters (e.g. "&eacute;" with "é"). It is useful for web-crawled corpora.
```python
->>> print(asosoft.ReplaceHtmlEntity("ئێوە &quot;دەق&quot; لە زمانی &lt;کوردی&gt; دەنووسن"))
+>>> print(asosoft.ReplaceHtmlEntity("ئێوە &quot;دەق&quot; بە زمانی &lt;کوردی&gt; دەنووسن"))
ئێوە "دەق" بە زمانی <کوردی> دەنووسن
```
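The same decoding can be reproduced with Python's standard library; a rough stand-in for this call (an illustration, not the library's implementation):
```python
import html

# html.unescape resolves named and numeric HTML entities in one pass.
print(html.unescape("ئێوە &quot;دەق&quot; بە زمانی &lt;کوردی&gt; دەنووسن"))
# ئێوە "دەق" بە زمانی <کوردی> دەنووسن
```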
### Replace URLs and emails
@@ -167,8 +167,7 @@ Trim starting and ending white spaces (including zero width spaces) of line,
### Word to Word Replacement
`Word2WordReplacement` applies a "string to string" replacement dictionary on the text. It replaces only fully matched words, not parts of them.
```python
->>> dict = {"مال": "ماڵ", "سلاو": "سڵاو"}
->>> print(asosoft.Word2WordReplacement("مال، نووری مالیکی", dict))
+>>> print(asosoft.Word2WordReplacement("مال، نووری مالیکی", {"مال": "ماڵ", "سلاو": "سڵاو"}))
ماڵ، نووری مالیکی
```
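Whole-word (rather than substring) replacement is typically implemented by anchoring each key at word boundaries before substitution; a minimal sketch of that idea, assuming Unicode-aware word characters (not the library's actual code):
```python
import re

def word_to_word(text, replacements):
    # The lookarounds let "مال" match only as a standalone word,
    # so "مالیکی" is left untouched.
    for old, new in replacements.items():
        text = re.sub(rf"(?<!\w){re.escape(old)}(?!\w)", new, text)
    return text
```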

@@ -193,13 +192,14 @@ Sorting a string list in correct order of Kurdish alphabet ("ئءاآأإبپت
```python
>>> myList = ["یەک", "ڕەنگ", "ئەو", "ئاو", "ڤەژین", "فڵان"]
>>> print(asosoft.KurdishSort(myList))
"ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"
["ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"]
```
or using your custom order:
```python
>>> inputList = ["یەک", "ڕەنگ", "ئەو", "ئاو", "ڤەژین", "فڵان"]
->>> inputOrder = list(["ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ"])
+>>> inputOrder = list("ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ")
>>> print(asosoft.CustomSort(inputList, inputOrder))
["ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"]
```
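Custom-alphabet sorting of this kind usually reduces to a sort key that maps each character to its rank in the supplied order; a minimal sketch, assuming characters absent from the order should sort after all known ones:
```python
def custom_sort(words, order):
    rank = {ch: i for i, ch in enumerate(order)}
    # Lists compare lexicographically, so this mirrors per-character ordering.
    return sorted(words, key=lambda w: [rank.get(ch, len(order)) for ch in w])
```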
## Poem Meter Classifier
It classifies the meter of the input Kurdish poem typed in Arabic script. The lines of the poem should be separated by the newline character ('\n').
@@ -208,5 +208,7 @@ You can find Kurdish poems at https://books.vejin.net/.
>>> poem = "گەرچی تووشی ڕەنجەڕۆیی و حەسرەت و دەردم ئەمن\nقەت لەدەس ئەم چەرخە سپڵە نابەزم مەردم ئەمن\nئاشقی چاوی کەژاڵ و گەردنی پڕ خاڵ نیم\nئاشقی کێو و تەلان و بەندەن و بەردم ئەمن"
>>> classified = asosoft.ClassifyKurdishPoem(poem)
>>> print("Poem Type= " + classified.overalMeterType)
+Quantitative/عەرووزی
>>> print("Poem Meter= " + classified.overalPattern)
+فاعلاتن فاعلاتن فاعلاتن فاعلن
```
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setup(
name="asosoft",
version="0.1.3",
version="0.2.0",
description="AsoSoft's Library for Kurdish language processing tasks",
keywords='natural-language-processing, normalization, unicode-normalization, central-kurdish, kurdish, sorani',
package_dir={'': 'src'},
71 changes: 36 additions & 35 deletions src/asosoft/G2P.py
@@ -24,7 +24,7 @@
from collections import OrderedDict

# Normalization
-def _G2PNormalize(text):
+def G2P_normalize(text):
s = [
" +", " " ,
"دٚ", "ڎ",
@@ -54,37 +54,37 @@ def _G2PNormalize(text):
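# (note: s is a flat pattern/replacement list; the loop, collapsed in this view,
# steps through it two items at a time, applying each regex pair in order)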
text = re.sub(s[i], s[i + 1], text)
return text

-_History = {}
-_path = os.path.dirname(__file__)
-_G2PExceptions = {}
-_G2PCertain = {}
-def _load_replaces():
-with open(os.path.join(_path, "resources/G2PExceptions.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
+history = {}
+path = os.path.dirname(__file__)
+G2P_exceptions = {}
+G2P_certain = {}
+def load_replaces():
+with open(os.path.join(path, "resources/G2PExceptions.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
reader = csv.reader(csvfile)
next(reader) # Skip the first row
for row in reader:
-_G2PExceptions[row[0]] = row[1]
+G2P_exceptions[row[0]] = row[1]

-with open(os.path.join(_path, "resources/G2PCertain.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
+with open(os.path.join(path, "resources/G2PCertain.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
reader = csv.reader(csvfile)
next(reader) # Skip the first row
for row in reader:
-_G2PCertain[row[0]] = row[1]
+G2P_certain[row[0]] = row[1]


# GEN: generates all possible candidates:
# e.g. بوون => bûn, buwn, bwun

-def _Generator(gr):
-if len(_G2PExceptions) == 0:
-_load_replaces()
+def Generator(gr):
+if len(G2P_exceptions) == 0:
+load_replaces()

# Converting exceptional words
-for key, value in _G2PExceptions.items():
+for key, value in G2P_exceptions.items():
gr = re.sub(key, value, gr)

# Converting certain characters
-for key, value in _G2PCertain.items():
+for key, value in G2P_certain.items():
gr = re.sub(key, value, gr)

# Uncertainty in "و" and "ی"
@@ -132,10 +132,10 @@ def _Generator(gr):
CandList1.append(TempList[i] + temp[j])

# Adding "i" between Consonant Clusters
-Candidates = _iInsertion(CandList1)
+Candidates = i_insertion(CandList1)

# ======= Syllabification for each candidate
-OutputCandidates = _Syllabification(Candidates)
+OutputCandidates = syllabification(Candidates)

# for speed-up: remove candidates that have 1) a syllable without a vowel or 2) more than 3 consonants in the coda
cCount = len(OutputCandidates)
@@ -150,7 +150,7 @@

# insertion of hidden /i/ vowel
# e.g. brd => bird, brid, birid
-def _iInsertion(Cands):
+def i_insertion(Cands):
Candidates = []
for i in range(len(Cands)):
ThisCand = []
@@ -171,7 +171,7 @@ def _iInsertion(Cands):

# Syllabification of candidates
# e.g. dexom => ˈdeˈxom
-def _Syllabification(Candidates):
+def syllabification(Candidates):
cCount = len(Candidates)
for i in range(cCount):
# Onset C(C)V
@@ -184,7 +184,7 @@
return Candidates

# Sonority Sequencing Principle in EVAL needs phoneme ranking
-def _SonorityIndex(ch):
+def sonority_index(ch):
c = str(ch)
if re.search(r"[wy]", c): # Approximant
return 6
@@ -201,7 +201,7 @@ def _SonorityIndex(ch):


# EVAL: specifies a penalty number for each syllabified candidate
-def _EVAL(Candidates):
+def EVAL(Candidates):
output = {}
if len(Candidates) > 0:
Penalty = {}
Expand All @@ -222,7 +222,7 @@ def _EVAL(Candidates):
for coda in codas:
chars = coda
for j in range(len(chars) - 1):
-if _SonorityIndex(chars[j]) <= _SonorityIndex(chars[j + 1]):
+if sonority_index(chars[j]) <= sonority_index(chars[j + 1]):
P += 10
# DEP: i insertion
P += candidate.count("i") * 2
@@ -248,6 +248,7 @@ def _EVAL(Candidates):
P += candidate.count("wi") * 2
P += candidate.count("iw") * 2
P += candidate.count("wû") * 5
P += candidate.count("uˈwî") * 1

# ˈdiˈrêˈjayˈyî => ˈdiˈrêˈjaˈyîy (not heyyî and teyyî)
# ˈdiˈrêjˈyî => ˈdiˈrêˈjîy
@@ -286,47 +287,47 @@ def _EVAL(Candidates):
pat = re.search(r"([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])", candidate)
if pat:
C = re.sub("[iˈ]", "", pat.group())
-if _SonorityIndex(C[1]) > _SonorityIndex(C[2]):
+if sonority_index(C[1]) > sonority_index(C[2]):
P += 3
# ('sern'cê => 'se'rin'cê)
pat = re.search(r"([^aeêouûiîˈ])([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])", candidate)
if pat:
C = re.sub("[iˈ]", "", pat.group())
-if _SonorityIndex(C[0]) > _SonorityIndex(C[1]):
+if sonority_index(C[0]) > sonority_index(C[1]):
P += 3
# ('ser'ni'cê => 'se'rin'cê)
pat = re.search(r"([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])iˈ([^aeêouûiîˈ])", candidate)
if pat:
C = re.sub("[iˈ]", "", pat.group())
-if _SonorityIndex(C[0]) > _SonorityIndex(C[1]) and _SonorityIndex(C[1]) > _SonorityIndex(C[2]):
+if sonority_index(C[0]) > sonority_index(C[1]) and sonority_index(C[1]) > sonority_index(C[2]):
P += 3
# ('gi'rit'nê => 'gir'ti'nê) ('ku'şit'ne => 'kuş'ti'ne)
pat = re.search(r"[aeêouûiî]ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])", candidate)
if pat:
C = re.sub("[aeêouûiîˈ]", "", pat.group())
-if _SonorityIndex(C[2]) >= _SonorityIndex(C[1]):
+if sonority_index(C[2]) >= sonority_index(C[1]):
P += 3
Penalty[candidate] = P

output = OrderedDict(sorted(Penalty.items(), key=lambda x: x[1]))
return output

# chooses the best candidates for the word
-def _Evaluator(gr, Candidates):
+def evaluator(gr, Candidates):
Output = []
-evaluatedCandidates = _EVAL(Candidates)
+evaluatedCandidates = EVAL(Candidates)
if len(evaluatedCandidates) > 0:
LowestPenalt = list(evaluatedCandidates.values())[0]
for key, value in evaluatedCandidates.items():
if value < LowestPenalt + 5:
Output.append(key)
return gr if len(Output) == 0 else '¶'.join(Output)

-def _WordG2P(gr, SingleOutputPerWord):
+def word_G2P(gr, SingleOutputPerWord):
# Check history for speed up
-if gr not in _History:
-_History[gr] = _Evaluator(gr, _Generator(gr))
-return _History[gr].split('¶')[0] if SingleOutputPerWord else _History[gr]
+if gr not in history:
+history[gr] = evaluator(gr, Generator(gr))
+return history[gr].split('¶')[0] if SingleOutputPerWord else history[gr]

# Converts Central Kurdish text in standard Arabic script into syllabified phonemic Latin script (i.e. graphemes to phonemes)
def KurdishG2P(text, convertNumbersToWord=False, backMergeConjunction=True, singleOutputPerWord=True):
@@ -335,13 +336,13 @@ def KurdishG2P(text, convertNumbersToWord=False, backMergeConjunction=True, sing
if convertNumbersToWord:
text = Number2Word(text)

-text = _G2PNormalize(text.strip())
+text = G2P_normalize(text.strip())

ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهیێ" + "ۋۉۊڎڴݵݸ"
wordss = re.findall(f"([{ku}]+|[^{ku}]+)", text)
for word in wordss:
if re.search(f"[{ku}]", word) and word != "و":
-sb.append(_WordG2P(re.sub(f"[^{ku}]+", "", word), singleOutputPerWord))
+sb.append(word_G2P(re.sub(f"[^{ku}]+", "", word), singleOutputPerWord))
else:
sb.append(word)
output = ''.join(sb)
@@ -366,4 +367,4 @@
# if conjunction makes candidates the same (e.g ˈbîsˈtû¶ˈbîsˈtû)
output = re.sub(r"(\w+)¶\1(\s|$)", r"\1", output)

-return output.rstrip()
+return output.rstrip()
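After these renames the public entry point keeps its signature; a minimal usage sketch, assuming `KurdishG2P` is re-exported at package level like the other helpers shown in the README (output omitted, as it depends on the shipped resource files):
```python
import asosoft

# Convert Central Kurdish in Arabic script into syllabified phonemic Latin script.
latin = asosoft.KurdishG2P("گیرۆدەی خاڵی ڕەشتە", singleOutputPerWord=True)
print(latin)
```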
