Skip to content

Commit

Permalink
#1 scripts/data-original-prepare-translation-memory.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Nov 18, 2021
1 parent bfa09dc commit 65e9651
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ data/original/fb_covid-19.zip
data/original/google_covid-19.zip
data/original/terminologies.zip
data/original/tico19-testset.zip
data/original/tico19-testset
!.gitignore
!README.md
tmp/
Expand Down
157 changes: 156 additions & 1 deletion scripts/data-original-prepare-translation-memory.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# - sudo apt install libxml2-utils
# - xmlstarlet
# - sudo apt install xmlstarlet
# - pip install langcodes[data]
# BUGS: ---
# NOTES: ---
# AUTHORS: Emerson Rocha <rocha[at]ieee.org>
Expand Down Expand Up @@ -44,11 +45,14 @@ rsync --archive --verbose "${DATA_ORIGINAL_GIT_DIR}/data/" "$DATA_ORIGINAL_DIR/"
# TODO: is this file (all.en_ckb.tmx) empty? Confirm before enabling the copy below.
# cp "${DATA_ORIGINAL_DIR}/TM/all.en_ckb.tmx" "${DATA_ORIGINAL_DIR}/translation-memory/translators-without-borders/all/"

# xmlstarlet select --template --value-of /tmx/body/tu/tuv/@xml:lang data/original/translation-memory/translators-without-borders/all/all.en-fr.tmx

#######################################
# Unzip, DTD-validate and pretty-print one translation-memory (TMX) file,
# then list the languages found in the pretty-printed result.
# Globals:
#   TMX_DIR_IN       (read) directory containing all.<pair>.tmx.zip
#   TMX_OUT_ALL_DIR  (read) directory that receives all.<pair>_linted.tmx
# Arguments:
#   $1 - language-pair identifier as used in the file name, e.g. "en-fr"
# Outputs:
#   Progress messages, the commands being run and the unique xml:lang values
#   of the linted file (joined on one line by xargs), all on stdout.
#######################################
task_1() {
  # TODO: better naming
  # echo "$1"
  # NOTE(review): the guard tests all.$1.tmx, which this branch deletes at its
  # end, so from what is visible here a completed pair is processed again on
  # the next run. Confirm whether all.$1_linted.tmx was the intended check.
  if [ ! -f "${TMX_OUT_ALL_DIR}/all.$1.tmx" ]; then
    echo "start $1"
    # Log the exact path handed to unzip (the dot before "tmx" was missing).
    echo unzip "${TMX_DIR_IN}/all.$1.tmx.zip" -d "${TMX_OUT_ALL_DIR}/"
    unzip "${TMX_DIR_IN}/all.$1.tmx.zip" -d "${TMX_OUT_ALL_DIR}/"

Expand All @@ -57,11 +61,19 @@ task_1() {
    # Validate against the TMX 1.4 DTD; --noout suppresses the document dump.
    echo xmllint --dtdvalid scripts/dtd/tmx14.dtd --noout "${TMX_OUT_ALL_DIR}/all.$1.tmx"
    xmllint --dtdvalid scripts/dtd/tmx14.dtd --noout "${TMX_OUT_ALL_DIR}/all.$1.tmx"


    # The ">" is quoted so this line is printed as a log message; unquoted it
    # redirected the echo itself into the _linted.tmx output file.
    echo xmllint --format "${TMX_OUT_ALL_DIR}/all.$1.tmx" ">" "${TMX_OUT_ALL_DIR}/all.$1_linted.tmx"
    xmllint --format "${TMX_OUT_ALL_DIR}/all.$1.tmx" > "${TMX_OUT_ALL_DIR}/all.$1_linted.tmx"

    # Only the pretty-printed copy is kept.
    rm "${TMX_OUT_ALL_DIR}/all.$1.tmx"
  else
    echo "noop $1"
  fi

  # Print unique languages on each file
  # xmlstarlet select --template --value-of /tmx/body/tu/tuv/@xml:lang "${TMX_OUT_ALL_DIR}/all.$1_linted.tmx" | sort | uniq
  xmlstarlet select --template --value-of /tmx/body/tu/tuv/@xml:lang "${TMX_OUT_ALL_DIR}/all.$1_linted.tmx" | sort | uniq | xargs echo
  # xmlstarlet select --template --value-of /tmx/body/tu/tuv/@xml:lang "${TMX_OUT_ALL_DIR}/all.$1_linted.tmx" | sort | uniq | python3 -c "from langcodes import *; import sys; print(Language.make(language='fr').display_name())"
}


Expand Down Expand Up @@ -115,13 +127,66 @@ task_1 "en-ur" # NOTE: both .tmx and .tmx.zip present
# Language pairs handed to task_1, one call per pair. Each argument is the
# "<pair>" part of the archive name all.<pair>.tmx.zip that task_1 unzips.
task_1 "en-zh" # NOTE: both .tmx and .tmx.zip present
task_1 "en-zu"
task_1 "es-LA-ar"
task_1 "es-LA-hi"
task_1 "es-LA-id"
task_1 "es-LA-pt-BR"
task_1 "es-LA-ru"
task_1 "es-LA-zh"
task_1 "fa-prs"
task_1 "fr-ar"
task_1 "fr-es-LA"
task_1 "fr-fuv"
task_1 "fr-hi"
task_1 "fr-id"
task_1 "fr-lg"
task_1 "fr-ln"
task_1 "fr-pt-BR"
task_1 "fr-ru"
task_1 "fr-rw"
task_1 "fr-sw"
task_1 "fr-zh"
task_1 "fr-zu"
task_1 "hi-ar"
task_1 "hi-bn"
task_1 "hi-es-LA"
task_1 "hi-fr"
task_1 "hi-id"
task_1 "hi-mr"
task_1 "hi-pt-BR"
task_1 "hi-ru"
task_1 "hi-ur"
task_1 "hi-zh"
task_1 "id-ar"
task_1 "id-es-LA"
task_1 "id-fr"
task_1 "id-hi"
task_1 "id-pt-BR"
task_1 "id-ru"
task_1 "id-zh"
task_1 "ku-ckb"
task_1 "pt-BR-ar"
task_1 "pt-BR-es-LA"
task_1 "pt-BR-hi"
task_1 "pt-BR-id"
task_1 "pt-BR-ru"
task_1 "pt-BR-zh"
task_1 "ru-ar"
task_1 "ru-es-LA"

# ru-fr. (...)

# es-LA-fr (...)

# https://github.com/datasets/language-codes/blob/master/language-codes.sh
# xmllint --xpath 'text()' --dtdvalid scripts/dtd/tmx14.dtd data/original/translation-memory/translators-without-borders/all/all.en-fr.tmx
# xmlstarlet select --value-of tmx data/original/translation-memory/translators-without-borders/all/all.en-fr.tmx
# xmlstarlet select --template --value-of /tmx data/original/translation-memory/translators-without-borders/all/all.en-fr.tmx

# xmlstarlet select --template --value-of /tmx/body/tu/@lang data/original/translation-memory/translators-without-borders/all/all.en-fr.tmx
# xmlstarlet select --template --value-of /tmx/body/tu/tuv[@xml:lang] data/original/translation-memory/translators-without-borders/all/all.en-fr.tmx
# https://pypi.org/project/langcodes/
# python3 -c "from langcodes import *; import sys; print('rob')"
# python3 -c "from langcodes import *; import sys; print(Language.make(language='fr').display_name())"

# TODO: implement this type of help
# xmlstarlet select --help

Expand All @@ -136,6 +201,96 @@ task_1 "es-LA-ar"



# find data/original/TM/ -iname all.en-*.zip | wc -l
# 38
# find data/original/TM/ -iname all.en-*.zip
# data/original/TM/all.en-fr.tmx.zip
# data/original/TM/all.en-lg.tmx.zip
# data/original/TM/all.en-so.tmx.zip
# data/original/TM/all.en-hi.tmx.zip
# data/original/TM/all.en-es-LA.tmx.zip
# data/original/TM/all.en-ar.tmx.zip
# data/original/TM/all.en-ti_ET.tmx.zip
# data/original/TM/all.en-ru.tmx.zip
# data/original/TM/all.en-zh.tmx.zip
# data/original/TM/all.en-prs.tmx.zip
# data/original/TM/all.en-pt-BR.tmx.zip
# data/original/TM/all.en-zu.tmx.zip
# data/original/TM/all.en-bn.tmx.zip
# data/original/TM/all.en-ne.tmx.zip
# data/original/TM/all.en-km.tmx.zip
# data/original/TM/all.en-ta.tmx.zip
# data/original/TM/all.en-ckb.tmx.zip
# data/original/TM/all.en-kr.tmx.zip
# data/original/TM/all.en-sw.tmx.zip
# data/original/TM/all.en-ti.tmx.zip
# data/original/TM/all.en-ha.tmx.zip
# data/original/TM/all.en-ku.tmx.zip
# data/original/TM/all.en-ln.tmx.zip
# data/original/TM/all.en-fuv.tmx.zip
# data/original/TM/all.en-ms.tmx.zip
# data/original/TM/all.en-id.tmx.zip
# data/original/TM/all.en-din.tmx.zip
# data/original/TM/all.en-ur.tmx.zip
# data/original/TM/all.en-fa.tmx.zip
# data/original/TM/all.en-rw.tmx.zip
# data/original/TM/all.en-ps.tmx.zip
# data/original/TM/all.en-fa.old.tmx.zip
# data/original/TM/all.en-mr.tmx.zip
# data/original/TM/all.en-ti_ER.tmx.zip
# data/original/TM/all.en-om.tmx.zip
# data/original/TM/all.en-my.tmx.zip
# data/original/TM/all.en-tl.tmx.zip
# data/original/TM/all.en-nus.tmx.zip

# find data/original/TM/ -iname all.en-*.zip | grep -E en-..?.?\.
# find data/original/TM/ -iname all.en-*.zip | grep -E en-..?.?\. --only-matching

# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | wc -l
# 38


# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//'
# en-fr
# en-lg
# en-so
# en-hi
# en-es-LA
# en-ar
# en-ti_ET
# en-ru
# en-zh
# en-prs
# en-pt-BR
# en-zu
# en-bn
# en-ne
# en-km
# en-ta
# en-ckb
# en-kr
# en-sw
# en-ti
# en-ha
# en-ku
# en-ln
# en-fuv
# en-ms
# en-id
# en-din
# en-ur
# en-fa
# en-rw
# en-ps
# en-fa.old
# en-mr
# en-ti_ER
# en-om
# en-my
# en-tl
# en-nus


# xmllint --dtdvalid scripts/dtd/tmx14.dtd data/original/translation-memory/translators-without-borders/all/all.am-om.tmx

# Final status line written to stdout.
printf '%s\n' "Okay!"

0 comments on commit 65e9651

Please sign in to comment.