#!/bin/bash
# SGE array script for training CLUZH baselines for the 10 target languages
# Peter Makarov and Simon Clematide. 2020.
# CLUZH at SIGMORPHON 2020 Shared Task on Multilingual Grapheme-to-Phoneme Conversion.
# In Proceedings of the 17th SIGMORPHON Workshop on Computational Research in Phonetics,
# Phonology, and Morphology, pages 171–176, Online.
# Association for Computational Linguistics.
# https://aclanthology.org/2020.sigmorphon-1.19/
#$ -cwd
#$ -N CLUZH-high
#$ -j y -o $JOB_NAME-$JOB_ID.$TASK_ID.out
#$ -l ram_free=1G,mem_free=1G
#$ -pe smp 10
#$ -t 1-10
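# Example submission (adjust the repository path to your setup; assumes a conda
# environment named "sigmorphon" with the baseline's dependencies installed):
#   qsub -v SIGMORPHON=/path/to/repo train_ensemble_CLUZH_high.sh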
LANGS=( "ben" "ger" "ita" "per" "swe" "tgl" "tha" "ukr" "gle" "bur" )
LANG=${LANGS[(( SGE_TASK_ID - 1))]}
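# Each array task trains one language: task 1 -> ben, task 2 -> ger, ..., task 10 -> bur.
# Note that the name LANG also shadows the shell's locale variable for the rest of the script.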
SETTING="high"
# Environment variable SIGMORPHON points to code repo (same as GitHub)
CODE_DIR="${SIGMORPHON}"
DATA_TARGET="${SIGMORPHON}/data/target_languages"
OUTPUT_BASE="${SIGMORPHON}/results"
mkdir -p "${OUTPUT_BASE}"
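# Results layout populated below, per language:
#   ${OUTPUT_BASE}/${SETTING}/${LANG}/<1..MAX_ENSEMBLE_SIZE>  individual model runs
#   ${OUTPUT_BASE}/${SETTING}/${LANG}/ensemble                ensembled predictions and evaluation TSVs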
# Activate the conda environment (initialise conda first, since SGE batch shells are non-interactive)
eval "$(conda shell.bash hook)"
conda activate sigmorphon
# Experiment settings
BEAM_WIDTH=4
EPOCHS=60
PATIENCE=12
SED_EM_ITERATIONS=10
MAX_ENSEMBLE_SIZE=10
HIDDEN_DIM=100
# Train MAX_ENSEMBLE_SIZE models
for ENSEMBLE_SIZE in $(seq 1 "${MAX_ENSEMBLE_SIZE}"); do
    OUTPUT="${OUTPUT_BASE}/${SETTING}/${LANG}/${ENSEMBLE_SIZE}"
    mkdir -p "${OUTPUT}"
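    # Each ensemble member uses its index as the DyNet seed, so the runs differ
    # only in random initialisation; all runs are launched in the background.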
    python "${CODE_DIR}/baseline/trans/train.py" \
        --dynet-seed "${ENSEMBLE_SIZE}" \
        --output "${OUTPUT}" \
        --train "${DATA_TARGET}/${LANG}_train.tsv" \
        --dev "${DATA_TARGET}/${LANG}_dev.tsv" \
        --test "${DATA_TARGET}/${LANG}_test.tsv" \
        --sed-em-iterations "${SED_EM_ITERATIONS}" \
        --enc-hidden-dim "${HIDDEN_DIM}" \
        --dec-hidden-dim "${HIDDEN_DIM}" \
        --epochs "${EPOCHS}" \
        --beam-width "${BEAM_WIDTH}" \
        --patience "${PATIENCE}" \
        --nfd &
done
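# Each training run is expected to leave its dev/test predictions as
# <split>_beam${BEAM_WIDTH}.predictions inside its output directory; the
# ensembling step below reads those files.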
# Wait for each background training run and record any non-zero exit status
TRAIN_FAILED=0
for PID in $(jobs -p); do
    wait "${PID}" || TRAIN_FAILED=1
done
echo "Done training"
if [ "${TRAIN_FAILED}" -ne 0 ]
then
    echo "Task ${SGE_TASK_ID} failed on ${SETTING} with language ${LANG}"
fi
# Ensemble models
OUTPUT="${OUTPUT_BASE}/${SETTING}/${LANG}/ensemble"
mkdir -p "${OUTPUT}"
for SPLIT in "dev" "test"
do
    python "${CODE_DIR}/baseline/trans/ensembling.py" \
        --gold "${DATA_TARGET}/${LANG}_${SPLIT}.tsv" \
        --systems "${OUTPUT_BASE}/${SETTING}/${LANG}/"*"/${SPLIT}_beam${BEAM_WIDTH}.predictions" \
        --output "${OUTPUT}"
done
echo "Done ensembling"
# Evaluate ensemble model
# Create two-column TSV with gold and hypothesis data
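# The ensembling step is assumed to write <split>_${MAX_ENSEMBLE_SIZE}ensemble.predictions
# into the ensemble directory. Pasting the gold TSV next to that file and keeping
# columns 2 and 4 yields one gold/hypothesis pair per line (assuming both files are
# two-column TSVs with the source in column 1 and the target string in column 2).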
for SPLIT in "dev" "test"
do
    paste \
        "${DATA_TARGET}/${LANG}_${SPLIT}.tsv" \
        "${OUTPUT_BASE}/${SETTING}/${LANG}/ensemble/${SPLIT}_${MAX_ENSEMBLE_SIZE}ensemble.predictions" \
        | cut -f2,4 \
        > "${OUTPUT_BASE}/${SETTING}/${LANG}/ensemble/${SPLIT}_${MAX_ENSEMBLE_SIZE}ensemble.tsv"
done
echo "Done"
# Use the last task to evaluate the models
if [ "${SGE_TASK_ID}" -eq "${SGE_TASK_LAST}" ]
then
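    # This block assumes the array job covers all languages: the last task polls
    # qstat until it is the only remaining task of this job, then evaluates the
    # ensemble TSVs of every language in one pass.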
    # Wait until this task is the only one of this array job still listed by qstat
    while [ "$(qstat -u "${USER}" | grep -c "${JOB_ID}")" -ne 1 ]
    do
        # Wait patiently
        sleep 20
    done
    # Evaluate
    for SPLIT in "dev" "test"
    do
        echo "${SPLIT} ${SETTING}:"
        python "${CODE_DIR}/evaluation/evaluate_all.py" \
            "${OUTPUT_BASE}/${SETTING}/"*"/ensemble/${SPLIT}_${MAX_ENSEMBLE_SIZE}ensemble.tsv"
        echo
    done
fi
echo "Done done"