#!/bin/bash
# Runs baseline sweep.
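#
# Usage:
#   ./sweep            runs the full sweep: train, ensemble, evaluate.
#   ./sweep -t "LG..." retrains only the given space-separated languages
#                      (their existing output directories are removed first).
#   ./sweep -r         resets the run, deleting all prior output.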
set -euo pipefail
readonly DATA=../data
readonly EVALUATION=../evaluation
readonly OUTPUT=output
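# Models and predictions are written to ${OUTPUT}/<level>/<language>/<ensemble member>;
# ensembled predictions go to ${OUTPUT}/<level>/<language>/ensemble.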
# Defaults.
readonly BEAM_WIDTH=4
readonly EPOCHS=60
readonly PATIENCE=12
readonly SED_EM_ITERATIONS=10
readonly MAX_ENSEMBLE_SIZE=10
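# Space-separated target language codes, set via -t; empty means all languages.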
LANGS=""
train() {
  for LEVEL in low medium high; do
    for TRAIN in "${DATA}/${LEVEL}/"*"_train.tsv"; do
      DEV="${TRAIN//_train.tsv/_dev.tsv}"
      TEST="${TRAIN//_train.tsv/_test.tsv}"
      LG="$(basename "${TRAIN}" _train.tsv)"
      if [[ -n ${LANGS} ]] && ! [[ " ${LANGS} " =~ " ${LG} " ]]; then
        # If target languages were flagged, skip all non-targets.
        continue
      fi
      for ENSEMBLE_SIZE in $(seq 1 "${MAX_ENSEMBLE_SIZE}"); do
        OUTPUTPATH="${OUTPUT}/${LEVEL}/${LG}/${ENSEMBLE_SIZE}"
        if [[ -d ${OUTPUTPATH} ]] && [[ "$(ls -A "${OUTPUTPATH}")" ]] && [[ -z ${LANGS} ]]; then
          # An instance already exists here; only redo it when retraining a targeted language.
          continue
        fi
        # Train one ensemble member in the background; NFD Unicode normalization is applied.
        python trans/train.py \
          --dynet-seed "${ENSEMBLE_SIZE}" \
          --train "${TRAIN}" \
          --dev "${DEV}" \
          --test "${TEST}" \
          --sed-em-iterations "${SED_EM_ITERATIONS}" \
          --output "${OUTPUTPATH}" \
          --epochs "${EPOCHS}" \
          --beam-width "${BEAM_WIDTH}" \
          --patience "${PATIENCE}" \
          --nfd &
      done
      # Wait for all of this language's ensemble members, trained in parallel, to finish.
      wait
    done
  done
}
ensemble() {
  for LEVEL in low medium high; do
    for TRAIN in "${DATA}/${LEVEL}/"*"_train.tsv"; do
      LG="$(basename "${TRAIN}" _train.tsv)"
      if [[ -n ${LANGS} ]] && ! [[ " ${LANGS} " =~ " ${LG} " ]]; then
        # If target languages were passed, skip all non-targets.
        continue
      fi
      OUTPUTPATH="${OUTPUT}/${LEVEL}/${LG}/ensemble"
      if [[ -d ${OUTPUTPATH} ]] && [[ "$(ls -A "${OUTPUTPATH}")" ]] && [[ -z ${LANGS} ]]; then
        # The ensemble for this data already exists; only redo it when targeting a specific language.
        continue
      fi
      # Combine the individual models' beam predictions into ensemble predictions for each split.
      for SPLIT in dev test; do
        python trans/ensembling.py \
          --gold "${DATA}/${LEVEL}/${LG}_${SPLIT}.tsv" \
          --systems "${OUTPUT}/${LEVEL}/${LG}/"*"/${SPLIT}_beam${BEAM_WIDTH}.predictions" \
          --output "${OUTPUTPATH}"
      done
    done
  done
}
evaluate() {
  # Create a two-column TSV with gold and hypothesis data for each language and split.
  for LEVEL in low medium high; do
    for TRAIN in "${DATA}/${LEVEL}/"*"_train.tsv"; do
      LG="$(basename "${TRAIN}" _train.tsv)"
      for SPLIT in dev test; do
        paste \
          "${TRAIN//_train.tsv/_${SPLIT}.tsv}" \
          "${OUTPUT}/${LEVEL}/${LG}/ensemble/${SPLIT}_${MAX_ENSEMBLE_SIZE}ensemble.predictions" \
          | cut -f2,4 \
          > "${OUTPUT}/${LEVEL}/${LG}/ensemble/${SPLIT}_${MAX_ENSEMBLE_SIZE}ensemble.tsv"
      done
    done
  done
  # Call the evaluation script on every language's ensemble TSV, per split and level.
  for SPLIT in dev test; do
    for LEVEL in low medium high; do
      echo "${SPLIT} ${LEVEL}:"
      python "${EVALUATION}/evaluate_all.py" \
        "${OUTPUT}/${LEVEL}/"*"/ensemble/${SPLIT}_${MAX_ENSEMBLE_SIZE}ensemble.tsv"
      echo
    done
  done
}
main() {
  mkdir -p "${OUTPUT}"
  train
  ensemble
  evaluate
}
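# Parse command-line options before running the sweep.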
while getopts "rt:" OPTION; do
  case "${OPTION}" in
    t)
      # Specifies target languages for retraining; their existing output is removed
      # so they are retrained from scratch.
      LANGS=${OPTARG}
      for lang in ${LANGS}; do
        echo "${OUTPUT}/"*"/${lang}"
        find "${OUTPUT}"/* -type d -name "${lang}" -exec rm -rf {} +
      done
      ;;
    r)
      # Resets the entire run. This will delete all prior data.
      echo "This will delete all prior data. If you wish to continue, press y."
      while :; do
        read -rn 1 k
        if [[ ${k} = y ]]; then
          echo
          echo "Resetting run."
          rm -rf "${OUTPUT}"
          break
        else
          echo "This will delete all prior data. If you wish to continue, press y."
        fi
      done
      ;;
  esac
done
main