# vim:noet:ts=3:nowrap
#
# @file Makefile.params
# @brief Master parameter file where all user specific parameters should be set.
#
# @author Samuel Larkin, Darlene Stewart and Eric Joanis
#
# Traitement multilingue de textes / Multilingual Text Processing
# Centre de recherche en technologies numériques / Digital Technologies Research Centre
# Conseil national de recherches Canada / National Research Council Canada
# Copyright 2008, 2012, 2015, 2016, 2018, Sa Majeste la Reine du Chef du Canada
# Copyright 2008, 2012, 2015, 2016, 2018, Her Majesty in Right of Canada
# Print the PortageII Copyright no matter where we start from -- but only once.
ifeq (${MAKELEVEL},0)
$(shell portage_info)
# subprograms launched by the framework don't need to bleat all the time either...
export PORTAGE_INTERNAL_CALL=1
endif
################################################################################
# User definable variables
# Source/from language (must be two lowercase letters)
SRC_LANG ?= en
# Target/to language (must be two lowercase letters)
TGT_LANG ?= fr
# Source locale country code (two uppercase letters).
SRC_LOCALE_COUNTRY ?= CA
# Target locale country code (two uppercase letters).
TGT_LOCALE_COUNTRY ?= CA
########################################
# Corpora specification
# Here we specify the stems of the corpora files.
# Files should look like this: <PREFIX>_<LANGUAGE>.al
# e.g. test1_fr.al
# Warning: TRAIN_TC, TRAIN_SPARSE, TUNE_DECODE, TUNE_RESCORE, TUNE_CE may not contain
# more than one <PREFIX>.
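# Illustrative example: with the defaults in this file (en-fr, TRAIN_TM = tm-train,
# TUNE_DECODE = dev1), the corpora directory is expected to contain files such as
# tm-train_en.al, tm-train_fr.al, dev1_en.al and dev1_fr.al.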
# Set PRIMARY_LM to your in-domain target-language LM training corpus.
# If there is a generic LM for your target language, it will automatically be
# used in a MixLM with your main LM.
# If you specify two or more LMs here, they will be grouped in a MixLM (with
# the generic LM if it exists).
# If you want to manually set your LM or MixLM parameters differently, comment
# this variable out and define the LM-related variables below.
# Warning: if you use PRIMARY_LM, leave TRAIN_LM and MIXLM undefined.
PRIMARY_LM ?= lm-train
# TRAIN_LM is used to train regular (non mix) language models.
# NOTE: Do not include stems for any files listed in LM_PRETRAINED_TGT_LMS below.
# NOTE: Do not define TRAIN_LM if you defined PRIMARY_LM above.
#TRAIN_LM ?= lm-train
# LM_PRETRAINED_TGT_LMS specifies file paths to additional pre-trained target
# language LMs to use with the lm module, for example a generic model LM.
# These file names should end with the extension .tplm and must include
# the string _${TGT_LANG}, where ${TGT_LANG} is the target language, e.g. _fr .
# Paths may be absolute or relative to the location of this Makefile.params file.
#LM_PRETRAINED_TGT_LMS ?= ${PORTAGE_GENERIC_MODEL}/generic-2.1/lm/generic-2.1_${TGT_LANG}.tplm
# MIXLM is used to create a mixture language model which itself is composed of
# several other language models.
# NOTE: Do not include stems for any files listed in MIXLM_PRETRAINED_TGT_LMS below.
# NOTE: Do not define MIXLM if you defined PRIMARY_LM above.
#MIXLM ?= sublm1 sublm2 sublm3
# MIXLM_PRETRAINED_TGT_LMS specifies file paths to additional pre-trained target
# language LMs to use with the mixlm module, for example a generic model LM.
# These file names should end with the extension .tplm and must include
# the string _${TGT_LANG}, where ${TGT_LANG} is the target language, e.g. _fr .
# Paths may be absolute or relative to the location of this Makefile.params file.
# Note: The corresponding source language LMs are also required, should have
# the same name except for the language code, and should be located in the same
# directory.
# NOTE: if you used PRIMARY_LM above and the Generic LM exists for your target
# language, it will automatically be used if no other MIXLM_PRETRAINED_TGT_LMS
# are defined.
#MIXLM_PRETRAINED_TGT_LMS ?= ${PORTAGE_GENERIC_MODEL}/generic-2.1/lm/generic-2.1_${TGT_LANG}.tplm
# TRAIN_COARSELM is used to train coarse language models.
# By default, we train coarse LMs for the corpora listed in PRIMARY_LM, or
# TRAIN_LM (if no PRIMARY_LM), or MIXLM (if no PRIMARY_LM or TRAIN_LM).
# If you want to use different corpora, uncomment and define TRAIN_COARSELM.
# => TRAIN_COARSELM = PRIMARY_LM or TRAIN_LM or MIXLM
#TRAIN_COARSELM ?= coarselm-train
# TRAIN_BILM is used to train BiLM models.
# By default, we train BiLMs for the corpora listed in TRAIN_TM; if you
# want to use different corpora, uncomment and define TRAIN_BILM.
# => TRAIN_BILM = TRAIN_TM
#TRAIN_BILM ?= bilm-train
# TRAIN_TC is used to train a truecasing model.
# By default, we train the TC model using the first corpus listed in PRIMARY_LM,
# or TRAIN_LM (if no PRIMARY_LM), or MIXLM (if no PRIMARY_LM or TRAIN_LM).
# If you want to use a different corpus, uncomment and define TRAIN_TC
# explicitly.
# => TRAIN_TC = first word of PRIMARY_LM or TRAIN_LM or MIXLM
#TRAIN_TC ?= tc-train
# TRAIN_TM is used to create the translation tables.
# NOTE: Do not include stems for any files listed in TM_PRETRAINED_TMS below.
TRAIN_TM ?= tm-train
# TM_PRETRAINED_TMS specifies file paths to additional pre-trained translation
# tables to use with the tm module, for example a generic model TM.
# These file names start with cpt. and end in .${SRC_LANG}2${TGT_LANG}.gz,
# where ${SRC_LANG} is the source language and ${TGT_LANG} is the target
# language, e.g. .en2fr.gz .
# Paths may be absolute or relative to the location of this Makefile.params file.
#TM_PRETRAINED_TMS ?= ${PORTAGE_GENERIC_MODEL}/generic-2.1/tm/cpt.generic-2.1.${SRC_LANG}2${TGT_LANG}.gz
# MIXTM is used to create a mixture translation model from several other
# translation models.
# Specify the in-domain corpus first (its word alignment models are needed),
# or specify an alternate training corpus in MIXTM_TRAIN_MIX below.
# NOTE: Do not include stems for any files listed in MIXTM_PRETRAINED_TMS below.
#MIXTM ?= subtm1 subtm2
# MIXTM_PRETRAINED_TMS specifies file paths to additional pre-trained translation
# tables to use in creating a mixture translation model, for example a generic
# model TM.
# These file names should start with cpt.
# Paths may be absolute or relative to the location of this Makefile.params file.
#MIXTM_PRETRAINED_TMS ?= ${PORTAGE_GENERIC_MODEL}/generic-2.1/tm/cpt.generic-2.1.${SRC_LANG}2${TGT_LANG}.gz
# MIXTM_TRAIN_MIX is an in-domain corpus used to train the weights for mixing
# the component translation models (its word alignment models are needed)
# By default, we use the first name listed in MIXTM; if you want to use a
# different corpus, uncomment and define MIXTM_TRAIN_MIX.
# => MIXTM_TRAIN_MIX = first word of MIXTM
#MIXTM_TRAIN_MIX ?= subtm1
### NNJM training
# NNJM training can be done in three modes:
# 1: 1-pass training on your own in-domain data only:
# - define NNJM_TRAIN_CORPUS to point to your in-domain data, e.g.,
# ${TRAIN_TM}
# - leave NNJM_FINE_TUNING_TRAIN_CORPUS and NNJM_PRETRAINED_NNJM empty
# 2: 2-pass training, first on your own generic data, with fine-tuning on your
# own in-domain data:
# - define NNJM_TRAIN_CORPUS to point to your generic data
# - define NNJM_FINE_TUNING_TRAIN_CORPUS to point to your in-domain data
# - leave NNJM_PRETRAINED_NNJM empty
# 3: 2-pass training, starting from a pre-trained NNJM (e.g., NRC's generic
# one), with fine-tuning on your own in-domain data:
# - define NNJM_FINE_TUNING_TRAIN_CORPUS to point to your in-domain data
# - define NNJM_PRETRAINED_NNJM to point to the pre-trained model
# - leave NNJM_TRAIN_CORPUS empty.
# Recommendation: 2-pass training is best. Use the NRC's generic model if your
# language pair is en-fr (mode 3), or use the largest generic corpus you have
# access to (mode 2). See the illustrative sketch below.
# NNJM main model:
# Define NNJM_TRAIN_CORPUS to train the main NNJM from scratch on this corpus,
# typically large generic data.
# NOTE: Either use NNJM_TRAIN_CORPUS or NNJM_PRETRAINED_NNJM but not both.
#NNJM_TRAIN_CORPUS ?=
NNJM_DEV_CORPUS ?= ${TUNE_DECODE} # Validation corpus, for early stopping
NNJM_TEST_CORPUS ?= # Test corpus, optional
# NNJM fine tuning.
# Define NNJM_FINE_TUNING_TRAIN_CORPUS to fine tune your NNJM model.
# This triggers a second-pass training on an existing model with in-domain data.
NNJM_FINE_TUNING_TRAIN_CORPUS ?=
NNJM_FINE_TUNING_DEV_CORPUS ?= ${NNJM_DEV_CORPUS} # Validation corpus, for early stopping
NNJM_FINE_TUNING_TEST_CORPUS ?= ${NNJM_TEST_CORPUS} # Test corpus, optional
# NNJM_PRETRAINED_NNJM specifies the file path to a pre-trained Neural
# Network Joint Model (NNJM) to use in your canoe.ini. The NNJM's filename must be
# 'model' and it should use memory mapped class files.
# Path may be absolute or relative to the location of this Makefile.params file.
#NNJM_PRETRAINED_NNJM ?= ${PORTAGE_GENERIC_MODEL}/generic-2.1/nnjm/nnjm.generic-2.1.${SRC_LANG}2${TGT_LANG}/model
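# Illustrative sketch only (mode 3, assuming an en-fr system with the NRC
# generic model installed under ${PORTAGE_GENERIC_MODEL}); these are
# commented-out examples, not defaults:
#NNJM_FINE_TUNING_TRAIN_CORPUS ?= ${TRAIN_TM}
#NNJM_PRETRAINED_NNJM ?= ${PORTAGE_GENERIC_MODEL}/generic-2.1/nnjm/nnjm.generic-2.1.${SRC_LANG}2${TGT_LANG}/model
# (NNJM_TRAIN_CORPUS stays empty in this mode.)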
# TRAIN_LDM is used to train a Lexicalized Distortion Model (LDM)
# (the word alignment models are needed).
# By default, we use the corpora listed in TRAIN_TM and MIXTM; if you want to
# use different corpora, uncomment and define TRAIN_LDM.
# => TRAIN_LDM = TRAIN_TM MIXTM
#TRAIN_LDM ?= ldm-train
# TRAIN_HLDM is used to train a Hierarchical Lexicalized Distortion Model (HLDM)
# (the word alignment models are needed).
# By default, we use the corpora listed in TRAIN_LDM if it is defined, or
# TRAIN_TM and MIXTM if TRAIN_LDM is not defined; if you want to use different
# corpora, uncomment and define TRAIN_HLDM.
# => TRAIN_HLDM = TRAIN_LDM or TRAIN_TM MIXTM
#TRAIN_HLDM ?= hldm-train
# TRAIN_SPARSE is used to train sparse features
# It should point to the concatenation of all your TRAIN_TM and MIXTM corpora in use.
# Warning: you must create that concatenated corpus manually or outside the framework.
# By default we use the first corpus listed in TRAIN_TM;
# if you want to use a different corpus, uncomment and define TRAIN_SPARSE.
# => TRAIN_SPARSE = first word of TRAIN_TM
#TRAIN_SPARSE ?= sparse-train
# TRAIN_WCL is used to train word classes for use in coarse models.
# It should point to the concatenation of all your training corpora.
# By default, we use the TRAIN_LM, MIXLM, TRAIN_TM, MIXTM corpora; if you
# want to use different corpora, uncomment and define TRAIN_WCL.
# => TRAIN_WCL = TRAIN_LM MIXLM TRAIN_TM MIXTM
#TRAIN_WCL ?= wcl-train
# TUNE_DECODE is used to tune the decoding weights (using tune.py).
TUNE_DECODE ?= dev1
# TUNE_DECODE_VARIANTS is used to tune and test using multiple tuning sets.
# Typically, each variant is a 90% sample subset of the TUNE_DECODE set.
# Ex. If TUNE_DECODE is dev1, include "a b" in TUNE_DECODE_VARIANTS to tune
# with dev1a and dev1b in addition to dev1.
# If variants are specified, 90% sample variants of the tuning set are created
# automatically in the corpora directory if the variant files do not already exist.
#TUNE_DECODE_VARIANTS ?= a b c d
# PLIVE_DECODE_VARIANT is used to select which tuning run (which one of
# TUNE_DECODE_VARIANTS) to use for PortageLive.
# Leave PLIVE_DECODE_VARIANT undefined or blank to use weights from the main
# tuning run based on TUNE_DECODE, or specify one of the variants from
# TUNE_DECODE_VARIANTS to use weights from the tuning run corresponding to
# that variant.
# Recommended usage: after tuning, set this to the variant giving the best
# BLEU score on the test sets.
#PLIVE_DECODE_VARIANT ?=
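# For example (illustrative only): if TUNE_DECODE_VARIANTS were "a b c d" and
# variant "b" gave the best BLEU score on the test sets, one might set:
#PLIVE_DECODE_VARIANT ?= b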
# TUNE_RESCORE is used to tune the rescoring weights (using rescore.py).
#TUNE_RESCORE ?= dev2
# TUNE_CE is used to train confidence estimation.
# Note: it's OK for TUNE_DECODE and TUNE_RESCORE to be the same file, but
# TUNE_CE must be completely distinct, not only from all training data, but
# also from all other tuning data.
#TUNE_CE ?= dev3
# TEST_SET files are used to estimate the translation quality of the system.
TEST_SET ?= test1 test2
# Uncomment if you have source text to translate that doesn't have a reference.
#TRANSLATE_SET ?=
# Uncomment if you have multiple references for your heldout sets.
# Note that all dev and test sets must have the same number of references.
# <stem>_<tgt_language>#.al
# Where # is some reference identifier.
# e.g. dev_fr1.al, dev_fr2.al, dev_fr3.al & dev_fr4.al
#REFERENCE_INDICES ?= 1 2 3 4
########################################
# Features that can be enabled.
# Normally, the mixlm weights computed for TUNE_DECODE are also used when
# translating the test sets because computing the weights dynamically is
# not supported in deployed PortageLive systems.
# USE_DYNAMIC_MIXLM applies only when a MixLM is used.
# Comment out to use static (TUNE_DECODE) mixlm weights for test sets;
# uncomment to enable dynamic calculation of mixlm weights for test sets.
#USE_DYNAMIC_MIXLM ?= 1
# Train and apply rescoring if this variable is defined.
# Expensive! Use only if the last small BLEU increment is important to you.
# Comment out to disable rescoring; uncomment to enable.
#DO_RESCORING = 1
# Tune and apply confidence estimation if this variable is defined.
# Comment out to disable confidence estimation; uncomment to enable.
#DO_CE = 1
# Train and use a Lexicalized Distortion Model (LDM).
# Comment out to disable using an LDM; uncomment to enable.
#USE_LDM = 1
# Train and use a Hierarchical Lexicalized Distortion Model (HLDM).
# HLDM seems to be quite effective in most scenarios, so we enable it by default.
# Comment out to disable using an HLDM; uncomment to enable.
USE_HLDM = 1
# Train and use a sparse model
# Comment out to disable using a sparse model; uncomment to enable.
USE_SPARSE = 1
# Train and use coarse LMs.
# If enabled, by default this will add 200-class and 800-class coarse LMs.
# Comment out to disable using coarse LMs; uncomment to enable.
USE_COARSELM = 1
# Train and use (coarse) BiLMs.
# If enabled, by default this will train a 400bi-400s-400t coarse BiLM
# (400 bitoken classes with 400 src word classes and 400 tgt word classes).
# Comment out to disable using BiLMs (default); uncomment to enable.
#USE_BILM = 1
# Train and apply truecasing if this variable is defined.
# Comment out to disable truecasing; uncomment to enable.
DO_TRUECASING = 1
# If USE_SIGPRUNING is set, phrase tables will be filtered using significance
# pruning before they are used. Significance pruning removes phrase pairs that
# are not statistically well attested in the training corpus. Sig-pruning results
# in much smaller phrase tables, usually without loss in BLEU, sometimes even
# with a gain in BLEU.
#USE_SIGPRUNING = 1
########################################
# Tokenization and other preprocessing
# If you have your own tokenizer or detokenizer and you want to use it, then
# set the following variables to the command line that runs it. Note that the
# variable names contain the source or target language two-letter identifier.
# For example, for Spanish source you would define TOKENIZER_es; for French
# target you would define DETOKENIZER_fr. More generally,
# {DE|}TOKENIZER_{${SRC_LANG}|${TGT_LANG}}.
# Examples of defining what tokenizers we want to use:
# Call opennlp instead of the Portage tokenizer:
#TOKENIZER_en := opennlp TokenizerME /modeldir/en-model.bin
# Call the Portage tokenizer:
#TOKENIZER_fr := utokenize.pl -noss -lang=fr
# Call the Portage tokenizer, using fix-slashes.pl to separate pairs of words
# joined by / (this is now the default for fr, en, es, da):
#TOKENIZER_en := { set -o pipefail; fix-slashes.pl | utokenize.pl -noss -lang=en; }
#TOKENIZER_fr := { set -o pipefail; fix-slashes.pl | utokenize.pl -noss -lang=fr; }
# Do Chinese segmentation using ICTCLAS (assuming you have it):
#TOKENIZER_ch := { set -o pipefail; iconv -c -f UTF-8 -t CN-GB | ictclas_preprocessing.pl | ictclas | ictclas_postprocessing.pl | iconv -c -f CN-GB -t UTF-8; }
# Or do Chinese segmentation using the LDC-based segmenter we distribute:
#TOKENIZER_ch := chinese_segmenter.pl
# Do Arabic tokenization in Portage (using precomputed MADA map; requires MADA):
#TOKENIZER_ar ?= tokenize_plugin ar
# Defining what detokenizers we want to use:
#DETOKENIZER_en ?= opennlp DetokenizerME /modeldir/en-model.bin
#DETOKENIZER_fr ?= udetokenize.pl -lang=fr
# If you have ictclas installed and want to use it to tokenize Chinese,
# uncomment the following:
# USE_ICTCLAS ?= 1
# Language-specific commands to mark up source dev/test files.
#MARK_RULE_en ?= canoe-escapes.pl -add
#MARK_RULE_fr ?= canoe-escapes.pl -add
#MARK_RULE_ch ?= { chinese_rule_markup.pl | chinese_rule_create.pl; }
########################################
# LM Toolkit
# Change LM_TOOLKIT's value depending on the LM toolkit you have. If you use
# SRILM or MITLM, their executable scripts and programs must be on your PATH.
# LM_TOOLKIT={SRI,IRST,MIT}
# where SRI => SRILM toolkit
# IRST => IRSTLM toolkit
# MIT => MITLM toolkit
LM_TOOLKIT = MIT
# The prefix_root where we can find IRSTLM/bin, which must also be on your
# PATH. (Only needed if you are using IRSTLM - see next variable.)
IRSTLM ?= $(PORTAGE)/pkgs/irstlm
########################################
# Parallelism levels and cluster control.
# How many CPUs should each PortageLive request use?
# You can increase this parameter if your PortageLive server has multiple
# cores. Monitor use to make sure your server does not get saturated. You can
# adjust this parameter later for a running system by changing the '-n <N>'
# parameter in soap-translate.sh for each installed PortageLive context. To
# retroactively add parallelism to previously trained PortageLive contexts, add
# "-w=3 -n=<n>" to their soap-translate.sh.
PARALLELISM_LEVEL_PORTAGELIVE ?= 1
# If you are on a cluster that is run-parallel.sh friendly, define the
# following to force cluster mode. You normally don't need to do so, though,
# since clusters are detected automatically below.
#USING_CLUSTER ?= 1
# If you are on a cluster but you want to force single computer mode,
# uncomment the following line:
#NOCLUSTER ?= 1
# Automatically detects if we are on a cluster.
ifeq ($(strip $(shell on-cluster.sh && echo "true")),true)
USING_CLUSTER ?= 1
endif
ifdef NOCLUSTER
USING_CLUSTER =
endif
OSTYPE ?= $(shell uname -s)
ifdef USING_CLUSTER
PARALLELISM_LEVEL_CORPORA ?= 10
PARALLELISM_LEVEL_LM ?= 5
PARALLELISM_LEVEL_LDM ?= 30
PARALLELISM_LEVEL_TM ?= 5
PARALLELISM_LEVEL_SPARSE ?= 10
PARALLELISM_LEVEL_TUNE_DECODE ?= 10
PARALLELISM_LEVEL_TUNE_RESCORE ?= 10
PARALLELISM_LEVEL_TUNE_CONFIDENCE ?= 10
# Be careful not to over-parallelize for translation if models take long to load, especially if translating many test files.
# One can run canoe-timing-stats.pl on the resulting logs to help assess.
PARALLELISM_LEVEL_TRANSLATE ?= 1
ifeq (${MAKELEVEL},0)
$(info Running in cluster mode.)
endif
else
# Make sure we run in serial mode.
.NOTPARALLEL:
# Autodetect the number of available cpus on this machine.
ifneq (${OSTYPE},Darwin)
NCPUS := $(shell test -n "$$OMP_NUM_THREADS" && echo $$OMP_NUM_THREADS || grep processor /proc/cpuinfo | wc -l)
else
NCPUS := $(shell test -n "$$OMP_NUM_THREADS" && echo $$OMP_NUM_THREADS || sysctl -n hw.ncpu)
endif
# Uncomment the following line to fix the parallelism level in single-host mode.
# You would normally do this to set NCPUS to a smaller number than you really
# have, e.g., if you are sharing the machine with other users.
# Should be <= the real number of CPUs on your machine.
#NCPUS := 24
PARALLELISM_LEVEL_CORPORA ?= ${NCPUS}
PARALLELISM_LEVEL_LM ?= ${NCPUS}
PARALLELISM_LEVEL_LDM ?= ${NCPUS}
PARALLELISM_LEVEL_TM ?= ${NCPUS}
PARALLELISM_LEVEL_SPARSE ?= ${NCPUS}
PARALLELISM_LEVEL_TUNE_DECODE ?= ${NCPUS}
PARALLELISM_LEVEL_TUNE_RESCORE ?= ${NCPUS}
PARALLELISM_LEVEL_TUNE_CONFIDENCE ?= ${NCPUS}
# Be careful not to over-parallelize for translation if models take long to load, especially if translating many test files.
# One can run canoe-timing-stats.pl on the resulting logs to help assess.
PARALLELISM_LEVEL_TRANSLATE ?= 1
# Disable CLUSTER mode in all of PortageII's software.
export PORTAGE_NOCLUSTER=1
ifeq (${MAKELEVEL},0)
$(info Running in local mode.)
endif
endif
########################################
# Sanitize user inputs.
# Remove accidental user spaces that would otherwise confuse make.
SRC_LANG := $(strip ${SRC_LANG})
TGT_LANG := $(strip ${TGT_LANG})
PRIMARY_LM := $(strip ${PRIMARY_LM})
TRAIN_LM := $(strip ${TRAIN_LM})
MIXLM := $(strip ${MIXLM})
TRAIN_TM := $(strip ${TRAIN_TM})
MIXTM := $(strip ${MIXTM})
TUNE_DECODE := $(strip ${TUNE_DECODE})
TUNE_DECODE_VARIANTS := $(strip ${TUNE_DECODE_VARIANTS})
TUNE_RESCORE := $(strip ${TUNE_RESCORE})
TUNE_CE := $(strip ${TUNE_CE})
################################################################################
# Advanced configuration variables
# Compress extension.
GZ ?= .gz
# Raw file's extension.
ALIGNX ?= .al
# Extension for rule files, which must be source-language files.
RULEX ?= _${SRC_LANG}.rule
# Language extension for corpora, including phrase table corpora.
LANGX ?= .lc
# Extension for compressed corpora.
LANGXZ ?= ${LANGX}${GZ}
# Language pair for this system.
LANGS ?= ${SRC_LANG} ${TGT_LANG}
# Auto-detect whether PortageII was compiled with ICU.
ifeq (${MAKELEVEL},0)
PORTAGE_VERSION:=$(shell portage_info -version)
ifeq (${PORTAGE_VERSION},)
$(warning Cannot find portage_info; make sure a version of PortageII is installed.)
endif
endif
ifeq ($(strip $(shell portage_info -with-icu > /dev/null && echo "true")),true)
ICU = 1
ifeq (${MAKELEVEL},0)
$(info ${PORTAGE_VERSION} was compiled with ICU)
endif
else
ifeq (${MAKELEVEL},0)
$(info ${PORTAGE_VERSION} was compiled without ICU)
endif
endif
ifeq (${MAKELEVEL},0)
$(info ) # Leave a blank line
endif
# Apply the default logic triggered by using PRIMARY_LM instead of TRAIN_LM or MIXLM
ifdef PRIMARY_LM
ifneq ($(or ${MIXLM},${TRAIN_LM}),)
$(error When you define PRIMARY_LM, leave TRAIN_LM and MIXLM undefined)
endif
ifdef MIXLM_PRETRAINED_TGT_LMS
ifeq (${MAKELEVEL},0)
$(info PRIMARY_LM and MIXLM_PRETRAINED_TGT_LMS specified: combining them in a MixLM)
endif
MIXLM=${PRIMARY_LM}
else
MIXLM_PRETRAINED_TGT_LMS_GLOB=$(wildcard ${PORTAGE_GENERIC_MODEL}/generic-2.1/lm/generic-2.1_${TGT_LANG}.tplm)
#$(info mixlm test ${MIXLM_PRETRAINED_TGT_LMS_GLOB})
ifneq (${MIXLM_PRETRAINED_TGT_LMS_GLOB},)
ifeq (${MAKELEVEL},0)
$(info Found pre-trained generic LM and PRIMARY_LM: combining them in a MixLM)
endif
MIXLM=${PRIMARY_LM}
MIXLM_PRETRAINED_TGT_LMS=${MIXLM_PRETRAINED_TGT_LMS_GLOB}
else
ifneq ($(filter ${TGT_LANG},en fr),)
LM_ERROR_MSG := Generic LM for ${TGT_LANG} not found; please install PortageII Generic Model 2.1, or set \
PORTAGE_GENERIC_MODEL, or set MIXLM_PRETRAINED_TGT_LMS, or use MIXLM or TRAIN_LM instead of PRIMARY_LM
ifeq ($(filter clean clean.% doc help,${MAKECMDGOALS}),)
$(error ${LM_ERROR_MSG})
else
$(warning ${LM_ERROR_MSG})
endif
endif
ifneq ($(words ${PRIMARY_LM}),1)
ifeq (${MAKELEVEL},0)
$(info Multiple LMs in PRIMARY_LM: combining them in a MixLM)
endif
MIXLM=${PRIMARY_LM}
else
ifeq (${MAKELEVEL},0)
$(info No pre-trained generic LM found: using a regular in-domain LM)
endif
TRAIN_LM=${PRIMARY_LM}
endif
endif
endif
else
ifdef MIXLM
ifdef MIXLM_PRETRAINED_TGT_LMS
ifeq (${MAKELEVEL},0)
$(info MIXLM and MIXLM_PRETRAINED_TGT_LMS specified: combining them in a MixLM)
endif
else ifneq ($(words ${MIXLM}),1)
ifeq (${MAKELEVEL},0)
$(info Multiple LMs in MIXLM specified: combining them in a MixLM)
endif
endif
else ifdef MIXLM_PRETRAINED_TGT_LMS
ifneq ($(words ${MIXLM_PRETRAINED_TGT_LMS}),1)
ifeq (${MAKELEVEL},0)
$(info Multiple LMs in MIXLM_PRETRAINED_TGT_LMS specified: combining them in a MixLM)
endif
endif
endif
endif
# With the MIT LM toolkit, a dev set can be used to tune the LM KN discounting
# parameters. It is recommended that a dev set other than the TUNE_DECODE set
# be used for such tuning; it's okay to use TUNE_RESCORE or TUNE_CE if different
# from TUNE_DECODE. To tune the LM parameters, define TUNE_LM below.
TUNE_LM ?= ${TUNE_DECODE}
TUNE_LM := $(strip ${TUNE_LM})
# Make sure USE_DYNAMIC_MIXLM is not defined if not using MixLMs.
ifdef USE_DYNAMIC_MIXLM
ifeq ($(strip ${MIXLM} ${MIXLM_PRETRAINED_TGT_LMS}),)
USE_DYNAMIC_MIXLM =
endif
endif
USE_MIXTM := $(if $(strip ${MIXTM} ${MIXTM_PRETRAINED_TMS}),1)
ifdef USE_MIXTM
# By default we assume that the user wants to use the first MIXTM corpus (its
# word alignment models are needed) to train the weights for mixing the
# component translation models.
MIXTM_TRAIN_MIX ?= $(firstword ${MIXTM})
MIXTM_TRAIN_MIX := $(strip ${MIXTM_TRAIN_MIX})
# We need a dev set for tuning the mixture weights in a mixtm.
TUNE_MIXTM ?= ${TUNE_DECODE}
TUNE_MIXTM := $(strip ${TUNE_MIXTM})
# It may be beneficial to use a global Word Alignment Model for training a
# MIXTM. To do so, define MIXTM_USE_GLOBAL_WORD_ALIGNMENT_MODEL (uncomment):
#MIXTM_USE_GLOBAL_WORD_ALIGNMENT_MODEL ?= 1
ifdef REFERENCE_INDICES
TUNE_MIXTM_MULTIPLE_REFERENCES ?= $(strip ${TUNE_MIXTM}).multiple.references
endif
endif
# In the case where you want a merged_cpt, you will need to define the following:
# MERGED_CPT_ZN_MODEL & MERGED_CPT_JPT_TYPES
# What word alignment model to use for Zens-Ney's smoother when building a merged_cpt?
# MERGED_CPT_ZN_MODEL can be one of: IBM4, HMM3, HMM2, HMM1, IBM2 or IBM1
MERGED_CPT_ZN_MODEL ?= HMM3
# What type of JPTs should be used to create the final merged_cpt?
# MERGED_CPT_JPT_TYPES can be one or several of: FAST_ALIGN, IBM4, HMM3, HMM2, HMM1, IBM2 or IBM1
# Using IBM4 models in addition to IBM2 and HMM3 gives a small boost in BLEU
# for most scenarios, and increases reliability, so we now enable it by default.
MERGED_CPT_JPT_TYPES ?= IBM2 HMM3 IBM4
# Are we using alignment indicator features?
MERGED_CPT_USE_ALIGNMENT_INDICATORS ?= 0
# Specify the alignment symmetrization strategy:
# The former default, IBMOchAligner 3 (GDF), gives denser alignments and smaller
# phrase tables. It works best for very large corpora. This is equivalent to "diag"
# in Koehn, Och and Marcu (2003), and is often referred to as "grow-diag-final".
#ALIGNMENT_SYMMETRIZATION_OPTIONS = -a GDF
# IBMOchAligner 4 (GDFA) gives sparser, higher confidence alignments, and larger
# phrase tables. It works best for small to large corpora: with up to hundreds
# of thousands of sentence pairs, maybe more, this is known to be the better option.
# This is equivalent to "diag-and" in Koehn, Och and Marcu (2003), and is often
# referred to as "grow-diag-final-and".
ALIGNMENT_SYMMETRIZATION_OPTIONS = -a GDFA
# Define what type of phrase table we want to generate.
# Can be one or more: ibm2_cpt, hmm1_cpt, hmm2_cpt, hmm3_cpt, ibm4_cpt,
# merged_cpt, or indicator_cpt.
# Typically, merged_cpt or indicator_cpt is used alone.
# WARNING: changes here must be manually reflected in
# models/rescore/rescore-model.template and models/confidence/ce-notm.template:
# look for upper case tokens like HMM3FWD and similar ones nearby.
# PT_TYPES ?= ibm2_cpt hmm3_cpt
ifeq (${MERGED_CPT_USE_ALIGNMENT_INDICATORS},1)
PT_TYPES ?= $(strip $(if ${TRAIN_TM}, indicator_cpt) \
$(if ${USE_MIXTM}, mix_cpt))
else
PT_TYPES ?= $(strip $(if $(strip ${TRAIN_TM}), merged_cpt) \
$(if ${USE_MIXTM}, mix_cpt))
endif
# Determine whether TMs will include the alignment (a=) field or not.
# This field is used for truecasing and tags transfer.
TMS_WITH_ALIGNMENT_FIELD ?= 1
# Define the significance threshold for significance pruning.
# Usually the default of a+e is the ideal value.
# SIG_THRESHOLD must be a positive real number or a pre-defined constant:
# 'a+e' (alpha + epsilon) is the significance threshold such that <1,1,1>
# phrase pairs are filtered out.
# 'a-e' (alpha - epsilon) is the significance threshold such that <1,1,1>
# phrase pairs are kept.
# Note: a (alpha) is the significance level of <1,1,1> phrase pairs, as
# discussed in Johnson et al, EMNLP 2007.
ifdef USE_SIGPRUNING
SIG_THRESHOLD ?= a+e
endif
SIG_THRESHOLD := $(strip ${SIG_THRESHOLD})
# On some clusters or highly parallel file systems, memory-mapped class files
# can be inefficient - this is set by default to use the regular ones instead;
# comment out to use memory-mapped class files.
SPARSE_MODEL_NO_MMCLS ?= 1
# By default, we assume that the user wants to use the corpora listed in
# TRAIN_TM and MIXTM to build the Lexicalized Distortion Model (LDM).
ifdef USE_LDM
TRAIN_LDM ?= $(sort ${TRAIN_TM} ${MIXTM})
endif
TRAIN_LDM := $(strip ${TRAIN_LDM})
# By default, we assume that the user wants to use the corpora listed in
# TRAIN_LDM to build the Hierarchical Lexicalized Distortion Model (HLDM)
# if TRAIN_LDM is defined, or the corpora listed in TRAIN_TM and MIXTM if
# TRAIN_LDM is not defined.
ifdef USE_HLDM
TRAIN_HLDM ?= $(or ${TRAIN_LDM}, $(sort ${TRAIN_TM} ${MIXTM}))
endif
TRAIN_HLDM := $(strip ${TRAIN_HLDM})
# TRAIN_SPARSE should be the concatenation of all TRAIN_TM and MIXTM, but
# creation of that corpus hasn't been added yet. For now, we add a hack to
# default TRAIN_SPARSE to the first word of TRAIN_TM.
ifdef USE_SPARSE
TRAIN_SPARSE ?= $(firstword ${TRAIN_TM})
endif
TRAIN_SPARSE := $(strip ${TRAIN_SPARSE})
# Word class granularities for coarse LMs.
# Defining COARSELM_NCLS_LIST triggers word class training too.
# If defined, COARSELM_NCLS_LIST must contain integer values.
# We recommend using a 200-class coarse LM and an 800-class coarse LM.
ifdef USE_COARSELM
COARSELM_NCLS_LIST ?= 200 800
endif
COARSELM_NCLS_LIST := $(strip ${COARSELM_NCLS_LIST})
# By default, we assume that the user wants to use PRIMARY_LM for the coarse LM,
# or TRAIN_LM (if no PRIMARY_LM), or MIXLM (if no PRIMARY_LM or TRAIN_LM).
ifdef USE_COARSELM
TRAIN_COARSELM ?= $(or ${PRIMARY_LM},${TRAIN_LM},${MIXLM})
endif
TRAIN_COARSELM := $(strip ${TRAIN_COARSELM})
# With the MIT LM toolkit, a dev set can be used to tune the LM KN discounting
# parameters. It is recommended that a dev set other than the TUNE_DECODE set
# be used for such tuning; it's okay to use TUNE_RESCORE or TUNE_CE if different
# from TUNE_DECODE. To tune the discount parameters for coarse LMs, define
# TUNE_COARSELM below. By default, we assume the user wants to use the same
# dev set to tune the coarse LMs as was used to tune the normal word LM.
ifdef USE_COARSELM
TUNE_COARSELM ?= ${TUNE_LM}
endif
TUNE_COARSELM := $(strip ${TUNE_COARSELM})
# Specifications for BiLMs.
# "word" indicates a word-based BiLM.
# Coarse BiLM specifications take the form: NNNbi-NNNs-NNNt where NNN is the
# word class granularity. Examples: 400bi-400s-400t, 400s-400t, 400bi
# Multiple BiLM specifications are permitted. Each will be applied to each
# corpus listed in TRAIN_BILM.
# If you enable BiLMs, we suggest a 400bi-400s-400t coarse BiLM
# (400 bitoken classes with 400 src word classes and 400 tgt word classes).
ifdef USE_BILM
BILM_SPEC ?= 400bi-400s-400t
endif
BILM_SPEC := $(strip ${BILM_SPEC})
# By default, we assume that the user wants to use the TRAIN_TM corpora
# for BiLMs too.
ifdef USE_BILM
TRAIN_BILM ?= ${TRAIN_TM}
endif
TRAIN_BILM := $(strip ${TRAIN_BILM})
BILM_NCLS_LIST := $(patsubst %s,%, $(patsubst %t,%, $(filter %s %t, $(subst -, ,${BILM_SPEC}))))
# With the MIT LM toolkit, a dev set can be used to tune the LM KN discounting
# parameters. It is recommended that a dev set other than the TUNE_DECODE set
# be used for such tuning; it's okay to use TUNE_RESCORE or TUNE_CE if different
# from TUNE_DECODE. To tune the discount parameters for BiLMs, define
# TUNE_BILM below. By default, we assume the user wants to use the same
# dev set to tune the BiLMs as was used to tune the normal word LM.
ifdef USE_BILM
TUNE_BILM ?= ${TUNE_LM}
endif
TUNE_BILM := $(strip ${TUNE_BILM})
# Word class granularities for NNJMs.
# Defining NNJM_NCLS triggers word class training too.
# If defined, NNJM_NCLS must contain integer values.
# We recommend using 400 classes for the NNJM.
ifneq ("${NNJM_TRAIN_CORPUS}","")
NNJM_NCLS ?= 400
endif
# NNJM specific options that should stay empty.
NNJM_TRAIN_NNJM_OPTS ?=
NNJM_GENEX_OPTS ?=
# Word class granularities to train.
# By default, we train word classes (wcl) for the granularities identified for
# use in coarse models (coarse LMs, coarse BiLMs, coarse TMs).
# => WCL_NCLS_LIST = COARSELM_NCLS_LIST BILM_NCLS_LIST
# Defining WCL_NCLS_LIST triggers word class training.
# If defined, WCL_NCLS_LIST must contain integer values.
WCL_NCLS_LIST ?= $(sort ${COARSELM_NCLS_LIST} ${BILM_NCLS_LIST} ${NNJM_NCLS})
WCL_NCLS_LIST := $(strip ${WCL_NCLS_LIST})
# By default, we assume that the user wants to use the corpora listed in
# TRAIN_LM, MIXLM, TRAIN_TM, MIXTM, and the coarse LM, BiLM and NNJM training
# corpora to build the word classes.
ifdef WCL_NCLS_LIST
TRAIN_WCL ?= $(sort ${TRAIN_LM} ${MIXLM} ${TRAIN_TM} ${MIXTM} ${TRAIN_COARSELM} ${TRAIN_BILM} \
${NNJM_TRAIN_CORPUS} ${NNJM_FINE_TUNING_TRAIN_CORPUS})
endif
TRAIN_WCL := $(strip ${TRAIN_WCL})
# Define what type of language model we want to generate.
LM_TYPES ?= binlm
# Parameters for models/decode/Makefile
#TEMPLATE_DIR ?= ${ROOT_DIR}/models/decode
PREFIX_DEV_COW ?= ${TUNE_DECODE}
PREFIX_DEV_RAT ?= ${TUNE_RESCORE}
# By default, we assume that the user wants to use the first corpus in PRIMARY_LM
# to build the truecasing models, or in TRAIN_LM (if no PRIMARY_LM), or in MIXLM
# (if no PRIMARY_LM or TRAIN_LM).
ifdef DO_TRUECASING
TRAIN_TC ?= $(firstword $(or ${PRIMARY_LM},${TRAIN_LM},${MIXLM}))
endif
TRAIN_TC := $(strip ${TRAIN_TC})
# Define the truecasing model filenames.
TRUECASING_MAP ?= ${TRAIN_TC}_${TGT_LANG}.map
TRUECASING_LM ?= ${TRAIN_TC}_${TGT_LANG}-kn-3g.binlm${GZ}
# Should we also use source language information in truecasing?
# NOTE: use of source language models in truecasing is not compatible with rescoring.
# We cannot use source language models when Arabic or Chinese is the source
# language because those languages are caseless.
ifeq ($(filter ${SRC_LANG}, ar ch),)
# Comment out to disable use of source language info; uncomment to enable.
TC_USE_SRC_MODELS ?= 1
endif
ifdef TC_USE_SRC_MODELS
TRUECASING_NC1_SRC_LM ?= ${TRAIN_TC}_${SRC_LANG}.nc1.binlm${GZ}
# Source language locale used during truecasing.
# If the default of ${SRC_LANG}_${SRC_LOCALE_COUNTRY}.UTF-8 is not correct,
# uncomment the following and declare the correct string here.
# For example:
#SRC_LOCALE ?= da_DK.utf8
#SRC_LOCALE ?= ${SRC_LANG}_${SRC_LOCALE_COUNTRY}.UTF-8
# Target language locale used during truecasing.
# If the default of ${TGT_LANG}_${TGT_LOCALE_COUNTRY}.UTF-8 is not correct,
# uncomment the following and declare the correct string here.
#TGT_LOCALE ?= ${TGT_LANG}_${TGT_LOCALE_COUNTRY}.UTF-8
# Make sure the source and target language locales are set.
SRC_LOCALE ?= ${SRC_LANG}_${SRC_LOCALE_COUNTRY}.UTF-8
SRC_LOCALE := $(strip ${SRC_LOCALE})
TGT_LOCALE ?= ${TGT_LANG}_${TGT_LOCALE_COUNTRY}.UTF-8
TGT_LOCALE := $(strip ${TGT_LOCALE})
endif # TC_USE_SRC_MODELS
# When working with TMX files, we assume the language code in the TMX is the
# upper case of $SRC_LANG/$TGT_LANG followed by '-' (hyphen) followed by
# $SRC_LOCALE_COUNTRY/$TGT_LOCALE_COUNTRY. When that's not true,
# uncomment the following and declare the correct strings here.
#TMX_SRC = EN-CA
#TMX_TGT = FR-CA
# If we are lucky enough to have a cluster, we'll change the shell for certain
# commands and allow them to run on nodes.
ifdef USING_CLUSTER
FRAMEWORK_SHELL = run-parallel.sh
else
FRAMEWORK_SHELL = /bin/bash
endif
# Some commands shouldn't be run with the cluster shell; they will use this one
# instead.
LOCAL_SHELL = /bin/bash
########################################
# LANGUAGE SPECIFICS
ifeq (${SRC_LANG},ar)
DONT_LOWERCASE_SRC = 1
endif
# We include src_lang specific configuration just before we validate the configuration.
-include $(dir $(lastword ${MAKEFILE_LIST}))Makefile.params.${SRC_LANG}
########################################
# This sets a default value for PORTAGE_GENERIC_MODEL if it was not defined in
# the user's environment.
PORTAGE_GENERIC_MODEL ?= ${PORTAGE}/generic-model
########################################
# VALIDATION
ifeq (${SRC_LANG},)
$(error You must provide a SRC_LANG!)
endif
ifeq (${TGT_LANG},)
$(error You must provide a TGT_LANG!)
endif
ifeq (${SRC_LANG},${TGT_LANG})
$(error SRC_LANG=${SRC_LANG} cannot be the same as TGT_LANG=${TGT_LANG}!)
endif
ifeq (${SRC_LOCALE_COUNTRY},)
$(error You must provide a SRC_LOCALE_COUNTRY!)
endif
ifeq (${TGT_LOCALE_COUNTRY},)
$(error You must provide a TGT_LOCALE_COUNTRY!)
endif
ifeq ($(strip ${TRAIN_LM} ${LM_PRETRAINED_TGT_LMS} ${MIXLM} ${MIXLM_PRETRAINED_TGT_LMS}),)
$(error You must always define a training corpus and/or pretrained models for language models)
endif
ifneq ($(strip ${MIXLM} ${MIXLM_PRETRAINED_TGT_LMS}),)
ifeq ($(words ${MIXLM} ${MIXLM_PRETRAINED_TGT_LMS}),1)
$(error You must specify multiple LMs in MIXLM and/or MIXLM_PRETRAINED_TGT_LMS to train a MixLM)
endif
endif
ifeq ($(strip ${TRAIN_TM} ${TM_PRETRAINED_TMS} ${MIXTM} ${MIXTM_PRETRAINED_TMS}),)
$(error You must always define a training corpus and/or pretrained models for translation models)
endif
ifdef USE_MIXTM
ifeq (${MIXTM_TRAIN_MIX},)
$(error With a MixTM, you must define a training corpus (MIXTM_TRAIN_MIX) for training the mixture weights)
endif
ifneq ($(words ${MIXTM_TRAIN_MIX}),1)
$(error You must not provide more than one corpus in MIXTM_TRAIN_MIX)
endif
ifeq (${TUNE_MIXTM},)
$(error With a MixTM, you must define a tuning corpus (TUNE_MIXTM) for tuning the mixture weights)
endif
ifneq ($(filter FAST_ALIGN,${MERGED_CPT_ZN_MODEL} ${MERGED_CPT_JPT_TYPES}),)
ifeq (${UNITTESTING},)
$(error MixTM with fast_align is not implemented.)
# There is no force-align mode with fast_align. To create TUNE_MIXTM's word
# alignment file, we would have to train a fast_align model on the training data,
# then train a second model on the concatenation of the training data and
# TUNE_MIXTM, from which we would keep only the word alignment file for the
# TUNE_MIXTM subset.
endif
endif
endif # USE_MIXTM
ifeq (${TUNE_DECODE},)
$(error You must always define a tuning corpus to train the decoder)
endif
ifdef DO_CE
ifeq (${TUNE_CE},)
$(error When asking for confidence estimation, you must also define a TUNE_CE)
endif
ifneq (${REFERENCE_INDICES},)
$(error Multiple references is not supported with confidence estimation)
endif
endif
ifdef DO_RESCORING
ifeq (${TUNE_RESCORE},)
$(error When asking for rescoring, you must also define a TUNE_RESCORE)
endif
ifdef TC_USE_SRC_MODELS
$(error When asking for rescoring, you must not define TC_USE_SRC_MODELS)
endif
endif
ifdef DO_TRUECASING
ifeq (${TRAIN_TC},)
$(error With truecasing, you must define a training corpus (TRAIN_TC) for training the truecasing model)
endif
# Check the SRC_LOCALE and TGT_LOCALE if needed for truecasing, but only once.
ifdef TC_USE_SRC_MODELS
ifeq (${MAKELEVEL},0)
CHECK_LOCALE = $(shell perl -e "use POSIX qw(locale_h); exit 1 unless defined setlocale(LC_CTYPE,q($1));"; echo $$?)
ifeq (${SRC_LOCALE},)
$(error With truecasing with TC_USE_SRC_MODELS, you must define the SRC_LOCALE!)
else
LOCALE_RC := $(call CHECK_LOCALE,${SRC_LOCALE})
ifneq (${LOCALE_RC},0)
$(info SRC_LANG: ${SRC_LANG})
$(info SRC_LOCALE_COUNTRY: ${SRC_LOCALE_COUNTRY})
$(info SRC_LOCALE: ${SRC_LOCALE})
$(error Error: Invalid locale ${SRC_LOCALE}; check values of SRC_LOCALE, SRC_LANG, SRC_LOCALE_COUNTRY; \
if correct, locale ${SRC_LOCALE} needs to be installed)
endif
endif # SRC_LOCALE
ifeq (${TGT_LOCALE},)
$(error With truecasing with TC_USE_SRC_MODELS, you must define the TGT_LOCALE!)
else
LOCALE_RC := $(call CHECK_LOCALE,${TGT_LOCALE})
ifneq (${LOCALE_RC},0)
$(info TGT_LANG: ${TGT_LANG})
$(info TGT_LOCALE_COUNTRY: ${TGT_LOCALE_COUNTRY})
$(info TGT_LOCALE: ${TGT_LOCALE})
$(error Error: Invalid locale ${TGT_LOCALE}; check values of TGT_LOCALE, TGT_LANG, TGT_LOCALE_COUNTRY; \
if correct, locale ${TGT_LOCALE} needs to be installed)
endif
endif # TGT_LOCALE
endif # MAKELEVEL 0
endif # TC_USE_SRC_MODELS
endif # DO_TRUECASING
ifdef USE_LDM
ifeq (${TRAIN_LDM},)
$(error With USE_LDM, you must define a training corpus (TRAIN_LDM) for training the distortion model)
endif
endif
ifdef USE_HLDM
ifeq (${TRAIN_HLDM},)
$(error With USE_HLDM, you must define a training corpus (TRAIN_HLDM) for training the distortion model)
endif
endif
ifdef USE_SPARSE
ifeq (${TRAIN_SPARSE},)
$(error With USE_SPARSE, you must define a training corpus (TRAIN_SPARSE) for training the sparse features)
endif
endif
ifdef USE_COARSELM
ifeq (${TRAIN_COARSELM},)
$(error With USE_COARSELM, you must define a training corpus (TRAIN_COARSELM) for training the coarse LM(s))
endif
ifeq (${COARSELM_NCLS_LIST},)
$(error With USE_COARSELM, you must define word class granularities (COARSELM_NCLS_LIST) for training the coarse LM(s))