-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrequency-list-wizard.pl
executable file
·658 lines (561 loc) · 30.6 KB
/
frequency-list-wizard.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
#!/usr/bin/env perl
# Program version string; interpolated into the start-up banner and the help text.
my $version = "1.2.0";
#################################################################################
# PROGRAM: Frequency List Wizard #
# AUTHOR: Scott Sadowsky #
# EMAIL: s s a d o w s k y A T g m a i l D O T c o m #
# WEB: http://sadowsky.cl/ #
# #
# HELP: For help, run: (SCRIPT) ./frequency-list-wizard.pl -h #
# (EXECUTABLE) frequency-list-wizard.exe -h #
# #
# COPYRIGHT: Copyright (c) 2016 by Scott Sadowsky #
# LICENSE: Licensed under the GNU General Public License, version 3 (GPLv3) #
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
# #
# Thanks to Stefan Evert for his help with the merge routine. #
# #
#################################################################################
# CHANGELOG
#
# 1.2.0
# - Now processes UTF-8 texts. FLW assumes this is the encoding used. To process
# Latin-1, you must specify this with "-lat" on the command line.
# CONFIGURE
use strict; # quote strings, declare variables
use warnings; # on by default
use utf8; # so literals and identifiers can be in UTF-8
use v5.12; # or later to get "unicode_strings" feature
use warnings qw(FATAL utf8); # fatalize encoding glitches
use open qw(:std :utf8); # undeclared streams in UTF-8
# COMMENT THIS OUT when compiling executable!
#use warnings;
# Use modules
use Getopt::Long qw(:config no_auto_abbrev); # Module for processing command line options.
# USER-ADJUSTABLE DEFAULTS
# These values define the program's default behavior; most of them can be
# overridden from the command line. Change them with care.
my ( $no_nums, $no_punct ) = ( 1, 1 );    # Do NOT process words containing numbers / punctuation
my $print_stats = 1;                      # Append type, token and TTR statistics to the output
my ( $input_types, $input_tokens ) = ( 0, 0 );    # Running counts of all INPUT types / tokens
my $proc_tokens = 0;                              # Running count of PROCESSED tokens
our $proc_types = 0;                              # Running count of PROCESSED types
my $three_cols = 0;                               # Input has a third column
my ( $kill_head, $kill_tail ) = ( 1, 1 );         # Prune head / tail of POS info
my $spaces_as_split_char = 1;                     # Treat runs of 2+ spaces as the split character
my $merge_allomorphs     = 1;                     # Merge u/o, e/y and l/el entries
my ( $pos_minimal, $pos_full ) = ( 0, 0 );        # POS sub-modes
my $calculate_hapax = 0;                          # Meta-frequency (hapax/legomena) mode
my $delimiter       = "\t";                       # Delimiter inserted between columns in output file
my $split_char      = "\t";                       # Character that each input line is split on
my $punctuation = '(\*|\@|\.|\,|\;|\:|\=|\-|_|\"|\'|\+|\?|¿|\¿|\!|\¡|¡|\#|\$|\%|\&|\/|\(|\)|\^|\`|\[|\]|\{|\}|\~|\\\|\||<|>|¬|·)';
my $latin = 0;                                    # 0 = UTF-8 (default), 1 = Latin-1

# INTERNAL STATE
# For internal use only! Changing these will most likely break FLW!
my ( $input_file_name, $output_file_name, $in_basename ) = ( "", "", "" );
my ( $tokens, $word );                            # Fields of the line currently being processed
my %F;                                            # Summed frequency per (lower-cased) entry
my $extra_col = "";                               # Third column of the current line, if any
my ( $need_help, $need_usage, $debug ) = ( 0, 0, 0 );

# META-CONFIGURATION FLAGS
my ( $process_as_words,   $process_as_lemmas,        $process_as_pos )            = ( 0, 0, 0 );
my ( $process_as_synrels, $process_as_word_plus_pos, $process_as_lemma_plus_pos ) = ( 0, 0, 0 );
# Read command line options.
#
# Specs ending in "!" are negatable (e.g. -nonn, -nonp, -noma, -nokh); specs
# ending in "=s" take a string value; the rest are simple on-switches.
# Auto-abbreviation is disabled when Getopt::Long is loaded, so every accepted
# spelling must be listed explicitly as an alias.
#
# Differences from the previous option table (all backward-compatible):
#   - "mergeallo|ma" is now negatable, so the documented "-noma" switch works.
#   - "st" is accepted as an alias of "ss" and "meta-freq" as an alias of
#     "metafreq", matching the spellings shown in the help text.
#   - Invalid options now abort the run instead of being silently ignored.
GetOptions(
    "input|i=s"           => \$input_file_name,
    "output|o=s"          => \$output_file_name,
    "delimiter|del|d=s"   => \$delimiter,
    "spliton|so=s"        => \$split_char,
    "nonums|nn!"          => \$no_nums,
    "nopunct|np!"         => \$no_punct,
    "print-stats|ps!"     => \$print_stats,
    "3-col|3c!"           => \$three_cols,
    "killhead|kh!"        => \$kill_head,
    "killtail|kt!"        => \$kill_tail,
    "spaces-split|ss|st!" => \$spaces_as_split_char,
    "help|h"              => \$need_help,
    "usage|u"             => \$need_usage,
    "mergeallo|ma!"       => \$merge_allomorphs,
    "latin|lat"           => \$latin,

    # Meta-configurations
    "words|w"     => \$process_as_words,
    "lemmas|l"    => \$process_as_lemmas,
    "pos|p"       => \$process_as_pos,
    "posfull|pf"  => \$pos_full,
    "posmin|pm"   => \$pos_minimal,
    "synrel|sr"   => \$process_as_synrels,
    "wordpos|wp"  => \$process_as_word_plus_pos,
    "lemmapos|lp" => \$process_as_lemma_plus_pos,

    # Other modes
    "debug|db!"                                    => \$debug,
    "metafreq|meta-freq|mf|hapax|hx|legomena|leg!" => \$calculate_hapax
) || die "\nERROR: Invalid command line options. Run the program with the -h switch for help.\n";
# Translate the meta-configuration switches into concrete settings.
# The presets are applied in sequence, so when several are active at once the
# later ones overwrite the values set by the earlier ones.

# Either specialised POS mode implies generic POS processing.
$process_as_pos = 1 if ( $pos_minimal == 1 || $pos_full == 1 );

# WORDS / LEMMAS: plain two-column list, no POS pruning.
if ( $process_as_words == 1 || $process_as_lemmas == 1 ) {
    ( $three_cols, $kill_head, $kill_tail ) = ( 0, 0, 0 );
}

# POS: two-column list, keep numbers and punctuation.
if ( $process_as_pos == 1 ) {
    ( $no_nums, $no_punct, $three_cols ) = ( 0, 0, 0 );

    # Plain POS mode: kill the head, keep the tail.
    ( $kill_head, $kill_tail ) = ( 1, 0 );

    # Minimal POS: kill both head and tail.
    ( $kill_head, $kill_tail ) = ( 1, 1 ) if ( $pos_minimal == 1 );

    # Full POS: leave the whole POS field intact.
    ( $kill_head, $kill_tail ) = ( 0, 0 ) if ( $pos_full == 1 );
}

# SYNTACTIC RELATIONS: switch off every filtering and pruning option.
if ( $process_as_synrels == 1 ) {
    ( $no_nums, $no_punct, $three_cols, $kill_head, $kill_tail ) = ( 0, 0, 0, 0, 0 );
}

# WORD+POS and LEMMA+POS share one preset: three columns, everything on.
if ( $process_as_word_plus_pos == 1 || $process_as_lemma_plus_pos == 1 ) {
    ( $no_nums, $no_punct, $three_cols, $kill_head, $kill_tail ) = ( 1, 1, 1, 1, 1 );
}

# META-FREQUENCY (hapax): raw two-column processing with nothing filtered,
# pruned or merged.
if ( $calculate_hapax == 1 ) {
    ( $no_nums, $no_punct, $three_cols, $kill_head, $kill_tail ) = ( 0, 0, 0, 0, 0 );
    ( $process_as_synrels, $merge_allomorphs ) = ( 0, 0 );
}

# Forced setting: killing the tail always kills the head as well.
$kill_head = 1 if ( $kill_tail == 1 );
#print STDOUT "\n\$split_char = \"$split_char\"\n"; exit; # FOR DEBUGGING
# CATCH HELP AND USAGE REQUESTS #
# A usage request is answered with the full help text too, until/unless a
# separate usage section is created. Either way the program stops afterwards.
if ( $need_help == 1 || $need_usage == 1 ) {
    help_me();
    exit;
}
# Print program info to terminal #
# Every banner line is emitted with a leading newline; one final newline
# closes the banner.
foreach my $banner_line (
    "#####################################################################################",
    "# FREQUENCY LIST WIZARD $version #",
    "# Copyright (c) 2016 Scott Sadowsky #",
    "# #",
    "# http://sadowsky.cl/ - ssadowsky at gmail period com #",
    "# Licensed under the GNU General Public License, version 3 (GPLv3) #",
    "# #",
    "# For help, run the program with the -h switch. #",
    "# Input files are expected to be UTF-8 encoded. Use the -lat switch for Latin-1 #",
    "#####################################################################################",
    )
{
    print STDOUT "\n$banner_line";
}
print STDOUT "\n";
# ENSURE AN INPUT FILE IS SPECIFIED #
if ( $input_file_name eq "" ) {
    die "\nERROR: You must provide the name of the file to be processed with the -i switch:\n frequency-list-wizard.pl -i=INPUT-FILE.txt\n frequency-list-wizard.exe -i=INPUT-FILE.txt\n";
}

# IF NO OUTPUT FILE NAME IS SPECIFIED, GENERATE ONE AUTOMATICALLY #
if ( $output_file_name eq "" ) {

    # Strip the extension from the input file name.
    # Only a final ".ext" belonging to the file-name component is removed:
    # the extension may not contain a path separator, and the dot must be
    # preceded by a real file-name character. This keeps paths such as
    # "../data/corpus" or "dir.v2/corpus" intact (the previous pattern cut
    # them at a dot inside the directory part) and leaves dot-files alone.
    $in_basename = $input_file_name;
    $in_basename =~ s{(?<=[^/\\])\.[^./\\]+\z}{};

    # Generate output file name from input basename
    if ( $calculate_hapax == 1 ) {
        $output_file_name = "$in_basename.METAFREQ.txt";
    }
    else {
        $output_file_name = "$in_basename.FLW.txt";
    }
}
# PROCESS SPECIAL CASES OF CHARACTERS PROVIDED ON COMMAND LINE #
# Both the output delimiter and the split-on string accept "t" or a literal
# backslash-t as shorthand for a real tab character. The loop variable aliases
# each of the two variables in turn, so the assignment changes them in place.
foreach my $separator ( $delimiter, $split_char ) {
    if ( $separator eq "t" || $separator eq "\\t" ) {
        $separator = "\t";
    }
}
##### OPEN INPUT AND OUTPUT FILES #####
# Both files use the same encoding: ISO-8859-1 (Latin-1) if the user asked for
# it, otherwise the default (UTF-8).
my $io_encoding = ( $latin == 1 ) ? 'iso-8859-1' : 'utf8';

# Opening the output file truncates it, so refuse to write over the input.
if ( $input_file_name eq $output_file_name ) {
    die "\nERROR: The output file ($output_file_name) must not be the same as the input file!\n";
}

# OPEN INPUT FILE FOR READING #
# The system error ($!) is reported so the user can tell a missing file from a
# permissions problem.
open( INPUTFILE, "<:encoding($io_encoding)", $input_file_name )
    || die "\nERROR: Input file ($input_file_name) couldn't be read: $!\n";

# OPEN OUTPUT FILE FOR WRITING #
open( OUTPUTFILE, ">:encoding($io_encoding)", $output_file_name )
    || die "\nERROR: The output file ($output_file_name) couldn't be opened for writing: $!\n";
# DEBUG
# Dump the effective filter and pruning settings, after the meta-configurations
# have been applied. Each value is printed under its own variable name (the
# $kill_tail value used to be labelled "$kill_head").
if ( $debug == 1 ) {
    print STDOUT "\n===================================";
    print STDOUT "\n\$no_nums=\t$no_nums";
    print STDOUT "\n\$no_punct=\t$no_punct";
    print STDOUT "\n\$three_cols=\t$three_cols";
    print STDOUT "\n\$kill_head=\t$kill_head";
    print STDOUT "\n\$kill_tail=\t$kill_tail";
    print STDOUT "\n===================================\n\n";
}
# Tell the user which files are being read and written.
print STDOUT "\nInput file:\t$input_file_name", "\nOutput file:\t$output_file_name", "\n\n";
# PROCESS INPUT FILE LINE BY LINE #
#
# Each line is split into a frequency column and a word column (plus an
# optional third column). Entries are lower-cased and their frequencies summed
# in %F; the input and processed counters are updated along the way.
#
# Differences from the previous version of this loop:
#   - Lines with no non-whitespace content are skipped. They used to be
#     counted as a type and to create an empty entry in the output.
#   - The split string is matched literally, so characters such as "|" or "."
#     can be used with -so.
#   - Missing columns default to an empty string / zero instead of undef.
#   - The input file is closed once it has been read.
LINE:
while ( my $line = <INPUTFILE> ) {
    chomp $line;

    # PREPROCESSING #
    # Strip leading spaces off line (CWB adds them to
    # 1st (frequency) column in some modes).
    $line =~ s/^( )+//g;

    # Treat 2+ spaces as the split character, if desired
    if ( $spaces_as_split_char == 1 ) {
        $line =~ s/( ){2,}/$split_char/g;
    }

    # Eliminate unwanted portion of SynRel field (Connexor output) if desired #
    if ( $process_as_synrels == 1 ) {
        $line =~ s/>.+$//;
    }

    # Nothing left to process on this line.
    next LINE unless ( $line =~ m/\S/ );

    # SPLIT INPUT LINES AND PRE-PROCESS THEM #
    # PROCESS THREE-COLUMN INPUT FILES
    if ( $three_cols == 1 ) {

        # FOR HAPAX PROCESSING
        # Read the FREQUENCY column in as if it contained words, and set
        # $tokens to 1 (this has the effect of making frequencies countable).
        # NOTE(review): hapax mode forces $three_cols to 0 earlier in the
        # script, so this branch looks unreachable at present.
        if ( $calculate_hapax == 1 ) {
            ( $word, $tokens, $extra_col ) = split( /\Q$split_char\E/, $line );
            $tokens = 1;
        }

        # FOR NORMAL (NON-HAPAX) PROCESSING
        else {
            ( $tokens, $word, $extra_col ) = split( /\Q$split_char\E/, $line );
        }

        # Lines with fewer columns than expected leave fields undefined.
        $tokens    //= 0;
        $word      //= "";
        $extra_col //= "";

        # Eliminate head info from THIRD column, if desired. #
        if ( $kill_head == 1 ) {
            $extra_col =~ s/^\@\w+ //;
        }

        # Eliminate extra POS info from THIRD column, if desired. #
        if ( $kill_tail == 1 ) {
            $extra_col =~ s/Heur /Heur_/;    # Fuse "Heur" to main POS that follows
            $extra_col =~ s/ .+$//;          # Kill tail of POS info
        }

        # Join column 2 (typically "word") and column 3 (typically "pos" or
        # similar) unless hapax are being calculated.
        unless ( $calculate_hapax == 1 ) {
            $word = $word . "#####" . $extra_col;    # "#####" is a temporary divider
        }

        # Merge entries for specific allomorphs in LEMMA+POS or WORD+POS
        # form that Connexor treats separately, if desired.
        if ( $merge_allomorphs == 1 ) {
            $word =~ s/^(u|U)#####CC/o#####CC/;
            $word =~ s/^(e|E)#####CC/y#####CC/;
            $word =~ s/^(l|L)#####(DET.*)/el#####$2/;    # For Connexor-style output, where "del" > "de" "l"
        }
    }

    # PROCESS TWO-COLUMN INPUT FILES
    else {

        # If calculating hapax, invert columns: read FREQUENCIES into $word
        # and assign $tokens a value of 1 to make the frequencies countable.
        if ( $calculate_hapax == 1 ) {
            ( $word, $tokens ) = split( /\Q$split_char\E/, $line );
            $tokens = 1;
        }
        else {
            ( $tokens, $word ) = split( /\Q$split_char\E/, $line );
        }

        # Lines with fewer columns than expected leave fields undefined.
        $tokens //= 0;
        $word   //= "";

        # Eliminate POS head info
        if ( $kill_head == 1 ) {
            $word =~ s/^\@\w+ //;
        }

        # ONLY if processing input as POS, eliminate POS tail info from the
        # SECOND column, if desired.
        if ( ( $process_as_pos == 1 ) && ( $kill_tail == 1 ) ) {
            $word =~ s/Heur /Heur_/;    # Fuse "Heur" to main POS that follows
            $word =~ s/ .+$//;          # Kill tail of POS info
        }

        # Merge entries for specific allomorphs in LEMMA or WORD form
        # that Connexor treats separately, if desired.
        if ( $merge_allomorphs == 1 ) {
            $word =~ s/^(u|U)$/o/;
            $word =~ s/^(e|E)$/y/;
            $word =~ s/^(l|L)$/el/;
        }
    }

    # INCREMENT INPUT TYPE AND FREQUENCY COUNTERS #
    $input_types  = $input_types + 1;
    $input_tokens = $input_tokens + $tokens;

    # ELIMINATE HEAD INFO FROM *SECOND* COLUMN (MERGED OR NOT), IF DESIRED. #
    # (This refers to part of the grammatical tagging done by Connexor.)
    if ( $kill_head == 1 ) {
        $word =~ s/^\@\w+ //;
    }

    # DEBUG
    if ( $debug == 1 ) {
        print STDOUT "\n\{$tokens\}\t\{$word\}\t\t\{$extra_col\}";
    }

    # EXCLUDE UNWANTED ENTRIES (NUMBERS AND/OR PUNCTUATION) #
    # In three-column mode only a match that is followed by the "#####"
    # divider counts, so the test never looks at the third column alone.
    my $has_number =
        ( $three_cols == 1 )
        ? ( $word =~ m/[0-9].*?(#####)/ )
        : ( $word =~ m/[0-9].*?($)/ );
    my $has_punctuation =
        ( $three_cols == 1 )
        ? ( $word =~ m/$punctuation.*?(#####)/ )
        : ( $word =~ m/$punctuation.*?($)/ );

    next LINE if ( $no_nums == 1  && $has_number );
    next LINE if ( $no_punct == 1 && $has_punctuation );

    # SUM FREQUENCIES #
    # Entries that differ only in capitalization share one hash key.
    $F{ lc($word) } += $tokens;
    $proc_tokens = $proc_tokens + $tokens;
}
close(INPUTFILE);
END {

    # WRITE RESULTS TO OUTPUT FILE #
    # This block runs whenever the program ends, including after the early
    # "exit" used for help requests and after a fatal error. Results are
    # therefore only written if the output file was actually opened.
    if ( defined fileno(OUTPUTFILE) ) {

        # FOR HAPAX CALCULATIONS #
        if ( $calculate_hapax == 1 ) {

            # Print hapax output file header
            print OUTPUTFILE "TYPE-OF-LEGOMENA\tCOUNT\n";

            # Drop statistics lines copied over from the source file (they
            # begin with "="), then sort the remaining keys -- which are the
            # source frequencies -- in ascending numeric order.
            my @keys =
                sort { $a <=> $b || $F{$b} <=> $F{$a} || $a cmp $b }
                grep { $_ !~ m/^=/ } keys %F;

            # Print each key and its count to the output file
            foreach my $key (@keys) {
                print OUTPUTFILE $key, "$delimiter", $F{$key}, "\n";
            }
        }

        # FOR NORMAL (NON-HAPAX) PROCESSING #
        else {

            # Sort entries by descending frequency (e.g. 200,10,2,1 instead of
            # 200,2,10,1); ties go to the longer entry, then alphabetically.
            foreach ( sort { $F{$b} <=> $F{$a} || length($b) <=> length($a) || $a cmp $b } keys %F ) {

                # Increment processed types count
                $proc_types = $proc_types + 1;

                # Place value of $_ in a temp variable so it can be transformed,
                # while allowing $_ to be used for hash lookup purposes
                my $transformed_word = $_;

                # Unmerge columns if they were merged
                if ( $three_cols == 1 ) {
                    $transformed_word =~ s/#####/$delimiter/;
                }

                # Print each line to output file
                print OUTPUTFILE $F{$_}, "$delimiter", $transformed_word, "\n";
            }
        }

        # Print statistics of processed items, if desired (non-hapax only) #
        if ( ( $print_stats == 1 ) && ( $calculate_hapax == 0 ) ) {

            # Calculate Type-Token Ratios and print type, token and TTR info #
            if ( $input_tokens != 0 ) {    # Avoid divide by zero errors
                my $input_ttr = sprintf( '%.10f', ( $input_types / $input_tokens ) );

                # Print type, token and TTR info for INPUT ITEMS
                print OUTPUTFILE "============\t============\t============\n";
                print OUTPUTFILE "=\t$input_types\t!INPUT_TYPES\n";
                print OUTPUTFILE "=\t$input_tokens\t!INPUT_TOKENS\n";
                print OUTPUTFILE "=\t$input_ttr\t!INPUT_TTR\n";
            }
            if ( $proc_tokens != 0 ) {     # Avoid divide by zero errors
                my $processed_ttr = sprintf( '%.10f', ( $proc_types / $proc_tokens ) );

                # Print type, token and TTR info for PROCESSED ITEMS
                print OUTPUTFILE "============\t============\t============\n";
                print OUTPUTFILE "=\t$proc_types\t!PROCESSED_TYPES\n";
                print OUTPUTFILE "=\t$proc_tokens\t!PROCESSED_TOKENS\n";
                print OUTPUTFILE "=\t$processed_ttr\t!PROCESSED_TTR\n";
                print OUTPUTFILE "============\t============\t============\n";
            }
        }

        # Buffered write errors (e.g. a full disk) only surface on close.
        close(OUTPUTFILE)
            || warn "\nWARNING: The output file ($output_file_name) couldn't be closed properly: $!\n";
    }
}

# End of program. #
exit;
# SUBROUTINE: HELP MESSAGE #
# Prints the program banner and the complete help text to STDOUT. Takes no
# arguments. Every line is emitted with a leading newline, and two newlines
# close the text.
#
# The option spellings shown here follow the option table passed to
# GetOptions: "-ss" for --spaces-split and "--metafreq" for the meta-frequency
# mode. The -kt entry states that it forces -killhead, which is what the
# program does.
sub help_me {
    my @help_lines = (
        "#####################################################################################",
        "# FREQUENCY LIST WIZARD $version #",
        "# Copyright (c) 2016 Scott Sadowsky #",
        "# #",
        "# http://sadowsky.cl - ssadowsky at gmail period com #",
        "# Licensed under the GNU General Public License, version 3 (GPLv3) #",
        "#####################################################################################",
        "",
        "USAGE (script): ./frequency-list-wizard.pl -i=INFILE.TXT [OPTIONS]",
        " (.exe) : frequency-list-wizard.exe -i=INFILE.TXT [OPTIONS]",
        "",
        "SUMMARY: Process frequency lists in various useful ways.",
        "",
        "REQUIREMENTS: - Input files are assumed to be in UTF-8 encoding.",
        " - Use the -lat switch to process ISO-8859-1 (Latin-1) files.",
        " - Frequency lists may have two or three columns. The third column is",
        " optional. The -3c switch must be used with such lists.",
        " - First column MUST contain the frequencies.",
        "",
        "DESCRIPTION: The default processing mode takes a 2-column frequency list in UTF-8",
        " encoding, merges all entries that vary only by their capitalization ",
        " (e.g. \'house\', \'House\' and \'HOUSE\'), and sums the frequencies",
        " of each of these items to give you the total frequency per \'allo-",
        " capitalization\' (which is almost certainly what is desired when",
        " working with lexical items, lemmas, etc.). It performs a reverse natural",
        " numeric sort on the results and outputs them to a text file.",
        "",
        " Three-column lists (e.g. frequency + lemma + POS) can be processed using",
        " the \'-3c\' switch. This options allows identical lemmas with different",
        " POSes to be processed (and counted) separately (e.g. \'jump\' (NOUN) and",
        " \'jump\' (VERB)).",
        "",
        " If desired, FLW can also calculate the total number of types and tokens",
        " in the frequency list, as well as its type-token ratio. (This is done by",
        " default, and printed at the end of the processed frequency list).",
        "",
        " Optionally, FLW can eliminate entries containing numerals (-nn) and/or",
        " punctuation marks (-np) from frequency lists. It can also merge certain",
        " Spanish allomorphs (y + e, o + u) into a single item (-ma). All three",
        " options are activated by default, and can be deactivated with the -nonn,",
        " -nonp and -noma switches. The difference between the number of items in",
        " the source frequency list and the number actually processed after",
        " eliminating numbers or punctuation marks is reflected in the type and",
        " token counts shown with the \'--print-stats\' option (\'INPUT_TYPES\'",
        " versus \'PROCESSED_TYPES\', etc.).",
        "",
        " When using the 3-column option, POS information in the third column can",
        " be pruned if it is in a Connexor-style format (e.g. \'\@NH N MSC SG\').",
        " The -kh (--killhead) switch will eliminate the head of the field (\'\@NH \'),",
        " while -kt (--killtail) will eliminate the tail (\' MSC SG\').",
        "",
        " The \'meta-frequency\' (AKA \'legomena\') processing mode (activated with the",
        " \'-mf\' or \'-hx\' switches) calculates the frequency of each frequency in",
        " the list. Its output is a frequency list of frequencies -- how many items",
        " occur 1 time, 2 times, and so on.",
        "",
        "OPTIONS:",
        "",
        " -i, --input Name of input file. MANDATORY! Assumed to be UTF-8.",
        " -o, --output Name of output file. If not provided, a name will be automatically",
        " generated using the input file base name.",
        " -lat,--latin Process Latin-1 (ISO-8859-1) texts. Output will be encoded same way.",
        " -ps,--print-stats Calculate and print type, token and TTR statistics (DEFAULT: ON).",
        " -mf, --metafreq Calculate the frequencies of each frequency in the list. In other",
        " words, generates a meta-frequency list, or list of n-legomena.",
        " -leg, --legomena Same as -mf or --metafreq.",
        "",
        " -nn, --nonums Eliminate list entries that contain numbers (e.g. \"Bill7\").",
        " -np, --nopunct Eliminate list entries that contain punctuation (e.g. \"a\@b.com\").",
        " -ma, --mergeallo Merge Spanish allomorphs (e.g. \"y\" and \"e\", \"o\" and \"u\").",
        "",
        " -3c, --3-col Process 3-column lists. Temporarily merges columns 2 (typically",
        " \"word\") and 3 (\"POS\", \"lemma\", etc.). This allows processing",
        " of identical items that have different POSes/lemmas assigned to",
        " them (e.g. \"canto\" (NOUN SG MSC) and \"canto\" (V 1SG PRES IND)).",
        " After processing, the merge is undone, giving the original",
        " number of columns.",
        " -kh, -killhead In lists that provide head info in the format \"\@NH \", eliminate",
        " this information, leaving only POS info in the column (e.g.",
        " Connexor). Assumes that this info is in the THIRD column.",
        " -kt, -killtail In lists with POS info, eliminate all of this info EXCEPT the",
        " general grammatical category (e.g. \"DET MSC SG\" becomes \"DET\").",
        " Forces -killhead.",
        "",
        " -so, --spliton Define the character that input file lines will be split on. The",
        " default value is \\t (tab).",
        " -d, --delimiter Allows an alternative delimiter character to be used. This is the",
        " character that is inserted between columns in the output file.",
        " Entering \"t\" will produce \\t. The default value is \\t (tab).",
        " -ss,--spaces-split Treat 2 or more spaces as the split character. Typically for messy",
        " lists. Care must be taken with this option, as any extraneous",
        " space can (and will) have undesirable consequences.",
        "",
        "META-CONFIGURATIONS:",
        "",
        " -w, --words Process frequency list as words (2 columns: FREQ, WORD).",
        " -l, --lemmas Process frequency list as lemmas (2 columns: FREQ, LEMMA).",
        " -pm, --posmin Process frequency list as minimal POS (2 columns: FREQ, POS. Kills",
        " POS head and tail).",
        " -p, --pos Process frequency list as partial POS (2 columns: FREQ, POS. Kills",
        " POS head, leaves tail intact).",
        " -pf, --posfull Process frequency list as full POS (2 columns: FREQ, POS. Leaves",
        " entire POS intact).",
        " -sr, --synrel Process frequency list as syntactic relationships (2 columns, deactivates",
        " potentially destructive options).",
        " -wp, --wordpos Process frequency list as words + POS (3 columns: FREQ, WORD, POS. Kills",
        " POS head and tail, and eliminates numbers and punctuation).",
        " -lp, --lemmapos Process frequency list as lemmas + POS (3 columns: FREQ, LEMMA, POS. Kills",
        " POS head and tail, and eliminates numbers and punctuation).",
        "",
        " -db, --debug Print debug info to STDOUT.",
        " -h, --help Show this help information.",
    );

    foreach my $help_line (@help_lines) {
        print STDOUT "\n$help_line";
    }
    print STDOUT "\n\n";
}