-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake-confusables.pl
executable file
·114 lines (100 loc) · 2.63 KB
/
make-confusables.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/home/ben/software/install/bin/perl
use warnings;
use strict;
use utf8;
use feature 'signatures';
no warnings 'experimental::signatures';
use FindBin '$Bin';
use JSON::Create 'write_json';
use Getopt::Long;
use Convert::Moji 'make_regex';
use List::Util 'uniq';
use File::Slurper qw!read_text write_text!;
use lib "$Bin/lib";
use Unicode::Confuse::Parse qw!parse_confusables metadata!;
# make_path is imported here so that the output directory can be
# created without interpolating a path into a shell command.
use File::Path 'make_path';

# Default location of the confusables data; override it with --file.
my $file = '/home/ben/data/unicode/confusables/confusables.txt';
my $ok = GetOptions (
    "file=s" => \$file,
    verbose => \my $verbose,
);
# Print the usage message and stop if the options could not be parsed
# or the input file does not exist.
if (! $ok || ! -f $file) {
    print <<EOF;
This script is part of the Unicode::Confuse Perl distribution.
It regenerates the JSON file distributed with this module. Run it as
follows:
$0 --file </path/to/confusables.txt>
Download the confusables file from
https://www.unicode.org/Public/security/latest/confusables.txt
EOF
    exit;
}
if ($verbose) {
    print "Parsing the file '$file':\n";
}
# $con is used below as a hash reference whose keys and values are both
# needed for the outputs.
my $con = parse_confusables ($file, $verbose);
# $dir is also read by write_regex_module, so it has to stay a
# file-level lexical with this name.
my $dir = "$Bin/lib/Unicode/Confuse";
if (! -d $dir) {
    # Creates any missing parent directories as "mkdir -p" did, but
    # does not go through the shell, so a path containing spaces or
    # shell metacharacters is handled correctly.
    make_path ($dir);
}
my $out = "$dir/confusables.json";
if ($verbose) {
    print "Writing the data as JSON to $out\n";
}
my %thing;
$thing{confusables} = $con;
# The regex module is built from every key of the mapping plus every
# distinct value.
my @keys = keys %$con;
push @keys, uniq (values %$con);
# Build the reverse mapping: each value points at the sorted list of
# keys which map to it.
my %rev;
for my $k (keys %$con) {
    my $v = $con->{$k};
    push @{$rev{$v}}, $k;
}
for my $k (keys %rev) {
    $rev{$k} = [uniq sort @{$rev{$k}}];
}
$thing{reverse} = \%rev;
write_json ($out, \%thing, indent => 1, sort => 1);
write_regex_module (\@keys);
# The metadata of the input file goes into a separate JSON file next to
# this script.
my $metadata = metadata ($file);
write_json ("$Bin/confusables-metadata.json", $metadata,
            indent => 1, sort => 1);
exit;
# Write "$dir/Regex.pm", a generated module containing one regular
# expression which matches any of the strings in the array reference
# $keys. The file-level variable $dir supplies both the output
# directory and, as "$dir.pm", the module whose version is copied.
# Dies if no $VERSION assignment can be found in "$dir.pm".
sub write_regex_module ($keys)
{
    # This looks clunky but we can't "use" this module, because it
    # "uses" the thing we are about to write out, and that will be
    # circular, so we read it and grab the version using a regex.
    my $pm = read_text ("$dir.pm");
    my ($version) = $pm =~ /\$VERSION\s*=\s*(\S+);/;
    if (! defined $version) {
        # The file was read at this point; what failed is the search
        # for the version, so say that.
        die "Could not find a \$VERSION assignment in $dir.pm";
    }
    # Escape each string so that it matches literally, then put the
    # longest alternatives first, with ties in string order.
    my @inputs = sort {
        length ($b) <=> length ($a) || $a cmp $b
    } map { quotemeta ($_) } @$keys;
    # One alternative per line; the generated regex is compiled with
    # the "x" flag, under which the unescaped newlines are ignored.
    my $alternatives = join ('|', map ("$_\n", @inputs));
    my $regex_pm = "$dir/Regex.pm";
    my $reout = <<EOF;
# This module was generated by $0
package Unicode::Confuse::Regex;
use warnings;
use strict;
use utf8;
our \$VERSION = $version;
# Some people, when faced with a problem, think "I know, I'll use a
# regular expression". Now they have two problems.
our \$re = qr!
$alternatives!x;
1;
EOF
    # A previous run leaves the file read-only, so make it writable
    # before overwriting it.
    if (-f $regex_pm) {
        chmod 0644, $regex_pm
            or warn "Could not make $regex_pm writable: $!";
    }
    write_text ($regex_pm, $reout);
    # Mark the generated file read-only.
    chmod 0444, $regex_pm
        or warn "Could not make $regex_pm read-only: $!";
}