-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgrapheme-stats.py
64 lines (53 loc) · 2.19 KB
/
grapheme-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import argparse
import collections
import csv
import unicodedata

import grapheme
# Column order of the generated CSV summary.
COLUMNS = ['grapheme', 'count', 'number of codepoints', 'codepoint names']


def parse_args(argv=None):
    """Parse the command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``infile``, ``inencoding`` and
        ``outencoding`` attributes. Both encodings default to ``None``,
        which makes ``open`` fall back to the platform default.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("infile",
                        help="the name of the input file",
                        type=str)
    parser.add_argument("-inencoding",
                        help="the encoding of the input file",
                        type=str,
                        default=None)
    parser.add_argument("-outencoding",
                        help="the encoding of the output file",
                        type=str,
                        default=None)
    return parser.parse_args(argv)


def count_graphemes(text):
    """Count how often each grapheme cluster occurs in *text*.

    Returns:
        A ``collections.Counter`` keyed by grapheme cluster. Keys keep the
        order in which the clusters were first seen.
    """
    return collections.Counter(grapheme.graphemes(text))


def codepoint_names(graph):
    """Look up the Unicode name of every codepoint in *graph*.

    Returns:
        A ``(names, unnamed)`` tuple: the names that were found, in order,
        and the codepoints for which the Unicode database has no name
        (for example control characters such as a newline).
    """
    names = []
    unnamed = []
    for codepoint in graph:
        # Passing a default avoids the ValueError raised for nameless
        # codepoints, so no try/except is needed here.
        name = unicodedata.name(codepoint, None)
        if name is None:
            unnamed.append(codepoint)
        else:
            names.append(name)
    return names, unnamed


def build_row(graph, count):
    """Build the CSV record for one grapheme.

    Args:
        graph: The grapheme cluster (a string of one or more codepoints).
        count: How often the grapheme occurred in the input.

    Returns:
        A dict keyed by ``COLUMNS``, or ``None`` when none of the
        grapheme's codepoints has a Unicode name; such graphemes are left
        out of the summary.
    """
    names, unnamed = codepoint_names(graph)
    if unnamed and not names:
        return None
    if unnamed or not names:
        # Only part of the grapheme could be named (or it was empty):
        # still emit the row, but tell the user about it.
        print("Something could be wrong!")
    return {
        'grapheme': graph,
        'count': count,
        # Count every codepoint of the grapheme, not just the named ones.
        'number of codepoints': len(graph),
        'codepoint names': "; ".join(names),
    }


def main(argv=None):
    """Read the input file and write ``<infile>.csv`` with grapheme statistics."""
    args = parse_args(argv)

    # count the grapheme occurrences
    with open(args.infile, "r", encoding=args.inencoding) as infile:
        grapheme_counts = count_graphemes(infile.read())

    # write the summary; newline='' lets the csv module control line endings
    with open(args.infile + '.csv', 'w', newline='',
              encoding=args.outencoding) as csvfile:
        statistics_file = csv.DictWriter(csvfile, fieldnames=COLUMNS)
        statistics_file.writeheader()
        for graph, count in grapheme_counts.items():
            entry = build_row(graph, count)
            if entry is not None:
                statistics_file.writerow(entry)


if __name__ == "__main__":
    main()