-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtrue-statistics
executable file
·47 lines (40 loc) · 1.59 KB
/
true-statistics
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/python
from meta import read_info_json
from eval import load_ground_truths
from language import TRAINING_CORPUS, load_corpus
from colour import print_GREEN, print_RED, GREY, YELLOW, C_NORMAL
from collections import Counter
def _print_problem_stats(truth):
    """Print document, link and cluster-size counts for one problem.

    `truth` is the ground-truth record of a single problem. It is indexed
    with the keys 'documents', 'ranking' and 'clustering'; the last one is
    iterated and every cluster in it must support len().
    """
    # Single-argument parenthesised prints behave identically under the
    # Python 2 print statement and the Python 3 print function.
    print("%d documents" % len(truth['documents']))
    print("%d links" % len(truth['ranking']))
    # Maps cluster size -> number of clusters having that size.
    size_counts = Counter(len(cluster) for cluster in truth['clustering'])
    multi = sum(count for size, count in size_counts.items() if size > 1)
    print("%d clusters > 1" % multi)
    for size, count in sorted(size_counts.items(), reverse=True):
        print("%2d clusters of size %d" % (count, size))
    # Blank separator line between problems.
    print("")


def _print_length_stats(lang, texts, step=1000):
    """Print summary statistics and a histogram of text lengths.

    `texts` is a mapping whose values support len() (what a "length"
    measures -- characters, bytes, tokens -- depends on load_corpus and is
    not visible here).  `step` is the histogram bucket width.

    Each histogram row is labelled with the exclusive upper bound of its
    bucket and covers low <= length < high, so every text lands in exactly
    one row, including lengths that fall exactly on a bucket boundary.
    """
    n = len(texts)
    print_RED("%s: %d texts" % (lang, n))
    if not n:
        # Nothing to summarise; min/median/mean are undefined for an
        # empty corpus and would otherwise raise.
        return

    lengths = sorted(len(text) for text in texts.values())
    total = sum(lengths)
    # Floor division keeps the index and the mean integral on every
    # Python version (upper median for an even number of texts).
    print("min %d median %d max %d" % (lengths[0],
                                       lengths[n // 2],
                                       lengths[-1]))
    print("mean %d total %d" % (total // n, total))

    # One pass over the lengths: bucket index -> number of texts.
    buckets = Counter(length // step for length in lengths)
    for index in range(lengths[-1] // step + 1):
        high = (index + 1) * step
        print("%6d %s%s%s" % (high, YELLOW, '#' * buckets[index], C_NORMAL))


def main():
    """Print ground-truth and text-length statistics for the training corpus.

    First, for every (problem id, language, genre) entry returned by
    read_info_json, prints the number of documents, links and clusters
    recorded in that problem's ground truth.  Then, for every language
    seen, loads the corpus texts and prints min/median/max/mean/total
    length followed by a length histogram.

    Raises KeyError if a problem id has no entry in the loaded ground
    truths.
    """
    info = read_info_json(TRAINING_CORPUS)
    truths = load_ground_truths("%s/truth" % TRAINING_CORPUS)
    langs = set()
    for pid, lang, _genre in info:
        langs.add(lang)
        print_GREEN("%s (%s)" % (pid, lang))
        _print_problem_stats(truths[pid])

    for lang in langs:
        texts, _problems = load_corpus(TRAINING_CORPUS, lang)
        _print_length_stats(lang, texts)
# Run the report only when this file is executed as a script, so that
# loading it from other code does not trigger the corpus scan.
if __name__ == "__main__":
    main()