# freq_words.py
from collections import Counter
import counter_stats
import sys
# Suggest unknown lemmas to learn, most frequent first, until the known
# vocabulary covers a target share of a ".frequency" file.
#
# Usage: freq_words.py LEMMA_FILE.frequency [MIN_RECOGNITION]

# Default coverage target; despite the name it is compared against a running
# sum of per-word shares, so it is a fraction (0.85), not a percentage.
MIN_RECOGNITION_PERCENT = 0.85

# File holding the already-known words, one per line.
KNOWN_WORDS_FILE = 'words.txt'


def load_known_words(path):
    """Return the set of known words stored one per line in *path*.

    Empty lines (including the one produced by a trailing newline) are
    dropped so that an empty lemma is never treated as "known".
    """
    with open(path) as infile:
        return {word for word in infile.read().split('\n') if word}


def read_lemma_counts(path):
    """Return a Counter mapping each lemma in *path* to its total count.

    Each useful line has exactly three tab-separated fields:
    ``count<TAB>lemma<TAB>sentence``. Lines with a different number of
    fields are skipped. Counts for a lemma appearing on several lines are
    summed.

    Raises:
        ValueError: if the count field of a three-field line is not an
            integer.
    """
    lemmas = Counter()
    with open(path) as infile:
        for line in infile:
            parts = line.split('\t')
            if len(parts) != 3:
                continue
            occurrences, lemma, _sentence = parts
            lemmas[lemma] += int(occurrences)
    return lemmas


def select_new_words(table, known, coverage, threshold):
    """Pick unknown words from *table* until *coverage* exceeds *threshold*.

    Args:
        table: sequence of 4-item rows unpacked as
            ``(word, count, share, _)``. NOTE(review): rows are presumably
            sorted by descending count and ``share`` is presumably the
            word's fraction of all occurrences -- confirm against
            ``counter_stats.cumulative_table``.
        known: container of words that are skipped.
        coverage: share already provided by the known words.
        threshold: selection stops once coverage is strictly greater.

    Returns:
        ``(words, groups)`` where ``words`` is the list of selected words
        in table order and ``groups`` is a list of ``(count, how_many)``
        pairs summarising how many selected words share each count.
    """
    words = []
    groups = []
    if not table:
        # Nothing to rank; avoids indexing into an empty table below.
        return words, groups

    last_count, number = table[0][1], 0
    for word, occurrences, share, _ in table:
        if coverage > threshold:
            break
        if word in known:
            continue
        if occurrences < last_count:
            # A new (lower) frequency bucket starts: close the previous one.
            if number:
                groups.append((last_count, number))
            last_count = occurrences
            number = 0
        words.append(word)
        number += 1
        coverage += share
    if number:
        groups.append((last_count, number))
    return words, groups


def main(argv=None):
    """Run the interactive command-line workflow.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            ``.frequency`` file, optional ``argv[2]`` overrides the
            coverage threshold.

    Exits with a non-zero status and a message on stderr when the
    frequency file argument is missing or has the wrong extension.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('Usage: {} LEMMA_FILE.frequency [MIN_RECOGNITION]'.format(argv[0]))
    threshold = float(argv[2]) if len(argv) > 2 else MIN_RECOGNITION_PERCENT

    lemma_file = argv[1]
    if not lemma_file.endswith('.frequency'):
        # sys.exit(str) prints to stderr and exits with status 1.
        sys.exit('Please use frequency file instead')
    print(lemma_file)

    known = load_known_words(KNOWN_WORDS_FILE)
    lemmas = read_lemma_counts(lemma_file)

    counter_stats.full_table(lemmas)
    table = counter_stats.cumulative_table(lemmas)
    coverage = sum(share for word, _, share, _ in table if word in known)
    known_count = sum(1 for lemma in lemmas if lemma in known)
    print(known_count, 'known words provide {:.3f} already'.format(coverage))

    add, groups = select_new_words(table, known, coverage, threshold)
    for group_count, number in groups:
        print('({}), +{}'.format(group_count, number))

    if not add:
        return
    print(len(add))
    for word in add:
        print(word)
    answer = input('Add these words? ')
    if answer.lower().startswith('y'):
        with open(KNOWN_WORDS_FILE, 'a') as outfile:
            outfile.writelines('{}\n'.format(lemma) for lemma in add)


if __name__ == '__main__':
    main()