-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfreq_words_exclusive.py
77 lines (68 loc) · 1.95 KB
/
freq_words_exclusive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from collections import Counter
import counter_stats
import sys
# Command line: LEMMAS.frequency COUNT REQUIRED.frequency
# Report a missing argument up front instead of failing later with an
# IndexError traceback.
if len(sys.argv) < 4:
    print(
        'usage: {} LEMMAS.frequency COUNT REQUIRED.frequency'.format(
            sys.argv[0]),
        file=sys.stderr,
    )
    sys.exit(2)

# Limit on the number of new words proposed; it is compared against
# len(add) in the selection loop further down.
try:
    COUNT = float(sys.argv[2])
except ValueError:
    sys.exit('COUNT must be a number, got {!r}'.format(sys.argv[2]))

# Known words, one per line.  Splitting on '\n' leaves an empty string in
# the set when the file ends with a newline.
with open('words.txt') as infile:
    known = set(infile.read().split('\n'))

lemma_file = sys.argv[1]
if not lemma_file.endswith('.frequency'):
    print('Please use frequency file instead')
    # Non-zero status so a calling shell or script can see the rejection.
    sys.exit(1)
print(lemma_file)
# Total count per lemma, read from the tab-separated frequency file, plus
# the index of the last line on which each lemma appeared.
lemmas = Counter()
order = {}
with open(lemma_file) as infile:
    for line_no, row in enumerate(infile):
        fields = row.split('\t')
        # Only rows of the form count<TAB>lemma<TAB>sentence are used;
        # anything else is skipped silently.
        if len(fields) == 3:
            freq, lemma, sentence = fields
            lemmas[lemma] += int(freq)
            order[lemma] = line_no
# Second frequency file: only lemmas that occur in it may be proposed.
req_file = sys.argv[3]
if not req_file.endswith('.frequency'):
    print('Please use frequency file instead')
    # Non-zero status so a calling shell or script can see the rejection.
    sys.exit(1)

choices = set()    # lemmas present in the required file
sentences = dict()  # lemma -> sentence column of its last row
with open(req_file) as infile:
    for line in infile:
        parts = line.split('\t')
        # Skip rows that are not count<TAB>lemma<TAB>sentence.
        if len(parts) != 3:
            continue
        count, lemma, sentence = parts
        choices.add(lemma)
        # NOTE: the line is not stripped, so the stored sentence keeps its
        # trailing newline.
        sentences[lemma] = sentence
# Print/compute the frequency tables via the project helper module.
counter_stats.full_table(lemmas)
table = counter_stats.cumulative_table(lemmas)

# Sum the third column of every row whose word is already known.
# NOTE(review): presumably that column is the word's share of all
# occurrences -- confirm against counter_stats.cumulative_table.
coverage = 0
for word, _, share, _ in table:
    if word in known:
        coverage += share

# Number of distinct lemmas that are already known.
count = len([entry for entry in lemmas if entry in known])
print(count, 'known words provide {:.3f} already'.format(coverage))
# Walk the table in order and collect lemmas that are not yet known but do
# occur in the required file.  Each time the count column drops, print a
# "(count), +n" summary for the group of words just collected.
#
# last_count starts as None and is seeded by the first accepted row, so an
# empty table no longer raises IndexError on table[0].
last_count, number = None, 0
add = []
for w, c, p, _ in table:
    # Stop once COUNT words are collected.  The check runs before the
    # append, so it must be >= to avoid collecting COUNT + 1 words.
    if len(add) >= COUNT:
        break
    if w in known or w not in choices:
        continue
    if last_count is None or c < last_count:
        # A new count group begins: flush the summary of the previous one.
        if number:
            print('({}), +{}'.format(last_count, number))
        last_count = c
        number = 0
    add.append(w)
    number += 1
    # Running coverage including the proposed words; nothing below this
    # loop reads it.
    coverage += p
# Flush the summary of the final group.
if number:
    print('({}), +{}'.format(last_count, number))
# Show the proposed words and, on confirmation, append them to the
# known-words file.
if add:
    print(len(add))
    print(*add, sep='\n')
    answer = input('Add these words? ')
    # Any reply starting with 'y' or 'Y' counts as a yes; an empty reply
    # is a no.
    if answer.lower().startswith('y'):
        with open('words.txt', 'a') as outfile:
            outfile.writelines('{}\n'.format(lemma) for lemma in add)