-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_coca_corpus.py
55 lines (41 loc) · 1.47 KB
/
extract_coca_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import io
import json
import numpy as np
import pandas as pd
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps`` so that
    values coming out of NumPy or pandas computations can be serialized.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: A value the standard encoder could not serialize.

        Returns:
            ``bool`` for NumPy booleans, ``int`` for NumPy integers,
            ``float`` for NumPy floats, or a (nested) list for an ndarray.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        # np.bool_ is not a subclass of np.integer, so without its own branch
        # a NumPy boolean would fall through to the base class and raise.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Anything else is delegated so the usual TypeError is raised.
        return super(NpEncoder, self).default(obj)
def buildCocaDicts(filename):
    """Build lemma-frequency dictionaries from a tab-separated corpus table.

    The file is read with ``pandas.read_table`` and must provide the columns
    ``w1``, ``L1``, ``c1``, ``coca`` and ``tcoca``.  Only ``L1`` (the grouping
    key, treated as the lemma), ``c1`` (the part-of-speech tag) and ``coca``
    (the value that is summed) take part in the computation.

    Args:
        filename: Path (or anything ``pandas.read_table`` accepts) of the
            table to read.

    Returns:
        A dict with four entries:

        * ``'total_docs'`` - the largest per-lemma ``coca`` sum.
          NOTE(review): the key name suggests a document count, but the code
          takes the maximum lemma total - confirm this is intended.
        * ``'all_lemmas'`` - ``{lemma: coca sum}`` across every word class.
        * ``'nouns'`` - ``{lemma: coca sum}`` for rows whose tag starts
          with ``'n'``.
        * ``'verbs'`` - ``{lemma: coca sum}`` for rows whose tag starts
          with ``'v'``.

        ``'nouns'`` / ``'verbs'`` are empty dicts when the table contains no
        row of that word class.
    """
    known_classes = ('n', 'v', 'j', 'r', 'm')

    def fix_word_class(tag):
        # A blank ``c1`` cell is parsed as NaN (a float), which cannot be
        # sliced; treat any non-string tag as the catch-all class.
        if not isinstance(tag, str):
            return 'x'
        initial = tag[:1]
        return initial if initial in known_classes else 'x'

    def counts_for(pivot, word_class):
        # ``unstack`` only creates a column for classes that actually occur,
        # so a table without e.g. verbs has no 'v' column to index.
        if word_class not in pivot.columns:
            return {}
        # Lemmas never seen with this class are NaN in the pivot; drop them.
        return pivot[word_class].dropna().to_dict()

    table = pd.read_table(
        filename,
        usecols=['w1', 'L1', 'c1', 'coca', 'tcoca'],
    )

    # Translate the old part of speech labels
    table['pos'] = table['c1'].apply(fix_word_class)

    # Create a pivot of lemma x part of speech
    reduced = table.groupby(['pos', 'L1'])['coca'].sum().unstack(0)

    # Create a summed series of lemma (part of speech agnostic)
    Z = table.groupby('L1')['coca'].sum()

    return {
        'total_docs': Z.max(),
        'all_lemmas': Z.to_dict(),
        'nouns': counts_for(reduced, 'n'),
        'verbs': counts_for(reduced, 'v'),
    }
# -----------------------------------------------------------------------------
# Main
# Runs on import/execution: parse the corpus table, then serialize the
# resulting dictionaries to a JSON file in the current working directory.
print('Reading Corpus of Contemporary American English...')
coca = buildCocaDicts('../b240.txt')

print('Writing Results...')
# NpEncoder converts the NumPy values produced by pandas into plain JSON types.
with io.open('corpus.json', mode='w', encoding='utf-8') as out_file:
    json.dump(coca, out_file, cls=NpEncoder)