-
Notifications
You must be signed in to change notification settings - Fork 0
/
names.py
executable file
·63 lines (53 loc) · 1.77 KB
/
names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import nltk
import string
from collections import Counter
from collections import defaultdict as dd
# Tokens treated as pure punctuation: every ASCII punctuation character plus
# the two-character quote tokens "``" and "''".
PUNCS = [*string.punctuation, "``", "''"]
# Table for str.translate() that deletes every ASCII punctuation character.
NO_PUNC = str.maketrans(dict.fromkeys(string.punctuation))
# read bible data
# newline='' hands line-ending handling to the csv module, so a newline
# embedded inside a quoted field is parsed as part of that field.
with open('bibid.csv', 'r', encoding='utf8', newline='') as infile:
    reader = csv.DictReader(infile)
    # One dict per CSV row, keyed by the header line; materialised while the
    # file is still open.
    verses = list(reader)
# tokenize
# Marker that some verse texts start with; it is dropped before tokenizing.
TAB_MARKER = '<t />'
# Frequency of every punctuation-stripped token across all verses.
tc = Counter()
for v in verses:
    text = v['text']
    # remove tab
    if text.startswith(TAB_MARKER):
        text = text[len(TAB_MARKER):]
    # tokenize
    tokens = nltk.word_tokenize(text)
    # The raw tokens are stored on the verse row for later processing.
    v['tokens'] = tokens
    # Count tokens with ASCII punctuation removed. Anything that strips down
    # to the empty string (single punctuation marks, "``", "''", but also
    # multi-character tokens such as "--" or "...") is skipped, so the
    # counter never gains a bogus '' entry.
    stripped = (tk.translate(NO_PUNC) for tk in tokens)
    tc.update(word for word in stripped if word)
# Every distinct stripped token seen in the corpus.
lexicon = set(tc)
# --------------------
# look for names
# Both dicts map a punctuation-stripped, capitalised token to the set of
# indices (into `verses`) of the verses it occurs in.
#   names       - capitalised somewhere capitalisation is not otherwise expected
#   maybe_names - capitalised where that proves nothing (see `ambiguous` below)
# NOTE: `lexicon` is not consulted here; classification is purely positional.
names = dd(set)
maybe_names = dd(set)
for vid, v in enumerate(verses):
    verse_tokens = v['tokens']
    for tid, raw in enumerate(verse_tokens):
        token = raw.translate(NO_PUNC)  # remove punc
        # Skip tokens that were only punctuation or do not start uppercase.
        if not token or not token[0].isupper():
            continue
        # Capitalisation is uninformative when the token is entirely
        # uppercase, is the first token of the verse, directly follows an
        # opening-quote token, or follows a token ending in . ! ' or ?
        # (`tid == 0` is tested first so `tid - 1` never wraps around).
        ambiguous = (
            token == token.upper()
            or tid == 0
            or verse_tokens[tid - 1] == '``'
            or verse_tokens[tid - 1][-1] in '.!\'?'
        )
        bucket = maybe_names if ambiguous else names
        bucket[token].add(vid)
print("Found names: {}".format(len(names)))
print("May be: {}".format(len(maybe_names)))
# Alphabetically sorted list of the tokens classified as names.
name_list = sorted(names)
# write to file
name_file = "name_freq.csv"
# newline='' stops the text layer from translating the csv writer's own
# '\r\n' terminator (which would become '\r\r\n' on Windows); the explicit
# encoding matches the UTF-8 input instead of depending on the locale.
with open(name_file, "w", encoding="utf8", newline="") as outfile:
    writer = csv.writer(outfile, dialect='excel-tab', quoting=csv.QUOTE_MINIMAL)
    # One tab-separated (word, count) row per name, most frequent first.
    writer.writerows(
        (word, freq) for word, freq in tc.most_common() if word in names
    )
print("Donezo! Namelist has been written to {}".format(name_file))