forked from MolecularOdorRecognition/PerceptAnalyze
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnewscript.py
121 lines (105 loc) · 3.6 KB
/
newscript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import urllib.request
import gzip
import csv
import json
from bson.objectid import ObjectId
import os
import logging
logging.basicConfig(filename="log.out", filemode='w', level=logging.INFO)
logger = logging.getLogger(__name__)
try:
file = open('word.csv', 'r', encoding="utf8") # Open file
read = csv.reader(file, delimiter=',')
except OSError as err:
logger.error("OS error: {0}".format(err))
except IOError as io:
logger.error("IO Erroe", io)
f = open('newjson.txt', 'w') # psuedo code to create file
f.close()
xp = [] # list of words to download
yp = [] # list of normal words
zp = [] # list of stem words
# Creating 2 lists
for row in read:
# print(row)
row[0] = row[0].strip()
# print(row)
if row[0][-1] == '*':
zp.extend(row) # Separating * words
else:
yp.extend(row)
# Creating a list to download
for z in yp:
z = z.strip()
if z[:2] not in xp:
xp.append(z[:2])
xp.sort()
# xp = ['ta']
print(xp)
# Class Encoder to dump dictionary in json
class Encoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, ObjectId):
return str(obj)
else:
return obj
# search function for matching stem words
def searchf(y, zp):
for x in zp:
if y[:len(x)-1] == x[:-1]:
return True
return False
for i in range(len(xp)):
target = open('newjson.txt', 'r')
logger.info('downloading '+xp[i])
try:
urllib.request.urlretrieve("http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-2gram-20120701-"+xp[i]+".gz", xp[i]+".gz")
except:
logger.error("Error Downloading "+xp[i])
continue
search = {}
# Reading search from newjson.txt
tar = target.read()
if len(tar) > 0:
search = json.loads(tar)
target.close()
for row in yp:
if row not in search.keys():
search[row] = {}
for row in zp:
if row[:-1] not in search.keys():
search[row[:-1]] = {}
logger.info('Opening '+xp[i])
# Opening .gz file and searching for words in yp or zp
with gzip.open(xp[i]+'.gz', 'rt', encoding='utf-8') as f:
reader = csv.reader(f, delimiter='\t')
for row in reader:
x = str(row[0])
y, z = x.split(' ')
if y.lower() in yp:
if z.lower() in yp or searchf(z.lower(), zp):
if z.lower() not in search[y.lower()].keys():
search[y.lower()][z.lower()] = 1
else:
search[y.lower()][z.lower()] += 1
else:
if searchf(y.lower(), zp):
if y.lower() not in search.keys():
# print(type(y), type(search))
'''if '_' in y.lower():
d, e = y.split("_")
y = d.lower()'''
search[y.lower()] = {}
logger.info('new key '+y.lower())
yp.append(y.lower())
if z.lower() in yp or searchf(z.lower(), zp):
search[y.lower()][z.lower()] = 1
target = open('newjson.txt', 'w')
jso = json.dumps(search, cls=Encoder) # dumping search
target.truncate()
target.write(jso) # writing search
target.close()
f.close()
logger.info('Removing '+xp[i])
os.remove(xp[i]+".gz")
file.close()