-
Notifications
You must be signed in to change notification settings - Fork 29
/
ngrams.py
executable file
·146 lines (132 loc) · 4.36 KB
/
ngrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/python
import collections
import data
from numpy import *
from operator import itemgetter
from scipy.sparse import lil_matrix, csr_matrix
def words(s):
    """
    Split a string into word tokens, punctuation tokens and negation-tagged
    words.

    Runs of alphanumeric characters and underscores form a word.  Every
    other character ends the current word.  Punctuation listed in the
    punctuation map is additionally emitted as its own upper-case token
    (e.g. ',' -> "COMMA"), placed *after* the word it terminates so that
    token order follows the text.

    Once a word from the negation set is seen (terminated by whitespace),
    every following word is suffixed with "_NOT" until a mapped punctuation
    character closes the negation scope.  The word directly in front of that
    punctuation, and the last word of the string, are still inside the scope
    and are tagged as well.

    Matching is case-sensitive; callers are expected to lower-case first.

    Returns the list of tokens in text order (empty list for empty input).
    """
    # NOTE(review): the apostrophe is itself a separator (and is mapped to
    # SINGLEQUOTE), so "isn't" / "doesn't" are split into pieces before they
    # can ever match these entries -- only "not" triggers negation today.
    negations = frozenset(["not", "isn't", "doesn't"])
    punctuation_map = {',': "COMMA", '.': "PERIOD", ':': "COLON",
                       ';': "SEMI", '\'': "SINGLEQUOTE",
                       '"': "DOUBLEQUOTE", '?': "QUESTION"}

    tokens = []
    current = ""
    not_mode = False

    for ch in s:
        if ch.isalnum() or ch == '_':
            current += ch
            continue

        # Any other character terminates the pending word.  Flush it first,
        # while the negation state that applied to it is still in effect.
        if current:
            tokens.append((current + "_NOT") if not_mode else current)
            # Only a whitespace-terminated negation word opens a scope.
            if ch.isspace() and current in negations:
                not_mode = True
            current = ""

        # Mapped punctuation becomes a token and closes the negation scope.
        if ch in punctuation_map:
            tokens.append(punctuation_map[ch])
            not_mode = False

    # Flush the trailing word; it is still subject to an open negation.
    if current:
        tokens.append((current + "_NOT") if not_mode else current)
    return tokens
def ngrams(n, s):
    """
    Count the n-grams of a string.

    The string is lower-cased and tokenised with words(); each window of n
    consecutive tokens, joined by single spaces, is counted.

    n -- window size in tokens
    s -- text to analyse

    Returns a data.DefDict (constructed with 0, presumably the default
    count -- confirm against data.DefDict) mapping gram -> occurrences.
    Text with fewer than n tokens yields one shorter gram made of all its
    tokens; text with no tokens at all yields an empty counter rather than
    a bogus "" gram.
    """
    tokens = words(s.lower())
    grams = data.DefDict(0)
    if not tokens:
        # Nothing to count; without this guard the final increment below
        # would record the empty string as a gram.
        return grams

    window = collections.deque(tokens[:n])
    for pos in range(n, len(tokens)):
        grams[" ".join(window)] += 1
        # Slide the window one token to the right.
        window.popleft()
        window.append(tokens[pos])
    # The last window is complete but has not been counted by the loop.
    grams[" ".join(window)] += 1
    return grams
def ngrams_range(ns, s):
    """
    Build one plain dict holding the n-gram counts of s for every size in ns.

    Each size is counted independently with ngrams() and merged with
    dict.update, so if two sizes produce the same key the later size's
    count replaces the earlier one.
    """
    merged = {}
    for size in ns:
        counts = ngrams(size, s)
        merged.update(counts)
    return merged
def ngrams_to_dictionary(grams):
    """
    Return the set of every distinct gram that occurs in any of the given
    gram-count mappings.

    grams -- list of mappings (gram -> count); only the keys are used

    Despite the name, the result is a set, not a dict.  An empty list gives
    an empty set.
    """
    # set.union accepts any number of iterables; iterating a mapping yields
    # its keys.  This replaces the deprecated apply() builtin.
    return set().union(*grams)
def ngrams_to_matrix(grams, classes, return_gramsdict=False):
    """
    Maps from list of raw gram frequencies and labels to a numerical matrix
    of feature vectors, with option to also return the gramsdict for future
    use

    grams            -- list of mappings (gram -> count), one per document
    classes          -- labels, paired with grams position by position
    return_gramsdict -- when true, also return the gram -> row-index mapping

    One feature vector is built per document with grams_to_featurevector();
    the vectors are stacked and transposed, so each document is a column of
    the matrix handed to data.Data.  Progress markers and the matrix shape
    are printed to stdout.

    Returns the data.Data object, or a (data.Data, gramsdict) tuple when
    return_gramsdict is true.
    """
    print("Entering ngrams_to_matrix")
    # Vocabulary = union of the keys of every document.  The index order is
    # whatever order the set happens to iterate in.
    allgrams = list(set().union(*grams))
    print("> Listed")
    print("> []")
    allgramsdict = dict((gram, index) for index, gram in enumerate(allgrams))

    vecs = [grams_to_featurevector(allgramsdict, g, c)
            for g, c in zip(grams, classes)]

    # Stack once and reuse for both the shape report and the result.
    matrix = vstack(vecs).T
    print(matrix.shape)
    ret = data.Data(matrix)
    if return_gramsdict:
        return (ret, allgramsdict)
    return ret
def collapse_ngrams(grams):
    """
    Collapse a list of dict of grams into a single dict

    grams -- iterable of mappings (gram -> count)

    Counts for a gram that appears in several mappings are summed.  Returns
    a new plain dict; the inputs are not modified.
    """
    collapsed = {}
    for gram in grams:
        # .items() exists on both Python 2 and 3 dicts (iteritems does not).
        for key, value in gram.items():
            if key in collapsed:
                collapsed[key] += value
            else:
                collapsed[key] = value
    return collapsed
def top_ngrams(grams, limit=0):
    """
    Keep only the `limit` most frequent grams.

    grams -- mapping gram -> count
    limit -- number of entries to keep; 0 (the default) means "no limit"

    With limit == 0 the input object itself is returned unchanged.
    Otherwise a new plain dict is returned holding the entries with the
    highest counts (all of them if there are fewer than `limit`).
    """
    if limit == 0:
        return grams
    # .items() exists on both Python 2 and 3 dicts (iteritems does not).
    ranked = sorted(grams.items(), key=itemgetter(1), reverse=True)
    return dict(ranked[:limit])
def grams_to_featurevector(gramsdict, grams, label=None):
    """
    Maps from gram frequencies and label to numerical feature vector according
    to some mapping, generated within or from ngrams_to_matrix()

    gramsdict -- mapping gram -> position in the vector
    grams     -- mapping gram -> count for one document
    label     -- optional class label, stored in one extra trailing slot

    Returns a uint16 numpy vector of length len(gramsdict), or
    len(gramsdict) + 1 when a label is given.  Grams of the document that
    are not present in gramsdict are ignored, so a mapping built from one
    set of documents can be reused on new documents.
    """
    # NOTE(review): the label test is a truthiness test, so a label of 0 is
    # handled exactly like "no label" (no trailing slot) -- confirm that 0
    # is never used as a real class.
    has_label = bool(label)

    # NOTE(review): every slot starts at 1 and slots of observed grams are
    # overwritten with the raw count, so a gram seen once is
    # indistinguishable from one not seen -- confirm this is intended.
    vec = ones(len(gramsdict) + (1 if has_label else 0), dtype=uint16)
    if has_label:
        vec[-1] = label

    for gram in grams:
        index = gramsdict.get(gram)
        if index is None:
            # Out-of-vocabulary gram: there is no slot for it.
            continue
        vec[index] = grams[gram]
    return vec
def ngrams_to_idf(ngrams):
    """
    Derive a per-gram weight from how many documents contain each gram.

    ngrams -- list of mappings (gram -> count), one per document; only the
              keys are used

    For each gram the tally starts at 1 and is incremented once per
    document containing it, so it equals (documents containing gram) + 1.
    The weight is N / log(tally), with N the number of documents.

    Returns a data.DefDict built from N and the weight dict (presumably N
    acts as the default for grams never seen -- confirm against
    data.DefDict).
    """
    total_docs = float(len(ngrams))

    docfreq = {}
    for doc in ngrams:
        for gram in doc.keys():
            # Seed at 1 on first sight, then count this document.
            docfreq[gram] = docfreq.get(gram, 1) + 1

    weights = {}
    for gram in docfreq:
        weights[gram] = total_docs / log(float(docfreq[gram]))
    return data.DefDict(total_docs, weights)
if __name__ == "__main__":
    # Small demonstration: trigram counts for a sample text, unigram counts
    # for three short documents, and the resulting feature matrix.
    print("Trigram example: %s" % ngrams(3, "Now is the time for all good men to not come to the aid of their party! Now is the time for all bad women to leave the aid of their country? This, being war, is bad"))
    g1 = ngrams(1, "Hello how are you")
    g2 = ngrams(1, "Well, are you feeling well")
    g3 = ngrams(1, "Well hello there hello")
    print("Unigram example: %s" % g3)
    # Bind the result to `matrix`, not `data`: assigning to `data` here
    # would replace the imported `data` module at module scope and break
    # any later call that needs data.DefDict / data.Data.
    (matrix, gramsdict) = ngrams_to_matrix([g1, g2, g3], [1, 2, 1], return_gramsdict=True)
    print("Matrix example: %s" % matrix.asMatrix())
    print("Grams dict: %s" % gramsdict)