#!/usr/bin/env python
# Create a bijection between base32 nmers and LANGUAGE.
from string import ascii_lowercase
from itertools import product
from scipy.spatial.distance import pdist, squareform
import editdistance
import numpy as np
import sys
import re
import gzip

NMER_SIZE = 3

# --- loading and prep --- #
def get_base32():
    """
    Generate nmers of base32 characters.
    """
    alphabet = ascii_lowercase
    base32_chars = alphabet + '234567'
    base32_nmers = [''.join(x) for x in product(base32_chars, repeat=NMER_SIZE)]
    return base32_nmers
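
# Usage sketch: with NMER_SIZE = 3 there are 32**3 = 32768 nmers, which is
# why subset_language below defaults to N=32768.
#   nmers = get_base32()
#   assert len(nmers) == 32**NMER_SIZE
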
def subset_language(vocabulary, vectors, wordlist, N=32768):
    """
    Subset the vocabulary/vectors to those in a wordlist.
    The wordlist is a list arranged in order of 'preference'.
    Note: we hope the vocabulary is contained in the wordlist,
    but it might not be. N is the number of words we require.
    If the wordlist contains fewer than N words (but the vocabulary
    has >= N), we supplement the result from the vocabulary randomly.
    Also, we want to make sure the order of the vocabulary is random
    (because some structure could negatively influence the optimisation
    procedure later).
    """
    keep_indices = []       # indices of vocabulary/vectors to keep
    added = 0
    if isinstance(wordlist, str):
        # load from path
        print('Loading wordlist from', wordlist)
        wordlist = np.loadtxt(wordlist, dtype=str)
    else:
        assert isinstance(wordlist, (list, np.ndarray))
    print('Subsetting vocabulary.')
    # dict lookup is O(1) per word, versus O(W) for list.index
    vocab_index = {word: i for (i, word) in enumerate(vocabulary)}
    for word in wordlist:
        if added == N:
            break
        try:
            keep_indices.append(vocab_index[word])
            added += 1
        except KeyError:
            continue
    print('Acquired', len(keep_indices), 'words.')
    miss = N - len(keep_indices)
    if miss > 0:
        print('Supplementing with', miss, 'random words.')
        for i in range(miss):
            # draw scalar indices (np.random.choice(n, 1) returns an array)
            random_index = int(np.random.choice(len(vocabulary)))
            while random_index in keep_indices:
                random_index = int(np.random.choice(len(vocabulary)))
            keep_indices.append(random_index)
    print('Shuffling.')
    np.random.shuffle(keep_indices)
    # populate new arrays
    print('Populating subsetted arrays.')
    vectors_subset = np.array([vectors[i] for i in keep_indices])
    vocabulary_subset = [vocabulary[i] for i in keep_indices]
    return vocabulary_subset, vectors_subset
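
# Example call (sketch; 'wordlist.txt' is a hypothetical path with one word
# per line, most-preferred first):
#   vocab, vecs = get_language('vectors.txt')
#   vocab, vecs = subset_language(vocab, vecs, 'wordlist.txt', N=32768)
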
def get_language(path):
    """
    Get the LANGUAGE words, and vectors!
    Takes a path to a file like
        apple 0.3410 0.24 0.4114
        orange 0.613 3.414 0.512
    Outputs a list like [apple, orange]
    and a np array like [[0.3410, 0.24, 0.4114],
                         [0.613, 3.414, 0.512]]
    """
    print('Loading language from', path)
    vocabulary = []
    vectors = []
    if path.endswith('.gz'):
        # text mode, so we iterate over str lines rather than bytes
        fi = gzip.open(path, 'rt')
    else:
        fi = open(path, 'r')
    for line in fi:
        if '\t' in line:
            sl = line.strip('\n').split('\t')
        else:
            sl = line.strip('\n').split(' ')
        word = re.sub('\x00', '', sl[0])
        vocabulary.append(word)
        if len(sl) > 1:
            vector = [float(x) for x in sl[1:]]
        else:
            # no vector provided: fall back on a random one
            vector = np.random.normal(size=5)
        vectors.append(vector)
    fi.close()
    vectors = np.array(vectors)
    W = len(vocabulary)
    print('Loaded', W, 'words from', path)
    return vocabulary, vectors

# --- distance metrics --- #
def bespoke_distance(nmer1, nmer2):
    """
    Hand-crafted distance function, probably not a real metric.
    Thinking about what is 'hard to differentiate' as a human looking at
    strings. Properties:
        - adjacent swaps are hard to detect
        - i ~ l (1 is not a problem as it does not exist in base32)
        - b ~ d
        - p ~ q
        - m ~ n
        - v ~ w
        - c ~ e
        - a ~ o
    Note: this is largely arbitrary from me, partially influenced by:
    http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3541865/table/t1-ptj3712663/
    One possible implementation is sketched after this stub.
    """
    # I can already feel how slow this is going to be.
    # placeholder: not yet implemented, returns a random 'distance'
    d = abs(np.random.normal())
    return d
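
# One possible realisation of the properties above (a sketch, not the
# author's metric; CONFUSABLE and bespoke_distance_sketch are names
# introduced here for illustration): positionwise comparison in which
# confusable pairs cost less than arbitrary substitutions, with a discount
# for a single adjacent swap.
CONFUSABLE = {frozenset(p) for p in
              [('i', 'l'), ('b', 'd'), ('p', 'q'), ('m', 'n'),
               ('v', 'w'), ('c', 'e'), ('a', 'o')]}

def bespoke_distance_sketch(nmer1, nmer2):
    d = 0.0
    for (c1, c2) in zip(nmer1, nmer2):
        if c1 == c2:
            continue
        elif frozenset((c1, c2)) in CONFUSABLE:
            d += 0.5    # visually confusable characters are 'close'
        else:
            d += 1.0    # ordinary substitution
    # adjacent swaps are hard to detect: if nmer2 is nmer1 with one pair of
    # neighbouring characters transposed, shrink the distance
    for k in range(len(nmer1) - 1):
        swapped = nmer1[:k] + nmer1[k+1] + nmer1[k] + nmer1[k+2:]
        if swapped == nmer2:
            d = min(d, 0.5)
    return d
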
def base32_distances(base32_nmers, metric='levenshtein'):
    """
    Get pairwise distances (different metrics).
    This takes a little while.
    """
    N = len(base32_nmers)
    total = N*(N-1.0)/2
    print('Calculating', N*(N-1)//2, 'pairwise distances.')
    # zeros, so the (zero) diagonal needs no computing
    d = np.zeros(shape=(N, N), dtype=float)
    n = 0
    for i in range(N):
        for j in range(i+1, N):
            n += 1
            if n % 500000 == 0:
                sys.stdout.write('\r' + '%.4f' % (float(n*100)/total) + '%')
                sys.stdout.flush()
            if metric == 'levenshtein':
                dij = editdistance.eval(base32_nmers[i], base32_nmers[j])
            elif metric == 'bespoke':
                dij = bespoke_distance(base32_nmers[i], base32_nmers[j])
            else:
                raise NotImplementedError
            d[i, j] = dij
            d[j, i] = dij
    print('')
    return d

# --- some optimisation stuff --- #
def get_proposal(A, B):
    """
    Propose a new ordering of B for Metropolis-Hastings.
    Pick a 'violated' pivot with probability proportional to the inner
    product of the corresponding rows of A and B (rows where the two
    distance matrices are most aligned are the worst offenders, since we
    want B to differ from A), then reorder the remaining indices so that
    the largest entries of B's pivot row land at the positions where A's
    pivot row is smallest.
    """
    n = A.shape[0]
    # get a pivot point
    inner_products = np.einsum('i...,...i', A, B)
    violated = np.random.choice(n, size=1, p=inner_products/np.sum(inner_products))[0]
    # get the rest
    v = np.array([x for (i, x) in enumerate(A[:, violated]) if not i == violated])
    phi = np.array([x for (i, x) in enumerate(B[:, violated]) if not i == violated])
    # now reorder... (argsort gives low to high, remember)
    v_order = np.argsort(v)
    phi_order = np.argsort(-phi)
    # we want to move the highest phi to the lowest v, i.e.
    # ordering[v_order[k]] = phi_order[k] for every k
    ordering_subset = np.empty(len(v), dtype=int)
    ordering_subset[v_order] = phi_order
    # shift indices at or above the pivot to make room for reinsertion
    ordering_subset[np.where(ordering_subset >= violated)] += 1
    # reinsert the pivot into the ordering
    ordering = np.empty(shape=len(v)+1, dtype=int)
    ordering[:violated] = ordering_subset[:violated]
    ordering[violated] = violated
    ordering[(violated+1):] = ordering_subset[violated:]
    assert len(set(ordering)) == len(ordering)
    return ordering
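
# Worked example (sketch): with v = [0.5, 0.2, 0.9] and phi = [0.1, 0.7, 0.4]
# we get v_order = [1, 0, 2] and phi_order = [1, 2, 0], hence
# ordering_subset = [2, 1, 0]: position 1 (smallest v) receives old index 1
# (largest phi), position 0 (middle v) receives old index 2 (middle phi),
# and position 2 (largest v) receives old index 0 (smallest phi).
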
def find_ordering(A, B, eps=0.00001):
    """
    Reorder the rows/columns of B to maximise its mean absolute difference
    to A... possibly. Use Metropolis-Hastings for some reason, with a
    crudely-adapted temperature.
    """
    assert A.shape[0] == A.shape[1]
    assert B.shape == A.shape
    ordering = np.arange(A.shape[0])    # in case nothing is ever accepted
    diff = np.mean(abs(A - B))
    temperature = diff
    delta = 100
    cumulative_delta = 0
    print(diff, delta)
    accept, reject = 0, 0
    while abs(delta) > eps:
        # get proposal ordering
        proposal_ordering = get_proposal(A, B)
        proposal_B = B[proposal_ordering, :][:, proposal_ordering]
        proposal_diff = np.mean(abs(A - proposal_B))
        # accept with some probability (improvements are always accepted)
        proposal_delta = diff - proposal_diff
        prob = min(1, np.exp(-proposal_delta/temperature))
        if np.random.random() <= prob:
            accept += 1
            ordering = proposal_ordering
            diff = proposal_diff
            delta = proposal_delta
            cumulative_delta -= delta
            B = proposal_B
            print(diff, -delta, cumulative_delta)
            if accept % 100 == 0:
                temperature /= 1.1
        else:
            reject += 1
            if reject % 100 == 0:
                temperature *= 1.15
        temperature *= 0.99999
    acceptance_rate = float(accept)/(accept + reject)
    return ordering, acceptance_rate, temperature
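
# Typical use (sketch): A and B are square distance matrices of equal shape.
#   A = base32_distances(nmers)
#   B = squareform(pdist(vectors))
#   ordering, acceptance_rate, final_temperature = find_ordering(A, B)
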
# --- maps --- #
def random_map(nmers, vocabulary):
    """
    Totally random map, totally unconstrained, totally boring.
    """
    forward_mapping = dict(zip(nmers, vocabulary))
    backward_mapping = dict(zip(vocabulary, nmers))
    return forward_mapping, backward_mapping

def diverse_map(nmers, vocabulary, vectors):
    """
    Map which aims to map pairs of similar base32 nmers to pairs of
    dissimilar language words.
    """
    N = len(nmers)
    A = base32_distances(nmers)
    print(A.shape)
    B = squareform(pdist(vectors))
    print(B.shape)
    # find_ordering returns (ordering, acceptance_rate, temperature)
    ordering, acceptance_rate, temperature = find_ordering(A, B)
    forward_mapping, backward_mapping = dict(), dict()
    for i in range(N):
        triple = nmers[i]
        word = vocabulary[ordering[i]]
        forward_mapping[triple] = word
        backward_mapping[word] = triple
    return forward_mapping, backward_mapping

def get_map(nmers, vocabulary, vectors=None, mapping='random'):
    """
    Prep and get a map.
    """
    N = len(nmers)
    W = len(vocabulary)
    if W < N:
        print('ERROR: Not enough words.')
        return False
    if W > N:
        print('There are', W, 'elements in the vocabulary and only', N,
              'nmers: subsetting.')
        # subset without replacement (duplicates would break the bijection),
        # keeping the vectors aligned with the surviving words
        keep = np.random.choice(W, N, replace=False)
        vocabulary = [vocabulary[i] for i in keep]
        if vectors is not None:
            vectors = vectors[keep]
    if mapping == 'random':
        print('Using random map.')
        forward_mapping, backward_mapping = random_map(nmers, vocabulary)
    elif mapping == 'diverse':
        print('Using diverse map.')
        if vectors is None:
            print('ERROR: diverse map requires vectors.')
            return False
        forward_mapping, backward_mapping = diverse_map(nmers, vocabulary, vectors)
    else:
        print('ERROR: Not implemented :(')
        return False
    # sanity check: the two maps must be mutually consistent
    for (k, v) in forward_mapping.items():
        if not backward_mapping[v] == k:
            print(k, v)
    return forward_mapping, backward_mapping
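
# Minimal end-to-end sketch (assumes a whitespace-separated vector file at
# the hypothetical path 'vectors.txt'; see get_language for the format):
if __name__ == '__main__':
    nmers = get_base32()
    vocabulary, vectors = get_language('vectors.txt')
    forward_mapping, backward_mapping = get_map(nmers, vocabulary,
                                                vectors=vectors,
                                                mapping='random')
    example = nmers[0]
    print(example, '->', forward_mapping[example])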