-
Notifications
You must be signed in to change notification settings - Fork 6
/
cluster_vectors.py
51 lines (42 loc) · 1.53 KB
/
cluster_vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from annoy import AnnoyIndex
from scipy import spatial
from nltk import ngrams
import random, json, glob, os, codecs, random
import numpy as np
# data structures
# integer id used for an input inside the ANN index -> that input's base name
file_index_to_file_name = {}
# the same integer id -> the vector loaded from that input file
file_index_to_file_vector = {}
# NOTE(review): never read or written in the visible part of this script
chart_image_positions = {}

# config
dims = 2048  # vector length handed to AnnoyIndex
n_nearest_neighbors = 30  # neighbors requested per item
trees = 10000  # value passed to AnnoyIndex.build
# glob.glob returns matches in arbitrary, platform-dependent order; sorting
# keeps the file -> index assignment stable from one run to the next.
infiles = sorted(glob.glob('image_vectors/*.npz'))
# build ann index
# Name the metric explicitly: current Annoy releases emit a deprecation
# warning when it is omitted. 'angular' is the value Annoy has used as its
# implicit default, so the resulting index is unchanged.
t = AnnoyIndex(dims, metric='angular')
for file_index, vector_path in enumerate(infiles):
    # NOTE(review): the inputs carry an .npz extension but are parsed as plain
    # text by np.loadtxt -- confirm they were written in a text format.
    file_vector = np.loadtxt(vector_path)
    # everything before the first '.' of the base name identifies the input
    file_name = os.path.basename(vector_path).split('.')[0]
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    t.add_item(file_index, file_vector)
# the index must be built before it is queried
t.build(trees)
# create a nearest neighbors json file for each input
if not os.path.exists('nearest_neighbors'):
    os.makedirs('nearest_neighbors')

for item_id, item_name in file_index_to_file_name.items():
    item_vector = file_index_to_file_vector[item_id]
    neighbor_records = []
    # ask the index for the ids closest to this item
    for neighbor_id in t.get_nns_by_item(item_id, n_nearest_neighbors):
        cosine_similarity = 1 - spatial.distance.cosine(
            item_vector, file_index_to_file_vector[neighbor_id])
        neighbor_records.append({
            'filename': file_index_to_file_name[neighbor_id],
            # truncated (not rounded) to four decimal places
            'similarity': int((cosine_similarity * 10000)) / 10000.0
        })
    # one json file per input, named after the input's base name
    with open('nearest_neighbors/' + item_name + '.json', 'w') as out:
        json.dump(neighbor_records, out)