-
Notifications
You must be signed in to change notification settings - Fork 5
/
module_clustering.py
111 lines (96 loc) · 3.06 KB
/
module_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
The purpose of this module is to:
* create product clusters
* provide TSNE plot
* provide final category labels
"""
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from yellowbrick.cluster import KElbowVisualizer
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
class p2cluster:
    """
    Create product clusters from a trained embedding model.

    Expected call order: ``tsne_train`` first (fills ``labels``, ``x``, ``y``),
    then ``train_cluster`` before ``clust_plot`` or ``get_categories``.
    ``tsne_plot`` and ``elbow_plot`` only need ``tsne_train`` to have run.
    """

    def __init__(self, w2v_model):
        """
        Class constructor

        :param w2v_model: model exposing a ``wv`` attribute that supports
            ``wv[word]`` lookups and a ``key_to_index`` or ``vocab`` mapping
            (presumably a gensim Word2Vec model -- confirm against caller).
        """
        self.w2v_model = w2v_model
        self.labels = []  # vocabulary entries, aligned with x / y
        self.x = []  # first t-SNE coordinate per vocabulary entry
        self.y = []  # second t-SNE coordinate per vocabulary entry
        self.kmeans = None  # fitted KMeans estimator, set by train_cluster

    def _vocab_words(self):
        """Return the model vocabulary as a list of words."""
        wv = self.w2v_model.wv
        # gensim >= 4 exposes ``key_to_index``; accessing ``vocab`` there
        # raises, so only fall back to it when the new attribute is missing.
        vocab = getattr(wv, "key_to_index", None)
        if vocab is None:
            vocab = wv.vocab
        return list(vocab)

    def _points(self):
        """Return the t-SNE coordinates as an (n, 2) array."""
        return np.column_stack((self.x, self.y))

    def _fitted_kmeans(self):
        """Return the fitted KMeans estimator or raise if it is missing."""
        if self.kmeans is None:
            # AttributeError keeps the exception type callers saw before
            # ``kmeans`` was initialised in the constructor.
            raise AttributeError(
                "kmeans is not fitted yet; call train_cluster() first"
            )
        return self.kmeans

    def tsne_train(self, perplexity=2, no_iterations=5000):
        """
        Creates TSNE model

        Projects every vocabulary vector to two dimensions and stores the
        result in ``labels``, ``x`` and ``y``. Calling it again replaces the
        previous projection instead of appending to it.

        :param perplexity: passed to ``TSNE(perplexity=...)``.
        :param no_iterations: number of optimisation iterations for TSNE.
        :raises ValueError: if the model vocabulary is empty.
        """
        import inspect

        words = self._vocab_words()
        if not words:
            raise ValueError("w2v_model has an empty vocabulary")

        # Pass a real array: some scikit-learn versions read ``X.shape``
        # before validating the input, which a plain list does not have.
        tokens = np.asarray([self.w2v_model.wv[word] for word in words])

        # scikit-learn renamed ``n_iter`` to ``max_iter``; use whichever
        # keyword the installed TSNE constructor accepts.
        tsne_params = inspect.signature(TSNE.__init__).parameters
        iter_kwarg = "max_iter" if "max_iter" in tsne_params else "n_iter"

        tsne_model = TSNE(
            perplexity=perplexity,
            n_components=2,
            init="random",
            random_state=23,
            **{iter_kwarg: no_iterations},
        )
        new_values = tsne_model.fit_transform(tokens)

        # Rebuild state in one go so labels / x / y always stay aligned.
        self.labels = words
        self.x = [value[0] for value in new_values]
        self.y = [value[1] for value in new_values]

    def tsne_plot(self):
        """
        Create TSNE plot

        Draws every point with its own ``scatter`` call and annotates it
        with the matching label, then shows the figure.
        """
        plt.figure(figsize=(8, 8))
        for label, x_val, y_val in zip(self.labels, self.x, self.y):
            plt.scatter(x_val, y_val)
            plt.annotate(
                label,
                xy=(x_val, y_val),
                xytext=(5, 2),
                textcoords="offset points",
                ha="right",
                va="bottom",
            )
        plt.show()

    def elbow_plot(self, min_val=2, max_val=40):
        """
        Create elbow plot

        :param min_val: lower bound of the ``k`` range given to the visualizer.
        :param max_val: upper bound of the ``k`` range given to the visualizer.
        """
        model = KMeans()
        visualizer = KElbowVisualizer(model, k=(min_val, max_val))
        visualizer.fit(self._points())  # Fit the data to the visualizer
        visualizer.show()

    def train_cluster(self, nclut=25):
        """
        Train kmeans clustering

        Fits KMeans on the t-SNE coordinates and stores it in ``self.kmeans``.

        :param nclut: number of clusters.
        """
        self.kmeans = KMeans(n_clusters=nclut, random_state=0).fit(self._points())

    def clust_plot(self):
        """
        Plot the clustering results

        :raises AttributeError: if ``train_cluster`` has not been called.
        """
        kmeans = self._fitted_kmeans()
        p = sns.scatterplot(
            x=self.x, y=self.y, hue=kmeans.labels_, palette="deep"
        )  # other palette
        # The legend is removed after drawing; guard in case none was drawn.
        if p.legend_ is not None:
            p.legend_.remove()
        plt.show()

    def get_categories(self):
        """
        Return DF with product categories

        The frame has the columns ``tsne_x``, ``tsne_y``, ``product`` and
        ``category_label`` and is sorted by the numeric value of ``product``.

        :raises AttributeError: if ``train_cluster`` has not been called.
        :raises ValueError: if a label cannot be converted to ``float``.
        """
        kmeans = self._fitted_kmeans()
        product_categories = pd.DataFrame(
            data={
                "tsne_x": self.x,
                "tsne_y": self.y,
                "product": self.labels,
                "category_label": kmeans.predict(self._points()),
            }
        )
        # Labels are sorted numerically (not lexically) via a temporary
        # float column that is dropped again before returning.
        return (
            product_categories.assign(
                tmp_sort=product_categories["product"].astype(float)
            )
            .sort_values(by="tmp_sort")
            .drop(columns="tmp_sort")
        )