-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvisuals_PCA.py
163 lines (126 loc) · 5.9 KB
/
visuals_PCA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
def pca_results(good_data, pca):
'''
Create a DataFrame of the PCA results
Includes dimension feature weights and explained variance
Visualizes the PCA results
'''
# Dimension indexing
dimensions = dimensions = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)]
# PCA components
components = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys())
components.index = dimensions
# PCA explained variance
ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance'])
variance_ratios.index = dimensions
# Create a bar plot visualization
fig, ax = plt.subplots(figsize = (14,8))
# Plot the feature weights as a function of the components
components.plot(ax = ax, kind = 'bar');
ax.set_ylabel("Feature Weights")
ax.set_xticklabels(dimensions, rotation=0)
# Display the explained variance ratios
for i, ev in enumerate(pca.explained_variance_ratio_):
ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev))
# Return a concatenated DataFrame
return pd.concat([variance_ratios, components], axis = 1)
def cluster_results(reduced_data, preds, centers, pca_samples):
'''
Visualizes the PCA-reduced cluster data in two dimensions
Adds cues for cluster centers and student-selected sample data
'''
predictions = pd.DataFrame(preds, columns = ['Cluster'])
plot_data = pd.concat([predictions, reduced_data], axis = 1)
# Generate the cluster plot
fig, ax = plt.subplots(figsize = (14,8))
# Color map
cmap = cm.get_cmap('gist_rainbow')
# Color the points based on assigned cluster
for i, cluster in plot_data.groupby('Cluster'):
cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \
color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30);
# Plot centers with indicators
for i, c in enumerate(centers):
ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \
alpha = 1, linewidth = 2, marker = 'o', s=200);
ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100);
# Plot transformed sample points
ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1], \
s = 150, linewidth = 4, color = 'black', marker = 'x');
# Set plot title
ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross");
def biplot(good_data, reduced_data, pca):
'''
Produce a biplot that shows a scatterplot of the reduced
data and the projections of the original features.
good_data: original data, before transformation.
Needs to be a pandas dataframe with valid column names
reduced_data: the reduced data (the first two dimensions are plotted)
pca: pca object that contains the components_ attribute
return: a matplotlib AxesSubplot object (for any additional customization)
This procedure is inspired by the script:
https://github.com/teddyroland/python-biplot
'''
fig, ax = plt.subplots(figsize = (25,35))
# scatterplot of the reduced data
ax.scatter(x=reduced_data.loc[:, 'Dimension 1'], y=reduced_data.loc[:, 'Dimension 2'],
facecolors='b', edgecolors='b', s=70, alpha=0.5)
feature_vectors = pca.components_.T
# we use scaling factors to make the arrows easier to see
arrow_size, text_pos = 12, 12,
# projections of the original features
for i, v in enumerate(feature_vectors):
ax.arrow(0, 0, arrow_size*v[0], arrow_size*v[1],
head_width=0.05, head_length=0.05, linewidth=1, color='red')
ax.text(v[0]*text_pos, v[1]*text_pos, good_data.columns[i], color='black',
ha='center', va='center', fontsize=18)
ax.set_xlabel("Dimension 1", fontsize=14)
ax.set_ylabel("Dimension 2", fontsize=14)
ax.set_title("PC plane with original feature projections.", fontsize=16);
return ax
def channel_results(reduced_data, outliers, pca_samples):
'''
Visualizes the PCA-reduced cluster data in two dimensions using the full dataset
Data is labeled by "Channel" and cues added for student-selected sample data
'''
# Check that the dataset is loadable
try:
full_data = pd.read_csv("customers.csv")
except:
print "Dataset could not be loaded. Is the file missing?"
return False
# Create the Channel DataFrame
channel = pd.DataFrame(full_data['Channel'], columns = ['Channel'])
channel = channel.drop(channel.index[outliers]).reset_index(drop = True)
labeled = pd.concat([reduced_data, channel], axis = 1)
# Generate the cluster plot
fig, ax = plt.subplots(figsize = (14,8))
# Color map
cmap = cm.get_cmap('gist_rainbow')
# Color the points based on assigned Channel
labels = ['Hotel/Restaurant/Cafe', 'Retailer']
grouped = labeled.groupby('Channel')
for i, channel in grouped:
channel.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \
color = cmap((i-1)*1.0/2), label = labels[i-1], s=30);
# Plot transformed sample points
for i, sample in enumerate(pca_samples):
ax.scatter(x = sample[0], y = sample[1], \
s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none');
ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125);
# Set plot title
ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled");