-
Notifications
You must be signed in to change notification settings - Fork 105
/
Copy pathKNN_without_scikit_learn.py
153 lines (102 loc) · 3.93 KB
/
KNN_without_scikit_learn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# coding: utf-8
# In[64]:
#importing libraries
#!pip install pandas
#!pip install matplotlib
#!pip install sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# In[71]:
# Load the diabetes dataset from a CSV file in the current working directory.
dataset = pd.read_csv(filepath_or_buffer="diabetes.csv")
# Preview the first five rows; the result is only displayed when this runs as a
# notebook cell, in a plain script the value is discarded.
dataset.head(n=5)
# In[72]:
'''
About the dataset
The goal is to predict the class label "Outcome": whether the person has diabetes or not.
The input features are Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction and Age.
'''
# In[73]:
# Count how many rows fall into each class of the "Outcome" label.
dataset["Outcome"].value_counts()
# In[74]:
# NOTE(review): an earlier note here said the classes are balanced "with 50 each";
# that cannot be confirmed from this script - check the counts printed above
# before assuming KNN is unaffected by class imbalance.
# In[75]:
# Split the data into three consecutive, unshuffled parts:
#   Train - the reference points the KNN vote is taken from
#   Dev   - used to choose the best K
#   Test  - used once, with the chosen K, to estimate the error on unseen data
# K is tuned on Dev rather than Test so that the Test score stays an honest
# estimate of the generalization error.

# Separate the label column from the features (this removes it from `dataset`).
label_data = dataset.pop("Outcome")

# First cut: the leading 80% of the rows form the train+dev pool, the rest is Test.
eighty_percent_data = len(dataset) * 80 // 100
X_test = dataset.iloc[eighty_percent_data:]
y_test = label_data.iloc[eighty_percent_data:]
X_train = dataset.iloc[:eighty_percent_data]
y_train = label_data.iloc[:eighty_percent_data]

# Second cut: the leading 80% of the pool stays Train, the remainder becomes Dev.
# Dev must be sliced off before Train is overwritten.
eighty_percent_data = len(X_train) * 80 // 100
X_dev = X_train.iloc[eighty_percent_data:]
y_dev = y_train.iloc[eighty_percent_data:]
X_train = X_train.iloc[:eighty_percent_data]
y_train = y_train.iloc[:eighty_percent_data]
# In[79]:
# For every dev-set row, rank all training rows by Euclidean distance
# (np.linalg.norm of the feature difference).
# distances[d] is a list of (distance, train_position) tuples sorted from
# nearest to farthest; equal distances are ordered by the smaller position.
distances = []
for dev_pos in range(len(X_dev)):
    dev_row = X_dev.iloc[dev_pos]
    ranked = sorted(
        (np.linalg.norm(dev_row - X_train.iloc[train_pos]), train_pos)
        for train_pos in range(len(X_train))
    )
    distances.append(ranked)
# In[89]:
# Score each candidate K on the dev set.
# Only odd values are tried, so a two-class majority vote can never tie.
K_values = list(range(1, 32, 2))
accuracy_scores = []
for neighbor_count in K_values:
    # Predict every dev row by majority vote among its nearest training rows.
    pred = []
    for ranked in distances:
        nearest_labels = [
            y_train.iloc[ranked[rank][1]] for rank in range(neighbor_count)
        ]
        zero_votes = nearest_labels.count(0)
        one_votes = len(nearest_labels) - zero_votes
        pred.append(0 if zero_votes > one_votes else 1)

    # Accuracy = fraction of dev rows whose predicted label matches the true one.
    correct_count = sum(
        1 for pos, guess in enumerate(pred) if guess == y_dev.iloc[pos]
    )
    accuracy_scores.append(correct_count / len(pred))
# In[91]:
# Pick the K with the highest dev accuracy; on a tie the smallest K wins
# because max() returns the first maximal element.
optimal_k = max(zip(K_values, accuracy_scores), key=lambda pair: pair[1])[0]
# In[92]:
print('\nThe optimal number of neighbors is %d.' % optimal_k)

# Plot dev accuracy against K to show how sensitive the model is to the choice.
plt.plot(K_values, accuracy_scores)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Accuracy score')
plt.show()
# In[95]:
# Final evaluation: apply the K chosen on the dev set (optimal_k) to the
# held-out test set, which has not been used for any tuning so far.

# For every test row, rank all training rows by Euclidean distance.
# distances[t] is a list of (distance, train_position) tuples, nearest first.
distances = []
for test_pos in range(len(X_test)):
    test_row = X_test.iloc[test_pos]
    ranked = sorted(
        (np.linalg.norm(test_row - X_train.iloc[train_pos]), train_pos)
        for train_pos in range(len(X_train))
    )
    distances.append(ranked)
# In[102]:
# Predict each test row by majority vote among its optimal_k nearest neighbours.
# The vote size must be optimal_k: a loop index left over from the distance
# computation above would be len(X_test) - 1, not the tuned K.
pred = []
for ranked in distances:
    nearest_labels = [
        y_train.iloc[train_pos] for _, train_pos in ranked[:optimal_k]
    ]
    zero_count = nearest_labels.count(0)
    one_count = len(nearest_labels) - zero_count
    pred.append(0 if zero_count > one_count else 1)

# Accuracy on the test set, reported as a percentage.
correct_count = sum(
    1 for pos, guess in enumerate(pred) if guess == y_test.iloc[pos]
)
print("accuracy score ",(correct_count/len(pred))*100)