Skip to content

Commit 264e211

Browse files
committed
added hills analysis and updated adults analysis code
1 parent 4d9ef77 commit 264e211

File tree

2 files changed: +291 −19 lines

P1/Adults/adult.py

Lines changed: 57 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@
1313
import pydot
1414
from sklearn import tree, neighbors, ensemble, cross_validation
1515
from sklearn.svm import SVC, LinearSVC
16+
from sklearn.metrics import confusion_matrix
1617
from StringIO import StringIO
1718
from pybrain.datasets.classification import ClassificationDataSet
1819
from pybrain.tools.shortcuts import buildNetwork
1920
from pybrain.tools.validation import CrossValidator
2021
from pybrain.supervised.trainers import BackpropTrainer
2122
from pybrain.tools.xml.networkwriter import NetworkWriter
2223
from pybrain.tools.xml.networkreader import NetworkReader
24+
from pprint import pprint
2325

2426
import matplotlib.pyplot as plt
2527

@@ -73,11 +75,12 @@ def load(filename):
7375
adults = [line for line in data if "?" not in line] # remove lines with unknown data
7476

7577
return np.loadtxt(adults,
76-
delimiter=', ',
77-
converters=converters,
78-
dtype='u4',
79-
skiprows=1
80-
)
78+
delimiter=', ',
79+
converters=converters,
80+
dtype='u4',
81+
skiprows=1
82+
)
83+
8184

8285
def start_adult():
8386
"""
@@ -92,7 +95,7 @@ def start_adult():
9295
rx, ry = np.hsplit(te, [14])
9396
ty = ty.flatten()
9497
ry = ry.flatten()
95-
return tx, ty, rx, ry
98+
return tx, ty, rx, ry
9699

97100

98101
def decisionTree(tx, ty, rx, ry, height):
@@ -101,7 +104,7 @@ def decisionTree(tx, ty, rx, ry, height):
101104
clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=1)
102105
clf.fit(tx, ty)
103106
dotdata = StringIO()
104-
tree.export_graphviz(clf, out_file=dotdata)
107+
tree.export_graphviz(clf, out_file=dotdata)
105108
graph = pydot.graph_from_dot_data(dotdata.getvalue())
106109
graph.write_pdf("out.pdf")
107110
return sum((clf.predict(rx) - ry)**2)/float(len(ry)) # + cross_validation.cross_val_score(clf, tx, ty).mean()
@@ -120,8 +123,8 @@ def knntester(tx, ty, rx, ry, iterations):
120123
"""
121124
er = []
122125
et = []
123-
positions = range(1,iterations)
124-
for n in xrange(1,iterations):
126+
positions = range(1, iterations)
127+
for n in xrange(1, iterations):
125128
neigh = neighbors.KNeighborsClassifier(n_neighbors=n, weights='distance')
126129
neigh = neigh.fit(tx, ty)
127130
er.append(sum((neigh.predict(rx) - ry)**2)/float(len(ry)))
@@ -140,7 +143,7 @@ def knntester(tx, ty, rx, ry, iterations):
140143

141144
def nn(tx, ty, rx, ry, iterations):
142145
network = buildNetwork(14, 5, 5, 1)
143-
ds = ClassificationDataSet(14,1, class_labels=["<50K", ">=50K"])
146+
ds = ClassificationDataSet(14, 1, class_labels=["<50K", ">=50K"])
144147
for i in xrange(len(tx)):
145148
ds.addSample(tx[i], [ty[i]])
146149
trainer = BackpropTrainer(network, ds)
@@ -149,6 +152,7 @@ def nn(tx, ty, rx, ry, iterations):
149152
results = sum((np.array([round(network.activate(test)) for test in rx]) - ry)**2)/float(len(ry))
150153
return results
151154

155+
152156
def loadnn(name):
153157
network = NetworkReader(name)
154158

@@ -160,7 +164,7 @@ def boosting(tx, ty, rx, ry, n):
160164

161165

162166
def svm(tx, ty, rx, ry):
    """Train a linear-kernel support vector classifier on the training
    split (tx, ty) and return its mean squared prediction error on the
    hold-out split (rx, ry)."""
    classifier = SVC(kernel="linear")
    classifier.fit(tx, ty)
    predictions = classifier.predict(rx)
    return sum((predictions - ry) ** 2) / float(len(ry))
166170

@@ -177,7 +181,12 @@ def boostTest(tx, ty, rx, ry, iterations):
177181
resultsr.append(sum((clf.predict(rx) - ry)**2)/float(len(ry)))
178182
plt.plot(num, resultst, 'ro', num, resultsr, 'bo')
179183
plt.axis([0, iterations, 0, 1])
184+
plt.title("Boosted Decision Tree Error")
185+
plt.ylabel("Error Rate")
186+
plt.xlabel("Number of Estimators")
187+
plt.savefig('boostgraph.png', dpi=500)
180188
plt.show()
189+
return resultsr
181190

182191

183192
def nntester(tx, ty, rx, ry, iterations):
@@ -189,7 +198,7 @@ def nntester(tx, ty, rx, ry, iterations):
189198
resultsr = []
190199
positions = range(iterations)
191200
network = buildNetwork(14, 14, 1, bias=True)
192-
ds = ClassificationDataSet(14,1, class_labels=["<50K", ">=50K"])
201+
ds = ClassificationDataSet(14, 1, class_labels=["<50K", ">=50K"])
193202
for i in xrange(len(tx)):
194203
ds.addSample(tx[i], [ty[i]])
195204
trainer = BackpropTrainer(network, ds, learningrate=0.01)
@@ -210,7 +219,7 @@ def nntester(tx, ty, rx, ry, iterations):
210219

211220
def cvnntester(tx, ty, rx, ry, iterations, folds):
212221
network = buildNetwork(14, 14, 1, bias=True)
213-
ds = ClassificationDataSet(14,1, class_labels=["<50K", ">=50K"])
222+
ds = ClassificationDataSet(14, 1, class_labels=["<50K", ">=50K"])
214223
for i in xrange(len(tx)):
215224
ds.addSample(tx[i], [ty[i]])
216225
trainer = BackpropTrainer(network, ds, learningrate=0.01)
@@ -219,7 +228,32 @@ def cvnntester(tx, ty, rx, ry, iterations, folds):
219228
print sum((np.array([round(network.activate(test)) for test in rx]) - ry)**2)/float(len(ry))
220229

221230

231+
def treeTest(tx, ty, rx, ry, iterations):
232+
resultst = []
233+
resultsr = []
234+
num = range(iterations)
235+
for i in xrange(iterations):
236+
print i
237+
clf = tree.DecisionTreeClassifier(max_depth=i+1, criterion="entropy")
238+
clf.fit(tx, ty)
239+
resultst.append(sum((clf.predict(tx) - ty)**2)/float(len(ty)))
240+
resultsr.append(sum((clf.predict(rx) - ry)**2)/float(len(ry)))
241+
plt.plot(num, resultst, 'ro', num, resultsr, 'bo')
242+
plt.axis([0, iterations, 0, 1])
243+
plt.title("Decision Tree error")
244+
plt.ylabel("Error Rate")
245+
plt.xlabel("Maximum Tree Depth")
246+
plt.savefig('entropytree.png', dpi=500)
247+
plt.show()
248+
return resultsr
249+
250+
251+
def treeConfusion(tx, ty, rx, ry):
    """Fit a depth-7 Gini decision tree on (tx, ty) and return the
    confusion matrix of its predictions against the hold-out labels ry."""
    classifier = tree.DecisionTreeClassifier(max_depth=7, criterion="gini")
    predicted = classifier.fit(tx, ty).predict(rx)
    return confusion_matrix(ry, predicted)
223257

224258

225259
if __name__ == "__main__":
@@ -233,14 +267,18 @@ def cvnntester(tx, ty, rx, ry, iterations, folds):
233267
# print "Boosting (100): " + str(boosting(tx, ty, rx, ry, 100))
234268
# print "Boosting (500): " + str(boosting(tx, ty, rx, ry, 500))
235269
# print "SVM: " + str(svm(tx, ty, rx, ry))
236-
# boostTest(tx, ty, rx, ry, 25)
270+
# pprint(boostTest(tx, ty, rx, ry, 500))
271+
# pprint(treeTest(tx, ty, rx, ry, 25))
237272
# nntester(tx, ty, rx, ry, 500)
238273
# knntester(tx, ty, rx, ry, 100)
239-
cvnntester(tx, ty, rx, ry, 500, 10)
240-
274+
# cvnntester(tx, ty, rx, ry, 500, 10)
275+
# print treeConfusion(tx, ty, rx, ry)
241276

242277
"""
243-
decision stump result: .248922% error, as a baseline
244-
pruned decision tree result: .217984% error
245-
unpruned adaboost decision stump (10): .15926% error
278+
decision stump result: .248922 error, as a baseline
279+
pruned decision tree result: .217984 error
280+
unpruned adaboost decision stump (10): .15926 error
281+
cross validated 10 fold nn: .241235 error
282+
SVM rbf Kernel: .24567 error
283+
adaboost (500): .134 error
246284
"""

0 commit comments

Comments
 (0)