1313import pydot
1414from sklearn import tree , neighbors , ensemble , cross_validation
1515from sklearn .svm import SVC , LinearSVC
16+ from sklearn .metrics import confusion_matrix
1617from StringIO import StringIO
1718from pybrain .datasets .classification import ClassificationDataSet
1819from pybrain .tools .shortcuts import buildNetwork
1920from pybrain .tools .validation import CrossValidator
2021from pybrain .supervised .trainers import BackpropTrainer
2122from pybrain .tools .xml .networkwriter import NetworkWriter
2223from pybrain .tools .xml .networkreader import NetworkReader
24+ from pprint import pprint
2325
2426import matplotlib .pyplot as plt
2527
@@ -73,11 +75,12 @@ def load(filename):
7375 adults = [line for line in data if "?" not in line ] # remove lines with unknown data
7476
7577 return np .loadtxt (adults ,
76- delimiter = ', ' ,
77- converters = converters ,
78- dtype = 'u4' ,
79- skiprows = 1
80- )
78+ delimiter = ', ' ,
79+ converters = converters ,
80+ dtype = 'u4' ,
81+ skiprows = 1
82+ )
83+
8184
8285def start_adult ():
8386 """
@@ -92,7 +95,7 @@ def start_adult():
9295 rx , ry = np .hsplit (te , [14 ])
9396 ty = ty .flatten ()
9497 ry = ry .flatten ()
95- return tx , ty , rx , ry
98+ return tx , ty , rx , ry
9699
97100
98101def decisionTree (tx , ty , rx , ry , height ):
@@ -101,7 +104,7 @@ def decisionTree(tx, ty, rx, ry, height):
101104 clf = tree .DecisionTreeClassifier (criterion = "gini" , max_depth = 1 )
102105 clf .fit (tx , ty )
103106 dotdata = StringIO ()
104- tree .export_graphviz (clf , out_file = dotdata )
107+ tree .export_graphviz (clf , out_file = dotdata )
105108 graph = pydot .graph_from_dot_data (dotdata .getvalue ())
106109 graph .write_pdf ("out.pdf" )
107110 return sum ((clf .predict (rx ) - ry )** 2 )/ float (len (ry )) # + cross_validation.cross_val_score(clf, tx, ty).mean()
@@ -120,8 +123,8 @@ def knntester(tx, ty, rx, ry, iterations):
120123 """
121124 er = []
122125 et = []
123- positions = range (1 ,iterations )
124- for n in xrange (1 ,iterations ):
126+ positions = range (1 , iterations )
127+ for n in xrange (1 , iterations ):
125128 neigh = neighbors .KNeighborsClassifier (n_neighbors = n , weights = 'distance' )
126129 neigh = neigh .fit (tx , ty )
127130 er .append (sum ((neigh .predict (rx ) - ry )** 2 )/ float (len (ry )))
@@ -140,7 +143,7 @@ def knntester(tx, ty, rx, ry, iterations):
140143
141144def nn (tx , ty , rx , ry , iterations ):
142145 network = buildNetwork (14 , 5 , 5 , 1 )
143- ds = ClassificationDataSet (14 ,1 , class_labels = ["<50K" , ">=50K" ])
146+ ds = ClassificationDataSet (14 , 1 , class_labels = ["<50K" , ">=50K" ])
144147 for i in xrange (len (tx )):
145148 ds .addSample (tx [i ], [ty [i ]])
146149 trainer = BackpropTrainer (network , ds )
@@ -149,6 +152,7 @@ def nn(tx, ty, rx, ry, iterations):
149152 results = sum ((np .array ([round (network .activate (test )) for test in rx ]) - ry )** 2 )/ float (len (ry ))
150153 return results
151154
155+
152156def loadnn (name ):
153157 network = NetworkReader (name )
154158
@@ -160,7 +164,7 @@ def boosting(tx, ty, rx, ry, n):
160164
161165
def svm(tx, ty, rx, ry):
    """
    Train a linear-kernel support vector machine on the training set
    and return its error rate on the test set.

    tx, ty -- training features / labels
    rx, ry -- test features / labels

    Labels are 0/1, so the mean squared difference between predictions
    and truth equals the misclassification rate.
    """
    classifier = SVC(kernel="linear")
    classifier.fit(tx, ty)
    predictions = classifier.predict(rx)
    return sum((predictions - ry)**2) / float(len(ry))
166170
@@ -177,7 +181,12 @@ def boostTest(tx, ty, rx, ry, iterations):
177181 resultsr .append (sum ((clf .predict (rx ) - ry )** 2 )/ float (len (ry )))
178182 plt .plot (num , resultst , 'ro' , num , resultsr , 'bo' )
179183 plt .axis ([0 , iterations , 0 , 1 ])
184+ plt .title ("Boosted Decision Tree Error" )
185+ plt .ylabel ("Error Rate" )
186+ plt .xlabel ("Number of Estimators" )
187+ plt .savefig ('boostgraph.png' , dpi = 500 )
180188 plt .show ()
189+ return resultsr
181190
182191
183192def nntester (tx , ty , rx , ry , iterations ):
@@ -189,7 +198,7 @@ def nntester(tx, ty, rx, ry, iterations):
189198 resultsr = []
190199 positions = range (iterations )
191200 network = buildNetwork (14 , 14 , 1 , bias = True )
192- ds = ClassificationDataSet (14 ,1 , class_labels = ["<50K" , ">=50K" ])
201+ ds = ClassificationDataSet (14 , 1 , class_labels = ["<50K" , ">=50K" ])
193202 for i in xrange (len (tx )):
194203 ds .addSample (tx [i ], [ty [i ]])
195204 trainer = BackpropTrainer (network , ds , learningrate = 0.01 )
@@ -210,7 +219,7 @@ def nntester(tx, ty, rx, ry, iterations):
210219
211220def cvnntester (tx , ty , rx , ry , iterations , folds ):
212221 network = buildNetwork (14 , 14 , 1 , bias = True )
213- ds = ClassificationDataSet (14 ,1 , class_labels = ["<50K" , ">=50K" ])
222+ ds = ClassificationDataSet (14 , 1 , class_labels = ["<50K" , ">=50K" ])
214223 for i in xrange (len (tx )):
215224 ds .addSample (tx [i ], [ty [i ]])
216225 trainer = BackpropTrainer (network , ds , learningrate = 0.01 )
@@ -219,7 +228,32 @@ def cvnntester(tx, ty, rx, ry, iterations, folds):
219228 print sum ((np .array ([round (network .activate (test )) for test in rx ]) - ry )** 2 )/ float (len (ry ))
220229
221230
231+ def treeTest (tx , ty , rx , ry , iterations ):
232+ resultst = []
233+ resultsr = []
234+ num = range (iterations )
235+ for i in xrange (iterations ):
236+ print i
237+ clf = tree .DecisionTreeClassifier (max_depth = i + 1 , criterion = "entropy" )
238+ clf .fit (tx , ty )
239+ resultst .append (sum ((clf .predict (tx ) - ty )** 2 )/ float (len (ty )))
240+ resultsr .append (sum ((clf .predict (rx ) - ry )** 2 )/ float (len (ry )))
241+ plt .plot (num , resultst , 'ro' , num , resultsr , 'bo' )
242+ plt .axis ([0 , iterations , 0 , 1 ])
243+ plt .title ("Decision Tree error" )
244+ plt .ylabel ("Error Rate" )
245+ plt .xlabel ("Maximum Tree Depth" )
246+ plt .savefig ('entropytree.png' , dpi = 500 )
247+ plt .show ()
248+ return resultsr
249+
250+
def treeConfusion(tx, ty, rx, ry):
    """
    Fit a depth-7 gini decision tree on the training set and return
    the confusion matrix of its predictions on the test set.

    tx, ty -- training features / labels
    rx, ry -- test features / labels
    """
    model = tree.DecisionTreeClassifier(max_depth=7, criterion="gini")
    model.fit(tx, ty)
    predicted = model.predict(rx)
    return confusion_matrix(ry, predicted)
223257
224258
225259if __name__ == "__main__" :
@@ -233,14 +267,18 @@ def cvnntester(tx, ty, rx, ry, iterations, folds):
233267 # print "Boosting (100): " + str(boosting(tx, ty, rx, ry, 100))
234268 # print "Boosting (500): " + str(boosting(tx, ty, rx, ry, 500))
235269 # print "SVM: " + str(svm(tx, ty, rx, ry))
236- # boostTest(tx, ty, rx, ry, 25)
270+ # pprint(boostTest(tx, ty, rx, ry, 500))
271+ # pprint(treeTest(tx, ty, rx, ry, 25))
237272 # nntester(tx, ty, rx, ry, 500)
238273 # knntester(tx, ty, rx, ry, 100)
239- cvnntester (tx , ty , rx , ry , 500 , 10 )
240-
274+ # cvnntester(tx, ty, rx, ry, 500, 10)
275+ # print treeConfusion(tx, ty, rx, ry)
241276
242277"""
243- decision stump result: .248922% error, as a baseline
244- pruned decision tree result: .217984% error
245- unpruned adaboost decision stump (10): .15926% error
278+ decision stump result: .248922 error, as a baseline
279+ pruned decision tree result: .217984 error
280+ unpruned adaboost decision stump (10): .15926 error
281+ cross validated 10 fold nn: .241235 error
282+ SVM rbf Kernel: .24567 error
283+ adaboost (500): .134 error
246284"""
0 commit comments