1313import pydot
1414from sklearn import tree , neighbors , ensemble , cross_validation
1515from sklearn .svm import SVC , LinearSVC
16+ from sklearn .metrics import confusion_matrix
1617from StringIO import StringIO
1718from pybrain .datasets .classification import ClassificationDataSet
1819from pybrain .tools .shortcuts import buildNetwork
1920from pybrain .tools .validation import CrossValidator
2021from pybrain .supervised .trainers import BackpropTrainer
2122from pybrain .tools .xml .networkwriter import NetworkWriter
2223from pybrain .tools .xml .networkreader import NetworkReader
24+ from pprint import pprint
2325
2426import matplotlib .pyplot as plt
2527
@@ -73,11 +75,12 @@ def load(filename):
7375 adults = [line for line in data if "?" not in line ] # remove lines with unknown data
7476
7577 return np .loadtxt (adults ,
76- delimiter = ', ' ,
77- converters = converters ,
78- dtype = 'u4' ,
79- skiprows = 1
80- )
78+ delimiter = ', ' ,
79+ converters = converters ,
80+ dtype = 'u4' ,
81+ skiprows = 1
82+ )
83+
8184
8285def start_adult ():
8386 """
@@ -92,7 +95,7 @@ def start_adult():
9295 rx , ry = np .hsplit (te , [14 ])
9396 ty = ty .flatten ()
9497 ry = ry .flatten ()
95- return tx , ty , rx , ry
98+ return tx , ty , rx , ry
9699
97100
98101def decisionTree (tx , ty , rx , ry , height ):
@@ -101,7 +104,7 @@ def decisionTree(tx, ty, rx, ry, height):
101104 clf = tree .DecisionTreeClassifier (criterion = "gini" , max_depth = 1 )
102105 clf .fit (tx , ty )
103106 dotdata = StringIO ()
104- tree .export_graphviz (clf , out_file = dotdata )
107+ tree .export_graphviz (clf , out_file = dotdata )
105108 graph = pydot .graph_from_dot_data (dotdata .getvalue ())
106109 graph .write_pdf ("out.pdf" )
107110 return sum ((clf .predict (rx ) - ry )** 2 )/ float (len (ry )) # + cross_validation.cross_val_score(clf, tx, ty).mean()
@@ -120,8 +123,8 @@ def knntester(tx, ty, rx, ry, iterations):
120123 """
121124 er = []
122125 et = []
123- positions = range (1 ,iterations )
124- for n in xrange (1 ,iterations ):
126+ positions = range (1 , iterations )
127+ for n in xrange (1 , iterations ):
125128 neigh = neighbors .KNeighborsClassifier (n_neighbors = n , weights = 'distance' )
126129 neigh = neigh .fit (tx , ty )
127130 er .append (sum ((neigh .predict (rx ) - ry )** 2 )/ float (len (ry )))
@@ -140,7 +143,7 @@ def knntester(tx, ty, rx, ry, iterations):
140143
141144def nn (tx , ty , rx , ry , iterations ):
142145 network = buildNetwork (14 , 5 , 5 , 1 )
143- ds = ClassificationDataSet (14 ,1 , class_labels = ["<50K" , ">=50K" ])
146+ ds = ClassificationDataSet (14 , 1 , class_labels = ["<50K" , ">=50K" ])
144147 for i in xrange (len (tx )):
145148 ds .addSample (tx [i ], [ty [i ]])
146149 trainer = BackpropTrainer (network , ds )
@@ -149,6 +152,7 @@ def nn(tx, ty, rx, ry, iterations):
149152 results = sum ((np .array ([round (network .activate (test )) for test in rx ]) - ry )** 2 )/ float (len (ry ))
150153 return results
151154
155+
152156def loadnn (name ):
153157 network = NetworkReader (name )
154158
@@ -160,7 +164,7 @@ def boosting(tx, ty, rx, ry, n):
160164
161165
def svm(tx, ty, rx, ry):
    """
    Train a linear-kernel support vector machine on the training set
    and return its error rate on the test set.

    tx, ty -- training features / labels
    rx, ry -- test features / labels

    Labels are 0/1, so the mean squared difference between predictions
    and truth equals the misclassification rate.
    """
    classifier = SVC(kernel="linear")
    classifier.fit(tx, ty)
    predictions = classifier.predict(rx)
    return sum((predictions - ry)**2) / float(len(ry))
166170
@@ -177,7 +181,12 @@ def boostTest(tx, ty, rx, ry, iterations):
177181 resultsr .append (sum ((clf .predict (rx ) - ry )** 2 )/ float (len (ry )))
178182 plt .plot (num , resultst , 'ro' , num , resultsr , 'bo' )
179183 plt .axis ([0 , iterations , 0 , 1 ])
184+ plt .title ("Boosted Decision Tree Error" )
185+ plt .ylabel ("Error Rate" )
186+ plt .xlabel ("Number of Estimators" )
187+ plt .savefig ('boostgraph.png' , dpi = 500 )
180188 plt .show ()
189+ return resultsr
181190
182191
183192def nntester (tx , ty , rx , ry , iterations ):
@@ -189,7 +198,7 @@ def nntester(tx, ty, rx, ry, iterations):
189198 resultsr = []
190199 positions = range (iterations )
191200 network = buildNetwork (14 , 14 , 1 , bias = True )
192- ds = ClassificationDataSet (14 ,1 , class_labels = ["<50K" , ">=50K" ])
201+ ds = ClassificationDataSet (14 , 1 , class_labels = ["<50K" , ">=50K" ])
193202 for i in xrange (len (tx )):
194203 ds .addSample (tx [i ], [ty [i ]])
195204 trainer = BackpropTrainer (network , ds , learningrate = 0.01 )
@@ -210,7 +219,7 @@ def nntester(tx, ty, rx, ry, iterations):
210219
211220def cvnntester (tx , ty , rx , ry , iterations , folds ):
212221 network = buildNetwork (14 , 14 , 1 , bias = True )
213- ds = ClassificationDataSet (14 ,1 , class_labels = ["<50K" , ">=50K" ])
222+ ds = ClassificationDataSet (14 , 1 , class_labels = ["<50K" , ">=50K" ])
214223 for i in xrange (len (tx )):
215224 ds .addSample (tx [i ], [ty [i ]])
216225 trainer = BackpropTrainer (network , ds , learningrate = 0.01 )
@@ -219,7 +228,32 @@ def cvnntester(tx, ty, rx, ry, iterations, folds):
219228 print sum ((np .array ([round (network .activate (test )) for test in rx ]) - ry )** 2 )/ float (len (ry ))
220229
221230
231+ def treeTest (tx , ty , rx , ry , iterations ):
232+ resultst = []
233+ resultsr = []
234+ num = range (iterations )
235+ for i in xrange (iterations ):
236+ print i
237+ clf = tree .DecisionTreeClassifier (max_depth = i + 1 , criterion = "entropy" )
238+ clf .fit (tx , ty )
239+ resultst .append (sum ((clf .predict (tx ) - ty )** 2 )/ float (len (ty )))
240+ resultsr .append (sum ((clf .predict (rx ) - ry )** 2 )/ float (len (ry )))
241+ plt .plot (num , resultst , 'ro' , num , resultsr , 'bo' )
242+ plt .axis ([0 , iterations , 0 , 1 ])
243+ plt .title ("Decision Tree error" )
244+ plt .ylabel ("Error Rate" )
245+ plt .xlabel ("Maximum Tree Depth" )
246+ plt .savefig ('entropytree.png' , dpi = 500 )
247+ plt .show ()
248+ return resultsr
249+
250+
def treeConfusion(tx, ty, rx, ry):
    """
    Fit a depth-7 gini decision tree on the training set and return
    the confusion matrix of its predictions on the test set.

    tx, ty -- training features / labels
    rx, ry -- test features / labels
    """
    model = tree.DecisionTreeClassifier(max_depth=7, criterion="gini")
    model.fit(tx, ty)
    predicted = model.predict(rx)
    return confusion_matrix(ry, predicted)
223257
224258
225259if __name__ == "__main__" :
@@ -233,14 +267,18 @@ def cvnntester(tx, ty, rx, ry, iterations, folds):
233267 # print "Boosting (100): " + str(boosting(tx, ty, rx, ry, 100))
234268 # print "Boosting (500): " + str(boosting(tx, ty, rx, ry, 500))
235269 # print "SVM: " + str(svm(tx, ty, rx, ry))
236- # boostTest(tx, ty, rx, ry, 25)
270+ # pprint(boostTest(tx, ty, rx, ry, 500))
271+ # pprint(treeTest(tx, ty, rx, ry, 25))
237272 # nntester(tx, ty, rx, ry, 500)
238273 # knntester(tx, ty, rx, ry, 100)
239- cvnntester (tx , ty , rx , ry , 500 , 10 )
240-
274+ # cvnntester(tx, ty, rx, ry, 500, 10)
275+ # print treeConfusion(tx, ty, rx, ry)
241276
242277"""
243- decision stump result: .248922% error, as a baseline
244- pruned decision tree result: .217984% error
245- unpruned adaboost decision stump (10): .15926% error
278+ decision stump result: .248922 error, as a baseline
279+ pruned decision tree result: .217984 error
280+ unpruned adaboost decision stump (10): .15926 error
281+ cross validated 10 fold nn: .241235 error
282+ SVM rbf Kernel: .24567 error
283+ adaboost (500): .134 error
246284"""
0 commit comments