Commit 35f9058

ensemble knn + gbr

1 parent 80bda65

File tree

1 file changed: 236 additions, 0 deletions

models/ensemble.py

@@ -0,0 +1,236 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: tabstop=2 expandtab shiftwidth=2 softtabstop=2

from __future__ import division
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sys
#from pybrain.datasets.supervised import SupervisedDataSet
#from pybrain.tools.shortcuts import buildNetwork
#from pybrain.supervised.trainers import BackpropTrainer
from scipy.stats.stats import pearsonr
from sklearn.base import BaseEstimator, TransformerMixin
#from nolearn.dbn import DBN
from sklearn.cross_validation import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.externals import joblib
from sklearn.feature_selection import *
from sklearn.linear_model import *
from sklearn.neighbors import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn import metrics


def distance_on_unit_sphere(lat1, long1, lat2, long2):

  # source: http://www.johndcook.com/blog/python_longitude_latitude/
  # Convert latitude and longitude to
  # spherical coordinates in radians.
  degrees_to_radians = math.pi/180.0

  # phi = 90 - latitude
  phi1 = (90.0 - lat1)*degrees_to_radians
  phi2 = (90.0 - lat2)*degrees_to_radians

  # theta = longitude
  theta1 = long1*degrees_to_radians
  theta2 = long2*degrees_to_radians

  # Compute spherical distance from spherical coordinates.

  # For two locations in spherical coordinates
  # (1, theta, phi) and (1, theta', phi'),
  # cosine( arc length ) =
  #   sin phi sin phi' cos(theta - theta') + cos phi cos phi'
  # distance = rho * arc length

  cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) +
         math.cos(phi1)*math.cos(phi2))
  # guard against floating-point values just above 1 before acos
  if cos > 1:
    cos = 1
  arc = math.acos(cos)

  # Scale the arc by the Earth's radius in km (6373)
  # to get the great-circle distance in kilometres.
  return arc*6373
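
# Sanity check (approximate): distance_on_unit_sphere(45.5017, -73.5673,
# 45.4215, -75.6972), Montreal to Ottawa, should come out near 166 km,
# since the arc is scaled by an Earth radius of 6373 km.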


def weight_vector(lat1, long1, points, sigma):
  weights = []
  dist = []
  for point in points:
    distance = distance_on_unit_sphere(lat1, long1, point[0], point[1])
    dist.append(distance)
    weight = gaussian_weight(distance, sigma)
    weights.append(weight)
    #print "House at distance ", str(distance), " has weight ", str(weight)
  return weights, dist

def gaussian_weight(distance, sigma):
  return math.exp(-distance**2/(2*sigma**2))
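
# With this kernel a comparable at distance sigma gets weight
# exp(-0.5) ~ 0.61, while one at 3*sigma gets only exp(-4.5) ~ 0.011,
# so far-away neighbours contribute almost nothing to the weighted price.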

np.random.seed(42)


data = pd.read_csv('data/modified_listings.csv')
extra_data = pd.read_csv('data/post_extra_data.csv')
extra_data = extra_data[['MlsNumber','LivingArea']]
merged = pd.merge(data, extra_data, on='MlsNumber', suffixes=['_left', '_right'])
merged.to_csv('data/all_data.csv')
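# The merge above attaches LivingArea from the extra data file to each
# listing, keyed on MlsNumber; the _left/_right suffixes only disambiguate
# any other column names the two files happen to share.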
for col in ['AP']:#, 'COP', 'AP', 'LS', 'MA']:#, 'PPR', '2X', '3X', '4X', '5X', 'AU', 'UNI', 'MEM']:
  print "*" * 80
  print col

  #X = merged[['LivingArea','WalkScore', 'NbPieces', 'NbChambres', 'NbSallesEaux', 'NbSallesBains', 'NbFoyerPoele', 'NbEquipements', 'NbGarages', 'NbStationnements', 'NbPiscines', 'NbBordEaux']]
  geo_points = merged[['Lat','Lng']]
  X = merged.drop(['MlsNumber', 'Lat', 'Lng', 'BuyPrice'], axis=1, inplace=False)
  Y = merged[['BuyPrice']]
  #X=X[['LivingArea','NbChambres','NbSallesBains']]
  X = X[merged[col]==1]
  Y = Y[merged[col]==1]
  geo_points = geo_points[merged[col]==1]
  print 'X.shape: ', X.shape
  print 'Y.shape: ', Y.shape
  print 'geo_points.shape', geo_points.shape

  # filter rows with NaN
  mask = ~np.isnan(X).any(axis=1)
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]
  print 'After NaN filter: ', X.shape

  # remove high-end listings
  mask = Y['BuyPrice'] < 1000000
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]
  print 'After upper-bound filter: ', X.shape

  # remove low-end listings
  mask = Y['BuyPrice'] > 10**5
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]
  print 'After lower-bound filter: ', X.shape

  print "mean: ", Y['BuyPrice'].mean()
  print "median: ", Y['BuyPrice'].median()
  print "std: ", Y['BuyPrice'].std()

  # remove outliers
  mask = np.abs(Y['BuyPrice']-Y['BuyPrice'].median()) <= (3*Y['BuyPrice'].std())
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]

  columns = X.columns.values
  XX = X[['LivingArea','NbChambres','NbSallesBains']]
  X = np.array(X)
  XX = np.array(XX)
  Y = np.array(Y)
  Y = Y.reshape(Y.shape[0])
  geo_points = np.array(geo_points)
  skf = KFold(n=X.shape[0], n_folds=10, shuffle=True, random_state=42)
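  # XX keeps only the three structural features used to find k-NN
  # comparables; the full feature matrix X feeds the gradient-boosting
  # fallback. Note that skf is built here but never referenced below: the
  # evaluation in the main loop is effectively leave-one-out.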

  var_num = 1
  num_neigh = range(2, 75)
  #var_range=np.array(range(16,80))/20.0
  var_range = np.array([0.4])
  for neigh in [100]:
    j = -1
    for var_num in np.nditer(var_range):
      j = j + 1
      print neigh
      print var_num
      L_rmse = []
      L_corr = []
      L_diff = []
      k_clf = KNeighborsRegressor(n_neighbors=neigh)
      vari = var_num
      #clf=KNeighborsRegressor(n_neighbors=8)
      scaler = StandardScaler()

      clf = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', k_clf),
      ])
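
      # A single configuration is evaluated in this commit: 100 candidate
      # comparables and a 0.4 km Gaussian bandwidth; num_neigh and the
      # commented-out var_range are presumably the wider grids tried in
      # earlier runs.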
      num = 1229
      X = scaler.fit_transform(X, Y)
      XX = scaler.fit_transform(XX, Y)
      #X=pca.fit_transform(X)
      #clf.fit(X,Y)
      k_clf.fit(XX, Y)
      y_pred_all = k_clf.predict(XX).astype(float)
      dist, indic = k_clf.kneighbors(XX[num])
      #print type(indic)
      indic = indic.tolist()[0]

      # weight_vector returns (weights, distances): unpack both so weights
      # is a flat list rather than the whole tuple
      weights, dist = weight_vector(geo_points[num][0], geo_points[num][1], geo_points[indic].tolist(), 5)
      #print Y[indic].mean()
      weights_array = np.array(weights)
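
      # Per-listing prediction: find each listing's nearest comparables in
      # (LivingArea, NbChambres, NbSallesBains) space; if at least one lies
      # within 0.1 km geographically, predict the Gaussian-weighted mean of
      # the comparables' prices, otherwise fall back to a gradient-boosting
      # model trained on all other listings. This is the "ensemble knn + gbr"
      # of the commit message.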

      y_pred = []
      norm_dist = []
      nbad = 0
      for i in range(0, len(Y)):
        #print 'I am in the loop'
        dist, indic = k_clf.kneighbors(XX[i])

        indic = indic.tolist()[0]
        # skip the first neighbour, which is typically the listing itself
        weights, distance = weight_vector(geo_points[i][0], geo_points[i][1], geo_points[indic[1:]].tolist(), vari)
        n_nearby = len(filter(lambda x: x < 0.1, distance))
        if n_nearby > 0:
          weights_array = np.array(weights)
          dist_array = np.array(distance)
          y_pred.append(np.dot(weights_array, Y[indic[1:]])/weights_array.sum())
          #y_pred.append(Y[indic[1:]].mean())
          norm_dist.append(np.dot(dist_array, weights_array)/weights_array.sum())
          #norm_dist.append(dist_array.mean())
        else:
          clf = GradientBoostingRegressor()
          mask = np.in1d(np.arange(X.shape[0]), [i])
          clf.fit(X[~mask], Y[~mask])
          p = clf.predict(X[mask])
          # predict returns a length-1 array; append the scalar so y_pred
          # stays a flat numeric list
          y_pred.append(p[0])
          nbad += 1

      print 'got it all'
      print '*' * 80
      print nbad
      y_pred = np.array(y_pred)
      rmse = math.sqrt(metrics.mean_squared_error(y_pred, Y))
      corr = pearsonr(y_pred, Y)
      diff = np.array([abs(p-a)/a for (p, a) in zip(y_pred, Y)]).mean()
      print "RMSE: ", rmse
      print "corr: ", corr
      print "%diff: ", diff

"""
AP
X.shape: (4425, 133)
Y.shape: (4425, 1)
geo_points.shape (4425, 2)
After NaN filter: (3092, 133)
After upper-bound filter: (3057, 133)
After lower-bound filter: (3054, 133)
mean: 326815.843157
median: 289000.0
std: 146918.0813
100
0.4
got it all
********************************************************************************
1331
RMSE: 50190.4479516
corr: (0.90608458893495725, 0.0)
%diff: 0.0998519388801
"""
