#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: tabstop=2 expandtab shiftwidth=2 softtabstop=2

from __future__ import division
import math
import numpy as np
import pandas as pd
#from pybrain.datasets.supervised import SupervisedDataSet
#from pybrain.tools.shortcuts import buildNetwork
#from pybrain.supervised.trainers import BackpropTrainer
#from nolearn.dbn import DBN
from scipy.stats import pearsonr
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def distance_on_unit_sphere(lat1, long1, lat2, long2):
  """Great-circle distance in km between two (lat, long) points in degrees.

  Adapted from http://www.johndcook.com/blog/python_longitude_latitude/
  """
  # Convert latitude and longitude to spherical coordinates in radians.
  degrees_to_radians = math.pi/180.0

  # phi = 90 - latitude
  phi1 = (90.0 - lat1)*degrees_to_radians
  phi2 = (90.0 - lat2)*degrees_to_radians

  # theta = longitude
  theta1 = long1*degrees_to_radians
  theta2 = long2*degrees_to_radians

  # For two locations in spherical coordinates (1, theta, phi) and
  # (1, theta', phi'):
  #   cosine(arc length) = sin phi sin phi' cos(theta - theta') + cos phi cos phi'
  #   distance = rho * arc length
  cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) +
         math.cos(phi1)*math.cos(phi2))
  # Clamp to [-1, 1] so floating-point error cannot push acos out of its domain.
  cos = min(1.0, max(-1.0, cos))
  arc = math.acos(cos)

  # Multiply the arc on the unit sphere by the Earth's radius in km.
  return arc*6373
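
# A quick sanity check with illustrative coordinates (not from the dataset):
# Montreal to Quebec City is roughly 233 km.
#   distance_on_unit_sphere(45.5017, -73.5673, 46.8139, -71.2080)  # ~233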


def weight_vector(lat1, long1, points, sigma):
  """Gaussian weights and distances (km) from (lat1, long1) to each point."""
  weights = []
  dist = []
  for point in points:
    distance = distance_on_unit_sphere(lat1, long1, point[0], point[1])
    dist.append(distance)
    weight = gaussian_weight(distance, sigma)
    weights.append(weight)
    #print "House at distance ", str(distance), " has weight ", str(weight)
  return weights, dist

def gaussian_weight(distance, sigma):
  """Gaussian kernel weight for a distance, with bandwidth sigma (both in km)."""
  return math.exp(-distance**2/(2*sigma**2))
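
# For intuition: with sigma = 5 km, a comparable 5 km away gets weight
# exp(-0.5) ~ 0.61 while one 10 km away gets exp(-2) ~ 0.14, so nearby
# sales dominate any weighted average (illustrative values, not from a run):
#   gaussian_weight(5, 5)   # ~0.6065
#   gaussian_weight(10, 5)  # ~0.1353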

np.random.seed(42)

# Join the listings with the extra per-listing living-area data.
data = pd.read_csv('data/modified_listings.csv')
extra_data = pd.read_csv('data/post_extra_data.csv')
extra_data = extra_data[['MlsNumber', 'LivingArea']]
merged = pd.merge(data, extra_data, on='MlsNumber', suffixes=['_left', '_right'])
merged.to_csv('data/all_data.csv')

for col in ['AP']:#, 'COP', 'AP', 'LS', 'MA']:#, 'PPR', '2X', '3X', '4X', '5X', 'AU', 'UNI', 'MEM']:
  print "*" * 80
  print col

  #X = merged[['LivingArea','WalkScore', 'NbPieces', 'NbChambres', 'NbSallesEaux', 'NbSallesBains', 'NbFoyerPoele', 'NbEquipements', 'NbGarages', 'NbStationnements', 'NbPiscines', 'NbBordEaux']]
  geo_points = merged[['Lat', 'Lng']]
  X = merged.drop(['MlsNumber', 'Lat', 'Lng', 'BuyPrice'], axis=1, inplace=False)
  Y = merged[['BuyPrice']]
  #X=X[['LivingArea','NbChambres','NbSallesBains']]
  # Keep only listings whose property-type indicator column is set.
  X = X[merged[col] == 1]
  Y = Y[merged[col] == 1]
  geo_points = geo_points[merged[col] == 1]
  print 'X.shape: ', X.shape
  print 'Y.shape: ', Y.shape
  print 'geo_points.shape', geo_points.shape

  # Filter out rows with missing values.
  mask = ~np.isnan(X).any(axis=1)
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]
  print 'After NaN filter: ', X.shape

  # Remove high-end listings.
  mask = Y['BuyPrice'] < 1000000
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]
  print 'After upper-bound filter: ', X.shape

  # Remove low-end listings.
  mask = Y['BuyPrice'] > 10**5
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]
  print 'After lower-bound filter: ', X.shape

  print "mean: ", Y['BuyPrice'].mean()
  print "median: ", Y['BuyPrice'].median()
  print "std: ", Y['BuyPrice'].std()

  # Remove outliers: keep prices within 3 standard deviations of the median.
  mask = np.abs(Y['BuyPrice'] - Y['BuyPrice'].median()) <= (3*Y['BuyPrice'].std())
  X = X[mask]
  Y = Y[mask]
  geo_points = geo_points[mask]
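  # With the values printed for 'AP' above (median ~289k, std ~147k) this
  # keeps prices within roughly 289k +/- 441k; combined with the earlier
  # bounds, prices end up in about the [100k, 730k] range.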

  columns = X.columns.values
  # XX: the three features used to find comparable listings.
  XX = X[['LivingArea', 'NbChambres', 'NbSallesBains']]
  X = np.array(X)
  XX = np.array(XX)
  Y = np.array(Y)
  Y = Y.reshape(Y.shape[0])
  geo_points = np.array(geo_points)
  skf = KFold(n=X.shape[0], n_folds=10, shuffle=True, random_state=42)  # (unused below)

  var_num = 1  # overwritten by the loop below
  num_neigh = range(2, 75)  # candidate neighbour counts (unused below)
  #var_range=np.array(range(16,80))/20.0
  var_range = np.array([0.4])  # Gaussian bandwidth sigma, in km
  for neigh in [100]:
    j = -1
    for var_num in np.nditer(var_range):
      j = j + 1
      print neigh
      print var_num
      L_rmse = []
      L_corr = []
      L_diff = []
      k_clf = KNeighborsRegressor(n_neighbors=neigh)
      vari = var_num
      #clf=KNeighborsRegressor(n_neighbors=8)
      scaler = StandardScaler()

      # Note: this pipeline is built but never fitted; k_clf is used
      # directly below so that kneighbors() can be queried.
      clf = Pipeline([
          ('scaler', StandardScaler()),
          ('clf', k_clf),
      ])
      X = scaler.fit_transform(X, Y)
      XX = scaler.fit_transform(XX, Y)
      #X=pca.fit_transform(X)
      #clf.fit(X,Y)
      k_clf.fit(XX, Y)
      y_pred_all = k_clf.predict(XX).astype(float)

      # Spot-check the neighbours and geographic weights of a single listing.
      num = 1229
      dist, indic = k_clf.kneighbors(XX[num].reshape(1, -1))
      #print type(indic)
      indic = indic.tolist()[0]
      weights, dist = weight_vector(geo_points[num][0], geo_points[num][1],
                                    geo_points[indic].tolist(), 5)
      #print Y[indic].mean()
      weights_array = np.array(weights)

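      # The geo-weighted estimate below is a kernel-weighted (Nadaraya-Watson
      # style) average over the k nearest comparables:
      #   price_hat(i) = sum_j w_ij * price_j / sum_j w_ij,
      #   w_ij = exp(-d_ij**2 / (2 * sigma**2))
      # Illustrative numbers (not from the data): with sigma = 0.4 km, a
      # comparable 0.2 km away gets weight exp(-0.125) ~ 0.88 while one 2 km
      # away gets exp(-12.5) ~ 4e-6, so the closest sales dominate.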
      y_pred = []
      norm_dist = []
      nbad = 0
      for i in range(0, len(Y)):
        dist, indic = k_clf.kneighbors(XX[i].reshape(1, -1))
        indic = indic.tolist()[0]
        # indic[0] is listing i itself (its own nearest neighbour), so it is
        # skipped below.
        weights, distance = weight_vector(geo_points[i][0], geo_points[i][1],
                                          geo_points[indic[1:]].tolist(), vari)
        # Count comparables within 100 m (distances are in km).
        n_nearby = len(filter(lambda x: x < 0.1, distance))
        if n_nearby > 0:
          weights_array = np.array(weights)
          dist_array = np.array(distance)
          y_pred.append(np.dot(weights_array, Y[indic[1:]])/weights_array.sum())
          #y_pred.append(Y[indic[1:]].mean())
          norm_dist.append(np.dot(dist_array, weights_array)/weights_array.sum())
          #norm_dist.append(dist_array.mean())
        else:
          # No comparable nearby: fall back to a gradient-boosted model
          # trained on every other listing (leave-one-out).
          clf = GradientBoostingRegressor()
          mask = np.in1d(np.arange(X.shape[0]), [i])
          clf.fit(X[~mask], Y[~mask])
          p = clf.predict(X[mask])
          y_pred.append(p[0])  # p is a length-1 array; keep y_pred flat
          nbad += 1

      print 'got it all'
      print '*' * 80
      print nbad
      y_pred = np.array(y_pred)
      rmse = math.sqrt(metrics.mean_squared_error(y_pred, Y))
      corr = pearsonr(y_pred, Y)
      # Mean absolute relative error, as a fraction of the actual price.
      diff = np.array([abs(p - a)/a for (p, a) in zip(y_pred, Y)]).mean()
      print "RMSE: ", rmse
      print "corr: ", corr
      print "%diff: ", diff

# Output from a recorded run:
"""
AP
X.shape: (4425, 133)
Y.shape: (4425, 1)
geo_points.shape (4425, 2)
After NaN filter: (3092, 133)
After upper-bound filter: (3057, 133)
After lower-bound filter: (3054, 133)
mean: 326815.843157
median: 289000.0
std: 146918.0813
100
0.4
got it all
********************************************************************************
1331
RMSE: 50190.4479516
corr: (0.90608458893495725, 0.0)
%diff: 0.0998519388801
"""