diff --git a/Homeworks/Homework-I/Paczoski_Maciej/Paczoski Maciej HW-I.html b/Homeworks/Homework-I/Paczoski_Maciej/Paczoski Maciej HW-I.html new file mode 100644 index 0000000..794af7c --- /dev/null +++ b/Homeworks/Homework-I/Paczoski_Maciej/Paczoski Maciej HW-I.html @@ -0,0 +1,15357 @@ + + +
+ + +import pandas as pd
+import numpy as np
+import dalex
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.ensemble import RandomForestRegressor
+from sklearn import metrics
+
df = pd.read_csv("housing.csv")
+df.describe()
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +median_house_value | +
---|---|---|---|---|---|---|---|---|---|
count | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20433.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +
mean | +-119.569704 | +35.631861 | +28.639486 | +2635.763081 | +537.870553 | +1425.476744 | +499.539680 | +3.870671 | +206855.816909 | +
std | +2.003532 | +2.135952 | +12.585558 | +2181.615252 | +421.385070 | +1132.462122 | +382.329753 | +1.899822 | +115395.615874 | +
min | +-124.350000 | +32.540000 | +1.000000 | +2.000000 | +1.000000 | +3.000000 | +1.000000 | +0.499900 | +14999.000000 | +
25% | +-121.800000 | +33.930000 | +18.000000 | +1447.750000 | +296.000000 | +787.000000 | +280.000000 | +2.563400 | +119600.000000 | +
50% | +-118.490000 | +34.260000 | +29.000000 | +2127.000000 | +435.000000 | +1166.000000 | +409.000000 | +3.534800 | +179700.000000 | +
75% | +-118.010000 | +37.710000 | +37.000000 | +3148.000000 | +647.000000 | +1725.000000 | +605.000000 | +4.743250 | +264725.000000 | +
max | +-114.310000 | +41.950000 | +52.000000 | +39320.000000 | +6445.000000 | +35682.000000 | +6082.000000 | +15.000100 | +500001.000000 | +
df = df.dropna()
+
df["ocean_proximity"].unique()
+
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'], + dtype=object)+
le = LabelEncoder()
+df["ocean_proximity"] = le.fit_transform(df["ocean_proximity"])
+
X = df.drop("median_house_value", axis=1)
+y = df["median_house_value"]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
regr = RandomForestRegressor(n_estimators=5, random_state=0)
+regr.fit(X_train, y_train)
+y_pred = regr.predict(X_test)
+MSE = metrics.mean_squared_error(y_test, y_pred)
+
exp = dalex.Explainer(regr, X_test, y_test)
+
Preparation of a new explainer is initiated + + -> data : 4087 rows 9 cols + -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. + -> target variable : 4087 values + -> model_class : sklearn.ensemble._forest.RandomForestRegressor (default) + -> label : Not specified, model's class short name will be used. (default) + -> predict function : <function yhat_default at 0x0000028752051FC0> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 4.41e+04, mean = 2.08e+05, max = 5e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -3.16e+05, mean = -6.51e+02, max = 3.19e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
d:\coding\daily\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names + warnings.warn( ++
observation = X_test.iloc[[100, 200]]
+observation_pred = regr.predict(observation)
+order = X_test.columns.to_list()
+
exp.predict_parts(observation.iloc[[0]], type="break_down", order=order).plot()
+exp.predict_parts(observation.iloc[[0]], type="shap").plot()
+
The break-down plot shows that longitude has the largest positive contribution to the prediction, while latitude has the largest negative contribution. We may, however, expect that these two variables interact and together come down to location, not just a single coordinate. The impact of the total_rooms variable changes from positive to negative between the plots, which suggests that it may be interacting with another variable.
+ +exp.predict_parts(observation.iloc[[1]], type="break_down", order=order).plot()
+exp.predict_parts(observation.iloc[[1]], type="shap").plot()
+
Most variables influence the prediction in the opposite direction compared to the first observation. The contributions of the latitude and longitude variables again suggest an interaction.
+ +\n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
---|---|---|---|---|---|---|---|---|---|
count | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20433.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "
mean | \n", + "-119.569704 | \n", + "35.631861 | \n", + "28.639486 | \n", + "2635.763081 | \n", + "537.870553 | \n", + "1425.476744 | \n", + "499.539680 | \n", + "3.870671 | \n", + "206855.816909 | \n", + "
std | \n", + "2.003532 | \n", + "2.135952 | \n", + "12.585558 | \n", + "2181.615252 | \n", + "421.385070 | \n", + "1132.462122 | \n", + "382.329753 | \n", + "1.899822 | \n", + "115395.615874 | \n", + "
min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "1.000000 | \n", + "3.000000 | \n", + "1.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
25% | \n", + "-121.800000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1447.750000 | \n", + "296.000000 | \n", + "787.000000 | \n", + "280.000000 | \n", + "2.563400 | \n", + "119600.000000 | \n", + "
50% | \n", + "-118.490000 | \n", + "34.260000 | \n", + "29.000000 | \n", + "2127.000000 | \n", + "435.000000 | \n", + "1166.000000 | \n", + "409.000000 | \n", + "3.534800 | \n", + "179700.000000 | \n", + "
75% | \n", + "-118.010000 | \n", + "37.710000 | \n", + "37.000000 | \n", + "3148.000000 | \n", + "647.000000 | \n", + "1725.000000 | \n", + "605.000000 | \n", + "4.743250 | \n", + "264725.000000 | \n", + "
max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "500001.000000 | \n", + "
import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.ensemble import RandomForestRegressor
+from sklearn import metrics
+from lime import lime_tabular
+import warnings
+warnings.filterwarnings("ignore")
+import random
+
df = pd.read_csv("housing.csv")
+df.describe()
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +median_house_value | +
---|---|---|---|---|---|---|---|---|---|
count | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20433.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +
mean | +-119.569704 | +35.631861 | +28.639486 | +2635.763081 | +537.870553 | +1425.476744 | +499.539680 | +3.870671 | +206855.816909 | +
std | +2.003532 | +2.135952 | +12.585558 | +2181.615252 | +421.385070 | +1132.462122 | +382.329753 | +1.899822 | +115395.615874 | +
min | +-124.350000 | +32.540000 | +1.000000 | +2.000000 | +1.000000 | +3.000000 | +1.000000 | +0.499900 | +14999.000000 | +
25% | +-121.800000 | +33.930000 | +18.000000 | +1447.750000 | +296.000000 | +787.000000 | +280.000000 | +2.563400 | +119600.000000 | +
50% | +-118.490000 | +34.260000 | +29.000000 | +2127.000000 | +435.000000 | +1166.000000 | +409.000000 | +3.534800 | +179700.000000 | +
75% | +-118.010000 | +37.710000 | +37.000000 | +3148.000000 | +647.000000 | +1725.000000 | +605.000000 | +4.743250 | +264725.000000 | +
max | +-114.310000 | +41.950000 | +52.000000 | +39320.000000 | +6445.000000 | +35682.000000 | +6082.000000 | +15.000100 | +500001.000000 | +
df = df.dropna()
+
df["ocean_proximity"].unique()
+
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'], + dtype=object)+
le = LabelEncoder()
+df["ocean_proximity"] = le.fit_transform(df["ocean_proximity"])
+
X = df.drop("median_house_value", axis=1)
+y = df["median_house_value"]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
regr = RandomForestRegressor(n_estimators=32, random_state=0)
+regr.fit(X_train, y_train)
+y_pred = regr.predict(X_test)
+metrics.r2_score(y_test, y_pred)
+
0.8110934679631914+
regr.predict(X_test.iloc[[32]])
+
array([204815.625])+
explainer = lime_tabular.LimeTabularExplainer(
+ X_train.values,
+ feature_names=X_train.columns.values.tolist(),
+ mode="regression",
+)
+
random.seed(16)
+explainer.explain_instance(X_test.values[32], regr.predict).show_in_notebook()
+
random.seed(16)
+explainer.explain_instance(X_test.values[64], regr.predict).show_in_notebook()
+random.seed(16)
+explainer.explain_instance(X_test.values[128], regr.predict).show_in_notebook()
+random.seed(16)
+explainer.explain_instance(X_test.values[256], regr.predict).show_in_notebook()
+random.seed(16)
+explainer.explain_instance(X_test.values[512], regr.predict).show_in_notebook()
+
The results of the LIME decomposition confirm that pricing mainly depends on location, described by 3 variables - longitude, latitude and ocean_proximity. The variable median_income is listed as most influential twice, but often its impact is far smaller. So even though we got the LIME explainer to work, judging by the median_income variable, it may not be sufficiently stable.
+ +\n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
---|---|---|---|---|---|---|---|---|---|
count | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20433.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "
mean | \n", + "-119.569704 | \n", + "35.631861 | \n", + "28.639486 | \n", + "2635.763081 | \n", + "537.870553 | \n", + "1425.476744 | \n", + "499.539680 | \n", + "3.870671 | \n", + "206855.816909 | \n", + "
std | \n", + "2.003532 | \n", + "2.135952 | \n", + "12.585558 | \n", + "2181.615252 | \n", + "421.385070 | \n", + "1132.462122 | \n", + "382.329753 | \n", + "1.899822 | \n", + "115395.615874 | \n", + "
min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "1.000000 | \n", + "3.000000 | \n", + "1.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
25% | \n", + "-121.800000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1447.750000 | \n", + "296.000000 | \n", + "787.000000 | \n", + "280.000000 | \n", + "2.563400 | \n", + "119600.000000 | \n", + "
50% | \n", + "-118.490000 | \n", + "34.260000 | \n", + "29.000000 | \n", + "2127.000000 | \n", + "435.000000 | \n", + "1166.000000 | \n", + "409.000000 | \n", + "3.534800 | \n", + "179700.000000 | \n", + "
75% | \n", + "-118.010000 | \n", + "37.710000 | \n", + "37.000000 | \n", + "3148.000000 | \n", + "647.000000 | \n", + "1725.000000 | \n", + "605.000000 | \n", + "4.743250 | \n", + "264725.000000 | \n", + "
max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "500001.000000 | \n", + "
import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neural_network import MLPRegressor
+import dalex as dx
+from sklearn import metrics
+import warnings
+from IPython.display import Image
+warnings.filterwarnings("ignore")
+import random
+
df = pd.read_csv("housing.csv")
+df.describe()
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +median_house_value | +
---|---|---|---|---|---|---|---|---|---|
count | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20433.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +
mean | +-119.569704 | +35.631861 | +28.639486 | +2635.763081 | +537.870553 | +1425.476744 | +499.539680 | +3.870671 | +206855.816909 | +
std | +2.003532 | +2.135952 | +12.585558 | +2181.615252 | +421.385070 | +1132.462122 | +382.329753 | +1.899822 | +115395.615874 | +
min | +-124.350000 | +32.540000 | +1.000000 | +2.000000 | +1.000000 | +3.000000 | +1.000000 | +0.499900 | +14999.000000 | +
25% | +-121.800000 | +33.930000 | +18.000000 | +1447.750000 | +296.000000 | +787.000000 | +280.000000 | +2.563400 | +119600.000000 | +
50% | +-118.490000 | +34.260000 | +29.000000 | +2127.000000 | +435.000000 | +1166.000000 | +409.000000 | +3.534800 | +179700.000000 | +
75% | +-118.010000 | +37.710000 | +37.000000 | +3148.000000 | +647.000000 | +1725.000000 | +605.000000 | +4.743250 | +264725.000000 | +
max | +-114.310000 | +41.950000 | +52.000000 | +39320.000000 | +6445.000000 | +35682.000000 | +6082.000000 | +15.000100 | +500001.000000 | +
df = df.dropna()
+
df["ocean_proximity"].unique()
+
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'], + dtype=object)+
le = LabelEncoder()
+df["ocean_proximity"] = le.fit_transform(df["ocean_proximity"])
+
X = df.drop("median_house_value", axis=1)
+y = df["median_house_value"]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
modelGBR = GradientBoostingRegressor(random_state=0)
+modelGBR.fit(X_train, y_train)
+y_pred = modelGBR.predict(X_test)
+print("GradientBoostingRegressor: ", metrics.r2_score(y_test, y_pred))
+
GradientBoostingRegressor: 0.7705056681773399 ++
modelRFR = RandomForestRegressor(random_state=0)
+modelRFR.fit(X_train, y_train)
+y_pred = modelRFR.predict(X_test)
+print("RandomForestRegressor: ", metrics.r2_score(y_test, y_pred))
+
RandomForestRegressor: 0.8142447370110542 ++
modelMLPR = MLPRegressor(learning_rate_init=0.01, random_state=0)
+modelMLPR.fit(X_train, y_train)
+y_pred = modelMLPR.predict(X_test)
+print("MLPRegressor: ", metrics.r2_score(y_test, y_pred))
+
MLPRegressor: 0.6507014083261967 ++
RandomForestRegressor
has the best score, so I will use it for model prediction decomposition using Ceteris Paribus profiles.
print("Observation 123")
+print("y_pred: ",modelRFR.predict(X_test.iloc[[123]])[0])
+print("y_true: ",y_test.iloc[123])
+
Observation 123 +y_pred: 233134.0 +y_true: 200000.0 ++
variables=['longitude', 'latitude', 'median_income', 'ocean_proximity']
+expRFR = dx.Explainer(modelRFR, X_train, y_train, label = "RandomForest")
+obs_expRFR = expRFR.predict_profile(X_test.iloc[[123]])
+
Preparation of a new explainer is initiated + + -> data : 16346 rows 9 cols + -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. + -> target variable : 16346 values + -> model_class : sklearn.ensemble._forest.RandomForestRegressor (default) + -> label : RandomForest + -> predict function : <function yhat_default at 0x0000014BAC789480> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 4.25e+04, mean = 2.07e+05, max = 5e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -1.07e+05, mean = -1.49e+02, max = 1.48e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
Calculating ceteris paribus: 100%|██████████████████████████████████████████████████████| 9/9 [00:00<00:00, 100.73it/s] ++
# obs_expRFR.plot()
+
Image("plot_1.png")
+
RandomForest
model prediction mainly depends on the variables median_income
, ocean_proximity
and coordinates - longitude
, latitude
. Other factors have an impact only at the extremes of their ranges.
expGBR = dx.Explainer(modelGBR, X_train, y_train, label = "GradientBoosting")
+obs_expGBR=expGBR.predict_profile(X_test.iloc[[123]])
+
+expMLPR = dx.Explainer(modelMLPR, X_train, y_train, label = "MLP")
+obs_expMLPR=expMLPR.predict_profile(X_test.iloc[[123]])
+
Preparation of a new explainer is initiated + + -> data : 16346 rows 9 cols + -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. + -> target variable : 16346 values + -> model_class : sklearn.ensemble._gb.GradientBoostingRegressor (default) + -> label : GradientBoosting + -> predict function : <function yhat_default at 0x0000014BAC789480> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 4.1e+04, mean = 2.07e+05, max = 5.58e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -3e+05, mean = 8.89e-12, max = 3.83e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
Calculating ceteris paribus: 100%|██████████████████████████████████████████████████████| 9/9 [00:00<00:00, 322.17it/s] ++
Preparation of a new explainer is initiated + + -> data : 16346 rows 9 cols + -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. + -> target variable : 16346 values + -> model_class : sklearn.neural_network._multilayer_perceptron.MLPRegressor (default) + -> label : MLP + -> predict function : <function yhat_default at 0x0000014BAC789480> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 2.28e+04, mean = 2.07e+05, max = 7.89e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -4.84e+05, mean = -5.58e+02, max = 4.42e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
Calculating ceteris paribus: 100%|██████████████████████████████████████████████████████| 9/9 [00:00<00:00, 290.06it/s] ++
# obs_expRFR.plot((obs_expGBR, obs_expMLPR))
+
Image("plot_2.png")
+
Comparing the Ceteris Paribus profiles for different models, a few conclusions can be made. RandomForest
and GradientBoosting
scored similarly and their plots for all variables are similar. However, MLP
model tends to overestimate variables other models see as less important.
\n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
---|---|---|---|---|---|---|---|---|---|
count | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20433.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "
mean | \n", + "-119.569704 | \n", + "35.631861 | \n", + "28.639486 | \n", + "2635.763081 | \n", + "537.870553 | \n", + "1425.476744 | \n", + "499.539680 | \n", + "3.870671 | \n", + "206855.816909 | \n", + "
std | \n", + "2.003532 | \n", + "2.135952 | \n", + "12.585558 | \n", + "2181.615252 | \n", + "421.385070 | \n", + "1132.462122 | \n", + "382.329753 | \n", + "1.899822 | \n", + "115395.615874 | \n", + "
min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "1.000000 | \n", + "3.000000 | \n", + "1.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
25% | \n", + "-121.800000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1447.750000 | \n", + "296.000000 | \n", + "787.000000 | \n", + "280.000000 | \n", + "2.563400 | \n", + "119600.000000 | \n", + "
50% | \n", + "-118.490000 | \n", + "34.260000 | \n", + "29.000000 | \n", + "2127.000000 | \n", + "435.000000 | \n", + "1166.000000 | \n", + "409.000000 | \n", + "3.534800 | \n", + "179700.000000 | \n", + "
75% | \n", + "-118.010000 | \n", + "37.710000 | \n", + "37.000000 | \n", + "3148.000000 | \n", + "647.000000 | \n", + "1725.000000 | \n", + "605.000000 | \n", + "4.743250 | \n", + "264725.000000 | \n", + "
max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "500001.000000 | \n", + "
import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn import metrics
+import warnings
+
+warnings.filterwarnings("ignore")
+import random
+import eli5
+from eli5.sklearn import PermutationImportance
+
df = pd.read_csv("housing.csv")
+df.describe()
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +median_house_value | +
---|---|---|---|---|---|---|---|---|---|
count | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20433.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +
mean | +-119.569704 | +35.631861 | +28.639486 | +2635.763081 | +537.870553 | +1425.476744 | +499.539680 | +3.870671 | +206855.816909 | +
std | +2.003532 | +2.135952 | +12.585558 | +2181.615252 | +421.385070 | +1132.462122 | +382.329753 | +1.899822 | +115395.615874 | +
min | +-124.350000 | +32.540000 | +1.000000 | +2.000000 | +1.000000 | +3.000000 | +1.000000 | +0.499900 | +14999.000000 | +
25% | +-121.800000 | +33.930000 | +18.000000 | +1447.750000 | +296.000000 | +787.000000 | +280.000000 | +2.563400 | +119600.000000 | +
50% | +-118.490000 | +34.260000 | +29.000000 | +2127.000000 | +435.000000 | +1166.000000 | +409.000000 | +3.534800 | +179700.000000 | +
75% | +-118.010000 | +37.710000 | +37.000000 | +3148.000000 | +647.000000 | +1725.000000 | +605.000000 | +4.743250 | +264725.000000 | +
max | +-114.310000 | +41.950000 | +52.000000 | +39320.000000 | +6445.000000 | +35682.000000 | +6082.000000 | +15.000100 | +500001.000000 | +
df = df.dropna()
+
df["ocean_proximity"].unique()
+
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'], + dtype=object)+
le = LabelEncoder()
+df["ocean_proximity"] = le.fit_transform(df["ocean_proximity"])
+
X = df.drop("median_house_value", axis=1)
+y = df["median_house_value"]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
modelGBR = GradientBoostingRegressor(random_state=0)
+modelGBR.fit(X_train, y_train)
+y_pred = modelGBR.predict(X_test)
+print("GradientBoostingRegressor: ", metrics.r2_score(y_test, y_pred))
+
GradientBoostingRegressor: 0.7705056681773399 ++
modelRFR = RandomForestRegressor(random_state=0)
+modelRFR.fit(X_train, y_train)
+y_pred = modelRFR.predict(X_test)
+print("RandomForestRegressor: ", metrics.r2_score(y_test, y_pred))
+
RandomForestRegressor: 0.8142447370110542 ++
modelMLPR = MLPRegressor(learning_rate_init=0.01, random_state=0)
+modelMLPR.fit(X_train, y_train)
+y_pred = modelMLPR.predict(X_test)
+print("MLPRegressor: ", metrics.r2_score(y_test, y_pred))
+
MLPRegressor: 0.6507014083261967 ++
pm_gbr = PermutationImportance(modelGBR, random_state=0).fit(X_train, y_train)
+eli5.show_weights(pm_gbr, feature_names=X_train.columns.tolist())
+
Weight | +Feature | +
---|---|
+ 0.7184 + + ± 0.0052 + + | ++ median_income + | +
+ 0.4546 + + ± 0.0084 + + | ++ longitude + | +
+ 0.3930 + + ± 0.0065 + + | ++ latitude + | +
+ 0.1181 + + ± 0.0038 + + | ++ ocean_proximity + | +
+ 0.0853 + + ± 0.0038 + + | ++ population + | +
+ 0.0602 + + ± 0.0024 + + | ++ total_bedrooms + | +
+ 0.0342 + + ± 0.0032 + + | ++ housing_median_age + | +
+ 0.0130 + + ± 0.0008 + + | ++ households + | +
+ 0.0069 + + ± 0.0003 + + | ++ total_rooms + | +
pm_rfr = PermutationImportance(modelRFR, random_state=0).fit(X_train, y_train)
+eli5.show_weights(pm_rfr, feature_names=X_train.columns.tolist())
+
Weight | +Feature | +
---|---|
+ 0.8113 + + ± 0.0130 + + | ++ median_income + | +
+ 0.6294 + + ± 0.0088 + + | ++ longitude + | +
+ 0.4539 + + ± 0.0058 + + | ++ latitude + | +
+ 0.2942 + + ± 0.0110 + + | ++ ocean_proximity + | +
+ 0.1180 + + ± 0.0049 + + | ++ housing_median_age + | +
+ 0.0724 + + ± 0.0017 + + | ++ population + | +
+ 0.0394 + + ± 0.0005 + + | ++ total_rooms + | +
+ 0.0378 + + ± 0.0016 + + | ++ total_bedrooms + | +
+ 0.0213 + + ± 0.0004 + + | ++ households + | +
pm_mlpr = PermutationImportance(modelMLPR, random_state=1).fit(X_train, y_train)
+eli5.show_weights(pm_mlpr, feature_names=X_train.columns.tolist())
+
Weight | +Feature | +
---|---|
+ 2.0908 + + ± 0.0581 + + | ++ households + | +
+ 1.2646 + + ± 0.0120 + + | ++ population + | +
+ 1.1676 + + ± 0.0388 + + | ++ total_rooms + | +
+ 1.1415 + + ± 0.0151 + + | ++ median_income + | +
+ 0.5288 + + ± 0.0131 + + | ++ total_bedrooms + | +
+ 0.0726 + + ± 0.0014 + + | ++ housing_median_age + | +
+ 0.0284 + + ± 0.0017 + + | ++ latitude + | +
+ 0.0053 + + ± 0.0005 + + | ++ longitude + | +
+ 0.0034 + + ± 0.0007 + + | ++ ocean_proximity + | +
Both GradientBoosting
and RandomForest
models show very similar results, with median_income
being the most important variable. Slightly less influential is the combination of coordinates, longitude
and latitude
, followed by ocean_proximity
. Other variables have low weight, so these models don't consider them important. However, the MLP
neural network model gives opposite results, with households
, population
and total_rooms
as the most important variables. Those factors barely have any weight in the first two models. Only median_income
seems to be meaningful in all models. We might expect that MLP
mostly differentiates between high-population-density locations like the Los Angeles agglomeration and more rural areas, while GradientBoosting
and RandomForest
models consider more factors.
\n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
---|---|---|---|---|---|---|---|---|---|
count | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20433.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "
mean | \n", + "-119.569704 | \n", + "35.631861 | \n", + "28.639486 | \n", + "2635.763081 | \n", + "537.870553 | \n", + "1425.476744 | \n", + "499.539680 | \n", + "3.870671 | \n", + "206855.816909 | \n", + "
std | \n", + "2.003532 | \n", + "2.135952 | \n", + "12.585558 | \n", + "2181.615252 | \n", + "421.385070 | \n", + "1132.462122 | \n", + "382.329753 | \n", + "1.899822 | \n", + "115395.615874 | \n", + "
min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "1.000000 | \n", + "3.000000 | \n", + "1.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
25% | \n", + "-121.800000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1447.750000 | \n", + "296.000000 | \n", + "787.000000 | \n", + "280.000000 | \n", + "2.563400 | \n", + "119600.000000 | \n", + "
50% | \n", + "-118.490000 | \n", + "34.260000 | \n", + "29.000000 | \n", + "2127.000000 | \n", + "435.000000 | \n", + "1166.000000 | \n", + "409.000000 | \n", + "3.534800 | \n", + "179700.000000 | \n", + "
75% | \n", + "-118.010000 | \n", + "37.710000 | \n", + "37.000000 | \n", + "3148.000000 | \n", + "647.000000 | \n", + "1725.000000 | \n", + "605.000000 | \n", + "4.743250 | \n", + "264725.000000 | \n", + "
max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "500001.000000 | \n", + "
Weight | \n", + "Feature | \n", + "
---|---|
\n", + " 0.7184\n", + " \n", + " ± 0.0052\n", + " \n", + " | \n", + "\n", + " median_income\n", + " | \n", + "
\n", + " 0.4546\n", + " \n", + " ± 0.0084\n", + " \n", + " | \n", + "\n", + " longitude\n", + " | \n", + "
\n", + " 0.3930\n", + " \n", + " ± 0.0065\n", + " \n", + " | \n", + "\n", + " latitude\n", + " | \n", + "
\n", + " 0.1181\n", + " \n", + " ± 0.0038\n", + " \n", + " | \n", + "\n", + " ocean_proximity\n", + " | \n", + "
\n", + " 0.0853\n", + " \n", + " ± 0.0038\n", + " \n", + " | \n", + "\n", + " population\n", + " | \n", + "
\n", + " 0.0602\n", + " \n", + " ± 0.0024\n", + " \n", + " | \n", + "\n", + " total_bedrooms\n", + " | \n", + "
\n", + " 0.0342\n", + " \n", + " ± 0.0032\n", + " \n", + " | \n", + "\n", + " housing_median_age\n", + " | \n", + "
\n", + " 0.0130\n", + " \n", + " ± 0.0008\n", + " \n", + " | \n", + "\n", + " households\n", + " | \n", + "
\n", + " 0.0069\n", + " \n", + " ± 0.0003\n", + " \n", + " | \n", + "\n", + " total_rooms\n", + " | \n", + "
Weight | \n", + "Feature | \n", + "
---|---|
\n", + " 0.8113\n", + " \n", + " ± 0.0130\n", + " \n", + " | \n", + "\n", + " median_income\n", + " | \n", + "
\n", + " 0.6294\n", + " \n", + " ± 0.0088\n", + " \n", + " | \n", + "\n", + " longitude\n", + " | \n", + "
\n", + " 0.4539\n", + " \n", + " ± 0.0058\n", + " \n", + " | \n", + "\n", + " latitude\n", + " | \n", + "
\n", + " 0.2942\n", + " \n", + " ± 0.0110\n", + " \n", + " | \n", + "\n", + " ocean_proximity\n", + " | \n", + "
\n", + " 0.1180\n", + " \n", + " ± 0.0049\n", + " \n", + " | \n", + "\n", + " housing_median_age\n", + " | \n", + "
\n", + " 0.0724\n", + " \n", + " ± 0.0017\n", + " \n", + " | \n", + "\n", + " population\n", + " | \n", + "
\n", + " 0.0394\n", + " \n", + " ± 0.0005\n", + " \n", + " | \n", + "\n", + " total_rooms\n", + " | \n", + "
\n", + " 0.0378\n", + " \n", + " ± 0.0016\n", + " \n", + " | \n", + "\n", + " total_bedrooms\n", + " | \n", + "
\n", + " 0.0213\n", + " \n", + " ± 0.0004\n", + " \n", + " | \n", + "\n", + " households\n", + " | \n", + "
Weight | \n", + "Feature | \n", + "
---|---|
\n", + " 2.0908\n", + " \n", + " ± 0.0581\n", + " \n", + " | \n", + "\n", + " households\n", + " | \n", + "
\n", + " 1.2646\n", + " \n", + " ± 0.0120\n", + " \n", + " | \n", + "\n", + " population\n", + " | \n", + "
\n", + " 1.1676\n", + " \n", + " ± 0.0388\n", + " \n", + " | \n", + "\n", + " total_rooms\n", + " | \n", + "
\n", + " 1.1415\n", + " \n", + " ± 0.0151\n", + " \n", + " | \n", + "\n", + " median_income\n", + " | \n", + "
\n", + " 0.5288\n", + " \n", + " ± 0.0131\n", + " \n", + " | \n", + "\n", + " total_bedrooms\n", + " | \n", + "
\n", + " 0.0726\n", + " \n", + " ± 0.0014\n", + " \n", + " | \n", + "\n", + " housing_median_age\n", + " | \n", + "
\n", + " 0.0284\n", + " \n", + " ± 0.0017\n", + " \n", + " | \n", + "\n", + " latitude\n", + " | \n", + "
\n", + " 0.0053\n", + " \n", + " ± 0.0005\n", + " \n", + " | \n", + "\n", + " longitude\n", + " | \n", + "
\n", + " 0.0034\n", + " \n", + " ± 0.0007\n", + " \n", + " | \n", + "\n", + " ocean_proximity\n", + " | \n", + "
import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn import metrics
+import warnings
+from IPython.display import Image
+warnings.filterwarnings("ignore")
+import random
+import dalex as dx
+
df = pd.read_csv("housing.csv")
+df.describe()
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +median_house_value | +
---|---|---|---|---|---|---|---|---|---|
count | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20433.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +20640.000000 | +
mean | +-119.569704 | +35.631861 | +28.639486 | +2635.763081 | +537.870553 | +1425.476744 | +499.539680 | +3.870671 | +206855.816909 | +
std | +2.003532 | +2.135952 | +12.585558 | +2181.615252 | +421.385070 | +1132.462122 | +382.329753 | +1.899822 | +115395.615874 | +
min | +-124.350000 | +32.540000 | +1.000000 | +2.000000 | +1.000000 | +3.000000 | +1.000000 | +0.499900 | +14999.000000 | +
25% | +-121.800000 | +33.930000 | +18.000000 | +1447.750000 | +296.000000 | +787.000000 | +280.000000 | +2.563400 | +119600.000000 | +
50% | +-118.490000 | +34.260000 | +29.000000 | +2127.000000 | +435.000000 | +1166.000000 | +409.000000 | +3.534800 | +179700.000000 | +
75% | +-118.010000 | +37.710000 | +37.000000 | +3148.000000 | +647.000000 | +1725.000000 | +605.000000 | +4.743250 | +264725.000000 | +
max | +-114.310000 | +41.950000 | +52.000000 | +39320.000000 | +6445.000000 | +35682.000000 | +6082.000000 | +15.000100 | +500001.000000 | +
df = df.dropna()
+
df["ocean_proximity"].unique()
+
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'], + dtype=object)+
le = LabelEncoder()
+df["ocean_proximity"] = le.fit_transform(df["ocean_proximity"])
+
X = df.drop("median_house_value", axis=1)
+y = df["median_house_value"]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
modelGBR = GradientBoostingRegressor(random_state=0)
+modelGBR.fit(X_train, y_train)
+y_pred = modelGBR.predict(X_test)
+print("GradientBoostingRegressor: ", metrics.r2_score(y_test, y_pred))
+
GradientBoostingRegressor: 0.7705056681773399 ++
gbr_exp = dx.Explainer(modelGBR, X_train, y_train, label="Gradient Boosting")
+pdp_gbr = gbr_exp.model_profile(type="partial")
+pdp_gbr.result["_label_"] = "PDP profiles"
+
Preparation of a new explainer is initiated + + -> data : 16346 rows 9 cols + -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. + -> target variable : 16346 values + -> model_class : sklearn.ensemble._gb.GradientBoostingRegressor (default) + -> label : Gradient Boosting + -> predict function : <function yhat_default at 0x0000020CFC139480> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 4.1e+04, mean = 2.07e+05, max = 5.58e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -3e+05, mean = 8.89e-12, max = 3.83e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 9/9 [00:00<00:00, 39.08it/s] ++
pdp_gbr.plot(
+ geom="profiles",
+ variables=["median_income", "longitude", "latitude", "ocean_proximity"],
+)
+
# Image("plot_1.png")
+
ale_gbr = gbr_exp.model_profile(type="accumulated")
+ale_gbr.result["_label_"] = "ALE profiles"
+
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 9/9 [00:00<00:00, 35.81it/s] +Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 9/9 [00:00<00:00, 11.67it/s] ++
ale_gbr.plot(
+ geom="profiles",
+ variables=["median_income", "longitude", "latitude", "ocean_proximity"],
+)
+
# Image("plot_2.png")
+
pdp_gbr.plot(
+ ale_gbr, variables=["median_income", "longitude", "latitude", "ocean_proximity"],
+ title="PDP and ALE profiles"
+)
+
# Image("plot_3.png")
+
Variables for which PDP
and ALE
were calculated, are most influential according to Homework IV. Judging from the fact that all lines are almost perfectly parallel in PDP
and ALE
plots, we can expect little to no interactions between those variables. `
\n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
---|---|---|---|---|---|---|---|---|---|
count | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20433.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "
mean | \n", + "-119.569704 | \n", + "35.631861 | \n", + "28.639486 | \n", + "2635.763081 | \n", + "537.870553 | \n", + "1425.476744 | \n", + "499.539680 | \n", + "3.870671 | \n", + "206855.816909 | \n", + "
std | \n", + "2.003532 | \n", + "2.135952 | \n", + "12.585558 | \n", + "2181.615252 | \n", + "421.385070 | \n", + "1132.462122 | \n", + "382.329753 | \n", + "1.899822 | \n", + "115395.615874 | \n", + "
min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "1.000000 | \n", + "3.000000 | \n", + "1.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
25% | \n", + "-121.800000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1447.750000 | \n", + "296.000000 | \n", + "787.000000 | \n", + "280.000000 | \n", + "2.563400 | \n", + "119600.000000 | \n", + "
50% | \n", + "-118.490000 | \n", + "34.260000 | \n", + "29.000000 | \n", + "2127.000000 | \n", + "435.000000 | \n", + "1166.000000 | \n", + "409.000000 | \n", + "3.534800 | \n", + "179700.000000 | \n", + "
75% | \n", + "-118.010000 | \n", + "37.710000 | \n", + "37.000000 | \n", + "3148.000000 | \n", + "647.000000 | \n", + "1725.000000 | \n", + "605.000000 | \n", + "4.743250 | \n", + "264725.000000 | \n", + "
max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "500001.000000 | \n", + "