From 6e2cbe667333bad0873d8957669ef8eda96d5c97 Mon Sep 17 00:00:00 2001 From: DawidPludowski <72541839+DawidPludowski@users.noreply.github.com> Date: Thu, 31 Mar 2022 14:32:56 +0200 Subject: [PATCH 1/5] Add hw1 Pludowski Dawid --- .../Homework-I/Pludowski/PludowskiD.html | 15007 ++++++++++++++++ .../Homework-I/Pludowski/PludowskiD.ipynb | 2020 +++ 2 files changed, 17027 insertions(+) create mode 100644 Homeworks/Homework-I/Pludowski/PludowskiD.html create mode 100644 Homeworks/Homework-I/Pludowski/PludowskiD.ipynb diff --git a/Homeworks/Homework-I/Pludowski/PludowskiD.html b/Homeworks/Homework-I/Pludowski/PludowskiD.html new file mode 100644 index 0000000..093a91a --- /dev/null +++ b/Homeworks/Homework-I/Pludowski/PludowskiD.html @@ -0,0 +1,15007 @@ + + +
+ + +import numpy as np
+import pandas as pd
+import seaborn as sns
+
+import pickle
+
+import dalex as dx
+
+# libraries that are used in creating objects from pickle
+from sklearn.ensemble import RandomForestRegressor
+
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import OneHotEncoder
+
+from sklearn.base import BaseEstimator, TransformerMixin
+
# code necessary to load the model from pickle
+
+rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
+
+class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
+ def __init__(self, add_bedrooms_per_room = True):
+ self.add_bedrooms_per_room = add_bedrooms_per_room
+ def fit(self, X, y=None):
+ return self
+ def transform(self, X):
+ rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
+ population_per_household = X[:, population_ix] / X[:, households_ix]
+ if self.add_bedrooms_per_room:
+ bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
+ return np.c_[X, rooms_per_household, population_per_household,
+ bedrooms_per_room]
+ else:
+ return np.c_[X, rooms_per_household, population_per_household]
+
with open('full_model.pkl', 'rb') as f:
+ model = pickle.load(f)
+with open('test_dataset.pkl', 'rb') as f:
+ test_data = pickle.load(f)
+
X = test_data.drop(columns=['median_house_value'])
+y = test_data['median_house_value']
+
Please note that the training data is not provided here, as the model was trained in a different notebook to avoid redundant code.
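For reference, the persistence step in that other notebook presumably looks like the following minimal sketch (the full_model and test_df names are assumptions for illustration, not taken from the original code):

# Sketch of the training notebook's final cell (assumed object names):
# persist the fitted pipeline and the held-out test set for reuse here.
with open('full_model.pkl', 'wb') as f:
    pickle.dump(full_model, f)
with open('test_dataset.pkl', 'wb') as f:
    pickle.dump(test_df, f)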
+ +model_exp = dx.Explainer(model, X, y,
+ label = "housing RF Pipeline")
+
Preparation of a new explainer is initiated + + -> data : 4128 rows 10 cols + -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. + -> target variable : 4128 values + -> model_class : sklearn.ensemble._forest.RandomForestRegressor (default) + -> label : housing RF Pipeline + -> predict function : <function yhat_default at 0x000001B4602FC160> will be used (default) + -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. + -> predicted values : min = 4.95e+04, mean = 2.08e+05, max = 5e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -2.49e+05, mean = -1.66e+03, max = 2.94e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
observation_1 = X.iloc[[5]]
+observation_2 = X.iloc[[321]]
+
prediction_1 = model.predict(observation_1)
+prediction_2 = model.predict(observation_2)
+
+print("Real value of observation 1: {y_1:.0f}; predicted value: {y_1_hat:.0f}".format(y_1=list(y.iloc[[5]])[0], y_1_hat=prediction_1[0]))
+print("Real value of observation 2: {y_2:.0f}; predicted value: {y_2_hat:.0f}".format(y_2=list(y.iloc[[321]])[0], y_2_hat=prediction_2[0]))
+
Real value of observation 1: 120600; predicted value: 137187 +Real value of observation 2: 298900; predicted value: 275727 ++
The model predicts the target very well, so further explanations can be presented.
order = X.columns.to_list()
+
# first observation
+model_exp.predict_parts(observation_1.iloc[[0]],
+ type = 'break_down',
+ order=order).plot()
+model_exp.predict_parts(observation_1.iloc[[0]],
+ type = 'shap').plot()
+
Based on the break_down plot, the greatest positive impact on the prediction comes from total_rooms and longitude. However, longitude without latitude does not carry much information on its own, so we may expect that only the interaction of these variables really matters in the model. The greatest negative impact comes from ocean_proximity and latitude. Again, latitude carries significant information only through this interaction.
On the shap plot, the impact of households is positive, while on break_down it is negative. This may suggest that an interaction between households and other variables exists. We may expect an interaction with the total_* or population variables, as ratios of these variables tell more about housing in the area than the single variables do.
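A rough way to check this suspicion is to compare break-down attributions under different variable orderings: if the contribution assigned to households shifts noticeably, the attribution is order-dependent, which points to interactions. A minimal sketch reusing the explainer and observation defined above (the reversed order is an arbitrary illustrative choice, and the variable_name / contribution columns are the ones the dalex result table is expected to expose):

# Sketch: break-down attributions depend on the variable ordering when interactions exist.
order_forward = X.columns.to_list()
order_reversed = order_forward[::-1]

bd_forward = model_exp.predict_parts(observation_1, type='break_down', order=order_forward)
bd_reversed = model_exp.predict_parts(observation_1, type='break_down', order=order_reversed)

# Compare the contribution assigned to 'households' under both orderings.
for name, bd in [('forward', bd_forward), ('reversed', bd_reversed)]:
    res = bd.result
    print(name, res.loc[res['variable_name'] == 'households', 'contribution'].values)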
# second observation
+model_exp.predict_parts(observation_2,
+ type = 'break_down',
+ order = order).plot()
+model_exp.predict_parts(observation_2,
+ type = 'shap').plot()
+
While in the first observation most variables have a negative impact, in the second observation they have a mainly positive impact. It is worth mentioning that the greatest negative impact in the second observation comes from households and total_rooms, while these two have the greatest positive impact in the first observation. The positive linear correlation between the target and those two variables should be examined.
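As a first, rough step of that examination, one can simply look at the raw correlations between these two features and the target on the test data used above:

# Sketch: check the linear relationship between the suspect features and the target.
corr = pd.concat([X[['households', 'total_rooms']], y], axis=1).corr()
print(corr['median_house_value'])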
Since the values obtained by break_down and shap differ, some interactions exist in the model. We can easily find them using break_down_interactions plots:
model_exp.predict_parts(observation_1,
+ type = 'break_down_interactions',
+ interaction_preference=1).plot()
+
model_exp.predict_parts(observation_2,
+ type = 'break_down_interactions',
+ interaction_preference=1).plot()
+
For each observation the interactions are different; however, we can notice that:
- latitude and longitude interact with each other;
- total_bedrooms and total_rooms interact - we may expect that their ratio carries information about the average size of the houses in the area.
library(ranger)
+library(DALEX)
+library(DALEXtra)
+library(lime)
+
+set.seed(123)
+
+df <- read.csv2('./../data.csv', sep=',')
+df['median_house_value'] <- lapply(df['median_house_value'], FUN = as.integer)
+
+ranger_model <- ranger(median_house_value ~., data = df)
+res <- predict(ranger_model, df[2137,])$predictions
+cat(res)
+## 277727.6
+explainer_rf <- DALEX::explain(ranger_model,
+ data = df,
+ y = df$median_house_value,
+ label = "random forest")
+## Preparation of a new explainer is initiated
+## -> model label : random forest
+## -> data : 20640 rows 13 cols
+## -> target variable : 20640 values
+## -> predict function : yhat.ranger will be used ( default )
+## -> predicted values : No value for predict function target column. ( default )
+## -> model_info : package ranger , ver. 0.13.1 , task regression ( default )
+## -> predicted values : numerical, min = 44585.64 , mean = 207152.6 , max = 499971.5
+## -> residual function : difference between y and yhat ( default )
+## -> residuals : numerical, min = -145492.8 , mean = -296.7636 , max = 199630.6
+## A new explainer has been created!
+model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
+predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer
+
+lime_pr <- predict_surrogate(explainer = explainer_rf,
+ new_observation = as.data.frame(df[2137,]),
+ n_features = 6,
+ n_permutations = 1000,
+ type = "lime")
+
+lime_pr
+plot(lime_pr)
+
The LIME decomposition shows that ocean_proximity and total_rooms have the greatest impact on the final prediction. The explanation fit is notably low, though.
lime_pr <- predict_surrogate(explainer = explainer_rf,
+ new_observation = as.data.frame(df[420,]),
+ n_features = 6,
+ n_permutations = 1000,
+ type = "lime")
+
+lime_pr
+plot(lime_pr)
As shown in the previous homework, the NEAR BAY value is supposed to have a positive impact on the model prediction; however, here we obtained a negative impact for that value. It may be caused by the fact that, in terms of longitude and latitude, houses near the bay have neighbouring observations in only one direction. Moreover, the explanation fit is really low, which may lead to unstable explanations with this method.
In both LIME decompositions the number of total rooms has a similarly negative impact on the model prediction. There is a noticeable difference between the impact of longitude in each observation, which could be explained by the fact that a small change in distance can turn NEAR BAY into <1H OCEAN, while even a large change of that value cannot turn INLAND into another value. total_rooms and total_bedrooms seem to have a stable impact in the neighbourhoods of both observations, perhaps because such values are equally important independently of the other attributes of a house.
In summary, we may expect that some attributes, such as longitude or latitude, are somewhat unstable, while others, like total_rooms, may be much more stable.
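The surrogate analysis above is done in R, so the following is only a hedged Python sketch of how such stability could be probed with the lime package: re-explaining the same observation several times and comparing the selected features and weights. The model and data objects (model, X_num, feature_names) are assumed, not taken from the notebooks above.

# Sketch (assumed objects): `model` is a fitted sklearn regressor,
# `X_num` a numeric numpy feature matrix, `feature_names` its column names.
import numpy as np
from lime.lime_tabular import LimeTabularExplainer

lime_explainer = LimeTabularExplainer(
    X_num, feature_names=feature_names, mode='regression'
)

# Explain the same row several times; an unstable explanation selects
# different features or gives noticeably different weights across runs.
for seed in range(5):
    np.random.seed(seed)
    exp = lime_explainer.explain_instance(X_num[0], model.predict, num_features=6)
    print(exp.as_list())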
Author: Dawid Pludowski
+import pandas as pd
+import pickle as pkl
+import dalex as dx
+
+from sklearn.model_selection import train_test_split
+
+import warnings
+warnings.filterwarnings('ignore')
+
Most of the data preprocessing was done for the previous homeworks; only Python-specific preprocessing, such as handling the categorical column, is required here.
df = pd.read_csv('../data_scaled.csv')
+
df.head()
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +rooms_per_household | +bedrooms_per_room | +population_per_household | +ocean_proximity | +median_house_value | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +-1.327835 | +1.052548 | +0.982143 | +-0.804819 | +-0.972476 | +-0.974429 | +-0.977033 | +2.344766 | +0.628559 | +-1.149930 | +-0.049597 | +NEAR BAY | +452600.0 | +
1 | +-1.322844 | +1.043185 | +-0.607019 | +2.045890 | +1.357143 | +0.861439 | +1.669961 | +2.332238 | +0.327041 | +-0.990381 | +-0.092512 | +NEAR BAY | +358500.0 | +
2 | +-1.332827 | +1.038503 | +1.856182 | +-0.535746 | +-0.827024 | +-0.820777 | +-0.843637 | +1.782699 | +1.155620 | +-1.445865 | +-0.025843 | +NEAR BAY | +352100.0 | +
3 | +-1.337818 | +1.038503 | +1.856182 | +-0.624215 | +-0.719723 | +-0.766028 | +-0.733781 | +0.932968 | +0.156966 | +-0.493627 | +-0.050329 | +NEAR BAY | +341300.0 | +
4 | +-1.337818 | +1.038503 | +1.856182 | +-0.462404 | +-0.612423 | +-0.759847 | +-0.629157 | +-0.012881 | +0.344711 | +-0.707889 | +-0.085616 | +NEAR BAY | +342200.0 | +
mapping = {
+ 'NEAR BAY': 0,
+ 'ISLAND': 1,
+ 'NEAR OCEAN': 2,
+ '<1H OCEAN': 3,
+ 'INLAND': 4
+}
+
+df['ocean_proximity'] = df['ocean_proximity'].map(mapping)
+
X = df.drop(columns=['median_house_value'])
+y = df[['median_house_value']]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
+
from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neural_network import MLPRegressor
+
+from sklearn.model_selection import RandomizedSearchCV
+
dt = DecisionTreeRegressor(max_depth=10)
+dt_tuned = RandomizedSearchCV(
+ dt,
+ {
+ 'criterion': ['squared_error', 'absolute_error'],
+ 'max_depth': [i for i in range(5, 25, 2)],
+ 'min_samples_split': [i for i in range(2, 10)],
+ 'min_samples_leaf': [i for i in range(1, 5)]
+ },
+ n_iter=15,
+ random_state=2137
+)
+
+
+dt_tuned.fit(X_train, y_train)
+print(dt_tuned.best_estimator_)
+
# with open('decision_tree.pkl', 'rb') as file:
+# dt_tuned = pkl.load(file)
+
rf = RandomForestRegressor()
+rf_tuned = RandomizedSearchCV(
+    rf,
+ {
+ 'criterion': ['squared_error', 'absolute_error'],
+ 'max_features': ['sqrt', 'log2'],
+ 'min_samples_split': [i for i in range(2, 10)],
+ 'min_samples_leaf': [i for i in range(1, 5)],
+ 'max_depth': [i for i in range(3, 10, 2)]
+ },
+ n_iter=15,
+ random_state=2137
+)
+
+rf_tuned.fit(X_train, y_train)
+print(rf_tuned.best_estimator_)
+
# with open('random_forest.pkl', 'rb') as file:
+# rf_tuned = pkl.load(file)
+
mlp = MLPRegressor(
+ random_state=2137
+)
+
+mlp_tuned = RandomizedSearchCV(
+ mlp,
+ {
+ 'hidden_layer_sizes': [
+ (10, 100, 20),
+ (5, 50, 50, 10),
+ (25, 100, 20)
+ ]
+ },
+ n_iter=3
+)
+
+mlp_tuned.fit(X_train, y_train)
+print(mlp_tuned.best_estimator_)
+
# with open('mlp.pkl', 'rb') as file:
+# mlp_tuned = pkl.load(file)
+
print(f'decision tree score: {dt_tuned.score(X_test, y_test)}')
+print(f'random forest score: {rf_tuned.score(X_test, y_test)}')
+print(f'neural network score: {mlp_tuned.score(X_test, y_test)}')
+
decision tree score: 0.7122037298977866 +random forest score: 0.6655753723827369 +neural network score: 0.7030360456509592 ++
The decision tree gets the best score, so we will treat it as the base model in the remaining part of the notebook.
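Since R² alone can hide large absolute errors on house prices, it may be worth complementing the comparison with an error expressed in dollars; a small sketch using the tuned models and the test split from above:

# Sketch: complement the R^2 scores with root-mean-squared error on the test set.
from sklearn.metrics import mean_squared_error

for name, est in [('decision tree', dt_tuned),
                  ('random forest', rf_tuned),
                  ('neural network', mlp_tuned)]:
    rmse = mean_squared_error(y_test, est.predict(X_test)) ** 0.5
    print(f'{name} RMSE: {rmse:.0f}')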
+ +observation = df.iloc[[2137]]
+observation
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +rooms_per_household | +bedrooms_per_room | +population_per_household | +ocean_proximity | +median_house_value | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2137 | +-0.075017 | +0.551589 | +-1.083767 | +-0.211208 | +0.064765 | +-0.204406 | +-0.045877 | +-0.62848 | +-0.370457 | +0.803675 | +-0.057143 | +4 | +87500.0 | +
prediction = dt_tuned.predict(observation.drop(columns=['median_house_value']))
+true_value = observation['median_house_value']
+
+print(f'true value: {int(true_value)}')
+print(f'predicted value: {prediction[0]}')
+
true value: 87500 +predicted value: 79850.0 ++
The predicted value is close to the real one.
+ +dt_explainer = dx.Explainer(
+ dt_tuned,
+ X,
+ y,
+ label='decision tree'
+)
+
Preparation of a new explainer is initiated + + -> data : 20640 rows 12 cols + -> target variable : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray. + -> target variable : 20640 values + -> model_class : sklearn.model_selection._search.RandomizedSearchCV (default) + -> label : decision tree + -> predict function : <function yhat_default at 0x000001F58FBF98B0> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 3.62e+04, mean = 2.02e+05, max = 5e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -3.88e+05, mean = 4.6e+03, max = 4.31e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
dt_observation_exp = dt_explainer.predict_profile(observation.drop(columns=['median_house_value']))
+
Calculating ceteris paribus: 100%|████████████████████████████████████████████████████| 12/12 [00:00<00:00, 398.05it/s] ++
dt_observation_exp.plot(
+ variables=['median_income', 'ocean_proximity', 'households', 'housing_median_age']
+)
+
Despite having the highest score, the decision tree bases its decisions on only 2-3 variables out of 12 (the remaining plots are not shown for the sake of notebook clarity). This may suggest that this kind of model cannot use all of the information hidden in the data, and thus other models should be considered.
The greatest impact on the prediction comes from median_income, and it follows the rule that the richer the inhabitants are, the more expensive the neighbourhood is, which is reasonable.
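The claim that the tree effectively relies on only two or three variables can be cross-checked directly on the fitted estimator; a short sketch using the feature_importances_ attribute of the tuned tree (the 0.01 threshold below is an arbitrary illustrative cut-off):

# Sketch: inspect which features the tuned decision tree actually splits on.
importances = pd.Series(
    dt_tuned.best_estimator_.feature_importances_, index=X.columns
).sort_values(ascending=False)
print(importances)
print('features with non-negligible importance:', (importances > 0.01).sum())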
rf_explainer = dx.Explainer(
+ rf_tuned,
+ X,
+ y,
+ label='random forest'
+)
+
+mlp_explainer = dx.Explainer(
+ mlp_tuned,
+ X,
+ y,
+ label='neural network'
+)
+
Preparation of a new explainer is initiated + + -> data : 20640 rows 12 cols + -> target variable : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray. + -> target variable : 20640 values + -> model_class : sklearn.model_selection._search.RandomizedSearchCV (default) + -> label : random forest + -> predict function : <function yhat_default at 0x000001F58FBF98B0> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 5.24e+04, mean = 1.99e+05, max = 5e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -3.58e+05, mean = 8.16e+03, max = 3.96e+05 + -> model_info : package sklearn + +A new explainer has been created! +Preparation of a new explainer is initiated + + -> data : 20640 rows 12 cols + -> target variable : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray. + -> target variable : 20640 values + -> model_class : sklearn.model_selection._search.RandomizedSearchCV (default) + -> label : neural network + -> predict function : <function yhat_default at 0x000001F58FBF98B0> will be used (default) + -> predict function : Accepts pandas.DataFrame and numpy.ndarray. + -> predicted values : min = 3.14e+04, mean = 2.08e+05, max = 7.78e+05 + -> model type : regression will be used (default) + -> residual function : difference between y and yhat (default) + -> residuals : min = -6.11e+05, mean = -1.2e+03, max = 4.2e+05 + -> model_info : package sklearn + +A new explainer has been created! ++
rf_observation_exp = rf_explainer.predict_profile(observation.drop(columns=['median_house_value']))
+mlp_observation_exp = mlp_explainer.predict_profile(observation.drop(columns=['median_house_value']))
+
Calculating ceteris paribus: 100%|████████████████████████████████████████████████████| 12/12 [00:00<00:00, 428.69it/s] +Calculating ceteris paribus: 100%|████████████████████████████████████████████████████| 12/12 [00:00<00:00, 387.16it/s] ++
dt_observation_exp.plot((rf_observation_exp, mlp_observation_exp), variables=['median_income', 'ocean_proximity', 'households', 'housing_median_age'])
+
The comparison of the models on this particular observation shows that the neural network might be more sensitive to changes in the households variable. Moreover, the full CP plots (not shown in the notebook) suggest that a change in any variable affects the network's decision, which is not true for the random forest and the decision tree. Further analysis should be performed to check whether the neural network's CP profiles change in a reasonable way; if so, the neural network should be considered the best model for estimating the median price, as its score is only slightly lower than the decision tree's and its predictions are more nuanced.
CP plots show that the best model (in terms of score) may not take all of the information into account and, because of that, may be a poor explainer of the real world. However, one should remember that the dataset does contain some interactions (like longitude and latitude) and correlations (the ratio variables), and for that reason CP plots alone are not sufficient to explain model performance.
Author: Dawid Pludowski
+ +from sklearn.ensemble import GradientBoostingRegressor
+
+import pandas as pd
+import pickle as pkl
+import dalex as dx
+import os
+
os.chdir('../')
+
# %%capture
+
+with open('resources/models/gradient_boosting_onehot.pkl', 'rb') as f:
+ gb = pkl.load(f)
+
+with open('resources/models/decision_tree_onehot.pkl', 'rb') as f:
+ dt = pkl.load(f)
+
+with open('resources/models/linear_regression_onehot.pkl', 'rb') as f:
+ lr = pkl.load(f)
+
+df_train = pd.read_csv('resources/data/housing_train.csv')
+df_test = pd.read_csv('resources/data/housing_test.csv')
+
+X, y = df_train.drop(columns=['median_house_value']), df_train[['median_house_value']]
+
%%capture
+gb_exp = dx.Explainer(gb, X, y, label='gradient boosting')
+dt_exp = dx.Explainer(dt, X, y, label='decision tree')
+lr_exp = dx.Explainer(lr, X, y, label='linear regression')
+
mp_gb = gb_exp.model_parts()
+mp_dt = dt_exp.model_parts()
+mp_lr = lr_exp.model_parts()
+
gb_exp.plot()
+
mp_dt.plot()
+mp_lr.plot()
+
The first three variables (i.e. median_income, longitude and latitude) are the most important for the predictions of all models; however, their impact differs: for linear regression and gradient boosting, median_income is much more important than the other variables, whilst for the decision tree, longitude and latitude have a similar impact to median_income.
Variables such as households and total_rooms have the least impact on model performance. This may be because combinations of these variables (e.g. their ratios) carry more information than the raw variables alone.
It is worth mentioning that the tree-based models value ocean_proximity more than the linear one does. As ocean_proximity was one-hot encoded in all models, the linear model may have trouble using information encoded in this way. Because of that, one could consider ordinal encoding rather than one-hot, as it may be more effective for linear models.
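A hedged sketch of what such an ordinal variant could look like, encoding ocean_proximity by a rough closest-to-farthest-from-the-ocean order instead of one-hot columns (the category order and the median imputation below are assumptions made for illustration, not part of the original pipelines):

# Sketch: ordinal encoding of ocean_proximity for a linear model.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

numeric_cols = [c for c in X.columns if c != 'ocean_proximity']
preprocess = ColumnTransformer([
    # assumed ordering: roughly closest to farthest from the ocean
    ('ocean', OrdinalEncoder(
        categories=[['ISLAND', 'NEAR BAY', 'NEAR OCEAN', '<1H OCEAN', 'INLAND']]
    ), ['ocean_proximity']),
    ('num', SimpleImputer(strategy='median'), numeric_cols),
])
lr_ordinal = Pipeline([('prep', preprocess), ('model', LinearRegression())])
lr_ordinal.fit(X, y)
print(lr_ordinal.score(X, y))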
All of the presented models except the decision tree have variables which may be considered unnecessary noise rather than valuable information. For gradient boosting these are households and total_rooms; for linear regression they are housing_median_age, total_rooms, households and ocean_proximity. For the final models, two options should be considered:
%load_ext lab_black
+
Author: Dawid Pludowski
+ +from sklearn.ensemble import GradientBoostingRegressor
+
+import pandas as pd
+import pickle as pkl
+import dalex as dx
+import os
+
os.chdir("../")
+
# %%capture
+
+with open("resources/models/gradient_boosting_onehot.pkl", "rb") as f:
+ gb = pkl.load(f)
+
+df_train = pd.read_csv("resources/data/housing_train.csv")
+df_test = pd.read_csv("resources/data/housing_test.csv")
+
+X, y = df_train.drop(columns=["median_house_value"]), df_train[["median_house_value"]]
+
X.head()
+
+ | longitude | +latitude | +housing_median_age | +total_rooms | +total_bedrooms | +population | +households | +median_income | +ocean_proximity | +
---|---|---|---|---|---|---|---|---|---|
0 | +-118.28 | +33.98 | +19.0 | +883.0 | +313.0 | +726.0 | +277.0 | +0.9809 | +<1H OCEAN | +
1 | +-122.23 | +37.46 | +33.0 | +2643.0 | +464.0 | +1015.0 | +427.0 | +4.2232 | +NEAR OCEAN | +
2 | +-118.26 | +33.79 | +42.0 | +1162.0 | +264.0 | +1044.0 | +241.0 | +3.5488 | +<1H OCEAN | +
3 | +-119.26 | +35.87 | +24.0 | +1590.0 | +390.0 | +1686.0 | +372.0 | +1.6469 | +INLAND | +
4 | +-121.96 | +37.54 | +14.0 | +5106.0 | +1207.0 | +2738.0 | +1108.0 | +3.9909 | +<1H OCEAN | +
%%capture
+gb_exp = dx.Explainer(gb, X, y, label='gradient boosting')
+
pdp_gb = gb_exp.model_profile(type="partial")
+ale_gb = gb_exp.model_profile(type="accumulated")
+
+pdp_gb.result["_label_"] = "PDP profiles"
+ale_gb.result["_label_"] = "ALE profiles"
+
Calculating ceteris paribus: 100%|██████████| 9/9 [00:00<00:00, 33.04it/s] +Calculating ceteris paribus: 100%|██████████| 9/9 [00:00<00:00, 32.83it/s] +Calculating accumulated dependency: 100%|██████████| 8/8 [00:00<00:00, 12.00it/s] ++
pdp_gb.plot(
+ geom="profiles",
+ variables=["housing_median_age", "median_income", "latitude", "longitude"],
+)
+
ale_gb.plot(
+ geom="profiles",
+ variables=["housing_median_age", "median_income", "latitude", "longitude"],
+)
+
pdp_gb.plot(
+ ale_gb, variables=["housing_median_age", "median_income", "latitude", "longitude"]
+)
+
On both the PDP and ALE profiles the lines are nearly parallel, which may suggest that the selected model does not capture many interactions in the dataset. However, there is a small difference in the peaks of longitude and latitude: slightly below 38 in latitude and around -122 in longitude. Interestingly, this matches the coordinates of San Francisco, where housing is relatively expensive compared to other locations in California. This shows that gradient boosting was able to pick up the information hidden in the interaction between longitude and latitude. Like San Francisco, Los Angeles also has more expensive housing; however, the selected model seems not to recognize that, as the profiles are parallel around its coordinates (34, -118). On the other hand, to the east and north of Los Angeles there are only a few other districts in the dataset, so maybe that is why the model did not learn that interaction.
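One way to probe the suspected spatial interaction more directly is a two-dimensional partial dependence surface over longitude and latitude; a hedged sketch using scikit-learn's inspection module on the pipeline loaded above (this assumes a recent scikit-learn that accepts feature names, and that the pickled gb pipeline consumes the raw DataFrame, as it does for the dalex explainer):

# Sketch: 2D partial dependence over longitude and latitude to look for
# the spatial peaks (San Francisco / Los Angeles) discussed above.
from sklearn.inspection import partial_dependence

pd_2d = partial_dependence(
    gb, X, features=[('longitude', 'latitude')], grid_resolution=20
)
print(pd_2d['average'].shape)  # grid of averaged predictions over the two features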
It is worth mentioning that the median_income profile lines are closer to each other than the lines for housing_median_age. That suggests that the income of the inhabitants carries more potential information about the house price than the house age does.
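That visual impression could be quantified by comparing, for each variable, the range of its aggregated profile; a hedged sketch over the dalex result table (assuming its usual _vname_ and _yhat_ columns):

# Sketch: rank variables by how much the aggregated PDP moves the average prediction.
res = pdp_gb.result
pdp_range = (
    res.groupby('_vname_')['_yhat_'].agg(lambda s: s.max() - s.min())
    .sort_values(ascending=False)
)
print(pdp_range)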