-
Notifications
You must be signed in to change notification settings - Fork 0
/
modelling.py
113 lines (88 loc) · 5.22 KB
/
modelling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 18 20:45:25 2023
@author: Florian Korn
"""
# imports
import os
# NOTE(review): hard-coded, machine-specific working directory. The relative
# 'src/...' paths used by the functions in this file resolve against it;
# presumably it is also what makes the local file_reader module importable
# when the file is run interactively - confirm before running elsewhere.
os.chdir('C:/Eigene Dateien/HackaTumProjekt')
from file_reader import reader_groundwater, reader_river
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import datetime
from sklearn.metrics import r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# build data set
def load_data():
    """Read the three raw data sources from the ``src/`` tree.

    Paths are relative to the current working directory.

    Returns
    -------
    tuple
        ``(groundwater, weather, river)`` in that order: the result of
        ``reader_groundwater('src/groundwater/')``, the meteostat Excel
        export with its ``date`` column parsed as dates, and the result of
        ``reader_river('src/river/')``.
    """
    # Tuple items are evaluated left to right, so the sources are still read
    # in the order groundwater -> weather -> river.
    return (
        reader_groundwater('src/groundwater/'),
        pd.read_excel(
            "src/temperature_data/meteostat_export.xlsx",
            parse_dates=['date'],
        ),
        reader_river('src/river/'),
    )
# Load all three data sets once, as a side effect of importing this module.
# NOTE(review): none of the functions visible in this file read these
# module-level names (train_model calls load_data() again and binds its own
# local copies) - confirm whether another module relies on them before
# removing this eager load.
df_groundwater, df_wheater, df_river = load_data()
def plz_to_messstation(PLZ):
    """Return the groundwater station ID (as a string) for a postal code.

    The lookup has two steps:

    1. ``src/mapping/munich_postal_code.xlsx`` is read without a header and
       with its first column as the index (the municipality); the first row
       whose remaining cells contain ``PLZ`` selects the municipality.
    2. ``src/mapping/municipality_to_groundwater.csv`` maps that municipality
       (column ``name``) to a ``groundwater_ID``.

    Parameters
    ----------
    PLZ
        Postal code to look up. It is matched with ``in`` against the raw
        cell values of the workbook, so it must compare equal to them.
        NOTE(review): presumably an integer postal code - confirm against
        the callers and the workbook's cell types.

    Returns
    -------
    str
        The station ID passed through ``int`` and then ``str`` (so a value
        read as ``123.0`` is returned as ``"123"``).

    Raises
    ------
    IndexError
        If no workbook row contains ``PLZ``, or the municipality has no row
        in the CSV mapping.
    """
    postal_codes = pd.read_excel('src/mapping/munich_postal_code.xlsx',
                                 index_col=0, header=None)

    # Scan the rows directly instead of attaching a scratch column to the
    # frame; each entry of values.tolist() is one municipality's cell values.
    matches = [
        name
        for name, codes in zip(postal_codes.index, postal_codes.values.tolist())
        if PLZ in codes
    ]
    if not matches:
        raise IndexError(
            f"Postal code {PLZ!r} is not listed in "
            "src/mapping/munich_postal_code.xlsx"
        )
    municipality = matches[0]  # first matching row wins

    stations = pd.read_csv('src/mapping/municipality_to_groundwater.csv',
                           sep=',', index_col=0)
    station_ids = stations.loc[stations['name'] == municipality, 'groundwater_ID']
    if station_ids.empty:
        raise IndexError(
            f"Municipality {municipality!r} (postal code {PLZ!r}) has no entry in "
            "src/mapping/municipality_to_groundwater.csv"
        )
    return str(int(station_ids.values[0]))
def train_model(PLZ):
    """Fit a linear-regression model of the groundwater level for ``PLZ``.

    River, weather and groundwater data are merged on the date, restricted to
    rows from 2018-01-01 onwards that have a groundwater measurement for the
    station serving ``PLZ``, and extended with a ``season`` column holding the
    calendar month of the row's date. Every column except the groundwater
    level is used as a feature.

    The model is a pipeline of imputation + standardisation, forward
    sequential feature selection (scored by R^2) and ``LinearRegression``.
    It is first fitted on the first 70 % of the rows and scored on the rest
    (the score is logged at INFO level), then refitted on all rows.

    Parameters
    ----------
    PLZ
        Postal code, resolved to a station ID via ``plz_to_messstation``.

    Returns
    -------
    tuple
        ``(model, df_ml, preprocessing)``: the pipeline fitted on all rows,
        the modelling frame (features, groundwater level and ``season``), and
        the fitted preprocessing pipeline, which is the same object as the
        model's first step.

    Raises
    ------
    ValueError
        If no row with a groundwater measurement remains after filtering.
    """
    import logging  # function-scope import: nothing else in this file needs it

    target = 'Grundwasserstand [m ü. NN]'

    messstation_filter = plz_to_messstation(PLZ)
    df_groundwater, df_wheater, df_river = load_data()

    station_levels = df_groundwater.loc[
        df_groundwater['messstation'].isin([messstation_filter]),
        ['Datum', target],
    ]
    df_ml = (
        df_river
        .merge(df_wheater, left_on='Datum', right_on='date', how='outer')
        .merge(station_levels, on='Datum', how='outer')
        .drop(columns='date')
        .set_index('Datum')
    )
    # .copy() so the column assignment below works on an independent frame.
    df_ml = df_ml[df_ml.index >= datetime.datetime(2018, 1, 1, 0, 0)].copy()
    df_ml['season'] = df_ml.index.month
    df_ml = df_ml.dropna(subset=[target])
    if df_ml.empty:
        raise ValueError(
            f"No groundwater measurements since 2018-01-01 for station "
            f"{messstation_filter!r} (postal code {PLZ!r})"
        )

    # Pick features and target by NAME. 'season' is appended after the merge,
    # so the groundwater level is not the last column and positional slicing
    # would use the month as the target and the level as a feature.
    feature_cols = [col for col in df_ml.columns if col != target]
    X = df_ml[feature_cols]
    y = df_ml[[target]]  # kept 2-D, so predictions have shape (n, 1)

    imputer = make_column_transformer(
        (SimpleImputer(strategy='constant', fill_value=0, copy=False),
         ['prcp', 'snow', 'wdir']),
        remainder='passthrough',
        verbose_feature_names_out=False)
    imputer2 = make_column_transformer(
        (SimpleImputer(strategy='mean', copy=False), ['tsun']),
        remainder='passthrough',
        verbose_feature_names_out=False)
    # Plain LinearRegression does not need standardised inputs; penalised
    # variants (lasso / ridge) would, because large coefficients are punished.
    scaling = make_column_transformer(
        (StandardScaler(), feature_cols),
        remainder='passthrough',
        verbose_feature_names_out=False)
    preprocessing = make_pipeline(imputer, imputer2, scaling).set_output(transform="pandas")

    sequential_features = SequentialFeatureSelector(estimator=LinearRegression(),
                                                    n_features_to_select='auto',
                                                    direction='forward',
                                                    scoring='r2',
                                                    n_jobs=-1)
    pipe_LR = make_pipeline(preprocessing,
                            sequential_features,
                            LinearRegression(n_jobs=-1))

    # Hold-out evaluation: first 70 % of the rows (frame order, no shuffling)
    # for fitting, the remainder for scoring.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.7,
                                                        random_state=190,
                                                        shuffle=False)
    holdout_r2 = pipe_LR.fit(X_train, y_train).score(X_test, y_test)
    logging.getLogger(__name__).info(
        "Hold-out R^2 for station %s: %.3f", messstation_filter, holdout_r2)

    # Refit on every row. The pipeline's steps are fitted in place, so the
    # returned `preprocessing` is the one fitted on the full data.
    return pipe_LR.fit(X, y), df_ml, preprocessing
def visualisation_groundwater(LR_model, df_ml, preprocessing, date):
    """Predict levels for the window of rows just before ``date``.

    Rows of ``df_ml`` dated strictly before ``date`` are sorted by index and
    the slice ``iloc[-6:-1]`` is taken: up to five rows, ending one row short
    of the newest one.
    NOTE(review): the newest row before ``date`` is skipped - confirm that
    this is intended and not an off-by-one.

    Parameters
    ----------
    LR_model
        Fitted estimator with a ``predict`` method. If it exposes
        ``feature_names_in_`` those columns are selected from ``df_ml``;
        otherwise every column except the last two is used.
    df_ml
        Modelling frame indexed by date, as returned by ``train_model``.
    preprocessing
        Fitted preprocessing transformer. If it is one of ``LR_model.steps``
        (as for the pipeline returned by ``train_model``) the model applies
        it itself; otherwise ``preprocessing.transform`` is applied before
        predicting. It is never re-fitted on the prediction window.
    date
        Anything ``pandas.to_datetime`` accepts.

    Returns
    -------
    list of dict
        One ``{'date', 'level', 'type'}`` entry per window row, newest row
        first, with ``level`` as a float and ``type`` always ``'Predicted'``.
        Empty if there is no row to predict on.
        NOTE(review): every entry carries ``str(date)`` (the requested date),
        not the date of its own row - confirm this is what the consumer
        expects.
    """
    date = pd.to_datetime(date)
    window = df_ml[df_ml.index < date].sort_index().iloc[-6:-1]

    # Use exactly the columns the model was fitted on when it records them;
    # a fixed positional slice breaks as soon as the column layout differs.
    feature_names = getattr(LR_model, 'feature_names_in_', None)
    if feature_names is not None:
        features = window[list(feature_names)]
    else:
        features = window.iloc[:, :-2]

    if features.empty:
        return []

    # Avoid preprocessing twice when the model already embeds this transformer.
    model_steps = getattr(LR_model, 'steps', ())
    if any(step is preprocessing for _, step in model_steps):
        pred = LR_model.predict(features)
    else:
        pred = LR_model.predict(preprocessing.transform(features))

    # Predictions may be 1-D or of shape (n, 1); take the first output column.
    levels = pd.DataFrame(pred).iloc[:, 0]
    return [
        {'date': str(date), 'level': float(level), 'type': 'Predicted'}
        for level in levels.iloc[::-1]
    ]