-
Notifications
You must be signed in to change notification settings - Fork 0
/
rf_mok.py
150 lines (101 loc) · 8.88 KB
/
rf_mok.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# --- Data loading and preparation ---
# Read the Mok malaria metadata; each row is one sample, and the
# 'Clearance' column holds the measured parasite clearance time.
features = pd.read_csv('malaria_data/mok_meta.csv', index_col=False)
# print('The shape of our features is:', features.shape)
# print(features.describe())

# Target vector: parasite clearance time.
labels = np.array(features['Clearance'])

# Drop the identifier columns and the target from the feature matrix.
# (One drop(columns=...) call replaces three sequential axis=1 drops.)
features = features.drop(columns=['SampleID', 'GenotypeID', 'Clearance'])

# One-hot encode categorical columns so the forest sees only numerics.
features = pd.get_dummies(features)
# Keep the encoded column names for later feature-importance reporting.
feature_list = list(features.columns)
features = np.array(features)

# 75/25 train/test split; fixed random_state for reproducibility.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=50)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
# --- Model training and evaluation ---
# Random forest regressor: 1000 trees, fixed seed for reproducibility.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_features, train_labels)

# Predict clearance times for the held-out test samples.
predictions = rf.predict(test_features)
# Absolute error per test sample, then the mean absolute error (MAE).
errors = abs(predictions - test_labels)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Side-by-side actual vs. predicted values, one pair per line.
print("actual prediction")
for actual, predicted in zip(test_labels, predictions):
    print(actual, predicted)

# Mean absolute percentage error (MAPE), reported as an "accuracy".
# NOTE(review): this divides by test_labels — assumes no zero clearance
# values appear in the test set; verify against the data.
mape = 100 * (errors / test_labels)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
# Import tools needed for visualization
# from sklearn.tree import export_graphviz
# import pydot# Pull out one tree from the forest
# tree = rf.estimators_[5]# Import tools needed for visualization
# from sklearn.tree import export_graphviz
# import pydot# Pull out one tree from the forest
# tree = rf.estimators_[5]# Export the image to a dot file
# export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)# Use dot file to create a graph
# (graph, ) = pydot.graph_from_dot_file('tree.dot')# Write graph to a png file
# graph.write_png('tree.png')
# Get numerical feature importances
'''
importances = list(rf.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = False)# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
'''
# plt.style.use('fivethirtyeight')# list of x locations for plotting
# x_values = list(range(len(importances)))# Make a bar chart
# plt.bar(x_values, importances, orientation = 'vertical')# Tick labels for x axis
# plt.xticks(x_values, feature_list, rotation='vertical')# Axis labels and title
# plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances')
# Use datetime for creating date objects for plotting
# import datetime# Dates of training values
# months = features[:, feature_list.index('month')]
# days = features[:, feature_list.index('day')]
# years = features[:, feature_list.index('year')]# List and then convert to datetime object
# dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
# dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]# Dataframe with true values and dates
# test_samples1 = features[:, feature_list.index('RNA')]
# print(len(test_samples1))
# true_data = pd.DataFrame(data = {'sample': test_samples1, 'actual': labels})# Dates of predictions
# # months = test_features[:, feature_list.index('month')]
# # days = test_features[:, feature_list.index('day')]
# # years = test_features[:, feature_list.index('year')]# Column of dates
# # test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]# Convert to datetime objects
# # test_dates = [datetime.datetime.strptime(sample, '%Y-%m-%d') for date in test_dates]# Dataframe with predictions and dates
# test_samples2 = test_features[:, feature_list.index('GenotypeID')]
# predictions_data = pd.DataFrame(data = {'sample': test_samples2, 'prediction': predictions})# Plot the actual values
# plt.plot(true_data['sample'], true_data['actual'], 'b-', label = 'actual')# Plot the predicted values
# plt.plot(predictions_data['sample'], predictions_data['prediction'], 'ro', label = 'prediction')
# plt.xticks(rotation = '60');
# plt.legend()# Graph labels
# plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values')
# --- Visualization: actual vs. predicted clearance times ---
fig, ax = plt.subplots()
# Scatter both series against patient index so they overlay directly.
# (Typo fix: "Clearence" -> "Clearance" in the user-facing labels/title.)
ax.scatter(range(len(test_labels)), test_labels, label="Actual Clearance")
ax.scatter(range(len(predictions)), predictions, label="Predictions")
ax.set(xlabel='Patients', ylabel='Time',
       title='Mok Data Set Malaria Clearance Predictions')
ax.legend()
plt.show()