-
Notifications
You must be signed in to change notification settings - Fork 1
/
RF.py
46 lines (41 loc) · 1.16 KB
/
RF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# imports and definitions
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
#import sys
#file_path = sys.argv[1]
#text = sys.argv[2]
# Single seed reused by every randomized step below (the split and the
# forest) so that repeated runs give the same result.
seed_number = 123

# Load the dataset into a DataFrame; the first row of the CSV is the header.
filename = "cleaned_data.csv"
df = pd.read_csv(filename, header=0)

# Predictors are the first four columns (selected by position); the class
# label is the 'species' column, converted to a NumPy array.
features = df.iloc[:, :4]
targets = df["species"].to_numpy()

# Hold out 30% of the rows for testing. The split is stratified on the
# labels and seeded for reproducibility.
test_proportion = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=test_proportion,
    random_state=seed_number,
    stratify=targets,
)
# Number of trees grown in the forest.
n_trees = 100

# Build the classifier and train it on the training split in one step
# (fit() returns the fitted estimator). oob_score=True asks the forest to
# also compute its out-of-bag score, which is reported later via
# rf.oob_score_; the shared seed keeps training reproducible.
rf = RandomForestClassifier(
    n_estimators=n_trees,
    oob_score=True,
    random_state=seed_number,
).fit(X_train, y_train)
# Score the trained model on the held-out test split.
predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

# Append the report to output.txt instead of overwriting it. Because the
# file is opened in append mode, the report must end with a newline;
# otherwise the next block appended to the file would start on the same
# line as the accuracy figure.
with open("output.txt", "a", encoding="utf-8") as f:
    f.write(
        "Result from random forest:\n"
        "Out-of-bag score estimate: {0:.3f}\n"
        "Mean accuracy score: {1:.3f}\n".format(rf.oob_score_, accuracy)
    )