import json
import os
import pickle
from string import punctuation
import pandas as pd
from sqlalchemy import create_engine
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
# Set up classes
class WordCount(BaseEstimator, TransformerMixin):
'''
Custom scikit-learn transformer to count the number of words in text.
'''
def word_count(self, text):
table = text.maketrans(dict.fromkeys(punctuation))
words = word_tokenize(text.lower().strip().translate(table))
return len(words)
def fit(self, x, y=None):
return self
def transform(self, x):
count = pd.Series(x).apply(self.word_count)
return pd.DataFrame(count)
class CharacterCount(BaseEstimator, TransformerMixin):
'''
Custom scikit-learn transformer to count the number of characters in text,
including spaces and punctuation.
'''
def character_count(self, text):
return len(text)
def fit(self, x, y=None):
return self
def transform(self, x):
count = pd.Series(x).apply(self.character_count)
return pd.DataFrame(count)
class NounCount(BaseEstimator, TransformerMixin):
    '''
    Custom scikit-learn transformer to count the number of nouns in text.
    Sentences are split with sent_tokenize, tokenized with the custom
    "tokenize" function, and POS-tagged with nltk; tokens tagged as singular
    nouns ('NN') or personal pronouns ('PRP') are counted.
    '''
def noun_count(self, text):
count = 0
sentence_list = sent_tokenize(text)
for sentence in sentence_list:
pos_tags = nltk.pos_tag(tokenize(sentence))
for _, tag in pos_tags:
if tag in ['PRP', 'NN']:
count += 1
return count
def fit(self, x, y=None):
return self
def transform(self, x):
count = pd.Series(x).apply(self.noun_count)
return pd.DataFrame(count)
class VerbCount(BaseEstimator, TransformerMixin):
    '''
    Custom scikit-learn transformer to count the number of verbs in text.
    Sentences are split with sent_tokenize, tokenized with the custom
    "tokenize" function, and POS-tagged with nltk; tokens tagged as base-form
    or non-third-person present verbs ('VB', 'VBP') are counted.
    '''
def verb_count(self, text):
count = 0
sentence_list = sent_tokenize(text)
for sentence in sentence_list:
pos_tags = nltk.pos_tag(tokenize(sentence))
for _, tag in pos_tags:
if tag in ['VB', 'VBP']:
count += 1
return count
def fit(self, x, y=None):
return self
def transform(self, x):
count = pd.Series(x).apply(self.verb_count)
return pd.DataFrame(count)
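# A quick illustration of the custom count transformers (a minimal sketch; the
# sample string is an assumption, and noun/verb counts depend on nltk's
# tokenizer and POS tagger):
#
#     WordCount().transform(["We need water and food"])
#     # -> one-column DataFrame of word counts, e.g. [[5]]
#     CharacterCount().transform(["We need water and food"])
#     # -> one-column DataFrame of character counts, e.g. [[22]]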
# Create functions
def load_data(database_filepath):
    '''
    Load the 'messages' table from the database and return X (message text),
    Y (category labels), and the category names.
    '''
engine_location = 'sqlite:///' + database_filepath
engine = create_engine(engine_location)
df = pd.read_sql_table('messages', engine)
X = df['message']
Y = df.loc[:, 'related':'direct_report']
category_names = Y.columns
return X, Y, category_names
def tokenize(text):
    '''
    Tokenize text: lowercase and strip it, remove punctuation and stop words,
    lemmatize nouns and verbs with nltk's WordNetLemmatizer, and stem with
    nltk's PorterStemmer.
    '''
table = text.maketrans(dict.fromkeys(punctuation))
words = word_tokenize(text.lower().strip().translate(table))
words = [word for word in words if word not in stopwords.words('english')]
lemmed = [WordNetLemmatizer().lemmatize(word) for word in words]
lemmed = [WordNetLemmatizer().lemmatize(word, pos='v') for word in lemmed]
stemmed = [PorterStemmer().stem(word) for word in lemmed]
return stemmed
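# Example of the tokenizer in isolation (illustrative; the exact stems depend
# on the installed nltk data and versions):
#
#     tokenize("Water is needed urgently!")
#     # -> roughly ['water', 'need', 'urgent']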
def build_model(X_train, Y_train, params=None):
    '''
    Create a multi-output Random Forest classifier machine learning pipeline
    for natural language processing with tfidf, word_count, character_count,
    noun_count, and verb_count features. If params are provided, a grid search
    is run for optimization. Default params are max_df=0.5, max_features=5000,
    ngram_range=(1, 2), use_idf=False for the tfidf feature and
    min_samples_split=25, max_depth=500, n_estimators=300 for the classifier.
    Args:
        X_train: Array-like. Text to be analyzed.
        Y_train: Array-like. Classification labels.
        params: Optional. Dictionary. Range of parameters to search with grid
            search.
    Returns:
        Fitted multi-output Random Forest classifier (a fitted GridSearchCV
        object if params are provided).
    '''
if not params:
model = Pipeline([
("features", FeatureUnion([
("text", TfidfVectorizer(tokenizer=tokenize, max_df=0.5,
max_features=5000, ngram_range=(1, 2),
use_idf=False)),
("word_count", WordCount()),
("character_count", CharacterCount()),
("noun_count", NounCount()),
("verb_count", VerbCount())
])),
("clf", MultiOutputClassifier(RandomForestClassifier(
min_samples_split=25, random_state=42,
max_depth=500, n_estimators=300)))
])
else:
pipeline = Pipeline([
("features", FeatureUnion([
("text", TfidfVectorizer(tokenizer=tokenize)),
("word_count", WordCount()),
("character_count", CharacterCount()),
("noun_count", NounCount()),
("verb_count", VerbCount())
])),
("clf", MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])
scorer = make_scorer(f1_score, average='micro')
model = GridSearchCV(pipeline, params, cv=5, n_jobs=3, scoring=scorer)
print('Training model...')
model.fit(X_train, Y_train)
return model
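# Example parameter dictionary that could be passed to build_model for grid
# search (a minimal sketch; the keys follow scikit-learn's "<step>__<param>"
# naming for this pipeline, and the candidate values are assumptions rather
# than tuned defaults):
#
#     example_params = {
#         'features__text__max_df': [0.5, 0.75],
#         'features__text__ngram_range': [(1, 1), (1, 2)],
#         'clf__estimator__n_estimators': [100, 300],
#         'clf__estimator__min_samples_split': [2, 25]
#     }
#     model = build_model(X_train, Y_train, params=example_params)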
def evaluate_model(model, X_test, Y_test, col_names):
    '''
    Print the precision, recall, and f1-score for each category of a
    multi-output classification.
    '''
y_preds = model.predict(X_test)
for label, pred, col in zip(Y_test.values.transpose(), y_preds.transpose(),
col_names):
print(col)
print(classification_report(label, pred))
def save_model(model, model_filepath):
    '''
    Pickle the model to the specified location.
    '''
    # Assume a maximum depth of one directory for the location
    if '/' in model_filepath:
        folder_name = model_filepath.split('/')[0]
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
with open(model_filepath, 'wb') as file:
pickle.dump(model, file)
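# A minimal sketch of loading the pickled model back for prediction (the file
# path is an example; unpickling requires the custom transformer classes and
# the tokenize function from this module to be importable):
#
#     with open('models/model.pkl', 'rb') as file:
#         model = pickle.load(file)
#     model.predict(['we need water and medical supplies'])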
def main(database_filepath, model_filepath, params):
    '''
    Extract data from the database, train a multi-output Random Forest
    classifier, print evaluation statistics, and save the model.
    '''
print('Loading data...\n DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print('Building model...')
model = build_model(X_train, Y_train, params)
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)
print('Saving model...\n MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)
print('Trained model saved!')
if __name__ == '__main__':
# Create argparser
import argparse
    parser = argparse.ArgumentParser(description='Train an ML/NLP pipeline to categorize messages')
parser.add_argument("database_filepath", help="File path for database")
parser.add_argument("model_filepath", help="File path for saving model")
    parser.add_argument('-d', '--params_dict',
                        help='Dictionary of model parameters for grid search. '
                             'Pass as a JSON string with values in lists, '
                             'e.g. \'{"key": [value(s)]}\'. To see the '
                             'available params, use "classifier.py '
                             'database/filepath model/filepath -p"',
                        type=json.loads)
parser.add_argument('-p', '--available_params', action='store_true',
help='Details of model parameter keys')
args = parser.parse_args()
if args.available_params:
pipeline = Pipeline([
("features", FeatureUnion([
("text", TfidfVectorizer(tokenizer=tokenize)),
("word_count", WordCount()),
("character_count", CharacterCount()),
("noun_count", NounCount()),
("verb_count", VerbCount())
])),
("clf", MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])
print(pipeline.get_params())
else:
main(database_filepath=args.database_filepath,
model_filepath=args.model_filepath, params=args.params_dict)
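# Example command-line usage (illustrative file paths):
#
#     python classifier.py data/messages.db models/model.pkl
#     python classifier.py data/messages.db models/model.pkl \
#         -d '{"clf__estimator__n_estimators": [100, 300]}'
#     python classifier.py data/messages.db models/model.pkl -p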