-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaive_bayes_trainer.py
58 lines (49 loc) · 1.71 KB
/
naive_bayes_trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import re
import numpy as np
from mrjob.job import MRJob
from operator import itemgetter
from mrjob.step import MRStep
# Pre-compiled pattern matching runs of one or more word characters (\w);
# used to pull individual words out of a blog post's text.
word_search_re = re.compile(r"[\w]+")
class NaiveBayesTrainer(MRJob):
    """MapReduce job that emits, for every word, a gender -> frequency mapping.

    Each input line is expected to start with a gender token followed by the
    blog-post text. The job runs in two steps (see ``steps``).
    """

    def steps(self):
        """Return the two MapReduce steps of the job.

        Step 1 maps every word of a post to a ``(gender, word, post_length)``
        key and reduces the occurrence counts to a relative frequency.
        Step 2 gathers the per-gender frequencies of each word into one dict.
        """
        return [
            MRStep(mapper=self.extract_words_mapping,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.compare_words_reducer),
        ]

    def extract_words_mapping(self, key, value):
        """Yield ``((gender, word, post_length), 1)`` for each word of a post.

        :param key: input key; not used.
        :param value: one input line. Its first whitespace-separated token is
            evaluated as the gender, the rest (re-joined with single spaces)
            is evaluated as the blog-post text.
        :returns: generator of ``((gender, word, post_length), 1)`` pairs,
            where ``word`` is lower-cased and ``post_length`` is the number of
            words found in the post.
        """
        # Split the line on whitespace.
        tokens = value.split()
        if not tokens:
            # Blank line: there is no gender token to read, so skip it
            # instead of failing the whole job with an IndexError.
            return

        # SECURITY(review): eval() executes arbitrary expressions taken from
        # the input data. This is only acceptable for trusted input files;
        # consider ast.literal_eval if the fields are plain Python literals.
        gender = eval(tokens[0])
        blog_post = eval(" ".join(tokens[1:]))

        all_words = [word.lower() for word in word_search_re.findall(blog_post)]
        num_words = len(all_words)
        for word in all_words:
            yield (gender, word, num_words), 1

    def reducer_count_words(self, key, flag):
        """Turn the occurrence count of a key into a relative frequency.

        :param key: ``(gender, word, num_words)`` as emitted by the mapper.
        :param flag: iterable of the ``1`` markers emitted for this key.
        :returns: generator yielding ``word, (gender, frequency / num_words)``.
        """
        frequency = sum(flag)
        gender, word, num_words = key
        # Convert to float *before* dividing: dividing two ints first would
        # truncate the ratio to an integer under Python 2.
        yield word, (gender, float(frequency) / num_words)

    def compare_words_reducer(self, word, values):
        """Collect the per-gender values of one word into a single dict.

        :param word: the word being reduced.
        :param values: iterable of ``(gender, probability)`` pairs.
        :returns: generator yielding ``word, {gender: probability}``.
        """
        per_gender = {}
        for gender, probability in values:
            # NOTE(review): the step-1 key includes the post length, so the
            # same gender can show up here several times for one word; only
            # the last value seen is kept. Confirm this is intended.
            per_gender[gender] = probability
        yield word, per_gender
# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    NaiveBayesTrainer.run()