#!/usr/bin/env python
"""
Filename: evaluate.py
Author: Emily Daniels
Date: May 2016
Purpose: Uses a Naive Bayes classifier to determine whether the poems
generated by a recurrent neural network trained on a collection of one
author's novels also reflect the writing style of a second author, and
retains the poems at that intersection.
"""
from __future__ import print_function

import csv
import itertools
import os
import re
from datetime import datetime

import nltk
from nltk.classify import PositiveNaiveBayesClassifier

from generate import Generate


def split_text(filename):
    """Read a one-column CSV of text and return its sentences, lowercased."""
    with open(filename, 'rU') as f:
        reader = csv.reader(f, skipinitialspace=True)
        next(reader)  # skip the header row
        # extra decoding to account for non UTF-8 characters
        sentences = itertools.chain(*[nltk.sent_tokenize(
            x[0].decode('latin-1').encode('utf-8').decode('utf-8').lower())
            for x in reader])
    return sentences


def create_word_emotion_map(filename):
    """Parse the NRC Emotion Lexicon into a map of indexed word records."""
    w_e_map = {}
    # each lexicon line has the format: aback \t anger \t 0
    with open(filename, 'rU') as f:
        rows = csv.reader(f, delimiter='\t')
        for i, (target_word, category, association) in enumerate(rows):
            w_e_map[i] = {'word': target_word.decode('utf-8').lower(),
                          'category': category,
                          'association': association}
    return w_e_map


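# For example, the lexicon line "aback\tanger\t0" (from the format comment
# above) is stored as
#   w_e_map[0] == {'word': u'aback', 'category': 'anger', 'association': '0'}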
def extract_words_for_emotions(emotions, w_e_map):
    """Collect the lexicon words associated with the requested emotions."""
    w_e_list = []
    for emotion in emotions:
        for v in w_e_map.itervalues():
            if emotion == v['category'] and v['association'] != '0':
                w_e_list.append(v['word'])
    return w_e_list


def features(sentence):
    """Map a sentence to the bag-of-words featureset NLTK classifiers expect."""
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


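# For example, features("The sun sets") returns
#   {'contains(the)': True, 'contains(sun)': True, 'contains(sets)': True}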
def write_poems(emotions, train_file, compare_file, poems):
    """Write the filtered poems to a timestamped text file."""
    filename = 'Poems_of_%s_by_%s_ala_%s_%s.txt' % (
        "_".join(emotions),
        os.path.splitext(os.path.basename(train_file))[0],
        os.path.splitext(os.path.basename(compare_file))[0],
        datetime.now().strftime("%Y-%m-%d %H-%M-%S"))
    with open(filename, "wb") as f:
        for poem in poems:
            f.write("%s\n" % poem)


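# With the defaults below this yields a name such as (timestamp is only an
# example): "Poems_of_anger_sadness_by_austen_ala_h.g.wells_2016-04-19 14-09-39.txt"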
if __name__ == "__main__":
    # define the parameters used to train the model; standard values are used
    # here. They can be changed when first training a model, but if using a
    # pre-trained model they must match the values used for that training
    vocabulary_size = 8000
    hidden_dim = 80
    learning_rate = 0.005
    nepoch = 100
    # this parameter is set to False so that the most recent model file will
    # be loaded and used; if the model has not been trained, set it to True
    # to start the training
    enable_training = False
    # a csv file of the first author's novels stripped of anything
    # not part of the text
    train_file = 'data/austen.csv'
    # a csv file of the second author's novels stripped of anything
    # not part of the text
    compare_file = 'data/h.g.wells.csv'
    # either the most recently trained model file, or None to start a new
    # training run
    model_file = 'data/rnn-austen-80-8000-2016-04-19-14-09-39.npz'
    # NRC Emotion Lexicon: http://www.saifmohammad.com/WebPages/lexicons.html
    word_emolex = 'data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt'
    # list of emotions to use as a filter for the poems generated
    emotions = ['anger', 'sadness']
    # the number of sentences the model should generate and use to select
    # haiku-worthy lines
    num_sentences = 1000
    # generate haiku poems from the first author's novels
    print("Generating poems...")
    generate = Generate(vocabulary_size, hidden_dim, learning_rate, nepoch,
                        enable_training, model_file, train_file,
                        num_sentences)
    poems = generate.poems
    print("Creating word to emotion map...")
    w_e_map = create_word_emotion_map(word_emolex)
    specific_word_emo_list = extract_words_for_emotions(emotions, w_e_map)
    print("Filtering through specific emotions...")
    filtered_emotion_poem_list = []
    for word in specific_word_emo_list:
        # iterate over a copy: removing from a list while iterating over it
        # silently skips the element after each removal
        for poem in list(poems):
            # escape the word so lexicon entries are matched literally
            if re.search(re.escape(word), poem):
                filtered_emotion_poem_list.append(poem)
                poems.remove(poem)
    # train the classifier to distinguish between the different writing styles
    print("Training classifier...")
    first_author_sentences = split_text(train_file)
    second_author_sentences = split_text(compare_file)
    positive_featuresets = list(map(features, second_author_sentences))
    unlabeled_featuresets = list(map(features, first_author_sentences))
    classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                    unlabeled_featuresets)
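    # PositiveNaiveBayesClassifier is NLTK's semi-supervised Naive Bayes
    # variant: it trains on positive examples (the second author's sentences)
    # plus unlabeled ones (the first author's), and classify() returns True
    # for featuresets that resemble the positive set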
print("Filtering through specific influences...")
filtered_influences_poem_list = []
for poem in filtered_emotion_poem_list:
if classifier.classify(features(poem)):
filtered_influences_poem_list.append(poem)
# write the filtered list of poems to a text file
print("Writing poems to file...")
write_poems(emotions, train_file, compare_file,
filtered_influences_poem_list)