evaluate.py

#! /usr/bin/env python
"""
Filename: evaluate.py
Author: Emily Daniels
Date: May 2016
Purpose: A Naive Bayes classifier determines if the poems generated by a
recurrent neural network trained on a collection of one author's novels would
also reflect the writing style of a second author and retains the poems at
that intersection.
"""
from __future__ import print_function
from nltk.classify import PositiveNaiveBayesClassifier
import csv
import re
import os
import nltk
import itertools
from generate import Generate
from datetime import datetime


def split_text(filename):
    with open(filename, 'rU') as f:
        reader = csv.reader(f, skipinitialspace=True)
        reader.next()
        # extra decoding to account for non UTF-8 characters
        sentences = itertools.chain(*[nltk.sent_tokenize(
            x[0].decode('latin-1').encode('utf-8').decode('utf-8').lower()) for
                                      x in reader])
    return sentences


def create_word_emotion_map(filename):
    i = 0
    w_e_map = {}
    # format: aback \t anger \t 0
    with open(filename, 'rU') as f:
        rows = csv.reader(f, delimiter='\t')
        for target_word, category, association in rows:
            word = {'word': target_word.decode('utf-8').lower(),
                    'category': category,
                    'association': association}
            w_e_map[i] = word
            i += 1
    return w_e_map


def extract_words_for_emotions(emotions, w_e_map):
    w_e_list = []
    for emotion in emotions:
        for v in w_e_map.itervalues():
            if emotion == v['category'] and v['association'] != '0':
                w_e_list.append(v['word'])
    return w_e_list


def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


def write_poems(emotions, train_file, compare_file, poems):
    filename = 'Poems_of_' + "_".join(emotions) + '_by_' + os.path.splitext(
        os.path.basename(train_file))[0] + '_ala_' + \
               os.path.splitext(os.path.basename(compare_file))[
                   0] + '_' + str(
        datetime.now().strftime("%Y-%m-%d %H-%M-%S")) + '.txt'
    with open(filename, "wb") as f:
        for poem in poems:
            f.write("%s\n" % poem)


if __name__ == "__main__":
    # define the parameters to be used to train the model-
    # here some standard values are used
    # these can be changed when first training a model, but if using a
    # pre-trained model they must match what was used
    vocabulary_size = 8000
    hidden_dim = 80
    learning_rate = 0.005
    nepoch = 100

    # this parameter is set to false so that the most recent model file will be
    # loaded and used- if the model has not been trained, set it to true to
    # start the training
    enable_training = False

    # create a csv file of the first author's novels stripped of anything
    # not part of the text
    train_file = 'data/austen.csv'

    # create a csv file of the second author's novels stripped of anything
    # not part of the text
    compare_file = 'data/h.g.wells.csv'

    # either the most recently trained file or use none to start a new training
    model_file = 'data/rnn-austen-80-8000-2016-04-19-14-09-39.npz'

    # NRC Emotion Lexicon: http://www.saifmohammad.com/WebPages/lexicons.html
    word_emolex = 'data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt'

    # list of emotions to use as a filter for the poems generated
    emotions = ['anger', 'sadness']

    # the number of sentences the model should generate and use to select haiku
    # worthy lines
    num_sentences = 1000

    # generate haiku poems from the first author's novels
    print("Generating poems...")
    generate = Generate(vocabulary_size, hidden_dim, learning_rate, nepoch,
                        enable_training, model_file, train_file, num_sentences)
    poems = generate.poems

    print("Creating word to emotion map...")
    w_e_map = create_word_emotion_map(word_emolex)
    specific_word_emo_list = extract_words_for_emotions(emotions, w_e_map)

    print("Filtering through specific emotions...")
    filtered_emotion_poem_list = []
    for word in specific_word_emo_list:
        for poem in poems:
            if re.search(word, poem):
                filtered_emotion_poem_list.append(poem)
                poems.remove(poem)

    # train the classifier to distinguish between the different writing styles
    print("Training classifier...")
    first_author_sentences = split_text(train_file)
    second_author_sentences = split_text(compare_file)
    positive_featuresets = list(map(features, second_author_sentences))
    unlabeled_featuresets = list(map(features, first_author_sentences))
    classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                    unlabeled_featuresets)

    print("Filtering through specific influences...")
    filtered_influences_poem_list = []
    for poem in filtered_emotion_poem_list:
        if classifier.classify(features(poem)):
            filtered_influences_poem_list.append(poem)

    # write the filtered list of poems to a text file
    print("Writing poems to file...")
    write_poems(emotions, train_file, compare_file,
                filtered_influences_poem_list)