-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdash_app.py
508 lines (424 loc) · 20.2 KB
/
dash_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
##################################
### SENTIMENT ANALYSER WEB APP ###
##################################
# MICHAELA LAWRENCE 23/10/18
#This python script generates a web app that takes a SurveyMonkey Excel sheet output as an input (upload) and analyses the sentiment in the responses to each question that takes plain text as an input. The outputs generated by the web app are: sentiment analysis (compound scores) using VADER; word clouds for each question's responses; box plots of the compound score of each response to a question (response text is displayed on hover); and a pie chart displaying the ratios of positive, negative and neutral scores in a question.
#Importing dash and associated packages to build the web app and graphs within the web app
import dash_html_components as html
import dash_core_components as dcc
import dash
import plotly
import dash_table_experiments as dte
from dash.dependencies import Input, Output, State
import pandas as pd
import numpy as np
import json
import datetime
import operator
import os
import base64
import io
import plotly.graph_objs as go
#Importing vader sentiment analyser
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#Import wordcloud packages
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
#To generate a list of stop words that are not useful
import nltk
from nltk.corpus import stopwords
# Initialise the Dash web app
app = dash.Dash()
# Serve JS/CSS assets locally instead of from the Dash CDN
app.scripts.config.serve_locally = True
# Suppress exceptions for callbacks that reference components which are
# created/populated dynamically (table rows and dropdown options only
# exist after an upload)
app.config['suppress_callback_exceptions'] = True
##############
### LAYOUT ###
##############
# Top-level page layout. Component ids referenced by the callbacks below:
# upload-data, table, propagate-button, dropdown_table_filterColumn,
# output-container, vader-output-container, wordcloud-image-container,
# scatter_graph, pie_chart.
app.layout = html.Div([
    # UPLOAD BOX
    html.H5("Upload survey monkey excel files"),
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select Files')
        ]),
        style={
            'width': '98%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        # only a single file can be uploaded at a time
        multiple=False),
    html.Br(),
    # TABLE DISPLAYING QUESTION NUMBERS AND QUESTION TEXT.
    # rows starts as a single empty dict; populated by the upload callback.
    html.H5("Table of questions"),
    html.Div(dte.DataTable(rows=[{}], id='table')),
    html.Br(),
    # PROPAGATE BUTTON - required to (re)generate the dropdown options
    html.Button(
        id='propagate-button',
        n_clicks=0,
        children='Propagate Table Data'
    ),
    html.Br(),
    # DROPDOWN LIST of questions to analyse
    html.H5("Select which questions you would like to have analysed"),
    dcc.Dropdown(id='dropdown_table_filterColumn',
                 multi=True,
                 placeholder='Filter Column'),
    # Plain-text echo of the selected question numbers
    html.Div(id='output-container'),
    html.H5("Sentiment analysis"),
    # Plain-text VADER results
    html.Div(id='vader-output-container'),
    html.Br(),
    # WORD CLOUDS - one image containing a subplot per selected question
    html.H5('Wordclouds'),
    html.Div([html.Img(id='wordcloud-image-container', src='')], id='wordcloud-div', style={'width': '33%', 'display': 'inline-block'}),
    # COMPOUND SCORE BOX PLOTS - response text is displayed on hover.
    # (Typo fixed: heading previously read "Compund".)
    html.H5("Compound score box plots"),
    html.Div('Crosses are the median for each question and diamonds are the first and third quartiles'),
    html.Div(dcc.Graph(id='scatter_graph'), id='scatter_container'),
    # PIE CHART of positive/negative/neutral sentiment ratios.
    # (Typo fixed: heading previously read "Polorisation".)
    html.H5('Polarisation pie charts'),
    html.Div(dcc.Graph(id='pie_chart'), id='pie_container')
])
###############################
### FUNCTIONS AND CALLBACKS ###
###############################
# FILE UPLOAD
# Takes contents and filename from the upload input
def parse_contents(contents, filename):
    """Parse an uploaded SurveyMonkey Excel workbook into a question table.

    Parameters
    ----------
    contents : str
        Data-URI string from dcc.Upload: "<content type>,<base64 payload>".
    filename : str
        Name of the uploaded file; used to check it is an Excel (.xls) file.

    Returns
    -------
    pandas.DataFrame
        Columns 'question_num' (sheet name, e.g. "Question 1") and
        'question_str' (the question text), one row per sheet; or an
        html.Div error message when the file is not an Excel file.
    """
    # The upload contents are "<type>,<base64 string>" - keep the payload
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    if 'xls' not in filename:
        # Tell the user that they have not uploaded an Excel file.
        # (Typo fixed: message previously read "definitly ... file?.")
        return html.Div([
            'There was an error processing this file, did you definitely upload an Excel .xls file?'
        ])
    # One sheet per question; the sheet names ("Question 1", ...) label the rows
    x1 = pd.ExcelFile(io.BytesIO(decoded))
    q_question_lis = x1.sheet_names
    # Read only the question text (first cell of row 2) from every sheet.
    # NOTE(review): usecols=0 relies on old integer semantics; newer pandas
    # expects a list such as [0] - confirm against the pandas version in use.
    df_q_only = pd.read_excel(io.BytesIO(decoded), sheet_name=q_question_lis, header=None, usecols=0, skiprows=1, nrows=1)
    # sheet_name=<list> returns a dict of single-row DataFrames - stack them
    df_q_only = pd.concat(list(df_q_only.values()))
    # Replace the all-zero index with a sequential one
    df_q_only.index = list(range(0, len(q_question_lis)))
    df_q_only.columns = ['question_str']
    # Question numbers (sheet names) as a parallel column
    q_question_lis_df = pd.DataFrame(np.array(q_question_lis).reshape(df_q_only.shape), columns=['question_num'])
    # Question numbers side by side with the question text
    return pd.concat([q_question_lis_df, df_q_only], axis=1)
#Side note: app.callback works in the following way
#@app.callback(
# Output(component_id='', component_property=''),
# [Input(component_id='', component_property='')]
#)
#One can have as many inputs as they like but only one output (as long as they are all already written in the layout)
# CALLBACK: table creation
@app.callback(Output('table', 'rows'),
              [Input('upload-data', 'contents'),
               Input('upload-data', 'filename')])
def update_output(contents, filename):
    """Fill the questions table from the uploaded file.

    Returns a list of row dictionaries for the DataTable, or a single
    empty placeholder row while nothing has been uploaded yet.
    """
    # Nothing uploaded yet - keep the table empty (avoids warnings)
    if contents is None:
        return [{}]
    parsed = parse_contents(contents, filename)
    # DataTable rows take a list of dicts, one dict per row
    return parsed.to_dict('records')
# CALLBACK: update options of the filter dropdown.
# A button click is required because the options could not be updated
# reliably straight from the table contents.
@app.callback(Output('dropdown_table_filterColumn', 'options'),
              [Input('propagate-button', 'n_clicks'),
               Input('table', 'rows')])
def update_filter_column_options(n_clicks_update, tablerows):
    """Populate the question dropdown from the table rows.

    Returns a list of {'label', 'value'} dicts; values are the 1-based
    positions matching the "Question N" sheet names.
    """
    # Button not clicked yet - keep the dropdown empty
    if n_clicks_update < 1:
        return []
    dff = pd.DataFrame(tablerows)
    # BUG FIX: the table starts as a single empty row ([{}]), which yields
    # a DataFrame with no columns; indexing column 0 then raised IndexError.
    if dff.empty or dff.shape[1] == 0:
        return []
    # First column holds the question numbers ("Question 1", etc.)
    first_col = dff.iloc[:, 0]
    # Label with the question text; value is the 1-based row position.
    # (Series.items() replaces the deprecated iteritems(); debug prints removed.)
    return [{'label': label, 'value': index + 1} for index, label in first_col.items()]
# DISPLAY THE QUESTION NUMBERS THAT HAVE BEEN SELECTED
@app.callback(Output('output-container', 'children'),
              [Input('dropdown_table_filterColumn', 'value')])
def update_output_two(value):
    """Echo the currently selected question numbers as plain text."""
    selected = '{}'.format(value)
    return 'You have selected question numbers: "' + selected + '"'
# Single shared VADER sentiment analyser instance, used by all callbacks below
analyzer = SentimentIntensityAnalyzer()
# DISPLAY THE VADER RESULTS
@app.callback(Output('vader-output-container', 'children'),
              [Input('upload-data', 'contents'),
               Input('dropdown_table_filterColumn', 'value')])
def update_output_vader(contents, value):
    """Run VADER over the selected questions and describe the results.

    For each selected question number, reads the question text and the
    free-text responses from the uploaded workbook, scores every response
    with VADER and appends the mean compound score plus a verbal summary
    to the returned text.
    """
    if not (contents and value is not None):
        # Dropdown not selected yet
        return html.Div([
            'No sentiment yet.'
        ])
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    # Accumulated summary text, one sentence per question
    bizz = str()
    for i in value:
        # Sheet to look at, chosen from the dropdown values
        sheet_no = 'Question ' + str(i)
        # Question text lives in the first cell of row 2 of the sheet
        q_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=None, usecols=0, skiprows=1, nrows=1)
        # Free-text responses start after a 7-row header block
        response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=7, index_col=0, usecols=2)
        response_df = response_df.dropna()
        # Rename so the column has no spaces or capital letters
        response_df.rename(columns={'Response Text': 'response_text'}, inplace=True)
        response_df = response_df.reset_index(drop=True)
        # Score every response; polarity_scores returns a dict with
        # 'neg', 'neu', 'pos' and 'compound' keys
        response_df['polarity_scores'] = response_df['response_text'].apply(lambda response_text: analyzer.polarity_scores(response_text))
        # Expand the score dict into separate columns and drop the dict column
        objs = [response_df, pd.DataFrame(response_df['polarity_scores'].tolist()).iloc[:, :4]]
        response_df = pd.concat(objs, axis=1).drop('polarity_scores', axis=1)
        # Average compound score across the responses - to be displayed
        av_compound_score = response_df['compound'].mean()
        # Verbal summary; compound scores lie in [-1, 1]
        if av_compound_score >= 0.7:
            av_compound_score_string = 'overwhelmingly positive'
        elif av_compound_score >= 0.3:
            av_compound_score_string = 'pretty positive'
        elif av_compound_score >= -0.3:
            av_compound_score_string = 'fairly neutral'
        # BUG FIX: this branch previously re-tested ">= 0.3" and was
        # unreachable, so "pretty negative" was never reported
        elif av_compound_score >= -0.7:
            av_compound_score_string = 'pretty negative'
        else:
            av_compound_score_string = 'overwhelmingly negative'
        # Append this question's summary sentence
        bizz = bizz + 'Your sentiment analyser score for Question {}'.format(i) + ': "{}"'.format(q_df[0][0]) + ' is : {0:.3f}, '.format(av_compound_score) + '{}.'.format(av_compound_score_string)
    return bizz
### FIG TO URI FUNCTION
def fig_to_uri(in_fig, close_all=True, **save_args):
    # type: (plt.Figure) -> str
    """Render a matplotlib artist's figure to a base64 PNG data URI.

    :param in_fig: matplotlib artist whose ``.figure`` is saved.
    :param close_all: when True, clear the figure and close all pyplot
        figures after saving.
    :param save_args: extra keyword arguments forwarded to ``savefig``.
    :return: ``data:image/png;base64,...`` string for an <img> src.
    """
    buffer = io.BytesIO()
    # Render the PNG into the in-memory buffer instead of onto disk
    in_fig.figure.savefig(buffer, format='png', **save_args)
    if close_all:
        in_fig.figure.clf()
        plt.close('all')
    # Rewind before reading the rendered bytes back out
    buffer.seek(0)
    payload = base64.b64encode(buffer.read()).decode("ascii").replace("\n", "")
    return "data:image/png;base64,{}".format(payload)
#stopwords = set(STOPWORDS)
# NLTK English stop words. NOTE(review): this assignment rebinds (shadows)
# the `nltk.corpus.stopwords` module imported above with a plain list.
stopwords = stopwords.words('english')
# Apostrophied words must be added because NLTK splits them in its stopword
# list; "Nothing" is added because some people just write 'Nothing' in the box
stopwords.extend(["Nothing", "would", "I've"])
def wordcloud_subfig_builder(text):
    """Build and return a WordCloud for *text* using the shared stop-word list.

    BUG FIX: the generated cloud was previously discarded and the function
    returned None, making it useless; it now returns the WordCloud object.
    No caller in this file used the old (None) return value, so this is
    backward-compatible.
    """
    return WordCloud(stopwords=stopwords, max_font_size=150, max_words=100,
                     background_color="white", height=400, width=800).generate(text)
@app.callback(Output('wordcloud-image-container', 'src'),
              [Input('upload-data', 'contents'),
               Input('dropdown_table_filterColumn', 'value')])
def build_wordcloud_image(contents, value):
    """Build a vertical column of word clouds, one per selected question.

    Returns the rendered figure as a base64 PNG data URI for the <img>
    src; a placeholder image is returned before anything is selected.
    """
    if not (contents and value is not None):
        # Nothing selected yet - show the placeholder image.
        # BUG FIX: use a context manager so the file handle is closed
        # (the original open() call leaked it).
        with open('placeholder.PNG', 'rb') as placeholder:
            encoded_image = base64.b64encode(placeholder.read())
        return 'data:image/png;base64,{}'.format(encoded_image.decode())
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    sheet_nos = ['Question ' + str(i) for i in value]
    # sheet_name=<list> returns a dict of DataFrames keyed by sheet name
    response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_nos, header=7, index_col=0, usecols=2)
    ax1 = None
    for i, v in enumerate(response_df):
        to_be_cloud_df = response_df.get(v).dropna()
        # Join all of the responses for this question into one large string
        text = " ".join(to_be_cloud_df['Response Text'])
        wordcloud = WordCloud(stopwords=stopwords, max_font_size=150, max_words=90, background_color="white", height=350, width=700).generate(text)
        # One subplot per question, stacked vertically
        ax1 = plt.subplot(len(response_df), 1, i + 1)
        # No need for meaningless axes on an image
        ax1.axis("off")
        # Title each subplot with 'Question N'
        ax1.set_title(list(response_df.keys())[i])
        ax1.imshow(wordcloud, interpolation="bilinear")
        # (A manual "i = i + 1" here was a no-op under enumerate and has
        # been removed.)
    # Encode the whole figure (all subplots) via the last axes drawn
    return fig_to_uri(ax1)
# Green text function to be used for positive wordclouds
def green_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random medium-lightness green as an hsl() colour string."""
    lightness = np.random.randint(49, 51)  # 49 or 50 (upper bound exclusive)
    return "hsl(132,100%, {}%)".format(lightness)
# Red text function to be used for negative wordclouds
def red_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random medium-lightness red as an hsl() colour string."""
    lightness = np.random.randint(49, 51)  # 49 or 50 (upper bound exclusive)
    return "hsl(0,100%, {}%)".format(lightness)
#################
### BOX PLOTS ###
#################
# Initialising some lists.
# NOTE(review): these two lists appear unused anywhere in this file -
# candidates for removal once confirmed.
empty_x =[]
empty_y = []
@app.callback(
    Output('scatter_graph', 'figure'),
    [Input('upload-data', 'contents'),
     Input('dropdown_table_filterColumn', 'value')])
def update_graph(contents, value):
    """Box-plot-style scatter of compound scores per selected question.

    One marker per response (hover shows the response text), plus a cross
    at the median and diamonds at the first/third quartiles.
    """
    # BUG FIX: before an upload/selection the inputs are None; the original
    # had this guard commented out and crashed with an AttributeError on
    # the initial page load. Return an empty figure instead.
    if not (contents and value):
        return {'data': [], 'layout': {}}
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    the_list_of_data = []
    for i in value:
        sheet_no = 'Question ' + str(i)
        # Free-text responses start after a 7-row header block.
        # (An unused read of the question-text cell was removed.)
        response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=7, index_col=0, usecols=2)
        response_df = response_df.dropna()
        response_df.rename(columns={'Response Text': 'response_text'}, inplace=True)
        response_df = response_df.reset_index(drop=True)
        # Score every response with VADER
        response_df['polarity_scores'] = response_df['response_text'].apply(lambda response_text: analyzer.polarity_scores(response_text))
        # Expand the score dict into separate columns and drop the dict column
        objs = [response_df, pd.DataFrame(response_df['polarity_scores'].tolist()).iloc[:, :4]]
        response_df = pd.concat(objs, axis=1).drop('polarity_scores', axis=1)
        # One marker per response, all at x = question number;
        # hover text is the response itself
        the_list_of_data.append({
            'x': [i] * response_df.shape[0],
            'y': response_df['compound'],
            'text': response_df['response_text'],
            'mode': 'markers',
            'marker': {
                'size': 15,
                'opacity': 0.5,
                'line': {'width': 0.5, 'color': 'white'}
            }})
        # Median marker (cross)
        the_list_of_data.append({
            'x': [i],
            'y': np.percentile(response_df['compound'], [50]),
            'mode': 'markers',
            'marker': {
                'size': 20,
                'opacity': 0.9,
                'symbol': 'cross'}
        })
        # First and third quartiles (diamonds)
        the_list_of_data.append({
            'x': [i] * 2,
            'y': np.percentile(response_df['compound'], [25, 75]),
            'mode': 'markers',
            'marker': {
                'size': 18,
                'opacity': 0.9,
                'symbol': 'diamond'
            }
        })
    return {
        'data': the_list_of_data,
        'layout': {
            'xaxis': {
                'title': 'Question number',
                'type': 'linear'
            },
            'yaxis': {
                'title': 'Compound score',
                'type': 'linear',
                'range': [-1, 1]
            },
            'margin': {'l': 40, 'b': 40, 't': 10, 'r': 0},
            'hovermode': 'closest',
            'showlegend': False}
    }
#################
### PIE CHART ###
#################
@app.callback(
    Output('pie_chart', 'figure'),
    [Input('upload-data', 'contents'),
     Input('dropdown_table_filterColumn', 'value')])
def update_pie_chart(contents, value):
    """Pie charts of mean positive/negative/neutral ratios per question.

    NOTE(review): renamed from ``update_graph`` - the original duplicated
    and shadowed the box-plot callback's name. Dash registers callbacks
    via the decorator, so nothing calls this function by name.
    """
    # Guard: inputs are None before an upload/selection
    if not (contents and value):
        return {'data': [], 'layout': {}}
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    the_list_of_data_pie = []
    n_pies = len(value)
    for idx, i in enumerate(value):
        sheet_no = 'Question ' + str(i)
        # Free-text responses start after a 7-row header block.
        # (An unused read of the question-text cell was removed.)
        response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=7, index_col=0, usecols=2)
        response_df = response_df.dropna()
        response_df.rename(columns={'Response Text': 'response_text'}, inplace=True)
        response_df = response_df.reset_index(drop=True)
        response_df['polarity_scores'] = response_df['response_text'].apply(lambda response_text: analyzer.polarity_scores(response_text))
        # Expand the score dict into separate columns and drop the dict column
        objs = [response_df, pd.DataFrame(response_df['polarity_scores'].tolist()).iloc[:, :4]]
        response_df = pd.concat(objs, axis=1).drop('polarity_scores', axis=1)
        # One pie per question, laid out side by side.
        # BUG FIX: the original set 'domain' x to [0, i], which is invalid
        # (domain fractions must lie in [0, 1]); equal-width fractions of
        # the plot are used instead.
        the_list_of_data_pie.append({
            'values': [response_df['pos'].mean(), response_df['neg'].mean(), response_df['neu'].mean()],
            'labels': ['positive', 'negative', 'neutral'],
            'type': 'pie',
            'name': sheet_no,
            'domain': {'x': [float(idx) / n_pies, float(idx + 1) / n_pies]}
        })
    # BUG FIX: a figure takes a single layout dict; the original appended
    # one dict per question and passed the list, which plotly rejects.
    return {
        'data': the_list_of_data_pie,
        'layout': {
            'margin': {'l': 40, 'b': 40, 't': 10, 'r': 0},
            'hovermode': 'closest',
            'showlegend': False}
    }
# FORMATTING PURPOSES ONLY - external stylesheet for basic styling;
# the app still works without it.
app.css.append_css({
"external_url": "https://codepen.io/chriddyp/pen/bWLwgP.css"
})
# Run the Dash development server when executed as a script.
# debug=True enables hot reload and the in-browser error console.
if __name__ == '__main__':
    app.run_server(debug=True)