-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdash_app.py
508 lines (424 loc) · 20.2 KB
/
dash_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
##################################
### SENTIMENT ANALYSER WEB APP ###
##################################
# MICHAELA LAWRENCE 23/10/18
#This python script generates a web app that takes a SurveyMonkey Excel sheet output as an input (upload) and analyses the sentiment in the responses to each question that takes plain text as an input. The outputs generated by the web app are: sentiment analysis (compound scores) using VADER; word clouds for each question's responses; box plots of the compound score of each response to a question (response text is displayed on hover); and a pie chart displaying the ratios of positive, negative and neutral scores in a question.
#Importing dash and associated packages to build the web app and graphs within the web app
import dash_html_components as html
import dash_core_components as dcc
import dash
import plotly
import dash_table_experiments as dte
from dash.dependencies import Input, Output, State
import pandas as pd
import numpy as np
import json
import datetime
import operator
import os
import base64
import io
import plotly.graph_objs as go
#Importing vader sentiment analyser
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#Import wordcloud packages
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
#To generate a list of stop words that are not useful
import nltk
from nltk.corpus import stopwords
# Initialise the Dash web app
app = dash.Dash()
# Serve JS/CSS assets locally instead of from the Dash CDN
app.scripts.config.serve_locally = True
# Suppress exceptions for callbacks that reference components which are
# created/populated dynamically (table rows and dropdown options only
# exist after an upload)
app.config['suppress_callback_exceptions'] = True
##############
### LAYOUT ###
##############
# Top-level page layout. Component ids referenced by the callbacks below:
# upload-data, table, propagate-button, dropdown_table_filterColumn,
# output-container, vader-output-container, wordcloud-image-container,
# scatter_graph, pie_chart.
app.layout = html.Div([
    # UPLOAD BOX
    html.H5("Upload survey monkey excel files"),
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select Files')
        ]),
        style={
            'width': '98%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        # only a single file can be uploaded at a time
        multiple=False),
    html.Br(),
    # TABLE DISPLAYING QUESTION NUMBERS AND QUESTION TEXT.
    # rows starts as a single empty dict; populated by the upload callback.
    html.H5("Table of questions"),
    html.Div(dte.DataTable(rows=[{}], id='table')),
    html.Br(),
    # PROPAGATE BUTTON - required to (re)generate the dropdown options
    html.Button(
        id='propagate-button',
        n_clicks=0,
        children='Propagate Table Data'
    ),
    html.Br(),
    # DROPDOWN LIST of questions to analyse
    html.H5("Select which questions you would like to have analysed"),
    dcc.Dropdown(id='dropdown_table_filterColumn',
                 multi=True,
                 placeholder='Filter Column'),
    # Plain-text echo of the selected question numbers
    html.Div(id='output-container'),
    html.H5("Sentiment analysis"),
    # Plain-text VADER results
    html.Div(id='vader-output-container'),
    html.Br(),
    # WORD CLOUDS - one image containing a subplot per selected question
    html.H5('Wordclouds'),
    html.Div([html.Img(id='wordcloud-image-container', src='')], id='wordcloud-div', style={'width': '33%', 'display': 'inline-block'}),
    # COMPOUND SCORE BOX PLOTS - response text is displayed on hover.
    # (Typo fixed: heading previously read "Compund".)
    html.H5("Compound score box plots"),
    html.Div('Crosses are the median for each question and diamonds are the first and third quartiles'),
    html.Div(dcc.Graph(id='scatter_graph'), id='scatter_container'),
    # PIE CHART of positive/negative/neutral sentiment ratios.
    # (Typo fixed: heading previously read "Polorisation".)
    html.H5('Polarisation pie charts'),
    html.Div(dcc.Graph(id='pie_chart'), id='pie_container')
])
###############################
### FUNCTIONS AND CALLBACKS ###
###############################
# FILE UPLOAD
# Takes contents and filename from the upload input
def parse_contents(contents, filename):
    """Parse an uploaded SurveyMonkey Excel workbook into a question table.

    Parameters
    ----------
    contents : str
        Data-URI string from dcc.Upload: "<content type>,<base64 payload>".
    filename : str
        Name of the uploaded file; used to check it is an Excel (.xls) file.

    Returns
    -------
    pandas.DataFrame
        Columns 'question_num' (sheet name, e.g. "Question 1") and
        'question_str' (the question text), one row per sheet; or an
        html.Div error message when the file is not an Excel file.
    """
    # The upload contents are "<type>,<base64 string>" - keep the payload
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    if 'xls' not in filename:
        # Tell the user that they have not uploaded an Excel file.
        # (Typo fixed: message previously read "definitly ... file?.")
        return html.Div([
            'There was an error processing this file, did you definitely upload an Excel .xls file?'
        ])
    # One sheet per question; the sheet names ("Question 1", ...) label the rows
    x1 = pd.ExcelFile(io.BytesIO(decoded))
    q_question_lis = x1.sheet_names
    # Read only the question text (first cell of row 2) from every sheet.
    # NOTE(review): usecols=0 relies on old integer semantics; newer pandas
    # expects a list such as [0] - confirm against the pandas version in use.
    df_q_only = pd.read_excel(io.BytesIO(decoded), sheet_name=q_question_lis, header=None, usecols=0, skiprows=1, nrows=1)
    # sheet_name=<list> returns a dict of single-row DataFrames - stack them
    df_q_only = pd.concat(list(df_q_only.values()))
    # Replace the all-zero index with a sequential one
    df_q_only.index = list(range(0, len(q_question_lis)))
    df_q_only.columns = ['question_str']
    # Question numbers (sheet names) as a parallel column
    q_question_lis_df = pd.DataFrame(np.array(q_question_lis).reshape(df_q_only.shape), columns=['question_num'])
    # Question numbers side by side with the question text
    return pd.concat([q_question_lis_df, df_q_only], axis=1)
#Side note: app.callback works in the following way
#@app.callback(
# Output(component_id='', component_property=''),
# [Input(component_id='', component_property='')]
#)
#One can have as many inputs as they like but only one output (as long as they are all already written in the layout)
# CALLBACK: table creation
@app.callback(Output('table', 'rows'),
              [Input('upload-data', 'contents'),
               Input('upload-data', 'filename')])
def update_output(contents, filename):
    """Fill the questions table from the uploaded file.

    Returns a list of row dictionaries for the DataTable, or a single
    empty placeholder row while nothing has been uploaded yet.
    """
    # Nothing uploaded yet - keep the table empty (avoids warnings)
    if contents is None:
        return [{}]
    parsed = parse_contents(contents, filename)
    # DataTable rows take a list of dicts, one dict per row
    return parsed.to_dict('records')
# CALLBACK: update options of the filter dropdown.
# A button click is required because the options could not be updated
# reliably straight from the table contents.
@app.callback(Output('dropdown_table_filterColumn', 'options'),
              [Input('propagate-button', 'n_clicks'),
               Input('table', 'rows')])
def update_filter_column_options(n_clicks_update, tablerows):
    """Populate the question dropdown from the table rows.

    Returns a list of {'label', 'value'} dicts; values are the 1-based
    positions matching the "Question N" sheet names.
    """
    # Button not clicked yet - keep the dropdown empty
    if n_clicks_update < 1:
        return []
    dff = pd.DataFrame(tablerows)
    # BUG FIX: the table starts as a single empty row ([{}]), which yields
    # a DataFrame with no columns; indexing column 0 then raised IndexError.
    if dff.empty or dff.shape[1] == 0:
        return []
    # First column holds the question numbers ("Question 1", etc.)
    first_col = dff.iloc[:, 0]
    # Label with the question text; value is the 1-based row position.
    # (Series.items() replaces the deprecated iteritems(); debug prints removed.)
    return [{'label': label, 'value': index + 1} for index, label in first_col.items()]
# DISPLAY THE QUESTION NUMBERS THAT HAVE BEEN SELECTED
@app.callback(Output('output-container', 'children'),
              [Input('dropdown_table_filterColumn', 'value')])
def update_output_two(value):
    """Echo the currently selected question numbers as plain text."""
    selected = '{}'.format(value)
    return 'You have selected question numbers: "' + selected + '"'
# Single shared VADER sentiment analyser instance, used by all callbacks below
analyzer = SentimentIntensityAnalyzer()
# DISPLAY THE VADER RESULTS
@app.callback(Output('vader-output-container', 'children'),
              [Input('upload-data', 'contents'),
               Input('dropdown_table_filterColumn', 'value')])
def update_output_vader(contents, value):
    """Run VADER over the selected questions and describe the results.

    For each selected question number, reads the question text and the
    free-text responses from the uploaded workbook, scores every response
    with VADER and appends the mean compound score plus a verbal summary
    to the returned text.
    """
    if not (contents and value is not None):
        # Dropdown not selected yet
        return html.Div([
            'No sentiment yet.'
        ])
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    # Accumulated summary text, one sentence per question
    bizz = str()
    for i in value:
        # Sheet to look at, chosen from the dropdown values
        sheet_no = 'Question ' + str(i)
        # Question text lives in the first cell of row 2 of the sheet
        q_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=None, usecols=0, skiprows=1, nrows=1)
        # Free-text responses start after a 7-row header block
        response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=7, index_col=0, usecols=2)
        response_df = response_df.dropna()
        # Rename so the column has no spaces or capital letters
        response_df.rename(columns={'Response Text': 'response_text'}, inplace=True)
        response_df = response_df.reset_index(drop=True)
        # Score every response; polarity_scores returns a dict with
        # 'neg', 'neu', 'pos' and 'compound' keys
        response_df['polarity_scores'] = response_df['response_text'].apply(lambda response_text: analyzer.polarity_scores(response_text))
        # Expand the score dict into separate columns and drop the dict column
        objs = [response_df, pd.DataFrame(response_df['polarity_scores'].tolist()).iloc[:, :4]]
        response_df = pd.concat(objs, axis=1).drop('polarity_scores', axis=1)
        # Average compound score across the responses - to be displayed
        av_compound_score = response_df['compound'].mean()
        # Verbal summary; compound scores lie in [-1, 1]
        if av_compound_score >= 0.7:
            av_compound_score_string = 'overwhelmingly positive'
        elif av_compound_score >= 0.3:
            av_compound_score_string = 'pretty positive'
        elif av_compound_score >= -0.3:
            av_compound_score_string = 'fairly neutral'
        # BUG FIX: this branch previously re-tested ">= 0.3" and was
        # unreachable, so "pretty negative" was never reported
        elif av_compound_score >= -0.7:
            av_compound_score_string = 'pretty negative'
        else:
            av_compound_score_string = 'overwhelmingly negative'
        # Append this question's summary sentence
        bizz = bizz + 'Your sentiment analyser score for Question {}'.format(i) + ': "{}"'.format(q_df[0][0]) + ' is : {0:.3f}, '.format(av_compound_score) + '{}.'.format(av_compound_score_string)
    return bizz
### FIG TO URI FUNCTION
def fig_to_uri(in_fig, close_all=True, **save_args):
    # type: (plt.Figure) -> str
    """Render a matplotlib artist's figure to a base64 PNG data URI.

    :param in_fig: matplotlib artist whose ``.figure`` is saved.
    :param close_all: when True, clear the figure and close all pyplot
        figures after saving.
    :param save_args: extra keyword arguments forwarded to ``savefig``.
    :return: ``data:image/png;base64,...`` string for an <img> src.
    """
    buffer = io.BytesIO()
    # Render the PNG into the in-memory buffer instead of onto disk
    in_fig.figure.savefig(buffer, format='png', **save_args)
    if close_all:
        in_fig.figure.clf()
        plt.close('all')
    # Rewind before reading the rendered bytes back out
    buffer.seek(0)
    payload = base64.b64encode(buffer.read()).decode("ascii").replace("\n", "")
    return "data:image/png;base64,{}".format(payload)
#stopwords = set(STOPWORDS)
# NLTK English stop words. NOTE(review): this assignment rebinds (shadows)
# the `nltk.corpus.stopwords` module imported above with a plain list.
stopwords = stopwords.words('english')
# Apostrophied words must be added because NLTK splits them in its stopword
# list; "Nothing" is added because some people just write 'Nothing' in the box
stopwords.extend(["Nothing", "would", "I've"])
def wordcloud_subfig_builder(text):
    """Build and return a WordCloud for *text* using the shared stop-word list.

    BUG FIX: the generated cloud was previously discarded and the function
    returned None, making it useless; it now returns the WordCloud object.
    No caller in this file used the old (None) return value, so this is
    backward-compatible.
    """
    return WordCloud(stopwords=stopwords, max_font_size=150, max_words=100,
                     background_color="white", height=400, width=800).generate(text)
@app.callback(Output('wordcloud-image-container', 'src'),
              [Input('upload-data', 'contents'),
               Input('dropdown_table_filterColumn', 'value')])
def build_wordcloud_image(contents, value):
    """Build a vertical column of word clouds, one per selected question.

    Returns the rendered figure as a base64 PNG data URI for the <img>
    src; a placeholder image is returned before anything is selected.
    """
    if not (contents and value is not None):
        # Nothing selected yet - show the placeholder image.
        # BUG FIX: use a context manager so the file handle is closed
        # (the original open() call leaked it).
        with open('placeholder.PNG', 'rb') as placeholder:
            encoded_image = base64.b64encode(placeholder.read())
        return 'data:image/png;base64,{}'.format(encoded_image.decode())
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    sheet_nos = ['Question ' + str(i) for i in value]
    # sheet_name=<list> returns a dict of DataFrames keyed by sheet name
    response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_nos, header=7, index_col=0, usecols=2)
    ax1 = None
    for i, v in enumerate(response_df):
        to_be_cloud_df = response_df.get(v).dropna()
        # Join all of the responses for this question into one large string
        text = " ".join(to_be_cloud_df['Response Text'])
        wordcloud = WordCloud(stopwords=stopwords, max_font_size=150, max_words=90, background_color="white", height=350, width=700).generate(text)
        # One subplot per question, stacked vertically
        ax1 = plt.subplot(len(response_df), 1, i + 1)
        # No need for meaningless axes on an image
        ax1.axis("off")
        # Title each subplot with 'Question N'
        ax1.set_title(list(response_df.keys())[i])
        ax1.imshow(wordcloud, interpolation="bilinear")
        # (A manual "i = i + 1" here was a no-op under enumerate and has
        # been removed.)
    # Encode the whole figure (all subplots) via the last axes drawn
    return fig_to_uri(ax1)
# Green text function to be used for positive wordclouds
def green_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random medium-lightness green as an hsl() colour string."""
    lightness = np.random.randint(49, 51)  # 49 or 50 (upper bound exclusive)
    return "hsl(132,100%, {}%)".format(lightness)
# Red text function to be used for negative wordclouds
def red_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random medium-lightness red as an hsl() colour string."""
    lightness = np.random.randint(49, 51)  # 49 or 50 (upper bound exclusive)
    return "hsl(0,100%, {}%)".format(lightness)
#################
### BOX PLOTS ###
#################
# Initialising some lists.
# NOTE(review): these two lists appear unused anywhere in this file -
# candidates for removal once confirmed.
empty_x =[]
empty_y = []
@app.callback(
    Output('scatter_graph', 'figure'),
    [Input('upload-data', 'contents'),
     Input('dropdown_table_filterColumn', 'value')])
def update_graph(contents, value):
    """Box-plot-style scatter of compound scores per selected question.

    One marker per response (hover shows the response text), plus a cross
    at the median and diamonds at the first/third quartiles.
    """
    # BUG FIX: before an upload/selection the inputs are None; the original
    # had this guard commented out and crashed with an AttributeError on
    # the initial page load. Return an empty figure instead.
    if not (contents and value):
        return {'data': [], 'layout': {}}
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    the_list_of_data = []
    for i in value:
        sheet_no = 'Question ' + str(i)
        # Free-text responses start after a 7-row header block.
        # (An unused read of the question-text cell was removed.)
        response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=7, index_col=0, usecols=2)
        response_df = response_df.dropna()
        response_df.rename(columns={'Response Text': 'response_text'}, inplace=True)
        response_df = response_df.reset_index(drop=True)
        # Score every response with VADER
        response_df['polarity_scores'] = response_df['response_text'].apply(lambda response_text: analyzer.polarity_scores(response_text))
        # Expand the score dict into separate columns and drop the dict column
        objs = [response_df, pd.DataFrame(response_df['polarity_scores'].tolist()).iloc[:, :4]]
        response_df = pd.concat(objs, axis=1).drop('polarity_scores', axis=1)
        # One marker per response, all at x = question number;
        # hover text is the response itself
        the_list_of_data.append({
            'x': [i] * response_df.shape[0],
            'y': response_df['compound'],
            'text': response_df['response_text'],
            'mode': 'markers',
            'marker': {
                'size': 15,
                'opacity': 0.5,
                'line': {'width': 0.5, 'color': 'white'}
            }})
        # Median marker (cross)
        the_list_of_data.append({
            'x': [i],
            'y': np.percentile(response_df['compound'], [50]),
            'mode': 'markers',
            'marker': {
                'size': 20,
                'opacity': 0.9,
                'symbol': 'cross'}
        })
        # First and third quartiles (diamonds)
        the_list_of_data.append({
            'x': [i] * 2,
            'y': np.percentile(response_df['compound'], [25, 75]),
            'mode': 'markers',
            'marker': {
                'size': 18,
                'opacity': 0.9,
                'symbol': 'diamond'
            }
        })
    return {
        'data': the_list_of_data,
        'layout': {
            'xaxis': {
                'title': 'Question number',
                'type': 'linear'
            },
            'yaxis': {
                'title': 'Compound score',
                'type': 'linear',
                'range': [-1, 1]
            },
            'margin': {'l': 40, 'b': 40, 't': 10, 'r': 0},
            'hovermode': 'closest',
            'showlegend': False}
    }
#################
### PIE CHART ###
#################
@app.callback(
    Output('pie_chart', 'figure'),
    [Input('upload-data', 'contents'),
     Input('dropdown_table_filterColumn', 'value')])
def update_pie_chart(contents, value):
    """Pie charts of mean positive/negative/neutral ratios per question.

    NOTE(review): renamed from ``update_graph`` - the original duplicated
    and shadowed the box-plot callback's name. Dash registers callbacks
    via the decorator, so nothing calls this function by name.
    """
    # Guard: inputs are None before an upload/selection
    if not (contents and value):
        return {'data': [], 'layout': {}}
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    the_list_of_data_pie = []
    n_pies = len(value)
    for idx, i in enumerate(value):
        sheet_no = 'Question ' + str(i)
        # Free-text responses start after a 7-row header block.
        # (An unused read of the question-text cell was removed.)
        response_df = pd.read_excel(io.BytesIO(decoded), sheet_name=sheet_no, header=7, index_col=0, usecols=2)
        response_df = response_df.dropna()
        response_df.rename(columns={'Response Text': 'response_text'}, inplace=True)
        response_df = response_df.reset_index(drop=True)
        response_df['polarity_scores'] = response_df['response_text'].apply(lambda response_text: analyzer.polarity_scores(response_text))
        # Expand the score dict into separate columns and drop the dict column
        objs = [response_df, pd.DataFrame(response_df['polarity_scores'].tolist()).iloc[:, :4]]
        response_df = pd.concat(objs, axis=1).drop('polarity_scores', axis=1)
        # One pie per question, laid out side by side.
        # BUG FIX: the original set 'domain' x to [0, i], which is invalid
        # (domain fractions must lie in [0, 1]); equal-width fractions of
        # the plot are used instead.
        the_list_of_data_pie.append({
            'values': [response_df['pos'].mean(), response_df['neg'].mean(), response_df['neu'].mean()],
            'labels': ['positive', 'negative', 'neutral'],
            'type': 'pie',
            'name': sheet_no,
            'domain': {'x': [float(idx) / n_pies, float(idx + 1) / n_pies]}
        })
    # BUG FIX: a figure takes a single layout dict; the original appended
    # one dict per question and passed the list, which plotly rejects.
    return {
        'data': the_list_of_data_pie,
        'layout': {
            'margin': {'l': 40, 'b': 40, 't': 10, 'r': 0},
            'hovermode': 'closest',
            'showlegend': False}
    }
# FORMATTING PURPOSES ONLY - external stylesheet for basic styling;
# the app still works without it.
app.css.append_css({
"external_url": "https://codepen.io/chriddyp/pen/bWLwgP.css"
})
# Run the Dash development server when executed as a script.
# debug=True enables hot reload and the in-browser error console.
if __name__ == '__main__':
    app.run_server(debug=True)