% !TeX document-id = {f19fb972-db1f-447e-9d78-531139c30778}
% !BIB program = biber
\documentclass[compress]{beamer}
\usepackage[T1]{fontenc}
\usepackage{pifont}
\usetheme[block=fill,subsectionpage=progressbar,sectionpage=progressbar]{metropolis}
\usepackage{wasysym}
\usepackage{etoolbox}
\usepackage[utf8]{inputenc}
\usepackage{threeparttable}
\usepackage{subcaption}
\usepackage{tikz-qtree}
\setbeamercovered{still covered={\opaqueness<1->{5}},again covered={\opaqueness<1->{100}}}
\usepackage{listings}
\lstset{
basicstyle=\scriptsize\ttfamily,
columns=flexible,
breaklines=true,
numbers=left,
%stepsize=1,
numberstyle=\tiny,
backgroundcolor=\color[rgb]{0.85,0.90,1}
}
\lstnewenvironment{lstlistingoutput}{\lstset{basicstyle=\footnotesize\ttfamily,
columns=flexible,
breaklines=true,
numbers=left,
%stepsize=1,
numberstyle=\tiny,
backgroundcolor=\color[rgb]{.7,.7,.7}}}{}
\lstnewenvironment{lstlistingoutputtiny}{\lstset{basicstyle=\tiny\ttfamily,
columns=flexible,
breaklines=true,
numbers=left,
%stepsize=1,
numberstyle=\tiny,
backgroundcolor=\color[rgb]{.7,.7,.7}}}{}
\usepackage[american]{babel}
\usepackage{csquotes}
\usepackage[style=apa, backend = biber]{biblatex}
\DeclareLanguageMapping{american}{american-UoN}
\addbibresource{../bdaca/bdaca.bib}
\renewcommand*{\bibfont}{\tiny}
\usepackage{tikz}
\usetikzlibrary{shapes,arrows,matrix}
\usepackage{multicol}
\usepackage{subcaption}
\usepackage{booktabs}
\usepackage{graphicx}
\graphicspath{{../bdaca/pictures/}}
\makeatletter
\setbeamertemplate{headline}{%
\begin{beamercolorbox}[colsep=1.5pt]{upper separation line head}
\end{beamercolorbox}
\begin{beamercolorbox}{section in head/foot}
\vskip2pt\insertnavigation{\paperwidth}\vskip2pt
\end{beamercolorbox}%
\begin{beamercolorbox}[colsep=1.5pt]{lower separation line head}
\end{beamercolorbox}
}
\makeatother
\setbeamercolor{section in head/foot}{fg=normal text.bg, bg=structure.fg}
\newcommand{\question}[1]{
\begin{frame}[plain]
\begin{columns}
\column{.3\textwidth}
\makebox[\columnwidth]{
\includegraphics[width=\columnwidth,height=\paperheight,keepaspectratio]{mannetje.png}}
\column{.7\textwidth}
\large
\textcolor{orange}{\textbf{\emph{#1}}}
\end{columns}
\end{frame}}
\newcommand{\instruction}[1]{\emph{\textcolor{gray}{[#1]}}}
\title{Beyond Counting Words: Working with Word Embeddings}
\author[Damian Trilling]{Damian Trilling \\ ~ \\ \footnotesize{[email protected] \\@damian0604} \\ \url{www.damiantrilling.net}}
\date{12--13 April 2021}
\institute[UvA]{Afdeling Communicatiewetenschap \\Universiteit van Amsterdam}
\begin{document}
\begin{frame}{}
\titlepage
\end{frame}
\begin{frame}{This part: The idea behind word embeddings}
\tableofcontents
\end{frame}
\setbeamercovered{transparent}
\section{From word counts to word vectors}
\begin{frame}{Our BOW approach until now}
\begin{block}{Representing a document by word frequency counts}
Result of preprocessing and vectorizing:
0. \texttt{He took the dog for a walk to the dog playground}\\
$\Rightarrow$ \texttt{took dog walk dog playground}\\
$\Rightarrow$ \texttt{'took': 1, 'dog': 2, 'walk': 1, 'playground': 1}
\end{block}
Consider these other sentences
\begin{enumerate}
\item<2-> He took the doberman for a walk to the dog playground
\item<3-> He took the cat for a walk to the dog playground
\item<4-> He killed the dog on his walk to the dog playground
\end{enumerate}
\onslide<5>{The vectorized representations of sentences 1--3 are all equally far from (i.e., equally dissimilar to) sentence 0, but arguably, sentences 0 and 1 should be ``closer'' to each other than the others (see the sketch on the next slide)}
\end{frame}
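\begin{frame}[fragile]{A quick check of that claim}
A minimal sketch (assuming scikit-learn is installed) verifying that sentences 1--3 are indeed all equally far from sentence 0 in a plain count-based representation:
\begin{lstlisting}
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

sentences = [
    "He took the dog for a walk to the dog playground",
    "He took the doberman for a walk to the dog playground",
    "He took the cat for a walk to the dog playground",
    "He killed the dog on his walk to the dog playground"]

# stopword removal roughly mimics the preprocessing above
vec = CountVectorizer(stop_words="english")
X = vec.fit_transform(sentences)

# distance of sentence 0 to all four sentences: 1--3 are equally far away
print(euclidean_distances(X[0], X))
\end{lstlisting}
\end{frame}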
\begin{frame}{Our BOW approach until now}
\begin{itemize}
\item Our vectorizers assigned an arbitrary ID (column) to each word
\item What if we instead represented each word by a vector that captures its meaning?
\item For instance, `doberman' and `dog' should be represented by vectors that are close to each other in space, while `kill' and `walk' should be far from each other.
\end{itemize}
\pause
$\Rightarrow$ That's the idea behind word embeddings!
\pause
Or, more broadly: Can computers understand meanings, semantic relationships, different types of contexts?
\end{frame}
\section{Training word embeddings}
\begin{frame}{GloVe vs Word2Vec}
There are two popular approaches to training word embeddings: GloVe and word2vec.
\begin{itemize}
\item GloVe is count-based: dimensionality reduction on the word co-occurrence matrix
\item word2vec is a predictive model: a (shallow) neural network is trained to predict words from their contexts (or vice versa)
\item That means that GloVe takes the global context into account, word2vec the local context
\item This has some technical implications for how training can be implemented
\item \textbf{However, there are only subtle differences in the final result.}
\end{itemize}
\end{frame}
\begin{frame}{Word2Vec: Continuous Bag of Words (CBOW) vs skipgram}
Example sentence: ``the quick brown fox jumped over the lazy dog''
\begin{block}{CBOW: Predict a word given its context}
Dataset:
\texttt{([the, brown], quick), ([quick, fox], brown), ([brown, jumped], fox), ...}
\end{block}
\pause
\begin{block}{skipgram: Predict the context given the word}
\texttt{(quick, the), (quick, brown), (brown, quick), (brown, fox), ...}
\end{block}
\tiny{Example taken from here: \url{https://medium.com/explore-artificial-intelligence/word2vec-a-baby-step-in-deep-learning-but-a-giant-leap-towards-natural-language-processing-40fe4e8602ba}}
\end{frame}
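\begin{frame}[fragile]{Sketch: generating these training pairs}
A minimal sketch in plain Python (window size of 1, as in the example above) of how the CBOW and skipgram training pairs are constructed:
\begin{lstlisting}
tokens = "the quick brown fox jumped over the lazy dog".split()
window = 1

cbow, skipgram = [], []
for i, target in enumerate(tokens):
    # all words within the window around the target word
    context = [tokens[j]
               for j in range(max(0, i - window), min(len(tokens), i + window + 1))
               if j != i]
    cbow.append((context, target))                  # predict word from context
    skipgram.extend((target, c) for c in context)   # predict context from word

print(cbow[1:4])      # [(['the', 'brown'], 'quick'), ...]
print(skipgram[1:5])  # [('quick', 'the'), ('quick', 'brown'), ('brown', 'quick'), ('brown', 'fox')]
\end{lstlisting}
\end{frame}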
%window sizes
\begin{frame}{Continuous Bag of Words (CBOW) vs skipgram}
\begin{itemize}
\item CBOW is faster
\item skipgram works better for infrequent words
\item Both are often used
\item In practice, we use larger window sizes (e.g., 5) than in the toy example above
\item We need to specify the number of dimensions (typically 100--300)
\end{itemize}
\pause
\textit{In any event, as a result of the prediction task, we end up with a \{100|200|300\}-dimensional vector representation of each word.*}
\tiny{* If that makes you think of PCA/SVD, that's not completely crazy, see Levy, O., Goldberg, Y., \& Dagan, I. (2015). Improving Distributional Similarity with Lessons Learned from Word Embeddings. \textit{Transactions of the Association for Computational Linguistics, 3}, 211--225. doi:10.1162/tacl\_a\_00134}\\
\end{frame}
\begin{frame}{``...a word is characterized by the company it keeps...'' (Firth, 1957)}
\begin{block}{Word embeddings \ldots}
\begin{itemize}
\item help capture the meaning of text
\item are low-dimensional vector representations that capture semantic meaning
\item are state-of-the-art in NLP...
\end{itemize}
\end{block}
\tiny{Firth, J. R. (1957). A synopsis of linguistic theory, 1930-1955. Studies in linguistic analysis.}
\end{frame}
%Due to developments in the field of NLP, algorithms have become increasingly apt to understand human language. Word embeddings are the current state of the art for capturing the meaning of texts. Word embeddings are vector representations of words. They are the current state of the art in NLP to understand, capture and process language. The basic idea of word embeddings is that one gets to know a word by looking at the company that it keeps; contexts is crucial in understanding word meaning.
\begin{frame}{You can literally calculate with words!}
And answer questions such as ``Man is to woman as king is to \_\_\_\_?''
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{embeddings.png}}
semantic relationships vs. syntactic relationships
\end{frame}
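\begin{frame}[fragile]{Sketch: calculating with words in gensim}
A minimal sketch of such a calculation (assuming the pre-trained \texttt{glove-wiki-gigaword-100} vectors from \texttt{gensim.downloader}; any set of pre-trained vectors works the same way):
\begin{lstlisting}
import gensim.downloader as api

# downloads the vectors on first use
wv = api.load("glove-wiki-gigaword-100")

# king - man + woman = ?
print(wv.most_similar(positive=["king", "woman"], negative=["man"], topn=3))
# the top hit should be something like 'queen'
\end{lstlisting}
\end{frame}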
%word vectors that are able to capture the relationships between words in a surprisingly expressive way. Word embeddings are especially effective in understanding analogies, and for example understand that man is to woman as uncle to aunt, and king to king.
\section[Improving models]{Using word embeddings to improve models}
\begin{frame}[plain]
Using word embeddings to improve models
\end{frame}
\begin{frame}{In supervised machine learning}
\begin{itemize}[<+->]
\item Modify the CountVectorizer or TfidfVectorizer such that for each term, you do not only count how often it occurs, but also multiply these counts with its embedding vector
\item Often, pre-trained embeddings (e.g., trained on the whole of Wikipedia) are used
\item Thus, our supervised model will be able to deal with synonyms and related words!
\end{itemize}
\pause
Let's first sketch the idea in code (next slide), and then look at a published example of supervised sentiment analysis (i.e., what we did with the IMDB data before).
\end{frame}
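\begin{frame}[fragile]{Sketch: document vectors from word embeddings}
One simple way to implement this idea (a sketch, not necessarily the exact approach of the paper discussed on the following slides): represent each document by the average of its word vectors and feed that to any classifier.
\begin{lstlisting}
import numpy as np
import gensim.downloader as api
from sklearn.linear_model import LogisticRegression

wv = api.load("glove-wiki-gigaword-100")   # pre-trained vectors

def doc_vector(text):
    # average the vectors of all words we have an embedding for
    words = [w for w in text.lower().split() if w in wv]
    return np.mean([wv[w] for w in words], axis=0) if words else np.zeros(wv.vector_size)

# toy training data; in practice: the IMDB reviews and their labels
texts = ["a wonderful touching movie", "a dull and awful film"]
y = [1, 0]

X = np.vstack([doc_vector(t) for t in texts])
clf = LogisticRegression().fit(X, y)
\end{lstlisting}
\end{frame}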
\begin{frame}[plain]
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{rudkowsky2018-1}}
\tiny{Rudkowsky, E., Haselmayer, M., Wastian, M., Jenny, M., Emrich, Š., \& Sedlmair, M. (2018). More than Bags of Words: Sentiment Analysis with Word Embeddings. \textit{Communication Methods and Measures, 12}(2–3), 140–157. doi:10.1080/19312458.2018.1455817}
\end{frame}
\begin{frame}{It's not always black/white\ldots}
Sometimes, BOW may be just fine (for very negative sentences, it does not matter much). But especially in less clear-cut cases (`slightly negative'), embeddings increased performance.
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{rudkowsky2018-2}}
\vfill
\tiny{Rudkowsky, E., Haselmayer, M., Wastian, M., Jenny, M., Emrich, Š., \& Sedlmair, M. (2018). More than Bags of Words: Sentiment Analysis with Word Embeddings. \textit{Communication Methods and Measures, 12}(2–3), 140–157. doi:10.1080/19312458.2018.1455817}
\end{frame}
\begin{frame}{In document similarity calculation}
\begin{block}{Use cases}
\begin{itemize}
\item plagiarism detection
\item Are press releases, news agency copy, \ldots copied by news outlets?
\item Event detection
\end{itemize}
\end{block}
\pause
\begin{block}{Traditional measures}
\begin{itemize}
\item Levenshtein distance (How many characters|words do I need to change to transform string A into string B?)
\item Cosine similarity (``correlation'' between the BOW-representations of string A and string B)
\end{itemize}
\end{block}
\end{frame}
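\begin{frame}[fragile]{Sketch: the traditional measures}
A minimal sketch of both measures (a plain-Python word-level Levenshtein distance; cosine similarity on the count vectors via scikit-learn):
\begin{lstlisting}
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def levenshtein(a, b):
    # word-level edit distance via dynamic programming
    prev = list(range(len(b) + 1))
    for i, wa in enumerate(a, 1):
        cur = [i]
        for j, wb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (wa != wb)))
        prev = cur
    return prev[-1]

s1 = "the cabinet resigned today"
s2 = "the government resigned this afternoon"

print(levenshtein(s1.split(), s2.split()))

X = CountVectorizer().fit_transform([s1, s2])
print(cosine_similarity(X[0], X[1]))
\end{lstlisting}
\end{frame}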
\begin{frame}[plain]
BUT: This only works for literal overlap. What if the writer chooses synonyms?
\pause
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{wmd}}
\tiny{Kusner, M. J., Sun, Y., Kolkin, N. I., \& Weinberger, K. Q. (2015). From Word Embeddings To Document Distances. \textit{Proceedings of The 32nd International Conference on Machine Learning} (Vol. 37, pp. 957–966)}
\end{frame}
\begin{frame}{There are several approaches}
\begin{itemize}
\item word mover's distance
\item soft cosine similarity
\end{itemize}
What they have in common: when calculating our measure of interest, we use pre-trained embeddings to replace words (which would otherwise just have an arbitrary identifier and be unrelated to each other) with vectors representing their meaning
\end{frame}
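\begin{frame}[fragile]{Sketch: word mover's distance in gensim}
A minimal sketch of the word mover's distance (assuming pre-trained English vectors; gensim additionally needs the \texttt{POT} or \texttt{pyemd} package for this, depending on the version):
\begin{lstlisting}
import gensim.downloader as api

wv = api.load("glove-wiki-gigaword-100")

doc1 = "obama speaks to the media in illinois".split()
doc2 = "the president greets the press in chicago".split()
doc3 = "the band played a concert in chicago".split()

# lower distance = more similar, even without literal word overlap
print(wv.wmdistance(doc1, doc2))
print(wv.wmdistance(doc1, doc3))
\end{lstlisting}
\end{frame}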
\section[Detecting biases]{(Ab-)using word embeddings to detect biases}
\begin{frame}[plain]
(Ab-)using word embeddings to detect biases
\end{frame}
\begin{frame}{Biased embeddings}
\begin{itemize}
\item word embeddings are trained on large corpora
\item As the task is to learn how to predict a word from its context (CBOW) or vice versa (skip-gram), biased texts produce biased embeddings
\item If, in the training corpus, the words ``man'' and ``computer programmer'' are frequently used in the same contexts, then we will learn such a gender bias
\end{itemize}
\tiny{Bolukbasi, T., Chang, K.-W., Zou, J., Saligrama, V., \& Kalai, A. (2016). Man is to Computer Programmer as Woman is to Homemaker? Debiasing Word Embeddings, 1–25. Retrieved from http://arxiv.org/abs/1607.06520}
\end{frame}
\begin{frame}{Biased embeddings}
Usually, we do not want that (and it has a huge potential for a shitstorm)
~\\
\pause
unless\ldots
~\\
\pause
we actually want to chart such biases.
\end{frame}
\begin{frame}{An example from our research}
We trained word embeddings on 3.3 million Dutch news articles.
Are the vector representations of outgroups (Moroccans, Muslims) closer to representations of negative stereotype words than those of ingroups?
\vspace{.5cm}
\tiny{Kroon, A.C., Van der Meer, G.L.A., Jonkman, J.G.F., \& Trilling, D. (in press): Guilty by Association: Using Word Embeddings to Measure Ethnic Stereotypes in News Coverage. \emph{Journalism \& Mass Communication Quarterly}}
\end{frame}
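\begin{frame}[fragile]{Sketch: measuring such associations}
A deliberately simplified sketch of the general idea (hypothetical model path and word lists; the actual operationalization in the paper is more elaborate): compare how similar group labels and stereotype words are in the trained model.
\begin{lstlisting}
import numpy as np
from gensim.models import Word2Vec

wv = Word2Vec.load("newsmodel.w2v").wv   # hypothetical path to a trained model

groups = ["marokkanen", "belgen"]         # hypothetical group labels
stereotypes = ["crimineel", "verdachte"]  # hypothetical stereotype words

for g in groups:
    sims = [wv.similarity(g, s) for s in stereotypes if s in wv]
    print(g, np.mean(sims))
\end{lstlisting}
\end{frame}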
\begin{frame}[plain]
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{embeddingbias}}
\end{frame}
\section[AEM]{AEM: An application from our own research}
\begin{frame}[plain]
We can use pre-trained embeddings -- but can we make even better ones?
\textbf{The Amsterdam Embedding Model (AEM)}\\
\vspace{1cm}
{\footnotesize{Anne Kroon, Antske Fokkens, Damian Trilling, Felicia Loecherbach, Judith Moeller, Mariken A. C. G. van der Velden, Wouter van Atteveldt} }
\end{frame}
%For all these tasks, you need to process text.
%Humans are obviously very good in this. However, we are not capable of handling %humongous amounts of data. Therefore, we need computers.
%For computers, it used to be relatively hard to understand language, to capture %semantic relations, especially in different type of contexts.
\begin{frame}{Why do this?}
\begin{itemize}
\item Embedding models are of great interest to communication scholars
\item Yet\ldots most publicly available models represent the \textbf{English} language
\item Preparing a well-performing embedding model requires a significant amount of \textbf{time} and \textbf{access to large amounts of data}
\item A few Dutch embedding models are available, but they are trained on ordinary human language from the World Wide Web.
\item These models do not capture the specifics of news article data and are therefore less suitable for studying and understanding the dynamics of this domain
\item $\Rightarrow$ No model is available trained on Dutch news data
\end{itemize}
\end{frame}
%Properly trained embedding models are of great interest to communication scholars, because they can help with diverse tasks, such as topic classification, automated sentiment analysis or bias detection. Yet – currently no model exist that is trained on media data – and therefore effectively deals with the particularities of news media data.
%\subsection{The Amsterdam Embedding Model}
\begin{frame}{Project's Aim}
\begin{block}{Aim of the current project}
\begin{enumerate}
\item Develop and evaluate a high-quality embedding model
\item Assess performance in downstream tasks of interest to Communication Science (such as topic classification of newspaper data).
\item Facilitate distribution and use of the model
\item Offer clear methodological recommendations for researchers interested in using our Dutch embedding model
\end{enumerate}
\end{block}
\end{frame}
%Therefore, this project was set out to develop a good word embedding model trained on Dutch media data, and facilitate its distribution
%\subsection{Approach and Preliminary Results}
\begin{frame}{Training data}
\begin{block}{Training data set}
\begin{itemize}
\item Dataset: diverse print and online news sources
\item Preprocessing: duplicate sentences were removed
\item Telegraaf (print \& online), NRC Handelsblad (print \& online), Volkskrant (print \& online), Algemeen Dagblad (print \& online), Trouw (print \& online), nu.nl, nos.nl
\item \# words: 1.18b (1181701742)
\item \# sentences: 77.1M (77151321)
\end{itemize}
\end{block}
\end{frame}
\begin{frame}{Training model}
\begin{block}{Training model}
\begin{itemize}
\item We trained the model using Gensim's Word2Vec implementation in Python (see the sketch on the next slide)
\item Skip-gram with negative sampling, window size of 5, 300-dimensional word vectors
\end{itemize}
\end{block}
\end{frame}
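\begin{frame}[fragile]{Sketch: such a training call in gensim}
In gensim, such a training setup looks roughly like this (a sketch, not the exact AEM training script; parameter names as in gensim 4, older versions use \texttt{size} instead of \texttt{vector\_size}; \texttt{sentences.txt} is a hypothetical file with one tokenized sentence per line):
\begin{lstlisting}
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence("sentences.txt")  # streams the corpus from disk

model = Word2Vec(
    sentences,
    sg=1,             # skip-gram (0 = CBOW)
    negative=5,       # negative sampling
    window=5,         # window size
    vector_size=300,  # dimensionality of the word vectors
    min_count=5,
    workers=4)

model.save("aem.w2v")                      # hypothetical output path
\end{lstlisting}
\end{frame}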
%\subsection{Evaluation}
\begin{frame}{Evaluation}
Evaluation of the Amsterdam Embedding Model
\end{frame}
\begin{frame}{Evaluation}
\begin{block}{Evaluation methods}
\begin{itemize}
\item To evaluate the model, we compare it to two other publicly available embedding models
\begin{itemize}
\item $\Rightarrow$ \textbf{`Wiki'}: embedding model trained on Wikipedia data (FastText)
\item $\Rightarrow$ \textbf{`Cow'}: embedding model trained on diverse .nl and .be sites (Sch\"afer \& Bildhauer, 2012; Tulkens et al., 2016)
\item $\Rightarrow$ \textbf{`AEM'}: Amsterdam Embedding Model
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
%To evaluate our model, we will compare it (at least in a first step) – to two other, publicly available word embedding models: One trained on Wikipedia data, and the other on divers dutch and Belgian websites.
\begin{frame}{Evaluation data}
\begin{block}{Evaluation data}
\begin{enumerate}
\item `relationship'/analogy task (Tulkens et al., 2016)
\begin{itemize}
\item \textbf{syntactic relationships}: dans dansen loop \textit{[lopen]}
\item \textbf{semantic relationships}: denemarken kopenhagen noorwegen \textit{[oslo]}
\end{itemize}
\item 5806 relationship tasks
\end{enumerate}
\end{block}
\end{frame}
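\begin{frame}[fragile]{Sketch: running such an analogy evaluation}
gensim ships a helper for exactly this kind of analogy task. A sketch, assuming the relationship data are available in the \texttt{questions-words} format gensim expects (the file names are hypothetical):
\begin{lstlisting}
from gensim.models import Word2Vec

wv = Word2Vec.load("aem.w2v").wv   # hypothetical path, as saved before

# each line in the file: "denemarken kopenhagen noorwegen oslo"
score, sections = wv.evaluate_word_analogies("relationships-nl.txt")
print(score)   # share of correctly solved analogies
\end{lstlisting}
\end{frame}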
%Tulkens et al created several Dutch relationship tasks. For example, when given dans (dance) dansen (dancing) and loop (walk), the computer has to guess that the word we are looking for is lopen (walking). This is an example of syntactic analogy. The same applies to capital - country relations (Copenhagen is to Denmark as … [Oslo] to Norway]. This is an example of a semantic analogy.
%Each model had to solve over 5K of these types of analogy tasks. We subsequently use these results to compare how well they do.
\begin{frame}{}
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{evaluation_data.png}}
\end{frame}
%As can be seen, the AEM (= Amsterdam Embedding Model) outperfoms the other models on both syntactic analogies and semantic analogies.
%\subsection{Illustration}
\begin{frame}{Illustration}
Illustration - Using the Amsterdam Embedding Model
\end{frame}
%Let's see what the model has learned about Dutch language. Now, we will provide some illustrations of how well the AEM understands the Dutch language. More specifically, we will provide a 2 dimensional visualisation of some random Dutch words in the word vector space..
\subsection{}
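\begin{frame}[fragile]{Sketch: producing such a 2D plot}
Such a figure can be produced by projecting the 300-dimensional vectors down to two dimensions, for instance with PCA (a sketch; the plots shown here may have been made with a different projection, such as t-SNE):
\begin{lstlisting}
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim.models import Word2Vec

wv = Word2Vec.load("aem.w2v").wv   # hypothetical path

seeds = ["fiets", "hond", "kat", "roodborstje", "kraai"]
# add each seed word's 10 nearest neighbours
words = seeds + [w for seed in seeds for w, _ in wv.most_similar(seed, topn=10)]

coords = PCA(n_components=2).fit_transform([wv[w] for w in words])
plt.scatter(coords[:, 0], coords[:, 1])
for w, (x, y) in zip(words, coords):
    plt.annotate(w, (x, y), fontsize=8)
plt.show()
\end{lstlisting}
\end{frame}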
\begin{frame}{}
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{w2v_300_illustration.png}}
\end{frame}
% this is a 2 dimensional representation of the most similar words to fiets, hond, kat, roodborstje and kraai
\begin{frame}{}
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{visual2.png}}
\end{frame}
%as can be seen, words most similar to fiets are on the left.
\begin{frame}{}
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{fiets}}
\end{frame}
%the model has learned that fiets is similar to racefiets, wielrenfiets, rijwiel - and different from the animal department: it doesnt overlap with our kats/ dogs and birds clusters
\begin{frame}{}
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{visual1.png}}
\end{frame}
%the model recognizes that fiets is something else from honden and katten. Both mammals and pets, dogs and cats are quite similar and appear in the same cluster.
\begin{frame}{}
\makebox[\linewidth]{\includegraphics[width=\linewidth,height=\textheight, keepaspectratio]{visual.png}}
\end{frame}
%Finally, the model recognizes that dogs and kats are different from redbreasts and crows, with both appear in a 'bird-related' cluster.
%\begin{frame}{}
% \makebox[\linewidth]{\includegraphics[width=\linew%idth,height=\textheight, %keepaspectratio]{bias.png}}
%\end{frame}
%Now we know that the model understands Dutch pretty well. We can now apply to model to diverse tasks that are of greater interest to communication scholars. For example, we can see whether or not our training data contains bias.When we plot the most similar words to ‘criminals, Belgians and Moroccans, we see that Moroccans are much closer to criminals, that Belgians. THis could potentiall reveal bias in the training data.
%\subsection{Re-usability}
\begin{frame}{Re-usability}
Re-usability of the Amsterdam Embedding Model
\end{frame}
%\subsection{Availability of model and code}
\begin{frame}{Re-usability}
\begin{block}{Reusing model and data}
\begin{enumerate}
\item See \url{https://github.com/annekroon/amsterdam-embedding-model}
\item Open access to all the code
\end{enumerate}
\end{block}
\end{frame}
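\begin{frame}[fragile]{Sketch: loading such a model}
Loading a distributed embedding model with gensim typically takes one or two lines (a sketch; the exact file name and format depend on how the model is shipped, see the repository above):
\begin{lstlisting}
from gensim.models import KeyedVectors

# hypothetical file name; standard word2vec text format
wv = KeyedVectors.load_word2vec_format("aem_vectors.txt", binary=False)

print(wv.most_similar("fiets", topn=5))
\end{lstlisting}
\end{frame}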
\begin{frame}[plain]
\printbibliography
\end{frame}
\end{document}