-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze_text.R
26 lines (18 loc) · 932 Bytes
/
analyze_text.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Build a unigram term-document matrix (with an optional word cloud) and a
# bigram term-document matrix from the scraped text
library(tm)
library(wordcloud)
# Load the scraped text through the read_and_pp() helper.
# NOTE(review): read_and_pp() is not defined in this file; presumably it
# returns a preprocessed tm corpus -- confirm against its definition.
text_data <- read_and_pp(file.path("~", "Code", "stack_ds_scrape"))

# Term-document matrix built from the loaded text.
tdmatrix <- TermDocumentMatrix(text_data)

# Row totals of the dense matrix give each term's overall count; order them
# from most to least frequent.
freq_vector <- sort(
  rowSums(as.matrix(tdmatrix)),
  decreasing = TRUE
)

# Frequency table: one row per term, holding the term and its total count.
text_frame <- data.frame(
  word = names(freq_vector),
  freq = freq_vector
)

# Word cloud section. The plotting call below is currently commented out.
# The RNG seed is fixed first, presumably so the cloud layout is reproducible
# once the call is re-enabled -- confirm.
set.seed(999)
# wordcloud(words = text_frame$word, freq = text_frame$freq,
#           random.order = FALSE, rot.per = 0.35, max.words = 200,
#           scale = c(2, 0.25), colors = brewer.pal(8, "Dark2"))
# Re-read the original text plainly (no corpus) for n-gram analysis.
# NOTE(review): read_pt_and_pp() and get_ngrams() are not defined in this
# file; they are assumed to be loaded before this script runs -- confirm.
plain_text <- read_pt_and_pp(
  file.path("~", "Code", "stack_ds_scrape", "corpus", "qa_output.txt")
)

# Generate a bigram matrix.
# VCorpus() is used rather than Corpus(): in tm >= 0.7, Corpus() on a
# VectorSource yields a SimpleCorpus, and TermDocumentMatrix() ignores custom
# tokenizers for that class (emitting only a warning), so the resulting
# matrix would contain single words instead of the n-grams from get_ngrams.
bigram_matrix <- TermDocumentMatrix(
  VCorpus(VectorSource(plain_text)),
  control = list(tokenize = get_ngrams)
)