-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews_etl.R
57 lines (48 loc) · 2.05 KB
/
news_etl.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# This code was originally created by Giora Simchoni and Saharon Rosset and modified by Fabio Sigrist, 2023
library(tidyverse)
library(tm)
# The News popularity in social media dataset from UCI ML repository:
# https://archive.ics.uci.edu/ml/datasets/News+Popularity+in+Multiple+Social+Media+Platforms
news <- read_csv("../raw_data/News_Final.csv.zip") %>%
distinct(IDLink, .keep_all = T) %>%
mutate(hour = lubridate::hour(PublishDate),
day = lubridate::wday(PublishDate),
month = lubridate::month(PublishDate)) %>%
filter(Facebook != -1)
# recode title ID
news <- news %>% inner_join(
news %>%
distinct(Title) %>%
mutate(title_id = row_number() - 1),
by = "Title"
)
# recode source ID
news <- news %>% inner_join(
news %>%
distinct(Source) %>%
mutate(source_id = row_number() - 1),
by = "Source"
)
corpus <- Corpus(VectorSource(news$Headline))
corpus <- tm_map(corpus, content_transformer(tolower))
removeHandles <- function(x) gsub("@[[:alnum:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeHandles))
removeURL <- function(x) gsub("http[[:graph:]]+", "", x)
corpus <- tm_map(corpus, content_transformer(removeURL))
removeStrange <- function(x) gsub("(<.+>)+", "", x)
corpus <- tm_map(corpus, content_transformer(removeStrange))
myStopwords <- c(stopwords('english'),"economy","next","break","else","terms","while")
corpus <- tm_map(corpus, content_transformer(removeWords), myStopwords)
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, content_transformer(stripWhitespace))
DTM <- DocumentTermMatrix(corpus, control=list(wordLengths=c(4, Inf)))
DTM <- removeSparseTerms(DTM, 0.99)
df_words <- as_tibble(as.matrix(DTM))
colnames(df_words) <- str_c("w_", colnames(df_words))
df <- bind_cols(news, df_words)
df_topic <- as_tibble(model.matrix(~ 0 + df$Topic))
colnames(df_topic) <- c("topic_economy", "topic_microsoft", "topic_obama", "topic_palestine")
df %>%
bind_cols(df_topic) %>%
write_csv("../data/news.csv.gz")