-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path4_ShinyApp_data.Rmd
126 lines (101 loc) · 4.04 KB
/
4_ShinyApp_data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
---
title: "Clean data for the ShinyApp"
output: html_document
---
```{r}
## Save package names as a vector of strings
pkgs <- c(
  "data.table",
  "tidyverse",
  "magrittr",
  "stringr",
  "tidylog"
)

## Install uninstalled packages
## installed.packages() returns a character matrix of package metadata whose
## row names are the package names. Matching against the whole matrix can
## report a package as installed merely because its name appears in another
## package's metadata, so only the row names are compared here.
invisible(
  lapply(setdiff(pkgs, rownames(installed.packages())), install.packages)
)

## Attach all packages; invisible() keeps the lapply() result out of the
## rendered document
invisible(lapply(pkgs, library, character.only = TRUE))
```
# Load data
```{r}
# Read in data
# Each load() restores the objects stored in the given .RData file into the
# workspace; the object used right after each call is expected to come from
# that file.
load("~/Downloads/doc2_minmax.RData")
article_vectors <- as.data.frame(doc2_minmax)

load("~/Downloads/dat_agg.RData")
#article_df <- as.matrix(dat_agg)
article_df <- as.data.frame(dat_agg)

load("~/Downloads/author2_minmax.RData")
AuthorVectors <- author2_minmax # as.data.frame(author2_minmax)

# read in the .txt file with embeddings
# NOTE(review): read.delim() treats '"' as a quote character by default; if
# any token in the file contains a double quote, rows may be merged. Confirm
# against the file before changing `quote`.
WordVectors <- read.delim(
  "~/Downloads/w2v_embeddings_100.txt",
  sep = " ",
  header = FALSE
)

# set the first column (tokens) to row names
WordVectors <- WordVectors %>%
  remove_rownames() %>%
  column_to_rownames(var = "V1")

# rename the remaining columns so they are numbered from V1 upwards
# (seq_len() is safe even when there are no columns left)
colnames(WordVectors) <- paste0("V", seq_len(ncol(WordVectors)))
WordVectors <- as.matrix(WordVectors)

# Remove unnecessary objects
rm(doc2_minmax, dat_agg, author2_minmax)
```
# Preprocess data
```{r}
# Coerce the article vectors to a matrix and replace missing entries with 0
article_vectors <- as.matrix(article_vectors)
article_vectors[is.na(article_vectors)] <- 0

# Author vectors as a matrix; the row names are kept as the author list
AuthorVectors <- as.matrix(AuthorVectors)
author_list <- row.names(AuthorVectors)

# Build one HTML snippet per article: title heading, authors, "journal, doi"
# and abstract. paste0() is vectorised over the columns, so the snippet count
# always equals the number of rows in article_df (no hard-coded row count) and
# the data frame is not indexed row by row.
formatted_column <- paste0(
  "<h4>", as.character(article_df$title), "</h4>",
  "<p style='font-size:12px'>", as.character(article_df$authors), "</p>",
  "<p style='font-size:11px'>", as.character(article_df$journal), ", ",
  as.character(article_df$doi), # ",", as.character(article_df$pages),
  "</p>",
  "<p style='font-size:11px'>", as.character(article_df$abstract), "</p>"
)
article_df$formatted_column <- formatted_column
# dictionary_words <- as.character(row.names(WordVectors))
# wrap_string: wrap text for HTML display.
#   x     - text to wrap; strwrap() flattens its input, so everything passed
#           in is collapsed into ONE string (call once per element).
#   width - target column passed to strwrap(); defaults to 50.
# Returns a single string with the wrapped lines joined by " <br> ".
wrap_string <- function(x, width = 50) {
  paste(strwrap(x, width), collapse = " <br> ")
}
# Add a line-wrapped copy of every title plus a running row index.
# wrap_string() collapses its whole input into one string, so it is applied
# to each title separately; vapply() guarantees one character value per row
# and also copes with a zero-row article_df.
# wrap_title is kept as a one-column data frame in the workspace; no loop
# index is left behind.
wrap_title <- data.frame(
  wrap_title = vapply(
    as.character(article_df$title),
    wrap_string,
    character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
article_df$wrap_title <- wrap_title$wrap_title
article_df$index <- seq_len(nrow(article_df))
# store the row index as a double
article_df$index <- as.numeric(article_df$index)

# change year: take the first four characters of publish_time as an integer
article_df <- article_df %>%
  mutate(year = as.integer(substr(publish_time, start = 1, stop = 4)))

# clean data in article_df
# title: drop the first "AAAS", then strip the listed punctuation characters.
# abstract: for each label below, in order, remove its first occurrence
# (the patterns are not anchored to the start of the text).
# NOTE(review): "BACKGROUD: " presumably targets a misspelling present in the
# data - verify. Bracketed markers ("[Table:", "[Image:", "text]") are not
# removed; the original replacements for them were commented out.
# NOTE(review): formatted_column and wrap_title are built earlier in this
# chunk, i.e. from the uncleaned title/abstract - confirm that is intended.
article_df %<>%
  mutate(
    title = gsub(
      "[][!#$*,.:;<=>@^_`|~.{}]",
      "",
      stringr::str_replace(title, "AAAS", "")
    ),
    abstract = Reduce(
      function(text, label) stringr::str_replace(text, label, ""),
      c(
        "BACKGROUND:",
        "Background: ",
        "PURPOSE: ",
        "Aim: ",
        "Objective: ",
        "INTRODUCTION: ",
        "BACKGROUD: ",
        "OBJECTIVES: ",
        "<p><b>Objective</b>",
        "ABSTRACT: ",
        "ABSTRACT",
        "SIMPLE SUMMARY: ",
        "SUMMARY: "
      ),
      abstract
    )
  )
```
# Save data
```{r}
# Save every object currently in the workspace (hidden ones included) into a
# single archive. The output directory is created first so save() does not
# fail when "data/" is missing; an existing directory is left untouched.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
save(list = ls(all.names = TRUE), file = "data/corona_archive.RData")
```