forked from cezary-rosinski/Institute-of-Literary-Research
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vimr_cz_pl_translations_bn.Rmd
83 lines (80 loc) · 2.77 KB
/
vimr_cz_pl_translations_bn.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
---
title: "R Notebook"
output: html_notebook
---
```{r}
# Session-wide settings, applied in a single call:
#  - java.parameters: JVM start-up flags ("-Xmx32000m").
#    NOTE(review): presumably meant for an rJava-based package; none is loaded
#    in this chunk - confirm it is still needed.
#  - scipen = 999: strongly prefer fixed over scientific notation when numbers
#    are converted to text.
options(
  java.parameters = "-Xmx32000m",
  scipen = 999
)
pacman::p_load(plyr,stringr,tidyverse,reshape2,zoo,sqldf,splitstackshape,dplyr,svMisc,googlesheets4)
# Build `data_full`: one row per (record id, MARC field) with the content of
# every Czech -> Polish translation record dated 1945 or later.
# The database dump is read in pieces, one .mrk text file at a time.
x <- 1:34
mrk_prefix <- "C:/Users/User/Desktop/bn_all_30.10.2019/msplit000000"

# Read one .mrk piece and return one row per field line (columns `id`,
# `field`, `content`) for the records that pass the year and language filters.
#   path     - path of the .mrk text file to read
#   min_year - earliest year (characters 14-17 of the =008 line) to keep
read_translation_lines <- function(path, min_year = 1945L) {
  tibble(bn1 = readLines(path, encoding = "UTF-8")) %>%
    filter(bn1 != "") %>%
    mutate(
      # every =LDR line opens a new record
      record = cumsum(grepl("^\\=LDR", bn1)),
      # record identifier, taken from the =009 line
      id = ifelse(grepl("^\\=009", bn1),
                  str_replace_all(bn1, "(^\\=009 .*?\\w)(.*?)($)", "\\2"),
                  NA_character_),
      # year, taken from the =008 line
      rok = ifelse(grepl("^\\=008", bn1), str_sub(bn1, 14, 17), NA_character_),
      # =041 line that marks a translation from Czech into Polish
      translation = grepl("^\\=041", bn1) & grepl("\\$apol\\$hcze", bn1)
    ) %>%
    group_by(record) %>%
    # dplyr:: is spelled out because plyr is attached too and its mutate()
    # ignores groups
    dplyr::mutate(
      # spread the record-level values over all lines of the record
      id = id[!is.na(id)][1],
      # compare years as numbers; a year that is not fully numeric becomes NA
      # and the record is dropped by the filter below
      rok = suppressWarnings(as.integer(rok[!is.na(rok)][1])),
      # decide per record, so a record with several =041 lines is kept or
      # dropped as a whole
      translation = any(translation)
    ) %>%
    ungroup() %>%
    filter(!is.na(rok), rok >= min_year, translation) %>%
    transmute(
      id,
      field = str_replace_all(bn1, "(=)(\\w{3})(.*)", "\\2"),
      content = str_replace_all(bn1, "(=)(\\w{3})(\\s{2})(.*)", "\\4")
    )
}

# Collect the pieces in a preallocated list instead of growing a data frame
# with rbind() on every iteration.
pieces <- vector("list", length(x))
for (i in seq_along(x)) {
  progress(i, max.value = length(x))
  path <- paste0(mrk_prefix, sprintf("%02d", x[i] - 1), ".mrk")
  pieces[[i]] <- read_translation_lines(path)
}

# Join the values of fields repeated within a record with "|", in the order
# the lines were read, so that every (id, field) pair occurs exactly once.
# Records without an =009 line share a missing id and end up in one group.
data_full <- bind_rows(pieces) %>%
  group_by(id, field) %>%
  dplyr::summarise(content = paste(content, collapse = "|")) %>%
  ungroup() %>%
  as.data.frame()
remove(pieces)
# Reshape to one row per record id and one column per MARC field, then export.
# fun.aggregate is given explicitly because dcast() otherwise falls back to
# length() - turning every cell into a count - as soon as any id/field pair
# occurs more than once; repeated values are joined with "|" instead.
# fill = NA_character_ keeps absent fields as NA, so they are still written as
# empty cells (na = '').
data_full_table <- dcast(
  data_full, id ~ field,
  value.var = "content",
  fun.aggregate = function(values) paste(values, collapse = "|"),
  fill = NA_character_
)
write.csv2(data_full_table, file = "C:/Users/User/Desktop/vimr_translations.csv", row.names = FALSE, na = '', fileEncoding = 'UTF-8')
```