@@ -91,27 +91,150 @@ getHTML <- function(x){
9191# PDF TO TXT -------------------------------------------------------------------
# Convert PDF text to a single string and repair characters that are known to
# be extracted incorrectly in reported statistical results.
#
# Args:
#   x: Character vector of PDF file path(s). The pages of everything in `x`
#      are pasted into ONE string, so pass a single file at a time to get one
#      string per PDF.
#
# Returns:
#   A list of length one whose element is the full text as a single string.
getPDF <- function(x) {

  # Read the text page by page. lapply() + unlist() always yields a plain
  # character vector with one element per page; sapply() would return a
  # vector, a matrix or a list depending on the number of pages.
  txtfiles <- as.character(
    unlist(lapply(x, pdftools::pdf_text), use.names = FALSE)
  )

  # Encode everything in UTF-32 (one integer code point per character).
  # This should ensure the same output across multiple operating systems.
  code_points <- stringi::stri_enc_toutf32(txtfiles)

  # Replace known weird characters. Each entry maps a wrongly extracted code
  # point (decimal) to the code point that was intended:
  #   11005 (U+2AFD, double solidus) -> 61 (=)   [APA journals]
  #     188 (U+00BC, 1/4)            -> 61 (=)   [Elsevier: Journal of
  #                                               Environmental Psychology]
  #   11021 (U+2B0D)                 -> 60 (<)   [APA journals]
  #      33 (!)                      -> 60 (<)   [Oxford: Journal of Consumer
  #                                               Research]
  #   11002 (U+2AFA)                 -> 45 (-)   [APA journals]
  #    9273 (U+2439)                 -> 967 (small Greek chi) [APA journals]
  wrong <- c(11005L, 188L, 11021L, 33L, 11002L, 9273L)
  right <- c(61L, 61L, 60L, 60L, 45L, 967L)

  # Compare whole code points instead of running gsub() on their decimal
  # string form: a fixed substring match on "33" would also rewrite e.g.
  # 233 (e with acute accent) into 260, corrupting unrelated characters.
  code_points <- lapply(code_points, function(page) {
    hit <- match(page, wrong, nomatch = 0L)
    # right[hit] silently drops the zero (no match) positions, so both sides
    # of the assignment have the same length.
    page[hit > 0L] <- right[hit]
    page
  })

  # Revert to UTF-8 encoding
  txtfiles <- stringi::stri_enc_fromutf32(code_points)

  # Arrange text according to paper column layout
  txtfiles <- pdf_columns(txtfiles)

  # Paste the different pages together, so that the PDF is converted to
  # one string of text
  txtfiles <- stringr::str_c(unlist(txtfiles), collapse = " ")

  # The substitutions below depend on context (neighbouring characters), so
  # they are done with regular expressions on the text itself rather than on
  # UTF-32 code points.

  # Substitute the letter "b" in a NHST result for a "<". The regex should not
  # match a b preceded by =<>, because the b itself should be the comparison
  # sign, and only a b followed by a number, which gives further proof that
  # the b is in fact the comparison sign. [issue in Elsevier journal: JESP]
  txtfiles <- lapply(txtfiles, gsub,
                     pattern = RGX_B_SMALLER,
                     replacement = "<", perl = TRUE)

  # Substitute the letter "N" in a NHST result for a ">", for the same reason
  # as above. [issue in Elsevier journal: JESP]
  txtfiles <- lapply(txtfiles, gsub,
                     pattern = RGX_N_LARGER,
                     replacement = ">", perl = TRUE)

  # Substitute the letter "p" that should be a "=". The regex should not match
  # a p preceded by a "," or a ",\\s", because that is the actual p-value.
  # [issue in Oxford journal: Journal of Consumer Research]
  txtfiles <- lapply(txtfiles, gsub,
                     pattern = RGX_P_EQUAL,
                     replacement = "=", perl = TRUE)

  # Substitute the letter "B" that should be a '"'. The regex should only
  # match a B followed by a letter that could indicate a test statistic.
  # [issue in BRM]
  txtfiles <- lapply(txtfiles, gsub,
                     pattern = RGX_B_QUOTE,
                     replacement = '"', perl = TRUE)

  txtfiles
}
187+
188+
189+ # helper function for getPDF() -------------------------------------------------
190+
# This function helps maintain the text order of PDF files that use a
# multiple-column layout.
# Credits for the original function go to:
# https://github.com/fsingletonthorn/EffectSizeScraping/blob/master/R/pdf_process.R
196+
# Element-wise test of whether the values in `x` are strictly greater than
# the threshold `chars`. NA values in `x` stay NA in the result.
true_false <- function(x, chars) {
  exceeds_threshold <- x > chars
  exceeds_threshold
}
200+
# Re-order the text of each page so that a multi-column layout is read column
# by column instead of straight across the page.
#
# This function is slightly adapted from pdfsearch, see:
# https://github.com/lebebr01/pdfsearch/blob/master/R/split_pdf.r
#
# Args:
#   x:       Character vector with one element per page.
#   pattern: Regular expression that marks the gap between two columns.
#            \p{WHITE_SPACE} matches a single white-space code point and
#            {3,} means three or more of them in a row.
#
# Returns:
#   A list with one string per page.
pdf_columns <- function(x, pattern = "\\p{WHITE_SPACE}{3,}") {

  # Split every page into its lines and strip leading white space:
  #   ^       anchors the match at the start of the line
  #   \s      matches any white-space character
  #   {1,20}  between 1 and 20 of these
  x_lines <- stringi::stri_split_lines(x)
  x_lines <- lapply(x_lines, gsub,
                    pattern = "^\\s{1,20}",
                    replacement = "")

  # Split every line at the column gaps. With simplify = TRUE each page
  # becomes a character matrix (one row per line, one column per fragment);
  # omit_empty = NA turns empty fragments into NA.
  x_page <- lapply(
    x_lines,
    stringi::stri_split_regex,
    pattern = pattern,
    omit_empty = NA,
    simplify = TRUE
  )

  lapply(x_page, function(cells) {
    # Keep only fragments longer than 3 characters; NA fragments (empty or
    # padding cells) are dropped as well.
    keep <- true_false(base::nchar(cells), chars = 3)
    keep[is.na(keep)] <- FALSE

    # Logical indexing walks the matrix column by column, which puts all
    # fragments of the first text column before those of the second, etc.
    paste(cells[keep], collapse = " ")
  })
}
0 commit comments