
Commit 0038858

Merge pull request #98 from MicheleNuijten/update-pdftext
Update pdftext
2 parents 6604b84 + 88f85b9 commit 0038858

21 files changed: +849 -104 lines changed

DESCRIPTION

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ Imports:
     rlang,
     rmarkdown,
     stringi,
-    tcltk
+    tcltk,
+    pdftools
 Suggests:
     testthat
 ByteCompile: yes

NAMESPACE

Lines changed: 5 additions & 0 deletions
@@ -33,3 +33,8 @@ importFrom(graphics,plot.default)
 importFrom(graphics,points)
 importFrom(graphics,text)
 importFrom(rlang,.data)
+importFrom(stringi,stri_enc_toutf32)
+importFrom(stringi,stri_enc_fromutf32)
+importFrom(stringi,stri_split_lines)
+importFrom(stringi,stri_split_regex)
+importFrom(pdftools,pdf_text)

NEWS.md

Lines changed: 10 additions & 0 deletions
@@ -1,3 +1,13 @@
+# statcheck 1.6.0
+
+## Major changes
+
+## Small updates
+* Changed the way pdf files are converted to text. Initially, statcheck relied on the external program Xpdf, which had to be installed separately and added to the path. To simplify the workflow (and to improve the pdf conversions), statcheck now uses the R package pdftools.
+* Updated the test files for testing the file-to-text conversion. Before, I hard-coded the true values statcheck should extract, but this was error-prone. Now there is a spreadsheet with manually extracted values (the gold standard) that is used as a reference. This is easier to update if necessary: the updates then only need to happen in one place (the spreadsheet) and no longer in multiple places in the code.
+
+## Bug fixes
+
 # statcheck 1.5.0
 
 ## Major changes
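
As background for the pdftools change described above, here is a minimal sketch of the new conversion path called directly. It is not part of the commit, and "example.pdf" is a placeholder file name:

  # convert a pdf to text with pdftools instead of the external Xpdf/pdftotext
  library(pdftools)

  pages <- pdftools::pdf_text("example.pdf")   # one character string per page
  text  <- paste(pages, collapse = " ")        # collapse the pages into one string
  nchar(text)                                  # rough size of the extracted text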

R/doc-checkdirs.R

Lines changed: 0 additions & 20 deletions
This file was deleted.

R/doc-checkfiles.R

Lines changed: 0 additions & 14 deletions
This file was deleted.

R/file-to-txt.R

Lines changed: 144 additions & 21 deletions
@@ -91,27 +91,150 @@ getHTML <- function(x){
 # PDF TO TXT -------------------------------------------------------------------
 getPDF <- function(x){
 
-  txtfiles <- character(length(x))
-  for (i in 1:length(x)){
-
-    system(paste('pdftotext -q -enc "ASCII7" "', x[i], '"', sep = ""))
-    if (file.exists(gsub("\\.pdf$", "\\.txt", x[i]))) {
-      fileName <- gsub("\\.pdf$", "\\.txt", x[i])
-      strings <- readChar(fileName, file.info(fileName)$size)
-
-      # remove carriage returns and new lines
-      strings <- gsub(x = strings, pattern = "[\r\n]", replacement = "")
-
-      # save result in vector
-      txtfiles[i] <- strings
-
-    } else{
-
-      warning(paste("Failure in file", x[i]))
-      txtfiles[i] <- ""
-
-    }
-  }
+  txtfiles <- sapply(x, pdftools::pdf_text)
+
+  # encode everything in UTF-32
+  # this should ensure the same output across multiple operating systems
+  txtfiles <- stringi::stri_enc_toutf32(txtfiles)
+
+  # Replace known weird characters
+
+  # substitute double solidus (UTF-32 decimal 11005) with equal sign (UTF-32
+  # decimal 61) [issue in APA journals]
+  txtfiles <- lapply(txtfiles, gsub, pattern = "11005",
+                     replacement = "61", fixed = TRUE)
+
+  # substitute 1/4 (UTF-32 decimal 188) with equal sign (UTF-32 decimal 61)
+  # [issue in Elsevier journal: Journal of Environmental Psychology]
+  txtfiles <- lapply(txtfiles, gsub, pattern = "188",
+                     replacement = "61", fixed = TRUE)
+
+  # substitute U+2B0D (UTF-32 decimal 11021) with less than sign
+  # (UTF-32 decimal 60) [issue in APA journals]
+  txtfiles <- lapply(txtfiles, gsub, pattern = "11021",
+                     replacement = "60", fixed = TRUE)
+
+  # substitute ! (UTF-32 decimal 33) with less than sign (UTF-32 decimal 60)
+  # [issue in Oxford journal: Journal of Consumer Research]
+  txtfiles <- lapply(txtfiles, gsub, pattern = "33",
+                     replacement = "60", fixed = TRUE)
+
+  # substitute U+2AFA (UTF-32 decimal 11002) with hyphen-minus sign
+  # (UTF-32 decimal 45) [issue in APA journals]
+  txtfiles <- lapply(txtfiles, gsub, pattern = "11002",
+                     replacement = "45", fixed = TRUE)
+
+  # substitute U+2439 (UTF-32 decimal 9273) with small greek chi
+  # (UTF-32 decimal 967) [issue in APA journals]
+  txtfiles <- lapply(txtfiles, gsub, pattern = "9273",
+                     replacement = "967", fixed = TRUE)
+
+  # Revert to UTF-8 encoding
+  txtfiles <- stringi::stri_enc_fromutf32(txtfiles)
+
+  # Arrange text according to the paper's column layout
+  txtfiles <- pdf_columns(txtfiles)
+
+  # Paste the different pages together, so that each pdf is converted to
+  # one string of text
+  txtfiles <- stringr::str_c(unlist(txtfiles), collapse = "")
+
+  # substitute the letter "b" in an NHST result with a "<". This is not feasible
+  # in UTF-32 encoding, because writing a regex that only substitutes the b in
+  # a statistical result instead of ALL b's in the paper is very hard in
+  # UTF-32 encoding. [issue in Elsevier journal: JESP]
+  txtfiles <- lapply(txtfiles, gsub,
+                     # don't match a b preceded by =<>, because the b itself
+                     # should be the comparison sign.
+                     # only match a b followed by a number; that gives further
+                     # proof that the b is in fact the comparison sign.
+                     pattern = RGX_B_SMALLER,
+                     replacement = "<", perl = TRUE)
+
+  # substitute the letter "N" in an NHST result with a ">", for the same reason
+  # as above. [issue in Elsevier journal: JESP]
+  txtfiles <- lapply(txtfiles, gsub,
+                     # don't match a N preceded by =<>, because the N itself
+                     # should be the comparison sign.
+                     # only match a N followed by a number; that gives further
+                     # proof that the N is in fact the comparison sign.
+                     pattern = RGX_N_LARGER,
+                     replacement = ">", perl = TRUE)
+
+  # substitute the letter "p" that should be a "=". [issue in Oxford journal:
+  # Journal of Consumer Research]
+  txtfiles <- lapply(txtfiles, gsub,
+                     # don't match a p preceded by a "," or a ",\\s", because
+                     # that is the actual p-value.
+                     # only match a p followed by a number; that gives further
+                     # proof that the p is in fact the comparison sign.
+                     pattern = RGX_P_EQUAL,
+                     replacement = "=", perl = TRUE)
+
+  # substitute the letter "B" that should be a '"'. [issue in BRM]
+  txtfiles <- lapply(txtfiles, gsub,
+                     # only match a B followed by a letter that could indicate
+                     # a test statistic
+                     pattern = RGX_B_QUOTE,
+                     replacement = '"', perl = TRUE)
 
   return(txtfiles)
 }
+
+# helper function for getPDF() -------------------------------------------------
+
+# This function helps maintain the format of pdf files with a multi-column
+# layout.
+# Credits to:
+# https://github.com/fsingletonthorn/EffectSizeScraping/blob/master/R/pdf_process.R
+# for the original function
+
+true_false <- function(x, chars) {
+  x > chars
+}
+
+pdf_columns <- function(x, pattern = "\\p{WHITE_SPACE}{3,}") {
+  # \p{WHITE_SPACE} matches a single white-space code point
+  # {3,} three or more of these
+
+  # This function is slightly adapted from pdfsearch
+  # see: https://github.com/lebebr01/pdfsearch/blob/master/R/split_pdf.r
+
+  x_lines <- stringi::stri_split_lines(x)
+  x_lines <- lapply(x_lines, gsub,
+                    # strip between 1 and 20 white-space characters (\s)
+                    # at the start (^) of each line
+                    pattern = "^\\s{1,20}",
+                    replacement = "")
+
+  # split each line into columns at runs of three or more white-space characters
+  x_page <- lapply(
+    x_lines,
+    stringi::stri_split_regex,
+    pattern = pattern,
+    omit_empty = NA,
+    simplify = TRUE
+  )
+
+  page_lines <- unlist(lapply(x_page, nrow))
+  columns <- unlist(lapply(x_page, ncol))
+
+  # keep only cells with more than 3 characters
+  num_chars <- lapply(x_page, base::nchar)
+  num_chars_tf <- lapply(num_chars, true_false, chars = 3)
+
+  for (xx in seq_along(num_chars_tf)) {
+    num_chars_tf[[xx]][is.na(num_chars_tf[[xx]])] <- FALSE
+  }
+
+  output <- lapply(seq_along(x_page), function(xx)
+    x_page[[xx]][num_chars_tf[[xx]]])
+
+  output <- lapply(output, paste, collapse = " ")
+  return(output)
+}
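
The UTF-32 round trip in getPDF() can be illustrated in isolation. The sketch below is not part of the commit: it uses direct integer replacement of code points rather than the gsub-over-decimal-strings approach used in getPDF(), and the input string is made up:

  library(stringi)

  s  <- "p \u2b0d .05"                 # a "<" that a pdf conversion rendered as U+2B0D
  cp <- stri_enc_toutf32(s)[[1]]       # integer code points; U+2B0D is decimal 11021
  cp[cp == 11021] <- 60                # 60 is the code point of "<"
  stri_enc_fromutf32(cp)               # back to UTF-8: "p < .05"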

R/helper-load-manual-reference.R

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+# this script contains a helper function for the unit tests to load the
+# manual reference file and select relevant rows to compare statcheck output to
+
+
+load_manual <- function(
+  path_manual,                          # path to the reference file
+  apa = TRUE,                           # only consider APA-reported stats
+  pdf_conversion_issues = FALSE,        # exclude cases where pdf conversion led to weird characters
+  typesetting_issues = FALSE,           # exclude cases where typesetting issues led to weird situations
+  file_type = c("all", "pdf", "html"),  # select specific file types
+  file_id = NULL                        # select specific files based on the file_id variable
+){
+
+  # load the reference file with manually extracted statistics
+  manual <- read.csv2(system.file(path_manual, package = "statcheck"), header = TRUE)
+
+  # row selection based on arguments
+  if(apa == TRUE){
+    manual <- manual[manual$extract_apa == 1, ]
+  }
+
+  if(pdf_conversion_issues == FALSE){
+    manual <- manual[manual$pdf_conversion_issues == 0, ]
+  }
+
+  if(typesetting_issues == FALSE){
+    manual <- manual[manual$typesetting_issues == 0, ]
+  }
+
+  if(file_type[1] == "pdf"){
+    manual <- manual[manual$file_type == "pdf", ]
+  } else if(file_type[1] == "html") {
+    manual <- manual[manual$file_type == "html", ]
+  }
+
+  if(!is.null(file_id)){
+    manual <- manual[manual$file_id == file_id, ]
+  }
+
+  return(manual)
+}
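
A hypothetical call to this helper from a unit test might look like the sketch below; the path and file id are placeholders, not values taken from the commit:

  manual <- load_manual(
    path_manual = "test_materials/manual_reference.csv",  # assumed location inside the package
    apa = TRUE,                                           # keep only APA-style reported results
    file_type = "pdf",                                    # restrict to the pdf test files
    file_id = "article_01"                                # a single hypothetical test article
  )
  nrow(manual)  # number of manually extracted results to compare statcheck output against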

R/regex.R

Lines changed: 20 additions & 0 deletions
@@ -96,4 +96,24 @@ RGX_WEIRD_MINUS <- "\\s?[^\\d\\.\\s]+(?=\\d|\\.)"
 # F-tests and when df1 == 1, it gets typeset as the letter l or I
 RGX_DF1_I_L <- "I|l"
 
+################################################################################
+###################### REGEXES FOR WEIRD PDF ENCODING ##########################
+################################################################################
+
+# in some JESP articles, a < is translated as a b
+# this regex is used in file-to-txt.R to replace it
+RGX_B_SMALLER <- "(?<![=<>])b(?=\\s?-?\\s?\\.?\\d)"
+
+# in some JESP articles, a > is translated as a N
+# this regex is used in file-to-txt.R to replace it
+RGX_N_LARGER <- "(?<![=<>])N(?=\\s?-?\\s?\\.?\\d)"
+
+# in the Journal of Consumer Research, a = is translated as a p
+# this regex is used in file-to-txt.R to replace it
+RGX_P_EQUAL <- "(?<!(,\\s)|,)p(?=\\s?-?\\s?\\.?\\d)"
 
+# in the Nuijten et al. 2016 article, quotes are translated as B
+# this means that tests between quotes are not detected, because
+# statcheck can only find tests if they are not preceded by other
+# letters. Find an upper case B followed by a test statistic
+RGX_B_QUOTE <- "B(?=(t|F|r|Q|z|Z))"
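
To see what the first of these regexes does in practice, the sketch below (not part of the commit) applies RGX_B_SMALLER to a made-up string in which the pdf conversion turned "<" into "b":

  RGX_B_SMALLER <- "(?<![=<>])b(?=\\s?-?\\s?\\.?\\d)"

  garbled <- "t(28) = 2.20, p b .05"
  gsub(RGX_B_SMALLER, "<", garbled, perl = TRUE)
  # [1] "t(28) = 2.20, p < .05"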
Binary file not shown (11.8 KB).
