Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
michalovadek authored Mar 8, 2021
1 parent 4028655 commit 0f1c71e
Show file tree
Hide file tree
Showing 20 changed files with 300 additions and 110 deletions.
17 changes: 16 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,24 @@
# eurlex 0.3.5

## Major changes

- it is now possible to select all resource types available with `elx_make_query(resource_type = "any")`. Since there are nearly 1 million CELEX codes, use with discretion and expect long execution times
- results can be restricted to a particular directory code with `elx_make_query(directory = "18")` (directory code "18" denotes Common Foreign and Security Policy)
- results can be restricted to a particular sector with `elx_make_query(sector = 2)` (sector code 2 denotes EU international agreements)

## Minor changes

- new feature: request date of court case submission `elx_make_query(include_date_lodged = TRUE)`
- new feature: request type of court procedure and outcome `elx_make_query(include_court_procedure = TRUE)`
- new feature: request directory code of legal act `elx_make_query(include_directory = TRUE)`
- `elx_curia_list()` has a new parameter `parse` (default `TRUE`) which creates separate columns for `ecli`, `see_case` and `appeal` by applying regular expressions to `case_info`

# eurlex 0.3.4

## Major changes

- new feature: request citations referenced in target resource with `elx_make_query(include_citations = TRUE)`; retrieved in CELEX form
- new feature: request document author(s) with elx_make_query(include_author = TRUE)
- new feature: request document author(s) with `elx_make_query(include_author = TRUE)`
- XML parsing is now more efficient due to utilising (rather than stripping) namespaces (but still room for improvement)

## Minor changes
Expand Down
60 changes: 52 additions & 8 deletions R/elx_curia_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,19 @@
#'
#' @param data Data to be scraped from four separate lists of cases maintained by Curia, defaults to "all"
#' which contains cases from Court of Justice, General Court and Civil Service Tribunal.
#' @param parse If `TRUE`, references to cases and appeals are parsed out from `case_info` into separate columns
#' @return
#' A data frame containing case identifiers and information as character columns.
#' A data frame containing case identifiers and information as character columns. Where the case id
#' contains a hyperlink to Eur-Lex, the CELEX identifier is retrieved as well.
#' @importFrom rlang .data
#' @export
#' @examples
#' \donttest{
#' elx_curia_list(data = "cst_all")
#' }

elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all")){
elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"),
parse = TRUE){

url_c1 <- "https://curia.europa.eu/en/content/juris/c1_juris.htm"
url_c2 <- "https://curia.europa.eu/en/content/juris/c2_juris.htm"
Expand All @@ -37,14 +40,20 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"
res_t2 <- elx_curia_scraper(url_t2)
res_f1 <- elx_curia_scraper(url_f1)

return(rbind(res_c1, res_c2, res_t2, res_f1))
res_all <- rbind(res_c1, res_c2, res_t2, res_f1)

if (parse == TRUE){res_all <- elx_curia_parse(res_all)}

return(res_all)

}

else if (data == "ecj_old"){

res_c1 <- elx_curia_scraper(url_c1)

if (parse == TRUE){res_c1 <- elx_curia_parse(res_c1)}

return(res_c1)

}
Expand All @@ -53,6 +62,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"

res_c2 <- elx_curia_scraper(url_c2)

if (parse == TRUE){res_c2 <- elx_curia_parse(res_c2)}

return(res_c2)

}
Expand All @@ -61,6 +72,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"

res_t2 <- elx_curia_scraper(url_t2)

if (parse == TRUE){res_t2 <- elx_curia_parse(res_t2)}

return(res_t2)

}
Expand All @@ -69,22 +82,23 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"

res_f1 <- elx_curia_scraper(url_f1)

if (parse == TRUE){res_f1 <- elx_curia_parse(res_f1)}

return(res_f1)

}


}


#' Scraper function
#' Curia scraper function
#'
#' @importFrom rlang .data
#'
#' @noRd
#'

elx_curia_scraper <- function(url){
elx_curia_scraper <- function(url, ...){

page <- xml2::read_html(url)

Expand Down Expand Up @@ -114,8 +128,38 @@ elx_curia_scraper <- function(url){
dplyr::mutate(n_id = dplyr::row_number()) %>%
dplyr::ungroup()

return(dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>%
dplyr::select(-.data$n_id))
out <- dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>%
dplyr::select(.data$case_id, .data$linked_celex, .data$case_info) %>%
dplyr::rename(case_id_celex = linked_celex)

return(out)

}

#' Curia parser function
#'
#' Adds three columns derived from `case_info` by regular expressions:
#' `ecli` (the first ECLI-shaped identifier found), `see_case` (the text
#' following "see Case ", cut off at any "APPEAL" note) and `appeal` (the text
#' from "APPEAL" onwards, without its label and without `;`, `,` and `.`).
#' Rows where a pattern does not match receive `NA` in the respective column.
#'
#' @param x Data frame containing a character column `case_info`
#' @param ... Not used inside the function body
#' @return `x` with the columns `ecli`, `see_case` and `appeal` added
#'
#' @importFrom rlang .data
#'
#' @noRd
#'

elx_curia_parse <- function(x, ...){

  # Reference to another case: keep what follows the "see Case " label and
  # drop an appeal note that may trail on the same match
  extract_see_case <- function(info){
    ref <- stringr::str_extract(info, "see Case .+")
    ref <- stringr::str_remove(ref, "see Case ")
    ref <- stringr::str_remove(ref, "APPEAL.*")
    ref <- stringr::str_squish(ref)
    stringr::str_trim(ref)
  }

  # Appeal note: strip the "APPEAL :" / "APPEALS :" style label, then all
  # semicolons, commas and full stops
  extract_appeal <- function(info){
    note <- stringr::str_extract(info, "APPEAL.*")
    note <- stringr::str_remove(note, "APPEAL.? :")
    note <- stringr::str_remove_all(note, "\\;|\\,|\\.")
    note <- stringr::str_squish(note)
    stringr::str_trim(note)
  }

  parsed <- dplyr::mutate(
    x,
    ecli = stringr::str_extract(.data$case_info, "ECLI:EU:[:upper:]:[:digit:]{4}:[:digit:]+"),
    see_case = extract_see_case(.data$case_info),
    appeal = extract_appeal(.data$case_info)
  )

  return(parsed)

}

76 changes: 72 additions & 4 deletions R/elx_make_query.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#'
#' @param resource_type Type of resource to be retrieved via SPARQL query
#' @param manual_type Define manually the type of resource to be retrieved
#' @param directory Restrict the results to a given directory code
#' @param sector Restrict the results to a given sector code
#' @param include_corrigenda If `TRUE`, results include corrigenda
#' @param include_celex If `TRUE`, results include CELEX identifier for each resource URI
#' @param include_date If `TRUE`, results include document date
Expand All @@ -21,6 +23,8 @@
#' @param include_author If `TRUE`, results include document author(s)
#' @param include_citations If `TRUE`, results include citations (CELEX-labelled)
#' @param include_court_procedure If `TRUE`, results include type of court procedure and outcome
#' @param include_directory If `TRUE`, results include the Eur-Lex directory code
#' @param include_sector If `TRUE`, results include the Eur-Lex sector code
#' @param order Order results by ids
#' @param limit Limit the number of results, for testing purposes mainly
#' @return
Expand All @@ -32,15 +36,17 @@
#' elx_make_query(resource_type = "caselaw")
#' elx_make_query(resource_type = "manual", manual_type = "SWD")

elx_make_query <- function(resource_type = c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl"),
manual_type = "", include_corrigenda = FALSE, include_celex = TRUE, include_lbs = FALSE,
elx_make_query <- function(resource_type = c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl","any"),
manual_type = "", directory = NULL, sector = NULL,
include_corrigenda = FALSE, include_celex = TRUE, include_lbs = FALSE,
include_date = FALSE, include_date_force = FALSE, include_date_endvalid = FALSE,
include_date_transpos = FALSE, include_date_lodged = FALSE,
include_force = FALSE, include_eurovoc = FALSE, include_author = FALSE,
include_citations = FALSE, include_court_procedure = FALSE,
include_directory = FALSE, include_sector = FALSE,
order = FALSE, limit = NULL){

if (!resource_type %in% c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl")) stop("'resource_type' must be defined")
if (!resource_type %in% c("any","directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl")) stop("'resource_type' must be defined")

if (resource_type == "manual" & nchar(manual_type) < 2){
stop("Please specify resource type manually (e.g. 'DIR', 'REG', 'JUDG').", call. = TRUE)
Expand Down Expand Up @@ -143,7 +149,54 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision"

}

query <- paste(query, "where{ ?work cdm:work_has_resource-type ?type.", sep = " ")
if (include_directory == TRUE){

query <- paste(query, "?directory", sep = " ")

}

if (include_sector == TRUE){

query <- paste(query, "?sector", sep = " ")

}

if (resource_type == "any"){
query <- paste(query, "where{", sep = " ")
}

if (resource_type != "any"){
query <- paste(query, "where{ ?work cdm:work_has_resource-type ?type.", sep = " ")
}

if (!missing(directory)){
if (!is.character(directory)) stop("Directory code must be of character type", call. = TRUE)

query <- paste(query, "
VALUES (?value)
{ (<http://publications.europa.eu/resource/authority/fd_555/",directory,">)
(<http://publications.europa.eu/resource/authority/dir-eu-legal-act/",directory,">)
}
{?work cdm:resource_legal_is_about_concept_directory-code ?value.
}
UNION
{?work cdm:resource_legal_is_about_concept_directory-code ?directory.
?value skos:narrower+ ?directory.
}
", sep = "")

}

if (!missing(sector)){
if (!sector %in% 0:9) stop("Sector code must be an integer between 0 and 9", call. = TRUE)

query <- paste(query, "
?work cdm:resource_legal_id_sector ?sector.
FILTER(str(?sector)='", sector, "')
",
sep = "")

}

if (resource_type == "directive"){
query <- paste(query, "FILTER(?type=<http://publications.europa.eu/resource/authority/resource-type/DIR>||
Expand Down Expand Up @@ -178,6 +231,9 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision"
?type=<http://publications.europa.eu/resource/authority/resource-type/ARRANG>||
?type=<http://publications.europa.eu/resource/authority/resource-type/CONVENTION>||
?type=<http://publications.europa.eu/resource/authority/resource-type/AGREE_AMEND>||
?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_ADOPT_INTERNATION>||
?type=<http://publications.europa.eu/resource/authority/resource-type/REG_ADOPT_INTERNATION>||
?type=<http://publications.europa.eu/resource/authority/resource-type/DEC_ADOPT_INTERNATION>||
?type=<http://publications.europa.eu/resource/authority/resource-type/MEMORANDUM_UNDERST>)", sep = " ")
}

Expand Down Expand Up @@ -347,6 +403,18 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision"

}

if (include_directory == TRUE){

query <- paste(query, "OPTIONAL{?work cdm:resource_legal_is_about_concept_directory-code ?directory.}")

}

if (include_sector == TRUE){

query <- paste(query, "OPTIONAL{?work cdm:resource_legal_id_sector ?sector.}")

}

if (order == TRUE){
query <- paste(query, "} order by str(?date)")
} else {query <- paste(query, "}")}
Expand Down
12 changes: 6 additions & 6 deletions R/elx_parse_xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ elx_parse_xml <- function(sparql_response = ""){

out <- dplyr::tibble(res_cols, res_text) %>%
dplyr::mutate(is_work = dplyr::if_else(res_cols=="eurovoc", T, NA)) %>%
dplyr::group_by(is_work) %>%
dplyr::group_by(.data$is_work) %>%
dplyr::mutate(triplet = dplyr::row_number(),
triplet = dplyr::if_else(is_work==T, triplet, NA_integer_)) %>%
triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>%
dplyr::ungroup() %>%
tidyr::fill(triplet) %>%
tidyr::fill(.data$triplet) %>%
dplyr::select(-.data$is_work) %>%
tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>%
dplyr::select(-.data$triplet)
Expand All @@ -35,11 +35,11 @@ elx_parse_xml <- function(sparql_response = ""){

out <- dplyr::tibble(res_cols, res_text) %>%
dplyr::mutate(is_work = dplyr::if_else(res_cols=="work", T, NA)) %>%
dplyr::group_by(is_work) %>%
dplyr::group_by(.data$is_work) %>%
dplyr::mutate(triplet = dplyr::row_number(),
triplet = dplyr::if_else(is_work==T, triplet, NA_integer_)) %>%
triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>%
dplyr::ungroup() %>%
tidyr::fill(triplet) %>%
tidyr::fill(.data$triplet) %>%
dplyr::select(-.data$is_work) %>%
tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>%
dplyr::select(-.data$triplet)
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,8 @@ List of resource types in Cellar (NAL): http://publications.europa.eu/resource/a
NAL of corporate bodies:
http://publications.europa.eu/resource/authority/corporate-body

Indexation of data in Cellar: http://publications.europa.eu/resource/cellar/4874abcd-286a-11e8-b5fe-01aa75ed71a1.0001.03/DOC_1
Query builder:
https://op.europa.eu/en/advanced-sparql-query-editor

SPARQL endpoint:
http://publications.europa.eu/webapi/rdf/sparql
14 changes: 6 additions & 8 deletions docs/articles/eurlexpkg.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified docs/articles/eurlexpkg_files/figure-html/firstplot-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 0f1c71e

Please sign in to comment.