Add files via upload

michalovadek · Mar 8, 2021 · 0f1c71e · 0f1c71e
1 parent 4028655
commit 0f1c71e
Show file tree

Hide file tree

Showing 20 changed files with 300 additions and 110 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,24 @@
+# eurlex 0.3.5
+
+## Major changes
+
+- it is now possible to select all resource types available with `elx_make_query(resource_type = "any")`. Since there are nearly 1 million CELEX codes, use with discretion and expect long execution times
+- results can be restricted to a particular directory code with `elx_make_query(directory = "18")` (directory code "18" denotes Common Foreign and Security Policy)
+- results can be restricted to a particular sector with `elx_make_query(sector = 2)` (sector code 2 denotes EU international agreements)
+
+## Minor changes
+
+- new feature: request date of court case submission `elx_make_query(include_date_lodged = TRUE)`
+- new feature: request type of court procedure and outcome `elx_make_query(include_court_procedure = TRUE)`
+- new feature: request directory code of legal act `elx_make_query(include_directory = TRUE)`
+- `elx_curia_list()` has a new default parameter `parse = TRUE` which creates separate columns for `ecli`, `see_case`, `appeal` applying regular expressions on `case_info`
+
 # eurlex 0.3.4
 
 ## Major changes
 
 - new feature: request citations referenced in target resource with elx_make_query(include_citations = TRUE); retrieved in CELEX form
-- new feature: request document author(s) with elx_make_query(include_author = TRUE)
+- new feature: request document author(s) with `elx_make_query(include_author = TRUE)`
 - XML parsing is now more efficient due to utilising (rather than stripping) namespaces (but still room for improvement)
 
 ## Minor changes

diff --git a/R/elx_curia_list.R b/R/elx_curia_list.R
@@ -5,16 +5,19 @@
 #'
 #' @param data Data to be scraped from four separate lists of cases maintained by Curia, defaults to "all"
 #' which contains cases from Court of Justice, General Court and Civil Service Tribunal.
+#' @param parse If `TRUE`, references to cases and appeals are parsed out from `case_info` into separate columns
 #' @return
-#' A data frame containing case identifiers and information as character columns.
+#' A data frame containing case identifiers and information as character columns. Where the case id
+#' contains a hyperlink to Eur-Lex, the CELEX identifier is retrieved as well.
 #' @importFrom rlang .data
 #' @export
 #' @examples
 #' \donttest{
 #' elx_curia_list(data = "cst_all")
 #' }
 
-elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all")){
+elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"),
+                           parse = TRUE){
 
   url_c1 <- "https://curia.europa.eu/en/content/juris/c1_juris.htm"
   url_c2 <- "https://curia.europa.eu/en/content/juris/c2_juris.htm"
@@ -37,14 +40,20 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"
     res_t2 <- elx_curia_scraper(url_t2)
     res_f1 <- elx_curia_scraper(url_f1)
 
-    return(rbind(res_c1, res_c2, res_t2, res_f1))
+    res_all <- rbind(res_c1, res_c2, res_t2, res_f1)
+
+    if (parse == TRUE){res_all <- elx_curia_parse(res_all)}
+
+    return(res_all)
 
   }
 
   else if (data == "ecj_old"){
 
     res_c1 <- elx_curia_scraper(url_c1)
 
+    if (parse == TRUE){res_c1 <- elx_curia_parse(res_c1)}
+
     return(res_c1)
 
   }
@@ -53,6 +62,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"
 
     res_c2 <- elx_curia_scraper(url_c2)
 
+    if (parse == TRUE){res_c2 <- elx_curia_parse(res_c2)}
+
     return(res_c2)
 
   }
@@ -61,6 +72,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"
 
     res_t2 <- elx_curia_scraper(url_t2)
 
+    if (parse == TRUE){res_t2 <- elx_curia_parse(res_t2)}
+
     return(res_t2)
 
   }
@@ -69,22 +82,23 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"
 
     res_f1 <- elx_curia_scraper(url_f1)
 
+    if (parse == TRUE){res_f1 <- elx_curia_parse(res_f1)}
+
     return(res_f1)
 
   }
 
 
 }
 
-
-#' Scraper function
+#' Curia scraper function
 #'
 #' @importFrom rlang .data
 #'
 #' @noRd
 #'
 
-elx_curia_scraper <- function(url){
+elx_curia_scraper <- function(url, ...){
 
   page <- xml2::read_html(url)
 
@@ -114,8 +128,38 @@ elx_curia_scraper <- function(url){
     dplyr::mutate(n_id = dplyr::row_number()) %>%
     dplyr::ungroup()
 
-  return(dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>%
-           dplyr::select(-.data$n_id))
+  out <- dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>%
+    dplyr::select(.data$case_id, .data$linked_celex, .data$case_info) %>%
+    dplyr::rename(case_id_celex = linked_celex)
+
+  return(out)
+
+}
+
+#' Curia parser function
+#'
+#' @importFrom rlang .data
+#'
+#' @noRd
+#'
+
+elx_curia_parse <- function(x, ...){
+
+  out <- x %>%
+    dplyr::mutate(ecli = stringr::str_extract(.data$case_info, "ECLI:EU:[:upper:]:[:digit:]{4}:[:digit:]+"),
+                  see_case = stringr::str_extract(.data$case_info, "see Case .+") %>%
+                    stringr::str_remove("see Case ") %>%
+                    stringr::str_remove("APPEAL.*") %>%
+                    stringr::str_squish() %>%
+                    stringr::str_trim(),
+                  appeal = stringr::str_extract(.data$case_info, "APPEAL.*") %>%
+                    stringr::str_remove("APPEAL.? :") %>%
+                    stringr::str_remove_all("\\;|\\,|\\.") %>%
+                    stringr::str_squish() %>%
+                    stringr::str_trim()
+    )
+
+  return(out)
 
 }
 
diff --git a/R/elx_make_query.R b/R/elx_make_query.R
@@ -8,6 +8,8 @@
 #'
 #' @param resource_type Type of resource to be retrieved via SPARQL query
 #' @param manual_type Define manually the type of resource to be retrieved
+#' @param directory Restrict the results to a given directory code
+#' @param sector Restrict the results to a given sector code
 #' @param include_corrigenda If `TRUE`, results include corrigenda
 #' @param include_celex If `TRUE`, results include CELEX identifier for each resource URI
 #' @param include_date If `TRUE`, results include document date
@@ -21,6 +23,8 @@
 #' @param include_author If `TRUE`, results include document author(s)
 #' @param include_citations If `TRUE`, results include citations (CELEX-labelled)
 #' @param include_court_procedure If `TRUE`, results include type of court procedure and outcome
+#' @param include_directory If `TRUE`, results include the Eur-Lex directory code
+#' @param include_sector If `TRUE`, results include the Eur-Lex sector code
 #' @param order Order results by ids
 #' @param limit Limit the number of results, for testing purposes mainly
 #' @return
@@ -32,15 +36,17 @@
 #' elx_make_query(resource_type = "caselaw")
 #' elx_make_query(resource_type = "manual", manual_type = "SWD")
 
-elx_make_query <- function(resource_type = c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl"),
-                           manual_type = "", include_corrigenda = FALSE, include_celex = TRUE, include_lbs = FALSE,
+elx_make_query <- function(resource_type = c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl","any"),
+                           manual_type = "", directory = NULL, sector = NULL,
+                           include_corrigenda = FALSE, include_celex = TRUE, include_lbs = FALSE,
                            include_date = FALSE, include_date_force = FALSE, include_date_endvalid = FALSE,
                            include_date_transpos = FALSE, include_date_lodged = FALSE,
                            include_force = FALSE, include_eurovoc = FALSE, include_author = FALSE,
                            include_citations = FALSE, include_court_procedure = FALSE,
+                           include_directory = FALSE, include_sector = FALSE,
                            order = FALSE, limit = NULL){
 
-  if (!resource_type %in% c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl")) stop("'resource_type' must be defined")
+  if (!resource_type %in% c("any","directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl")) stop("'resource_type' must be defined")
 
   if (resource_type == "manual" & nchar(manual_type) < 2){
     stop("Please specify resource type manually (e.g. 'DIR', 'REG', 'JUDG').", call. = TRUE)
@@ -143,7 +149,54 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision"
 
   }
 
-  query <- paste(query, "where{ ?work cdm:work_has_resource-type ?type.", sep = " ")
+  if (include_directory == TRUE){
+
+    query <- paste(query, "?directory", sep = " ")
+
+  }
+
+  if (include_sector == TRUE){
+
+    query <- paste(query, "?sector", sep = " ")
+
+  }
+
+  if (resource_type == "any"){
+    query <- paste(query, "where{", sep = " ")
+  }
+
+  if (resource_type != "any"){
+    query <- paste(query, "where{ ?work cdm:work_has_resource-type ?type.", sep = " ")
+  }
+
+  if (!missing(directory)){
+    if (!is.character(directory)) stop("Directory code must be of character type", call. = TRUE)
+
+    query <- paste(query, "
+    VALUES (?value)
+    { (<http://publications.europa.eu/resource/authority/fd_555/",directory,">)
+      (<http://publications.europa.eu/resource/authority/dir-eu-legal-act/",directory,">)
+    }
+    {?work cdm:resource_legal_is_about_concept_directory-code ?value.
+    }
+    UNION
+    {?work cdm:resource_legal_is_about_concept_directory-code ?directory.
+      ?value skos:narrower+ ?directory.
+    }
+    ", sep = "")
+
+  }
+
+  if (!missing(sector)){
+    if (!sector %in% 0:9) stop("Sector code must be an integer between 0 and 9", call. = TRUE)
+
+    query <- paste(query, "
+    ?work cdm:resource_legal_id_sector ?sector.
+    FILTER(str(?sector)='", sector, "')
+    ",
+                   sep = "")
+
+  }
 
   if (resource_type == "directive"){
     query <- paste(query, "FILTER(?type=<http://publications.europa.eu/resource/authority/resource-type/DIR>||
@@ -178,6 +231,9 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision"
   ?type=<http://publications.europa.eu/resource/authority/resource-type/ARRANG>||
   ?type=<http://publications.europa.eu/resource/authority/resource-type/CONVENTION>||
   ?type=<http://publications.europa.eu/resource/authority/resource-type/AGREE_AMEND>||
+  ?type=<http://publications.europa.eu/resource/authority/resource-type/RECO_ADOPT_INTERNATION>||
+  ?type=<http://publications.europa.eu/resource/authority/resource-type/REG_ADOPT_INTERNATION>||
+  ?type=<http://publications.europa.eu/resource/authority/resource-type/DEC_ADOPT_INTERNATION>||
   ?type=<http://publications.europa.eu/resource/authority/resource-type/MEMORANDUM_UNDERST>)", sep = " ")
   }
 
@@ -347,6 +403,18 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision"
 
   }
 
+  if (include_directory == TRUE){
+
+    query <- paste(query, "OPTIONAL{?work cdm:resource_legal_is_about_concept_directory-code ?directory.}")
+
+  }
+
+  if (include_sector == TRUE){
+
+    query <- paste(query, "OPTIONAL{?work cdm:resource_legal_id_sector ?sector.}")
+
+  }
+
   if (order == TRUE){
     query <- paste(query, "} order by str(?date)")
   } else {query <- paste(query, "}")}

diff --git a/R/elx_parse_xml.R b/R/elx_parse_xml.R
@@ -22,11 +22,11 @@ elx_parse_xml <- function(sparql_response = ""){
 
     out <- dplyr::tibble(res_cols, res_text) %>%
       dplyr::mutate(is_work = dplyr::if_else(res_cols=="eurovoc", T, NA)) %>%
-      dplyr::group_by(is_work) %>%
+      dplyr::group_by(.data$is_work) %>%
       dplyr::mutate(triplet = dplyr::row_number(),
-                    triplet = dplyr::if_else(is_work==T, triplet, NA_integer_)) %>%
+                    triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>%
       dplyr::ungroup() %>%
-      tidyr::fill(triplet) %>%
+      tidyr::fill(.data$triplet) %>%
       dplyr::select(-.data$is_work) %>%
       tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>%
       dplyr::select(-.data$triplet)
@@ -35,11 +35,11 @@ elx_parse_xml <- function(sparql_response = ""){
 
     out <- dplyr::tibble(res_cols, res_text) %>%
       dplyr::mutate(is_work = dplyr::if_else(res_cols=="work", T, NA)) %>%
-      dplyr::group_by(is_work) %>%
+      dplyr::group_by(.data$is_work) %>%
       dplyr::mutate(triplet = dplyr::row_number(),
-                    triplet = dplyr::if_else(is_work==T, triplet, NA_integer_)) %>%
+                    triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>%
       dplyr::ungroup() %>%
-      tidyr::fill(triplet) %>%
+      tidyr::fill(.data$triplet) %>%
       dplyr::select(-.data$is_work) %>%
       tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>%
       dplyr::select(-.data$triplet)

diff --git a/README.md b/README.md
@@ -48,4 +48,8 @@ List of resource types in Cellar (NAL): http://publications.europa.eu/resource/a
 NAL of corporate bodies:
 http://publications.europa.eu/resource/authority/corporate-body
 
-Indexation of data in Cellar: http://publications.europa.eu/resource/cellar/4874abcd-286a-11e8-b5fe-01aa75ed71a1.0001.03/DOC_1
+Query builder:
+https://op.europa.eu/en/advanced-sparql-query-editor
+
+SPARQL endpoint:
+http://publications.europa.eu/webapi/rdf/sparql
diff --git a/docs/articles/eurlexpkg.html b/docs/articles/eurlexpkg.html
diff --git a/docs/articles/eurlexpkg_files/figure-html/firstplot-1.png b/docs/articles/eurlexpkg_files/figure-html/firstplot-1.png
diff --git a/docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png b/docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png
diff --git a/docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png b/docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png