diff --git a/NEWS.md b/NEWS.md index a3fa0b8..2c107f5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,24 @@ +# eurlex 0.3.5 + +## Major changes + +- it is now possible to select all resource types available with `elx_make_query(resource_type = "any")`. Since there are nearly 1 million CELEX codes, use with discretion and expect long execution times +- results can be restricted to a particular directory code with `elx_make_query(directory = "18")` (directory code "18" denotes Common Foreign and Security Policy) +- results can be restricted to a particular sector with `elx_make_query(sector = 2)` (sector code 2 denotes EU international agreements) + +## Minor changes + +- new feature: request date of court case submission `elx_make_query(include_date_lodged = TRUE)` +- new feature: request type of court procedure and outcome `elx_make_query(include_court_procedure = TRUE)` +- new feature: request directory code of legal act `elx_make_query(include_directory = TRUE)` +- `elx_curia_list()` has a new default parameter `parse = TRUE` which creates separate columns for `ecli`, `see_case`, `appeal` applying regular expressions on `case_info` + # eurlex 0.3.4 ## Major changes - new feature: request citations referenced in target resource with elx_make_query(include_citations = TRUE); retrieved in CELEX form -- new feature: request document author(s) with elx_make_query(include_author = TRUE) +- new feature: request document author(s) with `elx_make_query(include_author = TRUE)` - XML parsing is now more efficient due to utilising (rather than stripping) namespaces (but still room for improvement) ## Minor changes diff --git a/R/elx_curia_list.R b/R/elx_curia_list.R index 6ec1e3c..0de6f4c 100644 --- a/R/elx_curia_list.R +++ b/R/elx_curia_list.R @@ -5,8 +5,10 @@ #' #' @param data Data to be scraped from four separate lists of cases maintained by Curia, defaults to "all" #' which contains cases from Court of Justice, General Court and Civil Service Tribunal. 
+#' @param parse If `TRUE`, references to cases and appeals are parsed out from `case_info` into separate columns #' @return -#' A data frame containing case identifiers and information as character columns. +#' A data frame containing case identifiers and information as character columns. Where the case id +#' contains a hyperlink to Eur-Lex, the CELEX identifier is retrieved as well. #' @importFrom rlang .data #' @export #' @examples @@ -14,7 +16,8 @@ #' elx_curia_list(data = "cst_all") #' } -elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all")){ +elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all"), + parse = TRUE){ url_c1 <- "https://curia.europa.eu/en/content/juris/c1_juris.htm" url_c2 <- "https://curia.europa.eu/en/content/juris/c2_juris.htm" @@ -37,7 +40,11 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all" res_t2 <- elx_curia_scraper(url_t2) res_f1 <- elx_curia_scraper(url_f1) - return(rbind(res_c1, res_c2, res_t2, res_f1)) + res_all <- rbind(res_c1, res_c2, res_t2, res_f1) + + if (parse == TRUE){res_all <- elx_curia_parse(res_all)} + + return(res_all) } @@ -45,6 +52,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all" res_c1 <- elx_curia_scraper(url_c1) + if (parse == TRUE){res_c1 <- elx_curia_parse(res_c1)} + return(res_c1) } @@ -53,6 +62,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all" res_c2 <- elx_curia_scraper(url_c2) + if (parse == TRUE){res_c2 <- elx_curia_parse(res_c2)} + return(res_c2) } @@ -61,6 +72,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all" res_t2 <- elx_curia_scraper(url_t2) + if (parse == TRUE){res_t2 <- elx_curia_parse(res_t2)} + return(res_t2) } @@ -69,6 +82,8 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all" res_f1 <- elx_curia_scraper(url_f1) + if (parse == TRUE){res_f1 <- elx_curia_parse(res_f1)} + 
return(res_f1) } @@ -76,15 +91,14 @@ elx_curia_list <- function(data = c("all","ecj_old","ecj_new","gc_all","cst_all" } - -#' Scraper function +#' Curia scraper function #' #' @importFrom rlang .data #' #' @noRd #' -elx_curia_scraper <- function(url){ +elx_curia_scraper <- function(url, ...){ page <- xml2::read_html(url) @@ -114,8 +128,38 @@ elx_curia_scraper <- function(url){ dplyr::mutate(n_id = dplyr::row_number()) %>% dplyr::ungroup() - return(dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>% - dplyr::select(-.data$n_id)) + out <- dplyr::left_join(tab, linked, by = c("case_id"="linked_id","n_id"="n_id")) %>% + dplyr::select(.data$case_id, .data$linked_celex, .data$case_info) %>% + dplyr::rename(case_id_celex = linked_celex) + + return(out) + +} + +#' Curia parser function +#' +#' @importFrom rlang .data +#' +#' @noRd +#' + +elx_curia_parse <- function(x, ...){ + + out <- x %>% + dplyr::mutate(ecli = stringr::str_extract(.data$case_info, "ECLI:EU:[:upper:]:[:digit:]{4}:[:digit:]+"), + see_case = stringr::str_extract(.data$case_info, "see Case .+") %>% + stringr::str_remove("see Case ") %>% + stringr::str_remove("APPEAL.*") %>% + stringr::str_squish() %>% + stringr::str_trim(), + appeal = stringr::str_extract(.data$case_info, "APPEAL.*") %>% + stringr::str_remove("APPEAL.? 
:") %>% + stringr::str_remove_all("\\;|\\,|\\.") %>% + stringr::str_squish() %>% + stringr::str_trim() + ) + + return(out) } diff --git a/R/elx_make_query.R b/R/elx_make_query.R index f4985e0..b72251b 100644 --- a/R/elx_make_query.R +++ b/R/elx_make_query.R @@ -8,6 +8,8 @@ #' #' @param resource_type Type of resource to be retrieved via SPARQL query #' @param manual_type Define manually the type of resource to be retrieved +#' @param directory Restrict the results to a given directory code +#' @param sector Restrict the results to a given sector code #' @param include_corrigenda If `TRUE`, results include corrigenda #' @param include_celex If `TRUE`, results include CELEX identifier for each resource URI #' @param include_date If `TRUE`, results include document date @@ -21,6 +23,8 @@ #' @param include_author If `TRUE`, results include document author(s) #' @param include_citations If `TRUE`, results include citations (CELEX-labelled) #' @param include_court_procedure If `TRUE`, results include type of court procedure and outcome +#' @param include_directory If `TRUE`, results include the Eur-Lex directory code +#' @param include_sector If `TRUE`, results include the Eur-Lex sector code #' @param order Order results by ids #' @param limit Limit the number of results, for testing purposes mainly #' @return @@ -32,15 +36,17 @@ #' elx_make_query(resource_type = "caselaw") #' elx_make_query(resource_type = "manual", manual_type = "SWD") -elx_make_query <- function(resource_type = c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl"), - manual_type = "", include_corrigenda = FALSE, include_celex = TRUE, include_lbs = FALSE, +elx_make_query <- function(resource_type = c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl","any"), + manual_type = "", directory = NULL, sector = NULL, + include_corrigenda = FALSE, include_celex = TRUE, include_lbs = FALSE, 
include_date = FALSE, include_date_force = FALSE, include_date_endvalid = FALSE, include_date_transpos = FALSE, include_date_lodged = FALSE, include_force = FALSE, include_eurovoc = FALSE, include_author = FALSE, include_citations = FALSE, include_court_procedure = FALSE, + include_directory = FALSE, include_sector = FALSE, order = FALSE, limit = NULL){ - if (!resource_type %in% c("directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl")) stop("'resource_type' must be defined") + if (!resource_type %in% c("any","directive","regulation","decision","recommendation","intagr","caselaw","manual","proposal","national_impl")) stop("'resource_type' must be defined") if (resource_type == "manual" & nchar(manual_type) < 2){ stop("Please specify resource type manually (e.g. 'DIR', 'REG', 'JUDG').", call. = TRUE) @@ -143,7 +149,54 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision" } - query <- paste(query, "where{ ?work cdm:work_has_resource-type ?type.", sep = " ") + if (include_directory == TRUE){ + + query <- paste(query, "?directory", sep = " ") + + } + + if (include_sector == TRUE){ + + query <- paste(query, "?sector", sep = " ") + + } + + if (resource_type == "any"){ + query <- paste(query, "where{", sep = " ") + } + + if (resource_type != "any"){ + query <- paste(query, "where{ ?work cdm:work_has_resource-type ?type.", sep = " ") + } + + if (!missing(directory)){ + if (!is.character(directory)) stop("Directory code must be of character type", call. = TRUE) + + query <- paste(query, " + VALUES (?value) + { () + () + } + {?work cdm:resource_legal_is_about_concept_directory-code ?value. + } + UNION + {?work cdm:resource_legal_is_about_concept_directory-code ?directory. + ?value skos:narrower+ ?directory. + } + ", sep = "") + + } + + if (!missing(sector)){ + if (!sector %in% 0:9) stop("Sector code must be an integer between 0 and 9", call. 
= TRUE) + + query <- paste(query, " + ?work cdm:resource_legal_id_sector ?sector. + FILTER(str(?sector)='", sector, "') + ", + sep = "") + + } if (resource_type == "directive"){ query <- paste(query, "FILTER(?type=|| @@ -178,6 +231,9 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision" ?type=|| ?type=|| ?type=|| + ?type=|| + ?type=|| + ?type=|| ?type=)", sep = " ") } @@ -347,6 +403,18 @@ elx_make_query <- function(resource_type = c("directive","regulation","decision" } + if (include_directory == TRUE){ + + query <- paste(query, "OPTIONAL{?work cdm:resource_legal_is_about_concept_directory-code ?directory.}") + + } + + if (include_sector == TRUE){ + + query <- paste(query, "OPTIONAL{?work cdm:resource_legal_id_sector ?sector.}") + + } + if (order == TRUE){ query <- paste(query, "} order by str(?date)") } else {query <- paste(query, "}")} diff --git a/R/elx_parse_xml.R b/R/elx_parse_xml.R index 72a05ef..815e273 100644 --- a/R/elx_parse_xml.R +++ b/R/elx_parse_xml.R @@ -22,11 +22,11 @@ elx_parse_xml <- function(sparql_response = ""){ out <- dplyr::tibble(res_cols, res_text) %>% dplyr::mutate(is_work = dplyr::if_else(res_cols=="eurovoc", T, NA)) %>% - dplyr::group_by(is_work) %>% + dplyr::group_by(.data$is_work) %>% dplyr::mutate(triplet = dplyr::row_number(), - triplet = dplyr::if_else(is_work==T, triplet, NA_integer_)) %>% + triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>% dplyr::ungroup() %>% - tidyr::fill(triplet) %>% + tidyr::fill(.data$triplet) %>% dplyr::select(-.data$is_work) %>% tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>% dplyr::select(-.data$triplet) @@ -35,11 +35,11 @@ elx_parse_xml <- function(sparql_response = ""){ out <- dplyr::tibble(res_cols, res_text) %>% dplyr::mutate(is_work = dplyr::if_else(res_cols=="work", T, NA)) %>% - dplyr::group_by(is_work) %>% + dplyr::group_by(.data$is_work) %>% dplyr::mutate(triplet = dplyr::row_number(), - triplet = 
dplyr::if_else(is_work==T, triplet, NA_integer_)) %>% + triplet = dplyr::if_else(.data$is_work==T, .data$triplet, NA_integer_)) %>% dplyr::ungroup() %>% - tidyr::fill(triplet) %>% + tidyr::fill(.data$triplet) %>% dplyr::select(-.data$is_work) %>% tidyr::pivot_wider(names_from = res_cols, values_from = res_text) %>% dplyr::select(-.data$triplet) diff --git a/README.md b/README.md index 9b07014..d384a33 100644 --- a/README.md +++ b/README.md @@ -48,4 +48,8 @@ List of resource types in Cellar (NAL): http://publications.europa.eu/resource/a NAL of corporate bodies: http://publications.europa.eu/resource/authority/corporate-body -Indexation of data in Cellar: http://publications.europa.eu/resource/cellar/4874abcd-286a-11e8-b5fe-01aa75ed71a1.0001.03/DOC_1 +Query builder: +https://op.europa.eu/en/advanced-sparql-query-editor + +SPARQL endpoint: +http://publications.europa.eu/webapi/rdf/sparql diff --git a/docs/articles/eurlexpkg.html b/docs/articles/eurlexpkg.html index 8a9fee5..5739615 100644 --- a/docs/articles/eurlexpkg.html +++ b/docs/articles/eurlexpkg.html @@ -217,14 +217,14 @@

# elx_make_query("directive") %>% # elx_run_query()
as_tibble(results)
-#> # A tibble: 4,303 x 3
+#> # A tibble: 4,316 x 3
 #>   work                                   type                            celex  
 #>   <chr>                                  <chr>                           <chr>  
 #> 1 http://publications.europa.eu/resourc~ http://publications.europa.eu/~ 31979L~
 #> 2 http://publications.europa.eu/resourc~ http://publications.europa.eu/~ 31989L~
 #> 3 http://publications.europa.eu/resourc~ http://publications.europa.eu/~ 31984L~
 #> 4 http://publications.europa.eu/resourc~ http://publications.europa.eu/~ 31966L~
-#> # ... with 4,299 more rows
+#> # ... with 4,312 more rows

The function outputs a data.frame where each column corresponds to one of the requested variables, while the rows accumulate observations of the resource type satisfying the query criteria. Obviously, the more data is to be returned, the longer the execution time, varying from a few seconds to several minutes, depending also on your connection.

The first column always contains the unique URI of a “work” (legislative act or court judgment) which identifies each resource in Cellar. Several human-readable identifiers are normally associated with each “work” but the most useful one is CELEX, retrieved by default.2

One column you should always pay attention to is type (as in resource_type). The URIs contained there reflect the FILTER argument in the SPARQL query, which is manually pre-specified. All resources are indexed as being of one type or another. For example, when retrieving directives, the results are going to return also delegated directives, which might not be desirable, depending on your needs. You can filter results by type to make the necessary adjustments. The queries are expansive by default in the spirit of erring on the side of over-inclusiveness rather than vice versa.

@@ -263,8 +263,7 @@

#> 4 31996H0592 http://eurovoc.europa.eu/1076 #> # ... with 6 more rows

By default, the endpoint returns the EuroVoc concept codes rather than the labels (keywords). The function elx_label_eurovoc() needs to be called to obtain a look-up table with the labels.

-
-eurovoc_lookup <- elx_label_eurovoc(uri_eurovoc = rec_eurovoc$eurovoc)
+
eurovoc_lookup <- elx_label_eurovoc(uri_eurovoc = rec_eurovoc$eurovoc)
 
 print(eurovoc_lookup)
 #> # A tibble: 9 x 2
@@ -276,8 +275,7 @@ 

#> 4 http://eurovoc.europa.eu/1318 Germany #> # ... with 5 more rows

The results include labels only for unique identifiers, but with dplyr::left_join() it is straightforward to append the labels to the entire dataset.

-
-rec_eurovoc %>%
+
rec_eurovoc %>%
   left_join(eurovoc_lookup)
 #> Joining, by = "eurovoc"
 #> # A tibble: 10 x 5
@@ -367,14 +365,14 @@ 

as_tibble() print(dirs_1970_title) -#> # A tibble: 78 x 6 +#> # A tibble: 70 x 6 #> work type celex date force title #> <chr> <chr> <chr> <chr> <chr> <chr> #> 1 http://publications~ http://publicatio~ 31975~ 1975~ true Council Directive ~ #> 2 http://publications~ http://publicatio~ 31977~ 1977~ true First Commission D~ #> 3 http://publications~ http://publicatio~ 31977~ 1977~ true Council Directive ~ #> 4 http://publications~ http://publicatio~ 31973~ 1973~ true Council Directive ~ -#> # ... with 74 more rows

+#> # ... with 66 more rows

I will use the tidytext package to get a quick idea of what the legislation is about.

library(tidytext)
 library(wordcloud)
diff --git a/docs/articles/eurlexpkg_files/figure-html/firstplot-1.png b/docs/articles/eurlexpkg_files/figure-html/firstplot-1.png
index c4c9ff6..8cd02e0 100644
Binary files a/docs/articles/eurlexpkg_files/figure-html/firstplot-1.png and b/docs/articles/eurlexpkg_files/figure-html/firstplot-1.png differ
diff --git a/docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png b/docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png
index 285c6bc..631076f 100644
Binary files a/docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png and b/docs/articles/eurlexpkg_files/figure-html/unnamed-chunk-8-1.png differ
diff --git a/docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png b/docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png
index ae68d97..10f938c 100644
Binary files a/docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png and b/docs/articles/eurlexpkg_files/figure-html/wordcloud-1.png differ
diff --git a/docs/authors.html b/docs/authors.html
index a18139d..b8fc41e 100644
--- a/docs/authors.html
+++ b/docs/authors.html
@@ -129,13 +129,19 @@ 

Citation

-

Ovadek M (2020). -eurlex: An R package for retrieving official data on European Union law. +

Ovadek M (2021). +“Facilitating access to data on European Union laws.” +Political Research Exchange, 3. +doi: 10.1080/2474736X.2020.1870150.

-
@Manual{,
-  title = {eurlex: An R package for retrieving official data on European Union law},
+    
@Article{,
+  title = {Facilitating access to data on European Union laws},
   author = {Michal Ovadek},
-  year = {2020},
+  year = {2021},
+  journal = {Political Research Exchange},
+  volume = {3},
+  issue = {1},
+  doi = {10.1080/2474736X.2020.1870150},
 }

  • improvement to legal basis harvesting thanks to help from Eur-Lex insiders
  • legal basis results are now slightly more comprehensive and correct
  • legal basis results now include a new column detailing the “suffix” (paragraph, subparagraph, etc.) in string form
-
+

-Minor changes

+Minor changes
  • minor updates to documentation
@@ -187,9 +217,9 @@

eurlex 0.3.1 2020-09-11

-
+

-Minor changes

+Minor changes
  • elx_fetch_data() now prefers CELEX-based URLs (instead of Cellar URIs) as input, as they appear to yield fewer missing documents
  • @@ -200,9 +230,9 @@

    eurlex 0.3.0 Unreleased

    -
    +

    -Major changes

    +Major changes
    • elx_fetch_data("text") now retrieves plain text from html, pdf and MS Word documents
    • @@ -216,9 +246,9 @@

      eurlex 0.2.3 Unreleased

      -
      +

      -Minor changes

      +Minor changes
      • fixed serious bugs in elx_curia_list()
      • @@ -231,9 +261,9 @@

        eurlex 0.2.2 Unreleased

        -
        +

        -Major changes

        +Major changes
        • elx_council_votes() made fully operational
        • @@ -244,9 +274,9 @@

          eurlex 0.2.1 2020-08-19

          -
          +

          -Minor changes

          +Minor changes
          • optimization, reducing dependencies, etc.
          @@ -256,9 +286,9 @@

          eurlex 0.2.0 Unreleased

          -
          +

          -Major changes

          +Major changes
          • addition of proposals and national implementing laws to possible SPARQL queries
          • EuroVoc topics, retrievable in all EU languages, can now be included in SPARQL results
          • @@ -266,9 +296,9 @@

          • added elx_curia_list() to retrieve full list of EU court cases
          -
          +

          -Minor changes

          +Minor changes
          • switch from XML to xml2
          • SPARQL package dependency removed
          • diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index a333b91..26598c4 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -3,5 +3,5 @@ pkgdown: 1.5.1 pkgdown_sha: ~ articles: eurlexpkg: eurlexpkg.html -last_built: 2020-11-07T00:50Z +last_built: 2021-03-08T23:15Z diff --git a/docs/reference/elx_curia_list.html b/docs/reference/elx_curia_list.html index 25d38fb..67f64ed 100644 --- a/docs/reference/elx_curia_list.html +++ b/docs/reference/elx_curia_list.html @@ -155,21 +155,21 @@

            Value

            Examples

            # \donttest{ -elx_curia_list(data = "cst_all")
            #> # A tibble: 1,759 x 3 -#> case_id case_info linked_celex -#> <chr> <chr> <chr> -#> 1 F-1/05 * Judgment of 26 October 2006, Landgren / ETF (F-1/05, ~ NA -#> 2 F-1/05 Order of 22 May 2007, Landgren / ETF (F-1/05, ECR-SC ~ NA -#> 3 F-1/05 I~ Order of 13 July 2007, Landgren / ETF (F-1/05 INT, EC~ NA -#> 4 F-1/05 Order of 9 November 2010, Landgren / ETF (F-1/05, unp~ NA -#> 5 F-2/05 Removed from the register on 18 June 2008, Kröppelin ~ NA -#> 6 F-3/05 Order of 15 May 2006, Schmit / Commission (F-3/05, EC~ NA -#> 7 F-4/05 Removed from the register on 18 June 2008, Huober / C~ NA -#> 8 F-5/05 * Judgment of 28 April 2009, Violetti and others / Comm~ NA -#> 9 F-6/05 Removed from the register on 18 June 2008, Kröppelin ~ NA -#> 10 F-7/05 Schmit / Commission (F-7/05) , see Case F-5/05 NA -#> # ... with 1,749 more rows
            # } -
            +elx_curia_list(data = "cst_all")
          #> # A tibble: 1,759 x 6 +#> case_id case_id_celex case_info ecli see_case appeal +#> <chr> <chr> <chr> <chr> <chr> <chr> +#> 1 F-1/05 * <NA> Judgment of 26 October 2006~ ECLI:EU:~ <NA> T-404~ +#> 2 F-1/05 <NA> Order of 22 May 2007, Landg~ ECLI:EU:~ <NA> <NA> +#> 3 F-1/05 ~ <NA> Order of 13 July 2007, Land~ ECLI:EU:~ <NA> <NA> +#> 4 F-1/05 <NA> Order of 9 November 2010, L~ ECLI:EU:~ <NA> <NA> +#> 5 F-2/05 <NA> Removed from the register o~ ECLI:EU:~ <NA> <NA> +#> 6 F-3/05 <NA> Order of 15 May 2006, Schmi~ ECLI:EU:~ <NA> <NA> +#> 7 F-4/05 <NA> Removed from the register o~ ECLI:EU:~ <NA> <NA> +#> 8 F-5/05 * <NA> Judgment of 28 April 2009, ~ ECLI:EU:~ <NA> T-261~ +#> 9 F-6/05 <NA> Removed from the register o~ ECLI:EU:~ <NA> <NA> +#> 10 F-7/05 <NA> Schmit / Commission (F-7/05~ <NA> F-5/05 <NA> +#> # ... with 1,749 more rows
          # } +
          # } +