From 12734d8b9bc12b7ff5bd530200ec96e38be6a12e Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 22 Nov 2017 11:36:30 -0900 Subject: [PATCH] Rewrite resource map updating routines to fix various issues This commit addresses the cause of #51 The previous version of `parse_resource_map` used a SPARQL query to pull the triples out of the resource map being updated but didn't actually parse the result in an RDF-aware manner. This lead to " characters ending up in various places in the RDF/XML which causes tons of issues. The new routine simply uses datapack::parseRDF instead and just filters out cito:documents, cito:isDocumentedBy, dcterms:identifier, and the DataONE R Client statement. --- R/packaging.R | 89 +++++------------------------------ tests/testthat/test_editing.R | 40 +++++++++------- 2 files changed, 34 insertions(+), 95 deletions(-) diff --git a/R/packaging.R b/R/packaging.R index 7002d33..9dec754 100644 --- a/R/packaging.R +++ b/R/packaging.R @@ -448,6 +448,10 @@ generate_resource_map <- function(metadata_pid, message("Adding ", nrow(other_statements), " custom statement(s) to the Resource Map.") + # Add an NA dataTypeURI to all the statements so the subsequent rbind works + # This is fine because they're all URIs and they don't need a datType + relationships$dataTypeURI <- NA + relationships <- rbind(relationships, other_statements) } @@ -1037,65 +1041,21 @@ update_package <- function(inventory, parse_resource_map <- function(path) { stopifnot(file.exists(path)) - world <- new("World") - storage <- new("Storage", - world, - "hashes", - name = "", - options = "hash-type='memory'") - model <- new("Model", world, storage, options = "") - parser <- new("Parser", world) - - redland::parseFileIntoModel(parser, world, path, model) - - query <- new("Query", - world, - "select ?s ?p ?o where { ?s ?p ?o }", - base_uri = NULL, - query_language = "sparql", - query_uri = NULL) - - queryResult <- redland::executeQuery(query, model) - - statements <- data.frame() - - while(!is.null(result <- redland::getNextResult(queryResult))) { - statements <- rbind(statements, - data.frame(subject = result$s, - predicate = result$p, - object = result$o, - stringsAsFactors = FALSE)) - } - - # Remove < and > around URIs. We do this because redland needs them to be - # without those characters or it complains about being unable to convert into - # a qname - statements$subject <- stringr::str_replace_all(statements$subject, "^[<]", "") - statements$predicate <- stringr::str_replace_all(statements$predicate, "^[<]", "") - statements$object <- stringr::str_replace_all(statements$object, "^[<]", "") - statements$subject <- stringr::str_replace_all(statements$subject, "[>]$", "") - statements$predicate <- stringr::str_replace_all(statements$predicate, "[>]$", "") - statements$object <- stringr::str_replace_all(statements$object, "[>]$", "") - - statements + rm <- new("ResourceMap") + datapack::parseRDF(rm, path) + datapack:::getTriples(rm) } #' Filter statements related to packaging #' +#' This is intended to be called after `datapack::getTriples` has been called +#' on a ResourceMap. +#' #' This function was written specifically for the case of updating a resource #' map while preserving any extra statements that have been added such as PROV #' statements. Statements are filtered according to these rules: #' -#' 1. If the subject or object is the ore:ResourceMap resource -#' 2. If the subject or object is the ore:Aggregation resource -#' 3. If the predicate is cito:documents or cito:isDocumentedBy -#' 4. Once filters 1-3 have been executed, any remaining triples are considered -#' for removal if they look like dangling dc:identifier statements -#' -#' The consequence of filter 4 is that dc:identifier statements are left in if -#' they are still in use by another statement -#' #' @param statements (data.frame) A set of Statements to be filtered #' #' @return (data.frame) The filtered Statements @@ -1106,36 +1066,11 @@ filter_packaging_statements <- function(statements) { stopifnot(is.data.frame(statements)) if (nrow(statements) == 0) return(statements) - # Collect URIs we're going to use to filter by - resource_map_uri <- statements[grepl("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", statements$predicate) & grepl("http://www.openarchives.org/ore/terms/ResourceMap", statements$object),"subject"] - aggregation_uri <- statements[grepl("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", statements$predicate) & grepl("http://www.openarchives.org/ore/terms/Aggregation", statements$object),"subject"] - - # Filter statements by subject - statements <- statements[!(statements$subject %in% c(resource_map_uri, aggregation_uri)),] - - # Filter statements by object - statements <- statements[!(statements$object %in% c(resource_map_uri, aggregation_uri)),] - # Filter cito:documents / cito:isDocumentedBy statements statements <- statements[!(statements$predicate == "http://purl.org/spar/cito/documents"),] statements <- statements[!(statements$predicate == "http://purl.org/spar/cito/isDocumentedBy"),] - - # If this is a simple package without extra statements, then we should just be - # left with some dc:identifier statements left over. Here we try to detect - # that case by collecting the unique subjects taking part in dc:identifier - # statements and filtering statements about subjects with only one statement - # about them - - dc_identifiers <- unique(statements[statements$predicate == "http://purl.org/dc/terms/identifier", "subject"]) - - for (identifier in dc_identifiers) { - if (nrow(statements[statements$subject == identifier | statements$object == identifier,]) == 1) { - statements <- statements[!(statements$subject == identifier | statements$object == identifier),] - } - } - - # Remove introduced by the second filter statement - statements <- statements[complete.cases(statements),] + statements <- statements[!(statements$predicate == "http://purl.org/dc/terms/identifier"),] + statements <- statements[!((statements$predicate == "http://xmlns.com/foaf/0.1/name" & statements$object == "DataONE R Client")),] statements } diff --git a/tests/testthat/test_editing.R b/tests/testthat/test_editing.R index a088262..0d607c3 100644 --- a/tests/testthat/test_editing.R +++ b/tests/testthat/test_editing.R @@ -15,7 +15,8 @@ test_that("we can publish an update", { update <- publish_update(mn, package$metadata, package$resource_map, - package$data) + package$data, + check_first = FALSE) expect_named(update, c("metadata", "resource_map", "data")) expect_true(all(object_exists(mn, unlist(update)))) @@ -36,7 +37,8 @@ test_that("an identifier can be manually specified when publishing an update", { package$metadata, package$resource_map, package$data, - identifier = new_identifier) + identifier = new_identifier, + check_first = FALSE) expect_equal(update$metadata, new_identifier) }) @@ -53,7 +55,7 @@ test_that("we can create a resource map", { response <- create_resource_map(mn, metadata_pid, data_pid) expect_true(object_exists(mn, response)) - expect_equal(response, get_package(mn, metadata_pid)$resource_map) + expect_equal(response, get_package(mn, response)$resource_map) }) @@ -121,7 +123,7 @@ test_that("SIDs are maintained when publishing an update to an object with a SID sid = new_sid) resmap_pid <- create_resource_map(mn, metadata_pid = pid) - response <- publish_update(mn, pid, resmap_pid) + response <- publish_update(mn, pid, resmap_pid, check_first = FALSE) sysmeta <- getSystemMetadata(mn, response$metadata) expect_equal(sysmeta@seriesId, new_sid) @@ -143,7 +145,10 @@ test_that("we can publish an update to an object", { csv <- data.frame(x = 1:50) write.csv(csv, tmp) - upd <- update_object(mn, old, tmp) + suppressWarnings({ + upd <- update_object(mn, old, tmp) + }) + file.remove(tmp) sm <- dataone::getSystemMetadata(mn, upd) @@ -181,27 +186,26 @@ test_that("extra statements are maintained between updates", { # Add some PROV triples to the Resource Map rm <- tempfile() writeLines(rawToChar(dataone::getObject(mn, pkg$resource_map)), rm) - # statements <- data.frame(subject = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[1], reserved = TRUE)), - # predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", - # object = "http://www.w3.org/ns/prov#Entity") - # - # statements <- rbind(statements, - # data.frame(subject = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[2], reserved = TRUE)), - # predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", - # object = "http://www.w3.org/ns/prov#Entity")) statements <- data.frame(subject = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[1], reserved = TRUE)), predicate = "http://www.w3.org/ns/prov#wasDerivedFrom", - object = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[2], reserved = TRUE))) - - new_rm <- update_resource_map(mn, pkg$resource_map, pkg$metadata, pkg$data, other_statements = statements, public = TRUE) + object = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[2], reserved = TRUE)), + subjectType = "uri", + objectType = "uri", + dataTypeURI = NA) + + new_rm <- update_resource_map(mn, + pkg$resource_map, + pkg$metadata, + pkg$data, + other_statements = statements, + public = TRUE) rm <- tempfile() writeLines(rawToChar(dataone::getObject(mn, new_rm)), rm) statements <- parse_resource_map(rm) expect_true("http://www.w3.org/ns/prov#wasDerivedFrom" %in% statements$predicate) - new_new_rm <- update_resource_map(mn, new_rm, pkg$metadata, pkg$data, public = TRUE) rm <- tempfile() writeLines(rawToChar(dataone::getObject(mn, new_new_rm)), rm) @@ -220,7 +224,7 @@ test_that("rightsholder is properly set back after publishing an update", { set_result <- set_rights_holder(mn, unlist(pkg), "CN=arctic-data-admins,DC=dataone,DC=org") expect_true(all(set_result)) - new_pkg <- publish_update(mn, pkg$metadata, pkg$resource_map, pkg$data) + new_pkg <- publish_update(mn, pkg$metadata, pkg$resource_map, pkg$data, check_first = FALSE) rhs <- lapply(unlist(pkg), function(pid) { dataone::getSystemMetadata(mn, pid)@rightsHolder })