Skip to content

Commit

Permalink
Rewrite resource map updating routines to fix various issues
Browse files Browse the repository at this point in the history
This commit addresses the cause of #51

The previous version of `parse_resource_map` used a SPARQL
query to pull the triples out of the resource map being updated
but didn't actually parse the result in an RDF-aware manner.
This lead to " characters ending up in various places in
the RDF/XML which causes tons of issues.

The new routine simply uses datapack::parseRDF instead and
just filters out cito:documents, cito:isDocumentedBy, dcterms:identifier, and the DataONE R Client statement.
  • Loading branch information
amoeba committed Nov 22, 2017
1 parent 202f377 commit 12734d8
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 95 deletions.
89 changes: 12 additions & 77 deletions R/packaging.R
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,10 @@ generate_resource_map <- function(metadata_pid,

message("Adding ", nrow(other_statements), " custom statement(s) to the Resource Map.")

# Add an NA dataTypeURI to all the statements so the subsequent rbind works
# This is fine because they're all URIs and they don't need a datType
relationships$dataTypeURI <- NA

relationships <- rbind(relationships,
other_statements)
}
Expand Down Expand Up @@ -1037,65 +1041,21 @@ update_package <- function(inventory,
parse_resource_map <- function(path) {
stopifnot(file.exists(path))

world <- new("World")
storage <- new("Storage",
world,
"hashes",
name = "",
options = "hash-type='memory'")
model <- new("Model", world, storage, options = "")
parser <- new("Parser", world)

redland::parseFileIntoModel(parser, world, path, model)

query <- new("Query",
world,
"select ?s ?p ?o where { ?s ?p ?o }",
base_uri = NULL,
query_language = "sparql",
query_uri = NULL)

queryResult <- redland::executeQuery(query, model)

statements <- data.frame()

while(!is.null(result <- redland::getNextResult(queryResult))) {
statements <- rbind(statements,
data.frame(subject = result$s,
predicate = result$p,
object = result$o,
stringsAsFactors = FALSE))
}

# Remove < and > around URIs. We do this because redland needs them to be
# without those characters or it complains about being unable to convert into
# a qname
statements$subject <- stringr::str_replace_all(statements$subject, "^[<]", "")
statements$predicate <- stringr::str_replace_all(statements$predicate, "^[<]", "")
statements$object <- stringr::str_replace_all(statements$object, "^[<]", "")
statements$subject <- stringr::str_replace_all(statements$subject, "[>]$", "")
statements$predicate <- stringr::str_replace_all(statements$predicate, "[>]$", "")
statements$object <- stringr::str_replace_all(statements$object, "[>]$", "")

statements
rm <- new("ResourceMap")
datapack::parseRDF(rm, path)
datapack:::getTriples(rm)
}


#' Filter statements related to packaging
#'
#' This is intended to be called after `datapack::getTriples` has been called
#' on a ResourceMap.
#'
#' This function was written specifically for the case of updating a resource
#' map while preserving any extra statements that have been added such as PROV
#' statements. Statements are filtered according to these rules:
#'
#' 1. If the subject or object is the ore:ResourceMap resource
#' 2. If the subject or object is the ore:Aggregation resource
#' 3. If the predicate is cito:documents or cito:isDocumentedBy
#' 4. Once filters 1-3 have been executed, any remaining triples are considered
#' for removal if they look like dangling dc:identifier statements
#'
#' The consequence of filter 4 is that dc:identifier statements are left in if
#' they are still in use by another statement
#'
#' @param statements (data.frame) A set of Statements to be filtered
#'
#' @return (data.frame) The filtered Statements
Expand All @@ -1106,36 +1066,11 @@ filter_packaging_statements <- function(statements) {
stopifnot(is.data.frame(statements))
if (nrow(statements) == 0) return(statements)

# Collect URIs we're going to use to filter by
resource_map_uri <- statements[grepl("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", statements$predicate) & grepl("http://www.openarchives.org/ore/terms/ResourceMap", statements$object),"subject"]
aggregation_uri <- statements[grepl("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", statements$predicate) & grepl("http://www.openarchives.org/ore/terms/Aggregation", statements$object),"subject"]

# Filter statements by subject
statements <- statements[!(statements$subject %in% c(resource_map_uri, aggregation_uri)),]

# Filter statements by object
statements <- statements[!(statements$object %in% c(resource_map_uri, aggregation_uri)),]

# Filter cito:documents / cito:isDocumentedBy statements
statements <- statements[!(statements$predicate == "http://purl.org/spar/cito/documents"),]
statements <- statements[!(statements$predicate == "http://purl.org/spar/cito/isDocumentedBy"),]

# If this is a simple package without extra statements, then we should just be
# left with some dc:identifier statements left over. Here we try to detect
# that case by collecting the unique subjects taking part in dc:identifier
# statements and filtering statements about subjects with only one statement
# about them

dc_identifiers <- unique(statements[statements$predicate == "http://purl.org/dc/terms/identifier", "subject"])

for (identifier in dc_identifiers) {
if (nrow(statements[statements$subject == identifier | statements$object == identifier,]) == 1) {
statements <- statements[!(statements$subject == identifier | statements$object == identifier),]
}
}

# Remove <NA> introduced by the second filter statement
statements <- statements[complete.cases(statements),]
statements <- statements[!(statements$predicate == "http://purl.org/dc/terms/identifier"),]
statements <- statements[!((statements$predicate == "http://xmlns.com/foaf/0.1/name" & statements$object == "DataONE R Client")),]

statements
}
Expand Down
40 changes: 22 additions & 18 deletions tests/testthat/test_editing.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ test_that("we can publish an update", {
update <- publish_update(mn,
package$metadata,
package$resource_map,
package$data)
package$data,
check_first = FALSE)

expect_named(update, c("metadata", "resource_map", "data"))
expect_true(all(object_exists(mn, unlist(update))))
Expand All @@ -36,7 +37,8 @@ test_that("an identifier can be manually specified when publishing an update", {
package$metadata,
package$resource_map,
package$data,
identifier = new_identifier)
identifier = new_identifier,
check_first = FALSE)

expect_equal(update$metadata, new_identifier)
})
Expand All @@ -53,7 +55,7 @@ test_that("we can create a resource map", {
response <- create_resource_map(mn, metadata_pid, data_pid)

expect_true(object_exists(mn, response))
expect_equal(response, get_package(mn, metadata_pid)$resource_map)
expect_equal(response, get_package(mn, response)$resource_map)

})

Expand Down Expand Up @@ -121,7 +123,7 @@ test_that("SIDs are maintained when publishing an update to an object with a SID
sid = new_sid)
resmap_pid <- create_resource_map(mn, metadata_pid = pid)

response <- publish_update(mn, pid, resmap_pid)
response <- publish_update(mn, pid, resmap_pid, check_first = FALSE)

sysmeta <- getSystemMetadata(mn, response$metadata)
expect_equal(sysmeta@seriesId, new_sid)
Expand All @@ -143,7 +145,10 @@ test_that("we can publish an update to an object", {
csv <- data.frame(x = 1:50)
write.csv(csv, tmp)

upd <- update_object(mn, old, tmp)
suppressWarnings({
upd <- update_object(mn, old, tmp)
})

file.remove(tmp)
sm <- dataone::getSystemMetadata(mn, upd)

Expand Down Expand Up @@ -181,27 +186,26 @@ test_that("extra statements are maintained between updates", {
# Add some PROV triples to the Resource Map
rm <- tempfile()
writeLines(rawToChar(dataone::getObject(mn, pkg$resource_map)), rm)
# statements <- data.frame(subject = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[1], reserved = TRUE)),
# predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
# object = "http://www.w3.org/ns/prov#Entity")
#
# statements <- rbind(statements,
# data.frame(subject = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[2], reserved = TRUE)),
# predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
# object = "http://www.w3.org/ns/prov#Entity"))

statements <- data.frame(subject = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[1], reserved = TRUE)),
predicate = "http://www.w3.org/ns/prov#wasDerivedFrom",
object = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[2], reserved = TRUE)))

new_rm <- update_resource_map(mn, pkg$resource_map, pkg$metadata, pkg$data, other_statements = statements, public = TRUE)
object = paste0("https://cn.dataone.org/cn/v2/resolve/", URLencode(pkg$data[2], reserved = TRUE)),
subjectType = "uri",
objectType = "uri",
dataTypeURI = NA)

new_rm <- update_resource_map(mn,
pkg$resource_map,
pkg$metadata,
pkg$data,
other_statements = statements,
public = TRUE)

rm <- tempfile()
writeLines(rawToChar(dataone::getObject(mn, new_rm)), rm)
statements <- parse_resource_map(rm)
expect_true("http://www.w3.org/ns/prov#wasDerivedFrom" %in% statements$predicate)


new_new_rm <- update_resource_map(mn, new_rm, pkg$metadata, pkg$data, public = TRUE)
rm <- tempfile()
writeLines(rawToChar(dataone::getObject(mn, new_new_rm)), rm)
Expand All @@ -220,7 +224,7 @@ test_that("rightsholder is properly set back after publishing an update", {
set_result <- set_rights_holder(mn, unlist(pkg), "CN=arctic-data-admins,DC=dataone,DC=org")
expect_true(all(set_result))

new_pkg <- publish_update(mn, pkg$metadata, pkg$resource_map, pkg$data)
new_pkg <- publish_update(mn, pkg$metadata, pkg$resource_map, pkg$data, check_first = FALSE)
rhs <- lapply(unlist(pkg), function(pid) {
dataone::getSystemMetadata(mn, pid)@rightsHolder
})
Expand Down

0 comments on commit 12734d8

Please sign in to comment.