diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..979ed80 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +.gitignore +.Rbuildignore +*.Rproj \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index e990bc8..3a79150 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,13 +8,15 @@ Authors@R: c(person("Vasilis", "Chasiotis", role = "aut", comment="Department of person("Martin", "Karlberg", role = "aut"), person("Mátyás", "Mészáros", email = "matyas.meszaros@ec.europa.eu", role = "cre"), person("Martina", "Patone", role = "aut"), - person("Erkand", "Muraku", role = "aut")) + person("Erkand", "Muraku", role = "aut"), + person("Clement", "Thomas", role = "aut"), + person("Loic", "Bienvenue", role = "aut")) Description: A candidate correspondence table between two classifications can be created when there are correspondence tables leading from the first classification to the second one via intermediate 'pivot' classifications. The correspondence table between two statistical classifications can be updated when one of the classifications gets updated to a new version. License: EUPL Encoding: UTF-8 -Imports: data.table, httr +Imports: data.table, httr, tidyverse, writexl Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 1830102..d41834b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,20 @@ # Generated by roxygen2: do not edit by hand -export(classEndpoint) +export(classificationEndpoint) +export(classificationQC) +export(correctionClassification) export(correspondenceList) +export(dataStructure) +export(lengthsFile) export(newCorrespondenceTable) export(prefixList) export(retrieveClassificationTable) export(retrieveCorrespondenceTable) export(structureData) export(updateCorrespondenceTable) +import(httr) +import(tidyverse) +import(writexl) importFrom(data.table,fread) importFrom(data.table,fwrite) importFrom(httr,POST) diff --git a/R/classificationEndpoint.R b/R/classificationEndpoint.R new file mode 100644 index 0000000..76ba32c --- /dev/null +++ b/R/classificationEndpoint.R @@ -0,0 +1,97 @@ +#' @title Retrieve a list of classification tables from CELLAR and FAO repositories or both. +#' @description The purpose of this function is to provide a comprehensive summary +#' of the data structure for each classification in CELLAR and FAO endpoint. +#' The summary includes information such as the prefix name, URI, key, concept scheme, and title associated with each classification. +#' @param endpoint SPARQL endpoints provide a standardized way to access data sets, +#' making it easier to retrieve specific information or perform complex queries on linked data. This is an optional +#' parameter, which by default is set to \code{"ALL"}. +#' The valid values are \code{"CELLAR"}, \code{"FAO"} and \code{"ALL"} for both endpoints. +#' @import httr +#' @export +#' @return +#' \code{classificationEndpoint()} returns a table with information needed to retrieve the classification table: +#' \itemize{ +#' \item Prefix name: the SPARQL instruction for a declaration of a namespace prefix +#' \item Conceptscheme: taxonomy of the SKOS object to be retrieved +#' \item URI: the URL from which the SPARQL query was retrieved +#' \item Name: the name of the table retrieved +#' } +#' @examples +#' { +#' endpoint = "ALL" +#' list_data = classificationEndpoint(endpoint) +#' } + +classificationEndpoint = function(endpoint = "ALL") { + + ### Datasets in CELLAR + endpoint_cellar = "http://publications.europa.eu/webapi/rdf/sparql" + + SPARQL.query_cellar = paste0(" + SELECT DISTINCT ?s ?Title + WHERE { ?s a skos:ConceptScheme ; + skos:prefLabel ?Title ; + ?p + FILTER (LANG(?Title) = 'en')} + ORDER BY ?Title + ") + + response = POST(url = endpoint_cellar, accept("text/csv"), body = list(query = SPARQL.query_cellar), encode = "form") + data_cellar = read.csv(text=content(response, "text"), sep= ",") + + ## add prefix name + str_dt = t(sapply(data_cellar[,1], function(x) unlist(strsplit(as.character(x), "/+")))) + uri = paste0(str_dt[,1],"/", "/", str_dt[,2],"/",str_dt[,3],"/",str_dt[,4] ) + prefix = str_dt[,4] + prefix = gsub("\\.","",prefix) + #key = str_dt[,4] + conceptscheme = str_dt[,5] + title = data_cellar[,2] + data_cellar = cbind(prefix, conceptscheme, uri, title) + rownames(data_cellar) = 1:nrow(data_cellar) + colnames(data_cellar) = c("Prefix", "ConceptScheme", "URI", "Title") + + ### Datasets in FAO + endpoint_fao = "https://stats.fao.org/caliper/sparql/AllVocs" + SPARQL.query_fao = paste0(" + PREFIX skos: + SELECT DISTINCT ?classification ?label + + WHERE { + ?classification a skos:ConceptScheme . + ?classification skos:prefLabel ?label . + FILTER(regex(?label, 'classification', 'i')) + } + ORDER BY ?label + ") + + response = httr::POST(url = endpoint_fao, accept("text/csv"), body = list(query = SPARQL.query_fao), encode = "form") + data_fao = read.csv(text=content(response, "text"), sep= ",") + + ## add prefix name + str_dt = t(sapply(data_fao[,1], function(x) unlist(strsplit(as.character(x), "/+")))) + prefix = paste0(str_dt[,4], str_dt[,5]) + prefix = gsub("\\.","",prefix) + uri = paste0(str_dt[,1], "/", "/", str_dt[,2], "/", str_dt[,3], "/", str_dt[,4], "/", str_dt[,5]) + #class = prefix + ConceptScheme = str_dt[,6] + data_fao = cbind(prefix, ConceptScheme, uri, data_fao[,2]) + rownames(data_fao) = 1:nrow(data_fao) + colnames(data_fao) = c("Prefix", "ConceptScheme", "URI", "Title") + + if (endpoint == "ALL") { + data = list("CELLAR" = data_cellar, "FAO" = data_fao) + } + + if (endpoint == "CELLAR") { + data = list("CELLAR" = data_cellar) + } + + if (endpoint == "FAO") { + data = list("FAO" = data_fao) + } + + return(data) + +} + diff --git a/R/classificationQC.R b/R/classificationQC.R new file mode 100644 index 0000000..f1ef586 --- /dev/null +++ b/R/classificationQC.R @@ -0,0 +1,498 @@ +#' @title ClassificationQC performs a quality check control checks on a given statistical classifications +#' @description The purpose of this function perform quality control checks on statistical classifications. +#' It checks the compliance of classifications with structural rules and provides informative error messages +#' for violations. The function requires input files containing code and label information for each +#' classification position. It verifies the formatting requirements, uniqueness of codes, fullness of hierarchy, +#' uniqueness of labels, hierarchical label dependencies, single child code compliance, and sequencing of codes. +#' The function generates a QC output data frame with the classification data, hierarchical level, code segments, +#' and test outcomes.Additionally, it allows exporting the output to a CSV file. Overall, the classificationQC +#' function ensures the integrity and accuracy of statistical classifications. +#' @param classification Refers to a classification in csv file or an R dataframe structured with two columns, consisting +#' of codes and labels, respectively. If the classification is provided as a csv file, it should be stored in the working directory (as +#' defined using \code{getwd}). This is a mandatory argument. +#' @param lengthsfile Refers to a CSV file or a R dataframe (one record per hierarchical level) containing the initial and +#' last position of the segment of the code specific to that level. The number of lines of this CSV file or the R dataframe will +#' also implicitly define the number of hierarchical levels of the classification. This is a mandatory argument. +#' @param fullHierarchy It is used to test the fullness of hierarchy. If the parameter \code{fullHierarchy} is set to \code{FALSE}, +#' the function will check that every position at a lower level than 1 should have parents all the way up to level 1. +#' If it is set to \code{TRUE}, in addition to the previous, it will be checked that any position at a higher level +#' than k should have children all the way down to level k. +#' @param labelUniqueness It is used to test the that positions at the same hierarchical level have unique labels. If set to \code{TRUE}, +#' the compliance is checked and positions with duplicate labels are marked as 1 in the "duplicateLabel" column, +#' while positions with unique labels are marked as 0. +#' @param labelHierarchy It is used to ensure that hierarchical structure of labels is respected. +#' When set to \code{TRUE}, the function will check that single child have a label identical to the label of its parent and that +#' has if a position has a label identical to the label of one of its children, then that position should only have a single child. +#' @param singleChildCode It refers to CSV file with specific formatting to define valid codes for each level. If this parameter is not \code{NULL} +#' then it checks compliance with coding rules for single children and non-single children, as provided in the CSV file. +#' @param sequencing It refers to a CSV file to define the admissible codes for multiple children at each level. If this parameter +#' is not \code{NULL}, the function checks the sequencing of multiple children codes within each level, as provided in the CSV file. +#' @param XLSXout The valid values are \code{FALSE} or \code{TRUE}. In both cases the output will be returned as an R list. +#' If output should be saved as a xlsx file, the argument should be set as \code{TRUE}. By default, no xlsx file is produced. +#' @import tidyverse writexl +#' @export +#' @return +#' \code{classificationQC()} returns a list of dataframes identifying possible the cases violating the formatting requirements. The +#' databases returned depend on the rules checked. The databases produced are: +#' \itemize{ +#' \item{QC_output} The dataset includes all the original records in the classification. Colum "Level" refers to the hierarchical levels +#' of each position. Each code will be parsed into segment_k (column "Segmentk") and code_k (column "Codek"), corresponding to the code +#' and segment and hierarchical level k respectively. Additional columns are included to flag the corrected behaviour in each position. +#' These are +#' \itemize{ +#' \item Orphan: if fullHierarchy is set to FALSE, an "orphan" is a position at a hierarchical level (j) greater than 1 that lacks a parent at the hierarchical level (j-1) immediately above it. +#' Orphan positions are marked with a value of 1 in the "QC output" column, indicating their orphan status. Otherwise, they are assigned a value of 0. +#' \item Childless: if fullHierarchy is set to TRUE, a "childless" position is one at a hierarchical level (j) less than k that lacks a child +#' at the hierarchical level (j+1) immediately below it. Childless positions are marked with a value of 1 +#' indicating their childless status. Otherwise, they are assigned a value of 0. +#' \item DuplicateLabel: new column in the output that flags positions involved in duplicate label situations (where multiple positions share the same label at the same hierarchical level) +#' by assigning them a value of 1, while positions with unique labels are assigned a value of 0. +#' \item SingleChildMismatch: column in the output provides information about label hierarchy consistency in a hierarchical classification system. It indicates:c +#' Value 1: Mismatched labels between a parent and its single child. +#' Value 9: Parent-child pairs with matching labels, but the parent has multiple children. +# Value 0: Compliance with the label hierarchy rule, indicating no mismatches or violations. +#' \item SingleCodeError: column serves as a flag indicating whether a position is a single child and whether the corresponding "singleCode" contains the level j segment. +#' A value of 1 signifies a mismatch, while a value of 0 indicates compliance with the coding rules +#' \item MultipleCodeError: column serves as a flag indicating whether a position is not a single child and whether the corresponding "multipleCodej" contains the level j segment. +#' A value of 1 signifies a mismatch, while a value of 0 indicates compliance with the coding rules +#' \item GapBefore: takes the value 0 or 1 if there is a missing child in the 123456789 series. +#' \item LastSibling: takes the value 1 when it is the last child in the series 123456789 otherwise the value 0 +#' } +#' +#' \item{QC_noLevels} A subset of the QC_output dataframe including only records for which levels is not defined. In general if this dataframe +#' is not empty, it suggest that either the classification or the length file is not correctily specified. +#' \item{QC_orphan} A subset of the QC_output dataframe including only records that have no parents at the higher hierarchical level. +#' \item{QC_childless} A subset of the QC_output dataframe including only records that have no children at the lower hierarchical level. +#' \item{QC_duplicatesLabel} A subset of the QC_output dataframe including only records that have duplicated label in the same hierarchical level. +#' \item{QC_duplicatesCode} A subset of the QC_output dataframe including only records that have the same codes. +#' \item{QC_singleChildMismatch} A subset of the QC_output dataframe including only records that are single child and have different labels from +#' their parents or that are multiple children and have same labels to their parents. +#' \item{QC_singleCodeError} A subset of the QC_output dataframe including only records that are single children and have been wrongly coded (not following +#' the rule provided in the 'SingleChildMismatch' CSV file). +#' \item{QC_multipleCodeError} A subset of the QC_output dataframe including only records that are multiple children and have been wrongly coded (not following +#' the rule provided in the 'SingleChildMismatch' CSV file). +#' \item{QC_gapBefore} A subset of the QC_output dataframe including only records that are multiple children and have gap before in the sequencing provided in the +#' 'sequencing' CSV file. +#' \item{QC_lastSibling} A subset of the QC_output dataframe including only records that are multiple and last children following the sequencing provided in the +#' 'sequencing' CSV file. +#' } +#' +#' @examples +#' { +#' prefix = "nace2" +#' conceptScheme = "nace2" +#' endpoint = "CELLAR" +#' lengthsTable = lengthsFile(endpoint, prefix, conceptScheme, correction = TRUE) +#' classification = retrieveClassificationTable(prefix, endpoint, conceptScheme, level="ALL")$ClassificationTable +#' classification = classification[,c(1,2)] +#' classification = correctionClassification(classification) +#' Output = classificationQC(classification, lengthsFile, fullHierarchy = TRUE, labelUniqueness = TRUE, labelHierarchy = TRUE, singleChildCode = NULL, sequencing = NULL) +#' View(Output$QC_output) +#' View(Output$QC_noLevels) +#' View(Output$QC_orphan) +#' View(Output$QC_childless) +#' View(Output$QC_duplicatesLabel) +#' View(Output$QC_duplicatesCode) +#' View(Output$QC_singleChildMismatch) +#' View(Output$QC_singleCodeError) +#' View(Output$QC_multipleCodeError) +#' View(Output$QC_gapBefore) +#' View(Output$QC_lastSibling) +#' } + + +classificationQC = function(classification, lengthsFile, fullHierarchy = TRUE, labelUniqueness = TRUE, labelHierarchy = TRUE, singleChildCode = NULL, sequencing = NULL, XLSXout = FALSE) { + + #if ((length(grep("csv", classification)) == 0) & !(is.data.frame(classification))){ + # stop("The classification should be provided as either a csv file or a R dataframe.") + #} + + if (is.data.frame(classification)){ + classification = classification + } + + #if (length(grep("csv", classification)) > 0){ + # classification = read.csv(file.path(paste0(getwd(), "/", classification))) + #} + + #check that classification has only two columns + if(ncol(classification) != 2){ + stop("The classification must have only two colums corresponding to code and label.") + } + + colnames(classification)[1:2] = c("Code", "Label") + + ## Length table + #if ((length(grep("csv", lengthsFile)) == 0) & !(is.data.frame(lengthsFile))){ + # stop("The lengthsFile should be provided as either a csv file or a R dataframe.") + #} + + if (is.data.frame(lengthsFile)){ + lengthsFile = lengthsFile + } + + #if (length(grep("csv", lengthsFile)) > 0){ + # lengthsFile = read.csv(file.path(paste0(getwd(), "/", lengthsFile))) + #} + + ### RULE 1 - Correctness of formatting requirements (lengths file) + + #check that char file has at least one row + if(nrow(lengths) == 0){ + stop("Lengths file must have at least one row") + } + + #check if value are strictly positive + negative_lengths <- which(lengths[,1] < 1 | lengths[,2] < 1) + if (length(negative_lengths) > 0) { + stop(paste("lengths must be strictly positive. Error at row:", negative_lengths)) + } + + #check if value are NA + na_lengths <- which(is.na(lengths[,1]) | is.na(lengths[,2])) + if (length(na_lengths) > 0) { + stop(paste("lengths must be missing. Error at row:", na_lengths)) + } + + #check the sequences rendered by these numbers should not overlap + for (i in 1:(nrow(lengths)-1)) { + if (lengths[i,2] >= lengths[i+1,1]) { + stop(paste("Sequences should not overlap in case of formatting errors. Error at row:", i+1)) + } + } + + #check that char file has valid character ranges -- NEEDED?! + if(!all(sapply(lengths$charb, function(x) grepl("[A-Za-z0-9]", x)) & + sapply(lengths$chare, function(x) grepl("[A-Za-z0-9]", x)) & + lengths$charb <= lengths$chare)){ + stop("Char file must contain only numbers") + } + + + ## Create QC_output + QC_output = classification + + #add hierarchical level column + QC_output$level = sapply(classification[,1], function(x) match(TRUE, nchar(x) == lengths$chare)) + + #add Code and segment columns + for(i in 1:nrow(lengths)){ + charb = lengths$charb[i] + chare = lengths$chare[i] + QC_output[, paste0("segment", i)] = ifelse(QC_output$level >= i, substr(QC_output[,1], charb, chare), NA) + QC_output[, paste0("Code", i)] = ifelse(QC_output$level >= i, substr(QC_output[,1], 1, chare), NA) + } + + ## RULE 2 - Compliance with formatting requirements (lengths file) + na_level = which(is.na(QC_output$level)) + if (length(na_level) > 0) { + warning("Some codes have no specified level. There might be possible errors in the classification or in the length file (see 'QC_noLevels').") + } + + QC_noLevels = QC_output[na_level, ] + + ## RULE 3 - Uniqueness of codes + dup = duplicated(classification[,1]) + QC_output$duplicateCode = as.numeric(dup) + if (any(dup)) { + class_dup <- classification[dup,] + errow_row <- which(dup) + warning("Codes in classification file must be unique (see 'QC_duplicatesCode').") + } + QC_duplicatesCode = QC_output[which(QC_output$duplicateCode == 1), ] + + ## RULE 4 - Fullness of hierarchy + QC_output$orphan = rep(NA, nrow(QC_output)) + + for (k in nrow(lengths):2) { + QC_output$exp_parents = rep(NA, nrow(QC_output)) + QC_output$exp_parents[which(QC_output$level == k)] = substr(QC_output[which(QC_output$level == k), paste0("Code", k)], 1, lengths[k-1,2]) + o_code = which(QC_output$exp_parents %in% QC_output[which(QC_output$level == k-1), paste0("Code", k-1)]) + QC_output$orphan[intersect(which(QC_output$level == k), o_code)] = 0 + noo_code = which(!QC_output$exp_parents %in% QC_output[which(QC_output$level == k-1), paste0("Code", k-1)]) + QC_output$orphan[intersect(which(QC_output$level == k), noo_code)] = 1 + #QC_output$orphan[which(is.na(QC_output$exp_parents))] = NA + } + QC_output = QC_output[, -which(colnames(QC_output) == "exp_parents")] + + #identify orphans + orphan = which(QC_output$orphan == 1) + if (length(orphan) > 0) { + warning("Some codes at a lower level than 1 have no parents at higher levels ('see QC_orphan').") + } + + QC_orphan = QC_output[orphan, ] + + #childless - if (fullHierarchy == TRUE) + if (fullHierarchy){ + QC_output$childless = rep(NA, nrow(QC_output)) + + for (k in 2:nrow(lengths)) { + exp_parents = substr(QC_output[which(QC_output$level == k), paste0("Code", k)], 1, lengths[k-1,2]) + QC_output$childless[which(QC_output[, paste0("Code", k-1)] %in% exp_parents & QC_output$level == k-1)] = 0 + QC_output$childless[which(!QC_output[, paste0("Code", k-1)] %in% exp_parents & QC_output$level == k-1)] = 1 + } + + #identify childless + childless = which(QC_output$childless == 1) + if (length(childless) > 0) { + warning(paste("Some codes at a higher level than ", nrow(lengths) ," have no children at lower levels ('see QC_childless').")) + } + QC_childless = QC_output[childless, ] + } + + ### RULE 5 - Uniqueness of labels + if (labelUniqueness){ + QC_output$duplicateLabel = 0 + + # Check for duplicate labels at each hierarchical level + for (le in unique(QC_output$level)) { + level_data = QC_output[QC_output$level == le,] + level_data$Label = substr(level_data$Label, lengths[le,2] + 1, nchar(level_data$Label)) + if (nrow(level_data) != length(unique(level_data$Label))) { + # There are duplicates, mark them in the QC output column + # The outcome of the test should be reported in a new ‘QC output’ column (duplicateLabel) assuming the value 1 for positions involved in duplicates (0 otherwise). + duplicate_labels = level_data$Label[duplicated(level_data$Label)] + ##Here we add 1 if we have the same label. + QC_output$duplicateLabel[QC_output$level == le & QC_output$Label %in% duplicate_labels] = 1 + } + } + + #identify duplicates + duplicatesLabel = which(QC_output$duplicateLabel == 1) + if (length(duplicatesLabel) > 0) { + warning(paste("Some codes at the same hierarchical level have the same labels (see 'QC_duplicatesLabel').")) + } + QC_duplicatesLabel = QC_output[duplicatesLabel,] + } + + + ## RULE 6 - Hierarchical label dependencies + if (labelHierarchy){ + QC_output$singleChildMismatch = 0 + + for (k in 1:(nrow(lengths)-1)) { + + #select only parents with children + parents_k = QC_output$Code[which(QC_output$level == k)] + #list the no. of child for each parent then select parents with single child / or multiple children + child_ls = sapply(unique(parents_k), function(x) length(unique(na.omit(QC_output[which(QC_output[[paste0("Code", k)]] == x), paste0("Code", k+1)])))) + code_singlechild = QC_output[which(QC_output[[paste0("Code", k)]] %in% names(which(child_ls == 1)) & !is.na(QC_output[[paste0("Code", k+1)]]) & QC_output$level == k +1), c(paste0("Code", k), paste0("Code", k+1))] + code_multichild = QC_output[which(QC_output[[paste0("Code", k)]] %in% names(which(child_ls > 1)) & !is.na(QC_output[[paste0("Code", k+1)]]) & QC_output$level == k +1), c(paste0("Code", k), paste0("Code", k+1))] + + #only label + label_nocode = gsub("(\\d)|\\.", "", QC_output$Label) + label_nocode = mapply(function(x,y) sub(x, "", y), QC_output$Code, label_nocode) + label_nocode = tolower(str_squish(label_nocode)) + + #check if single child have different labels from their parents (=1) + if (nrow(code_singlechild) != 0){ + for (c in 1:nrow(code_singlechild)){ + row_parent = which(QC_output$Code == as.character(code_singlechild[c,1])) + label_parent = label_nocode[row_parent] + row_child = which(QC_output$Code == as.character(code_singlechild[c,2])) + label_child = label_nocode[row_child] + if (label_parent != label_child) { + QC_output$singleChildMismatch[row_child] = 1 + } + } + } + #identify mismatches - + singleChildMismatch = which(QC_output$singleChildMismatch != 0) + if (length(singleChildMismatch) > 0) { + warning(paste("Some single child have different labels from their parents or some multiple children have same labels to their parents (see 'QC_singleChildMismatch').")) + } + QC_singleChildMismatch = QC_output[singleChildMismatch,] + } + } + + ## RULE 7 - Single child code compliance + if (!missing(singleChildCode)){ + + singleChildCode = read.csv(file.path(paste0(getwd(), "/", singleChildCode))) + + QC_output$singleCodeError = 0 + QC_output$multipleCodeError = 0 + + for (k in 1:(nrow(lengths)-1)) { + + if (unique(nchar(na.omit(QC_output[[paste0("segment", k+1)]]))) > 1) { + warning(paste0("Single child code compliance cannot be checked at level ", k+1, " as segments of code have more than one character.")) + QC_output$singleCodeError[which(nchar(na.omit(QC_output[[paste0("segment", k+1)]])) > 1)] = NA + QC_output$multipleCodeError[which(nchar(na.omit(QC_output[[paste0("segment", k+1)]])) > 1)] = NA + } + + if (unique(nchar(na.omit(QC_output[[paste0("segment", k+1)]]))) == 1) { + + #select only parents with children + parents_k = QC_output$Code[which(QC_output$level == k)] + #list the no. of child for each parent then select parents with single child / or multiple children + child_ls = sapply(unique(parents_k), function(x) length(unique(na.omit(QC_output[which(QC_output[[paste0("Code", k)]] == x), paste0("Code", k+1)])))) + code_singlechild = QC_output[which(QC_output[[paste0("Code", k)]] %in% names(which(child_ls == 1)) & !is.na(QC_output[[paste0("Code", k+1)]]) & QC_output$level == k +1), c(paste0("Code", k), paste0("Code", k+1))] + code_multichild = QC_output[which(QC_output[[paste0("Code", k)]] %in% names(which(child_ls > 1)) & !is.na(QC_output[[paste0("Code", k+1)]]) & QC_output$level == k +1), c(paste0("Code", k), paste0("Code", k+1))] + + level = singleChildCode[k, 1] + ##singleCode take all the code end by a "0" + single = singleChildCode[which(singleChildCode[,1] == level),2] + ## MultipleCode take all the code end as in the csv file #COULD BE LETTER AS WELL + multi = strsplit(as.character(singleChildCode[which(singleChildCode[,1] == level),3]),"")[[1]] + + #Determine the observed code end for single and multi children + single_code = str_sub(code_singlechild[,2], nchar(code_singlechild[,2]), nchar(code_singlechild[,2])) + multi_code = str_sub(code_multichild[,2], nchar(code_multichild[,2]), nchar(code_multichild[,2])) + + #Check if they are correct (single) + if (nrow(code_singlechild) != 0){ + for (c in 1:nrow(code_singlechild)){ + row_parent = which(QC_output$Code == as.character(code_singlechild[c,1])) + label_parent = label_nocode[row_parent] + row_child = which(QC_output$Code == as.character(code_singlechild[c,2])) + label_child = label_nocode[row_child] + + #check if single child have correct end code + end_code = str_sub(QC_output$Code[row_child], nchar(QC_output$Code[row_child]), nchar(QC_output$Code[row_child])) + if (!end_code %in% single) { + QC_output$singleCodeError[row_child] = 1 + } + } + } + + + #Check if they are correct (multi) + if (nrow(code_multichild) != 0){ + for (m in 1:nrow(code_multichild)){ + row_parent = which(QC_output$Code == as.character(code_multichild[m,1])) + label_parent = label_nocode[row_parent] + row_child = which(QC_output$Code == as.character(code_multichild[m,2])) + label_child = label_nocode[row_child] + #check if multiple child have same labels to their parents (=9) + if (label_parent == label_child) { + QC_output$singleChildMismatch[row_child] = 9 + } + + #check if multi child have correct end code + end_code = str_sub(QC_output$Code[row_child], nchar(QC_output$Code[row_child]), nchar(QC_output$Code[row_child])) + if (!end_code %in% multi) { + QC_output$multipleCodeError[row_child] = 1 + } + } + } + } + } + + + #identify mismatches + singleCodeError = which(QC_output$singleCodeError != 0) + if (length(singleCodeError) > 0) { + warning(paste("Some single children been wrongly coded (see 'QC_singleCodeError'.")) + } + QC_singleCodeError = QC_output[singleCodeError,] + + multipleCodeError = which(QC_output$multipleCodeError != 0) + if (length(multipleCodeError) > 0) { + warning(paste("Some multiple children been wrongly coded (see 'QC_multipleCodeError'.")) + } + QC_multipleCodeError = QC_output[multipleCodeError,] + } + + ## RULE 8 - Sequencing of codes + if (!missing(sequencing)){ + + sequencing = read.csv(file.path(paste0(getwd(), "/", sequencing))) + + QC_output$gapBefore = 0 + QC_output$lastSibling = 0 + + for (k in 1:(nrow(lengths)-1)) { + + if (unique(nchar(na.omit(QC_output[[paste0("segment", k+1)]]))) > 1) { + warning(paste0("Sequencing of codes cannot be checked at level ", k+1, " as segments of code have more than one character.")) + QC_output$gapBefore[which(nchar(na.omit(QC_output[[paste0("segment", k+1)]])) > 1)] = NA + QC_output$lastSibling[which(nchar(na.omit(QC_output[[paste0("segment", k+1)]])) > 1)] = NA + } + + if (unique(nchar(na.omit(QC_output[[paste0("segment", k+1)]]))) == 1) { + + #select only parents with children + parents_k = QC_output$Code[which(QC_output$level == k)] + #list the no. of child for each parent then select parents with single child / or multiple children + child_ls = sapply(unique(parents_k), function(x) length(unique(na.omit(QC_output[which(QC_output[[paste0("Code", k)]] == x), paste0("Code", k+1)])))) + code_multichild = QC_output[which(QC_output[[paste0("Code", k)]] %in% names(which(child_ls > 1)) & !is.na(QC_output[[paste0("Code", k+1)]]) & QC_output$level == k +1), c(paste0("Code", k), paste0("Code", k+1))] + + level = sequencing[k, 1] + + ## MultipleCode take all the code end as in the csv file #COULD BE LETTER AS WELL + multi = strsplit(as.character(sequencing[which(sequencing[,1] == level),2]),"")[[1]] + + #Determine the observed code end for single and multi children + multi_code = str_sub(code_multichild[,2], nchar(code_multichild[,2]), nchar(code_multichild[,2])) + + #identify last code for multi children + mcode_ls = sapply(unique(code_multichild[,1]), function(x) code_multichild[,2][which(code_multichild[,1] == x)]) + ecode_ls = lapply(mcode_ls, function(x) str_sub(x, nchar(x), nchar(x))) + + ## to avoid confusions I have added to take the first element only, but needs to be changed + last_dig = unlist(lapply(ecode_ls, function(x) which(x == max(x))[1])) + last_code = as.vector(mapply(function(x, y) x[y], mcode_ls, last_dig)) + + #identify code with gap before + gap_find = lapply(ecode_ls, function(x) match(multi, x)) + code_gap = lapply(gap_find, function(x) which(is.na(x)) + 1) + gapbefore_dig = mapply(function(x, y) na.omit(x[y]), gap_find, code_gap) + gapbefore_code = as.vector(unlist(mapply(function(x, y) x[y], mcode_ls, gapbefore_dig))) + + #flag in the QC_output + QC_output$lastSibling[which(QC_output$Code %in% last_code)] = 1 + QC_output$gapBefore[which(QC_output$Code %in% gapbefore_code)] = 1 + } + + #identify gab before + gap = which(QC_output$gapBefore == 1) + if (length(gap) > 0) { + warning(paste("There are gab in the sequencing of multiple children coding (see 'QC_gapBefore').")) + } + QC_gapBefore = QC_output[gap,] + + #identify last sibling - + lastSibling = which(QC_output$lastSibling == 1) + QC_lastSibling = QC_output[lastSibling,] + + } + } + + ## RESULTS + if (!(fullHierarchy)) { + QC_childless = data.frame() + } + + if (!(labelUniqueness)) { + QC_duplicatesLabel = data.frame() + } + + if (!(labelHierarchy)) { + QC_singleChildMismatch = data.frame() + } + + if (missing(singleChildCode)) { + QC_singleCodeError = data.frame() + QC_multipleCodeError = data.frame() + } + + if (missing(sequencing)) { + QC_lastSibling = data.frame() + } + + return_ls = list("QC_output" = QC_output, "QC_noLevels" = QC_noLevels, "QC_duplicatesCode" = QC_duplicatesCode, "QC_orphan" = QC_orphan, + "QC_childless" = QC_childless, "QC_duplicatesLabel" = QC_duplicatesLabel, "QC_singleChildMismatch" = QC_singleChildMismatch, + "QC_singleCodeError" = QC_singleCodeError, "QC_multipleCodeError" = QC_multipleCodeError, "QC_gapBefore" = QC_gapBefore, + "QC_lastSibling" = QC_lastSibling) + + if (XLSXout == TRUE){ + write_xlsx(return_ls, file.path(paste0(getwd(), "/QC_output.csv"))) + } + + return(return_ls) +} + + + diff --git a/R/correctionClassification.R b/R/correctionClassification.R new file mode 100644 index 0000000..3b9908a --- /dev/null +++ b/R/correctionClassification.R @@ -0,0 +1,101 @@ +#' @title Retrieve classification table from CELLAR and FAO repositories. +#' @description The aim of this function is to provide a table showing the different codes and labels for each classification +#' @param classification it returns a dataframe with two columns corrected according to the classification of CELLAR & FAO. +#' @export +#' @return +#' \code{correctionClassification()} returns a table with information needed to retrieve the classification table: +#' \itemize{ +#' \item Classification Code name (e.g. nace2): the code of each object +#' \item Classification Label: corresponding name of each object +#' } +#' @examples +#' { +#' prefix = "nace2" +#' conceptScheme = "nace2" +#' endpoint = "CELLAR" +#' classification = retrieveClassificationTable(prefix, endpoint, conceptScheme, level="ALL")$ClassificationTable +#' correct_classification = correctionClassification(classification) +#' View(correct_classification) +#' } + + +correctionClassification = function(classification){ + + if(ncol(classification) != 2){ + stop("The classification must have only two colums corresponding to code and label.") + } + + colnames(classification)[1:2] = c("Code", "Label") + + #add letter to code (for NACE - NACE 2.1 - CPA21 - and ISIC) + if (prefix %in% c("nace2", "nace21", "cpa21", "ISICrev4")) { + A_code = which(substr(classification$Code, 1, 2) %in% c("01", "02", "03")) + classification$Code[A_code] = paste0("A", classification$Code[A_code]) + B_code = which(substr(classification$Code, 1, 2) %in% c("05", "06", "07", "08", "09")) + classification$Code[B_code] = paste0("B", classification$Code[B_code]) + C_code = which(substr(classification$Code, 1, 2) %in% c("10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", + "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33")) + classification$Code[C_code] = paste0("C", classification$Code[C_code]) + D_code = which(substr(classification$Code, 1, 2) %in% c("35")) + classification$Code[D_code] = paste0("D", classification$Code[D_code]) + E_code = which(substr(classification$Code, 1, 2) %in% c("36", "37", "38", "39")) + classification$Code[E_code] = paste0("E", classification$Code[E_code]) + F_code = which(substr(classification$Code, 1, 2) %in% c("41", "42", "43")) + classification$Code[F_code] = paste0("F", classification$Code[F_code]) + G_code = which(substr(classification$Code, 1, 2) %in% c("45", "46", "47")) + classification$Code[G_code] = paste0("G", classification$Code[G_code]) + H_code = which(substr(classification$Code, 1, 2) %in% c("49", "50", "51", "52", "53")) + classification$Code[H_code] = paste0("H", classification$Code[H_code]) + I_code = which(substr(classification$Code, 1, 2) %in% c("55", "56")) + classification$Code[I_code] = paste0("I", classification$Code[I_code]) + J_code = which(substr(classification$Code, 1, 2) %in% c("58", "59", "60", "61", "62", "63")) + classification$Code[J_code] = paste0("J", classification$Code[J_code]) + K_code = which(substr(classification$Code, 1, 2) %in% c("64", "65", "66")) + classification$Code[K_code] = paste0("K", classification$Code[K_code]) + L_code = which(substr(classification$Code, 1, 2) %in% c("68")) + classification$Code[L_code] = paste0("L", classification$Code[L_code]) + M_code = which(substr(classification$Code, 1, 2) %in% c("69", "70", "71", "72", "73", "74", "75")) + classification$Code[M_code] = paste0("M", classification$Code[M_code]) + N_code = which(substr(classification$Code, 1, 2) %in% c("77", "78", "79", "80", "81", "82")) + classification$Code[N_code] = paste0("N", classification$Code[N_code]) + O_code = which(substr(classification$Code, 1, 2) %in% c("84")) + classification$Code[O_code] = paste0("O", classification$Code[O_code]) + P_code = which(substr(classification$Code, 1, 2) %in% c("85")) + classification$Code[P_code] = paste0("P", classification$Code[P_code]) + Q_code = which(substr(classification$Code, 1, 2) %in% c("86", "87", "88")) + classification$Code[Q_code] = paste0("Q", classification$Code[Q_code]) + R_code = which(substr(classification$Code, 1, 2) %in% c("90", "91", "92", "93")) + classification$Code[R_code] = paste0("R", classification$Code[R_code]) + S_code = which(substr(classification$Code, 1, 2) %in% c("94", "95", "96")) + classification$Code[S_code] = paste0("S", classification$Code[S_code]) + T_code = which(substr(classification$Code, 1, 2) %in% c("97", "98")) + classification$Code[T_code] = paste0("T", classification$Code[T_code]) + U_code = which(substr(classification$Code, 1, 2) %in% c("99")) + classification$Code[U_code] = paste0("U", classification$Code[U_code]) + } + + #remove .0 for 10, 11 and 12 division (for ecoicop) --- THIS WAS CHANGED (but not for hicp) + if (prefix %in% c("ecoicop")) { + level1_code = which(classification$Code %in% c("10.0", "11.0", "12.0")) + classification$Code[level1_code] = c("10", "11", "12") + } + + #remove weird code 00.99.t and 00.99.t (for prodcom2019) + if (prefix %in% c("prodcom2019")) { + level1_code = which(classification$Code %in% c("00.99.t", "00.99.z")) + classification = classification[-level1_code,] + } + + #remove section (for CN) - does not need correction NO SECTION + if (prefix %in% c("cn2017", "cn2018", "cn2019", "cn2020", "cn2021", "cn2021", "cn2022", "cn2023")) { + level1_code = which(gsub("[^a-zA-Z]", "", classification$Code)!= "") + classification = classification[-level1_code,] + } + + #remove . in the end of the code (for CBF) + if (prefix %in% c("cbf10")) { + classification[,1] = substr(classification[,1], 1, nchar(classification[,1])-1) + } + + return(classification) +} \ No newline at end of file diff --git a/R/dataStructure.R b/R/dataStructure.R new file mode 100644 index 0000000..981390d --- /dev/null +++ b/R/dataStructure.R @@ -0,0 +1,118 @@ +#' @title Retrieve information about the structure of each classification tables from CELLAR and FAO repositories. +#' @description Retrieve information, for all the classification available in the repositories (CELLAR and FAO), +#' about the level names their hierarchy and the numbers of records the function "structureData()" can be used. +#' @param prefix Prefixes are typically defined at the beginning of a SPARQL query +#' and are used throughout the query to make it more concise and easier to read. +#' Multiple prefixes can be defined in a single query to cover different namespaces used in the data set. +#' The function 'classificationEndpoint()' can be used to generate the prefixes for the selected classification table. +#' @param conceptScheme Refers to a unique identifier associated to specific classification table. +#' The conceptScheme can be obtained by utilizing the "classificationEndpoint()" function. +#' @param endpoint SPARQL endpoints provide a standardized way to access data sets, +#' making it easier to retrieve specific information or perform complex queries on linked data. +#' The valid values are \code{"CELLAR"} or \code{"FAO"}. +#' @param language Refers to the specific language used for providing label, include and exclude information in the selected classification table. +#' By default is set to "en". This is an optional argument. +#' @import httr +#' @export +#' @return +#' \code{structureData()} returns the structure of a classification table from CELLAR and FAO in form a table with the following colums: +#' \itemize{ +#' \item Concept_Scheme: taxonomy of the SKOS object to be retrieved +#' \item Level: the levels of the objects in the collection +#' \item Depth: identify the hierarchy of each level +#' \item Count: the number of objects retrieved in each level +#' } +#' @examples +#' { +#' ## Obtain a list including the structure of each classification available +#' ## CELLAR +#' data_CELLAR = list() +#' endpoint = "CELLAR" +#' list_data = classificationEndpoint("ALL") +#' +#' for (i in 1:nrow(list_data$CELLAR)){ +#' prefix = list_data$CELLAR[i,1] +#' conceptScheme = list_data$CELLAR[i,2] +#' data_CELLAR[[i]] = dataStructure(prefix, conceptScheme, endpoint) +#' } +#' names(data_CELLAR) = list_data$CELLAR[,1] + +#' ## FAO +#' data_FAO = list() +#' endpoint = "FAO" +#' for (i in 1:nrow(list_data$FAO)){ +#' prefix = list_data$FAO[i,1] +#' conceptScheme = list_data$FAO[i,2] +#' data_FAO[[i]] = dataStructure(prefix, conceptScheme, endpoint) +#' } +#' names(data_FAO) = list_data$FAO[,1] +#' } + + +dataStructure = function(prefix, conceptScheme, endpoint, language = "en") { + + ### Define endpoint + if (endpoint == "CELLAR") { + source = "http://publications.europa.eu/webapi/rdf/sparql" + url = "data.europa.eu/" + } + if (endpoint == "FAO") { + source = "https://stats.fao.org/caliper/sparql/AllVocs" + url = "unstats.un.org/" + } + + ## Create Prefixes list + prefix_ls = prefixList(endpoint) + prefix_ls = as.character(paste(prefix_ls, collapse = "\n")) + ### Load prefixes from Excel file + #prefix_file = read.csv(paste0("//lu-fsp01/Data_Lux/AgSTAT/Projects/CorrespondenceTables_Rpck/Task 3/prefix_", endpoint, ".csv")) + #prefix = as.character(paste(prefix_file$List, collapse = "\n")) + + ### SPARQL query + SPARQL.query = paste0(prefix_ls, " + SELECT DISTINCT ?Concept_Scheme ?Level ?Depth (COUNT (distinct ?s) AS ?Count) + + WHERE { + ?s skos:prefLabel ?Label ; + #skos:inScheme ", prefix, ":", conceptScheme, " ; + skos:inScheme ?Scheme ; + ^skos:member ?Member ; + skos:prefLabel ?Label ; + skos:notation ?notation . + + ?Member a xkos:ClassificationLevel . + OPTIONAL {?member xkos:levels ?levels_temp . } + OPTIONAL {?member xkos:depth ?Depth . } + + FILTER (?Scheme = ", prefix, ":", conceptScheme, ") + FILTER (lang(?Label) = '", language, "') + #FILTER (datatype(?notation) = rdf:PlainLiteral) + + BIND (STR(?s) AS ?URL) + #BIND (STR(?notation) AS ?CODE ) + + BIND (STRAFTER(STR(", prefix, ":), 'http') AS ?CLASS_URL) + #BIND (STRAFTER((?CLASS_URL), '/') AS ?Class) + BIND (STRAFTER(STR(?Scheme), STR(?CLASS_URL)) AS ?Concept_Scheme) + BIND (STRAFTER(str(?Member), STR(?CLASS_URL)) As ?Level) + + FILTER (STRLEN(?Concept_Scheme) != 0) + FILTER (STRLEN(?Level) != 0) + #FILTER (?Concept_Scheme != STR('ag')) + } + + GROUP BY ?Concept_Scheme ?Level ?Depth + ORDER BY ?Concept_Scheme ?Level ?Depth + ") + + + response = httr::POST(url = source, accept("text/csv"), body = list(query = SPARQL.query), encode = "form") + table = read.csv(text=content(response, "text"), sep= ",") + table = table[order(table[,3],decreasing=FALSE),] + + if (nrow(table) == 0){ + message("This classification has no level. Please use level = 'ALL' when retrieving it using the retrieveClassificationTable") + } + + return(table) +} \ No newline at end of file diff --git a/R/lengthsFile.R b/R/lengthsFile.R new file mode 100644 index 0000000..6ff3fe1 --- /dev/null +++ b/R/lengthsFile.R @@ -0,0 +1,145 @@ +#' @title Retrieve correspondance tables lenghts for each level tables between classification from CELLAR and FAO repositories +#' @description The aim of this function is to provide a table showing the different levels of hierarchy for each classification and the length of each level. +#' @param endpoint SPARQL endpoints provide a standardized way to access data sets, +#' making it easier to retrieve specific information or perform complex queries on linked data. +#' The valid values are \code{"CELLAR"} or \code{"FAO"}. +#' @param prefix Prefixes are typically defined at the beginning of a SPARQL query +#' and are used throughout the query to make it more concise and easier to read. +#' Multiple prefixes can be defined in a single query to cover different namespaces used in the dataset. +#' The function 'classEndpoint()' can be used to generate the prefixes for the selected correspondence table. +#' @param conceptScheme Refers to a unique identifier associated to specific classification table. +#' The conceptScheme can be obtained by utilizing the "classEndpoint()" function. +#' @param correction The valid values are \code{FALSE} or \code{TRUE}. In both cases the lengths table as an R object. +#' If the output wants to have a correction for hierarchy levels \code{TRUE}. By default is set to "TRUE". +#' @export +#' @return +#' \code{lenghtsFile()} returns a table containing the lengths for each hierarchical level of the classification. +#' \itemize{ +#' \item charb: contains the length for each code for each hierarchical level +#' \item chare: contains the concatenated length of char b for each code for each hierarchical level +#' } +#' @examples +#' { +#' endpoint = "CELLAR" +#' prefix = "nace2" +#' conceptScheme = "nace2" +#' +#' lengthsTable = lengthsFile(endpoint, prefix, conceptScheme, correction = TRUE) +#' +#' #View lengthsTable +#' View(lengthsTable) +#' +#' } + + + +lengthsFile = function(endpoint, prefix, conceptScheme, correction = TRUE) { + + ## Create 'lengths' table - + ## WE NEED TO CLARIFY IF THE LENGTHS FILE IS AUTOMATICALLY OBTAINED OR PROVIDED BY THE USER! + level_dt = dataStructure(prefix, conceptScheme, endpoint) + + if (isTRUE(correction)) { + # order (for prodcom) + if (prefix %in% c("prodcom2019", "prodcom2021", "prodcom2022")) { + level_dt = level_dt[c(2,1,3),] + } + # remove first level (for CN) + if (prefix %in% c("cn2017", "cn2018", "cn2019", "cn2020", "cn2021", "cn2021", "cn2022", "cn2023")) { + level_dt = level_dt[-1,] + } + # order (for CPA) + if (prefix %in% c("cpa21")) { + level_dt = level_dt[c(1,2,3,6,4,5),] + } + } + + #vectors to store info + level = length = start_pos = end_pos = numeric(nrow(level_dt)) + level = level_dt[,2] + + #level 1 not included (ASK!) + dt = retrieveClassificationTable(prefix, endpoint, conceptScheme, level[1])$ClassificationTable + if (isTRUE(correction)) { + ## remove .0 for 10, 11 and 12 division (ecoicop) + if (prefix %in% c("ecoicop")) { + dt[,1][which(dt[,1] %in% c("10.0", "11.0", "12.0"))] = c("10", "11", "12") + } + ## remove weird code 00.99.t and 00.99.t (for prodcom2019) + if (prefix %in% c("prodcom2019", "prodcom2021", "prodcom2022")) { + dt = dt[-which(dt[,1] %in% c("00.99.t", "00.99.z")),] + } + } + + if (length(unique(nchar(dt[,1]))) > 1) { + length[1] = 999 + start_pos[1] = NA + end_pos[1] = NA + } else { + length[1] = unique(nchar(dt[,1])) + start_pos[1] = sum(c(unique(sapply(dt[,1], function(x) regexpr("[^ .]", x)))), na.rm = T) + if (is.numeric(dt[,1]) == TRUE) { + rev_string = sapply(dt[,1], function(x) rev(x)) + } else { + rev_string = sapply(dt[,1], function(x) intToUtf8(rev(utf8ToInt(x)))) + } + end_pos[1] = sum(c(unique(nchar(dt[,1])), - unique(sapply(rev_string, function(x) regexpr("[^ .]", x))), 1), na.rm = T) + } + + #other levels + for (l in 2:nrow(level_dt)) { + dt = retrieveClassificationTable(prefix, endpoint, conceptScheme, level[l])$ClassificationTable + if (isTRUE(correction)) { + ## add letter to code (for NACE, NACE 2.1, CPA and ISIC) + if (prefix %in% c("nace2", "nace21", "cpa21", "ISICrev4")) { + dt[,1] = paste0("A", dt[,1]) + } + ## add leading zero for ICC_v11 + if (prefix %in% c("ICC_v11")) { + if (l == 2){ dt[,1] = sprintf("%.2f", dt[,1]) } + } + } + + if (length(unique(nchar(dt[,1]))) > 1) { + length[l] = 999 + start_pos[l] = NA + end_pos[l] = NA + } else { + length[l] = unique(nchar(dt[,1])) + code = sapply(dt[,1], function(x) substring(x, first = end_pos[l-1] + 1, last = length[l])) + start_pos[l] = sum(c(unique(sapply(code, function(x) regexpr("[^ .]", x))), end_pos[l-1]), na.rm = T) + rev_string = sapply(code, function(x) intToUtf8(rev(utf8ToInt(x)))) + end_pos[l] = sum(c(unique(nchar(code)), - unique(sapply(rev_string, function(x) regexpr("[^ .]", x))), 1, end_pos[l-1]), na.rm = T) + } + } + + #create length table + level_table = data.frame(level, length, start_pos, end_pos) + + charb = start_pos + chare = end_pos + lengths = data.frame(cbind(charb, chare)) + + if (TRUE %in% is.na(lengths[,1]) | TRUE %in% is.na(lengths[,2])){ + warning("There is a problem with the given classification and the lenghts file produced should not be trusted. Please check the classification and correct any issue.") + } + + if (isFALSE(correction)){ + warning("The lenghts file produced could be wrong. Please make sure the classification is correct.") + } + + return(lengths) +} + + + +## Retrieve Classification +#prefix = "nace2" +#conceptScheme = "nace2" +#endpoint = "CELLAR" + +#lengthsTable = lengthsFile(endpoint, prefix, conceptScheme, correction = TRUE) +#lengthsTable + +#lengthsTable = lengthsFile(endpoint, prefix, conceptScheme, correction = FALSE) +#lengthsTable diff --git a/R/prefixList.R b/R/prefixList.R index 9d1fe5e..c25811b 100644 --- a/R/prefixList.R +++ b/R/prefixList.R @@ -45,4 +45,4 @@ prefixList = function(endpoint) { } return(prefix_all) -} \ No newline at end of file +} diff --git a/R/retrieveClassificationTable.R b/R/retrieveClassificationTable.R index 6241b56..964415f 100644 --- a/R/retrieveClassificationTable.R +++ b/R/retrieveClassificationTable.R @@ -21,6 +21,7 @@ #' } #' @examples #' { +#' endpoint = "CELLAR" #' prefix = "nace2" #' endpoint = "CELLAR" #' conceptScheme = "nace2" @@ -52,6 +53,7 @@ retrieveClassificationTable = function(prefix, endpoint, conceptScheme, level = ### CLASSIFICATION TABLE SPARQL QUERIES ### Define SPARQL query -- BASE: all levels SPARQL.query_0 = paste0(prefixlist, " + SELECT DISTINCT ?", prefix, " ?NAME ?Include ?Include_Also ?Exclude ?URL ?datatype WHERE { @@ -77,6 +79,7 @@ retrieveClassificationTable = function(prefix, endpoint, conceptScheme, level = ") + ### Define SPARQL query -- FILTER LEVEL SPARQL.query_level = paste0("FILTER (?Member = ", prefix, ":", level, ")") @@ -128,4 +131,5 @@ retrieveClassificationTable = function(prefix, endpoint, conceptScheme, level = } return(data) + } \ No newline at end of file diff --git a/man/classificationEndpoint.Rd b/man/classificationEndpoint.Rd new file mode 100644 index 0000000..78a31d5 --- /dev/null +++ b/man/classificationEndpoint.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/classificationEndpoint.R +\name{classificationEndpoint} +\alias{classificationEndpoint} +\title{Retrieve a list of classification tables from CELLAR and FAO repositories or both.} +\usage{ +classificationEndpoint(endpoint = "ALL") +} +\arguments{ +\item{endpoint}{SPARQL endpoints provide a standardized way to access data sets, +making it easier to retrieve specific information or perform complex queries on linked data. This is an optional +parameter, which by default is set to \code{"ALL"}. +The valid values are \code{"CELLAR"}, \code{"FAO"} and \code{"ALL"} for both endpoints.} +} +\value{ +\code{classificationEndpoint()} returns a table with information needed to retrieve the classification table: +\itemize{ + \item Prefix name: the SPARQL instruction for a declaration of a namespace prefix + \item Conceptscheme: taxonomy of the SKOS object to be retrieved + \item URI: the URL from which the SPARQL query was retrieved + \item Name: the name of the table retrieved +} +} +\description{ +The purpose of this function is to provide a comprehensive summary +of the data structure for each classification in CELLAR and FAO endpoint. +The summary includes information such as the prefix name, URI, key, concept scheme, and title associated with each classification. +} +\examples{ +{ + endpoint = "ALL" + list_data = classificationEndpoint(endpoint) + } +} diff --git a/man/classificationQC.Rd b/man/classificationQC.Rd new file mode 100644 index 0000000..316b8ee --- /dev/null +++ b/man/classificationQC.Rd @@ -0,0 +1,126 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/classificationQC.R +\name{classificationQC} +\alias{classificationQC} +\title{ClassificationQC performs a quality check control checks on a given statistical classifications} +\usage{ +classificationQC( + classification, + lengthsFile, + fullHierarchy = TRUE, + labelUniqueness = TRUE, + labelHierarchy = TRUE, + singleChildCode = NULL, + sequencing = NULL, + XLSXout = FALSE +) +} +\arguments{ +\item{classification}{Refers to a classification in csv file or an R dataframe structured with two columns, consisting +of codes and labels, respectively. If the classification is provided as a csv file, it should be stored in the working directory (as +defined using \code{getwd}). This is a mandatory argument.} + +\item{fullHierarchy}{It is used to test the fullness of hierarchy. If the parameter \code{fullHierarchy} is set to \code{FALSE}, +the function will check that every position at a lower level than 1 should have parents all the way up to level 1. +If it is set to \code{TRUE}, in addition to the previous, it will be checked that any position at a higher level +than k should have children all the way down to level k.} + +\item{labelUniqueness}{It is used to test the that positions at the same hierarchical level have unique labels. If set to \code{TRUE}, +the compliance is checked and positions with duplicate labels are marked as 1 in the "duplicateLabel" column, +while positions with unique labels are marked as 0.} + +\item{labelHierarchy}{It is used to ensure that hierarchical structure of labels is respected. +When set to \code{TRUE}, the function will check that single child have a label identical to the label of its parent and that +has if a position has a label identical to the label of one of its children, then that position should only have a single child.} + +\item{singleChildCode}{It refers to CSV file with specific formatting to define valid codes for each level. If this parameter is not \code{NULL} +then it checks compliance with coding rules for single children and non-single children, as provided in the CSV file.} + +\item{sequencing}{It refers to a CSV file to define the admissible codes for multiple children at each level. If this parameter +is not \code{NULL}, the function checks the sequencing of multiple children codes within each level, as provided in the CSV file.} + +\item{XLSXout}{The valid values are \code{FALSE} or \code{TRUE}. In both cases the output will be returned as an R list. +If output should be saved as a xlsx file, the argument should be set as \code{TRUE}. By default, no xlsx file is produced.} + +\item{lengthsfile}{Refers to a CSV file or a R dataframe (one record per hierarchical level) containing the initial and +last position of the segment of the code specific to that level. The number of lines of this CSV file or the R dataframe will +also implicitly define the number of hierarchical levels of the classification. This is a mandatory argument.} +} +\value{ +\code{classificationQC()} returns a list of dataframes identifying possible the cases violating the formatting requirements. The + databases returned depend on the rules checked. The databases produced are: + \itemize{ + \item{QC_output} The dataset includes all the original records in the classification. Colum "Level" refers to the hierarchical levels + of each position. Each code will be parsed into segment_k (column "Segmentk") and code_k (column "Codek"), corresponding to the code + and segment and hierarchical level k respectively. Additional columns are included to flag the corrected behaviour in each position. + These are + \itemize{ + \item Orphan: if fullHierarchy is set to FALSE, an "orphan" is a position at a hierarchical level (j) greater than 1 that lacks a parent at the hierarchical level (j-1) immediately above it. + Orphan positions are marked with a value of 1 in the "QC output" column, indicating their orphan status. Otherwise, they are assigned a value of 0. + \item Childless: if fullHierarchy is set to TRUE, a "childless" position is one at a hierarchical level (j) less than k that lacks a child + at the hierarchical level (j+1) immediately below it. Childless positions are marked with a value of 1 + indicating their childless status. Otherwise, they are assigned a value of 0. + \item DuplicateLabel: new column in the output that flags positions involved in duplicate label situations (where multiple positions share the same label at the same hierarchical level) + by assigning them a value of 1, while positions with unique labels are assigned a value of 0. + \item SingleChildMismatch: column in the output provides information about label hierarchy consistency in a hierarchical classification system. It indicates:c + Value 1: Mismatched labels between a parent and its single child. + Value 9: Parent-child pairs with matching labels, but the parent has multiple children. + \item SingleCodeError: column serves as a flag indicating whether a position is a single child and whether the corresponding "singleCode" contains the level j segment. + A value of 1 signifies a mismatch, while a value of 0 indicates compliance with the coding rules + \item MultipleCodeError: column serves as a flag indicating whether a position is not a single child and whether the corresponding "multipleCodej" contains the level j segment. + A value of 1 signifies a mismatch, while a value of 0 indicates compliance with the coding rules + \item GapBefore: takes the value 0 or 1 if there is a missing child in the 123456789 series. + \item LastSibling: takes the value 1 when it is the last child in the series 123456789 otherwise the value 0 + } + + \item{QC_noLevels} A subset of the QC_output dataframe including only records for which levels is not defined. In general if this dataframe + is not empty, it suggest that either the classification or the length file is not correctily specified. + \item{QC_orphan} A subset of the QC_output dataframe including only records that have no parents at the higher hierarchical level. + \item{QC_childless} A subset of the QC_output dataframe including only records that have no children at the lower hierarchical level. + \item{QC_duplicatesLabel} A subset of the QC_output dataframe including only records that have duplicated label in the same hierarchical level. + \item{QC_duplicatesCode} A subset of the QC_output dataframe including only records that have the same codes. + \item{QC_singleChildMismatch} A subset of the QC_output dataframe including only records that are single child and have different labels from + their parents or that are multiple children and have same labels to their parents. + \item{QC_singleCodeError} A subset of the QC_output dataframe including only records that are single children and have been wrongly coded (not following + the rule provided in the 'SingleChildMismatch' CSV file). + \item{QC_multipleCodeError} A subset of the QC_output dataframe including only records that are multiple children and have been wrongly coded (not following + the rule provided in the 'SingleChildMismatch' CSV file). + \item{QC_gapBefore} A subset of the QC_output dataframe including only records that are multiple children and have gap before in the sequencing provided in the + 'sequencing' CSV file. + \item{QC_lastSibling} A subset of the QC_output dataframe including only records that are multiple and last children following the sequencing provided in the + 'sequencing' CSV file. + } +} +\description{ +The purpose of this function perform quality control checks on statistical classifications. +It checks the compliance of classifications with structural rules and provides informative error messages +for violations. The function requires input files containing code and label information for each +classification position. It verifies the formatting requirements, uniqueness of codes, fullness of hierarchy, +uniqueness of labels, hierarchical label dependencies, single child code compliance, and sequencing of codes. +The function generates a QC output data frame with the classification data, hierarchical level, code segments, +and test outcomes.Additionally, it allows exporting the output to a CSV file. Overall, the classificationQC +function ensures the integrity and accuracy of statistical classifications. +} +\examples{ +{ + prefix = "nace2" + conceptScheme = "nace2" + endpoint = "CELLAR" + lengthsTable = lengthsFile(endpoint, prefix, conceptScheme, correction = TRUE) + classification = retrieveClassificationTable(prefix, endpoint, conceptScheme, level="ALL")$ClassificationTable + classification = classification[,c(1,2)] + classification = correctionClassification(classification) + Output = classificationQC(classification, lengthsFile, fullHierarchy = TRUE, labelUniqueness = TRUE, labelHierarchy = TRUE, singleChildCode = NULL, sequencing = NULL) + View(Output$QC_output) + View(Output$QC_noLevels) + View(Output$QC_orphan) + View(Output$QC_childless) + View(Output$QC_duplicatesLabel) + View(Output$QC_duplicatesCode) + View(Output$QC_singleChildMismatch) + View(Output$QC_singleCodeError) + View(Output$QC_multipleCodeError) + View(Output$QC_gapBefore) + View(Output$QC_lastSibling) + } +} diff --git a/man/correctionClassification.Rd b/man/correctionClassification.Rd new file mode 100644 index 0000000..fb23816 --- /dev/null +++ b/man/correctionClassification.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correctionClassification.R +\name{correctionClassification} +\alias{correctionClassification} +\title{Retrieve classification table from CELLAR and FAO repositories.} +\usage{ +correctionClassification(classification) +} +\arguments{ +\item{classification}{it returns a dataframe with two columns corrected according to the classification of CELLAR & FAO.} +} +\value{ +\code{correctionClassification()} returns a table with information needed to retrieve the classification table: + \itemize{ + \item Classification Code name (e.g. nace2): the code of each object + \item Classification Label: corresponding name of each object + } +} +\description{ +The aim of this function is to provide a table showing the different codes and labels for each classification +} +\examples{ +{ +prefix = "nace2" +conceptScheme = "nace2" +endpoint = "CELLAR" +classification = retrieveClassificationTable(prefix, endpoint, conceptScheme, level="ALL")$ClassificationTable +correct_classification = correctionClassification(classification) +View(correct_classification) +} +} diff --git a/man/dataStructure.Rd b/man/dataStructure.Rd new file mode 100644 index 0000000..2de8449 --- /dev/null +++ b/man/dataStructure.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataStructure.R +\name{dataStructure} +\alias{dataStructure} +\title{Retrieve information about the structure of each classification tables from CELLAR and FAO repositories.} +\usage{ +dataStructure(prefix, conceptScheme, endpoint, language = "en") +} +\arguments{ +\item{prefix}{Prefixes are typically defined at the beginning of a SPARQL query +and are used throughout the query to make it more concise and easier to read. +Multiple prefixes can be defined in a single query to cover different namespaces used in the data set. +The function 'classificationEndpoint()' can be used to generate the prefixes for the selected classification table.} + +\item{conceptScheme}{Refers to a unique identifier associated to specific classification table. +The conceptScheme can be obtained by utilizing the "classificationEndpoint()" function.} + +\item{endpoint}{SPARQL endpoints provide a standardized way to access data sets, +making it easier to retrieve specific information or perform complex queries on linked data. +The valid values are \code{"CELLAR"} or \code{"FAO"}.} + +\item{language}{Refers to the specific language used for providing label, include and exclude information in the selected classification table. +By default is set to "en". This is an optional argument.} +} +\value{ +\code{structureData()} returns the structure of a classification table from CELLAR and FAO in form a table with the following colums: + \itemize{ + \item Concept_Scheme: taxonomy of the SKOS object to be retrieved + \item Level: the levels of the objects in the collection + \item Depth: identify the hierarchy of each level + \item Count: the number of objects retrieved in each level +} +} +\description{ +Retrieve information, for all the classification available in the repositories (CELLAR and FAO), +about the level names their hierarchy and the numbers of records the function "structureData()" can be used. +} +\examples{ +{ + ## Obtain a list including the structure of each classification available + ## CELLAR + data_CELLAR = list() + endpoint = "CELLAR" + list_data = classificationEndpoint("ALL") + + for (i in 1:nrow(list_data$CELLAR)){ + prefix = list_data$CELLAR[i,1] + conceptScheme = list_data$CELLAR[i,2] + data_CELLAR[[i]] = dataStructure(prefix, conceptScheme, endpoint) + } + names(data_CELLAR) = list_data$CELLAR[,1] + ## FAO + data_FAO = list() + endpoint = "FAO" + for (i in 1:nrow(list_data$FAO)){ + prefix = list_data$FAO[i,1] + conceptScheme = list_data$FAO[i,2] + data_FAO[[i]] = dataStructure(prefix, conceptScheme, endpoint) + } + names(data_FAO) = list_data$FAO[,1] + } +} diff --git a/man/lengthsFile.Rd b/man/lengthsFile.Rd new file mode 100644 index 0000000..b3de617 --- /dev/null +++ b/man/lengthsFile.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lengthsFile.R +\name{lengthsFile} +\alias{lengthsFile} +\title{Retrieve correspondance tables lenghts for each level tables between classification from CELLAR and FAO repositories} +\usage{ +lengthsFile(endpoint, prefix, conceptScheme, correction = TRUE) +} +\arguments{ +\item{endpoint}{SPARQL endpoints provide a standardized way to access data sets, +making it easier to retrieve specific information or perform complex queries on linked data. +The valid values are \code{"CELLAR"} or \code{"FAO"}.} + +\item{prefix}{Prefixes are typically defined at the beginning of a SPARQL query +and are used throughout the query to make it more concise and easier to read. +Multiple prefixes can be defined in a single query to cover different namespaces used in the dataset. +The function 'classEndpoint()' can be used to generate the prefixes for the selected correspondence table.} + +\item{conceptScheme}{Refers to a unique identifier associated to specific classification table. +The conceptScheme can be obtained by utilizing the "classEndpoint()" function.} + +\item{correction}{The valid values are \code{FALSE} or \code{TRUE}. In both cases the lengths table as an R object. +If the output wants to have a correction for hierarchy levels \code{TRUE}. By default is set to "TRUE".} +} +\value{ +\code{lenghtsFile()} returns a table containing the lengths for each hierarchical level of the classification. + \itemize{ + \item charb: contains the length for each code for each hierarchical level + \item chare: contains the concatenated length of char b for each code for each hierarchical level +} +} +\description{ +The aim of this function is to provide a table showing the different levels of hierarchy for each classification and the length of each level. +} +\examples{ +{ +endpoint = "CELLAR" +prefix = "nace2" +conceptScheme = "nace2" + +lengthsTable = lengthsFile(endpoint, prefix, conceptScheme, correction = TRUE) + +#View lengthsTable +View(lengthsTable) + +} +} diff --git a/vignettes/Retrieve_classification_and_correspondence_tables.Rmd b/vignettes/Retrieve_classification_and_correspondence_tables.Rmd index 700f669..8ade41c 100644 --- a/vignettes/Retrieve_classification_and_correspondence_tables.Rmd +++ b/vignettes/Retrieve_classification_and_correspondence_tables.Rmd @@ -63,7 +63,6 @@ dt = retrieveClassificationTable(prefix, endpoint, conceptScheme, level, languag Following, the other functions used to extract the required information for the retrieveClassificationTable() function. - **Application of function prefixList()** This function automatically obtains a list the prefixes from CELLAR and FAO used in the retrieveClassificationTable() and retrieveCorrespondenceTable() functions to retrieve the required tables. The list will be updated automatically each time a new classification or correspondence classification is introduced in CELLAR or FAO repository.