diff --git a/cmd/roy/harvest_wikidata.go b/cmd/roy/harvest_wikidata.go index 9f8da898b..f393a29cd 100644 --- a/cmd/roy/harvest_wikidata.go +++ b/cmd/roy/harvest_wikidata.go @@ -17,7 +17,6 @@ package main import ( "encoding/json" "fmt" - "io/ioutil" "log" "os" "strings" @@ -119,7 +118,7 @@ func harvestWikidata() error { ) path := config.WikidataDefinitionsPath() - err = ioutil.WriteFile( + err = os.WriteFile( path, []byte(fmt.Sprintf("%s", modifiedJSON)), config.WikidataFileMode(), diff --git a/cmd/roy/roy.go b/cmd/roy/roy.go index 00f1829b2..d8edae5d6 100644 --- a/cmd/roy/roy.go +++ b/cmd/roy/roy.go @@ -163,6 +163,7 @@ var ( _, htimeout, _, _ = config.HarvestOptions() timeout = harvest.Duration("timeout", htimeout, "set duration before timing-out harvesting requests e.g. 120s") throttlef = harvest.Duration("throttle", 0, "set a time to wait HTTP requests e.g. 50ms") + harvestWikidataSigLen = harvest.Int("siglen", config.WikidataSigLen(), "set minimum signature length for Wikidata, e.g. 6 chars == 3 bytes") harvestWikidataSig = harvest.Bool("wikidata", false, "harvest a static Wikidata report") harvestWikidataLang = harvest.String("lang", config.WikidataLang(), "two-letter language-code to download Wikidata strings, e.g. 
\"de\"") harvestWikidataEndpoint = harvest.String("wikidataendpoint", config.WikidataEndpoint(), "the endpoint to use to harvest Wikidata definitions from") @@ -497,6 +498,9 @@ func setHarvestOptions() { if *throttlef > 0 { config.SetHarvestThrottle(*throttlef) } + if *harvestWikidataSigLen != config.WikidataSigLen() { + config.SetWikidataSigLen(*harvestWikidataSigLen) + } if *harvestWikidataLang != "" { config.SetWikidataLang(*harvestWikidataLang) } diff --git a/pkg/config/internal/wikidatasparql/sparql.go b/pkg/config/internal/wikidatasparql/sparql.go index c0ce56d8d..a3dc6dc09 100644 --- a/pkg/config/internal/wikidatasparql/sparql.go +++ b/pkg/config/internal/wikidatasparql/sparql.go @@ -18,9 +18,17 @@ package wikidatasparql // the Wikidata identifier in Roy. import ( + "strconv" "strings" ) +// sigLenTemplate gives us a field which we can replace with a +// min signature length value of our own choosing. +const sigLenTemplate = "<>" + +// Default signature length to return from Wikidata. +var wikidataSigLen = 6 + // languateTemplate gives us a field which we can replace with a // language code of our own configuration. const languageTemplate = "<>" @@ -35,36 +43,54 @@ var wikidataLang = "en" // sparql represents the query required to pull all file format records // and signatures from the Wikidata query service. const sparql = ` - # Return all file format records from Wikidata. - # - select distinct ?uri ?uriLabel ?puid ?extension ?mimetype ?encoding ?referenceLabel ?date ?relativity ?offset ?sig - where - { - ?uri wdt:P31/wdt:P279* wd:Q235557. # Return records of type File Format. - optional { ?uri wdt:P2748 ?puid. } # PUID is used to map to PRONOM signatures proper. - optional { ?uri wdt:P1195 ?extension. } - optional { ?uri wdt:P1163 ?mimetype. } - optional { ?uri p:P4152 ?object; # Format identification pattern statement. - optional { ?object pq:P3294 ?encoding. } # We don't always have an encoding. - optional { ?object ps:P4152 ?sig. 
} # We always have a signature. - optional { ?object pq:P2210 ?relativity. } # Relativity to beginning or end of file. - optional { ?object pq:P4153 ?offset. } # Offset relative to the relativity. - optional { ?object prov:wasDerivedFrom ?provenance; - optional { ?provenance pr:P248 ?reference; - pr:P813 ?date. - } - } - } - service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE], <>". } - } - order by ?uri - ` +# Return all file format records from Wikidata. +SELECT DISTINCT ?uri ?uriLabel ?puid ?extension ?mimetype ?encoding ?referenceLabel ?date ?relativity ?offset ?sig WHERE { + { ?uri (wdt:P31/(wdt:P279*)) wd:Q235557. } + UNION + { ?uri (wdt:P31/(wdt:P279*)) wd:Q26085352. } + FILTER(EXISTS { ?uri (wdt:P2748|wdt:P1195|wdt:P1163|ps:P4152) _:b2. }) + FILTER((STRLEN(?sig)) >= <> ) + OPTIONAL { ?uri wdt:P2748 ?puid. } + OPTIONAL { ?uri wdt:P1195 ?extension. } + OPTIONAL { ?uri wdt:P1163 ?mimetype. } + OPTIONAL { + ?uri p:P4152 ?object. + OPTIONAL { ?object pq:P3294 ?encoding. } + OPTIONAL { ?object ps:P4152 ?sig. } + OPTIONAL { ?object pq:P2210 ?relativity. } + OPTIONAL { ?object pq:P4153 ?offset. } + OPTIONAL { + ?object prov:wasDerivedFrom ?provenance. + OPTIONAL { + ?provenance pr:P248 ?reference; + pr:P813 ?date. + } + } + } + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE], <>". } +} +ORDER BY (?uri) +` // WikidataSPARQL returns the SPARQL query needed to pull file-format // signatures from Wikidata replacing various template values as we // go. func WikidataSPARQL() string { - return strings.Replace(sparql, languageTemplate, wikidataLang, numberReplacements) + wdSparql := strings.Replace(sparql, languageTemplate, wikidataLang, numberReplacements) + wdSparql = strings.Replace(wdSparql, sigLenTemplate, strconv.Itoa(wikidataSigLen), numberReplacements) + return wdSparql +} + +// WikidataSigLen returns the minimum signature length we want the Wikidata +// SPARQL query to return. 
+func WikidataSigLen() int { + return wikidataSigLen +} + +// SetWikidataSigLen sets the minimum signature length we want the Wikidata +// SPARQL query to return. +func SetWikidataSigLen(len int) { + wikidataSigLen = len } // WikidataLang will return to the caller the ISO language code diff --git a/pkg/config/wikidata.go b/pkg/config/wikidata.go index 79ff44d0a..a1e7a7e3c 100644 --- a/pkg/config/wikidata.go +++ b/pkg/config/wikidata.go @@ -221,6 +221,18 @@ func WikidataSPARQL() string { return wikidatasparql.WikidataSPARQL() } +// WikidataSigLen returns the minimum signature length we want the Wikidata +// SPARQL query to return. +func WikidataSigLen() int { + return wikidatasparql.WikidataSigLen() +} + +// SetWikidataSigLen sets the minimum signature length we want the Wikidata +// SPARQL query to return. +func SetWikidataSigLen(len int) { + wikidatasparql.SetWikidataSigLen(len) +} + // WikidataLang returns the language we want to return results in from // Wikidata. func WikidataLang() string { @@ -262,7 +274,6 @@ func GetWikidataNoPRONOM() bool { // returned from Wikibase, e.g. for Wikidata this URL needs to be: // // e.g. https://www.wikidata.org/w/index.php -// func SetWikibaseURL(baseURL string) (func() private, error) { _, err := url.ParseRequestURI(baseURL) if err != nil { @@ -384,12 +395,11 @@ func SetCustomWikibaseQuery() error { // // Example: // -// { -// "PronomProp": "http://wikibase.example.com/entity/Q2", -// "BofProp": "http://wikibase.example.com/entity/Q3", -// "EofProp": "http://wikibase.example.com/entity/Q4" -// } -// +// { +// "PronomProp": "http://wikibase.example.com/entity/Q2", +// "BofProp": "http://wikibase.example.com/entity/Q3", +// "EofProp": "http://wikibase.example.com/entity/Q4" +// } func WikibasePropsPath() string { return filepath.Join(WikidataHome(), wikidata.wikibasePropsFile) }