Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of basic Wikidata identifier #138

Merged
merged 3 commits into from
Sep 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
743 changes: 743 additions & 0 deletions cmd/roy/data/wikidata/wikidata-test-definitions

Large diffs are not rendered by default.

62 changes: 62 additions & 0 deletions cmd/roy/harvest_wikidata.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package main

import (
"fmt"
"io/ioutil"
"log"
"os"

"github.com/richardlehane/siegfried/pkg/config"
"github.com/ross-spencer/spargo/pkg/spargo"
)

// harvestWikidata queries the configured Wikidata endpoint with the
// configured SPARQL query and writes the result to the configured
// definitions path, creating the Wikidata home directory first.
//
// It returns an error if the directory cannot be created or the
// definitions file cannot be written.
func harvestWikidata() error {
	log.Printf(
		"Roy (Wikidata): Harvesting Wikidata definitions: lang '%s'",
		config.WikidataLang(),
	)
	// Make sure the target directory exists before any work is done.
	if mkErr := os.MkdirAll(config.WikidataHome(), os.ModePerm); mkErr != nil {
		return fmt.Errorf(
			"Roy (Wikidata): Error harvesting Wikidata definitions: '%s'",
			mkErr,
		)
	}
	log.Printf(
		"Roy (Wikidata): Harvesting definitions from: '%s'",
		config.WikidataEndpoint(),
	)
	// Configure the SPARQL client and run the query.
	var client spargo.SPARQLClient
	client.ClientInit(config.WikidataEndpoint(), config.WikidataSPARQL())
	client.SetUserAgent(config.UserAgent())
	results := client.SPARQLGo()
	// Persist the Human field of the result to the definitions file.
	definitions := config.WikidataDefinitionsPath()
	writeErr := ioutil.WriteFile(
		definitions,
		[]byte(results.Human),
		config.WikidataFileMode(),
	)
	if writeErr != nil {
		return fmt.Errorf(
			"Roy (Wikidata): Error harvesting Wikidata: '%s'",
			writeErr,
		)
	}
	log.Printf(
		"Roy (Wikidata): Harvesting Wikidata definitions '%s' complete",
		definitions,
	)
	return nil
}
183 changes: 115 additions & 68 deletions cmd/roy/roy.go

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions cmd/roy/roy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/richardlehane/siegfried/pkg/mimeinfo"
"github.com/richardlehane/siegfried/pkg/pronom"
"github.com/richardlehane/siegfried/pkg/sets"
wd "github.com/richardlehane/siegfried/pkg/wikidata"
)

var testhome = flag.String("home", "data", "override the default home directory")
Expand Down Expand Up @@ -66,6 +67,20 @@ func TestFreedesktop(t *testing.T) {
}
}

// TestWikidata builds a Wikidata identifier from the
// "wikidata-test-definitions" file under the test home directory and
// checks that it can be added to a new Siegfried instance.
func TestWikidata(t *testing.T) {
	sf := siegfried.New()
	config.SetHome(*testhome)
	config.SetWikidataDefinitions("wikidata-test-definitions")
	identifier, err := wd.New(config.SetWikidataNamespace())
	if err != nil {
		t.Fatal(err)
	}
	if err := sf.Add(identifier); err != nil {
		t.Fatal(err)
	}
}

func TestPronomTikaLoc(t *testing.T) {
s := siegfried.New()
config.SetHome(*testhome)
Expand Down
8 changes: 7 additions & 1 deletion cmd/sf/sf.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ var (
home = flag.String("home", config.Home(), "override the default home directory")
serve = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
multi = flag.Int("multi", 1, "set number of parallel file ID processes")
archive = flag.Bool("z", false, "scan archive formats (zip, tar, gzip, warc, arc)")
archive = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
hashf = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
throttlef = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
utcf = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
Expand All @@ -64,6 +64,7 @@ var (
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
)

var (
Expand Down Expand Up @@ -377,6 +378,11 @@ func main() {
serveFpr(config.Fpr(), s)
return
}
// present source in the basis field within the Wikidata identifier
// instead of its own field.
if *sourceinline {
config.SetWikidataSourceFieldOff()
}
// check -multi
if *multi > maxMulti || *multi < 1 || (*archive && *multi > 1) {
log.Println("[WARN] -multi must be > 0 and =< 1024. If -z, -multi must be 1. Resetting -multi to 1")
Expand Down
4 changes: 4 additions & 0 deletions cmd/sf/sf_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ func TestSuite(t *testing.T) {
return nil
}
suite := filepath.Join(*testdata, "skeleton-suite")
_, err = os.Stat(suite)
if err != nil {
t.Fatal(err)
}
err = filepath.Walk(suite, wf)
if err != nil {
t.Fatal(err)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions cmd/sf/testdata/wikidata/arc/x-fmt-266-signature-id-201.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
1 change: 1 addition & 0 deletions cmd/sf/testdata/wikidata/wd/Q10287816.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�
Binary file added cmd/sf/testdata/wikidata/wd/Q28205479.info
Binary file not shown.
2 changes: 1 addition & 1 deletion cmd/sf/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ func getHttp(url string) ([]byte, error) {
return nil, err
}
_, timeout, transport := config.UpdateOptions()
req.Header.Add("User-Agent", "siegfried/siegbot (+https://github.com/richardlehane/siegfried)")
req.Header.Add("User-Agent", config.UserAgent())
req.Header.Add("Cache-Control", "no-cache")
timer := time.AfterFunc(timeout, func() {
transport.CancelRequest(req)
Expand Down
235 changes: 235 additions & 0 deletions cmd/sf/wikidata_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
package main

import (
"flag"
"fmt"

"os"

"path/filepath"
"strings"
"testing"

"github.com/richardlehane/siegfried"
"github.com/richardlehane/siegfried/pkg/config"
"github.com/richardlehane/siegfried/pkg/wikidata"
)

// Path components associated with the Roy command folder.
const (
	wikidataTestDefinitions    = "wikidata-test-definitions"
	wikidataDefinitionsBaseDir = "definitionsBaseDir"
)

// royTestData is the relative path from this package to Roy's data
// directory.
var royTestData = filepath.Join("..", "roy", "data")

// Path components within the Siegfried command folder.
const (
	wikidataNamespace           = "wikidata"
	siegfriedTestData           = "testdata"
	wikidataTestData            = "wikidata"
	wikidataPRONOMSkeletons     = "pro"
	wikidataCustomSkeletons     = "wd"
	wikidataArcSkeletons        = "arc"
	wikidataExtensionMismatches = "ext_mismatch"
	wikidataContainerMatches    = "container"
)

// wikidataDefinitions is a flag holding the base directory that
// setupWikidata passes to config.SetHome. It defaults to royTestData.
var wikidataDefinitions = flag.String(
	wikidataDefinitionsBaseDir,
	royTestData,
	"Creates an flag var that is compatible with the config functions...",
)

// wdSiegfried is the Siegfried instance shared by the Wikidata tests in
// this file. setupWikidata assigns it and the tests set it back to nil
// when they are done with it.
var wdSiegfried *siegfried.Siegfried

// setupWikidata creates a fresh Siegfried instance in wdSiegfried and
// adds a Wikidata identifier built from the test definitions to it.
//
// pronomx selects whether the identifier is built with PRONOM
// (config.SetWikidataPRONOM) or without it (config.SetWikidataNoPRONOM).
// Any additional opts are passed through to wikidata.New.
//
// An error is returned if wdSiegfried is still set from an earlier run
// and no options were supplied, if the identifier cannot be created, or
// if it cannot be added to the Siegfried instance.
func setupWikidata(pronomx bool, opts ...config.Option) error {
	// Guard against a previous test having left its identifier behind.
	if opts == nil && wdSiegfried != nil {
		return fmt.Errorf(
			"Wikidata setup options are not properly configured",
		)
	}
	wdSiegfried = siegfried.New()
	config.SetHome(*wikidataDefinitions)
	// NOTE(review): the values returned by the next two calls are
	// discarded; presumably they are invoked for their effect on the
	// config package's state - confirm against pkg/config.
	config.SetWikidataNamespace()
	config.SetWikidataDefinitions(wikidataTestDefinitions)
	if pronomx {
		opts = append(opts, config.SetWikidataPRONOM())
	} else {
		opts = append(opts, config.SetWikidataNoPRONOM())
	}
	identifier, err := wikidata.New(opts...)
	if err != nil {
		return err
	}
	// Report a failure to register the identifier instead of dropping it.
	return wdSiegfried.Add(identifier)
}

// identificationTests is one row of the table-driven identification
// tests. Field order matters: the sample tables may be written as
// positional literals.
type identificationTests struct {
	// fname is the sample's path below the Wikidata test data folder.
	fname string
	// qid is the Wikidata identifier the sample is expected to match.
	qid string
	// extMatch, byteMatch and containerMatch state which match methods
	// the result's basis is expected to mention. When extMatch is false
	// an extension mismatch warning is expected instead.
	extMatch, byteMatch, containerMatch bool
	// error marks samples for which an identification error is tolerated.
	error bool
}

// skeletonSamples lists the PRONOM and custom Wikidata skeleton files
// checked by TestWikidataBasic, each with the QID it should resolve to.
// All are expected to match on both extension and byte signature.
var skeletonSamples = []identificationTests{
	{
		fname:     filepath.Join(wikidataPRONOMSkeletons, "fmt-11-signature-id-58.png"),
		qid:       "Q178051",
		extMatch:  true,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataPRONOMSkeletons, "fmt-279-signature-id-295.flac"),
		qid:       "Q27881556",
		extMatch:  true,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataCustomSkeletons, "Q10287816.gz"),
		qid:       "Q10287816",
		extMatch:  true,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataCustomSkeletons, "Q28205479.info"),
		qid:       "Q28205479",
		extMatch:  true,
		byteMatch: true,
	},
}

// Substrings looked for in an identification's basis and warning fields
// to work out how a match was made. "container name" is included so
// that PRONOM can be validated alongside Wikidata.
const (
	extensionMatch    = "extension match"
	byteMatch         = "byte match"
	extensionMismatch = "extension mismatch"
	containerMatch    = "container name"
)

// TestWikidataBasic will perform some rudimentary tests using some
// simple Skeleton files and the Wikidata identifier without PRONOM.
func TestWikidataBasic(t *testing.T) {
pronom := false
err := setupWikidata(pronom)
if err != nil {
t.Error(err)
}
for _, test := range skeletonSamples {
path := filepath.Join(siegfriedTestData, wikidataTestData, test.fname)
siegfriedRunner(path, test, t)
}
wdSiegfried = nil
}

// archiveSamples lists the archive-format skeleton files checked by
// TestArchives, each with the QID it should resolve to. All are
// expected to match on both extension and byte signature.
var archiveSamples = []identificationTests{
	{
		fname:     filepath.Join(wikidataArcSkeletons, "fmt-289-signature-id-305.warc"),
		qid:       "Q7978505",
		extMatch:  true,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataArcSkeletons, "fmt-410-signature-id-580.arc"),
		qid:       "Q27824065",
		extMatch:  true,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataArcSkeletons, "x-fmt-219-signature-id-525.arc"),
		qid:       "Q27824060",
		extMatch:  true,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataArcSkeletons, "x-fmt-265-signature-id-265.tar"),
		qid:       "Q283579",
		extMatch:  true,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataArcSkeletons, "x-fmt-266-signature-id-201.gz"),
		qid:       "Q10287816",
		extMatch:  true,
		byteMatch: true,
	},
}

func TestArchives(t *testing.T) {
pronom := true
err := setupWikidata(pronom)
if err != nil {
t.Error(err)
}
for _, test := range archiveSamples {
path := filepath.Join(siegfriedTestData, wikidataTestData, test.fname)
siegfriedRunner(path, test, t)
}
wdSiegfried = nil
}

// extensionMismatchSamples lists skeleton files whose extension does
// not belong to the format their bytes identify. A byte match is
// expected, and because extMatch is false an extension mismatch warning
// is expected too.
var extensionMismatchSamples = []identificationTests{
	{
		fname:     filepath.Join(wikidataExtensionMismatches, "fmt-11-signature-id-58.jpg"),
		qid:       "Q178051",
		extMatch:  false,
		byteMatch: true,
	},
	{
		fname:     filepath.Join(wikidataExtensionMismatches, "fmt-279-signature-id-295.wav"),
		qid:       "Q27881556",
		extMatch:  false,
		byteMatch: true,
	},
}

func TestExtensionMismatches(t *testing.T) {
pronom := false
err := setupWikidata(pronom)
if err != nil {
t.Error(err)
}
for _, test := range extensionMismatchSamples {
path := filepath.Join(siegfriedTestData, wikidataTestData, test.fname)
siegfriedRunner(path, test, t)
}
wdSiegfried = nil
}

// containerSamples lists the container-format skeleton files checked by
// TestContainers, each with the QID it should resolve to. All are
// expected to match on extension, byte signature and container name.
var containerSamples = []identificationTests{
	{
		fname:          filepath.Join(wikidataContainerMatches, "fmt-292-container-signature-id-8010.odp"),
		qid:            "Q27203973",
		extMatch:       true,
		byteMatch:      true,
		containerMatch: true,
	},
	{
		fname:          filepath.Join(wikidataContainerMatches, "fmt-482-container-signature-id-14000.ibooks"),
		qid:            "Q49988096",
		extMatch:       true,
		byteMatch:      true,
		containerMatch: true,
	},
	{
		fname:          filepath.Join(wikidataContainerMatches, "fmt-680-container-signature-id-22120.ppp"),
		qid:            "Q47520869",
		extMatch:       true,
		byteMatch:      true,
		containerMatch: true,
	},
	{
		fname:          filepath.Join(wikidataContainerMatches, "fmt-998-container-signature-id-32000.ora"),
		qid:            "Q747906",
		extMatch:       true,
		byteMatch:      true,
		containerMatch: true,
	},
}

func TestContainers(t *testing.T) {
pronom := true
err := setupWikidata(pronom)
if err != nil {
t.Error(err)
}
for _, test := range containerSamples {
path := filepath.Join(siegfriedTestData, wikidataTestData, test.fname)
siegfriedRunner(path, test, t)
}
wdSiegfried = nil
}

// siegfriedRunner identifies the file at path with wdSiegfried and
// compares the first result against the expectations in test.
//
// It checks the namespace, the QID, and that the basis mentions each
// match method the test expects. When test.extMatch is false it checks
// that the warning reports an extension mismatch. The test is aborted
// if the file cannot be opened, if identification fails unexpectedly,
// or if the result is too short to inspect.
func siegfriedRunner(path string, test identificationTests, t *testing.T) {
	file, err := os.Open(path)
	if err != nil {
		// There is nothing to identify without the sample file.
		t.Fatalf("failed to open %v, got: %v", path, err)
	}
	defer file.Close()
	res, err := wdSiegfried.Identify(file, path, "")
	if err != nil && !test.error {
		t.Fatal(err)
	}
	if len(res) == 0 {
		if err != nil {
			// An anticipated error left no identification to inspect.
			return
		}
		t.Fatalf("no identification returned for %v", path)
	}
	if len(res) > 1 {
		t.Errorf("Match length greater than one: '%d'", len(res))
	}
	// The fields read below sit at indices 0, 1, 5 and 7, so at least
	// eight values are required.
	values := res[0].Values()
	if len(values) < 8 {
		t.Fatalf(
			"expected at least 8 result fields for %v, got %d: %v",
			path,
			len(values),
			values,
		)
	}
	namespace := values[0]
	if namespace != wikidataNamespace {
		t.Errorf("Namespace error, expected: '%s' received: '%s'", wikidataNamespace, namespace)
	}
	id := values[1]
	basis := values[5]
	warning := values[7]
	if id != test.qid {
		t.Errorf(
			"QID match different than anticipated: '%s' expected '%s'",
			id,
			test.qid,
		)
	}
	if test.extMatch && !strings.Contains(basis, extensionMatch) {
		t.Errorf("Extension match not returned by identifier: %s", basis)
	}
	if test.byteMatch && !strings.Contains(basis, byteMatch) {
		t.Errorf("Byte match not returned by identifier: %s", basis)
	}
	if test.containerMatch && !strings.Contains(basis, containerMatch) {
		t.Errorf("Container match not returned by identifier: %s", basis)
	}
	if !test.extMatch && !strings.Contains(warning, extensionMismatch) {
		t.Errorf("Expected an extension mismatch but it wasn't returned: %s", warning)
	}
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/richardlehane/mscfb v1.0.3
github.com/richardlehane/webarchive v1.0.0
github.com/richardlehane/xmldetect v1.0.2
github.com/ross-spencer/spargo v0.0.0-20200323024642-38971d4365a7
golang.org/x/image v0.0.0-20200119044424-58c23975cae1
golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4
golang.org/x/text v0.3.2 // indirect
Expand Down
Loading