diff --git a/cmd/sf/sf.go b/cmd/sf/sf.go
index 41256fe85..d895a957f 100644
--- a/cmd/sf/sf.go
+++ b/cmd/sf/sf.go
@@ -40,30 +40,31 @@ const maxMulti = 1024
 
 // flags
 var (
-	updateShort  = flag.Bool("u", false, "update or install the default signature file")
-	update       = flag.Bool("update", false, "update or install the default signature file")
-	versionShort = flag.Bool("v", false, "display version information")
-	version      = flag.Bool("version", false, "display version information")
-	logf         = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
-	nr           = flag.Bool("nr", false, "prevent automatic directory recursion")
-	yaml         = flag.Bool("yaml", true, "YAML output format")
-	csvo         = flag.Bool("csv", false, "CSV output format")
-	jsono        = flag.Bool("json", false, "JSON output format")
-	droido       = flag.Bool("droid", false, "DROID CSV output format")
-	sig          = flag.String("sig", config.SignatureBase(), "set the signature file")
-	home         = flag.String("home", config.Home(), "override the default home directory")
-	serve        = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
-	multi        = flag.Int("multi", 1, "set number of parallel file ID processes")
-	archive      = flag.Bool("z", false, "scan archive formats (zip, tar, gzip, warc, arc)")
-	hashf        = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
-	throttlef    = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
-	utcf         = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
-	coe          = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
-	replay       = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
-	list         = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
-	name         = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
-	conff        = flag.String("conf", "", "set the configuration file")
-	setconff     = flag.Bool("setconf", false, "record flags used with this command in configuration file")
+	updateShort    = flag.Bool("u", false, "update or install the default signature file")
+	update         = flag.Bool("update", false, "update or install the default signature file")
+	versionShort   = flag.Bool("v", false, "display version information")
+	version        = flag.Bool("version", false, "display version information")
+	logf           = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
+	nr             = flag.Bool("nr", false, "prevent automatic directory recursion")
+	yaml           = flag.Bool("yaml", true, "YAML output format")
+	csvo           = flag.Bool("csv", false, "CSV output format")
+	jsono          = flag.Bool("json", false, "JSON output format")
+	droido         = flag.Bool("droid", false, "DROID CSV output format")
+	sig            = flag.String("sig", config.SignatureBase(), "set the signature file")
+	home           = flag.String("home", config.Home(), "override the default home directory")
+	serve          = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
+	multi          = flag.Int("multi", 1, "set number of parallel file ID processes")
+	archive        = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
+	selectArchives = flag.String("zs", config.ListAllArcTypes(), "select the archive types to decompress and identify the contents of")
+	hashf          = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
+	throttlef      = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
+	utcf           = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
+	coe            = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
+	replay         = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
+	list           = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
+	name           = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
+	conff          = flag.String("conf", "", "set the configuration file")
+	setconff       = flag.Bool("setconf", false, "record flags used with this command in configuration file")
 )
 
 var (
@@ -371,6 +372,10 @@ func main() {
 		}
 		return
 	}
+	// handle -zs: pass the selected archive types to the config package's filter
+	if *selectArchives != "" {
+		config.SetArchiveFilterPermissive(*selectArchives)
+	}
 	// handle -fpr
 	if *fprflag {
 		log.Printf("FPR server started at %s. Use CTRL-C to quit.\n", config.Fpr())
diff --git a/pkg/config/decompress.go b/pkg/config/decompress.go
index 8ec0bd152..12c91d7db 100644
--- a/pkg/config/decompress.go
+++ b/pkg/config/decompress.go
@@ -14,18 +14,132 @@
 
 package config
 
+import (
+	"fmt"
+	"strings"
+)
+
 // Archive is a file format capable of decompression by sf.
 type Archive int
 
 const (
-	None Archive = iota // None means the format cannot be decompressed by sf.
+	// None means the format cannot be decompressed by sf.
+	None Archive = iota
+	// Zip describes a Zip type archive.
 	Zip
+	// Gzip describes a Gzip type archive.
 	Gzip
+	// Tar describes a Tar type archive.
 	Tar
+	// ARC describes an ARC web archive.
 	ARC
+	// WARC describes a WARC web archive.
 	WARC
 )
 
+const (
+	zipArc  = "zip"
+	tarArc  = "tar"
+	gzipArc = "gzip"
+	warcArc = "warc"
+	arcArc  = "arc"
+)
+
+// ArcZipTypes returns a string array with all Zip identifiers Siegfried
+// can match and decompress.
+func ArcZipTypes() []string {
+	return []string{
+		pronom.zip,
+		mimeinfo.zip,
+		loc.zip,
+	}
+}
+
+// ArcGzipTypes returns a string array with all Gzip identifiers
+// Siegfried can match and decompress.
+func ArcGzipTypes() []string {
+	return []string{
+		pronom.gzip,
+		mimeinfo.gzip,
+	}
+}
+
+// ArcTarTypes returns a string array with all Tar identifiers Siegfried
+// can match and decompress.
+func ArcTarTypes() []string {
+	return []string{
+		pronom.tar,
+		mimeinfo.tar,
+	}
+}
+
+// ArcArcTypes returns a string array with all Arc identifiers Siegfried
+// can match and decompress.
+func ArcArcTypes() []string {
+	return []string{
+		pronom.arc,
+		pronom.arc1_1,
+		mimeinfo.arc,
+		loc.arc,
+	}
+}
+
+// ArcWarcTypes returns a string array with all Warc identifiers
+// Siegfried can match and decompress.
+func ArcWarcTypes() []string {
+	return []string{
+		pronom.warc,
+		mimeinfo.warc,
+		loc.warc,
+	}
+}
+
+// ListAllArcTypes returns a list of archive file-format extensions that
+// can be used to filter the files Siegfried will decompress to identify
+// the contents of.
+func ListAllArcTypes() string {
+	return fmt.Sprintf("%s, %s, %s, %s, %s",
+		zipArc,
+		tarArc,
+		gzipArc,
+		warcArc,
+		arcArc,
+	)
+}
+
+var permissiveFilter []string
+
+// SetArchiveFilterPermissive stores, and returns, the identifiers of the
+// archive types named in the comma-separated value (case-insensitive,
+// unknown names skipped). IsArchive reports None for anything else.
+func SetArchiveFilterPermissive(value string) []string {
+	// Deliberately non-nil: IsArchive treats a nil filter as "unset".
+	arr := []string{}
+	for _, arc := range strings.Split(value, ",") {
+		switch strings.TrimSpace(strings.ToLower(arc)) {
+		case zipArc:
+			arr = append(arr, ArcZipTypes()...)
+		case tarArc:
+			arr = append(arr, ArcTarTypes()...)
+		case gzipArc:
+			arr = append(arr, ArcGzipTypes()...)
+		case warcArc:
+			arr = append(arr, ArcWarcTypes()...)
+		case arcArc:
+			arr = append(arr, ArcArcTypes()...)
+		}
+	}
+	permissiveFilter = arr
+	return arr
+}
+
+// archiveFilterPermissive returns the filter most recently built by
+// SetArchiveFilterPermissive, or nil if that function has not been
+// called yet.
+func archiveFilterPermissive() []string {
+	return permissiveFilter
+}
+
 func (a Archive) String() string {
 	switch a {
 	case Zip:
diff --git a/pkg/config/identifier.go b/pkg/config/identifier.go
index c66dfa8d4..09a2b7372 100644
--- a/pkg/config/identifier.go
+++ b/pkg/config/identifier.go
@@ -264,18 +264,34 @@ func Extend() []string {
 	return extensionPaths(identifier.extend)
 }
 
+// Return true if value 'v' is contained in slice 's'.
+func contains(v string, s []string) bool {
+	for _, n := range s {
+		if v == n {
+			return true
+		}
+	}
+	return false
+}
+
 // IsArchive returns an Archive that corresponds to the provided id (or none if no match).
 func IsArchive(id string) Archive {
-	switch id {
-	case pronom.zip, mimeinfo.zip, loc.zip:
+	// A nil filter means SetArchiveFilterPermissive was never called; in
+	// that case nothing is filtered out, matching the behaviour before the
+	// filter existed. A non-nil (even empty) filter is applied strictly.
+	if filter := archiveFilterPermissive(); filter != nil && !contains(id, filter) {
+		return None
+	}
+	switch {
+	case contains(id, ArcZipTypes()):
 		return Zip
-	case pronom.gzip, mimeinfo.gzip:
+	case contains(id, ArcGzipTypes()):
 		return Gzip
-	case pronom.tar, mimeinfo.tar:
+	case contains(id, ArcTarTypes()):
 		return Tar
-	case pronom.arc, pronom.arc1_1, mimeinfo.arc, loc.arc:
+	case contains(id, ArcArcTypes()):
 		return ARC
-	case pronom.warc, mimeinfo.warc, loc.warc:
+	case contains(id, ArcWarcTypes()):
 		return WARC
 	}
 	return None
diff --git a/pkg/config/identifier_test.go b/pkg/config/identifier_test.go
new file mode 100644
index 000000000..d681603f0
--- /dev/null
+++ b/pkg/config/identifier_test.go
@@ -0,0 +1,70 @@
+// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+// Tests for IsArchive and the archive-type filter configured through
+// SetArchiveFilterPermissive.
+
+package config
+
+import (
+	"testing"
+)
+
+// Valid archive UIDs.
+var proZipUID = "x-fmt/263"
+var locArcUID = "fdd000235"
+var mimeTarUID = "application/x-tar"
+var mimeWarcUID = "application/x-warc"
+var mimeGzipUID = "application/gzip"
+
+// Non-archive UID.
+var nonArcUID = "fmt/1000"
+
+// arcTest defines the structure needed for our table driven testing.
+type arcTest struct {
+	filter string  // The set of zip-type files to provide SetArchiveFilterPermissive(...)
+	uid    string  // A UID (PUID, FDD) that identifies a zip-type file.
+	result Archive // The anticipated result from our test.
+}
+
+// isArcTests provide us a slice of tests and results to loop through.
+var isArcTests = []arcTest{
+	// Positive tests should return valid Archive values.
+	{ListAllArcTypes(), proZipUID, Zip},
+	{"TAR", mimeTarUID, Tar},
+	{"gZip", mimeGzipUID, Gzip},
+	{"warc,zip,tar", mimeWarcUID, WARC},
+	{"zip,arc", locArcUID, ARC},
+
+	// Negative tests should all return None.
+	{"zip,arc", mimeWarcUID, None},
+	{"zip,arc", mimeGzipUID, None},
+	{ListAllArcTypes(), nonArcUID, None},
+	{"", nonArcUID, None},
+}
+
+// TestIsArchivePositive runs every isArcTests case, positive and
+// negative, through SetArchiveFilterPermissive and IsArchive.
+func TestIsArchivePositive(t *testing.T) {
+	// SetArchiveFilterPermissive mutates package state; restore it on
+	// exit so that other tests in this package see the original filter.
+	defer func(prev []string) { permissiveFilter = prev }(permissiveFilter)
+	for _, test := range isArcTests {
+		SetArchiveFilterPermissive(test.filter)
+		arc := IsArchive(test.uid)
+		if arc != test.result {
+			t.Errorf("Unexpected test result '%s', expected '%s'", arc, test.result)
+		}
+	}
+}
diff --git a/pkg/decompress/decompress.go b/pkg/decompress/decompress.go
index d0d32a5ec..4c359dd03 100644
--- a/pkg/decompress/decompress.go
+++ b/pkg/decompress/decompress.go
@@ -44,7 +44,7 @@ func SetDroid() {
 func IsArc(ids []core.Identification) config.Archive {
 	var arc config.Archive
 	for _, id := range ids {
-		if id.Archive() > 0 {
+		if id.Archive() > config.None {
 			return id.Archive()
 		}
 	}