Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an archive selector to Siegfried #141

Merged
merged 1 commit into from
Sep 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 30 additions & 25 deletions cmd/sf/sf.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,32 @@ const maxMulti = 1024

// flags
var (
updateShort = flag.Bool("u", false, "update or install the default signature file")
update = flag.Bool("update", false, "update or install the default signature file")
versionShort = flag.Bool("v", false, "display version information")
version = flag.Bool("version", false, "display version information")
logf = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
nr = flag.Bool("nr", false, "prevent automatic directory recursion")
yaml = flag.Bool("yaml", true, "YAML output format")
csvo = flag.Bool("csv", false, "CSV output format")
jsono = flag.Bool("json", false, "JSON output format")
droido = flag.Bool("droid", false, "DROID CSV output format")
sig = flag.String("sig", config.SignatureBase(), "set the signature file")
home = flag.String("home", config.Home(), "override the default home directory")
serve = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
multi = flag.Int("multi", 1, "set number of parallel file ID processes")
archive = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
hashf = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
throttlef = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
utcf = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
coe = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
replay = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
list = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
updateShort = flag.Bool("u", false, "update or install the default signature file")
update = flag.Bool("update", false, "update or install the default signature file")
versionShort = flag.Bool("v", false, "display version information")
version = flag.Bool("version", false, "display version information")
logf = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
nr = flag.Bool("nr", false, "prevent automatic directory recursion")
yaml = flag.Bool("yaml", true, "YAML output format")
csvo = flag.Bool("csv", false, "CSV output format")
jsono = flag.Bool("json", false, "JSON output format")
droido = flag.Bool("droid", false, "DROID CSV output format")
sig = flag.String("sig", config.SignatureBase(), "set the signature file")
home = flag.String("home", config.Home(), "override the default home directory")
serve = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
multi = flag.Int("multi", 1, "set number of parallel file ID processes")
archive = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
selectArchives = flag.String("zs", config.ListAllArcTypes(), "select the archive types to decompress and identify the contents of")
hashf = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
throttlef = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
utcf = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
coe = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
replay = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
list = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
)

var (
Expand Down Expand Up @@ -372,6 +373,10 @@ func main() {
}
return
}
// handle -zs
if *selectArchives != "" {
config.SetArchiveFilterPermissive(*selectArchives)
}
// handle -fpr
if *fprflag {
log.Printf("FPR server started at %s. Use CTRL-C to quit.\n", config.Fpr())
Expand Down
35 changes: 35 additions & 0 deletions pkg/config/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package config

import (
"fmt"
"strings"
)

// Archive is a file format capable of decompression by sf.
Expand Down Expand Up @@ -106,6 +107,40 @@ func ListAllArcTypes() string {
)
}

// permissiveFilter holds the archive identifiers that are currently allowed
// to be treated as archives. It is written by SetArchiveFilterPermissive and
// read through archiveFilterPermissive; it keeps its zero value (nil) until
// SetArchiveFilterPermissive has been called.
var permissiveFilter []string

// SetArchiveFilterPermissive builds the permissive archive filter from a
// comma-separated list of archive names (as supplied on the Siegfried
// command line), stores it for later lookups and returns it.
//
// Each name is lower-cased and trimmed of surrounding whitespace before it
// is matched; names that are not recognised are ignored. Every recognised
// name contributes the identifiers of that archive type, in the order the
// names were given. The returned slice is never nil, even when no name
// matched. Identifiers missing from the slice are not reported as archives
// by IsArchive, so they are not extracted when the -z flag is used.
func SetArchiveFilterPermissive(value string) []string {
	selected := []string{}
	for _, name := range strings.Split(value, ",") {
		var ids []string
		switch key := strings.TrimSpace(strings.ToLower(name)); key {
		case zipArc:
			ids = ArcZipTypes()
		case tarArc:
			ids = ArcTarTypes()
		case gzipArc:
			ids = ArcGzipTypes()
		case warcArc:
			ids = ArcWarcTypes()
		case arcArc:
			ids = ArcArcTypes()
		default:
			// Unknown archive name: contributes nothing to the filter.
			continue
		}
		selected = append(selected, ids...)
	}
	permissiveFilter = selected
	return selected
}

// archiveFilterPermissive returns the archive identifiers that may be
// treated as archives (and so extracted and have their contents identified).
//
// SetArchiveFilterPermissive always stores a non-nil slice, so a nil
// permissiveFilter is taken to mean that no filter has been configured yet
// (NOTE(review): assumes nothing else assigns permissiveFilter). In that case
// every supported archive type is allowed, so callers that never configure a
// filter keep working archive detection instead of having every identifier
// rejected. The default is built on each call rather than cached, so this
// getter never writes to shared state.
//
// A filter that was configured but matched nothing (an empty, non-nil slice)
// is returned as-is and therefore still allows nothing.
func archiveFilterPermissive() []string {
	if permissiveFilter != nil {
		return permissiveFilter
	}
	var all []string
	all = append(all, ArcZipTypes()...)
	all = append(all, ArcTarTypes()...)
	all = append(all, ArcGzipTypes()...)
	all = append(all, ArcWarcTypes()...)
	all = append(all, ArcArcTypes()...)
	return all
}

func (a Archive) String() string {
switch a {
case Zip:
Expand Down
3 changes: 3 additions & 0 deletions pkg/config/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@ func contains(v string, s []string) bool {

// IsArchive returns an Archive that corresponds to the provided id (or none if no match).
func IsArchive(id string) Archive {
if !contains(id, archiveFilterPermissive()) {
return None
}
switch {
case contains(id, ArcZipTypes()):
return Zip
Expand Down
23 changes: 14 additions & 9 deletions pkg/config/identifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,31 @@ var nonArcUID = "fmt/1000"

// arcTest describes one case in the table-driven tests for IsArchive.
// Field order matters: the test table builds values with positional
// (unkeyed) literals.
type arcTest struct {
	filter string  // Comma-separated archive names handed to SetArchiveFilterPermissive(...) before the check.
	uid    string  // The identifier (e.g. a PUID or FDD) passed to IsArchive.
	result Archive // The Archive value IsArchive is expected to return for uid under filter.
}

// isArcTests provides the table of test cases and expected results to loop through.
var isArcTests = []arcTest{
// Positive tests should return valid Archive values.
arcTest{proZipUID, Zip},
arcTest{mimeTarUID, Tar},
arcTest{mimeGzipUID, Gzip},
arcTest{mimeWarcUID, WARC},
arcTest{locArcUID, ARC},
arcTest{ListAllArcTypes(), proZipUID, Zip},
arcTest{"TAR", mimeTarUID, Tar},
arcTest{"gZip", mimeGzipUID, Gzip},
arcTest{"warc,zip,tar", mimeWarcUID, WARC},
arcTest{"zip,arc", locArcUID, ARC},
// Negative tests should all return None.
arcTest{nonArcUID, None},
arcTest{"zip,arc", mimeWarcUID, None},
arcTest{"zip,arc", mimeGzipUID, None},
arcTest{ListAllArcTypes(), nonArcUID, None},
arcTest{"", nonArcUID, None},
}

// TestIsArchive tests cases whether we return the correct result when
// testing whether something is an Archive.
func TestIsArchive(t *testing.T) {
// TestIsArchivePositive tests cases where the archive filter should
// return a positive match.
func TestIsArchivePositive(t *testing.T) {
for _, test := range isArcTests {
SetArchiveFilterPermissive(test.filter)
arc := IsArchive(test.uid)
if arc != test.result {
t.Errorf(
Expand Down