Skip to content

Commit

Permalink
Add an archive selector to Siegfried
Browse files Browse the repository at this point in the history
Using the -zs flag will allow users to specify which archive types
Siegfried cracks open to identify the contents of.
  • Loading branch information
ross-spencer committed Sep 21, 2020
1 parent 205ba0b commit 7fecdc8
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 36 deletions.
55 changes: 30 additions & 25 deletions cmd/sf/sf.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,32 @@ const maxMulti = 1024

// flags holds the command-line options for sf. Each variable is a
// pointer returned by the flag package, registered when the package's
// variables are initialised.
//
// NOTE(review): every declaration below appears twice because this
// listing seems to interleave the removed and the added side of the
// diff; only the second set (the one containing selectArchives) looks
// like the committed state - confirm against the real file.
var (
updateShort = flag.Bool("u", false, "update or install the default signature file")
update = flag.Bool("update", false, "update or install the default signature file")
versionShort = flag.Bool("v", false, "display version information")
version = flag.Bool("version", false, "display version information")
logf = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
nr = flag.Bool("nr", false, "prevent automatic directory recursion")
yaml = flag.Bool("yaml", true, "YAML output format")
csvo = flag.Bool("csv", false, "CSV output format")
jsono = flag.Bool("json", false, "JSON output format")
droido = flag.Bool("droid", false, "DROID CSV output format")
sig = flag.String("sig", config.SignatureBase(), "set the signature file")
home = flag.String("home", config.Home(), "override the default home directory")
serve = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
multi = flag.Int("multi", 1, "set number of parallel file ID processes")
archive = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
hashf = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
throttlef = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
utcf = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
coe = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
replay = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
list = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
updateShort = flag.Bool("u", false, "update or install the default signature file")
update = flag.Bool("update", false, "update or install the default signature file")
versionShort = flag.Bool("v", false, "display version information")
version = flag.Bool("version", false, "display version information")
logf = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
nr = flag.Bool("nr", false, "prevent automatic directory recursion")
yaml = flag.Bool("yaml", true, "YAML output format")
csvo = flag.Bool("csv", false, "CSV output format")
jsono = flag.Bool("json", false, "JSON output format")
droido = flag.Bool("droid", false, "DROID CSV output format")
sig = flag.String("sig", config.SignatureBase(), "set the signature file")
home = flag.String("home", config.Home(), "override the default home directory")
serve = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
multi = flag.Int("multi", 1, "set number of parallel file ID processes")
archive = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
// -zs takes a comma-separated list of archive names; main passes a
// non-empty value to config.SetArchiveFilterPermissive. The default is
// the full list, so -z on its own is not narrowed.
selectArchives = flag.String("zs", config.ListAllArcTypes(), "select the archive types to decompress and identify the contents of")
hashf = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
throttlef = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
utcf = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
coe = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
replay = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
list = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
)

var (
Expand Down Expand Up @@ -372,6 +373,10 @@ func main() {
}
return
}
// handle -zs
if *selectArchives != "" {
config.SetArchiveFilterPermissive(*selectArchives)
}
// handle -fpr
if *fprflag {
log.Printf("FPR server started at %s. Use CTRL-C to quit.\n", config.Fpr())
Expand Down
38 changes: 36 additions & 2 deletions pkg/config/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package config

import (
"fmt"
"strings"
)

// Archive is a file format capable of decompression by sf.
Expand Down Expand Up @@ -106,8 +107,41 @@ func ListAllArcTypes() string {
)
}

func (a Archive) String() string {
switch a {
// permissiveFilter holds the identifiers of the archive formats that
// IsArchive is allowed to report. It stays nil until
// SetArchiveFilterPermissive has been called: nil means "no filter
// configured", whereas an empty, non-nil slice means a filter was
// configured but selected no archive type.
var permissiveFilter []string

// SetArchiveFilterPermissive enables a filter to be created on the
// types of archive that we want to extract from. Anything not in this
// list is not extracted.
//
// value is a comma-separated list of archive names. Each name is
// lower-cased and trimmed of surrounding whitespace before it is
// compared with the package's archive name constants; names that match
// none of them are ignored. The identifiers belonging to every matched
// archive type are collected, stored as the package filter and also
// returned to the caller.
func SetArchiveFilterPermissive(value string) []string {
	// Start from an empty, non-nil slice so that a filter which names no
	// known archive type stays distinguishable from an unset (nil) one.
	arr := []string{}
	for _, arc := range strings.Split(value, ",") {
		switch strings.TrimSpace(strings.ToLower(arc)) {
		case zipArc:
			arr = append(arr, ArcZipTypes()...)
		case tarArc:
			arr = append(arr, ArcTarTypes()...)
		case gzipArc:
			arr = append(arr, ArcGzipTypes()...)
		case warcArc:
			arr = append(arr, ArcWarcTypes()...)
		case arcArc:
			arr = append(arr, ArcArcTypes()...)
		}
	}
	permissiveFilter = arr
	return arr
}

// archiveFilterPermissive provides a getter for the configured
// zip-types we want to extract and identify the contents of with
// Siegfried.
//
// When no filter has been configured it returns the identifiers of
// every supported archive type, so callers that never invoke
// SetArchiveFilterPermissive are not left with a filter that rejects
// everything.
func archiveFilterPermissive() []string {
	if permissiveFilter != nil {
		return permissiveFilter
	}
	// Build the unrestricted default on demand rather than caching it,
	// so this getter never writes package state.
	all := append([]string{}, ArcZipTypes()...)
	all = append(all, ArcTarTypes()...)
	all = append(all, ArcGzipTypes()...)
	all = append(all, ArcWarcTypes()...)
	all = append(all, ArcArcTypes()...)
	return all
}

func (archive Archive) String() string {
switch archive {
case Zip:
return "zip"
case Gzip:
Expand Down
3 changes: 3 additions & 0 deletions pkg/config/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@ func contains(v string, s []string) bool {

// IsArchive returns an Archive that corresponds to the provided id (or none if no match).
func IsArchive(id string) Archive {
if !contains(id, archiveFilterPermissive()) {
return None
}
switch {
case contains(id, ArcZipTypes()):
return Zip
Expand Down
23 changes: 14 additions & 9 deletions pkg/config/identifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,31 @@ var nonArcUID = "fmt/1000"

// arcTest describes one row of the table-driven IsArchive tests: the
// filter to configure, the identifier to look up and the expected
// outcome.
type arcTest struct {
filter string // Comma-separated archive names handed to SetArchiveFilterPermissive(...) before the lookup.
uid string // The identifier passed to IsArchive, e.g. a UID (PUID, FDD) that identifies a zip-type file.
result Archive // The Archive value IsArchive is expected to return for uid under filter.
}

// isArcTests is the table of filter/identifier/expected-result cases
// that the IsArchive test loops through.
//
// NOTE(review): the two-field rows below appear to be the removed side
// of the diff (the table before the filter field existed); they do not
// match the three-field arcTest struct - confirm against the real file.
var isArcTests = []arcTest{
// Positive tests should return valid Archive values.
arcTest{proZipUID, Zip},
arcTest{mimeTarUID, Tar},
arcTest{mimeGzipUID, Gzip},
arcTest{mimeWarcUID, WARC},
arcTest{locArcUID, ARC},
// The filter names use mixed case on purpose: the setter lower-cases
// each name before matching it.
arcTest{ListAllArcTypes(), proZipUID, Zip},
arcTest{"TAR", mimeTarUID, Tar},
arcTest{"gZip", mimeGzipUID, Gzip},
arcTest{"warc,zip,tar", mimeWarcUID, WARC},
arcTest{"zip,arc", locArcUID, ARC},
// Negative tests should all return None.
arcTest{nonArcUID, None},
// The identifier's archive type is not named in the filter.
arcTest{"zip,arc", mimeWarcUID, None},
arcTest{"zip,arc", mimeGzipUID, None},
// The identifier is not expected to be an archive, whatever the filter.
arcTest{ListAllArcTypes(), nonArcUID, None},
arcTest{"", nonArcUID, None},
}

// TestIsArchive tests cases whether we return the correct result when
// testing whether something is an Archive.
func TestIsArchive(t *testing.T) {
// TestIsArchivePositive runs every case in isArcTests, positive and
// negative, applying each case's filter before calling IsArchive.
func TestIsArchivePositive(t *testing.T) {
for _, test := range isArcTests {
SetArchiveFilterPermissive(test.filter)
arc := IsArchive(test.uid)
if arc != test.result {
t.Errorf(
Expand Down

0 comments on commit 7fecdc8

Please sign in to comment.