Skip to content

Commit

Permalink
Add an archive selector to Siegfried
Browse files Browse the repository at this point in the history
Using the -zs flag will allow users to specify which archive types
Siegfried cracks open to identify the contents of.
  • Loading branch information
ross-spencer committed Aug 31, 2020
1 parent 74a791a commit 8bce6b7
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 32 deletions.
53 changes: 29 additions & 24 deletions cmd/sf/sf.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,30 +40,31 @@ const maxMulti = 1024

// flags
var (
updateShort = flag.Bool("u", false, "update or install the default signature file")
update = flag.Bool("update", false, "update or install the default signature file")
versionShort = flag.Bool("v", false, "display version information")
version = flag.Bool("version", false, "display version information")
logf = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
nr = flag.Bool("nr", false, "prevent automatic directory recursion")
yaml = flag.Bool("yaml", true, "YAML output format")
csvo = flag.Bool("csv", false, "CSV output format")
jsono = flag.Bool("json", false, "JSON output format")
droido = flag.Bool("droid", false, "DROID CSV output format")
sig = flag.String("sig", config.SignatureBase(), "set the signature file")
home = flag.String("home", config.Home(), "override the default home directory")
serve = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
multi = flag.Int("multi", 1, "set number of parallel file ID processes")
archive = flag.Bool("z", false, "scan archive formats (zip, tar, gzip, warc, arc)")
hashf = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
throttlef = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
utcf = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
coe = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
replay = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
list = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
updateShort = flag.Bool("u", false, "update or install the default signature file")
update = flag.Bool("update", false, "update or install the default signature file")
versionShort = flag.Bool("v", false, "display version information")
version = flag.Bool("version", false, "display version information")
logf = flag.String("log", "error", "log errors, warnings, debug or slow output, knowns or unknowns to stderr or stdout e.g. -log error,warn,unknown,stdout")
nr = flag.Bool("nr", false, "prevent automatic directory recursion")
yaml = flag.Bool("yaml", true, "YAML output format")
csvo = flag.Bool("csv", false, "CSV output format")
jsono = flag.Bool("json", false, "JSON output format")
droido = flag.Bool("droid", false, "DROID CSV output format")
sig = flag.String("sig", config.SignatureBase(), "set the signature file")
home = flag.String("home", config.Home(), "override the default home directory")
serve = flag.String("serve", "", "start siegfried server e.g. -serve localhost:5138")
multi = flag.Int("multi", 1, "set number of parallel file ID processes")
archive = flag.Bool("z", false, fmt.Sprintf("scan archive formats: (%s)", config.ListAllArcTypes()))
selectArchives = flag.String("zs", config.ListAllArcTypes(), "select the archive types to decompress and identify the contents of")
hashf = flag.String("hash", "", "calculate file checksum with hash algorithm; options "+checksum.HashChoices)
throttlef = flag.Duration("throttle", 0, "set a time to wait between scanning files e.g. 50ms")
utcf = flag.Bool("utc", false, "report file modified times in UTC, rather than local, TZ")
coe = flag.Bool("coe", false, "continue on fatal errors during directory walks (this may result in directories being skipped)")
replay = flag.Bool("replay", false, "replay one (or more) results files to change output or logging e.g. sf -replay -csv results.yaml")
list = flag.Bool("f", false, "scan one (or more) lists of filenames e.g. sf -f myfiles.txt")
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
)

var (
Expand Down Expand Up @@ -371,6 +372,10 @@ func main() {
}
return
}
// handle -zs
if *selectArchives != "" {
config.SetArchiveFilterPermissive(*selectArchives)
}
// handle -fpr
if *fprflag {
log.Printf("FPR server started at %s. Use CTRL-C to quit.\n", config.Fpr())
Expand Down
116 changes: 115 additions & 1 deletion pkg/config/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,132 @@

package config

import (
"fmt"
"strings"
)

// Archive is a file format capable of decompression by sf.
type Archive int

const (
None Archive = iota // None means the format cannot be decompressed by sf.
// None means the format cannot be decompressed by sf.
None Archive = iota
// Zip describes a Zip type archive.
Zip
// Gzip describes a Gzip type archive.
Gzip
// Tar describes a Tar type archive.
Tar
// ARC describes an ARC web archive.
ARC
// WARC describes a WARC web archive.
WARC
)

// Names of the archive types that can be selected when filtering which
// archives are decompressed. ListAllArcTypes joins them for display and
// SetArchiveFilterPermissive matches user input against them.

// zipArc is the filter name for Zip archives.
const zipArc = "zip"

// tarArc is the filter name for Tar archives.
const tarArc = "tar"

// gzipArc is the filter name for Gzip archives.
const gzipArc = "gzip"

// warcArc is the filter name for WARC web archives.
const warcArc = "warc"

// arcArc is the filter name for ARC web archives.
const arcArc = "arc"

// ArcZipTypes returns the identifiers that IsArchive treats as Zip
// archives: the zip entries of pronom, mimeinfo and loc, in that order.
// A new slice is built on every call.
func ArcZipTypes() []string {
	ids := make([]string, 0, 3)
	ids = append(ids, pronom.zip)
	ids = append(ids, mimeinfo.zip)
	ids = append(ids, loc.zip)
	return ids
}

// ArcGzipTypes returns the identifiers that IsArchive treats as Gzip
// archives: the gzip entries of pronom and mimeinfo, in that order.
// A new slice is built on every call.
func ArcGzipTypes() []string {
	ids := make([]string, 0, 2)
	ids = append(ids, pronom.gzip)
	ids = append(ids, mimeinfo.gzip)
	return ids
}

// ArcTarTypes returns the identifiers that IsArchive treats as Tar
// archives: the tar entries of pronom and mimeinfo, in that order.
// A new slice is built on every call.
func ArcTarTypes() []string {
	ids := make([]string, 0, 2)
	ids = append(ids, pronom.tar)
	ids = append(ids, mimeinfo.tar)
	return ids
}

// ArcArcTypes returns the identifiers that IsArchive treats as ARC web
// archives: pronom's arc and arc1_1 entries followed by the arc entries
// of mimeinfo and loc. A new slice is built on every call.
func ArcArcTypes() []string {
	ids := make([]string, 0, 4)
	ids = append(ids, pronom.arc, pronom.arc1_1)
	ids = append(ids, mimeinfo.arc)
	ids = append(ids, loc.arc)
	return ids
}

// ArcWarcTypes returns the identifiers that IsArchive treats as WARC web
// archives: the warc entries of pronom, mimeinfo and loc, in that order.
// A new slice is built on every call.
func ArcWarcTypes() []string {
	ids := make([]string, 0, 3)
	ids = append(ids, pronom.warc)
	ids = append(ids, mimeinfo.warc)
	ids = append(ids, loc.warc)
	return ids
}

// ListAllArcTypes returns a list of archive file-format extensions that
// can be used to filter the files Siegfried will decompress to identify
// the contents of.
func ListAllArcTypes() string {
return fmt.Sprintf("%s, %s, %s, %s, %s",
zipArc,
tarArc,
gzipArc,
warcArc,
arcArc,
)
}

// permissiveFilter holds the identifiers of the archive types selected by
// the most recent call to SetArchiveFilterPermissive. It is nil until that
// function has been called.
var permissiveFilter []string

// SetArchiveFilterPermissive builds the allow-list of archive identifiers
// from value, stores it as the package-wide filter and returns it.
//
// value is a comma separated list of archive names (zip, tar, gzip, warc,
// arc). Each entry is lower-cased and stripped of surrounding white space
// before matching; entries that match no known name are ignored. Names are
// expanded in the order given and repeated names are expanded again.
func SetArchiveFilterPermissive(value string) []string {
	expand := map[string]func() []string{
		zipArc:  ArcZipTypes,
		tarArc:  ArcTarTypes,
		gzipArc: ArcGzipTypes,
		warcArc: ArcWarcTypes,
		arcArc:  ArcArcTypes,
	}
	// Start from an empty, non-nil slice: a filter that selects nothing is
	// still a filter that has been set.
	selected := []string{}
	for _, field := range strings.Split(value, ",") {
		key := strings.TrimSpace(strings.ToLower(field))
		if ids, known := expand[key]; known {
			selected = append(selected, ids()...)
		}
	}
	permissiveFilter = selected
	return selected
}

// archiveFilterPermissive returns the identifiers of the archive types
// whose contents may currently be extracted and identified.
//
// If SetArchiveFilterPermissive has never been called the stored filter is
// nil. In that case every supported archive type is allowed, so that
// callers of IsArchive that never configure a filter still have archives
// recognised. A filter that has been set explicitly, even one that selects
// nothing, is returned unchanged.
func archiveFilterPermissive() []string {
	if permissiveFilter != nil {
		return permissiveFilter
	}
	// Build the default on the fly rather than storing it, so this getter
	// never writes to package-level state.
	var all []string
	for _, ids := range [][]string{
		ArcZipTypes(),
		ArcTarTypes(),
		ArcGzipTypes(),
		ArcWarcTypes(),
		ArcArcTypes(),
	} {
		all = append(all, ids...)
	}
	return all
}

func (a Archive) String() string {
switch a {
case Zip:
Expand Down
25 changes: 19 additions & 6 deletions pkg/config/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,18 +264,31 @@ func Extend() []string {
return extensionPaths(identifier.extend)
}

// contains reports whether v is equal to at least one element of s. It
// returns false for a nil or empty slice.
func contains(v string, s []string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] == v {
			return true
		}
	}
	return false
}

// IsArchive returns an Archive that corresponds to the provided id (or none if no match).
func IsArchive(id string) Archive {
switch id {
case pronom.zip, mimeinfo.zip, loc.zip:
if !contains(id, archiveFilterPermissive()) {
return None
}
switch {
case contains(id, ArcZipTypes()):
return Zip
case pronom.gzip, mimeinfo.gzip:
case contains(id, ArcGzipTypes()):
return Gzip
case pronom.tar, mimeinfo.tar:
case contains(id, ArcTarTypes()):
return Tar
case pronom.arc, pronom.arc1_1, mimeinfo.arc, loc.arc:
case contains(id, ArcArcTypes()):
return ARC
case pronom.warc, mimeinfo.warc, loc.warc:
case contains(id, ArcWarcTypes()):
return WARC
}
return None
Expand Down
70 changes: 70 additions & 0 deletions pkg/config/identifier_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

// Tests for IsArchive and the archive filter that is configured through
// SetArchiveFilterPermissive.

package config

import (
"testing"
)

// Identifiers used as inputs to IsArchive in the table driven tests.
var (
	// Identifiers the tests expect to be recognised as archives.
	proZipUID   = "x-fmt/263"
	locArcUID   = "fdd000235"
	mimeTarUID  = "application/x-tar"
	mimeWarcUID = "application/x-warc"
	mimeGzipUID = "application/gzip"

	// An identifier the tests expect never to be recognised as an archive.
	nonArcUID = "fmt/1000"
)

// arcTest is one row of the table driven IsArchive tests: a filter to
// install, an identifier to look up and the Archive value expected back.
type arcTest struct {
	// filter is the comma separated list of archive names handed to
	// SetArchiveFilterPermissive before the lookup.
	filter string
	// uid is the format identifier (for example a PUID, MIME type or FDD)
	// passed to IsArchive.
	uid string
	// result is the Archive value IsArchive is expected to return.
	result Archive
}

// isArcTests is the table of filter/identifier combinations and the
// Archive value each one is expected to produce.
var isArcTests = []arcTest{
	// The identifier's archive type is in the filter: expect that type.
	{filter: ListAllArcTypes(), uid: proZipUID, result: Zip},
	{filter: "TAR", uid: mimeTarUID, result: Tar},
	{filter: "gZip", uid: mimeGzipUID, result: Gzip},
	{filter: "warc,zip,tar", uid: mimeWarcUID, result: WARC},
	{filter: "zip,arc", uid: locArcUID, result: ARC},

	// The identifier is filtered out, or is not an archive: expect None.
	{filter: "zip,arc", uid: mimeWarcUID, result: None},
	{filter: "zip,arc", uid: mimeGzipUID, result: None},
	{filter: ListAllArcTypes(), uid: nonArcUID, result: None},
	{filter: "", uid: nonArcUID, result: None},
}

// TestIsArchivePositive tests cases where the filter should return a
// positive match.
func TestIsArchivePositive(t *testing.T) {
for _, test := range isArcTests {
SetArchiveFilterPermissive(test.filter)
arc := IsArchive(test.uid)
if arc != test.result {
t.Errorf(
"Unexpected test result '%s', expected '%s'",
arc, test.result,
)
}
}
}
2 changes: 1 addition & 1 deletion pkg/decompress/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func SetDroid() {
func IsArc(ids []core.Identification) config.Archive {
var arc config.Archive
for _, id := range ids {
if id.Archive() > 0 {
if id.Archive() > config.None {
return id.Archive()
}
}
Expand Down

0 comments on commit 8bce6b7

Please sign in to comment.