-
-
Notifications
You must be signed in to change notification settings - Fork 6
/
fs.go
1126 lines (1007 loc) · 38 KB
/
fs.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package archives
import (
"context"
"errors"
"fmt"
"io"
"io/fs"
"os"
"path"
"path/filepath"
"runtime"
"slices"
"strings"
"sync"
"time"
)
// FileSystem works out what kind of input it has been given and returns a
// read-only file system over it. The input is a filename, a stream, or both.
//
// With only a filename, the path may name a directory, an archive, a compressed
// archive, a compressed regular file, or any other regular file. A directory is
// served straight from disk. An archive is browsable like a directory, and a
// compressed archive is decompressed transparently as it is read. Any other
// file becomes the sole entry of the returned file system; if it is compressed,
// reads from it are decompressed transparently.
//
// With a stream, the filename (if any) is only a hint for format detection.
// Archive streams are wrapped in an io.SectionReader (for safe concurrency),
// which needs io.ReaderAt, plus io.Seeker to determine the size efficiently.
// Format detection needs io.Reader and uses io.Seeker to avoid buffering.
//
// In every case the data itself is peeked at to choose the format.
//
// The net effect is uniform read access to directories, archives, compressed
// archives, single files, and file streams.
//
// NOTE: Compressed tar archives perform poorly because of decompression
// overhead. The fs.WalkDir() use case is optimized by building an index on the
// first call to ReadDir().
func FileSystem(ctx context.Context, filename string, stream ReaderAtSeeker) (fs.FS, error) {
	streamGiven := stream != nil
	if !streamGiven && filename == "" {
		return nil, errors.New("no input")
	}

	// sniffSource is what gets peeked at for format detection. A caller-supplied
	// stream is used as-is (and later backs the ArchiveFS). Without one, the file
	// is opened here purely for detection and closed on return, because an
	// ArchiveFS built from a path opens its own handles.
	sniffSource := stream
	if !streamGiven {
		stat, err := os.Stat(filename)
		if err != nil {
			return nil, err
		}

		// a real directory needs no format detection
		if stat.IsDir() {
			return DirFS(filename), nil
		}

		// the filename is not always a reliable indicator of format,
		// so open the file and look at its contents
		onDisk, err := os.Open(filename)
		if err != nil {
			return nil, err
		}
		defer onDisk.Close()
		sniffSource = onDisk
	}

	// callers would normally use the Reader that Identify returns, but our
	// input is a Seeker, so the original value is what comes back
	detected, _, err := Identify(ctx, filepath.Base(filename), sniffSource)
	switch {
	case errors.Is(err, NoMatch):
		return FileFS{Path: filename}, nil // must be an ordinary file
	case err != nil:
		return nil, fmt.Errorf("identify format: %w", err)
	}

	if extractor, ok := detected.(Extractor); ok {
		// no stream: the ArchiveFS works from the path on disk
		if !streamGiven {
			return &ArchiveFS{Path: filename, Format: extractor, Context: ctx}, nil
		}

		// stream: the ArchiveFS works from a SectionReader spanning the
		// whole stream, so its size has to be determined first
		size, err := streamSizeBySeeking(stream)
		if err != nil {
			return nil, fmt.Errorf("seeking for size: %w", err)
		}
		return &ArchiveFS{Stream: io.NewSectionReader(stream, 0, size), Format: extractor, Context: ctx}, nil
	}

	if compression, ok := detected.(Compression); ok {
		return FileFS{Path: filename, Compression: compression}, nil
	}

	return nil, fmt.Errorf("unable to create file system rooted at %s due to unsupported file or folder type", filename)
}
// ReaderAtSeeker combines sequential reads, positioned reads, and seeking in
// one interface. Both os.File and io.SectionReader satisfy it.
type ReaderAtSeeker interface {
	io.ReadSeeker
	io.ReaderAt
}
// FileFS allows accessing a file on disk using a consistent file system interface.
// The value should be the path to a regular file, not a directory. This file will
// be the only entry in the file system and will be at its root. It can be accessed
// within the file system by the name of "." or the filename (the base name of
// Path); the methods also accept a name that is exactly equal to Path.
//
// If the file is compressed, set the Compression field so that reads from the
// file will be transparently decompressed.
type FileFS struct {
	// The path to the file on disk. Every Open and Stat goes to this path,
	// regardless of which accepted name was passed in.
	Path string
	// If file is compressed, setting this field will
	// transparently decompress reads. When nil, Open
	// returns the file from disk as-is.
	Compression Decompressor
}
// Open opens the named file, which must be the file used to create the file system
// (see checkName for the accepted names).
//
// If Compression is nil, the file on disk is returned directly. Otherwise the
// returned file reads through the decompressor, and closing it closes both the
// decompressor and the underlying file.
//
// If the decompressor cannot be opened, the underlying file is closed before
// the error is returned, since the caller never gets a handle to close.
func (f FileFS) Open(name string) (fs.File, error) {
	if err := f.checkName(name, "open"); err != nil {
		return nil, err
	}
	file, err := os.Open(f.Path)
	if err != nil {
		return nil, err
	}
	if f.Compression == nil {
		return file, nil
	}
	r, err := f.Compression.OpenReader(file)
	if err != nil {
		// nothing was read or written through this handle, so a close
		// error would add nothing to the decompressor's error
		_ = file.Close()
		return nil, err
	}
	return compressedFile{r, closeBoth{file, r}}, nil
}
// Stat returns the os.Stat result for the file backing this file system. The
// name must be one accepted by checkName; whichever accepted name is given,
// the file at f.Path is what gets statted.
func (f FileFS) Stat(name string) (fs.FileInfo, error) {
	err := f.checkName(name, "stat")
	if err == nil {
		return os.Stat(f.Path)
	}
	return nil, err
}
// ReadDir returns a directory listing with the file as the singular entry.
// The name must be one accepted by checkName. A rejected name is reported as
// an *fs.PathError whose Op is "readdir".
func (f FileFS) ReadDir(name string) ([]fs.DirEntry, error) {
	if err := f.checkName(name, "readdir"); err != nil {
		return nil, err
	}
	info, err := f.Stat(name)
	if err != nil {
		return nil, err
	}
	return []fs.DirEntry{fs.FileInfoToDirEntry(info)}, nil
}
// checkName validates a name passed to one of the FileFS methods. Because the
// file system holds exactly one file, the only acceptable names are:
//
//   - the exact Path the FileFS was created with (accepted verbatim, even
//     if it is not a valid io/fs path),
//   - ".", and
//   - the base name of Path.
//
// A name that is not a valid io/fs path yields fs.ErrInvalid; any other
// unrecognized name yields fs.ErrNotExist. Both are wrapped in an
// *fs.PathError carrying op.
func (f FileFS) checkName(name, op string) error {
	switch {
	case name == f.Path:
		return nil
	case !fs.ValidPath(name):
		return &fs.PathError{Op: op, Path: name, Err: fs.ErrInvalid}
	case name == ".", name == filepath.Base(f.Path):
		return nil
	default:
		return &fs.PathError{Op: op, Path: name, Err: fs.ErrNotExist}
	}
}
// compressedFile is an fs.File that specially reads
// from a decompression reader, and which closes both
// that reader and the underlying file.
//
// FileFS.Open builds it with an unkeyed literal, so the
// order of the two embedded fields must not change.
type compressedFile struct {
	io.Reader // decompressor; reads come from here
	closeBoth // file and decompressor (closeBoth is declared elsewhere in the package)
}
// DirFS is similar to os.dirFS (obtained via os.DirFS()), but it is
// exported so it can be used with type assertions. It also returns
// FileInfo/DirEntry values where Name() always returns the name of
// the directory instead of ".". This type does not guarantee any
// sort of sandboxing.
//
// The string value is the directory's path on disk; names passed to
// its methods are joined onto it with filepath.Join.
type DirFS string
// Open opens the file at name, resolved relative to the directory d.
// The name must pass checkName; otherwise an *fs.PathError is returned.
func (d DirFS) Open(name string) (fs.File, error) {
	err := d.checkName(name, "open")
	if err != nil {
		return nil, err
	}
	target := filepath.Join(string(d), name)
	return os.Open(target)
}
// ReadDir lists the entries of the directory at name, resolved relative to
// the directory d. The name must pass checkName.
func (d DirFS) ReadDir(name string) ([]fs.DirEntry, error) {
	err := d.checkName(name, "readdir")
	if err != nil {
		return nil, err
	}
	target := filepath.Join(string(d), name)
	return os.ReadDir(target)
}
// Stat describes the file at name, resolved relative to the directory d.
// If the OS reports the file's name as ".", the result is wrapped so that it
// carries the base name of d instead.
func (d DirFS) Stat(name string) (fs.FileInfo, error) {
	if err := d.checkName(name, "stat"); err != nil {
		return nil, err
	}

	root := string(d)
	info, err := os.Stat(filepath.Join(root, name))
	switch {
	case err != nil:
		return info, err
	case info.Name() == ".":
		return dotFileInfo{info, filepath.Base(root)}, nil
	default:
		return info, nil
	}
}
// Sub returns a DirFS rooted at dir, which is resolved relative to d. The
// name must pass checkName and must refer to an existing directory.
func (d DirFS) Sub(dir string) (fs.FS, error) {
	err := d.checkName(dir, "sub")
	if err != nil {
		return nil, err
	}

	info, err := d.Stat(dir)
	if err != nil {
		return nil, err
	}
	if info.IsDir() {
		return DirFS(filepath.Join(string(d), dir)), nil
	}
	return nil, fmt.Errorf("%s is not a directory", dir)
}
// checkName rejects names that are not valid paths as defined by the io/fs
// package. Following the standard library's os.dirFS.Open(), it also rejects
// names containing a backslash or colon when running on Windows. A rejected
// name is reported as an *fs.PathError wrapping fs.ErrInvalid.
func (DirFS) checkName(name, op string) error {
	rejected := !fs.ValidPath(name)
	if !rejected && runtime.GOOS == "windows" {
		rejected = strings.ContainsAny(name, `\:`)
	}
	if rejected {
		return &fs.PathError{Op: op, Path: name, Err: fs.ErrInvalid}
	}
	return nil
}
// ArchiveFS allows reading an archive (or a compressed archive) using a
// consistent file system interface. Essentially, it allows traversal and
// reading of archive contents the same way as any normal directory on disk.
// The contents of compressed archives are transparently decompressed.
//
// A valid ArchiveFS value must set either Path or Stream, but not both.
// If Path is set, a literal file will be opened from the disk.
// If Stream is set, new SectionReaders will be implicitly created to
// access the stream, enabling safe, concurrent access.
//
// NOTE: Due to Go's file system APIs (see package io/fs), the performance
// of ArchiveFS can suffer when using fs.WalkDir(). To mitigate this,
// an optimized fs.ReadDirFS has been implemented that indexes the entire
// archive on the first call to ReadDir() (since the entire archive needs
// to be walked for every call to ReadDir() anyway, as archive contents are
// often unordered). The first call to ReadDir(), i.e. near the start of the
// walk, will be slow for large archives, but should be instantaneous after.
// If you don't care about walking a file system in directory order, consider
// calling Extract() on the underlying archive format type directly, which
// walks the archive in entry order, without needing to do any sorting.
//
// Note that fs.FS implementations, including this one, reject paths starting
// with "./". This can be problematic sometimes, as it is not uncommon for
// tarballs to contain a top-level/root directory literally named ".", which
// can happen if a tarball is created in the same directory it is archiving.
// The underlying Extract() calls are faithful to entries with this name,
// but file systems have certain semantics around "." that restrict its use.
// For example, a file named "." cannot be created on a real file system
// because it is a special name that means "current directory".
//
// We had to decide whether to honor the true name in the archive, or honor
// file system semantics. Given that this is a virtual file system and other
// code using the fs.FS APIs will trip over a literal directory named ".",
// we choose to honor file system semantics. Files named "." are ignored;
// directories with this name are effectively transparent; their contents
// get promoted up a directory/level. This means that for a file at "./x",
// where "." is a literal directory name, its name will be passed in as "x"
// in WalkDir callbacks. If you need the raw, uninterpreted values from an
// archive, use the formats' Extract() method directly. See
// https://github.com/golang/go/issues/70155 for a little more background.
//
// This does have one negative edge case... a tar containing contents like
// [x . ./x] will have a conflict on the file named "x" because "./x" will
// also be accessed with the name of "x".
type ArchiveFS struct {
	// set one of these
	Path string // path to the archive file on disk, or...
	Stream *io.SectionReader // ...stream from which to read archive
	Format Extractor // the archive format
	Prefix string // optional subdirectory in which to root the fs
	Context context.Context // optional; mainly for cancellation
	// amortizing cache speeds up walks (esp. ReadDir); both maps are
	// filled in by ReadDir while it walks the archive and are keyed by
	// path.Clean'ed names as they appear in the archive (Prefix included):
	// contents maps each entry's name to its file info, and dirs maps each
	// directory's name to its entries, kept sorted by name
	contents map[string]fs.FileInfo
	dirs map[string][]fs.DirEntry
}
// context always returns a usable context: f.Context when it is set, and
// context.Background() otherwise.
func (f ArchiveFS) context() context.Context {
	if f.Context == nil {
		return context.Background()
	}
	return f.Context
}
// Open opens the named file from within the archive. If name is "." then
// the archive file itself will be opened as a directory file.
//
// If the archive has already been indexed (by ReadDir), directories are
// returned straight from the index and unknown names are rejected without
// touching the archive. Otherwise the archive is walked until the requested
// entry is found.
//
// Ownership of the underlying resources: when a regular file is returned, the
// archive file handle (if Path is used) and the decompressor (if any) stay
// open and are closed when the returned file is closed. In every other
// outcome (an error, a name that does not exist, or a directory listing)
// they are closed before Open returns.
//
// If neither Path nor Stream is set, an error is returned. If both are set,
// Stream is used.
func (f ArchiveFS) Open(name string) (fs.File, error) {
	if !fs.ValidPath(name) {
		return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)}
	}

	// apply prefix if fs is rooted in a subtree
	name = path.Join(f.Prefix, name)

	// if we've already indexed the archive, we can know quickly if the file doesn't exist,
	// and we can also return directory files with their entries instantly
	if f.contents != nil {
		if info, found := f.contents[name]; found {
			if info.IsDir() {
				if entries, ok := f.dirs[name]; ok {
					return &dirFile{info: info, entries: entries}, nil
				}
			}
		} else {
			if entries, found := f.dirs[name]; found {
				return &dirFile{info: implicitDirInfo{implicitDirEntry{name}}, entries: entries}, nil
			}
			return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)}
		}
	}

	// with neither a stream nor a path there is nothing to read from
	if f.Stream == nil && f.Path == "" {
		return nil, fmt.Errorf("no input; one of Path or Stream must be set")
	}

	var archiveFile *os.File
	var decompressor io.ReadCloser
	var err error

	// the archive handle and decompressor may only outlive this call when they
	// are handed to the returned file, whose Close() then releases them; on
	// every other path they are released here so they do not leak
	handedOff := false
	defer func() {
		if handedOff {
			return
		}
		if decompressor != nil {
			decompressor.Close()
		}
		if archiveFile != nil {
			archiveFile.Close()
		}
	}()

	// if no stream is specified, open the archive file
	if f.Stream == nil {
		archiveFile, err = os.Open(f.Path)
		if err != nil {
			return nil, err
		}
	}

	// handle special case of opening the archive root
	if name == "." {
		var archiveInfo fs.FileInfo
		if archiveFile != nil {
			archiveInfo, err = archiveFile.Stat()
			if err != nil {
				return nil, err
			}
		} else {
			archiveInfo = implicitDirInfo{
				implicitDirEntry{"."},
			}
		}
		var entries []fs.DirEntry
		entries, err = f.ReadDir(name)
		if err != nil {
			return nil, err
		}
		if archiveFile != nil {
			// close explicitly so a close error can be reported, then clear
			// the variable so the deferred cleanup does not close it twice
			closeErr := archiveFile.Close()
			archiveFile = nil
			if closeErr != nil {
				return nil, closeErr
			}
		}
		return &dirFile{
			info:    dirFileInfo{archiveInfo},
			entries: entries,
		}, nil
	}

	var inputStream io.Reader
	if f.Stream == nil {
		inputStream = archiveFile
	} else {
		inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size())
	}

	if decomp, ok := f.Format.(Decompressor); ok && decomp != nil {
		decompressor, err = decomp.OpenReader(inputStream)
		if err != nil {
			return nil, err
		}
		inputStream = decompressor
	}

	// entries below the requested name start with this; matching on the bare
	// name instead would wrongly treat "ab" as being inside "a"
	childPrefix := name + "/"

	// prepare the handler that we'll need if we have to iterate the
	// archive to find the file being requested
	var fsFile fs.File
	handler := func(ctx context.Context, file FileInfo) error {
		if err := ctx.Err(); err != nil {
			return err
		}

		// paths in archives can't necessarily be trusted; also clean up any "./" prefix
		file.NameInArchive = path.Clean(file.NameInArchive)

		// skip anything that is neither the requested entry nor beneath it
		if file.NameInArchive != name && !strings.HasPrefix(file.NameInArchive, childPrefix) {
			return nil
		}

		// if this is the requested file, and it's a directory, set up the dirFile,
		// which will include a listing of all its contents as we continue the walk
		if file.NameInArchive == name && file.IsDir() {
			fsFile = &dirFile{info: file} // will fill entries slice as we continue the walk
			return nil
		}

		// if the named file was a directory and we are filling its entries,
		// add this entry to the list
		if df, ok := fsFile.(*dirFile); ok {
			df.entries = append(df.entries, fs.FileInfoToDirEntry(file))

			// don't traverse into subfolders
			if file.IsDir() {
				return fs.SkipDir
			}

			return nil
		}

		innerFile, err := file.Open()
		if err != nil {
			return err
		}

		fsFile = closeBoth{File: innerFile, c: archiveFile}
		if decompressor != nil {
			fsFile = closeBoth{fsFile, decompressor}
		}

		return fs.SkipAll
	}

	// when we start the walk, we pass in a nil list of files to extract, since
	// files may have a "." component in them, and the underlying format doesn't
	// know about our file system semantics, so we need to filter ourselves (it's
	// not significantly less efficient).
	if ar, ok := f.Format.(CompressedArchive); ok {
		// bypass the CompressedArchive format's opening of the decompressor, since
		// we already did it because we need to keep it open after returning.
		// "I BYPASSED THE COMPRESSOR!" -Rey
		err = ar.Extraction.Extract(f.context(), inputStream, handler)
	} else {
		err = f.Format.Extract(f.context(), inputStream, handler)
	}
	if err != nil {
		return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("extract: %w", err)}
	}
	if fsFile == nil {
		return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)}
	}

	// a directory listing is fully materialized and does not read from the
	// archive again, so only a regular file takes over the open handles
	if _, isDir := fsFile.(*dirFile); !isDir {
		handedOff = true
	}

	return fsFile, nil
}
// Stat stats the named file from within the archive. If name is "." then
// the archive file itself is statted and treated as a directory file (or,
// for a stream, a synthetic directory info is returned).
//
// If the archive has already been indexed (by ReadDir), the answer comes from
// the index. Otherwise the archive is walked until the entry is found. A name
// that has no entry of its own but is the parent of some entry is reported as
// an implicit directory.
//
// A failure to read the archive is reported with the underlying error wrapped
// in an *fs.PathError; fs.ErrNotExist is used only when the walk completed
// and the name was not found.
func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) {
	if !fs.ValidPath(name) {
		return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("%s: %w", name, fs.ErrInvalid)}
	}

	if name == "." {
		if f.Path != "" {
			fileInfo, err := os.Stat(f.Path)
			if err != nil {
				return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(a) %s: %w", name, err)}
			}
			return dirFileInfo{fileInfo}, nil
		} else if f.Stream != nil {
			return implicitDirInfo{implicitDirEntry{name}}, nil
		}
	}

	// apply prefix if fs is rooted in a subtree
	name = path.Join(f.Prefix, name)

	// if archive has already been indexed, simply use it
	if f.contents != nil {
		if info, ok := f.contents[name]; ok {
			return info, nil
		}
		return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(b) %s: %w", name, fs.ErrNotExist)}
	}

	var archiveFile *os.File
	var err error
	if f.Stream == nil {
		archiveFile, err = os.Open(f.Path)
		if err != nil {
			return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(c) %s: %w", name, err)}
		}
		defer archiveFile.Close()
	}

	// entries below the requested name start with this; matching on the bare
	// name instead would wrongly treat "ab" as evidence that "a" is a directory
	childPrefix := name + "/"

	var result FileInfo
	var fallback fs.FileInfo // possibly needed if only an implied directory
	handler := func(ctx context.Context, file FileInfo) error {
		if err := ctx.Err(); err != nil {
			return err
		}
		cleanName := path.Clean(file.NameInArchive)
		if cleanName == name {
			result = file
			return fs.SkipAll
		}
		// it's possible the requested name is an implicit directory;
		// remember if we see something inside it along the way, just in case
		if fallback == nil && strings.HasPrefix(cleanName, childPrefix) {
			fallback = implicitDirInfo{implicitDirEntry{name}}
		}
		return nil
	}

	var inputStream io.Reader = archiveFile
	if f.Stream != nil {
		inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size())
	}

	err = f.Format.Extract(f.context(), inputStream, handler)
	if err != nil && result.FileInfo == nil {
		// the walk did not complete, so we cannot claim the name is absent;
		// surface what actually went wrong (cancellation, corrupt data, ...)
		return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(d) %s: %w", name, err)}
	}
	if result.FileInfo == nil {
		// looks like the requested name does not exist in the archive,
		// but we can return some basic info if it was an implicit directory
		if fallback != nil {
			return fallback, nil
		}
		return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(e) %s: %w", name, fs.ErrNotExist)}
	}
	return result.FileInfo, nil
}
// ReadDir reads the named directory from within the archive. If name is "."
// then the root of the archive content is listed.
//
// The first successful call walks the whole archive and builds an index of
// every entry and every directory listing (sorted by name, including
// directories that exist only implicitly as parents of other entries).
// Later calls are answered from that index.
//
// The index is built in local maps and stored on f only after the walk has
// completed without error. A failed walk therefore leaves no partial index
// behind, and Open and Stat never consult a half-built one.
//
// If name refers to an entry that is not a directory, an error is returned,
// whether or not the index has already been built.
func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) {
	if !fs.ValidPath(name) {
		return nil, &fs.PathError{Op: "readdir", Path: name, Err: fs.ErrInvalid}
	}

	// apply prefix if fs is rooted in a subtree
	name = path.Join(f.Prefix, name)

	// fs.WalkDir() calls ReadDir() once per directory, and for archives with
	// lots of directories, that is very slow, since we have to traverse the
	// entire archive in order to ensure that we got all the entries for a
	// directory -- so we can fast-track this lookup if we've done the
	// traversal already
	if len(f.dirs) > 0 {
		// same answer the indexing walk gives when asked to list a non-directory
		if info, ok := f.contents[name]; ok && !info.IsDir() {
			return nil, &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a directory")}
		}
		return f.dirs[name], nil
	}

	// build the index off to the side; it is published only on success
	contents := make(map[string]fs.FileInfo)
	dirs := make(map[string][]fs.DirEntry)

	var archiveFile *os.File
	var err error
	if f.Stream == nil {
		archiveFile, err = os.Open(f.Path)
		if err != nil {
			return nil, err
		}
		defer archiveFile.Close()
	}

	// directory listings are kept sorted by entry name
	byName := func(a, b fs.DirEntry) int {
		return strings.Compare(a.Name(), b.Name())
	}

	handler := func(ctx context.Context, file FileInfo) error {
		if err := ctx.Err(); err != nil {
			return err
		}

		// can't always trust path names
		file.NameInArchive = path.Clean(file.NameInArchive)

		// avoid infinite walk; apparently, creating a tar file in the target
		// directory may result in an entry called "." in the archive; see #384
		if file.NameInArchive == "." {
			return nil
		}

		// if the name being requested isn't a directory, return an error similar to
		// what most OSes return from the readdir system call when given a non-dir
		if file.NameInArchive == name && !file.IsDir() {
			return &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a directory")}
		}

		// index this file info for quick access
		contents[file.NameInArchive] = file

		// amortize the DirEntry list per directory, and prefer the real entry's DirEntry over an implicit/fake
		// one we may have created earlier; first try to find if it exists, and if so, replace the value;
		// otherwise insert it in sorted position
		dir := path.Dir(file.NameInArchive)
		dirEntry := fs.FileInfoToDirEntry(file)
		idx, found := slices.BinarySearchFunc(dirs[dir], dirEntry, byName)
		if found {
			dirs[dir][idx] = dirEntry
		} else {
			dirs[dir] = slices.Insert(dirs[dir], idx, dirEntry)
		}

		// this loop looks like an abomination, but it's really quite simple: we're
		// just iterating the directories of the path up to the root; i.e. we lob off
		// the base (last component) of the path until no separators remain, i.e. only
		// one component remains -- then loop again to make sure it's not a duplicate
		// (start without the base, since we know the full filename is an actual entry
		// in the archive, we don't need to create an implicit directory entry for it)
		startingPath := path.Dir(file.NameInArchive)
		for dir, base := path.Dir(startingPath), path.Base(startingPath); base != "."; dir, base = path.Dir(dir), path.Base(dir) {
			if err := ctx.Err(); err != nil {
				return err
			}

			var dirInfo fs.DirEntry = implicitDirInfo{implicitDirEntry{base}}

			// we are "filling in" any directories that could potentially be only implicit,
			// and since a nested directory can have more than 1 item, we need to prevent
			// duplication; for example: given a/b/c and a/b/d, we need to avoid adding
			// an entry for "b" twice within "a" -- hence we search for it first, and if
			// it doesn't already exist, we insert it in sorted position
			idx, found := slices.BinarySearchFunc(dirs[dir], dirInfo, byName)
			if !found {
				dirs[dir] = slices.Insert(dirs[dir], idx, dirInfo)
			}
		}

		return nil
	}

	var inputStream io.Reader = archiveFile
	if f.Stream != nil {
		inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size())
	}

	err = f.Format.Extract(f.context(), inputStream, handler)
	if err != nil {
		// we likely only got part of the way through, so the local index is
		// incomplete; discard it rather than publishing it, and the next call
		// will walk the archive again
		return nil, fmt.Errorf("extract: %w", err)
	}

	// the walk completed, so the index is whole and safe to reuse
	f.contents = contents
	f.dirs = dirs

	return dirs[name], nil
}
// Sub returns an FS corresponding to the subtree rooted at dir, which must
// name a directory within f.
//
// The result is a new ArchiveFS value; f itself is left unmodified. It reads
// from the same Path or Stream and shares whatever index f has already built,
// since the index is keyed by full in-archive names.
func (f *ArchiveFS) Sub(dir string) (fs.FS, error) {
	if !fs.ValidPath(dir) {
		return nil, &fs.PathError{Op: "sub", Path: dir, Err: fs.ErrInvalid}
	}
	info, err := f.Stat(dir)
	if err != nil {
		return nil, err
	}
	if !info.IsDir() {
		return nil, fmt.Errorf("%s is not a directory", dir)
	}

	// result is the same as what we're starting with, except
	// we indicate a path prefix to be used for all operations;
	// the reason we don't append to the Path field directly
	// is because the input might be a stream rather than a
	// path on disk, and the Prefix field is applied on both.
	//
	// Copy the struct rather than the pointer, otherwise setting
	// Prefix would re-root f itself. Join onto any existing prefix
	// because dir is relative to f's root, not the archive's.
	result := *f
	result.Prefix = path.Join(f.Prefix, dir)
	return &result, nil
}
// DeepFS is a fs.FS that represents the real file system, but also has
// the ability to traverse into archive files as if they were part of the
// regular file system. If a filename component ends with an archive
// extension (e.g. .zip, .tar, .tar.gz, etc.), then the remainder of the
// filepath will be considered to be inside that archive.
//
// This allows treating archive files transparently as if they were part
// of the regular file system during a walk, which can be extremely useful
// for accessing data in an "ordinary" walk of the disk, without needing to
// first extract all the archives and use more disk space.
//
// Archives within archives are not supported.
//
// The listing of archive entries is retained for the lifetime of the
// DeepFS value for efficiency, but this can use more memory if archives
// contain a lot of files.
//
// The exported fields may be changed during the lifetime of a DeepFS value
// (but not concurrently). It is safe to use this type as an FS concurrently.
type DeepFS struct {
	// The root filepath on disk. Names passed to the FS methods
	// are joined onto this root.
	Root string
	// An optional context, mainly for cancellation.
	Context context.Context
	// remember archive file systems for efficiency; keyed by the
	// cleaned on-disk path of the archive (see getInnerFsys)
	inners map[string]fs.FS
	// mu guards inners
	mu sync.Mutex
}
// Open opens the named file. The name is resolved against Root and then split
// into an on-disk part and an in-archive part. When there is an in-archive
// part and a file system can be obtained for the on-disk part, the file is
// opened from that inner file system. Otherwise the on-disk path is opened
// directly.
func (fsys *DeepFS) Open(name string) (fs.File, error) {
	if !fs.ValidPath(name) {
		return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)}
	}

	rooted := path.Join(filepath.ToSlash(fsys.Root), name)
	onDisk, inArchive := fsys.splitPath(rooted)
	if inArchive == "" {
		return os.Open(onDisk)
	}

	inner := fsys.getInnerFsys(onDisk)
	if inner == nil {
		return os.Open(onDisk)
	}
	return inner.Open(inArchive)
}
// Stat describes the named file. The name is resolved against Root and then
// split into an on-disk part and an in-archive part. When there is an
// in-archive part and a file system can be obtained for the on-disk part, the
// inner file system is asked. Otherwise the on-disk path is statted directly.
func (fsys *DeepFS) Stat(name string) (fs.FileInfo, error) {
	if !fs.ValidPath(name) {
		return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)}
	}

	rooted := path.Join(filepath.ToSlash(fsys.Root), name)
	onDisk, inArchive := fsys.splitPath(rooted)
	if inArchive == "" {
		return os.Stat(onDisk)
	}

	inner := fsys.getInnerFsys(onDisk)
	if inner == nil {
		return os.Stat(onDisk)
	}
	return fs.Stat(inner, inArchive)
}
// ReadDir returns the directory listing for the given directory name,
// but for any entries that appear by their file extension to be archive
// files, they are slightly modified to always return true for IsDir(),
// since we have the unique ability to list the contents of archives as
// if they were directories.
//
// An entry counts as an archive when its lowercased name, with trailing
// spaces removed, ends with one of the known archive extensions. This is
// the same test splitPath applies to path segments, so every entry flagged
// as a directory here can actually be traversed, including names with
// multi-part extensions such as ".tar.gz".
func (fsys *DeepFS) ReadDir(name string) ([]fs.DirEntry, error) {
	if !fs.ValidPath(name) {
		return nil, &fs.PathError{Op: "readdir", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)}
	}
	name = path.Join(filepath.ToSlash(fsys.Root), name)
	realPath, innerPath := fsys.splitPath(name)
	if innerPath != "" {
		if innerFsys := fsys.getInnerFsys(realPath); innerFsys != nil {
			return fs.ReadDir(innerFsys, innerPath)
		}
	}
	entries, err := os.ReadDir(realPath)
	if err != nil {
		return nil, err
	}

	// make sure entries that appear to be archive files indicate they are a directory
	// so the fs package will try to walk them
	for i, entry := range entries {
		// path.Ext() only sees the last dot-suffix (".gz" for "x.tar.gz"),
		// so compare whole suffixes instead
		entryName := strings.TrimRight(strings.ToLower(entry.Name()), " ")
		looksLikeArchive := slices.ContainsFunc(archiveExtensions, func(ext string) bool {
			return strings.HasSuffix(entryName, ext)
		})
		if looksLikeArchive {
			entries[i] = alwaysDirEntry{entry}
		}
	}
	return entries, nil
}
// getInnerFsys returns the file system for the archive (or other file) at
// realPath, creating it with FileSystem on first use and handing back the
// same value afterwards. Reuse matters because, for example, ArchiveFS
// amortizes directory entries on its first ReadDir; a fresh file system per
// call would rescan the same archive every time.
//
// The cache is keyed by the cleaned path and guarded by fsys.mu. If the file
// system cannot be created, nil is returned and nothing is cached.
func (fsys *DeepFS) getInnerFsys(realPath string) fs.FS {
	key := filepath.Clean(realPath)

	fsys.mu.Lock()
	defer fsys.mu.Unlock()

	if fsys.inners == nil {
		fsys.inners = make(map[string]fs.FS)
	}
	if cached, ok := fsys.inners[key]; ok {
		return cached
	}

	created, err := FileSystem(fsys.context(), key, nil)
	if err != nil {
		return nil
	}
	fsys.inners[key] = created
	return created
}
// splitPath divides a slash-separated file path at the first path segment that ends
// with an archive extension like ".zip" or ".tar.gz". Everything up to and including
// that segment is the "real" path; whatever follows it is the "inner" path.
//
// The real path is the path that can be accessed on disk; it is cleaned and returned
// with platform filepath separators. The inner path is the io/fs-compatible path that
// can be used within the archive.
//
// Segments are compared case-insensitively and with trailing spaces ignored. If no
// segment has an archive extension, the whole cleaned input is returned as realPath
// and innerPath is empty. If the archive segment is the final one in the input (i.e.
// the path ends with an archive file extension), innerPath is "." which indicates
// the root of the archive. Inputs shorter than two bytes are returned unchanged as
// realPath.
func (*DeepFS) splitPath(path string) (realPath, innerPath string) {
	if len(path) < 2 {
		return path, ""
	}

	// Walk the segments by index into the same string instead of splitting on every
	// slash; this avoids many allocations, which is valuable since this can be a hot
	// path. Scanning begins at 1 rather than 0 because a slash at index 0 would only
	// produce an empty first segment.
	for start := 1; start <= len(path); {
		// the segment runs to the next slash, or to the end of the string if there is none
		end := len(path)
		if rel := strings.Index(path[start:], "/"); rel >= 0 {
			end = start + rel
		}

		segment := strings.TrimRight(strings.ToLower(path[start:end]), " ")
		for _, ext := range archiveExtensions {
			if !strings.HasSuffix(segment, ext) {
				continue
			}
			// we've found an archive extension, so the path until the end of this segment is
			// the "real" OS path, and what remains (if anything) is the path within the archive
			realPath = filepath.Clean(filepath.FromSlash(path[:end]))
			if end == len(path) {
				// signal to the caller that this is an archive,
				// even though it is the very root of the archive
				return realPath, "."
			}
			return realPath, path[end+1:]
		}

		// advance to the next segment, or past the end of the string
		start = end + 1
	}

	// no archive extension found, so entire path is real path
	return filepath.Clean(filepath.FromSlash(path)), ""
}
// context returns the context configured on the DeepFS, falling
// back to context.Background() when none has been set.
func (fsys *DeepFS) context() context.Context {
	if ctx := fsys.Context; ctx != nil {
		return ctx
	}
	return context.Background()
}
// alwaysDirEntry wraps an fs.DirEntry and overrides IsDir() so that
// it reports true unconditionally; every other method is promoted
// unchanged from the embedded entry. Because DeepFS is able to walk
// archive files as directories, this wrapper is used to trick
// fs.WalkDir into thinking they are directories and thus
// traversing into them.
type alwaysDirEntry struct {
	fs.DirEntry
}

// IsDir always reports true, regardless of the wrapped entry.
func (alwaysDirEntry) IsDir() bool {
	return true
}
// archiveExtensions lists the file extensions of popular and supported
// archive types. The order is by popularity, and with respect to some
// extensions being prefixed by other extensions. All entries are
// lowercase and include the leading dot.
var archiveExtensions = []string{
	".zip", ".tar", ".tgz",
	".tar.gz", ".tar.bz2", ".tar.zst", ".tar.lz4",
	".tar.xz", ".tar.sz", ".tar.s2", ".tar.lz",
}
// PathContainsArchive returns true if the path contains an archive file (i.e.
// whether the path traverses into an archive) solely by lexical analysis (no
// reading of files or headers is performed). Such a path is not typically
// usable by the OS, but can be used by the DeepFS type. Slash must be the
// path component separator. Example: "/foo/example.zip/path/in/archive"
//
// A path component counts as an archive when it ends with one of the known
// archive extensions; this includes the final component, so a path that is
// itself an archive file also returns true. Extensions are matched
// case-insensitively, so "/foo/EXAMPLE.ZIP/file" is recognized as well.
func PathContainsArchive(path string) bool {
	// Lowercase once up front so matching is case-insensitive, the same way DeepFS
	// lowercases path segments when it splits a path. The trailing separator lets an
	// extension at the very end of the path match exactly like one in the middle.
	pathPlusSep := strings.ToLower(path) + "/"
	for _, ext := range archiveExtensions {
		if strings.Contains(pathPlusSep, ext+"/") {
			return true
		}
	}
	return false
}
// TopDirOpen is a variant of Open() that may be useful when a
// file system root was created by extracting an archive.
//
// The file name is tried as given first. If that returns an error,
// the name is tried again without the first element of its path;
// for example, if "a/b/c" returns an error, "b/c" is tried next.
// The result of that second attempt, error included, is what gets
// returned.
//
// The motivation: consider an archive that contains a file "a/b/c".
// When the archive is extracted, its contents may be written without
// a new parent/root folder to contain them, so the same file outside
// the archive may lack an exclusive root or parent container. A file
// system created for those extracted files is therefore likely to be
// rooted at one of the archive's top-level files/folders rather than
// at a parent folder. The file known as "a/b/c" when rooted at the
// archive becomes "b/c" after extraction when rooted at "a" on disk
// (because no new, exclusive top-level folder was created). This
// difference in paths can make it difficult to use archives and
// directories uniformly, and the TopDir* functions attempt to smooth
// over the difference.
//
// Some extraction utilities do create a container folder for the
// archive contents, in which case the user may give that path as the
// root. The TopDir* functions are not necessary then (but aren't
// harmful either). They are primarily useful if you are not sure
// whether the root is an archive file or an extracted archive file,
// as they work with the same filename/path inputs regardless of the
// presence of a top-level directory.
func TopDirOpen(fsys fs.FS, name string) (fs.File, error) {
	if file, err := fsys.Open(name); err == nil {
		return file, nil
	}
	// the name as given could not be opened; retry without its top-level element
	return fsys.Open(pathWithoutTopDir(name))
}
// TopDirStat is like TopDirOpen but for Stat: the name is tried as
// given, and if that returns an error, it is tried again without the
// first element of its path. The result of the second attempt is
// returned in that case.
func TopDirStat(fsys fs.FS, name string) (fs.FileInfo, error) {
	if info, err := fs.Stat(fsys, name); err == nil {
		return info, nil
	}
	// the name as given could not be stat'ed; retry without its top-level element
	return fs.Stat(fsys, pathWithoutTopDir(name))
}
// TopDirReadDir is like TopDirOpen but for ReadDir: the name is tried
// as given, and if that returns an error, it is tried again without
// the first element of its path. The result of the second attempt is
// returned in that case.
func TopDirReadDir(fsys fs.FS, name string) ([]fs.DirEntry, error) {
	if entries, err := fs.ReadDir(fsys, name); err == nil {
		return entries, nil
	}
	// the name as given could not be listed; retry without its top-level element
	return fs.ReadDir(fsys, pathWithoutTopDir(name))
}
func pathWithoutTopDir(fpath string) string {
slashIdx := strings.Index(fpath, "/")
if slashIdx < 0 {
return fpath