2626#include < folly/String.h>
2727#include < folly/container/F14Map.h>
2828
29+ #include < fmt/format.h>
30+
31+ #include " dwarfs/checksum.h"
2932#include " dwarfs/entry.h"
3033#include " dwarfs/file_scanner.h"
3134#include " dwarfs/inode.h"
@@ -41,6 +44,9 @@ namespace dwarfs::detail {
4144
4245namespace {
4346
47+ constexpr size_t const kLargeFileThreshold = 1024 * 1024 ;
48+ constexpr size_t const kLargeFileStartHashSize = 4096 ;
49+
4450template <typename LoggerPolicy>
4551class file_scanner_ final : public file_scanner::impl {
4652 public:
@@ -94,26 +100,34 @@ class file_scanner_ final : public file_scanner::impl {
94100 uint32_t num_unique_{0 };
95101 folly::F14FastMap<uint64_t , inode::files_vector> hardlinks_;
96102 std::mutex mx_;
97- folly::F14FastMap<uint64_t , inode::files_vector> unique_size_;
103+ // The pair stores the file size and optionally a hash of the first
104+ // 4 KiB of the file. If there's a collision, the worst that can
103+ // happen is that we unnecessarily hash a file that is not a duplicate.
106+ folly::F14FastMap<std::pair<uint64_t , uint64_t >, inode::files_vector>
107+ unique_size_;
108+ // We need this lookup table to later find the unique_size_ entry
109+ // given just a file pointer.
110+ folly::F14FastMap<file const *, uint64_t > file_start_hash_;
98111 folly::F14FastMap<uint64_t , std::shared_ptr<condition_barrier>>
99112 first_file_hashed_;
100113 folly::F14FastMap<uint64_t , inode::files_vector> by_raw_inode_;
101114 folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
102115};
103116
104117// The `unique_size_` table holds an entry for each file size we
105- // discover:
118+ // discover, and optionally - for large files - an XXH3 hash of the
119+ // first 4 KiB of the file.
106120//
107- // - When we first discover a new file size, we know for sure that
108- // this file is *not* a duplicate of a file we've seen before.
109- // Thus, we can immediately create a new inode, and we can
121+ // - When we first discover a new file size (+hash), we know for
122+ // sure that this file is *not* a duplicate of a file we've seen
123+ // before. Thus, we can immediately create a new inode, and we can
110124// immediately start similarity scanning for this inode.
111125//
112- // - When we discover the second file of particular size, we must
113- // hash both files to see if they're identical. We already have
114- // an inode for the first file, so we must delay the creation of
115- // a new inode until we know that the second file is not a
116- // duplicate.
126+ // - When we discover the second file of a particular size (+hash), we
127+ // must fully hash both files (using the user-provided algorithm)
128+ // to see if they're identical. We already have an inode for the
129+ // first file, so we must delay the creation of a new inode until
130+ // we know that the second file is not a duplicate.
117131//
118132// - Exactly the same applies for subsequent files.
119133//
@@ -129,6 +143,15 @@ class file_scanner_ final : public file_scanner::impl {
129143// stored. As long as the first file's hash has not been stored,
130144// it is still present in `unique_size_`. It will be removed
131145// from `unique_size_` after its hash has been stored.
146+ //
147+ // - The optional hash value of the first 4 KiB of a large file is
148+ // useful if there are a lot of large files with the same size.
149+ // One potential scenario is uncompressed images which are very
150+ // likely to have the same size, but very unlikely to have the
151+ // same contents. The choice of 4 KiB is arbitrary, as is the
152+ // threshold of 1 MiB for "large files". The 4 KiB hash is computed
153+ // synchronously, so this could be a potential bottleneck; however,
154+ // it should happen rarely enough to not be a problem.
132155
133156template <typename LoggerPolicy>
134157file_scanner_<LoggerPolicy>::file_scanner_(
@@ -143,6 +166,8 @@ file_scanner_<LoggerPolicy>::file_scanner_(
143166
144167template <typename LoggerPolicy>
145168void file_scanner_<LoggerPolicy>::scan(file* p) {
169+ // This method is supposed to be called from a single thread only.
170+
146171 if (p->num_hard_links () > 1 ) {
147172 auto & vec = hardlinks_[p->raw_inode_num ()];
148173 vec.push_back (p);
@@ -178,11 +203,12 @@ void file_scanner_<LoggerPolicy>::finalize(uint32_t& inode_num) {
178203
179204 if (hash_algo_) {
180205 finalize_hardlinks ([this ](file const * p) -> inode::files_vector& {
181- auto it = by_hash_.find (p->hash ());
182- if (it != by_hash_.end ()) {
206+ if (auto it = by_hash_.find (p->hash ()); it != by_hash_.end ()) {
183207 return it->second ;
184208 }
185- return unique_size_.at (p->size ());
209+ auto it = file_start_hash_.find (p);
210+ uint64_t hash = it != file_start_hash_.end () ? it->second : 0 ;
211+ return unique_size_.at ({p->size (), hash});
186212 });
187213 finalize_files<true >(unique_size_, inode_num, obj_num);
188214 finalize_files (by_raw_inode_, inode_num, obj_num);
@@ -199,8 +225,26 @@ template <typename LoggerPolicy>
199225void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
200226 // We need no lock yet, as `unique_size_` is only manipulated from
201227 // this thread.
202- auto size = p->size ();
203- auto [it, is_new] = unique_size_.emplace (size, inode::files_vector ());
228+ uint64_t size = p->size ();
229+ uint64_t start_hash{0 };
230+
231+ if (size >= kLargeFileThreshold && !p->is_invalid ()) {
232+ try {
233+ auto mm = os_.map_file (p->fs_path (), kLargeFileStartHashSize );
234+ checksum cs (checksum::algorithm::XXH3_64);
235+ cs.update (mm->addr (), kLargeFileStartHashSize );
236+ cs.finalize (&start_hash);
237+ } catch (...) {
238+ LOG_ERROR << " failed to map file " << p->path_as_string () << " : "
239+ << folly::exceptionStr (std::current_exception ())
240+ << " , creating empty file" ;
241+ ++prog_.errors ;
242+ p->set_invalid ();
243+ }
244+ }
245+
246+ auto [it, is_new] = unique_size_.emplace (std::make_pair (size, start_hash),
247+ inode::files_vector ());
204248
205249 if (is_new) {
206250 // A file size that has never been seen before. We can safely
@@ -263,6 +307,8 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
263307 cv->notify ();
264308 });
265309
310+ // Clear files vector, but don't delete the hash table entry,
311+ // to indicate that files of this size *must* be hashed.
266312 it->second .clear ();
267313 }
268314
0 commit comments