Skip to content

Commit a0377f7

Browse files
committed
feat(file_scanner): prevent unnecessary hashing of large files
1 parent 13dcdfa commit a0377f7

File tree

1 file changed

+61
-15
lines changed

1 file changed

+61
-15
lines changed

src/dwarfs/file_scanner.cpp

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
#include <folly/String.h>
2727
#include <folly/container/F14Map.h>
2828

29+
#include <fmt/format.h>
30+
31+
#include "dwarfs/checksum.h"
2932
#include "dwarfs/entry.h"
3033
#include "dwarfs/file_scanner.h"
3134
#include "dwarfs/inode.h"
@@ -41,6 +44,9 @@ namespace dwarfs::detail {
4144

4245
namespace {
4346

47+
constexpr size_t const kLargeFileThreshold = 1024 * 1024;
48+
constexpr size_t const kLargeFileStartHashSize = 4096;
49+
4450
template <typename LoggerPolicy>
4551
class file_scanner_ final : public file_scanner::impl {
4652
public:
@@ -94,26 +100,34 @@ class file_scanner_ final : public file_scanner::impl {
94100
uint32_t num_unique_{0};
95101
folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
96102
std::mutex mx_;
97-
folly::F14FastMap<uint64_t, inode::files_vector> unique_size_;
103+
// The pair stores the file size and optionally a hash of the first
104+
// 4 KiB of the file. If there's a collision, the worst that can
105+
happen is that we unnecessarily hash a file that is not a duplicate.
106+
folly::F14FastMap<std::pair<uint64_t, uint64_t>, inode::files_vector>
107+
unique_size_;
108+
// We need this lookup table to later find the unique_size_ entry
109+
// given just a file pointer.
110+
folly::F14FastMap<file const*, uint64_t> file_start_hash_;
98111
folly::F14FastMap<uint64_t, std::shared_ptr<condition_barrier>>
99112
first_file_hashed_;
100113
folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
101114
folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
102115
};
103116

104117
// The `unique_size_` table holds an entry for each file size we
105-
// discover:
118+
// discover, and optionally - for large files - an XXH3 hash of the
119+
// first 4 KiB of the file.
106120
//
107-
// - When we first discover a new file size, we know for sure that
108-
// this file is *not* a duplicate of a file we've seen before.
109-
// Thus, we can immediately create a new inode, and we can
121+
// - When we first discover a new file size (+hash), we know for
122+
// sure that this file is *not* a duplicate of a file we've seen
123+
// before. Thus, we can immediately create a new inode, and we can
110124
// immediately start similarity scanning for this inode.
111125
//
112-
// - When we discover the second file of particular size, we must
113-
// hash both files to see if they're identical. We already have
114-
// an inode for the first file, so we must delay the creation of
115-
// a new inode until we know that the second file is not a
116-
// duplicate.
126+
// - When we discover the second file of a particular size (+hash), we
127+
// must fully hash both files (using the user-provided algorithm)
128+
// to see if they're identical. We already have an inode for the
129+
// first file, so we must delay the creation of a new inode until
130+
// we know that the second file is not a duplicate.
117131
//
118132
// - Exactly the same applies for subsequent files.
119133
//
@@ -129,6 +143,15 @@ class file_scanner_ final : public file_scanner::impl {
129143
// stored. As long as the first file's hash has not been stored,
130144
// it is still present in `unique_size_`. It will be removed
131145
// from `unique_size_` after its hash has been stored.
146+
//
147+
// - The optional hash value of the first 4 KiB of a large file is
148+
// useful if there are a lot of large files with the same size.
149+
// One potential scenario is uncompressed images which are very
150+
// likely to have the same size, but very unlikely to have the
151+
// same contents. The choice of 4 KiB is arbitrary, as is the
152+
// threshold of 1 MiB for "large files". The 4 KiB hash is computed
153+
// synchronously, so this could be a potential bottleneck; however,
154+
// it should happen rarely enough to not be a problem.
132155

133156
template <typename LoggerPolicy>
134157
file_scanner_<LoggerPolicy>::file_scanner_(
@@ -143,6 +166,8 @@ file_scanner_<LoggerPolicy>::file_scanner_(
143166

144167
template <typename LoggerPolicy>
145168
void file_scanner_<LoggerPolicy>::scan(file* p) {
169+
// This method is supposed to be called from a single thread only.
170+
146171
if (p->num_hard_links() > 1) {
147172
auto& vec = hardlinks_[p->raw_inode_num()];
148173
vec.push_back(p);
@@ -178,11 +203,12 @@ void file_scanner_<LoggerPolicy>::finalize(uint32_t& inode_num) {
178203

179204
if (hash_algo_) {
180205
finalize_hardlinks([this](file const* p) -> inode::files_vector& {
181-
auto it = by_hash_.find(p->hash());
182-
if (it != by_hash_.end()) {
206+
if (auto it = by_hash_.find(p->hash()); it != by_hash_.end()) {
183207
return it->second;
184208
}
185-
return unique_size_.at(p->size());
209+
auto it = file_start_hash_.find(p);
210+
uint64_t hash = it != file_start_hash_.end() ? it->second : 0;
211+
return unique_size_.at({p->size(), hash});
186212
});
187213
finalize_files<true>(unique_size_, inode_num, obj_num);
188214
finalize_files(by_raw_inode_, inode_num, obj_num);
@@ -199,8 +225,26 @@ template <typename LoggerPolicy>
199225
void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
200226
// We need no lock yet, as `unique_size_` is only manipulated from
201227
// this thread.
202-
auto size = p->size();
203-
auto [it, is_new] = unique_size_.emplace(size, inode::files_vector());
228+
uint64_t size = p->size();
229+
uint64_t start_hash{0};
230+
231+
if (size >= kLargeFileThreshold && !p->is_invalid()) {
232+
try {
233+
auto mm = os_.map_file(p->fs_path(), kLargeFileStartHashSize);
234+
checksum cs(checksum::algorithm::XXH3_64);
235+
cs.update(mm->addr(), kLargeFileStartHashSize);
236+
cs.finalize(&start_hash);
237+
} catch (...) {
238+
LOG_ERROR << "failed to map file " << p->path_as_string() << ": "
239+
<< folly::exceptionStr(std::current_exception())
240+
<< ", creating empty file";
241+
++prog_.errors;
242+
p->set_invalid();
243+
}
244+
}
245+
246+
auto [it, is_new] = unique_size_.emplace(std::make_pair(size, start_hash),
247+
inode::files_vector());
204248

205249
if (is_new) {
206250
// A file size that has never been seen before. We can safely
@@ -263,6 +307,8 @@ void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
263307
cv->notify();
264308
});
265309

310+
// Clear files vector, but don't delete the hash table entry,
311+
// to indicate that files of this size *must* be hashed.
266312
it->second.clear();
267313
}
268314

0 commit comments

Comments
 (0)