Skip to content

Commit a591dd7

Browse files
committed
feat: enhance duplicate file marking with progress tracking and callback support
1 parent c91b481 commit a591dd7

File tree

1 file changed

+51
-7
lines changed

1 file changed

+51
-7
lines changed

database/src/actions/fingerprint.rs

Lines changed: 51 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -395,16 +395,24 @@ pub fn bytes_to_u32s(bytes: Vec<u8>) -> Result<Vec<u32>> {
395395
Ok(u32s)
396396
}
397397

398-
pub async fn mark_duplicate_files(
398+
pub async fn mark_duplicate_files<F>(
399399
db: &DatabaseConnection,
400400
similarity_threshold: f32,
401-
) -> Result<usize> {
401+
progress_callback: F,
402+
) -> Result<usize>
403+
where
404+
F: Fn(usize, usize) + Send + Sync + 'static,
405+
{
406+
let progress_callback = Arc::new(progress_callback);
407+
402408
info!(
403409
"Starting duplicate detection with similarity threshold: {}",
404410
similarity_threshold
405411
);
406412

407413
// Step 1: Get all file similarity pairs above the threshold
414+
progress_callback(0, 3); // 3 main stages: getting data, grouping, marking
415+
408416
let similarities = MediaFileSimilarity::find()
409417
.filter(media_file_similarity::Column::Similarity.gte(similarity_threshold))
410418
.all(db)
@@ -416,21 +424,35 @@ pub async fn mark_duplicate_files(
416424
"No similar files found above threshold {}",
417425
similarity_threshold
418426
);
427+
progress_callback(3, 3); // Complete all stages
419428
return Ok(0);
420429
}
421430

422431
info!(
423432
"Found {} similar file pairs above threshold",
424433
similarities.len()
425434
);
435+
progress_callback(1, 3); // Completed first stage
426436

427437
// Step 2: Group files into clusters of similar content
428438
let file_groups = group_similar_files(&similarities);
429439
info!("Created {} groups of similar files", file_groups.len());
440+
progress_callback(2, 3); // Completed second stage
430441

431442
// Step 3: For each group, keep the highest sample rate file and mark others as duplicates
432-
let marked_count = mark_duplicates(db, file_groups).await?;
443+
let total_groups = file_groups.len();
444+
let progress_callback_for_marking = {
445+
let progress_callback = Arc::clone(&progress_callback);
446+
move |current: usize, _: usize| {
447+
// Map group progress to overall progress (from 2 to 3)
448+
let overall_progress = 2.0 + (current as f32 / total_groups as f32);
449+
progress_callback(overall_progress.floor() as usize, 3);
450+
}
451+
};
452+
453+
let marked_count = mark_duplicates(db, file_groups, progress_callback_for_marking).await?;
433454
info!("Marked {} files as duplicates", marked_count);
455+
progress_callback(3, 3); // Completed all stages
434456

435457
Ok(marked_count)
436458
}
@@ -485,11 +507,20 @@ fn group_similar_files(similarities: &[media_file_similarity::Model]) -> Vec<Vec
485507
groups
486508
}
487509

488-
async fn mark_duplicates(db: &DatabaseConnection, file_groups: Vec<Vec<i32>>) -> Result<usize> {
510+
async fn mark_duplicates<F>(
511+
db: &DatabaseConnection,
512+
file_groups: Vec<Vec<i32>>,
513+
progress_callback: F,
514+
) -> Result<usize>
515+
where
516+
F: Fn(usize, usize) + Send + Sync + 'static,
517+
{
489518
let mut total_marked = 0;
519+
let total_groups = file_groups.len();
490520

491-
for group in file_groups {
521+
for (group_index, group) in file_groups.into_iter().enumerate() {
492522
if group.len() <= 1 {
523+
progress_callback(group_index + 1, total_groups);
493524
continue;
494525
}
495526

@@ -534,6 +565,8 @@ async fn mark_duplicates(db: &DatabaseConnection, file_groups: Vec<Vec<i32>>) ->
534565
);
535566
}
536567
}
568+
569+
progress_callback(group_index + 1, total_groups);
537570
}
538571

539572
Ok(total_marked)
@@ -555,23 +588,34 @@ pub async fn get_duplicate_files(db: &DatabaseConnection) -> Result<Vec<media_fi
555588
}
556589

557590
// Function to reset duplicate marks
558-
pub async fn reset_duplicate_marks(db: &DatabaseConnection) -> Result<usize> {
591+
pub async fn reset_duplicate_marks<F>(
592+
db: &DatabaseConnection,
593+
progress_callback: F,
594+
) -> Result<usize>
595+
where
596+
F: Fn(usize, usize) + Send + Sync + 'static,
597+
{
559598
let fingerprints = MediaFileFingerprint::find()
560599
.filter(media_file_fingerprint::Column::IsDuplicated.eq(1))
561600
.all(db)
562601
.await
563602
.context("Failed to retrieve marked fingerprints")?;
564603

604+
let total_fingerprints = fingerprints.len();
605+
progress_callback(0, total_fingerprints);
606+
565607
let mut updated_count = 0;
566608

567-
for fp in fingerprints {
609+
for (index, fp) in fingerprints.into_iter().enumerate() {
568610
let mut fp_active: media_file_fingerprint::ActiveModel = fp.into();
569611
fp_active.is_duplicated = ActiveValue::Set(0); // Reset duplicate mark
570612
fp_active
571613
.update(db)
572614
.await
573615
.context("Failed to reset duplicate mark")?;
574616
updated_count += 1;
617+
618+
progress_callback(index + 1, total_fingerprints);
575619
}
576620

577621
info!("Reset duplicate marks for {} files", updated_count);

0 commit comments

Comments
 (0)