@@ -395,16 +395,24 @@ pub fn bytes_to_u32s(bytes: Vec<u8>) -> Result<Vec<u32>> {
     Ok(u32s)
 }
 
-pub async fn mark_duplicate_files(
+pub async fn mark_duplicate_files<F>(
     db: &DatabaseConnection,
     similarity_threshold: f32,
-) -> Result<usize> {
+    progress_callback: F,
+) -> Result<usize>
+where
+    F: Fn(usize, usize) + Send + Sync + 'static,
+{
+    let progress_callback = Arc::new(progress_callback);
+
     info!(
         "Starting duplicate detection with similarity threshold: {}",
         similarity_threshold
     );
 
     // Step 1: Get all file similarity pairs above the threshold
+    progress_callback(0, 3); // 3 main stages: getting data, grouping, marking
+
     let similarities = MediaFileSimilarity::find()
         .filter(media_file_similarity::Column::Similarity.gte(similarity_threshold))
         .all(db)
@@ -416,21 +424,35 @@ pub async fn mark_duplicate_files(
416424 "No similar files found above threshold {}" ,
417425 similarity_threshold
418426 ) ;
427+ progress_callback ( 3 , 3 ) ; // Complete all stages
419428 return Ok ( 0 ) ;
420429 }
421430
422431 info ! (
423432 "Found {} similar file pairs above threshold" ,
424433 similarities. len( )
425434 ) ;
435+ progress_callback ( 1 , 3 ) ; // Completed first stage
426436
427437 // Step 2: Group files into clusters of similar content
428438 let file_groups = group_similar_files ( & similarities) ;
429439 info ! ( "Created {} groups of similar files" , file_groups. len( ) ) ;
440+ progress_callback ( 2 , 3 ) ; // Completed second stage
430441
431442 // Step 3: For each group, keep the highest sample rate file and mark others as duplicates
432- let marked_count = mark_duplicates ( db, file_groups) . await ?;
443+ let total_groups = file_groups. len ( ) ;
444+ let progress_callback_for_marking = {
445+ let progress_callback = Arc :: clone ( & progress_callback) ;
446+ move |current : usize , _: usize | {
447+ // Map group progress to overall progress (from 2 to 3)
448+ let overall_progress = 2.0 + ( current as f32 / total_groups as f32 ) ;
449+ progress_callback ( overall_progress. floor ( ) as usize , 3 ) ;
450+ }
451+ } ;
452+
453+ let marked_count = mark_duplicates ( db, file_groups, progress_callback_for_marking) . await ?;
433454 info ! ( "Marked {} files as duplicates" , marked_count) ;
455+ progress_callback ( 3 , 3 ) ; // Completed all stages
434456
435457 Ok ( marked_count)
436458}
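The `progress_callback_for_marking` closure above rescales the marking step's per-group progress into the caller's three-stage scale before forwarding it through the shared `Arc`. A minimal, standalone sketch of that Arc-plus-adapter-closure pattern (the names and the 4-item sub-task are illustrative only, not taken from the commit):

use std::sync::Arc;

// Sub-task that reports per-item progress through a caller-supplied callback.
fn run_subtask<F: Fn(usize, usize)>(items: usize, progress: F) {
    for done in 1..=items {
        progress(done, items);
    }
}

fn main() {
    // Shared top-level callback; Arc lets both the parent and the adapter own it.
    let overall = Arc::new(|current: usize, total: usize| {
        println!("overall progress: {current}/{total}");
    });

    overall(2, 3); // stages 1 and 2 (fetching, grouping) already finished

    let total_items = 4;
    let adapter = {
        let overall = Arc::clone(&overall);
        move |current: usize, _total: usize| {
            // Map the sub-task's progress onto the final stage (2 -> 3),
            // mirroring the `progress_callback_for_marking` closure above.
            let mapped = 2.0 + (current as f32 / total_items as f32);
            overall(mapped.floor() as usize, 3);
        }
    };

    run_subtask(total_items, adapter);
    overall(3, 3); // all stages complete
}

Cloning the `Arc` rather than moving the original keeps the outer function able to report its own stage transitions after the sub-task returns.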
@@ -485,11 +507,20 @@ fn group_similar_files(similarities: &[media_file_similarity::Model]) -> Vec<Vec
     groups
 }
 
-async fn mark_duplicates(db: &DatabaseConnection, file_groups: Vec<Vec<i32>>) -> Result<usize> {
+async fn mark_duplicates<F>(
+    db: &DatabaseConnection,
+    file_groups: Vec<Vec<i32>>,
+    progress_callback: F,
+) -> Result<usize>
+where
+    F: Fn(usize, usize) + Send + Sync + 'static,
+{
     let mut total_marked = 0;
+    let total_groups = file_groups.len();
 
-    for group in file_groups {
+    for (group_index, group) in file_groups.into_iter().enumerate() {
         if group.len() <= 1 {
+            progress_callback(group_index + 1, total_groups);
             continue;
         }
 
@@ -534,6 +565,8 @@ async fn mark_duplicates(db: &DatabaseConnection, file_groups: Vec<Vec<i32>>) ->
                 );
             }
         }
+
+        progress_callback(group_index + 1, total_groups);
     }
 
     Ok(total_marked)
@@ -555,23 +588,34 @@ pub async fn get_duplicate_files(db: &DatabaseConnection) -> Result<Vec<media_fi
 }
 
 // Function to reset duplicate marks
-pub async fn reset_duplicate_marks(db: &DatabaseConnection) -> Result<usize> {
+pub async fn reset_duplicate_marks<F>(
+    db: &DatabaseConnection,
+    progress_callback: F,
+) -> Result<usize>
+where
+    F: Fn(usize, usize) + Send + Sync + 'static,
+{
     let fingerprints = MediaFileFingerprint::find()
         .filter(media_file_fingerprint::Column::IsDuplicated.eq(1))
         .all(db)
         .await
         .context("Failed to retrieve marked fingerprints")?;
 
+    let total_fingerprints = fingerprints.len();
+    progress_callback(0, total_fingerprints);
+
     let mut updated_count = 0;
 
-    for fp in fingerprints {
+    for (index, fp) in fingerprints.into_iter().enumerate() {
         let mut fp_active: media_file_fingerprint::ActiveModel = fp.into();
         fp_active.is_duplicated = ActiveValue::Set(0); // Reset duplicate mark
         fp_active
             .update(db)
             .await
             .context("Failed to reset duplicate mark")?;
         updated_count += 1;
+
+        progress_callback(index + 1, total_fingerprints);
     }
 
     info!("Reset duplicate marks for {} files", updated_count);
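Every caller now has to supply the extra closure argument. A hypothetical call site, assumed to live in the same module as the functions above (so `mark_duplicate_files`, `reset_duplicate_marks`, `DatabaseConnection`, `Result`, and `info!` are already in scope); the 0.85 threshold is only an example value:

// Hypothetical callers of the new progress-aware signatures; not part of the commit.
pub async fn detect_duplicates(db: &DatabaseConnection) -> Result<usize> {
    // 0.85 is an arbitrary example threshold.
    mark_duplicate_files(db, 0.85, |stage, stages| {
        info!("duplicate detection: stage {}/{}", stage, stages);
    })
    .await
}

// Undo the marks later, reporting per-fingerprint progress.
pub async fn undo_duplicate_marks(db: &DatabaseConnection) -> Result<usize> {
    // A non-capturing closure is `Send + Sync + 'static`, so it satisfies the
    // `F` bound without any extra wrapping.
    reset_duplicate_marks(db, |done, total| {
        info!("resetting duplicate marks: {}/{}", done, total);
    })
    .await
}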