 )
 from modules.word_frequency.word_frequency_crud import crud_word_frequency
 from repos.db.sql_repo import SQLRepo
-from rq import get_current_job
 from scipy import sparse
 from sklearn.metrics.pairwise import manhattan_distances
-from systems.job_system.job_dto import EndpointGeneration, JobPriority
+from systems.job_system.job_dto import EndpointGeneration, Job, JobPriority
 from systems.job_system.job_register_decorator import register_job


 )
 def find_duplicates_job(
     payload: DuplicateFinderInput,
+    job: Job,
 ) -> DuplicateFinderOutput:
-    job = get_current_job()
-    assert job is not None, "Job must be running in a worker context"
-
-    job.meta["status_message"] = "Started duplicate finding"
-    job.save_meta()
+    job.update(status_message="Started duplicate finding")

     logger.info("Finding duplicate text sdocs")
     t0 = time.time()
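The `job.update(status_message=...)` helper replaces the old two-step `meta` assignment plus `save_meta()`, and the job is now injected as a parameter instead of looked up via `get_current_job()`. A minimal sketch of what such a wrapper might look like, assuming RQ remains the backend and that `Job` keeps a handle on the underlying RQ job (the `_rq_job` attribute and this constructor are assumptions, not the project's actual API):

```python
# Minimal sketch of a Job.update() helper, assuming it wraps RQ's
# meta/save_meta bookkeeping. The _rq_job attribute is hypothetical.
from rq.job import Job as RQJob


class Job:
    def __init__(self, rq_job: RQJob) -> None:
        self._rq_job = rq_job

    def update(self, status_message: str | None = None) -> None:
        # One call instead of the old meta assignment + save_meta() pair.
        if status_message is not None:
            self._rq_job.meta["status_message"] = status_message
            self._rq_job.save_meta()
```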
@@ -40,8 +36,7 @@ def find_duplicates_job(
         db, project_id=payload.project_id, doctype=DocType.text
     )
     t1 = time.time()
-    job.meta["status_message"] = "Fetched word frequencies from database"
-    job.save_meta()
+    job.update(status_message="Fetched word frequencies from database")
     logger.info(f"query took: {t1 - t0}")

     t0 = time.time()
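The diff elides the vectorization code between this hunk and the next, where the `values`, `index`, and `indices` triplets consumed by the sparse constructor are built. A plausible sketch of that step, assuming the word-frequency rows carry `sdoc_id`, `word`, and `count` fields (that row shape is an assumption; `idx2sdoc_id`, `values`, `index`, `indices`, and `vocab_size` match names visible in the diff):

```python
# Hypothetical reconstruction of the elided vectorization step: turn
# (sdoc_id, word, count) rows into COO triplets for the sparse matrix.
# The word_freqs row format is an assumption.
idx2sdoc_id: list[int] = []
sdoc_id2idx: dict[int, int] = {}
word2idx: dict[str, int] = {}
values: list[int] = []
index: list[int] = []    # row coordinates (documents)
indices: list[int] = []  # column coordinates (words)

for wf in word_freqs:
    if wf.sdoc_id not in sdoc_id2idx:
        sdoc_id2idx[wf.sdoc_id] = len(idx2sdoc_id)
        idx2sdoc_id.append(wf.sdoc_id)
    col = word2idx.setdefault(wf.word, len(word2idx))
    index.append(sdoc_id2idx[wf.sdoc_id])
    indices.append(col)
    values.append(wf.count)

vocab_size = len(word2idx)
```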
@@ -75,8 +70,7 @@ def find_duplicates_job(
         (values, (index, indices)), shape=(len(idx2sdoc_id), vocab_size)
     )
     t1 = time.time()
-    job.meta["status_message"] = "Created document word vectors"
-    job.save_meta()
+    job.update(status_message="Created document word vectors")
     logger.info(f"document vector creation took: {t1 - t0}")
     logger.info(f"vocab size: {vocab_size}")
     logger.info(f"document_vectors shape: {document_vectors.shape}")
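The `(values, (index, indices))` argument is SciPy's COO-style constructor: entry `k` puts `values[k]` at row `index[k]`, column `indices[k]`, and duplicate coordinates are summed. A self-contained toy example of the same pattern:

```python
import numpy as np
from scipy import sparse

# Three documents, a vocabulary of four words. Each (row, col, value)
# triplet records one word count.
values = np.array([2, 1, 3, 1])
rows = np.array([0, 0, 1, 2])  # document index
cols = np.array([1, 3, 0, 2])  # word index

doc_vectors = sparse.csr_matrix((values, (rows, cols)), shape=(3, 4))
print(doc_vectors.toarray())
# [[0 2 0 1]
#  [3 0 0 0]
#  [0 0 1 0]]
```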
@@ -85,8 +79,7 @@ def find_duplicates_job(
     t0 = time.time()
     word_dists = manhattan_distances(document_vectors, document_vectors)
     t1 = time.time()
-    job.meta["status_message"] = "Computed distances between documents"
-    job.save_meta()
+    job.update(status_message="Computed distances between documents")
     logger.info(f"manhattan distance took: {t1 - t0}")

     # mask out self distances and one half of the matrix
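`manhattan_distances` returns the full symmetric matrix of pairwise L1 distances, so the diagonal (each document against itself) and one triangle are redundant, which is what the comment above refers to. One plausible way to do that masking, not necessarily the author's (the masking code itself is elided from the diff):

```python
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

X = np.array([[0, 2, 0, 1],
              [3, 0, 0, 0],
              [0, 2, 0, 1]])  # document 2 duplicates document 0

dists = manhattan_distances(X, X)  # symmetric (3, 3) matrix of L1 distances

# Keep only the strict upper triangle so each unordered pair of
# documents is considered exactly once and self distances are dropped.
mask = np.triu(np.ones_like(dists, dtype=bool), k=1)
print(dists[mask])  # pairs (0,1), (0,2), (1,2) -> [6. 0. 6.]
```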
@@ -103,8 +96,7 @@ def find_duplicates_job(
         )
     ).tolist()
     t1 = time.time()
-    job.meta["status_message"] = "Identified duplicate pairs"
-    job.save_meta()
+    job.update(status_message="Identified duplicate pairs")
     logger.info(f"finding duplicates took: {t1 - t0}")

     # map back to sdoc_ids
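The pair extraction feeding that `.tolist()` is mostly elided; the comment above indicates the resulting index pairs are then translated back through `idx2sdoc_id`. A hedged sketch of both steps, reusing the `mask` from the sketch above (the `threshold` parameter and the `np.argwhere` call are assumptions, not the author's exact code):

```python
# Hypothetical sketch of the elided pair extraction and the
# index -> sdoc_id mapping; threshold and argwhere are assumptions.
import numpy as np

threshold = 0  # maximum L1 distance to still count as a duplicate

# Indices (i, j) of masked entries at or below the threshold.
duplicate_pairs = np.argwhere(mask & (word_dists <= threshold)).tolist()

# map back to sdoc_ids
duplicate_sdoc_ids = [
    (idx2sdoc_id[i], idx2sdoc_id[j]) for i, j in duplicate_pairs
]
```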
@@ -120,8 +112,7 @@ def find_duplicates_job(
     G = G.to_undirected()  # to_undirected() returns a copy; unassigned it was a no-op
     subgraph_nodes = [list(subgraph) for subgraph in nx.connected_components(G)]
     t1 = time.time()
-    job.meta["status_message"] = "Finished finding duplicates!"
-    job.save_meta()
+    job.update(status_message="Finished finding duplicates!")
     logger.info(f"graph grouping took: {t1 - t0}")

     return DuplicateFinderOutput(duplicates=subgraph_nodes)
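The grouping step treats each duplicate pair as an edge and takes connected components, so transitively linked documents collapse into one cluster even when they were never directly compared. A self-contained illustration of that grouping:

```python
import networkx as nx

# Pairs 1-2 and 2-3 chain into a single group even though documents
# 1 and 3 were never directly paired; 7-9 stays separate.
pairs = [(1, 2), (2, 3), (7, 9)]

G = nx.Graph()
G.add_edges_from(pairs)
groups = [list(component) for component in nx.connected_components(G)]
print(groups)  # [[1, 2, 3], [7, 9]]
```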