Cleanup: clarify compute-requirement TODOs, remove a whisper FIXME and commented-out folder-creation code

fynnos · fynnos · commit 2d86eb7fd246 · 2025-08-18T13:45:21.000Z
diff --git a/backend/src/modules/doc_processing/entrypoints/doc_chunking_job.py b/backend/src/modules/doc_processing/entrypoints/doc_chunking_job.py
@@ -47,7 +47,9 @@ def handle_pdf_chunking_job(
         logger.error(f"File {payload.filepath} does not exist!")
         raise Exception(f"File {payload.filepath} does not exist!")
 
-    # TODO: these extractions have varying compute requirements
+    # TODO: these extractions have varying compute requirements when run
+    #       across multiple machines or when using a GPU
+
     # Chunk the document
     if payload.filepath.suffix == ".txt":
         chunks = chunk_txt(payload)
diff --git a/backend/src/modules/doc_processing/html/spacy_job.py b/backend/src/modules/doc_processing/html/spacy_job.py
@@ -251,7 +251,6 @@ def extract_span_annotations(
 def extract_tok_sent_data(
     spacy_output: SpacyPipelineOutput,
 ) -> dict:
-    # FIXME: take tokens/sentences from whisper and store audio token time offsets
     token_starts: list[int] = []
     token_ends: list[int] = []
     for token in spacy_output.tokens:
diff --git a/backend/src/modules/doc_processing/text/html_extraction_job.py b/backend/src/modules/doc_processing/text/html_extraction_job.py
@@ -40,7 +40,8 @@ def handle_extract_html_job(
         logger.error(f"File {payload.filepath} does not exist!")
         raise Exception(f"File {payload.filepath} does not exist!")
 
-    # TODO: these extractions have varying compute requirements
+    # TODO: these extractions have varying compute requirements when run across
+    #       multiple machines or if a GPU is used for PDFs (e.g. via Docling).
     if payload.filepath.suffix == ".txt":
         doc_html, extracted_images = extract_html_from_text(payload.filepath)
     elif payload.filepath.suffix == ".docx" or payload.filepath.suffix == ".doc":
@@ -56,21 +57,8 @@ def handle_extract_html_job(
     # Clean HTML (may use readability, always uses heuristics)
     html = clean_html(doc_html)
 
-    folder_id = payload.folder_id
-    # if len(extracted_images) > 0:
-    #     with SQLRepo().db_session() as db:
-    #         folder = crud_folder.create(
-    #             db,
-    #             create_dto=FolderCreate(
-    #                 project_id=payload.project_id,
-    #                 folder_type=FolderType.SDOC_FOLDER,
-    #                 name=payload.filepath.name,
-    #                 parent_id=payload.folder_id,
-    #             ),
-    #         )
-    #         folder_id = folder.id
     return ExtractHTMLJobOutput(
-        html=html, image_paths=extracted_images, folder_id=folder_id
+        html=html, image_paths=extracted_images, folder_id=payload.folder_id
     )