Skip to content

Commit 2d86eb7

Browse files
committed
cleanup
1 parent 4006728 commit 2d86eb7

File tree

3 files changed

+6
-17
lines changed

3 files changed

+6
-17
lines changed

backend/src/modules/doc_processing/entrypoints/doc_chunking_job.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,9 @@ def handle_pdf_chunking_job(
4747
logger.error(f"File {payload.filepath} does not exist!")
4848
raise Exception(f"File {payload.filepath} does not exist!")
4949

50-
# TODO: these extractions have varying compute requirements
50+
# TODO: these extractions have varying compute requirements when run
51+
# across multiple machines or using GPU
52+
5153
# Chunk the document
5254
if payload.filepath.suffix == ".txt":
5355
chunks = chunk_txt(payload)

backend/src/modules/doc_processing/html/spacy_job.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -251,7 +251,6 @@ def extract_span_annotations(
251251
def extract_tok_sent_data(
252252
spacy_output: SpacyPipelineOutput,
253253
) -> dict:
254-
# FIXME: take tokens/sentences from whisper and store audio token time offsets
255254
token_starts: list[int] = []
256255
token_ends: list[int] = []
257256
for token in spacy_output.tokens:

backend/src/modules/doc_processing/text/html_extraction_job.py

Lines changed: 3 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,8 @@ def handle_extract_html_job(
4040
logger.error(f"File {payload.filepath} does not exist!")
4141
raise Exception(f"File {payload.filepath} does not exist!")
4242

43-
# TODO: these extractions have varying compute requirements
43+
# TODO: these extractions have varying compute requirements when run across
44+
# multiple machines or if GPU is used for PDFs via Docling etc.
4445
if payload.filepath.suffix == ".txt":
4546
doc_html, extracted_images = extract_html_from_text(payload.filepath)
4647
elif payload.filepath.suffix == ".docx" or payload.filepath.suffix == ".doc":
@@ -56,21 +57,8 @@ def handle_extract_html_job(
5657
# Clean HTML (may use readability, always uses heuristics)
5758
html = clean_html(doc_html)
5859

59-
folder_id = payload.folder_id
60-
# if len(extracted_images) > 0:
61-
# with SQLRepo().db_session() as db:
62-
# folder = crud_folder.create(
63-
# db,
64-
# create_dto=FolderCreate(
65-
# project_id=payload.project_id,
66-
# folder_type=FolderType.SDOC_FOLDER,
67-
# name=payload.filepath.name,
68-
# parent_id=payload.folder_id,
69-
# ),
70-
# )
71-
# folder_id = folder.id
7260
return ExtractHTMLJobOutput(
73-
html=html, image_paths=extracted_images, folder_id=folder_id
61+
html=html, image_paths=extracted_images, folder_id=payload.folder_id
7462
)
7563

7664

0 commit comments

Comments
 (0)