-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Khaled Sulayman <[email protected]>
- Loading branch information
1 parent
0ad71c1
commit 61ed3d4
Showing
3 changed files
with
97 additions
and
85 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
from pathlib import Path | ||
from typing import List | ||
import json | ||
import re | ||
|
||
# Third Party | ||
from docling.datamodel.base_models import PipelineOptions | ||
from docling.document_converter import DocumentConverter | ||
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter | ||
|
||
# First Party | ||
from instructlab.sdg.utils.docprocessor import DocProcessor | ||
|
||
|
||
|
||
# Character overlap carried between consecutive chunks by the text splitter.
_DEFAULT_CHUNK_OVERLAP = 100
|
||
|
||
def _num_tokens_from_words(num_words) -> int: | ||
return int(num_words * 1.3) # 1 word ~ 1.3 token | ||
|
||
|
||
def _num_chars_from_tokens(num_tokens) -> int: | ||
return int(num_tokens * 4) # 1 token ~ 4 English character | ||
|
||
|
||
def chunk_markdowns(
    documents: List | str, server_ctx_size, chunk_word_count
) -> List[str]:
    """Naively chunk markdown documents based on the word count provided by the user.

    Args:
        documents (list): List of markdown documents.
            (NOTE(review): a bare str is accepted by the annotation but would be
            iterated character-by-character — confirm callers always pass a list.)
        server_ctx_size (int): Context window size of server.
        chunk_word_count (int): Maximum number of words to chunk a document.

    Returns:
        List[str]: List of chunked documents.

    Raises:
        ValueError: If the requested word count cannot fit in the server
            context window once a 1024-token buffer is reserved.
    """
    num_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
    # Reserve 1024 tokens of the context window as headroom.
    if num_tokens_per_doc > int(server_ctx_size - 1024):
        # Previously this was "Error: {}".format(str(f"...")) — a single
        # f-string produces the identical message without the redundant
        # str()/format() wrapping.
        raise ValueError(
            f"Error: Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
        )
    content = []
    chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
    chunk_overlap = _DEFAULT_CHUNK_OVERLAP

    # Using Markdown as default, document-specific chunking will be implemented in separate pr.
    md_text_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.MARKDOWN,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    for doc in documents:
        # Collapse runs of dashes before a pipe ("----|" -> "-|") so markdown
        # table separator rows don't waste chunk budget.
        doc = re.sub(r"-{2,}\|", "-|", doc)
        # Likewise collapse runs of spaces before a pipe ("   |" -> " |").
        doc = re.sub(r"\ +\|", " |", doc)
        temp = md_text_splitter.create_documents([doc])
        content.extend([item.page_content for item in temp])
    return content
|
||
|
||
def chunk_pdfs(pdf_docs: List, leaf_node_path: Path):
    """Semantically chunk PDF documents.

    Converts each PDF with docling, dumps each parsed document as a
    JSON-lines file under ~/docling-jsons, and wraps those files in a
    DocProcessor alongside the leaf node's qna.yaml.

    Args:
        pdf_docs (List): PDF documents to convert.
        leaf_node_path (Path): Directory of the taxonomy leaf node; its
            qna.yaml is handed to the DocProcessor.

    Returns:
        DocProcessor: Processor over the dumped docling JSON files.
            (NOTE(review): despite the name, this returns a DocProcessor,
            not a list of chunks — kept as-is so callers don't break.)

    TODO: full semantic chunking.
    """
    converter = DocumentConverter(pipeline_options=PipelineOptions())
    parsed_pdfs = converter.convert(pdf_docs)
    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

    # Bug fix: the raw "~" was never expanded, so writes targeted a literal
    # "./~/docling-jsons" path; expand it and create the directory up front
    # so open(..., "w") below cannot fail with FileNotFoundError.
    docling_jsons_path = Path("~/docling-jsons").expanduser()
    docling_jsons_path.mkdir(parents=True, exist_ok=True)

    # TODO name files better
    for i, pd in enumerate(parsed_dicts):
        fp = docling_jsons_path / f"docling_{i}.json"

        # One JSON object per line (JSON-lines); explicit encoding so output
        # doesn't depend on the platform default.
        with open(fp, "w", encoding="utf-8") as json_file:
            for entry in pd:
                json_file.write(json.dumps(entry) + "\n")

    chunked_pdfs = DocProcessor(
        parsed_doc_dir=docling_jsons_path,
        qna_yaml_path=leaf_node_path / "qna.yaml",
    )

    return chunked_pdfs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters