From 61ed3d4f912e0c9e82bd9b77ace1aa5c55e7561b Mon Sep 17 00:00:00 2001
From: Khaled Sulayman
Date: Thu, 26 Sep 2024 13:44:09 -0400
Subject: [PATCH] separate chunking utils

Signed-off-by: Khaled Sulayman
---
Review notes (this section is ignored when the patch is applied):
- Line structure and indentation were reconstructed from the hunk counts;
  verify whitespace against the target tree before applying.
- chunking.py now imports chunking2 with an explicit relative import, the
  same form docprocessor.py uses.
- chunking2.py imports DocProcessor inside chunk_pdfs, because docprocessor
  imports chunk_markdowns from chunking2 at module level.
- chunk_pdfs no longer prints a debug banner, expands "~" and creates the
  output directory before writing.
- chunk_markdowns accepts a single string as one document.
- The "index" blob hashes below are from the original patch and are stale.

 src/instructlab/sdg/utils/chunking.py     |  85 +------------
 src/instructlab/sdg/utils/chunking2.py    | 118 ++++++++++++++++++
 src/instructlab/sdg/utils/docprocessor.py |   2 +-
 3 files changed, 120 insertions(+), 85 deletions(-)
 create mode 100644 src/instructlab/sdg/utils/chunking2.py

diff --git a/src/instructlab/sdg/utils/chunking.py b/src/instructlab/sdg/utils/chunking.py
index f55eea30..7a6f05e7 100644
--- a/src/instructlab/sdg/utils/chunking.py
+++ b/src/instructlab/sdg/utils/chunking.py
@@ -7,25 +7,12 @@
 import logging
 import re
 
-# Third Party
-from docling.datamodel.base_models import PipelineOptions
-from docling.document_converter import DocumentConverter
-from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
+from .chunking2 import chunk_markdowns, chunk_pdfs
 
-# First Party
-from instructlab.sdg.utils.docprocessor import DocProcessor
-
-_DEFAULT_CHUNK_OVERLAP = 100
 
 logger = logging.getLogger(__name__)
 
 
-def _num_tokens_from_words(num_words) -> int:
-    return int(num_words * 1.3)  # 1 word ~ 1.3 token
-
-
-def _num_chars_from_tokens(num_tokens) -> int:
-    return int(num_tokens * 4)  # 1 token ~ 4 English character
 
 
 def _extract_filetypes_from_docs(documents: List):
@@ -90,73 +77,3 @@ def chunk_documents(
 
     return chunked_mds + chunked_pdfs
 
-
-def chunk_markdowns(
-    documents: List | str, server_ctx_size, chunk_word_count
-) -> List[str]:
-    """Naively chunk markdown documents based on the word count provided by the user.
-    Args:
-        documents (list): List of markdown documents.
-        server_ctx_size (int): Context window size of server.
-        chunk_word_count (int): Maximum number of words to chunk a document.
-    Returns:
-        List[str]: List of chunked documents.
-    """
-    num_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
-    if num_tokens_per_doc > int(server_ctx_size - 1024):
-        raise ValueError(
-            "Error: {}".format(
-                str(
-                    f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
-                )
-            )
-        )
-    # Placeholder for params
-    content = []
-    chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
-    chunk_overlap = _DEFAULT_CHUNK_OVERLAP
-
-    # Using Markdown as default, document-specific chunking will be implemented in separate pr.
-    md_text_splitter = RecursiveCharacterTextSplitter.from_language(
-        language=Language.MARKDOWN,
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-    )
-
-    # Determine file type for heuristics, default with markdown
-    for doc in documents:
-        # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
-        doc = re.sub(r"-{2,}\|", "-|", doc)
-        # Remove unnecessary spaces in front of pipe characters in a markdown table.
-        doc = re.sub(r"\ +\|", " |", doc)
-        temp = md_text_splitter.create_documents([doc])
-        content.extend([item.page_content for item in temp])
-    return content
-
-
-def chunk_pdfs(pdf_docs: List, leaf_node_path: Path):
-    """Semantically chunk PDF documents.
-
-    TODO
-    """
-    print("THIS IS KHALED: CHUNKING PDF DOCS")
-    converter = DocumentConverter(pipeline_options=PipelineOptions())
-    parsed_pdfs = converter.convert(pdf_docs)
-    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]
-
-    docling_jsons_path = Path("~/docling-jsons")
-
-    # TODO name files better
-    for i, pd in enumerate(parsed_dicts):
-        fp = docling_jsons_path / f"docling_{i}.json"
-
-        with open(fp, "w") as json_file:
-            for entry in pd:
-                json_file.write(json.dumps(entry) + "\n")
-
-    chunked_pdfs = DocProcessor(
-        parsed_doc_dir=docling_jsons_path,
-        qna_yaml_path=leaf_node_path / "qna.yaml",
-    )
-
-    return chunked_pdfs
diff --git a/src/instructlab/sdg/utils/chunking2.py b/src/instructlab/sdg/utils/chunking2.py
new file mode 100644
index 00000000..e633af78
--- /dev/null
+++ b/src/instructlab/sdg/utils/chunking2.py
@@ -0,0 +1,118 @@
+"""Markdown and PDF chunking helpers split out of chunking.py."""
+
+# Standard
+from pathlib import Path
+from typing import List
+import json
+import logging
+import re
+
+# Third Party
+from docling.datamodel.base_models import PipelineOptions
+from docling.document_converter import DocumentConverter
+from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
+
+_DEFAULT_CHUNK_OVERLAP = 100
+
+logger = logging.getLogger(__name__)
+
+
+def _num_tokens_from_words(num_words) -> int:
+    """Estimate how many tokens num_words words occupy."""
+    return int(num_words * 1.3)  # 1 word ~ 1.3 token
+
+
+def _num_chars_from_tokens(num_tokens) -> int:
+    """Estimate how many characters num_tokens tokens occupy."""
+    return int(num_tokens * 4)  # 1 token ~ 4 English character
+
+
+def chunk_markdowns(
+    documents: List | str, server_ctx_size, chunk_word_count
+) -> List[str]:
+    """Naively chunk markdown documents based on the word count provided by the user.
+
+    Args:
+        documents (list | str): Markdown documents; a single string is
+            treated as one document.
+        server_ctx_size (int): Context window size of server.
+        chunk_word_count (int): Maximum number of words to chunk a document.
+
+    Returns:
+        List[str]: List of chunked documents.
+
+    Raises:
+        ValueError: If the estimated tokens per chunk exceed
+            server_ctx_size - 1024.
+    """
+    # A bare string would otherwise be iterated character by character.
+    if isinstance(documents, str):
+        documents = [documents]
+
+    num_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
+    if num_tokens_per_doc > int(server_ctx_size - 1024):
+        raise ValueError(
+            f"Error: Given word count ({chunk_word_count}) per doc will exceed "
+            f"the server context window size ({server_ctx_size})"
+        )
+
+    content = []
+    chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
+
+    # Using Markdown as default, document-specific chunking will be implemented in separate pr.
+    md_text_splitter = RecursiveCharacterTextSplitter.from_language(
+        language=Language.MARKDOWN,
+        chunk_size=chunk_size,
+        chunk_overlap=_DEFAULT_CHUNK_OVERLAP,
+    )
+
+    for doc in documents:
+        # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
+        doc = re.sub(r"-{2,}\|", "-|", doc)
+        # Remove unnecessary spaces in front of pipe characters in a markdown table.
+        doc = re.sub(r"\ +\|", " |", doc)
+        temp = md_text_splitter.create_documents([doc])
+        content.extend([item.page_content for item in temp])
+    return content
+
+
+def chunk_pdfs(pdf_docs: List, leaf_node_path: Path):
+    """Convert PDF documents with docling and wrap the output in a DocProcessor.
+
+    Args:
+        pdf_docs (list): PDF inputs passed to DocumentConverter.convert.
+        leaf_node_path (Path): Directory expected to contain qna.yaml.
+
+    Returns:
+        DocProcessor: Processor built over the exported docling JSON files.
+    """
+    # Imported lazily: docprocessor imports chunk_markdowns from this module,
+    # so a module-level import here would be circular.
+    from instructlab.sdg.utils.docprocessor import DocProcessor  # pylint: disable=import-outside-toplevel
+
+    logger.debug("Chunking PDF documents")
+    converter = DocumentConverter(pipeline_options=PipelineOptions())
+    parsed_pdfs = converter.convert(pdf_docs)
+    parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]
+
+    # Path() does not expand "~"; resolve it and create the directory so the
+    # open() calls below do not fail on a missing path.
+    docling_jsons_path = Path("~/docling-jsons").expanduser()
+    docling_jsons_path.mkdir(parents=True, exist_ok=True)
+
+    # TODO name files better
+    for i, parsed in enumerate(parsed_dicts):
+        fp = docling_jsons_path / f"docling_{i}.json"
+
+        with open(fp, "w", encoding="utf-8") as json_file:
+            # NOTE(review): if render_as_dict() returns a dict, this writes only
+            # its top-level keys - confirm the format DocProcessor expects.
+            for entry in parsed:
+                json_file.write(json.dumps(entry) + "\n")
+
+    chunked_pdfs = DocProcessor(
+        parsed_doc_dir=docling_jsons_path,
+        qna_yaml_path=leaf_node_path / "qna.yaml",
+    )
+
+    return chunked_pdfs
diff --git a/src/instructlab/sdg/utils/docprocessor.py b/src/instructlab/sdg/utils/docprocessor.py
index dee58cba..0166bdfb 100644
--- a/src/instructlab/sdg/utils/docprocessor.py
+++ b/src/instructlab/sdg/utils/docprocessor.py
@@ -10,7 +10,7 @@
 import yaml
 
 # Local
-from .chunking import chunk_markdowns
+from .chunking2 import chunk_markdowns
 
 logger = logging.getLogger(__name__)
 