Commit

separate chunking utils
Signed-off-by: Khaled Sulayman <[email protected]>
khaledsulayman committed Sep 26, 2024
1 parent 0ad71c1 commit 61ed3d4
Showing 3 changed files with 97 additions and 85 deletions.
85 changes: 1 addition & 84 deletions src/instructlab/sdg/utils/chunking.py
@@ -7,25 +7,12 @@
import logging
import re

Check warning on line 8 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): W0611: Unused import re (unused-import)

# Third Party
from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from chunking2 import chunk_markdowns, chunk_pdfs

Check failure on line 10 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): E0401: Unable to import 'chunking2' (import-error)

# First Party
from instructlab.sdg.utils.docprocessor import DocProcessor

_DEFAULT_CHUNK_OVERLAP = 100

logger = logging.getLogger(__name__)


def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3) # 1 word ~ 1.3 token


def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4) # 1 token ~ 4 English character


def _extract_filetypes_from_docs(documents: List):
@@ -90,73 +77,3 @@ def chunk_documents(

return chunked_mds + chunked_pdfs

Check warning on line 79 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): C0305: Trailing newlines (trailing-newlines)

def chunk_markdowns(
documents: List | str, server_ctx_size, chunk_word_count
) -> List[str]:
"""Naively chunk markdown documents based on the word count provided by the user.
Args:
documents (list): List of markdown documents.
server_ctx_size (int): Context window size of server.
chunk_word_count (int): Maximum number of words to chunk a document.
Returns:
List[str]: List of chunked documents.
"""
num_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
if num_tokens_per_doc > int(server_ctx_size - 1024):
raise ValueError(
"Error: {}".format(
str(
f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
)
)
)
# Placeholder for params
content = []
chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Using Markdown as default, document-specific chunking will be implemented in separate pr.
md_text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine file type for heuristics, default with markdown
for doc in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = md_text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])
return content


def chunk_pdfs(pdf_docs: List, leaf_node_path: Path):
"""Semantically chunk PDF documents.
TODO
"""
print("THIS IS KHALED: CHUNKING PDF DOCS")
converter = DocumentConverter(pipeline_options=PipelineOptions())
parsed_pdfs = converter.convert(pdf_docs)
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

docling_jsons_path = Path("~/docling-jsons")

# TODO name files better
for i, pd in enumerate(parsed_dicts):
fp = docling_jsons_path / f"docling_{i}.json"

with open(fp, "w") as json_file:
for entry in pd:
json_file.write(json.dumps(entry) + "\n")

chunked_pdfs = DocProcessor(
parsed_doc_dir=docling_jsons_path,
qna_yaml_path=leaf_node_path / "qna.yaml",
)

return chunked_pdfs
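
For orientation, a worked example of the sizing heuristics this commit moves out of chunking.py (the numbers below are illustrative assumptions, not values from the diff): the chunk budget given in words is converted to tokens and then to characters before being handed to the text splitter, and the token estimate must leave 1024 tokens of headroom in the server context window.

    # Illustrative arithmetic only; chunk_word_count and server_ctx_size are assumed inputs.
    chunk_word_count = 1000
    server_ctx_size = 4096
    num_tokens_per_doc = int(chunk_word_count * 1.3)  # 1300 tokens (~1.3 tokens per word)
    chunk_size = int(num_tokens_per_doc * 4)          # 5200 characters (~4 characters per token)
    # Guard: 1300 <= 4096 - 1024 = 3072, so chunk_markdowns proceeds instead of raising ValueError.
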
95 changes: 95 additions & 0 deletions src/instructlab/sdg/utils/chunking2.py
@@ -0,0 +1,95 @@
from pathlib import Path
from typing import List
import json
import re

# Third Party
from docling.datamodel.base_models import PipelineOptions

Check failure on line 7 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): E0401: Unable to import 'docling.datamodel.base_models' (import-error)
from docling.document_converter import DocumentConverter

Check failure on line 8 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): E0401: Unable to import 'docling.document_converter' (import-error)
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# First Party
from instructlab.sdg.utils.docprocessor import DocProcessor



_DEFAULT_CHUNK_OVERLAP = 100


def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3) # 1 word ~ 1.3 token


def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4) # 1 token ~ 4 English character


def chunk_markdowns(
documents: List | str, server_ctx_size, chunk_word_count
) -> List[str]:
"""Naively chunk markdown documents based on the word count provided by the user.
Args:
documents (list): List of markdown documents.
server_ctx_size (int): Context window size of server.
chunk_word_count (int): Maximum number of words to chunk a document.
Returns:
List[str]: List of chunked documents.
"""
num_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
if num_tokens_per_doc > int(server_ctx_size - 1024):
raise ValueError(
"Error: {}".format(
str(
f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
)
)
)
# Placeholder for params
content = []
chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Using Markdown as default, document-specific chunking will be implemented in separate pr.
md_text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine file type for heuristics, default with markdown
for doc in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = md_text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])
return content


def chunk_pdfs(pdf_docs: List, leaf_node_path: Path):
"""Semantically chunk PDF documents.
TODO
"""
print("THIS IS KHALED: CHUNKING PDF DOCS")
converter = DocumentConverter(pipeline_options=PipelineOptions())
parsed_pdfs = converter.convert(pdf_docs)
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

docling_jsons_path = Path("~/docling-jsons")

# TODO name files better
for i, pd in enumerate(parsed_dicts):
fp = docling_jsons_path / f"docling_{i}.json"

with open(fp, "w") as json_file:

Check warning on line 86 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): W1514: Using open without explicitly specifying an encoding (unspecified-encoding)
for entry in pd:
json_file.write(json.dumps(entry) + "\n")

chunked_pdfs = DocProcessor(
parsed_doc_dir=docling_jsons_path,
qna_yaml_path=leaf_node_path / "qna.yaml",
)

return chunked_pdfs

Check warning on line 95 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): C0304: Final newline missing (missing-final-newline)
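
For reference, a minimal usage sketch of the two entry points added in chunking2.py; the file paths, markdown text, and numeric values are illustrative assumptions, and docling plus langchain-text-splitters must be installed for the calls to work.

    # Hypothetical usage of the relocated helpers; all inputs are illustrative.
    from pathlib import Path

    from instructlab.sdg.utils.chunking2 import chunk_markdowns, chunk_pdfs

    md_docs = [
        "# Title\n\nSome markdown body text.\n\n| col-a | col-b |\n|-------|-------|\n| 1 | 2 |",
    ]
    # Splits each markdown document into roughly 5200-character chunks (see the arithmetic above).
    md_chunks = chunk_markdowns(md_docs, server_ctx_size=4096, chunk_word_count=1000)

    # chunk_pdfs returns a DocProcessor pointed at the dumped docling JSON files; note that,
    # as committed, it writes them under Path("~/docling-jsons") without calling expanduser().
    doc_processor = chunk_pdfs(
        pdf_docs=[Path("knowledge/report.pdf")],
        leaf_node_path=Path("taxonomy/knowledge/example"),
    )

The leaf_node_path is expected to contain a qna.yaml file, since chunk_pdfs builds the DocProcessor with qna_yaml_path=leaf_node_path / "qna.yaml".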
2 changes: 1 addition & 1 deletion src/instructlab/sdg/utils/docprocessor.py
@@ -10,7 +10,7 @@
import yaml

# Local
from .chunking import chunk_markdowns
from .chunking2 import chunk_markdowns

logger = logging.getLogger(__name__)

