Commit

separate chunking utils
Signed-off-by: Khaled Sulayman <[email protected]>
khaledsulayman committed Sep 26, 2024
1 parent 0ad71c1 commit 61ed3d4
Showing 3 changed files with 97 additions and 85 deletions.
85 changes: 1 addition & 84 deletions src/instructlab/sdg/utils/chunking.py
@@ -7,25 +7,12 @@
import logging
import re

Check warning on line 8 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): W0611: Unused import re (unused-import)

# Third Party
from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from chunking2 import chunk_markdowns, chunk_pdfs

Check failure on line 10 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): E0401: Unable to import 'chunking2' (import-error)

# First Party
from instructlab.sdg.utils.docprocessor import DocProcessor

_DEFAULT_CHUNK_OVERLAP = 100

logger = logging.getLogger(__name__)


def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3) # 1 word ~ 1.3 token


def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4) # 1 token ~ 4 English character


def _extract_filetypes_from_docs(documents: List):
@@ -90,73 +77,3 @@ def chunk_documents(

return chunked_mds + chunked_pdfs

Check warning on line 79 in src/instructlab/sdg/utils/chunking.py (GitHub Actions / pylint): C0305: Trailing newlines (trailing-newlines)

def chunk_markdowns(
documents: List | str, server_ctx_size, chunk_word_count
) -> List[str]:
"""Naively chunk markdown documents based on the word count provided by the user.
Args:
documents (list): List of markdown documents.
server_ctx_size (int): Context window size of server.
chunk_word_count (int): Maximum number of words to chunk a document.
Returns:
List[str]: List of chunked documents.
"""
num_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
if num_tokens_per_doc > int(server_ctx_size - 1024):
raise ValueError(
"Error: {}".format(
str(
f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
)
)
)
# Placeholder for params
content = []
chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Using Markdown as default, document-specific chunking will be implemented in separate pr.
md_text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine file type for heuristics, default with markdown
for doc in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = md_text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])
return content


def chunk_pdfs(pdf_docs: List, leaf_node_path: Path):
"""Semantically chunk PDF documents.
TODO
"""
print("THIS IS KHALED: CHUNKING PDF DOCS")
converter = DocumentConverter(pipeline_options=PipelineOptions())
parsed_pdfs = converter.convert(pdf_docs)
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

docling_jsons_path = Path("~/docling-jsons")

# TODO name files better
for i, pd in enumerate(parsed_dicts):
fp = docling_jsons_path / f"docling_{i}.json"

with open(fp, "w") as json_file:
for entry in pd:
json_file.write(json.dumps(entry) + "\n")

chunked_pdfs = DocProcessor(
parsed_doc_dir=docling_jsons_path,
qna_yaml_path=leaf_node_path / "qna.yaml",
)

return chunked_pdfs
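
For orientation, a worked example of the sizing heuristics this commit moves out of chunking.py (the numbers below are illustrative assumptions, not values from the diff): the chunk budget given in words is converted to tokens and then to characters before being handed to the text splitter, and the token estimate must leave 1024 tokens of headroom in the server context window.

    # Illustrative arithmetic only; chunk_word_count and server_ctx_size are assumed inputs.
    chunk_word_count = 1000
    server_ctx_size = 4096
    num_tokens_per_doc = int(chunk_word_count * 1.3)  # 1300 tokens (~1.3 tokens per word)
    chunk_size = int(num_tokens_per_doc * 4)          # 5200 characters (~4 characters per token)
    # Guard: 1300 <= 4096 - 1024 = 3072, so chunk_markdowns proceeds instead of raising ValueError.
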
95 changes: 95 additions & 0 deletions src/instructlab/sdg/utils/chunking2.py
@@ -0,0 +1,95 @@
from pathlib import Path
from typing import List
import json
import re

# Third Party
from docling.datamodel.base_models import PipelineOptions

Check failure on line 7 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): E0401: Unable to import 'docling.datamodel.base_models' (import-error)
from docling.document_converter import DocumentConverter

Check failure on line 8 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): E0401: Unable to import 'docling.document_converter' (import-error)
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# First Party
from instructlab.sdg.utils.docprocessor import DocProcessor



_DEFAULT_CHUNK_OVERLAP = 100


def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3) # 1 word ~ 1.3 token


def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4) # 1 token ~ 4 English character


def chunk_markdowns(
documents: List | str, server_ctx_size, chunk_word_count
) -> List[str]:
"""Naively chunk markdown documents based on the word count provided by the user.
Args:
documents (list): List of markdown documents.
server_ctx_size (int): Context window size of server.
chunk_word_count (int): Maximum number of words to chunk a document.
Returns:
List[str]: List of chunked documents.
"""
num_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
if num_tokens_per_doc > int(server_ctx_size - 1024):
raise ValueError(
"Error: {}".format(
str(
f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
)
)
)
# Placeholder for params
content = []
chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Using Markdown as default, document-specific chunking will be implemented in separate pr.
md_text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine file type for heuristics, default with markdown
for doc in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = md_text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])
return content


def chunk_pdfs(pdf_docs: List, leaf_node_path: Path):
"""Semantically chunk PDF documents.
TODO
"""
print("THIS IS KHALED: CHUNKING PDF DOCS")
converter = DocumentConverter(pipeline_options=PipelineOptions())
parsed_pdfs = converter.convert(pdf_docs)
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

docling_jsons_path = Path("~/docling-jsons")

# TODO name files better
for i, pd in enumerate(parsed_dicts):
fp = docling_jsons_path / f"docling_{i}.json"

with open(fp, "w") as json_file:

Check warning on line 86 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): W1514: Using open without explicitly specifying an encoding (unspecified-encoding)
for entry in pd:
json_file.write(json.dumps(entry) + "\n")

chunked_pdfs = DocProcessor(
parsed_doc_dir=docling_jsons_path,
qna_yaml_path=leaf_node_path / "qna.yaml",
)

return chunked_pdfs

Check warning on line 95 in src/instructlab/sdg/utils/chunking2.py (GitHub Actions / pylint): C0304: Final newline missing (missing-final-newline)
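
For reference, a minimal usage sketch of the two entry points added in chunking2.py; the file paths, markdown text, and numeric values are illustrative assumptions, and docling plus langchain-text-splitters must be installed for the calls to work.

    # Hypothetical usage of the relocated helpers; all inputs are illustrative.
    from pathlib import Path

    from instructlab.sdg.utils.chunking2 import chunk_markdowns, chunk_pdfs

    md_docs = [
        "# Title\n\nSome markdown body text.\n\n| col-a | col-b |\n|-------|-------|\n| 1 | 2 |",
    ]
    # Splits each markdown document into roughly 5200-character chunks (see the arithmetic above).
    md_chunks = chunk_markdowns(md_docs, server_ctx_size=4096, chunk_word_count=1000)

    # chunk_pdfs returns a DocProcessor pointed at the dumped docling JSON files; note that,
    # as committed, it writes them under Path("~/docling-jsons") without calling expanduser().
    doc_processor = chunk_pdfs(
        pdf_docs=[Path("knowledge/report.pdf")],
        leaf_node_path=Path("taxonomy/knowledge/example"),
    )

The leaf_node_path is expected to contain a qna.yaml file, since chunk_pdfs builds the DocProcessor with qna_yaml_path=leaf_node_path / "qna.yaml".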
2 changes: 1 addition & 1 deletion src/instructlab/sdg/utils/docprocessor.py
@@ -10,7 +10,7 @@
import yaml

# Local
from .chunking import chunk_markdowns
from .chunking2 import chunk_markdowns

logger = logging.getLogger(__name__)

