Allow _get_documents() to read PDF files

Signed-off-by: Khaled Sulayman <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
instructlab · Sep 25, 2024 · bff2796
1 parent 1146df3
commit bff2796
Showing 1 changed file with 27 additions and 7 deletions.
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
@@ -18,6 +18,7 @@
 )
 import git
 import gitdb
+import PyPDF2
 import yaml
 
 # Local
@@ -108,17 +109,23 @@ def _get_documents(
     skip_checkout: bool = False,
 ) -> List[str]:
     """
-    Retrieve the content of files from a Git repository.
+    Retrieve the content of files (Markdown and PDF) from a Git repository.
 
     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
+        skip_checkout (bool, optional): If True, skips checking out the specific commit. Defaults to False.
 
     Returns:
-         List[str]: List of document contents.
-    """ ""
+        List[str]: List of document contents (Markdown as text and PDFs as extracted text).
+    
+    Raises:
+        SystemExit: If no valid documents are found.
+        OSError, GitCommandError, FileNotFoundError: For errors during Git operations or file access.
+    """
     repo_url = source.get("repo")
     commit_hash = source.get("commit")
     file_patterns = source.get("patterns", [])
+
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
             repo = git.Repo.clone_from(repo_url, temp_dir)
@@ -130,16 +137,28 @@ def _get_documents(
             logger.debug("Processing files...")
             for pattern in file_patterns:
                 for file_path in glob.glob(os.path.join(repo.working_dir, pattern)):
-                    if os.path.isfile(file_path) and file_path.endswith(".md"):
-                        with open(file_path, "r", encoding="utf-8") as file:
-                            file_contents.append(file.read())
+                    if os.path.isfile(file_path):
+                        if file_path.endswith(".md"):
+                            # Process markdown files
+                            with open(file_path, "r", encoding="utf-8") as file:
+                                file_contents.append(file.read())
+                        elif file_path.endswith(".pdf"):
+                            # Process PDF files
+                            with open(file_path, "rb") as file:
+                                reader = PyPDF2.PdfReader(file)
+                                pdf_text = ""
+                                for page in range(len(reader.pages)):
+                                    pdf_text += reader.pages[page].extract_text()
+                                file_contents.append(pdf_text)
 
             if file_contents:
                 return file_contents
             raise SystemExit("Couldn't find knowledge documents")
+
         except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
+            logger.error("Error retrieving documents: %s", str(e))
             raise e
-
+        
 
 # pylint: disable=broad-exception-caught
 def _read_taxonomy_file(file_path: str | Path, yamllint_config: str | None = None):
@@ -278,6 +297,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
             documents=leaf_node[0]["document"],
             server_ctx_size=server_ctx_size,
             chunk_word_count=chunk_word_count,
+            qna_yaml_path=leaf_node,  # TODO get actual yaml filepath
         )
         if leaf_node[0].get("document")
         else []