Skip to content

Commit

Permalink
Allow _get_documents() to read PDF
Browse files: browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
Co-authored-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
khaledsulayman and aakankshaduggal committed Sep 25, 2024
1 parent 1146df3 commit bff2796
Showing 1 changed file with 27 additions and 7 deletions.
34 changes: 27 additions & 7 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)
import git
import gitdb
import PyPDF2

Check failure on line 21 in src/instructlab/sdg/utils/taxonomy.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'PyPDF2' (import-error)
import yaml

# Local
Expand Down Expand Up @@ -108,17 +109,23 @@ def _get_documents(
skip_checkout: bool = False,
) -> List[str]:
"""
Retrieve the content of files from a Git repository.
Retrieve the content of files (Markdown and PDF) from a Git repository.
Args:
source (dict): Source info containing repository URL, commit hash, and list of file patterns.
skip_checkout (bool, optional): If True, skips checking out the specific commit. Defaults to False.
Returns:
List[str]: List of document contents.
""" ""
List[str]: List of document contents (Markdown as text and PDFs as extracted text).
Raises:
SystemExit: If no valid documents are found.
OSError, GitCommandError, FileNotFoundError: For errors during Git operations or file access.
"""
repo_url = source.get("repo")
commit_hash = source.get("commit")
file_patterns = source.get("patterns", [])

with tempfile.TemporaryDirectory() as temp_dir:
try:
repo = git.Repo.clone_from(repo_url, temp_dir)
Expand All @@ -130,16 +137,28 @@ def _get_documents(
logger.debug("Processing files...")
for pattern in file_patterns:
for file_path in glob.glob(os.path.join(repo.working_dir, pattern)):
if os.path.isfile(file_path) and file_path.endswith(".md"):
with open(file_path, "r", encoding="utf-8") as file:
file_contents.append(file.read())
if os.path.isfile(file_path):
if file_path.endswith(".md"):
# Process markdown files
with open(file_path, "r", encoding="utf-8") as file:
file_contents.append(file.read())
elif file_path.endswith(".pdf"):
# Process PDF files
with open(file_path, "rb") as file:
reader = PyPDF2.PdfReader(file)
pdf_text = ""
for page in range(len(reader.pages)):
pdf_text += reader.pages[page].extract_text()
file_contents.append(pdf_text)

if file_contents:
return file_contents
raise SystemExit("Couldn't find knowledge documents")

except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
logger.error("Error retrieving documents: %s", str(e))
raise e


# pylint: disable=broad-exception-caught
def _read_taxonomy_file(file_path: str | Path, yamllint_config: str | None = None):
Expand Down Expand Up @@ -278,6 +297,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
documents=leaf_node[0]["document"],
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
qna_yaml_path=leaf_node, # TODO get actual yaml filepath
)
if leaf_node[0].get("document")
else []
Expand Down

0 comments on commit bff2796

Please sign in to comment.