From 270d869b243c14a823646b7b68fcf9816a19d5eb Mon Sep 17 00:00:00 2001 From: Chris Ayers Date: Thu, 1 Feb 2024 20:03:20 -0500 Subject: [PATCH] Add JsonFileParser to FileStrategy (#1195) * Add JsonFileParser to FileStrategy * Refactor JSON parser in prepdocs.py * fixed linting errors with ruff * Fix formatting in filestrategy.py and test_jsonparser.py * Added new textsplitter and tests * Added File processors and refactor of prepdocs.py * fix ruff formatting issues * fix linting errors * Update scripts/prepdocslib/jsonparser.py Co-authored-by: Pamela Fox * Update scripts/prepdocslib/parser.py Co-authored-by: Pamela Fox * Added sample json data, fixed bug in file extension * Fix file extension retrieval in File class * Refactor prepdocs.py script * renamed data examples, added test * Fix offset, add tests * Add pragma no cover * Use the whole version of dataclass * Run ruff on imports * Reformatting --------- Co-authored-by: Pamela Fox Co-authored-by: Pamela Fox --- data/Json_Examples/2189.json | 14 ++ data/Json_Examples/2190.json | 14 ++ data/Json_Examples/2191.json | 14 ++ data/Json_Examples/2192.json | 14 ++ data/Json_Examples/query.json | 244 ++++++++++++++++++++++++ pyproject.toml | 4 +- scripts/prepdocs.py | 38 ++-- scripts/prepdocslib/fileprocessor.py | 10 + scripts/prepdocslib/filestrategy.py | 22 ++- scripts/prepdocslib/jsonparser.py | 23 +++ scripts/prepdocslib/listfilestrategy.py | 3 + scripts/prepdocslib/page.py | 24 +++ scripts/prepdocslib/parser.py | 14 ++ scripts/prepdocslib/pdfparser.py | 35 +--- scripts/prepdocslib/textsplitter.py | 43 ++++- tests/test_jsonparser.py | 32 ++++ tests/test_listfilestrategy.py | 6 + tests/test_prepdocslib_textsplitter.py | 47 ++++- 18 files changed, 531 insertions(+), 70 deletions(-) create mode 100644 data/Json_Examples/2189.json create mode 100644 data/Json_Examples/2190.json create mode 100644 data/Json_Examples/2191.json create mode 100644 data/Json_Examples/2192.json create mode 100644 data/Json_Examples/query.json 
create mode 100644 scripts/prepdocslib/fileprocessor.py create mode 100644 scripts/prepdocslib/jsonparser.py create mode 100644 scripts/prepdocslib/page.py create mode 100644 scripts/prepdocslib/parser.py create mode 100644 tests/test_jsonparser.py diff --git a/data/Json_Examples/2189.json b/data/Json_Examples/2189.json new file mode 100644 index 0000000000..d7066c9fbd --- /dev/null +++ b/data/Json_Examples/2189.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:38.69Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:38.69Z", + "Description": "As a customer, I would like to reserve a conference room such that:

1. It should display available date and time slots
2. Give an option to reserve a conference room for X hours
3. One can reserve a conference room for max 4 hours per day
", + "Id": 2189, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:38.69Z", + "Tags": "Reservation", + "Title": "As a customer, I would like to reserve a conference room" +} diff --git a/data/Json_Examples/2190.json b/data/Json_Examples/2190.json new file mode 100644 index 0000000000..5a45f1158f --- /dev/null +++ b/data/Json_Examples/2190.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:38.997Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:38.997Z", + "Description": "

Enter the guest's name to whom you would\nlike to send a confirmation, display the company, contact, source\nand agent associated\nwith the reservation.

", + "Id": 2190, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:38.997Z", + "Tags": "Notification", + "Title": "As a reservation agent, I would like to send confirmations to guest" +} diff --git a/data/Json_Examples/2191.json b/data/Json_Examples/2191.json new file mode 100644 index 0000000000..455e4c9a24 --- /dev/null +++ b/data/Json_Examples/2191.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:39.17Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:39.17Z", + "Description": "

If you have not picked up\nyour vehicle you can remove or cancel your reservation by clicking here.


1. Car reserved should have an option to cancel the request
2. Car driver should receive a notification about cancellation
", + "Id": 2191, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:39.17Z", + "Tags": "Reservation", + "Title": "As a customer, I should be able to remove a car reservation " +} diff --git a/data/Json_Examples/2192.json b/data/Json_Examples/2192.json new file mode 100644 index 0000000000..d2e489f317 --- /dev/null +++ b/data/Json_Examples/2192.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:39.383Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:39.383Z", + "Description": "As a courtesy, grant an\nextra hour or two to leave the room, especially if it isn't booked\nfor the upcoming evening. But customer must call the front desk\nin advance and request a late checkout.


1. Late Check-out time should be displayed
2. Request should be sent to front-desk 
3. Any extra charge should be displayed
", + "Id": 2192, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:39.383Z", + "Tags": "Front-desk; Members; Reservation", + "Title": "As a customer, I should be able to request hotel for late Check-out" +} diff --git a/data/Json_Examples/query.json b/data/Json_Examples/query.json new file mode 100644 index 0000000000..ceb9dc1a6f --- /dev/null +++ b/data/Json_Examples/query.json @@ -0,0 +1,244 @@ +[ + { + "fields": { + "System.Id": 2348, + "System.State": "New", + "System.Title": "Provide related items or frequently bought together section when people browse or search", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2348, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2348" + }, + { + "fields": { + "System.Id": 2349, + "System.State": "New", + "System.Title": "As tester, I need to test the website on all the relevant broswers and devices and be sure that it can handle our load.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2349, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2349" + }, + { + "fields": { + "System.Id": 2350, + "System.State": "New", + "System.Title": "As a customer, I should be able to put items to shopping cart", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2350, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2350" + }, + { + "fields": { + "System.Id": 2351, + "System.State": "New", + "System.Title": "As a customer, I should be able to print my purchase order", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2351, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2351" + }, + { + "fields": { + "System.Id": 2352, + "System.State": "New", + "System.Title": "As a customer, I would like to have a sort capabaility by price and customer ratings", + "System.WorkItemType": "Product Backlog Item" + }, + 
"id": 2352, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2352" + }, + { + "fields": { + "System.Id": 2353, + "System.State": "New", + "System.Title": "Recommended products must be based on customer purchase pattern history", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2353, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2353" + }, + { + "fields": { + "System.Id": 2354, + "System.State": "New", + "System.Title": "As a customer, I would like to save my addresses so that I can easily select the address for delivery", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2354, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2354" + }, + { + "fields": { + "System.Id": 2355, + "System.State": "New", + "System.Title": "As marketer, I want to run an A|B test on alternative Web Sites using Application Insights.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2355, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2355" + }, + { + "fields": { + "System.AssignedTo": { + "_links": { + "avatar": { + "href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0" + } + }, + "descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "displayName": "Chris Ayers", + "id": "cd8258ec-ad87-4c0d-9026-e5e343447185", + "imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "uniqueName": "chrisayers@microsoft.com", + "url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185" + }, + "System.Id": 2356, + "System.State": "Done", + "System.Title": "Provide customers the ability to track status of the package", + 
"System.WorkItemType": "Product Backlog Item" + }, + "id": 2356, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2356" + }, + { + "fields": { + "System.AssignedTo": { + "_links": { + "avatar": { + "href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0" + } + }, + "descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "displayName": "Chris Ayers", + "id": "cd8258ec-ad87-4c0d-9026-e5e343447185", + "imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "uniqueName": "chrisayers@microsoft.com", + "url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185" + }, + "System.Id": 2357, + "System.State": "Done", + "System.Title": "As a customer, I would like to have the ability to send my items as gift", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2357, + "relations": null, + "rev": 2, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2357" + }, + { + "fields": { + "System.Id": 2358, + "System.State": "Committed", + "System.Title": "As a customer, I would like to store my credit card details securely", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2358, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2358" + }, + { + "fields": { + "System.Id": 2359, + "System.State": "Committed", + "System.Title": "As a customer, I should be able to select different shipping option", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2359, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2359" + }, + { + "fields": { + "System.Id": 2360, + "System.State": "Committed", + "System.Title": "As developer, I want to use Azure Machine Learning to provide a 
recommendations engine behind the website.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2360, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2360" + }, + { + "fields": { + "System.Id": 2361, + "System.State": "Committed", + "System.Title": "Provide tentative duration for shipping.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2361, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2361" + }, + { + "fields": { + "System.Id": 2362, + "System.State": "Approved", + "System.Title": "Notify the user about any changes made to the order", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2362, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2362" + }, + { + "fields": { + "System.Id": 2363, + "System.State": "Approved", + "System.Title": "As a admin, I should be able to update prices on ad-hoc condition", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2363, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2363" + }, + { + "fields": { + "System.Id": 2364, + "System.State": "Approved", + "System.Title": "As a customer, I would like to provide my feedback on items that I have purchased", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2364, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2364" + }, + { + "fields": { + "System.Id": 2365, + "System.State": "Approved", + "System.Title": "As a customer, I would like to have a wishlist where I can add items for future purchase", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2365, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2365" + } +] diff --git a/pyproject.toml b/pyproject.toml index a24e3e7541..fe608e275f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 
+1,7 @@ [tool.ruff] target-version = "py38" -select = ["E", "F", "I", "UP"] -ignore = ["E501", "E701"] # line too long, multiple statements on one line +lint.select = ["E", "F", "I", "UP"] +lint.ignore = ["E501", "E701"] # line too long, multiple statements on one line src = ["app/backend", "scripts"] [tool.ruff.isort] diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index bb73bc6602..a346041c6d 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -14,15 +14,18 @@ OpenAIEmbeddings, OpenAIEmbeddingService, ) +from prepdocslib.fileprocessor import FileProcessor from prepdocslib.filestrategy import DocumentAction, FileStrategy +from prepdocslib.jsonparser import JsonParser from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, ListFileStrategy, LocalListFileStrategy, ) -from prepdocslib.pdfparser import DocumentAnalysisPdfParser, LocalPdfParser, PdfParser +from prepdocslib.parser import Parser +from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser from prepdocslib.strategy import SearchInfo, Strategy -from prepdocslib.textsplitter import TextSplitter +from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter def is_key_empty(key): @@ -52,25 +55,29 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi verbose=args.verbose, ) - pdf_parser: PdfParser - if args.localpdfparser: - pdf_parser = LocalPdfParser() - else: - # check if Azure Document Intelligence credentials are provided - if args.formrecognizerservice is None: - print( - "Error: Azure Document Intelligence service is not provided. Please provide --formrecognizerservice or use --localpdfparser for local pypdf parser." 
- ) - exit(1) + pdf_parser: Parser + doc_int_parser: DocumentAnalysisParser + + # check if Azure Document Intelligence credentials are provided + if args.formrecognizerservice is not None: formrecognizer_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( credential if is_key_empty(args.formrecognizerkey) else AzureKeyCredential(args.formrecognizerkey) ) - pdf_parser = DocumentAnalysisPdfParser( + doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{args.formrecognizerservice}.cognitiveservices.azure.com/", credential=formrecognizer_creds, verbose=args.verbose, ) - + if args.localpdfparser or args.formrecognizerservice is None: + pdf_parser = LocalPdfParser() + else: + pdf_parser = doc_int_parser + sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=args.searchimages) + file_processors = { + ".pdf": FileProcessor(pdf_parser, sentence_text_splitter), + ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), + ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), + } use_vectors = not args.novectors embeddings: Optional[OpenAIEmbeddings] = None if use_vectors and args.openaihost != "openai": @@ -128,8 +135,7 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi return FileStrategy( list_file_strategy=list_file_strategy, blob_manager=blob_manager, - pdf_parser=pdf_parser, - text_splitter=TextSplitter(has_image_embeddings=args.searchimages), + file_processors=file_processors, document_action=document_action, embeddings=embeddings, image_embeddings=image_embeddings, diff --git a/scripts/prepdocslib/fileprocessor.py b/scripts/prepdocslib/fileprocessor.py new file mode 100644 index 0000000000..3b58130db8 --- /dev/null +++ b/scripts/prepdocslib/fileprocessor.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + +from .parser import Parser +from .textsplitter import TextSplitter + + +@dataclass(frozen=True) +class FileProcessor: + parser: Parser + splitter: TextSplitter diff --git 
a/scripts/prepdocslib/filestrategy.py b/scripts/prepdocslib/filestrategy.py index c44add7ebc..0a63e50fe8 100644 --- a/scripts/prepdocslib/filestrategy.py +++ b/scripts/prepdocslib/filestrategy.py @@ -3,11 +3,10 @@ from .blobmanager import BlobManager from .embeddings import ImageEmbeddings, OpenAIEmbeddings +from .fileprocessor import FileProcessor from .listfilestrategy import ListFileStrategy -from .pdfparser import PdfParser from .searchmanager import SearchManager, Section from .strategy import SearchInfo, Strategy -from .textsplitter import TextSplitter class DocumentAction(Enum): @@ -25,8 +24,7 @@ def __init__( self, list_file_strategy: ListFileStrategy, blob_manager: BlobManager, - pdf_parser: PdfParser, - text_splitter: TextSplitter, + file_processors: dict[str, FileProcessor], document_action: DocumentAction = DocumentAction.Add, embeddings: Optional[OpenAIEmbeddings] = None, image_embeddings: Optional[ImageEmbeddings] = None, @@ -36,8 +34,7 @@ def __init__( ): self.list_file_strategy = list_file_strategy self.blob_manager = blob_manager - self.pdf_parser = pdf_parser - self.text_splitter = text_splitter + self.file_processors = file_processors self.document_action = document_action self.embeddings = embeddings self.image_embeddings = image_embeddings @@ -61,12 +58,21 @@ async def run(self, search_info: SearchInfo): files = self.list_file_strategy.list() async for file in files: try: - pages = [page async for page in self.pdf_parser.parse(content=file.content)] + key = file.file_extension() + processor = self.file_processors[key] + if not processor: + # skip file if no parser is found + if search_info.verbose: + print(f"Skipping '{file.filename()}'.") + continue + if search_info.verbose: + print(f"Parsing '{file.filename()}'") + pages = [page async for page in processor.parser.parse(content=file.content)] if search_info.verbose: print(f"Splitting '{file.filename()}' into sections") sections = [ Section(split_page, content=file, category=self.category) - 
for split_page in self.text_splitter.split_pages(pages) + for split_page in processor.splitter.split_pages(pages) ] blob_sas_uris = await self.blob_manager.upload_blob(file) diff --git a/scripts/prepdocslib/jsonparser.py b/scripts/prepdocslib/jsonparser.py new file mode 100644 index 0000000000..48c3eac046 --- /dev/null +++ b/scripts/prepdocslib/jsonparser.py @@ -0,0 +1,23 @@ +import json +from typing import IO, AsyncGenerator + +from .page import Page +from .parser import Parser + + +class JsonParser(Parser): + """ + Concrete parser that can parse JSON into Page objects. A top-level object becomes a single Page, while a top-level array becomes multiple Page objects. + """ + + async def parse(self, content: IO) -> AsyncGenerator[Page, None]: + offset = 0 + data = json.loads(content.read()) + if isinstance(data, list): + for i, obj in enumerate(data): + offset += 1 # For opening bracket or comma before object + page_text = json.dumps(obj) + yield Page(i, offset, page_text) + offset += len(page_text) + elif isinstance(data, dict): + yield Page(0, 0, json.dumps(data)) diff --git a/scripts/prepdocslib/listfilestrategy.py b/scripts/prepdocslib/listfilestrategy.py index 153a1081d5..d0b24876f1 100644 --- a/scripts/prepdocslib/listfilestrategy.py +++ b/scripts/prepdocslib/listfilestrategy.py @@ -26,6 +26,9 @@ def __init__(self, content: IO, acls: Optional[dict[str, list]] = None): def filename(self): return os.path.basename(self.content.name) + def file_extension(self): + return os.path.splitext(self.content.name)[1] + def filename_to_id(self): filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", self.filename()) filename_hash = base64.b16encode(self.filename().encode("utf-8")).decode("ascii") diff --git a/scripts/prepdocslib/page.py b/scripts/prepdocslib/page.py new file mode 100644 index 0000000000..f12fe70b94 --- /dev/null +++ b/scripts/prepdocslib/page.py @@ -0,0 +1,24 @@ +class Page: + """ + A single page from a document + + Attributes: + page_num (int): Page number + offset 
(int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") + text (str): The text of the page + """ + + def __init__(self, page_num: int, offset: int, text: str): + self.page_num = page_num + self.offset = offset + self.text = text + + +class SplitPage: + """ + A section of a page that has been split into a smaller chunk. + """ + + def __init__(self, page_num: int, text: str): + self.page_num = page_num + self.text = text diff --git a/scripts/prepdocslib/parser.py b/scripts/prepdocslib/parser.py new file mode 100644 index 0000000000..09d12e0ad6 --- /dev/null +++ b/scripts/prepdocslib/parser.py @@ -0,0 +1,14 @@ +from abc import ABC +from typing import IO, AsyncGenerator + +from .page import Page + + +class Parser(ABC): + """ + Abstract parser that parses content into Page objects + """ + + async def parse(self, content: IO) -> AsyncGenerator[Page, None]: + if False: + yield # pragma: no cover - this is necessary for mypy to type check diff --git a/scripts/prepdocslib/pdfparser.py b/scripts/prepdocslib/pdfparser.py index c7e2b64491..10c2d9a2f2 100644 --- a/scripts/prepdocslib/pdfparser.py +++ b/scripts/prepdocslib/pdfparser.py @@ -1,5 +1,4 @@ import html -from abc import ABC from typing import IO, AsyncGenerator, Union from azure.ai.formrecognizer import DocumentTable @@ -8,36 +7,12 @@ from azure.core.credentials_async import AsyncTokenCredential from pypdf import PdfReader +from .page import Page +from .parser import Parser from .strategy import USER_AGENT -class Page: - """ - A single page from a pdf - - Attributes: - page_num (int): Page number - offset (int): If the text of the entire PDF was concatenated into a single string, the index of the first character on the page. 
For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") - text (str): The text of the page - """ - - def __init__(self, page_num: int, offset: int, text: str): - self.page_num = page_num - self.offset = offset - self.text = text - - -class PdfParser(ABC): - """ - Abstract parser that parses PDFs into pages - """ - - async def parse(self, content: IO) -> AsyncGenerator[Page, None]: - if False: - yield - - -class LocalPdfParser(PdfParser): +class LocalPdfParser(Parser): """ Concrete parser backed by PyPDF that can parse PDFs into pages To learn more, please visit https://pypi.org/project/pypdf/ @@ -53,7 +28,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: offset += len(page_text) -class DocumentAnalysisPdfParser(PdfParser): +class DocumentAnalysisParser(Parser): """ Concrete parser backed by Azure AI Document Intelligence that can parse PDFS into pages To learn more, please visit https://learn.microsoft.com/azure/ai-services/document-intelligence/overview @@ -108,7 +83,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: if table_id == -1: page_text += form_recognizer_results.content[page_offset + idx] elif table_id not in added_tables: - page_text += DocumentAnalysisPdfParser.table_to_html(tables_on_page[table_id]) + page_text += DocumentAnalysisParser.table_to_html(tables_on_page[table_id]) added_tables.add(table_id) yield Page(page_num=page_num, offset=offset, text=page_text) diff --git a/scripts/prepdocslib/textsplitter.py b/scripts/prepdocslib/textsplitter.py index 7421b57377..928aedd7e6 100644 --- a/scripts/prepdocslib/textsplitter.py +++ b/scripts/prepdocslib/textsplitter.py @@ -1,19 +1,22 @@ +from abc import ABC from typing import Generator, List -from .pdfparser import Page +from .page import Page, SplitPage -class SplitPage: +class TextSplitter(ABC): """ - A section of a page that has been split into a smaller chunk. 
+ Splits a list of pages into smaller chunks + :param pages: The pages to split + :return: A generator of SplitPage """ - def __init__(self, page_num: int, text: str): - self.page_num = page_num - self.text = text + def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]: + if False: + yield # pragma: no cover - this is necessary for mypy to type check -class TextSplitter: +class SentenceTextSplitter(TextSplitter): """ Class that splits pages into smaller chunks. This is required because embedding models may not be able to analyze an entire page at once """ @@ -105,3 +108,29 @@ def find_page(offset): if start + self.section_overlap < end: yield SplitPage(page_num=find_page(start), text=all_text[start:end]) + + +class SimpleTextSplitter(TextSplitter): + """ + Class that splits pages into smaller chunks based on a max object length. It is not aware of the content of the page. + This is required because embedding models may not be able to analyze an entire page at once + """ + + def __init__(self, max_object_length: int = 1000, verbose: bool = False): + self.max_object_length = max_object_length + self.verbose = verbose + + def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]: + all_text = "".join(page.text for page in pages) + if len(all_text.strip()) == 0: + return + + length = len(all_text) + if length <= self.max_object_length: + yield SplitPage(page_num=0, text=all_text) + return + + # its too big, so we need to split it + for i in range(0, length, self.max_object_length): + yield SplitPage(page_num=i // self.max_object_length, text=all_text[i : i + self.max_object_length]) + return diff --git a/tests/test_jsonparser.py b/tests/test_jsonparser.py new file mode 100644 index 0000000000..9ebc70919d --- /dev/null +++ b/tests/test_jsonparser.py @@ -0,0 +1,32 @@ +import io + +import pytest + +from scripts.prepdocslib.jsonparser import JsonParser + + +@pytest.mark.asyncio +async def test_jsonparser_single_obj(): + file = 
io.StringIO('{"test": "test"}') + file.name = "test.json" + jsonparser = JsonParser() + pages = [page async for page in jsonparser.parse(file)] + assert len(pages) == 1 + assert pages[0].page_num == 0 + assert pages[0].offset == 0 + assert pages[0].text == '{"test": "test"}' + + +@pytest.mark.asyncio +async def test_jsonparser_array_multiple_obj(): + file = io.StringIO('[{"test1": "test"},{"test2": "test"}]') + file.name = "test.json" + jsonparser = JsonParser() + pages = [page async for page in jsonparser.parse(file)] + assert len(pages) == 2 + assert pages[0].page_num == 0 + assert pages[0].offset == 1 + assert pages[0].text == '{"test1": "test"}' + assert pages[1].page_num == 1 + assert pages[1].offset == 19 + assert pages[1].text == '{"test2": "test"}' diff --git a/tests/test_listfilestrategy.py b/tests/test_listfilestrategy.py index 6bed5b4b51..bc72c4eba7 100644 --- a/tests/test_listfilestrategy.py +++ b/tests/test_listfilestrategy.py @@ -19,6 +19,12 @@ def test_file_filename(): assert File(empty).filename() == "foo.pdf" +def test_file_file_extension(): + empty = io.BytesIO() + empty.name = "test/foo.pdf" + assert File(empty).file_extension() == ".pdf" + + def test_file_contextmanager(): empty = io.BytesIO() empty.name = "test/foo.pdf" diff --git a/tests/test_prepdocslib_textsplitter.py b/tests/test_prepdocslib_textsplitter.py index e6bcc08cb5..830bae53c1 100644 --- a/tests/test_prepdocslib_textsplitter.py +++ b/tests/test_prepdocslib_textsplitter.py @@ -4,19 +4,20 @@ import pytest from scripts.prepdocslib.listfilestrategy import LocalListFileStrategy +from scripts.prepdocslib.page import Page from scripts.prepdocslib.pdfparser import LocalPdfParser from scripts.prepdocslib.searchmanager import Section -from scripts.prepdocslib.textsplitter import Page, TextSplitter +from scripts.prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter -def test_split_empty_pages(): - t = TextSplitter(False, True) +def 
test_sentencetextsplitter_split_empty_pages(): + t = SentenceTextSplitter(False, True) assert list(t.split_pages([])) == [] -def test_split_small_pages(): - t = TextSplitter(has_image_embeddings=False, verbose=True) +def test_sentencetextsplitter_split_small_pages(): + t = SentenceTextSplitter(has_image_embeddings=False, verbose=True) split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text="Not a large page")])) assert len(split_pages) == 1 @@ -25,8 +26,8 @@ def test_split_small_pages(): @pytest.mark.asyncio -async def test_list_parse_and_split(tmp_path): - text_splitter = TextSplitter(False, True) +async def test_sentencetextsplitter_list_parse_and_split(tmp_path): + text_splitter = SentenceTextSplitter(False, True) pdf_parser = LocalPdfParser() for pdf in Path("data").glob("*.pdf"): shutil.copy(str(pdf.absolute()), tmp_path) @@ -44,3 +45,35 @@ async def test_list_parse_and_split(tmp_path): assert sections processed += 1 assert processed > 1 + + +def test_simpletextsplitter_split_empty_pages(): + t = SimpleTextSplitter(True) + + assert list(t.split_pages([])) == [] + + +def test_simpletextsplitter_split_small_pages(): + t = SimpleTextSplitter(verbose=True) + + split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text='{"test": "Not a large page"}')])) + assert len(split_pages) == 1 + assert split_pages[0].page_num == 0 + assert split_pages[0].text == '{"test": "Not a large page"}' + + +def test_sentencetextsplitter_split_pages(): + max_object_length = 10 + t = SimpleTextSplitter(max_object_length=max_object_length, verbose=True) + + split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text='{"test": "Not a large page"}')])) + assert len(split_pages) == 3 + assert split_pages[0].page_num == 0 + assert split_pages[0].text == '{"test": "' + assert len(split_pages[0].text) <= max_object_length + assert split_pages[1].page_num == 1 + assert split_pages[1].text == "Not a larg" + assert len(split_pages[1].text) <= 
max_object_length + assert split_pages[2].page_num == 2 + assert split_pages[2].text == 'e page"}' + assert len(split_pages[2].text) <= max_object_length