Skip to content

Commit

Permalink
Add JsonFileParser to FileStrategy (#1195)
Browse files Browse the repository at this point in the history
* Add JsonFileParser to FileStrategy

* Refactor JSON parser in prepdocs.py

* fixed linting errors with ruff

* Fix formatting in filestrategy.py and test_jsonparser.py

* Added new textsplitter and tests

* Added File processors and refactor of prepdocs.py

* fix ruff formatting issues

* fix linting errors

* Update scripts/prepdocslib/jsonparser.py

Co-authored-by: Pamela Fox <[email protected]>

* Update scripts/prepdocslib/parser.py

Co-authored-by: Pamela Fox <[email protected]>

* Added sample json data, fixed bug in file extension

* Fix file extension retrieval in File class

* Refactor prepdocs.py script

* renamed data examples, added test

* Fix offset, add tests

* Add pragma no cover

* Use the whole version of dataclass

* Run ruff on imports

* Reformatting

---------

Co-authored-by: Pamela Fox <[email protected]>
Co-authored-by: Pamela Fox <[email protected]>
  • Loading branch information
3 people authored Feb 2, 2024
1 parent 232a6e0 commit 270d869
Show file tree
Hide file tree
Showing 18 changed files with 531 additions and 70 deletions.
14 changes: 14 additions & 0 deletions data/Json_Examples/2189.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"AreaPath": "SmartHotel360",
"AssignedTo": null,
"Categories": null,
"ChangedDate": "2023-12-13T23:08:38.69Z",
"ClosedDate": null,
"CreatedDate": "2023-12-13T23:08:38.69Z",
"Description": "As a customer, I would like to reserve a conference room such that:<div><br> </div><div>1. It should display available date and time slots </div><div>2. Give an option to reserve a conference room for X hours </div><div>3. One can reserve a conference room for max 4 hours per day </div>",
"Id": 2189,
"State": "New",
"StateChangeDate": "2023-12-13T23:08:38.69Z",
"Tags": "Reservation",
"Title": "As a customer, I would like to reserve a conference room"
}
14 changes: 14 additions & 0 deletions data/Json_Examples/2190.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"AreaPath": "SmartHotel360",
"AssignedTo": null,
"Categories": null,
"ChangedDate": "2023-12-13T23:08:38.997Z",
"ClosedDate": null,
"CreatedDate": "2023-12-13T23:08:38.997Z",
"Description": "<p class=MsoNormal><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;,sans-serif;color:#222222;background:white;\">Enter the&nbsp;</span><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;,sans-serif;color:#222222;\">guest's<span style=\"background:white;\">&nbsp;name to whom you&nbsp;</span></span>would\nlike to send<span style=\"background:white;\">&nbsp;a&nbsp;</span>confirmation,&nbsp;<span style=\"background:white;\">display the company, contact, source\nand&nbsp;</span>agent<span style=\"background:white;\">&nbsp;associated\nwith the&nbsp;</span>reservation<span style=\"background:white;\">.</span> </p>",
"Id": 2190,
"State": "New",
"StateChangeDate": "2023-12-13T23:08:38.997Z",
"Tags": "Notification",
"Title": "As a reservation agent, I would like to send confirmations to guest"
}
14 changes: 14 additions & 0 deletions data/Json_Examples/2191.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"AreaPath": "SmartHotel360",
"AssignedTo": null,
"Categories": null,
"ChangedDate": "2023-12-13T23:08:39.17Z",
"ClosedDate": null,
"CreatedDate": "2023-12-13T23:08:39.17Z",
"Description": "<div><p class=MsoNormal><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;, sans-serif;background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\">If you have not picked up\nyour&nbsp;</span><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;, sans-serif;\">vehicle<span style=\"background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\">&nbsp;you can remove or cancel your&nbsp;</span></span>reservation<span style=\"background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\">&nbsp;by clicking here.</span> </p><p class=MsoNormal><span style=\"background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\"><br></span> </p> </div><div>1. Car reserved should have an option to cancel the request </div><div>2. Car driver should receive a notification about cancellation </div>",
"Id": 2191,
"State": "New",
"StateChangeDate": "2023-12-13T23:08:39.17Z",
"Tags": "Reservation",
"Title": "As a customer, I should be able to remove a car reservation "
}
14 changes: 14 additions & 0 deletions data/Json_Examples/2192.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"AreaPath": "SmartHotel360",
"AssignedTo": null,
"Categories": null,
"ChangedDate": "2023-12-13T23:08:39.383Z",
"ClosedDate": null,
"CreatedDate": "2023-12-13T23:08:39.383Z",
"Description": "<span style=\"font-family:&quot;Segoe UI&quot;, sans-serif;font-size:10.5pt;\">As a courtesy, grant an\nextra hour or two to leave&nbsp;the&nbsp;room, especially if it isn't booked\nfor&nbsp;the upcoming evening. But customer must&nbsp;call the&nbsp;front desk\nin advance and&nbsp;request&nbsp;a&nbsp;late checkout.</span><p class=MsoNormal><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;, sans-serif;background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\"></span> </p><p class=MsoNormal><span style=\"font-size:12.0pt;line-height:107%;\"></span> </p><div><div><br> </div><div>1. Late Check-in time should be displayed </div><div>2. Request should be sent to front-desk&nbsp; </div><div>3. Any extra charge should be displayed </div> </div>",
"Id": 2192,
"State": "New",
"StateChangeDate": "2023-12-13T23:08:39.383Z",
"Tags": "Front-desk; Members; Reservation",
"Title": "As a customer, I should be able to request hotel for late Check-out"
}
244 changes: 244 additions & 0 deletions data/Json_Examples/query.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
[
{
"fields": {
"System.Id": 2348,
"System.State": "New",
"System.Title": "Provide related items or frequently bought together section when people browse or search",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2348,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2348"
},
{
"fields": {
"System.Id": 2349,
"System.State": "New",
"System.Title": "As tester, I need to test the website on all the relevant browsers and devices and be sure that it can handle our load.",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2349,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2349"
},
{
"fields": {
"System.Id": 2350,
"System.State": "New",
"System.Title": "As a customer, I should be able to put items to shopping cart",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2350,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2350"
},
{
"fields": {
"System.Id": 2351,
"System.State": "New",
"System.Title": "As a customer, I should be able to print my purchase order",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2351,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2351"
},
{
"fields": {
"System.Id": 2352,
"System.State": "New",
"System.Title": "As a customer, I would like to have a sort capability by price and customer ratings",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2352,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2352"
},
{
"fields": {
"System.Id": 2353,
"System.State": "New",
"System.Title": "Recommended products must be based on customer purchase pattern history",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2353,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2353"
},
{
"fields": {
"System.Id": 2354,
"System.State": "New",
"System.Title": "As a customer, I would like to save my addresses so that I can easily select the address for delivery",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2354,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2354"
},
{
"fields": {
"System.Id": 2355,
"System.State": "New",
"System.Title": "As marketer, I want to run an A|B test on alternative Web Sites using Application Insights.",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2355,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2355"
},
{
"fields": {
"System.AssignedTo": {
"_links": {
"avatar": {
"href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0"
}
},
"descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
"displayName": "Chris Ayers",
"id": "cd8258ec-ad87-4c0d-9026-e5e343447185",
"imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
"uniqueName": "[email protected]",
"url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185"
},
"System.Id": 2356,
"System.State": "Done",
"System.Title": "Provide customers the ability to track status of the package",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2356,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2356"
},
{
"fields": {
"System.AssignedTo": {
"_links": {
"avatar": {
"href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0"
}
},
"descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
"displayName": "Chris Ayers",
"id": "cd8258ec-ad87-4c0d-9026-e5e343447185",
"imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
"uniqueName": "[email protected]",
"url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185"
},
"System.Id": 2357,
"System.State": "Done",
"System.Title": "As a customer, I would like to have the ability to send my items as gift",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2357,
"relations": null,
"rev": 2,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2357"
},
{
"fields": {
"System.Id": 2358,
"System.State": "Committed",
"System.Title": "As a customer, I would like to store my credit card details securely",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2358,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2358"
},
{
"fields": {
"System.Id": 2359,
"System.State": "Committed",
"System.Title": "As a customer, I should be able to select different shipping option",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2359,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2359"
},
{
"fields": {
"System.Id": 2360,
"System.State": "Committed",
"System.Title": "As developer, I want to use Azure Machine Learning to provide a recommendations engine behind the website.",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2360,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2360"
},
{
"fields": {
"System.Id": 2361,
"System.State": "Committed",
"System.Title": "Provide tentative duration for shipping.",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2361,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2361"
},
{
"fields": {
"System.Id": 2362,
"System.State": "Approved",
"System.Title": "Notify the user about any changes made to the order",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2362,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2362"
},
{
"fields": {
"System.Id": 2363,
"System.State": "Approved",
"System.Title": "As an admin, I should be able to update prices on ad-hoc condition",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2363,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2363"
},
{
"fields": {
"System.Id": 2364,
"System.State": "Approved",
"System.Title": "As a customer, I would like to provide my feedback on items that I have purchased",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2364,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2364"
},
{
"fields": {
"System.Id": 2365,
"System.State": "Approved",
"System.Title": "As a customer, I would like to have a wishlist where I can add items for future purchase",
"System.WorkItemType": "Product Backlog Item"
},
"id": 2365,
"relations": null,
"rev": 1,
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2365"
}
]
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.ruff]
target-version = "py38"
select = ["E", "F", "I", "UP"]
ignore = ["E501", "E701"] # line too long, multiple statements on one line
lint.select = ["E", "F", "I", "UP"]
lint.ignore = ["E501", "E701"] # line too long, multiple statements on one line
src = ["app/backend", "scripts"]

[tool.ruff.isort]
Expand Down
38 changes: 22 additions & 16 deletions scripts/prepdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,18 @@
OpenAIEmbeddings,
OpenAIEmbeddingService,
)
from prepdocslib.fileprocessor import FileProcessor
from prepdocslib.filestrategy import DocumentAction, FileStrategy
from prepdocslib.jsonparser import JsonParser
from prepdocslib.listfilestrategy import (
ADLSGen2ListFileStrategy,
ListFileStrategy,
LocalListFileStrategy,
)
from prepdocslib.pdfparser import DocumentAnalysisPdfParser, LocalPdfParser, PdfParser
from prepdocslib.parser import Parser
from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser
from prepdocslib.strategy import SearchInfo, Strategy
from prepdocslib.textsplitter import TextSplitter
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter


def is_key_empty(key):
Expand Down Expand Up @@ -52,25 +55,29 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi
verbose=args.verbose,
)

pdf_parser: PdfParser
if args.localpdfparser:
pdf_parser = LocalPdfParser()
else:
# check if Azure Document Intelligence credentials are provided
if args.formrecognizerservice is None:
print(
"Error: Azure Document Intelligence service is not provided. Please provide --formrecognizerservice or use --localpdfparser for local pypdf parser."
)
exit(1)
pdf_parser: Parser
doc_int_parser: DocumentAnalysisParser

# check if Azure Document Intelligence credentials are provided
if args.formrecognizerservice is not None:
formrecognizer_creds: Union[AsyncTokenCredential, AzureKeyCredential] = (
credential if is_key_empty(args.formrecognizerkey) else AzureKeyCredential(args.formrecognizerkey)
)
pdf_parser = DocumentAnalysisPdfParser(
doc_int_parser = DocumentAnalysisParser(
endpoint=f"https://{args.formrecognizerservice}.cognitiveservices.azure.com/",
credential=formrecognizer_creds,
verbose=args.verbose,
)

if args.localpdfparser or args.formrecognizerservice is None:
pdf_parser = LocalPdfParser()
else:
pdf_parser = doc_int_parser
sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=args.searchimages)
file_processors = {
".pdf": FileProcessor(pdf_parser, sentence_text_splitter),
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
}
use_vectors = not args.novectors
embeddings: Optional[OpenAIEmbeddings] = None
if use_vectors and args.openaihost != "openai":
Expand Down Expand Up @@ -128,8 +135,7 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi
return FileStrategy(
list_file_strategy=list_file_strategy,
blob_manager=blob_manager,
pdf_parser=pdf_parser,
text_splitter=TextSplitter(has_image_embeddings=args.searchimages),
file_processors=file_processors,
document_action=document_action,
embeddings=embeddings,
image_embeddings=image_embeddings,
Expand Down
10 changes: 10 additions & 0 deletions scripts/prepdocslib/fileprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from dataclasses import dataclass

from .parser import Parser
from .textsplitter import TextSplitter


@dataclass(frozen=True)
class FileProcessor:
    """Bundle the parser and the text splitter that are used together for a file type.

    The dataclass is declared with ``frozen=True``, so instances are immutable:
    assigning to ``parser`` or ``splitter`` after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Parser half of the pair (type declared in .parser).
    parser: Parser
    # Splitter half of the pair (type declared in .textsplitter).
    # NOTE(review): presumably applied to the parser's output — confirm against the consumer.
    splitter: TextSplitter
Loading

0 comments on commit 270d869

Please sign in to comment.