From 270d869b243c14a823646b7b68fcf9816a19d5eb Mon Sep 17 00:00:00 2001 From: Chris Ayers Date: Thu, 1 Feb 2024 20:03:20 -0500 Subject: [PATCH] Add JsonFileParser to FileStrategy (#1195) * Add JsonFileParser to FileStrategy * Refactor JSON parser in prepdocs.py * fixed linting errors with ruff * Fix formatting in filestrategy.py and test_jsonparser.py * Added new textsplitter and tests * Added File processors and refactor of prepdocs.py * fix ruff formatting issues * fix linting errors * Update scripts/prepdocslib/jsonparser.py Co-authored-by: Pamela Fox * Update scripts/prepdocslib/parser.py Co-authored-by: Pamela Fox * Added sample json data, fixed bug in file extension * Fix file extension retrieval in File class * Refactor prepdocs.py script * renamed data examples, added test * Fix offset, add tests * Add pragma no cover * Use the whole version of dataclass * Run ruff on imports * Reformatting --------- Co-authored-by: Pamela Fox Co-authored-by: Pamela Fox --- data/Json_Examples/2189.json | 14 ++ data/Json_Examples/2190.json | 14 ++ data/Json_Examples/2191.json | 14 ++ data/Json_Examples/2192.json | 14 ++ data/Json_Examples/query.json | 244 ++++++++++++++++++++++++ pyproject.toml | 4 +- scripts/prepdocs.py | 38 ++-- scripts/prepdocslib/fileprocessor.py | 10 + scripts/prepdocslib/filestrategy.py | 22 ++- scripts/prepdocslib/jsonparser.py | 23 +++ scripts/prepdocslib/listfilestrategy.py | 3 + scripts/prepdocslib/page.py | 24 +++ scripts/prepdocslib/parser.py | 14 ++ scripts/prepdocslib/pdfparser.py | 35 +--- scripts/prepdocslib/textsplitter.py | 43 ++++- tests/test_jsonparser.py | 32 ++++ tests/test_listfilestrategy.py | 6 + tests/test_prepdocslib_textsplitter.py | 47 ++++- 18 files changed, 531 insertions(+), 70 deletions(-) create mode 100644 data/Json_Examples/2189.json create mode 100644 data/Json_Examples/2190.json create mode 100644 data/Json_Examples/2191.json create mode 100644 data/Json_Examples/2192.json create mode 100644 data/Json_Examples/query.json 
create mode 100644 scripts/prepdocslib/fileprocessor.py create mode 100644 scripts/prepdocslib/jsonparser.py create mode 100644 scripts/prepdocslib/page.py create mode 100644 scripts/prepdocslib/parser.py create mode 100644 tests/test_jsonparser.py diff --git a/data/Json_Examples/2189.json b/data/Json_Examples/2189.json new file mode 100644 index 0000000000..d7066c9fbd --- /dev/null +++ b/data/Json_Examples/2189.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:38.69Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:38.69Z", + "Description": "As a customer, I would like to reserve a conference room such that:

1. It should display available date and time slots
2. Give an option to reserve a conference room for X hours
3. One can reserve a conference room for max 4 hours per day
", + "Id": 2189, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:38.69Z", + "Tags": "Reservation", + "Title": "As a customer, I would like to reserve a conference room" +} diff --git a/data/Json_Examples/2190.json b/data/Json_Examples/2190.json new file mode 100644 index 0000000000..5a45f1158f --- /dev/null +++ b/data/Json_Examples/2190.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:38.997Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:38.997Z", + "Description": "

Enter the guest's name to whom you would\nlike to send a confirmation, display the company, contact, source\nand agent associated\nwith the reservation.

", + "Id": 2190, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:38.997Z", + "Tags": "Notification", + "Title": "As a reservation agent, I would like to send confirmations to guest" +} diff --git a/data/Json_Examples/2191.json b/data/Json_Examples/2191.json new file mode 100644 index 0000000000..455e4c9a24 --- /dev/null +++ b/data/Json_Examples/2191.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:39.17Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:39.17Z", + "Description": "

If you have not picked up\nyour vehicle you can remove or cancel your reservation by clicking here.


1. Car reserved should have an option to cancel the request
2. Car driver should receive a notification about cancellation
", + "Id": 2191, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:39.17Z", + "Tags": "Reservation", + "Title": "As a customer, I should be able to remove a car reservation " +} diff --git a/data/Json_Examples/2192.json b/data/Json_Examples/2192.json new file mode 100644 index 0000000000..d2e489f317 --- /dev/null +++ b/data/Json_Examples/2192.json @@ -0,0 +1,14 @@ +{ + "AreaPath": "SmartHotel360", + "AssignedTo": null, + "Categories": null, + "ChangedDate": "2023-12-13T23:08:39.383Z", + "ClosedDate": null, + "CreatedDate": "2023-12-13T23:08:39.383Z", + "Description": "As a courtesy, grant an\nextra hour or two to leave the room, especially if it isn't booked\nfor the upcoming evening. But customer must call the front desk\nin advance and request a late checkout.


1. Late Check-out time should be displayed
2. Request should be sent to front-desk 
3. Any extra charge should be displayed
", + "Id": 2192, + "State": "New", + "StateChangeDate": "2023-12-13T23:08:39.383Z", + "Tags": "Front-desk; Members; Reservation", + "Title": "As a customer, I should be able to request hotel for late Check-out" +} diff --git a/data/Json_Examples/query.json b/data/Json_Examples/query.json new file mode 100644 index 0000000000..ceb9dc1a6f --- /dev/null +++ b/data/Json_Examples/query.json @@ -0,0 +1,244 @@ +[ + { + "fields": { + "System.Id": 2348, + "System.State": "New", + "System.Title": "Provide related items or frequently bought together section when people browse or search", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2348, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2348" + }, + { + "fields": { + "System.Id": 2349, + "System.State": "New", + "System.Title": "As tester, I need to test the website on all the relevant broswers and devices and be sure that it can handle our load.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2349, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2349" + }, + { + "fields": { + "System.Id": 2350, + "System.State": "New", + "System.Title": "As a customer, I should be able to put items to shopping cart", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2350, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2350" + }, + { + "fields": { + "System.Id": 2351, + "System.State": "New", + "System.Title": "As a customer, I should be able to print my purchase order", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2351, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2351" + }, + { + "fields": { + "System.Id": 2352, + "System.State": "New", + "System.Title": "As a customer, I would like to have a sort capabaility by price and customer ratings", + "System.WorkItemType": "Product Backlog Item" + }, + 
"id": 2352, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2352" + }, + { + "fields": { + "System.Id": 2353, + "System.State": "New", + "System.Title": "Recommended products must be based on customer purchase pattern history", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2353, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2353" + }, + { + "fields": { + "System.Id": 2354, + "System.State": "New", + "System.Title": "As a customer, I would like to save my addresses so that I can easily select the address for delivery", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2354, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2354" + }, + { + "fields": { + "System.Id": 2355, + "System.State": "New", + "System.Title": "As marketer, I want to run an A|B test on alternative Web Sites using Application Insights.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2355, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2355" + }, + { + "fields": { + "System.AssignedTo": { + "_links": { + "avatar": { + "href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0" + } + }, + "descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "displayName": "Chris Ayers", + "id": "cd8258ec-ad87-4c0d-9026-e5e343447185", + "imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "uniqueName": "chrisayers@microsoft.com", + "url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185" + }, + "System.Id": 2356, + "System.State": "Done", + "System.Title": "Provide customers the ability to track status of the package", + 
"System.WorkItemType": "Product Backlog Item" + }, + "id": 2356, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2356" + }, + { + "fields": { + "System.AssignedTo": { + "_links": { + "avatar": { + "href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0" + } + }, + "descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "displayName": "Chris Ayers", + "id": "cd8258ec-ad87-4c0d-9026-e5e343447185", + "imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0", + "uniqueName": "chrisayers@microsoft.com", + "url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185" + }, + "System.Id": 2357, + "System.State": "Done", + "System.Title": "As a customer, I would like to have the ability to send my items as gift", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2357, + "relations": null, + "rev": 2, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2357" + }, + { + "fields": { + "System.Id": 2358, + "System.State": "Committed", + "System.Title": "As a customer, I would like to store my credit card details securely", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2358, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2358" + }, + { + "fields": { + "System.Id": 2359, + "System.State": "Committed", + "System.Title": "As a customer, I should be able to select different shipping option", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2359, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2359" + }, + { + "fields": { + "System.Id": 2360, + "System.State": "Committed", + "System.Title": "As developer, I want to use Azure Machine Learning to provide a 
recommendations engine behind the website.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2360, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2360" + }, + { + "fields": { + "System.Id": 2361, + "System.State": "Committed", + "System.Title": "Provide tentative duration for shipping.", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2361, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2361" + }, + { + "fields": { + "System.Id": 2362, + "System.State": "Approved", + "System.Title": "Notify the user about any changes made to the order", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2362, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2362" + }, + { + "fields": { + "System.Id": 2363, + "System.State": "Approved", + "System.Title": "As a admin, I should be able to update prices on ad-hoc condition", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2363, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2363" + }, + { + "fields": { + "System.Id": 2364, + "System.State": "Approved", + "System.Title": "As a customer, I would like to provide my feedback on items that I have purchased", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2364, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2364" + }, + { + "fields": { + "System.Id": 2365, + "System.State": "Approved", + "System.Title": "As a customer, I would like to have a wishlist where I can add items for future purchase", + "System.WorkItemType": "Product Backlog Item" + }, + "id": 2365, + "relations": null, + "rev": 1, + "url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2365" + } +] diff --git a/pyproject.toml b/pyproject.toml index a24e3e7541..fe608e275f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 
+1,7 @@ [tool.ruff] target-version = "py38" -select = ["E", "F", "I", "UP"] -ignore = ["E501", "E701"] # line too long, multiple statements on one line +lint.select = ["E", "F", "I", "UP"] +lint.ignore = ["E501", "E701"] # line too long, multiple statements on one line src = ["app/backend", "scripts"] [tool.ruff.isort] diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index bb73bc6602..a346041c6d 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -14,15 +14,18 @@ OpenAIEmbeddings, OpenAIEmbeddingService, ) +from prepdocslib.fileprocessor import FileProcessor from prepdocslib.filestrategy import DocumentAction, FileStrategy +from prepdocslib.jsonparser import JsonParser from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, ListFileStrategy, LocalListFileStrategy, ) -from prepdocslib.pdfparser import DocumentAnalysisPdfParser, LocalPdfParser, PdfParser +from prepdocslib.parser import Parser +from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser from prepdocslib.strategy import SearchInfo, Strategy -from prepdocslib.textsplitter import TextSplitter +from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter def is_key_empty(key): @@ -52,25 +55,29 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi verbose=args.verbose, ) - pdf_parser: PdfParser - if args.localpdfparser: - pdf_parser = LocalPdfParser() - else: - # check if Azure Document Intelligence credentials are provided - if args.formrecognizerservice is None: - print( - "Error: Azure Document Intelligence service is not provided. Please provide --formrecognizerservice or use --localpdfparser for local pypdf parser." 
- ) - exit(1) + pdf_parser: Parser + doc_int_parser: DocumentAnalysisParser + + # check if Azure Document Intelligence credentials are provided + if args.formrecognizerservice is not None: formrecognizer_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( credential if is_key_empty(args.formrecognizerkey) else AzureKeyCredential(args.formrecognizerkey) ) - pdf_parser = DocumentAnalysisPdfParser( + doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{args.formrecognizerservice}.cognitiveservices.azure.com/", credential=formrecognizer_creds, verbose=args.verbose, ) - + if args.localpdfparser or args.formrecognizerservice is None: + pdf_parser = LocalPdfParser() + else: + pdf_parser = doc_int_parser + sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=args.searchimages) + file_processors = { + ".pdf": FileProcessor(pdf_parser, sentence_text_splitter), + ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), + ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), + } use_vectors = not args.novectors embeddings: Optional[OpenAIEmbeddings] = None if use_vectors and args.openaihost != "openai": @@ -128,8 +135,7 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi return FileStrategy( list_file_strategy=list_file_strategy, blob_manager=blob_manager, - pdf_parser=pdf_parser, - text_splitter=TextSplitter(has_image_embeddings=args.searchimages), + file_processors=file_processors, document_action=document_action, embeddings=embeddings, image_embeddings=image_embeddings, diff --git a/scripts/prepdocslib/fileprocessor.py b/scripts/prepdocslib/fileprocessor.py new file mode 100644 index 0000000000..3b58130db8 --- /dev/null +++ b/scripts/prepdocslib/fileprocessor.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + +from .parser import Parser +from .textsplitter import TextSplitter + + +@dataclass(frozen=True) +class FileProcessor: + parser: Parser + splitter: TextSplitter diff --git 
a/scripts/prepdocslib/filestrategy.py b/scripts/prepdocslib/filestrategy.py index c44add7ebc..0a63e50fe8 100644 --- a/scripts/prepdocslib/filestrategy.py +++ b/scripts/prepdocslib/filestrategy.py @@ -3,11 +3,10 @@ from .blobmanager import BlobManager from .embeddings import ImageEmbeddings, OpenAIEmbeddings +from .fileprocessor import FileProcessor from .listfilestrategy import ListFileStrategy -from .pdfparser import PdfParser from .searchmanager import SearchManager, Section from .strategy import SearchInfo, Strategy -from .textsplitter import TextSplitter class DocumentAction(Enum): @@ -25,8 +24,7 @@ def __init__( self, list_file_strategy: ListFileStrategy, blob_manager: BlobManager, - pdf_parser: PdfParser, - text_splitter: TextSplitter, + file_processors: dict[str, FileProcessor], document_action: DocumentAction = DocumentAction.Add, embeddings: Optional[OpenAIEmbeddings] = None, image_embeddings: Optional[ImageEmbeddings] = None, @@ -36,8 +34,7 @@ def __init__( ): self.list_file_strategy = list_file_strategy self.blob_manager = blob_manager - self.pdf_parser = pdf_parser - self.text_splitter = text_splitter + self.file_processors = file_processors self.document_action = document_action self.embeddings = embeddings self.image_embeddings = image_embeddings @@ -61,12 +58,21 @@ async def run(self, search_info: SearchInfo): files = self.list_file_strategy.list() async for file in files: try: - pages = [page async for page in self.pdf_parser.parse(content=file.content)] + key = file.file_extension() + processor = self.file_processors[key] + if not processor: + # skip file if no parser is found + if search_info.verbose: + print(f"Skipping '{file.filename()}'.") + continue + if search_info.verbose: + print(f"Parsing '{file.filename()}'") + pages = [page async for page in processor.parser.parse(content=file.content)] if search_info.verbose: print(f"Splitting '{file.filename()}' into sections") sections = [ Section(split_page, content=file, category=self.category) - 
for split_page in self.text_splitter.split_pages(pages) + for split_page in processor.splitter.split_pages(pages) ] blob_sas_uris = await self.blob_manager.upload_blob(file) diff --git a/scripts/prepdocslib/jsonparser.py b/scripts/prepdocslib/jsonparser.py new file mode 100644 index 0000000000..48c3eac046 --- /dev/null +++ b/scripts/prepdocslib/jsonparser.py @@ -0,0 +1,23 @@ +import json +from typing import IO, AsyncGenerator + +from .page import Page +from .parser import Parser + + +class JsonParser(Parser): + """ + Concrete parser that can parse JSON into Page objects. A top-level object becomes a single Page, while a top-level array becomes multiple Page objects. + """ + + async def parse(self, content: IO) -> AsyncGenerator[Page, None]: + offset = 0 + data = json.loads(content.read()) + if isinstance(data, list): + for i, obj in enumerate(data): + offset += 1 # For opening bracket or comma before object + page_text = json.dumps(obj) + yield Page(i, offset, page_text) + offset += len(page_text) + elif isinstance(data, dict): + yield Page(0, 0, json.dumps(data)) diff --git a/scripts/prepdocslib/listfilestrategy.py b/scripts/prepdocslib/listfilestrategy.py index 153a1081d5..d0b24876f1 100644 --- a/scripts/prepdocslib/listfilestrategy.py +++ b/scripts/prepdocslib/listfilestrategy.py @@ -26,6 +26,9 @@ def __init__(self, content: IO, acls: Optional[dict[str, list]] = None): def filename(self): return os.path.basename(self.content.name) + def file_extension(self): + return os.path.splitext(self.content.name)[1] + def filename_to_id(self): filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", self.filename()) filename_hash = base64.b16encode(self.filename().encode("utf-8")).decode("ascii") diff --git a/scripts/prepdocslib/page.py b/scripts/prepdocslib/page.py new file mode 100644 index 0000000000..f12fe70b94 --- /dev/null +++ b/scripts/prepdocslib/page.py @@ -0,0 +1,24 @@ +class Page: + """ + A single page from a document + + Attributes: + page_num (int): Page number + offset 
(int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") + text (str): The text of the page + """ + + def __init__(self, page_num: int, offset: int, text: str): + self.page_num = page_num + self.offset = offset + self.text = text + + +class SplitPage: + """ + A section of a page that has been split into a smaller chunk. + """ + + def __init__(self, page_num: int, text: str): + self.page_num = page_num + self.text = text diff --git a/scripts/prepdocslib/parser.py b/scripts/prepdocslib/parser.py new file mode 100644 index 0000000000..09d12e0ad6 --- /dev/null +++ b/scripts/prepdocslib/parser.py @@ -0,0 +1,14 @@ +from abc import ABC +from typing import IO, AsyncGenerator + +from .page import Page + + +class Parser(ABC): + """ + Abstract parser that parses content into Page objects + """ + + async def parse(self, content: IO) -> AsyncGenerator[Page, None]: + if False: + yield # pragma: no cover - this is necessary for mypy to type check diff --git a/scripts/prepdocslib/pdfparser.py b/scripts/prepdocslib/pdfparser.py index c7e2b64491..10c2d9a2f2 100644 --- a/scripts/prepdocslib/pdfparser.py +++ b/scripts/prepdocslib/pdfparser.py @@ -1,5 +1,4 @@ import html -from abc import ABC from typing import IO, AsyncGenerator, Union from azure.ai.formrecognizer import DocumentTable @@ -8,36 +7,12 @@ from azure.core.credentials_async import AsyncTokenCredential from pypdf import PdfReader +from .page import Page +from .parser import Parser from .strategy import USER_AGENT -class Page: - """ - A single page from a pdf - - Attributes: - page_num (int): Page number - offset (int): If the text of the entire PDF was concatenated into a single string, the index of the first character on the page. 
For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") - text (str): The text of the page - """ - - def __init__(self, page_num: int, offset: int, text: str): - self.page_num = page_num - self.offset = offset - self.text = text - - -class PdfParser(ABC): - """ - Abstract parser that parses PDFs into pages - """ - - async def parse(self, content: IO) -> AsyncGenerator[Page, None]: - if False: - yield - - -class LocalPdfParser(PdfParser): +class LocalPdfParser(Parser): """ Concrete parser backed by PyPDF that can parse PDFs into pages To learn more, please visit https://pypi.org/project/pypdf/ @@ -53,7 +28,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: offset += len(page_text) -class DocumentAnalysisPdfParser(PdfParser): +class DocumentAnalysisParser(Parser): """ Concrete parser backed by Azure AI Document Intelligence that can parse PDFS into pages To learn more, please visit https://learn.microsoft.com/azure/ai-services/document-intelligence/overview @@ -108,7 +83,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: if table_id == -1: page_text += form_recognizer_results.content[page_offset + idx] elif table_id not in added_tables: - page_text += DocumentAnalysisPdfParser.table_to_html(tables_on_page[table_id]) + page_text += DocumentAnalysisParser.table_to_html(tables_on_page[table_id]) added_tables.add(table_id) yield Page(page_num=page_num, offset=offset, text=page_text) diff --git a/scripts/prepdocslib/textsplitter.py b/scripts/prepdocslib/textsplitter.py index 7421b57377..928aedd7e6 100644 --- a/scripts/prepdocslib/textsplitter.py +++ b/scripts/prepdocslib/textsplitter.py @@ -1,19 +1,22 @@ +from abc import ABC from typing import Generator, List -from .pdfparser import Page +from .page import Page, SplitPage -class SplitPage: +class TextSplitter(ABC): """ - A section of a page that has been split into a smaller chunk. 
+ Splits a list of pages into smaller chunks + :param pages: The pages to split + :return: A generator of SplitPage """ - def __init__(self, page_num: int, text: str): - self.page_num = page_num - self.text = text + def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]: + if False: + yield # pragma: no cover - this is necessary for mypy to type check -class TextSplitter: +class SentenceTextSplitter(TextSplitter): """ Class that splits pages into smaller chunks. This is required because embedding models may not be able to analyze an entire page at once """ @@ -105,3 +108,29 @@ def find_page(offset): if start + self.section_overlap < end: yield SplitPage(page_num=find_page(start), text=all_text[start:end]) + + +class SimpleTextSplitter(TextSplitter): + """ + Class that splits pages into smaller chunks based on a max object length. It is not aware of the content of the page. + This is required because embedding models may not be able to analyze an entire page at once + """ + + def __init__(self, max_object_length: int = 1000, verbose: bool = False): + self.max_object_length = max_object_length + self.verbose = verbose + + def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]: + all_text = "".join(page.text for page in pages) + if len(all_text.strip()) == 0: + return + + length = len(all_text) + if length <= self.max_object_length: + yield SplitPage(page_num=0, text=all_text) + return + + # its too big, so we need to split it + for i in range(0, length, self.max_object_length): + yield SplitPage(page_num=i // self.max_object_length, text=all_text[i : i + self.max_object_length]) + return diff --git a/tests/test_jsonparser.py b/tests/test_jsonparser.py new file mode 100644 index 0000000000..9ebc70919d --- /dev/null +++ b/tests/test_jsonparser.py @@ -0,0 +1,32 @@ +import io + +import pytest + +from scripts.prepdocslib.jsonparser import JsonParser + + +@pytest.mark.asyncio +async def test_jsonparser_single_obj(): + file = 
io.StringIO('{"test": "test"}') + file.name = "test.json" + jsonparser = JsonParser() + pages = [page async for page in jsonparser.parse(file)] + assert len(pages) == 1 + assert pages[0].page_num == 0 + assert pages[0].offset == 0 + assert pages[0].text == '{"test": "test"}' + + +@pytest.mark.asyncio +async def test_jsonparser_array_multiple_obj(): + file = io.StringIO('[{"test1": "test"},{"test2": "test"}]') + file.name = "test.json" + jsonparser = JsonParser() + pages = [page async for page in jsonparser.parse(file)] + assert len(pages) == 2 + assert pages[0].page_num == 0 + assert pages[0].offset == 1 + assert pages[0].text == '{"test1": "test"}' + assert pages[1].page_num == 1 + assert pages[1].offset == 19 + assert pages[1].text == '{"test2": "test"}' diff --git a/tests/test_listfilestrategy.py b/tests/test_listfilestrategy.py index 6bed5b4b51..bc72c4eba7 100644 --- a/tests/test_listfilestrategy.py +++ b/tests/test_listfilestrategy.py @@ -19,6 +19,12 @@ def test_file_filename(): assert File(empty).filename() == "foo.pdf" +def test_file_file_extension(): + empty = io.BytesIO() + empty.name = "test/foo.pdf" + assert File(empty).file_extension() == ".pdf" + + def test_file_contextmanager(): empty = io.BytesIO() empty.name = "test/foo.pdf" diff --git a/tests/test_prepdocslib_textsplitter.py b/tests/test_prepdocslib_textsplitter.py index e6bcc08cb5..830bae53c1 100644 --- a/tests/test_prepdocslib_textsplitter.py +++ b/tests/test_prepdocslib_textsplitter.py @@ -4,19 +4,20 @@ import pytest from scripts.prepdocslib.listfilestrategy import LocalListFileStrategy +from scripts.prepdocslib.page import Page from scripts.prepdocslib.pdfparser import LocalPdfParser from scripts.prepdocslib.searchmanager import Section -from scripts.prepdocslib.textsplitter import Page, TextSplitter +from scripts.prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter -def test_split_empty_pages(): - t = TextSplitter(False, True) +def 
test_sentencetextsplitter_split_empty_pages(): + t = SentenceTextSplitter(False, True) assert list(t.split_pages([])) == [] -def test_split_small_pages(): - t = TextSplitter(has_image_embeddings=False, verbose=True) +def test_sentencetextsplitter_split_small_pages(): + t = SentenceTextSplitter(has_image_embeddings=False, verbose=True) split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text="Not a large page")])) assert len(split_pages) == 1 @@ -25,8 +26,8 @@ def test_split_small_pages(): @pytest.mark.asyncio -async def test_list_parse_and_split(tmp_path): - text_splitter = TextSplitter(False, True) +async def test_sentencetextsplitter_list_parse_and_split(tmp_path): + text_splitter = SentenceTextSplitter(False, True) pdf_parser = LocalPdfParser() for pdf in Path("data").glob("*.pdf"): shutil.copy(str(pdf.absolute()), tmp_path) @@ -44,3 +45,35 @@ async def test_list_parse_and_split(tmp_path): assert sections processed += 1 assert processed > 1 + + +def test_simpletextsplitter_split_empty_pages(): + t = SimpleTextSplitter(True) + + assert list(t.split_pages([])) == [] + + +def test_simpletextsplitter_split_small_pages(): + t = SimpleTextSplitter(verbose=True) + + split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text='{"test": "Not a large page"}')])) + assert len(split_pages) == 1 + assert split_pages[0].page_num == 0 + assert split_pages[0].text == '{"test": "Not a large page"}' + + +def test_sentencetextsplitter_split_pages(): + max_object_length = 10 + t = SimpleTextSplitter(max_object_length=max_object_length, verbose=True) + + split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text='{"test": "Not a large page"}')])) + assert len(split_pages) == 3 + assert split_pages[0].page_num == 0 + assert split_pages[0].text == '{"test": "' + assert len(split_pages[0].text) <= max_object_length + assert split_pages[1].page_num == 1 + assert split_pages[1].text == "Not a larg" + assert len(split_pages[1].text) <= 
max_object_length + assert split_pages[2].page_num == 2 + assert split_pages[2].text == 'e page"}' + assert len(split_pages[2].text) <= max_object_length