From d093d597b5e8c7cadc707363c38997f4b2181976 Mon Sep 17 00:00:00 2001 From: ranxia Date: Fri, 10 Jan 2025 11:02:52 +0800 Subject: [PATCH 1/9] format pdf reader --- poetry.lock | 101 +---- pyproject.toml | 4 +- pyproject_gpu.toml | 4 +- .../integrations/readers/pai_pdf_reader.py | 377 ++++++++---------- 4 files changed, 189 insertions(+), 297 deletions(-) diff --git a/poetry.lock b/poetry.lock index 208029b0..ed3970bf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5083,12 +5083,12 @@ source = ["Cython (>=3.0.11)"] [[package]] name = "magic-pdf" -version = "0.10.5" +version = "0.10.6" description = "A practical tool for converting PDF to Markdown" optional = false python-versions = ">=3.9" files = [ - {file = "magic_pdf-0.10.5-py3-none-any.whl", hash = "sha256:723655c30da3d595da6873894dafa6783f453ac71e871ee7584488e316f3a0fa"}, + {file = "magic_pdf-0.10.6-py3-none-any.whl", hash = "sha256:d9efb6d99a74d451ffac0118c202318f4e85ef67822a2c6183969034ea31bf9e"}, ] [package.dependencies] @@ -5111,6 +5111,7 @@ paddlepaddle = [ {version = "3.0.0b1", optional = true, markers = "platform_system == \"Linux\" and extra == \"full\""}, {version = "2.6.1", optional = true, markers = "(platform_system == \"Windows\" or platform_system == \"Darwin\") and extra == \"full\""}, ] +"pdfminer.six" = "20231228" pydantic = ">=2.7.2,<2.8.0" PyMuPDF = ">=1.24.9" PyYAML = {version = "*", optional = true, markers = "extra == \"full\""} @@ -5118,13 +5119,17 @@ rapid-table = {version = "*", optional = true, markers = "extra == \"full\""} rapidocr-paddle = {version = "*", optional = true, markers = "extra == \"full\""} scikit-learn = ">=1.0.2" struct-eqtable = {version = "0.3.2", optional = true, markers = "extra == \"full\""} -torch = ">=2.2.2,<=2.3.1" +torch = [ + {version = ">=2.2.2"}, + {version = ">=2.2.2,<=2.3.1", optional = true, markers = "extra == \"full\""}, +] +torchvision = {version = ">=0.17.2,<=0.18.1", optional = true, markers = "extra == \"full\""} transformers = "*" -ultralytics = {version = "*", optional = true, markers = "extra == \"full\""} -unimernet = {version = "0.2.1", optional = true, markers = "extra == \"full\""} +ultralytics = {version = ">=8.3.48", optional = true, markers = "extra == \"full\""} +unimernet = {version = "0.2.2", optional = true, markers = "extra == \"full\""} [package.extras] -full = ["PyYAML", "accelerate", "detectron2", "doclayout-yolo (==0.0.2)", "einops", "matplotlib", "matplotlib (<=3.9.0)", "paddleocr (==2.7.3)", "paddlepaddle (==2.6.1)", "paddlepaddle (==3.0.0b1)", "rapid-table", "rapidocr-paddle", "struct-eqtable (==0.3.2)", "ultralytics", "unimernet (==0.2.1)"] +full = ["PyYAML", "accelerate", "detectron2", "doclayout-yolo (==0.0.2)", "einops", "matplotlib", "matplotlib (<=3.9.0)", "paddleocr (==2.7.3)", "paddlepaddle (==2.6.1)", "paddlepaddle (==3.0.0b1)", "rapid-table", "rapidocr-paddle", "struct-eqtable (==0.3.2)", "torch (>=2.2.2,<=2.3.1)", "torchvision (>=0.17.2,<=0.18.1)", "ultralytics (>=8.3.48)", "unimernet (==0.2.2)"] lite = ["paddleocr (==2.7.3)", "paddlepaddle (==2.6.1)", "paddlepaddle (==3.0.0b1)"] old-linux = ["albumentations (<=1.4.20)"] @@ -10280,71 +10285,6 @@ type = "legacy" url = "https://download.pytorch.org/whl/cpu" reference = "pytorch_cpu" -[[package]] -name = "torchtext" -version = "0.17.2" -description = "Text utilities, models, transforms, and datasets for PyTorch." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "torchtext-0.17.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:760ef1464ffe72c5a25f53617b011afeb31641f0ad016479b96d151d2acd8396"}, - {file = "torchtext-0.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84b8b907f6bfbf637ea80521060c8800d3b9f5a5b4ba2daf88ce0f6e728dafa0"}, - {file = "torchtext-0.17.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6bbfd762aa804b48bd7bd454997597adaa9cf785dba252a2e03241e48c6dbd35"}, - {file = "torchtext-0.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:f09b677316bb2c1a2a326a01f3549a8f3096170209d5676a7dd6f47583b58e86"}, - {file = "torchtext-0.17.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:1c0235333af0566f343bcfec7ffab0ef217152d75f3951432d4edd9eeb4185ad"}, - {file = "torchtext-0.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5ea1b44b2ea6792ec0aa1a72527dbd8d92c46e4b5f7180e854082df0e1ae4d60"}, - {file = "torchtext-0.17.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:408b2c07351a6ac4090409860dca9fd8302e189403d77091c30a112824686c83"}, - {file = "torchtext-0.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:a9f2af1380e9cd7bc55839ffa353c7bfd62eeae9e48383085e3b4494dcdf82c8"}, - {file = "torchtext-0.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1eb3d76abce1f736f369787e089beb4fe6f93665f304fea996fe634e28c13bb9"}, - {file = "torchtext-0.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39d34491ec9921e7f6877ec28558d401807364006ae6035ff980361b1a09e7e1"}, - {file = "torchtext-0.17.2-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:04fb201d46f68a7f708aac7cb9605f8c4a67927f311849d5f00a8095f6132fe3"}, - {file = "torchtext-0.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:0664f14781a6b045cb5e6610f3ad6240a8f7c1cb92e9567efaf54836a1a56a38"}, - {file = "torchtext-0.17.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:72318d42a831e93a7ca22b801b8790790aecf8fc70e59d5ef0389ebb2b9f62d7"}, - {file = "torchtext-0.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2f3d707f028b4876d12cb72a4a53c5f1c6fef7987b991cb2bc47662db07e6f92"}, - {file = "torchtext-0.17.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8b531ca852280211ea13cfab031d196b9ce010bf4ec6434605c8419e27fc3231"}, - {file = "torchtext-0.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:6d60cb71195cd75f30ac065ea60f6dd4d0b313b806724b07ae3bf4d93ece568f"}, - {file = "torchtext-0.17.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:41de828f25f918db53145cf38bfeb1ae0f9e9a0d9c66494dbc39779475055036"}, - {file = "torchtext-0.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f07201e6cd6d3ffad15cf3a65eb0e8409ad82a2859b880122e1b66aba5a3b2dd"}, - {file = "torchtext-0.17.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:486b51a83f880700824be05ea0ea60e23cd821d15e0a89300c26d35b6f4eb8da"}, - {file = "torchtext-0.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:a6268589890779947d8a5d1a589f8c9fcc88b76cbaf93e909005414cc660b536"}, -] - -[package.dependencies] -numpy = "*" -requests = "*" -torch = "2.2.2" -tqdm = "*" - -[[package]] -name = "torchtext" -version = "0.18.0" -description = "Text utilities, models, transforms, and datasets for PyTorch." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "torchtext-0.18.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5826d5bbfe84a3c533e7e97659f72dbff73e1614c00c06709607d17c8446e09c"}, - {file = "torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:3dc446f74aaa9aebab045fbefd102752675258e72ba447982c65e010e1cfd29a"}, - {file = "torchtext-0.18.0-cp310-cp310-win_amd64.whl", hash = "sha256:d4bfe9cb7b08cf7ff3473309d9f24ed243c3a847bfbb2c932925551bf7a05892"}, - {file = "torchtext-0.18.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0f3855b2ada84f02298e72ad19c1a86f940df2f4ce62d89098955f3ae575d174"}, - {file = "torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7ac7a392ae42d8b7675bdb31f1764bec77d4dec3a44bca5a2644c2cee3484453"}, - {file = "torchtext-0.18.0-cp311-cp311-win_amd64.whl", hash = "sha256:1e00475dbf629ba529d27903f2dd6b53c4a559f1483539b8c2a821d393bd24cf"}, - {file = "torchtext-0.18.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fec43696fb6fa7573e740a8175fd69681106574fd1fc840211182d941b88a2ba"}, - {file = "torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:eeebf2ec950c9f9d3b276faf6948e763836c215747354f0340746b32512d11f6"}, - {file = "torchtext-0.18.0-cp312-cp312-win_amd64.whl", hash = "sha256:99b5148f77aa5d94adb8d4d5b684181d87673b90ba266d858b1dd8812b418b95"}, - {file = "torchtext-0.18.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6694b823cb409706a0efe4d6b0ccf6b5be5af695fad29aa062f1f63bd296e77b"}, - {file = "torchtext-0.18.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0d60cde93217086372e6819806298a327aaa71f1818ff9c54380bbd5995dda78"}, - {file = "torchtext-0.18.0-cp38-cp38-win_amd64.whl", hash = "sha256:6dd72c5fbca0680cfef14cb620f8edf7b01e4121916f4b45e2d50f1cdba53fe9"}, - {file = "torchtext-0.18.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b74b0b1e93ff852a0410bdf2b630f4b00a870ec95be6266e01cd5e19acdf3e95"}, - {file = "torchtext-0.18.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:8e8d847a5e359718c1a97cab363de93aef93733c102528231f3b36c9cf580ce2"}, - {file = "torchtext-0.18.0-cp39-cp39-win_amd64.whl", hash = "sha256:077639a367e1f77b2c7cefd952ec83c9f830a7568fb49f10cbc100eb965da06b"}, -] - -[package.dependencies] -numpy = "*" -requests = "*" -torch = ">=2.3.0" -tqdm = "*" - [[package]] name = "torchvision" version = "0.17.2" @@ -10704,13 +10644,13 @@ files = [ [[package]] name = "ultralytics" -version = "8.3.43" +version = "8.3.58" description = "Ultralytics YOLO 🚀 for SOTA object detection, multi-object tracking, instance segmentation, pose estimation and image classification." 
optional = false python-versions = ">=3.8" files = [ - {file = "ultralytics-8.3.43-py3-none-any.whl", hash = "sha256:b438a4f11b6a418f4592a72e25dec910539ebc6834b9002e5019e2984ec95081"}, - {file = "ultralytics-8.3.43.tar.gz", hash = "sha256:ef17ec842e7b10e6e97028ae1d1af70108b59c4511c9799881f00f6ff4b3a93e"}, + {file = "ultralytics-8.3.58-py3-none-any.whl", hash = "sha256:b6c87e845d6bfcc49c84bb2d7922af2f57fee637631bc6a2e95f409e5f867496"}, + {file = "ultralytics-8.3.58.tar.gz", hash = "sha256:2d0ca4e5e7612365dc74c4c75e23ef0aecd677887dd7d272dbb77bd93e6ef087"}, ] [package.dependencies] @@ -10737,7 +10677,7 @@ tqdm = ">=4.64.0" ultralytics-thop = ">=2.0.0" [package.extras] -dev = ["coverage[toml]", "ipython", "mkdocs (>=1.6.0)", "mkdocs-jupyter", "mkdocs-macros-plugin (>=1.0.5)", "mkdocs-material (>=9.5.9)", "mkdocs-redirects", "mkdocs-ultralytics-plugin (>=0.1.8)", "mkdocstrings[python]", "pytest", "pytest-cov"] +dev = ["coverage[toml]", "ipython", "mkdocs (>=1.6.0)", "mkdocs-macros-plugin (>=1.0.5)", "mkdocs-material (>=9.5.9)", "mkdocs-redirects", "mkdocs-ultralytics-plugin (>=0.1.8)", "mkdocstrings[python]", "pytest", "pytest-cov"] export = ["coremltools (>=7.0)", "flatbuffers (>=23.5.26,<100)", "h5py (!=3.11.0)", "keras", "numpy (==1.23.5)", "onnx (>=1.12.0)", "openvino (>=2024.0.0)", "scikit-learn (>=1.3.2)", "tensorflow (>=2.0.0)", "tensorflowjs (>=3.9.0)", "tensorstore (>=0.1.63)"] extra = ["albumentations (>=1.4.6)", "hub-sdk (>=0.0.12)", "ipython", "pycocotools (>=2.0.7)"] logging = ["comet", "dvclive (>=2.12.0)", "tensorboard (>=2.13.0)"] @@ -10784,12 +10724,12 @@ tbb = ["tbb (>=2019.0)"] [[package]] name = "unimernet" -version = "0.2.1" +version = "0.2.2" description = "UniMERNet: A Universal Network for Real-World Mathematical Expression Recognition" optional = false python-versions = ">=3.10" files = [ - {file = "unimernet-0.2.1-py3-none-any.whl", hash = "sha256:19fdc0dc7d541523cf9ef05549681ddae5e181f5f536259b5af4be3366e3a049"}, + {file = "unimernet-0.2.2-py3-none-any.whl", hash = "sha256:68a57568138d18f2cc9421de98376c6d220b9de359e6a5a40f29dacd71f38345"}, ] [package.dependencies] @@ -10804,9 +10744,8 @@ omegaconf = ">=2.3.0,<3.0.0" opencv-python = ">=4.6.0,<5.0.0" rapidfuzz = ">=3.8.1,<4.0.0" timm = ">=0.9.16,<0.10.0" -torch = ">=2.2.2,<=2.3.1" -torchtext = ">=0.17.2,<=0.18.0" -torchvision = ">=0.17.2,<=0.18.1" +torch = ">=2.2.2" +torchvision = ">=0.17.2" transformers = "4.42.4" wand = ">=0.6.13,<0.7.0" webdataset = ">=0.2.86,<0.3.0" @@ -11538,4 +11477,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11.0,<3.12" -content-hash = "14e9121ebcfd5df38ffe64393c48c012dbe068911fb26b3b85f8cf47cffc922c" +content-hash = "1ece5b6523463aa5b71fb1d4c27e2b9540c490c36355444292ed5adc656cd335" diff --git a/pyproject.toml b/pyproject.toml index c037d64f..a1aba419 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,14 +96,14 @@ detectron2 = [ {markers = "sys_platform == 'win32'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.3.0cpu-cp311-cp311-win_amd64.whl"}, {markers = "sys_platform != 'win32' and sys_platform != 'linux' ", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.2.2cpu-cp311-cp311-macosx_10_9_universal2.whl"} ] -magic-pdf = {version = "0.10.5", extras = ["full"]} +magic-pdf = {extras = ["full"], version = "^0.10.6"} peft = "^0.12.0" duckduckgo-search = "6.2.12" aliyun-bootstrap = "1.0.2" docx = "^0.2.4" 
python-pptx = "^1.0.2" aspose-slides = "^24.10.0" -ultralytics = "8.3.43" +ultralytics = "^8.3.58" datasketch = "^1.6.5" primp = "0.9.1" tablestore = "^6.1.0" diff --git a/pyproject_gpu.toml b/pyproject_gpu.toml index 343dacb3..01b501be 100644 --- a/pyproject_gpu.toml +++ b/pyproject_gpu.toml @@ -90,7 +90,7 @@ detectron2 = [ {markers = "sys_platform == 'win32'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.3.0cu121-cp311-cp311-win_amd64.whl"}, {markers = "sys_platform != 'win32' and sys_platform != 'linux' ", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/detectron2-0.6%2B864913fpt2.2.2cpu-cp311-cp311-macosx_10_9_universal2.whl"} ] -magic-pdf = {version = "0.10.5", extras = ["full"]} +magic-pdf = {version = "0.10.6", extras = ["full"]} paddlepaddle-gpu = [ {markers = "sys_platform == 'linux'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/paddlepaddle_gpu-3.0.0b1-cp311-cp311-linux_x86_64.whl"}, {markers = "sys_platform != 'linux'", url = "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/packages/python_wheels/paddlepaddle_gpu-3.0.0b1-cp311-cp311-win_amd64.whl"} @@ -100,7 +100,7 @@ aliyun-bootstrap = "^1.0.2" docx = "^0.2.4" python-pptx = "^1.0.2" aspose-slides = "^24.10.0" -ultralytics = "8.3.43" +ultralytics = "8.3.58" datasketch = "^1.6.5" tablestore = "^6.1.0" mistletoe = "^1.4.0" diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py index a875356f..3cc6755f 100644 --- a/src/pai_rag/integrations/readers/pai_pdf_reader.py +++ b/src/pai_rag/integrations/readers/pai_pdf_reader.py @@ -4,40 +4,25 @@ from typing import Dict, List, Optional, Union, Any from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document -from magic_pdf.data.data_reader_writer import FileBasedDataWriter +from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from pai_rag.utils.markdown_utils import ( transform_local_to_oss, - is_horizontal_table, ) -from bs4 import BeautifulSoup -from llama_index.core import Settings -from magic_pdf.pipe.UNIPipe import UNIPipe -from magic_pdf.pipe.OCRPipe import OCRPipe -import magic_pdf.model as model_config -from rapidocr_onnxruntime import RapidOCR -from rapid_table import RapidTable from operator import itemgetter -import time import tempfile import re from PIL import Image import os -import json from loguru import logger +from magic_pdf.data.dataset import PymuDocDataset +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.config.enums import SupportedPdfParseMethod +from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text +from magic_pdf.config.ocr_content_type import BlockType, ContentType +from magic_pdf.libs.commons import join_path +from urllib.parse import urlparse -model_config.__use_inside_model__ = True - - -IMAGE_MAX_PIXELS = 512 * 512 -TABLE_SUMMARY_MAX_ROW_NUM = 5 -TABLE_SUMMARY_MAX_COL_NUM = 10 -TABLE_SUMMARY_MAX_CELL_TOKEN = 20 -TABLE_SUMMARY_MAX_TOKEN = 200 -PAGE_TABLE_SUMMARY_MAX_TOKEN = 400 -IMAGE_URL_PATTERN = r"(https?://[^\s]+?[\s\w.-]*\.(jpg|jpeg|png|gif|bmp))" -IMAGE_LOCAL_PATTERN = r"!\[(?P.*?)\]\((?P/[^()\s]+(?:\s[^()\s]*)?/\S*?\.(jpg|jpeg|png|gif|bmp))\)" -IMAGE_COMBINED_PATTERN = r"!\[.*?\]\((https?://[^\s()]+|/[^()\s]+(?:\s[^()\s]*)?/\S*?\.(jpg|jpeg|png|gif|bmp))\)" DEFAULT_HEADING_DIFF_THRESHOLD = 2 @@ -68,166 +53,152 @@ def _transform_local_to_oss(self, pdf_name: str, 
local_url: str):
         image = Image.open(local_url)
         return transform_local_to_oss(self._oss_cache, image, pdf_name)
 
-    def replace_image_paths(self, pdf_name: str, content: str):
-        local_image_pattern = IMAGE_LOCAL_PATTERN
-        matches = re.findall(local_image_pattern, content)
-        for alt_text, local_url, image_type in matches:
-            if self._oss_cache:
-                time_tag = int(time.time())
-                oss_url = self._transform_local_to_oss(pdf_name, local_url)
-                updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
-                content = content.replace(
-                    f"![{alt_text}]({local_url})", f"![{updated_alt_text}]({oss_url})"
-                )
-            else:
-                content = content.replace(f"![{alt_text}]({local_url})", "")
-
-        return content
-
-    @staticmethod
-    def perform_ocr(img_path: str) -> str:
-        table_engine = RapidTable()
-        ocr_engine = RapidOCR()
-        ocr_result, _ = ocr_engine(img_path)
-        table_html_str, table_cell_bboxes, elapse = table_engine(img_path, ocr_result)
-        return table_html_str
-
-    @staticmethod
-    def html_table_to_list_of_lists(html):
-        soup = BeautifulSoup(html, "lxml")
-        table = soup.find("table")
-        if not table:
-            return []
-        table_data = []
-        for row in table.find_all("tr"):
-            cols = row.find_all(["td", "th"])
-            table_data.append([col.get_text(strip=True) for col in cols])
-        return table_data
-
-    @staticmethod
-    def add_table_ocr_content(
-        markdown_content: str, image_path: str, ocr_content: str
-    ) -> str:
-        pattern = rf"!\[(.*?)\]\({re.escape(image_path)}\)"
-        regex = re.compile(pattern)
-        replacement = "table"
-        offset = 0  # offset introduced by text inserted so far
-        for match in regex.finditer(markdown_content):
-            # record the start and end offsets of this match
-            start, end = match.span()
-            # locate the start and end of alt_text so it can be replaced
-            alt_start, alt_end = match.span(1)  # span(1) refers to the first capture group
-            new_alt_start = alt_start + offset
-            new_alt_end = alt_end + offset
+    def is_url(self, url: str) -> bool:
+        """Check whether the given string is a URL."""
+        try:
+            result = urlparse(url)
+            return all([result.scheme, result.netloc])
+        except ValueError:
+            return False
 
-            markdown_content = (
-                markdown_content[:new_alt_start]
-                + replacement
-                + markdown_content[new_alt_end:]
-            )
-            new_start = start + offset
-            ocr_content = f"\n\n{ocr_content}\n\n"
-            markdown_content = (
-                markdown_content[:new_start]
-                + ocr_content
-                + markdown_content[new_start:]
+    def create_markdwon(
+        self,
+        pdf_name: str,
+        pdf_info_dict: list,
+        img_buket_path: str = "",
+    ):
+        output_content = []
+        text_height_min = float("inf")
+        text_height_max = 0
+        title_list = []
+        for page_info in pdf_info_dict:
+            paras_of_layout = page_info.get("para_blocks")
+            if not paras_of_layout:
+                continue
+            page_markdown, text_height_min, text_height_max = self.create_page_markdwon(
+                pdf_name,
+                paras_of_layout,
+                img_buket_path,
+                title_list,
+                text_height_min,
+                text_height_max,
             )
-            # update the offset to account for the newly inserted text
-            offset += len(ocr_content) + len(replacement)
-        return markdown_content
-
-    @staticmethod
-    def limit_cell_size(cell: str, max_chars: int) -> str:
-        return (cell[:max_chars] + "...") if len(cell) > max_chars else cell
-
-    @staticmethod
-    def limit_table_content(table: List[List]) -> List[List]:
-        return [
-            [
-                PaiPDFReader.limit_cell_size(str(cell), TABLE_SUMMARY_MAX_CELL_TOKEN)
-                for cell in row
-            ]
-            for row in table
-        ]
-
-    @staticmethod
-    def tables_summarize(table: List[List]) -> str:
-        table = PaiPDFReader.limit_table_content(table)
-        if not is_horizontal_table(table):
-            table = list(zip(*table))
-        table = table[:TABLE_SUMMARY_MAX_ROW_NUM]
-        table = [row[:TABLE_SUMMARY_MAX_COL_NUM] for row in table]
-
-        prompt_text = f"Please generate a summary for the following table: {table}"
-        response = Settings.llm.complete(
-
prompt_text,
-            max_tokens=200,
-            n=1,
+            output_content.extend(page_markdown)
+        markdwon_content = "\n\n".join(output_content)
+        markdown_result = self.post_process_multi_level_headings(
+            title_list, markdwon_content, text_height_min, text_height_max
         )
-        summarized_text = response
-        return summarized_text.text
+        return markdown_result
 
-    def process_table(self, markdown_content, json_data):
-        ocr_count = 0
+    def create_page_markdwon(
+        self,
+        pdf_name,
+        paras_of_layout,
+        img_buket_path,
+        title_list,
+        text_height_min,
+        text_height_max,
+    ):
+        page_markdown = []
+        for para_block in paras_of_layout:
+            text_height_min, text_height_max = self.collect_title_info(
+                para_block, title_list, text_height_min, text_height_max
+            )
+            para_text = ""
+            para_type = para_block["type"]
+            if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
+                para_text = merge_para_with_text(para_block)
+            elif para_type == BlockType.Title:
+                para_text = f"# {merge_para_with_text(para_block)}"
+            elif para_type == BlockType.InterlineEquation:
+                para_text = merge_para_with_text(para_block)
+            elif para_type == BlockType.Image:
+                for block in para_block["blocks"]:  # 1st: assemble the image_body
+                    if block["type"] == BlockType.ImageBody:
+                        for line in block["lines"]:
+                            for span in line["spans"]:
+                                if span["type"] == ContentType.Image:
+                                    if span.get("image_path", "") and not self.is_url(
+                                        span.get("image_path", "")
+                                    ):
+                                        image_path = join_path(
+                                            img_buket_path, span["image_path"]
+                                        )
+                                        oss_url = self._transform_local_to_oss(
+                                            pdf_name, image_path
+                                        )
+                                        para_text += f"\n![]({oss_url}) \n"
+                for block in para_block["blocks"]:  # 2nd: append the image_caption
+                    if block["type"] == BlockType.ImageCaption:
+                        para_text += merge_para_with_text(block) + " \n"
+                for block in para_block["blocks"]:  # 3rd: append the image_footnote
+                    if block["type"] == BlockType.ImageFootnote:
+                        para_text += merge_para_with_text(block) + " \n"
+            elif para_type == BlockType.Table:
+                for block in para_block["blocks"]:  # 1st: append the table_caption
+                    if block["type"] == BlockType.TableCaption:
+                        para_text += merge_para_with_text(block) + " \n"
+                for block in para_block["blocks"]:  # 2nd: append the table_body
+                    if block["type"] == BlockType.TableBody:
+                        for line in block["lines"]:
+                            for span in line["spans"]:
+                                if span["type"] == ContentType.Table:
+                                    # if processed by table model
+                                    if span.get("latex", ""):
+                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    elif span.get("html", ""):
+                                        para_text += f"\n\n{span['html']}\n\n"
+                                    if span.get("image_path", "") and not self.is_url(
+                                        span.get("image_path", "")
+                                    ):
+                                        image_path = join_path(
+                                            img_buket_path, span["image_path"]
+                                        )
+                                        oss_url = self._transform_local_to_oss(
+                                            pdf_name, image_path
+                                        )
+                                        para_text += f"\n![]({oss_url}) \n"
+                for block in para_block["blocks"]:  # 3rd: append the table_footnote
+                    if block["type"] == BlockType.TableFootnote:
+                        para_text += merge_para_with_text(block) + " \n"
+
+            if para_text.strip() == "":
+                continue
+            else:
+                page_markdown.append(para_text.strip() + "  ")
 
-        for item in json_data:
-            if item["type"] == "table" and "img_path" in item:
-                img_path = item["img_path"]
-                if os.path.exists(img_path):
-                    ocr_count += 1
-                    ocr_content = PaiPDFReader.perform_ocr(img_path)
-                    if self.enable_table_summary:
-                        table_list_data = PaiPDFReader.html_table_to_list_of_lists(
-                            ocr_content
-                        )
-                        summarized_table_text = PaiPDFReader.tables_summarize(
-                            table_list_data
-                        )[:TABLE_SUMMARY_MAX_TOKEN]
-                        ocr_content += f"\n\n{summarized_table_text}\n\n"
-                        markdown_content = PaiPDFReader.add_table_ocr_content(
-                            markdown_content, item["img_path"], ocr_content
-                        )
-                    else:
-                        markdown_content = PaiPDFReader.add_table_ocr_content(
-                            markdown_content, item["img_path"], ocr_content
-                        )
-                else:
-                    logger.warning(f"Warning: image file does not exist {img_path}")
-        return markdown_content
+        return page_markdown, text_height_min, text_height_max
 
-    def post_process_multi_level_headings(self, json_data, md_content):
+    def collect_title_info(
+        self, para_block, title_list, text_height_min, text_height_max
+    ):
+        if not para_block.get("lines", None) or len(para_block["lines"]) <= 0:
+            return text_height_min, text_height_max
+        x0, y0, x1, y1 = para_block["lines"][0]["bbox"]
+        content_height = y1 - y0
+        if para_block["type"] == BlockType.Title:
+            title_height = int(content_height)
+            title_text = ""
+            for line in para_block["lines"]:
+                for span in line["spans"]:
+                    if span["type"] == "inline_equation":
+                        span["content"] = " $" + span["content"] + "$ "
+                    title_text += span["content"]
+            title_text = title_text.replace("\\", "\\\\")
+            title_list.append((title_text, title_height))
+        elif para_block["type"] == "text":
+            if content_height < text_height_min:
+                text_height_min = content_height
+            if content_height > text_height_max:
+                text_height_max = content_height
+        return text_height_min, text_height_max
+
+    def post_process_multi_level_headings(
+        self, title_list, md_content, text_height_min, text_height_max
+    ):
         logger.info(
             "*****************************start process headings*****************************"
         )
-        pages_list = json_data["pdf_info"]
-        if not pages_list:
-            return md_content
-        text_height_min = float("inf")
-        text_height_max = 0
-        title_list = []
-        for page in pages_list:
-            page_infos = page["preproc_blocks"]
-            for item in page_infos:
-                if not item.get("lines", None) or len(item["lines"]) <= 0:
-                    continue
-                x0, y0, x1, y1 = item["lines"][0]["bbox"]
-                content_height = y1 - y0
-                if item["type"] == "title":
-                    title_height = int(content_height)
-                    title_text = ""
-                    for line in item["lines"]:
-                        for span in line["spans"]:
-                            if span["type"] == "inline_equation":
-                                span["content"] = " $" + span["content"] + "$ "
-                            title_text += span["content"]
-                    title_text = title_text.replace("\\", "\\\\")
-                    title_list.append((title_text, title_height))
-                elif item["type"] == "text":
-                    if content_height < text_height_min:
-                        text_height_min = content_height
-                    if content_height > text_height_max:
-                        text_height_max = content_height
-
         sorted_list = sorted(title_list, key=itemgetter(1), reverse=True)
         diff_list = [
             (sorted_list[i][1] - sorted_list[i + 1][1], i)
@@ -261,13 +232,15 @@ def post_process_multi_level_headings(self, json_data, md_content):
             new_title = title_level + title_text
             md_content = re.sub(re.escape(old_title), new_title, md_content)
 
+        logger.info(
+            "*****************************process headings ended*****************************"
+        )
+
         return md_content
 
     def parse_pdf(
         self,
         pdf_path: str,
-        parse_method: str = "auto",
-        model_json_path: str = None,
     ):
         """
         Convert the PDF to JSON and Markdown, writing the .md and .json files to the directory containing the PDF file
@@ -281,46 +254,30 @@ def parse_pdf(
         pdf_name = pdf_name.replace(" ", "_")
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_file_path = os.path.join(temp_dir, pdf_name)
-            pdf_bytes = open(pdf_path, "rb").read()  # read the raw bytes of the PDF file
-            if model_json_path:
-                model_json = json.loads(
-                    open(model_json_path, "r", encoding="utf-8").read()
-                )
-            else:
-                model_json = []
-
-            # run the parsing steps
             image_writer = FileBasedDataWriter(temp_file_path)
+            reader1 = FileBasedDataReader("")
+            pdf_bytes = reader1.read(pdf_path)
+            ds = PymuDocDataset(pdf_bytes)
 
             # choose the parse mode
-            if parse_method == "auto":
-                jso_useful_key = {"_pdf_type": "", "model_list": model_json}
-                pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-            elif parse_method == "ocr":
-                pipe = OCRPipe(pdf_bytes, model_json, image_writer)
+            if (
+                self.enable_mandatory_ocr
+                or ds.classify() == SupportedPdfParseMethod.OCR
+            ):
+                infer_result = ds.apply(doc_analyze, ocr=True)
+                pipe_result = infer_result.pipe_ocr_mode(image_writer)
             else:
-                logger.error("unknown parse method, only auto, ocr, txt allowed")
-                exit(1)
-
-            # run classification
-            pipe.pipe_classify()
+                infer_result = ds.apply(doc_analyze, ocr=False)
+                pipe_result = infer_result.pipe_txt_mode(image_writer)
 
-            # if no model data was supplied, parse with the built-in models
-            if len(model_json) == 0:
-                pipe.pipe_analyze()  # analyze
+            content_list = pipe_result._pipe_res["pdf_info"]
 
-            # run the parse
-            pipe.pipe_parse()
-            content_list = pipe.pipe_mk_uni_format(temp_file_path, drop_mode="none")
-            md_content = pipe.pipe_mk_markdown(temp_file_path, drop_mode="none")
-            md_content = self.post_process_multi_level_headings(
-                pipe.pdf_mid_data, md_content
+            md_content = self.create_markdwon(
+                pdf_name, content_list, temp_file_path
             )
-            md_content = self.process_table(md_content, content_list)
-            new_md_content = self.replace_image_paths(pdf_name, md_content)
 
-            return new_md_content
+            return md_content
 
         except Exception as e:
             logger.error(e)
@@ -355,11 +312,7 @@ def load(
         Returns:
             List[Document]: list of documents.
         """
-        if self.enable_mandatory_ocr:
-            parse_method = "ocr"
-        else:
-            parse_method = "auto"
-        md_content = self.parse_pdf(file_path, parse_method)
+        md_content = self.parse_pdf(file_path)
        logger.info(f"[PaiPDFReader] successfully processed pdf file {file_path}.")
        docs = []
        if metadata:

From 8fe3c943ffa56fe0d649c27edf7a435a64a89924 Mon Sep 17 00:00:00 2001
From: ranxia
Date: Tue, 14 Jan 2025 17:31:41 +0800
Subject: [PATCH 2/9] reformat pdf reader

---
 .../integrations/readers/pai_pdf_reader.py    | 56 +++++++++++--------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py
index 3cc6755f..0a0feca1 100644
--- a/src/pai_rag/integrations/readers/pai_pdf_reader.py
+++ b/src/pai_rag/integrations/readers/pai_pdf_reader.py
@@ -10,7 +10,6 @@
 )
 from operator import itemgetter
 import tempfile
-import re
 from PIL import Image
 import os
 from loguru import logger
@@ -21,6 +20,7 @@
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
 from urllib.parse import urlparse
+from collections import defaultdict
 
 DEFAULT_HEADING_DIFF_THRESHOLD = 2
@@ -61,7 +61,7 @@ def is_url(self, url: str) -> bool:
         except ValueError:
             return False
 
-    def create_markdwon(
+    def create_markdown(
         self,
         pdf_name: str,
         pdf_info_dict: list,
@@ -71,33 +71,46 @@ def create_markdwon(
         text_height_min = float("inf")
         text_height_max = 0
         title_list = []
+        # store each title and its index
+        title_dict = defaultdict(list)
+        # running index counter
+        index_count = 0
         for page_info in pdf_info_dict:
             paras_of_layout = page_info.get("para_blocks")
             if not paras_of_layout:
                 continue
-            page_markdown, text_height_min, text_height_max = self.create_page_markdwon(
+            (
+                page_markdown,
+                text_height_min,
+                text_height_max,
+                index_count,
+            ) = self.create_page_markdown(
                 pdf_name,
                 paras_of_layout,
                 img_buket_path,
                 title_list,
+                title_dict,
                 text_height_min,
                 text_height_max,
+                index_count,
             )
             output_content.extend(page_markdown)
-        markdwon_content = "\n\n".join(output_content)
-        markdown_result = self.post_process_multi_level_headings(
-            title_list, markdwon_content, text_height_min, text_height_max
+        self.post_process_multi_level_headings(
+            title_list, output_content, title_dict, text_height_min, text_height_max
         )
+        markdown_result = "\n\n".join(output_content)
         return markdown_result
 
-    def create_page_markdwon(
+    def create_page_markdown(
         self,
         pdf_name,
         paras_of_layout,
         img_buket_path,
         title_list,
+        title_dict,
         text_height_min,
         text_height_max,
+        index_count,
     ):
         page_markdown = []
         for para_block in paras_of_layout:
@@ -110,6 +123,7 @@ def create_page_markdown(
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
             para_text = f"# {merge_para_with_text(para_block)}"
+            title_dict[para_text].append(index_count)
         elif para_type == BlockType.InterlineEquation:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Image:
@@ -166,8 +180,9 @@ def create_page_markdown(
             continue
         else:
             page_markdown.append(para_text.strip() + "  ")
+            index_count += 1
 
-        return page_markdown, text_height_min, text_height_max
+        return page_markdown, text_height_min, text_height_max, index_count
 
     def collect_title_info(
         self, para_block, title_list, text_height_min, text_height_max
@@ -178,15 +193,9 @@ def collect_title_info(
         content_height = y1 - y0
         if para_block["type"] == BlockType.Title:
             title_height = int(content_height)
-            title_text = ""
-            for line in para_block["lines"]:
-                for span in line["spans"]:
-                    if span["type"] == "inline_equation":
-                        span["content"] = " $" + span["content"] + "$ "
-                    title_text += span["content"]
-            title_text = title_text.replace("\\", "\\\\")
+            title_text = merge_para_with_text(para_block)
             title_list.append((title_text, title_height))
-        elif para_block["type"] == "text":
+        elif para_block["type"] == BlockType.Text:
             if content_height < text_height_min:
                 text_height_min = content_height
             if content_height > text_height_max:
@@ -194,7 +203,7 @@ def collect_title_info(
         return text_height_min, text_height_max
 
     def post_process_multi_level_headings(
-        self, title_list, md_content, text_height_min, text_height_max
+        self, title_list, output_content, title_dict, text_height_min, text_height_max
     ):
         logger.info(
             "*****************************start process headings*****************************"
         )
@@ -230,14 +239,14 @@ def post_process_multi_level_headings(
                 title_level = ""
             old_title = "# " + title_text
             new_title = title_level + title_text
-            md_content = re.sub(re.escape(old_title), new_title, md_content)
+            if len(title_dict.get(old_title)) > 0:
+                md_index = title_dict.get(old_title).pop()
+                output_content[md_index] = new_title
 
         logger.info(
             "*****************************process headings ended*****************************"
         )
 
-        return md_content
-
     def parse_pdf(
         self,
         pdf_path: str,
@@ -256,8 +265,9 @@ def parse_pdf(
             temp_file_path = os.path.join(temp_dir, pdf_name)
 
             image_writer = FileBasedDataWriter(temp_file_path)
-            reader1 = FileBasedDataReader("")
-            pdf_bytes = reader1.read(pdf_path)
+            # parent_dir is ""; if pdf_path is a relative path, it will be joined with parent_dir.
+            file_reader = FileBasedDataReader("")
+            pdf_bytes = file_reader.read(pdf_path)
             ds = PymuDocDataset(pdf_bytes)
 
             # choose the parse mode
@@ -273,7 +283,7 @@ def parse_pdf(
 
             content_list = pipe_result._pipe_res["pdf_info"]
 
-            md_content = self.create_markdwon(
+            md_content = self.create_markdown(
                 pdf_name, content_list, temp_file_path
             )
 

From a7f33a0660dc8bbbc9c3eb10ec191b4b134271cd Mon Sep 17 00:00:00 2001
From: ranxia
Date: Tue, 14 Jan 2025 20:24:57 +0800
Subject: [PATCH 3/9] update format

---
 .../integrations/readers/pai_pdf_reader.py    | 48 +++++++++++--------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py
index 0a0feca1..45f2df65 100644
--- a/src/pai_rag/integrations/readers/pai_pdf_reader.py
+++ b/src/pai_rag/integrations/readers/pai_pdf_reader.py
@@ -20,7 +20,6 @@
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
 from urllib.parse import urlparse
-from collections import defaultdict
 
 DEFAULT_HEADING_DIFF_THRESHOLD = 2
@@ -71,8 +70,8 @@ def create_markdown(
         text_height_min = float("inf")
         text_height_max = 0
         title_list = []
-        # store each title and its index
-        title_dict = defaultdict(list)
+        # store the index of each title
+        md_title_index = []
         # running index counter
         index_count = 0
         for page_info in pdf_info_dict:
@@ -89,15 +88,17 @@ def create_markdown(
             paras_of_layout,
             img_buket_path,
             title_list,
-            title_dict,
+            md_title_index,
             text_height_min,
             text_height_max,
             index_count,
         )
         output_content.extend(page_markdown)
-        self.post_process_multi_level_headings(
-            title_list, output_content, title_dict, text_height_min, text_height_max
+        new_title_list = self.post_process_multi_level_headings(
+            title_list, text_height_min, text_height_max
         )
+        for idx, content_idx in enumerate(md_title_index):
+            output_content[content_idx] = new_title_list[idx]
         markdown_result = "\n\n".join(output_content)
         return markdown_result
 
@@ -107,7 +108,7 @@ def create_page_markdown(
         paras_of_layout,
         img_buket_path,
         title_list,
-        title_dict,
+        md_title_index,
         text_height_min,
         text_height_max,
         index_count,
@@ -123,7 +124,7 @@ def create_page_markdown(
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
             para_text = f"# {merge_para_with_text(para_block)}"
-            title_dict[para_text].append(index_count)
+            md_title_index.append(index_count)
         elif para_type == BlockType.InterlineEquation:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Image:
@@ -203,14 +204,18 @@ def collect_title_info(
         return text_height_min, text_height_max
 
     def post_process_multi_level_headings(
-        self, title_list, output_content, title_dict, text_height_min, text_height_max
+        self, title_list, text_height_min, text_height_max
     ):
         logger.info(
             "*****************************start process headings*****************************"
         )
-        sorted_list = sorted(title_list, key=itemgetter(1), reverse=True)
+        indexed_title_list = [
+            (idx, title_text, title_height)
+            for idx, (title_text, title_height) in enumerate(title_list)
+        ]
+        sorted_list = sorted(indexed_title_list, key=itemgetter(2), reverse=True)
         diff_list = [
-            (sorted_list[i][1] - sorted_list[i + 1][1], i)
+            (sorted_list[i][2] - sorted_list[i + 1][2], i)
             for i in range(len(sorted_list) - 1)
         ]
         sorted_diff = sorted(diff_list, key=itemgetter(0), reverse=True)
@@ -221,31 +226,34 @@ def post_process_multi_level_headings(
         if diff >= DEFAULT_HEADING_DIFF_THRESHOLD and len(slice_index) <= 5:
             slice_index.append(index)
         slice_index.sort(reverse=True)
+
+        rank_mapping = {}  # maps each title's idx to its heading rank
         rank = 1
         cur_index = 0
         if len(slice_index) > 0:
             cur_index = slice_index.pop()
-        for index, (title_text, title_height) in enumerate(sorted_list):
+        for index, (idx, title_text, title_height) in enumerate(sorted_list):
             if index > cur_index:
                 rank += 1
                 if len(slice_index) > 0:
                     cur_index = slice_index.pop()
                 else:
                     cur_index = len(sorted_list) - 1
-            title_level = "#" * rank + " "
+            rank_mapping[idx] = rank
+        new_title_list = []
+        for original_idx, (title_text, title_height) in enumerate(title_list):
+            assigned_rank = rank_mapping.get(original_idx, 6)
+            title_level = "#" * assigned_rank + " "
+
+            # heights within the body-text height range are not treated as headings
             if text_height_min <= text_height_max and int(
                 text_height_min
             ) <= title_height <= int(text_height_max):
                 title_level = ""
-            old_title = "# " + title_text
+
             new_title = title_level + title_text
-            if len(title_dict.get(old_title)) > 0:
-                md_index = title_dict.get(old_title).pop()
-                output_content[md_index] = new_title
+            new_title_list.append(new_title)
 
-        logger.info(
-            "*****************************process headings ended*****************************"
-        )
+        return new_title_list
 
     def parse_pdf(
         self,

From c6f1d56bb936fdc96cd452d6f730573a7432b673 Mon Sep 17 00:00:00 2001
From: ranxia
Date: Tue, 14 Jan 2025 20:54:01 +0800
Subject: [PATCH 4/9] update format

---
 src/pai_rag/integrations/readers/pai_pdf_reader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py
index 45f2df65..4bf6563d 100644
--- a/src/pai_rag/integrations/readers/pai_pdf_reader.py
+++ b/src/pai_rag/integrations/readers/pai_pdf_reader.py
@@ -252,6 +252,7 @@ def post_process_multi_level_headings(
 
             new_title = title_level + title_text
             new_title_list.append(new_title)
+            logger.info(f"transform {title_text} to {new_title}")
 
         return new_title_list
 

From d6ebf471ad617305809c34985dd229cbcfcaf3d9 Mon Sep 17 00:00:00 2001
From: ranxia
Date: Tue, 14 Jan 2025 21:07:25 +0800
Subject: [PATCH 5/9] fix test

---
 .../test_post_process_multi_level_headings.py | 65 +++++--------------
 1 file changed, 15 insertions(+), 50 deletions(-)

diff --git a/tests/utils/test_post_process_multi_level_headings.py b/tests/utils/test_post_process_multi_level_headings.py
index de0d7bc2..d1abb82d 100644
--- a/tests/utils/test_post_process_multi_level_headings.py
+++ b/tests/utils/test_post_process_multi_level_headings.py
@@ -1,62 +1,27 @@
-import json
 import pytest
 import os
 
-json_str = r"""{
-    "pdf_info": [
-        {
-            "preproc_blocks": [
-                {
-                    "type": "title",
-                    "bbox": [
-                        59,
-                        67,
-                        196,
-                        89
-                    ],
-                    "lines": [
-                        {
-                            "bbox": [
-                                57,
-                                70,
-                                72,
-                                85
-                            ],
-                            "spans": [
-                                {
-                                    "bbox": [
-                                        57,
-                                        70,
-                                        72,
-                                        85
-                                    ],
-                                    "score": 0.75,
-                                    "content": "\\leftarrow",
-                                    "type": "inline_equation"
-                                }
-                            ]
-                        }
-                    ]
-                }
-            ]
-        }
-    ]
-}"""
-
-md_str = """# $\leftarrow$ """
-
-
 @pytest.mark.skipif(
     os.getenv("SKIP_GPU_TESTS", "false") == "true",
     reason="Need to execute in a CUDA environment.",
 )
 def test_post_process_multi_level_headings():
+    title_list = [
+        ("title_1", 6),
+        ("title_2", 10),
+        ("title_3", 8),
+        ("title_4", 7),
+        ("title_5", 14),
+    ]
     from pai_rag.integrations.readers.pai_pdf_reader import PaiPDFReader
 
     pdf_process = PaiPDFReader()
-    json_content = json.loads(json_str)
-    md_content_escape = pdf_process.post_process_multi_level_headings(
-        json_content, md_str
-    )
-    assert md_content_escape == "# $\leftarrow$ "
+    new_title_list = pdf_process.post_process_multi_level_headings(title_list, 0, 0)
+    assert new_title_list == [
+        "### title_1",
+
"## title_2", + "### title_3", + "### title_4", + "# title_5", + ] From cdb68132ba0377cdf0a55fbe0b178872e74e37bb Mon Sep 17 00:00:00 2001 From: ranxia Date: Wed, 15 Jan 2025 10:25:23 +0800 Subject: [PATCH 6/9] remove empty node --- tests/integrations/test_markdown_mode_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integrations/test_markdown_mode_parser.py b/tests/integrations/test_markdown_mode_parser.py index d726fe0d..52f95e45 100644 --- a/tests/integrations/test_markdown_mode_parser.py +++ b/tests/integrations/test_markdown_mode_parser.py @@ -32,4 +32,4 @@ def test_markdown_parser(): for doc_node in documents: splitted_nodes.extend(md_node_parser.get_nodes_from_documents([doc_node])) - assert len(splitted_nodes) == 11 + assert len(splitted_nodes) == 10 From 33be251633a844050ffa908a421060bbf63b5edc Mon Sep 17 00:00:00 2001 From: ranxia Date: Wed, 15 Jan 2025 11:20:36 +0800 Subject: [PATCH 7/9] update test --- tests/integrations/test_markdown_mode_parser.py | 9 +++++++++ tests/testdata/data/json_data/pai_document.json | 12 ++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 tests/testdata/data/json_data/pai_document.json diff --git a/tests/integrations/test_markdown_mode_parser.py b/tests/integrations/test_markdown_mode_parser.py index 52f95e45..a3ed802a 100644 --- a/tests/integrations/test_markdown_mode_parser.py +++ b/tests/integrations/test_markdown_mode_parser.py @@ -1,6 +1,7 @@ import os import pytest from pathlib import Path +import json BASE_DIR = Path(__file__).parent.parent.parent @@ -32,4 +33,12 @@ def test_markdown_parser(): for doc_node in documents: splitted_nodes.extend(md_node_parser.get_nodes_from_documents([doc_node])) + text_list = [node.text for node in splitted_nodes] + + with open( + "tests/testdata/data/json_data/pai_document.json", "r", encoding="utf-8" + ) as file: + chunk_text = json.load(file) + + assert text_list == chunk_text assert len(splitted_nodes) == 10 diff --git a/tests/testdata/data/json_data/pai_document.json b/tests/testdata/data/json_data/pai_document.json new file mode 100644 index 00000000..910e1f46 --- /dev/null +++ b/tests/testdata/data/json_data/pai_document.json @@ -0,0 +1,12 @@ +[ + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components:This topic describes the FAQ about algorit hm components.What can I do when the format conversion component reports an error? · What can I do when \"blob\" is displayed on the data present ation page of the PAl? · what canI do when the xl3-auto-arima component reports an error? · What can Ido when the Doc2Vec component reports the CallExecutorToParseTaskFail error?What can I do when the format conversion component reports an error?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components:By default, the format conversion component runs 100 workers. Make sure that up to 100 data ent ries are converted at a time.What can I do when \"blob\" is displayed on the data presentation page of the PAl?SymptomOn the canvas of the Machine Learning Studio page, when Iright-clicked a component and selected View Data, some text is displayed as \"blob.\"Solution", + "2.PAl-Studio/Designer FAQ 2.1. 
FAQ about algorithm components:Characters that cannot be transcoded are displayed as \"blob.\" Ignore this error, because nodes in the downst ream can read and process the data.What can I do when the x13-auto-arima component reports an error?Make sure that up to 1,200 training data samples are imported into the x13-auto-arima component.What can I do when the Doc2Vec component reports the CallExecutorToParseTaskFail error?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components:Make sure that the number of data samples imported into the Doc2Vec component is less than $2470000\\times10000.$ The data is calculated in the formula: (Number of documents $^+$ Number of words) x Vector length . Make sure that the number of users who use the Doc2Vec component is less than $42432500\\times7712293\\times300$ . Exceeding the preceding limits causes memory application failures. If your data size is excessively large, you can reduce the number of data samples and then calculate the formula again. Make sure that the imported data is split into words.", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.2. FAQ about model data:This topic describes the FAQ about model data.Why does my experiment generate an empty model? ·How do I download a model generated in my experiment?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.2. FAQ about model data >> How do I upload data?:Why does my experiment generate an empty model?SymptomWhen I right-clicked a model training component in the canvas and chose Model Option $>$ View Model, the out put is empty.Solutioni. Go to the Machine Learning Studio page. In the left-side navigation pane, click Settings.i. Click General. On the General page, select Auto Generate PMML.il. Run the experiment again.How do I download a model generated in my experiment?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.2. FAQ about model data >> How do I upload data?:1.Go to the Machine Learning Studio page. In the left-side navigation pane, click Models.\n\n2.In the model list, right-clickthe model you want to download and then select Save Model.How do I upload data?For more information about how to upload a data file, see Prepare data.", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.3. Preview Oss files in Machine Learning Studio:If you want to preview CsV and JsoN files that are stored in Object Storage Service (Oss) buckets on the Machine Learning Studio platform, you must enable Cross-Origin Resource Sharing (CoRs) for the OsSbuckets.ThistopicdescribeshowtoenableCoRsforOss.LimitsThis feature is supported only by Machine Learning Studio 2.0.Procedure", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.3. Preview Oss files in Machine Learning Studio:2.On the details page of the Oss bucket, choose Access Cont rol $>$ Cross-Origin Resource Sharing (CORS). In the Cross-Origin Resource Sharing (CORS) section, click Conf igure.\n\n3.Click Create Rule. In the Create Rule panel, set the following parameters.", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.3. Preview Oss files in Machine Learning Studio:
ParameterRequiredDescription
SourcesYesThesourcesofcross-regionrequeststhatyouwantto allow.Usethefollowingaddressesforthisparameter: https://*.console.aliyun.com http://*.console.aliyun.com
None" +] \ No newline at end of file From 47a0814b911727413e23411305ab02eca611ddcd Mon Sep 17 00:00:00 2001 From: ranxia Date: Wed, 15 Jan 2025 14:10:03 +0800 Subject: [PATCH 8/9] fix md parser --- .../nodeparsers/pai/pai_markdown_parser.py | 10 +++++----- .../testdata/data/json_data/pai_document.json | 20 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py b/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py index dfcfe7e3..5c5793f4 100644 --- a/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py +++ b/src/pai_rag/integrations/nodeparsers/pai/pai_markdown_parser.py @@ -65,7 +65,7 @@ def _cut(self, raw_section: str) -> Iterator[str]: return self.base_parser.split_text(raw_section) def _format_section_header(self, section_headers) -> str: - return " >> ".join([h.content for h in section_headers]) + return "\n".join([h.content for h in section_headers]) def _format_tree_nodes( self, node, doc_node, ref_doc, nodes_list, chunk_images_list @@ -92,7 +92,7 @@ def _format_tree_nodes( return "" if not node.children: return node.content - return node.content + "\n\n".join( + return node.content + "\n".join( [ self._format_tree_nodes( child, doc_node, ref_doc, nodes_list, chunk_images_list @@ -202,7 +202,7 @@ def traverse_tree( for chunk_text in self._cut(tree_node.content): if title_stack: new_chunk_text = ( - f"{self._format_section_header(title_stack)}:{chunk_text}" + f"{self._format_section_header(title_stack)} : {chunk_text}" ) else: new_chunk_text = chunk_text @@ -251,12 +251,12 @@ def traverse_tree( image_info = ImageInfo(image_url=child.content) chunk_images_list.append(json.dumps(image_info.__dict__)) else: - chunk_text += self._format_tree_nodes( + chunk_text += "\n" + self._format_tree_nodes( child, doc_node, ref_doc, nodes_list, chunk_images_list ) if title_stack: new_chunk_text = ( - f"{self._format_section_header(title_stack)}:{chunk_text}" + f"{self._format_section_header(title_stack)} : {chunk_text}" ) else: new_chunk_text = chunk_text diff --git a/tests/testdata/data/json_data/pai_document.json b/tests/testdata/data/json_data/pai_document.json index 910e1f46..a1e539b1 100644 --- a/tests/testdata/data/json_data/pai_document.json +++ b/tests/testdata/data/json_data/pai_document.json @@ -1,12 +1,12 @@ [ - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components:This topic describes the FAQ about algorit hm components.What can I do when the format conversion component reports an error? · What can I do when \"blob\" is displayed on the data present ation page of the PAl? · what canI do when the xl3-auto-arima component reports an error? · What can Ido when the Doc2Vec component reports the CallExecutorToParseTaskFail error?What can I do when the format conversion component reports an error?", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components:By default, the format conversion component runs 100 workers. Make sure that up to 100 data ent ries are converted at a time.What can I do when \"blob\" is displayed on the data presentation page of the PAl?SymptomOn the canvas of the Machine Learning Studio page, when Iright-clicked a component and selected View Data, some text is displayed as \"blob.\"Solution", - "2.PAl-Studio/Designer FAQ 2.1. 
FAQ about algorithm components:Characters that cannot be transcoded are displayed as \"blob.\" Ignore this error, because nodes in the downst ream can read and process the data.What can I do when the x13-auto-arima component reports an error?Make sure that up to 1,200 training data samples are imported into the x13-auto-arima component.What can I do when the Doc2Vec component reports the CallExecutorToParseTaskFail error?", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components:Make sure that the number of data samples imported into the Doc2Vec component is less than $2470000\\times10000.$ The data is calculated in the formula: (Number of documents $^+$ Number of words) x Vector length . Make sure that the number of users who use the Doc2Vec component is less than $42432500\\times7712293\\times300$ . Exceeding the preceding limits causes memory application failures. If your data size is excessively large, you can reduce the number of data samples and then calculate the formula again. Make sure that the imported data is split into words.", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.2. FAQ about model data:This topic describes the FAQ about model data.Why does my experiment generate an empty model? ·How do I download a model generated in my experiment?", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.2. FAQ about model data >> How do I upload data?:Why does my experiment generate an empty model?SymptomWhen I right-clicked a model training component in the canvas and chose Model Option $>$ View Model, the out put is empty.Solutioni. Go to the Machine Learning Studio page. In the left-side navigation pane, click Settings.i. Click General. On the General page, select Auto Generate PMML.il. Run the experiment again.How do I download a model generated in my experiment?", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.2. FAQ about model data >> How do I upload data?:1.Go to the Machine Learning Studio page. In the left-side navigation pane, click Models.\n\n2.In the model list, right-clickthe model you want to download and then select Save Model.How do I upload data?For more information about how to upload a data file, see Prepare data.", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.3. Preview Oss files in Machine Learning Studio:If you want to preview CsV and JsoN files that are stored in Object Storage Service (Oss) buckets on the Machine Learning Studio platform, you must enable Cross-Origin Resource Sharing (CoRs) for the OsSbuckets.ThistopicdescribeshowtoenableCoRsforOss.LimitsThis feature is supported only by Machine Learning Studio 2.0.Procedure", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.3. Preview Oss files in Machine Learning Studio:2.On the details page of the Oss bucket, choose Access Cont rol $>$ Cross-Origin Resource Sharing (CORS). In the Cross-Origin Resource Sharing (CORS) section, click Conf igure.\n\n3.Click Create Rule. In the Create Rule panel, set the following parameters.", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components >> 2.3. Preview Oss files in Machine Learning Studio:
ParameterRequiredDescription
SourcesYesThesourcesofcross-regionrequeststhatyouwantto allow.Usethefollowingaddressesforthisparameter: https://*.console.aliyun.com http://*.console.aliyun.com
None" + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components : \nThis topic describes the FAQ about algorit hm components.\nWhat can I do when the format conversion component reports an error? · What can I do when \"blob\" is displayed on the data present ation page of the PAl? · what canI do when the xl3-auto-arima component reports an error? · What can Ido when the Doc2Vec component reports the CallExecutorToParseTaskFail error?\nWhat can I do when the format conversion component reports an error?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components : \nBy default, the format conversion component runs 100 workers. Make sure that up to 100 data ent ries are converted at a time.\nWhat can I do when \"blob\" is displayed on the data presentation page of the PAl?\nSymptom\nOn the canvas of the Machine Learning Studio page, when Iright-clicked a component and selected View Data, some text is displayed as \"blob.\"\nSolution", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components : \nCharacters that cannot be transcoded are displayed as \"blob.\" Ignore this error, because nodes in the downst ream can read and process the data.\nWhat can I do when the x13-auto-arima component reports an error?\nMake sure that up to 1,200 training data samples are imported into the x13-auto-arima component.\nWhat can I do when the Doc2Vec component reports the CallExecutorToParseTaskFail error?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components : Make sure that the number of data samples imported into the Doc2Vec component is less than $2470000\\times10000.$ The data is calculated in the formula: (Number of documents $^+$ Number of words) x Vector length . Make sure that the number of users who use the Doc2Vec component is less than $42432500\\times7712293\\times300$ . Exceeding the preceding limits causes memory application failures. If your data size is excessively large, you can reduce the number of data samples and then calculate the formula again. Make sure that the imported data is split into words.", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.2. FAQ about model data : \nThis topic describes the FAQ about model data.\nWhy does my experiment generate an empty model? ·How do I download a model generated in my experiment?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.2. FAQ about model data\nHow do I upload data? : \nWhy does my experiment generate an empty model?\nSymptom\nWhen I right-clicked a model training component in the canvas and chose Model Option $>$ View Model, the out put is empty.\nSolution\ni. Go to the Machine Learning Studio page. In the left-side navigation pane, click Settings.i. Click General. On the General page, select Auto Generate PMML.il. Run the experiment again.\nHow do I download a model generated in my experiment?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.2. FAQ about model data\nHow do I upload data? : \n1.Go to the Machine Learning Studio page. In the left-side navigation pane, click Models.\n2.In the model list, right-clickthe model you want to download and then select Save Model.\nHow do I upload data?\nFor more information about how to upload a data file, see Prepare data.", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.3. 
Preview OSS files in Machine Learning Studio : \nIf you want to preview CSV and JSON files that are stored in Object Storage Service (OSS) buckets on the Machine Learning Studio platform, you must enable Cross-Origin Resource Sharing (CORS) for the OSS buckets. This topic describes how to enable CORS for OSS.\nLimits\nThis feature is supported only by Machine Learning Studio 2.0.\nProcedure", "2.PAI-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.3. Preview OSS files in Machine Learning Studio : \n2. On the details page of the OSS bucket, choose Access Control $>$ Cross-Origin Resource Sharing (CORS). In the Cross-Origin Resource Sharing (CORS) section, click Configure.\n3. Click Create Rule. In the Create Rule panel, set the following parameters.", "2.PAI-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.3. Preview OSS files in Machine Learning Studio : \n
Parameter | Required | Description
Sources | Yes | The sources of cross-region requests that you want to allow. Use the following addresses for this parameter: https://*.console.aliyun.com http://*.console.aliyun.com
\nNone" ] \ No newline at end of file From 3ee631d21fe826524b67f0484a73ab3d4483479b Mon Sep 17 00:00:00 2001 From: ranxia Date: Wed, 15 Jan 2025 15:28:38 +0800 Subject: [PATCH 9/9] fix test --- .../integrations/test_markdown_mode_parser.py | 2 +- .../testdata/data/json_data/pai_document.json | 4 +- tests/testdata/data/md_data/pai_document.md | 72 +++++++++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 tests/testdata/data/md_data/pai_document.md diff --git a/tests/integrations/test_markdown_mode_parser.py b/tests/integrations/test_markdown_mode_parser.py index a3ed802a..79e0d69b 100644 --- a/tests/integrations/test_markdown_mode_parser.py +++ b/tests/integrations/test_markdown_mode_parser.py @@ -25,7 +25,7 @@ def test_markdown_parser(): cls=PaiDataReader, reader_config=config.data_reader, ) - input_dir = "tests/testdata/data/pdf_data" + input_dir = "tests/testdata/data/md_data" ModelScopeDownloader().load_rag_models() documents = directory_reader.load_data(file_path_or_directory=input_dir) md_node_parser = MarkdownNodeParser(enable_multimodal=False) diff --git a/tests/testdata/data/json_data/pai_document.json b/tests/testdata/data/json_data/pai_document.json index a1e539b1..f66daa8e 100644 --- a/tests/testdata/data/json_data/pai_document.json +++ b/tests/testdata/data/json_data/pai_document.json @@ -4,9 +4,9 @@ "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components : \nCharacters that cannot be transcoded are displayed as \"blob.\" Ignore this error, because nodes in the downst ream can read and process the data.\nWhat can I do when the x13-auto-arima component reports an error?\nMake sure that up to 1,200 training data samples are imported into the x13-auto-arima component.\nWhat can I do when the Doc2Vec component reports the CallExecutorToParseTaskFail error?", "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components : Make sure that the number of data samples imported into the Doc2Vec component is less than $2470000\\times10000.$ The data is calculated in the formula: (Number of documents $^+$ Number of words) x Vector length . Make sure that the number of users who use the Doc2Vec component is less than $42432500\\times7712293\\times300$ . Exceeding the preceding limits causes memory application failures. If your data size is excessively large, you can reduce the number of data samples and then calculate the formula again. Make sure that the imported data is split into words.", "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.2. FAQ about model data : \nThis topic describes the FAQ about model data.\nWhy does my experiment generate an empty model? ·How do I download a model generated in my experiment?", - "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.2. FAQ about model data\nHow do I upload data? : \nWhy does my experiment generate an empty model?\nSymptom\nWhen I right-clicked a model training component in the canvas and chose Model Option $>$ View Model, the out put is empty.\nSolution\ni. Go to the Machine Learning Studio page. In the left-side navigation pane, click Settings.i. Click General. On the General page, select Auto Generate PMML.il. Run the experiment again.\nHow do I download a model generated in my experiment?", + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.2. FAQ about model data\nHow do I upload data? 
: \nWhy does my experiment generate an empty model?\nSymptom\nWhen I right-clicked a model training component in the canvas and chose Model Option $>$ View Model, the output is empty.\nSolution\ni. Go to the Machine Learning Studio page. In the left-side navigation pane, click Settings.\nii. Click General. On the General page, select Auto Generate PMML.\niii. Run the experiment again.\nHow do I download a model generated in my experiment?",
     "2.PAI-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.2. FAQ about model data\nHow do I upload data? : \n1. Go to the Machine Learning Studio page. In the left-side navigation pane, click Models.\n2. In the model list, right-click the model you want to download and then select Save Model.\nHow do I upload data?\nFor more information about how to upload a data file, see Prepare data.",
     "2.PAI-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.3. Preview OSS files in Machine Learning Studio : \nIf you want to preview CSV and JSON files that are stored in Object Storage Service (OSS) buckets on the Machine Learning Studio platform, you must enable Cross-Origin Resource Sharing (CORS) for the OSS buckets. This topic describes how to enable CORS for OSS.\nLimits\nThis feature is supported only by Machine Learning Studio 2.0.\nProcedure",
     "2.PAI-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.3. Preview OSS files in Machine Learning Studio : \n2. On the details page of the OSS bucket, choose Access Control $>$ Cross-Origin Resource Sharing (CORS). In the Cross-Origin Resource Sharing (CORS) section, click Configure.\n3. Click Create Rule. In the Create Rule panel, set the following parameters.",
-    "2.PAI-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.3. Preview OSS files in Machine Learning Studio : \n
Parameter | Required | Description
Sources | Yes | The sources of cross-region requests that you want to allow. Use the following addresses for this parameter: https://*.console.aliyun.com http://*.console.aliyun.com
\nNone" + "2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components\n2.3. Preview Oss files in Machine Learning Studio : \n
Parameter | Required | Description
Sources | Yes | The sources of cross-region requests that you want to allow. Use the following addresses for this parameter: https://*.console.aliyun.com http://*.console.aliyun.com
" ] \ No newline at end of file diff --git a/tests/testdata/data/md_data/pai_document.md b/tests/testdata/data/md_data/pai_document.md new file mode 100644 index 00000000..0250cdf5 --- /dev/null +++ b/tests/testdata/data/md_data/pai_document.md @@ -0,0 +1,72 @@ +# 2.PAl-Studio/Designer FAQ 2.1. FAQ about algorithm components + +This topic describes the FAQ about algorit hm components. + +What can I do when the format conversion component reports an error? · What can I do when "blob" is displayed on the data present ation page of the PAl? · what canI do when the xl3-auto-arima component reports an error? · What can Ido when the Doc2Vec component reports the CallExecutorToParseTaskFail error? + +What can I do when the format conversion component reports an error? + +By default, the format conversion component runs 100 workers. Make sure that up to 100 data ent ries are converted at a time. + +What can I do when "blob" is displayed on the data presentation page of the PAl? + +Symptom + +On the canvas of the Machine Learning Studio page, when Iright-clicked a component and selected View Data, some text is displayed as "blob." + +Solution + +Characters that cannot be transcoded are displayed as "blob." Ignore this error, because nodes in the downst ream can read and process the data. + +What can I do when the x13-auto-arima component reports an error? + +Make sure that up to 1,200 training data samples are imported into the x13-auto-arima component. + +What can I do when the Doc2Vec component reports the CallExecutorToParseTaskFail error? + +Make sure that the number of data samples imported into the Doc2Vec component is less than $247\,0000\times10000.$ The data is calculated in the formula: (Number of documents $^+$ Number of words) x Vector length . Make sure that the number of users who use the Doc2Vec component is less than $42432500\times7712293\times300$ . Exceeding the preceding limits causes memory application failures. If your data size is excessively large, you can reduce the number of data samples and then calculate the formula again. Make sure that the imported data is split into words. + +## 2.2. FAQ about model data + +This topic describes the FAQ about model data. + +Why does my experiment generate an empty model? ·How do I download a model generated in my experiment? + +##### How do I upload data? + +Why does my experiment generate an empty model? + +Symptom + +When I right-clicked a model training component in the canvas and chose Model Option $>$ View Model, the out put is empty. + +Solution + +i. Go to the Machine Learning Studio page. In the left-side navigation pane, click Settings. +i. Click General. On the General page, select Auto Generate PMML. +il. Run the experiment again. + +How do I download a model generated in my experiment? + +1. Go to the Machine Learning Studio page. In the left-side navigation pane, click Models. +2. In the model list, right-clickthe model you want to download and then select Save Model. + +How do I upload data? + +For more information about how to upload a data file, see Prepare data. + +## 2.3. Preview Oss files in Machine Learning Studio + +If you want to preview CsV and JsoN files that are stored in Object Storage Service (Oss) buckets on the Machine Learning Studio platform, you must enable Cross-Origin Resource Sharing (CoRs) for the OsSbuckets.ThistopicdescribeshowtoenableCoRsforOss. + +Limits + +This feature is supported only by Machine Learning Studio 2.0. + +Procedure + +2. 
On the details page of the Oss bucket, choose Access Cont rol $>$ Cross-Origin Resource Sharing (CORS). In the Cross-Origin Resource Sharing (CORS) section, click Conf igure. + +3. Click Create Rule. In the Create Rule panel, set the following parameters. + +
+| Parameter | Required | Description |
+| --- | --- | --- |
+| Sources | Yes | The sources of cross-region requests that you want to allow. Use the following addresses for this parameter: https://*.console.aliyun.com http://*.console.aliyun.com |
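+
+For reference, the same rule can also be applied programmatically. The snippet below is a minimal sketch using the oss2 Python SDK; the endpoint, bucket name, and credentials are placeholders, and the allowed methods are an assumption, since the console rule above only specifies the sources.
+
+```python
+import oss2
+
+# Placeholders: substitute your own credentials, region endpoint, and bucket name.
+auth = oss2.Auth("<access_key_id>", "<access_key_secret>")
+bucket = oss2.Bucket(auth, "https://oss-cn-hangzhou.aliyuncs.com", "<bucket-name>")
+
+# Allow the Machine Learning Studio console origins to read objects cross-origin.
+rule = oss2.models.CorsRule(
+    allowed_origins=["https://*.console.aliyun.com", "http://*.console.aliyun.com"],
+    allowed_methods=["GET", "HEAD"],  # assumption: read-only preview access
+    allowed_headers=["*"],
+)
+
+# Note: put_bucket_cors replaces the bucket's entire CORS configuration,
+# so include any existing rules you want to keep in the list.
+bucket.put_bucket_cors(oss2.models.BucketCors([rule]))
+```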