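Address comments: add cell coordinates to ExcelReader table output

- Add optional cell coordinates to the ExcelReader Markdown/JSON table output
- Escape pipe characters in Markdown table cells as "\|" instead of "||"
- Move _get_media_type_from_data into the shared _reader/_utils.py module
- Use MD5 instead of SHA-256 for the Word reader document ID
- Remove leftover debug prints from the reader tests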

qbc2016 · qbc2016 · commit 40d982aea3c5 · 2026-01-19T14:29:00.000+08:00
diff --git a/src/agentscope/rag/_reader/_excel_reader.py b/src/agentscope/rag/_reader/_excel_reader.py
@@ -8,11 +8,33 @@
 
 from ._reader_base import ReaderBase
 from ._text_reader import TextReader
+from ._utils import _get_media_type_from_data
 from .._document import Document, DocMetadata
 from ...message import ImageBlock, Base64Source, TextBlock
 from ..._logging import logger
 
 
+def _get_excel_column_name(col_index: int) -> str:
+    """Convert a 0-based column index to Excel column name (A, B, ..., Z, AA,
+    AB, ...).
+
+    Args:
+        col_index (`int`):
+            The 0-based column index.
+
+    Returns:
+        `str`:
+            The Excel column name (e.g., 'A' for 0, 'B' for 1, 'AA' for 26).
+    """
+    result = ""
+    col_index += 1  # Convert to 1-based
+    while col_index > 0:
+        col_index -= 1
+        result = chr(ord("A") + col_index % 26) + result
+        col_index //= 26
+    return result
+
+
 def _extract_table_data(df: Any) -> list[list[str]]:
     """Extract table data from a DataFrame, handling NaN values.
 
@@ -95,45 +117,46 @@ def _extract_images_from_worksheet(
     return images
 
 
-def _get_media_type_from_data(data: bytes) -> str:
-    """Determine media type from image data.
+class ExcelReader(ReaderBase):
+    """The Excel reader that supports reading text, image, and table
+    content from Excel files (.xlsx, .xls files), and chunking the text
+    content into smaller pieces.
 
-    Args:
-        data (`bytes`):
-            The raw image data.
+    .. note:: The table content can be extracted in Markdown or JSON format.
 
-    Returns:
-        `str`:
-            The MIME type of the image (e.g., "image/png", "image/jpeg").
-    """
-    # Image signature mapping
-    signatures = {
-        b"\x89PNG\r\n\x1a\n": "image/png",
-        b"\xff\xd8": "image/jpeg",
-        b"GIF87a": "image/gif",
-        b"GIF89a": "image/gif",
-        b"BM": "image/bmp",
-    }
+        **Markdown format example** (``include_cell_coordinates=False``):
 
-    # Check signatures
-    for signature, media_type in signatures.items():
-        if data.startswith(signature):
-            return media_type
+        .. code-block:: text
 
-    # Check WebP (RIFF at start + WEBP at offset 8)
-    if len(data) > 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
-        return "image/webp"
+            | Name  | Age | City     |
+            |-------|-----|----------|
+            | Alice | 25  | New York |
+            | Bob   | 30  | London   |
 
-    # Default to JPEG
-    return "image/jpeg"
+        **Markdown format example** (``include_cell_coordinates=True``):
 
+        .. code-block:: text
 
-class ExcelReader(ReaderBase):
-    """The Excel reader that supports reading text, image, and table
-    content from Excel files (.xlsx, .xls files), and chunking the text
-    content into smaller pieces.
+            | [A1] Name  | [B1] Age | [C1] City     |
+            |------------|----------|---------------|
+            | [A2] Alice | [B2] 25  | [C2] New York |
+            | [A3] Bob   | [B3] 30  | [C3] London   |
 
-    .. note:: The table content can be extracted in Markdown or JSON format.
+        **JSON format example** (``include_cell_coordinates=False``):
+
+        .. code-block:: json
+
+            ["Name", "Age", "City"]
+            ["Alice", "25", "New York"]
+            ["Bob", "30", "London"]
+
+        **JSON format example** (``include_cell_coordinates=True``):
+
+        .. code-block:: json
+
+            {"A1": "Name", "B1": "Age", "C1": "City"}
+            {"A2": "Alice", "B2": "25", "C2": "New York"}
+            {"A3": "Bob", "B3": "30", "C3": "London"}
     """
 
     def __init__(
@@ -558,10 +581,21 @@ def _table_to_markdown(
         # structure
         def escape_pipes(cell_text: str) -> str:
             """Escape pipe characters in cell content."""
-            return cell_text.replace("|", "||")
+            return cell_text.replace("|", "\\|")
+
+        def format_cell(cell: str, row_idx: int, col_idx: int) -> str:
+            """Format cell content with optional coordinates."""
+            escaped = escape_pipes(cell)
+            if self.include_cell_coordinates:
+                coord = f"{_get_excel_column_name(col_idx)}{row_idx + 1}"
+                return f"[{coord}] {escaped}"
+            return escaped
 
         # Header row (first row)
-        escaped_header = [escape_pipes(cell) for cell in table_data[0]]
+        escaped_header = [
+            format_cell(cell, 0, col_idx)
+            for col_idx, cell in enumerate(table_data[0])
+        ]
         header_row = "| " + " | ".join(escaped_header) + " |\n"
         md_table += header_row
 
@@ -570,13 +604,16 @@ def escape_pipes(cell_text: str) -> str:
         md_table += separator_row
 
         # Data rows
-        for row in table_data[1:]:
+        for row_idx, row in enumerate(table_data[1:], start=1):
             # Ensure row has same number of columns as header
             while len(row) < num_cols:
                 row.append("")
-            # Escape pipe characters in each cell
-            escaped_row = [escape_pipes(cell) for cell in row[:num_cols]]
-            data_row = "| " + " | ".join(escaped_row) + " |\n"
+            # Format each cell with optional coordinates
+            formatted_row = [
+                format_cell(cell, row_idx, col_idx)
+                for col_idx, cell in enumerate(row[:num_cols])
+            ]
+            data_row = "| " + " | ".join(formatted_row) + " |\n"
             md_table += data_row
 
         return md_table
@@ -609,10 +646,16 @@ def _table_to_json(
             "<system-info>A table loaded as a JSON array:</system-info>",
         )
 
-        for row in table_data:
-            json_strs.append(
-                json.dumps(row, ensure_ascii=False),
-            )
+        for row_idx, row in enumerate(table_data):
+            if self.include_cell_coordinates:
+                # Include cell coordinates in the format {"A1": "value", ...}
+                row_dict = {
+                    f"{_get_excel_column_name(col_idx)}{row_idx + 1}": cell
+                    for col_idx, cell in enumerate(row)
+                }
+                json_strs.append(json.dumps(row_dict, ensure_ascii=False))
+            else:
+                json_strs.append(json.dumps(row, ensure_ascii=False))
 
         return "\n".join(json_strs)
 
diff --git a/src/agentscope/rag/_reader/_utils.py b/src/agentscope/rag/_reader/_utils.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+"""The utility functions shared by the readers, e.g. image type detection."""
+
+
+def _get_media_type_from_data(data: bytes) -> str:
+    """Determine media type from image data.
+
+    Args:
+        data (`bytes`):
+            The raw image data.
+
+    Returns:
+        `str`:
+            The MIME type of the image (e.g., "image/png", "image/jpeg").
+    """
+    # Image signature mapping
+    signatures = {
+        b"\x89PNG\r\n\x1a\n": "image/png",
+        b"\xff\xd8": "image/jpeg",
+        b"GIF87a": "image/gif",
+        b"GIF89a": "image/gif",
+        b"BM": "image/bmp",
+    }
+
+    # Check signatures
+    for signature, media_type in signatures.items():
+        if data.startswith(signature):
+            return media_type
+
+    # Check WebP (RIFF at start + WEBP at offset 8)
+    if len(data) > 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
+        return "image/webp"
+
+    # Default to JPEG
+    return "image/jpeg"
diff --git a/src/agentscope/rag/_reader/_word_reader.py b/src/agentscope/rag/_reader/_word_reader.py
@@ -505,4 +505,4 @@ def get_doc_id(self, word_path: str) -> str:
             `str`:
                 The generated document ID.
         """
-        return hashlib.sha256(word_path.encode("utf-8")).hexdigest()
+        return hashlib.md5(word_path.encode("utf-8")).hexdigest()
diff --git a/tests/rag_reader_test.py b/tests/rag_reader_test.py
@@ -115,16 +115,6 @@ async def test_word_reader_with_images_and_tables(self) -> None:
             ["text"] * 4 + ["image"] * 2 + ["text", "image", "text", "text"],
         )
 
-        import json
-
-        print(
-            json.dumps(
-                [_.metadata.content.get("text") for _ in docs],
-                indent=4,
-                ensure_ascii=False,
-            ),
-        )
-
         self.assertEqual(
             [_.metadata.content.get("text") for _ in docs],
             [
@@ -164,6 +154,7 @@ async def test_excel_reader_with_images_and_tables(self) -> None:
             split_by="sentence",
             include_image=True,
             separate_table=True,
+            include_cell_coordinates=True,
             table_format="markdown",
         )
         excel_path = os.path.join(
@@ -177,71 +168,74 @@ async def test_excel_reader_with_images_and_tables(self) -> None:
         # then table blocks from second sheet
         # Order is based on row positions: table (row 0-5) → image (row 9)
         # → table (row 0-4)
+        # Note: with include_cell_coordinates=True, cell coordinates are added
+        # to each cell (e.g., [A1], [B1], etc.), which increases text length
+        # and results in more chunks
         self.assertListEqual(
             [_.metadata.content["type"] for _ in docs],
-            ["text"] * 2 + ["image"] * 1 + ["text"] * 5,
-        )
-
-        import json
-
-        print(
-            json.dumps(
-                [_.metadata.content.get("text") for _ in docs],
-                indent=4,
-                ensure_ascii=False,
-            ),
+            ["text"] * 3 + ["image"] * 1 + ["text"] * 5,
         )
 
         # Verify exact document content
         doc_texts = [_.metadata.content.get("text") for _ in docs]
 
-        # Verify sheet headers and table content
+        # Verify sheet headers and table content with cell coordinates
         # First text block should contain Employee Info sheet header and table
-        # Note: Due to chunk_size=200, the last row is truncated
+        # Note: Due to chunk_size=200, rows are split across chunk boundaries
         # Order: table (row 0-5) → image (row 9) → table (row 0-4)
         self.assertEqual(
             doc_texts[0],
             "Sheet: Employee Info\n"
-            "| John Smith | 25 | Engineering | 8000 | 2020-01-15 |\n"
+            "| [A1] John Smith | [B1] 25 | [C1] Engineering | "
+            "[D1] 8000 | [E1] 2020-01-15 |\n"
             "| --- | --- | --- | --- | --- |\n"
-            "| Jane Doe | 30 | Sales | 12000 | 2019-03-20 |\n"
-            "| Mike || Johnson | 35 | HR | 9000 | 2021-06-1",
+            "| [A2] Jane Doe | [B2] 30 | [C2] Sales | "
+            "[D2] 12000 | [E2] 2019-03-2",
         )
         # Second text block continues the employee table
-        # Note: Starts with "0 |" from the truncated previous row
         self.assertEqual(
             doc_texts[1],
             "0 |\n"
-            "| Sarah Wilson | 28 | Finance | 10000 | 2020-09-05 |\n"
-            "| David Brown | 32 | Marketing | 11000 | 2018-12-01 |",
+            "| [A3] Mike \\| Johnson | [B3] 35 | [C3] HR | "
+            "[D3] 9000 | [E3] 2021-06-10 |\n"
+            "| [A4] Sarah Wilson | [B4] 28 | [C4] Finance | "
+            "[D4] 10000 | [E4] 2020-09-05 |\n"
+            "| [A5] David Brown | [B5] 32 | [C5] Marketi",
+        )
+        # Third text block continues the employee table
+        self.assertEqual(
+            doc_texts[2],
+            "ng | [D5] 11000 | [E5] 2018-12-01 |",
         )
         # Image block (text is None)
-        self.assertIsNone(doc_texts[2])
-        # Third text block should contain Product Info sheet header and
+        self.assertIsNone(doc_texts[3])
+        # Fourth text block should contain Product Info sheet header and
         # start of table
         self.assertEqual(
-            doc_texts[3],
+            doc_texts[4],
             "Sheet: Product Info\n"
-            "| Product A | 100 | 50 | High-quality Product A, suitable for "
-            "various scenarios.",
+            "| [A1] Product A | [B1] 100 | [C1] 50 | "
+            "[D1] High-quality Product A, suitable for various scenarios.",
         )
         # Remaining blocks continue the product table
         self.assertEqual(
-            doc_texts[4],
+            doc_texts[5],
             "|\n"
             "| --- | --- | --- | --- |\n"
-            "| Product B | 200 | 30 | Product B offers excellent performance.",
+            "| [A2] Product B | [B2] 200 | [C2] 30 | "
+            "[D2] Product B offers excellent performance.",
         )
         self.assertEqual(
-            doc_texts[5],
+            doc_texts[6],
             "|\n"
-            "| Product C | 300 | 20 | Product C is a market-leading solution.",
+            "| [A3] Product C | [B3] 300 | [C3] 20 | "
+            "[D3] Product C is a market-leading solution.",
         )
         self.assertEqual(
-            doc_texts[6],
+            doc_texts[7],
             "|\n"
-            "| Product D | 400 | 40 | Product D provides comprehensive "
-            "functionality.",
+            "| [A4] Product D | [B4] 400 | [C4] 40 | "
+            "[D4] Product D provides comprehensive functionality.",
         )
 
         # Verify image media types