Skip to content

Commit 40d982a

Browse files
committed
modified according to comments
1 parent eb155bb commit 40d982a

File tree

4 files changed

+155
-83
lines changed

4 files changed

+155
-83
lines changed

src/agentscope/rag/_reader/_excel_reader.py

Lines changed: 84 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,33 @@
88

99
from ._reader_base import ReaderBase
1010
from ._text_reader import TextReader
11+
from ._utils import _get_media_type_from_data
1112
from .._document import Document, DocMetadata
1213
from ...message import ImageBlock, Base64Source, TextBlock
1314
from ..._logging import logger
1415

1516

17+
def _get_excel_column_name(col_index: int) -> str:
18+
"""Convert a 0-based column index to Excel column name (A, B, ..., Z, AA,
19+
AB, ...).
20+
21+
Args:
22+
col_index (`int`):
23+
The 0-based column index.
24+
25+
Returns:
26+
`str`:
27+
The Excel column name (e.g., 'A' for 0, 'B' for 1, 'AA' for 26).
28+
"""
29+
result = ""
30+
col_index += 1 # Convert to 1-based
31+
while col_index > 0:
32+
col_index -= 1
33+
result = chr(ord("A") + col_index % 26) + result
34+
col_index //= 26
35+
return result
36+
37+
1638
def _extract_table_data(df: Any) -> list[list[str]]:
1739
"""Extract table data from a DataFrame, handling NaN values.
1840
@@ -95,45 +117,46 @@ def _extract_images_from_worksheet(
95117
return images
96118

97119

98-
def _get_media_type_from_data(data: bytes) -> str:
99-
"""Determine media type from image data.
120+
class ExcelReader(ReaderBase):
121+
"""The Excel reader that supports reading text, image, and table
122+
content from Excel files (.xlsx, .xls files), and chunking the text
123+
content into smaller pieces.
100124
101-
Args:
102-
data (`bytes`):
103-
The raw image data.
125+
.. note:: The table content can be extracted in Markdown or JSON format.
104126
105-
Returns:
106-
`str`:
107-
The MIME type of the image (e.g., "image/png", "image/jpeg").
108-
"""
109-
# Image signature mapping
110-
signatures = {
111-
b"\x89PNG\r\n\x1a\n": "image/png",
112-
b"\xff\xd8": "image/jpeg",
113-
b"GIF87a": "image/gif",
114-
b"GIF89a": "image/gif",
115-
b"BM": "image/bmp",
116-
}
127+
**Markdown format example** (``include_cell_coordinates=False``):
117128
118-
# Check signatures
119-
for signature, media_type in signatures.items():
120-
if data.startswith(signature):
121-
return media_type
129+
.. code-block:: text
122130
123-
# Check WebP (RIFF at start + WEBP at offset 8)
124-
if len(data) > 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
125-
return "image/webp"
131+
| Name | Age | City |
132+
|-------|-----|----------|
133+
| Alice | 25 | New York |
134+
| Bob | 30 | London |
126135
127-
# Default to JPEG
128-
return "image/jpeg"
136+
**Markdown format example** (``include_cell_coordinates=True``):
129137
138+
.. code-block:: text
130139
131-
class ExcelReader(ReaderBase):
132-
"""The Excel reader that supports reading text, image, and table
133-
content from Excel files (.xlsx, .xls files), and chunking the text
134-
content into smaller pieces.
140+
| [A1] Name | [B1] Age | [C1] City |
141+
|------------|----------|---------------|
142+
| [A2] Alice | [B2] 25 | [C2] New York |
143+
| [A3] Bob | [B3] 30 | [C3] London |
135144
136-
.. note:: The table content can be extracted in Markdown or JSON format.
145+
**JSON format example** (``include_cell_coordinates=False``):
146+
147+
.. code-block:: json
148+
149+
["Name", "Age", "City"]
150+
["Alice", "25", "New York"]
151+
["Bob", "30", "London"]
152+
153+
**JSON format example** (``include_cell_coordinates=True``):
154+
155+
.. code-block:: json
156+
157+
{"A1": "Name", "B1": "Age", "C1": "City"}
158+
{"A2": "Alice", "B2": "25", "C2": "New York"}
159+
{"A3": "Bob", "B3": "30", "C3": "London"}
137160
"""
138161

139162
def __init__(
@@ -558,10 +581,21 @@ def _table_to_markdown(
558581
# structure
559582
def escape_pipes(cell_text: str) -> str:
560583
"""Escape pipe characters in cell content."""
561-
return cell_text.replace("|", "||")
584+
return cell_text.replace("|", "\\|")
585+
586+
def format_cell(cell: str, row_idx: int, col_idx: int) -> str:
587+
"""Format cell content with optional coordinates."""
588+
escaped = escape_pipes(cell)
589+
if self.include_cell_coordinates:
590+
coord = f"{_get_excel_column_name(col_idx)}{row_idx + 1}"
591+
return f"[{coord}] {escaped}"
592+
return escaped
562593

563594
# Header row (first row)
564-
escaped_header = [escape_pipes(cell) for cell in table_data[0]]
595+
escaped_header = [
596+
format_cell(cell, 0, col_idx)
597+
for col_idx, cell in enumerate(table_data[0])
598+
]
565599
header_row = "| " + " | ".join(escaped_header) + " |\n"
566600
md_table += header_row
567601

@@ -570,13 +604,16 @@ def escape_pipes(cell_text: str) -> str:
570604
md_table += separator_row
571605

572606
# Data rows
573-
for row in table_data[1:]:
607+
for row_idx, row in enumerate(table_data[1:], start=1):
574608
# Ensure row has same number of columns as header
575609
while len(row) < num_cols:
576610
row.append("")
577-
# Escape pipe characters in each cell
578-
escaped_row = [escape_pipes(cell) for cell in row[:num_cols]]
579-
data_row = "| " + " | ".join(escaped_row) + " |\n"
611+
# Format each cell with optional coordinates
612+
formatted_row = [
613+
format_cell(cell, row_idx, col_idx)
614+
for col_idx, cell in enumerate(row[:num_cols])
615+
]
616+
data_row = "| " + " | ".join(formatted_row) + " |\n"
580617
md_table += data_row
581618

582619
return md_table
@@ -609,10 +646,16 @@ def _table_to_json(
609646
"<system-info>A table loaded as a JSON array:</system-info>",
610647
)
611648

612-
for row in table_data:
613-
json_strs.append(
614-
json.dumps(row, ensure_ascii=False),
615-
)
649+
for row_idx, row in enumerate(table_data):
650+
if self.include_cell_coordinates:
651+
# Include cell coordinates in the format {"A1": "value", ...}
652+
row_dict = {
653+
f"{_get_excel_column_name(col_idx)}{row_idx + 1}": cell
654+
for col_idx, cell in enumerate(row)
655+
}
656+
json_strs.append(json.dumps(row_dict, ensure_ascii=False))
657+
else:
658+
json_strs.append(json.dumps(row, ensure_ascii=False))
616659

617660
return "\n".join(json_strs)
618661

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# -*- coding: utf-8 -*-
2+
"""Utility functions for the RAG readers."""
3+
4+
5+
def _get_media_type_from_data(data: bytes) -> str:
6+
"""Determine media type from image data.
7+
8+
Args:
9+
data (`bytes`):
10+
The raw image data.
11+
12+
Returns:
13+
`str`:
14+
The MIME type of the image (e.g., "image/png", "image/jpeg").
15+
"""
16+
# Image signature mapping
17+
signatures = {
18+
b"\x89PNG\r\n\x1a\n": "image/png",
19+
b"\xff\xd8": "image/jpeg",
20+
b"GIF87a": "image/gif",
21+
b"GIF89a": "image/gif",
22+
b"BM": "image/bmp",
23+
}
24+
25+
# Check signatures
26+
for signature, media_type in signatures.items():
27+
if data.startswith(signature):
28+
return media_type
29+
30+
# Check WebP (RIFF at start + WEBP at offset 8)
31+
if len(data) > 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
32+
return "image/webp"
33+
34+
# Default to JPEG
35+
return "image/jpeg"

src/agentscope/rag/_reader/_word_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,4 +505,4 @@ def get_doc_id(self, word_path: str) -> str:
505505
`str`:
506506
The generated document ID.
507507
"""
508-
return hashlib.sha256(word_path.encode("utf-8")).hexdigest()
508+
return hashlib.md5(word_path.encode("utf-8")).hexdigest()

tests/rag_reader_test.py

Lines changed: 35 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -115,16 +115,6 @@ async def test_word_reader_with_images_and_tables(self) -> None:
115115
["text"] * 4 + ["image"] * 2 + ["text", "image", "text", "text"],
116116
)
117117

118-
import json
119-
120-
print(
121-
json.dumps(
122-
[_.metadata.content.get("text") for _ in docs],
123-
indent=4,
124-
ensure_ascii=False,
125-
),
126-
)
127-
128118
self.assertEqual(
129119
[_.metadata.content.get("text") for _ in docs],
130120
[
@@ -164,6 +154,7 @@ async def test_excel_reader_with_images_and_tables(self) -> None:
164154
split_by="sentence",
165155
include_image=True,
166156
separate_table=True,
157+
include_cell_coordinates=True,
167158
table_format="markdown",
168159
)
169160
excel_path = os.path.join(
@@ -177,71 +168,74 @@ async def test_excel_reader_with_images_and_tables(self) -> None:
177168
# then table blocks from second sheet
178169
# Order is based on row positions: table (row 0-5) → image (row 9)
179170
# → table (row 0-4)
171+
# Note: with include_cell_coordinates=True, cell coordinates are added
172+
# to each cell (e.g., [A1], [B1], etc.), which increases text length
173+
# and results in more chunks
180174
self.assertListEqual(
181175
[_.metadata.content["type"] for _ in docs],
182-
["text"] * 2 + ["image"] * 1 + ["text"] * 5,
183-
)
184-
185-
import json
186-
187-
print(
188-
json.dumps(
189-
[_.metadata.content.get("text") for _ in docs],
190-
indent=4,
191-
ensure_ascii=False,
192-
),
176+
["text"] * 3 + ["image"] * 1 + ["text"] * 5,
193177
)
194178

195179
# Verify exact document content
196180
doc_texts = [_.metadata.content.get("text") for _ in docs]
197181

198-
# Verify sheet headers and table content
182+
# Verify sheet headers and table content with cell coordinates
199183
# First text block should contain Employee Info sheet header and table
200-
# Note: Due to chunk_size=200, the last row is truncated
184+
# Note: Due to chunk_size=200, the rows are truncated
201185
# Order: table (row 0-5) → image (row 9) → table (row 0-4)
202186
self.assertEqual(
203187
doc_texts[0],
204188
"Sheet: Employee Info\n"
205-
"| John Smith | 25 | Engineering | 8000 | 2020-01-15 |\n"
189+
"| [A1] John Smith | [B1] 25 | [C1] Engineering | "
190+
"[D1] 8000 | [E1] 2020-01-15 |\n"
206191
"| --- | --- | --- | --- | --- |\n"
207-
"| Jane Doe | 30 | Sales | 12000 | 2019-03-20 |\n"
208-
"| Mike || Johnson | 35 | HR | 9000 | 2021-06-1",
192+
"| [A2] Jane Doe | [B2] 30 | [C2] Sales | "
193+
"[D2] 12000 | [E2] 2019-03-2",
209194
)
210195
# Second text block continues the employee table
211-
# Note: Starts with "0 |" from the truncated previous row
212196
self.assertEqual(
213197
doc_texts[1],
214198
"0 |\n"
215-
"| Sarah Wilson | 28 | Finance | 10000 | 2020-09-05 |\n"
216-
"| David Brown | 32 | Marketing | 11000 | 2018-12-01 |",
199+
"| [A3] Mike \\| Johnson | [B3] 35 | [C3] HR | "
200+
"[D3] 9000 | [E3] 2021-06-10 |\n"
201+
"| [A4] Sarah Wilson | [B4] 28 | [C4] Finance | "
202+
"[D4] 10000 | [E4] 2020-09-05 |\n"
203+
"| [A5] David Brown | [B5] 32 | [C5] Marketi",
204+
)
205+
# Third text block continues the employee table
206+
self.assertEqual(
207+
doc_texts[2],
208+
"ng | [D5] 11000 | [E5] 2018-12-01 |",
217209
)
218210
# Image block (text is None)
219-
self.assertIsNone(doc_texts[2])
220-
# Third text block should contain Product Info sheet header and
211+
self.assertIsNone(doc_texts[3])
212+
# Fourth text block should contain Product Info sheet header and
221213
# start of table
222214
self.assertEqual(
223-
doc_texts[3],
215+
doc_texts[4],
224216
"Sheet: Product Info\n"
225-
"| Product A | 100 | 50 | High-quality Product A, suitable for "
226-
"various scenarios.",
217+
"| [A1] Product A | [B1] 100 | [C1] 50 | "
218+
"[D1] High-quality Product A, suitable for various scenarios.",
227219
)
228220
# Remaining blocks continue the product table
229221
self.assertEqual(
230-
doc_texts[4],
222+
doc_texts[5],
231223
"|\n"
232224
"| --- | --- | --- | --- |\n"
233-
"| Product B | 200 | 30 | Product B offers excellent performance.",
225+
"| [A2] Product B | [B2] 200 | [C2] 30 | "
226+
"[D2] Product B offers excellent performance.",
234227
)
235228
self.assertEqual(
236-
doc_texts[5],
229+
doc_texts[6],
237230
"|\n"
238-
"| Product C | 300 | 20 | Product C is a market-leading solution.",
231+
"| [A3] Product C | [B3] 300 | [C3] 20 | "
232+
"[D3] Product C is a market-leading solution.",
239233
)
240234
self.assertEqual(
241-
doc_texts[6],
235+
doc_texts[7],
242236
"|\n"
243-
"| Product D | 400 | 40 | Product D provides comprehensive "
244-
"functionality.",
237+
"| [A4] Product D | [B4] 400 | [C4] 40 | "
238+
"[D4] Product D provides comprehensive functionality.",
245239
)
246240

247241
# Verify image media types

0 commit comments

Comments
 (0)