88
99from ._reader_base import ReaderBase
1010from ._text_reader import TextReader
11+ from ._utils import _get_media_type_from_data
1112from .._document import Document , DocMetadata
1213from ...message import ImageBlock , Base64Source , TextBlock
1314from ..._logging import logger
1415
1516
17+ def _get_excel_column_name (col_index : int ) -> str :
18+ """Convert a 0-based column index to Excel column name (A, B, ..., Z, AA,
19+ AB, ...).
20+
21+ Args:
22+ col_index (`int`):
23+ The 0-based column index.
24+
25+ Returns:
26+ `str`:
27+ The Excel column name (e.g., 'A' for 0, 'B' for 1, 'AA' for 26).
28+ """
29+ result = ""
30+ col_index += 1 # Convert to 1-based
31+ while col_index > 0 :
32+ col_index -= 1
33+ result = chr (ord ("A" ) + col_index % 26 ) + result
34+ col_index //= 26
35+ return result
36+
37+
1638def _extract_table_data (df : Any ) -> list [list [str ]]:
1739 """Extract table data from a DataFrame, handling NaN values.
1840
@@ -95,45 +117,46 @@ def _extract_images_from_worksheet(
95117 return images
96118
97119
98- def _get_media_type_from_data (data : bytes ) -> str :
99- """Determine media type from image data.
120+ class ExcelReader (ReaderBase ):
121+ """The Excel reader that supports reading text, image, and table
122+ content from Excel files (.xlsx, .xls files), and chunking the text
123+ content into smaller pieces.
100124
101- Args:
102- data (`bytes`):
103- The raw image data.
125+ .. note:: The table content can be extracted in Markdown or JSON format.
104126
105- Returns:
106- `str`:
107- The MIME type of the image (e.g., "image/png", "image/jpeg").
108- """
109- # Image signature mapping
110- signatures = {
111- b"\x89 PNG\r \n \x1a \n " : "image/png" ,
112- b"\xff \xd8 " : "image/jpeg" ,
113- b"GIF87a" : "image/gif" ,
114- b"GIF89a" : "image/gif" ,
115- b"BM" : "image/bmp" ,
116- }
127+ **Markdown format example** (``include_cell_coordinates=False``):
117128
118- # Check signatures
119- for signature , media_type in signatures .items ():
120- if data .startswith (signature ):
121- return media_type
129+ .. code-block:: text
122130
123- # Check WebP (RIFF at start + WEBP at offset 8)
124- if len (data ) > 12 and data [:4 ] == b"RIFF" and data [8 :12 ] == b"WEBP" :
125- return "image/webp"
131+ | Name | Age | City |
132+ |-------|-----|----------|
133+ | Alice | 25 | New York |
134+ | Bob | 30 | London |
126135
127- # Default to JPEG
128- return "image/jpeg"
136+ **Markdown format example** (``include_cell_coordinates=True``):
129137
138+ .. code-block:: text
130139
131- class ExcelReader ( ReaderBase ):
132- """The Excel reader that supports reading text, image, and table
133- content from Excel files (.xlsx, .xls files), and chunking the text
134- content into smaller pieces.
140+ | [A1] Name | [B1] Age | [C1] City |
141+ |------------|----------|---------------|
142+ | [A2] Alice | [B2] 25 | [C2] New York |
143+ | [A3] Bob | [B3] 30 | [C3] London |
135144
136- .. note:: The table content can be extracted in Markdown or JSON format.
145+ **JSON format example** (``include_cell_coordinates=False``):
146+
147+ .. code-block:: json
148+
149+ ["Name", "Age", "City"]
150+ ["Alice", "25", "New York"]
151+ ["Bob", "30", "London"]
152+
153+ **JSON format example** (``include_cell_coordinates=True``):
154+
155+ .. code-block:: json
156+
157+ {"A1": "Name", "B1": "Age", "C1": "City"}
158+ {"A2": "Alice", "B2": "25", "C2": "New York"}
159+ {"A3": "Bob", "B3": "30", "C3": "London"}
137160 """
138161
139162 def __init__ (
@@ -558,10 +581,21 @@ def _table_to_markdown(
558581 # structure
559582 def escape_pipes (cell_text : str ) -> str :
560583 """Escape pipe characters in cell content."""
561- return cell_text .replace ("|" , "||" )
584+ return cell_text .replace ("|" , "\\ |" )
585+
586+ def format_cell (cell : str , row_idx : int , col_idx : int ) -> str :
587+ """Format cell content with optional coordinates."""
588+ escaped = escape_pipes (cell )
589+ if self .include_cell_coordinates :
590+ coord = f"{ _get_excel_column_name (col_idx )} { row_idx + 1 } "
591+ return f"[{ coord } ] { escaped } "
592+ return escaped
562593
563594 # Header row (first row)
564- escaped_header = [escape_pipes (cell ) for cell in table_data [0 ]]
595+ escaped_header = [
596+ format_cell (cell , 0 , col_idx )
597+ for col_idx , cell in enumerate (table_data [0 ])
598+ ]
565599 header_row = "| " + " | " .join (escaped_header ) + " |\n "
566600 md_table += header_row
567601
@@ -570,13 +604,16 @@ def escape_pipes(cell_text: str) -> str:
570604 md_table += separator_row
571605
572606 # Data rows
573- for row in table_data [1 :]:
607+ for row_idx , row in enumerate ( table_data [1 :], start = 1 ) :
574608 # Ensure row has same number of columns as header
575609 while len (row ) < num_cols :
576610 row .append ("" )
577- # Escape pipe characters in each cell
578- escaped_row = [escape_pipes (cell ) for cell in row [:num_cols ]]
579- data_row = "| " + " | " .join (escaped_row ) + " |\n "
611+ # Format each cell with optional coordinates
612+ formatted_row = [
613+ format_cell (cell , row_idx , col_idx )
614+ for col_idx , cell in enumerate (row [:num_cols ])
615+ ]
616+ data_row = "| " + " | " .join (formatted_row ) + " |\n "
580617 md_table += data_row
581618
582619 return md_table
@@ -609,10 +646,16 @@ def _table_to_json(
609646 "<system-info>A table loaded as a JSON array:</system-info>" ,
610647 )
611648
612- for row in table_data :
613- json_strs .append (
614- json .dumps (row , ensure_ascii = False ),
615- )
649+ for row_idx , row in enumerate (table_data ):
650+ if self .include_cell_coordinates :
651+ # Include cell coordinates in the format {"A1": "value", ...}
652+ row_dict = {
653+ f"{ _get_excel_column_name (col_idx )} { row_idx + 1 } " : cell
654+ for col_idx , cell in enumerate (row )
655+ }
656+ json_strs .append (json .dumps (row_dict , ensure_ascii = False ))
657+ else :
658+ json_strs .append (json .dumps (row , ensure_ascii = False ))
616659
617660 return "\n " .join (json_strs )
618661
0 commit comments