fix issues with exporting tables containing cells that span multiple rows and columns (#119)

BotAndyGao · web-flow · commit 72bf34f85874 · 2024-11-26T18:20:50.000+08:00
* 1、解决导出表格标注时添加colspan和rowspan时的异常 2、解决导出的gt文件中gt属性中html标签合规的问题
* fix code style
* 修复单元格占多行又占多列导出报错的问题。issues：导出表格标注报错 #113
diff --git a/PPOCRLabel.py b/PPOCRLabel.py
@@ -3181,7 +3181,6 @@ def exportJSON(self):
         """
         export PPLabel and CSV to JSON (PubTabNet)
         """
-        import pandas as pd
 
         # automatically save annotations
         self.saveFilestate()
diff --git a/libs/utils.py b/libs/utils.py
@@ -232,14 +232,16 @@ def convert_token(html_list):
             elif col == "td":
                 token_list.extend(["<td>", "</td>"])
             else:
-                token_list.append("<td")
-                if "colspan" in col:
-                    _, n = col.split("colspan=")
-                    token_list.append(' colspan="{}"'.format(int(n)))
-                if "rowspan" in col:
-                    _, n = col.split("rowspan=")
-                    token_list.append(' rowspan="{}"'.format(int(n)))
-                token_list.extend([">", "</td>"])
+                token_list.append("<td")  # Start the td tag
+                # Use regex to match "colspan" and "rowspan" attributes and their values
+                colspan_match = re.search(r"colspan=(\d+)", col)
+                rowspan_match = re.search(r"rowspan=(\d+)", col)
+                if colspan_match:
+                    token_list.append(f' colspan="{colspan_match.group(1)}"')
+                if rowspan_match:
+                    token_list.append(f' rowspan="{rowspan_match.group(1)}"')
+                token_list.append(">")  # End the opening td tag
+                token_list.append("</td>")  # Close the td tag
         token_list.append("</tr>")
     token_list.append("</tbody>")