Skip to content

Commit

Permalink
fix(markdown): fix empty block handling
Browse files (browse the repository at this point in the history)
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Jan 30, 2025
1 parent d7c0828 commit c94c0d3
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 30 deletions.
68 changes: 38 additions & 30 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import List, Optional, Set, Union

import marko
import marko.element
import marko.ext
import marko.ext.gfm
import marko.inline
Expand Down Expand Up @@ -163,14 +164,14 @@ def process_inline_text(

def iterate_elements(
self,
element: marko.block.Element,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(
Expand Down Expand Up @@ -205,17 +206,22 @@ def traverse(node: marko.block.BlockElement):
)

elif isinstance(element, marko.block.List):
has_non_empty_list_items = False
for child in element.children:
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
has_non_empty_list_items = True
break

self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
list_label = GroupLabel.LIST
if element.ordered:
list_label = GroupLabel.ORDERED_LIST
parent_element = doc.add_group(
label=list_label, name=f"list", parent=parent_element
)
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_element = doc.add_group(
label=label, name=f"list", parent=parent_element
)

elif isinstance(element, marko.block.ListItem):
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(" - List item")
Expand Down Expand Up @@ -245,20 +251,18 @@ def traverse(node: marko.block.BlockElement):

doc.add_picture(parent=parent_element, caption=fig_caption)

elif isinstance(element, marko.block.Paragraph):
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self.process_inline_text(parent_element, doc)

elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = str(element.children).strip()
snippet_text = element.children.strip()
# Detect start of the table:
if "|" in snippet_text:
# most likely part of the markdown table
self.in_table = True
if len(self.md_table_buffer) > 0:
self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
snippet_text
)
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
else:
self.md_table_buffer.append(snippet_text)
else:
Expand All @@ -274,18 +278,15 @@ def traverse(node: marko.block.BlockElement):
snippet_text = str(element.children).strip()
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.block.FencedCode):
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.inline.LineBreak):
Expand All @@ -309,14 +310,21 @@ def traverse(node: marko.block.BlockElement):
self.close_table(doc)
_log.debug("Some other element: {}".format(element))

processed_block_types = (
marko.block.ListItem,
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
# marko.block.Paragraph,
marko.inline.RawText,
)

# Iterate through the element's children (if any)
if not isinstance(element, marko.block.ListItem):
if not isinstance(element, marko.block.Heading):
if not isinstance(element, marko.block.FencedCode):
# if not isinstance(element, marko.block.Paragraph):
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
if hasattr(element, "children") and not isinstance(
element, processed_block_types
):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)

def is_valid(self) -> bool:
return self.valid
Expand Down
33 changes: 33 additions & 0 deletions tests/data/groundtruth/docling_v2/blocks.md.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Unordered list:

- foo

Empty unordered list:

Ordered list:

- bar

Empty ordered list:

Heading:

# my heading

Empty heading:

Indented code block:

```
print("Hi!")
```

Empty indented code block:

Fenced code block:

```
print("Hello world!")
```

Empty fenced code block:
43 changes: 43 additions & 0 deletions tests/data/md/blocks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
Unordered list:

- foo

Empty unordered list:

-

Ordered list:

1. bar

Empty ordered list:

1.

Heading:

# my heading

Empty heading:

#

Indented code block:

print("Hi!")

Empty indented code block:



Fenced code block:

```python
print("Hello world!")
```

Empty fenced code block:

```
```

0 comments on commit c94c0d3

Please sign in to comment.