Skip to content

Commit

Permalink
Merge pull request #9 from RapidAI/develop
Browse files Browse the repository at this point in the history
fix: fixed issue #3 #7 #8
  • Loading branch information
SWHL authored Dec 24, 2024
2 parents c2721b0 + 3cd44e1 commit 103602a
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 40 deletions.
Binary file added 1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,18 @@
| `yolov8n_layout_report`| 研报 | `yolov8n_layout_report.onnx` | `['Text', 'Title', 'Header', 'Footer', 'Figure', 'Table', 'Toc', 'Figure caption', 'Table caption']` |
| `yolov8n_layout_publaynet`| 英文 | `yolov8n_layout_publaynet.onnx` | `["Text", "Title", "List", "Table", "Figure"]` |
| `yolov8n_layout_general6`| 通用 | `yolov8n_layout_general6.onnx` | `["Text", "Title", "Figure", "Table", "Caption", "Equation"]` |
| 🔥`doclayout_yolo`| 通用 | `doclayout_yolo_docstructbench_imgsz1024.onnx` | `['title', 'text', 'abandon', 'figure', 'figure_caption', 'table', 'table_caption', 'table_footnote', 'isolate_formula', 'formula_caption']` |
| 🔥`doclayout_docstructbench`| 通用 | `doclayout_yolo_docstructbench_imgsz1024.onnx` | `['title', 'plain text', 'abandon', 'figure', 'figure_caption', 'table', 'table_caption', 'table_footnote', 'isolate_formula', 'formula_caption']` |
| 🔥`doclayout_d4la`| 通用 | `doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.onnx` | `['DocTitle', 'ParaTitle', 'ParaText', 'ListText', 'RegionTitle', 'Date', 'LetterHead', 'LetterDear', 'LetterSign', 'Question', 'OtherText', 'RegionKV', 'RegionList', 'Abstract', 'Author', 'TableName', 'Table', 'Figure', 'FigureName', 'Equation', 'Reference', 'Footer', 'PageHeader', 'PageFooter', 'Number', 'Catalog', 'PageNumber']` |
| 🔥`doclayout_docsynth`| 通用 | `doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.onnx` | `['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']` |

PP模型来源:[PaddleOCR 版面分析](https://github.com/PaddlePaddle/PaddleOCR/blob/133d67f27dc8a241d6b2e30a9f047a0fb75bebbe/ppstructure/layout/README_ch.md)

yolov8n系列来源:[360LayoutAnalysis](https://github.com/360AILAB-NLP/360LayoutAnalysis)

doclayout版本暂时有问题,不推荐使用。正在更新中....
~~(推荐使用)🔥doclayout_yolo模型来源:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO),该模型是目前最为优秀的开源模型,支持学术论文、Textbook、Financial、Exam Paper、Fuzzy Scans、PPT和Poster 7种文档类型的版面检测。值得一提的是,该模型支持的类别中存在`abandon`一类,主要是文档页面的页眉页脚部分,便于后续快速舍弃。~~

模型下载地址为:[link](https://github.com/RapidAI/RapidLayout/releases/tag/v0.0.0)
(推荐使用)🔥doclayout_yolo模型来源:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO),该模型是目前最为优秀的开源模型,挑选了3个基于不同训练集训练得到的模型。其中,`doclayout_docstructbench`来自[link](https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/tree/main),`doclayout_d4la`来自[link](https://huggingface.co/juliozhao/DocLayout-YOLO-D4LA-Docsynth300K_pretrained/blob/main/doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.pt),`doclayout_docsynth`来自[link](https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/tree/main)。

DocLayout模型下载地址为:[link](https://github.com/RapidAI/RapidLayout/releases/tag/v0.0.0)

### 安装

Expand Down
5 changes: 3 additions & 2 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@

from rapid_layout import RapidLayout, VisLayout

layout_engine = RapidLayout(model_type="doclayout_yolo", conf_thres=0.1)
layout_engine = RapidLayout(model_type="doclayout_docsynth")

img_path = "tests/test_files/PMC3576793_00004.jpg"
img = cv2.imread(img_path)

boxes, scores, class_names, elapse = layout_engine(img)
boxes, scores, class_names, elapse = layout_engine(img_path)
print(boxes.shape)
ploted_img = VisLayout.draw_detections(img, boxes, scores, class_names)
if ploted_img is not None:
cv2.imwrite("layout_res.png", ploted_img)
4 changes: 3 additions & 1 deletion rapid_layout/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@
"yolov8n_layout_report": f"{ROOT_URL}/yolov8n_layout_report.onnx",
"yolov8n_layout_publaynet": f"{ROOT_URL}/yolov8n_layout_publaynet.onnx",
"yolov8n_layout_general6": f"{ROOT_URL}/yolov8n_layout_general6.onnx",
"doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024.onnx",
"doclayout_docstructbench": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024.onnx",
"doclayout_d4la": f"{ROOT_URL}/doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.onnx",
"doclayout_docsynth": f"{ROOT_URL}/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.onnx",
}
DEFAULT_MODEL_PATH = str(ROOT_DIR / "models" / "layout_cdla.onnx")

Expand Down
85 changes: 85 additions & 0 deletions rapid_layout/utils/augment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: [email protected]
import cv2
import numpy as np


class LetterBox:
    """Aspect-preserving resize followed by constant-colour padding.

    The image is scaled so that it fits inside ``new_shape`` and the leftover
    area is filled with a grey border, either split evenly on both sides
    (``center=True``) or placed entirely on the bottom/right.
    """

    def __init__(
        self,
        new_shape=(640, 640),
        auto=False,
        scaleFill=False,
        scaleup=True,
        center=True,
        stride=32,
    ):
        """Remember the letterbox configuration.

        Args:
            new_shape: Target (height, width); a single int is expanded to a
                square when the transform is applied.
            auto: Pad only up to the next multiple of ``stride``.
            scaleFill: Stretch to ``new_shape`` with no padding (only
                consulted when ``auto`` is False).
            scaleup: Allow enlarging images smaller than the target.
            center: Split the padding between both sides instead of putting
                it all on the bottom/right.
            stride: Modulus used for the ``auto`` padding.
        """
        self.new_shape = new_shape
        self.auto = auto
        self.scaleFill = scaleFill
        self.scaleup = scaleup
        self.stride = stride
        self.center = center  # True: image in the middle, False: top-left

    def __call__(self, labels=None, image=None):
        """Letterbox ``image`` (or ``labels["img"]`` when ``image`` is None).

        Returns the padded image alone when ``labels`` carries no entries
        (after ``"rect_shape"`` has been popped); otherwise returns ``labels``
        with updated instances plus the ``"img"`` and ``"resized_shape"`` keys.
        """
        labels = {} if labels is None else labels
        img = image if image is not None else labels.get("img")
        shape = img.shape[:2]  # (height, width) of the incoming image
        target = labels.pop("rect_shape", self.new_shape)
        if isinstance(target, int):
            target = (target, target)

        # Uniform scale factor (target / source) that keeps the aspect ratio.
        scale = min(target[0] / shape[0], target[1] / shape[1])
        if not self.scaleup:
            # Shrink only; never enlarge small images.
            scale = min(scale, 1.0)

        ratio = scale, scale  # (width ratio, height ratio)
        new_unpad = int(round(shape[1] * scale)), int(round(shape[0] * scale))
        pad_w = target[1] - new_unpad[0]
        pad_h = target[0] - new_unpad[1]
        if self.auto:
            # Minimum rectangle: keep only the remainder modulo the stride.
            pad_w, pad_h = np.mod(pad_w, self.stride), np.mod(pad_h, self.stride)
        elif self.scaleFill:
            # Stretch to the full target, so no padding is left over.
            pad_w, pad_h = 0.0, 0.0
            new_unpad = (target[1], target[0])
            ratio = (target[1] / shape[1], target[0] / shape[0])

        if self.center:
            # Half of the padding goes on each side.
            pad_w /= 2
            pad_h /= 2

        if shape[::-1] != new_unpad:
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        # The -0.1 / +0.1 offsets push an odd padding's extra pixel to the
        # bottom/right side.
        if self.center:
            top = int(round(pad_h - 0.1))
            left = int(round(pad_w - 0.1))
        else:
            top, left = 0, 0
        bottom = int(round(pad_h + 0.1))
        right = int(round(pad_w + 0.1))

        img = cv2.copyMakeBorder(
            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )
        if labels.get("ratio_pad"):
            # Keep the applied offsets next to the existing ratio (for evaluation).
            labels["ratio_pad"] = (labels["ratio_pad"], (left, top))

        if len(labels) == 0:
            return img

        labels = self._update_labels(labels, ratio, pad_w, pad_h)
        labels["img"] = img
        labels["resized_shape"] = target
        return labels

    def _update_labels(self, labels, ratio, padw, padh):
        """Apply the same scale and padding to ``labels["instances"]``.

        The instances object is converted to xyxy, denormalized with the
        (width, height) of ``labels["img"]``, scaled by ``ratio`` and shifted
        by the padding. NOTE(review): the instances type is defined outside
        this file; its exact semantics should be confirmed there.
        """
        instances = labels["instances"]
        instances.convert_bbox(format="xyxy")
        instances.denormalize(*labels["img"].shape[:2][::-1])
        instances.scale(*ratio)
        instances.add_padding(padw, padh)
        return labels
81 changes: 58 additions & 23 deletions rapid_layout/utils/post_prepross.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def extract_boxes(self, predictions):


class DocLayoutPostProcess:
def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
def __init__(self, labels: List[str], conf_thres=0.2, iou_thres=0.5):
self.labels = labels
self.conf_threshold = conf_thres
self.iou_threshold = iou_thres
Expand All @@ -308,31 +308,18 @@ def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):

def __call__(
self,
output,
preds,
ori_img_shape: Tuple[int, int],
img_shape: Tuple[int, int] = (1024, 1024),
):
self.img_height, self.img_width = ori_img_shape
self.input_height, self.input_width = img_shape

output = output[0].squeeze()
boxes = output[:, :-2]
confidences = output[:, -2]
class_ids = output[:, -1].astype(int)

mask = confidences > self.conf_threshold
boxes = boxes[mask, :]
confidences = confidences[mask]
class_ids = class_ids[mask]

# Rescale boxes to original image dimensions
boxes = rescale_boxes(
boxes,
self.input_width,
self.input_height,
self.img_width,
self.img_height,
)
preds = preds[0]
mask = preds[..., 4] > self.conf_threshold
preds = [p[mask[idx]] for idx, p in enumerate(preds)][0]
preds[:, :4] = scale_boxes(list(img_shape), preds[:, :4], list(ori_img_shape))

boxes = preds[:, :4]
confidences = preds[:, 4]
class_ids = preds[:, 5].astype(int)
labels = [self.labels[i] for i in class_ids]
return boxes, confidences, labels

Expand All @@ -345,6 +332,54 @@ def rescale_boxes(boxes, input_width, input_height, img_width, img_height):
return boxes


def scale_boxes(
    img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False
):
    """
    Rescale bounding boxes from the shape of the image they were predicted on
    (img1_shape) to the shape of a different image (img0_shape).

    The boxes are modified in place and the same object is returned.

    Args:
        img1_shape (tuple): Shape of the image the boxes currently refer to, as (height, width).
        boxes (np.ndarray): Boxes whose first four entries along the last axis are (x1, y1, x2, y2),
            or (x, y, w, h) when ``xywh`` is True. The in-place division means the array is
            expected to hold floating point values.
        img0_shape (tuple): Shape of the target image, as (height, width).
        ratio_pad (tuple): Optional (ratio, pad) pair; ``ratio[0]`` is used as the gain and ``pad`` as the
            (x, y) padding. If not provided, both are computed from the two shapes.
        padding (bool): If True, the boxes are assumed to come from a letterboxed (YOLO style) image and
            the padding offset is removed before rescaling. If False, only the rescaling is applied.
        xywh (bool): Whether the boxes are in xywh format, default=False.
    Returns:
        boxes (np.ndarray): The rescaled boxes. Boxes in xyxy format are clipped to img0_shape; boxes in
            xywh format are returned unclipped.
    """
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(
            img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]
        )  # gain = old / new
        pad = (
            round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1),
            round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1),
        )  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    if padding:
        boxes[..., 0] -= pad[0]  # x padding
        boxes[..., 1] -= pad[1]  # y padding
        if not xywh:
            # Only corner coordinates are shifted; a width/height is a size,
            # not a position, so it is unaffected by the offset.
            boxes[..., 2] -= pad[0]  # x padding
            boxes[..., 3] -= pad[1]  # y padding
    boxes[..., :4] /= gain

    if xywh:
        # clip_boxes treats columns 2 and 3 as x2/y2; applying it to widths
        # and heights would clamp sizes as if they were coordinates.
        return boxes
    return clip_boxes(boxes, img0_shape)


def clip_boxes(boxes, shape):
    """Clamp xyxy boxes in place to an image of ``shape`` (height, width).

    Columns 0 and 2 (x1, x2) are limited to ``[0, width]`` and columns 1 and 3
    (y1, y2) to ``[0, height]``. The same ``boxes`` object is returned.
    """
    limits = (
        ([0, 2], shape[1]),  # x1, x2 against the width
        ([1, 3], shape[0]),  # y1, y2 against the height
    )
    for columns, upper in limits:
        boxes[..., columns] = boxes[..., columns].clip(0, upper)
    return boxes


def nms(boxes, scores, iou_threshold):
# Sort by score
sorted_indices = np.argsort(scores)[::-1]
Expand Down
17 changes: 9 additions & 8 deletions rapid_layout/utils/pre_procss.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
import cv2
import numpy as np

from .augment import LetterBox

InputType = Union[str, np.ndarray, bytes, Path]


class PPPreProcess:

def __init__(self, img_size: Tuple[int, int]):
self.size = img_size
self.mean = np.array([0.485, 0.456, 0.406])
Expand Down Expand Up @@ -41,7 +42,6 @@ def permute(self, img: np.ndarray) -> np.ndarray:


class YOLOv8PreProcess:

def __init__(self, img_size: Tuple[int, int]):
self.img_size = img_size

Expand All @@ -54,14 +54,15 @@ def __call__(self, image: np.ndarray) -> np.ndarray:


class DocLayoutPreProcess:

def __init__(self, img_size: Tuple[int, int]):
self.img_size = img_size
self.letterbox = LetterBox(new_shape=img_size, auto=False, stride=32)

def __call__(self, image: np.ndarray) -> np.ndarray:
input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
input_img = cv2.resize(image, self.img_size)
input_img = input_img / 255.0
input_img = input_img.transpose(2, 0, 1)
input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
input_img = self.letterbox(image=image)
input_img = input_img[None, ...]
input_img = input_img[..., ::-1].transpose(0, 3, 1, 2)
input_img = np.ascontiguousarray(input_img)
input_img = input_img / 255
input_tensor = input_img.astype(np.float32)
return input_tensor
9 changes: 7 additions & 2 deletions tests/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,15 @@
[
("yolov8n_layout_publaynet", 12),
("yolov8n_layout_general6", 13),
("doclayout_yolo", 14),
(
"doclayout_docstructbench",
14,
),
("doclayout_d4la", 11),
("doclayout_docsynth", 14),
],
)
def test_yolov8n_layout(model_type, gt):
def test_layout(model_type, gt):
img_path = test_file_dir / "PMC3576793_00004.jpg"
engine = RapidLayout(model_type=model_type)
boxes, scores, class_names, *elapse = engine(img_path)
Expand Down

0 comments on commit 103602a

Please sign in to comment.