Merge pull request #348 from alibaba/0104-yuluo/pdf-tables

feat:add pdf tables parse
alibaba · Jan 7, 2025 · 49743df · 49743df
2 parents 73294b5 + 2557b74
commit 49743df
Show file tree

Hide file tree

Showing 8 changed files with 392 additions and 0 deletions.
diff --git a/community/document-parsers/document-parser-pdf-tables/pom.xml b/community/document-parsers/document-parser-pdf-tables/pom.xml
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Copyright 2024-2025 the original author or authors.
+  ~
+  ~ Licensed under the Apache License, Version 2.0 (the "License");
+  ~ you may not use this file except in compliance with the License.
+  ~ You may obtain a copy of the License at
+  ~
+  ~ https://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+		 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+		 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<parent>
+		<groupId>com.alibaba.cloud.ai</groupId>
+		<artifactId>spring-ai-alibaba</artifactId>
+		<version>${revision}</version>
+		<relativePath>../../../pom.xml</relativePath>
+	</parent>
+
+	<artifactId>document-parser-pdf-tables</artifactId>
+
+	<properties>
+		<maven.compiler.source>17</maven.compiler.source>
+		<maven.compiler.target>17</maven.compiler.target>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+	</properties>
+
+	<dependencies>
+
+		<dependency>
+			<groupId>com.alibaba.cloud.ai</groupId>
+			<artifactId>spring-ai-alibaba-core</artifactId>
+			<version>${project.parent.version}</version>
+		</dependency>
+
+		<!-- table extraction library used by the main PdfTablesParser code (compile scope, not test-only) -->
+		<dependency>
+			<groupId>fr.neolegal</groupId>
+			<artifactId>tabula</artifactId>
+			<version>1.0.12</version>
+		</dependency>
+
+		<!-- test dependencies -->
+		<dependency>
+			<groupId>org.springframework.ai</groupId>
+			<artifactId>spring-ai-test</artifactId>
+			<scope>test</scope>
+		</dependency>
+	</dependencies>
+
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.springframework.boot</groupId>
+				<artifactId>spring-boot-maven-plugin</artifactId>
+				<version>${spring-boot.version}</version>
+			</plugin>
+		</plugins>
+	</build>
+
+</project>
diff --git a/...rser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java b/...rser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2024-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.alibaba.cloud.ai.parser.pdf.tables;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import com.alibaba.cloud.ai.document.DocumentParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import technology.tabula.ObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.Table;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import org.springframework.ai.document.Document;
+
+/**
+ * A {@link DocumentParser} that extracts tabular data from a single page of a PDF file
+ * using <a href="https://github.com/tabulapdf/tabula-java">tabula-java</a>, which the
+ * original author preferred over plain Apache PDFBox for its better table recognition.
+ * Every table row found on the page becomes one {@link Document} whose text is the row's
+ * cells, each followed by a {@code |} separator.
+ *
+ * @author yuluo
+ * @author <a href="mailto:[email protected]">yuluo</a>
+ */
+
+public class PdfTablesParser implements DocumentParser {
+
+	/**
+	 * The number of the PDF page to be parsed (the first page is 1). Default value is 1.
+	 */
+	private final Integer page;
+
+	/**
+	 * Extra metadata entries copied onto every {@link Document} produced by this parser.
+	 */
+	private final Map<String, String> metadata;
+
+	/**
+	 * Creates a parser for the first page with no extra metadata.
+	 */
+	public PdfTablesParser() {
+
+		this(1);
+	}
+
+	/**
+	 * Creates a parser for the given page with no extra metadata.
+	 * @param pageNumber the page to parse, starting at 1
+	 */
+	public PdfTablesParser(Integer pageNumber) {
+
+		this(pageNumber, Map.of());
+	}
+
+	/**
+	 * Creates a parser for the given page that attaches {@code metadata} to every
+	 * produced document.
+	 * @param pageNumber the page to parse, starting at 1; validated when parsing
+	 * @param metadata entries to add to each document; {@code null} is treated as empty
+	 */
+	public PdfTablesParser(Integer pageNumber, Map<String, String> metadata) {
+
+		this.page = pageNumber;
+		// A missing map means "no metadata"; this avoids a late NullPointerException.
+		this.metadata = (metadata != null) ? metadata : Map.of();
+	}
+
+	/**
+	 * Parses the configured page of the PDF and converts every table row into a
+	 * {@link Document}.
+	 * @param inputStream the PDF content
+	 * @return one document per table row, or an empty list when the page has no tables
+	 * @throws RuntimeException wrapping any failure raised while loading or extracting
+	 */
+	@Override
+	public List<Document> parse(InputStream inputStream) {
+
+		try {
+			return data2Document(parseTables(extraTableData(inputStream)));
+		}
+		catch (Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Loads the PDF and runs the spreadsheet extraction algorithm on the configured page.
+	 * @param in the PDF content
+	 * @return the tables detected on the page
+	 * @throws Exception if the PDF cannot be loaded, has no pages, or the configured page
+	 * number is missing, not positive, or beyond the last page
+	 */
+	protected List<Table> extraTableData(InputStream in) throws Exception {
+
+		if (page == null || page < 1) {
+			throw new IllegalArgumentException("The page number must be a positive integer, but was: " + page);
+		}
+
+		// Close the PDF once extraction is done so its resources are not leaked.
+		try (PDDocument document = PDDocument.load(in)) {
+
+			// check pdf files
+			int numberOfPages = document.getNumberOfPages();
+			if (numberOfPages <= 0) {
+
+				throw new RuntimeException("No page found in the PDF file.");
+			}
+
+			if (page > numberOfPages) {
+
+				throw new RuntimeException("The page number is greater than the number of pages in the PDF file.");
+			}
+
+			SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
+
+			// extract page by page numbers.
+			Page extract = new ObjectExtractor(document).extract(this.page);
+
+			return sea.extract(extract);
+		}
+	}
+
+	/**
+	 * Flattens the tables into one string per row. Inside a cell, carriage returns are
+	 * removed and line feeds become spaces; cells are joined with {@code |} and the row
+	 * is terminated with a trailing {@code |}.
+	 * @param data the tables to flatten
+	 * @return the rows of all tables, in table order
+	 */
+	protected List<String> parseTables(List<Table> data) {
+
+		if (data.isEmpty()) {
+			return Collections.emptyList();
+		}
+
+		return data.stream()
+			.flatMap(table -> table.getRows()
+				.stream()
+				.map(cells -> cells.stream()
+					.map(content -> content.getText().replace("\r", "").replace("\n", " "))
+					// yields "a|b|" for cells [a, b] and "|" for a row without cells
+					.collect(Collectors.joining("|", "", "|"))))
+			.collect(Collectors.toList());
+	}
+
+	/**
+	 * Wraps every row string in a {@link Document} and attaches the configured metadata.
+	 * @param data the row strings
+	 * @return the documents, never {@code null}; empty when {@code data} is empty
+	 */
+	private List<Document> data2Document(List<String> data) {
+
+		// An empty input yields an empty list (not null) so callers can iterate safely.
+		List<Document> documents = new ArrayList<>(data.size());
+
+		for (String datum : data) {
+			Document doc = new Document(datum);
+			documents.add(addMetadata(doc));
+		}
+
+		return documents;
+	}
+
+	/**
+	 * Copies the configured metadata entries into the document's metadata map.
+	 * @param document the document to enrich
+	 * @return the same document instance
+	 */
+	private Document addMetadata(Document document) {
+
+		if (!metadata.isEmpty()) {
+			document.getMetadata().putAll(metadata);
+		}
+
+		return document;
+	}
+
+}
diff --git a/...pdf-tables/src/test/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParserTests.java b/...pdf-tables/src/test/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParserTests.java
@@ -0,0 +1,166 @@
+/*
+ * Copyright 2024-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.alibaba.cloud.ai.parser.pdf.tables;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.locationtech.jts.util.Assert;
+import technology.tabula.ObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.PageIterator;
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import org.springframework.ai.document.Document;
+import org.springframework.core.io.DefaultResourceLoader;
+import org.springframework.core.io.Resource;
+
+/**
+ * Tests for {@link PdfTablesParser} against the PDF fixtures on the test classpath.
+ *
+ * @author yuluo
+ * @author <a href="mailto:[email protected]">yuluo</a>
+ */
+
+class PdfTablesParserTests {
+
+	private Resource resource;
+
+	private Resource resource2;
+
+	@BeforeEach
+	void setUp() {
+
+		resource = new DefaultResourceLoader().getResource("classpath:/pdf-tables.pdf");
+		resource2 = new DefaultResourceLoader().getResource("classpath:/sample1.pdf");
+
+		// Fail early with a clear message if either fixture is missing.
+		if (!resource.exists()) {
+			throw new RuntimeException("Resource not found: " + resource);
+		}
+		if (!resource2.exists()) {
+			throw new RuntimeException("Resource not found: " + resource2);
+		}
+	}
+
+	/**
+	 * tabula-java use: prints every table cell of every page; no assertions.
+	 */
+	@Test
+	void PdfTableTest() throws IOException {
+
+		// Both the stream and the document are closed when the block exits.
+		try (InputStream in = new FileInputStream(resource.getFile()); PDDocument document = PDDocument.load(in)) {
+			SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
+			PageIterator pi = new ObjectExtractor(document).extract();
+			while (pi.hasNext()) {
+				// iterate over the pages of the document
+				Page page = pi.next();
+				List<Table> tables = sea.extract(page);
+				// iterate over the tables of the page
+				for (Table table : tables) {
+					List<List<RectangularTextContainer>> rows = table.getRows();
+					// iterate over the rows of the table
+					for (List<RectangularTextContainer> cells : rows) {
+						// print all column-cells of the row plus linefeed
+						for (RectangularTextContainer content : cells) {
+							// Note: Cell.getText() uses \r to concat text chunk
+							String text = content.getText().replace("\r", " ");
+							System.out.print(text + "|");
+						}
+						System.out.println();
+					}
+				}
+			}
+		}
+
+	}
+
+	@Test
+	void PdfTablesParseTest() throws IOException {
+
+		String res = """
+				|name|age|sex|
+				|zhangsan|20|m|
+				|lisi|21|w|
+				|wangwu|22|m|
+				|zhangliu|23|w|
+				|songqi|24|w|
+				""";
+
+		List<Document> docs = parseWith(new PdfTablesParser(), resource);
+
+		// The first parsed row is not part of the expected text, so it is skipped here.
+		Assert.equals(res, joinLines(docs.subList(1, docs.size())));
+	}
+
+	@Test
+	void PdfTablesParseTest2() throws IOException {
+
+		String res = """
+				Sample Date:|May 2001|
+				Prepared by:|Accelio Present Applied Technology|
+				Created and Tested Using:|•Accelio Present Central 5.4•Accelio Present Output Designer 5.4|
+				Features Demonstrated:|•Primary bookmarks in a PDF file.•Secondary bookmarks in a PDF file.|
+				""";
+
+		List<Document> docs = parseWith(new PdfTablesParser(), resource2);
+
+		Assert.equals(res, joinLines(docs));
+
+	}
+
+	@Test
+	void PdfTablesParseTest3() throws IOException {
+
+		String res = """
+				|Filename|||escription|escription||
+				|||||||
+				ap_bookmark.IFD|The template design.||||||
+				ap_bookmark.mdf|The template targeted for PDF output.||||||
+				ap_bookmark.dat|A sample data file in DAT format.||||||
+				ap_bookmark.bmk|A sample bookmark file.||||||
+				ap_bookmark.pdf|Sample PDF output.||||||
+				ap_bookmark_doc.pdf|A document describing the sample.||||||
+				|To bookmark by|Use the command line parameter||
+				|Invoices|-abmkap_bookmark.bmk -abmsinvoices||
+				|Type|-abmkap_bookmark.bmk -abmstype||
+				|Amount|-abmkap_bookmark.bmk -abmsamount||
+				""";
+
+		List<Document> docs = parseWith(new PdfTablesParser(3), resource2);
+
+		Assert.equals(res, joinLines(docs));
+
+	}
+
+	/**
+	 * Opens the PDF resource, parses it with the given parser and closes the stream.
+	 */
+	private static List<Document> parseWith(PdfTablesParser parser, Resource pdf) throws IOException {
+
+		try (InputStream in = new FileInputStream(pdf.getFile())) {
+			return parser.parse(in);
+		}
+	}
+
+	/**
+	 * Concatenates the text of the documents, terminating each one with a line feed.
+	 */
+	private static String joinLines(List<Document> docs) {
+
+		StringBuilder sb = new StringBuilder();
+		docs.forEach(doc -> sb.append(doc.getText()).append("\n"));
+		return sb.toString();
+	}
+
+}
diff --git a/community/document-parsers/document-parser-pdf-tables/src/test/resources/pdf-tables.pdf b/community/document-parsers/document-parser-pdf-tables/src/test/resources/pdf-tables.pdf
diff --git a/community/document-parsers/document-parser-pdf-tables/src/test/resources/sample1.pdf b/community/document-parsers/document-parser-pdf-tables/src/test/resources/sample1.pdf
diff --git a/community/document-readers/github-document-reader/pom.xml b/community/document-readers/github-document-reader/pom.xml
@@ -74,6 +74,7 @@
             <plugin>
                 <groupId>org.springframework.boot</groupId>
                 <artifactId>spring-boot-maven-plugin</artifactId>
+                <version>${spring-boot.version}</version>
             </plugin>
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>