diff --git a/community/document-parsers/document-parser-pdf-tables/pom.xml b/community/document-parsers/document-parser-pdf-tables/pom.xml new file mode 100644 index 00000000..80402948 --- /dev/null +++ b/community/document-parsers/document-parser-pdf-tables/pom.xml @@ -0,0 +1,70 @@ + + + + + 4.0.0 + + + com.alibaba.cloud.ai + spring-ai-alibaba + ${revision} + ../../../pom.xml + + + document-parser-pdf-tables + + + 17 + 17 + UTF-8 + + + + + + com.alibaba.cloud.ai + spring-ai-alibaba-core + ${project.parent.version} + + + + + org.springframework.ai + spring-ai-test + test + + + + fr.neolegal + tabula + 1.0.12 + + + + + + + org.springframework.boot + spring-boot-maven-plugin + ${spring-boot.version} + + + + + diff --git a/community/document-parsers/document-parser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java b/community/document-parsers/document-parser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java new file mode 100644 index 00000000..3731573c --- /dev/null +++ b/community/document-parsers/document-parser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java @@ -0,0 +1,153 @@ +/* + * Copyright 2024-2025 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.cloud.ai.parser.pdf.tables; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import com.alibaba.cloud.ai.document.DocumentParser; +import org.apache.pdfbox.pdmodel.PDDocument; +import technology.tabula.ObjectExtractor; +import technology.tabula.Page; +import technology.tabula.Table; +import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; + +import org.springframework.ai.document.Document; + +/** + * The purpose of this class is to extract tabular data from PDF files, compared to Apache + * Pdfbox. Tabula is more recognizable. tabula-java: + * tabula-java return + * List {@link Document} + * + * @author yuluo + * @author yuluo + */ + +public class PdfTablesParser implements DocumentParser { + + /** + * The page number of the PDF file to be parsed. Default value is 1. + */ + private final Integer page; + + /** + * The metadata of the PDF file to be parsed. + */ + private final Map metadata; + + public PdfTablesParser() { + + this(1); + } + + public PdfTablesParser(Integer pageNumber) { + + this(pageNumber, Map.of()); + } + + public PdfTablesParser(Integer pageNumber, Map metadata) { + + this.page = pageNumber; + this.metadata = metadata; + } + + @Override + public List parse(InputStream inputStream) { + + try { + return data2Document(parseTables(extraTableData(inputStream))); + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + + protected List extraTableData(InputStream in) throws Exception { + + PDDocument document = PDDocument.load(in); + + // check pdf files + int numberOfPages = document.getNumberOfPages(); + if (numberOfPages < 0) { + + throw new RuntimeException("No page found in the PDF file."); + } + + if (page > numberOfPages) { + + throw new RuntimeException("The page number is greater than the number of pages in the PDF file."); + } + + SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); + + // extract page by page numbers. + Page extract = new ObjectExtractor(document).extract(this.page); + + return sea.extract(extract); + } + + protected List parseTables(List
data) { + + if (data.isEmpty()) { + return Collections.emptyList(); + } + + return data.stream() + .flatMap(table -> table.getRows() + .stream() + .map(cells -> cells.stream() + .map(content -> content.getText().replace("\r", "").replace("\n", " ")) + .reduce((first, second) -> first + "|" + second) + .orElse("") + "|")) + .collect(Collectors.toList()); + } + + private List data2Document(List data) { + + List documents = new ArrayList<>(); + + if (data.isEmpty()) { + return null; + } + + for (String datum : data) { + Document doc = new Document(datum); + documents.add(addMetadata(doc)); + } + + return documents; + } + + private Document addMetadata(Document document) { + + if (metadata.isEmpty()) { + return document; + } + + for (Map.Entry entry : metadata.entrySet()) { + document.getMetadata().put(entry.getKey(), entry.getValue()); + } + + return document; + } + +} diff --git a/community/document-parsers/document-parser-pdf-tables/src/test/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParserTests.java b/community/document-parsers/document-parser-pdf-tables/src/test/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParserTests.java new file mode 100644 index 00000000..cc20bc4c --- /dev/null +++ b/community/document-parsers/document-parser-pdf-tables/src/test/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParserTests.java @@ -0,0 +1,166 @@ +/* + * Copyright 2024-2025 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.cloud.ai.parser.pdf.tables; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.locationtech.jts.util.Assert; +import technology.tabula.ObjectExtractor; +import technology.tabula.Page; +import technology.tabula.PageIterator; +import technology.tabula.RectangularTextContainer; +import technology.tabula.Table; +import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; + +import org.springframework.ai.document.Document; +import org.springframework.core.io.DefaultResourceLoader; +import org.springframework.core.io.Resource; + +/** + * @author yuluo + * @author yuluo + */ + +class PdfTablesParserTests { + + private Resource resource; + + private Resource resource2; + + @BeforeEach + void setUp() { + + resource = new DefaultResourceLoader().getResource("classpath:/pdf-tables.pdf"); + resource2 = new DefaultResourceLoader().getResource("classpath:/sample1.pdf"); + + if (!resource.exists()) { + throw new RuntimeException("Resource not found: " + resource); + } + } + + /** + * tabula-java use. + */ + @Test + void PdfTableTest() throws IOException { + + InputStream in = new FileInputStream(resource.getFile()); + try (PDDocument document = PDDocument.load(in)) { + SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); + PageIterator pi = new ObjectExtractor(document).extract(); + while (pi.hasNext()) { + // iterate over the pages of the document + Page page = pi.next(); + List
table = sea.extract(page); + // iterate over the tables of the page + for (Table tables : table) { + List> rows = tables.getRows(); + // iterate over the rows of the table + for (List cells : rows) { + // print all column-cells of the row plus linefeed + for (RectangularTextContainer content : cells) { + // Note: Cell.getText() uses \r to concat text chunk + String text = content.getText().replace("\r", " "); + System.out.print(text + "|"); + } + System.out.println(); + } + } + } + } + + } + + @Test + void PdfTablesParseTest() throws IOException { + + String res = """ + |name|age|sex| + |zhangsan|20|m| + |lisi|21|w| + |wangwu|22|m| + |zhangliu|23|w| + |songqi|24|w| + """; + + InputStream in = new FileInputStream(resource.getFile()); + PdfTablesParser pdfTablesParser = new PdfTablesParser(); + List docs = pdfTablesParser.parse(in); + + StringBuilder sb = new StringBuilder(); + docs.subList(1, docs.size()).forEach(doc -> sb.append(doc.getText() + "\n")); + + Assert.equals(res, sb.toString()); + } + + @Test + void PdfTablesParseTest2() throws IOException { + + String res = """ + Sample Date:|May 2001| + Prepared by:|Accelio Present Applied Technology| + Created and Tested Using:|•Accelio Present Central 5.4•Accelio Present Output Designer 5.4| + Features Demonstrated:|•Primary bookmarks in a PDF file.•Secondary bookmarks in a PDF file.| + """; + + InputStream in = new FileInputStream(resource2.getFile()); + PdfTablesParser pdfTablesParser = new PdfTablesParser(); + List docs = pdfTablesParser.parse(in); + + StringBuilder sb = new StringBuilder(); + docs.forEach(doc -> sb.append(doc.getText() + "\n")); + + Assert.equals(res, sb.toString()); + + } + + @Test + void PdfTablesParseTest3() throws IOException { + + String res = """ + |Filename|||escription|escription|| + ||||||| + ap_bookmark.IFD|The template design.|||||| + ap_bookmark.mdf|The template targeted for PDF output.|||||| + ap_bookmark.dat|A sample data file in DAT format.|||||| + ap_bookmark.bmk|A sample bookmark file.|||||| + ap_bookmark.pdf|Sample PDF output.|||||| + ap_bookmark_doc.pdf|A document describing the sample.|||||| + |To bookmark by|Use the command line parameter|| + |Invoices|-abmkap_bookmark.bmk -abmsinvoices|| + |Type|-abmkap_bookmark.bmk -abmstype|| + |Amount|-abmkap_bookmark.bmk -abmsamount|| + """; + + InputStream in = new FileInputStream(resource2.getFile()); + PdfTablesParser pdfTablesParser = new PdfTablesParser(3); + List docs = pdfTablesParser.parse(in); + + StringBuilder sb = new StringBuilder(); + docs.forEach(doc -> sb.append(doc.getText() + "\n")); + + Assert.equals(res, sb.toString()); + + } + +} diff --git a/community/document-parsers/document-parser-pdf-tables/src/test/resources/pdf-tables.pdf b/community/document-parsers/document-parser-pdf-tables/src/test/resources/pdf-tables.pdf new file mode 100644 index 00000000..830880f2 Binary files /dev/null and b/community/document-parsers/document-parser-pdf-tables/src/test/resources/pdf-tables.pdf differ diff --git a/community/document-parsers/document-parser-pdf-tables/src/test/resources/sample1.pdf b/community/document-parsers/document-parser-pdf-tables/src/test/resources/sample1.pdf new file mode 100644 index 00000000..8efd05c3 Binary files /dev/null and b/community/document-parsers/document-parser-pdf-tables/src/test/resources/sample1.pdf differ diff --git a/community/document-readers/github-document-reader/pom.xml b/community/document-readers/github-document-reader/pom.xml index 3b9e3fba..759fddd3 100644 --- a/community/document-readers/github-document-reader/pom.xml +++ b/community/document-readers/github-document-reader/pom.xml @@ -74,6 +74,7 @@ org.springframework.boot spring-boot-maven-plugin + ${spring-boot.version} org.apache.maven.plugins diff --git a/community/document-readers/yuque-document-reader/pom.xml b/community/document-readers/yuque-document-reader/pom.xml index 0fc0388a..8d978ab7 100644 --- a/community/document-readers/yuque-document-reader/pom.xml +++ b/community/document-readers/yuque-document-reader/pom.xml @@ -74,6 +74,7 @@ org.springframework.boot spring-boot-maven-plugin + ${spring-boot.version} org.apache.maven.plugins diff --git a/pom.xml b/pom.xml index b14bad14..d87a0b22 100644 --- a/pom.xml +++ b/pom.xml @@ -69,6 +69,7 @@ community/document-parsers/document-parser-apache-pdfbox community/document-parsers/document-parser-markdown community/document-parsers/document-parser-tika + community/document-parsers/document-parser-pdf-tables