Skip to content

Commit

Permalink
Merge pull request #348 from alibaba/0104-yuluo/pdf-tables
Browse files Browse the repository at this point in the history
feat:add pdf tables parse
  • Loading branch information
chickenlj authored Jan 7, 2025
2 parents 73294b5 + 2557b74 commit 49743df
Show file tree
Hide file tree
Showing 8 changed files with 392 additions and 0 deletions.
70 changes: 70 additions & 0 deletions community/document-parsers/document-parser-pdf-tables/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2024-2025 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Inherits from the spring-ai-alibaba POM located three directories up. -->
<parent>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba</artifactId>
<version>${revision}</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<!-- Module that provides a document parser for tables contained in PDF files. -->
<artifactId>document-parser-pdf-tables</artifactId>

<!-- Compile for Java 17 and treat source files as UTF-8. -->
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>

<!-- Core module of this project, pinned to the same version as the parent POM. -->
<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<!-- test dependencies -->
<!-- No version is given here; presumably it is managed by the parent POM (verify). -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-test</artifactId>
<scope>test</scope>
</dependency>

<!--
NOTE: despite following the "test dependencies" comment above, this dependency
declares no <scope>, so it has the default (compile) scope and is part of the
module's main classpath. Presumably it supplies the technology.tabula classes
used by the parser in this module (verify against the artifact contents).
-->
<dependency>
<groupId>fr.neolegal</groupId>
<artifactId>tabula</artifactId>
<version>1.0.12</version>
</dependency>
</dependencies>

<build>
<plugins>
<!-- The plugin version comes from the spring-boot.version property, which is not defined in this file. -->
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<version>${spring-boot.version}</version>
</plugin>
</plugins>
</build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Copyright 2024-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.alibaba.cloud.ai.parser.pdf.tables;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import com.alibaba.cloud.ai.document.DocumentParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.Table;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import org.springframework.ai.document.Document;

/**
 * Extracts tabular data from one page of a PDF file and converts every table row into
 * a {@link Document}.
 * <p>
 * Table detection is delegated to tabula-java
 * (<a href="https://github.com/tabulapdf/tabula-java">tabula-java</a>), which is used
 * here instead of plain Apache PDFBox because it recognizes table structures. Each row
 * of each detected table becomes one document whose text is the row's cell texts, each
 * followed by a {@code |} separator (for example {@code name|age|sex|}).
 * <p>
 * The result of {@link #parse(InputStream)} is a {@code List<Document>}; it is empty
 * when no table row is found on the requested page.
 *
 * @author yuluo
 * @author <a href="mailto:[email protected]">yuluo</a>
 */

public class PdfTablesParser implements DocumentParser {

	/**
	 * The 1-based page number of the PDF file to be parsed. Default value is 1.
	 */
	private final Integer page;

	/**
	 * Metadata entries copied into every document produced by this parser. Never
	 * {@code null}.
	 */
	private final Map<String, String> metadata;

	/**
	 * Creates a parser for the first page without extra metadata.
	 */
	public PdfTablesParser() {

		this(1);
	}

	/**
	 * Creates a parser for the given page without extra metadata.
	 * @param pageNumber the 1-based number of the page to parse
	 */
	public PdfTablesParser(Integer pageNumber) {

		this(pageNumber, Map.of());
	}

	/**
	 * Creates a parser for the given page that attaches the given metadata to every
	 * produced document.
	 * @param pageNumber the 1-based number of the page to parse
	 * @param metadata entries to add to each document; {@code null} is treated as empty
	 */
	public PdfTablesParser(Integer pageNumber, Map<String, String> metadata) {

		this.page = pageNumber;
		// Guard against null so that addMetadata never dereferences a null map.
		this.metadata = (metadata != null) ? metadata : Map.of();
	}

	/**
	 * Parses the configured page of the PDF supplied by the stream.
	 * @param inputStream the PDF content; it is read but not closed by this method
	 * @return one document per table row, or an empty list when no row was found
	 * @throws RuntimeException wrapping any failure raised while loading the PDF,
	 * validating the page number or extracting the tables
	 */
	@Override
	public List<Document> parse(InputStream inputStream) {

		try {
			return data2Document(parseTables(extraTableData(inputStream)));
		}
		catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Loads the PDF and extracts the tables of the configured page.
	 * @param in the PDF content
	 * @return the tables detected on the page
	 * @throws Exception if the page number is invalid for this document or the PDF
	 * cannot be loaded or processed
	 */
	protected List<Table> extraTableData(InputStream in) throws Exception {

		// Fail early with a clear message instead of an unboxing NullPointerException.
		if (this.page == null || this.page < 1) {
			throw new IllegalArgumentException("The page number must be a positive integer, but was: " + this.page);
		}

		// try-with-resources guarantees the document is closed, also on failure.
		try (PDDocument document = PDDocument.load(in)) {

			// check pdf files
			int numberOfPages = document.getNumberOfPages();
			if (numberOfPages <= 0) {

				throw new RuntimeException("No page found in the PDF file.");
			}

			if (this.page > numberOfPages) {

				throw new RuntimeException("The page number is greater than the number of pages in the PDF file.");
			}

			SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();

			// extract page by page numbers.
			Page extract = new ObjectExtractor(document).extract(this.page);

			return sea.extract(extract);
		}
	}

	/**
	 * Flattens the tables into one string per row.
	 * <p>
	 * Within a cell, carriage returns are removed and line feeds are replaced by a
	 * space; the cell texts of a row are then joined with {@code |} and a trailing
	 * {@code |} is appended.
	 * @param data the tables to flatten; {@code null} is treated as empty
	 * @return the row strings in table order, then row order
	 */
	protected List<String> parseTables(List<Table> data) {

		if (data == null || data.isEmpty()) {
			return Collections.emptyList();
		}

		return data.stream()
			.flatMap(table -> table.getRows().stream())
			.map(cells -> cells.stream()
				.map(content -> content.getText().replace("\r", "").replace("\n", " "))
				// "|" between cells plus a trailing "|"; an empty row yields just "|".
				.collect(Collectors.joining("|", "", "|")))
			.collect(Collectors.toList());
	}

	/**
	 * Wraps every row string in a {@link Document} and attaches the configured metadata.
	 * @param data the row strings
	 * @return the documents; an empty list (never {@code null}) when there are no rows
	 */
	private List<Document> data2Document(List<String> data) {

		// An empty input simply produces an empty list so callers can iterate safely.
		List<Document> documents = new ArrayList<>(data.size());

		for (String datum : data) {
			documents.add(addMetadata(new Document(datum)));
		}

		return documents;
	}

	/**
	 * Copies the configured metadata entries into the document's metadata map.
	 * @param document the document to enrich
	 * @return the same document instance
	 */
	private Document addMetadata(Document document) {

		if (!this.metadata.isEmpty()) {
			document.getMetadata().putAll(this.metadata);
		}

		return document;
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/*
* Copyright 2024-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.alibaba.cloud.ai.parser.pdf.tables;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.locationtech.jts.util.Assert;
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.PageIterator;
import technology.tabula.RectangularTextContainer;
import technology.tabula.Table;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import org.springframework.ai.document.Document;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;

/**
 * Tests for {@link PdfTablesParser} based on two PDF fixtures loaded from the classpath
 * ({@code pdf-tables.pdf} and {@code sample1.pdf}).
 * <p>
 * NOTE(review): the assertions use {@code org.locationtech.jts.util.Assert}, as imported
 * by this file, rather than JUnit's own assertion API; consider switching.
 *
 * @author yuluo
 * @author <a href="mailto:[email protected]">yuluo</a>
 */

class PdfTablesParserTests {

	private Resource resource;

	private Resource resource2;

	/**
	 * Resolves both fixtures and fails fast when either of them is missing.
	 */
	@BeforeEach
	void setUp() {

		resource = new DefaultResourceLoader().getResource("classpath:/pdf-tables.pdf");
		resource2 = new DefaultResourceLoader().getResource("classpath:/sample1.pdf");

		if (!resource.exists()) {
			throw new RuntimeException("Resource not found: " + resource);
		}
		if (!resource2.exists()) {
			throw new RuntimeException("Resource not found: " + resource2);
		}
	}

	/**
	 * Demonstrates direct tabula-java usage: prints every cell of every table on every
	 * page of the first fixture. It makes no assertions.
	 */
	@Test
	void PdfTableTest() throws IOException {

		// Both the stream and the document are closed when the block exits.
		try (InputStream in = new FileInputStream(resource.getFile());
				PDDocument document = PDDocument.load(in)) {
			SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
			PageIterator pi = new ObjectExtractor(document).extract();
			while (pi.hasNext()) {
				// iterate over the pages of the document
				Page page = pi.next();
				List<Table> tables = sea.extract(page);
				// iterate over the tables of the page
				for (Table table : tables) {
					List<List<RectangularTextContainer>> rows = table.getRows();
					// iterate over the rows of the table
					for (List<RectangularTextContainer> cells : rows) {
						// print all column-cells of the row plus linefeed
						for (RectangularTextContainer content : cells) {
							// Note: Cell.getText() uses \r to concat text chunk
							String text = content.getText().replace("\r", " ");
							System.out.print(text + "|");
						}
						System.out.println();
					}
				}
			}
		}

	}

	/**
	 * Parses page 1 of the first fixture and compares all rows except the first one
	 * with the expected table.
	 */
	@Test
	void PdfTablesParseTest() throws IOException {

		String res = """
				|name|age|sex|
				|zhangsan|20|m|
				|lisi|21|w|
				|wangwu|22|m|
				|zhangliu|23|w|
				|songqi|24|w|
				""";

		List<Document> docs = parseWith(new PdfTablesParser(), resource);

		Assert.equals(res, joinTexts(docs.subList(1, docs.size())));
	}

	/**
	 * Parses page 1 of the second fixture and compares every row with the expected
	 * output.
	 */
	@Test
	void PdfTablesParseTest2() throws IOException {

		String res = """
				Sample Date:|May 2001|
				Prepared by:|Accelio Present Applied Technology|
				Created and Tested Using:|•Accelio Present Central 5.4•Accelio Present Output Designer 5.4|
				Features Demonstrated:|•Primary bookmarks in a PDF file.•Secondary bookmarks in a PDF file.|
				""";

		List<Document> docs = parseWith(new PdfTablesParser(), resource2);

		Assert.equals(res, joinTexts(docs));

	}

	/**
	 * Parses page 3 of the second fixture and compares every row with the expected
	 * output.
	 */
	@Test
	void PdfTablesParseTest3() throws IOException {

		String res = """
				|Filename|||escription|escription||
				|||||||
				ap_bookmark.IFD|The template design.||||||
				ap_bookmark.mdf|The template targeted for PDF output.||||||
				ap_bookmark.dat|A sample data file in DAT format.||||||
				ap_bookmark.bmk|A sample bookmark file.||||||
				ap_bookmark.pdf|Sample PDF output.||||||
				ap_bookmark_doc.pdf|A document describing the sample.||||||
				|To bookmark by|Use the command line parameter||
				|Invoices|-abmkap_bookmark.bmk -abmsinvoices||
				|Type|-abmkap_bookmark.bmk -abmstype||
				|Amount|-abmkap_bookmark.bmk -abmsamount||
				""";

		List<Document> docs = parseWith(new PdfTablesParser(3), resource2);

		Assert.equals(res, joinTexts(docs));

	}

	/**
	 * Runs the parser on the given fixture and closes the file stream afterwards.
	 */
	private static List<Document> parseWith(PdfTablesParser parser, Resource pdf) throws IOException {

		try (InputStream in = new FileInputStream(pdf.getFile())) {
			return parser.parse(in);
		}
	}

	/**
	 * Concatenates the document texts, terminating each one with a line feed.
	 */
	private static String joinTexts(List<Document> docs) {

		StringBuilder sb = new StringBuilder();
		docs.forEach(doc -> sb.append(doc.getText()).append("\n"));
		return sb.toString();
	}

}
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions community/document-readers/github-document-reader/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<version>${spring-boot.version}</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
Expand Down
Loading

0 comments on commit 49743df

Please sign in to comment.