Skip to content

Commit

Permalink
Merge pull request #336 from sincerity-being/main
Browse files Browse the repository at this point in the history
upadate pdf-box parser
  • Loading branch information
chickenlj authored Jan 2, 2025
2 parents 1f5c1c8 + 98f0536 commit e1244c4
Show file tree
Hide file tree
Showing 9 changed files with 438 additions and 145 deletions.
12 changes: 2 additions & 10 deletions community/document-parsers/document-parser-apache-pdfbox/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<pdfbox.version>2.0.32</pdfbox.version>
</properties>

<dependencies>
Expand All @@ -36,15 +35,8 @@
</dependency>

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-pdf-document-reader</artifactId>
</dependency>

<!-- test dependencies -->
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package com.alibaba.cloud.ai.parser.apache.pdfbox;

import java.awt.Rectangle;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import com.alibaba.cloud.ai.document.DocumentParser;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.springframework.ai.document.Document;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.ai.reader.pdf.layout.PDFLayoutTextStripperByArea;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;

/**
* Groups the parsed PDF pages into {@link Document}s. You can group one or more pages
* into a single output document. Use {@link PdfDocumentReaderConfig} for customization
* options. The default configuration is: - pagesPerDocument = 1 - pageTopMargin = 0 -
* pageBottomMargin = 0
*
* @author HeYQ
*/
public class PagePdfDocumentParser implements DocumentParser {

public static final String METADATA_START_PAGE_NUMBER = "page_number";

public static final String METADATA_END_PAGE_NUMBER = "end_page_number";

private static final String PDF_PAGE_REGION = "pdfPageRegion";

private final Logger logger = LoggerFactory.getLogger(getClass());

private final PdfDocumentReaderConfig config;

public PagePdfDocumentParser() {
this(PdfDocumentReaderConfig.defaultConfig());
}

public PagePdfDocumentParser(PdfDocumentReaderConfig config) {
this.config = config;
}

@Override
public List<Document> parse(InputStream inputStream) {

List<Document> readDocuments = new ArrayList<>();
try {
var pdfTextStripper = new PDFLayoutTextStripperByArea();

int pageNumber = 0;
int pagesPerDocument = 0;
int startPageNumber = pageNumber;

List<String> pageTextGroupList = new ArrayList<>();
PDFParser pdfParser = new PDFParser(new org.apache.pdfbox.io.RandomAccessReadBuffer(inputStream));
PDDocument document = pdfParser.parse();

int totalPages = document.getDocumentCatalog().getPages().getCount();
// if less than 10
int logFrequency = totalPages > 10 ? totalPages / 10 : 1;
// pages, print
// each iteration
int counter = 0;

PDPage lastPage = document.getDocumentCatalog().getPages().iterator().next();
for (PDPage page : document.getDocumentCatalog().getPages()) {
lastPage = page;
if (counter % logFrequency == 0 && counter / logFrequency < 10) {
logger.info("Processing PDF page: {}", (counter + 1));
}
counter++;

pagesPerDocument++;

if (this.config.pagesPerDocument != PdfDocumentReaderConfig.ALL_PAGES
&& pagesPerDocument >= this.config.pagesPerDocument) {
pagesPerDocument = 0;

var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining());
if (StringUtils.hasText(aggregatedPageTextGroup)) {
readDocuments.add(toDocument(aggregatedPageTextGroup, startPageNumber, pageNumber));
}
pageTextGroupList.clear();

startPageNumber = pageNumber + 1;
}
int x0 = (int) page.getMediaBox().getLowerLeftX();
int xW = (int) page.getMediaBox().getWidth();

int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin;
int yW = (int) page.getMediaBox().getHeight()
- (this.config.pageTopMargin + this.config.pageBottomMargin);

pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW));
pdfTextStripper.extractRegions(page);
var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION);

if (StringUtils.hasText(pageText)) {

pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber);

pageTextGroupList.add(pageText);
}
pageNumber++;
pdfTextStripper.removeRegion(PDF_PAGE_REGION);
}
if (!CollectionUtils.isEmpty(pageTextGroupList)) {
readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), startPageNumber,
pageNumber));
}
logger.info("Processing {} pages", totalPages);
return readDocuments;

}
catch (IOException e) {
throw new RuntimeException(e);
}
}

protected Document toDocument(String docText, int startPageNumber, int endPageNumber) {
Document doc = new Document(docText);
doc.getMetadata().put(METADATA_START_PAGE_NUMBER, startPageNumber);
if (startPageNumber != endPageNumber) {
doc.getMetadata().put(METADATA_END_PAGE_NUMBER, endPageNumber);
}
return doc;
}

}
Loading

0 comments on commit e1244c4

Please sign in to comment.