Skip to content

Commit

Permalink
Merge pull request #5 from gotson/pdf
Browse files Browse the repository at this point in the history
PDF support
  • Loading branch information
gotson authored Aug 29, 2019
2 parents e1584a9 + cef03ae commit 3dcd217
Show file tree
Hide file tree
Showing 6 changed files with 229 additions and 5 deletions.
1 change: 1 addition & 0 deletions komga/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ dependencies {

implementation("org.apache.tika:tika-core:1.22")
implementation("com.github.junrar:junrar:4.0.0")
implementation("org.apache.pdfbox:pdfbox:2.0.16")
implementation("net.grey-panther:natural-comparator:1.1")

implementation("net.coobird:thumbnailator:0.4.8")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import org.gotson.komga.domain.model.BookMetadata
import org.gotson.komga.domain.model.Status
import org.gotson.komga.domain.model.path
import org.gotson.komga.infrastructure.archive.ContentDetector
import org.gotson.komga.infrastructure.archive.PdfExtractor
import org.gotson.komga.infrastructure.archive.RarExtractor
import org.gotson.komga.infrastructure.archive.ZipExtractor
import org.springframework.stereotype.Service
Expand All @@ -20,12 +21,14 @@ private val logger = KotlinLogging.logger {}
class BookParser(
private val contentDetector: ContentDetector,
private val zipExtractor: ZipExtractor,
private val rarExtractor: RarExtractor
private val rarExtractor: RarExtractor,
private val pdfExtractor: PdfExtractor
) {

val supportedMediaTypes = mapOf(
"application/zip" to zipExtractor,
"application/x-rar-compressed" to rarExtractor
"application/x-rar-compressed" to rarExtractor,
"application/pdf" to pdfExtractor
)

private val natSortComparator: Comparator<String> = CaseInsensitiveSimpleNaturalComparator.getInstance()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ private val logger = KotlinLogging.logger {}
@Service
class FileSystemScanner {

val supportedExtensions = listOf("cbz", "zip", "cbr", "rar")
val supportedExtensions = listOf("cbz", "zip", "cbr", "rar", "pdf")

fun scanRootFolder(root: Path): List<Serie> {
logger.info { "Scanning folder: $root" }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
package org.gotson.komga.infrastructure.archive

import mu.KotlinLogging
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine
import org.apache.pdfbox.cos.COSName
import org.apache.pdfbox.io.IOUtils
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.PDPage
import org.apache.pdfbox.pdmodel.font.PDFont
import org.apache.pdfbox.pdmodel.graphics.color.PDColor
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB
import org.apache.pdfbox.pdmodel.graphics.color.PDPattern
import org.apache.pdfbox.pdmodel.graphics.image.PDImage
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern
import org.apache.pdfbox.util.Matrix
import org.apache.pdfbox.util.Vector
import org.apache.tika.config.TikaConfig
import org.gotson.komga.domain.model.BookPage
import org.springframework.stereotype.Service
import java.awt.geom.Point2D
import java.io.ByteArrayOutputStream
import java.net.URLConnection
import java.nio.file.Files
import java.nio.file.Path
import javax.imageio.ImageIO

private val logger = KotlinLogging.logger {}

/**
 * Exposes the images embedded in a PDF document as book pages.
 *
 * Every PDF page is walked with a PDFBox graphics engine that captures the image drawn on it.
 * A page is identified by its zero-based index in the document (as a decimal string).
 * When a page draws several images, the last one processed is the one that is kept.
 *
 * Largely inspired by https://github.com/apache/pdfbox/blob/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java
 */
@Service
class PdfExtractor(
  private val tika: TikaConfig
) : ArchiveExtractor() {

  // Stop filters handed to PDImage.createInputStream so the raw JPEG bytes are returned undecoded.
  private val jpegStopFilters = listOf(
    COSName.DCT_DECODE.name,
    COSName.DCT_DECODE_ABBREVIATION.name
  )

  // Same idea for JPEG2000 streams.
  private val jpeg2000StopFilters = listOf(COSName.JPX_DECODE.name)

  /**
   * Lists the pages of the PDF at [path] together with the media type of the image found on each.
   *
   * Pages on which no image could be found are skipped (with a warning) instead of failing the
   * whole document; the names of the remaining pages still match their index in the PDF.
   *
   * @param path location of the PDF file
   * @return one [BookPage] per page holding an image, named after the page index
   */
  override fun getPagesList(path: Path): List<BookPage> =
    withDocument(path) { pdf ->
      pdf.pages.mapIndexedNotNull { index, page ->
        val extractor = ImageGraphicsEngine(page, false)
        extractor.run()
        val mediaType = extractor.mediaType
        if (mediaType == null) {
          logger.warn { "No extractable image on page $index of $path, skipping page" }
          null
        } else {
          BookPage(index.toString(), mediaType)
        }
      }
    }

  /**
   * Returns the bytes of the image found on one page of the PDF at [path].
   *
   * @param path location of the PDF file
   * @param entryName zero-based page index, as produced by [getPagesList]
   * @return the encoded image bytes
   * @throws NumberFormatException if [entryName] is not an integer
   * @throws IllegalStateException if no image is drawn on the requested page
   */
  override fun getPageStream(path: Path, entryName: String): ByteArray =
    withDocument(path) { pdf ->
      val extractor = ImageGraphicsEngine(pdf.pages[entryName.toInt()], true)
      extractor.run()
      checkNotNull(extractor.buffer) { "No image found on page $entryName of $path" }
    }

  /**
   * Opens the PDF at [path], runs [block] on it and closes both the document and the
   * underlying file stream afterwards, whether [block] succeeds or throws.
   */
  private fun <T> withDocument(path: Path, block: (PDDocument) -> T): T =
    Files.newInputStream(path).use { input ->
      PDDocument.load(input).use(block)
    }

  /**
   * Graphics engine that records the image drawn on a single page.
   *
   * @param page the page to process
   * @property extractImage when false only [mediaType] is of interest and no image bytes are
   * written; when true the encoded image ends up in [buffer]
   */
  private inner class ImageGraphicsEngine(
    page: PDPage,
    val extractImage: Boolean
  ) : PDFGraphicsStreamEngine(page) {

    /** Bytes of the last image drawn, or null if no image was drawn. */
    var buffer: ByteArray? = null
      private set

    /** Media type of the last decodable image drawn, or null if there was none. */
    var mediaType: String? = null
      private set

    /** Processes the page content, then the soft masks referenced by its extended graphics states. */
    fun run() {
      val currentPage = getPage()
      processPage(currentPage)
      // a page is not required to carry a resource dictionary
      val resources = currentPage.resources ?: return
      for (name in resources.extGStateNames) {
        // a name may be listed without a usable state, and a state may have no soft mask or group
        resources.getExtGState(name)?.softMask?.group?.let { processSoftMask(it) }
      }
    }

    // find out if it is a tiling pattern, then process that one
    private fun processColor(color: PDColor) {
      val pattern = color.colorSpace as? PDPattern ?: return
      val tilingPattern = pattern.getPattern(color) as? PDTilingPattern ?: return
      processTilingPattern(tilingPattern, null, null)
    }

    override fun drawImage(pdImage: PDImage) {
      if (pdImage is PDImageXObject && pdImage.isStencil) {
        processColor(graphicsState.nonStrokingColor)
      }
      writeToBuffer(pdImage, extractImage)
    }

    override fun fillAndStrokePath(windingRule: Int) {
      processColor(graphicsState.nonStrokingColor)
    }

    override fun fillPath(windingRule: Int) {
      processColor(graphicsState.nonStrokingColor)
    }

    override fun strokePath() {
      // stroking paints with the stroking color, consistently with showGlyph below
      processColor(graphicsState.strokingColor)
    }

    // unicode is nullable: PDFBox passes null when the font provides no Unicode mapping for the glyph
    override fun showGlyph(textRenderingMatrix: Matrix,
                           font: PDFont,
                           code: Int,
                           unicode: String?,
                           displacement: Vector) {
      val renderingMode = graphicsState.textState.renderingMode
      if (renderingMode.isFill) {
        processColor(graphicsState.nonStrokingColor)
      }
      if (renderingMode.isStroke) {
        processColor(graphicsState.strokingColor)
      }
    }

    // Path construction and clipping are irrelevant to image extraction: deliberately no-ops.
    override fun shadingFill(shadingName: COSName?) {}
    override fun clip(windingRule: Int) {}
    override fun endPath() {}
    override fun closePath() {}
    override fun getCurrentPoint(): Point2D = Point2D.Float(0F, 0F)
    override fun moveTo(x: Float, y: Float) {}
    override fun lineTo(x: Float, y: Float) {}
    override fun appendRectangle(p0: Point2D?, p1: Point2D?, p2: Point2D?, p3: Point2D?) {}
    override fun curveTo(x1: Float, y1: Float, x2: Float, y2: Float, x3: Float, y3: Float) {}

    /**
     * Records the media type of [pdImage] and, when [extractImage] is true, its encoded bytes.
     *
     * JPEG and JPEG2000 images in a Gray or RGB colorspace without masks are copied verbatim
     * from the PDF; everything else is re-encoded through ImageIO.
     */
    private fun writeToBuffer(pdImage: PDImage, extractImage: Boolean) {
      val rawSuffix: String? = pdImage.suffix
      logger.trace { "PDF image suffix: $rawSuffix" }
      val suffix = when (rawSuffix) {
        null, "jb2" -> "png"
        // use jp2 suffix for file because jpx not known by windows
        "jpx" -> "jp2"
        else -> rawSuffix
      }
      logger.trace { "PDF image computed suffix: $suffix" }

      ByteArrayOutputStream().use { out ->
        val image = pdImage.image
        if (image != null) {
          // true when the embedded stream can be handed out untouched
          fun canCopyRaw(): Boolean {
            val colorSpaceName = pdImage.colorSpace.name
            return !hasMasks(pdImage) &&
              (PDDeviceGray.INSTANCE.name == colorSpaceName || PDDeviceRGB.INSTANCE.name == colorSpaceName)
          }

          // copies the still-encoded image stream, closing it even if the copy fails
          fun copyRaw(stopFilters: List<String>) {
            pdImage.createInputStream(stopFilters).use { data -> IOUtils.copy(data, out) }
          }

          // re-encodes the decoded image; ImageIO.write returns false when no writer handles the format
          fun convert(formatName: String) {
            if (!ImageIO.write(image, formatName, out)) {
              logger.warn { "No ImageIO writer found for format '$formatName', extracted page will be empty" }
            }
          }

          when (suffix) {
            "jpg" -> {
              mediaType = "image/jpeg"
              if (extractImage) {
                if (canCopyRaw()) {
                  logger.debug { "RGB or Gray colorspace, get the unmodified JPEG stream" }
                  copyRaw(jpegStopFilters)
                } else {
                  // for CMYK and other "unusual" colorspaces, the JPEG will be converted
                  logger.debug { "CMYK or other colorspace, converting to JPEG" }
                  convert(suffix)
                }
              }
            }
            "jp2" -> {
              mediaType = "image/jp2"
              if (extractImage) {
                if (canCopyRaw()) {
                  logger.debug { "RGB or Gray colorspace, get the unmodified JPEG2000 stream" }
                  copyRaw(jpeg2000StopFilters)
                } else {
                  // for CMYK and other "unusual" colorspaces, the image will be converted
                  logger.debug { "CMYK or other colorspace, converting to JPEG2000" }
                  convert("jpeg2000")
                }
              }
            }
            else -> {
              mediaType = URLConnection.guessContentTypeFromName("file.$suffix") ?: "application/octet-stream"
              if (extractImage) {
                logger.debug { "Converting to $suffix" }
                convert(suffix)
              }
            }
          }
        }
        // set even when nothing was written, so a drawn-but-undecodable image yields an empty array
        buffer = out.toByteArray()
      }
    }

  }

  /** Tells whether [pdImage] is an image XObject carrying a mask or a soft mask. */
  private fun hasMasks(pdImage: PDImage): Boolean =
    pdImage is PDImageXObject && (pdImage.mask != null || pdImage.softMask != null)
}
5 changes: 4 additions & 1 deletion komga/src/main/resources/application-dev.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
#logging.level.org.springframework.security.web.FilterChainProxy: DEBUG
komga:
root-folder: D:\\files\\issues
root-folder: D:\\files\\pdf
threads:
parse: 1
# root-folder-scan-cron: "*/5 * * * * ?"
spring:
profiles:
include: flyway
logging:
level:
org.gotson.komga: debug
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import org.gotson.komga.domain.model.makeBook
import org.gotson.komga.domain.model.makeBookPage
import org.gotson.komga.domain.model.path
import org.gotson.komga.infrastructure.archive.ContentDetector
import org.gotson.komga.infrastructure.archive.PdfExtractor
import org.gotson.komga.infrastructure.archive.RarExtractor
import org.gotson.komga.infrastructure.archive.ZipExtractor
import org.junit.jupiter.api.Test
Expand All @@ -16,8 +17,9 @@ class BookParserTest {
private val mockContent = mockk<ContentDetector>()
private val mockZip = mockk<ZipExtractor>()
private val mockRar = mockk<RarExtractor>()
private val mockPDf = mockk<PdfExtractor>()

private val bookParser = BookParser(mockContent, mockZip, mockRar)
private val bookParser = BookParser(mockContent, mockZip, mockRar, mockPDf)

@Test
fun `given book with unordered pages when parsing then thumbnail should always be the first in natural order`() {
Expand Down

0 comments on commit 3dcd217

Please sign in to comment.