Skip to content

Commit

Permalink
fix: better handling of broken epub
Browse files Browse the repository at this point in the history
Closes: #1844
  • Loading branch information
gotson committed Jan 23, 2025
1 parent 214f687 commit 42047cd
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ data class EpubPackage(
inline fun <R> Path.epub(block: (EpubPackage) -> R): R =
ZipFile.builder().setPath(this).use { zip ->
val opfFile = zip.getPackagePath()
val opfDoc = zip.getEntryInputStream(opfFile).use { Jsoup.parse(it, null, "", Parser.xmlParser()) }
val opfDoc = zip.getEntryInputStream(opfFile)?.use { Jsoup.parse(it, null, "", Parser.xmlParser()) } ?: throw MediaUnsupportedException("Could not open OPF resource")
val opfDir = Paths.get(opfFile).parent
block(EpubPackage(zip, opfDoc, opfDir, opfDoc.getManifest()))
}
Expand All @@ -30,9 +30,9 @@ inline fun <R> Path.epub(block: (EpubPackage) -> R): R =
*/
fun ZipFile.getPackagePath(): String =
getEntryInputStream("META-INF/container.xml")
.use { Jsoup.parse(it, null, "") }
.getElementsByTag("rootfile")
.first()
?.use { Jsoup.parse(it, null, "") }
?.getElementsByTag("rootfile")
?.first()
?.attr("full-path") ?: throw MediaUnsupportedException("META-INF/container.xml does not contain rootfile tag")

/**
Expand All @@ -41,7 +41,7 @@ fun ZipFile.getPackagePath(): String =
fun getPackageFileContent(path: Path): String? =
ZipFile.builder().setPath(path).use { zip ->
try {
zip.getEntryInputStream(zip.getPackagePath()).reader().use { it.readText() }
zip.getEntryInputStream(zip.getPackagePath())?.reader()?.use { it.readText() }
} catch (e: Exception) {
null
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,12 @@ class EpubExtractor(
val href = coverManifestItem.href
val mediaType = coverManifestItem.mediaType
val coverPath = normalizeHref(opfDir, href)
TypedBytes(
zip.getEntryBytes(coverPath),
mediaType,
)
zip.getEntryBytes(coverPath)?.let { coverBytes ->
TypedBytes(
coverBytes,
mediaType,
)
}
} else {
null
}
Expand Down Expand Up @@ -151,7 +153,7 @@ class EpubExtractor(
.map { it.attr("idref") }
.mapNotNull { idref -> epub.manifest[idref]?.href?.let { normalizeHref(epub.opfDir, it) } }
.map { pagePath ->
val doc = epub.zip.getEntryInputStream(pagePath).use { Jsoup.parse(it, null, "") }
val doc = epub.zip.getEntryInputStream(pagePath)?.use { Jsoup.parse(it, null, "") } ?: return@map emptyList()

// if a page has text over the threshold then the book is not divina compatible
if (doc.body().text().length > letterCountThreshold) return emptyList()
Expand Down Expand Up @@ -217,8 +219,8 @@ class EpubExtractor(
val readingOrder = resources.filter { it.subType == MediaFile.SubType.EPUB_PAGE }

readingOrder.forEach { mediaFile ->
val doc = epub.zip.getEntryInputStream(mediaFile.fileName).use { Jsoup.parse(it, null, "") }
if (!doc.getElementsByClass("koboSpan").isNullOrEmpty()) return true
val doc = epub.zip.getEntryInputStream(mediaFile.fileName)?.use { Jsoup.parse(it, null, "") }
if (!doc?.getElementsByClass("koboSpan").isNullOrEmpty()) return true
}
} catch (e: Exception) {
logger.warn(e) { "Error while checking if EPUB is KEPUB" }
Expand Down Expand Up @@ -257,7 +259,7 @@ class EpubExtractor(
val koboPositions =
when {
isFixedLayout -> emptyMap()
isKepub -> computePositionsFromKoboSpan(readingOrder) { filename -> epub.zip.getEntryInputStream(filename).use { it.readBytes().decodeToString() } }
isKepub -> computePositionsFromKoboSpan(readingOrder) { filename -> epub.zip.getEntryInputStream(filename).use { it?.readBytes()?.decodeToString() } }
kepubConverter.isAvailable -> {
try {
val kepub =
Expand Down Expand Up @@ -327,12 +329,12 @@ class EpubExtractor(
*/
private fun computePositionsFromKoboSpan(
readingOrder: List<MediaFile>,
resourceSupplier: (String) -> String,
): Map<String, List<Pair<String, Float>>> =
resourceSupplier: (String) -> String?,
): Map<String, List<Pair<String, Float>>?> =
readingOrder.associate { file ->
val doc = Jsoup.parse(resourceSupplier(file.fileName), Parser.htmlParser().setTrackPosition(true))
val doc = resourceSupplier(file.fileName)?.let { resource -> Jsoup.parse(resource, Parser.htmlParser().setTrackPosition(true)) }
file.fileName to
doc.select("span.koboSpan").mapNotNull { koboSpan ->
doc?.select("span.koboSpan")?.mapNotNull { koboSpan ->
val id = koboSpan.id()
if (!id.isNullOrBlank()) {
// progression is built from the position in the file of each koboSpan, divided by the file size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ import kotlin.io.path.Path
fun EpubPackage.getNavResource(): ResourceContent? =
manifest.values.firstOrNull { it.properties.contains("nav") }?.let { nav ->
val href = normalizeHref(opfDir, nav.href)
ResourceContent(Path(href), zip.getEntryBytes(href).decodeToString())
zip.getEntryBytes(href)?.decodeToString()?.let { navContent ->
ResourceContent(Path(href), navContent)
}
}

fun processNav(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ private val possibleNcxItemIds = listOf("toc", "ncx", "ncxtoc")
fun EpubPackage.getNcxResource(): ResourceContent? =
(manifest.values.firstOrNull { it.mediaType == "application/x-dtbncx+xml" } ?: manifest.values.firstOrNull { possibleNcxItemIds.contains(it.id) })?.let { ncx ->
val href = normalizeHref(opfDir, ncx.href)
ResourceContent(Path(href), zip.getEntryBytes(href).decodeToString())
zip.getEntryBytes(href)?.decodeToString()?.let { ncxContent ->
ResourceContent(Path(href), ncxContent)
}
}

fun processNcx(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ import java.nio.file.Path

/**
 * Obtains the [ZipFile] from this builder with `get()`, runs [block] on it inside `use`,
 * and returns whatever [block] returns.
 *
 * Because the call is wrapped in `use`, the zip file does not stay open after [block] finishes.
 */
inline fun <R> ZipFile.Builder.use(block: (ZipFile) -> R) = this.get().use(block)

fun ZipFile.getEntryInputStream(entryName: String): InputStream = this.getInputStream(this.getEntry(entryName))
/**
 * Opens an input stream on the entry named [entryName].
 *
 * Returns `null` when `getEntry` returns `null` for [entryName] (presumably: the archive has no
 * such entry), instead of passing a null entry on to `getInputStream`.
 * The stream is returned open, so the caller has to close it, e.g. with `use`.
 */
fun ZipFile.getEntryInputStream(entryName: String): InputStream? = this.getEntry(entryName)?.let { entry -> this.getInputStream(entry) }

fun ZipFile.getEntryBytes(entryName: String): ByteArray = this.getInputStream(this.getEntry(entryName)).use { it.readBytes() }
/**
 * Reads the full content of the entry named [entryName] into a byte array.
 *
 * Returns `null` when `getEntry` returns `null` for [entryName] (presumably: the archive has no
 * such entry). The entry's input stream is opened and consumed inside `use`, so it is not left
 * open once the bytes have been read.
 */
fun ZipFile.getEntryBytes(entryName: String): ByteArray? = this.getEntry(entryName)?.let { entry -> this.getInputStream(entry).use { it.readBytes() } }

fun getZipEntryBytes(
path: Path,
Expand Down

0 comments on commit 42047cd

Please sign in to comment.