Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve resilience against malformed or corrupt documents #152

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
using PdfSharp.Pdf.IO;
using PdfSharp.Pdf.Security;
using PdfSharp.Pdf.Internal;
using System.Text.RegularExpressions;
using System.Text;
using PdfSharp.Logging;
using Microsoft.Extensions.Logging;

namespace PdfSharp.Pdf.Advanced
{
Expand Down Expand Up @@ -215,6 +219,142 @@ internal void Finish()
_document.IrefTable.IsUnderConstruction = false;
}

/// <summary>
/// Attempts to rebuild the trailer and iref-table if the original ones seem to be corrupt.
/// The whole stream is scanned for object headers ("n g obj") and for the last "trailer &lt;&lt;"
/// keyword. The cross-reference table is rebuilt from the object headers found, and the trailer
/// dictionary is either re-read from the file or synthesized by searching for the catalog.
/// </summary>
/// <param name="document">The document that receives the rebuilt cross-reference table. Its lexer must be set.</param>
/// <param name="stream">The PDF stream. Its Position and Length are used, so it has to be seekable.</param>
/// <param name="parser">The parser used to read the trailer dictionary and the indirect objects.</param>
/// <returns>The rebuilt trailer containing at least the /Root and /Size entries.</returns>
/// <exception cref="ArgumentNullException">An argument is null.</exception>
/// <exception cref="InvalidOperationException">The document has no lexer.</exception>
/// <exception cref="PdfReaderException">The trailer or the cross-reference table cannot be rebuilt.</exception>
internal static PdfTrailer Rebuild(PdfDocument document, Stream stream, Parser parser)
{
    PdfSharpLogHost.PdfReadingLogger.LogInformation("Attempt to rebuild trailer...");
#if NET6_0_OR_GREATER
    ArgumentNullException.ThrowIfNull(document, nameof(document));
    ArgumentNullException.ThrowIfNull(stream, nameof(stream));
    ArgumentNullException.ThrowIfNull(parser, nameof(parser));
#else
    if (document == null)
        throw new ArgumentNullException(nameof(document));
    if (stream == null)
        throw new ArgumentNullException(nameof(stream));
    if (parser == null)
        throw new ArgumentNullException(nameof(parser));
#endif
    if (document._lexer == null)
        throw new InvalidOperationException("Document must have a lexer set");

    var irefTable = new PdfCrossReferenceTable(document);
    try
    {
        // Scan the whole file and collect object-ids and the position of the last trailer.
        var trailerStart = ScanForObjectsAndTrailer(stream, irefTable);

        document.IrefTable = irefTable;
        irefTable.IsUnderConstruction = true;

        var allRefs = irefTable.AllReferences;
        var trailer = new PdfTrailer(document);

        if (trailerStart > 0L)
        {
            // Read the entries of the trailer dictionary.
            stream.Position = trailerStart;
            document._lexer.Position = trailerStart;
            parser.ReadSymbol(Symbol.Trailer);
            parser.ReadSymbol(Symbol.BeginDictionary);
            parser.ReadDictionary(trailer, false);
            // TODO: what about /Prev entry ? these may also be corrupt (need a file to verify)
            // in theory, this can be ignored, because we already have read ALL objects
        }
        if (!trailer.Elements.ContainsKey(Keys.Root))
        {
            // Cases:
            // 1. no trailer found (maybe cut off at end of file)
            // 2. trailer is corrupt (found one with just a single /Size entry, /Root was missing)
            // Read all found objects searching for the catalog (/Root entry).
            foreach (var objRef in allRefs)
            {
                parser.MoveToObject(objRef.ObjectID);
                var obj = parser.ReadIndirectObject(objRef);
                if (obj is PdfDictionary dict)
                {
                    var type = dict.Elements.GetName(PdfCatalog.Keys.Type);
                    // Ensure we use a valid catalog (we may find multiple).
                    if (type == "/Catalog" && dict.Elements.ContainsKey(PdfCatalog.Keys.Pages))
                    {
                        trailer.Elements[Keys.Root] = dict.Reference;
                    }
                }
            }
        }
        // Still no catalog? Then throw.
        if (!trailer.Elements.ContainsKey(Keys.Root))
            throw new PdfReaderException(
                "Unable to rebuild trailer and iref-table, catalog dictionary not found. The pdf is corrupt");

        // /Size is the largest object number plus one. Computed with a plain loop so that an
        // empty table yields a clear error instead of an exception from an empty sequence.
        var anyObjectFound = false;
        var largestObjectNumber = 0;
        foreach (var objRef in allRefs)
        {
            anyObjectFound = true;
            largestObjectNumber = Math.Max(largestObjectNumber, objRef.ObjectID.ObjectNumber);
        }
        if (!anyObjectFound)
            throw new PdfReaderException(
                "Unable to rebuild trailer and iref-table, no objects found. The pdf is corrupt");

        trailer.Elements.SetInteger(Keys.Size, largestObjectNumber + 1);
        PdfSharpLogHost.PdfReadingLogger.LogInformation("Trailer was rebuild with {count} found objects", irefTable.AllObjectIDs.Length);
        return trailer;
    }
    catch (Exception ex) when (ex is not PdfReaderException)
    {
        // Reader exceptions already carry a specific message and are passed through unchanged;
        // everything else is wrapped so that callers only have to deal with PdfReaderException.
        throw new PdfReaderException("Unable to rebuild trailer and iref-table, pdf is corrupt", ex);
    }
}

/// <summary>
/// Scans the stream from the beginning in chunks, adds a reference to <paramref name="irefTable"/>
/// for every object header found and returns the position of the last trailer keyword.
/// </summary>
/// <returns>The position of the last "trailer &lt;&lt;" found, or 0 if there is none.</returns>
static long ScanForObjectsAndTrailer(Stream stream, PdfCrossReferenceTable irefTable)
{
    // TODO: for performance reasons, we would normally use static properties for the Regex
    // (and Source-Generators for newer .Net Versions !)
    // but since this should be a one-time operation, we declare them inline for clarity)

    // Start of an object, e.g. "1 0 obj".
    var rxObjectStart = new Regex("\\b(?<num>\\d+)\\s+(?<gen>\\d+)\\s+obj\\b");
    // Start of a trailer, e.g. "trailer <<".
    var rxTrailerStart = new Regex("\\btrailer\\s*<<");

    // Number of bytes re-read at a chunk boundary so that a declaration split across two
    // chunks is seen as a whole in the second one.
    const int chunkOverlap = 24;

    var trailerStart = 0L;
    var buffer = new byte[4096];
    var streamLength = stream.Length;
    stream.Position = 0;
    while (stream.Position < streamLength)
    {
        var bufStart = stream.Position;
        var readLength = stream.Read(buffer, 0, buffer.Length);
        if (readLength <= 0)
            break; // Unexpected end of data; never loop without progress.

        // ASCII decoding maps every byte to exactly one char, so match indexes are byte offsets.
        var readString = Encoding.ASCII.GetString(buffer, 0, readLength);

        // Position directly behind the last match of this chunk, or -1 if nothing was found.
        var resumeAt = -1L;

        // Search for objects.
        foreach (Match match in rxObjectStart.Matches(readString))
        {
            // Digit runs inside binary data may not fit into an int; such matches cannot be
            // real object headers and are skipped instead of aborting the whole rebuild.
            if (!int.TryParse(match.Groups["num"].Value, System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture, out var objNumber)
                || !int.TryParse(match.Groups["gen"].Value, System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture, out var generationNumber)
                || objNumber < 1) // Object numbers of indirect objects are positive.
                continue;

            // Keep the full 64-bit offset; do not truncate it to int.
            var objectPosition = bufStart + match.Index;
            var objId = new PdfObjectID(objNumber, generationNumber);
            var existingObj = irefTable[objId];
            if (existingObj != null)
                // Always use the object found later in the file.
                // This handles newer objects written by incremental updates.
                existingObj.Position = objectPosition;
            else
                irefTable.Add(new PdfReference(objId, objectPosition));
            resumeAt = objectPosition + match.Length;
        }

        // Search for the trailer.
        foreach (Match match in rxTrailerStart.Matches(readString))
        {
            // If the trailer is found multiple times, the last one wins (conforms to spec).
            trailerStart = bufStart + match.Index;
            resumeAt = Math.Max(resumeAt, trailerStart + match.Length);
        }

        var chunkEnd = bufStart + readLength;
        if (chunkEnd >= streamLength)
            break; // The chunk reached the end of the file; nothing left to scan.

        // Continue directly behind the last match, or re-read the tail of this chunk so that
        // a declaration split at the chunk boundary is not missed.
        var nextStart = resumeAt >= 0 ? resumeAt : chunkEnd - chunkOverlap;
        if (nextStart <= bufStart)
            nextStart = chunkEnd; // A very short read must not move the scan backwards.
        stream.Position = nextStart;
    }
    return trailerStart;
}


/// <summary>
/// Predefined keys of this dictionary.
/// </summary>
Expand Down
53 changes: 38 additions & 15 deletions src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,9 @@ public Symbol ScanNextToken(bool testForObjectReference)
return Symbol = Symbol.Eof;

default:
Debug.Assert(!Char.IsLetter(ch), "PDFsharp did something wrong. See code below.");
ParserDiagnostics.HandleUnexpectedCharacter(ch, DumpNeighborhoodOfPosition());
return Symbol = Symbol.None;
// just skip over unexpected character
ScanNextChar(true);
goto TryAgain;
}
}

Expand Down Expand Up @@ -855,20 +855,43 @@ public int DetermineStreamLength(SizeType start, int searchLength, SuppressExcep
if (start == 144848)
_ = sizeof(int);
#endif
var rawString = RandomReadRawString(start, searchLength);

// When we come here, we have either an invalid or no \Length entry.
// Best we can do is to consider all byte before 'endstream' are part of the stream content.
// In case the stream is zipped, this is no problem. In case the stream is encrypted
// it would be a serious problem. But we wait if this really happens.
int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal);
if (idxEndStream == -1)
var firstStart = start;
while (start < _pdfLength)
{
SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength());
return -1;
var rawString = RandomReadRawString(start, Math.Min(searchLength, (int)(_pdfLength - start)));

// When we come here, we have either an invalid or no \Length entry.
// Best we can do is to consider all byte before 'endstream' are part of the stream content.
// In case the stream is zipped, this is no problem. In case the stream is encrypted
// it would be a serious problem. But we wait if this really happens.
int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal);
if (idxEndStream >= 0)
{
// The spec says (7.3.8, Stream Objects):
// "There should be an end-of-line marker after the data and before endstream;
// this marker shall not be included in the stream length"

// check bytes before the keyword for possible CRLF or LF or CR
// (CR alone SHALL NOT be used but check it anyway)
// sanity check, should always pass since we SHOULD have read the "stream" keyword before we came here
if (start + idxEndStream >= 2)
{
_pdfStream.Position = start + idxEndStream - 2;
var b1 = _pdfStream.ReadByte();
var b2 = _pdfStream.ReadByte();
if (b2 == '\n' || b2 == '\r') // possible CRLF or single LF or single CR
{
idxEndStream--;
if (b1 == '\r' && b2 != '\r') // handle CRLF but not CRCR
idxEndStream--;
}
}
return (int)(start - firstStart + idxEndStream);
}
start += Math.Max(1, searchLength - "endstream".Length - 1);
}

return idxEndStream;
SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength());
return -1;
}

/// <summary>
Expand Down
25 changes: 18 additions & 7 deletions src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public Parser(PdfDocument? document, Stream objectStream, Parser documentParser)
/// </summary>
/// <param name="objectID">The ID of the object to move.</param>
/// <param name="suppressObjectOrderExceptions">Suppresses exceptions that may be caused by not yet available objects.</param>
public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions)
public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions = null)
{
SizeType? position = _document.IrefTable[objectID]?.Position;
if (!position.HasValue)
Expand Down Expand Up @@ -369,9 +369,20 @@ void ReadDictionaryStream(PdfDictionary dict, SuppressExceptions? suppressObject
// Step 3: We try to read the stream content.
// Maybe we have to re-read it in case 'endstream' was not at the
// right place after reading with the length value coming from /Length.
var bytes = _lexer.ScanStream(startPosition, streamLength);
var stream = new PdfDictionary.PdfStream(bytes, dict);
dict.Stream = stream;
byte[] bytes;
try
{
// this may throw if startPosition + streamLength > length of stream
bytes = _lexer.ScanStream(startPosition, streamLength);
var stream = new PdfDictionary.PdfStream(bytes, dict);
dict.Stream = stream;
}
catch
{
// reset stream position
_lexer.Position = startPosition;
// ignore exception, we'll try again after determining real stream-length
}
#if DEBUG_ // Check it with Notepad++ directly in PDF file.
// ReSharper disable once ConditionIsAlwaysTrueOrFalseAccordingToNullableAPIContract
if (bytes is not null && bytes.Length > 0)
Expand Down Expand Up @@ -829,7 +840,7 @@ PdfItem ReadReference(PdfReference iref, bool includeReferences)
/// <summary>
/// Reads the next symbol that must be the specified one.
/// </summary>
Symbol ReadSymbol(Symbol symbol)
internal Symbol ReadSymbol(Symbol symbol)
{
Symbol current = ScanNextToken(symbol == Symbol.ObjRef);
if (symbol != current)
Expand Down Expand Up @@ -903,7 +914,7 @@ SizeType ReadSize()
/// <summary>
/// Reads the PdfObject of the reference, no matter if it’s saved at document level or inside an ObjectStream.
/// </summary>
internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions, bool withoutDecrypting = false)
internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions = null, bool withoutDecrypting = false)
{
try
{
Expand Down Expand Up @@ -1406,7 +1417,7 @@ bool CheckXRefTableEntry(SizeType position, int id, int generation, out int idCh
/// <summary>
/// Reads cross-reference stream(s).
/// </summary>
PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
internal PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
{
// Read cross-reference stream.
//Debug.Assert(_lexer.Symbol == Symbol.Integer);
Expand Down
9 changes: 8 additions & 1 deletion src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,14 @@ PdfDocument OpenFromStream(Stream stream, string? password, PdfDocumentOpenMode
var parser = new Parser(_document, options ?? new PdfReaderOptions(), _logger);

// 1. Read all trailers or cross-reference streams, but no objects.
_document.Trailer = parser.ReadTrailer();
try
{
_document.Trailer = parser.ReadTrailer();
}
catch
{
_document.Trailer = PdfTrailer.Rebuild(_document, stream, parser);
}
if (_document.Trailer == null!)
ParserDiagnostics.ThrowParserException(
"Invalid PDF file: no trailer found."); // TODO L10N using PsMsgs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -899,7 +899,7 @@ public void Flatten()
/// <summary>
/// Gets the standard security handler, if existing and encryption is active.
/// </summary>
internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer.EffectiveSecurityHandler;
internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer?.EffectiveSecurityHandler;

internal PdfTrailer Trailer { get; set; } = default!;

Expand Down
4 changes: 3 additions & 1 deletion src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,11 @@ static bool TryRereadAsUnicode(ref string? value)
return true;
}

#if true // UTF-16LE is not defined as valid text string encoding in PDF reference.
#if false // UTF-16LE is not defined as valid text string encoding in PDF reference.
if (value is ['\xFF', '\xFE', ..])
{
throw new NotImplementedException("Found UTF-16LE string. Please send us the PDF file and we will fix it (issues (at) pdfsharp.net).");
}
#else
// Adobe Reader also supports UTF-16LE.
if (value is ['\xFF', '\xFE', ..])
Expand Down