Skip to content

PDFsharp 6.1.1 does not parse object references with line feeds

ThomasHoevel edited this page Aug 6, 2024 · 1 revision

PDFsharp 6.1.1 does not parse object references with line feeds. PDF files created from Google Spreadsheets may contain line feeds within object references.

The bug will be fixed in PDFsharp 6.2.0 Preview 1, which is coming soon.

If you want to fix this bug in PDFsharp 6.1.1 yourself, please download the source code from the repository, open the file Lexer.cs, and replace the method ScanNumber with the version given below.

/// <summary>
/// Scans a number or an object reference, starting at the current character.
/// Returns one of the following symbols.
/// Symbol.ObjRef if testForObjectReference is true and the pattern "nnn ggg R" can be found.
/// Symbol.Real if a decimal point exists or the number of digits is too large for our own conversion code
/// (more than 18 digits in total or more than 10 decimal digits).
/// Symbol.Integer if the long value is in the range of 32-bit integer (Int32.MinValue and Int32.MaxValue inclusive).
/// Symbol.LongInteger otherwise.
/// </summary>
/// <param name="testForObjectReference">
/// If true, an unsigned integer with at most 7 digits that is followed by white-space is additionally
/// checked for being the start of an object reference ("nnn ggg R").
/// </param>
/// <returns>
/// The symbol that classifies the scanned token. Depending on the symbol, the value is stored in
/// _tokenAsObjectID (ObjRef), _tokenAsReal (Real), or _tokenAsLong (Integer and LongInteger).
/// </returns>
public Symbol ScanNumber(bool testForObjectReference)
{
    // We found a PDF file created with Acrobat 7 with this entry 
    //   /Checksum 2996984786   # larger than 2.147.483.648 (2^31)
    //
    // Also got an AutoCAD PDF file that contains
    //   /C 264584027963392     # 15 digits
    //
    // So we introduced a LongInteger.

    // Note: This is a copy of CLexer.ScanNumber with minimal changes. Keep both versions in sync as far as possible.
    // Update StL: Function is revised for object reference look ahead.

    // The following limits are digit counts only. The numeric values are not range-checked here.
    const int maxDigitsForObjectNumber = 7;      // max: 8_388_607 / 0x_7F_FF_FF
    const int maxDigitsForGenerationNumber = 5;  // max: 65_535    / 0x_FF_FF
    const int maxDigitsForLong = 18;
    const int maxDecimalDigits = 10;
    var value = 0L;
    var totalDigits = 0;
    var decimalDigits = 0;
    var period = false;
    var negative = false;
    var ch = _currChar;
    Debug.Assert(ch is '+' or '-' or '.' or (>= '0' and <= '9'));

    // If first char is not a digit, it cannot be an object reference.
    if (testForObjectReference && ch is not (>= '0' and <= '9'))
        testForObjectReference = false;
#if DEBUG_
    var pos = Position;
    var neighborhood = GetNeighborhoodOfCurrentPosition(Position);
    Console.WriteLine(neighborhood);
#endif
    ClearToken();
    if (ch is '+' or '-')
    {
        if (ch == '-')
            negative = true;
        _token.Append(ch);
        ch = ScanNextChar(true);

        // Never saw this in any PDF file, but possible.
        // We only log the error and continue. With no digits following, the result is the integer 0.
        if (ch is not ('.' or >= '0' and <= '9'))
        {
            PdfSharpLogHost.Logger.LogError("+/- not followed by a number.");
        }
    }

    // Scan the number.
    while (true)
    {
        if (ch is >= '0' and <= '9')
        {
            _token.Append(ch);
            ++totalDigits;
            if (decimalDigits < maxDecimalDigits)
            {
                // Calculate the value if it still fits into long.
                if (totalDigits <= maxDigitsForLong)
                    value = 10 * value + ch - '0';
            }
            // Count decimal digits after the value update, so the check above sees the count
            // of decimal digits scanned before the current one.
            if (period)
                ++decimalDigits;
        }
        else if (ch == '.')
        {
            // More than one period?
            if (period)
                ContentReaderDiagnostics.ThrowContentReaderException("More than one period in number.");

            period = true;
            _token.Append(ch);
        }
        else
            break;
        ch = ScanNextChar(true);
    }

    // Can the scanned number be the first part of an object reference?
    // Any white-space character (including line feeds) may separate the parts of a reference.
    if (testForObjectReference && period is false
        && totalDigits <= maxDigitsForObjectNumber
        && IsWhiteSpace(_currChar))
    {
#if DEBUG
        LexerHelper.TryCheckReferenceCount++;
#endif
        int gen = TryReadReference();
        if (gen >= 0)
        {
#if DEBUG
            LexerHelper.TryCheckReferenceSuccessCount++;
#endif
            // The cast is safe because value has at most maxDigitsForObjectNumber digits.
            _tokenAsObjectID = ((int)value, gen);
            return Symbol.ObjRef;
        }
    }

    if (totalDigits > maxDigitsForLong || decimalDigits > maxDecimalDigits)
    {
        // The number is too big for long or has too many decimal digits for our own code,
        // so we provide it as real only.
        // Number will be parsed by .NET. The token includes the sign, if any.
        _tokenAsReal = Double.Parse(_token.ToString(), CultureInfo.InvariantCulture);
        return Symbol.Real;
    }

    if (negative)
        value = -value;

    if (period)
    {
        if (decimalDigits > 0)
        {
            // value holds all scanned digits without the period, so scale it down.
            _tokenAsReal = value / PowersOf10[decimalDigits];
        }
        else
        {
            // A number like "5." has no fractional part.
            _tokenAsReal = value;
            _tokenAsLong = value;
        }
        return Symbol.Real;
    }
    _tokenAsLong = value;
    _tokenAsReal = Double.NaN;

    // A lone sign (logged as error above) leaves a token without digits that Int64.Parse cannot parse.
    // Skip the consistency check in this case so debug and release builds behave the same.
    Debug.Assert(totalDigits == 0 || Int64.Parse(_token.ToString(), CultureInfo.InvariantCulture) == value);

    // Both bounds are inclusive: Int32.MaxValue itself is still a 32-bit integer.
    if (value is >= Int32.MinValue and <= Int32.MaxValue)
        return Symbol.Integer;

    return Symbol.LongInteger;

    // Try to read generation number followed by an 'R'.
    // Returns the generation number if an object reference was found.
    // Returns -1 if not an object reference. In this case the stream position and the token
    // are restored to the state they had when this function was called.
    int TryReadReference()
    {
        Debug.Assert(IsWhiteSpace(_currChar));

        // A Reference has the form "nnn ggg R". The original implementation of the parser used a
        // reduce/shift algorithm in the first place. But this case is the only one we need to
        // look ahead 3 tokens.
        // This is a new implementation that checks whether a scanned integer is followed by
        // another integer and an 'R'. 

        // Save current position and token.
        SizeType position = Position;
        string token = _token.ToString();

        // White-space expected.
        if (!IsWhiteSpace(_currChar))
            goto NotAReference;

        // Skip white-spaces.
        while (IsWhiteSpace(_currChar))
            ScanNextChar(true);

        // First digit of generation expected.
        if (_currChar is not (>= '0' and <= '9'))
            goto NotAReference;

        // Read generation number.
        var generationNumber = _currChar - '0';
        ScanNextChar(true);
        int digitCount = 1;
        while (_currChar is >= '0' and <= '9')
        {
            // Too many digits for a generation number, so this is just a sequence of numbers.
            if (++digitCount > maxDigitsForGenerationNumber)
                goto NotAReference;
            generationNumber = generationNumber * 10 + _currChar - '0';
            ScanNextChar(true);
        }

        // White-space expected.
        if (!IsWhiteSpace(_currChar))
            goto NotAReference;

        // Skip white-spaces.
        while (IsWhiteSpace(_currChar))
            ScanNextChar(true);

        // "R" expected.
        // We can ignore _nextChar because there is no other valid token that starts with an uppercase letter 'R'.
        if (_currChar != 'R')
            goto NotAReference;

        // Consume the 'R' so scanning continues behind the reference.
        ScanNextChar(true);

        return generationNumber;

    NotAReference:
        // Restore stream position.
        Position = position;
        // Restore token because setting position clears it.
        _token.Append(token);
        return -1;
    }
}