From 5136238f2a973f693cea53c340dcff23a655531f Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Fri, 2 Feb 2018 21:16:40 -0500 Subject: Detect and report bad tokens in content normalization --- ChangeLog | 19 +++ libqpdf/ContentNormalizer.cc | 26 +++- libqpdf/QPDF_Stream.cc | 27 ++++ libqpdf/qpdf/ContentNormalizer.hh | 7 + qpdf/qpdf.testcov | 1 + qpdf/qtest/qpdf.test | 10 +- qpdf/qtest/qpdf/coalesce.qdf | 231 +++++++++++++++++++++++++++++++++ qpdf/qtest/qpdf/good14.out | 17 ++- qpdf/qtest/qpdf/normalize-warnings.out | 9 ++ 9 files changed, 343 insertions(+), 4 deletions(-) create mode 100644 qpdf/qtest/qpdf/coalesce.qdf create mode 100644 qpdf/qtest/qpdf/normalize-warnings.out diff --git a/ChangeLog b/ChangeLog index b061c584..7d94eb9f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -153,6 +153,25 @@ * Provide heavily annoated examples/pdf-filter-tokens.cc example that illustrates use of some simple token filters. + * When normalizing content streams, as in qdf mode, issue warning + about bad tokens. Content streams are only normalized when this is + explicitly requested, so this has no impact on normal operation. + However, in qdf mode, if qpdf detects a bad token, it means that + either there's a bug in qpdf's lexer, that the file is damaged, or + that the page's contents are split in a weird way. In any of those + cases, qpdf could potentially damage the stream's contents by + replacing carriage returns with newlines or otherwise messing with + spaces. The most likely case of this would be an inline image's + compressed data being divided across two streams and having the + compressed data in the second stream contain a carriage return as + part of its binary data. If you are using qdf mode just to look at + PDF files in text editors, this usually doesn't matter. In cases + of contents split across multiple streams, coalescing streams + would eliminate the problem, so the warning mentions this. 
Prior + to this enhancement, the chances of qdf mode writing incorrect + data were already very low. This change should make it nearly + impossible for qdf mode to unknowingly write invalid data. + 2018-02-04 Jay Berkenbilt * Add QPDFWriter::setLinearizationPass1Filename method and diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc index 35a8ad74..f85ab829 100644 --- a/libqpdf/ContentNormalizer.cc +++ b/libqpdf/ContentNormalizer.cc @@ -1,7 +1,9 @@ #include #include -ContentNormalizer::ContentNormalizer() +ContentNormalizer::ContentNormalizer() : + any_bad_tokens(false), + last_token_was_bad(false) { } @@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) std::string value = token.getRawValue(); QPDFTokenizer::token_type_e token_type = token.getType(); + if (token_type == QPDFTokenizer::tt_bad) + { + this->any_bad_tokens = true; + this->last_token_was_bad = true; + } + else if (token_type != QPDFTokenizer::tt_eof) + { + this->last_token_was_bad = false; + } + switch (token_type) { case QPDFTokenizer::tt_space: @@ -75,3 +87,15 @@ ContentNormalizer::handleEOF() { finish(); } + +bool +ContentNormalizer::anyBadTokens() const +{ + return this->any_bad_tokens; +} + +bool +ContentNormalizer::lastTokenWasBad()const +{ + return this->last_token_was_bad; +} diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc index a026f9a4..bb1e24e6 100644 --- a/libqpdf/QPDF_Stream.cc +++ b/libqpdf/QPDF_Stream.cc @@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, } } + if (filter && + (! 
suppress_warnings) && + normalizer.getPointer() && + normalizer->anyBadTokens()) + { + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), + "", this->offset, + "content normalization encountered bad tokens")); + if (normalizer->lastTokenWasBad()) + { + QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize"); + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), + "", this->offset, + "normalized content ended with a bad token;" + " you may be able to resolve this by" + " coalescing content streams in combination" + " with normalizing content. From the command" + " line, specify --coalesce-contents")); + } + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), + "", this->offset, + "Resulting stream data may be corrupted but is" + " may still useful for manual inspection." + " For more information on this warning, search" + " for content normalization in the manual.")); + } + return filter; } diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh index 504f15e8..89b28f3a 100644 --- a/libqpdf/qpdf/ContentNormalizer.hh +++ b/libqpdf/qpdf/ContentNormalizer.hh @@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter virtual ~ContentNormalizer(); virtual void handleToken(QPDFTokenizer::Token const&); virtual void handleEOF(); + + bool anyBadTokens() const; + bool lastTokenWasBad() const; + + private: + bool any_bad_tokens; + bool last_token_was_bad; }; #endif // __CONTENTNORMALIZER_HH__ diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index a1ce662d..2c51867f 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0 QPDFObjectHandle non-stream in stream array 0 QPDFObjectHandle coalesce called on stream 0 QPDFObjectHandle coalesce provide stream data 0 +QPDF_Stream bad token at end during normalize 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index a3572859..45c750fd 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -737,8 +737,16 
@@ $td->runtest("stream with tiff predictor", show_ntests(); # ---------- $td->notify("--- Coalesce contents ---"); -$n_tests += 4; +$n_tests += 6; +$td->runtest("qdf with normalize warnings", + {$td->COMMAND => + "qpdf --qdf --static-id coalesce.pdf a.pdf"}, + {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "coalesce.qdf"}); $td->runtest("coalesce contents with qdf", {$td->COMMAND => "qpdf --qdf --static-id" . diff --git a/qpdf/qtest/qpdf/coalesce.qdf b/qpdf/qtest/qpdf/coalesce.qdf new file mode 100644 index 00000000..5007dc12 --- /dev/null +++ b/qpdf/qtest/qpdf/coalesce.qdf @@ -0,0 +1,231 @@ +%PDF-1.3 +% +%QDF-1.0 + +%% Original object ID: 1 0 +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +%% Original object ID: 2 0 +2 0 obj +<< + /Count 2 + /Kids [ + 3 0 R + 4 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +%% Original object ID: 3 0 +3 0 obj +<< + /Contents [ + 5 0 R + 7 0 R + 9 0 R + 11 0 R + ] + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 13 0 R + >> + /ProcSet 14 0 R + >> + /Type /Page +>> +endobj + +%% Page 2 +%% Original object ID: 4 0 +4 0 obj +<< + /Contents 15 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 17 0 R + >> + /ProcSet 18 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +%% Original object ID: 5 0 +5 0 obj +<< + /Length 6 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Pot +endstream +endobj + +%QDF: ignore_newline +6 0 obj +33 +endobj + +%% Contents for page 1 +%% Original object ID: 7 0 +7 0 obj +<< + /Length 8 0 R +>> +stream +ato) Tj +ET [ /array +endstream +endobj + +%QDF: ignore_newline +8 0 obj +19 +endobj + +%% Contents for page 1 +%% Original object ID: 9 0 +9 0 obj +<< + /Length 10 0 R +>> +stream +/split ] BI +/CS /G/W 66/H 47/BPC 8/F/Fl/DP<> +ID xI P|C;U`7Z Ę}D_W->>^&u]"!*&E|Sy d-<B0B@N+<hlK/56L >0>Y!c\Y 
%Y8?&}j;3lpsHt +endstream +endobj + +%QDF: ignore_newline +10 0 obj +253 +endobj + +%% Contents for page 1 +%% Original object ID: 11 0 +11 0 obj +<< + /Length 12 0 R +>> +stream +QTt*hUw%)p"DiRjDYNUAvF& +u#cW ߉WO +EI +endstream +endobj + +%QDF: ignore_newline +12 0 obj +65 +endobj + +%% Original object ID: 13 0 +13 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 14 0 +14 0 obj +[ + /PDF + /Text +] +endobj + +%% Contents for page 2 +%% Original object ID: 15 0 +15 0 obj +<< + /Length 16 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +16 0 obj +44 +endobj + +%% Original object ID: 17 0 +17 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 18 0 +18 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 19 +0000000000 65535 f +0000000052 00000 n +0000000133 00000 n +0000000252 00000 n +0000000524 00000 n +0000000769 00000 n +0000000879 00000 n +0000000948 00000 n +0000001044 00000 n +0000001113 00000 n +0000001444 00000 n +0000001516 00000 n +0000001660 00000 n +0000001708 00000 n +0000001855 00000 n +0000001942 00000 n +0000002043 00000 n +0000002091 00000 n +0000002238 00000 n +trailer << + /Root 1 0 R + /Size 19 + /ID [<31415926535897932384626433832795>] +>> +startxref +2274 +%%EOF diff --git a/qpdf/qtest/qpdf/good14.out b/qpdf/qtest/qpdf/good14.out index 87819670..84bf7133 100644 --- a/qpdf/qtest/qpdf/good14.out +++ b/qpdf/qtest/qpdf/good14.out @@ -13,7 +13,9 @@ three lines <8a8b> (ab) <8c>
) > -<610062> (MOO)-- stream 1 -- +<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens +WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. +-- stream 1 -- This stream does end with a newline. // tests: // bad tokens preserved @@ -31,10 +33,18 @@ This stream does end with a newline. /good name /bad#00name +WARNING: good14.pdf (file position 860): content normalization encountered bad tokens +WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. -- stream 2 -- (This stream ends with a \001 bad token +WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens +WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents +WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. -- stream 3 -- - +WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens +WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents +WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. 
test 3 done diff --git a/qpdf/qtest/qpdf/normalize-warnings.out b/qpdf/qtest/qpdf/normalize-warnings.out new file mode 100644 index 00000000..73947b1a --- /dev/null +++ b/qpdf/qtest/qpdf/normalize-warnings.out @@ -0,0 +1,9 @@ +WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens +WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents +WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. +WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens +WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. +WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens +WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents +WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. +qpdf: operation succeeded with warnings; resulting file may have some problems -- cgit v1.2.3-70-g09d2