From 5cfcd4f361063df8e216489915758ce40a15f15b Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 17 Jan 2019 08:56:58 -0500 Subject: Additional checks for unreferenced resources Explicitly abandon removal of unreferenced resources if there are any lexical errors in the page's contents. This case always generated a warning, but it now also prevents removal of unreferenced resources, thus strongly decreasing the likelihood of data loss. --- libqpdf/QPDFPageObjectHelper.cc | 17 +++ qpdf/qpdf.testcov | 1 + qpdf/qtest/qpdf.test | 16 ++- qpdf/qtest/qpdf/coalesce-split-1-2.pdf | 231 +++++++++++++++++++++++++++++++++ qpdf/qtest/qpdf/coalesce-split.out | 10 ++ 5 files changed, 272 insertions(+), 3 deletions(-) create mode 100644 qpdf/qtest/qpdf/coalesce-split-1-2.pdf create mode 100644 qpdf/qtest/qpdf/coalesce-split.out diff --git a/libqpdf/QPDFPageObjectHelper.cc b/libqpdf/QPDFPageObjectHelper.cc index ba647372..c5ede04f 100644 --- a/libqpdf/QPDFPageObjectHelper.cc +++ b/libqpdf/QPDFPageObjectHelper.cc @@ -99,11 +99,16 @@ QPDFPageObjectHelper::addContentTokenFilter( class NameWatcher: public QPDFObjectHandle::TokenFilter { public: + NameWatcher() : + saw_bad(false) + { + } virtual ~NameWatcher() { } virtual void handleToken(QPDFTokenizer::Token const&); std::set names; + bool saw_bad; }; void @@ -116,6 +121,10 @@ NameWatcher::handleToken(QPDFTokenizer::Token const& token) this->names.insert( QPDFObjectHandle::newName(token.getValue()).getName()); } + else if (token.getType() == QPDFTokenizer::tt_bad) + { + saw_bad = true; + } writeToken(token); } @@ -134,6 +143,14 @@ QPDFPageObjectHelper::removeUnreferencedResources() "; not attempting to remove unreferenced objects from this page"); return; } + if (nw.saw_bad) + { + QTC::TC("qpdf", "QPDFPageObjectHelper bad token finding names"); + this->oh.warnIfPossible( + "Bad token found while scanning content stream; " + "not attempting to remove unreferenced objects from this page"); + return; + } // Walk through /Font and
/XObject dictionaries, removing any // resources that are not referenced. We must make copies of // resource dictionaries down into the dictionaries are mutating diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 58f2cdca..08f82592 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -412,3 +412,4 @@ QPDF copy foreign stream with provider 0 QPDF copy foreign stream with buffer 0 QPDF immediate copy stream data 0 qpdf copy same page more than once 1 +QPDFPageObjectHelper bad token finding names 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index b2b92f2c..44aa5421 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -1384,7 +1384,7 @@ my @sp_cases = ( [11, 'pdf extension', '', 'split-out.Pdf'], [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'], ); -$n_tests += 21; +$n_tests += 23; for (@sp_cases) { $n_tests += 1 + $_->[0]; @@ -1482,10 +1482,20 @@ $td->runtest("split shared font, xobject", foreach my $i (qw(1 2 3 4)) { $td->runtest("check output ($i)", - {$td->FILE => "shared-font-xobject-split-$i.pdf"}, - {$td->FILE => "split-out-shared-font-xobject-$i.pdf"}); + {$td->FILE => "split-out-shared-font-xobject-$i.pdf"}, + {$td->FILE => "shared-font-xobject-split-$i.pdf"}); } +$td->runtest("unreferenced resources with bad token", + {$td->COMMAND => + "qpdf --qdf --static-id --split-pages=2" . 
+ " coalesce.pdf split-out-bad-token.pdf"}, + {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "split-out-bad-token-1-2.pdf"}, + {$td->FILE => "coalesce-split-1-2.pdf"}); + show_ntests(); # ---------- $td->notify("--- Keep Files Open ---"); diff --git a/qpdf/qtest/qpdf/coalesce-split-1-2.pdf b/qpdf/qtest/qpdf/coalesce-split-1-2.pdf new file mode 100644 index 00000000..4542411e --- /dev/null +++ b/qpdf/qtest/qpdf/coalesce-split-1-2.pdf @@ -0,0 +1,231 @@ +%PDF-1.3 +% +%QDF-1.0 + +%% Original object ID: 1 0 +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +%% Original object ID: 2 0 +2 0 obj +<< + /Count 2 + /Kids [ + 3 0 R + 4 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +%% Original object ID: 3 0 +3 0 obj +<< + /Contents [ + 5 0 R + 7 0 R + 9 0 R + 11 0 R + ] + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 13 0 R + >> + /ProcSet 14 0 R + >> + /Type /Page +>> +endobj + +%% Page 2 +%% Original object ID: 14 0 +4 0 obj +<< + /Contents 15 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 17 0 R + >> + /ProcSet 18 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +%% Original object ID: 4 0 +5 0 obj +<< + /Length 6 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Pot +endstream +endobj + +%QDF: ignore_newline +6 0 obj +33 +endobj + +%% Contents for page 1 +%% Original object ID: 6 0 +7 0 obj +<< + /Length 8 0 R +>> +stream +ato) Tj +ET [ /array +endstream +endobj + +%QDF: ignore_newline +8 0 obj +19 +endobj + +%% Contents for page 1 +%% Original object ID: 8 0 +9 0 obj +<< + /Length 10 0 R +>> +stream +/split ] BI +/CS /G/W 66/H 47/BPC 8/F/Fl/DP<> +ID xI P|C;U`7Z Ę}D_W->>^&u]"!*&E|Sy d-<B0B@N+<hlK/56L >0>Y!c\Y %Y8?&}j;3lpsHt +endstream +endobj + +%QDF: ignore_newline +10 0 obj +253 +endobj + +%% Contents for page 1 +%% Original object ID: 10 0 +11 0 obj +<< + /Length 12 0 R +>> +stream 
+QTt*hUw%)p"DiRjDYNUAvF& +u#cW ߉WO +EI +endstream +endobj + +%QDF: ignore_newline +12 0 obj +65 +endobj + +%% Original object ID: 12 0 +13 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 13 0 +14 0 obj +[ + /PDF + /Text +] +endobj + +%% Contents for page 2 +%% Original object ID: 15 0 +15 0 obj +<< + /Length 16 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Potato) Tj +ET +endstream +endobj + +16 0 obj +44 +endobj + +%% Original object ID: 17 0 +17 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 18 0 +18 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 19 +0000000000 65535 f +0000000052 00000 n +0000000133 00000 n +0000000252 00000 n +0000000525 00000 n +0000000770 00000 n +0000000880 00000 n +0000000949 00000 n +0000001045 00000 n +0000001114 00000 n +0000001445 00000 n +0000001517 00000 n +0000001661 00000 n +0000001709 00000 n +0000001856 00000 n +0000001943 00000 n +0000002044 00000 n +0000002092 00000 n +0000002239 00000 n +trailer << + /Root 1 0 R + /Size 19 + /ID [<31415926535897932384626433832795><31415926535897932384626433832795>] +>> +startxref +2275 +%%EOF diff --git a/qpdf/qtest/qpdf/coalesce-split.out b/qpdf/qtest/qpdf/coalesce-split.out new file mode 100644 index 00000000..5e18173c --- /dev/null +++ b/qpdf/qtest/qpdf/coalesce-split.out @@ -0,0 +1,10 @@ +WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page +WARNING: empty PDF: content normalization encountered bad tokens +WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. 
From the command line, specify --coalesce-contents +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. +WARNING: empty PDF: content normalization encountered bad tokens +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. +WARNING: empty PDF: content normalization encountered bad tokens +WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. +qpdf: operation succeeded with warnings; resulting file may have some problems -- cgit v1.2.3-54-g00ecf