From 67d5ed3a64a81f9192c17dc71f02e69f60f8a1f8 Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt
Date: Sat, 4 Apr 2020 12:36:45 -0400
Subject: Implement remove-unreferenced-resources=auto

---
 qpdf/qpdf.cc                                    | 144 +++++++++++++++++++++++-
 qpdf/qpdf.testcov                               |   3 +
 qpdf/qtest/qpdf.test                            |  15 ++-
 qpdf/qtest/qpdf/disable-kfo.out                 | 104 +++++++++++++++++
 qpdf/qtest/qpdf/enable-kfo.out                  |  22 ++++
 qpdf/qtest/qpdf/kfo-n.out                       |  20 ++++
 qpdf/qtest/qpdf/kfo-y.out                       |  20 ++++
 qpdf/qtest/qpdf/shared-form-images-xobject.pdf  | Bin 0 -> 9793 bytes
 qpdf/qtest/qpdf/shared-form-xobject-split-1.pdf | Bin 0 -> 4840 bytes
 qpdf/qtest/qpdf/shared-form-xobject-split-2.pdf | Bin 0 -> 5060 bytes
 qpdf/qtest/qpdf/split-pages-group.out           |   2 +
 qpdf/qtest/qpdf/uo-6.out                        |   2 +
 qpdf/qtest/qpdf/verbose-merge.out               |   8 ++
 13 files changed, 336 insertions(+), 4 deletions(-)
 create mode 100644 qpdf/qtest/qpdf/shared-form-images-xobject.pdf
 create mode 100644 qpdf/qtest/qpdf/shared-form-xobject-split-1.pdf
 create mode 100644 qpdf/qtest/qpdf/shared-form-xobject-split-2.pdf

diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc
index 442ee6cc..38aa2f4a 100644
--- a/qpdf/qpdf.cc
+++ b/qpdf/qpdf.cc
@@ -4750,6 +4750,140 @@ static void handle_transformations(QPDF& pdf, Options& o)
     }
 }
 
+static bool should_remove_unreferenced_resources(QPDF& pdf, Options& o)
+{
+    if (o.remove_unreferenced_page_resources == re_no)
+    {
+        return false;
+    }
+    else if (o.remove_unreferenced_page_resources == re_yes)
+    {
+        return true;
+    }
+
+    // Unreferenced resources are common in files where resources
+    // dictionaries are shared across pages. As a heuristic, we look
+    // in the file for shared resources dictionaries or shared XObject
+    // subkeys of resources dictionaries either on pages or on form
+    // XObjects in pages. If we find any, then there is a higher
+    // likelihood that the expensive process of finding unreferenced
+    // resources is worth it.
+
+    // Return true as soon as we find any shared resources.
+
+    std::set<QPDFObjGen> resources_seen; // shared resources detection
+    std::set<QPDFObjGen> nodes_seen;     // loop detection
+
+    if (o.verbose)
+    {
+        std::cout << whoami << ": " << pdf.getFilename()
+                  << ": checking for shared resources" << std::endl;
+    }
+
+    std::list<QPDFObjectHandle> queue;
+    queue.push_back(pdf.getRoot().getKey("/Pages"));
+    while (! queue.empty())
+    {
+        QPDFObjectHandle node = *queue.begin();
+        queue.pop_front(); // dequeue before any `continue` can skip it
+        QPDFObjGen og = node.getObjGen();
+        if (nodes_seen.count(og))
+        {
+            continue;
+        }
+        nodes_seen.insert(og);
+        QPDFObjectHandle dict = node.isStream() ? node.getDict() : node;
+        QPDFObjectHandle kids = dict.getKey("/Kids");
+        if (kids.isArray())
+        {
+            // This is a non-leaf node.
+            if (dict.hasKey("/Resources"))
+            {
+                QTC::TC("qpdf", "qpdf found resources in non-leaf");
+                if (o.verbose)
+                {
+                    std::cout << " found resources in non-leaf page node "
+                              << og.getObj() << " " << og.getGen()
+                              << std::endl;
+                }
+                return true;
+            }
+            int n = kids.getArrayNItems();
+            for (int i = 0; i < n; ++i)
+            {
+                queue.push_back(kids.getArrayItem(i));
+            }
+        }
+        else
+        {
+            // This is a leaf node or a form XObject.
+            QPDFObjectHandle resources = dict.getKey("/Resources");
+            if (resources.isIndirect())
+            {
+                QPDFObjGen resources_og = resources.getObjGen();
+                if (resources_seen.count(resources_og))
+                {
+                    QTC::TC("qpdf", "qpdf found shared resources in leaf");
+                    if (o.verbose)
+                    {
+                        std::cout << " found shared resources in leaf node "
+                                  << og.getObj() << " " << og.getGen()
+                                  << ": "
+                                  << resources_og.getObj() << " "
+                                  << resources_og.getGen()
+                                  << std::endl;
+                    }
+                    return true;
+                }
+                resources_seen.insert(resources_og);
+            }
+            QPDFObjectHandle xobject = resources.getKey("/XObject");
+            if (xobject.isIndirect())
+            {
+                QPDFObjGen xobject_og = xobject.getObjGen();
+                if (resources_seen.count(xobject_og))
+                {
+                    QTC::TC("qpdf", "qpdf found shared xobject in leaf");
+                    if (o.verbose)
+                    {
+                        std::cout << " found shared xobject in leaf node "
+                                  << og.getObj() << " " << og.getGen()
+                                  << ": "
+                                  << xobject_og.getObj() << " "
+                                  << xobject_og.getGen()
+                                  << std::endl;
+                    }
+                    return true;
+                }
+                resources_seen.insert(xobject_og);
+            }
+            if (xobject.isDictionary())
+            {
+                for (auto k: xobject.getKeys())
+                {
+                    QPDFObjectHandle xobj = xobject.getKey(k);
+                    if (xobj.isStream() &&
+                        xobj.getDict().getKey("/Type").isName() &&
+                        ("/XObject" ==
+                         xobj.getDict().getKey("/Type").getName()) &&
+                        xobj.getDict().getKey("/Subtype").isName() &&
+                        ("/Form" ==
+                         xobj.getDict().getKey("/Subtype").getName()))
+                    {
+                        queue.push_back(xobj);
+                    }
+                }
+            }
+        }
+    }
+
+    if (o.verbose)
+    {
+        std::cout << whoami << ": no shared resources found" << std::endl;
+    }
+    return false;
+}
+
 static void handle_page_specs(QPDF& pdf, Options& o)
 {
     // Parse all page specifications and translate them into lists of
@@ -4883,8 +5017,12 @@ static void handle_page_specs(QPDF& pdf, Options& o)
                 cis = page_spec_cfis[filename];
                 cis->stayOpen(true);
             }
-            QPDFPageDocumentHelper dh(*((*iter).second));
-            dh.removeUnreferencedResources();
+            QPDF& other(*((*iter).second));
+            if (should_remove_unreferenced_resources(other, o))
+            {
+                QPDFPageDocumentHelper
dh(other); + dh.removeUnreferencedResources(); + } if (cis) { cis->stayOpen(false); @@ -5368,7 +5506,7 @@ static void do_split_pages(QPDF& pdf, Options& o) before = std::string(o.outfilename) + "-"; } - if (o.remove_unreferenced_page_resources != re_no) + if (should_remove_unreferenced_resources(pdf, o)) { QPDFPageDocumentHelper dh(pdf); dh.removeUnreferencedResources(); diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 09d40c23..6834c7ad 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -450,3 +450,6 @@ QPDFWriter no encryption sig contents 0 QPDFPageObjectHelper colorspace lookup 0 QPDFWriter ignore XRef in qdf mode 0 QPDFPageObjectHelper filter form xobject 0 +qpdf found resources in non-leaf 0 +qpdf found shared resources in leaf 0 +qpdf found shared xobject in leaf 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 1184fa2b..560bee07 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -1699,7 +1699,7 @@ my @sp_cases = ( [11, 'pdf extension', '', 'split-out.Pdf'], [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'], ); -$n_tests += 32; +$n_tests += 35; $n_compare_pdfs += 1; for (@sp_cases) { @@ -1808,6 +1808,7 @@ foreach my $i (qw(1 2 3 4)) $td->runtest("unreferenced resources with bad token", {$td->COMMAND => "qpdf --qdf --static-id --split-pages=2" . + " --remove-unreferenced-resources=yes" . " coalesce.pdf split-out-bad-token.pdf"}, {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3}, $td->NORMALIZE_NEWLINES); @@ -1834,6 +1835,18 @@ $td->runtest("check output", {$td->FILE => "shared-form-images-merged.pdf"}); compare_pdfs("shared-form-images.pdf", "a.pdf"); +$td->runtest("shared form xobject subkey", + {$td->COMMAND => "qpdf --qdf --static-id --split-pages". + " shared-form-images-xobject.pdf" . 
+ " split-out-shared-form-xobject.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}); +foreach my $i (qw(1 2)) +{ + $td->runtest("check output ($i)", + {$td->FILE => "split-out-shared-form-xobject-$i.pdf"}, + {$td->FILE => "shared-form-xobject-split-$i.pdf"}); +} + show_ntests(); # ---------- $td->notify("--- Keep Files Open ---"); diff --git a/qpdf/qtest/qpdf/disable-kfo.out b/qpdf/qtest/qpdf/disable-kfo.out index a33044e0..f8f4622e 100644 --- a/qpdf/qtest/qpdf/disable-kfo.out +++ b/qpdf/qtest/qpdf/disable-kfo.out @@ -50,6 +50,110 @@ qpdf: processing 048-kfo.pdf qpdf: processing 049-kfo.pdf qpdf: processing 050-kfo.pdf qpdf: processing 051-kfo.pdf +qpdf: empty PDF: checking for shared resources +qpdf: no shared resources found +qpdf: 001-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 002-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 003-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 004-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 005-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 006-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 007-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 008-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 009-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 010-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 011-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 012-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 013-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 014-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 015-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 
016-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 017-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 018-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 019-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 020-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 021-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 022-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 023-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 024-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 025-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 026-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 027-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 028-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 029-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 030-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 031-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 032-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 033-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 034-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 035-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 036-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 037-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 038-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 039-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 
040-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 041-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 042-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 043-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 044-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 045-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 046-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 047-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 048-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 049-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 050-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 051-kfo.pdf: checking for shared resources +qpdf: no shared resources found qpdf: removing unreferenced pages from primary input qpdf: adding pages from 001-kfo.pdf qpdf: adding pages from 002-kfo.pdf diff --git a/qpdf/qtest/qpdf/enable-kfo.out b/qpdf/qtest/qpdf/enable-kfo.out index 0c011518..e49e7a12 100644 --- a/qpdf/qtest/qpdf/enable-kfo.out +++ b/qpdf/qtest/qpdf/enable-kfo.out @@ -9,6 +9,28 @@ qpdf: processing 016-kfo.pdf qpdf: processing 017-kfo.pdf qpdf: processing 018-kfo.pdf qpdf: processing 019-kfo.pdf +qpdf: empty PDF: checking for shared resources +qpdf: no shared resources found +qpdf: 010-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 011-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 012-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 013-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 014-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 015-kfo.pdf: checking for shared resources +qpdf: no shared 
resources found +qpdf: 016-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 017-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 018-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 019-kfo.pdf: checking for shared resources +qpdf: no shared resources found qpdf: removing unreferenced pages from primary input qpdf: adding pages from 010-kfo.pdf qpdf: adding pages from 011-kfo.pdf diff --git a/qpdf/qtest/qpdf/kfo-n.out b/qpdf/qtest/qpdf/kfo-n.out index 0fe71ca0..1964bcdd 100644 --- a/qpdf/qtest/qpdf/kfo-n.out +++ b/qpdf/qtest/qpdf/kfo-n.out @@ -7,6 +7,26 @@ qpdf: processing 006-kfo.pdf qpdf: processing 007-kfo.pdf qpdf: processing 008-kfo.pdf qpdf: processing 009-kfo.pdf +qpdf: empty PDF: checking for shared resources +qpdf: no shared resources found +qpdf: 001-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 002-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 003-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 004-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 005-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 006-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 007-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 008-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 009-kfo.pdf: checking for shared resources +qpdf: no shared resources found qpdf: removing unreferenced pages from primary input qpdf: adding pages from 001-kfo.pdf qpdf: adding pages from 002-kfo.pdf diff --git a/qpdf/qtest/qpdf/kfo-y.out b/qpdf/qtest/qpdf/kfo-y.out index 0fe71ca0..1964bcdd 100644 --- a/qpdf/qtest/qpdf/kfo-y.out +++ b/qpdf/qtest/qpdf/kfo-y.out @@ -7,6 +7,26 @@ qpdf: processing 006-kfo.pdf qpdf: processing 007-kfo.pdf qpdf: processing 008-kfo.pdf qpdf: 
processing 009-kfo.pdf +qpdf: empty PDF: checking for shared resources +qpdf: no shared resources found +qpdf: 001-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 002-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 003-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 004-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 005-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 006-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 007-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 008-kfo.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 009-kfo.pdf: checking for shared resources +qpdf: no shared resources found qpdf: removing unreferenced pages from primary input qpdf: adding pages from 001-kfo.pdf qpdf: adding pages from 002-kfo.pdf diff --git a/qpdf/qtest/qpdf/shared-form-images-xobject.pdf b/qpdf/qtest/qpdf/shared-form-images-xobject.pdf new file mode 100644 index 00000000..b8c5ead1 Binary files /dev/null and b/qpdf/qtest/qpdf/shared-form-images-xobject.pdf differ diff --git a/qpdf/qtest/qpdf/shared-form-xobject-split-1.pdf b/qpdf/qtest/qpdf/shared-form-xobject-split-1.pdf new file mode 100644 index 00000000..a0a9ec88 Binary files /dev/null and b/qpdf/qtest/qpdf/shared-form-xobject-split-1.pdf differ diff --git a/qpdf/qtest/qpdf/shared-form-xobject-split-2.pdf b/qpdf/qtest/qpdf/shared-form-xobject-split-2.pdf new file mode 100644 index 00000000..06dce552 Binary files /dev/null and b/qpdf/qtest/qpdf/shared-form-xobject-split-2.pdf differ diff --git a/qpdf/qtest/qpdf/split-pages-group.out b/qpdf/qtest/qpdf/split-pages-group.out index ec00ab7e..d89f5e48 100644 --- a/qpdf/qtest/qpdf/split-pages-group.out +++ b/qpdf/qtest/qpdf/split-pages-group.out @@ -1,3 +1,5 @@ +qpdf: 11-pages.pdf: checking for shared resources +qpdf: no shared 
resources found qpdf: wrote file split-out-group-01-05.pdf qpdf: wrote file split-out-group-06-10.pdf qpdf: wrote file split-out-group-11-11.pdf diff --git a/qpdf/qtest/qpdf/uo-6.out b/qpdf/qtest/qpdf/uo-6.out index 22395614..5a199670 100644 --- a/qpdf/qtest/qpdf/uo-6.out +++ b/qpdf/qtest/qpdf/uo-6.out @@ -1,4 +1,6 @@ qpdf: selecting --keep-open-files=y +qpdf: fxo-red.pdf: checking for shared resources +qpdf: no shared resources found qpdf: removing unreferenced pages from primary input qpdf: adding pages from fxo-red.pdf qpdf: processing underlay/overlay diff --git a/qpdf/qtest/qpdf/verbose-merge.out b/qpdf/qtest/qpdf/verbose-merge.out index 9de13e16..6f9846fb 100644 --- a/qpdf/qtest/qpdf/verbose-merge.out +++ b/qpdf/qtest/qpdf/verbose-merge.out @@ -2,6 +2,14 @@ qpdf: selecting --keep-open-files=y qpdf: processing 20-pages.pdf qpdf: processing ./20-pages.pdf qpdf: processing minimal.pdf +qpdf: ./20-pages.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: 20-pages.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: minimal.pdf: checking for shared resources +qpdf: no shared resources found +qpdf: page-labels-and-outlines.pdf: checking for shared resources +qpdf: no shared resources found qpdf: removing unreferenced pages from primary input qpdf: adding pages from page-labels-and-outlines.pdf qpdf: adding pages from 20-pages.pdf -- cgit v1.2.3-70-g09d2