aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2021-01-04 20:56:44 +0100
committer Jay Berkenbilt <ejb@ql.org> 2021-01-04 21:17:35 +0100
commit 891751f618fb95b82af289edfd2e1219e3522e6f (patch)
tree 575984a8662a447d7aff7e98a25dd17bcea7a175
parent dc92574c10f3e2516ec6445b88c5d584f40df4e5 (diff)
download qpdf-891751f618fb95b82af289edfd2e1219e3522e6f.tar.zst
Remove unreferenced resources only from relevant pages
-rw-r--r-- ChangeLog 5
-rw-r--r-- TODO 7
-rw-r--r-- manual/qpdf-manual.xml 9
-rw-r--r-- qpdf/qpdf.cc 12
-rw-r--r-- qpdf/qtest/qpdf.test 7
-rw-r--r-- qpdf/qtest/qpdf/shared-images-errors-1.out 3
6 files changed, 28 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index 1e31efb4..40eb1014 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2021-01-04 Jay Berkenbilt <ejb@ql.org>
+ * When qpdf CLI extracts pages, it now only attempts to remove
+ unreferenced resources from the pages that it is keeping. This
+ change dramatically reduces the time it takes to extract a small
+ number of pages from a large, complex file.
+
* Move getNext()->write() calls in some pipelines to ensure that
state gates properly reset even if the next pipeline's write
throws an exception (fuzz issue 28262).
diff --git a/TODO b/TODO
index 7b620a96..9d687d56 100644
--- a/TODO
+++ b/TODO
@@ -1,10 +1,3 @@
-Candidates for upcoming release
-===============================
-
-* Remember to check work `qpdf` project for private issues
- * file with very slow page extraction
- * big page even with --remove-unreferenced-resources=yes, even with --empty
-
Fuzz Errors
===========
diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml
index f93f4a17..98fca51e 100644
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@@ -5001,6 +5001,15 @@ print "\n";
<literal>/DecodeParms</literal>.
</para>
</listitem>
+ <listitem>
+ <para>
+ When extracting pages, the <command>qpdf</command> CLI only
+ removes unreferenced resources from the pages that are being
+ kept, resulting in a significant performance improvement
+ when extracting small numbers of pages from large, complex
+ documents.
+ </para>
+ </listitem>
</itemizedlist>
</listitem>
<listitem>
diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc
index 0d1ab988..2e35c96f 100644
--- a/qpdf/qpdf.cc
+++ b/qpdf/qpdf.cc
@@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF& pdf, Options& o)
page_spec.range));
}
+ std::map<unsigned long long, bool> remove_unreferenced;
if (o.remove_unreferenced_page_resources != re_no)
{
for (std::map<std::string, QPDF*>::iterator iter =
@@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF& pdf, Options& o)
cis->stayOpen(true);
}
QPDF& other(*((*iter).second));
- if (should_remove_unreferenced_resources(other, o))
+ auto other_uuid = other.getUniqueId();
+ if (remove_unreferenced.count(other_uuid) == 0)
{
- QPDFPageDocumentHelper dh(other);
- dh.removeUnreferencedResources();
+ remove_unreferenced[other_uuid] =
+ should_remove_unreferenced_resources(other, o);
}
if (cis)
{
@@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF& pdf, Options& o)
else
{
copied_pages[from_uuid].insert(to_copy_og);
+ if (remove_unreferenced[from_uuid])
+ {
+ to_copy.removeUnreferencedResources();
+ }
}
dh.addPage(to_copy, false);
if (page_data.qpdf == &pdf)
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index 6919bfcf..83cbacd3 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -2247,12 +2247,15 @@ $td->runtest("check output",
{$td->FILE => "a.pdf"},
{$td->FILE => "shared-images-errors-2-out.pdf"});
+# This test used to generate warnings about images on pages we didn't
+# care about, but qpdf was modified not to process those pages, so the
+# "irrelevant" errors went away.
$td->runtest("shared resources irrelevant errors",
{$td->COMMAND =>
"qpdf --qdf --static-id" .
" shared-images-errors.pdf --pages . 1 -- a.pdf"},
- {$td->FILE => "shared-images-errors-1.out",
- $td->EXIT_STATUS => 3},
+ {$td->STRING => "",
+ $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("check output",
{$td->FILE => "a.pdf"},
diff --git a/qpdf/qtest/qpdf/shared-images-errors-1.out b/qpdf/qtest/qpdf/shared-images-errors-1.out
deleted file mode 100644
index 5b98f88f..00000000
--- a/qpdf/qtest/qpdf/shared-images-errors-1.out
+++ /dev/null
@@ -1,3 +0,0 @@
-WARNING: shared-images-errors.pdf (offset 4933): error decoding stream data for object 19 0: stream inflate: inflate: data: incorrect header check
-WARNING: shared-images-errors.pdf, object 4 0 at offset 676: Unable to parse content stream: content stream (content stream object 19 0): errors while decoding content stream; not attempting to remove unreferenced objects from this page
-qpdf: operation succeeded with warnings; resulting file may have some problems