aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog8
-rw-r--r--include/qpdf/QPDF.hh23
-rw-r--r--libqpdf/QPDF_optimization.cc3
-rw-r--r--libqpdf/QPDF_pages.cc26
-rw-r--r--manual/qpdf-manual.xml7
-rw-r--r--qpdf/qpdf.testcov1
-rw-r--r--qpdf/qtest/qpdf.test26
-rw-r--r--qpdf/qtest/qpdf/extract-duplicate-page.pdfbin0 -> 1370 bytes
-rw-r--r--qpdf/qtest/qpdf/linearize-duplicate-page.pdfbin0 -> 1899 bytes
-rw-r--r--qpdf/qtest/qpdf/page_api_2.out2
-rw-r--r--qpdf/test_driver.cc19
11 files changed, 98 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index 9536fd79..f24e1b75 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2019-01-28 Jay Berkenbilt <ejb@ql.org>
+
+ * When linearizing or getting the list of all pages in a file,
+ replace duplicated page objects with a shallow copy of the page
+ object. Linearization and all page manipulation APIs require page
+ objects to be unique. Pages that were originally duplicated will
+ still share contents and any other indirect resources. Fixes #268.
+
2019-01-26 Jay Berkenbilt <ejb@ql.org>
* Add --overlay and --underlay options. Fixes #207.
diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh
index ef0f0eb7..cfb10aac 100644
--- a/include/qpdf/QPDF.hh
+++ b/include/qpdf/QPDF.hh
@@ -527,15 +527,16 @@ class QPDF
void optimize(std::map<int, int> const& object_stream_data,
bool allow_changes = true);
- // Traverse page tree return all /Page objects. For efficiency,
- // this method returns a const reference to an internal vector of
- // pages. Calls to addPage, addPageAt, and removePage safely
- // update this, but directly manipulation of the pages three or
- // pushing inheritable objects to the page level may invalidate
- // it. See comments for updateAllPagesCache() for additional
- // notes. Newer code should use
- // QPDFPageDocumentHelper::getAllPages instead. The decision to
- // expose this internal cache was arguably incorrect, but it is
+ // Traverse page tree return all /Page objects. It also detects
+ // and resolves cases in which the same /Page object is
+ // duplicated. For efficiency, this method returns a const
+ // reference to an internal vector of pages. Calls to addPage,
+ // addPageAt, and removePage safely update this, but directly
+ // manipulation of the pages three or pushing inheritable objects
+ // to the page level may invalidate it. See comments for
+ // updateAllPagesCache() for additional notes. Newer code should
+ // use QPDFPageDocumentHelper::getAllPages instead. The decision
+ // to expose this internal cache was arguably incorrect, but it is
// being left here for compatibility. It is, however, completely
// safe to use this for files that you are not modifying.
QPDF_DLL
@@ -895,6 +896,10 @@ class QPDF
void getAllPagesInternal2(QPDFObjectHandle cur_pages,
std::vector<QPDFObjectHandle>& result,
std::set<QPDFObjGen>& visited);
+ void getAllPagesInternal3(QPDFObjectHandle cur_pages,
+ std::vector<QPDFObjectHandle>& result,
+ std::set<QPDFObjGen>& visited,
+ std::set<QPDFObjGen>& seen);
void insertPage(QPDFObjectHandle newpage, int pos);
int findPage(QPDFObjGen const& og);
int findPage(QPDFObjectHandle& page);
diff --git a/libqpdf/QPDF_optimization.cc b/libqpdf/QPDF_optimization.cc
index f282e5f3..ecece7db 100644
--- a/libqpdf/QPDF_optimization.cc
+++ b/libqpdf/QPDF_optimization.cc
@@ -156,6 +156,9 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
return;
}
+ // Calling getAllPages() resolves any duplicated page objects.
+ getAllPages();
+
// key_ancestors is a mapping of page attribute keys to a stack of
// Pages nodes that contain values for them.
std::map<std::string, std::vector<QPDFObjectHandle> > key_ancestors;
diff --git a/libqpdf/QPDF_pages.cc b/libqpdf/QPDF_pages.cc
index ea5afdb5..7b6b369e 100644
--- a/libqpdf/QPDF_pages.cc
+++ b/libqpdf/QPDF_pages.cc
@@ -62,8 +62,18 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages,
void
QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
- std::vector<QPDFObjectHandle>& result,
- std::set<QPDFObjGen>& visited)
+ std::vector<QPDFObjectHandle>& result,
+ std::set<QPDFObjGen>& visited)
+{
+ std::set<QPDFObjGen> seen;
+ getAllPagesInternal3(cur_pages, result, visited, seen);
+}
+
+void
+QPDF::getAllPagesInternal3(QPDFObjectHandle cur_pages,
+ std::vector<QPDFObjectHandle>& result,
+ std::set<QPDFObjGen>& visited,
+ std::set<QPDFObjGen>& seen)
{
QPDFObjGen this_og = cur_pages.getObjGen();
if (visited.count(this_og) > 0)
@@ -94,11 +104,21 @@ QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
int n = kids.getArrayNItems();
for (int i = 0; i < n; ++i)
{
- getAllPagesInternal2(kids.getArrayItem(i), result, visited);
+ QPDFObjectHandle kid = kids.getArrayItem(i);
+ if (seen.count(kid.getObjGen()))
+ {
+ // Make a copy of the page. This does the same as
+ // shallowCopyPage in QPDFPageObjectHelper.
+ QTC::TC("qpdf", "QPDF resolve duplicated page object");
+ kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy());
+ kids.setArrayItem(i, kid);
+ }
+ getAllPagesInternal3(kid, result, visited, seen);
}
}
else if (type == "/Page")
{
+ seen.insert(this_og);
result.push_back(cur_pages);
}
else
diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml
index b673be25..aaeaeb54 100644
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@@ -4410,6 +4410,13 @@ print "\n";
suite and properly handled.
</para>
</listitem>
+ <listitem>
+ <para>
+ Linearization and page manipulation APIs now detect and
+ recover from files that have duplicate Page objects in the
+ pages tree.
+ </para>
+ </listitem>
</itemizedlist>
</listitem>
<listitem>
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 821880cf..98857597 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -428,3 +428,4 @@ QPDFFormFieldObjectHelper fallback Tf 0
QPDFPageObjectHelper non-trivial inheritance 0
QPDFPageObjectHelper copy shared attribute 0
qpdf from_nr from repeat_nr 0
+QPDF resolve duplicated page object 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index 946ffff6..9b72706f 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -582,7 +582,7 @@ $td->runtest("check output",
{$td->FILE => "page_api_1-out2.pdf"});
$td->runtest("duplicate page",
{$td->COMMAND => "test_driver 17 page_api_2.pdf"},
- {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 2},
+ {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("delete and re-add a page",
{$td->COMMAND => "test_driver 18 page_api_1.pdf"},
@@ -1729,6 +1729,30 @@ foreach my $f (qw(screen print))
show_ntests();
# ----------
+$td->notify("--- Duplicated Page Object ---");
+$n_tests += 4;
+
+$td->runtest("linearize duplicated pages",
+ {$td->COMMAND =>
+ "qpdf --static-id --linearize" .
+ " page_api_2.pdf a.pdf"},
+ {$td->STRING => "", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("compare files",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "linearize-duplicate-page.pdf"});
+$td->runtest("extract duplicated pages",
+ {$td->COMMAND =>
+ "qpdf --static-id page_api_2.pdf" .
+ " --pages . -- a.pdf"},
+ {$td->STRING => "", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("compare files",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "extract-duplicate-page.pdf"});
+
+show_ntests();
+# ----------
$td->notify("--- Merging and Splitting ---");
$n_tests += 22;
diff --git a/qpdf/qtest/qpdf/extract-duplicate-page.pdf b/qpdf/qtest/qpdf/extract-duplicate-page.pdf
new file mode 100644
index 00000000..191943aa
--- /dev/null
+++ b/qpdf/qtest/qpdf/extract-duplicate-page.pdf
Binary files differ
diff --git a/qpdf/qtest/qpdf/linearize-duplicate-page.pdf b/qpdf/qtest/qpdf/linearize-duplicate-page.pdf
new file mode 100644
index 00000000..27a8898e
--- /dev/null
+++ b/qpdf/qtest/qpdf/linearize-duplicate-page.pdf
Binary files differ
diff --git a/qpdf/qtest/qpdf/page_api_2.out b/qpdf/qtest/qpdf/page_api_2.out
index 9fc019c2..30e9b9c3 100644
--- a/qpdf/qtest/qpdf/page_api_2.out
+++ b/qpdf/qtest/qpdf/page_api_2.out
@@ -1 +1 @@
-page_api_2.pdf (page 1 (numbered from zero): object 4 0): duplicate page reference found; this would cause loss of data
+test 17 done
diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc
index a03c4b98..743a0082 100644
--- a/qpdf/test_driver.cc
+++ b/qpdf/test_driver.cc
@@ -923,11 +923,24 @@ void runtest(int n, char const* filename1, char const* arg2)
}
else if (n == 17)
{
- // The input file to this test case is broken to exercise an
- // error condition.
+ // The input file to this test case has a duplicated page.
+ QPDFObjectHandle page_kids =
+ pdf.getRoot().getKey("/Pages").getKey("/Kids");
+ assert(page_kids.getArrayItem(0).getObjGen() ==
+ page_kids.getArrayItem(1).getObjGen());
std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages();
+ assert(pages.size() == 3);
+ assert(! (pages.at(0).getObjGen() == pages.at(1).getObjGen()));
+ assert(QPDFObjectHandle(pages.at(0)).getKey("/Contents").getObjGen() ==
+ QPDFObjectHandle(pages.at(1)).getKey("/Contents").getObjGen());
pdf.removePage(pages.at(0));
- std::cout << "you can't see this" << std::endl;
+ assert(pages.size() == 2);
+ PointerHolder<Buffer> b = QPDFObjectHandle(pages.at(0)).
+ getKey("/Contents").getStreamData();
+ std::string contents = std::string(
+ reinterpret_cast<char const*>(b->getBuffer()),
+ b->getSize());
+ assert(contents.find("page 0") != std::string::npos);
}
else if (n == 18)
{