From 8971443e4680fc1c0babe56da58cc9070a9dae2e Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Mon, 5 Apr 2021 10:45:18 -0400 Subject: QPDF::addPage*: handle duplicate pages more robustly --- ChangeLog | 13 +++++++++++++ TODO | 25 ------------------------- include/qpdf/QPDFPageDocumentHelper.hh | 21 +++++++++++++++------ libqpdf/QPDF_pages.cc | 25 ++++++++++++++++++++++--- qpdf/qpdf.testcov | 2 +- qpdf/qtest/qpdf.test | 2 +- qpdf/qtest/qpdf/page_api_1.out | 2 +- qpdf/test_driver.cc | 16 +++++++++++----- 8 files changed, 64 insertions(+), 42 deletions(-) diff --git a/ChangeLog b/ChangeLog index b2f131c3..f720fa24 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2021-04-05 Jay Berkenbilt + + * When adding a page, if the page already exists, make a shallow + copy of the page instead of throwing an exception. This makes the + behavior of adding a page from the library consistent with what + the CLI does and also with what the library does if it starts with + a file that already has a duplicated page. Note that this means + that, in some cases, the page you pass to addPage or addPageAt + (either in QPDF or QPDFPageDocumentHelper) will not be the same + object that actually gets added. (This has actually always been + the case.) That means that, if you are going to do subsequent + modification on the page, you should retrieve it again. + 2021-03-11 Jay Berkenbilt * 10.3.1: release diff --git a/TODO b/TODO index a1f72e68..55c6a2c1 100644 --- a/TODO +++ b/TODO @@ -1,12 +1,6 @@ Document-level work =================== -* Implement the tree helper described in the ABI section and provide - the getPagesTree method so QPDF::getAllPages can be deprecated in - 10.4 and people have time to transition before qpdf 11. Maybe I - should have a qpdf 11 branch so I can make sure I have the - interfaces the way I want them before releasing 10.4. - * QPDFPageCopier -- object for moving pages around within files or between files and performing various transformations @@ -188,25 +182,6 @@ Comments appear in the code prefixed by "ABI" before copying, though maybe we don't because it could cause multiple copies to be made...usually it's better to handle that explicitly. -* (Also API) Create a helper class for tree structures like the pages - tree that have /Kids and /Count. It would be great to be able to - eliminate flattenPagesTree and all the page object to page caching, - and to get rid of getAllPages() that returns a reference to an - internal data structure. The tree helper object can have - bidirectional iterator and const_iterator, implement at and - operator[], and have reasonable insertion and deletion semantics. - Then a new getAllPages, perhaps getPagesTree, can return that and - make it easy to change existing code even if it assumes the result - of getAllPages() is mutating automatically as the pages tree is - manipulated. This will free us up to finally get rid of the - troublesome pages cache and its exposure by getAllPages. See also - note below with "balance the pages tree". Maybe we can get rid of - pageobj_to_pages_pos as well or convert it into something that is - merely advisory and not so heavily trusted. We should either make - the code not care if the same object appears more than once in the - pages tree or detect it and shallow copy, though the latter would - cause a significant performance penalty when manipulating pages if - we get rid of the cache. Page splitting/merging ====================== diff --git a/include/qpdf/QPDFPageDocumentHelper.hh b/include/qpdf/QPDFPageDocumentHelper.hh index 02c039b5..867ddac0 100644 --- a/include/qpdf/QPDFPageDocumentHelper.hh +++ b/include/qpdf/QPDFPageDocumentHelper.hh @@ -73,12 +73,21 @@ class QPDFPageDocumentHelper: public QPDFDocumentHelper // indirect. If it is an indirect object from another QPDF, this // method will call pushInheritedAttributesToPage on the other // file and then copy the page to this QPDF using the same - // underlying code as copyForeignObject. Note that you can call - // copyForeignObject directly to copy a page from a different - // file, but the resulting object will not be a page in the new - // file. You could do this, for example, to convert a page into a - // form XObject, though for that, you're better off using - // QPDFPageObjectHelper::getFormXObjectForPage. + // underlying code as copyForeignObject. At this stage, if the + // indirect object is already in the pages tree, a shallow copy is + // made to avoid adding the same page more than once. In version + // 10.3.1 and earlier, adding a page that already existed would + // throw an exception and could cause qpdf to crash on subsequent + // page insertions in some cases. Note that this means that, in + // some cases, the page actually added won't be exactly the same + // object as the one passed in. If you want to do subsequent + // modification on the page, you should retrieve it again. + // + // Note that you can call copyForeignObject directly to copy a + // page from a different file, but the resulting object will not + // be a page in the new file. You could do this, for example, to + // convert a page into a form XObject, though for that, you're + // better off using QPDFPageObjectHelper::getFormXObjectForPage. // // This method does not have any specific awareness of annotations // or form fields, so if you just add a page without thinking diff --git a/libqpdf/QPDF_pages.cc b/libqpdf/QPDF_pages.cc index 2d778415..8b55e0ef 100644 --- a/libqpdf/QPDF_pages.cc +++ b/libqpdf/QPDF_pages.cc @@ -14,7 +14,15 @@ // to all_pages and has been in the public API long before the // introduction of mutation APIs, so we're pretty much stuck with it. // Anyway, there are lots of calls to it in the library, so the -// efficiency of having it cached is probably worth keeping it. +// efficiency of having it cached is probably worth keeping it. At one +// point, I had partially implemented a helper class specifically for +// the pages tree, but once you work in all the logic that handles +// repairing the /Type keys of page tree nodes (both /Pages and /Page) +// and deal with duplicate pages, it's just as complex and less +// efficient than what's here. So, in spite of the fact that a const +// reference is returned, the current code is fine and does not need +// to be replaced. A partial implementation of QPDFPagesTree is in +// github in attic in case there is ever a reason to resurrect it. // The goal of this code is to ensure that the all_pages vector, which // users may have a reference to, and the pageobj_to_pages_pos map, @@ -178,7 +186,10 @@ QPDF::flattenPagesTree() size_t const len = this->m->all_pages.size(); for (size_t pos = 0; pos < len; ++pos) { - // populate pageobj_to_pages_pos and fix parent pointer + // Populate pageobj_to_pages_pos and fix parent pointer. There + // should be no duplicates at this point because + // pushInheritedAttributesToPage calls getAllPages which + // resolves duplicates. insertPageobjToPage(this->m->all_pages.at(pos), toI(pos), true); this->m->all_pages.at(pos).replaceKey("/Parent", pages); } @@ -201,7 +212,8 @@ QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, if (! this->m->pageobj_to_pages_pos.insert( std::make_pair(og, pos)).second) { - QTC::TC("qpdf", "QPDF duplicate page reference"); + // The library never calls insertPageobjToPage in a way + // that causes this to happen. setLastObjectDescription("page " + QUtil::int_to_string(pos) + " (numbered from zero)", og.getObj(), og.getGen()); @@ -246,6 +258,13 @@ QPDF::insertPage(QPDFObjectHandle newpage, int pos) (pos == QIntC::to_int(this->m->all_pages.size())) ? 1 : // at end 2); // insert in middle + auto og = newpage.getObjGen(); + if (this->m->pageobj_to_pages_pos.count(og)) + { + QTC::TC("qpdf", "QPDF resolve duplicated page in insert"); + newpage = makeIndirectObject(QPDFObjectHandle(newpage).shallowCopy()); + } + QPDFObjectHandle pages = getRoot().getKey("/Pages"); QPDFObjectHandle kids = pages.getKey("/Kids"); assert ((pos >= 0) && (QIntC::to_size(pos) <= this->m->all_pages.size())); diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index b4e7c46d..b2e8057f 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -193,7 +193,6 @@ qpdf-c called qpdf_init_write_memory 0 exercise processFile(name) 0 exercise processFile(FILE*) 0 exercise processMemoryFile 0 -QPDF duplicate page reference 0 QPDF remove page 2 QPDF insert page 2 QPDF updateAllPagesCache 0 @@ -592,3 +591,4 @@ QPDFAcroFormDocumentHelper /DA parse error 0 QPDFAcroFormDocumentHelper AP parse error 0 qpdf copy fields not this file 0 qpdf copy fields non-first from orig 0 +QPDF resolve duplicated page in insert 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index afb6668b..b3f99975 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -954,7 +954,7 @@ $td->runtest("check output", {$td->FILE => "page_api_1-out3.pdf"}); $td->runtest("duplicate page", {$td->COMMAND => "test_driver 19 page_api_1.pdf"}, - {$td->FILE => "page_api_1.out", $td->EXIT_STATUS => 2}, + {$td->FILE => "page_api_1.out", $td->EXIT_STATUS => 0}, $td->NORMALIZE_NEWLINES); $td->runtest("remove page we don't have", {$td->COMMAND => "test_driver 22 page_api_1.pdf"}, diff --git a/qpdf/qtest/qpdf/page_api_1.out b/qpdf/qtest/qpdf/page_api_1.out index f705b886..d6b9f43e 100644 --- a/qpdf/qtest/qpdf/page_api_1.out +++ b/qpdf/qtest/qpdf/page_api_1.out @@ -1 +1 @@ -page_api_1.pdf (page 10 (numbered from zero): object 9 0): duplicate page reference found; this would cause loss of data +test 19 done diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index 6c9086d8..9a21b82c 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -970,12 +970,18 @@ void runtest(int n, char const* filename1, char const* arg2) else if (n == 19) { // Remove a page and re-insert it in the same file. - QPDFPageDocumentHelper dh(pdf); - std::vector pages = dh.getAllPages(); + std::vector const& pages = pdf.getAllPages(); - // Try to insert a page that's already there. - dh.addPage(pages.at(5), false); - std::cout << "you can't see this" << std::endl; + // Try to insert a page that's already there. A shallow copy + // gets inserted instead. + auto newpage = pages.at(5); + size_t count = pages.size(); + pdf.addPage(newpage, false); + auto last = pages.back(); + assert(pages.size() == count + 1); + assert(! (last.getObjGen() == newpage.getObjGen())); + assert(last.getKey("/Contents").getObjGen() == + newpage.getKey("/Contents").getObjGen()); } else if (n == 20) { -- cgit v1.2.3-54-g00ecf