aboutsummaryrefslogtreecommitdiffstats
path: root/qpdf/qpdf.cc
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2020-04-04 18:36:45 +0200
committerJay Berkenbilt <ejb@ql.org>2020-04-04 19:19:49 +0200
commit67d5ed3a64a81f9192c17dc71f02e69f60f8a1f8 (patch)
tree98dca0bf8694fceaa3db17f91a381fb7d35eab6c /qpdf/qpdf.cc
parent1e766dcda26886137cb70fa02de631a1cbb956a0 (diff)
downloadqpdf-67d5ed3a64a81f9192c17dc71f02e69f60f8a1f8.tar.zst
Implement remove-unreferenced-resources=auto
Diffstat (limited to 'qpdf/qpdf.cc')
-rw-r--r--qpdf/qpdf.cc144
1 files changed, 141 insertions, 3 deletions
diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc
index 442ee6cc..38aa2f4a 100644
--- a/qpdf/qpdf.cc
+++ b/qpdf/qpdf.cc
@@ -4750,6 +4750,140 @@ static void handle_transformations(QPDF& pdf, Options& o)
}
}
+static bool should_remove_unreferenced_resources(QPDF& pdf, Options& o)
+{
+ if (o.remove_unreferenced_page_resources == re_no)
+ {
+ return false;
+ }
+ else if (o.remove_unreferenced_page_resources == re_yes)
+ {
+ return true;
+ }
+
+ // Unreferenced resources are common in files where resources
+ // dictionaries are shared across pages. As a heuristic, we look
+ // in the file for shared resources dictionaries or shared XObject
+ // subkeys of resources dictionaries either on pages or on form
+ // XObjects in pages. If we find any, then there is a higher
+ // likeilihood that the expensive process of finding unreferenced
+ // resources is worth it.
+
+ // Return true as soon as we find any shared resources.
+
+ std::set<QPDFObjGen> resources_seen; // shared resources detection
+ std::set<QPDFObjGen> nodes_seen; // loop detection
+
+ if (o.verbose)
+ {
+ std::cout << whoami << ": " << pdf.getFilename()
+ << ": checking for shared resources" << std::endl;
+ }
+
+ std::list<QPDFObjectHandle> queue;
+ queue.push_back(pdf.getRoot().getKey("/Pages"));
+ while (! queue.empty())
+ {
+ QPDFObjectHandle node = *queue.begin();
+ QPDFObjGen og = node.getObjGen();
+ if (nodes_seen.count(og))
+ {
+ continue;
+ }
+ nodes_seen.insert(og);
+ queue.pop_front();
+ QPDFObjectHandle dict = node.isStream() ? node.getDict() : node;
+ QPDFObjectHandle kids = dict.getKey("/Kids");
+ if (kids.isArray())
+ {
+ // This is a non-leaf node.
+ if (dict.hasKey("/Resources"))
+ {
+ QTC::TC("qpdf", "qpdf found resources in non-leaf");
+ if (o.verbose)
+ {
+ std::cout << " found resources in non-leaf page node "
+ << og.getObj() << " " << og.getGen()
+ << std::endl;
+ }
+ return true;
+ }
+ int n = kids.getArrayNItems();
+ for (int i = 0; i < n; ++i)
+ {
+ queue.push_back(kids.getArrayItem(i));
+ }
+ }
+ else
+ {
+ // This is a leaf node or a form XObject.
+ QPDFObjectHandle resources = dict.getKey("/Resources");
+ if (resources.isIndirect())
+ {
+ QPDFObjGen resources_og = resources.getObjGen();
+ if (resources_seen.count(resources_og))
+ {
+ QTC::TC("qpdf", "qpdf found shared resources in leaf");
+ if (o.verbose)
+ {
+ std::cout << " found shared resources in leaf node "
+ << og.getObj() << " " << og.getGen()
+ << ": "
+ << resources_og.getObj() << " "
+ << resources_og.getGen()
+ << std::endl;
+ }
+ return true;
+ }
+ resources_seen.insert(resources_og);
+ }
+ QPDFObjectHandle xobject = resources.getKey("/XObject");
+ if (xobject.isIndirect())
+ {
+ QPDFObjGen xobject_og = xobject.getObjGen();
+ if (resources_seen.count(xobject_og))
+ {
+ QTC::TC("qpdf", "qpdf found shared xobject in leaf");
+ if (o.verbose)
+ {
+ std::cout << " found shared xobject in leaf node "
+ << og.getObj() << " " << og.getGen()
+ << ": "
+ << xobject_og.getObj() << " "
+ << xobject_og.getGen()
+ << std::endl;
+ }
+ return true;
+ }
+ resources_seen.insert(xobject_og);
+ }
+ if (xobject.isDictionary())
+ {
+ for (auto k: xobject.getKeys())
+ {
+ QPDFObjectHandle xobj = xobject.getKey(k);
+ if (xobj.isStream() &&
+ xobj.getDict().getKey("/Type").isName() &&
+ ("/XObject" ==
+ xobj.getDict().getKey("/Type").getName()) &&
+ xobj.getDict().getKey("/Subtype").isName() &&
+ ("/Form" ==
+ xobj.getDict().getKey("/Subtype").getName()))
+ {
+ queue.push_back(xobj);
+ }
+ }
+ }
+ }
+ }
+
+ if (o.verbose)
+ {
+ std::cout << whoami << ": no shared resources found" << std::endl;
+ }
+ return false;
+}
+
static void handle_page_specs(QPDF& pdf, Options& o)
{
// Parse all page specifications and translate them into lists of
@@ -4883,8 +5017,12 @@ static void handle_page_specs(QPDF& pdf, Options& o)
cis = page_spec_cfis[filename];
cis->stayOpen(true);
}
- QPDFPageDocumentHelper dh(*((*iter).second));
- dh.removeUnreferencedResources();
+ QPDF& other(*((*iter).second));
+ if (should_remove_unreferenced_resources(other, o))
+ {
+ QPDFPageDocumentHelper dh(other);
+ dh.removeUnreferencedResources();
+ }
if (cis)
{
cis->stayOpen(false);
@@ -5368,7 +5506,7 @@ static void do_split_pages(QPDF& pdf, Options& o)
before = std::string(o.outfilename) + "-";
}
- if (o.remove_unreferenced_page_resources != re_no)
+ if (should_remove_unreferenced_resources(pdf, o))
{
QPDFPageDocumentHelper dh(pdf);
dh.removeUnreferencedResources();