summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 7
-rw-r--r-- TODO 9
-rw-r--r-- libqpdf/QPDFPageObjectHelper.cc 33
-rw-r--r-- libqpdf/ResourceFinder.cc 55
-rw-r--r-- libqpdf/qpdf/ResourceFinder.hh 13
-rw-r--r-- qpdf/qtest/qpdf/split-tokens-split.out 1
6 files changed, 83 insertions, 35 deletions
diff --git a/ChangeLog b/ChangeLog
index 036c2bca..d9b09752 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
2021-03-01 Jay Berkenbilt <ejb@ql.org>
+ * Improve code that finds unreferenced resources to ignore names
+ in the content stream that are not fonts or XObjects. This should
+ reduce the number of cases when qpdf needlessly decides not to
+ remove unreferenced resources. Hopefully it doesn't create any new
+ bugs where it removes unreferenced resources that it isn't
+ supposed to.
+
* QPDFObjectHandle::ParserCallbacks: add virtual handleWarning
method, and provide default (empty) implementation of it and
handleEOF().
diff --git a/TODO b/TODO
index b7562008..5a0b8e0b 100644
--- a/TODO
+++ b/TODO
@@ -34,15 +34,6 @@ Document-level work
--copy-attachments-from to preserve these. What will the strategy be
for deduplicating in the automatic case?
-* When I get to tagged PDF, note that the presence of /Artifact and
- /Standard (and maybe others?) causes a false positive on detection
- of unresolved names. Example: form-fields-and-annotations.pdf. This
- used to give a warning (never in a released version), but the
- warning was removed. See comments about tagged pdf in
- QPDFPageObjectHelper::removeUnreferencedResourcesHelper. Another
- potential solution is to recognize names that refer to fonts and
- xobjects but only looking at names used with Tf and Do operators.
-
Fuzz Errors
===========
diff --git a/libqpdf/QPDFPageObjectHelper.cc b/libqpdf/QPDFPageObjectHelper.cc
index 58144a3f..344ff15e 100644
--- a/libqpdf/QPDFPageObjectHelper.cc
+++ b/libqpdf/QPDFPageObjectHelper.cc
@@ -684,7 +684,7 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
ResourceFinder rf;
try
{
- ph.filterContents(&rf);
+ ph.parseContents(&rf);
}
catch (std::exception& e)
{
@@ -711,9 +711,9 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
QPDFObjectHandle resources = ph.getAttribute("/Resources", true);
std::vector<QPDFObjectHandle> rdicts;
std::set<std::string> known_names;
+ std::vector<std::string> to_filter = {"/Font", "/XObject"};
if (resources.isDictionary())
{
- std::vector<std::string> to_filter = {"/Font", "/XObject"};
for (auto const& iter: to_filter)
{
QPDFObjectHandle dict = resources.getKey(iter);
@@ -729,12 +729,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
}
std::set<std::string> local_unresolved;
- for (auto const& name: rf.getNames())
+ auto names_by_rtype = rf.getNamesByResourceType();
+ for (auto const& i1: to_filter)
{
- if (! known_names.count(name))
+ for (auto const& n_iter: names_by_rtype[i1])
{
- unresolved.insert(name);
- local_unresolved.insert(name);
+ std::string const& name = n_iter.first;
+ if (! known_names.count(name))
+ {
+ unresolved.insert(name);
+ local_unresolved.insert(name);
+ }
}
}
// Older versions of the PDF spec allowed form XObjects to omit
@@ -754,11 +759,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
if ((! local_unresolved.empty()) && resources.isDictionary())
{
- // Don't issue a warning for this case. There are some cases
- // of names that aren't XObject references, for example,
- // /Artifact in tagged PDF. Until we are certain that we know
- // the meaning of every name in a content stream, we don't
- // want to give warnings because they will be false positives.
+ // It's not worth issuing a warning for this case. From qpdf
+ // 10.3, we are hopefully only looking at names that are
+ // referencing fonts and XObjects, but until we're certain
+ // that we know the meaning of every name in a content stream,
+ // we don't want to give warnings that might be false
+ // positives. Also, this can happen in legitimate cases with
+ // older PDFs, and there's nothing to be done about it, so
+ // there's no good reason to issue a warning. The only sad
+ // thing is that it was a false positive that alerted me to a
+ // logic error in the code, and any future such errors would
+ // now be hidden.
QTC::TC("qpdf", "QPDFPageObjectHelper unresolved names");
return false;
}
diff --git a/libqpdf/ResourceFinder.cc b/libqpdf/ResourceFinder.cc
index 74ba671f..6b9929e4 100644
--- a/libqpdf/ResourceFinder.cc
+++ b/libqpdf/ResourceFinder.cc
@@ -1,28 +1,53 @@
#include <qpdf/ResourceFinder.hh>
ResourceFinder::ResourceFinder() :
+ last_name_offset(0),
saw_bad(false)
{
}
void
-ResourceFinder::handleToken(QPDFTokenizer::Token const& token)
+ResourceFinder::handleObject(QPDFObjectHandle obj, size_t offset, size_t)
{
- if ((token.getType() == QPDFTokenizer::tt_word) &&
- (! this->last_name.empty()))
+ if (obj.isOperator() && (! this->last_name.empty()))
{
- this->names.insert(this->last_name);
+ static std::map<std::string, std::string> op_to_rtype = {
+ {"CS", "/ColorSpace"},
+ {"cs", "/ColorSpace"},
+ {"gs", "/ExtGState"},
+ {"Tf", "/Font"},
+ {"SCN", "/Pattern"},
+ {"scn", "/Pattern"},
+ {"BDC", "/Properties"},
+ {"DP", "/Properties"},
+ {"sh", "/Shading"},
+ {"Do", "/XObject"},
+ };
+ std::string op = obj.getOperatorValue();
+ std::string resource_type;
+ auto iter = op_to_rtype.find(op);
+ if (iter != op_to_rtype.end())
+ {
+ resource_type = iter->second;
+ }
+ if (! resource_type.empty())
+ {
+ this->names.insert(this->last_name);
+ this->names_by_resource_type[
+ resource_type][this->last_name].insert(this->last_name_offset);
+ }
}
- else if (token.getType() == QPDFTokenizer::tt_name)
+ else if (obj.isName())
{
- this->last_name =
- QPDFObjectHandle::newName(token.getValue()).getName();
+ this->last_name = obj.getName();
+ this->last_name_offset = offset;
}
- else if (token.getType() == QPDFTokenizer::tt_bad)
- {
- saw_bad = true;
- }
- writeToken(token);
+}
+
+void
+ResourceFinder::handleWarning()
+{
+ this->saw_bad = true;
}
std::set<std::string> const&
@@ -31,6 +56,12 @@ ResourceFinder::getNames() const
return this->names;
}
+std::map<std::string, std::map<std::string, std::set<size_t>>> const&
+ResourceFinder::getNamesByResourceType() const
+{
+ return this->names_by_resource_type;
+}
+
bool
ResourceFinder::sawBad() const
{
diff --git a/libqpdf/qpdf/ResourceFinder.hh b/libqpdf/qpdf/ResourceFinder.hh
index 0ac74eab..ac3d5b4c 100644
--- a/libqpdf/qpdf/ResourceFinder.hh
+++ b/libqpdf/qpdf/ResourceFinder.hh
@@ -3,19 +3,26 @@
#include <qpdf/QPDFObjectHandle.hh>
-class ResourceFinder: public QPDFObjectHandle::TokenFilter
+class ResourceFinder: public QPDFObjectHandle::ParserCallbacks
{
public:
ResourceFinder();
virtual ~ResourceFinder() = default;
- virtual void handleToken(QPDFTokenizer::Token const&) override;
+ virtual void handleObject(QPDFObjectHandle, size_t, size_t) override;
+ virtual void handleWarning() override;
std::set<std::string> const& getNames() const;
+ std::map<std::string,
+ std::map<std::string,
+ std::set<size_t>>> const& getNamesByResourceType() const;
bool sawBad() const;
private:
std::string last_name;
+ size_t last_name_offset;
std::set<std::string> names;
- std::map<std::string, std::set<std::string>> names_by_resource_type;
+ std::map<std::string,
+ std::map<std::string,
+ std::set<size_t>>> names_by_resource_type;
bool saw_bad;
};
diff --git a/qpdf/qtest/qpdf/split-tokens-split.out b/qpdf/qtest/qpdf/split-tokens-split.out
index ab9f3b7a..8e1003be 100644
--- a/qpdf/qtest/qpdf/split-tokens-split.out
+++ b/qpdf/qtest/qpdf/split-tokens-split.out
@@ -1,3 +1,4 @@
+WARNING: page object 3 0 stream 5 0, stream 7 0, stream 9 0, stream 11 0 (content, offset 375): null character not allowed in name token
WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this object
WARNING: empty PDF: content normalization encountered bad tokens
WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents