6 files changed, 83 insertions, 35 deletions
diff --git a/ChangeLog b/ChangeLog
index 036c2bca..d9b09752 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2021-03-01  Jay Berkenbilt  <ejb@ql.org>
 
+	* Improve code that finds unreferenced resources to ignore names
+	in the content stream that are not fonts or XObjects. This should
+	reduce the number of cases when qpdf needlessly decides not to
+	remove unreferenced resources. Hopefully it doesn't create any new
+	bugs where it removes unreferenced resources that it isn't
+	supposed to.
+
 	* QPDFObjectHandle::ParserCallbacks: add virtual handleWarning
 	method, and provide default (empty) implementation of it and
 	handleEOF().
diff --git a/TODO b/TODO
index b7562008..5a0b8e0b 100644
--- a/TODO
+++ b/TODO
@@ -34,15 +34,6 @@ Document-level work
   --copy-attachments-from to preserve these. What will the strategy be
   for deduplicating in the automatic case?
 
-* When I get to tagged PDF, note that the presence of /Artifact and
-  /Standard (and maybe others?) causes a false positive on detection
-  of unresolved names. Example: form-fields-and-annotations.pdf. This
-  used to give a warning (never in a released version), but the
-  warning was removed. See comments about tagged pdf in
-  QPDFPageObjectHelper::removeUnreferencedResourcesHelper. Another
-  potential solution is to recognize names that refer to fonts and
-  xobjects but only looking at names used with Tf and Do operators.
-
 Fuzz Errors
 ===========
 
diff --git a/libqpdf/QPDFPageObjectHelper.cc b/libqpdf/QPDFPageObjectHelper.cc
index 58144a3f..344ff15e 100644
--- a/libqpdf/QPDFPageObjectHelper.cc
+++ b/libqpdf/QPDFPageObjectHelper.cc
@@ -684,7 +684,7 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
     ResourceFinder rf;
     try
     {
-        ph.filterContents(&rf);
+        ph.parseContents(&rf);
     }
     catch (std::exception& e)
     {
@@ -711,9 +711,9 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
     QPDFObjectHandle resources = ph.getAttribute("/Resources", true);
     std::vector<QPDFObjectHandle> rdicts;
     std::set<std::string> known_names;
+    std::vector<std::string> to_filter = {"/Font", "/XObject"};
     if (resources.isDictionary())
     {
-        std::vector<std::string> to_filter = {"/Font", "/XObject"};
         for (auto const& iter: to_filter)
         {
             QPDFObjectHandle dict = resources.getKey(iter);
@@ -729,12 +729,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
     }
 
     std::set<std::string> local_unresolved;
-    for (auto const& name: rf.getNames())
+    auto names_by_rtype = rf.getNamesByResourceType();
+    for (auto const& i1: to_filter)
     {
-        if (! known_names.count(name))
+        for (auto const& n_iter: names_by_rtype[i1])
         {
-            unresolved.insert(name);
-            local_unresolved.insert(name);
+            std::string const& name = n_iter.first;
+            if (! known_names.count(name))
+            {
+                unresolved.insert(name);
+                local_unresolved.insert(name);
+            }
         }
     }
     // Older versions of the PDF spec allowed form XObjects to omit
@@ -754,11 +759,17 @@ QPDFPageObjectHelper::removeUnreferencedResourcesHelper(
 
     if ((! local_unresolved.empty()) && resources.isDictionary())
     {
-        // Don't issue a warning for this case. There are some cases
-        // of names that aren't XObject references, for example,
-        // /Artifact in tagged PDF. Until we are certain that we know
-        // the meaning of every name in a content stream, we don't
-        // want to give warnings because they will be false positives.
+        // It's not worth issuing a warning for this case. From qpdf
+        // 10.3, we are hopefully only looking at names that are
+        // referencing fonts and XObjects, but until we're certain
+        // that we know the meaning of every name in a content stream,
+        // we don't want to give warnings that might be false
+        // positives. Also, this can happen in legitimate cases with
+        // older PDFs, and there's nothing to be done about it, so
+        // there's no good reason to issue a warning. The only sad
+        // thing is that it was a false positive that alerted me to a
+        // logic error in the code, and any future such errors would
+        // now be hidden.
         QTC::TC("qpdf", "QPDFPageObjectHelper unresolved names");
         return false;
     }
diff --git a/libqpdf/ResourceFinder.cc b/libqpdf/ResourceFinder.cc
index 74ba671f..6b9929e4 100644
--- a/libqpdf/ResourceFinder.cc
+++ b/libqpdf/ResourceFinder.cc
@@ -1,28 +1,53 @@
 #include <qpdf/ResourceFinder.hh>
 
 ResourceFinder::ResourceFinder() :
+    last_name_offset(0),
     saw_bad(false)
 {
 }
 
 void
-ResourceFinder::handleToken(QPDFTokenizer::Token const& token)
+ResourceFinder::handleObject(QPDFObjectHandle obj, size_t offset, size_t)
 {
-    if ((token.getType() == QPDFTokenizer::tt_word) &&
-        (! this->last_name.empty()))
+    if (obj.isOperator() && (! this->last_name.empty()))
     {
-        this->names.insert(this->last_name);
+        static std::map<std::string, std::string> op_to_rtype = {
+            {"CS", "/ColorSpace"},
+            {"cs", "/ColorSpace"},
+            {"gs", "/ExtGState"},
+            {"Tf", "/Font"},
+            {"SCN", "/Pattern"},
+            {"scn", "/Pattern"},
+            {"BDC", "/Properties"},
+            {"DP", "/Properties"},
+            {"sh", "/Shading"},
+            {"Do", "/XObject"},
+        };
+        std::string op = obj.getOperatorValue();
+        std::string resource_type;
+        auto iter = op_to_rtype.find(op);
+        if (iter != op_to_rtype.end())
+        {
+            resource_type = iter->second;
+        }
+        if (! resource_type.empty())
+        {
+            this->names.insert(this->last_name);
+            this->names_by_resource_type[
+                resource_type][this->last_name].insert(this->last_name_offset);
+        }
     }
-    else if (token.getType() == QPDFTokenizer::tt_name)
+    else if (obj.isName())
     {
-        this->last_name =
-            QPDFObjectHandle::newName(token.getValue()).getName();
+        this->last_name = obj.getName();
+        this->last_name_offset = offset;
     }
-    else if (token.getType() == QPDFTokenizer::tt_bad)
-    {
-        saw_bad = true;
-    }
-    writeToken(token);
+}
+
+void
+ResourceFinder::handleWarning()
+{
+    this->saw_bad = true;
 }
 
 std::set<std::string> const&
@@ -31,6 +56,12 @@ ResourceFinder::getNames() const
     return this->names;
 }
 
+std::map<std::string, std::map<std::string, std::set<size_t>>> const&
+ResourceFinder::getNamesByResourceType() const
+{
+    return this->names_by_resource_type;
+}
+
 bool
 ResourceFinder::sawBad() const
 {
diff --git a/libqpdf/qpdf/ResourceFinder.hh b/libqpdf/qpdf/ResourceFinder.hh
index 0ac74eab..ac3d5b4c 100644
--- a/libqpdf/qpdf/ResourceFinder.hh
+++ b/libqpdf/qpdf/ResourceFinder.hh
@@ -3,19 +3,26 @@
 
 #include <qpdf/QPDFObjectHandle.hh>
 
-class ResourceFinder: public QPDFObjectHandle::TokenFilter
+class ResourceFinder: public QPDFObjectHandle::ParserCallbacks
 {
   public:
     ResourceFinder();
     virtual ~ResourceFinder() = default;
-    virtual void handleToken(QPDFTokenizer::Token const&) override;
+    virtual void handleObject(QPDFObjectHandle, size_t, size_t) override;
+    virtual void handleWarning() override;
     std::set<std::string> const& getNames() const;
+    std::map<std::string,
+             std::map<std::string,
+                      std::set<size_t>>> const& getNamesByResourceType() const;
     bool sawBad() const;
 
   private:
     std::string last_name;
+    size_t last_name_offset;
     std::set<std::string> names;
-    std::map<std::string, std::set<std::string>> names_by_resource_type;
+    std::map<std::string,
+             std::map<std::string,
+                      std::set<size_t>>> names_by_resource_type;
     bool saw_bad;
 };
 
diff --git a/qpdf/qtest/qpdf/split-tokens-split.out b/qpdf/qtest/qpdf/split-tokens-split.out
index ab9f3b7a..8e1003be 100644
--- a/qpdf/qtest/qpdf/split-tokens-split.out
+++ b/qpdf/qtest/qpdf/split-tokens-split.out
@@ -1,3 +1,4 @@
+WARNING: page object 3 0 stream 5 0, stream 7 0, stream 9 0, stream 11 0 (content, offset 375): null character not allowed in name token
 WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this object
 WARNING: empty PDF: content normalization encountered bad tokens
 WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents