Add additional interface for filtering page contents

author: Jay Berkenbilt <ejb@ql.org> 2018-02-11 21:41:02 +0100
committer: Jay Berkenbilt <ejb@ql.org> 2018-02-19 03:05:47 +0100
commit: 5708b5d0aa9c94ab663509fbb865aa27a134aeb3 (patch)
tree: 30a85d51d3d720dfca0a09b9dba4eef0c3fe2bec
parent: fd02944e1953931e07f124448350db91038020af (diff)
download: qpdf-5708b5d0aa9c94ab663509fbb865aa27a134aeb3.tar.zst
9 files changed, 215 insertions, 8 deletions
diff --git a/ChangeLog b/ChangeLog
index 97d65238..0c298abb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2018-02-11  Jay Berkenbilt  <ejb@ql.org>
+
+	* Add QPDFObjectHandle::filterPageContents method to provide a
+	different interface for applying token filters to page contents
+	without modifying the ultimate output.
+
 2018-02-04  Jay Berkenbilt  <ejb@ql.org>
 
         * Changes listed on today's date are numerous and reflect
diff --git a/examples/build.mk b/examples/build.mk
index f5b44669..b5748c11 100644
--- a/examples/build.mk
+++ b/examples/build.mk
@@ -7,7 +7,8 @@ BINS_examples = \
 	pdf-create \
 	pdf-parse-content \
 	pdf-split-pages \
-	pdf-filter-tokens
+	pdf-filter-tokens \
+	pdf-count-strings
 CBINS_examples = pdf-linearize
 
 TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
diff --git a/examples/pdf-count-strings.cc b/examples/pdf-count-strings.cc
new file mode 100644
index 00000000..81718298
--- /dev/null
+++ b/examples/pdf-count-strings.cc
@@ -0,0 +1,131 @@
+//
+// This example illustrates the use of QPDFObjectHandle::TokenFilter
+// with filterPageContents. See also pdf-filter-tokens.cc for an
+// example that uses QPDFObjectHandle::TokenFilter with
+// addContentTokenFilter.
+//
+
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+
+#include <qpdf/QPDF.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFObjectHandle.hh>
+#include <qpdf/Pl_StdioFile.hh>
+
+static char const* whoami = 0;
+
+void usage() // Print a usage summary to stderr, then exit with status 2.
+{
+    std::cerr << "Usage: " << whoami << " infile" << std::endl
+	      << "Applies token filters to infile"
+              << std::endl;
+    exit(2); // does not return to the caller
+}
+
+class StringCounter: public QPDFObjectHandle::TokenFilter // token filter that tallies string tokens
+{
+  public:
+    StringCounter() :
+        count(0) // the tally starts at zero
+    {
+    }
+    virtual ~StringCounter()
+    {
+    }
+    virtual void handleToken(QPDFTokenizer::Token const&); // counts tt_string tokens and forwards every token
+    virtual void handleEOF(); // writes a summary comment, then calls finish()
+    int getCount() const; // number of string tokens counted so far
+
+  private:
+    int count; // running total of tt_string tokens
+};
+
+void
+StringCounter::handleToken(QPDFTokenizer::Token const& token)
+{
+    // Count string tokens; other token types leave the tally unchanged.
+    if (token.getType() == QPDFTokenizer::tt_string)
+    {
+        ++this->count;
+    }
+    // Preserve input verbatim by passing each token, counted or not,
+    // to any specified downstream filter.
+    writeToken(token);
+}
+
+void
+StringCounter::handleEOF()
+{
+    // Write a trailing comment ("% strings found: N") at the end of
+    // the stream just to show how we can enhance the output.
+    write("\n% strings found: ");
+    write(QUtil::int_to_string(this->count));
+    // If you override handleEOF, you must always remember to call finish().
+    finish(); // without this, some of the data written above may be lost
+}
+
+int
+StringCounter::getCount() const // read-only accessor for the string tally
+{
+    return this->count;
+}
+
+int main(int argc, char* argv[])
+{
+    whoami = QUtil::getWhoami(argv[0]);
+
+    // For libtool's sake: drop a leading "lt-" from the program name.
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+	whoami += 3;
+    }
+
+    if (argc != 2) // exactly one argument, the input file, is required
+    {
+	usage();
+    }
+    char const* infilename = argv[1];
+
+    try
+    {
+	QPDF pdf;
+	pdf.processFile(infilename);
+        std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+        int pageno = 0; // incremented before use, so pages are numbered from 1
+        for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
+             iter != pages.end(); ++iter)
+        {
+            QPDFObjectHandle page = *iter;
+            ++pageno;
+            // Pass the contents of a page through our string counter.
+            // If it's an even page, capture the output. This
+            // illustrates that you may capture any output generated
+            // by the filter, or you may ignore it.
+            StringCounter counter; // fresh counter, so totals are per page
+            if (pageno % 2)
+            {
+                // Ignore output for odd pages.
+                page.filterPageContents(&counter);
+            }
+            else
+            {
+                // Write output to stdout for even pages.
+                Pl_StdioFile out("stdout", stdout);
+                std::cout << "% Contents of page " << pageno << std::endl;
+                page.filterPageContents(&counter, &out);
+                std::cout << "\n% end " << pageno << std::endl;
+            }
+            std::cout << "Page " << pageno
+                      << ": strings = " << counter.getCount() << std::endl;
+        }
+    }
+    catch (std::exception& e) // report the failure and exit with status 2
+    {
+	std::cerr << whoami << ": " << e.what() << std::endl;
+	exit(2);
+    }
+
+    return 0;
+}
diff --git a/examples/pdf-filter-tokens.cc b/examples/pdf-filter-tokens.cc
index 2566f72c..809c160b 100644
--- a/examples/pdf-filter-tokens.cc
+++ b/examples/pdf-filter-tokens.cc
@@ -1,6 +1,8 @@
 //
-// This example illustrates the use of QPDFObjectHandle::TokenFilter.
-// Please see comments inline for details.
+// This example illustrates the use of QPDFObjectHandle::TokenFilter
+// with addContentTokenFilter. Please see comments inline for details.
+// See also pdf-count-strings.cc for a use of
+// QPDFObjectHandle::TokenFilter with filterPageContents.
 //
 
 #include <iostream>
diff --git a/examples/qtest/count-strings.test b/examples/qtest/count-strings.test
new file mode 100644
index 00000000..ba3f835b
--- /dev/null
+++ b/examples/qtest/count-strings.test
@@ -0,0 +1,17 @@
+#!/usr/bin/env perl
+require 5.008;
+BEGIN { $^W = 1; } # turn on warnings
+use strict;
+
+chdir("count-strings"); # directory holding the fixtures in.pdf and out
+
+require TestDriver;
+
+my $td = new TestDriver('pdf-count-strings');
+
+$td->runtest("filter tokens",
+	     {$td->COMMAND => "pdf-count-strings in.pdf"}, # run the example on the fixture
+	     {$td->FILE => "out", $td->EXIT_STATUS => 0}, # expect the contents of "out" and exit status 0
+             $td->NORMALIZE_NEWLINES);
+
+$td->report(1); # presumably the expected number of tests -- confirm against TestDriver
diff --git a/examples/qtest/count-strings/in.pdf b/examples/qtest/count-strings/in.pdf
new file mode 100644
index 00000000..591614c4
--- /dev/null
+++ b/examples/qtest/count-strings/in.pdf
diff --git a/examples/qtest/count-strings/out b/examples/qtest/count-strings/out
new file mode 100644
index 00000000..87b024fc
--- /dev/null
+++ b/examples/qtest/count-strings/out
@@ -0,0 +1,16 @@
+Page 1: strings = 3
+% Contents of page 2
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Four ) Tj
+  (Five ) Tj
+  (Six )
+  (beautiful ) Tj
+  (strings) Tj
+  (!) Tj
+ET
+
+% strings found: 6
+% end 2
+Page 2: strings = 6
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index f0b8f2af..1f0d550a 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -80,9 +80,10 @@ class QPDFObjectHandle
     // The TokenFilter class provides a way to filter content streams
     // in a lexically aware fashion. TokenFilters can be attached to
     // streams using the addTokenFilter or addContentTokenFilter
-    // methods. The handleToken method is called for each token,
-    // including the eof token, and then handleEOF is called at the
-    // very end. Handlers may call write (or writeToken) to pass data
+    // methods or can be applied on the spot by filterPageContents.
+    // The handleToken method is called for each token, including the
+    // eof token, and then handleEOF is called at the very end.
+    // Handlers may call write (or writeToken) to pass data
     // downstream. The finish() method must be called exactly one time
     // to ensure that any written data is flushed out. The default
     // handleEOF calls finish. If you override handleEOF, you must
@@ -91,8 +92,9 @@ class QPDFObjectHandle
     // Failure to call finish() may result in some of the data you
     // have written being lost. You should not rely on a destructor
     // for calling finish() since the destructor call may occur later
-    // than you expect. Please see examples/token-filters.cc for
-    // examples of using TokenFilters.
+    // than you expect. Please see examples/pdf-filter-tokens.cc and
+    // examples/pdf-count-strings.cc for examples of using
+    // TokenFilters.
     //
     // Please note that when you call token.getValue() on a token of
     // type tt_string, you get the string value without any
@@ -255,6 +257,18 @@ class QPDFObjectHandle
     QPDF_DLL
     void parsePageContents(ParserCallbacks* callbacks);
 
+    // Pass a page's contents through the given TokenFilter. If a
+    // pipeline is also provided, it will be the target of the write
+    // methods from the token filter. If a pipeline is not specified,
+    // any output generated by the token filter will be discarded. Use
+    // this interface if you need to pass a page's contents through a
+    // filter for work purposes without having that filter
+    // automatically applied to the page's contents, as happens with
+    // addContentTokenFilter. See examples/pdf-count-strings.cc for an
+    // example. Call on a page object.
+    QPDF_DLL
+    void filterPageContents(TokenFilter* filter, Pipeline* next = 0);
+
     // Pipe a page's contents through the given pipeline. This method
     // works whether the contents are a single stream or an array of
     // streams. Call on a page object.
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index bba95938..5d7b0bb9 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -15,6 +15,8 @@
 #include <qpdf/QPDF_Reserved.hh>
 #include <qpdf/Pl_Buffer.hh>
 #include <qpdf/Pl_Concatenate.hh>
+#include <qpdf/Pl_QPDFTokenizer.hh>
+#include <qpdf/Pl_Discard.hh>
 #include <qpdf/BufferInputSource.hh>
 #include <qpdf/QPDFExc.hh>
 
@@ -999,6 +1001,24 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
 }
 
 void
+QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next) // run filter over this page's contents
+{
+    assertPageObject(); // this handle is expected to be a page object
+    std::string description = "token filter for page object " + // label built from object and generation numbers
+        QUtil::int_to_string(this->objid) + " " +
+        QUtil::int_to_string(this->generation);
+    Pl_QPDFTokenizer token_pipeline(description.c_str(), filter);
+    PointerHolder<Pipeline> next_p;
+    if (next == 0) // caller supplied no output pipeline
+    {
+        next_p = new Pl_Discard(); // the filter's output goes here and is thrown away
+        next = next_p.getPointer();
+    }
+    filter->setPipeline(next); // target of the filter's write methods
+    this->pipePageContents(&token_pipeline);
+} // NOTE(review): filter still holds next after return; if next_p frees the Pl_Discard here, that pointer is stale -- confirm
+
+void
 QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
                                      ParserCallbacks* callbacks)
 {
author	Jay Berkenbilt <ejb@ql.org>	2018-02-11 21:41:02 +0100
committer	Jay Berkenbilt <ejb@ql.org>	2018-02-19 03:05:47 +0100
commit	5708b5d0aa9c94ab663509fbb865aa27a134aeb3 (patch)
tree	30a85d51d3d720dfca0a09b9dba4eef0c3fe2bec
parent	fd02944e1953931e07f124448350db91038020af (diff)
download	qpdf-5708b5d0aa9c94ab663509fbb865aa27a134aeb3.tar.zst