Filter tokens example

author: Jay Berkenbilt <ejb@ql.org> 2018-02-03 20:52:40 +0100
committer: Jay Berkenbilt <ejb@ql.org> 2018-02-19 03:05:47 +0100
commit: 30709935af023dd66a17f2d494aa7dc84b7177e1 (patch)
tree: e6cb3beda0109ca3f6bc10348f60da2d5b9a5c81
parent: 99101044429c3c91bd11bdd1b26e5b6c2ceb140b (diff)
download: qpdf-30709935af023dd66a17f2d494aa7dc84b7177e1.tar.zst
7 files changed, 264 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 20cb0e80..b061c584 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -150,6 +150,9 @@
 	QPDFObjectHandle::pipeStreamData, you don't need to worry about
 	this at all.
 
+	* Provide heavily annotated examples/pdf-filter-tokens.cc example
+	that illustrates use of some simple token filters.
+
 2018-02-04  Jay Berkenbilt  <ejb@ql.org>
 
 	* Add QPDFWriter::setLinearizationPass1Filename method and
diff --git a/examples/build.mk b/examples/build.mk
index 518f4d55..f5b44669 100644
--- a/examples/build.mk
+++ b/examples/build.mk
@@ -6,7 +6,8 @@ BINS_examples = \
 	pdf-invert-images \
 	pdf-create \
 	pdf-parse-content \
-	pdf-split-pages
+	pdf-split-pages \
+	pdf-filter-tokens
 CBINS_examples = pdf-linearize
 
 TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
diff --git a/examples/pdf-filter-tokens.cc b/examples/pdf-filter-tokens.cc
new file mode 100644
index 00000000..2566f72c
--- /dev/null
+++ b/examples/pdf-filter-tokens.cc
@@ -0,0 +1,239 @@
+//
+// This example illustrates the use of QPDFObjectHandle::TokenFilter.
+// Please see comments inline for details.
+//
+
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <deque>
+
+#include <qpdf/QPDF.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFWriter.hh>
+#include <qpdf/QPDFObjectHandle.hh>
+
+static char const* whoami = 0;
+
+void usage()
+{
+    std::cerr << "Usage: " << whoami << " infile outfile" << std::endl;
+    std::cerr << "Applies token filters to infile and writes outfile"
+              << std::endl;
+    exit(2); // report misuse on stderr (above) and terminate with status 2
+}
+
+// StringReverser is about the smallest token filter one can write:
+// it supplies the one required override, handleToken, and leaves
+// handleEOF at the base class default. Each string token in the
+// content stream is re-emitted with its bytes in reverse order; all
+// other tokens are passed through unchanged.
+class StringReverser: public QPDFObjectHandle::TokenFilter
+{
+  public:
+    virtual ~StringReverser() {}
+    virtual void handleToken(QPDFTokenizer::Token const& token);
+};
+
+void
+StringReverser::handleToken(QPDFTokenizer::Token const& token)
+{
+    // Only string tokens are altered; anything else goes to the output
+    // exactly as it arrived.
+    if (token.getType() != QPDFTokenizer::tt_string)
+    {
+        writeToken(token);
+        return;
+    }
+
+    // Reverse the string's bytes and emit the result as a freshly
+    // constructed string token rather than as raw text, which leaves
+    // the encoding of unprintable characters to the library. This is
+    // a byte-wise reversal with no awareness of multibyte characters:
+    // it is here to demonstrate a minimal filter and the writing of a
+    // constructed token, not to show a correct way to reverse
+    // strings.
+    std::string reversed(token.getValue());
+    std::reverse(reversed.begin(), reversed.end());
+    QPDFTokenizer::Token replacement(QPDFTokenizer::tt_string, reversed);
+    writeToken(replacement);
+}
+
+// ColorToGray rewrites a page's content stream so that each "rg"
+// operator preceded by three numeric operands becomes a single "g"
+// operator, mapping that color to a gray level. Only content stream
+// tokens are examined; image data is not touched, so color images
+// remain in color.
+class ColorToGray: public QPDFObjectHandle::TokenFilter
+{
+  public:
+    virtual ~ColorToGray() {}
+    virtual void handleToken(QPDFTokenizer::Token const& token);
+    virtual void handleEOF();
+
+  private:
+    bool isNumeric(QPDFTokenizer::token_type_e token_type);
+    bool isIgnorable(QPDFTokenizer::token_type_e token_type);
+    double numericValue(QPDFTokenizer::Token const& token);
+
+    // Buffered tokens not yet written, spaces and comments included.
+    std::deque<QPDFTokenizer::Token> all_stack;
+    // The same buffered tokens minus the spaces and comments.
+    std::deque<QPDFTokenizer::Token> stack;
+};
+
+// Report whether token_type is one of the numeric types (integer, real).
+bool ColorToGray::isNumeric(QPDFTokenizer::token_type_e token_type)
+{
+    bool const is_integer = (token_type == QPDFTokenizer::tt_integer);
+    return is_integer || (token_type == QPDFTokenizer::tt_real);
+}
+
+// Report whether token_type is a space token or a comment token.
+bool ColorToGray::isIgnorable(QPDFTokenizer::token_type_e token_type)
+{
+    bool const is_space = (token_type == QPDFTokenizer::tt_space);
+    return is_space || (token_type == QPDFTokenizer::tt_comment);
+}
+
+// Parse the token's text as an object and return its numeric value.
+double ColorToGray::numericValue(QPDFTokenizer::Token const& token)
+{
+    return QPDFObjectHandle::parse(token.getValue()).getNumericValue();
+}
+
+void
+ColorToGray::handleToken(QPDFTokenizer::Token const& token)
+{
+    // Buffer incoming tokens and watch the non-ignorable ones. If we
+    // see an "rg" following three numbers, convert it to a grayscale
+    // value. Keep writing tokens to the output as we can.
+
+    // There are several things to notice here. We keep two stacks:
+    // one of "meaningful" tokens, and one of all tokens. This way we
+    // can preserve whitespace or comments that we encounter in the
+    // stream and thereby preserve layout. We keep at most the last
+    // four meaningful tokens. If we see three numbers followed by
+    // rg, we combine them into a gray value that is perceptually
+    // similar to the color (0.3 red + 0.59 green + 0.11 blue) and
+    // write the "g" operator to the output, discarding any spaces or
+    // comments encountered embedded in the "rg" operator.
+
+    // The stack and all_stack members are updated in such a way that
+    // they always contain exactly the same non-ignorable tokens. The
+    // stack member contains the tokens that would be left if you
+    // removed all space and comment tokens from all_stack.
+
+    // On each new token, flush out any space or comment tokens. Store
+    // the incoming token. If we just got an rg preceded by the right
+    // kinds of operands, replace the command. Flush any additional
+    // accumulated tokens to keep the stack only four tokens deep.
+
+    while ((! this->all_stack.empty()) &&
+           isIgnorable(this->all_stack.at(0).getType()))
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+    }
+    this->all_stack.push_back(token);
+    QPDFTokenizer::token_type_e token_type = token.getType();
+    if (! isIgnorable(token_type))
+    {
+        this->stack.push_back(token);
+        if ((this->stack.size() == 4) &&
+            (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "rg")) &&
+            (isNumeric(this->stack.at(0).getType())) &&
+            (isNumeric(this->stack.at(1).getType())) &&
+            (isNumeric(this->stack.at(2).getType())))
+        {
+            double r = numericValue(this->stack.at(0));
+            double g = numericValue(this->stack.at(1));
+            double b = numericValue(this->stack.at(2));
+            double gray = ((0.3 * r) + (0.59 * g) + (0.11 * b));
+            if (gray > 1.0)
+            {
+                gray = 1.0;
+            }
+            if (gray < 0.0)
+            {
+                gray = 0.0;
+            }
+            write(QUtil::double_to_string(gray, 3));
+            write(" g");
+            this->stack.clear();
+            this->all_stack.clear();
+        }
+    }
+    if (this->stack.size() == 4)
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+        this->stack.pop_front();
+    }
+}
+
+void
+ColorToGray::handleEOF()
+{
+    // Flush out any remaining accumulated tokens, and drop the
+    // matching entries from stack so both deques end up empty.
+    while (! this->all_stack.empty())
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+    }
+    this->stack.clear();
+    // If you override handleEOF, you must call finish(); otherwise
+    // data in the buffers of downstream pipelines may never be
+    // flushed. See also the comments in QPDFObjectHandle.hh.
+    finish();
+}
+
+int main(int argc, char* argv[])
+{
+    whoami = QUtil::getWhoami(argv[0]);
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+        // For libtool's sake: skip a leading "lt-" in the name.
+        whoami += 3;
+    }
+
+    // Exactly two arguments are required; usage() does not return.
+    if (argc != 3)
+    {
+        usage();
+    }
+    char const* const infilename = argv[1];
+    char const* const outfilename = argv[2];
+
+    try
+    {
+        QPDF pdf;
+        pdf.processFile(infilename);
+
+        // Attach both token filters to every page. Nothing is
+        // filtered at this point: the filters run when the file is
+        // written, or whenever the pages' contents are retrieved in
+        // any other way. See the comments on the filter classes for
+        // what each one does.
+        std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+        for (size_t i = 0; i < pages.size(); ++i)
+        {
+            QPDFObjectHandle page = pages.at(i);
+            page.addContentTokenFilter(new StringReverser);
+            page.addContentTokenFilter(new ColorToGray);
+        }
+
+        QPDFWriter writer(pdf, outfilename);
+        writer.setStaticID(true);    // for testing only
+        writer.write();
+    }
+    catch (std::exception const& e)
+    {
+        std::cerr << whoami << ": " << e.what() << std::endl;
+        exit(2);
+    }
+
+    return 0;
+}
diff --git a/examples/qtest/filter-tokens.test b/examples/qtest/filter-tokens.test
new file mode 100644
index 00000000..6b93eb8f
--- /dev/null
+++ b/examples/qtest/filter-tokens.test
@@ -0,0 +1,20 @@
+#!/usr/bin/env perl
+require 5.008;
+BEGIN { $^W = 1; }
+use strict;
+
+# Run from the directory that holds in.pdf and the expected out.pdf.
+chdir("filter-tokens") or die "chdir testdir failed: $!\n";
+require TestDriver;
+
+my $td = new TestDriver('pdf-filter-tokens');
+# Expect no output and exit status 0 from the example program.
+$td->runtest("filter tokens",
+	     {$td->COMMAND => "pdf-filter-tokens in.pdf a.pdf"},
+	     {$td->STRING => "", $td->EXIT_STATUS => 0});
+# The file it wrote, a.pdf, is then checked against out.pdf.
+$td->runtest("check output",
+	     {$td->FILE => "a.pdf"},
+	     {$td->FILE => "out.pdf"});
+
+$td->report(2);
diff --git a/examples/qtest/filter-tokens/a.pdf b/examples/qtest/filter-tokens/a.pdf
new file mode 100644
index 00000000..ef7cdbce
--- /dev/null
+++ b/examples/qtest/filter-tokens/a.pdf
diff --git a/examples/qtest/filter-tokens/in.pdf b/examples/qtest/filter-tokens/in.pdf
new file mode 100644
index 00000000..f60a30d6
--- /dev/null
+++ b/examples/qtest/filter-tokens/in.pdf
diff --git a/examples/qtest/filter-tokens/out.pdf b/examples/qtest/filter-tokens/out.pdf
new file mode 100644
index 00000000..ef7cdbce
--- /dev/null
+++ b/examples/qtest/filter-tokens/out.pdf
author	Jay Berkenbilt <ejb@ql.org>	2018-02-03 20:52:40 +0100
committer	Jay Berkenbilt <ejb@ql.org>	2018-02-19 03:05:47 +0100
commit	30709935af023dd66a17f2d494aa7dc84b7177e1 (patch)
tree	e6cb3beda0109ca3f6bc10348f60da2d5b9a5c81
parent	99101044429c3c91bd11bdd1b26e5b6c2ceb140b (diff)
download	qpdf-30709935af023dd66a17f2d494aa7dc84b7177e1.tar.zst