aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2018-02-03 20:52:40 +0100
committerJay Berkenbilt <ejb@ql.org>2018-02-19 03:05:47 +0100
commit30709935af023dd66a17f2d494aa7dc84b7177e1 (patch)
treee6cb3beda0109ca3f6bc10348f60da2d5b9a5c81
parent99101044429c3c91bd11bdd1b26e5b6c2ceb140b (diff)
downloadqpdf-30709935af023dd66a17f2d494aa7dc84b7177e1.tar.zst
Filter tokens example
-rw-r--r--ChangeLog3
-rw-r--r--examples/build.mk3
-rw-r--r--examples/pdf-filter-tokens.cc239
-rw-r--r--examples/qtest/filter-tokens.test20
-rw-r--r--examples/qtest/filter-tokens/a.pdfbin0 -> 53518 bytes
-rw-r--r--examples/qtest/filter-tokens/in.pdfbin0 -> 57165 bytes
-rw-r--r--examples/qtest/filter-tokens/out.pdfbin0 -> 53518 bytes
7 files changed, 264 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 20cb0e80..b061c584 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -150,6 +150,9 @@
QPDFObjectHandle::pipeStreamData, you don't need to worry about
this at all.
+ * Provide heavily annotated examples/pdf-filter-tokens.cc example
+ that illustrates use of some simple token filters.
+
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Add QPDFWriter::setLinearizationPass1Filename method and
diff --git a/examples/build.mk b/examples/build.mk
index 518f4d55..f5b44669 100644
--- a/examples/build.mk
+++ b/examples/build.mk
@@ -6,7 +6,8 @@ BINS_examples = \
pdf-invert-images \
pdf-create \
pdf-parse-content \
- pdf-split-pages
+ pdf-split-pages \
+ pdf-filter-tokens
CBINS_examples = pdf-linearize
TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
diff --git a/examples/pdf-filter-tokens.cc b/examples/pdf-filter-tokens.cc
new file mode 100644
index 00000000..2566f72c
--- /dev/null
+++ b/examples/pdf-filter-tokens.cc
@@ -0,0 +1,239 @@
+//
+// This example illustrates the use of QPDFObjectHandle::TokenFilter.
+// Please see comments inline for details.
+//
+
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <deque>
+
+#include <qpdf/QPDF.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFWriter.hh>
+#include <qpdf/QPDFObjectHandle.hh>
+
+static char const* whoami = 0;
+
+void usage()
+{
+ std::cerr << "Usage: " << whoami << " infile outfile" << std::endl
+ << "Applies token filters to infile and writes outfile"
+ << std::endl;
+ exit(2);
+}
+
+// StringReverser is about the smallest token filter one can write.
+// It supplies handleToken, which is pure virtual in TokenFilter, and
+// leaves the default handleEOF behavior in place.
+class StringReverser: public QPDFObjectHandle::TokenFilter
+{
+  public:
+    virtual ~StringReverser() {}
+
+    // Called once for every token in the content stream.
+    virtual void handleToken(QPDFTokenizer::Token const& token);
+};
+
+void
+StringReverser::handleToken(QPDFTokenizer::Token const& token)
+{
+    // Anything that is not a string token goes to the output
+    // unchanged.
+    if (token.getType() != QPDFTokenizer::tt_string)
+    {
+        writeToken(token);
+        return;
+    }
+
+    // String tokens are written back with their characters in the
+    // opposite order. Rather than emitting raw bytes, we build a
+    // brand new string token and write that, which leaves it to the
+    // library to deal with properly encoding unprintable characters.
+    // Multibyte characters are not handled at all: this is not meant
+    // to show the right way to reverse a string, only to show a
+    // minimal filter that writes a constructed token.
+    std::string const text = token.getValue();
+    std::string reversed(text.rbegin(), text.rend());
+    writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, reversed));
+}
+
+// ColorToGray rewrites content streams so that every "rg" operator
+// becomes a "g" operator, turning color into grayscale. Only content
+// streams are affected; image data is not touched, so color images
+// stay color images.
+class ColorToGray: public QPDFObjectHandle::TokenFilter
+{
+  public:
+    virtual ~ColorToGray() {}
+
+    // Called once for every token in the content stream.
+    virtual void handleToken(QPDFTokenizer::Token const& token);
+    // Called at end of input; writes out whatever is still pending.
+    virtual void handleEOF();
+
+  private:
+    // True for integer and real tokens.
+    bool isNumeric(QPDFTokenizer::token_type_e token_type);
+    // True for space and comment tokens.
+    bool isIgnorable(QPDFTokenizer::token_type_e token_type);
+    // Numeric value of a token, obtained by parsing its text.
+    double numericValue(QPDFTokenizer::Token const& token);
+
+    // Tokens received but not yet written. all_stack holds every
+    // pending token; stack holds only the pending tokens that are
+    // neither spaces nor comments.
+    std::deque<QPDFTokenizer::Token> all_stack;
+    std::deque<QPDFTokenizer::Token> stack;
+};
+
+// Report whether a token type is one of the two numeric types,
+// integer or real.
+bool
+ColorToGray::isNumeric(QPDFTokenizer::token_type_e token_type)
+{
+    switch (token_type)
+    {
+      case QPDFTokenizer::tt_integer:
+      case QPDFTokenizer::tt_real:
+        return true;
+
+      default:
+        return false;
+    }
+}
+
+// Report whether a token type is a space or a comment, the two
+// kinds of token this filter treats as ignorable.
+bool
+ColorToGray::isIgnorable(QPDFTokenizer::token_type_e token_type)
+{
+    if (token_type == QPDFTokenizer::tt_space)
+    {
+        return true;
+    }
+    return (token_type == QPDFTokenizer::tt_comment);
+}
+
+// Convert a token to a double by parsing the token's text as an
+// object and asking that object for its numeric value.
+double
+ColorToGray::numericValue(QPDFTokenizer::Token const& token)
+{
+    QPDFObjectHandle parsed = QPDFObjectHandle::parse(token.getValue());
+    return parsed.getNumericValue();
+}
+
+void
+ColorToGray::handleToken(QPDFTokenizer::Token const& token)
+{
+    // Track the number of non-ignorable tokens we've seen. If we see
+    // an "rg" following three numbers, convert it to a grayscale
+    // value. Keep writing tokens to the output as we can.
+
+    // There are several things to notice here. We keep two stacks:
+    // one of "meaningful" tokens, and one of all tokens. This way we
+    // can preserve whitespace or comments that we encounter in the
+    // stream and thereby preserve layout. As we receive tokens, we
+    // keep the last four meaningful tokens. If we see three numbers
+    // followed by rg, we use the three numbers to calculate a gray
+    // value that is perceptually similar to the color value and then
+    // write the "g" operator to the output, discarding any spaces or
+    // comments encountered embedded in the "rg" operator.
+
+    // The stack and all_stack members are updated in such a way that
+    // they always contain exactly the same non-ignorable tokens. The
+    // stack member contains the tokens that would be left if you
+    // removed all space and comment tokens from all_stack.
+
+    // On each new token, flush out any space or comment tokens. Store
+    // the incoming token. If we just got an rg preceded by the right
+    // kinds of operands, replace the command. Flush any additional
+    // accumulated tokens to keep the stack only four tokens deep.
+
+    // Leading spaces and comments can never be part of an "rg"
+    // operation that is still being collected, so write them now.
+    while ((! this->all_stack.empty()) &&
+           isIgnorable(this->all_stack.at(0).getType()))
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+    }
+    this->all_stack.push_back(token);
+    QPDFTokenizer::token_type_e token_type = token.getType();
+    if (! isIgnorable(token_type))
+    {
+        this->stack.push_back(token);
+        if ((this->stack.size() == 4) &&
+            (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "rg")) &&
+            (isNumeric(this->stack.at(0).getType())) &&
+            (isNumeric(this->stack.at(1).getType())) &&
+            (isNumeric(this->stack.at(2).getType())))
+        {
+            // The operands of "rg" appear in the order red, green,
+            // blue.
+            double r = numericValue(this->stack.at(0));
+            double g = numericValue(this->stack.at(1));
+            double b = numericValue(this->stack.at(2));
+            // Weighted sum approximating perceived brightness. Green
+            // carries the largest weight and blue the smallest.
+            double gray = ((0.3 * r) + (0.59 * g) + (0.11 * b));
+            // Keep the result inside the range 0.0 through 1.0.
+            if (gray > 1.0)
+            {
+                gray = 1.0;
+            }
+            if (gray < 0.0)
+            {
+                gray = 0.0;
+            }
+            write(QUtil::double_to_string(gray, 3));
+            write(" g");
+            // The three operands and the operator have been replaced,
+            // so nothing that was pending gets written as-is.
+            this->stack.clear();
+            this->all_stack.clear();
+        }
+    }
+    // No replacement happened and four meaningful tokens are pending:
+    // the oldest one can no longer be an operand of a future "rg", so
+    // write it. The flush loop above guarantees that the front of
+    // all_stack is that same meaningful token.
+    if (this->stack.size() == 4)
+    {
+        writeToken(this->all_stack.at(0));
+        this->all_stack.pop_front();
+        this->stack.pop_front();
+    }
+}
+
+void
+ColorToGray::handleEOF()
+{
+    // Nothing more is coming, so write every token that is still
+    // pending, oldest first, exactly as it was received.
+    for (; ! this->all_stack.empty(); this->all_stack.pop_front())
+    {
+        writeToken(this->all_stack.front());
+    }
+    // An overridden handleEOF must always call finish(). Without it,
+    // data sitting in the buffers of downstream pipelines may never
+    // be flushed and would be lost. The comments in
+    // QPDFObjectHandle.hh say the same thing.
+    finish();
+}
+
+// Program entry point: read the PDF named by the first argument,
+// attach the token filters to every page, and write the result to the
+// file named by the second argument. Exits with status 2 on a usage
+// error or when an exception is caught.
+int main(int argc, char* argv[])
+{
+    whoami = QUtil::getWhoami(argv[0]);
+
+    // When run through a libtool wrapper the program name carries an
+    // "lt-" prefix; skip over it.
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+        whoami = whoami + 3;
+    }
+
+    if (argc != 3)
+    {
+        usage();
+    }
+    char const* input_path = argv[1];
+    char const* output_path = argv[2];
+
+    try
+    {
+        QPDF pdf;
+        pdf.processFile(input_path);
+
+        // Every page gets both filters. They take effect when the
+        // file is written, or whenever the pages' contents are
+        // retrieved in any other way. The comments on the filter
+        // classes describe what each one does.
+        std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+        for (size_t page_no = 0; page_no < pages.size(); ++page_no)
+        {
+            QPDFObjectHandle page = pages.at(page_no);
+            page.addContentTokenFilter(new StringReverser);
+            page.addContentTokenFilter(new ColorToGray);
+        }
+
+        QPDFWriter writer(pdf, output_path);
+        writer.setStaticID(true); // for testing only
+        writer.write();
+    }
+    catch (std::exception& err)
+    {
+        std::cerr << whoami << ": " << err.what() << std::endl;
+        exit(2);
+    }
+
+    return 0;
+}
diff --git a/examples/qtest/filter-tokens.test b/examples/qtest/filter-tokens.test
new file mode 100644
index 00000000..6b93eb8f
--- /dev/null
+++ b/examples/qtest/filter-tokens.test
@@ -0,0 +1,20 @@
+#!/usr/bin/env perl
+# Test driver script for the pdf-filter-tokens example program.
+require 5.008;
+# Turn on Perl warnings for the whole script.
+BEGIN { $^W = 1; }
+use strict;
+
+# The input and expected-output PDFs are named relative to this
+# directory.
+chdir("filter-tokens");
+
+require TestDriver;
+
+my $td = new TestDriver('pdf-filter-tokens');
+
+# Run the example on in.pdf, writing a.pdf. The expectation given is
+# an empty string together with exit status 0.
+$td->runtest("filter tokens",
+ {$td->COMMAND => "pdf-filter-tokens in.pdf a.pdf"},
+ {$td->STRING => "", $td->EXIT_STATUS => 0});
+
+# Check the file just written, a.pdf, against the stored file
+# out.pdf.
+$td->runtest("check output",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "out.pdf"});
+
+# NOTE(review): 2 presumably is the number of tests expected to have
+# run (there are two runtest calls above) -- confirm against
+# TestDriver.
+$td->report(2);
diff --git a/examples/qtest/filter-tokens/a.pdf b/examples/qtest/filter-tokens/a.pdf
new file mode 100644
index 00000000..ef7cdbce
--- /dev/null
+++ b/examples/qtest/filter-tokens/a.pdf
Binary files differ
diff --git a/examples/qtest/filter-tokens/in.pdf b/examples/qtest/filter-tokens/in.pdf
new file mode 100644
index 00000000..f60a30d6
--- /dev/null
+++ b/examples/qtest/filter-tokens/in.pdf
Binary files differ
diff --git a/examples/qtest/filter-tokens/out.pdf b/examples/qtest/filter-tokens/out.pdf
new file mode 100644
index 00000000..ef7cdbce
--- /dev/null
+++ b/examples/qtest/filter-tokens/out.pdf
Binary files differ