aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2018-02-11 21:41:02 +0100
committerJay Berkenbilt <ejb@ql.org>2018-02-19 03:05:47 +0100
commit5708b5d0aa9c94ab663509fbb865aa27a134aeb3 (patch)
tree30a85d51d3d720dfca0a09b9dba4eef0c3fe2bec
parentfd02944e1953931e07f124448350db91038020af (diff)
downloadqpdf-5708b5d0aa9c94ab663509fbb865aa27a134aeb3.tar.zst
Add additional interface for filtering page contents
-rw-r--r--ChangeLog6
-rw-r--r--examples/build.mk3
-rw-r--r--examples/pdf-count-strings.cc131
-rw-r--r--examples/pdf-filter-tokens.cc6
-rw-r--r--examples/qtest/count-strings.test17
-rw-r--r--examples/qtest/count-strings/in.pdfbin0 -> 1348 bytes
-rw-r--r--examples/qtest/count-strings/out16
-rw-r--r--include/qpdf/QPDFObjectHandle.hh24
-rw-r--r--libqpdf/QPDFObjectHandle.cc20
9 files changed, 215 insertions, 8 deletions
diff --git a/ChangeLog b/ChangeLog
index 97d65238..0c298abb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2018-02-11 Jay Berkenbilt <ejb@ql.org>
+
+ * Add QPDFObjectHandle::filterPageContents method to provide a
+ different interface for applying token filters to page contents
+ without modifying the ultimate output.
+
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Changes listed on today's date are numerous and reflect
diff --git a/examples/build.mk b/examples/build.mk
index f5b44669..b5748c11 100644
--- a/examples/build.mk
+++ b/examples/build.mk
@@ -7,7 +7,8 @@ BINS_examples = \
pdf-create \
pdf-parse-content \
pdf-split-pages \
- pdf-filter-tokens
+ pdf-filter-tokens \
+ pdf-count-strings
CBINS_examples = pdf-linearize
TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
diff --git a/examples/pdf-count-strings.cc b/examples/pdf-count-strings.cc
new file mode 100644
index 00000000..81718298
--- /dev/null
+++ b/examples/pdf-count-strings.cc
@@ -0,0 +1,131 @@
+//
+// This example illustrates the use of QPDFObjectHandle::TokenFilter
+// with filterPageContents. See also pdf-filter-tokens.cc for an
+// example that uses QPDFObjectHandle::TokenFilter with
+// addContentTokenFilter.
+//
+
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+
+#include <qpdf/QPDF.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFObjectHandle.hh>
+#include <qpdf/Pl_StdioFile.hh>
+
+static char const* whoami = 0;
+
+void usage()
+{
+ std::cerr << "Usage: " << whoami << " infile" << std::endl
+ << "Applies token filters to infile"
+ << std::endl;
+ exit(2);
+}
+
+class StringCounter: public QPDFObjectHandle::TokenFilter
+{
+ public:
+ StringCounter() :
+ count(0)
+ {
+ }
+ virtual ~StringCounter()
+ {
+ }
+ virtual void handleToken(QPDFTokenizer::Token const&);
+ virtual void handleEOF();
+ int getCount() const;
+
+ private:
+ int count;
+};
+
+void
+StringCounter::handleToken(QPDFTokenizer::Token const& token)
+{
+ // Count string tokens
+ if (token.getType() == QPDFTokenizer::tt_string)
+ {
+ ++this->count;
+ }
+ // Preserve input verbatim by passing each token to any specified
+ // downstream filter.
+ writeToken(token);
+}
+
+void
+StringCounter::handleEOF()
+{
+ // Write a comment at the end of the stream just to show how we
+ // can enhance the output if we want.
+ write("\n% strings found: ");
+ write(QUtil::int_to_string(this->count));
+ // If you override handleEOF, you must always remember to call finish().
+ finish();
+}
+
+int
+StringCounter::getCount() const
+{
+ return this->count;
+}
+
+int main(int argc, char* argv[])
+{
+ whoami = QUtil::getWhoami(argv[0]);
+
+ // For libtool's sake....
+ if (strncmp(whoami, "lt-", 3) == 0)
+ {
+ whoami += 3;
+ }
+
+ if (argc != 2)
+ {
+ usage();
+ }
+ char const* infilename = argv[1];
+
+ try
+ {
+ QPDF pdf;
+ pdf.processFile(infilename);
+ std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+ int pageno = 0;
+ for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
+ iter != pages.end(); ++iter)
+ {
+ QPDFObjectHandle page = *iter;
+ ++pageno;
+ // Pass the contents of a page through our string counter.
+ // If it's an even page, capture the output. This
+ // illustrates that you may capture any output generated
+ // by the filter, or you may ignore it.
+ StringCounter counter;
+ if (pageno % 2)
+ {
+ // Ignore output for odd pages.
+ page.filterPageContents(&counter);
+ }
+ else
+ {
+ // Write output to stdout for even pages.
+ Pl_StdioFile out("stdout", stdout);
+ std::cout << "% Contents of page " << pageno << std::endl;
+ page.filterPageContents(&counter, &out);
+ std::cout << "\n% end " << pageno << std::endl;
+ }
+ std::cout << "Page " << pageno
+ << ": strings = " << counter.getCount() << std::endl;
+ }
+ }
+ catch (std::exception& e)
+ {
+ std::cerr << whoami << ": " << e.what() << std::endl;
+ exit(2);
+ }
+
+ return 0;
+}
diff --git a/examples/pdf-filter-tokens.cc b/examples/pdf-filter-tokens.cc
index 2566f72c..809c160b 100644
--- a/examples/pdf-filter-tokens.cc
+++ b/examples/pdf-filter-tokens.cc
@@ -1,6 +1,8 @@
//
-// This example illustrates the use of QPDFObjectHandle::TokenFilter.
-// Please see comments inline for details.
+// This example illustrates the use of QPDFObjectHandle::TokenFilter
+// with addContentTokenFilter. Please see comments inline for details.
+// See also pdf-count-strings.cc for a use of
+// QPDFObjectHandle::TokenFilter with filterPageContents.
//
#include <iostream>
diff --git a/examples/qtest/count-strings.test b/examples/qtest/count-strings.test
new file mode 100644
index 00000000..ba3f835b
--- /dev/null
+++ b/examples/qtest/count-strings.test
@@ -0,0 +1,17 @@
+#!/usr/bin/env perl
+require 5.008;
+BEGIN { $^W = 1; }
+use strict;
+
+chdir("count-strings");
+
+require TestDriver;
+
+my $td = new TestDriver('pdf-count-strings');
+
+$td->runtest("filter tokens",
+ {$td->COMMAND => "pdf-count-strings in.pdf"},
+ {$td->FILE => "out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+
+$td->report(1);
diff --git a/examples/qtest/count-strings/in.pdf b/examples/qtest/count-strings/in.pdf
new file mode 100644
index 00000000..591614c4
--- /dev/null
+++ b/examples/qtest/count-strings/in.pdf
Binary files differ
diff --git a/examples/qtest/count-strings/out b/examples/qtest/count-strings/out
new file mode 100644
index 00000000..87b024fc
--- /dev/null
+++ b/examples/qtest/count-strings/out
@@ -0,0 +1,16 @@
+Page 1: strings = 3
+% Contents of page 2
+BT
+ /F1 24 Tf
+ 72 720 Td
+ (Four ) Tj
+ (Five ) Tj
+ (Six )
+ (beautiful ) Tj
+ (strings) Tj
+ (!) Tj
+ET
+
+% strings found: 6
+% end 2
+Page 2: strings = 6
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index f0b8f2af..1f0d550a 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -80,9 +80,10 @@ class QPDFObjectHandle
// The TokenFilter class provides a way to filter content streams
// in a lexically aware fashion. TokenFilters can be attached to
// streams using the addTokenFilter or addContentTokenFilter
- // methods. The handleToken method is called for each token,
- // including the eof token, and then handleEOF is called at the
- // very end. Handlers may call write (or writeToken) to pass data
+ // methods or can be applied on the spot by filterPageContents.
+ // The handleToken method is called for each token, including the
+ // eof token, and then handleEOF is called at the very end.
+ // Handlers may call write (or writeToken) to pass data
// downstream. The finish() method must be called exactly one time
// to ensure that any written data is flushed out. The default
// handleEOF calls finish. If you override handleEOF, you must
@@ -91,8 +92,9 @@ class QPDFObjectHandle
// Failure to call finish() may result in some of the data you
// have written being lost. You should not rely on a destructor
// for calling finish() since the destructor call may occur later
- // than you expect. Please see examples/token-filters.cc for
- // examples of using TokenFilters.
+ // than you expect. Please see examples/pdf-filter-tokens.cc and
+ // examples/pdf-count-strings.cc for examples of using
+ // TokenFilters.
//
// Please note that when you call token.getValue() on a token of
// type tt_string, you get the string value without any
@@ -255,6 +257,18 @@ class QPDFObjectHandle
QPDF_DLL
void parsePageContents(ParserCallbacks* callbacks);
+ // Pass a page's contents through the given TokenFilter. If a
+ // pipeline is also provided, it will be the target of the write
+ // methods from the token filter. If a pipeline is not specified,
+ // any output generated by the token filter will be discarded. Use
+ // this interface if you need to pass a page's contents through
+ // a filter for work purposes without having that filter
+ // automatically applied to the page's contents, as happens with
+ // addContentTokenFilter. See examples/pdf-count-strings.cc for an
+ // example.
+ QPDF_DLL
+ void filterPageContents(TokenFilter* filter, Pipeline* next = 0);
+
// Pipe a page's contents through the given pipeline. This method
// works whether the contents are a single stream or an array of
// streams. Call on a page object.
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index bba95938..5d7b0bb9 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -15,6 +15,8 @@
#include <qpdf/QPDF_Reserved.hh>
#include <qpdf/Pl_Buffer.hh>
#include <qpdf/Pl_Concatenate.hh>
+#include <qpdf/Pl_QPDFTokenizer.hh>
+#include <qpdf/Pl_Discard.hh>
#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFExc.hh>
@@ -999,6 +1001,24 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
}
void
+QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next)
+{
+ assertPageObject();
+ std::string description = "token filter for page object " +
+ QUtil::int_to_string(this->objid) + " " +
+ QUtil::int_to_string(this->generation);
+ Pl_QPDFTokenizer token_pipeline(description.c_str(), filter);
+ PointerHolder<Pipeline> next_p;
+ if (next == 0)
+ {
+ next_p = new Pl_Discard();
+ next = next_p.getPointer();
+ }
+ filter->setPipeline(next);
+ this->pipePageContents(&token_pipeline);
+}
+
+void
QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks)
{