aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2018-02-03 00:21:34 +0100
committerJay Berkenbilt <ejb@ql.org>2018-02-19 03:05:46 +0100
commit99101044429c3c91bd11bdd1b26e5b6c2ceb140b (patch)
tree5ab366eab31ddf76e80f99bd1d34c421291f1c4e
parentb8723e97f4b94fe03e631aab0309382ead3137ed (diff)
downloadqpdf-99101044429c3c91bd11bdd1b26e5b6c2ceb140b.tar.zst
Implement TokenFilter and refactor Pl_QPDFTokenizer
Implement a TokenFilter class and refactor Pl_QPDFTokenizer to use a TokenFilter class called ContentNormalizer. Pl_QPDFTokenizer is now a general filter that passes data through a TokenFilter.
-rw-r--r--ChangeLog43
-rw-r--r--include/qpdf/QPDFObjectHandle.hh87
-rw-r--r--include/qpdf/QPDFTokenizer.hh11
-rw-r--r--libqpdf/ContentNormalizer.cc77
-rw-r--r--libqpdf/Pl_QPDFTokenizer.cc121
-rw-r--r--libqpdf/QPDFObjectHandle.cc66
-rw-r--r--libqpdf/QPDFTokenizer.cc18
-rw-r--r--libqpdf/QPDFWriter.cc6
-rw-r--r--libqpdf/QPDF_Stream.cc34
-rw-r--r--libqpdf/build.mk1
-rw-r--r--libqpdf/qpdf/ContentNormalizer.hh15
-rw-r--r--libqpdf/qpdf/Pl_QPDFTokenizer.hh32
-rw-r--r--libqpdf/qpdf/QPDF_Stream.hh5
-rw-r--r--qpdf/qtest/qpdf.test13
-rw-r--r--qpdf/qtest/qpdf/token-filters-out.pdf171
-rw-r--r--qpdf/test_driver.cc46
16 files changed, 631 insertions, 115 deletions
diff --git a/ChangeLog b/ChangeLog
index 256d83ea..20cb0e80 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -107,6 +107,49 @@
applications that use page-level APIs in QPDFObjectHandle to be
more tolerant of certain types of damaged files.
+ * Add QPDFObjectHandle::TokenFilter class and methods to use it to
+ perform lexical filtering on content streams. You can call
+ QPDFObjectHandle::addTokenFilter on a stream object, or you can call
+ the higher level QPDFObjectHandle::addContentTokenFilter on a page
+ object to cause the stream's contents to be passed through a token
+ filter while being retrieved by QPDFWriter or any other consumer.
+ For details on using TokenFilter, please see comments in
+ QPDFObjectHandle.hh.
+
+ * Enhance the string, type QPDFTokenizer::Token constructor to
+ initialize a raw value in addition to a value. Tokens have a
+ value, which is a canonical representation, and a raw value. For
+ all tokens except strings and names, the raw value and the value
+ are the same. For strings, the value excludes the outer delimiters
+ and has non-printing characters normalized. For names, the value
+ resolves non-printing characters. In order to better facilitate
+ token filters that mostly preserve contents and to enable
+ developers to be mostly unconcerned about the nuances of token
+ values and raw values, creating string and name tokens now
+ properly handles this subtlety of values and raw values. When
+ constructing string tokens, take care to avoid passing in the
+ outer delimiters. This has always been the case, but it is now
+ clarified in comments in QPDFObjectHandle.hh::TokenFilter. This
+ has no impact on any existing code unless there's some code
+ somewhere that was relying on Token::getRawValue() returning an
+ empty string for a manually constructed token. The token class's
+ operator== method still only looks at type and value, not raw
+ value. For example, string tokens for <41> and (A) would still be
+ equal because both are representations of the string "A".
+
+ * Add QPDFObjectHandle::isDataModified method. This method just
+ returns true if addTokenFilter has been called on the stream. It
+ enables a caller to determine whether it is safe to optimize away
+ piping of stream data in cases where the input and output are
+ expected to be the same. QPDFWriter uses this internally to skip
+ the optimization of not re-compressing already compressed streams
+ if addTokenFilter has been called. Most developers will not have
+ to worry about this as it is used internally in the library in the
+ places that need it. If you are manually retrieving stream data
+ with QPDFObjectHandle::getStreamData or
+ QPDFObjectHandle::pipeStreamData, you don't need to worry about
+ this at all.
+
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Add QPDFWriter::setLinearizationPass1Filename method and
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index 14dadd6c..f0b8f2af 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -35,6 +35,7 @@
#include <qpdf/PointerHolder.hh>
#include <qpdf/Buffer.hh>
#include <qpdf/InputSource.hh>
+#include <qpdf/QPDFTokenizer.hh>
#include <qpdf/QPDFObject.hh>
@@ -76,6 +77,66 @@ class QPDFObjectHandle
Pipeline* pipeline) = 0;
};
+ // The TokenFilter class provides a way to filter content streams
+ // in a lexically aware fashion. TokenFilters can be attached to
+ // streams using the addTokenFilter or addContentTokenFilter
+ // methods. The handleToken method is called for each token,
+ // including the eof token, and then handleEOF is called at the
+ // very end. Handlers may call write (or writeToken) to pass data
+ // downstream. The finish() method must be called exactly one time
+ // to ensure that any written data is flushed out. The default
+ // handleEOF calls finish. If you override handleEOF, you must
+ // ensure that finish() is called either there or in response to
+ // whatever event causes you to terminate creation of output.
+ // Failure to call finish() may result in some of the data you
+ // have written being lost. You should not rely on a destructor
+ // for calling finish() since the destructor call may occur later
+ // than you expect. Please see examples/token-filters.cc for
+ // examples of using TokenFilters.
+ //
+ // Please note that when you call token.getValue() on a token of
+ // type tt_string, you get the string value without any
+ // delimiters. token.getRawValue() will return something suitable
+ // for being written to output, or calling writeToken with a
+ // string token will also work. The correct way to construct a
+ // string token that would write the literal value (str) is
+ // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str").
+    class TokenFilter
+    {
+      public:
+        // Start with no pipeline attached so that write() and
+        // finish() can reliably detect being called before
+        // setPipeline().
+        QPDF_DLL
+        TokenFilter() :
+            pipeline(0)
+        {
+        }
+        QPDF_DLL
+        virtual ~TokenFilter()
+        {
+        }
+        // Called for each token, including the eof token.
+        virtual void handleToken(QPDFTokenizer::Token const&) = 0;
+        // Called at the very end, after all tokens have been handled.
+        virtual void handleEOF()
+        {
+            // If you override handleEOF, you must be sure to call
+            // finish().
+            finish();
+        }
+
+        // This is called internally by the qpdf library.
+        void setPipeline(Pipeline*);
+
+      protected:
+        // Pass data downstream. These throw std::logic_error if no
+        // pipeline has been set.
+        QPDF_DLL
+        void write(char const* data, size_t len);
+        QPDF_DLL
+        void write(std::string const& str);
+        // Write a token's raw value downstream.
+        QPDF_DLL
+        void writeToken(QPDFTokenizer::Token const&);
+        // Finish the downstream pipeline. Throws std::logic_error if
+        // no pipeline has been set.
+        QPDF_DLL
+        void finish();
+
+      private:
+        // Destination for written data; assigned by setPipeline().
+        Pipeline* pipeline;
+    };
+
// This class is used by parse to decrypt strings when reading an
// object that contains encrypted strings.
class StringDecrypter
@@ -223,6 +284,23 @@ class QPDFObjectHandle
static void parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks);
+ // Attach a token filter to a page's contents. If the page's
+ // contents is an array of streams, it is automatically coalesced.
+ // The token filter is applied to the page's contents as a single
+ // stream.
+ QPDF_DLL
+ void addContentTokenFilter(PointerHolder<TokenFilter> token_filter);
+
+    // As of qpdf 8, it is possible to add custom token filters to a
+    // stream. The tokenized stream data is passed through the token
+    // filter after all original filters but before content stream
+    // normalization if requested. This is a low-level interface to
+    // add it to a stream. You will usually want to call
+    // addContentTokenFilter instead, which can be applied to a page
+    // object, and which will automatically handle the case of pages
+    // whose contents are split across multiple streams.
+    QPDF_DLL
+    void addTokenFilter(PointerHolder<TokenFilter> token_filter);
+
// Type-specific factories
QPDF_DLL
static QPDFObjectHandle newNull();
@@ -414,6 +492,13 @@ class QPDFObjectHandle
QPDF_DLL
QPDFObjectHandle getDict();
+ // If addTokenFilter has been called for this stream, then the
+ // original data should be considered to be modified. This means we
+ // should avoid optimizations such as not filtering a stream that
+ // is already compressed.
+ QPDF_DLL
+ bool isDataModified();
+
// Returns filtered (uncompressed) stream data. Throws an
// exception if the stream is filtered and we can't decode it.
QPDF_DLL
@@ -608,7 +693,7 @@ class QPDFObjectHandle
// stream or an array of streams. If this page's content is an
// array, concatenate the streams into a single stream. This can
// be useful when working with files that split content streams in
- // arbitary spots, such as in the middle of a token, as that can
+ // arbitrary spots, such as in the middle of a token, as that can
// confuse some software. You could also call this after calling
// addPageContents.
QPDF_DLL
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index fe2e95f7..eb9215aa 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -62,13 +62,8 @@ class QPDFTokenizer
{
public:
Token() : type(tt_bad) {}
-
- Token(token_type_e type, std::string const& value) :
- type(type),
- value(value)
- {
- }
-
+ QPDF_DLL
+ Token(token_type_e type, std::string const& value);
Token(token_type_e type, std::string const& value,
std::string raw_value, std::string error_message) :
type(type),
@@ -93,7 +88,7 @@ class QPDFTokenizer
{
return this->error_message;
}
- bool operator==(Token const& rhs)
+ bool operator==(Token const& rhs) const
{
// Ignore fields other than type and value
return ((this->type != tt_bad) &&
diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
new file mode 100644
index 00000000..35a8ad74
--- /dev/null
+++ b/libqpdf/ContentNormalizer.cc
@@ -0,0 +1,77 @@
+#include <qpdf/ContentNormalizer.hh>
+#include <qpdf/QUtil.hh>
+
+// Nothing to set up here.
+ContentNormalizer::ContentNormalizer()
+{
+}
+
+// Nothing to release here.
+ContentNormalizer::~ContentNormalizer()
+{
+}
+
+void
+ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
+{
+    // Write one token downstream in normalized form: whitespace gets
+    // its carriage returns turned into newlines, strings and names
+    // are re-created from their values, and everything else is
+    // passed through untouched.
+    std::string const raw = token.getRawValue();
+    QPDFTokenizer::token_type_e const kind = token.getType();
+    bool const is_string_or_name =
+        ((kind == QPDFTokenizer::tt_string) ||
+         (kind == QPDFTokenizer::tt_name));
+
+    if (kind == QPDFTokenizer::tt_space)
+    {
+        size_t const count = raw.length();
+        for (size_t pos = 0; pos < count; ++pos)
+        {
+            char const c = raw[pos];
+            if (c != '\r')
+            {
+                write(&c, 1);
+                continue;
+            }
+            // A carriage return that is immediately followed by a
+            // newline is dropped; a lone one becomes a newline.
+            bool const followed_by_newline =
+                ((pos + 1 < count) && (raw[pos + 1] == '\n'));
+            if (! followed_by_newline)
+            {
+                write("\n");
+            }
+        }
+    }
+    else if (is_string_or_name)
+    {
+        // Rebuilding the token from its value normalizes its
+        // representation, including quoting of unprintable
+        // characters.
+        writeToken(QPDFTokenizer::Token(kind, token.getValue()));
+    }
+    else
+    {
+        writeToken(token);
+    }
+
+    // A string or name whose original text contained a line break
+    // is followed by a newline in the output.
+    if (is_string_or_name &&
+        (raw.find_first_of("\r\n") != std::string::npos))
+    {
+        write("\n");
+    }
+}
+
+void
+ContentNormalizer::handleEOF()
+{
+ // End of input: nothing extra to emit, so just call finish().
+ finish();
+}
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
index 9595cd75..4fc37767 100644
--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -1,107 +1,51 @@
#include <qpdf/Pl_QPDFTokenizer.hh>
-#include <qpdf/QPDF_String.hh>
-#include <qpdf/QPDF_Name.hh>
#include <qpdf/QTC.hh>
-#include <qpdf/QUtil.hh>
#include <stdexcept>
#include <string.h>
-Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
- Pipeline(identifier, next),
- just_wrote_nl(false),
+Pl_QPDFTokenizer::Members::Members() :
+ filter(0),
last_char_was_cr(false),
unread_char(false),
char_to_unread('\0')
{
- tokenizer.allowEOF();
- tokenizer.includeIgnorable();
}
-Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
+Pl_QPDFTokenizer::Members::~Members()
{
}
-void
-Pl_QPDFTokenizer::writeNext(char const* buf, size_t len)
+Pl_QPDFTokenizer::Pl_QPDFTokenizer(
+ char const* identifier,
+ QPDFObjectHandle::TokenFilter* filter)
+ :
+ Pipeline(identifier, 0),
+ m(new Members)
{
- if (len)
- {
- getNext()->write(QUtil::unsigned_char_pointer(buf), len);
- this->just_wrote_nl = (buf[len-1] == '\n');
- }
+ m->filter = filter;
+ m->tokenizer.allowEOF();
+ m->tokenizer.includeIgnorable();
}
-void
-Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
+Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
{
- std::string value = token.getRawValue();
-
- switch (token.getType())
- {
- case QPDFTokenizer::tt_space:
- {
- size_t len = value.length();
- for (size_t i = 0; i < len; ++i)
- {
- char ch = value.at(i);
- if (ch == '\r')
- {
- if ((i + 1 < len) && (value.at(i + 1) == '\n'))
- {
- // ignore
- }
- else
- {
- writeNext("\n", 1);
- }
- }
- else
- {
- writeNext(&ch, 1);
- }
- }
- }
- value.clear();
- break;
-
- case QPDFTokenizer::tt_string:
- value = QPDF_String(token.getValue()).unparse();
-
- break;
-
- case QPDFTokenizer::tt_name:
- value = QPDF_Name(token.getValue()).unparse();
- break;
-
- default:
- break;
- }
- writeNext(value.c_str(), value.length());
}
void
Pl_QPDFTokenizer::processChar(char ch)
{
- tokenizer.presentCharacter(ch);
+ this->m->tokenizer.presentCharacter(ch);
QPDFTokenizer::Token token;
- if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
+ if (this->m->tokenizer.getToken(
+ token, this->m->unread_char, this->m->char_to_unread))
{
- writeToken(token);
- std::string value = token.getRawValue();
- QPDFTokenizer::token_type_e token_type = token.getType();
- if (((token_type == QPDFTokenizer::tt_string) ||
- (token_type == QPDFTokenizer::tt_name)) &&
- ((value.find('\r') != std::string::npos) ||
- (value.find('\n') != std::string::npos)))
+ this->m->filter->handleToken(token);
+ if ((token.getType() == QPDFTokenizer::tt_word) &&
+ (token.getValue() == "ID"))
{
- writeNext("\n", 1);
- }
- if ((token.getType() == QPDFTokenizer::tt_word) &&
- (token.getValue() == "ID"))
- {
QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
- tokenizer.expectInlineImage();
- }
+ this->m->tokenizer.expectInlineImage();
+ }
}
}
@@ -109,10 +53,10 @@ Pl_QPDFTokenizer::processChar(char ch)
void
Pl_QPDFTokenizer::checkUnread()
{
- if (this->unread_char)
+ if (this->m->unread_char)
{
- processChar(this->char_to_unread);
- if (this->unread_char)
+ processChar(this->m->char_to_unread);
+ if (this->m->unread_char)
{
throw std::logic_error(
"INTERNAL ERROR: unread_char still true after processing "
@@ -135,20 +79,13 @@ Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
void
Pl_QPDFTokenizer::finish()
{
- this->tokenizer.presentEOF();
+ this->m->tokenizer.presentEOF();
QPDFTokenizer::Token token;
- if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
+ if (this->m->tokenizer.getToken(
+ token, this->m->unread_char, this->m->char_to_unread))
{
- writeToken(token);
- if (unread_char)
- {
- if (this->char_to_unread == '\r')
- {
- this->char_to_unread = '\n';
- }
- writeNext(&this->char_to_unread, 1);
- }
+ this->m->filter->handleToken(token);
}
- getNext()->finish();
+ this->m->filter->handleEOF();
}
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 51de87e1..bba95938 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -63,6 +63,50 @@ CoalesceProvider::provideStreamData(int, int, Pipeline* p)
}
void
+QPDFObjectHandle::TokenFilter::setPipeline(Pipeline* p)
+{
+ // Remember the pipeline that write() and finish() send output to.
+ this->pipeline = p;
+}
+
+void
+QPDFObjectHandle::TokenFilter::write(char const* data, size_t len)
+{
+    // Writing requires a destination pipeline to have been set.
+    if (this->pipeline == 0)
+    {
+        throw std::logic_error(
+            "TokenFilter::write called before setPipeline");
+    }
+    // Empty writes are not forwarded downstream.
+    if (len == 0)
+    {
+        return;
+    }
+    this->pipeline->write(QUtil::unsigned_char_pointer(data), len);
+}
+
+void
+QPDFObjectHandle::TokenFilter::write(std::string const& str)
+{
+    // Convenience overload: forward to the pointer/length form.
+    char const* const bytes = str.c_str();
+    size_t const count = str.length();
+    write(bytes, count);
+}
+
+void
+QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
+{
+    // A token is written using its raw value.
+    std::string const& raw = token.getRawValue();
+    write(raw);
+}
+
+void
+QPDFObjectHandle::TokenFilter::finish()
+{
+    // Finishing requires a destination pipeline to have been set.
+    Pipeline* const next = this->pipeline;
+    if (next == 0)
+    {
+        throw std::logic_error(
+            "TokenFilter::finish called before setPipeline");
+    }
+    next->finish();
+}
+
+void
QPDFObjectHandle::ParserCallbacks::terminateParsing()
{
throw TerminateParsing();
@@ -508,6 +552,13 @@ QPDFObjectHandle::getDict()
return dynamic_cast<QPDF_Stream*>(obj.getPointer())->getDict();
}
+bool
+QPDFObjectHandle::isDataModified()
+{
+    // Only meaningful for streams; the answer comes from the
+    // underlying stream object.
+    assertStream();
+    QPDF_Stream* const stream =
+        dynamic_cast<QPDF_Stream*>(obj.getPointer());
+    return stream->isDataModified();
+}
+
void
QPDFObjectHandle::replaceDict(QPDFObjectHandle new_dict)
{
@@ -1033,6 +1084,21 @@ QPDFObjectHandle::parseContentStream_data(
}
}
+void
+QPDFObjectHandle::addContentTokenFilter(PointerHolder<TokenFilter> filter)
+{
+    // Coalesce first, then attach the filter to whatever /Contents
+    // refers to afterwards.
+    coalesceContentStreams();
+    QPDFObjectHandle contents = this->getKey("/Contents");
+    contents.addTokenFilter(filter);
+}
+
+void
+QPDFObjectHandle::addTokenFilter(PointerHolder<TokenFilter> filter)
+{
+    // Only meaningful for streams; hand the filter to the
+    // underlying stream object.
+    assertStream();
+    QPDF_Stream* const stream =
+        dynamic_cast<QPDF_Stream*>(obj.getPointer());
+    stream->addTokenFilter(filter);
+}
+
QPDFObjectHandle
QPDFObjectHandle::parse(PointerHolder<InputSource> input,
std::string const& object_description,
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 078b1af0..c3a017d0 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -7,6 +7,7 @@
#include <qpdf/QTC.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFObjectHandle.hh>
#include <stdexcept>
#include <string.h>
@@ -39,6 +40,23 @@ QPDFTokenizer::Members::~Members()
{
}
+QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
+    type(type),
+    value(value),
+    raw_value(value)
+{
+    // For most token types the raw value is the same as the value.
+    // Strings and names are the exceptions: their raw value is the
+    // unparsed form of the corresponding object. For strings, pass
+    // the value without its outer delimiters.
+    if (type == tt_string)
+    {
+        raw_value = QPDFObjectHandle::newString(value).unparse();
+    }
+    else if (type == tt_name)
+    {
+        raw_value = QPDFObjectHandle::newName(value).unparse();
+    }
+}
+
+
+
QPDFTokenizer::QPDFTokenizer() :
m(new Members())
{
diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc
index f7f834b5..f277189a 100644
--- a/libqpdf/QPDFWriter.cc
+++ b/libqpdf/QPDFWriter.cc
@@ -1591,7 +1591,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
{
is_metadata = true;
}
- bool filter = (this->m->compress_streams ||
+ bool filter = (object.isDataModified() ||
+ this->m->compress_streams ||
this->m->stream_decode_level);
if (this->m->compress_streams)
{
@@ -1602,7 +1603,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
// compressed with a lossy compression scheme, but we
// don't support any of those right now.
QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter");
- if (filter_obj.isName() &&
+ if ((! object.isDataModified()) &&
+ filter_obj.isName() &&
((filter_obj.getName() == "/FlateDecode") ||
(filter_obj.getName() == "/Fl")))
{
diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc
index 89b6b5a9..a026f9a4 100644
--- a/libqpdf/QPDF_Stream.cc
+++ b/libqpdf/QPDF_Stream.cc
@@ -13,7 +13,7 @@
#include <qpdf/Pl_RunLength.hh>
#include <qpdf/Pl_DCT.hh>
#include <qpdf/Pl_Count.hh>
-
+#include <qpdf/ContentNormalizer.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFExc.hh>
@@ -91,6 +91,12 @@ QPDF_Stream::getDict() const
return this->stream_dict;
}
+bool
+QPDF_Stream::isDataModified() const
+{
+    // The data counts as modified as soon as at least one token
+    // filter has been added.
+    bool const has_token_filters = (! this->token_filters.empty());
+    return has_token_filters;
+}
+
PointerHolder<Buffer>
QPDF_Stream::getStreamData(qpdf_stream_decode_level_e decode_level)
{
@@ -440,21 +446,36 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
// create to be deleted when this function finishes.
std::vector<PointerHolder<Pipeline> > to_delete;
+ PointerHolder<ContentNormalizer> normalizer;
if (filter)
{
if (encode_flags & qpdf_ef_compress)
{
- pipeline = new Pl_Flate("compress object stream", pipeline,
+ pipeline = new Pl_Flate("compress stream", pipeline,
Pl_Flate::a_deflate);
to_delete.push_back(pipeline);
}
if (encode_flags & qpdf_ef_normalize)
{
- pipeline = new Pl_QPDFTokenizer("normalizer", pipeline);
+ normalizer = new ContentNormalizer();
+ normalizer->setPipeline(pipeline);
+ pipeline = new Pl_QPDFTokenizer(
+ "normalizer", normalizer.getPointer());
to_delete.push_back(pipeline);
}
+ for (std::vector<PointerHolder<
+ QPDFObjectHandle::TokenFilter> >::reverse_iterator iter =
+ this->token_filters.rbegin();
+ iter != this->token_filters.rend(); ++iter)
+ {
+ (*iter)->setPipeline(pipeline);
+ pipeline = new Pl_QPDFTokenizer(
+ "token filter", (*iter).getPointer());
+ to_delete.push_back(pipeline);
+ }
+
for (std::vector<std::string>::reverse_iterator iter = filters.rbegin();
iter != filters.rend(); ++iter)
{
@@ -613,6 +634,13 @@ QPDF_Stream::replaceStreamData(
}
void
+QPDF_Stream::addTokenFilter(
+ PointerHolder<QPDFObjectHandle::TokenFilter> token_filter)
+{
+ // Append to the list of filters; a non-empty list is what makes
+ // isDataModified() return true.
+ this->token_filters.push_back(token_filter);
+}
+
+void
QPDF_Stream::replaceFilterData(QPDFObjectHandle const& filter,
QPDFObjectHandle const& decode_parms,
size_t length)
diff --git a/libqpdf/build.mk b/libqpdf/build.mk
index c75c3dd9..11895623 100644
--- a/libqpdf/build.mk
+++ b/libqpdf/build.mk
@@ -9,6 +9,7 @@ SRCS_libqpdf = \
libqpdf/BitWriter.cc \
libqpdf/Buffer.cc \
libqpdf/BufferInputSource.cc \
+ libqpdf/ContentNormalizer.cc \
libqpdf/FileInputSource.cc \
libqpdf/InputSource.cc \
libqpdf/InsecureRandomDataProvider.cc \
diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh
new file mode 100644
index 00000000..504f15e8
--- /dev/null
+++ b/libqpdf/qpdf/ContentNormalizer.hh
@@ -0,0 +1,15 @@
+#ifndef __CONTENTNORMALIZER_HH__
+#define __CONTENTNORMALIZER_HH__
+
+#include <qpdf/QPDFObjectHandle.hh>
+
+// A TokenFilter that writes content stream tokens back out in
+// normalized form.
+class ContentNormalizer: public QPDFObjectHandle::TokenFilter
+{
+ public:
+ ContentNormalizer();
+ virtual ~ContentNormalizer();
+ // Writes the token downstream, normalizing line endings in
+ // whitespace and re-creating string and name tokens.
+ virtual void handleToken(QPDFTokenizer::Token const&);
+ // Calls finish().
+ virtual void handleEOF();
+};
+
+#endif // __CONTENTNORMALIZER_HH__
diff --git a/libqpdf/qpdf/Pl_QPDFTokenizer.hh b/libqpdf/qpdf/Pl_QPDFTokenizer.hh
index 54507f68..9f4ac133 100644
--- a/libqpdf/qpdf/Pl_QPDFTokenizer.hh
+++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh
@@ -4,6 +4,8 @@
#include <qpdf/Pipeline.hh>
#include <qpdf/QPDFTokenizer.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/QPDFObjectHandle.hh>
//
// Treat incoming text as a stream consisting of valid PDF tokens, but
@@ -16,7 +18,8 @@
class Pl_QPDFTokenizer: public Pipeline
{
public:
- Pl_QPDFTokenizer(char const* identifier, Pipeline* next);
+ Pl_QPDFTokenizer(char const* identifier,
+ QPDFObjectHandle::TokenFilter* filter);
virtual ~Pl_QPDFTokenizer();
virtual void write(unsigned char* buf, size_t len);
virtual void finish();
@@ -24,14 +27,25 @@ class Pl_QPDFTokenizer: public Pipeline
private:
void processChar(char ch);
void checkUnread();
- void writeNext(char const*, size_t len);
- void writeToken(QPDFTokenizer::Token&);
-
- QPDFTokenizer tokenizer;
- bool just_wrote_nl;
- bool last_char_was_cr;
- bool unread_char;
- char char_to_unread;
+
+ class Members
+ {
+ friend class Pl_QPDFTokenizer;
+
+ public:
+ ~Members();
+
+ private:
+ Members();
+ Members(Members const&);
+
+ QPDFObjectHandle::TokenFilter* filter;
+ QPDFTokenizer tokenizer;
+ bool last_char_was_cr;
+ bool unread_char;
+ char char_to_unread;
+ };
+ PointerHolder<Members> m;
};
#endif // __PL_QPDFTOKENIZER_HH__
diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh
index 5350fc0d..86b796cf 100644
--- a/libqpdf/qpdf/QPDF_Stream.hh
+++ b/libqpdf/qpdf/QPDF_Stream.hh
@@ -20,6 +20,7 @@ class QPDF_Stream: public QPDFObject
virtual QPDFObject::object_type_e getTypeCode() const;
virtual char const* getTypeName() const;
QPDFObjectHandle getDict() const;
+ bool isDataModified() const;
// See comments in QPDFObjectHandle.hh for these methods.
bool pipeStreamData(Pipeline*,
@@ -35,6 +36,8 @@ class QPDF_Stream: public QPDFObject
PointerHolder<QPDFObjectHandle::StreamDataProvider> provider,
QPDFObjectHandle const& filter,
QPDFObjectHandle const& decode_parms);
+ void addTokenFilter(
+ PointerHolder<QPDFObjectHandle::TokenFilter> token_filter);
void replaceDict(QPDFObjectHandle new_dict);
@@ -72,6 +75,8 @@ class QPDF_Stream: public QPDFObject
size_t length;
PointerHolder<Buffer> stream_data;
PointerHolder<QPDFObjectHandle::StreamDataProvider> stream_provider;
+ std::vector<
+ PointerHolder<QPDFObjectHandle::TokenFilter> > token_filters;
};
#endif // __QPDF_STREAM_HH__
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index 9d279267..a3572859 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -758,6 +758,19 @@ $td->runtest("check output",
show_ntests();
# ----------
+$td->notify("--- Token filters ---");
+$n_tests += 2;
+
+$td->runtest("token filter",
+ {$td->COMMAND => "test_driver 41 coalesce.pdf"},
+ {$td->STRING => "test 41 done\n", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "token-filters-out.pdf"});
+
+show_ntests();
+# ----------
$td->notify("--- Newline before endstream ---");
$n_tests += 10;
diff --git a/qpdf/qtest/qpdf/token-filters-out.pdf b/qpdf/qtest/qpdf/token-filters-out.pdf
new file mode 100644
index 00000000..6d24497c
--- /dev/null
+++ b/qpdf/qtest/qpdf/token-filters-out.pdf
@@ -0,0 +1,171 @@
+%PDF-1.3
+%
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+ /Pages 2 0 R
+ /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+ /Count 2
+ /Kids [
+ 3 0 R
+ 4 0 R
+ ]
+ /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+ /Contents 5 0 R
+ /MediaBox [
+ 0
+ 0
+ 612
+ 792
+ ]
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 7 0 R
+ >>
+ /ProcSet 8 0 R
+ >>
+ /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 4 0
+4 0 obj
+<<
+ /Contents 9 0 R
+ /MediaBox [
+ 0
+ 0
+ 612
+ 792
+ ]
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 11 0 R
+ >>
+ /ProcSet 12 0 R
+ >>
+ /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 19 0
+5 0 obj
+<<
+ /Length 6 0 R
+>>
+stream
+BT
+ /F1 24 Tf
+ 72 720 Td
+ (Salad) Tj
+ET [ /array/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xI P|C;U`7Z Ę}D_W->>^&u]"!*&E|Sy d-<B0B@N+<hlK/56L >0>Y!c\Y %Y8?&}j;3lpsHtQTt*hUw%)p"DiRjDYNUAvF& u#cW ߉WO
+EI/bye
+endstream
+endobj
+
+6 0 obj
+375
+endobj
+
+%% Original object ID: 13 0
+7 0 obj
+<<
+ /BaseFont /Helvetica
+ /Encoding /WinAnsiEncoding
+ /Name /F1
+ /Subtype /Type1
+ /Type /Font
+>>
+endobj
+
+%% Original object ID: 14 0
+8 0 obj
+[
+ /PDF
+ /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+9 0 obj
+<<
+ /Length 10 0 R
+>>
+stream
+BT
+ /F1 24 Tf
+ 72 720 Td
+ (Salad) Tj
+ET
+/bye
+endstream
+endobj
+
+10 0 obj
+48
+endobj
+
+%% Original object ID: 17 0
+11 0 obj
+<<
+ /BaseFont /Helvetica
+ /Encoding /WinAnsiEncoding
+ /Name /F1
+ /Subtype /Type1
+ /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+12 0 obj
+[
+ /PDF
+ /Text
+]
+endobj
+
+xref
+0 13
+0000000000 65535 f
+0000000052 00000 n
+0000000133 00000 n
+0000000252 00000 n
+0000000481 00000 n
+0000000726 00000 n
+0000001156 00000 n
+0000001204 00000 n
+0000001350 00000 n
+0000001436 00000 n
+0000001540 00000 n
+0000001588 00000 n
+0000001735 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 13
+ /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
+>>
+startxref
+1771
+%%EOF
diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc
index 001e6dfb..027d942c 100644
--- a/qpdf/test_driver.cc
+++ b/qpdf/test_driver.cc
@@ -97,6 +97,36 @@ ParserCallbacks::handleEOF()
std::cout << "-EOF-" << std::endl;
}
+// Test filter: replaces the string (Potato) with (Salad), passes
+// every other token through, and appends /bye at end of input.
+class TokenFilter: public QPDFObjectHandle::TokenFilter
+{
+  public:
+    TokenFilter()
+    {
+    }
+    virtual ~TokenFilter()
+    {
+    }
+    virtual void handleToken(QPDFTokenizer::Token const& t)
+    {
+        QPDFTokenizer::Token const potato(
+            QPDFTokenizer::tt_string, "Potato");
+        if (! (t == potato))
+        {
+            writeToken(t);
+            return;
+        }
+        // Exercise unparsing of strings by token constructor
+        QPDFTokenizer::Token const salad(
+            QPDFTokenizer::tt_string, "Salad");
+        writeToken(salad);
+    }
+    virtual void handleEOF()
+    {
+        QPDFTokenizer::Token const bye(QPDFTokenizer::tt_name, "/bye");
+        writeToken(bye);
+        write("\n");
+        finish();
+    }
+};
+
static std::string getPageContents(QPDFObjectHandle page)
{
PointerHolder<Buffer> b1 =
@@ -1345,6 +1375,22 @@ void runtest(int n, char const* filename1, char const* arg2)
w.setStaticID(true);
w.write();
}
+ else if (n == 41)
+ {
+ // Apply a token filter. This test case is crafted to work
+ // with coalesce.pdf.
+ std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+ for (std::vector<QPDFObjectHandle>::iterator iter =
+ pages.begin();
+ iter != pages.end(); ++iter)
+ {
+ (*iter).addContentTokenFilter(new TokenFilter);
+ }
+ QPDFWriter w(pdf, "a.pdf");
+ w.setQDFMode(true);
+ w.setStaticID(true);
+ w.write();
+ }
else
{
throw std::runtime_error(std::string("invalid test ") +