diff options
-rw-r--r-- | ChangeLog | 43 | ||||
-rw-r--r-- | include/qpdf/QPDFObjectHandle.hh | 87 | ||||
-rw-r--r-- | include/qpdf/QPDFTokenizer.hh | 11 | ||||
-rw-r--r-- | libqpdf/ContentNormalizer.cc | 77 | ||||
-rw-r--r-- | libqpdf/Pl_QPDFTokenizer.cc | 121 | ||||
-rw-r--r-- | libqpdf/QPDFObjectHandle.cc | 66 | ||||
-rw-r--r-- | libqpdf/QPDFTokenizer.cc | 18 | ||||
-rw-r--r-- | libqpdf/QPDFWriter.cc | 6 | ||||
-rw-r--r-- | libqpdf/QPDF_Stream.cc | 34 | ||||
-rw-r--r-- | libqpdf/build.mk | 1 | ||||
-rw-r--r-- | libqpdf/qpdf/ContentNormalizer.hh | 15 | ||||
-rw-r--r-- | libqpdf/qpdf/Pl_QPDFTokenizer.hh | 32 | ||||
-rw-r--r-- | libqpdf/qpdf/QPDF_Stream.hh | 5 | ||||
-rw-r--r-- | qpdf/qtest/qpdf.test | 13 | ||||
-rw-r--r-- | qpdf/qtest/qpdf/token-filters-out.pdf | 171 | ||||
-rw-r--r-- | qpdf/test_driver.cc | 46 |
16 files changed, 631 insertions, 115 deletions
@@ -107,6 +107,49 @@ applications that use page-level APIs in QPDFObjectHandle to be more tolerant of certain types of damaged files. + * Add QPDFObjectHandle::TokenFilter class and methods to use it to + perform lexical filtering on content streams. You can call + QPDFObjectHandle::addTokenFilter on a stream object, or you can call + the higher level QPDFObjectHandle::addContentTokenFilter on a page + object to cause the stream's contents to be passed through a token + filter while being retrieved by QPDFWriter or any other consumer. + For details on using TokenFilter, please see comments in + QPDFObjectHandle.hh. + + * Enhance the (type, string) QPDFTokenizer::Token constructor to + initialize a raw value in addition to a value. Tokens have a + value, which is a canonical representation, and a raw value. For + all tokens except strings and names, the raw value and the value + are the same. For strings, the value excludes the outer delimiters + and has non-printing characters normalized. For names, the value + resolves non-printing characters. In order to better facilitate + token filters that mostly preserve contents and to enable + developers to be mostly unconcerned about the nuances of token + values and raw values, creating string and name tokens now + properly handles this subtlety of values and raw values. When + constructing string tokens, take care to avoid passing in the + outer delimiters. This has always been the case, but it is now + clarified in comments in QPDFObjectHandle.hh::TokenFilter. This + has no impact on any existing code unless there's some code + somewhere that was relying on Token::getRawValue() returning an + empty string for a manually constructed token. The token class's + operator== method still only looks at type and value, not raw + value. For example, string tokens for <41> and (A) would still be + equal because both are representations of the string "A". + + * Add QPDFObjectHandle::isDataModified method. 
This method just + returns true if addTokenFilter has been called on the stream. It + enables a caller to determine whether it is safe to optimize away + piping of stream data in cases where the input and output are + expected to be the same. QPDFWriter uses this internally to skip + the optimization of not re-compressing already compressed streams + if addTokenFilter has been called. Most developers will not have + to worry about this as it is used internally in the library in the + places that need it. If you are manually retrieving stream data + with QPDFObjectHandle::getStreamData or + QPDFObjectHandle::pipeStreamData, you don't need to worry about + this at all. + 2018-02-04 Jay Berkenbilt <ejb@ql.org> * Add QPDFWriter::setLinearizationPass1Filename method and diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 14dadd6c..f0b8f2af 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -35,6 +35,7 @@ #include <qpdf/PointerHolder.hh> #include <qpdf/Buffer.hh> #include <qpdf/InputSource.hh> +#include <qpdf/QPDFTokenizer.hh> #include <qpdf/QPDFObject.hh> @@ -76,6 +77,66 @@ class QPDFObjectHandle Pipeline* pipeline) = 0; }; + // The TokenFilter class provides a way to filter content streams + // in a lexically aware fashion. TokenFilters can be attached to + // streams using the addTokenFilter or addContentTokenFilter + // methods. The handleToken method is called for each token, + // including the eof token, and then handleEOF is called at the + // very end. Handlers may call write (or writeToken) to pass data + // downstream. The finish() method must be called exactly one time + // to ensure that any written data is flushed out. The default + // handleEOF calls finish. If you override handleEOF, you must + // ensure that finish() is called either there or in response to + // whatever event causes you to terminate creation of output. 
+ // Failure to call finish() may result in some of the data you + // have written being lost. You should not rely on a destructor + // for calling finish() since the destructor call may occur later + // than you expect. Please see examples/token-filters.cc for + // examples of using TokenFilters. + // + // Please note that when you call token.getValue() on a token of + // type tt_string, you get the string value without any + // delimiters. token.getRawValue() will return something suitable + // for being written to output, or calling writeToken with a + // string token will also work. The correct way to construct a + // string token that would write the literal value (str) is + // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str"). + class TokenFilter + { + public: + QPDF_DLL + TokenFilter() : pipeline(0) + { + } + QPDF_DLL + virtual ~TokenFilter() + { + } + virtual void handleToken(QPDFTokenizer::Token const&) = 0; + virtual void handleEOF() + { + // If you override handleEOF, you must be sure to call + // finish(). + finish(); + } + + // This is called internally by the qpdf library. + void setPipeline(Pipeline*); + + protected: + QPDF_DLL + void write(char const* data, size_t len); + QPDF_DLL + void write(std::string const& str); + QPDF_DLL + void writeToken(QPDFTokenizer::Token const&); + QPDF_DLL + void finish(); + + private: + Pipeline* pipeline; // null until setPipeline() is called + }; + // This class is used by parse to decrypt strings when reading an // object that contains encrypted strings. class StringDecrypter @@ -223,6 +284,23 @@ class QPDFObjectHandle static void parseContentStream(QPDFObjectHandle stream_or_array, ParserCallbacks* callbacks); + // Attach a token filter to a page's contents. If the page's + // contents is an array of streams, it is automatically coalesced. + // The token filter is applied to the page's contents as a single + // stream. 
+ QPDF_DLL + void addContentTokenFilter(PointerHolder<TokenFilter> token_filter); + + // As of qpdf 8, it is possible to add custom token filters to a + // stream. The tokenized stream data is passed through the token + // filter after all original filters but before content stream + // normalization if requested. This is a low-level interface to + // add it to a stream. You will usually want to call + // addContentTokenFilter instead, which can be applied to a page + // object, and which will automatically handle the case of pages + // whose contents are split across multiple streams. + void addTokenFilter(PointerHolder<TokenFilter> token_filter); + // Type-specific factories QPDF_DLL static QPDFObjectHandle newNull(); @@ -414,6 +492,13 @@ class QPDFObjectHandle QPDF_DLL QPDFObjectHandle getDict(); + // If addTokenFilter has been called for this stream, then the + // original data should be considered to be modified. This means we + // should avoid optimizations such as not filtering a stream that + // is already compressed. + QPDF_DLL + bool isDataModified(); + // Returns filtered (uncompressed) stream data. Throws an // exception if the stream is filtered and we can't decode it. QPDF_DLL @@ -608,7 +693,7 @@ class QPDFObjectHandle // stream or an array of streams. If this page's content is an // array, concatenate the streams into a single stream. This can // be useful when working with files that split content streams in - // arbitary spots, such as in the middle of a token, as that can + // arbitrary spots, such as in the middle of a token, as that can // confuse some software. You could also call this after calling // addPageContents. 
QPDF_DLL diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index fe2e95f7..eb9215aa 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -62,13 +62,8 @@ class QPDFTokenizer { public: Token() : type(tt_bad) {} - - Token(token_type_e type, std::string const& value) : - type(type), - value(value) - { - } - + QPDF_DLL + Token(token_type_e type, std::string const& value); Token(token_type_e type, std::string const& value, std::string raw_value, std::string error_message) : type(type), @@ -93,7 +88,7 @@ class QPDFTokenizer { return this->error_message; } - bool operator==(Token const& rhs) + bool operator==(Token const& rhs) const { // Ignore fields other than type and value return ((this->type != tt_bad) && diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc new file mode 100644 index 00000000..35a8ad74 --- /dev/null +++ b/libqpdf/ContentNormalizer.cc @@ -0,0 +1,77 @@ +#include <qpdf/ContentNormalizer.hh> +#include <qpdf/QUtil.hh> + +ContentNormalizer::ContentNormalizer() +{ +} + +ContentNormalizer::~ContentNormalizer() +{ +} + +void +ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) +{ + std::string value = token.getRawValue(); + QPDFTokenizer::token_type_e token_type = token.getType(); + + switch (token_type) + { + case QPDFTokenizer::tt_space: + { + size_t len = value.length(); + for (size_t i = 0; i < len; ++i) + { + char ch = value.at(i); + if (ch == '\r') + { + if ((i + 1 < len) && (value.at(i + 1) == '\n')) + { + // ignore + } + else + { + write("\n"); + } + } + else + { + write(&ch, 1); + } + } + } + break; + + case QPDFTokenizer::tt_string: + // Replacing string and name tokens in this way normalizes + // their representation as this will automatically handle + // quoting of unprintable characters, etc. 
+ writeToken(QPDFTokenizer::Token( + QPDFTokenizer::tt_string, token.getValue())); + break; + + case QPDFTokenizer::tt_name: + writeToken(QPDFTokenizer::Token( + QPDFTokenizer::tt_name, token.getValue())); + break; + + default: + writeToken(token); + break; + } + + value = token.getRawValue(); + if (((token_type == QPDFTokenizer::tt_string) || + (token_type == QPDFTokenizer::tt_name)) && + ((value.find('\r') != std::string::npos) || + (value.find('\n') != std::string::npos))) + { + write("\n"); + } +} + +void +ContentNormalizer::handleEOF() +{ + finish(); +} diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc index 9595cd75..4fc37767 100644 --- a/libqpdf/Pl_QPDFTokenizer.cc +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -1,107 +1,51 @@ #include <qpdf/Pl_QPDFTokenizer.hh> -#include <qpdf/QPDF_String.hh> -#include <qpdf/QPDF_Name.hh> #include <qpdf/QTC.hh> -#include <qpdf/QUtil.hh> #include <stdexcept> #include <string.h> -Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : - Pipeline(identifier, next), - just_wrote_nl(false), +Pl_QPDFTokenizer::Members::Members() : + filter(0), last_char_was_cr(false), unread_char(false), char_to_unread('\0') { - tokenizer.allowEOF(); - tokenizer.includeIgnorable(); } -Pl_QPDFTokenizer::~Pl_QPDFTokenizer() +Pl_QPDFTokenizer::Members::~Members() { } -void -Pl_QPDFTokenizer::writeNext(char const* buf, size_t len) +Pl_QPDFTokenizer::Pl_QPDFTokenizer( + char const* identifier, + QPDFObjectHandle::TokenFilter* filter) + : + Pipeline(identifier, 0), + m(new Members) { - if (len) - { - getNext()->write(QUtil::unsigned_char_pointer(buf), len); - this->just_wrote_nl = (buf[len-1] == '\n'); - } + m->filter = filter; + m->tokenizer.allowEOF(); + m->tokenizer.includeIgnorable(); } -void -Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) +Pl_QPDFTokenizer::~Pl_QPDFTokenizer() { - std::string value = token.getRawValue(); - - switch (token.getType()) - { - case QPDFTokenizer::tt_space: - { - size_t len = 
value.length(); - for (size_t i = 0; i < len; ++i) - { - char ch = value.at(i); - if (ch == '\r') - { - if ((i + 1 < len) && (value.at(i + 1) == '\n')) - { - // ignore - } - else - { - writeNext("\n", 1); - } - } - else - { - writeNext(&ch, 1); - } - } - } - value.clear(); - break; - - case QPDFTokenizer::tt_string: - value = QPDF_String(token.getValue()).unparse(); - - break; - - case QPDFTokenizer::tt_name: - value = QPDF_Name(token.getValue()).unparse(); - break; - - default: - break; - } - writeNext(value.c_str(), value.length()); } void Pl_QPDFTokenizer::processChar(char ch) { - tokenizer.presentCharacter(ch); + this->m->tokenizer.presentCharacter(ch); QPDFTokenizer::Token token; - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + if (this->m->tokenizer.getToken( + token, this->m->unread_char, this->m->char_to_unread)) { - writeToken(token); - std::string value = token.getRawValue(); - QPDFTokenizer::token_type_e token_type = token.getType(); - if (((token_type == QPDFTokenizer::tt_string) || - (token_type == QPDFTokenizer::tt_name)) && - ((value.find('\r') != std::string::npos) || - (value.find('\n') != std::string::npos))) + this->m->filter->handleToken(token); + if ((token.getType() == QPDFTokenizer::tt_word) && + (token.getValue() == "ID")) { - writeNext("\n", 1); - } - if ((token.getType() == QPDFTokenizer::tt_word) && - (token.getValue() == "ID")) - { QTC::TC("qpdf", "Pl_QPDFTokenizer found ID"); - tokenizer.expectInlineImage(); - } + this->m->tokenizer.expectInlineImage(); + } } } @@ -109,10 +53,10 @@ Pl_QPDFTokenizer::processChar(char ch) void Pl_QPDFTokenizer::checkUnread() { - if (this->unread_char) + if (this->m->unread_char) { - processChar(this->char_to_unread); - if (this->unread_char) + processChar(this->m->char_to_unread); + if (this->m->unread_char) { throw std::logic_error( "INTERNAL ERROR: unread_char still true after processing " @@ -135,20 +79,13 @@ Pl_QPDFTokenizer::write(unsigned char* buf, size_t len) void 
Pl_QPDFTokenizer::finish() { - this->tokenizer.presentEOF(); + this->m->tokenizer.presentEOF(); QPDFTokenizer::Token token; - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + if (this->m->tokenizer.getToken( + token, this->m->unread_char, this->m->char_to_unread)) { - writeToken(token); - if (unread_char) - { - if (this->char_to_unread == '\r') - { - this->char_to_unread = '\n'; - } - writeNext(&this->char_to_unread, 1); - } + this->m->filter->handleToken(token); } - getNext()->finish(); + this->m->filter->handleEOF(); } diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 51de87e1..bba95938 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -63,6 +63,50 @@ CoalesceProvider::provideStreamData(int, int, Pipeline* p) } void +QPDFObjectHandle::TokenFilter::setPipeline(Pipeline* p) +{ + this->pipeline = p; +} + +void +QPDFObjectHandle::TokenFilter::write(char const* data, size_t len) +{ + if (! this->pipeline) + { + throw std::logic_error( + "TokenFilter::write called before setPipeline"); + } + if (len) + { + this->pipeline->write(QUtil::unsigned_char_pointer(data), len); + } +} + +void +QPDFObjectHandle::TokenFilter::write(std::string const& str) +{ + write(str.c_str(), str.length()); +} + +void +QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) +{ + std::string value = token.getRawValue(); + write(value.c_str(), value.length()); +} + +void +QPDFObjectHandle::TokenFilter::finish() +{ + if (! 
this->pipeline) + { + throw std::logic_error( + "TokenFilter::finish called before setPipeline"); + } + this->pipeline->finish(); +} + +void QPDFObjectHandle::ParserCallbacks::terminateParsing() { throw TerminateParsing(); @@ -508,6 +552,13 @@ QPDFObjectHandle::getDict() return dynamic_cast<QPDF_Stream*>(obj.getPointer())->getDict(); } +bool +QPDFObjectHandle::isDataModified() +{ + assertStream(); + return dynamic_cast<QPDF_Stream*>(obj.getPointer())->isDataModified(); +} + void QPDFObjectHandle::replaceDict(QPDFObjectHandle new_dict) { @@ -1033,6 +1084,21 @@ QPDFObjectHandle::parseContentStream_data( } } +void +QPDFObjectHandle::addContentTokenFilter(PointerHolder<TokenFilter> filter) +{ + coalesceContentStreams(); + this->getKey("/Contents").addTokenFilter(filter); +} + +void +QPDFObjectHandle::addTokenFilter(PointerHolder<TokenFilter> filter) +{ + assertStream(); + return dynamic_cast<QPDF_Stream*>( + obj.getPointer())->addTokenFilter(filter); +} + QPDFObjectHandle QPDFObjectHandle::parse(PointerHolder<InputSource> input, std::string const& object_description, diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 078b1af0..c3a017d0 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -7,6 +7,7 @@ #include <qpdf/QTC.hh> #include <qpdf/QPDFExc.hh> #include <qpdf/QUtil.hh> +#include <qpdf/QPDFObjectHandle.hh> #include <stdexcept> #include <string.h> @@ -39,6 +40,23 @@ QPDFTokenizer::Members::~Members() { } +QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : + type(type), + value(value), + raw_value(value) +{ + if (type == tt_string) + { + raw_value = QPDFObjectHandle::newString(value).unparse(); + } + else if (type == tt_name) // names, like strings, take their raw value from unparse() + { + raw_value = QPDFObjectHandle::newName(value).unparse(); + } +} + + + QPDFTokenizer::QPDFTokenizer() : m(new Members()) { diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index f7f834b5..f277189a 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ 
-1591,7 +1591,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, { is_metadata = true; } - bool filter = (this->m->compress_streams || + bool filter = (object.isDataModified() || + this->m->compress_streams || this->m->stream_decode_level); if (this->m->compress_streams) { @@ -1602,7 +1603,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, // compressed with a lossy compression scheme, but we // don't support any of those right now. QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter"); - if (filter_obj.isName() && + if ((! object.isDataModified()) && + filter_obj.isName() && ((filter_obj.getName() == "/FlateDecode") || (filter_obj.getName() == "/Fl"))) { diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc index 89b6b5a9..a026f9a4 100644 --- a/libqpdf/QPDF_Stream.cc +++ b/libqpdf/QPDF_Stream.cc @@ -13,7 +13,7 @@ #include <qpdf/Pl_RunLength.hh> #include <qpdf/Pl_DCT.hh> #include <qpdf/Pl_Count.hh> - +#include <qpdf/ContentNormalizer.hh> #include <qpdf/QTC.hh> #include <qpdf/QPDF.hh> #include <qpdf/QPDFExc.hh> @@ -91,6 +91,12 @@ QPDF_Stream::getDict() const return this->stream_dict; } +bool +QPDF_Stream::isDataModified() const +{ + return (! this->token_filters.empty()); +} + PointerHolder<Buffer> QPDF_Stream::getStreamData(qpdf_stream_decode_level_e decode_level) { @@ -440,21 +446,36 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, // create to be deleted when this function finishes. 
std::vector<PointerHolder<Pipeline> > to_delete; + PointerHolder<ContentNormalizer> normalizer; if (filter) { if (encode_flags & qpdf_ef_compress) { - pipeline = new Pl_Flate("compress object stream", pipeline, + pipeline = new Pl_Flate("compress stream", pipeline, Pl_Flate::a_deflate); to_delete.push_back(pipeline); } if (encode_flags & qpdf_ef_normalize) { - pipeline = new Pl_QPDFTokenizer("normalizer", pipeline); + normalizer = new ContentNormalizer(); + normalizer->setPipeline(pipeline); + pipeline = new Pl_QPDFTokenizer( + "normalizer", normalizer.getPointer()); to_delete.push_back(pipeline); } + for (std::vector<PointerHolder< + QPDFObjectHandle::TokenFilter> >::reverse_iterator iter = + this->token_filters.rbegin(); + iter != this->token_filters.rend(); ++iter) + { + (*iter)->setPipeline(pipeline); + pipeline = new Pl_QPDFTokenizer( + "token filter", (*iter).getPointer()); + to_delete.push_back(pipeline); + } + for (std::vector<std::string>::reverse_iterator iter = filters.rbegin(); iter != filters.rend(); ++iter) { @@ -613,6 +634,13 @@ QPDF_Stream::replaceStreamData( } void +QPDF_Stream::addTokenFilter( + PointerHolder<QPDFObjectHandle::TokenFilter> token_filter) +{ + this->token_filters.push_back(token_filter); +} + +void QPDF_Stream::replaceFilterData(QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms, size_t length) diff --git a/libqpdf/build.mk b/libqpdf/build.mk index c75c3dd9..11895623 100644 --- a/libqpdf/build.mk +++ b/libqpdf/build.mk @@ -9,6 +9,7 @@ SRCS_libqpdf = \ libqpdf/BitWriter.cc \ libqpdf/Buffer.cc \ libqpdf/BufferInputSource.cc \ + libqpdf/ContentNormalizer.cc \ libqpdf/FileInputSource.cc \ libqpdf/InputSource.cc \ libqpdf/InsecureRandomDataProvider.cc \ diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh new file mode 100644 index 00000000..504f15e8 --- /dev/null +++ b/libqpdf/qpdf/ContentNormalizer.hh @@ -0,0 +1,15 @@ +#ifndef __CONTENTNORMALIZER_HH__ +#define __CONTENTNORMALIZER_HH__ + 
+#include <qpdf/QPDFObjectHandle.hh> + +class ContentNormalizer: public QPDFObjectHandle::TokenFilter +{ + public: + ContentNormalizer(); + virtual ~ContentNormalizer(); + virtual void handleToken(QPDFTokenizer::Token const&); + virtual void handleEOF(); +}; + +#endif // __CONTENTNORMALIZER_HH__ diff --git a/libqpdf/qpdf/Pl_QPDFTokenizer.hh b/libqpdf/qpdf/Pl_QPDFTokenizer.hh index 54507f68..9f4ac133 100644 --- a/libqpdf/qpdf/Pl_QPDFTokenizer.hh +++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh @@ -4,6 +4,8 @@ #include <qpdf/Pipeline.hh> #include <qpdf/QPDFTokenizer.hh> +#include <qpdf/PointerHolder.hh> +#include <qpdf/QPDFObjectHandle.hh> // // Treat incoming text as a stream consisting of valid PDF tokens, but @@ -16,7 +18,8 @@ class Pl_QPDFTokenizer: public Pipeline { public: - Pl_QPDFTokenizer(char const* identifier, Pipeline* next); + Pl_QPDFTokenizer(char const* identifier, + QPDFObjectHandle::TokenFilter* filter); virtual ~Pl_QPDFTokenizer(); virtual void write(unsigned char* buf, size_t len); virtual void finish(); @@ -24,14 +27,25 @@ class Pl_QPDFTokenizer: public Pipeline private: void processChar(char ch); void checkUnread(); - void writeNext(char const*, size_t len); - void writeToken(QPDFTokenizer::Token&); - - QPDFTokenizer tokenizer; - bool just_wrote_nl; - bool last_char_was_cr; - bool unread_char; - char char_to_unread; + + class Members + { + friend class Pl_QPDFTokenizer; + + public: + ~Members(); + + private: + Members(); + Members(Members const&); + + QPDFObjectHandle::TokenFilter* filter; + QPDFTokenizer tokenizer; + bool last_char_was_cr; + bool unread_char; + char char_to_unread; + }; + PointerHolder<Members> m; }; #endif // __PL_QPDFTOKENIZER_HH__ diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh index 5350fc0d..86b796cf 100644 --- a/libqpdf/qpdf/QPDF_Stream.hh +++ b/libqpdf/qpdf/QPDF_Stream.hh @@ -20,6 +20,7 @@ class QPDF_Stream: public QPDFObject virtual QPDFObject::object_type_e getTypeCode() const; virtual char const* 
getTypeName() const; QPDFObjectHandle getDict() const; + bool isDataModified() const; // See comments in QPDFObjectHandle.hh for these methods. bool pipeStreamData(Pipeline*, @@ -35,6 +36,8 @@ class QPDF_Stream: public QPDFObject PointerHolder<QPDFObjectHandle::StreamDataProvider> provider, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms); + void addTokenFilter( + PointerHolder<QPDFObjectHandle::TokenFilter> token_filter); void replaceDict(QPDFObjectHandle new_dict); @@ -72,6 +75,8 @@ class QPDF_Stream: public QPDFObject size_t length; PointerHolder<Buffer> stream_data; PointerHolder<QPDFObjectHandle::StreamDataProvider> stream_provider; + std::vector< + PointerHolder<QPDFObjectHandle::TokenFilter> > token_filters; }; #endif // __QPDF_STREAM_HH__ diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 9d279267..a3572859 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -758,6 +758,19 @@ $td->runtest("check output", show_ntests(); # ---------- +$td->notify("--- Token filters ---"); +$n_tests += 2; + +$td->runtest("token filter", + {$td->COMMAND => "test_driver 41 coalesce.pdf"}, + {$td->STRING => "test 41 done\n", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "token-filters-out.pdf"}); + +show_ntests(); +# ---------- $td->notify("--- Newline before endstream ---"); $n_tests += 10; diff --git a/qpdf/qtest/qpdf/token-filters-out.pdf b/qpdf/qtest/qpdf/token-filters-out.pdf new file mode 100644 index 00000000..6d24497c --- /dev/null +++ b/qpdf/qtest/qpdf/token-filters-out.pdf @@ -0,0 +1,171 @@ +%PDF-1.3 +% +%QDF-1.0 + +%% Original object ID: 1 0 +1 0 obj +<< + /Pages 2 0 R + /Type /Catalog +>> +endobj + +%% Original object ID: 2 0 +2 0 obj +<< + /Count 2 + /Kids [ + 3 0 R + 4 0 R + ] + /Type /Pages +>> +endobj + +%% Page 1 +%% Original object ID: 3 0 +3 0 obj +<< + /Contents 5 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + 
/Font << + /F1 7 0 R + >> + /ProcSet 8 0 R + >> + /Type /Page +>> +endobj + +%% Page 2 +%% Original object ID: 4 0 +4 0 obj +<< + /Contents 9 0 R + /MediaBox [ + 0 + 0 + 612 + 792 + ] + /Parent 2 0 R + /Resources << + /Font << + /F1 11 0 R + >> + /ProcSet 12 0 R + >> + /Type /Page +>> +endobj + +%% Contents for page 1 +%% Original object ID: 19 0 +5 0 obj +<< + /Length 6 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Salad) Tj +ET [ /array/split ] BI +/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>> +ID xI P|C;U`7ZĘ}D_W->>^&u]"!*&E|Sy d-<B0B@N+<hlK/56L >0>Y!c\Y%Y8?&}j;3lpsHtQTt*hUw%)p"DiRjDYNUAvF&
u#cW ߉WO
+EI/bye +endstream +endobj + +6 0 obj +375 +endobj + +%% Original object ID: 13 0 +7 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 14 0 +8 0 obj +[ + /PDF + /Text +] +endobj + +%% Contents for page 2 +%% Original object ID: 15 0 +9 0 obj +<< + /Length 10 0 R +>> +stream +BT + /F1 24 Tf + 72 720 Td + (Salad) Tj +ET +/bye +endstream +endobj + +10 0 obj +48 +endobj + +%% Original object ID: 17 0 +11 0 obj +<< + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + /Name /F1 + /Subtype /Type1 + /Type /Font +>> +endobj + +%% Original object ID: 18 0 +12 0 obj +[ + /PDF + /Text +] +endobj + +xref +0 13 +0000000000 65535 f +0000000052 00000 n +0000000133 00000 n +0000000252 00000 n +0000000481 00000 n +0000000726 00000 n +0000001156 00000 n +0000001204 00000 n +0000001350 00000 n +0000001436 00000 n +0000001540 00000 n +0000001588 00000 n +0000001735 00000 n +trailer << + /Root 1 0 R + /Size 13 + /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>] +>> +startxref +1771 +%%EOF diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index 001e6dfb..027d942c 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -97,6 +97,36 @@ ParserCallbacks::handleEOF() std::cout << "-EOF-" << std::endl; } +class TokenFilter: public QPDFObjectHandle::TokenFilter +{ + public: + TokenFilter() + { + } + virtual ~TokenFilter() + { + } + virtual void handleToken(QPDFTokenizer::Token const& t) + { + if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_string, "Potato")) + { + // Exercise unparsing of strings by token constructor + writeToken( + QPDFTokenizer::Token(QPDFTokenizer::tt_string, "Salad")); + } + else + { + writeToken(t); + } + } + virtual void handleEOF() + { + writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/bye")); + write("\n"); + finish(); + } +}; + static std::string getPageContents(QPDFObjectHandle page) { PointerHolder<Buffer> b1 = @@ -1345,6 
+1375,22 @@ void runtest(int n, char const* filename1, char const* arg2) w.setStaticID(true); w.write(); } + else if (n == 41) + { + // Apply a token filter. This test case is crafted to work + // with coalesce.pdf. + std::vector<QPDFObjectHandle> pages = pdf.getAllPages(); + for (std::vector<QPDFObjectHandle>::iterator iter = + pages.begin(); + iter != pages.end(); ++iter) + { + (*iter).addContentTokenFilter(new TokenFilter); + } + QPDFWriter w(pdf, "a.pdf"); + w.setQDFMode(true); + w.setStaticID(true); + w.write(); + } else { throw std::runtime_error(std::string("invalid test ") + |