diff options
author | Jay Berkenbilt <ejb@ql.org> | 2018-02-03 00:21:34 +0100 |
---|---|---|
committer | Jay Berkenbilt <ejb@ql.org> | 2018-02-19 03:05:46 +0100 |
commit | 99101044429c3c91bd11bdd1b26e5b6c2ceb140b (patch) | |
tree | 5ab366eab31ddf76e80f99bd1d34c421291f1c4e /libqpdf | |
parent | b8723e97f4b94fe03e631aab0309382ead3137ed (diff) | |
download | qpdf-99101044429c3c91bd11bdd1b26e5b6c2ceb140b.tar.zst |
Implement TokenFilter and refactor Pl_QPDFTokenizer
Implement a TokenFilter class and refactor Pl_QPDFTokenizer to use a
TokenFilter class called ContentNormalizer. Pl_QPDFTokenizer is now a
general filter that passes data through a TokenFilter.
Diffstat (limited to 'libqpdf')
-rw-r--r-- | libqpdf/ContentNormalizer.cc | 77 | ||||
-rw-r--r-- | libqpdf/Pl_QPDFTokenizer.cc | 121 | ||||
-rw-r--r-- | libqpdf/QPDFObjectHandle.cc | 66 | ||||
-rw-r--r-- | libqpdf/QPDFTokenizer.cc | 18 | ||||
-rw-r--r-- | libqpdf/QPDFWriter.cc | 6 | ||||
-rw-r--r-- | libqpdf/QPDF_Stream.cc | 34 | ||||
-rw-r--r-- | libqpdf/build.mk | 1 | ||||
-rw-r--r-- | libqpdf/qpdf/ContentNormalizer.hh | 15 | ||||
-rw-r--r-- | libqpdf/qpdf/Pl_QPDFTokenizer.hh | 32 | ||||
-rw-r--r-- | libqpdf/qpdf/QPDF_Stream.hh | 5 |
10 files changed, 269 insertions, 106 deletions
diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc new file mode 100644 index 00000000..35a8ad74 --- /dev/null +++ b/libqpdf/ContentNormalizer.cc @@ -0,0 +1,77 @@ +#include <qpdf/ContentNormalizer.hh> +#include <qpdf/QUtil.hh> + +ContentNormalizer::ContentNormalizer() +{ +} + +ContentNormalizer::~ContentNormalizer() +{ +} + +void +ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) +{ + std::string value = token.getRawValue(); + QPDFTokenizer::token_type_e token_type = token.getType(); + + switch (token_type) + { + case QPDFTokenizer::tt_space: + { + size_t len = value.length(); + for (size_t i = 0; i < len; ++i) + { + char ch = value.at(i); + if (ch == '\r') + { + if ((i + 1 < len) && (value.at(i + 1) == '\n')) + { + // ignore + } + else + { + write("\n"); + } + } + else + { + write(&ch, 1); + } + } + } + break; + + case QPDFTokenizer::tt_string: + // Replacing string and name tokens in this way normalizes + // their representation as this will automatically handle + // quoting of unprintable characters, etc. 
+ writeToken(QPDFTokenizer::Token( + QPDFTokenizer::tt_string, token.getValue())); + break; + + case QPDFTokenizer::tt_name: + writeToken(QPDFTokenizer::Token( + QPDFTokenizer::tt_name, token.getValue())); + break; + + default: + writeToken(token); + break; + } + + value = token.getRawValue(); + if (((token_type == QPDFTokenizer::tt_string) || + (token_type == QPDFTokenizer::tt_name)) && + ((value.find('\r') != std::string::npos) || + (value.find('\n') != std::string::npos))) + { + write("\n"); + } +} + +void +ContentNormalizer::handleEOF() +{ + finish(); +} diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc index 9595cd75..4fc37767 100644 --- a/libqpdf/Pl_QPDFTokenizer.cc +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -1,107 +1,51 @@ #include <qpdf/Pl_QPDFTokenizer.hh> -#include <qpdf/QPDF_String.hh> -#include <qpdf/QPDF_Name.hh> #include <qpdf/QTC.hh> -#include <qpdf/QUtil.hh> #include <stdexcept> #include <string.h> -Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : - Pipeline(identifier, next), - just_wrote_nl(false), +Pl_QPDFTokenizer::Members::Members() : + filter(0), last_char_was_cr(false), unread_char(false), char_to_unread('\0') { - tokenizer.allowEOF(); - tokenizer.includeIgnorable(); } -Pl_QPDFTokenizer::~Pl_QPDFTokenizer() +Pl_QPDFTokenizer::Members::~Members() { } -void -Pl_QPDFTokenizer::writeNext(char const* buf, size_t len) +Pl_QPDFTokenizer::Pl_QPDFTokenizer( + char const* identifier, + QPDFObjectHandle::TokenFilter* filter) + : + Pipeline(identifier, 0), + m(new Members) { - if (len) - { - getNext()->write(QUtil::unsigned_char_pointer(buf), len); - this->just_wrote_nl = (buf[len-1] == '\n'); - } + m->filter = filter; + m->tokenizer.allowEOF(); + m->tokenizer.includeIgnorable(); } -void -Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) +Pl_QPDFTokenizer::~Pl_QPDFTokenizer() { - std::string value = token.getRawValue(); - - switch (token.getType()) - { - case QPDFTokenizer::tt_space: - { - size_t len = 
value.length(); - for (size_t i = 0; i < len; ++i) - { - char ch = value.at(i); - if (ch == '\r') - { - if ((i + 1 < len) && (value.at(i + 1) == '\n')) - { - // ignore - } - else - { - writeNext("\n", 1); - } - } - else - { - writeNext(&ch, 1); - } - } - } - value.clear(); - break; - - case QPDFTokenizer::tt_string: - value = QPDF_String(token.getValue()).unparse(); - - break; - - case QPDFTokenizer::tt_name: - value = QPDF_Name(token.getValue()).unparse(); - break; - - default: - break; - } - writeNext(value.c_str(), value.length()); } void Pl_QPDFTokenizer::processChar(char ch) { - tokenizer.presentCharacter(ch); + this->m->tokenizer.presentCharacter(ch); QPDFTokenizer::Token token; - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + if (this->m->tokenizer.getToken( + token, this->m->unread_char, this->m->char_to_unread)) { - writeToken(token); - std::string value = token.getRawValue(); - QPDFTokenizer::token_type_e token_type = token.getType(); - if (((token_type == QPDFTokenizer::tt_string) || - (token_type == QPDFTokenizer::tt_name)) && - ((value.find('\r') != std::string::npos) || - (value.find('\n') != std::string::npos))) + this->m->filter->handleToken(token); + if ((token.getType() == QPDFTokenizer::tt_word) && + (token.getValue() == "ID")) { - writeNext("\n", 1); - } - if ((token.getType() == QPDFTokenizer::tt_word) && - (token.getValue() == "ID")) - { QTC::TC("qpdf", "Pl_QPDFTokenizer found ID"); - tokenizer.expectInlineImage(); - } + this->m->tokenizer.expectInlineImage(); + } } } @@ -109,10 +53,10 @@ Pl_QPDFTokenizer::processChar(char ch) void Pl_QPDFTokenizer::checkUnread() { - if (this->unread_char) + if (this->m->unread_char) { - processChar(this->char_to_unread); - if (this->unread_char) + processChar(this->m->char_to_unread); + if (this->m->unread_char) { throw std::logic_error( "INTERNAL ERROR: unread_char still true after processing " @@ -135,20 +79,13 @@ Pl_QPDFTokenizer::write(unsigned char* buf, size_t len) void 
Pl_QPDFTokenizer::finish() { - this->tokenizer.presentEOF(); + this->m->tokenizer.presentEOF(); QPDFTokenizer::Token token; - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + if (this->m->tokenizer.getToken( + token, this->m->unread_char, this->m->char_to_unread)) { - writeToken(token); - if (unread_char) - { - if (this->char_to_unread == '\r') - { - this->char_to_unread = '\n'; - } - writeNext(&this->char_to_unread, 1); - } + this->m->filter->handleToken(token); } - getNext()->finish(); + this->m->filter->handleEOF(); } diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 51de87e1..bba95938 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -63,6 +63,50 @@ CoalesceProvider::provideStreamData(int, int, Pipeline* p) } void +QPDFObjectHandle::TokenFilter::setPipeline(Pipeline* p) +{ + this->pipeline = p; +} + +void +QPDFObjectHandle::TokenFilter::write(char const* data, size_t len) +{ + if (! this->pipeline) + { + throw std::logic_error( + "TokenFilter::write called before setPipeline"); + } + if (len) + { + this->pipeline->write(QUtil::unsigned_char_pointer(data), len); + } +} + +void +QPDFObjectHandle::TokenFilter::write(std::string const& str) +{ + write(str.c_str(), str.length()); +} + +void +QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) +{ + std::string value = token.getRawValue(); + write(value.c_str(), value.length()); +} + +void +QPDFObjectHandle::TokenFilter::finish() +{ + if (! 
this->pipeline) + { + throw std::logic_error( + "TokenFilter::finish called before setPipeline"); + } + this->pipeline->finish(); +} + +void QPDFObjectHandle::ParserCallbacks::terminateParsing() { throw TerminateParsing(); @@ -508,6 +552,13 @@ QPDFObjectHandle::getDict() return dynamic_cast<QPDF_Stream*>(obj.getPointer())->getDict(); } +bool +QPDFObjectHandle::isDataModified() +{ + assertStream(); + return dynamic_cast<QPDF_Stream*>(obj.getPointer())->isDataModified(); +} + void QPDFObjectHandle::replaceDict(QPDFObjectHandle new_dict) { @@ -1033,6 +1084,21 @@ QPDFObjectHandle::parseContentStream_data( } } +void +QPDFObjectHandle::addContentTokenFilter(PointerHolder<TokenFilter> filter) +{ + coalesceContentStreams(); + this->getKey("/Contents").addTokenFilter(filter); +} + +void +QPDFObjectHandle::addTokenFilter(PointerHolder<TokenFilter> filter) +{ + assertStream(); + return dynamic_cast<QPDF_Stream*>( + obj.getPointer())->addTokenFilter(filter); +} + QPDFObjectHandle QPDFObjectHandle::parse(PointerHolder<InputSource> input, std::string const& object_description, diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 078b1af0..c3a017d0 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -7,6 +7,7 @@ #include <qpdf/QTC.hh> #include <qpdf/QPDFExc.hh> #include <qpdf/QUtil.hh> +#include <qpdf/QPDFObjectHandle.hh> #include <stdexcept> #include <string.h> @@ -39,6 +40,23 @@ QPDFTokenizer::Members::~Members() { } +QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : + type(type), + value(value), + raw_value(value) +{ + if (type == tt_string) + { + raw_value = QPDFObjectHandle::newString(value).unparse(); + } + else if (type == tt_name) + { + raw_value = QPDFObjectHandle::newName(value).unparse(); + } +} + + + QPDFTokenizer::QPDFTokenizer() : m(new Members()) { diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index f7f834b5..f277189a 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@
-1591,7 +1591,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, { is_metadata = true; } - bool filter = (this->m->compress_streams || + bool filter = (object.isDataModified() || + this->m->compress_streams || this->m->stream_decode_level); if (this->m->compress_streams) { @@ -1602,7 +1603,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, // compressed with a lossy compression scheme, but we // don't support any of those right now. QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter"); - if (filter_obj.isName() && + if ((! object.isDataModified()) && + filter_obj.isName() && ((filter_obj.getName() == "/FlateDecode") || (filter_obj.getName() == "/Fl"))) { diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc index 89b6b5a9..a026f9a4 100644 --- a/libqpdf/QPDF_Stream.cc +++ b/libqpdf/QPDF_Stream.cc @@ -13,7 +13,7 @@ #include <qpdf/Pl_RunLength.hh> #include <qpdf/Pl_DCT.hh> #include <qpdf/Pl_Count.hh> - +#include <qpdf/ContentNormalizer.hh> #include <qpdf/QTC.hh> #include <qpdf/QPDF.hh> #include <qpdf/QPDFExc.hh> @@ -91,6 +91,12 @@ QPDF_Stream::getDict() const return this->stream_dict; } +bool +QPDF_Stream::isDataModified() const +{ + return (! this->token_filters.empty()); +} + PointerHolder<Buffer> QPDF_Stream::getStreamData(qpdf_stream_decode_level_e decode_level) { @@ -440,21 +446,36 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, // create to be deleted when this function finishes. 
std::vector<PointerHolder<Pipeline> > to_delete; + PointerHolder<ContentNormalizer> normalizer; if (filter) { if (encode_flags & qpdf_ef_compress) { - pipeline = new Pl_Flate("compress object stream", pipeline, + pipeline = new Pl_Flate("compress stream", pipeline, Pl_Flate::a_deflate); to_delete.push_back(pipeline); } if (encode_flags & qpdf_ef_normalize) { - pipeline = new Pl_QPDFTokenizer("normalizer", pipeline); + normalizer = new ContentNormalizer(); + normalizer->setPipeline(pipeline); + pipeline = new Pl_QPDFTokenizer( + "normalizer", normalizer.getPointer()); to_delete.push_back(pipeline); } + for (std::vector<PointerHolder< + QPDFObjectHandle::TokenFilter> >::reverse_iterator iter = + this->token_filters.rbegin(); + iter != this->token_filters.rend(); ++iter) + { + (*iter)->setPipeline(pipeline); + pipeline = new Pl_QPDFTokenizer( + "token filter", (*iter).getPointer()); + to_delete.push_back(pipeline); + } + for (std::vector<std::string>::reverse_iterator iter = filters.rbegin(); iter != filters.rend(); ++iter) { @@ -613,6 +634,13 @@ QPDF_Stream::replaceStreamData( } void +QPDF_Stream::addTokenFilter( + PointerHolder<QPDFObjectHandle::TokenFilter> token_filter) +{ + this->token_filters.push_back(token_filter); +} + +void QPDF_Stream::replaceFilterData(QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms, size_t length) diff --git a/libqpdf/build.mk b/libqpdf/build.mk index c75c3dd9..11895623 100644 --- a/libqpdf/build.mk +++ b/libqpdf/build.mk @@ -9,6 +9,7 @@ SRCS_libqpdf = \ libqpdf/BitWriter.cc \ libqpdf/Buffer.cc \ libqpdf/BufferInputSource.cc \ + libqpdf/ContentNormalizer.cc \ libqpdf/FileInputSource.cc \ libqpdf/InputSource.cc \ libqpdf/InsecureRandomDataProvider.cc \ diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh new file mode 100644 index 00000000..504f15e8 --- /dev/null +++ b/libqpdf/qpdf/ContentNormalizer.hh @@ -0,0 +1,15 @@ +#ifndef __CONTENTNORMALIZER_HH__ +#define __CONTENTNORMALIZER_HH__ + 
+#include <qpdf/QPDFObjectHandle.hh> + +class ContentNormalizer: public QPDFObjectHandle::TokenFilter +{ + public: + ContentNormalizer(); + virtual ~ContentNormalizer(); + virtual void handleToken(QPDFTokenizer::Token const&); + virtual void handleEOF(); +}; + +#endif // __CONTENTNORMALIZER_HH__ diff --git a/libqpdf/qpdf/Pl_QPDFTokenizer.hh b/libqpdf/qpdf/Pl_QPDFTokenizer.hh index 54507f68..9f4ac133 100644 --- a/libqpdf/qpdf/Pl_QPDFTokenizer.hh +++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh @@ -4,6 +4,8 @@ #include <qpdf/Pipeline.hh> #include <qpdf/QPDFTokenizer.hh> +#include <qpdf/PointerHolder.hh> +#include <qpdf/QPDFObjectHandle.hh> // // Treat incoming text as a stream consisting of valid PDF tokens, but @@ -16,7 +18,8 @@ class Pl_QPDFTokenizer: public Pipeline { public: - Pl_QPDFTokenizer(char const* identifier, Pipeline* next); + Pl_QPDFTokenizer(char const* identifier, + QPDFObjectHandle::TokenFilter* filter); virtual ~Pl_QPDFTokenizer(); virtual void write(unsigned char* buf, size_t len); virtual void finish(); @@ -24,14 +27,25 @@ class Pl_QPDFTokenizer: public Pipeline private: void processChar(char ch); void checkUnread(); - void writeNext(char const*, size_t len); - void writeToken(QPDFTokenizer::Token&); - - QPDFTokenizer tokenizer; - bool just_wrote_nl; - bool last_char_was_cr; - bool unread_char; - char char_to_unread; + + class Members + { + friend class Pl_QPDFTokenizer; + + public: + ~Members(); + + private: + Members(); + Members(Members const&); + + QPDFObjectHandle::TokenFilter* filter; + QPDFTokenizer tokenizer; + bool last_char_was_cr; + bool unread_char; + char char_to_unread; + }; + PointerHolder<Members> m; }; #endif // __PL_QPDFTOKENIZER_HH__ diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh index 5350fc0d..86b796cf 100644 --- a/libqpdf/qpdf/QPDF_Stream.hh +++ b/libqpdf/qpdf/QPDF_Stream.hh @@ -20,6 +20,7 @@ class QPDF_Stream: public QPDFObject virtual QPDFObject::object_type_e getTypeCode() const; virtual char const* 
getTypeName() const; QPDFObjectHandle getDict() const; + bool isDataModified() const; // See comments in QPDFObjectHandle.hh for these methods. bool pipeStreamData(Pipeline*, @@ -35,6 +36,8 @@ class QPDF_Stream: public QPDFObject PointerHolder<QPDFObjectHandle::StreamDataProvider> provider, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms); + void addTokenFilter( + PointerHolder<QPDFObjectHandle::TokenFilter> token_filter); void replaceDict(QPDFObjectHandle new_dict); @@ -72,6 +75,8 @@ class QPDF_Stream: public QPDFObject size_t length; PointerHolder<Buffer> stream_data; PointerHolder<QPDFObjectHandle::StreamDataProvider> stream_provider; + std::vector< + PointerHolder<QPDFObjectHandle::TokenFilter> > token_filters; }; #endif // __QPDF_STREAM_HH__ |