diff options
author | Jay Berkenbilt <ejb@ql.org> | 2018-02-03 00:21:34 +0100 |
---|---|---|
committer | Jay Berkenbilt <ejb@ql.org> | 2018-02-19 03:05:46 +0100 |
commit | 99101044429c3c91bd11bdd1b26e5b6c2ceb140b (patch) | |
tree | 5ab366eab31ddf76e80f99bd1d34c421291f1c4e /include | |
parent | b8723e97f4b94fe03e631aab0309382ead3137ed (diff) | |
download | qpdf-99101044429c3c91bd11bdd1b26e5b6c2ceb140b.tar.zst |
Implement TokenFilter and refactor Pl_QPDFTokenizer
Implement a TokenFilter class and refactor Pl_QPDFTokenizer to use a
TokenFilter class called ContentNormalizer. Pl_QPDFTokenizer is now a
general filter that passes data through a TokenFilter.
Diffstat (limited to 'include')
-rw-r--r-- | include/qpdf/QPDFObjectHandle.hh | 87 | ||||
-rw-r--r-- | include/qpdf/QPDFTokenizer.hh | 11 |
2 files changed, 89 insertions, 9 deletions
```diff
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index 14dadd6c..f0b8f2af 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -35,6 +35,7 @@
 #include <qpdf/PointerHolder.hh>
 #include <qpdf/Buffer.hh>
 #include <qpdf/InputSource.hh>
+#include <qpdf/QPDFTokenizer.hh>
 
 #include <qpdf/QPDFObject.hh>
 
@@ -76,6 +77,66 @@ class QPDFObjectHandle
                                        Pipeline* pipeline) = 0;
     };
 
+    // The TokenFilter class provides a way to filter content streams
+    // in a lexically aware fashion. TokenFilters can be attached to
+    // streams using the addTokenFilter or addContentTokenFilter
+    // methods. The handleToken method is called for each token,
+    // including the eof token, and then handleEOF is called at the
+    // very end. Handlers may call write (or writeToken) to pass data
+    // downstream. The finish() method must be called exactly one time
+    // to ensure that any written data is flushed out. The default
+    // handleEOF calls finish. If you override handleEOF, you must
+    // ensure that finish() is called either there or in response to
+    // whatever event causes you to terminate creation of output.
+    // Failure to call finish() may result in some of the data you
+    // have written being lost. You should not rely on a destructor
+    // for calling finish() since the destructor call may occur later
+    // than you expect. Please see examples/token-filters.cc for
+    // examples of using TokenFilters.
+    //
+    // Please note that when you call token.getValue() on a token of
+    // type tt_string, you get the string value without any
+    // delimiters. token.getRawValue() will return something suitable
+    // for being written to output, or calling writeToken with a
+    // string token will also work. The correct way to construct a
+    // string token that would write the literal value (str) is
+    // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str").
+    class TokenFilter
+    {
+      public:
+        QPDF_DLL
+        TokenFilter()
+        {
+        }
+        QPDF_DLL
+        virtual ~TokenFilter()
+        {
+        }
+        virtual void handleToken(QPDFTokenizer::Token const&) = 0;
+        virtual void handleEOF()
+        {
+            // If you override handleEOF, you must be sure to call
+            // finish().
+            finish();
+        }
+
+        // This is called internally by the qpdf library.
+        void setPipeline(Pipeline*);
+
+      protected:
+        QPDF_DLL
+        void write(char const* data, size_t len);
+        QPDF_DLL
+        void write(std::string const& str);
+        QPDF_DLL
+        void writeToken(QPDFTokenizer::Token const&);
+        QPDF_DLL
+        void finish();
+
+      private:
+        Pipeline* pipeline;
+    };
+
     // This class is used by parse to decrypt strings when reading an
     // object that contains encrypted strings.
     class StringDecrypter
@@ -223,6 +284,23 @@ class QPDFObjectHandle
     static void parseContentStream(QPDFObjectHandle stream_or_array,
                                    ParserCallbacks* callbacks);
 
+    // Attach a token filter to a page's contents. If the page's
+    // contents is an array of streams, it is automatically coalesced.
+    // The token filter is applied to the page's contents as a single
+    // stream.
+    QPDF_DLL
+    void addContentTokenFilter(PointerHolder<TokenFilter> token_filter);
+
+    // As of qpdf 8, it is possible to add custom token filters to a
+    // stream. The tokenized stream data is passed through the token
+    // filter after all original filters but before content stream
+    // normalization if requested. This is a low-level interface to
+    // add it to a stream. You will usually want to call
+    // addContentTokenFilter instead, which can be applied to a page
+    // object, and which will automatically handle the case of pages
+    // whose contents are split across multiple streams.
+    void addTokenFilter(PointerHolder<TokenFilter> token_filter);
+
     // Type-specific factories
     QPDF_DLL
     static QPDFObjectHandle newNull();
@@ -414,6 +492,13 @@ class QPDFObjectHandle
     QPDF_DLL
     QPDFObjectHandle getDict();
 
+    // If addTokenFilter has been called for this stream, then the
+    // original data should be considered to be modified. This means we
+    // should avoid optimizations such as not filtering a stream that
+    // is already compressed.
+    QPDF_DLL
+    bool isDataModified();
+
     // Returns filtered (uncompressed) stream data. Throws an
     // exception if the stream is filtered and we can't decode it.
     QPDF_DLL
@@ -608,7 +693,7 @@ class QPDFObjectHandle
     // stream or an array of streams. If this page's content is an
     // array, concatenate the streams into a single stream. This can
     // be useful when working with files that split content streams in
-    // arbitary spots, such as in the middle of a token, as that can
+    // arbitrary spots, such as in the middle of a token, as that can
     // confuse some software. You could also call this after calling
     // addPageContents.
     QPDF_DLL
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index fe2e95f7..eb9215aa 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -62,13 +62,8 @@ class QPDFTokenizer
     {
       public:
         Token() : type(tt_bad) {}
-
-        Token(token_type_e type, std::string const& value) :
-            type(type),
-            value(value)
-        {
-        }
-
+        QPDF_DLL
+        Token(token_type_e type, std::string const& value);
         Token(token_type_e type, std::string const& value,
               std::string raw_value, std::string error_message) :
             type(type),
@@ -93,7 +88,7 @@ class QPDFTokenizer
         {
             return this->error_message;
         }
-        bool operator==(Token const& rhs)
+        bool operator==(Token const& rhs) const
         {
             // Ignore fields other than type and value
             return ((this->type != tt_bad) &&
```