diff options
Diffstat (limited to 'libqpdf/Pl_QPDFTokenizer.cc')
-rw-r--r-- | libqpdf/Pl_QPDFTokenizer.cc | 179 |
1 files changed, 179 insertions, 0 deletions
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc new file mode 100644 index 00000000..63f0caaf --- /dev/null +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -0,0 +1,179 @@ + +#include <qpdf/Pl_QPDFTokenizer.hh> +#include <qpdf/QPDF_String.hh> +#include <qpdf/QPDF_Name.hh> + +Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : + Pipeline(identifier, next), + newline_after_next_token(false), + just_wrote_nl(false), + last_char_was_cr(false), + unread_char(false), + char_to_unread('\0'), + pass_through(false) +{ +} + +Pl_QPDFTokenizer::~Pl_QPDFTokenizer() +{ +} + +void +Pl_QPDFTokenizer::writeNext(char const* buf, int len) +{ + if (len) + { + unsigned char* t = new unsigned char[len]; + memcpy(t, buf, len); + getNext()->write(t, len); + delete [] t; + this->just_wrote_nl = (buf[len-1] == '\n'); + } +} + +void +Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) +{ + std::string value = token.getRawValue(); + + switch (token.getType()) + { + case QPDFTokenizer::tt_string: + value = QPDF_String(token.getValue()).unparse(); + break; + + case QPDFTokenizer::tt_name: + value = QPDF_Name(token.getValue()).unparse(); + break; + + default: + break; + } + writeNext(value.c_str(), value.length()); +} + +void +Pl_QPDFTokenizer::processChar(char ch) +{ + if (this->pass_through) + { + // We're not noramlizing anymore -- just write this without + // looking at it. + writeNext(&ch, 1); + return; + } + + tokenizer.presentCharacter(ch); + QPDFTokenizer::Token token; + if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + { + writeToken(token); + if (this->newline_after_next_token) + { + writeNext("\n", 1); + this->newline_after_next_token = false; + } + if ((token.getType() == QPDFTokenizer::tt_word) && + (token.getValue() == "BI")) + { + // Uh oh.... we're not sophisticated enough to handle + // inline images safely. We'd have to to set up all the + // filters and pipe the iamge data through it until the + // filtered output was the right size for an image of the + // specified dimensions. Then we'd either have to write + // out raw image data or continue to write filtered data, + // resuming normalization when we get to the end. + // Insetad, for now, we'll just turn off noramlization for + // the remainder of this stream. + this->pass_through = true; + if (this->unread_char) + { + writeNext(&this->char_to_unread, 1); + this->unread_char = false; + } + } + } + else + { + bool suppress = false; + if ((ch == '\n') && (this->last_char_was_cr)) + { + // Always ignore \n following \r + suppress = true; + } + + if ((this->last_char_was_cr = (ch == '\r'))) + { + ch = '\n'; + } + + if (this->tokenizer.betweenTokens()) + { + if (! suppress) + { + writeNext(&ch, 1); + } + } + else + { + if (ch == '\n') + { + this->newline_after_next_token = true; + } + } + } +} + + +void +Pl_QPDFTokenizer::checkUnread() +{ + if (this->unread_char) + { + processChar(this->char_to_unread); + if (this->unread_char) + { + throw QEXC::Internal("unread_char still true after processing " + "unread character"); + } + } +} + +void +Pl_QPDFTokenizer::write(unsigned char* buf, int len) +{ + checkUnread(); + for (int i = 0; i < len; ++i) + { + processChar(buf[i]); + checkUnread(); + } +} + +void +Pl_QPDFTokenizer::finish() +{ + this->tokenizer.presentEOF(); + if (! this->pass_through) + { + QPDFTokenizer::Token token; + if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + { + writeToken(token); + if (unread_char) + { + if (this->char_to_unread == '\r') + { + this->char_to_unread = '\n'; + } + writeNext(&this->char_to_unread, 1); + } + } + } + if (! this->just_wrote_nl) + { + writeNext("\n", 1); + } + + getNext()->finish(); +} |