diff options
Diffstat (limited to 'libqpdf/QPDFTokenizer.cc')
-rw-r--r-- | libqpdf/QPDFTokenizer.cc | 246 |
1 files changed, 230 insertions, 16 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 9c2a1e05..80fcf347 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -13,6 +13,69 @@ #include <string.h> #include <cstdlib> +static bool is_delimiter(char ch) +{ + return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0); +} + +class QPDFWordTokenFinder: public InputSource::Finder +{ + public: + QPDFWordTokenFinder(PointerHolder<InputSource> is, + std::string const& str) : + is(is), + str(str) + { + } + virtual ~QPDFWordTokenFinder() + { + } + virtual bool check(); + + private: + PointerHolder<InputSource> is; + std::string str; +}; + +bool +QPDFWordTokenFinder::check() +{ + // Find a word token matching the given string, preceded by a + // delimiter, and followed by a delimiter or EOF. + QPDFTokenizer tokenizer; + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); + qpdf_offset_t pos = is->tell(); + if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) + { + QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); + return false; + } + qpdf_offset_t token_start = is->getLastOffset(); + char next; + bool next_okay = false; + if (is->read(&next, 1) == 0) + { + QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); + next_okay = true; + } + else + { + next_okay = is_delimiter(next); + } + is->seek(pos, SEEK_SET); + if (! next_okay) + { + return false; + } + if (token_start == 0) + { + // Can't actually happen...we never start the search at the + // beginning of the input. + return false; + } + return true; +} + QPDFTokenizer::Members::Members() : pound_special_in_name(true), allow_eof(false), @@ -31,9 +94,11 @@ QPDFTokenizer::Members::reset() error_message = ""; unread_char = false; char_to_unread = '\0'; + inline_image_bytes = 0; string_depth = 0; string_ignoring_newline = false; last_char_was_bs = false; + last_char_was_cr = false; } QPDFTokenizer::Members::~Members() @@ -90,7 +155,7 @@ QPDFTokenizer::isSpace(char ch) bool QPDFTokenizer::isDelimiter(char ch) { - return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0); + return is_delimiter(ch); } void @@ -217,6 +282,7 @@ QPDFTokenizer::presentCharacter(char ch) memset(this->m->bs_num_register, '\0', sizeof(this->m->bs_num_register)); this->m->last_char_was_bs = false; + this->m->last_char_was_cr = false; this->m->state = st_in_string; } else if (ch == '<') @@ -334,8 +400,7 @@ QPDFTokenizer::presentCharacter(char ch) } else if (this->m->state == st_in_string) { - if (this->m->string_ignoring_newline && - (! ((ch == '\r') || (ch == '\n')))) + if (this->m->string_ignoring_newline && (ch != '\n')) { this->m->string_ignoring_newline = false; } @@ -353,9 +418,10 @@ QPDFTokenizer::presentCharacter(char ch) bs_num_count = 0; } - if (this->m->string_ignoring_newline && ((ch == '\r') || (ch == '\n'))) + if (this->m->string_ignoring_newline && (ch == '\n')) { // ignore + this->m->string_ignoring_newline = false; } else if (ch_is_octal && (this->m->last_char_was_bs || (bs_num_count > 0))) @@ -386,8 +452,10 @@ QPDFTokenizer::presentCharacter(char ch) this->m->val += '\f'; break; - case '\r': case '\n': + break; + + case '\r': this->m->string_ignoring_newline = true; break; @@ -417,11 +485,26 @@ QPDFTokenizer::presentCharacter(char ch) this->m->type = tt_string; this->m->state = st_token_ready; } + else if (ch == '\r') + { + // CR by itself is converted to LF + this->m->val += '\n'; + } + else if (ch == '\n') + { + // CR LF is converted to LF + if (! this->m->last_char_was_cr) + { + this->m->val += ch; + } + } else { this->m->val += ch; } + this->m->last_char_was_cr = + ((! this->m->string_ignoring_newline) && (ch == '\r')); this->m->last_char_was_bs = ((! this->m->last_char_was_bs) && (ch == '\\')); } @@ -449,21 +532,28 @@ QPDFTokenizer::presentCharacter(char ch) } else if (this->m->state == st_inline_image) { + this->m->val += ch; size_t len = this->m->val.length(); - if ((len >= 4) && - isDelimiter(this->m->val.at(len-4)) && - (this->m->val.at(len-3) == 'E') && - (this->m->val.at(len-2) == 'I') && - isDelimiter(this->m->val.at(len-1))) + if (len == this->m->inline_image_bytes) { + QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); this->m->type = tt_inline_image; - this->m->unread_char = true; - this->m->char_to_unread = ch; + this->m->inline_image_bytes = 0; this->m->state = st_token_ready; } - else + else if ((this->m->inline_image_bytes == 0) && + (len >= 4) && + isDelimiter(this->m->val.at(len-4)) && + (this->m->val.at(len-3) == 'E') && + (this->m->val.at(len-2) == 'I') && + isDelimiter(this->m->val.at(len-1))) { - this->m->val += ch; + QTC::TC("qpdf", "QPDFTokenizer found EI the old way"); + this->m->val.erase(len - 1); + this->m->type = tt_inline_image; + this->m->unread_char = true; + this->m->char_to_unread = ch; + this->m->state = st_token_ready; } } else @@ -471,7 +561,6 @@ QPDFTokenizer::presentCharacter(char ch) handled = false; } - if (handled) { // okay @@ -546,7 +635,7 @@ QPDFTokenizer::presentEOF() (this->m->val.at(len-2) == 'E') && (this->m->val.at(len-1) == 'I')) { - QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); + QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way"); this->m->type = tt_inline_image; this->m->state = st_token_ready; } @@ -582,14 +671,139 @@ QPDFTokenizer::presentEOF() void QPDFTokenizer::expectInlineImage() { + expectInlineImage(PointerHolder<InputSource>()); +} + +void +QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input) +{ if (this->m->state != st_top) { throw std::logic_error("QPDFTokenizer::expectInlineImage called" " when tokenizer is in improper state"); } + findEI(input); this->m->state = st_inline_image; } +void +QPDFTokenizer::findEI(PointerHolder<InputSource> input) +{ + if (! input.getPointer()) + { + return; + } + + qpdf_offset_t last_offset = input->getLastOffset(); + qpdf_offset_t pos = input->tell(); + + // Use QPDFWordTokenFinder to find EI surrounded by delimiters. + // Then read the next several tokens or up to EOF. If we find any + // suspicious-looking or tokens, this is probably still part of + // the image data, so keep looking for EI. Stop at the first EI + // that passes. If we get to the end without finding one, return + // the last EI we found. Store the number of bytes expected in the + // inline image including the EI and use that to break out of + // inline image, falling back to the old method if needed. + + bool okay = false; + bool first_try = true; + while (! okay) + { + QPDFWordTokenFinder f(input, "EI"); + if (! input->findFirst("EI", input->tell(), 0, f)) + { + break; + } + this->m->inline_image_bytes = input->tell() - pos - 2; + + QPDFTokenizer check; + bool found_bad = false; + // Look at the next 10 tokens or up to EOF. The next inline + // image's image data would look like bad tokens, but there + // will always be at least 10 tokens between one inline + // image's EI and the next valid one's ID since width, height, + // bits per pixel, and color space are all required as well as + // a BI and ID. If we get 10 good tokens in a row or hit EOF, + // we can be pretty sure we've found the actual EI. + for (int i = 0; i < 10; ++i) + { + QPDFTokenizer::Token t = + check.readToken(input, "checker", true); + token_type_e type = t.getType(); + if (type == tt_eof) + { + okay = true; + } + else if (type == tt_bad) + { + found_bad = true; + } + else if (type == tt_word) + { + // The qpdf tokenizer lumps alphabetic and otherwise + // uncategorized characters into "words". We recognize + // strings of alphabetic characters as potential valid + // operators for purposes of telling whether we're in + // valid content or not. It's not perfect, but it + // should work more reliably than what we used to do, + // which was already good enough for the vast majority + // of files. + bool found_alpha = false; + bool found_non_printable = false; + bool found_other = false; + std::string value = t.getValue(); + for (std::string::iterator iter = value.begin(); + iter != value.end(); ++iter) + { + char ch = *iter; + if (((ch >= 'a') && (ch <= 'z')) || + ((ch >= 'A') && (ch <= 'Z')) || + (ch == '*')) + { + // Treat '*' as alpha since there are valid + // PDF operators that contain * along with + // alphabetic characters. + found_alpha = true; + } + else if (((ch < 32) && (! isSpace(ch))) || (ch > 127)) + { + found_non_printable = true; + break; + } + else + { + found_other = true; + } + } + if (found_non_printable || (found_alpha && found_other)) + { + found_bad = true; + } + } + if (okay || found_bad) + { + break; + } + } + if (! found_bad) + { + okay = true; + } + if (! okay) + { + first_try = false; + } + } + if (okay && (! first_try)) + { + QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); + } + + input->seek(pos, SEEK_SET); + input->setLastOffset(last_offset); +} + bool QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) { |