From 2b6c79bcaeee0548f3d7291876eb3821e14b8227 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Wed, 30 Jan 2019 14:26:08 -0500 Subject: Improve locating inline image's EI We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection. --- libqpdf/QPDFTokenizer.cc | 134 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 119 insertions(+), 15 deletions(-) (limited to 'libqpdf/QPDFTokenizer.cc') diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index e03f927b..2671fcbb 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -47,7 +47,7 @@ QPDFWordTokenFinder::check() qpdf_offset_t pos = is->tell(); if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { -/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); + QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); return false; } qpdf_offset_t token_start = is->getLastOffset(); @@ -65,7 +65,6 @@ QPDFWordTokenFinder::check() is->seek(pos, SEEK_SET); if (! next_okay) { -/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter"); return false; } if (token_start == 0) @@ -80,7 +79,7 @@ QPDFWordTokenFinder::check() is->seek(pos, SEEK_SET); if (! prev_okay) { -/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); + QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); return false; } return true; @@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage() void QPDFTokenizer::expectInlineImage(PointerHolder input) { - if (input.getPointer()) + if (this->m->state != st_top) + { + throw std::logic_error("QPDFTokenizer::expectInlineImage called" + " when tokenizer is in improper state"); + } + findEI(input); + this->m->state = st_inline_image; +} + +void +QPDFTokenizer::findEI(PointerHolder input) +{ + if (! 
input.getPointer()) { - qpdf_offset_t last_offset = input->getLastOffset(); - qpdf_offset_t pos = input->tell(); + return; + } + + qpdf_offset_t last_offset = input->getLastOffset(); + qpdf_offset_t pos = input->tell(); + // Use QPDFWordTokenFinder to find EI surrounded by delimiters. + // Then read the next several tokens or up to EOF. If we find any + // suspicious-looking or bad tokens, this is probably still part of + // the image data, so keep looking for EI. Stop at the first EI + // that passes. If we get to the end without finding one, return + // the last EI we found. Store the number of bytes expected in the + // inline image including the EI and use that to break out of + // inline image, falling back to the old method if needed. + + bool okay = false; + bool first_try = true; + while (! okay) + { QPDFWordTokenFinder f(input, "EI"); - if (input->findFirst("EI", pos, 0, f)) + if (! input->findFirst("EI", input->tell(), 0, f)) { - this->m->inline_image_bytes = input->tell() - pos; + break; + } + this->m->inline_image_bytes = input->tell() - pos; + + QPDFTokenizer check; + bool found_bad = false; + // Look at the next 10 tokens or up to EOF. The next inline + // image's image data would look like bad tokens, but there + // will always be at least 10 tokens between one inline + // image's EI and the next valid one's ID since width, height, + // bits per pixel, and color space are all required as well as + // a BI and ID. If we get 10 good tokens in a row or hit EOF, + // we can be pretty sure we've found the actual EI. + for (int i = 0; i < 10; ++i) + { + QPDFTokenizer::Token t = + check.readToken(input, "checker", true); + token_type_e type = t.getType(); + if (type == tt_eof) + { + okay = true; + } + else if (type == tt_bad) + { + found_bad = true; + } + else if (type == tt_word) + { + // The qpdf tokenizer lumps alphabetic and otherwise + // uncategorized characters into "words". 
We recognize + // strings of alphabetic characters as potential valid + // operators for purposes of telling whether we're in + // valid content or not. It's not perfect, but it + // should work more reliably than what we used to do, + // which was already good enough for the vast majority + // of files. + bool found_alpha = false; + bool found_non_printable = false; + bool found_other = false; + std::string value = t.getValue(); + for (std::string::iterator iter = value.begin(); + iter != value.end(); ++iter) + { + char ch = *iter; + if (((ch >= 'a') && (ch <= 'z')) || + ((ch >= 'A') && (ch <= 'Z')) || + (ch == '*')) + { + // Treat '*' as alpha since there are valid + // PDF operators that contain * along with + // alphabetic characters. + found_alpha = true; + } + else if (((ch < 32) && (! isSpace(ch))) || (ch > 127)) + { + found_non_printable = true; + break; + } + else + { + found_other = true; + } + } + if (found_non_printable || (found_alpha && found_other)) + { + found_bad = true; + } + } + if (okay || found_bad) + { + break; + } + } + if (! found_bad) + { + okay = true; + } + if (! okay) + { + first_try = false; } - - input->seek(pos, SEEK_SET); - input->setLastOffset(last_offset); } - if (this->m->state != st_top) + if (okay && (! first_try)) { - throw std::logic_error("QPDFTokenizer::expectInlineImage called" - " when tokenizer is in improper state"); + QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); } - this->m->state = st_inline_image; + + input->seek(pos, SEEK_SET); + input->setLastOffset(last_offset); } bool -- cgit v1.2.3-54-g00ecf