diff options
-rw-r--r-- | ChangeLog | 12 | ||||
-rw-r--r-- | include/qpdf/QPDFTokenizer.hh | 1 | ||||
-rw-r--r-- | libqpdf/QPDFTokenizer.cc | 134 | ||||
-rw-r--r-- | qpdf/qpdf.testcov | 3 | ||||
-rw-r--r-- | qpdf/qtest/qpdf.test | 20 | ||||
-rw-r--r-- | qpdf/qtest/qpdf/large-inline-image.pdf | bin | 0 -> 1744913 bytes | |||
-rw-r--r-- | qpdf/qtest/qpdf/large-inline-image.qdf | bin | 0 -> 2140781 bytes |
7 files changed, 155 insertions, 15 deletions
@@ -1,3 +1,15 @@ +2019-01-30 Jay Berkenbilt <ejb@ql.org> + + * Improve locating of an inline image's EI operator to correctly + handle the case of EI appearing inside the image data. + + * Very low-level QPDFTokenizer API now includes an + expectInlineImage method that takes an input stream, enabling it + to locate an inline image's EI operator better. This is called + automatically everywhere within the qpdf library. Most user code + will never have to use the low-level tokenizer API. If you use + Pl_QPDFTokenizer, this will be done automatically for you. + 2019-01-29 Jay Berkenbilt <ejb@ql.org> * Bug fix: when returning an inline image token, the tokenizer no diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index 31f2f398..424ac099 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -198,6 +198,7 @@ class QPDFTokenizer void resolveLiteral(); bool isSpace(char); bool isDelimiter(char); + void findEI(PointerHolder<InputSource> input); enum state_e { st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index e03f927b..2671fcbb 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -47,7 +47,7 @@ QPDFWordTokenFinder::check() qpdf_offset_t pos = is->tell(); if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { -/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); + QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); return false; } qpdf_offset_t token_start = is->getLastOffset(); @@ -65,7 +65,6 @@ QPDFWordTokenFinder::check() is->seek(pos, SEEK_SET); if (! next_okay) { -/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter"); return false; } if (token_start == 0) @@ -80,7 +79,7 @@ QPDFWordTokenFinder::check() is->seek(pos, SEEK_SET); if (! 
prev_okay) { -/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); + QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); return false; } return true; @@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage() void QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input) { - if (input.getPointer()) + if (this->m->state != st_top) + { + throw std::logic_error("QPDFTokenizer::expectInlineImage called" + " when tokenizer is in improper state"); + } + findEI(input); + this->m->state = st_inline_image; +} + +void +QPDFTokenizer::findEI(PointerHolder<InputSource> input) +{ + if (! input.getPointer()) { - qpdf_offset_t last_offset = input->getLastOffset(); - qpdf_offset_t pos = input->tell(); + return; + } + + qpdf_offset_t last_offset = input->getLastOffset(); + qpdf_offset_t pos = input->tell(); + // Use QPDFWordTokenFinder to find EI surrounded by delimiters. + // Then read the next several tokens or up to EOF. If we find any + // suspicious-looking or bad tokens, this is probably still part of + // the image data, so keep looking for EI. Stop at the first EI + // that passes. If we get to the end without finding one, return + // the last EI we found. Store the number of bytes expected in the + // inline image including the EI and use that to break out of + // inline image, falling back to the old method if needed. + + bool okay = false; + bool first_try = true; + while (! okay) + { QPDFWordTokenFinder f(input, "EI"); - if (input->findFirst("EI", pos, 0, f)) + if (! input->findFirst("EI", input->tell(), 0, f)) { - this->m->inline_image_bytes = input->tell() - pos; + break; + } + this->m->inline_image_bytes = input->tell() - pos; + + QPDFTokenizer check; + bool found_bad = false; + // Look at the next 10 tokens or up to EOF. 
The next inline + // image's image data would look like bad tokens, but there + // will always be at least 10 tokens between one inline + // image's EI and the next valid one's ID since width, height, + // bits per pixel, and color space are all required as well as + // a BI and ID. If we get 10 good tokens in a row or hit EOF, + // we can be pretty sure we've found the actual EI. + for (int i = 0; i < 10; ++i) + { + QPDFTokenizer::Token t = + check.readToken(input, "checker", true); + token_type_e type = t.getType(); + if (type == tt_eof) + { + okay = true; + } + else if (type == tt_bad) + { + found_bad = true; + } + else if (type == tt_word) + { + // The qpdf tokenizer lumps alphabetic and otherwise + // uncategorized characters into "words". We recognize + // strings of alphabetic characters as potential valid + // operators for purposes of telling whether we're in + // valid content or not. It's not perfect, but it + // should work more reliably than what we used to do, + // which was already good enough for the vast majority + // of files. + bool found_alpha = false; + bool found_non_printable = false; + bool found_other = false; + std::string value = t.getValue(); + for (std::string::iterator iter = value.begin(); + iter != value.end(); ++iter) + { + char ch = *iter; + if (((ch >= 'a') && (ch <= 'z')) || + ((ch >= 'A') && (ch <= 'Z')) || + (ch == '*')) + { + // Treat '*' as alpha since there are valid + // PDF operators that contain * along with + // alphabetic characters. + found_alpha = true; + } + else if (((ch < 32) && (! isSpace(ch))) || (ch > 127)) + { + found_non_printable = true; + break; + } + else + { + found_other = true; + } + } + if (found_non_printable || (found_alpha && found_other)) + { + found_bad = true; + } + } + if (okay || found_bad) + { + break; + } + } + if (! found_bad) + { + okay = true; + } + if (! 
okay) + { + first_try = false; } - - input->seek(pos, SEEK_SET); - input->setLastOffset(last_offset); } - if (this->m->state != st_top) + if (okay && (! first_try)) { - throw std::logic_error("QPDFTokenizer::expectInlineImage called" - " when tokenizer is in improper state"); + QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); } - this->m->state = st_inline_image; + + input->seek(pos, SEEK_SET); + input->setLastOffset(last_offset); } bool diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 6dcebd6e..21b43dc8 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0 qpdf from_nr from repeat_nr 0 QPDF resolve duplicated page object 0 QPDF handle direct page object 0 +QPDFTokenizer finder found wrong word 0 +QPDFTokenizer finder word not preceded by delimiter 0 QPDFTokenizer found EI the old way 0 QPDFTokenizer found EI by byte count 0 QPDFTokenizer inline image at EOF the old way 0 +QPDFTokenizer found EI after more than one try 0 diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index 6abc7edb..da40d389 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -693,6 +693,26 @@ $td->runtest("check pass1 file", show_ntests(); # ---------- +$td->notify("--- Inline Images ---"); +$n_tests += 2; + +# The file large-inline-image.pdf is a hand-crafted file with several +# inline images of various sizes including one that is two megabytes, +# encoded in base85, and has a base85-encoding that contains EI +# surrounded by delimiters several times. This exercises the EI +# detection code added in qpdf 8.4. 
+ +$td->runtest("complex inline image parsing", + {$td->COMMAND => + "qpdf --qdf --static-id large-inline-image.pdf a.pdf"}, + {$td->STRING => "", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); +$td->runtest("check output", + {$td->FILE => "a.pdf"}, + {$td->FILE => "large-inline-image.qdf"}); + +show_ntests(); +# ---------- $td->notify("--- Tokenizer ---"); $n_tests += 5; diff --git a/qpdf/qtest/qpdf/large-inline-image.pdf b/qpdf/qtest/qpdf/large-inline-image.pdf Binary files differ new file mode 100644 index 00000000..0a47c192 --- /dev/null +++ b/qpdf/qtest/qpdf/large-inline-image.pdf diff --git a/qpdf/qtest/qpdf/large-inline-image.qdf b/qpdf/qtest/qpdf/large-inline-image.qdf Binary files differ new file mode 100644 index 00000000..a82ea105 --- /dev/null +++ b/qpdf/qtest/qpdf/large-inline-image.qdf |