aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2019-01-30 20:26:08 +0100
committer Jay Berkenbilt <ejb@ql.org> 2019-01-31 15:26:37 +0100
commit 2b6c79bcaeee0548f3d7291876eb3821e14b8227 (patch)
tree 5152859c6f94227d0bc2edb6b59250a24d4b6603
parent ec9e310c9ea9cee8d9e16cad2a68f0ad096f3a4b (diff)
download qpdf-2b6c79bcaeee0548f3d7291876eb3821e14b8227.tar.zst
Improve locating inline image's EI
We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection.
-rw-r--r-- ChangeLog 12
-rw-r--r-- include/qpdf/QPDFTokenizer.hh 1
-rw-r--r-- libqpdf/QPDFTokenizer.cc 134
-rw-r--r-- qpdf/qpdf.testcov 3
-rw-r--r-- qpdf/qtest/qpdf.test 20
-rw-r--r-- qpdf/qtest/qpdf/large-inline-image.pdf bin 0 -> 1744913 bytes
-rw-r--r-- qpdf/qtest/qpdf/large-inline-image.qdf bin 0 -> 2140781 bytes
7 files changed, 155 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index 44396f55..cbe9357e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2019-01-30 Jay Berkenbilt <ejb@ql.org>
+
+ * Improve locating of an inline image's EI operator to correctly
+ handle the case of EI appearing inside the image data.
+
+ * Very low-level QPDFTokenizer API now includes an
+ expectInlineImage method that takes an input stream, enabling it
+ to locate an inline image's EI operator better. This is called
+ automatically everywhere within the qpdf library. Most user code
+ will never have to use the low-level tokenizer API. If you use
+ Pl_QPDFTokenizer, this will be done automatically for you.
+
2019-01-29 Jay Berkenbilt <ejb@ql.org>
* Bug fix: when returning an inline image token, the tokenizer no
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index 31f2f398..424ac099 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -198,6 +198,7 @@ class QPDFTokenizer
void resolveLiteral();
bool isSpace(char);
bool isDelimiter(char);
+ void findEI(PointerHolder<InputSource> input);
enum state_e {
st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index e03f927b..2671fcbb 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -47,7 +47,7 @@ QPDFWordTokenFinder::check()
qpdf_offset_t pos = is->tell();
if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
{
-/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+ QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
return false;
}
qpdf_offset_t token_start = is->getLastOffset();
@@ -65,7 +65,6 @@ QPDFWordTokenFinder::check()
is->seek(pos, SEEK_SET);
if (! next_okay)
{
-/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
return false;
}
if (token_start == 0)
@@ -80,7 +79,7 @@ QPDFWordTokenFinder::check()
is->seek(pos, SEEK_SET);
if (! prev_okay)
{
-/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
+ QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
return false;
}
return true;
@@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage()
void
QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
{
- if (input.getPointer())
+ if (this->m->state != st_top)
+ {
+ throw std::logic_error("QPDFTokenizer::expectInlineImage called"
+ " when tokenizer is in improper state");
+ }
+ findEI(input);
+ this->m->state = st_inline_image;
+}
+
+void
+QPDFTokenizer::findEI(PointerHolder<InputSource> input)
+{
+ if (! input.getPointer())
{
- qpdf_offset_t last_offset = input->getLastOffset();
- qpdf_offset_t pos = input->tell();
+ return;
+ }
+
+ qpdf_offset_t last_offset = input->getLastOffset();
+ qpdf_offset_t pos = input->tell();
+ // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
+ // Then read the next several tokens or up to EOF. If we find any
+ // suspicious-looking tokens, this is probably still part of
+ // the image data, so keep looking for EI. Stop at the first EI
+ // that passes. If we get to the end without finding one, return
+ // the last EI we found. Store the number of bytes expected in the
+ // inline image including the EI and use that to break out of
+ // inline image, falling back to the old method if needed.
+
+ bool okay = false;
+ bool first_try = true;
+ while (! okay)
+ {
QPDFWordTokenFinder f(input, "EI");
- if (input->findFirst("EI", pos, 0, f))
+ if (! input->findFirst("EI", input->tell(), 0, f))
{
- this->m->inline_image_bytes = input->tell() - pos;
+ break;
+ }
+ this->m->inline_image_bytes = input->tell() - pos;
+
+ QPDFTokenizer check;
+ bool found_bad = false;
+ // Look at the next 10 tokens or up to EOF. The next inline
+ // image's image data would look like bad tokens, but there
+ // will always be at least 10 tokens between one inline
+ // image's EI and the next valid one's ID since width, height,
+ // bits per pixel, and color space are all required as well as
+ // a BI and ID. If we get 10 good tokens in a row or hit EOF,
+ // we can be pretty sure we've found the actual EI.
+ for (int i = 0; i < 10; ++i)
+ {
+ QPDFTokenizer::Token t =
+ check.readToken(input, "checker", true);
+ token_type_e type = t.getType();
+ if (type == tt_eof)
+ {
+ okay = true;
+ }
+ else if (type == tt_bad)
+ {
+ found_bad = true;
+ }
+ else if (type == tt_word)
+ {
+ // The qpdf tokenizer lumps alphabetic and otherwise
+ // uncategorized characters into "words". We recognize
+ // strings of alphabetic characters as potential valid
+ // operators for purposes of telling whether we're in
+ // valid content or not. It's not perfect, but it
+ // should work more reliably than what we used to do,
+ // which was already good enough for the vast majority
+ // of files.
+ bool found_alpha = false;
+ bool found_non_printable = false;
+ bool found_other = false;
+ std::string value = t.getValue();
+ for (std::string::iterator iter = value.begin();
+ iter != value.end(); ++iter)
+ {
+ char ch = *iter;
+ if (((ch >= 'a') && (ch <= 'z')) ||
+ ((ch >= 'A') && (ch <= 'Z')) ||
+ (ch == '*'))
+ {
+ // Treat '*' as alpha since there are valid
+ // PDF operators that contain * along with
+ // alphabetic characters.
+ found_alpha = true;
+ }
+ else if (((ch < 32) && (! isSpace(ch))) || (ch > 127))
+ {
+ found_non_printable = true;
+ break;
+ }
+ else
+ {
+ found_other = true;
+ }
+ }
+ if (found_non_printable || (found_alpha && found_other))
+ {
+ found_bad = true;
+ }
+ }
+ if (okay || found_bad)
+ {
+ break;
+ }
+ }
+ if (! found_bad)
+ {
+ okay = true;
+ }
+ if (! okay)
+ {
+ first_try = false;
}
-
- input->seek(pos, SEEK_SET);
- input->setLastOffset(last_offset);
}
- if (this->m->state != st_top)
+ if (okay && (! first_try))
{
- throw std::logic_error("QPDFTokenizer::expectInlineImage called"
- " when tokenizer is in improper state");
+ QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
}
- this->m->state = st_inline_image;
+
+ input->seek(pos, SEEK_SET);
+ input->setLastOffset(last_offset);
}
bool
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 6dcebd6e..21b43dc8 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0
qpdf from_nr from repeat_nr 0
QPDF resolve duplicated page object 0
QPDF handle direct page object 0
+QPDFTokenizer finder found wrong word 0
+QPDFTokenizer finder word not preceded by delimiter 0
QPDFTokenizer found EI the old way 0
QPDFTokenizer found EI by byte count 0
QPDFTokenizer inline image at EOF the old way 0
+QPDFTokenizer found EI after more than one try 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index 6abc7edb..da40d389 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -693,6 +693,26 @@ $td->runtest("check pass1 file",
show_ntests();
# ----------
+$td->notify("--- Inline Images ---");
+$n_tests += 2;
+
+# The file large-inline-image.pdf is a hand-crafted file with several
+# inline images of various sizes including one that is two megabytes,
+# encoded in base85, and has a base85-encoding that contains EI
+# surrounded by delimiters several times. This exercises the EI
+# detection code added in qpdf 8.4.
+
+$td->runtest("complex inline image parsing",
+ {$td->COMMAND =>
+ "qpdf --qdf --static-id large-inline-image.pdf a.pdf"},
+ {$td->STRING => "", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "large-inline-image.qdf"});
+
+show_ntests();
+# ----------
$td->notify("--- Tokenizer ---");
$n_tests += 5;
diff --git a/qpdf/qtest/qpdf/large-inline-image.pdf b/qpdf/qtest/qpdf/large-inline-image.pdf
new file mode 100644
index 00000000..0a47c192
--- /dev/null
+++ b/qpdf/qtest/qpdf/large-inline-image.pdf
Binary files differ
diff --git a/qpdf/qtest/qpdf/large-inline-image.qdf b/qpdf/qtest/qpdf/large-inline-image.qdf
new file mode 100644
index 00000000..a82ea105
--- /dev/null
+++ b/qpdf/qtest/qpdf/large-inline-image.qdf
Binary files differ