2 files changed, 37 insertions, 18 deletions
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
index 28ea4faa..3bd3fb6c 100644
--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -1,6 +1,7 @@
 #include <qpdf/Pl_QPDFTokenizer.hh>
 #include <qpdf/QPDF_String.hh>
 #include <qpdf/QPDF_Name.hh>
+#include <qpdf/QTC.hh>
 #include <stdexcept>
 #include <string.h>
 
@@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
     last_char_was_cr(false),
     unread_char(false),
     char_to_unread('\0'),
-    pass_through(false)
+    in_inline_image(false)
 {
+    memset(this->image_buf, 0, IMAGE_BUF_SIZE);
 }
 
 Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
@@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
 void
 Pl_QPDFTokenizer::processChar(char ch)
 {
-    if (this->pass_through)
+    if (this->in_inline_image)
     {
-	// We're not normalizing anymore -- just write this without
-	// looking at it.
-	writeNext(&ch, 1);
+	// Scan through the input looking for EI surrounded by
+	// whitespace.  If that pattern appears in the inline image's
+	// representation, we're hosed, but this situation seems
+	// excessively unlikely, and this code path is only followed
+	// during content stream normalization, which is pretty much
+	// used for debugging and human inspection of PDF files.
+	memmove(this->image_buf,
+		this->image_buf + 1,
+		IMAGE_BUF_SIZE - 1);
+	this->image_buf[IMAGE_BUF_SIZE - 1] = ch;
+	if (strchr(" \t\n\v\f\r", this->image_buf[0]) &&
+	    (this->image_buf[1] == 'E') &&
+	    (this->image_buf[2] == 'I') &&
+	    strchr(" \t\n\v\f\r", this->image_buf[3]))
+	{
+	    // We've found an EI operator.  We've already written the
+	    // EI operator to output; terminate with a newline
+	    // character and resume normal processing.
+	    writeNext("\n", 1);
+	    this->in_inline_image = false;
+	    QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");
+	}
+	else
+	{
+	    writeNext(&ch, 1);
+	}
 	return;
     }
 
@@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch)
 	    this->newline_after_next_token = false;
 	}
 	if ((token.getType() == QPDFTokenizer::tt_word) &&
-	    (token.getValue() == "BI"))
+	    (token.getValue() == "ID"))
 	{
-	    // Uh oh.... we're not sophisticated enough to handle
-	    // inline images safely.  We'd have to to set up all the
-	    // filters and pipe the image data through it until the
-	    // filtered output was the right size for an image of the
-	    // specified dimensions.  Then we'd either have to write
-	    // out raw image data or continue to write filtered data,
-	    // resuming normalization when we get to the end.
-	    // Instead, for now, we'll just turn off normalization for
-	    // the remainder of this stream.
-	    this->pass_through = true;
+	    // Suspend normal scanning until we find an EI token.
+	    this->in_inline_image = true;
 	    if (this->unread_char)
 	    {
 		writeNext(&this->char_to_unread, 1);
@@ -156,7 +173,7 @@ void
 Pl_QPDFTokenizer::finish()
 {
     this->tokenizer.presentEOF();
-    if (! this->pass_through)
+    if (! this->in_inline_image)
     {
 	QPDFTokenizer::Token token;
 	if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
diff --git a/libqpdf/qpdf/Pl_QPDFTokenizer.hh b/libqpdf/qpdf/Pl_QPDFTokenizer.hh
index d300d7cd..3f816f5d 100644
--- a/libqpdf/qpdf/Pl_QPDFTokenizer.hh
+++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh
@@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline
     bool last_char_was_cr;
     bool unread_char;
     char char_to_unread;
-    bool pass_through;
+    bool in_inline_image;
+    static int const IMAGE_BUF_SIZE = 4; // must be >= 4
+    char image_buf[IMAGE_BUF_SIZE];
 };
 
 #endif // __PL_QPDFTOKENIZER_HH__