1 files changed, 230 insertions, 16 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 9c2a1e05..80fcf347 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -13,6 +13,69 @@
 #include <string.h>
 #include <cstdlib>
 
+static bool is_delimiter(char ch)  // file-local helper; QPDFTokenizer::isDelimiter forwards to it
+{
+    return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);  // also true for ch == '\0': strchr matches the terminator
+}
+
+class QPDFWordTokenFinder: public InputSource::Finder  // Finder whose check() accepts one specific word token
+{
+  public:
+    QPDFWordTokenFinder(PointerHolder<InputSource> is,
+                        std::string const& str) :
+        is(is),
+        str(str)
+    {
+    }
+    virtual ~QPDFWordTokenFinder()
+    {
+    }
+    virtual bool check();  // true when the token read from is is the word str (see the definition)
+
+  private:
+    PointerHolder<InputSource> is;  // input that check() tokenizes and repositions
+    std::string str;  // word that check() compares the token against
+};
+
+bool
+QPDFWordTokenFinder::check()
+{
+    // Accept a word token equal to str that is followed by a delimiter
+    // or EOF. NOTE(review): the byte before the token is never examined.
+    QPDFTokenizer tokenizer;
+    QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
+    qpdf_offset_t pos = is->tell();  // position after readToken; restored after the lookahead
+    if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
+    {
+        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+        return false;
+    }
+    qpdf_offset_t token_start = is->getLastOffset();
+    char next;
+    bool next_okay = false;
+    if (is->read(&next, 1) == 0)  // nothing left to read: treated as EOF
+    {
+        QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+        next_okay = true;
+    }
+    else
+    {
+        next_okay = is_delimiter(next);
+    }
+    is->seek(pos, SEEK_SET);  // undo the one-byte lookahead
+    if (! next_okay)
+    {
+        return false;
+    }
+    if (token_start == 0)
+    {
+        // Can't actually happen...we never start the search at the
+        // beginning of the input (a token at offset 0 has no predecessor).
+        return false;
+    }
+    return true;
+}
+
 QPDFTokenizer::Members::Members() :
     pound_special_in_name(true),
     allow_eof(false),
@@ -31,9 +94,11 @@ QPDFTokenizer::Members::reset()
     error_message = "";
     unread_char = false;
     char_to_unread = '\0';
+    inline_image_bytes = 0;
     string_depth = 0;
     string_ignoring_newline = false;
     last_char_was_bs = false;
+    last_char_was_cr = false;
 }
 
 QPDFTokenizer::Members::~Members()
@@ -90,7 +155,7 @@ QPDFTokenizer::isSpace(char ch)
 bool
 QPDFTokenizer::isDelimiter(char ch)
 {
-    return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
+    return is_delimiter(ch);
 }
 
 void
@@ -217,6 +282,7 @@ QPDFTokenizer::presentCharacter(char ch)
 	    memset(this->m->bs_num_register, '\0',
                    sizeof(this->m->bs_num_register));
 	    this->m->last_char_was_bs = false;
+	    this->m->last_char_was_cr = false;
 	    this->m->state = st_in_string;
 	}
 	else if (ch == '<')
@@ -334,8 +400,7 @@ QPDFTokenizer::presentCharacter(char ch)
     }
     else if (this->m->state == st_in_string)
     {
-	if (this->m->string_ignoring_newline &&
-            (! ((ch == '\r') || (ch == '\n'))))
+	if (this->m->string_ignoring_newline && (ch != '\n'))
 	{
 	    this->m->string_ignoring_newline = false;
 	}
@@ -353,9 +418,10 @@ QPDFTokenizer::presentCharacter(char ch)
 	    bs_num_count = 0;
 	}
 
-	if (this->m->string_ignoring_newline && ((ch == '\r') || (ch == '\n')))
+	if (this->m->string_ignoring_newline && (ch == '\n'))
 	{
 	    // ignore
+            this->m->string_ignoring_newline = false;
 	}
 	else if (ch_is_octal &&
                  (this->m->last_char_was_bs || (bs_num_count > 0)))
@@ -386,8 +452,10 @@ QPDFTokenizer::presentCharacter(char ch)
 		this->m->val += '\f';
 		break;
 
-	      case '\r':
 	      case '\n':
+                break;
+
+	      case '\r':
 		this->m->string_ignoring_newline = true;
 		break;
 
@@ -417,11 +485,26 @@ QPDFTokenizer::presentCharacter(char ch)
 	    this->m->type = tt_string;
 	    this->m->state = st_token_ready;
 	}
+        else if (ch == '\r')
+        {
+            // CR by itself is converted to LF
+            this->m->val += '\n';
+        }
+        else if (ch == '\n')
+        {
+            // CR LF is converted to LF
+            if (! this->m->last_char_was_cr)
+            {
+                this->m->val += ch;
+            }
+        }
 	else
 	{
 	    this->m->val += ch;
 	}
 
+        this->m->last_char_was_cr =
+            ((! this->m->string_ignoring_newline) && (ch == '\r'));
 	this->m->last_char_was_bs =
             ((! this->m->last_char_was_bs) && (ch == '\\'));
     }
@@ -449,21 +532,28 @@ QPDFTokenizer::presentCharacter(char ch)
     }
     else if (this->m->state == st_inline_image)
     {
+        this->m->val += ch;
         size_t len = this->m->val.length();
-        if ((len >= 4) &&
-            isDelimiter(this->m->val.at(len-4)) &&
-            (this->m->val.at(len-3) == 'E') &&
-            (this->m->val.at(len-2) == 'I') &&
-            isDelimiter(this->m->val.at(len-1)))
+        if (len == this->m->inline_image_bytes)
         {
+            QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
             this->m->type = tt_inline_image;
-            this->m->unread_char = true;
-            this->m->char_to_unread = ch;
+            this->m->inline_image_bytes = 0;
             this->m->state = st_token_ready;
         }
-        else
+        else if ((this->m->inline_image_bytes == 0) &&
+                 (len >= 4) &&
+                 isDelimiter(this->m->val.at(len-4)) &&
+                 (this->m->val.at(len-3) == 'E') &&
+                 (this->m->val.at(len-2) == 'I') &&
+                 isDelimiter(this->m->val.at(len-1)))
         {
-            this->m->val += ch;
+            QTC::TC("qpdf", "QPDFTokenizer found EI the old way");
+            this->m->val.erase(len - 1);
+            this->m->type = tt_inline_image;
+            this->m->unread_char = true;
+            this->m->char_to_unread = ch;
+            this->m->state = st_token_ready;
         }
     }
     else
@@ -471,7 +561,6 @@ QPDFTokenizer::presentCharacter(char ch)
 	handled = false;
     }
 
-
     if (handled)
     {
 	// okay
@@ -546,7 +635,7 @@ QPDFTokenizer::presentEOF()
             (this->m->val.at(len-2) == 'E') &&
             (this->m->val.at(len-1) == 'I'))
         {
-            QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+            QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way");
             this->m->type = tt_inline_image;
             this->m->state = st_token_ready;
         }
@@ -582,14 +671,139 @@ QPDFTokenizer::presentEOF()
 void
 QPDFTokenizer::expectInlineImage()
 {
+    expectInlineImage(PointerHolder<InputSource>());  // presumably a null holder, for which findEI() returns early
+}
+
+void
+QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
+{
     if (this->m->state != st_top)
     {
         throw std::logic_error("QPDFTokenizer::expectInlineImage called"
                                " when tokenizer is in improper state");
     }
+    findEI(input);  // may set inline_image_bytes; returns at once when input holds no pointer
     this->m->state = st_inline_image;
 }
 
+void
+QPDFTokenizer::findEI(PointerHolder<InputSource> input)
+{
+    if (! input.getPointer())
+    {
+        return;
+    }
+
+    qpdf_offset_t last_offset = input->getLastOffset();  // saved so it can be restored on exit
+    qpdf_offset_t pos = input->tell();  // starting position; restored on exit
+
+    // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
+    // Then read the next several tokens or up to EOF. If we find any
+    // suspicious-looking or bad tokens, this is probably still part
+    // of the image data, so keep looking for EI. Stop at the first EI
+    // that passes. If we get to the end without finding one, return
+    // the last EI we found. Store the number of bytes expected in the
+    // inline image including the EI and use that to break out of
+    // inline image, falling back to the old method if needed.
+
+    bool okay = false;
+    bool first_try = true;
+    while (! okay)
+    {
+        QPDFWordTokenFinder f(input, "EI");
+        if (! input->findFirst("EI", input->tell(), 0, f))
+        {
+            break;  // no further candidate; inline_image_bytes keeps whatever was stored last
+        }
+        this->m->inline_image_bytes = input->tell() - pos - 2;  // NOTE(review): "- 2" looks like it excludes "EI", unlike the comment above -- confirm
+
+        QPDFTokenizer check;
+        bool found_bad = false;
+        // Look at the next 10 tokens or up to EOF. The next inline
+        // image's image data would look like bad tokens, but there
+        // will always be at least 10 tokens between one inline
+        // image's EI and the next valid one's ID since width, height,
+        // bits per pixel, and color space are all required as well as
+        // a BI and ID. If we get 10 good tokens in a row or hit EOF,
+        // we can be pretty sure we've found the actual EI.
+        for (int i = 0; i < 10; ++i)
+        {
+            QPDFTokenizer::Token t =
+                check.readToken(input, "checker", true);
+            token_type_e type = t.getType();
+            if (type == tt_eof)
+            {
+                okay = true;
+            }
+            else if (type == tt_bad)
+            {
+                found_bad = true;
+            }
+            else if (type == tt_word)
+            {
+                // The qpdf tokenizer lumps alphabetic and otherwise
+                // uncategorized characters into "words". We recognize
+                // strings of alphabetic characters as potential valid
+                // operators for purposes of telling whether we're in
+                // valid content or not. It's not perfect, but it
+                // should work more reliably than what we used to do,
+                // which was already good enough for the vast majority
+                // of files.
+                bool found_alpha = false;
+                bool found_non_printable = false;
+                bool found_other = false;
+                std::string value = t.getValue();
+                for (std::string::iterator iter = value.begin();
+                     iter != value.end(); ++iter)
+                {
+                    char ch = *iter;
+                    if (((ch >= 'a') && (ch <= 'z')) ||
+                        ((ch >= 'A') && (ch <= 'Z')) ||
+                        (ch == '*'))
+                    {
+                        // Treat '*' as alpha since there are valid
+                        // PDF operators that contain * along with
+                        // alphabetic characters.
+                        found_alpha = true;
+                    }
+                    else if (((ch < 32) && (! isSpace(ch))) || (ch > 127))
+                    {
+                        found_non_printable = true;
+                        break;
+                    }
+                    else
+                    {
+                        found_other = true;
+                    }
+                }
+                if (found_non_printable || (found_alpha && found_other))  // a word mixing letters with other characters counts as bad
+                {
+                    found_bad = true;
+                }
+            }
+            if (okay || found_bad)
+            {
+                break;
+            }
+        }
+        if (! found_bad)
+        {
+            okay = true;
+        }
+        if (! okay)
+        {
+            first_try = false;
+        }
+    }
+    if (okay && (! first_try))
+    {
+        QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
+    }
+
+    input->seek(pos, SEEK_SET);  // put the read position back where it was on entry
+    input->setLastOffset(last_offset);
+}
+
 bool
 QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
 {