8 files changed, 176 insertions, 73 deletions
diff --git a/include/qpdf/Pl_QPDFTokenizer.hh b/include/qpdf/Pl_QPDFTokenizer.hh
index 52630d2a..a571b079 100644
--- a/include/qpdf/Pl_QPDFTokenizer.hh
+++ b/include/qpdf/Pl_QPDFTokenizer.hh
@@ -27,6 +27,7 @@
 #include <qpdf/QPDFTokenizer.hh>
 #include <qpdf/PointerHolder.hh>
 #include <qpdf/QPDFObjectHandle.hh>
+#include <qpdf/Pl_Buffer.hh>
 
 // Tokenize the incoming text using QPDFTokenizer and pass the tokens
 // in turn to a QPDFObjectHandle::TokenFilter object. All bytes of
@@ -56,9 +57,6 @@ class Pl_QPDFTokenizer: public Pipeline
     virtual void finish();
 
   private:
-    void processChar(char ch);
-    void checkUnread();
-
     class Members
     {
         friend class Pl_QPDFTokenizer;
@@ -73,9 +71,7 @@ class Pl_QPDFTokenizer: public Pipeline
 
         QPDFObjectHandle::TokenFilter* filter;
         QPDFTokenizer tokenizer;
-        bool last_char_was_cr;
-        bool unread_char;
-        char char_to_unread;
+        Pl_Buffer buf;
     };
     PointerHolder<Members> m;
 };
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index 370296b2..31f2f398 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -178,7 +178,15 @@ class QPDFTokenizer
     // including the next EI token. After you call this method, the
     // next call to readToken (or the token created next time getToken
     // returns true) will either be tt_inline_image or tt_bad. This is
-    // the only way readToken returns a tt_inline_image token.
+    // the only way readToken returns a tt_inline_image token. The
+    // version of this method that takes a PointerHolder<InputSource>
+    // does a better job of locating the end of the inline image and
+    // should be used whenever the input source is available. It
+    // preserves both tell() and getLastOffset(). The version without
+    // the input source will always end the inline image the first
+    // time it sees something that looks like an EI operator.
+    QPDF_DLL
+    void expectInlineImage(PointerHolder<InputSource> input);
     QPDF_DLL
     void expectInlineImage();
 
@@ -223,6 +231,7 @@ class QPDFTokenizer
         std::string error_message;
         bool unread_char;
         char char_to_unread;
+        size_t inline_image_bytes;
 
         // State for strings
         int string_depth;
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
index 577c5cc7..bd5d88ab 100644
--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -1,13 +1,13 @@
 #include <qpdf/Pl_QPDFTokenizer.hh>
 #include <qpdf/QTC.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/BufferInputSource.hh>
 #include <stdexcept>
 #include <string.h>
 
 Pl_QPDFTokenizer::Members::Members() :
     filter(0),
-    last_char_was_cr(false),
-    unread_char(false),
-    char_to_unread('\0')
+    buf("tokenizer buffer")
 {
 }
 
@@ -33,61 +33,36 @@ Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
 }
 
 void
-Pl_QPDFTokenizer::processChar(char ch)
+Pl_QPDFTokenizer::write(unsigned char* data, size_t len)
 {
-    this->m->tokenizer.presentCharacter(ch);
-    QPDFTokenizer::Token token;
-    if (this->m->tokenizer.getToken(
-            token, this->m->unread_char, this->m->char_to_unread))
-    {
-	this->m->filter->handleToken(token);
-        if ((token.getType() == QPDFTokenizer::tt_word) &&
-            (token.getValue() == "ID"))
-        {
-            QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
-            this->m->tokenizer.expectInlineImage();
-        }
-    }
-}
-
-
-void
-Pl_QPDFTokenizer::checkUnread()
-{
-    if (this->m->unread_char)
-    {
-	processChar(this->m->char_to_unread);
-	if (this->m->unread_char)
-	{
-	    throw std::logic_error(
-		"INTERNAL ERROR: unread_char still true after processing "
-		"unread character");
-	}
-    }
-}
-
-void
-Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
-{
-    checkUnread();
-    for (size_t i = 0; i < len; ++i)
-    {
-	processChar(buf[i]);
-	checkUnread();
-    }
+    this->m->buf.write(data, len);
 }
 
 void
 Pl_QPDFTokenizer::finish()
 {
-    this->m->tokenizer.presentEOF();
-    QPDFTokenizer::Token token;
-    if (this->m->tokenizer.getToken(
-            token, this->m->unread_char, this->m->char_to_unread))
+    this->m->buf.finish();
+    PointerHolder<InputSource> input =
+        new BufferInputSource("tokenizer data",
+                              this->m->buf.getBuffer(), true);
+
+    while (true)
     {
+        QPDFTokenizer::Token token = this->m->tokenizer.readToken(
+            input, "offset " + QUtil::int_to_string(input->tell()),
+            true);
 	this->m->filter->handleToken(token);
+        if (token.getType() == QPDFTokenizer::tt_eof)
+        {
+            break;
+        }
+        else if ((token.getType() == QPDFTokenizer::tt_word) &&
+                 (token.getValue() == "ID"))
+        {
+            QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
+            this->m->tokenizer.expectInlineImage(input);
+        }
     }
-
     this->m->filter->handleEOF();
     QPDFObjectHandle::TokenFilter::PipelineAccessor::setPipeline(
         m->filter, 0);
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index ecaa49bd..de5d56b3 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -1558,7 +1558,7 @@ QPDFObjectHandle::parseContentStream_data(
             // terminated the token.  Read until end of inline image.
             char ch;
             input->read(&ch, 1);
-            tokenizer.expectInlineImage();
+            tokenizer.expectInlineImage(input);
             QPDFTokenizer::Token t =
                 tokenizer.readToken(input, description, true);
             if (t.getType() == QPDFTokenizer::tt_bad)
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index c11c8218..e03f927b 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -13,6 +13,79 @@
 #include <string.h>
 #include <cstdlib>
 
+static bool is_delimiter(char ch) // true when ch is one of: space \t \n \v \f \r ( ) < > [ ] { } / %
+{
+    return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0); // NOTE: strchr treats the terminating '\0' as part of the string, so ch == '\0' also yields true
+}
+
+class QPDFWordTokenFinder: public InputSource::Finder // Finder whose check() accepts only a stand-alone word token equal to str
+{
+  public:
+    QPDFWordTokenFinder(PointerHolder<InputSource> is,
+                        std::string const& str) : // str is copied into the member of the same name
+        is(is),
+        str(str)
+    {
+    }
+    virtual ~QPDFWordTokenFinder()
+    {
+    }
+    virtual bool check(); // true only for a matching word bounded by delimiters (or by EOF after it)
+
+  private:
+    PointerHolder<InputSource> is; // input that check() reads from and seeks within
+    std::string str; // exact word value that check() compares the token against
+};
+
+bool
+QPDFWordTokenFinder::check()
+{
+    // Accept only if the next token read from the input is a word equal to str, preceded by a
+    // delimiter, and followed by a delimiter or EOF. Every return leaves the input positioned at pos.
+    QPDFTokenizer tokenizer; // fresh tokenizer, local to this call
+    QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
+    qpdf_offset_t pos = is->tell(); // input position once the token has been read
+    if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
+    {
+///        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+        return false;
+    }
+    qpdf_offset_t token_start = is->getLastOffset(); // used below as the offset of the token's first byte
+    char next;
+    bool next_okay = false;
+    if (is->read(&next, 1) == 0) // nothing could be read after the word: treated as EOF, which is acceptable
+    {
+        QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+        next_okay = true;
+    }
+    else
+    {
+        next_okay = is_delimiter(next);
+    }
+    is->seek(pos, SEEK_SET); // undo the one-byte lookahead
+    if (! next_okay)
+    {
+///        QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
+        return false;
+    }
+    if (token_start == 0) // guards the token_start - 1 seek below
+    {
+        // Can't actually happen...we never start the search at the
+        // beginning of the input.
+        return false;
+    }
+    is->seek(token_start - 1, SEEK_SET); // step back to the byte immediately before the token
+    char prev;
+    bool prev_okay = ((is->read(&prev, 1) == 1) && is_delimiter(prev));
+    is->seek(pos, SEEK_SET); // restore the post-token position before reporting the result
+    if (! prev_okay)
+    {
+///        QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
+        return false;
+    }
+    return true;
+}
+
 QPDFTokenizer::Members::Members() :
     pound_special_in_name(true),
     allow_eof(false),
@@ -31,6 +104,7 @@ QPDFTokenizer::Members::reset()
     error_message = "";
     unread_char = false;
     char_to_unread = '\0';
+    inline_image_bytes = 0;
     string_depth = 0;
     string_ignoring_newline = false;
     last_char_was_bs = false;
@@ -91,7 +165,7 @@ QPDFTokenizer::isSpace(char ch)
 bool
 QPDFTokenizer::isDelimiter(char ch)
 {
-    return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
+    return is_delimiter(ch);
 }
 
 void
@@ -470,12 +544,21 @@ QPDFTokenizer::presentCharacter(char ch)
     {
         this->m->val += ch;
         size_t len = this->m->val.length();
-        if ((len >= 4) &&
-            isDelimiter(this->m->val.at(len-4)) &&
-            (this->m->val.at(len-3) == 'E') &&
-            (this->m->val.at(len-2) == 'I') &&
-            isDelimiter(this->m->val.at(len-1)))
+        if (len == this->m->inline_image_bytes)
+        {
+            QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
+            this->m->type = tt_inline_image;
+            this->m->inline_image_bytes = 0;
+            this->m->state = st_token_ready;
+        }
+        else if ((this->m->inline_image_bytes == 0) &&
+                 (len >= 4) &&
+                 isDelimiter(this->m->val.at(len-4)) &&
+                 (this->m->val.at(len-3) == 'E') &&
+                 (this->m->val.at(len-2) == 'I') &&
+                 isDelimiter(this->m->val.at(len-1)))
         {
+            QTC::TC("qpdf", "QPDFTokenizer found EI the old way");
             this->m->val.erase(len - 1);
             this->m->type = tt_inline_image;
             this->m->unread_char = true;
@@ -562,7 +645,7 @@ QPDFTokenizer::presentEOF()
             (this->m->val.at(len-2) == 'E') &&
             (this->m->val.at(len-1) == 'I'))
         {
-            QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+            QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way");
             this->m->type = tt_inline_image;
             this->m->state = st_token_ready;
         }
@@ -598,6 +681,26 @@ QPDFTokenizer::presentEOF()
 void
 QPDFTokenizer::expectInlineImage()
 {
+    expectInlineImage(PointerHolder<InputSource>()); // delegate with a default-constructed holder (presumably null — confirm), so no input-based EI search is attempted
+}
+
+void
+QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
+{
+    if (input.getPointer())
+    {
+        qpdf_offset_t last_offset = input->getLastOffset();
+        qpdf_offset_t pos = input->tell();
+
+        QPDFWordTokenFinder f(input, "EI");
+        if (input->findFirst("EI", pos, 0, f))
+        {
+            this->m->inline_image_bytes = input->tell() - pos;
+        }
+
+        input->seek(pos, SEEK_SET);
+        input->setLastOffset(last_offset);
+    }
     if (this->m->state != st_top)
     {
         throw std::logic_error("QPDFTokenizer::expectInlineImage called"
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 5150e567..6dcebd6e 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -430,3 +430,6 @@ QPDFPageObjectHelper copy shared attribute 0
 qpdf from_nr from repeat_nr 0
 QPDF resolve duplicated page object 0
 QPDF handle direct page object 0
+QPDFTokenizer found EI the old way 0
+QPDFTokenizer found EI by byte count 0
+QPDFTokenizer inline image at EOF the old way 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index ca7ea12b..6abc7edb 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -694,7 +694,7 @@ $td->runtest("check pass1 file",
 show_ntests();
 # ----------
 $td->notify("--- Tokenizer ---");
-$n_tests += 4;
+$n_tests += 5;
 
 $td->runtest("tokenizer with no ignorable",
              {$td->COMMAND => "test_tokenizer -no-ignorable tokens.pdf"},
@@ -706,6 +706,11 @@ $td->runtest("tokenizer",
              {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
              $td->NORMALIZE_NEWLINES);
 
+$td->runtest("tokenizer with old inline image code",
+             {$td->COMMAND => "test_tokenizer -old-ei tokens.pdf"},
+             {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+
 $td->runtest("tokenizer with max_len",
              {$td->COMMAND => "test_tokenizer -maxlen 50 tokens.pdf"},
              {$td->FILE => "tokens-maxlen.out", $td->EXIT_STATUS => 0},
diff --git a/qpdf/test_tokenizer.cc b/qpdf/test_tokenizer.cc
index 9f65281b..ecbb3552 100644
--- a/qpdf/test_tokenizer.cc
+++ b/qpdf/test_tokenizer.cc
@@ -16,7 +16,7 @@ static char const* whoami = 0;
 void usage()
 {
     std::cerr << "Usage: " << whoami
-              << " [-maxlen len | -no-ignorable] filename"
+              << " [-maxlen len | -no-ignorable | -old-ei] filename"
               << std::endl;
     exit(2);
 }
@@ -132,7 +132,7 @@ try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
 static void
 dump_tokens(PointerHolder<InputSource> is, std::string const& label,
             size_t max_len, bool include_ignorable,
-            bool skip_streams, bool skip_inline_images)
+            bool skip_streams, bool skip_inline_images, bool old_ei)
 {
     Finder f1(is, "endstream");
     std::cout << "--- BEGIN " << label << " ---" << std::endl;
@@ -183,7 +183,14 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
         else if (skip_inline_images &&
                  (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
         {
-            tokenizer.expectInlineImage();
+            if (old_ei)
+            {
+                tokenizer.expectInlineImage();
+            }
+            else
+            {
+                tokenizer.expectInlineImage(is);
+            }
             inline_image_offset = is->tell();
         }
         else if (token.getType() == QPDFTokenizer::tt_eof)
@@ -195,7 +202,7 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
 }
 
 static void process(char const* filename, bool include_ignorable,
-                    size_t max_len)
+                    size_t max_len, bool old_ei)
 {
     PointerHolder<InputSource> is;
 
@@ -203,7 +210,7 @@ static void process(char const* filename, bool include_ignorable,
     FileInputSource* fis = new FileInputSource();
     fis->setFilename(filename);
     is = fis;
-    dump_tokens(is, "FILE", max_len, include_ignorable, true, false);
+    dump_tokens(is, "FILE", max_len, include_ignorable, true, false, false);
 
     // Tokenize content streams, skipping inline images
     QPDF qpdf;
@@ -222,7 +229,7 @@ static void process(char const* filename, bool include_ignorable,
             "content data", content_data.getPointer());
         is = bis;
         dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
-                    max_len, include_ignorable, false, true);
+                    max_len, include_ignorable, false, true, old_ei);
     }
 
     // Tokenize object streams
@@ -241,7 +248,7 @@ static void process(char const* filename, bool include_ignorable,
             is = bis;
             dump_tokens(is, "OBJECT STREAM " +
                         QUtil::int_to_string((*iter).getObjectID()),
-                        max_len, include_ignorable, false, false);
+                        max_len, include_ignorable, false, false, false);
         }
     }
 }
@@ -266,6 +273,7 @@ int main(int argc, char* argv[])
     char const* filename = 0;
     size_t max_len = 0;
     bool include_ignorable = true;
+    bool old_ei = false;
     for (int i = 1; i < argc; ++i)
     {
         if (argv[i][0] == '-')
@@ -282,6 +290,10 @@ int main(int argc, char* argv[])
             {
                 include_ignorable = false;
             }
+            else if (strcmp(argv[i], "-old-ei") == 0)
+            {
+                old_ei = true;
+            }
             else
             {
                 usage();
@@ -303,7 +315,7 @@ int main(int argc, char* argv[])
 
     try
     {
-        process(filename, include_ignorable, max_len);
+        process(filename, include_ignorable, max_len, old_ei);
     }
     catch (std::exception& e)
     {