Refactor QPDFTokenizer's inline image handling

Add an overload of expectInlineImage that takes an input source and searches it for EI, the operator that ends inline image data. This prepares for improving the way EI is found. This commit only refactors the code without changing functionality, and it adds tests to verify that the old and new code behave identically.
author: Jay Berkenbilt <ejb@ql.org> 2019-01-30 20:20:56 +0100
committer: Jay Berkenbilt <ejb@ql.org> 2019-01-31 15:26:37 +0100
commit: ec9e310c9ea9cee8d9e16cad2a68f0ad096f3a4b (patch)
tree: 970f4526f39909838f837eb1de5ac672881e9d58 /libqpdf
parent: 31372edce0b60211c7af98340b3afa054f414ca4 (diff)
download: qpdf-ec9e310c9ea9cee8d9e16cad2a68f0ad096f3a4b.tar.zst
3 files changed, 135 insertions, 57 deletions
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
index 577c5cc7..bd5d88ab 100644
--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -1,13 +1,13 @@
 #include <qpdf/Pl_QPDFTokenizer.hh>
 #include <qpdf/QTC.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/BufferInputSource.hh>
 #include <stdexcept>
 #include <string.h>
 
 Pl_QPDFTokenizer::Members::Members() :
     filter(0),
-    last_char_was_cr(false),
-    unread_char(false),
-    char_to_unread('\0')
+    buf("tokenizer buffer")
 {
 }
 
@@ -33,61 +33,36 @@ Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
 }
 
 void
-Pl_QPDFTokenizer::processChar(char ch)
+Pl_QPDFTokenizer::write(unsigned char* data, size_t len)
 {
-    this->m->tokenizer.presentCharacter(ch);
-    QPDFTokenizer::Token token;
-    if (this->m->tokenizer.getToken(
-            token, this->m->unread_char, this->m->char_to_unread))
-    {
-	this->m->filter->handleToken(token);
-        if ((token.getType() == QPDFTokenizer::tt_word) &&
-            (token.getValue() == "ID"))
-        {
-            QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
-            this->m->tokenizer.expectInlineImage();
-        }
-    }
-}
-
-
-void
-Pl_QPDFTokenizer::checkUnread()
-{
-    if (this->m->unread_char)
-    {
-	processChar(this->m->char_to_unread);
-	if (this->m->unread_char)
-	{
-	    throw std::logic_error(
-		"INTERNAL ERROR: unread_char still true after processing "
-		"unread character");
-	}
-    }
-}
-
-void
-Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
-{
-    checkUnread();
-    for (size_t i = 0; i < len; ++i)
-    {
-	processChar(buf[i]);
-	checkUnread();
-    }
+    this->m->buf.write(data, len);
 }
 
 void
 Pl_QPDFTokenizer::finish()
 {
-    this->m->tokenizer.presentEOF();
-    QPDFTokenizer::Token token;
-    if (this->m->tokenizer.getToken(
-            token, this->m->unread_char, this->m->char_to_unread))
+    this->m->buf.finish();
+    PointerHolder<InputSource> input =
+        new BufferInputSource("tokenizer data",
+                              this->m->buf.getBuffer(), true);
+
+    while (true)
     {
+        QPDFTokenizer::Token token = this->m->tokenizer.readToken(
+            input, "offset " + QUtil::int_to_string(input->tell()),
+            true);
 	this->m->filter->handleToken(token);
+        if (token.getType() == QPDFTokenizer::tt_eof)
+        {
+            break;
+        }
+        else if ((token.getType() == QPDFTokenizer::tt_word) &&
+                 (token.getValue() == "ID"))
+        {
+            QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
+            this->m->tokenizer.expectInlineImage(input);
+        }
     }
-
     this->m->filter->handleEOF();
     QPDFObjectHandle::TokenFilter::PipelineAccessor::setPipeline(
         m->filter, 0);
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index ecaa49bd..de5d56b3 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -1558,7 +1558,7 @@ QPDFObjectHandle::parseContentStream_data(
             // terminated the token.  Read until end of inline image.
             char ch;
             input->read(&ch, 1);
-            tokenizer.expectInlineImage();
+            tokenizer.expectInlineImage(input);
             QPDFTokenizer::Token t =
                 tokenizer.readToken(input, description, true);
             if (t.getType() == QPDFTokenizer::tt_bad)
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index c11c8218..e03f927b 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -13,6 +13,79 @@
 #include <string.h>
 #include <cstdlib>
 
+static bool is_delimiter(char ch)
+{
+    return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
+}
+
+class QPDFWordTokenFinder: public InputSource::Finder
+{
+  public:
+    QPDFWordTokenFinder(PointerHolder<InputSource> is,
+                        std::string const& str) :
+        is(is),
+        str(str)
+    {
+    }
+    virtual ~QPDFWordTokenFinder()
+    {
+    }
+    virtual bool check();
+
+  private:
+    PointerHolder<InputSource> is;
+    std::string str;
+};
+
+bool
+QPDFWordTokenFinder::check()
+{
+    // Find a word token matching the given string, preceded by a
+    // delimiter, and followed by a delimiter or EOF.
+    QPDFTokenizer tokenizer;
+    QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
+    qpdf_offset_t pos = is->tell();
+    if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
+    {
+///        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+        return false;
+    }
+    qpdf_offset_t token_start = is->getLastOffset();
+    char next;
+    bool next_okay = false;
+    if (is->read(&next, 1) == 0)
+    {
+        QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+        next_okay = true;
+    }
+    else
+    {
+        next_okay = is_delimiter(next);
+    }
+    is->seek(pos, SEEK_SET);
+    if (! next_okay)
+    {
+///        QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
+        return false;
+    }
+    if (token_start == 0)
+    {
+        // This cannot actually happen: the search never starts at
+        // the beginning of the input.
+        return false;
+    }
+    is->seek(token_start - 1, SEEK_SET);
+    char prev;
+    bool prev_okay = ((is->read(&prev, 1) == 1) && is_delimiter(prev));
+    is->seek(pos, SEEK_SET);
+    if (! prev_okay)
+    {
+///        QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
+        return false;
+    }
+    return true;
+}
+
 QPDFTokenizer::Members::Members() :
     pound_special_in_name(true),
     allow_eof(false),
@@ -31,6 +104,7 @@ QPDFTokenizer::Members::reset()
     error_message = "";
     unread_char = false;
     char_to_unread = '\0';
+    inline_image_bytes = 0;
     string_depth = 0;
     string_ignoring_newline = false;
     last_char_was_bs = false;
@@ -91,7 +165,7 @@ QPDFTokenizer::isSpace(char ch)
 bool
 QPDFTokenizer::isDelimiter(char ch)
 {
-    return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
+    return is_delimiter(ch);
 }
 
 void
@@ -470,12 +544,21 @@ QPDFTokenizer::presentCharacter(char ch)
     {
         this->m->val += ch;
         size_t len = this->m->val.length();
-        if ((len >= 4) &&
-            isDelimiter(this->m->val.at(len-4)) &&
-            (this->m->val.at(len-3) == 'E') &&
-            (this->m->val.at(len-2) == 'I') &&
-            isDelimiter(this->m->val.at(len-1)))
+        if (len == this->m->inline_image_bytes)
+        {
+            QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
+            this->m->type = tt_inline_image;
+            this->m->inline_image_bytes = 0;
+            this->m->state = st_token_ready;
+        }
+        else if ((this->m->inline_image_bytes == 0) &&
+                 (len >= 4) &&
+                 isDelimiter(this->m->val.at(len-4)) &&
+                 (this->m->val.at(len-3) == 'E') &&
+                 (this->m->val.at(len-2) == 'I') &&
+                 isDelimiter(this->m->val.at(len-1)))
         {
+            QTC::TC("qpdf", "QPDFTokenizer found EI the old way");
             this->m->val.erase(len - 1);
             this->m->type = tt_inline_image;
             this->m->unread_char = true;
@@ -562,7 +645,7 @@ QPDFTokenizer::presentEOF()
             (this->m->val.at(len-2) == 'E') &&
             (this->m->val.at(len-1) == 'I'))
         {
-            QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+            QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way");
             this->m->type = tt_inline_image;
             this->m->state = st_token_ready;
         }
@@ -598,6 +681,26 @@ QPDFTokenizer::presentEOF()
 void
 QPDFTokenizer::expectInlineImage()
 {
+    expectInlineImage(PointerHolder<InputSource>());
+}
+
+void
+QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
+{
+    if (input.getPointer())
+    {
+        qpdf_offset_t last_offset = input->getLastOffset();
+        qpdf_offset_t pos = input->tell();
+
+        QPDFWordTokenFinder f(input, "EI");
+        if (input->findFirst("EI", pos, 0, f))
+        {
+            this->m->inline_image_bytes = input->tell() - pos;
+        }
+
+        input->seek(pos, SEEK_SET);
+        input->setLastOffset(last_offset);
+    }
     if (this->m->state != st_top)
     {
         throw std::logic_error("QPDFTokenizer::expectInlineImage called"
author	Jay Berkenbilt <ejb@ql.org>	2019-01-30 20:20:56 +0100
committer	Jay Berkenbilt <ejb@ql.org>	2019-01-31 15:26:37 +0100
commit	ec9e310c9ea9cee8d9e16cad2a68f0ad096f3a4b (patch)
tree	970f4526f39909838f837eb1de5ac672881e9d58 /libqpdf
parent	31372edce0b60211c7af98340b3afa054f414ca4 (diff)
download	qpdf-ec9e310c9ea9cee8d9e16cad2a68f0ad096f3a4b.tar.zst