1 files changed, 716 insertions, 389 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 1726e1b9..cd8f932d 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -73,28 +73,20 @@ QPDFWordTokenFinder::check()
     return true;
 }
 
-QPDFTokenizer::Members::Members() :
-    allow_eof(false),
-    include_ignorable(false)
-{
-    reset();
-}
-
 void
-QPDFTokenizer::Members::reset()
+QPDFTokenizer::reset() // clears per-token state; allow_eof / include_ignorable are kept
 {
-    state = st_top;
+    state = st_before_token;
     type = tt_bad;
-    val = "";
-    raw_val = "";
+    val.clear();
+    raw_val.clear();
     error_message = "";
-    unread_char = false;
+    before_token = true; // no character of the next token has been seen yet
+    in_token = false;
     char_to_unread = '\0';
     inline_image_bytes = 0;
     string_depth = 0;
-    string_ignoring_newline = false;
-    last_char_was_bs = false;
-    last_char_was_cr = false;
+    bad = false; // set by inNameHex2 when a name contains #00
 }
 
 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
@@ -110,20 +102,22 @@ QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
 }
 
 QPDFTokenizer::QPDFTokenizer() :
-    m(new Members())
+    allow_eof(false),
+    include_ignorable(false)
 {
+    reset(); // start in st_before_token with empty value buffers
 }
 
 void
 QPDFTokenizer::allowEOF()
 {
-    this->m->allow_eof = true;
+    this->allow_eof = true; // NOTE(review): presumably consulted by readToken when it sees tt_eof -- confirm
 }
 
 void
 QPDFTokenizer::includeIgnorable()
 {
-    this->m->include_ignorable = true;
+    this->include_ignorable = true; // whitespace and comments become tt_space / tt_comment tokens
 }
 
 bool
@@ -139,376 +133,719 @@ QPDFTokenizer::isDelimiter(char ch)
 }
 
 void
-QPDFTokenizer::resolveLiteral()
-{
-    if ((this->m->val.length() > 0) && (this->m->val.at(0) == '/')) {
-        this->m->type = tt_name;
-        // Deal with # in name token.  Note: '/' by itself is a
-        // valid name, so don't strip leading /.  That way we
-        // don't have to deal with the empty string as a name.
-        std::string nval = "/";
-        size_t len = this->m->val.length();
-        for (size_t i = 1; i < len; ++i) {
-            char ch = this->m->val.at(i);
-            if (ch == '#') {
-                if ((i + 2 < len) &&
-                    QUtil::is_hex_digit(this->m->val.at(i + 1)) &&
-                    QUtil::is_hex_digit(this->m->val.at(i + 2))) {
-                    char num[3];
-                    num[0] = this->m->val.at(i + 1);
-                    num[1] = this->m->val.at(i + 2);
-                    num[2] = '\0';
-                    char ch2 = static_cast<char>(strtol(num, nullptr, 16));
-                    if (ch2 == '\0') {
-                        this->m->type = tt_bad;
-                        QTC::TC("qpdf", "QPDFTokenizer null in name");
-                        this->m->error_message =
-                            "null character not allowed in name token";
-                        nval += "#00";
-                    } else {
-                        nval.append(1, ch2);
-                    }
-                    i += 2;
-                } else {
-                    QTC::TC("qpdf", "QPDFTokenizer bad name");
-                    this->m->error_message =
-                        "name with stray # will not work with PDF >= 1.2";
-                    // Use null to encode a bad # -- this is reversed
-                    // in QPDF_Name::normalizeName.
-                    nval += '\0';
-                }
-            } else {
-                nval.append(1, ch);
-            }
-        }
-        this->m->val = nval;
-    } else if (QUtil::is_number(this->m->val.c_str())) {
-        if (this->m->val.find('.') != std::string::npos) {
-            this->m->type = tt_real;
-        } else {
-            this->m->type = tt_integer;
-        }
-    } else if ((this->m->val == "true") || (this->m->val == "false")) {
-        this->m->type = tt_bool;
-    } else if (this->m->val == "null") {
-        this->m->type = tt_null;
-    } else {
-        // I don't really know what it is, so leave it as tt_word.
-        // Lots of cases ($, #, etc.) other than actual words fall
-        // into this category, but that's okay at least for now.
-        this->m->type = tt_word;
+QPDFTokenizer::presentCharacter(char ch)
+{
+    handleCharacter(ch); // run the state machine first: the handler updates in_token
+
+    if (this->in_token) { // false for skipped input and for a terminator that is to be unread
+        this->raw_val += ch; // raw_val keeps the token's characters exactly as presented
     }
 }
 
 void
-QPDFTokenizer::presentCharacter(char ch)
+QPDFTokenizer::handleCharacter(char ch)
 {
-    if (this->m->state == st_token_ready) {
+    // State machine is implemented such that the final character may not be
+    // handled.  This happens whenever you have to use a character from the
+    // next token to detect the end of the current token.
+
+    switch (this->state) { // dispatch on the current state; the handler chooses the next one
+    case st_top:
+        inTop(ch);
+        return;
+
+    case st_in_space:
+        inSpace(ch);
+        return;
+
+    case st_in_comment:
+        inComment(ch);
+        return;
+
+    case st_lt:
+        inLt(ch);
+        return;
+
+    case st_gt:
+        inGt(ch);
+        return;
+
+    case st_in_string:
+        inString(ch);
+        return;
+
+    case st_name:
+        inName(ch);
+        return;
+
+    case st_number:
+        inNumber(ch);
+        return;
+
+    case st_real:
+        inReal(ch);
+        return;
+
+    case st_string_after_cr:
+        inStringAfterCR(ch);
+        return;
+
+    case st_string_escape:
+        inStringEscape(ch);
+        return;
+
+    case st_char_code:
+        inCharCode(ch);
+        return;
+
+    case st_literal:
+        inLiteral(ch);
+        return;
+
+    case st_inline_image:
+        inInlineImage(ch);
+        return;
+
+    case st_in_hexstring:
+        inHexstring(ch);
+        return;
+
+    case st_in_hexstring_2nd:
+        inHexstring2nd(ch);
+        return;
+
+    case st_name_hex1:
+        inNameHex1(ch);
+        return;
+
+    case st_name_hex2:
+        inNameHex2(ch);
+        return;
+
+    case st_sign:
+        inSign(ch);
+        return;
+
+    case st_decimal:
+        inDecimal(ch);
+        return;
+
+    case (st_before_token): // the state reset() leaves the tokenizer in
+        inBeforeToken(ch);
+        return;
+
+    case (st_token_ready): // inTokenReady always throws
+        inTokenReady(ch);
+        return;
+
+    default: // a state value with no case above
         throw std::logic_error(
-            "INTERNAL ERROR: QPDF tokenizer presented character "
-            "while token is waiting");
+            "INTERNAL ERROR: invalid state while reading token");
     }
+}
 
-    char orig_ch = ch;
-
-    // State machine is implemented such that some characters may be
-    // handled more than once.  This happens whenever you have to use
-    // the character that caused a state change in the new state.
+void
+QPDFTokenizer::inTokenReady(char ch)
+{
+    throw std::logic_error( // the finished token must be fetched with getToken() first
+        "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting");
+}
 
-    bool handled = true;
-    if (this->m->state == st_top) {
-        // Note: we specifically do not use ctype here.  It is
-        // locale-dependent.
-        if (isSpace(ch)) {
-            if (this->m->include_ignorable) {
-                this->m->state = st_in_space;
-                this->m->val += ch;
-            }
-        } else if (ch == '%') {
-            this->m->state = st_in_comment;
-            if (this->m->include_ignorable) {
-                this->m->val += ch;
-            }
-        } else if (ch == '(') {
-            this->m->string_depth = 1;
-            this->m->string_ignoring_newline = false;
-            memset(
-                this->m->bs_num_register,
-                '\0',
-                sizeof(this->m->bs_num_register));
-            this->m->last_char_was_bs = false;
-            this->m->last_char_was_cr = false;
-            this->m->state = st_in_string;
-        } else if (ch == '<') {
-            this->m->state = st_lt;
-        } else if (ch == '>') {
-            this->m->state = st_gt;
-        } else {
-            this->m->val += ch;
-            if (ch == ')') {
-                this->m->type = tt_bad;
-                QTC::TC("qpdf", "QPDFTokenizer bad )");
-                this->m->error_message = "unexpected )";
-                this->m->state = st_token_ready;
-            } else if (ch == '[') {
-                this->m->type = tt_array_open;
-                this->m->state = st_token_ready;
-            } else if (ch == ']') {
-                this->m->type = tt_array_close;
-                this->m->state = st_token_ready;
-            } else if (ch == '{') {
-                this->m->type = tt_brace_open;
-                this->m->state = st_token_ready;
-            } else if (ch == '}') {
-                this->m->type = tt_brace_close;
-                this->m->state = st_token_ready;
-            } else {
-                this->m->state = st_literal;
-            }
-        }
-    } else if (this->m->state == st_in_space) {
-        // We only enter this state if include_ignorable is true.
-        if (!isSpace(ch)) {
-            this->m->type = tt_space;
-            this->m->unread_char = true;
-            this->m->char_to_unread = ch;
-            this->m->state = st_token_ready;
-        } else {
-            this->m->val += ch;
-        }
-    } else if (this->m->state == st_in_comment) {
-        if ((ch == '\r') || (ch == '\n')) {
-            if (this->m->include_ignorable) {
-                this->m->type = tt_comment;
-                this->m->unread_char = true;
-                this->m->char_to_unread = ch;
-                this->m->state = st_token_ready;
-            } else {
-                this->m->state = st_top;
-            }
-        } else if (this->m->include_ignorable) {
-            this->m->val += ch;
-        }
-    } else if (this->m->state == st_lt) {
-        if (ch == '<') {
-            this->m->val = "<<";
-            this->m->type = tt_dict_open;
-            this->m->state = st_token_ready;
-        } else {
-            handled = false;
-            this->m->state = st_in_hexstring;
-        }
-    } else if (this->m->state == st_gt) {
-        if (ch == '>') {
-            this->m->val = ">>";
-            this->m->type = tt_dict_close;
-            this->m->state = st_token_ready;
-        } else {
-            this->m->val = ">";
-            this->m->type = tt_bad;
-            QTC::TC("qpdf", "QPDFTokenizer bad >");
-            this->m->error_message = "unexpected >";
-            this->m->unread_char = true;
-            this->m->char_to_unread = ch;
-            this->m->state = st_token_ready;
+void
+QPDFTokenizer::inBeforeToken(char ch)
+{ // First character after reset(): skip or start whitespace/comment, else start a token.
+    // Note: we specifically do not use ctype here.  It is
+    // locale-dependent.
+    if (isSpace(ch)) {
+        this->before_token = !this->include_ignorable;
+        this->in_token = this->include_ignorable;
+        if (this->include_ignorable) { // otherwise the space is skipped and the state is unchanged
+            this->state = st_in_space;
+            this->val += ch;
         }
-    } else if (this->m->state == st_in_string) {
-        if (this->m->string_ignoring_newline && (ch != '\n')) {
-            this->m->string_ignoring_newline = false;
+    } else if (ch == '%') {
+        this->before_token = !this->include_ignorable;
+        this->in_token = this->include_ignorable;
+        this->state = st_in_comment; // entered either way; text is kept only when reporting comments
+        if (this->include_ignorable) {
+            this->val += ch;
         }
+    } else {
+        this->before_token = false;
+        this->in_token = true;
+        inTop(ch); // ch is the first character of a real token
+    }
+}
 
-        size_t bs_num_count = strlen(this->m->bs_num_register);
-        bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
-        if ((bs_num_count == 3) || ((bs_num_count > 0) && (!ch_is_octal))) {
-            // We've accumulated \ddd.  PDF Spec says to ignore
-            // high-order overflow.
-            this->m->val +=
-                static_cast<char>(strtol(this->m->bs_num_register, nullptr, 8));
-            memset(
-                this->m->bs_num_register,
-                '\0',
-                sizeof(this->m->bs_num_register));
-            bs_num_count = 0;
-        }
+void
+QPDFTokenizer::inTop(char ch)
+{ // Classifies the first character of a token and selects the follow-up state.
+    switch (ch) {
+    case '(':
+        this->string_depth = 1; // counts the opening parenthesis; it is not added to val
+        this->state = st_in_string;
+        return;
 
-        if (this->m->string_ignoring_newline && (ch == '\n')) {
-            // ignore
-            this->m->string_ignoring_newline = false;
-        } else if (
-            ch_is_octal && (this->m->last_char_was_bs || (bs_num_count > 0))) {
-            this->m->bs_num_register[bs_num_count++] = ch;
-        } else if (this->m->last_char_was_bs) {
-            switch (ch) {
-            case 'n':
-                this->m->val += '\n';
-                break;
+    case '<':
+        this->state = st_lt; // "<<" or a hex string; inLt decides on the next character
+        return;
 
-            case 'r':
-                this->m->val += '\r';
-                break;
+    case '>':
+        this->state = st_gt; // ">>" or a stray '>'; inGt decides on the next character
+        return;
 
-            case 't':
-                this->m->val += '\t';
-                break;
+    case (')'): // a ')' that does not close any string
+        this->type = tt_bad;
+        QTC::TC("qpdf", "QPDFTokenizer bad )");
+        this->error_message = "unexpected )";
+        this->val += ch;
+        this->state = st_token_ready;
+        return;
 
-            case 'b':
-                this->m->val += '\b';
-                break;
+    case '[':
+        this->type = tt_array_open;
+        this->state = st_token_ready;
+        this->val += ch;
+        return;
 
-            case 'f':
-                this->m->val += '\f';
-                break;
+    case ']':
+        this->type = tt_array_close;
+        this->val += ch;
+        this->state = st_token_ready;
+        return;
 
-            case '\n':
-                break;
+    case '{':
+        this->type = tt_brace_open;
+        this->state = st_token_ready;
+        this->val += ch;
+        return;
 
-            case '\r':
-                this->m->string_ignoring_newline = true;
-                break;
+    case '}':
+        this->type = tt_brace_close;
+        this->state = st_token_ready;
+        this->val += ch;
+        return;
 
-            default:
-                // PDF spec says backslash is ignored before anything else
-                this->m->val += ch;
-                break;
-            }
-        } else if (ch == '\\') {
-            // last_char_was_bs is set/cleared below as appropriate
-            if (bs_num_count) {
-                throw std::logic_error(
-                    "INTERNAL ERROR: QPDFTokenizer: bs_num_count != 0 "
-                    "when ch == '\\'");
-            }
-        } else if (ch == '(') {
-            this->m->val += ch;
-            ++this->m->string_depth;
-        } else if ((ch == ')') && (--this->m->string_depth == 0)) {
-            this->m->type = tt_string;
-            this->m->state = st_token_ready;
-        } else if (ch == '\r') {
-            // CR by itself is converted to LF
-            this->m->val += '\n';
-        } else if (ch == '\n') {
-            // CR LF is converted to LF
-            if (!this->m->last_char_was_cr) {
-                this->m->val += ch;
-            }
-        } else {
-            this->m->val += ch;
-        }
+    case '/':
+        this->state = st_name;
+        this->val += ch; // the leading '/' stays part of the name value
+        return;
 
-        this->m->last_char_was_cr =
-            ((!this->m->string_ignoring_newline) && (ch == '\r'));
-        this->m->last_char_was_bs =
-            ((!this->m->last_char_was_bs) && (ch == '\\'));
-    } else if (this->m->state == st_literal) {
-        if (isDelimiter(ch)) {
-            // A C-locale whitespace character or delimiter terminates
-            // token.  It is important to unread the whitespace
-            // character even though it is ignored since it may be the
-            // newline after a stream keyword.  Removing it here could
-            // make the stream-reading code break on some files,
-            // though not on any files in the test suite as of this
-            // writing.
-
-            this->m->type = tt_word;
-            this->m->unread_char = true;
-            this->m->char_to_unread = ch;
-            this->m->state = st_token_ready;
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+        this->state = st_number;
+        this->val += ch;
+        return;
+
+    case '+':
+    case '-':
+        this->state = st_sign; // becomes a number only if a digit or '.' follows
+        this->val += ch;
+        return;
+
+    case '.':
+        this->state = st_decimal; // becomes a real only if a digit follows
+        this->val += ch;
+        return;
+
+    default: // any other character starts a literal
+        this->state = st_literal;
+        this->val += ch;
+        return;
+    }
+}
+
+void
+QPDFTokenizer::inSpace(char ch)
+{
+    // Reached only when include_ignorable is set: gathers a run of
+    // whitespace into a single tt_space token.
+    if (isSpace(ch)) {
+        this->val += ch;
+        return;
+    }
+    // Any other character ends the run and is left for the caller to unread.
+    this->state = st_token_ready;
+    this->type = tt_space;
+    this->char_to_unread = ch;
+    this->in_token = false;
+}
+
+void
+QPDFTokenizer::inComment(char ch)
+{ // Inside a '%' comment, which runs up to the next CR or LF.
+    if ((ch == '\r') || (ch == '\n')) {
+        if (this->include_ignorable) {
+            this->type = tt_comment;
+            this->in_token = false; // the line ending is not part of the comment token
+            this->char_to_unread = ch;
+            this->state = st_token_ready;
         } else {
-            this->m->val += ch;
+            this->state = st_before_token; // comment discarded; look for the next token
         }
-    } else if (this->m->state == st_inline_image) {
-        this->m->val += ch;
-        size_t len = this->m->val.length();
-        if (len == this->m->inline_image_bytes) {
-            QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
-            this->m->type = tt_inline_image;
-            this->m->inline_image_bytes = 0;
-            this->m->state = st_token_ready;
+    } else if (this->include_ignorable) { // comment text is kept only when it will be reported
+        this->val += ch;
+    }
+}
+
+void
+QPDFTokenizer::inString(char ch)
+{ // Body of a (...) literal string; string_depth tracks nested parentheses.
+    switch (ch) {
+    case '\\':
+        this->state = st_string_escape; // the backslash itself is not stored
+        return;
+
+    case '(':
+        this->val += ch;
+        ++this->string_depth;
+        return;
+
+    case ')':
+        if (--this->string_depth == 0) { // matches the opening '(' : string complete
+            this->type = tt_string;
+            this->state = st_token_ready;
+            return;
         }
+
+        this->val += ch; // a nested ')' is ordinary content
+        return;
+
+    case '\r':
+        // CR by itself is converted to LF
+        this->val += '\n';
+        this->state = st_string_after_cr; // an LF directly after this CR is dropped there
+        return;
+
+    case '\n':
+        this->val += ch;
+        return;
+
+    default:
+        this->val += ch;
+        return;
+    }
+}
+
+void
+QPDFTokenizer::inName(char ch)
+{ // Inside a /Name token, after the leading '/'.
+    if (isDelimiter(ch)) {
+        // A C-locale whitespace character or delimiter terminates
+        // token.  It is important to unread the whitespace
+        // character even though it is ignored since it may be the
+        // newline after a stream keyword.  Removing it here could
+        // make the stream-reading code break on some files,
+        // though not on any files in the test suite as of this
+        // writing.
+
+        this->type = this->bad ? tt_bad : tt_name; // bad is set by inNameHex2 on #00
+        this->in_token = false;
+        this->char_to_unread = ch;
+        this->state = st_token_ready;
+    } else if (ch == '#') {
+        this->char_code = 0;
+        this->state = st_name_hex1; // the next two characters should be hex digits
     } else {
-        handled = false;
-    }
-
-    if (handled) {
-        // okay
-    } else if (this->m->state == st_in_hexstring) {
-        if (ch == '>') {
-            this->m->type = tt_string;
-            this->m->state = st_token_ready;
-            if (this->m->val.length() % 2) {
-                // PDF spec says odd hexstrings have implicit
-                // trailing 0.
-                this->m->val += '0';
-            }
-            char num[3];
-            num[2] = '\0';
-            std::string nval;
-            for (unsigned int i = 0; i < this->m->val.length(); i += 2) {
-                num[0] = this->m->val.at(i);
-                num[1] = this->m->val.at(i + 1);
-                char nch = static_cast<char>(strtol(num, nullptr, 16));
-                nval += nch;
-            }
-            this->m->val = nval;
-        } else if (QUtil::is_hex_digit(ch)) {
-            this->m->val += ch;
-        } else if (isSpace(ch)) {
-            // ignore
-        } else {
-            this->m->type = tt_bad;
-            QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
-            this->m->error_message =
-                std::string("invalid character (") + ch + ") in hexstring";
-            this->m->state = st_token_ready;
-        }
+        this->val += ch;
+    }
+}
+
+void
+QPDFTokenizer::inNameHex1(char ch)
+{ // First character after '#' in a name: expected to be the high hex digit.
+    this->hex_char = ch; // kept so inNameHex2 can restore it if the second digit is invalid
+
+    if ('0' <= ch && ch <= '9') {
+        this->char_code = 16 * (int(ch) - int('0'));
+        this->state = st_name_hex2;
+
+    } else if ('A' <= ch && ch <= 'F') {
+        this->char_code = 16 * (10 + int(ch) - int('A'));
+        this->state = st_name_hex2;
+
+    } else if ('a' <= ch && ch <= 'f') {
+        this->char_code = 16 * (10 + int(ch) - int('a'));
+        this->state = st_name_hex2;
+
     } else {
-        throw std::logic_error(
-            "INTERNAL ERROR: invalid state while reading token");
+        QTC::TC("qpdf", "QPDFTokenizer bad name 1");
+        this->error_message = "name with stray # will not work with PDF >= 1.2";
+        // Use null to encode a bad # -- this is reversed
+        // in QPDF_Name::normalizeName.
+        this->val += '\0';
+        this->state = st_name;
+        inName(ch); // ch is not a hex digit: treat it as a normal name character or terminator
+    }
+}
+
+void
+QPDFTokenizer::inNameHex2(char ch)
+{
+    // Second character after '#': char_code already holds 16 * the first
+    // hex digit, and hex_char holds that first character verbatim.
+    int low_nibble = -1;
+    if (ch >= '0' && ch <= '9') {
+        low_nibble = int(ch) - int('0');
+    } else if (ch >= 'A' && ch <= 'F') {
+        low_nibble = 10 + int(ch) - int('A');
+    } else if (ch >= 'a' && ch <= 'f') {
+        low_nibble = 10 + int(ch) - int('a');
+    }
+    this->state = st_name; // every outcome resumes ordinary name parsing
+    if (low_nibble < 0) {
+        QTC::TC("qpdf", "QPDFTokenizer bad name 2");
+        this->error_message = "name with stray # will not work with PDF >= 1.2";
+        // Use null to encode a bad # -- this is reversed
+        // in QPDF_Name::normalizeName.
+        this->val += '\0';
+        this->val += this->hex_char;
+        inName(ch);
+        return;
+    }
+    this->char_code += low_nibble;
+    if (this->char_code != 0) {
+        this->val += char(this->char_code);
+        return;
+    }
+    QTC::TC("qpdf", "QPDFTokenizer null in name");
+    this->error_message = "null character not allowed in name token";
+    this->val += "#00";
+    this->bad = true;
+}
+
+void
+QPDFTokenizer::inSign(char ch)
+{ // val holds a lone '+' or '-'; decide whether a number follows.
+    if (QUtil::is_digit(ch)) {
+        this->state = st_number;
+        this->val += ch;
+    } else if (ch == '.') {
+        this->state = st_decimal;
+        this->val += ch;
+    } else {
+        this->state = st_literal; // not a number after all
+        inLiteral(ch); // let the literal handler append ch or end the token
     }
+}
 
-    if ((this->m->state == st_token_ready) && (this->m->type == tt_word)) {
-        resolveLiteral();
+void
+QPDFTokenizer::inDecimal(char ch)
+{ // val ends in '.' with no digit seen yet; a digit makes it a real.
+    if (QUtil::is_digit(ch)) {
+        this->state = st_real;
+        this->val += ch;
+    } else {
+        this->state = st_literal; // a '.' without digits is not a number
+        inLiteral(ch); // let the literal handler append ch or end the token
     }
+}
+
+void
+QPDFTokenizer::inNumber(char ch)
+{
+    // A delimiter completes the integer and is left for the caller to unread.
+    if (!QUtil::is_digit(ch) && (ch != '.') && isDelimiter(ch)) {
+        this->char_to_unread = ch;
+        this->in_token = false;
+        this->type = tt_integer;
+        this->state = st_token_ready;
+        return;
+    }
+    // Everything else is appended: a digit keeps the state, '.' turns the
+    // number into a real, and any other character demotes it to a literal.
+    if (!QUtil::is_digit(ch)) {
+        this->state = (ch == '.') ? st_real : st_literal;
+    }
+    this->val += ch;
+}
+
+void
+QPDFTokenizer::inReal(char ch)
+{
+    // Delimiter ends the real (and is handed back); non-digits make it a literal.
+    bool const digit = QUtil::is_digit(ch);
+    if (!digit && isDelimiter(ch)) {
+        this->char_to_unread = ch;
+        this->in_token = false;
+        this->type = tt_real;
+        this->state = st_token_ready;
+        return;
+    }
+    this->state = digit ? this->state : st_literal;
+    this->val += ch;
+}
+void
+QPDFTokenizer::inStringEscape(char ch)
+{ // Character directly after a backslash inside a literal string.
+    this->state = st_in_string; // default next state; two cases below override it
+    switch (ch) {
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+        this->state = st_char_code;
+        this->char_code = 0;
+        this->digit_count = 0;
+        inCharCode(ch); // ch is already the first octal digit
+        return;
+
+    case 'n':
+        this->val += '\n';
+        return;
+
+    case 'r':
+        this->val += '\r';
+        return;
+
+    case 't':
+        this->val += '\t';
+        return;
+
+    case 'b':
+        this->val += '\b';
+        return;
 
-    if (!(betweenTokens() ||
-          ((this->m->state == st_token_ready) && this->m->unread_char))) {
-        this->m->raw_val += orig_ch;
+    case 'f':
+        this->val += '\f';
+        return;
+
+    case '\n': // backslash + LF: nothing is added to val
+        return;
+
+    case '\r': // backslash + CR: nothing is added; a following LF is dropped as well
+        this->state = st_string_after_cr;
+        return;
+
+    default:
+        // PDF spec says backslash is ignored before anything else
+        this->val += ch;
+        return;
+    }
+}
+
+void
+QPDFTokenizer::inStringAfterCR(char ch)
+{
+    // The LF of a CR LF pair is dropped; anything else is normal content.
+    bool const lf_after_cr = (ch == '\n');
+    this->state = st_in_string;
+    if (!lf_after_cr) { inString(ch); }
+}
+
+void
+QPDFTokenizer::inLt(char ch)
+{
+    // After one '<': a second gives "<<"; otherwise ch starts a hex string.
+    if (ch != '<') {
+        this->state = st_in_hexstring;
+        inHexstring(ch);
+    } else {
+        this->state = st_token_ready;
+        this->type = tt_dict_open;
+        this->val += "<<";
+    }
+}
+
+void
+QPDFTokenizer::inGt(char ch)
+{
+    this->state = st_token_ready;
+    if (ch == '>') {
+        this->type = tt_dict_close;
+        this->val += ">>";
+        return;
+    }
+    // A lone '>' is an error; ch belongs to what follows and gets unread.
+    this->type = tt_bad;
+    this->val += ">";
+    QTC::TC("qpdf", "QPDFTokenizer bad >");
+    this->char_to_unread = ch;
+    this->in_token = false;
+    this->error_message = "unexpected >";
+}
+
+void
+QPDFTokenizer::inLiteral(char ch)
+{
+    if (!isDelimiter(ch)) {
+        this->val += ch;
+        return;
+    }
+    // Whitespace or another delimiter ends the word.  The terminator is
+    // handed back to be unread even when it is ignorable whitespace: it may
+    // be the newline following a stream keyword, and the stream-reading
+    // code could break on some files if it were consumed here.
+    this->state = st_token_ready;
+    this->char_to_unread = ch;
+    this->in_token = false;
+    if ((this->val == "true") || (this->val == "false")) {
+        this->type = tt_bool;
+    } else if (this->val == "null") {
+        this->type = tt_null;
+    } else {
+        this->type = tt_word;
+    }
+}
+
+void
+QPDFTokenizer::inHexstring(char ch)
+{
+    // Between bytes of a <...> hex string: expect the high nibble of the
+    // next byte, the closing '>', or whitespace.
+    int high_nibble = -1;
+    if (ch >= '0' && ch <= '9') {
+        high_nibble = int(ch) - int('0');
+    } else if (ch >= 'A' && ch <= 'F') {
+        high_nibble = 10 + int(ch) - int('A');
+    } else if (ch >= 'a' && ch <= 'f') {
+        high_nibble = 10 + int(ch) - int('a');
+    }
+
+    if (high_nibble >= 0) {
+        // Park the half-finished byte until the low nibble arrives.
+        this->char_code = 16 * high_nibble;
+        this->state = st_in_hexstring_2nd;
+    } else if (ch == '>') {
+        this->state = st_token_ready;
+        this->type = tt_string;
+    } else if (!isSpace(ch)) {
+        this->type = tt_bad;
+        QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
+        this->error_message =
+            std::string("invalid character (") + ch + ") in hexstring";
+        this->state = st_token_ready;
+    }
+    // Whitespace matches no branch above and is skipped.
+}
+
+void
+QPDFTokenizer::inHexstring2nd(char ch)
+{
+    // char_code holds 16 * the high nibble; ch should supply the low nibble
+    // of the same byte.
+    int low_nibble = -1;
+    if (ch >= '0' && ch <= '9') {
+        low_nibble = int(ch) - int('0');
+    } else if (ch >= 'A' && ch <= 'F') {
+        low_nibble = 10 + int(ch) - int('A');
+    } else if (ch >= 'a' && ch <= 'f') {
+        low_nibble = 10 + int(ch) - int('a');
+    }
+
+    if (low_nibble >= 0) {
+        this->val += char(this->char_code + low_nibble);
+        this->state = st_in_hexstring;
+    } else if (ch == '>') {
+        // PDF spec says odd hexstrings have implicit trailing 0.
+        this->val += char(this->char_code);
+        this->state = st_token_ready;
+        this->type = tt_string;
+    } else if (!isSpace(ch)) {
+        this->type = tt_bad;
+        QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character");
+        this->error_message =
+            std::string("invalid character (") + ch + ") in hexstring";
+        this->state = st_token_ready;
+    }
+    // Whitespace matches no branch above: it is skipped and the pending
+    // high nibble stays in char_code.
+}
+
+void
+QPDFTokenizer::inCharCode(char ch)
+{ // Accumulates a \d, \dd or \ddd octal escape inside a literal string.
+    bool const is_octal = ('0' <= ch) && (ch <= '7');
+    if (is_octal) {
+        this->char_code = 8 * this->char_code + (int(ch) - int('0'));
+    }
+    if (is_octal && (++(this->digit_count) < 3)) {
+        return; // \d or \dd so far; an escape may have up to three digits
+    }
+    // Escape finished.  PDF Spec says to ignore high-order overflow.
+    this->val += char(this->char_code % 256);
+    this->state = st_in_string;
+    if (!is_octal) { inString(ch); } // ch ended the escape: it is ordinary string content, not dropped
+}
+
+void
+QPDFTokenizer::inInlineImage(char ch)
+{ // Collects raw image bytes until inline_image_bytes of them have been stored.
+    this->val += ch;
+    if (this->val.length() == this->inline_image_bytes) { // byte count was computed by findEI
+        QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
+        this->type = tt_inline_image;
+        this->inline_image_bytes = 0;
+        this->state = st_token_ready;
     }
 }
 
 void
 QPDFTokenizer::presentEOF()
 {
-    if (this->m->state == st_literal) {
+    switch (this->state) { // how EOF is reported depends on what was being read
+    case st_name:
+    case st_name_hex1:
+    case st_name_hex2:
+    case st_number:
+    case st_real:
+    case st_sign:
+    case st_decimal:
+    case st_literal:
         QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
-        resolveLiteral();
-    } else if (
-        (this->m->include_ignorable) && (this->m->state == st_in_space)) {
-        this->m->type = tt_space;
-    } else if (
-        (this->m->include_ignorable) && (this->m->state == st_in_comment)) {
-        this->m->type = tt_comment;
-    } else if (betweenTokens()) {
-        this->m->type = tt_eof;
-    } else if (this->m->state != st_token_ready) {
+        // Push any delimiter to the state machine to finish off the final
+        // token.
+        presentCharacter('\f');
+        this->in_token = true; // keeps getToken() from reporting the synthetic '\f' as a char to unread
+        break;
+
+    case st_top:
+    case st_before_token:
+        this->type = tt_eof;
+        break;
+
+    case st_in_space:
+        this->type = this->include_ignorable ? tt_space : tt_eof;
+        break;
+
+    case st_in_comment: // NOTE(review): the code this replaces gave tt_eof when include_ignorable is false -- confirm tt_bad is intended
+        this->type = this->include_ignorable ? tt_comment : tt_bad;
+        break;
+
+    case st_token_ready: // a token is already waiting; leave it untouched
+        break;
+
+    default: // EOF in the middle of a token that cannot end here
         QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
-        this->m->type = tt_bad;
-        this->m->error_message = "EOF while reading token";
+        this->type = tt_bad;
+        this->error_message = "EOF while reading token";
     }
-
-    this->m->state = st_token_ready;
+    this->state = st_token_ready; // every path leaves a token for getToken()
 }
 
 void
 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
 {
-    if (this->m->state != st_top) {
+    if (this->state != st_before_token) { // only valid before any character of a token was presented
         throw std::logic_error("QPDFTokenizer::expectInlineImage called"
                                " when tokenizer is in improper state");
     }
     findEI(input);
-    this->m->state = st_inline_image;
+    this->before_token = false;
+    this->in_token = true; // every byte presented from now on is also recorded in raw_val
+    this->state = st_inline_image;
 }
 
 void
@@ -537,7 +874,7 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
         if (!input->findFirst("EI", input->tell(), 0, f)) {
             break;
         }
-        this->m->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
+        this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
 
         QPDFTokenizer check;
         bool found_bad = false;
@@ -610,19 +947,16 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
 bool
 QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
 {
-    bool ready = (this->m->state == st_token_ready);
-    unread_char = this->m->unread_char;
-    ch = this->m->char_to_unread;
+    bool ready = (this->state == st_token_ready);
+    unread_char = !this->in_token && !this->before_token;
+    ch = this->char_to_unread;
     if (ready) {
-        if (this->m->type == tt_bad) {
-            this->m->val = this->m->raw_val;
-        }
-        token = Token(
-            this->m->type,
-            this->m->val,
-            this->m->raw_val,
-            this->m->error_message);
-        this->m->reset();
+        token = (this->type == tt_bad)
+            ? Token(
+                  this->type, this->raw_val, this->raw_val, this->error_message)
+            : Token(this->type, this->val, this->raw_val, this->error_message);
+
+        this->reset();
     }
     return ready;
 }
@@ -630,11 +964,7 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
 bool
 QPDFTokenizer::betweenTokens()
 {
-    return (
-        (this->m->state == st_top) ||
-        ((!this->m->include_ignorable) &&
-         ((this->m->state == st_in_comment) ||
-          (this->m->state == st_in_space))));
+    return this->before_token;
 }
 
 QPDFTokenizer::Token
@@ -644,49 +974,46 @@ QPDFTokenizer::readToken(
     bool allow_bad,
     size_t max_len)
 {
-    qpdf_offset_t offset = input->tell();
-    Token token;
-    bool unread_char;
-    char char_to_unread;
-    bool presented_eof = false;
-    while (!getToken(token, unread_char, char_to_unread)) {
+    qpdf_offset_t offset = input->fastTell();
+
+    while (this->state != st_token_ready) {
         char ch;
-        if (input->read(&ch, 1) == 0) {
-            if (!presented_eof) {
-                presentEOF();
-                presented_eof = true;
-                if ((this->m->type == tt_eof) && (!this->m->allow_eof)) {
-                    // Nothing in the qpdf library calls readToken
-                    // without allowEOF anymore, so this case is not
-                    // exercised.
-                    this->m->type = tt_bad;
-                    this->m->error_message = "unexpected EOF";
-                    offset = input->getLastOffset();
-                }
-            } else {
-                throw std::logic_error(
-                    "getToken returned false after presenting EOF");
+        if (!input->fastRead(ch)) {
+            presentEOF();
+
+            if ((this->type == tt_eof) && (!this->allow_eof)) {
+                // Nothing in the qpdf library calls readToken
+                // without allowEOF anymore, so this case is not
+                // exercised.
+                this->type = tt_bad;
+                this->error_message = "unexpected EOF";
+                offset = input->getLastOffset();
             }
         } else {
-            presentCharacter(ch);
-            if (betweenTokens() && (input->getLastOffset() == offset)) {
+            handleCharacter(ch);
+            if (this->before_token) {
                 ++offset;
             }
-            if (max_len && (this->m->raw_val.length() >= max_len) &&
-                (this->m->state != st_token_ready)) {
+            if (this->in_token) {
+                this->raw_val += ch;
+            }
+            if (max_len && (this->raw_val.length() >= max_len) &&
+                (this->state != st_token_ready)) {
                 // terminate this token now
                 QTC::TC("qpdf", "QPDFTokenizer block long token");
-                this->m->type = tt_bad;
-                this->m->state = st_token_ready;
-                this->m->error_message =
+                this->type = tt_bad;
+                this->state = st_token_ready;
+                this->error_message =
                     "exceeded allowable length while reading token";
             }
         }
     }
 
-    if (unread_char) {
-        input->unreadCh(char_to_unread);
-    }
+    Token token;
+    bool unread_char;
+    char char_to_unread;
+    getToken(token, unread_char, char_to_unread);
+    input->fastUnread(unread_char);
 
     if (token.getType() != tt_eof) {
         input->setLastOffset(offset);