From 931fbb615623f00de0942f12e3e5b2b6e141b09f Mon Sep 17 00:00:00 2001
From: m-holger <m-holger@kubitscheck.org>
Date: Tue, 23 Aug 2022 00:14:43 +0100
Subject: Integrate names into state machine in QPDFTokenizer

---
 include/qpdf/QPDFTokenizer.hh |   9 ++-
 libqpdf/QPDFTokenizer.cc      | 157 ++++++++++++++++++++++++++++++------------
 qpdf/qpdf.testcov             |   3 +-
 3 files changed, 124 insertions(+), 45 deletions(-)
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index 204667a8..d723ff6e 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -203,6 +203,7 @@ class QPDFTokenizer
         st_in_hexstring,
         st_in_string,
         st_in_hexstring_2nd,
+        st_name,
         st_literal,
         st_in_space,
         st_in_comment,
@@ -212,6 +213,8 @@ class QPDFTokenizer
         st_lt,
         st_gt,
         st_inline_image,
+        st_name_hex1,
+        st_name_hex2,
         st_token_ready
     };
 
@@ -220,6 +223,7 @@ class QPDFTokenizer
     void inSpace(char);
     void inComment(char);
     void inString(char);
+    void inName(char);
     void inLt(char);
     void inGt(char);
     void inStringAfterCR(char);
@@ -230,7 +234,8 @@ class QPDFTokenizer
     void inHexstring2nd(char);
     void inInlineImage(char);
     void inTokenReady(char);
-
+    void inNameHex1(char);
+    void inNameHex2(char);
     void reset();
 
     // Lexer state
@@ -247,10 +252,12 @@ class QPDFTokenizer
     bool unread_char;
     char char_to_unread;
     size_t inline_image_bytes;
+    bool bad;
 
     // State for strings
     int string_depth;
     int char_code;
+    char hex_char;
     int digit_count;
 };
 
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index df148c10..a35fa258 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -85,6 +85,7 @@ QPDFTokenizer::reset()
     char_to_unread = '\0';
     inline_image_bytes = 0;
     string_depth = 0;
+    bad = false;
 }
 
 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
@@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch)
 void
 QPDFTokenizer::resolveLiteral()
 {
-    if ((this->val.length() > 0) && (this->val.at(0) == '/')) {
-        this->type = tt_name;
-        // Deal with # in name token.  Note: '/' by itself is a
-        // valid name, so don't strip leading /.  That way we
-        // don't have to deal with the empty string as a name.
-        std::string nval = "/";
-        size_t len = this->val.length();
-        for (size_t i = 1; i < len; ++i) {
-            char ch = this->val.at(i);
-            if (ch == '#') {
-                if ((i + 2 < len) && QUtil::is_hex_digit(this->val.at(i + 1)) &&
-                    QUtil::is_hex_digit(this->val.at(i + 2))) {
-                    char num[3];
-                    num[0] = this->val.at(i + 1);
-                    num[1] = this->val.at(i + 2);
-                    num[2] = '\0';
-                    char ch2 = static_cast<char>(strtol(num, nullptr, 16));
-                    if (ch2 == '\0') {
-                        this->type = tt_bad;
-                        QTC::TC("qpdf", "QPDFTokenizer null in name");
-                        this->error_message =
-                            "null character not allowed in name token";
-                        nval += "#00";
-                    } else {
-                        nval.append(1, ch2);
-                    }
-                    i += 2;
-                } else {
-                    QTC::TC("qpdf", "QPDFTokenizer bad name");
-                    this->error_message =
-                        "name with stray # will not work with PDF >= 1.2";
-                    // Use null to encode a bad # -- this is reversed
-                    // in QPDF_Name::normalizeName.
-                    nval += '\0';
-                }
-            } else {
-                nval.append(1, ch);
-            }
-        }
-        this->val.clear();
-        this->val += nval;
-    } else if (QUtil::is_number(this->val.c_str())) {
+    if (QUtil::is_number(this->val.c_str())) {
         if (this->val.find('.') != std::string::npos) {
             this->type = tt_real;
         } else {
@@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch)
         inString(ch);
         return;
 
+    case st_name:
+        inName(ch);
+        return;
+
     case st_string_after_cr:
         inStringAfterCR(ch);
         return;
@@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch)
         inHexstring2nd(ch);
         return;
 
+    case st_name_hex1:
+        inNameHex1(ch);
+        return;
+
+    case st_name_hex2:
+        inNameHex2(ch);
+        return;
+
     case (st_token_ready):
         inTokenReady(ch);
         return;
@@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch)
         this->val += ch;
         return;
 
+    case '/':
+        this->state = st_name;
+        this->val += ch;
+        return;
+
     default:
         this->state = st_literal;
         this->val += ch;
@@ -432,6 +409,93 @@ QPDFTokenizer::inString(char ch)
     }
 }
 
+// Name state: append ordinary characters to val, hand '#' to the hex
+// escape states, and finish the token when a delimiter arrives.
+void
+QPDFTokenizer::inName(char ch)
+{
+    if (isDelimiter(ch)) {
+        // A whitespace character or delimiter terminates the name.  It is
+        // unread rather than dropped; the original note warns that dropping
+        // it could break the stream-reading code on some files.
+        // bad is set by inNameHex2 when the name contains #00.
+        this->type = this->bad ? tt_bad : tt_name;
+        this->unread_char = true;
+        this->char_to_unread = ch;
+        this->state = st_token_ready;
+    } else if (ch == '#') {
+        // Possible #xx escape: clear the code built by the hex states.
+        this->char_code = 0;
+        this->state = st_name_hex1;
+    } else {
+        // Ordinary name character: append it unchanged.
+        this->val += ch;
+    }
+}
+
+// First character after '#' in a name.  A hex digit becomes the high
+// nibble of char_code and moves on to st_name_hex2; anything else is stray.
+void
+QPDFTokenizer::inNameHex1(char ch)
+{
+    // Keep the raw character; inNameHex2 appends it if the escape fails.
+    this->hex_char = ch;
+    if ('0' <= ch && ch <= '9') {
+        this->char_code = 16 * (int(ch) - int('0'));
+        this->state = st_name_hex2;
+    } else if ('A' <= ch && ch <= 'F') {
+        this->char_code = 16 * (10 + int(ch) - int('A'));
+        this->state = st_name_hex2;
+    } else if ('a' <= ch && ch <= 'f') {
+        this->char_code = 16 * (10 + int(ch) - int('a'));
+        this->state = st_name_hex2;
+    } else {
+        QTC::TC("qpdf", "QPDFTokenizer bad name 1");
+        this->error_message = "name with stray # will not work with PDF >= 1.2";
+        // Stray '#': use null to encode it -- this is reversed in
+        // QPDF_Name::normalizeName.  Return to st_name before handing ch
+        // to inName, which may itself change state.
+        this->val += '\0';
+        this->state = st_name;
+        inName(ch);
+    }
+}
+
+// Second character after '#' in a name.  A hex digit completes the #xx
+// escape by adding the low nibble to char_code; anything else is stray.
+void
+QPDFTokenizer::inNameHex2(char ch)
+{
+    if ('0' <= ch && ch <= '9') {
+        this->char_code += int(ch) - int('0');
+    } else if ('A' <= ch && ch <= 'F') {
+        this->char_code += 10 + int(ch) - int('A');
+    } else if ('a' <= ch && ch <= 'f') {
+        this->char_code += 10 + int(ch) - int('a');
+    } else {
+        QTC::TC("qpdf", "QPDFTokenizer bad name 2");
+        this->error_message = "name with stray # will not work with PDF >= 1.2";
+        // Stray '#': encode it as null (reversed in
+        // QPDF_Name::normalizeName), keep the first digit, and reprocess ch.
+        this->val += '\0';
+        this->val += this->hex_char;
+        this->state = st_name;
+        inName(ch);
+        return;
+    }
+    if (this->char_code == 0) {
+        // #00 is rejected: keep it as literal text and mark the token bad.
+        QTC::TC("qpdf", "QPDFTokenizer null in name");
+        this->error_message = "null character not allowed in name token";
+        this->val += "#00";
+        this->state = st_name;
+        this->bad = true;
+    } else {
+        this->val += char(this->char_code);
+        this->state = st_name;
+    }
+}
+
 void
 QPDFTokenizer::inStringEscape(char ch)
 {
@@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch)
 void
 QPDFTokenizer::presentEOF()
 {
-    if (this->state == st_literal) {
+    if (this->state == st_name || this->state == st_name_hex1 ||
+        this->state == st_name_hex2) {
+        // Push any delimiter to the state machine to finish off the final
+        // token.
+        presentCharacter('\f');
+        this->unread_char = false;
+    } else if (this->state == st_literal) {
         QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
         resolveLiteral();
+
     } else if ((this->include_ignorable) && (this->state == st_in_space)) {
         this->type = tt_space;
     } else if ((this->include_ignorable) && (this->state == st_in_comment)) {
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 81edf947..9e106902 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -68,7 +68,8 @@ QPDFTokenizer bad > 0
 QPDFTokenizer bad hexstring character 0
 QPDFTokenizer bad hexstring 2nd character 0
 QPDFTokenizer null in name 0
-QPDFTokenizer bad name 0
+QPDFTokenizer bad name 1 0
+QPDFTokenizer bad name 2 0
 QPDF_Stream invalid filter 0
 QPDF UseOutlines but no Outlines 0
 QPDFObjectHandle makeDirect loop 0
-- 
cgit v1.2.3-54-g00ecf