Integrate names into state machine in QPDFTokenizer

author: m-holger <m-holger@kubitscheck.org> 2022-08-23 01:14:43 +0200
committer: m-holger <m-holger@kubitscheck.org> 2022-08-25 12:26:38 +0200
commit: 931fbb615623f00de0942f12e3e5b2b6e141b09f (patch)
tree: 8bcaed30fda487a532355b7f11833f1ebe887f67 /libqpdf/QPDFTokenizer.cc
parent: a3f3238f371f07cd2b2e1a96753cde6590712dc5 (diff)
download: qpdf-931fbb615623f00de0942f12e3e5b2b6e141b09f.tar.zst
1 file changed, 114 insertions, 43 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index df148c10..a35fa258 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -85,6 +85,7 @@ QPDFTokenizer::reset()
     char_to_unread = '\0';
     inline_image_bytes = 0;
     string_depth = 0;
+    bad = false;
 }
 
 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
@@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch)
 void
 QPDFTokenizer::resolveLiteral()
 {
-    if ((this->val.length() > 0) && (this->val.at(0) == '/')) {
-        this->type = tt_name;
-        // Deal with # in name token.  Note: '/' by itself is a
-        // valid name, so don't strip leading /.  That way we
-        // don't have to deal with the empty string as a name.
-        std::string nval = "/";
-        size_t len = this->val.length();
-        for (size_t i = 1; i < len; ++i) {
-            char ch = this->val.at(i);
-            if (ch == '#') {
-                if ((i + 2 < len) && QUtil::is_hex_digit(this->val.at(i + 1)) &&
-                    QUtil::is_hex_digit(this->val.at(i + 2))) {
-                    char num[3];
-                    num[0] = this->val.at(i + 1);
-                    num[1] = this->val.at(i + 2);
-                    num[2] = '\0';
-                    char ch2 = static_cast<char>(strtol(num, nullptr, 16));
-                    if (ch2 == '\0') {
-                        this->type = tt_bad;
-                        QTC::TC("qpdf", "QPDFTokenizer null in name");
-                        this->error_message =
-                            "null character not allowed in name token";
-                        nval += "#00";
-                    } else {
-                        nval.append(1, ch2);
-                    }
-                    i += 2;
-                } else {
-                    QTC::TC("qpdf", "QPDFTokenizer bad name");
-                    this->error_message =
-                        "name with stray # will not work with PDF >= 1.2";
-                    // Use null to encode a bad # -- this is reversed
-                    // in QPDF_Name::normalizeName.
-                    nval += '\0';
-                }
-            } else {
-                nval.append(1, ch);
-            }
-        }
-        this->val.clear();
-        this->val += nval;
-    } else if (QUtil::is_number(this->val.c_str())) {
+    if (QUtil::is_number(this->val.c_str())) {
         if (this->val.find('.') != std::string::npos) {
             this->type = tt_real;
         } else {
@@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch)
         inString(ch);
         return;
 
+    case st_name:
+        inName(ch);
+        return;
+
     case st_string_after_cr:
         inStringAfterCR(ch);
         return;
@@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch)
         inHexstring2nd(ch);
         return;
 
+    case st_name_hex1:
+        inNameHex1(ch);
+        return;
+
+    case st_name_hex2:
+        inNameHex2(ch);
+        return;
+
     case (st_token_ready):
         inTokenReady(ch);
         return;
@@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch)
         this->val += ch;
         return;
 
+    case '/':
+        this->state = st_name;
+        this->val += ch;
+        return;
+
     default:
         this->state = st_literal;
         this->val += ch;
@@ -433,6 +410,93 @@ QPDFTokenizer::inString(char ch)
 }
 
 void
+QPDFTokenizer::inName(char ch)
+{
+    if (isDelimiter(ch)) {
+        // A C-locale whitespace character or delimiter terminates the
+        // token.  It is important to unread the whitespace
+        // character even though it is ignored since it may be the
+        // newline after a stream keyword.  Removing it here could
+        // make the stream-reading code break on some files,
+        // though not on any files in the test suite as of this
+        // writing.
+
+        this->type = this->bad ? tt_bad : tt_name;
+        this->unread_char = true;
+        this->char_to_unread = ch;
+        this->state = st_token_ready;
+    } else if (ch == '#') {
+        this->char_code = 0;
+        this->state = st_name_hex1;
+    } else {
+        this->val += ch;
+    }
+}
+
+void
+QPDFTokenizer::inNameHex1(char ch)
+{
+    this->hex_char = ch;
+
+    if ('0' <= ch && ch <= '9') {
+        this->char_code = 16 * (int(ch) - int('0'));
+        this->state = st_name_hex2;
+
+    } else if ('A' <= ch && ch <= 'F') {
+        this->char_code = 16 * (10 + int(ch) - int('A'));
+        this->state = st_name_hex2;
+
+    } else if ('a' <= ch && ch <= 'f') {
+        this->char_code = 16 * (10 + int(ch) - int('a'));
+        this->state = st_name_hex2;
+
+    } else {
+        QTC::TC("qpdf", "QPDFTokenizer bad name 1");
+        this->error_message = "name with stray # will not work with PDF >= 1.2";
+        // Use null to encode a bad # -- this is reversed
+        // in QPDF_Name::normalizeName.
+        this->val += '\0';
+        this->state = st_name;
+        inName(ch);
+    }
+}
+
+void
+QPDFTokenizer::inNameHex2(char ch)
+{
+    if ('0' <= ch && ch <= '9') {
+        this->char_code += int(ch) - int('0');
+
+    } else if ('A' <= ch && ch <= 'F') {
+        this->char_code += 10 + int(ch) - int('A');
+
+    } else if ('a' <= ch && ch <= 'f') {
+        this->char_code += 10 + int(ch) - int('a');
+
+    } else {
+        QTC::TC("qpdf", "QPDFTokenizer bad name 2");
+        this->error_message = "name with stray # will not work with PDF >= 1.2";
+        // Use null to encode a bad # -- this is reversed
+        // in QPDF_Name::normalizeName.
+        this->val += '\0';
+        this->val += this->hex_char;
+        this->state = st_name;
+        inName(ch);
+        return;
+    }
+    if (this->char_code == 0) {
+        QTC::TC("qpdf", "QPDFTokenizer null in name");
+        this->error_message = "null character not allowed in name token";
+        this->val += "#00";
+        this->state = st_name;
+        this->bad = true;
+    } else {
+        this->val += char(this->char_code);
+        this->state = st_name;
+    }
+}
+
+void
 QPDFTokenizer::inStringEscape(char ch)
 {
     this->state = st_in_string;
@@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch)
 void
 QPDFTokenizer::presentEOF()
 {
-    if (this->state == st_literal) {
+    if (this->state == st_name || this->state == st_name_hex1 ||
+        this->state == st_name_hex2) {
+        // Push any delimiter to the state machine to finish off the final
+        // token.
+        presentCharacter('\f');
+        this->unread_char = false;
+    } else if (this->state == st_literal) {
         QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
         resolveLiteral();
+
     } else if ((this->include_ignorable) && (this->state == st_in_space)) {
         this->type = tt_space;
     } else if ((this->include_ignorable) && (this->state == st_in_comment)) {
author	m-holger <m-holger@kubitscheck.org>	2022-08-23 01:14:43 +0200
committer	m-holger <m-holger@kubitscheck.org>	2022-08-25 12:26:38 +0200
commit	931fbb615623f00de0942f12e3e5b2b6e141b09f (patch)
tree	8bcaed30fda487a532355b7f11833f1ebe887f67 /libqpdf/QPDFTokenizer.cc
parent	a3f3238f371f07cd2b2e1a96753cde6590712dc5 (diff)
download	qpdf-931fbb615623f00de0942f12e3e5b2b6e141b09f.tar.zst