From 931fbb615623f00de0942f12e3e5b2b6e141b09f Mon Sep 17 00:00:00 2001 From: m-holger Date: Tue, 23 Aug 2022 00:14:43 +0100 Subject: Integrate names into state machine in QPDFTokenizer --- include/qpdf/QPDFTokenizer.hh | 9 ++- libqpdf/QPDFTokenizer.cc | 157 ++++++++++++++++++++++++++++++------------ qpdf/qpdf.testcov | 3 +- 3 files changed, 124 insertions(+), 45 deletions(-) diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index 204667a8..d723ff6e 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -203,6 +203,7 @@ class QPDFTokenizer st_in_hexstring, st_in_string, st_in_hexstring_2nd, + st_name, st_literal, st_in_space, st_in_comment, @@ -212,6 +213,8 @@ class QPDFTokenizer st_lt, st_gt, st_inline_image, + st_name_hex1, + st_name_hex2, st_token_ready }; @@ -220,6 +223,7 @@ class QPDFTokenizer void inSpace(char); void inComment(char); void inString(char); + void inName(char); void inLt(char); void inGt(char); void inStringAfterCR(char); @@ -230,7 +234,8 @@ class QPDFTokenizer void inHexstring2nd(char); void inInlineImage(char); void inTokenReady(char); - + void inNameHex1(char); + void inNameHex2(char); void reset(); // Lexer state @@ -247,10 +252,12 @@ class QPDFTokenizer bool unread_char; char char_to_unread; size_t inline_image_bytes; + bool bad; // State for strings int string_depth; int char_code; + char hex_char; int digit_count; }; diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index df148c10..a35fa258 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -85,6 +85,7 @@ QPDFTokenizer::reset() char_to_unread = '\0'; inline_image_bytes = 0; string_depth = 0; + bad = false; } QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : @@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch) void QPDFTokenizer::resolveLiteral() { - if ((this->val.length() > 0) && (this->val.at(0) == '/')) { - this->type = tt_name; - // Deal with # in name token. 
Note: '/' by itself is a - // valid name, so don't strip leading /. That way we - // don't have to deal with the empty string as a name. - std::string nval = "/"; - size_t len = this->val.length(); - for (size_t i = 1; i < len; ++i) { - char ch = this->val.at(i); - if (ch == '#') { - if ((i + 2 < len) && QUtil::is_hex_digit(this->val.at(i + 1)) && - QUtil::is_hex_digit(this->val.at(i + 2))) { - char num[3]; - num[0] = this->val.at(i + 1); - num[1] = this->val.at(i + 2); - num[2] = '\0'; - char ch2 = static_cast<char>(strtol(num, nullptr, 16)); - if (ch2 == '\0') { - this->type = tt_bad; - QTC::TC("qpdf", "QPDFTokenizer null in name"); - this->error_message = - "null character not allowed in name token"; - nval += "#00"; - } else { - nval.append(1, ch2); - } - i += 2; - } else { - QTC::TC("qpdf", "QPDFTokenizer bad name"); - this->error_message = - "name with stray # will not work with PDF >= 1.2"; - // Use null to encode a bad # -- this is reversed - // in QPDF_Name::normalizeName. - nval += '\0'; - } - } else { - nval.append(1, ch); - } - } - this->val.clear(); - this->val += nval; - } else if (QUtil::is_number(this->val.c_str())) { + if (QUtil::is_number(this->val.c_str())) { if (this->val.find('.') != std::string::npos) { this->type = tt_real; } else { @@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch) inString(ch); return; + case st_name: + inName(ch); + return; + case st_string_after_cr: inStringAfterCR(ch); return; @@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch) inHexstring2nd(ch); return; + case st_name_hex1: + inNameHex1(ch); + return; + + case st_name_hex2: + inNameHex2(ch); + return; + case (st_token_ready): inTokenReady(ch); return; @@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch) this->val += ch; return; + case '/': + this->state = st_name; + this->val += ch; + return; + default: this->state = st_literal; this->val += ch; @@ -432,6 +409,93 @@ QPDFTokenizer::inString(char ch) } } +void +QPDFTokenizer::inName(char ch) +{ + if
(isDelimiter(ch)) { + // A C-locale whitespace character or delimiter terminates + // token. It is important to unread the whitespace + // character even though it is ignored since it may be the + // newline after a stream keyword. Removing it here could + // make the stream-reading code break on some files, + // though not on any files in the test suite as of this + // writing. + + this->type = this->bad ? tt_bad : tt_name; + this->unread_char = true; + this->char_to_unread = ch; + this->state = st_token_ready; + } else if (ch == '#') { + this->char_code = 0; + this->state = st_name_hex1; + } else { + this->val += ch; + } +} + +void +QPDFTokenizer::inNameHex1(char ch) +{ + this->hex_char = ch; + + if ('0' <= ch && ch <= '9') { + this->char_code = 16 * (int(ch) - int('0')); + this->state = st_name_hex2; + + } else if ('A' <= ch && ch <= 'F') { + this->char_code = 16 * (10 + int(ch) - int('A')); + this->state = st_name_hex2; + + } else if ('a' <= ch && ch <= 'f') { + this->char_code = 16 * (10 + int(ch) - int('a')); + this->state = st_name_hex2; + + } else { + QTC::TC("qpdf", "QPDFTokenizer bad name 1"); + this->error_message = "name with stray # will not work with PDF >= 1.2"; + // Use null to encode a bad # -- this is reversed + // in QPDF_Name::normalizeName. + this->val += '\0'; + this->state = st_name; + inName(ch); + } +} + +void +QPDFTokenizer::inNameHex2(char ch) +{ + if ('0' <= ch && ch <= '9') { + this->char_code += int(ch) - int('0'); + + } else if ('A' <= ch && ch <= 'F') { + this->char_code += 10 + int(ch) - int('A'); + + } else if ('a' <= ch && ch <= 'f') { + this->char_code += 10 + int(ch) - int('a'); + + } else { + QTC::TC("qpdf", "QPDFTokenizer bad name 2"); + this->error_message = "name with stray # will not work with PDF >= 1.2"; + // Use null to encode a bad # -- this is reversed + // in QPDF_Name::normalizeName. 
+ this->val += '\0'; + this->val += this->hex_char; + this->state = st_name; + inName(ch); + return; + } + if (this->char_code == 0) { + QTC::TC("qpdf", "QPDFTokenizer null in name"); + this->error_message = "null character not allowed in name token"; + this->val += "#00"; + this->state = st_name; + this->bad = true; + } else { + this->val += char(this->char_code); + this->state = st_name; + } +} + void QPDFTokenizer::inStringEscape(char ch) { @@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch) void QPDFTokenizer::presentEOF() { - if (this->state == st_literal) { + if (this->state == st_name || this->state == st_name_hex1 || + this->state == st_name_hex2) { + // Push any delimiter to the state machine to finish off the final + // token. + presentCharacter('\f'); + this->unread_char = false; + } else if (this->state == st_literal) { QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); resolveLiteral(); + } else if ((this->include_ignorable) && (this->state == st_in_space)) { this->type = tt_space; } else if ((this->include_ignorable) && (this->state == st_in_comment)) { diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 81edf947..9e106902 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -68,7 +68,8 @@ QPDFTokenizer bad > 0 QPDFTokenizer bad hexstring character 0 QPDFTokenizer bad hexstring 2nd character 0 QPDFTokenizer null in name 0 -QPDFTokenizer bad name 0 +QPDFTokenizer bad name 1 0 +QPDFTokenizer bad name 2 0 QPDF_Stream invalid filter 0 QPDF UseOutlines but no Outlines 0 QPDFObjectHandle makeDirect loop 0 -- cgit v1.2.3-54-g00ecf