aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf
diff options
context:
space:
mode:
authorm-holger <m-holger@kubitscheck.org>2022-08-21 21:08:58 +0200
committerm-holger <m-holger@kubitscheck.org>2022-08-25 12:26:05 +0200
commita3f3238f371f07cd2b2e1a96753cde6590712dc5 (patch)
tree35610e8c73c3cb112cd01e774ae899371a01a8c6 /libqpdf
parent6111a6a424324ed8d926852ed6ba22d4bf13fa62 (diff)
downloadqpdf-a3f3238f371f07cd2b2e1a96753cde6590712dc5.tar.zst
Split QPDFTokenizer::handleCharacter into individual methods
Diffstat (limited to 'libqpdf')
-rw-r--r--libqpdf/QPDFTokenizer.cc510
1 files changed, 287 insertions, 223 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 3b601af3..df148c10 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -217,134 +217,24 @@ QPDFTokenizer::handleCharacter(char ch)
// the character that caused a state change in the new state.
switch (this->state) {
- case (st_token_ready):
- throw std::logic_error(
- "INTERNAL ERROR: QPDF tokenizer presented character "
- "while token is waiting");
-
case st_top:
- // Note: we specifically do not use ctype here. It is
- // locale-dependent.
- if (isSpace(ch)) {
- if (this->include_ignorable) {
- this->state = st_in_space;
- this->val += ch;
- }
- return;
- }
- switch (ch) {
- case '%':
- this->state = st_in_comment;
- if (this->include_ignorable) {
- this->val += ch;
- }
- return;
-
- case '(':
- this->string_depth = 1;
- this->state = st_in_string;
- return;
-
- case '<':
- this->state = st_lt;
- return;
-
- case '>':
- this->state = st_gt;
- return;
-
- case (')'):
- this->type = tt_bad;
- QTC::TC("qpdf", "QPDFTokenizer bad )");
- this->error_message = "unexpected )";
- this->val += ch;
- this->state = st_token_ready;
- return;
-
- case '[':
- this->type = tt_array_open;
- this->state = st_token_ready;
- this->val += ch;
- return;
-
- case ']':
- this->type = tt_array_close;
- this->val += ch;
- this->state = st_token_ready;
- return;
-
- case '{':
- this->type = tt_brace_open;
- this->state = st_token_ready;
- this->val += ch;
- return;
-
- case '}':
- this->type = tt_brace_close;
- this->state = st_token_ready;
- this->val += ch;
- return;
-
- default:
- this->state = st_literal;
- this->val += ch;
- return;
- }
+ inTop(ch);
+ return;
case st_in_space:
- // We only enter this state if include_ignorable is true.
- if (!isSpace(ch)) {
- this->type = tt_space;
- this->unread_char = true;
- this->char_to_unread = ch;
- this->state = st_token_ready;
- return;
- } else {
- this->val += ch;
- return;
- }
+ inSpace(ch);
+ return;
case st_in_comment:
- if ((ch == '\r') || (ch == '\n')) {
- if (this->include_ignorable) {
- this->type = tt_comment;
- this->unread_char = true;
- this->char_to_unread = ch;
- this->state = st_token_ready;
- } else {
- this->state = st_top;
- }
- } else if (this->include_ignorable) {
- this->val += ch;
- }
+ inComment(ch);
return;
case st_lt:
- if (ch == '<') {
- this->val += "<<";
- this->type = tt_dict_open;
- this->state = st_token_ready;
- return;
- }
-
- this->state = st_in_hexstring;
- inHexstring(ch);
+ inLt(ch);
return;
case st_gt:
- if (ch == '>') {
- this->val += ">>";
- this->type = tt_dict_close;
- this->state = st_token_ready;
- } else {
- this->val += ">";
- this->type = tt_bad;
- QTC::TC("qpdf", "QPDFTokenizer bad >");
- this->error_message = "unexpected >";
- this->unread_char = true;
- this->char_to_unread = ch;
- this->state = st_token_ready;
- }
+ inGt(ch);
return;
case st_in_string:
@@ -352,107 +242,308 @@ QPDFTokenizer::handleCharacter(char ch)
return;
case st_string_after_cr:
- // CR LF in strings are either ignored or normalized to CR
- this->state = st_in_string;
- if (ch != '\n') {
- inString(ch);
- }
+ inStringAfterCR(ch);
return;
case st_string_escape:
- this->state = st_in_string;
- switch (ch) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- this->state = st_char_code;
- this->char_code = 0;
- this->digit_count = 0;
- inCharCode(ch);
- return;
+ inStringEscape(ch);
+ return;
- case 'n':
- this->val += '\n';
- return;
+ case st_char_code:
+ inCharCode(ch);
+ return;
- case 'r':
- this->val += '\r';
- return;
+ case st_literal:
+ inLiteral(ch);
+ return;
- case 't':
- this->val += '\t';
- return;
+ case st_inline_image:
+ inInlineImage(ch);
+ return;
+ this->val += ch;
- case 'b':
- this->val += '\b';
- return;
+ case st_in_hexstring:
+ inHexstring(ch);
+ return;
- case 'f':
- this->val += '\f';
- return;
+ case st_in_hexstring_2nd:
+ inHexstring2nd(ch);
+ return;
- case '\n':
- return;
+ case (st_token_ready):
+ inTokenReady(ch);
+ return;
- case '\r':
- this->state = st_string_after_cr;
- return;
+ default:
+ throw std::logic_error(
+ "INTERNAL ERROR: invalid state while reading token");
+ }
+}
+
+void
+QPDFTokenizer::inTokenReady(char ch)
+{
+ throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character "
+ "while token is waiting");
+}
- default:
- // PDF spec says backslash is ignored before anything else
+void
+QPDFTokenizer::inTop(char ch)
+{
+ // Note: we specifically do not use ctype here. It is
+ // locale-dependent.
+ if (isSpace(ch)) {
+ if (this->include_ignorable) {
+ this->state = st_in_space;
this->val += ch;
return;
}
+ return;
+ }
+ switch (ch) {
+ case '%':
+ this->state = st_in_comment;
+ if (this->include_ignorable) {
+ this->val += ch;
+ }
+ return;
- case st_char_code:
- inCharCode(ch);
+ case '(':
+ this->string_depth = 1;
+ this->state = st_in_string;
return;
- case st_literal:
- if (isDelimiter(ch)) {
- // A C-locale whitespace character or delimiter terminates
- // token. It is important to unread the whitespace
- // character even though it is ignored since it may be the
- // newline after a stream keyword. Removing it here could
- // make the stream-reading code break on some files,
- // though not on any files in the test suite as of this
- // writing.
-
- this->type = tt_word;
+ case '<':
+ this->state = st_lt;
+ return;
+
+ case '>':
+ this->state = st_gt;
+ return;
+
+ case (')'):
+ this->type = tt_bad;
+ QTC::TC("qpdf", "QPDFTokenizer bad )");
+ this->error_message = "unexpected )";
+ this->val += ch;
+ this->state = st_token_ready;
+ return;
+
+ case '[':
+ this->type = tt_array_open;
+ this->state = st_token_ready;
+ this->val += ch;
+ return;
+
+ case ']':
+ this->type = tt_array_close;
+ this->val += ch;
+ this->state = st_token_ready;
+ return;
+
+ case '{':
+ this->type = tt_brace_open;
+ this->state = st_token_ready;
+ this->val += ch;
+ return;
+
+ case '}':
+ this->type = tt_brace_close;
+ this->state = st_token_ready;
+ this->val += ch;
+ return;
+
+ default:
+ this->state = st_literal;
+ this->val += ch;
+ return;
+ }
+}
+
+void
+QPDFTokenizer::inSpace(char ch)
+{
+ // We only enter this state if include_ignorable is true.
+ if (!isSpace(ch)) {
+ this->type = tt_space;
+ this->unread_char = true;
+ this->char_to_unread = ch;
+ this->state = st_token_ready;
+ return;
+ } else {
+ this->val += ch;
+ return;
+ }
+}
+
+void
+QPDFTokenizer::inComment(char ch)
+{
+ if ((ch == '\r') || (ch == '\n')) {
+ if (this->include_ignorable) {
+ this->type = tt_comment;
this->unread_char = true;
this->char_to_unread = ch;
this->state = st_token_ready;
} else {
- this->val += ch;
+ this->state = st_top;
}
+ } else if (this->include_ignorable) {
+ this->val += ch;
+ }
+}
+
+void
+QPDFTokenizer::inString(char ch)
+{
+ switch (ch) {
+ case '\\':
+ this->state = st_string_escape;
return;
- case st_inline_image:
+ case '(':
this->val += ch;
- if (this->val.length() == this->inline_image_bytes) {
- QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
- this->type = tt_inline_image;
- this->inline_image_bytes = 0;
+ ++this->string_depth;
+ return;
+
+ case ')':
+ if (--this->string_depth == 0) {
+ this->type = tt_string;
this->state = st_token_ready;
+ return;
}
+
+ this->val += ch;
return;
- case st_in_hexstring:
- inHexstring(ch);
+ case '\r':
+ // CR by itself is converted to LF
+ this->val += '\n';
+ this->state = st_string_after_cr;
return;
- case st_in_hexstring_2nd:
- inHexstring2nd(ch);
+ case '\n':
+ this->val += ch;
return;
default:
- throw std::logic_error(
- "INTERNAL ERROR: invalid state while reading token");
+ this->val += ch;
+ return;
+ }
+}
+
+void
+QPDFTokenizer::inStringEscape(char ch)
+{
+ this->state = st_in_string;
+ switch (ch) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ this->state = st_char_code;
+ this->char_code = 0;
+ this->digit_count = 0;
+ inCharCode(ch);
+ return;
+
+ case 'n':
+ this->val += '\n';
+ return;
+
+ case 'r':
+ this->val += '\r';
+ return;
+
+ case 't':
+ this->val += '\t';
+ return;
+
+ case 'b':
+ this->val += '\b';
+ return;
+
+ case 'f':
+ this->val += '\f';
+ return;
+
+ case '\n':
+ return;
+
+ case '\r':
+ this->state = st_string_after_cr;
+ return;
+
+ default:
+ // PDF spec says backslash is ignored before anything else
+ this->val += ch;
+ return;
+ }
+}
+
+void
+QPDFTokenizer::inStringAfterCR(char ch)
+{
+ this->state = st_in_string;
+ if (ch != '\n') {
+ inString(ch);
+ }
+}
+
+void
+QPDFTokenizer::inLt(char ch)
+{
+ if (ch == '<') {
+ this->val += "<<";
+ this->type = tt_dict_open;
+ this->state = st_token_ready;
+ return;
+ }
+
+ this->state = st_in_hexstring;
+ inHexstring(ch);
+}
+
+void
+QPDFTokenizer::inGt(char ch)
+{
+ if (ch == '>') {
+ this->val += ">>";
+ this->type = tt_dict_close;
+ this->state = st_token_ready;
+ } else {
+ this->val += ">";
+ this->type = tt_bad;
+ QTC::TC("qpdf", "QPDFTokenizer bad >");
+ this->error_message = "unexpected >";
+ this->unread_char = true;
+ this->char_to_unread = ch;
+ this->state = st_token_ready;
+ }
+}
+
+void
+QPDFTokenizer::inLiteral(char ch)
+{
+ if (isDelimiter(ch)) {
+ // A C-locale whitespace character or delimiter terminates
+ // token. It is important to unread the whitespace
+ // character even though it is ignored since it may be the
+ // newline after a stream keyword. Removing it here could
+ // make the stream-reading code break on some files,
+ // though not on any files in the test suite as of this
+ // writing.
+
+ this->type = tt_word;
+ this->unread_char = true;
+ this->char_to_unread = ch;
+ this->state = st_token_ready;
+ } else {
+ this->val += ch;
}
}
@@ -521,45 +612,6 @@ QPDFTokenizer::inHexstring2nd(char ch)
}
void
-QPDFTokenizer::inString(char ch)
-{
- switch (ch) {
- case '\\':
- this->state = st_string_escape;
- return;
-
- case '(':
- this->val += ch;
- ++this->string_depth;
- return;
-
- case ')':
- if (--this->string_depth == 0) {
- this->type = tt_string;
- this->state = st_token_ready;
- return;
- }
-
- this->val += ch;
- return;
-
- case '\r':
- // CR by itself is converted to LF
- this->val += '\n';
- this->state = st_string_after_cr;
- return;
-
- case '\n':
- this->val += ch;
- return;
-
- default:
- this->val += ch;
- return;
- }
-}
-
-void
QPDFTokenizer::inCharCode(char ch)
{
if (('0' <= ch) && (ch <= '7')) {
@@ -576,6 +628,18 @@ QPDFTokenizer::inCharCode(char ch)
}
void
+QPDFTokenizer::inInlineImage(char ch)
+{
+ this->val += ch;
+ if (this->val.length() == this->inline_image_bytes) {
+ QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
+ this->type = tt_inline_image;
+ this->inline_image_bytes = 0;
+ this->state = st_token_ready;
+ }
+}
+
+void
QPDFTokenizer::presentEOF()
{
if (this->state == st_literal) {