aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf/QPDFTokenizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libqpdf/QPDFTokenizer.cc')
-rw-r--r--libqpdf/QPDFTokenizer.cc458
1 files changed, 458 insertions, 0 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
new file mode 100644
index 00000000..4eed6f16
--- /dev/null
+++ b/libqpdf/QPDFTokenizer.cc
@@ -0,0 +1,458 @@
+
+#include <qpdf/QPDFTokenizer.hh>
+
+// DO NOT USE ctype -- it is locale dependent for some things, and
+// it's not worth the risk of including it in case it may accidentally
+// be used.
+
+#include <qpdf/PCRE.hh>
+#include <qpdf/QEXC.hh>
+#include <qpdf/QTC.hh>
+
+// See note above about ctype.
+static bool is_hex_digit(char ch)
+{
+ return (strchr("0123456789abcdefABCDEF", ch) != 0);
+}
+
+QPDFTokenizer::QPDFTokenizer() :
+ pound_special_in_name(true)
+{
+ reset();
+}
+
+void
+QPDFTokenizer::allowPoundAnywhereInName()
+{
+ QTC::TC("qpdf", "QPDFTokenizer allow pound anywhere in name");
+ this->pound_special_in_name = false;
+}
+
+void
+QPDFTokenizer::reset()
+{
+ state = st_top;
+ type = tt_bad;
+ val = "";
+ raw_val = "";
+ error_message = "";
+ unread_char = false;
+ char_to_unread = '\0';
+ string_depth = 0;
+ string_ignoring_newline = false;
+ last_char_was_bs = false;
+}
+
+void
+QPDFTokenizer::presentCharacter(char ch)
+{
+ static PCRE num_re("^[\\+\\-]?(?:\\.\\d+|\\d+(?:\\.\\d+)?)$");
+
+ if (state == st_token_ready)
+ {
+ throw QEXC::Internal("QPDF tokenizer presented character "
+ "while token is waiting");
+ }
+
+ char orig_ch = ch;
+
+ // State machine is implemented such that some characters may be
+ // handled more than once. This happens whenever you have to use
+ // the character that caused a state change in the new state.
+
+ bool handled = true;
+ if (state == st_top)
+ {
+ // Note: we specifically do not use ctype here. It is
+ // locale-dependent.
+ if (strchr(" \t\n\v\f\r", ch))
+ {
+ // ignore
+ }
+ else if (ch == '%')
+ {
+ // Discard comments
+ state = st_in_comment;
+ }
+ else if (ch == '(')
+ {
+ string_depth = 1;
+ string_ignoring_newline = false;
+ memset(bs_num_register, '\0', sizeof(bs_num_register));
+ last_char_was_bs = false;
+ state = st_in_string;
+ }
+ else if (ch == '<')
+ {
+ state = st_lt;
+ }
+ else if (ch == '>')
+ {
+ state = st_gt;
+ }
+ else
+ {
+ val += ch;
+ if (ch == ')')
+ {
+ type = tt_bad;
+ QTC::TC("qpdf", "QPDF_Tokenizer bad )");
+ error_message = "unexpected )";
+ state = st_token_ready;
+ }
+ else if (ch == '[')
+ {
+ type = tt_array_open;
+ state = st_token_ready;
+ }
+ else if (ch == ']')
+ {
+ type = tt_array_close;
+ state = st_token_ready;
+ }
+ else if (ch == '{')
+ {
+ type = tt_brace_open;
+ state = st_token_ready;
+ }
+ else if (ch == '}')
+ {
+ type = tt_brace_close;
+ state = st_token_ready;
+ }
+ else
+ {
+ state = st_literal;
+ }
+ }
+ }
+ else if (state == st_in_comment)
+ {
+ if ((ch == '\r') || (ch == '\n'))
+ {
+ state = st_top;
+ }
+ }
+ else if (state == st_lt)
+ {
+ if (ch == '<')
+ {
+ val = "<<";
+ type = tt_dict_open;
+ state = st_token_ready;
+ }
+ else
+ {
+ handled = false;
+ state = st_in_hexstring;
+ }
+ }
+ else if (state == st_gt)
+ {
+ if (ch == '>')
+ {
+ val = ">>";
+ type = tt_dict_close;
+ state = st_token_ready;
+ }
+ else
+ {
+ val = ">";
+ type = tt_bad;
+ QTC::TC("qpdf", "QPDF_Tokenizer bad >");
+ error_message = "unexpected >";
+ unread_char = true;
+ char_to_unread = ch;
+ state = st_token_ready;
+ }
+ }
+ else if (state == st_in_string)
+ {
+ if (string_ignoring_newline && (! ((ch == '\r') || (ch == '\n'))))
+ {
+ string_ignoring_newline = false;
+ }
+
+ unsigned int bs_num_count = strlen(bs_num_register);
+ bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
+ if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal)))
+ {
+ // We've accumulated \ddd. PDF Spec says to ignore
+ // high-order overflow.
+ val += (char) strtol(bs_num_register, 0, 8);
+ memset(bs_num_register, '\0', sizeof(bs_num_register));
+ bs_num_count = 0;
+ }
+
+ if (string_ignoring_newline && ((ch == '\r') || (ch == '\n')))
+ {
+ // ignore
+ }
+ else if (ch_is_octal && (last_char_was_bs || (bs_num_count > 0)))
+ {
+ bs_num_register[bs_num_count++] = ch;
+ }
+ else if (last_char_was_bs)
+ {
+ switch (ch)
+ {
+ case 'n':
+ val += '\n';
+ break;
+
+ case 'r':
+ val += '\r';
+ break;
+
+ case 't':
+ val += '\t';
+ break;
+
+ case 'b':
+ val += '\b';
+ break;
+
+ case 'f':
+ val += '\f';
+ break;
+
+ case '\r':
+ case '\n':
+ string_ignoring_newline = true;
+ break;
+
+ default:
+ // PDF spec says backslash is ignored before anything else
+ val += ch;
+ break;
+ }
+ }
+ else if (ch == '\\')
+ {
+ // last_char_was_bs is set/cleared below as appropriate
+ if (bs_num_count)
+ {
+ throw QEXC::Internal("QPDFTokenizer: bs_num_count != 0 "
+ "when ch == '\\'");
+ }
+ }
+ else if (ch == '(')
+ {
+ val += ch;
+ ++string_depth;
+ }
+ else if ((ch == ')') && (--string_depth == 0))
+ {
+ type = tt_string;
+ state = st_token_ready;
+ }
+ else
+ {
+ val += ch;
+ }
+
+ last_char_was_bs = ((! last_char_was_bs) && (ch == '\\'));
+ }
+ else if (state == st_literal)
+ {
+ if (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0)
+ {
+ // A C-loacle whitespace character or delimiter terminates
+ // token. It is important to unread the whitespace
+ // character even though it is ignored since it may be the
+ // newline after a stream keyword. Removing it here could
+ // make the stream-reading code break on some files,
+ // though not on any files in the test suite as of this
+ // writing.
+
+ type = tt_word;
+ unread_char = true;
+ char_to_unread = ch;
+ state = st_token_ready;
+ }
+ else
+ {
+ val += ch;
+ }
+ }
+ else
+ {
+ handled = false;
+ }
+
+
+ if (handled)
+ {
+ // okay
+ }
+ else if (state == st_in_hexstring)
+ {
+ if (ch == '>')
+ {
+ type = tt_string;
+ state = st_token_ready;
+ if (val.length() % 2)
+ {
+ // PDF spec says odd hexstrings have implicit
+ // trailing 0.
+ val += '0';
+ }
+ char num[3];
+ num[2] = '\0';
+ std::string nval;
+ for (unsigned int i = 0; i < val.length(); i += 2)
+ {
+ num[0] = val[i];
+ num[1] = val[i+1];
+ char nch = (char)(strtol(num, 0, 16));
+ nval += nch;
+ }
+ val = nval;
+ }
+ else if (is_hex_digit(ch))
+ {
+ val += ch;
+ }
+ else if (strchr(" \t\n\v\f\r", ch))
+ {
+ // ignore
+ }
+ else
+ {
+ type = tt_bad;
+ QTC::TC("qpdf", "QPDF_Tokenizer bad (");
+ error_message = std::string("invalid character (") +
+ ch + ") in hexstring";
+ state = st_token_ready;
+ }
+ }
+ else
+ {
+ throw QEXC::Internal("invalid state while reading token");
+ }
+
+ if ((state == st_token_ready) && (type == tt_word))
+ {
+ if ((val.length() > 0) && (val[0] == '/'))
+ {
+ type = tt_name;
+ // Deal with # in name token. Note: '/' by itself is a
+ // valid name, so don't strip leading /. That way we
+ // don't have to deal with the empty string as a name.
+ std::string nval = "/";
+ char const* valstr = val.c_str() + 1;
+ for (char const* p = valstr; *p; ++p)
+ {
+ if ((*p == '#') && this->pound_special_in_name)
+ {
+ if (p[1] && p[2] &&
+ is_hex_digit(p[1]) && is_hex_digit(p[2]))
+ {
+ char num[3];
+ num[0] = p[1];
+ num[1] = p[2];
+ num[2] = '\0';
+ char ch = (char)(strtol(num, 0, 16));
+ if (ch == '\0')
+ {
+ type = tt_bad;
+ QTC::TC("qpdf", "QPDF_Tokenizer null in name");
+ error_message =
+ "null character not allowed in name token";
+ nval += "#00";
+ }
+ else
+ {
+ nval += ch;
+ }
+ p += 2;
+ }
+ else
+ {
+ QTC::TC("qpdf", "QPDF_Tokenizer bad name");
+ type = tt_bad;
+ error_message = "invalid name token";
+ nval += *p;
+ }
+ }
+ else
+ {
+ nval += *p;
+ }
+ }
+ val = nval;
+ }
+ else if (num_re.match(val.c_str()))
+ {
+ if (val.find('.') != std::string::npos)
+ {
+ type = tt_real;
+ }
+ else
+ {
+ type = tt_integer;
+ }
+ }
+ else if ((val == "true") || (val == "false"))
+ {
+ type = tt_bool;
+ }
+ else if (val == "null")
+ {
+ type = tt_null;
+ }
+ else
+ {
+ // I don't really know what it is, so leave it as tt_word.
+ // Lots of cases ($, #, etc.) other than actual words fall
+ // into this category, but that's okay at least for now.
+ type = tt_word;
+ }
+ }
+
+ if (! (betweenTokens() || ((state == st_token_ready) && unread_char)))
+ {
+ this->raw_val += orig_ch;
+ }
+}
+
+void
+QPDFTokenizer::presentEOF()
+{
+ switch (state)
+ {
+ case st_token_ready:
+ case st_top:
+ // okay
+ break;
+
+ case st_in_comment:
+ state = st_top;
+ break;
+
+ default:
+ type = tt_bad;
+ error_message = "EOF while reading token";
+ state = st_token_ready;
+ }
+}
+
+bool
+QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
+{
+ bool ready = (this->state == st_token_ready);
+ unread_char = this->unread_char;
+ ch = this->char_to_unread;
+ if (ready)
+ {
+ token = Token(type, val, raw_val, error_message);
+ reset();
+ }
+ return ready;
+}
+
+bool
+QPDFTokenizer::betweenTokens()
+{
+ return ((state == st_top) || (state == st_in_comment));
+}