From 9a0b88bf7777c153dc46ace22db74ef24d51583a Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 29 Apr 2008 12:55:25 +0000 Subject: update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 --- libqpdf/QPDFTokenizer.cc | 458 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 458 insertions(+) create mode 100644 libqpdf/QPDFTokenizer.cc (limited to 'libqpdf/QPDFTokenizer.cc') diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc new file mode 100644 index 00000000..4eed6f16 --- /dev/null +++ b/libqpdf/QPDFTokenizer.cc @@ -0,0 +1,458 @@ + +#include + +// DO NOT USE ctype -- it is locale dependent for some things, and +// it's not worth the risk of including it in case it may accidentally +// be used. + +#include +#include +#include + +// See note above about ctype. +static bool is_hex_digit(char ch) +{ + return (strchr("0123456789abcdefABCDEF", ch) != 0); +} + +QPDFTokenizer::QPDFTokenizer() : + pound_special_in_name(true) +{ + reset(); +} + +void +QPDFTokenizer::allowPoundAnywhereInName() +{ + QTC::TC("qpdf", "QPDFTokenizer allow pound anywhere in name"); + this->pound_special_in_name = false; +} + +void +QPDFTokenizer::reset() +{ + state = st_top; + type = tt_bad; + val = ""; + raw_val = ""; + error_message = ""; + unread_char = false; + char_to_unread = '\0'; + string_depth = 0; + string_ignoring_newline = false; + last_char_was_bs = false; +} + +void +QPDFTokenizer::presentCharacter(char ch) +{ + static PCRE num_re("^[\\+\\-]?(?:\\.\\d+|\\d+(?:\\.\\d+)?)$"); + + if (state == st_token_ready) + { + throw QEXC::Internal("QPDF tokenizer presented character " + "while token is waiting"); + } + + char orig_ch = ch; + + // State machine is implemented such that some characters may be + // handled more than once. This happens whenever you have to use + // the character that caused a state change in the new state. + + bool handled = true; + if (state == st_top) + { + // Note: we specifically do not use ctype here. It is + // locale-dependent. + if (strchr(" \t\n\v\f\r", ch)) + { + // ignore + } + else if (ch == '%') + { + // Discard comments + state = st_in_comment; + } + else if (ch == '(') + { + string_depth = 1; + string_ignoring_newline = false; + memset(bs_num_register, '\0', sizeof(bs_num_register)); + last_char_was_bs = false; + state = st_in_string; + } + else if (ch == '<') + { + state = st_lt; + } + else if (ch == '>') + { + state = st_gt; + } + else + { + val += ch; + if (ch == ')') + { + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer bad )"); + error_message = "unexpected )"; + state = st_token_ready; + } + else if (ch == '[') + { + type = tt_array_open; + state = st_token_ready; + } + else if (ch == ']') + { + type = tt_array_close; + state = st_token_ready; + } + else if (ch == '{') + { + type = tt_brace_open; + state = st_token_ready; + } + else if (ch == '}') + { + type = tt_brace_close; + state = st_token_ready; + } + else + { + state = st_literal; + } + } + } + else if (state == st_in_comment) + { + if ((ch == '\r') || (ch == '\n')) + { + state = st_top; + } + } + else if (state == st_lt) + { + if (ch == '<') + { + val = "<<"; + type = tt_dict_open; + state = st_token_ready; + } + else + { + handled = false; + state = st_in_hexstring; + } + } + else if (state == st_gt) + { + if (ch == '>') + { + val = ">>"; + type = tt_dict_close; + state = st_token_ready; + } + else + { + val = ">"; + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer bad >"); + error_message = "unexpected >"; + unread_char = true; + char_to_unread = ch; + state = st_token_ready; + } + } + else if (state == st_in_string) + { + if (string_ignoring_newline && (! ((ch == '\r') || (ch == '\n')))) + { + string_ignoring_newline = false; + } + + unsigned int bs_num_count = strlen(bs_num_register); + bool ch_is_octal = ((ch >= '0') && (ch <= '7')); + if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal))) + { + // We've accumulated \ddd. PDF Spec says to ignore + // high-order overflow. + val += (char) strtol(bs_num_register, 0, 8); + memset(bs_num_register, '\0', sizeof(bs_num_register)); + bs_num_count = 0; + } + + if (string_ignoring_newline && ((ch == '\r') || (ch == '\n'))) + { + // ignore + } + else if (ch_is_octal && (last_char_was_bs || (bs_num_count > 0))) + { + bs_num_register[bs_num_count++] = ch; + } + else if (last_char_was_bs) + { + switch (ch) + { + case 'n': + val += '\n'; + break; + + case 'r': + val += '\r'; + break; + + case 't': + val += '\t'; + break; + + case 'b': + val += '\b'; + break; + + case 'f': + val += '\f'; + break; + + case '\r': + case '\n': + string_ignoring_newline = true; + break; + + default: + // PDF spec says backslash is ignored before anything else + val += ch; + break; + } + } + else if (ch == '\\') + { + // last_char_was_bs is set/cleared below as appropriate + if (bs_num_count) + { + throw QEXC::Internal("QPDFTokenizer: bs_num_count != 0 " + "when ch == '\\'"); + } + } + else if (ch == '(') + { + val += ch; + ++string_depth; + } + else if ((ch == ')') && (--string_depth == 0)) + { + type = tt_string; + state = st_token_ready; + } + else + { + val += ch; + } + + last_char_was_bs = ((! last_char_was_bs) && (ch == '\\')); + } + else if (state == st_literal) + { + if (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0) + { + // A C-loacle whitespace character or delimiter terminates + // token. It is important to unread the whitespace + // character even though it is ignored since it may be the + // newline after a stream keyword. Removing it here could + // make the stream-reading code break on some files, + // though not on any files in the test suite as of this + // writing. + + type = tt_word; + unread_char = true; + char_to_unread = ch; + state = st_token_ready; + } + else + { + val += ch; + } + } + else + { + handled = false; + } + + + if (handled) + { + // okay + } + else if (state == st_in_hexstring) + { + if (ch == '>') + { + type = tt_string; + state = st_token_ready; + if (val.length() % 2) + { + // PDF spec says odd hexstrings have implicit + // trailing 0. + val += '0'; + } + char num[3]; + num[2] = '\0'; + std::string nval; + for (unsigned int i = 0; i < val.length(); i += 2) + { + num[0] = val[i]; + num[1] = val[i+1]; + char nch = (char)(strtol(num, 0, 16)); + nval += nch; + } + val = nval; + } + else if (is_hex_digit(ch)) + { + val += ch; + } + else if (strchr(" \t\n\v\f\r", ch)) + { + // ignore + } + else + { + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer bad ("); + error_message = std::string("invalid character (") + + ch + ") in hexstring"; + state = st_token_ready; + } + } + else + { + throw QEXC::Internal("invalid state while reading token"); + } + + if ((state == st_token_ready) && (type == tt_word)) + { + if ((val.length() > 0) && (val[0] == '/')) + { + type = tt_name; + // Deal with # in name token. Note: '/' by itself is a + // valid name, so don't strip leading /. That way we + // don't have to deal with the empty string as a name. + std::string nval = "/"; + char const* valstr = val.c_str() + 1; + for (char const* p = valstr; *p; ++p) + { + if ((*p == '#') && this->pound_special_in_name) + { + if (p[1] && p[2] && + is_hex_digit(p[1]) && is_hex_digit(p[2])) + { + char num[3]; + num[0] = p[1]; + num[1] = p[2]; + num[2] = '\0'; + char ch = (char)(strtol(num, 0, 16)); + if (ch == '\0') + { + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer null in name"); + error_message = + "null character not allowed in name token"; + nval += "#00"; + } + else + { + nval += ch; + } + p += 2; + } + else + { + QTC::TC("qpdf", "QPDF_Tokenizer bad name"); + type = tt_bad; + error_message = "invalid name token"; + nval += *p; + } + } + else + { + nval += *p; + } + } + val = nval; + } + else if (num_re.match(val.c_str())) + { + if (val.find('.') != std::string::npos) + { + type = tt_real; + } + else + { + type = tt_integer; + } + } + else if ((val == "true") || (val == "false")) + { + type = tt_bool; + } + else if (val == "null") + { + type = tt_null; + } + else + { + // I don't really know what it is, so leave it as tt_word. + // Lots of cases ($, #, etc.) other than actual words fall + // into this category, but that's okay at least for now. + type = tt_word; + } + } + + if (! (betweenTokens() || ((state == st_token_ready) && unread_char))) + { + this->raw_val += orig_ch; + } +} + +void +QPDFTokenizer::presentEOF() +{ + switch (state) + { + case st_token_ready: + case st_top: + // okay + break; + + case st_in_comment: + state = st_top; + break; + + default: + type = tt_bad; + error_message = "EOF while reading token"; + state = st_token_ready; + } +} + +bool +QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) +{ + bool ready = (this->state == st_token_ready); + unread_char = this->unread_char; + ch = this->char_to_unread; + if (ready) + { + token = Token(type, val, raw_val, error_message); + reset(); + } + return ready; +} + +bool +QPDFTokenizer::betweenTokens() +{ + return ((state == st_top) || (state == st_in_comment)); +} -- cgit v1.2.3-70-g09d2