From e8e8f6f43c760523520dfe7a5c76d88c959599f6 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Mon, 17 Jan 2022 18:40:38 -0500 Subject: Add JSON::parse --- libqpdf/JSON.cc | 772 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 772 insertions(+) (limited to 'libqpdf') diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc index a45dbd2e..423c0b0a 100644 --- a/libqpdf/JSON.cc +++ b/libqpdf/JSON.cc @@ -2,6 +2,7 @@ #include #include #include +#include JSON::Members::~Members() { @@ -437,3 +438,774 @@ JSON::checkSchemaInternal(JSON_value* this_v, JSON_value* sch_v, return errors.empty(); } + +namespace { + class JSONParser + { + public: + JSONParser() : + lex_state(ls_top), + number_before_point(0), + number_after_point(0), + number_after_e(0), + number_saw_point(false), + number_saw_e(false), + cstr(nullptr), + end(nullptr), + tok_start(nullptr), + tok_end(nullptr), + p(nullptr), + parser_state(ps_top) + { + } + + PointerHolder parse(std::string const& s); + + private: + void getToken(); + void handleToken(); + static std::string decode_string(std::string const& json); + + enum parser_state_e { + ps_top, + ps_dict_begin, + ps_dict_after_key, + ps_dict_after_colon, + ps_dict_after_item, + ps_dict_after_comma, + ps_array_begin, + ps_array_after_item, + ps_array_after_comma, + ps_done, + }; + + enum lex_state_e { + ls_top, + ls_number, + ls_alpha, + ls_string, + ls_backslash, + }; + + lex_state_e lex_state; + size_t number_before_point; + size_t number_after_point; + size_t number_after_e; + bool number_saw_point; + bool number_saw_e; + char const* cstr; + char const* end; + char const* tok_start; + char const* tok_end; + char const* p; + parser_state_e parser_state; + std::vector> stack; + std::vector ps_stack; + std::string dict_key; + }; +} + +std::string +JSONParser::decode_string(std::string const& str) +{ + // The string has already been validated when this private method + // is called, so errors are logic errors instead of runtime + // errors. + size_t len = str.length(); + if ((len < 2) || (str.at(0) != '"') || (str.at(len-1) != '"')) + { + throw std::logic_error( + "JSON Parse: decode_string called with other than \"...\""); + } + char const* s = str.c_str(); + // Move inside the quotation marks + ++s; + len -= 2; + std::string result; + for (size_t i = 0; i < len; ++i) + { + if (s[i] == '\\') + { + if (i + 1 >= len) + { + throw std::logic_error("JSON parse: nothing after \\"); + } + char ch = s[++i]; + switch (ch) + { + case '\\': + case '\"': + result.append(1, ch); + break; + case 'b': + result.append(1, '\b'); + break; + case 'f': + result.append(1, '\f'); + break; + case 'n': + result.append(1, '\n'); + break; + case 'r': + result.append(1, '\r'); + break; + case 't': + result.append(1, '\t'); + break; + case 'u': + if (i + 4 >= len) + { + throw std::logic_error( + "JSON parse: not enough characters after \\u"); + } + { + std::string hex = + QUtil::hex_decode(std::string(s+i+1, s+i+5)); + i += 4; + unsigned char high = static_cast(hex.at(0)); + unsigned char low = static_cast(hex.at(1)); + unsigned long codepoint = high; + codepoint <<= 8; + codepoint += low; + result += QUtil::toUTF8(codepoint); + } + break; + default: + throw std::logic_error( + "JSON parse: bad character after \\"); + break; + } + } + else + { + result.append(1, s[i]); + } + } + return result; +} + +void JSONParser::getToken() +{ + while (p < end) + { + if (*p == 0) + { + QTC::TC("libtests", "JSON parse null character"); + throw std::runtime_error( + "JSON: null character at offset " + + QUtil::int_to_string(p - cstr)); + } + switch (lex_state) + { + case ls_top: + if (*p == '"') + { + tok_start = p; + tok_end = nullptr; + lex_state = ls_string; + } + else if (QUtil::is_space(*p)) + { + // ignore + } + else if ((*p >= 'a') && (*p <= 'z')) + { + tok_start = p; + tok_end = nullptr; + lex_state = ls_alpha; + } + else if (*p == '-') + { + tok_start = p; + tok_end = nullptr; + lex_state = ls_number; + number_before_point = 0; + number_after_point = 0; + number_after_e = 0; + number_saw_point = false; + number_saw_e = false; + } + else if ((*p >= '0') && (*p <= '9')) + { + tok_start = p; + tok_end = nullptr; + lex_state = ls_number; + number_before_point = 1; + number_after_point = 0; + number_after_e = 0; + number_saw_point = false; + number_saw_e = false; + } + else if (*p == '.') + { + tok_start = p; + tok_end = nullptr; + lex_state = ls_number; + number_before_point = 0; + number_after_point = 0; + number_after_e = 0; + number_saw_point = true; + number_saw_e = false; + } + else if (strchr("{}[]:,", *p)) + { + tok_start = p; + tok_end = p + 1; + } + else + { + QTC::TC("libtests", "JSON parse bad character"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": unexpected character " + std::string(p, 1)); + } + break; + + case ls_number: + if ((*p >= '0') && (*p <= '9')) + { + if (number_saw_e) + { + ++number_after_e; + } + else if (number_saw_point) + { + ++number_after_point; + } + else + { + ++number_before_point; + } + } + else if (*p == '.') + { + if (number_saw_e) + { + QTC::TC("libtests", "JSON parse point after e"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": numeric literal: decimal point after e"); + } + else if (number_saw_point) + { + QTC::TC("libtests", "JSON parse duplicate point"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": numeric literal: decimal point already seen"); + } + else + { + number_saw_point = true; + } + } + else if (*p == 'e') + { + if (number_saw_e) + { + QTC::TC("libtests", "JSON parse duplicate e"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": numeric literal: e already seen"); + } + else + { + number_saw_e = true; + } + } + else if ((*p == '+') || (*p == '-')) + { + if (number_saw_e && (number_after_e == 0)) + { + // okay + } + else + { + QTC::TC("libtests", "JSON parse unexpected sign"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": numeric literal: unexpected sign"); + } + } + else if (QUtil::is_space(*p)) + { + tok_end = p; + } + else if (strchr("{}[]:,", *p)) + { + tok_end = p; + --p; + } + else + { + QTC::TC("libtests", "JSON parse numeric bad character"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": numeric literal: unexpected character " + + std::string(p, 1)); + } + break; + + case ls_alpha: + if ((*p >= 'a') && (*p <= 'z')) + { + // okay + } + else if (QUtil::is_space(*p)) + { + tok_end = p; + } + else if (strchr("{}[]:,", *p)) + { + tok_end = p; + --p; + } + else + { + QTC::TC("libtests", "JSON parse keyword bad character"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": keyword: unexpected character " + std::string(p, 1)); + } + break; + + case ls_string: + if (*p == '"') + { + tok_end = p + 1; + } + else if (*p == '\\') + { + lex_state = ls_backslash; + } + break; + + case ls_backslash: + if (strchr("\\\"bfnrt", *p)) + { + lex_state = ls_string; + } + else if (*p == 'u') + { + if (p + 4 >= end) + { + QTC::TC("libtests", "JSON parse premature end of u"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": \\u must be followed by four characters"); + } + for (size_t i = 1; i <= 4; ++i) + { + if (! QUtil::is_hex_digit(p[i])) + { + QTC::TC("libtests", "JSON parse bad hex after u"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": \\u must be followed by four hex digits"); + } + } + p += 4; + lex_state = ls_string; + } + else + { + QTC::TC("libtests", "JSON parse backslash bad character"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": invalid character after backslash: " + + std::string(p, 1)); + } + break; + } + ++p; + if (tok_start && tok_end) + { + break; + } + } + if (p == end) + { + if (tok_start && (! tok_end)) + { + switch (lex_state) + { + case ls_top: + // Can't happen + throw std::logic_error( + "tok_start set in ls_top while parsing " + + std::string(cstr)); + break; + + case ls_number: + case ls_alpha: + tok_end = p; + break; + + case ls_string: + case ls_backslash: + QTC::TC("libtests", "JSON parse unterminated string"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": unterminated string"); + break; + } + } + } +} + +void +JSONParser::handleToken() +{ + if (! (tok_start && tok_end)) + { + return; + } + + // Get token value. + std::string value(tok_start, tok_end); + + if (parser_state == ps_done) + { + QTC::TC("libtests", "JSON parse junk after object"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": material follows end of object: " + value); + } + + // Git string value + std::string svalue; + if (lex_state == ls_string) + { + // Token includes the quotation marks + if (tok_end - tok_start < 2) + { + throw std::logic_error("JSON string length < 2"); + } + svalue = decode_string(value); + } + // Based on the lexical state and value, figure out whether we are + // looking at an item or a delimiter. It will always be exactly + // one of those two or an error condition. + + PointerHolder item; + char delimiter = '\0'; + switch (lex_state) + { + case ls_top: + switch (*tok_start) + { + case '{': + item = new JSON(JSON::makeDictionary()); + break; + + case '[': + item = new JSON(JSON::makeArray()); + break; + + default: + delimiter = *tok_start; + break; + } + break; + + case ls_number: + if (number_saw_point && (number_after_point == 0)) + { + QTC::TC("libtests", "JSON parse decimal with no digits"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": decimal point with no digits"); + } + if ((number_before_point > 1) && + ((tok_start[0] == '0') || + ((tok_start[0] == '-') && (tok_start[1] == '0')))) + { + QTC::TC("libtests", "JSON parse leading zero"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": number with leading zero"); + } + if ((number_before_point == 0) && (number_after_point == 0)) + { + QTC::TC("libtests", "JSON parse number no digits"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": number with no digits"); + } + item = new JSON(JSON::makeNumber(value)); + break; + + case ls_alpha: + if (value == "true") + { + item = new JSON(JSON::makeBool(true)); + } + else if (value == "false") + { + item = new JSON(JSON::makeBool(false)); + } + else if (value == "null") + { + item = new JSON(JSON::makeNull()); + } + else + { + QTC::TC("libtests", "JSON parse invalid keyword"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": invalid keyword " + value); + } + break; + + case ls_string: + item = new JSON(JSON::makeString(svalue)); + break; + + case ls_backslash: + throw std::logic_error( + "tok_end is set while state = ls_backslash"); + break; + } + + if ((item.getPointer() == nullptr) == (delimiter == '\0')) + { + throw std::logic_error( + "JSONParser::handleToken: logic error: exactly one of item" + " or delimiter must be set"); + } + + // See whether what we have is allowed at this point. + + if (item.getPointer()) + { + switch (parser_state) + { + case ps_done: + throw std::logic_error("can't happen; ps_done already handled"); + break; + + case ps_dict_after_key: + QTC::TC("libtests", "JSON parse expected colon"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": expected ':'"); + break; + + case ps_dict_after_item: + QTC::TC("libtests", "JSON parse expected , or }"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": expected ',' or '}'"); + break; + + case ps_array_after_item: + QTC::TC("libtests", "JSON parse expected, or ]"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": expected ',' or ']'"); + break; + + case ps_dict_begin: + case ps_dict_after_comma: + if (lex_state != ls_string) + { + QTC::TC("libtests", "JSON parse string as dict key"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": expect string as dictionary key"); + } + break; + + case ps_top: + case ps_dict_after_colon: + case ps_array_begin: + case ps_array_after_comma: + break; + // okay + } + } + else if (delimiter == '}') + { + if (! ((parser_state == ps_dict_begin) || + (parser_state == ps_dict_after_item))) + + { + QTC::TC("libtests", "JSON parse unexpected }"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": unexpected dictionary end delimiter"); + } + } + else if (delimiter == ']') + { + if (! ((parser_state == ps_array_begin) || + (parser_state == ps_array_after_item))) + + { + QTC::TC("libtests", "JSON parse unexpected ]"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": unexpected array end delimiter"); + } + } + else if (delimiter == ':') + { + if (parser_state != ps_dict_after_key) + { + QTC::TC("libtests", "JSON parse unexpected :"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": unexpected colon"); + } + } + else if (delimiter == ',') + { + if (! ((parser_state == ps_dict_after_item) || + (parser_state == ps_array_after_item))) + { + QTC::TC("libtests", "JSON parse unexpected ,"); + throw std::runtime_error( + "JSON: offset " + QUtil::int_to_string(p - cstr) + + ": unexpected comma"); + } + } + else if (delimiter != '\0') + { + throw std::logic_error("JSONParser::handleToken: bad delimiter"); + } + + // Now we know we have a delimiter or item that is allowed. Do + // whatever we need to do with it. + + parser_state_e next_state = ps_top; + if (delimiter == ':') + { + next_state = ps_dict_after_colon; + } + else if (delimiter == ',') + { + if (parser_state == ps_dict_after_item) + { + next_state = ps_dict_after_comma; + } + else if (parser_state == ps_array_after_item) + { + next_state = ps_array_after_comma; + } + else + { + throw std::logic_error( + "JSONParser::handleToken: unexpected parser" + " state for comma"); + } + } + else if ((delimiter == '}') || (delimiter == ']')) + { + next_state = ps_stack.back(); + ps_stack.pop_back(); + if (next_state != ps_done) + { + stack.pop_back(); + } + } + else if (delimiter != '\0') + { + throw std::logic_error( + "JSONParser::handleToken: unexpected delimiter in transition"); + } + else if (item.getPointer()) + { + PointerHolder tos; + if (! stack.empty()) + { + tos = stack.back(); + } + switch (parser_state) + { + case ps_dict_begin: + case ps_dict_after_comma: + this->dict_key = svalue; + item = nullptr; + next_state = ps_dict_after_key; + break; + + case ps_dict_after_colon: + tos->addDictionaryMember(dict_key, *item); + next_state = ps_dict_after_item; + break; + + case ps_array_begin: + case ps_array_after_comma: + next_state = ps_array_after_item; + tos->addArrayElement(*item); + break; + + case ps_top: + next_state = ps_done; + break; + + case ps_dict_after_key: + case ps_dict_after_item: + case ps_array_after_item: + case ps_done: + throw std::logic_error( + "JSONParser::handleToken: unexpected parser state"); + } + } + else + { + throw std::logic_error( + "JSONParser::handleToken: unexpected null item in transition"); + } + + // Prepare for next token + if (item.getPointer()) + { + if (item->isDictionary()) + { + stack.push_back(item); + ps_stack.push_back(next_state); + next_state = ps_dict_begin; + } + else if (item->isArray()) + { + stack.push_back(item); + ps_stack.push_back(next_state); + next_state = ps_array_begin; + } + else if (parser_state == ps_top) + { + stack.push_back(item); + } + } + parser_state = next_state; + tok_start = nullptr; + tok_end = nullptr; + lex_state = ls_top; +} + +PointerHolder +JSONParser::parse(std::string const& s) +{ + cstr = s.c_str(); + end = cstr + s.length(); + p = cstr; + + while (p < end) + { + getToken(); + handleToken(); + } + if (parser_state != ps_done) + { + QTC::TC("libtests", "JSON parse preature EOF"); + throw std::runtime_error("JSON: premature end of input"); + } + return stack.back(); +} + +JSON +JSON::parse(std::string const& s) +{ + JSONParser jp; + return *jp.parse(s); +} -- cgit v1.2.3-54-g00ecf