From 6c7326b290462372bb6c23462b2087149cf5fcc6 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 19 May 2022 20:28:13 -0400 Subject: JSON fix: correctly parse UTF-16 surrogate pairs --- libqpdf/JSON.cc | 83 ++++++++++++++++++++++++----- libtests/libtests.testcov | 3 ++ libtests/qtest/json_parse.test | 5 +- libtests/qtest/json_parse/bad-37.json | 1 + libtests/qtest/json_parse/bad-37.out | 1 + libtests/qtest/json_parse/bad-38.json | 1 + libtests/qtest/json_parse/bad-38.out | 1 + libtests/qtest/json_parse/bad-39.json | 1 + libtests/qtest/json_parse/bad-39.out | 1 + libtests/qtest/json_parse/good-11-react.out | 16 ++++++ libtests/qtest/json_parse/good-11.json | 4 ++ libtests/qtest/json_parse/save-11.json | 13 +++++ 12 files changed, 115 insertions(+), 15 deletions(-) create mode 100644 libtests/qtest/json_parse/bad-37.json create mode 100644 libtests/qtest/json_parse/bad-37.out create mode 100644 libtests/qtest/json_parse/bad-38.json create mode 100644 libtests/qtest/json_parse/bad-38.out create mode 100644 libtests/qtest/json_parse/bad-39.json create mode 100644 libtests/qtest/json_parse/bad-39.out create mode 100644 libtests/qtest/json_parse/good-11-react.out create mode 100644 libtests/qtest/json_parse/good-11.json create mode 100644 libtests/qtest/json_parse/save-11.json diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc index a2aff78b..3072a58b 100644 --- a/libqpdf/JSON.cc +++ b/libqpdf/JSON.cc @@ -574,7 +574,15 @@ namespace private: void getToken(); void handleToken(); - static std::string decode_string(std::string const& json); + static std::string + decode_string(std::string const& json, size_t offset); + static void handle_u_code( + char const* s, + size_t offset, + size_t i, + unsigned long& high_surrogate, + size_t& high_offset, + std::string& result); enum parser_state_e { ps_top, @@ -620,8 +628,54 @@ namespace }; } // namespace +void +JSONParser::handle_u_code( + char const* s, + size_t offset, + size_t i, + unsigned long& high_surrogate, + size_t& 
high_offset, + std::string& result) +{ + std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5)); + unsigned char high = static_cast<unsigned char>(hex.at(0)); + unsigned char low = static_cast<unsigned char>(hex.at(1)); + unsigned long codepoint = high; + codepoint <<= 8; + codepoint += low; + if ((codepoint & 0xFC00) == 0xD800) { + // high surrogate + size_t new_high_offset = offset + i; + if (high_offset) { + QTC::TC("libtests", "JSON 16 high high"); + throw std::runtime_error( + "JSON: offset " + QUtil::uint_to_string(new_high_offset) + + ": UTF-16 high surrogate found after previous high surrogate" + " at offset " + + QUtil::uint_to_string(high_offset)); + } + high_offset = new_high_offset; + high_surrogate = codepoint; + } else if ((codepoint & 0xFC00) == 0xDC00) { + // low surrogate + if (offset + i != (high_offset + 6)) { + QTC::TC("libtests", "JSON 16 low not after high"); + throw std::runtime_error( + "JSON: offset " + QUtil::uint_to_string(offset + i) + + ": UTF-16 low surrogate found not immediately after high" + " surrogate"); + } + high_offset = 0; + codepoint = + 0x10000U + ((high_surrogate & 0x3FFU) << 10U) + (codepoint & 0x3FF); + result += QUtil::toUTF8(codepoint); + } else { + result += QUtil::toUTF8(codepoint); + } +} + std::string -JSONParser::decode_string(std::string const& str) +JSONParser::decode_string(std::string const& str, size_t offset) { // The string has already been validated when this private method // is called, so errors are logic errors instead of runtime @@ -635,6 +689,9 @@ JSONParser::decode_string(std::string const& str) // Move inside the quotation marks ++s; len -= 2; + // Keep track of UTF-16 surrogate pairs. 
+ unsigned long high_surrogate = 0; + size_t high_offset = 0; std::string result; for (size_t i = 0; i < len; ++i) { if (s[i] == '\\') { @@ -670,17 +727,9 @@ JSONParser::decode_string(std::string const& str) throw std::logic_error( "JSON parse: not enough characters after \\u"); } - { - std::string hex = - QUtil::hex_decode(std::string(s + i + 1, s + i + 5)); - i += 4; - unsigned char high = static_cast<unsigned char>(hex.at(0)); - unsigned char low = static_cast<unsigned char>(hex.at(1)); - unsigned long codepoint = high; - codepoint <<= 8; - codepoint += low; - result += QUtil::toUTF8(codepoint); - } + handle_u_code( + s, offset, i, high_surrogate, high_offset, result); + i += 4; break; default: throw std::logic_error("JSON parse: bad character after \\"); @@ -690,6 +739,12 @@ JSONParser::decode_string(std::string const& str) result.append(1, s[i]); } } + if (high_offset) { + QTC::TC("libtests", "JSON 16 dangling high"); + throw std::runtime_error( + "JSON: offset " + QUtil::uint_to_string(high_offset) + + ": UTF-16 high surrogate not followed by low surrogate"); + } return result; } @@ -933,7 +988,7 @@ JSONParser::handleToken() if (token.length() < 2) { throw std::logic_error("JSON string length < 2"); } - s_value = decode_string(token); + s_value = decode_string(token, offset - token.length()); } // Based on the lexical state and value, figure out whether we are // looking at an item or a delimiter. 
It will always be exactly diff --git a/libtests/libtests.testcov b/libtests/libtests.testcov index 3e4d63d0..1f006e81 100644 --- a/libtests/libtests.testcov +++ b/libtests/libtests.testcov @@ -89,3 +89,6 @@ JSONHandler unhandled value 0 JSONHandler unexpected key 0 JSON schema other type 0 JSON optional key 0 +JSON 16 high high 0 +JSON 16 low not after high 0 +JSON 16 dangling high 0 diff --git a/libtests/qtest/json_parse.test b/libtests/qtest/json_parse.test index 15b251cc..6d57e92c 100644 --- a/libtests/qtest/json_parse.test +++ b/libtests/qtest/json_parse.test @@ -32,7 +32,7 @@ if ($^O ne 'msys') cleanup(); -my $good = 10; +my $good = 11; for (my $i = 1; $i <= $good; ++$i) { @@ -117,6 +117,9 @@ my @bad = ( "premature end after u", # 34 "bad hex digit", # 35 "parser depth exceeded", # 36 + "stray low surrogate", # 37 + "high high surrogate", # 38 + "dangling high surrogate", # 39 ); my $i = 0; diff --git a/libtests/qtest/json_parse/bad-37.json b/libtests/qtest/json_parse/bad-37.json new file mode 100644 index 00000000..3fd031aa --- /dev/null +++ b/libtests/qtest/json_parse/bad-37.json @@ -0,0 +1 @@ +[1, "u:potato: \udd54", 2] diff --git a/libtests/qtest/json_parse/bad-37.out b/libtests/qtest/json_parse/bad-37.out new file mode 100644 index 00000000..8b811a34 --- /dev/null +++ b/libtests/qtest/json_parse/bad-37.out @@ -0,0 +1 @@ +exception: bad-37.json: JSON: offset 15: UTF-16 low surrogate found not immediately after high surrogate diff --git a/libtests/qtest/json_parse/bad-38.json b/libtests/qtest/json_parse/bad-38.json new file mode 100644 index 00000000..78444f98 --- /dev/null +++ b/libtests/qtest/json_parse/bad-38.json @@ -0,0 +1 @@ +"u:\ud83ezz\ud83ezz" diff --git a/libtests/qtest/json_parse/bad-38.out b/libtests/qtest/json_parse/bad-38.out new file mode 100644 index 00000000..1b4461f1 --- /dev/null +++ b/libtests/qtest/json_parse/bad-38.out @@ -0,0 +1 @@ +exception: bad-38.json: JSON: offset 11: UTF-16 high surrogate found after previous high surrogate at 
offset 3 diff --git a/libtests/qtest/json_parse/bad-39.json b/libtests/qtest/json_parse/bad-39.json new file mode 100644 index 00000000..2edab94b --- /dev/null +++ b/libtests/qtest/json_parse/bad-39.json @@ -0,0 +1 @@ +"u:\ud83e all alone" diff --git a/libtests/qtest/json_parse/bad-39.out b/libtests/qtest/json_parse/bad-39.out new file mode 100644 index 00000000..a408dba8 --- /dev/null +++ b/libtests/qtest/json_parse/bad-39.out @@ -0,0 +1 @@ +exception: bad-39.json: JSON: offset 3: UTF-16 high surrogate not followed by low surrogate diff --git a/libtests/qtest/json_parse/good-11-react.out b/libtests/qtest/json_parse/good-11-react.out new file mode 100644 index 00000000..6cf3345e --- /dev/null +++ b/libtests/qtest/json_parse/good-11-react.out @@ -0,0 +1,16 @@ +array start +array item: [4, 0): [] +array start +array item: [5, 11): "u:π" +array item: [13, 23): "u:π" +array item: [25, 39): "b:EFBBBFCF80" +array item: [41, 53): "b:feff03c0" +container end: [4, 54): [] +array item: [58, 0): [] +array start +array item: [59, 67): "u:🥔" +array item: [69, 85): "u:🥔" +array item: [87, 103): "b:feffd83eDD54" +container end: [58, 104): [] +container end: [0, 106): [] +[] diff --git a/libtests/qtest/json_parse/good-11.json b/libtests/qtest/json_parse/good-11.json new file mode 100644 index 00000000..0a492795 --- /dev/null +++ b/libtests/qtest/json_parse/good-11.json @@ -0,0 +1,4 @@ +[ + ["u:π", "u:\u03c0", "b:EFBBBFCF80", "b:feff03c0"], + ["u:🥔", "u:\ud83e\udd54", "b:feffd83eDD54"] +] diff --git a/libtests/qtest/json_parse/save-11.json b/libtests/qtest/json_parse/save-11.json new file mode 100644 index 00000000..f935b8bf --- /dev/null +++ b/libtests/qtest/json_parse/save-11.json @@ -0,0 +1,13 @@ +[ + [ + "u:π", + "u:π", + "b:EFBBBFCF80", + "b:feff03c0" + ], + [ + "u:🥔", + "u:🥔", + "b:feffd83eDD54" + ] +] -- cgit v1.2.3-54-g00ecf