aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libqpdf/JSON.cc83
-rw-r--r--libtests/libtests.testcov3
-rw-r--r--libtests/qtest/json_parse.test5
-rw-r--r--libtests/qtest/json_parse/bad-37.json1
-rw-r--r--libtests/qtest/json_parse/bad-37.out1
-rw-r--r--libtests/qtest/json_parse/bad-38.json1
-rw-r--r--libtests/qtest/json_parse/bad-38.out1
-rw-r--r--libtests/qtest/json_parse/bad-39.json1
-rw-r--r--libtests/qtest/json_parse/bad-39.out1
-rw-r--r--libtests/qtest/json_parse/good-11-react.out16
-rw-r--r--libtests/qtest/json_parse/good-11.json4
-rw-r--r--libtests/qtest/json_parse/save-11.json13
12 files changed, 115 insertions, 15 deletions
diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc
index a2aff78b..3072a58b 100644
--- a/libqpdf/JSON.cc
+++ b/libqpdf/JSON.cc
@@ -574,7 +574,15 @@ namespace
private:
void getToken();
void handleToken();
- static std::string decode_string(std::string const& json);
+ static std::string
+ decode_string(std::string const& json, size_t offset);
+ static void handle_u_code(
+ char const* s,
+ size_t offset,
+ size_t i,
+ unsigned long& high_surrogate,
+ size_t& high_offset,
+ std::string& result);
enum parser_state_e {
ps_top,
@@ -620,8 +628,54 @@ namespace
};
} // namespace
+// Decode a single \uXXXX escape inside a JSON string and append the
+// resulting UTF-8 to `result`, pairing UTF-16 surrogates as needed.
+//
+//   s              - string contents being decoded; s[i] is the 'u' of
+//                    the escape and s[i+1..i+4] are its four hex digits
+//   offset         - base offset added to i when reporting positions in
+//                    error messages
+//   i              - index of the 'u' within s (not advanced here; the
+//                    caller skips the hex digits)
+//   high_surrogate - in/out: value of the pending high surrogate
+//   high_offset    - in/out: reported offset of the pending high
+//                    surrogate, or 0 when none is pending
+//   result         - decoded output; appended to for ordinary code
+//                    points and for completed surrogate pairs
+//
+// Throws std::runtime_error when a high surrogate follows another
+// high surrogate, or when a low surrogate does not immediately
+// follow a high surrogate.
+void
+JSONParser::handle_u_code(
+    char const* s,
+    size_t offset,
+    size_t i,
+    unsigned long& high_surrogate,
+    size_t& high_offset,
+    std::string& result)
+{
+    // Four hex digits decode to exactly two bytes: the big-endian
+    // UTF-16 code unit.
+    std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
+    unsigned char high = static_cast<unsigned char>(hex.at(0));
+    unsigned char low = static_cast<unsigned char>(hex.at(1));
+    unsigned long codepoint = high;
+    codepoint <<= 8;
+    codepoint += low;
+    size_t const here = offset + i;
+    if ((codepoint & 0xFC00) == 0xD800) {
+        // high surrogate: remember it; nothing is emitted until the
+        // matching low surrogate arrives.
+        if (high_offset) {
+            QTC::TC("libtests", "JSON 16 high high");
+            throw std::runtime_error(
+                "JSON: offset " + QUtil::uint_to_string(here) +
+                ": UTF-16 high surrogate found after previous high surrogate"
+                " at offset " +
+                QUtil::uint_to_string(high_offset));
+        }
+        high_offset = here;
+        high_surrogate = codepoint;
+    } else if ((codepoint & 0xFC00) == 0xDC00) {
+        // low surrogate: valid only when a high surrogate is pending
+        // and its six-character escape ends right where this one
+        // starts. high_offset == 0 means "none pending" and must be
+        // tested explicitly; otherwise a stray low surrogate located
+        // at offset 6 would satisfy the distance check and be combined
+        // with a stale high_surrogate value.
+        if ((high_offset == 0) || (here != (high_offset + 6))) {
+            QTC::TC("libtests", "JSON 16 low not after high");
+            throw std::runtime_error(
+                "JSON: offset " + QUtil::uint_to_string(here) +
+                ": UTF-16 low surrogate found not immediately after high"
+                " surrogate");
+        }
+        high_offset = 0;
+        // Combine the two 10-bit halves into a supplementary-plane
+        // code point.
+        codepoint =
+            0x10000U + ((high_surrogate & 0x3FFU) << 10U) + (codepoint & 0x3FF);
+        result += QUtil::toUTF8(codepoint);
+    } else {
+        // Ordinary BMP code point.
+        result += QUtil::toUTF8(codepoint);
+    }
+}
+
std::string
-JSONParser::decode_string(std::string const& str)
+JSONParser::decode_string(std::string const& str, size_t offset)
{
// The string has already been validated when this private method
// is called, so errors are logic errors instead of runtime
@@ -635,6 +689,9 @@ JSONParser::decode_string(std::string const& str)
// Move inside the quotation marks
++s;
len -= 2;
+ // Keep track of UTF-16 surrogate pairs.
+ unsigned long high_surrogate = 0;
+ size_t high_offset = 0;
std::string result;
for (size_t i = 0; i < len; ++i) {
if (s[i] == '\\') {
@@ -670,17 +727,9 @@ JSONParser::decode_string(std::string const& str)
throw std::logic_error(
"JSON parse: not enough characters after \\u");
}
- {
- std::string hex =
- QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
- i += 4;
- unsigned char high = static_cast<unsigned char>(hex.at(0));
- unsigned char low = static_cast<unsigned char>(hex.at(1));
- unsigned long codepoint = high;
- codepoint <<= 8;
- codepoint += low;
- result += QUtil::toUTF8(codepoint);
- }
+ handle_u_code(
+ s, offset, i, high_surrogate, high_offset, result);
+ i += 4;
break;
default:
throw std::logic_error("JSON parse: bad character after \\");
@@ -690,6 +739,12 @@ JSONParser::decode_string(std::string const& str)
result.append(1, s[i]);
}
}
+ if (high_offset) {
+ QTC::TC("libtests", "JSON 16 dangling high");
+ throw std::runtime_error(
+ "JSON: offset " + QUtil::uint_to_string(high_offset) +
+ ": UTF-16 high surrogate not followed by low surrogate");
+ }
return result;
}
@@ -933,7 +988,7 @@ JSONParser::handleToken()
if (token.length() < 2) {
throw std::logic_error("JSON string length < 2");
}
- s_value = decode_string(token);
+ s_value = decode_string(token, offset - token.length());
}
// Based on the lexical state and value, figure out whether we are
// looking at an item or a delimiter. It will always be exactly
diff --git a/libtests/libtests.testcov b/libtests/libtests.testcov
index 3e4d63d0..1f006e81 100644
--- a/libtests/libtests.testcov
+++ b/libtests/libtests.testcov
@@ -89,3 +89,6 @@ JSONHandler unhandled value 0
JSONHandler unexpected key 0
JSON schema other type 0
JSON optional key 0
+JSON 16 high high 0
+JSON 16 low not after high 0
+JSON 16 dangling high 0
diff --git a/libtests/qtest/json_parse.test b/libtests/qtest/json_parse.test
index 15b251cc..6d57e92c 100644
--- a/libtests/qtest/json_parse.test
+++ b/libtests/qtest/json_parse.test
@@ -32,7 +32,7 @@ if ($^O ne 'msys')
cleanup();
-my $good = 10;
+my $good = 11;
for (my $i = 1; $i <= $good; ++$i)
{
@@ -117,6 +117,9 @@ my @bad = (
"premature end after u", # 34
"bad hex digit", # 35
"parser depth exceeded", # 36
+ "stray low surrogate", # 37
+ "high high surrogate", # 38
+ "dangling high surrogate", # 39
);
my $i = 0;
diff --git a/libtests/qtest/json_parse/bad-37.json b/libtests/qtest/json_parse/bad-37.json
new file mode 100644
index 00000000..3fd031aa
--- /dev/null
+++ b/libtests/qtest/json_parse/bad-37.json
@@ -0,0 +1 @@
+[1, "u:potato: \udd54", 2]
diff --git a/libtests/qtest/json_parse/bad-37.out b/libtests/qtest/json_parse/bad-37.out
new file mode 100644
index 00000000..8b811a34
--- /dev/null
+++ b/libtests/qtest/json_parse/bad-37.out
@@ -0,0 +1 @@
+exception: bad-37.json: JSON: offset 15: UTF-16 low surrogate found not immediately after high surrogate
diff --git a/libtests/qtest/json_parse/bad-38.json b/libtests/qtest/json_parse/bad-38.json
new file mode 100644
index 00000000..78444f98
--- /dev/null
+++ b/libtests/qtest/json_parse/bad-38.json
@@ -0,0 +1 @@
+"u:\ud83ezz\ud83ezz"
diff --git a/libtests/qtest/json_parse/bad-38.out b/libtests/qtest/json_parse/bad-38.out
new file mode 100644
index 00000000..1b4461f1
--- /dev/null
+++ b/libtests/qtest/json_parse/bad-38.out
@@ -0,0 +1 @@
+exception: bad-38.json: JSON: offset 11: UTF-16 high surrogate found after previous high surrogate at offset 3
diff --git a/libtests/qtest/json_parse/bad-39.json b/libtests/qtest/json_parse/bad-39.json
new file mode 100644
index 00000000..2edab94b
--- /dev/null
+++ b/libtests/qtest/json_parse/bad-39.json
@@ -0,0 +1 @@
+"u:\ud83e all alone"
diff --git a/libtests/qtest/json_parse/bad-39.out b/libtests/qtest/json_parse/bad-39.out
new file mode 100644
index 00000000..a408dba8
--- /dev/null
+++ b/libtests/qtest/json_parse/bad-39.out
@@ -0,0 +1 @@
+exception: bad-39.json: JSON: offset 3: UTF-16 high surrogate not followed by low surrogate
diff --git a/libtests/qtest/json_parse/good-11-react.out b/libtests/qtest/json_parse/good-11-react.out
new file mode 100644
index 00000000..6cf3345e
--- /dev/null
+++ b/libtests/qtest/json_parse/good-11-react.out
@@ -0,0 +1,16 @@
+array start
+array item: [4, 0): []
+array start
+array item: [5, 11): "u:π"
+array item: [13, 23): "u:π"
+array item: [25, 39): "b:EFBBBFCF80"
+array item: [41, 53): "b:feff03c0"
+container end: [4, 54): []
+array item: [58, 0): []
+array start
+array item: [59, 67): "u:🥔"
+array item: [69, 85): "u:🥔"
+array item: [87, 103): "b:feffd83eDD54"
+container end: [58, 104): []
+container end: [0, 106): []
+[]
diff --git a/libtests/qtest/json_parse/good-11.json b/libtests/qtest/json_parse/good-11.json
new file mode 100644
index 00000000..0a492795
--- /dev/null
+++ b/libtests/qtest/json_parse/good-11.json
@@ -0,0 +1,4 @@
+[
+ ["u:π", "u:\u03c0", "b:EFBBBFCF80", "b:feff03c0"],
+ ["u:🥔", "u:\ud83e\udd54", "b:feffd83eDD54"]
+]
diff --git a/libtests/qtest/json_parse/save-11.json b/libtests/qtest/json_parse/save-11.json
new file mode 100644
index 00000000..f935b8bf
--- /dev/null
+++ b/libtests/qtest/json_parse/save-11.json
@@ -0,0 +1,13 @@
+[
+ [
+ "u:π",
+ "u:π",
+ "b:EFBBBFCF80",
+ "b:feff03c0"
+ ],
+ [
+ "u:🥔",
+ "u:🥔",
+ "b:feffd83eDD54"
+ ]
+]