aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf/QUtil.cc
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2023-12-25 16:51:40 +0100
committer Jay Berkenbilt <ejb@ql.org> 2023-12-25 17:17:17 +0100
commit 6d4115b7c565b6750ba4649d120446a1bd2b5af2 (patch)
tree c930f839d89b655ceb008d1a15e88094d36cb7b5 /libqpdf/QUtil.cc
parent 986d2485784d57d7a84cc5af50e67bde827b0dc9 (diff)
download qpdf-6d4115b7c565b6750ba4649d120446a1bd2b5af2.tar.zst
Detect overlong UTF-8 strings
Diffstat (limited to 'libqpdf/QUtil.cc')
-rw-r--r-- libqpdf/QUtil.cc 37
1 files changed, 34 insertions, 3 deletions
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index fcba203f..25c7281f 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
unsigned long
QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
{
+ auto o_pos = pos;
size_t len = utf8_val.length();
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
error = false;
@@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
return 0xfffd;
}
- unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
+ auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
while (bytes_needed > 0) {
--bytes_needed;
ch = static_cast<unsigned char>(utf8_val.at(pos++));
@@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
codepoint <<= 6;
codepoint += (ch & 0x3f);
}
+ unsigned long lower_bound = 0; // smallest codepoint that legitimately needs this many bytes
+ switch (pos - o_pos) { // pos - o_pos == number of bytes consumed for this character
+ case 2:
+ lower_bound = 1 << 7; // 1-byte form covers 7 bits
+ break;
+ case 3:
+ lower_bound = 1 << 11; // 2-byte form covers 11 bits
+ break;
+ case 4:
+ lower_bound = 1 << 16; // 3-byte form covers 16 bits
+ break;
+ case 5:
+ lower_bound = 1 << 21; // 4-byte form covers 21 bits (was mistyped as 1 << 12)
+ break;
+ case 6:
+ lower_bound = 1 << 26; // 5-byte form covers 26 bits
+ break;
+ default:
+ lower_bound = 0; // single byte (or unexpected length): no overlong check applies
+ }
+
+ if (lower_bound > 0 && codepoint < lower_bound) {
+ // Too many bytes were used, but return whatever character was encoded.
+ error = true; // overlong encoding: flag it, codepoint is still returned below
+ }
return codepoint;
}
@@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
bool any_errors = false;
while (pos < len) {
bool error = false;
+ auto old_pos = pos;
unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
if (error) {
any_errors = true;
- }
- if (codepoint >= 128) {
+ for (auto p = old_pos; p < pos; p++) {
+ if (static_cast<unsigned char>(val.at(p)) >= 128) {
+ has_8bit_chars = true;
+ }
+ }
+ } else if (codepoint >= 128) {
has_8bit_chars = true;
}
}