aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog5
-rw-r--r--libqpdf/QUtil.cc37
-rw-r--r--libtests/qutil.cc17
3 files changed, 56 insertions, 3 deletions
diff --git a/ChangeLog b/ChangeLog
index 1ed0dcde..f313fab4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2023-12-25 Jay Berkenbilt <ejb@ql.org>
+
+ * Detect overlong UTF-8 in the UTF-8 decoder, and fix detection of
+ 8-bit characters in erroneous UTF-8 strings.
+
2023-12-24 Jay Berkenbilt <ejb@ql.org>
* 11.7.0: release
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index fcba203f..25c7281f 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1485,6 +1485,7 @@ encode_pdfdoc(unsigned long codepoint)
unsigned long
QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
{
+ auto o_pos = pos;
size_t len = utf8_val.length();
unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
error = false;
@@ -1505,7 +1506,7 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
return 0xfffd;
}
- unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
+ auto codepoint = static_cast<unsigned long>(ch & ~to_clear);
while (bytes_needed > 0) {
--bytes_needed;
ch = static_cast<unsigned char>(utf8_val.at(pos++));
@@ -1517,6 +1518,31 @@ QUtil::get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& e
codepoint <<= 6;
codepoint += (ch & 0x3f);
}
+ unsigned long lower_bound = 0; // smallest codepoint that genuinely needs (pos - o_pos) bytes
+ switch (pos - o_pos) {
+ case 2:
+ lower_bound = 1 << 7; // values below this fit in 1 byte (7 payload bits)
+ break;
+ case 3:
+ lower_bound = 1 << 11; // values below this fit in 2 bytes (5 + 6 payload bits)
+ break;
+ case 4:
+ lower_bound = 1 << 16; // values below this fit in 3 bytes (4 + 2*6 payload bits)
+ break;
+ case 5:
+ lower_bound = 1 << 21; // values below this fit in 4 bytes (3 + 3*6 payload bits)
+ break;
+ case 6:
+ lower_bound = 1 << 26; // values below this fit in 5 bytes (2 + 4*6 payload bits)
+ break;
+ default:
+ lower_bound = 0; // single-byte form can never be overlong
+ }
+
+ if (lower_bound > 0 && codepoint < lower_bound) {
+ // Too many bytes were used, but return whatever character was encoded.
+ error = true;
+ }
return codepoint;
}
@@ -1799,11 +1825,16 @@ QUtil::analyze_encoding(
bool any_errors = false;
while (pos < len) {
bool error = false;
+ auto old_pos = pos;
unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
if (error) {
any_errors = true;
- }
- if (codepoint >= 128) {
+ for (auto p = old_pos; p < pos; p++) {
+ if (static_cast<unsigned char>(val.at(p)) >= 128) {
+ has_8bit_chars = true;
+ }
+ }
+ } else if (codepoint >= 128) {
has_8bit_chars = true;
}
}
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index e882a33a..ca6ee314 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -266,6 +266,23 @@ to_utf8_test()
} catch (std::runtime_error& e) {
std::cout << "0x80000000: " << e.what() << std::endl;
}
+
+ // Overlong characters: characters represented by more bytes than necessary.
+ size_t pos = 0;
+ std::string utf8 = "\xC0\x80" // 1 << 7
+ "\xE0\x80\x80" // 1 << 11
+ "\xF0\x80\x80\x80" // 1 << 16
+ "\xF8\x80\x80\x80\x80" // 1 << 21
+ "\xFC\x80\x80\x80\x80\x80"; // 1 << 26
+ auto check = [&pos, &utf8](unsigned long wanted_pos) {
+ bool error = false;
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos);
+ };
+ check(2);
+ check(5);
+ check(9);
+ check(14);
+ check(20);
}
static void