diff options
author | Jay Berkenbilt <ejb@ql.org> | 2022-09-26 14:05:28 +0200 |
---|---|---|
committer | Jay Berkenbilt <ejb@ql.org> | 2022-09-26 14:06:47 +0200 |
commit | f4ca04cec1a0c4a3c8341ff15f68c06bed89c0d7 (patch) | |
tree | 4699cc60ca8e4779db4635a7342f4ff9dfffceb1 /libqpdf/QUtil.cc | |
parent | 4fb7d1335a4660bb8748773294f2dea979fcdbb7 (diff) | |
download | qpdf-f4ca04cec1a0c4a3c8341ff15f68c06bed89c0d7.tar.zst |
Fix edge case in character encoding (fixes #778)
Avoid encoding a string as PDF Doc encoding when its PDF Doc encoded
form would start with a UTF-16 or UTF-8 marker, since the result would
then be interpreted as UTF-16 or UTF-8 instead.
Diffstat (limited to 'libqpdf/QUtil.cc')
-rw-r--r-- | libqpdf/QUtil.cc | 32 |
1 file changed, 30 insertions, 2 deletions
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index bcf4aa4e..7f23bd03 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1565,10 +1565,38 @@ transcode_utf8( { bool okay = true; result.clear(); - if (encoding == e_utf16) { + size_t len = utf8_val.length(); + switch (encoding) { + case e_utf16: result += "\xfe\xff"; + break; + case e_pdfdoc: + // We need to avoid having the result start with something + // that will be interpreted as UTF-16 or UTF-8, meaning we + // can't end up with a string that starts with "fe ff", + // (UTF-16-BE) "ff fe" (UTF-16-LE, not officially part of the + // PDF spec, but recognized by most readers including qpdf), + // or "ef bb bf" (UTF-8). It's more efficient to check the + // input string to see if it will map to one of those + // sequences than to check the output string since all cases + // start with the same starting character. + if ((len >= 4) && (utf8_val[0] == '\xc3')) { + static std::string fe_ff("\xbe\xc3\xbf"); + static std::string ff_fe("\xbf\xc3\xbe"); + static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf"); + // C++-20 has starts_with, but when this was written, qpdf + // had a minimum supported version of C++-17. + if ((utf8_val.compare(1, 3, fe_ff) == 0) || + (utf8_val.compare(1, 3, ff_fe) == 0) || + (utf8_val.compare(1, 5, ef_bb_bf) == 0)) { + result += unknown; + okay = false; + } + } + break; + default: + break; } - size_t len = utf8_val.length(); size_t pos = 0; while (pos < len) { bool error = false; |