diff options
author | Jay Berkenbilt <ejb@ql.org> | 2022-02-15 21:56:06 +0100 |
---|---|---|
committer | Jay Berkenbilt <ejb@ql.org> | 2022-02-15 22:13:12 +0100 |
commit | a478cbb6dc0e630b919813ad0e7ae1a72510c69d (patch) | |
tree | d7106d522f0bf2691c16e76eead59f5707ab67c7 /libqpdf | |
parent | fbd3e56da787d18e7a8794580d0e95b7669d1bc4 (diff) | |
download | qpdf-a478cbb6dc0e630b919813ad0e7ae1a72510c69d.tar.zst |
Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)
The PDF spec only allows UTF-16BE, but most readers seem to accept
UTF-16LE as well, so now qpdf does too.
Diffstat (limited to 'libqpdf')
-rw-r--r-- | libqpdf/QUtil.cc | 14 |
1 file changed, 11 insertions, 3 deletions
```diff
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index f01746b6..d0802334 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -2400,7 +2400,8 @@ bool
 QUtil::is_utf16(std::string const& val)
 {
     return ((val.length() >= 2) &&
-            (val.at(0) == '\xfe') && (val.at(1) == '\xff'));
+            (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||
+             ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
 }
 
 std::string
@@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val)
     unsigned long codepoint = 0L;
     size_t len = val.length();
     size_t start = 0;
+    bool is_le = false;
     if (is_utf16(val))
     {
+        if (static_cast<unsigned char>(val.at(0)) == 0xff)
+        {
+            is_le = true;
+        }
         start += 2;
     }
     // If the string has an odd number of bytes, the last byte is
@@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val)
         // codepoint not followed by a low codepoint will be
         // discarded, and a low codepoint not preceded by a high
         // codepoint will just get its low 10 bits output.
+        auto msb = is_le ? i+1 : i;
+        auto lsb = is_le ? i : i+1;
         unsigned short bits =
             QIntC::to_ushort(
-                (static_cast<unsigned char>(val.at(i)) << 8) +
-                static_cast<unsigned char>(val.at(i+1)));
+                (static_cast<unsigned char>(val.at(msb)) << 8) +
+                static_cast<unsigned char>(val.at(lsb)));
         if ((bits & 0xFC00) == 0xD800)
         {
             codepoint = 0x10000U + ((bits & 0x3FFU) << 10U);
```