diff options
author | Jay Berkenbilt <ejb@ql.org> | 2022-02-15 21:56:06 +0100 |
---|---|---|
committer | Jay Berkenbilt <ejb@ql.org> | 2022-02-15 22:13:12 +0100 |
commit | a478cbb6dc0e630b919813ad0e7ae1a72510c69d (patch) | |
tree | d7106d522f0bf2691c16e76eead59f5707ab67c7 /include | |
parent | fbd3e56da787d18e7a8794580d0e95b7669d1bc4 (diff) | |
download | qpdf-a478cbb6dc0e630b919813ad0e7ae1a72510c69d.tar.zst |
Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)
The PDF spec only allows UTF-16BE, but most readers seem to accept
UTF-16LE as well, so now qpdf does too.
Diffstat (limited to 'include')
-rw-r--r-- | include/qpdf/QUtil.hh | 15 |
1 files changed, 10 insertions, 5 deletions
```diff
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index b4cb1f6a..c1c22110 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -267,8 +267,11 @@ namespace QUtil
     QPDF_DLL
     std::string toUTF16(unsigned long uval);
 
-    // Test whether this is a UTF-16 big-endian string. This is
-    // indicated by first two bytes being 0xFE 0xFF.
+    // Test whether this is a UTF-16 string. This is indicated by
+    // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
+    // (little-endian). Starting in qpdf 10.6.2, this detects
+    // little-endian as well as big-endian. Even though the PDF spec
+    // doesn't allow little-endian, most readers seem to accept it.
     QPDF_DLL
     bool is_utf16(std::string const&);
 
@@ -309,8 +312,8 @@ namespace QUtil
     bool utf8_to_pdf_doc(
         std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
 
-    // Convert a UTF-16 big-endian encoded string to UTF-8.
-    // Unrepresentable code points are converted to U+FFFD.
+    // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code
+    // points are converted to U+FFFD.
     QPDF_DLL
     std::string utf16_to_utf8(std::string const& utf16);
 
@@ -331,7 +334,9 @@ namespace QUtil
     // help us guess. If there are no characters with the high bit
     // set, has_8bit_chars is false, and the other values are also
     // false, even though ASCII strings are valid UTF-8. is_valid_utf8
-    // means that the string is non-trivially valid UTF-8.
+    // means that the string is non-trivially valid UTF-8. Although
+    // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just
+    // about everything else) accepts UTF-16LE (as of 10.6.2).
     QPDF_DLL
     void analyze_encoding(std::string const& str,
                           bool& has_8bit_chars,
```