aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2022-02-15 21:56:06 +0100
committer Jay Berkenbilt <ejb@ql.org> 2022-02-15 22:13:12 +0100
commit a478cbb6dc0e630b919813ad0e7ae1a72510c69d (patch)
tree d7106d522f0bf2691c16e76eead59f5707ab67c7 /include
parent fbd3e56da787d18e7a8794580d0e95b7669d1bc4 (diff)
download qpdf-a478cbb6dc0e630b919813ad0e7ae1a72510c69d.tar.zst
Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)
The PDF spec only allows UTF-16BE, but most readers seem to accept UTF-16LE as well, so now qpdf does too.
Diffstat (limited to 'include')
-rw-r--r-- include/qpdf/QUtil.hh 15
1 files changed, 10 insertions, 5 deletions
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index b4cb1f6a..c1c22110 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -267,8 +267,11 @@ namespace QUtil
QPDF_DLL
std::string toUTF16(unsigned long uval);
- // Test whether this is a UTF-16 big-endian string. This is
- // indicated by first two bytes being 0xFE 0xFF.
+ // Test whether this is a UTF-16 string. This is indicated by
+ // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
+ // (little-endian). Starting in qpdf 10.6.2, this detects
+ // little-endian as well as big-endian. Even though the PDF spec
+ // doesn't allow little-endian, most readers seem to accept it.
QPDF_DLL
bool is_utf16(std::string const&);
@@ -309,8 +312,8 @@ namespace QUtil
bool utf8_to_pdf_doc(
std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
- // Convert a UTF-16 big-endian encoded string to UTF-8.
- // Unrepresentable code points are converted to U+FFFD.
+ // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code
+ // points are converted to U+FFFD.
QPDF_DLL
std::string utf16_to_utf8(std::string const& utf16);
@@ -331,7 +334,9 @@ namespace QUtil
// help us guess. If there are no characters with the high bit
// set, has_8bit_chars is false, and the other values are also
// false, even though ASCII strings are valid UTF-8. is_valid_utf8
- // means that the string is non-trivially valid UTF-8.
+ // means that the string is non-trivially valid UTF-8. Although
+ // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just
+ // about everything else) accepts UTF-16LE (as of 10.6.2).
QPDF_DLL
void analyze_encoding(std::string const& str,
bool& has_8bit_chars,