aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2022-02-15 21:56:06 +0100
committer Jay Berkenbilt <ejb@ql.org> 2022-02-15 22:13:12 +0100
commit a478cbb6dc0e630b919813ad0e7ae1a72510c69d (patch)
tree d7106d522f0bf2691c16e76eead59f5707ab67c7
parent fbd3e56da787d18e7a8794580d0e95b7669d1bc4 (diff)
download qpdf-a478cbb6dc0e630b919813ad0e7ae1a72510c69d.tar.zst
Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)
The PDF spec only allows UTF-16BE, but most readers seem to accept UTF-16LE as well, so now qpdf does too.
-rw-r--r-- ChangeLog 4
-rw-r--r-- include/qpdf/QUtil.hh 15
-rw-r--r-- libqpdf/QUtil.cc 14
-rw-r--r-- libtests/qtest/qutil/qutil.out 1
-rw-r--r-- libtests/qutil.cc 4
-rw-r--r-- qpdf/qtest/qpdf.test 9
-rw-r--r-- qpdf/qtest/qpdf/utf16le-attachments.out 8
-rw-r--r-- qpdf/qtest/qpdf/utf16le.pdf bin 0 -> 3805 bytes
8 files changed, 45 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index 894504c0..08cb1b16 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2022-02-15 Jay Berkenbilt <ejb@ql.org>
+ * When analyzing PDF strings, recognize UTF-16LE as UTF-16. The
+ PDF spec only allows UTF-16BE, but most readers seem to allow
+ both. Fixes #649.
+
* Bug fix: 10.6.0 inadvertently removed an unknown/undocumented
CLI parsing feature, which has been restored in 10.6.2. Fixes #652.
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index b4cb1f6a..c1c22110 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -267,8 +267,11 @@ namespace QUtil
QPDF_DLL
std::string toUTF16(unsigned long uval);
- // Test whether this is a UTF-16 big-endian string. This is
- // indicated by first two bytes being 0xFE 0xFF.
+ // Test whether this is a UTF-16 string. This is indicated by
+ // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
+ // (little-endian). Starting in qpdf 10.6.2, this detects
+ // little-endian as well as big-endian. Even though the PDF spec
+ // doesn't allow little-endian, most readers seem to accept it.
QPDF_DLL
bool is_utf16(std::string const&);
@@ -309,8 +312,8 @@ namespace QUtil
bool utf8_to_pdf_doc(
std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
- // Convert a UTF-16 big-endian encoded string to UTF-8.
- // Unrepresentable code points are converted to U+FFFD.
+ // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code
+ // points are converted to U+FFFD.
QPDF_DLL
std::string utf16_to_utf8(std::string const& utf16);
@@ -331,7 +334,9 @@ namespace QUtil
// help us guess. If there are no characters with the high bit
// set, has_8bit_chars is false, and the other values are also
// false, even though ASCII strings are valid UTF-8. is_valid_utf8
- // means that the string is non-trivially valid UTF-8.
+ // means that the string is non-trivially valid UTF-8. Although
+ // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just
+ // about everything else) accepts UTF-16LE (as of 10.6.2).
QPDF_DLL
void analyze_encoding(std::string const& str,
bool& has_8bit_chars,
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index f01746b6..d0802334 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -2400,7 +2400,8 @@ bool
QUtil::is_utf16(std::string const& val)
{
return ((val.length() >= 2) &&
- (val.at(0) == '\xfe') && (val.at(1) == '\xff'));
+ (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||
+ ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
}
std::string
@@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val)
unsigned long codepoint = 0L;
size_t len = val.length();
size_t start = 0;
+ bool is_le = false;
if (is_utf16(val))
{
+ if (static_cast<unsigned char>(val.at(0)) == 0xff)
+ {
+ is_le = true;
+ }
start += 2;
}
// If the string has an odd number of bytes, the last byte is
@@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val)
// codepoint not followed by a low codepoint will be
// discarded, and a low codepoint not preceded by a high
// codepoint will just get its low 10 bits output.
+ auto msb = is_le ? i+1 : i;
+ auto lsb = is_le ? i : i+1;
unsigned short bits =
QIntC::to_ushort(
- (static_cast<unsigned char>(val.at(i)) << 8) +
- static_cast<unsigned char>(val.at(i+1)));
+ (static_cast<unsigned char>(val.at(msb)) << 8) +
+ static_cast<unsigned char>(val.at(lsb)));
if ((bits & 0xFC00) == 0xD800)
{
codepoint = 0x10000U + ((bits & 0x3FFU) << 10U);
diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out
index fa284237..fc6a0df1 100644
--- a/libtests/qtest/qutil/qutil.out
+++ b/libtests/qtest/qutil/qutil.out
@@ -63,6 +63,7 @@ HAGOOGAMAGOOGLE: 0
0x80000000 -> ff fd
π
π
+LE: π
---- utf8_to_ascii
¿Does π have fingers?
?Does ? have fingers?
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 2e4d9cdd..a1340c0e 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -303,6 +303,7 @@ void to_utf16_test()
std::string s(QUtil::utf8_to_utf16("\xcf\x80"));
std::cout << QUtil::utf16_to_utf8(s) << std::endl;
std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl;
+ std::cout << "LE: " << QUtil::utf16_to_utf8("\xff\xfe\xc0\x03") << std::endl;
}
void utf8_to_ascii_test()
@@ -388,7 +389,8 @@ void transcoding_test()
check_analyze("pi = \317\200", true, true, false);
check_analyze("pi != \317", true, false, false);
check_analyze("pi != 22/7", false, false, false);
- check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
+ check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true);
+ check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true);
std::cout << "analysis done" << std::endl;
std::string input1("a\302\277b");
std::string input2("a\317\200b");
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index 14205d88..16921a27 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -73,7 +73,7 @@ flush_tiff_cache();
show_ntests();
# ----------
$td->notify("--- Character Encoding ---");
-$n_tests += 3;
+$n_tests += 4;
$td->runtest("PDF doc encoding to Unicode",
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
@@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors",
{$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
+# UTF-16LE is not allowed by the PDF spec, but it seems that most
+# readers accept it.
+$td->runtest("UTF-16LE strings",
+ {$td->COMMAND => "qpdf --list-attachments --verbose utf16le.pdf"},
+ {$td->FILE => "utf16le-attachments.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+
# Tests to exercise QPDFArgParser belong in arg_parser.test in
# libtests. These tests are supposed to be specific to the qpdf cli.
# Since they were written prior to moving QPDFArgParser into the
diff --git a/qpdf/qtest/qpdf/utf16le-attachments.out b/qpdf/qtest/qpdf/utf16le-attachments.out
new file mode 100644
index 00000000..74abc20b
--- /dev/null
+++ b/qpdf/qtest/qpdf/utf16le-attachments.out
@@ -0,0 +1,8 @@
+potato.png -> 6,0
+ preferred name: π.png
+ all names:
+ /F -> π.png
+ /UF -> π.png
+ all data streams:
+ /F -> 6,0
+ /UF -> 6,0
diff --git a/qpdf/qtest/qpdf/utf16le.pdf b/qpdf/qtest/qpdf/utf16le.pdf
new file mode 100644
index 00000000..17c7f2bc
--- /dev/null
+++ b/qpdf/qtest/qpdf/utf16le.pdf
Binary files differ