aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2008-11-23 19:49:13 +0100
committer Jay Berkenbilt <ejb@ql.org> 2008-11-23 19:49:13 +0100
commit 337b9007088670363ff6444b2bffa7e8aa6498dc (patch)
tree 0f84d714951401be621816b81d2d9c0e181aabf0
parent 6e07eb1aaef7bb049b83083f6f37a16209b146df (diff)
download qpdf-337b9007088670363ff6444b2bffa7e8aa6498dc.tar.zst
handle UTF-16BE fully
git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649
-rw-r--r-- ChangeLog 6
-rw-r--r-- libqpdf/QPDF_String.cc 40
-rw-r--r-- qpdf/qpdf.testcov 1
-rw-r--r-- qpdf/qtest/qpdf/misc-3.out 1
-rw-r--r-- qpdf/qtest/qpdf/misc-3.pdf 19
5 files changed, 54 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index deb9ee43..8f143c15 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2008-11-23 Jay Berkenbilt <ejb@ql.org>
+
+ * libqpdf/QPDF_String.cc (QPDF_String::getUTF8Val): handle
+ UTF-16BE properly rather than just treating the string as a string
+ of 16-bit characters.
+
2008-06-30 Jay Berkenbilt <ejb@ql.org>
* 2.0.2: release
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index cc8ca042..739006b4 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -2,6 +2,8 @@
#include <qpdf/QPDF_String.hh>
#include <qpdf/QUtil.hh>
+#include <qpdf/QTC.hh>
+
// DO NOT USE ctype -- it is locale dependent for some things, and
// it's not worth the risk of including it in case it may accidentally
// be used.
@@ -159,12 +161,42 @@ QPDF_String::getUTF8Val() const
(this->val[0] == '\xfe') && (this->val[1] == '\xff'))
{
// This is a Unicode string using big-endian UTF-16. This
- // code is not actually correct as it doesn't properly handle
- // characters past 0xffff.
+ // code uses unsigned long and unsigned short to hold
+ // codepoint values. It requires unsigned long to be at least
+ // 32 bits and unsigned short to be at least 16 bits, but it
+ // will work fine if they are larger.
+ unsigned long codepoint = 0L;
for (unsigned int i = 2; i < len; i += 2)
{
- result += QUtil::toUTF8(((unsigned char) this->val[i] << 8) +
- ((unsigned char) this->val[i+1]));
+ // Convert from UTF-16BE. If we get a malformed
+ // surrogate pair, this code will generate incorrect output
+ // without giving a warning. Specifically, a high
+ // surrogate not followed by a low surrogate will be
+ // discarded, and a low surrogate not preceded by a high
+ // surrogate will just get its low 10 bits output.
+ unsigned short bits =
+ (((unsigned char) this->val[i]) << 8) +
+ ((unsigned char) this->val[i+1]);
+ if ((bits & 0xFC00) == 0xD800)
+ {
+ codepoint = 0x10000 + ((bits & 0x3FF) << 10);
+ continue;
+ }
+ else if ((bits & 0xFC00) == 0xDC00)
+ {
+ if (codepoint != 0)
+ {
+ QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
+ }
+ codepoint += bits & 0x3FF;
+ }
+ else
+ {
+ codepoint = bits;
+ }
+
+ result += QUtil::toUTF8(codepoint);
+ codepoint = 0;
}
}
else
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index e6323600..0c2c0416 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -115,3 +115,4 @@ QPDF_Stream pipeStreamData with null pipeline 0
QPDFWriter not recompressing /FlateDecode 0
QPDF piping xref stream from encrypted file 0
unable to filter 0
+QPDF_String non-trivial UTF-16 0
diff --git a/qpdf/qtest/qpdf/misc-3.out b/qpdf/qtest/qpdf/misc-3.out
index f9c89df2..390e9a61 100644
--- a/qpdf/qtest/qpdf/misc-3.out
+++ b/qpdf/qtest/qpdf/misc-3.out
@@ -8,6 +8,7 @@ QStrings:
No Special Characters
These: ¿÷¢þ and no more
πωτατω
+treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅮
QNumbers:
1.000
3.142
diff --git a/qpdf/qtest/qpdf/misc-3.pdf b/qpdf/qtest/qpdf/misc-3.pdf
index 6b9aa3c7..4225d239 100644
--- a/qpdf/qtest/qpdf/misc-3.pdf
+++ b/qpdf/qtest/qpdf/misc-3.pdf
@@ -13,6 +13,7 @@
(No Special Characters)
(These: and no more)
<feff03c003c903c403b103c403c9>
+ <feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd60>
]
/Type /Catalog
>>
@@ -109,19 +110,19 @@ xref
0 10
0000000000 65535 f
0000000025 00000 n
-0000000226 00000 n
-0000000308 00000 n
-0000000543 00000 n
-0000000642 00000 n
-0000000684 00000 n
-0000000782 00000 n
-0000000801 00000 n
-0000000919 00000 n
+0000000377 00000 n
+0000000459 00000 n
+0000000694 00000 n
+0000000793 00000 n
+0000000835 00000 n
+0000000933 00000 n
+0000000952 00000 n
+0000001070 00000 n
trailer <<
/Root 1 0 R
/Size 10
/ID [<e017d8dc1fe53a81e40aa79bcb43fdec><76269ee0b6579446b731e060af8ef436>]
>>
startxref
-954
+1105
%%EOF