aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf/QPDF_String.cc
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2008-11-23 19:49:13 +0100
committerJay Berkenbilt <ejb@ql.org>2008-11-23 19:49:13 +0100
commit337b9007088670363ff6444b2bffa7e8aa6498dc (patch)
tree0f84d714951401be621816b81d2d9c0e181aabf0 /libqpdf/QPDF_String.cc
parent6e07eb1aaef7bb049b83083f6f37a16209b146df (diff)
downloadqpdf-337b9007088670363ff6444b2bffa7e8aa6498dc.tar.zst
handle UTF-16BE fully
git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649
Diffstat (limited to 'libqpdf/QPDF_String.cc')
-rw-r--r--libqpdf/QPDF_String.cc40
1 files changed, 36 insertions, 4 deletions
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index cc8ca042..739006b4 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -2,6 +2,8 @@
#include <qpdf/QPDF_String.hh>
#include <qpdf/QUtil.hh>
+#include <qpdf/QTC.hh>
+
// DO NOT USE ctype -- it is locale dependent for some things, and
// it's not worth the risk of including it in case it may accidentally
// be used.
@@ -159,12 +161,42 @@ QPDF_String::getUTF8Val() const
(this->val[0] == '\xfe') && (this->val[1] == '\xff'))
{
// This is a Unicode string using big-endian UTF-16. This
- // code is not actually correct as it doesn't properly handle
- // characters past 0xffff.
+ // code uses unsigned long and unsigned short to hold
+ // codepoint values. It requires unsigned long to be at least
+ // 32 bits and unsigned short to be at least 16 bits, but it
+ // will work fine if they are larger.
+ unsigned long codepoint = 0L;
for (unsigned int i = 2; i < len; i += 2)
{
- result += QUtil::toUTF8(((unsigned char) this->val[i] << 8) +
- ((unsigned char) this->val[i+1]));
+ // Convert from UTF16-BE. If we get a malformed
+ // codepoint, this code will generate incorrect output
+ // without giving a warning. Specifically, a high
+ // codepoint not followed by a low codepoint will be
+ // discarded, and a low codepoint not preceded by a high
+ // codepoint will just get its low 10 bits output.
+ unsigned short bits =
+ (((unsigned char) this->val[i]) << 8) +
+ ((unsigned char) this->val[i+1]);
+ if ((bits & 0xFC00) == 0xD800)
+ {
+ codepoint = 0x10000 + ((bits & 0x3FF) << 10);
+ continue;
+ }
+ else if ((bits & 0xFC00) == 0xDC00)
+ {
+ if (codepoint != 0)
+ {
+ QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
+ }
+ codepoint += bits & 0x3FF;
+ }
+ else
+ {
+ codepoint = bits;
+ }
+
+ result += QUtil::toUTF8(codepoint);
+ codepoint = 0;
}
}
else