about summary refs log tree commit diff stats
path: root/libqpdf/QPDF_String.cc
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2019-01-13 14:00:14 +0100
committer Jay Berkenbilt <ejb@ql.org> 2019-01-17 17:43:56 +0100
commit 698485468a8b7d0f38d817d6055898932f46cc26 (patch)
tree 17fb98679692513f189d4b6049dcbce3333899bd /libqpdf/QPDF_String.cc
parent 5cfcd4f361063df8e216489915758ce40a15f15b (diff)
download qpdf-698485468a8b7d0f38d817d6055898932f46cc26.tar.zst
Move remaining existing transcoding to QUtil
Diffstat (limited to 'libqpdf/QPDF_String.cc')
-rw-r--r-- libqpdf/QPDF_String.cc 93
1 files changed, 3 insertions, 90 deletions
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index 7cfb6bcc..bf1141d1 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -8,43 +8,6 @@
// be used.
#include <string.h>
-// First element is 128
-static unsigned short pdf_doc_to_unicode[] = {
- 0x2022, // 0x80 BULLET
- 0x2020, // 0x81 DAGGER
- 0x2021, // 0x82 DOUBLE DAGGER
- 0x2026, // 0x83 HORIZONTAL ELLIPSIS
- 0x2014, // 0x84 EM DASH
- 0x2013, // 0x85 EN DASH
- 0x0192, // 0x86 SMALL LETTER F WITH HOOK
- 0x2044, // 0x87 FRACTION SLASH (solidus)
- 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
- 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
- 0x2212, // 0x8a MINUS SIGN
- 0x2030, // 0x8b PER MILLE SIGN
- 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
- 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
- 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
- 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
- 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
- 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
- 0x2122, // 0x92 TRADE MARK SIGN
- 0xfb01, // 0x93 LATIN SMALL LIGATURE FI
- 0xfb02, // 0x94 LATIN SMALL LIGATURE FL
- 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
- 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
- 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
- 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
- 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
- 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
- 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
- 0x0153, // 0x9c LATIN SMALL LIGATURE OE
- 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
- 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
- 0xfffd, // 0x9f UNDEFINED
- 0x20ac, // 0xa0 EURO SIGN
-};
-
// See above about ctype.
static bool is_ascii_printable(unsigned char ch)
{
@@ -210,62 +173,12 @@ QPDF_String::getVal() const
std::string
QPDF_String::getUTF8Val() const
{
- std::string result;
- size_t len = this->val.length();
- if ((len >= 2) && (len % 2 == 0) &&
- (this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))
+ if (QUtil::is_utf16(this->val))
{
- // This is a Unicode string using big-endian UTF-16. This
- // code uses unsigned long and unsigned short to hold
- // codepoint values. It requires unsigned long to be at least
- // 32 bits and unsigned short to be at least 16 bits, but it
- // will work fine if they are larger.
- unsigned long codepoint = 0L;
- for (unsigned int i = 2; i < len; i += 2)
- {
- // Convert from UTF16-BE. If we get a malformed
- // codepoint, this code will generate incorrect output
- // without giving a warning. Specifically, a high
- // codepoint not followed by a low codepoint will be
- // discarded, and a low codepoint not preceded by a high
- // codepoint will just get its low 10 bits output.
- unsigned short bits =
- (static_cast<unsigned char>(this->val.at(i)) << 8) +
- static_cast<unsigned char>(this->val.at(i+1));
- if ((bits & 0xFC00) == 0xD800)
- {
- codepoint = 0x10000 + ((bits & 0x3FF) << 10);
- continue;
- }
- else if ((bits & 0xFC00) == 0xDC00)
- {
- if (codepoint != 0)
- {
- QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
- }
- codepoint += bits & 0x3FF;
- }
- else
- {
- codepoint = bits;
- }
-
- result += QUtil::toUTF8(codepoint);
- codepoint = 0;
- }
+ return QUtil::utf16_to_utf8(this->val);
}
else
{
- for (unsigned int i = 0; i < len; ++i)
- {
- unsigned char ch = static_cast<unsigned char>(this->val.at(i));
- unsigned short val = ch;
- if ((ch >= 128) && (ch <= 160))
- {
- val = pdf_doc_to_unicode[ch - 128];
- }
- result += QUtil::toUTF8(val);
- }
+ return QUtil::pdf_doc_to_utf8(this->val);
}
- return result;
}