From 698485468a8b7d0f38d817d6055898932f46cc26 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 13 Jan 2019 08:00:14 -0500 Subject: Move remaining existing transcoding to QUtil --- libqpdf/QPDF_String.cc | 93 ++------------------------------------------------ 1 file changed, 3 insertions(+), 90 deletions(-) (limited to 'libqpdf/QPDF_String.cc') diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index 7cfb6bcc..bf1141d1 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -8,43 +8,6 @@ // be used. #include -// First element is 128 -static unsigned short pdf_doc_to_unicode[] = { - 0x2022, // 0x80 BULLET - 0x2020, // 0x81 DAGGER - 0x2021, // 0x82 DOUBLE DAGGER - 0x2026, // 0x83 HORIZONTAL ELLIPSIS - 0x2014, // 0x84 EM DASH - 0x2013, // 0x85 EN DASH - 0x0192, // 0x86 SMALL LETTER F WITH HOOK - 0x2044, // 0x87 FRACTION SLASH (solidus) - 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK - 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - 0x2212, // 0x8a MINUS SIGN - 0x2030, // 0x8b PER MILLE SIGN - 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) - 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) - 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) - 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) - 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) - 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) - 0x2122, // 0x92 TRADE MARK SIGN - 0xfb01, // 0x93 LATIN SMALL LIGATURE FI - 0xfb02, // 0x94 LATIN SMALL LIGATURE FL - 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE - 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE - 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON - 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS - 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON - 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I - 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE - 0x0153, // 0x9c LATIN SMALL LIGATURE OE - 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON - 0x017e, 
// 0x9e LATIN SMALL LETTER Z WITH CARON - 0xfffd, // 0x9f UNDEFINED - 0x20ac, // 0xa0 EURO SIGN -}; - // See above about ctype. static bool is_ascii_printable(unsigned char ch) { @@ -210,62 +173,12 @@ QPDF_String::getVal() const std::string QPDF_String::getUTF8Val() const { - std::string result; - size_t len = this->val.length(); - if ((len >= 2) && (len % 2 == 0) && - (this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff')) + if (QUtil::is_utf16(this->val)) { - // This is a Unicode string using big-endian UTF-16. This - // code uses unsigned long and unsigned short to hold - // codepoint values. It requires unsigned long to be at least - // 32 bits and unsigned short to be at least 16 bits, but it - // will work fine if they are larger. - unsigned long codepoint = 0L; - for (unsigned int i = 2; i < len; i += 2) - { - // Convert from UTF16-BE. If we get a malformed - // codepoint, this code will generate incorrect output - // without giving a warning. Specifically, a high - // codepoint not followed by a low codepoint will be - // discarded, and a low codepoint not preceded by a high - // codepoint will just get its low 10 bits output. - unsigned short bits = - (static_cast<unsigned char>(this->val.at(i)) << 8) + - static_cast<unsigned char>(this->val.at(i+1)); - if ((bits & 0xFC00) == 0xD800) - { - codepoint = 0x10000 + ((bits & 0x3FF) << 10); - continue; - } - else if ((bits & 0xFC00) == 0xDC00) - { - if (codepoint != 0) - { - QTC::TC("qpdf", "QPDF_String non-trivial UTF-16"); - } - codepoint += bits & 0x3FF; - } - else - { - codepoint = bits; - } - - result += QUtil::toUTF8(codepoint); - codepoint = 0; - } + return QUtil::utf16_to_utf8(this->val); } else { - for (unsigned int i = 0; i < len; ++i) - { - unsigned char ch = static_cast<unsigned char>(this->val.at(i)); - unsigned short val = ch; - if ((ch >= 128) && (ch <= 160)) - { - val = pdf_doc_to_unicode[ch - 128]; - } - result += QUtil::toUTF8(val); - } + return QUtil::pdf_doc_to_utf8(this->val); } - return result; } -- cgit v1.2.3-54-g00ecf