diff options
author | Jay Berkenbilt <ejb@ql.org> | 2018-02-18 00:47:57 +0100 |
---|---|---|
committer | Jay Berkenbilt <ejb@ql.org> | 2018-02-19 03:06:27 +0100 |
commit | 4bb3046f0b139337a00e9182c9b47d1a3f8f8bb3 (patch) | |
tree | 455bf56b35aeda95a9e4581f7e0c31cf7d07c877 /libqpdf | |
parent | 2780a1871d2603e9b273580fb7978d277832c2fc (diff) | |
download | qpdf-4bb3046f0b139337a00e9182c9b47d1a3f8f8bb3.tar.zst |
Properly handle strings with PDF Doc Encoding (fixes #179)
The QPDF_String::getUTF8Val() method was not treating strings that
weren't explicitly Unicode as PDF Doc Encoded. This only affects
characters in the range 0x80 through 0xa0.
Diffstat (limited to 'libqpdf')
-rw-r--r-- | libqpdf/QPDF_String.cc | 45 |
1 file changed, 44 insertions, 1 deletion
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index ca8d3adc..60a3e0df 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -8,6 +8,43 @@ // be used. #include <string.h> +// First element is 128 +static unsigned short pdf_doc_to_unicode[] = { + 0x2022, // 0x80 BULLET + 0x2020, // 0x81 DAGGER + 0x2021, // 0x82 DOUBLE DAGGER + 0x2026, // 0x83 HORIZONTAL ELLIPSIS + 0x2014, // 0x84 EM DASH + 0x2013, // 0x85 EN DASH + 0x0192, // 0x86 SMALL LETTER F WITH HOOK + 0x2044, // 0x87 FRACTION SLASH (solidus) + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 0x2212, // 0x8a MINUS SIGN + 0x2030, // 0x8b PER MILLE SIGN + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) + 0x2122, // 0x92 TRADE MARK SIGN + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE + 0x0153, // 0x9c LATIN SMALL LIGATURE OE + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON + 0xfffd, // 0x9f UNDEFINED + 0x20ac, // 0xa0 EURO SIGN +}; + // See above about ctype. 
static bool is_ascii_printable(unsigned char ch) { @@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const { for (unsigned int i = 0; i < len; ++i) { - result += QUtil::toUTF8(static_cast<unsigned char>(this->val.at(i))); + unsigned char ch = static_cast<unsigned char>(this->val.at(i)); + unsigned short val = ch; + if ((ch >= 128) && (ch <= 160)) + { + val = pdf_doc_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); } } return result; |