From 4bb3046f0b139337a00e9182c9b47d1a3f8f8bb3 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 17 Feb 2018 18:47:57 -0500 Subject: Properly handle strings with PDF Doc Encoding (fixes #179) The QPDF_String::getUTF8Val() method was not treating strings that weren't explicitly Unicode as PDF Doc Encoded. This only affects characters in the range 0x80 through 0xa0. --- libqpdf/QPDF_String.cc | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'libqpdf/QPDF_String.cc') diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index ca8d3adc..60a3e0df 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -8,6 +8,43 @@ // be used. #include +// First element is 128 +static unsigned short pdf_doc_to_unicode[] = { + 0x2022, // 0x80 BULLET + 0x2020, // 0x81 DAGGER + 0x2021, // 0x82 DOUBLE DAGGER + 0x2026, // 0x83 HORIZONTAL ELLIPSIS + 0x2014, // 0x84 EM DASH + 0x2013, // 0x85 EN DASH + 0x0192, // 0x86 SMALL LETTER F WITH HOOK + 0x2044, // 0x87 FRACTION SLASH (solidus) + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 0x2212, // 0x8a MINUS SIGN + 0x2030, // 0x8b PER MILLE SIGN + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) + 0x2122, // 0x92 TRADE MARK SIGN + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE + 0x0153, // 0x9c LATIN SMALL LIGATURE OE + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON + 0xfffd, // 0x9f UNDEFINED + 0x20ac, // 0xa0 EURO SIGN +}; + // See above about ctype. static bool is_ascii_printable(unsigned char ch) { @@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const { for (unsigned int i = 0; i < len; ++i) { - result += QUtil::toUTF8(static_cast(this->val.at(i))); + unsigned char ch = static_cast(this->val.at(i)); + unsigned short val = ch; + if ((ch >= 128) && (ch <= 160)) + { + val = pdf_doc_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); } } return result; -- cgit v1.2.3-70-g09d2