about | summary | refs | log | tree | commit | diff | stats
path: root/libqpdf/QPDF_String.cc
diff options
context:
space:
mode:
author: Jay Berkenbilt <ejb@ql.org> 2018-02-18 00:47:57 +0100
committer: Jay Berkenbilt <ejb@ql.org> 2018-02-19 03:06:27 +0100
commit: 4bb3046f0b139337a00e9182c9b47d1a3f8f8bb3 (patch)
tree: 455bf56b35aeda95a9e4581f7e0c31cf7d07c877 /libqpdf/QPDF_String.cc
parent: 2780a1871d2603e9b273580fb7978d277832c2fc (diff)
download: qpdf-4bb3046f0b139337a00e9182c9b47d1a3f8f8bb3.tar.zst
Properly handle strings with PDF Doc Encoding (fixes #179)
The QPDF_String::getUTF8Val() method was not treating strings that weren't explicitly Unicode as PDF Doc Encoded. This only affects characters in the range 0x80 through 0xa0.
Diffstat (limited to 'libqpdf/QPDF_String.cc')
-rw-r--r-- libqpdf/QPDF_String.cc 45
1 files changed, 44 insertions, 1 deletions
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index ca8d3adc..60a3e0df 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -8,6 +8,43 @@
// be used.
#include <string.h>
+// PDFDocEncoding-to-Unicode table for bytes 0x80 (128) through 0xa0 (160); entry i holds the code point for byte (128 + i), so index with (ch - 128).
+static unsigned short pdf_doc_to_unicode[] = {
+ 0x2022, // 0x80 BULLET
+ 0x2020, // 0x81 DAGGER
+ 0x2021, // 0x82 DOUBLE DAGGER
+ 0x2026, // 0x83 HORIZONTAL ELLIPSIS
+ 0x2014, // 0x84 EM DASH
+ 0x2013, // 0x85 EN DASH
+ 0x0192, // 0x86 SMALL LETTER F WITH HOOK
+ 0x2044, // 0x87 FRACTION SLASH (solidus)
+ 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ 0x2212, // 0x8a MINUS SIGN
+ 0x2030, // 0x8b PER MILLE SIGN
+ 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
+ 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
+ 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
+ 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
+ 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
+ 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
+ 0x2122, // 0x92 TRADE MARK SIGN
+ 0xfb01, // 0x93 LATIN SMALL LIGATURE FI
+ 0xfb02, // 0x94 LATIN SMALL LIGATURE FL
+ 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
+ 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
+ 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
+ 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
+ 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
+ 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
+ 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
+ 0x0153, // 0x9c LATIN SMALL LIGATURE OE
+ 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
+ 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
+ 0xfffd, // 0x9f UNDEFINED (mapped to U+FFFD REPLACEMENT CHARACTER)
+ 0x20ac, // 0xa0 EURO SIGN
+};
+
// See above about ctype.
static bool is_ascii_printable(unsigned char ch)
{
@@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const
{
for (unsigned int i = 0; i < len; ++i)
{
- result += QUtil::toUTF8(static_cast<unsigned char>(this->val.at(i)));
+ unsigned char ch = static_cast<unsigned char>(this->val.at(i));
+ unsigned short val = ch;
+ if ((ch >= 128) && (ch <= 160))
+ {
+ val = pdf_doc_to_unicode[ch - 128];
+ }
+ result += QUtil::toUTF8(val);
}
}
return result;