From 370710657a7e7c771668107d1b6407fc350a2891 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 11 Jan 2022 15:06:17 -0500 Subject: Add missing characters from PDF doc encoding (fixes #606) --- libqpdf/QUtil.cc | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'libqpdf') diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index daa663a3..c71e7923 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -37,8 +37,20 @@ # include #endif -// First element is 128 +// First element is 24 +static unsigned short pdf_doc_low_to_unicode[] = { + 0x02d8, // 0x18 BREVE + 0x02c7, // 0x19 CARON + 0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT + 0x02d9, // 0x1b DOT ABOVE + 0x02dd, // 0x1c DOUBLE ACUTE ACCENT + 0x02db, // 0x1d OGONEK + 0x02da, // 0x1e RING ABOVE + 0x02dc, // 0x1f SMALL TILDE +}; +// First element is 127 static unsigned short pdf_doc_to_unicode[] = { + 0xfffd, // 0x7f UNDEFINED 0x2022, // 0x80 BULLET 0x2020, // 0x81 DAGGER 0x2021, // 0x82 DOUBLE DAGGER @@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint) unsigned char ch = '\0'; switch (codepoint) { + case 0x02d8: + ch = 0x18; + break; + case 0x02c7: + ch = 0x19; + break; + case 0x02c6: + ch = 0x1a; + break; + case 0x02d9: + ch = 0x1b; + break; + case 0x02dd: + ch = 0x1c; + break; + case 0x02db: + ch = 0x1d; + break; + case 0x02da: + ch = 0x1e; + break; + case 0x02dc: + ch = 0x1f; + break; case 0x2022: ch = 0x80; break; @@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val) { unsigned char ch = static_cast<unsigned char>(val.at(i)); unsigned short ch_short = ch; - if ((ch >= 128) && (ch <= 160)) + if ((ch >= 127) && (ch <= 160)) + { + ch_short = pdf_doc_to_unicode[ch - 127]; + } + else if ((ch >= 24) && (ch <= 31)) { - ch_short = pdf_doc_to_unicode[ch - 128]; + ch_short = pdf_doc_low_to_unicode[ch - 24]; } result += QUtil::toUTF8(ch_short); } -- cgit v1.2.3-54-g00ecf