From 6817ca585a44eec2c50961ba7c714b419774da36 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 13 Jan 2019 08:47:10 -0500 Subject: Bidirectional transcoding for win, mac, pdf, utf8, utf16 --- libqpdf/QUtil.cc | 706 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 449 insertions(+), 257 deletions(-) (limited to 'libqpdf') diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index ac501676..9dbce98e 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -66,6 +66,171 @@ static unsigned short pdf_doc_to_unicode[] = { 0xfffd, // 0x9f UNDEFINED 0x20ac, // 0xa0 EURO SIGN }; +static unsigned short win_ansi_to_unicode[] = { + 0x20ac, // 0x80 + 0xfffd, // 0x81 + 0x201a, // 0x82 + 0x0192, // 0x83 + 0x201e, // 0x84 + 0x2026, // 0x85 + 0x2020, // 0x86 + 0x2021, // 0x87 + 0x02c6, // 0x88 + 0x2030, // 0x89 + 0x0160, // 0x8a + 0x2039, // 0x8b + 0x0152, // 0x8c + 0xfffd, // 0x8d + 0x017d, // 0x8e + 0xfffd, // 0x8f + 0xfffd, // 0x90 + 0x2018, // 0x91 + 0x2019, // 0x92 + 0x201c, // 0x93 + 0x201d, // 0x94 + 0x2022, // 0x95 + 0x2013, // 0x96 + 0x2014, // 0x97 + 0x0303, // 0x98 + 0x2122, // 0x99 + 0x0161, // 0x9a + 0x203a, // 0x9b + 0x0153, // 0x9c + 0xfffd, // 0x9d + 0x017e, // 0x9e + 0x0178, // 0x9f + 0x00a0, // 0xa0 +}; +static unsigned short mac_roman_to_unicode[] = { + 0x00c4, // 0x80 + 0x00c5, // 0x81 + 0x00c7, // 0x82 + 0x00c9, // 0x83 + 0x00d1, // 0x84 + 0x00d6, // 0x85 + 0x00dc, // 0x86 + 0x00e1, // 0x87 + 0x00e0, // 0x88 + 0x00e2, // 0x89 + 0x00e4, // 0x8a + 0x00e3, // 0x8b + 0x00e5, // 0x8c + 0x00e7, // 0x8d + 0x00e9, // 0x8e + 0x00e8, // 0x8f + 0x00ea, // 0x90 + 0x00eb, // 0x91 + 0x00ed, // 0x92 + 0x00ec, // 0x93 + 0x00ee, // 0x94 + 0x00ef, // 0x95 + 0x00f1, // 0x96 + 0x00f3, // 0x97 + 0x00f2, // 0x98 + 0x00f4, // 0x99 + 0x00f6, // 0x9a + 0x00f5, // 0x9b + 0x00fa, // 0x9c + 0x00f9, // 0x9d + 0x00fb, // 0x9e + 0x00fc, // 0x9f + 0x2020, // 0xa0 + 0x00b0, // 0xa1 + 0x00a2, // 0xa2 + 0x00a3, // 0xa3 + 0x00a7, // 0xa4 + 0x2022, // 0xa5 + 0x00b6, // 0xa6 + 0x00df, // 0xa7 + 0x00ae, // 0xa8 + 0x00a9, // 0xa9 + 0x2122, // 0xaa + 0x0301, // 0xab + 0x0308, // 0xac + 0xfffd, // 0xad + 0x00c6, // 0xae + 0x00d8, // 0xaf + 0xfffd, // 0xb0 + 0x00b1, // 0xb1 + 0xfffd, // 0xb2 + 0xfffd, // 0xb3 + 0x00a5, // 0xb4 + 0x03bc, // 0xb5 + 0xfffd, // 0xb6 + 0xfffd, // 0xb7 + 0xfffd, // 0xb8 + 0xfffd, // 0xb9 + 0xfffd, // 0xba + 0x1d43, // 0xbb + 0x1d52, // 0xbc + 0xfffd, // 0xbd + 0x00e6, // 0xbe + 0x00f8, // 0xbf + 0x00bf, // 0xc0 + 0x00a1, // 0xc1 + 0x00ac, // 0xc2 + 0xfffd, // 0xc3 + 0x0192, // 0xc4 + 0xfffd, // 0xc5 + 0xfffd, // 0xc6 + 0x00ab, // 0xc7 + 0x00bb, // 0xc8 + 0x2026, // 0xc9 + 0xfffd, // 0xca + 0x00c0, // 0xcb + 0x00c3, // 0xcc + 0x00d5, // 0xcd + 0x0152, // 0xce + 0x0153, // 0xcf + 0x2013, // 0xd0 + 0x2014, // 0xd1 + 0x201c, // 0xd2 + 0x201d, // 0xd3 + 0x2018, // 0xd4 + 0x2019, // 0xd5 + 0x00f7, // 0xd6 + 0xfffd, // 0xd7 + 0x00ff, // 0xd8 + 0x0178, // 0xd9 + 0x2044, // 0xda + 0x00a4, // 0xdb + 0x2039, // 0xdc + 0x203a, // 0xdd + 0xfb01, // 0xde + 0xfb02, // 0xdf + 0x2021, // 0xe0 + 0x00b7, // 0xe1 + 0x201a, // 0xe2 + 0x201e, // 0xe3 + 0x2030, // 0xe4 + 0x00c2, // 0xe5 + 0x00ca, // 0xe6 + 0x00c1, // 0xe7 + 0x00cb, // 0xe8 + 0x00c8, // 0xe9 + 0x00cd, // 0xea + 0x00ce, // 0xeb + 0x00cf, // 0xec + 0x00cc, // 0xed + 0x00d3, // 0xee + 0x00d4, // 0xef + 0xfffd, // 0xf0 + 0x00d2, // 0xf1 + 0x00da, // 0xf2 + 0x00db, // 0xf3 + 0x00d9, // 0xf4 + 0x0131, // 0xf5 + 0x02c6, // 0xf6 + 0x0303, // 0xf7 + 0x0304, // 0xf8 + 0x0306, // 0xf9 + 0x0307, // 0xfa + 0x030a, // 0xfb + 0x0327, // 0xfc + 0x030b, // 0xfd + 0x0328, // 0xfe + 0x02c7, // 0xff +}; std::string QUtil::int_to_string(long long num, int length) @@ -946,23 +1111,17 @@ encode_winansi(unsigned long codepoint) case 0x20ac: ch = 0x80; break; - case 0x152: - ch = 0x8c; - break; - case 0x160: - ch = 0x8a; - break; - case 0x178: - ch = 0x9f; + case 0x201a: + ch = 0x82; break; - case 0x17d: - ch = 0x8e; + case 0x192: + ch = 0x83; break; - case 0x2022: - ch = 0x95; + case 0x201e: + ch = 0x84; break; - case 0x2c6: - ch = 0x88; + case 0x2026: + ch = 0x85; break; case 0x2020: ch = 0x86; @@ -970,32 +1129,29 @@ encode_winansi(unsigned long codepoint) case 0x2021: ch = 0x87; break; - case 0x2026: - ch = 0x85; - break; - case 0x2014: - ch = 0x97; + case 0x2c6: + ch = 0x88; break; - case 0x2013: - ch = 0x96; + case 0x2030: + ch = 0x89; break; - case 0x192: - ch = 0x83; + case 0x160: + ch = 0x8a; break; case 0x2039: ch = 0x8b; break; - case 0x203a: - ch = 0x9b; + case 0x152: + ch = 0x8c; break; - case 0x153: - ch = 0x9c; + case 0x17d: + ch = 0x8e; break; - case 0x2030: - ch = 0x89; + case 0x2018: + ch = 0x91; break; - case 0x201e: - ch = 0x84; + case 0x2019: + ch = 0x92; break; case 0x201c: ch = 0x93; @@ -1003,17 +1159,14 @@ encode_winansi(unsigned long codepoint) case 0x201d: ch = 0x94; break; - case 0x2018: - ch = 0x91; - break; - case 0x2019: - ch = 0x92; + case 0x2022: + ch = 0x95; break; - case 0x201a: - ch = 0x82; + case 0x2013: + ch = 0x96; break; - case 0x161: - ch = 0x9a; + case 0x2014: + ch = 0x97; break; case 0x303: ch = 0x98; @@ -1021,9 +1174,24 @@ encode_winansi(unsigned long codepoint) case 0x2122: ch = 0x99; break; + case 0x161: + ch = 0x9a; + break; + case 0x203a: + ch = 0x9b; + break; + case 0x153: + ch = 0x9c; + break; case 0x17e: ch = 0x9e; break; + case 0x178: + ch = 0x9f; + break; + case 0xa0: + ch = 0xa0; + break; default: break; } @@ -1038,290 +1206,209 @@ encode_macroman(unsigned long codepoint) unsigned char ch = '\0'; switch (codepoint) { - case 0xc6: - ch = 0xae; - break; - case 0xc1: - ch = 0xe7; - break; - case 0xc2: - ch = 0xe5; - break; case 0xc4: ch = 0x80; break; - case 0xc0: - ch = 0xcb; - break; case 0xc5: ch = 0x81; break; - case 0xc3: - ch = 0xcc; - break; case 0xc7: ch = 0x82; break; case 0xc9: ch = 0x83; break; - case 0xca: - ch = 0xe6; - break; - case 0xcb: - ch = 0xe8; - break; - case 0xc8: - ch = 0xe9; - break; - case 0xcd: - ch = 0xea; - break; - case 0xce: - ch = 0xeb; - break; - case 0xcf: - ch = 0xec; - break; - case 0xcc: - ch = 0xed; - break; case 0xd1: ch = 0x84; break; - case 0x152: - ch = 0xce; - break; - case 0xd3: - ch = 0xee; - break; - case 0xd4: - ch = 0xef; - break; case 0xd6: ch = 0x85; break; - case 0xd2: - ch = 0xf1; - break; - case 0xd8: - ch = 0xaf; - break; - case 0xd5: - ch = 0xcd; - break; - case 0xda: - ch = 0xf2; - break; - case 0xdb: - ch = 0xf3; - break; case 0xdc: ch = 0x86; break; - case 0xd9: - ch = 0xf4; - break; - case 0x178: - ch = 0xd9; - break; case 0xe1: ch = 0x87; break; + case 0xe0: + ch = 0x88; + break; case 0xe2: ch = 0x89; break; - case 0x301: - ch = 0xab; - break; case 0xe4: ch = 0x8a; break; - case 0xe6: - ch = 0xbe; - break; - case 0xe0: - ch = 0x88; - break; - case 0xe5: - ch = 0x8c; - break; case 0xe3: ch = 0x8b; break; - case 0x306: - ch = 0xf9; - break; - case 0x2022: - ch = 0xa5; - break; - case 0x2c7: - ch = 0xff; + case 0xe5: + ch = 0x8c; break; case 0xe7: ch = 0x8d; break; - case 0x327: - ch = 0xfc; + case 0xe9: + ch = 0x8e; break; - case 0xa2: - ch = 0xa2; + case 0xe8: + ch = 0x8f; break; - case 0x2c6: - ch = 0xf6; + case 0xea: + ch = 0x90; break; - case 0xa9: - ch = 0xa9; + case 0xeb: + ch = 0x91; break; - case 0xa4: - ch = 0xdb; + case 0xed: + ch = 0x92; break; - case 0x2020: - ch = 0xa0; + case 0xec: + ch = 0x93; break; - case 0x2021: - ch = 0xe0; + case 0xee: + ch = 0x94; break; - case 0xb0: - ch = 0xa1; + case 0xef: + ch = 0x95; break; - case 0x308: - ch = 0xac; + case 0xf1: + ch = 0x96; break; - case 0xf7: - ch = 0xd6; + case 0xf3: + ch = 0x97; break; - case 0x307: - ch = 0xfa; + case 0xf2: + ch = 0x98; break; - case 0x131: - ch = 0xf5; + case 0xf4: + ch = 0x99; break; - case 0xe9: - ch = 0x8e; + case 0xf6: + ch = 0x9a; break; - case 0xea: - ch = 0x90; + case 0xf5: + ch = 0x9b; break; - case 0xeb: - ch = 0x91; + case 0xfa: + ch = 0x9c; break; - case 0xe8: - ch = 0x8f; + case 0xf9: + ch = 0x9d; break; - case 0x2026: - ch = 0xc9; + case 0xfb: + ch = 0x9e; break; - case 0x2014: - ch = 0xd1; + case 0xfc: + ch = 0x9f; break; - case 0x2013: - ch = 0xd0; + case 0x2020: + ch = 0xa0; break; - case 0xa1: - ch = 0xc1; + case 0xb0: + ch = 0xa1; break; - case 0xfb01: - ch = 0xde; + case 0xa2: + ch = 0xa2; break; - case 0xfb02: - ch = 0xdf; + case 0xa3: + ch = 0xa3; break; - case 0x192: - ch = 0xc4; + case 0xa7: + ch = 0xa4; break; - case 0x2044: - ch = 0xda; + case 0x2022: + ch = 0xa5; + break; + case 0xb6: + ch = 0xa6; break; case 0xdf: ch = 0xa7; break; - case 0xab: - ch = 0xc7; - break; - case 0xbb: - ch = 0xc8; - break; - case 0x2039: - ch = 0xdc; + case 0xae: + ch = 0xa8; break; - case 0x203a: - ch = 0xdd; + case 0xa9: + ch = 0xa9; break; - case 0x30b: - ch = 0xfd; + case 0x2122: + ch = 0xaa; break; - case 0xed: - ch = 0x92; + case 0x301: + ch = 0xab; break; - case 0xee: - ch = 0x94; + case 0x308: + ch = 0xac; break; - case 0xef: - ch = 0x95; + case 0xc6: + ch = 0xae; break; - case 0xec: - ch = 0x93; + case 0xd8: + ch = 0xaf; break; - case 0xac: - ch = 0xc2; + case 0xb1: + ch = 0xb1; break; - case 0x304: - ch = 0xf8; + case 0xa5: + ch = 0xb4; break; case 0x3bc: ch = 0xb5; break; - case 0xf1: - ch = 0x96; + case 0x1d43: + ch = 0xbb; break; - case 0xf3: - ch = 0x97; + case 0x1d52: + ch = 0xbc; break; - case 0xf4: - ch = 0x99; + case 0xe6: + ch = 0xbe; break; - case 0xf6: - ch = 0x9a; + case 0xf8: + ch = 0xbf; break; - case 0x153: - ch = 0xcf; + case 0xbf: + ch = 0xc0; break; - case 0x328: - ch = 0xfe; + case 0xa1: + ch = 0xc1; break; - case 0xf2: - ch = 0x98; + case 0xac: + ch = 0xc2; break; - case 0x1d43: - ch = 0xbb; + case 0x192: + ch = 0xc4; break; - case 0x1d52: - ch = 0xbc; + case 0xab: + ch = 0xc7; break; - case 0xf8: - ch = 0xbf; + case 0xbb: + ch = 0xc8; break; - case 0xf5: - ch = 0x9b; + case 0x2026: + ch = 0xc9; break; - case 0xb6: - ch = 0xa6; + case 0xc0: + ch = 0xcb; break; - case 0xb7: - ch = 0xe1; + case 0xc3: + ch = 0xcc; break; - case 0x2030: - ch = 0xe4; + case 0xd5: + ch = 0xcd; break; - case 0xb1: - ch = 0xb1; + case 0x152: + ch = 0xce; break; - case 0xbf: - ch = 0xc0; + case 0x153: + ch = 0xcf; break; - case 0x201e: - ch = 0xe3; + case 0x2013: + ch = 0xd0; + break; + case 0x2014: + ch = 0xd1; break; case 0x201c: ch = 0xd2; @@ -1335,44 +1422,125 @@ encode_macroman(unsigned long codepoint) case 0x2019: ch = 0xd5; break; + case 0xf7: + ch = 0xd6; + break; + case 0xff: + ch = 0xd8; + break; + case 0x178: + ch = 0xd9; + break; + case 0x2044: + ch = 0xda; + break; + case 0xa4: + ch = 0xdb; + break; + case 0x2039: + ch = 0xdc; + break; + case 0x203a: + ch = 0xdd; + break; + case 0xfb01: + ch = 0xde; + break; + case 0xfb02: + ch = 0xdf; + break; + case 0x2021: + ch = 0xe0; + break; + case 0xb7: + ch = 0xe1; + break; case 0x201a: ch = 0xe2; break; - case 0xae: - ch = 0xa8; + case 0x201e: + ch = 0xe3; break; - case 0x30a: - ch = 0xfb; + case 0x2030: + ch = 0xe4; break; - case 0xa7: - ch = 0xa4; + case 0xc2: + ch = 0xe5; break; - case 0xa3: - ch = 0xa3; + case 0xca: + ch = 0xe6; + break; + case 0xc1: + ch = 0xe7; + break; + case 0xcb: + ch = 0xe8; + break; + case 0xc8: + ch = 0xe9; + break; + case 0xcd: + ch = 0xea; + break; + case 0xce: + ch = 0xeb; + break; + case 0xcf: + ch = 0xec; + break; + case 0xcc: + ch = 0xed; + break; + case 0xd3: + ch = 0xee; + break; + case 0xd4: + ch = 0xef; + break; + case 0xd2: + ch = 0xf1; + break; + case 0xda: + ch = 0xf2; + break; + case 0xdb: + ch = 0xf3; + break; + case 0xd9: + ch = 0xf4; + break; + case 0x131: + ch = 0xf5; + break; + case 0x2c6: + ch = 0xf6; break; case 0x303: ch = 0xf7; break; - case 0x2122: - ch = 0xaa; + case 0x304: + ch = 0xf8; break; - case 0xfa: - ch = 0x9c; + case 0x306: + ch = 0xf9; break; - case 0xfb: - ch = 0x9e; + case 0x307: + ch = 0xfa; break; - case 0xfc: - ch = 0x9f; + case 0x30a: + ch = 0xfb; break; - case 0xf9: - ch = 0x9d; + case 0x327: + ch = 0xfc; break; - case 0xff: - ch = 0xd8; + case 0x30b: + ch = 0xfd; break; - case 0xa5: - ch = 0xb4; + case 0x328: + ch = 0xfe; + break; + case 0x2c7: + ch = 0xff; break; default: break; @@ -1561,7 +1729,7 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, { result += QUtil::toUTF16(codepoint); } - else if ((codepoint >= 160) && (codepoint < 256) && + else if ((codepoint > 160) && (codepoint < 256) && ((encoding == e_winansi) || (encoding == e_pdfdoc))) { ch = static_cast(codepoint & 0xff); @@ -1686,13 +1854,37 @@ QUtil::utf16_to_utf8(std::string const& val) std::string QUtil::win_ansi_to_utf8(std::string const& val) { - return "QXXXQ"; + std::string result; + size_t len = val.length(); + for (unsigned int i = 0; i < len; ++i) + { + unsigned char ch = static_cast(val.at(i)); + unsigned short val = ch; + if ((ch >= 128) && (ch <= 160)) + { + val = win_ansi_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); + } + return result; } std::string QUtil::mac_roman_to_utf8(std::string const& val) { - return "QXXXQ"; + std::string result; + size_t len = val.length(); + for (unsigned int i = 0; i < len; ++i) + { + unsigned char ch = static_cast(val.at(i)); + unsigned short val = ch; + if (ch >= 128) + { + val = mac_roman_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); + } + return result; } std::string -- cgit v1.2.3-54-g00ecf