diff options
Diffstat (limited to 'libqpdf/QUtil.cc')
-rw-r--r-- | libqpdf/QUtil.cc | 1387 |
1 files changed, 1370 insertions, 17 deletions
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index e2bc0bac..58646ade 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -7,12 +7,15 @@ # include <qpdf/InsecureRandomDataProvider.hh> #endif #include <qpdf/SecureRandomDataProvider.hh> +#include <qpdf/QPDFSystemError.hh> +#include <qpdf/QTC.hh> #include <cmath> #include <iomanip> #include <sstream> #include <fstream> #include <stdexcept> +#include <set> #include <stdio.h> #include <errno.h> #include <ctype.h> @@ -28,6 +31,208 @@ #include <sys/stat.h> #endif +// First element is 128 +static unsigned short pdf_doc_to_unicode[] = { + 0x2022, // 0x80 BULLET + 0x2020, // 0x81 DAGGER + 0x2021, // 0x82 DOUBLE DAGGER + 0x2026, // 0x83 HORIZONTAL ELLIPSIS + 0x2014, // 0x84 EM DASH + 0x2013, // 0x85 EN DASH + 0x0192, // 0x86 SMALL LETTER F WITH HOOK + 0x2044, // 0x87 FRACTION SLASH (solidus) + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 0x2212, // 0x8a MINUS SIGN + 0x2030, // 0x8b PER MILLE SIGN + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) + 0x2122, // 0x92 TRADE MARK SIGN + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE + 0x0153, // 0x9c LATIN SMALL LIGATURE OE + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON + 0xfffd, // 0x9f UNDEFINED + 0x20ac, // 0xa0 EURO SIGN +}; +static unsigned short win_ansi_to_unicode[] = { + 0x20ac, // 0x80 + 0xfffd, // 0x81 + 0x201a, // 0x82 + 0x0192, // 0x83 + 0x201e, // 0x84 + 0x2026, // 0x85 + 0x2020, // 0x86 + 0x2021, // 0x87 + 0x02c6, // 0x88 + 0x2030, // 0x89 + 0x0160, // 0x8a + 0x2039, // 0x8b + 0x0152, // 0x8c + 0xfffd, // 0x8d + 0x017d, // 0x8e + 0xfffd, // 0x8f + 0xfffd, // 0x90 + 0x2018, // 0x91 + 0x2019, // 0x92 + 0x201c, // 0x93 + 0x201d, // 0x94 + 0x2022, // 0x95 + 0x2013, // 0x96 + 0x2014, // 0x97 + 0x0303, // 0x98 + 0x2122, // 0x99 + 0x0161, // 0x9a + 0x203a, // 0x9b + 0x0153, // 0x9c + 0xfffd, // 0x9d + 0x017e, // 0x9e + 0x0178, // 0x9f + 0x00a0, // 0xa0 +}; +static unsigned short mac_roman_to_unicode[] = { + 0x00c4, // 0x80 + 0x00c5, // 0x81 + 0x00c7, // 0x82 + 0x00c9, // 0x83 + 0x00d1, // 0x84 + 0x00d6, // 0x85 + 0x00dc, // 0x86 + 0x00e1, // 0x87 + 0x00e0, // 0x88 + 0x00e2, // 0x89 + 0x00e4, // 0x8a + 0x00e3, // 0x8b + 0x00e5, // 0x8c + 0x00e7, // 0x8d + 0x00e9, // 0x8e + 0x00e8, // 0x8f + 0x00ea, // 0x90 + 0x00eb, // 0x91 + 0x00ed, // 0x92 + 0x00ec, // 0x93 + 0x00ee, // 0x94 + 0x00ef, // 0x95 + 0x00f1, // 0x96 + 0x00f3, // 0x97 + 0x00f2, // 0x98 + 0x00f4, // 0x99 + 0x00f6, // 0x9a + 0x00f5, // 0x9b + 0x00fa, // 0x9c + 0x00f9, // 0x9d + 0x00fb, // 0x9e + 0x00fc, // 0x9f + 0x2020, // 0xa0 + 0x00b0, // 0xa1 + 0x00a2, // 0xa2 + 0x00a3, // 0xa3 + 0x00a7, // 0xa4 + 0x2022, // 0xa5 + 0x00b6, // 0xa6 + 0x00df, // 0xa7 + 0x00ae, // 0xa8 + 0x00a9, // 0xa9 + 0x2122, // 0xaa + 0x0301, // 0xab + 0x0308, // 0xac + 0xfffd, // 0xad + 0x00c6, // 0xae + 0x00d8, // 0xaf + 0xfffd, // 0xb0 + 0x00b1, // 0xb1 + 0xfffd, // 0xb2 + 0xfffd, // 0xb3 + 0x00a5, // 0xb4 + 0x03bc, // 0xb5 + 0xfffd, // 0xb6 + 0xfffd, // 0xb7 + 0xfffd, // 0xb8 + 0xfffd, // 0xb9 + 0xfffd, // 0xba + 0x1d43, // 0xbb + 0x1d52, // 0xbc + 0xfffd, // 0xbd + 0x00e6, // 0xbe + 0x00f8, // 0xbf + 0x00bf, // 0xc0 + 0x00a1, // 0xc1 + 0x00ac, // 0xc2 + 0xfffd, // 0xc3 + 0x0192, // 0xc4 + 0xfffd, // 0xc5 + 0xfffd, // 0xc6 + 0x00ab, // 0xc7 + 0x00bb, // 0xc8 + 0x2026, // 0xc9 + 0xfffd, // 0xca + 0x00c0, // 0xcb + 0x00c3, // 0xcc + 0x00d5, // 0xcd + 0x0152, // 0xce + 0x0153, // 0xcf + 0x2013, // 0xd0 + 0x2014, // 0xd1 + 0x201c, // 0xd2 + 0x201d, // 0xd3 + 0x2018, // 0xd4 + 0x2019, // 0xd5 + 0x00f7, // 0xd6 + 0xfffd, // 0xd7 + 0x00ff, // 0xd8 + 0x0178, // 0xd9 + 0x2044, // 0xda + 0x00a4, // 0xdb + 0x2039, // 0xdc + 0x203a, // 0xdd + 0xfb01, // 0xde + 0xfb02, // 0xdf + 0x2021, // 0xe0 + 0x00b7, // 0xe1 + 0x201a, // 0xe2 + 0x201e, // 0xe3 + 0x2030, // 0xe4 + 0x00c2, // 0xe5 + 0x00ca, // 0xe6 + 0x00c1, // 0xe7 + 0x00cb, // 0xe8 + 0x00c8, // 0xe9 + 0x00cd, // 0xea + 0x00ce, // 0xeb + 0x00cf, // 0xec + 0x00cc, // 0xed + 0x00d3, // 0xee + 0x00d4, // 0xef + 0xfffd, // 0xf0 + 0x00d2, // 0xf1 + 0x00da, // 0xf2 + 0x00db, // 0xf3 + 0x00d9, // 0xf4 + 0x0131, // 0xf5 + 0x02c6, // 0xf6 + 0x0303, // 0xf7 + 0x0304, // 0xf8 + 0x0306, // 0xf9 + 0x0307, // 0xfa + 0x030a, // 0xfb + 0x0327, // 0xfc + 0x030b, // 0xfd + 0x0328, // 0xfe + 0x02c7, // 0xff +}; + std::string QUtil::int_to_string(long long num, int length) { @@ -132,22 +337,7 @@ QUtil::unsigned_char_pointer(char const* str) void QUtil::throw_system_error(std::string const& description) { -#ifdef _MSC_VER - // "94" is mentioned in the MSVC docs, but it's still safe if the - // message is longer. strerror_s is a templated function that - // knows the size of buf and truncates. - char buf[94]; - if (strerror_s(buf, errno) != 0) - { - throw std::runtime_error(description + ": failed with an unknown error"); - } - else - { - throw std::runtime_error(description + ": " + buf); - } -#else - throw std::runtime_error(description + ": " + strerror(errno)); -#endif + throw QPDFSystemError(description, errno); } int @@ -228,13 +418,14 @@ QUtil::same_file(char const* name1, char const* name2) return false; } #ifdef _WIN32 + bool same = false; +# ifndef AVOID_WINDOWS_HANDLE HANDLE fh1 = CreateFile(name1, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); HANDLE fh2 = CreateFile(name2, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); BY_HANDLE_FILE_INFORMATION fi1; BY_HANDLE_FILE_INFORMATION fi2; - bool same = false; if ((fh1 != INVALID_HANDLE_VALUE) && (fh2 != INVALID_HANDLE_VALUE) && GetFileInformationByHandle(fh1, &fi1) && @@ -253,6 +444,7 @@ QUtil::same_file(char const* name1, char const* name2) { CloseHandle(fh2); } +# endif return same; #else struct stat st1; @@ -732,3 +924,1164 @@ QUtil::strcasecmp(char const *s1, char const *s2) return ::strcasecmp(s1, s2); #endif } + +static int maybe_from_end(int num, bool from_end, int max) +{ + if (from_end) + { + if (num > max) + { + num = 0; + } + else + { + num = max + 1 - num; + } + } + return num; +} + +std::vector<int> +QUtil::parse_numrange(char const* range, int max) +{ + std::vector<int> result; + char const* p = range; + try + { + std::vector<int> work; + static int const comma = -1; + static int const dash = -2; + + enum { st_top, + st_in_number, + st_after_number } state = st_top; + bool last_separator_was_dash = false; + int cur_number = 0; + bool from_end = false; + while (*p) + { + char ch = *p; + if (isdigit(ch)) + { + if (! ((state == st_top) || (state == st_in_number))) + { + throw std::runtime_error("digit not expected"); + } + state = st_in_number; + cur_number *= 10; + cur_number += (ch - '0'); + } + else if (ch == 'z') + { + // z represents max + if (! (state == st_top)) + { + throw std::runtime_error("z not expected"); + } + state = st_after_number; + cur_number = max; + } + else if (ch == 'r') + { + if (! (state == st_top)) + { + throw std::runtime_error("r not expected"); + } + state = st_in_number; + from_end = true; + } + else if ((ch == ',') || (ch == '-')) + { + if (! ((state == st_in_number) || (state == st_after_number))) + { + throw std::runtime_error("unexpected separator"); + } + cur_number = maybe_from_end(cur_number, from_end, max); + work.push_back(cur_number); + cur_number = 0; + from_end = false; + if (ch == ',') + { + state = st_top; + last_separator_was_dash = false; + work.push_back(comma); + } + else if (ch == '-') + { + if (last_separator_was_dash) + { + throw std::runtime_error("unexpected dash"); + } + state = st_top; + last_separator_was_dash = true; + work.push_back(dash); + } + } + else + { + throw std::runtime_error("unexpected character"); + } + ++p; + } + if ((state == st_in_number) || (state == st_after_number)) + { + cur_number = maybe_from_end(cur_number, from_end, max); + work.push_back(cur_number); + } + else + { + throw std::runtime_error("number expected"); + } + + p = 0; + for (size_t i = 0; i < work.size(); i += 2) + { + int num = work.at(i); + // max == 0 means we don't know the max and are just + // testing for valid syntax. + if ((max > 0) && ((num < 1) || (num > max))) + { + throw std::runtime_error( + "number " + QUtil::int_to_string(num) + " out of range"); + } + if (i == 0) + { + result.push_back(work.at(i)); + } + else + { + int separator = work.at(i-1); + if (separator == comma) + { + result.push_back(num); + } + else if (separator == dash) + { + int lastnum = result.back(); + if (num > lastnum) + { + for (int j = lastnum + 1; j <= num; ++j) + { + result.push_back(j); + } + } + else + { + for (int j = lastnum - 1; j >= num; --j) + { + result.push_back(j); + } + } + } + else + { + throw std::logic_error( + "INTERNAL ERROR parsing numeric range"); + } + } + } + } + catch (std::runtime_error const& e) + { + std::string message; + if (p) + { + message = "error at * in numeric range " + + std::string(range, p - range) + "*" + p + ": " + e.what(); + } + else + { + message = "error in numeric range " + + std::string(range) + ": " + e.what(); + } + throw std::runtime_error(message); + } + return result; +} + +enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc }; + +static unsigned char +encode_winansi(unsigned long codepoint) +{ + // Use this ugly switch statement to avoid a static, which is not + // thread-safe. + unsigned char ch = '\0'; + switch (codepoint) + { + case 0x20ac: + ch = 0x80; + break; + case 0x201a: + ch = 0x82; + break; + case 0x192: + ch = 0x83; + break; + case 0x201e: + ch = 0x84; + break; + case 0x2026: + ch = 0x85; + break; + case 0x2020: + ch = 0x86; + break; + case 0x2021: + ch = 0x87; + break; + case 0x2c6: + ch = 0x88; + break; + case 0x2030: + ch = 0x89; + break; + case 0x160: + ch = 0x8a; + break; + case 0x2039: + ch = 0x8b; + break; + case 0x152: + ch = 0x8c; + break; + case 0x17d: + ch = 0x8e; + break; + case 0x2018: + ch = 0x91; + break; + case 0x2019: + ch = 0x92; + break; + case 0x201c: + ch = 0x93; + break; + case 0x201d: + ch = 0x94; + break; + case 0x2022: + ch = 0x95; + break; + case 0x2013: + ch = 0x96; + break; + case 0x2014: + ch = 0x97; + break; + case 0x303: + ch = 0x98; + break; + case 0x2122: + ch = 0x99; + break; + case 0x161: + ch = 0x9a; + break; + case 0x203a: + ch = 0x9b; + break; + case 0x153: + ch = 0x9c; + break; + case 0x17e: + ch = 0x9e; + break; + case 0x178: + ch = 0x9f; + break; + case 0xa0: + ch = 0xa0; + break; + default: + break; + } + return ch; +} + +static unsigned char +encode_macroman(unsigned long codepoint) +{ + // Use this ugly switch statement to avoid a static, which is not + // thread-safe. + unsigned char ch = '\0'; + switch (codepoint) + { + case 0xc4: + ch = 0x80; + break; + case 0xc5: + ch = 0x81; + break; + case 0xc7: + ch = 0x82; + break; + case 0xc9: + ch = 0x83; + break; + case 0xd1: + ch = 0x84; + break; + case 0xd6: + ch = 0x85; + break; + case 0xdc: + ch = 0x86; + break; + case 0xe1: + ch = 0x87; + break; + case 0xe0: + ch = 0x88; + break; + case 0xe2: + ch = 0x89; + break; + case 0xe4: + ch = 0x8a; + break; + case 0xe3: + ch = 0x8b; + break; + case 0xe5: + ch = 0x8c; + break; + case 0xe7: + ch = 0x8d; + break; + case 0xe9: + ch = 0x8e; + break; + case 0xe8: + ch = 0x8f; + break; + case 0xea: + ch = 0x90; + break; + case 0xeb: + ch = 0x91; + break; + case 0xed: + ch = 0x92; + break; + case 0xec: + ch = 0x93; + break; + case 0xee: + ch = 0x94; + break; + case 0xef: + ch = 0x95; + break; + case 0xf1: + ch = 0x96; + break; + case 0xf3: + ch = 0x97; + break; + case 0xf2: + ch = 0x98; + break; + case 0xf4: + ch = 0x99; + break; + case 0xf6: + ch = 0x9a; + break; + case 0xf5: + ch = 0x9b; + break; + case 0xfa: + ch = 0x9c; + break; + case 0xf9: + ch = 0x9d; + break; + case 0xfb: + ch = 0x9e; + break; + case 0xfc: + ch = 0x9f; + break; + case 0x2020: + ch = 0xa0; + break; + case 0xb0: + ch = 0xa1; + break; + case 0xa2: + ch = 0xa2; + break; + case 0xa3: + ch = 0xa3; + break; + case 0xa7: + ch = 0xa4; + break; + case 0x2022: + ch = 0xa5; + break; + case 0xb6: + ch = 0xa6; + break; + case 0xdf: + ch = 0xa7; + break; + case 0xae: + ch = 0xa8; + break; + case 0xa9: + ch = 0xa9; + break; + case 0x2122: + ch = 0xaa; + break; + case 0x301: + ch = 0xab; + break; + case 0x308: + ch = 0xac; + break; + case 0xc6: + ch = 0xae; + break; + case 0xd8: + ch = 0xaf; + break; + case 0xb1: + ch = 0xb1; + break; + case 0xa5: + ch = 0xb4; + break; + case 0x3bc: + ch = 0xb5; + break; + case 0x1d43: + ch = 0xbb; + break; + case 0x1d52: + ch = 0xbc; + break; + case 0xe6: + ch = 0xbe; + break; + case 0xf8: + ch = 0xbf; + break; + case 0xbf: + ch = 0xc0; + break; + case 0xa1: + ch = 0xc1; + break; + case 0xac: + ch = 0xc2; + break; + case 0x192: + ch = 0xc4; + break; + case 0xab: + ch = 0xc7; + break; + case 0xbb: + ch = 0xc8; + break; + case 0x2026: + ch = 0xc9; + break; + case 0xc0: + ch = 0xcb; + break; + case 0xc3: + ch = 0xcc; + break; + case 0xd5: + ch = 0xcd; + break; + case 0x152: + ch = 0xce; + break; + case 0x153: + ch = 0xcf; + break; + case 0x2013: + ch = 0xd0; + break; + case 0x2014: + ch = 0xd1; + break; + case 0x201c: + ch = 0xd2; + break; + case 0x201d: + ch = 0xd3; + break; + case 0x2018: + ch = 0xd4; + break; + case 0x2019: + ch = 0xd5; + break; + case 0xf7: + ch = 0xd6; + break; + case 0xff: + ch = 0xd8; + break; + case 0x178: + ch = 0xd9; + break; + case 0x2044: + ch = 0xda; + break; + case 0xa4: + ch = 0xdb; + break; + case 0x2039: + ch = 0xdc; + break; + case 0x203a: + ch = 0xdd; + break; + case 0xfb01: + ch = 0xde; + break; + case 0xfb02: + ch = 0xdf; + break; + case 0x2021: + ch = 0xe0; + break; + case 0xb7: + ch = 0xe1; + break; + case 0x201a: + ch = 0xe2; + break; + case 0x201e: + ch = 0xe3; + break; + case 0x2030: + ch = 0xe4; + break; + case 0xc2: + ch = 0xe5; + break; + case 0xca: + ch = 0xe6; + break; + case 0xc1: + ch = 0xe7; + break; + case 0xcb: + ch = 0xe8; + break; + case 0xc8: + ch = 0xe9; + break; + case 0xcd: + ch = 0xea; + break; + case 0xce: + ch = 0xeb; + break; + case 0xcf: + ch = 0xec; + break; + case 0xcc: + ch = 0xed; + break; + case 0xd3: + ch = 0xee; + break; + case 0xd4: + ch = 0xef; + break; + case 0xd2: + ch = 0xf1; + break; + case 0xda: + ch = 0xf2; + break; + case 0xdb: + ch = 0xf3; + break; + case 0xd9: + ch = 0xf4; + break; + case 0x131: + ch = 0xf5; + break; + case 0x2c6: + ch = 0xf6; + break; + case 0x303: + ch = 0xf7; + break; + case 0x304: + ch = 0xf8; + break; + case 0x306: + ch = 0xf9; + break; + case 0x307: + ch = 0xfa; + break; + case 0x30a: + ch = 0xfb; + break; + case 0x327: + ch = 0xfc; + break; + case 0x30b: + ch = 0xfd; + break; + case 0x328: + ch = 0xfe; + break; + case 0x2c7: + ch = 0xff; + break; + default: + break; + } + return ch; +} + +static unsigned char +encode_pdfdoc(unsigned long codepoint) +{ + // Use this ugly switch statement to avoid a static, which is not + // thread-safe. + unsigned char ch = '\0'; + switch (codepoint) + { + case 0x2022: + ch = 0x80; + break; + case 0x2020: + ch = 0x81; + break; + case 0x2021: + ch = 0x82; + break; + case 0x2026: + ch = 0x83; + break; + case 0x2014: + ch = 0x84; + break; + case 0x2013: + ch = 0x85; + break; + case 0x0192: + ch = 0x86; + break; + case 0x2044: + ch = 0x87; + break; + case 0x2039: + ch = 0x88; + break; + case 0x203a: + ch = 0x89; + break; + case 0x2212: + ch = 0x8a; + break; + case 0x2030: + ch = 0x8b; + break; + case 0x201e: + ch = 0x8c; + break; + case 0x201c: + ch = 0x8d; + break; + case 0x201d: + ch = 0x8e; + break; + case 0x2018: + ch = 0x8f; + break; + case 0x2019: + ch = 0x90; + break; + case 0x201a: + ch = 0x91; + break; + case 0x2122: + ch = 0x92; + break; + case 0xfb01: + ch = 0x93; + break; + case 0xfb02: + ch = 0x94; + break; + case 0x0141: + ch = 0x95; + break; + case 0x0152: + ch = 0x96; + break; + case 0x0160: + ch = 0x97; + break; + case 0x0178: + ch = 0x98; + break; + case 0x017d: + ch = 0x99; + break; + case 0x0131: + ch = 0x9a; + break; + case 0x0142: + ch = 0x9b; + break; + case 0x0153: + ch = 0x9c; + break; + case 0x0161: + ch = 0x9d; + break; + case 0x017e: + ch = 0x9e; + break; + case 0xfffd: + ch = 0x9f; + break; + case 0x20ac: + ch = 0xa0; + break; + default: + break; + } + return ch; +} + +unsigned long get_next_utf8_codepoint( + std::string const& utf8_val, size_t& pos, bool& error) +{ + size_t len = utf8_val.length(); + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos)); + error = false; + if (ch < 128) + { + return static_cast<unsigned long>(ch); + } + + size_t bytes_needed = 0; + unsigned bit_check = 0x40; + unsigned char to_clear = 0x80; + while (ch & bit_check) + { + ++bytes_needed; + to_clear |= bit_check; + bit_check >>= 1; + } + if (((bytes_needed > 5) || (bytes_needed < 1)) || + ((pos + bytes_needed) >= len)) + { + error = true; + return 0xfffd; + } + + unsigned long codepoint = (ch & ~to_clear); + while (bytes_needed > 0) + { + --bytes_needed; + ch = utf8_val.at(++pos); + if ((ch & 0xc0) != 0x80) + { + --pos; + codepoint = 0xfffd; + break; + } + codepoint <<= 6; + codepoint += (ch & 0x3f); + } + return codepoint; +} + +static bool +transcode_utf8(std::string const& utf8_val, std::string& result, + encoding_e encoding, char unknown) +{ + bool okay = true; + result.clear(); + if (encoding == e_utf16) + { + result += "\xfe\xff"; + } + size_t len = utf8_val.length(); + for (size_t i = 0; i < len; ++i) + { + bool error = false; + unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); + if (error) + { + okay = false; + if (encoding == e_utf16) + { + result += "\xff\xfd"; + } + else + { + result.append(1, unknown); + } + } + else if (codepoint < 128) + { + char ch = static_cast<char>(codepoint); + if (encoding == e_utf16) + { + result += QUtil::toUTF16(ch); + } + else + { + result.append(1, ch); + } + } + else if (encoding == e_utf16) + { + result += QUtil::toUTF16(codepoint); + } + else if ((codepoint > 160) && (codepoint < 256) && + ((encoding == e_winansi) || (encoding == e_pdfdoc))) + { + result.append(1, static_cast<unsigned char>(codepoint & 0xff)); + } + else + { + unsigned char ch = '\0'; + if (encoding == e_winansi) + { + ch = encode_winansi(codepoint); + } + else if (encoding == e_macroman) + { + ch = encode_macroman(codepoint); + } + else if (encoding == e_pdfdoc) + { + ch = encode_pdfdoc(codepoint); + } + if (ch == '\0') + { + okay = false; + ch = static_cast<unsigned char>(unknown); + } + result.append(1, ch); + } + } + return okay; +} + +static std::string +transcode_utf8(std::string const& utf8_val, encoding_e encoding, + char unknown) +{ + std::string result; + transcode_utf8(utf8_val, result, encoding, unknown); + return result; +} + +std::string +QUtil::utf8_to_utf16(std::string const& utf8) +{ + return transcode_utf8(utf8, e_utf16, 0); +} + +std::string +QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) +{ + return transcode_utf8(utf8, e_ascii, unknown_char); +} + +std::string +QUtil::utf8_to_win_ansi(std::string const& utf8, char unknown_char) +{ + return transcode_utf8(utf8, e_winansi, unknown_char); +} + +std::string +QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char) +{ + return transcode_utf8(utf8, e_macroman, unknown_char); +} + +std::string +QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char) +{ + return transcode_utf8(utf8, e_pdfdoc, unknown_char); +} + +bool +QUtil::utf8_to_ascii(std::string const& utf8, std::string& ascii, + char unknown_char) +{ + return transcode_utf8(utf8, ascii, e_ascii, unknown_char); +} + +bool +QUtil::utf8_to_win_ansi(std::string const& utf8, std::string& win, + char unknown_char) +{ + return transcode_utf8(utf8, win, e_winansi, unknown_char); +} + +bool +QUtil::utf8_to_mac_roman(std::string const& utf8, std::string& mac, + char unknown_char) +{ + return transcode_utf8(utf8, mac, e_macroman, unknown_char); +} + +bool +QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, + char unknown_char) +{ + return transcode_utf8(utf8, pdfdoc, e_pdfdoc, unknown_char); +} + +bool +QUtil::is_utf16(std::string const& val) +{ + return ((val.length() >= 2) && + (val.at(0) == '\xfe') && (val.at(1) == '\xff')); +} + +std::string +QUtil::utf16_to_utf8(std::string const& val) +{ + std::string result; + // This code uses unsigned long and unsigned short to hold + // codepoint values. It requires unsigned long to be at least + // 32 bits and unsigned short to be at least 16 bits, but it + // will work fine if they are larger. + unsigned long codepoint = 0L; + size_t len = val.length(); + size_t start = 0; + if (is_utf16(val)) + { + start += 2; + } + // If the string has an odd number of bytes, the last byte is + // ignored. + for (unsigned int i = start; i < len; i += 2) + { + // Convert from UTF16-BE. If we get a malformed + // codepoint, this code will generate incorrect output + // without giving a warning. Specifically, a high + // codepoint not followed by a low codepoint will be + // discarded, and a low codepoint not preceded by a high + // codepoint will just get its low 10 bits output. + unsigned short bits = + (static_cast<unsigned char>(val.at(i)) << 8) + + static_cast<unsigned char>(val.at(i+1)); + if ((bits & 0xFC00) == 0xD800) + { + codepoint = 0x10000 + ((bits & 0x3FF) << 10); + continue; + } + else if ((bits & 0xFC00) == 0xDC00) + { + if (codepoint != 0) + { + QTC::TC("qpdf", "QUtil non-trivial UTF-16"); + } + codepoint += bits & 0x3FF; + } + else + { + codepoint = bits; + } + + result += QUtil::toUTF8(codepoint); + codepoint = 0; + } + return result; +} + +std::string +QUtil::win_ansi_to_utf8(std::string const& val) +{ + std::string result; + size_t len = val.length(); + for (unsigned int i = 0; i < len; ++i) + { + unsigned char ch = static_cast<unsigned char>(val.at(i)); + unsigned short val = ch; + if ((ch >= 128) && (ch <= 160)) + { + val = win_ansi_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); + } + return result; +} + +std::string +QUtil::mac_roman_to_utf8(std::string const& val) +{ + std::string result; + size_t len = val.length(); + for (unsigned int i = 0; i < len; ++i) + { + unsigned char ch = static_cast<unsigned char>(val.at(i)); + unsigned short val = ch; + if (ch >= 128) + { + val = mac_roman_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); + } + return result; +} + +std::string +QUtil::pdf_doc_to_utf8(std::string const& val) +{ + std::string result; + size_t len = val.length(); + for (unsigned int i = 0; i < len; ++i) + { + unsigned char ch = static_cast<unsigned char>(val.at(i)); + unsigned short val = ch; + if ((ch >= 128) && (ch <= 160)) + { + val = pdf_doc_to_unicode[ch - 128]; + } + result += QUtil::toUTF8(val); + } + return result; +} + +void +QUtil::analyze_encoding(std::string const& val, + bool& has_8bit_chars, + bool& is_valid_utf8, + bool& is_utf16) +{ + has_8bit_chars = is_utf16 = is_valid_utf8 = false; + if (QUtil::is_utf16(val)) + { + has_8bit_chars = true; + is_utf16 = true; + return; + } + size_t len = val.length(); + bool any_errors = false; + for (size_t i = 0; i < len; ++i) + { + bool error = false; + unsigned long codepoint = get_next_utf8_codepoint(val, i, error); + if (error) + { + any_errors = true; + } + if (codepoint >= 128) + { + has_8bit_chars = true; + } + } + if (has_8bit_chars && (! any_errors)) + { + is_valid_utf8 = true; + } +} + +std::vector<std::string> +QUtil::possible_repaired_encodings(std::string supplied) +{ + std::vector<std::string> result; + // Always include the original string + result.push_back(supplied); + bool has_8bit_chars = false; + bool is_valid_utf8 = false; + bool is_utf16 = false; + analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16); + if (! has_8bit_chars) + { + return result; + } + if (is_utf16) + { + // Convert to UTF-8 and pretend we got a UTF-8 string. + is_utf16 = false; + is_valid_utf8 = true; + supplied = utf16_to_utf8(supplied); + } + std::string output; + if (is_valid_utf8) + { + // Maybe we were given UTF-8 but wanted one of the single-byte + // encodings. + if (utf8_to_pdf_doc(supplied, output)) + { + result.push_back(output); + } + if (utf8_to_win_ansi(supplied, output)) + { + result.push_back(output); + } + if (utf8_to_mac_roman(supplied, output)) + { + result.push_back(output); + } + } + else + { + // Maybe we were given one of the single-byte encodings but + // wanted UTF-8. + std::string from_pdf_doc(pdf_doc_to_utf8(supplied)); + result.push_back(from_pdf_doc); + std::string from_win_ansi(win_ansi_to_utf8(supplied)); + result.push_back(from_win_ansi); + std::string from_mac_roman(mac_roman_to_utf8(supplied)); + result.push_back(from_mac_roman); + + // Maybe we were given one of the other single-byte encodings + // but wanted one of the other ones. + if (utf8_to_win_ansi(from_pdf_doc, output)) + { + result.push_back(output); + } + if (utf8_to_mac_roman(from_pdf_doc, output)) + { + result.push_back(output); + } + if (utf8_to_pdf_doc(from_win_ansi, output)) + { + result.push_back(output); + } + if (utf8_to_mac_roman(from_win_ansi, output)) + { + result.push_back(output); + } + if (utf8_to_pdf_doc(from_mac_roman, output)) + { + result.push_back(output); + } + if (utf8_to_win_ansi(from_mac_roman, output)) + { + result.push_back(output); + } + } + // De-duplicate + std::vector<std::string> t; + std::set<std::string> seen; + for (std::vector<std::string>::iterator iter = result.begin(); + iter != result.end(); ++iter) + { + if (! seen.count(*iter)) + { + seen.insert(*iter); + t.push_back(*iter); + } + } + return t; +} |