about | summary | refs | log | tree | commit | diff | stats
path: root/libqpdf/QUtil.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libqpdf/QUtil.cc')
-rw-r--r-- libqpdf/QUtil.cc 1387
1 files changed, 1370 insertions, 17 deletions
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index e2bc0bac..58646ade 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -7,12 +7,15 @@
# include <qpdf/InsecureRandomDataProvider.hh>
#endif
#include <qpdf/SecureRandomDataProvider.hh>
+#include <qpdf/QPDFSystemError.hh>
+#include <qpdf/QTC.hh>
#include <cmath>
#include <iomanip>
#include <sstream>
#include <fstream>
#include <stdexcept>
+#include <set>
#include <stdio.h>
#include <errno.h>
#include <ctype.h>
@@ -28,6 +31,208 @@
#include <sys/stat.h>
#endif
+// Unicode code points for pdf-doc encoded bytes 0x80 through 0xa0.
+// The table is indexed by (byte - 128), so the first element is the
+// mapping for byte 128. 0xfffd is stored for the byte marked
+// UNDEFINED below.
+static unsigned short pdf_doc_to_unicode[] = {
+ 0x2022, // 0x80 BULLET
+ 0x2020, // 0x81 DAGGER
+ 0x2021, // 0x82 DOUBLE DAGGER
+ 0x2026, // 0x83 HORIZONTAL ELLIPSIS
+ 0x2014, // 0x84 EM DASH
+ 0x2013, // 0x85 EN DASH
+ 0x0192, // 0x86 SMALL LETTER F WITH HOOK
+ 0x2044, // 0x87 FRACTION SLASH (solidus)
+ 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ 0x2212, // 0x8a MINUS SIGN
+ 0x2030, // 0x8b PER MILLE SIGN
+ 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
+ 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
+ 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
+ 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
+ 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
+ 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
+ 0x2122, // 0x92 TRADE MARK SIGN
+ 0xfb01, // 0x93 LATIN SMALL LIGATURE FI
+ 0xfb02, // 0x94 LATIN SMALL LIGATURE FL
+ 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
+ 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
+ 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
+ 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
+ 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
+ 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
+ 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
+ 0x0153, // 0x9c LATIN SMALL LIGATURE OE
+ 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
+ 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
+ 0xfffd, // 0x9f UNDEFINED
+ 0x20ac, // 0xa0 EURO SIGN
+};
+// Unicode code points for win-ansi encoded bytes 0x80 through 0xa0,
+// indexed by (byte - 128). Several entries hold 0xfffd, the Unicode
+// replacement character.
+static unsigned short win_ansi_to_unicode[] = {
+ 0x20ac, // 0x80
+ 0xfffd, // 0x81
+ 0x201a, // 0x82
+ 0x0192, // 0x83
+ 0x201e, // 0x84
+ 0x2026, // 0x85
+ 0x2020, // 0x86
+ 0x2021, // 0x87
+ 0x02c6, // 0x88
+ 0x2030, // 0x89
+ 0x0160, // 0x8a
+ 0x2039, // 0x8b
+ 0x0152, // 0x8c
+ 0xfffd, // 0x8d
+ 0x017d, // 0x8e
+ 0xfffd, // 0x8f
+ 0xfffd, // 0x90
+ 0x2018, // 0x91
+ 0x2019, // 0x92
+ 0x201c, // 0x93
+ 0x201d, // 0x94
+ 0x2022, // 0x95
+ 0x2013, // 0x96
+ 0x2014, // 0x97
+ 0x0303, // 0x98
+ 0x2122, // 0x99
+ 0x0161, // 0x9a
+ 0x203a, // 0x9b
+ 0x0153, // 0x9c
+ 0xfffd, // 0x9d
+ 0x017e, // 0x9e
+ 0x0178, // 0x9f
+ 0x00a0, // 0xa0
+};
+// Unicode code points for mac-roman encoded bytes 0x80 through 0xff,
+// indexed by (byte - 128). Several entries hold 0xfffd, the Unicode
+// replacement character.
+static unsigned short mac_roman_to_unicode[] = {
+ 0x00c4, // 0x80
+ 0x00c5, // 0x81
+ 0x00c7, // 0x82
+ 0x00c9, // 0x83
+ 0x00d1, // 0x84
+ 0x00d6, // 0x85
+ 0x00dc, // 0x86
+ 0x00e1, // 0x87
+ 0x00e0, // 0x88
+ 0x00e2, // 0x89
+ 0x00e4, // 0x8a
+ 0x00e3, // 0x8b
+ 0x00e5, // 0x8c
+ 0x00e7, // 0x8d
+ 0x00e9, // 0x8e
+ 0x00e8, // 0x8f
+ 0x00ea, // 0x90
+ 0x00eb, // 0x91
+ 0x00ed, // 0x92
+ 0x00ec, // 0x93
+ 0x00ee, // 0x94
+ 0x00ef, // 0x95
+ 0x00f1, // 0x96
+ 0x00f3, // 0x97
+ 0x00f2, // 0x98
+ 0x00f4, // 0x99
+ 0x00f6, // 0x9a
+ 0x00f5, // 0x9b
+ 0x00fa, // 0x9c
+ 0x00f9, // 0x9d
+ 0x00fb, // 0x9e
+ 0x00fc, // 0x9f
+ 0x2020, // 0xa0
+ 0x00b0, // 0xa1
+ 0x00a2, // 0xa2
+ 0x00a3, // 0xa3
+ 0x00a7, // 0xa4
+ 0x2022, // 0xa5
+ 0x00b6, // 0xa6
+ 0x00df, // 0xa7
+ 0x00ae, // 0xa8
+ 0x00a9, // 0xa9
+ 0x2122, // 0xaa
+ 0x0301, // 0xab
+ 0x0308, // 0xac
+ 0xfffd, // 0xad
+ 0x00c6, // 0xae
+ 0x00d8, // 0xaf
+ 0xfffd, // 0xb0
+ 0x00b1, // 0xb1
+ 0xfffd, // 0xb2
+ 0xfffd, // 0xb3
+ 0x00a5, // 0xb4
+ 0x03bc, // 0xb5
+ 0xfffd, // 0xb6
+ 0xfffd, // 0xb7
+ 0xfffd, // 0xb8
+ 0xfffd, // 0xb9
+ 0xfffd, // 0xba
+ 0x1d43, // 0xbb
+ 0x1d52, // 0xbc
+ 0xfffd, // 0xbd
+ 0x00e6, // 0xbe
+ 0x00f8, // 0xbf
+ 0x00bf, // 0xc0
+ 0x00a1, // 0xc1
+ 0x00ac, // 0xc2
+ 0xfffd, // 0xc3
+ 0x0192, // 0xc4
+ 0xfffd, // 0xc5
+ 0xfffd, // 0xc6
+ 0x00ab, // 0xc7
+ 0x00bb, // 0xc8
+ 0x2026, // 0xc9
+ 0xfffd, // 0xca
+ 0x00c0, // 0xcb
+ 0x00c3, // 0xcc
+ 0x00d5, // 0xcd
+ 0x0152, // 0xce
+ 0x0153, // 0xcf
+ 0x2013, // 0xd0
+ 0x2014, // 0xd1
+ 0x201c, // 0xd2
+ 0x201d, // 0xd3
+ 0x2018, // 0xd4
+ 0x2019, // 0xd5
+ 0x00f7, // 0xd6
+ 0xfffd, // 0xd7
+ 0x00ff, // 0xd8
+ 0x0178, // 0xd9
+ 0x2044, // 0xda
+ 0x00a4, // 0xdb
+ 0x2039, // 0xdc
+ 0x203a, // 0xdd
+ 0xfb01, // 0xde
+ 0xfb02, // 0xdf
+ 0x2021, // 0xe0
+ 0x00b7, // 0xe1
+ 0x201a, // 0xe2
+ 0x201e, // 0xe3
+ 0x2030, // 0xe4
+ 0x00c2, // 0xe5
+ 0x00ca, // 0xe6
+ 0x00c1, // 0xe7
+ 0x00cb, // 0xe8
+ 0x00c8, // 0xe9
+ 0x00cd, // 0xea
+ 0x00ce, // 0xeb
+ 0x00cf, // 0xec
+ 0x00cc, // 0xed
+ 0x00d3, // 0xee
+ 0x00d4, // 0xef
+ 0xfffd, // 0xf0
+ 0x00d2, // 0xf1
+ 0x00da, // 0xf2
+ 0x00db, // 0xf3
+ 0x00d9, // 0xf4
+ 0x0131, // 0xf5
+ 0x02c6, // 0xf6
+ 0x0303, // 0xf7
+ 0x0304, // 0xf8
+ 0x0306, // 0xf9
+ 0x0307, // 0xfa
+ 0x030a, // 0xfb
+ 0x0327, // 0xfc
+ 0x030b, // 0xfd
+ 0x0328, // 0xfe
+ 0x02c7, // 0xff
+};
+
std::string
QUtil::int_to_string(long long num, int length)
{
@@ -132,22 +337,7 @@ QUtil::unsigned_char_pointer(char const* str)
void
QUtil::throw_system_error(std::string const& description)
{
-#ifdef _MSC_VER
- // "94" is mentioned in the MSVC docs, but it's still safe if the
- // message is longer. strerror_s is a templated function that
- // knows the size of buf and truncates.
- char buf[94];
- if (strerror_s(buf, errno) != 0)
- {
- throw std::runtime_error(description + ": failed with an unknown error");
- }
- else
- {
- throw std::runtime_error(description + ": " + buf);
- }
-#else
- throw std::runtime_error(description + ": " + strerror(errno));
-#endif
+ // Always throws: the exception is a QPDFSystemError built from the
+ // caller's description and the current value of errno.
+ throw QPDFSystemError(description, errno);
}
int
@@ -228,13 +418,14 @@ QUtil::same_file(char const* name1, char const* name2)
return false;
}
#ifdef _WIN32
+ bool same = false;
+# ifndef AVOID_WINDOWS_HANDLE
HANDLE fh1 = CreateFile(name1, GENERIC_READ, FILE_SHARE_READ,
NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
HANDLE fh2 = CreateFile(name2, GENERIC_READ, FILE_SHARE_READ,
NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
BY_HANDLE_FILE_INFORMATION fi1;
BY_HANDLE_FILE_INFORMATION fi2;
- bool same = false;
if ((fh1 != INVALID_HANDLE_VALUE) &&
(fh2 != INVALID_HANDLE_VALUE) &&
GetFileInformationByHandle(fh1, &fi1) &&
@@ -253,6 +444,7 @@ QUtil::same_file(char const* name1, char const* name2)
{
CloseHandle(fh2);
}
+# endif
return same;
#else
struct stat st1;
@@ -732,3 +924,1164 @@ QUtil::strcasecmp(char const *s1, char const *s2)
return ::strcasecmp(s1, s2);
#endif
}
+
+// Translate a number that may have been written relative to the end
+// of a 1..max sequence into an absolute number.
+//
+// When from_end is false, num is returned as is. Otherwise num counts
+// backward from max (1 means max itself); a count larger than max
+// yields 0.
+static int maybe_from_end(int num, bool from_end, int max)
+{
+    if (! from_end)
+    {
+        return num;
+    }
+    return (num > max) ? 0 : (max + 1 - num);
+}
+
+// Parse a numeric range specification such as "1,3-5,z,r2" and return
+// the numbers it denotes, in the order written.
+//
+// Syntax accepted by the scanner below:
+//   - items are separated by ','
+//   - an item is one number, or two numbers joined by '-', which
+//     expands to every number from the first to the second inclusive
+//     (counting down when the second is smaller)
+//   - 'z' stands for max
+//   - a leading 'r' makes the number count back from max ("r1" is max)
+//
+// When max > 0, every number must lie in [1, max]. When max == 0 the
+// maximum is unknown and only the syntax is checked.
+//
+// Throws std::runtime_error describing the problem. For syntax errors
+// the message shows the range with '*' inserted at the offending
+// character.
+std::vector<int>
+QUtil::parse_numrange(char const* range, int max)
+{
+    std::vector<int> result;
+    // While scanning, p points at the character being examined so the
+    // catch block can mark it. It is set to 0 once scanning is done.
+    char const* p = range;
+    try
+    {
+        // work alternates numbers (even indexes) with separator
+        // markers (odd indexes).
+        std::vector<int> work;
+        static int const comma = -1;
+        static int const dash = -2;
+
+        enum { st_top,
+               st_in_number,
+               st_after_number } state = st_top;
+        bool last_separator_was_dash = false;
+        int cur_number = 0;
+        bool from_end = false;
+        while (*p)
+        {
+            char ch = *p;
+            // isdigit has undefined behavior when given a negative
+            // value other than EOF, which a plain char can be for
+            // non-ASCII input, so convert to unsigned char first.
+            if (isdigit(static_cast<unsigned char>(ch)))
+            {
+                if (! ((state == st_top) || (state == st_in_number)))
+                {
+                    throw std::runtime_error("digit not expected");
+                }
+                state = st_in_number;
+                // NOTE(review): a very long run of digits can overflow
+                // cur_number; no guard is applied here.
+                cur_number *= 10;
+                cur_number += (ch - '0');
+            }
+            else if (ch == 'z')
+            {
+                // z represents max
+                if (! (state == st_top))
+                {
+                    throw std::runtime_error("z not expected");
+                }
+                state = st_after_number;
+                cur_number = max;
+            }
+            else if (ch == 'r')
+            {
+                if (! (state == st_top))
+                {
+                    throw std::runtime_error("r not expected");
+                }
+                state = st_in_number;
+                from_end = true;
+            }
+            else if ((ch == ',') || (ch == '-'))
+            {
+                if (! ((state == st_in_number) || (state == st_after_number)))
+                {
+                    throw std::runtime_error("unexpected separator");
+                }
+                cur_number = maybe_from_end(cur_number, from_end, max);
+                work.push_back(cur_number);
+                cur_number = 0;
+                from_end = false;
+                if (ch == ',')
+                {
+                    state = st_top;
+                    last_separator_was_dash = false;
+                    work.push_back(comma);
+                }
+                else if (ch == '-')
+                {
+                    // Only a comma clears this flag, so a second dash
+                    // in the same item ("1-3-5") is rejected.
+                    if (last_separator_was_dash)
+                    {
+                        throw std::runtime_error("unexpected dash");
+                    }
+                    state = st_top;
+                    last_separator_was_dash = true;
+                    work.push_back(dash);
+                }
+            }
+            else
+            {
+                throw std::runtime_error("unexpected character");
+            }
+            ++p;
+        }
+        if ((state == st_in_number) || (state == st_after_number))
+        {
+            cur_number = maybe_from_end(cur_number, from_end, max);
+            work.push_back(cur_number);
+        }
+        else
+        {
+            throw std::runtime_error("number expected");
+        }
+
+        // Scanning is complete; errors from here on are not tied to a
+        // character position.
+        p = 0;
+        for (size_t i = 0; i < work.size(); i += 2)
+        {
+            int num = work.at(i);
+            // max == 0 means we don't know the max and are just
+            // testing for valid syntax.
+            if ((max > 0) && ((num < 1) || (num > max)))
+            {
+                throw std::runtime_error(
+                    "number " + QUtil::int_to_string(num) + " out of range");
+            }
+            if (i == 0)
+            {
+                result.push_back(work.at(i));
+            }
+            else
+            {
+                int separator = work.at(i-1);
+                if (separator == comma)
+                {
+                    result.push_back(num);
+                }
+                else if (separator == dash)
+                {
+                    // The first number of the run is already in
+                    // result; append the rest in the right direction.
+                    int lastnum = result.back();
+                    if (num > lastnum)
+                    {
+                        for (int j = lastnum + 1; j <= num; ++j)
+                        {
+                            result.push_back(j);
+                        }
+                    }
+                    else
+                    {
+                        for (int j = lastnum - 1; j >= num; --j)
+                        {
+                            result.push_back(j);
+                        }
+                    }
+                }
+                else
+                {
+                    throw std::logic_error(
+                        "INTERNAL ERROR parsing numeric range");
+                }
+            }
+        }
+    }
+    catch (std::runtime_error const& e)
+    {
+        std::string message;
+        if (p)
+        {
+            message = "error at * in numeric range " +
+                std::string(range, p - range) + "*" + p + ": " + e.what();
+        }
+        else
+        {
+            message = "error in numeric range " +
+                std::string(range) + ": " + e.what();
+        }
+        throw std::runtime_error(message);
+    }
+    return result;
+}
+
+// Target encodings that transcode_utf8 can produce from UTF-8 input.
+enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
+
+// Return the win-ansi byte (0x80 through 0xa0) assigned to a Unicode
+// code point, or '\0' when the code point is not one of those listed.
+//
+// The mapping is written as a switch rather than a static lookup
+// structure to avoid a static, which is not thread-safe.
+static unsigned char
+encode_winansi(unsigned long codepoint)
+{
+    switch (codepoint)
+    {
+      case 0x20ac: return 0x80;
+      case 0x201a: return 0x82;
+      case 0x192:  return 0x83;
+      case 0x201e: return 0x84;
+      case 0x2026: return 0x85;
+      case 0x2020: return 0x86;
+      case 0x2021: return 0x87;
+      case 0x2c6:  return 0x88;
+      case 0x2030: return 0x89;
+      case 0x160:  return 0x8a;
+      case 0x2039: return 0x8b;
+      case 0x152:  return 0x8c;
+      case 0x17d:  return 0x8e;
+      case 0x2018: return 0x91;
+      case 0x2019: return 0x92;
+      case 0x201c: return 0x93;
+      case 0x201d: return 0x94;
+      case 0x2022: return 0x95;
+      case 0x2013: return 0x96;
+      case 0x2014: return 0x97;
+      case 0x303:  return 0x98;
+      case 0x2122: return 0x99;
+      case 0x161:  return 0x9a;
+      case 0x203a: return 0x9b;
+      case 0x153:  return 0x9c;
+      case 0x17e:  return 0x9e;
+      case 0x178:  return 0x9f;
+      case 0xa0:   return 0xa0;
+      default:     return '\0';
+    }
+}
+
+// Return the mac-roman byte (0x80 through 0xff) assigned to a Unicode
+// code point, or '\0' when the code point is not one of those listed.
+//
+// The mapping is written as a switch rather than a static lookup
+// structure to avoid a static, which is not thread-safe.
+static unsigned char
+encode_macroman(unsigned long codepoint)
+{
+    switch (codepoint)
+    {
+      case 0xc4:   return 0x80;
+      case 0xc5:   return 0x81;
+      case 0xc7:   return 0x82;
+      case 0xc9:   return 0x83;
+      case 0xd1:   return 0x84;
+      case 0xd6:   return 0x85;
+      case 0xdc:   return 0x86;
+      case 0xe1:   return 0x87;
+      case 0xe0:   return 0x88;
+      case 0xe2:   return 0x89;
+      case 0xe4:   return 0x8a;
+      case 0xe3:   return 0x8b;
+      case 0xe5:   return 0x8c;
+      case 0xe7:   return 0x8d;
+      case 0xe9:   return 0x8e;
+      case 0xe8:   return 0x8f;
+      case 0xea:   return 0x90;
+      case 0xeb:   return 0x91;
+      case 0xed:   return 0x92;
+      case 0xec:   return 0x93;
+      case 0xee:   return 0x94;
+      case 0xef:   return 0x95;
+      case 0xf1:   return 0x96;
+      case 0xf3:   return 0x97;
+      case 0xf2:   return 0x98;
+      case 0xf4:   return 0x99;
+      case 0xf6:   return 0x9a;
+      case 0xf5:   return 0x9b;
+      case 0xfa:   return 0x9c;
+      case 0xf9:   return 0x9d;
+      case 0xfb:   return 0x9e;
+      case 0xfc:   return 0x9f;
+      case 0x2020: return 0xa0;
+      case 0xb0:   return 0xa1;
+      case 0xa2:   return 0xa2;
+      case 0xa3:   return 0xa3;
+      case 0xa7:   return 0xa4;
+      case 0x2022: return 0xa5;
+      case 0xb6:   return 0xa6;
+      case 0xdf:   return 0xa7;
+      case 0xae:   return 0xa8;
+      case 0xa9:   return 0xa9;
+      case 0x2122: return 0xaa;
+      case 0x301:  return 0xab;
+      case 0x308:  return 0xac;
+      case 0xc6:   return 0xae;
+      case 0xd8:   return 0xaf;
+      case 0xb1:   return 0xb1;
+      case 0xa5:   return 0xb4;
+      case 0x3bc:  return 0xb5;
+      case 0x1d43: return 0xbb;
+      case 0x1d52: return 0xbc;
+      case 0xe6:   return 0xbe;
+      case 0xf8:   return 0xbf;
+      case 0xbf:   return 0xc0;
+      case 0xa1:   return 0xc1;
+      case 0xac:   return 0xc2;
+      case 0x192:  return 0xc4;
+      case 0xab:   return 0xc7;
+      case 0xbb:   return 0xc8;
+      case 0x2026: return 0xc9;
+      case 0xc0:   return 0xcb;
+      case 0xc3:   return 0xcc;
+      case 0xd5:   return 0xcd;
+      case 0x152:  return 0xce;
+      case 0x153:  return 0xcf;
+      case 0x2013: return 0xd0;
+      case 0x2014: return 0xd1;
+      case 0x201c: return 0xd2;
+      case 0x201d: return 0xd3;
+      case 0x2018: return 0xd4;
+      case 0x2019: return 0xd5;
+      case 0xf7:   return 0xd6;
+      case 0xff:   return 0xd8;
+      case 0x178:  return 0xd9;
+      case 0x2044: return 0xda;
+      case 0xa4:   return 0xdb;
+      case 0x2039: return 0xdc;
+      case 0x203a: return 0xdd;
+      case 0xfb01: return 0xde;
+      case 0xfb02: return 0xdf;
+      case 0x2021: return 0xe0;
+      case 0xb7:   return 0xe1;
+      case 0x201a: return 0xe2;
+      case 0x201e: return 0xe3;
+      case 0x2030: return 0xe4;
+      case 0xc2:   return 0xe5;
+      case 0xca:   return 0xe6;
+      case 0xc1:   return 0xe7;
+      case 0xcb:   return 0xe8;
+      case 0xc8:   return 0xe9;
+      case 0xcd:   return 0xea;
+      case 0xce:   return 0xeb;
+      case 0xcf:   return 0xec;
+      case 0xcc:   return 0xed;
+      case 0xd3:   return 0xee;
+      case 0xd4:   return 0xef;
+      case 0xd2:   return 0xf1;
+      case 0xda:   return 0xf2;
+      case 0xdb:   return 0xf3;
+      case 0xd9:   return 0xf4;
+      case 0x131:  return 0xf5;
+      case 0x2c6:  return 0xf6;
+      case 0x303:  return 0xf7;
+      case 0x304:  return 0xf8;
+      case 0x306:  return 0xf9;
+      case 0x307:  return 0xfa;
+      case 0x30a:  return 0xfb;
+      case 0x327:  return 0xfc;
+      case 0x30b:  return 0xfd;
+      case 0x328:  return 0xfe;
+      case 0x2c7:  return 0xff;
+      default:     return '\0';
+    }
+}
+
+// Return the pdf-doc byte (0x80 through 0xa0) assigned to a Unicode
+// code point, or '\0' when the code point is not one of those listed.
+//
+// The mapping is written as a switch rather than a static lookup
+// structure to avoid a static, which is not thread-safe.
+static unsigned char
+encode_pdfdoc(unsigned long codepoint)
+{
+    switch (codepoint)
+    {
+      case 0x2022: return 0x80;
+      case 0x2020: return 0x81;
+      case 0x2021: return 0x82;
+      case 0x2026: return 0x83;
+      case 0x2014: return 0x84;
+      case 0x2013: return 0x85;
+      case 0x0192: return 0x86;
+      case 0x2044: return 0x87;
+      case 0x2039: return 0x88;
+      case 0x203a: return 0x89;
+      case 0x2212: return 0x8a;
+      case 0x2030: return 0x8b;
+      case 0x201e: return 0x8c;
+      case 0x201c: return 0x8d;
+      case 0x201d: return 0x8e;
+      case 0x2018: return 0x8f;
+      case 0x2019: return 0x90;
+      case 0x201a: return 0x91;
+      case 0x2122: return 0x92;
+      case 0xfb01: return 0x93;
+      case 0xfb02: return 0x94;
+      case 0x0141: return 0x95;
+      case 0x0152: return 0x96;
+      case 0x0160: return 0x97;
+      case 0x0178: return 0x98;
+      case 0x017d: return 0x99;
+      case 0x0131: return 0x9a;
+      case 0x0142: return 0x9b;
+      case 0x0153: return 0x9c;
+      case 0x0161: return 0x9d;
+      case 0x017e: return 0x9e;
+      case 0xfffd: return 0x9f;
+      case 0x20ac: return 0xa0;
+      default:     return '\0';
+    }
+}
+
+// Decode the UTF-8 sequence whose first byte is at utf8_val[pos].
+//
+// On success, returns the decoded code point, sets error to false,
+// and leaves pos on the last byte of the sequence, so a caller that
+// increments pos afterwards lands on the start of the next sequence.
+//
+// On failure, returns 0xfffd and sets error to true. Failure means
+// the first byte is not a valid lead byte, the string ends before the
+// sequence is complete, or a following byte is not a continuation
+// byte. In the last case pos is left on the last byte that was
+// accepted, so the offending byte is examined again as the start of
+// a new sequence; in the other cases pos is unchanged.
+//
+// Throws std::out_of_range if pos is not a valid index into utf8_val.
+unsigned long get_next_utf8_codepoint(
+    std::string const& utf8_val, size_t& pos, bool& error)
+{
+    size_t len = utf8_val.length();
+    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
+    error = false;
+    if (ch < 128)
+    {
+        return static_cast<unsigned long>(ch);
+    }
+
+    // Count the 1 bits that follow the top bit of the lead byte; each
+    // one announces a continuation byte. to_clear accumulates the
+    // marker bits to strip from the lead byte's payload.
+    size_t bytes_needed = 0;
+    unsigned bit_check = 0x40;
+    unsigned char to_clear = 0x80;
+    while (ch & bit_check)
+    {
+        ++bytes_needed;
+        to_clear |= bit_check;
+        bit_check >>= 1;
+    }
+    if (((bytes_needed > 5) || (bytes_needed < 1)) ||
+        ((pos + bytes_needed) >= len))
+    {
+        error = true;
+        return 0xfffd;
+    }
+
+    unsigned long codepoint = (ch & ~to_clear);
+    while (bytes_needed > 0)
+    {
+        --bytes_needed;
+        ch = static_cast<unsigned char>(utf8_val.at(++pos));
+        if ((ch & 0xc0) != 0x80)
+        {
+            // Not a continuation byte. Step back so the caller looks
+            // at this byte again, and report the failure instead of
+            // handing back the replacement character as though it had
+            // been decoded from valid input.
+            --pos;
+            error = true;
+            return 0xfffd;
+        }
+        codepoint <<= 6;
+        codepoint += (ch & 0x3f);
+    }
+    return codepoint;
+}
+
+// Convert utf8_val to the requested encoding, replacing the previous
+// contents of result.
+//
+// UTF-16 output starts with the bytes 0xfe 0xff. For the single-byte
+// encodings, every character that cannot be represented is written
+// as the byte "unknown"; for UTF-16, malformed input is written as
+// the bytes 0xff 0xfd instead.
+//
+// Returns true only if the input decoded cleanly and every character
+// was representable in the target encoding.
+static bool
+transcode_utf8(std::string const& utf8_val, std::string& result,
+               encoding_e encoding, char unknown)
+{
+    bool const to_utf16 = (encoding == e_utf16);
+    // These two encodings take code points 161 through 255 unchanged.
+    bool const passes_high_latin =
+        ((encoding == e_winansi) || (encoding == e_pdfdoc));
+    bool all_converted = true;
+
+    result.clear();
+    if (to_utf16)
+    {
+        result += "\xfe\xff";
+    }
+
+    size_t const len = utf8_val.length();
+    size_t pos = 0;
+    while (pos < len)
+    {
+        bool error = false;
+        unsigned long const codepoint =
+            get_next_utf8_codepoint(utf8_val, pos, error);
+        // The decoder leaves pos on the last byte it used.
+        ++pos;
+
+        if (error)
+        {
+            all_converted = false;
+            if (to_utf16)
+            {
+                result += "\xff\xfd";
+            }
+            else
+            {
+                result.append(1, unknown);
+            }
+        }
+        else if (to_utf16)
+        {
+            if (codepoint < 128)
+            {
+                result += QUtil::toUTF16(static_cast<char>(codepoint));
+            }
+            else
+            {
+                result += QUtil::toUTF16(codepoint);
+            }
+        }
+        else if (codepoint < 128)
+        {
+            result.append(1, static_cast<char>(codepoint));
+        }
+        else if (passes_high_latin &&
+                 (codepoint > 160) && (codepoint < 256))
+        {
+            result.append(1, static_cast<unsigned char>(codepoint & 0xff));
+        }
+        else
+        {
+            // '\0' from an encoder means "no mapping"; e_ascii has no
+            // encoder, so everything reaching here is unmapped for it.
+            unsigned char mapped = '\0';
+            switch (encoding)
+            {
+              case e_winansi:
+                mapped = encode_winansi(codepoint);
+                break;
+              case e_macroman:
+                mapped = encode_macroman(codepoint);
+                break;
+              case e_pdfdoc:
+                mapped = encode_pdfdoc(codepoint);
+                break;
+              default:
+                break;
+            }
+            if (mapped == '\0')
+            {
+                all_converted = false;
+                mapped = static_cast<unsigned char>(unknown);
+            }
+            result.append(1, mapped);
+        }
+    }
+    return all_converted;
+}
+
+// Convenience form of transcode_utf8 that returns the converted
+// string and discards the success indicator.
+static std::string
+transcode_utf8(std::string const& utf8_val, encoding_e encoding,
+               char unknown)
+{
+    std::string converted;
+    (void) transcode_utf8(utf8_val, converted, encoding, unknown);
+    return converted;
+}
+
+// Convert UTF-8 text to UTF-16. The result starts with the bytes
+// 0xfe 0xff. The 0 passed as the "unknown" character is not used on
+// the UTF-16 path of transcode_utf8.
+std::string
+QUtil::utf8_to_utf16(std::string const& utf8)
+{
+ return transcode_utf8(utf8, e_utf16, 0);
+}
+
+// Convert UTF-8 text to ASCII. Code points of 128 and above, and
+// malformed input, are written as unknown_char.
+std::string
+QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
+{
+ return transcode_utf8(utf8, e_ascii, unknown_char);
+}
+
+// Convert UTF-8 text to the win-ansi encoding. Characters with no
+// win-ansi mapping, and malformed input, are written as unknown_char.
+std::string
+QUtil::utf8_to_win_ansi(std::string const& utf8, char unknown_char)
+{
+ return transcode_utf8(utf8, e_winansi, unknown_char);
+}
+
+// Convert UTF-8 text to the mac-roman encoding. Characters with no
+// mac-roman mapping, and malformed input, are written as unknown_char.
+std::string
+QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
+{
+ return transcode_utf8(utf8, e_macroman, unknown_char);
+}
+
+// Convert UTF-8 text to the pdf-doc encoding. Characters with no
+// pdf-doc mapping, and malformed input, are written as unknown_char.
+std::string
+QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
+{
+ return transcode_utf8(utf8, e_pdfdoc, unknown_char);
+}
+
+// Convert UTF-8 text to ASCII, storing the result in ascii (its
+// previous contents are replaced). Returns true only if nothing had
+// to be replaced by unknown_char.
+bool
+QUtil::utf8_to_ascii(std::string const& utf8, std::string& ascii,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, ascii, e_ascii, unknown_char);
+}
+
+// Convert UTF-8 text to the win-ansi encoding, storing the result in
+// win (its previous contents are replaced). Returns true only if
+// nothing had to be replaced by unknown_char.
+bool
+QUtil::utf8_to_win_ansi(std::string const& utf8, std::string& win,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, win, e_winansi, unknown_char);
+}
+
+// Convert UTF-8 text to the mac-roman encoding, storing the result in
+// mac (its previous contents are replaced). Returns true only if
+// nothing had to be replaced by unknown_char.
+bool
+QUtil::utf8_to_mac_roman(std::string const& utf8, std::string& mac,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, mac, e_macroman, unknown_char);
+}
+
+// Convert UTF-8 text to the pdf-doc encoding, storing the result in
+// pdfdoc (its previous contents are replaced). Returns true only if
+// nothing had to be replaced by unknown_char.
+bool
+QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, pdfdoc, e_pdfdoc, unknown_char);
+}
+
+// Report whether val begins with the two bytes 0xfe 0xff.
+bool
+QUtil::is_utf16(std::string const& val)
+{
+    if (val.length() < 2)
+    {
+        return false;
+    }
+    return (val[0] == '\xfe') && (val[1] == '\xff');
+}
+
+// Convert big-endian UTF-16 to UTF-8. A leading 0xfe 0xff marker is
+// skipped when present. Input is consumed two bytes at a time; a
+// trailing unpaired byte is ignored.
+std::string
+QUtil::utf16_to_utf8(std::string const& val)
+{
+    std::string result;
+    // This code uses unsigned long and unsigned short to hold
+    // codepoint values. It requires unsigned long to be at least
+    // 32 bits and unsigned short to be at least 16 bits, but it
+    // will work fine if they are larger.
+    unsigned long codepoint = 0L;
+    size_t len = val.length();
+    size_t start = 0;
+    if (is_utf16(val))
+    {
+        start += 2;
+    }
+    // If the string has an odd number of bytes, the last byte is
+    // ignored. The loop condition requires both bytes of a unit to
+    // be present so that val.at(i+1) never runs past the end.
+    for (size_t i = start; i + 1 < len; i += 2)
+    {
+        // Convert from UTF16-BE. If we get a malformed
+        // codepoint, this code will generate incorrect output
+        // without giving a warning. Specifically, a high
+        // codepoint not followed by a low codepoint will be
+        // discarded, and a low codepoint not preceded by a high
+        // codepoint will just get its low 10 bits output.
+        unsigned short bits =
+            (static_cast<unsigned char>(val.at(i)) << 8) +
+            static_cast<unsigned char>(val.at(i+1));
+        if ((bits & 0xFC00) == 0xD800)
+        {
+            // High surrogate: remember its contribution and wait for
+            // the low surrogate in the next unit.
+            codepoint = 0x10000 + ((bits & 0x3FF) << 10);
+            continue;
+        }
+        else if ((bits & 0xFC00) == 0xDC00)
+        {
+            if (codepoint != 0)
+            {
+                QTC::TC("qpdf", "QUtil non-trivial UTF-16");
+            }
+            codepoint += bits & 0x3FF;
+        }
+        else
+        {
+            codepoint = bits;
+        }
+
+        result += QUtil::toUTF8(codepoint);
+        codepoint = 0;
+    }
+    return result;
+}
+
+// Convert win-ansi encoded text to UTF-8. Bytes 128 through 160 are
+// translated through win_ansi_to_unicode; every other byte is taken
+// as the code point with the same value.
+std::string
+QUtil::win_ansi_to_utf8(std::string const& val)
+{
+    std::string result;
+    size_t len = val.length();
+    // size_t index: matches the type of len, and the local below no
+    // longer reuses the parameter's name.
+    for (size_t i = 0; i < len; ++i)
+    {
+        unsigned char ch = static_cast<unsigned char>(val.at(i));
+        unsigned short codepoint = ch;
+        if ((ch >= 128) && (ch <= 160))
+        {
+            codepoint = win_ansi_to_unicode[ch - 128];
+        }
+        result += QUtil::toUTF8(codepoint);
+    }
+    return result;
+}
+
+// Convert mac-roman encoded text to UTF-8. Bytes 128 and above are
+// translated through mac_roman_to_unicode; lower bytes are taken as
+// the code point with the same value.
+std::string
+QUtil::mac_roman_to_utf8(std::string const& val)
+{
+    std::string result;
+    size_t len = val.length();
+    // size_t index: matches the type of len, and the local below no
+    // longer reuses the parameter's name.
+    for (size_t i = 0; i < len; ++i)
+    {
+        unsigned char ch = static_cast<unsigned char>(val.at(i));
+        unsigned short codepoint = ch;
+        if (ch >= 128)
+        {
+            codepoint = mac_roman_to_unicode[ch - 128];
+        }
+        result += QUtil::toUTF8(codepoint);
+    }
+    return result;
+}
+
+// Convert pdf-doc encoded text to UTF-8. Bytes 128 through 160 are
+// translated through pdf_doc_to_unicode; every other byte is taken
+// as the code point with the same value.
+std::string
+QUtil::pdf_doc_to_utf8(std::string const& val)
+{
+    std::string result;
+    size_t len = val.length();
+    // size_t index: matches the type of len, and the local below no
+    // longer reuses the parameter's name.
+    for (size_t i = 0; i < len; ++i)
+    {
+        unsigned char ch = static_cast<unsigned char>(val.at(i));
+        unsigned short codepoint = ch;
+        if ((ch >= 128) && (ch <= 160))
+        {
+            codepoint = pdf_doc_to_unicode[ch - 128];
+        }
+        result += QUtil::toUTF8(codepoint);
+    }
+    return result;
+}
+
+// Classify the bytes in val.
+//
+//   is_utf16       - val starts with the bytes 0xfe 0xff; in that
+//                    case has_8bit_chars is also set and nothing
+//                    else is examined
+//   has_8bit_chars - decoding val as UTF-8 produced at least one
+//                    code point of 128 or above
+//   is_valid_utf8  - has_8bit_chars is set and decoding reported no
+//                    errors; a pure 7-bit string therefore leaves
+//                    this false
+void
+QUtil::analyze_encoding(std::string const& val,
+                        bool& has_8bit_chars,
+                        bool& is_valid_utf8,
+                        bool& is_utf16)
+{
+    has_8bit_chars = false;
+    is_valid_utf8 = false;
+    is_utf16 = false;
+
+    // The parameter named is_utf16 hides the function of the same
+    // name, hence the qualified call.
+    if (QUtil::is_utf16(val))
+    {
+        has_8bit_chars = true;
+        is_utf16 = true;
+        return;
+    }
+
+    bool decode_failed = false;
+    size_t const len = val.length();
+    size_t pos = 0;
+    while (pos < len)
+    {
+        bool error = false;
+        unsigned long const codepoint =
+            get_next_utf8_codepoint(val, pos, error);
+        // The decoder leaves pos on the last byte it used.
+        ++pos;
+        if (error)
+        {
+            decode_failed = true;
+        }
+        if (codepoint >= 128)
+        {
+            has_8bit_chars = true;
+        }
+    }
+    if (has_8bit_chars && (! decode_failed))
+    {
+        is_valid_utf8 = true;
+    }
+}
+
+// Given a string whose encoding may have been misidentified, return
+// the distinct alternative byte strings it might have been meant to
+// be. The supplied string itself is always the first element.
+//
+// A string with no 8-bit characters is returned alone. Valid UTF-8
+// (or UTF-16, which is first converted to UTF-8) is re-encoded into
+// each single-byte encoding that can represent it completely.
+// Anything else is treated as each single-byte encoding in turn,
+// converted to UTF-8, and then re-encoded into the other two
+// single-byte encodings where that succeeds.
+std::vector<std::string>
+QUtil::possible_repaired_encodings(std::string supplied)
+{
+    std::vector<std::string> candidates;
+    candidates.push_back(supplied);
+
+    bool has_8bit_chars = false;
+    bool is_valid_utf8 = false;
+    bool is_utf16 = false;
+    analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16);
+    if (! has_8bit_chars)
+    {
+        return candidates;
+    }
+    if (is_utf16)
+    {
+        // From here on, handle UTF-16 input as its UTF-8 equivalent.
+        is_valid_utf8 = true;
+        supplied = utf16_to_utf8(supplied);
+    }
+
+    std::string converted;
+    if (is_valid_utf8)
+    {
+        // UTF-8 supplied where a single-byte encoding was wanted.
+        if (utf8_to_pdf_doc(supplied, converted))
+        {
+            candidates.push_back(converted);
+        }
+        if (utf8_to_win_ansi(supplied, converted))
+        {
+            candidates.push_back(converted);
+        }
+        if (utf8_to_mac_roman(supplied, converted))
+        {
+            candidates.push_back(converted);
+        }
+    }
+    else
+    {
+        // A single-byte encoding supplied where UTF-8 was wanted.
+        std::string const as_pdf_doc(pdf_doc_to_utf8(supplied));
+        candidates.push_back(as_pdf_doc);
+        std::string const as_win_ansi(win_ansi_to_utf8(supplied));
+        candidates.push_back(as_win_ansi);
+        std::string const as_mac_roman(mac_roman_to_utf8(supplied));
+        candidates.push_back(as_mac_roman);
+
+        // One single-byte encoding supplied where a different one
+        // was wanted.
+        if (utf8_to_win_ansi(as_pdf_doc, converted))
+        {
+            candidates.push_back(converted);
+        }
+        if (utf8_to_mac_roman(as_pdf_doc, converted))
+        {
+            candidates.push_back(converted);
+        }
+        if (utf8_to_pdf_doc(as_win_ansi, converted))
+        {
+            candidates.push_back(converted);
+        }
+        if (utf8_to_mac_roman(as_win_ansi, converted))
+        {
+            candidates.push_back(converted);
+        }
+        if (utf8_to_pdf_doc(as_mac_roman, converted))
+        {
+            candidates.push_back(converted);
+        }
+        if (utf8_to_win_ansi(as_mac_roman, converted))
+        {
+            candidates.push_back(converted);
+        }
+    }
+
+    // Drop repeats, keeping the first occurrence of each string so
+    // the original order is preserved.
+    std::vector<std::string> unique;
+    std::set<std::string> seen;
+    for (size_t i = 0; i < candidates.size(); ++i)
+    {
+        std::string const& candidate = candidates[i];
+        if (seen.insert(candidate).second)
+        {
+            unique.push_back(candidate);
+        }
+    }
+    return unique;
+}