aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf/QUtil.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libqpdf/QUtil.cc')
-rw-r--r-- libqpdf/QUtil.cc 267
1 files changed, 258 insertions, 9 deletions
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 8424854b..ac501676 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -8,6 +8,7 @@
#endif
#include <qpdf/SecureRandomDataProvider.hh>
#include <qpdf/QPDFSystemError.hh>
+#include <qpdf/QTC.hh>
#include <cmath>
#include <iomanip>
@@ -29,6 +30,43 @@
#include <sys/stat.h>
#endif
+// Unicode code points for the byte values 0x80 through 0xa0, as used
+// by pdf_doc_to_utf8. Entry i describes byte (i + 0x80), so the first
+// element corresponds to byte 128 and the last to byte 160. Byte 0x9f
+// has no assigned character and maps to U+FFFD. Lookups only read the
+// table, so it is declared const.
+static unsigned short const pdf_doc_to_unicode[] = {
+    0x2022,    // 0x80    BULLET
+    0x2020,    // 0x81    DAGGER
+    0x2021,    // 0x82    DOUBLE DAGGER
+    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
+    0x2014,    // 0x84    EM DASH
+    0x2013,    // 0x85    EN DASH
+    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
+    0x2044,    // 0x87    FRACTION SLASH (solidus)
+    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    0x2212,    // 0x8a    MINUS SIGN
+    0x2030,    // 0x8b    PER MILLE SIGN
+    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
+    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
+    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
+    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
+    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
+    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
+    0x2122,    // 0x92    TRADE MARK SIGN
+    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
+    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
+    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
+    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
+    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
+    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
+    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
+    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
+    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
+    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
+    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
+    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
+    0xfffd,    // 0x9f    UNDEFINED
+    0x20ac,    // 0xa0    EURO SIGN
+};
+
std::string
QUtil::int_to_string(long long num, int length)
{
@@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max)
return result;
}
-enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman };
+// Selects the output encoding produced by transcode_utf8.
+enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
static unsigned char
encode_winansi(unsigned long codepoint)
@@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint)
return ch;
}
+// Translate a Unicode code point into the single byte (0x80-0xa0)
+// that represents it in the table below. Returns '\0' when the code
+// point has no entry; transcode_utf8 treats '\0' as "no mapping" and
+// substitutes its unknown character.
+static unsigned char
+encode_pdfdoc(unsigned long codepoint)
+{
+    // A switch is used on purpose instead of a static lookup
+    // structure, which would not be thread-safe.
+    switch (codepoint)
+    {
+      case 0x2022: return 0x80;
+      case 0x2020: return 0x81;
+      case 0x2021: return 0x82;
+      case 0x2026: return 0x83;
+      case 0x2014: return 0x84;
+      case 0x2013: return 0x85;
+      case 0x0192: return 0x86;
+      case 0x2044: return 0x87;
+      case 0x2039: return 0x88;
+      case 0x203a: return 0x89;
+      case 0x2212: return 0x8a;
+      case 0x2030: return 0x8b;
+      case 0x201e: return 0x8c;
+      case 0x201c: return 0x8d;
+      case 0x201d: return 0x8e;
+      case 0x2018: return 0x8f;
+      case 0x2019: return 0x90;
+      case 0x201a: return 0x91;
+      case 0x2122: return 0x92;
+      case 0xfb01: return 0x93;
+      case 0xfb02: return 0x94;
+      case 0x0141: return 0x95;
+      case 0x0152: return 0x96;
+      case 0x0160: return 0x97;
+      case 0x0178: return 0x98;
+      case 0x017d: return 0x99;
+      case 0x0131: return 0x9a;
+      case 0x0142: return 0x9b;
+      case 0x0153: return 0x9c;
+      case 0x0161: return 0x9d;
+      case 0x017e: return 0x9e;
+      case 0xfffd: return 0x9f;
+      case 0x20ac: return 0xa0;
+      default:
+        break;
+    }
+    // No entry for this code point.
+    return '\0';
+}
+
static std::string
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
char unknown)
@@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
{
result += QUtil::toUTF16(codepoint);
}
+ else if ((codepoint >= 160) && (codepoint < 256) &&
+ ((encoding == e_winansi) || (encoding == e_pdfdoc)))
+ {
+ ch = static_cast<unsigned char>(codepoint & 0xff);
+ result.append(1, ch);
+ }
else
{
ch = '\0';
if (encoding == e_winansi)
{
- if ((codepoint >= 160) && (codepoint < 256))
- {
- ch = static_cast<unsigned char>(codepoint & 0xff);
- }
- else
- {
- ch = encode_winansi(codepoint);
- }
+ ch = encode_winansi(codepoint);
}
else if (encoding == e_macroman)
{
ch = encode_macroman(codepoint);
}
+ else if (encoding == e_pdfdoc)
+ {
+ ch = encode_pdfdoc(codepoint);
+ }
if (ch == '\0')
{
ch = static_cast<unsigned char>(unknown);
@@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
{
return transcode_utf8(utf8, e_macroman, unknown_char);
}
+
+// Transcode a UTF-8 string using the e_pdfdoc target of
+// transcode_utf8. unknown_char is passed through as the substitute
+// for code points that transcode_utf8 cannot map.
+std::string
+QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
+{
+    encoding_e const target = e_pdfdoc;
+    return transcode_utf8(utf8, target, unknown_char);
+}
+
+// Report whether val starts with the two bytes 0xfe 0xff. Strings
+// shorter than two bytes never match.
+bool
+QUtil::is_utf16(std::string const& val)
+{
+    if (val.length() < 2)
+    {
+        return false;
+    }
+    bool const first_is_fe = (val.at(0) == '\xfe');
+    bool const second_is_ff = (val.at(1) == '\xff');
+    return first_is_fe && second_is_ff;
+}
+
+// Convert a big-endian UTF-16 byte string to UTF-8. If val starts
+// with the marker recognized by is_utf16 (0xfe 0xff), those two bytes
+// are skipped. Input is consumed two bytes at a time; a trailing odd
+// byte is ignored. Malformed surrogate sequences are not reported
+// (see the comment inside the loop).
+std::string
+QUtil::utf16_to_utf8(std::string const& val)
+{
+    std::string result;
+    // This code uses unsigned long and unsigned short to hold
+    // codepoint values. It requires unsigned long to be at least
+    // 32 bits and unsigned short to be at least 16 bits, but it
+    // will work fine if they are larger.
+    unsigned long codepoint = 0L;
+    size_t const len = val.length();
+    size_t const start = (is_utf16(val) ? 2 : 0);
+    // Iterate only while a complete two-byte unit remains. Testing
+    // (i + 1 < len) rather than (i < len) is what makes a trailing
+    // odd byte get ignored instead of val.at(i + 1) throwing
+    // std::out_of_range. The index is size_t so that it cannot wrap
+    // before reaching len.
+    for (size_t i = start; i + 1 < len; i += 2)
+    {
+        // Convert from UTF16-BE.  If we get a malformed
+        // codepoint, this code will generate incorrect output
+        // without giving a warning.  Specifically, a high
+        // codepoint not followed by a low codepoint will be
+        // discarded, and a low codepoint not preceded by a high
+        // codepoint will just get its low 10 bits output.
+        unsigned short bits =
+            (static_cast<unsigned char>(val.at(i)) << 8) +
+            static_cast<unsigned char>(val.at(i + 1));
+        if ((bits & 0xFC00) == 0xD800)
+        {
+            // High surrogate: remember its contribution and wait
+            // for the low surrogate in the next unit.
+            codepoint = 0x10000 + ((bits & 0x3FF) << 10);
+            continue;
+        }
+        else if ((bits & 0xFC00) == 0xDC00)
+        {
+            // Low surrogate: combine with any pending high half.
+            if (codepoint != 0)
+            {
+                QTC::TC("qpdf", "QUtil non-trivial UTF-16");
+            }
+            codepoint += bits & 0x3FF;
+        }
+        else
+        {
+            codepoint = bits;
+        }
+
+        result += QUtil::toUTF8(codepoint);
+        codepoint = 0;
+    }
+    return result;
+}
+
+// Placeholder: no conversion is performed yet. The argument is
+// ignored and a fixed marker string is returned.
+// TODO: implement the actual conversion.
+std::string
+QUtil::win_ansi_to_utf8(std::string const& val)
+{
+    return std::string("QXXXQ");
+}
+
+// Placeholder: no conversion is performed yet. The argument is
+// ignored and a fixed marker string is returned.
+// TODO: implement the actual conversion.
+std::string
+QUtil::mac_roman_to_utf8(std::string const& val)
+{
+    return std::string("QXXXQ");
+}
+
+// Convert a string of single-byte characters to UTF-8. Bytes in the
+// range 128-160 are translated through pdf_doc_to_unicode; every
+// other byte is used directly as its own code point. Each resulting
+// code point is appended in UTF-8 form via QUtil::toUTF8.
+std::string
+QUtil::pdf_doc_to_utf8(std::string const& val)
+{
+    std::string result;
+    size_t const len = val.length();
+    // size_t index: matches the type of len, so the comparison cannot
+    // wrap for very long strings.
+    for (size_t i = 0; i < len; ++i)
+    {
+        unsigned char const ch = static_cast<unsigned char>(val.at(i));
+        // Named codepoint (not val) so the parameter is not shadowed.
+        unsigned short codepoint = ch;
+        if ((ch >= 128) && (ch <= 160))
+        {
+            codepoint = pdf_doc_to_unicode[ch - 128];
+        }
+        result += QUtil::toUTF8(codepoint);
+    }
+    return result;
+}