From 698485468a8b7d0f38d817d6055898932f46cc26 Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt <ejb@ql.org>
Date: Sun, 13 Jan 2019 08:00:14 -0500
Subject: Move remaining existing transcoding to QUtil

---
 libqpdf/QPDF_String.cc |  93 +----------------
 libqpdf/QUtil.cc       | 267 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 261 insertions(+), 99 deletions(-)

(limited to 'libqpdf')
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index 7cfb6bcc..bf1141d1 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -8,43 +8,6 @@
 // be used.
 #include <string.h>
 
-// First element is 128
-static unsigned short pdf_doc_to_unicode[] = {
-    0x2022,    // 0x80    BULLET
-    0x2020,    // 0x81    DAGGER
-    0x2021,    // 0x82    DOUBLE DAGGER
-    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
-    0x2014,    // 0x84    EM DASH
-    0x2013,    // 0x85    EN DASH
-    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
-    0x2044,    // 0x87    FRACTION SLASH (solidus)
-    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-    0x2212,    // 0x8a    MINUS SIGN
-    0x2030,    // 0x8b    PER MILLE SIGN
-    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
-    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
-    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
-    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
-    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
-    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
-    0x2122,    // 0x92    TRADE MARK SIGN
-    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
-    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
-    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
-    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
-    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
-    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
-    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
-    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
-    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
-    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
-    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
-    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
-    0xfffd,    // 0x9f    UNDEFINED
-    0x20ac,    // 0xa0    EURO SIGN
-};
-
 // See above about ctype.
 static bool is_ascii_printable(unsigned char ch)
 {
@@ -210,62 +173,12 @@ QPDF_String::getVal() const
 std::string
 QPDF_String::getUTF8Val() const
 {
-    std::string result;
-    size_t len = this->val.length();
-    if ((len >= 2) && (len % 2 == 0) &&
-	(this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))
+    if (QUtil::is_utf16(this->val))
     {
-	// This is a Unicode string using big-endian UTF-16.  This
-	// code uses unsigned long and unsigned short to hold
-	// codepoint values.  It requires unsigned long to be at least
-	// 32 bits and unsigned short to be at least 16 bits, but it
-	// will work fine if they are larger.
-	unsigned long codepoint = 0L;
-	for (unsigned int i = 2; i < len; i += 2)
-	{
-	    // Convert from UTF16-BE.  If we get a malformed
-	    // codepoint, this code will generate incorrect output
-	    // without giving a warning.  Specifically, a high
-	    // codepoint not followed by a low codepoint will be
-	    // discarded, and a low codepoint not preceded by a high
-	    // codepoint will just get its low 10 bits output.
-	    unsigned short bits =
-		(static_cast<unsigned char>(this->val.at(i)) << 8) +
-		static_cast<unsigned char>(this->val.at(i+1));
-	    if ((bits & 0xFC00) == 0xD800)
-	    {
-		codepoint = 0x10000 + ((bits & 0x3FF) << 10);
-		continue;
-	    }
-	    else if ((bits & 0xFC00) == 0xDC00)
-	    {
-		if (codepoint != 0)
-		{
-		    QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
-		}
-		codepoint += bits & 0x3FF;
-	    }
-	    else
-	    {
-		codepoint = bits;
-	    }
-
-	    result += QUtil::toUTF8(codepoint);
-	    codepoint = 0;
-	}
+        return QUtil::utf16_to_utf8(this->val);
     }
     else
     {
-	for (unsigned int i = 0; i < len; ++i)
-	{
-            unsigned char ch = static_cast<unsigned char>(this->val.at(i));
-            unsigned short val = ch;
-            if ((ch >= 128) && (ch <= 160))
-            {
-                val = pdf_doc_to_unicode[ch - 128];
-            }
-	    result += QUtil::toUTF8(val);
-	}
+        return QUtil::pdf_doc_to_utf8(this->val);
     }
-    return result;
 }
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 8424854b..ac501676 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -8,6 +8,7 @@
 #endif
 #include <qpdf/SecureRandomDataProvider.hh>
 #include <qpdf/QPDFSystemError.hh>
+#include <qpdf/QTC.hh>
 
 #include <cmath>
 #include <iomanip>
@@ -29,6 +30,43 @@
 #include <sys/stat.h>
 #endif
 
+// Unicode value for each byte 0x80..0xa0; index 0 corresponds to byte 128.
+static unsigned short pdf_doc_to_unicode[] = {
+    0x2022,    // 0x80    BULLET
+    0x2020,    // 0x81    DAGGER
+    0x2021,    // 0x82    DOUBLE DAGGER
+    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
+    0x2014,    // 0x84    EM DASH
+    0x2013,    // 0x85    EN DASH
+    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
+    0x2044,    // 0x87    FRACTION SLASH (solidus)
+    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    0x2212,    // 0x8a    MINUS SIGN
+    0x2030,    // 0x8b    PER MILLE SIGN
+    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
+    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
+    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
+    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
+    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
+    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
+    0x2122,    // 0x92    TRADE MARK SIGN
+    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
+    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
+    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
+    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
+    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
+    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
+    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
+    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
+    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
+    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
+    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
+    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
+    0xfffd,    // 0x9f    UNDEFINED
+    0x20ac,    // 0xa0    EURO SIGN
+};
+
 std::string
 QUtil::int_to_string(long long num, int length)
 {
@@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max)
     return result;
 }
 
-enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman };
+enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
 
 static unsigned char
 encode_winansi(unsigned long codepoint)
@@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint)
     return ch;
 }
 
+static unsigned char
+encode_pdfdoc(unsigned long codepoint)
+{
+    // Return the byte (0x80..0xa0) for a listed codepoint, or '\0' if it is
+    // not listed. The switch is ugly but avoids a non-thread-safe static.
+    unsigned char ch = '\0';
+    switch (codepoint)
+    {
+      case 0x2022:
+        ch = 0x80;
+        break;
+      case 0x2020:
+        ch = 0x81;
+        break;
+      case 0x2021:
+        ch = 0x82;
+        break;
+      case 0x2026:
+        ch = 0x83;
+        break;
+      case 0x2014:
+        ch = 0x84;
+        break;
+      case 0x2013:
+        ch = 0x85;
+        break;
+      case 0x0192:
+        ch = 0x86;
+        break;
+      case 0x2044:
+        ch = 0x87;
+        break;
+      case 0x2039:
+        ch = 0x88;
+        break;
+      case 0x203a:
+        ch = 0x89;
+        break;
+      case 0x2212:
+        ch = 0x8a;
+        break;
+      case 0x2030:
+        ch = 0x8b;
+        break;
+      case 0x201e:
+        ch = 0x8c;
+        break;
+      case 0x201c:
+        ch = 0x8d;
+        break;
+      case 0x201d:
+        ch = 0x8e;
+        break;
+      case 0x2018:
+        ch = 0x8f;
+        break;
+      case 0x2019:
+        ch = 0x90;
+        break;
+      case 0x201a:
+        ch = 0x91;
+        break;
+      case 0x2122:
+        ch = 0x92;
+        break;
+      case 0xfb01:
+        ch = 0x93;
+        break;
+      case 0xfb02:
+        ch = 0x94;
+        break;
+      case 0x0141:
+        ch = 0x95;
+        break;
+      case 0x0152:
+        ch = 0x96;
+        break;
+      case 0x0160:
+        ch = 0x97;
+        break;
+      case 0x0178:
+        ch = 0x98;
+        break;
+      case 0x017d:
+        ch = 0x99;
+        break;
+      case 0x0131:
+        ch = 0x9a;
+        break;
+      case 0x0142:
+        ch = 0x9b;
+        break;
+      case 0x0153:
+        ch = 0x9c;
+        break;
+      case 0x0161:
+        ch = 0x9d;
+        break;
+      case 0x017e:
+        ch = 0x9e;
+        break;
+      case 0xfffd:
+        ch = 0x9f;
+        break;
+      case 0x20ac:
+        ch = 0xa0;
+        break;
+      default:
+        break;
+    }
+    return ch;
+}
+
 static std::string
 transcode_utf8(std::string const& utf8_val, encoding_e encoding,
                char unknown)
@@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
                 {
                     result += QUtil::toUTF16(codepoint);
                 }
+                else if ((codepoint >= 160) && (codepoint < 256) &&
+                         ((encoding == e_winansi) || (encoding == e_pdfdoc)))
+                {
+                    ch = static_cast<unsigned char>(codepoint & 0xff);
+                    result.append(1, ch);
+                }
                 else
                 {
                     ch = '\0';
                     if (encoding == e_winansi)
                     {
-                        if ((codepoint >= 160) && (codepoint < 256))
-                        {
-                            ch = static_cast<unsigned char>(codepoint & 0xff);
-                        }
-                        else
-                        {
-                            ch = encode_winansi(codepoint);
-                        }
+                        ch = encode_winansi(codepoint);
                     }
                     else if (encoding == e_macroman)
                     {
                         ch = encode_macroman(codepoint);
                     }
+                    else if (encoding == e_pdfdoc)
+                    {
+                        ch = encode_pdfdoc(codepoint);
+                    }
                     if (ch == '\0')
                     {
                         ch = static_cast<unsigned char>(unknown);
@@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
 {
     return transcode_utf8(utf8, e_macroman, unknown_char);
 }
+
+std::string
+QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
+{
+    return transcode_utf8(utf8, e_pdfdoc, unknown_char); // unencodable => unknown_char
+}
+
+bool
+QUtil::is_utf16(std::string const& val)
+{
+    // True when val begins with the two bytes FE FF.
+    return (val.compare(0, 2, "\xfe\xff") == 0);
+}
+
+std::string
+QUtil::utf16_to_utf8(std::string const& val)
+{
+    std::string result;
+    // Decode big-endian UTF-16 into UTF-8, skipping a leading FE FF
+    // marker if present. unsigned long must be at least 32 bits and
+    // unsigned short at least 16 bits to hold the values used here;
+    // wider types work too.
+    unsigned long codepoint = 0L;
+    size_t len = val.length();
+    size_t start = 0;
+    if (is_utf16(val))
+    {
+        start += 2;
+    }
+    // Stop when fewer than two bytes remain so a trailing odd byte is
+    // ignored instead of making val.at(i+1) throw std::out_of_range.
+    for (size_t i = start; i + 1 < len; i += 2)
+    {
+        // Malformed input produces incorrect output without any
+        // warning: a high surrogate that is not followed by a low
+        // surrogate is discarded, and a low surrogate that is not
+        // preceded by a high surrogate contributes only its low 10
+        // bits. The high surrogate's bits are held in codepoint
+        // until the next unit is read.
+        unsigned short bits =
+            (static_cast<unsigned char>(val.at(i)) << 8) +
+            static_cast<unsigned char>(val.at(i+1));
+        if ((bits & 0xFC00) == 0xD800)
+        {
+            codepoint = 0x10000 + ((bits & 0x3FF) << 10);
+            continue;
+        }
+        else if ((bits & 0xFC00) == 0xDC00)
+        {
+            if (codepoint != 0)
+            {
+                QTC::TC("qpdf", "QUtil non-trivial UTF-16");
+            }
+            codepoint += bits & 0x3FF;
+        }
+        else
+        {
+            codepoint = bits;
+        }
+
+        result += QUtil::toUTF8(codepoint);
+        codepoint = 0;
+    }
+    return result;
+}
+
+std::string
+QUtil::win_ansi_to_utf8(std::string const& val)
+{
+    return "QXXXQ"; // NOTE(review): val is ignored; looks like a stub -- confirm
+}
+
+std::string
+QUtil::mac_roman_to_utf8(std::string const& val)
+{
+    return "QXXXQ"; // NOTE(review): val is ignored; looks like a stub -- confirm
+}
+
+std::string
+QUtil::pdf_doc_to_utf8(std::string const& val)
+{
+    // Bytes 128..160 map through pdf_doc_to_unicode; others map to themselves.
+    std::string result;
+    for (size_t i = 0; i < val.length(); ++i)
+    {
+        unsigned char ch = static_cast<unsigned char>(val.at(i));
+        unsigned short codepoint = ch;
+        if ((ch >= 128) && (ch <= 160))
+        {
+            codepoint = pdf_doc_to_unicode[ch - 128];
+        }
+        result += QUtil::toUTF8(codepoint);
+    }
+    return result;
+}
-- 
cgit v1.2.3-54-g00ecf