aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 4
-rw-r--r-- include/qpdf/QUtil.hh 10
-rw-r--r-- libqpdf/QPDF_String.cc 57
-rw-r--r-- libqpdf/QUtil.cc 61
4 files changed, 74 insertions, 58 deletions
diff --git a/ChangeLog b/ChangeLog
index b4b10f81..a6d7bcb2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2019-01-05 Jay Berkenbilt <ejb@ql.org>
+
+ * Add method QUtil::utf8_to_utf16.
+
2019-01-04 Jay Berkenbilt <ejb@ql.org>
* Add new option --optimize-images, which recompresses every image
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index 5532149c..c7473bf3 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -152,8 +152,14 @@ namespace QUtil
QPDF_DLL
std::string toUTF16(unsigned long uval);
- // Convert a UTF-8 encoded string to ASCII by replacing all
- // characters outside of ascii with the given unknown_char.
+ // Convert a UTF-8 encoded string to UTF-16. Malformed UTF-8 and
+ // unrepresentable code points are converted to U+FFFD.
+ QPDF_DLL
+ std::string utf8_to_utf16(std::string const& utf8);
+
+ // Convert a UTF-8 encoded string to the specified single-byte
+ // encoding system by replacing all unsupported characters with
+ // the given unknown_char.
QPDF_DLL
std::string utf8_to_ascii(
std::string const& utf8, char unknown_char = '?');
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index 633f1699..7cfb6bcc 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -64,65 +64,10 @@ QPDF_String::~QPDF_String()
{
}
-enum encoding_e { e_utf16 };
-
-static
-std::string
-transcode_utf8(std::string const& utf8_val, encoding_e encoding)
-{
- std::string result = "\xfe\xff";
- size_t len = utf8_val.length();
- for (size_t i = 0; i < len; ++i)
- {
- unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
- if (ch < 128)
- {
- result += QUtil::toUTF16(ch);
- }
- else
- {
- size_t bytes_needed = 0;
- unsigned bit_check = 0x40;
- unsigned char to_clear = 0x80;
- while (ch & bit_check)
- {
- ++bytes_needed;
- to_clear |= bit_check;
- bit_check >>= 1;
- }
-
- if (((bytes_needed > 5) || (bytes_needed < 1)) ||
- ((i + bytes_needed) >= len))
- {
- result += "\xff\xfd";
- }
- else
- {
- unsigned long codepoint = (ch & ~to_clear);
- while (bytes_needed > 0)
- {
- --bytes_needed;
- ch = utf8_val.at(++i);
- if ((ch & 0xc0) != 0x80)
- {
- --i;
- codepoint = 0xfffd;
- break;
- }
- codepoint <<= 6;
- codepoint += (ch & 0x3f);
- }
- result += QUtil::toUTF16(codepoint);
- }
- }
- }
- return result;
-}
-
QPDF_String*
QPDF_String::new_utf16(std::string const& utf8_val)
{
- return new QPDF_String(transcode_utf8(utf8_val, e_utf16));
+ return new QPDF_String(QUtil::utf8_to_utf16(utf8_val)); // UTF-8 -> UTF-16 transcoding is delegated to QUtil
}
std::string
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 7c2d9bc9..ba4aea2c 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max)
return result;
}
+enum encoding_e { e_utf16 }; // target encodings for transcode_utf8; UTF-16 is the only one defined here
+
+static
+std::string
+transcode_utf8(std::string const& utf8_val, encoding_e encoding) // decodes UTF-8 and re-encodes via QUtil::toUTF16; 'encoding' is not read in the body
+{
+ std::string result = "\xfe\xff"; // output always starts with bytes FE FF (the UTF-16 big-endian byte order mark)
+ size_t len = utf8_val.length();
+ for (size_t i = 0; i < len; ++i)
+ {
+ unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
+ if (ch < 128) // 7-bit character: encoded directly
+ {
+ result += QUtil::toUTF16(ch);
+ }
+ else
+ {
+ size_t bytes_needed = 0; // continuation bytes implied by the lead byte
+ unsigned bit_check = 0x40;
+ unsigned char to_clear = 0x80; // accumulates the length-marker bits to strip from the lead byte
+ while (ch & bit_check) // count consecutive 1 bits below the top bit
+ {
+ ++bytes_needed;
+ to_clear |= bit_check;
+ bit_check >>= 1;
+ }
+
+ if (((bytes_needed > 5) || (bytes_needed < 1)) || // < 1: stray continuation byte (10xxxxxx); > 5: lead byte 0xFE/0xFF
+ ((i + bytes_needed) >= len)) // sequence would run past the end of the input
+ {
+ result += "\xff\xfd"; // bytes FF FD: U+FFFD (replacement character) in big-endian order
+ }
+ else
+ {
+ unsigned long codepoint = (ch & ~to_clear); // payload bits of the lead byte
+ while (bytes_needed > 0)
+ {
+ --bytes_needed;
+ ch = utf8_val.at(++i);
+ if ((ch & 0xc0) != 0x80) // not a continuation byte (10xxxxxx)
+ {
+ --i; // step back so the outer loop's ++i revisits this byte as a new character
+ codepoint = 0xfffd; // substitute the replacement character for the broken sequence
+ break;
+ }
+ codepoint <<= 6; // make room for the next 6 payload bits
+ codepoint += (ch & 0x3f);
+ }
+ result += QUtil::toUTF16(codepoint);
+ }
+ }
+ }
+ return result;
+}
+
+std::string
+QUtil::utf8_to_utf16(std::string const& utf8)
+{
+ return transcode_utf8(utf8, e_utf16); // public entry point; forwards to the file-local transcoder
+}
+
std::string
QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
{