aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 5
-rw-r--r-- include/qpdf/QUtil.hh 16
-rw-r--r-- libqpdf/QUtil.cc 48
-rw-r--r-- libtests/qutil.cc 16
4 files changed, 81 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index 9a75f4ad..7ff658c7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -14,6 +14,11 @@
the first bug in qpdf's history that could result in silent loss
of data when processing a correct input file. Fixes #276.
+2019-01-14 Jay Berkenbilt <ejb@ql.org>
+
+ * Add versions of utf8 to single-byte character transcoders that
+ return a success code.
+
2019-01-13 Jay Berkenbilt <ejb@ql.org>
* Add several more string transcoding and analysis methods to
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index d9b0783e..5fe8e97c 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -178,6 +178,22 @@ namespace QUtil
std::string utf8_to_pdf_doc(
std::string const& utf8, char unknown_char = '?');
+ // These versions store the result in the output string and return true if
+ // the conversion was successful and false if a UTF-8 decoding error occurred
+ // or any unrepresentable characters were replaced by the unknown character.
+ QPDF_DLL
+ bool utf8_to_ascii(
+ std::string const& utf8, std::string& ascii, char unknown_char = '?');
+ QPDF_DLL
+ bool utf8_to_win_ansi(
+ std::string const& utf8, std::string& win, char unknown_char = '?');
+ QPDF_DLL
+ bool utf8_to_mac_roman(
+ std::string const& utf8, std::string& mac, char unknown_char = '?');
+ QPDF_DLL
+ bool utf8_to_pdf_doc(
+ std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
+
// Convert a UTF-16 big-endian encoded string to UTF-8.
// Unrepresentable code points are converted to U+FFFD.
QPDF_DLL
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 19b6fdab..e645c4fc 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1705,11 +1705,12 @@ unsigned long get_next_utf8_codepoint(
return codepoint;
}
-static std::string
-transcode_utf8(std::string const& utf8_val, encoding_e encoding,
- char unknown)
+static bool
+transcode_utf8(std::string const& utf8_val, std::string& result,
+ encoding_e encoding, char unknown)
{
- std::string result;
+ bool okay = true;
+ result.clear();
if (encoding == e_utf16)
{
result += "\xfe\xff";
@@ -1721,6 +1722,7 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
if (error)
{
+ okay = false;
if (encoding == e_utf16)
{
result += "\xff\xfd";
@@ -1768,11 +1770,21 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
}
if (ch == '\0')
{
+ okay = false;
ch = static_cast<unsigned char>(unknown);
}
result.append(1, ch);
}
}
+ return okay;
+}
+
+static std::string
+transcode_utf8(std::string const& utf8_val, encoding_e encoding,
+ char unknown)
+{
+ std::string result;
+ transcode_utf8(utf8_val, result, encoding, unknown);
return result;
}
@@ -1807,6 +1819,34 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
}
bool
+QUtil::utf8_to_ascii(std::string const& utf8, std::string& ascii,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, ascii, e_ascii, unknown_char);
+}
+
+bool
+QUtil::utf8_to_win_ansi(std::string const& utf8, std::string& win,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, win, e_winansi, unknown_char);
+}
+
+bool
+QUtil::utf8_to_mac_roman(std::string const& utf8, std::string& mac,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, mac, e_macroman, unknown_char);
+}
+
+bool
+QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc,
+ char unknown_char)
+{
+ return transcode_utf8(utf8, pdfdoc, e_pdfdoc, unknown_char);
+}
+
+bool
QUtil::is_utf16(std::string const& val)
{
return ((val.length() >= 2) &&
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 91a656be..35877b9c 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -292,6 +292,22 @@ void transcoding_test()
check_analyze("pi != 22/7", false, false, false);
check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
std::cout << "analysis done" << std::endl;
+ std::string input1("a\302\277b");
+ std::string input2("a\317\200b");
+ std::string input3("ab");
+ std::string output;
+ assert(! QUtil::utf8_to_ascii(input1, output));
+ assert(! QUtil::utf8_to_ascii(input2, output));
+ assert(QUtil::utf8_to_ascii(input3, output));
+ assert(QUtil::utf8_to_win_ansi(input1, output));
+ assert(! QUtil::utf8_to_win_ansi(input2, output));
+ assert(QUtil::utf8_to_win_ansi(input3, output));
+ assert(QUtil::utf8_to_mac_roman(input1, output));
+ assert(! QUtil::utf8_to_mac_roman(input2, output));
+ assert(QUtil::utf8_to_mac_roman(input3, output));
+ assert(QUtil::utf8_to_pdf_doc(input1, output));
+ assert(! QUtil::utf8_to_pdf_doc(input2, output));
+ assert(QUtil::utf8_to_pdf_doc(input3, output));
}
void print_whoami(char const* str)