From 3ef1b77304ec49ec2527d8cc3e17e1d0dd220720 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 5 Jan 2019 13:04:05 -0500 Subject: Refactor QUtil::utf8_to_ascii --- libqpdf/QUtil.cc | 59 ++++++++++++++++++++++++------------------ libtests/qtest/qutil/qutil.out | 6 ++--- libtests/qutil.cc | 2 +- 3 files changed, 38 insertions(+), 29 deletions(-) diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index ba4aea2c..04b9b190 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -893,20 +893,32 @@ QUtil::parse_numrange(char const* range, int max) return result; } -enum encoding_e { e_utf16 }; +enum encoding_e { e_utf16, e_ascii }; static std::string -transcode_utf8(std::string const& utf8_val, encoding_e encoding) +transcode_utf8(std::string const& utf8_val, encoding_e encoding, + char unknown) { - std::string result = "\xfe\xff"; + std::string result; + if (encoding == e_utf16) + { + result += "\xfe\xff"; + } size_t len = utf8_val.length(); for (size_t i = 0; i < len; ++i) { unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); if (ch < 128) { - result += QUtil::toUTF16(ch); + if (encoding == e_utf16) + { + result += QUtil::toUTF16(ch); + } + else + { + result.append(1, ch); + } } else { @@ -923,7 +935,14 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) if (((bytes_needed > 5) || (bytes_needed < 1)) || ((i + bytes_needed) >= len)) { - result += "\xff\xfd"; + if (encoding == e_utf16) + { + result += "\xff\xfd"; + } + else + { + result.append(1, unknown); + } } else { @@ -941,7 +960,14 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) codepoint <<= 6; codepoint += (ch & 0x3f); } - result += QUtil::toUTF16(codepoint); + if (encoding == e_utf16) + { + result += QUtil::toUTF16(codepoint); + } + else + { + result.append(1, unknown); + } } } } @@ -951,28 +977,11 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) std::string QUtil::utf8_to_utf16(std::string const& utf8) { - return transcode_utf8(utf8, e_utf16); + return
transcode_utf8(utf8, e_utf16, 0); } std::string QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) { - std::string ascii_value; - for (size_t i = 0; i < utf8.length(); ++i) - { - unsigned char ch = static_cast<unsigned char>(utf8.at(i)); - if (ch < 128) - { - ascii_value.append(1, ch); - } - else if ((ch & 0xc0) == 0x80) - { - // Ignore subsequent byte of UTF-8 encoded character - } - else - { - ascii_value.append(1, unknown_char); - } - } - return ascii_value; + return transcode_utf8(utf8, e_ascii, unknown_char); } diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out index f47301e4..ca146c15 100644 --- a/libtests/qtest/qutil/qutil.out +++ b/libtests/qtest/qutil/qutil.out @@ -48,9 +48,9 @@ HAGOOGAMAGOOGLE: 0 0x7fffffff -> ff fd 0x80000000 -> ff fd ---- utf8_to_ascii -Does π have fingers? -Does ? have fingers? -Does * have fingers? +¿Does π have fingers? +?Does ? have fingers? +*Does * have fingers? ---- whoami quack1 quack2 diff --git a/libtests/qutil.cc b/libtests/qutil.cc index de51da58..364eae1c 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -222,7 +222,7 @@ void to_utf16_test() void utf8_to_ascii_test() { - char const* input = "Does \317\200 have fingers?"; + char const* input = "\302\277Does \317\200 have fingers?"; std::cout << input << std::endl << QUtil::utf8_to_ascii(input) -- cgit v1.2.3-70-g09d2