aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog7
-rw-r--r--include/qpdf/QUtil.hh14
-rw-r--r--libqpdf/QUtil.cc180
-rw-r--r--libtests/qtest/qutil/qutil.out1
-rw-r--r--libtests/qutil.cc19
5 files changed, 155 insertions, 66 deletions
diff --git a/ChangeLog b/ChangeLog
index e1087f20..9a75f4ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -14,6 +14,13 @@
the first bug in qpdf's history that could result in silent loss
of data when processing a correct input file. Fixes #276.
+2019-01-13 Jay Berkenbilt <ejb@ql.org>
+
+ * Add several more string transcoding and analysis methods to
+ QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac
+ Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and
+ UTF-16.
+
2019-01-12 Jay Berkenbilt <ejb@ql.org>
* In the --pages option, allow the same page to be specified more
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index ea3f5da8..d9b0783e 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -193,6 +193,20 @@ namespace QUtil
QPDF_DLL
std::string pdf_doc_to_utf8(std::string const& pdfdoc);
+ // Analyze a string for encoding. We can't tell the difference
+ // between any single-byte encodings, and we can't tell for sure
+ // whether a string that happens to be valid UTF-8 isn't a
+ // different encoding, but we can at least tell a few things to
+ // help us guess. If there are no characters with the high bit
+ // set, has_8bit_chars is false, and the other values are also
+ // false, even though ASCII strings are valid UTF-8. is_valid_utf8
+ // means that the string is non-trivially valid UTF-8.
+ QPDF_DLL
+ void analyze_encoding(std::string const& str,
+ bool& has_8bit_chars,
+ bool& is_valid_utf8,
+ bool& is_utf16);
+
// If secure random number generation is supported on your
// platform and qpdf was not compiled with insecure random number
// generation, this returns a cryptographically secure random
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 9dbce98e..19b6fdab 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint)
return ch;
}
+// Decode the UTF-8 sequence whose first byte is utf8_val[pos].
+//
+// On return, pos indexes the last byte consumed for this sequence, so
+// a caller that advances with ++pos lands on the first byte of the
+// next sequence. For a single-byte (ASCII) character pos is unchanged.
+//
+// error is set to true, and 0xfffd (the Unicode replacement
+// character) is returned, when:
+//  * the first byte announces no continuation bytes (it is itself a
+//    stray continuation byte) or more than five of them,
+//  * the announced sequence would run past the end of the string, or
+//  * a byte inside the sequence is not of the form 10xxxxxx. In this
+//    case pos is left on the last byte that was accepted so the
+//    offending byte is examined again as the start of a new sequence.
+// Otherwise error is false and the decoded code point is returned.
+//
+// utf8_val.at() throws std::out_of_range if pos is not a valid index
+// on entry.
+unsigned long get_next_utf8_codepoint(
+    std::string const& utf8_val, size_t& pos, bool& error)
+{
+    size_t len = utf8_val.length();
+    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
+    error = false;
+    if (ch < 128)
+    {
+        return static_cast<unsigned long>(ch);
+    }
+
+    // Each one bit that follows the top bit of the first byte
+    // announces one continuation byte. Collect those marker bits in
+    // to_clear so that they can be masked out of the payload below.
+    size_t bytes_needed = 0;
+    unsigned bit_check = 0x40;
+    unsigned char to_clear = 0x80;
+    while (ch & bit_check)
+    {
+        ++bytes_needed;
+        to_clear |= bit_check;
+        bit_check >>= 1;
+    }
+    if (((bytes_needed > 5) || (bytes_needed < 1)) ||
+        ((pos + bytes_needed) >= len))
+    {
+        error = true;
+        return 0xfffd;
+    }
+
+    unsigned long codepoint = (ch & ~to_clear);
+    while (bytes_needed > 0)
+    {
+        --bytes_needed;
+        ch = static_cast<unsigned char>(utf8_val.at(++pos));
+        if ((ch & 0xc0) != 0x80)
+        {
+            // Not a continuation byte: the sequence is malformed.
+            // Report it as an error rather than silently returning
+            // the replacement character, and step back so the caller
+            // re-reads this byte as the start of the next sequence.
+            --pos;
+            error = true;
+            return 0xfffd;
+        }
+        codepoint <<= 6;
+        codepoint += (ch & 0x3f);
+    }
+    return codepoint;
+}
+
static std::string
transcode_utf8(std::string const& utf8_val, encoding_e encoding,
char unknown)
@@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
size_t len = utf8_val.length();
for (size_t i = 0; i < len; ++i)
{
- unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
- if (ch < 128)
+ bool error = false;
+ unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
+ if (error)
+ {
+ if (encoding == e_utf16)
+ {
+ result += "\xff\xfd";
+ }
+ else
+ {
+ result.append(1, unknown);
+ }
+ }
+ else if (codepoint < 128)
{
+ char ch = static_cast<char>(codepoint);
if (encoding == e_utf16)
{
result += QUtil::toUTF16(ch);
@@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
result.append(1, ch);
}
}
+ else if (encoding == e_utf16)
+ {
+ result += QUtil::toUTF16(codepoint);
+ }
+ else if ((codepoint > 160) && (codepoint < 256) &&
+ ((encoding == e_winansi) || (encoding == e_pdfdoc)))
+ {
+ result.append(1, static_cast<unsigned char>(codepoint & 0xff));
+ }
else
{
- size_t bytes_needed = 0;
- unsigned bit_check = 0x40;
- unsigned char to_clear = 0x80;
- while (ch & bit_check)
+ unsigned char ch = '\0';
+ if (encoding == e_winansi)
{
- ++bytes_needed;
- to_clear |= bit_check;
- bit_check >>= 1;
+ ch = encode_winansi(codepoint);
}
-
- if (((bytes_needed > 5) || (bytes_needed < 1)) ||
- ((i + bytes_needed) >= len))
+ else if (encoding == e_macroman)
{
- if (encoding == e_utf16)
- {
- result += "\xff\xfd";
- }
- else
- {
- result.append(1, unknown);
- }
+ ch = encode_macroman(codepoint);
}
- else
+ else if (encoding == e_pdfdoc)
{
- unsigned long codepoint = (ch & ~to_clear);
- while (bytes_needed > 0)
- {
- --bytes_needed;
- ch = utf8_val.at(++i);
- if ((ch & 0xc0) != 0x80)
- {
- --i;
- codepoint = 0xfffd;
- break;
- }
- codepoint <<= 6;
- codepoint += (ch & 0x3f);
- }
- if (encoding == e_utf16)
- {
- result += QUtil::toUTF16(codepoint);
- }
- else if ((codepoint > 160) && (codepoint < 256) &&
- ((encoding == e_winansi) || (encoding == e_pdfdoc)))
- {
- ch = static_cast<unsigned char>(codepoint & 0xff);
- result.append(1, ch);
- }
- else
- {
- ch = '\0';
- if (encoding == e_winansi)
- {
- ch = encode_winansi(codepoint);
- }
- else if (encoding == e_macroman)
- {
- ch = encode_macroman(codepoint);
- }
- else if (encoding == e_pdfdoc)
- {
- ch = encode_pdfdoc(codepoint);
- }
- if (ch == '\0')
- {
- ch = static_cast<unsigned char>(unknown);
- }
- result.append(1, ch);
- }
+ ch = encode_pdfdoc(codepoint);
}
+ if (ch == '\0')
+ {
+ ch = static_cast<unsigned char>(unknown);
+ }
+ result.append(1, ch);
}
}
return result;
@@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
}
return result;
}
+
+// Report what can be determined about the encoding of val. All three
+// outputs start out false. If QUtil::is_utf16 accepts val, both
+// has_8bit_chars and is_utf16 are set and nothing else is examined.
+// Otherwise val is walked with get_next_utf8_codepoint:
+// has_8bit_chars is set when any returned code point is 128 or
+// above, and is_valid_utf8 is set only when that happened and no
+// call reported an error.
+void
+QUtil::analyze_encoding(std::string const& val,
+                        bool& has_8bit_chars,
+                        bool& is_valid_utf8,
+                        bool& is_utf16)
+{
+    is_valid_utf8 = false;
+    is_utf16 = false;
+    has_8bit_chars = false;
+
+    if (QUtil::is_utf16(val))
+    {
+        has_8bit_chars = true;
+        is_utf16 = true;
+        return;
+    }
+
+    bool saw_decode_error = false;
+    size_t const n_bytes = val.length();
+    size_t idx = 0;
+    while (idx < n_bytes)
+    {
+        bool decode_failed = false;
+        // idx is moved to the last byte of the sequence by the helper.
+        unsigned long const cp =
+            get_next_utf8_codepoint(val, idx, decode_failed);
+        if (decode_failed)
+        {
+            saw_decode_error = true;
+        }
+        if (cp >= 128)
+        {
+            has_8bit_chars = true;
+        }
+        ++idx;
+    }
+
+    if ((! saw_decode_error) && has_8bit_chars)
+    {
+        is_valid_utf8 = true;
+    }
+}
diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out
index 50ec26f9..c0789a36 100644
--- a/libtests/qtest/qutil/qutil.out
+++ b/libtests/qtest/qutil/qutil.out
@@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0
bidirectional pdf doc done
bidirectional win ansi done
bidirectional mac roman done
+analysis done
---- whoami
quack1
quack2
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 355bb9a2..91a656be 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&),
}
}
+// Run QUtil::analyze_encoding on str and print a failure line when
+// any of the three reported flags differs from the expected values
+// has8bit, utf8, and utf16. Prints nothing when they all match.
+void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
+{
+    bool got_8bit = false;
+    bool got_utf8 = false;
+    bool got_utf16 = false;
+    QUtil::analyze_encoding(str, got_8bit, got_utf8, got_utf16);
+
+    bool const mismatch =
+        (got_8bit != has8bit) ||
+        (got_utf8 != utf8) ||
+        (got_utf16 != utf16);
+    if (mismatch)
+    {
+        std::cout << "analysis failed: " << str << std::endl;
+    }
+}
+
void transcoding_test()
{
transcoding_test(&QUtil::pdf_doc_to_utf8,
@@ -273,6 +287,11 @@ void transcoding_test()
transcoding_test(&QUtil::mac_roman_to_utf8,
&QUtil::utf8_to_mac_roman, 255, "?");
std::cout << "bidirectional mac roman done" << std::endl;
+ check_analyze("pi = \317\200", true, true, false);
+ check_analyze("pi != \317", true, false, false);
+ check_analyze("pi != 22/7", false, false, false);
+ check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
+ std::cout << "analysis done" << std::endl;
}
void print_whoami(char const* str)