about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- .dir-locals.el | 2
-rw-r--r-- ChangeLog | 11
-rw-r--r-- TODO | 3
-rw-r--r-- include/qpdf/QUtil.hh | 21
-rw-r--r-- libqpdf/QPDF_String.cc | 6
-rw-r--r-- libqpdf/QUtil.cc | 33
-rw-r--r-- libtests/qutil.cc | 27
-rw-r--r-- qpdf/qtest/qpdf/unicode-errors.out | 2
8 files changed, 85 insertions, 20 deletions
diff --git a/.dir-locals.el b/.dir-locals.el
index 18e38e8d..052a2d96 100644
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -1,4 +1,4 @@
-((nil . ((indent-tabs-mode . t)
+((nil . ((indent-tabs-mode . nil)
(qpdf-cc-style
.
("qpdf"
diff --git a/ChangeLog b/ChangeLog
index a2e19b9d..aa8842ce 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2022-04-23 Jay Berkenbilt <ejb@ql.org>
+
+ * Add new method QUtil::is_explicit_utf8 that tests whether a
+ string is explicitly marked as being UTF-8 encoded, as allowed by
+ the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
+ 0xBF, which is the UTF-8 encoding of U+FEFF.
+
+ * Add new method QUtil::get_next_utf8_codepoint as a low-level
+ helper for iterating through the UTF-8 characters in a byte
+ string.
+
2022-04-16 Jay Berkenbilt <ejb@ql.org>
* Breaking CLI change: the default value for --json is now
diff --git a/TODO b/TODO
index 1cbf977f..00b2e3c7 100644
--- a/TODO
+++ b/TODO
@@ -11,9 +11,6 @@ In order:
Other (do in any order):
Misc
-* Consider exposing get_next_utf8_codepoint in QUtil
-* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
- does to detect UTF-8 encoded strings per PDF 2.0 spec.
* Add an option --ignore-encryption to ignore encryption information
and treat encrypted files as if they weren't encrypted. This should
make it possible to solve #598 (--show-encryption without a
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index 2d0b7b56..8b2b5ff8 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -268,14 +268,33 @@ namespace QUtil
QPDF_DLL
std::string toUTF16(unsigned long uval);
+ // If utf8_val.at(pos) points to the beginning of a valid
+ // UTF-8-encoded character, return the codepoint of the character
+ // and set error to false. Otherwise, return 0xfffd and set error
+ // to true. In all cases, pos is advanced to the next position
+ // that may begin a valid character. When the string has been
+ // consumed, pos will be set to the string length. It is an error
+ // to pass a value of pos that is greater than or equal to the
+ // length of the string.
+ QPDF_DLL
+ unsigned long get_next_utf8_codepoint(
+ std::string const& utf8_val, size_t& pos, bool& error);
+
// Test whether this is a UTF-16 string. This is indicated by
// first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
- // (little-endian). Starting in qpdf 10.6.2, this detects
+ // (little-endian), each of which is the encoding of U+FEFF, the
+ // Unicode byte order mark. Starting in qpdf 10.6.2, this detects
// little-endian as well as big-endian. Even though the PDF spec
// doesn't allow little-endian, most readers seem to accept it.
QPDF_DLL
bool is_utf16(std::string const&);
+ // Test whether this is an explicit UTF-8 string as allowed by the
+ // PDF 2.0 spec. This is indicated by first three bytes being 0xEF
+ // 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
+ QPDF_DLL
+ bool is_explicit_utf8(std::string const&);
+
// Convert a UTF-8 encoded string to UTF-16 big-endian.
// Unrepresentable code points are converted to U+FFFD.
QPDF_DLL
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index 89ddc498..30d6708b 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
{
if (QUtil::is_utf16(this->val)) {
return QUtil::utf16_to_utf8(this->val);
- } else if (
- (val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&
- (val.at(2) == '\xBF')) {
+ } else if (QUtil::is_explicit_utf8(this->val)) {
// PDF 2.0 allows UTF-8 strings when explicitly prefixed with
- // the above bytes, which is just UTF-8 encoding of U+FEFF.
+ // the three-byte representation of U+FEFF.
return this->val.substr(3);
} else {
return QUtil::pdf_doc_to_utf8(this->val);
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index a9e77777..5fa6c4b9 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
}
unsigned long
-get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
+QUtil::get_next_utf8_codepoint(
+ std::string const& utf8_val, size_t& pos, bool& error)
{
size_t len = utf8_val.length();
- unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
+ unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
error = false;
if (ch < 128) {
return static_cast<unsigned long>(ch);
@@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
bit_check >>= 1;
}
if (((bytes_needed > 5) || (bytes_needed < 1)) ||
- ((pos + bytes_needed) >= len)) {
+ ((pos + bytes_needed) > len)) {
error = true;
return 0xfffd;
}
@@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
while (bytes_needed > 0) {
--bytes_needed;
- ch = static_cast<unsigned char>(utf8_val.at(++pos));
+ ch = static_cast<unsigned char>(utf8_val.at(pos++));
if ((ch & 0xc0) != 0x80) {
--pos;
- codepoint = 0xfffd;
- break;
+ error = true;
+ return 0xfffd;
}
codepoint <<= 6;
codepoint += (ch & 0x3f);
@@ -1580,9 +1581,11 @@ transcode_utf8(
result += "\xfe\xff";
}
size_t len = utf8_val.length();
- for (size_t i = 0; i < len; ++i) {
+ size_t pos = 0;
+ while (pos < len) {
bool error = false;
- unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
+ unsigned long codepoint =
+ QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
if (error) {
okay = false;
if (encoding == e_utf16) {
@@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
}
+bool
+QUtil::is_explicit_utf8(std::string const& val)
+{
+ // QPDF_String.cc relies on this marker being exactly 3 bytes (substr(3)).
+ return (
+ (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&
+ (val.at(2) == '\xbf'));
+}
+
std::string
QUtil::utf16_to_utf8(std::string const& val)
{
@@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
return;
}
size_t len = val.length();
+ size_t pos = 0;
bool any_errors = false;
- for (size_t i = 0; i < len; ++i) {
+ while (pos < len) {
bool error = false;
- unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
+ unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
if (error) {
any_errors = true;
}
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 324dd84e..eb16bf0b 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -240,6 +240,33 @@ print_utf8(unsigned long val)
}
}
std::cout << std::endl;
+
+ // Boundary conditions for QUtil::get_next_utf8_codepoint, which is
+ // also tested indirectly through test_pdf_unicode.cc.
+ std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
+ size_t pos = 0;
+ bool error = false;
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
+ assert(pos == 2);
+ assert(!error);
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+ assert(pos == 3);
+ assert(error);
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
+ assert(pos == 4);
+ assert(!error);
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+ assert(pos == 6);
+ assert(error);
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
+ assert(pos == 7);
+ assert(!error);
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
+ assert(pos == 8);
+ assert(!error);
+ assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+ assert(pos == 9);
+ assert(error);
}
void
diff --git a/qpdf/qtest/qpdf/unicode-errors.out b/qpdf/qtest/qpdf/unicode-errors.out
index 403bb503..4fd7c276 100644
--- a/qpdf/qtest/qpdf/unicode-errors.out
+++ b/qpdf/qtest/qpdf/unicode-errors.out
@@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
-3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429>
+3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>