diff options
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | libqpdf/QUtil.cc | 32 | ||||
-rw-r--r-- | libtests/qutil.cc | 15 | ||||
-rw-r--r-- | qpdf/qtest/qpdf/unicode.in | 5 | ||||
-rw-r--r-- | qpdf/qtest/qpdf/unicode.out | 5 |
5 files changed, 61 insertions, 2 deletions
@@ -1,3 +1,9 @@ +2022-09-26 Jay Berkenbilt <ejb@ql.org> + + * Bug fix: avoid using PDF Doc encoding for strings whose PDF Doc + encoding representation starts with UTF-16 or UTF-8 markers. Fixes + #778. + 2022-09-14 Jay Berkenbilt <ejb@ql.org> * 11.1.0: release diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index bcf4aa4e..7f23bd03 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1565,10 +1565,38 @@ transcode_utf8( { bool okay = true; result.clear(); - if (encoding == e_utf16) { + size_t len = utf8_val.length(); + switch (encoding) { + case e_utf16: result += "\xfe\xff"; + break; + case e_pdfdoc: + // We need to avoid having the result start with something + // that will be interpreted as UTF-16 or UTF-8, meaning we + // can't end up with a string that starts with "fe ff", + // (UTF-16-BE) "ff fe" (UTF-16-LE, not officially part of the + // PDF spec, but recognized by most readers including qpdf), + // or "ef bb bf" (UTF-8). It's more efficient to check the + // input string to see if it will map to one of those + // sequences than to check the output string since all cases + // start with the same starting character. + if ((len >= 4) && (utf8_val[0] == '\xc3')) { + static std::string fe_ff("\xbe\xc3\xbf"); + static std::string ff_fe("\xbf\xc3\xbe"); + static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf"); + // C++-20 has starts_with, but when this was written, qpdf + // had a minimum supported version of C++-17. 
+ if ((utf8_val.compare(1, 3, fe_ff) == 0) || + (utf8_val.compare(1, 3, ff_fe) == 0) || + (utf8_val.compare(1, 5, ef_bb_bf) == 0)) { + result += unknown; + okay = false; + } + } + break; + default: + break; } - size_t len = utf8_val.length(); size_t pos = 0; while (pos < len) { bool error = false; diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 82c2dd1a..972046b9 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -436,6 +436,21 @@ transcoding_test() assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); std::cout << other_to_utf8 << std::endl; std::cout << "done other characters" << std::endl; + // These valid UTF8 strings when converted to PDFDoc would end up + // with a byte sequence that would be recognized as UTF-8 or + // UTF-16 rather than PDFDoc. A special case is required to store + // them as UTF-16 rather than PDFDoc. + static std::string fe_ff("\xc3\xbe\xc3\xbf potato"); + static std::string ff_fe("\xc3\xbf\xc3\xbe potato"); + static std::string ef_bb_bf("\xc3\xaf\xc2\xbb\xc2\xbf potato"); + assert(!QUtil::utf8_to_pdf_doc(fe_ff, pdfdoc)); + assert(pdfdoc == "?\xfe\xff potato"); + assert(!QUtil::utf8_to_pdf_doc(ff_fe, pdfdoc)); + assert(pdfdoc == "?\xff\xfe potato"); + assert(!QUtil::utf8_to_pdf_doc(ef_bb_bf, pdfdoc)); + assert(pdfdoc == "?\xef\xbb\xbf potato"); + assert(QUtil::utf8_to_pdf_doc("\xc3\xbe\xc3\xbe", pdfdoc)); + assert(QUtil::utf8_to_pdf_doc("\xc3\xaf\xc2\xbb\xc2\xbe", pdfdoc)); } void diff --git a/qpdf/qtest/qpdf/unicode.in b/qpdf/qtest/qpdf/unicode.in index 2984b5f3..1ddf1178 100644 --- a/qpdf/qtest/qpdf/unicode.in +++ b/qpdf/qtest/qpdf/unicode.in @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. 𝄞 𝄢 𝄪 𝅂 This can be encoded in ASCII. This can be encoded in PDFDocEncoding (€). 
+þÿ -- PDFDoc would look like UTF-16-BE +ÿþ -- PDFDoc would look like UTF-16-LE +ï»¿ -- PDFDoc would look like UTF-8 +ï»» -- PDFDoc okay +þþ -- PDFDoc okay diff --git a/qpdf/qtest/qpdf/unicode.out b/qpdf/qtest/qpdf/unicode.out index c1901585..4f8ee322 100644 --- a/qpdf/qtest/qpdf/unicode.out +++ b/qpdf/qtest/qpdf/unicode.out @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e> This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e> +þÿ -- PDFDoc would look like UTF-16-BE // <feff00fe00ff0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d00420045> +ÿþ -- PDFDoc would look like UTF-16-LE // <feff00ff00fe0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d004c0045> +ï»¿ -- PDFDoc would look like UTF-8 // <feff00ef00bb00bf0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d0038> +ï»» -- PDFDoc okay // <efbbbb202d2d20504446446f63206f6b6179> +þþ -- PDFDoc okay // <fefe202d2d20504446446f63206f6b6179> |