aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf/QUtil.cc
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2022-02-15 14:29:29 +0100
committer Jay Berkenbilt <ejb@ql.org> 2022-02-15 14:32:38 +0100
commit 1065bbb0165b4608bd715866332751be9213cd51 (patch)
tree 1d6a5687ec36503843abd1629e7b42e376708042 /libqpdf/QUtil.cc
parent 2b8d0f385b56d2a7307679ace4c50adbdbbddd03 (diff)
download qpdf-1065bbb0165b4608bd715866332751be9213cd51.tar.zst
Handle odd PDFDoc codepoints in UTF-8 during transcoding (fixes #650)
There are code points in PDFDoc that are not valid UTF-8 but map to valid UTF-8. We were handling those correctly with bidirectional mapping. However, if those same code points (0x18 through 0x1f, and 0x7f) appeared in UTF-8 input, where they have no meaning, they were left as fixed points when converting to PDFDoc, where they do have meaning. This change recognizes them as errors.
Diffstat (limited to 'libqpdf/QUtil.cc')
-rw-r--r-- libqpdf/QUtil.cc 17
1 files changed, 17 insertions, 0 deletions
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index c4aa3afb..f01746b6 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
{
result += QUtil::toUTF16(QIntC::to_ulong(ch));
}
+ else if ((encoding == e_pdfdoc) &&
+ (((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127)))
+ {
+ // PDFDocEncoding maps some low characters to Unicode,
+ // so if we encounter those invalid UTF-8 code points,
+ // map them to unknown so reversing the mapping
+ // doesn't change them into other characters.
+ okay = false;
+ result.append(1, unknown);
+ }
else
{
result.append(1, ch);
@@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
{
result += QUtil::toUTF16(codepoint);
}
+ else if ((codepoint == 0xad) && (encoding == e_pdfdoc))
+ {
+ // PDFDocEncoding omits 0x00ad (soft hyphen), but rather
+ // than treating it as undefined, map it to a regular
+ // hyphen.
+ result.append(1, '-');
+ }
else if ((codepoint > 160) && (codepoint < 256) &&
((encoding == e_winansi) || (encoding == e_pdfdoc)))
{