about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 5
-rw-r--r-- TODO 8
-rw-r--r-- libqpdf/QPDF_String.cc 9
-rw-r--r-- manual/release-notes.rst 3
-rw-r--r-- qpdf/qtest/qpdf/numeric-and-string-3.out 3
-rw-r--r-- qpdf/qtest/qpdf/numeric-and-string-3.pdf 21
6 files changed, 38 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog
index 0622b834..da642862 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2022-02-22 Jay Berkenbilt <ejb@ql.org>
+
+ * Recognize PDF strings explicitly marked as UTF-8 as allowed by
+ the PDF 2.0 spec. Fixes #654.
+
2022-02-18 Jay Berkenbilt <ejb@ql.org>
* Bug fix: when generating appearance streams, the font size was
diff --git a/TODO b/TODO
index c1b1b440..6de30079 100644
--- a/TODO
+++ b/TODO
@@ -10,6 +10,14 @@ Priorities for 11:
* PointerHolder -> shared_ptr
* ABI
+Misc
+* Get rid of "ugly switch statements" in QUtil.cc -- replace with
+ static map initializers. (Search for "ugly switch statements" below
+ as well.)
+* Consider exposing get_next_utf8_codepoint in QUtil
+* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
+ does to detect UTF-8 encoded strings per PDF 2.0 spec.
+
Soon: Break ground on "Document-level work"
Code Formatting
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index f0153b1c..931ccd61 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -183,6 +183,15 @@ QPDF_String::getUTF8Val() const
{
return QUtil::utf16_to_utf8(this->val);
}
+ else if ((val.length() >= 3) &&
+ (val[0] == '\xEF') &&
+ (val[1] == '\xBB') &&
+ (val[2] == '\xBF'))
+ {
+ // PDF 2.0 allows UTF-8 strings when explicitly prefixed with
+ // the above bytes, which is just UTF-8 encoding of U+FEFF.
+ return this->val.substr(3);
+ }
else
{
return QUtil::pdf_doc_to_utf8(this->val);
diff --git a/manual/release-notes.rst b/manual/release-notes.rst
index daec3b25..d7959b96 100644
--- a/manual/release-notes.rst
+++ b/manual/release-notes.rst
@@ -9,6 +9,9 @@ For a detailed list of changes, please see the file
10.6.3: XXX
- Bug fixes:
+ - Recognize strings explicitly encoded as UTF-8 as allowed by the
+ PDF 2.0 spec.
+
- Fix edge cases with appearance stream generation for form fields
whose ``/DA`` field lacks proper font size specification or that
specifies auto sizing. At this time, qpdf does not support auto
diff --git a/qpdf/qtest/qpdf/numeric-and-string-3.out b/qpdf/qtest/qpdf/numeric-and-string-3.out
index 390e9a61..0774b228 100644
--- a/qpdf/qtest/qpdf/numeric-and-string-3.out
+++ b/qpdf/qtest/qpdf/numeric-and-string-3.out
@@ -7,8 +7,9 @@ end page 1
QStrings:
No Special Characters
These: ¿÷¢þ and no more
+Explicit utf-8 with π
πωτατω
-treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅮
+treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅯
QNumbers:
1.000
3.142
diff --git a/qpdf/qtest/qpdf/numeric-and-string-3.pdf b/qpdf/qtest/qpdf/numeric-and-string-3.pdf
index 4225d239..b6073704 100644
--- a/qpdf/qtest/qpdf/numeric-and-string-3.pdf
+++ b/qpdf/qtest/qpdf/numeric-and-string-3.pdf
@@ -12,8 +12,9 @@
/QStrings [
(No Special Characters)
(These: and no more)
+ (\357\273\277Explicit utf-8 with \317\200)
<feff03c003c903c403b103c403c9>
- <feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd60>
+ <feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd61>
]
/Type /Catalog
>>
@@ -110,19 +111,19 @@ xref
0 10
0000000000 65535 f
0000000025 00000 n
-0000000377 00000 n
-0000000459 00000 n
-0000000694 00000 n
-0000000793 00000 n
-0000000835 00000 n
-0000000933 00000 n
-0000000952 00000 n
-0000001070 00000 n
+0000000424 00000 n
+0000000506 00000 n
+0000000741 00000 n
+0000000840 00000 n
+0000000882 00000 n
+0000000980 00000 n
+0000000999 00000 n
+0000001117 00000 n
trailer <<
/Root 1 0 R
/Size 10
/ID [<e017d8dc1fe53a81e40aa79bcb43fdec><76269ee0b6579446b731e060af8ef436>]
>>
startxref
-1105
+1152
%%EOF