summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jay Berkenbilt <ejb@ql.org>, 2023-12-21 23:14:28 +0100
committer: Jay Berkenbilt <ejb@ql.org>, 2023-12-21 23:43:29 +0100
commit: 4400ce84eeb204cdcb35950dd8fde094fc249051 (patch)
tree: 2a60ae462f22d21b35214a3fdaa4af5d63f8b149
parent: bb12a7ff8df1582a2cb0583bc463a84f5a736219 (diff)
download: qpdf-4400ce84eeb204cdcb35950dd8fde094fc249051.tar.zst
Add "n:/pdf-name" to qpdf JSON for binary names (fixes #1072)
-rw-r--r-- ChangeLog | 7
-rw-r--r-- libqpdf/QPDF_Name.cc | 10
-rw-r--r-- libqpdf/QPDF_json.cc | 8
-rw-r--r-- manual/json.rst | 9
-rw-r--r-- manual/release-notes.rst | 7
-rw-r--r-- qpdf/qtest/qpdf-json.test | 17
-rw-r--r-- qpdf/qtest/qpdf/weird-tokens.json | 83
-rw-r--r-- qpdf/qtest/qpdf/weird-tokens.pdf | 95
8 files changed, 234 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 414244be..7450ddbc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
2023-12-21 Jay Berkenbilt <ejb@ql.org>
+ * Fix to QPDF JSON: the syntax "n:/pdf-syntax" is now accepted as
+ an alternative way to represent names. This can be used for any
+ name (e.g. "n:/text#2fplain"), but it is necessary when the name
+ contains binary characters. For example, /one#a0two must be
+ represented as "n:/one#a0two" since the single byte a0 is not
+ valid in JSON. Fixes #1072.
+
* From M. Holger: Refactor QPDFParser for performance. See #1059
for a discussion.
diff --git a/libqpdf/QPDF_Name.cc b/libqpdf/QPDF_Name.cc
index 4597372e..5fde9c65 100644
--- a/libqpdf/QPDF_Name.cc
+++ b/libqpdf/QPDF_Name.cc
@@ -57,6 +57,14 @@ QPDF_Name::getJSON(int json_version)
if (json_version == 1) {
return JSON::makeString(normalizeName(this->name));
} else {
- return JSON::makeString(this->name);
+ bool has_8bit_chars;
+ bool is_valid_utf8;
+ bool is_utf16;
+ QUtil::analyze_encoding(this->name, has_8bit_chars, is_valid_utf8, is_utf16);
+ if (!has_8bit_chars || is_valid_utf8) {
+ return JSON::makeString(this->name);
+ } else {
+ return JSON::makeString("n:" + normalizeName(this->name));
+ }
}
}
diff --git a/libqpdf/QPDF_json.cc b/libqpdf/QPDF_json.cc
index f8fd689a..864e1a56 100644
--- a/libqpdf/QPDF_json.cc
+++ b/libqpdf/QPDF_json.cc
@@ -144,6 +144,12 @@ is_name(std::string const& v)
return ((v.length() > 1) && (v.at(0) == '/'));
}
+static bool
+is_pdf_name(std::string const& v)
+{
+ return ((v.length() > 3) && (v.substr(0, 3) == "n:/"));
+}
+
bool
QPDF::test_json_validators()
{
@@ -740,6 +746,8 @@ QPDF::JSONReactor::makeObject(JSON const& value)
result = QPDFObjectHandle::newString(QUtil::hex_decode(str));
} else if (is_name(str_v)) {
result = QPDFObjectHandle::newName(str_v);
+ } else if (is_pdf_name(str_v)) {
+ result = QPDFObjectHandle::parse(str_v.substr(2));
} else {
QTC::TC("qpdf", "QPDF_json unrecognized string value");
error(value.getStart(), "unrecognized string value");
diff --git a/manual/json.rst b/manual/json.rst
index e848cc65..e07dde3b 100644
--- a/manual/json.rst
+++ b/manual/json.rst
@@ -258,6 +258,12 @@ Object Values
syntax resolved. For example, the name whose canonical form (per
the PDF specification) is ``text/plain`` would be represented in
JSON as ``"/text/plain"`` and in PDF as ``"/text#2fplain"``.
+ Starting with qpdf 11.7.0, the syntax ``"n:/pdf-syntax"`` is
+ accepted as an alternative. This can be used for any name (e.g.
+ ``"n:/text#2fplain"``), but it is necessary when the name contains
+ binary characters. For example, ``/one#a0two`` must be represented
+ as ``"n:/one#a0two"`` since the single byte ``a0`` is not valid in
+ JSON.
- Indirect object references are represented as JSON strings that
look like a PDF indirect object reference and have the form
@@ -824,7 +830,8 @@ version 2.
- Names are shown in qpdf's canonical form rather than in PDF
syntax. (Example: the PDF-syntax name ``/text#2fplain`` appeared
as ``"/text#2fplain"`` in v1 but appears as ``"/text/plain"`` in
- v2.
+ v2. In qpdf 11.7.0, a fix was made to accept ``"n:/pdf-syntax"``
+ for names containing binary characters.
- The top-level representation of an object in ``"objects"`` is a
dictionary containing either a ``"value"`` key or a ``"stream"``
diff --git a/manual/release-notes.rst b/manual/release-notes.rst
index 5e41fef7..f720f99e 100644
--- a/manual/release-notes.rst
+++ b/manual/release-notes.rst
@@ -45,6 +45,13 @@ Planned changes for future 12.x (subject to change):
reference streams, linearization hint streams, and object
streams. This has been fixed.
+ - Fix to QPDF JSON: the syntax ``"n:/pdf-syntax"`` is now accepted
+ as an alternative way to represent names. This can be used for
+ any name (e.g. ``"n:/text#2fplain"``), but it is necessary when
+ the name contains binary characters. For example, ``/one#a0two``
+ must be represented as ``"n:/one#a0two"`` since the single byte
+ ``a0`` is not valid in JSON.
+
- Build Enhancements:
- The qpdf test suite now passes when qpdf is linked with an
diff --git a/qpdf/qtest/qpdf-json.test b/qpdf/qtest/qpdf-json.test
index 961b507a..9691d995 100644
--- a/qpdf/qtest/qpdf-json.test
+++ b/qpdf/qtest/qpdf-json.test
@@ -61,6 +61,7 @@ my @goodfiles = (
'form-fields-and-annotations.pdf',
'need-appearances.pdf',
'fxo-blue.pdf',
+ 'weird-tokens.pdf',
);
$n_tests += 6 * scalar(@goodfiles);
@@ -341,5 +342,21 @@ $td->runtest("check C API write to JSON stream",
{$td->FILE => "auto-4"},
{$td->FILE => "qpdf-ctest-47-4"});
+# Bugs #1072 and #1079 illustrate cases that qpdf-json got wrong. In
+# #1072, it was noticed that name tokens containing binary characters
+# (using #xx) would generate invalid JSON, even though qpdf's own JSON
+# parser would accept it. Also, the JSON spec allows real numbers in
+# scientific notation, but the PDF spec does not.
+$n_tests += 2;
+$td->runtest("handle binary names",
+ {$td->COMMAND =>
+ "qpdf --json-output weird-tokens.pdf a.json"},
+ {$td->STRING => "", $td->EXIT_STATUS => 0});
+# Round-trip is tested above.
+$td->runtest("check json",
+ {$td->FILE => "a.json"},
+ {$td->FILE => "weird-tokens.json"},
+ $td->NORMALIZE_NEWLINES);
+
cleanup();
$td->report($n_tests);
diff --git a/qpdf/qtest/qpdf/weird-tokens.json b/qpdf/qtest/qpdf/weird-tokens.json
new file mode 100644
index 00000000..66f0ff06
--- /dev/null
+++ b/qpdf/qtest/qpdf/weird-tokens.json
@@ -0,0 +1,83 @@
+{
+ "qpdf": [
+ {
+ "jsonversion": 2,
+ "pdfversion": "2.0",
+ "pushedinheritedpageresources": false,
+ "calledgetallpages": false,
+ "maxobjectid": 6
+ },
+ {
+ "obj:1 0 R": {
+ "value": {
+ "/Extra": [
+ "u:Names with binary data",
+ "n:/ABCDEF+#ba#da#cc#e5",
+ "/ABCEDEF+π",
+ "n:/one+#a0two",
+ "/text/plain",
+ "u:Very small/large reals",
+ 0.00001,
+ 1000000000000
+ ],
+ "/Pages": "2 0 R",
+ "/Type": "/Catalog"
+ }
+ },
+ "obj:2 0 R": {
+ "value": {
+ "/Count": 1,
+ "/Kids": [
+ "3 0 R"
+ ],
+ "/Type": "/Pages"
+ }
+ },
+ "obj:3 0 R": {
+ "value": {
+ "/Contents": "4 0 R",
+ "/MediaBox": [
+ 0,
+ 0,
+ 612,
+ 792
+ ],
+ "/Parent": "2 0 R",
+ "/Resources": {
+ "/Font": {
+ "/F1": "6 0 R"
+ }
+ },
+ "/Type": "/Page"
+ }
+ },
+ "obj:4 0 R": {
+ "stream": {
+ "data": "QlQKICAvRjEgMjQgVGYKICA3MiA3MjAgVGQKICAoUG90YXRvKSBUagpFVAo=",
+ "dict": {}
+ }
+ },
+ "obj:5 0 R": {
+ "value": 44
+ },
+ "obj:6 0 R": {
+ "value": {
+ "/BaseFont": "/Helvetica",
+ "/Encoding": "/WinAnsiEncoding",
+ "/Subtype": "/Type1",
+ "/Type": "/Font"
+ }
+ },
+ "trailer": {
+ "value": {
+ "/ID": [
+ "b:42841c13bbf709d79a200fa1691836f8",
+ "b:728c020f464c3cf7e02c12605fa7d88b"
+ ],
+ "/Root": "1 0 R",
+ "/Size": 7
+ }
+ }
+ }
+ ]
+}
diff --git a/qpdf/qtest/qpdf/weird-tokens.pdf b/qpdf/qtest/qpdf/weird-tokens.pdf
new file mode 100644
index 00000000..7c645df3
--- /dev/null
+++ b/qpdf/qtest/qpdf/weird-tokens.pdf
@@ -0,0 +1,95 @@
+%PDF-2.0
+%¿÷¢þ
+%QDF-1.0
+
+1 0 obj
+<<
+ /Extra [
+ (Names with binary data)
+ /ABCDEF+#ba#da#cc#e5
+ /ABCEDEF+#cf#80
+ /one+#a0two
+ /text#2fplain
+ (Very small/large reals)
+ 0.00001
+ 1000000000000
+ ]
+ /Pages 2 0 R
+ /Type /Catalog
+>>
+endobj
+
+2 0 obj
+<<
+ /Count 1
+ /Kids [
+ 3 0 R
+ ]
+ /Type /Pages
+>>
+endobj
+
+%% Page 1
+3 0 obj
+<<
+ /Contents 4 0 R
+ /MediaBox [
+ 0
+ 0
+ 612
+ 792
+ ]
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 6 0 R
+ >>
+ >>
+ /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+4 0 obj
+<<
+ /Length 5 0 R
+>>
+stream
+BT
+ /F1 24 Tf
+ 72 720 Td
+ (Potato) Tj
+ET
+endstream
+endobj
+
+5 0 obj
+44
+endobj
+
+6 0 obj
+<<
+ /BaseFont /Helvetica
+ /Encoding /WinAnsiEncoding
+ /Subtype /Type1
+ /Type /Font
+>>
+endobj
+
+xref
+0 7
+0000000000 65535 f
+0000000025 00000 n
+0000000261 00000 n
+0000000343 00000 n
+0000000539 00000 n
+0000000638 00000 n
+0000000657 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 7
+ /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>]
+>>
+startxref
+763
+%%EOF