aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2022-05-04 13:32:30 +0200
committerJay Berkenbilt <ejb@ql.org>2022-05-04 14:32:44 +0200
commit8b25de24c9b1e6acba042ea9ecdee783839e20a6 (patch)
tree5e6b24aab3297cc13aeb4802ab2bc77ad9306288
parent6b576797cd2d54c8825e1ebf845ab2618ab4c3fd (diff)
downloadqpdf-8b25de24c9b1e6acba042ea9ecdee783839e20a6.tar.zst
Make "objects" and "pages" consistent in JSON output
-rw-r--r--ChangeLog11
-rw-r--r--cSpell.json1
-rw-r--r--libqpdf/QPDFJob.cc21
-rw-r--r--manual/json.rst10
-rw-r--r--manual/release-notes.rst7
-rw-r--r--qpdf/qtest/qpdf.test22
-rw-r--r--qpdf/qtest/qpdf/direct-pages-json-objects.out (renamed from qpdf/qtest/qpdf/direct-pages-json.out)57
-rw-r--r--qpdf/qtest/qpdf/direct-pages-json-pages.out157
-rw-r--r--qpdf/qtest/qpdf/page_api_2-json-objects.out160
-rw-r--r--qpdf/qtest/qpdf/page_api_2-json-pages.out (renamed from qpdf/qtest/qpdf/page_api_2-json.out)64
10 files changed, 401 insertions, 109 deletions
diff --git a/ChangeLog b/ChangeLog
index dfcadf49..be196fff 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2022-05-04 Jay Berkenbilt <ejb@ql.org>
+
+ * json v1 output: make "pages" and "objects" consistent.
+ Previously, "objects" always reflected the objects exactly as they
+ appeared in the original file, while "pages" reflected objects
+ after repair of the pages tree. This could be misleading. Now, if
+ "pages" is specified, "objects" shows the effects of repairing the
+ page tree, and if not, it doesn't. This makes no difference for
+ correct PDF files that don't have problems in the pages tree. JSON
+ v2 will behave in a similar way.
+
2022-05-03 Jay Berkenbilt <ejb@ql.org>
* Add new Pipeline class Pl_String which appends to a std::string&
diff --git a/cSpell.json b/cSpell.json
index f757e511..2a5a4db4 100644
--- a/cSpell.json
+++ b/cSpell.json
@@ -511,6 +511,7 @@
"unfilterable",
"unparse",
"unpickling",
+ "unrepaired",
"unretrieved",
"unversioned",
"upages",
diff --git a/libqpdf/QPDFJob.cc b/libqpdf/QPDFJob.cc
index 0c9b1583..ca56b8d5 100644
--- a/libqpdf/QPDFJob.cc
+++ b/libqpdf/QPDFJob.cc
@@ -1618,15 +1618,7 @@ QPDFJob::doJSON(QPDF& pdf)
bool all_keys = m->json_keys.empty();
// The list of selectable top-level keys is duplicated in the
// following places: job.yml, QPDFJob::json_schema, and
- // QPDFJob::doJSON. We do objects and objectinfo first so they
- // reflect the original file without any side effects caused by
- // other operations, such as repairing the pages tree.
- if (all_keys || m->json_keys.count("objects")) {
- doJSONObjects(pdf, j);
- }
- if (all_keys || m->json_keys.count("objectinfo")) {
- doJSONObjectinfo(pdf, j);
- }
+ // QPDFJob::doJSON.
if (all_keys || m->json_keys.count("pages")) {
doJSONPages(pdf, j);
}
@@ -1646,6 +1638,17 @@ QPDFJob::doJSON(QPDF& pdf)
doJSONAttachments(pdf, j);
}
+ // We do objects and objectinfo last so their information is
+ // consistent with repairing the page tree. To see the original
+ // file with any page tree problems and the page tree not
+ // flattened, select objects/objectinfo without other keys.
+ if (all_keys || m->json_keys.count("objects")) {
+ doJSONObjects(pdf, j);
+ }
+ if (all_keys || m->json_keys.count("objectinfo")) {
+ doJSONObjectinfo(pdf, j);
+ }
+
// Check against schema
JSON schema = json_schema(&m->json_keys);
diff --git a/manual/json.rst b/manual/json.rst
index 358cac90..ef6bed96 100644
--- a/manual/json.rst
+++ b/manual/json.rst
@@ -147,6 +147,16 @@ For the most part, the built-in JSON help tells you everything you need
to know about the JSON format, but there are a few non-obvious things to
be aware of:
+- If a PDF file has certain types of errors in its pages tree (such as
+ page objects that are direct or multiple pages sharing the same
+ object ID), qpdf will automatically repair the pages tree. If you
+ specify ``"objects"`` and/or ``"objectinfo"`` without any other
+ keys, you will see the original pages tree without any corrections.
+ If you specify any of the keys that require page tree traversal (for
+ example, ``"pages"``, ``"outlines"``, or ``"pagelabels"``), then
+ ``"objects"`` and ``"objectinfo"`` will show the repaired page tree
+ so that object references will be consistent throughout the file.
+
- While qpdf guarantees that keys present in the help will be present
in the output, those fields may be null or empty if the information
is not known or absent in the file. Also, if you specify
diff --git a/manual/release-notes.rst b/manual/release-notes.rst
index 08e2fd52..f313cd82 100644
--- a/manual/release-notes.rst
+++ b/manual/release-notes.rst
@@ -125,6 +125,13 @@ For a detailed list of changes, please see the file
- Other changes
+ - In JSON v1 mode, the ``"objects"`` key now reflects the repaired
+ pages tree if ``"pages"`` (or any other key that has the side
+ effect of repairing the page tree) is specified. To see the
+ original objects with any unrepaired page tree errors, specify
+ ``"objects"`` and/or ``"objectinfo"`` by themselves. This is
+ consistent with how JSON v2 behaves.
+
- A new chapter on contributing to qpdf has been added to the
documentation. See :ref:`contributing`.
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index d8359f75..3b26c9c8 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -2829,7 +2829,7 @@ $td->runtest("check output",
show_ntests();
# ----------
$td->notify("--- Page Tree Issues ---");
-$n_tests += 9;
+$n_tests += 11;
$td->runtest("linearize duplicated pages",
{$td->COMMAND =>
@@ -2864,14 +2864,22 @@ $td->runtest("show direct pages",
$td->NORMALIZE_NEWLINES);
# Json mode for direct and duplicated pages illustrates that the
-# "objects" section still shows the original objects before correction
-# but the "pages" section shows the pages with their new object
-# numbers.
+# "objects" section shows the original objects before correction when
+# "pages" is not output but shows them after correction when it is.
foreach my $f (qw(page_api_2 direct-pages))
{
- $td->runtest("json for $f",
- {$td->COMMAND => "qpdf --json=latest $f.pdf"},
- {$td->FILE => "$f-json.out", $td->EXIT_STATUS => 0},
+ $td->runtest("json for $f (objects only)",
+ {$td->COMMAND =>
+ "qpdf --json=latest $f.pdf" .
+ " --json-key=objects --json-key=objectinfo"},
+ {$td->FILE => "$f-json-objects.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+ $td->runtest("json for $f (with pages)",
+ {$td->COMMAND =>
+ "qpdf --json=latest $f.pdf" .
+ " --json-key=objects --json-key=objectinfo" .
+ " --json-key=pages"},
+ {$td->FILE => "$f-json-pages.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
}
diff --git a/qpdf/qtest/qpdf/direct-pages-json.out b/qpdf/qtest/qpdf/direct-pages-json-objects.out
index 52e5e2dd..1c1c50a3 100644
--- a/qpdf/qtest/qpdf/direct-pages-json.out
+++ b/qpdf/qtest/qpdf/direct-pages-json-objects.out
@@ -1,37 +1,4 @@
{
- "acroform": {
- "fields": [],
- "hasacroform": false,
- "needappearances": false
- },
- "attachments": {},
- "encrypt": {
- "capabilities": {
- "accessibility": true,
- "extract": true,
- "moddifyannotations": true,
- "modify": true,
- "modifyassembly": true,
- "modifyforms": true,
- "modifyother": true,
- "printhigh": true,
- "printlow": true
- },
- "encrypted": false,
- "ownerpasswordmatched": false,
- "parameters": {
- "P": 0,
- "R": 0,
- "V": 0,
- "bits": 0,
- "filemethod": "none",
- "key": null,
- "method": "none",
- "streammethod": "none",
- "stringmethod": "none"
- },
- "userpasswordmatched": false
- },
"objectinfo": {
"1 0 R": {
"stream": {
@@ -145,30 +112,6 @@
"/Size": 7
}
},
- "outlines": [],
- "pagelabels": [],
- "pages": [
- {
- "contents": [
- "3 0 R"
- ],
- "images": [],
- "label": null,
- "object": "7 0 R",
- "outlines": [],
- "pageposfrom1": 1
- },
- {
- "contents": [
- "3 0 R"
- ],
- "images": [],
- "label": null,
- "object": "8 0 R",
- "outlines": [],
- "pageposfrom1": 2
- }
- ],
"parameters": {
"decodelevel": "generalized"
},
diff --git a/qpdf/qtest/qpdf/direct-pages-json-pages.out b/qpdf/qtest/qpdf/direct-pages-json-pages.out
new file mode 100644
index 00000000..ee2c03d4
--- /dev/null
+++ b/qpdf/qtest/qpdf/direct-pages-json-pages.out
@@ -0,0 +1,157 @@
+{
+ "objectinfo": {
+ "1 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "2 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "3 0 R": {
+ "stream": {
+ "filter": null,
+ "is": true,
+ "length": 44
+ }
+ },
+ "4 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "5 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "6 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "7 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "8 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ }
+ },
+ "objects": {
+ "1 0 R": {
+ "/Pages": "2 0 R",
+ "/Type": "/Catalog"
+ },
+ "2 0 R": {
+ "/Count": 2,
+ "/Kids": [
+ "7 0 R",
+ "8 0 R"
+ ],
+ "/Type": "/Pages"
+ },
+ "3 0 R": {
+ "/Length": "4 0 R"
+ },
+ "4 0 R": 44,
+ "5 0 R": {
+ "/BaseFont": "/Helvetica",
+ "/Encoding": "/WinAnsiEncoding",
+ "/Name": "/F1",
+ "/Subtype": "/Type1",
+ "/Type": "/Font"
+ },
+ "6 0 R": [
+ "/PDF",
+ "/Text"
+ ],
+ "7 0 R": {
+ "/Contents": "3 0 R",
+ "/MediaBox": [
+ 0,
+ 0,
+ 612,
+ 792
+ ],
+ "/Parent": "2 0 R",
+ "/Resources": {
+ "/Font": {
+ "/F1": "5 0 R"
+ },
+ "/ProcSet": "6 0 R"
+ },
+ "/Type": "/Page"
+ },
+ "8 0 R": {
+ "/Contents": "3 0 R",
+ "/MediaBox": [
+ 0,
+ 0,
+ 612,
+ 792
+ ],
+ "/Parent": "2 0 R",
+ "/Resources": {
+ "/Font": {
+ "/F1": "5 0 R"
+ },
+ "/ProcSet": "6 0 R"
+ },
+ "/Type": "/Page"
+ },
+ "trailer": {
+ "/ID": [
+ "\u0013#¥fi|WzfsU…©6ŸÎ<",
+ "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
+ ],
+ "/Root": "1 0 R",
+ "/Size": 7
+ }
+ },
+ "pages": [
+ {
+ "contents": [
+ "3 0 R"
+ ],
+ "images": [],
+ "label": null,
+ "object": "7 0 R",
+ "outlines": [],
+ "pageposfrom1": 1
+ },
+ {
+ "contents": [
+ "3 0 R"
+ ],
+ "images": [],
+ "label": null,
+ "object": "8 0 R",
+ "outlines": [],
+ "pageposfrom1": 2
+ }
+ ],
+ "parameters": {
+ "decodelevel": "generalized"
+ },
+ "version": 1
+}
diff --git a/qpdf/qtest/qpdf/page_api_2-json-objects.out b/qpdf/qtest/qpdf/page_api_2-json-objects.out
new file mode 100644
index 00000000..76feb0d6
--- /dev/null
+++ b/qpdf/qtest/qpdf/page_api_2-json-objects.out
@@ -0,0 +1,160 @@
+{
+ "objectinfo": {
+ "1 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "10 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "2 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "3 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "4 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "5 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "6 0 R": {
+ "stream": {
+ "filter": null,
+ "is": true,
+ "length": 47
+ }
+ },
+ "7 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "8 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
+ "9 0 R": {
+ "stream": {
+ "filter": null,
+ "is": true,
+ "length": 47
+ }
+ }
+ },
+ "objects": {
+ "1 0 R": {
+ "/Pages": "3 0 R",
+ "/Type": "/Catalog"
+ },
+ "10 0 R": 47,
+ "2 0 R": {
+ "/CreationDate": "D:20120621124041",
+ "/Producer": "Apex PDFWriter"
+ },
+ "3 0 R": {
+ "/Count": 3,
+ "/Kids": [
+ "4 0 R",
+ "4 0 R",
+ "5 0 R"
+ ],
+ "/Type": "/Pages"
+ },
+ "4 0 R": {
+ "/Contents": "6 0 R",
+ "/MediaBox": [
+ 0,
+ 0,
+ 612,
+ 792
+ ],
+ "/Parent": "3 0 R",
+ "/Resources": {
+ "/Font": {
+ "/F1": "8 0 R"
+ },
+ "/ProcSet": [
+ "/PDF",
+ "/Text"
+ ]
+ },
+ "/Type": "/Page"
+ },
+ "5 0 R": {
+ "/Contents": "9 0 R",
+ "/MediaBox": [
+ 0,
+ 0,
+ 612,
+ 792
+ ],
+ "/Parent": "3 0 R",
+ "/Resources": {
+ "/Font": {
+ "/F1": "8 0 R"
+ },
+ "/ProcSet": [
+ "/PDF",
+ "/Text"
+ ]
+ },
+ "/Type": "/Page"
+ },
+ "6 0 R": {
+ "/Length": "7 0 R"
+ },
+ "7 0 R": 47,
+ "8 0 R": {
+ "/BaseFont": "/Times-Roman",
+ "/Encoding": "/WinAnsiEncoding",
+ "/Subtype": "/Type1",
+ "/Type": "/Font"
+ },
+ "9 0 R": {
+ "/Length": "10 0 R"
+ },
+ "trailer": {
+ "/ID": [
+ "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
+ "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
+ ],
+ "/Info": "2 0 R",
+ "/Root": "1 0 R",
+ "/Size": 11
+ }
+ },
+ "parameters": {
+ "decodelevel": "generalized"
+ },
+ "version": 1
+}
diff --git a/qpdf/qtest/qpdf/page_api_2-json.out b/qpdf/qtest/qpdf/page_api_2-json-pages.out
index bef00d02..d08e18d6 100644
--- a/qpdf/qtest/qpdf/page_api_2-json.out
+++ b/qpdf/qtest/qpdf/page_api_2-json-pages.out
@@ -1,37 +1,4 @@
{
- "acroform": {
- "fields": [],
- "hasacroform": false,
- "needappearances": false
- },
- "attachments": {},
- "encrypt": {
- "capabilities": {
- "accessibility": true,
- "extract": true,
- "moddifyannotations": true,
- "modify": true,
- "modifyassembly": true,
- "modifyforms": true,
- "modifyother": true,
- "printhigh": true,
- "printlow": true
- },
- "encrypted": false,
- "ownerpasswordmatched": false,
- "parameters": {
- "P": 0,
- "R": 0,
- "V": 0,
- "bits": 0,
- "filemethod": "none",
- "key": null,
- "method": "none",
- "streammethod": "none",
- "stringmethod": "none"
- },
- "userpasswordmatched": false
- },
"objectinfo": {
"1 0 R": {
"stream": {
@@ -47,6 +14,13 @@
"length": null
}
},
+ "11 0 R": {
+ "stream": {
+ "filter": null,
+ "is": false,
+ "length": null
+ }
+ },
"2 0 R": {
"stream": {
"filter": null,
@@ -110,6 +84,26 @@
"/Type": "/Catalog"
},
"10 0 R": 47,
+ "11 0 R": {
+ "/Contents": "6 0 R",
+ "/MediaBox": [
+ 0,
+ 0,
+ 612,
+ 792
+ ],
+ "/Parent": "3 0 R",
+ "/Resources": {
+ "/Font": {
+ "/F1": "8 0 R"
+ },
+ "/ProcSet": [
+ "/PDF",
+ "/Text"
+ ]
+ },
+ "/Type": "/Page"
+ },
"2 0 R": {
"/CreationDate": "D:20120621124041",
"/Producer": "Apex PDFWriter"
@@ -118,7 +112,7 @@
"/Count": 3,
"/Kids": [
"4 0 R",
- "4 0 R",
+ "11 0 R",
"5 0 R"
],
"/Type": "/Pages"
@@ -186,8 +180,6 @@
"/Size": 11
}
},
- "outlines": [],
- "pagelabels": [],
"pages": [
{
"contents": [