From c76536dd9a150adb71fdcda11ee1a93f25128cc7 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 7 May 2022 13:33:45 -0400 Subject: Implement JSON v2 output --- libqpdf/QPDFJob.cc | 156 +++++++++++++++++++++++++++++++++---- libqpdf/QPDFJob_config.cc | 23 ++++++ libqpdf/QPDFObjectHandle.cc | 2 +- libqpdf/QPDF_Stream.cc | 16 ++-- libqpdf/qpdf/QPDF_Stream.hh | 2 +- libqpdf/qpdf/auto_job_help.hh | 21 ++++- libqpdf/qpdf/auto_job_init.hh | 5 +- libqpdf/qpdf/auto_job_json_init.hh | 9 ++- libqpdf/qpdf/auto_job_schema.hh | 2 + 9 files changed, 209 insertions(+), 27 deletions(-) (limited to 'libqpdf') diff --git a/libqpdf/QPDFJob.cc b/libqpdf/QPDFJob.cc index 521377f8..621e6933 100644 --- a/libqpdf/QPDFJob.cc +++ b/libqpdf/QPDFJob.cc @@ -401,6 +401,7 @@ QPDFJob::Members::Members() : flatten_rotation(false), list_attachments(false), json_version(0), + json_stream_data(qpdf_sj_none), test_json_schema(false), check(false), optimize_images(false), @@ -695,6 +696,17 @@ QPDFJob::checkConfiguration() " use --replace-input to intentionally" " overwrite the input file"); } + + if (m->json_version == 1) { + if (m->json_keys.count("qpdf")) { + usage("json key \"qpdf\" is not valid for json version 1"); + } + } else { + if (m->json_keys.count("objects") || m->json_keys.count("objectinfo")) { + usage("json keys \"objects\" and \"objectinfo\" are only valid for " + "json version 1"); + } + } } unsigned long @@ -1103,6 +1115,102 @@ QPDFJob::doJSONObjectinfo(Pipeline* p, bool& first, QPDF& pdf) JSON::writeDictionaryClose(p, first_object, 1); } +void +QPDFJob::doJSONStream( + Pipeline* p, + bool& first, + QPDF& pdf, + QPDFObjectHandle& obj, + std::string const& file_prefix) +{ + Pipeline* stream_p = nullptr; + FILE* f = nullptr; + std::shared_ptr f_pl; + std::string filename; + if (this->m->json_stream_data == qpdf_sj_file) { + filename = file_prefix + "-" + QUtil::int_to_string(obj.getObjectID()); + f = QUtil::safe_fopen(filename.c_str(), "wb"); + f_pl = std::make_shared("stream data", f); + stream_p = f_pl.get(); + } + auto j = JSON::makeDictionary(); + j.addDictionaryMember( + "stream", + obj.getStreamJSON( + this->m->json_version, + this->m->json_stream_data, + this->m->decode_level, + stream_p, + filename)); + + JSON::writeDictionaryItem(p, first, "obj:" + obj.unparse(), j, 2); + if (f) { + f_pl->finish(); + f_pl = nullptr; + fclose(f); + } +} + +void +QPDFJob::doJSONObject( + Pipeline* p, + bool& first, + QPDF& pdf, + std::string const& key, + QPDFObjectHandle& obj) +{ + auto j = JSON::makeDictionary(); + j.addDictionaryMember("value", obj.getJSON(this->m->json_version, true)); + JSON::writeDictionaryItem(p, first, key, j, 2); +} + +void +QPDFJob::doJSONQpdf(Pipeline* p, bool& first, QPDF& pdf) +{ + std::string file_prefix = this->m->json_stream_prefix; + if (this->m->json_stream_data == qpdf_sj_file) { + if (file_prefix.empty()) { + if (this->m->infilename.get()) { + file_prefix = this->m->infilename.get(); + } + if (file_prefix.empty()) { + usage( + "please specify --json-stream-prefix since the input file " + "name is unknown"); + } + } + } + + JSON::writeDictionaryKey(p, first, "qpdf", 0); + bool first_qpdf = true; + JSON::writeDictionaryOpen(p, first_qpdf, 1); + JSON::writeDictionaryItem( + p, first_qpdf, "jsonversion", JSON::makeInt(this->m->json_version), 1); + JSON::writeDictionaryItem( + p, first_qpdf, "pdfversion", JSON::makeString(pdf.getPDFVersion()), 1); + JSON::writeDictionaryKey(p, first_qpdf, "objects", 1); + bool first_object = true; + JSON::writeDictionaryOpen(p, first_object, 2); + bool all_objects = m->json_objects.empty(); + std::set wanted_og = getWantedJSONObjects(); + std::vector objects = pdf.getAllObjects(); + for (auto& obj: objects) { + if (all_objects || wanted_og.count(obj.getObjGen())) { + if (obj.isStream()) { + doJSONStream(p, first_object, pdf, obj, file_prefix); + } else { + doJSONObject(p, first_object, pdf, "obj:" + obj.unparse(), obj); + } + } + } + if (all_objects || m->json_objects.count("trailer")) { + auto trailer = pdf.getTrailer(); + doJSONObject(p, first_object, pdf, "trailer", trailer); + } + JSON::writeDictionaryClose(p, first_object, 2); + JSON::writeDictionaryClose(p, first_qpdf, 1); +} + void QPDFJob::doJSONPages(Pipeline* p, bool& first, QPDF& pdf) { @@ -1482,14 +1590,15 @@ QPDFJob::json_schema(int json_version, std::set* keys) // The list of selectable top-level keys id duplicated in the // following places: job.yml, QPDFJob::json_schema, and // QPDFJob::doJSON. - if (all_keys || keys->count("objects")) { - schema.addDictionaryMember("objects", JSON::parse(R"({ + if (json_version == 1) { + if (all_keys || keys->count("objects")) { + schema.addDictionaryMember("objects", JSON::parse(R"({ "": "json representation of object" })")); - } - if (all_keys || keys->count("objectinfo")) { - JSON objectinfo = - schema.addDictionaryMember("objectinfo", JSON::parse(R"({ + } + if (all_keys || keys->count("objectinfo")) { + JSON objectinfo = + schema.addDictionaryMember("objectinfo", JSON::parse(R"({ "": { "stream": { "filter": "if stream, its filters, otherwise null", @@ -1498,6 +1607,17 @@ QPDFJob::json_schema(int json_version, std::set* keys) } } })")); + } + } else { + if (all_keys || keys->count("qpdf")) { + schema.addDictionaryMember("qpdf", JSON::parse(R"({ + "jsonversion": "qpdf json output version", + "pdfversion": "PDF version from PDF header", + "objects": { + "": "json representation of object" + } +})")); + } } if (all_keys || keys->count("pages")) { JSON page = schema.addDictionaryMember("pages", JSON::parse(R"([ @@ -1705,15 +1825,21 @@ QPDFJob::doJSON(QPDF& pdf, Pipeline* p) doJSONOutlines(p, first, pdf); } - // We do objects and objectinfo last so their information is - // consistent with repairing the page tree. To see the original - // file with any page tree problems and the page tree not - // flattened, select objects/objectinfo without other keys. - if (all_keys || m->json_keys.count("objects")) { - doJSONObjects(p, first, pdf); - } - if (all_keys || m->json_keys.count("objectinfo")) { - doJSONObjectinfo(p, first, pdf); + // We do objects last so their information is consistent with + // repairing the page tree. To see the original file with any page + // tree problems and the page tree not flattened, select + // objects/objectinfo without other keys. + if (this->m->json_version == 1) { + if (all_keys || m->json_keys.count("objects")) { + doJSONObjects(p, first, pdf); + } + if (all_keys || m->json_keys.count("objectinfo")) { + doJSONObjectinfo(p, first, pdf); + } + } else { + if (all_keys || m->json_keys.count("qpdf")) { + doJSONQpdf(p, first, pdf); + } } JSON::writeDictionaryClose(p, first, 0); diff --git a/libqpdf/QPDFJob_config.cc b/libqpdf/QPDFJob_config.cc index 3f8f0840..d990de37 100644 --- a/libqpdf/QPDFJob_config.cc +++ b/libqpdf/QPDFJob_config.cc @@ -260,6 +260,29 @@ QPDFJob::Config::jsonObject(std::string const& parameter) return this; } +QPDFJob::Config* +QPDFJob::Config::jsonStreamData(std::string const& parameter) +{ + if (parameter == "none") { + o.m->json_stream_data = qpdf_sj_none; + } else if (parameter == "inline") { + o.m->json_stream_data = qpdf_sj_inline; + } else if (parameter == "file") { + o.m->json_stream_data = qpdf_sj_file; + } else { + usage("invalid json-streams option"); + } + + return this; +} + +QPDFJob::Config* +QPDFJob::Config::jsonStreamPrefix(std::string const& parameter) +{ + o.m->json_stream_prefix = parameter; + return this; +} + QPDFJob::Config* QPDFJob::Config::testJsonSchema() { diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 1d6a9ccf..10fb153c 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -1800,7 +1800,7 @@ QPDFObjectHandle::getJSON(int json_version, bool dereference_indirect) JSON QPDFObjectHandle::getStreamJSON( int json_version, - qpdf_stream_data_json_e json_data, + qpdf_json_stream_data_e json_data, qpdf_stream_decode_level_e decode_level, Pipeline* p, std::string const& data_filename) diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc index 67a3ad0d..ff62df73 100644 --- a/libqpdf/QPDF_Stream.cc +++ b/libqpdf/QPDF_Stream.cc @@ -189,7 +189,7 @@ QPDF_Stream::getJSON(int json_version) JSON QPDF_Stream::getStreamJSON( int json_version, - qpdf_stream_data_json_e json_data, + qpdf_json_stream_data_e json_data, qpdf_stream_decode_level_e decode_level, Pipeline* p, std::string const& data_filename) @@ -231,11 +231,17 @@ QPDF_Stream::getStreamJSON( } else { data_pipeline = &discard; } - filtered = pipeStreamData( - data_pipeline, nullptr, 0, decode_level, false, (attempt == 1)); - if (filter && (!filtered)) { + bool succeeded = pipeStreamData( + data_pipeline, + &filtered, + 0, + decode_level, + false, + (attempt == 1)); + if ((!succeeded) || (filter && (!filtered))) { // Try again filter = false; + decode_level = qpdf_dl_none; } else { if (buf_pl.get()) { buf = buf_pl->getBufferSharedPointer(); @@ -247,7 +253,7 @@ QPDF_Stream::getStreamJSON( // touching top-level keys. dict = this->stream_dict.unsafeShallowCopy(); dict.removeKey("/Length"); - if (filtered) { + if (filter && filtered) { dict.removeKey("/Filter"); dict.removeKey("/DecodeParms"); } diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh index fcf98ffa..51b215e2 100644 --- a/libqpdf/qpdf/QPDF_Stream.hh +++ b/libqpdf/qpdf/QPDF_Stream.hh @@ -63,7 +63,7 @@ class QPDF_Stream: public QPDFObject addTokenFilter(std::shared_ptr token_filter); JSON getStreamJSON( int json_version, - qpdf_stream_data_json_e json_data, + qpdf_json_stream_data_e json_data, qpdf_stream_decode_level_e decode_level, Pipeline* p, std::string const& data_filename); diff --git a/libqpdf/qpdf/auto_job_help.hh b/libqpdf/qpdf/auto_job_help.hh index 55d2cc63..47210371 100644 --- a/libqpdf/qpdf/auto_job_help.hh +++ b/libqpdf/qpdf/auto_job_help.hh @@ -817,6 +817,21 @@ objects will be shown. ap.addOptionHelp("--job-json-help", "json", "show format of job JSON", R"(Describe the format of the QPDFJob JSON input used by --job-json-file. )"); +ap.addOptionHelp("--json-stream-data", "json", "how to handle streams in json output", R"(--json-stream-data={none|inline|file} + +Control whether streams in json output should be omitted, +written inline (base64-encoded) or written to a file. If "file" +is chosen, the file will be the name of the input file appended +with -nnn where nnn is the object number. The prefix can be +overridden with --json-stream-prefix. +)"); +ap.addOptionHelp("--json-stream-prefix", "json", "prefix for json stream data files", R"(--json-stream-prefix=file-prefix + +When --json-stream-data=file is given, override the input file +name as the prefix for stream data files. Whatever is given here +will be appended with -nnn to create the name of the file that +will contain the data for the stream stream in object nnn. +)"); ap.addHelpTopic("testing", "options for testing or debugging", R"(The options below are useful when writing automated test code that includes files created by qpdf or when testing qpdf itself. )"); @@ -829,6 +844,9 @@ for testing only so that output files can be reproducible. Never use it for production files. This option is not secure since it significantly weakens the encryption. )"); +} +static void add_help_8(QPDFArgParser& ap) +{ ap.addOptionHelp("--linearize-pass1", "testing", "save pass 1 of linearization", R"(--linearize-pass1=file Write the first pass of linearization to the named file. The @@ -839,9 +857,6 @@ ap.addOptionHelp("--test-json-schema", "testing", "test generated json against s the output of qpdf --json and the output of qpdf --json-help. )"); } -static void add_help_8(QPDFArgParser& ap) -{ -} static void add_help(QPDFArgParser& ap) { add_help_1(ap); diff --git a/libqpdf/qpdf/auto_job_init.hh b/libqpdf/qpdf/auto_job_init.hh index b3191d4d..5c13275c 100644 --- a/libqpdf/qpdf/auto_job_init.hh +++ b/libqpdf/qpdf/auto_job_init.hh @@ -20,7 +20,8 @@ static char const* object_streams_choices[] = {"disable", "preserve", "generate" static char const* remove_unref_choices[] = {"auto", "yes", "no", 0}; static char const* flatten_choices[] = {"all", "print", "screen", 0}; static char const* json_version_choices[] = {"1", "2", "latest", 0}; -static char const* json_key_choices[] = {"acroform", "attachments", "encrypt", "objectinfo", "objects", "outlines", "pagelabels", "pages", 0}; +static char const* json_key_choices[] = {"acroform", "attachments", "encrypt", "objectinfo", "objects", "outlines", "pagelabels", "pages", "qpdf", 0}; +static char const* json_stream_data_choices[] = {"none", "inline", "file", 0}; static char const* print128_choices[] = {"full", "low", "none", 0}; static char const* modify128_choices[] = {"all", "annotate", "form", "assembly", "none", 0}; @@ -101,6 +102,7 @@ this->ap.addRequiredParameter("remove-attachment", [this](std::string const& x){ this->ap.addRequiredParameter("rotate", [this](std::string const& x){c_main->rotate(x);}, "[+|-]angle"); this->ap.addRequiredParameter("show-attachment", [this](std::string const& x){c_main->showAttachment(x);}, "attachment"); this->ap.addRequiredParameter("show-object", [this](std::string const& x){c_main->showObject(x);}, "trailer"); +this->ap.addRequiredParameter("json-stream-prefix", [this](std::string const& x){c_main->jsonStreamPrefix(x);}, "stream-file-prefix"); this->ap.addOptionalParameter("collate", [this](std::string const& x){c_main->collate(x);}); this->ap.addOptionalParameter("split-pages", [this](std::string const& x){c_main->splitPages(x);}); this->ap.addChoices("compress-streams", [this](std::string const& x){c_main->compressStreams(x);}, true, yn_choices); @@ -113,6 +115,7 @@ this->ap.addChoices("object-streams", [this](std::string const& x){c_main->objec this->ap.addChoices("password-mode", [this](std::string const& x){c_main->passwordMode(x);}, true, password_mode_choices); this->ap.addChoices("remove-unreferenced-resources", [this](std::string const& x){c_main->removeUnreferencedResources(x);}, true, remove_unref_choices); this->ap.addChoices("stream-data", [this](std::string const& x){c_main->streamData(x);}, true, stream_data_choices); +this->ap.addChoices("json-stream-data", [this](std::string const& x){c_main->jsonStreamData(x);}, true, json_stream_data_choices); this->ap.addChoices("json", [this](std::string const& x){c_main->json(x);}, false, json_version_choices); this->ap.registerOptionTable("pages", b(&ArgParser::argEndPages)); this->ap.addPositional(p(&ArgParser::argPagesPositional)); diff --git a/libqpdf/qpdf/auto_job_json_init.hh b/libqpdf/qpdf/auto_job_json_init.hh index 92c4d65c..c73eb3a7 100644 --- a/libqpdf/qpdf/auto_job_json_init.hh +++ b/libqpdf/qpdf/auto_job_json_init.hh @@ -13,7 +13,8 @@ static char const* object_streams_choices[] = {"disable", "preserve", "generate" static char const* remove_unref_choices[] = {"auto", "yes", "no", 0}; static char const* flatten_choices[] = {"all", "print", "screen", 0}; static char const* json_version_choices[] = {"1", "2", "latest", 0}; -static char const* json_key_choices[] = {"acroform", "attachments", "encrypt", "objectinfo", "objects", "outlines", "pagelabels", "pages", 0}; +static char const* json_key_choices[] = {"acroform", "attachments", "encrypt", "objectinfo", "objects", "outlines", "pagelabels", "pages", "qpdf", 0}; +static char const* json_stream_data_choices[] = {"none", "inline", "file", 0}; static char const* print128_choices[] = {"full", "low", "none", 0}; static char const* modify128_choices[] = {"all", "annotate", "form", "assembly", "none", 0}; @@ -252,6 +253,12 @@ beginArray(bindJSON(&Handlers::beginJsonObjectArray), bindBare(&Handlers::endJso addParameter([this](std::string const& p) { c_main->jsonObject(p); }); popHandler(); // array: .jsonObject[] popHandler(); // key: jsonObject +pushKey("jsonStreamData"); +addChoices(json_stream_data_choices, true, [this](std::string const& p) { c_main->jsonStreamData(p); }); +popHandler(); // key: jsonStreamData +pushKey("jsonStreamPrefix"); +addParameter([this](std::string const& p) { c_main->jsonStreamPrefix(p); }); +popHandler(); // key: jsonStreamPrefix pushKey("allowWeakCrypto"); addBare([this]() { c_main->allowWeakCrypto(); }); popHandler(); // key: allowWeakCrypto diff --git a/libqpdf/qpdf/auto_job_schema.hh b/libqpdf/qpdf/auto_job_schema.hh index 267dad23..c2310961 100644 --- a/libqpdf/qpdf/auto_job_schema.hh +++ b/libqpdf/qpdf/auto_job_schema.hh @@ -84,6 +84,8 @@ static constexpr char const* JOB_SCHEMA_DATA = R"({ "jsonObject": [ "limit which objects are in JSON" ], + "jsonStreamData": "how to handle streams in json output", + "jsonStreamPrefix": "prefix for json stream data files", "allowWeakCrypto": "allow insecure cryptographic algorithms", "keepFilesOpen": "manage keeping multiple files open", "keepFilesOpenThreshold": "set threshold for keepFilesOpen", -- cgit v1.2.3-54-g00ecf