From 7e7a9c437982b7ede2af9cd0b12b3e47b4bc3a3d Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 15 May 2022 17:38:06 -0400 Subject: Parse objects; stream data is not yet handled --- libqpdf/QPDF_json.cc | 373 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 321 insertions(+), 52 deletions(-) (limited to 'libqpdf/QPDF_json.cc') diff --git a/libqpdf/QPDF_json.cc b/libqpdf/QPDF_json.cc index 2f74a673..d71c75ba 100644 --- a/libqpdf/QPDF_json.cc +++ b/libqpdf/QPDF_json.cc @@ -1,40 +1,101 @@ #include #include +#include #include #include #include -namespace -{ - class JSONExc: public std::runtime_error - { - public: - JSONExc(JSON const& value, std::string const& msg) : - std::runtime_error( - "offset " + QUtil::uint_to_string(value.getStart()) + ": " + - msg) - { - } - }; -} // namespace +// This chart shows an example of the state transitions that would +// occur in parsing a minimal file. + +// | st_initial +// { | -> st_top +// "qpdf": { | -> st_qpdf +// "objects": { | -> st_objects +// "obj:1 0 R": { | -> st_object_top +// "value": { | -> st_object +// "/Pages": "2 0 R", | ... +// "/Type": "/Catalog" | ... +// } | <- st_object_top +// }, | <- st_objects +// "obj:2 0 R": { | -> st_object_top +// "value": 12 | -> st_object +// } | <- st_object_top +// }, | <- st_objects +// "obj:4 0 R": { | -> st_object_top +// "stream": { | -> st_stream +// "data": "cG90YXRv", | ... +// "dict": { | -> st_object +// "/K": true | ... +// } | <- st_stream +// } | <- st_object_top +// }, | <- st_objects +// "trailer": { | -> st_trailer +// "value": { | -> st_object +// "/Root": "1 0 R", | ... +// "/Size": 7 | ... +// } | <- st_trailer +// } | <- st_objects +// } | <- st_qpdf +// } | <- st_top +// } | <- st_initial + +static char const* JSON_PDF = ( + // force line break + "%PDF-1.3\n" + "xref\n" + "0 1\n" + "0000000000 65535 f \n" + "trailer << /Size 1 >>\n" + "startxref\n" + "9\n" + "%%EOF\n"); static std::regex PDF_VERSION_RE("^\\d+\\.\\d+$"); static std::regex OBJ_KEY_RE("^obj:(\\d+) (\\d+) R$"); +static std::regex INDIRECT_OBJ_RE("^(\\d+) (\\d+) R$"); +static std::regex UNICODE_RE("^u:(.*)$"); +static std::regex BINARY_RE("^b:((?:[0-9a-fA-F]{2})*)$"); +static std::regex NAME_RE("^/.*$"); -QPDF::JSONReactor::JSONReactor(QPDF& pdf, bool must_be_complete) : +QPDF::JSONReactor::JSONReactor( + QPDF& pdf, std::string const& filename, bool must_be_complete) : pdf(pdf), + filename(filename), must_be_complete(must_be_complete), + errors(false), + parse_error(false), saw_qpdf(false), + saw_objects(false), saw_json_version(false), saw_pdf_version(false), saw_trailer(false), state(st_initial), - next_state(st_top) + next_state(st_top), + saw_value(false), + saw_stream(false), + saw_dict(false), + saw_data(false), + saw_datafile(false) { state_stack.push_back(st_initial); } +void +QPDF::JSONReactor::error(size_t offset, std::string const& msg) +{ + this->errors = true; + this->pdf.warn( + qpdf_e_json, this->cur_object, QIntC::to_offset(offset), msg); +} + +bool +QPDF::JSONReactor::anyErrors() const +{ + return this->errors; +} + void QPDF::JSONReactor::containerStart() { @@ -46,7 +107,6 @@ void QPDF::JSONReactor::dictionaryStart() { containerStart(); - // QXXXQ } void @@ -57,7 +117,6 @@ QPDF::JSONReactor::arrayStart() QTC::TC("qpdf", "QPDF_json top-level array"); throw std::runtime_error("QPDF JSON must be a dictionary"); } - // QXXXQ } void @@ -68,23 +127,102 @@ QPDF::JSONReactor::containerEnd(JSON const& value) if (state == st_initial) { if (!this->saw_qpdf) { QTC::TC("qpdf", "QPDF_json missing qpdf"); - throw std::runtime_error("\"qpdf\" object was not seen"); + error(0, "\"qpdf\" object was not seen"); + } else { + if (!this->saw_json_version) { + QTC::TC("qpdf", "QPDF_json missing json version"); + error(0, "\"qpdf.jsonversion\" was not seen"); + } + if (must_be_complete && !this->saw_pdf_version) { + QTC::TC("qpdf", "QPDF_json missing pdf version"); + error(0, "\"qpdf.pdfversion\" was not seen"); + } + if (!this->saw_objects) { + QTC::TC("qpdf", "QPDF_json missing objects"); + error(0, "\"qpdf.objects\" was not seen"); + } else { + if (must_be_complete && !this->saw_trailer) { + QTC::TC("qpdf", "QPDF_json missing trailer"); + error(0, "\"qpdf.objects.trailer\" was not seen"); + } + } } - if (!this->saw_json_version) { - QTC::TC("qpdf", "QPDF_json missing json version"); - throw std::runtime_error("\"qpdf.jsonversion\" was not seen"); + } else if (state == st_objects) { + if (parse_error) { + // ignore + } else if (cur_object == "trailer") { + if (!saw_value) { + QTC::TC("qpdf", "QPDF_json trailer no value"); + error(value.getStart(), "\"trailer\" is missing \"value\""); + } + } else if (saw_value == saw_stream) { + QTC::TC("qpdf", "QPDF_json value stream both or neither"); + error( + value.getStart(), + "object must have exactly one of \"value\" or \"stream\""); } - if (must_be_complete && !this->saw_pdf_version) { - QTC::TC("qpdf", "QPDF_json missing pdf version"); - throw std::runtime_error("\"qpdf.pdfversion\" was not seen"); + object_stack.clear(); + this->cur_object = ""; + this->saw_dict = false; + this->saw_data = false; + this->saw_datafile = false; + this->saw_value = false; + this->saw_stream = false; + } else if (state == st_object_top) { + if (saw_stream) { + if (!saw_dict) { + QTC::TC("qpdf", "QPDF_json stream no dict"); + error(value.getStart(), "\"stream\" is missing \"dict\""); + } + if (must_be_complete) { + if (saw_data == saw_datafile) { + QTC::TC("qpdf", "QPDF_json data datafile both or neither"); + error( + value.getStart(), + "\"stream\" must have exactly one of \"data\" or " + "\"datafile\""); + } + } else if (saw_data && saw_datafile) { + // QXXXQ + /// QTC::TC("qpdf", "QPDF_json data and datafile"); + error( + value.getStart(), + "\"stream\" may at most one of \"data\" or \"datafile\""); + } + } + } else if ((state == st_stream) || (state == st_object)) { + if (!parse_error) { + object_stack.pop_back(); } - if (must_be_complete && !this->saw_trailer) { - /// QTC::TC("qpdf", "QPDF_json missing trailer"); - throw std::runtime_error("\"qpdf.objects.trailer\" was not seen"); + } else if (state == st_qpdf) { + for (auto const& og: this->reserved) { + // QXXXQ + // QTC::TC("qpdf", "QPDF_json non-trivial null reserved"); + this->pdf.replaceObject(og, QPDFObjectHandle::newNull()); } + this->reserved.clear(); } +} - // QXXXQ +QPDFObjectHandle +QPDF::JSONReactor::reserveObject(std::string const& obj, std::string const& gen) +{ + int o = QUtil::string_to_int(obj.c_str()); + int g = QUtil::string_to_int(gen.c_str()); + auto oh = pdf.reserveObjectIfNotExists(o, g); + if (oh.isReserved()) { + this->reserved.insert(QPDFObjGen(o, g)); + } + return oh; +} + +void +QPDF::JSONReactor::replaceObject( + QPDFObjectHandle to_replace, QPDFObjectHandle replacement) +{ + auto og = to_replace.getObjGen(); + this->reserved.erase(og); + this->pdf.replaceObject(og, replacement); } void @@ -100,16 +238,20 @@ QPDF::JSONReactor::nestedState( { // Use this method when the next state is for processing a nested // dictionary. - if (!value.isDictionary()) { - throw JSONExc(value, "\"" + key + "\" must be a dictionary"); + if (value.isDictionary()) { + this->next_state = next; + } else { + error(value.getStart(), "\"" + key + "\" must be a dictionary"); + this->next_state = st_ignore; + this->parse_error = true; } - this->next_state = next; } bool QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) { if (state == st_ignore) { + QTC::TC("qpdf", "QPDF_json ignoring in st_ignore"); // ignore } else if (state == st_top) { if (key == "qpdf") { @@ -118,6 +260,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) } else { // Ignore all other fields for forward compatibility. // Don't use nestedState since this can be any type. + // QXXXQ QTC next_state = st_ignore; } } else if (state == st_qpdf) { @@ -126,7 +269,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) std::string v; if (!(value.getNumber(v) && (v == "2"))) { QTC::TC("qpdf", "QPDF_json bad json version"); - throw JSONExc(value, "only JSON version 2 is supported"); + error(value.getStart(), "only JSON version 2 is supported"); } } else if (key == "pdfversion") { this->saw_pdf_version = true; @@ -141,81 +284,197 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) } if (!version_okay) { QTC::TC("qpdf", "QPDF_json bad pdf version"); - throw JSONExc(value, "invalid PDF version (must be x.y)"); + error(value.getStart(), "invalid PDF version (must be x.y)"); } } else if (key == "objects") { - nestedState(key, value, st_objects_top); + this->saw_objects = true; + nestedState(key, value, st_objects); } else { // ignore unknown keys for forward compatibility + // QXXXQ QTC + next_state = st_ignore; } - } else if (state == st_objects_top) { + } else if (state == st_objects) { std::smatch m; if (key == "trailer") { this->saw_trailer = true; - nestedState(key, value, st_trailer_top); - // QXXXQ + nestedState(key, value, st_trailer); + this->cur_object = "trailer"; } else if (std::regex_match(key, m, OBJ_KEY_RE)) { + // QXXXQ remember to handle null for delete + object_stack.push_back(reserveObject(m[1].str(), m[2].str())); nestedState(key, value, st_object_top); - // QXXXQ + this->cur_object = key; } else { QTC::TC("qpdf", "QPDF_json bad object key"); - throw JSONExc( - value, "object key should be \"trailer\" or \"obj:n n R\""); + error( + value.getStart(), + "object key should be \"trailer\" or \"obj:n n R\""); + next_state = st_ignore; + parse_error = true; } } else if (state == st_object_top) { + if (object_stack.size() == 0) { + throw std::logic_error("no object on stack in st_object_top"); + } + auto tos = object_stack.back(); + QPDFObjectHandle replacement; if (key == "value") { // Don't use nestedState since this can have any type. + this->saw_value = true; next_state = st_object; - // QXXXQ + replacement = makeObject(value); + replaceObject(tos, replacement); } else if (key == "stream") { + this->saw_stream = true; nestedState(key, value, st_stream); - // QXXXQ + if (tos.isStream()) { + // QXXXQ reusing -- need QTC + } else { + replacement = + pdf.reserveStream(tos.getObjectID(), tos.getGeneration()); + replaceObject(tos, replacement); + replacement.replaceStreamData( + "", "<<>>"_qpdf, "<<>>"_qpdf); // QXXXQ + } } else { // Ignore unknown keys for forward compatibility + // QXXXQ QTC + next_state = st_ignore; } - } else if (state == st_trailer_top) { + if (replacement.isInitialized()) { + object_stack.pop_back(); + object_stack.push_back(replacement); + } + } else if (state == st_trailer) { if (key == "value") { + this->saw_value = true; // The trailer must be a dictionary, so we can use nestedState. nestedState("trailer.value", value, st_object); - // QXXXQ + this->pdf.m->trailer = makeObject(value); } else if (key == "stream") { + // Don't need to set saw_stream here since there's already + // an error. QTC::TC("qpdf", "QPDF_json trailer stream"); - throw JSONExc(value, "the trailer may not be a stream"); + error(value.getStart(), "the trailer may not be a stream"); + next_state = st_ignore; + parse_error = true; } else { // Ignore unknown keys for forward compatibility + // QXXXQ QTC + next_state = st_ignore; } } else if (state == st_stream) { - if (key == "dict") { + if (object_stack.size() == 0) { + throw std::logic_error("no object on stack in st_stream"); + } + auto tos = object_stack.back(); + if (!tos.isStream()) { + // QXXXQ QTC in update mode + error(value.getStart(), "this object is not a stream"); + parse_error = true; + } else if (key == "dict") { + this->saw_dict = true; // Since a stream dictionary must be a dictionary, we can // use nestedState to transition to st_value. nestedState("stream.dict", value, st_object); - // QXXXQ + auto dict = makeObject(value); + if (dict.isDictionary()) { + tos.replaceDict(dict); + } else { + // An error had already been given by nestedState + QTC::TC("qpdf", "QPDF_json stream dict not dict"); + parse_error = true; + } } else if (key == "data") { + this->saw_data = true; // QXXXQ } else if (key == "datafile") { + this->saw_datafile = true; // QXXXQ } else { // Ignore unknown keys for forward compatibility. + // QXXXQ QTC next_state = st_ignore; } } else if (state == st_object) { - // QXXXQ + if (!parse_error) { + auto dict = object_stack.back(); + if (dict.isStream()) { + dict = dict.getDict(); + } + dict.replaceKey(key, makeObject(value)); + } } else { throw std::logic_error( "QPDF_json: unknown state " + QUtil::int_to_string(state)); } - - // QXXXQ return true; } bool QPDF::JSONReactor::arrayItem(JSON const& value) { - // QXXXQ + if (state == st_object) { + if (!parse_error) { + auto tos = object_stack.back(); + tos.appendItem(makeObject(value)); + } + } return true; } +QPDFObjectHandle +QPDF::JSONReactor::makeObject(JSON const& value) +{ + QPDFObjectHandle result; + std::string str_v; + bool bool_v = false; + std::smatch m; + if (value.isDictionary()) { + result = QPDFObjectHandle::newDictionary(); + object_stack.push_back(result); + } else if (value.isArray()) { + result = QPDFObjectHandle::newArray(); + object_stack.push_back(result); + } else if (value.isNull()) { + result = QPDFObjectHandle::newNull(); + } else if (value.getBool(bool_v)) { + result = QPDFObjectHandle::newBool(bool_v); + } else if (value.getNumber(str_v)) { + if (QUtil::is_long_long(str_v.c_str())) { + result = QPDFObjectHandle::newInteger( + QUtil::string_to_ll(str_v.c_str())); + } else { + result = QPDFObjectHandle::newReal(str_v); + } + } else if (value.getString(str_v)) { + if (std::regex_match(str_v, m, INDIRECT_OBJ_RE)) { + result = reserveObject(m[1].str(), m[2].str()); + } else if (std::regex_match(str_v, m, UNICODE_RE)) { + result = QPDFObjectHandle::newUnicodeString(m[1].str()); + } else if (std::regex_match(str_v, m, BINARY_RE)) { + result = QPDFObjectHandle::newString(QUtil::hex_decode(m[1].str())); + } else if (std::regex_match(str_v, m, NAME_RE)) { + result = QPDFObjectHandle::newName(str_v); + } else { + QTC::TC("qpdf", "QPDF_json unrecognized string value"); + error(value.getStart(), "unrecognized string value"); + result = QPDFObjectHandle::newNull(); + } + } + if (!result.isInitialized()) { + throw std::logic_error( + "JSONReactor::makeObject didn't initialize the object"); + } + + // QXXXQ include object number in description + result.setObjectDescription( + &this->pdf, + this->filename + " offset " + QUtil::uint_to_string(value.getStart())); + return result; +} + void QPDF::createFromJSON(std::string const& json_file) { @@ -225,6 +484,7 @@ QPDF::createFromJSON(std::string const& json_file) void QPDF::createFromJSON(std::shared_ptr is) { + processMemoryFile(is->getName().c_str(), JSON_PDF, strlen(JSON_PDF)); importJSON(is, true); } @@ -243,10 +503,19 @@ QPDF::updateFromJSON(std::shared_ptr is) void QPDF::importJSON(std::shared_ptr is, bool must_be_complete) { - JSONReactor reactor(*this, must_be_complete); + JSONReactor reactor(*this, is->getName(), must_be_complete); try { JSON::parse(*is, &reactor); } catch (std::runtime_error& e) { throw std::runtime_error(is->getName() + ": " + e.what()); } + if (reactor.anyErrors()) { + throw std::runtime_error(is->getName() + ": errors found in JSON"); + } + // QXXXQ + // std::cout << "trailer:\n" << getTrailer().unparse() << std::endl; + // for (auto& oh: getAllObjects()) { + // std::cout << oh.unparse() << ":" << std::endl; + // std::cout << oh.unparseResolved() << std::endl; + // } } -- cgit v1.2.3-70-g09d2