From 8d2a0eda5a76a341ae6b597f58e874d9e3bd571c Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 1 May 2022 14:06:31 -0400 Subject: Add reactors to the JSON parser --- ChangeLog | 7 +++ include/qpdf/JSON.hh | 84 ++++++++++++++++++++++++- libqpdf/JSON.cc | 69 +++++++++++++++++--- libtests/json_parse.cc | 98 ++++++++++++++++++++++++++++- libtests/qtest/json_parse.test | 9 ++- libtests/qtest/json_parse/good-01-react.out | 21 +++++++ libtests/qtest/json_parse/good-02-react.out | 3 + libtests/qtest/json_parse/good-03-react.out | 3 + libtests/qtest/json_parse/good-04-react.out | 18 ++++++ libtests/qtest/json_parse/good-05-react.out | 2 + libtests/qtest/json_parse/good-06-react.out | 2 + libtests/qtest/json_parse/good-07-react.out | 2 + libtests/qtest/json_parse/good-08-react.out | 11 ++++ libtests/qtest/json_parse/good-09-react.out | 8 +++ libtests/qtest/json_parse/good-10-react.out | 47 ++++++++++++++ libtests/qtest/json_parse/good-10.json | 4 ++ libtests/qtest/json_parse/save-10.json | 27 ++++++++ 17 files changed, 401 insertions(+), 14 deletions(-) create mode 100644 libtests/qtest/json_parse/good-01-react.out create mode 100644 libtests/qtest/json_parse/good-02-react.out create mode 100644 libtests/qtest/json_parse/good-03-react.out create mode 100644 libtests/qtest/json_parse/good-04-react.out create mode 100644 libtests/qtest/json_parse/good-05-react.out create mode 100644 libtests/qtest/json_parse/good-06-react.out create mode 100644 libtests/qtest/json_parse/good-07-react.out create mode 100644 libtests/qtest/json_parse/good-08-react.out create mode 100644 libtests/qtest/json_parse/good-09-react.out create mode 100644 libtests/qtest/json_parse/good-10-react.out create mode 100644 libtests/qtest/json_parse/good-10.json create mode 100644 libtests/qtest/json_parse/save-10.json diff --git a/ChangeLog b/ChangeLog index a25db26d..827a6330 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2022-05-01 Jay Berkenbilt + + * JSON: add reactors to the JSON 
parser, making it possible to + react to JSON parsing events as they occur and to block the + results from being stored. This makes it possible to incrementally + parse arbitrarily large JSON inputs. + 2022-04-30 Jay Berkenbilt * QPDFWriter: change encryption API calls diff --git a/include/qpdf/JSON.hh b/include/qpdf/JSON.hh index a2a0ea27..e5fa629d 100644 --- a/include/qpdf/JSON.hh +++ b/include/qpdf/JSON.hh @@ -141,9 +141,86 @@ class JSON QPDF_DLL bool checkSchema(JSON schema, std::list& errors); - // Create a JSON object from a string. + // A pointer to a Reactor class can be passed to parse, which + // will enable the caller to react to incremental events in the + // construction of the JSON object. This makes it possible to + // implement SAX-like handling of very large JSON objects. + class QPDF_DLL_CLASS Reactor + { + public: + QPDF_DLL + virtual ~Reactor() = default; + + // The start/end methods are called when parsing of a + // dictionary or array is started or ended. The item methods + // are called when an item is added to a dictionary or array. + // See important notes in "Item methods" below. + + // During parsing of a JSON string, the parser is operating on + // a single object at a time. When a dictionary or array is + // started, a new context begins, and when that dictionary or + // array is ended, the previous context is resumed. So, for + // example, if you have `{"a": [1]}`, you will receive the + // following method calls + // + // dictionaryStart -- current object is the top-level dictionary + // arrayStart -- current object is the array + // dictionaryItem -- called with "a" and the still-empty array + // arrayItem -- called with the "1" object + // containerEnd -- now current object is the dictionary again + // containerEnd -- current object is undefined + // + // If the top-level item in a JSON string is a scalar, the + // topLevelScalar() method will be called. 
No argument is + // passed since the object is the same as what is returned by + // parse(). + + QPDF_DLL + virtual void dictionaryStart() = 0; + QPDF_DLL + virtual void arrayStart() = 0; + QPDF_DLL + virtual void containerEnd(JSON const& value) = 0; + QPDF_DLL + virtual void topLevelScalar() = 0; + + // Item methods: + // + // The return value of the item methods indicates whether the + // item has been "consumed". If the item method returns true, + // then the item will not be added to the containing JSON + // object. This is what allows arbitrarily large JSON objects + // to be parsed and not have to be kept in memory. + // + // NOTE: When a dictionary or an array is added to a + // container, the dictionaryItem or arrayItem method is called + // when the child item's start delimiter is encountered, so + // the JSON object passed in at that time will always be + // in its initial, empty state. + + QPDF_DLL + virtual bool + dictionaryItem(std::string const& key, JSON const& value) = 0; + QPDF_DLL + virtual bool arrayItem(JSON const& value) = 0; + }; + + // Create a JSON object from a string. See above for information + // about how to use the Reactor. + QPDF_DLL + static JSON parse(std::string const&, Reactor* reactor = nullptr); + + // parse calls setStart and setEnd to set the inclusive start and + // non-inclusive end offsets of an object relative to its input + // string. Otherwise, both values are 0. 
+ QPDF_DLL + void setStart(size_t); + QPDF_DLL + void setEnd(size_t); + QPDF_DLL + size_t getStart() const; QPDF_DLL - static JSON parse(std::string const&); + size_t getEnd() const; private: static std::string encode_string(std::string const& utf8); @@ -217,6 +294,9 @@ class JSON Members(Members const&) = delete; std::shared_ptr value; + // start and end are only populated for objects created by parse + size_t start; + size_t end; }; std::shared_ptr m; diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc index 44106688..0f589804 100644 --- a/libqpdf/JSON.cc +++ b/libqpdf/JSON.cc @@ -1,12 +1,15 @@ #include +#include #include #include #include #include JSON::Members::Members(std::shared_ptr value) : - value(value) + value(value), + start(0), + end(0) { } @@ -455,7 +458,8 @@ namespace class JSONParser { public: - JSONParser() : + JSONParser(JSON::Reactor* reactor) : + reactor(reactor), lex_state(ls_top), number_before_point(0), number_after_point(0), @@ -499,6 +503,7 @@ namespace ls_backslash, }; + JSON::Reactor* reactor; lex_state_e lex_state; size_t number_before_point; size_t number_after_point; @@ -828,10 +833,18 @@ JSONParser::handleToken() switch (*tok_start) { case '{': item = std::make_shared(JSON::makeDictionary()); + item->setStart(QIntC::to_size(tok_start - cstr)); + if (reactor) { + reactor->dictionaryStart(); + } break; case '[': item = std::make_shared(JSON::makeArray()); + item->setStart(QIntC::to_size(tok_start - cstr)); + if (reactor) { + reactor->arrayStart(); + } break; default: @@ -997,6 +1010,11 @@ JSONParser::handleToken() } else if ((delimiter == '}') || (delimiter == ']')) { next_state = ps_stack.back(); ps_stack.pop_back(); + auto tos = stack.back(); + tos->setEnd(QIntC::to_size(tok_end - cstr)); + if (reactor) { + reactor->containerEnd(*tos); + } if (next_state != ps_done) { stack.pop_back(); } @@ -1004,6 +1022,11 @@ JSONParser::handleToken() throw std::logic_error( "JSONParser::handleToken: unexpected delimiter in transition"); } else if 
(item.get()) { + if (!(item->isArray() || item->isDictionary())) { + item->setStart(QIntC::to_size(tok_start - cstr)); + item->setEnd(QIntC::to_size(tok_end - cstr)); + } + std::shared_ptr tos; if (!stack.empty()) { tos = stack.back(); @@ -1017,14 +1040,18 @@ JSONParser::handleToken() break; case ps_dict_after_colon: - tos->addDictionaryMember(dict_key, *item); + if (!reactor || !reactor->dictionaryItem(dict_key, *item)) { + tos->addDictionaryMember(dict_key, *item); + } next_state = ps_dict_after_item; break; case ps_array_begin: case ps_array_after_comma: + if (!reactor || !reactor->arrayItem(*item)) { + tos->addArrayElement(*item); + } next_state = ps_array_after_item; - tos->addArrayElement(*item); break; case ps_top: @@ -1083,12 +1110,40 @@ JSONParser::parse(std::string const& s) QTC::TC("libtests", "JSON parse premature EOF"); throw std::runtime_error("JSON: premature end of input"); } - return stack.back(); + auto const& tos = stack.back(); + if (reactor && tos.get() && !(tos->isArray() || tos->isDictionary())) { + reactor->topLevelScalar(); + } + return tos; } JSON -JSON::parse(std::string const& s) +JSON::parse(std::string const& s, Reactor* reactor) { - JSONParser jp; + JSONParser jp(reactor); return *jp.parse(s); } + +void +JSON::setStart(size_t start) +{ + this->m->start = start; +} + +void +JSON::setEnd(size_t end) +{ + this->m->end = end; +} + +size_t +JSON::getStart() const +{ + return this->m->start; +} + +size_t +JSON::getEnd() const +{ + return this->m->end; +} diff --git a/libtests/json_parse.cc b/libtests/json_parse.cc index 77692eab..7f894c8d 100644 --- a/libtests/json_parse.cc +++ b/libtests/json_parse.cc @@ -1,21 +1,113 @@ #include #include +#include +#include #include +#include + +namespace +{ + class Reactor: public JSON::Reactor + { + public: + virtual ~Reactor() = default; + virtual void dictionaryStart() override; + virtual void arrayStart() override; + virtual void containerEnd(JSON const& value) override; + virtual void 
topLevelScalar() override; + virtual bool + dictionaryItem(std::string const& key, JSON const& value) override; + virtual bool arrayItem(JSON const& value) override; + + private: + void printItem(JSON const&); + }; +} // namespace + +void +Reactor::dictionaryStart() +{ + std::cout << "dictionary start" << std::endl; +} + +void +Reactor::arrayStart() +{ + std::cout << "array start" << std::endl; +} + +void +Reactor::containerEnd(JSON const& value) +{ + std::cout << "container end: "; + printItem(value); +} + +void +Reactor::topLevelScalar() +{ + std::cout << "top-level scalar" << std::endl; +} + +bool +Reactor::dictionaryItem(std::string const& key, JSON const& value) +{ + std::cout << "dictionary item: " << key << " -> "; + printItem(value); + if (key == "keep") { + return false; + } + return true; +} + +bool +Reactor::arrayItem(JSON const& value) +{ + std::cout << "array item: "; + printItem(value); + std::string n; + if (value.getString(n) && n == "keep") { + return false; + } + return true; +} + +void +Reactor::printItem(JSON const& j) +{ + std::cout << "[" << j.getStart() << ", " << j.getEnd() + << "): " << j.unparse() << std::endl; +} + +static void +usage() +{ + std::cerr << "Usage: json_parse file [--react]" << std::endl; + exit(2); +} int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "Usage: json_parse file" << std::endl; + if ((argc < 2) || (argc > 3)) { + usage(); return 2; } char const* filename = argv[1]; + std::shared_ptr reactor; + if (argc == 3) { + if (strcmp(argv[2], "--react") == 0) { + reactor = std::make_shared(); + } else { + usage(); + } + } try { std::shared_ptr buf; size_t size; QUtil::read_file_into_memory(filename, buf, size); std::string s(buf.get(), size); - std::cout << JSON::parse(s).unparse() << std::endl; + std::cout << JSON::parse(s, reactor.get()).unparse() << std::endl; } catch (std::exception& e) { std::cerr << "exception: " << filename << ": " << e.what() << std::endl; return 2; diff --git 
a/libtests/qtest/json_parse.test b/libtests/qtest/json_parse.test index cdafb506..15b251cc 100644 --- a/libtests/qtest/json_parse.test +++ b/libtests/qtest/json_parse.test @@ -32,7 +32,7 @@ if ($^O ne 'msys') cleanup(); -my $good = 9; +my $good = 10; for (my $i = 1; $i <= $good; ++$i) { @@ -73,6 +73,11 @@ for (my $i = 1; $i <= $good; ++$i) {$td->FILE => "out.json"}, {$td->STRING => ""}); } + + $td->runtest("good $n reactor", + {$td->COMMAND => "json_parse good-$n.json --react"}, + {$td->FILE => "good-$n-react.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); } my @bad = ( @@ -127,7 +132,7 @@ foreach my $d (@bad) cleanup(); -$td->report((2 * $good) + scalar(@bad)); +$td->report((3 * $good) + scalar(@bad)); sub cleanup { diff --git a/libtests/qtest/json_parse/good-01-react.out b/libtests/qtest/json_parse/good-01-react.out new file mode 100644 index 00000000..d6167a6b --- /dev/null +++ b/libtests/qtest/json_parse/good-01-react.out @@ -0,0 +1,21 @@ +dictionary start +dictionary item: a -> [6, 11): "bcd" +array start +dictionary item: e -> [18, 0): [] +array item: [19, 20): 1 +array item: [41, 42): 2 +array item: [44, 45): 3 +array item: [46, 47): 4 +array item: [48, 54): "five" +dictionary start +array item: [56, 0): {} +dictionary item: six -> [64, 65): 7 +dictionary item: 8 -> [72, 73): 9 +container end: [56, 74): {} +array item: [76, 80): null +array item: [82, 86): true +array item: [107, 112): false +array item: [114, 134): "a\b\f\n\r\t\\\"/z" +container end: [18, 135): [] +container end: [0, 136): {} +{} diff --git a/libtests/qtest/json_parse/good-02-react.out b/libtests/qtest/json_parse/good-02-react.out new file mode 100644 index 00000000..12d1931a --- /dev/null +++ b/libtests/qtest/json_parse/good-02-react.out @@ -0,0 +1,3 @@ +dictionary start +container end: [0, 2): {} +{} diff --git a/libtests/qtest/json_parse/good-03-react.out b/libtests/qtest/json_parse/good-03-react.out new file mode 100644 index 00000000..02e632b7 --- /dev/null +++ 
b/libtests/qtest/json_parse/good-03-react.out @@ -0,0 +1,3 @@ +array start +container end: [0, 2): [] +[] diff --git a/libtests/qtest/json_parse/good-04-react.out b/libtests/qtest/json_parse/good-04-react.out new file mode 100644 index 00000000..bd18ccfc --- /dev/null +++ b/libtests/qtest/json_parse/good-04-react.out @@ -0,0 +1,18 @@ +array start +array start +array item: [1, 0): [] +array start +array item: [2, 0): [] +dictionary start +array item: [3, 0): {} +container end: [3, 5): {} +container end: [2, 6): [] +dictionary start +array item: [8, 0): {} +dictionary start +dictionary item: -> [13, 0): {} +container end: [13, 15): {} +container end: [8, 16): {} +container end: [1, 17): [] +container end: [0, 18): [] +[] diff --git a/libtests/qtest/json_parse/good-05-react.out b/libtests/qtest/json_parse/good-05-react.out new file mode 100644 index 00000000..daa37173 --- /dev/null +++ b/libtests/qtest/json_parse/good-05-react.out @@ -0,0 +1,2 @@ +top-level scalar +"x" diff --git a/libtests/qtest/json_parse/good-06-react.out b/libtests/qtest/json_parse/good-06-react.out new file mode 100644 index 00000000..adec78ee --- /dev/null +++ b/libtests/qtest/json_parse/good-06-react.out @@ -0,0 +1,2 @@ +top-level scalar +123 diff --git a/libtests/qtest/json_parse/good-07-react.out b/libtests/qtest/json_parse/good-07-react.out new file mode 100644 index 00000000..e08bca41 --- /dev/null +++ b/libtests/qtest/json_parse/good-07-react.out @@ -0,0 +1,2 @@ +top-level scalar +-123 diff --git a/libtests/qtest/json_parse/good-08-react.out b/libtests/qtest/json_parse/good-08-react.out new file mode 100644 index 00000000..c773d585 --- /dev/null +++ b/libtests/qtest/json_parse/good-08-react.out @@ -0,0 +1,11 @@ +array start +array item: [1, 2): 1 +array item: [4, 6): -2 +array item: [8, 11): 3.4 +array item: [13, 17): -5.6 +array item: [19, 23): -9e1 +array item: [25, 29): 10e2 +array item: [31, 37): 12.3e5 +array item: [39, 46): 12.6e-7 +container end: [0, 47): [] +[] diff --git 
a/libtests/qtest/json_parse/good-09-react.out b/libtests/qtest/json_parse/good-09-react.out new file mode 100644 index 00000000..edbf869b --- /dev/null +++ b/libtests/qtest/json_parse/good-09-react.out @@ -0,0 +1,8 @@ +array start +array item: [1, 7): "aπb" +array item: [9, 23): "a\b\f\n\r\tc" +array item: [25, 42): "aπbπc" +array item: [44, 52): "π" +array item: [54, 71): "a\u0018bʬc" +container end: [0, 72): [] +[] diff --git a/libtests/qtest/json_parse/good-10-react.out b/libtests/qtest/json_parse/good-10-react.out new file mode 100644 index 00000000..142d95d0 --- /dev/null +++ b/libtests/qtest/json_parse/good-10-react.out @@ -0,0 +1,47 @@ +dictionary start +array start +dictionary item: a -> [9, 0): [] +array item: [10, 11): 1 +array item: [13, 14): 2 +dictionary start +array item: [16, 0): {} +dictionary item: x -> [22, 25): "y" +container end: [16, 26): {} +array item: [28, 29): 3 +dictionary start +array item: [31, 0): {} +dictionary item: keep -> [40, 61): "not in final output" +container end: [31, 62): { + "keep": "not in final output" +} +container end: [9, 63): [] +array start +dictionary item: keep -> [75, 0): [] +array item: [76, 77): 1 +array item: [79, 83): null +array item: [85, 86): 2 +array item: [88, 93): false +array item: [95, 101): "keep" +array item: [103, 104): 3 +array start +array item: [106, 0): [] +array item: [107, 113): "this" +array item: [115, 121): "keep" +array item: [123, 128): "not" +array item: [130, 137): "final" +container end: [106, 138): [ + "keep" +] +container end: [75, 139): [ + "keep" +] +container end: [0, 141): { + "keep": [ + "keep" + ] +} +{ + "keep": [ + "keep" + ] +} diff --git a/libtests/qtest/json_parse/good-10.json b/libtests/qtest/json_parse/good-10.json new file mode 100644 index 00000000..8f2d46b3 --- /dev/null +++ b/libtests/qtest/json_parse/good-10.json @@ -0,0 +1,4 @@ +{ + "a": [1, 2, {"x": "y"}, 3, {"keep": "not in final output"}], + "keep": [1, null, 2, false, "keep", 3, ["this", "keep", "not", "final"]] 
+} diff --git a/libtests/qtest/json_parse/save-10.json b/libtests/qtest/json_parse/save-10.json new file mode 100644 index 00000000..1b111dbd --- /dev/null +++ b/libtests/qtest/json_parse/save-10.json @@ -0,0 +1,27 @@ +{ + "a": [ + 1, + 2, + { + "x": "y" + }, + 3, + { + "keep": "not in final output" + } + ], + "keep": [ + 1, + null, + 2, + false, + "keep", + 3, + [ + "this", + "keep", + "not", + "final" + ] + ] +} -- cgit v1.2.3-54-g00ecf