aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog7
-rw-r--r--include/qpdf/JSON.hh84
-rw-r--r--libqpdf/JSON.cc69
-rw-r--r--libtests/json_parse.cc98
-rw-r--r--libtests/qtest/json_parse.test9
-rw-r--r--libtests/qtest/json_parse/good-01-react.out21
-rw-r--r--libtests/qtest/json_parse/good-02-react.out3
-rw-r--r--libtests/qtest/json_parse/good-03-react.out3
-rw-r--r--libtests/qtest/json_parse/good-04-react.out18
-rw-r--r--libtests/qtest/json_parse/good-05-react.out2
-rw-r--r--libtests/qtest/json_parse/good-06-react.out2
-rw-r--r--libtests/qtest/json_parse/good-07-react.out2
-rw-r--r--libtests/qtest/json_parse/good-08-react.out11
-rw-r--r--libtests/qtest/json_parse/good-09-react.out8
-rw-r--r--libtests/qtest/json_parse/good-10-react.out47
-rw-r--r--libtests/qtest/json_parse/good-10.json4
-rw-r--r--libtests/qtest/json_parse/save-10.json27
17 files changed, 401 insertions, 14 deletions
diff --git a/ChangeLog b/ChangeLog
index a25db26d..827a6330 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2022-05-01 Jay Berkenbilt <ejb@ql.org>
+
+ * JSON: add reactors to the JSON parser, making it possible to
+ react to JSON parsing events as they occur and to block the
+ results from being stored. This makes it possible to incrementally
+ parse arbitrarily large JSON inputs.
+
2022-04-30 Jay Berkenbilt <ejb@ql.org>
* QPDFWriter: change encryption API calls
diff --git a/include/qpdf/JSON.hh b/include/qpdf/JSON.hh
index a2a0ea27..e5fa629d 100644
--- a/include/qpdf/JSON.hh
+++ b/include/qpdf/JSON.hh
@@ -141,9 +141,86 @@ class JSON
QPDF_DLL
bool checkSchema(JSON schema, std::list<std::string>& errors);
- // Create a JSON object from a string.
+ // A pointer to a Reactor class can be passed to parse, which
+ // will enable the caller to react to incremental events in the
+ // construction of the JSON object. This makes it possible to
+ // implement SAX-like handling of very large JSON objects.
+ class QPDF_DLL_CLASS Reactor
+ {
+ public:
+ QPDF_DLL
+ virtual ~Reactor() = default;
+
+ // The start/end methods are called when parsing of a
+ // dictionary or array is started or ended. The item methods
+ // are called when an item is added to a dictionary or array.
+ // See important notes in "Item methods" below.
+
+ // During parsing of a JSON string, the parser is operating on
+ // a single object at a time. When a dictionary or array is
+ // started, a new context begins, and when that dictionary or
+ // array is ended, the previous context is resumed. So, for
+ // example, if you have `{"a": [1]}`, you will receive the
+ // following method calls
+ //
+ // dictionaryStart -- current object is the top-level dictionary
+ // arrayStart -- current object is the array
+ // dictionaryItem -- called with "a" and the still-empty array
+ // arrayItem -- called with the "1" object
+ // containerEnd -- now current object is the dictionary again
+ // containerEnd -- current object is undefined
+ //
+ // If the top-level item in a JSON string is a scalar, the
+ // topLevelScalar() method will be called. No argument is
+ // passed since the object is the same as what is returned by
+ // parse().
+
+ QPDF_DLL
+ virtual void dictionaryStart() = 0;
+ QPDF_DLL
+ virtual void arrayStart() = 0;
+ QPDF_DLL
+ virtual void containerEnd(JSON const& value) = 0;
+ QPDF_DLL
+ virtual void topLevelScalar() = 0;
+
+ // Item methods:
+ //
+ // The return value of the item methods indicates whether the
+ // item has been "consumed". If the item method returns true,
+ // then the item will not be added to the containing JSON
+ // object. This is what allows arbitrarily large JSON objects
+ // to be parsed and not have to be kept in memory.
+ //
+ // NOTE: When a dictionary or an array is added to a
+ // container, the dictionaryItem or arrayItem method is called
+ // when the child item's start delimiter is encountered, so
+ // the JSON object passed in at that time will always be
+ // in its initial, empty state.
+
+ QPDF_DLL
+ virtual bool
+ dictionaryItem(std::string const& key, JSON const& value) = 0;
+ QPDF_DLL
+ virtual bool arrayItem(JSON const& value) = 0;
+ };
+
+ // Create a JSON object from a string. See above for information
+ // about how to use the Reactor.
+ QPDF_DLL
+ static JSON parse(std::string const&, Reactor* reactor = nullptr);
+
+ // parse calls setStart and setEnd to record the inclusive start
+ // and non-inclusive end offsets of an object relative to its input
+ // string. For objects not created by parse, both values are 0.
+ QPDF_DLL
+ void setStart(size_t);
+ QPDF_DLL
+ void setEnd(size_t);
+ QPDF_DLL
+ size_t getStart() const;
QPDF_DLL
- static JSON parse(std::string const&);
+ size_t getEnd() const;
private:
static std::string encode_string(std::string const& utf8);
@@ -217,6 +294,9 @@ class JSON
Members(Members const&) = delete;
std::shared_ptr<JSON_value> value;
+ // start and end are only populated for objects created by parse
+ size_t start;
+ size_t end;
};
std::shared_ptr<Members> m;
diff --git a/libqpdf/JSON.cc b/libqpdf/JSON.cc
index 44106688..0f589804 100644
--- a/libqpdf/JSON.cc
+++ b/libqpdf/JSON.cc
@@ -1,12 +1,15 @@
#include <qpdf/JSON.hh>
+#include <qpdf/QIntC.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QUtil.hh>
#include <cstring>
#include <stdexcept>
JSON::Members::Members(std::shared_ptr<JSON_value> value) :
- value(value)
+ value(value),
+ start(0),
+ end(0)
{
}
@@ -455,7 +458,8 @@ namespace
class JSONParser
{
public:
- JSONParser() :
+ JSONParser(JSON::Reactor* reactor) :
+ reactor(reactor),
lex_state(ls_top),
number_before_point(0),
number_after_point(0),
@@ -499,6 +503,7 @@ namespace
ls_backslash,
};
+ JSON::Reactor* reactor;
lex_state_e lex_state;
size_t number_before_point;
size_t number_after_point;
@@ -828,10 +833,18 @@ JSONParser::handleToken()
switch (*tok_start) {
case '{':
item = std::make_shared<JSON>(JSON::makeDictionary());
+ item->setStart(QIntC::to_size(tok_start - cstr));
+ if (reactor) {
+ reactor->dictionaryStart();
+ }
break;
case '[':
item = std::make_shared<JSON>(JSON::makeArray());
+ item->setStart(QIntC::to_size(tok_start - cstr));
+ if (reactor) {
+ reactor->arrayStart();
+ }
break;
default:
@@ -997,6 +1010,11 @@ JSONParser::handleToken()
} else if ((delimiter == '}') || (delimiter == ']')) {
next_state = ps_stack.back();
ps_stack.pop_back();
+ auto tos = stack.back();
+ tos->setEnd(QIntC::to_size(tok_end - cstr));
+ if (reactor) {
+ reactor->containerEnd(*tos);
+ }
if (next_state != ps_done) {
stack.pop_back();
}
@@ -1004,6 +1022,11 @@ JSONParser::handleToken()
throw std::logic_error(
"JSONParser::handleToken: unexpected delimiter in transition");
} else if (item.get()) {
+ if (!(item->isArray() || item->isDictionary())) {
+ item->setStart(QIntC::to_size(tok_start - cstr));
+ item->setEnd(QIntC::to_size(tok_end - cstr));
+ }
+
std::shared_ptr<JSON> tos;
if (!stack.empty()) {
tos = stack.back();
@@ -1017,14 +1040,18 @@ JSONParser::handleToken()
break;
case ps_dict_after_colon:
- tos->addDictionaryMember(dict_key, *item);
+ if (!reactor || !reactor->dictionaryItem(dict_key, *item)) {
+ tos->addDictionaryMember(dict_key, *item);
+ }
next_state = ps_dict_after_item;
break;
case ps_array_begin:
case ps_array_after_comma:
+ if (!reactor || !reactor->arrayItem(*item)) {
+ tos->addArrayElement(*item);
+ }
next_state = ps_array_after_item;
- tos->addArrayElement(*item);
break;
case ps_top:
@@ -1083,12 +1110,40 @@ JSONParser::parse(std::string const& s)
QTC::TC("libtests", "JSON parse premature EOF");
throw std::runtime_error("JSON: premature end of input");
}
- return stack.back();
+ auto const& tos = stack.back();
+ if (reactor && tos.get() && !(tos->isArray() || tos->isDictionary())) {
+ reactor->topLevelScalar();
+ }
+ return tos;
}
JSON
-JSON::parse(std::string const& s)
+JSON::parse(std::string const& s, Reactor* reactor)
{
- JSONParser jp;
+ JSONParser jp(reactor);
return *jp.parse(s);
}
+
+void
+JSON::setStart(size_t start)
+{
+ this->m->start = start;
+}
+
+void
+JSON::setEnd(size_t end)
+{
+ this->m->end = end;
+}
+
+size_t
+JSON::getStart() const
+{
+ return this->m->start;
+}
+
+size_t
+JSON::getEnd() const
+{
+ return this->m->end;
+}
diff --git a/libtests/json_parse.cc b/libtests/json_parse.cc
index 77692eab..7f894c8d 100644
--- a/libtests/json_parse.cc
+++ b/libtests/json_parse.cc
@@ -1,21 +1,113 @@
#include <qpdf/JSON.hh>
#include <qpdf/QUtil.hh>
+#include <cstdlib>
+#include <cstring>
#include <iostream>
+#include <memory>
+
+namespace
+{
+ class Reactor: public JSON::Reactor
+ {
+ public:
+ virtual ~Reactor() = default;
+ virtual void dictionaryStart() override;
+ virtual void arrayStart() override;
+ virtual void containerEnd(JSON const& value) override;
+ virtual void topLevelScalar() override;
+ virtual bool
+ dictionaryItem(std::string const& key, JSON const& value) override;
+ virtual bool arrayItem(JSON const& value) override;
+
+ private:
+ void printItem(JSON const&);
+ };
+} // namespace
+
+void
+Reactor::dictionaryStart()
+{
+ std::cout << "dictionary start" << std::endl;
+}
+
+void
+Reactor::arrayStart()
+{
+ std::cout << "array start" << std::endl;
+}
+
+void
+Reactor::containerEnd(JSON const& value)
+{
+ std::cout << "container end: ";
+ printItem(value);
+}
+
+void
+Reactor::topLevelScalar()
+{
+ std::cout << "top-level scalar" << std::endl;
+}
+
+bool
+Reactor::dictionaryItem(std::string const& key, JSON const& value)
+{
+ std::cout << "dictionary item: " << key << " -> ";
+ printItem(value);
+ if (key == "keep") {
+ return false;
+ }
+ return true;
+}
+
+bool
+Reactor::arrayItem(JSON const& value)
+{
+ std::cout << "array item: ";
+ printItem(value);
+ std::string n;
+ if (value.getString(n) && n == "keep") {
+ return false;
+ }
+ return true;
+}
+
+void
+Reactor::printItem(JSON const& j)
+{
+ std::cout << "[" << j.getStart() << ", " << j.getEnd()
+ << "): " << j.unparse() << std::endl;
+}
+
+static void
+usage()
+{
+ std::cerr << "Usage: json_parse file [--react]" << std::endl;
+ exit(2);
+}
int
main(int argc, char* argv[])
{
- if (argc != 2) {
- std::cerr << "Usage: json_parse file" << std::endl;
+ if ((argc < 2) || (argc > 3)) {
+ usage();
return 2;
}
char const* filename = argv[1];
+ std::shared_ptr<Reactor> reactor;
+ if (argc == 3) {
+ if (strcmp(argv[2], "--react") == 0) {
+ reactor = std::make_shared<Reactor>();
+ } else {
+ usage();
+ }
+ }
try {
std::shared_ptr<char> buf;
size_t size;
QUtil::read_file_into_memory(filename, buf, size);
std::string s(buf.get(), size);
- std::cout << JSON::parse(s).unparse() << std::endl;
+ std::cout << JSON::parse(s, reactor.get()).unparse() << std::endl;
} catch (std::exception& e) {
std::cerr << "exception: " << filename << ": " << e.what() << std::endl;
return 2;
diff --git a/libtests/qtest/json_parse.test b/libtests/qtest/json_parse.test
index cdafb506..15b251cc 100644
--- a/libtests/qtest/json_parse.test
+++ b/libtests/qtest/json_parse.test
@@ -32,7 +32,7 @@ if ($^O ne 'msys')
cleanup();
-my $good = 9;
+my $good = 10;
for (my $i = 1; $i <= $good; ++$i)
{
@@ -73,6 +73,11 @@ for (my $i = 1; $i <= $good; ++$i)
{$td->FILE => "out.json"},
{$td->STRING => ""});
}
+
+ $td->runtest("good $n reactor",
+ {$td->COMMAND => "json_parse good-$n.json --react"},
+ {$td->FILE => "good-$n-react.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
}
my @bad = (
@@ -127,7 +132,7 @@ foreach my $d (@bad)
cleanup();
-$td->report((2 * $good) + scalar(@bad));
+$td->report((3 * $good) + scalar(@bad));
sub cleanup
{
diff --git a/libtests/qtest/json_parse/good-01-react.out b/libtests/qtest/json_parse/good-01-react.out
new file mode 100644
index 00000000..d6167a6b
--- /dev/null
+++ b/libtests/qtest/json_parse/good-01-react.out
@@ -0,0 +1,21 @@
+dictionary start
+dictionary item: a -> [6, 11): "bcd"
+array start
+dictionary item: e -> [18, 0): []
+array item: [19, 20): 1
+array item: [41, 42): 2
+array item: [44, 45): 3
+array item: [46, 47): 4
+array item: [48, 54): "five"
+dictionary start
+array item: [56, 0): {}
+dictionary item: six -> [64, 65): 7
+dictionary item: 8 -> [72, 73): 9
+container end: [56, 74): {}
+array item: [76, 80): null
+array item: [82, 86): true
+array item: [107, 112): false
+array item: [114, 134): "a\b\f\n\r\t\\\"/z"
+container end: [18, 135): []
+container end: [0, 136): {}
+{}
diff --git a/libtests/qtest/json_parse/good-02-react.out b/libtests/qtest/json_parse/good-02-react.out
new file mode 100644
index 00000000..12d1931a
--- /dev/null
+++ b/libtests/qtest/json_parse/good-02-react.out
@@ -0,0 +1,3 @@
+dictionary start
+container end: [0, 2): {}
+{}
diff --git a/libtests/qtest/json_parse/good-03-react.out b/libtests/qtest/json_parse/good-03-react.out
new file mode 100644
index 00000000..02e632b7
--- /dev/null
+++ b/libtests/qtest/json_parse/good-03-react.out
@@ -0,0 +1,3 @@
+array start
+container end: [0, 2): []
+[]
diff --git a/libtests/qtest/json_parse/good-04-react.out b/libtests/qtest/json_parse/good-04-react.out
new file mode 100644
index 00000000..bd18ccfc
--- /dev/null
+++ b/libtests/qtest/json_parse/good-04-react.out
@@ -0,0 +1,18 @@
+array start
+array start
+array item: [1, 0): []
+array start
+array item: [2, 0): []
+dictionary start
+array item: [3, 0): {}
+container end: [3, 5): {}
+container end: [2, 6): []
+dictionary start
+array item: [8, 0): {}
+dictionary start
+dictionary item: -> [13, 0): {}
+container end: [13, 15): {}
+container end: [8, 16): {}
+container end: [1, 17): []
+container end: [0, 18): []
+[]
diff --git a/libtests/qtest/json_parse/good-05-react.out b/libtests/qtest/json_parse/good-05-react.out
new file mode 100644
index 00000000..daa37173
--- /dev/null
+++ b/libtests/qtest/json_parse/good-05-react.out
@@ -0,0 +1,2 @@
+top-level scalar
+"x"
diff --git a/libtests/qtest/json_parse/good-06-react.out b/libtests/qtest/json_parse/good-06-react.out
new file mode 100644
index 00000000..adec78ee
--- /dev/null
+++ b/libtests/qtest/json_parse/good-06-react.out
@@ -0,0 +1,2 @@
+top-level scalar
+123
diff --git a/libtests/qtest/json_parse/good-07-react.out b/libtests/qtest/json_parse/good-07-react.out
new file mode 100644
index 00000000..e08bca41
--- /dev/null
+++ b/libtests/qtest/json_parse/good-07-react.out
@@ -0,0 +1,2 @@
+top-level scalar
+-123
diff --git a/libtests/qtest/json_parse/good-08-react.out b/libtests/qtest/json_parse/good-08-react.out
new file mode 100644
index 00000000..c773d585
--- /dev/null
+++ b/libtests/qtest/json_parse/good-08-react.out
@@ -0,0 +1,11 @@
+array start
+array item: [1, 2): 1
+array item: [4, 6): -2
+array item: [8, 11): 3.4
+array item: [13, 17): -5.6
+array item: [19, 23): -9e1
+array item: [25, 29): 10e2
+array item: [31, 37): 12.3e5
+array item: [39, 46): 12.6e-7
+container end: [0, 47): []
+[]
diff --git a/libtests/qtest/json_parse/good-09-react.out b/libtests/qtest/json_parse/good-09-react.out
new file mode 100644
index 00000000..edbf869b
--- /dev/null
+++ b/libtests/qtest/json_parse/good-09-react.out
@@ -0,0 +1,8 @@
+array start
+array item: [1, 7): "aπb"
+array item: [9, 23): "a\b\f\n\r\tc"
+array item: [25, 42): "aπbπc"
+array item: [44, 52): "π"
+array item: [54, 71): "a\u0018bʬc"
+container end: [0, 72): []
+[]
diff --git a/libtests/qtest/json_parse/good-10-react.out b/libtests/qtest/json_parse/good-10-react.out
new file mode 100644
index 00000000..142d95d0
--- /dev/null
+++ b/libtests/qtest/json_parse/good-10-react.out
@@ -0,0 +1,47 @@
+dictionary start
+array start
+dictionary item: a -> [9, 0): []
+array item: [10, 11): 1
+array item: [13, 14): 2
+dictionary start
+array item: [16, 0): {}
+dictionary item: x -> [22, 25): "y"
+container end: [16, 26): {}
+array item: [28, 29): 3
+dictionary start
+array item: [31, 0): {}
+dictionary item: keep -> [40, 61): "not in final output"
+container end: [31, 62): {
+ "keep": "not in final output"
+}
+container end: [9, 63): []
+array start
+dictionary item: keep -> [75, 0): []
+array item: [76, 77): 1
+array item: [79, 83): null
+array item: [85, 86): 2
+array item: [88, 93): false
+array item: [95, 101): "keep"
+array item: [103, 104): 3
+array start
+array item: [106, 0): []
+array item: [107, 113): "this"
+array item: [115, 121): "keep"
+array item: [123, 128): "not"
+array item: [130, 137): "final"
+container end: [106, 138): [
+ "keep"
+]
+container end: [75, 139): [
+ "keep"
+]
+container end: [0, 141): {
+ "keep": [
+ "keep"
+ ]
+}
+{
+ "keep": [
+ "keep"
+ ]
+}
diff --git a/libtests/qtest/json_parse/good-10.json b/libtests/qtest/json_parse/good-10.json
new file mode 100644
index 00000000..8f2d46b3
--- /dev/null
+++ b/libtests/qtest/json_parse/good-10.json
@@ -0,0 +1,4 @@
+{
+ "a": [1, 2, {"x": "y"}, 3, {"keep": "not in final output"}],
+ "keep": [1, null, 2, false, "keep", 3, ["this", "keep", "not", "final"]]
+}
diff --git a/libtests/qtest/json_parse/save-10.json b/libtests/qtest/json_parse/save-10.json
new file mode 100644
index 00000000..1b111dbd
--- /dev/null
+++ b/libtests/qtest/json_parse/save-10.json
@@ -0,0 +1,27 @@
+{
+ "a": [
+ 1,
+ 2,
+ {
+ "x": "y"
+ },
+ 3,
+ {
+ "keep": "not in final output"
+ }
+ ],
+ "keep": [
+ 1,
+ null,
+ 2,
+ false,
+ "keep",
+ 3,
+ [
+ "this",
+ "keep",
+ "not",
+ "final"
+ ]
+ ]
+}