aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf
diff options
context:
space:
mode:
author: m-holger <m-holger@kubitscheck.org> 2022-08-16 14:59:32 +0200
committer: m-holger <m-holger@kubitscheck.org> 2022-08-30 06:56:23 +0200
commit: 6670c685ab9f929121c5498115b278c95574e461 (patch)
tree: 7ba128c33014cc33792090604530fcac7b93448c /libqpdf
parent: 0adfd74f8b5dc96091cd0b4251b08401f54df2ed (diff)
download: qpdf-6670c685ab9f929121c5498115b278c95574e461.tar.zst
Move QPDFObjectHandle::parseInternal to new class QPDFParser
Part of #729
Diffstat (limited to 'libqpdf')
-rw-r--r-- libqpdf/CMakeLists.txt 1
-rw-r--r-- libqpdf/QPDFObjectHandle.cc 498
-rw-r--r-- libqpdf/QPDFParser.cc 503
-rw-r--r-- libqpdf/qpdf/QPDFParser.hh 50
4 files changed, 559 insertions, 493 deletions
diff --git a/libqpdf/CMakeLists.txt b/libqpdf/CMakeLists.txt
index cf807f6d..46d35959 100644
--- a/libqpdf/CMakeLists.txt
+++ b/libqpdf/CMakeLists.txt
@@ -80,6 +80,7 @@ set(libqpdf_SOURCES
QPDFPageDocumentHelper.cc
QPDFPageLabelDocumentHelper.cc
QPDFPageObjectHelper.cc
+ QPDFParser.cc
QPDFStreamFilter.cc
QPDFSystemError.cc
QPDFTokenizer.cc
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 8a2d59e3..377a1cbb 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -8,6 +8,7 @@
#include <qpdf/QPDFLogger.hh>
#include <qpdf/QPDFMatrix.hh>
#include <qpdf/QPDFPageObjectHelper.hh>
+#include <qpdf/QPDFParser.hh>
#include <qpdf/QPDF_Array.hh>
#include <qpdf/QPDF_Bool.hh>
#include <qpdf/QPDF_Dictionary.hh>
@@ -1879,8 +1880,8 @@ QPDFObjectHandle::parseContentStream_data(
tokenizer.readToken(input, "content", true);
qpdf_offset_t offset = input->getLastOffset();
input->seek(offset, SEEK_SET);
- QPDFObjectHandle obj = parseInternal(
- input, "content", tokenizer, empty, nullptr, context, true);
+ auto obj = QPDFParser(input, "content", tokenizer, nullptr, context)
+ .parse(empty, true);
if (!obj.isInitialized()) {
// EOF
break;
@@ -1943,497 +1944,8 @@ QPDFObjectHandle::parse(
StringDecrypter* decrypter,
QPDF* context)
{
- return parseInternal(
- input, object_description, tokenizer, empty, decrypter, context, false);
-}
-
-QPDFObjectHandle
-QPDFObjectHandle::parseInternal(
- std::shared_ptr<InputSource> input,
- std::string const& object_description,
- QPDFTokenizer& tokenizer,
- bool& empty,
- StringDecrypter* decrypter,
- QPDF* context,
- bool content_stream)
-{
- // This method must take care not to resolve any objects. Don't
- // check the type of any object without first ensuring that it is
- // a direct object. Otherwise, doing so may have the side effect
- // of reading the object and changing the file pointer. If you do
- // this, it will cause a logic error to be thrown from
- // QPDF::inParse().
-
- QPDF::ParseGuard pg(context);
-
- empty = false;
-
- QPDFObjectHandle object;
- bool set_offset = false;
-
- std::vector<SparseOHArray> olist_stack;
- olist_stack.push_back(SparseOHArray());
- std::vector<parser_state_e> state_stack;
- state_stack.push_back(st_top);
- std::vector<qpdf_offset_t> offset_stack;
- qpdf_offset_t offset = input->tell();
- offset_stack.push_back(offset);
- bool done = false;
- int bad_count = 0;
- int good_count = 0;
- bool b_contents = false;
- std::vector<std::string> contents_string_stack;
- contents_string_stack.push_back("");
- std::vector<qpdf_offset_t> contents_offset_stack;
- contents_offset_stack.push_back(-1);
- while (!done) {
- bool bad = false;
- SparseOHArray& olist = olist_stack.back();
- parser_state_e state = state_stack.back();
- offset = offset_stack.back();
- std::string& contents_string = contents_string_stack.back();
- qpdf_offset_t& contents_offset = contents_offset_stack.back();
-
- object = QPDFObjectHandle();
- set_offset = false;
-
- QPDFTokenizer::Token token =
- tokenizer.readToken(input, object_description, true);
- std::string const& token_error_message = token.getErrorMessage();
- if (!token_error_message.empty()) {
- // Tokens other than tt_bad can still generate warnings.
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- token_error_message));
- }
-
- switch (token.getType()) {
- case QPDFTokenizer::tt_eof:
- if (!content_stream) {
- QTC::TC("qpdf", "QPDFObjectHandle eof in parseInternal");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "unexpected EOF"));
- }
- bad = true;
- state = st_eof;
- break;
-
- case QPDFTokenizer::tt_bad:
- QTC::TC("qpdf", "QPDFObjectHandle bad token in parse");
- bad = true;
- object = newNull();
- break;
-
- case QPDFTokenizer::tt_brace_open:
- case QPDFTokenizer::tt_brace_close:
- QTC::TC("qpdf", "QPDFObjectHandle bad brace");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "treating unexpected brace token as null"));
- bad = true;
- object = newNull();
- break;
-
- case QPDFTokenizer::tt_array_close:
- if (state == st_array) {
- state = st_stop;
- } else {
- QTC::TC("qpdf", "QPDFObjectHandle bad array close");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "treating unexpected array close token as null"));
- bad = true;
- object = newNull();
- }
- break;
-
- case QPDFTokenizer::tt_dict_close:
- if (state == st_dictionary) {
- state = st_stop;
- } else {
- QTC::TC("qpdf", "QPDFObjectHandle bad dictionary close");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "unexpected dictionary close token"));
- bad = true;
- object = newNull();
- }
- break;
-
- case QPDFTokenizer::tt_array_open:
- case QPDFTokenizer::tt_dict_open:
- if (olist_stack.size() > 500) {
- QTC::TC("qpdf", "QPDFObjectHandle too deep");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "ignoring excessively deeply nested data structure"));
- bad = true;
- object = newNull();
- state = st_top;
- } else {
- olist_stack.push_back(SparseOHArray());
- state = st_start;
- offset_stack.push_back(input->tell());
- state_stack.push_back(
- (token.getType() == QPDFTokenizer::tt_array_open)
- ? st_array
- : st_dictionary);
- b_contents = false;
- contents_string_stack.push_back("");
- contents_offset_stack.push_back(-1);
- }
- break;
-
- case QPDFTokenizer::tt_bool:
- object = newBool((token.getValue() == "true"));
- break;
-
- case QPDFTokenizer::tt_null:
- object = newNull();
- break;
-
- case QPDFTokenizer::tt_integer:
- object = newInteger(QUtil::string_to_ll(token.getValue().c_str()));
- break;
-
- case QPDFTokenizer::tt_real:
- object = newReal(token.getValue());
- break;
-
- case QPDFTokenizer::tt_name:
- {
- std::string name = token.getValue();
- object = newName(name);
-
- if (name == "/Contents") {
- b_contents = true;
- } else {
- b_contents = false;
- }
- }
- break;
-
- case QPDFTokenizer::tt_word:
- {
- std::string const& value = token.getValue();
- if (content_stream) {
- object = QPDFObjectHandle::newOperator(value);
- } else if (
- (value == "R") && (state != st_top) &&
- (olist.size() >= 2) &&
- (!olist.at(olist.size() - 1).isIndirect()) &&
- (olist.at(olist.size() - 1).isInteger()) &&
- (!olist.at(olist.size() - 2).isIndirect()) &&
- (olist.at(olist.size() - 2).isInteger())) {
- if (context == nullptr) {
- QTC::TC(
- "qpdf",
- "QPDFObjectHandle indirect without context");
- throw std::logic_error(
- "QPDFObjectHandle::parse called without context"
- " on an object with indirect references");
- }
- // Try to resolve indirect objects
- object = newIndirect(
- context,
- QPDFObjGen(
- olist.at(olist.size() - 2).getIntValueAsInt(),
- olist.at(olist.size() - 1).getIntValueAsInt()));
- olist.remove_last();
- olist.remove_last();
- } else if ((value == "endobj") && (state == st_top)) {
- // We just saw endobj without having read
- // anything. Treat this as a null and do not move
- // the input source's offset.
- object = newNull();
- input->seek(input->getLastOffset(), SEEK_SET);
- empty = true;
- } else {
- QTC::TC("qpdf", "QPDFObjectHandle treat word as string");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "unknown token while reading object;"
- " treating as string"));
- bad = true;
- object = newString(value);
- }
- }
- break;
-
- case QPDFTokenizer::tt_string:
- {
- std::string val = token.getValue();
- if (decrypter) {
- if (b_contents) {
- contents_string = val;
- contents_offset = input->getLastOffset();
- b_contents = false;
- }
- decrypter->decryptString(val);
- }
- object = QPDFObjectHandle::newString(val);
- }
-
- break;
-
- default:
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "treating unknown token type as null while "
- "reading object"));
- bad = true;
- object = newNull();
- break;
- }
-
- if ((!object.isInitialized()) &&
- (!((state == st_start) || (state == st_stop) ||
- (state == st_eof)))) {
- throw std::logic_error("QPDFObjectHandle::parseInternal: "
- "unexpected uninitialized object");
- object = newNull();
- }
-
- if (bad) {
- ++bad_count;
- good_count = 0;
- } else {
- ++good_count;
- if (good_count > 3) {
- bad_count = 0;
- }
- }
- if (bad_count > 5) {
- // We had too many consecutive errors without enough
- // intervening successful objects. Give up.
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "too many errors; giving up on reading object"));
- state = st_top;
- object = newNull();
- }
-
- switch (state) {
- case st_eof:
- if (state_stack.size() > 1) {
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- input->getLastOffset(),
- "parse error while reading object"));
- }
- done = true;
- // In content stream mode, leave object uninitialized to
- // indicate EOF
- if (!content_stream) {
- object = newNull();
- }
- break;
-
- case st_dictionary:
- case st_array:
- setObjectDescriptionFromInput(
- object,
- context,
- object_description,
- input,
- input->getLastOffset());
- object.setParsedOffset(input->getLastOffset());
- set_offset = true;
- olist.append(object);
- break;
-
- case st_top:
- done = true;
- break;
-
- case st_start:
- break;
-
- case st_stop:
- if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
- throw std::logic_error(
- "QPDFObjectHandle::parseInternal: st_stop encountered"
- " with insufficient elements in stack");
- }
- parser_state_e old_state = state_stack.back();
- state_stack.pop_back();
- if (old_state == st_array) {
- // There's no newArray(SparseOHArray) since
- // SparseOHArray is not part of the public API.
- object = QPDFObjectHandle(QPDF_Array::create(olist));
- setObjectDescriptionFromInput(
- object, context, object_description, input, offset);
- // The `offset` points to the next of "[". Set the
- // rewind offset to point to the beginning of "[".
- // This has been explicitly tested with whitespace
- // surrounding the array start delimiter.
- // getLastOffset points to the array end token and
- // therefore can't be used here.
- object.setParsedOffset(offset - 1);
- set_offset = true;
- } else if (old_state == st_dictionary) {
- // Convert list to map. Alternating elements are keys.
- // Attempt to recover more or less gracefully from
- // invalid dictionaries.
- std::set<std::string> names;
- size_t n_elements = olist.size();
- for (size_t i = 0; i < n_elements; ++i) {
- QPDFObjectHandle oh = olist.at(i);
- if ((!oh.isIndirect()) && oh.isName()) {
- names.insert(oh.getName());
- }
- }
-
- std::map<std::string, QPDFObjectHandle> dict;
- int next_fake_key = 1;
- for (unsigned int i = 0; i < olist.size(); ++i) {
- QPDFObjectHandle key_obj = olist.at(i);
- QPDFObjectHandle val;
- if (key_obj.isIndirect() || (!key_obj.isName())) {
- bool found_fake = false;
- std::string candidate;
- while (!found_fake) {
- candidate = "/QPDFFake" +
- QUtil::int_to_string(next_fake_key++);
- found_fake = (names.count(candidate) == 0);
- QTC::TC(
- "qpdf",
- "QPDFObjectHandle found fake",
- (found_fake ? 0 : 1));
- }
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- offset,
- "expected dictionary key but found"
- " non-name object; inserting key " +
- candidate));
- val = key_obj;
- key_obj = newName(candidate);
- } else if (i + 1 >= olist.size()) {
- QTC::TC("qpdf", "QPDFObjectHandle no val for last key");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- offset,
- "dictionary ended prematurely; "
- "using null as value for last key"));
- val = newNull();
- setObjectDescriptionFromInput(
- val, context, object_description, input, offset);
- } else {
- val = olist.at(++i);
- }
- std::string key = key_obj.getName();
- if (dict.count(key) > 0) {
- QTC::TC("qpdf", "QPDFObjectHandle duplicate dict key");
- warn(
- context,
- QPDFExc(
- qpdf_e_damaged_pdf,
- input->getName(),
- object_description,
- offset,
- "dictionary has duplicated key " + key +
- "; last occurrence overrides earlier "
- "ones"));
- }
- dict[key] = val;
- }
- if (!contents_string.empty() && dict.count("/Type") &&
- dict["/Type"].isNameAndEquals("/Sig") &&
- dict.count("/ByteRange") && dict.count("/Contents") &&
- dict["/Contents"].isString()) {
- dict["/Contents"] =
- QPDFObjectHandle::newString(contents_string);
- dict["/Contents"].setParsedOffset(contents_offset);
- }
- object = newDictionary(dict);
- setObjectDescriptionFromInput(
- object, context, object_description, input, offset);
- // The `offset` points to the next of "<<". Set the
- // rewind offset to point to the beginning of "<<".
- // This has been explicitly tested with whitespace
- // surrounding the dictionary start delimiter.
- // getLastOffset points to the dictionary end token
- // and therefore can't be used here.
- object.setParsedOffset(offset - 2);
- set_offset = true;
- }
- olist_stack.pop_back();
- offset_stack.pop_back();
- if (state_stack.back() == st_top) {
- done = true;
- } else {
- olist_stack.back().append(object);
- }
- contents_string_stack.pop_back();
- contents_offset_stack.pop_back();
- }
- }
-
- if (!set_offset) {
- setObjectDescriptionFromInput(
- object, context, object_description, input, offset);
- object.setParsedOffset(offset);
- }
- return object;
+ return QPDFParser(input, object_description, tokenizer, decrypter, context)
+ .parse(empty, false);
}
qpdf_offset_t
diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc
new file mode 100644
index 00000000..e86a44bd
--- /dev/null
+++ b/libqpdf/QPDFParser.cc
@@ -0,0 +1,503 @@
+#include <qpdf/QPDFParser.hh>
+
+#include <qpdf/QPDF.hh>
+#include <qpdf/QPDFObjectHandle.hh>
+#include <qpdf/QPDF_Array.hh>
+#include <qpdf/QTC.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/SparseOHArray.hh>
+
+// Read one complete object from `input`, starting at the current
+// position, and return it.
+//
+// empty:          set to false on entry; set to true only when the
+//                 first token at top level is the word "endobj", in
+//                 which case a null is returned and the input is
+//                 seeked back to the start of that token.
+// content_stream: when true, bare words become operator objects and
+//                 EOF is reported by returning an uninitialized
+//                 handle without a warning; when false, EOF produces
+//                 a warning and a null object.
+//
+// Damaged input is reported through warn(), which throws the QPDFExc
+// when `context` is null. Throws std::logic_error if an indirect
+// reference is encountered while `context` is null, or if the
+// internal state machine reaches an inconsistent state.
+QPDFObjectHandle
+QPDFParser::parse(bool& empty, bool content_stream)
+{
+    // This method must take care not to resolve any objects. Don't
+    // check the type of any object without first ensuring that it is
+    // a direct object. Otherwise, doing so may have the side effect
+    // of reading the object and changing the file pointer. If you do
+    // this, it will cause a logic error to be thrown from
+    // QPDF::inParse().
+
+    QPDF::ParseGuard pg(context);
+
+    empty = false;
+
+    QPDFObjectHandle object;
+    bool set_offset = false;
+
+    // Parallel stacks, one entry per nesting level of array or
+    // dictionary currently being read. The bottom entry is the top
+    // level.
+    std::vector<SparseOHArray> olist_stack;
+    olist_stack.push_back(SparseOHArray());
+    std::vector<parser_state_e> state_stack;
+    state_stack.push_back(st_top);
+    std::vector<qpdf_offset_t> offset_stack;
+    qpdf_offset_t offset = input->tell();
+    offset_stack.push_back(offset);
+    bool done = false;
+    int bad_count = 0;
+    int good_count = 0;
+    // True immediately after a /Contents name has been read; used to
+    // remember the raw (undecrypted) string that follows it.
+    bool b_contents = false;
+    std::vector<std::string> contents_string_stack;
+    contents_string_stack.push_back("");
+    std::vector<qpdf_offset_t> contents_offset_stack;
+    contents_offset_stack.push_back(-1);
+    while (!done) {
+        bool bad = false;
+        SparseOHArray& olist = olist_stack.back();
+        parser_state_e state = state_stack.back();
+        offset = offset_stack.back();
+        std::string& contents_string = contents_string_stack.back();
+        qpdf_offset_t& contents_offset = contents_offset_stack.back();
+
+        object = QPDFObjectHandle();
+        set_offset = false;
+
+        QPDFTokenizer::Token token =
+            tokenizer.readToken(input, object_description, true);
+        std::string const& token_error_message = token.getErrorMessage();
+        if (!token_error_message.empty()) {
+            // Tokens other than tt_bad can still generate warnings.
+            warn(
+                context,
+                QPDFExc(
+                    qpdf_e_damaged_pdf,
+                    input->getName(),
+                    object_description,
+                    input->getLastOffset(),
+                    token_error_message));
+        }
+
+        switch (token.getType()) {
+        case QPDFTokenizer::tt_eof:
+            if (!content_stream) {
+                QTC::TC("qpdf", "QPDFParser eof in parse");
+                warn(
+                    context,
+                    QPDFExc(
+                        qpdf_e_damaged_pdf,
+                        input->getName(),
+                        object_description,
+                        input->getLastOffset(),
+                        "unexpected EOF"));
+            }
+            bad = true;
+            state = st_eof;
+            break;
+
+        case QPDFTokenizer::tt_bad:
+            QTC::TC("qpdf", "QPDFParser bad token in parse");
+            bad = true;
+            object = QPDFObjectHandle::newNull();
+            break;
+
+        case QPDFTokenizer::tt_brace_open:
+        case QPDFTokenizer::tt_brace_close:
+            QTC::TC("qpdf", "QPDFParser bad brace");
+            warn(
+                context,
+                QPDFExc(
+                    qpdf_e_damaged_pdf,
+                    input->getName(),
+                    object_description,
+                    input->getLastOffset(),
+                    "treating unexpected brace token as null"));
+            bad = true;
+            object = QPDFObjectHandle::newNull();
+            break;
+
+        case QPDFTokenizer::tt_array_close:
+            if (state == st_array) {
+                state = st_stop;
+            } else {
+                QTC::TC("qpdf", "QPDFParser bad array close");
+                warn(
+                    context,
+                    QPDFExc(
+                        qpdf_e_damaged_pdf,
+                        input->getName(),
+                        object_description,
+                        input->getLastOffset(),
+                        "treating unexpected array close token as null"));
+                bad = true;
+                object = QPDFObjectHandle::newNull();
+            }
+            break;
+
+        case QPDFTokenizer::tt_dict_close:
+            if (state == st_dictionary) {
+                state = st_stop;
+            } else {
+                QTC::TC("qpdf", "QPDFParser bad dictionary close");
+                warn(
+                    context,
+                    QPDFExc(
+                        qpdf_e_damaged_pdf,
+                        input->getName(),
+                        object_description,
+                        input->getLastOffset(),
+                        "unexpected dictionary close token"));
+                bad = true;
+                object = QPDFObjectHandle::newNull();
+            }
+            break;
+
+        case QPDFTokenizer::tt_array_open:
+        case QPDFTokenizer::tt_dict_open:
+            if (olist_stack.size() > 500) {
+                QTC::TC("qpdf", "QPDFParser too deep");
+                warn(
+                    context,
+                    QPDFExc(
+                        qpdf_e_damaged_pdf,
+                        input->getName(),
+                        object_description,
+                        input->getLastOffset(),
+                        "ignoring excessively deeply nested data structure"));
+                bad = true;
+                object = QPDFObjectHandle::newNull();
+                state = st_top;
+            } else {
+                // Open a new nesting level on every parallel stack.
+                olist_stack.push_back(SparseOHArray());
+                state = st_start;
+                offset_stack.push_back(input->tell());
+                state_stack.push_back(
+                    (token.getType() == QPDFTokenizer::tt_array_open)
+                        ? st_array
+                        : st_dictionary);
+                b_contents = false;
+                contents_string_stack.push_back("");
+                contents_offset_stack.push_back(-1);
+            }
+            break;
+
+        case QPDFTokenizer::tt_bool:
+            object = QPDFObjectHandle::newBool((token.getValue() == "true"));
+            break;
+
+        case QPDFTokenizer::tt_null:
+            object = QPDFObjectHandle::newNull();
+            break;
+
+        case QPDFTokenizer::tt_integer:
+            object = QPDFObjectHandle::newInteger(
+                QUtil::string_to_ll(token.getValue().c_str()));
+            break;
+
+        case QPDFTokenizer::tt_real:
+            object = QPDFObjectHandle::newReal(token.getValue());
+            break;
+
+        case QPDFTokenizer::tt_name:
+            {
+                std::string name = token.getValue();
+                object = QPDFObjectHandle::newName(name);
+                b_contents = (name == "/Contents");
+            }
+            break;
+
+        case QPDFTokenizer::tt_word:
+            {
+                std::string const& value = token.getValue();
+                if (content_stream) {
+                    object = QPDFObjectHandle::newOperator(value);
+                } else if (
+                    (value == "R") && (state != st_top) &&
+                    (olist.size() >= 2) &&
+                    (!olist.at(olist.size() - 1).isIndirect()) &&
+                    (olist.at(olist.size() - 1).isInteger()) &&
+                    (!olist.at(olist.size() - 2).isIndirect()) &&
+                    (olist.at(olist.size() - 2).isInteger())) {
+                    if (context == nullptr) {
+                        QTC::TC("qpdf", "QPDFParser indirect without context");
+                        throw std::logic_error(
+                            "QPDFObjectHandle::parse called without context"
+                            " on an object with indirect references");
+                    }
+                    // Try to resolve indirect objects
+                    object = QPDFObjectHandle::newIndirect(
+                        context,
+                        QPDFObjGen(
+                            olist.at(olist.size() - 2).getIntValueAsInt(),
+                            olist.at(olist.size() - 1).getIntValueAsInt()));
+                    // The two integers were consumed by the reference.
+                    olist.remove_last();
+                    olist.remove_last();
+                } else if ((value == "endobj") && (state == st_top)) {
+                    // We just saw endobj without having read
+                    // anything. Treat this as a null and do not move
+                    // the input source's offset.
+                    object = QPDFObjectHandle::newNull();
+                    input->seek(input->getLastOffset(), SEEK_SET);
+                    empty = true;
+                } else {
+                    QTC::TC("qpdf", "QPDFParser treat word as string");
+                    warn(
+                        context,
+                        QPDFExc(
+                            qpdf_e_damaged_pdf,
+                            input->getName(),
+                            object_description,
+                            input->getLastOffset(),
+                            "unknown token while reading object;"
+                            " treating as string"));
+                    bad = true;
+                    object = QPDFObjectHandle::newString(value);
+                }
+            }
+            break;
+
+        case QPDFTokenizer::tt_string:
+            {
+                std::string val = token.getValue();
+                if (decrypter) {
+                    if (b_contents) {
+                        // Keep the raw string before decryption; it
+                        // is restored below for signature
+                        // dictionaries.
+                        contents_string = val;
+                        contents_offset = input->getLastOffset();
+                        b_contents = false;
+                    }
+                    decrypter->decryptString(val);
+                }
+                object = QPDFObjectHandle::newString(val);
+            }
+
+            break;
+
+        default:
+            warn(
+                context,
+                QPDFExc(
+                    qpdf_e_damaged_pdf,
+                    input->getName(),
+                    object_description,
+                    input->getLastOffset(),
+                    "treating unknown token type as null while "
+                    "reading object"));
+            bad = true;
+            object = QPDFObjectHandle::newNull();
+            break;
+        }
+
+        if ((!object.isInitialized()) &&
+            (!((state == st_start) || (state == st_stop) ||
+               (state == st_eof)))) {
+            throw std::logic_error(
+                "QPDFParser::parse: unexpected uninitialized object");
+        }
+
+        if (bad) {
+            ++bad_count;
+            good_count = 0;
+        } else {
+            ++good_count;
+            if (good_count > 3) {
+                bad_count = 0;
+            }
+        }
+        if (bad_count > 5) {
+            // We had too many consecutive errors without enough
+            // intervening successful objects. Give up.
+            warn(
+                context,
+                QPDFExc(
+                    qpdf_e_damaged_pdf,
+                    input->getName(),
+                    object_description,
+                    input->getLastOffset(),
+                    "too many errors; giving up on reading object"));
+            state = st_top;
+            object = QPDFObjectHandle::newNull();
+        }
+
+        switch (state) {
+        case st_eof:
+            if (state_stack.size() > 1) {
+                warn(
+                    context,
+                    QPDFExc(
+                        qpdf_e_damaged_pdf,
+                        input->getName(),
+                        object_description,
+                        input->getLastOffset(),
+                        "parse error while reading object"));
+            }
+            done = true;
+            // In content stream mode, leave object uninitialized to
+            // indicate EOF
+            if (!content_stream) {
+                object = QPDFObjectHandle::newNull();
+            }
+            break;
+
+        case st_dictionary:
+        case st_array:
+            QPDFObjectHandle::setObjectDescriptionFromInput(
+                object,
+                context,
+                object_description,
+                input,
+                input->getLastOffset());
+            object.setParsedOffset(input->getLastOffset());
+            set_offset = true;
+            olist.append(object);
+            break;
+
+        case st_top:
+            done = true;
+            break;
+
+        case st_start:
+            break;
+
+        case st_stop:
+            if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
+                throw std::logic_error(
+                    "QPDFParser::parse: st_stop encountered"
+                    " with insufficient elements in stack");
+            }
+            parser_state_e old_state = state_stack.back();
+            state_stack.pop_back();
+            if (old_state == st_array) {
+                // There's no newArray(SparseOHArray) since
+                // SparseOHArray is not part of the public API.
+                object = QPDFObjectHandle(QPDF_Array::create(olist));
+                QPDFObjectHandle::setObjectDescriptionFromInput(
+                    object, context, object_description, input, offset);
+                // The `offset` points to the next of "[". Set the
+                // rewind offset to point to the beginning of "[".
+                // This has been explicitly tested with whitespace
+                // surrounding the array start delimiter.
+                // getLastOffset points to the array end token and
+                // therefore can't be used here.
+                object.setParsedOffset(offset - 1);
+                set_offset = true;
+            } else if (old_state == st_dictionary) {
+                // Convert list to map. Alternating elements are keys.
+                // Attempt to recover more or less gracefully from
+                // invalid dictionaries.
+                // Collect every direct name so that generated fake
+                // keys never collide with one.
+                std::set<std::string> names;
+                size_t n_elements = olist.size();
+                for (size_t i = 0; i < n_elements; ++i) {
+                    QPDFObjectHandle oh = olist.at(i);
+                    if ((!oh.isIndirect()) && oh.isName()) {
+                        names.insert(oh.getName());
+                    }
+                }
+
+                std::map<std::string, QPDFObjectHandle> dict;
+                int next_fake_key = 1;
+                for (size_t i = 0; i < olist.size(); ++i) {
+                    QPDFObjectHandle key_obj = olist.at(i);
+                    QPDFObjectHandle val;
+                    if (key_obj.isIndirect() || (!key_obj.isName())) {
+                        bool found_fake = false;
+                        std::string candidate;
+                        while (!found_fake) {
+                            candidate = "/QPDFFake" +
+                                QUtil::int_to_string(next_fake_key++);
+                            found_fake = (names.count(candidate) == 0);
+                            QTC::TC(
+                                "qpdf",
+                                "QPDFParser found fake",
+                                (found_fake ? 0 : 1));
+                        }
+                        warn(
+                            context,
+                            QPDFExc(
+                                qpdf_e_damaged_pdf,
+                                input->getName(),
+                                object_description,
+                                offset,
+                                "expected dictionary key but found"
+                                " non-name object; inserting key " +
+                                    candidate));
+                        val = key_obj;
+                        key_obj = QPDFObjectHandle::newName(candidate);
+                    } else if (i + 1 >= olist.size()) {
+                        QTC::TC("qpdf", "QPDFParser no val for last key");
+                        warn(
+                            context,
+                            QPDFExc(
+                                qpdf_e_damaged_pdf,
+                                input->getName(),
+                                object_description,
+                                offset,
+                                "dictionary ended prematurely; "
+                                "using null as value for last key"));
+                        val = QPDFObjectHandle::newNull();
+                        QPDFObjectHandle::setObjectDescriptionFromInput(
+                            val, context, object_description, input, offset);
+                    } else {
+                        val = olist.at(++i);
+                    }
+                    std::string key = key_obj.getName();
+                    if (dict.count(key) > 0) {
+                        QTC::TC("qpdf", "QPDFParser duplicate dict key");
+                        warn(
+                            context,
+                            QPDFExc(
+                                qpdf_e_damaged_pdf,
+                                input->getName(),
+                                object_description,
+                                offset,
+                                "dictionary has duplicated key " + key +
+                                    "; last occurrence overrides earlier "
+                                    "ones"));
+                    }
+                    dict[key] = val;
+                }
+                // For a /Sig dictionary with /ByteRange, put back the
+                // raw /Contents string captured before decryption.
+                if (!contents_string.empty() && dict.count("/Type") &&
+                    dict["/Type"].isNameAndEquals("/Sig") &&
+                    dict.count("/ByteRange") && dict.count("/Contents") &&
+                    dict["/Contents"].isString()) {
+                    dict["/Contents"] =
+                        QPDFObjectHandle::newString(contents_string);
+                    dict["/Contents"].setParsedOffset(contents_offset);
+                }
+                object = QPDFObjectHandle::newDictionary(dict);
+                QPDFObjectHandle::setObjectDescriptionFromInput(
+                    object, context, object_description, input, offset);
+                // The `offset` points to the next of "<<". Set the
+                // rewind offset to point to the beginning of "<<".
+                // This has been explicitly tested with whitespace
+                // surrounding the dictionary start delimiter.
+                // getLastOffset points to the dictionary end token
+                // and therefore can't be used here.
+                object.setParsedOffset(offset - 2);
+                set_offset = true;
+            }
+            olist_stack.pop_back();
+            offset_stack.pop_back();
+            if (state_stack.back() == st_top) {
+                done = true;
+            } else {
+                olist_stack.back().append(object);
+            }
+            contents_string_stack.pop_back();
+            contents_offset_stack.pop_back();
+        }
+    }
+
+    if (!set_offset) {
+        QPDFObjectHandle::setObjectDescriptionFromInput(
+            object, context, object_description, input, offset);
+        object.setParsedOffset(offset);
+    }
+    return object;
+}
+
+void
+QPDFParser::warn(QPDF* qpdf, QPDFExc const& e)
+{
+    // Without an owning QPDF object (for example when an object is
+    // being created explicitly from a string) there is nowhere to
+    // record a warning, so the exception is thrown instead.
+    if (qpdf == nullptr) {
+        throw e;
+    }
+    // Parsing on behalf of a QPDF object: report the problem as a
+    // warning through that object.
+    qpdf->warn(e);
+}
diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh
new file mode 100644
index 00000000..e929c3f2
--- /dev/null
+++ b/libqpdf/qpdf/QPDFParser.hh
@@ -0,0 +1,50 @@
+#ifndef QPDFPARSER_HH
+#define QPDFPARSER_HH
+
+#include <qpdf/QPDFObjectHandle.hh>
+
+#include <memory>
+#include <string>
+
+// Parser for a single PDF object read from an InputSource.
+//
+// The object description and the tokenizer are held by reference and
+// the decrypter and context as raw pointers; none of them is owned.
+// All of them must remain valid for as long as parse() may be called.
+// The visible callers construct a temporary QPDFParser and call
+// parse() within the same expression, which satisfies this.
+class QPDFParser
+{
+  public:
+    QPDFParser() = delete;
+    // decrypter and context may be null. With a null context, warn()
+    // throws instead of recording a warning.
+    QPDFParser(
+        std::shared_ptr<InputSource> input,
+        std::string const& object_description,
+        QPDFTokenizer& tokenizer,
+        QPDFObjectHandle::StringDecrypter* decrypter,
+        QPDF* context) :
+        input(input),
+        object_description(object_description),
+        tokenizer(tokenizer),
+        decrypter(decrypter),
+        context(context)
+    {
+    }
+    virtual ~QPDFParser() = default;
+
+    // Read one object from the input. `empty` is an output flag;
+    // `content_stream` selects content-stream parsing rules.
+    QPDFObjectHandle parse(bool& empty, bool content_stream);
+
+  private:
+    // State of the nesting level currently being parsed.
+    enum parser_state_e {
+        st_top,
+        st_start,
+        st_stop,
+        st_eof,
+        st_dictionary,
+        st_array
+    };
+
+    // Warn through the QPDF object if there is one, otherwise throw.
+    static void warn(QPDF*, QPDFExc const&);
+
+    std::shared_ptr<InputSource> input;
+    std::string const& object_description;
+    QPDFTokenizer& tokenizer;
+    QPDFObjectHandle::StringDecrypter* decrypter;
+    QPDF* context;
+};
+
+#endif // QPDFPARSER_HH