summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2023-12-21 21:43:50 +0100
committerJay Berkenbilt <ejb@ql.org>2023-12-21 21:43:50 +0100
commitb8fd18ae562ab8bae1d2e67c1ab63ff4ea62124b (patch)
tree2b4348b00beef1773725a1d3d18cfbcafa7d076e
parent64c840b1eacd610e290ede0d24c36349dae5916e (diff)
parent1285f9767af983df74b52c4f2aadcbfaec36a6fc (diff)
downloadqpdf-b8fd18ae562ab8bae1d2e67c1ab63ff4ea62124b.tar.zst
Merge branch 'parse_ref' into work
-rw-r--r--libqpdf/QPDFParser.cc674
-rw-r--r--libqpdf/qpdf/QPDFParser.hh47
-rw-r--r--qpdf/qpdf.testcov6
-rw-r--r--qpdf/qtest/parsing.test2
-rw-r--r--qpdf/qtest/qpdf/bad16-recover.out4
-rw-r--r--qpdf/qtest/qpdf/bad16.out2
-rw-r--r--qpdf/qtest/qpdf/bad36-recover.out2
-rw-r--r--qpdf/qtest/qpdf/bad36.out2
-rw-r--r--qpdf/qtest/qpdf/bad39.qdf102
-rw-r--r--qpdf/qtest/qpdf/issue-335a.out4
-rw-r--r--qpdf/qtest/qpdf/parse-object.out8
-rw-r--r--qpdf/test_driver.cc7
12 files changed, 555 insertions, 305 deletions
diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc
index 6dcbddb5..32c4f8e9 100644
--- a/libqpdf/QPDFParser.cc
+++ b/libqpdf/QPDFParser.cc
@@ -21,22 +21,7 @@
#include <memory>
-namespace
-{
- struct StackFrame
- {
- StackFrame(std::shared_ptr<InputSource> input) :
- offset(input->tell())
- {
- }
-
- std::vector<std::shared_ptr<QPDFObject>> olist;
- qpdf_offset_t offset;
- std::string contents_string{""};
- qpdf_offset_t contents_offset{-1};
- int null_count{0};
- };
-} // namespace
+using ObjectPtr = std::shared_ptr<QPDFObject>;
QPDFObjectHandle
QPDFParser::parse(bool& empty, bool content_stream)
@@ -46,371 +31,457 @@ QPDFParser::parse(bool& empty, bool content_stream)
// effect of reading the object and changing the file pointer. If you do this, it will cause a
// logic error to be thrown from QPDF::inParse().
- const static std::shared_ptr<QPDFObject> null_oh = QPDF_Null::create();
QPDF::ParseGuard pg(context);
-
empty = false;
+ start = input->tell();
- std::shared_ptr<QPDFObject> object;
- bool set_offset = false;
-
- std::vector<StackFrame> stack;
- stack.emplace_back(input);
- std::vector<parser_state_e> state_stack;
- state_stack.push_back(st_top);
- qpdf_offset_t offset;
- bool done = false;
- int bad_count = 0;
- int good_count = 0;
- bool b_contents = false;
- bool is_null = false;
+ if (!tokenizer.nextToken(*input, object_description)) {
+ warn(tokenizer.getErrorMessage());
+ }
+
+ switch (tokenizer.getType()) {
+ case QPDFTokenizer::tt_eof:
+ if (content_stream) {
+ // In content stream mode, leave object uninitialized to indicate EOF
+ return {};
+ }
+ QTC::TC("qpdf", "QPDFParser eof in parse");
+ warn("unexpected EOF");
+ return {QPDF_Null::create()};
+
+ case QPDFTokenizer::tt_bad:
+ QTC::TC("qpdf", "QPDFParser bad token in parse");
+ return {QPDF_Null::create()};
+
+ case QPDFTokenizer::tt_brace_open:
+ case QPDFTokenizer::tt_brace_close:
+ QTC::TC("qpdf", "QPDFParser bad brace");
+ warn("treating unexpected brace token as null");
+ return {QPDF_Null::create()};
+
+ case QPDFTokenizer::tt_array_close:
+ QTC::TC("qpdf", "QPDFParser bad array close");
+ warn("treating unexpected array close token as null");
+ return {QPDF_Null::create()};
+
+ case QPDFTokenizer::tt_dict_close:
+ QTC::TC("qpdf", "QPDFParser bad dictionary close");
+ warn("unexpected dictionary close token");
+ return {QPDF_Null::create()};
+
+ case QPDFTokenizer::tt_array_open:
+ case QPDFTokenizer::tt_dict_open:
+ stack.clear();
+ stack.emplace_back(
+ input,
+ (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);
+ frame = &stack.back();
+ return parseRemainder(content_stream);
+
+ case QPDFTokenizer::tt_bool:
+ return withDescription<QPDF_Bool>(tokenizer.getValue() == "true");
+
+ case QPDFTokenizer::tt_null:
+ return {QPDF_Null::create()};
+
+ case QPDFTokenizer::tt_integer:
+ return withDescription<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
+
+ case QPDFTokenizer::tt_real:
+ return withDescription<QPDF_Real>(tokenizer.getValue());
+
+ case QPDFTokenizer::tt_name:
+ return withDescription<QPDF_Name>(tokenizer.getValue());
+
+ case QPDFTokenizer::tt_word:
+ {
+ auto const& value = tokenizer.getValue();
+ if (content_stream) {
+ return withDescription<QPDF_Operator>(value);
+ } else if (value == "endobj") {
+ // We just saw endobj without having read anything. Treat this as a null and do
+ // not move the input source's offset.
+ input->seek(input->getLastOffset(), SEEK_SET);
+ empty = true;
+ return {QPDF_Null::create()};
+ } else {
+ QTC::TC("qpdf", "QPDFParser treat word as string");
+ warn("unknown token while reading object; treating as string");
+ return withDescription<QPDF_String>(value);
+ }
+ }
+
+ case QPDFTokenizer::tt_string:
+ if (decrypter) {
+ std::string s{tokenizer.getValue()};
+ decrypter->decryptString(s);
+ return withDescription<QPDF_String>(s);
+ } else {
+ return withDescription<QPDF_String>(tokenizer.getValue());
+ }
+
+ default:
+ warn("treating unknown token type as null while reading object");
+ return {QPDF_Null::create()};
+ }
+}
- while (!done) {
- bool bad = false;
- bool indirect_ref = false;
- is_null = false;
- auto& frame = stack.back();
- auto& olist = frame.olist;
- parser_state_e state = state_stack.back();
- offset = frame.offset;
+QPDFObjectHandle
+QPDFParser::parseRemainder(bool content_stream)
+{
+ // This method must take care not to resolve any objects. Don't check the type of any object
+ // without first ensuring that it is a direct object. Otherwise, doing so may have the side
+ // effect of reading the object and changing the file pointer. If you do this, it will cause a
+ // logic error to be thrown from QPDF::inParse().
- object = nullptr;
- set_offset = false;
+ bad_count = 0;
+ bool b_contents = false;
+ while (true) {
if (!tokenizer.nextToken(*input, object_description)) {
warn(tokenizer.getErrorMessage());
}
+ ++good_count; // optimistically
+
+ if (int_count != 0) {
+ // Special handling of indirect references. Treat integer tokens as part of an indirect
+ // reference until proven otherwise.
+ if (tokenizer.getType() == QPDFTokenizer::tt_integer) {
+ if (++int_count > 2) {
+ // Process the oldest buffered integer.
+ addInt(int_count);
+ }
+ last_offset_buffer[int_count % 2] = input->getLastOffset();
+ int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str());
+ continue;
+
+ } else if (
+ int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word &&
+ tokenizer.getValue() == "R") {
+ if (context == nullptr) {
+ QTC::TC("qpdf", "QPDFParser indirect without context");
+ throw std::logic_error("QPDFParser::parse called without context on an object "
+ "with indirect references");
+ }
+ auto ref_og = QPDFObjGen(
+ QIntC::to_int(int_buffer[(int_count - 1) % 2]),
+ QIntC::to_int(int_buffer[(int_count) % 2]));
+ if (ref_og.isIndirect()) {
+ // This action has the desirable side effect of causing dangling references
+ // (references to indirect objects that don't appear in the PDF) in any parsed
+ // object to appear in the object cache.
+ add(std::move(context->getObject(ref_og).obj));
+ } else {
+ QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
+ addNull();
+ }
+ int_count = 0;
+ continue;
+
+ } else if (int_count > 0) {
+ // Process the buffered integers before processing the current token.
+ if (int_count > 1) {
+ addInt(int_count - 1);
+ }
+ addInt(int_count);
+ int_count = 0;
+ }
+ }
switch (tokenizer.getType()) {
case QPDFTokenizer::tt_eof:
- if (!content_stream) {
- QTC::TC("qpdf", "QPDFParser eof in parse");
- warn("unexpected EOF");
+ warn("parse error while reading object");
+ if (content_stream) {
+ // In content stream mode, leave object uninitialized to indicate EOF
+ return {};
}
- bad = true;
- state = st_eof;
- break;
+ QTC::TC("qpdf", "QPDFParser eof in parseRemainder");
+ warn("unexpected EOF");
+ return {QPDF_Null::create()};
case QPDFTokenizer::tt_bad:
- QTC::TC("qpdf", "QPDFParser bad token in parse");
- bad = true;
- is_null = true;
- break;
+ QTC::TC("qpdf", "QPDFParser bad token in parseRemainder");
+ if (tooManyBadTokens()) {
+ return {QPDF_Null::create()};
+ }
+ addNull();
+ continue;
case QPDFTokenizer::tt_brace_open:
case QPDFTokenizer::tt_brace_close:
- QTC::TC("qpdf", "QPDFParser bad brace");
+ QTC::TC("qpdf", "QPDFParser bad brace in parseRemainder");
warn("treating unexpected brace token as null");
- bad = true;
- is_null = true;
- break;
+ if (tooManyBadTokens()) {
+ return {QPDF_Null::create()};
+ }
+ addNull();
+ continue;
case QPDFTokenizer::tt_array_close:
- if (state == st_array) {
- state = st_stop;
+ if (frame->state == st_array) {
+ auto object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100);
+ setDescription(object, frame->offset - 1);
+ // The `offset` points just past the "[". Set the rewind offset to point to the
+ // beginning of "[". This has been explicitly tested with whitespace surrounding the
+ // array start delimiter. getLastOffset points to the array end token and therefore
+ // can't be used here.
+ if (stack.size() <= 1) {
+ return object;
+ }
+ stack.pop_back();
+ frame = &stack.back();
+ add(std::move(object));
} else {
- QTC::TC("qpdf", "QPDFParser bad array close");
+ QTC::TC("qpdf", "QPDFParser bad array close in parseRemainder");
warn("treating unexpected array close token as null");
- bad = true;
- is_null = true;
+ if (tooManyBadTokens()) {
+ return {QPDF_Null::create()};
+ }
+ addNull();
}
- break;
+ continue;
case QPDFTokenizer::tt_dict_close:
- if (state == st_dictionary) {
- state = st_stop;
+ if (frame->state <= st_dictionary_value) {
+ // Attempt to recover more or less gracefully from invalid dictionaries.
+ auto& dict = frame->dict;
+
+ if (frame->state == st_dictionary_value) {
+ QTC::TC("qpdf", "QPDFParser no val for last key");
+ warn(
+ frame->offset,
+ "dictionary ended prematurely; using null as value for last key");
+ dict[frame->key] = QPDF_Null::create();
+ }
+
+ if (!frame->olist.empty())
+ fixMissingKeys();
+
+ if (!frame->contents_string.empty() && dict.count("/Type") &&
+ dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
+ dict.count("/Contents") && dict["/Contents"].isString()) {
+ dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string);
+ dict["/Contents"].setParsedOffset(frame->contents_offset);
+ }
+ auto object = QPDF_Dictionary::create(std::move(dict));
+ setDescription(object, frame->offset - 2);
+ // The `offset` points just past the "<<". Set the rewind offset to point to the
+ // beginning of "<<". This has been explicitly tested with whitespace surrounding
+ // the dictionary start delimiter. getLastOffset points to the dictionary end token
+ // and therefore can't be used here.
+ if (stack.size() <= 1) {
+ return object;
+ }
+ stack.pop_back();
+ frame = &stack.back();
+ add(std::move(object));
} else {
- QTC::TC("qpdf", "QPDFParser bad dictionary close");
+ QTC::TC("qpdf", "QPDFParser bad dictionary close in parseRemainder");
warn("unexpected dictionary close token");
- bad = true;
- is_null = true;
+ if (tooManyBadTokens()) {
+ return {QPDF_Null::create()};
+ }
+ addNull();
}
- break;
+ continue;
case QPDFTokenizer::tt_array_open:
case QPDFTokenizer::tt_dict_open:
- if (stack.size() > 500) {
+ if (stack.size() > 499) {
QTC::TC("qpdf", "QPDFParser too deep");
warn("ignoring excessively deeply nested data structure");
- bad = true;
- is_null = true;
- state = st_top;
+ return {QPDF_Null::create()};
} else {
- state = st_start;
- state_stack.push_back(
- (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
- : st_dictionary);
b_contents = false;
- stack.emplace_back(input);
+ stack.emplace_back(
+ input,
+ (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
+ : st_dictionary_key);
+ frame = &stack.back();
+ continue;
}
- break;
case QPDFTokenizer::tt_bool:
- object = QPDF_Bool::create((tokenizer.getValue() == "true"));
- break;
+ addScalar<QPDF_Bool>(tokenizer.getValue() == "true");
+ continue;
case QPDFTokenizer::tt_null:
- is_null = true;
- ++frame.null_count;
-
- break;
+ addNull();
+ continue;
case QPDFTokenizer::tt_integer:
- object = QPDF_Integer::create(
- QUtil::string_to_ll(std::string(tokenizer.getValue()).c_str()));
- break;
+ if (!content_stream) {
+ // Buffer token in case it is part of an indirect reference.
+ last_offset_buffer[1] = input->getLastOffset();
+ int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str());
+ int_count = 1;
+ } else {
+ addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
+ }
+ continue;
case QPDFTokenizer::tt_real:
- object = QPDF_Real::create(tokenizer.getValue());
- break;
+ addScalar<QPDF_Real>(tokenizer.getValue());
+ continue;
case QPDFTokenizer::tt_name:
- {
- auto name = tokenizer.getValue();
- object = QPDF_Name::create(name);
-
- if (name == "/Contents") {
- b_contents = true;
- } else {
- b_contents = false;
- }
+ if (frame->state == st_dictionary_key) {
+ frame->key = tokenizer.getValue();
+ frame->state = st_dictionary_value;
+ b_contents = decrypter && frame->key == "/Contents";
+ continue;
+ } else {
+ addScalar<QPDF_Name>(tokenizer.getValue());
}
- break;
+ continue;
case QPDFTokenizer::tt_word:
- {
- auto value = tokenizer.getValue();
- auto size = olist.size();
- if (content_stream) {
- object = QPDF_Operator::create(value);
- } else if (
- value == "R" && state != st_top && size >= 2 && olist.back() &&
- olist.back()->getTypeCode() == ::ot_integer &&
- !olist.back()->getObjGen().isIndirect() && olist.at(size - 2) &&
- olist.at(size - 2)->getTypeCode() == ::ot_integer &&
- !olist.at(size - 2)->getObjGen().isIndirect()) {
- if (context == nullptr) {
- QTC::TC("qpdf", "QPDFParser indirect without context");
- throw std::logic_error("QPDFObjectHandle::parse called without context on "
- "an object with indirect references");
- }
- auto ref_og = QPDFObjGen(
- QPDFObjectHandle(olist.at(size - 2)).getIntValueAsInt(),
- QPDFObjectHandle(olist.back()).getIntValueAsInt());
- if (ref_og.isIndirect()) {
- // This action has the desirable side effect of causing dangling references
- // (references to indirect objects that don't appear in the PDF) in any
- // parsed object to appear in the object cache.
- object = context->getObject(ref_og).obj;
- indirect_ref = true;
- } else {
- QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
- is_null = true;
- }
- olist.pop_back();
- olist.pop_back();
- } else if ((value == "endobj") && (state == st_top)) {
- // We just saw endobj without having read anything. Treat this as a null and do
- // not move the input source's offset.
- is_null = true;
- input->seek(input->getLastOffset(), SEEK_SET);
- empty = true;
- } else {
- QTC::TC("qpdf", "QPDFParser treat word as string");
- warn("unknown token while reading object; treating as string");
- bad = true;
- object = QPDF_String::create(value);
+ if (content_stream) {
+ addScalar<QPDF_Operator>(tokenizer.getValue());
+ } else {
+ QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder");
+ warn("unknown token while reading object; treating as string");
+ if (tooManyBadTokens()) {
+ return {QPDF_Null::create()};
}
+ addScalar<QPDF_String>(tokenizer.getValue());
}
- break;
+ continue;
case QPDFTokenizer::tt_string:
{
- auto val = tokenizer.getValue();
+ auto const& val = tokenizer.getValue();
if (decrypter) {
if (b_contents) {
- frame.contents_string = val;
- frame.contents_offset = input->getLastOffset();
+ frame->contents_string = val;
+ frame->contents_offset = input->getLastOffset();
b_contents = false;
}
std::string s{val};
decrypter->decryptString(s);
- object = QPDF_String::create(s);
+ addScalar<QPDF_String>(s);
} else {
- object = QPDF_String::create(val);
+ addScalar<QPDF_String>(val);
}
}
-
- break;
+ continue;
default:
warn("treating unknown token type as null while reading object");
- bad = true;
- is_null = true;
- break;
- }
-
- if (object == nullptr && !is_null &&
- (!((state == st_start) || (state == st_stop) || (state == st_eof)))) {
- throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object");
- is_null = true;
- }
-
- if (bad) {
- ++bad_count;
- good_count = 0;
- } else {
- ++good_count;
- if (good_count > 3) {
- bad_count = 0;
+ if (tooManyBadTokens()) {
+ return {QPDF_Null::create()};
}
+ addNull();
}
- if (bad_count > 5) {
- // We had too many consecutive errors without enough intervening successful objects.
- // Give up.
- warn("too many errors; giving up on reading object");
- state = st_top;
- is_null = true;
- }
+ }
+}
- switch (state) {
- case st_eof:
- if (state_stack.size() > 1) {
- warn("parse error while reading object");
- }
- done = true;
- // In content stream mode, leave object uninitialized to indicate EOF
- if (!content_stream) {
- is_null = true;
- }
- break;
-
- case st_dictionary:
- case st_array:
- if (is_null) {
- object = null_oh;
- // No need to set description for direct nulls - they probably will become implicit.
- } else if (!indirect_ref) {
- setDescription(object, input->getLastOffset());
- }
- set_offset = true;
- olist.push_back(object);
- break;
+void
+QPDFParser::add(std::shared_ptr<QPDFObject>&& obj)
+{
+ if (frame->state != st_dictionary_value) {
+ // If state is st_dictionary_key then there is a missing key. Push onto olist for
+ // processing once the tt_dict_close token has been found.
+ frame->olist.emplace_back(std::move(obj));
+ } else {
+ if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) {
+ warnDuplicateKey();
+ }
+ frame->state = st_dictionary_key;
+ }
+}
- case st_top:
- done = true;
- break;
+void
+QPDFParser::addNull()
+{
+ const static ObjectPtr null_obj = QPDF_Null::create();
- case st_start:
- break;
+ if (frame->state != st_dictionary_value) {
+ // If state is st_dictionary_key then there is a missing key. Push onto olist for
+ // processing once the tt_dict_close token has been found.
+ frame->olist.emplace_back(null_obj);
+ } else {
+ if (auto res = frame->dict.insert_or_assign(frame->key, null_obj); !res.second) {
+ warnDuplicateKey();
+ }
+ frame->state = st_dictionary_key;
+ }
+ ++frame->null_count;
+}
- case st_stop:
- if ((state_stack.size() < 2) || (stack.size() < 2)) {
- throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with "
- "insufficient elements in stack");
- }
- parser_state_e old_state = state_stack.back();
- state_stack.pop_back();
- if (old_state == st_array) {
- object = QPDF_Array::create(std::move(olist), frame.null_count > 100);
- setDescription(object, offset - 1);
- // The `offset` points to the next of "[". Set the rewind offset to point to the
- // beginning of "[". This has been explicitly tested with whitespace surrounding the
- // array start delimiter. getLastOffset points to the array end token and therefore
- // can't be used here.
- set_offset = true;
- } else if (old_state == st_dictionary) {
- // Convert list to map. Alternating elements are keys. Attempt to recover more or
- // less gracefully from invalid dictionaries.
- std::set<std::string> names;
- for (auto& obj: olist) {
- if (obj) {
- if (obj->getTypeCode() == ::ot_name) {
- names.insert(obj->getStringValue());
- }
- }
- }
+void
+QPDFParser::addInt(int count)
+{
+ auto obj = QPDF_Integer::create(int_buffer[count % 2]);
+ obj->setDescription(context, description, last_offset_buffer[count % 2]);
+ add(std::move(obj));
+}
- std::map<std::string, QPDFObjectHandle> dict;
- int next_fake_key = 1;
- for (auto iter = olist.begin(); iter != olist.end();) {
- // Calculate key.
- std::string key;
- if (*iter && (*iter)->getTypeCode() == ::ot_name) {
- key = (*iter)->getStringValue();
- ++iter;
- } else {
- for (bool found_fake = false; !found_fake;) {
- key = "/QPDFFake" + std::to_string(next_fake_key++);
- found_fake = (names.count(key) == 0);
- QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
- }
- warn(
- offset,
- "expected dictionary key but found non-name object; inserting key " +
- key);
- }
- if (dict.count(key) > 0) {
- QTC::TC("qpdf", "QPDFParser duplicate dict key");
- warn(
- offset,
- "dictionary has duplicated key " + key +
- "; last occurrence overrides earlier ones");
- }
+template <typename T, typename... Args>
+void
+QPDFParser::addScalar(Args&&... args)
+{
+ auto obj = T::create(args...);
+ obj->setDescription(context, description, input->getLastOffset());
+ add(std::move(obj));
+}
- // Calculate value.
- std::shared_ptr<QPDFObject> val;
- if (iter != olist.end()) {
- val = *iter;
- ++iter;
- } else {
- QTC::TC("qpdf", "QPDFParser no val for last key");
- warn(
- offset,
- "dictionary ended prematurely; using null as value for last key");
- val = QPDF_Null::create();
- }
+template <typename T, typename... Args>
+QPDFObjectHandle
+QPDFParser::withDescription(Args&&... args)
+{
+ auto obj = T::create(args...);
+ obj->setDescription(context, description, start);
+ return {obj};
+}
- dict[std::move(key)] = std::move(val);
- }
- if (!frame.contents_string.empty() && dict.count("/Type") &&
- dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
- dict.count("/Contents") && dict["/Contents"].isString()) {
- dict["/Contents"] = QPDFObjectHandle::newString(frame.contents_string);
- dict["/Contents"].setParsedOffset(frame.contents_offset);
- }
- object = QPDF_Dictionary::create(std::move(dict));
- setDescription(object, offset - 2);
- // The `offset` points to the next of "<<". Set the rewind offset to point to the
- // beginning of "<<". This has been explicitly tested with whitespace surrounding
- // the dictionary start delimiter. getLastOffset points to the dictionary end token
- // and therefore can't be used here.
- set_offset = true;
- }
- stack.pop_back();
- if (state_stack.back() == st_top) {
- done = true;
- } else {
- stack.back().olist.push_back(object);
- }
- }
+void
+QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset)
+{
+ if (obj) {
+ obj->setDescription(context, description, parsed_offset);
}
+}
- if (is_null) {
- object = QPDF_Null::create();
+void
+QPDFParser::fixMissingKeys()
+{
+ std::set<std::string> names;
+ for (auto& obj: frame->olist) {
+ if (obj->getTypeCode() == ::ot_name) {
+ names.insert(obj->getStringValue());
+ }
}
- if (!set_offset) {
- setDescription(object, offset);
+ int next_fake_key = 1;
+ for (auto const& item: frame->olist) {
+ while (true) {
+ const std::string key = "/QPDFFake" + std::to_string(next_fake_key++);
+ const bool found_fake = frame->dict.count(key) == 0 && names.count(key) == 0;
+ QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
+ if (found_fake) {
+ warn(
+ frame->offset,
+ "expected dictionary key but found non-name object; inserting key " + key);
+ frame->dict[key] = item;
+ break;
+ }
+ }
}
- return object;
}
-void
-QPDFParser::setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset)
+bool
+QPDFParser::tooManyBadTokens()
{
- if (obj) {
- obj->setDescription(context, description, parsed_offset);
+ if (good_count <= 4) {
+ if (++bad_count > 5) {
+ warn("too many errors; giving up on reading object");
+ return true;
+ }
+ } else {
+ bad_count = 1;
}
+ good_count = 0;
+ return false;
}
void
@@ -427,6 +498,15 @@ QPDFParser::warn(QPDFExc const& e) const
}
void
+QPDFParser::warnDuplicateKey()
+{
+ QTC::TC("qpdf", "QPDFParser duplicate dict key");
+ warn(
+ frame->offset,
+ "dictionary has duplicated key " + frame->key + "; last occurrence overrides earlier ones");
+}
+
+void
QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
{
warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), object_description, offset, msg));
diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh
index 35f9f603..7f5f7804 100644
--- a/libqpdf/qpdf/QPDFParser.hh
+++ b/libqpdf/qpdf/QPDFParser.hh
@@ -31,11 +31,44 @@ class QPDFParser
QPDFObjectHandle parse(bool& empty, bool content_stream);
private:
- enum parser_state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array };
+ // Parser state. Note:
+ // state <= st_dictionary_value == (state == st_dictionary_key || state == st_dictionary_value)
+ enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array };
+ struct StackFrame
+ {
+ StackFrame(std::shared_ptr<InputSource> const& input, parser_state_e state) :
+ state(state),
+ offset(input->tell())
+ {
+ }
+
+ std::vector<std::shared_ptr<QPDFObject>> olist;
+ std::map<std::string, QPDFObjectHandle> dict;
+ parser_state_e state;
+ std::string key;
+ qpdf_offset_t offset;
+ std::string contents_string;
+ qpdf_offset_t contents_offset{-1};
+ int null_count{0};
+ };
+
+ QPDFObjectHandle parseRemainder(bool content_stream);
+ void add(std::shared_ptr<QPDFObject>&& obj);
+ void addNull();
+ void addInt(int count);
+ template <typename T, typename... Args>
+ void addScalar(Args&&... args);
+ bool tooManyBadTokens();
+ void warnDuplicateKey();
+ void fixMissingKeys();
void warn(qpdf_offset_t offset, std::string const& msg) const;
void warn(std::string const& msg) const;
void warn(QPDFExc const&) const;
+ template <typename T, typename... Args>
+ // Create a new scalar object complete with parsed offset and description.
+ // NB the offset includes any leading whitespace.
+ QPDFObjectHandle withDescription(Args&&... args);
void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
std::shared_ptr<InputSource> input;
std::string const& object_description;
@@ -43,6 +76,18 @@ class QPDFParser
QPDFObjectHandle::StringDecrypter* decrypter;
QPDF* context;
std::shared_ptr<QPDFValue::Description> description;
+ std::vector<StackFrame> stack;
+ StackFrame* frame;
+ // Number of recent bad tokens.
+ int bad_count = 0;
+ // Number of good tokens since last bad token. Irrelevant if bad_count == 0.
+ int good_count = 0;
+ // Start offset including any leading whitespace.
+ qpdf_offset_t start;
+ // Number of successive integer tokens.
+ int int_count = 0;
+ long long int_buffer[2]{0, 0};
+ qpdf_offset_t last_offset_buffer[2]{0, 0};
};
#endif // QPDFPARSER_HH
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index ec11c57b..cbb4ac1d 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -57,11 +57,14 @@ QPDF trailer lacks size 0
QPDF trailer size not integer 0
QPDF trailer prev not integer 0
QPDFParser bad brace 0
+QPDFParser bad brace in parseRemainder 0
QPDFParser bad array close 0
+QPDFParser bad array close in parseRemainder 0
QPDF stream without length 0
QPDF stream length not integer 0
QPDF missing endstream 0
QPDFParser bad dictionary close 0
+QPDFParser bad dictionary close in parseRemainder 0
QPDF can't find xref 0
QPDFTokenizer bad ) 0
QPDFTokenizer bad > 0
@@ -258,6 +261,7 @@ QPDFParser indirect with 0 objid 0
QPDF object id 0 0
QPDF recursion loop in resolve 0
QPDFParser treat word as string 0
+QPDFParser treat word as string in parseRemainder 0
QPDFParser found fake 1
QPDFParser no val for last key 0
QPDF resolve failure to null 0
@@ -289,7 +293,9 @@ QPDFObjectHandle coalesce called on stream 0
QPDFObjectHandle coalesce provide stream data 0
QPDF_Stream bad token at end during normalize 0
QPDFParser bad token in parse 0
+QPDFParser bad token in parseRemainder 0
QPDFParser eof in parse 0
+QPDFParser eof in parseRemainder 0
QPDFObjectHandle array bounds 0
QPDFObjectHandle boolean returning false 0
QPDFObjectHandle integer returning 0 0
diff --git a/qpdf/qtest/parsing.test b/qpdf/qtest/parsing.test
index a3b47f23..bd1a7c6b 100644
--- a/qpdf/qtest/parsing.test
+++ b/qpdf/qtest/parsing.test
@@ -17,7 +17,7 @@ my $td = new TestDriver('parsing');
my $n_tests = 17;
$td->runtest("parse objects from string",
- {$td->COMMAND => "test_driver 31 good1.qdf"},
+ {$td->COMMAND => "test_driver 31 bad39.qdf"},
{$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
$td->runtest("EOF terminating literal tokens",
diff --git a/qpdf/qtest/qpdf/bad16-recover.out b/qpdf/qtest/qpdf/bad16-recover.out
index adddb4f7..0bedd64d 100644
--- a/qpdf/qtest/qpdf/bad16-recover.out
+++ b/qpdf/qtest/qpdf/bad16-recover.out
@@ -1,14 +1,14 @@
WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
-WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
+WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf: file is damaged
WARNING: bad16.pdf (offset 712): expected trailer dictionary
WARNING: bad16.pdf: Attempting to reconstruct cross-reference table
WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
-WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
+WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
bad16.pdf: unable to find trailer dictionary while recovering damaged file
diff --git a/qpdf/qtest/qpdf/bad16.out b/qpdf/qtest/qpdf/bad16.out
index bcc37f35..ffba090a 100644
--- a/qpdf/qtest/qpdf/bad16.out
+++ b/qpdf/qtest/qpdf/bad16.out
@@ -1,6 +1,6 @@
WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
-WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
+WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
bad16.pdf (offset 712): expected trailer dictionary
diff --git a/qpdf/qtest/qpdf/bad36-recover.out b/qpdf/qtest/qpdf/bad36-recover.out
index ac05acd9..9aacd729 100644
--- a/qpdf/qtest/qpdf/bad36-recover.out
+++ b/qpdf/qtest/qpdf/bad36-recover.out
@@ -1,6 +1,6 @@
WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
-WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
+WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
/QTest is implicit
/QTest is direct and has type null (2)
/QTest is null
diff --git a/qpdf/qtest/qpdf/bad36.out b/qpdf/qtest/qpdf/bad36.out
index cee3c286..e60d8685 100644
--- a/qpdf/qtest/qpdf/bad36.out
+++ b/qpdf/qtest/qpdf/bad36.out
@@ -1,6 +1,6 @@
WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
-WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
+WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
/QTest is implicit
/QTest is direct and has type null (2)
/QTest is null
diff --git a/qpdf/qtest/qpdf/bad39.qdf b/qpdf/qtest/qpdf/bad39.qdf
new file mode 100644
index 00000000..1da316e6
--- /dev/null
+++ b/qpdf/qtest/qpdf/bad39.qdf
@@ -0,0 +1,102 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+ /Pages 2 0 R
+ /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+ /Count 1
+ /Kids [
+ 3 0 R
+ ]
+ /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+ /Contents 4 0 R
+ /MediaBox [
+ 0
+ 0
+ 612
+ 792
+ ]
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 6 0 R
+ >>
+ /ProcSet 7 0 R
+ >>
+ /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 4 0
+4 0 obj
+<<
+ /Length 5 0 R
+>>
+stream
+BT
+ /F1 24 Tf
+ 72 720 Td
+ (Potato) Tj
+ET
+endstream
+endobj
+
+5 0 obj
+44
+endobj
+
+%% Original object ID: 6 0
+6 0 obj
+<<
+ /BaseFont /Helvetica
+ /Encoding /WinAnsiEncoding
+ /Name /F1
+ /Subtype /Type1
+ /Type /Font
+>>
+endobj
+
+%% Original object ID: 5 0
+7 0 obj
+[
+ /PDF
+ /Text
+]
+endobj
+
+xref
+0 8
+0000000000 65535 f
+0000000052 00000 n
+0000000133 00000 n
+0000000242 00000 n
+0000000484 00000 n
+0000000583 00000 n
+0000000629 00000 n
+0000001113 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 8
+ /ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
+>>
+startxref
+809
+%%EOF
+7 0 obj
diff --git a/qpdf/qtest/qpdf/issue-335a.out b/qpdf/qtest/qpdf/issue-335a.out
index 456bc475..c5b64465 100644
--- a/qpdf/qtest/qpdf/issue-335a.out
+++ b/qpdf/qtest/qpdf/issue-335a.out
@@ -51,6 +51,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 134): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@@ -74,6 +75,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 164): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@@ -97,6 +99,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 231): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@@ -448,6 +451,7 @@ WARNING: issue-335a.pdf (trailer, offset 1168): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1328): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1329): name with stray # will not work with PDF >= 1.2
WARNING: issue-335a.pdf (trailer, offset 1332): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 1033): dictionary has duplicated key /L
WARNING: issue-335a.pdf (trailer, offset 1333): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1344): unexpected )
WARNING: issue-335a.pdf (trailer, offset 1428): unexpected )
diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out
index 2e09f6ad..de7b42e6 100644
--- a/qpdf/qtest/qpdf/parse-object.out
+++ b/qpdf/qtest/qpdf/parse-object.out
@@ -1,5 +1,11 @@
[ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ]
-logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references
+logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references
trailing data: parsed object (trailing test): trailing data found parsing object from string
WARNING: parsed object (offset 9): unknown token while reading object; treating as string
+WARNING: parsed object: treating unexpected brace token as null
+WARNING: parsed object: treating unexpected brace token as null
+WARNING: parsed object: unexpected dictionary close token
+WARNING: bad39.qdf (object 7 0, offset 1121): unexpected EOF
+WARNING: bad39.qdf (object 7 0, offset 1121): expected endobj
+WARNING: bad39.qdf (object 7 0, offset 1121): EOF after endobj
test 31 done
diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc
index 3c8ce755..2b8eb761 100644
--- a/qpdf/test_driver.cc
+++ b/qpdf/test_driver.cc
@@ -1195,6 +1195,13 @@ test_31(QPDF& pdf, char const* arg2)
// mistakenly parsed as an indirect object.
assert(QPDFObjectHandle::parse(&pdf, "[5 0 R 0 R /X]").unparse() == "[ 5 0 R 0 (R) /X ]");
assert(QPDFObjectHandle::parse(&pdf, "[1 0 R]", "indirect test").unparse() == "[ 1 0 R ]");
+ // TC:QPDFParser bad brace
+ assert(QPDFObjectHandle::parse(&pdf, "}").unparse() == "null");
+ assert(QPDFObjectHandle::parse(&pdf, "{").unparse() == "null");
+ // TC:QPDFParser bad dictionary close
+ assert(QPDFObjectHandle::parse(&pdf, ">>").unparse() == "null");
+ // TC:QPDFParser eof in parse
+ assert(QPDFObjectHandle::parse(&pdf, "[7 0 R]").getArrayItem(0).isNull());
}
static void