diff options
Diffstat (limited to 'libqpdf/QPDFParser.cc')
-rw-r--r-- | libqpdf/QPDFParser.cc | 453 |
1 files changed, 453 insertions, 0 deletions
diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc new file mode 100644 index 00000000..9aa1f426 --- /dev/null +++ b/libqpdf/QPDFParser.cc @@ -0,0 +1,453 @@ +#include <qpdf/QPDFParser.hh> + +#include <qpdf/QPDF.hh> +#include <qpdf/QPDFObjGen.hh> +#include <qpdf/QPDFObjectHandle.hh> +#include <qpdf/QTC.hh> +#include <qpdf/QUtil.hh> + +namespace +{ + struct StackFrame + { + StackFrame(std::shared_ptr<InputSource> input) : + offset(input->tell()), + contents_string(""), + contents_offset(-1) + { + } + + std::vector<QPDFObjectHandle> olist; + qpdf_offset_t offset; + std::string contents_string; + qpdf_offset_t contents_offset; + }; +} // namespace + +QPDFObjectHandle +QPDFParser::parse(bool& empty, bool content_stream) +{ + // This method must take care not to resolve any objects. Don't + // check the type of any object without first ensuring that it is + // a direct object. Otherwise, doing so may have the side effect + // of reading the object and changing the file pointer. If you do + // this, it will cause a logic error to be thrown from + // QPDF::inParse(). + + QPDF::ParseGuard pg(context); + + empty = false; + + QPDFObjectHandle object; + bool set_offset = false; + + std::vector<StackFrame> stack; + stack.push_back(StackFrame(input)); + std::vector<parser_state_e> state_stack; + state_stack.push_back(st_top); + qpdf_offset_t offset; + bool done = false; + int bad_count = 0; + int good_count = 0; + bool b_contents = false; + bool is_null = false; + auto null_oh = QPDFObjectHandle::newNull(); + + while (!done) { + bool bad = false; + bool indirect_ref = false; + is_null = false; + auto& frame = stack.back(); + auto& olist = frame.olist; + parser_state_e state = state_stack.back(); + offset = frame.offset; + + object = QPDFObjectHandle(); + set_offset = false; + + QPDFTokenizer::Token token = + tokenizer.readToken(input, object_description, true); + std::string const& token_error_message = token.getErrorMessage(); + if (!token_error_message.empty()) { + // Tokens other than tt_bad can still generate warnings. + warn(token_error_message); + } + + switch (token.getType()) { + case QPDFTokenizer::tt_eof: + if (!content_stream) { + QTC::TC("qpdf", "QPDFParser eof in parse"); + warn("unexpected EOF"); + } + bad = true; + state = st_eof; + break; + + case QPDFTokenizer::tt_bad: + QTC::TC("qpdf", "QPDFParser bad token in parse"); + bad = true; + is_null = true; + break; + + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: + QTC::TC("qpdf", "QPDFParser bad brace"); + warn("treating unexpected brace token as null"); + bad = true; + is_null = true; + break; + + case QPDFTokenizer::tt_array_close: + if (state == st_array) { + state = st_stop; + } else { + QTC::TC("qpdf", "QPDFParser bad array close"); + warn("treating unexpected array close token as null"); + bad = true; + is_null = true; + } + break; + + case QPDFTokenizer::tt_dict_close: + if (state == st_dictionary) { + state = st_stop; + } else { + QTC::TC("qpdf", "QPDFParser bad dictionary close"); + warn("unexpected dictionary close token"); + bad = true; + is_null = true; + } + break; + + case QPDFTokenizer::tt_array_open: + case QPDFTokenizer::tt_dict_open: + if (stack.size() > 500) { + QTC::TC("qpdf", "QPDFParser too deep"); + warn("ignoring excessively deeply nested data structure"); + bad = true; + is_null = true; + state = st_top; + } else { + state = st_start; + state_stack.push_back( + (token.getType() == QPDFTokenizer::tt_array_open) + ? st_array + : st_dictionary); + b_contents = false; + stack.push_back(StackFrame(input)); + } + break; + + case QPDFTokenizer::tt_bool: + object = QPDFObjectHandle::newBool((token.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + is_null = true; + break; + + case QPDFTokenizer::tt_integer: + object = QPDFObjectHandle::newInteger( + QUtil::string_to_ll(token.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = QPDFObjectHandle::newReal(token.getValue()); + break; + + case QPDFTokenizer::tt_name: + { + std::string name = token.getValue(); + object = QPDFObjectHandle::newName(name); + + if (name == "/Contents") { + b_contents = true; + } else { + b_contents = false; + } + } + break; + + case QPDFTokenizer::tt_word: + { + std::string const& value = token.getValue(); + auto size = olist.size(); + if (content_stream) { + object = QPDFObjectHandle::newOperator(value); + } else if ( + (value == "R") && (state != st_top) && (size >= 2) && + (!olist.back().isIndirect()) && + (olist.back().isInteger()) && + (!olist.at(size - 2).isIndirect()) && + (olist.at(size - 2).isInteger())) { + if (context == nullptr) { + QTC::TC("qpdf", "QPDFParser indirect without context"); + throw std::logic_error( + "QPDFObjectHandle::parse called without context" + " on an object with indirect references"); + } + auto ref_og = QPDFObjGen( + olist.at(size - 2).getIntValueAsInt(), + olist.back().getIntValueAsInt()); + if (ref_og.isIndirect()) { + object = context->getObject(ref_og); + indirect_ref = true; + } else { + QTC::TC("qpdf", "QPDFParser indirect with 0 objid"); + is_null = true; + } + olist.pop_back(); + olist.pop_back(); + } else if ((value == "endobj") && (state == st_top)) { + // We just saw endobj without having read + // anything. Treat this as a null and do not move + // the input source's offset. + is_null = true; + input->seek(input->getLastOffset(), SEEK_SET); + empty = true; + } else { + QTC::TC("qpdf", "QPDFParser treat word as string"); + warn("unknown token while reading object;" + " treating as string"); + bad = true; + object = QPDFObjectHandle::newString(value); + } + } + break; + + case QPDFTokenizer::tt_string: + { + std::string val = token.getValue(); + if (decrypter) { + if (b_contents) { + frame.contents_string = val; + frame.contents_offset = input->getLastOffset(); + b_contents = false; + } + decrypter->decryptString(val); + } + object = QPDFObjectHandle::newString(val); + } + + break; + + default: + warn("treating unknown token type as null while " + "reading object"); + bad = true; + is_null = true; + break; + } + + if (!object.isInitialized() && !is_null && + (!((state == st_start) || (state == st_stop) || + (state == st_eof)))) { + throw std::logic_error("QPDFObjectHandle::parseInternal: " + "unexpected uninitialized object"); + is_null = true; + } + + if (bad) { + ++bad_count; + good_count = 0; + } else { + ++good_count; + if (good_count > 3) { + bad_count = 0; + } + } + if (bad_count > 5) { + // We had too many consecutive errors without enough + // intervening successful objects. Give up. + warn("too many errors; giving up on reading object"); + state = st_top; + is_null = true; + } + + switch (state) { + case st_eof: + if (state_stack.size() > 1) { + warn("parse error while reading object"); + } + done = true; + // In content stream mode, leave object uninitialized to + // indicate EOF + if (!content_stream) { + is_null = true; + } + break; + + case st_dictionary: + case st_array: + if (!indirect_ref && !object.isDirectNull()) { + // No need to set description for direct nulls - they will + // become implicit. + setDescriptionFromInput(object, input->getLastOffset()); + object.setParsedOffset(input->getLastOffset()); + } + set_offset = true; + olist.push_back(is_null ? null_oh : object); + break; + + case st_top: + done = true; + break; + + case st_start: + break; + + case st_stop: + if ((state_stack.size() < 2) || (stack.size() < 2)) { + throw std::logic_error( + "QPDFObjectHandle::parseInternal: st_stop encountered" + " with insufficient elements in stack"); + } + parser_state_e old_state = state_stack.back(); + state_stack.pop_back(); + if (old_state == st_array) { + object = QPDFObjectHandle::newArray(olist); + setDescriptionFromInput(object, offset); + // The `offset` points to the next of "[". Set the rewind + // offset to point to the beginning of "[". This has been + // explicitly tested with whitespace surrounding the array start + // delimiter. getLastOffset points to the array end token and + // therefore can't be used here. + object.setParsedOffset(offset - 1); + set_offset = true; + } else if (old_state == st_dictionary) { + // Convert list to map. Alternating elements are keys. Attempt + // to recover more or less gracefully from invalid dictionaries. + std::set<std::string> names; + size_t n_elements = olist.size(); + for (size_t i = 0; i < n_elements; ++i) { + QPDFObjectHandle oh = olist.at(i); + if ((!oh.isIndirect()) && oh.isName()) { + names.insert(oh.getName()); + } + } + + std::map<std::string, QPDFObjectHandle> dict; + int next_fake_key = 1; + for (unsigned int i = 0; i < n_elements; ++i) { + QPDFObjectHandle key_obj = olist.at(i); + QPDFObjectHandle val; + if (key_obj.isIndirect() || (!key_obj.isName())) { + bool found_fake = false; + std::string candidate; + while (!found_fake) { + candidate = "/QPDFFake" + + QUtil::int_to_string(next_fake_key++); + found_fake = (names.count(candidate) == 0); + QTC::TC( + "qpdf", + "QPDFParser found fake", + (found_fake ? 0 : 1)); + } + warn( + offset, + "expected dictionary key but found" + " non-name object; inserting key " + + candidate); + val = key_obj; + key_obj = QPDFObjectHandle::newName(candidate); + } else if (i + 1 >= olist.size()) { + QTC::TC("qpdf", "QPDFParser no val for last key"); + warn( + offset, + "dictionary ended prematurely; " + "using null as value for last key"); + val = QPDFObjectHandle::newNull(); + setDescriptionFromInput(val, offset); + } else { + val = olist.at(++i); + } + std::string key = key_obj.getName(); + if (dict.count(key) > 0) { + QTC::TC("qpdf", "QPDFParser duplicate dict key"); + warn( + offset, + "dictionary has duplicated key " + key + + "; last occurrence overrides earlier " + "ones"); + } + dict[key] = val; + } + if (!frame.contents_string.empty() && dict.count("/Type") && + dict["/Type"].isNameAndEquals("/Sig") && + dict.count("/ByteRange") && dict.count("/Contents") && + dict["/Contents"].isString()) { + dict["/Contents"] = + QPDFObjectHandle::newString(frame.contents_string); + dict["/Contents"].setParsedOffset(frame.contents_offset); + } + object = QPDFObjectHandle::newDictionary(dict); + setDescriptionFromInput(object, offset); + // The `offset` points to the next of "<<". Set the rewind + // offset to point to the beginning of "<<". This has been + // explicitly tested with whitespace surrounding the dictionary + // start delimiter. getLastOffset points to the dictionary end + // token and therefore can't be used here. + object.setParsedOffset(offset - 2); + set_offset = true; + } + stack.pop_back(); + if (state_stack.back() == st_top) { + done = true; + } else { + stack.back().olist.push_back(is_null ? null_oh : object); + } + } + } + + if (is_null) { + object = QPDFObjectHandle::newNull(); + } + if (!set_offset) { + setDescriptionFromInput(object, offset); + object.setParsedOffset(offset); + } + return object; +} + +void +QPDFParser::setDescriptionFromInput( + QPDFObjectHandle oh, qpdf_offset_t offset) const +{ + oh.setObjectDescription( + context, + (input->getName() + ", " + object_description + " at offset " + + QUtil::int_to_string(offset))); +} + +void +QPDFParser::warn(QPDF* qpdf, QPDFExc const& e) +{ + // If parsing on behalf of a QPDF object and want to give a + // warning, we can warn through the object. If parsing for some + // other reason, such as an explicit creation of an object from a + // string, then just throw the exception. + if (qpdf) { + qpdf->warn(e); + } else { + throw e; + } +} + +void +QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const +{ + warn( + context, + QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), + object_description, + offset, + msg)); +} + +void +QPDFParser::warn(std::string const& msg) const +{ + warn(input->getLastOffset(), msg); +} |