From 6bbea4baa0c06b39b1b71f1aa6fc276789296556 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 21 Jul 2012 09:00:06 -0400 Subject: Implement QPDFObjectHandle::parse Move object parsing code from QPDF to QPDFObjectHandle and parameterize the parts of it that are specific to a QPDF object. Provide a version that can't handle indirect objects and that can be called on an arbitrary string. A side effect of this change is that the offset used when reporting invalid stream length has changed, but since the new value seems like a better value than the old one, the test suite has been updated rather than making the code backward compatible. This only effects the offset reported for invalid streams that lack /Length or have an invalid /Length key. Updated some test code and exmaples to use QPDFObjectHandle::parse. Supporting changes include adding a BufferInputSource constructor that takes a string. --- libqpdf/QPDFObjectHandle.cc | 269 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 269 insertions(+) (limited to 'libqpdf/QPDFObjectHandle.cc') diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 642dee69..6bb182e8 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -11,12 +11,15 @@ #include #include #include +#include +#include #include #include #include #include +#include QPDFObjectHandle::QPDFObjectHandle() : initialized(false), @@ -398,6 +401,13 @@ QPDFObjectHandle::getDict() return dynamic_cast(obj.getPointer())->getDict(); } +void +QPDFObjectHandle::replaceDict(QPDFObjectHandle new_dict) +{ + assertStream(); + dynamic_cast(obj.getPointer())->replaceDict(new_dict); +} + PointerHolder QPDFObjectHandle::getStreamData() { @@ -598,6 +608,265 @@ QPDFObjectHandle::unparseResolved() return this->obj->unparse(); } +QPDFObjectHandle +QPDFObjectHandle::parse(std::string const& object_str, + std::string const& object_description) +{ + PointerHolder input = + new BufferInputSource("parsed object", object_str); + QPDFTokenizer tokenizer; + bool empty = false; + QPDFObjectHandle result = + parse(input, object_description, tokenizer, empty, 0, 0); + size_t offset = (size_t) input->tell(); + while (offset < object_str.length()) + { + if (! isspace(object_str[offset])) + { + QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "trailing data found parsing object from string"); + } + ++offset; + } + return result; +} + +QPDFObjectHandle +QPDFObjectHandle::parse(PointerHolder input, + std::string const& object_description, + QPDFTokenizer& tokenizer, bool& empty, + StringDecrypter* decrypter, QPDF* context) +{ + return parseInternal(input, object_description, tokenizer, empty, + decrypter, context, false, false); +} + +QPDFObjectHandle +QPDFObjectHandle::parseInternal(PointerHolder input, + std::string const& object_description, + QPDFTokenizer& tokenizer, bool& empty, + StringDecrypter* decrypter, QPDF* context, + bool in_array, bool in_dictionary) +{ + empty = false; + if (in_dictionary && in_array) + { + // Although dictionaries and arrays arbitrarily nest, these + // variables indicate what is at the top of the stack right + // now, so they can, by definition, never both be true. + throw std::logic_error( + "INTERNAL ERROR: parseInternal: in_dict && in_array"); + } + + QPDFObjectHandle object; + + qpdf_offset_t offset = input->tell(); + std::vector olist; + bool done = false; + while (! done) + { + object = QPDFObjectHandle(); + + QPDFTokenizer::Token token = + tokenizer.readToken(input, object_description); + + switch (token.getType()) + { + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: + // Don't know what to do with these for now + QTC::TC("qpdf", "QPDFObjectHandle bad brace"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unexpected brace token"); + break; + + case QPDFTokenizer::tt_array_close: + if (in_array) + { + done = true; + } + else + { + QTC::TC("qpdf", "QPDFObjectHandle bad array close"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unexpected array close token"); + } + break; + + case QPDFTokenizer::tt_dict_close: + if (in_dictionary) + { + done = true; + } + else + { + QTC::TC("qpdf", "QPDFObjectHandle bad dictionary close"); + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unexpected dictionary close token"); + } + break; + + case QPDFTokenizer::tt_array_open: + object = parseInternal( + input, object_description, tokenizer, empty, + decrypter, context, true, false); + break; + + case QPDFTokenizer::tt_dict_open: + object = parseInternal( + input, object_description, tokenizer, empty, + decrypter, context, false, true); + break; + + case QPDFTokenizer::tt_bool: + object = newBool((token.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + object = newNull(); + break; + + case QPDFTokenizer::tt_integer: + object = newInteger(QUtil::string_to_ll(token.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = newReal(token.getValue()); + break; + + case QPDFTokenizer::tt_name: + object = newName(token.getValue()); + break; + + case QPDFTokenizer::tt_word: + { + std::string const& value = token.getValue(); + if ((value == "R") && (in_array || in_dictionary) && + (olist.size() >= 2) && + (olist[olist.size() - 1].isInteger()) && + (olist[olist.size() - 2].isInteger())) + { + if (context == 0) + { + QTC::TC("qpdf", "QPDFObjectHandle indirect without context"); + throw std::logic_error( + "QPDFObjectHandle::parse called without context" + " on an object with indirect references"); + } + // Try to resolve indirect objects + object = newIndirect( + context, + olist[olist.size() - 2].getIntValue(), + olist[olist.size() - 1].getIntValue()); + olist.pop_back(); + olist.pop_back(); + } + else if ((value == "endobj") && + (! (in_array || in_dictionary))) + { + // We just saw endobj without having read + // anything. Treat this as a null and do not move + // the input source's offset. + object = newNull(); + input->seek(input->getLastOffset(), SEEK_SET); + empty = true; + } + else + { + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unknown token while reading object (" + + value + ")"); + } + } + break; + + case QPDFTokenizer::tt_string: + { + std::string val = token.getValue(); + if (decrypter) + { + decrypter->decryptString(val); + } + object = QPDFObjectHandle::newString(val); + } + + break; + + default: + throw QPDFExc(qpdf_e_damaged_pdf, input->getName(), + object_description, + input->getLastOffset(), + "unknown token type while reading object"); + break; + } + + if (in_dictionary || in_array) + { + if (! done) + { + olist.push_back(object); + } + } + else if (! object.isInitialized()) + { + throw std::logic_error( + "INTERNAL ERROR: uninitialized object (token = " + + QUtil::int_to_string(token.getType()) + + ", " + token.getValue() + ")"); + } + else + { + done = true; + } + } + + if (in_array) + { + object = newArray(olist); + } + else if (in_dictionary) + { + // Convert list to map. Alternating elements are keys. + std::map dict; + if (olist.size() % 2) + { + QTC::TC("qpdf", "QPDFObjectHandle dictionary odd number of elements"); + throw QPDFExc( + qpdf_e_damaged_pdf, input->getName(), + object_description, input->getLastOffset(), + "dictionary ending here has an odd number of elements"); + } + for (unsigned int i = 0; i < olist.size(); i += 2) + { + QPDFObjectHandle key_obj = olist[i]; + QPDFObjectHandle val = olist[i + 1]; + if (! key_obj.isName()) + { + throw QPDFExc( + qpdf_e_damaged_pdf, + input->getName(), object_description, offset, + std::string("dictionary key not name (") + + key_obj.unparse() + ")"); + } + dict[key_obj.getName()] = val; + } + object = newDictionary(dict); + } + + return object; +} + QPDFObjectHandle QPDFObjectHandle::newIndirect(QPDF* qpdf, int objid, int generation) { -- cgit v1.2.3-54-g00ecf