diff options
author | Jay Berkenbilt <jberkenbilt@users.noreply.github.com> | 2023-07-09 01:37:49 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-09 01:37:49 +0200 |
commit | 2c2436b23c683d0e64e47b1a5ca66c633558a6c6 (patch) | |
tree | 05811c65da7c0d650ba8e75d3686f50b1c72586e /libqpdf/QPDF.cc | |
parent | ddd78ac7c62d4494e7d6098f395f64bdf5950b00 (diff) | |
parent | ba3953f1bf20d5d0331f253e4da1e3e3ce10e895 (diff) | |
download | qpdf-2c2436b23c683d0e64e47b1a5ca66c633558a6c6.tar.zst |
Merge pull request #1004 from m-holger/ro
Split QPDF::readObject into readTrailer, readObject and readObjectInStream
Diffstat (limited to 'libqpdf/QPDF.cc')
-rw-r--r-- | libqpdf/QPDF.cc | 253 |
1 files changed, 143 insertions, 110 deletions
diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 9554027c..7edb76f9 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -567,7 +567,7 @@ QPDF::reconstruct_xref(QPDFExc& e) insertReconstructedXrefEntry(obj, token_start, gen); } } else if (!m->trailer.isInitialized() && t1.isWord("trailer")) { - QPDFObjectHandle t = readObject(m->file, "trailer", QPDFObjGen(), false); + QPDFObjectHandle t = readTrailer(); if (!t.isDictionary()) { // Oh well. It was worth a try. } else { @@ -855,7 +855,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) } // Set offset to previous xref table if any - QPDFObjectHandle cur_trailer = readObject(m->file, "trailer", QPDFObjGen(), false); + QPDFObjectHandle cur_trailer = readTrailer(); if (!cur_trailer.isDictionary()) { QTC::TC("qpdf", "QPDF missing trailer"); throw damagedPDF("", "expected trailer dictionary"); @@ -1268,124 +1268,160 @@ QPDF::setLastObjectDescription(std::string const& description, QPDFObjGen const& } QPDFObjectHandle -QPDF::readObject( - std::shared_ptr<InputSource> input, - std::string const& description, - QPDFObjGen const& og, - bool in_object_stream) +QPDF::readTrailer() { - setLastObjectDescription(description, og); - qpdf_offset_t offset = input->tell(); - + qpdf_offset_t offset = m->file->tell(); bool empty = false; - std::shared_ptr<StringDecrypter> decrypter_ph; - StringDecrypter* decrypter = nullptr; - if (m->encp->encrypted && (!in_object_stream)) { - decrypter_ph = std::make_shared<StringDecrypter>(this, og); - decrypter = decrypter_ph.get(); + auto object = QPDFParser(m->file, "trailer", m->tokenizer, nullptr, this).parse(empty, false); + if (empty) { + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in + // actual PDF files and Adobe Reader appears to ignore them. + warn(damagedPDF("trailer", "empty object treated as null")); + } else if (object.isDictionary() && readToken(m->file).isWord("stream")) { + warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer")); } - auto object = QPDFParser(input, m->last_object_description, m->tokenizer, decrypter, this) + // Override last_offset so that it points to the beginning of the object we just read + m->file->setLastOffset(offset); + return object; +} + + +QPDFObjectHandle +QPDF::readObject(std::string const& description, QPDFObjGen og) +{ + setLastObjectDescription(description, og); + qpdf_offset_t offset = m->file->tell(); + bool empty = false; + + StringDecrypter decrypter{this, og}; + StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr; + auto object = QPDFParser(m->file, m->last_object_description, m->tokenizer, decrypter_ptr, this) .parse(empty, false); if (empty) { // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in // actual PDF files and Adobe Reader appears to ignore them. - warn(damagedPDF(input, input->getLastOffset(), "empty object treated as null")); - } else if (object.isDictionary() && (!in_object_stream)) { - // check for stream - qpdf_offset_t cur_offset = input->tell(); - if (readToken(input).isWord("stream")) { - // The PDF specification states that the word "stream" should be followed by either a - // carriage return and a newline or by a newline alone. It specifically disallowed - // following it by a carriage return alone since, in that case, there would be no way to - // tell whether the NL in a CR NL sequence was part of the stream data. However, some - // readers, including Adobe reader, accept a carriage return by itself when followed by - // a non-newline character, so that's what we do here. We have also seen files that have - // extraneous whitespace between the stream keyword and the newline. - bool done = false; - while (!done) { - done = true; - char ch; - if (input->read(&ch, 1) == 0) { - // A premature EOF here will result in some other problem that will get reported - // at another time. - } else if (ch == '\n') { - // ready to read stream data - QTC::TC("qpdf", "QPDF stream with NL only"); - } else if (ch == '\r') { - // Read another character - if (input->read(&ch, 1) != 0) { - if (ch == '\n') { - // Ready to read stream data - QTC::TC("qpdf", "QPDF stream with CRNL"); - } else { - // Treat the \r by itself as the whitespace after endstream and start - // reading stream data in spite of not having seen a newline. - QTC::TC("qpdf", "QPDF stream with CR only"); - input->unreadCh(ch); - warn(damagedPDF( - input, - input->tell(), - "stream keyword followed by carriage return " - "only")); - } - } - } else if (QUtil::is_space(ch)) { - warn(damagedPDF( - input, input->tell(), "stream keyword followed by extraneous whitespace")); - done = false; - } else { - QTC::TC("qpdf", "QPDF stream without newline"); - input->unreadCh(ch); - warn(damagedPDF( - input, - input->tell(), - "stream keyword not followed by proper line " - "terminator")); - } - } + warn(damagedPDF(m->file, m->file->getLastOffset(), "empty object treated as null")); + return object; + } + auto token = readToken(m->file); + if (object.isDictionary() && token.isWord("stream")) { + readStream(object, og, offset); + token = readToken(m->file); + } + if (!token.isWord("endobj")) { + QTC::TC("qpdf", "QPDF err expected endobj"); + warn(damagedPDF("expected endobj")); + } + return object; +} - // Must get offset before accessing any additional objects since resolving a previously - // unresolved indirect object will change file position. - qpdf_offset_t stream_offset = input->tell(); - size_t length = 0; +// After reading stream dictionary and stream keyword, read rest of stream. +void +QPDF::readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset) +{ + validateStreamLineEnd(object, og, offset); - try { - auto length_obj = object.getKey("/Length"); + // Must get offset before accessing any additional objects since resolving a previously + // unresolved indirect object will change file position. + qpdf_offset_t stream_offset = m->file->tell(); + size_t length = 0; - if (!length_obj.isInteger()) { - if (length_obj.isNull()) { - QTC::TC("qpdf", "QPDF stream without length"); - throw damagedPDF(input, offset, "stream dictionary lacks /Length key"); - } - QTC::TC("qpdf", "QPDF stream length not integer"); - throw damagedPDF( - input, offset, "/Length key in stream dictionary is not an integer"); - } + try { + auto length_obj = object.getKey("/Length"); - length = toS(length_obj.getUIntValue()); - // Seek in two steps to avoid potential integer overflow - input->seek(stream_offset, SEEK_SET); - input->seek(toO(length), SEEK_CUR); - if (!readToken(input).isWord("endstream")) { - QTC::TC("qpdf", "QPDF missing endstream"); - throw damagedPDF(input, input->getLastOffset(), "expected endstream"); - } - } catch (QPDFExc& e) { - if (m->attempt_recovery) { - warn(e); - length = recoverStreamLength(input, og, stream_offset); + if (!length_obj.isInteger()) { + if (length_obj.isNull()) { + QTC::TC("qpdf", "QPDF stream without length"); + throw damagedPDF(offset, "stream dictionary lacks /Length key"); + } + QTC::TC("qpdf", "QPDF stream length not integer"); + throw damagedPDF(offset, "/Length key in stream dictionary is not an integer"); + } + + length = toS(length_obj.getUIntValue()); + // Seek in two steps to avoid potential integer overflow + m->file->seek(stream_offset, SEEK_SET); + m->file->seek(toO(length), SEEK_CUR); + if (!readToken(m->file).isWord("endstream")) { + QTC::TC("qpdf", "QPDF missing endstream"); + throw damagedPDF("expected endstream"); + } + } catch (QPDFExc& e) { + if (m->attempt_recovery) { + warn(e); + length = recoverStreamLength(m->file, og, stream_offset); + } else { + throw; + } + } + object = newIndirect(og, QPDF_Stream::create(this, og, object, stream_offset, length)); +} + +void +QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset) +{ + // The PDF specification states that the word "stream" should be followed by either a carriage + // return and a newline or by a newline alone. It specifically disallowed following it by a + // carriage return alone since, in that case, there would be no way to tell whether the NL in a + // CR NL sequence was part of the stream data. However, some readers, including Adobe reader, + // accept a carriage return by itself when followed by a non-newline character, so that's what + // we do here. We have also seen files that have extraneous whitespace between the stream + // keyword and the newline. + while (true) { + char ch; + if (m->file->read(&ch, 1) == 0) { + // A premature EOF here will result in some other problem that will get reported at + // another time. + return; + } + if (ch == '\n') { + // ready to read stream data + QTC::TC("qpdf", "QPDF stream with NL only"); + return; + } + if (ch == '\r') { + // Read another character + if (m->file->read(&ch, 1) != 0) { + if (ch == '\n') { + // Ready to read stream data + QTC::TC("qpdf", "QPDF stream with CRNL"); } else { - throw; + // Treat the \r by itself as the whitespace after endstream and start reading + // stream data in spite of not having seen a newline. + QTC::TC("qpdf", "QPDF stream with CR only"); + m->file->unreadCh(ch); + warn(damagedPDF( + m->file->tell(), "stream keyword followed by carriage return only")); } } - object = newIndirect(og, QPDF_Stream::create(this, og, object, stream_offset, length)); - } else { - input->seek(cur_offset, SEEK_SET); + return; } + if (!QUtil::is_space(ch)) { + QTC::TC("qpdf", "QPDF stream without newline"); + m->file->unreadCh(ch); + warn(damagedPDF( + m->file->tell(), "stream keyword not followed by proper line terminator")); + return; + } + warn(damagedPDF(m->file->tell(), "stream keyword followed by extraneous whitespace")); } +} - // Override last_offset so that it points to the beginning of the object we just read - input->setLastOffset(offset); +QPDFObjectHandle +QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj) +{ + m->last_object_description.erase(7); // last_object_description starts with "object " + m->last_object_description += std::to_string(obj); + m->last_object_description += " 0"; + + bool empty = false; + auto object = QPDFParser(input, m->last_object_description, m->tokenizer, nullptr, this) + .parse(empty, false); + if (empty) { + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in + // actual PDF files and Adobe Reader appears to ignore them. + warn(damagedPDF(input, input->getLastOffset(), "empty object treated as null")); + } return object; } @@ -1559,12 +1595,7 @@ QPDF::readObjectAtOffset( } } - QPDFObjectHandle oh = readObject(m->file, description, og, false); - - if (!readToken(m->file).isWord("endobj")) { - QTC::TC("qpdf", "QPDF err expected endobj"); - warn(damagedPDF("expected endobj")); - } + QPDFObjectHandle oh = readObject(description, og); if (isUnresolved(og)) { // Store the object in the cache here so it gets cached whether we first know the offset or @@ -1744,13 +1775,15 @@ QPDF::resolveObjectsInStream(int obj_stream_number) // found here in the cache. Remember that some objects stored here might have been overridden // by new objects appended to the file, so it is necessary to recheck the xref table and only // cache what would actually be resolved here. + m->last_object_description.clear(); + m->last_object_description += "object "; for (auto const& iter: offsets) { QPDFObjGen og(iter.first, 0); QPDFXRefEntry const& entry = m->xref_table[og]; if ((entry.getType() == 2) && (entry.getObjStreamNumber() == obj_stream_number)) { int offset = iter.second; input->seek(offset, SEEK_SET); - QPDFObjectHandle oh = readObject(input, "", og, true); + QPDFObjectHandle oh = readObjectInStream(input, iter.first); updateCache(og, oh.getObj(), end_before_space, end_after_space); } else { QTC::TC("qpdf", "QPDF not caching overridden objstm object"); |