From 698a70e6a84cf7c0db667e9d9e021b4c34c85a3e Mon Sep 17 00:00:00 2001 From: m-holger Date: Wed, 24 May 2023 16:28:17 +0100 Subject: Code tidy - reflow comments and strings --- libqpdf/QPDF.cc | 490 +++++++++++++++++++++++--------------------------------- 1 file changed, 200 insertions(+), 290 deletions(-) (limited to 'libqpdf/QPDF.cc') diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 806a8cb2..64ff4715 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -32,8 +32,8 @@ #include #include -// This must be a fixed value. This API returns a const reference to -// it, and the C API relies on its being static as well. +// This must be a fixed value. This API returns a const reference to it, and the C API relies on its +// being static as well. std::string const QPDF::qpdf_version(QPDF_VERSION); static char const* EMPTY_PDF = ( @@ -212,33 +212,26 @@ QPDF::QPDF() : m(new Members()) { m->tokenizer.allowEOF(); - // Generate a unique ID. It just has to be unique among all QPDF - // objects allocated throughout the lifetime of this running - // application. + // Generate a unique ID. It just has to be unique among all QPDF objects allocated throughout + // the lifetime of this running application. static std::atomic unique_id{0}; m->unique_id = unique_id.fetch_add(1ULL); } QPDF::~QPDF() { - // If two objects are mutually referential (through each object - // having an array or dictionary that contains an indirect - // reference to the other), the circular references in the - // std::shared_ptr objects will prevent the objects from being - // deleted. Walk through all objects in the object cache, which is - // those objects that we read from the file, and break all - // resolved indirect references by replacing them with an internal - // object type representing that they have been destroyed. Note - // that we can't break references like this at any time when the - // QPDF object is active. The call to reset also causes all direct - // QPDFObjectHandle objects that are reachable from this object to - // release their association with this QPDF. Direct objects are - // not destroyed since they can be moved to other QPDF objects - // safely. - - // At this point, obviously no one is still using the QPDF object, - // but we'll explicitly clear the xref table anyway just to - // prevent any possibility of resolve() succeeding. + // If two objects are mutually referential (through each object having an array or dictionary + // that contains an indirect reference to the other), the circular references in the + // std::shared_ptr objects will prevent the objects from being deleted. Walk through all objects + // in the object cache, which is those objects that we read from the file, and break all + // resolved indirect references by replacing them with an internal object type representing that + // they have been destroyed. Note that we can't break references like this at any time when the + // QPDF object is active. The call to reset also causes all direct QPDFObjectHandle objects that + // are reachable from this object to release their association with this QPDF. Direct objects + // are not destroyed since they can be moved to other QPDF objects safely. + + // At this point, obviously no one is still using the QPDF object, but we'll explicitly clear + // the xref table anyway just to prevent any possibility of resolve() succeeding. m->xref_table.clear(); for (auto const& iter: m->obj_cache) { iter.second.object->disconnect(); @@ -406,18 +399,15 @@ QPDF::findHeader() } p += 5; std::string version; - // Note: The string returned by line.c_str() is always - // null-terminated. The code below never overruns the buffer - // because a null character always short-circuits further - // advancement. + // Note: The string returned by line.c_str() is always null-terminated. The code below never + // overruns the buffer because a null character always short-circuits further advancement. bool valid = validatePDFVersion(p, version); if (valid) { m->pdf_version = version; if (global_offset != 0) { - // Empirical evidence strongly suggests that when there is - // leading material prior to the PDF header, all explicit - // offsets in the file are such that 0 points to the - // beginning of the header. + // Empirical evidence strongly suggests that when there is leading material prior to the + // PDF header, all explicit offsets in the file are such that 0 points to the beginning + // of the header. QTC::TC("qpdf", "QPDF global offset"); m->file = std::shared_ptr(new OffsetInputSource(m->file, global_offset)); } @@ -448,14 +438,12 @@ QPDF::parse(char const* password) if (!m->file->findFirst("%PDF-", 0, 1024, hf)) { QTC::TC("qpdf", "QPDF not a pdf file"); warn(damagedPDF("", 0, "can't find PDF header")); - // QPDFWriter writes files that usually require at least - // version 1.2 for /FlateDecode + // QPDFWriter writes files that usually require at least version 1.2 for /FlateDecode m->pdf_version = "1.2"; } - // PDF spec says %%EOF must be found within the last 1024 bytes of - // the file. We add an extra 30 characters to leave room for the - // startxref stuff. + // PDF spec says %%EOF must be found within the last 1024 bytes of/ the file. We add an extra + // 30 characters to leave room for the startxref stuff. m->file->seek(0, SEEK_END); qpdf_offset_t end_offset = m->file->tell(); qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); @@ -494,8 +482,8 @@ void QPDF::inParse(bool v) { if (m->in_parse == v) { - // This happens if QPDFParser::parse tries to - // resolve an indirect object while it is parsing. + // This happens if QPDFParser::parse tries to resolve an indirect object while it is + // parsing. throw std::logic_error("QPDF: re-entrant parsing detected. This is a qpdf bug." " Please report at https://github.com/qpdf/qpdf/issues."); } @@ -518,7 +506,7 @@ QPDF::warn( qpdf_offset_t offset, std::string const& message) { - warn(QPDFExc(error_code, this->getFilename(), object, offset, message)); + warn(QPDFExc(error_code, getFilename(), object, offset, message)); } void @@ -534,9 +522,8 @@ void QPDF::reconstruct_xref(QPDFExc& e) { if (m->reconstructed_xref) { - // Avoid xref reconstruction infinite loops. This is getting - // very hard to reproduce because qpdf is throwing many fewer - // exceptions while parsing. Most situations are warnings now. + // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because + // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now. throw e; } @@ -572,8 +559,7 @@ QPDF::reconstruct_xref(QPDFExc& e) QPDFTokenizer::Token t1 = readToken(m->file, MAX_LEN); qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); if (token_start >= next_line_start) { - // don't process yet -- wait until we get to the line - // containing this token + // don't process yet -- wait until we get to the line containing this token } else if (t1.isInteger()) { QPDFTokenizer::Token t2 = readToken(m->file, MAX_LEN); if ((t2.isInteger()) && (readToken(m->file, MAX_LEN).isWord("obj"))) { @@ -594,22 +580,18 @@ QPDF::reconstruct_xref(QPDFExc& e) } if (!m->trailer.isInitialized()) { - // We could check the last encountered object to see if it was - // an xref stream. If so, we could try to get the trailer - // from there. This may make it possible to recover files - // with bad startxref pointers even when they have object - // streams. + // We could check the last encountered object to see if it was an xref stream. If so, we + // could try to get the trailer from there. This may make it possible to recover files with + // bad startxref pointers even when they have object streams. throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file"); } - // We could iterate through the objects looking for streams and - // try to find objects inside of them, but it's probably not worth - // the trouble. Acrobat can't recover files with any errors in an - // xref stream, and this would be a real long shot anyway. If we - // wanted to do anything that involved looking at stream contents, - // we'd also have to call initializeEncryption() here. It's safe - // to call it more than once. + // We could iterate through the objects looking for streams and try to find objects inside of + // them, but it's probably not worth the trouble. Acrobat can't recover files with any errors + // in an xref stream, and this would be a real long shot anyway. If we wanted to do anything + // that involved looking at stream contents, we'd also have to call initializeEncryption() here. + // It's safe to call it more than once. } void @@ -622,12 +604,10 @@ QPDF::read_xref(qpdf_offset_t xref_offset) char buf[7]; memset(buf, 0, sizeof(buf)); m->file->seek(xref_offset, SEEK_SET); - // Some files miss the mark a little with startxref. We could - // do a better job of searching in the neighborhood for - // something that looks like either an xref table or stream, - // but the simple heuristic of skipping whitespace can help - // with the xref table case and is harmless with the stream - // case. + // Some files miss the mark a little with startxref. We could do a better job of searching + // in the neighborhood for something that looks like either an xref table or stream, but the + // simple heuristic of skipping whitespace can help with the xref table case and is harmless + // with the stream case. bool done = false; bool skipped_space = false; while (!done) { @@ -646,9 +626,8 @@ QPDF::read_xref(qpdf_offset_t xref_offset) } m->file->read(buf, sizeof(buf) - 1); - // The PDF spec says xref must be followed by a line - // terminator, but files exist in the wild where it is - // terminated by arbitrary whitespace. + // The PDF spec says xref must be followed by a line terminator, but files exist in the wild + // where it is terminated by arbitrary whitespace. if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) { if (skipped_space) { QTC::TC("qpdf", "QPDF xref skipped space"); @@ -662,8 +641,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset) : (buf[4] == ' ') ? 2 : 9999)); int skip = 4; - // buf is null-terminated, and QUtil::is_space('\0') is - // false, so this won't overrun. + // buf is null-terminated, and QUtil::is_space('\0') is false, so this won't overrun. while (QUtil::is_space(buf[skip])) { ++skip; } @@ -697,16 +675,16 @@ QPDF::read_xref(qpdf_offset_t xref_offset) ") is not one plus the highest object number (" + std::to_string(max_obj) + ")"))); } - // We no longer need the deleted_objects table, so go ahead and - // clear it out to make sure we never depend on its being set. + // We no longer need the deleted_objects table, so go ahead and clear it out to make sure we + // never depend on its being set. m->deleted_objects.clear(); } bool QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes) { - // is_space and is_digit both return false on '\0', so this will - // not overrun the null-terminated buffer. + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated + // buffer. char const* p = line.c_str(); char const* start = line.c_str(); @@ -753,8 +731,8 @@ QPDF::parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes) bool QPDF::parse_xrefEntry(std::string const& line, qpdf_offset_t& f1, int& f2, char& type) { - // is_space and is_digit both return false on '\0', so this will - // not overrun the null-terminated buffer. + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated + // buffer. char const* p = line.c_str(); // Skip zero or more spaces. There aren't supposed to be any. @@ -862,8 +840,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")"); } if (type == 'f') { - // Save deleted items until after we've checked the - // XRefStm, if any. + // Save deleted items until after we've checked the XRefStm, if any. deleted_items.push_back(QPDFObjGen(toI(i), f2)); } else { insertXrefEntry(toI(i), 1, f1, f2); @@ -902,9 +879,8 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer"); } else { if (cur_trailer.getKey("/XRefStm").isInteger()) { - // Read the xref stream but disregard any return value - // -- we'll use our trailer's /Prev key instead of the - // xref stream's. + // Read the xref stream but disregard any return value -- we'll use our trailer's + // /Prev key instead of the xref stream's. (void)read_xrefStream(cur_trailer.getKey("/XRefStm").getIntValue()); } else { throw damagedPDF("xref stream", xref_offset, "invalid /XRefStm"); @@ -1035,8 +1011,8 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) num_entries += toS(indx.at(i)); } - // entry_size and num_entries have both been validated to ensure - // that this multiplication does not cause an overflow. + // entry_size and num_entries have both been validated to ensure that this multiplication does + // not cause an overflow. size_t expected_size = entry_size * num_entries; std::shared_ptr bp = xref_obj.getStreamData(qpdf_dl_specialized); @@ -1060,9 +1036,8 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) bool saw_first_compressed_object = false; - // Actual size vs. expected size check above ensures that we will - // not overflow any buffers here. We know that entry_size * - // num_entries is equal to the size of the buffer. + // Actual size vs. expected size check above ensures that we will not overflow any buffers here. + // We know that entry_size * num_entries is equal to the size of the buffer. unsigned char const* data = bp->getBuffer(); for (size_t i = 0; i < num_entries; ++i) { // Read this entry @@ -1081,17 +1056,15 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) } } - // Get the object and generation number. The object number is - // based on /Index. The generation number is 0 unless this is - // an uncompressed object record, in which case the generation - // number appears as the third field. + // Get the object and generation number. The object number is based on /Index. The + // generation number is 0 unless this is an uncompressed object record, in which case the + // generation number appears as the third field. int obj = toI(indx.at(cur_chunk)); if ((obj < 0) || ((std::numeric_limits::max() - obj) < chunk_count)) { std::ostringstream msg; msg.imbue(std::locale::classic()); msg << "adding " << chunk_count << " to " << obj - << " while computing index in xref stream would cause" - << " an integer overflow"; + << " while computing index in xref stream would cause an integer overflow"; throw std::range_error(msg.str()); } obj += chunk_count; @@ -1113,10 +1086,8 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) m->first_xref_item_offset = xref_offset; } if (fields[0] == 0) { - // Ignore fields[2], which we don't care about in this - // case. This works around the issue of some PDF files - // that put invalid values, like -1, here for deleted - // objects. + // Ignore fields[2], which we don't care about in this case. This works around the issue + // of some PDF files that put invalid values, like -1, here for deleted objects. fields[2] = 0; } insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2])); @@ -1143,17 +1114,14 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) void QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2, bool overwrite) { - // Populate the xref table in such a way that the first reference - // to an object that we see, which is the one in the latest xref - // table in which it appears, is the one that gets stored. This - // works because we are reading more recent appends before older - // ones. Exception: if overwrite is true, then replace any - // existing object. This is used in xref recovery mode, which - // reads the file from beginning to end. - - // If there is already an entry for this object and generation in - // the table, it means that a later xref table has registered this - // object. Disregard this one. + // Populate the xref table in such a way that the first reference to an object that we see, + // which is the one in the latest xref table in which it appears, is the one that gets stored. + // This works because we are reading more recent appends before older ones. Exception: if + // overwrite is true, then replace any existing object. This is used in xref recovery mode, + // which reads the file from beginning to end. + + // If there is already an entry for this object and generation in the table, it means that a + // later xref table has registered this object. Disregard this one. { // private scope int gen = (f0 == 2 ? 0 : f2); QPDFObjGen og(obj, gen); @@ -1220,8 +1188,8 @@ QPDF::showXRefTable() } } -// Resolve all objects in the xref table. If this triggers a xref table -// reconstruction abort and return false. Otherwise return true. +// Resolve all objects in the xref table. If this triggers a xref table reconstruction abort and +// return false. Otherwise return true. bool QPDF::resolveXRefTable() { @@ -1237,8 +1205,8 @@ QPDF::resolveXRefTable() return true; } -// Ensure all objects in the pdf file, including those in indirect -// references, appear in the object cache. +// Ensure all objects in the pdf file, including those in indirect references, appear in the object +// cache. void QPDF::fixDanglingReferences(bool force) { @@ -1255,10 +1223,9 @@ QPDF::fixDanglingReferences(bool force) size_t QPDF::getObjectCount() { - // This method returns the next available indirect object number. - // makeIndirectObject uses it for this purpose. After - // fixDanglingReferences is called, all objects in the xref table - // will also be in obj_cache. + // This method returns the next available indirect object number. makeIndirectObject uses it for + // this purpose. After fixDanglingReferences is called, all objects in the xref table will also + // be in obj_cache. fixDanglingReferences(); QPDFObjGen og; if (!m->obj_cache.empty()) { @@ -1270,8 +1237,7 @@ QPDF::getObjectCount() std::vector QPDF::getAllObjects() { - // After fixDanglingReferences is called, all objects are in the - // object cache. + // After fixDanglingReferences is called, all objects are in the object cache. fixDanglingReferences(); std::vector result; for (auto const& iter: m->obj_cache) { @@ -1315,34 +1281,27 @@ QPDF::readObject( auto object = QPDFParser(input, m->last_object_description, m->tokenizer, decrypter, this) .parse(empty, false); if (empty) { - // Nothing in the PDF spec appears to allow empty objects, but - // they have been encountered in actual PDF files and Adobe - // Reader appears to ignore them. + // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in + // actual PDF files and Adobe Reader appears to ignore them. warn(damagedPDF(input, input->getLastOffset(), "empty object treated as null")); } else if (object.isDictionary() && (!in_object_stream)) { // check for stream qpdf_offset_t cur_offset = input->tell(); if (readToken(input).isWord("stream")) { - // The PDF specification states that the word "stream" - // should be followed by either a carriage return and - // a newline or by a newline alone. It specifically - // disallowed following it by a carriage return alone - // since, in that case, there would be no way to tell - // whether the NL in a CR NL sequence was part of the - // stream data. However, some readers, including - // Adobe reader, accept a carriage return by itself - // when followed by a non-newline character, so that's - // what we do here. We have also seen files that have - // extraneous whitespace between the stream keyword and - // the newline. + // The PDF specification states that the word "stream" should be followed by either a + // carriage return and a newline or by a newline alone. It specifically disallowed + // following it by a carriage return alone since, in that case, there would be no way to + // tell whether the NL in a CR NL sequence was part of the stream data. However, some + // readers, including Adobe reader, accept a carriage return by itself when followed by + // a non-newline character, so that's what we do here. We have also seen files that have + // extraneous whitespace between the stream keyword and the newline. bool done = false; while (!done) { done = true; char ch; if (input->read(&ch, 1) == 0) { - // A premature EOF here will result in some - // other problem that will get reported at - // another time. + // A premature EOF here will result in some other problem that will get reported + // at another time. } else if (ch == '\n') { // ready to read stream data QTC::TC("qpdf", "QPDF stream with NL only"); @@ -1353,10 +1312,8 @@ QPDF::readObject( // Ready to read stream data QTC::TC("qpdf", "QPDF stream with CRNL"); } else { - // Treat the \r by itself as the - // whitespace after endstream and - // start reading stream data in spite - // of not having seen a newline. + // Treat the \r by itself as the whitespace after endstream and start + // reading stream data in spite of not having seen a newline. QTC::TC("qpdf", "QPDF stream with CR only"); input->unreadCh(ch); warn(damagedPDF( @@ -1381,9 +1338,8 @@ QPDF::readObject( } } - // Must get offset before accessing any additional - // objects since resolving a previously unresolved - // indirect object will change file position. + // Must get offset before accessing any additional objects since resolving a previously + // unresolved indirect object will change file position. qpdf_offset_t stream_offset = input->tell(); size_t length = 0; @@ -1427,8 +1383,7 @@ QPDF::readObject( } } - // Override last_offset so that it points to the beginning of the - // object we just read + // Override last_offset so that it points to the beginning of the object we just read input->setLastOffset(offset); return object; } @@ -1449,8 +1404,7 @@ size_t QPDF::recoverStreamLength( std::shared_ptr input, QPDFObjGen const& og, qpdf_offset_t stream_offset) { - // Try to reconstruct stream length by looking for - // endstream or endobj + // Try to reconstruct stream length by looking for endstream or endobj warn(damagedPDF(input, stream_offset, "attempting to recover stream length")); PatternFinder ef(*this, &QPDF::findEndstream); @@ -1481,9 +1435,8 @@ QPDF::recoverStreamLength( } } if (this_obj_offset && (this_og == og)) { - // Well, we found endstream\nendobj within the space - // allowed for this object, so we're probably in good - // shape. + // Well, we found endstream\nendobj within the space allowed for this object, so we're + // probably in good shape. } else { QTC::TC("qpdf", "QPDF found wrong endstream in recovery"); } @@ -1518,14 +1471,12 @@ QPDF::readObjectAtOffset( { bool check_og = true; if (exp_og.getObj() == 0) { - // This method uses an expect object ID of 0 to indicate that - // we don't know or don't care what the actual object ID is at - // this offset. This is true when we read the xref stream and - // linearization hint streams. In this case, we don't verify - // the expect object ID/generation against what was read from - // the file. There is also no reason to attempt xref recovery - // if we get a failure in this case since the read attempt was - // not triggered by an xref lookup. + // This method uses an expect object ID of 0 to indicate that we don't know or don't care + // what the actual object ID is at this offset. This is true when we read the xref stream + // and linearization hint streams. In this case, we don't verify the expect object + // ID/generation against what was read from the file. There is also no reason to attempt + // xref recovery if we get a failure in this case since the read attempt was not triggered + // by an xref lookup. check_og = false; try_recovery = false; } @@ -1535,11 +1486,9 @@ QPDF::readObjectAtOffset( try_recovery = false; } - // Special case: if offset is 0, just return null. Some PDF - // writers, in particular "Mac OS X 10.7.5 Quartz PDFContext", may - // store deleted objects in the xref table as "0000000000 00000 - // n", which is not correct, but it won't hurt anything for to - // ignore these. + // Special case: if offset is 0, just return null. Some PDF writers, in particular + // "Mac OS X 10.7.5 Quartz PDFContext", may store deleted objects in the xref table as + // "0000000000 00000 n", which is not correct, but it won't hurt anything for to ignore these. if (offset == 0) { QTC::TC("qpdf", "QPDF bogus 0 offset", 0); warn(damagedPDF(0, "object has offset 0")); @@ -1579,8 +1528,7 @@ QPDF::readObjectAtOffset( // Will be retried below throw e; } else { - // We can try reading the object anyway even if the ID - // doesn't match. + // We can try reading the object anyway even if the ID doesn't match. warn(e); } } @@ -1617,16 +1565,13 @@ QPDF::readObjectAtOffset( } if (isUnresolved(og)) { - // Store the object in the cache here so it gets cached - // whether we first know the offset or whether we first know - // the object ID and generation (in which we case we would get - // here through resolve). - - // Determine the end offset of this object before and after - // white space. We use these numbers to validate - // linearization hint tables. Offsets and lengths of objects - // may imply the end of an object to be anywhere between these - // values. + // Store the object in the cache here so it gets cached whether we first know the offset or + // whether we first know the object ID and generation (in which we case we would get here + // through resolve). + + // Determine the end offset of this object before and after white space. We use these + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply + // the end of an object to be anywhere between these values. qpdf_offset_t end_before_space = m->file->tell(); // skip over spaces @@ -1643,41 +1588,31 @@ QPDF::readObjectAtOffset( } qpdf_offset_t end_after_space = m->file->tell(); if (skip_cache_if_in_xref && m->xref_table.count(og)) { - // Ordinarily, an object gets read here when resolved - // through xref table or stream. In the special case of - // the xref stream and linearization hint tables, the - // offset comes from another source. For the specific case - // of xref streams, the xref stream is read and loaded - // into the object cache very early in parsing. - // Ordinarily, when a file is updated by appending, items - // inserted into the xref table in later updates take - // precedence over earlier items. In the special case of - // reusing the object number previously used as the xref - // stream, we have the following order of events: + // Ordinarily, an object gets read here when resolved through xref table or stream. In + // the special case of the xref stream and linearization hint tables, the offset comes + // from another source. For the specific case of xref streams, the xref stream is read + // and loaded into the object cache very early in parsing. Ordinarily, when a file is + // updated by appending, items inserted into the xref table in later updates take + // precedence over earlier items. In the special case of reusing the object number + // previously used as the xref stream, we have the following order of events: // // * reused object gets loaded into the xref table // * old object is read here while reading xref streams // * original xref entry is ignored (since already in xref table) // - // It is the second step that causes a problem. Even - // though the xref table is correct in this case, the old - // object is already in the cache and so effectively - // prevails over the reused object. To work around this - // issue, we have a special case for the xref stream (via - // the skip_cache_if_in_xref): if the object is already in - // the xref stream, don't cache what we read here. + // It is the second step that causes a problem. Even though the xref table is correct in + // this case, the old object is already in the cache and so effectively prevails over + // the reused object. To work around this issue, we have a special case for the xref + // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream, + // don't cache what we read here. // - // It is likely that the same bug may exist for - // linearization hint tables, but the existing code uses - // end_before_space and end_after_space from the cache, so - // fixing that would require more significant rework. The - // chances of a linearization hint stream being reused - // seems smaller because the xref stream is probably the - // highest object in the file and the linearization hint - // stream would be some random place in the middle, so I'm - // leaving that bug unfixed for now. If the bug were to be - // fixed, we could use !check_og in place of - // skip_cache_if_in_xref. + // It is likely that the same bug may exist for linearization hint tables, but the + // existing code uses end_before_space and end_after_space from the cache, so fixing + // that would require more significant rework. The chances of a linearization hint + // stream being reused seems smaller because the xref stream is probably the highest + // object in the file and the linearization hint stream would be some random place in + // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we + // could use !check_og in place of skip_cache_if_in_xref. QTC::TC("qpdf", "QPDF skipping cache for known unchecked object"); } else { updateCache(og, oh.getObj(), end_before_space, end_after_space); @@ -1695,9 +1630,8 @@ QPDF::resolve(QPDFObjGen og) } if (m->resolving.count(og)) { - // This can happen if an object references itself directly or - // indirectly in some key that has to be resolved during - // object parsing, such as stream length. + // This can happen if an object references itself directly or indirectly in some key that + // has to be resolved during object parsing, such as stream length. QTC::TC("qpdf", "QPDF recursion loop in resolve"); warn(damagedPDF("", "loop detected resolving object " + og.unparse(' '))); updateCache(og, QPDF_Null::create(), -1, -1); @@ -1758,8 +1692,8 @@ QPDF::resolveObjectsInStream(int obj_stream_number) "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream"); } - // For linearization data in the object, use the data from the - // object stream for the objects in the stream. + // For linearization data in the object, use the data from the object stream for the objects in + // the stream. QPDFObjGen stream_og(obj_stream_number, 0); qpdf_offset_t end_before_space = m->obj_cache[stream_og].end_before_space; qpdf_offset_t end_after_space = m->obj_cache[stream_og].end_after_space; @@ -1804,11 +1738,10 @@ QPDF::resolveObjectsInStream(int obj_stream_number) offsets[num] = toI(offset + first); } - // To avoid having to read the object stream multiple times, store - // all objects that would be found here in the cache. Remember - // that some objects stored here might have been overridden by new - // objects appended to the file, so it is necessary to recheck the - // xref table and only cache what would actually be resolved here. + // To avoid having to read the object stream multiple times, store all objects that would be + // found here in the cache. Remember that some objects stored here might have been overridden + // by new objects appended to the file, so it is necessary to recheck the xref table and only + // cache what would actually be resolved here. for (auto const& iter: offsets) { QPDFObjGen og(iter.first, 0); QPDFXRefEntry const& entry = m->xref_table[og]; @@ -1936,8 +1869,7 @@ QPDF::reserveStream(QPDFObjGen const& og) QPDFObjectHandle QPDF::getObject(QPDFObjGen const& og) { - // This method is called by the parser and therefore must not - // resolve any objects. + // This method is called by the parser and therefore must not resolve any objects. if (!isCached(og)) { m->obj_cache[og] = ObjCache(QPDF_Unresolved::create(this, og), -1, -1); } @@ -1991,48 +1923,38 @@ QPDF::copyForeignObject(QPDFObjectHandle foreign) { // Here's an explanation of what's going on here. // - // A QPDFObjectHandle that is an indirect object has an owning - // QPDF. The object ID and generation refers to an object in the - // owning QPDF. When we copy the QPDFObjectHandle from a foreign - // QPDF into the local QPDF, we have to replace all indirect - // object references with references to the corresponding object - // in the local file. + // A QPDFObjectHandle that is an indirect object has an owning QPDF. The object ID and + // generation refers to an object in the owning QPDF. When we copy the QPDFObjectHandle from a + // foreign QPDF into the local QPDF, we have to replace all indirect object references with + // references to the corresponding object in the local file. // - // To do this, we maintain mappings from foreign object IDs to - // local object IDs for each foreign QPDF that we are copying - // from. The mapping is stored in an ObjCopier, which contains a + // To do this, we maintain mappings from foreign object IDs to local object IDs for each foreign + // QPDF that we are copying from. The mapping is stored in an ObjCopier, which contains a // mapping from the foreign ObjGen to the local QPDFObjectHandle. // - // To copy, we do a deep traversal of the foreign object with loop - // detection to discover all indirect objects that are - // encountered, stopping at page boundaries. Whenever we encounter - // an indirect object, we check to see if we have already created - // a local copy of it. If not, we allocate a "reserved" object - // (or, for a stream, just a new stream) and store in the map the + // To copy, we do a deep traversal of the foreign object with loop detection to discover all + // indirect objects that are encountered, stopping at page boundaries. Whenever we encounter an + // indirect object, we check to see if we have already created a local copy of it. If not, we + // allocate a "reserved" object (or, for a stream, just a new stream) and store in the map the // mapping from the foreign object ID to the new object. While we // do this, we keep a list of objects to copy. // - // Once we are done with the traversal, we copy all the objects - // that we need to copy. However, the copies will contain indirect - // object IDs that refer to objects in the foreign file. We need - // to replace them with references to objects in the local file. - // This is what replaceForeignIndirectObjects does. Once we have - // created a copy of the foreign object with all the indirect - // references replaced with new ones in the local context, we can - // replace the local reserved object with the copy. This mechanism - // allows us to copy objects with circular references in any - // order. - - // For streams, rather than copying the objects, we set up the - // stream data to pull from the original stream by using a stream - // data provider. This is done in a manner that doesn't require - // the original QPDF object but may require the original source of - // the stream data with special handling for immediate_copy_from. - // This logic is also in replaceForeignIndirectObjects. - - // Note that we explicitly allow use of copyForeignObject on page - // objects. It is a documented use case to copy pages this way if - // the intention is to not update the pages tree. + // Once we are done with the traversal, we copy all the objects that we need to copy. However, + // the copies will contain indirect object IDs that refer to objects in the foreign file. We + // need to replace them with references to objects in the local file. This is what + // replaceForeignIndirectObjects does. Once we have created a copy of the foreign object with + // all the indirect references replaced with new ones in the local context, we can replace the + // local reserved object with the copy. This mechanism allows us to copy objects with circular + // references in any order. + + // For streams, rather than copying the objects, we set up the stream data to pull from the + // original stream by using a stream data provider. This is done in a manner that doesn't + // require the original QPDF object but may require the original source of the stream data with + // special handling for immediate_copy_from. This logic is also in + // replaceForeignIndirectObjects. + + // Note that we explicitly allow use of copyForeignObject on page objects. It is a documented + // use case to copy pages this way if the intention is to not update the pages tree. if (!foreign.isIndirect()) { QTC::TC("qpdf", "QPDF copyForeign direct"); throw std::logic_error("QPDF::copyForeign called with direct object handle"); @@ -2049,12 +1971,10 @@ QPDF::copyForeignObject(QPDFObjectHandle foreign) " at the beginning of copyForeignObject"); } - // Make sure we have an object in this file for every referenced - // object in the old file. obj_copier.object_map maps foreign - // QPDFObjGen to local objects. For everything new that we have - // to copy, the local object will be a reservation, unless it is a - // stream, in which case the local object will already be a - // stream. + // Make sure we have an object in this file for every referenced object in the old file. + // obj_copier.object_map maps foreign QPDFObjGen to local objects. For everything new that we + // have to copy, the local object will be a reservation, unless it is a stream, in which case + // the local object will already be a stream. reserveObjects(foreign, obj_copier, true); if (!obj_copier.visiting.empty()) { @@ -2140,8 +2060,8 @@ QPDF::replaceForeignIndirectObjects(QPDFObjectHandle foreign, ObjCopier& obj_cop QTC::TC("qpdf", "QPDF replace indirect"); auto mapping = obj_copier.object_map.find(foreign.getObjGen()); if (mapping == obj_copier.object_map.end()) { - // This case would occur if this is a reference to a Page - // or Pages object that we didn't traverse into. + // This case would occur if this is a reference to a Page or Pages object that we didn't + // traverse into. QTC::TC("qpdf", "QPDF replace foreign indirect with null"); result = QPDFObjectHandle::newNull(); } else { @@ -2192,9 +2112,8 @@ QPDF::replaceForeignIndirectObjects(QPDFObjectHandle foreign, ObjCopier& obj_cop void QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) { - // This method was originally written for copying foreign streams, - // but it is used by QPDFObjectHandle to copy streams from the - // same QPDF object as well. + // This method was originally written for copying foreign streams, but it is used by + // QPDFObjectHandle to copy streams from the same QPDF object as well. QPDFObjectHandle dict = result.getDict(); QPDFObjectHandle old_dict = foreign.getDict(); @@ -2204,8 +2123,8 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) std::shared_ptr(m->copied_stream_data_provider); } QPDFObjGen local_og(result.getObjGen()); - // Copy information from the foreign stream so we can pipe its - // data later without keeping the original QPDF object around. + // Copy information from the foreign stream so we can pipe its data later without keeping the + // original QPDF object around. QPDF& foreign_stream_qpdf = foreign.getQPDF("unable to retrieve owning qpdf from foreign stream"); @@ -2217,10 +2136,9 @@ QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign) } std::shared_ptr stream_buffer = stream->getStreamDataBuffer(); if ((foreign_stream_qpdf.m->immediate_copy_from) && (stream_buffer == nullptr)) { - // Pull the stream data into a buffer before attempting - // the copy operation. Do it on the source stream so that - // if the source stream is copied multiple times, we don't - // have to keep duplicating the memory. + // Pull the stream data into a buffer before attempting the copy operation. Do it on the + // source stream so that if the source stream is copied multiple times, we don't have to + // keep duplicating the memory. QTC::TC("qpdf", "QPDF immediate copy stream data"); foreign.replaceStreamData( foreign.getRawStreamData(), @@ -2263,8 +2181,7 @@ QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2) void QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2) { - // Force objects to be read from the input source if needed, then - // swap them in the cache. + // Force objects to be read from the input source if needed, then swap them in the cache. resolve(og1); resolve(og2); m->obj_cache[og1].object->swapWith(m->obj_cache[og2].object); @@ -2338,9 +2255,8 @@ QPDF::getRoot() if (!root.isDictionary()) { throw damagedPDF("", 0, "unable to find /Root dictionary"); } else if ( - // Check_mode is an interim solution to request #810 pending a more - // comprehensive review of the approach to more extensive checks and - // warning levels. + // Check_mode is an interim solution to request #810 pending a more comprehensive review of + // the approach to more extensive checks and warning levels. m->check_mode && !root.getKey("/Type").isNameAndEquals("/Catalog")) { warn(damagedPDF("", 0, "catalog /Type entry missing or invalid")); root.replaceKey("/Type", "/Catalog"_qpdf); @@ -2373,14 +2289,11 @@ QPDF::getObjectStreamData(std::map& omap) std::vector QPDF::getCompressibleObjGens() { - // Return a list of objects that are allowed to be in object - // streams. Walk through the objects by traversing the document - // from the root, including a traversal of the pages tree. This - // makes that objects that are on the same page are more likely to - // be in the same object stream, which is slightly more efficient, - // particularly with linearized files. This is better than - // iterating through the xref table since it avoids preserving - // orphaned items. + // Return a list of objects that are allowed to be in object streams. Walk through the objects + // by traversing the document from the root, including a traversal of the pages tree. This + // makes that objects that are on the same page are more likely to be in the same object stream, + // which is slightly more efficient, particularly with linearized files. This is better than + // iterating through the xref table since it avoids preserving orphaned items. // Exclude encryption dictionary, if any QPDFObjectHandle encryption_dict = m->trailer.getKey("/Encrypt"); @@ -2555,9 +2468,8 @@ QPDF::pipeForeignStreamData( will_retry); } -// Throw a generic exception when we lack context for something -// more specific. New code should not use this. This method exists -// to improve somewhat from calling assert in very old code. +// Throw a generic exception when we lack context for something more specific. New code should not +// use this. This method exists to improve somewhat from calling assert in very old code. void QPDF::stopOnError(std::string const& message) { @@ -2584,33 +2496,31 @@ QPDF::damagedPDF( return damagedPDF(input, m->last_object_description, offset, message); } -// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from -// m->file. +// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from m->file. QPDFExc QPDF::damagedPDF(std::string const& object, qpdf_offset_t offset, std::string const& message) { return QPDFExc(qpdf_e_damaged_pdf, m->file->getName(), object, offset, message); } -// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from -// m->file and the offset from .m->file->getLastOffset(). +// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from m->file and the +// offset from .m->file->getLastOffset(). QPDFExc QPDF::damagedPDF(std::string const& object, std::string const& message) { return damagedPDF(object, m->file->getLastOffset(), message); } -// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from -// m->file and the object from .m->last_object_description. +// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from m->file and the object +// from .m->last_object_description. QPDFExc QPDF::damagedPDF(qpdf_offset_t offset, std::string const& message) { return damagedPDF(m->last_object_description, offset, message); } -// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from -// m->file, the object from m->last_object_description and the offset from -// m->file->getLastOffset(). +// Return an exception of type qpdf_e_damaged_pdf. The filename is taken from m->file, the object +// from m->last_object_description and the offset from m->file->getLastOffset(). QPDFExc QPDF::damagedPDF(std::string const& message) { -- cgit v1.2.3-54-g00ecf