From f45dacf4cbfab73ce470d0a61d4acee14206ab2b Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt
Date: Sun, 7 Nov 2021 15:23:29 -0500
Subject: Make recovery logic flexible about where objects end (fixes #573)

Don't assume endobj is at the beginning of the line. This means we are
looking at tokens for every line, but the odds of n n obj appearing in
the middle of the object are likely much lower than endobj not being
at the beginning of the line or missing entirely.

This will probably have a negative impact on recovery time for very
large files. Hopefully it will be worth it.
---
 libqpdf/QPDF.cc | 61 +++++++++++++++++++++++---------------------------------
 1 file changed, 25 insertions(+), 36 deletions(-)

(limited to 'libqpdf/QPDF.cc')

diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc
index 94161c08..4ce60ea2 100644
--- a/libqpdf/QPDF.cc
+++ b/libqpdf/QPDF.cc
@@ -590,7 +590,6 @@ QPDF::reconstruct_xref(QPDFExc& e)
     this->m->file->seek(0, SEEK_END);
     qpdf_offset_t eof = this->m->file->tell();
     this->m->file->seek(0, SEEK_SET);
-    bool in_obj = false;
     qpdf_offset_t line_start = 0;
     // Don't allow very long tokens here during recovery.
     static size_t const MAX_LEN = 100;
@@ -604,46 +603,36 @@ QPDF::reconstruct_xref(QPDFExc& e)
             this->m->file->tell() - toO(t1.getValue().length());
         if (token_start >= next_line_start)
         {
-            // don't process yet
+            // don't process yet -- wait until we get to the line
+            // containing this token
         }
-        else if (in_obj)
-        {
-            if (t1 == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "endobj"))
-            {
-                in_obj = false;
-            }
-        }
-        else
+        else if (t1.getType() == QPDFTokenizer::tt_integer)
         {
-            if (t1.getType() == QPDFTokenizer::tt_integer)
+            QPDFTokenizer::Token t2 =
+                readToken(this->m->file, MAX_LEN);
+            QPDFTokenizer::Token t3 =
+                readToken(this->m->file, MAX_LEN);
+            if ((t2.getType() == QPDFTokenizer::tt_integer) &&
+                (t3 == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj")))
             {
-                QPDFTokenizer::Token t2 =
-                    readToken(this->m->file, MAX_LEN);
-                QPDFTokenizer::Token t3 =
-                    readToken(this->m->file, MAX_LEN);
-                if ((t2.getType() == QPDFTokenizer::tt_integer) &&
-                    (t3 == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj")))
-                {
-                    in_obj = true;
-                    int obj = QUtil::string_to_int(t1.getValue().c_str());
-                    int gen = QUtil::string_to_int(t2.getValue().c_str());
-                    insertXrefEntry(obj, 1, token_start, gen, true);
-                }
+                int obj = QUtil::string_to_int(t1.getValue().c_str());
+                int gen = QUtil::string_to_int(t2.getValue().c_str());
+                insertXrefEntry(obj, 1, token_start, gen, true);
             }
-            else if ((! this->m->trailer.isInitialized()) &&
-                     (t1 == QPDFTokenizer::Token(
-                          QPDFTokenizer::tt_word, "trailer")))
-            {
-                QPDFObjectHandle t =
+        }
+        else if ((! this->m->trailer.isInitialized()) &&
+                 (t1 == QPDFTokenizer::Token(
+                      QPDFTokenizer::tt_word, "trailer")))
+        {
+            QPDFObjectHandle t =
                     readObject(this->m->file, "trailer", 0, 0, false);
-                if (! t.isDictionary())
-                {
-                    // Oh well. It was worth a try.
-                }
-                else
-                {
-                    setTrailer(t);
-                }
+            if (! t.isDictionary())
+            {
+                // Oh well. It was worth a try.
+            }
+            else
+            {
+                setTrailer(t);
             }
         }
         this->m->file->seek(next_line_start, SEEK_SET);
-- 
cgit v1.2.3-54-g00ecf