From 9a0b88bf7777c153dc46ace22db74ef24d51583a Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 29 Apr 2008 12:55:25 +0000 Subject: update release date to actual date git-svn-id: svn+q:///qpdf/trunk@599 71b93d88-0707-0410-a8cf-f5a4172ac649 --- libqpdf/QPDF_linearization.cc | 2103 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2103 insertions(+) create mode 100644 libqpdf/QPDF_linearization.cc (limited to 'libqpdf/QPDF_linearization.cc') diff --git a/libqpdf/QPDF_linearization.cc b/libqpdf/QPDF_linearization.cc new file mode 100644 index 00000000..6c0cf3be --- /dev/null +++ b/libqpdf/QPDF_linearization.cc @@ -0,0 +1,2103 @@ +// See doc/linearization. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +template +static void +load_vector_int(BitStream& bit_stream, int nitems, std::vector& vec, + int bits_wanted, int T::*field) +{ + // nitems times, read bits_wanted from the given bit stream, + // storing results in the ith vector entry. + + for (int i = 0; i < nitems; ++i) + { + vec[i].*field = bit_stream.getBits(bits_wanted); + } + // The PDF spec says that each hint table starts at a byte + // boundary. Each "row" actually must start on a byte boundary. + bit_stream.skipToNextByte(); +} + +template +static void +load_vector_vector(BitStream& bit_stream, + int nitems1, std::vector& vec1, int T::*nitems2, + int bits_wanted, std::vector T::*vec2) +{ + // nitems1 times, read nitems2 (from the ith element of vec1) items + // into the vec2 vector field of the ith item of vec1. + for (int i1 = 0; i1 < nitems1; ++i1) + { + for (int i2 = 0; i2 < vec1[i1].*nitems2; ++i2) + { + (vec1[i1].*vec2).push_back(bit_stream.getBits(bits_wanted)); + } + } + bit_stream.skipToNextByte(); +} + +bool +QPDF::checkLinearization() +{ + bool result = false; + try + { + readLinearizationData(); + result = checkLinearizationInternal(); + } + catch (QPDFExc& e) + { + std::cout << e.what() << std::endl; + } + return result; +} + +bool +QPDF::isLinearized() +{ + // If the first object in the file is a dictionary with a suitable + // /Linearized key and has an /L key that accurately indicates the + // file size, initialize this->lindict and return true. + + // A linearized PDF spec's first object will be contained within + // the first 1024 bytes of the file and will be a dictionary with + // a valid /Linearized key. This routine looks for that and does + // no additional validation. + + // The PDF spec says the linearization dictionary must be + // completely contained within the first 1024 bytes of the file. + // Add a byte for a null terminator. + static int const tbuf_size = 1025; + + char* buf = new char[tbuf_size]; + this->file.seek(0, SEEK_SET); + PointerHolder b(buf); // guarantee deletion + memset(buf, '\0', tbuf_size); + this->file.read(buf, tbuf_size - 1); + + static PCRE lindict_re("(?s:(\\d+)\\s+0\\s+obj\\s*<<)"); + + off_t offset = -1; + int lindict_obj = 0; + char* p = buf; + while (lindict_obj == 0) + { + PCRE::Match m(lindict_re.match(p)); + if (m) + { + offset = m.getOffset(0) + (p - buf); + lindict_obj = atoi(m.getMatch(1).c_str()); + if (m.getMatch(0).find('\n') != std::string::npos) + { + QTC::TC("qpdf", "QPDF lindict found newline"); + } + } + else + { + if ((p = (char*)memchr(p, '\0', tbuf_size - (p - buf))) != 0) + { + QTC::TC("qpdf", "QPDF lindict null found"); + while ((p - buf < tbuf_size) && (*p == 0)) + { + ++p; + } + if ((p - buf) == tbuf_size) + { + break; + } + QTC::TC("qpdf", "QPDF lindict searching after null"); + } + } + } + + if (lindict_obj == 0) + { + return false; + } + + QPDFObjectHandle candidate = QPDFObjectHandle::Factory::newIndirect( + this, lindict_obj, 0); + if (! candidate.isDictionary()) + { + return false; + } + + QPDFObjectHandle linkey = candidate.getKey("/Linearized"); + if (! (linkey.isNumber() && ((int)floor(linkey.getNumericValue()) == 1))) + { + return false; + } + + QPDFObjectHandle L = candidate.getKey("/L"); + if (L.isInteger()) + { + int Li = L.getIntValue(); + this->file.seek(0, SEEK_END); + if (Li != this->file.tell()) + { + QTC::TC("qpdf", "QPDF /L mismatch"); + return false; + } + else + { + this->linp.file_size = Li; + } + } + + this->lindict = candidate; + + return true; +} + +void +QPDF::readLinearizationData() +{ + // This function throws an exception (which is trapped by + // checkLinearization()) for any errors that prevent loading. + + // Hint table parsing code needs at least 32 bits in a long. + assert(sizeof(long) >= 4); + + if (! isLinearized()) + { + throw QPDFExc(this->file.getName() + " is not linearized"); + } + + // /L is read and stored in linp by isLinearized() + QPDFObjectHandle H = lindict.getKey("/H"); + QPDFObjectHandle O = lindict.getKey("/O"); + QPDFObjectHandle E = lindict.getKey("/E"); + QPDFObjectHandle N = lindict.getKey("/N"); + QPDFObjectHandle T = lindict.getKey("/T"); + QPDFObjectHandle P = lindict.getKey("/P"); + + if (! (H.isArray() && + O.isInteger() && + E.isInteger() && + N.isInteger() && + T.isInteger() && + (P.isInteger() || P.isNull()))) + { + throw QPDFExc("some keys in linearization dictionary are of " + "the wrong type"); + } + + // Hint table array: offset length [ offset length ] + unsigned int n_H_items = H.getArrayNItems(); + if (! ((n_H_items == 2) || (n_H_items == 4))) + { + throw QPDFExc("H has the wrong number of items"); + } + + std::vector H_items; + for (unsigned int i = 0; i < n_H_items; ++i) + { + QPDFObjectHandle oh(H.getArrayItem(i)); + if (oh.isInteger()) + { + H_items.push_back(oh.getIntValue()); + } + else + { + throw QPDFExc("some H items are of the wrong type"); + } + } + + // H: hint table offset/length for primary and overflow hint tables + int H0_offset = H_items[0]; + int H0_length = H_items[1]; + int H1_offset = 0; + int H1_length = 0; + if (H_items.size() == 4) + { + // Acrobat doesn't read or write these (as PDF 1.4), so we + // don't have a way to generate a test case. + // QTC::TC("qpdf", "QPDF overflow hint table"); + H1_offset = H_items[2]; + H1_length = H_items[3]; + } + + // P: first page number + int first_page = 0; + if (P.isInteger()) + { + QTC::TC("qpdf", "QPDF P present in lindict"); + first_page = P.getIntValue(); + } + else + { + QTC::TC("qpdf", "QPDF P absent in lindict"); + } + + // Store linearization parameter data + + // file_size initialized by isLinearized() + this->linp.first_page_object = O.getIntValue(); + this->linp.first_page_end = E.getIntValue(); + this->linp.npages = N.getIntValue(); + this->linp.xref_zero_offset = T.getIntValue(); + this->linp.first_page = first_page; + this->linp.H_offset = H0_offset; + this->linp.H_length = H0_length; + + // Read hint streams + + Pl_Buffer pb("hint buffer"); + QPDFObjectHandle H0 = readHintStream(pb, H0_offset, H0_length); + if (H1_offset) + { + (void) readHintStream(pb, H1_offset, H1_length); + } + + // PDF 1.4 hint tables that we ignore: + + // /T thumbnail + // /A thread information + // /E named destination + // /V interactive form + // /I information dictionary + // /C logical structure + // /L page label + + // Individual hint table offsets + QPDFObjectHandle HS = H0.getKey("/S"); // shared object + QPDFObjectHandle HO = H0.getKey("/O"); // outline + + PointerHolder hbp = pb.getBuffer(); + Buffer* hb = hbp.getPointer(); + unsigned char const* h_buf = hb->getBuffer(); + int h_size = hb->getSize(); + + readHPageOffset(BitStream(h_buf, h_size)); + + int HSi = HS.getIntValue(); + readHSharedObject(BitStream(h_buf + HSi, h_size - HSi)); + + if (HO.isInteger()) + { + int HOi = HO.getIntValue(); + readHGeneric(BitStream(h_buf + HOi, h_size - HOi), + this->outline_hints); + } +} + +QPDFObjectHandle +QPDF::readHintStream(Pipeline& pl, off_t offset, size_t length) +{ + int obj; + int gen; + QPDFObjectHandle H = readObjectAtOffset(offset, 0, 0, obj, gen); + ObjCache& oc = this->obj_cache[ObjGen(obj, gen)]; + off_t min_end_offset = oc.end_before_space; + off_t max_end_offset = oc.end_after_space; + if (! H.isStream()) + { + throw QPDFExc("hint table is not a stream"); + } + + QPDFObjectHandle Hdict = H.getDict(); + + // Some versions of Acrobat make /Length indirect and place it + // immediately after the stream, increasing length to cover it, + // even though the specification says all objects in the + // linearization parameter dictionary must be direct. We have to + // get the file position of the end of length in this case. + QPDFObjectHandle length_obj = Hdict.getKey("/Length"); + if (length_obj.isIndirect()) + { + QTC::TC("qpdf", "QPDF hint table length indirect"); + // Force resolution + (void) length_obj.getIntValue(); + ObjCache& oc = this->obj_cache + [ObjGen(length_obj.getObjectID(), + length_obj.getGeneration())]; + min_end_offset = oc.end_before_space; + max_end_offset = oc.end_after_space; + } + else + { + QTC::TC("qpdf", "QPDF hint table length direct"); + } + off_t computed_end = offset + length; + if ((computed_end < min_end_offset) || + (computed_end > max_end_offset)) + { + std::cout << "expected = " << computed_end + << "; actual = " << min_end_offset << ".." + << max_end_offset << std::endl; + throw QPDFExc("hint table length mismatch"); + } + H.pipeStreamData(&pl, true, false, false); + return Hdict; +} + +void +QPDF::readHPageOffset(BitStream h) +{ + // All comments referring to the PDF spec refer to the spec for + // version 1.4. + + HPageOffset& t = this->page_offset_hints; + + t.min_nobjects = h.getBits(32); // 1 + t.first_page_offset = h.getBits(32); // 2 + t.nbits_delta_nobjects = h.getBits(16); // 3 + t.min_page_length = h.getBits(32); // 4 + t.nbits_delta_page_length = h.getBits(16); // 5 + t.min_content_offset = h.getBits(32); // 6 + t.nbits_delta_content_offset = h.getBits(16); // 7 + t.min_content_length = h.getBits(32); // 8 + t.nbits_delta_content_length = h.getBits(16); // 9 + t.nbits_nshared_objects = h.getBits(16); // 10 + t.nbits_shared_identifier = h.getBits(16); // 11 + t.nbits_shared_numerator = h.getBits(16); // 12 + t.shared_denominator = h.getBits(16); // 13 + + unsigned int nitems = this->linp.npages; + std::vector& entries = t.entries; + entries = std::vector(nitems); + + load_vector_int(h, nitems, entries, + t.nbits_delta_nobjects, + &HPageOffsetEntry::delta_nobjects); + load_vector_int(h, nitems, entries, + t.nbits_delta_page_length, + &HPageOffsetEntry::delta_page_length); + load_vector_int(h, nitems, entries, + t.nbits_nshared_objects, + &HPageOffsetEntry::nshared_objects); + load_vector_vector(h, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_identifier, + &HPageOffsetEntry::shared_identifiers); + load_vector_vector(h, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_numerator, + &HPageOffsetEntry::shared_numerators); + load_vector_int(h, nitems, entries, + t.nbits_delta_content_offset, + &HPageOffsetEntry::delta_content_offset); + load_vector_int(h, nitems, entries, + t.nbits_delta_content_length, + &HPageOffsetEntry::delta_content_length); +} + +void +QPDF::readHSharedObject(BitStream h) +{ + HSharedObject& t = this->shared_object_hints; + + t.first_shared_obj = h.getBits(32); // 1 + t.first_shared_offset = h.getBits(32); // 2 + t.nshared_first_page = h.getBits(32); // 3 + t.nshared_total = h.getBits(32); // 4 + t.nbits_nobjects = h.getBits(16); // 5 + t.min_group_length = h.getBits(32); // 6 + t.nbits_delta_group_length = h.getBits(16); // 7 + + QTC::TC("qpdf", "QPDF lin nshared_total > nshared_first_page", + (t.nshared_total > t.nshared_first_page) ? 1 : 0); + + int nitems = t.nshared_total; + std::vector& entries = t.entries; + entries = std::vector(nitems); + + load_vector_int(h, nitems, entries, + t.nbits_delta_group_length, + &HSharedObjectEntry::delta_group_length); + load_vector_int(h, nitems, entries, + 1, &HSharedObjectEntry::signature_present); + for (int i = 0; i < nitems; ++i) + { + if (entries[i].signature_present) + { + // Skip 128-bit MD5 hash. These are not supported by + // acrobat, so they should probably never be there. We + // have no test case for this. + for (int j = 0; j < 4; ++j) + { + (void) h.getBits(32); + } + } + } + load_vector_int(h, nitems, entries, + t.nbits_nobjects, + &HSharedObjectEntry::nobjects_minus_one); +} + +void +QPDF::readHGeneric(BitStream h, HGeneric& t) +{ + t.first_object = h.getBits(32); // 1 + t.first_object_offset = h.getBits(32); // 2 + t.nobjects = h.getBits(32); // 3 + t.group_length = h.getBits(32); // 4 +} + +bool +QPDF::checkLinearizationInternal() +{ + // All comments referring to the PDF spec refer to the spec for + // version 1.4. + + std::list errors; + std::list warnings; + + // Check all values in linearization parameter dictionary + + LinParameters& p = this->linp; + + // L: file size in bytes -- checked by isLinearized + + // O: object number of first page + std::vector const& pages = getAllPages(); + if (p.first_page_object != pages[0].getObjectID()) + { + QTC::TC("qpdf", "QPDF err /O mismatch"); + errors.push_back("first page object (/O) mismatch"); + } + + // N: number of pages + int npages = pages.size(); + if (p.npages != npages) + { + // Not tested in the test suite + errors.push_back("page count (/N) mismatch"); + } + + for (int i = 0; i < npages; ++i) + { + QPDFObjectHandle const& page = pages[i]; + ObjGen og(page.getObjectID(), page.getGeneration()); + if (this->xref_table[og].getType() == 2) + { + errors.push_back("page dictionary for page " + + QUtil::int_to_string(i) + " is compressed"); + } + } + + // T: offset of whitespace character preceding xref entry for object 0 + this->file.seek(p.xref_zero_offset, SEEK_SET); + while (1) + { + char ch; + this->file.read(&ch, 1); + if (! ((ch == ' ') || (ch == '\r') || (ch == '\n'))) + { + this->file.seek(-1, SEEK_CUR); + break; + } + } + if (this->file.tell() != this->first_xref_item_offset) + { + QTC::TC("qpdf", "QPDF err /T mismatch"); + errors.push_back("space before first xref item (/T) mismatch " + "(computed = " + + QUtil::int_to_string(this->first_xref_item_offset) + + "; file = " + QUtil::int_to_string(this->file.tell())); + } + + // P: first page number -- Implementation note 124 says Acrobat + // ignores this value, so we will too. + + // Check numbering of compressed objects in each xref section. + // For linearized files, all compressed objects are supposed to be + // at the end of the containing xref section if any object streams + // are in use. + + if (this->uncompressed_after_compressed) + { + errors.push_back("linearized file contains an uncompressed object" + " after a compressed one in a cross-reference stream"); + } + + // Further checking requires optimization and order calculation. + // Don't allow optimization to make changes. If it has to, then + // the file is not properly linearized. We use the xref table to + // figure out which objects are compressed and which are + // uncompressed. + { // local scope + std::map object_stream_data; + for (std::map::const_iterator iter = + this->xref_table.begin(); + iter != this->xref_table.end(); ++iter) + { + ObjGen const& og = (*iter).first; + QPDFXRefEntry const& entry = (*iter).second; + if (entry.getType() == 2) + { + object_stream_data[og.obj] = entry.getObjStreamNumber(); + } + } + optimize(object_stream_data, false); + calculateLinearizationData(object_stream_data); + } + + // E: offset of end of first page -- Implementation note 123 says + // Acrobat includes on extra object here by mistake. pdlin fails + // to place thumbnail images in section 9, so when thumbnails are + // present, it also gets the wrong value for /E. It also doesn't + // count outlines here when it should even though it places them + // in part 6. This code fails to put thread information + // dictionaries in part 9, so it actually gets the wrong value for + // E when threads are present. In that case, it would probably + // agree with pdlin. As of this writing, the test suite doesn't + // contain any files with threads. + + assert(! this->part6.empty()); + int min_E = -1; + int max_E = -1; + for (std::vector::iterator iter = this->part6.begin(); + iter != this->part6.end(); ++iter) + { + ObjGen og((*iter).getObjectID(), (*iter).getGeneration()); + // All objects have to have been dereferenced to be classified. + assert(this->obj_cache.count(og) > 0); + ObjCache const& oc = this->obj_cache[og]; + min_E = std::max(min_E, (int)oc.end_before_space); + max_E = std::max(max_E, (int)oc.end_after_space); + } + if ((p.first_page_end < min_E) || (p.first_page_end > max_E)) + { + QTC::TC("qpdf", "QPDF warn /E mismatch"); + warnings.push_back("end of first page section (/E) mismatch: /E = " + + QUtil::int_to_string(p.first_page_end) + + "; computed = " + + QUtil::int_to_string(min_E) + ".." + + QUtil::int_to_string(max_E)); + } + + // Check hint tables + + std::map shared_idx_to_obj; + checkHSharedObject(errors, warnings, pages, shared_idx_to_obj); + checkHPageOffset(errors, warnings, pages, shared_idx_to_obj); + checkHOutlines(warnings); + + // Report errors + + bool result = true; + + if (! errors.empty()) + { + result = false; + for (std::list::iterator iter = errors.begin(); + iter != errors.end(); ++iter) + { + std::cout << "ERROR: " << (*iter) << std::endl; + } + } + + if (! warnings.empty()) + { + result = false; + for (std::list::iterator iter = warnings.begin(); + iter != warnings.end(); ++iter) + { + std::cout << "WARNING: " << (*iter) << std::endl; + } + } + + return result; +} + +int +QPDF::maxEnd(ObjUser const& ou) +{ + assert(this->obj_user_to_objects.count(ou) > 0); + std::set const& ogs = this->obj_user_to_objects[ou]; + int end = 0; + for (std::set::iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = *iter; + assert(this->obj_cache.count(og) > 0); + end = std::max( + end, (int)(this->obj_cache[og].end_after_space)); + } + return end; +} + +int +QPDF::getLinearizationOffset(ObjGen const& og) +{ + QPDFXRefEntry entry = this->xref_table[og]; + int result = 0; + switch (entry.getType()) + { + case 1: + result = entry.getOffset(); + break; + + case 2: + // For compressed objects, return the offset of the object + // stream that contains them. + result = getLinearizationOffset(ObjGen(entry.getObjStreamNumber(), 0)); + break; + + default: + throw QPDFExc( + this->file.getName(), 0, + "getLinearizationOffset called for xref entry not of type 1 or 2"); + break; + } + return result; +} + +QPDFObjectHandle +QPDF::getUncompressedObject(QPDFObjectHandle& obj, + std::map const& object_stream_data) +{ + if (obj.isNull() || (object_stream_data.count(obj.getObjectID()) == 0)) + { + return obj; + } + else + { + int repl = (*(object_stream_data.find(obj.getObjectID()))).second; + return objGenToIndirect(ObjGen(repl, 0)); + } +} + +int +QPDF::lengthNextN(int first_object, int n, + std::list& errors) +{ + int length = 0; + for (int i = 0; i < n; ++i) + { + ObjGen og(first_object + i, 0); + if (this->xref_table.count(og) == 0) + { + errors.push_back( + "no xref table entry for " + + QUtil::int_to_string(first_object + i) + " 0"); + } + else + { + assert(this->obj_cache.count(og) > 0); + length += this->obj_cache[og].end_after_space - + getLinearizationOffset(og); + } + } + return length; +} + +void +QPDF::checkHPageOffset(std::list& errors, + std::list& warnings, + std::vector const& pages, + std::map& shared_idx_to_obj) +{ + // Implementation note 126 says Acrobat always sets + // delta_content_offset and delta_content_length in the page + // offset header dictionary to 0. It also states that + // min_content_offset in the per-page information is always 0, + // which is an incorrect value. + + // Implementation note 127 explains that Acrobat always sets item + // 8 (min_content_length) to zero, item 9 + // (nbits_delta_content_length) to the value of item 5 + // (nbits_delta_page_length), and item 7 of each per-page hint + // table (delta_content_length) to item 2 (delta_page_length) of + // that entry. Acrobat ignores these values when reading files. + + // Empirically, it also seems that Acrobat sometimes puts items + // under a page's /Resources dictionary in with shared objects + // even when they are private. + + unsigned int npages = pages.size(); + int table_offset = adjusted_offset( + this->page_offset_hints.first_page_offset); + ObjGen first_page_og(pages[0].getObjectID(), pages[0].getGeneration()); + assert(this->xref_table.count(first_page_og) > 0); + int offset = getLinearizationOffset(first_page_og); + if (table_offset != offset) + { + warnings.push_back("first page object offset mismatch"); + } + + for (unsigned int pageno = 0; pageno < npages; ++pageno) + { + ObjGen page_og(pages[pageno].getObjectID(), + pages[pageno].getGeneration()); + int first_object = page_og.obj; + assert(this->xref_table.count(page_og) > 0); + offset = getLinearizationOffset(page_og); + + HPageOffsetEntry& he = this->page_offset_hints.entries[pageno]; + CHPageOffsetEntry& ce = this->c_page_offset_data.entries[pageno]; + int h_nobjects = he.delta_nobjects + + this->page_offset_hints.min_nobjects; + if (h_nobjects != ce.nobjects) + { + // This happens with pdlin when there are thumbnails. + warnings.push_back( + "object count mismatch for page " + + QUtil::int_to_string(pageno) + ": hint table = " + + QUtil::int_to_string(h_nobjects) + "; computed = " + + QUtil::int_to_string(ce.nobjects)); + } + + // Use value for number of objects in hint table rather than + // computed value if there is a discrepancy. + int length = lengthNextN(first_object, h_nobjects, errors); + int h_length = he.delta_page_length + + this->page_offset_hints.min_page_length; + if (length != h_length) + { + // This condition almost certainly indicates a bad hint + // table or a bug in this code. + errors.push_back( + "page length mismatch for page " + + QUtil::int_to_string(pageno) + ": hint table = " + + QUtil::int_to_string(h_length) + "; computed length = " + + QUtil::int_to_string(length) + " (offset = " + + QUtil::int_to_string(offset) + ")"); + } + + offset += h_length; + + // Translate shared object indexes to object numbers. + std::set hint_shared; + std::set computed_shared; + + if ((pageno == 0) && (he.nshared_objects > 0)) + { + // pdlin and Acrobat both do this even though the spec + // states clearly and unambiguously that they should not. + warnings.push_back("page 0 has shared identifier entries"); + } + + for (int i = 0; i < he.nshared_objects; ++i) + { + int idx = he.shared_identifiers[i]; + assert(shared_idx_to_obj.count(idx) > 0); + hint_shared.insert(shared_idx_to_obj[idx]); + } + + for (int i = 0; i < ce.nshared_objects; ++i) + { + int idx = ce.shared_identifiers[i]; + assert(idx < this->c_shared_object_data.nshared_total); + int obj = this->c_shared_object_data.entries[idx].object; + computed_shared.insert(obj); + } + + for (std::set::iterator iter = hint_shared.begin(); + iter != hint_shared.end(); ++iter) + { + if (! computed_shared.count(*iter)) + { + // pdlin puts thumbnails here even though it shouldn't + warnings.push_back( + "page " + QUtil::int_to_string(pageno) + + ": shared object " + QUtil::int_to_string(*iter) + + ": in hint table but not computed list"); + } + } + + for (std::set::iterator iter = computed_shared.begin(); + iter != computed_shared.end(); ++iter) + { + if (! hint_shared.count(*iter)) + { + // Acrobat does not put some things including at least + // built-in fonts and procsets here, at least in some + // cases. + warnings.push_back( + "page " + QUtil::int_to_string(pageno) + + ": shared object " + QUtil::int_to_string(*iter) + + ": in computed list but not hint table"); + } + } + } +} + +void +QPDF::checkHSharedObject(std::list& errors, + std::list& warnings, + std::vector const& pages, + std::map& idx_to_obj) +{ + // Implementation note 125 says shared object groups always + // contain only one object. Implementation note 128 says that + // Acrobat always nbits_nobjects to zero. Implementation note 130 + // says that Acrobat does not support more than one shared object + // per group. These are all consistent. + + // Implementation note 129 states that MD5 signatures are not + // implemented in Acrobat, so signature_present must always be + // zero. + + // Implementation note 131 states that first_shared_obj and + // first_shared_offset have meaningless values for single-page + // files. + + // Empirically, Acrobat and pdlin generate incorrect values for + // these whenever there are no shared objects not referenced by + // the first page (i.e., nshared_total == nshared_first_page). + + HSharedObject& so = this->shared_object_hints; + if (so.nshared_total < so.nshared_first_page) + { + errors.push_back("shared object hint table: ntotal < nfirst_page"); + } + else + { + // The first nshared_first_page objects are consecutive + // objects starting with the first page object. The rest are + // consecutive starting from the first_shared_obj object. + int cur_object = pages[0].getObjectID(); + for (int i = 0; i < so.nshared_total; ++i) + { + if (i == so.nshared_first_page) + { + QTC::TC("qpdf", "QPDF lin check shared past first page"); + if (this->part8.empty()) + { + errors.push_back( + "part 8 is empty but nshared_total > " + "nshared_first_page"); + } + else + { + int obj = this->part8[0].getObjectID(); + if (obj != so.first_shared_obj) + { + errors.push_back( + "first shared object number mismatch: " + "hint table = " + + QUtil::int_to_string(so.first_shared_obj) + + "; computed = " + + QUtil::int_to_string(obj)); + } + } + + cur_object = so.first_shared_obj; + + ObjGen og(cur_object, 0); + assert(this->xref_table.count(og) > 0); + int offset = getLinearizationOffset(og); + int h_offset = adjusted_offset(so.first_shared_offset); + if (offset != h_offset) + { + errors.push_back( + "first shared object offset mismatch: hint table = " + + QUtil::int_to_string(h_offset) + "; computed = " + + QUtil::int_to_string(offset)); + } + } + + idx_to_obj[i] = cur_object; + HSharedObjectEntry& se = so.entries[i]; + int nobjects = se.nobjects_minus_one + 1; + int length = lengthNextN(cur_object, nobjects, errors); + int h_length = so.min_group_length + se.delta_group_length; + if (length != h_length) + { + errors.push_back( + "shared object " + QUtil::int_to_string(i) + + " length mismatch: hint table = " + + QUtil::int_to_string(h_length) + "; computed = " + + QUtil::int_to_string(length)); + } + cur_object += nobjects; + } + } +} + +void +QPDF::checkHOutlines(std::list& warnings) +{ + // Empirically, Acrobat generates the correct value for the object + // number but incorrectly stores the next object number's offset + // as the offset, at least when outlines appear in part 6. It + // also generates an incorrect value for length (specifically, the + // length that would cover the correct number of objects from the + // wrong starting place). pdlin appears to generate correct + // values in those cases. + + if (this->c_outline_data.nobjects == this->outline_hints.nobjects) + { + if (this->c_outline_data.nobjects == 0) + { + return; + } + + if (this->c_outline_data.first_object == + this->outline_hints.first_object) + { + // Check length and offset. Acrobat gets these wrong. + QPDFObjectHandle outlines = getRoot().getKey("/Outlines"); + ObjGen og(outlines.getObjectID(), outlines.getGeneration()); + assert(this->xref_table.count(og) > 0); + int offset = getLinearizationOffset(og); + ObjUser ou(ObjUser::ou_root_key, "/Outlines"); + int length = maxEnd(ou) - offset; + int table_offset = + adjusted_offset(this->outline_hints.first_object_offset); + if (offset != table_offset) + { + warnings.push_back( + "incorrect offset in outlines table: hint table = " + + QUtil::int_to_string(table_offset) + + "; computed = " + QUtil::int_to_string(offset)); + } + int table_length = this->outline_hints.group_length; + if (length != table_length) + { + warnings.push_back( + "incorrect length in outlines table: hint table = " + + QUtil::int_to_string(table_length) + + "; computed = " + QUtil::int_to_string(length)); + } + } + else + { + warnings.push_back("incorrect first object number in outline " + "hints table."); + } + } + else + { + warnings.push_back("incorrect object count in outline hint table"); + } +} + +void +QPDF::showLinearizationData() +{ + try + { + readLinearizationData(); + checkLinearizationInternal(); + dumpLinearizationDataInternal(); + } + catch (QPDFExc& e) + { + std::cout << e.what() << std::endl; + } +} + +void +QPDF::dumpLinearizationDataInternal() +{ + std::cout << this->file.getName() << ": linearization data:" << std::endl + << std::endl; + + std::cout + << "file_size: " << this->linp.file_size << std::endl + << "first_page_object: " << this->linp.first_page_object << std::endl + << "first_page_end: " << this->linp.first_page_end << std::endl + << "npages: " << this->linp.npages << std::endl + << "xref_zero_offset: " << this->linp.xref_zero_offset << std::endl + << "first_page: " << this->linp.first_page << std::endl + << "H_offset: " << this->linp.H_offset << std::endl + << "H_length: " << this->linp.H_length << std::endl + << std::endl; + + std::cout << "Page Offsets Hint Table" << std::endl + << std::endl; + dumpHPageOffset(); + std::cout << std::endl + << "Shared Objects Hint Table" << std::endl + << std::endl; + dumpHSharedObject(); + + if (this->outline_hints.nobjects > 0) + { + std::cout << std::endl + << "Outlines Hint Table" << std::endl + << std::endl; + dumpHGeneric(this->outline_hints); + } +} + +int +QPDF::adjusted_offset(int offset) +{ + // All offsets >= H_offset have to be increased by H_length + // since all hint table location values disregard the hint table + // itself. + if (offset >= this->linp.H_offset) + { + return offset + this->linp.H_length; + } + return offset; +} + + +void +QPDF::dumpHPageOffset() +{ + HPageOffset& t = this->page_offset_hints; + std::cout + << "min_nobjects: " << t.min_nobjects + << std::endl + << "first_page_offset: " << adjusted_offset(t.first_page_offset) + << std::endl + << "nbits_delta_nobjects: " << t.nbits_delta_nobjects + << std::endl + << "min_page_length: " << t.min_page_length + << std::endl + << "nbits_delta_page_length: " << t.nbits_delta_page_length + << std::endl + << "min_content_offset: " << t.min_content_offset + << std::endl + << "nbits_delta_content_offset: " << t.nbits_delta_content_offset + << std::endl + << "min_content_length: " << t.min_content_length + << std::endl + << "nbits_delta_content_length: " << t.nbits_delta_content_length + << std::endl + << "nbits_nshared_objects: " << t.nbits_nshared_objects + << std::endl + << "nbits_shared_identifier: " << t.nbits_shared_identifier + << std::endl + << "nbits_shared_numerator: " << t.nbits_shared_numerator + << std::endl + << "shared_denominator: " << t.shared_denominator + << std::endl; + + for (int i1 = 0; i1 < this->linp.npages; ++i1) + { + HPageOffsetEntry& pe = t.entries[i1]; + std::cout + << "Page " << i1 << ":" << std::endl + << " nobjects: " << pe.delta_nobjects + t.min_nobjects + << std::endl + << " length: " << pe.delta_page_length + t.min_page_length + << std::endl + // content offset is relative to page, not file + << " content_offset: " + << pe.delta_content_offset + t.min_content_offset << std::endl + << " content_length: " + << pe.delta_content_length + t.min_content_length << std::endl + << " nshared_objects: " << pe.nshared_objects << std::endl; + for (int i2 = 0; i2 < pe.nshared_objects; ++i2) + { + std::cout << " identifier " << i2 << ": " + << pe.shared_identifiers[i2] << std::endl; + std::cout << " numerator " << i2 << ": " + << pe.shared_numerators[i2] << std::endl; + } + } +} + +void +QPDF::dumpHSharedObject() +{ + HSharedObject& t = this->shared_object_hints; + std::cout + << "first_shared_obj: " << t.first_shared_obj + << std::endl + << "first_shared_offset: " << adjusted_offset(t.first_shared_offset) + << std::endl + << "nshared_first_page: " << t.nshared_first_page + << std::endl + << "nshared_total: " << t.nshared_total + << std::endl + << "nbits_nobjects: " << t.nbits_nobjects + << std::endl + << "min_group_length: " << t.min_group_length + << std::endl + << "nbits_delta_group_length: " << t.nbits_delta_group_length + << std::endl; + + for (int i = 0; i < t.nshared_total; ++i) + { + HSharedObjectEntry& se = t.entries[i]; + std::cout << "Shared Object " << i << ":" << std::endl; + std::cout << " group length: " + << se.delta_group_length + t.min_group_length << std::endl; + // PDF spec says signature present nobjects_minus_one are + // always 0, so print them only if they have a non-zero value. + if (se.signature_present) + { + std::cout << " signature present" << std::endl; + } + if (se.nobjects_minus_one != 0) + { + std::cout << " nobjects: " + << se.nobjects_minus_one + 1 << std::endl; + } + } +} + +void +QPDF::dumpHGeneric(HGeneric& t) +{ + std::cout + << "first_object: " << t.first_object + << std::endl + << "first_object_offset: " << adjusted_offset(t.first_object_offset) + << std::endl + << "nobjects: " << t.nobjects + << std::endl + << "group_length: " << t.group_length + << std::endl; +} + +QPDFObjectHandle +QPDF::objGenToIndirect(ObjGen const& og) +{ + return getObjectByID(og.obj, og.gen); +} + +void +QPDF::calculateLinearizationData(std::map const& object_stream_data) +{ + // This function calculates the ordering of objects, divides them + // into the appropriate parts, and computes some values for the + // linearization parameter dictionary and hint tables. The file + // must be optimized (via calling optimize()) prior to calling + // this function. Note that actual offsets and lengths are not + // computed here, but anything related to object ordering is. + + if (this->object_to_obj_users.empty()) + { + // Note that we can't call optimize here because we don't know + // whether it should be called with or without allow changes. + throw QEXC::Internal("QPDF::calculateLinearizationData " + "called before optimize()"); + } + + // Separate objects into the categories sufficient for us to + // determine which part of the linearized file should contain the + // object. This categorization is useful for other purposes as + // well. Part numbers refer to version 1.4 of the PDF spec. + + // Parts 1, 3, 5, 10, and 11 don't contain any objects from the + // original file (except the trailer dictionary in part 11). + + // Part 4 is the document catalog (root) and the following root + // keys: /ViewerPreferences, /PageMode, /Threads, /OpenAction, + // /AcroForm, /Encrypt. Note that Thread information dictionaries + // are supposed to appear in part 9, but we are disregarding that + // recommendation for now. + + // Part 6 is the first page section. It includes all remaining + // objects referenced by the first page including shared objects + // but not including thumbnails. Additionally, if /PageMode is + // /Outlines, then information from /Outlines also appears here. + + // Part 7 contains remaining objects private to pages other than + // the first page. + + // Part 8 contains all remaining shared objects except those that + // are shared only within thumbnails. + + // Part 9 contains all remaining objects. + + // We sort objects into the following categories: + + // * open_document: part 4 + + // * first_page_private: part 6 + + // * first_page_shared: part 6 + + // * other_page_private: part 7 + + // * other_page_shared: part 8 + + // * thumbnail_private: part 9 + + // * thumbnail_shared: part 9 + + // * other: part 9 + + // * outlines: part 6 or 9 + + QPDFObjectHandle root = getRoot(); + bool outlines_in_first_page = false; + QPDFObjectHandle pagemode = root.getKey("/PageMode"); + QTC::TC("qpdf", "QPDF categorize pagemode present", + pagemode.isName() ? 1 : 0); + if (pagemode.isName()) + { + if (pagemode.getName() == "/UseOutlines") + { + if (root.hasKey("/Outlines")) + { + outlines_in_first_page = true; + } + else + { + QTC::TC("qpdf", "QPDF UseOutlines but no Outlines"); + } + } + QTC::TC("qpdf", "QPDF categorize pagemode outlines", + outlines_in_first_page ? 1 : 0); + } + + std::set open_document_keys; + open_document_keys.insert("/ViewerPreferences"); + open_document_keys.insert("/PageMode"); + open_document_keys.insert("/Threads"); + open_document_keys.insert("/OpenAction"); + open_document_keys.insert("/AcroForm"); + + std::set lc_open_document; + std::set lc_first_page_private; + std::set lc_first_page_shared; + std::set lc_other_page_private; + std::set lc_other_page_shared; + std::set lc_thumbnail_private; + std::set lc_thumbnail_shared; + std::set lc_other; + std::set lc_outlines; + std::set lc_root; + + for (std::map >::iterator oiter = + this->object_to_obj_users.begin(); + oiter != this->object_to_obj_users.end(); ++oiter) + { + ObjGen const& og = (*oiter).first; + + std::set& ous = (*oiter).second; + + bool in_open_document = false; + bool in_first_page = false; + int other_pages = 0; + int thumbs = 0; + int others = 0; + bool in_outlines = false; + bool is_root = false; + + for (std::set::iterator uiter = ous.begin(); + uiter != ous.end(); ++uiter) + { + ObjUser const& ou = *uiter; + switch (ou.ou_type) + { + case ObjUser::ou_trailer_key: + if (ou.key == "/Encrypt") + { + in_open_document = true; + } + else + { + ++others; + } + break; + + case ObjUser::ou_thumb: + ++thumbs; + break; + + case ObjUser::ou_root_key: + if (open_document_keys.count(ou.key) > 0) + { + in_open_document = true; + } + else if (ou.key == "/Outlines") + { + in_outlines = true; + } + else + { + ++others; + } + break; + + case ObjUser::ou_page: + if (ou.pageno == 0) + { + in_first_page = true; + } + else + { + ++other_pages; + } + break; + + case ObjUser::ou_root: + is_root = true; + break; + + case ObjUser::ou_bad: + throw QEXC::Internal("QPDF::calculateLinearizationData: " + "invalid user type"); + break; + } + } + + if (is_root) + { + lc_root.insert(og); + } + else if (in_outlines) + { + lc_outlines.insert(og); + } + else if (in_open_document) + { + lc_open_document.insert(og); + } + else if ((in_first_page) && + (others == 0) && (other_pages == 0) && (thumbs == 0)) + { + lc_first_page_private.insert(og); + } + else if (in_first_page) + { + lc_first_page_shared.insert(og); + } + else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) + { + lc_other_page_private.insert(og); + } + else if (other_pages > 1) + { + lc_other_page_shared.insert(og); + } + else if ((thumbs == 1) && (others == 0)) + { + lc_thumbnail_private.insert(og); + } + else if (thumbs > 1) + { + lc_thumbnail_shared.insert(og); + } + else + { + lc_other.insert(og); + } + } + + // Generate ordering for objects in the output file. Sometimes we + // just dump right from a set into a vector. Rather than + // optimizing this by going straight into the vector, we'll leave + // these phases separate for now. That way, this section can be + // concerned only with ordering, and the above section can be + // considered only with categorization. Note that sets of ObjGens + // are sorted by ObjGen. In a linearized file, objects appear in + // sequence with the possible exception of hints tables which we + // won't see here anyway. That means that running + // calculateLinearizationData() on a linearized file should give + // results identical to the original file ordering. + + // We seem to traverse the page tree a lot in this code, but we + // can address this for a future code optimization if necessary. + // Premature optimization is the root of all evil. + std::vector pages; + { // local scope + // Map all page objects to the containing object stream. This + // should be a no-op in a properly linearized file. + std::vector t = getAllPages(); + for (std::vector::iterator iter = t.begin(); + iter != t.end(); ++iter) + { + pages.push_back(getUncompressedObject(*iter, object_stream_data)); + } + } + unsigned int npages = pages.size(); + + // We will be initializing some values of the computed hint + // tables. Specifically, we can initialize any items that deal + // with object numbers or counts but not any items that deal with + // lengths or offsets. The code that writes linearized files will + // have to fill in these values during the first pass. The + // validation code can compute them relatively easily given the + // rest of the information. + + this->c_linp.npages = npages; + this->c_page_offset_data.entries = std::vector(npages); + + // Part 4: open document objects. We don't care about the order. + + assert(lc_root.size() == 1); + this->part4.push_back(objGenToIndirect(*(lc_root.begin()))); + for (std::set::iterator iter = lc_open_document.begin(); + iter != lc_open_document.end(); ++iter) + { + this->part4.push_back(objGenToIndirect(*iter)); + } + + // Part 6: first page objects. Note: implementation note 124 + // states that Acrobat always treats page 0 as the first page for + // linearization regardless of /OpenAction. pdlin doesn't provide + // any option to set this and also disregards /OpenAction. We + // will do the same. + + // First, place the actual first page object itself. + ObjGen first_page_og(pages[0].getObjectID(), pages[0].getGeneration()); + if (! lc_first_page_private.count(first_page_og)) + { + throw QEXC::Internal("QPDF::calculateLinearizationData: first page " + "object not in lc_first_page_private"); + } + lc_first_page_private.erase(first_page_og); + this->c_linp.first_page_object = pages[0].getObjectID(); + this->part6.push_back(pages[0]); + + // The PDF spec "recommends" an order for the rest of the objects, + // but we are going to disregard it except to the extent that it + // groups private and shared objects contiguously for the sake of + // hint tables. + + for (std::set::iterator iter = lc_first_page_private.begin(); + iter != lc_first_page_private.end(); ++iter) + { + this->part6.push_back(objGenToIndirect(*iter)); + } + + for (std::set::iterator iter = lc_first_page_shared.begin(); + iter != lc_first_page_shared.end(); ++iter) + { + this->part6.push_back(objGenToIndirect(*iter)); + } + + // Place the outline dictionary if it goes in the first page section. + if (outlines_in_first_page) + { + pushOutlinesToPart(this->part6, lc_outlines, object_stream_data); + } + + // Fill in page offset hint table information for the first page. + // The PDF spec says that nshared_objects should be zero for the + // first page. pdlin does not appear to obey this, but it fills + // in garbage values for all the shared object identifiers on the + // first page. + + this->c_page_offset_data.entries[0].nobjects = this->part6.size(); + + // Part 7: other pages' private objects + + // For each page in order: + for (unsigned int i = 1; i < npages; ++i) + { + // Place this page's page object + + ObjGen page_og(pages[i].getObjectID(), pages[i].getGeneration()); + if (! lc_other_page_private.count(page_og)) + { + throw QEXC::Internal( + "QPDF::calculateLinearizationData: page object for page " + + QUtil::int_to_string(i) + " not in lc_other_page_private"); + } + lc_other_page_private.erase(page_og); + this->part7.push_back(pages[i]); + + // Place all non-shared objects referenced by this page, + // updating the page object count for the hint table. + + this->c_page_offset_data.entries[i].nobjects = 1; + + ObjUser ou(ObjUser::ou_page, i); + assert(this->obj_user_to_objects.count(ou) > 0); + std::set ogs = this->obj_user_to_objects[ou]; + for (std::set::iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = (*iter); + if (lc_other_page_private.count(og)) + { + lc_other_page_private.erase(og); + this->part7.push_back(objGenToIndirect(og)); + ++this->c_page_offset_data.entries[i].nobjects; + } + } + } + // That should have covered all part7 objects. + if (! lc_other_page_private.empty()) + { + throw QEXC::Internal( + "QPDF::calculateLinearizationData: lc_other_page_private is " + "not empty after generation of part7"); + } + + // Part 8: other pages' shared objects + + // Order is unimportant. + for (std::set::iterator iter = lc_other_page_shared.begin(); + iter != lc_other_page_shared.end(); ++iter) + { + this->part8.push_back(objGenToIndirect(*iter)); + } + + // Part 9: other objects + + // The PDF specification makes recommendations on ordering here. + // We follow them only to a limited extent. Specifically, we put + // the pages tree first, then private thumbnail objects in page + // order, then shared thumbnail objects, and then outlines (unless + // in part 6). After that, we throw all remaining objects in + // arbitrary order. + + // Place the pages tree. + std::set pages_ogs = + this->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")]; + assert(! pages_ogs.empty()); + for (std::set::iterator iter = pages_ogs.begin(); + iter != pages_ogs.end(); ++iter) + { + ObjGen const& og = *iter; + if (lc_other.count(og)) + { + lc_other.erase(og); + this->part9.push_back(objGenToIndirect(og)); + } + } + + // Place private thumbnail images in page order. Slightly more + // information would be required if we were going to bother with + // thumbnail hint tables. + for (unsigned int i = 0; i < npages; ++i) + { + QPDFObjectHandle thumb = pages[i].getKey("/Thumb"); + thumb = getUncompressedObject(thumb, object_stream_data); + if (! thumb.isNull()) + { + // Output the thumbnail itself + ObjGen thumb_og(thumb.getObjectID(), thumb.getGeneration()); + if (lc_thumbnail_private.count(thumb_og)) + { + lc_thumbnail_private.erase(thumb_og); + this->part9.push_back(thumb); + } + else + { + // No internal error this time...there's nothing to + // stop this object from having been referred to + // somewhere else outside of a page's /Thumb, and if + // it had been, there's nothing to prevent it from + // having been in some set other than + // lc_thumbnail_private. + } + std::set& ogs = + this->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, i)]; + for (std::set::iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = *iter; + if (lc_thumbnail_private.count(og)) + { + lc_thumbnail_private.erase(og); + this->part9.push_back(objGenToIndirect(og)); + } + } + } + } + if (! lc_thumbnail_private.empty()) + { + throw QEXC::Internal( + "QPDF::calculateLinearizationData: lc_thumbnail_private " + "not empty after placing thumbnails"); + } + + // Place shared thumbnail objects + for (std::set::iterator iter = lc_thumbnail_shared.begin(); + iter != lc_thumbnail_shared.end(); ++iter) + { + this->part9.push_back(objGenToIndirect(*iter)); + } + + // Place outlines unless in first page + if (! outlines_in_first_page) + { + pushOutlinesToPart(this->part9, lc_outlines, object_stream_data); + } + + // Place all remaining objects + for (std::set::iterator iter = lc_other.begin(); + iter != lc_other.end(); ++iter) + { + this->part9.push_back(objGenToIndirect(*iter)); + } + + // Make sure we got everything exactly once. + + unsigned int num_placed = this->part4.size() + this->part6.size() + + this->part7.size() + this->part8.size() + this->part9.size(); + unsigned int num_wanted = this->object_to_obj_users.size(); + if (num_placed != num_wanted) + { + throw QEXC::Internal("QPDF::calculateLinearizationData: wrong " + "number of objects placed (num_placed = " + + QUtil::int_to_string(num_placed) + + "; number of objects: " + + QUtil::int_to_string(num_wanted)); + } + + // Calculate shared object hint table information including + // references to shared objects from page offset hint data. + + // The shared object hint table consists of all part 6 (whether + // shared or not) in order followed by all part 8 objects in + // order. Add the objects to shared object data keeping a map of + // object number to index. Then populate the shared object + // information for the pages. + + // Note that two objects never have the same object number, so we + // can map from object number only without regards to generation. + std::map obj_to_index; + + this->c_shared_object_data.nshared_first_page = this->part6.size(); + this->c_shared_object_data.nshared_total = + this->c_shared_object_data.nshared_first_page + + this->part8.size(); + + std::vector& shared = + this->c_shared_object_data.entries; + for (std::vector::iterator iter = this->part6.begin(); + iter != this->part6.end(); ++iter) + { + QPDFObjectHandle& oh = *iter; + int obj = oh.getObjectID(); + obj_to_index[obj] = shared.size(); + shared.push_back(CHSharedObjectEntry(obj)); + } + QTC::TC("qpdf", "QPDF lin part 8 empty", this->part8.empty() ? 1 : 0); + if (! this->part8.empty()) + { + this->c_shared_object_data.first_shared_obj = + this->part8[0].getObjectID(); + for (std::vector::iterator iter = + this->part8.begin(); + iter != this->part8.end(); ++iter) + { + QPDFObjectHandle& oh = *iter; + int obj = oh.getObjectID(); + obj_to_index[obj] = shared.size(); + shared.push_back(CHSharedObjectEntry(obj)); + } + } + assert(this->c_shared_object_data.nshared_total == + (int) this->c_shared_object_data.entries.size()); + + // Now compute the list of shared objects for each page after the + // first page. + + for (unsigned int i = 1; i < npages; ++i) + { + CHPageOffsetEntry& pe = this->c_page_offset_data.entries[i]; + ObjUser ou(ObjUser::ou_page, i); + assert(this->obj_user_to_objects.count(ou) > 0); + std::set const& ogs = this->obj_user_to_objects[ou]; + for (std::set::const_iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = *iter; + if ((this->object_to_obj_users[og].size() > 1) && + (obj_to_index.count(og.obj) > 0)) + { + int idx = obj_to_index[og.obj]; + ++pe.nshared_objects; + pe.shared_identifiers.push_back(idx); + } + } + } +} + +void +QPDF::pushOutlinesToPart( + std::vector& part, + std::set& lc_outlines, + std::map const& object_stream_data) +{ + QPDFObjectHandle root = getRoot(); + QPDFObjectHandle outlines = root.getKey("/Outlines"); + if (outlines.isNull()) + { + return; + } + outlines = getUncompressedObject(outlines, object_stream_data); + ObjGen outlines_og(outlines.getObjectID(), outlines.getGeneration()); + QTC::TC("qpdf", "QPDF lin outlines in part", + ((&part == (&this->part6)) ? 0 + : (&part == (&this->part9)) ? 1 + : 9999)); // can't happen + this->c_outline_data.first_object = outlines_og.obj; + this->c_outline_data.nobjects = 1; + lc_outlines.erase(outlines_og); + part.push_back(outlines); + for (std::set::iterator iter = lc_outlines.begin(); + iter != lc_outlines.end(); ++iter) + { + part.push_back(objGenToIndirect(*iter)); + ++this->c_outline_data.nobjects; + } +} + +void +QPDF::getLinearizedParts( + std::map const& object_stream_data, + std::vector& part4, + std::vector& part6, + std::vector& part7, + std::vector& part8, + std::vector& part9) +{ + calculateLinearizationData(object_stream_data); + part4 = this->part4; + part6 = this->part6; + part7 = this->part7; + part8 = this->part8; + part9 = this->part9; +} + +static inline int nbits(int val) +{ + return (val == 0 ? 0 : (1 + nbits(val >> 1))); +} + +int +QPDF::outputLengthNextN( + int in_object, int n, + std::map const& lengths, + std::map const& obj_renumber) +{ + // Figure out the length of a series of n consecutive objects in + // the output file starting with whatever object in_object from + // the input file mapped to. + + assert(obj_renumber.count(in_object) > 0); + int first = (*(obj_renumber.find(in_object))).second; + int length = 0; + for (int i = 0; i < n; ++i) + { + assert(lengths.count(first + i) > 0); + length += (*(lengths.find(first + i))).second; + } + return length; +} + +void +QPDF::calculateHPageOffset( + std::map const& xref, + std::map const& lengths, + std::map const& obj_renumber) +{ + // Page Offset Hint Table + + // We are purposely leaving some values set to their initial zero + // values. + + std::vector const& pages = getAllPages(); + unsigned int npages = pages.size(); + CHPageOffset& cph = this->c_page_offset_data; + std::vector& cphe = cph.entries; + + // Calculate minimum and maximum values for number of objects per + // page and page length. + + int min_nobjects = cphe[0].nobjects; + int max_nobjects = min_nobjects; + int min_length = outputLengthNextN( + pages[0].getObjectID(), min_nobjects, lengths, obj_renumber); + int max_length = min_length; + int max_shared = cphe[0].nshared_objects; + + HPageOffset& ph = this->page_offset_hints; + std::vector& phe = ph.entries; + phe = std::vector(npages); + + for (unsigned int i = 0; i < npages; ++i) + { + // Calculate values for each page, assigning full values to + // the delta items. They will be adjusted later. + + // Repeat calculations for page 0 so we can assign to phe[i] + // without duplicating those assignments. + + int nobjects = cphe[i].nobjects; + int length = outputLengthNextN( + pages[i].getObjectID(), nobjects, lengths, obj_renumber); + int nshared = cphe[i].nshared_objects; + + min_nobjects = std::min(min_nobjects, nobjects); + max_nobjects = std::max(max_nobjects, nobjects); + min_length = std::min(min_length, length); + max_length = std::max(max_length, length); + max_shared = std::max(max_shared, nshared); + + phe[i].delta_nobjects = nobjects; + phe[i].delta_page_length = length; + phe[i].nshared_objects = nshared; + } + + ph.min_nobjects = min_nobjects; + int in_page0_id = pages[0].getObjectID(); + int out_page0_id = (*(obj_renumber.find(in_page0_id))).second; + ph.first_page_offset = (*(xref.find(out_page0_id))).second.getOffset(); + ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects); + ph.min_page_length = min_length; + ph.nbits_delta_page_length = nbits(max_length - min_length); + ph.nbits_nshared_objects = nbits(max_shared); + ph.nbits_shared_identifier = + nbits(this->c_shared_object_data.nshared_total); + ph.shared_denominator = 4; // doesn't matter + + // It isn't clear how to compute content offset and content + // length. Since we are not interleaving page objects with the + // content stream, we'll use the same values for content length as + // page length. We will use 0 as content offset because this is + // what Adobe does (implementation note 127) and pdlin as well. + ph.nbits_delta_content_length = ph.nbits_delta_page_length; + ph.min_content_length = ph.min_page_length; + + for (unsigned int i = 0; i < npages; ++i) + { + // Adjust delta entries + assert(phe[i].delta_nobjects >= min_nobjects); + assert(phe[i].delta_page_length >= min_length); + phe[i].delta_nobjects -= min_nobjects; + phe[i].delta_page_length -= min_length; + phe[i].delta_content_length = phe[i].delta_page_length; + + for (int j = 0; j < cphe[i].nshared_objects; ++j) + { + phe[i].shared_identifiers.push_back( + cphe[i].shared_identifiers[j]); + phe[i].shared_numerators.push_back(0); + } + } +} + +void +QPDF::calculateHSharedObject( + std::map const& xref, + std::map const& lengths, + std::map const& obj_renumber) +{ + CHSharedObject& cso = this->c_shared_object_data; + std::vector& csoe = cso.entries; + HSharedObject& so = this->shared_object_hints; + std::vector& soe = so.entries; + soe = std::vector(cso.nshared_total); + + int min_length = outputLengthNextN( + csoe[0].object, 1, lengths, obj_renumber); + int max_length = min_length; + + for (int i = 0; i < cso.nshared_total; ++i) + { + // Assign absolute numbers to deltas; adjust later + int length = outputLengthNextN( + csoe[i].object, 1, lengths, obj_renumber); + min_length = std::min(min_length, length); + max_length = std::max(max_length, length); + soe[i].delta_group_length = length; + } + + so.nshared_total = cso.nshared_total; + so.nshared_first_page = cso.nshared_first_page; + if (so.nshared_total > so.nshared_first_page) + { + so.first_shared_obj = + (*(obj_renumber.find(cso.first_shared_obj))).second; + so.first_shared_offset = + (*(xref.find(so.first_shared_obj))).second.getOffset(); + } + so.min_group_length = min_length; + so.nbits_delta_group_length = nbits(max_length - min_length); + + for (int i = 0; i < cso.nshared_total; ++i) + { + // Adjust deltas + assert(soe[i].delta_group_length >= min_length); + soe[i].delta_group_length -= min_length; + } +} + +void +QPDF::calculateHOutline( + std::map const& xref, + std::map const& lengths, + std::map const& obj_renumber) +{ + HGeneric& cho = this->c_outline_data; + + if (cho.nobjects == 0) + { + return; + } + + HGeneric& ho = this->outline_hints; + + ho.first_object = + (*(obj_renumber.find(cho.first_object))).second; + ho.first_object_offset = + (*(xref.find(ho.first_object))).second.getOffset(); + ho.nobjects = cho.nobjects; + ho.group_length = outputLengthNextN( + cho.first_object, ho.nobjects, lengths, obj_renumber); +} + +template +static void +write_vector_int(BitWriter& w, int nitems, std::vector& vec, + int bits, int T::*field) +{ + // nitems times, write bits bits from the given field of the ith + // vector to the given bit writer. + + for (int i = 0; i < nitems; ++i) + { + w.writeBits(vec[i].*field, bits); + } + // The PDF spec says that each hint table starts at a byte + // boundary. Each "row" actually must start on a byte boundary. + w.flush(); +} + +template +static void +write_vector_vector(BitWriter& w, + int nitems1, std::vector& vec1, int T::*nitems2, + int bits, std::vector T::*vec2) +{ + // nitems1 times, write nitems2 (from the ith element of vec1) items + // from the vec2 vector field of the ith item of vec1. + for (int i1 = 0; i1 < nitems1; ++i1) + { + for (int i2 = 0; i2 < vec1[i1].*nitems2; ++i2) + { + w.writeBits((vec1[i1].*vec2)[i2], bits); + } + } + w.flush(); +} + + +void +QPDF::writeHPageOffset(BitWriter& w) +{ + HPageOffset& t = this->page_offset_hints; + + w.writeBits(t.min_nobjects, 32); // 1 + w.writeBits(t.first_page_offset, 32); // 2 + w.writeBits(t.nbits_delta_nobjects, 16); // 3 + w.writeBits(t.min_page_length, 32); // 4 + w.writeBits(t.nbits_delta_page_length, 16); // 5 + w.writeBits(t.min_content_offset, 32); // 6 + w.writeBits(t.nbits_delta_content_offset, 16); // 7 + w.writeBits(t.min_content_length, 32); // 8 + w.writeBits(t.nbits_delta_content_length, 16); // 9 + w.writeBits(t.nbits_nshared_objects, 16); // 10 + w.writeBits(t.nbits_shared_identifier, 16); // 11 + w.writeBits(t.nbits_shared_numerator, 16); // 12 + w.writeBits(t.shared_denominator, 16); // 13 + + unsigned int nitems = getAllPages().size(); + std::vector& entries = t.entries; + + write_vector_int(w, nitems, entries, + t.nbits_delta_nobjects, + &HPageOffsetEntry::delta_nobjects); + write_vector_int(w, nitems, entries, + t.nbits_delta_page_length, + &HPageOffsetEntry::delta_page_length); + write_vector_int(w, nitems, entries, + t.nbits_nshared_objects, + &HPageOffsetEntry::nshared_objects); + write_vector_vector(w, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_identifier, + &HPageOffsetEntry::shared_identifiers); + write_vector_vector(w, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_numerator, + &HPageOffsetEntry::shared_numerators); + write_vector_int(w, nitems, entries, + t.nbits_delta_content_offset, + &HPageOffsetEntry::delta_content_offset); + write_vector_int(w, nitems, entries, + t.nbits_delta_content_length, + &HPageOffsetEntry::delta_content_length); +} + +void +QPDF::writeHSharedObject(BitWriter& w) +{ + HSharedObject& t = this->shared_object_hints; + + w.writeBits(t.first_shared_obj, 32); // 1 + w.writeBits(t.first_shared_offset, 32); // 2 + w.writeBits(t.nshared_first_page, 32); // 3 + w.writeBits(t.nshared_total, 32); // 4 + w.writeBits(t.nbits_nobjects, 16); // 5 + w.writeBits(t.min_group_length, 32); // 6 + w.writeBits(t.nbits_delta_group_length, 16); // 7 + + QTC::TC("qpdf", "QPDF lin write nshared_total > nshared_first_page", + (t.nshared_total > t.nshared_first_page) ? 1 : 0); + + int nitems = t.nshared_total; + std::vector& entries = t.entries; + + write_vector_int(w, nitems, entries, + t.nbits_delta_group_length, + &HSharedObjectEntry::delta_group_length); + write_vector_int(w, nitems, entries, + 1, &HSharedObjectEntry::signature_present); + for (int i = 0; i < nitems; ++i) + { + // If signature were present, we'd have to write a 128-bit hash. + assert(entries[i].signature_present == 0); + } + write_vector_int(w, nitems, entries, + t.nbits_nobjects, + &HSharedObjectEntry::nobjects_minus_one); +} + +void +QPDF::writeHGeneric(BitWriter& w, HGeneric& t) +{ + w.writeBits(t.first_object, 32); // 1 + w.writeBits(t.first_object_offset, 32); // 2 + w.writeBits(t.nobjects, 32); // 3 + w.writeBits(t.group_length, 32); // 4 +} + +void +QPDF::generateHintStream(std::map const& xref, + std::map const& lengths, + std::map const& obj_renumber, + PointerHolder& hint_buffer, + int& S, int& O) +{ + // Populate actual hint table values + calculateHPageOffset(xref, lengths, obj_renumber); + calculateHSharedObject(xref, lengths, obj_renumber); + calculateHOutline(xref, lengths, obj_renumber); + + // Write the hint stream itself into a compressed memory buffer. + // Write through a couter so we can get offsets. + Pl_Buffer hint_stream("hint stream"); + Pl_Flate f("compress hint stream", &hint_stream, Pl_Flate::a_deflate); + Pl_Count c("count", &f); + BitWriter w(&c); + + writeHPageOffset(w); + S = c.getCount(); + writeHSharedObject(w); + O = 0; + if (this->outline_hints.nobjects > 0) + { + O = c.getCount(); + writeHGeneric(w, this->outline_hints); + } + c.finish(); + + hint_buffer = hint_stream.getBuffer(); +} -- cgit v1.2.3-54-g00ecf