From b8bdef0ad12883d72ced5eb443e6e34a93bbbb91 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 25 Oct 2015 11:09:25 -0400 Subject: Implement deterministic ID For non-encrypted files, deterministic ID generation uses file contents instead of timestamp and file name. At a small runtime cost, this enables generation of the same /ID if the same inputs are converted in the same way multiple times. --- libqpdf/Pl_MD5.cc | 61 +++++++++++------ libqpdf/QPDFWriter.cc | 174 +++++++++++++++++++++++++++++++++++++++++-------- libqpdf/qpdf-c.cc | 6 ++ libqpdf/qpdf/Pl_MD5.hh | 14 ++++ 4 files changed, 208 insertions(+), 47 deletions(-) (limited to 'libqpdf') diff --git a/libqpdf/Pl_MD5.cc b/libqpdf/Pl_MD5.cc index 3a78cb33..0510e50e 100644 --- a/libqpdf/Pl_MD5.cc +++ b/libqpdf/Pl_MD5.cc @@ -3,7 +3,9 @@ Pl_MD5::Pl_MD5(char const* identifier, Pipeline* next) : Pipeline(identifier, next), - in_progress(false) + in_progress(false), + enabled(true), + persist_across_finish(false) { } @@ -14,24 +16,27 @@ Pl_MD5::~Pl_MD5() void Pl_MD5::write(unsigned char* buf, size_t len) { - if (! this->in_progress) + if (this->enabled) { - this->md5.reset(); - this->in_progress = true; - } + if (! this->in_progress) + { + this->md5.reset(); + this->in_progress = true; + } - // Write in chunks in case len is too big to fit in an int. - // Assume int is at least 32 bits. - static size_t const max_bytes = 1 << 30; - size_t bytes_left = len; - unsigned char* data = buf; - while (bytes_left > 0) - { - size_t bytes = (bytes_left >= max_bytes ? max_bytes : bytes_left); - this->md5.encodeDataIncrementally( - reinterpret_cast(data), bytes); - bytes_left -= bytes; - data += bytes; + // Write in chunks in case len is too big to fit in an int. + // Assume int is at least 32 bits. + static size_t const max_bytes = 1 << 30; + size_t bytes_left = len; + unsigned char* data = buf; + while (bytes_left > 0) + { + size_t bytes = (bytes_left >= max_bytes ? 
max_bytes : bytes_left); + this->md5.encodeDataIncrementally( + reinterpret_cast(data), bytes); + bytes_left -= bytes; + data += bytes; + } } this->getNext()->write(buf, len); @@ -41,16 +46,32 @@ void Pl_MD5::finish() { this->getNext()->finish(); - this->in_progress = false; + if (! this->persist_across_finish) + { + this->in_progress = false; + } +} + +void +Pl_MD5::enable(bool enabled) +{ + this->enabled = enabled; +} + +void +Pl_MD5::persistAcrossFinish(bool persist) +{ + this->persist_across_finish = persist; } std::string Pl_MD5::getHexDigest() { - if (this->in_progress) + if (! this->enabled) { throw std::logic_error( - "digest requested for in-progress MD5 Pipeline"); + "digest requested for a disabled MD5 Pipeline"); } + this->in_progress = false; return this->md5.unparse(); } diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc index 08647e37..ef3210c6 100644 --- a/libqpdf/QPDFWriter.cc +++ b/libqpdf/QPDFWriter.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -77,6 +78,8 @@ QPDFWriter::init() cur_stream_length = 0; added_newline = false; max_ostream_index = 0; + deterministic_id = false; + md5_pipeline = 0; } QPDFWriter::~QPDFWriter() @@ -263,6 +266,12 @@ QPDFWriter::setStaticID(bool val) this->static_id = val; } +void +QPDFWriter::setDeterministicID(bool val) +{ + this->deterministic_id = val; +} + void QPDFWriter::setStaticAesIV(bool val) { @@ -507,10 +516,10 @@ void QPDFWriter::copyEncryptionParameters(QPDF& qpdf) { this->preserve_encryption = false; - generateID(); QPDFObjectHandle trailer = qpdf.getTrailer(); if (trailer.hasKey("/Encrypt")) { + generateID(); this->id1 = trailer.getKey("/ID").getArrayItem(0).getStringValue(); QPDFObjectHandle encrypt = trailer.getKey("/Encrypt"); @@ -864,6 +873,10 @@ QPDFWriter::popPipelineStack(PointerHolder* bp) while (dynamic_cast(this->pipeline_stack.back()) == 0) { Pipeline* p = this->pipeline_stack.back(); + if (dynamic_cast(p) == this->md5_pipeline) + { + 
this->md5_pipeline = 0; + } this->pipeline_stack.pop_back(); Pl_Buffer* buf = dynamic_cast(p); if (bp && buf) @@ -921,6 +934,36 @@ QPDFWriter::pushDiscardFilter() activatePipelineStack(); } +void +QPDFWriter::pushMD5Pipeline() +{ + if (! this->id2.empty()) + { + // Can't happen in the code + throw std::logic_error( + "Deterministic ID computation enabled after ID" + " generation has already occurred."); + } + assert(this->deterministic_id); + assert(this->md5_pipeline == 0); + assert(this->pipeline->getCount() == 0); + this->md5_pipeline = new Pl_MD5("qpdf md5", this->pipeline); + this->md5_pipeline->persistAcrossFinish(true); + // Special case code in popPipelineStack clears this->md5_pipeline + // upon deletion. + pushPipeline(this->md5_pipeline); + activatePipelineStack(); +} + +void +QPDFWriter::computeDeterministicIDData() +{ + assert(this->md5_pipeline != 0); + assert(this->deterministic_id_data.empty()); + this->deterministic_id_data = this->md5_pipeline->getHexDigest(); + this->md5_pipeline->enable(false); +} + int QPDFWriter::openObject(int objid) { @@ -1068,6 +1111,13 @@ QPDFWriter::unparseChild(QPDFObjectHandle child, int level, int flags) void QPDFWriter::writeTrailer(trailer_e which, int size, bool xref_stream, qpdf_offset_t prev) +{ + writeTrailer(which, size, xref_stream, prev, 0); +} + +void +QPDFWriter::writeTrailer(trailer_e which, int size, bool xref_stream, + qpdf_offset_t prev, int linearization_pass) { QPDFObjectHandle trailer = getTrimmedTrailer(); if (! 
xref_stream) @@ -1119,8 +1169,21 @@ QPDFWriter::writeTrailer(trailer_e which, int size, bool xref_stream, // Write ID writeStringQDF(" "); writeString(" /ID ["); - writeString(QPDF_String(this->id1).unparse(true)); - writeString(QPDF_String(this->id2).unparse(true)); + if (linearization_pass == 1) + { + writeString("<00000000000000000000000000000000>" + "<00000000000000000000000000000000>"); + } + else + { + if ((linearization_pass == 0) && (this->deterministic_id)) + { + computeDeterministicIDData(); + } + generateID(); + writeString(QPDF_String(this->id1).unparse(true)); + writeString(QPDF_String(this->id2).unparse(true)); + } writeString("]"); if (which != t_lin_second) @@ -1794,12 +1857,8 @@ QPDFWriter::writeObject(QPDFObjectHandle object, int object_stream_index) void QPDFWriter::generateID() { - // Note: we can't call generateID() at the time of construction - // since the caller hasn't yet had a chance to call setStaticID(), - // but we need to generate it before computing encryption - // dictionary parameters. This is why we call this function both - // from setEncryptionParameters() and from write() and return - // immediately if the ID has already been generated. + // Generate the ID lazily so that we can handle the user's + // preference to use static or deterministic ID generation. if (! this->id2.empty()) { @@ -1822,17 +1881,40 @@ QPDFWriter::generateID() } else { - // The PDF specification has guidelines for creating IDs, but it - // states clearly that the only thing that's really important is - // that it is very likely to be unique. We can't really follow - // the guidelines in the spec exactly because we haven't written - // the file yet. This scheme should be fine though. + // The PDF specification has guidelines for creating IDs, but + // it states clearly that the only thing that's really + // important is that it is very likely to be unique. 
We can't + // really follow the guidelines in the spec exactly because we + // haven't written the file yet. This scheme should be fine + // though. The deterministic ID case uses a digest of a + // sufficient portion of the file's contents such no two + // non-matching files would match in the subsets used for this + // computation. Note that we explicitly omit the filename from + // the digest calculation for deterministic ID so that the same + // file converted with qpdf, in that case, would have the same + // ID regardless of the output file's name. std::string seed; - seed += QUtil::int_to_string(QUtil::get_current_time()); + if (this->deterministic_id) + { + if (this->deterministic_id_data.empty()) + { + QTC::TC("qpdf", "QPDFWriter deterministic with no data"); + throw std::logic_error( + "INTERNAL ERROR: QPDFWriter::generateID has no" + " data for deterministic ID. This may happen if" + " deterministic ID and file encryption are requested" + " together."); + } + seed += this->deterministic_id_data; + } + else + { + seed += QUtil::int_to_string(QUtil::get_current_time()); + seed += this->filename; + seed += " "; + } seed += " QPDF "; - seed += this->filename; - seed += " "; if (trailer.hasKey("/Info")) { QPDFObjectHandle info = trailer.getKey("/Info"); @@ -2260,8 +2342,6 @@ QPDFWriter::write() setMinimumPDFVersion("1.5"); } - generateID(); - prepareFileForWrite(); if (this->linearized) @@ -2396,6 +2476,17 @@ QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size, qpdf_offset_t prev, bool suppress_offsets, int hint_id, qpdf_offset_t hint_offset, qpdf_offset_t hint_length) +{ + // ABI compatibility + return writeXRefTable(which, first, last, size, prev, suppress_offsets, + hint_id, hint_offset, hint_length, 0); +} + +qpdf_offset_t +QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size, + qpdf_offset_t prev, bool suppress_offsets, + int hint_id, qpdf_offset_t hint_offset, + qpdf_offset_t hint_length, int linearization_pass) 
{ writeString("xref\n"); writeString(QUtil::int_to_string(first)); @@ -2426,7 +2517,7 @@ QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size, writeString(" 00000 n \n"); } } - writeTrailer(which, size, false, prev); + writeTrailer(which, size, false, prev, linearization_pass); writeString("\n"); return space_before_zero; } @@ -2435,8 +2526,9 @@ qpdf_offset_t QPDFWriter::writeXRefStream(int objid, int max_id, qpdf_offset_t max_offset, trailer_e which, int first, int last, int size) { + // ABI compatibility return writeXRefStream(objid, max_id, max_offset, - which, first, last, size, 0, 0, 0, 0, false); + which, first, last, size, 0, 0, 0, 0, false, 0); } qpdf_offset_t @@ -2445,7 +2537,8 @@ QPDFWriter::writeXRefStream(int xref_id, int max_id, qpdf_offset_t max_offset, qpdf_offset_t prev, int hint_id, qpdf_offset_t hint_offset, qpdf_offset_t hint_length, - bool skip_compression) + bool skip_compression, + int linearization_pass) { qpdf_offset_t xref_offset = this->pipeline->getCount(); qpdf_offset_t space_before_zero = xref_offset - 1; @@ -2545,7 +2638,7 @@ QPDFWriter::writeXRefStream(int xref_id, int max_id, qpdf_offset_t max_offset, QUtil::int_to_string(first) + " " + QUtil::int_to_string(last - first + 1) + " ]"); } - writeTrailer(which, size, true, prev); + writeTrailer(which, size, true, prev, linearization_pass); writeString("\nstream\n"); writeBuffer(xref_data); writeString("\nendstream"); @@ -2725,6 +2818,10 @@ QPDFWriter::writeLinearized() if (pass == 1) { pushDiscardFilter(); + if (this->deterministic_id) + { + pushMD5Pipeline(); + } } // Part 1: header @@ -2807,7 +2904,7 @@ QPDFWriter::writeLinearized() first_trailer_size, hint_length + second_xref_offset, hint_id, hint_offset, hint_length, - (pass == 1)); + (pass == 1), pass); qpdf_offset_t endpos = this->pipeline->getCount(); if (pass == 1) { @@ -2834,7 +2931,8 @@ QPDFWriter::writeLinearized() { writeXRefTable(t_lin_first, first_half_start, first_half_end, first_trailer_size, 
hint_length + second_xref_offset, - (pass == 1), hint_id, hint_offset, hint_length); + (pass == 1), hint_id, hint_offset, hint_length, + pass); writeString("startxref\n0\n%%EOF\n"); } @@ -2886,7 +2984,7 @@ QPDFWriter::writeLinearized() second_half_end, second_xref_offset, t_lin_second, 0, second_half_end, second_trailer_size, - 0, 0, 0, 0, (pass == 1)); + 0, 0, 0, 0, (pass == 1), pass); qpdf_offset_t endpos = this->pipeline->getCount(); if (pass == 1) @@ -2920,7 +3018,7 @@ QPDFWriter::writeLinearized() { space_before_zero = writeXRefTable(t_lin_second, 0, second_half_end, - second_trailer_size); + second_trailer_size, 0, false, 0, 0, 0, pass); } writeString("startxref\n"); writeString(QUtil::int_to_string(first_xref_offset)); @@ -2930,6 +3028,15 @@ QPDFWriter::writeLinearized() if (pass == 1) { + if (this->deterministic_id) + { + QTC::TC("qpdf", "QPDFWriter linearized deterministic ID", + need_xref_stream ? 0 : 1); + computeDeterministicIDData(); + popPipelineStack(); + assert(this->md5_pipeline == 0); + } + // Close first pass pipeline file_size = this->pipeline->getCount(); popPipelineStack(); @@ -2954,6 +3061,11 @@ QPDFWriter::writeLinearized() void QPDFWriter::writeStandard() { + if (this->deterministic_id) + { + pushMD5Pipeline(); + } + // Start writing writeHeader(); @@ -3005,4 +3117,12 @@ QPDFWriter::writeStandard() writeString("startxref\n"); writeString(QUtil::int_to_string(xref_offset)); writeString("\n%%EOF\n"); + + if (this->deterministic_id) + { + QTC::TC("qpdf", "QPDFWriter standard deterministic ID", + this->object_stream_to_objects.empty() ? 
0 : 1); + popPipelineStack(); + assert(this->md5_pipeline == 0); + } } diff --git a/libqpdf/qpdf-c.cc b/libqpdf/qpdf-c.cc index a46df63e..c8adb9e4 100644 --- a/libqpdf/qpdf-c.cc +++ b/libqpdf/qpdf-c.cc @@ -512,6 +512,12 @@ void qpdf_set_qdf_mode(qpdf_data qpdf, QPDF_BOOL value) qpdf->qpdf_writer->setQDFMode(value); } +void qpdf_set_deterministic_ID(qpdf_data qpdf, QPDF_BOOL value) +{ + QTC::TC("qpdf", "qpdf-c called qpdf_set_deterministic_ID"); + qpdf->qpdf_writer->setDeterministicID(value); +} + void qpdf_set_static_ID(qpdf_data qpdf, QPDF_BOOL value) { QTC::TC("qpdf", "qpdf-c called qpdf_set_static_ID"); diff --git a/libqpdf/qpdf/Pl_MD5.hh b/libqpdf/qpdf/Pl_MD5.hh index 13a0927d..c23e6b45 100644 --- a/libqpdf/qpdf/Pl_MD5.hh +++ b/libqpdf/qpdf/Pl_MD5.hh @@ -25,10 +25,24 @@ class Pl_MD5: public Pipeline virtual void finish(); QPDF_DLL std::string getHexDigest(); + // Enable/disable. Disabling the pipeline causes it to become a + // pass-through. This makes it possible to stick an MD5 pipeline + // in a pipeline when it may or may not be required. Disabling it + // avoids incurring the runtime overhead of doing needless + // digest computation. + QPDF_DLL + void enable(bool enabled); + // If persistAcrossFinish is called, calls to finish do not + // finalize the underlying md5 object. In this case, the object is + // not finalized until getHexDigest() is called. + QPDF_DLL + void persistAcrossFinish(bool); private: bool in_progress; MD5 md5; + bool enabled; + bool persist_across_finish; }; #endif // __PL_MD5_HH__ -- cgit v1.2.3-54-g00ecf