aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2017-07-28 05:42:27 +0200
committerJay Berkenbilt <ejb@ql.org>2017-07-28 05:42:27 +0200
commit7f8892525f897b17049f9e59bc4ce8ac28c9e082 (patch)
tree8f46d2ac355e8245cb2e0c764ebe0b34923e3494
parent428d96dfe19da96ac4759b190f5b25cf75ecdac6 (diff)
downloadqpdf-7f8892525f897b17049f9e59bc4ce8ac28c9e082.tar.zst
Add precheck streams capability
When requested, QPDFWriter will do more aggressive prechecking of streams to make sure it can actually succeed in decoding them before attempting to do so. This will allow preservation of raw data even when the raw data is corrupted relative to the specified filters.
-rw-r--r--ChangeLog4
-rw-r--r--include/qpdf/QPDF.hh14
-rw-r--r--include/qpdf/QPDFObjectHandle.hh3
-rw-r--r--include/qpdf/QPDFWriter.hh12
-rw-r--r--libqpdf/QPDF.cc28
-rw-r--r--libqpdf/QPDFObjectHandle.cc5
-rw-r--r--libqpdf/QPDFWriter.cc22
-rw-r--r--libqpdf/QPDF_Stream.cc17
-rw-r--r--libqpdf/qpdf/QPDF_Stream.hh3
-rw-r--r--manual/qpdf-manual.xml17
-rw-r--r--qpdf/qpdf.cc10
-rw-r--r--qpdf/qpdf.testcov1
-rw-r--r--qpdf/qtest/qpdf.test20
-rw-r--r--qpdf/qtest/qpdf/bad-data-out.pdfbin0 -> 759 bytes
-rw-r--r--qpdf/qtest/qpdf/bad-data-precheck.pdfbin0 -> 797 bytes
-rw-r--r--qpdf/qtest/qpdf/bad-data.out2
-rw-r--r--qpdf/qtest/qpdf/bad-data.pdfbin0 -> 799 bytes
17 files changed, 133 insertions, 25 deletions
diff --git a/ChangeLog b/ChangeLog
index 119a4c6c..026833d4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2017-07-27 Jay Berkenbilt <ejb@ql.org>
+ * Add --precheck-streams command-line option and setStreamPrecheck
+ option to QPDFWriter to tell QPDFWriter to attempt decoding a
+ stream fully before deciding whether to filter it or not.
+
* Recover gracefully from streams that aren't filterable because
the filter parameters are invalid in the stream dictionary or the
dictionary itself is invalid.
diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh
index 18a6851f..ef9ce597 100644
--- a/include/qpdf/QPDF.hh
+++ b/include/qpdf/QPDF.hh
@@ -540,13 +540,14 @@ class QPDF
{
friend class QPDF_Stream;
private:
- static void pipeStreamData(QPDF* qpdf, int objid, int generation,
+ static bool pipeStreamData(QPDF* qpdf, int objid, int generation,
qpdf_offset_t offset, size_t length,
QPDFObjectHandle dict,
- Pipeline* pipeline)
+ Pipeline* pipeline, bool suppress_warnings)
{
- qpdf->pipeStreamData(
- objid, generation, offset, length, dict, pipeline);
+ return qpdf->pipeStreamData(
+ objid, generation, offset, length, dict, pipeline,
+ suppress_warnings);
}
};
friend class Pipe;
@@ -666,10 +667,11 @@ class QPDF
void findAttachmentStreams();
// Calls finish() on the pipeline when done but does not delete it
- void pipeStreamData(int objid, int generation,
+ bool pipeStreamData(int objid, int generation,
qpdf_offset_t offset, size_t length,
QPDFObjectHandle dict,
- Pipeline* pipeline);
+ Pipeline* pipeline,
+ bool suppress_warnings);
// For QPDFWriter:
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index 0fc989a5..11a52596 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -394,7 +394,8 @@ class QPDFObjectHandle
// replaced if writing a new stream object.
QPDF_DLL
bool pipeStreamData(Pipeline*, bool filter,
- bool normalize, bool compress);
+ bool normalize, bool compress,
+ bool suppress_warnings = false);
// Replace a stream's dictionary. The new dictionary must be
// consistent with the stream's data. This is most appropriately
diff --git a/include/qpdf/QPDFWriter.hh b/include/qpdf/QPDFWriter.hh
index b2738c1f..2687cce0 100644
--- a/include/qpdf/QPDFWriter.hh
+++ b/include/qpdf/QPDFWriter.hh
@@ -144,6 +144,17 @@ class QPDFWriter
QPDF_DLL
void setQDFMode(bool);
+ // Enable stream precheck mode. In this mode, all filterable
+ // streams are checked by actually attempting to decode them
+ // before filtering. This may add significant time to the process
+ // of writing the data because all streams from the input must be
+ // read twice, but it enables the raw stream data to be preserved
+ // even in cases where qpdf would run into errors decoding the
+ // stream after it determines that it should be able to do it.
+ // Examples would include compressed data with errors in it.
+ QPDF_DLL
+ void setPrecheckStreams(bool);
+
// Set the minimum PDF version. If the PDF version of the input
// file (or previously set minimum version) is less than the
// version passed to this method, the PDF version of the output
@@ -415,6 +426,7 @@ class QPDFWriter
bool stream_data_mode_set;
qpdf_stream_data_e stream_data_mode;
bool qdf_mode;
+ bool precheck_streams;
bool static_id;
bool suppress_original_object_ids;
bool direct_stream_lengths;
diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc
index 32c8cdf9..b5c1212c 100644
--- a/libqpdf/QPDF.cc
+++ b/libqpdf/QPDF.cc
@@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens()
return result;
}
-void
+bool
QPDF::pipeStreamData(int objid, int generation,
qpdf_offset_t offset, size_t length,
QPDFObjectHandle stream_dict,
- Pipeline* pipeline)
+ Pipeline* pipeline,
+ bool suppress_warnings)
{
+ bool success = false;
std::vector<PointerHolder<Pipeline> > to_delete;
if (this->encrypted)
{
@@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation,
length -= len;
pipeline->write(QUtil::unsigned_char_pointer(buf), len);
}
+ success = true;
}
catch (QPDFExc& e)
{
- warn(e);
+ if (! suppress_warnings)
+ {
+ warn(e);
+ }
}
catch (std::runtime_error& e)
{
- QTC::TC("qpdf", "QPDF decoding error warning");
- warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
- "", this->file->getLastOffset(),
- "error decoding stream data for object " +
- QUtil::int_to_string(objid) + " " +
- QUtil::int_to_string(generation) + ": " + e.what()));
+ if (! suppress_warnings)
+ {
+ QTC::TC("qpdf", "QPDF decoding error warning");
+ warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
+ "", this->file->getLastOffset(),
+ "error decoding stream data for object " +
+ QUtil::int_to_string(objid) + " " +
+ QUtil::int_to_string(generation) + ": " + e.what()));
+ }
}
pipeline->finish();
+ return success;
}
void
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 7618cdf3..bac233df 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData()
bool
QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter,
- bool normalize, bool compress)
+ bool normalize, bool compress,
+ bool suppress_warnings)
{
assertStream();
return dynamic_cast<QPDF_Stream*>(obj.getPointer())->pipeStreamData(
- p, filter, normalize, compress);
+ p, filter, normalize, compress, suppress_warnings);
}
void
diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc
index 01748fc7..59e306fc 100644
--- a/libqpdf/QPDFWriter.cc
+++ b/libqpdf/QPDFWriter.cc
@@ -57,6 +57,7 @@ QPDFWriter::init()
stream_data_mode_set = false;
stream_data_mode = qpdf_s_compress;
qdf_mode = false;
+ precheck_streams = false;
static_id = false;
suppress_original_object_ids = false;
direct_stream_lengths = true;
@@ -177,6 +178,12 @@ QPDFWriter::setQDFMode(bool val)
}
void
+QPDFWriter::setPrecheckStreams(bool val)
+{
+ this->precheck_streams = val;
+}
+
+void
QPDFWriter::setMinimumPDFVersion(std::string const& version)
{
setMinimumPDFVersion(version, 0);
@@ -1522,6 +1529,21 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
flags |= f_stream;
+ if (filter && this->precheck_streams)
+ {
+ try
+ {
+ QTC::TC("qpdf", "QPDFWriter precheck stream");
+ Pl_Discard discard;
+ filter = object.pipeStreamData(
+ &discard, true, false, false, true);
+ }
+ catch (std::exception)
+ {
+ filter = false;
+ }
+ }
+
pushPipeline(new Pl_Buffer("stream data"));
activatePipelineStack();
bool filtered =
diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc
index b4d14441..31d583b8 100644
--- a/libqpdf/QPDF_Stream.cc
+++ b/libqpdf/QPDF_Stream.cc
@@ -85,7 +85,7 @@ PointerHolder<Buffer>
QPDF_Stream::getStreamData()
{
Pl_Buffer buf("stream data buffer");
- if (! pipeStreamData(&buf, true, false, false))
+ if (! pipeStreamData(&buf, true, false, false, false))
{
throw std::logic_error("getStreamData called on unfilterable stream");
}
@@ -97,7 +97,7 @@ PointerHolder<Buffer>
QPDF_Stream::getRawStreamData()
{
Pl_Buffer buf("stream data buffer");
- pipeStreamData(&buf, false, false, false);
+ pipeStreamData(&buf, false, false, false, false);
QTC::TC("qpdf", "QPDF_Stream getRawStreamData");
return buf.getBuffer();
}
@@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector<std::string>& filters,
bool
QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
- bool normalize, bool compress)
+ bool normalize, bool compress,
+ bool suppress_warnings)
{
std::vector<std::string> filters;
int predictor = 1;
@@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
else
{
QTC::TC("qpdf", "QPDF_Stream pipe original stream data");
- QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
- this->offset, this->length,
- this->stream_dict, pipeline);
+ if (! QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
+ this->offset, this->length,
+ this->stream_dict, pipeline,
+ suppress_warnings))
+ {
+ filter = false;
+ }
}
return filter;
diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh
index fa405d70..d053fd0f 100644
--- a/libqpdf/qpdf/QPDF_Stream.hh
+++ b/libqpdf/qpdf/QPDF_Stream.hh
@@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject
// See comments in QPDFObjectHandle.hh for these methods.
bool pipeStreamData(Pipeline*, bool filter,
- bool normalize, bool compress);
+ bool normalize, bool compress,
+ bool suppress_warnings);
PointerHolder<Buffer> getStreamData();
PointerHolder<Buffer> getRawStreamData();
void replaceStreamData(PointerHolder<Buffer> data,
diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml
index a4c34e90..cd35718d 100644
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@@ -822,6 +822,23 @@ outfile.pdf</option>
</listitem>
</varlistentry>
<varlistentry>
+ <term><option>--precheck-streams</option></term>
+ <listitem>
+ <para>
+ Tells qpdf to precheck each stream for the ability to decode
+ it. Ordinarily qpdf tries to decode streams that it thinks it
+ can decode based on the filters, and if there ends up being an
+ error when actually trying to do the decode, the stream data
+ is truncated. This flag causes qpdf to actually read the
+ stream fully before deciding whether to filter the stream.
+ This option will slow qpdf down since it will have to read the
+ stream twice, but it allows raw stream data to be preserved in
+ cases where the decoding of the stream would fail for some
+ reason. This may be useful in working with some damaged files.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
<term><option>--qdf</option></term>
<listitem>
<para>
diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc
index c52e1125..99cfd3a1 100644
--- a/qpdf/qpdf.cc
+++ b/qpdf/qpdf.cc
@@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\
--suppress-recovery prevents qpdf from attempting to recover damaged files\n\
--object-streams=mode controls handing of object streams\n\
--ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\
+--precheck-streams precheck ability to decode streams\n\
--qdf turns on \"QDF mode\" (below)\n\
--min-version=version sets the minimum PDF version of the output file\n\
--force-version=version forces this to be the PDF version of the output file\n\
@@ -1028,6 +1029,7 @@ int main(int argc, char* argv[])
qpdf_object_stream_e object_stream_mode = qpdf_o_preserve;
bool ignore_xref_streams = false;
bool qdf_mode = false;
+ bool precheck_streams = false;
std::string min_version;
std::string force_version;
@@ -1213,6 +1215,10 @@ int main(int argc, char* argv[])
{
qdf_mode = true;
}
+ else if (strcmp(arg, "precheck-streams") == 0)
+ {
+ precheck_streams = true;
+ }
else if (strcmp(arg, "min-version") == 0)
{
if (parameter == 0)
@@ -1704,6 +1710,10 @@ int main(int argc, char* argv[])
{
w.setQDFMode(true);
}
+ if (precheck_streams)
+ {
+ w.setPrecheckStreams(true);
+ }
if (normalize_set)
{
w.setContentNormalization(normalize);
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 268ecb16..bf227c7a 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0
QPDFObjectHandle found fake 1
QPDFObjectHandle no val for last key 0
QPDF resolve failure to null 0
+QPDFWriter precheck stream 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index b80ab9cb..b61882b9 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -723,6 +723,26 @@ $td->runtest("check output",
{$td->FILE => "from-scratch-0.pdf"});
show_ntests();
# ----------
+$td->notify("--- Precheck streams ---");
+$n_tests += 4;
+
+$td->runtest("bad stream without precheck",
+ {$td->COMMAND => "qpdf --static-id bad-data.pdf a.pdf"},
+ {$td->FILE => "bad-data.out", $td->EXIT_STATUS => 3},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "bad-data-out.pdf"});
+$td->runtest("bad stream with precheck",
+ {$td->COMMAND =>
+ "qpdf --static-id --precheck-streams bad-data.pdf a.pdf"},
+ {$td->STRING => "", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "bad-data-precheck.pdf"});
+show_ntests();
+# ----------
$td->notify("--- Copy Foreign Objects ---");
$n_tests += 7;
diff --git a/qpdf/qtest/qpdf/bad-data-out.pdf b/qpdf/qtest/qpdf/bad-data-out.pdf
new file mode 100644
index 00000000..f4300662
--- /dev/null
+++ b/qpdf/qtest/qpdf/bad-data-out.pdf
Binary files differ
diff --git a/qpdf/qtest/qpdf/bad-data-precheck.pdf b/qpdf/qtest/qpdf/bad-data-precheck.pdf
new file mode 100644
index 00000000..4314025a
--- /dev/null
+++ b/qpdf/qtest/qpdf/bad-data-precheck.pdf
Binary files differ
diff --git a/qpdf/qtest/qpdf/bad-data.out b/qpdf/qtest/qpdf/bad-data.out
new file mode 100644
index 00000000..3ea1d07f
--- /dev/null
+++ b/qpdf/qtest/qpdf/bad-data.out
@@ -0,0 +1,2 @@
+WARNING: bad-data.pdf (file position 319): error decoding stream data for object 4 0: LZWDecoder: bad code received
+qpdf: operation succeeded with warnings; resulting file may have some problems
diff --git a/qpdf/qtest/qpdf/bad-data.pdf b/qpdf/qtest/qpdf/bad-data.pdf
new file mode 100644
index 00000000..94ddafd4
--- /dev/null
+++ b/qpdf/qtest/qpdf/bad-data.pdf
Binary files differ