about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2018-01-31 03:25:51 +0100
committer Jay Berkenbilt <ejb@ql.org> 2018-02-19 03:05:46 +0100
commit fcd611b61eb6cc352b4e072fc791681ad927aee2 (patch)
tree 8354ae7b890066173bd5f42e68d7f4d8490ef0a3
parent 05ff619b09dc0c2c51f7f56dacd067f2c3baedbc (diff)
download qpdf-fcd611b61eb6cc352b4e072fc791681ad927aee2.tar.zst
Refactor parseContentStream
-rw-r--r-- ChangeLog 12
-rw-r--r-- include/qpdf/QPDFObjectHandle.hh 37
-rw-r--r-- libqpdf/QPDFObjectHandle.cc 176
-rw-r--r-- qpdf/qpdf.testcov 2
-rw-r--r-- qpdf/qtest/qpdf/split-content-stream-errors.out 2
5 files changed, 155 insertions, 74 deletions
diff --git a/ChangeLog b/ChangeLog
index e9dea347..b29e6548 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -45,6 +45,18 @@
characters may surround the EI operator that marks the end of an
inline image.
+ * New method QPDFObjectHandle::parsePageContents() to improve upon
+ QPDFObjectHandle::parseContentStream(). The parseContentStream
+ method used to operate on a single content stream, but was fixed
+ to properly handle pages with contents split across multiple
+ streams in an earlier release. The new method parsePageContents()
+ can be called on the page object rather than the value of the
+ page dictionary's /Contents key. This removes a few lines of
+ boiler-plate code from any code that uses parseContentStream, and
+ it also enables creation of more helpful error messages if
+ problems are encountered as the error messages can include
+ information about which page the streams come from.
+
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Add QPDFWriter::setLinearizationPass1Filename method and
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index cd4c4767..86fa0202 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -88,7 +88,7 @@ class QPDFObjectHandle
virtual void decryptString(std::string& val) = 0;
};
- // This class is used by parseContentStream. Callers must
+ // This class is used by parsePageContents. Callers must
// instantiate a subclass of this with handlers defined to accept
// QPDFObjectHandles that are parsed from the stream.
class ParserCallbacks
@@ -103,8 +103,8 @@ class QPDFObjectHandle
protected:
// Implementors may call this method during parsing to
- // terminate parsing early. This method throws an exception
- // that is caught by parseContentStream, so its effect is
+ // terminate parsing early. This method throws an exception
+ // that is caught by parsePageContents, so its effect is
// immediate.
QPDF_DLL
void terminateParsing();
@@ -187,6 +187,24 @@ class QPDFObjectHandle
QPDF* context);
// Helpers for parsing content streams
+
+ // Parse a page's contents through ParserCallbacks, described
+ // above. This method works whether the contents are a single
+ // stream or an array of streams. Call on a page object.
+ QPDF_DLL
+ void parsePageContents(ParserCallbacks* callbacks);
+
+ // Pipe a page's contents through the given pipeline. This method
+ // works whether the contents are a single stream or an array of
+ // streams. Call on a page object.
+ QPDF_DLL
+ void pipePageContents(Pipeline* p);
+
+ // Older method: stream_or_array should be the value of /Contents
+ // from a page object. It's more convenient to just call
+ // parsePageContents on the page object, and error messages will
+ // also be more useful because the page object information will be
+ // known.
QPDF_DLL
static void parseContentStream(QPDFObjectHandle stream_or_array,
ParserCallbacks* callbacks);
@@ -697,12 +715,17 @@ class QPDFObjectHandle
QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context,
bool content_stream);
- static void parseContentStream_internal(
- PointerHolder<Buffer> stream_data,
+ void parseContentStream_internal(
std::string const& description,
ParserCallbacks* callbacks);
-
- // Other methods
+ static void parseContentStream_data(
+ PointerHolder<Buffer>,
+ std::string const& description,
+ ParserCallbacks* callbacks);
+ std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray(
+ std::string const& description, std::string& all_description);
+ void pipeContentStreams(Pipeline* p, std::string const& description,
+ std::string& all_description);
static void warn(QPDF*, QPDFExc const&);
bool initialized;
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index fb15cb1c..1e73f9a6 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages()
}
std::vector<QPDFObjectHandle>
-QPDFObjectHandle::getPageContents()
+QPDFObjectHandle::arrayOrStreamToStreamArray(
+ std::string const& description, std::string& all_description)
{
- assertPageObject();
-
+ all_description = description;
std::vector<QPDFObjectHandle> result;
- QPDFObjectHandle contents = this->getKey("/Contents");
- if (contents.isArray())
+ if (isArray())
{
- int n_items = contents.getArrayNItems();
+ int n_items = getArrayNItems();
for (int i = 0; i < n_items; ++i)
{
- QPDFObjectHandle item = contents.getArrayItem(i);
+ QPDFObjectHandle item = getArrayItem(i);
if (item.isStream())
+ {
+ result.push_back(item);
+ }
+ else
{
- result.push_back(item);
- }
- else
- {
- throw std::runtime_error(
- "unknown item type while inspecting "
- "element of /Contents array in page "
- "dictionary");
+ QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array");
+ warn(item.getOwningQPDF(),
+ QPDFExc(qpdf_e_damaged_pdf, description,
+ "item index " + QUtil::int_to_string(i) +
+ " (from 0)", 0,
+ "ignoring non-stream in an array of streams"));
}
}
}
- else if (contents.isStream())
+ else if (isStream())
+ {
+ result.push_back(*this);
+ }
+ else if (! isNull())
{
- result.push_back(contents);
+ warn(getOwningQPDF(),
+ QPDFExc(qpdf_e_damaged_pdf, "", description, 0,
+ " object is supposed to be a stream or an"
+ " array of streams but is neither"));
}
- else if (! contents.isNull())
+
+ bool first = true;
+ for (std::vector<QPDFObjectHandle>::iterator iter = result.begin();
+ iter != result.end(); ++iter)
{
- throw std::runtime_error("unknown object type inspecting /Contents "
- "key in page dictionary");
+ QPDFObjectHandle item = *iter;
+ std::string og =
+ QUtil::int_to_string(item.getObjectID()) + " " +
+ QUtil::int_to_string(item.getGeneration());
+ if (first)
+ {
+ first = false;
+ }
+ else
+ {
+ all_description += ",";
+ }
+ all_description += " stream " + og;
}
return result;
}
+std::vector<QPDFObjectHandle>
+QPDFObjectHandle::getPageContents()
+{
+ assertPageObject();
+ std::string description = "page object " +
+ QUtil::int_to_string(this->objid) + " " +
+ QUtil::int_to_string(this->generation);
+ std::string all_description;
+ return this->getKey("/Contents").arrayOrStreamToStreamArray(
+ description, all_description);
+}
+
void
QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
{
@@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str,
}
void
-QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
- ParserCallbacks* callbacks)
+QPDFObjectHandle::pipePageContents(Pipeline* p)
{
- std::vector<QPDFObjectHandle> streams;
- if (stream_or_array.isArray())
- {
- streams = stream_or_array.getArrayAsVector();
- }
- else
- {
- streams.push_back(stream_or_array);
- }
- Pl_Buffer buf("concatenated stream data buffer");
- std::string all_description = "content stream objects";
- bool first = true;
+ std::string description = "page object " +
+ QUtil::int_to_string(this->objid) + " " +
+ QUtil::int_to_string(this->generation);
+ std::string all_description;
+ this->getKey("/Contents").pipeContentStreams(
+ p, description, all_description);
+}
+
+void
+QPDFObjectHandle::pipeContentStreams(
+ Pipeline* p, std::string const& description, std::string& all_description)
+{
+ std::vector<QPDFObjectHandle> streams =
+ arrayOrStreamToStreamArray(
+ description, all_description);
for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
iter != streams.end(); ++iter)
{
QPDFObjectHandle stream = *iter;
- if (! stream.isStream())
+ std::string og =
+ QUtil::int_to_string(stream.getObjectID()) + " " +
+ QUtil::int_to_string(stream.getGeneration());
+ std::string description = "content stream object " + og;
+ if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
{
- QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent");
+ QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
warn(stream.getOwningQPDF(),
QPDFExc(qpdf_e_damaged_pdf, "content stream",
- "", 0,
- "ignoring non-stream while parsing content streams"));
- }
- else
- {
- std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
- QUtil::int_to_string(stream.getGeneration());
- std::string description = "content stream object " + og;
- if (first)
- {
- first = false;
- }
- else
- {
- all_description += ",";
- }
- all_description += " " + og;
- if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized))
- {
- QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
- warn(stream.getOwningQPDF(),
- QPDFExc(qpdf_e_damaged_pdf, "content stream",
- description, 0,
- "errors while decoding content stream"));
- }
+ description, 0,
+ "errors while decoding content stream"));
}
}
+}
+
+void
+QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
+{
+ std::string description = "page object " +
+ QUtil::int_to_string(this->objid) + " " +
+ QUtil::int_to_string(this->generation);
+ this->getKey("/Contents").parseContentStream_internal(
+ description, callbacks);
+}
+
+void
+QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
+ ParserCallbacks* callbacks)
+{
+ stream_or_array.parseContentStream_internal(
+ "content stream objects", callbacks);
+}
+
+void
+QPDFObjectHandle::parseContentStream_internal(
+ std::string const& description,
+ ParserCallbacks* callbacks)
+{
+ Pl_Buffer buf("concatenated stream data buffer");
+ std::string all_description;
+ pipeContentStreams(&buf, description, all_description);
PointerHolder<Buffer> stream_data = buf.getBuffer();
try
{
- parseContentStream_internal(stream_data, all_description, callbacks);
+ parseContentStream_data(stream_data, all_description, callbacks);
}
catch (TerminateParsing&)
{
@@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
}
void
-QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
- std::string const& description,
- ParserCallbacks* callbacks)
+QPDFObjectHandle::parseContentStream_data(
+ PointerHolder<Buffer> stream_data,
+ std::string const& description,
+ ParserCallbacks* callbacks)
{
size_t length = stream_data->getSize();
PointerHolder<InputSource> input =
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 57fd4fd4..35ca70d3 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1
QPDFObjectHandle no val for last key 0
QPDF resolve failure to null 0
QPDFWriter preserve unreferenced standard 0
-QPDFObjectHandle non-stream in parsecontent 0
QPDFObjectHandle errors in parsecontent 0
QPDF stream with non-space 0
qpdf same file error 0
@@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0
QPDFTokenizer EOF when not allowed 0
QPDFTokenizer inline image at EOF 0
Pl_QPDFTokenizer found ID 0
+QPDFObjectHandle non-stream in stream array 0
diff --git a/qpdf/qtest/qpdf/split-content-stream-errors.out b/qpdf/qtest/qpdf/split-content-stream-errors.out
index 81e6b8cb..fbfe020d 100644
--- a/qpdf/qtest/qpdf/split-content-stream-errors.out
+++ b/qpdf/qtest/qpdf/split-content-stream-errors.out
@@ -4,6 +4,6 @@ File is not encrypted
File is not linearized
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss
-WARNING: content stream: ignoring non-stream while parsing content streams
+WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams
WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
WARNING: content stream (content stream object 6 0): errors while decoding content stream