aboutsummaryrefslogtreecommitdiffstats
path: root/examples/pdf-custom-filter.cc
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2020-12-26 17:13:00 +0100
committerJay Berkenbilt <ejb@ql.org>2020-12-28 19:03:04 +0100
commitd4d7630cf544dc295202382026658b55bf49f76b (patch)
tree2a5796a99c99f0ee3d27db7f412923e697dd8781 /examples/pdf-custom-filter.cc
parentac042d16cfe6efd1ff3295e1507558a2f87ec73f (diff)
downloadqpdf-d4d7630cf544dc295202382026658b55bf49f76b.tar.zst
Add pdf-custom-filter example
Diffstat (limited to 'examples/pdf-custom-filter.cc')
-rw-r--r--examples/pdf-custom-filter.cc532
1 files changed, 532 insertions, 0 deletions
diff --git a/examples/pdf-custom-filter.cc b/examples/pdf-custom-filter.cc
new file mode 100644
index 00000000..35f0ee38
--- /dev/null
+++ b/examples/pdf-custom-filter.cc
@@ -0,0 +1,532 @@
+#include <qpdf/QPDF.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFWriter.hh>
+#include <qpdf/QPDFStreamFilter.hh>
+
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <memory>
+
+// This example shows you everything you need to know to implement a
+// custom stream filter for encoding and decoding as well as a stream
+// data provider that modifies the stream's dictionary. This example
+// uses the pattern of having the stream data provider class use a
+// second QPDF instance with copies of streams from the original QPDF
+// so that the stream data provider can access the original stream
+// data. This is implemented very efficiently inside the qpdf library as
+// the second QPDF instance knows how to read the stream data from the
+// original input file, so no extra copies of the original stream data
+// are made.
+
+// This example creates an imaginary filter called /XORDecode. There
+// is no such filter in PDF, so the streams created by the example
+// would not be usable by any PDF reader. However, the techniques here
+// would work if you were going to implement support for a filter that
+// qpdf does not support natively. For example, using the techniques
+// shown here, it would be possible to create an application that
+// downsampled or re-encoded images or that re-compressed streams
+// using a more efficient "deflate" implementation than zlib.
+
+// Comments appear throughout the code describing each piece of code
+// and its purpose. You can read the file top to bottom, or you can
+// start with main() and follow the flow.
+
+// Please also see the test suite, qtest/custom-filter.test, which
+// contains additional comments describing how to observe the results
+// of running this example on test files that are specifically crafted
+// for it.
+
+// Program name shown in messages; assigned in main().
+static char const* whoami = nullptr;
+
+
+class Pl_XOR: public Pipeline
+{
+    // Pipeline stage for the made-up XOR codec. It is constructed
+    // with a one-byte "key" and XORs every byte it is given with that
+    // key. Applying the same key twice restores the input, so a
+    // single class serves as both encoder and decoder.
+
+  public:
+    Pl_XOR(char const* identifier, Pipeline* next, unsigned char key);
+    virtual ~Pl_XOR() = default;
+    void write(unsigned char* data, size_t len) override;
+    void finish() override;
+
+  private:
+    // The byte every data byte is XORed with.
+    unsigned char key;
+};
+
+Pl_XOR::Pl_XOR(char const* identifier, Pipeline* next, unsigned char key) :
+ Pipeline(identifier, next),
+ key(key)
+{
+}
+
+void
+Pl_XOR::write(unsigned char* data, size_t len)
+{
+    // XOR each of the len bytes at data with the key and pass the
+    // result to the next pipeline, one byte per write call. The
+    // caller's buffer is left untouched; each transformed byte lives
+    // in a local variable.
+    unsigned char const* const end = data + len;
+    for (unsigned char const* cur = data; cur != end; ++cur)
+    {
+        unsigned char xored = static_cast<unsigned char>(*cur ^ this->key);
+        getNext()->write(&xored, 1);
+    }
+}
+
+void
+Pl_XOR::finish()
+{
+    // This pipeline holds no buffered data, so finishing just means
+    // telling the next pipeline to finish.
+    Pipeline* downstream = getNext();
+    downstream->finish();
+}
+
+class SF_XORDecode: public QPDFStreamFilter
+{
+    // QPDFStreamFilter implementation for the made-up /XORDecode
+    // stream filter. It validates and interprets the filter's decode
+    // parameters (/DecodeParms) and supplies the pipeline that
+    // decodes the data. Since /XORDecode is not a real stream filter,
+    // no actual PDF reader would know how to interpret it. This is
+    // just to illustrate how to create a stream filter. In main(), we
+    // call QPDF::registerStreamFilter to tell the library about the
+    // filter. See comments in QPDFStreamFilter.hh for details on how
+    // to implement the methods. For purposes of example, we are
+    // calling this a "specialized" compression filter, which just
+    // means QPDF assumes that it should not "uncompress" the stream
+    // by default.
+  public:
+    virtual ~SF_XORDecode() = default;
+    virtual bool setDecodeParms(QPDFObjectHandle decode_parms) override;
+    virtual Pipeline* getDecodePipeline(Pipeline* next) override;
+    virtual bool isSpecializedCompression() override;
+
+  private:
+    // Single-byte XOR key stored by setDecodeParms() and handed to
+    // the pipeline created in getDecodePipeline(). It has a default
+    // value so that it never holds an indeterminate value, even if it
+    // were read before setDecodeParms() has succeeded.
+    unsigned char key = 0;
+    // It is the responsibility of the QPDFStreamFilter implementation
+    // to ensure that the pipeline returned by getDecodePipeline() is
+    // deleted when the class is deleted. The easiest way to do this
+    // is to stash the pipeline in a std::shared_ptr, which enables us
+    // to use the default destructor implementation.
+    std::shared_ptr<Pl_XOR> pipeline;
+};
+
+bool
+SF_XORDecode::setDecodeParms(QPDFObjectHandle decode_parms)
+{
+    // Extract the XOR key from the decode parameters. Returns true
+    // if the key was retrieved, meaning we are able to process with
+    // the given decode parameters, and false under any other
+    // circumstances.
+    //
+    // For purposes of example, the key lives in a separate stream
+    // rather than directly in /DecodeParms. Storing it directly
+    // would work just as well, but using a stream illustrates how
+    // one might handle something like the /JBIG2Globals key of
+    // /JBIG2Decode, which points to a stream. See comments in
+    // StreamReplacer::registerStream for additional notes on this.
+    // For other examples of QPDFStreamFilter implementations, look
+    // at the classes whose names start with SF_ in the qpdf library
+    // implementation.
+    try
+    {
+        // /DecodeParms is expected to be a dictionary whose
+        // /KeyStream key points to a one-byte stream; that single
+        // byte is the key.
+        auto key_stream = decode_parms.getKey("/KeyStream");
+        auto key_data = key_stream.getStreamData();
+        if (key_data->getSize() == 1)
+        {
+            this->key = key_data->getBuffer()[0];
+            return true;
+        }
+    }
+    catch (std::exception const& e)
+    {
+        // Report the problem and fall through to the failure return.
+        std::cerr << "Error extracting key for /XORDecode: "
+                  << e.what() << std::endl;
+    }
+    return false;
+}
+
+Pipeline*
+SF_XORDecode::getDecodePipeline(Pipeline* next)
+{
+    // Hand the qpdf library the pipeline it should pass the stream
+    // data through: it receives encoded data and passes decoded data
+    // to "next". getDecodePipeline() can always count on
+    // setDecodeParms() having been called first, so the key stored
+    // there is available here. The pipeline is kept in this object's
+    // std::shared_ptr<Pl_XOR> member so that it disappears when this
+    // object disappears; only the raw pointer is returned.
+    auto xor_pipeline = std::make_shared<Pl_XOR>("xor", next, this->key);
+    this->pipeline = xor_pipeline;
+    return xor_pipeline.get();
+}
+
+bool
+SF_XORDecode::isSpecializedCompression()
+{
+    // Report /XORDecode as a specialized compression filter. The
+    // default implementation in QPDFStreamFilter would return false;
+    // a specialized or lossy compression filter overrides one of the
+    // methods as described in QPDFStreamFilter.hh.
+    return true;
+}
+
+class StreamReplacer: public QPDFObjectHandle::StreamDataProvider
+{
+    // This class implements a StreamDataProvider that, under specific
+    // conditions, replaces the stream data with data encoded with the
+    // made-up /XORDecode filter.
+
+    // The flow for this class is as follows:
+    //
+    // * The main application iterates through streams that should be
+    //   replaced and calls registerStream. registerStream in turn
+    //   calls maybeReplace passing nullptr to pipeline and the
+    //   address of a valid QPDFObjectHandle to dict_updates. The
+    //   stream passed in for this call is the stream for the original
+    //   QPDF object. It has not yet been altered, so we have access
+    //   to its original dictionary and data. When called in this way,
+    //   maybeReplace determines whether the stream should be
+    //   replaced. If so, registerStream makes whatever changes are
+    //   required. We have to do this now because we can't modify the
+    //   stream during the writing process.
+    //
+    // * provideStreamData(), which is called by QPDFWriter during the
+    //   write process, actually writes the modified stream data. It
+    //   calls maybeReplace again, but this time it passes a valid
+    //   pipeline and passes nullptr to dict_updates. In this mode,
+    //   the stream dictionary has already been altered, and the
+    //   original stream data is no longer directly accessible. Trying
+    //   to retrieve the stream data would be an infinite loop because
+    //   it would just end up calling provideStreamData again. This is
+    //   why maybeReplace uses a stashed copy of the original stream
+    //   from the "other" QPDF object.
+
+    // Additional explanation can be found in the method
+    // implementations.
+
+  public:
+    // pdf is the QPDF object that owns the streams being replaced.
+    // The constructor is explicit so that a QPDF* is never silently
+    // converted into a StreamReplacer.
+    explicit StreamReplacer(QPDF* pdf);
+    virtual ~StreamReplacer() = default;
+    virtual void provideStreamData(int objid, int generation,
+                                   Pipeline* pipeline) override;
+
+    // Consider "stream" for replacement. "self" is the PointerHolder
+    // that holds this object; it is what gets installed as the
+    // stream's data provider.
+    void registerStream(
+        QPDFObjectHandle stream,
+        PointerHolder<QPDFObjectHandle::StreamDataProvider> self);
+
+  private:
+    bool maybeReplace(QPDFObjGen const& og,
+                      QPDFObjectHandle& stream, Pipeline* pipeline,
+                      QPDFObjectHandle* dict_updates);
+
+    // Hang onto a reference to the QPDF object containing the streams
+    // we are replacing. We need this to create a new stream.
+    QPDF* pdf;
+
+    // This second QPDF instance gives us a place to copy streams to
+    // so that we can access the original stream data of the streams
+    // whose data we are replacing.
+    QPDF other;
+
+    // Map the object/generation in original file to the copied stream
+    // in "other". We use this to retrieve the original data.
+    std::map<QPDFObjGen, QPDFObjectHandle> copied_streams;
+
+    // Each stream gets its own "key" for the XOR filter. We use a
+    // single instance of StreamReplacer for all streams, so stash all
+    // the keys here.
+    std::map<QPDFObjGen, unsigned char> keys;
+};
+
+// Remember the QPDF whose streams will be replaced and set up the
+// scratch QPDF used to hold copies of the original streams.
+StreamReplacer::StreamReplacer(QPDF* pdf) :
+    pdf{pdf}
+{
+    // "other" is only a place to stash streams. It is never written
+    // out, so it does not have to be a valid PDF with pages, etc.;
+    // an empty PDF is enough.
+    this->other.emptyPDF();
+}
+
+bool
+StreamReplacer::maybeReplace(QPDFObjGen const& og,
+                             QPDFObjectHandle& stream,
+                             Pipeline* pipeline,
+                             QPDFObjectHandle* dict_updates)
+{
+    // Decide whether a stream's data should be replaced with
+    // /XORDecode-encoded data and, depending on the mode, either
+    // record the changes to make or write the encoded data.
+    //
+    // Parameters:
+    //   og: object/generation identifying the stream; used as the
+    //     index into this->keys.
+    //   stream: the stream whose dictionary and data are examined.
+    //   pipeline: if not nullptr, the XOR-encoded data is written
+    //     to it.
+    //   dict_updates: if not nullptr, receives the keys that should
+    //     be merged into the stream dictionary, and the stream's XOR
+    //     key is recorded in this->keys.
+    //
+    // Returns true if the stream should be replaced and false
+    // otherwise.
+
+    // As described in the class comments, this method is called
+    // twice. Before writing has started pipeline is nullptr, and
+    // dict_updates is provided. In this mode, we figure out whether
+    // we should replace the stream and, if so, take care of the
+    // necessary setup. When we are actually ready to supply the data,
+    // this method is called again with pipeline populated and
+    // dict_updates as a nullptr. In this mode, we are not allowed to
+    // change anything, since writing is already in progress. We
+    // must simply provide the stream data.
+
+    // If the first call returns false, there will be no second call.
+    // If the second call returns false, something went wrong since
+    // the method should always make the same decision for a given
+    // stream.
+
+    // For this example, all the determination logic could have
+    // appeared inside the if (dict_updates) block rather than being
+    // duplicated, but in some cases, there may be a reason to
+    // duplicate things. For example, if you wanted to write code that
+    // re-encoded an image if the new encoding was more efficient,
+    // you'd have to actually try it out. Then you would either have
+    // to cache the result somewhere or just repeat the calculations,
+    // depending on space/time constraints, etc.
+
+    // In our contrived example, we are replacing the data for all
+    // streams that have /DoXOR = true in the stream dictionary. If
+    // this were a more realistic application, our criteria would be
+    // more sensible. For example, an image downsampler might choose
+    // to replace a stream that represented an image with a high pixel
+    // density.
+    auto dict = stream.getDict();
+    auto mark = dict.getKey("/DoXOR");
+    if (! (mark.isBool() && mark.getBoolValue()))
+    {
+        return false;
+    }
+
+    // We can't replace the stream data if we can't get the original
+    // stream data for any reason. A more realistic application may
+    // actually look at the data here as well, or it may be able to
+    // make all its decisions from the stream dictionary. However,
+    // it's a good idea to make sure we can retrieve the filtered data
+    // if we are going to need it later. Failure to get the data is
+    // deliberately treated as "don't replace" rather than as an
+    // error.
+    PointerHolder<Buffer> out;
+    try
+    {
+        out = stream.getStreamData();
+    }
+    catch (...)
+    {
+        return false;
+    }
+
+    if (dict_updates)
+    {
+        // It's not safe to make any modifications to any objects
+        // during the writing process since the updated objects may
+        // have already been written. In this mode, when dict_updates
+        // is provided, we have not started writing. Store the
+        // modifications we intend to make to the stream dictionary
+        // here. We're just storing /OrigLength for purposes of
+        // example. Again, a realistic application would make other
+        // changes. For example, an image resampler might change the
+        // dimensions or other properties of the image.
+        dict_updates->replaceKey(
+            "/OrigLength", QPDFObjectHandle::newInteger(
+                QIntC::to_longlong(out->getSize())));
+        // We are also storing the "key" that we will access when
+        // writing the data. The key is the low byte of (object
+        // number * data size). The product is computed in unsigned
+        // arithmetic: multiplying two ints could overflow, which is
+        // undefined behavior, whereas unsigned multiplication wraps
+        // and leaves the low byte unaffected.
+        unsigned long long const product =
+            static_cast<unsigned long long>(og.getObj()) *
+            static_cast<unsigned long long>(out->getSize());
+        this->keys[og] = static_cast<unsigned char>(product & 0xffU);
+    }
+
+    if (pipeline)
+    {
+        // Look the key up without inserting: we must not change
+        // anything while writing, and a missing key means this
+        // stream was never set up for replacement, so report that it
+        // is not being replaced instead of encoding with a made-up
+        // key.
+        auto key_entry = this->keys.find(og);
+        if (key_entry == this->keys.end())
+        {
+            return false;
+        }
+        Pl_XOR p("xor", pipeline, key_entry->second);
+        p.write(out->getBuffer(), out->getSize());
+        p.finish();
+    }
+    return true;
+}
+
+void
+StreamReplacer::registerStream(
+    QPDFObjectHandle stream,
+    PointerHolder<QPDFObjectHandle::StreamDataProvider> self)
+{
+    // Examine "stream" and, if maybeReplace says it should be
+    // replaced, do all the setup that has to happen before writing:
+    // stash a copy of the original stream, update the stream
+    // dictionary, create the key stream, and install "self" as the
+    // stream's data provider with the /XORDecode filter.
+    QPDFObjGen og(stream.getObjGen());
+
+    // A stream only needs to be processed once. This example just
+    // iterates through objects, but code that did something like
+    // iterating through images on pages might realistically
+    // encounter the same stream more than once.
+    if (this->copied_streams.find(og) != this->copied_streams.end())
+    {
+        return;
+    }
+    // Put a placeholder in copied_streams so that the stream is not
+    // processed a second time even in the negative case. The
+    // placeholder gets replaced below if needed.
+    this->copied_streams[og] = QPDFObjectHandle::newNull();
+
+    // Call maybeReplace with dict_updates and no pipeline. In this
+    // mode, it determines whether we should replace the stream data
+    // and, if so, supplies dictionary updates we should make. An
+    // exception is reported as a warning and leaves the stream
+    // alone.
+    bool should_replace = false;
+    QPDFObjectHandle dict_updates = QPDFObjectHandle::newDictionary();
+    try
+    {
+        should_replace = maybeReplace(og, stream, nullptr, &dict_updates);
+    }
+    catch (std::exception const& e)
+    {
+        stream.warnIfPossible(
+            std::string("exception while attempting to replace: ") +
+            e.what());
+    }
+    if (! should_replace)
+    {
+        return;
+    }
+
+    // Copy the stream to the other QPDF object before touching it so
+    // that the stream data provider can get to the original data.
+    this->copied_streams[og] = this->other.copyForeignObject(stream);
+
+    // Merge the requested changes into the stream dictionary.
+    auto dict = stream.getDict();
+    for (auto const& update_key: dict_updates.getKeys())
+    {
+        dict.replaceKey(update_key, dict_updates.getKey(update_key));
+    }
+
+    // Create the one-byte key stream that will be referenced from
+    // /DecodeParms. This has to happen now since objects can't be
+    // modified or created during write.
+    std::string key_data(1, static_cast<char>(this->keys[og]));
+    QPDFObjectHandle key_stream =
+        QPDFObjectHandle::newStream(this->pdf, key_data);
+
+    // Create /DecodeParms as expected by our fictitious /XORDecode
+    // filter and install the data provider.
+    QPDFObjectHandle decode_parms =
+        QPDFObjectHandle::newDictionary({{"/KeyStream", key_stream}});
+    stream.replaceStreamData(
+        self,
+        QPDFObjectHandle::newName("/XORDecode"),
+        decode_parms);
+
+    // Further, if /ProtectXOR = true, disable filtering on write so
+    // that QPDFWriter will not decode the stream even though a
+    // stream filter for /XORDecode has been registered.
+    auto protect = dict.getKey("/ProtectXOR");
+    bool const is_protected = protect.isBool() && protect.getBoolValue();
+    if (is_protected)
+    {
+        stream.setFilterOnWrite(false);
+    }
+}
+
+void
+StreamReplacer::provideStreamData(int objid, int generation,
+                                  Pipeline* pipeline)
+{
+    // Write the replacement data for the stream identified by
+    // objid/generation to pipeline. Throws std::logic_error if the
+    // stream was never passed to registerStream or if maybeReplace
+    // declines to replace it.
+    QPDFObjGen og(objid, generation);
+
+    // Look the stream up without inserting. Indexing the map would
+    // add an entry for an unknown stream, and nothing should be
+    // changed while writing is in progress.
+    auto found = this->copied_streams.find(og);
+    if (found == this->copied_streams.end())
+    {
+        throw std::logic_error(
+            "provideStreamData called for a stream that was not registered");
+    }
+    QPDFObjectHandle orig = found->second;
+
+    // Call maybeReplace again, this time with the pipeline and no
+    // dict_updates. In this mode, maybeReplace doesn't make any
+    // changes. We have to hand it the original stream data, which we
+    // get from copied_streams.
+    if (! maybeReplace(og, orig, pipeline, nullptr))
+    {
+        // Since this only gets called for streams we already
+        // determined we are replacing, a false return would indicate
+        // a logic error.
+        throw std::logic_error(
+            "maybeReplace returned false in provideStreamData");
+    }
+}
+
+// Read infilename, register every stream with a StreamReplacer, and
+// write the result to outfilename. If decode_specialized is true,
+// the writer's decode level is set to qpdf_dl_specialized.
+static void process(char const* infilename, char const* outfilename,
+                    bool decode_specialized)
+{
+    QPDF qpdf;
+    qpdf.processFile(infilename);
+
+    // One StreamReplacer instance serves all streams. The interface
+    // requires a PointerHolder in various places, so the instance is
+    // allocated and immediately stashed in a PointerHolder.
+    StreamReplacer* replacer = new StreamReplacer(&qpdf);
+    PointerHolder<QPDFObjectHandle::StreamDataProvider> holder(replacer);
+
+    // Offer every stream to registerStream. Only the ones that
+    // registerStream decides to replace will actually be replaced.
+    auto all_objects = qpdf.getAllObjects();
+    for (auto& obj: all_objects)
+    {
+        if (! obj.isStream())
+        {
+            continue;
+        }
+        replacer->registerStream(obj, holder);
+    }
+
+    QPDFWriter w(qpdf, outfilename);
+    if (decode_specialized)
+    {
+        w.setDecodeLevel(qpdf_dl_specialized);
+    }
+    // For the test suite, use static IDs.
+    w.setStaticID(true); // for testing only
+    w.write();
+    std::cout << whoami << ": new file written to " << outfilename
+              << std::endl;
+}
+
+static void usage()
+{
+ std::cerr
+ << "\n"
+ << "Usage: " << whoami << " [ --decode-specialized ] infile outfile\n"
+ << std::endl;
+ exit(2);
+}
+
+// Parse the command line, register the /XORDecode filter, and
+// process the input file. Exits with status 2 on a usage error or
+// when an exception is caught.
+int main(int argc, char* argv[])
+{
+    whoami = QUtil::getWhoami(argv[0]);
+
+    // For libtool's sake....
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+        whoami += 3;
+    }
+
+    // The first two arguments that are not --decode-specialized are
+    // the input and output file names. Any further argument is a
+    // usage error.
+    char const* infilename = nullptr;
+    char const* outfilename = nullptr;
+    bool decode_specialized = false;
+    for (int i = 1; i < argc; ++i)
+    {
+        char const* arg = argv[i];
+        if (strcmp(arg, "--decode-specialized") == 0)
+        {
+            decode_specialized = true;
+            continue;
+        }
+        if (infilename == nullptr)
+        {
+            infilename = arg;
+            continue;
+        }
+        if (outfilename == nullptr)
+        {
+            outfilename = arg;
+            continue;
+        }
+        usage();
+    }
+    if ((infilename == nullptr) || (outfilename == nullptr))
+    {
+        usage();
+    }
+
+    try
+    {
+        // Register our fictitious filter. This enables QPDFWriter to
+        // decode our streams. This is not a real filter, so no real
+        // PDF reading application would be able to interpret it. This
+        // is just for illustrative purposes.
+        QPDF::registerStreamFilter(
+            "/XORDecode", []{ return std::make_shared<SF_XORDecode>(); });
+        // Do the actual processing.
+        process(infilename, outfilename, decode_specialized);
+    }
+    catch (std::exception const& e)
+    {
+        std::cerr << whoami << ": exception: " << e.what() << std::endl;
+        exit(2);
+    }
+
+    return 0;
+}