diff options
author | Jay Berkenbilt <jberkenbilt@users.noreply.github.com> | 2023-06-17 15:26:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-17 15:26:16 +0200 |
commit | 44dce4e2988ec09e36dfb6d1fc527a143e6f597f (patch) | |
tree | 3cdb1999faeb5103eb87c36fb1673d8cbb5ab082 /examples/pdf-custom-filter.cc | |
parent | fc30e75122f6bef20ac8d676ac7cb8822feecf3e (diff) | |
parent | acd0acf16931ce92bc908e4960c5a1e43d53b550 (diff) | |
download | qpdf-44dce4e2988ec09e36dfb6d1fc527a143e6f597f.tar.zst |
Merge pull request #978 from m-holger/reflow
Code tidy - reflow comments and strings
Diffstat (limited to 'examples/pdf-custom-filter.cc')
-rw-r--r-- | examples/pdf-custom-filter.cc | 327 |
1 files changed, 136 insertions, 191 deletions
diff --git a/examples/pdf-custom-filter.cc b/examples/pdf-custom-filter.cc index 57eedf74..e3cdf164 100644 --- a/examples/pdf-custom-filter.cc +++ b/examples/pdf-custom-filter.cc @@ -8,43 +8,35 @@ #include <iostream> #include <memory> -// This example shows you everything you need to know to implement a -// custom stream filter for encoding and decoding as well as a stream -// data provider that modifies the stream's dictionary. This example -// uses the pattern of having the stream data provider class use a -// second QPDF instance with copies of streams from the original QPDF -// so that the stream data provider can access the original stream -// data. This is implemented very efficiently inside the qpdf library as -// the second QPDF instance knows how to read the stream data from the -// original input file, so no extra copies of the original stream data -// are made. - -// This example creates an imaginary filter called /XORDecode. There -// is no such filter in PDF, so the streams created by the example -// would not be usable by any PDF reader. However, the techniques here -// would work if you were going to implement support for a filter that -// qpdf does not support natively. For example, using the techniques -// shown here, it would be possible to create an application that -// downsampled or re-encoded images or that re-compressed streams -// using a more efficient "deflate" implementation than zlib. - -// Comments appear throughout the code describing each piece of code -// and its purpose. You can read the file top to bottom, or you can -// start with main() and follow the flow. - -// Please also see the test suite, qtest/custom-filter.test, which -// contains additional comments describing how to observe the results -// of running this example on test files that are specifically crafted -// for it. 
+// This example shows you everything you need to know to implement a custom stream filter for +// encoding and decoding as well as a stream data provider that modifies the stream's dictionary. +// This example uses the pattern of having the stream data provider class use a second QPDF instance +// with copies of streams from the original QPDF so that the stream data provider can access the +// original stream data. This is implemented very efficiently inside the qpdf library as the second +// QPDF instance knows how to read the stream data from the original input file, so no extra copies +// of the original stream data are made. + +// This example creates an imaginary filter called /XORDecode. There is no such filter in PDF, so +// the streams created by the example would not be usable by any PDF reader. However, the techniques +// here would work if you were going to implement support for a filter that qpdf does not support +// natively. For example, using the techniques shown here, it would be possible to create an +// application that downsampled or re-encoded images or that re-compressed streams using a more +// efficient "deflate" implementation than zlib. + +// Comments appear throughout the code describing each piece of code and its purpose. You can read +// the file top to bottom, or you can start with main() and follow the flow. + +// Please also see the test suite, qtest/custom-filter.test, which contains additional comments +// describing how to observe the results of running this example on test files that are specifically +// crafted for it. static char const* whoami = nullptr; class Pl_XOR: public Pipeline { - // This class implements a Pipeline for the made-up XOR decoder. - // It is initialized with a single-byte "key" and just XORs each - // byte with that key. This makes it reversible, so there is no - // distinction between encoding and decoding. + // This class implements a Pipeline for the made-up XOR decoder. 
It is initialized with a + // single-byte "key" and just XORs each byte with that key. This makes it reversible, so there + // is no distinction between encoding and decoding. public: Pl_XOR(char const* identifier, Pipeline* next, unsigned char key); @@ -79,17 +71,14 @@ Pl_XOR::finish() class SF_XORDecode: public QPDFStreamFilter { - // This class implements a QPDFStreamFilter that knows how to - // validate and interpret decode parameters (/DecodeParms) for the - // made-up /XORDecode stream filter. Since this is not a real - // stream filter, no actual PDF reader would know how to interpret - // it. This is just to illustrate how to create a stream filter. - // In main(), we call QPDF::registerStreamFilter to tell the - // library about the filter. See comments in QPDFStreamFilter.hh - // for details on how to implement the methods. For purposes of - // example, we are calling this a "specialized" compression - // filter, which just means QPDF assumes that it should not - // "uncompress" the stream by default. + // This class implements a QPDFStreamFilter that knows how to validate and interpret decode + // parameters (/DecodeParms) for the made-up /XORDecode stream filter. Since this is not a real + // stream filter, no actual PDF reader would know how to interpret it. This is just to + // illustrate how to create a stream filter. In main(), we call QPDF::registerStreamFilter to + // tell the library about the filter. See comments in QPDFStreamFilter.hh for details on how to + // implement the methods. For purposes of example, we are calling this a "specialized" + // compression filter, which just means QPDF assumes that it should not "uncompress" the stream + // by default. 
public: ~SF_XORDecode() override = default; bool setDecodeParms(QPDFObjectHandle decode_parms) override; @@ -98,33 +87,28 @@ class SF_XORDecode: public QPDFStreamFilter private: unsigned char key; - // It is the responsibility of the QPDFStreamFilter implementation - // to ensure that the pipeline returned by getDecodePipeline() is - // deleted when the class is deleted. The easiest way to do this - // is to stash the pipeline in a std::shared_ptr, which enables us - // to use the default destructor implementation. + // It is the responsibility of the QPDFStreamFilter implementation to ensure that the pipeline + // returned by getDecodePipeline() is deleted when the class is deleted. The easiest way to do + // this is to stash the pipeline in a std::shared_ptr, which enables us to use the default + // destructor implementation. std::shared_ptr<Pl_XOR> pipeline; }; bool SF_XORDecode::setDecodeParms(QPDFObjectHandle decode_parms) { - // For purposes of example, we store the key in a separate stream. - // We could just as well store the key directly in /DecodeParms, - // but this example uses a stream to illustrate how one might do - // that. For example, if implementing /JBIG2Decode, one would need - // to handle the /JBIG2Globals key, which points to a stream. See - // comments in SF_XORDecode::registerStream for additional notes - // on this. + // For purposes of example, we store the key in a separate stream. We could just as well store + // the key directly in /DecodeParms, but this example uses a stream to illustrate how one might + // do that. For example, if implementing /JBIG2Decode, one would need to handle the + // /JBIG2Globals key, which points to a stream. See comments in SF_XORDecode::registerStream for + // additional notes on this. try { - // Expect /DecodeParms to be a dictionary with a /KeyStream - // key that points to a one-byte stream whose single byte is - // the key. 
If we are successful at retrieving the key, return - // true, indicating that we are able to process with the given - // decode parameters. Under any other circumstances, return - // false. For other examples of QPDFStreamFilter - // implementations, look at the classes whose names start with - // SF_ in the qpdf library implementation. + // Expect /DecodeParms to be a dictionary with a /KeyStream key that points to a one-byte + // stream whose single byte is the key. If we are successful at retrieving the key, return + // true, indicating that we are able to process with the given decode parameters. Under any + // other circumstances, return false. For other examples of QPDFStreamFilter + // implementations, look at the classes whose names start with SF_ in the qpdf library + // implementation. auto buf = decode_parms.getKey("/KeyStream").getStreamData(); if (buf->getSize() != 1) { return false; @@ -140,14 +124,12 @@ SF_XORDecode::setDecodeParms(QPDFObjectHandle decode_parms) Pipeline* SF_XORDecode::getDecodePipeline(Pipeline* next) { - // Return a pipeline that the qpdf library should pass the stream - // data through. The pipeline should receive encoded data and pass - // decoded data to "next". getDecodePipeline() can always count on - // setDecodeParms() having been called first. The setDecodeParms() - // method should store any parameters needed by the pipeline. To - // ensure that the pipeline we return disappears when the class - // disappears, stash it in a std::shared_ptr<Pl_XOR> and retrieve - // the raw pointer from there. + // Return a pipeline that the qpdf library should pass the stream data through. The pipeline + // should receive encoded data and pass decoded data to "next". getDecodePipeline() can always + // count on setDecodeParms() having been called first. The setDecodeParms() method should store + // any parameters needed by the pipeline. 
To ensure that the pipeline we return disappears when + // the class disappears, stash it in a std::shared_ptr<Pl_XOR> and retrieve the raw pointer from + // there. this->pipeline = std::make_shared<Pl_XOR>("xor", next, this->key); return this->pipeline.get(); } @@ -155,46 +137,37 @@ SF_XORDecode::getDecodePipeline(Pipeline* next) bool SF_XORDecode::isSpecializedCompression() { - // The default implementation of QPDFStreamFilter would return - // false, so if you want a specialized or lossy compression - // filter, override one of the methods as described in + // The default implementation of QPDFStreamFilter would return false, so if you want a + // specialized or lossy compression filter, override one of the methods as described in // QPDFStreamFilter.hh. return true; } class StreamReplacer: public QPDFObjectHandle::StreamDataProvider { - // This class implements a StreamDataProvider that, under specific - // conditions, replaces the stream data with data encoded with the - // made-up /XORDecode filter. + // This class implements a StreamDataProvider that, under specific conditions, replaces the + // stream data with data encoded with the made-up /XORDecode filter. // The flow for this class is as follows: // - // * The main application iterates through streams that should be - // replaced and calls registerStream. registerStream in turn - // calls maybeReplace passing nullptr to pipeline and the - // address of a valid QPDFObjectHandle to dict_updates. The - // stream passed in for this call is the stream for the original - // QPDF object. It has not yet been altered, so we have access - // to its original dictionary and data. As described in the - // method, the method when called in this way makes a - // determination as to whether the stream should be replaced. If - // so, registerStream makes whatever changes are required. We - // have to do this now because we can't modify the stream during - // the writing process. 
+ // * The main application iterates through streams that should be replaced and calls + // registerStream. registerStream in turn calls maybeReplace passing nullptr to pipeline and + // the address of a valid QPDFObjectHandle to dict_updates. The stream passed in for this call + // is the stream for the original QPDF object. It has not yet been altered, so we have access + // to its original dictionary and data. As described in the method, the method when called in + // this way makes a determination as to whether the stream should be replaced. If so, + // registerStream makes whatever changes are required. We have to do this now because we can't + // modify the stream during the writing process. // - // * provideStreamData(), which is called by QPDFWriter during the - // write process, actually writes the modified stream data. It - // calls maybeReplace again, but this time it passes a valid - // pipeline and passes nullptr to dict_updates. In this mode, - // the stream dictionary has already been altered, and the - // original stream data is no longer directly accessible. Trying - // to retrieve the stream data would cause an infinite loop because - // it would just end up calling provideStreamData again. This is - // why maybeReplace uses a stashed copy of the original stream. - - // Additional explanation can be found in the method - // implementations. + // * provideStreamData(), which is called by QPDFWriter during the write process, actually + // writes the modified stream data. It calls maybeReplace again, but this time it passes a + // valid pipeline and passes nullptr to dict_updates. In this mode, the stream dictionary has + // already been altered, and the original stream data is no longer directly accessible. Trying + // to retrieve the stream data would cause an infinite loop because it would just end up + // calling provideStreamData again. This is why maybeReplace uses a stashed copy of the + // original stream. 
+ + // Additional explanation can be found in the method implementations. public: StreamReplacer(QPDF* pdf); @@ -211,17 +184,16 @@ class StreamReplacer: public QPDFObjectHandle::StreamDataProvider Pipeline* pipeline, QPDFObjectHandle* dict_updates); - // Hang onto a reference to the QPDF object containing the streams - // we are replacing. We need this to create a new stream. + // Hang onto a reference to the QPDF object containing the streams we are replacing. We need + // this to create a new stream. QPDF* pdf; - // Map the object/generation in original file to the copied stream - // in "other". We use this to retrieve the original data. + // Map the object/generation in original file to the copied stream in "other". We use this to + // retrieve the original data. std::map<QPDFObjGen, QPDFObjectHandle> copied_streams; - // Each stream gets its own "key" for the XOR filter. We use a - // single instance of StreamReplacer for all streams, so stash all - // the keys here. + // Each stream gets its own "key" for the XOR filter. We use a single instance of StreamReplacer + // for all streams, so stash all the keys here. std::map<QPDFObjGen, unsigned char> keys; }; @@ -237,49 +209,38 @@ StreamReplacer::maybeReplace( Pipeline* pipeline, QPDFObjectHandle* dict_updates) { - // As described in the class comments, this method is called - // twice. Before writing has started pipeline is nullptr, and - // dict_updates is provided. In this mode, we figure out whether - // we should replace the stream and, if so, take care of the - // necessary setup. When we are actually ready to supply the data, - // this method is called again with pipeline populated and - // dict_updates as a nullptr. In this mode, we are not allowed to - // change anything, since writing is already in progress. We - // must simply provide the stream data. - - // The return value indicates whether or not we should replace the - // stream. If the first call returns false, there will be no - // second call. 
If the second call returns false, something went - // wrong since the method should always make the same decision for - // a given stream. - - // For this example, all the determination logic could have - // appeared inside the if (dict_updates) block rather than being - // duplicated, but in some cases, there may be a reason to - // duplicate things. For example, if you wanted to write code that - // re-encoded an image if the new encoding was more efficient, - // you'd have to actually try it out. Then you would either have - // to cache the result somewhere or just repeat the calculations, - // depending on space/time constraints, etc. - - // In our contrived example, we are replacing the data for all - // streams that have /DoXOR = true in the stream dictionary. If - // this were a more realistic application, our criteria would be - // more sensible. For example, an image downsampler might choose - // to replace a stream that represented an image with a high pixel - // density. + // As described in the class comments, this method is called twice. Before writing has started + // pipeline is nullptr, and dict_updates is provided. In this mode, we figure out whether we + // should replace the stream and, if so, take care of the necessary setup. When we are actually + // ready to supply the data, this method is called again with pipeline populated and + // dict_updates as a nullptr. In this mode, we are not allowed to change anything, since writing + // is already in progress. We must simply provide the stream data. + + // The return value indicates whether or not we should replace the stream. If the first call + // returns false, there will be no second call. If the second call returns false, something went + // wrong since the method should always make the same decision for a given stream. 
+ + // For this example, all the determination logic could have appeared inside the if + // (dict_updates) block rather than being duplicated, but in some cases, there may be a reason + // to duplicate things. For example, if you wanted to write code that re-encoded an image if the + // new encoding was more efficient, you'd have to actually try it out. Then you would either + // have to cache the result somewhere or just repeat the calculations, depending on space/time + // constraints, etc. + + // In our contrived example, we are replacing the data for all streams that have /DoXOR = true + // in the stream dictionary. If this were a more realistic application, our criteria would be + // more sensible. For example, an image downsampler might choose to replace a stream that + // represented an image with a high pixel density. auto dict = stream.getDict(); auto mark = dict.getKey("/DoXOR"); if (!(mark.isBool() && mark.getBoolValue())) { return false; } - // We can't replace the stream data if we can't get the original - // stream data for any reason. A more realistic application may - // actually look at the data here as well, or it may be able to - // make all its decisions from the stream dictionary. However, - // it's a good idea to make sure we can retrieve the filtered data - // if we are going to need it later. + // We can't replace the stream data if we can't get the original stream data for any reason. A + // more realistic application may actually look at the data here as well, or it may be able to + // make all its decisions from the stream dictionary. However, it's a good idea to make sure we + // can retrieve the filtered data if we are going to need it later. std::shared_ptr<Buffer> out; try { out = stream.getStreamData(); @@ -288,19 +249,15 @@ StreamReplacer::maybeReplace( } if (dict_updates) { - // It's not safe to make any modifications to any objects - // during the writing process since the updated objects may - // have already been written. 
In this mode, when dict_updates - // is provided, we have not started writing. Store the - // modifications we intend to make to the stream dictionary - // here. We're just storing /OrigLength for purposes of - // example. Again, a realistic application would make other - // changes. For example, an image resampler might change the - // dimensions or other properties of the image. + // It's not safe to make any modifications to any objects during the writing process since + // the updated objects may have already been written. In this mode, when dict_updates is + // provided, we have not started writing. Store the modifications we intend to make to the + // stream dictionary here. We're just storing /OrigLength for purposes of example. Again, a + // realistic application would make other changes. For example, an image resampler might + // change the dimensions or other properties of the image. dict_updates->replaceKey( "/OrigLength", QPDFObjectHandle::newInteger(QIntC::to_longlong(out->getSize()))); - // We are also storing the "key" that we will access when - // writing the data. + // We are also storing the "key" that we will access when writing the data. this->keys[og] = QIntC::to_uchar((og.getObj() * QIntC::to_int(out->getSize())) & 0xff); } @@ -319,21 +276,18 @@ StreamReplacer::registerStream( { QPDFObjGen og(stream.getObjGen()); - // We don't need to process a stream more than once. In this - // example, we are just iterating through objects, but if we were - // doing something like iterating through images on pages, we + // We don't need to process a stream more than once. In this example, we are just iterating + // through objects, but if we were doing something like iterating through images on pages, we // might realistically encounter the same stream more than once. if (this->copied_streams.count(og) > 0) { return; } - // Store something in copied_streams so that we don't - // double-process even in the negative case. 
This gets replaced - // later if needed. + // Store something in copied_streams so that we don't double-process even in the negative case. + // This gets replaced later if needed. this->copied_streams[og] = QPDFObjectHandle::newNull(); - // Call maybeReplace with dict_updates. In this mode, it - // determines whether we should replace the stream data and, if - // so, supplies dictionary updates we should make. + // Call maybeReplace with dict_updates. In this mode, it determines whether we should replace + // the stream data and, if so, supplies dictionary updates we should make. bool should_replace = false; QPDFObjectHandle dict_updates = QPDFObjectHandle::newDictionary(); try { @@ -343,9 +297,8 @@ StreamReplacer::registerStream( } if (should_replace) { - // Copy the stream so we can get to the original data from the - // stream data provider. This doesn't actually copy any data, - // but the copy retains the original stream data after the + // Copy the stream so we can get to the original data from the stream data provider. This + // doesn't actually copy any data, but the copy retains the original stream data after the // original one is modified. this->copied_streams[og] = stream.copyStream(); // Update the stream dictionary with any changes. @@ -353,20 +306,17 @@ StreamReplacer::registerStream( for (auto const& k: dict_updates.getKeys()) { dict.replaceKey(k, dict_updates.getKey(k)); } - // Create the key stream that will be referenced from - // /DecodeParms. We have to do this now since you can't modify - // or create objects during write. + // Create the key stream that will be referenced from /DecodeParms. We have to do this now + // since you can't modify or create objects during write. char p[1] = {static_cast<char>(this->keys[og])}; std::string p_str(p, 1); QPDFObjectHandle dp_stream = this->pdf->newStream(p_str); - // Create /DecodeParms as expected by our fictitious - // /XORDecode filter. 
+ // Create /DecodeParms as expected by our fictitious /XORDecode filter. QPDFObjectHandle decode_parms = QPDFObjectHandle::newDictionary({{"/KeyStream", dp_stream}}); stream.replaceStreamData(self, QPDFObjectHandle::newName("/XORDecode"), decode_parms); - // Further, if /ProtectXOR = true, we disable filtering on write - // so that QPDFWriter will not decode the stream even though we - // have registered a stream filter for /XORDecode. + // Further, if /ProtectXOR = true, we disable filtering on write so that QPDFWriter will not + // decode the stream even though we have registered a stream filter for /XORDecode. auto protect = dict.getKey("/ProtectXOR"); if (protect.isBool() && protect.getBoolValue()) { stream.setFilterOnWrite(false); @@ -378,14 +328,12 @@ void StreamReplacer::provideStreamData(QPDFObjGen const& og, Pipeline* pipeline) { QPDFObjectHandle orig = this->copied_streams[og]; - // call maybeReplace again, this time with the pipeline and no - // dict_updates. In this mode, maybeReplace doesn't make any - // changes. We have to hand it the original stream data, which we + // call maybeReplace again, this time with the pipeline and no dict_updates. In this mode, + // maybeReplace doesn't make any changes. We have to hand it the original stream data, which we // get from copied_streams. if (!maybeReplace(og, orig, pipeline, nullptr)) { - // Since this only gets called for streams we already - // determined we are replacing, a false return would indicate - // a logic error. + // Since this only gets called for streams we already determined we are replacing, a false + // return would indicate a logic error. throw std::logic_error("should_replace return false in provideStreamData"); } } @@ -396,17 +344,15 @@ process(char const* infilename, char const* outfilename, bool decode_specialized QPDF qpdf; qpdf.processFile(infilename); - // Create a single StreamReplacer instance. 
The interface requires - // a std::shared_ptr in various places, so allocate a StreamReplacer - // and stash it in a std::shared_ptr. + // Create a single StreamReplacer instance. The interface requires a std::shared_ptr in various + // places, so allocate a StreamReplacer and stash it in a std::shared_ptr. auto* replacer = new StreamReplacer(&qpdf); std::shared_ptr<QPDFObjectHandle::StreamDataProvider> p(replacer); for (auto& o: qpdf.getAllObjects()) { if (o.isStream()) { - // Call registerStream for every stream. Only ones that - // registerStream decides to replace will actually be - // replaced. + // Call registerStream for every stream. Only ones that registerStream decides to + // replace will actually be replaced. replacer->registerStream(o, p); } } @@ -454,9 +400,8 @@ main(int argc, char* argv[]) } try { - // Register our fictitious filter. This enables QPDFWriter to - // decode our streams. This is not a real filter, so no real - // PDF reading application would be able to interpret it. This + // Register our fictitious filter. This enables QPDFWriter to decode our streams. This is + // not a real filter, so no real PDF reading application would be able to interpret it. This // is just for illustrative purposes. QPDF::registerStreamFilter("/XORDecode", [] { return std::make_shared<SF_XORDecode>(); }); // Do the actual processing. |