From 3f1ab640669ac493f1b2985b70322dba7c037ac9 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Thu, 22 Aug 2019 19:16:25 -0400 Subject: Pass offset and length to ParserCallbacks::handleObject --- ChangeLog | 14 ++ examples/pdf-parse-content.cc | 15 ++- examples/qtest/parse-content/content.out | 21 +-- include/qpdf/QPDFObjectHandle.hh | 16 ++- libqpdf/QPDFObjectHandle.cc | 39 +++++- manual/qpdf-manual.xml | 19 +++ qpdf/qtest/qpdf/eof-in-inline-image.out | 49 +++---- qpdf/qtest/qpdf/terminate-parsing.out | 168 ++++++++++++------------ qpdf/qtest/qpdf/tokenize-content-streams.out | 189 ++++++++++++++------------- qpdf/qtest/qpdf/tokenize-content-streams.pdf | Bin 1539 -> 1575 bytes qpdf/test_driver.cc | 15 ++- 11 files changed, 326 insertions(+), 219 deletions(-) diff --git a/ChangeLog b/ChangeLog index a270f54f..af06bf3f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,19 @@ 2019-08-22 Jay Berkenbilt + * In QPDFObjectHandle::ParserCallbacks, in addition to + handleObject(QPDFObjectHandle), allow developers to override + handleObject(QPDFObjectHandle, size_t offset, size_t length). If + this method appears instead, it is called with the offset of the + object in the content stream (which may be concatenated from an + array of streams) and the length of the object. Intervening + whitespace and comments are not included in offset and length. + + * Add method + QPDFObjectHandle::ParserCallbacks::contentSize(size_t). If + defined, it is called by the content stream parser before the + first call to handleObject, and the argument is the total size in + bytes of the content streams. + * Add QPDFObjectHandle::isDirectNull() -- a const method that allows determining whether an object is a literal null without attempting to resolve it. 
diff --git a/examples/pdf-parse-content.cc b/examples/pdf-parse-content.cc index 254fcdfe..a8cd3290 100644 --- a/examples/pdf-parse-content.cc +++ b/examples/pdf-parse-content.cc @@ -26,14 +26,23 @@ class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks { } - virtual void handleObject(QPDFObjectHandle); + virtual void contentSize(size_t); + virtual void handleObject(QPDFObjectHandle, size_t offset, size_t length); virtual void handleEOF(); }; void -ParserCallbacks::handleObject(QPDFObjectHandle obj) +ParserCallbacks::contentSize(size_t size) { - std::cout << obj.getTypeName() << ": "; + std::cout << "content size: " << size << std::endl; +} + +void +ParserCallbacks::handleObject(QPDFObjectHandle obj, + size_t offset, size_t length) +{ + std::cout << obj.getTypeName() << ", offset=" << offset + << ", length=" << length << ": "; if (obj.isInlineImage()) { std::cout << QUtil::hex_encode(obj.getInlineImageValue()) << std::endl; diff --git a/examples/qtest/parse-content/content.out b/examples/qtest/parse-content/content.out index b0c041e5..d2eb5fe8 100644 --- a/examples/qtest/parse-content/content.out +++ b/examples/qtest/parse-content/content.out @@ -1,11 +1,12 @@ -operator: BT -name: /F1 -integer: 24 -operator: Tf -integer: 72 -integer: 720 -operator: Td -string: (Potato) -operator: Tj -operator: ET +content size: 44 +operator, offset=0, length=2: BT +name, offset=5, length=3: /F1 +integer, offset=9, length=2: 24 +operator, offset=12, length=2: Tf +integer, offset=17, length=2: 72 +integer, offset=20, length=3: 720 +operator, offset=24, length=2: Td +string, offset=29, length=8: (Potato) +operator, offset=38, length=2: Tj +operator, offset=41, length=2: ET -EOF- diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 58708f72..08782a89 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -159,16 +159,28 @@ class QPDFObjectHandle // This class is used by parsePageContents. 
Callers must // instantiate a subclass of this with handlers defined to accept // QPDFObjectHandles that are parsed from the stream. - class ParserCallbacks + class QPDF_DLL_CLASS ParserCallbacks { public: QPDF_DLL virtual ~ParserCallbacks() { } - virtual void handleObject(QPDFObjectHandle) = 0; + // One of the handleObject methods must be overridden. + QPDF_DLL + virtual void handleObject(QPDFObjectHandle); + QPDF_DLL + virtual void handleObject( + QPDFObjectHandle, size_t offset, size_t length); + virtual void handleEOF() = 0; + // Override this if you want to know the full size of the + // contents, possibly after concatenation of multiple streams. + // This is called before the first call to handleObject. + QPDF_DLL + virtual void contentSize(size_t); + protected: // Implementors may call this method during parsing to // terminate parsing early. This method throws an exception diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index c58675a4..04149b22 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -105,6 +105,29 @@ QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) write(value.c_str(), value.length()); } +void +QPDFObjectHandle::ParserCallbacks::handleObject(QPDFObjectHandle) +{ + throw std::logic_error("You must override one of the" + " handleObject methods in ParserCallbacks"); +} + +void +QPDFObjectHandle::ParserCallbacks::handleObject( + QPDFObjectHandle oh, size_t, size_t) +{ + // This version of handleObject was added in qpdf 9. If the + // developer did not override it, fall back to the older + // interface. + handleObject(oh); +} + +void +QPDFObjectHandle::ParserCallbacks::contentSize(size_t) +{ + // Ignore by default; overriding this is optional. 
+} + void QPDFObjectHandle::ParserCallbacks::terminateParsing() { @@ -1615,6 +1638,7 @@ QPDFObjectHandle::parseContentStream_internal( std::string all_description; pipeContentStreams(&buf, description, all_description); PointerHolder stream_data = buf.getBuffer(); + callbacks->contentSize(stream_data->getSize()); try { parseContentStream_data(stream_data, all_description, @@ -1642,6 +1666,13 @@ QPDFObjectHandle::parseContentStream_data( bool empty = false; while (QIntC::to_size(input->tell()) < length) { + // Read a token and seek to the beginning. The offset we get + // from this process is the beginning of the next + // non-ignorable (space, comment) token. This way, the offset + // and length don't include ignorable content. + tokenizer.readToken(input, "content", true); + qpdf_offset_t offset = input->getLastOffset(); + input->seek(offset, SEEK_SET); QPDFObjectHandle obj = parseInternal(input, "content", tokenizer, empty, 0, context, true); @@ -1650,8 +1681,9 @@ QPDFObjectHandle::parseContentStream_data( // EOF break; } + size_t length = QIntC::to_size(input->tell() - offset); - callbacks->handleObject(obj); + callbacks->handleObject(obj, QIntC::to_size(offset), length); if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { // Discard next character; it is the space after ID that @@ -1661,6 +1693,8 @@ QPDFObjectHandle::parseContentStream_data( tokenizer.expectInlineImage(input); QPDFTokenizer::Token t = tokenizer.readToken(input, description, true); + offset = input->getLastOffset(); + length = QIntC::to_size(input->tell() - offset); if (t.getType() == QPDFTokenizer::tt_bad) { QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); @@ -1674,7 +1708,8 @@ QPDFObjectHandle::parseContentStream_data( std::string inline_image = t.getValue(); QTC::TC("qpdf", "QPDFObjectHandle inline image token"); callbacks->handleObject( - QPDFObjectHandle::newInlineImage(inline_image)); + QPDFObjectHandle::newInlineImage(inline_image), + QIntC::to_size(offset), length); } } } diff
--git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml index edf61bdb..db2a54fa 100644 --- a/manual/qpdf-manual.xml +++ b/manual/qpdf-manual.xml @@ -4489,6 +4489,25 @@ print "\n"; getUIntValueAsUInt. + + + When parsing content streams with + QPDFObjectHandle::ParserCallbacks, in + place of the method + handleObject(QPDFObjectHandle), the + developer may override + handleObject(QPDFObjectHandle, size_t offset, + size_t length). If this method is defined, it + will be invoked with the object along with its offset and + length within the overall contents being parsed. Intervening + spaces and comments are not included in offset and length. + Additionally, a new method + contentSize(size_t) may be implemented. + If present, it will be called prior to the first call to + handleObject with the total size in + bytes of the combined contents. + + The underlying implementation of QPDF arrays has been diff --git a/qpdf/qtest/qpdf/eof-in-inline-image.out b/qpdf/qtest/qpdf/eof-in-inline-image.out index ba45e3b2..1f2d64a5 100644 --- a/qpdf/qtest/qpdf/eof-in-inline-image.out +++ b/qpdf/qtest/qpdf/eof-in-inline-image.out @@ -1,27 +1,28 @@ -operator: BT -name: /F1 -integer: 24 -operator: Tf -integer: 72 -integer: 720 -operator: Td -string: (Potato) -operator: Tj -operator: ET -operator: BI -name: /CS -name: /G -name: /W -integer: 1 -name: /H -integer: 1 -name: /BPC -integer: 8 -name: /F -name: /Fl -name: /DP -dictionary: << /Columns 1 /Predictor 15 >> -operator: ID +content size: 139 +operator, offset=0, length=2: BT +name, offset=5, length=3: /F1 +integer, offset=9, length=2: 24 +operator, offset=12, length=2: Tf +integer, offset=17, length=2: 72 +integer, offset=20, length=3: 720 +operator, offset=24, length=2: Td +string, offset=29, length=8: (Potato) +operator, offset=38, length=2: Tj +operator, offset=41, length=2: ET +operator, offset=66, length=2: BI +name, offset=69, length=3: /CS +name, offset=73, length=2: /G +name, offset=75, length=2: /W +integer, offset=78, length=1: 
1 +name, offset=79, length=2: /H +integer, offset=82, length=1: 1 +name, offset=83, length=4: /BPC +integer, offset=88, length=1: 8 +name, offset=89, length=2: /F +name, offset=91, length=3: /Fl +name, offset=94, length=3: /DP +dictionary, offset=97, length=27: << /Columns 1 /Predictor 15 >> +operator, offset=125, length=2: ID WARNING: page object 3 0 stream 4 0 (stream data, offset 139): EOF found while reading inline image -EOF- test 37 done diff --git a/qpdf/qtest/qpdf/terminate-parsing.out b/qpdf/qtest/qpdf/terminate-parsing.out index 2a66b909..014c7c12 100644 --- a/qpdf/qtest/qpdf/terminate-parsing.out +++ b/qpdf/qtest/qpdf/terminate-parsing.out @@ -1,86 +1,88 @@ -name: /potato +content size: 44 +name, offset=0, length=7: /potato test suite: terminating parsing -real: 0.1 -integer: 0 -integer: 0 -real: 0.1 -integer: 0 -integer: 0 -operator: cm -operator: q -integer: 0 -real: 1.1999 -real: -1.1999 -integer: 0 -real: 121.19 -real: 150.009 -operator: cm -operator: BI -name: /CS -name: /G -name: /W -integer: 1 -name: /H -integer: 1 -name: /BPC -integer: 8 -name: /F -name: /Fl -name: /DP -dictionary: << /Columns 1 /Predictor 15 >> -operator: ID -inline-image: 789c63fc0f00010301010a -operator: EI -operator: Q -operator: q -integer: 0 -real: 35.997 -real: -128.389 -integer: 0 -real: 431.964 -real: 7269.02 -operator: cm -operator: BI -name: /CS -name: /G -name: /W -integer: 30 -name: /H -integer: 107 -name: /BPC -integer: 8 -name: /F -name: /Fl -name: /DP -dictionary: << /Columns 30 /Predictor 15 >> -operator: ID -inline-image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a -operator: EI -operator: Q -operator: q -integer: 0 -real: 38.3968 -real: -93.5922 -integer: 0 -real: 431.964 -real: 7567.79 -operator: cm -operator: BI -name: /CS -name: /G -name: /W -integer: 32 -name: /H -integer: 78 -name: /BPC -integer: 8 -name: /F -name: /Fl -name: /DP -dictionary: << /Columns 32 /Predictor 15 >> -operator: ID -inline-image: 
789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a -operator: EI -operator: Q +content size: 454 +real, offset=0, length=3: 0.1 +integer, offset=4, length=1: 0 +integer, offset=6, length=1: 0 +real, offset=8, length=3: 0.1 +integer, offset=12, length=1: 0 +integer, offset=14, length=1: 0 +operator, offset=16, length=2: cm +operator, offset=19, length=1: q +integer, offset=21, length=1: 0 +real, offset=23, length=6: 1.1999 +real, offset=30, length=7: -1.1999 +integer, offset=38, length=2: 0 +real, offset=41, length=6: 121.19 +real, offset=48, length=7: 150.009 +operator, offset=56, length=2: cm +operator, offset=59, length=2: BI +name, offset=62, length=3: /CS +name, offset=66, length=2: /G +name, offset=68, length=2: /W +integer, offset=71, length=1: 1 +name, offset=72, length=2: /H +integer, offset=75, length=1: 1 +name, offset=76, length=4: /BPC +integer, offset=81, length=1: 8 +name, offset=82, length=2: /F +name, offset=84, length=3: /Fl +name, offset=87, length=3: /DP +dictionary, offset=90, length=27: << /Columns 1 /Predictor 15 >> +operator, offset=118, length=2: ID +inline-image, offset=121, length=11: 789c63fc0f00010301010a +operator, offset=132, length=2: EI +operator, offset=135, length=1: Q +operator, offset=137, length=1: q +integer, offset=139, length=1: 0 +real, offset=141, length=6: 35.997 +real, offset=148, length=8: -128.389 +integer, offset=157, length=2: 0 +real, offset=160, length=7: 431.964 +real, offset=168, length=7: 7269.02 +operator, offset=176, length=2: cm +operator, offset=179, length=2: BI +name, offset=182, length=3: /CS +name, offset=186, length=2: /G +name, offset=188, length=2: /W +integer, offset=191, length=2: 30 +name, offset=193, length=2: /H +integer, offset=196, length=3: 107 +name, offset=199, length=4: /BPC +integer, offset=204, length=1: 8 +name, offset=205, length=2: /F +name, offset=207, length=3: /Fl +name, offset=210, length=3: /DP +dictionary, offset=213, length=28: << /Columns 
30 /Predictor 15 >> +operator, offset=242, length=2: ID +inline-image, offset=245, length=46: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a +operator, offset=291, length=2: EI +operator, offset=294, length=1: Q +operator, offset=296, length=1: q +integer, offset=298, length=1: 0 +real, offset=300, length=7: 38.3968 +real, offset=308, length=8: -93.5922 +integer, offset=317, length=2: 0 +real, offset=320, length=7: 431.964 +real, offset=328, length=7: 7567.79 +operator, offset=336, length=2: cm +operator, offset=339, length=2: BI +name, offset=342, length=3: /CS +name, offset=346, length=2: /G +name, offset=348, length=2: /W +integer, offset=351, length=2: 32 +name, offset=353, length=2: /H +integer, offset=356, length=2: 78 +name, offset=358, length=4: /BPC +integer, offset=363, length=1: 8 +name, offset=364, length=2: /F +name, offset=366, length=3: /Fl +name, offset=369, length=3: /DP +dictionary, offset=372, length=28: << /Columns 32 /Predictor 15 >> +operator, offset=401, length=2: ID +inline-image, offset=404, length=45: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a +operator, offset=449, length=2: EI +operator, offset=452, length=1: Q -EOF- test 37 done diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.out b/qpdf/qtest/qpdf/tokenize-content-streams.out index 7ac7d51e..448a07fb 100644 --- a/qpdf/qtest/qpdf/tokenize-content-streams.out +++ b/qpdf/qtest/qpdf/tokenize-content-streams.out @@ -1,95 +1,100 @@ -operator: BT -name: /F1 -integer: 24 -operator: Tf -integer: 72 -integer: 720 -operator: Td -string: (Potato) -operator: Tj -operator: ET +content size: 44 +operator, offset=0, length=2: BT +name, offset=5, length=3: /F1 +integer, offset=9, length=2: 24 +operator, offset=12, length=2: Tf +integer, offset=17, length=2: 72 +integer, offset=20, length=3: 720 +operator, offset=24, length=2: Td +string, offset=29, length=8: (Potato) +operator, offset=38, length=2: Tj 
+operator, offset=41, length=2: ET -EOF- -real: 0.1 -integer: 0 -integer: 0 -real: 0.1 -integer: 0 -integer: 0 -operator: cm -operator: q -integer: 0 -real: 1.1999 -real: -1.1999 -integer: 0 -real: 121.19 -real: 150.009 -operator: cm -operator: BI -name: /CS -name: /G -name: /W -integer: 1 -name: /H -integer: 1 -name: /BPC -integer: 8 -name: /F -name: /Fl -name: /DP -dictionary: << /Columns 1 /Predictor 15 >> -operator: ID -inline-image: 789c63fc0f00010301010a -operator: EI -operator: Q -operator: q -integer: 0 -real: 35.997 -real: -128.389 -integer: 0 -real: 431.964 -real: 7269.02 -operator: cm -operator: BI -name: /CS -name: /G -name: /W -integer: 30 -name: /H -integer: 107 -name: /BPC -integer: 8 -name: /F -name: /Fl -name: /DP -dictionary: << /Columns 30 /Predictor 15 >> -operator: ID -inline-image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a -operator: EI -operator: Q -operator: q -integer: 0 -real: 38.3968 -real: -93.5922 -integer: 0 -real: 431.964 -real: 7567.79 -operator: cm -operator: BI -name: /CS -name: /G -name: /W -integer: 32 -name: /H -integer: 78 -name: /BPC -integer: 8 -name: /F -name: /Fl -name: /DP -dictionary: << /Columns 32 /Predictor 15 >> -operator: ID -inline-image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a -operator: EI -operator: Q +content size: 490 +real, offset=0, length=3: 0.1 +integer, offset=4, length=1: 0 +integer, offset=6, length=1: 0 +real, offset=8, length=3: 0.1 +integer, offset=12, length=1: 0 +integer, offset=14, length=1: 0 +operator, offset=16, length=2: cm +operator, offset=19, length=1: q +integer, offset=21, length=1: 0 +real, offset=23, length=6: 1.1999 +real, offset=30, length=7: -1.1999 +integer, offset=38, length=2: 0 +real, offset=41, length=6: 121.19 +real, offset=48, length=7: 150.009 +operator, offset=56, length=2: cm +operator, offset=59, length=2: BI +name, offset=62, length=3: /CS +name, offset=66, length=2: /G 
+name, offset=68, length=2: /W +integer, offset=71, length=1: 1 +name, offset=72, length=2: /H +integer, offset=75, length=1: 1 +name, offset=76, length=4: /BPC +integer, offset=81, length=1: 8 +name, offset=82, length=2: /F +name, offset=84, length=3: /Fl +name, offset=87, length=3: /DP +dictionary, offset=90, length=27: << /Columns 1 /Predictor 15 >> +operator, offset=118, length=2: ID +inline-image, offset=121, length=11: 789c63fc0f00010301010a +operator, offset=132, length=2: EI +operator, offset=135, length=1: Q +operator, offset=137, length=1: q +integer, offset=139, length=1: 0 +real, offset=141, length=6: 35.997 +real, offset=148, length=8: -128.389 +integer, offset=157, length=2: 0 +real, offset=160, length=7: 431.964 +real, offset=168, length=7: 7269.02 +operator, offset=176, length=2: cm +operator, offset=179, length=2: BI +name, offset=182, length=3: /CS +name, offset=186, length=2: /G +name, offset=188, length=2: /W +integer, offset=191, length=2: 30 +name, offset=193, length=2: /H +integer, offset=196, length=3: 107 +name, offset=199, length=4: /BPC +integer, offset=204, length=1: 8 +name, offset=205, length=2: /F +name, offset=207, length=3: /Fl +name, offset=210, length=3: /DP +dictionary, offset=214, length=28: << /Columns 30 /Predictor 15 >> +operator, offset=243, length=2: ID +inline-image, offset=246, length=46: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a0a +operator, offset=292, length=2: EI +operator, offset=295, length=1: Q +operator, offset=297, length=1: q +array, offset=299, length=30: [ 1 /two (three) << /four 5 >> ] +operator, offset=330, length=1: Q +operator, offset=332, length=1: q +integer, offset=334, length=1: 0 +real, offset=336, length=7: 38.3968 +real, offset=344, length=8: -93.5922 +integer, offset=353, length=2: 0 +real, offset=356, length=7: 431.964 +real, offset=364, length=7: 7567.79 +operator, offset=372, length=2: cm +operator, offset=375, length=2: BI +name, offset=378, 
length=3: /CS +name, offset=382, length=2: /G +name, offset=384, length=2: /W +integer, offset=387, length=2: 32 +name, offset=389, length=2: /H +integer, offset=392, length=2: 78 +name, offset=394, length=4: /BPC +integer, offset=399, length=1: 8 +name, offset=400, length=2: /F +name, offset=402, length=3: /Fl +name, offset=405, length=3: /DP +dictionary, offset=408, length=28: << /Columns 32 /Predictor 15 >> +operator, offset=437, length=2: ID +inline-image, offset=440, length=45: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c130a +operator, offset=485, length=2: EI +operator, offset=488, length=1: Q -EOF- test 37 done diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.pdf b/qpdf/qtest/qpdf/tokenize-content-streams.pdf index ea97a6e2..569e2424 100644 Binary files a/qpdf/qtest/qpdf/tokenize-content-streams.pdf and b/qpdf/qtest/qpdf/tokenize-content-streams.pdf differ diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc index c6ddd715..2b1c710d 100644 --- a/qpdf/test_driver.cc +++ b/qpdf/test_driver.cc @@ -76,19 +76,28 @@ class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks { } - virtual void handleObject(QPDFObjectHandle); + virtual void contentSize(size_t size); + virtual void handleObject(QPDFObjectHandle, size_t, size_t); virtual void handleEOF(); }; void -ParserCallbacks::handleObject(QPDFObjectHandle obj) +ParserCallbacks::contentSize(size_t size) +{ + std::cout << "content size: " << size << std::endl; +} + +void +ParserCallbacks::handleObject(QPDFObjectHandle obj, + size_t offset, size_t length) { if (obj.isName() && (obj.getName() == "/Abort")) { std::cout << "test suite: terminating parsing" << std::endl; terminateParsing(); } - std::cout << obj.getTypeName() << ": "; + std::cout << obj.getTypeName() << ", offset=" << offset + << ", length=" << length << ": "; if (obj.isInlineImage()) { // Exercise getTypeCode -- cgit v1.2.3-54-g00ecf