aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog4
-rw-r--r--examples/build.mk3
-rw-r--r--examples/pdf-parse-content.cc97
-rw-r--r--examples/qtest/parse-content.test17
-rw-r--r--examples/qtest/parse-content/content.out11
-rw-r--r--examples/qtest/parse-content/input.pdfbin0 -> 799 bytes
-rw-r--r--include/qpdf/QPDFObjectHandle.hh25
-rw-r--r--include/qpdf/QPDFTokenizer.hh10
-rw-r--r--libqpdf/QPDFObjectHandle.cc128
-rw-r--r--libqpdf/QPDFTokenizer.cc23
-rw-r--r--qpdf/qpdf.testcov4
-rw-r--r--qpdf/qtest/qpdf.test12
-rw-r--r--qpdf/qtest/qpdf/eof-in-inline-image.out25
-rw-r--r--qpdf/qtest/qpdf/eof-in-inline-image.pdfbin0 -> 870 bytes
-rw-r--r--qpdf/qtest/qpdf/tokenize-content-streams.out95
-rw-r--r--qpdf/qtest/qpdf/tokenize-content-streams.pdfbin0 -> 1539 bytes
-rw-r--r--qpdf/test_driver.cc52
17 files changed, 494 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index e4919ca6..0a3b3d81 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2013-01-20 Jay Berkenbilt <ejb@ql.org>
+ * Added QPDFObjectHandle::parseContentStream, which parses the
+ objects in a content stream and calls handlers in a callback
+ class. The example pdf-parse-content illustrates its use.
+
* Added QPDF_Keyword and QPDF_InlineImage types along with
appropriate wrapper methods in QPDFObjectHandle. These new object
types are to facilitate content stream parsing.
diff --git a/examples/build.mk b/examples/build.mk
index 12734b1b..bcb4440e 100644
--- a/examples/build.mk
+++ b/examples/build.mk
@@ -4,7 +4,8 @@ BINS_examples = \
pdf-npages \
pdf-double-page-size \
pdf-invert-images \
- pdf-create
+ pdf-create \
+ pdf-parse-content
CBINS_examples = pdf-linearize
TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
diff --git a/examples/pdf-parse-content.cc b/examples/pdf-parse-content.cc
new file mode 100644
index 00000000..1c3cae16
--- /dev/null
+++ b/examples/pdf-parse-content.cc
@@ -0,0 +1,97 @@
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+
+#include <qpdf/QPDF.hh>
+#include <qpdf/QUtil.hh>
+
+static char const* whoami = 0;
+
+void usage()  // Print the usage text to stderr and terminate the process; does not return.
+{
+ std::cerr << "Usage: " << whoami << " filename page-number" << std::endl
+ << "Prints a dump of the objects in the content streams"
+ << " of the given page." << std::endl
+ << "Pages are numbered from 1." << std::endl;
+ exit(2);  // same exit status main() uses when an exception is caught
+}
+
+class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks  // Handlers passed to parseContentStream; both write to std::cout.
+{
+ public:
+ virtual ~ParserCallbacks()
+ {
+ }
+
+ virtual void handleObject(QPDFObjectHandle);  // prints one line per object received
+ virtual void handleEOF();  // prints the "-EOF-" marker line
+};
+
+void
+ParserCallbacks::handleObject(QPDFObjectHandle obj)  // Write obj to std::cout on a line of its own.
+{
+ if (obj.isInlineImage())
+ {
+ std::string val = obj.getInlineImageValue();
+ std::cout << "inline image: ";  // image bytes follow as lowercase hex on the same line
+ char buf[3];  // two hex digits plus the terminating NUL
+ buf[2] = '\0';
+ for (size_t i = 0; i < val.length(); ++i)
+ {
+ sprintf(buf, "%02x", (unsigned char)(val[i]));  // unsigned char cast keeps every byte to exactly two digits
+ std::cout << buf;
+ }
+ std::cout << std::endl;
+ }
+ else
+ {
+ std::cout << obj.unparse() << std::endl;  // anything that is not an inline image is printed via unparse()
+ }
+}
+
+void
+ParserCallbacks::handleEOF()  // Emit an end-of-input marker line on std::cout.
+{
+ std::cout << "-EOF-" << std::endl;
+}
+
+int main(int argc, char* argv[])  // Usage: <program> filename page-number; dumps that page's content stream objects.
+{
+ whoami = QUtil::getWhoami(argv[0]);  // program name shown in usage and error messages
+
+ // For libtool's sake....
+ if (strncmp(whoami, "lt-", 3) == 0)
+ {
+ whoami += 3;  // skip past the "lt-" prefix
+ }
+
+ if (argc != 3)
+ {
+ usage();
+ }
+ char const* filename = argv[1];
+ int pageno = atoi(argv[2]);  // atoi yields 0 for non-numeric text, which the range check below rejects
+
+ try
+ {
+ QPDF pdf;
+ pdf.processFile(filename);
+ std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+ if ((pageno < 1) || (pageno > (int)pages.size()))
+ {
+ usage();  // out-of-range page number is reported as a usage error
+ }
+
+ QPDFObjectHandle page = pages[pageno-1];  // command-line page numbers are 1-based, the vector is 0-based
+ QPDFObjectHandle contents = page.getKey("/Contents");
+ ParserCallbacks cb;
+ QPDFObjectHandle::parseContentStream(contents, &cb);  // cb prints each object, then "-EOF-"
+ }
+ catch (std::exception& e)
+ {
+ std::cerr << whoami << ": " << e.what() << std::endl;
+ exit(2);  // any exception from the block above ends the program with status 2
+ }
+
+ return 0;
+}
diff --git a/examples/qtest/parse-content.test b/examples/qtest/parse-content.test
new file mode 100644
index 00000000..a73566f8
--- /dev/null
+++ b/examples/qtest/parse-content.test
@@ -0,0 +1,17 @@
+#!/usr/bin/env perl
+require 5.008;
+BEGIN { $^W = 1; }  # turn on warnings
+use strict;
+
+chdir("parse-content");  # input.pdf and content.out are referenced relative to this directory
+
+require TestDriver;
+
+my $td = new TestDriver('pdf-parse-content');
+
+$td->runtest("parse content",  # run the example on page 1 and compare its output with content.out
+ {$td->COMMAND => "pdf-parse-content input.pdf 1"},
+ {$td->FILE => "content.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+
+$td->report(1);  # presumably the number of tests expected to have run -- confirm against TestDriver
diff --git a/examples/qtest/parse-content/content.out b/examples/qtest/parse-content/content.out
new file mode 100644
index 00000000..9c07edc2
--- /dev/null
+++ b/examples/qtest/parse-content/content.out
@@ -0,0 +1,11 @@
+BT
+/F1
+24
+Tf
+72
+720
+Td
+(Potato)
+Tj
+ET
+-EOF-
diff --git a/examples/qtest/parse-content/input.pdf b/examples/qtest/parse-content/input.pdf
new file mode 100644
index 00000000..cd319591
--- /dev/null
+++ b/examples/qtest/parse-content/input.pdf
Binary files differ
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index 932a6678..c4a922d1 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -71,6 +71,21 @@ class QPDFObjectHandle
virtual void decryptString(std::string& val) = 0;
};
+ // This class is used by parseContentStream. Callers must
+ // instantiate a subclass of this with handlers defined to accept
+ // QPDFObjectHandles that are parsed from the stream.
+ class ParserCallbacks
+ {
+ public:
+ QPDF_DLL
+ virtual ~ParserCallbacks()
+ {
+ }
+ virtual void handleObject(QPDFObjectHandle) = 0;  // called for each parsed object, and once more with the data of each inline image
+ virtual void handleEOF() = 0;  // called once by parseContentStream after all streams have been processed
+ };
+
+
QPDF_DLL
QPDFObjectHandle();
QPDF_DLL
@@ -138,6 +153,11 @@ class QPDFObjectHandle
StringDecrypter* decrypter,
QPDF* context);
+ // Helpers for parsing content streams
+ QPDF_DLL
+ static void parseContentStream(QPDFObjectHandle stream_or_array,
+ ParserCallbacks* callbacks);
+
// Type-specific factories
QPDF_DLL
static QPDFObjectHandle newNull();
@@ -571,7 +591,10 @@ class QPDFObjectHandle
std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context,
- bool in_array, bool in_dictionary);
+ bool in_array, bool in_dictionary,
+ bool content_stream);
+ static void parseContentStream_internal(
+ QPDFObjectHandle stream, ParserCallbacks* callbacks);
bool initialized;
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index 1835fcb1..081e12d3 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -18,6 +18,8 @@
class QPDFTokenizer
{
public:
+ // Token type tt_eof is only returned if allowEOF() is called on
+ // the tokenizer. tt_eof was introduced in QPDF version 4.1.
enum token_type_e
{
tt_bad,
@@ -34,6 +36,7 @@ class QPDFTokenizer
tt_null,
tt_bool,
tt_word,
+ tt_eof,
};
class Token
@@ -97,6 +100,12 @@ class QPDFTokenizer
QPDF_DLL
void allowPoundAnywhereInName();
+ // If called, treat EOF as a separate token type instead of an
+ // error. This was introduced in QPDF 4.1 to facilitate
+ // tokenizing content streams.
+ QPDF_DLL
+ void allowEOF();
+
// Mode of operation:
// Keep presenting characters and calling getToken() until
@@ -140,6 +149,7 @@ class QPDFTokenizer
st_literal, st_in_hexstring, st_token_ready } state;
bool pound_special_in_name;
+ bool allow_eof;
// Current token accumulation
token_type_e type;
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 9b51a0cb..bfca3f08 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -680,6 +680,106 @@ QPDFObjectHandle::parse(std::string const& object_str,
return result;
}
+void
+QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,  // a single stream, or an array whose elements are all streams
+ ParserCallbacks* callbacks)  // receives every parsed object, then one handleEOF()
+{
+ std::vector<QPDFObjectHandle> streams;
+ if (stream_or_array.isArray())
+ {
+ streams = stream_or_array.getArrayAsVector();
+ }
+ else
+ {
+ streams.push_back(stream_or_array);  // non-array input is handled as a one-element list
+ }
+ for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
+ iter != streams.end(); ++iter)
+ {
+ QPDFObjectHandle stream = *iter;
+ if (! stream.isStream())
+ {
+ throw std::logic_error(  // a non-stream entry aborts parsing; handleEOF() is not reached
+ "QPDFObjectHandle: parseContentStream called on non-stream");
+ }
+ parseContentStream_internal(stream, callbacks);  // each stream gets its own input source and tokenizer
+ }
+ callbacks->handleEOF();  // signalled once, after the last stream, not once per stream
+}
+
+void
+QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream,  // Parse one stream's data, passing each object to callbacks->handleObject.
+ ParserCallbacks* callbacks)
+{
+ stream.assertStream();
+ PointerHolder<Buffer> stream_data = stream.getStreamData();
+ size_t length = stream_data->getSize();
+ std::string description = "content stream object " +  // becomes the input source name, which the QPDFExc below reports
+ QUtil::int_to_string(stream.getObjectID()) + " " +
+ QUtil::int_to_string(stream.getGeneration());
+ PointerHolder<InputSource> input =
+ new BufferInputSource(description, stream_data.getPointer());
+ QPDFTokenizer tokenizer;
+ tokenizer.allowEOF();  // have the tokenizer report EOF as tt_eof instead of tt_bad
+ bool empty = false;
+ while ((size_t) input->tell() < length)
+ {
+ QPDFObjectHandle obj =
+ parseInternal(input, "content", tokenizer, empty,
+ 0, 0, false, false, true);  // no decrypter, no context, not in array/dictionary, content_stream mode
+ if (! obj.isInitialized())
+ {
+ // EOF
+ break;
+ }
+
+ callbacks->handleObject(obj);
+ if (obj.isKeyword() && (obj.getKeywordValue() == "ID"))  // the ID keyword itself has already been delivered above
+ {
+ // Discard next character; it is the space after ID that
+ // terminated the token. Read until end of inline image.
+ char ch;
+ input->read(&ch, 1);
+ char buf[4];  // sliding window holding the four most recently read bytes
+ memset(buf, '\0', sizeof(buf));
+ bool done = false;
+ std::string inline_image;
+ while (! done)
+ {
+ if (input->read(&ch, 1) == 0)
+ {
+ QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image");
+ throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
+ "stream data", input->tell(),
+ "EOF found while reading inline image");
+ }
+ inline_image += ch;
+ memmove(buf, buf + 1, sizeof(buf) - 1);  // shift the window left by one byte
+ buf[sizeof(buf) - 1] = ch;  // newest byte goes in the last slot
+ if (strchr(" \t\n\v\f\r", buf[0]) &&  // NOTE: strchr also matches '\0' (the terminator), so a NUL byte counts as a delimiter
+ (buf[1] == 'E') &&
+ (buf[2] == 'I') &&
+ strchr(" \t\n\v\f\r", buf[3]))
+ {
+ // We've found an EI operator.
+ done = true;
+ input->seek(-3, SEEK_CUR);  // back up to the 'E' so the next parseInternal call reads EI
+ for (int i = 0; i < 4; ++i)  // remove the four window bytes (delimiter, E, I, delimiter) from the image data
+ {
+ if (inline_image.length() > 0)  // fewer than four bytes may have been read when EI comes right after ID
+ {
+ inline_image.erase(inline_image.length() - 1);
+ }
+ }
+ }
+ }
+ QTC::TC("qpdf", "QPDFObjectHandle inline image token");
+ callbacks->handleObject(  // image bytes are delivered as a separate inline-image object
+ QPDFObjectHandle::newInlineImage(inline_image));
+ }
+ }
+}
+
QPDFObjectHandle
QPDFObjectHandle::parse(PointerHolder<InputSource> input,
std::string const& object_description,
@@ -687,7 +787,7 @@ QPDFObjectHandle::parse(PointerHolder<InputSource> input,
StringDecrypter* decrypter, QPDF* context)
{
return parseInternal(input, object_description, tokenizer, empty,
- decrypter, context, false, false);
+ decrypter, context, false, false, false);
}
QPDFObjectHandle
@@ -695,7 +795,8 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
std::string const& object_description,
QPDFTokenizer& tokenizer, bool& empty,
StringDecrypter* decrypter, QPDF* context,
- bool in_array, bool in_dictionary)
+ bool in_array, bool in_dictionary,
+ bool content_stream)
{
empty = false;
if (in_dictionary && in_array)
@@ -721,6 +822,21 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
switch (token.getType())
{
+ case QPDFTokenizer::tt_eof:
+ if (content_stream)
+ {
+ // Return uninitialized object to indicate EOF
+ return object;
+ }
+ else
+ {
+ // When not in content stream mode, EOF is tt_bad and
+ // throws an exception before we get here.
+ throw std::logic_error(
+ "EOF received while not in content stream mode");
+ }
+ break;
+
case QPDFTokenizer::tt_brace_open:
case QPDFTokenizer::tt_brace_close:
// Don't know what to do with these for now
@@ -764,13 +880,13 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
case QPDFTokenizer::tt_array_open:
object = parseInternal(
input, object_description, tokenizer, empty,
- decrypter, context, true, false);
+ decrypter, context, true, false, content_stream);
break;
case QPDFTokenizer::tt_dict_open:
object = parseInternal(
input, object_description, tokenizer, empty,
- decrypter, context, false, true);
+ decrypter, context, false, true, content_stream);
break;
case QPDFTokenizer::tt_bool:
@@ -826,6 +942,10 @@ QPDFObjectHandle::parseInternal(PointerHolder<InputSource> input,
input->seek(input->getLastOffset(), SEEK_SET);
empty = true;
}
+ else if (content_stream)
+ {
+ object = QPDFObjectHandle::newKeyword(token.getValue());
+ }
else
{
throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 1a20bb5a..a6333b73 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -22,7 +22,8 @@ static bool is_space(char ch)
}
QPDFTokenizer::QPDFTokenizer() :
- pound_special_in_name(true)
+ pound_special_in_name(true),
+ allow_eof(false)
{
reset();
}
@@ -35,6 +36,12 @@ QPDFTokenizer::allowPoundAnywhereInName()
}
void
+QPDFTokenizer::allowEOF()  // Opt in to tt_eof: presentEOF() then sets tt_eof where it would otherwise set tt_bad.
+{
+ this->allow_eof = true;
+}
+
+void
QPDFTokenizer::reset()
{
state = st_top;
@@ -441,9 +448,17 @@ QPDFTokenizer::presentEOF()
}
else if (state != st_token_ready)
{
- QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token");
- type = tt_bad;
- error_message = "EOF while reading token";
+ QTC::TC("qpdf", "QPDF_Tokenizer EOF reading token",
+ this->allow_eof ? 1 : 0);
+ if (this->allow_eof)
+ {
+ type = tt_eof;
+ }
+ else
+ {
+ type = tt_bad;
+ error_message = "EOF while reading token";
+ }
}
state = st_token_ready;
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index a0578f28..b09e966c 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -236,7 +236,7 @@ QPDFWriter copy use_aes 1
QPDFObjectHandle indirect without context 0
QPDFObjectHandle trailing data in parse 0
qpdf pages encryption password 0
-QPDF_Tokenizer EOF reading token 0
+QPDF_Tokenizer EOF reading token 1
QPDF_Tokenizer EOF reading appendable token 0
QPDFWriter extra header text no newline 0
QPDFWriter extra header text add newline 0
@@ -259,3 +259,5 @@ QPDFWriter remove Crypt 0
qpdf-c called qpdf_get_pdf_extension_level 0
qpdf-c called qpdf_set_r5_encryption_parameters 0
qpdf-c called qpdf_set_r6_encryption_parameters 0
+QPDFObjectHandle EOF in inline image 0
+QPDFObjectHandle inline image token 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index bf62ceea..8d2b5cfc 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -199,7 +199,7 @@ $td->runtest("remove page we don't have",
show_ntests();
# ----------
$td->notify("--- Miscellaneous Tests ---");
-$n_tests += 57;
+$n_tests += 59;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@@ -468,6 +468,16 @@ $td->runtest("check file with leading junk",
{$td->COMMAND => "qpdf --check leading-junk.pdf"},
{$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
+$td->runtest("EOF inside inline image",
+ {$td->COMMAND => "test_driver 37 eof-in-inline-image.pdf"},
+ {$td->FILE => "eof-in-inline-image.out",
+ $td->EXIT_STATUS => 2},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("tokenize content streams",
+ {$td->COMMAND => "test_driver 37 tokenize-content-streams.pdf"},
+ {$td->FILE => "tokenize-content-streams.out",
+ $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
diff --git a/qpdf/qtest/qpdf/eof-in-inline-image.out b/qpdf/qtest/qpdf/eof-in-inline-image.out
new file mode 100644
index 00000000..8ac365c4
--- /dev/null
+++ b/qpdf/qtest/qpdf/eof-in-inline-image.out
@@ -0,0 +1,25 @@
+BT
+/F1
+24
+Tf
+72
+720
+Td
+(Potato)
+Tj
+ET
+BI
+/CS
+/G
+/W
+1
+/H
+1
+/BPC
+8
+/F
+/Fl
+/DP
+<< /Columns 1 /Predictor 15 >>
+ID
+content stream object 4 0 (stream data, file position 139): EOF found while reading inline image
diff --git a/qpdf/qtest/qpdf/eof-in-inline-image.pdf b/qpdf/qtest/qpdf/eof-in-inline-image.pdf
new file mode 100644
index 00000000..e970b77d
--- /dev/null
+++ b/qpdf/qtest/qpdf/eof-in-inline-image.pdf
Binary files differ
diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.out b/qpdf/qtest/qpdf/tokenize-content-streams.out
new file mode 100644
index 00000000..9bc933dc
--- /dev/null
+++ b/qpdf/qtest/qpdf/tokenize-content-streams.out
@@ -0,0 +1,95 @@
+BT
+/F1
+24
+Tf
+72
+720
+Td
+(Potato)
+Tj
+ET
+-EOF-
+0.1
+0
+0
+0.1
+0
+0
+cm
+q
+0
+1.1999
+-1.1999
+0
+121.19
+150.009
+cm
+BI
+/CS
+/G
+/W
+1
+/H
+1
+/BPC
+8
+/F
+/Fl
+/DP
+<< /Columns 1 /Predictor 15 >>
+ID
+inline image: 789c63fc0f0001030101
+EI
+Q
+q
+0
+35.997
+-128.389
+0
+431.964
+7269.02
+cm
+BI
+/CS
+/G
+/W
+30
+/H
+107
+/BPC
+8
+/F
+/Fl
+/DP
+<< /Columns 30 /Predictor 15 >>
+ID
+inline image: 789cedd1a11100300800b1b2ffd06503148283bc8dfcf8af2a306ee352eff2e06318638c31c63b3801627b620a
+EI
+Q
+q
+0
+38.3968
+-93.5922
+0
+431.964
+7567.79
+cm
+BI
+/CS
+/G
+/W
+32
+/H
+78
+/BPC
+8
+/F
+/Fl
+/DP
+<< /Columns 32 /Predictor 15 >>
+ID
+inline image: 789c63fccf801f308e2a185530aa60882a20203faa605401890a0643aa1e5530aa6054010d140000bdd03c13
+EI
+Q
+-EOF-
+test 37 done
diff --git a/qpdf/qtest/qpdf/tokenize-content-streams.pdf b/qpdf/qtest/qpdf/tokenize-content-streams.pdf
new file mode 100644
index 00000000..ea97a6e2
--- /dev/null
+++ b/qpdf/qtest/qpdf/tokenize-content-streams.pdf
Binary files differ
diff --git a/qpdf/test_driver.cc b/qpdf/test_driver.cc
index 48017908..cd6aa991 100644
--- a/qpdf/test_driver.cc
+++ b/qpdf/test_driver.cc
@@ -58,6 +58,45 @@ class Provider: public QPDFObjectHandle::StreamDataProvider
bool bad_length;
};
+class ParserCallbacks: public QPDFObjectHandle::ParserCallbacks  // Handlers used by test 37; both write to std::cout.
+{
+ public:
+ virtual ~ParserCallbacks()
+ {
+ }
+
+ virtual void handleObject(QPDFObjectHandle);  // prints one line per object received
+ virtual void handleEOF();  // prints the "-EOF-" marker line
+};
+
+void
+ParserCallbacks::handleObject(QPDFObjectHandle obj)  // Write obj to std::cout on a line of its own.
+{
+ if (obj.isInlineImage())
+ {
+ std::string val = obj.getInlineImageValue();
+ std::cout << "inline image: ";  // image bytes follow as lowercase hex on the same line
+ char buf[3];  // two hex digits plus the terminating NUL
+ buf[2] = '\0';
+ for (size_t i = 0; i < val.length(); ++i)
+ {
+ sprintf(buf, "%02x", (unsigned char)(val[i]));  // unsigned char cast keeps every byte to exactly two digits
+ std::cout << buf;
+ }
+ std::cout << std::endl;
+ }
+ else
+ {
+ std::cout << obj.unparse() << std::endl;  // anything that is not an inline image is printed via unparse()
+ }
+}
+
+void
+ParserCallbacks::handleEOF()  // Emit an end-of-input marker line on std::cout.
+{
+ std::cout << "-EOF-" << std::endl;
+}
+
static std::string getPageContents(QPDFObjectHandle page)
{
PointerHolder<Buffer> b1 =
@@ -1245,6 +1284,19 @@ void runtest(int n, char const* filename1, char const* arg2)
}
}
}
+ else if (n == 37)
+ {
+ // Parse content streams of all pages
+ std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
+ for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
+ iter != pages.end(); ++iter)
+ {
+ QPDFObjectHandle page = *iter;
+ QPDFObjectHandle contents = page.getKey("/Contents");
+ ParserCallbacks cb;
+ QPDFObjectHandle::parseContentStream(contents, &cb);
+ }
+ }
else
{
throw std::runtime_error(std::string("invalid test ") +