aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/qpdf/Pl_QPDFTokenizer.hh8
-rw-r--r--include/qpdf/QPDFTokenizer.hh11
-rw-r--r--libqpdf/Pl_QPDFTokenizer.cc73
-rw-r--r--libqpdf/QPDFObjectHandle.cc2
-rw-r--r--libqpdf/QPDFTokenizer.cc117
-rw-r--r--qpdf/qpdf.testcov3
-rw-r--r--qpdf/qtest/qpdf.test7
-rw-r--r--qpdf/test_tokenizer.cc28
8 files changed, 176 insertions, 73 deletions
diff --git a/include/qpdf/Pl_QPDFTokenizer.hh b/include/qpdf/Pl_QPDFTokenizer.hh
index 52630d2a..a571b079 100644
--- a/include/qpdf/Pl_QPDFTokenizer.hh
+++ b/include/qpdf/Pl_QPDFTokenizer.hh
@@ -27,6 +27,7 @@
#include <qpdf/QPDFTokenizer.hh>
#include <qpdf/PointerHolder.hh>
#include <qpdf/QPDFObjectHandle.hh>
+#include <qpdf/Pl_Buffer.hh>
// Tokenize the incoming text using QPDFTokenizer and pass the tokens
// in turn to a QPDFObjectHandle::TokenFilter object. All bytes of
@@ -56,9 +57,6 @@ class Pl_QPDFTokenizer: public Pipeline
virtual void finish();
private:
- void processChar(char ch);
- void checkUnread();
-
class Members
{
friend class Pl_QPDFTokenizer;
@@ -73,9 +71,7 @@ class Pl_QPDFTokenizer: public Pipeline
QPDFObjectHandle::TokenFilter* filter;
QPDFTokenizer tokenizer;
- bool last_char_was_cr;
- bool unread_char;
- char char_to_unread;
+ Pl_Buffer buf;
};
PointerHolder<Members> m;
};
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index 370296b2..31f2f398 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -178,7 +178,15 @@ class QPDFTokenizer
// including the next EI token. After you call this method, the
// next call to readToken (or the token created next time getToken
// returns true) will either be tt_inline_image or tt_bad. This is
- // the only way readToken returns a tt_inline_image token.
+ // the only way readToken returns a tt_inline_image token. The
+ // version of this method that takes a PointerHolder<InputSource>
+ // does a better job of locating the end of the inline image and
+ // should be used whenever the input source is available. It
+ // preserves both tell() and getLastOffset(). The version without
+ // the input source will always end the inline image the first
+ // time it sees something that looks like an EI operator.
+ QPDF_DLL
+ void expectInlineImage(PointerHolder<InputSource> input);
QPDF_DLL
void expectInlineImage();
@@ -223,6 +231,7 @@ class QPDFTokenizer
std::string error_message;
bool unread_char;
char char_to_unread;
+ size_t inline_image_bytes;
// State for strings
int string_depth;
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
index 577c5cc7..bd5d88ab 100644
--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -1,13 +1,13 @@
#include <qpdf/Pl_QPDFTokenizer.hh>
#include <qpdf/QTC.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/BufferInputSource.hh>
#include <stdexcept>
#include <string.h>
Pl_QPDFTokenizer::Members::Members() :
filter(0),
- last_char_was_cr(false),
- unread_char(false),
- char_to_unread('\0')
+ buf("tokenizer buffer")
{
}
@@ -33,61 +33,36 @@ Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
}
void
-Pl_QPDFTokenizer::processChar(char ch)
+Pl_QPDFTokenizer::write(unsigned char* data, size_t len)
{
- this->m->tokenizer.presentCharacter(ch);
- QPDFTokenizer::Token token;
- if (this->m->tokenizer.getToken(
- token, this->m->unread_char, this->m->char_to_unread))
- {
- this->m->filter->handleToken(token);
- if ((token.getType() == QPDFTokenizer::tt_word) &&
- (token.getValue() == "ID"))
- {
- QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
- this->m->tokenizer.expectInlineImage();
- }
- }
-}
-
-
-void
-Pl_QPDFTokenizer::checkUnread()
-{
- if (this->m->unread_char)
- {
- processChar(this->m->char_to_unread);
- if (this->m->unread_char)
- {
- throw std::logic_error(
- "INTERNAL ERROR: unread_char still true after processing "
- "unread character");
- }
- }
-}
-
-void
-Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
-{
- checkUnread();
- for (size_t i = 0; i < len; ++i)
- {
- processChar(buf[i]);
- checkUnread();
- }
+ this->m->buf.write(data, len);
}
void
Pl_QPDFTokenizer::finish()
{
- this->m->tokenizer.presentEOF();
- QPDFTokenizer::Token token;
- if (this->m->tokenizer.getToken(
- token, this->m->unread_char, this->m->char_to_unread))
+ this->m->buf.finish();
+ PointerHolder<InputSource> input =
+ new BufferInputSource("tokenizer data",
+ this->m->buf.getBuffer(), true);
+
+ while (true)
{
+ QPDFTokenizer::Token token = this->m->tokenizer.readToken(
+ input, "offset " + QUtil::int_to_string(input->tell()),
+ true);
this->m->filter->handleToken(token);
+ if (token.getType() == QPDFTokenizer::tt_eof)
+ {
+ break;
+ }
+ else if ((token.getType() == QPDFTokenizer::tt_word) &&
+ (token.getValue() == "ID"))
+ {
+ QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
+ this->m->tokenizer.expectInlineImage(input);
+ }
}
-
this->m->filter->handleEOF();
QPDFObjectHandle::TokenFilter::PipelineAccessor::setPipeline(
m->filter, 0);
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index ecaa49bd..de5d56b3 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -1558,7 +1558,7 @@ QPDFObjectHandle::parseContentStream_data(
// terminated the token. Read until end of inline image.
char ch;
input->read(&ch, 1);
- tokenizer.expectInlineImage();
+ tokenizer.expectInlineImage(input);
QPDFTokenizer::Token t =
tokenizer.readToken(input, description, true);
if (t.getType() == QPDFTokenizer::tt_bad)
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index c11c8218..e03f927b 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -13,6 +13,79 @@
#include <string.h>
#include <cstdlib>
+static bool is_delimiter(char ch)
+{
+ return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
+}
+
+class QPDFWordTokenFinder: public InputSource::Finder
+{
+ public:
+ QPDFWordTokenFinder(PointerHolder<InputSource> is,
+ std::string const& str) :
+ is(is),
+ str(str)
+ {
+ }
+ virtual ~QPDFWordTokenFinder()
+ {
+ }
+ virtual bool check();
+
+ private:
+ PointerHolder<InputSource> is;
+ std::string str;
+};
+
+bool
+QPDFWordTokenFinder::check()
+{
+ // Find a word token matching the given string, preceded by a
+ // delimiter, and followed by a delimiter or EOF.
+ QPDFTokenizer tokenizer;
+ QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
+ qpdf_offset_t pos = is->tell();
+ if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
+ {
+/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+ return false;
+ }
+ qpdf_offset_t token_start = is->getLastOffset();
+ char next;
+ bool next_okay = false;
+ if (is->read(&next, 1) == 0)
+ {
+ QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+ next_okay = true;
+ }
+ else
+ {
+ next_okay = is_delimiter(next);
+ }
+ is->seek(pos, SEEK_SET);
+ if (! next_okay)
+ {
+/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
+ return false;
+ }
+ if (token_start == 0)
+ {
+ // Can't actually happen...we never start the search at the
+ // beginning of the input.
+ return false;
+ }
+ is->seek(token_start - 1, SEEK_SET);
+ char prev;
+ bool prev_okay = ((is->read(&prev, 1) == 1) && is_delimiter(prev));
+ is->seek(pos, SEEK_SET);
+ if (! prev_okay)
+ {
+/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
+ return false;
+ }
+ return true;
+}
+
QPDFTokenizer::Members::Members() :
pound_special_in_name(true),
allow_eof(false),
@@ -31,6 +104,7 @@ QPDFTokenizer::Members::reset()
error_message = "";
unread_char = false;
char_to_unread = '\0';
+ inline_image_bytes = 0;
string_depth = 0;
string_ignoring_newline = false;
last_char_was_bs = false;
@@ -91,7 +165,7 @@ QPDFTokenizer::isSpace(char ch)
bool
QPDFTokenizer::isDelimiter(char ch)
{
- return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
+ return is_delimiter(ch);
}
void
@@ -470,12 +544,21 @@ QPDFTokenizer::presentCharacter(char ch)
{
this->m->val += ch;
size_t len = this->m->val.length();
- if ((len >= 4) &&
- isDelimiter(this->m->val.at(len-4)) &&
- (this->m->val.at(len-3) == 'E') &&
- (this->m->val.at(len-2) == 'I') &&
- isDelimiter(this->m->val.at(len-1)))
+ if (len == this->m->inline_image_bytes)
+ {
+ QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
+ this->m->type = tt_inline_image;
+ this->m->inline_image_bytes = 0;
+ this->m->state = st_token_ready;
+ }
+ else if ((this->m->inline_image_bytes == 0) &&
+ (len >= 4) &&
+ isDelimiter(this->m->val.at(len-4)) &&
+ (this->m->val.at(len-3) == 'E') &&
+ (this->m->val.at(len-2) == 'I') &&
+ isDelimiter(this->m->val.at(len-1)))
{
+ QTC::TC("qpdf", "QPDFTokenizer found EI the old way");
this->m->val.erase(len - 1);
this->m->type = tt_inline_image;
this->m->unread_char = true;
@@ -562,7 +645,7 @@ QPDFTokenizer::presentEOF()
(this->m->val.at(len-2) == 'E') &&
(this->m->val.at(len-1) == 'I'))
{
- QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+ QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way");
this->m->type = tt_inline_image;
this->m->state = st_token_ready;
}
@@ -598,6 +681,26 @@ QPDFTokenizer::presentEOF()
void
QPDFTokenizer::expectInlineImage()
{
+ expectInlineImage(PointerHolder<InputSource>());
+}
+
+void
+QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
+{
+ if (input.getPointer())
+ {
+ qpdf_offset_t last_offset = input->getLastOffset();
+ qpdf_offset_t pos = input->tell();
+
+ QPDFWordTokenFinder f(input, "EI");
+ if (input->findFirst("EI", pos, 0, f))
+ {
+ this->m->inline_image_bytes = input->tell() - pos;
+ }
+
+ input->seek(pos, SEEK_SET);
+ input->setLastOffset(last_offset);
+ }
if (this->m->state != st_top)
{
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 5150e567..6dcebd6e 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -430,3 +430,6 @@ QPDFPageObjectHelper copy shared attribute 0
qpdf from_nr from repeat_nr 0
QPDF resolve duplicated page object 0
QPDF handle direct page object 0
+QPDFTokenizer found EI the old way 0
+QPDFTokenizer found EI by byte count 0
+QPDFTokenizer inline image at EOF the old way 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index ca7ea12b..6abc7edb 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -694,7 +694,7 @@ $td->runtest("check pass1 file",
show_ntests();
# ----------
$td->notify("--- Tokenizer ---");
-$n_tests += 4;
+$n_tests += 5;
$td->runtest("tokenizer with no ignorable",
{$td->COMMAND => "test_tokenizer -no-ignorable tokens.pdf"},
@@ -706,6 +706,11 @@ $td->runtest("tokenizer",
{$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
+$td->runtest("tokenizer with old inline image code",
+ {$td->COMMAND => "test_tokenizer -old-ei tokens.pdf"},
+ {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+
$td->runtest("tokenizer with max_len",
{$td->COMMAND => "test_tokenizer -maxlen 50 tokens.pdf"},
{$td->FILE => "tokens-maxlen.out", $td->EXIT_STATUS => 0},
diff --git a/qpdf/test_tokenizer.cc b/qpdf/test_tokenizer.cc
index 9f65281b..ecbb3552 100644
--- a/qpdf/test_tokenizer.cc
+++ b/qpdf/test_tokenizer.cc
@@ -16,7 +16,7 @@ static char const* whoami = 0;
void usage()
{
std::cerr << "Usage: " << whoami
- << " [-maxlen len | -no-ignorable] filename"
+ << " [-maxlen len | -no-ignorable | -old-ei] filename"
<< std::endl;
exit(2);
}
@@ -132,7 +132,7 @@ try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
static void
dump_tokens(PointerHolder<InputSource> is, std::string const& label,
size_t max_len, bool include_ignorable,
- bool skip_streams, bool skip_inline_images)
+ bool skip_streams, bool skip_inline_images, bool old_ei)
{
Finder f1(is, "endstream");
std::cout << "--- BEGIN " << label << " ---" << std::endl;
@@ -183,7 +183,14 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
else if (skip_inline_images &&
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
{
- tokenizer.expectInlineImage();
+ if (old_ei)
+ {
+ tokenizer.expectInlineImage();
+ }
+ else
+ {
+ tokenizer.expectInlineImage(is);
+ }
inline_image_offset = is->tell();
}
else if (token.getType() == QPDFTokenizer::tt_eof)
@@ -195,7 +202,7 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
}
static void process(char const* filename, bool include_ignorable,
- size_t max_len)
+ size_t max_len, bool old_ei)
{
PointerHolder<InputSource> is;
@@ -203,7 +210,7 @@ static void process(char const* filename, bool include_ignorable,
FileInputSource* fis = new FileInputSource();
fis->setFilename(filename);
is = fis;
- dump_tokens(is, "FILE", max_len, include_ignorable, true, false);
+ dump_tokens(is, "FILE", max_len, include_ignorable, true, false, false);
// Tokenize content streams, skipping inline images
QPDF qpdf;
@@ -222,7 +229,7 @@ static void process(char const* filename, bool include_ignorable,
"content data", content_data.getPointer());
is = bis;
dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
- max_len, include_ignorable, false, true);
+ max_len, include_ignorable, false, true, old_ei);
}
// Tokenize object streams
@@ -241,7 +248,7 @@ static void process(char const* filename, bool include_ignorable,
is = bis;
dump_tokens(is, "OBJECT STREAM " +
QUtil::int_to_string((*iter).getObjectID()),
- max_len, include_ignorable, false, false);
+ max_len, include_ignorable, false, false, false);
}
}
}
@@ -266,6 +273,7 @@ int main(int argc, char* argv[])
char const* filename = 0;
size_t max_len = 0;
bool include_ignorable = true;
+ bool old_ei = false;
for (int i = 1; i < argc; ++i)
{
if (argv[i][0] == '-')
@@ -282,6 +290,10 @@ int main(int argc, char* argv[])
{
include_ignorable = false;
}
+ else if (strcmp(argv[i], "-old-ei") == 0)
+ {
+ old_ei = true;
+ }
else
{
usage();
@@ -303,7 +315,7 @@ int main(int argc, char* argv[])
try
{
- process(filename, include_ignorable, max_len);
+ process(filename, include_ignorable, max_len, old_ei);
}
catch (std::exception& e)
{