aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf/QPDFTokenizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libqpdf/QPDFTokenizer.cc')
-rw-r--r-- libqpdf/QPDFTokenizer.cc 246
1 files changed, 230 insertions, 16 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 9c2a1e05..80fcf347 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -13,6 +13,69 @@
#include <string.h>
#include <cstdlib>
+static bool is_delimiter(char ch) // true when ch is whitespace or one of ( ) < > [ ] { } / %
+{
+ return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0); // NB: strchr also matches the terminating NUL, so '\0' yields true
+}
+
+class QPDFWordTokenFinder: public InputSource::Finder // finder whose check() accepts only a word token equal to `str`
+{
+ public:
+ QPDFWordTokenFinder(PointerHolder<InputSource> is,
+ std::string const& str) :
+ is(is),
+ str(str)
+ {
+ }
+ virtual ~QPDFWordTokenFinder()
+ {
+ }
+ virtual bool check(); // defined below; presumably overrides InputSource::Finder::check -- confirm against the base class
+
+ private:
+ PointerHolder<InputSource> is; // input that check() tokenizes and repositions
+ std::string str; // exact word value the token read by check() must have
+};
+
+bool
+QPDFWordTokenFinder::check()
+{
+ // Find a word token matching the given string, preceded by a
+ // delimiter, and followed by a delimiter or EOF.
+ QPDFTokenizer tokenizer;
+ QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); // read one token from the current position of `is`
+ qpdf_offset_t pos = is->tell(); // position after readToken; restored below once the following byte has been inspected
+ if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
+ {
+ QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+ return false;
+ }
+ qpdf_offset_t token_start = is->getLastOffset(); // presumably the offset at which the token just read began -- confirm against InputSource
+ char next;
+ bool next_okay = false;
+ if (is->read(&next, 1) == 0) // zero bytes read: nothing follows the token, which is accepted
+ {
+ QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+ next_okay = true;
+ }
+ else
+ {
+ next_okay = is_delimiter(next);
+ }
+ is->seek(pos, SEEK_SET); // undo the one-byte read so the caller sees the position recorded in `pos`
+ if (! next_okay)
+ {
+ return false;
+ }
+ if (token_start == 0) // NOTE(review): the byte before the token is never inspected here; only offset 0 is rejected -- confirm the "preceded by a delimiter" claim above
+ {
+ // Can't actually happen...we never start the search at the
+ // beginning of the input.
+ return false;
+ }
+ return true;
+}
+
QPDFTokenizer::Members::Members() :
pound_special_in_name(true),
allow_eof(false),
@@ -31,9 +94,11 @@ QPDFTokenizer::Members::reset()
error_message = "";
unread_char = false;
char_to_unread = '\0';
+ inline_image_bytes = 0;
string_depth = 0;
string_ignoring_newline = false;
last_char_was_bs = false;
+ last_char_was_cr = false;
}
QPDFTokenizer::Members::~Members()
@@ -90,7 +155,7 @@ QPDFTokenizer::isSpace(char ch)
bool
QPDFTokenizer::isDelimiter(char ch)
{
- return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
+ return is_delimiter(ch);
}
void
@@ -217,6 +282,7 @@ QPDFTokenizer::presentCharacter(char ch)
memset(this->m->bs_num_register, '\0',
sizeof(this->m->bs_num_register));
this->m->last_char_was_bs = false;
+ this->m->last_char_was_cr = false;
this->m->state = st_in_string;
}
else if (ch == '<')
@@ -334,8 +400,7 @@ QPDFTokenizer::presentCharacter(char ch)
}
else if (this->m->state == st_in_string)
{
- if (this->m->string_ignoring_newline &&
- (! ((ch == '\r') || (ch == '\n'))))
+ if (this->m->string_ignoring_newline && (ch != '\n'))
{
this->m->string_ignoring_newline = false;
}
@@ -353,9 +418,10 @@ QPDFTokenizer::presentCharacter(char ch)
bs_num_count = 0;
}
- if (this->m->string_ignoring_newline && ((ch == '\r') || (ch == '\n')))
+ if (this->m->string_ignoring_newline && (ch == '\n'))
{
// ignore
+ this->m->string_ignoring_newline = false;
}
else if (ch_is_octal &&
(this->m->last_char_was_bs || (bs_num_count > 0)))
@@ -386,8 +452,10 @@ QPDFTokenizer::presentCharacter(char ch)
this->m->val += '\f';
break;
- case '\r':
case '\n':
+ break;
+
+ case '\r':
this->m->string_ignoring_newline = true;
break;
@@ -417,11 +485,26 @@ QPDFTokenizer::presentCharacter(char ch)
this->m->type = tt_string;
this->m->state = st_token_ready;
}
+ else if (ch == '\r')
+ {
+ // CR by itself is converted to LF
+ this->m->val += '\n';
+ }
+ else if (ch == '\n')
+ {
+ // CR LF is converted to LF
+ if (! this->m->last_char_was_cr)
+ {
+ this->m->val += ch;
+ }
+ }
else
{
this->m->val += ch;
}
+ this->m->last_char_was_cr =
+ ((! this->m->string_ignoring_newline) && (ch == '\r'));
this->m->last_char_was_bs =
((! this->m->last_char_was_bs) && (ch == '\\'));
}
@@ -449,21 +532,28 @@ QPDFTokenizer::presentCharacter(char ch)
}
else if (this->m->state == st_inline_image)
{
+ this->m->val += ch;
size_t len = this->m->val.length();
- if ((len >= 4) &&
- isDelimiter(this->m->val.at(len-4)) &&
- (this->m->val.at(len-3) == 'E') &&
- (this->m->val.at(len-2) == 'I') &&
- isDelimiter(this->m->val.at(len-1)))
+ if (len == this->m->inline_image_bytes)
{
+ QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
this->m->type = tt_inline_image;
- this->m->unread_char = true;
- this->m->char_to_unread = ch;
+ this->m->inline_image_bytes = 0;
this->m->state = st_token_ready;
}
- else
+ else if ((this->m->inline_image_bytes == 0) &&
+ (len >= 4) &&
+ isDelimiter(this->m->val.at(len-4)) &&
+ (this->m->val.at(len-3) == 'E') &&
+ (this->m->val.at(len-2) == 'I') &&
+ isDelimiter(this->m->val.at(len-1)))
{
- this->m->val += ch;
+ QTC::TC("qpdf", "QPDFTokenizer found EI the old way");
+ this->m->val.erase(len - 1);
+ this->m->type = tt_inline_image;
+ this->m->unread_char = true;
+ this->m->char_to_unread = ch;
+ this->m->state = st_token_ready;
}
}
else
@@ -471,7 +561,6 @@ QPDFTokenizer::presentCharacter(char ch)
handled = false;
}
-
if (handled)
{
// okay
@@ -546,7 +635,7 @@ QPDFTokenizer::presentEOF()
(this->m->val.at(len-2) == 'E') &&
(this->m->val.at(len-1) == 'I'))
{
- QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
+ QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way");
this->m->type = tt_inline_image;
this->m->state = st_token_ready;
}
@@ -582,14 +671,139 @@ QPDFTokenizer::presentEOF()
void
QPDFTokenizer::expectInlineImage()
{
+ expectInlineImage(PointerHolder<InputSource>()); // passes an empty holder; findEI() returns without scanning when the holder has no pointer
+}
+
+void
+QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
+{
if (this->m->state != st_top) // only legal when the tokenizer is in st_top; otherwise std::logic_error is thrown
{
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
" when tokenizer is in improper state");
}
+ findEI(input); // may set inline_image_bytes by scanning `input` for EI; does nothing when `input` holds no pointer
this->m->state = st_inline_image;
}
+void
+QPDFTokenizer::findEI(PointerHolder<InputSource> input)
+{
+ if (! input.getPointer())
+ {
+ return;
+ }
+
+ qpdf_offset_t last_offset = input->getLastOffset(); // saved together with `pos` so both can be restored at the end of the scan
+ qpdf_offset_t pos = input->tell();
+
+ // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
+ // Then read the next several tokens or up to EOF. If we find any
+ // suspicious-looking or bad tokens, this is probably still part of
+ // the image data, so keep looking for EI. Stop at the first EI
+ // that passes. If we get to the end without finding one, return
+ // the last EI we found. Store the number of bytes expected in the
+ // inline image including the EI and use that to break out of
+ // inline image, falling back to the old method if needed.
+
+ bool okay = false;
+ bool first_try = true;
+ while (! okay)
+ {
+ QPDFWordTokenFinder f(input, "EI");
+ if (! input->findFirst("EI", input->tell(), 0, f))
+ {
+ break;
+ }
+ this->m->inline_image_bytes = input->tell() - pos - 2; // NOTE(review): the "- 2" presumably excludes the two-byte "EI" itself; confirm against the "including the EI" wording above
+
+ QPDFTokenizer check;
+ bool found_bad = false;
+ // Look at the next 10 tokens or up to EOF. The next inline
+ // image's image data would look like bad tokens, but there
+ // will always be at least 10 tokens between one inline
+ // image's EI and the next valid one's ID since width, height,
+ // bits per pixel, and color space are all required as well as
+ // a BI and ID. If we get 10 good tokens in a row or hit EOF,
+ // we can be pretty sure we've found the actual EI.
+ for (int i = 0; i < 10; ++i)
+ {
+ QPDFTokenizer::Token t =
+ check.readToken(input, "checker", true);
+ token_type_e type = t.getType();
+ if (type == tt_eof)
+ {
+ okay = true;
+ }
+ else if (type == tt_bad)
+ {
+ found_bad = true;
+ }
+ else if (type == tt_word)
+ {
+ // The qpdf tokenizer lumps alphabetic and otherwise
+ // uncategorized characters into "words". We recognize
+ // strings of alphabetic characters as potential valid
+ // operators for purposes of telling whether we're in
+ // valid content or not. It's not perfect, but it
+ // should work more reliably than what we used to do,
+ // which was already good enough for the vast majority
+ // of files.
+ bool found_alpha = false;
+ bool found_non_printable = false;
+ bool found_other = false;
+ std::string value = t.getValue();
+ for (std::string::iterator iter = value.begin();
+ iter != value.end(); ++iter)
+ {
+ char ch = *iter;
+ if (((ch >= 'a') && (ch <= 'z')) ||
+ ((ch >= 'A') && (ch <= 'Z')) ||
+ (ch == '*'))
+ {
+ // Treat '*' as alpha since there are valid
+ // PDF operators that contain * along with
+ // alphabetic characters.
+ found_alpha = true;
+ }
+ else if (((ch < 32) && (! isSpace(ch))) || (ch > 127)) // NOTE(review): where char is signed, (ch > 127) is never true; bytes >= 0x80 are negative and presumably caught by (ch < 32) -- confirm
+ {
+ found_non_printable = true;
+ break;
+ }
+ else
+ {
+ found_other = true;
+ }
+ }
+ if (found_non_printable || (found_alpha && found_other)) // a word mixing letters with other printable characters is treated as suspicious
+ {
+ found_bad = true;
+ }
+ }
+ if (okay || found_bad)
+ {
+ break;
+ }
+ }
+ if (! found_bad)
+ {
+ okay = true;
+ }
+ if (! okay)
+ {
+ first_try = false;
+ }
+ }
+ if (okay && (! first_try))
+ {
+ QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
+ }
+
+ input->seek(pos, SEEK_SET); // the scan only probes ahead: put the read position back where it started
+ input->setLastOffset(last_offset);
+}
+
bool
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
{