summary refs log tree commit diff stats
path: root/libqpdf/QPDFTokenizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libqpdf/QPDFTokenizer.cc')
-rw-r--r-- libqpdf/QPDFTokenizer.cc 123
1 files changed, 54 insertions, 69 deletions
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index da02a0fe..d98af8a9 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -1,8 +1,7 @@
#include <qpdf/QPDFTokenizer.hh>
-// DO NOT USE ctype -- it is locale dependent for some things, and
-// it's not worth the risk of including it in case it may accidentally
-// be used.
+// DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
+// including it in case it may accidentally be used.
#include <qpdf/QIntC.hh>
#include <qpdf/QPDFExc.hh>
@@ -45,8 +44,8 @@ namespace
bool
QPDFWordTokenFinder::check()
{
- // Find a word token matching the given string, preceded by a
- // delimiter, and followed by a delimiter or EOF.
+ // Find a word token matching the given string, preceded by a delimiter, and followed by a
+ // delimiter or EOF.
QPDFTokenizer tokenizer;
QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
qpdf_offset_t pos = is->tell();
@@ -68,8 +67,7 @@ QPDFWordTokenFinder::check()
return false;
}
if (token_start == 0) {
- // Can't actually happen...we never start the search at the
- // beginning of the input.
+ // Can't actually happen...we never start the search at the beginning of the input.
return false;
}
return true;
@@ -147,9 +145,9 @@ QPDFTokenizer::presentCharacter(char ch)
void
QPDFTokenizer::handleCharacter(char ch)
{
- // State machine is implemented such that the final character may not be
- // handled. This happens whenever you have to use a character from the
- // next token to detect the end of the current token.
+ // State machine is implemented such that the final character may not be handled. This happens
+ // whenever you have to use a character from the next token to detect the end of the current
+ // token.
switch (this->state) {
case st_top:
@@ -248,15 +246,14 @@ QPDFTokenizer::handleCharacter(char ch)
void
QPDFTokenizer::inTokenReady(char ch)
{
- throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character "
- "while token is waiting");
+ throw std::logic_error(
+ "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting");
}
void
QPDFTokenizer::inBeforeToken(char ch)
{
- // Note: we specifically do not use ctype here. It is
- // locale-dependent.
+ // Note: we specifically do not use ctype here. It is locale-dependent.
if (isSpace(ch)) {
this->before_token = !this->include_ignorable;
this->in_token = this->include_ignorable;
@@ -421,11 +418,9 @@ void
QPDFTokenizer::inName(char ch)
{
if (isDelimiter(ch)) {
- // A C-locale whitespace character or delimiter terminates
- // token. It is important to unread the whitespace
- // character even though it is ignored since it may be the
- // newline after a stream keyword. Removing it here could
- // make the stream-reading code break on some files,
+ // A C-locale whitespace character or delimiter terminates token. It is important to unread
+ // the whitespace character even though it is ignored since it may be the newline after a
+ // stream keyword. Removing it here could make the stream-reading code break on some files,
// though not on any files in the test suite as of this
// writing.
@@ -452,8 +447,7 @@ QPDFTokenizer::inNameHex1(char ch)
} else {
QTC::TC("qpdf", "QPDFTokenizer bad name 1");
this->error_message = "name with stray # will not work with PDF >= 1.2";
- // Use null to encode a bad # -- this is reversed
- // in QPDF_Name::normalizeName.
+ // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName.
this->val += '\0';
this->state = st_name;
inName(ch);
@@ -468,8 +462,7 @@ QPDFTokenizer::inNameHex2(char ch)
} else {
QTC::TC("qpdf", "QPDFTokenizer bad name 2");
this->error_message = "name with stray # will not work with PDF >= 1.2";
- // Use null to encode a bad # -- this is reversed
- // in QPDF_Name::normalizeName.
+ // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName.
this->val += '\0';
this->val += this->hex_char;
this->state = st_name;
@@ -636,13 +629,10 @@ void
QPDFTokenizer::inLiteral(char ch)
{
if (isDelimiter(ch)) {
- // A C-locale whitespace character or delimiter terminates
- // token. It is important to unread the whitespace
- // character even though it is ignored since it may be the
- // newline after a stream keyword. Removing it here could
- // make the stream-reading code break on some files,
- // though not on any files in the test suite as of this
- // writing.
+ // A C-locale whitespace character or delimiter terminates token. It is important to unread
+ // the whitespace character even though it is ignored since it may be the newline after a
+ // stream keyword. Removing it here could make the stream-reading code break on some files,
+ // though not on any files in the test suite as of this writing.
this->in_token = false;
this->char_to_unread = ch;
@@ -707,8 +697,7 @@ QPDFTokenizer::inCharCode(char ch)
if (++(this->digit_count) < 3) {
return;
}
- // We've accumulated \ddd. PDF Spec says to ignore
- // high-order overflow.
+ // We've accumulated \ddd. PDF Spec says to ignore high-order overflow.
}
this->val += char(this->char_code % 256);
this->state = st_in_string;
@@ -739,8 +728,7 @@ QPDFTokenizer::presentEOF()
case st_decimal:
case st_literal:
QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
- // Push any delimiter to the state machine to finish off the final
- // token.
+ // Push any delimiter to the state machine to finish off the final token.
presentCharacter('\f');
this->in_token = true;
break;
@@ -794,14 +782,12 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
qpdf_offset_t last_offset = input->getLastOffset();
qpdf_offset_t pos = input->tell();
- // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
- // Then read the next several tokens or up to EOF. If we find any
- // suspicious-looking or tokens, this is probably still part of
- // the image data, so keep looking for EI. Stop at the first EI
- // that passes. If we get to the end without finding one, return
- // the last EI we found. Store the number of bytes expected in the
- // inline image including the EI and use that to break out of
- // inline image, falling back to the old method if needed.
+ // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
+ // tokens or up to EOF. If we find any suspicious-looking tokens, this is probably still part
+ // of the image data, so keep looking for EI. Stop at the first EI that passes. If we get to the
+ // end without finding one, return the last EI we found. Store the number of bytes expected in
+ // the inline image including the EI and use that to break out of inline image, falling back to
+ // the old method if needed.
bool okay = false;
bool first_try = true;
@@ -814,13 +800,11 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
QPDFTokenizer check;
bool found_bad = false;
- // Look at the next 10 tokens or up to EOF. The next inline
- // image's image data would look like bad tokens, but there
- // will always be at least 10 tokens between one inline
- // image's EI and the next valid one's ID since width, height,
- // bits per pixel, and color space are all required as well as
- // a BI and ID. If we get 10 good tokens in a row or hit EOF,
- // we can be pretty sure we've found the actual EI.
+ // Look at the next 10 tokens or up to EOF. The next inline image's image data would look
+ // like bad tokens, but there will always be at least 10 tokens between one inline image's
+ // EI and the next valid one's ID since width, height, bits per pixel, and color space are
+ // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can
+ // be pretty sure we've found the actual EI.
for (int i = 0; i < 10; ++i) {
QPDFTokenizer::Token t = check.readToken(input, "checker", true);
token_type_e type = t.getType();
@@ -829,27 +813,22 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
} else if (type == tt_bad) {
found_bad = true;
} else if (t.isWord()) {
- // The qpdf tokenizer lumps alphabetic and otherwise
- // uncategorized characters into "words". We recognize
- // strings of alphabetic characters as potential valid
- // operators for purposes of telling whether we're in
- // valid content or not. It's not perfect, but it
- // should work more reliably than what we used to do,
- // which was already good enough for the vast majority
- // of files.
+ // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into
+ // "words". We recognize strings of alphabetic characters as potential valid
+ // operators for purposes of telling whether we're in valid content or not. It's not
+ // perfect, but it should work more reliably than what we used to do, which was
+ // already good enough for the vast majority of files.
bool found_alpha = false;
bool found_non_printable = false;
bool found_other = false;
for (char ch: t.getValue()) {
if (((ch >= 'a') && (ch <= 'z')) || ((ch >= 'A') && (ch <= 'Z')) ||
(ch == '*')) {
- // Treat '*' as alpha since there are valid
- // PDF operators that contain * along with
- // alphabetic characters.
+ // Treat '*' as alpha since there are valid PDF operators that contain *
+ // along with alphabetic characters.
found_alpha = true;
} else if ((static_cast<signed char>(ch) < 32) && (!isSpace(ch))) {
- // Compare ch as a signed char so characters
- // outside of 7-bit will be < 0.
+ // Compare ch as a signed char so characters outside of 7-bit will be < 0.
found_non_printable = true;
break;
} else {
@@ -903,9 +882,9 @@ QPDFTokenizer::betweenTokens()
QPDFTokenizer::Token
QPDFTokenizer::readToken(
- std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
+ InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
{
- nextToken(*input, context, max_len);
+ nextToken(input, context, max_len);
Token token;
bool unread_char;
@@ -918,15 +897,22 @@ QPDFTokenizer::readToken(
} else {
throw QPDFExc(
qpdf_e_damaged_pdf,
- input->getName(),
+ input.getName(),
context,
- input->getLastOffset(),
+ input.getLastOffset(),
token.getErrorMessage());
}
}
return token;
}
+QPDFTokenizer::Token
+QPDFTokenizer::readToken(
+ std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
+{
+ return readToken(*input, context, allow_bad, max_len);
+}
+
bool
QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len)
{
@@ -941,9 +927,8 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t
presentEOF();
if ((this->type == tt_eof) && (!this->allow_eof)) {
- // Nothing in the qpdf library calls readToken
- // without allowEOF anymore, so this case is not
- // exercised.
+ // Nothing in the qpdf library calls readToken without allowEOF anymore, so this
+ // case is not exercised.
this->type = tt_bad;
this->error_message = "unexpected EOF";
offset = input.getLastOffset();