about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 7
-rw-r--r-- TODO 9
-rw-r--r-- include/qpdf/BufferInputSource.hh 25
-rw-r--r-- include/qpdf/ClosedFileInputSource.hh 21
-rw-r--r-- include/qpdf/FileInputSource.hh 21
-rw-r--r-- include/qpdf/InputSource.hh 68
-rw-r--r-- include/qpdf/QPDFTokenizer.hh 94
-rw-r--r-- include/qpdf/QUtil.hh 55
-rw-r--r-- libqpdf/BufferInputSource.cc 85
-rw-r--r-- libqpdf/ClosedFileInputSource.cc 46
-rw-r--r-- libqpdf/FileInputSource.cc 85
-rw-r--r-- libqpdf/QPDFTokenizer.cc 1105
-rw-r--r-- libqpdf/QUtil.cc 46
-rw-r--r-- manual/release-notes.rst 3
-rw-r--r-- qpdf/qpdf.testcov 4
15 files changed, 1024 insertions, 650 deletions
diff --git a/ChangeLog b/ChangeLog
index b861ddda..093eeaf5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2022-08-27 Jay Berkenbilt <ejb@ql.org>
+
+ * From m-holger: major refactoring of QPDFTokenizer to improve
+ readability and to optimize performance. This also included some
+ optimizations to some InputSource classes. Thanks for this
+ excellent contribution. Fixes #749, #442.
+
2022-08-07 Jay Berkenbilt <ejb@ql.org>
* Add new build configuration option ENABLE_QTC, which is off by
diff --git a/TODO b/TODO
index 1b452805..a4628397 100644
--- a/TODO
+++ b/TODO
@@ -4,6 +4,7 @@ Next
Before Release:
+* Review in order #729, #726, #747
* Make ./performance_check usable by other people by having published
files to use for testing.
* https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf
@@ -26,6 +27,14 @@ Pending changes:
Soon: Break ground on "Document-level work"
+Remove raw pointers from the API
+================================
+
+(For qpdf >= 12)
+
+See if we can remove raw pointers from the QPDF API. There's a
+discussion in https://github.com/qpdf/qpdf/pull/747.
+
Fix Multiple Direct Object Owner Issue
======================================
diff --git a/include/qpdf/BufferInputSource.hh b/include/qpdf/BufferInputSource.hh
index b965704f..1a93815b 100644
--- a/include/qpdf/BufferInputSource.hh
+++ b/include/qpdf/BufferInputSource.hh
@@ -54,26 +54,11 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource
virtual void unreadCh(char ch);
private:
- class QPDF_DLL_PRIVATE Members
- {
- friend class BufferInputSource;
-
- public:
- QPDF_DLL
- ~Members() = default;
-
- private:
- Members(bool own_memory, std::string const& description, Buffer* buf);
- Members(Members const&) = delete;
-
- bool own_memory;
- std::string description;
- Buffer* buf;
- qpdf_offset_t cur_offset;
- qpdf_offset_t max_offset;
- };
-
- std::shared_ptr<Members> m;
+ bool own_memory;
+ std::string description;
+ Buffer* buf;
+ qpdf_offset_t cur_offset;
+ qpdf_offset_t max_offset;
};
#endif // QPDF_BUFFERINPUTSOURCE_HH
diff --git a/include/qpdf/ClosedFileInputSource.hh b/include/qpdf/ClosedFileInputSource.hh
index c72a1df8..b23c2767 100644
--- a/include/qpdf/ClosedFileInputSource.hh
+++ b/include/qpdf/ClosedFileInputSource.hh
@@ -73,23 +73,10 @@ class QPDF_DLL_CLASS ClosedFileInputSource: public InputSource
QPDF_DLL_PRIVATE
void after();
- class QPDF_DLL_PRIVATE Members
- {
- friend class ClosedFileInputSource;
-
- public:
- QPDF_DLL
- ~Members() = default;
-
- private:
- Members(char const* filename);
-
- std::string filename;
- qpdf_offset_t offset;
- std::shared_ptr<FileInputSource> fis;
- bool stay_open;
- };
- std::shared_ptr<Members> m;
+ std::string filename;
+ qpdf_offset_t offset;
+ std::shared_ptr<FileInputSource> fis;
+ bool stay_open;
};
#endif // QPDF_CLOSEDFILEINPUTSOURCE_HH
diff --git a/include/qpdf/FileInputSource.hh b/include/qpdf/FileInputSource.hh
index f1e7edf4..9e0d57fb 100644
--- a/include/qpdf/FileInputSource.hh
+++ b/include/qpdf/FileInputSource.hh
@@ -58,24 +58,9 @@ class QPDF_DLL_CLASS FileInputSource: public InputSource
FileInputSource(FileInputSource const&) = delete;
FileInputSource& operator=(FileInputSource const&) = delete;
- class QPDF_DLL_PRIVATE Members
- {
- friend class FileInputSource;
-
- public:
- QPDF_DLL
- ~Members();
-
- private:
- Members(bool close_file);
- Members(Members const&) = delete;
-
- bool close_file;
- std::string filename;
- FILE* file;
- };
-
- std::shared_ptr<Members> m;
+ bool close_file;
+ std::string filename;
+ FILE* file;
};
#endif // QPDF_FILEINPUTSOURCE_HH
diff --git a/include/qpdf/InputSource.hh b/include/qpdf/InputSource.hh
index 9feb8ec3..e9d99cdb 100644
--- a/include/qpdf/InputSource.hh
+++ b/include/qpdf/InputSource.hh
@@ -93,6 +93,12 @@ class QPDF_DLL_CLASS InputSource
// efficient.
virtual void unreadCh(char ch) = 0;
+ // The following methods are for use by QPDFTokenizer
+ inline qpdf_offset_t fastTell();
+ inline bool fastRead(char&);
+ inline void fastUnread(bool);
+ inline void loadBuffer();
+
protected:
qpdf_offset_t last_offset;
@@ -111,6 +117,68 @@ class QPDF_DLL_CLASS InputSource
};
std::shared_ptr<Members> m;
+
+ // State for fast... methods
+ static const qpdf_offset_t buf_size = 128;
+ char buffer[buf_size];
+ qpdf_offset_t buf_len = 0;
+ qpdf_offset_t buf_idx = 0;
+ qpdf_offset_t buf_start = 0;
};
+inline void
+InputSource::loadBuffer()
+{
+ this->buf_idx = 0;
+ this->buf_len = qpdf_offset_t(read(this->buffer, this->buf_size));
+ // NB read sets last_offset
+ this->buf_start = this->last_offset;
+}
+
+inline qpdf_offset_t
+InputSource::fastTell()
+{
+ if (this->buf_len == 0) {
+ loadBuffer();
+ } else {
+ auto curr = tell();
+ if (curr < this->buf_start ||
+ curr >= (this->buf_start + this->buf_len)) {
+ loadBuffer();
+ } else {
+ this->last_offset = curr;
+ this->buf_idx = curr - this->buf_start;
+ }
+ }
+ return this->last_offset;
+}
+
+inline bool
+InputSource::fastRead(char& ch)
+{
+ // Before calling fastRead, fastTell must be called to prepare the buffer.
+ // Once reading is complete, fastUnread must be called to set the correct
+ // file position.
+ if (this->buf_idx < this->buf_len) {
+ ch = this->buffer[this->buf_idx];
+ ++(this->buf_idx);
+ ++(this->last_offset);
+ return true;
+
+ } else if (this->buf_len == 0) {
+ return false;
+ } else {
+ seek(this->buf_start + this->buf_len, SEEK_SET);
+ fastTell();
+ return fastRead(ch);
+ }
+}
+
+inline void
+InputSource::fastUnread(bool back)
+{
+ this->last_offset -= back ? 1 : 0;
+ seek(this->last_offset, SEEK_SET);
+}
+
#endif // QPDF_INPUTSOURCE_HH
diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh
index 2187f21e..33b2e710 100644
--- a/include/qpdf/QPDFTokenizer.hh
+++ b/include/qpdf/QPDFTokenizer.hh
@@ -193,60 +193,82 @@ class QPDFTokenizer
QPDFTokenizer(QPDFTokenizer const&) = delete;
QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
- void resolveLiteral();
bool isSpace(char);
bool isDelimiter(char);
void findEI(std::shared_ptr<InputSource> input);
enum state_e {
st_top,
+ st_in_hexstring,
+ st_in_string,
+ st_in_hexstring_2nd,
+ st_name,
+ st_literal,
st_in_space,
st_in_comment,
- st_in_string,
+ st_string_escape,
+ st_char_code,
+ st_string_after_cr,
st_lt,
st_gt,
- st_literal,
- st_in_hexstring,
st_inline_image,
+ st_sign,
+ st_number,
+ st_real,
+ st_decimal,
+ st_name_hex1,
+ st_name_hex2,
+ st_before_token,
st_token_ready
};
- class Members
- {
- friend class QPDFTokenizer;
-
- public:
- QPDF_DLL
- ~Members() = default;
+ void handleCharacter(char);
+ void inBeforeToken(char);
+ void inTop(char);
+ void inSpace(char);
+ void inComment(char);
+ void inString(char);
+ void inName(char);
+ void inLt(char);
+ void inGt(char);
+ void inStringAfterCR(char);
+ void inStringEscape(char);
+ void inLiteral(char);
+ void inCharCode(char);
+ void inHexstring(char);
+ void inHexstring2nd(char);
+ void inInlineImage(char);
+ void inTokenReady(char);
+ void inNameHex1(char);
+ void inNameHex2(char);
+ void inSign(char);
+ void inDecimal(char);
+ void inNumber(char);
+ void inReal(char);
+ void reset();
- private:
- Members();
- Members(Members const&) = delete;
- void reset();
+ // Lexer state
+ state_e state;
- // Lexer state
- state_e state;
+ bool allow_eof;
+ bool include_ignorable;
- bool allow_eof;
- bool include_ignorable;
+ // Current token accumulation
+ token_type_e type;
+ std::string val;
+ std::string raw_val;
+ std::string error_message;
+ bool before_token;
+ bool in_token;
+ char char_to_unread;
+ size_t inline_image_bytes;
+ bool bad;
- // Current token accumulation
- token_type_e type;
- std::string val;
- std::string raw_val;
- std::string error_message;
- bool unread_char;
- char char_to_unread;
- size_t inline_image_bytes;
-
- // State for strings
- int string_depth;
- bool string_ignoring_newline;
- char bs_num_register[4];
- bool last_char_was_bs;
- bool last_char_was_cr;
- };
- std::shared_ptr<Members> m;
+ // State for strings
+ int string_depth;
+ int char_code;
+ char hex_char;
+ int digit_count;
};
#endif // QPDFTOKENIZER_HH
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index 32aeae1f..41b89da4 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -25,6 +25,7 @@
#include <qpdf/DLL.h>
#include <qpdf/PointerHolder.hh>
#include <qpdf/Types.h>
+#include <cstring>
#include <functional>
#include <list>
#include <memory>
@@ -489,16 +490,16 @@ namespace QUtil
// classes without using ctype, which we avoid because of locale
// considerations.
QPDF_DLL
- bool is_hex_digit(char);
+ inline bool is_hex_digit(char);
QPDF_DLL
- bool is_space(char);
+ inline bool is_space(char);
QPDF_DLL
- bool is_digit(char);
+ inline bool is_digit(char);
QPDF_DLL
- bool is_number(char const*);
+ inline bool is_number(char const*);
// This method parses the numeric range syntax used by the qpdf
// command-line tool. May throw std::runtime_error.
@@ -526,4 +527,50 @@ namespace QUtil
#endif // QPDF_NO_WCHAR_T
}; // namespace QUtil
+inline bool
+QUtil::is_hex_digit(char ch)
+{
+ return (ch && (strchr("0123456789abcdefABCDEF", ch) != nullptr));
+}
+
+inline bool
+QUtil::is_space(char ch)
+{
+ return (ch && (strchr(" \f\n\r\t\v", ch) != nullptr));
+}
+
+inline bool
+QUtil::is_digit(char ch)
+{
+ return ((ch >= '0') && (ch <= '9'));
+}
+
+inline bool
+QUtil::is_number(char const* p)
+{
+ // ^[\+\-]?(\.\d*|\d+(\.\d*)?)$
+ if (!*p) {
+ return false;
+ }
+ if ((*p == '-') || (*p == '+')) {
+ ++p;
+ }
+ bool found_dot = false;
+ bool found_digit = false;
+ for (; *p; ++p) {
+ if (*p == '.') {
+ if (found_dot) {
+ // only one dot
+ return false;
+ }
+ found_dot = true;
+ } else if (QUtil::is_digit(*p)) {
+ found_digit = true;
+ } else {
+ return false;
+ }
+ }
+ return found_digit;
+}
+
#endif // QUTIL_HH
diff --git a/libqpdf/BufferInputSource.cc b/libqpdf/BufferInputSource.cc
index 5b59c801..6402f639 100644
--- a/libqpdf/BufferInputSource.cc
+++ b/libqpdf/BufferInputSource.cc
@@ -7,8 +7,8 @@
#include <stdexcept>
#include <string.h>
-BufferInputSource::Members::Members(
- bool own_memory, std::string const& description, Buffer* buf) :
+BufferInputSource::BufferInputSource(
+ std::string const& description, Buffer* buf, bool own_memory) :
own_memory(own_memory),
description(description),
buf(buf),
@@ -18,60 +18,54 @@ BufferInputSource::Members::Members(
}
BufferInputSource::BufferInputSource(
- std::string const& description, Buffer* buf, bool own_memory) :
- m(new Members(own_memory, description, buf))
-{
-}
-
-BufferInputSource::BufferInputSource(
std::string const& description, std::string const& contents) :
- m(new Members(true, description, nullptr))
+ own_memory(true),
+ description(description),
+ buf(new Buffer(contents.length())),
+ cur_offset(0),
+ max_offset(QIntC::to_offset(buf->getSize()))
{
- this->m->buf = new Buffer(contents.length());
- this->m->max_offset = QIntC::to_offset(this->m->buf->getSize());
- unsigned char* bp = this->m->buf->getBuffer();
- memcpy(bp, contents.c_str(), contents.length());
+ memcpy(buf->getBuffer(), contents.c_str(), contents.length());
}
BufferInputSource::~BufferInputSource()
{
- if (this->m->own_memory) {
- delete this->m->buf;
+ if (this->own_memory) {
+ delete this->buf;
}
}
qpdf_offset_t
BufferInputSource::findAndSkipNextEOL()
{
- if (this->m->cur_offset < 0) {
+ if (this->cur_offset < 0) {
throw std::logic_error("INTERNAL ERROR: BufferInputSource offset < 0");
}
- qpdf_offset_t end_pos = this->m->max_offset;
- if (this->m->cur_offset >= end_pos) {
+ qpdf_offset_t end_pos = this->max_offset;
+ if (this->cur_offset >= end_pos) {
this->last_offset = end_pos;
- this->m->cur_offset = end_pos;
+ this->cur_offset = end_pos;
return end_pos;
}
qpdf_offset_t result = 0;
- unsigned char const* buffer = this->m->buf->getBuffer();
+ unsigned char const* buffer = this->buf->getBuffer();
unsigned char const* end = buffer + end_pos;
- unsigned char const* p = buffer + this->m->cur_offset;
+ unsigned char const* p = buffer + this->cur_offset;
while ((p < end) && !((*p == '\r') || (*p == '\n'))) {
++p;
}
if (p < end) {
result = p - buffer;
- this->m->cur_offset = result + 1;
+ this->cur_offset = result + 1;
++p;
- while ((this->m->cur_offset < end_pos) &&
- ((*p == '\r') || (*p == '\n'))) {
+ while ((this->cur_offset < end_pos) && ((*p == '\r') || (*p == '\n'))) {
++p;
- ++this->m->cur_offset;
+ ++this->cur_offset;
}
} else {
- this->m->cur_offset = end_pos;
+ this->cur_offset = end_pos;
result = end_pos;
}
return result;
@@ -80,13 +74,13 @@ BufferInputSource::findAndSkipNextEOL()
std::string const&
BufferInputSource::getName() const
{
- return this->m->description;
+ return this->description;
}
qpdf_offset_t
BufferInputSource::tell()
{
- return this->m->cur_offset;
+ return this->cur_offset;
}
void
@@ -94,17 +88,17 @@ BufferInputSource::seek(qpdf_offset_t offset, int whence)
{
switch (whence) {
case SEEK_SET:
- this->m->cur_offset = offset;
+ this->cur_offset = offset;
break;
case SEEK_END:
- QIntC::range_check(this->m->max_offset, offset);
- this->m->cur_offset = this->m->max_offset + offset;
+ QIntC::range_check(this->max_offset, offset);
+ this->cur_offset = this->max_offset + offset;
break;
case SEEK_CUR:
- QIntC::range_check(this->m->cur_offset, offset);
- this->m->cur_offset += offset;
+ QIntC::range_check(this->cur_offset, offset);
+ this->cur_offset += offset;
break;
default:
@@ -113,42 +107,41 @@ BufferInputSource::seek(qpdf_offset_t offset, int whence)
break;
}
- if (this->m->cur_offset < 0) {
+ if (this->cur_offset < 0) {
throw std::runtime_error(
- this->m->description + ": seek before beginning of buffer");
+ this->description + ": seek before beginning of buffer");
}
}
void
BufferInputSource::rewind()
{
- this->m->cur_offset = 0;
+ this->cur_offset = 0;
}
size_t
BufferInputSource::read(char* buffer, size_t length)
{
- if (this->m->cur_offset < 0) {
+ if (this->cur_offset < 0) {
throw std::logic_error("INTERNAL ERROR: BufferInputSource offset < 0");
}
- qpdf_offset_t end_pos = this->m->max_offset;
- if (this->m->cur_offset >= end_pos) {
+ qpdf_offset_t end_pos = this->max_offset;
+ if (this->cur_offset >= end_pos) {
this->last_offset = end_pos;
return 0;
}
- this->last_offset = this->m->cur_offset;
- size_t len =
- std::min(QIntC::to_size(end_pos - this->m->cur_offset), length);
- memcpy(buffer, this->m->buf->getBuffer() + this->m->cur_offset, len);
- this->m->cur_offset += QIntC::to_offset(len);
+ this->last_offset = this->cur_offset;
+ size_t len = std::min(QIntC::to_size(end_pos - this->cur_offset), length);
+ memcpy(buffer, this->buf->getBuffer() + this->cur_offset, len);
+ this->cur_offset += QIntC::to_offset(len);
return len;
}
void
BufferInputSource::unreadCh(char ch)
{
- if (this->m->cur_offset > 0) {
- --this->m->cur_offset;
+ if (this->cur_offset > 0) {
+ --this->cur_offset;
}
}
diff --git a/libqpdf/ClosedFileInputSource.cc b/libqpdf/ClosedFileInputSource.cc
index ec977c69..06ebb156 100644
--- a/libqpdf/ClosedFileInputSource.cc
+++ b/libqpdf/ClosedFileInputSource.cc
@@ -2,18 +2,13 @@
#include <qpdf/FileInputSource.hh>
-ClosedFileInputSource::Members::Members(char const* filename) :
+ClosedFileInputSource::ClosedFileInputSource(char const* filename) :
filename(filename),
offset(0),
stay_open(false)
{
}
-ClosedFileInputSource::ClosedFileInputSource(char const* filename) :
- m(new Members(filename))
-{
-}
-
ClosedFileInputSource::~ClosedFileInputSource()
{
// Must be explicit and not inline -- see QPDF_DLL_CLASS in
@@ -23,30 +18,29 @@ ClosedFileInputSource::~ClosedFileInputSource()
void
ClosedFileInputSource::before()
{
- if (nullptr == this->m->fis) {
- this->m->fis =
- std::make_shared<FileInputSource>(this->m->filename.c_str());
- this->m->fis->seek(this->m->offset, SEEK_SET);
- this->m->fis->setLastOffset(this->last_offset);
+ if (nullptr == this->fis) {
+ this->fis = std::make_shared<FileInputSource>(this->filename.c_str());
+ this->fis->seek(this->offset, SEEK_SET);
+ this->fis->setLastOffset(this->last_offset);
}
}
void
ClosedFileInputSource::after()
{
- this->last_offset = this->m->fis->getLastOffset();
- this->m->offset = this->m->fis->tell();
- if (this->m->stay_open) {
+ this->last_offset = this->fis->getLastOffset();
+ this->offset = this->fis->tell();
+ if (this->stay_open) {
return;
}
- this->m->fis = nullptr;
+ this->fis = nullptr;
}
qpdf_offset_t
ClosedFileInputSource::findAndSkipNextEOL()
{
before();
- qpdf_offset_t r = this->m->fis->findAndSkipNextEOL();
+ qpdf_offset_t r = this->fis->findAndSkipNextEOL();
after();
return r;
}
@@ -54,14 +48,14 @@ ClosedFileInputSource::findAndSkipNextEOL()
std::string const&
ClosedFileInputSource::getName() const
{
- return this->m->filename;
+ return this->filename;
}
qpdf_offset_t
ClosedFileInputSource::tell()
{
before();
- qpdf_offset_t r = this->m->fis->tell();
+ qpdf_offset_t r = this->fis->tell();
after();
return r;
}
@@ -70,16 +64,16 @@ void
ClosedFileInputSource::seek(qpdf_offset_t offset, int whence)
{
before();
- this->m->fis->seek(offset, whence);
+ this->fis->seek(offset, whence);
after();
}
void
ClosedFileInputSource::rewind()
{
- this->m->offset = 0;
- if (this->m->fis.get()) {
- this->m->fis->rewind();
+ this->offset = 0;
+ if (this->fis.get()) {
+ this->fis->rewind();
}
}
@@ -87,7 +81,7 @@ size_t
ClosedFileInputSource::read(char* buffer, size_t length)
{
before();
- size_t r = this->m->fis->read(buffer, length);
+ size_t r = this->fis->read(buffer, length);
after();
return r;
}
@@ -96,7 +90,7 @@ void
ClosedFileInputSource::unreadCh(char ch)
{
before();
- this->m->fis->unreadCh(ch);
+ this->fis->unreadCh(ch);
// Don't call after -- the file has to stay open after this
// operation.
}
@@ -104,8 +98,8 @@ ClosedFileInputSource::unreadCh(char ch)
void
ClosedFileInputSource::stayOpen(bool val)
{
- this->m->stay_open = val;
- if ((!val) && this->m->fis.get()) {
+ this->stay_open = val;
+ if ((!val) && this->fis.get()) {
after();
}
}
diff --git a/libqpdf/FileInputSource.cc b/libqpdf/FileInputSource.cc
index ab88d302..2b1ee1ab 100644
--- a/libqpdf/FileInputSource.cc
+++ b/libqpdf/FileInputSource.cc
@@ -5,60 +5,52 @@
#include <algorithm>
#include <string.h>
-FileInputSource::Members::Members(bool close_file) :
- close_file(close_file),
- file(nullptr)
-{
-}
-
-FileInputSource::Members::~Members()
-{
- if (this->file && this->close_file) {
- fclose(this->file);
- }
-}
-
FileInputSource::FileInputSource() :
- m(new Members(false))
+ close_file(false),
+ file(nullptr)
{
}
FileInputSource::FileInputSource(char const* filename) :
- m(new Members(false))
+ close_file(true),
+ filename(filename),
+ file(QUtil::safe_fopen(filename, "rb"))
{
- setFilename(filename);
}
FileInputSource::FileInputSource(
char const* description, FILE* filep, bool close_file) :
- m(new Members(false))
+ close_file(close_file),
+ filename(description),
+ file(filep)
+{
+}
+
+FileInputSource::~FileInputSource()
{
- setFile(description, filep, close_file);
+ // Must be explicit and not inline -- see QPDF_DLL_CLASS in
+ // README-maintainer
+ if (this->file && this->close_file) {
+ fclose(this->file);
+ }
}
void
FileInputSource::setFilename(char const* filename)
{
- this->m = std::shared_ptr<Members>(new Members(true));
- this->m->filename = filename;
- this->m->file = QUtil::safe_fopen(filename, "rb");
+ this->close_file = true;
+ this->filename = filename;
+ this->file = QUtil::safe_fopen(filename, "rb");
}
void
FileInputSource::setFile(char const* description, FILE* filep, bool close_file)
{
- this->m = std::shared_ptr<Members>(new Members(close_file));
- this->m->filename = description;
- this->m->file = filep;
+ this->filename = description;
+ this->file = filep;
this->seek(0, SEEK_SET);
}
-FileInputSource::~FileInputSource()
-{
- // Must be explicit and not inline -- see QPDF_DLL_CLASS in
- // README-maintainer
-}
-
qpdf_offset_t
FileInputSource::findAndSkipNextEOL()
{
@@ -66,7 +58,7 @@ FileInputSource::findAndSkipNextEOL()
bool done = false;
char buf[10240];
while (!done) {
- qpdf_offset_t cur_offset = QUtil::tell(this->m->file);
+ qpdf_offset_t cur_offset = QUtil::tell(this->file);
size_t len = this->read(buf, sizeof(buf));
if (len == 0) {
done = true;
@@ -98,41 +90,42 @@ FileInputSource::findAndSkipNextEOL()
std::string const&
FileInputSource::getName() const
{
- return this->m->filename;
+ return this->filename;
}
qpdf_offset_t
FileInputSource::tell()
{
- return QUtil::tell(this->m->file);
+ return QUtil::tell(this->file);
}
void
FileInputSource::seek(qpdf_offset_t offset, int whence)
{
- QUtil::os_wrapper(
- (std::string("seek to ") + this->m->filename + ", offset " +
- QUtil::int_to_string(offset) + " (" + QUtil::int_to_string(whence) +
- ")"),
- QUtil::seek(this->m->file, offset, whence));
+ if (QUtil::seek(this->file, offset, whence) == -1) {
+ QUtil::throw_system_error(
+ std::string("seek to ") + this->filename + ", offset " +
+ QUtil::int_to_string(offset) + " (" + QUtil::int_to_string(whence) +
+ ")");
+ }
}
void
FileInputSource::rewind()
{
- ::rewind(this->m->file);
+ ::rewind(this->file);
}
size_t
FileInputSource::read(char* buffer, size_t length)
{
- this->last_offset = this->tell();
- size_t len = fread(buffer, 1, length, this->m->file);
+ this->last_offset = QUtil::tell(this->file);
+ size_t len = fread(buffer, 1, length, this->file);
if (len == 0) {
- if (ferror(this->m->file)) {
+ if (ferror(this->file)) {
throw QPDFExc(
qpdf_e_system,
- this->m->filename,
+ this->filename,
"",
this->last_offset,
(std::string("read ") + QUtil::uint_to_string(length) +
@@ -148,7 +141,7 @@ FileInputSource::read(char* buffer, size_t length)
void
FileInputSource::unreadCh(char ch)
{
- QUtil::os_wrapper(
- this->m->filename + ": unread character",
- ungetc(static_cast<unsigned char>(ch), this->m->file));
+ if (ungetc(static_cast<unsigned char>(ch), this->file) == -1) {
+ QUtil::throw_system_error(this->filename + ": unread character");
+ }
}
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 1726e1b9..cd8f932d 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -73,28 +73,20 @@ QPDFWordTokenFinder::check()
return true;
}
-QPDFTokenizer::Members::Members() :
- allow_eof(false),
- include_ignorable(false)
-{
- reset();
-}
-
void
-QPDFTokenizer::Members::reset()
+QPDFTokenizer::reset()
{
- state = st_top;
+ state = st_before_token;
type = tt_bad;
- val = "";
- raw_val = "";
+ val.clear();
+ raw_val.clear();
error_message = "";
- unread_char = false;
+ before_token = true;
+ in_token = false;
char_to_unread = '\0';
inline_image_bytes = 0;
string_depth = 0;
- string_ignoring_newline = false;
- last_char_was_bs = false;
- last_char_was_cr = false;
+ bad = false;
}
QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
@@ -110,20 +102,22 @@ QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
}
QPDFTokenizer::QPDFTokenizer() :
- m(new Members())
+ allow_eof(false),
+ include_ignorable(false)
{
+ reset();
}
void
QPDFTokenizer::allowEOF()
{
- this->m->allow_eof = true;
+ this->allow_eof = true;
}
void
QPDFTokenizer::includeIgnorable()
{
- this->m->include_ignorable = true;
+ this->include_ignorable = true;
}
bool
@@ -139,376 +133,719 @@ QPDFTokenizer::isDelimiter(char ch)
}
void
-QPDFTokenizer::resolveLiteral()
-{
- if ((this->m->val.length() > 0) && (this->m->val.at(0) == '/')) {
- this->m->type = tt_name;
- // Deal with # in name token. Note: '/' by itself is a
- // valid name, so don't strip leading /. That way we
- // don't have to deal with the empty string as a name.
- std::string nval = "/";
- size_t len = this->m->val.length();
- for (size_t i = 1; i < len; ++i) {
- char ch = this->m->val.at(i);
- if (ch == '#') {
- if ((i + 2 < len) &&
- QUtil::is_hex_digit(this->m->val.at(i + 1)) &&
- QUtil::is_hex_digit(this->m->val.at(i + 2))) {
- char num[3];
- num[0] = this->m->val.at(i + 1);
- num[1] = this->m->val.at(i + 2);
- num[2] = '\0';
- char ch2 = static_cast<char>(strtol(num, nullptr, 16));
- if (ch2 == '\0') {
- this->m->type = tt_bad;
- QTC::TC("qpdf", "QPDFTokenizer null in name");
- this->m->error_message =
- "null character not allowed in name token";
- nval += "#00";
- } else {
- nval.append(1, ch2);
- }
- i += 2;
- } else {
- QTC::TC("qpdf", "QPDFTokenizer bad name");
- this->m->error_message =
- "name with stray # will not work with PDF >= 1.2";
- // Use null to encode a bad # -- this is reversed
- // in QPDF_Name::normalizeName.
- nval += '\0';
- }
- } else {
- nval.append(1, ch);
- }
- }
- this->m->val = nval;
- } else if (QUtil::is_number(this->m->val.c_str())) {
- if (this->m->val.find('.') != std::string::npos) {
- this->m->type = tt_real;
- } else {
- this->m->type = tt_integer;
- }
- } else if ((this->m->val == "true") || (this->m->val == "false")) {
- this->m->type = tt_bool;
- } else if (this->m->val == "null") {
- this->m->type = tt_null;
- } else {
- // I don't really know what it is, so leave it as tt_word.
- // Lots of cases ($, #, etc.) other than actual words fall
- // into this category, but that's okay at least for now.
- this->m->type = tt_word;
+QPDFTokenizer::presentCharacter(char ch)
+{
+ handleCharacter(ch);
+
+ if (this->in_token) {
+ this->raw_val += ch;
}
}
void
-QPDFTokenizer::presentCharacter(char ch)
+QPDFTokenizer::handleCharacter(char ch)
{
- if (this->m->state == st_token_ready) {
+ // State machine is implemented such that the final character may not be
+ // handled. This happens whenever you have to use a character from the
+ // next token to detect the end of the current token.
+
+ switch (this->state) {
+ case st_top:
+ inTop(ch);
+ return;
+
+ case st_in_space:
+ inSpace(ch);
+ return;
+
+ case st_in_comment:
+ inComment(ch);
+ return;
+
+ case st_lt:
+ inLt(ch);
+ return;
+
+ case st_gt:
+ inGt(ch);
+ return;
+
+ case st_in_string:
+ inString(ch);
+ return;
+
+ case st_name:
+ inName(ch);
+ return;
+
+ case st_number:
+ inNumber(ch);
+ return;
+
+ case st_real:
+ inReal(ch);
+ return;
+
+ case st_string_after_cr:
+ inStringAfterCR(ch);
+ return;
+
+ case st_string_escape:
+ inStringEscape(ch);
+ return;
+
+ case st_char_code:
+ inCharCode(ch);
+ return;
+
+ case st_literal:
+ inLiteral(ch);
+ return;
+
+ case st_inline_image:
+ inInlineImage(ch);
+ return;
+
+ case st_in_hexstring:
+ inHexstring(ch);
+ return;
+
+ case st_in_hexstring_2nd:
+ inHexstring2nd(ch);
+ return;
+
+ case st_name_hex1:
+ inNameHex1(ch);
+ return;
+
+ case st_name_hex2:
+ inNameHex2(ch);
+ return;
+
+ case st_sign:
+ inSign(ch);
+ return;
+
+ case st_decimal:
+ inDecimal(ch);
+ return;
+
+ case (st_before_token):
+ inBeforeToken(ch);
+ return;
+
+ case (st_token_ready):
+ inTokenReady(ch);
+ return;
+
+ default:
throw std::logic_error(
- "INTERNAL ERROR: QPDF tokenizer presented character "
- "while token is waiting");
+ "INTERNAL ERROR: invalid state while reading token");
}
+}
- char orig_ch = ch;
-
- // State machine is implemented such that some characters may be
- // handled more than once. This happens whenever you have to use
- // the character that caused a state change in the new state.
+void
+QPDFTokenizer::inTokenReady(char ch)
+{
+ throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character "
+ "while token is waiting");
+}
- bool handled = true;
- if (this->m->state == st_top) {
- // Note: we specifically do not use ctype here. It is
- // locale-dependent.
- if (isSpace(ch)) {
- if (this->m->include_ignorable) {
- this->m->state = st_in_space;
- this->m->val += ch;
- }
- } else if (ch == '%') {
- this->m->state = st_in_comment;
- if (this->m->include_ignorable) {
- this->m->val += ch;
- }
- } else if (ch == '(') {
- this->m->string_depth = 1;
- this->m->string_ignoring_newline = false;
- memset(
- this->m->bs_num_register,
- '\0',
- sizeof(this->m->bs_num_register));
- this->m->last_char_was_bs = false;
- this->m->last_char_was_cr = false;
- this->m->state = st_in_string;
- } else if (ch == '<') {
- this->m->state = st_lt;
- } else if (ch == '>') {
- this->m->state = st_gt;
- } else {
- this->m->val += ch;
- if (ch == ')') {
- this->m->type = tt_bad;
- QTC::TC("qpdf", "QPDFTokenizer bad )");
- this->m->error_message = "unexpected )";
- this->m->state = st_token_ready;
- } else if (ch == '[') {
- this->m->type = tt_array_open;
- this->m->state = st_token_ready;
- } else if (ch == ']') {
- this->m->type = tt_array_close;
- this->m->state = st_token_ready;
- } else if (ch == '{') {
- this->m->type = tt_brace_open;
- this->m->state = st_token_ready;
- } else if (ch == '}') {
- this->m->type = tt_brace_close;
- this->m->state = st_token_ready;
- } else {
- this->m->state = st_literal;
- }
- }
- } else if (this->m->state == st_in_space) {
- // We only enter this state if include_ignorable is true.
- if (!isSpace(ch)) {
- this->m->type = tt_space;
- this->m->unread_char = true;
- this->m->char_to_unread = ch;
- this->m->state = st_token_ready;
- } else {
- this->m->val += ch;
- }
- } else if (this->m->state == st_in_comment) {
- if ((ch == '\r') || (ch == '\n')) {
- if (this->m->include_ignorable) {
- this->m->type = tt_comment;
- this->m->unread_char = true;
- this->m->char_to_unread = ch;
- this->m->state = st_token_ready;
- } else {
- this->m->state = st_top;
- }
- } else if (this->m->include_ignorable) {
- this->m->val += ch;
- }
- } else if (this->m->state == st_lt) {
- if (ch == '<') {
- this->m->val = "<<";
- this->m->type = tt_dict_open;
- this->m->state = st_token_ready;
- } else {
- handled = false;
- this->m->state = st_in_hexstring;
- }
- } else if (this->m->state == st_gt) {
- if (ch == '>') {
- this->m->val = ">>";
- this->m->type = tt_dict_close;
- this->m->state = st_token_ready;
- } else {
- this->m->val = ">";
- this->m->type = tt_bad;
- QTC::TC("qpdf", "QPDFTokenizer bad >");
- this->m->error_message = "unexpected >";
- this->m->unread_char = true;
- this->m->char_to_unread = ch;
- this->m->state = st_token_ready;
+void
+QPDFTokenizer::inBeforeToken(char ch)
+{
+ // Note: we specifically do not use ctype here. It is
+ // locale-dependent.
+ if (isSpace(ch)) {
+ this->before_token = !this->include_ignorable;
+ this->in_token = this->include_ignorable;
+ if (this->include_ignorable) {
+ this->state = st_in_space;
+ this->val += ch;
}
- } else if (this->m->state == st_in_string) {
- if (this->m->string_ignoring_newline && (ch != '\n')) {
- this->m->string_ignoring_newline = false;
+ } else if (ch == '%') {
+ this->before_token = !this->include_ignorable;
+ this->in_token = this->include_ignorable;
+ this->state = st_in_comment;
+ if (this->include_ignorable) {
+ this->val += ch;
}
+ } else {
+ this->before_token = false;
+ this->in_token = true;
+ inTop(ch);
+ }
+}
- size_t bs_num_count = strlen(this->m->bs_num_register);
- bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
- if ((bs_num_count == 3) || ((bs_num_count > 0) && (!ch_is_octal))) {
- // We've accumulated \ddd. PDF Spec says to ignore
- // high-order overflow.
- this->m->val +=
- static_cast<char>(strtol(this->m->bs_num_register, nullptr, 8));
- memset(
- this->m->bs_num_register,
- '\0',
- sizeof(this->m->bs_num_register));
- bs_num_count = 0;
- }
+void
+QPDFTokenizer::inTop(char ch)
+{
+ switch (ch) {
+ case '(':
+ this->string_depth = 1;
+ this->state = st_in_string;
+ return;
- if (this->m->string_ignoring_newline && (ch == '\n')) {
- // ignore
- this->m->string_ignoring_newline = false;
- } else if (
- ch_is_octal && (this->m->last_char_was_bs || (bs_num_count > 0))) {
- this->m->bs_num_register[bs_num_count++] = ch;
- } else if (this->m->last_char_was_bs) {
- switch (ch) {
- case 'n':
- this->m->val += '\n';
- break;
+ case '<':
+ this->state = st_lt;
+ return;
- case 'r':
- this->m->val += '\r';
- break;
+ case '>':
+ this->state = st_gt;
+ return;
- case 't':
- this->m->val += '\t';
- break;
+ case (')'):
+ this->type = tt_bad;
+ QTC::TC("qpdf", "QPDFTokenizer bad )");
+ this->error_message = "unexpected )";
+ this->val += ch;
+ this->state = st_token_ready;
+ return;
- case 'b':
- this->m->val += '\b';
- break;
+ case '[':
+ this->type = tt_array_open;
+ this->state = st_token_ready;
+ this->val += ch;
+ return;
- case 'f':
- this->m->val += '\f';
- break;
+ case ']':
+ this->type = tt_array_close;
+ this->val += ch;
+ this->state = st_token_ready;
+ return;
- case '\n':
- break;
+ case '{':
+ this->type = tt_brace_open;
+ this->state = st_token_ready;
+ this->val += ch;
+ return;
- case '\r':
- this->m->string_ignoring_newline = true;
- break;
+ case '}':
+ this->type = tt_brace_close;
+ this->state = st_token_ready;
+ this->val += ch;
+ return;
- default:
- // PDF spec says backslash is ignored before anything else
- this->m->val += ch;
- break;
- }
- } else if (ch == '\\') {
- // last_char_was_bs is set/cleared below as appropriate
- if (bs_num_count) {
- throw std::logic_error(
- "INTERNAL ERROR: QPDFTokenizer: bs_num_count != 0 "
- "when ch == '\\'");
- }
- } else if (ch == '(') {
- this->m->val += ch;
- ++this->m->string_depth;
- } else if ((ch == ')') && (--this->m->string_depth == 0)) {
- this->m->type = tt_string;
- this->m->state = st_token_ready;
- } else if (ch == '\r') {
- // CR by itself is converted to LF
- this->m->val += '\n';
- } else if (ch == '\n') {
- // CR LF is converted to LF
- if (!this->m->last_char_was_cr) {
- this->m->val += ch;
- }
- } else {
- this->m->val += ch;
- }
+ case '/':
+ this->state = st_name;
+ this->val += ch;
+ return;
- this->m->last_char_was_cr =
- ((!this->m->string_ignoring_newline) && (ch == '\r'));
- this->m->last_char_was_bs =
- ((!this->m->last_char_was_bs) && (ch == '\\'));
- } else if (this->m->state == st_literal) {
- if (isDelimiter(ch)) {
- // A C-locale whitespace character or delimiter terminates
- // token. It is important to unread the whitespace
- // character even though it is ignored since it may be the
- // newline after a stream keyword. Removing it here could
- // make the stream-reading code break on some files,
- // though not on any files in the test suite as of this
- // writing.
-
- this->m->type = tt_word;
- this->m->unread_char = true;
- this->m->char_to_unread = ch;
- this->m->state = st_token_ready;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ this->state = st_number;
+ this->val += ch;
+ return;
+
+ case '+':
+ case '-':
+ this->state = st_sign;
+ this->val += ch;
+ return;
+
+ case '.':
+ this->state = st_decimal;
+ this->val += ch;
+ return;
+
+ default:
+ this->state = st_literal;
+ this->val += ch;
+ return;
+ }
+}
+
+void
+QPDFTokenizer::inSpace(char ch)
+{
+    // This state is only entered when include_ignorable is true, so the
+    // whitespace run collected in val is always reported as a token.
+    if (isSpace(ch)) {
+        this->val += ch;
+        return;
+    }
+
+    // The first non-space character ends the run. It is not part of this
+    // token, so record it as the character to unread.
+    this->state = st_token_ready;
+    this->type = tt_space;
+    this->char_to_unread = ch;
+    this->in_token = false;
+}
+
+void
+QPDFTokenizer::inComment(char ch)
+{
+ if ((ch == '\r') || (ch == '\n')) {
+ if (this->include_ignorable) {
+ this->type = tt_comment;
+ this->in_token = false;
+ this->char_to_unread = ch;
+ this->state = st_token_ready;
} else {
- this->m->val += ch;
+ this->state = st_before_token;
}
- } else if (this->m->state == st_inline_image) {
- this->m->val += ch;
- size_t len = this->m->val.length();
- if (len == this->m->inline_image_bytes) {
- QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
- this->m->type = tt_inline_image;
- this->m->inline_image_bytes = 0;
- this->m->state = st_token_ready;
+ } else if (this->include_ignorable) {
+ this->val += ch;
+ }
+}
+
+void
+QPDFTokenizer::inString(char ch)
+{
+ switch (ch) {
+ case '\\':
+ this->state = st_string_escape;
+ return;
+
+ case '(':
+ this->val += ch;
+ ++this->string_depth;
+ return;
+
+ case ')':
+ if (--this->string_depth == 0) {
+ this->type = tt_string;
+ this->state = st_token_ready;
+ return;
}
+
+ this->val += ch;
+ return;
+
+ case '\r':
+ // CR by itself is converted to LF
+ this->val += '\n';
+ this->state = st_string_after_cr;
+ return;
+
+ case '\n':
+ this->val += ch;
+ return;
+
+ default:
+ this->val += ch;
+ return;
+ }
+}
+
+void
+QPDFTokenizer::inName(char ch)
+{
+ if (isDelimiter(ch)) {
+ // A C-locale whitespace character or delimiter terminates
+ // token. It is important to unread the whitespace
+ // character even though it is ignored since it may be the
+ // newline after a stream keyword. Removing it here could
+ // make the stream-reading code break on some files,
+ // though not on any files in the test suite as of this
+ // writing.
+
+ this->type = this->bad ? tt_bad : tt_name;
+ this->in_token = false;
+ this->char_to_unread = ch;
+ this->state = st_token_ready;
+ } else if (ch == '#') {
+ this->char_code = 0;
+ this->state = st_name_hex1;
} else {
- handled = false;
- }
-
- if (handled) {
- // okay
- } else if (this->m->state == st_in_hexstring) {
- if (ch == '>') {
- this->m->type = tt_string;
- this->m->state = st_token_ready;
- if (this->m->val.length() % 2) {
- // PDF spec says odd hexstrings have implicit
- // trailing 0.
- this->m->val += '0';
- }
- char num[3];
- num[2] = '\0';
- std::string nval;
- for (unsigned int i = 0; i < this->m->val.length(); i += 2) {
- num[0] = this->m->val.at(i);
- num[1] = this->m->val.at(i + 1);
- char nch = static_cast<char>(strtol(num, nullptr, 16));
- nval += nch;
- }
- this->m->val = nval;
- } else if (QUtil::is_hex_digit(ch)) {
- this->m->val += ch;
- } else if (isSpace(ch)) {
- // ignore
- } else {
- this->m->type = tt_bad;
- QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
- this->m->error_message =
- std::string("invalid character (") + ch + ") in hexstring";
- this->m->state = st_token_ready;
- }
+ this->val += ch;
+ }
+}
+
+void
+QPDFTokenizer::inNameHex1(char ch)
+{
+ this->hex_char = ch;
+
+ if ('0' <= ch && ch <= '9') {
+ this->char_code = 16 * (int(ch) - int('0'));
+ this->state = st_name_hex2;
+
+ } else if ('A' <= ch && ch <= 'F') {
+ this->char_code = 16 * (10 + int(ch) - int('A'));
+ this->state = st_name_hex2;
+
+ } else if ('a' <= ch && ch <= 'f') {
+ this->char_code = 16 * (10 + int(ch) - int('a'));
+ this->state = st_name_hex2;
+
} else {
- throw std::logic_error(
- "INTERNAL ERROR: invalid state while reading token");
+ QTC::TC("qpdf", "QPDFTokenizer bad name 1");
+ this->error_message = "name with stray # will not work with PDF >= 1.2";
+ // Use null to encode a bad # -- this is reversed
+ // in QPDF_Name::normalizeName.
+ this->val += '\0';
+ this->state = st_name;
+ inName(ch);
+ }
+}
+
+void
+QPDFTokenizer::inNameHex2(char ch)
+{
+    // Handles the second character following '#' in a name. The first
+    // one has already been folded into char_code (as the high nibble)
+    // and remembered verbatim in hex_char.
+    int low_nibble = -1;
+    if ('0' <= ch && ch <= '9') {
+        low_nibble = int(ch) - int('0');
+    } else if ('A' <= ch && ch <= 'F') {
+        low_nibble = 10 + int(ch) - int('A');
+    } else if ('a' <= ch && ch <= 'f') {
+        low_nibble = 10 + int(ch) - int('a');
+    }
+
+    if (low_nibble < 0) {
+        QTC::TC("qpdf", "QPDFTokenizer bad name 2");
+        this->error_message = "name with stray # will not work with PDF >= 1.2";
+        // Use null to encode a bad # -- this is reversed
+        // in QPDF_Name::normalizeName.
+        this->val += '\0';
+        // The first character was a valid hex digit but is plain name
+        // content after all; put it back before handling ch as a
+        // regular name character.
+        this->val += this->hex_char;
+        this->state = st_name;
+        inName(ch);
+        return;
+    }
+
+    this->char_code += low_nibble;
+    this->state = st_name;
+    if (this->char_code != 0) {
+        this->val += char(this->char_code);
+        return;
+    }
+
+    // "#00" decodes to a null character: keep the escape textually and
+    // flag the name as bad.
+    QTC::TC("qpdf", "QPDFTokenizer null in name");
+    this->error_message = "null character not allowed in name token";
+    this->val += "#00";
+    this->bad = true;
+}
+
+void
+QPDFTokenizer::inSign(char ch)
+{
+ if (QUtil::is_digit(ch)) {
+ this->state = st_number;
+ this->val += ch;
+ } else if (ch == '.') {
+ this->state = st_decimal;
+ this->val += ch;
+ } else {
+ this->state = st_literal;
+ inLiteral(ch);
}
+}
- if ((this->m->state == st_token_ready) && (this->m->type == tt_word)) {
- resolveLiteral();
+void
+QPDFTokenizer::inDecimal(char ch)
+{
+ if (QUtil::is_digit(ch)) {
+ this->state = st_real;
+ this->val += ch;
+ } else {
+ this->state = st_literal;
+ inLiteral(ch);
}
+}
+
+void
+QPDFTokenizer::inNumber(char ch)
+{
+    // Integer in progress: more digits extend it, a '.' switches to the
+    // real-number state, a delimiter completes it, and anything else
+    // demotes the token to a literal.
+    if (QUtil::is_digit(ch)) {
+        this->val += ch;
+        return;
+    }
+    if (ch == '.') {
+        this->val += ch;
+        this->state = st_real;
+        return;
+    }
+    if (!isDelimiter(ch)) {
+        this->val += ch;
+        this->state = st_literal;
+        return;
+    }
+
+    // The delimiter is not part of the integer; record it for unreading.
+    this->char_to_unread = ch;
+    this->in_token = false;
+    this->type = tt_integer;
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inReal(char ch)
+{
+    // Real number in progress (a '.' has been seen): digits extend it, a
+    // delimiter completes it, and anything else (including a second
+    // '.') demotes the token to a literal.
+    if (QUtil::is_digit(ch)) {
+        this->val += ch;
+        return;
+    }
+    if (!isDelimiter(ch)) {
+        this->val += ch;
+        this->state = st_literal;
+        return;
+    }
+
+    // The delimiter is not part of the number; record it for unreading.
+    this->char_to_unread = ch;
+    this->in_token = false;
+    this->type = tt_real;
+    this->state = st_token_ready;
+}
+void
+QPDFTokenizer::inStringEscape(char ch)
+{
+ this->state = st_in_string;
+ switch (ch) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ this->state = st_char_code;
+ this->char_code = 0;
+ this->digit_count = 0;
+ inCharCode(ch);
+ return;
+
+ case 'n':
+ this->val += '\n';
+ return;
+
+ case 'r':
+ this->val += '\r';
+ return;
+
+ case 't':
+ this->val += '\t';
+ return;
+
+ case 'b':
+ this->val += '\b';
+ return;
- if (!(betweenTokens() ||
- ((this->m->state == st_token_ready) && this->m->unread_char))) {
- this->m->raw_val += orig_ch;
+ case 'f':
+ this->val += '\f';
+ return;
+
+ case '\n':
+ return;
+
+ case '\r':
+ this->state = st_string_after_cr;
+ return;
+
+ default:
+ // PDF spec says backslash is ignored before anything else
+ this->val += ch;
+ return;
+ }
+}
+
+void
+QPDFTokenizer::inStringAfterCR(char ch)
+{
+    // Handles the character that follows a CR inside a literal string.
+    // Either way we are back in ordinary string content afterwards.
+    this->state = st_in_string;
+    if (ch == '\n') {
+        // LF completing a CR LF pair: the CR was already accounted for,
+        // so the LF contributes nothing.
+        return;
+    }
+    inString(ch);
+}
+
+void
+QPDFTokenizer::inLt(char ch)
+{
+    // A single '<' has been seen. Anything other than a second '<' means
+    // this is a hex string, and ch is already its first character.
+    if (ch != '<') {
+        this->state = st_in_hexstring;
+        inHexstring(ch);
+        return;
+    }
+
+    // "<<" opens a dictionary.
+    this->type = tt_dict_open;
+    this->val += "<<";
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inGt(char ch)
+{
+    // A single '>' has been seen outside of a hex string.
+    if (ch != '>') {
+        // A lone '>' is a bad token. ch is not part of it, so record it
+        // for unreading.
+        QTC::TC("qpdf", "QPDFTokenizer bad >");
+        this->type = tt_bad;
+        this->error_message = "unexpected >";
+        this->val += ">";
+        this->char_to_unread = ch;
+        this->in_token = false;
+        this->state = st_token_ready;
+        return;
+    }
+
+    // ">>" closes a dictionary.
+    this->type = tt_dict_close;
+    this->val += ">>";
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inLiteral(char ch)
+{
+    if (!isDelimiter(ch)) {
+        this->val += ch;
+        return;
+    }
+
+    // A C-locale whitespace character or other delimiter ends the
+    // token. The terminator must be unread even when it is whitespace
+    // that would otherwise be skipped: it may be the newline following
+    // a stream keyword, and swallowing it here could break the
+    // stream-reading code on some files.
+    this->char_to_unread = ch;
+    this->in_token = false;
+    this->state = st_token_ready;
+
+    // Classify the finished word.
+    if ((this->val == "true") || (this->val == "false")) {
+        this->type = tt_bool;
+    } else if (this->val == "null") {
+        this->type = tt_null;
+    } else {
+        this->type = tt_word;
+    }
+}
+
+void
+QPDFTokenizer::inHexstring(char ch)
+{
+    // Expecting the first (high) hex digit of the next byte, the closing
+    // '>', or whitespace to skip.
+    int high_nibble = -1;
+    if ('0' <= ch && ch <= '9') {
+        high_nibble = int(ch) - int('0');
+    } else if ('A' <= ch && ch <= 'F') {
+        high_nibble = 10 + int(ch) - int('A');
+    } else if ('a' <= ch && ch <= 'f') {
+        high_nibble = 10 + int(ch) - int('a');
+    }
+
+    if (high_nibble >= 0) {
+        // Park the high nibble until the second digit arrives.
+        this->char_code = 16 * high_nibble;
+        this->state = st_in_hexstring_2nd;
+        return;
+    }
+    if (ch == '>') {
+        this->type = tt_string;
+        this->state = st_token_ready;
+        return;
+    }
+    if (isSpace(ch)) {
+        // ignore
+        return;
+    }
+
+    QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
+    this->type = tt_bad;
+    this->error_message =
+        std::string("invalid character (") + ch + ") in hexstring";
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inHexstring2nd(char ch)
+{
+    // Expecting the second (low) hex digit of a byte whose high nibble
+    // is already held in char_code.
+    int low_nibble = -1;
+    if ('0' <= ch && ch <= '9') {
+        low_nibble = int(ch) - int('0');
+    } else if ('A' <= ch && ch <= 'F') {
+        low_nibble = 10 + int(ch) - int('A');
+    } else if ('a' <= ch && ch <= 'f') {
+        low_nibble = 10 + int(ch) - int('a');
+    }
+
+    if (low_nibble >= 0) {
+        this->val += char(this->char_code + low_nibble);
+        this->state = st_in_hexstring;
+        return;
+    }
+    if (ch == '>') {
+        // PDF spec says odd hexstrings have implicit trailing 0, so the
+        // pending high nibble is emitted as a complete byte.
+        this->val += char(this->char_code);
+        this->type = tt_string;
+        this->state = st_token_ready;
+        return;
+    }
+    if (isSpace(ch)) {
+        // ignore
+        return;
+    }
+
+    QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character");
+    this->type = tt_bad;
+    this->error_message =
+        std::string("invalid character (") + ch + ") in hexstring";
+    this->state = st_token_ready;
+}
+
+void
+QPDFTokenizer::inCharCode(char ch)
+{
+    // Accumulates the digits of an octal escape (\d, \dd or \ddd) inside
+    // a literal string. char_code and digit_count are reset by
+    // inStringEscape before the first digit is passed in.
+    bool consumed = false;
+    if (('0' <= ch) && (ch <= '7')) {
+        this->char_code = 8 * this->char_code + (int(ch) - int('0'));
+        if (++(this->digit_count) < 3) {
+            return;
+        }
+        consumed = true;
+    }
+    // Either we've accumulated \ddd, or a \d / \dd escape was ended by a
+    // character that is not an octal digit. PDF Spec says to ignore
+    // high-order overflow.
+    this->val += char(this->char_code % 256);
+    this->state = st_in_string;
+    if (!consumed) {
+        // The terminating character is not part of the escape; it is
+        // ordinary string content (possibly the closing ')' or another
+        // backslash) and must still be processed rather than dropped.
+        inString(ch);
+    }
+}
+
+void
+QPDFTokenizer::inInlineImage(char ch)
+{
+ this->val += ch;
+ if (this->val.length() == this->inline_image_bytes) {
+ QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
+ this->type = tt_inline_image;
+ this->inline_image_bytes = 0;
+ this->state = st_token_ready;
}
}
void
QPDFTokenizer::presentEOF()
{
- if (this->m->state == st_literal) {
+ switch (this->state) {
+ case st_name:
+ case st_name_hex1:
+ case st_name_hex2:
+ case st_number:
+ case st_real:
+ case st_sign:
+ case st_decimal:
+ case st_literal:
QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
- resolveLiteral();
- } else if (
- (this->m->include_ignorable) && (this->m->state == st_in_space)) {
- this->m->type = tt_space;
- } else if (
- (this->m->include_ignorable) && (this->m->state == st_in_comment)) {
- this->m->type = tt_comment;
- } else if (betweenTokens()) {
- this->m->type = tt_eof;
- } else if (this->m->state != st_token_ready) {
+ // Push any delimiter to the state machine to finish off the final
+ // token.
+ presentCharacter('\f');
+ this->in_token = true;
+ break;
+
+ case st_top:
+ case st_before_token:
+ this->type = tt_eof;
+ break;
+
+ case st_in_space:
+ this->type = this->include_ignorable ? tt_space : tt_eof;
+ break;
+
+ case st_in_comment:
+ this->type = this->include_ignorable ? tt_comment : tt_bad;
+ break;
+
+ case st_token_ready:
+ break;
+
+ default:
QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
- this->m->type = tt_bad;
- this->m->error_message = "EOF while reading token";
+ this->type = tt_bad;
+ this->error_message = "EOF while reading token";
}
-
- this->m->state = st_token_ready;
+ this->state = st_token_ready;
}
void
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
{
- if (this->m->state != st_top) {
+ if (this->state != st_before_token) {
throw std::logic_error("QPDFTokenizer::expectInlineImage called"
" when tokenizer is in improper state");
}
findEI(input);
- this->m->state = st_inline_image;
+ this->before_token = false;
+ this->in_token = true;
+ this->state = st_inline_image;
}
void
@@ -537,7 +874,7 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
if (!input->findFirst("EI", input->tell(), 0, f)) {
break;
}
- this->m->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
+ this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
QPDFTokenizer check;
bool found_bad = false;
@@ -610,19 +947,16 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
bool
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
{
- bool ready = (this->m->state == st_token_ready);
- unread_char = this->m->unread_char;
- ch = this->m->char_to_unread;
+ bool ready = (this->state == st_token_ready);
+ unread_char = !this->in_token && !this->before_token;
+ ch = this->char_to_unread;
if (ready) {
- if (this->m->type == tt_bad) {
- this->m->val = this->m->raw_val;
- }
- token = Token(
- this->m->type,
- this->m->val,
- this->m->raw_val,
- this->m->error_message);
- this->m->reset();
+ token = (this->type == tt_bad)
+ ? Token(
+ this->type, this->raw_val, this->raw_val, this->error_message)
+ : Token(this->type, this->val, this->raw_val, this->error_message);
+
+ this->reset();
}
return ready;
}
@@ -630,11 +964,7 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
bool
QPDFTokenizer::betweenTokens()
{
- return (
- (this->m->state == st_top) ||
- ((!this->m->include_ignorable) &&
- ((this->m->state == st_in_comment) ||
- (this->m->state == st_in_space))));
+ return this->before_token;
}
QPDFTokenizer::Token
@@ -644,49 +974,46 @@ QPDFTokenizer::readToken(
bool allow_bad,
size_t max_len)
{
- qpdf_offset_t offset = input->tell();
- Token token;
- bool unread_char;
- char char_to_unread;
- bool presented_eof = false;
- while (!getToken(token, unread_char, char_to_unread)) {
+ qpdf_offset_t offset = input->fastTell();
+
+ while (this->state != st_token_ready) {
char ch;
- if (input->read(&ch, 1) == 0) {
- if (!presented_eof) {
- presentEOF();
- presented_eof = true;
- if ((this->m->type == tt_eof) && (!this->m->allow_eof)) {
- // Nothing in the qpdf library calls readToken
- // without allowEOF anymore, so this case is not
- // exercised.
- this->m->type = tt_bad;
- this->m->error_message = "unexpected EOF";
- offset = input->getLastOffset();
- }
- } else {
- throw std::logic_error(
- "getToken returned false after presenting EOF");
+ if (!input->fastRead(ch)) {
+ presentEOF();
+
+ if ((this->type == tt_eof) && (!this->allow_eof)) {
+ // Nothing in the qpdf library calls readToken
+ // without allowEOF anymore, so this case is not
+ // exercised.
+ this->type = tt_bad;
+ this->error_message = "unexpected EOF";
+ offset = input->getLastOffset();
}
} else {
- presentCharacter(ch);
- if (betweenTokens() && (input->getLastOffset() == offset)) {
+ handleCharacter(ch);
+ if (this->before_token) {
++offset;
}
- if (max_len && (this->m->raw_val.length() >= max_len) &&
- (this->m->state != st_token_ready)) {
+ if (this->in_token) {
+ this->raw_val += ch;
+ }
+ if (max_len && (this->raw_val.length() >= max_len) &&
+ (this->state != st_token_ready)) {
// terminate this token now
QTC::TC("qpdf", "QPDFTokenizer block long token");
- this->m->type = tt_bad;
- this->m->state = st_token_ready;
- this->m->error_message =
+ this->type = tt_bad;
+ this->state = st_token_ready;
+ this->error_message =
"exceeded allowable length while reading token";
}
}
}
- if (unread_char) {
- input->unreadCh(char_to_unread);
- }
+ Token token;
+ bool unread_char;
+ char char_to_unread;
+ getToken(token, unread_char, char_to_unread);
+ input->fastUnread(unread_char);
if (token.getType() != tt_eof) {
input->setLastOffset(offset);
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index 4e58aaf7..d565ece0 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -1207,52 +1207,6 @@ QUtil::random()
return result;
}
-bool
-QUtil::is_hex_digit(char ch)
-{
- return (ch && (strchr("0123456789abcdefABCDEF", ch) != nullptr));
-}
-
-bool
-QUtil::is_space(char ch)
-{
- return (ch && (strchr(" \f\n\r\t\v", ch) != nullptr));
-}
-
-bool
-QUtil::is_digit(char ch)
-{
- return ((ch >= '0') && (ch <= '9'));
-}
-
-bool
-QUtil::is_number(char const* p)
-{
- // ^[\+\-]?(\.\d*|\d+(\.\d*)?)$
- if (!*p) {
- return false;
- }
- if ((*p == '-') || (*p == '+')) {
- ++p;
- }
- bool found_dot = false;
- bool found_digit = false;
- for (; *p; ++p) {
- if (*p == '.') {
- if (found_dot) {
- // only one dot
- return false;
- }
- found_dot = true;
- } else if (QUtil::is_digit(*p)) {
- found_digit = true;
- } else {
- return false;
- }
- }
- return found_digit;
-}
-
void
QUtil::read_file_into_memory(
char const* filename, std::shared_ptr<char>& file_buf, size_t& size)
diff --git a/manual/release-notes.rst b/manual/release-notes.rst
index ab2c1d8e..01a19249 100644
--- a/manual/release-notes.rst
+++ b/manual/release-notes.rst
@@ -11,7 +11,8 @@ For a detailed list of changes, please see the file
- Many performance enhancements have been added. In developer
performance benchmarks, gains on the order of 20% have been
- observed.
+ observed. Most of that work, including major optimization of
+ qpdf's lexical layer, was done by M. Holger.
- Replacement of ``PointerHolder`` with ``std::shared_ptr``
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index f535b9ee..9e106902 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -66,8 +66,10 @@ QPDF can't find xref 0
QPDFTokenizer bad ) 0
QPDFTokenizer bad > 0
QPDFTokenizer bad hexstring character 0
+QPDFTokenizer bad hexstring 2nd character 0
QPDFTokenizer null in name 0
-QPDFTokenizer bad name 0
+QPDFTokenizer bad name 1 0
+QPDFTokenizer bad name 2 0
QPDF_Stream invalid filter 0
QPDF UseOutlines but no Outlines 0
QPDFObjectHandle makeDirect loop 0