diff options
Diffstat (limited to 'libqpdf')
-rw-r--r-- | libqpdf/BufferInputSource.cc | 85 | ||||
-rw-r--r-- | libqpdf/ClosedFileInputSource.cc | 46 | ||||
-rw-r--r-- | libqpdf/FileInputSource.cc | 85 | ||||
-rw-r--r-- | libqpdf/QPDFTokenizer.cc | 1105 | ||||
-rw-r--r-- | libqpdf/QUtil.cc | 46 |
5 files changed, 814 insertions, 553 deletions
diff --git a/libqpdf/BufferInputSource.cc b/libqpdf/BufferInputSource.cc index 5b59c801..6402f639 100644 --- a/libqpdf/BufferInputSource.cc +++ b/libqpdf/BufferInputSource.cc @@ -7,8 +7,8 @@ #include <stdexcept> #include <string.h> -BufferInputSource::Members::Members( - bool own_memory, std::string const& description, Buffer* buf) : +BufferInputSource::BufferInputSource( + std::string const& description, Buffer* buf, bool own_memory) : own_memory(own_memory), description(description), buf(buf), @@ -18,60 +18,54 @@ BufferInputSource::Members::Members( } BufferInputSource::BufferInputSource( - std::string const& description, Buffer* buf, bool own_memory) : - m(new Members(own_memory, description, buf)) -{ -} - -BufferInputSource::BufferInputSource( std::string const& description, std::string const& contents) : - m(new Members(true, description, nullptr)) + own_memory(true), + description(description), + buf(new Buffer(contents.length())), + cur_offset(0), + max_offset(QIntC::to_offset(buf->getSize())) { - this->m->buf = new Buffer(contents.length()); - this->m->max_offset = QIntC::to_offset(this->m->buf->getSize()); - unsigned char* bp = this->m->buf->getBuffer(); - memcpy(bp, contents.c_str(), contents.length()); + memcpy(buf->getBuffer(), contents.c_str(), contents.length()); } BufferInputSource::~BufferInputSource() { - if (this->m->own_memory) { - delete this->m->buf; + if (this->own_memory) { + delete this->buf; } } qpdf_offset_t BufferInputSource::findAndSkipNextEOL() { - if (this->m->cur_offset < 0) { + if (this->cur_offset < 0) { throw std::logic_error("INTERNAL ERROR: BufferInputSource offset < 0"); } - qpdf_offset_t end_pos = this->m->max_offset; - if (this->m->cur_offset >= end_pos) { + qpdf_offset_t end_pos = this->max_offset; + if (this->cur_offset >= end_pos) { this->last_offset = end_pos; - this->m->cur_offset = end_pos; + this->cur_offset = end_pos; return end_pos; } qpdf_offset_t result = 0; - unsigned char const* buffer = this->m->buf->getBuffer(); + unsigned char const* buffer = this->buf->getBuffer(); unsigned char const* end = buffer + end_pos; - unsigned char const* p = buffer + this->m->cur_offset; + unsigned char const* p = buffer + this->cur_offset; while ((p < end) && !((*p == '\r') || (*p == '\n'))) { ++p; } if (p < end) { result = p - buffer; - this->m->cur_offset = result + 1; + this->cur_offset = result + 1; ++p; - while ((this->m->cur_offset < end_pos) && - ((*p == '\r') || (*p == '\n'))) { + while ((this->cur_offset < end_pos) && ((*p == '\r') || (*p == '\n'))) { ++p; - ++this->m->cur_offset; + ++this->cur_offset; } } else { - this->m->cur_offset = end_pos; + this->cur_offset = end_pos; result = end_pos; } return result; @@ -80,13 +74,13 @@ BufferInputSource::findAndSkipNextEOL() std::string const& BufferInputSource::getName() const { - return this->m->description; + return this->description; } qpdf_offset_t BufferInputSource::tell() { - return this->m->cur_offset; + return this->cur_offset; } void @@ -94,17 +88,17 @@ BufferInputSource::seek(qpdf_offset_t offset, int whence) { switch (whence) { case SEEK_SET: - this->m->cur_offset = offset; + this->cur_offset = offset; break; case SEEK_END: - QIntC::range_check(this->m->max_offset, offset); - this->m->cur_offset = this->m->max_offset + offset; + QIntC::range_check(this->max_offset, offset); + this->cur_offset = this->max_offset + offset; break; case SEEK_CUR: - QIntC::range_check(this->m->cur_offset, offset); - this->m->cur_offset += offset; + QIntC::range_check(this->cur_offset, offset); + this->cur_offset += offset; break; default: @@ -113,42 +107,41 @@ BufferInputSource::seek(qpdf_offset_t offset, int whence) break; } - if (this->m->cur_offset < 0) { + if (this->cur_offset < 0) { throw std::runtime_error( - this->m->description + ": seek before beginning of buffer"); + this->description + ": seek before beginning of buffer"); } } void BufferInputSource::rewind() { - this->m->cur_offset = 0; + this->cur_offset = 0; } size_t BufferInputSource::read(char* buffer, size_t length) { - if (this->m->cur_offset < 0) { + if (this->cur_offset < 0) { throw std::logic_error("INTERNAL ERROR: BufferInputSource offset < 0"); } - qpdf_offset_t end_pos = this->m->max_offset; - if (this->m->cur_offset >= end_pos) { + qpdf_offset_t end_pos = this->max_offset; + if (this->cur_offset >= end_pos) { this->last_offset = end_pos; return 0; } - this->last_offset = this->m->cur_offset; - size_t len = - std::min(QIntC::to_size(end_pos - this->m->cur_offset), length); - memcpy(buffer, this->m->buf->getBuffer() + this->m->cur_offset, len); - this->m->cur_offset += QIntC::to_offset(len); + this->last_offset = this->cur_offset; + size_t len = std::min(QIntC::to_size(end_pos - this->cur_offset), length); + memcpy(buffer, this->buf->getBuffer() + this->cur_offset, len); + this->cur_offset += QIntC::to_offset(len); return len; } void BufferInputSource::unreadCh(char ch) { - if (this->m->cur_offset > 0) { - --this->m->cur_offset; + if (this->cur_offset > 0) { + --this->cur_offset; } } diff --git a/libqpdf/ClosedFileInputSource.cc b/libqpdf/ClosedFileInputSource.cc index ec977c69..06ebb156 100644 --- a/libqpdf/ClosedFileInputSource.cc +++ b/libqpdf/ClosedFileInputSource.cc @@ -2,18 +2,13 @@ #include <qpdf/FileInputSource.hh> -ClosedFileInputSource::Members::Members(char const* filename) : +ClosedFileInputSource::ClosedFileInputSource(char const* filename) : filename(filename), offset(0), stay_open(false) { } -ClosedFileInputSource::ClosedFileInputSource(char const* filename) : - m(new Members(filename)) -{ -} - ClosedFileInputSource::~ClosedFileInputSource() { // Must be explicit and not inline -- see QPDF_DLL_CLASS in @@ -23,30 +18,29 @@ ClosedFileInputSource::~ClosedFileInputSource() void ClosedFileInputSource::before() { - if (nullptr == this->m->fis) { - this->m->fis = - std::make_shared<FileInputSource>(this->m->filename.c_str()); - this->m->fis->seek(this->m->offset, SEEK_SET); - this->m->fis->setLastOffset(this->last_offset); + if (nullptr == this->fis) { + this->fis = std::make_shared<FileInputSource>(this->filename.c_str()); + this->fis->seek(this->offset, SEEK_SET); + this->fis->setLastOffset(this->last_offset); } } void ClosedFileInputSource::after() { - this->last_offset = this->m->fis->getLastOffset(); - this->m->offset = this->m->fis->tell(); - if (this->m->stay_open) { + this->last_offset = this->fis->getLastOffset(); + this->offset = this->fis->tell(); + if (this->stay_open) { return; } - this->m->fis = nullptr; + this->fis = nullptr; } qpdf_offset_t ClosedFileInputSource::findAndSkipNextEOL() { before(); - qpdf_offset_t r = this->m->fis->findAndSkipNextEOL(); + qpdf_offset_t r = this->fis->findAndSkipNextEOL(); after(); return r; } @@ -54,14 +48,14 @@ ClosedFileInputSource::findAndSkipNextEOL() std::string const& ClosedFileInputSource::getName() const { - return this->m->filename; + return this->filename; } qpdf_offset_t ClosedFileInputSource::tell() { before(); - qpdf_offset_t r = this->m->fis->tell(); + qpdf_offset_t r = this->fis->tell(); after(); return r; } @@ -70,16 +64,16 @@ void ClosedFileInputSource::seek(qpdf_offset_t offset, int whence) { before(); - this->m->fis->seek(offset, whence); + this->fis->seek(offset, whence); after(); } void ClosedFileInputSource::rewind() { - this->m->offset = 0; - if (this->m->fis.get()) { - this->m->fis->rewind(); + this->offset = 0; + if (this->fis.get()) { + this->fis->rewind(); } } @@ -87,7 +81,7 @@ size_t ClosedFileInputSource::read(char* buffer, size_t length) { before(); - size_t r = this->m->fis->read(buffer, length); + size_t r = this->fis->read(buffer, length); after(); return r; } @@ -96,7 +90,7 @@ void ClosedFileInputSource::unreadCh(char ch) { before(); - this->m->fis->unreadCh(ch); + this->fis->unreadCh(ch); // Don't call after -- the file has to stay open after this // operation. } @@ -104,8 +98,8 @@ ClosedFileInputSource::unreadCh(char ch) void ClosedFileInputSource::stayOpen(bool val) { - this->m->stay_open = val; - if ((!val) && this->m->fis.get()) { + this->stay_open = val; + if ((!val) && this->fis.get()) { after(); } } diff --git a/libqpdf/FileInputSource.cc b/libqpdf/FileInputSource.cc index ab88d302..2b1ee1ab 100644 --- a/libqpdf/FileInputSource.cc +++ b/libqpdf/FileInputSource.cc @@ -5,60 +5,52 @@ #include <algorithm> #include <string.h> -FileInputSource::Members::Members(bool close_file) : - close_file(close_file), - file(nullptr) -{ -} - -FileInputSource::Members::~Members() -{ - if (this->file && this->close_file) { - fclose(this->file); - } -} - FileInputSource::FileInputSource() : - m(new Members(false)) + close_file(false), + file(nullptr) { } FileInputSource::FileInputSource(char const* filename) : - m(new Members(false)) + close_file(true), + filename(filename), + file(QUtil::safe_fopen(filename, "rb")) { - setFilename(filename); } FileInputSource::FileInputSource( char const* description, FILE* filep, bool close_file) : - m(new Members(false)) + close_file(close_file), + filename(description), + file(filep) +{ +} + +FileInputSource::~FileInputSource() { - setFile(description, filep, close_file); + // Must be explicit and not inline -- see QPDF_DLL_CLASS in + // README-maintainer + if (this->file && this->close_file) { + fclose(this->file); + } } void FileInputSource::setFilename(char const* filename) { - this->m = std::shared_ptr<Members>(new Members(true)); - this->m->filename = filename; - this->m->file = QUtil::safe_fopen(filename, "rb"); + this->close_file = true; + this->filename = filename; + this->file = QUtil::safe_fopen(filename, "rb"); } void FileInputSource::setFile(char const* description, FILE* filep, bool close_file) { - this->m = std::shared_ptr<Members>(new Members(close_file)); - this->m->filename = description; - this->m->file = filep; + this->filename = description; + this->file = filep; this->seek(0, SEEK_SET); } -FileInputSource::~FileInputSource() -{ - // Must be explicit and not inline -- see QPDF_DLL_CLASS in - // README-maintainer -} - qpdf_offset_t FileInputSource::findAndSkipNextEOL() { @@ -66,7 +58,7 @@ FileInputSource::findAndSkipNextEOL() bool done = false; char buf[10240]; while (!done) { - qpdf_offset_t cur_offset = QUtil::tell(this->m->file); + qpdf_offset_t cur_offset = QUtil::tell(this->file); size_t len = this->read(buf, sizeof(buf)); if (len == 0) { done = true; @@ -98,41 +90,42 @@ FileInputSource::findAndSkipNextEOL() std::string const& FileInputSource::getName() const { - return this->m->filename; + return this->filename; } qpdf_offset_t FileInputSource::tell() { - return QUtil::tell(this->m->file); + return QUtil::tell(this->file); } void FileInputSource::seek(qpdf_offset_t offset, int whence) { - QUtil::os_wrapper( - (std::string("seek to ") + this->m->filename + ", offset " + - QUtil::int_to_string(offset) + " (" + QUtil::int_to_string(whence) + - ")"), - QUtil::seek(this->m->file, offset, whence)); + if (QUtil::seek(this->file, offset, whence) == -1) { + QUtil::throw_system_error( + std::string("seek to ") + this->filename + ", offset " + + QUtil::int_to_string(offset) + " (" + QUtil::int_to_string(whence) + + ")"); + } } void FileInputSource::rewind() { - ::rewind(this->m->file); + ::rewind(this->file); } size_t FileInputSource::read(char* buffer, size_t length) { - this->last_offset = this->tell(); - size_t len = fread(buffer, 1, length, this->m->file); + this->last_offset = QUtil::tell(this->file); + size_t len = fread(buffer, 1, length, this->file); if (len == 0) { - if (ferror(this->m->file)) { + if (ferror(this->file)) { throw QPDFExc( qpdf_e_system, - this->m->filename, + this->filename, "", this->last_offset, (std::string("read ") + QUtil::uint_to_string(length) + @@ -148,7 +141,7 @@ FileInputSource::read(char* buffer, size_t length) void FileInputSource::unreadCh(char ch) { - QUtil::os_wrapper( - this->m->filename + ": unread character", - ungetc(static_cast<unsigned char>(ch), this->m->file)); + if (ungetc(static_cast<unsigned char>(ch), this->file) == -1) { + QUtil::throw_system_error(this->filename + ": unread character"); + } } diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 1726e1b9..cd8f932d 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -73,28 +73,20 @@ QPDFWordTokenFinder::check() return true; } -QPDFTokenizer::Members::Members() : - allow_eof(false), - include_ignorable(false) -{ - reset(); -} - void -QPDFTokenizer::Members::reset() +QPDFTokenizer::reset() { - state = st_top; + state = st_before_token; type = tt_bad; - val = ""; - raw_val = ""; + val.clear(); + raw_val.clear(); error_message = ""; - unread_char = false; + before_token = true; + in_token = false; char_to_unread = '\0'; inline_image_bytes = 0; string_depth = 0; - string_ignoring_newline = false; - last_char_was_bs = false; - last_char_was_cr = false; + bad = false; } QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : @@ -110,20 +102,22 @@ QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : } QPDFTokenizer::QPDFTokenizer() : - m(new Members()) + allow_eof(false), + include_ignorable(false) { + reset(); } void QPDFTokenizer::allowEOF() { - this->m->allow_eof = true; + this->allow_eof = true; } void QPDFTokenizer::includeIgnorable() { - this->m->include_ignorable = true; + this->include_ignorable = true; } bool @@ -139,376 +133,719 @@ QPDFTokenizer::isDelimiter(char ch) } void -QPDFTokenizer::resolveLiteral() -{ - if ((this->m->val.length() > 0) && (this->m->val.at(0) == '/')) { - this->m->type = tt_name; - // Deal with # in name token. Note: '/' by itself is a - // valid name, so don't strip leading /. That way we - // don't have to deal with the empty string as a name. - std::string nval = "/"; - size_t len = this->m->val.length(); - for (size_t i = 1; i < len; ++i) { - char ch = this->m->val.at(i); - if (ch == '#') { - if ((i + 2 < len) && - QUtil::is_hex_digit(this->m->val.at(i + 1)) && - QUtil::is_hex_digit(this->m->val.at(i + 2))) { - char num[3]; - num[0] = this->m->val.at(i + 1); - num[1] = this->m->val.at(i + 2); - num[2] = '\0'; - char ch2 = static_cast<char>(strtol(num, nullptr, 16)); - if (ch2 == '\0') { - this->m->type = tt_bad; - QTC::TC("qpdf", "QPDFTokenizer null in name"); - this->m->error_message = - "null character not allowed in name token"; - nval += "#00"; - } else { - nval.append(1, ch2); - } - i += 2; - } else { - QTC::TC("qpdf", "QPDFTokenizer bad name"); - this->m->error_message = - "name with stray # will not work with PDF >= 1.2"; - // Use null to encode a bad # -- this is reversed - // in QPDF_Name::normalizeName. - nval += '\0'; - } - } else { - nval.append(1, ch); - } - } - this->m->val = nval; - } else if (QUtil::is_number(this->m->val.c_str())) { - if (this->m->val.find('.') != std::string::npos) { - this->m->type = tt_real; - } else { - this->m->type = tt_integer; - } - } else if ((this->m->val == "true") || (this->m->val == "false")) { - this->m->type = tt_bool; - } else if (this->m->val == "null") { - this->m->type = tt_null; - } else { - // I don't really know what it is, so leave it as tt_word. - // Lots of cases ($, #, etc.) other than actual words fall - // into this category, but that's okay at least for now. - this->m->type = tt_word; +QPDFTokenizer::presentCharacter(char ch) +{ + handleCharacter(ch); + + if (this->in_token) { + this->raw_val += ch; } } void -QPDFTokenizer::presentCharacter(char ch) +QPDFTokenizer::handleCharacter(char ch) { - if (this->m->state == st_token_ready) { + // State machine is implemented such that the final character may not be + // handled. This happens whenever you have to use a character from the + // next token to detect the end of the current token. + + switch (this->state) { + case st_top: + inTop(ch); + return; + + case st_in_space: + inSpace(ch); + return; + + case st_in_comment: + inComment(ch); + return; + + case st_lt: + inLt(ch); + return; + + case st_gt: + inGt(ch); + return; + + case st_in_string: + inString(ch); + return; + + case st_name: + inName(ch); + return; + + case st_number: + inNumber(ch); + return; + + case st_real: + inReal(ch); + return; + + case st_string_after_cr: + inStringAfterCR(ch); + return; + + case st_string_escape: + inStringEscape(ch); + return; + + case st_char_code: + inCharCode(ch); + return; + + case st_literal: + inLiteral(ch); + return; + + case st_inline_image: + inInlineImage(ch); + return; + + case st_in_hexstring: + inHexstring(ch); + return; + + case st_in_hexstring_2nd: + inHexstring2nd(ch); + return; + + case st_name_hex1: + inNameHex1(ch); + return; + + case st_name_hex2: + inNameHex2(ch); + return; + + case st_sign: + inSign(ch); + return; + + case st_decimal: + inDecimal(ch); + return; + + case (st_before_token): + inBeforeToken(ch); + return; + + case (st_token_ready): + inTokenReady(ch); + return; + + default: throw std::logic_error( - "INTERNAL ERROR: QPDF tokenizer presented character " - "while token is waiting"); + "INTERNAL ERROR: invalid state while reading token"); } +} - char orig_ch = ch; - - // State machine is implemented such that some characters may be - // handled more than once. This happens whenever you have to use - // the character that caused a state change in the new state. +void +QPDFTokenizer::inTokenReady(char ch) +{ + throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character " + "while token is waiting"); +} - bool handled = true; - if (this->m->state == st_top) { - // Note: we specifically do not use ctype here. It is - // locale-dependent. - if (isSpace(ch)) { - if (this->m->include_ignorable) { - this->m->state = st_in_space; - this->m->val += ch; - } - } else if (ch == '%') { - this->m->state = st_in_comment; - if (this->m->include_ignorable) { - this->m->val += ch; - } - } else if (ch == '(') { - this->m->string_depth = 1; - this->m->string_ignoring_newline = false; - memset( - this->m->bs_num_register, - '\0', - sizeof(this->m->bs_num_register)); - this->m->last_char_was_bs = false; - this->m->last_char_was_cr = false; - this->m->state = st_in_string; - } else if (ch == '<') { - this->m->state = st_lt; - } else if (ch == '>') { - this->m->state = st_gt; - } else { - this->m->val += ch; - if (ch == ')') { - this->m->type = tt_bad; - QTC::TC("qpdf", "QPDFTokenizer bad )"); - this->m->error_message = "unexpected )"; - this->m->state = st_token_ready; - } else if (ch == '[') { - this->m->type = tt_array_open; - this->m->state = st_token_ready; - } else if (ch == ']') { - this->m->type = tt_array_close; - this->m->state = st_token_ready; - } else if (ch == '{') { - this->m->type = tt_brace_open; - this->m->state = st_token_ready; - } else if (ch == '}') { - this->m->type = tt_brace_close; - this->m->state = st_token_ready; - } else { - this->m->state = st_literal; - } - } - } else if (this->m->state == st_in_space) { - // We only enter this state if include_ignorable is true. - if (!isSpace(ch)) { - this->m->type = tt_space; - this->m->unread_char = true; - this->m->char_to_unread = ch; - this->m->state = st_token_ready; - } else { - this->m->val += ch; - } - } else if (this->m->state == st_in_comment) { - if ((ch == '\r') || (ch == '\n')) { - if (this->m->include_ignorable) { - this->m->type = tt_comment; - this->m->unread_char = true; - this->m->char_to_unread = ch; - this->m->state = st_token_ready; - } else { - this->m->state = st_top; - } - } else if (this->m->include_ignorable) { - this->m->val += ch; - } - } else if (this->m->state == st_lt) { - if (ch == '<') { - this->m->val = "<<"; - this->m->type = tt_dict_open; - this->m->state = st_token_ready; - } else { - handled = false; - this->m->state = st_in_hexstring; - } - } else if (this->m->state == st_gt) { - if (ch == '>') { - this->m->val = ">>"; - this->m->type = tt_dict_close; - this->m->state = st_token_ready; - } else { - this->m->val = ">"; - this->m->type = tt_bad; - QTC::TC("qpdf", "QPDFTokenizer bad >"); - this->m->error_message = "unexpected >"; - this->m->unread_char = true; - this->m->char_to_unread = ch; - this->m->state = st_token_ready; +void +QPDFTokenizer::inBeforeToken(char ch) +{ + // Note: we specifically do not use ctype here. It is + // locale-dependent. + if (isSpace(ch)) { + this->before_token = !this->include_ignorable; + this->in_token = this->include_ignorable; + if (this->include_ignorable) { + this->state = st_in_space; + this->val += ch; } - } else if (this->m->state == st_in_string) { - if (this->m->string_ignoring_newline && (ch != '\n')) { - this->m->string_ignoring_newline = false; + } else if (ch == '%') { + this->before_token = !this->include_ignorable; + this->in_token = this->include_ignorable; + this->state = st_in_comment; + if (this->include_ignorable) { + this->val += ch; } + } else { + this->before_token = false; + this->in_token = true; + inTop(ch); + } +} - size_t bs_num_count = strlen(this->m->bs_num_register); - bool ch_is_octal = ((ch >= '0') && (ch <= '7')); - if ((bs_num_count == 3) || ((bs_num_count > 0) && (!ch_is_octal))) { - // We've accumulated \ddd. PDF Spec says to ignore - // high-order overflow. - this->m->val += - static_cast<char>(strtol(this->m->bs_num_register, nullptr, 8)); - memset( - this->m->bs_num_register, - '\0', - sizeof(this->m->bs_num_register)); - bs_num_count = 0; - } +void +QPDFTokenizer::inTop(char ch) +{ + switch (ch) { + case '(': + this->string_depth = 1; + this->state = st_in_string; + return; - if (this->m->string_ignoring_newline && (ch == '\n')) { - // ignore - this->m->string_ignoring_newline = false; - } else if ( - ch_is_octal && (this->m->last_char_was_bs || (bs_num_count > 0))) { - this->m->bs_num_register[bs_num_count++] = ch; - } else if (this->m->last_char_was_bs) { - switch (ch) { - case 'n': - this->m->val += '\n'; - break; + case '<': + this->state = st_lt; + return; - case 'r': - this->m->val += '\r'; - break; + case '>': + this->state = st_gt; + return; - case 't': - this->m->val += '\t'; - break; + case (')'): + this->type = tt_bad; + QTC::TC("qpdf", "QPDFTokenizer bad )"); + this->error_message = "unexpected )"; + this->val += ch; + this->state = st_token_ready; + return; - case 'b': - this->m->val += '\b'; - break; + case '[': + this->type = tt_array_open; + this->state = st_token_ready; + this->val += ch; + return; - case 'f': - this->m->val += '\f'; - break; + case ']': + this->type = tt_array_close; + this->val += ch; + this->state = st_token_ready; + return; - case '\n': - break; + case '{': + this->type = tt_brace_open; + this->state = st_token_ready; + this->val += ch; + return; - case '\r': - this->m->string_ignoring_newline = true; - break; + case '}': + this->type = tt_brace_close; + this->state = st_token_ready; + this->val += ch; + return; - default: - // PDF spec says backslash is ignored before anything else - this->m->val += ch; - break; - } - } else if (ch == '\\') { - // last_char_was_bs is set/cleared below as appropriate - if (bs_num_count) { - throw std::logic_error( - "INTERNAL ERROR: QPDFTokenizer: bs_num_count != 0 " - "when ch == '\\'"); - } - } else if (ch == '(') { - this->m->val += ch; - ++this->m->string_depth; - } else if ((ch == ')') && (--this->m->string_depth == 0)) { - this->m->type = tt_string; - this->m->state = st_token_ready; - } else if (ch == '\r') { - // CR by itself is converted to LF - this->m->val += '\n'; - } else if (ch == '\n') { - // CR LF is converted to LF - if (!this->m->last_char_was_cr) { - this->m->val += ch; - } - } else { - this->m->val += ch; - } + case '/': + this->state = st_name; + this->val += ch; + return; - this->m->last_char_was_cr = - ((!this->m->string_ignoring_newline) && (ch == '\r')); - this->m->last_char_was_bs = - ((!this->m->last_char_was_bs) && (ch == '\\')); - } else if (this->m->state == st_literal) { - if (isDelimiter(ch)) { - // A C-locale whitespace character or delimiter terminates - // token. It is important to unread the whitespace - // character even though it is ignored since it may be the - // newline after a stream keyword. Removing it here could - // make the stream-reading code break on some files, - // though not on any files in the test suite as of this - // writing. - - this->m->type = tt_word; - this->m->unread_char = true; - this->m->char_to_unread = ch; - this->m->state = st_token_ready; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + this->state = st_number; + this->val += ch; + return; + + case '+': + case '-': + this->state = st_sign; + this->val += ch; + return; + + case '.': + this->state = st_decimal; + this->val += ch; + return; + + default: + this->state = st_literal; + this->val += ch; + return; + } +} + +void +QPDFTokenizer::inSpace(char ch) +{ + // We only enter this state if include_ignorable is true. + if (!isSpace(ch)) { + this->type = tt_space; + this->in_token = false; + this->char_to_unread = ch; + this->state = st_token_ready; + return; + } else { + this->val += ch; + return; + } +} + +void +QPDFTokenizer::inComment(char ch) +{ + if ((ch == '\r') || (ch == '\n')) { + if (this->include_ignorable) { + this->type = tt_comment; + this->in_token = false; + this->char_to_unread = ch; + this->state = st_token_ready; } else { - this->m->val += ch; + this->state = st_before_token; } - } else if (this->m->state == st_inline_image) { - this->m->val += ch; - size_t len = this->m->val.length(); - if (len == this->m->inline_image_bytes) { - QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); - this->m->type = tt_inline_image; - this->m->inline_image_bytes = 0; - this->m->state = st_token_ready; + } else if (this->include_ignorable) { + this->val += ch; + } +} + +void +QPDFTokenizer::inString(char ch) +{ + switch (ch) { + case '\\': + this->state = st_string_escape; + return; + + case '(': + this->val += ch; + ++this->string_depth; + return; + + case ')': + if (--this->string_depth == 0) { + this->type = tt_string; + this->state = st_token_ready; + return; } + + this->val += ch; + return; + + case '\r': + // CR by itself is converted to LF + this->val += '\n'; + this->state = st_string_after_cr; + return; + + case '\n': + this->val += ch; + return; + + default: + this->val += ch; + return; + } +} + +void +QPDFTokenizer::inName(char ch) +{ + if (isDelimiter(ch)) { + // A C-locale whitespace character or delimiter terminates + // token. It is important to unread the whitespace + // character even though it is ignored since it may be the + // newline after a stream keyword. Removing it here could + // make the stream-reading code break on some files, + // though not on any files in the test suite as of this + // writing. + + this->type = this->bad ? tt_bad : tt_name; + this->in_token = false; + this->char_to_unread = ch; + this->state = st_token_ready; + } else if (ch == '#') { + this->char_code = 0; + this->state = st_name_hex1; } else { - handled = false; - } - - if (handled) { - // okay - } else if (this->m->state == st_in_hexstring) { - if (ch == '>') { - this->m->type = tt_string; - this->m->state = st_token_ready; - if (this->m->val.length() % 2) { - // PDF spec says odd hexstrings have implicit - // trailing 0. - this->m->val += '0'; - } - char num[3]; - num[2] = '\0'; - std::string nval; - for (unsigned int i = 0; i < this->m->val.length(); i += 2) { - num[0] = this->m->val.at(i); - num[1] = this->m->val.at(i + 1); - char nch = static_cast<char>(strtol(num, nullptr, 16)); - nval += nch; - } - this->m->val = nval; - } else if (QUtil::is_hex_digit(ch)) { - this->m->val += ch; - } else if (isSpace(ch)) { - // ignore - } else { - this->m->type = tt_bad; - QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); - this->m->error_message = - std::string("invalid character (") + ch + ") in hexstring"; - this->m->state = st_token_ready; - } + this->val += ch; + } +} + +void +QPDFTokenizer::inNameHex1(char ch) +{ + this->hex_char = ch; + + if ('0' <= ch && ch <= '9') { + this->char_code = 16 * (int(ch) - int('0')); + this->state = st_name_hex2; + + } else if ('A' <= ch && ch <= 'F') { + this->char_code = 16 * (10 + int(ch) - int('A')); + this->state = st_name_hex2; + + } else if ('a' <= ch && ch <= 'f') { + this->char_code = 16 * (10 + int(ch) - int('a')); + this->state = st_name_hex2; + } else { - throw std::logic_error( - "INTERNAL ERROR: invalid state while reading token"); + QTC::TC("qpdf", "QPDFTokenizer bad name 1"); + this->error_message = "name with stray # will not work with PDF >= 1.2"; + // Use null to encode a bad # -- this is reversed + // in QPDF_Name::normalizeName. + this->val += '\0'; + this->state = st_name; + inName(ch); + } +} + +void +QPDFTokenizer::inNameHex2(char ch) +{ + if ('0' <= ch && ch <= '9') { + this->char_code += int(ch) - int('0'); + + } else if ('A' <= ch && ch <= 'F') { + this->char_code += 10 + int(ch) - int('A'); + + } else if ('a' <= ch && ch <= 'f') { + this->char_code += 10 + int(ch) - int('a'); + + } else { + QTC::TC("qpdf", "QPDFTokenizer bad name 2"); + this->error_message = "name with stray # will not work with PDF >= 1.2"; + // Use null to encode a bad # -- this is reversed + // in QPDF_Name::normalizeName. + this->val += '\0'; + this->val += this->hex_char; + this->state = st_name; + inName(ch); + return; + } + if (this->char_code == 0) { + QTC::TC("qpdf", "QPDFTokenizer null in name"); + this->error_message = "null character not allowed in name token"; + this->val += "#00"; + this->state = st_name; + this->bad = true; + } else { + this->val += char(this->char_code); + this->state = st_name; + } +} + +void +QPDFTokenizer::inSign(char ch) +{ + if (QUtil::is_digit(ch)) { + this->state = st_number; + this->val += ch; + } else if (ch == '.') { + this->state = st_decimal; + this->val += ch; + } else { + this->state = st_literal; + inLiteral(ch); } +} - if ((this->m->state == st_token_ready) && (this->m->type == tt_word)) { - resolveLiteral(); +void +QPDFTokenizer::inDecimal(char ch) +{ + if (QUtil::is_digit(ch)) { + this->state = st_real; + this->val += ch; + } else { + this->state = st_literal; + inLiteral(ch); } +} + +void +QPDFTokenizer::inNumber(char ch) +{ + if (QUtil::is_digit(ch)) { + this->val += ch; + } else if (ch == '.') { + this->state = st_real; + this->val += ch; + } else if (isDelimiter(ch)) { + this->type = tt_integer; + this->state = st_token_ready; + this->in_token = false; + this->char_to_unread = ch; + } else { + this->state = st_literal; + this->val += ch; + } +} + +void +QPDFTokenizer::inReal(char ch) +{ + if (QUtil::is_digit(ch)) { + this->val += ch; + } else if (isDelimiter(ch)) { + this->type = tt_real; + this->state = st_token_ready; + this->in_token = false; + this->char_to_unread = ch; + } else { + this->state = st_literal; + this->val += ch; + } +} +void +QPDFTokenizer::inStringEscape(char ch) +{ + this->state = st_in_string; + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + this->state = st_char_code; + this->char_code = 0; + this->digit_count = 0; + inCharCode(ch); + return; + + case 'n': + this->val += '\n'; + return; + + case 'r': + this->val += '\r'; + return; + + case 't': + this->val += '\t'; + return; + + case 'b': + this->val += '\b'; + return; - if (!(betweenTokens() || - ((this->m->state == st_token_ready) && this->m->unread_char))) { - this->m->raw_val += orig_ch; + case 'f': + this->val += '\f'; + return; + + case '\n': + return; + + case '\r': + this->state = st_string_after_cr; + return; + + default: + // PDF spec says backslash is ignored before anything else + this->val += ch; + return; + } +} + +void +QPDFTokenizer::inStringAfterCR(char ch) +{ + this->state = st_in_string; + if (ch != '\n') { + inString(ch); + } +} + +void +QPDFTokenizer::inLt(char ch) +{ + if (ch == '<') { + this->val += "<<"; + this->type = tt_dict_open; + this->state = st_token_ready; + return; + } + + this->state = st_in_hexstring; + inHexstring(ch); +} + +void +QPDFTokenizer::inGt(char ch) +{ + if (ch == '>') { + this->val += ">>"; + this->type = tt_dict_close; + this->state = st_token_ready; + } else { + this->val += ">"; + this->type = tt_bad; + QTC::TC("qpdf", "QPDFTokenizer bad >"); + this->error_message = "unexpected >"; + this->in_token = false; + this->char_to_unread = ch; + this->state = st_token_ready; + } +} + +void +QPDFTokenizer::inLiteral(char ch) +{ + if (isDelimiter(ch)) { + // A C-locale whitespace character or delimiter terminates + // token. It is important to unread the whitespace + // character even though it is ignored since it may be the + // newline after a stream keyword. Removing it here could + // make the stream-reading code break on some files, + // though not on any files in the test suite as of this + // writing. + + this->in_token = false; + this->char_to_unread = ch; + this->state = st_token_ready; + this->type = (this->val == "true") || (this->val == "false") + ? tt_bool + : (this->val == "null" ? tt_null : tt_word); + } else { + this->val += ch; + } +} + +void +QPDFTokenizer::inHexstring(char ch) +{ + if ('0' <= ch && ch <= '9') { + this->char_code = 16 * (int(ch) - int('0')); + this->state = st_in_hexstring_2nd; + + } else if ('A' <= ch && ch <= 'F') { + this->char_code = 16 * (10 + int(ch) - int('A')); + this->state = st_in_hexstring_2nd; + + } else if ('a' <= ch && ch <= 'f') { + this->char_code = 16 * (10 + int(ch) - int('a')); + this->state = st_in_hexstring_2nd; + + } else if (ch == '>') { + this->type = tt_string; + this->state = st_token_ready; + + } else if (isSpace(ch)) { + // ignore + + } else { + this->type = tt_bad; + QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); + this->error_message = + std::string("invalid character (") + ch + ") in hexstring"; + this->state = st_token_ready; + } +} + +void +QPDFTokenizer::inHexstring2nd(char ch) +{ + if ('0' <= ch && ch <= '9') { + this->val += char(this->char_code + int(ch) - int('0')); + this->state = st_in_hexstring; + + } else if ('A' <= ch && ch <= 'F') { + this->val += char(this->char_code + 10 + int(ch) - int('A')); + this->state = st_in_hexstring; + + } else if ('a' <= ch && ch <= 'f') { + this->val += char(this->char_code + 10 + int(ch) - int('a')); + this->state = st_in_hexstring; + + } else if (ch == '>') { + // PDF spec says odd hexstrings have implicit trailing 0. + this->val += char(this->char_code); + this->type = tt_string; + this->state = st_token_ready; + + } else if (isSpace(ch)) { + // ignore + + } else { + this->type = tt_bad; + QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character"); + this->error_message = + std::string("invalid character (") + ch + ") in hexstring"; + this->state = st_token_ready; + } +} + +void +QPDFTokenizer::inCharCode(char ch) +{ + if (('0' <= ch) && (ch <= '7')) { + this->char_code = 8 * this->char_code + (int(ch) - int('0')); + if (++(this->digit_count) < 3) { + return; + } + // We've accumulated \ddd. PDF Spec says to ignore + // high-order overflow. + } + this->val += char(this->char_code % 256); + this->state = st_in_string; + return; +} + +void +QPDFTokenizer::inInlineImage(char ch) +{ + this->val += ch; + if (this->val.length() == this->inline_image_bytes) { + QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); + this->type = tt_inline_image; + this->inline_image_bytes = 0; + this->state = st_token_ready; } } void QPDFTokenizer::presentEOF() { - if (this->m->state == st_literal) { + switch (this->state) { + case st_name: + case st_name_hex1: + case st_name_hex2: + case st_number: + case st_real: + case st_sign: + case st_decimal: + case st_literal: QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); - resolveLiteral(); - } else if ( - (this->m->include_ignorable) && (this->m->state == st_in_space)) { - this->m->type = tt_space; - } else if ( - (this->m->include_ignorable) && (this->m->state == st_in_comment)) { - this->m->type = tt_comment; - } else if (betweenTokens()) { - this->m->type = tt_eof; - } else if (this->m->state != st_token_ready) { + // Push any delimiter to the state machine to finish off the final + // token. + presentCharacter('\f'); + this->in_token = true; + break; + + case st_top: + case st_before_token: + this->type = tt_eof; + break; + + case st_in_space: + this->type = this->include_ignorable ? tt_space : tt_eof; + break; + + case st_in_comment: + this->type = this->include_ignorable ? tt_comment : tt_bad; + break; + + case st_token_ready: + break; + + default: QTC::TC("qpdf", "QPDFTokenizer EOF reading token"); - this->m->type = tt_bad; - this->m->error_message = "EOF while reading token"; + this->type = tt_bad; + this->error_message = "EOF while reading token"; } - - this->m->state = st_token_ready; + this->state = st_token_ready; } void QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) { - if (this->m->state != st_top) { + if (this->state != st_before_token) { throw std::logic_error("QPDFTokenizer::expectInlineImage called" " when tokenizer is in improper state"); } findEI(input); - this->m->state = st_inline_image; + this->before_token = false; + this->in_token = true; + this->state = st_inline_image; } void @@ -537,7 +874,7 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) if (!input->findFirst("EI", input->tell(), 0, f)) { break; } - this->m->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); + this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); QPDFTokenizer check; bool found_bad = false; @@ -610,19 +947,16 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) bool QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) { - bool ready = (this->m->state == st_token_ready); - unread_char = this->m->unread_char; - ch = this->m->char_to_unread; + bool ready = (this->state == st_token_ready); + unread_char = !this->in_token && !this->before_token; + ch = this->char_to_unread; if (ready) { - if (this->m->type == tt_bad) { - this->m->val = this->m->raw_val; - } - token = Token( - this->m->type, - this->m->val, - this->m->raw_val, - this->m->error_message); - this->m->reset(); + token = (this->type == tt_bad) + ? Token( + this->type, this->raw_val, this->raw_val, this->error_message) + : Token(this->type, this->val, this->raw_val, this->error_message); + + this->reset(); } return ready; } @@ -630,11 +964,7 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) bool QPDFTokenizer::betweenTokens() { - return ( - (this->m->state == st_top) || - ((!this->m->include_ignorable) && - ((this->m->state == st_in_comment) || - (this->m->state == st_in_space)))); + return this->before_token; } QPDFTokenizer::Token @@ -644,49 +974,46 @@ QPDFTokenizer::readToken( bool allow_bad, size_t max_len) { - qpdf_offset_t offset = input->tell(); - Token token; - bool unread_char; - char char_to_unread; - bool presented_eof = false; - while (!getToken(token, unread_char, char_to_unread)) { + qpdf_offset_t offset = input->fastTell(); + + while (this->state != st_token_ready) { char ch; - if (input->read(&ch, 1) == 0) { - if (!presented_eof) { - presentEOF(); - presented_eof = true; - if ((this->m->type == tt_eof) && (!this->m->allow_eof)) { - // Nothing in the qpdf library calls readToken - // without allowEOF anymore, so this case is not - // exercised. - this->m->type = tt_bad; - this->m->error_message = "unexpected EOF"; - offset = input->getLastOffset(); - } - } else { - throw std::logic_error( - "getToken returned false after presenting EOF"); + if (!input->fastRead(ch)) { + presentEOF(); + + if ((this->type == tt_eof) && (!this->allow_eof)) { + // Nothing in the qpdf library calls readToken + // without allowEOF anymore, so this case is not + // exercised. + this->type = tt_bad; + this->error_message = "unexpected EOF"; + offset = input->getLastOffset(); } } else { - presentCharacter(ch); - if (betweenTokens() && (input->getLastOffset() == offset)) { + handleCharacter(ch); + if (this->before_token) { ++offset; } - if (max_len && (this->m->raw_val.length() >= max_len) && - (this->m->state != st_token_ready)) { + if (this->in_token) { + this->raw_val += ch; + } + if (max_len && (this->raw_val.length() >= max_len) && + (this->state != st_token_ready)) { // terminate this token now QTC::TC("qpdf", "QPDFTokenizer block long token"); - this->m->type = tt_bad; - this->m->state = st_token_ready; - this->m->error_message = + this->type = tt_bad; + this->state = st_token_ready; + this->error_message = "exceeded allowable length while reading token"; } } } - if (unread_char) { - input->unreadCh(char_to_unread); - } + Token token; + bool unread_char; + char char_to_unread; + getToken(token, unread_char, char_to_unread); + input->fastUnread(unread_char); if (token.getType() != tt_eof) { input->setLastOffset(offset); diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index 4e58aaf7..d565ece0 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1207,52 +1207,6 @@ QUtil::random() return result; } -bool -QUtil::is_hex_digit(char ch) -{ - return (ch && (strchr("0123456789abcdefABCDEF", ch) != nullptr)); -} - -bool -QUtil::is_space(char ch) -{ - return (ch && (strchr(" \f\n\r\t\v", ch) != nullptr)); -} - -bool -QUtil::is_digit(char ch) -{ - return ((ch >= '0') && (ch <= '9')); -} - -bool -QUtil::is_number(char const* p) -{ - // ^[\+\-]?(\.\d*|\d+(\.\d*)?)$ - if (!*p) { - return false; - } - if ((*p == '-') || (*p == '+')) { - ++p; - } - bool found_dot = false; - bool found_digit = false; - for (; *p; ++p) { - if (*p == '.') { - if (found_dot) { - // only one dot - return false; - } - found_dot = true; - } else if (QUtil::is_digit(*p)) { - found_digit = true; - } else { - return false; - } - } - return found_digit; -} - void QUtil::read_file_into_memory( char const* filename, std::shared_ptr<char>& file_buf, size_t& size) |