From 296b679d6e3217cc112b7ed19b363b82356615ef Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sun, 6 Aug 2017 08:42:01 -0400 Subject: Implement findFirst and findLast in InputSource Preparing to refactor some pattern searching code to use these instead of their own memchr loops. This should simplify the code that replaces PCRE. --- include/qpdf/BufferInputSource.hh | 10 ++ include/qpdf/FileInputSource.hh | 11 ++ include/qpdf/InputSource.hh | 35 ++++++ include/qpdf/QPDF.hh | 21 ++++ libqpdf/InputSource.cc | 167 +++++++++++++++++++++++++++ libqpdf/QPDF_linearization.cc | 2 +- libtests/build.mk | 1 + libtests/input_source.cc | 108 +++++++++++++++++ libtests/libtests.testcov | 8 ++ libtests/qtest/input_source.test | 26 +++++ libtests/qtest/input_source/input_source.out | 14 +++ 11 files changed, 402 insertions(+), 1 deletion(-) create mode 100644 libtests/input_source.cc create mode 100644 libtests/qtest/input_source.test create mode 100644 libtests/qtest/input_source/input_source.out diff --git a/include/qpdf/BufferInputSource.hh b/include/qpdf/BufferInputSource.hh index 64ee4605..db055783 100644 --- a/include/qpdf/BufferInputSource.hh +++ b/include/qpdf/BufferInputSource.hh @@ -15,17 +15,27 @@ class BufferInputSource: public InputSource { public: + QPDF_DLL BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false); + QPDF_DLL BufferInputSource(std::string const& description, std::string const& contents); + QPDF_DLL virtual ~BufferInputSource(); + QPDF_DLL virtual qpdf_offset_t findAndSkipNextEOL(); + QPDF_DLL virtual std::string const& getName() const; + QPDF_DLL virtual qpdf_offset_t tell(); + QPDF_DLL virtual void seek(qpdf_offset_t offset, int whence); + QPDF_DLL virtual void rewind(); + QPDF_DLL virtual size_t read(char* buffer, size_t length); + QPDF_DLL virtual void unreadCh(char ch); private: diff --git a/include/qpdf/FileInputSource.hh b/include/qpdf/FileInputSource.hh index 64457365..3f0c05a9 100644 --- 
a/include/qpdf/FileInputSource.hh +++ b/include/qpdf/FileInputSource.hh @@ -14,16 +14,27 @@ class FileInputSource: public InputSource { public: + QPDF_DLL FileInputSource(); + QPDF_DLL void setFilename(char const* filename); + QPDF_DLL void setFile(char const* description, FILE* filep, bool close_file); + QPDF_DLL virtual ~FileInputSource(); + QPDF_DLL virtual qpdf_offset_t findAndSkipNextEOL(); + QPDF_DLL virtual std::string const& getName() const; + QPDF_DLL virtual qpdf_offset_t tell(); + QPDF_DLL virtual void seek(qpdf_offset_t offset, int whence); + QPDF_DLL virtual void rewind(); + QPDF_DLL virtual size_t read(char* buffer, size_t length); + QPDF_DLL virtual void unreadCh(char ch); private: diff --git a/include/qpdf/InputSource.hh b/include/qpdf/InputSource.hh index a731918e..c20e5076 100644 --- a/include/qpdf/InputSource.hh +++ b/include/qpdf/InputSource.hh @@ -9,6 +9,7 @@ #ifndef __QPDF_INPUTSOURCE_HH__ #define __QPDF_INPUTSOURCE_HH__ +#include #include #include #include @@ -16,18 +17,52 @@ class InputSource { public: + QPDF_DLL InputSource() : last_offset(0) { } + QPDF_DLL virtual ~InputSource() { } + class Finder + { + public: + Finder() + { + } + virtual ~Finder() + { + } + + virtual bool check() = 0; + }; + + QPDF_DLL void setLastOffset(qpdf_offset_t); + QPDF_DLL qpdf_offset_t getLastOffset() const; + QPDF_DLL std::string readLine(size_t max_line_length); + // Find first or last occurrence of a sequence of characters + // starting within the range defined by offset and len such that, + // when the input source is positioned at the beginning of that + // sequence, finder.check() returns true. If len is 0, the search + // proceeds until EOF. If a qualifying pattern is found, these methods + // return true and leave the input source positioned wherever + // check() left it at the end of the matching pattern. 
+ QPDF_DLL + bool findFirst(char const* start_chars, + qpdf_offset_t offset, size_t len, + Finder& finder); + QPDF_DLL + bool findLast(char const* start_chars, + qpdf_offset_t offset, size_t len, + Finder& finder); + virtual qpdf_offset_t findAndSkipNextEOL() = 0; virtual std::string const& getName() const = 0; virtual qpdf_offset_t tell() = 0; diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index f57789a0..c9d120b4 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -1006,6 +1006,27 @@ class QPDF std::string key; // if ou_trailer_key or ou_root_key }; + class PatternFinder: public InputSource::Finder + { + public: + PatternFinder(QPDF& qpdf, bool (QPDF::*checker)()) : + qpdf(qpdf), + checker(checker) + { + } + virtual ~PatternFinder() + { + } + virtual bool check() + { + return (this->qpdf.*checker)(); + } + + private: + QPDF& qpdf; + bool (QPDF::*checker)(); + }; + // methods to support linearization checking -- implemented in // QPDF_linearization.cc void readLinearizationData(); diff --git a/libqpdf/InputSource.cc b/libqpdf/InputSource.cc index 79c889bf..69cafeb8 100644 --- a/libqpdf/InputSource.cc +++ b/libqpdf/InputSource.cc @@ -1,7 +1,10 @@ #include #include +#include +#include #include + void InputSource::setLastOffset(qpdf_offset_t offset) { @@ -39,3 +42,167 @@ InputSource::readLine(size_t max_line_length) } return std::string(buf); } + +bool +InputSource::findFirst(char const* start_chars, + qpdf_offset_t offset, size_t len, + Finder& finder) +{ + // Basic approach: search for the first character of start_chars + // starting from offset but not going past len (if len != 0). Once + // the first character is found, see if it is the beginning of a + // sequence of characters matching start_chars. If so, call + // finder.check() to do caller-specific additional checks. If not, + // keep searching. + + // This code is tricky and highly subject to off-by-one or other + // edge case logic errors. 
See comments throughout that explain + // how we're not missing any edge cases. There are also tests + // specifically constructed to make sure we caught the edge cases + // in testing. + + char buf[1025]; // size known to input_source.cc in libtests + // To enable us to guarantee null-termination, save an extra byte + // so that buf[size] is valid memory. + size_t size = sizeof(buf) - 1; + if ((strlen(start_chars) < 1) || (strlen(start_chars) > size)) + { + throw std::logic_error( + "InputSource::findSource called with" + " too small or too large of a character sequence"); + } + + char* p = 0; + qpdf_offset_t buf_offset = offset; + size_t bytes_read = 0; + + // Guarantee that we return from this loop. Each time through, we + // either return, advance p, or restart the loop with a condition + // that will cause return on the next pass. Eventually we will + // either be out of range or hit EOF, either of which forces us to + // return. + while (true) + { + // Do we need to read more data? Pretend size = 5, buf starts + // at 0, and start_chars has 3 characters. buf[5] is valid and + // null. If p == 2, start_chars could be buf[2] through + // buf[4], so p + strlen(start_chars) == buf + size is okay. + // If p points to buf[size], since strlen(start_chars) is + // always >= 1, this overflow test will be correct for that + // case regardless of start_chars. + if ((p == 0) || ((p + strlen(start_chars)) > (buf + bytes_read))) + { + if (p) + { + QTC::TC("libtests", "InputSource read next block", + ((p == buf + bytes_read) ? 0 : 1)); + buf_offset += (p - buf); + } + this->seek(buf_offset, SEEK_SET); + // Read into buffer and zero out the rest of the buffer + // including buf[size]. We allocated an extra byte so that + // we could guarantee null termination as an extra + // protection against overrun when using string functions. 
+ bytes_read = this->read(buf, size); + if (bytes_read < strlen(start_chars)) + { + QTC::TC("libtests", "InputSource find EOF", + bytes_read == 0 ? 0 : 1); + return false; + } + memset(buf + bytes_read, '\0', 1 + (size - bytes_read)); + p = buf; + } + + // Search for the first character. + if ((p = static_cast<char*>( + memchr(p, start_chars[0], bytes_read - (p - buf)))) != 0) + { + if (p == buf) + { + QTC::TC("libtests", "InputSource found match at buf[0]"); + } + // Found first letter. + if (len != 0) + { + // Make sure it's in range. + size_t p_relative_offset = (p - buf) + (buf_offset - offset); + if (p_relative_offset >= len) + { + // out of range + QTC::TC("libtests", "InputSource out of range"); + return false; + } + } + if ((p + strlen(start_chars)) > (buf + bytes_read)) + { + // If there are not enough bytes left in the file for + // start_chars, we will detect this on the next pass + // as EOF and return. + QTC::TC("libtests", "InputSource not enough bytes"); + continue; + } + + // See if p points to a sequence matching start_chars. We + // already checked above to make sure we are not going to + // overrun memory. + if (strncmp(p, start_chars, strlen(start_chars)) == 0) + { + // Call finder.check() with the input source + // positioned to the point of the match. + this->seek(buf_offset + (p - buf), SEEK_SET); + if (finder.check()) + { + return true; + } + else + { + QTC::TC("libtests", "InputSource start_chars matched but not check"); + } + } + else + { + QTC::TC("libtests", "InputSource first char matched but not string"); + } + // This occurrence of the first character wasn't a match. + // Skip over it and keep searching. 
+ ++p; + } + else + { + // Trigger reading the next block + p = buf + bytes_read; + } + } + throw std::logic_error("InputSource after while (true)"); +} + +bool +InputSource::findLast(char const* start_chars, + qpdf_offset_t offset, size_t len, + Finder& finder) +{ + bool found = false; + qpdf_offset_t after_found_offset = 0; + qpdf_offset_t cur_offset = offset; + size_t cur_len = len; + while (this->findFirst(start_chars, cur_offset, cur_len, finder)) + { + if (found) + { + QTC::TC("libtests", "InputSource findLast found more than one"); + } + else + { + found = true; + } + after_found_offset = this->tell(); + cur_offset = after_found_offset; + cur_len = len - (cur_offset - offset); + } + if (found) + { + this->seek(after_found_offset, SEEK_SET); + } + return found; +} diff --git a/libqpdf/QPDF_linearization.cc b/libqpdf/QPDF_linearization.cc index 4e71f8d2..86a61081 100644 --- a/libqpdf/QPDF_linearization.cc +++ b/libqpdf/QPDF_linearization.cc @@ -118,7 +118,7 @@ QPDF::isLinearized() } else { - p = reinterpret_cast(memchr(p, '\0', tbuf_size - (p - buf))); + p = static_cast(memchr(p, '\0', tbuf_size - (p - buf))); assert(p != 0); while ((p - buf < tbuf_size) && (*p == 0)) { diff --git a/libtests/build.mk b/libtests/build.mk index 22d9299e..2a272799 100644 --- a/libtests/build.mk +++ b/libtests/build.mk @@ -6,6 +6,7 @@ BINS_libtests = \ concatenate \ flate \ hex \ + input_source \ lzw \ md5 \ pcre \ diff --git a/libtests/input_source.cc b/libtests/input_source.cc new file mode 100644 index 00000000..091a1ea1 --- /dev/null +++ b/libtests/input_source.cc @@ -0,0 +1,108 @@ +#include +#include +#include +#include +#include + +static PointerHolder +get_buffer() +{ + size_t size = 3172; + PointerHolder b(new Buffer(size)); + unsigned char* p = b->getBuffer(); + for (size_t i = 0; i < size; ++i) + { + p[i] = static_cast(i & 0xff); + } + return b; +} + +class Finder: public InputSource::Finder +{ + public: + Finder(PointerHolder is, std::string const& after) : + is(is), + 
after(after) + { + } + virtual ~Finder() + { + } + virtual bool check(); + + private: + PointerHolder is; + std::string after; +}; + +bool +Finder::check() +{ + QPDFTokenizer tokenizer; + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); + if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "potato")) + { + t = tokenizer.readToken(is, "finder", true); + return (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, after)); + } + return false; +} + +void check(char const* description, bool expected, bool actual) +{ + std::cout << description << ": " + << ((actual == expected) ? "PASS" : "FAIL") + << std::endl; +} + +int main() +{ + PointerHolder b1 = get_buffer(); + unsigned char* b = b1->getBuffer(); + // Straddle block boundaries + memcpy(b + 1022, "potato", 6); + // Overlap so that the first check() would advance past the start + // of the next match + memcpy(b + 2037, "potato potato salad ", 20); + PointerHolder is = + new BufferInputSource("test buffer input source", b1.getPointer()); + Finder f1(is, "salad"); + check("find potato salad", true, + is->findFirst("potato", 0, 0, f1)); + check("barely find potato salad", true, + is->findFirst("potato", 1100, 945, f1)); + check("barely find potato salad", true, + is->findFirst("potato", 2000, 45, f1)); + check("potato salad is too late", false, + is->findFirst("potato", 1100, 944, f1)); + check("potato salad is too late", false, + is->findFirst("potato", 2000, 44, f1)); + check("potato salad not found", false, + is->findFirst("potato", 2045, 0, f1)); + check("potato salad not found", false, + is->findFirst("potato", 0, 1, f1)); + + // Put one more right at EOF + memcpy(b + b1->getSize() - 12, "potato salad", 12); + check("potato salad at EOF", true, + is->findFirst("potato", 3000, 0, f1)); + + is->findFirst("potato", 0, 0, f1); + check("findFirst found first", true, + is->tell() == 2056); + check("findLast found potato salad", true, + is->findLast("potato", 0, 0, f1)); + check("findLast found at EOF", 
true, + is->tell() == 3172); + + // Make check() bump into EOF + memcpy(b + b1->getSize() - 6, "potato", 6); + check("potato but not salad salad at EOF", false, + is->findFirst("potato", 3000, 0, f1)); + check("findLast found potato salad", true, + is->findLast("potato", 0, 0, f1)); + check("findLast found first one", true, + is->tell() == 2056); + + return 0; +} diff --git a/libtests/libtests.testcov b/libtests/libtests.testcov index ddbccd24..a5fe625f 100644 --- a/libtests/libtests.testcov +++ b/libtests/libtests.testcov @@ -16,3 +16,11 @@ bits write zero bits 0 Pl_ASCIIHexDecoder ignore space 0 Pl_ASCIIHexDecoder no-op flush 0 Pl_ASCIIHexDecoder partial flush 1 +InputSource read next block 1 +InputSource find EOF 1 +InputSource out of range 0 +InputSource first char matched but not string 0 +InputSource start_chars matched but not check 0 +InputSource not enough bytes 0 +InputSource findLast found more than one 0 +InputSource found match at buf[0] 0 diff --git a/libtests/qtest/input_source.test b/libtests/qtest/input_source.test new file mode 100644 index 00000000..89a1c21a --- /dev/null +++ b/libtests/qtest/input_source.test @@ -0,0 +1,26 @@ +#!/usr/bin/env perl +require 5.008; +use warnings; +use strict; + +chdir("input_source") or die "chdir testdir failed: $!\n"; + +require TestDriver; + +my $td = new TestDriver('InputSource'); + +cleanup(); + +$td->runtest("input source tests", + {$td->COMMAND => "input_source"}, + {$td->FILE => "input_source.out", + $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + +cleanup(); + +$td->report(1); + +sub cleanup +{ +} diff --git a/libtests/qtest/input_source/input_source.out b/libtests/qtest/input_source/input_source.out new file mode 100644 index 00000000..6af379c6 --- /dev/null +++ b/libtests/qtest/input_source/input_source.out @@ -0,0 +1,14 @@ +find potato salad: PASS +barely find potato salad: PASS +barely find potato salad: PASS +potato salad is too late: PASS +potato salad is too late: PASS +potato salad not found: 
PASS +potato salad not found: PASS +potato salad at EOF: PASS +findFirst found first: PASS +findLast found potato salad: PASS +findLast found at EOF: PASS +potato but not salad salad at EOF: PASS +findLast found potato salad: PASS +findLast found first one: PASS -- cgit v1.2.3-54-g00ecf