summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2017-08-06 14:42:01 +0200
committerJay Berkenbilt <ejb@ql.org>2017-08-11 03:30:32 +0200
commit296b679d6e3217cc112b7ed19b363b82356615ef (patch)
tree7cb9b3aa95c00da45cf1a7cf67a020c1ee54a6c9
parentef8ae5449dc30782451beba64fdd0af86e1cb931 (diff)
downloadqpdf-296b679d6e3217cc112b7ed19b363b82356615ef.tar.zst
Implement findFirst and findLast in InputSource
Preparing to refactor some pattern searching code to use these instead of their own memchr loops. This should simplify the code that replaces PCRE.
-rw-r--r--include/qpdf/BufferInputSource.hh10
-rw-r--r--include/qpdf/FileInputSource.hh11
-rw-r--r--include/qpdf/InputSource.hh35
-rw-r--r--include/qpdf/QPDF.hh21
-rw-r--r--libqpdf/InputSource.cc167
-rw-r--r--libqpdf/QPDF_linearization.cc2
-rw-r--r--libtests/build.mk1
-rw-r--r--libtests/input_source.cc108
-rw-r--r--libtests/libtests.testcov8
-rw-r--r--libtests/qtest/input_source.test26
-rw-r--r--libtests/qtest/input_source/input_source.out14
11 files changed, 402 insertions, 1 deletions
diff --git a/include/qpdf/BufferInputSource.hh b/include/qpdf/BufferInputSource.hh
index 64ee4605..db055783 100644
--- a/include/qpdf/BufferInputSource.hh
+++ b/include/qpdf/BufferInputSource.hh
@@ -15,17 +15,27 @@
class BufferInputSource: public InputSource
{
public:
+ QPDF_DLL
BufferInputSource(std::string const& description, Buffer* buf,
bool own_memory = false);
+ QPDF_DLL
BufferInputSource(std::string const& description,
std::string const& contents);
+ QPDF_DLL
virtual ~BufferInputSource();
+ QPDF_DLL
virtual qpdf_offset_t findAndSkipNextEOL();
+ QPDF_DLL
virtual std::string const& getName() const;
+ QPDF_DLL
virtual qpdf_offset_t tell();
+ QPDF_DLL
virtual void seek(qpdf_offset_t offset, int whence);
+ QPDF_DLL
virtual void rewind();
+ QPDF_DLL
virtual size_t read(char* buffer, size_t length);
+ QPDF_DLL
virtual void unreadCh(char ch);
private:
diff --git a/include/qpdf/FileInputSource.hh b/include/qpdf/FileInputSource.hh
index 64457365..3f0c05a9 100644
--- a/include/qpdf/FileInputSource.hh
+++ b/include/qpdf/FileInputSource.hh
@@ -14,16 +14,27 @@
class FileInputSource: public InputSource
{
public:
+ QPDF_DLL
FileInputSource();
+ QPDF_DLL
void setFilename(char const* filename);
+ QPDF_DLL
void setFile(char const* description, FILE* filep, bool close_file);
+ QPDF_DLL
virtual ~FileInputSource();
+ QPDF_DLL
virtual qpdf_offset_t findAndSkipNextEOL();
+ QPDF_DLL
virtual std::string const& getName() const;
+ QPDF_DLL
virtual qpdf_offset_t tell();
+ QPDF_DLL
virtual void seek(qpdf_offset_t offset, int whence);
+ QPDF_DLL
virtual void rewind();
+ QPDF_DLL
virtual size_t read(char* buffer, size_t length);
+ QPDF_DLL
virtual void unreadCh(char ch);
private:
diff --git a/include/qpdf/InputSource.hh b/include/qpdf/InputSource.hh
index a731918e..c20e5076 100644
--- a/include/qpdf/InputSource.hh
+++ b/include/qpdf/InputSource.hh
@@ -9,6 +9,7 @@
#ifndef __QPDF_INPUTSOURCE_HH__
#define __QPDF_INPUTSOURCE_HH__
+#include <qpdf/DLL.h>
#include <qpdf/Types.h>
#include <stdio.h>
#include <string>
@@ -16,18 +17,52 @@
class InputSource
{
public:
+ QPDF_DLL
InputSource() :
last_offset(0)
{
}
+ QPDF_DLL
virtual ~InputSource()
{
}
+ class Finder // Callback interface passed by reference to findFirst() and findLast() to accept or reject a candidate match.
+ {
+ public:
+ Finder()
+ {
+ }
+ virtual ~Finder() // virtual because this class is meant to be subclassed (check() is pure virtual)
+ {
+ }
+
+ virtual bool check() = 0; // invoked with the input source positioned at the start of a start_chars match; return true to accept it
+ };
+
+ QPDF_DLL
void setLastOffset(qpdf_offset_t);
+ QPDF_DLL
qpdf_offset_t getLastOffset() const;
+ QPDF_DLL
std::string readLine(size_t max_line_length);
+ // Find first or last occurrence of a sequence of characters
+ // starting within the range defined by offset and len such that,
+ // when the input source is positioned at the beginning of that
+ // sequence, finder.check() returns true. If len is 0, the search
+ // proceeds until EOF. If a qualifying pattern is found, these
+ // methods return true and leave the input source positioned
+ // wherever check() left it at the end of the matching pattern.
+ QPDF_DLL
+ bool findFirst(char const* start_chars,
+ qpdf_offset_t offset, size_t len,
+ Finder& finder);
+ QPDF_DLL
+ bool findLast(char const* start_chars,
+ qpdf_offset_t offset, size_t len,
+ Finder& finder);
+
virtual qpdf_offset_t findAndSkipNextEOL() = 0;
virtual std::string const& getName() const = 0;
virtual qpdf_offset_t tell() = 0;
diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh
index f57789a0..c9d120b4 100644
--- a/include/qpdf/QPDF.hh
+++ b/include/qpdf/QPDF.hh
@@ -1006,6 +1006,27 @@ class QPDF
std::string key; // if ou_trailer_key or ou_root_key
};
+ class PatternFinder: public InputSource::Finder // Adapts a QPDF member function to the InputSource::Finder callback interface.
+ {
+ public:
+ PatternFinder(QPDF& qpdf, bool (QPDF::*checker)()) : // checker: member function of QPDF that check() will invoke on qpdf
+ qpdf(qpdf),
+ checker(checker)
+ {
+ }
+ virtual ~PatternFinder()
+ {
+ }
+ virtual bool check() // forwards to the stored member function and returns its result unchanged
+ {
+ return (this->qpdf.*checker)();
+ }
+
+ private:
+ QPDF& qpdf; // held by reference, so this finder must not outlive the QPDF object
+ bool (QPDF::*checker)(); // pointer to the member function that performs the actual check
+ };
+
// methods to support linearization checking -- implemented in
// QPDF_linearization.cc
void readLinearizationData();
diff --git a/libqpdf/InputSource.cc b/libqpdf/InputSource.cc
index 79c889bf..69cafeb8 100644
--- a/libqpdf/InputSource.cc
+++ b/libqpdf/InputSource.cc
@@ -1,7 +1,10 @@
#include <qpdf/InputSource.hh>
#include <string.h>
+#include <stdexcept>
+#include <qpdf/QTC.hh>
#include <qpdf/PointerHolder.hh>
+
void
InputSource::setLastOffset(qpdf_offset_t offset)
{
@@ -39,3 +42,167 @@ InputSource::readLine(size_t max_line_length)
}
return std::string(buf);
}
+
+bool
+InputSource::findFirst(char const* start_chars,
+ qpdf_offset_t offset, size_t len,
+ Finder& finder)
+{
+ // Returns true, leaving the input source where finder.check()
+ // left it, for the first start_chars match beginning in
+ // [offset, offset + len) (to EOF if len == 0) that check()
+ // accepts; false if none. Approach: memchr for the first
+ // character, strncmp the rest, then call finder.check() for
+ // caller-specific checks; on any failure keep searching.
+
+ // This code is tricky and highly subject to off-by-one or other
+ // edge case logic errors. See comments throughout that explain
+ // how we're not missing any edge cases. There are also tests
+ // specifically constructed to make sure we caught the edge cases
+ // in testing.
+
+ char buf[1025]; // size known to input_source.cc in libtests
+ // To enable us to guarantee null-termination, save an extra byte
+ // so that buf[size] is valid memory.
+ size_t size = sizeof(buf) - 1;
+ if ((strlen(start_chars) < 1) || (strlen(start_chars) > size))
+ {
+ throw std::logic_error(
+ "InputSource::findFirst called with"
+ " too small or too large of a character sequence");
+ }
+
+ char* p = 0;
+ qpdf_offset_t buf_offset = offset;
+ size_t bytes_read = 0;
+
+ // Guarantee that we return from this loop. Each time through, we
+ // either return, advance p, or restart the loop with a condition
+ // that will cause return on the next pass. Eventually we will
+ // either be out of range or hit EOF, either of which forces us to
+ // return.
+ while (true)
+ {
+ // Do we need to read more data? Pretend bytes_read = 5, buf
+ // starts at 0, and start_chars has 3 characters. buf[5] is
+ // valid and null. If p == 2, start_chars could be buf[2]
+ // through buf[4], so p + strlen(start_chars) == buf +
+ // bytes_read is okay. If p points to buf[bytes_read], since
+ // strlen(start_chars) is always >= 1, this overflow test will
+ // be correct for that case regardless of start_chars.
+ if ((p == 0) || ((p + strlen(start_chars)) > (buf + bytes_read)))
+ {
+ if (p)
+ {
+ QTC::TC("libtests", "InputSource read next block",
+ ((p == buf + bytes_read) ? 0 : 1));
+ buf_offset += (p - buf);
+ }
+ this->seek(buf_offset, SEEK_SET);
+ // Read into buffer and zero out the rest of the buffer
+ // including buf[size]. We allocated an extra byte so that
+ // we could guarantee null termination as an extra
+ // protection against overrun when using string functions.
+ bytes_read = this->read(buf, size);
+ if (bytes_read < strlen(start_chars))
+ {
+ QTC::TC("libtests", "InputSource find EOF",
+ bytes_read == 0 ? 0 : 1);
+ return false;
+ }
+ memset(buf + bytes_read, '\0', 1 + (size - bytes_read));
+ p = buf;
+ }
+
+ // Search for the first character.
+ if ((p = static_cast<char*>(
+ memchr(p, start_chars[0], bytes_read - (p - buf)))) != 0)
+ {
+ if (p == buf)
+ {
+ QTC::TC("libtests", "InputSource found match at buf[0]");
+ }
+ // Found first letter.
+ if (len != 0)
+ {
+ // Make sure it's in range.
+ size_t p_relative_offset = (p - buf) + (buf_offset - offset);
+ if (p_relative_offset >= len)
+ {
+ // out of range
+ QTC::TC("libtests", "InputSource out of range");
+ return false;
+ }
+ }
+ if ((p + strlen(start_chars)) > (buf + bytes_read))
+ {
+ // If there are not enough bytes left in the file for
+ // start_chars, we will detect this on the next pass
+ // as EOF and return.
+ QTC::TC("libtests", "InputSource not enough bytes");
+ continue;
+ }
+
+ // See if p points to a sequence matching start_chars. We
+ // already checked above to make sure we are not going to
+ // overrun memory.
+ if (strncmp(p, start_chars, strlen(start_chars)) == 0)
+ {
+ // Call finder.check() with the input source
+ // positioned to the point of the match.
+ this->seek(buf_offset + (p - buf), SEEK_SET);
+ if (finder.check())
+ {
+ return true;
+ }
+ else
+ {
+ QTC::TC("libtests", "InputSource start_chars matched but not check");
+ }
+ }
+ else
+ {
+ QTC::TC("libtests", "InputSource first char matched but not string");
+ }
+ // This occurrence of the first character wasn't a match.
+ // Skip over it and keep searching.
+ ++p;
+ }
+ else
+ {
+ // Trigger reading the next block
+ p = buf + bytes_read;
+ }
+ }
+ throw std::logic_error("InputSource after while (true)");
+}
+
+bool
+InputSource::findLast(char const* start_chars,
+ qpdf_offset_t offset, size_t len,
+ Finder& finder)
+{
+ // Find the last accepted match that starts within the range by
+ // calling findFirst() repeatedly, resuming each search where the
+ // previous match's check() left the input source. Returns true
+ // if at least one match was found, in which case the input
+ // source is left where check() left it after the last match.
+ bool found = false;
+ qpdf_offset_t after_found_offset = 0;
+ qpdf_offset_t cur_offset = offset;
+ size_t cur_len = len;
+ while (this->findFirst(start_chars, cur_offset, cur_len, finder))
+ {
+ if (found)
+ {
+ QTC::TC("libtests", "InputSource findLast found more than one");
+ }
+ else
+ {
+ found = true;
+ }
+ after_found_offset = this->tell();
+ cur_offset = after_found_offset;
+ if (len != 0)
+ {
+ // Bounded search: shrink the remaining length by what has
+ // been consumed so far. If check() moved to or past the end
+ // of the range, stop here. A remaining length of exactly 0
+ // would be read by findFirst() as "search until EOF", and a
+ // larger overshoot would wrap around to a huge length;
+ // either would let a match outside the range be returned.
+ qpdf_offset_t consumed = cur_offset - offset;
+ if ((consumed > 0) && (static_cast<size_t>(consumed) >= len))
+ {
+ break;
+ }
+ cur_len = len - consumed;
+ }
+ // When len == 0 the search is unbounded, so cur_len stays 0.
+ }
+ if (found)
+ {
+ // The final, unsuccessful findFirst() moved the input source;
+ // restore the position recorded after the last match.
+ this->seek(after_found_offset, SEEK_SET);
+ }
+ return found;
+}
diff --git a/libqpdf/QPDF_linearization.cc b/libqpdf/QPDF_linearization.cc
index 4e71f8d2..86a61081 100644
--- a/libqpdf/QPDF_linearization.cc
+++ b/libqpdf/QPDF_linearization.cc
@@ -118,7 +118,7 @@ QPDF::isLinearized()
}
else
{
- p = reinterpret_cast<char*>(memchr(p, '\0', tbuf_size - (p - buf)));
+ p = static_cast<char*>(memchr(p, '\0', tbuf_size - (p - buf)));
assert(p != 0);
while ((p - buf < tbuf_size) && (*p == 0))
{
diff --git a/libtests/build.mk b/libtests/build.mk
index 22d9299e..2a272799 100644
--- a/libtests/build.mk
+++ b/libtests/build.mk
@@ -6,6 +6,7 @@ BINS_libtests = \
concatenate \
flate \
hex \
+ input_source \
lzw \
md5 \
pcre \
diff --git a/libtests/input_source.cc b/libtests/input_source.cc
new file mode 100644
index 00000000..091a1ea1
--- /dev/null
+++ b/libtests/input_source.cc
@@ -0,0 +1,108 @@
+#include <iostream>
+#include <qpdf/BufferInputSource.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/Buffer.hh>
+#include <qpdf/QPDFTokenizer.hh>
+
+static PointerHolder<Buffer>
+get_buffer() // Build the 3172-byte test buffer in which byte i holds (i & 0xff).
+{
+ size_t size = 3172;
+ PointerHolder<Buffer> b(new Buffer(size));
+ unsigned char* p = b->getBuffer();
+ for (size_t i = 0; i < size; ++i)
+ {
+ p[i] = static_cast<unsigned char>(i & 0xff); // repeating 0..255 pattern, so every byte value (including 'p') recurs every 256 bytes
+ }
+ return b;
+}
+
+class Finder: public InputSource::Finder // Test finder: accepts a match only when it is the word "potato" followed by the word `after`.
+{
+ public:
+ Finder(PointerHolder<InputSource> is, std::string const& after) :
+ is(is),
+ after(after)
+ {
+ }
+ virtual ~Finder()
+ {
+ }
+ virtual bool check(); // defined out of line below
+
+ private:
+ PointerHolder<InputSource> is; // input source that check() reads tokens from
+ std::string after; // word that must follow "potato" for check() to succeed
+};
+
+bool
+Finder::check() // Read up to two tokens from `is`; true only for the word "potato" followed by the word `after`.
+{
+ QPDFTokenizer tokenizer;
+ QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
+ if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "potato"))
+ {
+ t = tokenizer.readToken(is, "finder", true); // the second token is read only after "potato" matched
+ return (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, after));
+ }
+ return false; // first token was not the word "potato"
+}
+
+void check(char const* description, bool expected, bool actual) // Print "<description>: PASS" when actual == expected, otherwise "<description>: FAIL".
+{
+ std::cout << description << ": "
+ << ((actual == expected) ? "PASS" : "FAIL")
+ << std::endl;
+}
+
+int main()
+{
+ PointerHolder<Buffer> b1 = get_buffer();
+ unsigned char* b = b1->getBuffer();
+ // Straddle block boundaries
+ memcpy(b + 1022, "potato", 6); // bytes 1022-1027; findFirst reads in 1024-byte blocks
+ // Overlap so that the first check() would advance past the start
+ // of the next match
+ memcpy(b + 2037, "potato potato salad ", 20); // "potato" at 2037 and 2044; only the one at 2044 is followed by "salad"
+ PointerHolder<InputSource> is =
+ new BufferInputSource("test buffer input source", b1.getPointer()); // own_memory defaults to false, so b1 keeps ownership of the buffer
+ Finder f1(is, "salad");
+ check("find potato salad", true,
+ is->findFirst("potato", 0, 0, f1)); // len 0: search to EOF
+ check("barely find potato salad", true,
+ is->findFirst("potato", 1100, 945, f1)); // range covers start offsets 1100..2044; 2044 is the last one in range
+ check("barely find potato salad", true,
+ is->findFirst("potato", 2000, 45, f1)); // 2044 - 2000 == 44, which is < 45
+ check("potato salad is too late", false,
+ is->findFirst("potato", 1100, 944, f1)); // 2044 - 1100 == 944, which is not < 944
+ check("potato salad is too late", false,
+ is->findFirst("potato", 2000, 44, f1)); // 2044 - 2000 == 44, which is not < 44
+ check("potato salad not found", false,
+ is->findFirst("potato", 2045, 0, f1)); // starts just past the accepted match at 2044
+ check("potato salad not found", false,
+ is->findFirst("potato", 0, 1, f1)); // only a match starting at offset 0 would be in range
+
+ // Put one more right at EOF
+ memcpy(b + b1->getSize() - 12, "potato salad", 12);
+ check("potato salad at EOF", true,
+ is->findFirst("potato", 3000, 0, f1));
+
+ is->findFirst("potato", 0, 0, f1);
+ check("findFirst found first", true,
+ is->tell() == 2056); // 2044 + 12, the length of "potato salad"
+ check("findLast found potato salad", true,
+ is->findLast("potato", 0, 0, f1));
+ check("findLast found at EOF", true,
+ is->tell() == 3172); // 3172 is the buffer size, i.e. the end of the match placed at EOF
+
+ // Make check() bump into EOF
+ memcpy(b + b1->getSize() - 6, "potato", 6); // overwrites the final 6 bytes, so nothing follows this "potato"
+ check("potato but not salad salad at EOF", false,
+ is->findFirst("potato", 3000, 0, f1));
+ check("findLast found potato salad", true,
+ is->findLast("potato", 0, 0, f1));
+ check("findLast found first one", true,
+ is->tell() == 2056); // with the EOF match gone, the match at 2044 is the last one again
+
+ return 0;
+}
diff --git a/libtests/libtests.testcov b/libtests/libtests.testcov
index ddbccd24..a5fe625f 100644
--- a/libtests/libtests.testcov
+++ b/libtests/libtests.testcov
@@ -16,3 +16,11 @@ bits write zero bits 0
Pl_ASCIIHexDecoder ignore space 0
Pl_ASCIIHexDecoder no-op flush 0
Pl_ASCIIHexDecoder partial flush 1
+InputSource read next block 1
+InputSource find EOF 1
+InputSource out of range 0
+InputSource first char matched but not string 0
+InputSource start_chars matched but not check 0
+InputSource not enough bytes 0
+InputSource findLast found more than one 0
+InputSource found match at buf[0] 0
diff --git a/libtests/qtest/input_source.test b/libtests/qtest/input_source.test
new file mode 100644
index 00000000..89a1c21a
--- /dev/null
+++ b/libtests/qtest/input_source.test
@@ -0,0 +1,26 @@
+#!/usr/bin/env perl
+require 5.008;
+use warnings;
+use strict;
+
+chdir("input_source") or die "chdir testdir failed: $!\n"; # run from the directory that holds input_source.out
+
+require TestDriver;
+
+my $td = new TestDriver('InputSource');
+
+cleanup();
+
+$td->runtest("input source tests", # run the input_source binary and compare its output and exit status
+ {$td->COMMAND => "input_source"},
+ {$td->FILE => "input_source.out",
+ $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+
+cleanup();
+
+$td->report(1); # presumably the expected number of tests -- confirm against TestDriver
+
+sub cleanup # currently a no-op: this test creates no files to remove
+{
+}
diff --git a/libtests/qtest/input_source/input_source.out b/libtests/qtest/input_source/input_source.out
new file mode 100644
index 00000000..6af379c6
--- /dev/null
+++ b/libtests/qtest/input_source/input_source.out
@@ -0,0 +1,14 @@
+find potato salad: PASS
+barely find potato salad: PASS
+barely find potato salad: PASS
+potato salad is too late: PASS
+potato salad is too late: PASS
+potato salad not found: PASS
+potato salad not found: PASS
+potato salad at EOF: PASS
+findFirst found first: PASS
+findLast found potato salad: PASS
+findLast found at EOF: PASS
+potato but not salad salad at EOF: PASS
+findLast found potato salad: PASS
+findLast found first one: PASS