diff options
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | TODO | 9 | ||||
-rw-r--r-- | libqpdf/OffsetInputSource.cc | 61 | ||||
-rw-r--r-- | libqpdf/QPDF.cc | 20 | ||||
-rw-r--r-- | libqpdf/build.mk | 1 | ||||
-rw-r--r-- | libqpdf/qpdf/OffsetInputSource.hh | 29 | ||||
-rw-r--r-- | qpdf/qpdf.testcov | 1 | ||||
-rw-r--r-- | qpdf/qtest/qpdf.test | 6 | ||||
-rw-r--r-- | qpdf/qtest/qpdf/leading-junk.out | 17 | ||||
-rw-r--r-- | qpdf/qtest/qpdf/leading-junk.pdf | bin | 0 -> 13670 bytes |
10 files changed, 137 insertions, 13 deletions
@@ -1,3 +1,9 @@ +2012-12-25 Jay Berkenbilt <ejb@ql.org> + + * Allow PDF header to appear anywhere in the first 1024 bytes of + the file as recommended in the implementation notes of the Adobe + version of the PDF spec. + 2012-11-20 Jay Berkenbilt <ejb@ql.org> * Add zlib and libpcre to Requires.private in the pkg-config file @@ -1,12 +1,3 @@ -Next -==== - - * Find PDF header in the first 1024 bytes of the file. Treat the - location of the PDF header as offset 0 for purposes of resolving - explicit file locations as this is what other implementations - appear to do. - - General ======= diff --git a/libqpdf/OffsetInputSource.cc b/libqpdf/OffsetInputSource.cc new file mode 100644 index 00000000..c1ec4102 --- /dev/null +++ b/libqpdf/OffsetInputSource.cc @@ -0,0 +1,61 @@ +#include <qpdf/OffsetInputSource.hh> + +OffsetInputSource::OffsetInputSource(PointerHolder<InputSource> proxied, + qpdf_offset_t global_offset) : + proxied(proxied), + global_offset(global_offset) +{ +} + +OffsetInputSource::~OffsetInputSource() +{ +} + +qpdf_offset_t +OffsetInputSource::findAndSkipNextEOL() +{ + return this->proxied->findAndSkipNextEOL() - this->global_offset; +} + +std::string const& +OffsetInputSource::getName() const +{ + return this->proxied->getName(); +} + +qpdf_offset_t +OffsetInputSource::tell() +{ + return this->proxied->tell() - this->global_offset; +} + +void +OffsetInputSource::seek(qpdf_offset_t offset, int whence) +{ + if (whence == SEEK_SET) + { + this->proxied->seek(offset + global_offset, whence); + } + else + { + this->proxied->seek(offset, whence); + } +} + +void +OffsetInputSource::rewind() +{ + seek(0, SEEK_SET); +} + +size_t +OffsetInputSource::read(char* buffer, size_t length) +{ + return this->proxied->read(buffer, length); +} + +void +OffsetInputSource::unreadCh(char ch) +{ + this->proxied->unreadCh(ch); +} diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index ccbfaf7c..ba96cb64 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -13,6 +13,7 @@ #include 
<qpdf/Pl_Discard.hh> #include <qpdf/FileInputSource.hh> #include <qpdf/BufferInputSource.hh> +#include <qpdf/OffsetInputSource.hh> #include <qpdf/QPDFExc.hh> #include <qpdf/QPDF_Null.hh> @@ -213,7 +214,7 @@ QPDF::getWarnings() void QPDF::parse(char const* password) { - PCRE header_re("^%PDF-(1.\\d+)\\b"); + PCRE header_re("\\A((?s).*?)%PDF-(1.\\d+)\\b"); PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)"); if (password) @@ -221,11 +222,24 @@ QPDF::parse(char const* password) this->provided_password = password; } - std::string line = this->file->readLine(20); + // Find the header anywhere in the first 1024 bytes of the file. + char buffer[1044]; + this->file->read(buffer, sizeof(buffer)); + std::string line(buffer); PCRE::Match m1 = header_re.match(line.c_str()); if (m1) { - this->pdf_version = m1.getMatch(1); + size_t global_offset = m1.getMatch(1).length(); + if (global_offset != 0) + { + // Empirical evidence strongly suggests that when there is + // leading material prior to the PDF header, all explicit + // offsets in the file are such that 0 points to the + // beginning of the header. 
+ QTC::TC("qpdf", "QPDF global offset"); + this->file = new OffsetInputSource(this->file, global_offset); + } + this->pdf_version = m1.getMatch(2); if (atof(this->pdf_version.c_str()) < 1.2) { this->tokenizer.allowPoundAnywhereInName(); diff --git a/libqpdf/build.mk b/libqpdf/build.mk index 6debf107..0ad96a2d 100644 --- a/libqpdf/build.mk +++ b/libqpdf/build.mk @@ -12,6 +12,7 @@ SRCS_libqpdf = \ libqpdf/FileInputSource.cc \ libqpdf/InputSource.cc \ libqpdf/MD5.cc \ + libqpdf/OffsetInputSource.cc \ libqpdf/PCRE.cc \ libqpdf/Pipeline.cc \ libqpdf/Pl_AES_PDF.cc \ diff --git a/libqpdf/qpdf/OffsetInputSource.hh b/libqpdf/qpdf/OffsetInputSource.hh new file mode 100644 index 00000000..aedc574a --- /dev/null +++ b/libqpdf/qpdf/OffsetInputSource.hh @@ -0,0 +1,29 @@ +#ifndef __QPDF_OFFSETINPUTSOURCE_HH__ +#define __QPDF_OFFSETINPUTSOURCE_HH__ + +// This class implements an InputSource that proxies for an underlying +// input source but offset a specific number of bytes. + +#include <qpdf/InputSource.hh> +#include <qpdf/PointerHolder.hh> + +class OffsetInputSource: public InputSource +{ + public: + OffsetInputSource(PointerHolder<InputSource>, qpdf_offset_t global_offset); + virtual ~OffsetInputSource(); + + virtual qpdf_offset_t findAndSkipNextEOL(); + virtual std::string const& getName() const; + virtual qpdf_offset_t tell(); + virtual void seek(qpdf_offset_t offset, int whence); + virtual void rewind(); + virtual size_t read(char* buffer, size_t length); + virtual void unreadCh(char ch); + + private: + PointerHolder<InputSource> proxied; + qpdf_offset_t global_offset; +}; + +#endif // __QPDF_OFFSETINPUTSOURCE_HH__ diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 3458297a..937d2b0c 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0 QPDFWriter extra header text no newline 0 QPDFWriter extra header text add newline 0 QPDF bogus 0 offset 0 +QPDF global offset 0 diff --git a/qpdf/qtest/qpdf.test 
b/qpdf/qtest/qpdf.test index 16af5832..35645466 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -149,7 +149,7 @@ $td->runtest("remove page we don't have", $td->NORMALIZE_NEWLINES); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 56; +$n_tests += 57; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -414,6 +414,10 @@ $td->runtest("object with zero offset", {$td->COMMAND => "qpdf --check zero-offset.pdf"}, {$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3}, $td->NORMALIZE_NEWLINES); +$td->runtest("check file with leading junk", + {$td->COMMAND => "qpdf --check leading-junk.pdf"}, + {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); show_ntests(); # ---------- diff --git a/qpdf/qtest/qpdf/leading-junk.out b/qpdf/qtest/qpdf/leading-junk.out new file mode 100644 index 00000000..58847c9d --- /dev/null +++ b/qpdf/qtest/qpdf/leading-junk.out @@ -0,0 +1,17 @@ +checking leading-junk.pdf +PDF Version: 1.4 +R = 3 +P = -4 +User password = +extract for accessibility: allowed +extract for any purpose: allowed +print low resolution: allowed +print high resolution: allowed +modify document assembly: allowed +modify forms: allowed +modify annotations: allowed +modify other: allowed +modify anything: allowed +File is linearized +No syntax or stream encoding errors found; the file may still contain +errors that qpdf cannot detect diff --git a/qpdf/qtest/qpdf/leading-junk.pdf b/qpdf/qtest/qpdf/leading-junk.pdf Binary files differ new file mode 100644 index 00000000..2b2a0a2c --- /dev/null +++ b/qpdf/qtest/qpdf/leading-junk.pdf |