summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2012-12-25 20:38:18 +0100
committerJay Berkenbilt <ejb@ql.org>2012-12-25 20:43:37 +0100
commit7f84239cad2ec58166245394e56a4647085e025e (patch)
treede91917df48f54d645c90f67a1cb1a49214b5d5f
parentbcfc9847beb0f059a98ef5c8c02646b43fab4272 (diff)
downloadqpdf-7f84239cad2ec58166245394e56a4647085e025e.tar.zst
Find PDF header anywhere in the first 1024 bytes
-rw-r--r--ChangeLog6
-rw-r--r--TODO9
-rw-r--r--libqpdf/OffsetInputSource.cc61
-rw-r--r--libqpdf/QPDF.cc20
-rw-r--r--libqpdf/build.mk1
-rw-r--r--libqpdf/qpdf/OffsetInputSource.hh29
-rw-r--r--qpdf/qpdf.testcov1
-rw-r--r--qpdf/qtest/qpdf.test6
-rw-r--r--qpdf/qtest/qpdf/leading-junk.out17
-rw-r--r--qpdf/qtest/qpdf/leading-junk.pdfbin0 -> 13670 bytes
10 files changed, 137 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index 88f57f30..a06ffdcf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2012-12-25 Jay Berkenbilt <ejb@ql.org>
+
+ * Allow PDF header to appear anywhere in the first 1024 bytes of
+ the file as recommended in the implementation notes of the Adobe
+ version of the PDF spec.
+
2012-11-20 Jay Berkenbilt <ejb@ql.org>
* Add zlib and libpcre to Requires.private in the pkg-config file
diff --git a/TODO b/TODO
index 73ffc087..c8e0ceca 100644
--- a/TODO
+++ b/TODO
@@ -1,12 +1,3 @@
-Next
-====
-
- * Find PDF header in the first 1024 bytes of the file. Treat the
- location of the PDF header as offset 0 for purposes of resolving
- explicit file locations as this is what other implementations
- appear to do.
-
-
General
=======
diff --git a/libqpdf/OffsetInputSource.cc b/libqpdf/OffsetInputSource.cc
new file mode 100644
index 00000000..c1ec4102
--- /dev/null
+++ b/libqpdf/OffsetInputSource.cc
@@ -0,0 +1,61 @@
+#include <qpdf/OffsetInputSource.hh>
+
+#include <limits>
+#include <stdexcept>
+
+// Construct a proxy around `proxied`. Both arguments are stored in the
+// members of the same name; the constructor body does nothing else.
+// `global_offset` is the amount that tell() subtracts from, and
+// seek(..., SEEK_SET) adds to, positions of the proxied source.
+OffsetInputSource::OffsetInputSource(PointerHolder<InputSource> proxied,
+ qpdf_offset_t global_offset) :
+ proxied(proxied),
+ global_offset(global_offset)
+{
+}
+
+// The destructor body is empty; no explicit cleanup is performed here.
+OffsetInputSource::~OffsetInputSource()
+{
+}
+
+// Ask the proxied source to find and skip the next end of line, then
+// convert the position it reports into this source's coordinates by
+// subtracting the global offset.
+qpdf_offset_t
+OffsetInputSource::findAndSkipNextEOL()
+{
+    qpdf_offset_t const underlying_pos = proxied->findAndSkipNextEOL();
+    return underlying_pos - global_offset;
+}
+
+// The name is whatever the proxied source reports; it is returned
+// unchanged.
+std::string const&
+OffsetInputSource::getName() const
+{
+    std::string const& underlying_name = proxied->getName();
+    return underlying_name;
+}
+
+// Report the current position in this source's coordinates: the
+// proxied source's position minus the global offset.
+qpdf_offset_t
+OffsetInputSource::tell()
+{
+    qpdf_offset_t const underlying_pos = proxied->tell();
+    return underlying_pos - global_offset;
+}
+
+// Move the read position of the proxied source.
+//
+// For SEEK_SET, `offset` is relative to this source's origin, so the
+// global offset is added before forwarding. For any other `whence`
+// the offset is forwarded unchanged.
+//
+// Throws std::range_error if adding the global offset would overflow
+// qpdf_offset_t, and std::runtime_error if the resulting position
+// lies before this source's origin (tell() < 0). In the latter case
+// the proxied source has already been repositioned when the exception
+// is thrown.
+void
+OffsetInputSource::seek(qpdf_offset_t offset, int whence)
+{
+    qpdf_offset_t target = offset;
+    if (whence == SEEK_SET)
+    {
+        // Guard the addition: signed overflow is undefined behavior,
+        // and explicit offsets may be taken from the file being read.
+        if ((this->global_offset > 0) &&
+            (offset > std::numeric_limits<qpdf_offset_t>::max() -
+             this->global_offset))
+        {
+            throw std::range_error(
+                "offset input source: seek offset would overflow");
+        }
+        target = offset + this->global_offset;
+    }
+    this->proxied->seek(target, whence);
+
+    // A relative or absolute seek must not end up in the bytes that
+    // precede this source's origin.
+    if (this->tell() < 0)
+    {
+        throw std::runtime_error(
+            "offset input source: seek before beginning of file");
+    }
+}
+
+// Return to this source's origin by seeking to logical position 0;
+// seek() is responsible for translating that into a position in the
+// proxied source.
+void
+OffsetInputSource::rewind()
+{
+    qpdf_offset_t const logical_start = 0;
+    this->seek(logical_start, SEEK_SET);
+}
+
+// Read up to `length` bytes into `buffer` from the proxied source and
+// return the count it reports. No offset adjustment is applied.
+size_t
+OffsetInputSource::read(char* buffer, size_t length)
+{
+    size_t const bytes_read = proxied->read(buffer, length);
+    return bytes_read;
+}
+
+// Hand `ch` back to the proxied source's unreadCh() unchanged.
+void
+OffsetInputSource::unreadCh(char ch)
+{
+    proxied->unreadCh(ch);
+}
diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc
index ccbfaf7c..ba96cb64 100644
--- a/libqpdf/QPDF.cc
+++ b/libqpdf/QPDF.cc
@@ -13,6 +13,7 @@
#include <qpdf/Pl_Discard.hh>
#include <qpdf/FileInputSource.hh>
#include <qpdf/BufferInputSource.hh>
+#include <qpdf/OffsetInputSource.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QPDF_Null.hh>
@@ -213,7 +214,7 @@ QPDF::getWarnings()
void
QPDF::parse(char const* password)
{
- PCRE header_re("^%PDF-(1.\\d+)\\b");
+ PCRE header_re("\\A((?s).*?)%PDF-(1.\\d+)\\b");
PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)");
if (password)
@@ -221,11 +222,24 @@ QPDF::parse(char const* password)
this->provided_password = password;
}
- std::string line = this->file->readLine(20);
+ // Find the header anywhere in the first 1024 bytes of the file.
+ char buffer[1044];
+ this->file->read(buffer, sizeof(buffer));
+ std::string line(buffer);
PCRE::Match m1 = header_re.match(line.c_str());
if (m1)
{
- this->pdf_version = m1.getMatch(1);
+ size_t global_offset = m1.getMatch(1).length();
+ if (global_offset != 0)
+ {
+ // Empirical evidence strongly suggests that when there is
+ // leading material prior to the PDF header, all explicit
+ // offsets in the file are such that 0 points to the
+ // beginning of the header.
+ QTC::TC("qpdf", "QPDF global offset");
+ this->file = new OffsetInputSource(this->file, global_offset);
+ }
+ this->pdf_version = m1.getMatch(2);
if (atof(this->pdf_version.c_str()) < 1.2)
{
this->tokenizer.allowPoundAnywhereInName();
diff --git a/libqpdf/build.mk b/libqpdf/build.mk
index 6debf107..0ad96a2d 100644
--- a/libqpdf/build.mk
+++ b/libqpdf/build.mk
@@ -12,6 +12,7 @@ SRCS_libqpdf = \
libqpdf/FileInputSource.cc \
libqpdf/InputSource.cc \
libqpdf/MD5.cc \
+ libqpdf/OffsetInputSource.cc \
libqpdf/PCRE.cc \
libqpdf/Pipeline.cc \
libqpdf/Pl_AES_PDF.cc \
diff --git a/libqpdf/qpdf/OffsetInputSource.hh b/libqpdf/qpdf/OffsetInputSource.hh
new file mode 100644
index 00000000..aedc574a
--- /dev/null
+++ b/libqpdf/qpdf/OffsetInputSource.hh
@@ -0,0 +1,29 @@
+#ifndef __QPDF_OFFSETINPUTSOURCE_HH__
+#define __QPDF_OFFSETINPUTSOURCE_HH__
+
+// This class implements an InputSource that proxies for an underlying
+// input source but shifts positions by a specific number of bytes.
+
+#include <qpdf/InputSource.hh>
+#include <qpdf/PointerHolder.hh>
+
+// InputSource that forwards each operation to another InputSource
+// while translating absolute positions by a fixed global offset.
+class OffsetInputSource: public InputSource
+{
+ public:
+ // Takes the source to proxy for and the offset to apply to it.
+ OffsetInputSource(PointerHolder<InputSource>, qpdf_offset_t global_offset);
+ virtual ~OffsetInputSource();
+
+ // Returns the proxied result minus the global offset.
+ virtual qpdf_offset_t findAndSkipNextEOL();
+ // Returns the proxied source's name.
+ virtual std::string const& getName() const;
+ // Returns the proxied position minus the global offset.
+ virtual qpdf_offset_t tell();
+ // Adds the global offset for SEEK_SET; other whence values are
+ // forwarded with the offset unchanged.
+ virtual void seek(qpdf_offset_t offset, int whence);
+ // Equivalent to seek(0, SEEK_SET).
+ virtual void rewind();
+ // Forwarded to the proxied source without adjustment.
+ virtual size_t read(char* buffer, size_t length);
+ // Forwarded to the proxied source without adjustment.
+ virtual void unreadCh(char ch);
+
+ private:
+ // Source that actually performs the I/O.
+ PointerHolder<InputSource> proxied;
+ // Difference between positions in the proxied source and positions
+ // reported by this class.
+ qpdf_offset_t global_offset;
+};
+
+#endif // __QPDF_OFFSETINPUTSOURCE_HH__
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 3458297a..937d2b0c 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0
QPDFWriter extra header text no newline 0
QPDFWriter extra header text add newline 0
QPDF bogus 0 offset 0
+QPDF global offset 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index 16af5832..35645466 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -149,7 +149,7 @@ $td->runtest("remove page we don't have",
$td->NORMALIZE_NEWLINES);
# ----------
$td->notify("--- Miscellaneous Tests ---");
-$n_tests += 56;
+$n_tests += 57;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@@ -414,6 +414,10 @@ $td->runtest("object with zero offset",
{$td->COMMAND => "qpdf --check zero-offset.pdf"},
{$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3},
$td->NORMALIZE_NEWLINES);
+$td->runtest("check file with leading junk",
+ {$td->COMMAND => "qpdf --check leading-junk.pdf"},
+ {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
diff --git a/qpdf/qtest/qpdf/leading-junk.out b/qpdf/qtest/qpdf/leading-junk.out
new file mode 100644
index 00000000..58847c9d
--- /dev/null
+++ b/qpdf/qtest/qpdf/leading-junk.out
@@ -0,0 +1,17 @@
+checking leading-junk.pdf
+PDF Version: 1.4
+R = 3
+P = -4
+User password =
+extract for accessibility: allowed
+extract for any purpose: allowed
+print low resolution: allowed
+print high resolution: allowed
+modify document assembly: allowed
+modify forms: allowed
+modify annotations: allowed
+modify other: allowed
+modify anything: allowed
+File is linearized
+No syntax or stream encoding errors found; the file may still contain
+errors that qpdf cannot detect
diff --git a/qpdf/qtest/qpdf/leading-junk.pdf b/qpdf/qtest/qpdf/leading-junk.pdf
new file mode 100644
index 00000000..2b2a0a2c
--- /dev/null
+++ b/qpdf/qtest/qpdf/leading-junk.pdf
Binary files differ