diff options
Diffstat (limited to 'libqpdf')
65 files changed, 12412 insertions, 0 deletions
diff --git a/libqpdf/BitStream.cc b/libqpdf/BitStream.cc new file mode 100644 index 00000000..c6fda4e6 --- /dev/null +++ b/libqpdf/BitStream.cc @@ -0,0 +1,45 @@ + + +#include <qpdf/BitStream.hh> + +// See comments in bits.cc +#define BITS_READ 1 +#include "bits.icc" + +BitStream::BitStream(unsigned char const* p, int nbytes) : + start(p), + nbytes(nbytes) +{ + reset(); +} + +void +BitStream::reset() +{ + p = start; + bit_offset = 7; + bits_available = 8 * nbytes; +} + +unsigned long +BitStream::getBits(int nbits) +{ + return read_bits(this->p, this->bit_offset, + this->bits_available, nbits); +} + +void +BitStream::skipToNextByte() +{ + if (bit_offset != 7) + { + unsigned int bits_to_skip = bit_offset + 1; + if (bits_available < bits_to_skip) + { + throw QEXC::Internal("overflow skipping to next byte in bitstream"); + } + bit_offset = 7; + ++p; + bits_available -= bits_to_skip; + } +} diff --git a/libqpdf/BitWriter.cc b/libqpdf/BitWriter.cc new file mode 100644 index 00000000..f682aac5 --- /dev/null +++ b/libqpdf/BitWriter.cc @@ -0,0 +1,30 @@ + + +#include <qpdf/BitWriter.hh> + +// See comments in bits.cc +#define BITS_WRITE 1 +#include "bits.icc" + +BitWriter::BitWriter(Pipeline* pl) : + pl(pl), + ch(0), + bit_offset(7) +{ +} + +void +BitWriter::writeBits(unsigned long val, int bits) +{ + write_bits(this->ch, this->bit_offset, val, bits, this->pl); +} + +void +BitWriter::flush() +{ + if (bit_offset < 7) + { + int bits_to_write = bit_offset + 1; + write_bits(this->ch, this->bit_offset, 0, bits_to_write, this->pl); + } +} diff --git a/libqpdf/Buffer.cc b/libqpdf/Buffer.cc new file mode 100644 index 00000000..3dde1f90 --- /dev/null +++ b/libqpdf/Buffer.cc @@ -0,0 +1,79 @@ + +#include <qpdf/Buffer.hh> + +#include <string.h> + +Buffer::Buffer() +{ + init(0); +} + +Buffer::Buffer(unsigned long size) +{ + init(size); +} + +Buffer::Buffer(Buffer const& rhs) +{ + init(0); + copy(rhs); +} + +Buffer& +Buffer::operator=(Buffer const& rhs) +{ + copy(rhs); + return *this; +} + 
+Buffer::~Buffer() +{ + destroy(); +} + +void +Buffer::init(unsigned long size) +{ + this->size = size; + this->buf = (size ? new unsigned char[size] : 0); +} + +void +Buffer::copy(Buffer const& rhs) +{ + if (this != &rhs) + { + this->destroy(); + this->init(rhs.size); + if (this->size) + { + memcpy(this->buf, rhs.buf, this->size); + } + } +} + +void +Buffer::destroy() +{ + delete [] this->buf; + this->size = 0; + this->buf = 0; +} + +unsigned long +Buffer::getSize() const +{ + return this->size; +} + +unsigned char const* +Buffer::getBuffer() const +{ + return this->buf; +} + +unsigned char* +Buffer::getBuffer() +{ + return this->buf; +} diff --git a/libqpdf/MD5.cc b/libqpdf/MD5.cc new file mode 100644 index 00000000..ecdd8a33 --- /dev/null +++ b/libqpdf/MD5.cc @@ -0,0 +1,441 @@ +// This file implements a class for computation of MD5 checksums. +// It is derived from the reference algorithm for MD5 as given in +// RFC 1321. The original copyright notice is as follows: +// +///////////////////////////////////////////////////////////////////////// +// +// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +// rights reserved. +// +// License to copy and use this software is granted provided that it +// is identified as the "RSA Data Security, Inc. MD5 Message-Digest +// Algorithm" in all material mentioning or referencing this software +// or this function. +// +// License is also granted to make and use derivative works provided +// that such works are identified as "derived from the RSA Data +// Security, Inc. MD5 Message-Digest Algorithm" in all material +// mentioning or referencing the derived work. +// +// RSA Data Security, Inc. makes no representations concerning either +// the merchantability of this software or the suitability of this +// software for any particular purpose. It is provided "as is" +// without express or implied warranty of any kind. 
+// +// These notices must be retained in any copies of any part of this +// documentation and/or software. +// +///////////////////////////////////////////////////////////////////////// + +#include <qpdf/MD5.hh> + +#include <stdio.h> +#include <memory.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +int const S11 = 7; +int const S12 = 12; +int const S13 = 17; +int const S14 = 22; +int const S21 = 5; +int const S22 = 9; +int const S23 = 14; +int const S24 = 20; +int const S31 = 4; +int const S32 = 11; +int const S33 = 16; +int const S34 = 23; +int const S41 = 6; +int const S42 = 10; +int const S43 = 15; +int const S44 = 21; + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +// F, G, H and I are basic MD5 functions. +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +// ROTATE_LEFT rotates x left n bits. +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +// Rotation is separate from addition to prevent recomputation. +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +// MD5 initialization. Begins an MD5 operation, writing a new context. 
+void MD5::init() +{ + count[0] = count[1] = 0; + // Load magic initialization constants. + state[0] = 0x67452301; + state[1] = 0xefcdab89; + state[2] = 0x98badcfe; + state[3] = 0x10325476; + + finalized = false; + memset(digest_val, 0, sizeof(digest_val)); +} + +// MD5 block update operation. Continues an MD5 message-digest +// operation, processing another message block, and updating the +// context. + +void MD5::update(unsigned char *input, + unsigned int inputLen) +{ + unsigned int i, index, partLen; + + // Compute number of bytes mod 64 + index = (unsigned int)((count[0] >> 3) & 0x3F); + + // Update number of bits + if ((count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + count[1]++; + count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + + // Transform as many times as possible. + + if (inputLen >= partLen) { + memcpy + ((POINTER)&buffer[index], (POINTER)input, partLen); + transform(state, buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + transform(state, &input[i]); + + index = 0; + } + else + i = 0; + + // Buffer remaining input + memcpy + ((POINTER)&buffer[index], (POINTER)&input[i], + inputLen-i); +} + +// MD5 finalization. Ends an MD5 message-digest operation, writing the +// the message digest and zeroizing the context. +void MD5::final() +{ + if (finalized) + { + return; + } + + unsigned char bits[8]; + unsigned int index, padLen; + + // Save number of bits + encode(bits, count, 8); + + // Pad out to 56 mod 64. + + index = (unsigned int)((count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + update(PADDING, padLen); + + // Append length (before padding) + update(bits, 8); + // Store state in digest_val + encode(digest_val, state, 16); + + // Zeroize sensitive information. + memset(state, 0, sizeof(state)); + memset(count, 0, sizeof(count)); + memset(buffer, 0, sizeof(buffer)); + + finalized = true; +} + +// MD5 basic transformation. Transforms state based on block. 
// Mix one 64-byte block into the four-word chaining state.
//
// The block is decoded into sixteen UINT4 words x[0..15], then four
// rounds of sixteen steps each (FF, GG, HH, II) are applied to working
// copies a, b, c, d of the state.  The results are added back into
// state[0..3] and the decoded words are wiped.
//
// The order of the steps, the word index, the shift constant and the
// additive constant on every line are all significant; do not reorder.
void MD5::transform(UINT4 state[4], unsigned char block[64])
{
    UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];

    // Unpack the 64 input bytes into sixteen words.
    decode(x, block, 64);

    // Round 1
    FF (a, b, c, d, x[ 0], S11, 0xd76aa478); // 1
    FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); // 2
    FF (c, d, a, b, x[ 2], S13, 0x242070db); // 3
    FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); // 4
    FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); // 5
    FF (d, a, b, c, x[ 5], S12, 0x4787c62a); // 6
    FF (c, d, a, b, x[ 6], S13, 0xa8304613); // 7
    FF (b, c, d, a, x[ 7], S14, 0xfd469501); // 8
    FF (a, b, c, d, x[ 8], S11, 0x698098d8); // 9
    FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); // 10
    FF (c, d, a, b, x[10], S13, 0xffff5bb1); // 11
    FF (b, c, d, a, x[11], S14, 0x895cd7be); // 12
    FF (a, b, c, d, x[12], S11, 0x6b901122); // 13
    FF (d, a, b, c, x[13], S12, 0xfd987193); // 14
    FF (c, d, a, b, x[14], S13, 0xa679438e); // 15
    FF (b, c, d, a, x[15], S14, 0x49b40821); // 16

    // Round 2
    GG (a, b, c, d, x[ 1], S21, 0xf61e2562); // 17
    GG (d, a, b, c, x[ 6], S22, 0xc040b340); // 18
    GG (c, d, a, b, x[11], S23, 0x265e5a51); // 19
    GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); // 20
    GG (a, b, c, d, x[ 5], S21, 0xd62f105d); // 21
    GG (d, a, b, c, x[10], S22, 0x2441453); // 22
    GG (c, d, a, b, x[15], S23, 0xd8a1e681); // 23
    GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); // 24
    GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); // 25
    GG (d, a, b, c, x[14], S22, 0xc33707d6); // 26
    GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); // 27
    GG (b, c, d, a, x[ 8], S24, 0x455a14ed); // 28
    GG (a, b, c, d, x[13], S21, 0xa9e3e905); // 29
    GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); // 30
    GG (c, d, a, b, x[ 7], S23, 0x676f02d9); // 31
    GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); // 32

    // Round 3
    HH (a, b, c, d, x[ 5], S31, 0xfffa3942); // 33
    HH (d, a, b, c, x[ 8], S32, 0x8771f681); // 34
    HH (c, d, a, b, x[11], S33, 0x6d9d6122); // 35
    HH (b, c, d, a, x[14], S34, 0xfde5380c); // 36
    HH (a, b, c, d, x[ 1], S31, 0xa4beea44); // 37
    HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); // 38
    HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); // 39
    HH (b, c, d, a, x[10], S34, 0xbebfbc70); // 40
    HH (a, b, c, d, x[13], S31, 0x289b7ec6); // 41
    HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); // 42
    HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); // 43
    HH (b, c, d, a, x[ 6], S34, 0x4881d05); // 44
    HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); // 45
    HH (d, a, b, c, x[12], S32, 0xe6db99e5); // 46
    HH (c, d, a, b, x[15], S33, 0x1fa27cf8); // 47
    HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); // 48

    // Round 4
    II (a, b, c, d, x[ 0], S41, 0xf4292244); // 49
    II (d, a, b, c, x[ 7], S42, 0x432aff97); // 50
    II (c, d, a, b, x[14], S43, 0xab9423a7); // 51
    II (b, c, d, a, x[ 5], S44, 0xfc93a039); // 52
    II (a, b, c, d, x[12], S41, 0x655b59c3); // 53
    II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); // 54
    II (c, d, a, b, x[10], S43, 0xffeff47d); // 55
    II (b, c, d, a, x[ 1], S44, 0x85845dd1); // 56
    II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); // 57
    II (d, a, b, c, x[15], S42, 0xfe2ce6e0); // 58
    II (c, d, a, b, x[ 6], S43, 0xa3014314); // 59
    II (b, c, d, a, x[13], S44, 0x4e0811a1); // 60
    II (a, b, c, d, x[ 4], S41, 0xf7537e82); // 61
    II (d, a, b, c, x[11], S42, 0xbd3af235); // 62
    II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); // 63
    II (b, c, d, a, x[ 9], S44, 0xeb86d391); // 64

    // Fold the working values back into the chaining state.
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;

    // Zeroize sensitive information.

    memset ((POINTER)x, 0, sizeof (x));
}

// Encodes input (UINT4) into output (unsigned char). Assumes len is a
// multiple of 4.
+void MD5::encode(unsigned char *output, UINT4 *input, unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +// Decodes input (unsigned char) into output (UINT4). Assumes len is a +// multiple of 4. +void MD5::decode(UINT4 *output, unsigned char *input, unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); +} + +// Public functions + +MD5::MD5() +{ + init(); +} + +void MD5::reset() +{ + init(); +} + +void MD5::encodeString(char const* str) +{ + unsigned int len = strlen(str); + + update((unsigned char *)str, len); + final(); +} + +void MD5::appendString(char const* input_string) +{ + update((unsigned char *)input_string, strlen(input_string)); +} + +void MD5::encodeDataIncrementally(char const* data, int len) +{ + update((unsigned char *)data, len); +} + +void MD5::encodeFile(char const *filename, int up_to_size) + throw (QEXC::System) +{ + FILE *file; + unsigned char buffer[1024]; + + if ((file = fopen (filename, "rb")) == NULL) + { + throw QEXC::System(std::string("MD5: can't open ") + filename, errno); + } + + int len; + int so_far = 0; + int to_try = 1024; + do + { + if ((up_to_size >= 0) && ((so_far + to_try) > up_to_size)) + { + to_try = up_to_size - so_far; + } + len = fread(buffer, 1, to_try, file); + if (len > 0) + { + update(buffer, len); + so_far += len; + if ((up_to_size >= 0) && (so_far >= up_to_size)) + { + break; + } + } + } while (len > 0); + if (ferror(file)) + { + // Assume, perhaps incorrectly, that errno was set by the + // underlying call to read.... 
+ (void) fclose(file); + throw QEXC::System(std::string("MD5: read error on ") + filename, errno); + } + (void) fclose(file); + + final(); +} + +void MD5::digest(Digest result) +{ + final(); + memcpy(result, digest_val, sizeof(digest_val)); +} + +void MD5::print() +{ + final(); + + unsigned int i; + for (i = 0; i < 16; ++i) + { + printf("%02x", digest_val[i]); + } + printf("\n"); +} + +std::string MD5::unparse() +{ + final(); + + char result[33]; + char* p = result; + unsigned int i; + for (i = 0; i < 16; ++i) + { + sprintf(p, "%02x", digest_val[i]); + p += 2; + } + return result; +} + +std::string +MD5::getDataChecksum(char const* buf, int len) +{ + MD5 m; + m.encodeDataIncrementally(buf, len); + return m.unparse(); +} + +std::string +MD5::getFileChecksum(char const* filename, int up_to_size) +{ + MD5 m; + m.encodeFile(filename, up_to_size); + return m.unparse(); +} + +bool +MD5::checkDataChecksum(char const* const checksum, + char const* buf, int len) +{ + std::string actual_checksum = getDataChecksum(buf, len); + return (checksum == actual_checksum); +} + +bool +MD5::checkFileChecksum(char const* const checksum, + char const* filename, int up_to_size) +{ + bool result = false; + try + { + std::string actual_checksum = getFileChecksum(filename, up_to_size); + result = (checksum == actual_checksum); + } + catch (QEXC::System) + { + // Ignore -- return false + } + return result; +} diff --git a/libqpdf/Makefile b/libqpdf/Makefile new file mode 100644 index 00000000..90899055 --- /dev/null +++ b/libqpdf/Makefile @@ -0,0 +1 @@ +include ../make/proxy.mk diff --git a/libqpdf/PCRE.cc b/libqpdf/PCRE.cc new file mode 100644 index 00000000..afa6e954 --- /dev/null +++ b/libqpdf/PCRE.cc @@ -0,0 +1,365 @@ + + +#include <qpdf/PCRE.hh> +#include <qpdf/QUtil.hh> + +#include <iostream> + +PCRE::Exception::Exception(std::string const& message) +{ + this->setMessage("PCRE error: " + message); +} + +PCRE::NoBackref::NoBackref() : + Exception("no match") +{ +} + 
+PCRE::Match::Match(int nbackrefs, char const* subject) +{ + this->init(-1, nbackrefs, subject); +} + +PCRE::Match::~Match() +{ + this->destroy(); +} + +PCRE::Match::Match(Match const& rhs) +{ + this->copy(rhs); +} + +PCRE::Match& +PCRE::Match::operator=(Match const& rhs) +{ + if (this != &rhs) + { + this->destroy(); + this->copy(rhs); + } + return *this; +} + +void +PCRE::Match::init(int nmatches, int nbackrefs, char const* subject) +{ + this->nmatches = nmatches; + this->nbackrefs = nbackrefs; + this->subject = subject; + this->ovecsize = 3 * (1 + nbackrefs); + this->ovector = 0; + if (this->ovecsize) + { + this->ovector = new int[this->ovecsize]; + } +} + +void +PCRE::Match::copy(Match const& rhs) +{ + this->init(rhs.nmatches, rhs.nbackrefs, rhs.subject); + int i; + for (i = 0; i < this->ovecsize; ++i) + { + this->ovector[i] = rhs.ovector[i]; + } +} + +void +PCRE::Match::destroy() +{ + delete [] this->ovector; +} + +PCRE::Match::operator bool() +{ + return (this->nmatches >= 0); +} + + +std::string +PCRE::Match::getMatch(int n, int flags) + throw(QEXC::General, Exception) +{ + // This method used to be implemented in terms of + // pcre_get_substring, but that function gives you an empty string + // for an unmatched backreference that is in range. 
+ + int offset; + int length; + try + { + getOffsetLength(n, offset, length); + } + catch (NoBackref&) + { + if (flags & gm_no_substring_returns_empty) + { + return ""; + } + else + { + throw; + } + } + + return std::string(this->subject).substr(offset, length); +} + +void +PCRE::Match::getOffsetLength(int n, int& offset, int& length) throw(Exception) +{ + if ((this->nmatches < 0) || + (n > this->nmatches - 1) || + (this->ovector[n * 2] == -1)) + { + throw NoBackref(); + } + offset = this->ovector[n * 2]; + length = this->ovector[n * 2 + 1] - offset; +} + + +int +PCRE::Match::getOffset(int n) throw(Exception) +{ + int offset; + int length; + this->getOffsetLength(n, offset, length); + return offset; +} + + +int +PCRE::Match::getLength(int n) throw(Exception) +{ + int offset; + int length; + this->getOffsetLength(n, offset, length); + return length; +} + + +int +PCRE::Match::nMatches() const +{ + return this->nmatches; +} + +PCRE::PCRE(char const* pattern, int options) throw (Exception) +{ + char const *errptr; + int erroffset; + this->code = pcre_compile(pattern, options, &errptr, &erroffset, 0); + if (this->code) + { + this->nbackrefs = pcre_info(this->code, 0, 0); + } + else + { + std::string message = (std::string("compilation of ") + pattern + + " failed at offset " + + QUtil::int_to_string(erroffset) + ": " + + errptr); + throw Exception(message); + } +} + +PCRE::~PCRE() +{ + pcre_free(this->code); +} + +PCRE::Match +PCRE::match(char const* subject, int options, int startoffset, int size) + throw (QEXC::General, Exception) +{ + if (size == -1) + { + size = strlen(subject); + } + + Match result(this->nbackrefs, subject); + int status = pcre_exec(this->code, 0, subject, size, + startoffset, options, + result.ovector, result.ovecsize); + if (status >= 0) + { + result.nmatches = status; + } + else + { + std::string message; + + switch (status) + { + case PCRE_ERROR_NOMATCH: + break; + + case PCRE_ERROR_BADOPTION: + message = "bad option passed to PCRE::match()"; + 
throw Exception(message); + break; + + case PCRE_ERROR_NOMEMORY: + message = "insufficient memory"; + throw Exception(message); + break; + + case PCRE_ERROR_NULL: + case PCRE_ERROR_BADMAGIC: + case PCRE_ERROR_UNKNOWN_NODE: + default: + message = "pcre_exec returned " + QUtil::int_to_string(status); + throw QEXC::Internal(message); + } + } + + return result; +} + +void +PCRE::test(int n) +{ + try + { + if (n == 1) + { + static char const* utf8 = "abÏ€defq"; + PCRE u1("^([[:alpha:]]+)"); + PCRE u2("^([\\p{L}]+)", PCRE_UTF8); + PCRE::Match m1 = u1.match(utf8); + if (m1) + { + std::cout << "no utf8: " << m1.getMatch(1) << std::endl; + } + PCRE::Match m2 = u2.match(utf8); + if (m2) + { + std::cout << "utf8: " << m2.getMatch(1) << std::endl; + } + return; + } + + try + { + PCRE pcre1("a**"); + } + catch (Exception& e) + { + std::cout << e.unparse() << std::endl; + } + + PCRE pcre2("^([^\\s:]*)\\s*:\\s*(.*?)\\s*$"); + PCRE::Match m2 = pcre2.match("key: value one two three "); + if (m2) + { + std::cout << m2.nMatches() << std::endl; + std::cout << m2.getMatch(0) << std::endl; + std::cout << m2.getOffset(0) << std::endl; + std::cout << m2.getLength(0) << std::endl; + std::cout << m2.getMatch(1) << std::endl; + std::cout << m2.getOffset(1) << std::endl; + std::cout << m2.getLength(1) << std::endl; + std::cout << m2.getMatch(2) << std::endl; + std::cout << m2.getOffset(2) << std::endl; + std::cout << m2.getLength(2) << std::endl; + try + { + std::cout << m2.getMatch(3) << std::endl; + } + catch (Exception& e) + { + std::cout << e.unparse() << std::endl; + } + try + { + std::cout << m2.getOffset(3) << std::endl; + } + catch (Exception& e) + { + std::cout << e.unparse() << std::endl; + } + } + PCRE pcre3("^(a+)(b+)?$"); + PCRE::Match m3 = pcre3.match("aaa"); + try + { + if (m3) + { + std::cout << m3.nMatches() << std::endl; + std::cout << m3.getMatch(0) << std::endl; + std::cout << m3.getMatch(1) << std::endl; + std::cout << "-" + << m3.getMatch( + 2, 
Match::gm_no_substring_returns_empty) + << "-" << std::endl; + std::cout << "hello" << std::endl; + std::cout << m3.getMatch(2) << std::endl; + std::cout << "can't see this" << std::endl; + } + } + catch (Exception& e) + { + std::cout << e.unparse() << std::endl; + } + + // backref: 1 2 3 4 5 + PCRE pcre4("^((?:(a(b)?)(?:,(c))?)|(c))?$"); + static char const* candidates[] = { + "qqqcqqq", // no match + "ab,c", // backrefs: 0, 1, 2, 3, 4 + "ab", // backrefs: 0, 1, 2, 3 + "a", // backrefs: 0, 1, 2 + "a,c", // backrefs: 0, 1, 2, 4 + "c", // backrefs: 0, 1, 5 + "", // backrefs: 0 + 0 + }; + for (char const** p = candidates; *p; ++p) + { + PCRE::Match m(pcre4.match(*p)); + if (m) + { + int nmatches = m.nMatches(); + for (int i = 0; i < nmatches; ++i) + { + std::cout << *p << ": " << i << ": "; + try + { + std::string match = m.getMatch(i); + std::cout << match; + } + catch (NoBackref&) + { + std::cout << "no backref (getMatch)"; + } + std::cout << std::endl; + + std::cout << *p << ": " << i << ": "; + try + { + int offset; + int length; + m.getOffsetLength(i, offset, length); + std::cout << offset << ", " << length; + } + catch (NoBackref&) + { + std::cout << "no backref (getOffsetLength)"; + } + std:: cout << std::endl; + } + } + else + { + std::cout << *p << ": no match" << std::endl; + } + } + } + catch (QEXC::General& e) + { + std::cout << "unexpected exception: " << e.unparse() << std::endl; + } +} diff --git a/libqpdf/Pipeline.cc b/libqpdf/Pipeline.cc new file mode 100644 index 00000000..17c0c8b2 --- /dev/null +++ b/libqpdf/Pipeline.cc @@ -0,0 +1,25 @@ + + +#include <qpdf/Pipeline.hh> + +Pipeline::Pipeline(char const* identifier, Pipeline* next) : + identifier(identifier), + next(next) +{ +} + +Pipeline::~Pipeline() +{ +} + +Pipeline* +Pipeline::getNext(bool allow_null) +{ + if ((next == 0) && (! 
allow_null)) + { + throw Exception( + this->identifier + + ": Pipeline::getNext() called on pipeline with no next"); + } + return this->next; +} diff --git a/libqpdf/Pl_ASCII85Decoder.cc b/libqpdf/Pl_ASCII85Decoder.cc new file mode 100644 index 00000000..4ecdaf41 --- /dev/null +++ b/libqpdf/Pl_ASCII85Decoder.cc @@ -0,0 +1,131 @@ +#include <qpdf/Pl_ASCII85Decoder.hh> +#include <qpdf/QEXC.hh> +#include <qpdf/QTC.hh> +#include <string.h> + +Pl_ASCII85Decoder::Pl_ASCII85Decoder(char const* identifier, Pipeline* next) : + Pipeline(identifier, next), + pos(0), + eod(0) +{ + memset(this->inbuf, 117, 5); +} + +Pl_ASCII85Decoder::~Pl_ASCII85Decoder() +{ +} + +void +Pl_ASCII85Decoder::write(unsigned char* buf, int len) +{ + if (eod > 1) + { + return; + } + for (int i = 0; i < len; ++i) + { + if (eod > 1) + { + break; + } + else if (eod == 1) + { + if (buf[i] == '>') + { + flush(); + eod = 2; + } + else + { + throw QEXC::General( + "broken end-of-data sequence in base 85 data"); + } + } + else + { + switch (buf[i]) + { + case ' ': + case '\f': + case '\v': + case '\t': + case '\r': + case '\n': + QTC::TC("libtests", "Pl_ASCII85Decoder ignore space"); + // ignore whitespace + break; + + case '~': + eod = 1; + break; + + case 'z': + if (pos != 0) + { + throw QEXC::General( + "unexpected z during base 85 decode"); + } + else + { + QTC::TC("libtests", "Pl_ASCII85Decoder read z"); + getNext()->write((unsigned char*)"\000\000\000\000", 4); + } + break; + + default: + if ((buf[i] < 33) || (buf[i] > 117)) + { + throw QEXC::General + ("character out of range during base 85 decode"); + } + else + { + this->inbuf[this->pos++] = buf[i]; + if (pos == 5) + { + flush(); + } + } + break; + } + } + } +} + +void +Pl_ASCII85Decoder::flush() +{ + if (this->pos == 0) + { + QTC::TC("libtests", "Pl_ASCII85Decoder no-op flush"); + return; + } + unsigned long lval = 0; + for (int i = 0; i < 5; ++i) + { + lval *= 85; + lval += (this->inbuf[i] - 33); + } + + unsigned char outbuf[4]; + memset(outbuf, 0, 
4); + for (int i = 3; i >= 0; --i) + { + outbuf[i] = lval & 0xff; + lval >>= 8; + } + + QTC::TC("libtests", "Pl_ASCII85Decoder partial flush", + (this->pos == 5) ? 0 : 1); + getNext()->write(outbuf, this->pos - 1); + + this->pos = 0; + memset(this->inbuf, 117, 5); +} + +void +Pl_ASCII85Decoder::finish() +{ + flush(); + getNext()->finish(); +} diff --git a/libqpdf/Pl_ASCIIHexDecoder.cc b/libqpdf/Pl_ASCIIHexDecoder.cc new file mode 100644 index 00000000..d1b4ef1c --- /dev/null +++ b/libqpdf/Pl_ASCIIHexDecoder.cc @@ -0,0 +1,108 @@ +#include <qpdf/Pl_ASCIIHexDecoder.hh> +#include <qpdf/QEXC.hh> +#include <qpdf/QTC.hh> +#include <string.h> +#include <ctype.h> + +Pl_ASCIIHexDecoder::Pl_ASCIIHexDecoder(char const* identifier, Pipeline* next) : + Pipeline(identifier, next), + pos(0), + eod(false) +{ + strcpy(this->inbuf, "00"); +} + +Pl_ASCIIHexDecoder::~Pl_ASCIIHexDecoder() +{ +} + +void +Pl_ASCIIHexDecoder::write(unsigned char* buf, int len) +{ + if (this->eod) + { + return; + } + for (int i = 0; i < len; ++i) + { + char ch = toupper(buf[i]); + switch (ch) + { + case ' ': + case '\f': + case '\v': + case '\t': + case '\r': + case '\n': + QTC::TC("libtests", "Pl_ASCIIHexDecoder ignore space"); + // ignore whitespace + break; + + case '>': + this->eod = true; + flush(); + break; + + default: + if (((ch >= '0') && (ch <= '9')) || + ((ch >= 'A') && (ch <= 'F'))) + { + this->inbuf[this->pos++] = ch; + if (this->pos == 2) + { + flush(); + } + } + else + { + char t[2]; + t[0] = ch; + t[1] = 0; + throw QEXC::General( + std::string("character out of range during base Hex decode: ") + t); + } + break; + } + if (this->eod) + { + break; + } + } +} + +void +Pl_ASCIIHexDecoder::flush() +{ + if (this->pos == 0) + { + QTC::TC("libtests", "Pl_ASCIIHexDecoder no-op flush"); + return; + } + int b[2]; + for (int i = 0; i < 2; ++i) + { + if (this->inbuf[i] >= 'A') + { + b[i] = this->inbuf[i] - 'A' + 10; + } + else + { + b[i] = this->inbuf[i] - '0'; + } + } + unsigned char ch = (unsigned 
char)((b[0] << 4) + b[1]); + + QTC::TC("libtests", "Pl_ASCIIHexDecoder partial flush", + (this->pos == 2) ? 0 : 1); + getNext()->write(&ch, 1); + + this->pos = 0; + strcpy(this->inbuf, "00"); +} + +void +Pl_ASCIIHexDecoder::finish() +{ + flush(); + getNext()->finish(); +} diff --git a/libqpdf/Pl_Buffer.cc b/libqpdf/Pl_Buffer.cc new file mode 100644 index 00000000..185cf636 --- /dev/null +++ b/libqpdf/Pl_Buffer.cc @@ -0,0 +1,67 @@ + +#include <qpdf/Pl_Buffer.hh> +#include <qpdf/QEXC.hh> +#include <assert.h> + +Pl_Buffer::Pl_Buffer(char const* identifier, Pipeline* next) : + Pipeline(identifier, next), + ready(false), + total_size(0) +{ +} + +Pl_Buffer::~Pl_Buffer() +{ +} + +void +Pl_Buffer::write(unsigned char* buf, int len) +{ + Buffer* b = new Buffer(len); + memcpy(b->getBuffer(), buf, len); + this->data.push_back(b); + this->ready = false; + this->total_size += len; + + if (getNext(true)) + { + getNext()->write(buf, len); + } +} + +void +Pl_Buffer::finish() +{ + this->ready = true; + if (getNext(true)) + { + getNext()->finish(); + } +} + +Buffer* +Pl_Buffer::getBuffer() +{ + if (! this->ready) + { + throw QEXC::Internal("Pl_Buffer::getBuffer() called when not ready"); + } + + Buffer* b = new Buffer(this->total_size); + unsigned char* p = b->getBuffer(); + while (! 
this->data.empty()) + { + PointerHolder<Buffer> bph = this->data.front(); + this->data.pop_front(); + Buffer* bp = bph.getPointer(); + size_t bytes = bp->getSize(); + memcpy(p, bp->getBuffer(), bytes); + p += bytes; + this->total_size -= bytes; + } + + assert(this->total_size == 0); + this->ready = false; + + return b; +} diff --git a/libqpdf/Pl_Count.cc b/libqpdf/Pl_Count.cc new file mode 100644 index 00000000..8a361ad5 --- /dev/null +++ b/libqpdf/Pl_Count.cc @@ -0,0 +1,42 @@ + +#include <qpdf/Pl_Count.hh> + +Pl_Count::Pl_Count(char const* identifier, Pipeline* next) : + Pipeline(identifier, next), + count(0), + last_char('\0') +{ +} + +Pl_Count::~Pl_Count() +{ +} + +void +Pl_Count::write(unsigned char* buf, int len) +{ + if (len) + { + this->count += len; + getNext()->write(buf, len); + this->last_char = buf[len - 1]; + } +} + +void +Pl_Count::finish() +{ + getNext()->finish(); +} + +int +Pl_Count::getCount() const +{ + return this->count; +} + +unsigned char +Pl_Count::getLastChar() const +{ + return this->last_char; +} diff --git a/libqpdf/Pl_Discard.cc b/libqpdf/Pl_Discard.cc new file mode 100644 index 00000000..1632ea23 --- /dev/null +++ b/libqpdf/Pl_Discard.cc @@ -0,0 +1,23 @@ + +#include <qpdf/Pl_Discard.hh> + +// Exercised in md5 test suite + +Pl_Discard::Pl_Discard() : + Pipeline("discard", 0) +{ +} + +Pl_Discard::~Pl_Discard() +{ +} + +void +Pl_Discard::write(unsigned char* buf, int len) +{ +} + +void +Pl_Discard::finish() +{ +} diff --git a/libqpdf/Pl_Flate.cc b/libqpdf/Pl_Flate.cc new file mode 100644 index 00000000..ba60c472 --- /dev/null +++ b/libqpdf/Pl_Flate.cc @@ -0,0 +1,198 @@ + +#include <qpdf/Pl_Flate.hh> + +#include <qpdf/QUtil.hh> + +Pl_Flate::Pl_Flate(char const* identifier, Pipeline* next, + action_e action, int out_bufsize) : + Pipeline(identifier, next), + out_bufsize(out_bufsize), + action(action), + initialized(false) +{ + this->outbuf = new unsigned char[out_bufsize]; + + zstream.zalloc = (alloc_func)0; + zstream.zfree = (free_func)0; 
+ zstream.opaque = (voidpf)0; + zstream.next_in = 0; + zstream.avail_in = 0; + zstream.next_out = this->outbuf; + zstream.avail_out = out_bufsize; +} + +Pl_Flate::~Pl_Flate() +{ + if (this->outbuf) + { + delete [] this->outbuf; + this->outbuf = 0; + } +} + +void +Pl_Flate::write(unsigned char* data, int len) +{ + if (this->outbuf == 0) + { + throw Exception( + this->identifier + + ": Pl_Flate: write() called after finish() called"); + } + handleData(data, len, Z_NO_FLUSH); +} + +void +Pl_Flate::handleData(unsigned char* data, int len, int flush) +{ + this->zstream.next_in = data; + this->zstream.avail_in = len; + + if (! this->initialized) + { + int err = Z_OK; + if (this->action == a_deflate) + { + err = deflateInit(&this->zstream, Z_DEFAULT_COMPRESSION); + } + else + { + err = inflateInit(&this->zstream); + } + checkError("Init", err); + this->initialized = true; + } + + int err = Z_OK; + + bool done = false; + while (! done) + { + if (action == a_deflate) + { + err = deflate(&this->zstream, flush); + } + else + { + err = inflate(&this->zstream, flush); + } + switch (err) + { + case Z_BUF_ERROR: + // Probably shouldn't be able to happen, but possible as a + // boundary condition: if the last call to inflate exactly + // filled the output buffer, it's possible that the next + // call to inflate could have nothing to do. + done = true; + break; + + case Z_STREAM_END: + done = true; + // fall through + + case Z_OK: + { + if ((this->zstream.avail_in == 0) && + (this->zstream.avail_out > 0)) + { + // There is nothing left to read, and there was + // sufficient buffer space to write everything we + // needed, so we're done for now. 
+ done = true; + } + uLong ready = (this->out_bufsize - this->zstream.avail_out); + if (ready > 0) + { + this->getNext()->write(this->outbuf, ready); + this->zstream.next_out = this->outbuf; + this->zstream.avail_out = this->out_bufsize; + } + } + break; + + default: + this->checkError("data", err); + break; + } + } +} + +void +Pl_Flate::finish() +{ + if (this->outbuf) + { + if (this->initialized) + { + unsigned char buf[1]; + buf[0] = '\0'; + handleData(buf, 0, Z_FINISH); + int err = Z_OK; + if (action == a_deflate) + { + err = deflateEnd(&this->zstream); + } + else + { + err = inflateEnd(&this->zstream); + } + checkError("End", err); + } + + delete [] this->outbuf; + this->outbuf = 0; + } + this->getNext()->finish(); +} + +void +Pl_Flate::checkError(char const* prefix, int error_code) +{ + if (error_code != Z_OK) + { + char const* action_str = (action == a_deflate ? "deflate" : "inflate"); + std::string msg = + this->identifier + ": " + action_str + ": " + prefix + ": "; + + if (this->zstream.msg) + { + msg += this->zstream.msg; + } + else + { + switch (error_code) + { + case Z_ERRNO: + msg += "zlib system error"; + break; + + case Z_STREAM_ERROR: + msg += "zlib stream error"; + break; + + case Z_DATA_ERROR: + msg += "zlib data error"; + break; + + case Z_MEM_ERROR: + msg += "zlib memory error"; + break; + + case Z_BUF_ERROR: + msg += "zlib buffer error"; + break; + + case Z_VERSION_ERROR: + msg += "zlib version error"; + break; + + default: + msg += std::string("zlib unknown error (") + + QUtil::int_to_string(error_code) + ")"; + break; + } + } + + throw Exception(msg); + } +} diff --git a/libqpdf/Pl_LZWDecoder.cc b/libqpdf/Pl_LZWDecoder.cc new file mode 100644 index 00000000..e85531e9 --- /dev/null +++ b/libqpdf/Pl_LZWDecoder.cc @@ -0,0 +1,229 @@ +#include <qpdf/Pl_LZWDecoder.hh> + +#include <qpdf/QEXC.hh> +#include <qpdf/QTC.hh> +#include <string.h> +#include <assert.h> + +Pl_LZWDecoder::Pl_LZWDecoder(char const* identifier, Pipeline* next, + bool 
early_code_change) : + Pipeline(identifier, next), + code_size(9), + next(0), + byte_pos(0), + bit_pos(0), + bits_available(0), + code_change_delta(early_code_change ? 1 : 0), + eod(false), + last_code(256) +{ + memset(buf, 0, 3); +} + + +Pl_LZWDecoder::~Pl_LZWDecoder() +{ +} + +void +Pl_LZWDecoder::write(unsigned char* bytes, int len) +{ + for (int i = 0; i < len; ++i) + { + this->buf[next++] = bytes[i]; + if (this->next == 3) + { + this->next = 0; + } + this->bits_available += 8; + if (this->bits_available >= this->code_size) + { + sendNextCode(); + } + } +} + +void +Pl_LZWDecoder::finish() +{ + getNext()->finish(); +} + +void +Pl_LZWDecoder::sendNextCode() +{ + int high = this->byte_pos; + int med = (this->byte_pos + 1) % 3; + int low = (this->byte_pos + 2) % 3; + + int bits_from_high = 8 - this->bit_pos; + int bits_from_med = this->code_size - bits_from_high; + int bits_from_low = 0; + if (bits_from_med > 8) + { + bits_from_low = bits_from_med - 8; + bits_from_med = 8; + } + int high_mask = (1 << bits_from_high) - 1; + int med_mask = 0xff - ((1 << (8 - bits_from_med)) - 1); + int low_mask = 0xff - ((1 << (8 - bits_from_low)) - 1); + int code = 0; + code += (this->buf[high] & high_mask) << bits_from_med; + code += ((this->buf[med] & med_mask) >> (8 - bits_from_med)); + if (bits_from_low) + { + code <<= bits_from_low; + code += ((this->buf[low] & low_mask) >> (8 - bits_from_low)); + this->byte_pos = low; + this->bit_pos = bits_from_low; + } + else + { + this->byte_pos = med; + this->bit_pos = bits_from_med; + } + if (this->bit_pos == 8) + { + this->bit_pos = 0; + ++this->byte_pos; + this->byte_pos %= 3; + } + this->bits_available -= this->code_size; + + handleCode(code); +} + +unsigned char +Pl_LZWDecoder::getFirstChar(int code) +{ + unsigned char result = '\0'; + if (code < 256) + { + result = (unsigned char) code; + } + else + { + assert(code > 257); + unsigned int idx = code - 258; + assert(idx < table.size()); + Buffer& b = table[idx]; + result = 
b.getBuffer()[0]; + } + return result; +} + +void +Pl_LZWDecoder::addToTable(unsigned char next) +{ + unsigned int last_size = 0; + unsigned char const* last_data = 0; + unsigned char tmp[1]; + + if (this->last_code < 256) + { + tmp[0] = this->last_code; + last_data = tmp; + last_size = 1; + } + else + { + assert(this->last_code > 257); + unsigned int idx = this->last_code - 258; + assert(idx < table.size()); + Buffer& b = table[idx]; + last_data = b.getBuffer(); + last_size = b.getSize(); + } + + Buffer entry(1 + last_size); + unsigned char* new_data = entry.getBuffer(); + memcpy(new_data, last_data, last_size); + new_data[last_size] = next; + this->table.push_back(entry); +} + +void +Pl_LZWDecoder::handleCode(int code) +{ + if (this->eod) + { + return; + } + + if (code == 256) + { + if (! this->table.empty()) + { + QTC::TC("libtests", "Pl_LZWDecoder intermediate reset"); + } + this->table.clear(); + this->code_size = 9; + } + else if (code == 257) + { + this->eod = true; + } + else + { + if (this->last_code != 256) + { + // Add to the table from last time. New table entry would + // be what we read last plus the first character of what + // we're reading now. + unsigned char next = '\0'; + unsigned int table_size = table.size(); + if (code < 256) + { + // just read < 256; last time's next was code + next = code; + } + else if (code > 257) + { + unsigned int idx = code - 258; + if (idx > table_size) + { + throw QEXC::General("LZWDecoder: bad code received"); + } + else if (idx == table_size) + { + // The encoder would have just created this entry, + // so the first character of this entry would have + // been the same as the first character of the + // last entry. 
+ QTC::TC("libtests", "Pl_LZWDecoder last was table size"); + next = getFirstChar(this->last_code); + } + else + { + next = getFirstChar(code); + } + } + unsigned int last_idx = 258 + table_size; + if (last_idx == 4095) + { + throw QEXC::General("LZWDecoder: table full"); + } + addToTable(next); + unsigned int change_idx = last_idx + code_change_delta; + if ((change_idx == 511) || + (change_idx == 1023) || + (change_idx == 2047)) + { + ++this->code_size; + } + } + + if (code < 256) + { + unsigned char ch = (unsigned char) code; + getNext()->write(&ch, 1); + } + else + { + Buffer& b = table[code - 258]; + getNext()->write(b.getBuffer(), b.getSize()); + } + } + + this->last_code = code; +} diff --git a/libqpdf/Pl_MD5.cc b/libqpdf/Pl_MD5.cc new file mode 100644 index 00000000..0a2711b8 --- /dev/null +++ b/libqpdf/Pl_MD5.cc @@ -0,0 +1,43 @@ + +#include <qpdf/Pl_MD5.hh> + +#include <qpdf/QEXC.hh> + +Pl_MD5::Pl_MD5(char const* identifier, Pipeline* next) : + Pipeline(identifier, next), + in_progress(false) +{ +} + +Pl_MD5::~Pl_MD5() +{ +} + +void +Pl_MD5::write(unsigned char* buf, int len) +{ + if (! 
this->in_progress) + { + this->md5.reset(); + this->in_progress = true; + } + this->md5.encodeDataIncrementally((char*) buf, len); + this->getNext()->write(buf, len); +} + +void +Pl_MD5::finish() +{ + this->getNext()->finish(); + this->in_progress = false; +} + +std::string +Pl_MD5::getHexDigest() +{ + if (this->in_progress) + { + throw QEXC::General("digest requested for in-progress MD5 Pipeline"); + } + return this->md5.unparse(); +} diff --git a/libqpdf/Pl_PNGFilter.cc b/libqpdf/Pl_PNGFilter.cc new file mode 100644 index 00000000..28b87c5e --- /dev/null +++ b/libqpdf/Pl_PNGFilter.cc @@ -0,0 +1,146 @@ + +#include <qpdf/Pl_PNGFilter.hh> +#include <string.h> + +Pl_PNGFilter::Pl_PNGFilter(char const* identifier, Pipeline* next, + action_e action, unsigned int columns, + unsigned int bytes_per_pixel) : + Pipeline(identifier, next), + action(action), + columns(columns), + cur_row(0), + prev_row(0), + buf1(0), + buf2(0), + pos(0) +{ + this->buf1 = new unsigned char[columns + 1]; + this->buf2 = new unsigned char[columns + 1]; + this->cur_row = buf1; + + // number of bytes per incoming row + this->incoming = (action == a_encode ? columns : columns + 1); +} + +Pl_PNGFilter::~Pl_PNGFilter() +{ + delete [] buf1; + delete [] buf2; +} + +void +Pl_PNGFilter::write(unsigned char* data, int len) +{ + int left = this->incoming - this->pos; + unsigned int offset = 0; + while (len >= left) + { + // finish off current row + memcpy(this->cur_row + this->pos, data + offset, left); + offset += left; + len -= left; + + processRow(); + + // Swap rows + unsigned char* t = this->prev_row; + this->prev_row = this->cur_row; + this->cur_row = t ? 
t : this->buf2; + memset(this->cur_row, 0, this->columns + 1); + left = this->incoming; + this->pos = 0; + } + if (len) + { + memcpy(this->cur_row + this->pos, data + offset, len); + } + this->pos += len; +} + +void +Pl_PNGFilter::processRow() +{ + if (this->action == a_encode) + { + encodeRow(); + } + else + { + decodeRow(); + } +} + +void +Pl_PNGFilter::decodeRow() +{ + int filter = (int) this->cur_row[0]; + if (this->prev_row) + { + switch (filter) + { + case 0: // none + break; + + case 1: // sub + throw Exception("sub filter not implemented"); + break; + + case 2: // up + for (unsigned int i = 1; i <= this->columns; ++i) + { + this->cur_row[i] += this->prev_row[i]; + } + break; + + case 3: // average + throw Exception("average filter not implemented"); + break; + + case 4: // Paeth + throw Exception("Paeth filter not implemented"); + break; + + default: + // ignore + break; + } + } + + getNext()->write(this->cur_row + 1, this->columns); +} + +void +Pl_PNGFilter::encodeRow() +{ + // For now, hard-code to using UP filter. 
+ unsigned char ch = 2; + getNext()->write(&ch, 1); + if (this->prev_row) + { + for (unsigned int i = 0; i < this->columns; ++i) + { + ch = this->cur_row[i] - this->prev_row[i]; + getNext()->write(&ch, 1); + } + } + else + { + getNext()->write(this->cur_row, this->columns); + } +} + +void +Pl_PNGFilter::finish() +{ + if (this->pos) + { + // write partial row + processRow(); + } + this->prev_row = 0; + this->cur_row = buf1; + this->pos = 0; + memset(this->cur_row, 0, this->columns + 1); + + getNext()->finish(); +} diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc new file mode 100644 index 00000000..63f0caaf --- /dev/null +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -0,0 +1,179 @@ + +#include <qpdf/Pl_QPDFTokenizer.hh> +#include <qpdf/QPDF_String.hh> +#include <qpdf/QPDF_Name.hh> + +Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : + Pipeline(identifier, next), + newline_after_next_token(false), + just_wrote_nl(false), + last_char_was_cr(false), + unread_char(false), + char_to_unread('\0'), + pass_through(false) +{ +} + +Pl_QPDFTokenizer::~Pl_QPDFTokenizer() +{ +} + +void +Pl_QPDFTokenizer::writeNext(char const* buf, int len) +{ + if (len) + { + unsigned char* t = new unsigned char[len]; + memcpy(t, buf, len); + getNext()->write(t, len); + delete [] t; + this->just_wrote_nl = (buf[len-1] == '\n'); + } +} + +void +Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) +{ + std::string value = token.getRawValue(); + + switch (token.getType()) + { + case QPDFTokenizer::tt_string: + value = QPDF_String(token.getValue()).unparse(); + break; + + case QPDFTokenizer::tt_name: + value = QPDF_Name(token.getValue()).unparse(); + break; + + default: + break; + } + writeNext(value.c_str(), value.length()); +} + +void +Pl_QPDFTokenizer::processChar(char ch) +{ + if (this->pass_through) + { + // We're not noramlizing anymore -- just write this without + // looking at it. 
+ writeNext(&ch, 1); + return; + } + + tokenizer.presentCharacter(ch); + QPDFTokenizer::Token token; + if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + { + writeToken(token); + if (this->newline_after_next_token) + { + writeNext("\n", 1); + this->newline_after_next_token = false; + } + if ((token.getType() == QPDFTokenizer::tt_word) && + (token.getValue() == "BI")) + { + // Uh oh.... we're not sophisticated enough to handle + // inline images safely. We'd have to to set up all the + // filters and pipe the iamge data through it until the + // filtered output was the right size for an image of the + // specified dimensions. Then we'd either have to write + // out raw image data or continue to write filtered data, + // resuming normalization when we get to the end. + // Insetad, for now, we'll just turn off noramlization for + // the remainder of this stream. + this->pass_through = true; + if (this->unread_char) + { + writeNext(&this->char_to_unread, 1); + this->unread_char = false; + } + } + } + else + { + bool suppress = false; + if ((ch == '\n') && (this->last_char_was_cr)) + { + // Always ignore \n following \r + suppress = true; + } + + if ((this->last_char_was_cr = (ch == '\r'))) + { + ch = '\n'; + } + + if (this->tokenizer.betweenTokens()) + { + if (! suppress) + { + writeNext(&ch, 1); + } + } + else + { + if (ch == '\n') + { + this->newline_after_next_token = true; + } + } + } +} + + +void +Pl_QPDFTokenizer::checkUnread() +{ + if (this->unread_char) + { + processChar(this->char_to_unread); + if (this->unread_char) + { + throw QEXC::Internal("unread_char still true after processing " + "unread character"); + } + } +} + +void +Pl_QPDFTokenizer::write(unsigned char* buf, int len) +{ + checkUnread(); + for (int i = 0; i < len; ++i) + { + processChar(buf[i]); + checkUnread(); + } +} + +void +Pl_QPDFTokenizer::finish() +{ + this->tokenizer.presentEOF(); + if (! 
this->pass_through) + { + QPDFTokenizer::Token token; + if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + { + writeToken(token); + if (unread_char) + { + if (this->char_to_unread == '\r') + { + this->char_to_unread = '\n'; + } + writeNext(&this->char_to_unread, 1); + } + } + } + if (! this->just_wrote_nl) + { + writeNext("\n", 1); + } + + getNext()->finish(); +} diff --git a/libqpdf/Pl_RC4.cc b/libqpdf/Pl_RC4.cc new file mode 100644 index 00000000..74e53c8b --- /dev/null +++ b/libqpdf/Pl_RC4.cc @@ -0,0 +1,57 @@ + +#include <qpdf/Pl_RC4.hh> + +#include <qpdf/QUtil.hh> + +Pl_RC4::Pl_RC4(char const* identifier, Pipeline* next, + unsigned char const* key_data, int key_len, + int out_bufsize) : + Pipeline(identifier, next), + out_bufsize(out_bufsize), + rc4(key_data, key_len) +{ + this->outbuf = new unsigned char[out_bufsize]; +} + +Pl_RC4::~Pl_RC4() +{ + if (this->outbuf) + { + delete [] this->outbuf; + this->outbuf = 0; + } +} + +void +Pl_RC4::write(unsigned char* data, int len) +{ + if (this->outbuf == 0) + { + throw Exception( + this->identifier + + ": Pl_RC4: write() called after finish() called"); + } + + int bytes_left = len; + unsigned char* p = data; + + while (bytes_left > 0) + { + int bytes = (bytes_left < this->out_bufsize ? 
bytes_left : out_bufsize); + bytes_left -= bytes; + rc4.process(p, bytes, outbuf); + p += bytes; + getNext()->write(outbuf, bytes); + } +} + +void +Pl_RC4::finish() +{ + if (this->outbuf) + { + delete [] this->outbuf; + this->outbuf = 0; + } + this->getNext()->finish(); +} diff --git a/libqpdf/Pl_StdioFile.cc b/libqpdf/Pl_StdioFile.cc new file mode 100644 index 00000000..c0f42afd --- /dev/null +++ b/libqpdf/Pl_StdioFile.cc @@ -0,0 +1,48 @@ + +#include <qpdf/Pl_StdioFile.hh> + +#include <errno.h> + +Pl_StdioFile::Pl_StdioFile(char const* identifier, FILE* f) : + Pipeline(identifier, 0), + file(f) +{ +} + +Pl_StdioFile::~Pl_StdioFile() +{ +} + +void +Pl_StdioFile::write(unsigned char* buf, int len) +{ + size_t so_far = 0; + while (len > 0) + { + so_far = fwrite(buf, 1, len, this->file); + if (so_far == 0) + { + throw QEXC::System(this->identifier + ": Pl_StdioFile::write", + errno); + } + else + { + buf += so_far; + len -= so_far; + } + } +} + +void +Pl_StdioFile::finish() +{ + if (fileno(this->file) != -1) + { + fflush(this->file); + } + else + { + throw QEXC::Internal(this->identifier + + ": Pl_StdioFile::finish: stream already closed"); + } +} diff --git a/libqpdf/QEXC.cc b/libqpdf/QEXC.cc new file mode 100644 index 00000000..c65afbb6 --- /dev/null +++ b/libqpdf/QEXC.cc @@ -0,0 +1,67 @@ + +#include <qpdf/QEXC.hh> +#include <string.h> +#include <errno.h> + +QEXC::Base::Base() +{ + // nothing needed +} + +QEXC::Base::Base(std::string const& message) : + message(message) +{ + // nothing needed +} + +std::string const& +QEXC::Base::unparse() const +{ + return this->message; +} + +void +QEXC::Base::setMessage(std::string const& message) +{ + this->message = message; +} + +const char* +QEXC::Base::what() const throw() +{ + // Since unparse() returns a const string reference, its + // implementors must arrange to have it return a reference to a + // string that is not going to disappear. It is therefore safe + // for us to return it's c_str() pointer. 
+ return this->unparse().c_str(); +} + +QEXC::General::General() +{ + // nothing needed +} + +QEXC::General::General(std::string const& message) : + Base(message) +{ + // nothing needed +} + +QEXC::System::System(std::string const& prefix, int sys_errno) +{ + // Note: using sys_errno in case errno is a macro. + this->sys_errno = sys_errno; + this->setMessage(prefix + ": " + strerror(sys_errno)); +} + +int +QEXC::System::getErrno() const +{ + return this->sys_errno; +} + +QEXC::Internal::Internal(std::string const& message) : + Base("INTERNAL ERROR: " + message) +{ + // nothing needed +} diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc new file mode 100644 index 00000000..6f51fa2c --- /dev/null +++ b/libqpdf/QPDF.cc @@ -0,0 +1,1851 @@ + +#include <qpdf/QPDF.hh> + +#include <vector> +#include <map> +#include <string.h> +#include <memory.h> + +#include <qpdf/QTC.hh> +#include <qpdf/QUtil.hh> +#include <qpdf/PCRE.hh> +#include <qpdf/Pipeline.hh> + +#include <qpdf/QPDFExc.hh> +#include <qpdf/QPDF_Null.hh> +#include <qpdf/QPDF_Dictionary.hh> + +void +QPDF::InputSource::setLastOffset(off_t offset) +{ + this->last_offset = offset; +} + +off_t +QPDF::InputSource::getLastOffset() const +{ + return this->last_offset; +} + +std::string +QPDF::InputSource::readLine() +{ + // Read a line terminated by one or more \r or \n characters + // without caring what the exact terminator is. Consume the + // trailing newline characters but don't return them. 
+ + off_t offset = this->tell(); + std::string buf; + enum { st_before_nl, st_at_nl } state = st_before_nl; + char ch; + while (1) + { + size_t len = this->read(&ch, 1); + if (len == 0) + { + break; + } + + if (state == st_before_nl) + { + if ((ch == '\012') || (ch == '\015')) + { + state = st_at_nl; + } + else + { + buf += ch; + } + } + else if (state == st_at_nl) + { + if ((ch == '\012') || (ch == '\015')) + { + // do nothing + } + else + { + // unread this character + this->unreadCh(ch); + break; + } + } + } + // Override last offset to be where we started this line rather + // than before the last character read + this->last_offset = offset; + return buf; +} + +QPDF::FileInputSource::FileInputSource() : + file(0) +{ +} + +void +QPDF::FileInputSource::setFilename(char const* filename) +{ + destroy(); + this->filename = filename; + this->file = QUtil::fopen_wrapper(std::string("open ") + this->filename, + fopen(this->filename.c_str(), "rb")); +} + +QPDF::FileInputSource::~FileInputSource() +{ + destroy(); +} + +void +QPDF::FileInputSource::destroy() +{ + if (this->file) + { + fclose(this->file); + this->file = 0; + } +} + +std::string const& +QPDF::FileInputSource::getName() const +{ + return this->filename; +} + +off_t +QPDF::FileInputSource::tell() +{ + return ftell(this->file); +} + +void +QPDF::FileInputSource::seek(off_t offset, int whence) +{ + QUtil::os_wrapper(std::string("seek to ") + this->filename + ", offset " + + QUtil::int_to_string(offset) + " (" + + QUtil::int_to_string(whence) + ")", + fseek(this->file, offset, whence)); +} + +void +QPDF::FileInputSource::rewind() +{ + ::rewind(this->file); +} + +size_t +QPDF::FileInputSource::read(char* buffer, int length) +{ + this->last_offset = ftell(this->file); + size_t len = fread(buffer, 1, length, this->file); + if ((len == 0) && ferror(this->file)) + { + throw QPDFExc(this->filename, this->last_offset, + std::string("read ") + + QUtil::int_to_string(length) + " bytes"); + } + return len; +} + +void 
+QPDF::FileInputSource::unreadCh(char ch) +{ + QUtil::os_wrapper(this->filename + ": unread character", + ungetc((unsigned char)ch, this->file)); +} + +QPDF::BufferInputSource::BufferInputSource(std::string const& description, + Buffer* buf) : + description(description), + buf(buf), + cur_offset(0) +{ +} + +QPDF::BufferInputSource::~BufferInputSource() +{ +} + +std::string const& +QPDF::BufferInputSource::getName() const +{ + return this->description; +} + +off_t +QPDF::BufferInputSource::tell() +{ + return this->cur_offset; +} + +void +QPDF::BufferInputSource::seek(off_t offset, int whence) +{ + switch (whence) + { + case SEEK_SET: + this->cur_offset = offset; + break; + + case SEEK_END: + this->cur_offset = this->buf->getSize() - offset; + break; + + case SEEK_CUR: + this->cur_offset += offset; + break; + + default: + throw QEXC::Internal("invalid argument to BufferInputSource::seek"); + break; + } +} + +void +QPDF::BufferInputSource::rewind() +{ + this->cur_offset = 0; +} + +size_t +QPDF::BufferInputSource::read(char* buffer, int length) +{ + off_t end_pos = this->buf->getSize(); + if (this->cur_offset >= end_pos) + { + this->last_offset = end_pos; + return 0; + } + + this->last_offset = this->cur_offset; + size_t len = std::min((int)(end_pos - this->cur_offset), length); + memcpy(buffer, buf->getBuffer() + this->cur_offset, len); + this->cur_offset += len; + return len; +} + +void +QPDF::BufferInputSource::unreadCh(char ch) +{ + if (this->cur_offset > 0) + { + --this->cur_offset; + } +} + +QPDF::ObjGen::ObjGen(int o = 0, int g = 0) : + obj(o), + gen(g) +{ +} + +bool +QPDF::ObjGen::ObjGen::operator<(ObjGen const& rhs) const +{ + return ((this->obj < rhs.obj) || + ((this->obj == rhs.obj) && (this->gen < rhs.gen))); +} + +QPDF::QPDF() : + encrypted(false), + encryption_initialized(false), + ignore_xref_streams(false), + suppress_warnings(false), + attempt_recovery(true), + cached_key_objid(0), + cached_key_generation(0), + first_xref_item_offset(0), + 
uncompressed_after_compressed(false) +{ +} + +QPDF::~QPDF() +{ +} + +void +QPDF::processFile(char const* filename, char const* password) +{ + this->file.setFilename(filename); + this->provided_password = password; + parse(); +} + +void +QPDF::setIgnoreXRefStreams(bool val) +{ + this->ignore_xref_streams = val; +} + +void +QPDF::setSuppressWarnings(bool val) +{ + this->suppress_warnings = val; +} + +void +QPDF::setAttemptRecovery(bool val) +{ + this->attempt_recovery = val; +} + +std::vector<std::string> +QPDF::getWarnings() +{ + std::vector<std::string> result = this->warnings; + this->warnings.clear(); + return result; +} + +void +QPDF::parse() +{ + static PCRE header_re("^%PDF-(1.\\d+)\\b"); + static PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)"); + + std::string line = this->file.readLine(); + PCRE::Match m1 = header_re.match(line.c_str()); + if (m1) + { + this->pdf_version = m1.getMatch(1); + if (atof(this->pdf_version.c_str()) < 1.2) + { + this->tokenizer.allowPoundAnywhereInName(); + } + } + else + { + QTC::TC("qpdf", "QPDF not a pdf file"); + throw QPDFExc(this->file.getName(), 0, "not a PDF file"); + } + + // PDF spec says %%EOF must be found within the last 1024 bytes of + // the file. We add an extra 30 characters to leave room for the + // startxref stuff. + static int const tbuf_size = 1054; + this->file.seek(0, SEEK_END); + if (this->file.tell() > tbuf_size) + { + this->file.seek(-tbuf_size, SEEK_END); + } + else + { + this->file.rewind(); + } + char* buf = new char[tbuf_size + 1]; + // Put buf in a PointerHolder to guarantee deletion of buf. This + // calls delete rather than delete [], but it's okay since buf is + // an array of fundamental types. + PointerHolder<char> b(buf); + memset(buf, '\0', tbuf_size + 1); + this->file.read(buf, tbuf_size); + + // Since buf may contain null characters, we can't do a regexp + // search on buf directly. Find the last occurrence within buf + // where the regexp matches. 
+ char* p = buf; + char const* candidate = ""; + while ((p = (char*)memchr(p, 's', tbuf_size - (p - buf))) != 0) + { + if (eof_re.match(p)) + { + candidate = p; + } + ++p; + } + + try + { + PCRE::Match m2 = eof_re.match(candidate); + if (! m2) + { + QTC::TC("qpdf", "QPDF can't find startxref"); + throw QPDFExc(this->file.getName() + ": can't find startxref"); + } + off_t xref_offset = atoi(m2.getMatch(1).c_str()); + read_xref(xref_offset); + } + catch (QPDFExc& e) + { + if (this->attempt_recovery) + { + reconstruct_xref(e); + QTC::TC("qpdf", "QPDF reconstructed xref table"); + } + else + { + throw e; + } + } + + initializeEncryption(); +} + +void +QPDF::warn(QPDFExc const& e) +{ + this->warnings.push_back(e.unparse()); + if (! this->suppress_warnings) + { + std::cerr << "WARNING: " << this->warnings.back() << std::endl; + } +} + +void +QPDF::setTrailer(QPDFObjectHandle obj) +{ + if (this->trailer.isInitialized()) + { + return; + } + this->trailer = obj; +} + +void +QPDF::reconstruct_xref(QPDFExc& e) +{ + static PCRE obj_re("^(\\d+) (\\d+) obj\\b"); + static PCRE endobj_re("^endobj\\b"); + static PCRE trailer_re("^trailer\\b"); + + warn(QPDFExc(this->file.getName(), 0, "file is damaged")); + warn(e); + warn(QPDFExc("Attempting to reconstruct cross-reference table")); + + this->file.seek(0, SEEK_END); + off_t eof = this->file.tell(); + this->file.seek(0, SEEK_SET); + bool in_obj = false; + while (this->file.tell() < eof) + { + std::string line = this->file.readLine(); + if (in_obj) + { + if (endobj_re.match(line.c_str())) + { + in_obj = false; + } + } + else + { + PCRE::Match m = obj_re.match(line.c_str()); + if (m) + { + in_obj = true; + int obj = atoi(m.getMatch(1).c_str()); + int gen = atoi(m.getMatch(2).c_str()); + int offset = this->file.getLastOffset(); + insertXrefEntry(obj, 1, offset, gen); + } + else if ((! 
this->trailer.isInitialized()) && + trailer_re.match(line.c_str())) + { + // read "trailer" + this->file.seek(this->file.getLastOffset(), SEEK_SET); + readToken(&this->file); + QPDFObjectHandle t = readObject(&this->file, 0, 0, false); + if (! t.isDictionary()) + { + // Oh well. It was worth a try. + } + else + { + setTrailer(t); + } + } + } + } + + if (! this->trailer.isInitialized()) + { + // We could check the last encountered object to see if it was + // an xref stream. If so, we could try to get the trailer + // from there. This may make it possible to recover files + // with bad startxref pointers even when they have object + // streams. + + throw QPDFExc(this->file.getName() + ": unable to find trailer " + "dictionary while recovering damanged file"); + } + + // We could iterate through the objects looking for streams and + // try to find objects inside of them, but it's probably not worth + // the trouble. Acrobat can't recover files with any errors in an + // xref stream, and this would be a real long shot anyway. If we + // wanted to do anything that involved looking at stream contents, + // we'd also have to call initializeEncryption() here. It's safe + // to call it more than once. +} + +void +QPDF::read_xref(off_t xref_offset) +{ + std::map<int, int> free_table; + while (xref_offset) + { + this->file.seek(xref_offset, SEEK_SET); + std::string line = this->file.readLine(); + if (line == "xref") + { + xref_offset = read_xrefTable(this->file.tell()); + } + else + { + xref_offset = read_xrefStream(xref_offset); + } + } + + int size = this->trailer.getKey("/Size").getIntValue(); + int max_obj = (*(xref_table.rbegin())).first.obj; + if (! 
this->deleted_objects.empty()) + { + max_obj = std::max(max_obj, *(this->deleted_objects.rbegin())); + } + if (size != max_obj + 1) + { + QTC::TC("qpdf", "QPDF xref size mismatch"); + warn(QPDFExc(this->file.getName() + + std::string(": reported number of objects (") + + QUtil::int_to_string(size) + + ") inconsistent with actual number of objects (" + + QUtil::int_to_string(max_obj + 1) + ")")); + } + + // We no longer need the deleted_objects table, so go ahead and + // clear it out to make sure we never depend on its being set. + this->deleted_objects.clear(); +} + +int +QPDF::read_xrefTable(off_t xref_offset) +{ + static PCRE xref_first_re("^(\\d+)\\s+(\\d+)"); + static PCRE xref_entry_re("(?s:(^\\d{10}) (\\d{5}) ([fn])[ \r\n]{2}$)"); + + std::vector<ObjGen> deleted_items; + + this->file.seek(xref_offset, SEEK_SET); + bool done = false; + while (! done) + { + std::string line = this->file.readLine(); + PCRE::Match m1 = xref_first_re.match(line.c_str()); + if (! m1) + { + QTC::TC("qpdf", "QPDF invalid xref"); + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "xref syntax invalid"); + } + int obj = atoi(m1.getMatch(1).c_str()); + int num = atoi(m1.getMatch(2).c_str()); + static int const xref_entry_size = 20; + char xref_entry[xref_entry_size + 1]; + for (int i = obj; i < obj + num; ++i) + { + if (i == 0) + { + // This is needed by checkLinearization() + this->first_xref_item_offset = this->file.tell(); + } + memset(xref_entry, 0, sizeof(xref_entry)); + this->file.read(xref_entry, xref_entry_size); + PCRE::Match m2 = xref_entry_re.match(xref_entry); + if (! 
m2) + { + QTC::TC("qpdf", "QPDF invalid xref entry"); + throw QPDFExc( + this->file.getName(), this->file.getLastOffset(), + "invalid xref entry (obj=" + + QUtil::int_to_string(i) + ")"); + } + + int f1 = atoi(m2.getMatch(1).c_str()); + int f2 = atoi(m2.getMatch(2).c_str()); + char type = m2.getMatch(3)[0]; + if (type == 'f') + { + // Save deleted items until after we've checked the + // XRefStm, if any. + deleted_items.push_back(ObjGen(i, f2)); + } + else + { + insertXrefEntry(i, 1, f1, f2); + } + } + off_t pos = this->file.tell(); + QPDFTokenizer::Token t = readToken(&this->file); + if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "trailer")) + { + done = true; + } + else + { + this->file.seek(pos, SEEK_SET); + } + } + + // Set offset to previous xref table if any + QPDFObjectHandle cur_trailer = readObject(&this->file, 0, 0, false); + if (! cur_trailer.isDictionary()) + { + QTC::TC("qpdf", "QPDF missing trailer"); + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "expected trailer dictionary"); + } + + if (! this->trailer.isInitialized()) + { + setTrailer(cur_trailer); + + if (! this->trailer.hasKey("/Size")) + { + QTC::TC("qpdf", "QPDF trailer lacks size"); + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "trailer dictionary lacks /Size key"); + } + if (! this->trailer.getKey("/Size").isInteger()) + { + QTC::TC("qpdf", "QPDF trailer size not integer"); + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "/Size key in trailer dictionary is not " + "an integer"); + } + } + + if (cur_trailer.hasKey("/XRefStm")) + { + if (this->ignore_xref_streams) + { + QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer"); + } + else + { + if (cur_trailer.getKey("/XRefStm").isInteger()) + { + // Read the xref stream but disregard any return value + // -- we'll use our trailer's /Prev key instead of the + // xref stream's. 
+ (void) read_xrefStream( + cur_trailer.getKey("/XRefStm").getIntValue()); + } + else + { + throw QPDFExc(this->file.getName(), xref_offset, + "invalid /XRefStm"); + } + } + } + + // Handle any deleted items now that we've read the /XRefStm. + for (std::vector<ObjGen>::iterator iter = deleted_items.begin(); + iter != deleted_items.end(); ++iter) + { + ObjGen& og = *iter; + insertXrefEntry(og.obj, 0, 0, og.gen); + } + + if (cur_trailer.hasKey("/Prev")) + { + if (! cur_trailer.getKey("/Prev").isInteger()) + { + QTC::TC("qpdf", "QPDF trailer prev not integer"); + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "/Prev key in trailer dictionary is not " + "an integer"); + } + QTC::TC("qpdf", "QPDF prev key in trailer dictionary"); + xref_offset = cur_trailer.getKey("/Prev").getIntValue(); + } + else + { + xref_offset = 0; + } + + return xref_offset; +} + +int +QPDF::read_xrefStream(off_t xref_offset) +{ + bool found = false; + if (! this->ignore_xref_streams) + { + int xobj; + int xgen; + QPDFObjectHandle xref_obj; + try + { + xref_obj = readObjectAtOffset(xref_offset, 0, 0, xobj, xgen); + } + catch (QPDFExc& e) + { + // ignore -- report error below + } + if (xref_obj.isInitialized() && + xref_obj.isStream() && + xref_obj.getDict().getKey("/Type").isName() && + xref_obj.getDict().getKey("/Type").getName() == "/XRef") + { + QTC::TC("qpdf", "QPDF found xref stream"); + found = true; + xref_offset = processXRefStream(xref_offset, xref_obj); + } + } + + if (! found) + { + QTC::TC("qpdf", "QPDF can't find xref"); + throw QPDFExc(this->file.getName(), xref_offset, "xref not found"); + } + + return xref_offset; +} + +int +QPDF::processXRefStream(off_t xref_offset, QPDFObjectHandle& xref_obj) +{ + QPDFObjectHandle dict = xref_obj.getDict(); + QPDFObjectHandle W_obj = dict.getKey("/W"); + QPDFObjectHandle Index_obj = dict.getKey("/Index"); + if (! 
(W_obj.isArray() && + (W_obj.getArrayNItems() >= 3) && + W_obj.getArrayItem(0).isInteger() && + W_obj.getArrayItem(1).isInteger() && + W_obj.getArrayItem(2).isInteger() && + dict.getKey("/Size").isInteger() && + (Index_obj.isArray() || Index_obj.isNull()))) + { + throw QPDFExc(this->file.getName(), xref_offset, + "Cross-reference stream does not have" + " proper /W and /Index keys"); + } + std::vector<int> indx; + if (Index_obj.isArray()) + { + int n_index = Index_obj.getArrayNItems(); + if ((n_index % 2) || (n_index < 2)) + { + throw QPDFExc(this->file.getName(), xref_offset, + "Cross-reference stream's /Index has an" + " invalid number of values"); + } + for (int i = 0; i < n_index; ++i) + { + if (Index_obj.getArrayItem(i).isInteger()) + { + indx.push_back(Index_obj.getArrayItem(i).getIntValue()); + } + else + { + throw QPDFExc(this->file.getName(), xref_offset, + "Cross-reference stream's /Index's item " + + QUtil::int_to_string(i) + + " is not an integer"); + } + } + QTC::TC("qpdf", "QPDF xref /Index is array", + n_index == 2 ? 
0 : 1); + } + else + { + QTC::TC("qpdf", "QPDF xref /Index is null"); + int size = dict.getKey("/Size").getIntValue(); + indx.push_back(0); + indx.push_back(size); + } + + int num_entries = 0; + for (unsigned int i = 1; i < indx.size(); i += 2) + { + num_entries += indx[i]; + } + + int W[3]; + int entry_size = 0; + for (int i = 0; i < 3; ++i) + { + W[i] = W_obj.getArrayItem(i).getIntValue(); + entry_size += W[i]; + } + + int expected_size = entry_size * num_entries; + + PointerHolder<Buffer> bp = xref_obj.getStreamData(); + int actual_size = bp.getPointer()->getSize(); + + if (expected_size != actual_size) + { + throw QPDFExc(this->file.getName(), xref_offset, + "Cross-reference stream data has the wrong size;" + " expected = " + QUtil::int_to_string(expected_size) + + "; actual = " + QUtil::int_to_string(actual_size)); + } + + int cur_chunk = 0; + int chunk_count = 0; + + bool saw_first_compressed_object = false; + + unsigned char const* data = bp.getPointer()->getBuffer(); + for (int i = 0; i < num_entries; ++i) + { + // Read this entry + unsigned char const* entry = data + (entry_size * i); + int fields[3]; + unsigned char const* p = entry; + for (int j = 0; j < 3; ++j) + { + fields[j] = 0; + if ((j == 0) && (W[0] == 0)) + { + QTC::TC("qpdf", "QPDF default for xref stream field 0"); + fields[0] = 1; + } + for (int k = 0; k < W[j]; ++k) + { + fields[j] <<= 8; + fields[j] += (int)(*p++); + } + } + + // Get the object and generation number. The object number is + // based on /Index. The generation number is 0 unless this is + // an uncompressed object record, in which case the generation + // number appears as the third field. 
+ int obj = indx[cur_chunk] + chunk_count; + ++chunk_count; + if (chunk_count >= indx[cur_chunk + 1]) + { + cur_chunk += 2; + chunk_count = 0; + } + + if (saw_first_compressed_object) + { + if (fields[0] != 2) + { + this->uncompressed_after_compressed = true; + } + } + else if (fields[0] == 2) + { + saw_first_compressed_object = true; + } + if (obj == 0) + { + // This is needed by checkLinearization() + this->first_xref_item_offset = xref_offset; + } + insertXrefEntry(obj, fields[0], fields[1], fields[2]); + } + + if (! this->trailer.isInitialized()) + { + setTrailer(dict); + } + + if (dict.hasKey("/Prev")) + { + if (! dict.getKey("/Prev").isInteger()) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "/Prev key in xref stream dictionary is not " + "an integer"); + } + QTC::TC("qpdf", "QPDF prev key in xref stream dictionary"); + xref_offset = dict.getKey("/Prev").getIntValue(); + } + else + { + xref_offset = 0; + } + + return xref_offset; +} + +void +QPDF::insertXrefEntry(int obj, int f0, int f1, int f2) +{ + // Populate the xref table in such a way that the first reference + // to an object that we see, which is the one in the latest xref + // table in which it appears, is the one that gets stored. + + // If there is already an entry for this object and generation in + // the table, it means that a later xref table has registered this + // object. Disregard this one. + { // private scope + int gen = (f0 == 2 ? 0 : f2); + ObjGen og(obj, gen); + if (this->xref_table.count(og)) + { + QTC::TC("qpdf", "QPDF xref reused object"); + return; + } + if (this->deleted_objects.count(obj)) + { + QTC::TC("qpdf", "QPDF xref deleted object"); + return; + } + } + + switch (f0) + { + case 0: + this->deleted_objects.insert(obj); + break; + + case 1: + // f2 is generation + QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 
1 : 0)); + this->xref_table[ObjGen(obj, f2)] = QPDFXRefEntry(f0, f1, f2); + break; + + case 2: + this->xref_table[ObjGen(obj, 0)] = QPDFXRefEntry(f0, f1, f2); + break; + + default: + throw QPDFExc(this->file.getName(), 0, + "unknown xref stream entry type " + + QUtil::int_to_string(f0)); + break; + } +} + +void +QPDF::showXRefTable() +{ + for (std::map<ObjGen, QPDFXRefEntry>::iterator iter = + this->xref_table.begin(); + iter != this->xref_table.end(); ++iter) + { + ObjGen const& og = (*iter).first; + QPDFXRefEntry const& entry = (*iter).second; + std::cout << og.obj << "/" << og.gen << ": "; + switch (entry.getType()) + { + case 1: + std::cout << "uncompressed; offset = " << entry.getOffset(); + break; + + case 2: + std::cout << "compressed; stream = " << entry.getObjStreamNumber() + << ", index = " << entry.getObjStreamIndex(); + break; + + default: + throw QEXC::Internal("unknown cross-reference table type while" + " showing xref_table"); + break; + } + std::cout << std::endl; + } +} + +QPDFObjectHandle +QPDF::readObject(InputSource* input, int objid, int generation, + bool in_object_stream) +{ + off_t offset = input->tell(); + QPDFObjectHandle object = readObjectInternal( + input, objid, generation, in_object_stream, false, false); + // Override last_offset so that it points to the beginning of the + // object we just read + input->setLastOffset(offset); + return object; +} + +QPDFObjectHandle +QPDF::readObjectInternal(InputSource* input, + int objid, int generation, + bool in_object_stream, + bool in_array, bool in_dictionary) +{ + if (in_dictionary && in_array) + { + // Although dictionaries and arrays arbitrarily nest, these + // variables indicate what is at the top of the stack right + // now, so they can, by definition, never both be true. + throw QEXC::Internal("readObjectInternal: in_dict && in_array"); + } + + QPDFObjectHandle object; + + off_t offset = input->tell(); + std::vector<QPDFObjectHandle> olist; + bool done = false; + while (! 
done) + { + object = QPDFObjectHandle(); + + QPDFTokenizer::Token token = readToken(input); + + switch (token.getType()) + { + case QPDFTokenizer::tt_brace_open: + case QPDFTokenizer::tt_brace_close: + // Don't know what to do with these for now + QTC::TC("qpdf", "QPDF bad brace"); + throw QPDFExc(input->getName(), input->getLastOffset(), + "unexpected brace token"); + break; + + case QPDFTokenizer::tt_array_close: + if (in_array) + { + done = true; + } + else + { + QTC::TC("qpdf", "QPDF bad array close"); + throw QPDFExc(input->getName(), input->getLastOffset(), + "unexpected array close token"); + } + break; + + case QPDFTokenizer::tt_dict_close: + if (in_dictionary) + { + done = true; + } + else + { + QTC::TC("qpdf", "QPDF bad dictionary close"); + throw QPDFExc(input->getName(), input->getLastOffset(), + "unexpected dictionary close token"); + } + break; + + case QPDFTokenizer::tt_array_open: + object = readObjectInternal( + input, objid, generation, in_object_stream, true, false); + break; + + case QPDFTokenizer::tt_dict_open: + object = readObjectInternal( + input, objid, generation, in_object_stream, false, true); + break; + + case QPDFTokenizer::tt_bool: + object = QPDFObjectHandle::newBool( + (token.getValue() == "true")); + break; + + case QPDFTokenizer::tt_null: + object = QPDFObjectHandle::newNull(); + break; + + case QPDFTokenizer::tt_integer: + object = QPDFObjectHandle::newInteger( + atoi(token.getValue().c_str())); + break; + + case QPDFTokenizer::tt_real: + object = QPDFObjectHandle::newReal(token.getValue()); + break; + + case QPDFTokenizer::tt_name: + object = QPDFObjectHandle::newName(token.getValue()); + break; + + case QPDFTokenizer::tt_word: + { + std::string const& value = token.getValue(); + if ((value == "R") && (in_array || in_dictionary) && + (olist.size() >= 2) && + (olist[olist.size() - 1].isInteger()) && + (olist[olist.size() - 2].isInteger())) + { + // Try to resolve indirect objects + object = QPDFObjectHandle::Factory::newIndirect( 
+ this, + olist[olist.size() - 2].getIntValue(), + olist[olist.size() - 1].getIntValue()); + olist.pop_back(); + olist.pop_back(); + } + else + { + throw QPDFExc(input->getName(), input->getLastOffset(), + "unknown token while reading object (" + + value + ")"); + } + } + break; + + case QPDFTokenizer::tt_string: + { + std::string val = token.getValue(); + if (this->encrypted && (! in_object_stream)) + { + decryptString(val, objid, generation); + } + object = QPDFObjectHandle::newString(val); + } + break; + + default: + throw QPDFExc(input->getName(), input->getLastOffset(), + "unknown token type while reading object"); + break; + } + + if (in_dictionary || in_array) + { + if (! done) + { + olist.push_back(object); + } + } + else if (! object.isInitialized()) + { + throw QEXC::Internal(std::string("uninitialized object (token = ") + + QUtil::int_to_string(token.getType()) + + ", " + token.getValue() + ")"); + } + else + { + done = true; + } + } + + if (in_array) + { + object = QPDFObjectHandle::newArray(olist); + } + else if (in_dictionary) + { + // Convert list to map. Alternating elements are keys. + std::map<std::string, QPDFObjectHandle> dict; + if (olist.size() % 2) + { + QTC::TC("qpdf", "QPDF dictionary odd number of elements"); + throw QPDFExc( + input->getName(), input->getLastOffset(), + "dictionary ending here has an odd number of elements"); + } + for (unsigned int i = 0; i < olist.size(); i += 2) + { + QPDFObjectHandle key_obj = olist[i]; + QPDFObjectHandle val = olist[i + 1]; + if (! key_obj.isName()) + { + throw QPDFExc( + input->getName(), offset, + std::string("dictionary key not name (") + + key_obj.unparse() + ")"); + } + dict[key_obj.getName()] = val; + } + object = QPDFObjectHandle::newDictionary(dict); + + if (! in_object_stream) + { + // check for stream + off_t cur_offset = input->tell(); + if (readToken(input) == + QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")) + { + // Kill to next actual newline. 
Do not use readLine() + // here -- streams are a special case. The next + // single newline character marks the end of the + // stream token. It is incorrect to strip subsequent + // carriage returns or newlines as they may be part of + // the stream. + { + char ch; + do + { + if (input->read(&ch, 1) == 0) + { + // A premature EOF here will result in + // some other problem that will get + // reported at another time. + ch = '\n'; + } + } while (ch != '\n'); + } + + // Must get offset before accessing any additional + // objects since resolving a previously unresolved + // indirect object will change file position. + off_t stream_offset = input->tell(); + int length = 0; + + try + { + if (dict.count("/Length") == 0) + { + QTC::TC("qpdf", "QPDF stream without length"); + throw QPDFExc(input->getName(), offset, + "stream dictionary lacks /Length key"); + } + + QPDFObjectHandle length_obj = dict["/Length"]; + if (! length_obj.isInteger()) + { + QTC::TC("qpdf", "QPDF stream length not integer"); + throw QPDFExc(input->getName(), offset, + "/Length key in stream dictionary is not " + "an integer"); + } + + length = length_obj.getIntValue(); + input->seek(stream_offset + length, SEEK_SET); + if (! 
(readToken(input) == + QPDFTokenizer::Token( + QPDFTokenizer::tt_word, "endstream"))) + { + QTC::TC("qpdf", "QPDF missing endstream"); + throw QPDFExc(input->getName(), input->getLastOffset(), + "expected endstream"); + } + } + catch (QPDFExc& e) + { + if (this->attempt_recovery) + { + // may throw an exception + length = recoverStreamLength( + input, objid, generation, stream_offset); + } + else + { + throw e; + } + } + object = QPDFObjectHandle::Factory::newStream( + this, objid, generation, object, stream_offset, length); + } + else + { + input->seek(cur_offset, SEEK_SET); + } + } + } + + return object; +} + +int +QPDF::recoverStreamLength(InputSource* input, + int objid, int generation, off_t stream_offset) +{ + static PCRE endobj_re("^endobj\\b"); + + // Try to reconstruct stream length by looking for + // endstream(\r\n?|\n)endobj + warn(QPDFExc(input->getName(), stream_offset, + "attempting to recover stream length")); + + input->seek(0, SEEK_END); + off_t eof = input->tell(); + input->seek(stream_offset, SEEK_SET); + std::string last_line; + off_t last_line_offset = 0; + int length = 0; + while (input->tell() < eof) + { + std::string line = input->readLine(); + // Can't use regexp last_line since it might contain nulls + if (endobj_re.match(line.c_str()) && + (last_line.length() >= 9) && + (last_line.substr(last_line.length() - 9, 9) == "endstream")) + { + // Stream probably ends right before "endstream", which + // contains 9 characters. + length = last_line_offset + last_line.length() - 9 - stream_offset; + // Go back to where we would have been if we had just read + // the endstream. 
+ input->seek(input->getLastOffset(), SEEK_SET); + break; + } + last_line = line; + last_line_offset = input->getLastOffset(); + } + + if (length) + { + int this_obj_offset = 0; + ObjGen this_obj(0, 0); + + // Make sure this is inside this object + for (std::map<ObjGen, QPDFXRefEntry>::iterator iter = + this->xref_table.begin(); + iter != this->xref_table.end(); ++iter) + { + ObjGen const& og = (*iter).first; + QPDFXRefEntry const& entry = (*iter).second; + if (entry.getType() == 1) + { + int obj_offset = entry.getOffset(); + if ((obj_offset > stream_offset) && + ((this_obj_offset == 0) || + (this_obj_offset > obj_offset))) + { + this_obj_offset = obj_offset; + this_obj = og; + } + } + } + if (this_obj_offset && + (this_obj.obj == objid) && + (this_obj.gen == generation)) + { + // Well, we found endstream\nendobj within the space + // allowed for this object, so we're probably in good + // shape. + } + else + { + QTC::TC("qpdf", "QPDF found wrong endstream in recovery"); + } + } + + if (length == 0) + { + throw QPDFExc(input->getName(), stream_offset, + "unable to recover stream data"); + } + + QTC::TC("qpdf", "QPDF recovered stream length"); + return length; +} + +QPDFTokenizer::Token +QPDF::readToken(InputSource* input) +{ + off_t offset = input->tell(); + QPDFTokenizer::Token token; + bool unread_char; + char char_to_unread; + while (! 
this->tokenizer.getToken(token, unread_char, char_to_unread)) + { + char ch; + if (input->read(&ch, 1) == 0) + { + throw QPDFExc(input->getName(), offset, "EOF while reading token"); + } + else + { + if (isspace(ch) && (input->getLastOffset() == offset)) + { + ++offset; + } + this->tokenizer.presentCharacter(ch); + } + } + + if (unread_char) + { + input->unreadCh(char_to_unread); + } + + if (token.getType() == QPDFTokenizer::tt_bad) + { + throw QPDFExc(input->getName(), offset, token.getErrorMessage()); + } + + input->setLastOffset(offset); + + return token; +} + +QPDFObjectHandle +QPDF::readObjectAtOffset(off_t offset, int exp_objid, int exp_generation, + int& objid, int& generation) +{ + this->file.seek(offset, SEEK_SET); + + QPDFTokenizer::Token tobjid = readToken(&this->file); + QPDFTokenizer::Token tgen = readToken(&this->file); + QPDFTokenizer::Token tobj = readToken(&this->file); + + bool objidok = (tobjid.getType() == QPDFTokenizer::tt_integer); + int genok = (tgen.getType() == QPDFTokenizer::tt_integer); + int objok = (tobj == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj")); + + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0); + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0); + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0); + + try + { + if (! (objidok && genok && objok)) + { + QTC::TC("qpdf", "QPDF expected n n obj"); + throw QPDFExc(this->file.getName(), offset, "expected n n obj"); + } + objid = atoi(tobjid.getValue().c_str()); + generation = atoi(tgen.getValue().c_str()); + + if (exp_objid && + (! 
((objid == exp_objid) && (generation == exp_generation)))) + { + QTC::TC("qpdf", "QPDF err wrong objid/generation"); + throw QPDFExc(this->file.getName(), offset, + std::string("expected ") + + QUtil::int_to_string(exp_objid) + " " + + QUtil::int_to_string(exp_generation) + " obj"); + } + } + catch (QPDFExc& e) + { + if (exp_objid && this->attempt_recovery) + { + // Try again after reconstructing xref table + reconstruct_xref(e); + ObjGen og(exp_objid, exp_generation); + if (this->xref_table.count(og) && + (this->xref_table[og].getType() == 1)) + { + off_t new_offset = this->xref_table[og].getOffset(); + // Call readObjectAtOffset with 0 for exp_objid to + // avoid an infinite loop. + QPDFObjectHandle result = + readObjectAtOffset(new_offset, 0, 0, objid, generation); + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset"); + return result; + } + } + else + { + throw e; + } + } + + QPDFObjectHandle oh = readObject( + &this->file, objid, generation, false); + + if (! (readToken(&this->file) == + QPDFTokenizer::Token(QPDFTokenizer::tt_word, "endobj"))) + { + QTC::TC("qpdf", "QPDF err expected endobj"); + warn(QPDFExc(this->file.getName(), this->file.getLastOffset(), + "expected endobj")); + } + + ObjGen og(objid, generation); + if (! this->obj_cache.count(og)) + { + // Store the object in the cache here so it gets cached + // whether we first know the offset or whether we first know + // the object ID and generation (in which we case we would get + // here through resolve). + + // Determine the end offset of this object before and after + // white space. We use these numbers to validate + // linearization hint tables. Offsets and lengths of objects + // may imply the end of an object to be anywhere between these + // values. + off_t end_before_space = this->file.tell(); + + // skip over spaces + while (true) + { + char ch; + if (this->file.read(&ch, 1)) + { + if (! 
isspace(ch)) + { + this->file.seek(-1, SEEK_CUR); + break; + } + } + else + { + throw QPDFExc(this->file.getName(), offset, + "EOF after endobj"); + } + } + off_t end_after_space = this->file.tell(); + + this->obj_cache[og] = + ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), + end_before_space, end_after_space); + } + + return oh; +} + +PointerHolder<QPDFObject> +QPDF::resolve(int objid, int generation) +{ + // Check object cache before checking xref table. This allows us + // to insert things into the object cache that don't actually + // exist in the file. + ObjGen og(objid, generation); + if (! this->obj_cache.count(og)) + { + if (! this->xref_table.count(og)) + { + // PDF spec says unknown objects resolve to the null object. + return new QPDF_Null; + } + + QPDFXRefEntry const& entry = this->xref_table[og]; + switch (entry.getType()) + { + case 1: + { + off_t offset = entry.getOffset(); + // Object stored in cache by readObjectAtOffset + int aobjid; + int ageneration; + QPDFObjectHandle oh = + readObjectAtOffset(offset, objid, generation, + aobjid, ageneration); + } + break; + + case 2: + resolveObjectsInStream(entry.getObjStreamNumber()); + break; + + default: + throw QPDFExc(this->file.getName(), 0, + "object " + + QUtil::int_to_string(objid) + "/" + + QUtil::int_to_string(generation) + + " has unexpected xref entry type"); + } + } + + return this->obj_cache[og].object; +} + +void +QPDF::resolveObjectsInStream(int obj_stream_number) +{ + // Force resolution of object stream + QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0); + if (! obj_stream.isStream()) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "supposed object stream " + + QUtil::int_to_string(obj_stream_number) + + " is not a stream"); + } + + // For linearization data in the object, use the data from the + // object stream for the objects in the stream. 
+ ObjGen stream_og(obj_stream_number, 0); + off_t end_before_space = this->obj_cache[stream_og].end_before_space; + off_t end_after_space = this->obj_cache[stream_og].end_after_space; + + QPDFObjectHandle dict = obj_stream.getDict(); + if (! (dict.getKey("/Type").isName() && + dict.getKey("/Type").getName() == "/ObjStm")) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "supposed object stream " + + QUtil::int_to_string(obj_stream_number) + + " has wrong type"); + } + + if (! (dict.getKey("/N").isInteger() && + dict.getKey("/First").isInteger())) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "object stream " + + QUtil::int_to_string(obj_stream_number) + + " has incorrect keys"); + } + + int n = dict.getKey("/N").getIntValue(); + int first = dict.getKey("/First").getIntValue(); + + std::map<int, int> offsets; + + PointerHolder<Buffer> bp = obj_stream.getStreamData(); + BufferInputSource input( + "object stream " + QUtil::int_to_string(obj_stream_number), + bp.getPointer()); + + for (int i = 0; i < n; ++i) + { + QPDFTokenizer::Token tnum = readToken(&input); + QPDFTokenizer::Token toffset = readToken(&input); + if (! 
((tnum.getType() == QPDFTokenizer::tt_integer) && + (toffset.getType() == QPDFTokenizer::tt_integer))) + { + throw QPDFExc(input.getName(), input.getLastOffset(), + "expected integer in object stream header"); + } + + int num = atoi(tnum.getValue().c_str()); + int offset = atoi(toffset.getValue().c_str()); + offsets[num] = offset + first; + } + + for (std::map<int, int>::iterator iter = offsets.begin(); + iter != offsets.end(); ++iter) + { + int obj = (*iter).first; + int offset = (*iter).second; + input.seek(offset, SEEK_SET); + QPDFObjectHandle oh = readObject(&input, obj, 0, true); + + // Store in cache + ObjGen og(obj, 0); + + this->obj_cache[og] = + ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), + end_before_space, end_after_space); + } +} + +QPDFObjectHandle +QPDF::makeIndirectObject(QPDFObjectHandle oh) +{ + ObjGen o1 = (*(this->obj_cache.rbegin())).first; + ObjGen o2 = (*(this->xref_table.rbegin())).first; + QTC::TC("qpdf", "QPDF indirect last obj from xref", + (o2.obj > o1.obj) ? 1 : 0); + int max_objid = std::max(o1.obj, o2.obj); + ObjGen next(max_objid + 1, 0); + this->obj_cache[next] = + ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), -1, -1); + return QPDFObjectHandle::Factory::newIndirect(this, next.obj, next.gen); +} + +QPDFObjectHandle +QPDF::getObjectByID(int objid, int generation) +{ + return QPDFObjectHandle::Factory::newIndirect(this, objid, generation); +} + +void +QPDF::trimTrailerForWrite() +{ + // Note that removing the encryption dictionary does not interfere + // with reading encrypted files. QPDF loads all the information + // it needs from the encryption dictionary at the beginning and + // never looks at it again. 
+ this->trailer.removeKey("/ID"); + this->trailer.removeKey("/Encrypt"); + this->trailer.removeKey("/Prev"); + + // Remove all trailer keys that potentially come from a + // cross-reference stream + this->trailer.removeKey("/Index"); + this->trailer.removeKey("/W"); + this->trailer.removeKey("/Length"); + this->trailer.removeKey("/Filter"); + this->trailer.removeKey("/DecodeParms"); + this->trailer.removeKey("/Type"); + this->trailer.removeKey("/XRefStm"); +} + +std::string +QPDF::getFilename() const +{ + return this->file.getName(); +} + +std::string +QPDF::getPDFVersion() const +{ + return this->pdf_version; +} + +QPDFObjectHandle +QPDF::getTrailer() +{ + return this->trailer; +} + +QPDFObjectHandle +QPDF::getRoot() +{ + return this->trailer.getKey("/Root"); +} + +void +QPDF::getObjectStreamData(std::map<int, int>& omap) +{ + for (std::map<ObjGen, QPDFXRefEntry>::iterator iter = + this->xref_table.begin(); + iter != this->xref_table.end(); ++iter) + { + ObjGen const& og = (*iter).first; + QPDFXRefEntry const& entry = (*iter).second; + if (entry.getType() == 2) + { + omap[og.obj] = entry.getObjStreamNumber(); + } + } +} + +std::vector<int> +QPDF::getCompressibleObjects() +{ + // Return a set of object numbers of objects that are allowed to + // be in object streams. We disregard generation numbers here + // since this is a helper function for QPDFWriter which is going + // to renumber objects anyway. This code will do weird things if + // we have two objects with the same object number and different + // generations, but so do virtually all PDF consumers, + // particularly since this is not a permitted condition. + + // We walk through the objects by traversing the document from the + // root, including a traversal of the pages tree. This makes that + // objects that are on the same page are more likely to be in the + // same object stream, which is slightly more efficient, + // particularly with linearized files. 
This is better than + // iterating through the xref table since it avoids preserving + // orphaned items. + + // Exclude encryption dictionary, if any + int encryption_dict_id = 0; + QPDFObjectHandle encryption_dict = trailer.getKey("/Encrypt"); + if (encryption_dict.isIndirect()) + { + encryption_dict_id = encryption_dict.getObjectID(); + } + + std::set<int> visited; + std::list<QPDFObjectHandle> queue; + queue.push_front(this->trailer); + std::vector<int> result; + while (! queue.empty()) + { + QPDFObjectHandle obj = queue.front(); + queue.pop_front(); + if (obj.isIndirect()) + { + int objid = obj.getObjectID(); + if (visited.count(objid)) + { + QTC::TC("qpdf", "QPDF loop detected traversing objects"); + continue; + } + if (objid == encryption_dict_id) + { + QTC::TC("qpdf", "QPDF exclude encryption dictionary"); + } + else if (! obj.isStream()) + { + result.push_back(objid); + } + visited.insert(objid); + } + if (obj.isStream()) + { + QPDFObjectHandle dict = obj.getDict(); + std::set<std::string> keys = dict.getKeys(); + for (std::set<std::string>::reverse_iterator iter = keys.rbegin(); + iter != keys.rend(); ++iter) + { + std::string const& key = *iter; + QPDFObjectHandle value = dict.getKey(key); + if (key == "/Length") + { + // omit stream lengths + if (value.isIndirect()) + { + QTC::TC("qpdf", "QPDF exclude indirect length"); + } + } + else + { + queue.push_front(value); + } + } + } + else if (obj.isDictionary()) + { + std::set<std::string> keys = obj.getKeys(); + for (std::set<std::string>::reverse_iterator iter = keys.rbegin(); + iter != keys.rend(); ++iter) + { + queue.push_front(obj.getKey(*iter)); + } + } + else if (obj.isArray()) + { + int n = obj.getArrayNItems(); + for (int i = 1; i <= n; ++i) + { + queue.push_front(obj.getArrayItem(n - i)); + } + } + } + + return result; +} + +void +QPDF::pipeStreamData(int objid, int generation, + off_t offset, size_t length, + QPDFObjectHandle stream_dict, + Pipeline* pipeline) +{ + 
std::vector<PointerHolder<Pipeline> > to_delete; + if (this->encrypted) + { + bool xref_stream = false; + if (stream_dict.getKey("/Type").isName() && + (stream_dict.getKey("/Type").getName() == "/XRef")) + { + QTC::TC("qpdf", "QPDF piping xref stream from encrypted file"); + xref_stream = true; + } + if (! xref_stream) + { + decryptStream(pipeline, objid, generation, to_delete); + } + } + + this->file.seek(offset, SEEK_SET); + char buf[10240]; + while (length > 0) + { + size_t to_read = (sizeof(buf) < length ? sizeof(buf) : length); + size_t len = this->file.read(buf, to_read); + if (len == 0) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "unexpected EOF reading stream data"); + } + length -= len; + pipeline->write((unsigned char*)buf, len); + } + pipeline->finish(); +} + +std::vector<QPDFObjectHandle> const& +QPDF::getAllPages() +{ + if (this->all_pages.empty()) + { + getAllPagesInternal( + this->trailer.getKey("/Root").getKey("/Pages"), this->all_pages); + } + return this->all_pages; +} + +void +QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages, + std::vector<QPDFObjectHandle>& result) +{ + std::string type = cur_pages.getKey("/Type").getName(); + if (type == "/Pages") + { + QPDFObjectHandle kids = cur_pages.getKey("/Kids"); + int n = kids.getArrayNItems(); + for (int i = 0; i < n; ++i) + { + getAllPagesInternal(kids.getArrayItem(i), result); + } + } + else if (type == "/Page") + { + result.push_back(cur_pages); + } + else + { + throw QPDFExc(this->file.getName() + ": invalid Type in page tree"); + } +} diff --git a/libqpdf/QPDFExc.cc b/libqpdf/QPDFExc.cc new file mode 100644 index 00000000..c7270677 --- /dev/null +++ b/libqpdf/QPDFExc.cc @@ -0,0 +1,20 @@ + +#include <qpdf/QPDFExc.hh> + +#include <qpdf/QUtil.hh> + +QPDFExc::QPDFExc(std::string const& message) : + QEXC::General(message) +{ +} + +QPDFExc::QPDFExc(std::string const& filename, int offset, + std::string const& message) : + QEXC::General(filename + ": offset " + 
QUtil::int_to_string(offset) + + ": " + message) +{ +} + +QPDFExc::~QPDFExc() throw () +{ +} diff --git a/libqpdf/QPDFObject.cc b/libqpdf/QPDFObject.cc new file mode 100644 index 00000000..6c4963e2 --- /dev/null +++ b/libqpdf/QPDFObject.cc @@ -0,0 +1,2 @@ + +#include <qpdf/QPDFObject.hh> diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc new file mode 100644 index 00000000..9fba7b43 --- /dev/null +++ b/libqpdf/QPDFObjectHandle.cc @@ -0,0 +1,637 @@ + +#include <qpdf/QPDFObjectHandle.hh> + +#include <qpdf/QPDF.hh> +#include <qpdf/QPDF_Bool.hh> +#include <qpdf/QPDF_Null.hh> +#include <qpdf/QPDF_Integer.hh> +#include <qpdf/QPDF_Real.hh> +#include <qpdf/QPDF_Name.hh> +#include <qpdf/QPDF_String.hh> +#include <qpdf/QPDF_Array.hh> +#include <qpdf/QPDF_Dictionary.hh> +#include <qpdf/QPDF_Stream.hh> + +#include <qpdf/QTC.hh> +#include <qpdf/QEXC.hh> +#include <qpdf/QUtil.hh> + +QPDFObjectHandle::QPDFObjectHandle() : + initialized(false), + objid(0), + generation(0) +{ +} + +QPDFObjectHandle::QPDFObjectHandle(QPDF* qpdf, int objid, int generation) : + initialized(true), + qpdf(qpdf), + objid(objid), + generation(generation) +{ +} + +QPDFObjectHandle::QPDFObjectHandle(QPDFObject* data) : + initialized(true), + qpdf(0), + objid(0), + generation(0), + obj(data) +{ +} + +bool +QPDFObjectHandle::isInitialized() const +{ + return this->initialized; +} + +template <class T> +class QPDFObjectTypeAccessor +{ + public: + static bool check(QPDFObject* o) + { + return (o && dynamic_cast<T*>(o)); + } +}; + +bool +QPDFObjectHandle::isBool() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_Bool>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isNull() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_Null>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isInteger() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_Integer>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isReal() +{ + dereference(); + return 
QPDFObjectTypeAccessor<QPDF_Real>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isNumber() +{ + return (isInteger() || isReal()); +} + +double +QPDFObjectHandle::getNumericValue() +{ + double result = 0.0; + if (isInteger()) + { + result = getIntValue(); + } + else if (isReal()) + { + result = atof(getRealValue().c_str()); + } + else + { + throw QEXC::Internal("getNumericValue called for non-numeric object"); + } + return result; +} + +bool +QPDFObjectHandle::isName() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_Name>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isString() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_String>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isArray() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_Array>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isDictionary() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_Dictionary>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isStream() +{ + dereference(); + return QPDFObjectTypeAccessor<QPDF_Stream>::check(obj.getPointer()); +} + +bool +QPDFObjectHandle::isIndirect() +{ + assertInitialized(); + return (this->objid != 0); +} + +bool +QPDFObjectHandle::isScalar() +{ + return (! 
(isArray() || isDictionary() || isStream())); +} + +// Bool accessors + +bool +QPDFObjectHandle::getBoolValue() +{ + assertType("Boolean", isBool()); + return dynamic_cast<QPDF_Bool*>(obj.getPointer())->getVal(); +} + +// Integer accessors + +int +QPDFObjectHandle::getIntValue() +{ + assertType("Integer", isInteger()); + return dynamic_cast<QPDF_Integer*>(obj.getPointer())->getVal(); +} + +// Real accessors + +std::string +QPDFObjectHandle::getRealValue() +{ + assertType("Real", isReal()); + return dynamic_cast<QPDF_Real*>(obj.getPointer())->getVal(); +} + +// Name acessors + +std::string +QPDFObjectHandle::getName() +{ + assertType("Name", isName()); + return dynamic_cast<QPDF_Name*>(obj.getPointer())->getName(); +} + +// String accessors + +std::string +QPDFObjectHandle::getStringValue() +{ + assertType("String", isString()); + return dynamic_cast<QPDF_String*>(obj.getPointer())->getVal(); +} + +std::string +QPDFObjectHandle::getUTF8Value() +{ + assertType("String", isString()); + return dynamic_cast<QPDF_String*>(obj.getPointer())->getUTF8Val(); +} + +// Array acessors + +int +QPDFObjectHandle::getArrayNItems() +{ + assertType("Array", isArray()); + return dynamic_cast<QPDF_Array*>(obj.getPointer())->getNItems(); +} + +QPDFObjectHandle +QPDFObjectHandle::getArrayItem(int n) +{ + assertType("Array", isArray()); + return dynamic_cast<QPDF_Array*>(obj.getPointer())->getItem(n); +} + +// Array mutators + +void +QPDFObjectHandle::setArrayItem(int n, QPDFObjectHandle const& item) +{ + assertType("Array", isArray()); + return dynamic_cast<QPDF_Array*>(obj.getPointer())->setItem(n, item); +} + +// Dictionary accesors + +bool +QPDFObjectHandle::hasKey(std::string const& key) +{ + assertType("Dictionary", isDictionary()); + return dynamic_cast<QPDF_Dictionary*>(obj.getPointer())->hasKey(key); +} + +QPDFObjectHandle +QPDFObjectHandle::getKey(std::string const& key) +{ + assertType("Dictionary", isDictionary()); + return 
dynamic_cast<QPDF_Dictionary*>(obj.getPointer())->getKey(key); +} + +std::set<std::string> +QPDFObjectHandle::getKeys() +{ + assertType("Dictionary", isDictionary()); + return dynamic_cast<QPDF_Dictionary*>(obj.getPointer())->getKeys(); +} + +// Dictionary mutators + +void +QPDFObjectHandle::replaceKey(std::string const& key, + QPDFObjectHandle const& value) +{ + assertType("Dictionary", isDictionary()); + return dynamic_cast<QPDF_Dictionary*>( + obj.getPointer())->replaceKey(key, value); +} + +void +QPDFObjectHandle::removeKey(std::string const& key) +{ + assertType("Dictionary", isDictionary()); + return dynamic_cast<QPDF_Dictionary*>(obj.getPointer())->removeKey(key); +} + +// Stream accessors +QPDFObjectHandle +QPDFObjectHandle::getDict() +{ + assertType("Stream", isStream()); + return dynamic_cast<QPDF_Stream*>(obj.getPointer())->getDict(); +} + +PointerHolder<Buffer> +QPDFObjectHandle::getStreamData() +{ + assertType("Stream", isStream()); + return dynamic_cast<QPDF_Stream*>(obj.getPointer())->getStreamData(); +} + +bool +QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter, + bool normalize, bool compress) +{ + assertType("Stream", isStream()); + return dynamic_cast<QPDF_Stream*>(obj.getPointer())->pipeStreamData( + p, filter, normalize, compress); +} + +int +QPDFObjectHandle::getObjectID() const +{ + return this->objid; +} + +int +QPDFObjectHandle::getGeneration() const +{ + return this->generation; +} + +std::map<std::string, QPDFObjectHandle> +QPDFObjectHandle::getPageImages() +{ + assertPageObject(); + + // Note: this code doesn't handle inherited resources. If this + // page dictionary doesn't have a /Resources key or has one whose + // value is null or an empty dictionary, you are supposed to walk + // up the page tree until you find a /Resources dictionary. 
As of + // this writing, I don't have any test files that use inherited + // resources, and hand-generating one won't be a good test because + // any mistakes in my understanding would be present in both the + // code and the test file. + + // NOTE: If support of inherited resources (see above comment) is + // implemented, edit comment in QPDFObjectHandle.hh for this + // function. + + std::map<std::string, QPDFObjectHandle> result; + if (this->hasKey("/Resources")) + { + QPDFObjectHandle resources = this->getKey("/Resources"); + if (resources.hasKey("/XObject")) + { + QPDFObjectHandle xobject = resources.getKey("/XObject"); + std::set<std::string> keys = xobject.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + std::string key = (*iter); + QPDFObjectHandle value = xobject.getKey(key); + if (value.isStream()) + { + QPDFObjectHandle dict = value.getDict(); + if (dict.hasKey("/Subtype") && + (dict.getKey("/Subtype").getName() == "/Image") && + (! 
dict.hasKey("/ImageMask"))) + { + result[key] = value; + } + } + } + } + } + + return result; +} + +std::vector<QPDFObjectHandle> +QPDFObjectHandle::getPageContents() +{ + assertPageObject(); + + std::vector<QPDFObjectHandle> result; + QPDFObjectHandle contents = this->getKey("/Contents"); + if (contents.isArray()) + { + int n_items = contents.getArrayNItems(); + for (int i = 0; i < n_items; ++i) + { + QPDFObjectHandle item = contents.getArrayItem(i); + if (item.isStream()) + { + result.push_back(item); + } + else + { + throw QEXC::General("unknown item type while inspecting " + "element of /Contents array in page " + "dictionary"); + } + } + } + else if (contents.isStream()) + { + result.push_back(contents); + } + else + { + throw QEXC::General("unknown object type inspecting /Contents " + "key in page dictionary"); + } + + return result; +} + +std::string +QPDFObjectHandle::unparse() +{ + std::string result; + if (this->isIndirect()) + { + result = QUtil::int_to_string(this->objid) + " " + + QUtil::int_to_string(this->generation) + " R"; + } + else + { + result = unparseResolved(); + } + return result; +} + +std::string +QPDFObjectHandle::unparseResolved() +{ + dereference(); + return this->obj.getPointer()->unparse(); +} + +QPDFObjectHandle +QPDFObjectHandle::newIndirect(QPDF* qpdf, int objid, int generation) +{ + return QPDFObjectHandle(qpdf, objid, generation); +} + +QPDFObjectHandle +QPDFObjectHandle::newBool(bool value) +{ + return QPDFObjectHandle(new QPDF_Bool(value)); +} + +QPDFObjectHandle +QPDFObjectHandle::newNull() +{ + return QPDFObjectHandle(new QPDF_Null()); +} + +QPDFObjectHandle +QPDFObjectHandle::newInteger(int value) +{ + return QPDFObjectHandle(new QPDF_Integer(value)); +} + +QPDFObjectHandle +QPDFObjectHandle::newReal(std::string const& value) +{ + return QPDFObjectHandle(new QPDF_Real(value)); +} + +QPDFObjectHandle +QPDFObjectHandle::newName(std::string const& name) +{ + return QPDFObjectHandle(new QPDF_Name(name)); +} + +QPDFObjectHandle 
+QPDFObjectHandle::newString(std::string const& str) +{ + return QPDFObjectHandle(new QPDF_String(str)); +} + +QPDFObjectHandle +QPDFObjectHandle::newArray(std::vector<QPDFObjectHandle> const& items) +{ + return QPDFObjectHandle(new QPDF_Array(items)); +} + +QPDFObjectHandle +QPDFObjectHandle::newDictionary( + std::map<std::string, QPDFObjectHandle> const& items) +{ + return QPDFObjectHandle(new QPDF_Dictionary(items)); +} + + +QPDFObjectHandle +QPDFObjectHandle::newStream(QPDF* qpdf, int objid, int generation, + QPDFObjectHandle stream_dict, + off_t offset, int length) +{ + return QPDFObjectHandle(new QPDF_Stream( + qpdf, objid, generation, + stream_dict, offset, length)); +} + +void +QPDFObjectHandle::makeDirectInternal(std::set<int>& visited) +{ + assertInitialized(); + + if (isStream()) + { + QTC::TC("qpdf", "QPDFObjectHandle ERR clone stream"); + throw QEXC::General("attempt to make a stream into a direct object"); + } + + int cur_objid = this->objid; + if (cur_objid != 0) + { + if (visited.count(cur_objid)) + { + QTC::TC("qpdf", "QPDFObjectHandle makeDirect loop"); + throw QEXC::General("loop detected while converting object from " + "indirect to direct"); + } + visited.insert(cur_objid); + } + + dereference(); + this->objid = 0; + this->generation = 0; + + QPDFObject* new_obj = 0; + + if (isBool()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone bool"); + new_obj = new QPDF_Bool(getBoolValue()); + } + else if (isNull()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone null"); + new_obj = new QPDF_Null(); + } + else if (isInteger()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone integer"); + new_obj = new QPDF_Integer(getIntValue()); + } + else if (isReal()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone real"); + new_obj = new QPDF_Real(getRealValue()); + } + else if (isName()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone name"); + new_obj = new QPDF_Name(getName()); + } + else if (isString()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone string"); + new_obj = new 
QPDF_String(getStringValue()); + } + else if (isArray()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone array"); + std::vector<QPDFObjectHandle> items; + int n = getArrayNItems(); + for (int i = 0; i < n; ++i) + { + items.push_back(getArrayItem(i)); + items.back().makeDirectInternal(visited); + } + new_obj = new QPDF_Array(items); + } + else if (isDictionary()) + { + QTC::TC("qpdf", "QPDFObjectHandle clone dictionary"); + std::set<std::string> keys = getKeys(); + std::map<std::string, QPDFObjectHandle> items; + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + items[*iter] = getKey(*iter); + items[*iter].makeDirectInternal(visited); + } + new_obj = new QPDF_Dictionary(items); + } + else + { + throw QEXC::Internal("QPDFObjectHandle::makeIndirect: " + "unknown object type"); + } + + this->obj = new_obj; + + if (cur_objid) + { + visited.erase(cur_objid); + } +} + +void +QPDFObjectHandle::makeDirect() +{ + std::set<int> visited; + makeDirectInternal(visited); +} + +void +QPDFObjectHandle::assertInitialized() const +{ + if (! this->initialized) + { + throw QEXC::Internal("operation attempted on uninitialized " + "QPDFObjectHandle"); + } +} + +void +QPDFObjectHandle::assertType(char const* type_name, bool istype) +{ + if (! istype) + { + throw QEXC::Internal(std::string("operation for ") + type_name + + " object attempted on object of wrong type"); + } +} + +void +QPDFObjectHandle::assertPageObject() +{ + if (! 
// Return true if ch is one of the characters 0-9, a-f or A-F.
//
// ctype is deliberately not used because it is locale dependent.
// Explicit range comparisons are used instead of strchr() on a digit
// string: strchr() also matches the string's terminating NUL, which
// would make '\0' count as a hex digit.
static bool is_hex_digit(char ch)
{
    return (((ch >= '0') && (ch <= '9')) ||
            ((ch >= 'a') && (ch <= 'f')) ||
            ((ch >= 'A') && (ch <= 'F')));
}
This happens whenever you have to use + // the character that caused a state change in the new state. + + bool handled = true; + if (state == st_top) + { + // Note: we specifically do not use ctype here. It is + // locale-dependent. + if (strchr(" \t\n\v\f\r", ch)) + { + // ignore + } + else if (ch == '%') + { + // Discard comments + state = st_in_comment; + } + else if (ch == '(') + { + string_depth = 1; + string_ignoring_newline = false; + memset(bs_num_register, '\0', sizeof(bs_num_register)); + last_char_was_bs = false; + state = st_in_string; + } + else if (ch == '<') + { + state = st_lt; + } + else if (ch == '>') + { + state = st_gt; + } + else + { + val += ch; + if (ch == ')') + { + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer bad )"); + error_message = "unexpected )"; + state = st_token_ready; + } + else if (ch == '[') + { + type = tt_array_open; + state = st_token_ready; + } + else if (ch == ']') + { + type = tt_array_close; + state = st_token_ready; + } + else if (ch == '{') + { + type = tt_brace_open; + state = st_token_ready; + } + else if (ch == '}') + { + type = tt_brace_close; + state = st_token_ready; + } + else + { + state = st_literal; + } + } + } + else if (state == st_in_comment) + { + if ((ch == '\r') || (ch == '\n')) + { + state = st_top; + } + } + else if (state == st_lt) + { + if (ch == '<') + { + val = "<<"; + type = tt_dict_open; + state = st_token_ready; + } + else + { + handled = false; + state = st_in_hexstring; + } + } + else if (state == st_gt) + { + if (ch == '>') + { + val = ">>"; + type = tt_dict_close; + state = st_token_ready; + } + else + { + val = ">"; + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer bad >"); + error_message = "unexpected >"; + unread_char = true; + char_to_unread = ch; + state = st_token_ready; + } + } + else if (state == st_in_string) + { + if (string_ignoring_newline && (! 
((ch == '\r') || (ch == '\n')))) + { + string_ignoring_newline = false; + } + + unsigned int bs_num_count = strlen(bs_num_register); + bool ch_is_octal = ((ch >= '0') && (ch <= '7')); + if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal))) + { + // We've accumulated \ddd. PDF Spec says to ignore + // high-order overflow. + val += (char) strtol(bs_num_register, 0, 8); + memset(bs_num_register, '\0', sizeof(bs_num_register)); + bs_num_count = 0; + } + + if (string_ignoring_newline && ((ch == '\r') || (ch == '\n'))) + { + // ignore + } + else if (ch_is_octal && (last_char_was_bs || (bs_num_count > 0))) + { + bs_num_register[bs_num_count++] = ch; + } + else if (last_char_was_bs) + { + switch (ch) + { + case 'n': + val += '\n'; + break; + + case 'r': + val += '\r'; + break; + + case 't': + val += '\t'; + break; + + case 'b': + val += '\b'; + break; + + case 'f': + val += '\f'; + break; + + case '\r': + case '\n': + string_ignoring_newline = true; + break; + + default: + // PDF spec says backslash is ignored before anything else + val += ch; + break; + } + } + else if (ch == '\\') + { + // last_char_was_bs is set/cleared below as appropriate + if (bs_num_count) + { + throw QEXC::Internal("QPDFTokenizer: bs_num_count != 0 " + "when ch == '\\'"); + } + } + else if (ch == '(') + { + val += ch; + ++string_depth; + } + else if ((ch == ')') && (--string_depth == 0)) + { + type = tt_string; + state = st_token_ready; + } + else + { + val += ch; + } + + last_char_was_bs = ((! last_char_was_bs) && (ch == '\\')); + } + else if (state == st_literal) + { + if (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0) + { + // A C-loacle whitespace character or delimiter terminates + // token. It is important to unread the whitespace + // character even though it is ignored since it may be the + // newline after a stream keyword. Removing it here could + // make the stream-reading code break on some files, + // though not on any files in the test suite as of this + // writing. 
+ + type = tt_word; + unread_char = true; + char_to_unread = ch; + state = st_token_ready; + } + else + { + val += ch; + } + } + else + { + handled = false; + } + + + if (handled) + { + // okay + } + else if (state == st_in_hexstring) + { + if (ch == '>') + { + type = tt_string; + state = st_token_ready; + if (val.length() % 2) + { + // PDF spec says odd hexstrings have implicit + // trailing 0. + val += '0'; + } + char num[3]; + num[2] = '\0'; + std::string nval; + for (unsigned int i = 0; i < val.length(); i += 2) + { + num[0] = val[i]; + num[1] = val[i+1]; + char nch = (char)(strtol(num, 0, 16)); + nval += nch; + } + val = nval; + } + else if (is_hex_digit(ch)) + { + val += ch; + } + else if (strchr(" \t\n\v\f\r", ch)) + { + // ignore + } + else + { + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer bad ("); + error_message = std::string("invalid character (") + + ch + ") in hexstring"; + state = st_token_ready; + } + } + else + { + throw QEXC::Internal("invalid state while reading token"); + } + + if ((state == st_token_ready) && (type == tt_word)) + { + if ((val.length() > 0) && (val[0] == '/')) + { + type = tt_name; + // Deal with # in name token. Note: '/' by itself is a + // valid name, so don't strip leading /. That way we + // don't have to deal with the empty string as a name. 
+ std::string nval = "/"; + char const* valstr = val.c_str() + 1; + for (char const* p = valstr; *p; ++p) + { + if ((*p == '#') && this->pound_special_in_name) + { + if (p[1] && p[2] && + is_hex_digit(p[1]) && is_hex_digit(p[2])) + { + char num[3]; + num[0] = p[1]; + num[1] = p[2]; + num[2] = '\0'; + char ch = (char)(strtol(num, 0, 16)); + if (ch == '\0') + { + type = tt_bad; + QTC::TC("qpdf", "QPDF_Tokenizer null in name"); + error_message = + "null character not allowed in name token"; + nval += "#00"; + } + else + { + nval += ch; + } + p += 2; + } + else + { + QTC::TC("qpdf", "QPDF_Tokenizer bad name"); + type = tt_bad; + error_message = "invalid name token"; + nval += *p; + } + } + else + { + nval += *p; + } + } + val = nval; + } + else if (num_re.match(val.c_str())) + { + if (val.find('.') != std::string::npos) + { + type = tt_real; + } + else + { + type = tt_integer; + } + } + else if ((val == "true") || (val == "false")) + { + type = tt_bool; + } + else if (val == "null") + { + type = tt_null; + } + else + { + // I don't really know what it is, so leave it as tt_word. + // Lots of cases ($, #, etc.) other than actual words fall + // into this category, but that's okay at least for now. + type = tt_word; + } + } + + if (! 
(betweenTokens() || ((state == st_token_ready) && unread_char))) + { + this->raw_val += orig_ch; + } +} + +void +QPDFTokenizer::presentEOF() +{ + switch (state) + { + case st_token_ready: + case st_top: + // okay + break; + + case st_in_comment: + state = st_top; + break; + + default: + type = tt_bad; + error_message = "EOF while reading token"; + state = st_token_ready; + } +} + +bool +QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) +{ + bool ready = (this->state == st_token_ready); + unread_char = this->unread_char; + ch = this->char_to_unread; + if (ready) + { + token = Token(type, val, raw_val, error_message); + reset(); + } + return ready; +} + +bool +QPDFTokenizer::betweenTokens() +{ + return ((state == st_top) || (state == st_in_comment)); +} diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc new file mode 100644 index 00000000..0a611eb9 --- /dev/null +++ b/libqpdf/QPDFWriter.cc @@ -0,0 +1,2021 @@ + +#include <qpdf/QPDFWriter.hh> + +#include <assert.h> +#include <qpdf/Pl_StdioFile.hh> +#include <qpdf/Pl_Count.hh> +#include <qpdf/Pl_Discard.hh> +#include <qpdf/Pl_Buffer.hh> +#include <qpdf/Pl_RC4.hh> +#include <qpdf/Pl_Flate.hh> +#include <qpdf/Pl_PNGFilter.hh> +#include <qpdf/QUtil.hh> +#include <qpdf/MD5.hh> +#include <qpdf/RC4.hh> +#include <qpdf/QTC.hh> + +#include <qpdf/QPDF.hh> +#include <qpdf/QPDFObjectHandle.hh> +#include <qpdf/QPDF_Name.hh> +#include <qpdf/QPDF_String.hh> + +QPDFWriter::QPDFWriter(QPDF& pdf, char const* filename) : + pdf(pdf), + filename(filename), + file(0), + close_file(false), + normalize_content_set(false), + normalize_content(false), + stream_data_mode_set(false), + stream_data_mode(s_compress), + qdf_mode(false), + static_id(false), + direct_stream_lengths(true), + encrypted(false), + preserve_encryption(true), + linearized(false), + object_stream_mode(o_preserve), + encryption_dict_objid(0), + next_objid(1), + cur_stream_length_id(0), + cur_stream_length(0), + added_newline(false), + 
max_ostream_index(0) +{ + if (filename == 0) + { + this->filename = "standard output"; + QTC::TC("qpdf", "QPDFWriter write to stdout"); + file = stdout; + } + else + { + QTC::TC("qpdf", "QPDFWriter write to file"); + file = QUtil::fopen_wrapper(std::string("open ") + filename, + fopen(filename, "wb+")); + close_file = true; + } + Pipeline* p = new Pl_StdioFile("qdf output", file); + to_delete.push_back(p); + pipeline = new Pl_Count("qdf count", p); + to_delete.push_back(pipeline); + pipeline_stack.push_back(pipeline); +} + +QPDFWriter::~QPDFWriter() +{ + if (file) + { + fclose(file); + } +} + +void +QPDFWriter::setObjectStreamMode(object_stream_e mode) +{ + this->object_stream_mode = mode; +} + +void +QPDFWriter::setStreamDataMode(stream_data_e mode) +{ + this->stream_data_mode_set = true; + this->stream_data_mode = mode; +} + +void +QPDFWriter::setContentNormalization(bool val) +{ + this->normalize_content_set = true; + this->normalize_content = val; +} + +void +QPDFWriter::setQDFMode(bool val) +{ + this->qdf_mode = val; +} + +void +QPDFWriter::setStaticID(bool val) +{ + this->static_id = val; +} + +void +QPDFWriter::setPreserveEncryption(bool val) +{ + this->preserve_encryption = val; +} + +void +QPDFWriter::setLinearization(bool val) +{ + this->linearized = val; +} + +void +QPDFWriter::setR2EncryptionParameters( + char const* user_password, char const* owner_password, + bool allow_print, bool allow_modify, + bool allow_extract, bool allow_annotate) +{ + std::set<int> clear; + if (! allow_print) + { + clear.insert(3); + } + if (! allow_modify) + { + clear.insert(4); + } + if (! allow_extract) + { + clear.insert(5); + } + if (! 
allow_annotate) + { + clear.insert(6); + } + + this->min_pdf_version = "1.3"; + setEncryptionParameters(user_password, owner_password, 1, 2, 5, clear); +} + +void +QPDFWriter::setR3EncryptionParameters( + char const* user_password, char const* owner_password, + bool allow_accessibility, bool allow_extract, + r3_print_e print, r3_modify_e modify) +{ + // Acrobat 5 security options: + + // Checkboxes: + // Enable Content Access for the Visually Impaired + // Allow Content Copying and Extraction + + // Allowed changes menu: + // None + // Only Document Assembly + // Only Form Field Fill-in or Signing + // Comment AUthoring, Form Field Fill-in or Signing + // General Editing, Comment and Form Field Authoring + + // Allowed printing menu: + // None + // Low Resolution + // Full printing + + std::set<int> clear; + if (! allow_accessibility) + { + clear.insert(10); + } + if (! allow_extract) + { + clear.insert(5); + } + + // Note: these switch statements all "fall through" (no break + // statements). Each option clears successively more access bits. + switch (print) + { + case r3p_none: + clear.insert(3); // any printing + + case r3p_low: + clear.insert(12); // high resolution printing + + case r3p_full: + break; + + // no default so gcc warns for missing cases + } + + switch (modify) + { + case r3m_none: + clear.insert(11); // document essembly + + case r3m_assembly: + clear.insert(9); // filling in form fields + + case r3m_form: + clear.insert(6); // modify annotations, fill in form fields + + case r3m_annotate: + clear.insert(4); // other modifications + + case r3m_all: + break; + + // no default so gcc warns for missing cases + } + + this->min_pdf_version = "1.4"; + setEncryptionParameters(user_password, owner_password, 2, 3, 16, clear); +} + +void +QPDFWriter::setEncryptionParameters( + char const* user_password, char const* owner_password, + int V, int R, int key_len, std::set<int>& bits_to_clear) +{ + // PDF specification refers to bits with the low bit numbered 1. 
+ // We have to convert this into a bit field. + + // Specification always requirse bits 1 and 2 to be cleared. + bits_to_clear.insert(1); + bits_to_clear.insert(2); + + unsigned long P = 0; + // Create the complement of P, then invert. + for (std::set<int>::iterator iter = bits_to_clear.begin(); + iter != bits_to_clear.end(); ++iter) + { + P |= (1 << (*iter) - 1); + } + P = ~P; + + generateID(); + std::string O; + std::string U; + QPDF::compute_encryption_O_U( + user_password, owner_password, V, R, key_len, P, this->id1, O, U); + setEncryptionParametersInternal( + V, R, key_len, P, O, U, this->id1, user_password); +} + +void +QPDFWriter::copyEncryptionParameters() +{ + generateID(); + QPDFObjectHandle trailer = this->pdf.getTrailer(); + if (trailer.hasKey("/Encrypt")) + { + QPDFObjectHandle encrypt = trailer.getKey("/Encrypt"); + int V = encrypt.getKey("/V").getIntValue(); + int key_len = 5; + if (V > 1) + { + key_len = encrypt.getKey("/Length").getIntValue() / 8; + } + setEncryptionParametersInternal( + V, + encrypt.getKey("/R").getIntValue(), + key_len, + encrypt.getKey("/P").getIntValue(), + encrypt.getKey("/O").getStringValue(), + encrypt.getKey("/U").getStringValue(), + this->id1, // this->id1 == the other file's id1 + pdf.getUserPassword()); + } +} + +void +QPDFWriter::setEncryptionParametersInternal( + int V, int R, int key_len, long P, + std::string const& O, std::string const& U, + std::string const& id1, std::string const& user_password) +{ + encryption_dictionary["/Filter"] = "/Standard"; + encryption_dictionary["/V"] = QUtil::int_to_string(V); + encryption_dictionary["/Length"] = QUtil::int_to_string(key_len * 8); + encryption_dictionary["/R"] = QUtil::int_to_string(R); + encryption_dictionary["/P"] = QUtil::int_to_string(P); + encryption_dictionary["/O"] = QPDF_String(O).unparse(true); + encryption_dictionary["/U"] = QPDF_String(U).unparse(true); + this->encrypted = true; + QPDF::EncryptionData encryption_data(V, R, key_len, P, O, U, this->id1); + 
this->encryption_key = QPDF::compute_encryption_key( + user_password, encryption_data); +} + +void +QPDFWriter::setDataKey(int objid) +{ + this->cur_data_key = QPDF::compute_data_key( + this->encryption_key, objid, 0); +} + +int +QPDFWriter::bytesNeeded(unsigned long n) +{ + int bytes = 0; + while (n) + { + ++bytes; + n >>= 8; + } + return bytes; +} + +void +QPDFWriter::writeBinary(unsigned long val, unsigned int bytes) +{ + assert(bytes <= sizeof(unsigned long)); + unsigned char data[sizeof(unsigned long)]; + for (unsigned int i = 0; i < bytes; ++i) + { + data[bytes - i - 1] = (unsigned char)(val & 0xff); + val >>= 8; + } + this->pipeline->write(data, bytes); +} + +void +QPDFWriter::writeString(std::string const& str) +{ + this->pipeline->write((unsigned char*)str.c_str(), str.length()); +} + +void +QPDFWriter::writeBuffer(PointerHolder<Buffer>& b) +{ + this->pipeline->write(b.getPointer()->getBuffer(), + b.getPointer()->getSize()); +} + +void +QPDFWriter::writeStringQDF(std::string const& str) +{ + if (this->qdf_mode) + { + writeString(str); + } +} + +void +QPDFWriter::writeStringNoQDF(std::string const& str) +{ + if (! 
this->qdf_mode) + { + writeString(str); + } +} + +Pipeline* +QPDFWriter::pushPipeline(Pipeline* p) +{ + assert(dynamic_cast<Pl_Count*>(p) == 0); + this->pipeline_stack.push_back(p); + return p; +} + +void +QPDFWriter::activatePipelineStack() +{ + Pl_Count* c = new Pl_Count("count", this->pipeline_stack.back()); + this->pipeline_stack.push_back(c); + this->pipeline = c; +} + +void +QPDFWriter::popPipelineStack(PointerHolder<Buffer>* bp) +{ + assert(this->pipeline_stack.size() >= 2); + this->pipeline->finish(); + assert(dynamic_cast<Pl_Count*>(this->pipeline_stack.back()) == + this->pipeline); + delete this->pipeline_stack.back(); + this->pipeline_stack.pop_back(); + while (dynamic_cast<Pl_Count*>(this->pipeline_stack.back()) == 0) + { + Pipeline* p = this->pipeline_stack.back(); + this->pipeline_stack.pop_back(); + Pl_Buffer* buf = dynamic_cast<Pl_Buffer*>(p); + if (bp && buf) + { + *bp = buf->getBuffer(); + } + delete p; + } + this->pipeline = dynamic_cast<Pl_Count*>(this->pipeline_stack.back()); +} + +void +QPDFWriter::pushEncryptionFilter() +{ + if (this->encrypted && (! this->cur_data_key.empty())) + { + Pipeline* p = + new Pl_RC4("stream encryption", this->pipeline, + (unsigned char*) this->cur_data_key.c_str(), + this->cur_data_key.length()); + pushPipeline(p); + } + // Must call this unconditionally so we can call popPipelineStack + // to balance pushEncryptionFilter(). + activatePipelineStack(); +} + +void +QPDFWriter::pushDiscardFilter() +{ + pushPipeline(new Pl_Discard()); + activatePipelineStack(); +} + +int +QPDFWriter::openObject(int objid) +{ + if (objid == 0) + { + objid = this->next_objid++; + } + this->xref[objid] = QPDFXRefEntry(1, pipeline->getCount(), 0); + writeString(QUtil::int_to_string(objid)); + writeString(" 0 obj\n"); + return objid; +} + +void +QPDFWriter::closeObject(int objid) +{ + // Write a newline before endobj as it makes the file easier to + // repair. 
+ writeString("\nendobj\n"); + writeStringQDF("\n"); + this->lengths[objid] = pipeline->getCount() - this->xref[objid].getOffset(); +} + +void +QPDFWriter::assignCompressedObjectNumbers(int objid) +{ + if (this->object_stream_to_objects.count(objid) == 0) + { + return; + } + + // Reserve numbers for the objects that belong to this object + // stream. + for (std::set<int>::iterator iter = + this->object_stream_to_objects[objid].begin(); + iter != this->object_stream_to_objects[objid].end(); + ++iter) + { + obj_renumber[*iter] = next_objid++; + } +} + +void +QPDFWriter::enqueueObject(QPDFObjectHandle object) +{ + if (object.isIndirect()) + { + if (object.isNull()) + { + // This is a place-holder object for an object stream + } + else if (object.isScalar()) + { + throw QEXC::Internal( + "QPDFWriter::enqueueObject: indirect scalar: " + + std::string(this->filename) + " " + + QUtil::int_to_string(object.getObjectID()) + " " + + QUtil::int_to_string(object.getGeneration())); + } + int objid = object.getObjectID(); + + if (obj_renumber.count(objid) == 0) + { + if (this->object_to_object_stream.count(objid)) + { + // This is in an object stream. Don't process it + // here. Instead, enqueue the object stream. + int stream_id = this->object_to_object_stream[objid]; + enqueueObject(this->pdf.getObjectByID(stream_id, 0)); + } + else + { + object_queue.push_back(object); + obj_renumber[objid] = next_objid++; + + if (this->object_stream_to_objects.count(objid)) + { + // For linearized files, uncompressed objects go + // at end, and we take care of assigning numbers + // to them elsewhere. + if (! this->linearized) + { + assignCompressedObjectNumbers(objid); + } + } + else if ((! this->direct_stream_lengths) && object.isStream()) + { + // reserve next object ID for length + ++next_objid; + } + } + } + } + else if (object.isArray()) + { + int n = object.getArrayNItems(); + for (int i = 0; i < n; ++i) + { + if (! 
this->linearized) + { + enqueueObject(object.getArrayItem(i)); + } + } + } + else if (object.isDictionary()) + { + std::set<std::string> keys = object.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + if (! this->linearized) + { + enqueueObject(object.getKey(*iter)); + } + } + } + else + { + // ignore + } +} + +void +QPDFWriter::unparseChild(QPDFObjectHandle child, int level, int flags) +{ + if (! this->linearized) + { + enqueueObject(child); + } + if (child.isIndirect()) + { + if (child.isScalar()) + { + throw QEXC::Internal( + "QPDFWriter::unparseChild: indirect scalar: " + + QUtil::int_to_string(child.getObjectID()) + " " + + QUtil::int_to_string(child.getGeneration())); + } + int old_id = child.getObjectID(); + int new_id = obj_renumber[old_id]; + writeString(QUtil::int_to_string(new_id)); + writeString(" 0 R"); + } + else + { + unparseObject(child, level, flags); + } +} + +void +QPDFWriter::writeTrailer(trailer_e which, int size, bool xref_stream, int prev) +{ + QPDFObjectHandle trailer = pdf.getTrailer(); + if (! 
xref_stream) + { + writeString("trailer <<"); + } + writeStringQDF("\n"); + if (which == t_lin_second) + { + writeString(" /Size "); + writeString(QUtil::int_to_string(size)); + } + else + { + std::set<std::string> keys = trailer.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + std::string const& key = *iter; + writeStringQDF(" "); + writeStringNoQDF(" "); + writeString(QPDF_Name::normalizeName(key)); + writeString(" "); + if (key == "/Size") + { + writeString(QUtil::int_to_string(size)); + if (which == t_lin_first) + { + writeString(" /Prev "); + int pos = this->pipeline->getCount(); + writeString(QUtil::int_to_string(prev)); + int nspaces = pos + 11 - this->pipeline->getCount(); + assert(nspaces >= 0); + for (int i = 0; i < nspaces; ++i) + { + writeString(" "); + } + } + } + else + { + unparseChild(trailer.getKey(key), 1, 0); + } + writeStringQDF("\n"); + } + } + + // Write ID + writeStringQDF(" "); + writeString(" /ID ["); + writeString(QPDF_String(this->id1).unparse(true)); + writeString(QPDF_String(this->id2).unparse(true)); + writeString("]"); + + if (which != t_lin_second) + { + // Write reference to encryption dictionary + if (this->encrypted) + { + writeString(" /Encrypt "); + writeString(QUtil::int_to_string(this->encryption_dict_objid)); + writeString(" 0 R"); + } + } + + writeStringQDF("\n"); + writeStringNoQDF(" "); + writeString(">>"); +} + +void +QPDFWriter::unparseObject(QPDFObjectHandle object, int level, + unsigned int flags) +{ + unparseObject(object, level, flags, 0, false); +} + +void +QPDFWriter::unparseObject(QPDFObjectHandle object, int level, + unsigned int flags, int stream_length, bool compress) +{ + unsigned int child_flags = flags & ~f_stream; + + std::string indent; + for (int i = 0; i < level; ++i) + { + indent += " "; + } + + if (object.isArray()) + { + // Note: PDF spec 1.4 implementation note 121 states that + // Acrobat requires a space after the [ in the /H key of the + // 
linearization parameter dictionary. We'll do this + // unconditionally for all arrays because it looks nicer and + // doesn't make the files that much bigger. + writeString("["); + writeStringQDF("\n"); + int n = object.getArrayNItems(); + for (int i = 0; i < n; ++i) + { + writeStringQDF(indent); + writeStringQDF(" "); + writeStringNoQDF(" "); + unparseChild(object.getArrayItem(i), level + 1, child_flags); + writeStringQDF("\n"); + } + writeStringQDF(indent); + writeStringNoQDF(" "); + writeString("]"); + } + else if (object.isDictionary()) + { + writeString("<<"); + writeStringQDF("\n"); + std::set<std::string> keys = object.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + std::string const& key = *iter; + if ((flags & f_filtered) && + ((key == "/Filter") || + (key == "/DecodeParms"))) + { + continue; + } + if ((flags & f_stream) && (key == "/Length")) + { + continue; + } + writeStringQDF(indent); + writeStringQDF(" "); + writeStringNoQDF(" "); + writeString(QPDF_Name::normalizeName(key)); + writeString(" "); + unparseChild(object.getKey(key), level + 1, child_flags); + writeStringQDF("\n"); + } + + if (flags & f_stream) + { + writeStringQDF(indent); + writeStringQDF(" "); + writeString(" /Length "); + + if (this->direct_stream_lengths) + { + writeString(QUtil::int_to_string(stream_length)); + } + else + { + writeString( + QUtil::int_to_string(this->cur_stream_length_id)); + writeString(" 0 R"); + } + writeStringQDF("\n"); + if (compress && (flags & f_filtered)) + { + writeStringQDF(indent); + writeStringQDF(" "); + writeString(" /Filter /FlateDecode"); + writeStringQDF("\n"); + } + } + + writeStringQDF(indent); + writeStringNoQDF(" "); + writeString(">>"); + } + else if (object.isStream()) + { + // Write stream data to a buffer. + int old_id = object.getObjectID(); + int new_id = obj_renumber[old_id]; + if (! 
this->direct_stream_lengths) + { + this->cur_stream_length_id = new_id + 1; + } + QPDFObjectHandle stream_dict = object.getDict(); + + bool filter = (this->stream_data_mode != s_preserve); + if (this->stream_data_mode == s_compress) + { + // Don't filter if the stream is already compressed with + // FlateDecode. We don't want to make it worse by getting + // rid of a predictor or otherwising messing with it. We + // should also avoid messing with anything that's + // compressed with a lossy compression scheme, but we + // don't support any of those right now. + QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter"); + if (filter_obj.isName() && (filter_obj.getName() == "/FlateDecode")) + { + QTC::TC("qpdf", "QPDFWriter not recompressing /FlateDecode"); + filter = false; + } + } + bool normalize = false; + bool compress = false; + if (this->normalize_content && normalized_streams.count(old_id)) + { + normalize = true; + filter = true; + } + else if (filter && (this->stream_data_mode == s_compress)) + { + compress = true; + QTC::TC("qpdf", "QPDFWriter compressing uncompressed stream"); + } + + flags |= f_stream; + + pushPipeline(new Pl_Buffer("stream data")); + activatePipelineStack(); + bool filtered = + object.pipeStreamData(this->pipeline, filter, normalize, compress); + PointerHolder<Buffer> stream_data; + popPipelineStack(&stream_data); + if (filtered) + { + flags |= f_filtered; + } + else + { + compress = false; + } + + this->cur_stream_length = stream_data.getPointer()->getSize(); + unparseObject(stream_dict, 0, flags, this->cur_stream_length, compress); + writeString("\nstream\n"); + pushEncryptionFilter(); + writeBuffer(stream_data); + popPipelineStack(); + + if (this->qdf_mode) + { + if (this->pipeline->getLastChar() != '\n') + { + writeString("\n"); + this->added_newline = true; + } + else + { + this->added_newline = false; + } + } + writeString("endstream"); + } + else if (object.isString()) + { + std::string val; + if (this->encrypted && + (! 
(flags & f_in_ostream)) && + (! this->cur_data_key.empty())) + { + val = object.getStringValue(); + char* tmp = QUtil::copy_string(val); + unsigned int vlen = val.length(); + RC4 rc4((unsigned char const*)this->cur_data_key.c_str(), + this->cur_data_key.length()); + rc4.process((unsigned char*)tmp, vlen); + val = QPDF_String(std::string(tmp, vlen)).unparse(); + delete [] tmp; + } + else + { + val = object.unparseResolved(); + } + writeString(val); + } + else + { + writeString(object.unparseResolved()); + } +} + +void +QPDFWriter::writeObjectStreamOffsets(std::vector<int>& offsets, + int first_obj) +{ + for (unsigned int i = 0; i < offsets.size(); ++i) + { + if (i != 0) + { + writeStringQDF("\n"); + writeStringNoQDF(" "); + } + writeString(QUtil::int_to_string(i + first_obj)); + writeString(" "); + writeString(QUtil::int_to_string(offsets[i])); + } + writeString("\n"); +} + +void +QPDFWriter::writeObjectStream(QPDFObjectHandle object) +{ + // Note: object might be null if this is a place-holder for an + // object stream that we are generating from scratch. + + int old_id = object.getObjectID(); + int new_id = obj_renumber[old_id]; + + std::vector<int> offsets; + int first = 0; + + // Generate stream itself. We have to do this in two passes so we + // can calculate offsets in the first pass. + PointerHolder<Buffer> stream_buffer; + int first_obj = -1; + bool compressed = false; + for (int pass = 1; pass <= 2; ++pass) + { + if (pass == 1) + { + pushDiscardFilter(); + } + else + { + // Adjust offsets to skip over comment before first object + + first = offsets[0]; + for (std::vector<int>::iterator iter = offsets.begin(); + iter != offsets.end(); ++iter) + { + *iter -= first; + } + + // Take one pass at writing pairs of numbers so we can get + // their size information + pushDiscardFilter(); + writeObjectStreamOffsets(offsets, first_obj); + first += this->pipeline->getCount(); + popPipelineStack(); + + // Set up a stream to write the stream data into a buffer. 
+ Pipeline* next = pushPipeline(new Pl_Buffer("object stream")); + if (! ((this->stream_data_mode == s_uncompress) || this->qdf_mode)) + { + compressed = true; + next = pushPipeline( + new Pl_Flate("compress object stream", next, + Pl_Flate::a_deflate)); + } + activatePipelineStack(); + writeObjectStreamOffsets(offsets, first_obj); + } + + int count = 0; + for (std::set<int>::iterator iter = + this->object_stream_to_objects[old_id].begin(); + iter != this->object_stream_to_objects[old_id].end(); + ++iter, ++count) + { + int obj = *iter; + int new_obj = this->obj_renumber[obj]; + if (first_obj == -1) + { + first_obj = new_obj; + } + if (this->qdf_mode) + { + writeString("%% Object stream: object " + + QUtil::int_to_string(new_obj) + ", index " + + QUtil::int_to_string(count) + "\n"); + } + if (pass == 1) + { + offsets.push_back(this->pipeline->getCount()); + } + writeObject(this->pdf.getObjectByID(obj, 0), count); + + this->xref[new_obj] = QPDFXRefEntry(2, new_id, count); + } + + // stream_buffer will be initialized only for pass 2 + popPipelineStack(&stream_buffer); + } + + // Write the object + openObject(new_id); + setDataKey(new_id); + writeString("<<"); + writeStringQDF("\n "); + writeString(" /Type /ObjStm"); + writeStringQDF("\n "); + writeString(" /Length " + + QUtil::int_to_string(stream_buffer.getPointer()->getSize())); + writeStringQDF("\n "); + if (compressed) + { + writeString(" /Filter /FlateDecode"); + } + writeString(" /N " + QUtil::int_to_string(offsets.size())); + writeStringQDF("\n "); + writeString(" /First " + QUtil::int_to_string(first)); + if (! object.isNull()) + { + // If the original object has an /Extends key, preserve it. 
+ QPDFObjectHandle dict = object.getDict(); + QPDFObjectHandle extends = dict.getKey("/Extends"); + if (extends.isIndirect()) + { + QTC::TC("qpdf", "QPDFWriter copy Extends"); + writeStringQDF("\n "); + writeString(" /Extends "); + unparseChild(extends, 1, f_in_ostream); + } + } + writeStringQDF("\n"); + writeStringNoQDF(" "); + writeString(">>\nstream\n"); + if (this->encrypted) + { + QTC::TC("qpdf", "QPDFWriter encrypt object stream"); + } + pushEncryptionFilter(); + writeBuffer(stream_buffer); + popPipelineStack(); + writeString("endstream"); + this->cur_data_key.clear(); + closeObject(new_id); +} + +void +QPDFWriter::writeObject(QPDFObjectHandle object, int object_stream_index) +{ + int old_id = object.getObjectID(); + + if ((object_stream_index == -1) && + (this->object_stream_to_objects.count(old_id))) + { + writeObjectStream(object); + return; + } + + int new_id = obj_renumber[old_id]; + if (this->qdf_mode) + { + if (this->page_object_to_seq.count(old_id)) + { + writeString("%% Page "); + writeString( + QUtil::int_to_string( + this->page_object_to_seq[old_id])); + writeString("\n"); + } + if (this->contents_to_page_seq.count(old_id)) + { + writeString("%% Contents for page "); + writeString( + QUtil::int_to_string( + this->contents_to_page_seq[old_id])); + writeString("\n"); + } + } + if (object_stream_index == -1) + { + openObject(new_id); + setDataKey(new_id); + unparseObject(object, 0, 0); + this->cur_data_key.clear(); + closeObject(new_id); + } + else + { + unparseObject(object, 0, f_in_ostream); + writeString("\n"); + } + + if ((! 
this->direct_stream_lengths) && object.isStream()) + { + if (this->qdf_mode) + { + if (this->added_newline) + { + writeString("%QDF: ignore_newline\n"); + } + } + openObject(new_id + 1); + writeString(QUtil::int_to_string(this->cur_stream_length)); + closeObject(new_id + 1); + } +} + +void +QPDFWriter::generateID() +{ + // Note: we can't call generateID() at the time of construction + // since the caller hasn't yet had a chance to call setStaticID(), + // but we need to generate it before computing encryption + // dictionary parameters. This is why we call this function both + // from setEncryptionParameters() and from write() and return + // immediately if the ID has already been generated. + + if (! this->id2.empty()) + { + return; + } + + QPDFObjectHandle trailer = pdf.getTrailer(); + + std::string result; + + if (this->static_id) + { + // For test suite use only... + static char tmp[] = {0x31, 0x41, 0x59, 0x26, + 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, + 0x33, 0x83, 0x27, 0x95, + 0x00}; + result = tmp; + } + else + { + // The PDF specification has guidelines for creating IDs, but it + // states clearly that the only thing that's really important is + // that it is very likely to be unique. We can't really follow + // the guidelines in the spec exactly because we haven't written + // the file yet. This scheme should be fine though. 
+ + std::string seed; + seed += QUtil::int_to_string((int)time(0)); + seed += " QPDF "; + seed += filename; + seed += " "; + if (trailer.hasKey("/Info")) + { + std::set<std::string> keys = trailer.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + QPDFObjectHandle obj = trailer.getKey(*iter); + if (obj.isString()) + { + seed += " "; + seed += obj.getStringValue(); + } + } + } + + MD5 m; + m.encodeString(seed.c_str()); + MD5::Digest digest; + m.digest(digest); + result = std::string((char*)digest, sizeof(MD5::Digest)); + } + + // If /ID already exists, follow the spec: use the original first + // word and generate a new second word. Otherwise, we'll use the + // generated ID for both. + + this->id2 = result; + if (trailer.hasKey("/ID")) + { + // Note: keep /ID from old file even if --static-id was given. + this->id1 = trailer.getKey("/ID").getArrayItem(0).getStringValue(); + } + else + { + this->id1 = this->id2; + } +} + +void +QPDFWriter::initializeSpecialStreams() +{ + // Mark all page content streams in case we are filtering or + // normalizing. 
+ std::vector<QPDFObjectHandle> pages = pdf.getAllPages(); + int num = 0; + for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + QPDFObjectHandle& page = *iter; + this->page_object_to_seq[page.getObjectID()] = ++num; + QPDFObjectHandle contents = page.getKey("/Contents"); + std::vector<int> contents_objects; + if (contents.isArray()) + { + int n = contents.getArrayNItems(); + for (int i = 0; i < n; ++i) + { + contents_objects.push_back( + contents.getArrayItem(i).getObjectID()); + } + } + else if (contents.isStream()) + { + contents_objects.push_back(contents.getObjectID()); + } + + for (std::vector<int>::iterator iter = contents_objects.begin(); + iter != contents_objects.end(); ++iter) + { + this->contents_to_page_seq[*iter] = num; + this->normalized_streams.insert(*iter); + } + } +} + +void +QPDFWriter::preserveObjectStreams() +{ + this->pdf.getObjectStreamData(this->object_to_object_stream); +} + +void +QPDFWriter::generateObjectStreams() +{ + // Basic strategy: make a list of objects that can go into an + // object stream. Then figure out how many object streams are + // needed so that we can distribute objects approximately evenly + // without having any object stream exceed 100 members. We don't + // have to worry about linearized files here -- if the file is + // linearized, we take care of excluding things that aren't + // allowed here later. + + // This code doesn't do anything with /Extends. 
+ + std::vector<int> const& eligible = this->pdf.getCompressibleObjects(); + unsigned int n_object_streams = (eligible.size() + 99) / 100; + unsigned int n_per = eligible.size() / n_object_streams; + if (n_per * n_object_streams < eligible.size()) + { + ++n_per; + } + unsigned int n = 0; + int cur_ostream = 0; + for (std::vector<int>::const_iterator iter = eligible.begin(); + iter != eligible.end(); ++iter) + { + if ((n % n_per) == 0) + { + if (n > 0) + { + QTC::TC("qpdf", "QPDFWriter generate >1 ostream"); + } + n = 0; + } + if (n == 0) + { + // Construct a new null object as the "original" object + // stream. The rest of the code knows that this means + // we're creating the object stream from scratch. + cur_ostream = this->pdf.makeIndirectObject( + QPDFObjectHandle::newNull()).getObjectID(); + } + this->object_to_object_stream[*iter] = cur_ostream; + ++n; + } +} + +void +QPDFWriter::write() +{ + // Do preliminary setup + + if (this->linearized) + { + this->qdf_mode = false; + } + + if (this->qdf_mode) + { + if (! this->normalize_content_set) + { + this->normalize_content = true; + } + if (! this->stream_data_mode_set) + { + this->stream_data_mode = s_uncompress; + } + } + + if (this->encrypted) + { + // Encryption has been explicitly set + this->preserve_encryption = false; + } + else if (this->normalize_content || + (this->stream_data_mode == s_uncompress) || + this->qdf_mode) + { + // Encryption makes looking at contents pretty useless. If + // the user explicitly encrypted though, we still obey that. + this->preserve_encryption = false; + } + + if (preserve_encryption) + { + copyEncryptionParameters(); + } + + if (this->qdf_mode || this->normalize_content || + (this->stream_data_mode == s_uncompress)) + { + initializeSpecialStreams(); + } + + if (this->qdf_mode) + { + // Generate indirect stream lengths for qdf mode since fix-qdf + // uses them for storing recomputed stream length data. 
+ // Certain streams such as object streams, xref streams, and + // hint streams always get direct stream lengths. + this->direct_stream_lengths = false; + } + + switch (this->object_stream_mode) + { + case o_disable: + // no action required + break; + + case o_preserve: + preserveObjectStreams(); + break; + + case o_generate: + generateObjectStreams(); + break; + + // no default so gcc will warn for missing case tag + } + + if (this->linearized) + { + // Page dictionaries are not allowed to be compressed objects. + std::vector<QPDFObjectHandle> pages = pdf.getAllPages(); + for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + QPDFObjectHandle& page = *iter; + int objid = page.getObjectID(); + if (this->object_to_object_stream.count(objid)) + { + QTC::TC("qpdf", "QPDFWriter uncompressing page dictionary"); + this->object_to_object_stream.erase(objid); + } + } + } + + if (this->linearized || this->encrypted) + { + // The document catalog is not allowed to be compressed in + // linearized files either. It also appears that Adobe Reader + // 8.0.0 has a bug that prevents it from being able to handle + // encrypted files with compressed document catalogs, so we + // disable them in that case as well. + int objid = pdf.getRoot().getObjectID(); + if (this->object_to_object_stream.count(objid)) + { + QTC::TC("qpdf", "QPDFWriter uncompressing root"); + this->object_to_object_stream.erase(objid); + } + } + + // Generate reverse mapping from object stream to objects + for (std::map<int, int>::iterator iter = + this->object_to_object_stream.begin(); + iter != this->object_to_object_stream.end(); ++iter) + { + int obj = (*iter).first; + int stream = (*iter).second; + this->object_stream_to_objects[stream].insert(obj); + this->max_ostream_index = + std::max(this->max_ostream_index, + (int)this->object_stream_to_objects[stream].size() - 1); + } + + if (! 
this->object_stream_to_objects.empty()) + { + this->min_pdf_version = "1.5"; + } + + generateID(); + + pdf.trimTrailerForWrite(); + pdf.flattenScalarReferences(); + + if (this->linearized) + { + writeLinearized(); + } + else + { + writeStandard(); + } + + this->pipeline->finish(); + if (this->close_file) + { + fclose(this->file); + } + this->file = 0; +} + +void +QPDFWriter::enqueuePart(std::vector<QPDFObjectHandle>& part) +{ + for (std::vector<QPDFObjectHandle>::iterator iter = part.begin(); + iter != part.end(); ++iter) + { + enqueueObject(*iter); + } +} + +void +QPDFWriter::writeEncryptionDictionary() +{ + this->encryption_dict_objid = openObject(this->encryption_dict_objid); + writeString("<<"); + for (std::map<std::string, std::string>::iterator iter = + this->encryption_dictionary.begin(); + iter != this->encryption_dictionary.end(); ++iter) + { + writeString(" "); + writeString((*iter).first); + writeString(" "); + writeString((*iter).second); + } + writeString(" >>"); + closeObject(this->encryption_dict_objid); +} + +void +QPDFWriter::writeHeader() +{ + std::string version = pdf.getPDFVersion(); + if (! this->min_pdf_version.empty()) + { + float ov = atof(version.c_str()); + float mv = atof(this->min_pdf_version.c_str()); + if (mv > ov) + { + version = this->min_pdf_version; + } + } + + writeString("%PDF-"); + writeString(version); + // This string of binary characters would not be valid UTF-8, so + // it really should be treated as binary. 
+ writeString("\n%¿÷¢þ\n"); + writeStringQDF("%QDF-1.0\n\n"); +} + +void +QPDFWriter::writeHintStream(int hint_id) +{ + PointerHolder<Buffer> hint_buffer; + int S = 0; + int O = 0; + pdf.generateHintStream( + this->xref, this->lengths, this->obj_renumber, hint_buffer, S, O); + + openObject(hint_id); + setDataKey(hint_id); + + unsigned char* hs = hint_buffer.getPointer()->getBuffer(); + unsigned long hlen = hint_buffer.getPointer()->getSize(); + + writeString("<< /Filter /FlateDecode /S "); + writeString(QUtil::int_to_string(S)); + if (O) + { + writeString(" /O "); + writeString(QUtil::int_to_string(O)); + } + writeString(" /Length "); + writeString(QUtil::int_to_string(hlen)); + writeString(" >>\nstream\n"); + + if (this->encrypted) + { + QTC::TC("qpdf", "QPDFWriter encrypted hint stream"); + } + pushEncryptionFilter(); + writeBuffer(hint_buffer); + popPipelineStack(); + + if (hs[hlen - 1] != '\n') + { + writeString("\n"); + } + writeString("endstream"); + closeObject(hint_id); +} + +int +QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size) +{ + return writeXRefTable(which, first, last, size, 0, false, 0, 0, 0); +} + +int +QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size, + int prev, bool suppress_offsets, + int hint_id, int hint_offset, int hint_length) +{ + writeString("xref\n"); + writeString(QUtil::int_to_string(first)); + writeString(" "); + writeString(QUtil::int_to_string(last - first + 1)); + int space_before_zero = this->pipeline->getCount(); + writeString("\n"); + for (int i = first; i <= last; ++i) + { + if (i == 0) + { + writeString("0000000000 65535 f \n"); + } + else + { + int offset = 0; + if (! 
suppress_offsets) + { + offset = this->xref[i].getOffset(); + if ((hint_id != 0) && + (i != hint_id) && + (offset >= hint_offset)) + { + offset += hint_length; + } + } + writeString(QUtil::int_to_string(offset, 10)); + writeString(" 00000 n \n"); + } + } + writeTrailer(which, size, false, prev); + writeString("\n"); + return space_before_zero; +} + +int +QPDFWriter::writeXRefStream(int objid, int max_id, int max_offset, + trailer_e which, int first, int last, int size) +{ + return writeXRefStream(objid, max_id, max_offset, + which, first, last, size, 0, 0, 0, 0); +} + +int +QPDFWriter::writeXRefStream(int xref_id, int max_id, int max_offset, + trailer_e which, int first, int last, int size, + int prev, int hint_id, + int hint_offset, int hint_length) +{ + int xref_offset = this->pipeline->getCount(); + int space_before_zero = xref_offset - 1; + + // field 1 contains offsets and object stream identifiers + int f1_size = std::max(bytesNeeded(max_offset), + bytesNeeded(max_id)); + + // field 2 contains object stream indices + int f2_size = bytesNeeded(this->max_ostream_index); + + unsigned int esize = 1 + f1_size + f2_size; + + // Must store in xref table in advance of writing the actual data + // rather than waiting for openObject to do it. + this->xref[xref_id] = QPDFXRefEntry(1, pipeline->getCount(), 0); + + Pipeline* p = pushPipeline(new Pl_Buffer("xref stream")); + bool compressed = false; + if (! 
((this->stream_data_mode == s_uncompress) || this->qdf_mode)) + { + compressed = true; + p = pushPipeline( + new Pl_Flate("compress xref", p, Pl_Flate::a_deflate)); + p = pushPipeline( + new Pl_PNGFilter( + "pngify xref", p, Pl_PNGFilter::a_encode, esize, 0)); + } + activatePipelineStack(); + for (int i = first; i <= last; ++i) + { + QPDFXRefEntry& e = this->xref[i]; + switch (e.getType()) + { + case 0: + writeBinary(0, 1); + writeBinary(0, f1_size); + writeBinary(0, f2_size); + break; + + case 1: + { + int offset = e.getOffset(); + if ((hint_id != 0) && + (i != hint_id) && + (offset >= hint_offset)) + { + offset += hint_length; + } + writeBinary(1, 1); + writeBinary(offset, f1_size); + writeBinary(0, f2_size); + } + break; + + case 2: + writeBinary(2, 1); + writeBinary(e.getObjStreamNumber(), f1_size); + writeBinary(e.getObjStreamIndex(), f2_size); + break; + + default: + throw QEXC::Internal("invalid type writing xref stream"); + break; + } + } + PointerHolder<Buffer> xref_data; + popPipelineStack(&xref_data); + + openObject(xref_id); + writeString("<<"); + writeStringQDF("\n "); + writeString(" /Type /XRef"); + writeStringQDF("\n "); + writeString(" /Length " + + QUtil::int_to_string(xref_data.getPointer()->getSize())); + if (compressed) + { + writeStringQDF("\n "); + writeString(" /Filter /FlateDecode"); + writeStringQDF("\n "); + writeString(" /DecodeParms << /Columns " + + QUtil::int_to_string(esize) + " /Predictor 12 >>"); + } + writeStringQDF("\n "); + writeString(" /W [ 1 " + + QUtil::int_to_string(f1_size) + " " + + QUtil::int_to_string(f2_size) + " ]"); + if (! 
((first == 0) && (last == size - 1))) + { + writeString(" /Index [ " + + QUtil::int_to_string(first) + " " + + QUtil::int_to_string(last - first + 1) + " ]"); + } + writeTrailer(which, size, true, prev); + writeString("\nstream\n"); + writeBuffer(xref_data); + writeString("\nendstream"); + closeObject(xref_id); + return space_before_zero; +} + +void +QPDFWriter::writeLinearized() +{ + // Optimize file and enqueue objects in order + + bool need_xref_stream = (! this->object_to_object_stream.empty()); + pdf.optimize(this->object_to_object_stream); + + std::vector<QPDFObjectHandle> part4; + std::vector<QPDFObjectHandle> part6; + std::vector<QPDFObjectHandle> part7; + std::vector<QPDFObjectHandle> part8; + std::vector<QPDFObjectHandle> part9; + pdf.getLinearizedParts(this->object_to_object_stream, + part4, part6, part7, part8, part9); + + // Object number sequence: + // + // second half + // second half uncompressed objects + // second half xref stream, if any + // second half compressed objects + // first half + // linearization dictionary + // first half xref stream, if any + // part 4 uncompresesd objects + // encryption dictionary, if any + // hint stream + // part 6 uncompressed objects + // first half compressed objects + // + + // Second half objects + int second_half_uncompressed = part7.size() + part8.size() + part9.size(); + int second_half_first_obj = 1; + int after_second_half = 1 + second_half_uncompressed; + this->next_objid = after_second_half; + int second_half_xref = 0; + if (need_xref_stream) + { + second_half_xref = this->next_objid++; + } + // Assign numbers to all compressed objects in the second half. 
+ std::vector<QPDFObjectHandle>* vecs2[] = {&part7, &part8, &part9}; + for (int i = 0; i < 3; ++i) + { + for (std::vector<QPDFObjectHandle>::iterator iter = (*vecs2[i]).begin(); + iter != (*vecs2[i]).end(); ++iter) + { + assignCompressedObjectNumbers((*iter).getObjectID()); + } + } + int second_half_end = this->next_objid - 1; + int second_trailer_size = this->next_objid; + + // First half objects + int first_half_start = this->next_objid; + int lindict_id = this->next_objid++; + int first_half_xref = 0; + if (need_xref_stream) + { + first_half_xref = this->next_objid++; + } + int part4_first_obj = this->next_objid; + this->next_objid += part4.size(); + int after_part4 = this->next_objid; + if (this->encrypted) + { + this->encryption_dict_objid = this->next_objid++; + } + int hint_id = this->next_objid++; + int part6_first_obj = this->next_objid; + this->next_objid += part6.size(); + int after_part6 = this->next_objid; + // Assign numbers to all compressed objects in the first half + std::vector<QPDFObjectHandle>* vecs1[] = {&part4, &part6}; + for (int i = 0; i < 2; ++i) + { + for (std::vector<QPDFObjectHandle>::iterator iter = (*vecs1[i]).begin(); + iter != (*vecs1[i]).end(); ++iter) + { + assignCompressedObjectNumbers((*iter).getObjectID()); + } + } + int first_half_end = this->next_objid - 1; + int first_trailer_size = this->next_objid; + + int part4_end_marker = part4.back().getObjectID(); + int part6_end_marker = part6.back().getObjectID(); + int space_before_zero = 0; + int file_size = 0; + int part6_end_offset = 0; + int first_half_max_obj_offset = 0; + int second_xref_offset = 0; + int first_xref_end = 0; + int second_xref_end = 0; + + this->next_objid = part4_first_obj; + enqueuePart(part4); + assert(this->next_objid = after_part4); + this->next_objid = part6_first_obj; + enqueuePart(part6); + assert(this->next_objid == after_part6); + this->next_objid = second_half_first_obj; + enqueuePart(part7); + enqueuePart(part8); + enqueuePart(part9); + 
assert(this->next_objid == after_second_half); + + int hint_length = 0; + PointerHolder<Buffer> hint_buffer; + + // Write file in two passes. Part numbers refer to PDF spec 1.4. + + for (int pass = 1; pass <= 2; ++pass) + { + if (pass == 1) + { + pushDiscardFilter(); + } + + // Part 1: header + + writeHeader(); + + // Part 2: linearization parameter dictionary. Save enough + // space to write real dictionary. 150 characters is enough + // space if all numerical values in the parameter dictionary + // are 10 digits long plus a few extra characters for safety. + + int pos = this->pipeline->getCount(); + openObject(lindict_id); + writeString("<<"); + if (pass == 2) + { + std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages(); + int first_page_object = obj_renumber[pages[0].getObjectID()]; + int npages = pages.size(); + + writeString(" /Linearized 1 /L "); + writeString(QUtil::int_to_string(file_size + hint_length)); + // Implementation note 121 states that a space is + // mandatory after this open bracket. + writeString(" /H [ "); + writeString(QUtil::int_to_string(this->xref[hint_id].getOffset())); + writeString(" "); + writeString(QUtil::int_to_string(hint_length)); + writeString(" ] /O "); + writeString(QUtil::int_to_string(first_page_object)); + writeString(" /E "); + writeString(QUtil::int_to_string(part6_end_offset + hint_length)); + writeString(" /N "); + writeString(QUtil::int_to_string(npages)); + writeString(" /T "); + writeString(QUtil::int_to_string(space_before_zero + hint_length)); + } + writeString(" >>"); + closeObject(lindict_id); + static int const pad = 150; + int spaces = (pos + pad - this->pipeline->getCount()); + assert(spaces >= 0); + for (int i = 0; i < spaces; ++i) + { + writeString(" "); + } + writeString("\n"); + + // Part 3: first page cross reference table and trailer. 
+ + int first_xref_offset = this->pipeline->getCount(); + int hint_offset = 0; + if (pass == 2) + { + hint_offset = this->xref[hint_id].getOffset(); + } + if (need_xref_stream) + { + // Must pad here too. + if (pass == 1) + { + // first_half_max_obj_offset is very likely to fall + // within the first 64K of the document (thus + // requiring two bytes for offsets) since it is the + // offset of the last uncompressed object in page 1. + // We allow for it to do otherwise though. + first_half_max_obj_offset = 65535; + } + pos = this->pipeline->getCount(); + writeXRefStream(first_half_xref, first_half_end, + first_half_max_obj_offset, + t_lin_first, first_half_start, first_half_end, + first_trailer_size, + hint_length + second_xref_offset, + hint_id, hint_offset, hint_length); + int endpos = this->pipeline->getCount(); + if (pass == 1) + { + // Pad so we have enough room for the real xref + // stream. In an extremely unlikely worst case, + // first_half_max_obj_offset could be enough larger to + // require two extra bytes beyond what we calculated + // in pass 1. This means we need to save two extra + // bytes for each xref entry. To that, we'll add 10 + // extra bytes for number length increases. + int possible_extra = + 10 + (2 * (first_half_end - first_half_start + 1)); + for (int i = 0; i < possible_extra; ++i) + { + writeString(" "); + } + first_xref_end = this->pipeline->getCount(); + } + else + { + // Pad so that the next object starts at the same + // place as in pass 1. 
+ for (int i = 0; i < first_xref_end - endpos; ++i) + { + writeString(" "); + } + assert(this->pipeline->getCount() == first_xref_end); + } + writeString("\n"); + } + else + { + writeXRefTable(t_lin_first, first_half_start, first_half_end, + first_trailer_size, hint_length + second_xref_offset, + (pass == 1), hint_id, hint_offset, hint_length); + writeString("startxref\n0\n%%EOF\n"); + } + + // Parts 4 through 9 + + for (std::list<QPDFObjectHandle>::iterator iter = + this->object_queue.begin(); + iter != this->object_queue.end(); ++iter) + { + QPDFObjectHandle cur_object = (*iter); + if (cur_object.getObjectID() == part6_end_marker) + { + first_half_max_obj_offset = this->pipeline->getCount(); + } + writeObject(cur_object); + if (cur_object.getObjectID() == part4_end_marker) + { + if (this->encrypted) + { + writeEncryptionDictionary(); + } + if (pass == 1) + { + this->xref[hint_id] = + QPDFXRefEntry(1, this->pipeline->getCount(), 0); + } + else + { + // Part 5: hint stream + writeBuffer(hint_buffer); + } + } + if (cur_object.getObjectID() == part6_end_marker) + { + part6_end_offset = this->pipeline->getCount(); + } + } + + // Part 10: overflow hint stream -- not used + + // Part 11: main cross reference table and trailer + + second_xref_offset = this->pipeline->getCount(); + if (need_xref_stream) + { + space_before_zero = + writeXRefStream(second_half_xref, + second_half_end, second_xref_offset, + t_lin_second, 0, second_half_end, + second_trailer_size); + if (pass == 1) + { + // Add some padding -- we need an accurate file_size + // number, and this could change if the pass 2 xref + // stream compresses differently. There shouldn't be + // much difference, so we'll just pad 100 characters. + // This is unscientific though, and may not always + // work. 
The only way we could really get around this + // would be to seek back to the beginning of the file + // and update /L in the linearization dictionary, but + // that would be the only thing in the design that + // would require the output file to be seekable. + for (int i = 0; i < 99; ++i) + { + writeString(" "); + } + writeString("\n"); + second_xref_end = this->pipeline->getCount(); + } + else + { + // Make the file size the same. + int pos = this->pipeline->getCount(); + while (pos < second_xref_end + hint_length - 1) + { + ++pos; + writeString(" "); + } + writeString("\n"); + // If this assertion fails, maybe we didn't have + // enough padding above. + assert(this->pipeline->getCount() == + second_xref_end + hint_length); + } + } + else + { + space_before_zero = + writeXRefTable(t_lin_second, 0, second_half_end, + second_trailer_size); + } + writeString("startxref\n"); + writeString(QUtil::int_to_string(first_xref_offset)); + writeString("\n%%EOF\n"); + + if (pass == 1) + { + // Close first pass pipeline + file_size = this->pipeline->getCount(); + popPipelineStack(); + + // Save hint offset since it will be set to zero by + // calling openObject. + int hint_offset = this->xref[hint_id].getOffset(); + + // Write hint stream to a buffer + pushPipeline(new Pl_Buffer("hint buffer")); + activatePipelineStack(); + writeHintStream(hint_id); + popPipelineStack(&hint_buffer); + hint_length = hint_buffer.getPointer()->getSize(); + + // Restore hint offset + this->xref[hint_id] = QPDFXRefEntry(1, hint_offset, 0); + } + } +} + +void +QPDFWriter::writeStandard() +{ + // Start writing + + writeHeader(); + + // Put root first on queue. + QPDFObjectHandle trailer = pdf.getTrailer(); + enqueueObject(trailer.getKey("/Root")); + + // Next place any other objects referenced from the trailer + // dictionary into the queue, handling direct objects recursively. + // Root is already there, so enqueuing it a second time is a + // no-op. 
+ std::set<std::string> keys = trailer.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + enqueueObject(trailer.getKey(*iter)); + } + + // Now start walking queue, output each object + while (this->object_queue.size()) + { + QPDFObjectHandle cur_object = this->object_queue.front(); + this->object_queue.pop_front(); + writeObject(cur_object); + } + + // Write out the encryption dictionary, if any + if (this->encrypted) + { + writeEncryptionDictionary(); + } + + // Now write out xref. next_objid is now the number of objects. + off_t xref_offset = this->pipeline->getCount(); + if (this->object_stream_to_objects.empty()) + { + // Write regular cross-reference table + // Write regular cross-reference table + writeXRefTable(t_normal, 0, this->next_objid - 1, this->next_objid); + } + else + { + // Write cross-reference stream. + int xref_id = this->next_objid++; + writeXRefStream(xref_id, xref_id, xref_offset, t_normal, + 0, this->next_objid - 1, this->next_objid); + } + writeString("startxref\n"); + writeString(QUtil::int_to_string(xref_offset)); + writeString("\n%%EOF\n"); +} diff --git a/libqpdf/QPDFXRefEntry.cc b/libqpdf/QPDFXRefEntry.cc new file mode 100644 index 00000000..669a2f13 --- /dev/null +++ b/libqpdf/QPDFXRefEntry.cc @@ -0,0 +1,61 @@ + +#include <qpdf/QPDFXRefEntry.hh> +#include <qpdf/QPDFExc.hh> +#include <qpdf/QUtil.hh> + +QPDFXRefEntry::QPDFXRefEntry() : + type(0), + field1(0), + field2(0) +{ +} + +QPDFXRefEntry::QPDFXRefEntry(int type, int field1, int field2) : + type(type), + field1(field1), + field2(field2) +{ + if ((type < 1) || (type > 2)) + { + throw QPDFExc("invalid xref type " + QUtil::int_to_string(type)); + } +} + +int +QPDFXRefEntry::getType() const +{ + return this->type; +} + +int +QPDFXRefEntry::getOffset() const +{ + if (this->type != 1) + { + throw QPDFExc( + "getOffset called for xref entry of type != 1"); + } + return this->field1; +} + +int +QPDFXRefEntry::getObjStreamNumber() 
const +{ + if (this->type != 2) + { + throw QPDFExc( + "getObjStreamNumber called for xref entry of type != 2"); + } + return this->field1; +} + +int +QPDFXRefEntry::getObjStreamIndex() const +{ + if (this->type != 2) + { + throw QPDFExc( + "getObjStreamIndex called for xref entry of type != 2"); + } + return this->field2; +} diff --git a/libqpdf/QPDF_Array.cc b/libqpdf/QPDF_Array.cc new file mode 100644 index 00000000..d1edbfdd --- /dev/null +++ b/libqpdf/QPDF_Array.cc @@ -0,0 +1,51 @@ + +#include <qpdf/QPDF_Array.hh> + +#include <qpdf/QEXC.hh> + +QPDF_Array::QPDF_Array(std::vector<QPDFObjectHandle> const& items) : + items(items) +{ +} + +QPDF_Array::~QPDF_Array() +{ +} + +std::string +QPDF_Array::unparse() +{ + std::string result = "[ "; + for (std::vector<QPDFObjectHandle>::iterator iter = this->items.begin(); + iter != this->items.end(); ++iter) + { + result += (*iter).unparse(); + result += " "; + } + result += "]"; + return result; +} + +int +QPDF_Array::getNItems() const +{ + return this->items.size(); +} + +QPDFObjectHandle +QPDF_Array::getItem(int n) const +{ + if ((n < 0) || (n >= (int)this->items.size())) + { + throw QEXC::Internal("bounds array accessing QPDF_Array element"); + } + return this->items[n]; +} + +void +QPDF_Array::setItem(int n, QPDFObjectHandle const& oh) +{ + // Call getItem for bounds checking + (void) getItem(n); + this->items[n] = oh; +} diff --git a/libqpdf/QPDF_Bool.cc b/libqpdf/QPDF_Bool.cc new file mode 100644 index 00000000..2b50c4c2 --- /dev/null +++ b/libqpdf/QPDF_Bool.cc @@ -0,0 +1,23 @@ + +#include <qpdf/QPDF_Bool.hh> + +QPDF_Bool::QPDF_Bool(bool val) : + val(val) +{ +} + +QPDF_Bool::~QPDF_Bool() +{ +} + +std::string +QPDF_Bool::unparse() +{ + return (val ? 
"true" : "false"); +} + +bool +QPDF_Bool::getVal() const +{ + return this->val; +} diff --git a/libqpdf/QPDF_Dictionary.cc b/libqpdf/QPDF_Dictionary.cc new file mode 100644 index 00000000..654df688 --- /dev/null +++ b/libqpdf/QPDF_Dictionary.cc @@ -0,0 +1,84 @@ + +#include <qpdf/QPDF_Dictionary.hh> + +#include <qpdf/QPDF_Null.hh> +#include <qpdf/QPDF_Name.hh> + +QPDF_Dictionary::QPDF_Dictionary( + std::map<std::string, QPDFObjectHandle> const& items) : + items(items) +{ +} + +QPDF_Dictionary::~QPDF_Dictionary() +{ +} + +std::string +QPDF_Dictionary::unparse() +{ + std::string result = "<< "; + for (std::map<std::string, QPDFObjectHandle>::iterator iter = + this->items.begin(); + iter != this->items.end(); ++iter) + { + result += QPDF_Name::normalizeName((*iter).first) + + " " + (*iter).second.unparse() + " "; + } + result += ">>"; + return result; +} + +bool +QPDF_Dictionary::hasKey(std::string const& key) +{ + return ((this->items.count(key) > 0) && + (! this->items[key].isNull())); +} + +QPDFObjectHandle +QPDF_Dictionary::getKey(std::string const& key) +{ + // PDF spec says fetching a non-existent key from a dictionary + // returns the null object. 
+ if (this->items.count(key)) + { + // May be a null object + return (*(this->items.find(key))).second; + } + else + { + return QPDFObjectHandle::newNull(); + } +} + +std::set<std::string> +QPDF_Dictionary::getKeys() +{ + std::set<std::string> result; + for (std::map<std::string, QPDFObjectHandle>::const_iterator iter = + this->items.begin(); + iter != this->items.end(); ++iter) + { + if (hasKey((*iter).first)) + { + result.insert((*iter).first); + } + } + return result; +} + +void +QPDF_Dictionary::replaceKey(std::string const& key, + QPDFObjectHandle const& value) +{ + // add or replace value + this->items[key] = value; +} + +void +QPDF_Dictionary::removeKey(std::string const& key) +{ + // no-op if key does not exist + this->items.erase(key); +} diff --git a/libqpdf/QPDF_Integer.cc b/libqpdf/QPDF_Integer.cc new file mode 100644 index 00000000..988519d0 --- /dev/null +++ b/libqpdf/QPDF_Integer.cc @@ -0,0 +1,25 @@ + +#include <qpdf/QPDF_Integer.hh> + +#include <qpdf/QUtil.hh> + +QPDF_Integer::QPDF_Integer(int val) : + val(val) +{ +} + +QPDF_Integer::~QPDF_Integer() +{ +} + +std::string +QPDF_Integer::unparse() +{ + return QUtil::int_to_string(this->val); +} + +int +QPDF_Integer::getVal() const +{ + return this->val; +} diff --git a/libqpdf/QPDF_Name.cc b/libqpdf/QPDF_Name.cc new file mode 100644 index 00000000..f57ced04 --- /dev/null +++ b/libqpdf/QPDF_Name.cc @@ -0,0 +1,46 @@ + +#include <qpdf/QPDF_Name.hh> + +QPDF_Name::QPDF_Name(std::string const& name) : + name(name) +{ +} + +QPDF_Name::~QPDF_Name() +{ +} + +std::string +QPDF_Name::normalizeName(std::string const& name) +{ + std::string result; + char num[4]; + result += name[0]; + for (unsigned int i = 1; i < name.length(); ++i) + { + char ch = name[i]; + // Don't use locale/ctype here; follow PDF spec guidlines. 
+ if (strchr("#()<>[]{}/%", ch) || (ch < 33) || (ch > 126)) + { + sprintf(num, "#%02x", (unsigned char) ch); + result += num; + } + else + { + result += ch; + } + } + return result; +} + +std::string +QPDF_Name::unparse() +{ + return normalizeName(this->name); +} + +std::string +QPDF_Name::getName() const +{ + return this->name; +} diff --git a/libqpdf/QPDF_Null.cc b/libqpdf/QPDF_Null.cc new file mode 100644 index 00000000..57a78b7e --- /dev/null +++ b/libqpdf/QPDF_Null.cc @@ -0,0 +1,12 @@ + +#include <qpdf/QPDF_Null.hh> + +QPDF_Null::~QPDF_Null() +{ +} + +std::string +QPDF_Null::unparse() +{ + return "null"; +} diff --git a/libqpdf/QPDF_Real.cc b/libqpdf/QPDF_Real.cc new file mode 100644 index 00000000..87a19cb2 --- /dev/null +++ b/libqpdf/QPDF_Real.cc @@ -0,0 +1,23 @@ + +#include <qpdf/QPDF_Real.hh> + +QPDF_Real::QPDF_Real(std::string const& val) : + val(val) +{ +} + +QPDF_Real::~QPDF_Real() +{ +} + +std::string +QPDF_Real::unparse() +{ + return this->val; +} + +std::string +QPDF_Real::getVal() +{ + return this->val; +} diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc new file mode 100644 index 00000000..9694f837 --- /dev/null +++ b/libqpdf/QPDF_Stream.cc @@ -0,0 +1,309 @@ + +#include <qpdf/QPDF_Stream.hh> + +#include <qpdf/QEXC.hh> +#include <qpdf/QUtil.hh> +#include <qpdf/Pipeline.hh> +#include <qpdf/Pl_Flate.hh> +#include <qpdf/Pl_PNGFilter.hh> +#include <qpdf/Pl_RC4.hh> +#include <qpdf/Pl_Buffer.hh> +#include <qpdf/Pl_ASCII85Decoder.hh> +#include <qpdf/Pl_ASCIIHexDecoder.hh> +#include <qpdf/Pl_LZWDecoder.hh> + +#include <qpdf/QTC.hh> +#include <qpdf/QPDF.hh> +#include <qpdf/QPDFExc.hh> +#include <qpdf/Pl_QPDFTokenizer.hh> + +QPDF_Stream::QPDF_Stream(QPDF* qpdf, int objid, int generation, + QPDFObjectHandle stream_dict, + off_t offset, int length) : + qpdf(qpdf), + objid(objid), + generation(generation), + stream_dict(stream_dict), + offset(offset), + length(length) +{ + if (! 
// Decide whether this stream's /DecodeParms and /Filter entries are all
// ones this function accepts.
//
// Outputs:
//   filters           - filter names from /Filter are appended, in
//                       order; the vector is not cleared first, and it
//                       is left untouched when the decode parameters
//                       are rejected (early return below)
//   predictor         - /Predictor value, default 1
//   columns           - /Columns value, default 0
//   early_code_change - true unless /EarlyChange is an integer other
//                       than 1
//
// Returns true when every decode parameter and every filter is
// accepted, false otherwise.
//
// Throws QPDFExc when /DecodeParms is neither null nor a dictionary,
// or when /Filter is not null, a name, or an array consisting only of
// names.
bool
QPDF_Stream::filterable(std::vector<std::string>& filters,
                        int& predictor, int& columns,
                        bool& early_code_change)
{
    // Initialize values to their defaults as per the PDF spec
    predictor = 1;
    columns = 0;
    early_code_change = true;

    bool filterable = true;

    // See if we can support any decode parameters that are specified.

    QPDFObjectHandle decode_obj =
        this->stream_dict.getKey("/DecodeParms");
    if (decode_obj.isNull())
    {
        // no problem
    }
    else if (decode_obj.isDictionary())
    {
        // Every key must be one of the three recognized below and must
        // hold an integer; anything else makes the stream unfilterable.
        std::set<std::string> keys = decode_obj.getKeys();
        for (std::set<std::string>::iterator iter = keys.begin();
             iter != keys.end(); ++iter)
        {
            std::string const& key = *iter;
            if (key == "/Predictor")
            {
                QPDFObjectHandle predictor_obj = decode_obj.getKey(key);
                if (predictor_obj.isInteger())
                {
                    // Only predictor values 1 and 12 are accepted.
                    predictor = predictor_obj.getIntValue();
                    if (! ((predictor == 1) || (predictor == 12)))
                    {
                        filterable = false;
                    }
                }
                else
                {
                    filterable = false;
                }
            }
            else if (key == "/EarlyChange")
            {
                QPDFObjectHandle earlychange_obj = decode_obj.getKey(key);
                if (earlychange_obj.isInteger())
                {
                    // Only 0 and 1 are accepted.
                    int earlychange = earlychange_obj.getIntValue();
                    early_code_change = (earlychange == 1);
                    if (! ((earlychange == 0) || (earlychange == 1)))
                    {
                        filterable = false;
                    }
                }
                else
                {
                    filterable = false;
                }
            }
            else if (key == "/Columns")
            {
                QPDFObjectHandle columns_obj = decode_obj.getKey(key);
                if (columns_obj.isInteger())
                {
                    columns = columns_obj.getIntValue();
                }
                else
                {
                    filterable = false;
                }
            }
            else
            {
                // Unrecognized decode parameter
                filterable = false;
            }
        }
    }
    else
    {
        throw QPDFExc(qpdf->getFilename(), this->offset,
                      "invalid decode parameters object type for this stream");
    }

    if ((predictor > 1) && (columns == 0))
    {
        // invalid: a predictor above 1 was given without a non-zero
        // /Columns
        filterable = false;
    }

    if (! filterable)
    {
        // Rejected on decode parameters alone; /Filter is not examined,
        // so a malformed /Filter does not throw on this path.
        return false;
    }

    // Check filters

    QPDFObjectHandle filter_obj = this->stream_dict.getKey("/Filter");
    bool filters_okay = true;

    if (filter_obj.isNull())
    {
        // No filters
    }
    else if (filter_obj.isName())
    {
        // One filter
        filters.push_back(filter_obj.getName());
    }
    else if (filter_obj.isArray())
    {
        // Potentially multiple filters
        int n = filter_obj.getArrayNItems();
        for (int i = 0; i < n; ++i)
        {
            QPDFObjectHandle item = filter_obj.getArrayItem(i);
            if (item.isName())
            {
                filters.push_back(item.getName());
            }
            else
            {
                filters_okay = false;
            }
        }
    }
    else
    {
        filters_okay = false;
    }

    if (! filters_okay)
    {
        QTC::TC("qpdf", "QPDF_Stream invalid filter");
        throw QPDFExc(qpdf->getFilename(), this->offset,
                      "invalid filter object type for this stream");
    }

    // `filters' now contains a list of filters to be applied in
    // order.  See which ones we can support.

    for (std::vector<std::string>::iterator iter = filters.begin();
         iter != filters.end(); ++iter)
    {
        std::string const& filter = *iter;
        if (! ((filter == "/FlateDecode") ||
               (filter == "/LZWDecode") ||
               (filter == "/ASCII85Decode") ||
               (filter == "/ASCIIHexDecode")))
        {
            filterable = false;
        }
    }

    return filterable;
}
// Report whether ch falls in one of the two ranges treated as printable
// here: 32 through 126, or 160 and above.  Deliberately hand-coded so
// that nothing locale-dependent is consulted.
static bool is_iso_latin1_printable(unsigned char ch)
{
    if (ch >= 160)
    {
        return true;
    }
    return ! ((ch < 32) || (ch > 126));
}
+ nonprintable = 0; + break; + } + } + } + + // Use hex notation if more than 20% of the characters are not + // printable in the current locale. Uniformly distributed random + // characters will not pass this test even with ISO-Latin-1 in + // which 76% are either printable or in the set of standard + // escaped characters. + if (5 * nonprintable > val.length()) + { + use_hexstring = true; + } + } + std::string result; + if (use_hexstring) + { + result += "<"; + char num[3]; + for (unsigned int i = 0; i < this->val.length(); ++i) + { + sprintf(num, "%02x", (unsigned char) this->val[i]); + result += num; + } + result += ">"; + } + else + { + result += "("; + char num[5]; + for (unsigned int i = 0; i < this->val.length(); ++i) + { + char ch = this->val[i]; + switch (ch) + { + case '\n': + result += "\\n"; + break; + + case '\r': + result += "\\r"; + break; + + case '\t': + result += "\\t"; + break; + + case '\b': + result += "\\b"; + break; + + case '\f': + result += "\\f"; + break; + + case '(': + result += "\\("; + break; + + case ')': + result += "\\)"; + break; + + case '\\': + result += "\\\\"; + break; + + default: + if (is_iso_latin1_printable(ch)) + { + result += this->val[i]; + } + else + { + sprintf(num, "\\%03o", (unsigned char)ch); + result += num; + } + break; + } + } + result += ")"; + } + + return result; +} + +std::string +QPDF_String::getVal() const +{ + return this->val; +} + +std::string +QPDF_String::getUTF8Val() const +{ + std::string result; + unsigned int len = this->val.length(); + if ((len >= 2) && (len % 2 == 0) && + (this->val[0] == '\xfe') && (this->val[1] == '\xff')) + { + // This is a Unicode string using big-endian UTF-16. This + // code is not actually correct as it doesn't properly handle + // characters past 0xffff. 
+ for (unsigned int i = 2; i < len; i += 2) + { + result += QUtil::toUTF8(((unsigned char) this->val[i] << 8) + + ((unsigned char) this->val[i+1])); + } + } + else + { + for (unsigned int i = 0; i < len; ++i) + { + result += QUtil::toUTF8((unsigned char) this->val[i]); + } + } + return result; +} diff --git a/libqpdf/QPDF_encryption.cc b/libqpdf/QPDF_encryption.cc new file mode 100644 index 00000000..e5e2d8be --- /dev/null +++ b/libqpdf/QPDF_encryption.cc @@ -0,0 +1,441 @@ +// This file implements methods from the QPDF class that involve +// encryption. + +#include <qpdf/QPDF.hh> + +#include <qpdf/QPDFExc.hh> + +#include <qpdf/QUtil.hh> +#include <qpdf/Pl_RC4.hh> +#include <qpdf/RC4.hh> +#include <qpdf/MD5.hh> + +static char const padding_string[] = { + 0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41, + 0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08, + 0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80, + 0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a +}; + +static unsigned int const O_key_bytes = sizeof(MD5::Digest); +static unsigned int const id_bytes = 16; +static unsigned int const key_bytes = 32; + +void +pad_or_truncate_password(std::string const& password, char k1[key_bytes]) +{ + int password_bytes = std::min(key_bytes, password.length()); + int pad_bytes = key_bytes - password_bytes; + memcpy(k1, password.c_str(), password_bytes); + memcpy(k1 + password_bytes, padding_string, pad_bytes); +} + +void +QPDF::trim_user_password(std::string& user_password) +{ + // Although unnecessary, this routine trims the padding string + // from the end of a user password. Its only purpose is for + // recovery of user passwords which is done in the test suite. 
+ char const* cstr = user_password.c_str(); + size_t len = user_password.length(); + if (len < key_bytes) + { + return; + } + + char* p = 0; + while ((p = strchr(cstr, '\x28')) != 0) + { + if (memcmp(p, padding_string, len - (p - cstr)) == 0) + { + user_password = user_password.substr(0, p - cstr); + return; + } + } +} + +static std::string +pad_or_truncate_password(std::string const& password) +{ + char k1[key_bytes]; + pad_or_truncate_password(password, k1); + return std::string(k1, key_bytes); +} + +static void +iterate_md5_digest(MD5& md5, MD5::Digest& digest, int iterations) +{ + md5.digest(digest); + + for (int i = 0; i < iterations; ++i) + { + MD5 m; + m.encodeDataIncrementally((char*)digest, sizeof(digest)); + m.digest(digest); + } +} + + +static void +iterate_rc4(unsigned char* data, int data_len, + unsigned char* okey, int key_len, + int iterations, bool reverse) +{ + unsigned char* key = new unsigned char[key_len]; + for (int i = 0; i < iterations; ++i) + { + int const xor_value = (reverse ? 
iterations - 1 - i : i); + for (int j = 0; j < key_len; ++j) + { + key[j] = okey[j] ^ xor_value; + } + RC4 rc4(key, key_len); + rc4.process(data, data_len); + } + delete [] key; +} + +std::string +QPDF::compute_data_key(std::string const& encryption_key, + int objid, int generation) +{ + // Algorithm 3.1 from the PDF 1.4 Reference Manual + + std::string result = encryption_key; + + // Append low three bytes of object ID and low two bytes of generation + result += (char) (objid & 0xff); + result += (char) ((objid >> 8) & 0xff); + result += (char) ((objid >> 16) & 0xff); + result += (char) (generation & 0xff); + result += (char) ((generation >> 8) & 0xff); + + MD5 md5; + md5.encodeDataIncrementally(result.c_str(), result.length()); + MD5::Digest digest; + md5.digest(digest); + return std::string((char*) digest, + std::min(result.length(), (size_t) 16)); +} + +std::string +QPDF::compute_encryption_key( + std::string const& password, EncryptionData const& data) +{ + // Algorithm 3.2 from the PDF 1.4 Reference Manual + + MD5 md5; + md5.encodeDataIncrementally( + pad_or_truncate_password(password).c_str(), key_bytes); + md5.encodeDataIncrementally(data.O.c_str(), key_bytes); + char pbytes[4]; + pbytes[0] = (char) (data.P & 0xff); + pbytes[1] = (char) ((data.P >> 8) & 0xff); + pbytes[2] = (char) ((data.P >> 16) & 0xff); + pbytes[3] = (char) ((data.P >> 24) & 0xff); + md5.encodeDataIncrementally(pbytes, 4); + md5.encodeDataIncrementally(data.id1.c_str(), id_bytes); + MD5::Digest digest; + iterate_md5_digest(md5, digest, ((data.R == 3) ? 
50 : 0)); + return std::string((char*)digest, data.Length_bytes); +} + +static void +compute_O_rc4_key(std::string const& user_password, + std::string const& owner_password, + QPDF::EncryptionData const& data, + unsigned char key[O_key_bytes]) +{ + std::string password = owner_password; + if (password.empty()) + { + password = user_password; + } + MD5 md5; + md5.encodeDataIncrementally( + pad_or_truncate_password(password).c_str(), key_bytes); + MD5::Digest digest; + iterate_md5_digest(md5, digest, ((data.R == 3) ? 50 : 0)); + memcpy(key, digest, O_key_bytes); +} + +static std::string +compute_O_value(std::string const& user_password, + std::string const& owner_password, + QPDF::EncryptionData const& data) +{ + // Algorithm 3.3 from the PDF 1.4 Reference Manual + + unsigned char O_key[O_key_bytes]; + compute_O_rc4_key(user_password, owner_password, data, O_key); + + char upass[key_bytes]; + pad_or_truncate_password(user_password, upass); + iterate_rc4((unsigned char*) upass, key_bytes, + O_key, data.Length_bytes, (data.R == 3) ? 
20 : 1, false); + return std::string(upass, key_bytes); +} + +static +std::string +compute_U_value_R2(std::string const& user_password, + QPDF::EncryptionData const& data) +{ + // Algorithm 3.4 from the PDF 1.4 Reference Manual + + std::string k1 = QPDF::compute_encryption_key(user_password, data); + char udata[key_bytes]; + pad_or_truncate_password("", udata); + iterate_rc4((unsigned char*) udata, key_bytes, + (unsigned char*)k1.c_str(), data.Length_bytes, 1, false); + return std::string(udata, key_bytes); +} + +static +std::string +compute_U_value_R3(std::string const& user_password, + QPDF::EncryptionData const& data) +{ + // Algorithm 3.5 from the PDF 1.4 Reference Manual + + std::string k1 = QPDF::compute_encryption_key(user_password, data); + MD5 md5; + md5.encodeDataIncrementally( + pad_or_truncate_password("").c_str(), key_bytes); + md5.encodeDataIncrementally(data.id1.c_str(), data.id1.length()); + MD5::Digest digest; + md5.digest(digest); + iterate_rc4(digest, sizeof(MD5::Digest), + (unsigned char*) k1.c_str(), data.Length_bytes, 20, false); + char result[key_bytes]; + memcpy(result, digest, sizeof(MD5::Digest)); + // pad with arbitrary data -- make it consistent for the sake of + // testing + for (unsigned int i = sizeof(MD5::Digest); i < key_bytes; ++i) + { + result[i] = (char)((i * i) % 0xff); + } + return std::string(result, key_bytes); +} + +static std::string +compute_U_value(std::string const& user_password, + QPDF::EncryptionData const& data) +{ + if (data.R == 3) + { + return compute_U_value_R3(user_password, data); + } + + return compute_U_value_R2(user_password, data); +} + +static bool +check_user_password(std::string const& user_password, + QPDF::EncryptionData const& data) +{ + // Algorithm 3.6 from the PDF 1.4 Reference Manual + + std::string u_value = compute_U_value(user_password, data); + int to_compare = ((data.R == 3) ? 
sizeof(MD5::Digest) : key_bytes); + return (memcmp(data.U.c_str(), u_value.c_str(), to_compare) == 0); +} + +static bool +check_owner_password(std::string& user_password, + std::string const& owner_password, + QPDF::EncryptionData const& data) +{ + // Algorithm 3.7 from the PDF 1.4 Reference Manual + + unsigned char key[O_key_bytes]; + compute_O_rc4_key(user_password, owner_password, data, key); + unsigned char O_data[key_bytes]; + memcpy(O_data, (unsigned char*) data.O.c_str(), key_bytes); + iterate_rc4(O_data, key_bytes, key, data.Length_bytes, + (data.R == 3) ? 20 : 1, true); + std::string new_user_password = + std::string((char*)O_data, key_bytes); + bool result = false; + if (check_user_password(new_user_password, data)) + { + result = true; + user_password = new_user_password; + } + return result; +} + +void +QPDF::initializeEncryption() +{ + if (this->encryption_initialized) + { + return; + } + this->encryption_initialized = true; + + // After we initialize encryption parameters, we must used stored + // key information and never look at /Encrypt again. Otherwise, + // things could go wrong if someone mutates the encryption + // dictionary. + + if (! this->trailer.hasKey("/Encrypt")) + { + return; + } + + QPDFObjectHandle id_obj = this->trailer.getKey("/ID"); + if (! (id_obj.isArray() && + (id_obj.getArrayNItems() == 2) && + id_obj.getArrayItem(0).isString())) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "invalid /ID in trailer dictionary"); + } + + std::string id1 = id_obj.getArrayItem(0).getStringValue(); + if (id1.length() != id_bytes) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "first /ID string in trailer dictionary has " + "incorrect length"); + } + + QPDFObjectHandle encryption_dict = this->trailer.getKey("/Encrypt"); + if (! encryption_dict.isDictionary()) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "/Encrypt in trailer dictionary is not a dictionary"); + } + + if (! 
(encryption_dict.getKey("/Filter").isName() && + (encryption_dict.getKey("/Filter").getName() == "/Standard"))) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "unsupported encryption filter"); + } + + if (! (encryption_dict.getKey("/V").isInteger() && + encryption_dict.getKey("/R").isInteger() && + encryption_dict.getKey("/O").isString() && + encryption_dict.getKey("/U").isString() && + encryption_dict.getKey("/P").isInteger())) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "some encryption dictionary parameters are missing " + "or the wrong type"); + } + + int V = encryption_dict.getKey("/V").getIntValue(); + int R = encryption_dict.getKey("/R").getIntValue(); + std::string O = encryption_dict.getKey("/O").getStringValue(); + std::string U = encryption_dict.getKey("/U").getStringValue(); + unsigned int P = (unsigned int) encryption_dict.getKey("/P").getIntValue(); + + if (! (((R == 2) || (R == 3)) && + ((V == 1) || (V == 2)))) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "Unsupported /R or /V in encryption dictionary"); + } + + if (! 
((O.length() == key_bytes) && (U.length() == key_bytes))) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "incorrect length for /O and/or /P in " + "encryption dictionary"); + } + + int Length = 40; + if (encryption_dict.getKey("/Length").isInteger()) + { + Length = encryption_dict.getKey("/Length").getIntValue(); + if ((Length % 8) || (Length < 40) || (Length > 128)) + { + throw QPDFExc(this->file.getName(), this->file.getLastOffset(), + "invalid /Length value in encryption dictionary"); + } + } + + EncryptionData data(V, R, Length / 8, P, O, U, id1); + if (check_owner_password(this->user_password, this->provided_password, data)) + { + // password supplied was owner password; user_password has + // been initialized + } + else if (check_user_password(this->provided_password, data)) + { + this->user_password = this->provided_password; + } + else + { + throw QPDFExc(this->file.getName() + ": invalid password"); + } + + this->encrypted = true; + this->encryption_key = compute_encryption_key(this->user_password, data); +} + +std::string +QPDF::getKeyForObject(int objid, int generation) +{ + if (! this->encrypted) + { + throw QEXC::Internal("request for encryption key in non-encrypted PDF"); + } + + if (! 
((objid == this->cached_key_objid) && + (generation == this->cached_key_generation))) + { + this->cached_object_encryption_key = + compute_data_key(this->encryption_key, objid, generation); + this->cached_key_objid = objid; + this->cached_key_generation = generation; + } + + return this->cached_object_encryption_key; +} + +void +QPDF::decryptString(std::string& str, int objid, int generation) +{ + if (objid == 0) + { + return; + } + std::string key = getKeyForObject(objid, generation); + char* tmp = QUtil::copy_string(str); + unsigned int vlen = str.length(); + RC4 rc4((unsigned char const*)key.c_str(), key.length()); + rc4.process((unsigned char*)tmp, vlen); + str = std::string(tmp, vlen); + delete [] tmp; +} + +void +QPDF::decryptStream(Pipeline*& pipeline, int objid, int generation, + std::vector<PointerHolder<Pipeline> >& heap) +{ + std::string key = getKeyForObject(objid, generation); + pipeline = new Pl_RC4("stream decryption", pipeline, + (unsigned char*) key.c_str(), key.length()); + heap.push_back(pipeline); +} + +void +QPDF::compute_encryption_O_U( + char const* user_password, char const* owner_password, + int V, int R, int key_len, unsigned long P, + std::string const& id1, std::string& O, std::string& U) +{ + EncryptionData data(V, R, key_len, P, "", "", id1); + data.O = compute_O_value(user_password, owner_password, data); + O = data.O; + U = compute_U_value(user_password, data); +} + +std::string const& +QPDF::getUserPassword() const +{ + return this->user_password; +} diff --git a/libqpdf/QPDF_linearization.cc b/libqpdf/QPDF_linearization.cc new file mode 100644 index 00000000..6c0cf3be --- /dev/null +++ b/libqpdf/QPDF_linearization.cc @@ -0,0 +1,2103 @@ +// See doc/linearization. 
+ +#include <qpdf/QPDF.hh> + +#include <qpdf/QPDFExc.hh> +#include <qpdf/QTC.hh> +#include <qpdf/QUtil.hh> +#include <qpdf/PCRE.hh> +#include <qpdf/Pl_Buffer.hh> +#include <qpdf/Pl_Flate.hh> +#include <qpdf/Pl_Count.hh> +#include <qpdf/BitWriter.hh> +#include <qpdf/BitStream.hh> + +#include <iostream> +#include <algorithm> +#include <assert.h> +#include <math.h> + +template <class T> +static void +load_vector_int(BitStream& bit_stream, int nitems, std::vector<T>& vec, + int bits_wanted, int T::*field) +{ + // nitems times, read bits_wanted from the given bit stream, + // storing results in the ith vector entry. + + for (int i = 0; i < nitems; ++i) + { + vec[i].*field = bit_stream.getBits(bits_wanted); + } + // The PDF spec says that each hint table starts at a byte + // boundary. Each "row" actually must start on a byte boundary. + bit_stream.skipToNextByte(); +} + +template <class T> +static void +load_vector_vector(BitStream& bit_stream, + int nitems1, std::vector<T>& vec1, int T::*nitems2, + int bits_wanted, std::vector<int> T::*vec2) +{ + // nitems1 times, read nitems2 (from the ith element of vec1) items + // into the vec2 vector field of the ith item of vec1. + for (int i1 = 0; i1 < nitems1; ++i1) + { + for (int i2 = 0; i2 < vec1[i1].*nitems2; ++i2) + { + (vec1[i1].*vec2).push_back(bit_stream.getBits(bits_wanted)); + } + } + bit_stream.skipToNextByte(); +} + +bool +QPDF::checkLinearization() +{ + bool result = false; + try + { + readLinearizationData(); + result = checkLinearizationInternal(); + } + catch (QPDFExc& e) + { + std::cout << e.what() << std::endl; + } + return result; +} + +bool +QPDF::isLinearized() +{ + // If the first object in the file is a dictionary with a suitable + // /Linearized key and has an /L key that accurately indicates the + // file size, initialize this->lindict and return true. 
+ + // A linearized PDF spec's first object will be contained within + // the first 1024 bytes of the file and will be a dictionary with + // a valid /Linearized key. This routine looks for that and does + // no additional validation. + + // The PDF spec says the linearization dictionary must be + // completely contained within the first 1024 bytes of the file. + // Add a byte for a null terminator. + static int const tbuf_size = 1025; + + char* buf = new char[tbuf_size]; + this->file.seek(0, SEEK_SET); + PointerHolder<char> b(buf); // guarantee deletion + memset(buf, '\0', tbuf_size); + this->file.read(buf, tbuf_size - 1); + + static PCRE lindict_re("(?s:(\\d+)\\s+0\\s+obj\\s*<<)"); + + off_t offset = -1; + int lindict_obj = 0; + char* p = buf; + while (lindict_obj == 0) + { + PCRE::Match m(lindict_re.match(p)); + if (m) + { + offset = m.getOffset(0) + (p - buf); + lindict_obj = atoi(m.getMatch(1).c_str()); + if (m.getMatch(0).find('\n') != std::string::npos) + { + QTC::TC("qpdf", "QPDF lindict found newline"); + } + } + else + { + if ((p = (char*)memchr(p, '\0', tbuf_size - (p - buf))) != 0) + { + QTC::TC("qpdf", "QPDF lindict null found"); + while ((p - buf < tbuf_size) && (*p == 0)) + { + ++p; + } + if ((p - buf) == tbuf_size) + { + break; + } + QTC::TC("qpdf", "QPDF lindict searching after null"); + } + } + } + + if (lindict_obj == 0) + { + return false; + } + + QPDFObjectHandle candidate = QPDFObjectHandle::Factory::newIndirect( + this, lindict_obj, 0); + if (! candidate.isDictionary()) + { + return false; + } + + QPDFObjectHandle linkey = candidate.getKey("/Linearized"); + if (! 
(linkey.isNumber() && ((int)floor(linkey.getNumericValue()) == 1))) + { + return false; + } + + QPDFObjectHandle L = candidate.getKey("/L"); + if (L.isInteger()) + { + int Li = L.getIntValue(); + this->file.seek(0, SEEK_END); + if (Li != this->file.tell()) + { + QTC::TC("qpdf", "QPDF /L mismatch"); + return false; + } + else + { + this->linp.file_size = Li; + } + } + + this->lindict = candidate; + + return true; +} + +void +QPDF::readLinearizationData() +{ + // This function throws an exception (which is trapped by + // checkLinearization()) for any errors that prevent loading. + + // Hint table parsing code needs at least 32 bits in a long. + assert(sizeof(long) >= 4); + + if (! isLinearized()) + { + throw QPDFExc(this->file.getName() + " is not linearized"); + } + + // /L is read and stored in linp by isLinearized() + QPDFObjectHandle H = lindict.getKey("/H"); + QPDFObjectHandle O = lindict.getKey("/O"); + QPDFObjectHandle E = lindict.getKey("/E"); + QPDFObjectHandle N = lindict.getKey("/N"); + QPDFObjectHandle T = lindict.getKey("/T"); + QPDFObjectHandle P = lindict.getKey("/P"); + + if (! (H.isArray() && + O.isInteger() && + E.isInteger() && + N.isInteger() && + T.isInteger() && + (P.isInteger() || P.isNull()))) + { + throw QPDFExc("some keys in linearization dictionary are of " + "the wrong type"); + } + + // Hint table array: offset length [ offset length ] + unsigned int n_H_items = H.getArrayNItems(); + if (! 
((n_H_items == 2) || (n_H_items == 4))) + { + throw QPDFExc("H has the wrong number of items"); + } + + std::vector<int> H_items; + for (unsigned int i = 0; i < n_H_items; ++i) + { + QPDFObjectHandle oh(H.getArrayItem(i)); + if (oh.isInteger()) + { + H_items.push_back(oh.getIntValue()); + } + else + { + throw QPDFExc("some H items are of the wrong type"); + } + } + + // H: hint table offset/length for primary and overflow hint tables + int H0_offset = H_items[0]; + int H0_length = H_items[1]; + int H1_offset = 0; + int H1_length = 0; + if (H_items.size() == 4) + { + // Acrobat doesn't read or write these (as PDF 1.4), so we + // don't have a way to generate a test case. + // QTC::TC("qpdf", "QPDF overflow hint table"); + H1_offset = H_items[2]; + H1_length = H_items[3]; + } + + // P: first page number + int first_page = 0; + if (P.isInteger()) + { + QTC::TC("qpdf", "QPDF P present in lindict"); + first_page = P.getIntValue(); + } + else + { + QTC::TC("qpdf", "QPDF P absent in lindict"); + } + + // Store linearization parameter data + + // file_size initialized by isLinearized() + this->linp.first_page_object = O.getIntValue(); + this->linp.first_page_end = E.getIntValue(); + this->linp.npages = N.getIntValue(); + this->linp.xref_zero_offset = T.getIntValue(); + this->linp.first_page = first_page; + this->linp.H_offset = H0_offset; + this->linp.H_length = H0_length; + + // Read hint streams + + Pl_Buffer pb("hint buffer"); + QPDFObjectHandle H0 = readHintStream(pb, H0_offset, H0_length); + if (H1_offset) + { + (void) readHintStream(pb, H1_offset, H1_length); + } + + // PDF 1.4 hint tables that we ignore: + + // /T thumbnail + // /A thread information + // /E named destination + // /V interactive form + // /I information dictionary + // /C logical structure + // /L page label + + // Individual hint table offsets + QPDFObjectHandle HS = H0.getKey("/S"); // shared object + QPDFObjectHandle HO = H0.getKey("/O"); // outline + + PointerHolder<Buffer> hbp = pb.getBuffer(); 
+ Buffer* hb = hbp.getPointer(); + unsigned char const* h_buf = hb->getBuffer(); + int h_size = hb->getSize(); + + readHPageOffset(BitStream(h_buf, h_size)); + + int HSi = HS.getIntValue(); + readHSharedObject(BitStream(h_buf + HSi, h_size - HSi)); + + if (HO.isInteger()) + { + int HOi = HO.getIntValue(); + readHGeneric(BitStream(h_buf + HOi, h_size - HOi), + this->outline_hints); + } +} + +QPDFObjectHandle +QPDF::readHintStream(Pipeline& pl, off_t offset, size_t length) +{ + int obj; + int gen; + QPDFObjectHandle H = readObjectAtOffset(offset, 0, 0, obj, gen); + ObjCache& oc = this->obj_cache[ObjGen(obj, gen)]; + off_t min_end_offset = oc.end_before_space; + off_t max_end_offset = oc.end_after_space; + if (! H.isStream()) + { + throw QPDFExc("hint table is not a stream"); + } + + QPDFObjectHandle Hdict = H.getDict(); + + // Some versions of Acrobat make /Length indirect and place it + // immediately after the stream, increasing length to cover it, + // even though the specification says all objects in the + // linearization parameter dictionary must be direct. We have to + // get the file position of the end of length in this case. + QPDFObjectHandle length_obj = Hdict.getKey("/Length"); + if (length_obj.isIndirect()) + { + QTC::TC("qpdf", "QPDF hint table length indirect"); + // Force resolution + (void) length_obj.getIntValue(); + ObjCache& oc = this->obj_cache + [ObjGen(length_obj.getObjectID(), + length_obj.getGeneration())]; + min_end_offset = oc.end_before_space; + max_end_offset = oc.end_after_space; + } + else + { + QTC::TC("qpdf", "QPDF hint table length direct"); + } + off_t computed_end = offset + length; + if ((computed_end < min_end_offset) || + (computed_end > max_end_offset)) + { + std::cout << "expected = " << computed_end + << "; actual = " << min_end_offset << ".." 
+ << max_end_offset << std::endl; + throw QPDFExc("hint table length mismatch"); + } + H.pipeStreamData(&pl, true, false, false); + return Hdict; +} + +void +QPDF::readHPageOffset(BitStream h) +{ + // All comments referring to the PDF spec refer to the spec for + // version 1.4. + + HPageOffset& t = this->page_offset_hints; + + t.min_nobjects = h.getBits(32); // 1 + t.first_page_offset = h.getBits(32); // 2 + t.nbits_delta_nobjects = h.getBits(16); // 3 + t.min_page_length = h.getBits(32); // 4 + t.nbits_delta_page_length = h.getBits(16); // 5 + t.min_content_offset = h.getBits(32); // 6 + t.nbits_delta_content_offset = h.getBits(16); // 7 + t.min_content_length = h.getBits(32); // 8 + t.nbits_delta_content_length = h.getBits(16); // 9 + t.nbits_nshared_objects = h.getBits(16); // 10 + t.nbits_shared_identifier = h.getBits(16); // 11 + t.nbits_shared_numerator = h.getBits(16); // 12 + t.shared_denominator = h.getBits(16); // 13 + + unsigned int nitems = this->linp.npages; + std::vector<HPageOffsetEntry>& entries = t.entries; + entries = std::vector<HPageOffsetEntry>(nitems); + + load_vector_int(h, nitems, entries, + t.nbits_delta_nobjects, + &HPageOffsetEntry::delta_nobjects); + load_vector_int(h, nitems, entries, + t.nbits_delta_page_length, + &HPageOffsetEntry::delta_page_length); + load_vector_int(h, nitems, entries, + t.nbits_nshared_objects, + &HPageOffsetEntry::nshared_objects); + load_vector_vector(h, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_identifier, + &HPageOffsetEntry::shared_identifiers); + load_vector_vector(h, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_numerator, + &HPageOffsetEntry::shared_numerators); + load_vector_int(h, nitems, entries, + t.nbits_delta_content_offset, + &HPageOffsetEntry::delta_content_offset); + load_vector_int(h, nitems, entries, + t.nbits_delta_content_length, + &HPageOffsetEntry::delta_content_length); +} + +void +QPDF::readHSharedObject(BitStream h) +{ + 
HSharedObject& t = this->shared_object_hints; + + t.first_shared_obj = h.getBits(32); // 1 + t.first_shared_offset = h.getBits(32); // 2 + t.nshared_first_page = h.getBits(32); // 3 + t.nshared_total = h.getBits(32); // 4 + t.nbits_nobjects = h.getBits(16); // 5 + t.min_group_length = h.getBits(32); // 6 + t.nbits_delta_group_length = h.getBits(16); // 7 + + QTC::TC("qpdf", "QPDF lin nshared_total > nshared_first_page", + (t.nshared_total > t.nshared_first_page) ? 1 : 0); + + int nitems = t.nshared_total; + std::vector<HSharedObjectEntry>& entries = t.entries; + entries = std::vector<HSharedObjectEntry>(nitems); + + load_vector_int(h, nitems, entries, + t.nbits_delta_group_length, + &HSharedObjectEntry::delta_group_length); + load_vector_int(h, nitems, entries, + 1, &HSharedObjectEntry::signature_present); + for (int i = 0; i < nitems; ++i) + { + if (entries[i].signature_present) + { + // Skip 128-bit MD5 hash. These are not supported by + // acrobat, so they should probably never be there. We + // have no test case for this. + for (int j = 0; j < 4; ++j) + { + (void) h.getBits(32); + } + } + } + load_vector_int(h, nitems, entries, + t.nbits_nobjects, + &HSharedObjectEntry::nobjects_minus_one); +} + +void +QPDF::readHGeneric(BitStream h, HGeneric& t) +{ + t.first_object = h.getBits(32); // 1 + t.first_object_offset = h.getBits(32); // 2 + t.nobjects = h.getBits(32); // 3 + t.group_length = h.getBits(32); // 4 +} + +bool +QPDF::checkLinearizationInternal() +{ + // All comments referring to the PDF spec refer to the spec for + // version 1.4. 
+ + std::list<std::string> errors; + std::list<std::string> warnings; + + // Check all values in linearization parameter dictionary + + LinParameters& p = this->linp; + + // L: file size in bytes -- checked by isLinearized + + // O: object number of first page + std::vector<QPDFObjectHandle> const& pages = getAllPages(); + if (p.first_page_object != pages[0].getObjectID()) + { + QTC::TC("qpdf", "QPDF err /O mismatch"); + errors.push_back("first page object (/O) mismatch"); + } + + // N: number of pages + int npages = pages.size(); + if (p.npages != npages) + { + // Not tested in the test suite + errors.push_back("page count (/N) mismatch"); + } + + for (int i = 0; i < npages; ++i) + { + QPDFObjectHandle const& page = pages[i]; + ObjGen og(page.getObjectID(), page.getGeneration()); + if (this->xref_table[og].getType() == 2) + { + errors.push_back("page dictionary for page " + + QUtil::int_to_string(i) + " is compressed"); + } + } + + // T: offset of whitespace character preceding xref entry for object 0 + this->file.seek(p.xref_zero_offset, SEEK_SET); + while (1) + { + char ch; + this->file.read(&ch, 1); + if (! ((ch == ' ') || (ch == '\r') || (ch == '\n'))) + { + this->file.seek(-1, SEEK_CUR); + break; + } + } + if (this->file.tell() != this->first_xref_item_offset) + { + QTC::TC("qpdf", "QPDF err /T mismatch"); + errors.push_back("space before first xref item (/T) mismatch " + "(computed = " + + QUtil::int_to_string(this->first_xref_item_offset) + + "; file = " + QUtil::int_to_string(this->file.tell())); + } + + // P: first page number -- Implementation note 124 says Acrobat + // ignores this value, so we will too. + + // Check numbering of compressed objects in each xref section. + // For linearized files, all compressed objects are supposed to be + // at the end of the containing xref section if any object streams + // are in use. 
+ + if (this->uncompressed_after_compressed) + { + errors.push_back("linearized file contains an uncompressed object" + " after a compressed one in a cross-reference stream"); + } + + // Further checking requires optimization and order calculation. + // Don't allow optimization to make changes. If it has to, then + // the file is not properly linearized. We use the xref table to + // figure out which objects are compressed and which are + // uncompressed. + { // local scope + std::map<int, int> object_stream_data; + for (std::map<ObjGen, QPDFXRefEntry>::const_iterator iter = + this->xref_table.begin(); + iter != this->xref_table.end(); ++iter) + { + ObjGen const& og = (*iter).first; + QPDFXRefEntry const& entry = (*iter).second; + if (entry.getType() == 2) + { + object_stream_data[og.obj] = entry.getObjStreamNumber(); + } + } + optimize(object_stream_data, false); + calculateLinearizationData(object_stream_data); + } + + // E: offset of end of first page -- Implementation note 123 says + // Acrobat includes on extra object here by mistake. pdlin fails + // to place thumbnail images in section 9, so when thumbnails are + // present, it also gets the wrong value for /E. It also doesn't + // count outlines here when it should even though it places them + // in part 6. This code fails to put thread information + // dictionaries in part 9, so it actually gets the wrong value for + // E when threads are present. In that case, it would probably + // agree with pdlin. As of this writing, the test suite doesn't + // contain any files with threads. + + assert(! this->part6.empty()); + int min_E = -1; + int max_E = -1; + for (std::vector<QPDFObjectHandle>::iterator iter = this->part6.begin(); + iter != this->part6.end(); ++iter) + { + ObjGen og((*iter).getObjectID(), (*iter).getGeneration()); + // All objects have to have been dereferenced to be classified. 
+ assert(this->obj_cache.count(og) > 0); + ObjCache const& oc = this->obj_cache[og]; + min_E = std::max(min_E, (int)oc.end_before_space); + max_E = std::max(max_E, (int)oc.end_after_space); + } + if ((p.first_page_end < min_E) || (p.first_page_end > max_E)) + { + QTC::TC("qpdf", "QPDF warn /E mismatch"); + warnings.push_back("end of first page section (/E) mismatch: /E = " + + QUtil::int_to_string(p.first_page_end) + + "; computed = " + + QUtil::int_to_string(min_E) + ".." + + QUtil::int_to_string(max_E)); + } + + // Check hint tables + + std::map<int, int> shared_idx_to_obj; + checkHSharedObject(errors, warnings, pages, shared_idx_to_obj); + checkHPageOffset(errors, warnings, pages, shared_idx_to_obj); + checkHOutlines(warnings); + + // Report errors + + bool result = true; + + if (! errors.empty()) + { + result = false; + for (std::list<std::string>::iterator iter = errors.begin(); + iter != errors.end(); ++iter) + { + std::cout << "ERROR: " << (*iter) << std::endl; + } + } + + if (! warnings.empty()) + { + result = false; + for (std::list<std::string>::iterator iter = warnings.begin(); + iter != warnings.end(); ++iter) + { + std::cout << "WARNING: " << (*iter) << std::endl; + } + } + + return result; +} + +int +QPDF::maxEnd(ObjUser const& ou) +{ + assert(this->obj_user_to_objects.count(ou) > 0); + std::set<ObjGen> const& ogs = this->obj_user_to_objects[ou]; + int end = 0; + for (std::set<ObjGen>::iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = *iter; + assert(this->obj_cache.count(og) > 0); + end = std::max( + end, (int)(this->obj_cache[og].end_after_space)); + } + return end; +} + +int +QPDF::getLinearizationOffset(ObjGen const& og) +{ + QPDFXRefEntry entry = this->xref_table[og]; + int result = 0; + switch (entry.getType()) + { + case 1: + result = entry.getOffset(); + break; + + case 2: + // For compressed objects, return the offset of the object + // stream that contains them. 
+ result = getLinearizationOffset(ObjGen(entry.getObjStreamNumber(), 0)); + break; + + default: + throw QPDFExc( + this->file.getName(), 0, + "getLinearizationOffset called for xref entry not of type 1 or 2"); + break; + } + return result; +} + +QPDFObjectHandle +QPDF::getUncompressedObject(QPDFObjectHandle& obj, + std::map<int, int> const& object_stream_data) +{ + if (obj.isNull() || (object_stream_data.count(obj.getObjectID()) == 0)) + { + return obj; + } + else + { + int repl = (*(object_stream_data.find(obj.getObjectID()))).second; + return objGenToIndirect(ObjGen(repl, 0)); + } +} + +int +QPDF::lengthNextN(int first_object, int n, + std::list<std::string>& errors) +{ + int length = 0; + for (int i = 0; i < n; ++i) + { + ObjGen og(first_object + i, 0); + if (this->xref_table.count(og) == 0) + { + errors.push_back( + "no xref table entry for " + + QUtil::int_to_string(first_object + i) + " 0"); + } + else + { + assert(this->obj_cache.count(og) > 0); + length += this->obj_cache[og].end_after_space - + getLinearizationOffset(og); + } + } + return length; +} + +void +QPDF::checkHPageOffset(std::list<std::string>& errors, + std::list<std::string>& warnings, + std::vector<QPDFObjectHandle> const& pages, + std::map<int, int>& shared_idx_to_obj) +{ + // Implementation note 126 says Acrobat always sets + // delta_content_offset and delta_content_length in the page + // offset header dictionary to 0. It also states that + // min_content_offset in the per-page information is always 0, + // which is an incorrect value. + + // Implementation note 127 explains that Acrobat always sets item + // 8 (min_content_length) to zero, item 9 + // (nbits_delta_content_length) to the value of item 5 + // (nbits_delta_page_length), and item 7 of each per-page hint + // table (delta_content_length) to item 2 (delta_page_length) of + // that entry. Acrobat ignores these values when reading files. 
+ + // Empirically, it also seems that Acrobat sometimes puts items + // under a page's /Resources dictionary in with shared objects + // even when they are private. + + unsigned int npages = pages.size(); + int table_offset = adjusted_offset( + this->page_offset_hints.first_page_offset); + ObjGen first_page_og(pages[0].getObjectID(), pages[0].getGeneration()); + assert(this->xref_table.count(first_page_og) > 0); + int offset = getLinearizationOffset(first_page_og); + if (table_offset != offset) + { + warnings.push_back("first page object offset mismatch"); + } + + for (unsigned int pageno = 0; pageno < npages; ++pageno) + { + ObjGen page_og(pages[pageno].getObjectID(), + pages[pageno].getGeneration()); + int first_object = page_og.obj; + assert(this->xref_table.count(page_og) > 0); + offset = getLinearizationOffset(page_og); + + HPageOffsetEntry& he = this->page_offset_hints.entries[pageno]; + CHPageOffsetEntry& ce = this->c_page_offset_data.entries[pageno]; + int h_nobjects = he.delta_nobjects + + this->page_offset_hints.min_nobjects; + if (h_nobjects != ce.nobjects) + { + // This happens with pdlin when there are thumbnails. + warnings.push_back( + "object count mismatch for page " + + QUtil::int_to_string(pageno) + ": hint table = " + + QUtil::int_to_string(h_nobjects) + "; computed = " + + QUtil::int_to_string(ce.nobjects)); + } + + // Use value for number of objects in hint table rather than + // computed value if there is a discrepancy. + int length = lengthNextN(first_object, h_nobjects, errors); + int h_length = he.delta_page_length + + this->page_offset_hints.min_page_length; + if (length != h_length) + { + // This condition almost certainly indicates a bad hint + // table or a bug in this code. 
+ errors.push_back( + "page length mismatch for page " + + QUtil::int_to_string(pageno) + ": hint table = " + + QUtil::int_to_string(h_length) + "; computed length = " + + QUtil::int_to_string(length) + " (offset = " + + QUtil::int_to_string(offset) + ")"); + } + + offset += h_length; + + // Translate shared object indexes to object numbers. + std::set<int> hint_shared; + std::set<int> computed_shared; + + if ((pageno == 0) && (he.nshared_objects > 0)) + { + // pdlin and Acrobat both do this even though the spec + // states clearly and unambiguously that they should not. + warnings.push_back("page 0 has shared identifier entries"); + } + + for (int i = 0; i < he.nshared_objects; ++i) + { + int idx = he.shared_identifiers[i]; + assert(shared_idx_to_obj.count(idx) > 0); + hint_shared.insert(shared_idx_to_obj[idx]); + } + + for (int i = 0; i < ce.nshared_objects; ++i) + { + int idx = ce.shared_identifiers[i]; + assert(idx < this->c_shared_object_data.nshared_total); + int obj = this->c_shared_object_data.entries[idx].object; + computed_shared.insert(obj); + } + + for (std::set<int>::iterator iter = hint_shared.begin(); + iter != hint_shared.end(); ++iter) + { + if (! computed_shared.count(*iter)) + { + // pdlin puts thumbnails here even though it shouldn't + warnings.push_back( + "page " + QUtil::int_to_string(pageno) + + ": shared object " + QUtil::int_to_string(*iter) + + ": in hint table but not computed list"); + } + } + + for (std::set<int>::iterator iter = computed_shared.begin(); + iter != computed_shared.end(); ++iter) + { + if (! hint_shared.count(*iter)) + { + // Acrobat does not put some things including at least + // built-in fonts and procsets here, at least in some + // cases. 
+ warnings.push_back( + "page " + QUtil::int_to_string(pageno) + + ": shared object " + QUtil::int_to_string(*iter) + + ": in computed list but not hint table"); + } + } + } +} + +void +QPDF::checkHSharedObject(std::list<std::string>& errors, + std::list<std::string>& warnings, + std::vector<QPDFObjectHandle> const& pages, + std::map<int, int>& idx_to_obj) +{ + // Implementation note 125 says shared object groups always + // contain only one object. Implementation note 128 says that + // Acrobat always nbits_nobjects to zero. Implementation note 130 + // says that Acrobat does not support more than one shared object + // per group. These are all consistent. + + // Implementation note 129 states that MD5 signatures are not + // implemented in Acrobat, so signature_present must always be + // zero. + + // Implementation note 131 states that first_shared_obj and + // first_shared_offset have meaningless values for single-page + // files. + + // Empirically, Acrobat and pdlin generate incorrect values for + // these whenever there are no shared objects not referenced by + // the first page (i.e., nshared_total == nshared_first_page). + + HSharedObject& so = this->shared_object_hints; + if (so.nshared_total < so.nshared_first_page) + { + errors.push_back("shared object hint table: ntotal < nfirst_page"); + } + else + { + // The first nshared_first_page objects are consecutive + // objects starting with the first page object. The rest are + // consecutive starting from the first_shared_obj object. 
+ int cur_object = pages[0].getObjectID(); + for (int i = 0; i < so.nshared_total; ++i) + { + if (i == so.nshared_first_page) + { + QTC::TC("qpdf", "QPDF lin check shared past first page"); + if (this->part8.empty()) + { + errors.push_back( + "part 8 is empty but nshared_total > " + "nshared_first_page"); + } + else + { + int obj = this->part8[0].getObjectID(); + if (obj != so.first_shared_obj) + { + errors.push_back( + "first shared object number mismatch: " + "hint table = " + + QUtil::int_to_string(so.first_shared_obj) + + "; computed = " + + QUtil::int_to_string(obj)); + } + } + + cur_object = so.first_shared_obj; + + ObjGen og(cur_object, 0); + assert(this->xref_table.count(og) > 0); + int offset = getLinearizationOffset(og); + int h_offset = adjusted_offset(so.first_shared_offset); + if (offset != h_offset) + { + errors.push_back( + "first shared object offset mismatch: hint table = " + + QUtil::int_to_string(h_offset) + "; computed = " + + QUtil::int_to_string(offset)); + } + } + + idx_to_obj[i] = cur_object; + HSharedObjectEntry& se = so.entries[i]; + int nobjects = se.nobjects_minus_one + 1; + int length = lengthNextN(cur_object, nobjects, errors); + int h_length = so.min_group_length + se.delta_group_length; + if (length != h_length) + { + errors.push_back( + "shared object " + QUtil::int_to_string(i) + + " length mismatch: hint table = " + + QUtil::int_to_string(h_length) + "; computed = " + + QUtil::int_to_string(length)); + } + cur_object += nobjects; + } + } +} + +void +QPDF::checkHOutlines(std::list<std::string>& warnings) +{ + // Empirically, Acrobat generates the correct value for the object + // number but incorrectly stores the next object number's offset + // as the offset, at least when outlines appear in part 6. It + // also generates an incorrect value for length (specifically, the + // length that would cover the correct number of objects from the + // wrong starting place). pdlin appears to generate correct + // values in those cases. 
+ + if (this->c_outline_data.nobjects == this->outline_hints.nobjects) + { + if (this->c_outline_data.nobjects == 0) + { + return; + } + + if (this->c_outline_data.first_object == + this->outline_hints.first_object) + { + // Check length and offset. Acrobat gets these wrong. + QPDFObjectHandle outlines = getRoot().getKey("/Outlines"); + ObjGen og(outlines.getObjectID(), outlines.getGeneration()); + assert(this->xref_table.count(og) > 0); + int offset = getLinearizationOffset(og); + ObjUser ou(ObjUser::ou_root_key, "/Outlines"); + int length = maxEnd(ou) - offset; + int table_offset = + adjusted_offset(this->outline_hints.first_object_offset); + if (offset != table_offset) + { + warnings.push_back( + "incorrect offset in outlines table: hint table = " + + QUtil::int_to_string(table_offset) + + "; computed = " + QUtil::int_to_string(offset)); + } + int table_length = this->outline_hints.group_length; + if (length != table_length) + { + warnings.push_back( + "incorrect length in outlines table: hint table = " + + QUtil::int_to_string(table_length) + + "; computed = " + QUtil::int_to_string(length)); + } + } + else + { + warnings.push_back("incorrect first object number in outline " + "hints table."); + } + } + else + { + warnings.push_back("incorrect object count in outline hint table"); + } +} + +void +QPDF::showLinearizationData() +{ + try + { + readLinearizationData(); + checkLinearizationInternal(); + dumpLinearizationDataInternal(); + } + catch (QPDFExc& e) + { + std::cout << e.what() << std::endl; + } +} + +void +QPDF::dumpLinearizationDataInternal() +{ + std::cout << this->file.getName() << ": linearization data:" << std::endl + << std::endl; + + std::cout + << "file_size: " << this->linp.file_size << std::endl + << "first_page_object: " << this->linp.first_page_object << std::endl + << "first_page_end: " << this->linp.first_page_end << std::endl + << "npages: " << this->linp.npages << std::endl + << "xref_zero_offset: " << this->linp.xref_zero_offset << 
std::endl + << "first_page: " << this->linp.first_page << std::endl + << "H_offset: " << this->linp.H_offset << std::endl + << "H_length: " << this->linp.H_length << std::endl + << std::endl; + + std::cout << "Page Offsets Hint Table" << std::endl + << std::endl; + dumpHPageOffset(); + std::cout << std::endl + << "Shared Objects Hint Table" << std::endl + << std::endl; + dumpHSharedObject(); + + if (this->outline_hints.nobjects > 0) + { + std::cout << std::endl + << "Outlines Hint Table" << std::endl + << std::endl; + dumpHGeneric(this->outline_hints); + } +} + +int +QPDF::adjusted_offset(int offset) +{ + // All offsets >= H_offset have to be increased by H_length + // since all hint table location values disregard the hint table + // itself. + if (offset >= this->linp.H_offset) + { + return offset + this->linp.H_length; + } + return offset; +} + + +void +QPDF::dumpHPageOffset() +{ + HPageOffset& t = this->page_offset_hints; + std::cout + << "min_nobjects: " << t.min_nobjects + << std::endl + << "first_page_offset: " << adjusted_offset(t.first_page_offset) + << std::endl + << "nbits_delta_nobjects: " << t.nbits_delta_nobjects + << std::endl + << "min_page_length: " << t.min_page_length + << std::endl + << "nbits_delta_page_length: " << t.nbits_delta_page_length + << std::endl + << "min_content_offset: " << t.min_content_offset + << std::endl + << "nbits_delta_content_offset: " << t.nbits_delta_content_offset + << std::endl + << "min_content_length: " << t.min_content_length + << std::endl + << "nbits_delta_content_length: " << t.nbits_delta_content_length + << std::endl + << "nbits_nshared_objects: " << t.nbits_nshared_objects + << std::endl + << "nbits_shared_identifier: " << t.nbits_shared_identifier + << std::endl + << "nbits_shared_numerator: " << t.nbits_shared_numerator + << std::endl + << "shared_denominator: " << t.shared_denominator + << std::endl; + + for (int i1 = 0; i1 < this->linp.npages; ++i1) + { + HPageOffsetEntry& pe = t.entries[i1]; + std::cout + 
<< "Page " << i1 << ":" << std::endl + << " nobjects: " << pe.delta_nobjects + t.min_nobjects + << std::endl + << " length: " << pe.delta_page_length + t.min_page_length + << std::endl + // content offset is relative to page, not file + << " content_offset: " + << pe.delta_content_offset + t.min_content_offset << std::endl + << " content_length: " + << pe.delta_content_length + t.min_content_length << std::endl + << " nshared_objects: " << pe.nshared_objects << std::endl; + for (int i2 = 0; i2 < pe.nshared_objects; ++i2) + { + std::cout << " identifier " << i2 << ": " + << pe.shared_identifiers[i2] << std::endl; + std::cout << " numerator " << i2 << ": " + << pe.shared_numerators[i2] << std::endl; + } + } +} + +void +QPDF::dumpHSharedObject() +{ + HSharedObject& t = this->shared_object_hints; + std::cout + << "first_shared_obj: " << t.first_shared_obj + << std::endl + << "first_shared_offset: " << adjusted_offset(t.first_shared_offset) + << std::endl + << "nshared_first_page: " << t.nshared_first_page + << std::endl + << "nshared_total: " << t.nshared_total + << std::endl + << "nbits_nobjects: " << t.nbits_nobjects + << std::endl + << "min_group_length: " << t.min_group_length + << std::endl + << "nbits_delta_group_length: " << t.nbits_delta_group_length + << std::endl; + + for (int i = 0; i < t.nshared_total; ++i) + { + HSharedObjectEntry& se = t.entries[i]; + std::cout << "Shared Object " << i << ":" << std::endl; + std::cout << " group length: " + << se.delta_group_length + t.min_group_length << std::endl; + // PDF spec says signature present nobjects_minus_one are + // always 0, so print them only if they have a non-zero value. 
+ if (se.signature_present) + { + std::cout << " signature present" << std::endl; + } + if (se.nobjects_minus_one != 0) + { + std::cout << " nobjects: " + << se.nobjects_minus_one + 1 << std::endl; + } + } +} + +void +QPDF::dumpHGeneric(HGeneric& t) +{ + std::cout + << "first_object: " << t.first_object + << std::endl + << "first_object_offset: " << adjusted_offset(t.first_object_offset) + << std::endl + << "nobjects: " << t.nobjects + << std::endl + << "group_length: " << t.group_length + << std::endl; +} + +QPDFObjectHandle +QPDF::objGenToIndirect(ObjGen const& og) +{ + return getObjectByID(og.obj, og.gen); +} + +void +QPDF::calculateLinearizationData(std::map<int, int> const& object_stream_data) +{ + // This function calculates the ordering of objects, divides them + // into the appropriate parts, and computes some values for the + // linearization parameter dictionary and hint tables. The file + // must be optimized (via calling optimize()) prior to calling + // this function. Note that actual offsets and lengths are not + // computed here, but anything related to object ordering is. + + if (this->object_to_obj_users.empty()) + { + // Note that we can't call optimize here because we don't know + // whether it should be called with or without allow changes. + throw QEXC::Internal("QPDF::calculateLinearizationData " + "called before optimize()"); + } + + // Separate objects into the categories sufficient for us to + // determine which part of the linearized file should contain the + // object. This categorization is useful for other purposes as + // well. Part numbers refer to version 1.4 of the PDF spec. + + // Parts 1, 3, 5, 10, and 11 don't contain any objects from the + // original file (except the trailer dictionary in part 11). + + // Part 4 is the document catalog (root) and the following root + // keys: /ViewerPreferences, /PageMode, /Threads, /OpenAction, + // /AcroForm, /Encrypt. 
Note that Thread information dictionaries + // are supposed to appear in part 9, but we are disregarding that + // recommendation for now. + + // Part 6 is the first page section. It includes all remaining + // objects referenced by the first page including shared objects + // but not including thumbnails. Additionally, if /PageMode is + // /Outlines, then information from /Outlines also appears here. + + // Part 7 contains remaining objects private to pages other than + // the first page. + + // Part 8 contains all remaining shared objects except those that + // are shared only within thumbnails. + + // Part 9 contains all remaining objects. + + // We sort objects into the following categories: + + // * open_document: part 4 + + // * first_page_private: part 6 + + // * first_page_shared: part 6 + + // * other_page_private: part 7 + + // * other_page_shared: part 8 + + // * thumbnail_private: part 9 + + // * thumbnail_shared: part 9 + + // * other: part 9 + + // * outlines: part 6 or 9 + + QPDFObjectHandle root = getRoot(); + bool outlines_in_first_page = false; + QPDFObjectHandle pagemode = root.getKey("/PageMode"); + QTC::TC("qpdf", "QPDF categorize pagemode present", + pagemode.isName() ? 1 : 0); + if (pagemode.isName()) + { + if (pagemode.getName() == "/UseOutlines") + { + if (root.hasKey("/Outlines")) + { + outlines_in_first_page = true; + } + else + { + QTC::TC("qpdf", "QPDF UseOutlines but no Outlines"); + } + } + QTC::TC("qpdf", "QPDF categorize pagemode outlines", + outlines_in_first_page ? 
1 : 0); + } + + std::set<std::string> open_document_keys; + open_document_keys.insert("/ViewerPreferences"); + open_document_keys.insert("/PageMode"); + open_document_keys.insert("/Threads"); + open_document_keys.insert("/OpenAction"); + open_document_keys.insert("/AcroForm"); + + std::set<ObjGen> lc_open_document; + std::set<ObjGen> lc_first_page_private; + std::set<ObjGen> lc_first_page_shared; + std::set<ObjGen> lc_other_page_private; + std::set<ObjGen> lc_other_page_shared; + std::set<ObjGen> lc_thumbnail_private; + std::set<ObjGen> lc_thumbnail_shared; + std::set<ObjGen> lc_other; + std::set<ObjGen> lc_outlines; + std::set<ObjGen> lc_root; + + for (std::map<ObjGen, std::set<ObjUser> >::iterator oiter = + this->object_to_obj_users.begin(); + oiter != this->object_to_obj_users.end(); ++oiter) + { + ObjGen const& og = (*oiter).first; + + std::set<ObjUser>& ous = (*oiter).second; + + bool in_open_document = false; + bool in_first_page = false; + int other_pages = 0; + int thumbs = 0; + int others = 0; + bool in_outlines = false; + bool is_root = false; + + for (std::set<ObjUser>::iterator uiter = ous.begin(); + uiter != ous.end(); ++uiter) + { + ObjUser const& ou = *uiter; + switch (ou.ou_type) + { + case ObjUser::ou_trailer_key: + if (ou.key == "/Encrypt") + { + in_open_document = true; + } + else + { + ++others; + } + break; + + case ObjUser::ou_thumb: + ++thumbs; + break; + + case ObjUser::ou_root_key: + if (open_document_keys.count(ou.key) > 0) + { + in_open_document = true; + } + else if (ou.key == "/Outlines") + { + in_outlines = true; + } + else + { + ++others; + } + break; + + case ObjUser::ou_page: + if (ou.pageno == 0) + { + in_first_page = true; + } + else + { + ++other_pages; + } + break; + + case ObjUser::ou_root: + is_root = true; + break; + + case ObjUser::ou_bad: + throw QEXC::Internal("QPDF::calculateLinearizationData: " + "invalid user type"); + break; + } + } + + if (is_root) + { + lc_root.insert(og); + } + else if (in_outlines) + { + 
lc_outlines.insert(og); + } + else if (in_open_document) + { + lc_open_document.insert(og); + } + else if ((in_first_page) && + (others == 0) && (other_pages == 0) && (thumbs == 0)) + { + lc_first_page_private.insert(og); + } + else if (in_first_page) + { + lc_first_page_shared.insert(og); + } + else if ((other_pages == 1) && (others == 0) && (thumbs == 0)) + { + lc_other_page_private.insert(og); + } + else if (other_pages > 1) + { + lc_other_page_shared.insert(og); + } + else if ((thumbs == 1) && (others == 0)) + { + lc_thumbnail_private.insert(og); + } + else if (thumbs > 1) + { + lc_thumbnail_shared.insert(og); + } + else + { + lc_other.insert(og); + } + } + + // Generate ordering for objects in the output file. Sometimes we + // just dump right from a set into a vector. Rather than + // optimizing this by going straight into the vector, we'll leave + // these phases separate for now. That way, this section can be + // concerned only with ordering, and the above section can be + // concerned only with categorization. Note that sets of ObjGens + // are sorted by ObjGen. In a linearized file, objects appear in + // sequence with the possible exception of hints tables which we + // won't see here anyway. That means that running + // calculateLinearizationData() on a linearized file should give + // results identical to the original file ordering. + + // We seem to traverse the page tree a lot in this code, but we + // can address this for a future code optimization if necessary. + // Premature optimization is the root of all evil. + std::vector<QPDFObjectHandle> pages; + { // local scope + // Map all page objects to the containing object stream. This + // should be a no-op in a properly linearized file. 
+ std::vector<QPDFObjectHandle> t = getAllPages(); + for (std::vector<QPDFObjectHandle>::iterator iter = t.begin(); + iter != t.end(); ++iter) + { + pages.push_back(getUncompressedObject(*iter, object_stream_data)); + } + } + unsigned int npages = pages.size(); + + // We will be initializing some values of the computed hint + // tables. Specifically, we can initialize any items that deal + // with object numbers or counts but not any items that deal with + // lengths or offsets. The code that writes linearized files will + // have to fill in these values during the first pass. The + // validation code can compute them relatively easily given the + // rest of the information. + + this->c_linp.npages = npages; + this->c_page_offset_data.entries = std::vector<CHPageOffsetEntry>(npages); + + // Part 4: open document objects. We don't care about the order. + + assert(lc_root.size() == 1); + this->part4.push_back(objGenToIndirect(*(lc_root.begin()))); + for (std::set<ObjGen>::iterator iter = lc_open_document.begin(); + iter != lc_open_document.end(); ++iter) + { + this->part4.push_back(objGenToIndirect(*iter)); + } + + // Part 6: first page objects. Note: implementation note 124 + // states that Acrobat always treats page 0 as the first page for + // linearization regardless of /OpenAction. pdlin doesn't provide + // any option to set this and also disregards /OpenAction. We + // will do the same. + + // First, place the actual first page object itself. + ObjGen first_page_og(pages[0].getObjectID(), pages[0].getGeneration()); + if (! 
lc_first_page_private.count(first_page_og)) + { + throw QEXC::Internal("QPDF::calculateLinearizationData: first page " + "object not in lc_first_page_private"); + } + lc_first_page_private.erase(first_page_og); + this->c_linp.first_page_object = pages[0].getObjectID(); + this->part6.push_back(pages[0]); + + // The PDF spec "recommends" an order for the rest of the objects, + // but we are going to disregard it except to the extent that it + // groups private and shared objects contiguously for the sake of + // hint tables. + + for (std::set<ObjGen>::iterator iter = lc_first_page_private.begin(); + iter != lc_first_page_private.end(); ++iter) + { + this->part6.push_back(objGenToIndirect(*iter)); + } + + for (std::set<ObjGen>::iterator iter = lc_first_page_shared.begin(); + iter != lc_first_page_shared.end(); ++iter) + { + this->part6.push_back(objGenToIndirect(*iter)); + } + + // Place the outline dictionary if it goes in the first page section. + if (outlines_in_first_page) + { + pushOutlinesToPart(this->part6, lc_outlines, object_stream_data); + } + + // Fill in page offset hint table information for the first page. + // The PDF spec says that nshared_objects should be zero for the + // first page. pdlin does not appear to obey this, but it fills + // in garbage values for all the shared object identifiers on the + // first page. + + this->c_page_offset_data.entries[0].nobjects = this->part6.size(); + + // Part 7: other pages' private objects + + // For each page in order: + for (unsigned int i = 1; i < npages; ++i) + { + // Place this page's page object + + ObjGen page_og(pages[i].getObjectID(), pages[i].getGeneration()); + if (! 
lc_other_page_private.count(page_og)) + { + throw QEXC::Internal( + "QPDF::calculateLinearizationData: page object for page " + + QUtil::int_to_string(i) + " not in lc_other_page_private"); + } + lc_other_page_private.erase(page_og); + this->part7.push_back(pages[i]); + + // Place all non-shared objects referenced by this page, + // updating the page object count for the hint table. + + this->c_page_offset_data.entries[i].nobjects = 1; + + ObjUser ou(ObjUser::ou_page, i); + assert(this->obj_user_to_objects.count(ou) > 0); + std::set<ObjGen> ogs = this->obj_user_to_objects[ou]; + for (std::set<ObjGen>::iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = (*iter); + if (lc_other_page_private.count(og)) + { + lc_other_page_private.erase(og); + this->part7.push_back(objGenToIndirect(og)); + ++this->c_page_offset_data.entries[i].nobjects; + } + } + } + // That should have covered all part7 objects. + if (! lc_other_page_private.empty()) + { + throw QEXC::Internal( + "QPDF::calculateLinearizationData: lc_other_page_private is " + "not empty after generation of part7"); + } + + // Part 8: other pages' shared objects + + // Order is unimportant. + for (std::set<ObjGen>::iterator iter = lc_other_page_shared.begin(); + iter != lc_other_page_shared.end(); ++iter) + { + this->part8.push_back(objGenToIndirect(*iter)); + } + + // Part 9: other objects + + // The PDF specification makes recommendations on ordering here. + // We follow them only to a limited extent. Specifically, we put + // the pages tree first, then private thumbnail objects in page + // order, then shared thumbnail objects, and then outlines (unless + // in part 6). After that, we throw all remaining objects in + // arbitrary order. + + // Place the pages tree. + std::set<ObjGen> pages_ogs = + this->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")]; + assert(! 
pages_ogs.empty()); + for (std::set<ObjGen>::iterator iter = pages_ogs.begin(); + iter != pages_ogs.end(); ++iter) + { + ObjGen const& og = *iter; + if (lc_other.count(og)) + { + lc_other.erase(og); + this->part9.push_back(objGenToIndirect(og)); + } + } + + // Place private thumbnail images in page order. Slightly more + // information would be required if we were going to bother with + // thumbnail hint tables. + for (unsigned int i = 0; i < npages; ++i) + { + QPDFObjectHandle thumb = pages[i].getKey("/Thumb"); + thumb = getUncompressedObject(thumb, object_stream_data); + if (! thumb.isNull()) + { + // Output the thumbnail itself + ObjGen thumb_og(thumb.getObjectID(), thumb.getGeneration()); + if (lc_thumbnail_private.count(thumb_og)) + { + lc_thumbnail_private.erase(thumb_og); + this->part9.push_back(thumb); + } + else + { + // No internal error this time...there's nothing to + // stop this object from having been referred to + // somewhere else outside of a page's /Thumb, and if + // it had been, there's nothing to prevent it from + // having been in some set other than + // lc_thumbnail_private. + } + std::set<ObjGen>& ogs = + this->obj_user_to_objects[ObjUser(ObjUser::ou_thumb, i)]; + for (std::set<ObjGen>::iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = *iter; + if (lc_thumbnail_private.count(og)) + { + lc_thumbnail_private.erase(og); + this->part9.push_back(objGenToIndirect(og)); + } + } + } + } + if (! lc_thumbnail_private.empty()) + { + throw QEXC::Internal( + "QPDF::calculateLinearizationData: lc_thumbnail_private " + "not empty after placing thumbnails"); + } + + // Place shared thumbnail objects + for (std::set<ObjGen>::iterator iter = lc_thumbnail_shared.begin(); + iter != lc_thumbnail_shared.end(); ++iter) + { + this->part9.push_back(objGenToIndirect(*iter)); + } + + // Place outlines unless in first page + if (! 
outlines_in_first_page) + { + pushOutlinesToPart(this->part9, lc_outlines, object_stream_data); + } + + // Place all remaining objects + for (std::set<ObjGen>::iterator iter = lc_other.begin(); + iter != lc_other.end(); ++iter) + { + this->part9.push_back(objGenToIndirect(*iter)); + } + + // Make sure we got everything exactly once. + + unsigned int num_placed = this->part4.size() + this->part6.size() + + this->part7.size() + this->part8.size() + this->part9.size(); + unsigned int num_wanted = this->object_to_obj_users.size(); + if (num_placed != num_wanted) + { + throw QEXC::Internal("QPDF::calculateLinearizationData: wrong " + "number of objects placed (num_placed = " + + QUtil::int_to_string(num_placed) + + "; number of objects: " + + QUtil::int_to_string(num_wanted) + ")"); + } + + // Calculate shared object hint table information including + // references to shared objects from page offset hint data. + + // The shared object hint table consists of all part 6 (whether + // shared or not) in order followed by all part 8 objects in + // order. Add the objects to shared object data keeping a map of + // object number to index. Then populate the shared object + // information for the pages. + + // Note that two objects never have the same object number, so we + // can map from object number only without regards to generation. + std::map<int, int> obj_to_index; + + this->c_shared_object_data.nshared_first_page = this->part6.size(); + this->c_shared_object_data.nshared_total = + this->c_shared_object_data.nshared_first_page + + this->part8.size(); + + std::vector<CHSharedObjectEntry>& shared = + this->c_shared_object_data.entries; + for (std::vector<QPDFObjectHandle>::iterator iter = this->part6.begin(); + iter != this->part6.end(); ++iter) + { + QPDFObjectHandle& oh = *iter; + int obj = oh.getObjectID(); + obj_to_index[obj] = shared.size(); + shared.push_back(CHSharedObjectEntry(obj)); + } + QTC::TC("qpdf", "QPDF lin part 8 empty", this->part8.empty() ? 1 : 0); + if (! 
this->part8.empty()) + { + this->c_shared_object_data.first_shared_obj = + this->part8[0].getObjectID(); + for (std::vector<QPDFObjectHandle>::iterator iter = + this->part8.begin(); + iter != this->part8.end(); ++iter) + { + QPDFObjectHandle& oh = *iter; + int obj = oh.getObjectID(); + obj_to_index[obj] = shared.size(); + shared.push_back(CHSharedObjectEntry(obj)); + } + } + assert(this->c_shared_object_data.nshared_total == + (int) this->c_shared_object_data.entries.size()); + + // Now compute the list of shared objects for each page after the + // first page. + + for (unsigned int i = 1; i < npages; ++i) + { + CHPageOffsetEntry& pe = this->c_page_offset_data.entries[i]; + ObjUser ou(ObjUser::ou_page, i); + assert(this->obj_user_to_objects.count(ou) > 0); + std::set<ObjGen> const& ogs = this->obj_user_to_objects[ou]; + for (std::set<ObjGen>::const_iterator iter = ogs.begin(); + iter != ogs.end(); ++iter) + { + ObjGen const& og = *iter; + if ((this->object_to_obj_users[og].size() > 1) && + (obj_to_index.count(og.obj) > 0)) + { + int idx = obj_to_index[og.obj]; + ++pe.nshared_objects; + pe.shared_identifiers.push_back(idx); + } + } + } +} + +void +QPDF::pushOutlinesToPart( + std::vector<QPDFObjectHandle>& part, + std::set<ObjGen>& lc_outlines, + std::map<int, int> const& object_stream_data) +{ + QPDFObjectHandle root = getRoot(); + QPDFObjectHandle outlines = root.getKey("/Outlines"); + if (outlines.isNull()) + { + return; + } + outlines = getUncompressedObject(outlines, object_stream_data); + ObjGen outlines_og(outlines.getObjectID(), outlines.getGeneration()); + QTC::TC("qpdf", "QPDF lin outlines in part", + ((&part == (&this->part6)) ? 0 + : (&part == (&this->part9)) ? 
1 + : 9999)); // can't happen + this->c_outline_data.first_object = outlines_og.obj; + this->c_outline_data.nobjects = 1; + lc_outlines.erase(outlines_og); + part.push_back(outlines); + for (std::set<ObjGen>::iterator iter = lc_outlines.begin(); + iter != lc_outlines.end(); ++iter) + { + part.push_back(objGenToIndirect(*iter)); + ++this->c_outline_data.nobjects; + } +} + +void +QPDF::getLinearizedParts( + std::map<int, int> const& object_stream_data, + std::vector<QPDFObjectHandle>& part4, + std::vector<QPDFObjectHandle>& part6, + std::vector<QPDFObjectHandle>& part7, + std::vector<QPDFObjectHandle>& part8, + std::vector<QPDFObjectHandle>& part9) +{ + calculateLinearizationData(object_stream_data); + part4 = this->part4; + part6 = this->part6; + part7 = this->part7; + part8 = this->part8; + part9 = this->part9; +} + +static inline int nbits(int val) +{ + return (val == 0 ? 0 : (1 + nbits(val >> 1))); +} + +int +QPDF::outputLengthNextN( + int in_object, int n, + std::map<int, size_t> const& lengths, + std::map<int, int> const& obj_renumber) +{ + // Figure out the length of a series of n consecutive objects in + // the output file starting with whatever object in_object from + // the input file mapped to. + + assert(obj_renumber.count(in_object) > 0); + int first = (*(obj_renumber.find(in_object))).second; + int length = 0; + for (int i = 0; i < n; ++i) + { + assert(lengths.count(first + i) > 0); + length += (*(lengths.find(first + i))).second; + } + return length; +} + +void +QPDF::calculateHPageOffset( + std::map<int, QPDFXRefEntry> const& xref, + std::map<int, size_t> const& lengths, + std::map<int, int> const& obj_renumber) +{ + // Page Offset Hint Table + + // We are purposely leaving some values set to their initial zero + // values. 
+ + std::vector<QPDFObjectHandle> const& pages = getAllPages(); + unsigned int npages = pages.size(); + CHPageOffset& cph = this->c_page_offset_data; + std::vector<CHPageOffsetEntry>& cphe = cph.entries; + + // Calculate minimum and maximum values for number of objects per + // page and page length. + + int min_nobjects = cphe[0].nobjects; + int max_nobjects = min_nobjects; + int min_length = outputLengthNextN( + pages[0].getObjectID(), min_nobjects, lengths, obj_renumber); + int max_length = min_length; + int max_shared = cphe[0].nshared_objects; + + HPageOffset& ph = this->page_offset_hints; + std::vector<HPageOffsetEntry>& phe = ph.entries; + phe = std::vector<HPageOffsetEntry>(npages); + + for (unsigned int i = 0; i < npages; ++i) + { + // Calculate values for each page, assigning full values to + // the delta items. They will be adjusted later. + + // Repeat calculations for page 0 so we can assign to phe[i] + // without duplicating those assignments. + + int nobjects = cphe[i].nobjects; + int length = outputLengthNextN( + pages[i].getObjectID(), nobjects, lengths, obj_renumber); + int nshared = cphe[i].nshared_objects; + + min_nobjects = std::min(min_nobjects, nobjects); + max_nobjects = std::max(max_nobjects, nobjects); + min_length = std::min(min_length, length); + max_length = std::max(max_length, length); + max_shared = std::max(max_shared, nshared); + + phe[i].delta_nobjects = nobjects; + phe[i].delta_page_length = length; + phe[i].nshared_objects = nshared; + } + + ph.min_nobjects = min_nobjects; + int in_page0_id = pages[0].getObjectID(); + int out_page0_id = (*(obj_renumber.find(in_page0_id))).second; + ph.first_page_offset = (*(xref.find(out_page0_id))).second.getOffset(); + ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects); + ph.min_page_length = min_length; + ph.nbits_delta_page_length = nbits(max_length - min_length); + ph.nbits_nshared_objects = nbits(max_shared); + ph.nbits_shared_identifier = + 
nbits(this->c_shared_object_data.nshared_total); + ph.shared_denominator = 4; // doesn't matter + + // It isn't clear how to compute content offset and content + // length. Since we are not interleaving page objects with the + // content stream, we'll use the same values for content length as + // page length. We will use 0 as content offset because this is + // what Adobe does (implementation note 127) and pdlin as well. + ph.nbits_delta_content_length = ph.nbits_delta_page_length; + ph.min_content_length = ph.min_page_length; + + for (unsigned int i = 0; i < npages; ++i) + { + // Adjust delta entries + assert(phe[i].delta_nobjects >= min_nobjects); + assert(phe[i].delta_page_length >= min_length); + phe[i].delta_nobjects -= min_nobjects; + phe[i].delta_page_length -= min_length; + phe[i].delta_content_length = phe[i].delta_page_length; + + for (int j = 0; j < cphe[i].nshared_objects; ++j) + { + phe[i].shared_identifiers.push_back( + cphe[i].shared_identifiers[j]); + phe[i].shared_numerators.push_back(0); + } + } +} + +void +QPDF::calculateHSharedObject( + std::map<int, QPDFXRefEntry> const& xref, + std::map<int, size_t> const& lengths, + std::map<int, int> const& obj_renumber) +{ + CHSharedObject& cso = this->c_shared_object_data; + std::vector<CHSharedObjectEntry>& csoe = cso.entries; + HSharedObject& so = this->shared_object_hints; + std::vector<HSharedObjectEntry>& soe = so.entries; + soe = std::vector<HSharedObjectEntry>(cso.nshared_total); + + int min_length = outputLengthNextN( + csoe[0].object, 1, lengths, obj_renumber); + int max_length = min_length; + + for (int i = 0; i < cso.nshared_total; ++i) + { + // Assign absolute numbers to deltas; adjust later + int length = outputLengthNextN( + csoe[i].object, 1, lengths, obj_renumber); + min_length = std::min(min_length, length); + max_length = std::max(max_length, length); + soe[i].delta_group_length = length; + } + + so.nshared_total = cso.nshared_total; + so.nshared_first_page = cso.nshared_first_page; + if 
(so.nshared_total > so.nshared_first_page) + { + so.first_shared_obj = + (*(obj_renumber.find(cso.first_shared_obj))).second; + so.first_shared_offset = + (*(xref.find(so.first_shared_obj))).second.getOffset(); + } + so.min_group_length = min_length; + so.nbits_delta_group_length = nbits(max_length - min_length); + + for (int i = 0; i < cso.nshared_total; ++i) + { + // Adjust deltas + assert(soe[i].delta_group_length >= min_length); + soe[i].delta_group_length -= min_length; + } +} + +void +QPDF::calculateHOutline( + std::map<int, QPDFXRefEntry> const& xref, + std::map<int, size_t> const& lengths, + std::map<int, int> const& obj_renumber) +{ + HGeneric& cho = this->c_outline_data; + + if (cho.nobjects == 0) + { + return; + } + + HGeneric& ho = this->outline_hints; + + ho.first_object = + (*(obj_renumber.find(cho.first_object))).second; + ho.first_object_offset = + (*(xref.find(ho.first_object))).second.getOffset(); + ho.nobjects = cho.nobjects; + ho.group_length = outputLengthNextN( + cho.first_object, ho.nobjects, lengths, obj_renumber); +} + +template <class T> +static void +write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec, + int bits, int T::*field) +{ + // nitems times, write bits bits from the given field of the ith + // vector to the given bit writer. + + for (int i = 0; i < nitems; ++i) + { + w.writeBits(vec[i].*field, bits); + } + // The PDF spec says that each hint table starts at a byte + // boundary. Each "row" actually must start on a byte boundary. + w.flush(); +} + +template <class T> +static void +write_vector_vector(BitWriter& w, + int nitems1, std::vector<T>& vec1, int T::*nitems2, + int bits, std::vector<int> T::*vec2) +{ + // nitems1 times, write nitems2 (from the ith element of vec1) items + // from the vec2 vector field of the ith item of vec1. 
+ for (int i1 = 0; i1 < nitems1; ++i1) + { + for (int i2 = 0; i2 < vec1[i1].*nitems2; ++i2) + { + w.writeBits((vec1[i1].*vec2)[i2], bits); + } + } + w.flush(); +} + + +void +QPDF::writeHPageOffset(BitWriter& w) +{ + HPageOffset& t = this->page_offset_hints; + + w.writeBits(t.min_nobjects, 32); // 1 + w.writeBits(t.first_page_offset, 32); // 2 + w.writeBits(t.nbits_delta_nobjects, 16); // 3 + w.writeBits(t.min_page_length, 32); // 4 + w.writeBits(t.nbits_delta_page_length, 16); // 5 + w.writeBits(t.min_content_offset, 32); // 6 + w.writeBits(t.nbits_delta_content_offset, 16); // 7 + w.writeBits(t.min_content_length, 32); // 8 + w.writeBits(t.nbits_delta_content_length, 16); // 9 + w.writeBits(t.nbits_nshared_objects, 16); // 10 + w.writeBits(t.nbits_shared_identifier, 16); // 11 + w.writeBits(t.nbits_shared_numerator, 16); // 12 + w.writeBits(t.shared_denominator, 16); // 13 + + unsigned int nitems = getAllPages().size(); + std::vector<HPageOffsetEntry>& entries = t.entries; + + write_vector_int(w, nitems, entries, + t.nbits_delta_nobjects, + &HPageOffsetEntry::delta_nobjects); + write_vector_int(w, nitems, entries, + t.nbits_delta_page_length, + &HPageOffsetEntry::delta_page_length); + write_vector_int(w, nitems, entries, + t.nbits_nshared_objects, + &HPageOffsetEntry::nshared_objects); + write_vector_vector(w, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_identifier, + &HPageOffsetEntry::shared_identifiers); + write_vector_vector(w, nitems, entries, + &HPageOffsetEntry::nshared_objects, + t.nbits_shared_numerator, + &HPageOffsetEntry::shared_numerators); + write_vector_int(w, nitems, entries, + t.nbits_delta_content_offset, + &HPageOffsetEntry::delta_content_offset); + write_vector_int(w, nitems, entries, + t.nbits_delta_content_length, + &HPageOffsetEntry::delta_content_length); +} + +void +QPDF::writeHSharedObject(BitWriter& w) +{ + HSharedObject& t = this->shared_object_hints; + + w.writeBits(t.first_shared_obj, 32); // 1 + 
w.writeBits(t.first_shared_offset, 32); // 2 + w.writeBits(t.nshared_first_page, 32); // 3 + w.writeBits(t.nshared_total, 32); // 4 + w.writeBits(t.nbits_nobjects, 16); // 5 + w.writeBits(t.min_group_length, 32); // 6 + w.writeBits(t.nbits_delta_group_length, 16); // 7 + + QTC::TC("qpdf", "QPDF lin write nshared_total > nshared_first_page", + (t.nshared_total > t.nshared_first_page) ? 1 : 0); + + int nitems = t.nshared_total; + std::vector<HSharedObjectEntry>& entries = t.entries; + + write_vector_int(w, nitems, entries, + t.nbits_delta_group_length, + &HSharedObjectEntry::delta_group_length); + write_vector_int(w, nitems, entries, + 1, &HSharedObjectEntry::signature_present); + for (int i = 0; i < nitems; ++i) + { + // If signature were present, we'd have to write a 128-bit hash. + assert(entries[i].signature_present == 0); + } + write_vector_int(w, nitems, entries, + t.nbits_nobjects, + &HSharedObjectEntry::nobjects_minus_one); +} + +void +QPDF::writeHGeneric(BitWriter& w, HGeneric& t) +{ + w.writeBits(t.first_object, 32); // 1 + w.writeBits(t.first_object_offset, 32); // 2 + w.writeBits(t.nobjects, 32); // 3 + w.writeBits(t.group_length, 32); // 4 +} + +void +QPDF::generateHintStream(std::map<int, QPDFXRefEntry> const& xref, + std::map<int, size_t> const& lengths, + std::map<int, int> const& obj_renumber, + PointerHolder<Buffer>& hint_buffer, + int& S, int& O) +{ + // Populate actual hint table values + calculateHPageOffset(xref, lengths, obj_renumber); + calculateHSharedObject(xref, lengths, obj_renumber); + calculateHOutline(xref, lengths, obj_renumber); + + // Write the hint stream itself into a compressed memory buffer. + // Write through a counter so we can get offsets. 
+ Pl_Buffer hint_stream("hint stream"); + Pl_Flate f("compress hint stream", &hint_stream, Pl_Flate::a_deflate); + Pl_Count c("count", &f); + BitWriter w(&c); + + writeHPageOffset(w); + S = c.getCount(); + writeHSharedObject(w); + O = 0; + if (this->outline_hints.nobjects > 0) + { + O = c.getCount(); + writeHGeneric(w, this->outline_hints); + } + c.finish(); + + hint_buffer = hint_stream.getBuffer(); +} diff --git a/libqpdf/QPDF_optimization.cc b/libqpdf/QPDF_optimization.cc new file mode 100644 index 00000000..8797445c --- /dev/null +++ b/libqpdf/QPDF_optimization.cc @@ -0,0 +1,490 @@ +// See doc/optimization. + +#include <qpdf/QPDF.hh> + +#include <qpdf/QTC.hh> +#include <qpdf/QPDFExc.hh> +#include <qpdf/QPDF_Dictionary.hh> +#include <qpdf/QPDF_Array.hh> +#include <assert.h> + +QPDF::ObjUser::ObjUser() : + ou_type(ou_bad), + pageno(0) +{ +} + +QPDF::ObjUser::ObjUser(user_e type) : + ou_type(type), + pageno(0) +{ + assert(type == ou_root); +} + +QPDF::ObjUser::ObjUser(user_e type, int pageno) : + ou_type(type), + pageno(pageno) +{ + assert((type == ou_page) || (type == ou_thumb)); +} + +QPDF::ObjUser::ObjUser(user_e type, std::string const& key) : + ou_type(type), + pageno(0), + key(key) +{ + assert((type == ou_trailer_key) || (type == ou_root_key)); +} + +bool +QPDF::ObjUser::operator<(ObjUser const& rhs) const +{ + if (this->ou_type < rhs.ou_type) + { + return true; + } + else if (this->ou_type == rhs.ou_type) + { + if (this->pageno < rhs.pageno) + { + return true; + } + else if (this->pageno == rhs.pageno) + { + return (this->key < rhs.key); + } + } + + return false; +} + +void +QPDF::flattenScalarReferences() +{ + // Do a traversal of the entire PDF file structure replacing all + // indirect objects that are not arrays, streams, or dictionaries + // with direct objects. + + std::list<QPDFObjectHandle> queue; + queue.push_back(this->trailer); + std::set<ObjGen> visited; + + while (! 
queue.empty()) + { + QPDFObjectHandle node = queue.front(); + queue.pop_front(); + if (node.isIndirect()) + { + if (node.isScalar()) + { + throw QEXC::Internal( + "flattenScalarReferences landed at indirect scalar"); + } + ObjGen og(node.getObjectID(), node.getGeneration()); + if (visited.count(og) > 0) + { + continue; + } + visited.insert(og); + } + + if (node.isArray()) + { + int nitems = node.getArrayNItems(); + for (int i = 0; i < nitems; ++i) + { + QPDFObjectHandle oh = node.getArrayItem(i); + if (oh.isScalar()) + { + QTC::TC("qpdf", "QPDF opt flatten array scalar"); + oh.makeDirect(); + node.setArrayItem(i, oh); + } + else + { + queue.push_back(oh); + } + } + } + else if (node.isDictionary() || node.isStream()) + { + QPDFObjectHandle dict = node; + if (node.isStream()) + { + dict = node.getDict(); + } + std::set<std::string> keys = dict.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + std::string const& key = *iter; + QPDFObjectHandle oh = dict.getKey(key); + if (oh.isNull()) + { + // QPDF_Dictionary.getKeys() never returns null + // keys. + throw QEXC::Internal("dictionary with null key found"); + } + else if (oh.isScalar()) + { + QTC::TC("qpdf", "QPDF opt flatten dict scalar"); + oh.makeDirect(); + dict.replaceKey(key, oh); + } + else + { + queue.push_back(oh); + } + } + } + } +} + +void +QPDF::optimize(std::map<int, int> const& object_stream_data, + bool allow_changes) +{ + if (! this->obj_user_to_objects.empty()) + { + // already optimized + return; + } + + // Traverse pages tree pushing all inherited resources down to the + // page level. + + // key_ancestors is a mapping of page attribute keys to a stack of + // Pages nodes that contain values for them. pageno is the + // current page sequence number numbered from 0. 
std::map<std::string, std::vector<QPDFObjectHandle> > key_ancestors; + int pageno = 0; + optimizePagesTree(this->trailer.getKey("/Root").getKey("/Pages"), + key_ancestors, pageno, allow_changes); + assert(key_ancestors.empty()); + + // Traverse document-level items + std::set<std::string> keys = this->trailer.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + std::string const& key = *iter; + if (key == "/Root") + { + // handled separately + } + else + { + updateObjectMaps(ObjUser(ObjUser::ou_trailer_key, key), + this->trailer.getKey(key)); + } + } + + QPDFObjectHandle root = getRoot(); + keys = root.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + // Technically, /I keys from /Thread dictionaries are supposed + // to be handled separately, but we are going to disregard + // that specification for now. There is loads of evidence + // that pdlin and Acrobat both disregard things like this from + // time to time, so this is almost certain not to cause any + // problems. + + std::string const& key = *iter; + updateObjectMaps(ObjUser(ObjUser::ou_root_key, key), + root.getKey(key)); + } + + ObjUser root_ou = ObjUser(ObjUser::ou_root); + ObjGen root_og = ObjGen(root.getObjectID(), root.getGeneration()); + obj_user_to_objects[root_ou].insert(root_og); + object_to_obj_users[root_og].insert(root_ou); + + filterCompressedObjects(object_stream_data); +} + +void +QPDF::optimizePagesTree( + QPDFObjectHandle cur_pages, + std::map<std::string, std::vector<QPDFObjectHandle> >& key_ancestors, + int& pageno, bool allow_changes) +{ + // Extract the underlying dictionary object + std::string type = cur_pages.getKey("/Type").getName(); + + if (type == "/Pages") + { + // Make a list of inheritable keys. Any key other than /Type, + // /Parent, /Kids, or /Count is an inheritable attribute. 
Push + // this object onto the stack of pages nodes that have values + // for this attribute. + + std::set<std::string> inheritable_keys; + std::set<std::string> keys = cur_pages.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + std::string const& key = *iter; + if (! ((key == "/Type") || (key == "/Parent") || + (key == "/Kids") || (key == "/Count"))) + { + if (! allow_changes) + { + throw QPDFExc(this->file.getName() + + ": optimize detected an " + "inheritable resource"); + } + + // This is an inheritable resource + inheritable_keys.insert(key); + QPDFObjectHandle oh = cur_pages.getKey(key); + QTC::TC("qpdf", "QPDF opt direct pages resource", + oh.isIndirect() ? 0 : 1); + if (! oh.isIndirect()) + { + if (! oh.isScalar()) + { + // Replace shared direct object non-scalar + // resources with indirect objects to avoid + // copying large structures around. + cur_pages.replaceKey(key, makeIndirectObject(oh)); + oh = cur_pages.getKey(key); + } + else + { + // Don't defeat flattenScalarReferences which + // would have already been called by this + // time. + QTC::TC("qpdf", "QPDF opt inherited scalar"); + } + } + key_ancestors[key].push_back(oh); + if (key_ancestors[key].size() > 1) + { + QTC::TC("qpdf", "QPDF opt key ancestors depth > 1"); + } + // Remove this resource from this node. It will be + // reattached at the page level. + cur_pages.removeKey(key); + } + } + + // Visit descendant nodes. + QPDFObjectHandle kids = cur_pages.getKey("/Kids"); + int n = kids.getArrayNItems(); + for (int i = 0; i < n; ++i) + { + optimizePagesTree(kids.getArrayItem(i), key_ancestors, pageno, + allow_changes); + } + + // For each inheritable key, pop the stack. If the stack + // becomes empty, remove it from the map. That way, we + // maintain the invariant that the list of keys in + // key_ancestors is exactly those keys for which inheritable + // attributes are available. + + if (! 
inheritable_keys.empty()) + { + QTC::TC("qpdf", "QPDF opt inheritable keys"); + for (std::set<std::string>::iterator iter = + inheritable_keys.begin(); + iter != inheritable_keys.end(); ++iter) + { + std::string const& key = (*iter); + key_ancestors[key].pop_back(); + if (key_ancestors[key].empty()) + { + QTC::TC("qpdf", "QPDF opt erase empty key ancestor"); + key_ancestors.erase(key); + } + } + } + else + { + QTC::TC("qpdf", "QPDF opt no inheritable keys"); + } + } + else if (type == "/Page") + { + // Add all available inheritable attributes not present in + // this object to this object. + for (std::map<std::string, std::vector<QPDFObjectHandle> >::iterator + iter = key_ancestors.begin(); + iter != key_ancestors.end(); ++iter) + { + std::string const& key = (*iter).first; + if (! cur_pages.hasKey(key)) + { + QTC::TC("qpdf", "QPDF opt resource inherited"); + cur_pages.replaceKey(key, (*iter).second.back()); + } + else + { + QTC::TC("qpdf", "QPDF opt page resource hides ancestor"); + } + } + + // Traverse from this point, updating the mappings of object + // users to objects and objects to object users. + + updateObjectMaps(ObjUser(ObjUser::ou_page, pageno), cur_pages); + + // Increment pageno so that its value will be correct for the + // next page. + ++pageno; + } + else + { + throw QPDFExc(this->file.getName() + ": invalid Type in page tree"); + } +} + +void +QPDF::updateObjectMaps(ObjUser const& ou, QPDFObjectHandle oh) +{ + std::set<ObjGen> visited; + updateObjectMapsInternal(ou, oh, visited, true); +} + +void +QPDF::updateObjectMapsInternal(ObjUser const& ou, QPDFObjectHandle oh, + std::set<ObjGen>& visited, bool top) +{ + // Traverse the object tree from this point taking care to avoid + // crossing page boundaries. + + bool is_page_node = false; + + if (oh.isDictionary() && oh.hasKey("/Type")) + { + std::string type = oh.getKey("/Type").getName(); + if (type == "/Page") + { + is_page_node = true; + if (! 
top) + { + return; + } + } + } + + if (oh.isIndirect()) + { + ObjGen og(oh.getObjectID(), oh.getGeneration()); + if (visited.count(og)) + { + QTC::TC("qpdf", "QPDF opt loop detected"); + return; + } + this->obj_user_to_objects[ou].insert(og); + this->object_to_obj_users[og].insert(ou); + visited.insert(og); + } + + if (oh.isArray()) + { + int n = oh.getArrayNItems(); + for (int i = 0; i < n; ++i) + { + updateObjectMapsInternal(ou, oh.getArrayItem(i), visited, false); + } + } + else if (oh.isDictionary() || oh.isStream()) + { + QPDFObjectHandle dict = oh; + if (oh.isStream()) + { + dict = oh.getDict(); + } + + std::set<std::string> keys = dict.getKeys(); + for (std::set<std::string>::iterator iter = keys.begin(); + iter != keys.end(); ++iter) + { + std::string const& key = *iter; + if (is_page_node && (key == "/Thumb")) + { + // Traverse page thumbnail dictionaries as a special + // case. + updateObjectMaps(ObjUser(ObjUser::ou_thumb, ou.pageno), + dict.getKey(key)); + } + else if (is_page_node && (key == "/Parent")) + { + // Don't traverse back up the page tree + } + else + { + updateObjectMapsInternal(ou, dict.getKey(key), + visited, false); + } + } + } +} + +void +QPDF::filterCompressedObjects(std::map<int, int> const& object_stream_data) +{ + if (object_stream_data.empty()) + { + return; + } + + // Transform object_to_obj_users and obj_user_to_objects so that + // they refer only to uncompressed objects. If something is a + // user of a compressed object, then it is really a user of the + // object stream that contains it. 
+ + std::map<ObjUser, std::set<ObjGen> > t_obj_user_to_objects; + std::map<ObjGen, std::set<ObjUser> > t_object_to_obj_users; + + for (std::map<ObjUser, std::set<ObjGen> >::iterator i1 = + this->obj_user_to_objects.begin(); + i1 != this->obj_user_to_objects.end(); ++i1) + { + ObjUser const& ou = (*i1).first; + std::set<ObjGen> const& objects = (*i1).second; + for (std::set<ObjGen>::const_iterator i2 = objects.begin(); + i2 != objects.end(); ++i2) + { + ObjGen const& og = (*i2); + std::map<int, int>::const_iterator i3 = + object_stream_data.find(og.obj); + if (i3 == object_stream_data.end()) + { + t_obj_user_to_objects[ou].insert(og); + } + else + { + t_obj_user_to_objects[ou].insert(ObjGen((*i3).second, 0)); + } + } + } + + for (std::map<ObjGen, std::set<ObjUser> >::iterator i1 = + this->object_to_obj_users.begin(); + i1 != this->object_to_obj_users.end(); ++i1) + { + ObjGen const& og = (*i1).first; + std::set<ObjUser> const& objusers = (*i1).second; + for (std::set<ObjUser>::const_iterator i2 = objusers.begin(); + i2 != objusers.end(); ++i2) + { + ObjUser const& ou = (*i2); + std::map<int, int>::const_iterator i3 = + object_stream_data.find(og.obj); + if (i3 == object_stream_data.end()) + { + t_object_to_obj_users[og].insert(ou); + } + else + { + t_object_to_obj_users[ObjGen((*i3).second, 0)].insert(ou); + } + } + } + + this->obj_user_to_objects = t_obj_user_to_objects; + this->object_to_obj_users = t_object_to_obj_users; +} diff --git a/libqpdf/QTC.cc b/libqpdf/QTC.cc new file mode 100644 index 00000000..b8328b2e --- /dev/null +++ b/libqpdf/QTC.cc @@ -0,0 +1,46 @@ + +#include <qpdf/QTC.hh> + +#include <set> +#include <stdio.h> +#include <qpdf/QUtil.hh> + +static bool tc_active(char const* const scope) +{ + std::string value; + return (QUtil::get_env("TC_SCOPE", &value) && (value == scope)); +} + +void QTC::TC(char const* const scope, char const* const ccase, int n) +{ + static std::set<std::pair<std::string, int> > cache; + + if (! 
tc_active(scope)) + { + return; + } + + std::string filename; +#ifdef _WIN32 +# define TC_ENV "TC_WIN_FILENAME" +#else +# define TC_ENV "TC_FILENAME" +#endif + if (! QUtil::get_env(TC_ENV, &filename)) + { + return; + } +#undef TC_ENV + + if (cache.count(std::make_pair(ccase, n))) + { + return; + } + cache.insert(std::make_pair(ccase, n)); + + FILE* tc = + QUtil::fopen_wrapper("open test coverage file (" + filename + ")", + fopen(filename.c_str(), "ab")); + fprintf(tc, "%s %d\n", ccase, n); + fclose(tc); +} diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc new file mode 100644 index 00000000..c0de95f7 --- /dev/null +++ b/libqpdf/QUtil.cc @@ -0,0 +1,198 @@ + +#include <qpdf/QUtil.hh> +#include <stdio.h> +#include <errno.h> +#include <ctype.h> +#include <stdlib.h> +#ifdef _WIN32 +#include <Windows.h> +#include <direct.h> +#else +#include <unistd.h> +#endif + +std::string +QUtil::int_to_string(int num, int fullpad) +{ + // This routine will need to be recompiled if an int can be longer than + // 49 digits. + char t[50]; + + // -2 or -1 to leave space for the possible negative sign and for NUL... + if (abs(fullpad) > (int)sizeof(t) - ((num < 0)?2:1)) + { + throw QEXC::Internal("Util::int_to_string has been called with " + "a padding value greater than its internal " + "limit"); + } + + if (fullpad) + { + sprintf(t, "%0*d", fullpad, num); + } + else + { + sprintf(t, "%d", num); + } + + return std::string(t); +} + +std::string +QUtil::double_to_string(double num, int decimal_places) +{ + // This routine will need to be recompiled if a double can be longer than + // 99 digits. + char t[100]; + + std::string lhs = int_to_string((int)num); + + // lhs.length() gives us the length of the part on the right hand + // side of the dot + 1 for the dot + decimal_places: total size of + // the required string. -1 on the sizeof side to allow for NUL at + // the end. + // + // If decimal_places <= 0, it is as if no precision was provided + // so trust the buffer is big enough. 
The following test will + // always pass in those cases. + if (decimal_places + 1 + (int)lhs.length() > (int)sizeof(t) - 1) + { + throw QEXC::Internal("Util::double_to_string has been called with " + "a number and a decimal places specification " + "that would break an internal limit"); + } + + if (decimal_places) + { + sprintf(t, "%.*f", decimal_places, num); + } + else + { + sprintf(t, "%f", num); + } + return std::string(t); +} + +int +QUtil::os_wrapper(std::string const& description, int status) throw (QEXC::System) +{ + if (status == -1) + { + throw QEXC::System(description, errno); + } + return status; +} + +FILE* +QUtil::fopen_wrapper(std::string const& description, FILE* f) throw (QEXC::System) +{ + if (f == 0) + { + throw QEXC::System(description, errno); + } + return f; +} + +char* +QUtil::copy_string(std::string const& str) +{ + char* result = new char[str.length() + 1]; + // Use memcpy in case string contains nulls + result[str.length()] = '\0'; + memcpy(result, str.c_str(), str.length()); + return result; +} + +bool +QUtil::get_env(std::string const& var, std::string* value) +{ + // This was basically ripped out of wxWindows. +#ifdef _WIN32 + // first get the size of the buffer + DWORD len = ::GetEnvironmentVariable(var.c_str(), NULL, 0); + if (len == 0) + { + // this means that there is no such variable + return false; + } + + if (value) + { + char* t = new char[len + 1]; + ::GetEnvironmentVariable(var.c_str(), t, len); + *value = t; + delete [] t; + } + + return true; +#else + char* p = getenv(var.c_str()); + if (p == 0) + { + return false; + } + if (value) + { + *value = p; + } + + return true; +#endif +} + +std::string +QUtil::toUTF8(unsigned long uval) +{ + std::string result; + + // A UTF-8 encoding of a Unicode value is a single byte for + // Unicode values <= 127. 
For larger values, the first byte of + // the UTF-8 encoding has '1' as each of its n highest bits and + // '0' for its (n+1)th highest bit where n is the total number of + // bytes required. Subsequent bytes start with '10' and have the + // remaining 6 bits free for encoding. For example, an 11-bit + // unicode value can be stored in two bytes where the first is + // 110zzzzz, the second is 10zzzzzz, and the z's represent the + // remaining bits. + + if (uval > 0x7fffffff) + { + throw QEXC::General("bounds error in QUtil::toUTF8"); + } + else if (uval < 128) + { + result += (char)(uval); + } + else + { + unsigned char bytes[7]; + bytes[6] = '\0'; + unsigned char* cur_byte = &bytes[5]; + + // maximum value that will fit in the current number of bytes + unsigned char maxval = 0x3f; // six bits + + while (uval > maxval) + { + // Assign low six bits plus 10000000 to lowest unused + // byte position, then shift + *cur_byte = (unsigned char) (0x80 + (uval & 0x3f)); + uval >>= 6; + // Maximum that will fit in high byte now shrinks by one bit + maxval >>= 1; + // Slide to the left one byte + --cur_byte; + if (cur_byte < bytes) + { + throw QEXC::Internal("QUtil::toUTF8: overflow error"); + } + } + // If maxval is k bits long, the high (7 - k) bits of the + // resulting byte must be high. 
+ *cur_byte = (unsigned char)((0xff - (1 + (maxval << 1))) + uval); + + result += (char*)cur_byte; + } + + return result; +} diff --git a/libqpdf/RC4.cc b/libqpdf/RC4.cc new file mode 100644 index 00000000..74b538b5 --- /dev/null +++ b/libqpdf/RC4.cc @@ -0,0 +1,56 @@ + +#include <qpdf/RC4.hh> + +#include <string.h> + +static void swap_byte(unsigned char &a, unsigned char &b) +{ + unsigned char t; + + t = a; + a = b; + b = t; +} + +RC4::RC4(unsigned char const* key_data, int key_len) +{ + if (key_len == -1) + { + key_len = strlen((char*)key_data); + } + + for (int i = 0; i < 256; ++i) + { + key.state[i] = i; + } + key.x = 0; + key.y = 0; + + int i1 = 0; + int i2 = 0; + for (int i = 0; i < 256; ++i) + { + i2 = (key_data[i1] + key.state[i] + i2) % 256; + swap_byte(key.state[i], key.state[i2]); + i1 = (i1 + 1) % key_len; + } +} + +void +RC4::process(unsigned char *in_data, int len, unsigned char* out_data) +{ + if (out_data == 0) + { + // Convert in place + out_data = in_data; + } + + for (int i = 0; i < len; ++i) + { + key.x = (key.x + 1) % 256; + key.y = (key.state[key.x] + key.y) % 256; + swap_byte(key.state[key.x], key.state[key.y]); + int xor_index = (key.state[key.x] + key.state[key.y]) % 256; + out_data[i] = in_data[i] ^ key.state[xor_index]; + } +} diff --git a/libqpdf/bits.icc b/libqpdf/bits.icc new file mode 100644 index 00000000..465bf5b9 --- /dev/null +++ b/libqpdf/bits.icc @@ -0,0 +1,149 @@ + +#ifndef __BITS_CC__ +#define __BITS_CC__ + +#include <algorithm> +#include <qpdf/QTC.hh> +#include <qpdf/QEXC.hh> +#include <qpdf/Pipeline.hh> + +// These functions may be run at places where the function call +// overhead from test coverage testing would be too high. Therefore, +// we make the test coverage cases conditional upon a preprocessor +// symbol. BitStream.cc includes this file without defining the +// symbol, and the specially designed test code that fully exercises +// this code includes with the symbol defined. 
+ +#ifdef BITS_READ +static unsigned long +read_bits(unsigned char const*& p, unsigned int& bit_offset, + unsigned int& bits_available, unsigned int bits_wanted) +{ + // View p as a stream of bits: + + // 76543210 76543210 .... + + // bit_offset is the bit number within the first byte that marks + // the first bit that we would read. + + if (bits_wanted > bits_available) + { + throw QEXC::General("overflow reading bit stream"); + } + if (bits_wanted > 32) + { + throw QEXC::Internal("read_bits: too many bits requested"); + } + + unsigned long result = 0; +#ifdef BITS_TESTING + if (bits_wanted == 0) + { + QTC::TC("libtests", "bits zero bits wanted"); + } +#endif + while (bits_wanted > 0) + { + // Grab bits from the first byte clearing anything before + // bit_offset. + unsigned char byte = *p & ((1 << (bit_offset + 1)) - 1); + + // There are bit_offset + 1 bits available in the first byte. + unsigned int to_copy = std::min(bits_wanted, bit_offset + 1); + unsigned int leftover = (bit_offset + 1) - to_copy; + +#ifdef BITS_TESTING + QTC::TC("libtests", "bits bit_offset", + ((bit_offset == 0) ? 0 : + (bit_offset == 7) ? 1 : + 2)); + QTC::TC("libtests", "bits leftover", (leftover > 0) ? 1 : 0); +#endif + + // Right shift so that all the bits we want are right justified. + byte >>= leftover; + + // Copy the bits into result + result <<= to_copy; + result |= byte; + + // Update pointers + if (leftover) + { + bit_offset = leftover - 1; + } + else + { + bit_offset = 7; + ++p; + } + bits_wanted -= to_copy; + bits_available -= to_copy; + +#ifdef BITS_TESTING + QTC::TC("libtests", "bits iterations", + ((bits_wanted > 8) ? 0 : + (bits_wanted > 0) ? 
1 : + 2)); +#endif + } + + return result; +} +#endif + +#ifdef BITS_WRITE +static void +write_bits(unsigned char& ch, unsigned int& bit_offset, + unsigned long val, unsigned bits, Pipeline* pipeline) +{ + if (bits > 32) + { + throw QEXC::Internal("write_bits: too many bits requested"); + } + + // bit_offset + 1 is the number of bits left in ch +#ifdef BITS_TESTING + if (bits == 0) + { + QTC::TC("libtests", "bits write zero bits"); + } +#endif + while (bits > 0) + { + int bits_to_write = std::min(bits, bit_offset + 1); + unsigned char newval = + (val >> (bits - bits_to_write)) & ((1 << bits_to_write) - 1); + int bits_left_in_ch = bit_offset + 1 - bits_to_write; + newval <<= bits_left_in_ch; + ch |= newval; + if (bits_left_in_ch == 0) + { +#ifdef BITS_TESTING + QTC::TC("libtests", "bits write pipeline"); +#endif + pipeline->write(&ch, 1); + bit_offset = 7; + ch = 0; + } + else + { +#ifdef BITS_TESTING + QTC::TC("libtests", "bits write leftover"); +#endif + bit_offset -= bits_to_write; + } + bits -= bits_to_write; +#ifdef BITS_TESTING + QTC::TC("libtests", "bits write iterations", + ((bits > 8) ? 0 : + (bits > 0) ? 
1 : + 2)); +#endif + } + +} +#endif + + +#endif // __BITS_CC__ diff --git a/libqpdf/build.mk b/libqpdf/build.mk new file mode 100644 index 00000000..9733cb9f --- /dev/null +++ b/libqpdf/build.mk @@ -0,0 +1,73 @@ +TARGETS_libqpdf = \ + libqpdf/$(OUTPUT_DIR)/libqpdf.la + +INCLUDES_libqpdf = include libqpdf + +SRCS_libqpdf = \ + libqpdf/BitStream.cc \ + libqpdf/BitWriter.cc \ + libqpdf/Buffer.cc \ + libqpdf/MD5.cc \ + libqpdf/PCRE.cc \ + libqpdf/Pipeline.cc \ + libqpdf/Pl_ASCII85Decoder.cc \ + libqpdf/Pl_ASCIIHexDecoder.cc \ + libqpdf/Pl_Buffer.cc \ + libqpdf/Pl_Count.cc \ + libqpdf/Pl_Discard.cc \ + libqpdf/Pl_Flate.cc \ + libqpdf/Pl_LZWDecoder.cc \ + libqpdf/Pl_MD5.cc \ + libqpdf/Pl_PNGFilter.cc \ + libqpdf/Pl_QPDFTokenizer.cc \ + libqpdf/Pl_RC4.cc \ + libqpdf/Pl_StdioFile.cc \ + libqpdf/QEXC.cc \ + libqpdf/QPDF.cc \ + libqpdf/QPDFExc.cc \ + libqpdf/QPDFObject.cc \ + libqpdf/QPDFObjectHandle.cc \ + libqpdf/QPDFTokenizer.cc \ + libqpdf/QPDFWriter.cc \ + libqpdf/QPDFXRefEntry.cc \ + libqpdf/QPDF_Array.cc \ + libqpdf/QPDF_Bool.cc \ + libqpdf/QPDF_Dictionary.cc \ + libqpdf/QPDF_Integer.cc \ + libqpdf/QPDF_Name.cc \ + libqpdf/QPDF_Null.cc \ + libqpdf/QPDF_Real.cc \ + libqpdf/QPDF_Stream.cc \ + libqpdf/QPDF_String.cc \ + libqpdf/QPDF_encryption.cc \ + libqpdf/QPDF_linearization.cc \ + libqpdf/QPDF_optimization.cc \ + libqpdf/QTC.cc \ + libqpdf/QUtil.cc \ + libqpdf/RC4.cc + +# ----- + +OBJS_libqpdf = $(call src_to_lobj,$(SRCS_libqpdf)) + +ifeq ($(GENDEPS),1) +-include $(call lobj_to_dep,$(OBJS_libqpdf)) +endif + +$(OBJS_libqpdf): libqpdf/$(OUTPUT_DIR)/%.lo: libqpdf/%.cc + $(call libcompile,$<,$(INCLUDES_libqpdf)) + +# Last three arguments to makelib are CURRENT,REVISION,AGE. +# +# * If any interfaces have been removed or changed, we are not binary +# compatible. Increment CURRENT, and set AGE and REVISION to 0. +# +# * Otherwise, if any interfaces have been added since the last +# public release, then increment CURRENT and AGE, and set REVISION +# to 0. 
+# +# * Otherwise, increment REVISION + +libqpdf/$(OUTPUT_DIR)/libqpdf.la: $(OBJS_libqpdf) + $(call makelib,$(OBJS_libqpdf),$@,1,0,0) + diff --git a/libqpdf/qpdf/BitStream.hh b/libqpdf/qpdf/BitStream.hh new file mode 100644 index 00000000..d02eea42 --- /dev/null +++ b/libqpdf/qpdf/BitStream.hh @@ -0,0 +1,23 @@ +// Read bits from a bit stream. See BitWriter for writing. + +#ifndef __BITSTREAM_HH__ +#define __BITSTREAM_HH__ + +class BitStream +{ + public: + BitStream(unsigned char const* p, int nbytes); + void reset(); + unsigned long getBits(int nbits); + void skipToNextByte(); + + private: + unsigned char const* start; + int nbytes; + + unsigned char const* p; + unsigned int bit_offset; + unsigned int bits_available; +}; + +#endif // __BITSTREAM_HH__ diff --git a/libqpdf/qpdf/BitWriter.hh b/libqpdf/qpdf/BitWriter.hh new file mode 100644 index 00000000..1efd498a --- /dev/null +++ b/libqpdf/qpdf/BitWriter.hh @@ -0,0 +1,24 @@ +// Write bits into a bit stream. See BitStream for reading. + +#ifndef __BITWRITER_HH__ +#define __BITWRITER_HH__ + +class Pipeline; + +class BitWriter +{ + public: + // Write bits to the pipeline. It is the caller's responsibility + // to eventually call finish on the pipeline. + BitWriter(Pipeline* pl); + void writeBits(unsigned long val, int bits); + // Force any partial byte to be written to the pipeline. 
+ void flush(); + + private: + Pipeline* pl; + unsigned char ch; + unsigned int bit_offset; +}; + +#endif // __BITWRITER_HH__ diff --git a/libqpdf/qpdf/MD5.hh b/libqpdf/qpdf/MD5.hh new file mode 100644 index 00000000..0ae15da9 --- /dev/null +++ b/libqpdf/qpdf/MD5.hh @@ -0,0 +1,73 @@ + +#ifndef __MD5_HH__ +#define __MD5_HH__ + +#include <string> +#include <qpdf/QEXC.hh> + +class MD5 +{ + public: + typedef unsigned char Digest[16]; + + MD5(); + void reset(); + + // encodes string and finalizes + void encodeString(char const* input_string); + + // encodes file and finalizes + void encodeFile(char const* filename, int up_to_size = -1) + throw(QEXC::System); + + // appends string to current md5 object + void appendString(char const* input_string); + + // appends arbitrary data to current md5 object + void encodeDataIncrementally(char const* input_data, int len); + + // computes a raw digest + void digest(Digest); + + // prints the digest to stdout terminated with \r\n (primarily for + // testing) + void print(); + + // returns the digest as a hexadecimal string + std::string unparse(); + + // Convenience functions + static std::string getDataChecksum(char const* buf, int len); + static std::string getFileChecksum(char const* filename, int up_to_size = -1); + static bool checkDataChecksum(char const* const checksum, + char const* buf, int len); + static bool checkFileChecksum(char const* const checksum, + char const* filename, int up_to_size = -1); + + private: + // POINTER defines a generic pointer type + typedef void *POINTER; + + // UINT2 defines a two byte word + typedef unsigned short int UINT2; + + // UINT4 defines a four byte word + // NOTE(review): unsigned long is eight bytes on LP64 platforms; + // confirm that MD5.cc does not rely on UINT4 being exactly 32 bits. + typedef unsigned long int UINT4; + + void init(); + void update(unsigned char *, unsigned int); + void final(); + + static void transform(UINT4 [4], unsigned char [64]); + static void encode(unsigned char *, UINT4 *, unsigned int); + static void decode(UINT4 *, unsigned char *, unsigned int); + + UINT4 state[4]; // state (ABCD) + UINT4 
count[2]; // number of bits, modulo 2^64 (lsb first) + unsigned char buffer[64]; // input buffer + + bool finalized; + Digest digest_val; +}; + +#endif // __MD5_HH__ diff --git a/libqpdf/qpdf/PCRE.hh b/libqpdf/qpdf/PCRE.hh new file mode 100644 index 00000000..a226aa19 --- /dev/null +++ b/libqpdf/qpdf/PCRE.hh @@ -0,0 +1,107 @@ +// This is a C++ wrapper class around Philip Hazel's perl-compatible +// regular expressions library. +// + +#ifndef __PCRE_HH__ +#define __PCRE_HH__ + +#include <pcre.h> +#include <string> + +#include <qpdf/QEXC.hh> + +// Note: this class does not encapsulate all features of the PCRE +// package -- only those that I actually need right now are here. + +class PCRE +{ + public: + class Exception: public QEXC::General + { + public: + Exception(std::string const& message); + virtual ~Exception() throw() {} + }; + + // This is thrown when an attempt is made to access a non-existent + // back reference. + class NoBackref: public Exception + { + public: + NoBackref(); + virtual ~NoBackref() throw() {} + }; + + class Match + { + friend class PCRE; + public: + Match(int nbackrefs, char const* subject); + Match(Match const&); + Match& operator=(Match const&); + ~Match(); + operator bool(); + + // All the back reference accessing routines may throw the + // special exception NoBackref (derived from Exception) if the + // back reference does not exist. Exception will be thrown + // for other error conditions. This allows callers to trap + // this condition explicitly when they care about the + // difference between a backreference matching an empty string + // and not matching at all. + + // see getMatch flags below + std::string getMatch(int n, int flags = 0) + throw(QEXC::General, Exception); + void getOffsetLength(int n, int& offset, int& length) throw(Exception); + int getOffset(int n) throw(Exception); + int getLength(int n) throw(Exception); + + // nMatches returns the number of available matches including + // match 0 which is the whole string. 
In other words, if you + // have one backreference in your expression and the + // expression matches, nMatches() will return 2, getMatch(0) + // will return the whole string, getMatch(1) will return the + // text that matched the backreference, and getMatch(2) will + // throw an exception because it is out of range. + int nMatches() const; + + // Flags for getMatch + + // getMatch on a substring that didn't match should return + // empty string instead of throwing an exception + static int const gm_no_substring_returns_empty = (1 << 0); + + private: + void init(int nmatches, int nbackrefs, char const* subject); + void copy(Match const&); + void destroy(); + + int nbackrefs; + char const* subject; + int* ovector; + int ovecsize; + int nmatches; + }; + + // The value passed in as options is passed to pcre_exec. See man + // pcreapi for details. + PCRE(char const* pattern, int options = 0) throw(Exception); + ~PCRE(); + + Match match(char const* subject, int options = 0, int startoffset = 0, + int size = -1) + throw(QEXC::General, Exception); + + static void test(int n = 0); + + private: + // prohibit copying and assignment + PCRE(PCRE const&); + PCRE& operator=(PCRE const&); + + pcre* code; + int nbackrefs; +}; + +#endif // __PCRE_HH__ diff --git a/libqpdf/qpdf/Pl_ASCII85Decoder.hh b/libqpdf/qpdf/Pl_ASCII85Decoder.hh new file mode 100644 index 00000000..9883a58e --- /dev/null +++ b/libqpdf/qpdf/Pl_ASCII85Decoder.hh @@ -0,0 +1,23 @@ + +#ifndef __PL_ASCII85DECODER_HH__ +#define __PL_ASCII85DECODER_HH__ + +#include <qpdf/Pipeline.hh> + +class Pl_ASCII85Decoder: public Pipeline +{ + public: + Pl_ASCII85Decoder(char const* identifier, Pipeline* next); + virtual ~Pl_ASCII85Decoder(); + virtual void write(unsigned char* buf, int len); + virtual void finish(); + + private: + void flush(); + + char inbuf[5]; + int pos; + int eod; +}; + +#endif // __PL_ASCII85DECODER_HH__ diff --git a/libqpdf/qpdf/Pl_ASCIIHexDecoder.hh b/libqpdf/qpdf/Pl_ASCIIHexDecoder.hh new file mode 100644 
index 00000000..36272328 --- /dev/null +++ b/libqpdf/qpdf/Pl_ASCIIHexDecoder.hh @@ -0,0 +1,23 @@ + +#ifndef __PL_ASCIIHEXDECODER_HH__ +#define __PL_ASCIIHEXDECODER_HH__ + +#include <qpdf/Pipeline.hh> + +class Pl_ASCIIHexDecoder: public Pipeline +{ + public: + Pl_ASCIIHexDecoder(char const* identifier, Pipeline* next); + virtual ~Pl_ASCIIHexDecoder(); + virtual void write(unsigned char* buf, int len); + virtual void finish(); + + private: + void flush(); + + char inbuf[3]; + int pos; + bool eod; +}; + +#endif // __PL_ASCIIHEXDECODER_HH__ diff --git a/libqpdf/qpdf/Pl_LZWDecoder.hh b/libqpdf/qpdf/Pl_LZWDecoder.hh new file mode 100644 index 00000000..95ec55b3 --- /dev/null +++ b/libqpdf/qpdf/Pl_LZWDecoder.hh @@ -0,0 +1,40 @@ + +#ifndef __PL_LZWDECODER_HH__ +#define __PL_LZWDECODER_HH__ + +#include <qpdf/Pipeline.hh> + +#include <qpdf/Buffer.hh> +#include <vector> + +class Pl_LZWDecoder: public Pipeline +{ + public: + Pl_LZWDecoder(char const* identifier, Pipeline* next, + bool early_code_change); + virtual ~Pl_LZWDecoder(); + virtual void write(unsigned char* buf, int len); + virtual void finish(); + + private: + void sendNextCode(); + void handleCode(int code); + unsigned char getFirstChar(int code); + void addToTable(unsigned char next); + + // members used for converting bits to codes + unsigned char buf[3]; + int code_size; + int next; + int byte_pos; + int bit_pos; // left to right: 01234567 + int bits_available; + + // members used for handling LZW decompression + bool code_change_delta; + bool eod; + std::vector<Buffer> table; + int last_code; +}; + +#endif // __PL_LZWDECODER_HH__ diff --git a/libqpdf/qpdf/Pl_MD5.hh b/libqpdf/qpdf/Pl_MD5.hh new file mode 100644 index 00000000..2d9d11fd --- /dev/null +++ b/libqpdf/qpdf/Pl_MD5.hh @@ -0,0 +1,30 @@ + +#ifndef __PL_MD5_HH__ +#define __PL_MD5_HH__ + +// This pipeline sends its output to its successor unmodified. After +// calling finish, the MD5 checksum of the data that passed through +// the pipeline is available. 
+ +// This pipeline is reusable; i.e., it is safe to call write() after +// calling finish(). The first call to write() after a call to +// finish() initializes a new MD5 object. + +#include <qpdf/Pipeline.hh> +#include <qpdf/MD5.hh> + +class Pl_MD5: public Pipeline +{ + public: + Pl_MD5(char const* identifier, Pipeline* next); + virtual ~Pl_MD5(); + virtual void write(unsigned char*, int); + virtual void finish(); + std::string getHexDigest(); + + private: + bool in_progress; + MD5 md5; +}; + +#endif // __PL_MD5_HH__ diff --git a/libqpdf/qpdf/Pl_PNGFilter.hh b/libqpdf/qpdf/Pl_PNGFilter.hh new file mode 100644 index 00000000..1ecc7060 --- /dev/null +++ b/libqpdf/qpdf/Pl_PNGFilter.hh @@ -0,0 +1,62 @@ + +#ifndef __PL_PNGFILTER_HH__ +#define __PL_PNGFILTER_HH__ + +// This pipeline applies or reverses the application of a PNG filter +// as described in the PNG specification. + +// NOTE: In its initial implementation, it only encodes and decodes +// filters "none" and "up". The primary motivation of this code is to +// encode and decode PDF 1.5+ XRef streams which are often encoded +// with Flate predictor 12, which corresponds to the PNG up filter. +// At present, the bytes_per_pixel parameter is ignored, and an +// exception is thrown if any row of the file has a filter of other +// than 0 or 2. Finishing the implementation would not be difficult. +// See chapter 6 of the PNG specification for a description of the +// filter algorithms. 
+ +#include <qpdf/Pipeline.hh> + +class Pl_PNGFilter: public Pipeline +{ + public: + class Exception: public Pipeline::Exception + { + public: + Exception(std::string const& message) : + Pipeline::Exception(message) + { + } + + virtual ~Exception() throw () + { + } + }; + + // Encoding is not presently supported + enum action_e { a_encode, a_decode }; + + Pl_PNGFilter(char const* identifier, Pipeline* next, + action_e action, unsigned int columns, + unsigned int bytes_per_pixel); + virtual ~Pl_PNGFilter(); + + virtual void write(unsigned char* data, int len); + virtual void finish(); + + private: + void processRow(); + void encodeRow(); + void decodeRow(); + + action_e action; + unsigned int columns; + unsigned char* cur_row; + unsigned char* prev_row; + unsigned char* buf1; + unsigned char* buf2; + int pos; + int incoming; +}; + +#endif // __PL_PNGFILTER_HH__ diff --git a/libqpdf/qpdf/Pl_QPDFTokenizer.hh b/libqpdf/qpdf/Pl_QPDFTokenizer.hh new file mode 100644 index 00000000..448dbb18 --- /dev/null +++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh @@ -0,0 +1,40 @@ + +#ifndef __PL_QPDFTOKENIZER_HH__ +#define __PL_QPDFTOKENIZER_HH__ + +#include <qpdf/Pipeline.hh> + +#include <qpdf/QPDFTokenizer.hh> + +// +// Treat incoming text as a stream consisting of valid PDF tokens, but +// output bad tokens just the same. The idea here is to be able to +// use pipeline for content streams to normalize newlines without +// interfering with meaningful newlines such as those that occur +// inside of strings. 
+// + +class Pl_QPDFTokenizer: public Pipeline +{ + public: + Pl_QPDFTokenizer(char const* identifier, Pipeline* next); + virtual ~Pl_QPDFTokenizer(); + virtual void write(unsigned char* buf, int len); + virtual void finish(); + + private: + void processChar(char ch); + void checkUnread(); + void writeNext(char const*, int len); + void writeToken(QPDFTokenizer::Token&); + + QPDFTokenizer tokenizer; + bool newline_after_next_token; + bool just_wrote_nl; + bool last_char_was_cr; + bool unread_char; + char char_to_unread; + bool pass_through; +}; + +#endif // __PL_QPDFTOKENIZER_HH__ diff --git a/libqpdf/qpdf/Pl_RC4.hh b/libqpdf/qpdf/Pl_RC4.hh new file mode 100644 index 00000000..6bebe5aa --- /dev/null +++ b/libqpdf/qpdf/Pl_RC4.hh @@ -0,0 +1,42 @@ + +#ifndef __PL_RC4_HH__ +#define __PL_RC4_HH__ + +#include <qpdf/Pipeline.hh> + +#include <qpdf/RC4.hh> + +class Pl_RC4: public Pipeline +{ + public: + class Exception: public Pipeline::Exception + { + public: + Exception(std::string const& message) : + Pipeline::Exception(message) + { + } + + virtual ~Exception() throw() + { + } + }; + + static int const def_bufsize = 65536; + + // key_len of -1 means treat key_data as a null-terminated string + Pl_RC4(char const* identifier, Pipeline* next, + unsigned char const* key_data, int key_len = -1, + int out_bufsize = def_bufsize); + virtual ~Pl_RC4(); + + virtual void write(unsigned char* data, int len); + virtual void finish(); + + private: + unsigned char* outbuf; + int out_bufsize; + RC4 rc4; +}; + +#endif // __PL_RC4_HH__ diff --git a/libqpdf/qpdf/QPDF_Array.hh b/libqpdf/qpdf/QPDF_Array.hh new file mode 100644 index 00000000..371be50e --- /dev/null +++ b/libqpdf/qpdf/QPDF_Array.hh @@ -0,0 +1,24 @@ + +#ifndef __QPDF_ARRAY_HH__ +#define __QPDF_ARRAY_HH__ + +#include <qpdf/QPDFObject.hh> + +#include <vector> +#include <qpdf/QPDFObjectHandle.hh> + +class QPDF_Array: public QPDFObject +{ + public: + QPDF_Array(std::vector<QPDFObjectHandle> const& items); + virtual ~QPDF_Array(); + 
virtual std::string unparse(); + int getNItems() const; + QPDFObjectHandle getItem(int n) const; + void setItem(int, QPDFObjectHandle const&); + + private: + std::vector<QPDFObjectHandle> items; +}; + +#endif // __QPDF_ARRAY_HH__ diff --git a/libqpdf/qpdf/QPDF_Bool.hh b/libqpdf/qpdf/QPDF_Bool.hh new file mode 100644 index 00000000..06aca822 --- /dev/null +++ b/libqpdf/qpdf/QPDF_Bool.hh @@ -0,0 +1,19 @@ + +#ifndef __QPDF_BOOL_HH__ +#define __QPDF_BOOL_HH__ + +#include <qpdf/QPDFObject.hh> + +class QPDF_Bool: public QPDFObject +{ + public: + QPDF_Bool(bool val); + virtual ~QPDF_Bool(); + virtual std::string unparse(); + bool getVal() const; + + private: + bool val; +}; + +#endif // __QPDF_BOOL_HH__ diff --git a/libqpdf/qpdf/QPDF_Dictionary.hh b/libqpdf/qpdf/QPDF_Dictionary.hh new file mode 100644 index 00000000..6a79fb69 --- /dev/null +++ b/libqpdf/qpdf/QPDF_Dictionary.hh @@ -0,0 +1,35 @@ + +#ifndef __QPDF_DICTIONARY_HH__ +#define __QPDF_DICTIONARY_HH__ + +#include <qpdf/QPDFObject.hh> + +#include <set> +#include <map> + +#include <qpdf/QPDFObjectHandle.hh> + +class QPDF_Dictionary: public QPDFObject +{ + public: + QPDF_Dictionary(std::map<std::string, QPDFObjectHandle> const& items); + virtual ~QPDF_Dictionary(); + virtual std::string unparse(); + + // hasKey() and getKeys() treat keys with null values as if they + // aren't there. getKey() returns null for the value of a + // non-existent key. This is as per the PDF spec. 
+ bool hasKey(std::string const&); + QPDFObjectHandle getKey(std::string const&); + std::set<std::string> getKeys(); + + // Replace value of key, adding it if it does not exist + void replaceKey(std::string const& key, QPDFObjectHandle const&); + // Remove key, doing nothing if key does not exist + void removeKey(std::string const& key); + + private: + std::map<std::string, QPDFObjectHandle> items; +}; + +#endif // __QPDF_DICTIONARY_HH__ diff --git a/libqpdf/qpdf/QPDF_Integer.hh b/libqpdf/qpdf/QPDF_Integer.hh new file mode 100644 index 00000000..fb6360b2 --- /dev/null +++ b/libqpdf/qpdf/QPDF_Integer.hh @@ -0,0 +1,19 @@ + +#ifndef __QPDF_INTEGER_HH__ +#define __QPDF_INTEGER_HH__ + +#include <qpdf/QPDFObject.hh> + +class QPDF_Integer: public QPDFObject +{ + public: + QPDF_Integer(int val); + virtual ~QPDF_Integer(); + virtual std::string unparse(); + int getVal() const; + + private: + int val; +}; + +#endif // __QPDF_INTEGER_HH__ diff --git a/libqpdf/qpdf/QPDF_Name.hh b/libqpdf/qpdf/QPDF_Name.hh new file mode 100644 index 00000000..a32f6f4f --- /dev/null +++ b/libqpdf/qpdf/QPDF_Name.hh @@ -0,0 +1,22 @@ + +#ifndef __QPDF_NAME_HH__ +#define __QPDF_NAME_HH__ + +#include <qpdf/QPDFObject.hh> + +class QPDF_Name: public QPDFObject +{ + public: + QPDF_Name(std::string const& name); + virtual ~QPDF_Name(); + virtual std::string unparse(); + std::string getName() const; + + // Put # into strings with characters unsuitable for name token + static std::string normalizeName(std::string const& name); + + private: + std::string name; +}; + +#endif // __QPDF_NAME_HH__ diff --git a/libqpdf/qpdf/QPDF_Null.hh b/libqpdf/qpdf/QPDF_Null.hh new file mode 100644 index 00000000..60c1ae35 --- /dev/null +++ b/libqpdf/qpdf/QPDF_Null.hh @@ -0,0 +1,14 @@ + +#ifndef __QPDF_NULL_HH__ +#define __QPDF_NULL_HH__ + +#include <qpdf/QPDFObject.hh> + +class QPDF_Null: public QPDFObject +{ + public: + virtual ~QPDF_Null(); + std::string unparse(); +}; + +#endif // __QPDF_NULL_HH__ diff --git 
a/libqpdf/qpdf/QPDF_Real.hh b/libqpdf/qpdf/QPDF_Real.hh new file mode 100644 index 00000000..b950c569 --- /dev/null +++ b/libqpdf/qpdf/QPDF_Real.hh @@ -0,0 +1,20 @@ + +#ifndef __QPDF_REAL_HH__ +#define __QPDF_REAL_HH__ + +#include <qpdf/QPDFObject.hh> + +class QPDF_Real: public QPDFObject +{ + public: + QPDF_Real(std::string const& val); + virtual ~QPDF_Real(); + std::string unparse(); + std::string getVal(); + + private: + // Store reals as strings to avoid roundoff errors. + std::string val; +}; + +#endif // __QPDF_REAL_HH__ diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh new file mode 100644 index 00000000..71381fd3 --- /dev/null +++ b/libqpdf/qpdf/QPDF_Stream.hh @@ -0,0 +1,42 @@ + +#ifndef __QPDF_STREAM_HH__ +#define __QPDF_STREAM_HH__ + +#include <qpdf/QPDFObject.hh> + +#include <qpdf/QPDFObjectHandle.hh> + +class Pipeline; +class QPDF; + +class QPDF_Stream: public QPDFObject +{ + public: + QPDF_Stream(QPDF*, int objid, int generation, + QPDFObjectHandle stream_dict, + off_t offset, int length); + virtual ~QPDF_Stream(); + virtual std::string unparse(); + QPDFObjectHandle getDict() const; + + // See comments in QPDFObjectHandle.hh + bool pipeStreamData(Pipeline*, bool filter, + bool normalize, bool compress); + + // See comments in QPDFObjectHandle.hh + PointerHolder<Buffer> getStreamData(); + + private: + bool filterable(std::vector<std::string>& filters, + int& predictor, int& columns, bool& early_code_change); + + + QPDF* qpdf; + int objid; + int generation; + QPDFObjectHandle stream_dict; + off_t offset; + int length; +}; + +#endif // __QPDF_STREAM_HH__ diff --git a/libqpdf/qpdf/QPDF_String.hh b/libqpdf/qpdf/QPDF_String.hh new file mode 100644 index 00000000..f3063c50 --- /dev/null +++ b/libqpdf/qpdf/QPDF_String.hh @@ -0,0 +1,23 @@ + +#ifndef __QPDF_STRING_HH__ +#define __QPDF_STRING_HH__ + +#include <qpdf/QPDFObject.hh> + +// QPDF_Strings may include embedded null characters. 
+ +class QPDF_String: public QPDFObject +{ + public: + QPDF_String(std::string const& val); + virtual ~QPDF_String(); + virtual std::string unparse(); + std::string unparse(bool force_binary); + std::string getVal() const; + std::string getUTF8Val() const; + + private: + std::string val; +}; + +#endif // __QPDF_STRING_HH__ diff --git a/libqpdf/qpdf/RC4.hh b/libqpdf/qpdf/RC4.hh new file mode 100644 index 00000000..657bf35b --- /dev/null +++ b/libqpdf/qpdf/RC4.hh @@ -0,0 +1,26 @@ + +#ifndef __RC4_HH__ +#define __RC4_HH__ + +class RC4 +{ + public: + // key_len of -1 means treat key_data as a null-terminated string + RC4(unsigned char const* key_data, int key_len = -1); + + // out_data = 0 means to encrypt/decrypt in place + void process(unsigned char* in_data, int len, unsigned char* out_data = 0); + + private: + class RC4Key + { + public: + unsigned char state[256]; + unsigned char x; + unsigned char y; + }; + + RC4Key key; +}; + +#endif // __RC4_HH__ |