aboutsummaryrefslogtreecommitdiffstats
path: root/libqpdf
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2018-02-03 00:21:34 +0100
committer Jay Berkenbilt <ejb@ql.org> 2018-02-19 03:05:46 +0100
commit 99101044429c3c91bd11bdd1b26e5b6c2ceb140b (patch)
tree 5ab366eab31ddf76e80f99bd1d34c421291f1c4e /libqpdf
parent b8723e97f4b94fe03e631aab0309382ead3137ed (diff)
download qpdf-99101044429c3c91bd11bdd1b26e5b6c2ceb140b.tar.zst
Implement TokenFilter and refactor Pl_QPDFTokenizer
Implement a TokenFilter class and refactor Pl_QPDFTokenizer to use a TokenFilter class called ContentNormalizer. Pl_QPDFTokenizer is now a general filter that passes data through a TokenFilter.
Diffstat (limited to 'libqpdf')
-rw-r--r-- libqpdf/ContentNormalizer.cc 77
-rw-r--r-- libqpdf/Pl_QPDFTokenizer.cc 121
-rw-r--r-- libqpdf/QPDFObjectHandle.cc 66
-rw-r--r-- libqpdf/QPDFTokenizer.cc 18
-rw-r--r-- libqpdf/QPDFWriter.cc 6
-rw-r--r-- libqpdf/QPDF_Stream.cc 34
-rw-r--r-- libqpdf/build.mk 1
-rw-r--r-- libqpdf/qpdf/ContentNormalizer.hh 15
-rw-r--r-- libqpdf/qpdf/Pl_QPDFTokenizer.hh 32
-rw-r--r-- libqpdf/qpdf/QPDF_Stream.hh 5
10 files changed, 269 insertions, 106 deletions
diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
new file mode 100644
index 00000000..35a8ad74
--- /dev/null
+++ b/libqpdf/ContentNormalizer.cc
@@ -0,0 +1,77 @@
+#include <qpdf/ContentNormalizer.hh>
+#include <qpdf/QUtil.hh>
+
+// Default constructor; the body is empty, so nothing is initialized here.
+ContentNormalizer::ContentNormalizer()
+{
+}
+
+// Destructor; the body is empty, so nothing is released here.
+ContentNormalizer::~ContentNormalizer()
+{
+}
+
+// Write a normalized form of one token to the filter's output.
+//
+// - Space tokens are copied character by character, except that a
+//   carriage return is dropped when a line feed immediately follows
+//   it inside the same token, and is written as "\n" otherwise.
+// - String and name tokens are rebuilt from token.getValue() and
+//   written via writeToken().
+// - Every other token type is passed to writeToken() unchanged.
+//
+// After a string or name token whose raw value contained '\r' or
+// '\n', an extra "\n" is written.
+void
+ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
+{
+ std::string value = token.getRawValue();
+ QPDFTokenizer::token_type_e token_type = token.getType();
+
+ switch (token_type)
+ {
+ case QPDFTokenizer::tt_space:
+ {
+ size_t len = value.length();
+ for (size_t i = 0; i < len; ++i)
+ {
+ char ch = value.at(i);
+ if (ch == '\r')
+ {
+ if ((i + 1 < len) && (value.at(i + 1) == '\n'))
+ {
+ // Drop this CR; the LF that follows is written by
+ // the next loop iteration.
+ }
+ else
+ {
+ // A CR not followed by LF in this token becomes LF.
+ write("\n");
+ }
+ }
+ else
+ {
+ write(&ch, 1);
+ }
+ }
+ }
+ break;
+
+ case QPDFTokenizer::tt_string:
+ // Replacing string and name tokens in this way normalizes
+ // their representation as this will automatically handle
+ // quoting of unprintable characters, etc.
+ writeToken(QPDFTokenizer::Token(
+ QPDFTokenizer::tt_string, token.getValue()));
+ break;
+
+ case QPDFTokenizer::tt_name:
+ writeToken(QPDFTokenizer::Token(
+ QPDFTokenizer::tt_name, token.getValue()));
+ break;
+
+ default:
+ writeToken(token);
+ break;
+ }
+
+ // value is not modified above, so this re-read leaves it holding
+ // the same raw value of the original (un-normalized) token.
+ value = token.getRawValue();
+ if (((token_type == QPDFTokenizer::tt_string) ||
+ (token_type == QPDFTokenizer::tt_name)) &&
+ ((value.find('\r') != std::string::npos) ||
+ (value.find('\n') != std::string::npos)))
+ {
+ write("\n");
+ }
+}
+
+// End-of-input handler: calls finish() and does nothing else.
+void
+ContentNormalizer::handleEOF()
+{
+ finish();
+}
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
index 9595cd75..4fc37767 100644
--- a/libqpdf/Pl_QPDFTokenizer.cc
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -1,107 +1,51 @@
#include <qpdf/Pl_QPDFTokenizer.hh>
-#include <qpdf/QPDF_String.hh>
-#include <qpdf/QPDF_Name.hh>
#include <qpdf/QTC.hh>
-#include <qpdf/QUtil.hh>
#include <stdexcept>
#include <string.h>
-Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
- Pipeline(identifier, next),
- just_wrote_nl(false),
+Pl_QPDFTokenizer::Members::Members() :
+ filter(0),
last_char_was_cr(false),
unread_char(false),
char_to_unread('\0')
{
- tokenizer.allowEOF();
- tokenizer.includeIgnorable();
}
-Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
+Pl_QPDFTokenizer::Members::~Members()
{
}
-void
-Pl_QPDFTokenizer::writeNext(char const* buf, size_t len)
+Pl_QPDFTokenizer::Pl_QPDFTokenizer(
+ char const* identifier,
+ QPDFObjectHandle::TokenFilter* filter)
+ :
+ Pipeline(identifier, 0),
+ m(new Members)
{
- if (len)
- {
- getNext()->write(QUtil::unsigned_char_pointer(buf), len);
- this->just_wrote_nl = (buf[len-1] == '\n');
- }
+ m->filter = filter;
+ m->tokenizer.allowEOF();
+ m->tokenizer.includeIgnorable();
}
-void
-Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
+Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
{
- std::string value = token.getRawValue();
-
- switch (token.getType())
- {
- case QPDFTokenizer::tt_space:
- {
- size_t len = value.length();
- for (size_t i = 0; i < len; ++i)
- {
- char ch = value.at(i);
- if (ch == '\r')
- {
- if ((i + 1 < len) && (value.at(i + 1) == '\n'))
- {
- // ignore
- }
- else
- {
- writeNext("\n", 1);
- }
- }
- else
- {
- writeNext(&ch, 1);
- }
- }
- }
- value.clear();
- break;
-
- case QPDFTokenizer::tt_string:
- value = QPDF_String(token.getValue()).unparse();
-
- break;
-
- case QPDFTokenizer::tt_name:
- value = QPDF_Name(token.getValue()).unparse();
- break;
-
- default:
- break;
- }
- writeNext(value.c_str(), value.length());
}
void
Pl_QPDFTokenizer::processChar(char ch)
{
- tokenizer.presentCharacter(ch);
+ this->m->tokenizer.presentCharacter(ch);
QPDFTokenizer::Token token;
- if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
+ if (this->m->tokenizer.getToken(
+ token, this->m->unread_char, this->m->char_to_unread))
{
- writeToken(token);
- std::string value = token.getRawValue();
- QPDFTokenizer::token_type_e token_type = token.getType();
- if (((token_type == QPDFTokenizer::tt_string) ||
- (token_type == QPDFTokenizer::tt_name)) &&
- ((value.find('\r') != std::string::npos) ||
- (value.find('\n') != std::string::npos)))
+ this->m->filter->handleToken(token);
+ if ((token.getType() == QPDFTokenizer::tt_word) &&
+ (token.getValue() == "ID"))
{
- writeNext("\n", 1);
- }
- if ((token.getType() == QPDFTokenizer::tt_word) &&
- (token.getValue() == "ID"))
- {
QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
- tokenizer.expectInlineImage();
- }
+ this->m->tokenizer.expectInlineImage();
+ }
}
}
@@ -109,10 +53,10 @@ Pl_QPDFTokenizer::processChar(char ch)
void
Pl_QPDFTokenizer::checkUnread()
{
- if (this->unread_char)
+ if (this->m->unread_char)
{
- processChar(this->char_to_unread);
- if (this->unread_char)
+ processChar(this->m->char_to_unread);
+ if (this->m->unread_char)
{
throw std::logic_error(
"INTERNAL ERROR: unread_char still true after processing "
@@ -135,20 +79,13 @@ Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
void
Pl_QPDFTokenizer::finish()
{
- this->tokenizer.presentEOF();
+ this->m->tokenizer.presentEOF();
QPDFTokenizer::Token token;
- if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
+ if (this->m->tokenizer.getToken(
+ token, this->m->unread_char, this->m->char_to_unread))
{
- writeToken(token);
- if (unread_char)
- {
- if (this->char_to_unread == '\r')
- {
- this->char_to_unread = '\n';
- }
- writeNext(&this->char_to_unread, 1);
- }
+ this->m->filter->handleToken(token);
}
- getNext()->finish();
+ this->m->filter->handleEOF();
}
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 51de87e1..bba95938 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -63,6 +63,50 @@ CoalesceProvider::provideStreamData(int, int, Pipeline* p)
}
+// Store p as this filter's output pipeline. The pointer is stored
+// as given; no ownership handling is visible here.
 void
+QPDFObjectHandle::TokenFilter::setPipeline(Pipeline* p)
+{
+ this->pipeline = p;
+}
+
+// Write len bytes starting at data to the output pipeline.
+// Throws std::logic_error if no pipeline has been set. When len is
+// zero the pipeline is not called at all.
+void
+QPDFObjectHandle::TokenFilter::write(char const* data, size_t len)
+{
+ if (! this->pipeline)
+ {
+ throw std::logic_error(
+ "TokenFilter::write called before setPipeline");
+ }
+ if (len)
+ {
+ this->pipeline->write(QUtil::unsigned_char_pointer(data), len);
+ }
+}
+
+// Convenience overload: forwards the string's bytes and length to
+// write(char const*, size_t).
+void
+QPDFObjectHandle::TokenFilter::write(std::string const& str)
+{
+ write(str.c_str(), str.length());
+}
+
+// Write the token's raw value (token.getRawValue()) to the output
+// via write(); the token's type and getValue() are not consulted.
+void
+QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
+{
+ std::string value = token.getRawValue();
+ write(value.c_str(), value.length());
+}
+
+// Call finish() on the output pipeline.
+// Throws std::logic_error if no pipeline has been set.
+void
+QPDFObjectHandle::TokenFilter::finish()
+{
+ if (! this->pipeline)
+ {
+ throw std::logic_error(
+ "TokenFilter::finish called before setPipeline");
+ }
+ this->pipeline->finish();
+}
+
+void
QPDFObjectHandle::ParserCallbacks::terminateParsing()
{
throw TerminateParsing();
@@ -508,6 +552,13 @@ QPDFObjectHandle::getDict()
return dynamic_cast<QPDF_Stream*>(obj.getPointer())->getDict();
}
+// Calls assertStream(), then returns the result of isDataModified()
+// on the underlying QPDF_Stream object.
+// NOTE(review): the dynamic_cast result is dereferenced without a
+// null check; presumably assertStream() guarantees the type -- confirm.
+bool
+QPDFObjectHandle::isDataModified()
+{
+ assertStream();
+ return dynamic_cast<QPDF_Stream*>(obj.getPointer())->isDataModified();
+}
+
void
QPDFObjectHandle::replaceDict(QPDFObjectHandle new_dict)
{
@@ -1033,6 +1084,21 @@ QPDFObjectHandle::parseContentStream_data(
}
}
+// Calls coalesceContentStreams() and then adds filter as a token
+// filter on the object stored under this object's "/Contents" key.
+// NOTE(review): presumably intended to be called on a page object
+// -- confirm against the header documentation.
+void
+QPDFObjectHandle::addContentTokenFilter(PointerHolder<TokenFilter> filter)
+{
+ coalesceContentStreams();
+ this->getKey("/Contents").addTokenFilter(filter);
+}
+
+// Calls assertStream(), then forwards filter to addTokenFilter() on
+// the underlying QPDF_Stream object. The "return" of a void
+// expression is legal C++ and simply forwards the call.
+void
+QPDFObjectHandle::addTokenFilter(PointerHolder<TokenFilter> filter)
+{
+ assertStream();
+ return dynamic_cast<QPDF_Stream*>(
+ obj.getPointer())->addTokenFilter(filter);
+}
+
QPDFObjectHandle
QPDFObjectHandle::parse(PointerHolder<InputSource> input,
std::string const& object_description,
diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc
index 078b1af0..c3a017d0 100644
--- a/libqpdf/QPDFTokenizer.cc
+++ b/libqpdf/QPDFTokenizer.cc
@@ -7,6 +7,7 @@
#include <qpdf/QTC.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFObjectHandle.hh>
#include <stdexcept>
#include <string.h>
@@ -39,6 +40,23 @@ QPDFTokenizer::Members::~Members()
{
}
+// Construct a token of the given type from its value. The raw value
+// starts out equal to value; for string and name tokens it is then
+// regenerated by unparsing a newly created string or name object
+// built from value, so the raw value is that object's unparsed form.
+QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
+ type(type),
+ value(value),
+ raw_value(value)
+{
+ if (type == tt_string)
+ {
+ raw_value = QPDFObjectHandle::newString(value).unparse();
+ }
+ // Names need their own branch: testing tt_string a second time
+ // here would be unreachable and leave name tokens un-normalized.
+ else if (type == tt_name)
+ {
+ raw_value = QPDFObjectHandle::newName(value).unparse();
+ }
+}
+
+
+
QPDFTokenizer::QPDFTokenizer() :
m(new Members())
{
diff --git a/libqpdf/QPDFWriter.cc b/libqpdf/QPDFWriter.cc
index f7f834b5..f277189a 100644
--- a/libqpdf/QPDFWriter.cc
+++ b/libqpdf/QPDFWriter.cc
@@ -1591,7 +1591,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
{
is_metadata = true;
}
- bool filter = (this->m->compress_streams ||
+ bool filter = (object.isDataModified() ||
+ this->m->compress_streams ||
this->m->stream_decode_level);
if (this->m->compress_streams)
{
@@ -1602,7 +1603,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
// compressed with a lossy compression scheme, but we
// don't support any of those right now.
QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter");
- if (filter_obj.isName() &&
+ if ((! object.isDataModified()) &&
+ filter_obj.isName() &&
((filter_obj.getName() == "/FlateDecode") ||
(filter_obj.getName() == "/Fl")))
{
diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc
index 89b6b5a9..a026f9a4 100644
--- a/libqpdf/QPDF_Stream.cc
+++ b/libqpdf/QPDF_Stream.cc
@@ -13,7 +13,7 @@
#include <qpdf/Pl_RunLength.hh>
#include <qpdf/Pl_DCT.hh>
#include <qpdf/Pl_Count.hh>
-
+#include <qpdf/ContentNormalizer.hh>
#include <qpdf/QTC.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFExc.hh>
@@ -91,6 +91,12 @@ QPDF_Stream::getDict() const
return this->stream_dict;
}
+// Returns true when at least one token filter has been added to
+// this stream (token_filters is non-empty), false otherwise.
+bool
+QPDF_Stream::isDataModified() const
+{
+ return (! this->token_filters.empty());
+}
+
PointerHolder<Buffer>
QPDF_Stream::getStreamData(qpdf_stream_decode_level_e decode_level)
{
@@ -440,21 +446,36 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
// create to be deleted when this function finishes.
std::vector<PointerHolder<Pipeline> > to_delete;
+ PointerHolder<ContentNormalizer> normalizer;
if (filter)
{
if (encode_flags & qpdf_ef_compress)
{
- pipeline = new Pl_Flate("compress object stream", pipeline,
+ pipeline = new Pl_Flate("compress stream", pipeline,
Pl_Flate::a_deflate);
to_delete.push_back(pipeline);
}
if (encode_flags & qpdf_ef_normalize)
{
- pipeline = new Pl_QPDFTokenizer("normalizer", pipeline);
+ normalizer = new ContentNormalizer();
+ normalizer->setPipeline(pipeline);
+ pipeline = new Pl_QPDFTokenizer(
+ "normalizer", normalizer.getPointer());
to_delete.push_back(pipeline);
}
+ for (std::vector<PointerHolder<
+ QPDFObjectHandle::TokenFilter> >::reverse_iterator iter =
+ this->token_filters.rbegin();
+ iter != this->token_filters.rend(); ++iter)
+ {
+ (*iter)->setPipeline(pipeline);
+ pipeline = new Pl_QPDFTokenizer(
+ "token filter", (*iter).getPointer());
+ to_delete.push_back(pipeline);
+ }
+
for (std::vector<std::string>::reverse_iterator iter = filters.rbegin();
iter != filters.rend(); ++iter)
{
@@ -613,6 +634,13 @@ QPDF_Stream::replaceStreamData(
}
+// Append token_filter to the end of this stream's token_filters
+// list.
 void
+QPDF_Stream::addTokenFilter(
+ PointerHolder<QPDFObjectHandle::TokenFilter> token_filter)
+{
+ this->token_filters.push_back(token_filter);
+}
+
+void
QPDF_Stream::replaceFilterData(QPDFObjectHandle const& filter,
QPDFObjectHandle const& decode_parms,
size_t length)
diff --git a/libqpdf/build.mk b/libqpdf/build.mk
index c75c3dd9..11895623 100644
--- a/libqpdf/build.mk
+++ b/libqpdf/build.mk
@@ -9,6 +9,7 @@ SRCS_libqpdf = \
libqpdf/BitWriter.cc \
libqpdf/Buffer.cc \
libqpdf/BufferInputSource.cc \
+ libqpdf/ContentNormalizer.cc \
libqpdf/FileInputSource.cc \
libqpdf/InputSource.cc \
libqpdf/InsecureRandomDataProvider.cc \
diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh
new file mode 100644
index 00000000..504f15e8
--- /dev/null
+++ b/libqpdf/qpdf/ContentNormalizer.hh
@@ -0,0 +1,15 @@
+#ifndef __CONTENTNORMALIZER_HH__
+#define __CONTENTNORMALIZER_HH__
+
+#include <qpdf/QPDFObjectHandle.hh>
+
+// A QPDFObjectHandle::TokenFilter subclass that declares its own
+// handleToken() and handleEOF() as virtual methods.
+class ContentNormalizer: public QPDFObjectHandle::TokenFilter
+{
+ public:
+ ContentNormalizer();
+ virtual ~ContentNormalizer();
+ // Called with each token to be filtered.
+ virtual void handleToken(QPDFTokenizer::Token const&);
+ // Called once when the input has ended.
+ virtual void handleEOF();
+};
+
+#endif // __CONTENTNORMALIZER_HH__
diff --git a/libqpdf/qpdf/Pl_QPDFTokenizer.hh b/libqpdf/qpdf/Pl_QPDFTokenizer.hh
index 54507f68..9f4ac133 100644
--- a/libqpdf/qpdf/Pl_QPDFTokenizer.hh
+++ b/libqpdf/qpdf/Pl_QPDFTokenizer.hh
@@ -4,6 +4,8 @@
#include <qpdf/Pipeline.hh>
#include <qpdf/QPDFTokenizer.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/QPDFObjectHandle.hh>
//
// Treat incoming text as a stream consisting of valid PDF tokens, but
@@ -16,7 +18,8 @@
class Pl_QPDFTokenizer: public Pipeline
{
public:
- Pl_QPDFTokenizer(char const* identifier, Pipeline* next);
+ Pl_QPDFTokenizer(char const* identifier,
+ QPDFObjectHandle::TokenFilter* filter);
virtual ~Pl_QPDFTokenizer();
virtual void write(unsigned char* buf, size_t len);
virtual void finish();
@@ -24,14 +27,25 @@ class Pl_QPDFTokenizer: public Pipeline
private:
void processChar(char ch);
void checkUnread();
- void writeNext(char const*, size_t len);
- void writeToken(QPDFTokenizer::Token&);
-
- QPDFTokenizer tokenizer;
- bool just_wrote_nl;
- bool last_char_was_cr;
- bool unread_char;
- char char_to_unread;
+
+ class Members
+ {
+ friend class Pl_QPDFTokenizer;
+
+ public:
+ ~Members();
+
+ private:
+ Members();
+ Members(Members const&);
+
+ QPDFObjectHandle::TokenFilter* filter;
+ QPDFTokenizer tokenizer;
+ bool last_char_was_cr;
+ bool unread_char;
+ char char_to_unread;
+ };
+ PointerHolder<Members> m;
};
#endif // __PL_QPDFTOKENIZER_HH__
diff --git a/libqpdf/qpdf/QPDF_Stream.hh b/libqpdf/qpdf/QPDF_Stream.hh
index 5350fc0d..86b796cf 100644
--- a/libqpdf/qpdf/QPDF_Stream.hh
+++ b/libqpdf/qpdf/QPDF_Stream.hh
@@ -20,6 +20,7 @@ class QPDF_Stream: public QPDFObject
virtual QPDFObject::object_type_e getTypeCode() const;
virtual char const* getTypeName() const;
QPDFObjectHandle getDict() const;
+ bool isDataModified() const;
// See comments in QPDFObjectHandle.hh for these methods.
bool pipeStreamData(Pipeline*,
@@ -35,6 +36,8 @@ class QPDF_Stream: public QPDFObject
PointerHolder<QPDFObjectHandle::StreamDataProvider> provider,
QPDFObjectHandle const& filter,
QPDFObjectHandle const& decode_parms);
+ void addTokenFilter(
+ PointerHolder<QPDFObjectHandle::TokenFilter> token_filter);
void replaceDict(QPDFObjectHandle new_dict);
@@ -72,6 +75,8 @@ class QPDF_Stream: public QPDFObject
size_t length;
PointerHolder<Buffer> stream_data;
PointerHolder<QPDFObjectHandle::StreamDataProvider> stream_provider;
+ std::vector<
+ PointerHolder<QPDFObjectHandle::TokenFilter> > token_filters;
};
#endif // __QPDF_STREAM_HH__