From 99101044429c3c91bd11bdd1b26e5b6c2ceb140b Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Fri, 2 Feb 2018 18:21:34 -0500 Subject: Implement TokenFilter and refactor Pl_QPDFTokenizer Implement a TokenFilter class and refactor Pl_QPDFTokenizer to use a TokenFilter class called ContentNormalizer. Pl_QPDFTokenizer is now a general filter that passes data through a TokenFilter. --- libqpdf/Pl_QPDFTokenizer.cc | 121 +++++++++++--------------------------------- 1 file changed, 29 insertions(+), 92 deletions(-) (limited to 'libqpdf/Pl_QPDFTokenizer.cc') diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc index 9595cd75..4fc37767 100644 --- a/libqpdf/Pl_QPDFTokenizer.cc +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -1,107 +1,51 @@ #include -#include -#include #include -#include #include #include -Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : - Pipeline(identifier, next), - just_wrote_nl(false), +Pl_QPDFTokenizer::Members::Members() : + filter(0), last_char_was_cr(false), unread_char(false), char_to_unread('\0') { - tokenizer.allowEOF(); - tokenizer.includeIgnorable(); } -Pl_QPDFTokenizer::~Pl_QPDFTokenizer() +Pl_QPDFTokenizer::Members::~Members() { } -void -Pl_QPDFTokenizer::writeNext(char const* buf, size_t len) +Pl_QPDFTokenizer::Pl_QPDFTokenizer( + char const* identifier, + QPDFObjectHandle::TokenFilter* filter) + : + Pipeline(identifier, 0), + m(new Members) { - if (len) - { - getNext()->write(QUtil::unsigned_char_pointer(buf), len); - this->just_wrote_nl = (buf[len-1] == '\n'); - } + m->filter = filter; + m->tokenizer.allowEOF(); + m->tokenizer.includeIgnorable(); } -void -Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) +Pl_QPDFTokenizer::~Pl_QPDFTokenizer() { - std::string value = token.getRawValue(); - - switch (token.getType()) - { - case QPDFTokenizer::tt_space: - { - size_t len = value.length(); - for (size_t i = 0; i < len; ++i) - { - char ch = value.at(i); - if (ch == '\r') - { - if ((i + 1 < len) && (value.at(i + 1) == '\n')) - { - // ignore - } - else - { - writeNext("\n", 1); - } - } - else - { - writeNext(&ch, 1); - } - } - } - value.clear(); - break; - - case QPDFTokenizer::tt_string: - value = QPDF_String(token.getValue()).unparse(); - - break; - - case QPDFTokenizer::tt_name: - value = QPDF_Name(token.getValue()).unparse(); - break; - - default: - break; - } - writeNext(value.c_str(), value.length()); } void Pl_QPDFTokenizer::processChar(char ch) { - tokenizer.presentCharacter(ch); + this->m->tokenizer.presentCharacter(ch); QPDFTokenizer::Token token; - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + if (this->m->tokenizer.getToken( + token, this->m->unread_char, this->m->char_to_unread)) { - writeToken(token); - std::string value = token.getRawValue(); - QPDFTokenizer::token_type_e token_type = token.getType(); - if (((token_type == QPDFTokenizer::tt_string) || - (token_type == QPDFTokenizer::tt_name)) && - ((value.find('\r') != std::string::npos) || - (value.find('\n') != std::string::npos))) + this->m->filter->handleToken(token); + if ((token.getType() == QPDFTokenizer::tt_word) && + (token.getValue() == "ID")) { - writeNext("\n", 1); - } - if ((token.getType() == QPDFTokenizer::tt_word) && - (token.getValue() == "ID")) - { QTC::TC("qpdf", "Pl_QPDFTokenizer found ID"); - tokenizer.expectInlineImage(); - } + this->m->tokenizer.expectInlineImage(); + } } } @@ -109,10 +53,10 @@ Pl_QPDFTokenizer::processChar(char ch) void Pl_QPDFTokenizer::checkUnread() { - if (this->unread_char) + if (this->m->unread_char) { - processChar(this->char_to_unread); - if (this->unread_char) + processChar(this->m->char_to_unread); + if (this->m->unread_char) { throw std::logic_error( "INTERNAL ERROR: unread_char still true after processing " @@ -135,20 +79,13 @@ Pl_QPDFTokenizer::write(unsigned char* buf, size_t len) void Pl_QPDFTokenizer::finish() { - this->tokenizer.presentEOF(); + this->m->tokenizer.presentEOF(); QPDFTokenizer::Token token; - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) + if (this->m->tokenizer.getToken( + token, this->m->unread_char, this->m->char_to_unread)) { - writeToken(token); - if (unread_char) - { - if (this->char_to_unread == '\r') - { - this->char_to_unread = '\n'; - } - writeNext(&this->char_to_unread, 1); - } + this->m->filter->handleToken(token); } - getNext()->finish(); + this->m->filter->handleEOF(); } -- cgit v1.2.3-54-g00ecf