Detect and report bad tokens in content normalization

author: Jay Berkenbilt <ejb@ql.org> 2018-02-03 03:16:40 +0100
committer: Jay Berkenbilt <ejb@ql.org> 2018-02-19 03:05:47 +0100
commit: 5136238f2a973f693cea53c340dcff23a655531f (patch)
tree: 8cc1d2a1fdf1833fa67454b2707994b3328c879c /libqpdf
parent: 30709935af023dd66a17f2d494aa7dc84b7177e1 (diff)
download: qpdf-5136238f2a973f693cea53c340dcff23a655531f.tar.zst
3 files changed, 59 insertions, 1 deletions
diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
index 35a8ad74..f85ab829 100644
--- a/libqpdf/ContentNormalizer.cc
+++ b/libqpdf/ContentNormalizer.cc
@@ -1,7 +1,9 @@
 #include <qpdf/ContentNormalizer.hh>
 #include <qpdf/QUtil.hh>
 
-ContentNormalizer::ContentNormalizer()
+ContentNormalizer::ContentNormalizer() :
+    any_bad_tokens(false),
+    last_token_was_bad(false)
 {
 }
 
@@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
     std::string value = token.getRawValue();
     QPDFTokenizer::token_type_e token_type = token.getType();
 
+    if (token_type == QPDFTokenizer::tt_bad)
+    {
+        this->any_bad_tokens = true;
+        this->last_token_was_bad = true;
+    }
+    else if (token_type != QPDFTokenizer::tt_eof)
+    {
+        this->last_token_was_bad = false;
+    }
+
     switch (token_type)
     {
       case QPDFTokenizer::tt_space:
@@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
 {
     finish();
 }
+
+bool
+ContentNormalizer::anyBadTokens() const
+{
+    return this->any_bad_tokens;
+}
+
+bool
+ContentNormalizer::lastTokenWasBad()const
+{
+    return this->last_token_was_bad;
+}
diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc
index a026f9a4..bb1e24e6 100644
--- a/libqpdf/QPDF_Stream.cc
+++ b/libqpdf/QPDF_Stream.cc
@@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
         }
     }
 
+    if (filter &&
+        (! suppress_warnings) &&
+        normalizer.getPointer() &&
+        normalizer->anyBadTokens())
+    {
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "content normalization encountered bad tokens"));
+        if (normalizer->lastTokenWasBad())
+        {
+            QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
+            warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                         "", this->offset,
+                         "normalized content ended with a bad token;"
+                         " you may be able to resolve this by"
+                         " coalescing content streams in combination"
+                         " with normalizing content. From the command"
+                         " line, specify --coalesce-contents"));
+        }
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "Resulting stream data may be corrupted but"
+                     " may still be useful for manual inspection."
+                     " For more information on this warning, search"
+                     " for content normalization in the manual."));
+    }
+
     return filter;
 }
 
diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh
index 504f15e8..89b28f3a 100644
--- a/libqpdf/qpdf/ContentNormalizer.hh
+++ b/libqpdf/qpdf/ContentNormalizer.hh
@@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
     virtual ~ContentNormalizer();
     virtual void handleToken(QPDFTokenizer::Token const&);
     virtual void handleEOF();
+
+    bool anyBadTokens() const;
+    bool lastTokenWasBad() const;
+
+  private:
+    bool any_bad_tokens;
+    bool last_token_was_bad;
 };
 
 #endif // __CONTENTNORMALIZER_HH__
author	Jay Berkenbilt <ejb@ql.org>	2018-02-03 03:16:40 +0100
committer	Jay Berkenbilt <ejb@ql.org>	2018-02-19 03:05:47 +0100
commit	5136238f2a973f693cea53c340dcff23a655531f (patch)
tree	8cc1d2a1fdf1833fa67454b2707994b3328c879c /libqpdf
parent	30709935af023dd66a17f2d494aa7dc84b7177e1 (diff)
download	qpdf-5136238f2a973f693cea53c340dcff23a655531f.tar.zst