From 5136238f2a973f693cea53c340dcff23a655531f Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt <ejb@ql.org>
Date: Fri, 2 Feb 2018 21:16:40 -0500
Subject: Detect and report bad tokens in content normalization

---
 ChangeLog                              |  19 +++
 libqpdf/ContentNormalizer.cc           |  26 +++-
 libqpdf/QPDF_Stream.cc                 |  27 ++++
 libqpdf/qpdf/ContentNormalizer.hh      |   7 +
 qpdf/qpdf.testcov                      |   1 +
 qpdf/qtest/qpdf.test                   |  10 +-
 qpdf/qtest/qpdf/coalesce.qdf           | 231 +++++++++++++++++++++++++++++++++
 qpdf/qtest/qpdf/good14.out             |  17 ++-
 qpdf/qtest/qpdf/normalize-warnings.out |   9 ++
 9 files changed, 343 insertions(+), 4 deletions(-)
 create mode 100644 qpdf/qtest/qpdf/coalesce.qdf
 create mode 100644 qpdf/qtest/qpdf/normalize-warnings.out

diff --git a/ChangeLog b/ChangeLog
index b061c584..7d94eb9f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -153,6 +153,25 @@
 	* Provide heavily annoated examples/pdf-filter-tokens.cc example
 	that illustrates use of some simple token filters.
 
+	* When normalizing content streams, as in qdf mode, issue warning
+	about bad tokens. Content streams are only normalized when this is
+	explicitly requested, so this has no impact on normal operation.
+	However, in qdf mode, if qpdf detects a bad token, it means that
+	either there's a bug in qpdf's lexer, that the file is damaged, or
+	that the page's contents are split in a weird way. In any of those
+	cases, qpdf could potentially damage the stream's contents by
+	replacing carriage returns with newlines or otherwise messing with
+	spaces. The most likely case of this would be an inline image's
+	compressed data being divided across two streams and having the
+	compressed data in the second stream contain a carriage return as
+	part of its binary data. If you are using qdf mode just to look at
+	PDF files in text editors, this usually doesn't matter. In cases
+	of contents split across multiple streams, coalescing streams
+	would eliminate the problem, so the warning mentions this. Prior
+	to this enhancement, the chances of qdf mode writing incorrect
+	data were already very low. This change should make it nearly
+	impossible for qdf mode to unknowingly write invalid data.
+
 2018-02-04  Jay Berkenbilt  <ejb@ql.org>
 
 	* Add QPDFWriter::setLinearizationPass1Filename method and
diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
index 35a8ad74..f85ab829 100644
--- a/libqpdf/ContentNormalizer.cc
+++ b/libqpdf/ContentNormalizer.cc
@@ -1,7 +1,9 @@
 #include <qpdf/ContentNormalizer.hh>
 #include <qpdf/QUtil.hh>
 
-ContentNormalizer::ContentNormalizer()
+ContentNormalizer::ContentNormalizer() :
+    any_bad_tokens(false),
+    last_token_was_bad(false)
 {
 }
 
@@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
     std::string value = token.getRawValue();
     QPDFTokenizer::token_type_e token_type = token.getType();
 
+    if (token_type == QPDFTokenizer::tt_bad)
+    {
+        this->any_bad_tokens = true;
+        this->last_token_was_bad = true;
+    }
+    else if (token_type != QPDFTokenizer::tt_eof)
+    {
+        this->last_token_was_bad = false;
+    }
+
     switch (token_type)
     {
       case QPDFTokenizer::tt_space:
@@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
 {
     finish();
 }
+
+bool
+ContentNormalizer::anyBadTokens() const
+{
+    return this->any_bad_tokens;
+}
+
+bool
+ContentNormalizer::lastTokenWasBad()const
+{
+    return this->last_token_was_bad;
+}
diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc
index a026f9a4..bb1e24e6 100644
--- a/libqpdf/QPDF_Stream.cc
+++ b/libqpdf/QPDF_Stream.cc
@@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
         }
     }
 
+    if (filter &&
+        (! suppress_warnings) &&
+        normalizer.getPointer() &&
+        normalizer->anyBadTokens())
+    {
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "content normalization encountered bad tokens"));
+        if (normalizer->lastTokenWasBad())
+        {
+            QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
+            warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                         "", this->offset,
+                         "normalized content ended with a bad token;"
+                         " you may be able to resolve this by"
+                         " coalescing content streams in combination"
+                         " with normalizing content. From the command"
+                         " line, specify --coalesce-contents"));
+        }
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "Resulting stream data may be corrupted but is"
+                     " may still useful for manual inspection."
+                     " For more information on this warning, search"
+                     " for content normalization in the manual."));
+    }
+
     return filter;
 }
 
diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh
index 504f15e8..89b28f3a 100644
--- a/libqpdf/qpdf/ContentNormalizer.hh
+++ b/libqpdf/qpdf/ContentNormalizer.hh
@@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
     virtual ~ContentNormalizer();
     virtual void handleToken(QPDFTokenizer::Token const&);
     virtual void handleEOF();
+
+    bool anyBadTokens() const;
+    bool lastTokenWasBad() const;
+
+  private:
+    bool any_bad_tokens;
+    bool last_token_was_bad;
 };
 
 #endif // __CONTENTNORMALIZER_HH__
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index a1ce662d..2c51867f 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
 QPDFObjectHandle non-stream in stream array 0
 QPDFObjectHandle coalesce called on stream 0
 QPDFObjectHandle coalesce provide stream data 0
+QPDF_Stream bad token at end during normalize 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index a3572859..45c750fd 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
 show_ntests();
 # ----------
 $td->notify("--- Coalesce contents ---");
-$n_tests += 4;
+$n_tests += 6;
 
+$td->runtest("qdf with normalize warnings",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+             {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "coalesce.qdf"});
 $td->runtest("coalesce contents with qdf",
              {$td->COMMAND =>
                   "qpdf --qdf --static-id" .
diff --git a/qpdf/qtest/qpdf/coalesce.qdf b/qpdf/qtest/qpdf/coalesce.qdf
new file mode 100644
index 00000000..5007dc12
--- /dev/null
+++ b/qpdf/qtest/qpdf/coalesce.qdf
@@ -0,0 +1,231 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 4 0
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 5 0
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+%% Original object ID: 7 0
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+%% Original object ID: 9 0
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À>”^&®¡uâ]€"!‡•–*¬&E|Sy® ðd-€<B0Bú@Nê+<hlèKÐî/56L ‰ã £–¹¦>0>Yù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«3ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+%% Original object ID: 11 0
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
+ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+65
+endobj
+
+%% Original object ID: 13 0
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 14 0
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+%% Original object ID: 17 0
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000252 00000 n 
+0000000524 00000 n 
+0000000769 00000 n 
+0000000879 00000 n 
+0000000948 00000 n 
+0000001044 00000 n 
+0000001113 00000 n 
+0000001444 00000 n 
+0000001516 00000 n 
+0000001660 00000 n 
+0000001708 00000 n 
+0000001855 00000 n 
+0000001942 00000 n 
+0000002043 00000 n 
+0000002091 00000 n 
+0000002238 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
+>>
+startxref
+2274
+%%EOF
diff --git a/qpdf/qtest/qpdf/good14.out b/qpdf/qtest/qpdf/good14.out
index 87819670..84bf7133 100644
--- a/qpdf/qtest/qpdf/good14.out
+++ b/qpdf/qtest/qpdf/good14.out
@@ -13,7 +13,9 @@ three lines
 <8a8b>
 (ab)
 <8c><dd> ) >
-<610062> (MOO)-- stream 1 --
+<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 1 --
 This stream does end with a newline.
 // tests:
 //   bad tokens preserved
@@ -31,10 +33,18 @@ This stream does end with a newline.
   
 /good name
 /bad#00name
+WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 -- stream 2 --
 (This stream ends with a \001 bad token
+WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 -- stream 3 --
-<AB X-- stream 4 --
+<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 4 --
 (ends with a name)
 /ThisMustBeLast-- stream 5 --
 % This stream has an inline image marker that is not terminated
@@ -44,4 +54,7 @@ BI
 ID
 <506f7
 461746f>
+WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 test 3 done
diff --git a/qpdf/qtest/qpdf/normalize-warnings.out b/qpdf/qtest/qpdf/normalize-warnings.out
new file mode 100644
index 00000000..73947b1a
--- /dev/null
+++ b/qpdf/qtest/qpdf/normalize-warnings.out
@@ -0,0 +1,9 @@
+WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+qpdf: operation succeeded with warnings; resulting file may have some problems
-- 
cgit v1.2.3-70-g09d2