From b30deaeeaba3941d7615bc2cc89c664b1273e5df Mon Sep 17 00:00:00 2001
From: Jay Berkenbilt <ejb@ql.org>
Date: Fri, 23 Oct 2020 06:40:27 -0400
Subject: Avoid merging adjacent tokens when concatenating contents (fixes
 #444)

---
 ChangeLog                                  |   6 +
 TODO                                       |   1 -
 libqpdf/QPDFObjectHandle.cc                |  53 ++++++-
 manual/qpdf-manual.xml                     |  39 ++---
 qpdf/qpdf.testcov                          |   1 +
 qpdf/qtest/qpdf.test                       |  18 ++-
 qpdf/qtest/qpdf/coalesce-out.pdf           | Bin 1623 -> 2951 bytes
 qpdf/qtest/qpdf/coalesce-out.qdf           | Bin 2192 -> 3520 bytes
 qpdf/qtest/qpdf/coalesce-split-1-2.pdf     | 231 -----------------------------
 qpdf/qtest/qpdf/coalesce-split.out         |  10 --
 qpdf/qtest/qpdf/coalesce.pdf               | Bin 2445 -> 3769 bytes
 qpdf/qtest/qpdf/coalesce.qdf               | Bin 2801 -> 4126 bytes
 qpdf/qtest/qpdf/normalize-warnings.out     |  16 +-
 qpdf/qtest/qpdf/split-tokens-split-1-2.pdf | 231 +++++++++++++++++++++++++++++
 qpdf/qtest/qpdf/split-tokens-split.out     |  10 ++
 qpdf/qtest/qpdf/split-tokens.pdf           | 217 +++++++++++++++++++++++++++
 qpdf/qtest/qpdf/split-tokens.qdf           | 231 +++++++++++++++++++++++++++++
 qpdf/qtest/qpdf/token-filters-out.pdf      | Bin 2178 -> 3505 bytes
 18 files changed, 781 insertions(+), 283 deletions(-)
 delete mode 100644 qpdf/qtest/qpdf/coalesce-split-1-2.pdf
 delete mode 100644 qpdf/qtest/qpdf/coalesce-split.out
 create mode 100644 qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
 create mode 100644 qpdf/qtest/qpdf/split-tokens-split.out
 create mode 100644 qpdf/qtest/qpdf/split-tokens.pdf
 create mode 100644 qpdf/qtest/qpdf/split-tokens.qdf
diff --git a/ChangeLog b/ChangeLog
index f7ba4f6a..fd057636 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2020-10-23  Jay Berkenbilt  <ejb@ql.org>
 
+	* Bug fix: when concatenating content streams, insert a newline if
+	needed to prevent the last token from the old stream from being
+	merged with the first token of the new stream. Qpdf was mistakenly
+	concatenating the streams without regard to the specification that
+	content streams are to be broken on token boundaries. Fixes #444.
+
 	* Bug fix: fix-qdf: properly handle empty streams with ignore
 	newline.
 
diff --git a/TODO b/TODO
index cd6f4c88..2e3898ff 100644
--- a/TODO
+++ b/TODO
@@ -4,7 +4,6 @@ Candidates for upcoming release
 * Open "next" issues
   * bugs
     * #473: zsh completion with directories
-    * #444: concatenated stream/whitespace bug
   * Non-bugs
     * #446: recognize edited QDF files
     * #436: parsing of document with form xobject
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 85493680..472ff4e8 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing()
     throw TerminateParsing();
 }
 
+class LastChar: public Pipeline // pass-through stage that records the last byte seen
+{
+  public:
+    LastChar(Pipeline* next); // next: downstream pipeline that receives all data
+    virtual ~LastChar() = default;
+    virtual void write(unsigned char* data, size_t len); // record, then forward
+    virtual void finish(); // forwards finish() to the next pipeline
+    unsigned char getLastChar(); // last byte written; 0 if nothing written yet
+
+  private:
+    unsigned char last_char; // final byte of the most recent non-empty write
+};
+
+LastChar::LastChar(Pipeline* next) : // next receives everything written here
+    Pipeline("lastchar", next),
+    last_char(0) // 0 is what getLastChar() reports before any data is written
+{
+}
+
+void
+LastChar::write(unsigned char* data, size_t len) // remember last byte, pass data on
+{
+    if (len > 0) // an empty write must not index data[len - 1]
+    {
+        this->last_char = data[len - 1];
+    }
+    getNext()->write(data, len); // forwarded unchanged, even when len == 0
+}
+
+void
+LastChar::finish() // nothing buffered here; only propagates finish downstream
+{
+    getNext()->finish();
+}
+
+unsigned char
+LastChar::getLastChar() // last byte passed to write(); 0 if none was written
+{
+    return this->last_char;
+}
+
 QPDFObjectHandle::QPDFObjectHandle() :
     initialized(false),
     qpdf(0),
@@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams(
     std::vector<QPDFObjectHandle> streams =
         arrayOrStreamToStreamArray(
             description, all_description);
+    bool need_newline = false;
     for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
          iter != streams.end(); ++iter)
     {
+        if (need_newline)
+        {
+            p->write(QUtil::unsigned_char_pointer("\n"), 1);
+        }
+        LastChar lc(p);
         QPDFObjectHandle stream = *iter;
         std::string og =
             QUtil::int_to_string(stream.getObjectID()) + " " +
             QUtil::int_to_string(stream.getGeneration());
         std::string w_description = "content stream object " + og;
-        if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
+        if (! stream.pipeStreamData(&lc, 0, qpdf_dl_specialized))
         {
             QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
             throw QPDFExc(qpdf_e_damaged_pdf, "content stream",
                           w_description, 0,
                           "errors while decoding content stream");
         }
+        lc.finish();
+        need_newline = (lc.getLastChar() != static_cast<unsigned char>('\n'));
+        QTC::TC("qpdf", "QPDFObjectHandle need_newline",
+                need_newline ? 0 : 1);
     }
 }
 
diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml
index 866a5016..659fbd08 100644
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@@ -2090,14 +2090,9 @@ outfile.pdf</option>
         option causes qpdf to combine them into a single stream. Use
         of this option is never necessary for ordinary usage, but it
         can help when working with some files in some cases. For
-        example, some PDF writers split page contents into small
-        streams at arbitrary points that may fall in the middle of
-        lexical tokens within the content, and some PDF readers may
-        get confused on such files. If you use qpdf to coalesce the
-        content streams, such readers may be able to work with the
-        file more easily. This can also be combined with QDF mode or
-        content normalization to make it easier to look at all of a
-        page's contents at once.
+        example, this can be combined with QDF mode or content
+        normalization to make it easier to look at all of a page's
+        contents at once.
        </para>
       </listitem>
      </varlistentry>
@@ -2398,25 +2393,15 @@ outfile.pdf</option>
     You should not use this for &ldquo;production&rdquo; PDF files.
    </para>
    <para>
-    This paragraph discusses edge cases of content normalization that
-    are not of concern to most users and are not relevant when content
-    normalization is not enabled. When normalizing content, if qpdf
-    runs into any lexical errors, it will print a warning indicating
-    that content may be damaged. The only situation in which qpdf is
-    known to cause damage during content normalization is when a
-    page's contents are split across multiple streams and streams are
-    split in the middle of a lexical token such as a string, name, or
-    inline image. There may be some pathological cases in which qpdf
-    could damage content without noticing this, such as if the partial
-    tokens at the end of one stream and the beginning of the next
-    stream are both valid, but usually qpdf will be able to detect
-    this case. For slightly increased safety, you can specify
-    <option>--coalesce-contents</option> in addition to
-    <option>--normalize-content</option> or <option>--qdf</option>.
-    This will cause qpdf to combine all the content streams into one,
-    thus recombining any split tokens. However doing this will prevent
-    you from being able to see the original layout of the content
-    streams. If you must inspect the original content streams in an
+    When normalizing content, if qpdf runs into any lexical errors, it
+    will print a warning indicating that content may be damaged. The
+    only situation in which qpdf is known to cause damage during
+    content normalization is when a page's contents are split across
+    multiple streams and streams are split in the middle of a lexical
+    token such as a string, name, or inline image. Note that files
+    that do this are invalid since the PDF specification states that
+    content streams are not to be split in the middle of a token. If
+    you want to inspect the original content streams in an
     uncompressed format, you can always run with <option>--qdf
     --normalize-content=n</option> for a QDF file without content
     normalization, or alternatively
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index 621ec53a..ced20279 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0
 qpdf found shared xobject in leaf 0
 QPDF copy foreign with data 1
 QPDF copy foreign with foreign_stream 1
+QPDFObjectHandle need_newline 1
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index a0ff2a57..75021b56 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -1591,13 +1591,21 @@ $td->runtest("type checks with object streams",
 
 # ----------
 $td->notify("--- Coalesce contents ---");
-$n_tests += 6;
+$n_tests += 8;
 
 $td->runtest("qdf with normalize warnings",
              {$td->COMMAND =>
-                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+                  "qpdf --qdf --static-id split-tokens.pdf a.pdf"},
              {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
              $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "split-tokens.qdf"});
+$td->runtest("coalesce to qdf",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+             {$td->STRING => "", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
 $td->runtest("check output",
              {$td->FILE => "a.pdf"},
              {$td->FILE => "coalesce.qdf"});
@@ -1831,12 +1839,12 @@ $td->runtest("unreferenced resources with bad token",
              {$td->COMMAND =>
                   "qpdf --qdf --static-id --split-pages=2" .
                   " --remove-unreferenced-resources=yes" .
-                  " coalesce.pdf split-out-bad-token.pdf"},
-             {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3},
+                  " split-tokens.pdf split-out-bad-token.pdf"},
+             {$td->FILE => "split-tokens-split.out", $td->EXIT_STATUS => 3},
              $td->NORMALIZE_NEWLINES);
 $td->runtest("check output",
              {$td->FILE => "split-out-bad-token-1-2.pdf"},
-             {$td->FILE => "coalesce-split-1-2.pdf"});
+             {$td->FILE => "split-tokens-split-1-2.pdf"});
 
 $td->runtest("shared images in form xobject",
              {$td->COMMAND => "qpdf --qdf --static-id --split-pages".
diff --git a/qpdf/qtest/qpdf/coalesce-out.pdf b/qpdf/qtest/qpdf/coalesce-out.pdf
index 78505aba..a0dae39d 100644
Binary files a/qpdf/qtest/qpdf/coalesce-out.pdf and b/qpdf/qtest/qpdf/coalesce-out.pdf differ
diff --git a/qpdf/qtest/qpdf/coalesce-out.qdf b/qpdf/qtest/qpdf/coalesce-out.qdf
index 9a7129f3..822fdd17 100644
Binary files a/qpdf/qtest/qpdf/coalesce-out.qdf and b/qpdf/qtest/qpdf/coalesce-out.qdf differ
diff --git a/qpdf/qtest/qpdf/coalesce-split-1-2.pdf b/qpdf/qtest/qpdf/coalesce-split-1-2.pdf
deleted file mode 100644
index 4542411e..00000000
--- a/qpdf/qtest/qpdf/coalesce-split-1-2.pdf
+++ /dev/null
@@ -1,231 +0,0 @@
-%PDF-1.3
-%¿÷¢þ
-%QDF-1.0
-
-%% Original object ID: 1 0
-1 0 obj
-<<
-  /Pages 2 0 R
-  /Type /Catalog
->>
-endobj
-
-%% Original object ID: 2 0
-2 0 obj
-<<
-  /Count 2
-  /Kids [
-    3 0 R
-    4 0 R
-  ]
-  /Type /Pages
->>
-endobj
-
-%% Page 1
-%% Original object ID: 3 0
-3 0 obj
-<<
-  /Contents [
-    5 0 R
-    7 0 R
-    9 0 R
-    11 0 R
-  ]
-  /MediaBox [
-    0
-    0
-    612
-    792
-  ]
-  /Parent 2 0 R
-  /Resources <<
-    /Font <<
-      /F1 13 0 R
-    >>
-    /ProcSet 14 0 R
-  >>
-  /Type /Page
->>
-endobj
-
-%% Page 2
-%% Original object ID: 14 0
-4 0 obj
-<<
-  /Contents 15 0 R
-  /MediaBox [
-    0
-    0
-    612
-    792
-  ]
-  /Parent 2 0 R
-  /Resources <<
-    /Font <<
-      /F1 17 0 R
-    >>
-    /ProcSet 18 0 R
-  >>
-  /Type /Page
->>
-endobj
-
-%% Contents for page 1
-%% Original object ID: 4 0
-5 0 obj
-<<
-  /Length 6 0 R
->>
-stream
-BT
-  /F1 24 Tf
-  72 720 Td
-  (Pot
-endstream
-endobj
-
-%QDF: ignore_newline
-6 0 obj
-33
-endobj
-
-%% Contents for page 1
-%% Original object ID: 6 0
-7 0 obj
-<<
-  /Length 8 0 R
->>
-stream
-ato) Tj
-ET [ /array
-endstream
-endobj
-
-%QDF: ignore_newline
-8 0 obj
-19
-endobj
-
-%% Contents for page 1
-%% Original object ID: 8 0
-9 0 obj
-<<
-  /Length 10 0 R
->>
-stream
-/split ] BI
-/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
-ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À>”^&®¡uâ]€"!‡•–*¬&E|Sy® ðd-€<B0Bú@Nê+<hlèKÐî/56L ‰ã £–¹¦>0>Yù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«3ÂÖlpÛsHöûtú
-endstream
-endobj
-
-%QDF: ignore_newline
-10 0 obj
-253
-endobj
-
-%% Contents for page 1
-%% Original object ID: 10 0
-11 0 obj
-<<
-  /Length 12 0 R
->>
-stream
-QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
-ÍÔu#c•ÆW	ôß‰W“O
-EI
-endstream
-endobj
-
-%QDF: ignore_newline
-12 0 obj
-65
-endobj
-
-%% Original object ID: 12 0
-13 0 obj
-<<
-  /BaseFont /Helvetica
-  /Encoding /WinAnsiEncoding
-  /Name /F1
-  /Subtype /Type1
-  /Type /Font
->>
-endobj
-
-%% Original object ID: 13 0
-14 0 obj
-[
-  /PDF
-  /Text
-]
-endobj
-
-%% Contents for page 2
-%% Original object ID: 15 0
-15 0 obj
-<<
-  /Length 16 0 R
->>
-stream
-BT
-  /F1 24 Tf
-  72 720 Td
-  (Potato) Tj
-ET
-endstream
-endobj
-
-16 0 obj
-44
-endobj
-
-%% Original object ID: 17 0
-17 0 obj
-<<
-  /BaseFont /Helvetica
-  /Encoding /WinAnsiEncoding
-  /Name /F1
-  /Subtype /Type1
-  /Type /Font
->>
-endobj
-
-%% Original object ID: 18 0
-18 0 obj
-[
-  /PDF
-  /Text
-]
-endobj
-
-xref
-0 19
-0000000000 65535 f 
-0000000052 00000 n 
-0000000133 00000 n 
-0000000252 00000 n 
-0000000525 00000 n 
-0000000770 00000 n 
-0000000880 00000 n 
-0000000949 00000 n 
-0000001045 00000 n 
-0000001114 00000 n 
-0000001445 00000 n 
-0000001517 00000 n 
-0000001661 00000 n 
-0000001709 00000 n 
-0000001856 00000 n 
-0000001943 00000 n 
-0000002044 00000 n 
-0000002092 00000 n 
-0000002239 00000 n 
-trailer <<
-  /Root 1 0 R
-  /Size 19
-  /ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
->>
-startxref
-2275
-%%EOF
diff --git a/qpdf/qtest/qpdf/coalesce-split.out b/qpdf/qtest/qpdf/coalesce-split.out
deleted file mode 100644
index 5e18173c..00000000
--- a/qpdf/qtest/qpdf/coalesce-split.out
+++ /dev/null
@@ -1,10 +0,0 @@
-WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
-WARNING: empty PDF: content normalization encountered bad tokens
-WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: empty PDF: content normalization encountered bad tokens
-WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: empty PDF: content normalization encountered bad tokens
-WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-qpdf: operation succeeded with warnings; resulting file may have some problems
diff --git a/qpdf/qtest/qpdf/coalesce.pdf b/qpdf/qtest/qpdf/coalesce.pdf
index ba5d959b..4fa99202 100644
Binary files a/qpdf/qtest/qpdf/coalesce.pdf and b/qpdf/qtest/qpdf/coalesce.pdf differ
diff --git a/qpdf/qtest/qpdf/coalesce.qdf b/qpdf/qtest/qpdf/coalesce.qdf
index 5007dc12..ab5b08cc 100644
Binary files a/qpdf/qtest/qpdf/coalesce.qdf and b/qpdf/qtest/qpdf/coalesce.qdf differ
diff --git a/qpdf/qtest/qpdf/normalize-warnings.out b/qpdf/qtest/qpdf/normalize-warnings.out
index 57f038f4..287a583c 100644
--- a/qpdf/qtest/qpdf/normalize-warnings.out
+++ b/qpdf/qtest/qpdf/normalize-warnings.out
@@ -1,9 +1,9 @@
-WARNING: coalesce.pdf (offset 671): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: coalesce.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: coalesce.pdf (offset 823): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-WARNING: coalesce.pdf (offset 962): content normalization encountered bad tokens
-WARNING: coalesce.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
-WARNING: coalesce.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 671): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: split-tokens.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 823): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: split-tokens.pdf (offset 962): content normalization encountered bad tokens
+WARNING: split-tokens.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: split-tokens.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 qpdf: operation succeeded with warnings; resulting file may have some problems
diff --git a/qpdf/qtest/qpdf/split-tokens-split-1-2.pdf b/qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
new file mode 100644
index 00000000..4542411e
--- /dev/null
+++ b/qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
@@ -0,0 +1,231 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 14 0
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 4 0
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+%% Original object ID: 6 0
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+%% Original object ID: 8 0
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À>”^&®¡uâ]€"!‡•–*¬&E|Sy® ðd-€<B0Bú@Nê+<hlèKÐî/56L ‰ã £–¹¦>0>Yù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«3ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+%% Original object ID: 10 0
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
+ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+65
+endobj
+
+%% Original object ID: 12 0
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 13 0
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+%% Original object ID: 17 0
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000252 00000 n 
+0000000525 00000 n 
+0000000770 00000 n 
+0000000880 00000 n 
+0000000949 00000 n 
+0000001045 00000 n 
+0000001114 00000 n 
+0000001445 00000 n 
+0000001517 00000 n 
+0000001661 00000 n 
+0000001709 00000 n 
+0000001856 00000 n 
+0000001943 00000 n 
+0000002044 00000 n 
+0000002092 00000 n 
+0000002239 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
+>>
+startxref
+2275
+%%EOF
diff --git a/qpdf/qtest/qpdf/split-tokens-split.out b/qpdf/qtest/qpdf/split-tokens-split.out
new file mode 100644
index 00000000..0a76a46a
--- /dev/null
+++ b/qpdf/qtest/qpdf/split-tokens-split.out
@@ -0,0 +1,10 @@
+WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
+WARNING: empty PDF: content normalization encountered bad tokens
+WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: empty PDF: content normalization encountered bad tokens
+WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: empty PDF: content normalization encountered bad tokens
+WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+qpdf: operation succeeded with warnings; resulting file may have some problems
diff --git a/qpdf/qtest/qpdf/split-tokens.pdf b/qpdf/qtest/qpdf/split-tokens.pdf
new file mode 100644
index 00000000..ba5d959b
--- /dev/null
+++ b/qpdf/qtest/qpdf/split-tokens.pdf
@@ -0,0 +1,217 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À>”^&®¡uâ]€"!‡•–*¬&E|Sy® ðd-€<B0Bú@Nê+<hlèKÐî/56L ‰ã £–¹¦>0>Yù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«3ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+66
+endobj
+
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000025 00000 n 
+0000000079 00000 n 
+0000000171 00000 n 
+0000000416 00000 n 
+0000000634 00000 n 
+0000000744 00000 n 
+0000000786 00000 n 
+0000000882 00000 n 
+0000000924 00000 n 
+0000001255 00000 n 
+0000001299 00000 n 
+0000001444 00000 n 
+0000001464 00000 n 
+0000001583 00000 n 
+0000001642 00000 n 
+0000001743 00000 n 
+0000001763 00000 n 
+0000001882 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<fa46a90bcf56476b9904a2e7adb75024><6af379f20e8dcd4e724869daec3ba023>]
+>>
+startxref
+1918
+%%EOF
diff --git a/qpdf/qtest/qpdf/split-tokens.qdf b/qpdf/qtest/qpdf/split-tokens.qdf
new file mode 100644
index 00000000..5007dc12
--- /dev/null
+++ b/qpdf/qtest/qpdf/split-tokens.qdf
@@ -0,0 +1,231 @@
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 2
+  /Kids [
+    3 0 R
+    4 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents [
+    5 0 R
+    7 0 R
+    9 0 R
+    11 0 R
+  ]
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 13 0 R
+    >>
+    /ProcSet 14 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 4 0
+4 0 obj
+<<
+  /Contents 15 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 17 0 R
+    >>
+    /ProcSet 18 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 5 0
+5 0 obj
+<<
+  /Length 6 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+%% Original object ID: 7 0
+7 0 obj
+<<
+  /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+%% Original object ID: 9 0
+9 0 obj
+<<
+  /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xœÅÖIÃ P|ÿC;UÈ`ÀÓ7‘Z©¦Ä˜Úæ}Dðï_´øÉW©„œÄ-”ˆ>ÿ‡À>”^&®¡uâ]€"!‡•–*¬&E|Sy® ðd-€<B0Bú@Nê+<hlèKÐî/56L ‰ã £–¹¦>0>Yù!cì\YØ%Yð¥Ö8?& Öëˆ}j’ûè;«3ÂÖlpÛsHöûtú
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+%% Original object ID: 11 0
+11 0 obj
+<<
+  /Length 12 0 R
+>>
+stream
+QØTt*hÌUúãwÍÕÐ%¨)p–³"•DiRj¹–DYNUÓÙAv’Fà&
+ÍÔu#c•ÆW	ôß‰W“O
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+65
+endobj
+
+%% Original object ID: 13 0
+13 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 14 0
+14 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+15 0 obj
+<<
+  /Length 16 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+%% Original object ID: 17 0
+17 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+18 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000252 00000 n 
+0000000524 00000 n 
+0000000769 00000 n 
+0000000879 00000 n 
+0000000948 00000 n 
+0000001044 00000 n 
+0000001113 00000 n 
+0000001444 00000 n 
+0000001516 00000 n 
+0000001660 00000 n 
+0000001708 00000 n 
+0000001855 00000 n 
+0000001942 00000 n 
+0000002043 00000 n 
+0000002091 00000 n 
+0000002238 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 19
+  /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
+>>
+startxref
+2274
+%%EOF
diff --git a/qpdf/qtest/qpdf/token-filters-out.pdf b/qpdf/qtest/qpdf/token-filters-out.pdf
index 6d24497c..8f5f14c3 100644
Binary files a/qpdf/qtest/qpdf/token-filters-out.pdf and b/qpdf/qtest/qpdf/token-filters-out.pdf differ
-- 
cgit v1.2.3-54-g00ecf