aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2018-02-03 03:16:40 +0100
committer Jay Berkenbilt <ejb@ql.org> 2018-02-19 03:05:47 +0100
commit 5136238f2a973f693cea53c340dcff23a655531f (patch)
tree 8cc1d2a1fdf1833fa67454b2707994b3328c879c
parent 30709935af023dd66a17f2d494aa7dc84b7177e1 (diff)
download qpdf-5136238f2a973f693cea53c340dcff23a655531f.tar.zst
Detect and report bad tokens in content normalization
-rw-r--r-- ChangeLog 19
-rw-r--r-- libqpdf/ContentNormalizer.cc 26
-rw-r--r-- libqpdf/QPDF_Stream.cc 27
-rw-r--r-- libqpdf/qpdf/ContentNormalizer.hh 7
-rw-r--r-- qpdf/qpdf.testcov 1
-rw-r--r-- qpdf/qtest/qpdf.test 10
-rw-r--r-- qpdf/qtest/qpdf/coalesce.qdf 231
-rw-r--r-- qpdf/qtest/qpdf/good14.out 17
-rw-r--r-- qpdf/qtest/qpdf/normalize-warnings.out 9
9 files changed, 343 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index b061c584..7d94eb9f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -153,6 +153,25 @@
* Provide heavily annotated examples/pdf-filter-tokens.cc example
that illustrates use of some simple token filters.
+ * When normalizing content streams, as in qdf mode, issue warning
+ about bad tokens. Content streams are only normalized when this is
+ explicitly requested, so this has no impact on normal operation.
+ However, in qdf mode, if qpdf detects a bad token, it means that
+ either there's a bug in qpdf's lexer, that the file is damaged, or
+ that the page's contents are split in a weird way. In any of those
+ cases, qpdf could potentially damage the stream's contents by
+ replacing carriage returns with newlines or otherwise messing with
+ spaces. The most likely case of this would be an inline image's
+ compressed data being divided across two streams and having the
+ compressed data in the second stream contain a carriage return as
+ part of its binary data. If you are using qdf mode just to look at
+ PDF files in text editors, this usually doesn't matter. In cases
+ of contents split across multiple streams, coalescing streams
+ would eliminate the problem, so the warning mentions this. Prior
+ to this enhancement, the chances of qdf mode writing incorrect
+ data were already very low. This change should make it nearly
+ impossible for qdf mode to unknowingly write invalid data.
+
2018-02-04 Jay Berkenbilt <ejb@ql.org>
* Add QPDFWriter::setLinearizationPass1Filename method and
diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc
index 35a8ad74..f85ab829 100644
--- a/libqpdf/ContentNormalizer.cc
+++ b/libqpdf/ContentNormalizer.cc
@@ -1,7 +1,9 @@
#include <qpdf/ContentNormalizer.hh>
#include <qpdf/QUtil.hh>
-ContentNormalizer::ContentNormalizer()
+ContentNormalizer::ContentNormalizer() :
+ any_bad_tokens(false),
+ last_token_was_bad(false)
{
}
@@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
std::string value = token.getRawValue();
QPDFTokenizer::token_type_e token_type = token.getType();
+ if (token_type == QPDFTokenizer::tt_bad)
+ {
+ this->any_bad_tokens = true;
+ this->last_token_was_bad = true;
+ }
+ else if (token_type != QPDFTokenizer::tt_eof)
+ {
+ this->last_token_was_bad = false;
+ }
+
switch (token_type)
{
case QPDFTokenizer::tt_space:
@@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
{
finish();
}
+
+bool
+ContentNormalizer::anyBadTokens() const
+{
+ return this->any_bad_tokens;
+}
+
+bool
+ContentNormalizer::lastTokenWasBad()const
+{
+ return this->last_token_was_bad;
+}
diff --git a/libqpdf/QPDF_Stream.cc b/libqpdf/QPDF_Stream.cc
index a026f9a4..bb1e24e6 100644
--- a/libqpdf/QPDF_Stream.cc
+++ b/libqpdf/QPDF_Stream.cc
@@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
}
}
+ if (filter &&
+ (! suppress_warnings) &&
+ normalizer.getPointer() &&
+ normalizer->anyBadTokens())
+ {
+ warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+ "", this->offset,
+ "content normalization encountered bad tokens"));
+ if (normalizer->lastTokenWasBad())
+ {
+ QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
+ warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+ "", this->offset,
+ "normalized content ended with a bad token;"
+ " you may be able to resolve this by"
+ " coalescing content streams in combination"
+ " with normalizing content. From the command"
+ " line, specify --coalesce-contents"));
+ }
+ warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+ "", this->offset,
+ "Resulting stream data may be corrupted but is"
+ " may still useful for manual inspection."
+ " For more information on this warning, search"
+ " for content normalization in the manual."));
+ }
+
return filter;
}
diff --git a/libqpdf/qpdf/ContentNormalizer.hh b/libqpdf/qpdf/ContentNormalizer.hh
index 504f15e8..89b28f3a 100644
--- a/libqpdf/qpdf/ContentNormalizer.hh
+++ b/libqpdf/qpdf/ContentNormalizer.hh
@@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
virtual ~ContentNormalizer();
virtual void handleToken(QPDFTokenizer::Token const&);
virtual void handleEOF();
+
+ bool anyBadTokens() const;
+ bool lastTokenWasBad() const;
+
+ private:
+ bool any_bad_tokens;
+ bool last_token_was_bad;
};
#endif // __CONTENTNORMALIZER_HH__
diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov
index a1ce662d..2c51867f 100644
--- a/qpdf/qpdf.testcov
+++ b/qpdf/qpdf.testcov
@@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
QPDFObjectHandle non-stream in stream array 0
QPDFObjectHandle coalesce called on stream 0
QPDFObjectHandle coalesce provide stream data 0
+QPDF_Stream bad token at end during normalize 0
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index a3572859..45c750fd 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
show_ntests();
# ----------
$td->notify("--- Coalesce contents ---");
-$n_tests += 4;
+$n_tests += 6;
+$td->runtest("qdf with normalize warnings",
+ {$td->COMMAND =>
+ "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+ {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+ {$td->FILE => "a.pdf"},
+ {$td->FILE => "coalesce.qdf"});
$td->runtest("coalesce contents with qdf",
{$td->COMMAND =>
"qpdf --qdf --static-id" .
diff --git a/qpdf/qtest/qpdf/coalesce.qdf b/qpdf/qtest/qpdf/coalesce.qdf
new file mode 100644
index 00000000..5007dc12
--- /dev/null
+++ b/qpdf/qtest/qpdf/coalesce.qdf
@@ -0,0 +1,231 @@
+%PDF-1.3
+%
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+ /Pages 2 0 R
+ /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+ /Count 2
+ /Kids [
+ 3 0 R
+ 4 0 R
+ ]
+ /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+ /Contents [
+ 5 0 R
+ 7 0 R
+ 9 0 R
+ 11 0 R
+ ]
+ /MediaBox [
+ 0
+ 0
+ 612
+ 792
+ ]
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 13 0 R
+ >>
+ /ProcSet 14 0 R
+ >>
+ /Type /Page
+>>
+endobj
+
+%% Page 2
+%% Original object ID: 4 0
+4 0 obj
+<<
+ /Contents 15 0 R
+ /MediaBox [
+ 0
+ 0
+ 612
+ 792
+ ]
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 17 0 R
+ >>
+ /ProcSet 18 0 R
+ >>
+ /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 5 0
+5 0 obj
+<<
+ /Length 6 0 R
+>>
+stream
+BT
+ /F1 24 Tf
+ 72 720 Td
+ (Pot
+endstream
+endobj
+
+%QDF: ignore_newline
+6 0 obj
+33
+endobj
+
+%% Contents for page 1
+%% Original object ID: 7 0
+7 0 obj
+<<
+ /Length 8 0 R
+>>
+stream
+ato) Tj
+ET [ /array
+endstream
+endobj
+
+%QDF: ignore_newline
+8 0 obj
+19
+endobj
+
+%% Contents for page 1
+%% Original object ID: 9 0
+9 0 obj
+<<
+ /Length 10 0 R
+>>
+stream
+/split ] BI
+/CS /G/W 66/H 47/BPC 8/F/Fl/DP<</Predictor 15/Columns 66>>
+ID xI P|C;U`7Z Ę}D_W->>^&u]"!*&E|Sy d-<B0B@N+<hlK/56L >0>Y!c\Y %Y8?&}j;3lpsHt
+endstream
+endobj
+
+%QDF: ignore_newline
+10 0 obj
+253
+endobj
+
+%% Contents for page 1
+%% Original object ID: 11 0
+11 0 obj
+<<
+ /Length 12 0 R
+>>
+stream
+QTt*hUw%)p"DiRjDYNUAvF&
+u#cW ߉WO
+EI
+endstream
+endobj
+
+%QDF: ignore_newline
+12 0 obj
+65
+endobj
+
+%% Original object ID: 13 0
+13 0 obj
+<<
+ /BaseFont /Helvetica
+ /Encoding /WinAnsiEncoding
+ /Name /F1
+ /Subtype /Type1
+ /Type /Font
+>>
+endobj
+
+%% Original object ID: 14 0
+14 0 obj
+[
+ /PDF
+ /Text
+]
+endobj
+
+%% Contents for page 2
+%% Original object ID: 15 0
+15 0 obj
+<<
+ /Length 16 0 R
+>>
+stream
+BT
+ /F1 24 Tf
+ 72 720 Td
+ (Potato) Tj
+ET
+endstream
+endobj
+
+16 0 obj
+44
+endobj
+
+%% Original object ID: 17 0
+17 0 obj
+<<
+ /BaseFont /Helvetica
+ /Encoding /WinAnsiEncoding
+ /Name /F1
+ /Subtype /Type1
+ /Type /Font
+>>
+endobj
+
+%% Original object ID: 18 0
+18 0 obj
+[
+ /PDF
+ /Text
+]
+endobj
+
+xref
+0 19
+0000000000 65535 f
+0000000052 00000 n
+0000000133 00000 n
+0000000252 00000 n
+0000000524 00000 n
+0000000769 00000 n
+0000000879 00000 n
+0000000948 00000 n
+0000001044 00000 n
+0000001113 00000 n
+0000001444 00000 n
+0000001516 00000 n
+0000001660 00000 n
+0000001708 00000 n
+0000001855 00000 n
+0000001942 00000 n
+0000002043 00000 n
+0000002091 00000 n
+0000002238 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 19
+ /ID [<fa46a90bcf56476b9904a2e7adb75024><31415926535897932384626433832795>]
+>>
+startxref
+2274
+%%EOF
diff --git a/qpdf/qtest/qpdf/good14.out b/qpdf/qtest/qpdf/good14.out
index 87819670..84bf7133 100644
--- a/qpdf/qtest/qpdf/good14.out
+++ b/qpdf/qtest/qpdf/good14.out
@@ -13,7 +13,9 @@ three lines
<8a8b>
(ab)
<8c><dd> ) >
-<610062> (MOO)-- stream 1 --
+<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 1 --
This stream does end with a newline.
// tests:
// bad tokens preserved
@@ -31,10 +33,18 @@ This stream does end with a newline.
/good name
/bad#00name
+WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-- stream 2 --
(This stream ends with a \001 bad token
+WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
-- stream 3 --
-<AB X-- stream 4 --
+<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 4 --
(ends with a name)
/ThisMustBeLast-- stream 5 --
% This stream has an inline image marker that is not terminated
@@ -44,4 +54,7 @@ BI
ID
<506f7
461746f>
+WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
test 3 done
diff --git a/qpdf/qtest/qpdf/normalize-warnings.out b/qpdf/qtest/qpdf/normalize-warnings.out
new file mode 100644
index 00000000..73947b1a
--- /dev/null
+++ b/qpdf/qtest/qpdf/normalize-warnings.out
@@ -0,0 +1,9 @@
+WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+qpdf: operation succeeded with warnings; resulting file may have some problems