aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jay Berkenbilt <ejb@ql.org> 2018-01-24 03:48:22 +0100
committer Jay Berkenbilt <ejb@ql.org> 2018-02-19 02:18:40 +0100
commit bb9e91adbd75d05d0d60227b2d419d7ee12e1b42 (patch)
tree 690ea6f2a2c340eb1a2114fd5c4d34b51cf388bd
parent ebd5ed63decb90e26ae9129164214f9d7d684621 (diff)
download qpdf-bb9e91adbd75d05d0d60227b2d419d7ee12e1b42.tar.zst
Create isolated tokenizer tests
This tokenizes outer parts of the file, page content streams, and object streams. It is for exercising the tokenizer in isolation and is being introduced before reworking the lexical layer of qpdf.
-rw-r--r-- qpdf/build.mk 2
-rw-r--r-- qpdf/qtest/qpdf.test 7
-rw-r--r-- qpdf/qtest/qpdf/tokens.out 1120
-rw-r--r-- qpdf/qtest/qpdf/tokens.pdf bin 0 -> 9120 bytes
-rw-r--r-- qpdf/test_tokenizer.cc 261
5 files changed, 1388 insertions, 2 deletions
diff --git a/qpdf/build.mk b/qpdf/build.mk
index 893bbbd1..1bc21836 100644
--- a/qpdf/build.mk
+++ b/qpdf/build.mk
@@ -1,4 +1,4 @@
-BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file
+BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer
CBINS_qpdf = qpdf-ctest
TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index f0dde70f..ddf25d73 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -240,7 +240,7 @@ foreach my $d (@bug_tests)
show_ntests();
# ----------
$td->notify("--- Miscellaneous Tests ---");
-$n_tests += 96;
+$n_tests += 97;
$td->runtest("qpdf version",
{$td->COMMAND => "qpdf --version"},
@@ -263,6 +263,11 @@ $td->runtest("check pass1 file",
{$td->FILE => "b.pdf"},
{$td->FILE => "minimal-linearize-pass1.pdf"});
+$td->runtest("tokenizer",
+ {$td->COMMAND => "test_tokenizer tokens.pdf"},
+ {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+
foreach (my $i = 1; $i <= 3; ++$i)
{
$td->runtest("misc tests",
diff --git a/qpdf/qtest/qpdf/tokens.out b/qpdf/qtest/qpdf/tokens.out
new file mode 100644
index 00000000..f52e619b
--- /dev/null
+++ b/qpdf/qtest/qpdf/tokens.out
@@ -0,0 +1,1120 @@
+--- BEGIN FILE ---
+60: integer: 1
+62: integer: 0
+64: word: obj
+68: dict_open: <<
+73: name: /Type
+79: name: /ObjStm
+89: name: /Length
+97: integer: 6020
+104: name: /N
+107: integer: 35
+112: name: /First
+119: integer: 323
+123: dict_close: >>
+126: word: stream
+skipping to endstream
+6153: word: endstream
+6163: word: endobj
+6222: integer: 37
+6225: integer: 0
+6227: word: obj
+6231: dict_open: <<
+6236: name: /Length
+6244: integer: 38
+6247: integer: 0
+6249: word: R
+6251: dict_close: >>
+6254: word: stream
+skipping to endstream
+6305: word: endstream
+6315: word: endobj
+6323: integer: 38
+6326: integer: 0
+6328: word: obj
+6332: integer: 44
+6335: word: endobj
+6394: integer: 39
+6397: integer: 0
+6399: word: obj
+6403: dict_open: <<
+6408: name: /Length
+6416: integer: 40
+6419: integer: 0
+6421: word: R
+6423: dict_close: >>
+6426: word: stream
+skipping to endstream
+6832: word: endstream
+6842: word: endobj
+6850: integer: 40
+6853: integer: 0
+6855: word: obj
+6859: integer: 399
+6863: word: endobj
+6922: integer: 41
+6925: integer: 0
+6927: word: obj
+6931: dict_open: <<
+6936: name: /Length
+6944: integer: 42
+6947: integer: 0
+6949: word: R
+6951: dict_close: >>
+6954: word: stream
+skipping to endstream
+7001: word: endstream
+7011: word: endobj
+7019: integer: 42
+7022: integer: 0
+7024: word: obj
+7028: integer: 40
+7031: word: endobj
+7090: integer: 43
+7093: integer: 0
+7095: word: obj
+7099: dict_open: <<
+7104: name: /Length
+7112: integer: 44
+7115: integer: 0
+7117: word: R
+7119: dict_close: >>
+7122: word: stream
+skipping to endstream
+7404: word: endstream
+7414: word: endobj
+7422: integer: 44
+7425: integer: 0
+7427: word: obj
+7431: integer: 275
+7435: word: endobj
+7494: integer: 45
+7497: integer: 0
+7499: word: obj
+7503: dict_open: <<
+7508: name: /Length
+7516: integer: 46
+7519: integer: 0
+7521: word: R
+7523: dict_close: >>
+7526: word: stream
+skipping to endstream
+7601: word: endstream
+7611: word: endobj
+7619: integer: 46
+7622: integer: 0
+7624: word: obj
+7628: integer: 68
+7631: word: endobj
+7690: integer: 47
+7693: integer: 0
+7695: word: obj
+7699: dict_open: <<
+7704: name: /Length
+7712: integer: 48
+7715: integer: 0
+7717: word: R
+7719: dict_close: >>
+7722: word: stream
+skipping to endstream
+7773: word: endstream
+7783: word: endobj
+7791: integer: 48
+7794: integer: 0
+7796: word: obj
+7800: integer: 44
+7803: word: endobj
+7862: integer: 49
+7865: integer: 0
+7867: word: obj
+7871: dict_open: <<
+7876: name: /Length
+7884: integer: 50
+7887: integer: 0
+7889: word: R
+7891: dict_close: >>
+7894: word: stream
+skipping to endstream
+7945: word: endstream
+7955: word: endobj
+7963: integer: 50
+7966: integer: 0
+7968: word: obj
+7972: integer: 44
+7975: word: endobj
+8034: integer: 51
+8037: integer: 0
+8039: word: obj
+8043: dict_open: <<
+8048: name: /Length
+8056: integer: 52
+8059: integer: 0
+8061: word: R
+8063: dict_close: >>
+8066: word: stream
+skipping to endstream
+8117: word: endstream
+8127: word: endobj
+8135: integer: 52
+8138: integer: 0
+8140: word: obj
+8144: integer: 44
+8147: word: endobj
+8206: integer: 53
+8209: integer: 0
+8211: word: obj
+8215: dict_open: <<
+8220: name: /Length
+8228: integer: 54
+8231: integer: 0
+8233: word: R
+8235: dict_close: >>
+8238: word: stream
+skipping to endstream
+8289: word: endstream
+8299: word: endobj
+8307: integer: 54
+8310: integer: 0
+8312: word: obj
+8316: integer: 44
+8319: word: endobj
+8379: integer: 55
+8382: integer: 0
+8384: word: obj
+8388: dict_open: <<
+8393: name: /Length
+8401: integer: 56
+8404: integer: 0
+8406: word: R
+8408: dict_close: >>
+8411: word: stream
+skipping to endstream
+8462: word: endstream
+8472: word: endobj
+8480: integer: 56
+8483: integer: 0
+8485: word: obj
+8489: integer: 44
+8492: word: endobj
+8552: integer: 57
+8555: integer: 0
+8557: word: obj
+8561: dict_open: <<
+8566: name: /Length
+8574: integer: 58
+8577: integer: 0
+8579: word: R
+8581: dict_close: >>
+8584: word: stream
+skipping to endstream
+8635: word: endstream
+8645: word: endobj
+8653: integer: 58
+8656: integer: 0
+8658: word: obj
+8662: integer: 44
+8665: word: endobj
+8673: integer: 59
+8676: integer: 0
+8678: word: obj
+8682: dict_open: <<
+8687: name: /Type
+8693: name: /XRef
+8701: name: /Length
+8709: integer: 240
+8715: name: /W
+8718: array_open: [
+8720: integer: 1
+8722: integer: 2
+8724: integer: 1
+8726: array_close: ]
+8730: name: /Root
+8736: integer: 2
+8738: integer: 0
+8740: word: R
+8744: name: /Size
+8750: integer: 60
+8755: name: /ID
+8759: array_open: [
+8760: string: \x88\x04\x8e\x17\xc9a\xe0\x94\xff\xec\xe9\x8c\xb8\x8cF\xd0 (raw: <88048e17c961e094ffece98cb88c46d0>)
+8794: string: \xed\xd6\x0f\xe8\xee\x87\xf8\x871\xa8o\x81\x9f\xe6Q\x99 (raw: <edd60fe8ee87f88731a86f819fe65199>)
+8828: array_close: ]
+8830: dict_close: >>
+8833: word: stream
+skipping to endstream
+9081: word: endstream
+9091: word: endobj
+9099: word: startxref
+9109: integer: 8673
+9120: eof
+--- END FILE ---
+--- BEGIN PAGE 1 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: eof
+--- END PAGE 1 ---
+--- BEGIN PAGE 2 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: word: BI
+47: name: /CS
+51: name: /G
+53: name: /W
+56: integer: 66
+58: name: /H
+61: integer: 47
+63: name: /BPC
+68: integer: 8
+69: name: /F
+71: name: /Fl
+74: name: /DP
+77: dict_open: <<
+79: name: /Predictor
+90: integer: 15
+92: name: /Columns
+101: integer: 66
+103: dict_close: >>
+106: word: ID
+skipping to EI
+352: word: EI
+355: word: BT
+360: name: /F1
+364: integer: 24
+367: word: Tf
+372: integer: 72
+375: integer: 720
+379: word: Td
+384: string: Potato (raw: (Potato))
+393: word: Tj
+396: word: ET
+399: eof
+--- END PAGE 2 ---
+--- BEGIN PAGE 3 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: bad: Potato\x0aET\x0a (raw: (Potato\x0aET\x0a) (EOF while reading token)
+40: eof
+--- END PAGE 3 ---
+--- BEGIN PAGE 4 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: string: \xfe\xeb (raw: <feeb>)
+26: string: \xab\xcd (raw: <ab\x0aCD>)
+36: string: quack (raw: (qu\\x0d\x0aack))
+49: string: quack (raw: (qu\\x0aack))
+61: string: quack (raw: (qu\\x0dack))
+73: integer: 72
+76: integer: 720
+80: word: Td
+85: real: 3.14
+92: real: 3.
+97: real: .14
+103: real: +3.14
+111: real: +3.
+117: real: +.14
+124: real: -3.14
+132: real: -3.
+138: real: -.14
+145: integer: +16059
+154: integer: -16059
+163: word: +.
+168: bad: fadeE (raw: <fade\x0aET) (invalid character (T) in hexstring)
+177: bad: ) (unexpected ))
+178: bad: > (unexpected >)
+179: word: quack
+185: bad: /name#oops (invalid name token)
+196: name: /name (raw: /n#61me)
+204: word: one
+208: bool: true
+213: word: two
+217: bool: false
+223: word: three
+229: null: null
+234: word: four
+239: word: !@#$^&
+245: brace_open: {
+246: brace_close: }
+247: word: *-_+=
+253: word: abc123def3.14true
+271: bad: ff (raw: <ff\x0a) (EOF while reading token)
+275: eof
+--- END PAGE 4 ---
+--- BEGIN PAGE 5 ---
+0: word: BT
+5: bad: /F#00x (null character not allowed in name token)
+12: integer: 24
+15: word: Tf
+20: integer: 72
+23: integer: 720
+27: word: Td
+32: string: P\x00tat\x00 (raw: (P\x00tat\000))
+44: word: Tj
+47: word: ET
+52: name: /ThisMustBeLast
+68: eof
+--- END PAGE 5 ---
+--- BEGIN PAGE 6 ---
+0: word: ID
+skipping to EI
+EI not found
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: eof
+--- END PAGE 6 ---
+--- BEGIN PAGE 7 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: eof
+--- END PAGE 7 ---
+--- BEGIN PAGE 8 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: eof
+--- END PAGE 8 ---
+--- BEGIN PAGE 9 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: eof
+--- END PAGE 9 ---
+--- BEGIN PAGE 10 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: eof
+--- END PAGE 10 ---
+--- BEGIN PAGE 11 ---
+0: word: BT
+5: name: /F1
+9: integer: 24
+12: word: Tf
+17: integer: 72
+20: integer: 720
+24: word: Td
+29: string: Potato (raw: (Potato))
+38: word: Tj
+41: word: ET
+44: eof
+--- END PAGE 11 ---
+--- BEGIN OBJECT STREAM 1 ---
+0: integer: 2
+2: integer: 0
+4: integer: 3
+6: integer: 97
+9: integer: 4
+11: integer: 318
+15: integer: 5
+17: integer: 566
+21: integer: 6
+23: integer: 814
+27: integer: 7
+29: integer: 1062
+34: integer: 8
+36: integer: 1310
+41: integer: 9
+43: integer: 1558
+48: integer: 10
+51: integer: 1808
+56: integer: 11
+59: integer: 2058
+64: integer: 12
+67: integer: 2309
+72: integer: 13
+75: integer: 2560
+80: integer: 14
+83: integer: 2812
+88: integer: 15
+91: integer: 3064
+96: integer: 16
+99: integer: 3228
+104: integer: 17
+107: integer: 3309
+112: integer: 18
+115: integer: 3473
+120: integer: 19
+123: integer: 3554
+128: integer: 20
+131: integer: 3718
+136: integer: 21
+139: integer: 3799
+144: integer: 22
+147: integer: 3963
+152: integer: 23
+155: integer: 4044
+160: integer: 24
+163: integer: 4208
+168: integer: 25
+171: integer: 4289
+176: integer: 26
+179: integer: 4453
+184: integer: 27
+187: integer: 4534
+192: integer: 28
+195: integer: 4698
+200: integer: 29
+203: integer: 4779
+208: integer: 30
+211: integer: 4943
+216: integer: 31
+219: integer: 5024
+224: integer: 32
+227: integer: 5188
+232: integer: 33
+235: integer: 5269
+240: integer: 34
+243: integer: 5433
+248: integer: 35
+251: integer: 5514
+256: integer: 36
+259: integer: 5678
+323: dict_open: <<
+328: name: /Pages
+335: integer: 3
+337: integer: 0
+339: word: R
+343: name: /Type
+349: name: /Catalog
+358: dict_close: >>
+420: dict_open: <<
+425: name: /Count
+432: integer: 11
+437: name: /Kids
+443: array_open: [
+449: integer: 4
+451: integer: 0
+453: word: R
+459: integer: 5
+461: integer: 0
+463: word: R
+469: integer: 6
+471: integer: 0
+473: word: R
+479: integer: 7
+481: integer: 0
+483: word: R
+489: integer: 8
+491: integer: 0
+493: word: R
+499: integer: 9
+501: integer: 0
+503: word: R
+509: integer: 10
+512: integer: 0
+514: word: R
+520: integer: 11
+523: integer: 0
+525: word: R
+531: integer: 12
+534: integer: 0
+536: word: R
+542: integer: 13
+545: integer: 0
+547: word: R
+553: integer: 14
+556: integer: 0
+558: word: R
+562: array_close: ]
+566: name: /Type
+572: name: /Pages
+579: dict_close: >>
+651: dict_open: <<
+656: name: /Contents
+666: integer: 37
+669: integer: 0
+671: word: R
+675: name: /MediaBox
+685: array_open: [
+691: integer: 0
+697: integer: 0
+703: integer: 612
+711: integer: 792
+717: array_close: ]
+721: name: /Parent
+729: integer: 3
+731: integer: 0
+733: word: R
+737: name: /Resources
+748: dict_open: <<
+755: name: /Font
+761: dict_open: <<
+770: name: /F1
+774: integer: 15
+777: integer: 0
+779: word: R
+785: dict_close: >>
+792: name: /ProcSet
+801: integer: 16
+804: integer: 0
+806: word: R
+810: dict_close: >>
+815: name: /Type
+821: name: /Page
+827: dict_close: >>
+899: dict_open: <<
+904: name: /Contents
+914: integer: 39
+917: integer: 0
+919: word: R
+923: name: /MediaBox
+933: array_open: [
+939: integer: 0
+945: integer: 0
+951: integer: 612
+959: integer: 792
+965: array_close: ]
+969: name: /Parent
+977: integer: 3
+979: integer: 0
+981: word: R
+985: name: /Resources
+996: dict_open: <<
+1003: name: /Font
+1009: dict_open: <<
+1018: name: /F1
+1022: integer: 17
+1025: integer: 0
+1027: word: R
+1033: dict_close: >>
+1040: name: /ProcSet
+1049: integer: 18
+1052: integer: 0
+1054: word: R
+1058: dict_close: >>
+1063: name: /Type
+1069: name: /Page
+1075: dict_close: >>
+1147: dict_open: <<
+1152: name: /Contents
+1162: integer: 41
+1165: integer: 0
+1167: word: R
+1171: name: /MediaBox
+1181: array_open: [
+1187: integer: 0
+1193: integer: 0
+1199: integer: 612
+1207: integer: 792
+1213: array_close: ]
+1217: name: /Parent
+1225: integer: 3
+1227: integer: 0
+1229: word: R
+1233: name: /Resources
+1244: dict_open: <<
+1251: name: /Font
+1257: dict_open: <<
+1266: name: /F1
+1270: integer: 19
+1273: integer: 0
+1275: word: R
+1281: dict_close: >>
+1288: name: /ProcSet
+1297: integer: 20
+1300: integer: 0
+1302: word: R
+1306: dict_close: >>
+1311: name: /Type
+1317: name: /Page
+1323: dict_close: >>
+1395: dict_open: <<
+1400: name: /Contents
+1410: integer: 43
+1413: integer: 0
+1415: word: R
+1419: name: /MediaBox
+1429: array_open: [
+1435: integer: 0
+1441: integer: 0
+1447: integer: 612
+1455: integer: 792
+1461: array_close: ]
+1465: name: /Parent
+1473: integer: 3
+1475: integer: 0
+1477: word: R
+1481: name: /Resources
+1492: dict_open: <<
+1499: name: /Font
+1505: dict_open: <<
+1514: name: /F1
+1518: integer: 21
+1521: integer: 0
+1523: word: R
+1529: dict_close: >>
+1536: name: /ProcSet
+1545: integer: 22
+1548: integer: 0
+1550: word: R
+1554: dict_close: >>
+1559: name: /Type
+1565: name: /Page
+1571: dict_close: >>
+1643: dict_open: <<
+1648: name: /Contents
+1658: integer: 45
+1661: integer: 0
+1663: word: R
+1667: name: /MediaBox
+1677: array_open: [
+1683: integer: 0
+1689: integer: 0
+1695: integer: 612
+1703: integer: 792
+1709: array_close: ]
+1713: name: /Parent
+1721: integer: 3
+1723: integer: 0
+1725: word: R
+1729: name: /Resources
+1740: dict_open: <<
+1747: name: /Font
+1753: dict_open: <<
+1762: name: /F1
+1766: integer: 23
+1769: integer: 0
+1771: word: R
+1777: dict_close: >>
+1784: name: /ProcSet
+1793: integer: 24
+1796: integer: 0
+1798: word: R
+1802: dict_close: >>
+1807: name: /Type
+1813: name: /Page
+1819: dict_close: >>
+1891: dict_open: <<
+1896: name: /Contents
+1906: integer: 47
+1909: integer: 0
+1911: word: R
+1915: name: /MediaBox
+1925: array_open: [
+1931: integer: 0
+1937: integer: 0
+1943: integer: 612
+1951: integer: 792
+1957: array_close: ]
+1961: name: /Parent
+1969: integer: 3
+1971: integer: 0
+1973: word: R
+1977: name: /Resources
+1988: dict_open: <<
+1995: name: /Font
+2001: dict_open: <<
+2010: name: /F1
+2014: integer: 25
+2017: integer: 0
+2019: word: R
+2025: dict_close: >>
+2032: name: /ProcSet
+2041: integer: 26
+2044: integer: 0
+2046: word: R
+2050: dict_close: >>
+2055: name: /Type
+2061: name: /Page
+2067: dict_close: >>
+2141: dict_open: <<
+2146: name: /Contents
+2156: integer: 49
+2159: integer: 0
+2161: word: R
+2165: name: /MediaBox
+2175: array_open: [
+2181: integer: 0
+2187: integer: 0
+2193: integer: 612
+2201: integer: 792
+2207: array_close: ]
+2211: name: /Parent
+2219: integer: 3
+2221: integer: 0
+2223: word: R
+2227: name: /Resources
+2238: dict_open: <<
+2245: name: /Font
+2251: dict_open: <<
+2260: name: /F1
+2264: integer: 27
+2267: integer: 0
+2269: word: R
+2275: dict_close: >>
+2282: name: /ProcSet
+2291: integer: 28
+2294: integer: 0
+2296: word: R
+2300: dict_close: >>
+2305: name: /Type
+2311: name: /Page
+2317: dict_close: >>
+2391: dict_open: <<
+2396: name: /Contents
+2406: integer: 51
+2409: integer: 0
+2411: word: R
+2415: name: /MediaBox
+2425: array_open: [
+2431: integer: 0
+2437: integer: 0
+2443: integer: 612
+2451: integer: 792
+2457: array_close: ]
+2461: name: /Parent
+2469: integer: 3
+2471: integer: 0
+2473: word: R
+2477: name: /Resources
+2488: dict_open: <<
+2495: name: /Font
+2501: dict_open: <<
+2510: name: /F1
+2514: integer: 29
+2517: integer: 0
+2519: word: R
+2525: dict_close: >>
+2532: name: /ProcSet
+2541: integer: 30
+2544: integer: 0
+2546: word: R
+2550: dict_close: >>
+2555: name: /Type
+2561: name: /Page
+2567: dict_close: >>
+2642: dict_open: <<
+2647: name: /Contents
+2657: integer: 53
+2660: integer: 0
+2662: word: R
+2666: name: /MediaBox
+2676: array_open: [
+2682: integer: 0
+2688: integer: 0
+2694: integer: 612
+2702: integer: 792
+2708: array_close: ]
+2712: name: /Parent
+2720: integer: 3
+2722: integer: 0
+2724: word: R
+2728: name: /Resources
+2739: dict_open: <<
+2746: name: /Font
+2752: dict_open: <<
+2761: name: /F1
+2765: integer: 31
+2768: integer: 0
+2770: word: R
+2776: dict_close: >>
+2783: name: /ProcSet
+2792: integer: 32
+2795: integer: 0
+2797: word: R
+2801: dict_close: >>
+2806: name: /Type
+2812: name: /Page
+2818: dict_close: >>
+2894: dict_open: <<
+2899: name: /Contents
+2909: integer: 55
+2912: integer: 0
+2914: word: R
+2918: name: /MediaBox
+2928: array_open: [
+2934: integer: 0
+2940: integer: 0
+2946: integer: 612
+2954: integer: 792
+2960: array_close: ]
+2964: name: /Parent
+2972: integer: 3
+2974: integer: 0
+2976: word: R
+2980: name: /Resources
+2991: dict_open: <<
+2998: name: /Font
+3004: dict_open: <<
+3013: name: /F1
+3017: integer: 33
+3020: integer: 0
+3022: word: R
+3028: dict_close: >>
+3035: name: /ProcSet
+3044: integer: 34
+3047: integer: 0
+3049: word: R
+3053: dict_close: >>
+3058: name: /Type
+3064: name: /Page
+3070: dict_close: >>
+3146: dict_open: <<
+3151: name: /Contents
+3161: integer: 57
+3164: integer: 0
+3166: word: R
+3170: name: /MediaBox
+3180: array_open: [
+3186: integer: 0
+3192: integer: 0
+3198: integer: 612
+3206: integer: 792
+3212: array_close: ]
+3216: name: /Parent
+3224: integer: 3
+3226: integer: 0
+3228: word: R
+3232: name: /Resources
+3243: dict_open: <<
+3250: name: /Font
+3256: dict_open: <<
+3265: name: /F1
+3269: integer: 35
+3272: integer: 0
+3274: word: R
+3280: dict_close: >>
+3287: name: /ProcSet
+3296: integer: 36
+3299: integer: 0
+3301: word: R
+3305: dict_close: >>
+3310: name: /Type
+3316: name: /Page
+3322: dict_close: >>
+3387: dict_open: <<
+3392: name: /BaseFont
+3402: name: /Helvetica
+3415: name: /Encoding
+3425: name: /WinAnsiEncoding
+3444: name: /Name
+3450: name: /F1
+3456: name: /Subtype
+3465: name: /Type1
+3474: name: /Type
+3480: name: /Font
+3486: dict_close: >>
+3551: array_open: [
+3555: name: /PDF
+3562: name: /Text
+3568: array_close: ]
+3632: dict_open: <<
+3637: name: /BaseFont
+3647: name: /Helvetica
+3660: name: /Encoding
+3670: name: /WinAnsiEncoding
+3689: name: /Name
+3695: name: /F1
+3701: name: /Subtype
+3710: name: /Type1
+3719: name: /Type
+3725: name: /Font
+3731: dict_close: >>
+3796: array_open: [
+3800: name: /PDF
+3807: name: /Text
+3813: array_close: ]
+3877: dict_open: <<
+3882: name: /BaseFont
+3892: name: /Helvetica
+3905: name: /Encoding
+3915: name: /WinAnsiEncoding
+3934: name: /Name
+3940: name: /F1
+3946: name: /Subtype
+3955: name: /Type1
+3964: name: /Type
+3970: name: /Font
+3976: dict_close: >>
+4041: array_open: [
+4045: name: /PDF
+4052: name: /Text
+4058: array_close: ]
+4122: dict_open: <<
+4127: name: /BaseFont
+4137: name: /Helvetica
+4150: name: /Encoding
+4160: name: /WinAnsiEncoding
+4179: name: /Name
+4185: name: /F1
+4191: name: /Subtype
+4200: name: /Type1
+4209: name: /Type
+4215: name: /Font
+4221: dict_close: >>
+4286: array_open: [
+4290: name: /PDF
+4297: name: /Text
+4303: array_close: ]
+4367: dict_open: <<
+4372: name: /BaseFont
+4382: name: /Helvetica
+4395: name: /Encoding
+4405: name: /WinAnsiEncoding
+4424: name: /Name
+4430: name: /F1
+4436: name: /Subtype
+4445: name: /Type1
+4454: name: /Type
+4460: name: /Font
+4466: dict_close: >>
+4531: array_open: [
+4535: name: /PDF
+4542: name: /Text
+4548: array_close: ]
+4612: dict_open: <<
+4617: name: /BaseFont
+4627: name: /Helvetica
+4640: name: /Encoding
+4650: name: /WinAnsiEncoding
+4669: name: /Name
+4675: name: /F1
+4681: name: /Subtype
+4690: name: /Type1
+4699: name: /Type
+4705: name: /Font
+4711: dict_close: >>
+4776: array_open: [
+4780: name: /PDF
+4787: name: /Text
+4793: array_close: ]
+4857: dict_open: <<
+4862: name: /BaseFont
+4872: name: /Helvetica
+4885: name: /Encoding
+4895: name: /WinAnsiEncoding
+4914: name: /Name
+4920: name: /F1
+4926: name: /Subtype
+4935: name: /Type1
+4944: name: /Type
+4950: name: /Font
+4956: dict_close: >>
+5021: array_open: [
+5025: name: /PDF
+5032: name: /Text
+5038: array_close: ]
+5102: dict_open: <<
+5107: name: /BaseFont
+5117: name: /Helvetica
+5130: name: /Encoding
+5140: name: /WinAnsiEncoding
+5159: name: /Name
+5165: name: /F1
+5171: name: /Subtype
+5180: name: /Type1
+5189: name: /Type
+5195: name: /Font
+5201: dict_close: >>
+5266: array_open: [
+5270: name: /PDF
+5277: name: /Text
+5283: array_close: ]
+5347: dict_open: <<
+5352: name: /BaseFont
+5362: name: /Helvetica
+5375: name: /Encoding
+5385: name: /WinAnsiEncoding
+5404: name: /Name
+5410: name: /F1
+5416: name: /Subtype
+5425: name: /Type1
+5434: name: /Type
+5440: name: /Font
+5446: dict_close: >>
+5511: array_open: [
+5515: name: /PDF
+5522: name: /Text
+5528: array_close: ]
+5592: dict_open: <<
+5597: name: /BaseFont
+5607: name: /Helvetica
+5620: name: /Encoding
+5630: name: /WinAnsiEncoding
+5649: name: /Name
+5655: name: /F1
+5661: name: /Subtype
+5670: name: /Type1
+5679: name: /Type
+5685: name: /Font
+5691: dict_close: >>
+5756: array_open: [
+5760: name: /PDF
+5767: name: /Text
+5773: array_close: ]
+5837: dict_open: <<
+5842: name: /BaseFont
+5852: name: /Helvetica
+5865: name: /Encoding
+5875: name: /WinAnsiEncoding
+5894: name: /Name
+5900: name: /F1
+5906: name: /Subtype
+5915: name: /Type1
+5924: name: /Type
+5930: name: /Font
+5936: dict_close: >>
+6001: array_open: [
+6005: name: /PDF
+6012: name: /Text
+6018: array_close: ]
+6020: eof
+--- END OBJECT STREAM 1 ---
diff --git a/qpdf/qtest/qpdf/tokens.pdf b/qpdf/qtest/qpdf/tokens.pdf
new file mode 100644
index 00000000..b444db5f
--- /dev/null
+++ b/qpdf/qtest/qpdf/tokens.pdf
Binary files differ
diff --git a/qpdf/test_tokenizer.cc b/qpdf/test_tokenizer.cc
new file mode 100644
index 00000000..de079195
--- /dev/null
+++ b/qpdf/test_tokenizer.cc
@@ -0,0 +1,261 @@
+#include <qpdf/QPDFTokenizer.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/FileInputSource.hh>
+#include <qpdf/BufferInputSource.hh>
+#include <qpdf/QPDF.hh>
+#include <qpdf/Pl_Buffer.hh>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+
+static char const* whoami = 0;
+
+// Write the expected command line to stderr, then terminate the
+// program with exit status 2.
+void usage()
+{
+    std::cerr << "Usage: " << whoami << " filename" << std::endl;
+    exit(2);
+}
+
+// InputSource::Finder subclass that stores an input source together
+// with a target string. check() is defined out of line and uses both.
+class Finder: public InputSource::Finder
+{
+  public:
+    Finder(PointerHolder<InputSource> source, std::string const& target) :
+        is(source),
+        str(target)
+    {
+    }
+    virtual ~Finder()
+    {
+    }
+    virtual bool check();
+
+  private:
+    // Input source that check() tokenizes from and repositions.
+    PointerHolder<InputSource> is;
+    // Word that check() compares the token it reads against.
+    std::string str;
+};
+
+// Read one token from the input source at its current position and
+// report whether it is exactly the word this finder was constructed
+// with. Regardless of the outcome, the source is then repositioned to
+// the offset reached after the read minus the length of that word.
+bool
+Finder::check()
+{
+    QPDFTokenizer tokenizer;
+    QPDFTokenizer::Token t =
+        tokenizer.readToken(this->is, "finder", true);
+    qpdf_offset_t offset = this->is->tell();
+    bool result =
+        (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, this->str));
+    // Convert the length explicitly so the subtraction is carried out
+    // in the signed offset type rather than being mixed with the
+    // unsigned std::string::size_type.
+    qpdf_offset_t word_length =
+        static_cast<qpdf_offset_t>(this->str.length());
+    this->is->seek(offset - word_length, SEEK_SET);
+    return result;
+}
+
+// Map a tokenizer token type to the short label printed in the token
+// dump. Always returns a non-null, NUL-terminated string so the
+// result can be streamed directly.
+static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype)
+{
+    // Do this as a case statement instead of a lookup so the compiler
+    // will warn if we miss any.
+    switch (ttype)
+    {
+      case QPDFTokenizer::tt_bad:
+        return "bad";
+      case QPDFTokenizer::tt_array_close:
+        return "array_close";
+      case QPDFTokenizer::tt_array_open:
+        return "array_open";
+      case QPDFTokenizer::tt_brace_close:
+        return "brace_close";
+      case QPDFTokenizer::tt_brace_open:
+        return "brace_open";
+      case QPDFTokenizer::tt_dict_close:
+        return "dict_close";
+      case QPDFTokenizer::tt_dict_open:
+        return "dict_open";
+      case QPDFTokenizer::tt_integer:
+        return "integer";
+      case QPDFTokenizer::tt_name:
+        return "name";
+      case QPDFTokenizer::tt_real:
+        return "real";
+      case QPDFTokenizer::tt_string:
+        return "string";
+      case QPDFTokenizer::tt_null:
+        return "null";
+      case QPDFTokenizer::tt_bool:
+        return "bool";
+      case QPDFTokenizer::tt_word:
+        return "word";
+      case QPDFTokenizer::tt_eof:
+        return "eof";
+    }
+    // Only reached for a value not listed above. Return a placeholder
+    // rather than a null pointer: the caller inserts the result into
+    // std::cout, and streaming a null char const* is undefined.
+    return "unknown";
+}
+
+// Return a copy of value in which every byte outside the range
+// 32..126 is replaced by "\x" followed by the byte's value as
+// formatted by QUtil::int_to_string_base(byte, 16, 2). Bytes inside
+// that range are copied through unchanged.
+static std::string
+sanitize(std::string const& value)
+{
+    std::string printable;
+    for (std::string::size_type i = 0; i < value.length(); ++i)
+    {
+        unsigned char const byte = static_cast<unsigned char>(value[i]);
+        if ((byte < 32) || (byte > 126))
+        {
+            printable += "\\x";
+            printable += QUtil::int_to_string_base(byte, 16, 2);
+        }
+        else
+        {
+            printable.append(1, value[i]);
+        }
+    }
+    return printable;
+}
+
+// Print "skipping to <what>", then call findFirst on the input source
+// with `what`, the current offset, a length of 0 and finder f. If
+// findFirst reports failure, print "<what> not found" and seek back
+// to the offset recorded before the search.
+static void
+try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)
+{
+    std::cout << "skipping to " << what << std::endl;
+    qpdf_offset_t const start = is->tell();
+    bool const found = is->findFirst(what, start, 0, f);
+    if (found)
+    {
+        return;
+    }
+    std::cout << what << " not found" << std::endl;
+    is->seek(start, SEEK_SET);
+}
+
+// Tokenize the input source until an eof token is read, writing one
+// line per token to stdout between "--- BEGIN <label> ---" and
+// "--- END <label> ---" marker lines.
+//
+// Each token line is "<offset>: <type>", where offset is the source
+// position after the read minus the length of the token's raw value.
+// Non-eof tokens add ": <sanitized value>", plus " (raw: ...)" when
+// the raw value differs from the value; bad tokens also add the
+// tokenizer's error message in parentheses.
+//
+// skip_streams: after a word token "stream", try to skip ahead to
+//   "endstream" instead of tokenizing what follows.
+// skip_inline_images: after a word token "ID", try to skip ahead to
+//   "EI" instead of tokenizing what follows.
+static void
+dump_tokens(PointerHolder<InputSource> is, std::string const& label,
+            bool skip_streams, bool skip_inline_images)
+{
+    Finder f1(is, "endstream");
+    Finder f2(is, "EI");
+    std::cout << "--- BEGIN " << label << " ---" << std::endl;
+    bool done = false;
+    QPDFTokenizer tokenizer;
+    tokenizer.allowEOF();
+    while (! done)
+    {
+        QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true);
+
+        // Convert the raw length explicitly so the subtraction is done
+        // in the signed offset type rather than being mixed with the
+        // unsigned std::string::size_type.
+        qpdf_offset_t raw_length =
+            static_cast<qpdf_offset_t>(token.getRawValue().length());
+        qpdf_offset_t offset = is->tell() - raw_length;
+        std::cout << offset << ": "
+                  << tokenTypeName(token.getType());
+        if (token.getType() != QPDFTokenizer::tt_eof)
+        {
+            std::cout << ": "
+                      << sanitize(token.getValue());
+            if (token.getValue() != token.getRawValue())
+            {
+                std::cout << " (raw: " << sanitize(token.getRawValue()) << ")";
+            }
+        }
+        if (token.getType() == QPDFTokenizer::tt_bad)
+        {
+            std::cout << " (" << token.getErrorMessage() << ")";
+        }
+        std::cout << std::endl;
+        if (skip_streams &&
+            (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")))
+        {
+            try_skipping(is, "endstream", f1);
+        }
+        else if (skip_inline_images &&
+                 (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
+        {
+            try_skipping(is, "EI", f2);
+        }
+        else if (token.getType() == QPDFTokenizer::tt_eof)
+        {
+            done = true;
+        }
+    }
+    std::cout << "--- END " << label << " ---" << std::endl;
+}
+
+// Dump the tokens of the named PDF file in three passes, all written
+// to stdout through dump_tokens():
+//   1. the file itself, labelled "FILE", skipping over stream data;
+//   2. each page's piped content-stream data, labelled "PAGE <n>"
+//      with n counting from 1, skipping over inline image data;
+//   3. the data of every stream object whose dictionary /Type is
+//      /ObjStm, labelled "OBJECT STREAM <object id>".
+static void process(char const* filename)
+{
+    PointerHolder<InputSource> is;
+
+    // Tokenize file, skipping streams. Hand the new object to the
+    // PointerHolder before calling setFilename() so that it is not
+    // leaked if that call throws.
+    FileInputSource* fis = new FileInputSource();
+    is = fis;
+    fis->setFilename(filename);
+    dump_tokens(is, "FILE", true, false);
+
+    // Tokenize content streams, skipping inline images
+    QPDF qpdf;
+    qpdf.processFile(filename);
+    std::vector<QPDFObjectHandle> pages = qpdf.getAllPages();
+    int pageno = 0;
+    for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
+         iter != pages.end(); ++iter)
+    {
+        ++pageno;
+        // Collect the data of all of the page's content streams into
+        // a single buffer so the page is tokenized as one unit.
+        Pl_Buffer plb("buffer");
+        std::vector<QPDFObjectHandle> contents = (*iter).getPageContents();
+        for (std::vector<QPDFObjectHandle>::iterator citer = contents.begin();
+             citer != contents.end(); ++citer)
+        {
+            (*citer).pipeStreamData(&plb, 0, qpdf_dl_specialized);
+        }
+        plb.finish();
+        PointerHolder<Buffer> content_data = plb.getBuffer();
+        BufferInputSource* bis = new BufferInputSource(
+            "content data", content_data.getPointer());
+        is = bis;
+        dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), false, true);
+    }
+
+    // Tokenize object streams
+    std::vector<QPDFObjectHandle> all = qpdf.getAllObjects();
+    for (std::vector<QPDFObjectHandle>::iterator iter = all.begin();
+         iter != all.end(); ++iter)
+    {
+        if ((*iter).isStream() &&
+            (*iter).getDict().getKey("/Type").isName() &&
+            (*iter).getDict().getKey("/Type").getName() == "/ObjStm")
+        {
+            PointerHolder<Buffer> b =
+                (*iter).getStreamData(qpdf_dl_specialized);
+            BufferInputSource* bis = new BufferInputSource(
+                "object stream data", b.getPointer());
+            is = bis;
+            dump_tokens(is, "OBJECT STREAM " +
+                        QUtil::int_to_string((*iter).getObjectID()),
+                        false, false);
+        }
+    }
+}
+
+// Entry point. Expects exactly one argument, the file to tokenize.
+// Returns 0 on success. Exits with status 2 after printing a usage
+// message if the argument count is wrong, or after reporting the
+// exception on stderr if processing throws a std::exception.
+int main(int argc, char* argv[])
+{
+    QUtil::setLineBuf(stdout);
+    // Derive the program name used in messages: the part of argv[0]
+    // after the last '/', or all of argv[0] if it contains none.
+    if ((whoami = strrchr(argv[0], '/')) == NULL)
+    {
+        whoami = argv[0];
+    }
+    else
+    {
+        ++whoami;
+    }
+    // For libtool's sake....
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+        whoami += 3;
+    }
+
+    if (argc != 2)
+    {
+        usage();
+    }
+
+    char const* filename = argv[1];
+    try
+    {
+        process(filename);
+    }
+    catch (std::exception const& e)
+    {
+        // Terminate the message with a newline so it does not run
+        // into whatever is written to the terminal next.
+        std::cerr << whoami << ": exception: " << e.what() << std::endl;
+        exit(2);
+    }
+    return 0;
+}