From bb9e91adbd75d05d0d60227b2d419d7ee12e1b42 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 23 Jan 2018 21:48:22 -0500 Subject: Create isolated tokenizer tests This tokenizes outer parts of the file, page content streams, and object streams. It is for exercising the tokenizer in isolation and is being introduced before reworking the lexical layer of qpdf. --- qpdf/build.mk | 2 +- qpdf/qtest/qpdf.test | 7 +- qpdf/qtest/qpdf/tokens.out | 1120 ++++++++++++++++++++++++++++++++++++++++++++ qpdf/qtest/qpdf/tokens.pdf | Bin 0 -> 9120 bytes qpdf/test_tokenizer.cc | 261 +++++++++++ 5 files changed, 1388 insertions(+), 2 deletions(-) create mode 100644 qpdf/qtest/qpdf/tokens.out create mode 100644 qpdf/qtest/qpdf/tokens.pdf create mode 100644 qpdf/test_tokenizer.cc diff --git a/qpdf/build.mk b/qpdf/build.mk index 893bbbd1..1bc21836 100644 --- a/qpdf/build.mk +++ b/qpdf/build.mk @@ -1,4 +1,4 @@ -BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file +BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer CBINS_qpdf = qpdf-ctest TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test index f0dde70f..ddf25d73 100644 --- a/qpdf/qtest/qpdf.test +++ b/qpdf/qtest/qpdf.test @@ -240,7 +240,7 @@ foreach my $d (@bug_tests) show_ntests(); # ---------- $td->notify("--- Miscellaneous Tests ---"); -$n_tests += 96; +$n_tests += 97; $td->runtest("qpdf version", {$td->COMMAND => "qpdf --version"}, @@ -263,6 +263,11 @@ $td->runtest("check pass1 file", {$td->FILE => "b.pdf"}, {$td->FILE => "minimal-linearize-pass1.pdf"}); +$td->runtest("tokenizer", + {$td->COMMAND => "test_tokenizer tokens.pdf"}, + {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0}, + $td->NORMALIZE_NEWLINES); + foreach (my $i = 1; $i <= 3; ++$i) { $td->runtest("misc tests", diff --git a/qpdf/qtest/qpdf/tokens.out b/qpdf/qtest/qpdf/tokens.out new file mode 100644 index 00000000..f52e619b --- /dev/null +++ b/qpdf/qtest/qpdf/tokens.out @@ -0,0 +1,1120 @@ +--- BEGIN FILE --- +60: integer: 1 +62: integer: 0 +64: word: obj +68: dict_open: << +73: name: /Type +79: name: /ObjStm +89: name: /Length +97: integer: 6020 +104: name: /N +107: integer: 35 +112: name: /First +119: integer: 323 +123: dict_close: >> +126: word: stream +skipping to endstream +6153: word: endstream +6163: word: endobj +6222: integer: 37 +6225: integer: 0 +6227: word: obj +6231: dict_open: << +6236: name: /Length +6244: integer: 38 +6247: integer: 0 +6249: word: R +6251: dict_close: >> +6254: word: stream +skipping to endstream +6305: word: endstream +6315: word: endobj +6323: integer: 38 +6326: integer: 0 +6328: word: obj +6332: integer: 44 +6335: word: endobj +6394: integer: 39 +6397: integer: 0 +6399: word: obj +6403: dict_open: << +6408: name: /Length +6416: integer: 40 +6419: integer: 0 +6421: word: R +6423: dict_close: >> +6426: word: stream +skipping to endstream +6832: word: endstream +6842: word: endobj +6850: integer: 40 +6853: integer: 0 +6855: word: obj +6859: integer: 399 +6863: word: endobj +6922: integer: 41 +6925: integer: 0 +6927: word: obj +6931: dict_open: << +6936: name: /Length +6944: integer: 42 +6947: integer: 0 +6949: word: R +6951: dict_close: >> +6954: word: stream +skipping to endstream +7001: word: endstream +7011: word: endobj +7019: integer: 42 +7022: integer: 0 +7024: word: obj +7028: integer: 40 +7031: word: endobj +7090: integer: 43 +7093: integer: 0 +7095: word: obj +7099: dict_open: << +7104: name: /Length +7112: integer: 44 +7115: integer: 0 +7117: word: R +7119: dict_close: >> +7122: word: stream +skipping to endstream +7404: word: endstream +7414: word: endobj +7422: integer: 44 +7425: integer: 0 +7427: word: obj +7431: integer: 275 +7435: word: endobj +7494: integer: 45 +7497: integer: 0 +7499: word: obj +7503: dict_open: << +7508: name: /Length +7516: integer: 46 +7519: integer: 0 +7521: word: R +7523: dict_close: >> +7526: word: stream +skipping to endstream +7601: word: endstream +7611: word: endobj +7619: integer: 46 +7622: integer: 0 +7624: word: obj +7628: integer: 68 +7631: word: endobj +7690: integer: 47 +7693: integer: 0 +7695: word: obj +7699: dict_open: << +7704: name: /Length +7712: integer: 48 +7715: integer: 0 +7717: word: R +7719: dict_close: >> +7722: word: stream +skipping to endstream +7773: word: endstream +7783: word: endobj +7791: integer: 48 +7794: integer: 0 +7796: word: obj +7800: integer: 44 +7803: word: endobj +7862: integer: 49 +7865: integer: 0 +7867: word: obj +7871: dict_open: << +7876: name: /Length +7884: integer: 50 +7887: integer: 0 +7889: word: R +7891: dict_close: >> +7894: word: stream +skipping to endstream +7945: word: endstream +7955: word: endobj +7963: integer: 50 +7966: integer: 0 +7968: word: obj +7972: integer: 44 +7975: word: endobj +8034: integer: 51 +8037: integer: 0 +8039: word: obj +8043: dict_open: << +8048: name: /Length +8056: integer: 52 +8059: integer: 0 +8061: word: R +8063: dict_close: >> +8066: word: stream +skipping to endstream +8117: word: endstream +8127: word: endobj +8135: integer: 52 +8138: integer: 0 +8140: word: obj +8144: integer: 44 +8147: word: endobj +8206: integer: 53 +8209: integer: 0 +8211: word: obj +8215: dict_open: << +8220: name: /Length +8228: integer: 54 +8231: integer: 0 +8233: word: R +8235: dict_close: >> +8238: word: stream +skipping to endstream +8289: word: endstream +8299: word: endobj +8307: integer: 54 +8310: integer: 0 +8312: word: obj +8316: integer: 44 +8319: word: endobj +8379: integer: 55 +8382: integer: 0 +8384: word: obj +8388: dict_open: << +8393: name: /Length +8401: integer: 56 +8404: integer: 0 +8406: word: R +8408: dict_close: >> +8411: word: stream +skipping to endstream +8462: word: endstream +8472: word: endobj +8480: integer: 56 +8483: integer: 0 +8485: word: obj +8489: integer: 44 +8492: word: endobj +8552: integer: 57 +8555: integer: 0 +8557: word: obj +8561: dict_open: << +8566: name: /Length +8574: integer: 58 +8577: integer: 0 +8579: word: R +8581: dict_close: >> +8584: word: stream +skipping to endstream +8635: word: endstream +8645: word: endobj +8653: integer: 58 +8656: integer: 0 +8658: word: obj +8662: integer: 44 +8665: word: endobj +8673: integer: 59 +8676: integer: 0 +8678: word: obj +8682: dict_open: << +8687: name: /Type +8693: name: /XRef +8701: name: /Length +8709: integer: 240 +8715: name: /W +8718: array_open: [ +8720: integer: 1 +8722: integer: 2 +8724: integer: 1 +8726: array_close: ] +8730: name: /Root +8736: integer: 2 +8738: integer: 0 +8740: word: R +8744: name: /Size +8750: integer: 60 +8755: name: /ID +8759: array_open: [ +8760: string: \x88\x04\x8e\x17\xc9a\xe0\x94\xff\xec\xe9\x8c\xb8\x8cF\xd0 (raw: <88048e17c961e094ffece98cb88c46d0>) +8794: string: \xed\xd6\x0f\xe8\xee\x87\xf8\x871\xa8o\x81\x9f\xe6Q\x99 (raw: ) +8828: array_close: ] +8830: dict_close: >> +8833: word: stream +skipping to endstream +9081: word: endstream +9091: word: endobj +9099: word: startxref +9109: integer: 8673 +9120: eof +--- END FILE --- +--- BEGIN PAGE 1 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: integer: 72 +20: integer: 720 +24: word: Td +29: string: Potato (raw: (Potato)) +38: word: Tj +41: word: ET +44: eof +--- END PAGE 1 --- +--- BEGIN PAGE 2 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: integer: 72 +20: integer: 720 +24: word: Td +29: string: Potato (raw: (Potato)) +38: word: Tj +41: word: ET +44: word: BI +47: name: /CS +51: name: /G +53: name: /W +56: integer: 66 +58: name: /H +61: integer: 47 +63: name: /BPC +68: integer: 8 +69: name: /F +71: name: /Fl +74: name: /DP +77: dict_open: << +79: name: /Predictor +90: integer: 15 +92: name: /Columns +101: integer: 66 +103: dict_close: >> +106: word: ID +skipping to EI +352: word: EI +355: word: BT +360: name: /F1 +364: integer: 24 +367: word: Tf +372: integer: 72 +375: integer: 720 +379: word: Td +384: string: Potato (raw: (Potato)) +393: word: Tj +396: word: ET +399: eof +--- END PAGE 2 --- +--- BEGIN PAGE 3 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: integer: 72 +20: integer: 720 +24: word: Td +29: bad: Potato\x0aET\x0a (raw: (Potato\x0aET\x0a) (EOF while reading token) +40: eof +--- END PAGE 3 --- +--- BEGIN PAGE 4 --- +0: word: BT +5: name: /F1 +9: integer: 24 +12: word: Tf +17: string: \xfe\xeb (raw: ) +26: string: \xab\xcd (raw: ) +36: string: quack (raw: (qu\\x0d\x0aack)) +49: string: quack (raw: (qu\\x0aack)) +61: string: quack (raw: (qu\\x0dack)) +73: integer: 72 +76: integer: 720 +80: word: Td +85: real: 3.14 +92: real: 3. +97: real: .14 +103: real: +3.14 +111: real: +3. +117: real: +.14 +124: real: -3.14 +132: real: -3. +138: real: -.14 +145: integer: +16059 +154: integer: -16059 +163: word: +. +168: bad: fadeE (raw: (unexpected >) +179: word: quack +185: bad: /name#oops (invalid name token) +196: name: /name (raw: /n#61me) +204: word: one +208: bool: true +213: word: two +217: bool: false +223: word: three +229: null: null +234: word: four +239: word: !@#$^& +245: brace_open: { +246: brace_close: } +247: word: *-_+= +253: word: abc123def3.14true +271: bad: ff (raw: > +420: dict_open: << +425: name: /Count +432: integer: 11 +437: name: /Kids +443: array_open: [ +449: integer: 4 +451: integer: 0 +453: word: R +459: integer: 5 +461: integer: 0 +463: word: R +469: integer: 6 +471: integer: 0 +473: word: R +479: integer: 7 +481: integer: 0 +483: word: R +489: integer: 8 +491: integer: 0 +493: word: R +499: integer: 9 +501: integer: 0 +503: word: R +509: integer: 10 +512: integer: 0 +514: word: R +520: integer: 11 +523: integer: 0 +525: word: R +531: integer: 12 +534: integer: 0 +536: word: R +542: integer: 13 +545: integer: 0 +547: word: R +553: integer: 14 +556: integer: 0 +558: word: R +562: array_close: ] +566: name: /Type +572: name: /Pages +579: dict_close: >> +651: dict_open: << +656: name: /Contents +666: integer: 37 +669: integer: 0 +671: word: R +675: name: /MediaBox +685: array_open: [ +691: integer: 0 +697: integer: 0 +703: integer: 612 +711: integer: 792 +717: array_close: ] +721: name: /Parent +729: integer: 3 +731: integer: 0 +733: word: R +737: name: /Resources +748: dict_open: << +755: name: /Font +761: dict_open: << +770: name: /F1 +774: integer: 15 +777: integer: 0 +779: word: R +785: dict_close: >> +792: name: /ProcSet +801: integer: 16 +804: integer: 0 +806: word: R +810: dict_close: >> +815: name: /Type +821: name: /Page +827: dict_close: >> +899: dict_open: << +904: name: /Contents +914: integer: 39 +917: integer: 0 +919: word: R +923: name: /MediaBox +933: array_open: [ +939: integer: 0 +945: integer: 0 +951: integer: 612 +959: integer: 792 +965: array_close: ] +969: name: /Parent +977: integer: 3 +979: integer: 0 +981: word: R +985: name: /Resources +996: dict_open: << +1003: name: /Font +1009: dict_open: << +1018: name: /F1 +1022: integer: 17 +1025: integer: 0 +1027: word: R +1033: dict_close: >> +1040: name: /ProcSet +1049: integer: 18 +1052: integer: 0 +1054: word: R +1058: dict_close: >> +1063: name: /Type +1069: name: /Page +1075: dict_close: >> +1147: dict_open: << +1152: name: /Contents +1162: integer: 41 +1165: integer: 0 +1167: word: R +1171: name: /MediaBox +1181: array_open: [ +1187: integer: 0 +1193: integer: 0 +1199: integer: 612 +1207: integer: 792 +1213: array_close: ] +1217: name: /Parent +1225: integer: 3 +1227: integer: 0 +1229: word: R +1233: name: /Resources +1244: dict_open: << +1251: name: /Font +1257: dict_open: << +1266: name: /F1 +1270: integer: 19 +1273: integer: 0 +1275: word: R +1281: dict_close: >> +1288: name: /ProcSet +1297: integer: 20 +1300: integer: 0 +1302: word: R +1306: dict_close: >> +1311: name: /Type +1317: name: /Page +1323: dict_close: >> +1395: dict_open: << +1400: name: /Contents +1410: integer: 43 +1413: integer: 0 +1415: word: R +1419: name: /MediaBox +1429: array_open: [ +1435: integer: 0 +1441: integer: 0 +1447: integer: 612 +1455: integer: 792 +1461: array_close: ] +1465: name: /Parent +1473: integer: 3 +1475: integer: 0 +1477: word: R +1481: name: /Resources +1492: dict_open: << +1499: name: /Font +1505: dict_open: << +1514: name: /F1 +1518: integer: 21 +1521: integer: 0 +1523: word: R +1529: dict_close: >> +1536: name: /ProcSet +1545: integer: 22 +1548: integer: 0 +1550: word: R +1554: dict_close: >> +1559: name: /Type +1565: name: /Page +1571: dict_close: >> +1643: dict_open: << +1648: name: /Contents +1658: integer: 45 +1661: integer: 0 +1663: word: R +1667: name: /MediaBox +1677: array_open: [ +1683: integer: 0 +1689: integer: 0 +1695: integer: 612 +1703: integer: 792 +1709: array_close: ] +1713: name: /Parent +1721: integer: 3 +1723: integer: 0 +1725: word: R +1729: name: /Resources +1740: dict_open: << +1747: name: /Font +1753: dict_open: << +1762: name: /F1 +1766: integer: 23 +1769: integer: 0 +1771: word: R +1777: dict_close: >> +1784: name: /ProcSet +1793: integer: 24 +1796: integer: 0 +1798: word: R +1802: dict_close: >> +1807: name: /Type +1813: name: /Page +1819: dict_close: >> +1891: dict_open: << +1896: name: /Contents +1906: integer: 47 +1909: integer: 0 +1911: word: R +1915: name: /MediaBox +1925: array_open: [ +1931: integer: 0 +1937: integer: 0 +1943: integer: 612 +1951: integer: 792 +1957: array_close: ] +1961: name: /Parent +1969: integer: 3 +1971: integer: 0 +1973: word: R +1977: name: /Resources +1988: dict_open: << +1995: name: /Font +2001: dict_open: << +2010: name: /F1 +2014: integer: 25 +2017: integer: 0 +2019: word: R +2025: dict_close: >> +2032: name: /ProcSet +2041: integer: 26 +2044: integer: 0 +2046: word: R +2050: dict_close: >> +2055: name: /Type +2061: name: /Page +2067: dict_close: >> +2141: dict_open: << +2146: name: /Contents +2156: integer: 49 +2159: integer: 0 +2161: word: R +2165: name: /MediaBox +2175: array_open: [ +2181: integer: 0 +2187: integer: 0 +2193: integer: 612 +2201: integer: 792 +2207: array_close: ] +2211: name: /Parent +2219: integer: 3 +2221: integer: 0 +2223: word: R +2227: name: /Resources +2238: dict_open: << +2245: name: /Font +2251: dict_open: << +2260: name: /F1 +2264: integer: 27 +2267: integer: 0 +2269: word: R +2275: dict_close: >> +2282: name: /ProcSet +2291: integer: 28 +2294: integer: 0 +2296: word: R +2300: dict_close: >> +2305: name: /Type +2311: name: /Page +2317: dict_close: >> +2391: dict_open: << +2396: name: /Contents +2406: integer: 51 +2409: integer: 0 +2411: word: R +2415: name: /MediaBox +2425: array_open: [ +2431: integer: 0 +2437: integer: 0 +2443: integer: 612 +2451: integer: 792 +2457: array_close: ] +2461: name: /Parent +2469: integer: 3 +2471: integer: 0 +2473: word: R +2477: name: /Resources +2488: dict_open: << +2495: name: /Font +2501: dict_open: << +2510: name: /F1 +2514: integer: 29 +2517: integer: 0 +2519: word: R +2525: dict_close: >> +2532: name: /ProcSet +2541: integer: 30 +2544: integer: 0 +2546: word: R +2550: dict_close: >> +2555: name: /Type +2561: name: /Page +2567: dict_close: >> +2642: dict_open: << +2647: name: /Contents +2657: integer: 53 +2660: integer: 0 +2662: word: R +2666: name: /MediaBox +2676: array_open: [ +2682: integer: 0 +2688: integer: 0 +2694: integer: 612 +2702: integer: 792 +2708: array_close: ] +2712: name: /Parent +2720: integer: 3 +2722: integer: 0 +2724: word: R +2728: name: /Resources +2739: dict_open: << +2746: name: /Font +2752: dict_open: << +2761: name: /F1 +2765: integer: 31 +2768: integer: 0 +2770: word: R +2776: dict_close: >> +2783: name: /ProcSet +2792: integer: 32 +2795: integer: 0 +2797: word: R +2801: dict_close: >> +2806: name: /Type +2812: name: /Page +2818: dict_close: >> +2894: dict_open: << +2899: name: /Contents +2909: integer: 55 +2912: integer: 0 +2914: word: R +2918: name: /MediaBox +2928: array_open: [ +2934: integer: 0 +2940: integer: 0 +2946: integer: 612 +2954: integer: 792 +2960: array_close: ] +2964: name: /Parent +2972: integer: 3 +2974: integer: 0 +2976: word: R +2980: name: /Resources +2991: dict_open: << +2998: name: /Font +3004: dict_open: << +3013: name: /F1 +3017: integer: 33 +3020: integer: 0 +3022: word: R +3028: dict_close: >> +3035: name: /ProcSet +3044: integer: 34 +3047: integer: 0 +3049: word: R +3053: dict_close: >> +3058: name: /Type +3064: name: /Page +3070: dict_close: >> +3146: dict_open: << +3151: name: /Contents +3161: integer: 57 +3164: integer: 0 +3166: word: R +3170: name: /MediaBox +3180: array_open: [ +3186: integer: 0 +3192: integer: 0 +3198: integer: 612 +3206: integer: 792 +3212: array_close: ] +3216: name: /Parent +3224: integer: 3 +3226: integer: 0 +3228: word: R +3232: name: /Resources +3243: dict_open: << +3250: name: /Font +3256: dict_open: << +3265: name: /F1 +3269: integer: 35 +3272: integer: 0 +3274: word: R +3280: dict_close: >> +3287: name: /ProcSet +3296: integer: 36 +3299: integer: 0 +3301: word: R +3305: dict_close: >> +3310: name: /Type +3316: name: /Page +3322: dict_close: >> +3387: dict_open: << +3392: name: /BaseFont +3402: name: /Helvetica +3415: name: /Encoding +3425: name: /WinAnsiEncoding +3444: name: /Name +3450: name: /F1 +3456: name: /Subtype +3465: name: /Type1 +3474: name: /Type +3480: name: /Font +3486: dict_close: >> +3551: array_open: [ +3555: name: /PDF +3562: name: /Text +3568: array_close: ] +3632: dict_open: << +3637: name: /BaseFont +3647: name: /Helvetica +3660: name: /Encoding +3670: name: /WinAnsiEncoding +3689: name: /Name +3695: name: /F1 +3701: name: /Subtype +3710: name: /Type1 +3719: name: /Type +3725: name: /Font +3731: dict_close: >> +3796: array_open: [ +3800: name: /PDF +3807: name: /Text +3813: array_close: ] +3877: dict_open: << +3882: name: /BaseFont +3892: name: /Helvetica +3905: name: /Encoding +3915: name: /WinAnsiEncoding +3934: name: /Name +3940: name: /F1 +3946: name: /Subtype +3955: name: /Type1 +3964: name: /Type +3970: name: /Font +3976: dict_close: >> +4041: array_open: [ +4045: name: /PDF +4052: name: /Text +4058: array_close: ] +4122: dict_open: << +4127: name: /BaseFont +4137: name: /Helvetica +4150: name: /Encoding +4160: name: /WinAnsiEncoding +4179: name: /Name +4185: name: /F1 +4191: name: /Subtype +4200: name: /Type1 +4209: name: /Type +4215: name: /Font +4221: dict_close: >> +4286: array_open: [ +4290: name: /PDF +4297: name: /Text +4303: array_close: ] +4367: dict_open: << +4372: name: /BaseFont +4382: name: /Helvetica +4395: name: /Encoding +4405: name: /WinAnsiEncoding +4424: name: /Name +4430: name: /F1 +4436: name: /Subtype +4445: name: /Type1 +4454: name: /Type +4460: name: /Font +4466: dict_close: >> +4531: array_open: [ +4535: name: /PDF +4542: name: /Text +4548: array_close: ] +4612: dict_open: << +4617: name: /BaseFont +4627: name: /Helvetica +4640: name: /Encoding +4650: name: /WinAnsiEncoding +4669: name: /Name +4675: name: /F1 +4681: name: /Subtype +4690: name: /Type1 +4699: name: /Type +4705: name: /Font +4711: dict_close: >> +4776: array_open: [ +4780: name: /PDF +4787: name: /Text +4793: array_close: ] +4857: dict_open: << +4862: name: /BaseFont +4872: name: /Helvetica +4885: name: /Encoding +4895: name: /WinAnsiEncoding +4914: name: /Name +4920: name: /F1 +4926: name: /Subtype +4935: name: /Type1 +4944: name: /Type +4950: name: /Font +4956: dict_close: >> +5021: array_open: [ +5025: name: /PDF +5032: name: /Text +5038: array_close: ] +5102: dict_open: << +5107: name: /BaseFont +5117: name: /Helvetica +5130: name: /Encoding +5140: name: /WinAnsiEncoding +5159: name: /Name +5165: name: /F1 +5171: name: /Subtype +5180: name: /Type1 +5189: name: /Type +5195: name: /Font +5201: dict_close: >> +5266: array_open: [ +5270: name: /PDF +5277: name: /Text +5283: array_close: ] +5347: dict_open: << +5352: name: /BaseFont +5362: name: /Helvetica +5375: name: /Encoding +5385: name: /WinAnsiEncoding +5404: name: /Name +5410: name: /F1 +5416: name: /Subtype +5425: name: /Type1 +5434: name: /Type +5440: name: /Font +5446: dict_close: >> +5511: array_open: [ +5515: name: /PDF +5522: name: /Text +5528: array_close: ] +5592: dict_open: << +5597: name: /BaseFont +5607: name: /Helvetica +5620: name: /Encoding +5630: name: /WinAnsiEncoding +5649: name: /Name +5655: name: /F1 +5661: name: /Subtype +5670: name: /Type1 +5679: name: /Type +5685: name: /Font +5691: dict_close: >> +5756: array_open: [ +5760: name: /PDF +5767: name: /Text +5773: array_close: ] +5837: dict_open: << +5842: name: /BaseFont +5852: name: /Helvetica +5865: name: /Encoding +5875: name: /WinAnsiEncoding +5894: name: /Name +5900: name: /F1 +5906: name: /Subtype +5915: name: /Type1 +5924: name: /Type +5930: name: /Font +5936: dict_close: >> +6001: array_open: [ +6005: name: /PDF +6012: name: /Text +6018: array_close: ] +6020: eof +--- END OBJECT STREAM 1 --- diff --git a/qpdf/qtest/qpdf/tokens.pdf b/qpdf/qtest/qpdf/tokens.pdf new file mode 100644 index 00000000..b444db5f Binary files /dev/null and b/qpdf/qtest/qpdf/tokens.pdf differ diff --git a/qpdf/test_tokenizer.cc b/qpdf/test_tokenizer.cc new file mode 100644 index 00000000..de079195 --- /dev/null +++ b/qpdf/test_tokenizer.cc @@ -0,0 +1,261 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static char const* whoami = 0; + +void usage() +{ + std::cerr << "Usage: " << whoami << " filename" + << std::endl; + exit(2); +} + +class Finder: public InputSource::Finder +{ + public: + Finder(PointerHolder is, std::string const& str) : + is(is), + str(str) + { + } + virtual ~Finder() + { + } + virtual bool check(); + + private: + PointerHolder is; + std::string str; +}; + +bool +Finder::check() +{ + QPDFTokenizer tokenizer; + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); + qpdf_offset_t offset = this->is->tell(); + bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)); + this->is->seek(offset - this->str.length(), SEEK_SET); + return result; +} + +static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype) +{ + // Do this is a case statement instead of a lookup so the compiler + // will warn if we miss any. + switch (ttype) + { + case QPDFTokenizer::tt_bad: + return "bad"; + case QPDFTokenizer::tt_array_close: + return "array_close"; + case QPDFTokenizer::tt_array_open: + return "array_open"; + case QPDFTokenizer::tt_brace_close: + return "brace_close"; + case QPDFTokenizer::tt_brace_open: + return "brace_open"; + case QPDFTokenizer::tt_dict_close: + return "dict_close"; + case QPDFTokenizer::tt_dict_open: + return "dict_open"; + case QPDFTokenizer::tt_integer: + return "integer"; + case QPDFTokenizer::tt_name: + return "name"; + case QPDFTokenizer::tt_real: + return "real"; + case QPDFTokenizer::tt_string: + return "string"; + case QPDFTokenizer::tt_null: + return "null"; + case QPDFTokenizer::tt_bool: + return "bool"; + case QPDFTokenizer::tt_word: + return "word"; + case QPDFTokenizer::tt_eof: + return "eof"; + } + return 0; +} + +static std::string +sanitize(std::string const& value) +{ + std::string result; + for (std::string::const_iterator iter = value.begin(); iter != value.end(); + ++iter) + { + if ((*iter >= 32) && (*iter <= 126)) + { + result.append(1, *iter); + } + else + { + result += "\\x" + QUtil::int_to_string_base( + static_cast(*iter), 16, 2); + } + } + return result; +} + +static void +try_skipping(PointerHolder is, char const* what, Finder& f) +{ + std::cout << "skipping to " << what << std::endl; + qpdf_offset_t offset = is->tell(); + if (! is->findFirst(what, offset, 0, f)) + { + std::cout << what << " not found" << std::endl; + is->seek(offset, SEEK_SET); + } +} + +static void +dump_tokens(PointerHolder is, std::string const& label, + bool skip_streams, bool skip_inline_images) +{ + Finder f1(is, "endstream"); + Finder f2(is, "EI"); + std::cout << "--- BEGIN " << label << " ---" << std::endl; + bool done = false; + QPDFTokenizer tokenizer; + tokenizer.allowEOF(); + while (! done) + { + QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true); + + qpdf_offset_t offset = is->tell() - token.getRawValue().length(); + std::cout << offset << ": " + << tokenTypeName(token.getType()); + if (token.getType() != QPDFTokenizer::tt_eof) + { + std::cout << ": " + << sanitize(token.getValue()); + if (token.getValue() != token.getRawValue()) + { + std::cout << " (raw: " << sanitize(token.getRawValue()) << ")"; + } + } + if (token.getType() == QPDFTokenizer::tt_bad) + { + std::cout << " (" << token.getErrorMessage() << ")"; + } + std::cout << std::endl; + if (skip_streams && + (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))) + { + try_skipping(is, "endstream", f1); + } + else if (skip_inline_images && + (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))) + { + try_skipping(is, "EI", f2); + } + else if (token.getType() == QPDFTokenizer::tt_eof) + { + done = true; + } + } + std::cout << "--- END " << label << " ---" << std::endl; +} + +static void process(char const* filename) +{ + PointerHolder is; + QPDFTokenizer tokenizer; + tokenizer.allowEOF(); + + // Tokenize file, skipping streams + FileInputSource* fis = new FileInputSource(); + fis->setFilename(filename); + is = fis; + dump_tokens(is, "FILE", true, false); + + // Tokenize content streams, skipping inline images + QPDF qpdf; + qpdf.processFile(filename); + std::vector pages = qpdf.getAllPages(); + int pageno = 0; + for (std::vector::iterator iter = pages.begin(); + iter != pages.end(); ++iter) + { + ++pageno; + Pl_Buffer plb("buffer"); + std::vector contents = (*iter).getPageContents(); + for (std::vector::iterator citer = contents.begin(); + citer != contents.end(); ++citer) + { + (*citer).pipeStreamData(&plb, 0, qpdf_dl_specialized); + } + plb.finish(); + PointerHolder content_data = plb.getBuffer(); + BufferInputSource* bis = new BufferInputSource( + "content data", content_data.getPointer()); + is = bis; + dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), false, true); + } + + // Tokenize object streams + std::vector all = qpdf.getAllObjects(); + for (std::vector::iterator iter = all.begin(); + iter != all.end(); ++iter) + { + if ((*iter).isStream() && + (*iter).getDict().getKey("/Type").isName() && + (*iter).getDict().getKey("/Type").getName() == "/ObjStm") + { + PointerHolder b = + (*iter).getStreamData(qpdf_dl_specialized); + BufferInputSource* bis = new BufferInputSource( + "object stream data", b.getPointer()); + is = bis; + dump_tokens(is, "OBJECT STREAM " + + QUtil::int_to_string((*iter).getObjectID()), + false, false); + } + } +} + +int main(int argc, char* argv[]) +{ + QUtil::setLineBuf(stdout); + if ((whoami = strrchr(argv[0], '/')) == NULL) + { + whoami = argv[0]; + } + else + { + ++whoami; + } + // For libtool's sake.... + if (strncmp(whoami, "lt-", 3) == 0) + { + whoami += 3; + } + + if (argc != 2) + { + usage(); + } + + char const* filename = argv[1]; + try + { + process(filename); + } + catch (std::exception& e) + { + std::cerr << whoami << ": exception: " << e.what(); + exit(2); + } + return 0; +} -- cgit v1.2.3-54-g00ecf