about summary refs log tree commit diff stats
path: root/qpdf/test_tokenizer.cc
diff options
context:
space:
mode:
author    Jay Berkenbilt <ejb@ql.org>  2018-01-29 00:28:45 +0100
committer Jay Berkenbilt <ejb@ql.org>  2018-02-19 02:18:40 +0100
commit   d97474868d7fa6a94bab49d89af5dd82fd5e3a41 (patch)
tree     754e4741adf505081e81a30bcd3c4395acb066f9 /qpdf/test_tokenizer.cc
parent   bb9e91adbd75d05d0d60227b2d419d7ee12e1b42 (diff)
download qpdf-d97474868d7fa6a94bab49d89af5dd82fd5e3a41.tar.zst
Lexer enhancements: EOF, comment, space
Significant enhancements to the lexer to improve EOF handling and to support comments and spaces as tokens. Various other minor issues were fixed as well.
Diffstat (limited to 'qpdf/test_tokenizer.cc')
-rw-r--r--  qpdf/test_tokenizer.cc  74
1 file changed, 59 insertions(+), 15 deletions(-)
diff --git a/qpdf/test_tokenizer.cc b/qpdf/test_tokenizer.cc
index de079195..6e694bfb 100644
--- a/qpdf/test_tokenizer.cc
+++ b/qpdf/test_tokenizer.cc
@@ -13,7 +13,8 @@ static char const* whoami = 0;
void usage()
{
- std::cerr << "Usage: " << whoami << " filename"
+ std::cerr << "Usage: " << whoami
+ << " [-maxlen len | -no-ignorable] filename"
<< std::endl;
exit(2);
}
@@ -83,6 +84,10 @@ static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype)
return "word";
case QPDFTokenizer::tt_eof:
return "eof";
+ case QPDFTokenizer::tt_space:
+ return "space";
+ case QPDFTokenizer::tt_comment:
+ return "comment";
}
return 0;
}
@@ -108,7 +113,8 @@ sanitize(std::string const& value)
}
static void
-try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)
+try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
+ size_t max_len, char const* what, Finder& f)
{
std::cout << "skipping to " << what << std::endl;
qpdf_offset_t offset = is->tell();
@@ -121,6 +127,7 @@ try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)
static void
dump_tokens(PointerHolder<InputSource> is, std::string const& label,
+ size_t max_len, bool include_ignorable,
bool skip_streams, bool skip_inline_images)
{
Finder f1(is, "endstream");
@@ -129,11 +136,16 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
bool done = false;
QPDFTokenizer tokenizer;
tokenizer.allowEOF();
+ if (include_ignorable)
+ {
+ tokenizer.includeIgnorable();
+ }
while (! done)
{
- QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true);
+ QPDFTokenizer::Token token =
+ tokenizer.readToken(is, "test", true, max_len);
- qpdf_offset_t offset = is->tell() - token.getRawValue().length();
+ qpdf_offset_t offset = is->getLastOffset();
std::cout << offset << ": "
<< tokenTypeName(token.getType());
if (token.getType() != QPDFTokenizer::tt_eof)
@@ -153,12 +165,12 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
if (skip_streams &&
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")))
{
- try_skipping(is, "endstream", f1);
+ try_skipping(tokenizer, is, max_len, "endstream", f1);
}
else if (skip_inline_images &&
(token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
{
- try_skipping(is, "EI", f2);
+ try_skipping(tokenizer, is, max_len, "EI", f2);
}
else if (token.getType() == QPDFTokenizer::tt_eof)
{
@@ -168,17 +180,16 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
std::cout << "--- END " << label << " ---" << std::endl;
}
-static void process(char const* filename)
+static void process(char const* filename, bool include_ignorable,
+ size_t max_len)
{
PointerHolder<InputSource> is;
- QPDFTokenizer tokenizer;
- tokenizer.allowEOF();
// Tokenize file, skipping streams
FileInputSource* fis = new FileInputSource();
fis->setFilename(filename);
is = fis;
- dump_tokens(is, "FILE", true, false);
+ dump_tokens(is, "FILE", max_len, include_ignorable, true, false);
// Tokenize content streams, skipping inline images
QPDF qpdf;
@@ -201,7 +212,8 @@ static void process(char const* filename)
BufferInputSource* bis = new BufferInputSource(
"content data", content_data.getPointer());
is = bis;
- dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), false, true);
+ dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
+ max_len, include_ignorable, false, true);
}
// Tokenize object streams
@@ -220,7 +232,7 @@ static void process(char const* filename)
is = bis;
dump_tokens(is, "OBJECT STREAM " +
QUtil::int_to_string((*iter).getObjectID()),
- false, false);
+ max_len, include_ignorable, false, false);
}
}
}
@@ -242,15 +254,47 @@ int main(int argc, char* argv[])
whoami += 3;
}
- if (argc != 2)
+ char const* filename = 0;
+ size_t max_len = 0;
+ bool include_ignorable = true;
+ for (int i = 1; i < argc; ++i)
+ {
+ if (argv[i][0] == '-')
+ {
+ if (strcmp(argv[i], "-maxlen") == 0)
+ {
+ if (++i >= argc)
+ {
+ usage();
+ }
+ max_len = QUtil::string_to_int(argv[i]);
+ }
+ else if (strcmp(argv[i], "-no-ignorable") == 0)
+ {
+ include_ignorable = false;
+ }
+ else
+ {
+ usage();
+ }
+ }
+ else if (filename)
+ {
+ usage();
+ }
+ else
+ {
+ filename = argv[i];
+ }
+ }
+ if (filename == 0)
{
usage();
}
- char const* filename = argv[1];
try
{
- process(filename);
+ process(filename, include_ignorable, max_len);
}
catch (std::exception& e)
{