aboutsummaryrefslogtreecommitdiffstats
path: root/examples/pdf-filter-tokens.cc
blob: 4a06bcd2bf694e1ebaa49699c4242d595a6d4785 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
//
// This example illustrates the use of QPDFObjectHandle::TokenFilter with addContentTokenFilter.
// Please see comments inline for details. See also pdf-count-strings.cc for a use of
// QPDFObjectHandle::TokenFilter with filterContents.
//

#include <algorithm>
#include <cstdlib>
#include <deque>
#include <iostream>
#include <memory>

#include <qpdf/QPDF.hh>
#include <qpdf/QPDFObjectHandle.hh>
#include <qpdf/QPDFPageDocumentHelper.hh>
#include <qpdf/QPDFWriter.hh>
#include <qpdf/QUtil.hh>

// Program name shown in the usage text and in error messages. main() assigns it from argv[0] via
// QUtil::getWhoami before anything reads it.
static char const* whoami = nullptr;

// Print the expected command-line arguments to standard error, then terminate the process with
// exit status 2. This function does not return to its caller.
void
usage()
{
    std::cerr << "Usage: " << whoami << " infile outfile" << '\n';
    std::cerr << "Applies token filters to infile and writes outfile" << '\n' << std::flush;
    exit(2);
}

// The StringReverser class is a trivial example of using a token filter. This class only overrides
// the pure virtual handleToken function and preserves the default handleEOF function.
class StringReverser: public QPDFObjectHandle::TokenFilter
{
  public:
    ~StringReverser() override = default;
    void handleToken(QPDFTokenizer::Token const&) override;
};

// Forward every token to the output, except that string tokens are replaced by a new string token
// whose bytes are in reverse order.
void
StringReverser::handleToken(QPDFTokenizer::Token const& token)
{
    // Anything that is not a string goes straight through unchanged.
    if (token.getType() != QPDFTokenizer::tt_string) {
        writeToken(token);
        return;
    }

    // Write a freshly constructed string token rather than raw text, so the library deals with
    // the subtleties of encoding unprintable characters. The reversal is byte-wise and makes no
    // attempt to handle multibyte characters; this is a demonstration of a minimal filter that
    // writes a constructed token, not a correct way to reverse strings.
    std::string const& original = token.getValue();
    std::string const reversed(original.rbegin(), original.rend());
    writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, reversed));
}

// The ColorToGray filter finds all "rg" operators in the content stream and replaces them with "g"
// operators, thus mapping color to grayscale. Note that it only applies to content streams, not
// images, so this will not replace color images with grayscale
// images.
class ColorToGray: public QPDFObjectHandle::TokenFilter
{
  public:
    ~ColorToGray() override = default;
    void handleToken(QPDFTokenizer::Token const&) override;
    void handleEOF() override;

  private:
    bool isNumeric(QPDFTokenizer::token_type_e);
    bool isIgnorable(QPDFTokenizer::token_type_e);
    double numericValue(QPDFTokenizer::Token const&);

    std::deque<QPDFTokenizer::Token> all_stack;
    std::deque<QPDFTokenizer::Token> stack;
};

// Return true when token_type is an integer or a real token type, and false otherwise.
bool
ColorToGray::isNumeric(QPDFTokenizer::token_type_e token_type)
{
    if (token_type == QPDFTokenizer::tt_integer) {
        return true;
    }
    return token_type == QPDFTokenizer::tt_real;
}

// Return true when token_type is a space or a comment token type, and false otherwise. These are
// the tokens the filter skips over when matching operands and operators.
bool
ColorToGray::isIgnorable(QPDFTokenizer::token_type_e token_type)
{
    if (token_type == QPDFTokenizer::tt_space) {
        return true;
    }
    return token_type == QPDFTokenizer::tt_comment;
}

// Convert a token to a double by parsing its text as a PDF object and reading that object's
// numeric value. Callers in this file only pass tokens that isNumeric has accepted.
double
ColorToGray::numericValue(QPDFTokenizer::Token const& token)
{
    QPDFObjectHandle parsed = QPDFObjectHandle::parse(token.getValue());
    return parsed.getNumericValue();
}

// Process one token from the content stream. A sequence of three numbers followed by the "rg"
// operator is replaced by a single gray level followed by the "g" operator; every other token is
// eventually written to the output unchanged.
void
ColorToGray::handleToken(QPDFTokenizer::Token const& token)
{
    // Track the number of non-ignorable tokens we've seen. If we see an "rg" following three
    // numbers, convert it to a grayscale value. Keep writing tokens to the output as we can.

    // There are several things to notice here. We keep two stacks: one of "meaningful" tokens, and
    // one of all tokens. This way we can preserve whitespace or comments that we encounter in the
    // stream and thereby preserve layout. As we receive tokens, we keep the last four meaningful
    // tokens. If we see three numbers followed by rg, we use the three numbers to calculate a gray
    // value that is perceptually similar to the color value and then write the "g" operator to the
    // output, discarding any spaces or comments encountered embedded in the "rg" operator.

    // The stack and all_stack members are updated in such a way that they always contain exactly
    // the same non-ignorable tokens. The stack member contains the tokens that would be left if you
    // removed all space and comment tokens from all_stack.

    // On each new token, flush out any space or comment tokens. Store the incoming token. If we
    // just got an rg preceded by the right kinds of operands, replace the command. Flush any
    // additional accumulated tokens to keep the stack only four tokens deep.

    // Leading spaces and comments can never be part of a match, so emit them right away.
    while ((!this->all_stack.empty()) && isIgnorable(this->all_stack.at(0).getType())) {
        writeToken(this->all_stack.at(0));
        this->all_stack.pop_front();
    }
    this->all_stack.push_back(token);
    QPDFTokenizer::token_type_e token_type = token.getType();
    if (!isIgnorable(token_type)) {
        this->stack.push_back(token);
        if ((this->stack.size() == 4) && token.isWord("rg") &&
            (isNumeric(this->stack.at(0).getType())) && (isNumeric(this->stack.at(1).getType())) &&
            (isNumeric(this->stack.at(2).getType()))) {
            double r = numericValue(this->stack.at(0));
            double g = numericValue(this->stack.at(1));
            double b = numericValue(this->stack.at(2));
            // Weighted sum approximating perceived brightness: green contributes the most and
            // blue the least.
            double gray = ((0.3 * r) + (0.59 * g) + (0.11 * b));
            // Keep the result within the valid range for a gray level.
            if (gray > 1.0) {
                gray = 1.0;
            }
            if (gray < 0.0) {
                gray = 0.0;
            }
            write(QUtil::double_to_string(gray, 3));
            write(" g");
            // The matched operands and operator, along with any spaces or comments between them,
            // have been replaced and must not be written again.
            this->stack.clear();
            this->all_stack.clear();
        }
    }
    // No match with four meaningful tokens pending: the oldest one can no longer be part of a
    // match, so write it. The flush at the top guarantees the front of all_stack is that token.
    if (this->stack.size() == 4) {
        writeToken(this->all_stack.at(0));
        this->all_stack.pop_front();
        this->stack.pop_front();
    }
}

// End-of-input handler: write out, in order, every token that handleToken is still holding back,
// then empty the queue of held tokens.
void
ColorToGray::handleEOF()
{
    for (auto const& pending: this->all_stack) {
        writeToken(pending);
    }
    this->all_stack.clear();
}

int
main(int argc, char* argv[])
{
    whoami = QUtil::getWhoami(argv[0]);

    if (argc != 3) {
        usage();
    }
    char const* infilename = argv[1];
    char const* outfilename = argv[2];

    try {
        QPDF pdf;
        pdf.processFile(infilename);
        for (auto& page: QPDFPageDocumentHelper(pdf).getAllPages()) {
            // Attach two token filters to each page of this file. When the file is written, or when
            // the pages' contents are retrieved in any other way, the filters will be applied. See
            // comments on the filters for additional details.
            page.addContentTokenFilter(
                std::shared_ptr<QPDFObjectHandle::TokenFilter>(new StringReverser));
            page.addContentTokenFilter(
                std::shared_ptr<QPDFObjectHandle::TokenFilter>(new ColorToGray));
        }

        QPDFWriter w(pdf, outfilename);
        w.setStaticID(true); // for testing only
        w.setQDFMode(true);
        w.write();
    } catch (std::exception& e) {
        std::cerr << whoami << ": " << e.what() << std::endl;
        exit(2);
    }

    return 0;
}