From 68e4aec054dd735c0a808133acdf2fbca6c648c7 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Tue, 8 Feb 2022 09:07:33 -0500 Subject: Clarify qpdf's representation of names in the API Clarify that names are to appear in canonical form with PDF escaping resolved when used in non-parsing QPDFObjectHandle APIs and their C API counterparts. See https://github.com/qpdf/qpdf/discussions/625. --- include/qpdf/QPDFObjectHandle.hh | 65 +++++++++++++++++++++++++++++----------- include/qpdf/qpdf-c.h | 14 +++++++-- 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 2cc38e71..e7d03d89 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -145,18 +145,26 @@ class QPDFObjectHandle // TokenFilters. // // Please note that when you call token.getValue() on a token of - // type tt_string, you get the string value without any - // delimiters. token.getRawValue() will return something suitable - // for being written to output, or calling writeToken with a - // string token will also work. The correct way to construct a - // string token that would write the literal value (str) is - // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str"). A - // similar situation exists with tt_name. token.getValue() returns - // a normalized name with # codes resolved into characters, and - // may not be suitable for writing. You can pass it to - // QPDF_Name::normalizeName first, or you can use writeToken with - // a name token. The correct way to create a name token is - // QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/Name"). + // type tt_string or tt_name, you get the canonical, "parsed" + // representation of the token. For a string, this means that + // there are no delimiters, and for a name, it means that all + // escaping (# followed by two hex digits) has been resolved. + // qpdf's internal representation of a name includes the leading + // slash. 
As such, you can't write the value of token.getValue() + // directly to output that is supposed to be valid PDF syntax. If + // you want to do that, you need to call writeToken() instead, or + // you can retrieve the token as it appeared in the input with + // token.getRawValue(). To construct a new string or name token + // from a canonical representation, use + // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "parsed-str") or + // QPDFTokenizer::Token(QPDFTokenizer::tt_name, + // "/Canonical-Name"). Tokens created this way won't have a + // PDF-syntax raw value, but you can still write them with + // writeToken(). Example: + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/text/plain")) + // would write `/text#2fplain`, and + // writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, "a\\(b")) + // would write `(a\(b)` class QPDF_DLL_CLASS TokenFilter { public: @@ -519,6 +527,22 @@ class QPDFObjectHandle QPDF_DLL static QPDFObjectHandle newReal(double value, int decimal_places, bool trim_trailing_zeroes); + // Note about name objects: qpdf's internal representation of a + // PDF name is a sequence of bytes, excluding the NUL character, + // and starting with a slash. Name objects as represented in the + // PDF specification can contain characters escaped with #, but + // such escaping is not of concern when calling QPDFObjectHandle + // methods not directly relating to parsing. For example, + // newName("/text/plain").getName() and + // parse("/text#2fplain").getName() both return "/text/plain", + // while newName("/text/plain").unparse() and + // parse("/text#2fplain").unparse() both return "/text#2fplain". + // When working with the qpdf API for creating, retrieving, and + // modifying objects, you want to work with the internal, + // canonical representation. For names containing only alphanumeric + // characters, dashes, and underscores, there is no difference + // between the two representations. 
For a lengthy discussion, see + // https://github.com/qpdf/qpdf/discussions/625. QPDF_DLL static QPDFObjectHandle newName(std::string const& name); QPDF_DLL @@ -719,7 +743,9 @@ class QPDFObjectHandle QPDF_DLL bool getValueAsNumber(double&); - // Methods for name objects; see also name and array objects + // Methods for name objects. The returned name value is in qpdf's + // canonical form with all escaping resolved. See comments for + // newName() for details. QPDF_DLL std::string getName(); QPDF_DLL @@ -789,7 +815,10 @@ class QPDFObjectHandle QPDF_DLL Matrix getArrayAsMatrix(); - // Methods for dictionary objects. + // Methods for dictionary objects. In all dictionary methods, keys + // are specified/represented as canonical name strings starting + // with a leading slash and not containing any PDF syntax + // escaping. See comments for getName() for details. // Return an object that enables iteration over members. You can // do @@ -824,7 +853,9 @@ class QPDFObjectHandle QPDF_DLL std::map getDictAsMap(); - // Methods for name and array objects + // Methods for name and array objects. The name value is in qpdf's + // canonical form with all escaping resolved. See comments for + // newName() for details. QPDF_DLL bool isOrHasName(std::string const&); @@ -1237,8 +1268,8 @@ class QPDFObjectHandle // Return encoded as JSON. For most object types, there is an // obvious mapping. 
The JSON is generated as follows: - // * Names are encoded as strings representing the normalized value of - // getName() + // * Names are encoded as strings representing the normalized name + // in PDF syntax as returned by unparse() // * Indirect references are encoded as strings containing "obj gen R" // * Strings are encoded as UTF-8 strings with unrepresentable binary // characters encoded as \uHHHH diff --git a/include/qpdf/qpdf-c.h b/include/qpdf/qpdf-c.h index a7e2d042..e5962da9 100644 --- a/include/qpdf/qpdf-c.h +++ b/include/qpdf/qpdf-c.h @@ -654,7 +654,10 @@ extern "C" { /* Wrappers around QPDFObjectHandle methods. Be sure to read * corresponding comments in QPDFObjectHandle.hh to understand * what each function does and what kinds of objects it applies - * to. + * to. Note that names are to appear in a canonicalized form + * starting with a leading slash and with all PDF escaping + * resolved. See comments for getName() in QPDFObjectHandle.hh for + * details. */ QPDF_DLL @@ -790,6 +793,12 @@ extern "C" { QPDF_DLL qpdf_oh qpdf_oh_get_array_item(qpdf_data qpdf, qpdf_oh oh, int n); + /* In all dictionary APIs, keys are specified/represented as + * canonicalized name strings starting with / and with all PDF + * escaping resolved. See comments for getName() in + * QPDFObjectHandle.hh for details. + */ + /* "C"-specific dictionary key iteration */ /* Iteration is allowed on only one dictionary at a time. */ @@ -813,7 +822,8 @@ extern "C" { QPDF_DLL qpdf_oh qpdf_oh_get_key(qpdf_data qpdf, qpdf_oh oh, char const* key); QPDF_DLL - qpdf_oh qpdf_oh_get_key_if_dict(qpdf_data qpdf, qpdf_oh oh, char const* key); + qpdf_oh qpdf_oh_get_key_if_dict( + qpdf_data qpdf, qpdf_oh oh, char const* key); QPDF_DLL QPDF_BOOL qpdf_oh_is_or_has_name( -- cgit v1.2.3-54-g00ecf