From 3246923cf2189554f7c348ebf51c9774c09deec8 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Sat, 7 May 2022 08:20:09 -0400 Subject: Implement JSON v2 for String Also refine the heuristic for deciding whether to use hexadecimal notation for a string. --- libqpdf/QPDF_String.cc | 81 +++++++++++++++----------- libqpdf/qpdf/QPDF_String.hh | 1 + qpdf/qtest/qpdf/V4-clearmeta.pdf | Bin 15225 -> 15240 bytes qpdf/qtest/qpdf/direct-pages-json-objects.out | 4 +- qpdf/qtest/qpdf/direct-pages-json-pages.out | 4 +- qpdf/qtest/qpdf/good14.out | 2 +- qpdf/qtest/qpdf/merge-dict.out | 12 ++-- qpdf/qtest/qpdf/page_api_2-json-objects.out | 8 +-- qpdf/qtest/qpdf/page_api_2-json-pages.out | 8 +-- 9 files changed, 68 insertions(+), 52 deletions(-) diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index fd820998..4d45d851 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -45,8 +45,32 @@ QPDF_String::unparse() JSON QPDF_String::getJSON(int json_version) { - // QXXXQ - return JSON::makeString(getUTF8Val()); + if (json_version == 1) { + return JSON::makeString(getUTF8Val()); + } + // See if we can unambiguously represent as Unicode. + bool is_unicode = false; + std::string result; + std::string candidate = getUTF8Val(); + if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) { + is_unicode = true; + result = candidate; + } else if (!useHexString()) { + std::string test; + if (QUtil::utf8_to_pdf_doc(candidate, test, '?') && + (test == this->val)) { + // This is a PDF-doc string that can be losslessly encoded + // as Unicode.
+ is_unicode = true; + result = candidate; + } + } + if (is_unicode) { + result = "u:" + result; + } else { + result = "b:" + QUtil::hex_encode(this->val); + } + return JSON::makeString(result); } QPDFObject::object_type_e @@ -61,41 +85,32 @@ QPDF_String::getTypeName() const return "string"; } -std::string -QPDF_String::unparse(bool force_binary) +bool +QPDF_String::useHexString() const { - bool use_hexstring = force_binary; - if (!use_hexstring) { - unsigned int nonprintable = 0; - int consecutive_printable = 0; - for (unsigned int i = 0; i < this->val.length(); ++i) { - char ch = this->val.at(i); - // Note: do not use locale to determine printability. The - // PDF specification accepts arbitrary binary data. Some - // locales imply multibyte characters. We'll consider - // something printable if it is printable in 7-bit ASCII. - // We'll code this manually rather than being rude and - // setting locale. - if ((ch == 0) || - (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { - ++nonprintable; - consecutive_printable = 0; - } else { - if (++consecutive_printable > 5) { - // If there are more than 5 consecutive printable - // characters, I want to see them as such. - nonprintable = 0; - break; - } + // Heuristic: use the hexadecimal representation of a string if + // there are any non-printable (in PDF Doc encoding) characters or + // if too large of a proportion of the string consists of + // non-ASCII characters. + bool nonprintable = false; + unsigned int non_ascii = 0; + for (unsigned int i = 0; i < this->val.length(); ++i) { + char ch = this->val.at(i); + if ((ch == 0) || + (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) { + if ((ch >= 0) && (ch < 24)) { + nonprintable = true; } - } - - // Use hex notation if more than 20% of the characters are not - // printable in plain ASCII. 
- if (5 * nonprintable > val.length()) { - use_hexstring = true; + ++non_ascii; } } + return (nonprintable || (5 * non_ascii > val.length())); +} + +std::string +QPDF_String::unparse(bool force_binary) +{ + bool use_hexstring = force_binary || useHexString(); std::string result; if (use_hexstring) { result += "<" + QUtil::hex_encode(this->val) + ">"; diff --git a/libqpdf/qpdf/QPDF_String.hh b/libqpdf/qpdf/QPDF_String.hh index df33138c..6fd1b0e9 100644 --- a/libqpdf/qpdf/QPDF_String.hh +++ b/libqpdf/qpdf/QPDF_String.hh @@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject std::string getUTF8Val() const; private: + bool useHexString() const; std::string val; }; diff --git a/qpdf/qtest/qpdf/V4-clearmeta.pdf b/qpdf/qtest/qpdf/V4-clearmeta.pdf index 7d5786c0..5e804faa 100644 Binary files a/qpdf/qtest/qpdf/V4-clearmeta.pdf and b/qpdf/qtest/qpdf/V4-clearmeta.pdf differ diff --git a/qpdf/qtest/qpdf/direct-pages-json-objects.out b/qpdf/qtest/qpdf/direct-pages-json-objects.out index 91b69e8b..1e0fe469 100644 --- a/qpdf/qtest/qpdf/direct-pages-json-objects.out +++ b/qpdf/qtest/qpdf/direct-pages-json-objects.out @@ -65,8 +65,8 @@ ], "trailer": { "/ID": [ - "\u0013#¥fi|WzfsU…©6ŸÎ<", - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" + "b:1323a5937c577a66735583a93698ce3c", + "b:372cbf44f6db88ab60d9263c0f0bd26a" ], "/Root": "1 0 R", "/Size": 7 diff --git a/qpdf/qtest/qpdf/direct-pages-json-pages.out b/qpdf/qtest/qpdf/direct-pages-json-pages.out index 57cc0cb7..d58aafb1 100644 --- a/qpdf/qtest/qpdf/direct-pages-json-pages.out +++ b/qpdf/qtest/qpdf/direct-pages-json-pages.out @@ -89,8 +89,8 @@ }, "trailer": { "/ID": [ - "\u0013#¥fi|WzfsU…©6ŸÎ<", - "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj" + "b:1323a5937c577a66735583a93698ce3c", + "b:372cbf44f6db88ab60d9263c0f0bd26a" ], "/Root": "1 0 R", "/Size": 7 diff --git a/qpdf/qtest/qpdf/good14.out b/qpdf/qtest/qpdf/good14.out index 2ac91d53..5963b3a6 100644 --- a/qpdf/qtest/qpdf/good14.out +++ b/qpdf/qtest/qpdf/good14.out @@ -9,7 +9,7 @@ three lines (string with \nCRLF 
and\nCR and\nLF) and another indentation -(\001B%DEF)<01> +<014225444546><01> <8a8b> (ab) <8c>
) > diff --git a/qpdf/qtest/qpdf/merge-dict.out b/qpdf/qtest/qpdf/merge-dict.out index e0b6dc3e..0135f75d 100644 --- a/qpdf/qtest/qpdf/merge-dict.out +++ b/qpdf/qtest/qpdf/merge-dict.out @@ -1,9 +1,9 @@ { - "/k1": "scalar1", + "/k1": "u:scalar1", "/k2": 16059, "/k3": { - "/a": "a", - "/b": "conflict: seen", + "/a": "u:a", + "/b": "u:conflict: seen", "/c": [ 2, 3 @@ -12,7 +12,7 @@ "/y": 25, "/z": 26 }, - "/e": "e" + "/e": "u:e" }, "/k4": { "/A": 65, @@ -24,11 +24,11 @@ "/k5": [ "/one", 2, - "three", + "u:three", [ "/four" ], - "two" + "u:two" ] } /A diff --git a/qpdf/qtest/qpdf/page_api_2-json-objects.out b/qpdf/qtest/qpdf/page_api_2-json-objects.out index cc6d1630..995a00e4 100644 --- a/qpdf/qtest/qpdf/page_api_2-json-objects.out +++ b/qpdf/qtest/qpdf/page_api_2-json-objects.out @@ -9,8 +9,8 @@ "/Type": "/Catalog" }, "2 0 R": { - "/CreationDate": "D:20120621124041", - "/Producer": "Apex PDFWriter" + "/CreationDate": "u:D:20120621124041", + "/Producer": "u:Apex PDFWriter" }, "3 0 R": { "/Count": 3, @@ -77,8 +77,8 @@ "10 0 R": 47, "trailer": { "/ID": [ - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" + "b:fb18b786ff7b358705da8a532aba8f6f", + "b:f7179eb35159bfd4c00f128abcfd1f02" ], "/Info": "2 0 R", "/Root": "1 0 R", diff --git a/qpdf/qtest/qpdf/page_api_2-json-pages.out b/qpdf/qtest/qpdf/page_api_2-json-pages.out index bf6a2d25..caf27100 100644 --- a/qpdf/qtest/qpdf/page_api_2-json-pages.out +++ b/qpdf/qtest/qpdf/page_api_2-json-pages.out @@ -41,8 +41,8 @@ "/Type": "/Catalog" }, "2 0 R": { - "/CreationDate": "D:20120621124041", - "/Producer": "Apex PDFWriter" + "/CreationDate": "u:D:20120621124041", + "/Producer": "u:Apex PDFWriter" }, "3 0 R": { "/Count": 3, @@ -129,8 +129,8 @@ }, "trailer": { "/ID": [ - "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" + "b:fb18b786ff7b358705da8a532aba8f6f", + "b:f7179eb35159bfd4c00f128abcfd1f02" ], "/Info": "2 0 R", "/Root": "1 0 R", -- cgit v1.2.3-54-g00ecf