summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2022-05-07 14:20:09 +0200
committerJay Berkenbilt <ejb@ql.org>2022-05-08 19:45:20 +0200
commit3246923cf2189554f7c348ebf51c9774c09deec8 (patch)
treea402b44a775aaa6a113f1d311107461a3a2258a2
parent16f4f94cd99b4d0f633596074e8d9358db135517 (diff)
downloadqpdf-3246923cf2189554f7c348ebf51c9774c09deec8.tar.zst
Implement JSON v2 for String
Also refine the heuristic for deciding whether to use hexadecimal notation for a string.
-rw-r--r--libqpdf/QPDF_String.cc81
-rw-r--r--libqpdf/qpdf/QPDF_String.hh1
-rw-r--r--qpdf/qtest/qpdf/V4-clearmeta.pdfbin15225 -> 15240 bytes
-rw-r--r--qpdf/qtest/qpdf/direct-pages-json-objects.out4
-rw-r--r--qpdf/qtest/qpdf/direct-pages-json-pages.out4
-rw-r--r--qpdf/qtest/qpdf/good14.out2
-rw-r--r--qpdf/qtest/qpdf/merge-dict.out12
-rw-r--r--qpdf/qtest/qpdf/page_api_2-json-objects.out8
-rw-r--r--qpdf/qtest/qpdf/page_api_2-json-pages.out8
9 files changed, 68 insertions, 52 deletions
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index fd820998..4d45d851 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -45,8 +45,32 @@ QPDF_String::unparse()
JSON
QPDF_String::getJSON(int json_version)
{
- // QXXXQ
- return JSON::makeString(getUTF8Val());
+ if (json_version == 1) {
+ return JSON::makeString(getUTF8Val());
+ }
+ // See if we can unambiguously represent as Unicode.
+ bool is_unicode = false;
+ std::string result;
+ std::string candidate = getUTF8Val();
+ if (QUtil::is_utf16(this->val) || QUtil::is_explicit_utf8(this->val)) {
+ is_unicode = true;
+ result = candidate;
+ } else if (!useHexString()) {
+ std::string test;
+ if (QUtil::utf8_to_pdf_doc(candidate, test, '?') &&
+ (test == this->val)) {
+ // This is a PDF-doc string that can be losslessly encoded
+ // as Unicode.
+ is_unicode = true;
+ result = candidate;
+ }
+ }
+ if (is_unicode) {
+ result = "u:" + result;
+ } else {
+ result = "b:" + QUtil::hex_encode(this->val);
+ }
+ return JSON::makeString(result);
}
QPDFObject::object_type_e
@@ -61,41 +85,32 @@ QPDF_String::getTypeName() const
return "string";
}
-std::string
-QPDF_String::unparse(bool force_binary)
+bool
+QPDF_String::useHexString() const
{
- bool use_hexstring = force_binary;
- if (!use_hexstring) {
- unsigned int nonprintable = 0;
- int consecutive_printable = 0;
- for (unsigned int i = 0; i < this->val.length(); ++i) {
- char ch = this->val.at(i);
- // Note: do not use locale to determine printability. The
- // PDF specification accepts arbitrary binary data. Some
- // locales imply multibyte characters. We'll consider
- // something printable if it is printable in 7-bit ASCII.
- // We'll code this manually rather than being rude and
- // setting locale.
- if ((ch == 0) ||
- (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
- ++nonprintable;
- consecutive_printable = 0;
- } else {
- if (++consecutive_printable > 5) {
- // If there are more than 5 consecutive printable
- // characters, I want to see them as such.
- nonprintable = 0;
- break;
- }
+ // Heuristic: use the hexadecimal representation of a string if
+ // there are any non-printable (in PDF Doc encoding) characters or
+ // if too large of a proportion of the string consists of
+ // non-ASCII characters.
+ bool nonprintable = false;
+ unsigned int non_ascii = 0;
+ for (unsigned int i = 0; i < this->val.length(); ++i) {
+ char ch = this->val.at(i);
+ if ((ch == 0) ||
+ (!(is_ascii_printable(ch) || strchr("\n\r\t\b\f", ch)))) {
+ if ((ch >= 0) && (ch < 24)) {
+ nonprintable = true;
}
- }
-
- // Use hex notation if more than 20% of the characters are not
- // printable in plain ASCII.
- if (5 * nonprintable > val.length()) {
- use_hexstring = true;
+ ++non_ascii;
}
}
+ return (nonprintable || (5 * non_ascii > val.length()));
+}
+
+std::string
+QPDF_String::unparse(bool force_binary)
+{
+ bool use_hexstring = force_binary || useHexString();
std::string result;
if (use_hexstring) {
result += "<" + QUtil::hex_encode(this->val) + ">";
diff --git a/libqpdf/qpdf/QPDF_String.hh b/libqpdf/qpdf/QPDF_String.hh
index df33138c..6fd1b0e9 100644
--- a/libqpdf/qpdf/QPDF_String.hh
+++ b/libqpdf/qpdf/QPDF_String.hh
@@ -20,6 +20,7 @@ class QPDF_String: public QPDFObject
std::string getUTF8Val() const;
private:
+ bool useHexString() const;
std::string val;
};
diff --git a/qpdf/qtest/qpdf/V4-clearmeta.pdf b/qpdf/qtest/qpdf/V4-clearmeta.pdf
index 7d5786c0..5e804faa 100644
--- a/qpdf/qtest/qpdf/V4-clearmeta.pdf
+++ b/qpdf/qtest/qpdf/V4-clearmeta.pdf
Binary files differ
diff --git a/qpdf/qtest/qpdf/direct-pages-json-objects.out b/qpdf/qtest/qpdf/direct-pages-json-objects.out
index 91b69e8b..1e0fe469 100644
--- a/qpdf/qtest/qpdf/direct-pages-json-objects.out
+++ b/qpdf/qtest/qpdf/direct-pages-json-objects.out
@@ -65,8 +65,8 @@
],
"trailer": {
"/ID": [
- "\u0013#¥fi|WzfsU…©6ŸÎ<",
- "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
+ "b:1323a5937c577a66735583a93698ce3c",
+ "b:372cbf44f6db88ab60d9263c0f0bd26a"
],
"/Root": "1 0 R",
"/Size": 7
diff --git a/qpdf/qtest/qpdf/direct-pages-json-pages.out b/qpdf/qtest/qpdf/direct-pages-json-pages.out
index 57cc0cb7..d58aafb1 100644
--- a/qpdf/qtest/qpdf/direct-pages-json-pages.out
+++ b/qpdf/qtest/qpdf/direct-pages-json-pages.out
@@ -89,8 +89,8 @@
},
"trailer": {
"/ID": [
- "\u0013#¥fi|WzfsU…©6ŸÎ<",
- "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
+ "b:1323a5937c577a66735583a93698ce3c",
+ "b:372cbf44f6db88ab60d9263c0f0bd26a"
],
"/Root": "1 0 R",
"/Size": 7
diff --git a/qpdf/qtest/qpdf/good14.out b/qpdf/qtest/qpdf/good14.out
index 2ac91d53..5963b3a6 100644
--- a/qpdf/qtest/qpdf/good14.out
+++ b/qpdf/qtest/qpdf/good14.out
@@ -9,7 +9,7 @@ three lines
(string with \nCRLF and\nCR and\nLF)
and another
indentation
-(\001B%DEF)<01>
+<014225444546><01>
<8a8b>
(ab)
<8c><dd> ) >
diff --git a/qpdf/qtest/qpdf/merge-dict.out b/qpdf/qtest/qpdf/merge-dict.out
index e0b6dc3e..0135f75d 100644
--- a/qpdf/qtest/qpdf/merge-dict.out
+++ b/qpdf/qtest/qpdf/merge-dict.out
@@ -1,9 +1,9 @@
{
- "/k1": "scalar1",
+ "/k1": "u:scalar1",
"/k2": 16059,
"/k3": {
- "/a": "a",
- "/b": "conflict: seen",
+ "/a": "u:a",
+ "/b": "u:conflict: seen",
"/c": [
2,
3
@@ -12,7 +12,7 @@
"/y": 25,
"/z": 26
},
- "/e": "e"
+ "/e": "u:e"
},
"/k4": {
"/A": 65,
@@ -24,11 +24,11 @@
"/k5": [
"/one",
2,
- "three",
+ "u:three",
[
"/four"
],
- "two"
+ "u:two"
]
}
/A
diff --git a/qpdf/qtest/qpdf/page_api_2-json-objects.out b/qpdf/qtest/qpdf/page_api_2-json-objects.out
index cc6d1630..995a00e4 100644
--- a/qpdf/qtest/qpdf/page_api_2-json-objects.out
+++ b/qpdf/qtest/qpdf/page_api_2-json-objects.out
@@ -9,8 +9,8 @@
"/Type": "/Catalog"
},
"2 0 R": {
- "/CreationDate": "D:20120621124041",
- "/Producer": "Apex PDFWriter"
+ "/CreationDate": "u:D:20120621124041",
+ "/Producer": "u:Apex PDFWriter"
},
"3 0 R": {
"/Count": 3,
@@ -77,8 +77,8 @@
"10 0 R": 47,
"trailer": {
"/ID": [
- "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
- "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
+ "b:fb18b786ff7b358705da8a532aba8f6f",
+ "b:f7179eb35159bfd4c00f128abcfd1f02"
],
"/Info": "2 0 R",
"/Root": "1 0 R",
diff --git a/qpdf/qtest/qpdf/page_api_2-json-pages.out b/qpdf/qtest/qpdf/page_api_2-json-pages.out
index bf6a2d25..caf27100 100644
--- a/qpdf/qtest/qpdf/page_api_2-json-pages.out
+++ b/qpdf/qtest/qpdf/page_api_2-json-pages.out
@@ -41,8 +41,8 @@
"/Type": "/Catalog"
},
"2 0 R": {
- "/CreationDate": "D:20120621124041",
- "/Producer": "Apex PDFWriter"
+ "/CreationDate": "u:D:20120621124041",
+ "/Producer": "u:Apex PDFWriter"
},
"3 0 R": {
"/Count": 3,
@@ -129,8 +129,8 @@
},
"trailer": {
"/ID": [
- "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
- "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
+ "b:fb18b786ff7b358705da8a532aba8f6f",
+ "b:f7179eb35159bfd4c00f128abcfd1f02"
],
"/Info": "2 0 R",
"/Root": "1 0 R",