aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog8
-rw-r--r--include/qpdf/QPDFObjectHandle.hh10
-rw-r--r--libqpdf/QPDFObjectHandle.cc20
-rw-r--r--libqpdf/QPDF_String.cc52
-rw-r--r--libqpdf/qpdf/QPDF_String.hh1
-rw-r--r--qpdf/build.mk1
-rw-r--r--qpdf/qtest/qpdf.test12
-rw-r--r--qpdf/qtest/qpdf/unicode-errors.in7
-rw-r--r--qpdf/qtest/qpdf/unicode-errors.out7
-rw-r--r--qpdf/qtest/qpdf/unicode.in5
-rw-r--r--qpdf/qtest/qpdf/unicode.out5
-rw-r--r--qpdf/test_pdf_unicode.cc46
12 files changed, 172 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index cabf7efe..e27b680d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2018-06-21 Jay Berkenbilt <ejb@ql.org>
+
+ * Added QPDFObjectHandle::newUnicodeString and
+ QPDFObjectHandle::unparseBinary to allow for more convenient
+ creation of strings that are explicitly encoded in UTF-16 BE. This
+ is useful for creating Unicode strings that appear outside of
+ content streams, such as in page labels, outlines, form field
+ values, etc.
+
2018-06-20 Jay Berkenbilt <ejb@ql.org>
* Added new classes QPDFAcroFormDocumentHelper,
diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh
index 967e786c..868b5c07 100644
--- a/include/qpdf/QPDFObjectHandle.hh
+++ b/include/qpdf/QPDFObjectHandle.hh
@@ -344,6 +344,12 @@ class QPDFObjectHandle
static QPDFObjectHandle newName(std::string const& name);
QPDF_DLL
static QPDFObjectHandle newString(std::string const& str);
+ // Create a string encoded in UTF-16 from the given UTF-8-encoded
+ // string. Such strings are appropriately encoded to appear in PDF
+ // files outside of content streams, such as in document metadata,
+ // form field values, page labels, outlines, and similar locations.
+ QPDF_DLL
+ static QPDFObjectHandle newUnicodeString(std::string const& utf8_str);
QPDF_DLL
static QPDFObjectHandle newOperator(std::string const&);
QPDF_DLL
@@ -715,6 +721,10 @@ class QPDFObjectHandle
std::string unparse();
QPDF_DLL
std::string unparseResolved();
+ // For strings only, force binary representation. Otherwise, same
+ // as unparse.
+ QPDF_DLL
+ std::string unparseBinary();
// Legacy helper methods for commonly performed operations on
// pages. Newer code should use QPDFPageObjectHelper instead. The
diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc
index 5c111cc8..da609cc2 100644
--- a/libqpdf/QPDFObjectHandle.cc
+++ b/libqpdf/QPDFObjectHandle.cc
@@ -1221,6 +1221,20 @@ QPDFObjectHandle::unparseResolved()
return this->m->obj->unparse();
}
+std::string
+QPDFObjectHandle::unparseBinary()
+{
+    // Anything that is not a string has no separate binary form, so
+    // it is rendered by the ordinary unparse() path.
+    if (! this->isString())
+    {
+        return unparse();
+    }
+    // isString() returned true, so the held object is treated as a
+    // QPDF_String; passing true forces its binary representation.
+    QPDF_String* str_obj =
+        dynamic_cast<QPDF_String*>(this->m->obj.getPointer());
+    return str_obj->unparse(true);
+}
+
QPDFObjectHandle
QPDFObjectHandle::parse(std::string const& object_str,
std::string const& object_description)
@@ -1846,6 +1860,12 @@ QPDFObjectHandle::newString(std::string const& str)
}
QPDFObjectHandle
+QPDFObjectHandle::newUnicodeString(std::string const& utf8_str)
+{
+    // QPDF_String::new_utf16 performs the UTF-8 to UTF-16 conversion;
+    // the handle returned here wraps the object it produces.
+    QPDF_String* utf16_obj = QPDF_String::new_utf16(utf8_str);
+    return QPDFObjectHandle(utf16_obj);
+}
+
+QPDFObjectHandle
QPDFObjectHandle::newOperator(std::string const& value)
{
return QPDFObjectHandle(new QPDF_Operator(value));
diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc
index 60a3e0df..eb31a808 100644
--- a/libqpdf/QPDF_String.cc
+++ b/libqpdf/QPDF_String.cc
@@ -64,6 +64,58 @@ QPDF_String::~QPDF_String()
{
}
+// Build a QPDF_String holding the UTF-16 big-endian form of utf8_val,
+// prefixed with the byte order mark FE FF. Malformed UTF-8 input is
+// replaced with U+FFFD in these cases: a lead byte that announces no
+// continuation bytes or more than five, a sequence that would run
+// past the end of the input, a sequence interrupted by a byte that
+// is not a continuation byte, and an overlong sequence (one that
+// uses more bytes than its code point requires). The result is
+// allocated with new and returned as a raw pointer.
+QPDF_String*
+QPDF_String::new_utf16(std::string const& utf8_val)
+{
+    // Smallest code point that legitimately needs N continuation
+    // bytes, indexed by N. A decoded value below this bound is an
+    // overlong encoding and must not be accepted.
+    static unsigned long const min_codepoint[] = {
+        0x0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
+    };
+
+    std::string result = "\xfe\xff";
+    size_t len = utf8_val.length();
+    for (size_t i = 0; i < len; ++i)
+    {
+        unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
+        if (ch < 128)
+        {
+            // ASCII bytes are their own code points.
+            result += QUtil::toUTF16(ch);
+        }
+        else
+        {
+            // Count the 1 bits that follow the top bit of the lead
+            // byte: that is the number of continuation bytes it
+            // announces. to_clear accumulates those marker bits so
+            // they can be masked off to leave only payload bits.
+            size_t bytes_needed = 0;
+            unsigned bit_check = 0x40;
+            unsigned char to_clear = 0x80;
+            while (ch & bit_check)
+            {
+                ++bytes_needed;
+                to_clear |= bit_check;
+                bit_check >>= 1;
+            }
+
+            if (((bytes_needed > 5) || (bytes_needed < 1)) ||
+                ((i + bytes_needed) >= len))
+            {
+                // Stray continuation byte (none announced), a lead
+                // byte announcing more than five, or not enough
+                // input left to complete the sequence.
+                result += "\xff\xfd";
+            }
+            else
+            {
+                // Remember the announced length; bytes_needed is
+                // consumed by the loop below.
+                size_t const continuation_bytes = bytes_needed;
+                unsigned long codepoint = (ch & ~to_clear);
+                bool well_formed = true;
+                while (bytes_needed > 0)
+                {
+                    --bytes_needed;
+                    ch = static_cast<unsigned char>(utf8_val.at(++i));
+                    if ((ch & 0xc0) != 0x80)
+                    {
+                        // Not a continuation byte: back up so the
+                        // outer loop examines this byte on its own.
+                        --i;
+                        well_formed = false;
+                        break;
+                    }
+                    codepoint <<= 6;
+                    codepoint += (ch & 0x3f);
+                }
+                if ((! well_formed) ||
+                    (codepoint < min_codepoint[continuation_bytes]))
+                {
+                    codepoint = 0xfffd;
+                }
+                // NOTE(review): surrogate and out-of-range code
+                // points are not rejected here; presumably
+                // QUtil::toUTF16 substitutes U+FFFD for them --
+                // confirm.
+                result += QUtil::toUTF16(codepoint);
+            }
+        }
+    }
+    return new QPDF_String(result);
+}
+
std::string
QPDF_String::unparse()
{
diff --git a/libqpdf/qpdf/QPDF_String.hh b/libqpdf/qpdf/QPDF_String.hh
index abf8291a..b4858c49 100644
--- a/libqpdf/qpdf/QPDF_String.hh
+++ b/libqpdf/qpdf/QPDF_String.hh
@@ -9,6 +9,7 @@ class QPDF_String: public QPDFObject
{
public:
QPDF_String(std::string const& val);
+ static QPDF_String* new_utf16(std::string const& utf8_val);
virtual ~QPDF_String();
virtual std::string unparse();
virtual QPDFObject::object_type_e getTypeCode() const;
diff --git a/qpdf/build.mk b/qpdf/build.mk
index 1692fc92..21e7bb17 100644
--- a/qpdf/build.mk
+++ b/qpdf/build.mk
@@ -4,6 +4,7 @@ BINS_qpdf = \
test_driver \
test_large_file \
test_pdf_doc_encoding \
+ test_pdf_unicode \
test_tokenizer
CBINS_qpdf = qpdf-ctest
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index a23e20e8..f80da1c9 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -84,13 +84,21 @@ flush_tiff_cache();
show_ntests();
# ----------
-$td->notify("--- PDF Doc Encoding ---");
-$n_tests += 1;
+$td->notify("--- Character Encoding ---");
+$n_tests += 3;
$td->runtest("PDF doc encoding to Unicode",
{$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
{$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
$td->NORMALIZE_NEWLINES);
+$td->runtest("UTF-16 encoding",
+ {$td->COMMAND => "test_pdf_unicode unicode.in"},
+ {$td->FILE => "unicode.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+$td->runtest("UTF-16 encoding errors",
+ {$td->COMMAND => "test_pdf_unicode unicode-errors.in"},
+ {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
show_ntests();
# ----------
diff --git a/qpdf/qtest/qpdf/unicode-errors.in b/qpdf/qtest/qpdf/unicode-errors.in
new file mode 100644
index 00000000..484928c3
--- /dev/null
+++ b/qpdf/qtest/qpdf/unicode-errors.in
@@ -0,0 +1,7 @@
+This file has utf-8 encoding errors and should be edited as a binary file.
+
+0: too many bytes: after
+1: too few bytes: after
+2: invalid codepoint (U+DEAD): after
+3: not enough bytes for character: !after (! included)
+4: not enough bytes left in file
diff --git a/qpdf/qtest/qpdf/unicode-errors.out b/qpdf/qtest/qpdf/unicode-errors.out
new file mode 100644
index 00000000..43a06511
--- /dev/null
+++ b/qpdf/qtest/qpdf/unicode-errors.out
@@ -0,0 +1,7 @@
+This file has utf-8 encoding errors and should be edited as a binary file. // <feff0054006800690073002000660069006c006500200068006100730020007500740066002d003800200065006e0063006f00640069006e00670020006500720072006f0072007300200061006e0064002000730068006f0075006c0064002000620065002000650064006900740065006400200061007300200061002000620069006e006100720079002000660069006c0065002e>
+ // <feff>
+0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
+1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
+2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
+3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
+4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>
diff --git a/qpdf/qtest/qpdf/unicode.in b/qpdf/qtest/qpdf/unicode.in
new file mode 100644
index 00000000..f686f9d6
--- /dev/null
+++ b/qpdf/qtest/qpdf/unicode.in
@@ -0,0 +1,5 @@
+This is a potato: 🥔 (u+01f954).
+If you wanted to, you could cook some sweet 🥔 π.
+If you think wwwwww is good, you should try ʬʬʬʬʬʬ.
+బంగాళాదుంప సలాడ్
+𝄞 𝄢 𝄪 𝅂
diff --git a/qpdf/qtest/qpdf/unicode.out b/qpdf/qtest/qpdf/unicode.out
new file mode 100644
index 00000000..bedec447
--- /dev/null
+++ b/qpdf/qtest/qpdf/unicode.out
@@ -0,0 +1,5 @@
+This is a potato: 🥔 (u+01f954). // <feff00540068006900730020006900730020006100200070006f007400610074006f003a0020d83edd54002000280075002b0030003100660039003500340029002e>
+If you wanted to, you could cook some sweet 🥔 π. // <feff0049006600200079006f0075002000770061006e00740065006400200074006f002c00200079006f007500200063006f0075006c006400200063006f006f006b00200073006f006d00650020007300770065006500740020d83edd54002003c0002e>
+If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff0049006600200079006f00750020007400680069006e006b002000770077007700770077007700200069007300200067006f006f0064002c00200079006f0075002000730068006f0075006c00640020007400720079002002ac02ac02ac02ac02ac02ac002e>
+బంగాళాదుంప సలాడ్ // <feff0c2c0c020c170c3e0c330c3e0c260c410c020c2a00200c380c320c3e0c210c4d>
+𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42>
diff --git a/qpdf/test_pdf_unicode.cc b/qpdf/test_pdf_unicode.cc
new file mode 100644
index 00000000..07073424
--- /dev/null
+++ b/qpdf/test_pdf_unicode.cc
@@ -0,0 +1,46 @@
+#include <qpdf/QUtil.hh>
+#include <qpdf/QPDFObjectHandle.hh>
+#include <iostream>
+#include <stdlib.h>
+#include <string.h>
+
+static char const* whoami = 0;
+
+void usage()
+{
+    // Report the expected command line on stderr, then terminate
+    // with exit status 2.
+    std::cerr << "Usage: " << whoami;
+    std::cerr << " infile" << std::endl;
+    exit(2);
+}
+
+int main(int argc, char* argv[])
+{
+    // Reduce argv[0] to the part after the last '/' so the usage
+    // message shows only the program name.
+    char const* last_slash = strrchr(argv[0], '/');
+    whoami = (last_slash == NULL) ? argv[0] : (last_slash + 1);
+    // For libtool's sake....
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+        whoami += 3;
+    }
+
+    if (argc != 2)
+    {
+        usage();
+    }
+
+    // Every line of the input file is converted with
+    // newUnicodeString and echoed as "<utf8 value> // <binary
+    // unparse>" on stdout.
+    std::list<std::string> lines =
+        QUtil::read_lines_from_file(argv[1]);
+    std::list<std::string>::iterator iter = lines.begin();
+    while (iter != lines.end())
+    {
+        QPDFObjectHandle str = QPDFObjectHandle::newUnicodeString(*iter);
+        std::cout << str.getUTF8Value() << " // "
+                  << str.unparseBinary() << std::endl;
+        ++iter;
+    }
+    return 0;
+}