Improve Unicode filename testing

Remove dependency on the behavior of perl for reliable creation of Unicode file names on Windows.
author: Jay Berkenbilt <ejb@ql.org> 2019-04-28 01:54:52 +0200
committer: Jay Berkenbilt <ejb@ql.org> 2019-04-28 02:37:33 +0200
commit: 03e27709f32ebc83b1c351da5c03ffb2d18f28da (patch)
tree: a2ad9971099228467b369d6187f618b0eff830dd
parent: 7ff234a92ff7749c090af05d4d85a97bf62e91c4 (diff)
download: qpdf-03e27709f32ebc83b1c351da5c03ffb2d18f28da.tar.zst
5 files changed, 131 insertions, 3 deletions
diff --git a/TODO b/TODO
index a6ff5baf..650ef834 100644
--- a/TODO
+++ b/TODO
@@ -170,6 +170,14 @@ I find it useful to make reference to them in this list
 
  * Pl_TIFFPredictor is pretty slow.
 
+ * Support for handling file names with Unicode characters in Windows
+   is incomplete. qpdf seems to support them okay from a functionality
+   standpoint, and the right thing happens if you pass in UTF-8
+   encoded filenames to QPDF library routines in Windows (they are
+   converted internally to wchar_t*), but file names are encoded in
+   UTF-8 on output, which doesn't produce nice error messages or
+   output on Windows in some cases.
+
  * If we ever wanted to do anything more with character encoding, see
    ../misc/character-encoding/, which includes machine-readable dump
    of table D.2 in the ISO-32000 PDF spec. This shows the mapping
diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml
index dac5f00d..1df6e788 100644
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@@ -2612,6 +2612,31 @@ outfile.pdf</option>
     </varlistentry>
    </variablelist>
   </sect1>
+  <sect1 id="ref.unicode-files">
+   <title>A Note About Unicode File Names</title>
+   <para>
+    When strings are passed to qpdf library routines either as
+    <literal>char*</literal> or as <literal>std::string</literal>,
+    they are treated as byte arrays except where otherwise noted. When
+    Unicode is desired, qpdf wants UTF-8 unless otherwise noted in
+    comments in header files. In modern UNIX/Linux environments, this
+    generally does the right thing. In Windows, it's a bit more
+    complicated. Starting in qpdf 8.4.0, passwords that contain
+    Unicode characters are handled much better, and starting in qpdf
+    8.4.1, the library attempts to properly handle Unicode characters
+    in filenames. In particular, in Windows, if a UTF-8 encoded string
+    is used as a filename in either <classname>QPDF</classname> or
+    <classname>QPDFWriter</classname>, it is internally converted to
+    <literal>wchar_t*</literal>, and Unicode-aware Windows APIs are
+    used. As such, qpdf will generally operate properly on files with
+    non-ASCII characters in their names as long as the filenames are
+    UTF-8 encoded for passing into the qpdf library API, but there are
+    still some rough edges, such as the encoding of the filenames in
+    error messages or CLI output messages. Patches or bug reports are
+    welcome for any continuing issues with Unicode file names in
+    Windows.
+   </para>
+  </sect1>
  </chapter>
  <chapter id="ref.json">
   <title>QPDF JSON</title>
diff --git a/qpdf/build.mk b/qpdf/build.mk
index 40de3617..87038c79 100644
--- a/qpdf/build.mk
+++ b/qpdf/build.mk
@@ -5,7 +5,8 @@ BINS_qpdf = \
     test_large_file \
     test_pdf_doc_encoding \
     test_pdf_unicode \
-    test_tokenizer
+    test_tokenizer \
+    test_unicode_filenames
 CBINS_qpdf = qpdf-ctest
 
 TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
@@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc)
 
 XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE)
 XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK)
+XCXXFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_COMPILE)
+XLDFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_LINK)
 
 $(foreach B,$(BINS_qpdf),$(eval \
   OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc)))
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index ec5eb3c1..e95c22bc 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -135,7 +135,7 @@ foreach my $c (@completion_tests)
 show_ntests();
 # ----------
 $td->notify("--- Argument Parsing ---");
-$n_tests += 8;
+$n_tests += 6;
 
 $td->runtest("required argument",
              {$td->COMMAND => "qpdf --password minimal.pdf"},
@@ -167,10 +167,21 @@ $td->runtest("extra overlay filename",
              {$td->REGEXP => ".*overlay file already specified.*",
                   $td->EXIT_STATUS => 2},
              $td->NORMALIZE_NEWLINES);
+
+show_ntests();
+# ----------
+$td->notify("--- Unicode Filenames ---");
+$n_tests += 3;
+
+$td->runtest("create unicode filenames",
+             {$td->COMMAND => "test_unicode_filenames"},
+             {$td->STRING => "created Unicode filenames\n",
+                  $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+
 foreach my $d (['auto-ü', 1], ['auto-öπ', 2])
 {
     my ($u, $n) = @$d;
-    copy('minimal.pdf', "$u.pdf");
     $td->runtest("unicode filename $u",
                  {$td->COMMAND => "qpdf --check $u.pdf"},
                  {$td->FILE => "check-unicode-filename-$n.out",
diff --git a/qpdf/test_unicode_filenames.cc b/qpdf/test_unicode_filenames.cc
new file mode 100644
index 00000000..45701a9f
--- /dev/null
+++ b/qpdf/test_unicode_filenames.cc
@@ -0,0 +1,81 @@
+#ifdef _WIN32
+#include <windows.h>
+#include <direct.h>
+#include <io.h>
+#endif
+
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+
+// Copy everything from "in" to "out", then close both streams. Either
+// stream may be null (a failed open by the caller); in that case, or
+// on any read or write failure, print a message to stderr and exit
+// with status 2.
+static void do_copy(FILE* in, FILE* out)
+{
+    if ((in == 0) || (out == 0))
+    {
+        std::cerr << "errors opening files" << std::endl;
+        exit(2);
+    }
+    char buf[10240];
+    size_t len = 0;
+    bool failed = false;
+    while ((len = fread(buf, 1, sizeof(buf), in)) > 0)
+    {
+        if (fwrite(buf, 1, len, out) != len)
+        {
+            failed = true;
+            break;
+        }
+    }
+    // fread returns 0 for both end of file and a read error, so the
+    // stream's error indicator is the only way to tell them apart.
+    if (ferror(in))
+    {
+        failed = true;
+    }
+    fclose(in);
+    // Buffered output may only reach the file during fclose, so a
+    // failure to close the output is also a write error.
+    if (fclose(out) != 0)
+    {
+        failed = true;
+    }
+    if (failed)
+    {
+        std::cerr << "errors reading or writing" << std::endl;
+        exit(2);
+    }
+}
+
+#ifdef WINDOWS_WMAIN
+
+// Open "name" in the given mode using the wide-character fopen
+// variant this compiler provides. The result is a null pointer when
+// the file cannot be opened.
+static FILE* open_wide(wchar_t const* name, wchar_t const* mode)
+{
+    FILE* f = 0;
+#ifdef _MSC_VER
+    _wfopen_s(&f, name, mode);
+#else
+    f = _wfopen(name, mode);
+#endif
+    return f;
+}
+
+// Copy minimal.pdf from the current directory to the file named by
+// the wide-character string outname. Open failures are left for
+// do_copy to report.
+void copy(wchar_t const* outname)
+{
+    FILE* src = open_wide(L"minimal.pdf", L"rb");
+    FILE* dst = open_wide(outname, L"wb");
+    do_copy(src, dst);
+}
+
+// Wide-character entry point: create each test file as a copy of
+// minimal.pdf, then print a confirmation line. The arguments are not
+// used.
+extern "C"
+int wmain(int argc, wchar_t* argv[])
+{
+    // Names are spelled with escapes for the individual Unicode code
+    // points so they do not depend on the source file's encoding.
+    wchar_t const* const names[] = {
+        L"auto-\xfc.pdf",        // U+00FC
+        L"auto-\xf6\x03c0.pdf",  // U+00F6, U+03C0
+    };
+    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); ++i)
+    {
+        copy(names[i]);
+    }
+    std::cout << "created Unicode filenames" << std::endl;
+    return 0;
+}
+
+#else
+
+// Copy minimal.pdf from the current directory to the file named by
+// outname. Open failures are left for do_copy to report.
+void copy(char const* outname)
+{
+    FILE* src = fopen("minimal.pdf", "rb");
+    FILE* dst = fopen(outname, "wb");
+    do_copy(src, dst);
+}
+
+// Narrow-character entry point: create each test file as a copy of
+// minimal.pdf, then print a confirmation line. The arguments are not
+// used.
+int main(int argc, char* argv[])
+{
+    // Names are spelled as explicit UTF-8 byte sequences so they do
+    // not depend on the source file's encoding.
+    char const* const names[] = {
+        "auto-\xc3\xbc.pdf",          // U+00FC
+        "auto-\xc3\xb6\xcf\x80.pdf",  // U+00F6, U+03C0
+    };
+    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); ++i)
+    {
+        copy(names[i]);
+    }
+    std::cout << "created Unicode filenames" << std::endl;
+    return 0;
+}
+
+#endif
author	Jay Berkenbilt <ejb@ql.org>	2019-04-28 01:54:52 +0200
committer	Jay Berkenbilt <ejb@ql.org>	2019-04-28 02:37:33 +0200
commit	03e27709f32ebc83b1c351da5c03ffb2d18f28da (patch)
tree	a2ad9971099228467b369d6187f618b0eff830dd
parent	7ff234a92ff7749c090af05d4d85a97bf62e91c4 (diff)
download	qpdf-03e27709f32ebc83b1c351da5c03ffb2d18f28da.tar.zst