aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJay Berkenbilt <ejb@ql.org>2019-04-28 01:54:52 +0200
committerJay Berkenbilt <ejb@ql.org>2019-04-28 02:37:33 +0200
commit03e27709f32ebc83b1c351da5c03ffb2d18f28da (patch)
treea2ad9971099228467b369d6187f618b0eff830dd
parent7ff234a92ff7749c090af05d4d85a97bf62e91c4 (diff)
downloadqpdf-03e27709f32ebc83b1c351da5c03ffb2d18f28da.tar.zst
Improve Unicode filename testing
Remove dependency on the behavior of perl for reliable creation of Unicode file names on Windows.
-rw-r--r--TODO8
-rw-r--r--manual/qpdf-manual.xml25
-rw-r--r--qpdf/build.mk5
-rw-r--r--qpdf/qtest/qpdf.test15
-rw-r--r--qpdf/test_unicode_filenames.cc81
5 files changed, 131 insertions, 3 deletions
diff --git a/TODO b/TODO
index a6ff5baf..650ef834 100644
--- a/TODO
+++ b/TODO
@@ -170,6 +170,14 @@ I find it useful to make reference to them in this list
* Pl_TIFFPredictor is pretty slow.
+ * Support for handling file names with Unicode characters in Windows
+ is incomplete. qpdf seems to support them okay from a functionality
+ standpoint, and the right thing happens if you pass in UTF-8
+ encoded filenames to QPDF library routines in Windows (they are
+ converted internally to wchar_t*), but file names are encoded in
+ UTF-8 on output, which doesn't produce nice error messages or
+ output on Windows in some cases.
+
* If we ever wanted to do anything more with character encoding, see
../misc/character-encoding/, which includes machine-readable dump
of table D.2 in the ISO-32000 PDF spec. This shows the mapping
diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml
index dac5f00d..1df6e788 100644
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@@ -2612,6 +2612,31 @@ outfile.pdf</option>
</varlistentry>
</variablelist>
</sect1>
+ <sect1 id="ref.unicode-files">
+ <title>A Note About Unicode File Names</title>
+ <para>
+ When strings are passed to qpdf library routines either as
+ <literal>char*</literal> or as <literal>std::string</literal>,
+ they are treated as byte arrays except where otherwise noted. When
+ Unicode is desired, qpdf wants UTF-8 unless otherwise noted in
+ comments in header files. In modern UNIX/Linux environments, this
+ generally does the right thing. In Windows, it's a bit more
+ complicated. Starting in qpdf 8.4.0, passwords that contain
+ Unicode characters are handled much better, and starting in qpdf
+ 8.4.1, the library attempts to properly handle Unicode characters
+ in filenames. In particular, in Windows, if a UTF-8 encoded string
+ is used as a filename in either <classname>QPDF</classname> or
+ <classname>QPDFWriter</classname>, it is internally converted to
+ <literal>wchar_t*</literal>, and Unicode-aware Windows APIs are
+ used. As such, qpdf will generally operate properly on files with
+ non-ASCII characters in their names as long as the filenames are
+ UTF-8 encoded for passing into the qpdf library API, but there are
+ still some rough edges, such as the encoding of the filenames in
+ error messages or CLI output messages. Patches or bug reports are
+ welcome for any continuing issues with Unicode file names in
+ Windows.
+ </para>
+ </sect1>
</chapter>
<chapter id="ref.json">
<title>QPDF JSON</title>
diff --git a/qpdf/build.mk b/qpdf/build.mk
index 40de3617..87038c79 100644
--- a/qpdf/build.mk
+++ b/qpdf/build.mk
@@ -5,7 +5,8 @@ BINS_qpdf = \
test_large_file \
test_pdf_doc_encoding \
test_pdf_unicode \
- test_tokenizer
+ test_tokenizer \
+ test_unicode_filenames
CBINS_qpdf = qpdf-ctest
TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
@@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc)
XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE)
XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK)
+XCXXFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_COMPILE)
+XLDFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_LINK)
$(foreach B,$(BINS_qpdf),$(eval \
OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc)))
diff --git a/qpdf/qtest/qpdf.test b/qpdf/qtest/qpdf.test
index ec5eb3c1..e95c22bc 100644
--- a/qpdf/qtest/qpdf.test
+++ b/qpdf/qtest/qpdf.test
@@ -135,7 +135,7 @@ foreach my $c (@completion_tests)
show_ntests();
# ----------
$td->notify("--- Argument Parsing ---");
-$n_tests += 8;
+$n_tests += 6;
$td->runtest("required argument",
{$td->COMMAND => "qpdf --password minimal.pdf"},
@@ -167,10 +167,21 @@ $td->runtest("extra overlay filename",
{$td->REGEXP => ".*overlay file already specified.*",
$td->EXIT_STATUS => 2},
$td->NORMALIZE_NEWLINES);
+
+show_ntests();
+# ----------
+$td->notify("--- Unicode Filenames ---");
+$n_tests += 3;
+
+$td->runtest("create unicode filenames",
+ {$td->COMMAND => "test_unicode_filenames"},
+ {$td->STRING => "created Unicode filenames\n",
+ $td->EXIT_STATUS => 0},
+ $td->NORMALIZE_NEWLINES);
+
foreach my $d (['auto-ü', 1], ['auto-öπ', 2])
{
my ($u, $n) = @$d;
- copy('minimal.pdf', "$u.pdf");
$td->runtest("unicode filename $u",
{$td->COMMAND => "qpdf --check $u.pdf"},
{$td->FILE => "check-unicode-filename-$n.out",
diff --git a/qpdf/test_unicode_filenames.cc b/qpdf/test_unicode_filenames.cc
new file mode 100644
index 00000000..45701a9f
--- /dev/null
+++ b/qpdf/test_unicode_filenames.cc
@@ -0,0 +1,81 @@
+#ifdef _WIN32
+#include <windows.h>
+#include <direct.h>
+#include <io.h>
+#endif
+
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+
+// Copy everything readable from `in` to `out`, then close both streams.
+//
+// in  - source stream, or null if it could not be opened
+// out - destination stream, or null if it could not be opened
+//
+// Prints a message to stderr and exits with status 2 if either stream
+// is null, or if reading, writing, or flushing the output fails.
+static void do_copy(FILE* in, FILE* out)
+{
+    if ((in == 0) || (out == 0))
+    {
+        std::cerr << "errors opening files" << std::endl;
+        exit(2);
+    }
+    char buf[10240];
+    size_t len = 0;
+    bool failed = false;
+    while ((len = fread(buf, 1, sizeof(buf), in)) > 0)
+    {
+        // A short write means the output file is incomplete.
+        if (fwrite(buf, 1, len, out) != len)
+        {
+            failed = true;
+            break;
+        }
+    }
+    // fread returns 0 both at end of file and on error, so the return
+    // value alone cannot tell them apart; ask the stream instead.
+    if (ferror(in))
+    {
+        failed = true;
+    }
+    fclose(in);
+    // fclose flushes buffered output, so a failure here is a write error.
+    if (fclose(out) != 0)
+    {
+        failed = true;
+    }
+    if (failed)
+    {
+        std::cerr << "errors reading or writing" << std::endl;
+        exit(2);
+    }
+}
+
+#ifdef WINDOWS_WMAIN
+
+// WINDOWS_WMAIN build: copy minimal.pdf to the file named by the
+// wide-character string `outname`. Failed opens leave a null stream,
+// which do_copy receives.
+void copy(wchar_t const* outname)
+{
+    FILE* src = 0;
+    FILE* dst = 0;
+#ifdef _MSC_VER
+    _wfopen_s(&src, L"minimal.pdf", L"rb");
+    _wfopen_s(&dst, outname, L"wb");
+#else
+    src = _wfopen(L"minimal.pdf", L"rb");
+    dst = _wfopen(outname, L"wb");
+#endif
+    do_copy(src, dst);
+}
+
+// Entry point for the WINDOWS_WMAIN build: creates the two test files
+// with non-ASCII names and reports success on stdout. The arguments
+// are not used.
+extern "C"
+int wmain(int argc, wchar_t* argv[])
+{
+    // Names are spelled with wide-character escapes (Unicode).
+    wchar_t const* const names[] = {
+        L"auto-\xfc.pdf",
+        L"auto-\xf6\x03c0.pdf"
+    };
+    size_t const n_names = sizeof(names) / sizeof(names[0]);
+    for (size_t i = 0; i < n_names; ++i)
+    {
+        copy(names[i]);
+    }
+    std::cout << "created Unicode filenames" << std::endl;
+    return 0;
+}
+
+#else
+
+// Non-wmain build: copy minimal.pdf to the file named by `outname`.
+// The source is opened before the destination; failed opens leave a
+// null stream, which do_copy receives.
+void copy(char const* outname)
+{
+    FILE* src = fopen("minimal.pdf", "rb");
+    FILE* dst = fopen(outname, "wb");
+    do_copy(src, dst);
+}
+
+// Entry point for the non-wmain build: creates the two test files
+// with non-ASCII names and reports success on stdout. The arguments
+// are not used.
+int main(int argc, char* argv[])
+{
+    // Names are spelled as explicit UTF-8 byte sequences.
+    char const* const names[] = {
+        "auto-\xc3\xbc.pdf",
+        "auto-\xc3\xb6\xcf\x80.pdf"
+    };
+    size_t const n_names = sizeof(names) / sizeof(names[0]);
+    for (size_t i = 0; i < n_names; ++i)
+    {
+        copy(names[i]);
+    }
+    std::cout << "created Unicode filenames" << std::endl;
+    return 0;
+}
+
+#endif