aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog8
-rw-r--r--include/qpdf/QUtil.hh22
-rw-r--r--libqpdf/QUtil.cc93
-rw-r--r--libtests/qtest/qutil/qutil.out13
-rw-r--r--libtests/qutil.cc22
5 files changed, 158 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index 8f1ed679..992cf507 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -14,6 +14,14 @@
the first bug in qpdf's history that could result in silent loss
of data when processing a correct input file. Fixes #276.
+2019-01-15 Jay Berkenbilt <ejb@ql.org>
+
+ * Add QUtil::possible_repaired_encodings which, given a string,
+ generates other strings that represent re-interpretation of the
+ bytes in a different coding system. This is used to help recover
+ passwords if the password string was improperly encoded on a
+ different system due to user error or a software bug.
+
2019-01-14 Jay Berkenbilt <ejb@ql.org>
* Add new CLI flags to 128-bit and 256-bit encryption: --assemble,
diff --git a/include/qpdf/QUtil.hh b/include/qpdf/QUtil.hh
index 5fe8e97c..02dec5ad 100644
--- a/include/qpdf/QUtil.hh
+++ b/include/qpdf/QUtil.hh
@@ -223,6 +223,28 @@ namespace QUtil
bool& is_valid_utf8,
bool& is_utf16);
+ // Try to compensate for previously incorrectly encoded strings.
+ // We want to compensate for the following errors:
+ //
+ // * The string was supposed to be UTF-8 but was one of the
+ // single-byte encodings (PDF Doc, WinAnsi, or MacRoman)
+ // * The string was supposed to be PDF Doc but was either UTF-8 or
+ // one of the other single-byte encodings
+ //
+ // The returned vector always contains the original string first,
+ // and then it contains what the correct string would be in the
+ // event that the original string was the result of any of the
+ // above errors. Duplicate candidates are removed, keeping the
+ // first occurrence. If the string contains no 8-bit characters,
+ // the returned vector contains only the original string.
+ //
+ // This method is useful for attempting to recover a password that
+ // may have been previously incorrectly encoded. For example, if
+ // the password was supposed to be UTF-8 but the previous
+ // application used a password encoded in WinAnsi, or if the
+ // previous password was supposed to be PDFDoc but was actually
+ // given as UTF-8 or WinAnsi, this method would find the correct
+ // password.
+ QPDF_DLL
+ std::vector<std::string> possible_repaired_encodings(std::string);
+
// If secure random number generation is supported on your
// platform and qpdf was not compiled with insecure random number
// generation, this returns a cryptographically secure random
diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc
index e645c4fc..58646ade 100644
--- a/libqpdf/QUtil.cc
+++ b/libqpdf/QUtil.cc
@@ -15,6 +15,7 @@
#include <sstream>
#include <fstream>
#include <stdexcept>
+#include <set>
#include <stdio.h>
#include <errno.h>
#include <ctype.h>
@@ -1992,3 +1993,95 @@ QUtil::analyze_encoding(std::string const& val,
is_valid_utf8 = true;
}
}
+
+// Build the list of candidate strings obtained by re-interpreting
+// the bytes of "supplied" under a different encoding. The original
+// string is always the first element; candidates are appended in the
+// order they are generated, and duplicates are dropped at the end
+// keeping the first occurrence. "supplied" is taken by value because
+// it is overwritten below when the input is detected as UTF-16.
+std::vector<std::string>
+QUtil::possible_repaired_encodings(std::string supplied)
+{
+ std::vector<std::string> result;
+ // Always include the original string
+ result.push_back(supplied);
+ bool has_8bit_chars = false;
+ bool is_valid_utf8 = false;
+ bool is_utf16 = false;
+ analyze_encoding(supplied, has_8bit_chars, is_valid_utf8, is_utf16);
+ if (! has_8bit_chars)
+ {
+ // analyze_encoding reported no 8-bit characters, so no
+ // alternatives are generated; only the original is returned.
+ return result;
+ }
+ if (is_utf16)
+ {
+ // Convert to UTF-8 and pretend we got a UTF-8 string.
+ // Note that result[0] still holds the original UTF-16 bytes;
+ // the converted UTF-8 form itself is not added to result.
+ is_utf16 = false;
+ is_valid_utf8 = true;
+ supplied = utf16_to_utf8(supplied);
+ }
+ // Scratch buffer shared by every utf8_to_* call below. A candidate
+ // is kept only when the call returns true.
+ // NOTE(review): presumably true means every character could be
+ // represented in the target encoding, and each call replaces
+ // rather than appends to output -- confirm against utf8_to_*.
+ std::string output;
+ if (is_valid_utf8)
+ {
+ // Maybe we were given UTF-8 but wanted one of the single-byte
+ // encodings.
+ if (utf8_to_pdf_doc(supplied, output))
+ {
+ result.push_back(output);
+ }
+ if (utf8_to_win_ansi(supplied, output))
+ {
+ result.push_back(output);
+ }
+ if (utf8_to_mac_roman(supplied, output))
+ {
+ result.push_back(output);
+ }
+ }
+ else
+ {
+ // Maybe we were given one of the single-byte encodings but
+ // wanted UTF-8.
+ // These three conversions return the string directly (no
+ // success flag), so each result is appended unconditionally.
+ std::string from_pdf_doc(pdf_doc_to_utf8(supplied));
+ result.push_back(from_pdf_doc);
+ std::string from_win_ansi(win_ansi_to_utf8(supplied));
+ result.push_back(from_win_ansi);
+ std::string from_mac_roman(mac_roman_to_utf8(supplied));
+ result.push_back(from_mac_roman);
+
+ // Maybe we were given one of the other single-byte encodings
+ // but wanted one of the other ones.
+ // Each UTF-8 interpretation above is re-encoded into the two
+ // single-byte encodings it did not come from (six pairs).
+ if (utf8_to_win_ansi(from_pdf_doc, output))
+ {
+ result.push_back(output);
+ }
+ if (utf8_to_mac_roman(from_pdf_doc, output))
+ {
+ result.push_back(output);
+ }
+ if (utf8_to_pdf_doc(from_win_ansi, output))
+ {
+ result.push_back(output);
+ }
+ if (utf8_to_mac_roman(from_win_ansi, output))
+ {
+ result.push_back(output);
+ }
+ if (utf8_to_pdf_doc(from_mac_roman, output))
+ {
+ result.push_back(output);
+ }
+ if (utf8_to_win_ansi(from_mac_roman, output))
+ {
+ result.push_back(output);
+ }
+ }
+ // De-duplicate
+ // Order is preserved: t receives each string the first time it is
+ // seen, so the original string stays at index 0.
+ std::vector<std::string> t;
+ std::set<std::string> seen;
+ for (std::vector<std::string>::iterator iter = result.begin();
+ iter != result.end(); ++iter)
+ {
+ if (! seen.count(*iter))
+ {
+ seen.insert(*iter);
+ t.push_back(*iter);
+ }
+ }
+ return t;
+}
diff --git a/libtests/qtest/qutil/qutil.out b/libtests/qtest/qutil/qutil.out
index c0789a36..c35f22e3 100644
--- a/libtests/qtest/qutil/qutil.out
+++ b/libtests/qtest/qutil/qutil.out
@@ -58,6 +58,19 @@ bidirectional pdf doc done
bidirectional win ansi done
bidirectional mac roman done
analysis done
+alternatives
+0: 86a9e99e
+1: c692c2a9c3a9c5be
+2: e280a0c2a9c3a9c5be
+3: c39cc2a9c388c3bb
+4: 83a9e99e
+5: 81a9e99e
+6: dca9c8fb
+0: c692c2a9c3a9c5be
+1: 86a9e99e
+2: 83a9e99e
+0: 717561636b
+done alternatives
---- whoami
quack1
quack2
diff --git a/libtests/qutil.cc b/libtests/qutil.cc
index 35877b9c..27881c6e 100644
--- a/libtests/qutil.cc
+++ b/libtests/qutil.cc
@@ -276,6 +276,16 @@ void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
}
}
+// Print every string returned by QUtil::possible_repaired_encodings
+// for str to standard output, one per line, in the form
+// "<index>: <QUtil::hex_encode of the string>".
+void print_alternatives(std::string const& str)
+{
+ std::vector<std::string> result = QUtil::possible_repaired_encodings(str);
+ size_t n = result.size();
+ for (size_t i = 0; i < n; ++i)
+ {
+ // at() is bounds-checked; i is always < n here.
+ std::cout << i << ": " << QUtil::hex_encode(result.at(i)) << std::endl;
+ }
+}
+
void transcoding_test()
{
transcoding_test(&QUtil::pdf_doc_to_utf8,
@@ -308,6 +318,18 @@ void transcoding_test()
assert(QUtil::utf8_to_pdf_doc(input1, output));
assert(! QUtil::utf8_to_pdf_doc(input2, output));
assert(QUtil::utf8_to_pdf_doc(input3, output));
+ std::cout << "alternatives" << std::endl;
+ // char name mac win pdf-doc
+ // U+0192 florin 304 203 206
+ // U+00A9 copyright 251 251 251
+ // U+00E9 eacute 216 351 351
+ // U+017E zcaron - 236 236
+ std::string pdfdoc = "\206\251\351\236";
+ std::string utf8 = QUtil::pdf_doc_to_utf8(pdfdoc);
+ print_alternatives(pdfdoc);
+ print_alternatives(utf8);
+ print_alternatives("quack");
+ std::cout << "done alternatives" << std::endl;
}
void print_whoami(char const* str)