paths_as_strings/
lib.rs

1use std::borrow::Cow;
2use std::path::{Path, PathBuf};
3use std::ffi::{OsStr, OsString};
4
5/// Converts the Path `P` to a UTF-8 string which can be safely written to a file
6/// irrespective of whether the original Path contains unprintable characters
7/// or is an invalid UTF-8 string. If the Path is a valid UTF-8 string and
8/// contains no control characters such as `\t` it is returned as-is, otherwise
9/// it is encoded as a Base-64 string and given a special prefix which means
10/// the resultant string can be unambiguously detected as an encoded path rather
11/// than an actual path. This conversion can be reversed using the `decode_path`
12/// function.
13pub fn encode_path<P>(p: &P) -> Cow<str>
14    where P: AsRef<Path>
15{
16    let p = p.as_ref();
17
18    if let Some(s) = p.to_str() {
19        if !should_be_encoded(s) {
20            return Cow::Borrowed(s);
21        }
22    }
23
24    Cow::Owned(encode_os(p.as_os_str()))
25}
26
27/// Reverses the encoding of a Path performed by `encode_path`. This function
28/// should always be used to reverse the encoding, as it will correctly detect
29/// whether the string 'S' is an actual path or one that was Base-64 encoded.
30/// The function will only return an error if the Path was the Base-64 encoded
31/// form and the encoding has been tampered with.
32pub fn decode_path(encoded_path_string: &str) -> Result<PathBuf, base64::DecodeError>
33{
34    if encoded_path_string.starts_with(PREFIX) {
35        let bytes = decode_bytes(encoded_path_string)?;
36        let os_str = decode_os(bytes);
37        Ok(PathBuf::from(os_str))
38    } else {
39        Ok(PathBuf::from(encoded_path_string))
40    }
41}
42
/// Marker placed in front of Base-64 encoded paths on Windows.
///
/// Drive letters must be A-Z, single character only. Therefore a string that
/// starts with `::` always represents an invalid path (note also that, apart
/// from the colon following a drive letter, ':' is not a legal character in
/// Windows file names). `decode_path` uses this marker to tell encoded strings
/// apart from literal paths.
#[cfg(windows)]
const PREFIX: &str = "::\\_";
48
/// Marker placed in front of Base-64 encoded paths on non-Windows platforms.
///
/// On Unix (which also means BSD, Android, OSX...), filenames can contain any byte
/// except '\0' and '/', which makes formulating an impossible filename very difficult
/// (since we can't use a zero-byte in a printable string and '/' is the usual
/// directory separator). You can even use filenames such as '/../../b64' in the shell
/// and File::create() and they work ok, because the '..' entry in the root directory
/// is a link back to the root directory, making it impossible to 'escape' the
/// filesystem (very clever, Unix guys).
/// However, you cannot have a file under '/dev/null' because it is defined as a file
/// in POSIX! http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap10.html
/// Therefore any path beginning with '/dev/null/' will be an invalid path.
/// Baldrick levels of cunning going on here.
#[cfg(not(windows))]
const PREFIX: &str = "/dev/null/b64_";
62
/// Reports whether a path that is already valid UTF-8 should nevertheless be
/// encoded. This is the case when it contains any control character: names
/// with newlines or '\b' in them are hard to write to a text file in a
/// sensible manner.
fn should_be_encoded(s: &str) -> bool {
    s.chars().any(char::is_control)
}
69
/// Windows flavour of the OS-string encoder: the string's UTF-16 code units
/// are flattened to bytes via `u16_slice_to_byte_array` and the result is
/// handed to `encode_bytes`.
#[cfg(windows)]
fn encode_os(s: &OsStr) -> String {
    use std::os::windows::ffi::OsStrExt;

    let units: Vec<u16> = s.encode_wide().collect();
    encode_bytes(&u16_slice_to_byte_array(&units))
}
78
79#[cfg(not(windows))]
80fn encode_os(s: &OsStr) -> String {
81    use std::os::unix::ffi::OsStrExt;
82
83    let bytes = s.as_bytes();
84    encode_bytes(bytes)
85}
86
87/// A small wrapper around the 'encode' call to the base64 library to ensure
88/// we do it the same way every time.
89fn encode_bytes(bytes: &[u8]) -> String {
90    let mut b64 = PREFIX.to_string();
91    base64::encode_config_buf(bytes, base64::STANDARD, &mut b64);
92    b64
93}
94
95/// A small wrapper around the 'decode' call to the base64 library to ensure
96/// we do it the same way every time. The decode will not fail unless the
97/// previously encoded string is messed with in some way, but that is a
98/// distinct possibility in human-editable files, either by malice or misfortune.
99fn decode_bytes(encoded_str: &str) -> Result<Vec<u8>, base64::DecodeError> {
100    let encoded_bytes = &encoded_str[PREFIX.len()..];
101    base64::decode_config(encoded_bytes, base64::STANDARD)
102}
103
/// Non-Windows flavour of the OS-string decoder: the bytes are taken over
/// as-is to form the `OsString`.
#[cfg(not(windows))]
pub(crate) fn decode_os(bytes: Vec<u8>) -> OsString {
    use std::os::unix::ffi::OsStringExt;

    <OsString as OsStringExt>::from_vec(bytes)
}
110
/// Windows flavour of the OS-string decoder: consecutive byte pairs are
/// combined into UTF-16 code units with `bytes_to_u16` (first byte of each
/// pair is the high byte) and the units are turned into an `OsString`.
///
/// An empty input yields an empty `OsString`. A trailing unpaired byte is
/// ignored.
#[cfg(windows)]
pub(crate) fn decode_os(bytes: Vec<u8>) -> OsString {
    use std::os::windows::ffi::OsStringExt;

    // `chunks_exact` simply yields nothing for an empty slice; a hand-written
    // index loop bounded by `bytes.len() - 1` underflows in that case.
    let wide_chars: Vec<u16> = bytes
        .chunks_exact(2)
        .map(|pair| bytes_to_u16(pair[0], pair[1]))
        .collect();

    OsString::from_wide(&wide_chars)
}
125
/// Combines two bytes into one `u16`, with `b1` as the high byte and `b2` as
/// the low byte.
#[cfg(windows)]
#[inline]
fn bytes_to_u16(b1: u8, b2: u8) -> u16 {
    (u16::from(b1) << 8) | u16::from(b2)
}
132
/// Splits a `u16` into two bytes, high byte first.
#[cfg(windows)]
#[inline]
fn u16_to_bytes(value: u16) -> [u8; 2] {
    let high = (value >> 8) as u8;
    let low = (value & 0x00ff) as u8;
    [high, low]
}
140
/// Flattens a slice of `u16` values into a byte vector, emitting the two bytes
/// produced by `u16_to_bytes` for each value in order.
#[cfg(windows)]
fn u16_slice_to_byte_array(wides: &[u16]) -> Vec<u8> {
    let mut out = Vec::with_capacity(wides.len() * 2);
    for unit in wides {
        out.extend_from_slice(&u16_to_bytes(*unit));
    }
    out
}
151
152
#[cfg(test)]
mod tests {
    use std::path::PathBuf;
    use super::*;

    // On Unix, only the '\0' and '/' are invalid in filenames but any
    // other byte sequence is valid.
    //
    // For UTF-8 these bytes are forbidden *anywhere* in the byte sequence
    // (see https://en.wikipedia.org/wiki/UTF-8#Codepage_layout):
    //
    //     0xc0 (192), 0xc1 (193)
    //     0xf5 (245) to 0xff (255) inclusive
    //
    // Therefore a sequence including such bytes will be a valid path but not a valid Rust String.
    // This is "Hello" followed by an invalid byte.
    #[cfg(unix)]
    const INVALID_UTF8_BYTE_SEQUENCE: [u8; 6] = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0xc0];

    // On Windows, the following characters are invalid in filenames according to
    // https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file
    //
    //     < (less than)
    //     > (greater than)
    //     : (colon - sometimes works, but is actually NTFS Alternate Data Streams)
    //     " (double quote)
    //     / (forward slash)
    //     \ (backslash)
    //     | (vertical bar or pipe)
    //     ? (question mark)
    //     * (asterisk)
    //
    // However, note that these are all printable characters.
    // Windows also bans bytes 0..31 (the ASCII control characters) - so no
    // tabs, bells or newlines in filenames.
    //
    // On Windows, paths are UTF-16-le, not UTF-8. So we need to make a UTF-16
    // string that is not a valid UTF-8 string.
    // This is an invalid byte sequence according to http://unicode.org/faq/utf_bom.html#utf16-7
    // path.display() works, and prints "Hello\u{d800}H", but path.to_str() will return None.
    // Windows will accept this as a valid path, but it is not a valid Rust String.
    #[cfg(windows)]
    const INVALID_UTF16_BYTE_SEQUENCE: [u16; 7] = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0xd800, 0x48]; // "Hello\u{d800}H"

    #[test]
    fn for_utf8_which_does_not_need_encoding() {
        let pb = PathBuf::new();
        let s = encode_path(&pb);
        assert_eq!(s, "", "Empty paths should be empty strings.");
        let pb2 = decode_path(&s).unwrap();
        assert_eq!(pb2, pb, "Empty paths should be round-trippable.");

        let pb = PathBuf::from("hello");
        let s = encode_path(&pb);
        assert_eq!(s, "hello", "Valid UTF-8 paths without control chars should be encoded as-is.");
        let pb2 = decode_path(&s).unwrap();
        assert_eq!(pb2, pb, "Valid UTF-8 paths without control chars should be round-trippable.");
    }

    #[cfg(unix)]
    #[test]
    fn for_valid_utf8_needing_unix_encoding() {
        // There are separate Unix and Windows tests because on Windows a valid UTF-8 string
        // will still be treated as UTF-16 wide chars by the time it is encoded.
        let pb = PathBuf::from("hello\tworld");
        let s = encode_path(&pb);
        assert_eq!(s, format!("{}aGVsbG8Jd29ybGQ=", PREFIX), "Paths with control characters in them should be base-64 encoded.");
        let pb2 = decode_path(&s).unwrap();
        assert_eq!(pb2, pb, "Paths with control characters in them should be round-trippable.");
    }

    #[cfg(windows)]
    #[test]
    fn for_valid_utf8_needing_windows_encoding() {
        // There are separate Unix and Windows tests because on Windows a valid UTF-8 string
        // will still be treated as UTF-16 wide chars by the time it is encoded.
        let pb = PathBuf::from("hello\tworld");
        // Use the crate's current entry points (`encode_path` / `decode_path`),
        // exactly as the Unix counterpart of this test does.
        let s = encode_path(&pb);
        assert_eq!(s, format!("{}AGgAZQBsAGwAbwAJAHcAbwByAGwAZA==", PREFIX), "Paths with control characters in them should be base-64 encoded.");
        let pb2 = decode_path(&s).unwrap();
        assert_eq!(pb2, pb, "Paths with control characters in them should be round-trippable.");
    }

    #[cfg(unix)]
    #[test]
    fn for_invalid_utf8() {
        let os = decode_os(INVALID_UTF8_BYTE_SEQUENCE.to_vec());
        let pb = PathBuf::from(os);
        let s = encode_path(&pb);
        assert_eq!(s, format!("{}SGVsbG/A", PREFIX), "Invalid UTF-8 byte sequences should be base-64 encoded.");
        let pb2 = decode_path(&s).unwrap();
        assert_eq!(pb2, pb, "Invalid UTF-8 byte sequences should be round-trippable.");
    }

    #[cfg(windows)]
    #[test]
    fn for_invalid_utf16() {
        let bytes = u16_slice_to_byte_array(&INVALID_UTF16_BYTE_SEQUENCE);
        let os = decode_os(bytes);
        let pb = PathBuf::from(os);
        let s = encode_path(&pb);
        assert_eq!(s, format!("{}AEgAZQBsAGwAb9gAAEg=", PREFIX), "Invalid UTF-16 byte sequences should be base-64 encoded.");
        // `decode_path` returns a Result; unwrap it before comparing with the PathBuf.
        let pb2 = decode_path(&s).unwrap();
        assert_eq!(pb2, pb, "Invalid UTF-16 byte sequences should be round-trippable.");
    }

    #[cfg(unix)]
    #[test]
    fn decode_for_mangled_base64_returns_err() {
        // Create a path that will get Base-64 encoded.
        // \x11 is just a random control character.
        let mut s = encode_path(&"Hello\x11world").into_owned();
        // Mangle the encoded string, as if a user manually edited it.
        s.push('\t');
        let decode_attempt = decode_path(&s);
        assert!(decode_attempt.is_err(), "Tabs are not valid in Base-64 encoded strings, so we should get an error when decoding it.");
    }
}