1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
pub mod chars;
pub mod path;
use std::path::PathBuf;
pub use chars::CHARS;
pub use path::PathUtf8Encoder;
#[cfg(test)]
pub mod tests;
/// Conversion of a value into guaranteed-valid UTF-8 text.
///
/// This file provides a blanket implementation for every `T: AsRef<[u8]>`,
/// so byte slices, arrays, vectors and strings can all call these methods.
pub trait Utf8Encoder {
    /// Returns the value rendered as an owned [`String`].
    ///
    /// The blanket byte implementation keeps valid UTF-8 as-is and substitutes
    /// a character from the [`CHARS`] table for every byte that is not part of
    /// a valid sequence.
    fn to_utf8_string(&self) -> String;

    /// Builds a [`PathBuf`] from the text produced by
    /// [`Self::to_utf8_string`].
    ///
    /// The path is constructed from the decoded string, so for the byte
    /// implementation valid UTF-8 runs are kept verbatim while every other
    /// byte is swapped for its [`CHARS`] fallback character. The conversion is
    /// therefore **lossy** whenever the input contains non-UTF-8 bytes.
    ///
    /// Do not use this method when the original bytes must be preserved
    /// exactly (e.g. to open an existing file with a non-UTF-8 name on Unix).
    /// In that case turn the bytes into an
    /// [`OsString`](std::ffi::OsString) with a platform-specific API such as
    /// `std::os::unix::ffi::OsStringExt::from_vec`, and convert that into a
    /// [`PathBuf`] instead.
    ///
    /// # Example
    ///
    /// ```
    /// use std::path::PathBuf;
    /// use ps_str::Utf8Encoder;
    ///
    /// let bytes = b"hello.txt";
    ///
    /// assert_eq!(bytes.to_utf8_path(), PathBuf::from("hello.txt"));
    /// ```
    fn to_utf8_path(&self) -> PathBuf {
        let decoded = self.to_utf8_string();
        PathBuf::from(decoded)
    }
}
impl<T: AsRef<[u8]>> Utf8Encoder for T {
    /// Decodes the bytes of `self` into a `String`, substituting a fallback
    /// character for every byte that is not part of valid UTF-8.
    ///
    /// The input is consumed front to back. Each pass validates the remaining
    /// bytes: if they are entirely valid UTF-8 they are appended and decoding
    /// is finished. Otherwise the valid prefix is appended as-is, the single
    /// byte at the error position is replaced by its entry in [`CHARS`], and
    /// decoding resumes with the byte that follows it.
    ///
    /// # Returns
    ///
    /// A `String` holding the decoded text. It is always valid UTF-8, even
    /// when the input contained invalid byte sequences.
    ///
    /// # Example
    ///
    /// ```
    /// use ps_str::Utf8Encoder;
    ///
    /// let bytes = b"Hello\xFFWorld";
    /// let result = bytes.to_utf8_string();
    ///
    /// assert_eq!(result, "Hello˙World");
    /// ```
    fn to_utf8_string(&self) -> String {
        let input = self.as_ref();
        let mut decoded = String::with_capacity(input.len());
        // Bytes that still have to be decoded; shrinks from the front.
        let mut pending = input;

        while !pending.is_empty() {
            let error = match std::str::from_utf8(pending) {
                Ok(tail) => {
                    // Everything that is left is valid: take it and stop.
                    decoded.push_str(tail);
                    break;
                }
                Err(error) => error,
            };

            let (valid, invalid_onward) = pending.split_at(error.valid_up_to());
            // SAFETY: `valid_up_to()` is the length of the prefix that was
            // verified as valid UTF-8, so `valid` holds only complete, valid
            // characters (it may be empty).
            decoded.push_str(unsafe { std::str::from_utf8_unchecked(valid) });

            // Replace exactly one offending byte, then continue right after it.
            pending = match invalid_onward.split_first() {
                Some((&bad, after)) => {
                    decoded.push(CHARS[usize::from(bad)]);
                    after
                }
                None => invalid_onward,
            };
        }

        decoded
    }
}