fhp_encoding/
decode.rs

1//! Decoding raw bytes to UTF-8 strings.
2//!
3//! Uses [`encoding_rs`] for the actual byte-to-character conversion.
4
5use encoding_rs::Encoding;
6use fhp_core::error::EncodingError;
7
8/// Decode raw bytes using a known encoding.
9///
10/// Returns the decoded UTF-8 string. If the input is already valid UTF-8 and
11/// `encoding` is UTF-8, this avoids copying via [`Cow`](std::borrow::Cow).
12///
13/// # Errors
14///
15/// Returns [`EncodingError::MalformedInput`] if the decoder encounters
16/// bytes that cannot be mapped to Unicode and the encoding does not use
17/// a replacement character (in practice, `encoding_rs` always replaces
18/// unmappable bytes, so this is defensive).
19///
20/// # Example
21///
22/// ```
23/// use fhp_encoding::decode;
24/// use encoding_rs::UTF_8;
25///
26/// let text = decode(b"Hello, world!", UTF_8).unwrap();
27/// assert_eq!(text, "Hello, world!");
28/// ```
29pub fn decode(input: &[u8], encoding: &'static Encoding) -> Result<String, EncodingError> {
30    // Strip BOM if present.
31    let input = strip_bom(input, encoding);
32
33    let (cow, _actual_encoding, had_errors) = encoding.decode(input);
34
35    if had_errors {
36        // Find approximate offset of first malformed byte.
37        let offset = find_first_error_offset(input, encoding);
38        return Err(EncodingError::MalformedInput {
39            encoding: encoding.name(),
40            offset,
41        });
42    }
43
44    Ok(cow.into_owned())
45}
46
47/// Auto-detect encoding and decode in one step.
48///
49/// Combines [`detect`](crate::detect::detect) and [`decode`] for convenience.
50///
51/// # Errors
52///
53/// Returns [`EncodingError::MalformedInput`] on decode failure.
54///
55/// # Example
56///
57/// ```
58/// use fhp_encoding::decode_or_detect;
59///
60/// let html = b"<html><body>Hello</body></html>";
61/// let (text, encoding) = decode_or_detect(html).unwrap();
62/// assert!(text.contains("Hello"));
63/// assert_eq!(encoding.name(), "UTF-8");
64/// ```
65pub fn decode_or_detect(input: &[u8]) -> Result<(String, &'static Encoding), EncodingError> {
66    let encoding = crate::detect::detect(input);
67    let text = decode(input, encoding)?;
68    Ok((text, encoding))
69}
70
71/// Strip the BOM prefix if it matches the given encoding.
72fn strip_bom<'a>(input: &'a [u8], encoding: &'static Encoding) -> &'a [u8] {
73    if encoding == encoding_rs::UTF_8
74        && input.len() >= 3
75        && input[0] == 0xEF
76        && input[1] == 0xBB
77        && input[2] == 0xBF
78    {
79        return &input[3..];
80    }
81    if encoding == encoding_rs::UTF_16LE && input.len() >= 2 && input[0] == 0xFF && input[1] == 0xFE
82    {
83        return &input[2..];
84    }
85    if encoding == encoding_rs::UTF_16BE && input.len() >= 2 && input[0] == 0xFE && input[1] == 0xFF
86    {
87        return &input[2..];
88    }
89    input
90}
91
92/// Find the approximate byte offset of the first decoding error.
93///
94/// Uses a binary-search-like approach: decode prefixes to narrow down the
95/// first problematic byte. Falls back to 0 if the error is in the first byte.
96fn find_first_error_offset(input: &[u8], encoding: &'static Encoding) -> usize {
97    // Simple linear scan with small chunks for accuracy.
98    let mut decoder = encoding.new_decoder_without_bom_handling();
99    let mut output = vec![0u8; 1024];
100    let mut total_read = 0usize;
101
102    for chunk in input.chunks(256) {
103        let is_last = total_read + chunk.len() >= input.len();
104        let (result, read, _) =
105            decoder.decode_to_utf8_without_replacement(chunk, &mut output, is_last);
106        if let encoding_rs::DecoderResult::Malformed(_, _) = result {
107            return total_read + read;
108        }
109        total_read += read;
110    }
111    0
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII decoded as UTF-8 comes back unchanged.
    #[test]
    fn decode_utf8() {
        let decoded = decode(b"Hello, world!", encoding_rs::UTF_8).unwrap();
        assert_eq!(decoded, "Hello, world!");
    }

    /// A leading UTF-8 BOM is not part of the decoded text.
    #[test]
    fn decode_utf8_with_bom() {
        let with_bom = b"\xEF\xBB\xBFHello";
        let decoded = decode(with_bom, encoding_rs::UTF_8).unwrap();
        assert_eq!(decoded, "Hello");
    }

    /// "café" with a single-byte 0xE9, decoded as windows-1252.
    #[test]
    fn decode_latin1() {
        let bytes = b"caf\xe9";
        let decoded = decode(bytes, encoding_rs::WINDOWS_1252).unwrap();
        assert_eq!(decoded, "caf\u{00e9}");
    }

    /// ö, ü, ç encoded as single bytes and decoded as windows-1254 (Turkish).
    #[test]
    fn decode_windows_1252_turkish() {
        let bytes: &[u8] = &[0xF6, 0xFC, 0xE7];
        let decoded = decode(bytes, encoding_rs::WINDOWS_1254).unwrap();
        assert_eq!(decoded, "\u{00f6}\u{00fc}\u{00e7}");
    }

    /// ASCII-only HTML is detected as UTF-8 and decoded.
    #[test]
    fn decode_or_detect_utf8() {
        let page = b"<html>Hello</html>";
        let (decoded, detected) = decode_or_detect(page).unwrap();
        assert_eq!(detected.name(), "UTF-8");
        assert!(decoded.contains("Hello"));
    }

    /// A `<meta charset>` declaration selects windows-1254, so the bytes
    /// 0xFE, 0xF0, 0xFD decode to ş, ğ, ı.
    #[test]
    fn decode_or_detect_with_meta() {
        let page =
            b"<html><head><meta charset=\"windows-1254\"></head><body>\xFE\xF0\xFD</body></html>";
        let (decoded, detected) = decode_or_detect(page).unwrap();
        assert_eq!(detected.name(), "windows-1254");
        assert!(decoded.contains('\u{015F}')); // ş
        assert!(decoded.contains('\u{011F}')); // ğ
        assert!(decoded.contains('\u{0131}')); // ı
    }

    /// Empty input decodes to an empty string.
    #[test]
    fn decode_empty() {
        let decoded = decode(b"", encoding_rs::UTF_8).unwrap();
        assert_eq!(decoded, "");
    }
}
fhp_encoding/decode.rs

fhp_encoding/
decode.rs