1use encoding_rs::Encoding;
6use fhp_core::error::EncodingError;
7
8pub fn decode(input: &[u8], encoding: &'static Encoding) -> Result<String, EncodingError> {
30 let input = strip_bom(input, encoding);
32
33 let (cow, _actual_encoding, had_errors) = encoding.decode(input);
34
35 if had_errors {
36 let offset = find_first_error_offset(input, encoding);
38 return Err(EncodingError::MalformedInput {
39 encoding: encoding.name(),
40 offset,
41 });
42 }
43
44 Ok(cow.into_owned())
45}
46
47pub fn decode_or_detect(input: &[u8]) -> Result<(String, &'static Encoding), EncodingError> {
66 let encoding = crate::detect::detect(input);
67 let text = decode(input, encoding)?;
68 Ok((text, encoding))
69}
70
71fn strip_bom<'a>(input: &'a [u8], encoding: &'static Encoding) -> &'a [u8] {
73 if encoding == encoding_rs::UTF_8
74 && input.len() >= 3
75 && input[0] == 0xEF
76 && input[1] == 0xBB
77 && input[2] == 0xBF
78 {
79 return &input[3..];
80 }
81 if encoding == encoding_rs::UTF_16LE && input.len() >= 2 && input[0] == 0xFF && input[1] == 0xFE
82 {
83 return &input[2..];
84 }
85 if encoding == encoding_rs::UTF_16BE && input.len() >= 2 && input[0] == 0xFE && input[1] == 0xFF
86 {
87 return &input[2..];
88 }
89 input
90}
91
92fn find_first_error_offset(input: &[u8], encoding: &'static Encoding) -> usize {
97 let mut decoder = encoding.new_decoder_without_bom_handling();
99 let mut output = vec![0u8; 1024];
100 let mut total_read = 0usize;
101
102 for chunk in input.chunks(256) {
103 let is_last = total_read + chunk.len() >= input.len();
104 let (result, read, _) =
105 decoder.decode_to_utf8_without_replacement(chunk, &mut output, is_last);
106 if let encoding_rs::DecoderResult::Malformed(_, _) = result {
107 return total_read + read;
108 }
109 total_read += read;
110 }
111 0
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII is valid UTF-8 and must round-trip unchanged.
    #[test]
    fn decode_utf8() {
        let text = decode(b"Hello, world!", encoding_rs::UTF_8).unwrap();
        assert_eq!(text, "Hello, world!");
    }

    /// A UTF-8 BOM must not show up in the decoded text.
    #[test]
    fn decode_utf8_with_bom() {
        let input = b"\xEF\xBB\xBFHello";
        let text = decode(input, encoding_rs::UTF_8).unwrap();
        assert_eq!(text, "Hello");
    }

    /// 0xE9 is U+00E9 (e with acute) in windows-1252.
    #[test]
    fn decode_latin1() {
        let input = b"caf\xe9";
        let text = decode(input, encoding_rs::WINDOWS_1252).unwrap();
        assert_eq!(text, "caf\u{00e9}");
    }

    /// 0xF6, 0xFC and 0xE7 decode to U+00F6, U+00FC and U+00E7 in
    /// windows-1254.
    #[test]
    fn decode_windows_1254_turkish() {
        let input: &[u8] = &[0xF6, 0xFC, 0xE7];
        let text = decode(input, encoding_rs::WINDOWS_1254).unwrap();
        assert_eq!(text, "\u{00f6}\u{00fc}\u{00e7}");
    }

    /// A UTF-16LE BOM is stripped before decoding.
    #[test]
    fn decode_utf16le_with_bom() {
        let input: &[u8] = &[0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        let text = decode(input, encoding_rs::UTF_16LE).unwrap();
        assert_eq!(text, "Hi");
    }

    /// A UTF-16BE BOM is stripped before decoding.
    #[test]
    fn decode_utf16be_with_bom() {
        let input: &[u8] = &[0xFE, 0xFF, 0x00, b'H', 0x00, b'i'];
        let text = decode(input, encoding_rs::UTF_16BE).unwrap();
        assert_eq!(text, "Hi");
    }

    /// 0xFF can never appear in well-formed UTF-8, so decoding must fail.
    #[test]
    fn decode_malformed_utf8_is_error() {
        assert!(decode(b"abc\xFFdef", encoding_rs::UTF_8).is_err());
    }

    #[test]
    fn decode_or_detect_utf8() {
        let input = b"<html>Hello</html>";
        let (text, enc) = decode_or_detect(input).unwrap();
        assert_eq!(enc.name(), "UTF-8");
        assert!(text.contains("Hello"));
    }

    /// The `<meta charset>` declaration selects windows-1254, in which
    /// 0xFE, 0xF0 and 0xFD are U+015F, U+011F and U+0131.
    #[test]
    fn decode_or_detect_with_meta() {
        let input =
            b"<html><head><meta charset=\"windows-1254\"></head><body>\xFE\xF0\xFD</body></html>";
        let (text, enc) = decode_or_detect(input).unwrap();
        assert_eq!(enc.name(), "windows-1254");
        assert!(text.contains('\u{015F}'));
        assert!(text.contains('\u{011F}'));
        assert!(text.contains('\u{0131}'));
    }

    /// Empty input decodes to an empty string.
    #[test]
    fn decode_empty() {
        let text = decode(b"", encoding_rs::UTF_8).unwrap();
        assert_eq!(text, "");
    }
}