use encoding_rs::Encoding;
use fhp_core::error::EncodingError;
/// Decodes `input` into an owned `String` using `encoding`.
///
/// A leading byte-order mark that matches `encoding` (UTF-8, UTF-16LE or
/// UTF-16BE) is removed by `strip_bom` first. The remaining bytes are handed
/// to [`Encoding::decode`], which performs its own BOM sniffing and reports
/// the encoding it actually used; that may differ from the one requested.
///
/// # Errors
///
/// Returns [`EncodingError::MalformedInput`] when the decoder reports
/// malformed input. The error carries the name of the encoding that was
/// actually used for decoding, and an offset expressed relative to the
/// `input` slice passed by the caller (the length of any BOM removed by
/// `strip_bom` is added back).
pub fn decode(input: &[u8], encoding: &'static Encoding) -> Result<String, EncodingError> {
    let body = strip_bom(input, encoding);
    let (text, actual_encoding, had_errors) = encoding.decode(body);
    if !had_errors {
        return Ok(text.into_owned());
    }

    // `strip_bom` only ever removes a prefix, so the length difference is the
    // number of bytes the caller's offsets are shifted by.
    let bom_len = input.len() - body.len();

    // Locate the error with the encoding the decoder really used; searching
    // with the requested encoding can miss the error entirely when BOM
    // sniffing switched encodings.
    Err(EncodingError::MalformedInput {
        encoding: actual_encoding.name(),
        offset: bom_len + find_first_error_offset(body, actual_encoding),
    })
}
/// Detects the encoding of `input` and decodes it in one step.
///
/// The encoding is chosen by `crate::detect::detect` and the bytes are then
/// decoded with [`decode`]. On success the decoded text is returned together
/// with the encoding that was detected.
///
/// # Errors
///
/// Propagates any error returned by [`decode`].
pub fn decode_or_detect(input: &[u8]) -> Result<(String, &'static Encoding), EncodingError> {
    let detected = crate::detect::detect(input);
    decode(input, detected).map(|decoded| (decoded, detected))
}
/// Removes a leading byte-order mark from `input` when it matches `encoding`.
///
/// Only UTF-8 (`EF BB BF`), UTF-16LE (`FF FE`) and UTF-16BE (`FE FF`) are
/// considered, and only the mark belonging to `encoding` itself is removed.
/// For every other encoding, or when the mark is absent, `input` is returned
/// unchanged.
fn strip_bom<'a>(input: &'a [u8], encoding: &'static Encoding) -> &'a [u8] {
    // Pick the mark that belongs to the requested encoding, if it has one.
    let bom: &[u8] = if encoding == encoding_rs::UTF_8 {
        b"\xEF\xBB\xBF"
    } else if encoding == encoding_rs::UTF_16LE {
        b"\xFF\xFE"
    } else if encoding == encoding_rs::UTF_16BE {
        b"\xFE\xFF"
    } else {
        return input;
    };

    input.strip_prefix(bom).unwrap_or(input)
}
/// Finds where the first malformed byte sequence in `input` begins.
///
/// The input is re-decoded in chunks with a decoder that performs no BOM
/// handling and never substitutes replacement characters; the decoded text
/// itself is discarded.
///
/// Returns the byte offset (relative to `input`) at which the first malformed
/// sequence starts, or `0` when the decoder reports no malformed sequence.
fn find_first_error_offset(input: &[u8], encoding: &'static Encoding) -> usize {
    const CHUNK_LEN: usize = 256;

    let mut decoder = encoding.new_decoder_without_bom_handling();
    // Scratch space only: every call writes from the start of the buffer.
    let mut scratch = [0u8; 1024];
    let mut consumed = 0usize;

    while consumed < input.len() {
        let end = input.len().min(consumed + CHUNK_LEN);
        let (result, read, _written) = decoder.decode_to_utf8_without_replacement(
            &input[consumed..end],
            &mut scratch,
            end == input.len(),
        );
        // Advance by what was really read so that a call stopped early by
        // `OutputFull` resumes at the right byte instead of skipping ahead.
        consumed += read;

        if let encoding_rs::DecoderResult::Malformed(bad_len, trailing) = result {
            // `consumed` now points past the malformed sequence and past any
            // bytes consumed after it; step back over both to reach its first
            // byte. The sequence may have started in an earlier chunk, which
            // is fine because `consumed` is an absolute position.
            return consumed.saturating_sub(usize::from(bad_len) + usize::from(trailing));
        }

        if read == 0 {
            // No forward progress; bail out rather than spin forever.
            break;
        }
    }
    0
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII decoded as UTF-8 comes back unchanged.
    #[test]
    fn decode_utf8() {
        let decoded = decode(b"Hello, world!", encoding_rs::UTF_8).unwrap();
        assert_eq!(decoded, "Hello, world!");
    }

    /// A leading UTF-8 byte-order mark does not appear in the decoded text.
    #[test]
    fn decode_utf8_with_bom() {
        let decoded = decode(b"\xEF\xBB\xBFHello", encoding_rs::UTF_8).unwrap();
        assert_eq!(decoded, "Hello");
    }

    /// Byte 0xE9 decodes to U+00E9 under windows-1252.
    #[test]
    fn decode_latin1() {
        let decoded = decode(b"caf\xe9", encoding_rs::WINDOWS_1252).unwrap();
        assert_eq!(decoded, "caf\u{00e9}");
    }

    /// Bytes 0xF6 0xFC 0xE7 decode to U+00F6 U+00FC U+00E7.
    ///
    /// Despite the test name, the encoding exercised here is windows-1254.
    #[test]
    fn decode_windows_1252_turkish() {
        let bytes: &[u8] = &[0xF6, 0xFC, 0xE7];
        let decoded = decode(bytes, encoding_rs::WINDOWS_1254).unwrap();
        assert_eq!(decoded, "\u{00f6}\u{00fc}\u{00e7}");
    }

    /// ASCII-only markup without a charset declaration is reported as UTF-8.
    #[test]
    fn decode_or_detect_utf8() {
        let (decoded, detected) = decode_or_detect(b"<html>Hello</html>").unwrap();
        assert_eq!(detected.name(), "UTF-8");
        assert!(decoded.contains("Hello"));
    }

    /// A `<meta charset>` declaration selects windows-1254, under which the
    /// body bytes 0xFE 0xF0 0xFD become U+015F, U+011F and U+0131.
    #[test]
    fn decode_or_detect_with_meta() {
        let markup =
            b"<html><head><meta charset=\"windows-1254\"></head><body>\xFE\xF0\xFD</body></html>";
        let (decoded, detected) = decode_or_detect(markup).unwrap();
        assert_eq!(detected.name(), "windows-1254");
        for expected in "\u{015F}\u{011F}\u{0131}".chars() {
            assert!(decoded.contains(expected));
        }
    }

    /// Empty input decodes to an empty string.
    #[test]
    fn decode_empty() {
        let decoded = decode(b"", encoding_rs::UTF_8).unwrap();
        assert_eq!(decoded, "");
    }
}