use crate::{data::WINDOWS_1252, util::contains_zero_byte, util::le_u64, util::repeat_byte};
use std::borrow::Cow;
/// Strategy for turning a raw byte slice into text.
///
/// Implementations in this file exist for [`Windows1252Encoding`],
/// [`Utf8Encoding`], and for references and boxes of any implementor.
pub trait Encoding {
/// Decodes `data` into a string.
///
/// The returned [`Cow`] shares the lifetime of `data`, so an implementation
/// may hand back a borrowed view of the input or an owned, rewritten string.
fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str>;
}
/// Unit type selecting the Windows-1252 decoder (`decode_windows1252`).
///
/// Carries no state; it exists so the decoder can be used through the
/// [`Encoding`] trait.
#[derive(Debug, Default, Copy, Clone)]
pub struct Windows1252Encoding;
impl Windows1252Encoding {
pub fn new() -> Self {
Windows1252Encoding
}
pub fn decode(data: &[u8]) -> Cow<'_, str> {
decode_windows1252(data)
}
}
impl Encoding for Windows1252Encoding {
    /// Trait entry point; forwards to the inherent associated function
    /// `Windows1252Encoding::decode`, which does not need `self`.
    fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> {
        // `Self::decode` with a single argument resolves to the inherent
        // associated function, not back to this trait method.
        Self::decode(data)
    }
}
/// Lets a shared reference to any encoding (including unsized ones such as
/// trait objects) be used wherever an [`Encoding`] is expected.
impl<T> Encoding for &'_ T
where
    T: Encoding + ?Sized,
{
    fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> {
        // `self` is `&&T`; peel one layer and call the referent's implementation.
        <T as Encoding>::decode(*self, data)
    }
}
/// Lets a boxed encoding (including `Box<dyn Encoding>`) be used wherever an
/// [`Encoding`] is expected.
impl<T> Encoding for Box<T>
where
    T: Encoding + ?Sized,
{
    fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> {
        // `self` is `&Box<T>`; reborrow the boxed value and call its implementation.
        <T as Encoding>::decode(&**self, data)
    }
}
/// Unit type selecting the UTF-8 decoder (`decode_utf8`).
///
/// Carries no state; it exists so the decoder can be used through the
/// [`Encoding`] trait.
#[derive(Debug, Default, Copy, Clone)]
pub struct Utf8Encoding;
impl Utf8Encoding {
pub fn new() -> Self {
Utf8Encoding
}
pub fn decode(data: &[u8]) -> Cow<'_, str> {
decode_utf8(data)
}
}
impl Encoding for Utf8Encoding {
    /// Trait entry point; forwards to the inherent associated function
    /// `Utf8Encoding::decode`, which does not need `self`.
    fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> {
        // `Self::decode` with a single argument resolves to the inherent
        // associated function, not back to this trait method.
        Self::decode(data)
    }
}
/// Returns `data` with every trailing ASCII whitespace byte removed.
///
/// "Whitespace" follows `u8::is_ascii_whitespace` (space, tab, line feed,
/// form feed, carriage return). Leading and interior whitespace is kept.
/// Usable in `const` contexts.
const fn trim_ascii_end(data: &[u8]) -> &[u8] {
    let mut kept = data;
    loop {
        match kept {
            // Peel one whitespace byte off the end and look again.
            [init @ .., tail] if tail.is_ascii_whitespace() => kept = init,
            // Empty slice, or the last byte is significant: done.
            _ => return kept,
        }
    }
}
/// Decodes `d` as Windows-1252 text.
///
/// Trailing ASCII whitespace is trimmed first. If the remaining bytes are all
/// ASCII and contain no backslash, they are returned as a borrowed `str`.
/// Otherwise an owned string is built by `windows_1252_create`, which maps
/// each byte through the `WINDOWS_1252` table and drops backslashes.
#[inline]
pub(crate) fn decode_windows1252(d: &[u8]) -> Cow<'_, str> {
    let trimmed = trim_ascii_end(d);

    // Inspect every byte without exiting early; a single flag records whether
    // anything requires the allocating path.
    let needs_rewrite = trimmed
        .iter()
        .fold(false, |found, &byte| found | (byte == b'\\' || !byte.is_ascii()));

    if !needs_rewrite {
        debug_assert!(std::str::from_utf8(trimmed).is_ok());
        // SAFETY: the scan above established that every byte is ASCII, and
        // ASCII is always valid UTF-8.
        return Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(trimmed) });
    }

    Cow::Owned(windows_1252_create(trimmed, 0))
}
/// Builds an owned string from Windows-1252 bytes.
///
/// `d[..offset]` is copied verbatim; each byte of `d[offset..]` other than a
/// backslash is mapped to a `char` through the `WINDOWS_1252` table.
///
/// Panics if `offset > d.len()` (via `split_at`).
#[inline(never)]
fn windows_1252_create(d: &[u8], offset: usize) -> String {
    let (head, tail) = d.split_at(offset);

    // Initial guess only: two output bytes per remaining input byte. The
    // string still grows on demand if a mapped character needs more.
    let mut decoded = String::with_capacity(offset + tail.len() * 2);

    // SAFETY: sound only if `head` is valid UTF-8. The one call site visible in
    // this file passes `offset == 0`, making `head` empty; any other caller
    // must guarantee the prefix is valid UTF-8 (e.g. pure ASCII).
    decoded.push_str(unsafe { std::str::from_utf8_unchecked(head) });

    decoded.extend(
        tail.iter()
            .filter(|&&byte| byte != b'\\')
            .map(|&byte| WINDOWS_1252[usize::from(byte)]),
    );
    decoded
}
/// Decodes `d` as UTF-8 text.
///
/// Trailing ASCII whitespace is trimmed first. The remaining bytes are then
/// scanned for a backslash:
///
/// * if one is found, an owned string is built by `utf8_create`, which drops
///   the backslash bytes from the scan position onwards;
/// * otherwise, pure-ASCII input is returned as a borrowed `str`;
/// * otherwise the bytes go through `String::from_utf8_lossy`, which borrows
///   when they are valid UTF-8 and replaces invalid sequences with U+FFFD when
///   they are not.
#[inline]
pub(crate) fn decode_utf8(d: &[u8]) -> Cow<'_, str> {
    // Trim once; `d` is never modified below, so no second trim is required.
    let d = trim_ascii_end(d);

    let mut chunk_iter = d.chunks_exact(8);
    // Number of leading bytes already known to be free of backslashes.
    let mut offset = 0;
    let mut is_ascii = true;

    // Examine eight bytes per iteration.
    // NOTE(review): assumes `le_u64` packs the 8-byte chunk into a `u64`, and
    // that `contains_zero_byte(wide ^ repeat_byte(b))` is true when any of the
    // packed bytes equals `b` - confirm against `crate::util`.
    for n in &mut chunk_iter {
        let wide = le_u64(n);
        // Under that assumption, a set high bit in any byte marks non-ASCII.
        is_ascii &= wide & 0x8080_8080_8080_8080 == 0;
        if contains_zero_byte(wide ^ repeat_byte(b'\\')) {
            // `offset` points at the start of this chunk; everything before it
            // is copied verbatim by `utf8_create`.
            return Cow::Owned(utf8_create(d, offset));
        }
        offset += 8;
    }

    // Fewer than eight bytes are left; check them one at a time.
    for &byte in chunk_iter.remainder() {
        is_ascii &= byte.is_ascii();
        if byte == b'\\' {
            return Cow::Owned(utf8_create(d, offset));
        }
        offset += 1;
    }

    if is_ascii {
        debug_assert!(std::str::from_utf8(d).is_ok());
        // SAFETY: `is_ascii` is still true only if every chunk and every
        // remainder byte passed the ASCII check, and ASCII is valid UTF-8.
        let s = unsafe { std::str::from_utf8_unchecked(d) };
        Cow::Borrowed(s)
    } else {
        String::from_utf8_lossy(d)
    }
}
/// Builds an owned string from UTF-8 bytes with backslashes removed.
///
/// `d[..offset]` is copied verbatim; every backslash byte in `d[offset..]` is
/// dropped. If the resulting bytes are not valid UTF-8, invalid sequences are
/// replaced with U+FFFD instead of failing.
///
/// Panics if `offset > d.len()` (via `split_at`).
fn utf8_create(d: &[u8], offset: usize) -> String {
    let (upto, rest) = d.split_at(offset);

    // Removing bytes can only shrink the data, so the input length is already
    // an upper bound for the buffer; no need to reserve twice the tail.
    let mut result = Vec::with_capacity(d.len());
    result.extend_from_slice(upto);
    result.extend(rest.iter().copied().filter(|&byte| byte != b'\\'));

    String::from_utf8(result)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned())
}
// Unit tests for the Windows-1252 and UTF-8 decoders defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// Windows-1252: trailing ASCII whitespace is trimmed, `\` escape bytes are
// removed, and a non-ASCII byte (0xF8) is mapped to its character.
#[test]
fn scalar_string_trim_end_newlines() {
assert_eq!(Windows1252Encoding::decode(b""), "");
assert_eq!(Windows1252Encoding::decode(b"new\n"), "new");
assert_eq!(Windows1252Encoding::decode(b"\t"), "");
assert_eq!(
Windows1252Encoding::decode(b"new \\\"Captain\\\" \n"),
r#"new "Captain""#
);
assert_eq!(Windows1252Encoding::decode(b"new\xF8 "), "newø")
}
// UTF-8: same trimming and escape removal; multi-byte characters survive.
#[test]
fn scalar_string_trim_end_newlines_utf8() {
assert_eq!(Utf8Encoding::decode(b""), "");
assert_eq!(Utf8Encoding::decode(b"new\n"), "new");
assert_eq!(Utf8Encoding::decode(b"\t"), "");
assert_eq!(
Utf8Encoding::decode(b"new \\\"Captain\\\" \n"),
r#"new "Captain""#
);
assert_eq!(Utf8Encoding::decode("newø ".as_bytes()), "newø")
}
// Windows-1252: backslashes in the middle of the input are dropped.
#[test]
fn scalar_string_escapes() {
let data = br#"Joe \"Captain\" Rogers"#;
assert_eq!(Windows1252Encoding::decode(data), r#"Joe "Captain" Rogers"#);
}
// UTF-8: the second input is 14 bytes long with its backslash at index 11,
// i.e. past the first 8-byte chunk, so the byte-wise remainder scan finds it.
#[test]
fn scalar_utf8_string_escapes() {
let data = br#"Joe \"Captain\" Rogers"#;
assert_eq!(Utf8Encoding::decode(data), r#"Joe "Captain" Rogers"#);
let data = br#"Joe Captain\"s"#;
assert_eq!(Utf8Encoding::decode(data), r#"Joe Captain"s"#);
}
// UTF-8: an invalid byte (0xFF) is replaced with U+FFFD rather than rejected.
#[test]
fn scalar_invalid_utf8_replace() {
let data = b"Joe\xffcheeze";
assert_eq!(Utf8Encoding::decode(data), "Joe�cheeze");
}
// Windows-1252: for bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D the output must
// match what `encoding_rs` produces for the same input.
#[test]
fn scalar_to_string_undefined_characters() {
let data = &[0x81, 0x8d, 0x8f, 0x90, 0x9d];
let (cow, _) = encoding_rs::WINDOWS_1252.decode_without_bom_handling(data);
assert_eq!(Windows1252Encoding::decode(data), cow);
}
}