use core::fmt;
use core::str;
use crate::filters;
/// Converts a code point that the caller has already validated into a `char`.
///
/// `valid_ch32` must be below 0x110000 and outside the surrogate range
/// 0xD800..=0xDFFF. Debug builds assert this; release builds perform no check.
#[inline]
fn char_from_validated_u32(valid_ch32: u32) -> char {
    debug_assert!(valid_ch32 < 0x110000 && !matches!(valid_ch32, 0xD800..=0xDFFF));
    // SAFETY: the caller guarantees (and debug builds assert above) that the value
    // is a Unicode scalar value, which is exactly what `from_u32_unchecked` requires.
    unsafe { char::from_u32_unchecked(valid_ch32) }
}
/// Reinterprets bytes that the caller has already validated as UTF-8 as a `&str`.
///
/// Debug builds re-validate the bytes and panic if they are not UTF-8; release
/// builds perform no check.
#[inline]
fn str_from_validated_utf8(valid_utf8: &[u8]) -> &str {
    debug_assert!(str::from_utf8(valid_utf8).is_ok());
    // SAFETY: the caller guarantees (and debug builds assert above) that the bytes
    // are well-formed UTF-8, which is the only requirement of `from_utf8_unchecked`.
    unsafe { str::from_utf8_unchecked(valid_utf8) }
}
/// Writes Latin-1 encoded `bytes` to `filter`.
///
/// Consecutive ASCII bytes (<= 0x7F) are handed to `filter.write_str` as one
/// string. Every other byte is handed to `filter.write_non_ascii` as the `char`
/// with the same code point (U+0080..=U+00FF).
///
/// Returns the first error reported by the filter, otherwise `Ok(())`.
pub fn write_latin1_to<F: filters::Filter>(bytes: &[u8], filter: &mut F) -> fmt::Result {
    // Index of the first byte that has not been handed to the filter yet.
    let mut run_start = 0;
    for (index, &byte) in bytes.iter().enumerate() {
        if byte.is_ascii() {
            // Keep extending the pending ASCII run.
            continue;
        }
        if run_start != index {
            // A pure-ASCII run is always valid UTF-8.
            filter.write_str(str_from_validated_utf8(&bytes[run_start..index]))?;
        }
        filter.write_non_ascii(char::from(byte))?;
        run_start = index + 1;
    }
    if run_start < bytes.len() {
        filter.write_str(str_from_validated_utf8(&bytes[run_start..]))?;
    }
    Ok(())
}
/// Writes `bytes` to `filter`, decoding well-formed UTF-8 and falling back to
/// Latin-1 for every byte that is not part of a well-formed sequence.
///
/// A sequence counts as well-formed only if it is complete, uses valid
/// continuation bytes, is not an overlong encoding, does not encode a surrogate
/// (U+D800..=U+DFFF) and does not exceed U+10FFFF. Runs of well-formed bytes are
/// handed to `filter.write_str`. When the byte at the scan position does not start
/// a well-formed sequence, that single byte is handed to `filter.write_non_ascii`
/// as the `char` with the same code point and scanning resumes at the next byte.
///
/// Returns the first error reported by the filter, otherwise `Ok(())`.
pub fn write_utf8_with_latin1_fallback_to<F: filters::Filter>(
    bytes: &[u8],
    filter: &mut F,
) -> fmt::Result {
    // Length of the well-formed UTF-8 sequence at the start of `rest`, or `None`
    // if `rest` is empty or does not start with one.
    fn well_formed_len(rest: &[u8]) -> Option<usize> {
        let is_continuation = |byte: u8| byte & 0xC0 == 0x80;
        let payload = |byte: u8| u32::from(byte & 0x3F);
        match *rest {
            [0x00..=0x7F, ..] => Some(1),
            [lead @ 0xC0..=0xDF, b1, ..] if is_continuation(b1) => {
                let scalar = u32::from(lead & 0x1F) << 6 | payload(b1);
                // Anything below 0x80 would be an overlong encoding.
                if scalar >= 0x80 {
                    Some(2)
                } else {
                    None
                }
            }
            [lead @ 0xE0..=0xEF, b1, b2, ..] if is_continuation(b1) && is_continuation(b2) => {
                let scalar = u32::from(lead & 0x0F) << 12 | payload(b1) << 6 | payload(b2);
                // Reject overlong encodings and surrogates.
                if scalar >= 0x800 && !(0xD800..=0xDFFF).contains(&scalar) {
                    Some(3)
                } else {
                    None
                }
            }
            [lead @ 0xF0..=0xF4, b1, b2, b3, ..]
                if is_continuation(b1) && is_continuation(b2) && is_continuation(b3) =>
            {
                let scalar = u32::from(lead & 0x07) << 18
                    | payload(b1) << 12
                    | payload(b2) << 6
                    | payload(b3);
                // Reject overlong encodings and values above U+10FFFF.
                if (0x10000..=0x10FFFF).contains(&scalar) {
                    Some(4)
                } else {
                    None
                }
            }
            // Stray continuation bytes, 0xF5..=0xFF, truncated or malformed sequences.
            _ => None,
        }
    }

    // Everything before `flushed_to` has been handed to the filter;
    // `flushed_to..cursor` is well-formed UTF-8 that is still pending.
    let mut flushed_to = 0;
    let mut cursor = 0;
    while let Some(&lead) = bytes.get(cursor) {
        if let Some(advance) = well_formed_len(&bytes[cursor..]) {
            cursor += advance;
            continue;
        }
        if flushed_to < cursor {
            filter.write_str(str_from_validated_utf8(&bytes[flushed_to..cursor]))?;
        }
        // Latin-1 fallback: the byte value is used directly as the code point.
        filter.write_non_ascii(char::from(lead))?;
        cursor += 1;
        flushed_to = cursor;
    }
    if flushed_to < bytes.len() {
        filter.write_str(str_from_validated_utf8(&bytes[flushed_to..]))?;
    }
    Ok(())
}
fn write_utf16_to<const BIG_ENDIAN: bool, F: filters::Filter>(
bytes: &[u8],
filter: &mut F,
) -> fmt::Result {
let len = bytes.len();
let mut pos = 0;
while len - pos >= 2 {
let high = if BIG_ENDIAN {
u16::from_be_bytes(bytes[pos..pos + 2].try_into().unwrap())
} else {
u16::from_le_bytes(bytes[pos..pos + 2].try_into().unwrap())
};
pos += 2;
let ch;
if high <= 0x7F {
filter.write_ascii(high as u8)?;
continue;
} else if !(0xD800..=0xDFFF).contains(&high) {
ch = char_from_validated_u32(high as u32);
} else if high >= 0xDC00 || len - pos < 2 {
ch = char::REPLACEMENT_CHARACTER;
} else {
let low = if BIG_ENDIAN {
u16::from_be_bytes(bytes[pos..pos + 2].try_into().unwrap())
} else {
u16::from_le_bytes(bytes[pos..pos + 2].try_into().unwrap())
};
ch = if !(0xDC00..=0xDFFF).contains(&low) {
char::REPLACEMENT_CHARACTER } else {
pos += 2; char_from_validated_u32(
(((high as u32 - 0xD800) << 10) | (low as u32 - 0xDC00)) + 0x10000,
)
};
}
filter.write_non_ascii(ch)?;
}
return Ok(());
}
/// Decodes big-endian UTF-16 `bytes` and writes the result to `filter`.
///
/// Forwards to `write_utf16_to` with its `BIG_ENDIAN` parameter set to `true`
/// and returns that call's result unchanged.
pub fn write_utf16be_to<F: filters::Filter>(bytes: &[u8], filter: &mut F) -> fmt::Result {
    const BIG_ENDIAN: bool = true;
    write_utf16_to::<BIG_ENDIAN, F>(bytes, filter)
}
/// Decodes little-endian UTF-16 `bytes` and writes the result to `filter`.
///
/// Forwards to `write_utf16_to` with its `BIG_ENDIAN` parameter set to `false`
/// and returns that call's result unchanged.
pub fn write_utf16le_to<F: filters::Filter>(bytes: &[u8], filter: &mut F) -> fmt::Result {
    const BIG_ENDIAN: bool = false;
    write_utf16_to::<BIG_ENDIAN, F>(bytes, filter)
}
fn write_utf32_to<const BIG_ENDIAN: bool, F: filters::Filter>(
bytes: &[u8],
filter: &mut F,
) -> fmt::Result {
let len = bytes.len();
let mut pos = 0;
while len - pos >= 4 {
let ch32 = if BIG_ENDIAN {
u32::from_be_bytes(bytes[pos..pos + 4].try_into().unwrap())
} else {
u32::from_le_bytes(bytes[pos..pos + 4].try_into().unwrap())
};
pos += 4;
let ch = char::from_u32(ch32).unwrap_or(char::REPLACEMENT_CHARACTER);
filter.write_non_ascii(ch)?;
}
return Ok(());
}
/// Decodes big-endian UTF-32 `bytes` and writes the result to `filter`.
///
/// Forwards to `write_utf32_to` with its `BIG_ENDIAN` parameter set to `true`
/// and returns that call's result unchanged.
pub fn write_utf32be_to<F: filters::Filter>(bytes: &[u8], filter: &mut F) -> fmt::Result {
    const BIG_ENDIAN: bool = true;
    write_utf32_to::<BIG_ENDIAN, F>(bytes, filter)
}
/// Decodes little-endian UTF-32 `bytes` and writes the result to `filter`.
///
/// Forwards to `write_utf32_to` with its `BIG_ENDIAN` parameter set to `false`
/// and returns that call's result unchanged.
pub fn write_utf32le_to<F: filters::Filter>(bytes: &[u8], filter: &mut F) -> fmt::Result {
    const BIG_ENDIAN: bool = false;
    write_utf32_to::<BIG_ENDIAN, F>(bytes, filter)
}
#[cfg(test)]
mod tests {
    extern crate alloc;
    use alloc::string;

    use super::*;

    // Latin-1: ASCII passes through, bytes >= 0x80 map to U+0080..=U+00FF.
    #[test]
    fn test_write_latin1_to() {
        // Decodes `input` into a fresh string and compares it with `expected`.
        fn check(expected: &str, input: &[u8]) {
            let mut decoded = string::String::new();
            let mut sink = filters::WriteFilter::new(&mut decoded);
            write_latin1_to(input, &mut sink).unwrap();
            assert_eq!(expected, decoded);
        }

        let cases: [(&str, &[u8]); 8] = [
            ("", b""),
            ("A", b"A"),
            ("A\u{80}", b"A\x80"),
            ("\u{80}", b"\x80"),
            ("\u{80}A", b"\x80A"),
            ("Hello, world!", b"Hello, world!"),
            ("Hello, \u{80}world!", b"Hello, \x80world!"),
            ("Hello, \u{FF}", b"Hello, \xFF"),
        ];
        for &(expected, input) in cases.iter() {
            check(expected, input);
        }
    }

    // UTF-8 with Latin-1 fallback: valid scalar values round-trip, while encoded
    // surrogates and out-of-range values come back as one Latin-1 char per byte.
    #[test]
    fn test_write_utf8_with_latin1_fallback_to() {
        // Decodes `input` into a fresh string and compares it with `expected`.
        fn check(expected: &str, input: &[u8]) {
            let mut decoded = string::String::new();
            let mut sink = filters::WriteFilter::new(&mut decoded);
            write_utf8_with_latin1_fallback_to(input, &mut sink).unwrap();
            assert_eq!(expected, decoded);
        }

        // Encodes `i` in UTF-8 form (even when it is not a valid scalar value) and
        // checks the decoder's output for those bytes.
        fn check_u32(i: u32) {
            let mut raw = [0u8; 4];
            match char::from_u32(i) {
                Some(scalar) => {
                    let encoded = scalar.encode_utf8(&mut raw[..]);
                    let encoded_bytes = encoded.as_bytes();
                    check(encoded, encoded_bytes);
                }
                None => {
                    // Only surrogates and values above U+10FFFF get here.
                    debug_assert!(i >= 0x800);
                    debug_assert!(i <= 0x1FFFFF);
                    let used = if i <= 0xFFFF {
                        raw[0] = (0xE0 | ((i >> 12) & 0x0F)) as u8;
                        raw[1] = (0x80 | ((i >> 6) & 0x3F)) as u8;
                        raw[2] = (0x80 | (i & 0x3F)) as u8;
                        3
                    } else {
                        raw[0] = (0xF0 | ((i >> 18) & 0x07)) as u8;
                        raw[1] = (0x80 | ((i >> 12) & 0x3F)) as u8;
                        raw[2] = (0x80 | ((i >> 6) & 0x3F)) as u8;
                        raw[3] = (0x80 | (i & 0x3F)) as u8;
                        4
                    };
                    // Expected output: every byte reinterpreted as Latin-1.
                    let expected: string::String =
                        raw[..used].iter().map(|&byte| char::from(byte)).collect();
                    check(&expected, &raw[..used]);
                }
            }
        }

        for i in 0x0..0xFFFF {
            check_u32(i);
        }

        // First and last value of every 0x10000-sized block up to 0x1EFFFF.
        for block_start in (0x0..0x1F0000).step_by(0x10000) {
            check_u32(block_start);
            check_u32(block_start | 0xFFFF);
        }

        let valid_strings = [
            "",
            "Hello, world!",
            "\u{0}\u{7F}\u{80}\u{7FF}\u{800}\u{D7FF}\u{E000}\u{FFFF}\u{10000}\u{10FFFF}",
        ];
        for text in valid_strings.iter() {
            check(text, text.as_bytes());
        }
    }

    // UTF-16: every case is run as big-endian, then byte-swapped and run as little-endian.
    #[test]
    fn test_write_utf16_to() {
        fn check(expected: &str, input_big_endian: &mut [u8]) {
            let mut decoded = string::String::new();
            let mut sink = filters::WriteFilter::new(&mut decoded);
            write_utf16_to::<true, _>(input_big_endian, &mut sink).unwrap();
            assert_eq!(expected, decoded);

            // Swap the bytes of every complete code unit; a trailing odd byte stays put.
            for unit in input_big_endian.chunks_exact_mut(2) {
                unit.swap(0, 1);
            }

            decoded.clear();
            let mut sink = filters::WriteFilter::new(&mut decoded);
            write_utf16_to::<false, _>(input_big_endian, &mut sink).unwrap();
            assert_eq!(expected, decoded);
        }

        // Empty and truncated input.
        check("", &mut []);
        check("", &mut [99]);

        // Basic Multilingual Plane.
        check("0", &mut [0x00, 0x30]);
        check("0", &mut [0x00, 0x30, 99]);
        check("\u{FF}", &mut [0x00, 0xFF]);
        check("\u{100}", &mut [0x01, 0x00]);
        check("\u{D7FF}", &mut [0xD7, 0xFF]);

        // Valid surrogate pairs.
        check("\u{10000}", &mut [0xD8, 0x00, 0xDC, 0x00]);
        check("\u{103FF}", &mut [0xD8, 0x00, 0xDF, 0xFF]);
        check("\u{10FC00}", &mut [0xDB, 0xFF, 0xDC, 0x00]);
        check("\u{10FFFF}", &mut [0xDB, 0xFF, 0xDF, 0xFF]);

        // Unpaired surrogates.
        check("\u{FFFD}", &mut [0xD8, 0x00]);
        check("\u{FFFD}0", &mut [0xDB, 0xFF, 0x00, 0x30]);
        check("\u{FFFD}", &mut [0xDC, 0x00]);
        check("\u{FFFD}0", &mut [0xDF, 0xFF, 0x00, 0x30]);
    }

    // UTF-32: every case is run as big-endian, then byte-reversed per unit and run
    // as little-endian.
    #[test]
    fn test_write_utf32_to() {
        fn check(expected: &str, input_big_endian: &mut [u8]) {
            let mut decoded = string::String::new();
            let mut sink = filters::WriteFilter::new(&mut decoded);
            write_utf32_to::<true, _>(input_big_endian, &mut sink).unwrap();
            assert_eq!(expected, decoded);

            // Reverse the bytes of every complete unit; trailing bytes stay put.
            for unit in input_big_endian.chunks_exact_mut(4) {
                unit.reverse();
            }

            decoded.clear();
            let mut sink = filters::WriteFilter::new(&mut decoded);
            write_utf32_to::<false, _>(input_big_endian, &mut sink).unwrap();
            assert_eq!(expected, decoded);
        }

        // Empty and truncated input.
        check("", &mut []);
        check("", &mut [99]);
        check("", &mut [0, 99]);
        check("", &mut [0, 0, 99]);

        // One valid unit, with and without trailing bytes.
        check("0", &mut [0x00, 0x00, 0x00, 0x30]);
        check("0", &mut [0x00, 0x00, 0x00, 0x30, 99]);
        check("0", &mut [0x00, 0x00, 0x00, 0x30, 0, 99]);
        check("0", &mut [0x00, 0x00, 0x00, 0x30, 0, 0, 99]);

        // Valid scalar values.
        check("\u{FF}", &mut [0x00, 0x00, 0x00, 0xFF]);
        check("\u{100}", &mut [0x00, 0x00, 0x01, 0x00]);
        check("\u{D7FF}", &mut [0x00, 0x00, 0xD7, 0xFF]);
        check("\u{10000}", &mut [0x00, 0x01, 0x00, 0x00]);
        check("\u{10FFFF}", &mut [0x00, 0x10, 0xFF, 0xFF]);

        // Out-of-range values and surrogates.
        check("\u{FFFD}", &mut [0x00, 0x11, 0x00, 0x00]);
        check(
            "\u{FFFD}0",
            &mut [0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30],
        );
        check("\u{FFFD}", &mut [0x00, 0x00, 0xDC, 0x00]);
        check(
            "\u{FFFD}0",
            &mut [0x01, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x30],
        );
    }
}