use crate::tables::lookup_utf8;
/// Rewrites `input`, substituting non-ASCII code points that have a table entry.
///
/// The input is walked left to right:
///
/// * bytes below `0x80` are copied unchanged;
/// * otherwise `decode_one` is asked to decode a multi-byte sequence starting
///   at the current position. If it yields a code point and `lookup_utf8`
///   returns a replacement for it, the replacement bytes are emitted;
///   if there is no replacement, the original encoded bytes are copied as-is;
/// * if `decode_one` yields no code point, the single offending byte is copied
///   unchanged and scanning resumes at the next byte.
///
/// Returns the rewritten bytes as a new `Vec<u8>`; `input` is not modified.
pub fn apply(input: &[u8]) -> Vec<u8> {
    let mut result = Vec::with_capacity(input.len());
    let mut pos = 0;

    while let Some(&lead) = input.get(pos) {
        // Fast path: ASCII never needs decoding or a table lookup.
        if lead < 0x80 {
            result.push(lead);
            pos += 1;
            continue;
        }

        match decode_one(&input[pos..]) {
            (Some(code_point), width) => {
                match lookup_utf8(code_point) {
                    Some(replacement) => result.extend_from_slice(replacement),
                    // No mapping: keep the sequence exactly as it was encoded.
                    None => result.extend_from_slice(&input[pos..pos + width]),
                }
                pos += width;
            }
            (None, _) => {
                // Undecodable byte: pass it through and advance by one so the
                // following bytes get their own chance to decode.
                result.push(lead);
                pos += 1;
            }
        }
    }

    result
}
/// Decodes the single UTF-8 encoded code point at the start of `bytes`.
///
/// Returns `(Some(code_point), len)` where `len` is the number of bytes the
/// encoding occupies (1 to 4), or `(None, 0)` when `bytes` does not start with
/// a well-formed UTF-8 sequence. The following are all rejected:
///
/// * empty input;
/// * a lead byte that is a continuation byte (`0b10xx_xxxx`) or `0xF8..=0xFF`;
/// * a sequence truncated by the end of `bytes`;
/// * a trailing byte that is not a continuation byte;
/// * an overlong encoding (a code point spelled with more bytes than needed,
///   e.g. `C0 AF` for U+002F);
/// * a UTF-16 surrogate (U+D800..=U+DFFF);
/// * a value above U+10FFFF.
fn decode_one(bytes: &[u8]) -> (Option<u32>, usize) {
    const INVALID: (Option<u32>, usize) = (None, 0);

    let (&lead, rest) = match bytes.split_first() {
        Some(parts) => parts,
        None => return INVALID,
    };

    if lead < 0x80 {
        return (Some(u32::from(lead)), 1);
    }

    // For each sequence length: total byte count, the smallest code point that
    // legitimately needs that many bytes, and the payload bits of the lead byte.
    let (expected_len, min_cp, mut cp): (usize, u32, u32) = if lead & 0b1110_0000 == 0b1100_0000 {
        (2, 0x80, u32::from(lead & 0b0001_1111))
    } else if lead & 0b1111_0000 == 0b1110_0000 {
        (3, 0x800, u32::from(lead & 0b0000_1111))
    } else if lead & 0b1111_1000 == 0b1111_0000 {
        (4, 0x1_0000, u32::from(lead & 0b0000_0111))
    } else {
        return INVALID;
    };

    // `get` yields `None` when the sequence is cut short by the end of input.
    let continuation = match rest.get(..expected_len - 1) {
        Some(tail) => tail,
        None => return INVALID,
    };

    for &cont in continuation {
        if cont & 0b1100_0000 != 0b1000_0000 {
            return INVALID;
        }
        cp = (cp << 6) | u32::from(cont & 0b0011_1111);
    }

    // Bit-pattern checks alone admit values that are not valid UTF-8: overlong
    // forms (below the minimum for this length), surrogates, and anything past
    // the end of the Unicode code space.
    let is_overlong = cp < min_cp;
    let is_surrogate = (0xD800..=0xDFFF).contains(&cp);
    let is_out_of_range = cp > 0x10_FFFF;
    if is_overlong || is_surrogate || is_out_of_range {
        return INVALID;
    }

    (Some(cp), expected_len)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Input made only of ASCII bytes is returned unchanged.
    #[test]
    fn ascii_passes_through() {
        let name: &[u8] = b"hello.txt";
        let rewritten = apply(name);
        assert_eq!(rewritten, name);
    }

    /// Expects the lookup table to map U+00E9 (e with acute) to a plain `e`.
    #[test]
    fn utf8_e_acute_to_e() {
        let rewritten = apply("café".as_bytes());
        assert_eq!(rewritten, b"cafe");
    }

    /// Expects a code point with no table entry to keep its original encoding.
    #[test]
    fn unmapped_codepoint_passes_through_bytes() {
        let encoded = "中".as_bytes();
        let rewritten = apply(encoded);
        assert_eq!(rewritten, encoded);
    }

    /// A byte that cannot start a UTF-8 sequence is copied through untouched,
    /// and the ASCII bytes around it are unaffected.
    #[test]
    fn invalid_utf8_byte_passes_through() {
        let raw = [b'a', 0xFF, b'b'];
        let rewritten = apply(&raw);
        assert_eq!(rewritten, raw.to_vec());
    }
}