use auto_encoder::auto_encode_bytes;
/// Classification of the byte at the current read position, as produced by
/// `get_char_metadata`.
enum Character {
/// A byte in the ASCII range (`0x00..=0x7F`); `data` is that byte.
SingleByte { data: u8 },
/// A byte `>= 0x80`; `len` is the number of bytes (2, 3 or 4) the caller
/// should consume as one unit, starting at this byte.
MultiByte { len: usize },
}
/// Whitespace de-duplication for anything that can be borrowed as a `str`.
pub trait WhitespaceSifter: AsRef<str> {
    /// Returns a new `String` built by `sift_preallocated`: leading whitespace
    /// is skipped, runs of whitespace are de-duplicated and trailing
    /// whitespace is dropped.
    #[must_use]
    fn sift(&self) -> String {
        let text = self.as_ref();
        let mut sifted = String::with_capacity(text.len());
        sift_preallocated(text.as_bytes(), &mut sifted);
        sifted
    }

    /// Like [`WhitespaceSifter::sift`], but processes the input one line at a
    /// time so that a line break is kept between non-empty lines.
    ///
    /// A single trailing line terminator (`"\r\n"` or `"\n"`) is removed from
    /// the result.
    #[must_use]
    fn sift_preserve_newlines(&self) -> String {
        let source = self.as_ref().as_bytes();
        let mut sifted = String::with_capacity(source.len());
        let mut cursor = 0_usize;
        // Each call consumes input up to and including the next line break.
        while cursor < source.len() {
            sift_preallocated_until_newline(source, &mut cursor, &mut sifted);
        }
        // Length of the output once one trailing terminator is cut off.
        let kept = sifted
            .strip_suffix("\r\n")
            .or_else(|| sifted.strip_suffix('\n'))
            .map_or(sifted.len(), str::len);
        sifted.truncate(kept);
        sifted
    }
}
/// Whitespace de-duplication for anything that can be borrowed as a byte slice.
pub trait WhitespaceSifterBytes: AsRef<[u8]> {
    /// Returns a new `String` built by `sift_preallocated` from the raw bytes:
    /// leading whitespace is skipped, runs of whitespace are de-duplicated and
    /// trailing whitespace is dropped.
    #[must_use]
    fn sift_bytes(&self) -> String {
        let raw = self.as_ref();
        let mut sifted = String::with_capacity(raw.len());
        sift_preallocated(raw, &mut sifted);
        sifted
    }

    /// Like [`WhitespaceSifterBytes::sift_bytes`], but processes the input one
    /// line at a time so that a line break is kept between non-empty lines.
    ///
    /// A single trailing line terminator (`"\r\n"` or `"\n"`) is removed from
    /// the result.
    #[must_use]
    fn sift_bytes_preserve_newlines(&self) -> String {
        let raw = self.as_ref();
        let mut sifted = String::with_capacity(raw.len());
        let mut cursor = 0_usize;
        // Each call consumes input up to and including the next line break.
        while cursor < raw.len() {
            sift_preallocated_until_newline(raw, &mut cursor, &mut sifted);
        }
        // Length of the output once one trailing terminator is cut off.
        let kept = sifted
            .strip_suffix("\r\n")
            .or_else(|| sifted.strip_suffix('\n'))
            .map_or(sifted.len(), str::len);
        sifted.truncate(kept);
        sifted
    }
}
/// Blanket implementation: every `AsRef<str>` type gets the default methods.
impl<T> WhitespaceSifter for T where T: AsRef<str> {}
/// Blanket implementation: every `AsRef<[u8]>` type gets the default methods.
impl<T> WhitespaceSifterBytes for T where T: AsRef<[u8]> {}
/// Skips leading whitespace starting at `*ind` and copies the first
/// non-whitespace unit into `out`.
///
/// On return `*ind` points just past the unit that was copied, or is
/// `>= bytes.len()` when the rest of the input was whitespace only.
///
/// Multi-byte spaces that `extend_from_bytes_with_len` reports as whitespace
/// are treated exactly like ASCII whitespace: they are skipped rather than
/// emitted as a leading `' '`.
fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
    while *ind < bytes.len() {
        match get_char_metadata(bytes[*ind]) {
            Character::SingleByte { data } => {
                *ind += 1;
                if !is_ascii_whitespace(data) {
                    out.push(data as char);
                    break;
                }
            }
            Character::MultiByte { len } => {
                if extend_from_bytes_with_len(bytes, ind, out, len) {
                    // The helper appended exactly one ' ' for a multi-byte
                    // space. We are still in the leading run, so remove it
                    // again and keep trimming.
                    let _ = out.pop();
                    continue;
                }
                break;
            }
        }
    }
}
/// Removes the final character of `out` when the caller reports that the
/// output currently ends in whitespace.
///
/// Nothing happens when `is_last_whitespace` is `false` or `out` is empty.
fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
    if !is_last_whitespace {
        return;
    }
    let _ = out.pop();
}
/// Consumes `len` bytes starting at `*ind` and appends their textual form to
/// `out`.
///
/// * The UTF-8 encodings of U+00A0 (`C2 A0`), U+202F (`E2 80 AF`) and U+2009
///   (`E2 80 89`) are appended as a single ASCII space and `true` is returned.
/// * Any other in-range sequence is passed through `auto_encode_bytes`,
///   appended, and `false` is returned.
/// * If the sequence would run past the end of `bytes`, nothing is appended
///   and `false` is returned.
///
/// In every case `*ind` is advanced by `len` (saturating).
#[inline]
fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) -> bool {
    let start = *ind;
    let end = start.saturating_add(len);
    // The cursor always moves past the sequence, even when it is truncated.
    *ind = end;

    let chunk = match bytes.get(start..end) {
        Some(chunk) => chunk,
        None => return false,
    };

    let is_unicode_space = matches!(
        chunk,
        [0xC2, 0xA0] | [0xE2, 0x80, 0xAF] | [0xE2, 0x80, 0x89]
    );
    if is_unicode_space {
        out.push(' ');
    } else {
        out.push_str(&auto_encode_bytes(chunk));
    }
    is_unicode_space
}
#[inline]
const fn is_newline(codepoint: u8) -> bool {
matches!(codepoint, LINE_FEED | CARRIAGE_RETURN)
}
/// Appends a whitespace-sifted copy of `bytes` to `out`.
///
/// Leading whitespace is skipped via `sift_trim_start`. After that, the first
/// whitespace byte of a run is copied as-is and the rest of the run is
/// skipped, with one exception: a line feed that directly follows a carriage
/// return is always emitted and resets the whitespace state. Multi-byte
/// spaces reported by `extend_from_bytes_with_len` take part in a run like
/// ASCII whitespace, so they never produce a second consecutive space. A
/// trailing whitespace character is removed at the end.
fn sift_preallocated(bytes: &[u8], out: &mut String) {
    if bytes.is_empty() {
        return;
    }
    let mut ind: usize = 0;
    sift_trim_start(bytes, &mut ind, out);
    let mut is_last_whitespace: bool = false;
    let mut is_last_carriage_return: bool = false;
    while ind < bytes.len() {
        match get_char_metadata(bytes[ind]) {
            Character::SingleByte { data } => {
                ind += 1;
                if is_ascii_whitespace(data) {
                    if data == LINE_FEED && is_last_carriage_return {
                        // NOTE(review): the whitespace flag is cleared here, so
                        // a CR LF at the very end of the input is not removed
                        // by the final trim, and whitespace after a CR LF
                        // starts a new run. Confirm this is intended.
                        out.push('\n');
                        is_last_carriage_return = false;
                        is_last_whitespace = false;
                        continue;
                    }
                    if is_last_whitespace {
                        // Skipped, but still remembered for a following LF.
                        is_last_carriage_return = data == CARRIAGE_RETURN;
                        continue;
                    }
                    is_last_whitespace = true;
                    is_last_carriage_return = data == CARRIAGE_RETURN;
                    out.push(data as char);
                } else {
                    is_last_whitespace = false;
                    is_last_carriage_return = false;
                    out.push(data as char);
                }
            }
            Character::MultiByte { len } => {
                let was_ws = extend_from_bytes_with_len(bytes, &mut ind, out, len);
                if was_ws && is_last_whitespace {
                    // The helper unconditionally appended one ' ' for the
                    // multi-byte space. We were already inside a whitespace
                    // run, so drop it to keep the run collapsed.
                    let _ = out.pop();
                }
                is_last_whitespace = was_ws;
                is_last_carriage_return = false;
            }
        }
    }
    sift_trim_end(out, is_last_whitespace);
}
/// Returns `true` for bytes strictly between `0x20` (space) and `0x7F` (DEL),
/// i.e. the range `0x21..=0x7E`.
#[inline]
const fn is_regular_ascii(b: u8) -> bool {
    matches!(b, 0x21..=0x7E)
}
/// Appends one sifted line of `bytes`, starting at `*ind`, to `out`.
///
/// Leading whitespace (including line breaks) is skipped via
/// `sift_trim_start`. Within the line, each whitespace run is emitted as a
/// single `' '`. Multi-byte spaces reported by `extend_from_bytes_with_len`
/// take part in a run like ASCII whitespace. When a line break is reached,
/// a pending trailing space is removed, `'\n'` is appended and the function
/// returns with `*ind` just past the break. If the input ends first, a
/// trailing space is removed instead.
fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
    sift_trim_start(bytes, ind, out);
    let mut is_last_whitespace = false;
    let mut is_last_carriage_return = false;
    while *ind < bytes.len() {
        let b = bytes[*ind];
        if is_regular_ascii(b) {
            // Fast path: copy a whole run of printable ASCII in one go.
            let start = *ind;
            *ind += 1;
            while *ind < bytes.len() && is_regular_ascii(bytes[*ind]) {
                *ind += 1;
            }
            // SAFETY: every byte in `bytes[start..*ind]` passed
            // `is_regular_ascii`, i.e. lies in 0x21..=0x7E, and a sequence of
            // ASCII bytes is always valid UTF-8.
            let slice = unsafe { std::str::from_utf8_unchecked(&bytes[start..*ind]) };
            out.push_str(slice);
            is_last_whitespace = false;
            is_last_carriage_return = false;
            continue;
        }
        match get_char_metadata(b) {
            Character::SingleByte { data } => {
                *ind += 1;
                if is_ascii_whitespace(data) {
                    if is_newline(data) {
                        if is_last_whitespace {
                            // Drop the space that stood for the trailing run.
                            out.pop();
                            is_last_whitespace = false;
                        }
                        // NOTE(review): `is_newline` matches CR as well as LF,
                        // so a CR already ends the line above and
                        // `is_last_carriage_return` is never `true` when read
                        // here. Confirm whether CR LF was meant to be
                        // preserved.
                        if is_last_carriage_return {
                            out.push('\r');
                        }
                        out.push('\n');
                        return;
                    }
                    is_last_carriage_return = data == CARRIAGE_RETURN;
                    if is_last_whitespace {
                        continue;
                    }
                    is_last_whitespace = true;
                    out.push(' ');
                } else {
                    is_last_whitespace = false;
                    is_last_carriage_return = false;
                    out.push(data as char);
                }
            }
            Character::MultiByte { len } => {
                let was_ws = extend_from_bytes_with_len(bytes, ind, out, len);
                if was_ws && is_last_whitespace {
                    // The helper unconditionally appended one ' ' for the
                    // multi-byte space. We were already inside a whitespace
                    // run, so drop it to keep the run collapsed.
                    let _ = out.pop();
                }
                is_last_whitespace = was_ws;
                is_last_carriage_return = false;
            }
        }
    }
    sift_trim_end(out, is_last_whitespace);
}
/// Classifies a byte by value: `< 0x80` is a single byte, `0x80..=0xDF`
/// starts a 2-byte unit, `0xE0..=0xEF` a 3-byte unit and `0xF0..=0xFF` a
/// 4-byte unit.
#[inline]
const fn get_char_metadata(first_byte: u8) -> Character {
    if first_byte < 0x80 {
        Character::SingleByte { data: first_byte }
    } else if first_byte < 0xE0 {
        Character::MultiByte { len: 2 }
    } else if first_byte < 0xF0 {
        Character::MultiByte { len: 3 }
    } else {
        Character::MultiByte { len: 4 }
    }
}
/// ASCII space (`0x20`).
const SPACE: u8 = b' ';
/// ASCII horizontal tab (`0x09`).
const HORIZONTAL_TAB: u8 = b'\t';
/// ASCII line feed (`0x0A`).
const LINE_FEED: u8 = b'\n';
/// ASCII form feed (`0x0C`).
const FORM_FEED: u8 = b'\x0C';
/// ASCII carriage return (`0x0D`).
const CARRIAGE_RETURN: u8 = b'\r';
#[inline]
const fn is_ascii_whitespace(codepoint: u8) -> bool {
matches!(
codepoint,
SPACE | HORIZONTAL_TAB | LINE_FEED | FORM_FEED | CARRIAGE_RETURN
)
}