use std::mem;
use tendril::StrTendril;
/// Collapse or strip runs of whitespace and control characters in `st`,
/// in place.
///
/// The text is scanned once. Every maximal run of characters for which
/// `replace_mask` returns a non-zero mask is removed. If the run contained
/// at least one whitespace character (mask bit `2`), a single U+0020 SPACE
/// is written in its place; runs consisting only of control or zero-width
/// characters are removed without any replacement.
///
/// * `ws` — whitespace characters are candidates for replacement.
/// * `ctrl` — control and zero-width characters are candidates for removal.
/// * `trim_start` — suppress the replacement space while no text has been
///   kept yet, i.e. for a run at the start of the input.
/// * `trim_end` — suppress the replacement space for a run that reaches the
///   end of the input.
///
/// `st` is only overwritten when at least one candidate character was found;
/// otherwise it is left untouched and no new tendril is created.
pub(crate) fn replace_chars(
    st: &mut StrTendril,
    ws: bool,
    ctrl: bool,
    trim_start: bool,
    trim_end: bool,
) {
    let cleaned = {
        let ins = st.as_ref();
        // Output buffer, created lazily on the first candidate character.
        let mut out: Option<StrTendril> = None;
        // Byte offset where the current stretch of kept text begins.
        let mut last = 0;
        // OR of the masks seen in the current run; 0 means "not in a run".
        let mut replacing = 0u8;

        for (i, ch) in ins.char_indices() {
            let rmask = replace_mask(ch, ws, ctrl);
            if rmask > 0 {
                if replacing == 0 {
                    // A run starts here: flush the kept text preceding it.
                    out.get_or_insert_with(StrTendril::new)
                        .push_slice(&ins[last..i]);
                }
                replacing |= rmask;
            } else if replacing > 0 {
                // A run just ended. The buffer always exists at this point
                // because it is created when a run starts.
                if replacing >= 2 {
                    if let Some(buf) = out.as_mut() {
                        if buf.len32() > 0 || !trim_start {
                            buf.push_char(' ');
                        }
                    }
                }
                last = i;
                replacing = 0;
            }
        }

        if let Some(buf) = out.as_mut() {
            if replacing == 0 {
                // Input ended with kept text: copy the remaining tail.
                buf.push_slice(&ins[last..]);
            } else if replacing >= 2 && !trim_end {
                // Input ended inside a run that contained whitespace.
                buf.push_char(' ');
            }
        }
        out
    };

    if let Some(cleaned) = cleaned {
        // The previous content is no longer needed; drop it explicitly
        // rather than silently ignoring the `#[must_use]` return value.
        drop(mem::replace(st, cleaned));
    }
}
fn replace_mask(c: char, ws: bool, ctrl: bool) -> u8 {
use CharClass::*;
match char_class(c) {
ZeroSpace | Control if ctrl => 1,
WhiteSpace if ws => 2,
_ => 0,
}
}
/// Coarse character classification produced by `char_class`.
///
/// The enum is fieldless, so it is `Copy`: values can be passed around and
/// compared freely without borrowing.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum CharClass {
    /// Any character that `char_class` does not list in another class.
    Unclassified,
    /// Space and line-break characters such as TAB, LF, CR, U+0020, U+00A0,
    /// U+2000–U+200A, U+2028, U+2029, U+202F, U+205F and U+3000.
    WhiteSpace,
    /// Zero-width characters: U+200B, U+200C, U+2060 and U+FEFF.
    ZeroSpace,
    /// C0 controls other than the whitespace ones, DEL, the C1 controls
    /// (U+0080–U+009F), and the non-characters U+FFFE and U+FFFF.
    Control,
}
/// Return `true` if `st` contains nothing but whitespace, zero-width and
/// control characters, i.e. no character that `char_class` reports as
/// `Unclassified`. An empty tendril yields `true`.
pub(crate) fn is_all_ctrl_ws(st: &StrTendril) -> bool {
    let has_unclassified = st
        .as_ref()
        .chars()
        .any(|c| char_class(c) == CharClass::Unclassified);
    !has_unclassified
}
fn char_class(c: char) -> CharClass {
use CharClass::*;
match c {
'\u{0000}'..='\u{0008}' => Control, '\u{0009}' | '\u{000A}' | '\u{000B}' => WhiteSpace, '\u{000C}' => Control, '\u{000D}' => WhiteSpace, '\u{000E}'..='\u{001F}' => Control, '\u{0020}' => WhiteSpace,
'\u{007F}' | '\u{0080}'..='\u{009F}' => Control, '\u{00A0}' => WhiteSpace,
'\u{2000}'..='\u{200A}' => WhiteSpace, '\u{200B}' | '\u{200C}' => ZeroSpace,
'\u{2028}' | '\u{2029}' |
'\u{202F}' |
'\u{205F}' => WhiteSpace, '\u{2060}' => ZeroSpace,
'\u{3000}' => WhiteSpace,
'\u{FEFF}' => ZeroSpace, '\u{FFFE}' | '\u{FFFF}' => Control, _ => Unclassified,
}
}
/// Unit tests for character classification and `replace_chars`.
#[cfg(test)]
mod tests {
    use super::*;
    use tendril::SliceExt;

    /// Spot-check one representative of each class, with two distinct
    /// whitespace characters.
    #[test]
    fn test_char_class() {
        use CharClass::*;
        assert_eq!(Unclassified, char_class('x'));
        assert_eq!(Control, char_class('\u{0008}'));
        assert_eq!(ZeroSpace, char_class('\u{2060}'));
        assert_eq!(WhiteSpace, char_class('\n'));
        assert_eq!(WhiteSpace, char_class(' '));
    }

    /// Whitespace and control replacement, no trimming, ASCII text.
    #[test]
    fn replace() {
        assert_clean("", "");
        assert_clean("", "\u{2060}");
        assert_clean(" ", " ");
        assert_clean(" ", "\t \r\n");
        assert_clean("x", "x");
        assert_clean(" x ", " x ");
        assert_clean(" x", " x\u{2060}");
        assert_clean("x ", "x ");
        assert_clean("aa b ", "\u{009F}a\u{009F}a b ");
        assert_clean("aa b c ", "aa b c ");
        assert_clean("aa b c", "aa \t b c");
        assert_clean(" aa b c", "\t aa \t b c");
    }

    /// Same as `replace`, with multi-byte UTF-8 text to exercise the byte
    /// offsets used for slicing.
    #[test]
    fn replace_multibyte() {
        assert_clean("Ψ", "Ψ");
        assert_clean(" Ψ ", " Ψ ");
        assert_clean(" Ψ", " Ψ\u{2060}");
        assert_clean("Ψ ", "Ψ ");
        assert_clean("αα β ", "\u{009F}α\u{009F}α β ");
        assert_clean("αα β γ ", "αα β γ ");
        assert_clean("αα β γ", "αα \t β γ");
        assert_clean(" αα β γ", "\t αα \t β γ");
    }

    /// Only control/zero-width characters are removed; whitespace is kept
    /// verbatim.
    #[test]
    fn replace_ctrl_only() {
        assert_clean_ctrl("", "");
        assert_clean_ctrl("", "\u{2060}");
        assert_clean_ctrl(" ", " ");
        assert_clean_ctrl("x", "x");
        assert_clean_ctrl(" x ", " x ");
        assert_clean_ctrl(" x", " x\u{2060}");
        assert_clean_ctrl("x ", "x ");
        assert_clean_ctrl("aaa β ", "\u{009F}a\u{009F}aa β ");
        assert_clean_ctrl("aa β c ", "aa β c ");
        assert_clean_ctrl("aa \t β c", "aa \t β c");
        assert_clean_ctrl("\t aa \t β c", "\t aa \t β c");
    }

    /// Replacement with trimming at both ends.
    #[test]
    fn replace_trim() {
        assert_clean_trim("", "");
        assert_clean_trim("", "\t \r\n");
        assert_clean_trim("", "\u{0000}");
        assert_clean_trim("", "\u{FFFE}");
        assert_clean_trim("", "\u{00A0}\u{2007}\u{202F}");
        assert_clean_trim("x", "x");
        assert_clean_trim("x", " x ");
        assert_clean_trim("x", " x");
        assert_clean_trim("x", "x ");
        assert_clean_trim("aa b", " a\u{009F}a\u{009F} b ");
        assert_clean_trim("aa b c", "aa b c ");
        assert_clean_trim("aa b c", "aa \t b c");
        assert_clean_trim("aa b c", "\t aa \t b c");
    }

    /// Trimming at the start only: an all-whitespace input still yields the
    /// single space emitted for the run that reaches the end.
    #[test]
    fn replace_trim_left() {
        assert_clean_trim_l("", "");
        assert_clean_trim_l(" ", " ");
        assert_clean_trim_l(" ", "\t \r\n");
    }

    /// Trimming at the end only: an all-whitespace input becomes empty.
    #[test]
    fn replace_trim_right() {
        assert_clean_trim_r("", "");
        assert_clean_trim_r("", " ");
        assert_clean_trim_r("", "\t \r\n");
    }

    /// Run `replace_chars` on `src` with control replacement enabled and the
    /// given remaining flags, then compare the result against `exp`.
    fn check(exp: &str, src: &str, ws: bool, trim_start: bool, trim_end: bool) {
        let mut st = src.to_tendril();
        replace_chars(&mut st, ws, true, trim_start, trim_end);
        assert_eq!(exp, st.as_ref());
    }

    /// Whitespace + control replacement, trimming both ends.
    fn assert_clean_trim(exp: &str, src: &str) {
        check(exp, src, true, true, true);
    }

    /// Whitespace + control replacement, trimming the start only.
    fn assert_clean_trim_l(exp: &str, src: &str) {
        check(exp, src, true, true, false);
    }

    /// Whitespace + control replacement, trimming the end only.
    fn assert_clean_trim_r(exp: &str, src: &str) {
        check(exp, src, true, false, true);
    }

    /// Whitespace + control replacement, no trimming.
    fn assert_clean(exp: &str, src: &str) {
        check(exp, src, true, false, false);
    }

    /// Control replacement only, no trimming.
    fn assert_clean_ctrl(exp: &str, src: &str) {
        check(exp, src, false, false, false);
    }
}