use crate::error::{Error, Result};
use std::{fmt::Display, str::FromStr};
use std::convert::TryFrom;
pub const SPACE: Char = Char { char: ' ' };
pub const UNDERSCORE: Char = Char { char: '_' };
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Char {
pub(crate) char: char,
}
impl Char {
pub fn new(ch: char) -> Result<Self> {
check_char(ch).map_or(Err(Error::DisallowedChar(ch.into())), |ok| {
Ok(Self { char: ok })
})
}
pub fn new_must(ch: char) -> Char {
Self::new(ch).unwrap()
}
pub fn new_or(ch: char, default: Char) -> Char {
check_char(ch).map_or(default, |ok| Char { char: ok })
}
}
impl Display for Char {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.char)
}
}
impl Into<char> for Char {
fn into(self) -> char {
self.char
}
}
impl Into<char> for &Char {
fn into(self) -> char {
self.char
}
}
impl Into<u32> for Char {
fn into(self) -> u32 {
self.char.into()
}
}
impl Into<u32> for &Char {
fn into(self) -> u32 {
self.char.into()
}
}
impl Into<String> for Char {
fn into(self) -> String {
format!("{}", self)
}
}
impl Into<String> for &Char {
fn into(self) -> String {
format!("{}", self)
}
}
impl FromStr for Char {
type Err = Error;
fn from_str(s: &str) -> Result<Self> {
let chars: Vec<char> = s.chars().collect();
if chars.len() == 1 {
Self::new(chars[0])
} else {
Err(Error::StrToCharConversion(chars.len()))
}
}
}
impl TryFrom<&str> for Char {
type Error = Error;
fn try_from(value: &str) -> Result<Self> {
Self::from_str(value)
}
}
impl TryFrom<String> for Char {
type Error = Error;
fn try_from(value: String) -> Result<Self> {
Self::from_str(&value)
}
}
pub fn check_char(ch: char) -> Option<char> {
let cp = ch as u32;
if ch == ' ' {
return Some(' ');
}
if cp == 0x0009 {
return Some(' ');
}
if cp == 0x180E {
return Some(' ');
}
if cp == 0x0020
|| cp == 0x00A0
|| cp == 0x1680
|| (0x2000..=0x200A).contains(&cp)
|| cp == 0x202F
|| cp == 0x205F
|| cp == 0x3000
{
return Some(' ');
}
if (0x0000..=0x001F).contains(&cp) {
return None;
}
if [0x7F, 0x81, 0x8D, 0x8F, 0x90, 0x9D, 0xA0].contains(&cp) {
return None;
}
if (0x0300..=0x036F).contains(&cp) {
return None;
}
if (0x200B..=0x200F).contains(&cp) || cp == 0xFEFF || (0xFE00..=0xFE0F).contains(&cp) {
return None;
}
if (0x202A..=0x202E).contains(&cp) || (0x2066..=0x2069).contains(&cp) {
return None;
}
if (0xD800..=0xDFFF).contains(&cp) {
return None;
}
Some(ch)
}
pub fn normalize_text(input: &str) -> String {
let mut out = String::with_capacity(input.len());
for ch in input.chars() {
if let Some(ch) = check_char(ch) {
out.push(ch);
}
}
out
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn removes_cr_and_leaves_newline() {
let s = "line1\r\nline2\nline3\r";
let got = normalize_text(s);
assert_eq!(got, "line1line2line3");
}
#[test]
fn tabs_and_spaces_normalized() {
let s = "A\tB\u{00A0}C\u{2003}D\u{205F}E\u{3000}F\u{180E}G";
let got = normalize_text(s);
assert_eq!(got, "A B C D E F G");
}
#[test]
fn removes_zero_width_and_bidi_and_combining() {
let s = format!("X\u{200B}Y\u{202A}Z\u{0301}Q");
let got = normalize_text(&s);
assert_eq!(got, "XYZQ");
}
#[test]
fn removes_control_c1_and_c0_except_newline_and_tab() {
let s = format!("A\u{0001}B\u{001b}CD\tE");
let got = normalize_text(&s);
assert_eq!(got, "ABCD E");
}
#[test]
fn preserves_other_unicode() {
let s = "Привет\u{00A0}мир — hello";
let got = normalize_text(s);
assert_eq!(got, "Привет мир — hello");
}
#[test]
fn example_from_doc() {
let s = "Hello\u{00A0}World\t!\r\nA\u{200B}B\u{0301}C";
let out = normalize_text(s);
assert_eq!(out, "Hello World !ABC");
}
}