use std::borrow::Cow;
/// U+2066 LEFT-TO-RIGHT ISOLATE, emitted in front of every wrapped token.
pub const LRI: char = '\u{2066}';

/// U+2069 POP DIRECTIONAL ISOLATE, emitted after every wrapped token.
pub const PDI: char = '\u{2069}';
/// Switches that exempt certain kinds of tokens from being wrapped in
/// directional isolates.
#[derive(Clone, Debug)]
pub struct FixConfig {
    /// When `true`, tokens recognised as URLs are left unwrapped.
    pub exclude_urls: bool,
    /// When `true`, tokens recognised as e-mail addresses are left unwrapped.
    pub exclude_emails: bool,
    /// When `true`, tokens made up solely of ASCII digits are left unwrapped.
    pub exclude_pure_numbers: bool,
}

impl Default for FixConfig {
    /// Builds the default configuration, in which every exclusion is enabled.
    fn default() -> Self {
        FixConfig {
            exclude_pure_numbers: true,
            exclude_emails: true,
            exclude_urls: true,
        }
    }
}
pub fn decorate_rtl_plain_text(input: &str, cfg: &FixConfig) -> String {
if input.is_empty() {
return String::new();
}
let mut out = String::with_capacity(input.len() + 16);
let mut isolate_depth: i32 = 0;
let bytes = input.as_bytes();
let mut i: usize = 0;
while i < bytes.len() {
let (ch, ch_len) = next_char(input, i);
if ch == LRI || ch == '\u{2067}' || ch == '\u{2068}' {
isolate_depth += 1;
out.push(ch);
i += ch_len;
continue;
}
if ch == PDI {
if isolate_depth > 0 {
isolate_depth -= 1;
}
out.push(ch);
i += ch_len;
continue;
}
if isolate_depth > 0 {
out.push(ch);
i += ch_len;
continue;
}
if is_token_start(ch) {
let start = i;
let mut j = i + ch_len;
while j < bytes.len() {
let (c2, c2_len) = next_char(input, j);
if is_token_continue(c2) {
j += c2_len;
} else {
break;
}
}
let token = &input[start..j];
let (core, trailing) = split_trailing_punct(token);
if should_wrap(core, cfg) {
out.push(LRI);
out.push_str(core);
out.push(PDI);
} else {
out.push_str(core);
}
out.push_str(trailing);
i = j;
continue;
}
out.push(ch);
i += ch_len;
}
out
}
pub fn strip_isolates(input: &str) -> Cow<'_, str> {
if input.is_empty() {
return Cow::Borrowed(input);
}
let mut changed = false;
for ch in input.chars() {
if is_isolate_control(ch) {
changed = true;
break;
}
}
if !changed {
return Cow::Borrowed(input);
}
let mut out = String::with_capacity(input.len());
for ch in input.chars() {
if !is_isolate_control(ch) {
out.push(ch);
}
}
Cow::Owned(out)
}
/// Renders the invisible directional controls in `input` as visible
/// `⟦NAME⟧` tags, leaving every other character unchanged.
///
/// The characters replaced are U+2066 (`LRI`), U+2067, U+2068, U+2069 (`PDI`),
/// U+200E and U+200F.
pub fn visualize_invisibles(input: &str) -> String {
    /// Visible tag for a directional control, or `None` for any other char.
    fn tag_for(ch: char) -> Option<&'static str> {
        match ch {
            LRI => Some("⟦LRI⟧"),
            '\u{2067}' => Some("⟦RLI⟧"),
            '\u{2068}' => Some("⟦FSI⟧"),
            PDI => Some("⟦PDI⟧"),
            '\u{200E}' => Some("⟦LRM⟧"),
            '\u{200F}' => Some("⟦RLM⟧"),
            _ => None,
        }
    }

    if input.is_empty() {
        return String::new();
    }

    let mut rendered = String::with_capacity(input.len() + 32);
    for ch in input.chars() {
        if let Some(tag) = tag_for(ch) {
            rendered.push_str(tag);
        } else {
            rendered.push(ch);
        }
    }
    rendered
}
/// Reports whether `ch` is one of the four isolate controls U+2066..=U+2069.
#[inline]
fn is_isolate_control(ch: char) -> bool {
    ('\u{2066}'..='\u{2069}').contains(&ch)
}
/// Decodes the character that starts at byte offset `byte_idx` of `s` and
/// returns it together with its UTF-8 length in bytes.
///
/// # Panics
///
/// Panics if `byte_idx` is not on a character boundary of `s`, or if no
/// character starts there (i.e. `byte_idx >= s.len()`).
#[inline]
fn next_char(s: &str, byte_idx: usize) -> (char, usize) {
    let tail = &s[byte_idx..];
    let first = tail.chars().next().unwrap();
    let width = first.len_utf8();
    (first, width)
}
/// Reports whether `ch` may begin a token: an ASCII letter or ASCII digit.
#[inline]
fn is_token_start(ch: char) -> bool {
    matches!(ch, '0'..='9' | 'A'..='Z' | 'a'..='z')
}
/// Reports whether `ch` may appear after the first character of a token:
/// an ASCII letter or digit, or one of the punctuation marks
/// `# @ . / \ : _ - + = % & ? ~`.
#[inline]
fn is_token_continue(ch: char) -> bool {
    // Punctuation that is allowed inside a token.
    const INNER_PUNCT: &str = "#@./\\:_-+=%&?~";
    ch.is_ascii_alphanumeric() || INNER_PUNCT.contains(ch)
}
/// Splits `token` into `(core, trailing)`, where `trailing` is the longest
/// suffix consisting only of `.`, `,`, `;`, `!` and `?`, and `core` is
/// everything before it. Either part may be empty.
fn split_trailing_punct(token: &str) -> (&str, &str) {
    let core = token.trim_end_matches(|c: char| matches!(c, '.' | ',' | ';' | '!' | '?'));
    // `core` is a prefix of `token`, so its length is a valid split point.
    token.split_at(core.len())
}
/// Decides whether the token `core` should be wrapped in an isolate pair.
///
/// Returns `false` for an empty token and for any token matching an exclusion
/// that is enabled in `cfg` (all-ASCII-digit tokens, URLs per `looks_like_url`,
/// e-mail addresses per `looks_like_email`). Otherwise returns `true` exactly
/// when the token contains at least one ASCII letter or one of the characters
/// `# . / \ : _ - + =`.
fn should_wrap(core: &str, cfg: &FixConfig) -> bool {
    if core.is_empty() {
        return false;
    }

    // Exclusions are evaluated lazily, in the same order as the config fields
    // are consulted: pure numbers, then URLs, then e-mail addresses.
    let excluded = (cfg.exclude_pure_numbers && core.chars().all(|c| c.is_ascii_digit()))
        || (cfg.exclude_urls && looks_like_url(core))
        || (cfg.exclude_emails && looks_like_email(core));
    if excluded {
        return false;
    }

    core.chars().any(|c| {
        c.is_ascii_alphabetic() || matches!(c, '#' | '.' | '/' | '\\' | ':' | '_' | '-' | '+' | '=')
    })
}
/// Heuristically reports whether the token `s` looks like a URL.
///
/// A token is considered a URL when it either
/// * contains the scheme separator `://` anywhere, or
/// * starts with `www.` (compared ASCII case-insensitively).
///
/// Explicit checks for the `http://`, `https://` and `ftp://` prefixes are not
/// needed: any such token already contains `://`. Dropping them avoids
/// allocating a lowercased copy of every token that is not a URL.
fn looks_like_url(s: &str) -> bool {
    if s.contains("://") {
        return true;
    }
    // `get(..4)` yields `None` for tokens shorter than "www.", so short input
    // is rejected without any risk of an out-of-bounds slice.
    s.as_bytes()
        .get(..4)
        .map_or(false, |prefix| prefix.eq_ignore_ascii_case(b"www."))
}
/// Heuristically reports whether the token `s` looks like an e-mail address.
///
/// The token must contain exactly one `@` with a non-empty part on each side.
/// The domain part must contain a `.` whose last occurrence is neither the
/// first nor the last character of the domain. Every character of the local
/// part must satisfy `is_email_local_char`, and every character of the domain
/// must satisfy `is_email_domain_char`.
fn looks_like_email(s: &str) -> bool {
    // Split at the first '@'; no '@' at all means this is not an address.
    let (local, domain) = match s.split_once('@') {
        Some(parts) => parts,
        None => return false,
    };

    if local.is_empty() || domain.is_empty() || domain.contains('@') {
        return false;
    }

    // The last dot must have something on both sides of it.
    let last_dot_ok = match domain.rfind('.') {
        Some(dot) => dot > 0 && dot + 1 < domain.len(),
        None => false,
    };

    last_dot_ok
        && local.chars().all(is_email_local_char)
        && domain.chars().all(is_email_domain_char)
}
/// Reports whether `c` is accepted in the local part (before `@`) of an
/// e-mail address: an ASCII letter or digit, or one of `. _ % + -`.
#[inline]
fn is_email_local_char(c: char) -> bool {
    match c {
        '0'..='9' | 'A'..='Z' | 'a'..='z' => true,
        '.' | '_' | '%' | '+' | '-' => true,
        _ => false,
    }
}
/// Reports whether `c` is accepted in the domain part (after `@`) of an
/// e-mail address: an ASCII letter or digit, `.` or `-`.
#[inline]
fn is_email_domain_char(c: char) -> bool {
    match c {
        '0'..='9' | 'A'..='Z' | 'a'..='z' => true,
        '.' | '-' => true,
        _ => false,
    }
}