/// Produces the canonical comparison form of `input`.
///
/// The pipeline, in order:
/// 1. drop every character for which [`is_zero_width`] returns `true`;
/// 2. map each remaining character through `fold_homoglyph`;
/// 3. lowercase ASCII uppercase letters (other characters are left as they are);
/// 4. hand the result to `collapse_runs`, whose output is returned.
pub fn canonicalize(input: &str) -> String {
    let mut folded = String::with_capacity(input.len());
    folded.extend(
        input
            .chars()
            .filter(|&c| !is_zero_width(c))
            // `to_ascii_lowercase` only touches 'A'..='Z'; everything else passes through.
            .map(|c| fold_homoglyph(c).to_ascii_lowercase()),
    );
    collapse_runs(&folded)
}
/// Returns `true` when `ch` is one of the invisible code points this module
/// treats as "zero width".
///
/// The set is a fixed list of individual code points and small ranges; any
/// character not listed below yields `false`.
pub fn is_zero_width(ch: char) -> bool {
    match ch {
        // U+034F COMBINING GRAPHEME JOINER
        '\u{034F}' => true,
        // U+061C ARABIC LETTER MARK
        '\u{061C}' => true,
        // U+180E MONGOLIAN VOWEL SEPARATOR
        '\u{180E}' => true,
        // U+200B..U+200F: zero width space / non-joiner / joiner, LRM, RLM
        '\u{200B}'..='\u{200F}' => true,
        // U+202A..U+202E: bidirectional embedding and override controls
        '\u{202A}'..='\u{202E}' => true,
        // U+2060 WORD JOINER and U+2061..U+2064 invisible operators
        '\u{2060}'..='\u{2064}' => true,
        // U+FEFF ZERO WIDTH NO-BREAK SPACE (byte order mark)
        '\u{FEFF}' => true,
        _ => false,
    }
}
/// Maps a visually confusable character to the ASCII character it imitates.
///
/// Three families are handled:
/// * selected Cyrillic letters (U+04xx) that look like Latin letters,
/// * selected Greek letters (U+03xx) that look like Latin letters,
/// * the fullwidth forms U+FF01..=U+FF5E, shifted down by `0xFEE0` onto
///   U+0021..=U+007E.
///
/// Case is preserved (an uppercase look-alike maps to an uppercase ASCII
/// letter). Any other character is returned unchanged.
///
/// The look-alikes are spelled as `\u{…}` escapes so that they cannot be
/// mistaken for the ASCII letters they resemble when reading this source.
fn fold_homoglyph(ch: char) -> char {
    match ch {
        // Each arm lists: Cyrillic form, then Greek form (where one is mapped).
        '\u{0410}' | '\u{0391}' => 'A',
        '\u{0430}' | '\u{03B1}' => 'a',
        '\u{0412}' | '\u{0392}' => 'B',
        '\u{0421}' => 'C',
        '\u{0441}' => 'c',
        '\u{0415}' | '\u{0395}' => 'E',
        '\u{0435}' | '\u{03B5}' => 'e',
        '\u{041D}' | '\u{0397}' => 'H',
        '\u{0406}' | '\u{0399}' => 'I',
        '\u{0456}' | '\u{03B9}' => 'i',
        '\u{041A}' | '\u{039A}' => 'K',
        '\u{041C}' | '\u{039C}' => 'M',
        '\u{039D}' => 'N',
        '\u{041E}' | '\u{039F}' => 'O',
        '\u{043E}' | '\u{03BF}' => 'o',
        '\u{0420}' | '\u{03A1}' => 'P',
        '\u{0440}' => 'p',
        '\u{0422}' | '\u{03A4}' => 'T',
        '\u{0425}' | '\u{03A7}' => 'X',
        '\u{0445}' => 'x',
        '\u{0423}' | '\u{03A5}' => 'Y',
        '\u{0443}' => 'y',
        // Fullwidth ASCII variants sit exactly 0xFEE0 above their ASCII twins.
        '\u{FF01}'..='\u{FF5E}' => {
            let shifted = u32::from(ch) - 0xFEE0;
            // The shifted value is always in U+0021..=U+007E; the fallback
            // just keeps the function total without panicking.
            char::from_u32(shifted).unwrap_or(ch)
        }
        other => other,
    }
}
/// Collapses every run of "break" characters into a single ASCII space and
/// strips breaks from both ends.
///
/// A break is any character that is whitespace (`char::is_whitespace`) or for
/// which `is_separator_punct` returns `true`. Non-break characters are copied
/// through unchanged.
fn collapse_runs(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut prev_was_break = false;
    for ch in input.chars() {
        if ch.is_whitespace() || is_separator_punct(ch) {
            // Emit one space per run, and none at all before the first
            // non-break character (that is the leading trim).
            if !prev_was_break && !out.is_empty() {
                out.push(' ');
            }
            prev_was_break = true;
        } else {
            out.push(ch);
            prev_was_break = false;
        }
    }
    // Drop the separator left behind by a trailing run. Truncating in place
    // avoids copying the whole buffer into a second `String`.
    let kept = out.trim_end().len();
    out.truncate(kept);
    out
}
/// Returns `true` when `ch` is one of the ASCII punctuation characters that
/// this module treats as a word separator:
/// `-` `_` `~` `=` `*` `+` `.` `,` `;` `|` and the backslash.
fn is_separator_punct(ch: char) -> bool {
    const SEPARATORS: [char; 11] = ['-', '_', '~', '=', '*', '+', '.', ',', ';', '|', '\\'];
    SEPARATORS.contains(&ch)
}
/// Returns a prefix of `input` that is at most `max_bytes` bytes long and ends
/// on a UTF-8 character boundary, together with a flag telling whether any
/// truncation happened.
///
/// * If `input` already fits, it is returned whole with `false`.
/// * Otherwise the cut point is the largest boundary `<= max_bytes` (possibly
///   `0`, giving an empty prefix) and the flag is `true`.
pub fn truncate_at_char_boundary(input: &str, max_bytes: usize) -> (&str, bool) {
    if max_bytes >= input.len() {
        return (input, false);
    }
    // Walk backwards from the byte budget to the nearest boundary. Offset 0 is
    // always a boundary, so the search cannot come up empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| input.is_char_boundary(idx))
        .unwrap_or(0);
    (&input[..cut], true)
}
/// Fraction of the non-whitespace characters in `s` that are not alphanumeric.
///
/// Whitespace (`char::is_whitespace`) is ignored entirely. Every other
/// character counts towards the denominator, and those failing
/// `char::is_alphanumeric` count towards the numerator. Returns `0.0` when `s`
/// has no non-whitespace characters.
pub fn punctuation_ratio(s: &str) -> f32 {
    let (non_alnum, considered) = s
        .chars()
        .filter(|c| !c.is_whitespace())
        .fold((0usize, 0usize), |(hits, seen), c| {
            (hits + usize::from(!c.is_alphanumeric()), seen + 1)
        });
    match considered {
        0 => 0.0,
        _ => non_alnum as f32 / considered as f32,
    }
}
/// Reports whether `s` contains at least `min_run` consecutive "symbol"
/// characters, where a symbol is any character that is neither alphanumeric
/// nor whitespace.
///
/// A `min_run` of `0` is trivially satisfied and yields `true` for any input,
/// including the empty string.
pub fn long_run_of_symbols(s: &str, min_run: usize) -> bool {
    if min_run == 0 {
        return true;
    }
    let mut streak = 0usize;
    // `any` stops at the first character that completes a long enough streak.
    s.chars().any(|c| {
        streak = if c.is_alphanumeric() || c.is_whitespace() {
            0
        } else {
            streak + 1
        };
        streak >= min_run
    })
}
/// Shannon entropy, in bits per symbol, of the ASCII non-whitespace bytes of
/// `s`.
///
/// Bytes `>= 128` (i.e. every byte of a multi-byte UTF-8 sequence) and ASCII
/// whitespace bytes are skipped. Returns `0.0` when nothing is left to count.
/// The per-byte and total counters are `u32` and saturate instead of wrapping.
/// The sum is accumulated in `f64` and narrowed to `f32` on return.
pub fn shannon_entropy_ascii_nonws(s: &str) -> f32 {
    let mut histogram = [0u32; 128];
    let mut samples = 0u32;
    s.bytes()
        .filter(|b| b.is_ascii() && !b.is_ascii_whitespace())
        .for_each(|b| {
            let slot = &mut histogram[usize::from(b)];
            *slot = slot.saturating_add(1);
            samples = samples.saturating_add(1);
        });
    if samples == 0 {
        return 0.0;
    }
    let denom = f64::from(samples);
    // H = -Σ p·log2(p), accumulated in byte-value order over observed bytes.
    let bits = histogram
        .iter()
        .filter(|&&n| n != 0)
        .fold(0.0f64, |acc, &n| {
            let p = f64::from(n) / denom;
            acc - p * p.log2()
        });
    bits as f32
}
/// Counts the characters of `s` for which [`is_zero_width`] returns `true`.
pub fn zero_width_count(s: &str) -> usize {
    s.chars()
        .fold(0, |hits, c| if is_zero_width(c) { hits + 1 } else { hits })
}
/// Ratio of distinct character `n`-grams ("shingles") to total `n`-grams in
/// `s`.
///
/// `n` is clamped to `1..=16`. Shingles are taken over `char`s, not bytes.
/// The result lies in `(0.0, 1.0]`: `1.0` means every shingle is distinct,
/// values near `0.0` mean the text is highly repetitive. When `s` has fewer
/// than `n` characters (including the empty string) there are no shingles and
/// `1.0` is returned.
pub fn shingle_uniqueness(s: &str, n: usize) -> f32 {
    let n = n.clamp(1, 16);
    let chars: Vec<char> = s.chars().collect();
    if chars.len() < n {
        return 1.0;
    }
    // `chars.len() >= n >= 1` here, so there is always at least one window.
    let total = chars.len() - n + 1;
    // Hash the borrowed windows directly; no per-shingle `String` is needed
    // just to count distinct values.
    let distinct: std::collections::HashSet<&[char]> = chars.windows(n).collect();
    (distinct.len() as f32) / (total as f32)
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn canonicalize_lowercases_ascii() {
        assert_eq!(canonicalize("IGNORE ALL"), "ignore all");
    }

    #[test]
    fn canonicalize_strips_zero_width() {
        let sneaky = "ig\u{200B}no\u{200C}re all";
        assert_eq!(canonicalize(sneaky), "ignore all");
    }

    #[test]
    fn canonicalize_folds_homoglyphs() {
        // U+0440 is the Cyrillic letter that looks like Latin 'p'.
        let disguised = "igno\u{0440}e";
        assert_eq!(canonicalize(disguised), "ignope");
        assert_eq!(canonicalize("IGNORE"), "ignore");
        // Fullwidth "IGNORE", written with escapes so the input cannot be
        // silently normalized to plain ASCII by an editor or tool.
        assert_eq!(
            canonicalize("\u{FF29}\u{FF27}\u{FF2E}\u{FF2F}\u{FF32}\u{FF25}"),
            "ignore"
        );
        // Greek capital alpha, capital nu, small omicron.
        assert_eq!(canonicalize("\u{0391}\u{039D}\u{03BF}"), "ano");
    }

    #[test]
    fn canonicalize_collapses_separators() {
        assert_eq!(
            canonicalize("ignore---all___previous"),
            "ignore all previous"
        );
        // Leading and trailing separator runs are dropped entirely.
        assert_eq!(canonicalize("--ignore  all.."), "ignore all");
    }

    #[test]
    fn truncate_respects_utf8_boundary() {
        // Byte offset 2 falls inside the two-byte second character, so the
        // cut must back up to offset 1.
        let input = "héllo";
        let (out, truncated) = truncate_at_char_boundary(input, 2);
        assert!(truncated);
        assert_eq!(out, "h");
    }

    #[test]
    fn truncate_short_input_unchanged() {
        let (out, truncated) = truncate_at_char_boundary("hi", 100);
        assert!(!truncated);
        assert_eq!(out, "hi");
    }

    #[test]
    fn punctuation_ratio_basic() {
        assert_eq!(punctuation_ratio(""), 0.0);
        assert_eq!(punctuation_ratio(" \n\t"), 0.0);
        assert_eq!(punctuation_ratio("abc123"), 0.0);
        assert_eq!(punctuation_ratio("!!!@@@"), 1.0);
        assert!((punctuation_ratio("ab;c;!") - 0.5).abs() < 1e-6);
    }

    #[test]
    fn long_run_of_symbols_detects_runs() {
        assert!(!long_run_of_symbols("hello world", 12));
        assert!(long_run_of_symbols("hello !!!!!!!!!!!! world", 12));
        assert!(!long_run_of_symbols("hello !!! world", 12));
        assert!(long_run_of_symbols("", 0));
    }

    #[test]
    fn shannon_entropy_ascii_nonws_bounds() {
        assert!(shannon_entropy_ascii_nonws("aaaaaa") < 1e-6);
        let e = shannon_entropy_ascii_nonws("abababab");
        assert!((e - 1.0).abs() < 0.1);
        assert_eq!(shannon_entropy_ascii_nonws(""), 0.0);
    }

    #[test]
    fn zero_width_count_matches_inserts() {
        let s = "a\u{200B}b\u{200C}c\u{FEFF}d";
        assert_eq!(zero_width_count(s), 3);
        assert_eq!(zero_width_count("plain"), 0);
    }

    #[test]
    fn shingle_uniqueness_detects_repetition() {
        let u = shingle_uniqueness("abcdefg", 3);
        assert!((u - 1.0).abs() < 1e-6);
        let r = shingle_uniqueness("aaaaaaaaa", 3);
        assert!(r < 0.2, "expected low uniqueness, got {r}");
        assert_eq!(shingle_uniqueness("ab", 3), 1.0);
    }
}