use unicode_normalization::char::is_combining_mark;
use unicode_normalization::UnicodeNormalization;
/// Default combining-mark run length above which text is treated as zalgo.
/// NOTE(review): presumably passed as `threshold` to `is_zalgo` — confirm against callers.
pub(crate) const DEFAULT_THRESHOLD: usize = 3;
/// Default number of combining marks kept per run when stripping.
/// NOTE(review): presumably passed as `max_marks` to `strip_zalgo` — confirm against callers.
pub(crate) const DEFAULT_MAX_MARKS: usize = 2;
/// Returns the length of the longest uninterrupted run of combining marks in `text`.
///
/// The text is scanned in canonical decomposed form (NFD), so marks that are
/// folded into precomposed characters are counted as well. Any character that
/// is not a combining mark ends the current run.
fn max_combining_run(text: &str) -> usize {
    // Accumulator is (longest run seen so far, length of the run in progress).
    let (longest, _) = text.nfd().fold((0usize, 0usize), |(longest, streak), c| {
        let streak = if is_combining_mark(c) { streak + 1 } else { 0 };
        (longest.max(streak), streak)
    });
    longest
}
/// Reports whether `text` contains zalgo-style stacking, i.e. a run of more
/// than `threshold` consecutive combining marks.
///
/// A run of exactly `threshold` marks is still accepted as clean. Pure-ASCII
/// input is answered with `false` without scanning for marks.
pub(crate) fn is_zalgo(text: &str, threshold: usize) -> bool {
    // ASCII has no combining marks, so only non-ASCII text needs the scan.
    !text.is_ascii() && max_combining_run(text) > threshold
}
/// Returns a copy of `text` in which every run of combining marks is cut down
/// to at most `max_marks` marks.
///
/// Convenience wrapper that allocates a fresh `String` and delegates to
/// `strip_zalgo_into`.
pub(crate) fn strip_zalgo(text: &str, max_marks: usize) -> String {
    let mut cleaned = String::new();
    strip_zalgo_into(text, max_marks, &mut cleaned);
    cleaned
}
/// Writes `text` into `out` with every run of combining marks cut down to at
/// most `max_marks` marks.
///
/// `out` is cleared first, so none of its previous contents survive; its
/// existing allocation is reused for the result.
///
/// Pure-ASCII input is copied verbatim. Any other input is decomposed (NFD),
/// marks beyond the first `max_marks` after each non-mark character are
/// dropped, and what remains is recomposed (NFC). Non-ASCII output is
/// therefore always in NFC, even when the input was not.
pub(crate) fn strip_zalgo_into(text: &str, max_marks: usize, out: &mut String) {
    out.clear();
    if text.is_ascii() {
        out.push_str(text);
        return;
    }
    // Capacity hint only; the string still grows if normalization needs more.
    out.reserve(text.len());
    // Number of combining marks seen since the last non-mark character.
    let mut run_len: usize = 0;
    // Filter the decomposed stream lazily and feed it straight into NFC, so no
    // intermediate `String` is allocated on each call.
    let kept = text.nfd().filter(|&ch| {
        if is_combining_mark(ch) {
            run_len += 1;
            run_len <= max_marks
        } else {
            run_len = 0;
            true
        }
    });
    out.extend(kept.nfc());
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `base` followed by `count` copies of U+0300 COMBINING GRAVE ACCENT.
    fn stacked(base: char, count: usize) -> String {
        let mut text = base.to_string();
        text.extend(std::iter::repeat('\u{0300}').take(count));
        text
    }

    #[test]
    fn test_is_zalgo_clean_text() {
        assert!(!is_zalgo("hello world", 3));
        assert!(!is_zalgo("café résumé", 3));
        assert!(!is_zalgo("", 3));
    }

    #[test]
    fn test_is_zalgo_ascii_fast_path() {
        assert!(!is_zalgo("just ascii text 12345!@#$%", 3));
    }

    #[test]
    fn test_is_zalgo_vietnamese() {
        // Vietnamese legitimately stacks two marks on one base letter.
        assert!(!is_zalgo("Việt Nam", 3));
        assert!(!is_zalgo("ệ", 2));
    }

    #[test]
    fn test_is_zalgo_detects_stacking() {
        assert!(is_zalgo(&stacked('a', 10), 3));
    }

    #[test]
    fn test_is_zalgo_threshold_boundary() {
        // Exactly `threshold` marks is still clean; one more is zalgo.
        assert!(!is_zalgo(&stacked('a', 3), 3));
        assert!(is_zalgo(&stacked('a', 4), 3));
    }

    #[test]
    fn test_strip_zalgo_clean_text_unchanged() {
        assert_eq!(strip_zalgo("hello world", 2), "hello world");
        assert_eq!(strip_zalgo("café", 2), "café");
    }

    #[test]
    fn test_strip_zalgo_preserves_legitimate_diacritics() {
        let input = "Việt Nam";
        assert_eq!(strip_zalgo(input, 2), input);
        assert_eq!(strip_zalgo("résumé", 2), "résumé");
    }

    #[test]
    fn test_strip_zalgo_removes_excess() {
        let result = strip_zalgo(&stacked('a', 10), 2);
        // Two marks survive: the first composes with 'a' into U+00E0, the
        // second has nothing to compose with and stays a separate code point.
        assert_eq!(result, "\u{e0}\u{0300}");
    }

    #[test]
    fn test_strip_zalgo_max_marks_zero_strips_all() {
        assert_eq!(strip_zalgo("café", 0), "cafe");
        assert_eq!(strip_zalgo("résumé", 0), "resume");
    }

    #[test]
    fn test_strip_zalgo_ascii_fast_path() {
        let input = "just ascii";
        assert_eq!(strip_zalgo(input, 2), input);
    }

    #[test]
    fn test_strip_zalgo_multiple_base_chars() {
        let mut zalgo = String::new();
        for base in ['H', 'i'] {
            zalgo.push(base);
            for _ in 0..8 {
                zalgo.push('\u{0300}');
                zalgo.push('\u{0301}');
                zalgo.push('\u{0302}');
            }
        }
        let result = strip_zalgo(&zalgo, 2);
        // Re-decompose the output and check no run exceeds the cap.
        let mut mark_count = 0;
        for ch in result.nfd() {
            if is_combining_mark(ch) {
                mark_count += 1;
                assert!(mark_count <= 2, "Too many combining marks in output");
            } else {
                mark_count = 0;
            }
        }
    }

    #[test]
    fn test_strip_zalgo_into_clears_stale_output() {
        let mut buf = String::from("stale contents");
        // ASCII fast path must replace, not append to, the old contents.
        strip_zalgo_into("plain", 2, &mut buf);
        assert_eq!(buf, "plain");
        // Same for the normalizing path.
        strip_zalgo_into(&stacked('a', 10), 2, &mut buf);
        assert_eq!(buf, "\u{e0}\u{0300}");
    }

    #[test]
    fn test_strip_zalgo_into_matches_strip_zalgo() {
        let mut buf = String::new();
        for input in ["", "hello", "caf\u{e9}", "Vi\u{1ec7}t Nam"] {
            strip_zalgo_into(input, 2, &mut buf);
            assert_eq!(buf, strip_zalgo(input, 2));
        }
        let noisy = stacked('q', 6);
        strip_zalgo_into(&noisy, 1, &mut buf);
        assert_eq!(buf, strip_zalgo(&noisy, 1));
    }

    #[test]
    fn test_max_combining_run() {
        assert_eq!(max_combining_run("hello"), 0);
        assert_eq!(max_combining_run("café"), 1);
        assert_eq!(max_combining_run(""), 0);
        assert_eq!(max_combining_run(&stacked('a', 5)), 5);
    }
}