use regex::Regex;
/// Built-in filler patterns, applied when the caller supplies no custom words.
///
/// Each entry is a regex fragment; `remove_filler_words` prefixes it with `(?i)` so
/// matching is case-insensitive. Conventions used by the entries:
///
/// - Every word or phrase is closed with `\b` so that only whole words match. Without
///   it, the case-insensitive `I mean` pattern would eat the start of "I meant" and
///   `you know` would eat the start of "you knowledge".
/// - `,?\s*` also removes an optional comma and the whitespace after the filler.
/// - `like` is only treated as a filler when directly followed by a comma, so ordinary
///   uses such as "I like cats" are left alone.
const DEFAULT_FILLER_PATTERNS: &[&str] = &[
    r"\bum\b,?\s*",
    r"\buh\b,?\s*",
    r"\blike,\s*",
    r"\byou know\b,?\s*",
    r"\bbasically\b,?\s*",
    r"\bactually\b,?\s*",
    r"\bI mean\b,?\s*",
    r"\bsort of\b",
    r"\bkind of\b",
];
/// Removes filler words and stuttered (immediately repeated) words from `text`.
///
/// * When `custom_words` is empty, every pattern in `DEFAULT_FILLER_PATTERNS` is removed.
/// * Otherwise only the custom words are removed and the built-in patterns are skipped.
///   Each custom word is matched literally (regex metacharacters are escaped),
///   case-insensitively, together with an optional trailing comma and any whitespace
///   that follows it. Blank entries are ignored.
///
/// The filtered text is then passed through `remove_stutters`, so the returned string
/// has its words separated by single spaces with no leading or trailing whitespace.
///
/// A pattern that fails to compile is skipped rather than reported, keeping the
/// function best-effort.
pub fn remove_filler_words(text: &str, custom_words: &[String]) -> String {
    let mut result = text.to_string();
    if custom_words.is_empty() {
        for pattern_str in DEFAULT_FILLER_PATTERNS {
            result = strip_pattern(result, &format!("(?i){pattern_str}"));
        }
    } else {
        for word in custom_words {
            // A blank entry would produce `\b,?\s*`, which matches at every word
            // boundary and would delete the spaces and commas between words.
            if word.trim().is_empty() {
                continue;
            }
            // Close the match with `\b` so that e.g. "so" does not eat the start of
            // "some". The boundary is only added when the entry ends in a word
            // character; after punctuation `\b` would demand a following word
            // character and stop the entry from matching in normal text.
            let ends_with_word_char = matches!(
                word.chars().next_back(),
                Some(c) if c.is_alphanumeric() || c == '_'
            );
            let closing_boundary = if ends_with_word_char { r"\b" } else { "" };
            let pattern_str = format!(
                r"(?i)\b{}{closing_boundary},?\s*",
                regex::escape(word)
            );
            result = strip_pattern(result, &pattern_str);
        }
    }
    // `remove_stutters` splits on whitespace and re-joins with single spaces, so its
    // output needs no further space collapsing or trimming.
    remove_stutters(&result)
}

/// Deletes every match of `pattern` from `text`; returns `text` unchanged when the
/// pattern does not compile.
fn strip_pattern(text: String, pattern: &str) -> String {
    match Regex::new(pattern) {
        // `into_owned` reuses the replacement buffer instead of copying it again.
        Ok(re) => re.replace_all(&text, "").into_owned(),
        Err(_) => text,
    }
}
/// Collapses runs of the same word repeated back-to-back ("I I I went" -> "I went").
///
/// The text is split on whitespace, so the result always has its words joined by
/// single spaces with no leading or trailing whitespace; empty or whitespace-only
/// input yields an empty string. The first occurrence of each run is the one kept.
/// Words are compared case-insensitively, including non-ASCII letters, so "Я я" is
/// treated as a repeat just like "The the".
fn remove_stutters(text: &str) -> String {
    let mut words: Vec<&str> = text.split_whitespace().collect();
    // `dedup_by` hands the closure (current, previously kept) and drops `current`
    // when the closure returns true, which keeps the first word of every run.
    words.dedup_by(|current, kept| is_repeated_word(*kept, *current));
    words.join(" ")
}

/// Returns true when `current` is the same word as `previous`, ignoring letter case.
fn is_repeated_word(previous: &str, current: &str) -> bool {
    if previous.is_ascii() && current.is_ascii() {
        // Allocation-free path for the common all-ASCII case.
        previous.eq_ignore_ascii_case(current)
    } else {
        previous.to_lowercase() == current.to_lowercase()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Cleans `input` using the built-in filler list (no custom words).
    fn clean(input: &str) -> String {
        remove_filler_words(input, &[])
    }

    /// Cleans `input` using `words` as the custom filler list.
    fn clean_with(input: &str, words: &[&str]) -> String {
        let custom: Vec<String> = words.iter().map(|w| w.to_string()).collect();
        remove_filler_words(input, &custom)
    }

    // --- built-in fillers -------------------------------------------------

    #[test]
    fn removes_um() {
        let cleaned = clean("um I went to the store");
        assert_eq!(cleaned, "I went to the store");
    }

    #[test]
    fn removes_uh() {
        let cleaned = clean("I uh went home");
        assert_eq!(cleaned, "I went home");
    }

    #[test]
    fn removes_like_filler() {
        let cleaned = clean("it was like, really cool");
        assert_eq!(cleaned, "it was really cool");
    }

    #[test]
    fn preserves_like_as_verb() {
        // "like" without a trailing comma is not treated as a filler.
        let cleaned = clean("I like cats");
        assert_eq!(cleaned, "I like cats");
    }

    #[test]
    fn removes_you_know() {
        let cleaned = clean("it was, you know, pretty good");
        assert_eq!(cleaned, "it was, pretty good");
    }

    #[test]
    fn removes_basically() {
        let cleaned = clean("basically it works");
        assert_eq!(cleaned, "it works");
    }

    #[test]
    fn removes_actually() {
        let cleaned = clean("actually I think so");
        assert_eq!(cleaned, "I think so");
    }

    #[test]
    fn removes_i_mean() {
        let cleaned = clean("I mean it was fine");
        assert_eq!(cleaned, "it was fine");
    }

    #[test]
    fn removes_sort_of() {
        let cleaned = clean("it was sort of okay");
        assert_eq!(cleaned, "it was okay");
    }

    #[test]
    fn removes_kind_of() {
        let cleaned = clean("it was kind of nice");
        assert_eq!(cleaned, "it was nice");
    }

    // --- stutters ---------------------------------------------------------

    #[test]
    fn removes_stutters() {
        let cleaned = clean("I I I went to the store");
        assert_eq!(cleaned, "I went to the store");
    }

    #[test]
    fn removes_double_stutter() {
        let cleaned = clean("the the cat sat");
        assert_eq!(cleaned, "the cat sat");
    }

    // --- combinations and normalisation -----------------------------------

    #[test]
    fn removes_multiple_fillers() {
        let cleaned = clean("um uh like, you know basically it works");
        assert_eq!(cleaned, "it works");
    }

    #[test]
    fn case_insensitive() {
        let capitalised = clean("Um I went");
        assert_eq!(capitalised, "I went");

        let upper = clean("UH okay");
        assert_eq!(upper, "okay");
    }

    #[test]
    fn collapses_spaces() {
        let cleaned = clean("I um went home");
        assert_eq!(cleaned, "I went home");
    }

    #[test]
    fn empty_input() {
        let cleaned = clean("");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn no_fillers() {
        let cleaned = clean("the cat sat on the mat");
        assert_eq!(cleaned, "the cat sat on the mat");
    }

    // --- custom word lists ------------------------------------------------

    #[test]
    fn custom_words() {
        let cleaned = clean_with("well so I went home", &["well", "so"]);
        assert_eq!(cleaned, "I went home");
    }

    #[test]
    fn custom_words_ignores_defaults() {
        // With a custom list, the built-in "um" filler must be left in place.
        let cleaned = clean_with("well um I went", &["well"]);
        assert_eq!(cleaned, "um I went");
    }

    #[test]
    fn trims_result() {
        let cleaned = clean(" um hello ");
        assert_eq!(cleaned, "hello");
    }

    #[test]
    fn filler_with_comma() {
        let cleaned = clean("like, it was good");
        assert_eq!(cleaned, "it was good");
    }

    // --- non-Latin scripts ------------------------------------------------

    #[test]
    fn cyrillic_no_fillers() {
        let input = "Привет мир, как дела?";
        let cleaned = clean(input);
        assert_eq!(cleaned, input);
    }

    #[test]
    fn cyrillic_with_english_filler() {
        let cleaned = clean("um Привет мир");
        assert_eq!(cleaned, "Привет мир");
    }

    #[test]
    fn cyrillic_stutter_removal() {
        let cleaned = clean("я я пошёл домой");
        assert_eq!(cleaned, "я пошёл домой");
    }

    #[test]
    fn arabic_no_fillers() {
        let input = "مرحبا بالعالم";
        let cleaned = clean(input);
        assert_eq!(cleaned, input);
    }

    #[test]
    fn arabic_stutter_removal() {
        let cleaned = clean("هذا هذا اختبار");
        assert_eq!(cleaned, "هذا اختبار");
    }

    #[test]
    fn cjk_no_fillers() {
        let input = "你好世界";
        let cleaned = clean(input);
        assert_eq!(cleaned, input);
    }

    #[test]
    fn japanese_no_fillers() {
        let input = "こんにちは世界";
        let cleaned = clean(input);
        assert_eq!(cleaned, input);
    }

    #[test]
    fn korean_no_fillers() {
        let input = "안녕하세요 세계";
        let cleaned = clean(input);
        assert_eq!(cleaned, input);
    }

    #[test]
    fn mixed_script_with_filler() {
        let cleaned = clean("basically Привет, 世界 uh okay");
        assert_eq!(cleaned, "Привет, 世界 okay");
    }

    #[test]
    fn custom_cyrillic_filler() {
        let cleaned = clean_with("ну типа я пошёл", &["ну", "типа"]);
        assert_eq!(cleaned, "я пошёл");
    }

    #[test]
    fn emoji_in_text() {
        let input = "um 😀 that was basically 🎉 great";
        let cleaned = clean(input);
        assert_eq!(cleaned, "😀 that was 🎉 great");
    }
}