use regex::Regex;
/// Built-in filler patterns, used by `remove_filler_words` when the caller
/// supplies no custom words. They are compiled with a `(?i)` prefix, so the
/// matching is case-insensitive.
///
/// Every word or phrase is closed by `\b` so it cannot match the beginning of
/// a longer word (for example "I mean" inside "I meant"). Most patterns also
/// consume an optional trailing comma and any whitespace that follows.
/// `like` is only removed when it is directly followed by a comma, which
/// leaves ordinary uses such as "I like cats" untouched.
const DEFAULT_FILLER_PATTERNS: &[&str] = &[
    r"\bum\b,?\s*",
    r"\buh\b,?\s*",
    r"\blike,\s*",
    r"\byou know\b,?\s*",
    r"\bbasically\b,?\s*",
    r"\bactually\b,?\s*",
    r"\bI mean\b,?\s*",
    r"\bsort of\b",
    r"\bkind of\b",
];
/// Removes filler words and stuttered (immediately repeated) words from `text`.
///
/// * When `custom_words` is empty, the built-in `DEFAULT_FILLER_PATTERNS` are
///   applied.
/// * Otherwise only the custom words are removed and the defaults are ignored.
///   Each custom word is matched literally and case-insensitively, as a whole
///   word, together with an optional trailing comma and following whitespace.
///   Custom words are trimmed, and blank entries are skipped.
///
/// After filler removal, repeated words are collapsed via `remove_stutters`,
/// runs of spaces are squeezed to one, and the result is trimmed.
pub fn remove_filler_words(text: &str, custom_words: &[String]) -> String {
    // Deletes every match of `pattern` from `text`. A pattern that fails to
    // compile is skipped (best effort), leaving the text unchanged.
    fn strip_matches(text: String, pattern: &str) -> String {
        match Regex::new(pattern) {
            Ok(re) => re.replace_all(&text, "").into_owned(),
            Err(_) => text,
        }
    }

    let mut result = text.to_string();

    if custom_words.is_empty() {
        for pattern in DEFAULT_FILLER_PATTERNS {
            result = strip_matches(result, &format!("(?i){pattern}"));
        }
    } else {
        // A blank word would compile to `\b,?\s*`, which matches at every word
        // boundary and would strip the whitespace between all words.
        let words = custom_words
            .iter()
            .map(|word| word.trim())
            .filter(|word| !word.is_empty());

        for word in words {
            // Close the match with `\b` so that e.g. "so" does not eat the
            // start of "some". A `\b` is only meaningful after a word
            // character, so it is omitted when the word ends in punctuation.
            let closing_boundary = if word.ends_with(|c: char| c.is_alphanumeric() || c == '_') {
                r"\b"
            } else {
                ""
            };
            let pattern = format!(
                r"(?i)\b{}{closing_boundary},?\s*",
                regex::escape(word)
            );
            result = strip_matches(result, &pattern);
        }
    }

    result = remove_stutters(&result);

    // Defensive clean-up: squeeze any remaining runs of spaces and trim the ends.
    if let Ok(re) = Regex::new(r" {2,}") {
        result = re.replace_all(&result, " ").into_owned();
    }
    result.trim().to_string()
}
/// Collapses runs of the same word repeated back-to-back into a single
/// occurrence, keeping the first one.
///
/// The text is split on whitespace, words are compared ignoring ASCII case,
/// and the surviving words are joined with single spaces. Input that is empty
/// or only whitespace yields an empty string.
fn remove_stutters(text: &str) -> String {
    let mut tokens: Vec<&str> = text.split_whitespace().collect();
    // `dedup_by` compares each token with the last one that was kept, so a run
    // such as "I I I" shrinks to its first element.
    tokens.dedup_by(|candidate, kept| candidate.eq_ignore_ascii_case(*kept));
    tokens.join(" ")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Cleans `input` with no custom words, i.e. using the built-in patterns.
    fn clean(input: &str) -> String {
        remove_filler_words(input, &[])
    }

    // --- individual default fillers ---

    #[test]
    fn removes_um() {
        let cleaned = clean("um I went to the store");
        assert_eq!(cleaned, "I went to the store");
    }

    #[test]
    fn removes_uh() {
        let cleaned = clean("I uh went home");
        assert_eq!(cleaned, "I went home");
    }

    #[test]
    fn removes_like_filler() {
        let cleaned = clean("it was like, really cool");
        assert_eq!(cleaned, "it was really cool");
    }

    #[test]
    fn preserves_like_as_verb() {
        let cleaned = clean("I like cats");
        assert_eq!(cleaned, "I like cats");
    }

    #[test]
    fn removes_you_know() {
        let cleaned = clean("it was, you know, pretty good");
        assert_eq!(cleaned, "it was, pretty good");
    }

    #[test]
    fn removes_basically() {
        let cleaned = clean("basically it works");
        assert_eq!(cleaned, "it works");
    }

    #[test]
    fn removes_actually() {
        let cleaned = clean("actually I think so");
        assert_eq!(cleaned, "I think so");
    }

    #[test]
    fn removes_i_mean() {
        let cleaned = clean("I mean it was fine");
        assert_eq!(cleaned, "it was fine");
    }

    #[test]
    fn removes_sort_of() {
        let cleaned = clean("it was sort of okay");
        assert_eq!(cleaned, "it was okay");
    }

    #[test]
    fn removes_kind_of() {
        let cleaned = clean("it was kind of nice");
        assert_eq!(cleaned, "it was nice");
    }

    // --- stutters ---

    #[test]
    fn removes_stutters() {
        let cleaned = clean("I I I went to the store");
        assert_eq!(cleaned, "I went to the store");
    }

    #[test]
    fn removes_double_stutter() {
        let cleaned = clean("the the cat sat");
        assert_eq!(cleaned, "the cat sat");
    }

    // --- combinations, casing and whitespace ---

    #[test]
    fn removes_multiple_fillers() {
        let cleaned = clean("um uh like, you know basically it works");
        assert_eq!(cleaned, "it works");
    }

    #[test]
    fn case_insensitive() {
        let cases = [("Um I went", "I went"), ("UH okay", "okay")];
        for (input, expected) in cases {
            assert_eq!(clean(input), expected);
        }
    }

    #[test]
    fn collapses_spaces() {
        let cleaned = clean("I um went home");
        assert_eq!(cleaned, "I went home");
    }

    #[test]
    fn empty_input() {
        let cleaned = clean("");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn no_fillers() {
        let cleaned = clean("the cat sat on the mat");
        assert_eq!(cleaned, "the cat sat on the mat");
    }

    #[test]
    fn trims_result() {
        let cleaned = clean(" um hello ");
        assert_eq!(cleaned, "hello");
    }

    #[test]
    fn filler_with_comma() {
        let cleaned = clean("like, it was good");
        assert_eq!(cleaned, "it was good");
    }

    // --- custom word lists ---

    #[test]
    fn custom_words() {
        let custom = vec!["well".to_string(), "so".to_string()];
        let cleaned = remove_filler_words("well so I went home", &custom);
        assert_eq!(cleaned, "I went home");
    }

    #[test]
    fn custom_words_ignores_defaults() {
        let custom = vec!["well".to_string()];
        let cleaned = remove_filler_words("well um I went", &custom);
        assert_eq!(cleaned, "um I went");
    }
}