use std::collections::HashSet;
use std::sync::LazyLock;
/// Output flavor requested by the caller.
#[derive(Clone, Copy, PartialEq)]
enum Mode {
    // "Clean" transcript: retraced material removed, [: ...] replacements applied.
    Clean,
    // "Audible" transcript: keep everything that was actually spoken.
    Audible,
}
/// Intermediate token produced by `tokenize` and resolved by `process`.
enum Segment {
    // A plain word (also used for unrecognized "[...]" kept verbatim).
    Word(String),
    // Words of an already-processed "< ... >" group, treated as one unit.
    AngleGroup(Vec<String>),
    // Annotation that contributes nothing to the output.
    Drop,
    // "[: x y]" in clean mode: supplants the previous word/group.
    Replace(Vec<String>),
    // "[:: x]" (or "[: x]" in audible mode): previous item stays as spoken.
    KeepOriginal,
    // Retracing/repetition marker in clean mode: deletes the previous item.
    Retracing,
    // "[x n]" in audible mode: previous item appears n times total.
    Expand(usize),
}
/// Item tracked while resolving segments; words and angle groups are kept
/// distinct so `Replace`/`Retracing` can target a whole group as one unit.
#[derive(Clone)]
enum OutputItem {
    Word(String),
    Group(Vec<String>),
}
// Prefixes marking a token as transcription code rather than a spoken word;
// such tokens are removed by `filter_words`.
static ESCAPE_PREFIXES: &[&str] = &[
    "[?", "[/", "[<", "[>", "[:", "[!", "[*", "+\"", "+,", "<&", "&",
];
static ESCAPE_SUFFIXES: &[&str] = &["\u{21ab}xxx"];
// Whole tokens that are transcription codes (pause dots, overlap/arrow marks,
// unintelligible markers such as "xxx"/"yyy"/"www"); dropped from clean output.
static ESCAPE_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "0",
        "++",
        "+<",
        "+^",
        "(.)",
        "(..)",
        "(...)",
        ":",
        ";",
        ";;",
        "<",
        ">",
        "xx",
        "yy",
        "xxx",
        "yyy",
        "www",
        "www:",
        "xxx:",
        "xxx;",
        "xxx;;",
        "xxx\u{2192}",
        "xxx\u{2191}",
        "xxx@si",
        "yyy:",
        "\u{2192}",
    ]
    .into_iter()
    .collect()
});
static KEEP_PREFIXES: &[&str] = &["+\"/", "+,/", "+\"."];
/// Move the accumulated word (if any) out of `buf` and append it to
/// `segments` as a `Segment::Word`, leaving `buf` empty.
fn flush_word(buf: &mut String, segments: &mut Vec<Segment>) {
    if !buf.is_empty() {
        segments.push(Segment::Word(std::mem::take(buf)));
    }
}
/// Decide whether the `'<'` at `pos` opens an angle group.
///
/// An angle group only starts at a word boundary (`word_buf` empty) and only
/// when a closing `'>'` exists somewhere after `pos`; otherwise the `'<'` is
/// treated as ordinary word text.
fn should_start_angle_group(chars: &[char], pos: usize, word_buf: &str) -> bool {
    if !word_buf.is_empty() {
        return false;
    }
    // Checked slicing: the original `chars[pos + 1..]` panics when
    // `pos >= chars.len()`; `get` makes out-of-range positions simply `false`.
    chars.get(pos + 1..).is_some_and(|rest| rest.contains(&'>'))
}
/// Classify the contents of a `[...]` annotation into a `Segment`.
///
/// Fixes vs. the previous version (behavior unchanged): `content.trim()` is
/// computed once instead of twice, the dead `let _ = rest` binding for the
/// `":: "` case is replaced by a plain `starts_with`, and the ten-arm
/// `starts_with` chain is collapsed into a prefix table.
fn classify_bracket(content: &str, mode: Mode) -> Segment {
    let trimmed = content.trim();
    match trimmed {
        // Retracing / repetition / false-start / exclusion markers: in clean
        // mode they delete the previous word/group; in audible mode the spoken
        // material stays and the marker itself is dropped.
        "/" | "//" | "///" | "/?" | "/-" | "e" => {
            return match mode {
                Mode::Clean => Segment::Retracing,
                Mode::Audible => Segment::Drop,
            };
        }
        // Best-guess / emphasis / error markers: always dropped.
        "?" | "!" | "!!" | "^c" | "*" => return Segment::Drop,
        _ => {}
    }
    // Overlap markers such as [<], [>], [<1], [>2]: dropped.
    if let Some(rest) = trimmed.strip_prefix('<').or_else(|| trimmed.strip_prefix('>')) {
        if rest.is_empty() || rest.chars().all(|c| c.is_ascii_digit()) {
            return Segment::Drop;
        }
    }
    // "[:: word]" — explicit "keep the original word" correction.
    if content.starts_with(":: ") {
        return Segment::KeepOriginal;
    }
    // "[: words]" — in clean mode the replacement supplants the previous
    // word/group; in audible mode what was actually said is kept.
    if let Some(rest) = content.strip_prefix(": ") {
        return match mode {
            Mode::Clean => {
                Segment::Replace(rest.split_whitespace().map(String::from).collect())
            }
            Mode::Audible => Segment::KeepOriginal,
        };
    }
    // Commentary/annotation prefixes that never contribute words.
    const DROP_PREFIXES: &[&str] = &[
        "= ", "=? ", "=! ", "+ ", "* ", "% ", "- ", "^ ", "# ", "%act: ",
    ];
    if DROP_PREFIXES.iter().any(|p| content.starts_with(p)) {
        return Segment::Drop;
    }
    // "[x N]" repetition count: dropped when cleaning, expanded when audible.
    // A malformed count is treated as a plain droppable annotation.
    if let Some(rest) = content.strip_prefix("x ") {
        return match rest.trim().parse::<usize>() {
            Ok(n) => match mode {
                Mode::Clean => Segment::Drop,
                Mode::Audible => Segment::Expand(n),
            },
            Err(_) => Segment::Drop,
        };
    }
    // Unrecognized bracket: keep verbatim so downstream filters can see it.
    Segment::Word(format!("[{content}]"))
}
/// Return `true` when `content` is a timed-pause duration of the form
/// `[digits:]digits[.digits*]`, e.g. "1", "1.5", "2:30", "2:30.5".
fn is_timed_pause(content: &str) -> bool {
    // Split off a leading run of ASCII digits, returning (count, remainder).
    fn take_digits(s: &str) -> (usize, &str) {
        let n = s.bytes().take_while(u8::is_ascii_digit).count();
        (n, &s[n..])
    }
    if content.is_empty() {
        return false;
    }
    // Optional "minutes:" part — a ':' is only valid after at least one digit.
    let (lead, after_lead) = take_digits(content);
    let rest = match after_lead.strip_prefix(':') {
        Some(r) if lead > 0 => r,
        Some(_) => return false,
        // No colon: the leading digits are the seconds, so re-scan from the start.
        None => content,
    };
    // Mandatory seconds digits.
    let (secs, after_secs) = take_digits(rest);
    if secs == 0 {
        return false;
    }
    // Optional fractional part: '.' followed by zero or more digits.
    match after_secs.strip_prefix('.') {
        None => after_secs.is_empty(),
        Some(frac) => take_digits(frac).1.is_empty(),
    }
}
/// Split a raw utterance into `Segment`s according to `mode`.
///
/// Recognizes transcript markup: `\x15...\x15` timestamp spans, `[...]`
/// annotations (classified by `classify_bracket`), `< ... >` angle groups
/// (recursively tokenized and processed), `( ... )` timed pauses, stray
/// typographic quote/bracket characters, commas, and whitespace-separated
/// words.
fn tokenize(input: &str, mode: Mode) -> Vec<Segment> {
    let chars: Vec<char> = input.chars().collect();
    let len = chars.len();
    let mut segments: Vec<Segment> = Vec::new();
    let mut i = 0;
    // Characters of the word currently being accumulated.
    let mut word_buf = String::new();
    while i < len {
        match chars[i] {
            // NAK-delimited media timestamp (e.g. "\x15123_456\x15"): skip whole span.
            '\x15' => {
                flush_word(&mut word_buf, &mut segments);
                i += 1;
                while i < len && chars[i] != '\x15' {
                    i += 1;
                }
                if i < len {
                    i += 1; // consume the closing delimiter
                }
            }
            // Bracketed annotation: collect up to ']' and classify.
            '[' => {
                flush_word(&mut word_buf, &mut segments);
                i += 1;
                let mut content = String::new();
                while i < len && chars[i] != ']' {
                    content.push(chars[i]);
                    i += 1;
                }
                if i < len {
                    i += 1; // consume ']'
                }
                segments.push(classify_bracket(&content, mode));
            }
            // Angle group: only when at a word boundary and a matching '>'
            // exists later. The balanced contents are recursively tokenized
            // and processed, then emitted as a single AngleGroup.
            '<' if should_start_angle_group(&chars, i, &word_buf) => {
                flush_word(&mut word_buf, &mut segments);
                i += 1;
                let mut content = String::new();
                let mut depth: usize = 1;
                while i < len && depth > 0 {
                    match chars[i] {
                        // Bracket annotations are copied through verbatim so
                        // any '<'/'>' inside them does not affect `depth`.
                        '[' => {
                            content.push('[');
                            i += 1;
                            while i < len && chars[i] != ']' {
                                content.push(chars[i]);
                                i += 1;
                            }
                            if i < len {
                                content.push(']');
                                i += 1;
                            }
                        }
                        '<' => {
                            depth += 1;
                            content.push('<');
                            i += 1;
                        }
                        '>' => {
                            depth -= 1;
                            // The outermost '>' is the group terminator and is
                            // not part of the content.
                            if depth > 0 {
                                content.push('>');
                            }
                            i += 1;
                        }
                        ch => {
                            content.push(ch);
                            i += 1;
                        }
                    }
                }
                let inner_segments = tokenize(&content, mode);
                let words = process(&inner_segments);
                if !words.is_empty() {
                    segments.push(Segment::AngleGroup(words));
                }
            }
            // Parenthesis: "(1.5)"-style timed pauses are dropped; any other
            // parenthesized text (e.g. "(be)cause") stays in the current word.
            '(' => {
                let mut j = i + 1;
                while j < len && chars[j] != ')' {
                    j += 1;
                }
                if j < len {
                    let content: String = chars[i + 1..j].iter().collect();
                    if is_timed_pause(&content) {
                        flush_word(&mut word_buf, &mut segments);
                        i = j + 1;
                    } else {
                        word_buf.push('(');
                        i += 1;
                    }
                } else {
                    // Unmatched '(': treat it as ordinary word text.
                    word_buf.push('(');
                    i += 1;
                }
            }
            // Typographic angle quotes, ceiling/floor brackets and curly
            // quotes are markup only — silently skipped.
            '\u{2039}' | '\u{203a}' | '\u{2308}' | '\u{2309}' | '\u{230a}' | '\u{230b}'
            | '\u{201c}' | '\u{201d}' => {
                i += 1;
            }
            // Comma is its own token, except when completing the "+," code.
            ',' => {
                if word_buf == "+" {
                    word_buf.push(',');
                } else {
                    flush_word(&mut word_buf, &mut segments);
                    segments.push(Segment::Word(",".to_string()));
                }
                i += 1;
            }
            // Whitespace terminates the current word.
            ' ' | '\t' | '\n' | '\r' => {
                flush_word(&mut word_buf, &mut segments);
                i += 1;
            }
            ch => {
                word_buf.push(ch);
                i += 1;
            }
        }
    }
    flush_word(&mut word_buf, &mut segments);
    segments
}
/// Resolve a tokenized segment stream into the output word list.
///
/// Words and angle groups accumulate as discrete `OutputItem`s so that
/// `Replace` and `Retracing` can affect the most recent item as a whole;
/// the items are flattened into plain words at the end.
fn process(segments: &[Segment]) -> Vec<String> {
    let mut output: Vec<OutputItem> = Vec::new();
    let mut i = 0;
    while i < segments.len() {
        match &segments[i] {
            Segment::Word(w) => {
                output.push(OutputItem::Word(w.clone()));
            }
            Segment::AngleGroup(words) => {
                output.push(OutputItem::Group(words.clone()));
            }
            // Drop contributes nothing; KeepOriginal leaves the previous item
            // exactly as spoken.
            Segment::Drop | Segment::KeepOriginal => {
            }
            // Replace the most recent word/group with the bracketed words.
            Segment::Replace(replacement) => {
                if let Some(pos) = output
                    .iter()
                    .rposition(|item| matches!(item, OutputItem::Word(_) | OutputItem::Group(_)))
                {
                    output[pos] = OutputItem::Group(replacement.clone());
                }
            }
            Segment::Retracing => {
                // A run of consecutive retracing markers still deletes only
                // one preceding item.
                while i + 1 < segments.len() && matches!(&segments[i + 1], Segment::Retracing) {
                    i += 1;
                }
                if let Some(pos) = output
                    .iter()
                    .rposition(|item| matches!(item, OutputItem::Word(_) | OutputItem::Group(_)))
                {
                    output.remove(pos);
                }
            }
            // "[x n]": the previous item appears n times total (n-1 copies added).
            Segment::Expand(n) => {
                if let Some(item) = output.last().cloned() {
                    for _ in 1..*n {
                        output.push(item.clone());
                    }
                }
            }
        }
        i += 1;
    }
    // Flatten items into the final word list.
    let mut words = Vec::new();
    for item in output {
        match item {
            OutputItem::Word(w) => words.push(w),
            OutputItem::Group(ws) => words.extend(ws),
        }
    }
    words
}
/// Trim at most one leading `'<'` and at most one trailing `'>'` and `']'`
/// (in that order) left over from group/annotation markup.
fn clean_word_boundaries(word: &str) -> &str {
    let no_open = word.strip_prefix('<').unwrap_or(word);
    let no_close = no_open.strip_suffix('>').unwrap_or(no_open);
    no_close.strip_suffix(']').unwrap_or(no_close)
}
/// Word-level filtering for "clean" output: drops transcription codes and
/// omitted-word markers, strips `@...` special-form suffixes and parentheses.
fn filter_words(words: Vec<String>) -> Vec<String> {
    let mut kept = Vec::new();
    for raw in words {
        let word = clean_word_boundaries(&raw);
        if word.is_empty() {
            continue;
        }
        // Certain code prefixes are passed through untouched.
        if KEEP_PREFIXES.iter().any(|k| word.starts_with(k)) {
            kept.push(word.to_string());
            continue;
        }
        // "0word" marks an omitted word: discard it entirely.
        if word.starts_with('0') && word[1..].starts_with(|c: char| c.is_alphabetic()) {
            continue;
        }
        // Skip tokens that are (or look like) transcription codes.
        let is_code = ESCAPE_WORDS.contains(word)
            || ESCAPE_PREFIXES.iter().any(|e| word.starts_with(e))
            || ESCAPE_SUFFIXES.iter().any(|e| word.ends_with(e));
        if is_code {
            continue;
        }
        // Cut any "@marker" suffix, then drop parenthesis characters.
        let base = word.find('@').map_or(word, |at| &word[..at]);
        let cleaned: String = base.chars().filter(|&c| !matches!(c, '(' | ')')).collect();
        if !cleaned.is_empty() {
            kept.push(cleaned);
        }
    }
    kept
}
/// If the final word ends in '.' or '?' directly after an ASCII lowercase
/// letter (e.g. "what?"), split the punctuation off into its own word.
fn split_trailing_punct(words: &mut Vec<String>) {
    let Some(last) = words.last() else {
        return;
    };
    if last.len() < 2 {
        return;
    }
    let bytes = last.as_bytes();
    let tail = bytes[bytes.len() - 1];
    let before_tail = bytes[bytes.len() - 2];
    if (tail == b'.' || tail == b'?') && before_tail.is_ascii_lowercase() {
        let idx = words.len() - 1;
        // split_off keeps the word prefix in place and yields the punctuation.
        let split_at = words[idx].len() - 1;
        let punct = words[idx].split_off(split_at);
        words.push(punct);
    }
}
/// Produce the "clean" rendering of a transcript utterance: tokenize, resolve
/// retracings/replacements, filter out codes, split trailing punctuation, and
/// join the surviving words with single spaces.
pub(crate) fn clean_utterance(utterance: &str) -> String {
    let mut words = filter_words(process(&tokenize(utterance, Mode::Clean)));
    split_trailing_punct(&mut words);
    words.join(" ")
}
// Codes dropped even from "audible" output. Note that "xxx"/"yyy"/"www"
// (unintelligible but audible speech) are intentionally absent here,
// unlike in `ESCAPE_WORDS`.
static AUDIBLE_ESCAPE_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "0", "++", "+<", "+^", "(.)", "(..)", "(...)", ":", ";", ";;", "<", ">", "xx", "yy",
        "\u{2192}",
    ]
    .into_iter()
    .collect()
});
/// Strip disfluency markers from a word: a leading U+2260, a U+21AB...U+21AB
/// retraced span (only when two distinct markers are present), every '^',
/// and any ':' sandwiched between two alphabetic characters.
fn clean_disfluency(word: &str) -> String {
    // Leading "not-equal" marker.
    let word = word.strip_prefix('\u{2260}').unwrap_or(word);
    // With two (or more) leftwards-arrow markers, everything from the first
    // through the last is a retraced fragment — cut it out. A single,
    // unpaired marker is left alone.
    let holder: String;
    let word = match (word.find('\u{21ab}'), word.rfind('\u{21ab}')) {
        (Some(first), Some(last)) if first != last => {
            let tail = &word[last + '\u{21ab}'.len_utf8()..];
            holder = format!("{}{}", &word[..first], tail);
            holder.as_str()
        }
        _ => word,
    };
    let chars: Vec<char> = word.chars().collect();
    let mut out = String::with_capacity(word.len());
    for (idx, &c) in chars.iter().enumerate() {
        let is_marker = c == '^'
            || (c == ':'
                && idx > 0
                && idx + 1 < chars.len()
                && chars[idx - 1].is_alphabetic()
                && chars[idx + 1].is_alphabetic());
        if !is_marker {
            out.push(c);
        }
    }
    out
}
/// Word-level filtering for "audible" output: keeps what was actually spoken
/// (fillers, fragments, disfluent forms) while still removing pure codes.
fn filter_words_audible(words: Vec<String>) -> Vec<String> {
    let mut result = Vec::new();
    for raw in words {
        let word = clean_word_boundaries(&raw);
        if word.is_empty() {
            continue;
        }
        // Pass-through prefixes (same set as the clean filter).
        if KEEP_PREFIXES.iter().any(|k| word.starts_with(k)) {
            result.push(word.to_string());
            continue;
        }
        // "0word" marks an omitted word — never audible, so always dropped.
        if word.starts_with('0') && word[1..].starts_with(|c: char| c.is_alphabetic()) {
            continue;
        }
        // '&'-prefixed tokens: events, fillers and fragments.
        if let Some(after_amp) = word.strip_prefix('&') {
            if let Some(after_eq) = after_amp.strip_prefix('=') {
                // "&=word" simple events are kept verbatim (after disfluency
                // cleanup); "&=a:b" action events are dropped.
                if after_eq.contains(':') {
                    continue;
                }
                let cleaned = clean_disfluency(word);
                if !cleaned.is_empty() {
                    result.push(cleaned);
                }
                continue;
            }
            // "&+", "&-", "&~" (and bare "&") fragment prefixes: keep the
            // audible remainder, minus any "@..." marker suffix.
            let rest = after_amp;
            let rest = rest
                .strip_prefix('+')
                .or_else(|| rest.strip_prefix('-'))
                .or_else(|| rest.strip_prefix('~'))
                .unwrap_or(rest);
            if !rest.is_empty() {
                let rest = match rest.find('@') {
                    Some(pos) => &rest[..pos],
                    None => rest,
                };
                let cleaned = clean_disfluency(rest);
                if !cleaned.is_empty() {
                    result.push(cleaned);
                }
            }
            continue;
        }
        // Same escape filtering as the clean path, except bare "&" was handled
        // above, so it is excluded from this prefix list.
        let non_amp_escape_prefixes: &[&str] =
            &["[?", "[/", "[<", "[>", "[:", "[!", "[*", "+\"", "+,", "<&"];
        if !AUDIBLE_ESCAPE_WORDS.contains(word)
            && !non_amp_escape_prefixes.iter().any(|e| word.starts_with(e))
            && !ESCAPE_SUFFIXES.iter().any(|e| word.ends_with(e))
        {
            // Cut any "@marker" suffix.
            let word = match word.find('@') {
                Some(pos) => &word[..pos],
                None => word,
            };
            // Remove parenthesized (unspoken) letters, e.g. "(be)cause" -> "cause".
            let mut cleaned = String::with_capacity(word.len());
            let mut in_parens = false;
            for ch in word.chars() {
                match ch {
                    '(' => in_parens = true,
                    ')' => in_parens = false,
                    _ if !in_parens => cleaned.push(ch),
                    _ => {}
                }
            }
            let cleaned = clean_disfluency(&cleaned);
            if !cleaned.is_empty() {
                result.push(cleaned);
            }
        }
    }
    result
}
/// Produce the "as spoken" rendering of a transcript utterance: tokenize in
/// audible mode, resolve segments, keep audible material, split trailing
/// punctuation, and join the words with single spaces.
pub(crate) fn audible_utterance(utterance: &str) -> String {
    let segments = tokenize(utterance, Mode::Audible);
    let mut words = filter_words_audible(process(&segments));
    split_trailing_punct(&mut words);
    words.join(" ")
}
// Unit tests, grouped by the pipeline they exercise: clean_utterance,
// is_timed_pause, audible_utterance, and clean_disfluency.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- clean_utterance: basics and bracket annotations ----
    #[test]
    fn test_empty_string() {
        assert_eq!(clean_utterance(""), "");
    }
    #[test]
    fn test_simple_utterance() {
        assert_eq!(clean_utterance("I want cookie ."), "I want cookie .");
    }
    #[test]
    fn test_drop_explanation() {
        assert_eq!(
            clean_utterance("I want [= desire] cookie ."),
            "I want cookie ."
        );
    }
    #[test]
    fn test_drop_repetition_count() {
        assert_eq!(clean_utterance("cookie [x 3] ."), "cookie .");
    }
    #[test]
    fn test_drop_actions() {
        assert_eq!(clean_utterance("hello [+ IMP] ."), "hello .");
    }
    #[test]
    fn test_drop_error_marker() {
        assert_eq!(clean_utterance("goed [*] ."), "goed .");
    }
    #[test]
    fn test_drop_overlap_markers() {
        assert_eq!(clean_utterance("hello [<] world ."), "hello world .");
        assert_eq!(clean_utterance("hello [>] world ."), "hello world .");
    }
    #[test]
    fn test_drop_pauses() {
        assert_eq!(clean_utterance("hello (1.5) world ."), "hello world .");
    }
    #[test]
    fn test_timestamp_removal() {
        let input = "hello \x15123_456\x15 .";
        assert_eq!(clean_utterance(input), "hello .");
    }

    // ---- clean_utterance: retracing, repetition, replacement ----
    #[test]
    fn test_reformulation_single_word() {
        assert_eq!(clean_utterance("dog [//] cat ."), "cat .");
    }
    #[test]
    fn test_repetition_single_word() {
        assert_eq!(clean_utterance("the [/] the dog ."), "the dog .");
    }
    #[test]
    fn test_reformulation_multi_word() {
        assert_eq!(clean_utterance("< the dog > [//] the cat ."), "the cat .");
    }
    #[test]
    fn test_escape_words_removed() {
        assert_eq!(clean_utterance("xxx ."), ".");
        assert_eq!(clean_utterance("yyy ."), ".");
        assert_eq!(clean_utterance("www ."), ".");
    }
    #[test]
    fn test_filler_removed() {
        assert_eq!(clean_utterance("&um hello ."), "hello .");
    }
    #[test]
    fn test_curly_quotes_removed() {
        assert_eq!(clean_utterance("\u{201c}hello\u{201d} ."), "hello .");
    }
    #[test]
    fn test_question_mark_spacing() {
        assert_eq!(clean_utterance("what ?"), "what ?");
    }
    #[test]
    fn test_sentence_final_period_spacing() {
        assert_eq!(clean_utterance("cookie."), "cookie .");
    }
    #[test]
    fn test_correction_keep_original() {
        assert_eq!(clean_utterance("goed [:: went] ."), "goed .");
    }
    #[test]
    fn test_correction_use_replacement() {
        assert_eq!(clean_utterance("goed [: went] ."), "went .");
    }
    #[test]
    fn test_unicode_brackets_removed() {
        assert_eq!(clean_utterance("\u{2308}hello\u{2309} ."), "hello .");
    }
    #[test]
    fn test_question_mark_attached_to_word() {
        assert_eq!(clean_utterance("what?"), "what ?");
    }
    #[test]
    fn test_nested_reformulations() {
        assert_eq!(clean_utterance("< a b > [//] [/] the cat ."), "the cat .");
    }
    #[test]
    fn test_multi_word_replacement() {
        assert_eq!(clean_utterance("goed [: had gone] ."), "had gone .");
    }
    #[test]
    fn test_angle_group_replacement() {
        assert_eq!(clean_utterance("< the dog > [: the cat] ."), "the cat .");
    }
    #[test]
    fn test_error_marker_before_retracing() {
        assert_eq!(clean_utterance("word [*] [//] next ."), "next .");
    }
    #[test]
    fn test_multiple_annotations() {
        assert_eq!(
            clean_utterance("hello [= greeting] [+ IMP] world ."),
            "hello world ."
        );
    }
    #[test]
    fn test_uncertain_explanation() {
        assert_eq!(clean_utterance("word [=? maybe this] ."), "word .");
    }
    #[test]
    fn test_paralinguistic() {
        assert_eq!(clean_utterance("hello [=! laughing] ."), "hello .");
    }
    #[test]
    fn test_precode() {
        assert_eq!(clean_utterance("[- eng] hello ."), "hello .");
    }
    #[test]
    fn test_pause_dots_filtered() {
        assert_eq!(clean_utterance("hello (.) world ."), "hello world .");
        assert_eq!(clean_utterance("hello (..) world ."), "hello world .");
        assert_eq!(clean_utterance("hello (...) world ."), "hello world .");
    }
    #[test]
    fn test_timed_pause_with_colon() {
        assert_eq!(clean_utterance("hello (2:30.5) world ."), "hello world .");
    }
    #[test]
    fn test_false_start() {
        assert_eq!(
            clean_utterance("want [/-] I need cookie ."),
            "I need cookie ."
        );
    }
    #[test]
    fn test_false_start_angle_group() {
        assert_eq!(
            clean_utterance("< I want > [/-] I need cookie ."),
            "I need cookie ."
        );
    }
    #[test]
    fn test_completion() {
        assert_eq!(clean_utterance("I [///] she went ."), "she went .");
    }
    #[test]
    fn test_omitted_words_filtered() {
        assert_eq!(clean_utterance("0you go ."), "go .");
        assert_eq!(clean_utterance("I 0can go ."), "I go .");
        assert_eq!(clean_utterance("0the dog ."), "dog .");
        assert_eq!(
            clean_utterance("I going 0to do another Bx ."),
            "I going do another Bx ."
        );
        assert_eq!(clean_utterance("0學 去 ."), "去 .");
        assert_eq!(clean_utterance("0你 好 ."), "好 .");
        assert_eq!(clean_utterance("0 dog ."), "dog .");
    }
    #[test]
    fn test_nested_angle_brackets_retracing() {
        assert_eq!(
            clean_utterance("<<how'd> [=? how]> [//] (.) how you hafta do the man ?"),
            "how you hafta do the man ?"
        );
    }
    #[test]
    fn test_nested_angle_brackets_repetition() {
        assert_eq!(
            clean_utterance(
                "<<I got> [<]> [/] I got ink on my fingers <and> [/] and shoe polish ."
            ),
            "I got ink on my fingers and shoe polish ."
        );
    }
    #[test]
    fn test_exclude_single_word() {
        assert_eq!(
            clean_utterance("this is a mor [e] exclude ."),
            "this is a exclude ."
        );
    }
    #[test]
    fn test_exclude_angle_group() {
        assert_eq!(
            clean_utterance("this is <a multi-word> [e] exclude ."),
            "this is exclude ."
        );
    }
    #[test]
    fn test_special_form_markers_stripped() {
        assert_eq!(clean_utterance("bingbing@c ."), "bingbing .");
        assert_eq!(clean_utterance("woofwoof@o ."), "woofwoof .");
        assert_eq!(clean_utterance("istenem@s:hu ."), "istenem .");
        assert_eq!(clean_utterance("um@fp ."), "um .");
        assert_eq!(clean_utterance("b@l ."), "b .");
        assert_eq!(clean_utterance("wug@t ."), "wug .");
        assert_eq!(
            clean_utterance("I got a bingbing@c ."),
            "I got a bingbing ."
        );
    }
    #[test]
    fn test_parentheses_stripped() {
        assert_eq!(clean_utterance("(un)til the end ."), "until the end .");
        assert_eq!(clean_utterance("sit(ting) down ."), "sitting down .");
        assert_eq!(clean_utterance("(be)cause ."), "because .");
    }

    // ---- is_timed_pause ----
    #[test]
    fn test_is_timed_pause() {
        assert!(is_timed_pause("1"));
        assert!(is_timed_pause("1.5"));
        assert!(is_timed_pause("2:30"));
        assert!(is_timed_pause("2:30.5"));
        assert!(is_timed_pause("0:01.23"));
        assert!(!is_timed_pause(""));
        assert!(!is_timed_pause("."));
        assert!(!is_timed_pause(".."));
        assert!(!is_timed_pause("..."));
        assert!(!is_timed_pause("abc"));
        assert!(!is_timed_pause(":5"));
    }

    // ---- audible_utterance ----
    #[test]
    fn test_audible_simple() {
        assert_eq!(audible_utterance("I want cookie ."), "I want cookie .");
    }
    #[test]
    fn test_audible_keeps_xxx() {
        assert_eq!(audible_utterance("xxx ."), "xxx .");
        assert_eq!(audible_utterance("yyy ."), "yyy .");
        assert_eq!(audible_utterance("www ."), "www .");
    }
    #[test]
    fn test_audible_drops_xx() {
        assert_eq!(audible_utterance("xx ."), ".");
        assert_eq!(audible_utterance("yy ."), ".");
    }
    #[test]
    fn test_audible_keeps_repetition() {
        assert_eq!(audible_utterance("the [/] the dog ."), "the the dog .");
    }
    #[test]
    fn test_audible_keeps_repetition_angle_group() {
        assert_eq!(
            audible_utterance("< I wanted > [/] I wanted to invite Margie ."),
            "I wanted I wanted to invite Margie ."
        );
    }
    #[test]
    fn test_audible_keeps_retracing() {
        assert_eq!(
            audible_utterance("< I wanted > [//] blah blah blah ."),
            "I wanted blah blah blah ."
        );
    }
    #[test]
    fn test_audible_keeps_reformulation() {
        assert_eq!(audible_utterance("I [///] she went ."), "I she went .");
    }
    #[test]
    fn test_audible_keeps_false_start() {
        assert_eq!(
            audible_utterance("want [/-] I need cookie ."),
            "want I need cookie ."
        );
    }
    #[test]
    fn test_audible_keeps_excluded() {
        assert_eq!(
            audible_utterance("this is a mor [e] exclude ."),
            "this is a mor exclude ."
        );
    }
    #[test]
    fn test_audible_expansion() {
        assert_eq!(audible_utterance("want [x 3] ."), "want want want .");
    }
    #[test]
    fn test_audible_expansion_single() {
        assert_eq!(audible_utterance("want [x 1] ."), "want .");
    }
    #[test]
    fn test_audible_replacement_keeps_original() {
        assert_eq!(audible_utterance("goed [: went] ."), "goed .");
    }
    #[test]
    fn test_audible_keep_original_unchanged() {
        assert_eq!(audible_utterance("goed [:: went] ."), "goed .");
    }
    #[test]
    fn test_audible_fragment_prefix_minus() {
        assert_eq!(audible_utterance("&-uh hello ."), "uh hello .");
    }
    #[test]
    fn test_audible_fragment_prefix_plus() {
        assert_eq!(audible_utterance("&+um hello ."), "um hello .");
    }
    #[test]
    fn test_audible_fragment_prefix_tilde() {
        assert_eq!(audible_utterance("&~hey hello ."), "hey hello .");
    }
    #[test]
    fn test_audible_fragment_bare_ampersand() {
        assert_eq!(audible_utterance("&um hello ."), "um hello .");
    }
    #[test]
    fn test_audible_simple_event_kept() {
        assert_eq!(audible_utterance("&=laughs hello ."), "&=laughs hello .");
    }
    #[test]
    fn test_audible_simple_event_action_dropped() {
        assert_eq!(audible_utterance("&=imit:baby hello ."), "hello .");
        assert_eq!(audible_utterance("&=ges:ignore hello ."), "hello .");
    }
    #[test]
    fn test_audible_paren_content_removed() {
        assert_eq!(audible_utterance("(un)til the end ."), "til the end .");
        assert_eq!(audible_utterance("sit(ting) down ."), "sit down .");
        assert_eq!(audible_utterance("(be)cause ."), "cause .");
    }
    #[test]
    fn test_audible_disfluency_colon() {
        assert_eq!(audible_utterance("s:paghetti ."), "spaghetti .");
    }
    #[test]
    fn test_audible_disfluency_caret() {
        assert_eq!(audible_utterance("spa^ghetti ."), "spaghetti .");
    }
    #[test]
    fn test_audible_disfluency_not_equal() {
        assert_eq!(audible_utterance("\u{2260}butter ."), "butter .");
    }
    #[test]
    fn test_audible_disfluency_leftwards_arrow_paired() {
        assert_eq!(audible_utterance("like\u{21ab}ike-ike\u{21ab} ."), "like .");
    }
    #[test]
    fn test_audible_disfluency_leftwards_arrow_unpaired() {
        assert_eq!(audible_utterance("like\u{21ab} ."), "like\u{21ab} .");
    }
    #[test]
    fn test_audible_omitted_words_still_filtered() {
        assert_eq!(audible_utterance("0you go ."), "go .");
    }
    #[test]
    fn test_audible_at_markers_stripped() {
        assert_eq!(audible_utterance("bingbing@c ."), "bingbing .");
    }
    #[test]
    fn test_audible_drops_annotations() {
        assert_eq!(
            audible_utterance("I want [= desire] cookie ."),
            "I want cookie ."
        );
    }
    #[test]
    fn test_audible_drops_timestamps() {
        let input = "hello \x15123_456\x15 .";
        assert_eq!(audible_utterance(input), "hello .");
    }
    #[test]
    fn test_audible_drops_pauses() {
        assert_eq!(audible_utterance("hello (1.5) world ."), "hello world .");
        assert_eq!(audible_utterance("hello (.) world ."), "hello world .");
    }

    // ---- clean_disfluency ----
    #[test]
    fn test_clean_disfluency_colon_between_alpha() {
        assert_eq!(clean_disfluency("s:paghetti"), "spaghetti");
    }
    #[test]
    fn test_clean_disfluency_colon_not_between_alpha() {
        assert_eq!(clean_disfluency("xxx:"), "xxx:");
    }
    #[test]
    fn test_clean_disfluency_caret() {
        assert_eq!(clean_disfluency("spa^ghetti"), "spaghetti");
    }
    #[test]
    fn test_clean_disfluency_not_equal_prefix() {
        assert_eq!(clean_disfluency("\u{2260}butter"), "butter");
    }
    #[test]
    fn test_clean_disfluency_paired_arrow() {
        assert_eq!(clean_disfluency("like\u{21ab}ike-ike\u{21ab}"), "like");
    }
    #[test]
    fn test_clean_disfluency_unpaired_arrow() {
        assert_eq!(clean_disfluency("like\u{21ab}"), "like\u{21ab}");
    }
    #[test]
    fn test_clean_disfluency_no_change() {
        assert_eq!(clean_disfluency("hello"), "hello");
    }
}