/// Returns the portion of `line` that follows the first `"] "` separator.
///
/// Segment lines in this module look like `[0:00] Hello world`; this drops
/// everything up to and including the first `"] "`. When the separator is
/// absent the whole line is returned unchanged.
fn text_part(line: &str) -> &str {
    match line.split_once("] ") {
        Some((_prefix, rest)) => rest,
        None => line,
    }
}
/// Returns `true` when `text`, trimmed and lower-cased, is exactly one of
/// the markers this module always treats as noise: `[music]`,
/// `[blank_audio]`, `[silence]` or a bare `music`.
fn is_always_noise(text: &str) -> bool {
    let normalized = text.trim().to_lowercase();
    matches!(
        normalized.as_str(),
        "[music]" | "[blank_audio]" | "[silence]" | "music"
    )
}
/// Lower-case words that identify a bracketed or parenthesised segment as a
/// non-speech marker (see `is_noise_word`, which lower-cases its input
/// before looking it up here).
const NOISE_WORDS: &[&str] = &[
    // English sound descriptions.
    "crying",
    "laughter",
    "laughing",
    "applause",
    "growling",
    "music",
    "sobbing",
    "cheering",
    "sighing",
    "clapping",
    "coughing",
    "sneezing",
    "gasping",
    "whispering",
    "mumbling",
    "humming",
    "breathing",
    "silence",
    "snoring",
    "yelling",
    "screaming",
    "blank_audio",
    "inaudible",
    "noise",
    "crosstalk",
    "typing",
    "static",
    "beep",
    "ringing",
    // Equivalents in other languages (e.g. Polish, Spanish, German, French,
    // Italian).
    "śmiech",
    "risas",
    "musik",
    "musique",
    "musica",
    "música",
    "muzyka",
    "applaus",
    "aplausos",
    "applausi",
    "oklaski",
    "ruido",
    "geräusch",
    "stille",
    "silencio",
    "cisza",
    "rires",
    "rire",
    "gelächter",
    // Late English addition, kept last to preserve the original order.
    "weeping",
];
/// Whole-line phrases treated as known hallucinations.
///
/// Entries are compared against a line that has already been lower-cased,
/// trimmed, and stripped of trailing `.`, `!`, `?`, `,`, `;` and `:` (see
/// `is_known_hallucination`). Every entry must therefore be lower-case and
/// must not end in one of those characters, or it can never match.
const KNOWN_HALLUCINATION_PHRASES: &[&str] = &[
    // Video sign-off boilerplate.
    "thank you for watching",
    "thanks for watching",
    "thank you so much for watching",
    "please subscribe to our channel",
    "please subscribe",
    "please like and subscribe",
    "like and subscribe",
    "smash that like button",
    "don't forget to subscribe",
    "see you in the next video",
    "see you next time",
    // Subtitle / caption attribution lines.
    "subtitles by the amara.org community",
    "transcribed by the amara.org community",
    "translated by the amara.org community",
    "the amara.org community",
    "amara.org community",
    "captions by the cyclope",
];
/// Lower-case attribution prefixes. `is_known_hallucination` treats any
/// normalized line that starts with one of these as a hallucination,
/// whatever follows the prefix.
const HALLUCINATION_LINE_PREFIXES: &[&str] = &[
    // Transcription credits.
    "transcripted by", "transcribed by",
    // Caption credits.
    "captions by", "captioned by",
    // Subtitle and translation credits.
    "subtitles by", "translated by",
];
/// Returns `true` when the first whitespace-delimited token of `text` starts
/// with `www.`, `http://` or `https://`.
///
/// The comparison is case-sensitive. Empty or whitespace-only input has no
/// first token and yields `false`.
fn is_url_line(text: &str) -> bool {
    const URL_PREFIXES: [&str; 3] = ["www.", "http://", "https://"];
    text.split_whitespace()
        .next()
        .is_some_and(|token| URL_PREFIXES.iter().any(|prefix| token.starts_with(*prefix)))
}
/// Decides whether `text` is a known hallucinated line.
///
/// The text is lower-cased, trimmed, stripped of trailing `.`, `!`, `?`,
/// `,`, `;` and `:`, then trimmed again. A non-empty result is a
/// hallucination when it
///
/// * equals an entry of `KNOWN_HALLUCINATION_PHRASES`, or
/// * starts with an entry of `HALLUCINATION_LINE_PREFIXES`, or
/// * begins with a URL-like token (see `is_url_line`).
///
/// Phrases are matched against the whole line only, so a longer sentence
/// that merely contains one of them is not flagged.
fn is_known_hallucination(text: &str) -> bool {
    const TRAILING_PUNCTUATION: [char; 6] = ['.', '!', '?', ',', ';', ':'];

    let lowered = text.to_lowercase();
    let candidate = lowered
        .trim()
        .trim_end_matches(TRAILING_PUNCTUATION)
        .trim();

    !candidate.is_empty()
        && (KNOWN_HALLUCINATION_PHRASES
            .iter()
            .any(|phrase| *phrase == candidate)
            || HALLUCINATION_LINE_PREFIXES
                .iter()
                .any(|prefix| candidate.starts_with(*prefix))
            || is_url_line(candidate))
}
/// Case-insensitive membership test against `NOISE_WORDS`.
fn is_noise_word(word: &str) -> bool {
    let needle = word.to_lowercase();
    NOISE_WORDS.contains(&needle.as_str())
}
pub fn is_noise_marker(text: &str) -> bool {
let t = text.trim();
if t.is_empty() {
return false;
}
if t.starts_with("[...]") {
return false;
}
let t = t.strip_suffix('.').unwrap_or(t);
let matched =
(t.starts_with('[') && t.ends_with(']')) || (t.starts_with('(') && t.ends_with(')'));
if !matched {
return false;
}
let inner = &t[1..t.len() - 1];
if inner.chars().all(|c| c.is_ascii_digit() || c == ':') {
return false;
}
let word_count = inner.split_whitespace().count();
if !(1..=4).contains(&word_count) || inner.len() > 40 {
return false;
}
inner
.split_whitespace()
.next_back()
.is_some_and(is_noise_word)
}
pub fn is_all_noise(lines: &[String]) -> bool {
let mut saw_any = false;
for line in lines {
let text = text_part(line).trim();
if text.is_empty() {
continue;
}
saw_any = true;
if !is_noise_marker(text) {
return false;
}
}
saw_any
}
/// Line counts recorded after each stage of the cleaning pipeline, as
/// filled in by `clean_segments_with_options`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct CleanStats {
    /// Number of input segments.
    pub original_lines: usize,
    /// Line count after the consecutive-repetition stage.
    pub after_consecutive_dedup: usize,
    /// Line count after the interleaved-repetition stage.
    pub after_interleaved_dedup: usize,
    /// Line count after the foreign-script stage.
    pub after_script_filter: usize,
    /// Line count after the noise-marker stage, the last stage of the
    /// pipeline.
    pub after_noise_markers: usize,
    /// Line count after the trailing-noise stage.
    pub after_trailing_trim: usize,
    /// Line count after the trailing-command stage.
    pub after_command_strip: usize,
    /// Line count after the known-hallucination stage.
    pub after_hallucination_strip: usize,
    /// `original_lines` minus the final line count, saturating at zero.
    pub lines_removed: usize,
    /// Whether the cleaned output consists solely of noise markers.
    pub all_noise: bool,
}

impl CleanStats {
    /// Renders a one-line summary: original count, count after the
    /// noise-marker stage, and the number of lines removed.
    pub fn summary(&self) -> String {
        let before = self.original_lines;
        let after = self.after_noise_markers;
        let removed = self.lines_removed;
        format!(
            "whisper-guard: {} → {} segments ({} removed)",
            before, after, removed,
        )
    }
}
/// Switches for the individual stages run by `clean_segments_with_options`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CleanOptions {
    /// Run `dedup_segments`.
    pub dedup_consecutive: bool,
    /// Run `dedup_interleaved`.
    pub dedup_interleaved: bool,
    /// Run `strip_foreign_script`.
    pub strip_foreign_script: bool,
    /// Run `collapse_noise_markers`.
    pub collapse_noise_markers: bool,
    /// Run `trim_trailing_noise`.
    pub trim_trailing_noise: bool,
    /// Run `strip_trailing_commands`.
    pub strip_trailing_commands: bool,
    /// Run `strip_known_hallucinations`.
    pub strip_known_hallucinations: bool,
    /// When `false` (and `dedup_consecutive` is on), the annotation lines
    /// inserted by `dedup_segments` are dropped from the output.
    pub keep_dedup_annotations: bool,
}

impl Default for CleanOptions {
    /// Every stage enabled, annotations kept.
    fn default() -> Self {
        Self::with_all_stages(true)
    }
}

impl CleanOptions {
    /// Builds options with every stage switch set to `enabled`.
    /// `keep_dedup_annotations` is `true` in both presets.
    fn with_all_stages(enabled: bool) -> Self {
        Self {
            dedup_consecutive: enabled,
            dedup_interleaved: enabled,
            strip_foreign_script: enabled,
            collapse_noise_markers: enabled,
            trim_trailing_noise: enabled,
            strip_trailing_commands: enabled,
            strip_known_hallucinations: enabled,
            keep_dedup_annotations: true,
        }
    }

    /// Every stage enabled; identical to `CleanOptions::default()`.
    pub fn all() -> Self {
        Self::default()
    }

    /// Every stage disabled; `keep_dedup_annotations` stays `true`.
    pub fn none() -> Self {
        Self::with_all_stages(false)
    }
}
/// Leading text of the annotation line that `dedup_segments` inserts after
/// a collapsed run. `clean_segments_with_options` also uses it to recognise
/// and drop those annotations when `keep_dedup_annotations` is `false`.
const DEDUP_ANNOTATION_PREFIX: &str = "[...] [repeated audio removed";
/// Cleans `segments` with the default options (`CleanOptions::default()`).
///
/// Returns the cleaned lines together with the per-stage statistics.
pub fn clean_segments(segments: &[String]) -> (Vec<String>, CleanStats) {
    let defaults = CleanOptions::default();
    clean_segments_with_options(segments, &defaults)
}
pub fn clean_segments_with_options(
segments: &[String],
opts: &CleanOptions,
) -> (Vec<String>, CleanStats) {
let original_count = segments.len();
let mut lines: Vec<String> = segments.to_vec();
if opts.strip_known_hallucinations {
lines = strip_known_hallucinations(&lines);
}
let after_hallucination = lines.len();
if opts.dedup_consecutive {
lines = dedup_segments(&lines);
if !opts.keep_dedup_annotations {
lines.retain(|s| !s.starts_with(DEDUP_ANNOTATION_PREFIX));
}
}
let after_consecutive = lines.len();
if opts.dedup_interleaved {
lines = dedup_interleaved(&lines);
}
let after_interleaved = lines.len();
if opts.strip_foreign_script {
lines = strip_foreign_script(&lines);
}
let after_script = lines.len();
if opts.strip_trailing_commands {
lines = strip_trailing_commands(&lines);
}
let after_command = lines.len();
if opts.trim_trailing_noise {
lines = trim_trailing_noise(&lines);
}
let after_trim = lines.len();
if opts.collapse_noise_markers {
lines = collapse_noise_markers(&lines);
}
let after_noise = lines.len();
let stats = CleanStats {
original_lines: original_count,
after_consecutive_dedup: after_consecutive,
after_interleaved_dedup: after_interleaved,
after_script_filter: after_script,
after_noise_markers: after_noise,
after_trailing_trim: after_trim,
after_command_strip: after_command,
after_hallucination_strip: after_hallucination,
lines_removed: original_count.saturating_sub(after_noise),
all_noise: is_all_noise(&lines),
};
(lines, stats)
}
/// Cleans a whole transcript given as one string.
///
/// The input is split with `str::lines`, each line is cleaned as a segment
/// via `clean_segments`, and the surviving lines are joined with `\n`
/// (no trailing newline is added).
pub fn clean_transcript(transcript: &str) -> (String, CleanStats) {
    let segments: Vec<String> = transcript.lines().map(String::from).collect();
    let (cleaned, stats) = clean_segments(&segments);
    let joined = cleaned.join("\n");
    (joined, stats)
}
/// Collapses runs of three or more consecutive, near-identical segments.
///
/// Starting at each line, the run is extended while the following lines'
/// text (see `text_part`) has a similarity of at least `0.8` to the run's
/// first line. A run of three or more is replaced by its first line plus
/// one annotation line beginning with `DEDUP_ANNOTATION_PREFIX`; shorter
/// runs are copied through unchanged. Lines that `is_always_noise` accepts
/// are never used as the start of a run.
///
/// Inputs with fewer than three lines are returned as-is.
pub fn dedup_segments(lines: &[String]) -> Vec<String> {
    if lines.len() < 3 {
        return lines.to_vec();
    }

    /// Case-insensitive similarity in `[0.0, 1.0]`.
    ///
    /// * empty input → `0.0`;
    /// * equal after lower-casing → `1.0`;
    /// * one contains the other → ratio of byte lengths;
    /// * otherwise → matched words / word count of the longer side.
    fn similarity(a: &str, b: &str) -> f64 {
        if a.is_empty() || b.is_empty() {
            return 0.0;
        }
        let a_lower = a.to_lowercase();
        let b_lower = b.to_lowercase();
        if a_lower == b_lower {
            return 1.0;
        }
        let (short, long) = if a_lower.len() <= b_lower.len() {
            (&a_lower, &b_lower)
        } else {
            (&b_lower, &a_lower)
        };
        if long.contains(short.as_str()) {
            return short.len() as f64 / long.len() as f64;
        }

        let a_words: Vec<&str> = a_lower.split_whitespace().collect();
        let mut unmatched_b: Vec<&str> = b_lower.split_whitespace().collect();
        let total = a_words.len().max(unmatched_b.len());
        if total == 0 {
            return 0.0;
        }

        // Pair each word of `a` with at most one still-unmatched word of
        // `b`. Counting every word of `a` that merely appears somewhere in
        // `b` would let repeated words inflate the score ("no no no no no"
        // vs "no way" would score 1.0) and collapse distinct segments.
        let mut matching = 0usize;
        for word in &a_words {
            if let Some(pos) = unmatched_b.iter().position(|candidate| candidate == word) {
                unmatched_b.swap_remove(pos);
                matching += 1;
            }
        }
        matching as f64 / total as f64
    }

    let mut result = Vec::with_capacity(lines.len());
    let mut i = 0;
    while i < lines.len() {
        let base_text = text_part(&lines[i]);
        // Always-noise markers are left for the noise-specific stages.
        if is_always_noise(base_text) {
            result.push(lines[i].clone());
            i += 1;
            continue;
        }

        // Extend the run while each following line resembles the first one.
        let mut run_end = i + 1;
        while run_end < lines.len() {
            let candidate = text_part(&lines[run_end]);
            if similarity(base_text, candidate) >= 0.8 {
                run_end += 1;
            } else {
                break;
            }
        }

        let run_len = run_end - i;
        if run_len >= 3 {
            tracing::debug!(
                first_segment = i,
                repeated_count = run_len,
                text = base_text,
                "detected repetition loop in whisper output - collapsing {} segments",
                run_len
            );
            // Keep the first occurrence and note how many followers were dropped.
            result.push(lines[i].clone());
            result.push(format!(
                "{} - {} identical segments collapsed]",
                DEDUP_ANNOTATION_PREFIX,
                run_len - 1
            ));
            i = run_end;
        } else {
            result.push(lines[i].clone());
            i += 1;
        }
    }
    result
}
/// Removes hallucination loops in which one phrase keeps recurring,
/// possibly interleaved with filler words or a few other lines.
///
/// A 10-line window slides over the input one line at a time. When some
/// non-filler normalized text occurs at least 5 times in the window, the
/// region is extended forward while lines are that phrase, fillers, or gaps
/// of at most two other lines followed by the phrase again. Within the
/// region, the first occurrence of the phrase is kept and everything after
/// it is removed; each removed run is replaced by one annotation line.
/// Lines in the region that come *before* the first occurrence are kept.
///
/// Inputs with fewer than 6 lines, or with no detected loop, are returned
/// unchanged.
pub fn dedup_interleaved(lines: &[String]) -> Vec<String> {
    if lines.len() < 6 {
        return lines.to_vec();
    }

    /// Lower-cases, drops everything but alphanumerics and whitespace, and
    /// collapses whitespace runs to single spaces.
    fn normalize(text: &str) -> String {
        text.to_lowercase()
            .chars()
            .filter(|c| c.is_alphanumeric() || c.is_whitespace())
            .collect::<String>()
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Short acknowledgement words that may sit between loop repetitions.
    fn is_filler(text: &str) -> bool {
        let normalized = text.trim().to_lowercase();
        let normalized = normalized.trim_matches(|c: char| !c.is_alphanumeric());
        matches!(
            normalized,
            "okay"
                | "ok"
                | "yeah"
                | "yes"
                | "right"
                | "so"
                | "and"
                | "but"
                | "well"
                | "uh"
                | "um"
                | "hmm"
                | "mhm"
        )
    }

    let texts: Vec<String> = lines.iter().map(|l| normalize(text_part(l))).collect();
    let fillers: Vec<bool> = texts.iter().map(|t| is_filler(t)).collect();
    let mut remove = vec![false; lines.len()];
    let window_size = 10;
    let min_occurrences = 5;

    let mut i = 0;
    while i + window_size <= lines.len() {
        // Positions of every non-filler, non-empty text inside the window.
        let mut freq: std::collections::BTreeMap<&str, Vec<usize>> =
            std::collections::BTreeMap::new();
        for j in i..i + window_size {
            if !fillers[j] && !texts[j].is_empty() {
                freq.entry(&texts[j]).or_default().push(j);
            }
        }

        // Most frequent phrase; ties are broken by the phrase text so the
        // choice is deterministic.
        let dominant = freq
            .iter()
            .max_by(|(phrase_a, pos_a), (phrase_b, pos_b)| {
                pos_a
                    .len()
                    .cmp(&pos_b.len())
                    .then_with(|| phrase_a.cmp(phrase_b))
            })
            .filter(|(_, positions)| positions.len() >= min_occurrences);

        if let Some((phrase, _)) = dominant {
            let phrase = phrase.to_string();

            // Grow the region past the window while the loop continues.
            let mut region_end = i + window_size;
            while region_end < lines.len() {
                let t = &texts[region_end];
                if *t == phrase || fillers[region_end] {
                    region_end += 1;
                } else {
                    // Tolerate up to two other lines if the phrase resumes
                    // right after them.
                    let mut gap = 0;
                    let mut found_resume = false;
                    for t in texts
                        .iter()
                        .take(lines.len().min(region_end + 3))
                        .skip(region_end)
                    {
                        if *t == phrase {
                            found_resume = true;
                            break;
                        }
                        gap += 1;
                    }
                    if found_resume && gap <= 2 {
                        region_end += gap + 1;
                    } else {
                        break;
                    }
                }
            }

            let region_len = region_end - i;
            let actual_count = (i..region_end).filter(|&j| texts[j] == phrase).count();
            if actual_count >= min_occurrences && region_len >= 6 {
                tracing::debug!(
                    region_start = i,
                    region_end = region_end,
                    occurrences = actual_count,
                    filler_count = (i..region_end).filter(|&j| fillers[j]).count(),
                    phrase = phrase,
                    "detected interleaved hallucination loop - marking {} lines for removal",
                    region_len
                );
                // The window slides one line at a time, so the detecting
                // window usually still contains the last genuine line(s)
                // before the loop. Those precede the first occurrence of the
                // phrase and must survive: only the first occurrence is kept
                // as the representative, and only what follows it is removed.
                let mut seen_phrase = false;
                for j in i..region_end {
                    if texts[j] == phrase {
                        if seen_phrase {
                            remove[j] = true;
                        } else {
                            seen_phrase = true;
                        }
                    } else if seen_phrase {
                        remove[j] = true;
                    }
                }
                i = region_end;
                continue;
            }
        }
        i += 1;
    }

    let removed_count = remove.iter().filter(|&&r| r).count();
    if removed_count > 0 {
        let mut result = Vec::with_capacity(lines.len() - removed_count + 1);
        let mut in_removed_run = false;
        for (idx, line) in lines.iter().enumerate() {
            if remove[idx] {
                // Emit one annotation at the start of each removed run.
                if !in_removed_run {
                    in_removed_run = true;
                    let run_len = (idx..lines.len()).take_while(|&j| remove[j]).count();
                    result.push(format!(
                        "[...] [hallucinated repetition removed - {} lines collapsed]",
                        run_len
                    ));
                }
            } else {
                in_removed_run = false;
                result.push(line.clone());
            }
        }
        tracing::info!(
            original = lines.len(),
            removed = removed_count,
            remaining = result.len(),
            "interleaved dedup complete"
        );
        result
    } else {
        lines.to_vec()
    }
}
/// Collapses runs of noise markers and, when markers dominate, strips the
/// scattered ones too.
///
/// 1. Every run of three or more consecutive marker lines (see
///    `is_noise_marker`) is replaced by its first line plus one annotation
///    line; shorter runs are kept.
/// 2. If, after step 1, at least 8 markers remain, there is at least one
///    non-marker line, and markers make up at least 66% of the lines, all
///    remaining markers are dropped and a single summary line is appended.
///
/// Inputs with fewer than three lines are returned unchanged.
pub fn collapse_noise_markers(lines: &[String]) -> Vec<String> {
    /// Whether the text part of `line` is a noise marker.
    fn is_marker_line(line: &str) -> bool {
        is_noise_marker(text_part(line))
    }

    if lines.len() < 3 {
        return lines.to_vec();
    }

    let flags: Vec<bool> = lines.iter().map(|line| is_marker_line(line)).collect();

    // Step 1: collapse long consecutive runs.
    let mut out: Vec<String> = Vec::with_capacity(lines.len());
    let mut cursor = 0;
    while cursor < lines.len() {
        let run_len = flags[cursor..].iter().take_while(|&&flag| flag).count();
        match run_len {
            0 => {
                out.push(lines[cursor].clone());
                cursor += 1;
            }
            1 | 2 => {
                out.extend_from_slice(&lines[cursor..cursor + run_len]);
                cursor += run_len;
            }
            _ => {
                out.push(lines[cursor].clone());
                out.push(format!(
                    "[...] [non-speech audio removed - {} markers collapsed]",
                    run_len - 1
                ));
                tracing::debug!(
                    run_start = cursor,
                    collapsed = run_len - 1,
                    sample = text_part(&lines[cursor]),
                    "collapsed consecutive noise markers"
                );
                cursor += run_len;
            }
        }
    }

    // Step 2: strip scattered markers when they dominate the output.
    let marker_total = out.iter().filter(|line| is_marker_line(line)).count();
    let content_total = out.len().saturating_sub(marker_total);
    if marker_total >= 8 && content_total > 0 {
        let ratio = marker_total as f64 / out.len() as f64;
        if ratio >= 0.66 {
            tracing::info!(
                markers = marker_total,
                total = out.len(),
                ratio = format!("{:.0}%", ratio * 100.0),
                "high noise marker density - stripping scattered markers"
            );
            let mut kept: Vec<String> = out
                .iter()
                .filter(|line| !is_marker_line(line))
                .cloned()
                .collect();
            let dropped = out.len() - kept.len();
            kept.push(format!(
                "[{} scattered non-speech markers removed]",
                dropped
            ));
            return kept;
        }
    }

    let removed = lines.len() - out.len();
    if removed > 0 {
        tracing::info!(
            original = lines.len(),
            removed = removed,
            "collapsed noise markers"
        );
    }
    out
}
/// Drops lines written in a script other than the transcript's dominant one.
///
/// Each line is classified with `classify_script`. If one of Latin, CJK or
/// "other" accounts for at least 70% of the classifiable lines, every line
/// classified as a different script is removed; `Unknown` lines (no
/// alphabetic characters) are always kept. With fewer than two lines, fewer
/// than two classifiable lines, or no 70% majority, the input is returned
/// unchanged.
pub fn strip_foreign_script(lines: &[String]) -> Vec<String> {
    if lines.len() < 2 {
        return lines.to_vec();
    }

    let scripts: Vec<Script> = lines
        .iter()
        .map(|line| classify_script(text_part(line)))
        .collect();

    let tally = |wanted: &Script| scripts.iter().filter(|s| *s == wanted).count();
    let latin_count = tally(&Script::Latin);
    let cjk_count = tally(&Script::Cjk);
    let other_count = tally(&Script::Other);

    let meaningful = latin_count + cjk_count + other_count;
    if meaningful < 2 {
        return lines.to_vec();
    }

    // First script (in Latin, CJK, other order) reaching the 70% threshold.
    let candidates = [
        (latin_count, Script::Latin),
        (cjk_count, Script::Cjk),
        (other_count, Script::Other),
    ];
    let majority = match candidates
        .iter()
        .find(|(count, _)| *count as f64 / meaningful as f64 >= 0.7)
    {
        Some((_, script)) => script.clone(),
        None => return lines.to_vec(),
    };

    let result: Vec<String> = lines
        .iter()
        .zip(&scripts)
        .filter(|(_, script)| **script == Script::Unknown || **script == majority)
        .map(|(line, _)| line.clone())
        .collect();

    let removed = lines.len() - result.len();
    if removed > 0 {
        tracing::info!(
            removed = removed,
            majority = ?majority,
            "removed foreign-script hallucination lines"
        );
    }
    result
}
/// Returns `lines` without those whose text part is a known hallucination
/// (see `is_known_hallucination`). Order is preserved and no annotation is
/// added.
pub fn strip_known_hallucinations(lines: &[String]) -> Vec<String> {
    let result: Vec<String> = lines
        .iter()
        .filter(|line| !is_known_hallucination(text_part(line)))
        .cloned()
        .collect();

    let removed = lines.len() - result.len();
    if removed > 0 {
        tracing::info!(
            removed = removed,
            "removed known whisper-hallucination phrases"
        );
    }
    result
}
/// Writing-system class assigned to a line by `classify_script`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Script {
    /// At least half of the alphabetic characters are Latin.
    Latin,
    /// Otherwise, at least half are CJK (ideographs, kana, Hangul syllables).
    Cjk,
    /// Any other mix of alphabetic characters.
    Other,
    /// No alphabetic characters at all.
    Unknown,
}

/// Classifies `text` by the script of its alphabetic characters.
///
/// Non-alphabetic characters are ignored. Latin is checked before CJK, so a
/// text split evenly between the two is classified as Latin.
fn classify_script(text: &str) -> Script {
    let (mut latin, mut cjk, mut other_script) = (0u32, 0u32, 0u32);

    for ch in text.chars().filter(|c| c.is_alphabetic()) {
        match ch {
            // ASCII letters, U+00C0–U+024F (Latin-1 letters through Latin
            // Extended-B) and U+1E00–U+1EFF (Latin Extended Additional).
            'a'..='z' | 'A'..='Z' | '\u{00C0}'..='\u{024F}' | '\u{1E00}'..='\u{1EFF}' => {
                latin += 1;
            }
            // CJK Unified Ideographs, Extension A, Hiragana, Katakana and
            // Hangul Syllables.
            '\u{4E00}'..='\u{9FFF}'
            | '\u{3400}'..='\u{4DBF}'
            | '\u{3040}'..='\u{309F}'
            | '\u{30A0}'..='\u{30FF}'
            | '\u{AC00}'..='\u{D7AF}' => {
                cjk += 1;
            }
            _ => other_script += 1,
        }
    }

    let total = latin + cjk + other_script;
    if total == 0 {
        return Script::Unknown;
    }

    let share = |count: u32| f64::from(count) / f64::from(total);
    if share(latin) >= 0.5 {
        Script::Latin
    } else if share(cjk) >= 0.5 {
        Script::Cjk
    } else {
        Script::Other
    }
}
/// Removes noise from the end of a transcript.
///
/// Working backwards from the last line:
///
/// 1. the trailing run of always-noise lines (see `is_always_noise`) is
///    counted;
/// 2. directly before it, the run of filler lines (`you`, `okay.`, `yeah.`,
///    case-insensitive) is counted.
///
/// If the filler run has at least 5 lines, both runs are cut. Otherwise
/// only the always-noise run is cut and the fillers stay. Whenever anything
/// is cut, a `[Recording ended - N lines of trailing noise removed]` line
/// is appended in its place.
pub fn trim_trailing_noise(lines: &[String]) -> Vec<String> {
    /// Exact-match fillers seen at the tail of recordings.
    fn is_filler(text: &str) -> bool {
        matches!(
            text.trim().to_lowercase().as_str(),
            "you" | "okay." | "yeah."
        )
    }

    if lines.is_empty() {
        return Vec::new();
    }

    let always_noise_count = lines
        .iter()
        .rev()
        .take_while(|line| is_always_noise(text_part(line)))
        .count();
    let noise_trim_from = lines.len() - always_noise_count;

    let filler_count = lines[..noise_trim_from]
        .iter()
        .rev()
        .take_while(|line| is_filler(text_part(line)))
        .count();
    let filler_trim_from = noise_trim_from - filler_count;

    // A short filler tail is kept. With no always-noise lines,
    // `noise_trim_from` equals `lines.len()`, so nothing is cut.
    let trim_from = if filler_count >= 5 {
        filler_trim_from
    } else {
        noise_trim_from
    };

    let trimmed_count = lines.len() - trim_from;
    if trimmed_count == 0 {
        return lines.to_vec();
    }

    tracing::info!(
        trimmed = trimmed_count,
        always_noise = always_noise_count,
        filler = filler_count,
        "removed trailing noise from transcript"
    );
    lines[..trim_from]
        .iter()
        .cloned()
        .chain(std::iter::once(format!(
            "[Recording ended - {} lines of trailing noise removed]",
            trimmed_count
        )))
        .collect()
}
/// Drops up to two trailing lines that end with a "stop recording"-style
/// voice command.
///
/// The last line's text (see `text_part`) is trimmed, lower-cased and
/// stripped of trailing `.`. It is removed when it ends with one of the
/// known commands *at a word boundary*, i.e. the command is the whole text
/// or is preceded by a non-alphanumeric character. The check is then
/// repeated once on the new last line.
///
/// The boundary check stops ordinary sentences from being mistaken for
/// commands: "I'll send the recording" ends with the letters of "end the
/// recording" but is not a command.
pub fn strip_trailing_commands(lines: &[String]) -> Vec<String> {
    const COMMANDS: &[&str] = &[
        "stop recording",
        "stop the recording",
        "end recording",
        "end the recording",
        "stop transcription",
        "end transcription",
        "stop transcribing",
        "hey minutes stop",
        "minutes stop",
        "okay stop",
        "ok stop",
    ];

    /// `true` when `text` ends with `cmd` and the command is not the tail
    /// of a longer word.
    fn ends_with_command(text: &str, cmd: &str) -> bool {
        text.strip_suffix(cmd).is_some_and(|head| {
            !head
                .chars()
                .next_back()
                .is_some_and(char::is_alphanumeric)
        })
    }

    let mut result = lines.to_vec();
    // At most two passes: a command may be split over, or repeated on, the
    // final two segments.
    for _ in 0..2 {
        if let Some(last) = result.last() {
            let text = text_part(last).trim().to_lowercase();
            let text = text.trim_end_matches('.');
            if COMMANDS.iter().any(|cmd| ends_with_command(text, cmd)) {
                tracing::debug!(
                    line = result.last().map(|l| l.as_str()).unwrap_or(""),
                    "stripping trailing voice command"
                );
                result.pop();
            } else {
                break;
            }
        }
    }
    result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn clean_transcript_removes_repetition() {
let input = "[0:00] Hello world\n[0:03] Hello world\n[0:06] Hello world\n[0:09] Hello world\n[0:12] Something different\n";
let (cleaned, stats) = clean_transcript(input);
assert!(stats.lines_removed > 0);
assert!(cleaned.contains("Something different"));
assert!(cleaned.contains("repeated audio removed"));
}
#[test]
fn clean_transcript_preserves_normal_text() {
let input = "[0:00] First line\n[0:05] Second line\n[0:10] Third line\n";
let (cleaned, stats) = clean_transcript(input);
assert_eq!(stats.lines_removed, 0);
assert!(cleaned.contains("First line"));
assert!(cleaned.contains("Third line"));
}
#[test]
fn dedup_no_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] How are you".into(),
"[0:06] Fine thanks".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result, lines);
}
#[test]
fn dedup_collapses_exact_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Hello world".into(),
"[0:09] Hello world".into(),
"[0:12] Something different".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result.len(), 3);
assert!(result[0].contains("Hello world"));
assert!(result[1].contains("repeated audio removed"));
assert!(result[2].contains("Something different"));
}
#[test]
fn dedup_collapses_near_identical() {
let lines = vec![
"[0:00] Ok bene le macedi diesel".into(),
"[0:03] Ok, bene le macedi diesel".into(),
"[0:06] Ok bene, le macedi diesel".into(),
"[0:09] Good morning".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result.len(), 3);
assert!(result[1].contains("repeated audio removed"));
}
#[test]
fn dedup_leaves_two_similar_alone() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Something else".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result, lines);
}
#[test]
fn dedup_handles_empty() {
let result = dedup_segments(&[]);
assert!(result.is_empty());
}
#[test]
fn dedup_handles_single_line() {
let lines = vec!["[0:00] Hello".into()];
let result = dedup_segments(&lines);
assert_eq!(result, lines);
}
#[test]
fn dedup_multiple_runs() {
let lines = vec![
"[0:00] First phrase".into(),
"[0:03] First phrase".into(),
"[0:06] First phrase".into(),
"[0:09] Second phrase".into(),
"[0:12] Second phrase".into(),
"[0:15] Second phrase".into(),
"[0:18] Second phrase".into(),
"[0:21] Normal text".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result.len(), 5);
assert!(result[1].contains("2 identical"));
assert!(result[3].contains("3 identical"));
}
#[test]
fn interleaved_catches_alternating_pattern() {
let mut lines: Vec<String> = Vec::new();
for i in 0..20 {
let ts = i * 2;
if i % 2 == 0 {
lines.push(format!(
"[{}:{:02}] So I'm going to pick his brain as well.",
ts / 60,
ts % 60
));
} else {
lines.push(format!("[{}:{:02}] Okay.", ts / 60, ts % 60));
}
}
lines.push("[0:40] Something completely different".into());
let result = dedup_interleaved(&lines);
assert!(
result.len() <= 4,
"expected <=4 lines, got {}: {:?}",
result.len(),
result
);
assert!(result.iter().any(|l| l.contains("pick his brain")));
assert!(result
.iter()
.any(|l| l.contains("hallucinated repetition removed")));
assert!(result
.last()
.unwrap()
.contains("Something completely different"));
}
#[test]
fn interleaved_leaves_normal_conversation() {
let lines = vec![
"[0:00] Hello how are you".into(),
"[0:05] I'm fine thanks".into(),
"[0:10] Great to hear".into(),
"[0:15] Let's talk about the project".into(),
"[0:20] Sure what's the update".into(),
"[0:25] We shipped the feature".into(),
];
let result = dedup_interleaved(&lines);
assert_eq!(result, lines);
}
#[test]
fn interleaved_ignores_short_repeats() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:02] Okay.".into(),
"[0:04] Hello world".into(),
"[0:06] Okay.".into(),
"[0:08] Hello world".into(),
"[0:10] Something else".into(),
];
let result = dedup_interleaved(&lines);
assert_eq!(result, lines);
}
#[test]
fn trim_trailing_music() {
let mut lines: Vec<String> = vec![
"[0:00] Hello world".into(),
"[0:05] Some real content".into(),
];
for i in 0..20 {
lines.push(format!("[{}:00] [music]", i + 1));
}
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 3);
assert!(result[0].contains("Hello world"));
assert!(result[1].contains("real content"));
assert!(result[2].contains("trailing noise removed"));
}
#[test]
fn trim_short_run_of_always_noise_now_trimmed() {
let lines: Vec<String> = vec![
"[0:00] Hello world".into(),
"[0:05] [music]".into(),
"[0:10] [music]".into(),
"[0:15] [music]".into(),
];
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 2);
assert!(result[0].contains("Hello world"));
assert!(result[1].contains("trailing noise removed"));
}
#[test]
fn trim_keeps_short_trailing_filler() {
let lines: Vec<String> = vec!["[0:00] That wraps it".into(), "[0:05] yeah.".into()];
let result = trim_trailing_noise(&lines);
assert_eq!(result, lines, "single-filler closing must survive");
}
#[test]
fn trim_keeps_short_filler_before_trailing_noise() {
let lines: Vec<String> = vec![
"[0:00] That wraps it".into(),
"[0:05] yeah.".into(),
"[0:10] [music]".into(),
];
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 3);
assert!(result[0].contains("That wraps it"));
assert!(result[1].contains("yeah."));
assert!(result[2].contains("1 lines of trailing noise removed"));
assert!(!result.iter().any(|line| line.contains("[music]")));
}
#[test]
fn trim_long_run_of_filler_is_trimmed() {
let lines: Vec<String> = vec![
"[0:00] Real content".into(),
"[0:05] yeah.".into(),
"[0:10] yeah.".into(),
"[0:15] yeah.".into(),
"[0:20] yeah.".into(),
"[0:25] yeah.".into(),
];
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 2);
assert!(result[0].contains("Real content"));
assert!(result[1].contains("trailing noise removed"));
}
#[test]
fn trim_handles_empty() {
assert!(trim_trailing_noise(&[]).is_empty());
}
#[test]
fn trim_all_noise() {
let lines: Vec<String> = (0..10).map(|i| format!("[{}:00] [music]", i)).collect();
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 1);
assert!(result[0].contains("trailing noise removed"));
}
#[test]
fn script_removes_cjk_from_latin_transcript() {
let lines = vec![
"[0:00] Hello and welcome".into(),
"[0:05] Let's discuss the project".into(),
"[0:10] スパイシー".into(),
"[0:15] We should wrap up now".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result.len(), 3);
assert!(!result.iter().any(|l| l.contains("スパイシー")));
}
#[test]
fn script_preserves_pure_latin_transcript() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:05] How are you".into(),
"[0:10] I'm doing fine".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
#[test]
fn script_preserves_pure_cjk_transcript() {
let lines = vec![
"[0:00] こんにちは".into(),
"[0:05] お元気ですか".into(),
"[0:10] 元気です".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
#[test]
fn script_no_action_on_mixed_transcript() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:05] こんにちは".into(),
"[0:10] Good morning".into(),
"[0:15] お元気ですか".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
#[test]
fn script_handles_single_line() {
let lines = vec!["[0:00] スパイシー".into()];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines); }
#[test]
fn script_all_hallucinated_in_latin_majority() {
let lines = vec![
"[0:00] Today we need to discuss".into(),
"[0:05] The quarterly results".into(),
"[0:10] Are looking good".into(),
"[0:15] Revenue is up".into(),
"[0:20] Margins improved significantly".into(),
"[0:25] 東京タワー".into(),
"[0:30] 大阪城".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result.len(), 5);
assert!(result
.iter()
.all(|l| !l.contains('東') && !l.contains('大')));
}
#[test]
fn script_two_cjk_lines_preserved() {
let lines = vec!["[0:00] スパイシー".into(), "[0:05] 東京タワー".into()];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
#[test]
fn script_cyrillic_majority_strips_latin() {
let lines = vec![
"[0:00] Привет мир".into(),
"[0:05] Как дела".into(),
"[0:10] Всё хорошо".into(),
"[0:15] Hello world".into(), ];
let result = strip_foreign_script(&lines);
assert_eq!(result.len(), 3);
assert!(!result.iter().any(|l| l.contains("Hello")));
}
#[test]
fn script_classify_basic() {
assert_eq!(classify_script("Hello world"), Script::Latin);
assert_eq!(classify_script("スパイシー"), Script::Cjk);
assert_eq!(classify_script("Привет"), Script::Other);
assert_eq!(classify_script(""), Script::Unknown);
assert_eq!(classify_script("123 !@#"), Script::Unknown);
}
#[test]
fn clean_transcript_includes_script_filter() {
let input =
"[0:00] Hello world\n[0:05] Testing one two\n[0:10] スパイシー\n[0:15] All done\n";
let (cleaned, stats) = clean_transcript(input);
assert!(!cleaned.contains("スパイシー"));
assert!(stats.after_script_filter < stats.after_interleaved_dedup);
}
#[test]
fn noise_markers_collapses_polish_laughter() {
let mut lines: Vec<String> = vec!["[0:00] Cześć, jak się masz?".into()];
for i in 1..=10 {
lines.push(format!("[0:{:02}] [Śmiech]", i * 3));
}
lines.push("[0:33] Dobrze, dziękuję".into());
let result = collapse_noise_markers(&lines);
assert!(
result.len() <= 4,
"got {} lines: {:?}",
result.len(),
result
);
assert!(result[0].contains("Cześć"));
assert!(result
.iter()
.any(|l| l.contains("non-speech audio removed")));
assert!(result.last().unwrap().contains("Dobrze"));
}
#[test]
fn noise_markers_collapses_english_mixed() {
let lines = vec![
"[0:00] Good morning everyone".into(),
"[0:05] [music]".into(),
"[0:10] [laughter]".into(),
"[0:15] [applause]".into(),
"[0:20] [music]".into(),
"[0:25] Thank you for coming".into(),
];
let result = collapse_noise_markers(&lines);
assert!(
result.len() <= 4,
"got {} lines: {:?}",
result.len(),
result
);
assert!(result[0].contains("Good morning"));
assert!(result.last().unwrap().contains("Thank you"));
}
#[test]
fn noise_markers_preserves_short_runs() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:05] [laughter]".into(),
"[0:10] That was funny".into(),
];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
#[test]
fn noise_markers_handles_empty() {
assert!(collapse_noise_markers(&[]).is_empty());
}
#[test]
fn noise_markers_handles_single_line() {
let lines = vec!["[0:00] [music]".into()];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
#[test]
fn noise_markers_handles_two_lines() {
let lines = vec!["[0:00] [music]".into(), "[0:03] [laughter]".into()];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
#[test]
fn is_noise_marker_matches_parenthetical_form() {
assert!(is_noise_marker("(crying)"));
assert!(is_noise_marker("(coughing)"));
assert!(is_noise_marker("(applause)"));
assert!(is_noise_marker("(silence)"));
assert!(is_noise_marker("(crying)."));
}
#[test]
fn is_noise_marker_matches_bracketed_form() {
assert!(is_noise_marker("[music]"));
assert!(is_noise_marker("[Growling]"));
assert!(is_noise_marker("[BLANK_AUDIO]"));
assert!(is_noise_marker("[Śmiech]"));
assert!(is_noise_marker("[laughter]."));
}
#[test]
fn is_noise_marker_rejects_non_markers() {
assert!(!is_noise_marker(""));
assert!(!is_noise_marker("Hello world"));
assert!(!is_noise_marker("[0:00]"));
assert!(!is_noise_marker("[...] [repeated audio removed - 3]"));
assert!(!is_noise_marker("(crying]"));
assert!(!is_noise_marker("[crying)"));
assert!(!is_noise_marker(
"(this is way more than four words of content)"
));
}
#[test]
fn is_noise_marker_rejects_user_authored_parentheticals() {
assert!(!is_noise_marker("(see attached)"));
assert!(!is_noise_marker("(part 1)"));
assert!(!is_noise_marker("(2 of 3)"));
assert!(!is_noise_marker("(continued)"));
assert!(!is_noise_marker("(TBD)"));
assert!(!is_noise_marker("(draft)"));
assert!(!is_noise_marker("(see attached)."));
}
#[test]
fn is_noise_marker_accepts_two_word_noise_forms() {
assert!(is_noise_marker("(soft music)"));
assert!(is_noise_marker("(loud applause)"));
assert!(is_noise_marker("(gentle music)"));
assert!(is_noise_marker("(background music)"));
assert!(is_noise_marker("[soft music]"));
assert!(is_noise_marker("[loud applause]"));
}
#[test]
fn is_noise_marker_rejects_user_authored_brackets() {
assert!(!is_noise_marker("[TODO]"));
assert!(!is_noise_marker("[draft]"));
assert!(!is_noise_marker("[part 1]"));
assert!(!is_noise_marker("[see attached]"));
}
#[test]
fn is_noise_marker_dominance_check_rejects_noise_word_with_content_suffix() {
assert!(!is_noise_marker("(music director)"));
assert!(!is_noise_marker("(applause sounds great)"));
assert!(!is_noise_marker("(noise complaint)"));
assert!(!is_noise_marker("(typing speed)"));
assert!(!is_noise_marker("[crying baby]"));
assert!(!is_noise_marker("[laughter therapy]"));
}
#[test]
fn is_noise_marker_dominance_check_accepts_modifier_plus_noise_word() {
assert!(is_noise_marker("(audience laughter)"));
assert!(is_noise_marker("(soft music)"));
assert!(is_noise_marker("(loud applause)"));
assert!(is_noise_marker("(background music)"));
assert!(is_noise_marker("[audience laughter]"));
}
#[test]
fn is_noise_marker_accepts_expanded_allowlist_tokens() {
assert!(is_noise_marker("[inaudible]"));
assert!(is_noise_marker("[crosstalk]"));
assert!(is_noise_marker("[typing]"));
assert!(is_noise_marker("[noise]"));
assert!(is_noise_marker("[static]"));
assert!(is_noise_marker("[beep]"));
assert!(is_noise_marker("[ringing]"));
assert!(is_noise_marker("(inaudible)"));
assert!(is_noise_marker("(crosstalk)"));
}
#[test]
fn is_known_hallucination_matches_youtube_phrases() {
assert!(is_known_hallucination("Thank you for watching!"));
assert!(is_known_hallucination("Thank you for watching."));
assert!(is_known_hallucination("Thank you for watching"));
assert!(is_known_hallucination("THANK YOU FOR WATCHING"));
assert!(is_known_hallucination("Please subscribe to our channel."));
assert!(is_known_hallucination("Please subscribe to our channel!"));
assert!(is_known_hallucination("Like and subscribe"));
assert!(is_known_hallucination("Don't forget to subscribe."));
}
/// Amara.org credit lines, in full and in shortened form, are detected as
/// known hallucinations.
#[test]
fn is_known_hallucination_matches_amara_phrases() {
    let amara_credits = [
        "Subtitles by the Amara.org community",
        "Transcribed by the Amara.org community",
        "the Amara.org community",
        "Amara.org community",
    ];
    for &credit in &amara_credits {
        assert!(
            is_known_hallucination(credit),
            "{:?} should be flagged as a known hallucination",
            credit
        );
    }
}
/// A line consisting of a bare URL (`www.`, `https://` or `http://`) is
/// detected as a known hallucination.
#[test]
fn is_known_hallucination_matches_url_lines() {
    let bare_urls = [
        "www.transcription-exe-project.com",
        "https://amara.org",
        "http://example.com",
    ];
    for &url in &bare_urls {
        assert!(
            is_known_hallucination(url),
            "{:?} should be flagged as a known hallucination",
            url
        );
    }
}
/// Lines that open with an attribution prefix ("Transcribed by", "Captions
/// by", ...) are detected whatever follows the prefix, including when nothing
/// follows it at all.
#[test]
fn is_known_hallucination_matches_attribution_prefixes() {
    let attributions = [
        "Transcripted by: www.transcription-exe-project.com",
        "Transcripted by: www.amara.org",
        "Captioned by Acme Captions",
        "Transcribed by the Amara.org community",
        "Subtitles by the cyclope team",
        "Translated by community volunteers",
        "Captions by the Acme transcription service",
        // Bare prefixes with no trailing attribution.
        "Transcribed by",
        "Subtitles by",
    ];
    for &line in &attributions {
        assert!(
            is_known_hallucination(line),
            "{:?} should be flagged as a known hallucination",
            line
        );
    }
}
/// Genuine sentences that merely contain a hallucination phrase or a URL
/// somewhere inside them must not be flagged.
#[test]
fn is_known_hallucination_rejects_real_speech_containing_phrase() {
    let real_sentences = [
        "Thank you for watching the demo carefully",
        "I would like to subscribe to that newsletter",
        "The Amara.org community has done great work, but our use case differs",
        "Check out www.example.com for the docs we discussed",
    ];
    for &sentence in &real_sentences {
        assert!(
            !is_known_hallucination(sentence),
            "{:?} is real speech and must not be flagged",
            sentence
        );
    }
}
/// Ordinary meeting speech and the empty string are never reported as known
/// hallucinations.
#[test]
fn is_known_hallucination_rejects_normal_content() {
    let ordinary = [
        "Hello world",
        "Let's review the action items",
        "Thanks for joining the call",
        "",
    ];
    for &text in &ordinary {
        assert!(
            !is_known_hallucination(text),
            "{:?} must not be flagged as a known hallucination",
            text
        );
    }
}
/// `strip_known_hallucinations` drops the hallucinated lines (a sign-off, an
/// Amara credit and a bare URL) and keeps the three real lines in order.
#[test]
fn strip_known_hallucinations_drops_matching_lines() {
    let transcript: Vec<String> = [
        "[0:00] Real meeting content",
        "[35:00] Thank you for watching!",
        "[35:30] More real content here",
        "[36:00] Subtitles by the Amara.org community",
        "[36:30] www.transcription-exe-project.com",
        "[37:00] Closing remarks from the team",
    ]
    .iter()
    .map(|line| line.to_string())
    .collect();

    let kept = strip_known_hallucinations(&transcript);

    assert_eq!(kept.len(), 3);
    let expected_fragments = [
        "Real meeting content",
        "More real content here",
        "Closing remarks",
    ];
    for (line, &fragment) in kept.iter().zip(expected_fragments.iter()) {
        assert!(
            line.contains(fragment),
            "expected {:?} to contain {:?}",
            line,
            fragment
        );
    }
}
/// A transcript without any hallucination phrase comes back from
/// `strip_known_hallucinations` unchanged.
#[test]
fn strip_known_hallucinations_preserves_normal_transcript() {
    let transcript: Vec<String> = [
        "[0:00] Hello everyone",
        "[0:05] Let's get started",
        "[0:10] We have three things to cover today",
    ]
    .iter()
    .map(|line| line.to_string())
    .collect();

    let kept = strip_known_hallucinations(&transcript);

    assert_eq!(kept, transcript);
}
/// When two real segments are followed by a tail of hallucinated sign-offs,
/// credits and URLs, `clean_segments` reports two lines after the
/// hallucination strip and every surviving segment is real content.
#[test]
fn clean_segments_strips_long_tail_hallucinations() {
    let real = ["Real meeting content one", "Real meeting content two"];
    let hallucinated_tail = [
        "Thank you for watching!",
        "Please subscribe to our channel",
        "Subtitles by the Amara.org community",
        "www.transcription-exe-project.com",
        "Thank you for watching",
    ];
    let segments: Vec<String> = real
        .iter()
        .chain(hallucinated_tail.iter())
        .map(|segment| segment.to_string())
        .collect();

    let (cleaned, stats) = clean_segments(&segments);

    assert_eq!(stats.after_hallucination_strip, 2);
    for segment in &cleaned {
        assert!(
            segment.contains("Real meeting content"),
            "unexpected surviving segment: {:?}",
            segment
        );
    }
}
/// `is_url_line` accepts a line only when its first token is a URL; a URL
/// later in the line, the empty string and plain words are rejected.
#[test]
fn is_url_line_only_matches_url_prefix() {
    let url_first = [
        "www.example.com",
        "https://amara.org",
        "http://example.com",
        "www.example.com path",
    ];
    for &line in &url_first {
        assert!(is_url_line(line), "{:?} should count as a URL line", line);
    }

    let not_url_first = ["Check out www.example.com", "", "Hello"];
    for &line in &not_url_first {
        assert!(
            !is_url_line(line),
            "{:?} should not count as a URL line",
            line
        );
    }
}
/// A transcript made up solely of timestamped noise markers is all noise.
#[test]
fn is_all_noise_true_for_pure_noise_transcript() {
    let transcript: Vec<String> = vec![
        String::from("[0:07] (crying)"),
        String::from("[1:52] [Growling]"),
    ];

    assert!(is_all_noise(&transcript));
}
/// A single timestamped `[music]` line is enough for the transcript to be
/// reported as all noise.
#[test]
fn is_all_noise_true_for_single_noise_line() {
    let transcript: Vec<String> = vec![String::from("[0:00] [music]")];

    assert!(is_all_noise(&transcript));
}
/// One line of real speech among noise markers means the transcript is not
/// all noise.
#[test]
fn is_all_noise_false_when_any_line_is_speech() {
    let transcript: Vec<String> = ["[0:00] Hello world", "[0:05] [laughter]", "[0:10] (crying)"]
        .iter()
        .map(|line| line.to_string())
        .collect();

    assert!(!is_all_noise(&transcript));
}
/// An empty transcript is not reported as all noise.
#[test]
fn is_all_noise_false_on_empty_input() {
    let no_lines = Vec::<String>::new();

    assert!(!is_all_noise(&no_lines));
}
/// Empty and whitespace-only lines between noise markers do not stop the
/// transcript from being reported as all noise.
#[test]
fn is_all_noise_ignores_blank_lines() {
    let transcript: Vec<String> = ["", "[0:07] (crying)", " ", "[1:52] [Growling]"]
        .iter()
        .map(|line| line.to_string())
        .collect();

    assert!(is_all_noise(&transcript));
}
/// For a two-line, noise-only input `clean_segments` returns the lines
/// unchanged and sets `all_noise` in the stats.
#[test]
fn clean_stats_all_noise_true_for_short_noise_only_input() {
    let noise_only: Vec<String> = vec![
        String::from("[0:07] (crying)"),
        String::from("[1:52] [Growling]"),
    ];

    let (cleaned, stats) = clean_segments(&noise_only);

    assert_eq!(cleaned, noise_only);
    assert!(stats.all_noise, "stats: {:?}", stats);
}
/// When real speech surrounds a noise marker, `all_noise` stays unset in the
/// stats returned by `clean_segments`.
#[test]
fn clean_stats_all_noise_false_with_real_content() {
    let mixed: Vec<String> = ["[0:00] Hello world", "[0:05] (crying)", "[0:10] Goodbye"]
        .iter()
        .map(|line| line.to_string())
        .collect();

    let stats = clean_segments(&mixed).1;

    assert!(!stats.all_noise, "stats: {:?}", stats);
}
/// Bracketed timestamps in front of ordinary speech are not mistaken for
/// noise markers: the lines come back unchanged.
#[test]
fn noise_markers_ignores_timestamps() {
    let transcript: Vec<String> = ["[0:00] Hello", "[0:05] World", "[0:10] Test"]
        .iter()
        .map(|line| line.to_string())
        .collect();

    let collapsed = collapse_noise_markers(&transcript);

    assert_eq!(collapsed, transcript);
}
/// `[...]` summary lines about previously collapsed audio are not treated as
/// noise markers, so the transcript passes through unchanged.
#[test]
fn noise_markers_ignores_collapse_markers() {
    let transcript: Vec<String> = [
        "[0:00] Hello world",
        "[...] [repeated audio removed - 5 identical segments collapsed]",
        "[0:30] Something else",
        "[...] [hallucinated repetition removed - 10 lines collapsed]",
        "[1:00] Final line",
    ]
    .iter()
    .map(|line| line.to_string())
    .collect();

    let collapsed = collapse_noise_markers(&transcript);

    assert_eq!(collapsed, transcript);
}
/// A run of four French `[rires]` markers followed by three German `[Musik]`
/// markers is collapsed to at most five output lines, while the real speech
/// at both ends survives.
#[test]
fn noise_markers_multilingual_markers() {
    let laughter = (1..=4).map(|i| format!("[0:{:02}] [rires]", i * 3));
    let music = (5..=7).map(|i| format!("[0:{:02}] [Musik]", i * 3));
    let lines: Vec<String> = std::iter::once(String::from("[0:00] Bonjour"))
        .chain(laughter)
        .chain(music)
        .chain(std::iter::once(String::from("[0:30] Au revoir")))
        .collect();

    let result = collapse_noise_markers(&lines);

    assert!(
        result.len() <= 5,
        "got {} lines: {:?}",
        result.len(),
        result
    );
    let opening = &result[0];
    let closing = result.last().unwrap();
    assert!(opening.contains("Bonjour"));
    assert!(closing.contains("Au revoir"));
}
/// When Polish noise markers are scattered densely between real lines, all
/// five content lines survive and the output contains a
/// "non-speech markers removed" summary.
#[test]
fn noise_markers_scattered_high_density() {
    let transcript: Vec<String> = [
        "[0:00] Real content one",
        "[0:03] [Śmiech]",
        "[0:06] [muzyka]",
        "[0:09] Real content two",
        "[0:12] [cisza]",
        "[0:15] [oklaski]",
        "[0:18] Real content three",
        "[0:21] [Śmiech]",
        "[0:24] [muzyka]",
        "[0:27] Real content four",
        "[0:30] [cisza]",
        "[0:33] [oklaski]",
        "[0:36] Real content five",
        "[0:39] [Śmiech]",
        "[0:42] [muzyka]",
    ]
    .iter()
    .map(|line| line.to_string())
    .collect();

    let collapsed = collapse_noise_markers(&transcript);

    let mut surviving_content = 0;
    for line in &collapsed {
        if line.contains("Real content") {
            surviving_content += 1;
        }
    }
    assert_eq!(surviving_content, 5, "all content lines preserved");

    let has_removal_summary = collapsed
        .iter()
        .any(|line| line.contains("non-speech markers removed"));
    assert!(
        has_removal_summary,
        "expected pass 2 removal summary, got: {:?}",
        collapsed
    );
}
/// With isolated noise markers alternating one-to-one with real lines, the
/// transcript is returned unchanged.
#[test]
fn noise_markers_below_threshold_kept() {
    let transcript: Vec<String> = [
        "[0:00] Real content one",
        "[0:03] [laughter]",
        "[0:06] Real content two",
        "[0:09] [applause]",
        "[0:12] Real content three",
        "[0:15] [laughter]",
        "[0:18] Real content four",
        "[0:21] [music]",
        "[0:24] Real content five",
        "[0:27] [laughter]",
    ]
    .iter()
    .map(|line| line.to_string())
    .collect();

    let collapsed = collapse_noise_markers(&transcript);

    assert_eq!(collapsed, transcript);
}
/// Six consecutive `[BLANK_AUDIO]` lines between two real lines are collapsed
/// to at most four output lines, one of which is a
/// "non-speech audio removed" summary.
#[test]
fn noise_markers_handles_blank_audio() {
    let mut transcript = vec![String::from("[0:00] Some content")];
    transcript.extend((1..=6).map(|i| format!("[0:{:02}] [BLANK_AUDIO]", i * 5)));
    transcript.push(String::from("[0:35] More content"));

    let collapsed = collapse_noise_markers(&transcript);

    assert!(collapsed.len() <= 4);
    let has_summary = collapsed
        .iter()
        .any(|line| line.contains("non-speech audio removed"));
    assert!(has_summary);
}
/// `clean_transcript` applies the noise-marker pass: the line count after
/// that pass is below the count after the script filter, and the real speech
/// at both ends of the transcript is still present.
#[test]
fn clean_transcript_includes_noise_markers() {
    let input = concat!(
        "[0:00] Hello world\n",
        "[0:03] [Śmiech]\n",
        "[0:06] [muzyka]\n",
        "[0:09] [cisza]\n",
        "[0:12] [oklaski]\n",
        "[0:15] [Śmiech]\n",
        "[0:18] [muzyka]\n",
        "[0:21] [cisza]\n",
        "[0:24] Goodbye\n",
    );

    let (cleaned, stats) = clean_transcript(input);

    assert!(
        stats.after_noise_markers < stats.after_script_filter,
        "noise markers: {}, script filter: {}",
        stats.after_noise_markers,
        stats.after_script_filter
    );
    for &spoken in &["Hello world", "Goodbye"] {
        assert!(
            cleaned.contains(spoken),
            "{:?} is missing from the cleaned transcript",
            spoken
        );
    }
}
/// A trailing "Stop recording." line is removed, leaving the two preceding
/// lines with "wrap up" as the new last line.
#[test]
fn strip_command_removes_stop_recording() {
    let transcript: Vec<String> = [
        "[0:00] Great meeting everyone",
        "[0:05] Let's wrap up",
        "[0:10] Stop recording.",
    ]
    .iter()
    .map(|line| line.to_string())
    .collect();

    let kept = strip_trailing_commands(&transcript);

    assert_eq!(kept.len(), 2);
    let new_last = &kept[1];
    assert!(new_last.contains("wrap up"));
}
/// A timestamped "Stop recording" line without trailing punctuation is
/// removed, leaving only the first line.
#[test]
fn strip_command_removes_with_timestamp() {
    let transcript: Vec<String> = vec![
        String::from("[0:00] First point"),
        String::from("[0:30] Stop recording"),
    ];

    let kept = strip_trailing_commands(&transcript);

    assert_eq!(kept.len(), 1);
    let only_line = &kept[0];
    assert!(only_line.contains("First point"));
}
/// "End recording" is also removed when it is the trailing line.
#[test]
fn strip_command_removes_end_recording() {
    let transcript: Vec<String> = vec![
        String::from("[0:00] Discussion content"),
        String::from("[0:10] End recording"),
    ];

    let kept = strip_trailing_commands(&transcript);

    assert_eq!(kept.len(), 1);
}
/// Two command lines in a row at the end ("Okay stop." then
/// "Stop recording.") are both removed.
#[test]
fn strip_command_removes_two_trailing_commands() {
    let transcript: Vec<String> = ["[0:00] Content", "[0:10] Okay stop.", "[0:12] Stop recording."]
        .iter()
        .map(|line| line.to_string())
        .collect();

    let kept = strip_trailing_commands(&transcript);

    assert_eq!(kept.len(), 1);
    let only_line = &kept[0];
    assert!(only_line.contains("Content"));
}
/// Sentences that only contain the words "stop recording" as part of real
/// speech are kept.
#[test]
fn strip_command_preserves_non_command_lines() {
    let transcript: Vec<String> = vec![
        String::from("[0:00] We need to stop recording expenses"),
        String::from("[0:05] The stop recording policy is important"),
    ];

    let kept = strip_trailing_commands(&transcript);

    assert_eq!(kept.len(), 2, "non-command lines should be preserved");
}
/// An empty slice yields an empty result.
#[test]
fn strip_command_handles_empty() {
    assert!(strip_trailing_commands(&[]).is_empty());
}
/// An upper-case "STOP RECORDING" trailing line is removed as well.
#[test]
fn strip_command_case_insensitive() {
    let transcript: Vec<String> = vec![
        String::from("[0:00] Meeting notes"),
        String::from("[0:05] STOP RECORDING"),
    ];

    let kept = strip_trailing_commands(&transcript);

    assert_eq!(kept.len(), 1);
}
/// Through `clean_transcript`, the trailing "Stop recording." line is
/// removed, the action item before it survives, and the stats report exactly
/// one removed line.
#[test]
fn clean_transcript_strips_trailing_command() {
    let input = concat!(
        "[0:00] Important discussion\n",
        "[0:05] Action item for Bob\n",
        "[0:10] Stop recording.\n",
    );

    let (cleaned, stats) = clean_transcript(input);

    let command_survived = cleaned.contains("Stop recording");
    assert!(!command_survived);
    assert!(cleaned.contains("Action item for Bob"));
    assert!(stats.after_command_strip <= stats.after_trailing_trim);
    assert_eq!(stats.lines_removed, 1);
}
/// Cleaning an empty slice produces no output, with both `original_lines`
/// and `lines_removed` at zero.
#[test]
fn clean_segments_handles_empty() {
    let (cleaned, stats) = clean_segments(&[]);

    assert!(cleaned.is_empty());
    assert_eq!((stats.original_lines, stats.lines_removed), (0, 0));
}
/// Input with nothing to clean is returned unchanged: no lines removed and
/// all three lines still counted after the command strip.
#[test]
fn clean_segments_passes_through_clean_input() {
    let spoken: Vec<String> = [
        "Welcome to the meeting.",
        "Let's discuss Q3 numbers.",
        "Revenue is up twelve percent.",
    ]
    .iter()
    .map(|line| line.to_string())
    .collect();

    let (cleaned, stats) = clean_segments(&spoken);

    assert_eq!(cleaned, spoken, "clean input should be untouched");
    assert_eq!(stats.lines_removed, 0);
    assert_eq!(stats.after_command_strip, 3);
}
/// Four identical "Thank you." segments are collapsed: the real question
/// survives, at least two lines are removed, and a "repeated audio removed"
/// annotation appears in the output.
#[test]
fn clean_segments_dedups_repeated_hallucination() {
    let segments: Vec<String> = std::iter::repeat("Thank you.")
        .take(4)
        .chain(std::iter::once("What's the budget for Q3?"))
        .map(String::from)
        .collect();

    let (cleaned, stats) = clean_segments(&segments);

    let question_kept = cleaned.iter().any(|line| line.contains("budget"));
    let annotation_present = cleaned
        .iter()
        .any(|line| line.contains("repeated audio removed"));
    assert!(question_kept);
    assert!(stats.lines_removed >= 2);
    assert!(annotation_present);
}
/// Running `clean_segments` on its own output changes nothing and removes no
/// further lines.
#[test]
fn clean_segments_is_idempotent() {
    let segments: Vec<String> = std::iter::once("Real content.")
        .chain(std::iter::repeat("Thank you.").take(4))
        .chain(std::iter::once("More real content."))
        .map(String::from)
        .collect();

    let first = clean_segments(&segments).0;
    let (second, second_stats) = clean_segments(&first);

    assert_eq!(first, second, "second pass should be a no-op");
    assert_eq!(second_stats.lines_removed, 0);
}
/// With `dedup_consecutive` switched off, four identical segments are all
/// kept.
#[test]
fn clean_segments_with_options_respects_disabled_passes() {
    let repeated: Vec<String> = vec![String::from("Hello."); 4];
    let no_dedup = CleanOptions {
        dedup_consecutive: false,
        ..CleanOptions::default()
    };

    let cleaned = clean_segments_with_options(&repeated, &no_dedup).0;

    assert_eq!(cleaned.len(), repeated.len(), "dedup disabled → no removal");
}
/// `CleanOptions::none()` leaves the input untouched even though it contains
/// repeated segments and a trailing "Stop recording." line.
#[test]
fn clean_options_none_runs_no_passes() {
    let segments: Vec<String> = std::iter::repeat("Thank you.")
        .take(4)
        .chain(std::iter::once("Stop recording."))
        .map(String::from)
        .collect();
    let no_passes = CleanOptions::none();

    let (cleaned, stats) = clean_segments_with_options(&segments, &no_passes);

    assert_eq!(cleaned, segments, "no passes → no changes");
    assert_eq!(stats.lines_removed, 0);
}
/// `clean_segments` and `clean_segments_with_options` with
/// `CleanOptions::all()` produce the same output and the same stats.
#[test]
fn clean_options_all_matches_default() {
    let segments: Vec<String> = std::iter::once("Real meeting content.")
        .chain(std::iter::repeat("Thank you.").take(3))
        .chain(std::iter::once("[music]"))
        .map(String::from)
        .collect();

    let via_default = clean_segments(&segments);
    let via_all = clean_segments_with_options(&segments, &CleanOptions::all());

    assert_eq!(via_default.0, via_all.0);
    assert_eq!(via_default.1, via_all.1);
}
/// Raw segments with a leading space and no timestamp prefix are still
/// deduplicated: at least two lines are removed and the real question
/// survives.
#[test]
fn clean_segments_works_on_raw_segments_without_timestamps() {
    let raw_segments: Vec<String> = std::iter::repeat(" Thank you.")
        .take(4)
        .chain(std::iter::once(" So what's our action plan?"))
        .map(String::from)
        .collect();

    let (cleaned, stats) = clean_segments(&raw_segments);

    assert!(stats.lines_removed >= 2);
    let question_kept = cleaned.iter().any(|line| line.contains("action plan"));
    assert!(question_kept);
}
/// The string returned by `clean_transcript` equals the output of
/// `clean_segments` on the same lines, joined with `\n`.
#[test]
fn clean_transcript_delegates_to_clean_segments() {
    let raw = "Thank you.\nThank you.\nThank you.\nReal content.";
    let segments = raw
        .lines()
        .map(|line| line.to_owned())
        .collect::<Vec<String>>();

    let transcript_out = clean_transcript(raw).0;
    let segments_out = clean_segments(&segments).0;

    assert_eq!(transcript_out, segments_out.join("\n"));
}
/// The stats summary contains the `whisper-guard:` tag and the digit `4`
/// (presumably the original line count of this four-line input; confirm
/// against `summary()`).
#[test]
fn clean_stats_summary_is_human_readable() {
    let segments: Vec<String> = std::iter::repeat("Hello.")
        .take(3)
        .chain(std::iter::once("World."))
        .map(String::from)
        .collect();

    let stats = clean_segments(&segments).1;
    let summary = stats.summary();

    for &needle in &["whisper-guard:", "4"] {
        assert!(
            summary.contains(needle),
            "summary {:?} should contain {:?}",
            summary,
            needle
        );
    }
}
/// Ten thousand identical segments are processed without panicking and are
/// collapsed to fewer than ten output lines.
#[test]
fn clean_segments_with_huge_input_does_not_panic() {
    let flood: Vec<String> = vec![String::from("Thank you."); 10_000];

    let (cleaned, stats) = clean_segments(&flood);

    assert_eq!(stats.original_lines, 10_000);
    assert!(cleaned.len() < 10);
}
/// Latin text with accented and other non-ASCII letters is not filtered out:
/// both segments survive cleaning.
#[test]
fn clean_segments_handles_unicode_correctly() {
    let accented: Vec<String> = vec![
        String::from("Café meeting at 9am with Søren and José"),
        String::from("Discussed naïve Bayes models"),
    ];

    let cleaned = clean_segments(&accented).0;

    assert_eq!(cleaned.len(), 2, "unicode-in-Latin should not be filtered");
}
/// With default options, the output still contains a line that starts with
/// `DEDUP_ANNOTATION_PREFIX` after repeated segments were collapsed.
#[test]
fn keep_dedup_annotations_default_true_preserves_marker() {
    let segments: Vec<String> = std::iter::repeat("Thank you.")
        .take(4)
        .chain(std::iter::once("Real content."))
        .map(String::from)
        .collect();

    let cleaned = clean_segments(&segments).0;

    let annotation_present = cleaned
        .iter()
        .any(|line| line.starts_with(DEDUP_ANNOTATION_PREFIX));
    assert!(
        annotation_present,
        "default behavior should preserve the annotation line"
    );
}
/// With `keep_dedup_annotations` switched off, no output line starts with
/// `DEDUP_ANNOTATION_PREFIX`; two lines remain and three are counted as
/// removed.
#[test]
fn keep_dedup_annotations_false_strips_marker() {
    let segments: Vec<String> = std::iter::repeat("Thank you.")
        .take(4)
        .chain(std::iter::once("Real content."))
        .map(String::from)
        .collect();
    let without_annotations = CleanOptions {
        keep_dedup_annotations: false,
        ..CleanOptions::default()
    };

    let (cleaned, stats) = clean_segments_with_options(&segments, &without_annotations);

    let annotation_free = cleaned
        .iter()
        .all(|line| !line.starts_with(DEDUP_ANNOTATION_PREFIX));
    assert!(annotation_free, "annotation should be removed");
    assert_eq!(cleaned.len(), 2);
    assert_eq!(stats.lines_removed, 3);
}
/// Switching `keep_dedup_annotations` off must not remove ordinary lines that
/// happen to start with bracketed text such as `[NAME]`.
#[test]
fn keep_dedup_annotations_does_not_strip_other_bracket_content() {
    let segments: Vec<String> = std::iter::repeat("Thank you.")
        .take(3)
        .chain(std::iter::once("[NAME] said the deal closed."))
        .map(String::from)
        .collect();
    let without_annotations = CleanOptions {
        keep_dedup_annotations: false,
        ..CleanOptions::default()
    };

    let cleaned = clean_segments_with_options(&segments, &without_annotations).0;

    let placeholder_kept = cleaned.iter().any(|line| line.contains("[NAME]"));
    assert!(
        placeholder_kept,
        "non-annotation bracket content must survive"
    );
}
}