/// Returns the text portion of a transcript line, without its leading
/// bracketed prefix.
///
/// Lines handled by this module may start with a bracketed prefix followed by
/// a space, e.g. `[0:05] Hello` or `[...] [repeated audio removed ...]`. When
/// the line (ignoring leading whitespace) starts with `[`, everything up to
/// and including the first `"] "` is dropped. Any other line is returned
/// unchanged.
///
/// Only a *leading* bracket is treated as a prefix. A bracket in the middle of
/// an un-prefixed segment (`"The band played [applause] music"`) must not
/// cause the words before it to be discarded, otherwise the remainder
/// (`"music"`) would be compared and classified instead of the real line.
fn text_part(line: &str) -> &str {
    if !line.trim_start().starts_with('[') {
        return line;
    }
    // "] " is two bytes long, so `idx + 2` is always a valid char boundary.
    line.find("] ").map_or(line, |idx| &line[idx + 2..])
}
/// Reports whether `text` is one of the fixed non-speech markers.
///
/// The comparison ignores surrounding whitespace and letter case.
fn is_always_noise(text: &str) -> bool {
    const MARKERS: [&str; 4] = ["[music]", "[blank_audio]", "[silence]", "music"];
    let normalized = text.trim().to_lowercase();
    MARKERS.contains(&normalized.as_str())
}
/// Line counts captured while a transcript moves through the cleaning passes.
///
/// Each `after_*` field is the number of lines present once the corresponding
/// pass has run (or been skipped) in `clean_segments_with_options`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[non_exhaustive]
pub struct CleanStats {
    /// Number of lines handed to the cleaner.
    pub original_lines: usize,
    /// Line count after the consecutive-repetition pass.
    pub after_consecutive_dedup: usize,
    /// Line count after the interleaved-repetition pass.
    pub after_interleaved_dedup: usize,
    /// Line count after the foreign-script filter.
    pub after_script_filter: usize,
    /// Line count after the noise-marker pass (the last pass to run).
    pub after_noise_markers: usize,
    /// Line count after trailing noise was trimmed.
    pub after_trailing_trim: usize,
    /// Line count after trailing voice commands were stripped.
    pub after_command_strip: usize,
    /// `original_lines` minus the final line count, saturating at zero.
    pub lines_removed: usize,
}

impl CleanStats {
    /// Formats a one-line report: original count, final count, lines removed.
    pub fn summary(&self) -> String {
        let Self {
            original_lines,
            after_noise_markers,
            lines_removed,
            ..
        } = self;
        format!(
            "whisper-guard: {} → {} segments ({} removed)",
            original_lines, after_noise_markers, lines_removed,
        )
    }
}
/// Switches selecting which cleaning passes `clean_segments_with_options` runs.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CleanOptions {
    /// Run `dedup_segments`.
    pub dedup_consecutive: bool,
    /// Run `dedup_interleaved`.
    pub dedup_interleaved: bool,
    /// Run `strip_foreign_script`.
    pub strip_foreign_script: bool,
    /// Run `collapse_noise_markers`.
    pub collapse_noise_markers: bool,
    /// Run `trim_trailing_noise`.
    pub trim_trailing_noise: bool,
    /// Run `strip_trailing_commands`.
    pub strip_trailing_commands: bool,
    /// Keep the annotation lines that `dedup_segments` inserts; when `false`
    /// they are dropped right after that pass.
    pub keep_dedup_annotations: bool,
}

impl Default for CleanOptions {
    /// Every pass enabled, dedup annotations kept.
    fn default() -> Self {
        Self::with_passes(true)
    }
}

impl CleanOptions {
    /// Builds options with every pass switch set to `enabled`.
    /// `keep_dedup_annotations` is always `true`.
    fn with_passes(enabled: bool) -> Self {
        Self {
            keep_dedup_annotations: true,
            dedup_consecutive: enabled,
            dedup_interleaved: enabled,
            strip_foreign_script: enabled,
            collapse_noise_markers: enabled,
            trim_trailing_noise: enabled,
            strip_trailing_commands: enabled,
        }
    }

    /// Same as [`Default::default`]: every pass enabled.
    pub fn all() -> Self {
        Self::default()
    }

    /// Every pass disabled; `keep_dedup_annotations` stays `true`.
    pub fn none() -> Self {
        Self::with_passes(false)
    }
}
/// Leading text of the annotation line that `dedup_segments` pushes after a
/// collapsed run. `clean_segments_with_options` matches on this prefix to drop
/// those lines when `keep_dedup_annotations` is `false`.
const DEDUP_ANNOTATION_PREFIX: &str = "[...] [repeated audio removed";
/// Cleans `segments` using [`CleanOptions::default`].
///
/// Returns the cleaned lines together with the per-pass line counts.
pub fn clean_segments(segments: &[String]) -> (Vec<String>, CleanStats) {
    let defaults = CleanOptions::default();
    clean_segments_with_options(segments, &defaults)
}
pub fn clean_segments_with_options(
segments: &[String],
opts: &CleanOptions,
) -> (Vec<String>, CleanStats) {
let original_count = segments.len();
let mut lines: Vec<String> = segments.to_vec();
if opts.dedup_consecutive {
lines = dedup_segments(&lines);
if !opts.keep_dedup_annotations {
lines.retain(|s| !s.starts_with(DEDUP_ANNOTATION_PREFIX));
}
}
let after_consecutive = lines.len();
if opts.dedup_interleaved {
lines = dedup_interleaved(&lines);
}
let after_interleaved = lines.len();
if opts.strip_foreign_script {
lines = strip_foreign_script(&lines);
}
let after_script = lines.len();
if opts.strip_trailing_commands {
lines = strip_trailing_commands(&lines);
}
let after_command = lines.len();
if opts.trim_trailing_noise {
lines = trim_trailing_noise(&lines);
}
let after_trim = lines.len();
if opts.collapse_noise_markers {
lines = collapse_noise_markers(&lines);
}
let after_noise = lines.len();
let stats = CleanStats {
original_lines: original_count,
after_consecutive_dedup: after_consecutive,
after_interleaved_dedup: after_interleaved,
after_script_filter: after_script,
after_noise_markers: after_noise,
after_trailing_trim: after_trim,
after_command_strip: after_command,
lines_removed: original_count.saturating_sub(after_noise),
};
(lines, stats)
}
/// Cleans a newline-separated transcript with the default options.
///
/// The text is split with [`str::lines`], passed through `clean_segments`,
/// and the surviving lines are joined back together with `"\n"`.
pub fn clean_transcript(transcript: &str) -> (String, CleanStats) {
    let segments: Vec<String> = transcript.lines().map(String::from).collect();
    let (kept, stats) = clean_segments(&segments);
    let joined = kept.join("\n");
    (joined, stats)
}
/// Collapses runs of three or more consecutive, near-identical lines.
///
/// Starting from each line, following lines are added to the run while their
/// text (see `text_part`) scores at least `0.8` against the *first* line of
/// the run. A run of three or more is replaced by its first line plus one
/// annotation line starting with `DEDUP_ANNOTATION_PREFIX`; shorter runs are
/// copied through unchanged. Lines recognised by `is_always_noise` never
/// start a run. Inputs with fewer than three lines are returned as-is.
pub fn dedup_segments(lines: &[String]) -> Vec<String> {
    if lines.len() < 3 {
        return lines.to_vec();
    }

    /// Case-insensitive similarity in `[0.0, 1.0]`.
    ///
    /// * either side empty -> `0.0`
    /// * equal after lowercasing -> `1.0`
    /// * one contains the other -> byte-length ratio short/long
    /// * otherwise -> shared words / word count of the longer side
    fn similarity(a: &str, b: &str) -> f64 {
        if a.is_empty() || b.is_empty() {
            return 0.0;
        }
        let a_lower = a.to_lowercase();
        let b_lower = b.to_lowercase();
        if a_lower == b_lower {
            return 1.0;
        }
        let (short, long) = if a_lower.len() <= b_lower.len() {
            (&a_lower, &b_lower)
        } else {
            (&b_lower, &a_lower)
        };
        if long.contains(short.as_str()) {
            return short.len() as f64 / long.len() as f64;
        }

        let a_words: Vec<&str> = a_lower.split_whitespace().collect();
        let mut unmatched: Vec<&str> = b_lower.split_whitespace().collect();
        let total = a_words.len().max(unmatched.len());
        if total == 0 {
            return 0.0;
        }
        // Each word of `b` may be consumed by at most one word of `a`.
        // Otherwise a line that repeats a single shared word ("no no no no"
        // vs "no way") would score 1.0 and distinct lines would be collapsed.
        let mut matching = 0usize;
        for word in &a_words {
            if let Some(pos) = unmatched.iter().position(|candidate| candidate == word) {
                unmatched.swap_remove(pos);
                matching += 1;
            }
        }
        matching as f64 / total as f64
    }

    let mut result = Vec::with_capacity(lines.len());
    let mut i = 0;
    while i < lines.len() {
        let base_text = text_part(&lines[i]);
        if is_always_noise(base_text) {
            result.push(lines[i].clone());
            i += 1;
            continue;
        }

        // Extend the run while candidates stay similar to the run's first line.
        let mut run_end = i + 1;
        while run_end < lines.len()
            && similarity(base_text, text_part(&lines[run_end])) >= 0.8
        {
            run_end += 1;
        }

        let run_len = run_end - i;
        if run_len >= 3 {
            tracing::debug!(
                first_segment = i,
                repeated_count = run_len,
                text = base_text,
                "detected repetition loop in whisper output - collapsing {} segments",
                run_len
            );
            result.push(lines[i].clone());
            result.push(format!(
                "{} - {} identical segments collapsed]",
                DEDUP_ANNOTATION_PREFIX,
                run_len - 1
            ));
            i = run_end;
        } else {
            // Only the current line is emitted; the next line starts its own run.
            result.push(lines[i].clone());
            i += 1;
        }
    }
    result
}
/// Removes regions where one phrase keeps recurring, possibly interleaved
/// with filler words or a few other lines.
///
/// A 10-line window slides over the input. If some normalized, non-filler
/// phrase occurs at least 5 times inside the window, the region is extended
/// forward over further occurrences, filler lines, and gaps of up to two other
/// lines that are followed by the phrase again. When the final region holds at
/// least 5 occurrences and 6 lines, everything in it except the first
/// occurrence is dropped and each removed run is replaced by one annotation
/// line. Inputs shorter than 6 lines are returned unchanged.
pub fn dedup_interleaved(lines: &[String]) -> Vec<String> {
    const WINDOW: usize = 10;
    const MIN_HITS: usize = 5;

    /// Lowercases, keeps only alphanumerics/whitespace, and squeezes whitespace.
    fn normalize(text: &str) -> String {
        let kept: String = text
            .to_lowercase()
            .chars()
            .filter(|c| c.is_alphanumeric() || c.is_whitespace())
            .collect();
        kept.split_whitespace().collect::<Vec<_>>().join(" ")
    }

    /// True for single-word acknowledgements that may sit between repetitions.
    fn is_filler(text: &str) -> bool {
        const FILLERS: [&str; 13] = [
            "okay", "ok", "yeah", "yes", "right", "so", "and", "but", "well", "uh", "um", "hmm",
            "mhm",
        ];
        let lowered = text.trim().to_lowercase();
        let core = lowered.trim_matches(|c: char| !c.is_alphanumeric());
        FILLERS.contains(&core)
    }

    let total = lines.len();
    if total < 6 {
        return lines.to_vec();
    }

    let texts: Vec<String> = lines.iter().map(|l| normalize(text_part(l))).collect();
    let fillers: Vec<bool> = texts.iter().map(|t| is_filler(t)).collect();
    let mut remove = vec![false; total];

    let mut start = 0;
    while start + WINDOW <= total {
        // Count every non-filler, non-empty phrase inside the window.
        let mut counts: std::collections::BTreeMap<&str, usize> =
            std::collections::BTreeMap::new();
        for j in start..start + WINDOW {
            if !fillers[j] && !texts[j].is_empty() {
                *counts.entry(texts[j].as_str()).or_insert(0) += 1;
            }
        }

        // Most frequent phrase; ties go to the lexicographically greatest.
        let dominant = counts
            .iter()
            .max_by(|(phrase_a, count_a), (phrase_b, count_b)| {
                count_a.cmp(count_b).then_with(|| phrase_a.cmp(phrase_b))
            })
            .filter(|(_, count)| **count >= MIN_HITS)
            .map(|(phrase, _)| phrase.to_string());

        let phrase = match dominant {
            Some(found) => found,
            None => {
                start += 1;
                continue;
            }
        };

        // Grow the region past the window while the loop keeps going.
        let mut region_end = start + WINDOW;
        while region_end < total {
            if texts[region_end] == phrase || fillers[region_end] {
                region_end += 1;
                continue;
            }
            // Look at most three lines ahead for the phrase to resume.
            let lookahead = &texts[region_end..total.min(region_end + 3)];
            match lookahead.iter().position(|t| *t == phrase) {
                Some(gap) if gap <= 2 => region_end += gap + 1,
                _ => break,
            }
        }

        let region_len = region_end - start;
        let actual_count = (start..region_end)
            .filter(|&j| texts[j] == phrase)
            .count();
        if actual_count < MIN_HITS || region_len < 6 {
            start += 1;
            continue;
        }

        tracing::debug!(
            region_start = start,
            region_end = region_end,
            occurrences = actual_count,
            filler_count = (start..region_end).filter(|&j| fillers[j]).count(),
            phrase = phrase,
            "detected interleaved hallucination loop - marking {} lines for removal",
            region_len
        );

        // Keep only the first occurrence of the phrase inside the region.
        let keeper = (start..region_end).find(|&j| texts[j] == phrase);
        for j in start..region_end {
            if Some(j) != keeper {
                remove[j] = true;
            }
        }
        start = region_end;
    }

    let removed_count = remove.iter().filter(|&&flag| flag).count();
    if removed_count == 0 {
        return lines.to_vec();
    }

    let mut result = Vec::with_capacity(total - removed_count + 1);
    let mut idx = 0;
    while idx < total {
        if remove[idx] {
            // One annotation stands in for the whole contiguous removed run.
            let run_len = remove[idx..].iter().take_while(|&&flag| flag).count();
            result.push(format!(
                "[...] [hallucinated repetition removed - {} lines collapsed]",
                run_len
            ));
            idx += run_len;
        } else {
            result.push(lines[idx].clone());
            idx += 1;
        }
    }

    tracing::info!(
        original = lines.len(),
        removed = removed_count,
        remaining = result.len(),
        "interleaved dedup complete"
    );
    result
}
/// Collapses or strips bracketed non-speech markers such as `[music]`.
///
/// Pass 1: every run of three or more consecutive marker lines is replaced by
/// its first line plus one `[...] [non-speech audio removed - N markers
/// collapsed]` annotation. Shorter runs are kept.
///
/// Pass 2: if, after pass 1, markers still make up at least 66% of the lines,
/// there are at least 8 of them, and at least one non-marker line exists, all
/// remaining markers are dropped and a single summary line is appended.
///
/// Inputs with fewer than three lines are returned unchanged.
pub fn collapse_noise_markers(lines: &[String]) -> Vec<String> {
    if lines.len() < 3 {
        return lines.to_vec();
    }

    /// True when `text` looks like a short bracketed marker: `[` ... `]`
    /// (optionally followed by `.`), 1 to 4 words, at most 40 characters
    /// inside, not made only of digits and colons, and not starting with
    /// `[...]`.
    fn is_noise_marker(text: &str) -> bool {
        let t = text.trim();
        if t.is_empty() || t.starts_with("[...]") {
            return false;
        }
        let t = t.strip_suffix('.').unwrap_or(t);
        if !(t.starts_with('[') && t.ends_with(']')) {
            return false;
        }
        // Both delimiters are single-byte ASCII, so this slice is on char boundaries.
        let inner = &t[1..t.len() - 1];
        if inner.chars().all(|c| c.is_ascii_digit() || c == ':') {
            return false;
        }
        let word_count = inner.split_whitespace().count();
        // Measure in characters, not bytes: markers come in many languages and
        // a byte limit would reject short markers written in multi-byte scripts.
        (1..=4).contains(&word_count) && inner.chars().count() <= 40
    }

    let markers: Vec<bool> = lines
        .iter()
        .map(|l| is_noise_marker(text_part(l)))
        .collect();

    // Pass 1: collapse long consecutive runs.
    let mut result = Vec::with_capacity(lines.len());
    let mut i = 0;
    while i < lines.len() {
        if !markers[i] {
            result.push(lines[i].clone());
            i += 1;
            continue;
        }
        let run_start = i;
        while i < lines.len() && markers[i] {
            i += 1;
        }
        let run_len = i - run_start;
        if run_len >= 3 {
            result.push(lines[run_start].clone());
            result.push(format!(
                "[...] [non-speech audio removed - {} markers collapsed]",
                run_len - 1
            ));
            tracing::debug!(
                run_start = run_start,
                collapsed = run_len - 1,
                sample = text_part(&lines[run_start]),
                "collapsed consecutive noise markers"
            );
        } else {
            result.extend_from_slice(&lines[run_start..i]);
        }
    }

    // Pass 2: strip scattered markers when they dominate what is left.
    let remaining_markers = result
        .iter()
        .filter(|l| is_noise_marker(text_part(l)))
        .count();
    let content_lines = result.len().saturating_sub(remaining_markers);
    if remaining_markers > 0 && content_lines > 0 {
        let ratio = remaining_markers as f64 / result.len() as f64;
        if ratio >= 0.66 && remaining_markers >= 8 {
            tracing::info!(
                markers = remaining_markers,
                total = result.len(),
                ratio = format!("{:.0}%", ratio * 100.0),
                "high noise marker density - stripping scattered markers"
            );
            let mut stripped = Vec::with_capacity(content_lines + 1);
            let mut removed = 0usize;
            for line in &result {
                if is_noise_marker(text_part(line)) {
                    removed += 1;
                } else {
                    stripped.push(line.clone());
                }
            }
            stripped.push(format!(
                "[{} scattered non-speech markers removed]",
                removed
            ));
            return stripped;
        }
    }

    // A collapsed run always shrinks (>= 3 lines become 2), so this cannot underflow.
    let removed = lines.len() - result.len();
    if removed > 0 {
        tracing::info!(
            original = lines.len(),
            removed = removed,
            "collapsed noise markers"
        );
    }
    result
}
/// Drops lines written in a different script than the transcript's majority.
///
/// Every line is classified with `classify_script`. If one of Latin / CJK /
/// Other accounts for at least 70% of the classifiable lines, lines of any
/// other classified script are removed; `Script::Unknown` lines (no
/// alphabetic characters) are always kept. With fewer than two lines, fewer
/// than two classifiable lines, or no 70% majority, the input is returned
/// unchanged.
pub fn strip_foreign_script(lines: &[String]) -> Vec<String> {
    if lines.len() < 2 {
        return lines.to_vec();
    }

    let classifications: Vec<Script> = lines
        .iter()
        .map(|l| classify_script(text_part(l)))
        .collect();

    let count_of = |wanted: Script| classifications.iter().filter(|s| **s == wanted).count();
    let latin_count = count_of(Script::Latin);
    let cjk_count = count_of(Script::Cjk);
    let other_count = count_of(Script::Other);

    // Unknown lines carry no script signal and are left out of the vote.
    let meaningful = latin_count + cjk_count + other_count;
    if meaningful < 2 {
        return lines.to_vec();
    }

    let share = |count: usize| count as f64 / meaningful as f64;
    let majority = if share(latin_count) >= 0.7 {
        Script::Latin
    } else if share(cjk_count) >= 0.7 {
        Script::Cjk
    } else if share(other_count) >= 0.7 {
        Script::Other
    } else {
        return lines.to_vec();
    };

    let result: Vec<String> = lines
        .iter()
        .zip(&classifications)
        .filter(|(_, script)| **script == Script::Unknown || **script == majority)
        .map(|(line, _)| line.clone())
        .collect();

    let removed = lines.len() - result.len();
    if removed > 0 {
        tracing::info!(
            removed = removed,
            majority = ?majority,
            "removed foreign-script hallucination lines"
        );
    }
    result
}
/// Coarse writing-system class that `classify_script` assigns to a line.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Script {
/// At least half of the alphabetic characters fall in the Latin ranges.
Latin,
/// At least half of the alphabetic characters fall in the CJK ranges
/// (and the Latin share is below half).
Cjk,
/// Alphabetic characters are present but neither Latin nor CJK reaches half.
Other,
/// The text contains no alphabetic characters at all.
Unknown,
}
/// Classifies `text` by the script of its alphabetic characters.
///
/// Non-alphabetic characters are ignored. Returns `Script::Unknown` when
/// nothing alphabetic is present, `Script::Latin` when Latin letters make up
/// at least half, otherwise `Script::Cjk` when CJK characters make up at
/// least half, otherwise `Script::Other`.
fn classify_script(text: &str) -> Script {
    // ASCII letters plus U+00C0..=U+024F and U+1E00..=U+1EFF.
    let is_latin = |ch: char| {
        ch.is_ascii_alphabetic()
            || matches!(ch, '\u{00C0}'..='\u{024F}' | '\u{1E00}'..='\u{1EFF}')
    };
    // The five code-point ranges this module counts as CJK.
    let is_cjk = |ch: char| {
        matches!(
            ch,
            '\u{4E00}'..='\u{9FFF}'
                | '\u{3400}'..='\u{4DBF}'
                | '\u{3040}'..='\u{309F}'
                | '\u{30A0}'..='\u{30FF}'
                | '\u{AC00}'..='\u{D7AF}'
        )
    };

    let (mut latin, mut cjk, mut other_script) = (0u32, 0u32, 0u32);
    for ch in text.chars().filter(|c| c.is_alphabetic()) {
        if is_latin(ch) {
            latin += 1;
        } else if is_cjk(ch) {
            cjk += 1;
        } else {
            other_script += 1;
        }
    }

    let total = latin + cjk + other_script;
    if total == 0 {
        return Script::Unknown;
    }

    let share = |count: u32| count as f64 / total as f64;
    if share(latin) >= 0.5 {
        Script::Latin
    } else if share(cjk) >= 0.5 {
        Script::Cjk
    } else {
        Script::Other
    }
}
/// Cuts a trailing block of noise / filler lines off the transcript.
///
/// The tail is the longest suffix whose lines are either recognised by
/// `is_always_noise` or are one of the fillers `you`, `okay.`, `yeah.`. The
/// tail is removed when it contains at least one always-noise line or is at
/// least five lines long; a `[Recording ended - N lines of trailing noise
/// removed]` line is appended in its place. Otherwise the input is returned
/// unchanged.
pub fn trim_trailing_noise(lines: &[String]) -> Vec<String> {
    /// Short utterances treated as filler when they sit at the very end.
    fn is_filler(text: &str) -> bool {
        matches!(
            text.trim().to_lowercase().as_str(),
            "you" | "okay." | "yeah."
        )
    }

    if lines.is_empty() {
        return Vec::new();
    }

    // Length of the trailing stretch made only of noise or filler lines.
    let trimmed_count = lines
        .iter()
        .rev()
        .take_while(|line| {
            let text = text_part(line);
            is_always_noise(text) || is_filler(text)
        })
        .count();
    let trim_from = lines.len() - trimmed_count;
    let always_noise_count = lines[trim_from..]
        .iter()
        .filter(|line| is_always_noise(text_part(line)))
        .count();

    // A short filler-only tail is left in place.
    if always_noise_count == 0 && trimmed_count < 5 {
        return lines.to_vec();
    }

    tracing::info!(
        trimmed = trimmed_count,
        always_noise = always_noise_count,
        "removed trailing noise from transcript"
    );
    let mut result: Vec<String> = lines[..trim_from].to_vec();
    result.push(format!(
        "[Recording ended - {} lines of trailing noise removed]",
        trimmed_count
    ));
    result
}
/// Removes up to two trailing lines that end with a stop-recording command.
///
/// The last line's text (see `text_part`) is trimmed, lowercased and stripped
/// of trailing `.` characters, then checked against a fixed command list. A
/// line matches when it equals a command or ends with one that starts at a
/// word boundary. The check is repeated once more on the new last line.
pub fn strip_trailing_commands(lines: &[String]) -> Vec<String> {
    const COMMANDS: &[&str] = &[
        "stop recording",
        "stop the recording",
        "end recording",
        "end the recording",
        "stop transcription",
        "end transcription",
        "stop transcribing",
        "hey minutes stop",
        "minutes stop",
        "okay stop",
        "ok stop",
    ];

    /// True when `text` ends with `cmd` and the character right before the
    /// command (if any) is not alphanumeric. Without the boundary check,
    /// "book stop" would match "ok stop" and "nonstop recording" would match
    /// "stop recording", deleting ordinary closing sentences.
    fn ends_with_command(text: &str, cmd: &str) -> bool {
        text.strip_suffix(cmd).map_or(false, |before| {
            before
                .chars()
                .next_back()
                .map_or(true, |c| !c.is_alphanumeric())
        })
    }

    let mut result = lines.to_vec();
    for _ in 0..2 {
        let is_command = match result.last() {
            Some(last) => {
                let lowered = text_part(last).trim().to_lowercase();
                let text = lowered.trim_end_matches('.');
                COMMANDS.iter().any(|cmd| ends_with_command(text, cmd))
            }
            None => false,
        };
        if !is_command {
            break;
        }
        tracing::debug!(
            line = result.last().map(|l| l.as_str()).unwrap_or(""),
            "stripping trailing voice command"
        );
        result.pop();
    }
    result
}
#[cfg(test)]
mod tests {
use super::*;
// Four identical timestamped lines are collapsed and annotated; the differing line survives.
#[test]
fn clean_transcript_removes_repetition() {
let input = "[0:00] Hello world\n[0:03] Hello world\n[0:06] Hello world\n[0:09] Hello world\n[0:12] Something different\n";
let (cleaned, stats) = clean_transcript(input);
assert!(stats.lines_removed > 0);
assert!(cleaned.contains("Something different"));
assert!(cleaned.contains("repeated audio removed"));
}
// Three distinct lines pass through the full pipeline with nothing removed.
#[test]
fn clean_transcript_preserves_normal_text() {
let input = "[0:00] First line\n[0:05] Second line\n[0:10] Third line\n";
let (cleaned, stats) = clean_transcript(input);
assert_eq!(stats.lines_removed, 0);
assert!(cleaned.contains("First line"));
assert!(cleaned.contains("Third line"));
}
// dedup_segments returns distinct lines unchanged.
#[test]
fn dedup_no_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] How are you".into(),
"[0:06] Fine thanks".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result, lines);
}
// A run of four identical lines becomes the first line plus one annotation line.
#[test]
fn dedup_collapses_exact_repetition() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Hello world".into(),
"[0:09] Hello world".into(),
"[0:12] Something different".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result.len(), 3);
assert!(result[0].contains("Hello world"));
assert!(result[1].contains("repeated audio removed"));
assert!(result[2].contains("Something different"));
}
// Lines differing only by a comma still reach the 0.8 similarity threshold and collapse.
#[test]
fn dedup_collapses_near_identical() {
let lines = vec![
"[0:00] Ok bene le macedi diesel".into(),
"[0:03] Ok, bene le macedi diesel".into(),
"[0:06] Ok bene, le macedi diesel".into(),
"[0:09] Good morning".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result.len(), 3);
assert!(result[1].contains("repeated audio removed"));
}
// A run of only two identical lines is below the collapse threshold of three.
#[test]
fn dedup_leaves_two_similar_alone() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:03] Hello world".into(),
"[0:06] Something else".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result, lines);
}
// Empty input yields empty output.
#[test]
fn dedup_handles_empty() {
let result = dedup_segments(&[]);
assert!(result.is_empty());
}
// A single line is returned unchanged.
#[test]
fn dedup_handles_single_line() {
let lines = vec!["[0:00] Hello".into()];
let result = dedup_segments(&lines);
assert_eq!(result, lines);
}
// Two separate runs are collapsed independently; each annotation reports run length minus one.
#[test]
fn dedup_multiple_runs() {
let lines = vec![
"[0:00] First phrase".into(),
"[0:03] First phrase".into(),
"[0:06] First phrase".into(),
"[0:09] Second phrase".into(),
"[0:12] Second phrase".into(),
"[0:15] Second phrase".into(),
"[0:18] Second phrase".into(),
"[0:21] Normal text".into(),
];
let result = dedup_segments(&lines);
assert_eq!(result.len(), 5);
assert!(result[1].contains("2 identical"));
assert!(result[3].contains("3 identical"));
}
// Twenty lines alternating one phrase with "Okay." collapse to the first phrase,
// an annotation, and the trailing distinct line.
#[test]
fn interleaved_catches_alternating_pattern() {
let mut lines: Vec<String> = Vec::new();
for i in 0..20 {
let ts = i * 2;
if i % 2 == 0 {
lines.push(format!(
"[{}:{:02}] So I'm going to pick his brain as well.",
ts / 60,
ts % 60
));
} else {
lines.push(format!("[{}:{:02}] Okay.", ts / 60, ts % 60));
}
}
lines.push("[0:40] Something completely different".into());
let result = dedup_interleaved(&lines);
assert!(
result.len() <= 4,
"expected <=4 lines, got {}: {:?}",
result.len(),
result
);
assert!(result.iter().any(|l| l.contains("pick his brain")));
assert!(result
.iter()
.any(|l| l.contains("hallucinated repetition removed")));
assert!(result
.last()
.unwrap()
.contains("Something completely different"));
}
// Six distinct lines contain no dominant phrase and are returned unchanged.
#[test]
fn interleaved_leaves_normal_conversation() {
let lines = vec![
"[0:00] Hello how are you".into(),
"[0:05] I'm fine thanks".into(),
"[0:10] Great to hear".into(),
"[0:15] Let's talk about the project".into(),
"[0:20] Sure what's the update".into(),
"[0:25] We shipped the feature".into(),
];
let result = dedup_interleaved(&lines);
assert_eq!(result, lines);
}
// Three occurrences in six lines is too short for the interleaved pass to act.
#[test]
fn interleaved_ignores_short_repeats() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:02] Okay.".into(),
"[0:04] Hello world".into(),
"[0:06] Okay.".into(),
"[0:08] Hello world".into(),
"[0:10] Something else".into(),
];
let result = dedup_interleaved(&lines);
assert_eq!(result, lines);
}
// Twenty trailing [music] lines are replaced by a single summary line.
#[test]
fn trim_trailing_music() {
let mut lines: Vec<String> = vec![
"[0:00] Hello world".into(),
"[0:05] Some real content".into(),
];
for i in 0..20 {
lines.push(format!("[{}:00] [music]", i + 1));
}
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 3);
assert!(result[0].contains("Hello world"));
assert!(result[1].contains("real content"));
assert!(result[2].contains("trailing noise removed"));
}
// Even a short trailing run is trimmed when it contains an always-noise marker.
#[test]
fn trim_short_run_of_always_noise_now_trimmed() {
let lines: Vec<String> = vec![
"[0:00] Hello world".into(),
"[0:05] [music]".into(),
"[0:10] [music]".into(),
"[0:15] [music]".into(),
];
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 2);
assert!(result[0].contains("Hello world"));
assert!(result[1].contains("trailing noise removed"));
}
// A single trailing filler line is below the five-line threshold and is kept.
#[test]
fn trim_keeps_short_trailing_filler() {
let lines: Vec<String> = vec!["[0:00] That wraps it".into(), "[0:05] yeah.".into()];
let result = trim_trailing_noise(&lines);
assert_eq!(result, lines, "single-filler closing must survive");
}
// Five trailing filler lines reach the threshold and are trimmed.
#[test]
fn trim_long_run_of_filler_is_trimmed() {
let lines: Vec<String> = vec![
"[0:00] Real content".into(),
"[0:05] yeah.".into(),
"[0:10] yeah.".into(),
"[0:15] yeah.".into(),
"[0:20] yeah.".into(),
"[0:25] yeah.".into(),
];
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 2);
assert!(result[0].contains("Real content"));
assert!(result[1].contains("trailing noise removed"));
}
// Empty input yields empty output.
#[test]
fn trim_handles_empty() {
assert!(trim_trailing_noise(&[]).is_empty());
}
// When every line is noise, only the summary line remains.
#[test]
fn trim_all_noise() {
let lines: Vec<String> = (0..10).map(|i| format!("[{}:00] [music]", i)).collect();
let result = trim_trailing_noise(&lines);
assert_eq!(result.len(), 1);
assert!(result[0].contains("trailing noise removed"));
}
// One katakana line in a Latin-majority transcript is removed.
#[test]
fn script_removes_cjk_from_latin_transcript() {
let lines = vec![
"[0:00] Hello and welcome".into(),
"[0:05] Let's discuss the project".into(),
"[0:10] スパイシー".into(),
"[0:15] We should wrap up now".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result.len(), 3);
assert!(!result.iter().any(|l| l.contains("スパイシー")));
}
// An all-Latin transcript is returned unchanged.
#[test]
fn script_preserves_pure_latin_transcript() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:05] How are you".into(),
"[0:10] I'm doing fine".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
// An all-CJK transcript is returned unchanged.
#[test]
fn script_preserves_pure_cjk_transcript() {
let lines = vec![
"[0:00] こんにちは".into(),
"[0:05] お元気ですか".into(),
"[0:10] 元気です".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
// A 50/50 split has no 70% majority, so nothing is removed.
#[test]
fn script_no_action_on_mixed_transcript() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:05] こんにちは".into(),
"[0:10] Good morning".into(),
"[0:15] お元気ですか".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
// A single line is too little to establish a majority and is kept.
#[test]
fn script_handles_single_line() {
let lines = vec!["[0:00] スパイシー".into()];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines); }
// Five Latin lines out of seven (about 71%) form the majority; both CJK lines are removed.
#[test]
fn script_all_hallucinated_in_latin_majority() {
let lines = vec![
"[0:00] Today we need to discuss".into(),
"[0:05] The quarterly results".into(),
"[0:10] Are looking good".into(),
"[0:15] Revenue is up".into(),
"[0:20] Margins improved significantly".into(),
"[0:25] 東京タワー".into(),
"[0:30] 大阪城".into(),
];
let result = strip_foreign_script(&lines);
assert_eq!(result.len(), 5);
assert!(result
.iter()
.all(|l| !l.contains('東') && !l.contains('大')));
}
// Two CJK lines are their own majority and are kept.
#[test]
fn script_two_cjk_lines_preserved() {
let lines = vec!["[0:00] スパイシー".into(), "[0:05] 東京タワー".into()];
let result = strip_foreign_script(&lines);
assert_eq!(result, lines);
}
// With a Cyrillic (Script::Other) majority, the lone Latin line is the one removed.
#[test]
fn script_cyrillic_majority_strips_latin() {
let lines = vec![
"[0:00] Привет мир".into(),
"[0:05] Как дела".into(),
"[0:10] Всё хорошо".into(),
"[0:15] Hello world".into(), ];
let result = strip_foreign_script(&lines);
assert_eq!(result.len(), 3);
assert!(!result.iter().any(|l| l.contains("Hello")));
}
// classify_script returns the expected variant for Latin, katakana, Cyrillic,
// empty, and letter-free input.
#[test]
fn script_classify_basic() {
assert_eq!(classify_script("Hello world"), Script::Latin);
assert_eq!(classify_script("スパイシー"), Script::Cjk);
assert_eq!(classify_script("Привет"), Script::Other);
assert_eq!(classify_script(""), Script::Unknown);
assert_eq!(classify_script("123 !@#"), Script::Unknown);
}
// The full pipeline runs the script filter and records its effect in the stats.
#[test]
fn clean_transcript_includes_script_filter() {
let input =
"[0:00] Hello world\n[0:05] Testing one two\n[0:10] スパイシー\n[0:15] All done\n";
let (cleaned, stats) = clean_transcript(input);
assert!(!cleaned.contains("スパイシー"));
assert!(stats.after_script_filter < stats.after_interleaved_dedup);
}
// Ten consecutive Polish [Śmiech] markers collapse to one marker plus an annotation.
#[test]
fn noise_markers_collapses_polish_laughter() {
let mut lines: Vec<String> = vec!["[0:00] Cześć, jak się masz?".into()];
for i in 1..=10 {
lines.push(format!("[0:{:02}] [Śmiech]", i * 3));
}
lines.push("[0:33] Dobrze, dziękuję".into());
let result = collapse_noise_markers(&lines);
assert!(
result.len() <= 4,
"got {} lines: {:?}",
result.len(),
result
);
assert!(result[0].contains("Cześć"));
assert!(result
.iter()
.any(|l| l.contains("non-speech audio removed")));
assert!(result.last().unwrap().contains("Dobrze"));
}
// A run of different markers is still one run and gets collapsed.
#[test]
fn noise_markers_collapses_english_mixed() {
let lines = vec![
"[0:00] Good morning everyone".into(),
"[0:05] [music]".into(),
"[0:10] [laughter]".into(),
"[0:15] [applause]".into(),
"[0:20] [music]".into(),
"[0:25] Thank you for coming".into(),
];
let result = collapse_noise_markers(&lines);
assert!(
result.len() <= 4,
"got {} lines: {:?}",
result.len(),
result
);
assert!(result[0].contains("Good morning"));
assert!(result.last().unwrap().contains("Thank you"));
}
// A single isolated marker is left in place.
#[test]
fn noise_markers_preserves_short_runs() {
let lines = vec![
"[0:00] Hello world".into(),
"[0:05] [laughter]".into(),
"[0:10] That was funny".into(),
];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
// Empty input yields empty output.
#[test]
fn noise_markers_handles_empty() {
assert!(collapse_noise_markers(&[]).is_empty());
}
// A one-line input is returned unchanged even if it is a marker.
#[test]
fn noise_markers_handles_single_line() {
let lines = vec!["[0:00] [music]".into()];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
// A two-line input is below the three-line minimum and is returned unchanged.
#[test]
fn noise_markers_handles_two_lines() {
let lines = vec!["[0:00] [music]".into(), "[0:03] [laughter]".into()];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
// Timestamp prefixes are not mistaken for noise markers.
#[test]
fn noise_markers_ignores_timestamps() {
let lines = vec![
"[0:00] Hello".into(),
"[0:05] World".into(),
"[0:10] Test".into(),
];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
// Annotation lines written by earlier passes are not treated as noise markers.
#[test]
fn noise_markers_ignores_collapse_markers() {
let lines = vec![
"[0:00] Hello world".into(),
"[...] [repeated audio removed - 5 identical segments collapsed]".into(),
"[0:30] Something else".into(),
"[...] [hallucinated repetition removed - 10 lines collapsed]".into(),
"[1:00] Final line".into(),
];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
// French and German markers back to back are handled like any other marker run.
#[test]
fn noise_markers_multilingual_markers() {
let mut lines = Vec::new();
lines.push("[0:00] Bonjour".into());
for i in 1..=4 {
lines.push(format!("[0:{:02}] [rires]", i * 3));
}
for i in 5..=7 {
lines.push(format!("[0:{:02}] [Musik]", i * 3));
}
lines.push("[0:30] Au revoir".into());
let result = collapse_noise_markers(&lines);
assert!(
result.len() <= 5,
"got {} lines: {:?}",
result.len(),
result
);
assert!(result[0].contains("Bonjour"));
assert!(result.last().unwrap().contains("Au revoir"));
}
// Ten markers in runs of two among five content lines (10/15 of the lines) trigger
// the second pass: markers are stripped and a summary line is appended.
#[test]
fn noise_markers_scattered_high_density() {
let lines = vec![
"[0:00] Real content one".into(),
"[0:03] [Śmiech]".into(),
"[0:06] [muzyka]".into(),
"[0:09] Real content two".into(),
"[0:12] [cisza]".into(),
"[0:15] [oklaski]".into(),
"[0:18] Real content three".into(),
"[0:21] [Śmiech]".into(),
"[0:24] [muzyka]".into(),
"[0:27] Real content four".into(),
"[0:30] [cisza]".into(),
"[0:33] [oklaski]".into(),
"[0:36] Real content five".into(),
"[0:39] [Śmiech]".into(),
"[0:42] [muzyka]".into(),
];
let result = collapse_noise_markers(&lines);
let content_count = result.iter().filter(|l| l.contains("Real content")).count();
assert_eq!(content_count, 5, "all content lines preserved");
assert!(
result
.iter()
.any(|l| l.contains("non-speech markers removed")),
"expected pass 2 removal summary, got: {:?}",
result
);
}
// Five isolated markers among five content lines stay below both second-pass
// thresholds and are kept.
#[test]
fn noise_markers_below_threshold_kept() {
let lines = vec![
"[0:00] Real content one".into(),
"[0:03] [laughter]".into(),
"[0:06] Real content two".into(),
"[0:09] [applause]".into(),
"[0:12] Real content three".into(),
"[0:15] [laughter]".into(),
"[0:18] Real content four".into(),
"[0:21] [music]".into(),
"[0:24] Real content five".into(),
"[0:27] [laughter]".into(),
];
let result = collapse_noise_markers(&lines);
assert_eq!(result, lines);
}
// Six consecutive [BLANK_AUDIO] markers are collapsed.
#[test]
fn noise_markers_handles_blank_audio() {
let mut lines: Vec<String> = vec!["[0:00] Some content".into()];
for i in 1..=6 {
lines.push(format!("[0:{:02}] [BLANK_AUDIO]", i * 5));
}
lines.push("[0:35] More content".into());
let result = collapse_noise_markers(&lines);
assert!(result.len() <= 4);
assert!(result
.iter()
.any(|l| l.contains("non-speech audio removed")));
}
// The full pipeline runs the noise-marker pass and records its effect in the stats.
#[test]
fn clean_transcript_includes_noise_markers() {
let input = "[0:00] Hello world\n\
[0:03] [Śmiech]\n\
[0:06] [muzyka]\n\
[0:09] [cisza]\n\
[0:12] [oklaski]\n\
[0:15] [Śmiech]\n\
[0:18] [muzyka]\n\
[0:21] [cisza]\n\
[0:24] Goodbye\n";
let (cleaned, stats) = clean_transcript(input);
assert!(
stats.after_noise_markers < stats.after_script_filter,
"noise markers: {}, script filter: {}",
stats.after_noise_markers,
stats.after_script_filter
);
assert!(cleaned.contains("Hello world"));
assert!(cleaned.contains("Goodbye"));
}
// A trailing "Stop recording." line is removed.
#[test]
fn strip_command_removes_stop_recording() {
let lines = vec![
"[0:00] Great meeting everyone".into(),
"[0:05] Let's wrap up".into(),
"[0:10] Stop recording.".into(),
];
let result = strip_trailing_commands(&lines);
assert_eq!(result.len(), 2);
assert!(result[1].contains("wrap up"));
}
// The command is recognised behind a timestamp prefix and without a trailing period.
#[test]
fn strip_command_removes_with_timestamp() {
let lines = vec!["[0:00] First point".into(), "[0:30] Stop recording".into()];
let result = strip_trailing_commands(&lines);
assert_eq!(result.len(), 1);
assert!(result[0].contains("First point"));
}
// "End recording" is also in the command list.
#[test]
fn strip_command_removes_end_recording() {
let lines = vec![
"[0:00] Discussion content".into(),
"[0:10] End recording".into(),
];
let result = strip_trailing_commands(&lines);
assert_eq!(result.len(), 1);
}
// Up to two consecutive trailing command lines are removed.
#[test]
fn strip_command_removes_two_trailing_commands() {
let lines = vec![
"[0:00] Content".into(),
"[0:10] Okay stop.".into(),
"[0:12] Stop recording.".into(),
];
let result = strip_trailing_commands(&lines);
assert_eq!(result.len(), 1);
assert!(result[0].contains("Content"));
}
// Lines that merely contain a command phrase mid-sentence are kept.
#[test]
fn strip_command_preserves_non_command_lines() {
let lines = vec![
"[0:00] We need to stop recording expenses".into(),
"[0:05] The stop recording policy is important".into(),
];
let result = strip_trailing_commands(&lines);
assert_eq!(result.len(), 2, "non-command lines should be preserved");
}
// Empty input yields empty output.
#[test]
fn strip_command_handles_empty() {
let result = strip_trailing_commands(&[]);
assert!(result.is_empty());
}
// Command matching ignores letter case.
#[test]
fn strip_command_case_insensitive() {
let lines = vec![
"[0:00] Meeting notes".into(),
"[0:05] STOP RECORDING".into(),
];
let result = strip_trailing_commands(&lines);
assert_eq!(result.len(), 1);
}
// The full pipeline removes a trailing voice command and counts exactly one removed line.
#[test]
fn clean_transcript_strips_trailing_command() {
let input =
"[0:00] Important discussion\n[0:05] Action item for Bob\n[0:10] Stop recording.\n";
let (cleaned, stats) = clean_transcript(input);
assert!(!cleaned.contains("Stop recording"));
assert!(cleaned.contains("Action item for Bob"));
assert!(stats.after_command_strip <= stats.after_trailing_trim);
assert_eq!(stats.lines_removed, 1);
}
// Empty input produces empty output and zeroed stats.
#[test]
fn clean_segments_handles_empty() {
let (cleaned, stats) = clean_segments(&[]);
assert!(cleaned.is_empty());
assert_eq!(stats.original_lines, 0);
assert_eq!(stats.lines_removed, 0);
}
// Clean segments without timestamps pass through untouched.
#[test]
fn clean_segments_passes_through_clean_input() {
let input: Vec<String> = vec![
"Welcome to the meeting.".into(),
"Let's discuss Q3 numbers.".into(),
"Revenue is up twelve percent.".into(),
];
let (cleaned, stats) = clean_segments(&input);
assert_eq!(cleaned, input, "clean input should be untouched");
assert_eq!(stats.lines_removed, 0);
assert_eq!(stats.after_command_strip, 3);
}
// Four repeated "Thank you." segments are collapsed with an annotation; real content survives.
#[test]
fn clean_segments_dedups_repeated_hallucination() {
let input: Vec<String> = vec![
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"What's the budget for Q3?".into(),
];
let (cleaned, stats) = clean_segments(&input);
assert!(cleaned.iter().any(|s| s.contains("budget")));
assert!(stats.lines_removed >= 2);
assert!(cleaned.iter().any(|s| s.contains("repeated audio removed")));
}
// Cleaning already-cleaned output changes nothing.
#[test]
fn clean_segments_is_idempotent() {
let input: Vec<String> = vec![
"Real content.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"More real content.".into(),
];
let (first, _) = clean_segments(&input);
let (second, second_stats) = clean_segments(&first);
assert_eq!(first, second, "second pass should be a no-op");
assert_eq!(second_stats.lines_removed, 0);
}
// With consecutive dedup disabled, four identical segments are all kept.
#[test]
fn clean_segments_with_options_respects_disabled_passes() {
let input: Vec<String> = vec![
"Hello.".into(),
"Hello.".into(),
"Hello.".into(),
"Hello.".into(),
];
let opts = CleanOptions {
dedup_consecutive: false,
..CleanOptions::default()
};
let (cleaned, _) = clean_segments_with_options(&input, &opts);
assert_eq!(cleaned.len(), input.len(), "dedup disabled → no removal");
}
// CleanOptions::none() leaves even repetitive, command-terminated input unchanged.
#[test]
fn clean_options_none_runs_no_passes() {
let input: Vec<String> = vec![
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Stop recording.".into(),
];
let (cleaned, stats) = clean_segments_with_options(&input, &CleanOptions::none());
assert_eq!(cleaned, input, "no passes → no changes");
assert_eq!(stats.lines_removed, 0);
}
// CleanOptions::all() produces the same output and stats as the default options.
#[test]
fn clean_options_all_matches_default() {
let input: Vec<String> = vec![
"Real meeting content.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"[music]".into(),
];
let (default_out, default_stats) = clean_segments(&input);
let (all_out, all_stats) = clean_segments_with_options(&input, &CleanOptions::all());
assert_eq!(default_out, all_out);
assert_eq!(default_stats, all_stats);
}
// Raw segments with a leading space and no timestamp prefix are deduplicated too.
#[test]
fn clean_segments_works_on_raw_segments_without_timestamps() {
let raw_segments: Vec<String> = vec![
" Thank you.".into(), " Thank you.".into(),
" Thank you.".into(),
" Thank you.".into(),
" So what's our action plan?".into(),
];
let (cleaned, stats) = clean_segments(&raw_segments);
assert!(stats.lines_removed >= 2);
assert!(cleaned.iter().any(|s| s.contains("action plan")));
}
// clean_transcript equals clean_segments on the split lines, joined with "\n".
#[test]
fn clean_transcript_delegates_to_clean_segments() {
let raw = "Thank you.\nThank you.\nThank you.\nReal content.";
let segments: Vec<String> = raw.lines().map(String::from).collect();
let (transcript_out, _t_stats) = clean_transcript(raw);
let (segments_out, _s_stats) = clean_segments(&segments);
assert_eq!(transcript_out, segments_out.join("\n"));
}
// The summary string carries the tool prefix and the original line count.
#[test]
fn clean_stats_summary_is_human_readable() {
let input: Vec<String> = vec![
"Hello.".into(),
"Hello.".into(),
"Hello.".into(),
"World.".into(),
];
let (_, stats) = clean_segments(&input);
let summary = stats.summary();
assert!(summary.contains("whisper-guard:"));
assert!(summary.contains("4")); }
// Ten thousand identical segments are handled without panicking and collapse to a few lines.
#[test]
fn clean_segments_with_huge_input_does_not_panic() {
let input: Vec<String> = (0..10_000).map(|_| "Thank you.".to_string()).collect();
let (cleaned, stats) = clean_segments(&input);
assert_eq!(stats.original_lines, 10_000);
assert!(cleaned.len() < 10);
}
// Accented Latin letters still classify as Latin, so neither line is filtered.
#[test]
fn clean_segments_handles_unicode_correctly() {
let input: Vec<String> = vec![
"Café meeting at 9am with Søren and José".into(),
"Discussed naïve Bayes models".into(),
];
let (cleaned, _) = clean_segments(&input);
assert_eq!(cleaned.len(), 2, "unicode-in-Latin should not be filtered");
}
// With default options the dedup annotation line stays in the output.
#[test]
fn keep_dedup_annotations_default_true_preserves_marker() {
let input: Vec<String> = vec![
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Real content.".into(),
];
let (cleaned, _) = clean_segments(&input);
assert!(
cleaned
.iter()
.any(|s| s.starts_with(DEDUP_ANNOTATION_PREFIX)),
"default behavior should preserve the annotation line"
);
}
// With keep_dedup_annotations = false the annotation is dropped, leaving two of five lines.
#[test]
fn keep_dedup_annotations_false_strips_marker() {
let input: Vec<String> = vec![
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"Real content.".into(),
];
let opts = CleanOptions {
keep_dedup_annotations: false,
..CleanOptions::default()
};
let (cleaned, stats) = clean_segments_with_options(&input, &opts);
assert!(
!cleaned
.iter()
.any(|s| s.starts_with(DEDUP_ANNOTATION_PREFIX)),
"annotation should be removed"
);
assert_eq!(cleaned.len(), 2);
assert_eq!(stats.lines_removed, 3);
}
// Dropping annotations must not remove other lines that start with a bracket.
#[test]
fn keep_dedup_annotations_does_not_strip_other_bracket_content() {
let input: Vec<String> = vec![
"Thank you.".into(),
"Thank you.".into(),
"Thank you.".into(),
"[NAME] said the deal closed.".into(),
];
let opts = CleanOptions {
keep_dedup_annotations: false,
..CleanOptions::default()
};
let (cleaned, _) = clean_segments_with_options(&input, &opts);
assert!(
cleaned.iter().any(|s| s.contains("[NAME]")),
"non-annotation bracket content must survive"
);
}
}