use tracing::debug;
use super::groq::GroqWord;
/// State carried between successive transcription chunks so that words and
/// text already emitted for an earlier chunk are not emitted again.
///
/// Two independent strategies share this state: a timestamp watermark used by
/// `filter_words`, and a remembered text tail used by `filter_text`.
#[derive(Debug, Clone)]
pub struct DeduplicationTracker {
    /// Absolute time (seconds) up to which words have already been accepted;
    /// words starting before it (minus a small tolerance) are dropped.
    transcribed_up_to: f64,
    /// Sum of all chunk durations passed to `advance_offset`; added to each
    /// incoming word's timestamps before they are compared or returned.
    cumulative_offset: f64,
    /// Tail of the text most recently passed to `filter_text`, used as the
    /// reference when looking for overlap in the next text.
    recent_text: String,
    /// Upper bound on the stored length of `recent_text`. Despite the name it
    /// is compared against `String::len`, i.e. it is a byte count.
    max_recent_chars: usize,
}
impl DeduplicationTracker {
    /// Creates a tracker with no history: zero offset, zero watermark, empty
    /// remembered text, and a 500-byte cap on the remembered text.
    pub fn new() -> Self {
        Self {
            transcribed_up_to: 0.0,
            cumulative_offset: 0.0,
            recent_text: String::new(),
            max_recent_chars: 500,
        }
    }

    /// Adds `chunk_duration_secs` to the cumulative offset that
    /// [`filter_words`](Self::filter_words) applies to incoming timestamps.
    pub fn advance_offset(&mut self, chunk_duration_secs: f64) {
        self.cumulative_offset += chunk_duration_secs;
        debug!(
            "dedup: advanced offset by {:.2}s, cumulative = {:.2}s",
            chunk_duration_secs, self.cumulative_offset
        );
    }

    /// Returns the words of `words` that start at or after the current
    /// watermark (with a 50 ms tolerance), with the cumulative offset added
    /// to their `start` and `end` timestamps.
    ///
    /// Each accepted word pushes the watermark forward to its adjusted end
    /// time, so later words in the same slice are compared against it.
    pub fn filter_words(&mut self, words: &[GroqWord]) -> Vec<GroqWord> {
        // Slack allowed below the watermark before a word counts as already
        // transcribed.
        const START_TOLERANCE_SECS: f64 = 0.05;

        let mut accepted = Vec::with_capacity(words.len());
        for word in words {
            let adjusted_start = word.start + self.cumulative_offset;
            let adjusted_end = word.end + self.cumulative_offset;
            if adjusted_start >= self.transcribed_up_to - START_TOLERANCE_SECS {
                accepted.push(GroqWord {
                    word: word.word.clone(),
                    start: adjusted_start,
                    end: adjusted_end,
                });
                // The watermark must never move backwards: a word admitted
                // through the tolerance window can end before the current
                // watermark, and letting it lower the mark would allow
                // progressively earlier words to slip through.
                self.transcribed_up_to = self.transcribed_up_to.max(adjusted_end);
            }
        }
        debug!(
            "dedup: accepted {}/{} words, transcribed_up_to = {:.2}s",
            accepted.len(),
            words.len(),
            self.transcribed_up_to
        );
        accepted
    }

    /// Returns the part of `new_text` that does not repeat the previously
    /// seen text, then remembers (the tail of) `new_text` for the next call.
    ///
    /// When nothing is remembered — on the first call, or after a call with
    /// an empty `new_text` — the input is returned unchanged.
    pub fn filter_text(&mut self, new_text: &str) -> String {
        let result = if self.recent_text.is_empty() {
            new_text.to_string()
        } else {
            remove_overlap(&self.recent_text, new_text)
        };

        // Keep at most `max_recent_chars` bytes from the end of `new_text`.
        // The cut point is moved forward to the next UTF-8 character boundary
        // because slicing a `str` inside a multi-byte character panics.
        let mut trim_at = new_text.len().saturating_sub(self.max_recent_chars);
        while !new_text.is_char_boundary(trim_at) {
            trim_at += 1;
        }
        self.recent_text.clear();
        self.recent_text.push_str(&new_text[trim_at..]);

        result
    }
}
impl Default for DeduplicationTracker {
fn default() -> Self {
Self::new()
}
}
/// Strips from `new` the leading portion that repeats the end of `previous`
/// and returns what is left, with words joined by single spaces.
///
/// Both inputs are compared word by word using `ngram_match`. If either input
/// has no words, or no overlap is found, `new` is returned unchanged.
fn remove_overlap(previous: &str, new: &str) -> String {
    let tail: Vec<&str> = previous.split_whitespace().collect();
    let fresh: Vec<&str> = new.split_whitespace().collect();
    if tail.is_empty() || fresh.is_empty() {
        return new.to_string();
    }

    // Pass 1: take the last 3..=8 words of `previous` as an anchor (longest
    // first) and look for it among the window positions in the first three
    // quarters of `new`. Everything after the first hit is the new content.
    let search_limit = (fresh.len() * 3 / 4).max(1);
    let longest_anchor = tail.len().min(8);
    for anchor_len in (3..=longest_anchor).rev() {
        let anchor = &tail[tail.len() - anchor_len..];
        let hit = fresh
            .windows(anchor_len)
            .take(search_limit)
            .position(|window| ngram_match(anchor, window));
        if let Some(pos) = hit {
            return fresh[pos + anchor_len..].join(" ");
        }
    }

    // Pass 2: fall back to the longest suffix of `previous` (capped at 50
    // words) that matches a prefix of `new`.
    let longest_overlap = tail.len().min(fresh.len()).min(50);
    let overlap = (1..=longest_overlap)
        .rev()
        .find(|&len| ngram_match(&tail[tail.len() - len..], &fresh[..len]));

    match overlap {
        Some(len) => fresh[len..].join(" "),
        None => new.to_string(),
    }
}
/// Returns `true` when `a` and `b` hold the same number of words and every
/// pair of words at the same position satisfies `words_match`.
fn ngram_match(a: &[&str], b: &[&str]) -> bool {
    a.len() == b.len() && a.iter().zip(b).all(|(lhs, rhs)| words_match(lhs, rhs))
}
/// Decides whether two words count as the same word.
///
/// Both are normalized with `normalize_word` first. They match when the
/// normalized forms are equal, or when their Jaro-Winkler similarity is at
/// least 0.85.
fn words_match(a: &str, b: &str) -> bool {
    let (left, right) = (normalize_word(a), normalize_word(b));
    // Equality is checked first so the similarity score is only computed for
    // words that actually differ.
    left == right || strsim::jaro_winkler(&left, &right) >= 0.85
}
/// Lowercases `word` and drops any run of ASCII punctuation at its end.
///
/// Punctuation at the start or in the middle of the word is left in place.
fn normalize_word(word: &str) -> String {
    let mut lowered = word.to_lowercase();
    let kept_len = lowered
        .trim_end_matches(|c: char| c.is_ascii_punctuation())
        .len();
    // `kept_len` is the length of a prefix slice, so it is a valid char
    // boundary for `truncate`.
    lowered.truncate(kept_len);
    lowered
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `GroqWord` fixture from its text and chunk-relative times.
    fn spoken(text: &str, start: f64, end: f64) -> GroqWord {
        GroqWord {
            word: text.to_string(),
            start,
            end,
        }
    }

    /// Extracts just the word text so a whole result can be compared at once.
    fn texts(words: &[GroqWord]) -> Vec<&str> {
        words.iter().map(|w| w.word.as_str()).collect()
    }

    // ---- timestamp-based deduplication -------------------------------------

    #[test]
    fn timestamp_dedup_no_overlap() {
        let mut tracker = DeduplicationTracker::new();
        let chunk = [spoken("Hello", 0.0, 0.5), spoken("world", 0.6, 1.0)];

        let accepted = tracker.filter_words(&chunk);

        assert_eq!(texts(&accepted), ["Hello", "world"]);
    }

    #[test]
    fn timestamp_dedup_skips_overlapping() {
        let mut tracker = DeduplicationTracker::new();

        let first = [spoken("Hello", 0.0, 0.5), spoken("world", 0.6, 1.0)];
        assert_eq!(tracker.filter_words(&first).len(), 2);

        // "world" repeats the time span already covered by the first batch.
        let second = [spoken("world", 0.6, 1.0), spoken("how", 1.1, 1.3)];
        let accepted = tracker.filter_words(&second);

        assert_eq!(texts(&accepted), ["how"]);
    }

    #[test]
    fn timestamp_dedup_with_offset() {
        let mut tracker = DeduplicationTracker::new();
        tracker.filter_words(&[spoken("Hello", 0.0, 0.5)]);
        tracker.advance_offset(1.0);

        let accepted = tracker.filter_words(&[spoken("world", 0.1, 0.5)]);

        assert_eq!(texts(&accepted), ["world"]);
        // 0.1 s chunk-relative start plus the 1.0 s offset.
        assert!((accepted[0].start - 1.1).abs() < 0.01);
    }

    // ---- text-based deduplication through the tracker ----------------------

    #[test]
    fn text_dedup_no_previous() {
        let mut tracker = DeduplicationTracker::new();
        assert_eq!(tracker.filter_text("Hello world"), "Hello world");
    }

    #[test]
    fn text_dedup_removes_overlap_prefix() {
        let mut tracker = DeduplicationTracker::new();
        tracker.filter_text("the quick brown fox");
        assert_eq!(
            tracker.filter_text("the quick brown fox jumps over"),
            "jumps over"
        );
    }

    #[test]
    fn text_dedup_handles_whisper_rephrase() {
        let mut tracker = DeduplicationTracker::new();
        tracker.filter_text("trying to test it to see if it works");

        // The repeated part differs in one word ("to" became "and").
        let fresh =
            tracker.filter_text("trying to test it and see if it works right now I am speaking");

        assert_eq!(fresh, "right now I am speaking");
    }

    #[test]
    fn text_dedup_no_overlap_found() {
        let mut tracker = DeduplicationTracker::new();
        tracker.filter_text("Hello world");
        assert_eq!(
            tracker.filter_text("completely different text"),
            "completely different text"
        );
    }

    #[test]
    fn text_dedup_full_overlap() {
        let mut tracker = DeduplicationTracker::new();
        tracker.filter_text("Hello world foo bar baz");
        assert_eq!(tracker.filter_text("Hello world foo bar baz"), "");
    }

    #[test]
    fn text_dedup_sliding_window_sequence() {
        let mut tracker = DeduplicationTracker::new();
        let steps = [
            ("A B C D E F", "A B C D E F"),
            ("A B C D E F G H I", "G H I"),
            ("D E F G H I J K L", "J K L"),
        ];
        for (input, expected) in steps {
            assert_eq!(tracker.filter_text(input), expected);
        }
    }

    // ---- word-level helpers -------------------------------------------------

    #[test]
    fn normalize_word_strips_punctuation() {
        for (raw, expected) in [("Hello,", "hello"), ("world.", "world"), ("test", "test")] {
            assert_eq!(normalize_word(raw), expected);
        }
    }

    #[test]
    fn words_match_exact() {
        assert!(words_match("hello", "hello"));
        assert!(words_match("Hello", "hello"));
    }

    #[test]
    fn words_match_with_punctuation() {
        assert!(words_match("hello,", "hello"));
        assert!(words_match("world.", "world"));
    }

    #[test]
    fn words_match_fuzzy() {
        assert!(words_match("hello", "helo"));
    }

    #[test]
    fn words_dont_match_very_different() {
        assert!(!words_match("hello", "world"));
    }

    // ---- remove_overlap called directly -------------------------------------

    #[test]
    fn remove_overlap_anchor_search() {
        assert_eq!(
            remove_overlap("one two three four", "two three four five six"),
            "five six"
        );
    }

    #[test]
    fn remove_overlap_with_inserted_word() {
        let previous = "I think it is going to work";
        let new = "I really think it is going to work now";
        assert_eq!(remove_overlap(previous, new), "now");
    }

    #[test]
    fn remove_overlap_none() {
        assert_eq!(
            remove_overlap("hello world", "completely different"),
            "completely different"
        );
    }

    #[test]
    fn remove_overlap_prefix_fallback() {
        // Two words is below the three-word anchor minimum, so only the
        // suffix/prefix comparison can find this overlap.
        assert_eq!(remove_overlap("brown fox", "brown fox jumps"), "jumps");
    }
}