//! parakeet_rs/timestamps.rs — groups token-level timestamps into words or sentences.
1use crate::decoder::TimedToken;
2
/// Timestamp output mode for transcription results
///
/// Determines how token-level timestamps are grouped and presented:
/// - `Tokens`: Raw token-level output from the model (most detailed)
/// - `Words`: Tokens grouped into individual words
/// - `Sentences`: Tokens grouped by sentence boundaries (., ?, !)
///
/// # Model-Specific Recommendations
///
/// - **Parakeet CTC (English)**: Use `Words` mode. The CTC model only outputs lowercase
///   alphabet without punctuation, so sentence segmentation is not possible.
/// - **Parakeet TDT (Multilingual)**: Use `Sentences` mode. The TDT model predicts
///   punctuation, enabling natural sentence boundaries.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TimestampMode {
    /// Raw token-level timestamps from the model (the default).
    #[default]
    Tokens,
    /// Word-level timestamps (groups subword tokens)
    Words,
    /// Sentence-level timestamps (groups by punctuation)
    ///
    /// Note: Only works with models that predict punctuation (e.g., Parakeet TDT).
    /// CTC models don't predict punctuation, so use `Words` mode instead.
    Sentences,
}
29
30/// Convert token timestamps to the requested output mode
31///
32/// Takes raw token-level timestamps from the model and optionally groups them
33/// into words or sentences while preserving the original timing information.
34///
35/// # Arguments
36///
37/// * `tokens` - Raw token-level timestamps from model output
38/// * `mode` - Desired grouping level (Tokens, Words, or Sentences)
39///
40/// # Returns
41///
42/// Vector of TimedToken with timestamps at the requested granularity
43pub fn process_timestamps(tokens: &[TimedToken], mode: TimestampMode) -> Vec<TimedToken> {
44    match mode {
45        TimestampMode::Tokens => tokens.to_vec(),
46        TimestampMode::Words => group_by_words(tokens),
47        TimestampMode::Sentences => group_by_sentences(tokens),
48    }
49}
50
// Group tokens into words based on word boundary markers
//
// SentencePiece-style tokens mark the start of a word with a leading '▁' (or a
// plain space after detokenization). Tokens are accumulated into `current_word_text`
// until the next boundary; each finished word keeps the start time of its first
// token and the end time of its last token.
//
// NOTE(review): consecutive words that are identical case-insensitively are dropped
// (the `last_word_lower` checks). This looks like deliberate stutter/duplicate
// removal, but it would also drop legitimate repeats such as "had had" — confirm
// the intent with callers before relying on exact word counts.
fn group_by_words(tokens: &[TimedToken]) -> Vec<TimedToken> {
    if tokens.is_empty() {
        return Vec::new();
    }

    let mut words = Vec::new();
    // Text of the word currently being assembled (boundary markers stripped).
    let mut current_word_text = String::new();
    // Start time of the first token contributing to the current word.
    let mut current_word_start = 0.0;
    // Lowercased text of the last emitted word, used to suppress consecutive duplicates.
    let mut last_word_lower = String::new();

    for (i, token) in tokens.iter().enumerate() {
        // Space-only tokens (from SentencePiece ▁ word boundaries) act as word separators
        // but don't contribute text. Save current word if we hit one.
        if token.text.trim().is_empty() {
            if !current_word_text.is_empty() {
                let word_lower = current_word_text.to_lowercase();
                if word_lower != last_word_lower {
                    words.push(TimedToken {
                        text: current_word_text.clone(),
                        start: current_word_start,
                        // The word ends where its last contributing token ended
                        // (the separator itself carries no word text).
                        end: if i > 0 { tokens[i - 1].end } else { token.end },
                    });
                    last_word_lower = word_lower;
                }
                current_word_text.clear();
            }
            continue;
        }

        // Check if this starts a new word (SentencePiece uses ▁ or space prefix)
        // Also treat PURE punctuation marks (like ".", ",") as separate words
        // But NOT contractions like "'re" or "'s" or hyphenations like "-two" (ex. twenty-two) which should attach to previous word
        let is_pure_punctuation =
            !token.text.is_empty() && token.text.chars().all(|c| c.is_ascii_punctuation());

        // Check if this is a contraction or hyphenation suffix
        // These should NOT start a new word - they attach to the previous word
        let token_without_marker = token.text.trim_start_matches('▁').trim_start_matches(' ');
        let is_contraction = token_without_marker.starts_with('\'');
        let is_hyphenation = token_without_marker.starts_with('-');

        // `&&` binds tighter than `||`, so the trailing `|| i == 0` makes the very
        // first token always begin a word regardless of prefix.
        let starts_word =
            (token.text.starts_with('▁') || token.text.starts_with(' ') || is_pure_punctuation)
                && !is_contraction
                && !is_hyphenation
                || i == 0;

        if starts_word && !current_word_text.is_empty() {
            // Save previous word (with deduplication)
            let word_lower = current_word_text.to_lowercase();
            if word_lower != last_word_lower {
                words.push(TimedToken {
                    text: current_word_text.clone(),
                    start: current_word_start,
                    end: tokens[i - 1].end,
                });
                last_word_lower = word_lower;
            }
            current_word_text.clear();
        }

        // Start new word or append to current
        if current_word_text.is_empty() {
            current_word_start = token.start;
        }

        // Add token text, removing word boundary markers
        let token_text = token.text.trim_start_matches('▁').trim_start_matches(' ');
        current_word_text.push_str(token_text);
    }

    // Add final word
    if !current_word_text.is_empty() {
        let word_lower = current_word_text.to_lowercase();
        if word_lower != last_word_lower {
            words.push(TimedToken {
                text: current_word_text,
                start: current_word_start,
                // unwrap is safe: tokens was checked non-empty at function entry.
                end: tokens.last().unwrap().end,
            });
        }
    }

    words
}
137
138// Group words into sentences based on punctuation
139fn group_by_sentences(tokens: &[TimedToken]) -> Vec<TimedToken> {
140    // First get word-level grouping
141    let words = group_by_words(tokens);
142    if words.is_empty() {
143        return Vec::new();
144    }
145
146    let mut sentences = Vec::new();
147    let mut current_sentence = Vec::new();
148
149    for word in words {
150        current_sentence.push(word.clone());
151
152        // Check if word ends with sentence terminator
153        let ends_sentence =
154            word.text.contains('.') || word.text.contains('?') || word.text.contains('!');
155
156        if ends_sentence {
157            let sentence_text = format_sentence(&current_sentence);
158            let start = current_sentence.first().unwrap().start;
159            let end = current_sentence.last().unwrap().end;
160
161            if !sentence_text.is_empty() {
162                sentences.push(TimedToken {
163                    text: sentence_text,
164                    start,
165                    end,
166                });
167            }
168            current_sentence.clear();
169        }
170    }
171
172    // Add final sentence if exists
173    if !current_sentence.is_empty() {
174        let sentence_text = format_sentence(&current_sentence);
175        let start = current_sentence.first().unwrap().start;
176        let end = current_sentence.last().unwrap().end;
177
178        if !sentence_text.is_empty() {
179            sentences.push(TimedToken {
180                text: sentence_text,
181                start,
182                end,
183            });
184        }
185    }
186
187    sentences
188}
189
190// Join words with punctuation spacing
191fn format_sentence(words: &[TimedToken]) -> String {
192    let result: Vec<&str> = words.iter().map(|w| w.text.as_str()).collect();
193
194    // Join words, but don't add space before certain punctuation
195    let mut output = String::new();
196    for (i, word) in result.iter().enumerate() {
197        // Check if this word is standalone punctuation that shouldn't have space before it
198        // Contractions like "'re" or "'s" should have spaces before them
199        let is_standalone_punct = word.len() == 1
200            && word
201                .chars()
202                .all(|c| matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')'));
203
204        if i > 0 && !is_standalone_punct {
205            output.push(' ');
206        }
207        output.push_str(word);
208    }
209    output
210}
211
212#[cfg(test)]
213mod tests {
214    use super::*;
215
216    #[test]
217    fn test_word_grouping() {
218        let tokens = vec![
219            TimedToken {
220                text: "▁Hello".to_string(),
221                start: 0.0,
222                end: 0.5,
223            },
224            TimedToken {
225                text: "▁world".to_string(),
226                start: 0.5,
227                end: 1.0,
228            },
229        ];
230
231        let words = group_by_words(&tokens);
232        assert_eq!(words.len(), 2);
233        assert_eq!(words[0].text, "Hello");
234        assert_eq!(words[1].text, "world");
235    }
236
237    #[test]
238    fn test_word_grouping_with_hyphenated_word() {
239        let tokens = vec![
240            TimedToken {
241                text: "▁twenty".to_string(),
242                start: 0.0,
243                end: 0.3,
244            },
245            TimedToken {
246                text: "-two".to_string(),
247                start: 0.3,
248                end: 0.6,
249            },
250            TimedToken {
251                text: "▁apples".to_string(),
252                start: 0.6,
253                end: 1.0,
254            },
255        ];
256
257        let words = group_by_words(&tokens);
258        assert_eq!(words.len(), 2);
259        assert_eq!(words[0].text, "twenty-two");
260        assert_eq!(words[1].text, "apples");
261        assert_eq!(words[0].start, 0.0);
262        assert_eq!(words[0].end, 0.6);
263        assert_eq!(words[1].start, 0.6);
264        assert_eq!(words[1].end, 1.0);
265    }
266
267    #[test]
268    fn test_sentence_grouping() {
269        let tokens = vec![
270            TimedToken {
271                text: "▁Hello".to_string(),
272                start: 0.0,
273                end: 0.5,
274            },
275            TimedToken {
276                text: "▁world".to_string(),
277                start: 0.5,
278                end: 1.0,
279            },
280            TimedToken {
281                text: ".".to_string(),
282                start: 1.0,
283                end: 1.1,
284            },
285        ];
286
287        let sentences = group_by_sentences(&tokens);
288        assert_eq!(sentences.len(), 1);
289        assert_eq!(sentences[0].text, "Hello world.");
290        assert_eq!(sentences[0].start, 0.0);
291        assert_eq!(sentences[0].end, 1.1);
292    }
293
294    #[test]
295    fn test_repetition_preservation() {
296        let words = vec![
297            TimedToken {
298                text: "uh".to_string(),
299                start: 0.0,
300                end: 0.5,
301            },
302            TimedToken {
303                text: "uh".to_string(),
304                start: 0.5,
305                end: 1.0,
306            },
307            TimedToken {
308                text: "hello".to_string(),
309                start: 1.0,
310                end: 1.5,
311            },
312        ];
313
314        let result = format_sentence(&words);
315        assert_eq!(result, "uh uh hello");
316    }
317
318    #[test]
319    fn test_space_token_separates_words_from_digits() {
320        // Simulates "like 100" tokenized as [" like", " ", "1", "0", "0"]
321        // The space-only token should act as word boundary
322        let tokens = vec![
323            TimedToken {
324                text: " like".to_string(),
325                start: 0.0,
326                end: 0.5,
327            },
328            TimedToken {
329                text: " ".to_string(), // Space-only token from ▁
330                start: 0.5,
331                end: 0.5,
332            },
333            TimedToken {
334                text: "1".to_string(),
335                start: 0.5,
336                end: 0.6,
337            },
338            TimedToken {
339                text: "0".to_string(),
340                start: 0.6,
341                end: 0.7,
342            },
343            TimedToken {
344                text: "0".to_string(),
345                start: 0.7,
346                end: 0.8,
347            },
348        ];
349
350        let words = group_by_words(&tokens);
351        assert_eq!(words.len(), 2);
352        assert_eq!(words[0].text, "like");
353        assert_eq!(words[1].text, "100");
354
355        // Also test sentence formatting
356        let sentence = format_sentence(&words);
357        assert_eq!(sentence, "like 100");
358    }
359}