1use crate::decoder::TimedToken;
2
/// Granularity at which timestamps are reported by [`process_timestamps`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TimestampMode {
    /// One timestamp per decoder token (raw output, default).
    #[default]
    Tokens,
    /// Sub-word tokens merged into whole words.
    Words,
    /// Words merged into sentences ending at `.`, `?`, or `!`.
    Sentences,
}
29
30pub fn process_timestamps(tokens: &[TimedToken], mode: TimestampMode) -> Vec<TimedToken> {
44 match mode {
45 TimestampMode::Tokens => tokens.to_vec(),
46 TimestampMode::Words => group_by_words(tokens),
47 TimestampMode::Sentences => group_by_sentences(tokens),
48 }
49}
50
/// Merge sub-word tokens into whole words, each carrying the start time of
/// its first token and the end time of its last.
///
/// Word boundaries come from SentencePiece-style markers: a token beginning
/// with `▁` or a literal space starts a new word, as does a token made
/// entirely of ASCII punctuation — unless the token (after stripping the
/// marker) begins with `'` (contraction, e.g. `'s`) or `-` (hyphenation,
/// e.g. `-two`), which glue onto the previous word instead.
///
/// Consecutive duplicate words (compared case-insensitively) are dropped,
/// which suppresses decoder repetition artifacts. NOTE(review): this also
/// collapses genuine repetitions such as "uh uh" at the word level.
fn group_by_words(tokens: &[TimedToken]) -> Vec<TimedToken> {
    if tokens.is_empty() {
        return Vec::new();
    }

    let mut words = Vec::new();
    // Text accumulated for the word currently being built.
    let mut current_word_text = String::new();
    let mut current_word_start = 0.0;
    // Lower-cased text of the last word emitted; used for de-duplication.
    let mut last_word_lower = String::new();

    for (i, token) in tokens.iter().enumerate() {
        // A whitespace-only token terminates the current word (if any)
        // without contributing any text of its own.
        if token.text.trim().is_empty() {
            if !current_word_text.is_empty() {
                let word_lower = current_word_text.to_lowercase();
                if word_lower != last_word_lower {
                    words.push(TimedToken {
                        text: current_word_text.clone(),
                        start: current_word_start,
                        // End at the previous token; fall back to this
                        // token's end if whitespace is somehow first.
                        end: if i > 0 { tokens[i - 1].end } else { token.end },
                    });
                    last_word_lower = word_lower;
                }
                current_word_text.clear();
            }
            continue;
        }

        // Punctuation-only tokens (".", ",", "?") start standalone "words".
        let is_pure_punctuation =
            !token.text.is_empty() && token.text.chars().all(|c| c.is_ascii_punctuation());

        let token_without_marker = token.text.trim_start_matches('▁').trim_start_matches(' ');
        // Contractions ("'s", "'ll") and hyphen continuations ("-two")
        // attach to the preceding word rather than starting a new one.
        let is_contraction = token_without_marker.starts_with('\'');
        let is_hyphenation = token_without_marker.starts_with('-');

        // The very first token always starts a word (i == 0).
        let starts_word =
            (token.text.starts_with('▁') || token.text.starts_with(' ') || is_pure_punctuation)
                && !is_contraction
                && !is_hyphenation
                || i == 0;

        // Flush the previously accumulated word before starting a new one.
        if starts_word && !current_word_text.is_empty() {
            let word_lower = current_word_text.to_lowercase();
            if word_lower != last_word_lower {
                words.push(TimedToken {
                    text: current_word_text.clone(),
                    start: current_word_start,
                    end: tokens[i - 1].end,
                });
                last_word_lower = word_lower;
            }
            current_word_text.clear();
        }

        // First token of a new word fixes the word's start time.
        if current_word_text.is_empty() {
            current_word_start = token.start;
        }

        // Append the token text with its word-boundary marker stripped.
        let token_text = token.text.trim_start_matches('▁').trim_start_matches(' ');
        current_word_text.push_str(token_text);
    }

    // Flush the final word, still subject to de-duplication.
    if !current_word_text.is_empty() {
        let word_lower = current_word_text.to_lowercase();
        if word_lower != last_word_lower {
            words.push(TimedToken {
                text: current_word_text,
                start: current_word_start,
                end: tokens.last().unwrap().end,
            });
        }
    }

    words
}
137
138fn group_by_sentences(tokens: &[TimedToken]) -> Vec<TimedToken> {
140 let words = group_by_words(tokens);
142 if words.is_empty() {
143 return Vec::new();
144 }
145
146 let mut sentences = Vec::new();
147 let mut current_sentence = Vec::new();
148
149 for word in words {
150 current_sentence.push(word.clone());
151
152 let ends_sentence =
154 word.text.contains('.') || word.text.contains('?') || word.text.contains('!');
155
156 if ends_sentence {
157 let sentence_text = format_sentence(¤t_sentence);
158 let start = current_sentence.first().unwrap().start;
159 let end = current_sentence.last().unwrap().end;
160
161 if !sentence_text.is_empty() {
162 sentences.push(TimedToken {
163 text: sentence_text,
164 start,
165 end,
166 });
167 }
168 current_sentence.clear();
169 }
170 }
171
172 if !current_sentence.is_empty() {
174 let sentence_text = format_sentence(¤t_sentence);
175 let start = current_sentence.first().unwrap().start;
176 let end = current_sentence.last().unwrap().end;
177
178 if !sentence_text.is_empty() {
179 sentences.push(TimedToken {
180 text: sentence_text,
181 start,
182 end,
183 });
184 }
185 }
186
187 sentences
188}
189
190fn format_sentence(words: &[TimedToken]) -> String {
192 let result: Vec<&str> = words.iter().map(|w| w.text.as_str()).collect();
193
194 let mut output = String::new();
196 for (i, word) in result.iter().enumerate() {
197 let is_standalone_punct = word.len() == 1
200 && word
201 .chars()
202 .all(|c| matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')'));
203
204 if i > 0 && !is_standalone_punct {
205 output.push(' ');
206 }
207 output.push_str(word);
208 }
209 output
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    // Two `▁`-marked tokens become two separate words with markers stripped.
    #[test]
    fn test_word_grouping() {
        let tokens = vec![
            TimedToken {
                text: "▁Hello".to_string(),
                start: 0.0,
                end: 0.5,
            },
            TimedToken {
                text: "▁world".to_string(),
                start: 0.5,
                end: 1.0,
            },
        ];

        let words = group_by_words(&tokens);
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[1].text, "world");
    }

    // A token starting with '-' glues onto the previous word ("twenty-two")
    // and the merged word spans both tokens' time range.
    #[test]
    fn test_word_grouping_with_hyphenated_word() {
        let tokens = vec![
            TimedToken {
                text: "▁twenty".to_string(),
                start: 0.0,
                end: 0.3,
            },
            TimedToken {
                text: "-two".to_string(),
                start: 0.3,
                end: 0.6,
            },
            TimedToken {
                text: "▁apples".to_string(),
                start: 0.6,
                end: 1.0,
            },
        ];

        let words = group_by_words(&tokens);
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "twenty-two");
        assert_eq!(words[1].text, "apples");
        assert_eq!(words[0].start, 0.0);
        assert_eq!(words[0].end, 0.6);
        assert_eq!(words[1].start, 0.6);
        assert_eq!(words[1].end, 1.0);
    }

    // A trailing "." token ends the sentence; the sentence spans from the
    // first word's start to the punctuation token's end, with the period
    // attached to the last word without a space.
    #[test]
    fn test_sentence_grouping() {
        let tokens = vec![
            TimedToken {
                text: "▁Hello".to_string(),
                start: 0.0,
                end: 0.5,
            },
            TimedToken {
                text: "▁world".to_string(),
                start: 0.5,
                end: 1.0,
            },
            TimedToken {
                text: ".".to_string(),
                start: 1.0,
                end: 1.1,
            },
        ];

        let sentences = group_by_sentences(&tokens);
        assert_eq!(sentences.len(), 1);
        assert_eq!(sentences[0].text, "Hello world.");
        assert_eq!(sentences[0].start, 0.0);
        assert_eq!(sentences[0].end, 1.1);
    }

    // format_sentence itself must NOT deduplicate repeated words — the
    // dedup happens earlier, in group_by_words.
    #[test]
    fn test_repetition_preservation() {
        let words = vec![
            TimedToken {
                text: "uh".to_string(),
                start: 0.0,
                end: 0.5,
            },
            TimedToken {
                text: "uh".to_string(),
                start: 0.5,
                end: 1.0,
            },
            TimedToken {
                text: "hello".to_string(),
                start: 1.0,
                end: 1.5,
            },
        ];

        let result = format_sentence(&words);
        assert_eq!(result, "uh uh hello");
    }

    // A whitespace-only token acts as a word separator, so the digit tokens
    // after it merge into "100" rather than extending "like".
    #[test]
    fn test_space_token_separates_words_from_digits() {
        let tokens = vec![
            TimedToken {
                text: " like".to_string(),
                start: 0.0,
                end: 0.5,
            },
            TimedToken {
                text: " ".to_string(),
                start: 0.5,
                end: 0.5,
            },
            TimedToken {
                text: "1".to_string(),
                start: 0.5,
                end: 0.6,
            },
            TimedToken {
                text: "0".to_string(),
                start: 0.6,
                end: 0.7,
            },
            TimedToken {
                text: "0".to_string(),
                start: 0.7,
                end: 0.8,
            },
        ];

        let words = group_by_words(&tokens);
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "like");
        assert_eq!(words[1].text, "100");

        let sentence = format_sentence(&words);
        assert_eq!(sentence, "like 100");
    }
}