1use super::{Chunk, Chunker, RecursiveChunker};
4use crate::{Document, Error, Result};
5
/// Chunker that groups subtitle cues into chunks bounded by playback time.
///
/// All settings are expressed in seconds.
#[derive(Debug, Clone)]
// Every field ends in `_secs`, which is what this lint objects to.
#[allow(clippy::struct_field_names)]
pub struct TimestampChunker {
    /// Time span at which the cues gathered so far are emitted as a chunk.
    target_duration_secs: f64,
    /// A trailing chunk spanning less than this is folded into the chunk before it.
    min_duration_secs: f64,
    /// Configured upper bound for a chunk. It is stored and settable, but the
    /// chunking code visible in this file never reads it, hence the allow.
    #[allow(dead_code)]
    max_duration_secs: f64,
    /// How far before a chunk boundary cues are repeated at the start of the next chunk.
    overlap_secs: f64,
}
53
54impl TimestampChunker {
55 #[must_use]
57 pub fn new(target_duration_secs: f64) -> Self {
58 Self {
59 target_duration_secs,
60 min_duration_secs: 10.0,
61 max_duration_secs: target_duration_secs * 2.0,
62 overlap_secs: 5.0,
63 }
64 }
65
66 #[must_use]
68 pub fn with_min_duration(mut self, secs: f64) -> Self {
69 self.min_duration_secs = secs;
70 self
71 }
72
73 #[must_use]
75 pub fn with_max_duration(mut self, secs: f64) -> Self {
76 self.max_duration_secs = secs;
77 self
78 }
79
80 #[must_use]
82 pub fn with_overlap(mut self, secs: f64) -> Self {
83 self.overlap_secs = secs;
84 self
85 }
86
87 #[allow(clippy::cast_sign_loss)]
89 #[allow(clippy::disallowed_methods)] fn build_chunk(
91 document: &Document,
92 cues: &[&crate::media::SubtitleCue],
93 chunk_start_secs: f64,
94 ) -> Chunk {
95 let text: String = cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
96
97 let start_secs = cues.first().map(|c| c.start_secs).unwrap_or(chunk_start_secs);
98 let end_secs = cues.last().map(|c| c.end_secs).unwrap_or(chunk_start_secs);
99
100 let mut chunk =
101 Chunk::new(document.id, text, start_secs.max(0.0) as usize, end_secs.max(0.0) as usize);
102 chunk.metadata.title = document.title.clone();
103 chunk.metadata.custom.insert("start_secs".into(), serde_json::json!(start_secs));
104 chunk.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
105 chunk.metadata.custom.insert(
106 "start_display".into(),
107 serde_json::json!(crate::media::format_display_time(start_secs)),
108 );
109 chunk.metadata.custom.insert(
110 "end_display".into(),
111 serde_json::json!(crate::media::format_display_time(end_secs)),
112 );
113 chunk.metadata.custom.insert("cue_count".into(), serde_json::json!(cues.len()));
114 chunk
115 }
116}
117
118const DEFAULT_TARGET_DURATION: f64 = 60.0;
120
121impl Default for TimestampChunker {
122 fn default() -> Self {
123 Self {
124 target_duration_secs: DEFAULT_TARGET_DURATION,
125 min_duration_secs: 10.0,
126 max_duration_secs: DEFAULT_TARGET_DURATION * 2.0,
127 overlap_secs: 5.0,
128 }
129 }
130}
131
132impl Chunker for TimestampChunker {
133 fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
134 if document.content.is_empty() {
135 return Err(Error::EmptyDocument(
136 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
137 ));
138 }
139
140 let cues: Vec<crate::media::SubtitleCue> = document
142 .metadata
143 .get("subtitle_cues")
144 .and_then(|v| serde_json::from_value(v.clone()).ok())
145 .unwrap_or_default();
146
147 if cues.is_empty() {
148 return RecursiveChunker::new(512, 50).chunk(document);
150 }
151
152 let mut chunks = Vec::new();
153 let mut current_cues: Vec<&crate::media::SubtitleCue> = Vec::new();
154 let mut chunk_start = cues[0].start_secs;
155
156 for cue in &cues {
157 let current_duration = cue.end_secs - chunk_start;
158
159 if current_duration >= self.target_duration_secs && !current_cues.is_empty() {
161 chunks.push(Self::build_chunk(document, ¤t_cues, chunk_start));
162
163 let overlap_start = cue.start_secs - self.overlap_secs;
165 current_cues.retain(|c| c.start_secs >= overlap_start);
166 chunk_start = current_cues.first().map(|c| c.start_secs).unwrap_or(cue.start_secs);
167 }
168
169 current_cues.push(cue);
170 }
171
172 if !current_cues.is_empty() {
174 let final_duration =
175 current_cues.last().map(|c| c.end_secs).unwrap_or(0.0) - chunk_start;
176
177 if final_duration < self.min_duration_secs && !chunks.is_empty() {
178 if let Some(last) = chunks.last_mut() {
180 let extra_text: String =
181 current_cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
182 last.content.push(' ');
183 last.content.push_str(&extra_text);
184
185 let end_secs = current_cues.last().map(|c| c.end_secs).unwrap_or(0.0);
186 #[allow(clippy::cast_sign_loss)]
187 {
188 last.end_offset = end_secs.max(0.0) as usize;
189 }
190 last.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
191 last.metadata.custom.insert(
192 "end_display".into(),
193 serde_json::json!(crate::media::format_display_time(end_secs)),
194 );
195 }
196 } else {
197 chunks.push(Self::build_chunk(document, ¤t_cues, chunk_start));
198 }
199 }
200
201 Ok(chunks)
202 }
203
204 fn estimate_chunks(&self, document: &Document) -> usize {
205 let duration =
206 document.metadata.get("duration_secs").and_then(|v| v.as_f64()).unwrap_or(0.0);
207
208 if duration <= 0.0 || self.target_duration_secs <= 0.0 {
209 return usize::from(!document.content.is_empty());
210 }
211 #[allow(clippy::cast_sign_loss)]
212 let estimate = (duration / self.target_duration_secs).ceil() as usize;
213 estimate
214 }
215}
216
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Document;

    /// Builds cues from `(start_secs, end_secs, text)` triples, indexed in order.
    fn make_cues(durations: &[(f64, f64, &str)]) -> Vec<crate::media::SubtitleCue> {
        let mut cues = Vec::with_capacity(durations.len());
        for (index, &(start_secs, end_secs, text)) in durations.iter().enumerate() {
            cues.push(crate::media::SubtitleCue {
                index,
                start_secs,
                end_secs,
                text: text.to_string(),
            });
        }
        cues
    }

    /// Builds a document whose content is the joined cue text and whose
    /// metadata carries the cues plus the end time of the last cue.
    fn doc_with_cues(cues: &[crate::media::SubtitleCue]) -> Document {
        let texts: Vec<&str> = cues.iter().map(|cue| cue.text.as_str()).collect();
        let total_secs = cues.last().map_or(0.0, |cue| cue.end_secs);

        let mut doc = Document::new(texts.join(" "));
        doc.metadata.insert("subtitle_cues".into(), serde_json::to_value(cues).unwrap());
        doc.metadata.insert("duration_secs".into(), serde_json::json!(total_secs));
        doc
    }

    /// True when `actual` is within 0.01 of `expected`.
    fn close_to(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < 0.01
    }

    #[test]
    fn test_timestamp_chunker_basic() {
        let doc = doc_with_cues(&make_cues(&[
            (0.0, 25.0, "First segment."),
            (25.0, 50.0, "Second segment."),
            (50.0, 75.0, "Third segment."),
            (75.0, 100.0, "Fourth segment."),
        ]));

        let chunks = TimestampChunker::new(60.0).chunk(&doc).unwrap();

        assert!(chunks.len() >= 2, "Expected at least 2 chunks, got {}", chunks.len());
        for chunk in &chunks {
            for key in ["start_secs", "end_secs", "start_display", "end_display", "cue_count"] {
                assert!(chunk.metadata.custom.contains_key(key));
            }
        }
    }

    #[test]
    fn test_timestamp_chunker_single_short_chunk() {
        let doc =
            doc_with_cues(&make_cues(&[(0.0, 10.0, "Only one."), (10.0, 20.0, "Short transcript.")]));

        let chunks = TimestampChunker::new(60.0).chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_timestamp_chunker_fallback_no_cues() {
        let doc = Document::new("Plain text without any subtitle metadata.");

        let chunks = TimestampChunker::new(60.0).chunk(&doc).unwrap();
        assert!(!chunks.is_empty());
        // Fallback chunks must not carry timestamp metadata.
        assert!(!chunks[0].metadata.custom.contains_key("start_secs"));
    }

    #[test]
    fn test_timestamp_chunker_empty_doc() {
        let doc = Document::new("");
        let outcome = TimestampChunker::new(60.0).chunk(&doc);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_timestamp_chunker_metadata_values() {
        let doc = doc_with_cues(&make_cues(&[
            (60.0, 90.0, "Starts at one minute."),
            (90.0, 120.0, "Ends at two minutes."),
        ]));

        let chunks = TimestampChunker::new(120.0).chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);

        let meta = &chunks[0].metadata.custom;
        let start = meta["start_secs"].as_f64().unwrap();
        let end = meta["end_secs"].as_f64().unwrap();
        assert!(close_to(start, 60.0));
        assert!(close_to(end, 120.0));
        assert_eq!(meta["start_display"], "1:00");
        assert_eq!(meta["end_display"], "2:00");
    }

    #[test]
    fn test_timestamp_chunker_estimate() {
        let mut doc = Document::new("content");
        doc.metadata.insert("duration_secs".into(), serde_json::json!(300.0));

        assert_eq!(TimestampChunker::new(60.0).estimate_chunks(&doc), 5);
    }

    #[test]
    fn test_timestamp_chunker_estimate_no_duration() {
        let doc = Document::new("content");
        assert_eq!(TimestampChunker::new(60.0).estimate_chunks(&doc), 1);
    }

    #[test]
    fn test_timestamp_chunker_merge_short_final() {
        let doc = doc_with_cues(&make_cues(&[
            (0.0, 30.0, "First."),
            (30.0, 60.0, "Second."),
            (60.0, 65.0, "Tiny final."),
        ]));

        let chunks = TimestampChunker::new(55.0).with_min_duration(10.0).chunk(&doc).unwrap();

        // The short closing cue must end up in the last chunk.
        let last_text = &chunks.last().unwrap().content;
        assert!(last_text.contains("Tiny final"), "Last chunk: {last_text}");
    }

    #[test]
    fn test_timestamp_chunker_all_text_represented() {
        let cues = make_cues(&[
            (0.0, 20.0, "Alpha."),
            (20.0, 40.0, "Beta."),
            (40.0, 60.0, "Gamma."),
            (60.0, 80.0, "Delta."),
            (80.0, 100.0, "Epsilon."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunks = TimestampChunker::new(45.0).with_overlap(0.0).chunk(&doc).unwrap();

        // Every cue's text has to appear in at least one chunk.
        for cue in &cues {
            let found = chunks.iter().any(|chunk| chunk.content.contains(&cue.text));
            assert!(found, "Cue text '{}' not found in any chunk", cue.text);
        }
    }

    #[test]
    fn test_timestamp_chunker_default() {
        let chunker = TimestampChunker::default();
        assert!(close_to(chunker.target_duration_secs, 60.0));
        assert!(close_to(chunker.min_duration_secs, 10.0));
        assert!(close_to(chunker.max_duration_secs, 120.0));
        assert!(close_to(chunker.overlap_secs, 5.0));
    }

    #[test]
    fn test_timestamp_chunker_builder() {
        let chunker = TimestampChunker::new(30.0)
            .with_min_duration(5.0)
            .with_max_duration(90.0)
            .with_overlap(3.0);
        assert!(close_to(chunker.target_duration_secs, 30.0));
        assert!(close_to(chunker.min_duration_secs, 5.0));
        assert!(close_to(chunker.max_duration_secs, 90.0));
        assert!(close_to(chunker.overlap_secs, 3.0));
    }
}