1use super::{Chunk, Chunker, RecursiveChunker};
4use crate::{Document, Error, Result};
5
/// Configuration for a chunker that groups subtitle cues by their timestamps.
///
/// Every field is a duration in seconds.
// `PartialEq` allows whole configurations to be compared directly (only partial
// equality is possible because the fields are floats).
#[derive(Debug, Clone, PartialEq)]
// All fields deliberately share the `_secs` unit suffix, which is the pattern this lint flags.
#[allow(clippy::struct_field_names)]
pub struct TimestampChunker {
    /// Span at which the chunk being accumulated is closed and a new one is started.
    target_duration_secs: f64,
    /// A trailing chunk spanning less than this is merged into the chunk before it.
    min_duration_secs: f64,
    /// Intended upper bound on a chunk's span (going by its name). It is stored and
    /// configurable, but no non-test code in this file reads it yet.
    // Without this allowance the never-read field would trigger `dead_code`.
    #[allow(dead_code)]
    max_duration_secs: f64,
    /// How far before a chunk boundary a cue may start and still be repeated in the next chunk.
    overlap_secs: f64,
}
53
54impl TimestampChunker {
55 #[must_use]
57 pub fn new(target_duration_secs: f64) -> Self {
58 Self {
59 target_duration_secs,
60 min_duration_secs: 10.0,
61 max_duration_secs: target_duration_secs * 2.0,
62 overlap_secs: 5.0,
63 }
64 }
65
66 #[must_use]
68 pub fn with_min_duration(mut self, secs: f64) -> Self {
69 self.min_duration_secs = secs;
70 self
71 }
72
73 #[must_use]
75 pub fn with_max_duration(mut self, secs: f64) -> Self {
76 self.max_duration_secs = secs;
77 self
78 }
79
80 #[must_use]
82 pub fn with_overlap(mut self, secs: f64) -> Self {
83 self.overlap_secs = secs;
84 self
85 }
86
87 #[allow(clippy::cast_sign_loss)]
89 fn build_chunk(
90 document: &Document,
91 cues: &[&crate::media::SubtitleCue],
92 chunk_start_secs: f64,
93 ) -> Chunk {
94 let text: String = cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
95
96 let start_secs = cues.first().map(|c| c.start_secs).unwrap_or(chunk_start_secs);
97 let end_secs = cues.last().map(|c| c.end_secs).unwrap_or(chunk_start_secs);
98
99 let mut chunk =
100 Chunk::new(document.id, text, start_secs.max(0.0) as usize, end_secs.max(0.0) as usize);
101 chunk.metadata.title = document.title.clone();
102 chunk.metadata.custom.insert("start_secs".into(), serde_json::json!(start_secs));
103 chunk.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
104 chunk.metadata.custom.insert(
105 "start_display".into(),
106 serde_json::json!(crate::media::format_display_time(start_secs)),
107 );
108 chunk.metadata.custom.insert(
109 "end_display".into(),
110 serde_json::json!(crate::media::format_display_time(end_secs)),
111 );
112 chunk.metadata.custom.insert("cue_count".into(), serde_json::json!(cues.len()));
113 chunk
114 }
115}
116
117const DEFAULT_TARGET_DURATION: f64 = 60.0;
119
120impl Default for TimestampChunker {
121 fn default() -> Self {
122 Self {
123 target_duration_secs: DEFAULT_TARGET_DURATION,
124 min_duration_secs: 10.0,
125 max_duration_secs: DEFAULT_TARGET_DURATION * 2.0,
126 overlap_secs: 5.0,
127 }
128 }
129}
130
131impl Chunker for TimestampChunker {
132 fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
133 if document.content.is_empty() {
134 return Err(Error::EmptyDocument(
135 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
136 ));
137 }
138
139 let cues: Vec<crate::media::SubtitleCue> = document
141 .metadata
142 .get("subtitle_cues")
143 .and_then(|v| serde_json::from_value(v.clone()).ok())
144 .unwrap_or_default();
145
146 if cues.is_empty() {
147 return RecursiveChunker::new(512, 50).chunk(document);
149 }
150
151 let mut chunks = Vec::new();
152 let mut current_cues: Vec<&crate::media::SubtitleCue> = Vec::new();
153 let mut chunk_start = cues[0].start_secs;
154
155 for cue in &cues {
156 let current_duration = cue.end_secs - chunk_start;
157
158 if current_duration >= self.target_duration_secs && !current_cues.is_empty() {
160 chunks.push(Self::build_chunk(document, ¤t_cues, chunk_start));
161
162 let overlap_start = cue.start_secs - self.overlap_secs;
164 current_cues.retain(|c| c.start_secs >= overlap_start);
165 chunk_start = current_cues.first().map(|c| c.start_secs).unwrap_or(cue.start_secs);
166 }
167
168 current_cues.push(cue);
169 }
170
171 if !current_cues.is_empty() {
173 let final_duration =
174 current_cues.last().map(|c| c.end_secs).unwrap_or(0.0) - chunk_start;
175
176 if final_duration < self.min_duration_secs && !chunks.is_empty() {
177 if let Some(last) = chunks.last_mut() {
179 let extra_text: String =
180 current_cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
181 last.content.push(' ');
182 last.content.push_str(&extra_text);
183
184 let end_secs = current_cues.last().map(|c| c.end_secs).unwrap_or(0.0);
185 #[allow(clippy::cast_sign_loss)]
186 {
187 last.end_offset = end_secs.max(0.0) as usize;
188 }
189 last.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
190 last.metadata.custom.insert(
191 "end_display".into(),
192 serde_json::json!(crate::media::format_display_time(end_secs)),
193 );
194 }
195 } else {
196 chunks.push(Self::build_chunk(document, ¤t_cues, chunk_start));
197 }
198 }
199
200 Ok(chunks)
201 }
202
203 fn estimate_chunks(&self, document: &Document) -> usize {
204 let duration =
205 document.metadata.get("duration_secs").and_then(|v| v.as_f64()).unwrap_or(0.0);
206
207 if duration <= 0.0 || self.target_duration_secs <= 0.0 {
208 return usize::from(!document.content.is_empty());
209 }
210 #[allow(clippy::cast_sign_loss)]
211 let estimate = (duration / self.target_duration_secs).ceil() as usize;
212 estimate
213 }
214}
215
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Document;

    /// Builds cues from `(start_secs, end_secs, text)` triples, indexed in input order.
    fn make_cues(durations: &[(f64, f64, &str)]) -> Vec<crate::media::SubtitleCue> {
        durations
            .iter()
            .enumerate()
            .map(|(i, (start, end, text))| crate::media::SubtitleCue {
                index: i,
                start_secs: *start,
                end_secs: *end,
                text: (*text).to_string(),
            })
            .collect()
    }

    /// Builds a document whose content is the joined cue text and whose metadata carries
    /// the serialized cues plus the end time of the last cue as `duration_secs`.
    fn doc_with_cues(cues: &[crate::media::SubtitleCue]) -> Document {
        let text: String = cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
        let duration = cues.last().map(|c| c.end_secs).unwrap_or(0.0);
        let mut doc = Document::new(text);
        doc.metadata.insert("subtitle_cues".into(), serde_json::to_value(cues).unwrap());
        doc.metadata.insert("duration_secs".into(), serde_json::json!(duration));
        doc
    }

    #[test]
    fn test_timestamp_chunker_basic() {
        let cues = make_cues(&[
            (0.0, 25.0, "First segment."),
            (25.0, 50.0, "Second segment."),
            (50.0, 75.0, "Third segment."),
            (75.0, 100.0, "Fourth segment."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(60.0);
        let chunks = chunker.chunk(&doc).unwrap();

        assert!(chunks.len() >= 2, "Expected at least 2 chunks, got {}", chunks.len());
        for chunk in &chunks {
            assert!(chunk.metadata.custom.contains_key("start_secs"));
            assert!(chunk.metadata.custom.contains_key("end_secs"));
            assert!(chunk.metadata.custom.contains_key("start_display"));
            assert!(chunk.metadata.custom.contains_key("end_display"));
            assert!(chunk.metadata.custom.contains_key("cue_count"));
        }
    }

    #[test]
    fn test_timestamp_chunker_single_short_chunk() {
        let cues = make_cues(&[(0.0, 10.0, "Only one."), (10.0, 20.0, "Short transcript.")]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(60.0);
        let chunks = chunker.chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_timestamp_chunker_fallback_no_cues() {
        let doc = Document::new("Plain text without any subtitle metadata.");
        let chunker = TimestampChunker::new(60.0);
        let chunks = chunker.chunk(&doc).unwrap();
        assert!(!chunks.is_empty());
        // Chunks from the fallback path carry no timestamp metadata.
        assert!(!chunks[0].metadata.custom.contains_key("start_secs"));
    }

    #[test]
    fn test_timestamp_chunker_empty_doc() {
        let doc = Document::new("");
        let chunker = TimestampChunker::new(60.0);
        assert!(chunker.chunk(&doc).is_err());
    }

    #[test]
    fn test_timestamp_chunker_metadata_values() {
        let cues = make_cues(&[
            (60.0, 90.0, "Starts at one minute."),
            (90.0, 120.0, "Ends at two minutes."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(120.0);
        let chunks = chunker.chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);

        let start = chunks[0].metadata.custom["start_secs"].as_f64().unwrap();
        let end = chunks[0].metadata.custom["end_secs"].as_f64().unwrap();
        assert!((start - 60.0).abs() < 0.01);
        assert!((end - 120.0).abs() < 0.01);
        assert_eq!(chunks[0].metadata.custom["start_display"], "1:00");
        assert_eq!(chunks[0].metadata.custom["end_display"], "2:00");
    }

    #[test]
    fn test_timestamp_chunker_estimate() {
        let mut doc = Document::new("content");
        doc.metadata.insert("duration_secs".into(), serde_json::json!(300.0));

        let chunker = TimestampChunker::new(60.0);
        assert_eq!(chunker.estimate_chunks(&doc), 5);
    }

    #[test]
    fn test_timestamp_chunker_estimate_no_duration() {
        let doc = Document::new("content");
        let chunker = TimestampChunker::new(60.0);
        assert_eq!(chunker.estimate_chunks(&doc), 1);
    }

    #[test]
    fn test_timestamp_chunker_merge_short_final() {
        // The third cue brings the running span to 56s, reaching the 55s target, so the
        // first two cues are emitted as a chunk. Neither starts within the 5s overlap
        // window, so the remainder is just the 6s final cue. That is below the 10s
        // minimum, so it must be merged into the emitted chunk rather than standing alone.
        let cues = make_cues(&[
            (0.0, 30.0, "First."),
            (30.0, 50.0, "Second."),
            (50.0, 56.0, "Tiny final."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(55.0).with_min_duration(10.0);
        let chunks = chunker.chunk(&doc).unwrap();

        assert_eq!(chunks.len(), 1, "Short tail should be merged, got {} chunks", chunks.len());

        let last = chunks.last().unwrap();
        let last_text = &last.content;
        assert!(last_text.contains("First."), "Last chunk: {last_text}");
        assert!(last_text.contains("Second."), "Last chunk: {last_text}");
        assert!(last_text.contains("Tiny final"), "Last chunk: {last_text}");

        // Merging must also extend the chunk's end to the end of the merged cue.
        let end = last.metadata.custom["end_secs"].as_f64().unwrap();
        assert!((end - 56.0).abs() < 0.01);
        assert_eq!(last.end_offset, 56);
    }

    #[test]
    fn test_timestamp_chunker_all_text_represented() {
        let cues = make_cues(&[
            (0.0, 20.0, "Alpha."),
            (20.0, 40.0, "Beta."),
            (40.0, 60.0, "Gamma."),
            (60.0, 80.0, "Delta."),
            (80.0, 100.0, "Epsilon."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(45.0).with_overlap(0.0);
        let chunks = chunker.chunk(&doc).unwrap();

        // No cue may be dropped on a chunk boundary.
        for cue in &cues {
            assert!(
                chunks.iter().any(|c| c.content.contains(&cue.text)),
                "Cue text '{}' not found in any chunk",
                cue.text
            );
        }
    }

    #[test]
    fn test_timestamp_chunker_default() {
        let chunker = TimestampChunker::default();
        assert!((chunker.target_duration_secs - 60.0).abs() < 0.01);
        assert!((chunker.min_duration_secs - 10.0).abs() < 0.01);
        assert!((chunker.max_duration_secs - 120.0).abs() < 0.01);
        assert!((chunker.overlap_secs - 5.0).abs() < 0.01);
    }

    #[test]
    fn test_timestamp_chunker_builder() {
        let chunker = TimestampChunker::new(30.0)
            .with_min_duration(5.0)
            .with_max_duration(90.0)
            .with_overlap(3.0);
        assert!((chunker.target_duration_secs - 30.0).abs() < 0.01);
        assert!((chunker.min_duration_secs - 5.0).abs() < 0.01);
        assert!((chunker.max_duration_secs - 90.0).abs() < 0.01);
        assert!((chunker.overlap_secs - 3.0).abs() < 0.01);
    }
}