// trueno_rag/chunk/timestamp.rs

//! Timestamp-aware chunker for subtitle/transcript content.

use super::{Chunk, Chunker, RecursiveChunker};
use crate::{Document, Error, Result};

/// Chunker that splits subtitle/transcript documents along the time axis.
///
/// Consecutive subtitle cues are grouped until a target duration is reached,
/// instead of counting characters. Every chunk built from cues records
/// `start_secs` and `end_secs` (plus display strings and a cue count) in its
/// custom metadata, so retrieval results can be cited by timestamp.
///
/// Documents whose metadata has no usable `subtitle_cues` entry are handed to
/// [`RecursiveChunker`] instead.
///
/// # Example
///
/// ```rust
/// use trueno_rag::chunk::{TimestampChunker, Chunker};
/// use trueno_rag::Document;
/// use trueno_rag::media::SubtitleCue;
///
/// let cues = vec![
///     SubtitleCue { index: 0, start_secs: 0.0, end_secs: 30.0, text: "First segment.".into() },
///     SubtitleCue { index: 1, start_secs: 30.0, end_secs: 65.0, text: "Second segment.".into() },
///     SubtitleCue { index: 2, start_secs: 65.0, end_secs: 90.0, text: "Third segment.".into() },
/// ];
///
/// let mut doc = Document::new("First segment. Second segment. Third segment.");
/// doc.metadata.insert(
///     "subtitle_cues".into(),
///     serde_json::to_value(&cues).unwrap(),
/// );
/// doc.metadata.insert("duration_secs".into(), serde_json::json!(90.0));
///
/// let chunker = TimestampChunker::new(60.0);
/// let chunks = chunker.chunk(&doc).unwrap();
/// assert!(chunks.len() >= 2);
/// assert!(chunks[0].metadata.custom.contains_key("start_secs"));
/// ```
#[derive(Debug, Clone)]
// Every field intentionally shares the `_secs` suffix to make the unit explicit.
#[allow(clippy::struct_field_names)]
pub struct TimestampChunker {
    /// Duration, in seconds, at which the running group of cues is emitted as a chunk.
    target_duration_secs: f64,
    /// Shortest duration, in seconds, a trailing chunk may have before it is
    /// merged into the preceding chunk (avoids tiny fragments).
    min_duration_secs: f64,
    /// Intended upper bound on chunk duration, in seconds.
    ///
    /// The value is stored and configurable, but the chunking loop does not
    /// read it, hence the `dead_code` allowance.
    #[allow(dead_code)]
    max_duration_secs: f64,
    /// Width, in seconds, of the window of trailing cues repeated at the start
    /// of the next chunk for context continuity.
    overlap_secs: f64,
}

54impl TimestampChunker {
55    /// Create a timestamp chunker with the given target duration.
56    #[must_use]
57    pub fn new(target_duration_secs: f64) -> Self {
58        Self {
59            target_duration_secs,
60            min_duration_secs: 10.0,
61            max_duration_secs: target_duration_secs * 2.0,
62            overlap_secs: 5.0,
63        }
64    }
65
66    /// Set minimum chunk duration.
67    #[must_use]
68    pub fn with_min_duration(mut self, secs: f64) -> Self {
69        self.min_duration_secs = secs;
70        self
71    }
72
73    /// Set maximum chunk duration.
74    #[must_use]
75    pub fn with_max_duration(mut self, secs: f64) -> Self {
76        self.max_duration_secs = secs;
77        self
78    }
79
80    /// Set overlap duration.
81    #[must_use]
82    pub fn with_overlap(mut self, secs: f64) -> Self {
83        self.overlap_secs = secs;
84        self
85    }
86
87    /// Build a chunk from a slice of cues.
88    #[allow(clippy::cast_sign_loss)]
89    fn build_chunk(
90        document: &Document,
91        cues: &[&crate::media::SubtitleCue],
92        chunk_start_secs: f64,
93    ) -> Chunk {
94        let text: String = cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
95
96        let start_secs = cues.first().map(|c| c.start_secs).unwrap_or(chunk_start_secs);
97        let end_secs = cues.last().map(|c| c.end_secs).unwrap_or(chunk_start_secs);
98
99        let mut chunk =
100            Chunk::new(document.id, text, start_secs.max(0.0) as usize, end_secs.max(0.0) as usize);
101        chunk.metadata.title = document.title.clone();
102        chunk.metadata.custom.insert("start_secs".into(), serde_json::json!(start_secs));
103        chunk.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
104        chunk.metadata.custom.insert(
105            "start_display".into(),
106            serde_json::json!(crate::media::format_display_time(start_secs)),
107        );
108        chunk.metadata.custom.insert(
109            "end_display".into(),
110            serde_json::json!(crate::media::format_display_time(end_secs)),
111        );
112        chunk.metadata.custom.insert("cue_count".into(), serde_json::json!(cues.len()));
113        chunk
114    }
115}
116
117/// Default target chunk duration in seconds
118const DEFAULT_TARGET_DURATION: f64 = 60.0;
119
120impl Default for TimestampChunker {
121    fn default() -> Self {
122        Self {
123            target_duration_secs: DEFAULT_TARGET_DURATION,
124            min_duration_secs: 10.0,
125            max_duration_secs: DEFAULT_TARGET_DURATION * 2.0,
126            overlap_secs: 5.0,
127        }
128    }
129}
130
131impl Chunker for TimestampChunker {
132    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
133        if document.content.is_empty() {
134            return Err(Error::EmptyDocument(
135                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
136            ));
137        }
138
139        // Extract subtitle cues from document metadata
140        let cues: Vec<crate::media::SubtitleCue> = document
141            .metadata
142            .get("subtitle_cues")
143            .and_then(|v| serde_json::from_value(v.clone()).ok())
144            .unwrap_or_default();
145
146        if cues.is_empty() {
147            // No timestamp data — fall back to RecursiveChunker
148            return RecursiveChunker::new(512, 50).chunk(document);
149        }
150
151        let mut chunks = Vec::new();
152        let mut current_cues: Vec<&crate::media::SubtitleCue> = Vec::new();
153        let mut chunk_start = cues[0].start_secs;
154
155        for cue in &cues {
156            let current_duration = cue.end_secs - chunk_start;
157
158            // Emit chunk when we've reached target duration
159            if current_duration >= self.target_duration_secs && !current_cues.is_empty() {
160                chunks.push(Self::build_chunk(document, &current_cues, chunk_start));
161
162                // Start next chunk, keeping cues that fall within overlap window
163                let overlap_start = cue.start_secs - self.overlap_secs;
164                current_cues.retain(|c| c.start_secs >= overlap_start);
165                chunk_start = current_cues.first().map(|c| c.start_secs).unwrap_or(cue.start_secs);
166            }
167
168            current_cues.push(cue);
169        }
170
171        // Emit final chunk
172        if !current_cues.is_empty() {
173            let final_duration =
174                current_cues.last().map(|c| c.end_secs).unwrap_or(0.0) - chunk_start;
175
176            if final_duration < self.min_duration_secs && !chunks.is_empty() {
177                // Merge into previous chunk if too short
178                if let Some(last) = chunks.last_mut() {
179                    let extra_text: String =
180                        current_cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
181                    last.content.push(' ');
182                    last.content.push_str(&extra_text);
183
184                    let end_secs = current_cues.last().map(|c| c.end_secs).unwrap_or(0.0);
185                    #[allow(clippy::cast_sign_loss)]
186                    {
187                        last.end_offset = end_secs.max(0.0) as usize;
188                    }
189                    last.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
190                    last.metadata.custom.insert(
191                        "end_display".into(),
192                        serde_json::json!(crate::media::format_display_time(end_secs)),
193                    );
194                }
195            } else {
196                chunks.push(Self::build_chunk(document, &current_cues, chunk_start));
197            }
198        }
199
200        Ok(chunks)
201    }
202
203    fn estimate_chunks(&self, document: &Document) -> usize {
204        let duration =
205            document.metadata.get("duration_secs").and_then(|v| v.as_f64()).unwrap_or(0.0);
206
207        if duration <= 0.0 || self.target_duration_secs <= 0.0 {
208            return usize::from(!document.content.is_empty());
209        }
210        #[allow(clippy::cast_sign_loss)]
211        let estimate = (duration / self.target_duration_secs).ceil() as usize;
212        estimate
213    }
214}
215
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Document;

    /// Build cues from `(start_secs, end_secs, text)` triples, indexed in order.
    fn make_cues(durations: &[(f64, f64, &str)]) -> Vec<crate::media::SubtitleCue> {
        durations
            .iter()
            .enumerate()
            .map(|(i, (start, end, text))| crate::media::SubtitleCue {
                index: i,
                start_secs: *start,
                end_secs: *end,
                text: (*text).to_string(),
            })
            .collect()
    }

    /// Build a document whose content is the joined cue text and whose
    /// metadata carries the cues plus the end time of the last cue.
    fn doc_with_cues(cues: &[crate::media::SubtitleCue]) -> Document {
        let text: String = cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
        let duration = cues.last().map_or(0.0, |c| c.end_secs);
        let mut doc = Document::new(text);
        doc.metadata.insert("subtitle_cues".into(), serde_json::to_value(cues).unwrap());
        doc.metadata.insert("duration_secs".into(), serde_json::json!(duration));
        doc
    }

    #[test]
    fn test_timestamp_chunker_basic() {
        let cues = make_cues(&[
            (0.0, 25.0, "First segment."),
            (25.0, 50.0, "Second segment."),
            (50.0, 75.0, "Third segment."),
            (75.0, 100.0, "Fourth segment."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(60.0);
        let chunks = chunker.chunk(&doc).unwrap();

        assert!(chunks.len() >= 2, "Expected at least 2 chunks, got {}", chunks.len());
        for chunk in &chunks {
            assert!(chunk.metadata.custom.contains_key("start_secs"));
            assert!(chunk.metadata.custom.contains_key("end_secs"));
            assert!(chunk.metadata.custom.contains_key("start_display"));
            assert!(chunk.metadata.custom.contains_key("end_display"));
            assert!(chunk.metadata.custom.contains_key("cue_count"));
        }
    }

    #[test]
    fn test_timestamp_chunker_single_short_chunk() {
        let cues = make_cues(&[(0.0, 10.0, "Only one."), (10.0, 20.0, "Short transcript.")]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(60.0);
        let chunks = chunker.chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_timestamp_chunker_fallback_no_cues() {
        let doc = Document::new("Plain text without any subtitle metadata.");
        let chunker = TimestampChunker::new(60.0);
        let chunks = chunker.chunk(&doc).unwrap();
        // Falls back to RecursiveChunker
        assert!(!chunks.is_empty());
        assert!(!chunks[0].metadata.custom.contains_key("start_secs"));
    }

    #[test]
    fn test_timestamp_chunker_empty_doc() {
        let doc = Document::new("");
        let chunker = TimestampChunker::new(60.0);
        assert!(chunker.chunk(&doc).is_err());
    }

    #[test]
    fn test_timestamp_chunker_metadata_values() {
        let cues = make_cues(&[
            (60.0, 90.0, "Starts at one minute."),
            (90.0, 120.0, "Ends at two minutes."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(120.0);
        let chunks = chunker.chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);

        let start = chunks[0].metadata.custom["start_secs"].as_f64().unwrap();
        let end = chunks[0].metadata.custom["end_secs"].as_f64().unwrap();
        assert!((start - 60.0).abs() < 0.01);
        assert!((end - 120.0).abs() < 0.01);
        assert_eq!(chunks[0].metadata.custom["start_display"], "1:00");
        assert_eq!(chunks[0].metadata.custom["end_display"], "2:00");
    }

    #[test]
    fn test_timestamp_chunker_estimate() {
        let mut doc = Document::new("content");
        doc.metadata.insert("duration_secs".into(), serde_json::json!(300.0));

        let chunker = TimestampChunker::new(60.0);
        assert_eq!(chunker.estimate_chunks(&doc), 5);
    }

    #[test]
    fn test_timestamp_chunker_estimate_no_duration() {
        let doc = Document::new("content");
        let chunker = TimestampChunker::new(60.0);
        assert_eq!(chunker.estimate_chunks(&doc), 1);
    }

    #[test]
    fn test_timestamp_chunker_merge_short_final() {
        // The last cue pushes the running duration past the 60 s target, so
        // the first two cues are emitted as a chunk. The last cue then forms
        // a 3 s tail, below the 10 s minimum, and must be merged back.
        let cues = make_cues(&[
            (0.0, 30.0, "First."),
            (30.0, 58.0, "Second."),
            (58.0, 61.0, "Tiny final."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(60.0).with_min_duration(10.0);
        let chunks = chunker.chunk(&doc).unwrap();

        // The tiny final should be merged into the previous chunk
        assert_eq!(chunks.len(), 1, "Tail should not become its own chunk");
        let merged = chunks.last().unwrap();
        assert_eq!(merged.content, "First. Second. Tiny final.");
        let end = merged.metadata.custom["end_secs"].as_f64().unwrap();
        assert!((end - 61.0).abs() < 0.01, "Merged chunk should end at 61 s, got {end}");
    }

    #[test]
    fn test_timestamp_chunker_all_text_represented() {
        let cues = make_cues(&[
            (0.0, 20.0, "Alpha."),
            (20.0, 40.0, "Beta."),
            (40.0, 60.0, "Gamma."),
            (60.0, 80.0, "Delta."),
            (80.0, 100.0, "Epsilon."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunker = TimestampChunker::new(45.0).with_overlap(0.0);
        let chunks = chunker.chunk(&doc).unwrap();

        // Every cue text should appear in at least one chunk
        for cue in &cues {
            assert!(
                chunks.iter().any(|c| c.content.contains(&cue.text)),
                "Cue text '{}' not found in any chunk",
                cue.text
            );
        }
    }

    #[test]
    fn test_timestamp_chunker_default() {
        let chunker = TimestampChunker::default();
        assert!((chunker.target_duration_secs - 60.0).abs() < 0.01);
        assert!((chunker.min_duration_secs - 10.0).abs() < 0.01);
        assert!((chunker.max_duration_secs - 120.0).abs() < 0.01);
        assert!((chunker.overlap_secs - 5.0).abs() < 0.01);
    }

    #[test]
    fn test_timestamp_chunker_builder() {
        let chunker = TimestampChunker::new(30.0)
            .with_min_duration(5.0)
            .with_max_duration(90.0)
            .with_overlap(3.0);
        assert!((chunker.target_duration_secs - 30.0).abs() < 0.01);
        assert!((chunker.min_duration_secs - 5.0).abs() < 0.01);
        assert!((chunker.max_duration_secs - 90.0).abs() < 0.01);
        assert!((chunker.overlap_secs - 3.0).abs() < 0.01);
    }
}