Skip to main content

aprender_rag/chunk/
timestamp.rs

1//! Timestamp-aware chunker for subtitle/transcript content.
2
3use super::{Chunk, Chunker, RecursiveChunker};
4use crate::{Document, Error, Result};
5
/// Timestamp-aware chunker for subtitle/transcript content.
///
/// Groups subtitle cues into chunks based on time duration rather than
/// character count. Each chunk carries `start_secs` and `end_secs` in
/// its metadata for timestamp-aware retrieval and citation, alongside
/// `start_display`, `end_display` and `cue_count`.
///
/// Cues are read from the `subtitle_cues` entry of the document metadata.
/// Falls back to [`RecursiveChunker`] for documents without subtitle
/// cue metadata (including metadata that does not deserialize into cues).
///
/// # Examples
///
/// ```rust
/// use aprender_rag::chunk::{TimestampChunker, Chunker};
/// use aprender_rag::Document;
/// use aprender_rag::media::SubtitleCue;
///
/// let cues = vec![
///     SubtitleCue { index: 0, start_secs: 0.0, end_secs: 30.0, text: "First segment.".into() },
///     SubtitleCue { index: 1, start_secs: 30.0, end_secs: 65.0, text: "Second segment.".into() },
///     SubtitleCue { index: 2, start_secs: 65.0, end_secs: 90.0, text: "Third segment.".into() },
/// ];
///
/// let mut doc = Document::new("First segment. Second segment. Third segment.");
/// doc.metadata.insert(
///     "subtitle_cues".into(),
///     serde_json::to_value(&cues).unwrap(),
/// );
/// doc.metadata.insert("duration_secs".into(), serde_json::json!(90.0));
///
/// let chunker = TimestampChunker::new(60.0);
/// let chunks = chunker.chunk(&doc).unwrap();
/// assert!(chunks.len() >= 2);
/// assert!(chunks[0].metadata.custom.contains_key("start_secs"));
/// ```
#[derive(Debug, Clone)]
// Every field deliberately shares the `_secs` suffix to make the unit explicit.
#[allow(clippy::struct_field_names)]
pub struct TimestampChunker {
    /// Target chunk duration in seconds. A chunk is emitted as soon as
    /// adding the next cue would stretch its span to this length or more.
    target_duration_secs: f64,
    /// Minimum chunk duration in seconds (avoids tiny fragments). A trailing
    /// group of cues shorter than this is merged into the preceding chunk.
    min_duration_secs: f64,
    /// Maximum chunk duration in seconds, intended as a hard limit.
    ///
    /// NOTE(review): this value is stored and settable through
    /// `with_max_duration`, but the chunking logic in this file never reads
    /// it, hence the `dead_code` allowance below.
    #[allow(dead_code)]
    max_duration_secs: f64,
    /// Overlap duration in seconds for context continuity. Cues starting
    /// within this window before the cue that opens a new chunk are repeated
    /// at the start of that new chunk.
    overlap_secs: f64,
}
53
54impl TimestampChunker {
55    /// Create a timestamp chunker with the given target duration.
56    #[must_use]
57    pub fn new(target_duration_secs: f64) -> Self {
58        Self {
59            target_duration_secs,
60            min_duration_secs: 10.0,
61            max_duration_secs: target_duration_secs * 2.0,
62            overlap_secs: 5.0,
63        }
64    }
65
66    /// Set minimum chunk duration.
67    #[must_use]
68    pub fn with_min_duration(mut self, secs: f64) -> Self {
69        self.min_duration_secs = secs;
70        self
71    }
72
73    /// Set maximum chunk duration.
74    #[must_use]
75    pub fn with_max_duration(mut self, secs: f64) -> Self {
76        self.max_duration_secs = secs;
77        self
78    }
79
80    /// Set overlap duration.
81    #[must_use]
82    pub fn with_overlap(mut self, secs: f64) -> Self {
83        self.overlap_secs = secs;
84        self
85    }
86
87    /// Build a chunk from a slice of cues.
88    #[allow(clippy::cast_sign_loss)]
89    #[allow(clippy::disallowed_methods)] // json! macro internally uses unwrap
90    fn build_chunk(
91        document: &Document,
92        cues: &[&crate::media::SubtitleCue],
93        chunk_start_secs: f64,
94    ) -> Chunk {
95        let text: String = cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
96
97        let start_secs = cues.first().map(|c| c.start_secs).unwrap_or(chunk_start_secs);
98        let end_secs = cues.last().map(|c| c.end_secs).unwrap_or(chunk_start_secs);
99
100        let mut chunk =
101            Chunk::new(document.id, text, start_secs.max(0.0) as usize, end_secs.max(0.0) as usize);
102        chunk.metadata.title = document.title.clone();
103        chunk.metadata.custom.insert("start_secs".into(), serde_json::json!(start_secs));
104        chunk.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
105        chunk.metadata.custom.insert(
106            "start_display".into(),
107            serde_json::json!(crate::media::format_display_time(start_secs)),
108        );
109        chunk.metadata.custom.insert(
110            "end_display".into(),
111            serde_json::json!(crate::media::format_display_time(end_secs)),
112        );
113        chunk.metadata.custom.insert("cue_count".into(), serde_json::json!(cues.len()));
114        chunk
115    }
116}
117
118/// Default target chunk duration in seconds
119const DEFAULT_TARGET_DURATION: f64 = 60.0;
120
121impl Default for TimestampChunker {
122    fn default() -> Self {
123        Self {
124            target_duration_secs: DEFAULT_TARGET_DURATION,
125            min_duration_secs: 10.0,
126            max_duration_secs: DEFAULT_TARGET_DURATION * 2.0,
127            overlap_secs: 5.0,
128        }
129    }
130}
131
132impl Chunker for TimestampChunker {
133    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
134        if document.content.is_empty() {
135            return Err(Error::EmptyDocument(
136                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
137            ));
138        }
139
140        // Extract subtitle cues from document metadata
141        let cues: Vec<crate::media::SubtitleCue> = document
142            .metadata
143            .get("subtitle_cues")
144            .and_then(|v| serde_json::from_value(v.clone()).ok())
145            .unwrap_or_default();
146
147        if cues.is_empty() {
148            // No timestamp data — fall back to RecursiveChunker
149            return RecursiveChunker::new(512, 50).chunk(document);
150        }
151
152        let mut chunks = Vec::new();
153        let mut current_cues: Vec<&crate::media::SubtitleCue> = Vec::new();
154        let mut chunk_start = cues[0].start_secs;
155
156        for cue in &cues {
157            let current_duration = cue.end_secs - chunk_start;
158
159            // Emit chunk when we've reached target duration
160            if current_duration >= self.target_duration_secs && !current_cues.is_empty() {
161                chunks.push(Self::build_chunk(document, &current_cues, chunk_start));
162
163                // Start next chunk, keeping cues that fall within overlap window
164                let overlap_start = cue.start_secs - self.overlap_secs;
165                current_cues.retain(|c| c.start_secs >= overlap_start);
166                chunk_start = current_cues.first().map(|c| c.start_secs).unwrap_or(cue.start_secs);
167            }
168
169            current_cues.push(cue);
170        }
171
172        // Emit final chunk
173        if !current_cues.is_empty() {
174            let final_duration =
175                current_cues.last().map(|c| c.end_secs).unwrap_or(0.0) - chunk_start;
176
177            if final_duration < self.min_duration_secs && !chunks.is_empty() {
178                // Merge into previous chunk if too short
179                if let Some(last) = chunks.last_mut() {
180                    let extra_text: String =
181                        current_cues.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join(" ");
182                    last.content.push(' ');
183                    last.content.push_str(&extra_text);
184
185                    let end_secs = current_cues.last().map(|c| c.end_secs).unwrap_or(0.0);
186                    #[allow(clippy::cast_sign_loss)]
187                    {
188                        last.end_offset = end_secs.max(0.0) as usize;
189                    }
190                    last.metadata.custom.insert("end_secs".into(), serde_json::json!(end_secs));
191                    last.metadata.custom.insert(
192                        "end_display".into(),
193                        serde_json::json!(crate::media::format_display_time(end_secs)),
194                    );
195                }
196            } else {
197                chunks.push(Self::build_chunk(document, &current_cues, chunk_start));
198            }
199        }
200
201        Ok(chunks)
202    }
203
204    fn estimate_chunks(&self, document: &Document) -> usize {
205        let duration =
206            document.metadata.get("duration_secs").and_then(|v| v.as_f64()).unwrap_or(0.0);
207
208        if duration <= 0.0 || self.target_duration_secs <= 0.0 {
209            return usize::from(!document.content.is_empty());
210        }
211        #[allow(clippy::cast_sign_loss)]
212        let estimate = (duration / self.target_duration_secs).ceil() as usize;
213        estimate
214    }
215}
216
#[cfg(test)]
mod tests {
    use super::*;
    use crate::media::SubtitleCue;
    use crate::Document;

    /// Turn `(start, end, text)` triples into cues indexed in slice order.
    fn make_cues(durations: &[(f64, f64, &str)]) -> Vec<SubtitleCue> {
        let mut cues = Vec::with_capacity(durations.len());
        for (index, &(start_secs, end_secs, text)) in durations.iter().enumerate() {
            cues.push(SubtitleCue { index, start_secs, end_secs, text: text.to_string() });
        }
        cues
    }

    /// Build a document whose content is the joined cue text and whose
    /// metadata carries the cues plus the end time of the last cue.
    fn doc_with_cues(cues: &[SubtitleCue]) -> Document {
        let pieces: Vec<&str> = cues.iter().map(|cue| cue.text.as_str()).collect();
        let duration = match cues.last() {
            Some(cue) => cue.end_secs,
            None => 0.0,
        };

        let mut doc = Document::new(pieces.join(" "));
        doc.metadata.insert("subtitle_cues".into(), serde_json::to_value(cues).unwrap());
        doc.metadata.insert("duration_secs".into(), serde_json::json!(duration));
        doc
    }

    /// A 100 s transcript with a 60 s target yields several chunks, each
    /// carrying the full set of timestamp metadata keys.
    #[test]
    fn test_timestamp_chunker_basic() {
        let doc = doc_with_cues(&make_cues(&[
            (0.0, 25.0, "First segment."),
            (25.0, 50.0, "Second segment."),
            (50.0, 75.0, "Third segment."),
            (75.0, 100.0, "Fourth segment."),
        ]));

        let chunks = TimestampChunker::new(60.0).chunk(&doc).unwrap();

        assert!(chunks.len() >= 2, "Expected at least 2 chunks, got {}", chunks.len());
        for chunk in &chunks {
            for key in ["start_secs", "end_secs", "start_display", "end_display", "cue_count"] {
                assert!(chunk.metadata.custom.contains_key(key));
            }
        }
    }

    /// A transcript shorter than the target stays in one chunk.
    #[test]
    fn test_timestamp_chunker_single_short_chunk() {
        let doc =
            doc_with_cues(&make_cues(&[(0.0, 10.0, "Only one."), (10.0, 20.0, "Short transcript.")]));

        let chunks = TimestampChunker::new(60.0).chunk(&doc).unwrap();

        assert_eq!(chunks.len(), 1);
    }

    /// Without cue metadata the recursive chunker is used, so no timestamp
    /// keys appear.
    #[test]
    fn test_timestamp_chunker_fallback_no_cues() {
        let doc = Document::new("Plain text without any subtitle metadata.");

        let chunks = TimestampChunker::new(60.0).chunk(&doc).unwrap();

        assert!(!chunks.is_empty());
        assert!(!chunks[0].metadata.custom.contains_key("start_secs"));
    }

    /// Empty content is rejected with an error.
    #[test]
    fn test_timestamp_chunker_empty_doc() {
        let doc = Document::new("");
        let outcome = TimestampChunker::new(60.0).chunk(&doc);
        assert!(outcome.is_err());
    }

    /// Start/end seconds and their display strings reflect the cue times.
    #[test]
    fn test_timestamp_chunker_metadata_values() {
        let doc = doc_with_cues(&make_cues(&[
            (60.0, 90.0, "Starts at one minute."),
            (90.0, 120.0, "Ends at two minutes."),
        ]));

        let chunks = TimestampChunker::new(120.0).chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);

        let custom = &chunks[0].metadata.custom;
        let start = custom["start_secs"].as_f64().unwrap();
        let end = custom["end_secs"].as_f64().unwrap();
        assert!((start - 60.0).abs() < 0.01);
        assert!((end - 120.0).abs() < 0.01);
        assert_eq!(custom["start_display"], "1:00");
        assert_eq!(custom["end_display"], "2:00");
    }

    /// 300 s of content at a 60 s target is estimated as five chunks.
    #[test]
    fn test_timestamp_chunker_estimate() {
        let mut doc = Document::new("content");
        doc.metadata.insert("duration_secs".into(), serde_json::json!(300.0));

        let estimate = TimestampChunker::new(60.0).estimate_chunks(&doc);

        assert_eq!(estimate, 5);
    }

    /// With no duration metadata a non-empty document is estimated as one chunk.
    #[test]
    fn test_timestamp_chunker_estimate_no_duration() {
        let doc = Document::new("content");

        let estimate = TimestampChunker::new(60.0).estimate_chunks(&doc);

        assert_eq!(estimate, 1);
    }

    /// The text of a very short closing cue ends up in the last chunk.
    #[test]
    fn test_timestamp_chunker_merge_short_final() {
        let doc = doc_with_cues(&make_cues(&[
            (0.0, 30.0, "First."),
            (30.0, 60.0, "Second."),
            (60.0, 65.0, "Tiny final."),
        ]));

        let chunks = TimestampChunker::new(55.0).with_min_duration(10.0).chunk(&doc).unwrap();

        let last_text = &chunks.last().unwrap().content;
        assert!(last_text.contains("Tiny final"), "Last chunk: {last_text}");
    }

    /// No cue text is dropped: each cue appears in at least one chunk.
    #[test]
    fn test_timestamp_chunker_all_text_represented() {
        let cues = make_cues(&[
            (0.0, 20.0, "Alpha."),
            (20.0, 40.0, "Beta."),
            (40.0, 60.0, "Gamma."),
            (60.0, 80.0, "Delta."),
            (80.0, 100.0, "Epsilon."),
        ]);
        let doc = doc_with_cues(&cues);

        let chunks = TimestampChunker::new(45.0).with_overlap(0.0).chunk(&doc).unwrap();

        for cue in &cues {
            let represented = chunks.iter().any(|c| c.content.contains(&cue.text));
            assert!(represented, "Cue text '{}' not found in any chunk", cue.text);
        }
    }

    /// `Default` yields a 60 s target, 10 s minimum, 120 s maximum, 5 s overlap.
    #[test]
    fn test_timestamp_chunker_default() {
        let TimestampChunker {
            target_duration_secs,
            min_duration_secs,
            max_duration_secs,
            overlap_secs,
        } = TimestampChunker::default();

        assert!((target_duration_secs - 60.0).abs() < 0.01);
        assert!((min_duration_secs - 10.0).abs() < 0.01);
        assert!((max_duration_secs - 120.0).abs() < 0.01);
        assert!((overlap_secs - 5.0).abs() < 0.01);
    }

    /// Each builder method overrides exactly its own setting.
    #[test]
    fn test_timestamp_chunker_builder() {
        let TimestampChunker {
            target_duration_secs,
            min_duration_secs,
            max_duration_secs,
            overlap_secs,
        } = TimestampChunker::new(30.0)
            .with_min_duration(5.0)
            .with_max_duration(90.0)
            .with_overlap(3.0);

        assert!((target_duration_secs - 30.0).abs() < 0.01);
        assert!((min_duration_secs - 5.0).abs() < 0.01);
        assert!((max_duration_secs - 90.0).abs() < 0.01);
        assert!((overlap_secs - 3.0).abs() < 0.01);
    }
}