Skip to main content

triplets_core/chunking/
sliding_window.rs

1use super::algorithm::ChunkingAlgorithm;
2use crate::config::ChunkingStrategy;
3use crate::data::{ChunkView, DataRecord, RecordChunk, RecordSection};
4use crate::tokenizer::{Tokenizer, WhitespaceTokenizer};
5
/// The default chunking algorithm: a sliding window over a section's tokens.
///
/// This is a unit struct with no state; all configuration is supplied per call
/// through the strategy passed to its `ChunkingAlgorithm` implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct SlidingWindowChunker;
9
10impl ChunkingAlgorithm for SlidingWindowChunker {
11    fn materialize(
12        &self,
13        strategy: &ChunkingStrategy,
14        record: &DataRecord,
15        section_idx: usize,
16        section: &RecordSection,
17    ) -> Vec<RecordChunk> {
18        let raw_text = section.text.as_str();
19        let preprocessed_owned: String;
20        let text = if strategy.preprocessors().is_empty() {
21            raw_text
22        } else {
23            let result = strategy
24                .preprocessors()
25                .iter()
26                .try_fold(raw_text.to_string(), |t, p| p.process(&t));
27            match result {
28                Some(s) => {
29                    preprocessed_owned = s;
30                    preprocessed_owned.as_str()
31                }
32                None => return Vec::new(),
33            }
34        };
35        let tokens: Vec<&str> = WhitespaceTokenizer.tokenize(text);
36        if tokens.is_empty() {
37            return Vec::new();
38        }
39
40        let mut chunks = Vec::new();
41        let total_tokens = tokens.len();
42        let span = strategy.max_window_tokens.min(total_tokens);
43        if span == tokens.len() {
44            let text = text.to_string();
45            chunks.push(RecordChunk {
46                record_id: record.id.clone(),
47                section_idx,
48                view: ChunkView::Window {
49                    index: 0,
50                    overlap: 0,
51                    span,
52                },
53                text,
54                tokens_estimate: span,
55                quality: record.quality,
56                kvp_meta: Default::default(),
57            });
58            return chunks;
59        }
60
61        for overlap in &strategy.overlap_tokens {
62            let stride = span.saturating_sub(*overlap).max(1);
63            let mut start = 0;
64            let mut index = 0;
65            while start < tokens.len() {
66                let end = (start + span).min(tokens.len());
67                let window = tokens[start..end].join(" ");
68                chunks.push(RecordChunk {
69                    record_id: record.id.clone(),
70                    section_idx,
71                    view: ChunkView::Window {
72                        index,
73                        overlap: *overlap,
74                        span,
75                    },
76                    text: window,
77                    tokens_estimate: end - start,
78                    quality: record.quality,
79                    kvp_meta: Default::default(),
80                });
81                if end == tokens.len() {
82                    break;
83                }
84                start += stride;
85                index += 1;
86            }
87        }
88
89        if tokens.len() > strategy.max_window_tokens && strategy.summary_fallback_tokens > 0 {
90            let fallback_cap = strategy
91                .summary_fallback_tokens
92                .min(strategy.max_window_tokens)
93                .max(1);
94            let fallback_len = tokens.len().min(fallback_cap);
95            let summary_tokens = tokens
96                .iter()
97                .take(fallback_len)
98                .copied()
99                .collect::<Vec<_>>()
100                .join(" ");
101            chunks.push(RecordChunk {
102                record_id: record.id.clone(),
103                section_idx,
104                view: ChunkView::SummaryFallback {
105                    strategy: "head".into(),
106                    weight: strategy.summary_fallback_weight,
107                },
108                text: summary_tokens,
109                tokens_estimate: fallback_len,
110                quality: record.quality,
111                kvp_meta: Default::default(),
112            });
113        }
114
115        chunks
116    }
117}
118
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::{QualityScore, RecordSection, SectionRole};
    use chrono::Utc;

    /// Baseline strategy for these tests: 4-token windows, a single overlap
    /// setting of 1 token, and a 2-token summary fallback. No preprocessor is
    /// registered here.
    fn strategy() -> ChunkingStrategy {
        ChunkingStrategy {
            max_window_tokens: 4,
            overlap_tokens: vec![1],
            summary_fallback_weight: 0.3,
            summary_fallback_tokens: 2,
            chunk_weight_floor: 0.0,
            ..ChunkingStrategy::default()
        }
    }

    /// The baseline strategy with an enabled denoiser preprocessor registered.
    fn denoising_strategy() -> ChunkingStrategy {
        use crate::config::DenoiserConfig;
        use crate::preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor;

        let mut with_denoiser = strategy();
        with_denoiser.register_preprocessor(DenoiserPreprocessor::new(DenoiserConfig {
            enabled: true,
            max_digit_ratio: 0.35,
            strip_markdown: true,
        }));
        with_denoiser
    }

    /// Builds a record with exactly one context section holding `text`.
    fn record(text: &str) -> DataRecord {
        let only_section = RecordSection {
            role: SectionRole::Context,
            heading: None,
            text: text.into(),
            sentences: vec![text.into()],
        };
        DataRecord {
            id: "r1".into(),
            source: "unit".into(),
            created_at: Utc::now(),
            updated_at: Utc::now(),
            quality: QualityScore { trust: 1.0 },
            taxonomy: vec![],
            sections: vec![only_section],
            meta_prefix: None,
        }
    }

    /// Wraps `text` in a one-section record and chunks that section (index 0).
    fn chunk_text(strategy: &ChunkingStrategy, text: &str) -> Vec<RecordChunk> {
        let rec = record(text);
        SlidingWindowChunker.materialize(strategy, &rec, 0, &rec.sections[0])
    }

    #[test]
    fn sliding_window_chunker_materializes_windows_and_summary() {
        let chunks = chunk_text(&strategy(), "one two three four five six seven");

        let count_views =
            |pred: fn(&ChunkView) -> bool| chunks.iter().filter(|c| pred(&c.view)).count();
        let window_count = count_views(|view| matches!(view, ChunkView::Window { .. }));
        let summary_count = count_views(|view| matches!(view, ChunkView::SummaryFallback { .. }));

        assert_eq!(window_count, 2);
        assert_eq!(summary_count, 1);
    }

    #[test]
    fn denoiser_disabled_produces_chunks_unchanged() {
        // No denoiser is registered on the baseline strategy, so purely
        // numeric text must still be chunked.
        let chunks = chunk_text(&strategy(), "42 524 10788 143 1995 190 394");
        assert!(!chunks.is_empty());
    }

    #[test]
    fn denoiser_enabled_drops_pure_numeric_section() {
        let chunks = chunk_text(
            &denoising_strategy(),
            "42 524 10788 143 1995 190 394 13611 358 6444 266",
        );
        assert!(
            chunks.is_empty(),
            "digit-heavy section should produce no chunks"
        );
    }

    #[test]
    fn denoiser_line_level_strips_noisy_lines_retaining_text() {
        let chunks = chunk_text(
            &denoising_strategy(),
            "NOVEX INDUSTRIES Springfield\n42 524 10788 143 1995 190 394 13611 358",
        );
        assert!(
            !chunks.is_empty(),
            "clean line should yield at least one chunk"
        );
        let texts: Vec<&str> = chunks.iter().map(|c| c.text.as_str()).collect();
        let all_text: String = texts.join(" ");
        assert!(all_text.contains("NOVEX") || all_text.contains("Springfield"));
    }
}