1use super::algorithm::ChunkingAlgorithm;
2use crate::config::ChunkingStrategy;
3use crate::data::{ChunkView, DataRecord, RecordChunk, RecordSection};
4use crate::tokenizer::{Tokenizer, WhitespaceTokenizer};
5
/// Stateless chunker that cuts a record section into fixed-size token windows.
///
/// The type carries no data; all tuning comes from the `ChunkingStrategy`
/// handed to `ChunkingAlgorithm::materialize`.
#[derive(Debug, Default, Clone, Copy)]
pub struct SlidingWindowChunker;
9
10impl ChunkingAlgorithm for SlidingWindowChunker {
11 fn materialize(
12 &self,
13 strategy: &ChunkingStrategy,
14 record: &DataRecord,
15 section_idx: usize,
16 section: &RecordSection,
17 ) -> Vec<RecordChunk> {
18 let raw_text = section.text.as_str();
19 let preprocessed_owned: String;
20 let text = if strategy.preprocessors().is_empty() {
21 raw_text
22 } else {
23 let result = strategy
24 .preprocessors()
25 .iter()
26 .try_fold(raw_text.to_string(), |t, p| p.process(&t));
27 match result {
28 Some(s) => {
29 preprocessed_owned = s;
30 preprocessed_owned.as_str()
31 }
32 None => return Vec::new(),
33 }
34 };
35 let tokens: Vec<&str> = WhitespaceTokenizer.tokenize(text);
36 if tokens.is_empty() {
37 return Vec::new();
38 }
39
40 let mut chunks = Vec::new();
41 let total_tokens = tokens.len();
42 let span = strategy.max_window_tokens.min(total_tokens);
43 if span == tokens.len() {
44 let text = text.to_string();
45 chunks.push(RecordChunk {
46 record_id: record.id.clone(),
47 section_idx,
48 view: ChunkView::Window {
49 index: 0,
50 overlap: 0,
51 span,
52 },
53 text,
54 tokens_estimate: span,
55 quality: record.quality,
56 kvp_meta: Default::default(),
57 });
58 return chunks;
59 }
60
61 for overlap in &strategy.overlap_tokens {
62 let stride = span.saturating_sub(*overlap).max(1);
63 let mut start = 0;
64 let mut index = 0;
65 while start < tokens.len() {
66 let end = (start + span).min(tokens.len());
67 let window = tokens[start..end].join(" ");
68 chunks.push(RecordChunk {
69 record_id: record.id.clone(),
70 section_idx,
71 view: ChunkView::Window {
72 index,
73 overlap: *overlap,
74 span,
75 },
76 text: window,
77 tokens_estimate: end - start,
78 quality: record.quality,
79 kvp_meta: Default::default(),
80 });
81 if end == tokens.len() {
82 break;
83 }
84 start += stride;
85 index += 1;
86 }
87 }
88
89 if tokens.len() > strategy.max_window_tokens && strategy.summary_fallback_tokens > 0 {
90 let fallback_cap = strategy
91 .summary_fallback_tokens
92 .min(strategy.max_window_tokens)
93 .max(1);
94 let fallback_len = tokens.len().min(fallback_cap);
95 let summary_tokens = tokens
96 .iter()
97 .take(fallback_len)
98 .copied()
99 .collect::<Vec<_>>()
100 .join(" ");
101 chunks.push(RecordChunk {
102 record_id: record.id.clone(),
103 section_idx,
104 view: ChunkView::SummaryFallback {
105 strategy: "head".into(),
106 weight: strategy.summary_fallback_weight,
107 },
108 text: summary_tokens,
109 tokens_estimate: fallback_len,
110 quality: record.quality,
111 kvp_meta: Default::default(),
112 });
113 }
114
115 chunks
116 }
117}
118
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::{QualityScore, RecordSection, SectionRole};
    use chrono::Utc;

    /// Baseline strategy: 4-token windows, a single 1-token overlap, and a
    /// 2-token head summary. No preprocessors are registered here.
    fn strategy() -> ChunkingStrategy {
        ChunkingStrategy {
            max_window_tokens: 4,
            overlap_tokens: vec![1],
            summary_fallback_weight: 0.3,
            summary_fallback_tokens: 2,
            chunk_weight_floor: 0.0,
            ..ChunkingStrategy::default()
        }
    }

    /// Baseline strategy with the denoiser preprocessor registered.
    fn denoising_strategy() -> ChunkingStrategy {
        use crate::config::DenoiserConfig;
        use crate::preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor;

        let mut with_denoiser = strategy();
        with_denoiser.register_preprocessor(DenoiserPreprocessor::new(DenoiserConfig {
            enabled: true,
            max_digit_ratio: 0.35,
            strip_markdown: true,
        }));
        with_denoiser
    }

    /// Builds a record with exactly one context section holding `text`.
    fn record(text: &str) -> DataRecord {
        let only_section = RecordSection {
            role: SectionRole::Context,
            heading: None,
            text: text.into(),
            sentences: vec![text.into()],
        };
        DataRecord {
            id: "r1".into(),
            source: "unit".into(),
            created_at: Utc::now(),
            updated_at: Utc::now(),
            quality: QualityScore { trust: 1.0 },
            taxonomy: vec![],
            sections: vec![only_section],
            meta_prefix: None,
        }
    }

    /// Runs the chunker over the first (and only) section of `record`.
    fn chunk_first_section(strategy: &ChunkingStrategy, record: &DataRecord) -> Vec<RecordChunk> {
        SlidingWindowChunker.materialize(strategy, record, 0, &record.sections[0])
    }

    #[test]
    fn sliding_window_chunker_materializes_windows_and_summary() {
        let rec = record("one two three four five six seven");
        let produced = chunk_first_section(&strategy(), &rec);

        let mut windows = 0;
        let mut summaries = 0;
        for chunk in &produced {
            if matches!(chunk.view, ChunkView::Window { .. }) {
                windows += 1;
            }
            if matches!(chunk.view, ChunkView::SummaryFallback { .. }) {
                summaries += 1;
            }
        }

        assert_eq!(windows, 2);
        assert_eq!(summaries, 1);
    }

    #[test]
    fn denoiser_disabled_produces_chunks_unchanged() {
        let rec = record("42 524 10788 143 1995 190 394");
        let produced = chunk_first_section(&strategy(), &rec);
        assert!(!produced.is_empty());
    }

    #[test]
    fn denoiser_enabled_drops_pure_numeric_section() {
        let rec = record("42 524 10788 143 1995 190 394 13611 358 6444 266");
        let produced = chunk_first_section(&denoising_strategy(), &rec);
        assert!(
            produced.is_empty(),
            "digit-heavy section should produce no chunks"
        );
    }

    #[test]
    fn denoiser_line_level_strips_noisy_lines_retaining_text() {
        let rec = record("NOVEX INDUSTRIES Springfield\n42 524 10788 143 1995 190 394 13611 358");
        let produced = chunk_first_section(&denoising_strategy(), &rec);
        assert!(
            !produced.is_empty(),
            "clean line should yield at least one chunk"
        );
        // Neither needle contains a space, so checking each chunk on its own
        // is the same as searching the space-joined text of all chunks.
        let keeps_clean_words = produced
            .iter()
            .any(|c| c.text.contains("NOVEX") || c.text.contains("Springfield"));
        assert!(keeps_clean_words);
    }
}