1use crate::extract::ExtractedDoc;
2
/// One retrievable slice of an extracted document.
#[derive(Clone, Debug)]
pub struct Chunk {
    /// Identifier of the form `<file>::chunk_<index>`.
    pub id: String,
    /// File identifier supplied by the caller for the document this chunk came from.
    pub doc_file: String,
    /// Hash string supplied by the caller; presumably a hash of the document content.
    pub content_hash: String,
    /// Zero-based position of this chunk among the chunks produced for the document.
    pub index: usize,
    /// Heading associated with the chunk, when one was found or inferred.
    pub heading: Option<String>,
    /// Text carried by the chunk.
    pub text: String,
    /// Byte offset into the extracted document text where the chunk starts.
    pub start_offset: usize,
    /// Byte offset into the extracted document text where the chunk ends.
    pub end_offset: usize,
    /// Page number, when known; the chunkers visible in this module leave it as `None`.
    pub page: Option<usize>,
}
15
/// How a document is split into chunks.
#[derive(Debug, Clone, Copy)]
pub enum ChunkStrategy {
    /// Split at heading boundaries.
    HeadingBounded,
    /// Split into windows of `size` whitespace-separated words, with `overlap` words
    /// shared between consecutive windows.
    FixedToken { size: usize, overlap: usize },
}

impl ChunkStrategy {
    /// Chooses the chunking strategy for a file extension (given without the dot).
    ///
    /// Every extension currently resolves to [`ChunkStrategy::HeadingBounded`]; the
    /// markup-style extensions are listed explicitly and everything else takes the
    /// same strategy through the fallback arm.
    pub fn for_extension(ext: &str) -> Self {
        match ext {
            // Lightweight markup formats.
            "md" | "markdown" | "rst" | "adoc" | "org" => Self::HeadingBounded,
            // HTML / XML family.
            "html" | "htm" | "xml" | "xsl" | "xsd" | "svg" | "plist" => Self::HeadingBounded,
            // Any other extension uses the same strategy.
            _ => Self::HeadingBounded,
        }
    }
}
31
32pub fn chunk_document(
33 doc: &ExtractedDoc,
34 file: &str,
35 hash: &str,
36 strategy: ChunkStrategy,
37) -> Vec<Chunk> {
38 match strategy {
39 ChunkStrategy::HeadingBounded => chunk_by_headings(doc, file, hash),
40 ChunkStrategy::FixedToken { size, overlap } => {
41 chunk_by_tokens(doc, file, hash, size, overlap)
42 }
43 }
44}
45
/// Largest number of whitespace-separated words kept in a single chunk before a
/// section or paragraph is split into overlapping windows.
const MAX_SECTION_TOKENS: usize = 512;
/// Number of words shared between consecutive windows when an oversized section or
/// paragraph is split.
const SUB_CHUNK_OVERLAP: usize = 64;

// The windowing loops move the next window back by `SUB_CHUNK_OVERLAP` words from the
// end of the previous one. If the overlap were not strictly smaller than the window,
// that subtraction would underflow or the loop would never advance, so reject such a
// configuration at compile time.
const _: () = assert!(
    SUB_CHUNK_OVERLAP < MAX_SECTION_TOKENS,
    "SUB_CHUNK_OVERLAP must be smaller than MAX_SECTION_TOKENS"
);
48
49fn chunk_by_headings(doc: &ExtractedDoc, file: &str, hash: &str) -> Vec<Chunk> {
50 let text = &doc.text;
51 if text.is_empty() {
52 return Vec::new();
53 }
54
55 let heading_re = regex::Regex::new(r"(?m)^(#{1,6})\s+(.+)$|^([^\n]+)\n[=\-]{3,}$").unwrap();
56 let mut sections: Vec<(Option<String>, usize, usize)> = Vec::new();
57 let mut last_start = 0;
58 let mut last_heading: Option<String> = None;
59
60 for m in heading_re.find_iter(text) {
61 if m.start() > last_start {
62 sections.push((last_heading.clone(), last_start, m.start()));
63 }
64 last_start = m.start();
65 let heading_text = m.as_str();
66 last_heading = Some(
67 heading_text
68 .trim_start_matches('#')
69 .trim()
70 .lines()
71 .next()
72 .unwrap_or("")
73 .to_string(),
74 );
75 }
76 if last_start < text.len() {
77 sections.push((last_heading, last_start, text.len()));
78 }
79
80 if sections.is_empty() {
81 sections.push((None, 0, text.len()));
82 }
83
84 if sections.len() == 1 && sections[0].0.is_none() {
86 return chunk_by_paragraphs(doc, file, hash);
87 }
88
89 let mut chunks = Vec::new();
90 let mut chunk_idx = 0;
91
92 for (heading, start, end) in §ions {
93 let section_text = text[*start..*end].trim();
94 if section_text.is_empty() {
95 continue;
96 }
97
98 let words: Vec<&str> = section_text.split_whitespace().collect();
99 if words.len() <= MAX_SECTION_TOKENS {
100 chunks.push(Chunk {
101 id: format!("{}::chunk_{}", file, chunk_idx),
102 doc_file: file.to_string(),
103 content_hash: hash.to_string(),
104 index: chunk_idx,
105 heading: heading.clone(),
106 text: section_text.to_string(),
107 start_offset: *start,
108 end_offset: *end,
109 page: None,
110 });
111 chunk_idx += 1;
112 } else {
113 let mut w_start = 0;
114 while w_start < words.len() {
115 let w_end = (w_start + MAX_SECTION_TOKENS).min(words.len());
116 let sub_text = words[w_start..w_end].join(" ");
117 if !sub_text.is_empty() {
118 chunks.push(Chunk {
119 id: format!("{}::chunk_{}", file, chunk_idx),
120 doc_file: file.to_string(),
121 content_hash: hash.to_string(),
122 index: chunk_idx,
123 heading: heading.clone(),
124 text: sub_text,
125 start_offset: *start,
126 end_offset: *end,
127 page: None,
128 });
129 chunk_idx += 1;
130 }
131 if w_end >= words.len() {
132 break;
133 }
134 w_start = w_end - SUB_CHUNK_OVERLAP;
135 }
136 }
137 }
138
139 chunks
140}
141
142fn chunk_by_paragraphs(doc: &ExtractedDoc, file: &str, hash: &str) -> Vec<Chunk> {
143 let text = &doc.text;
144 if text.is_empty() {
145 return Vec::new();
146 }
147
148 let paragraphs: Vec<&str> = text
149 .split("\n\n")
150 .map(|p| p.trim())
151 .filter(|p| !p.is_empty())
152 .collect();
153
154 if paragraphs.is_empty() {
155 return chunk_by_tokens(doc, file, hash, MAX_SECTION_TOKENS, SUB_CHUNK_OVERLAP);
156 }
157
158 if paragraphs.len() == 1 {
160 return chunk_by_tokens(doc, file, hash, MAX_SECTION_TOKENS, SUB_CHUNK_OVERLAP);
161 }
162
163 let mut chunks = Vec::new();
164 let mut chunk_idx = 0;
165 let mut current_text = String::new();
166 let mut current_words = 0usize;
167 let mut current_start = 0usize;
168
169 for para in ¶graphs {
170 let para_words = para.split_whitespace().count();
171
172 if para_words > MAX_SECTION_TOKENS {
174 if !current_text.is_empty() {
175 let start_offset = text.find(current_text.trim()).unwrap_or(0);
176 chunks.push(Chunk {
177 id: format!("{}::chunk_{}", file, chunk_idx),
178 doc_file: file.to_string(),
179 content_hash: hash.to_string(),
180 index: chunk_idx,
181 heading: infer_heading(current_text.trim()),
182 text: current_text.trim().to_string(),
183 start_offset,
184 end_offset: start_offset + current_text.trim().len(),
185 page: None,
186 });
187 chunk_idx += 1;
188 current_text.clear();
189 current_words = 0;
190 }
191 let words: Vec<&str> = para.split_whitespace().collect();
192 let mut w_start = 0;
193 while w_start < words.len() {
194 let w_end = (w_start + MAX_SECTION_TOKENS).min(words.len());
195 let sub_text = words[w_start..w_end].join(" ");
196 let start_offset = text.find(&sub_text).unwrap_or(0);
197 chunks.push(Chunk {
198 id: format!("{}::chunk_{}", file, chunk_idx),
199 doc_file: file.to_string(),
200 content_hash: hash.to_string(),
201 index: chunk_idx,
202 heading: infer_heading(&sub_text),
203 text: sub_text.clone(),
204 start_offset,
205 end_offset: start_offset + sub_text.len(),
206 page: None,
207 });
208 chunk_idx += 1;
209 if w_end >= words.len() {
210 break;
211 }
212 w_start = w_end - SUB_CHUNK_OVERLAP;
213 }
214 continue;
215 }
216
217 if current_words + para_words > MAX_SECTION_TOKENS && !current_text.is_empty() {
219 let trimmed = current_text.trim();
220 let start_offset = text[current_start..]
221 .find(trimmed)
222 .map(|i| current_start + i)
223 .unwrap_or(current_start);
224 chunks.push(Chunk {
225 id: format!("{}::chunk_{}", file, chunk_idx),
226 doc_file: file.to_string(),
227 content_hash: hash.to_string(),
228 index: chunk_idx,
229 heading: infer_heading(trimmed),
230 text: trimmed.to_string(),
231 start_offset,
232 end_offset: start_offset + trimmed.len(),
233 page: None,
234 });
235 chunk_idx += 1;
236 current_text.clear();
237 current_words = 0;
238 current_start = text.find(para).unwrap_or(0);
239 }
240
241 if current_text.is_empty() {
242 current_start = text.find(para).unwrap_or(0);
243 }
244
245 if !current_text.is_empty() {
246 current_text.push_str("\n\n");
247 }
248 current_text.push_str(para);
249 current_words += para_words;
250 }
251
252 if !current_text.is_empty() {
254 let trimmed = current_text.trim();
255 let start_offset = text[current_start..]
256 .find(trimmed)
257 .map(|i| current_start + i)
258 .unwrap_or(current_start);
259 chunks.push(Chunk {
260 id: format!("{}::chunk_{}", file, chunk_idx),
261 doc_file: file.to_string(),
262 content_hash: hash.to_string(),
263 index: chunk_idx,
264 heading: infer_heading(trimmed),
265 text: trimmed.to_string(),
266 start_offset,
267 end_offset: start_offset + trimmed.len(),
268 page: None,
269 });
270 }
271
272 chunks
273}
274
/// Guesses a heading from the first line of `text`.
///
/// The trimmed first line is returned when it is non-empty, has at most ten
/// whitespace-separated words, and does not end with `.` or `,`. Otherwise the result
/// is `None`.
fn infer_heading(text: &str) -> Option<String> {
    let candidate = text
        .lines()
        .next()
        .map(str::trim)
        .filter(|line| !line.is_empty())?;

    let reads_like_prose = candidate.ends_with('.') || candidate.ends_with(',');
    let word_count = candidate.split_whitespace().count();

    if reads_like_prose || word_count > 10 {
        return None;
    }
    Some(candidate.to_string())
}
288
289fn chunk_by_tokens(
290 doc: &ExtractedDoc,
291 file: &str,
292 hash: &str,
293 size: usize,
294 overlap: usize,
295) -> Vec<Chunk> {
296 let text = &doc.text;
297 if text.is_empty() {
298 return Vec::new();
299 }
300
301 let words: Vec<&str> = text.split_whitespace().collect();
302 if words.is_empty() {
303 return Vec::new();
304 }
305
306 let mut chunks = Vec::new();
307 let mut start = 0;
308 let mut chunk_idx = 0;
309
310 while start < words.len() {
311 let end = (start + size).min(words.len());
312 let chunk_text = words[start..end].join(" ");
313
314 let start_offset = if start == 0 {
316 0
317 } else {
318 text.find(words[start]).unwrap_or(0)
319 };
320 let end_offset = if end >= words.len() {
321 text.len()
322 } else {
323 text.find(words[end.min(words.len() - 1)])
324 .unwrap_or(text.len())
325 };
326
327 if !chunk_text.is_empty() {
328 chunks.push(Chunk {
329 id: format!("{}::chunk_{}", file, chunk_idx),
330 doc_file: file.to_string(),
331 content_hash: hash.to_string(),
332 index: chunk_idx,
333 heading: None,
334 text: chunk_text,
335 start_offset,
336 end_offset,
337 page: None,
338 });
339 chunk_idx += 1;
340 }
341
342 if end >= words.len() {
343 break;
344 }
345 start = end - overlap;
346 }
347
348 chunks
349}