Skip to main content

infigraph_docs/
chunk.rs

1use crate::extract::ExtractedDoc;
2
/// One retrievable slice of an extracted document.
///
/// The chunkers in this file build these records; the field notes below
/// describe how those chunkers populate them.
#[derive(Clone, Debug)]
pub struct Chunk {
    /// Identifier of the form `"{doc_file}::chunk_{index}"`.
    pub id: String,
    /// The `file` argument the chunker was called with.
    pub doc_file: String,
    /// The `hash` argument the chunker was called with, copied verbatim.
    pub content_hash: String,
    /// Zero-based position of this chunk within its document.
    pub index: usize,
    /// Heading associated with the chunk, when one was found or inferred.
    pub heading: Option<String>,
    /// The chunk's textual content.
    pub text: String,
    /// Byte offset into the document text where the chunk's span begins.
    pub start_offset: usize,
    /// Byte offset into the document text where the chunk's span ends.
    pub end_offset: usize,
    /// Page number, if known. The chunkers in this file always leave it `None`.
    pub page: Option<usize>,
}
15
/// How a document is split into chunks.
#[derive(Clone, Copy, Debug)]
pub enum ChunkStrategy {
    /// Split at headings.
    HeadingBounded,
    /// Split into windows of `size` whitespace-separated words, where
    /// consecutive windows share `overlap` words.
    FixedToken { size: usize, overlap: usize },
}

impl ChunkStrategy {
    /// Picks the chunking strategy for a file extension (given without the dot).
    ///
    /// Markup-like extensions (`md`, `markdown`, `rst`, `adoc`, `org`, `html`,
    /// `htm`, `xml`, `xsl`, `xsd`, `svg`, `plist`) and every other extension
    /// currently resolve to [`ChunkStrategy::HeadingBounded`], so the result
    /// does not depend on `ext`.
    pub fn for_extension(ext: &str) -> Self {
        // Kept in the signature for callers; no extension is special-cased yet.
        let _ = ext;
        Self::HeadingBounded
    }
}
31
32pub fn chunk_document(
33    doc: &ExtractedDoc,
34    file: &str,
35    hash: &str,
36    strategy: ChunkStrategy,
37) -> Vec<Chunk> {
38    match strategy {
39        ChunkStrategy::HeadingBounded => chunk_by_headings(doc, file, hash),
40        ChunkStrategy::FixedToken { size, overlap } => {
41            chunk_by_tokens(doc, file, hash, size, overlap)
42        }
43    }
44}
45
/// Largest number of whitespace-separated words a heading- or
/// paragraph-bounded chunk may hold before it is split further.
const MAX_SECTION_TOKENS: usize = 512;
/// Number of words shared between consecutive sub-chunks when an oversized
/// section or paragraph is split. Must stay below `MAX_SECTION_TOKENS` so the
/// splitting loops always advance.
const SUB_CHUNK_OVERLAP: usize = 64;
48
49fn chunk_by_headings(doc: &ExtractedDoc, file: &str, hash: &str) -> Vec<Chunk> {
50    let text = &doc.text;
51    if text.is_empty() {
52        return Vec::new();
53    }
54
55    let heading_re = regex::Regex::new(r"(?m)^(#{1,6})\s+(.+)$|^([^\n]+)\n[=\-]{3,}$").unwrap();
56    let mut sections: Vec<(Option<String>, usize, usize)> = Vec::new();
57    let mut last_start = 0;
58    let mut last_heading: Option<String> = None;
59
60    for m in heading_re.find_iter(text) {
61        if m.start() > last_start {
62            sections.push((last_heading.clone(), last_start, m.start()));
63        }
64        last_start = m.start();
65        let heading_text = m.as_str();
66        last_heading = Some(
67            heading_text
68                .trim_start_matches('#')
69                .trim()
70                .lines()
71                .next()
72                .unwrap_or("")
73                .to_string(),
74        );
75    }
76    if last_start < text.len() {
77        sections.push((last_heading, last_start, text.len()));
78    }
79
80    if sections.is_empty() {
81        sections.push((None, 0, text.len()));
82    }
83
84    // No headings found → fall back to paragraph-bounded chunking
85    if sections.len() == 1 && sections[0].0.is_none() {
86        return chunk_by_paragraphs(doc, file, hash);
87    }
88
89    let mut chunks = Vec::new();
90    let mut chunk_idx = 0;
91
92    for (heading, start, end) in &sections {
93        let section_text = text[*start..*end].trim();
94        if section_text.is_empty() {
95            continue;
96        }
97
98        let words: Vec<&str> = section_text.split_whitespace().collect();
99        if words.len() <= MAX_SECTION_TOKENS {
100            chunks.push(Chunk {
101                id: format!("{}::chunk_{}", file, chunk_idx),
102                doc_file: file.to_string(),
103                content_hash: hash.to_string(),
104                index: chunk_idx,
105                heading: heading.clone(),
106                text: section_text.to_string(),
107                start_offset: *start,
108                end_offset: *end,
109                page: None,
110            });
111            chunk_idx += 1;
112        } else {
113            let mut w_start = 0;
114            while w_start < words.len() {
115                let w_end = (w_start + MAX_SECTION_TOKENS).min(words.len());
116                let sub_text = words[w_start..w_end].join(" ");
117                if !sub_text.is_empty() {
118                    chunks.push(Chunk {
119                        id: format!("{}::chunk_{}", file, chunk_idx),
120                        doc_file: file.to_string(),
121                        content_hash: hash.to_string(),
122                        index: chunk_idx,
123                        heading: heading.clone(),
124                        text: sub_text,
125                        start_offset: *start,
126                        end_offset: *end,
127                        page: None,
128                    });
129                    chunk_idx += 1;
130                }
131                if w_end >= words.len() {
132                    break;
133                }
134                w_start = w_end - SUB_CHUNK_OVERLAP;
135            }
136        }
137    }
138
139    chunks
140}
141
142fn chunk_by_paragraphs(doc: &ExtractedDoc, file: &str, hash: &str) -> Vec<Chunk> {
143    let text = &doc.text;
144    if text.is_empty() {
145        return Vec::new();
146    }
147
148    let paragraphs: Vec<&str> = text
149        .split("\n\n")
150        .map(|p| p.trim())
151        .filter(|p| !p.is_empty())
152        .collect();
153
154    if paragraphs.is_empty() {
155        return chunk_by_tokens(doc, file, hash, MAX_SECTION_TOKENS, SUB_CHUNK_OVERLAP);
156    }
157
158    // If there's only one big block with no blank lines, fall back to fixed-token
159    if paragraphs.len() == 1 {
160        return chunk_by_tokens(doc, file, hash, MAX_SECTION_TOKENS, SUB_CHUNK_OVERLAP);
161    }
162
163    let mut chunks = Vec::new();
164    let mut chunk_idx = 0;
165    let mut current_text = String::new();
166    let mut current_words = 0usize;
167    let mut current_start = 0usize;
168
169    for para in &paragraphs {
170        let para_words = para.split_whitespace().count();
171
172        // Single paragraph exceeds limit — flush current, then sub-chunk this paragraph
173        if para_words > MAX_SECTION_TOKENS {
174            if !current_text.is_empty() {
175                let start_offset = text.find(current_text.trim()).unwrap_or(0);
176                chunks.push(Chunk {
177                    id: format!("{}::chunk_{}", file, chunk_idx),
178                    doc_file: file.to_string(),
179                    content_hash: hash.to_string(),
180                    index: chunk_idx,
181                    heading: infer_heading(current_text.trim()),
182                    text: current_text.trim().to_string(),
183                    start_offset,
184                    end_offset: start_offset + current_text.trim().len(),
185                    page: None,
186                });
187                chunk_idx += 1;
188                current_text.clear();
189                current_words = 0;
190            }
191            let words: Vec<&str> = para.split_whitespace().collect();
192            let mut w_start = 0;
193            while w_start < words.len() {
194                let w_end = (w_start + MAX_SECTION_TOKENS).min(words.len());
195                let sub_text = words[w_start..w_end].join(" ");
196                let start_offset = text.find(&sub_text).unwrap_or(0);
197                chunks.push(Chunk {
198                    id: format!("{}::chunk_{}", file, chunk_idx),
199                    doc_file: file.to_string(),
200                    content_hash: hash.to_string(),
201                    index: chunk_idx,
202                    heading: infer_heading(&sub_text),
203                    text: sub_text.clone(),
204                    start_offset,
205                    end_offset: start_offset + sub_text.len(),
206                    page: None,
207                });
208                chunk_idx += 1;
209                if w_end >= words.len() {
210                    break;
211                }
212                w_start = w_end - SUB_CHUNK_OVERLAP;
213            }
214            continue;
215        }
216
217        // Adding this paragraph would exceed limit — flush current chunk
218        if current_words + para_words > MAX_SECTION_TOKENS && !current_text.is_empty() {
219            let trimmed = current_text.trim();
220            let start_offset = text[current_start..]
221                .find(trimmed)
222                .map(|i| current_start + i)
223                .unwrap_or(current_start);
224            chunks.push(Chunk {
225                id: format!("{}::chunk_{}", file, chunk_idx),
226                doc_file: file.to_string(),
227                content_hash: hash.to_string(),
228                index: chunk_idx,
229                heading: infer_heading(trimmed),
230                text: trimmed.to_string(),
231                start_offset,
232                end_offset: start_offset + trimmed.len(),
233                page: None,
234            });
235            chunk_idx += 1;
236            current_text.clear();
237            current_words = 0;
238            current_start = text.find(para).unwrap_or(0);
239        }
240
241        if current_text.is_empty() {
242            current_start = text.find(para).unwrap_or(0);
243        }
244
245        if !current_text.is_empty() {
246            current_text.push_str("\n\n");
247        }
248        current_text.push_str(para);
249        current_words += para_words;
250    }
251
252    // Flush remaining
253    if !current_text.is_empty() {
254        let trimmed = current_text.trim();
255        let start_offset = text[current_start..]
256            .find(trimmed)
257            .map(|i| current_start + i)
258            .unwrap_or(current_start);
259        chunks.push(Chunk {
260            id: format!("{}::chunk_{}", file, chunk_idx),
261            doc_file: file.to_string(),
262            content_hash: hash.to_string(),
263            index: chunk_idx,
264            heading: infer_heading(trimmed),
265            text: trimmed.to_string(),
266            start_offset,
267            end_offset: start_offset + trimmed.len(),
268            page: None,
269        });
270    }
271
272    chunks
273}
274
/// Guesses a heading from the first line of `text`.
///
/// The first line, trimmed, is returned as the heading when it is non-empty,
/// has at most 10 whitespace-separated words, and does not end with `.` or
/// `,`. Otherwise the result is `None`.
fn infer_heading(text: &str) -> Option<String> {
    text.lines()
        .next()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        // A title is short...
        .filter(|line| line.split_whitespace().count() <= 10)
        // ...and does not read like a sentence or a clause that continues.
        .filter(|line| !(line.ends_with('.') || line.ends_with(',')))
        .map(str::to_owned)
}
288
289fn chunk_by_tokens(
290    doc: &ExtractedDoc,
291    file: &str,
292    hash: &str,
293    size: usize,
294    overlap: usize,
295) -> Vec<Chunk> {
296    let text = &doc.text;
297    if text.is_empty() {
298        return Vec::new();
299    }
300
301    let words: Vec<&str> = text.split_whitespace().collect();
302    if words.is_empty() {
303        return Vec::new();
304    }
305
306    let mut chunks = Vec::new();
307    let mut start = 0;
308    let mut chunk_idx = 0;
309
310    while start < words.len() {
311        let end = (start + size).min(words.len());
312        let chunk_text = words[start..end].join(" ");
313
314        // Approximate byte offsets
315        let start_offset = if start == 0 {
316            0
317        } else {
318            text.find(words[start]).unwrap_or(0)
319        };
320        let end_offset = if end >= words.len() {
321            text.len()
322        } else {
323            text.find(words[end.min(words.len() - 1)])
324                .unwrap_or(text.len())
325        };
326
327        if !chunk_text.is_empty() {
328            chunks.push(Chunk {
329                id: format!("{}::chunk_{}", file, chunk_idx),
330                doc_file: file.to_string(),
331                content_hash: hash.to_string(),
332                index: chunk_idx,
333                heading: None,
334                text: chunk_text,
335                start_offset,
336                end_offset,
337                page: None,
338            });
339            chunk_idx += 1;
340        }
341
342        if end >= words.len() {
343            break;
344        }
345        start = end - overlap;
346    }
347
348    chunks
349}