// enact_memory/chunker.rs
//! Line-based markdown chunker — splits documents into semantic chunks.
//!
//! Splits on markdown headings and paragraph boundaries, respecting
//! a max token limit per chunk. Preserves heading context.

/// A single chunk of text with metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Zero-based position of this chunk within the chunked document.
    pub index: usize,
    /// The chunk text, trimmed of surrounding whitespace.
    pub content: String,
    /// The markdown heading line this chunk falls under, if any
    /// (e.g. `"## Section A"`); `None` for preamble text.
    pub heading: Option<String>,
}

14/// Split markdown text into chunks, each under `max_tokens` approximate tokens.
15///
16/// Strategy:
17/// 1. Split on `## ` and `# ` headings (keeps heading with its content)
18/// 2. If a section exceeds `max_tokens`, split on blank lines (paragraphs)
19/// 3. If a paragraph still exceeds, split on line boundaries
20///
21/// Token estimation: ~4 chars per token (rough English average).
22pub fn chunk_markdown(text: &str, max_tokens: usize) -> Vec<Chunk> {
23    if text.trim().is_empty() {
24        return Vec::new();
25    }
26
27    let max_chars = max_tokens * 4;
28    let sections = split_on_headings(text);
29    let mut chunks = Vec::new();
30
31    for (heading, body) in sections {
32        let full = if let Some(ref h) = heading {
33            format!("{h}\n{body}")
34        } else {
35            body.clone()
36        };
37
38        if full.len() <= max_chars {
39            chunks.push(Chunk {
40                index: chunks.len(),
41                content: full.trim().to_string(),
42                heading: heading.clone(),
43            });
44        } else {
45            // Split on paragraphs (blank lines)
46            let paragraphs = split_on_blank_lines(&body);
47            let mut current = heading
48                .as_ref()
49                .map_or_else(String::new, |h| format!("{h}\n"));
50
51            for para in paragraphs {
52                if current.len() + para.len() > max_chars && !current.trim().is_empty() {
53                    chunks.push(Chunk {
54                        index: chunks.len(),
55                        content: current.trim().to_string(),
56                        heading: heading.clone(),
57                    });
58                    current = heading
59                        .as_ref()
60                        .map_or_else(String::new, |h| format!("{h}\n"));
61                }
62
63                if para.len() > max_chars {
64                    // Paragraph too big — split on lines
65                    if !current.trim().is_empty() {
66                        chunks.push(Chunk {
67                            index: chunks.len(),
68                            content: current.trim().to_string(),
69                            heading: heading.clone(),
70                        });
71                        current = heading
72                            .as_ref()
73                            .map_or_else(String::new, |h| format!("{h}\n"));
74                    }
75                    for line_chunk in split_on_lines(&para, max_chars) {
76                        chunks.push(Chunk {
77                            index: chunks.len(),
78                            content: line_chunk.trim().to_string(),
79                            heading: heading.clone(),
80                        });
81                    }
82                } else {
83                    current.push_str(&para);
84                    current.push('\n');
85                }
86            }
87
88            if !current.trim().is_empty() {
89                chunks.push(Chunk {
90                    index: chunks.len(),
91                    content: current.trim().to_string(),
92                    heading: heading.clone(),
93                });
94            }
95        }
96    }
97
98    // Filter out empty chunks
99    chunks.retain(|c| !c.content.is_empty());
100
101    // Re-index
102    for (i, chunk) in chunks.iter_mut().enumerate() {
103        chunk.index = i;
104    }
105
106    chunks
107}
108
/// Split text into `(heading, body)` sections.
///
/// A heading is any line starting with `# `, `## `, or `### `; it opens a
/// new section and is stored (with its markers) as the section's heading.
/// Text before the first heading becomes a section with `None` heading.
fn split_on_headings(text: &str) -> Vec<(Option<String>, String)> {
    let mut sections: Vec<(Option<String>, String)> = Vec::new();
    let mut heading: Option<String> = None;
    let mut body = String::new();

    for line in text.lines() {
        let starts_section = ["# ", "## ", "### "].iter().any(|p| line.starts_with(p));
        if starts_section {
            // Close out the previous section unless it is completely empty.
            if heading.is_some() || !body.trim().is_empty() {
                sections.push((heading.take(), std::mem::take(&mut body)));
            }
            heading = Some(line.to_owned());
        } else {
            body.push_str(line);
            body.push('\n');
        }
    }

    if heading.is_some() || !body.trim().is_empty() {
        sections.push((heading, body));
    }

    sections
}

/// Split text on blank lines (paragraph boundaries).
///
/// Whitespace-only lines act as separators and are dropped; each returned
/// paragraph keeps its internal newlines plus a trailing `'\n'`.
fn split_on_blank_lines(text: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    let mut para = String::new();

    for line in text.lines() {
        if line.trim().is_empty() {
            // Separator: close the paragraph in progress, if any. `para`
            // only ever holds non-blank lines, so emptiness check suffices.
            if !para.is_empty() {
                paragraphs.push(std::mem::take(&mut para));
            }
        } else {
            para.push_str(line);
            para.push('\n');
        }
    }

    if !para.is_empty() {
        paragraphs.push(para);
    }

    paragraphs
}

/// Split text on line boundaries to fit within `max_chars` bytes per chunk.
///
/// Lines are packed greedily; a line that alone exceeds `max_chars` is
/// hard-split on `char` boundaries (previously such a line was emitted as
/// one oversize chunk, violating the budget). Each chunk ends with `'\n'`.
fn split_on_lines(text: &str, max_chars: usize) -> Vec<String> {
    let mut chunks = Vec::new();
    let mut current = String::new();

    for line in text.lines() {
        // A single line bigger than the budget gets hard-split by chars.
        if line.len() + 1 > max_chars {
            if !current.is_empty() {
                chunks.push(std::mem::take(&mut current));
            }
            let mut piece = String::new();
            for ch in line.chars() {
                // +1 reserves room for the trailing newline on this piece.
                if !piece.is_empty() && piece.len() + ch.len_utf8() + 1 > max_chars {
                    piece.push('\n');
                    chunks.push(std::mem::take(&mut piece));
                }
                piece.push(ch);
            }
            piece.push('\n');
            chunks.push(piece);
            continue;
        }

        if !current.is_empty() && current.len() + line.len() + 1 > max_chars {
            chunks.push(std::mem::take(&mut current));
        }
        current.push_str(line);
        current.push('\n');
    }

    if !current.is_empty() {
        chunks.push(current);
    }

    chunks
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_text() {
        // Empty and whitespace-only input yields no chunks.
        assert!(chunk_markdown("", 512).is_empty());
        assert!(chunk_markdown("   ", 512).is_empty());
    }

    #[test]
    fn single_short_paragraph() {
        let result = chunk_markdown("Hello world", 512);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].content, "Hello world");
        assert!(result[0].heading.is_none());
    }

    #[test]
    fn heading_sections() {
        let doc = "# Title\nSome intro.\n\n## Section A\nContent A.\n\n## Section B\nContent B.";
        assert!(chunk_markdown(doc, 512).len() >= 3);
    }

    #[test]
    fn respects_max_tokens() {
        // 200 sentences of ~64 chars forces paragraph/line splitting at 50 tokens.
        let long_text = (0..200)
            .map(|i| format!("This is sentence number {i} with some extra words to fill it up."))
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunk_markdown(&long_text, 50);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| c.content.len() <= 300));
    }

    #[test]
    fn indexes_are_sequential() {
        let doc = "# A\nContent A\n\n# B\nContent B\n\n# C\nContent C";
        for (i, chunk) in chunk_markdown(doc, 512).iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
    }
}