/// One piece of a Markdown document, as produced by [`chunk_markdown`].
///
/// Derives `PartialEq`/`Eq` so chunks can be compared directly (e.g. with
/// `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Zero-based position of this chunk in the sequence it was returned in.
    pub index: usize,
    /// Text of the chunk with leading and trailing whitespace removed.
    pub content: String,
    /// Heading line (including its `#` markers) of the section the chunk came
    /// from, or `None` for text that precedes the first heading.
    pub heading: Option<String>,
}
13
14pub fn chunk_markdown(text: &str, max_tokens: usize) -> Vec<Chunk> {
23 if text.trim().is_empty() {
24 return Vec::new();
25 }
26
27 let max_chars = max_tokens * 4;
28 let sections = split_on_headings(text);
29 let mut chunks = Vec::new();
30
31 for (heading, body) in sections {
32 let full = if let Some(ref h) = heading {
33 format!("{h}\n{body}")
34 } else {
35 body.clone()
36 };
37
38 if full.len() <= max_chars {
39 chunks.push(Chunk {
40 index: chunks.len(),
41 content: full.trim().to_string(),
42 heading: heading.clone(),
43 });
44 } else {
45 let paragraphs = split_on_blank_lines(&body);
47 let mut current = heading
48 .as_ref()
49 .map_or_else(String::new, |h| format!("{h}\n"));
50
51 for para in paragraphs {
52 if current.len() + para.len() > max_chars && !current.trim().is_empty() {
53 chunks.push(Chunk {
54 index: chunks.len(),
55 content: current.trim().to_string(),
56 heading: heading.clone(),
57 });
58 current = heading
59 .as_ref()
60 .map_or_else(String::new, |h| format!("{h}\n"));
61 }
62
63 if para.len() > max_chars {
64 if !current.trim().is_empty() {
66 chunks.push(Chunk {
67 index: chunks.len(),
68 content: current.trim().to_string(),
69 heading: heading.clone(),
70 });
71 current = heading
72 .as_ref()
73 .map_or_else(String::new, |h| format!("{h}\n"));
74 }
75 for line_chunk in split_on_lines(¶, max_chars) {
76 chunks.push(Chunk {
77 index: chunks.len(),
78 content: line_chunk.trim().to_string(),
79 heading: heading.clone(),
80 });
81 }
82 } else {
83 current.push_str(¶);
84 current.push('\n');
85 }
86 }
87
88 if !current.trim().is_empty() {
89 chunks.push(Chunk {
90 index: chunks.len(),
91 content: current.trim().to_string(),
92 heading: heading.clone(),
93 });
94 }
95 }
96 }
97
98 chunks.retain(|c| !c.content.is_empty());
100
101 for (i, chunk) in chunks.iter_mut().enumerate() {
103 chunk.index = i;
104 }
105
106 chunks
107}
108
/// Splits `text` into `(heading, body)` sections at level 1–3 ATX headings.
///
/// A heading is a line starting with `# `, `## ` or `### `; deeper headings
/// stay in the body of the enclosing section. The heading line itself is
/// returned verbatim (with its `#` markers) and is not part of the body. Text
/// before the first heading is returned with a `None` heading, unless it is
/// whitespace-only, in which case it is dropped.
///
/// Lines inside fenced code blocks (opened by a line starting with three
/// backticks or `~~~`, closed by a later line starting with the same marker)
/// are never treated as headings, so `# comment` lines in code samples do not
/// split the section. Fence detection is deliberately simple: leading
/// whitespace is ignored and fence length is not compared.
///
/// Every body line is terminated with `\n`.
fn split_on_headings(text: &str) -> Vec<(Option<String>, String)> {
    const HEADING_PREFIXES: [&str; 3] = ["# ", "## ", "### "];
    const FENCE_MARKERS: [&str; 2] = ["```", "~~~"];

    let mut sections = Vec::new();
    let mut current_heading: Option<String> = None;
    let mut current_body = String::new();
    // Marker that opened the fenced code block we are currently inside, if any.
    let mut open_fence: Option<&str> = None;

    for line in text.lines() {
        let trimmed = line.trim_start();
        let was_in_fence = open_fence.is_some();
        open_fence = match open_fence {
            Some(marker) if trimmed.starts_with(marker) => None,
            Some(marker) => Some(marker),
            None => FENCE_MARKERS
                .iter()
                .copied()
                .find(|marker| trimmed.starts_with(*marker)),
        };

        let is_heading = !was_in_fence
            && HEADING_PREFIXES
                .iter()
                .any(|prefix| line.starts_with(*prefix));

        if is_heading {
            if current_heading.is_some() || !current_body.trim().is_empty() {
                sections.push((current_heading.take(), std::mem::take(&mut current_body)));
            } else {
                // Only whitespace came before the first heading; drop it so it
                // does not leak into that heading's body.
                current_body.clear();
            }
            current_heading = Some(line.to_string());
        } else {
            current_body.push_str(line);
            current_body.push('\n');
        }
    }

    if current_heading.is_some() || !current_body.trim().is_empty() {
        sections.push((current_heading, current_body));
    }

    sections
}
134
/// Groups the lines of `text` into paragraphs separated by blank lines.
///
/// A blank line is one that is empty or whitespace-only; runs of blank lines
/// count as a single separator and never produce empty paragraphs. Each
/// returned paragraph is its lines joined back together, every line
/// terminated with `\n`.
fn split_on_blank_lines(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    // Lines of the paragraph being collected. It only ever receives non-blank
    // lines, so "non-empty" is the same as "has visible content".
    let mut buffer = String::new();

    for line in text.lines() {
        let is_blank = line.trim().is_empty();
        match (is_blank, buffer.is_empty()) {
            // Separator with nothing collected yet: nothing to emit.
            (true, true) => {}
            // Separator closing a paragraph: hand the buffer over.
            (true, false) => out.push(std::mem::take(&mut buffer)),
            (false, _) => {
                buffer.push_str(line);
                buffer.push('\n');
            }
        }
    }

    if !buffer.is_empty() {
        out.push(buffer);
    }

    out
}
158
/// Packs the lines of `text` into pieces of at most `max_chars` bytes.
///
/// Consecutive lines are accumulated (each terminated with `\n`) until adding
/// the next one would push the piece past `max_chars`; the piece is then
/// emitted and a new one started. The trailing newline counts toward the
/// size, so a piece may be `max_chars + 1` bytes before the caller trims it.
///
/// A single line longer than `max_chars` is broken into several fragments,
/// preferably after whitespace and never inside a UTF-8 character, so that
/// unwrapped paragraphs cannot produce arbitrarily large pieces. Each
/// fragment is then treated like a line of its own. With `max_chars == 0`
/// no fragmenting happens and every line becomes its own piece.
fn split_on_lines(text: &str, max_chars: usize) -> Vec<String> {
    let mut chunks = Vec::new();
    let mut current = String::new();

    let pieces = text
        .lines()
        .flat_map(|line| split_long_line(line, max_chars));

    for piece in pieces {
        // `+ 1` accounts for the newline appended after every piece.
        if !current.is_empty() && current.len() + piece.len() + 1 > max_chars {
            chunks.push(std::mem::take(&mut current));
        }
        current.push_str(piece);
        current.push('\n');
    }

    if !current.is_empty() {
        chunks.push(current);
    }

    chunks
}

/// Breaks `line` into fragments of at most `limit` bytes each.
///
/// Lines that already fit, and every line when `limit` is zero, are returned
/// unchanged as a single fragment. Otherwise each cut is placed just after the
/// last whitespace character within the limit, or at the last character
/// boundary within the limit when there is no whitespace. If `limit` is
/// smaller than the next character, that one character is taken whole so the
/// loop always makes progress.
fn split_long_line(line: &str, limit: usize) -> Vec<&str> {
    if limit == 0 || line.len() <= limit {
        return vec![line];
    }

    let mut pieces = Vec::new();
    let mut rest = line;

    while rest.len() > limit {
        // Largest char boundary not exceeding `limit` (index 0 always is one).
        let mut cut = limit;
        while !rest.is_char_boundary(cut) {
            cut -= 1;
        }
        if cut == 0 {
            cut = rest.chars().next().map_or(rest.len(), char::len_utf8);
        }

        // Prefer to break right after the last whitespace inside the window.
        let after_whitespace = rest[..cut]
            .char_indices()
            .rev()
            .find(|&(_, c)| c.is_whitespace())
            .map(|(idx, c)| idx + c.len_utf8());
        if let Some(pos) = after_whitespace {
            cut = pos;
        }

        let (piece, tail) = rest.split_at(cut);
        pieces.push(piece);
        rest = tail;
    }

    if !rest.is_empty() {
        pieces.push(rest);
    }

    pieces
}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty and whitespace-only documents produce no chunks.
    #[test]
    fn empty_text() {
        assert!(chunk_markdown("", 512).is_empty());
        assert!(chunk_markdown("   ", 512).is_empty());
    }

    /// Text without headings that fits the budget is returned as one chunk.
    #[test]
    fn single_short_paragraph() {
        let chunks = chunk_markdown("Hello world", 512);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Hello world");
        assert!(chunks[0].heading.is_none());
    }

    /// Each short heading-delimited section becomes exactly one chunk that
    /// records its heading line.
    #[test]
    fn heading_sections() {
        let text = "# Title\nSome intro.\n\n## Section A\nContent A.\n\n## Section B\nContent B.";
        let chunks = chunk_markdown(text, 512);

        let headings: Vec<_> = chunks.iter().map(|c| c.heading.as_deref()).collect();
        assert_eq!(
            headings,
            [Some("# Title"), Some("## Section A"), Some("## Section B")]
        );
        assert_eq!(chunks[0].content, "# Title\nSome intro.");
    }

    /// A long heading-less document is split, and every chunk stays within the
    /// byte budget derived from `max_tokens` (four bytes per token).
    #[test]
    fn respects_max_tokens() {
        let max_tokens = 50;
        let long_text: String = (0..200).fold(String::new(), |mut s, i| {
            use std::fmt::Write;
            let _ = writeln!(
                s,
                "This is sentence number {i} with some extra words to fill it up."
            );
            s
        });

        let chunks = chunk_markdown(&long_text, max_tokens);
        assert!(chunks.len() > 1);

        let budget = max_tokens * 4;
        for chunk in &chunks {
            assert!(
                chunk.content.len() <= budget,
                "chunk {} is {} bytes, budget is {budget}",
                chunk.index,
                chunk.content.len()
            );
        }
    }

    /// Chunk indices count up from zero without gaps.
    #[test]
    fn indexes_are_sequential() {
        let text = "# A\nContent A\n\n# B\nContent B\n\n# C\nContent C";
        let chunks = chunk_markdown(text, 512);
        assert_eq!(chunks.len(), 3);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
    }
}