entelix_rag/splitter/
markdown.rs1use std::sync::OnceLock;
36
37use regex::Regex;
38
39use crate::document::{Document, Lineage};
40use crate::splitter::TextSplitter;
41
/// Heading levels a splitter built with [`MarkdownStructureSplitter::new`]
/// splits on: `#`, `##` and `###` (levels 1 through 3).
pub const DEFAULT_MARKDOWN_HEADING_LEVELS: &[u8] = &[1, 2, 3];
46
/// Identifier of this splitter: returned by `TextSplitter::name` and passed to
/// `Lineage::from_split` for every chunk the splitter produces.
const SPLITTER_NAME: &str = "markdown-structure";
50
51fn heading_regex() -> &'static Regex {
56 static RE: OnceLock<Regex> = OnceLock::new();
57 RE.get_or_init(|| {
58 Regex::new(r"^(#{1,6})\s+\S").expect("heading regex compiles")
61 })
62}
63
/// Splitter that cuts a Markdown document into sections at heading lines.
///
/// Only headings whose level is listed in `heading_levels` start a new
/// section; every other line is appended to the section currently being
/// built.
#[derive(Clone, Debug)]
pub struct MarkdownStructureSplitter {
    /// Heading levels that start a new section. Stored behind an `Arc`, so
    /// cloning the splitter does not copy the list.
    heading_levels: std::sync::Arc<[u8]>,
}
73
74impl MarkdownStructureSplitter {
75 #[must_use]
78 pub fn new() -> Self {
79 Self {
80 heading_levels: DEFAULT_MARKDOWN_HEADING_LEVELS.into(),
81 }
82 }
83
84 #[must_use]
88 pub fn with_heading_levels<I>(mut self, levels: I) -> Self
89 where
90 I: IntoIterator<Item = u8>,
91 {
92 self.heading_levels = levels.into_iter().filter(|l| (1..=6).contains(l)).collect();
93 self
94 }
95
96 #[must_use]
98 pub fn heading_levels(&self) -> &[u8] {
99 &self.heading_levels
100 }
101
102 fn matches_level(&self, level: u8) -> bool {
104 self.heading_levels.contains(&level)
105 }
106}
107
impl Default for MarkdownStructureSplitter {
    /// Equivalent to [`MarkdownStructureSplitter::new`].
    fn default() -> Self {
        Self::new()
    }
}
113
114impl TextSplitter for MarkdownStructureSplitter {
115 fn name(&self) -> &'static str {
116 SPLITTER_NAME
117 }
118
119 fn split(&self, document: &Document) -> Vec<Document> {
120 let sections = collect_sections(self, &document.content);
121 let total = sections.len();
122 if total == 0 {
123 return Vec::new();
124 }
125 #[allow(clippy::cast_possible_truncation)]
126 let total_u32 = total.min(u32::MAX as usize) as u32;
127 sections
128 .into_iter()
129 .enumerate()
130 .map(|(idx, content)| {
131 #[allow(clippy::cast_possible_truncation)]
132 let idx_u32 = idx.min(u32::MAX as usize) as u32;
133 let lineage =
134 Lineage::from_split(document.id.clone(), idx_u32, total_u32, SPLITTER_NAME);
135 document.child(content, lineage)
136 })
137 .collect()
138 }
139}
140
141fn collect_sections(splitter: &MarkdownStructureSplitter, text: &str) -> Vec<String> {
145 if text.is_empty() {
146 return Vec::new();
147 }
148 let mut sections: Vec<String> = Vec::new();
149 let mut current = String::new();
150 for line in text.split_inclusive('\n') {
151 if let Some(level) = matching_heading_level(splitter, line) {
152 if !current.is_empty() {
155 sections.push(std::mem::take(&mut current));
156 }
157 current.push_str(line);
158 let _ = level;
161 } else {
162 current.push_str(line);
163 }
164 }
165 if !current.is_empty() {
166 sections.push(current);
167 }
168 sections
169}
170
171fn matching_heading_level(splitter: &MarkdownStructureSplitter, line: &str) -> Option<u8> {
175 let captures = heading_regex().captures(line.trim_end_matches('\n'))?;
176 #[allow(clippy::cast_possible_truncation)]
179 let level = captures.get(1)?.as_str().len() as u8;
180 splitter.matches_level(level).then_some(level)
181}
182
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::indexing_slicing)]
mod tests {
    use super::*;
    use crate::document::Source;
    use entelix_memory::Namespace;

    /// Namespace fixture shared by every test document.
    fn ns() -> Namespace {
        let tenant = entelix_core::TenantId::new("acme");
        Namespace::new(tenant)
    }

    /// Builds a root document with id `doc` holding `content`.
    fn doc(content: &str) -> Document {
        let source = Source::now("test://", "test");
        Document::root("doc", content, source, ns())
    }

    /// Splits `text` with a default-configured splitter.
    fn split_default(text: &str) -> Vec<Document> {
        MarkdownStructureSplitter::new().split(&doc(text))
    }

    #[test]
    fn empty_input_produces_no_chunks() {
        assert!(split_default("").is_empty());
    }

    #[test]
    fn no_headings_keeps_input_as_single_chunk() {
        let text = "Just a paragraph.\n\nAnother paragraph.\n";
        let chunks = split_default(text);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn h1_h2_split_at_default_levels() {
        let text = "# Introduction\nIntro body.\n\n## Overview\nOverview body.\n\n## Details\nDetails body.\n";
        let expected_starts = ["# Introduction", "## Overview", "## Details"];
        let chunks = split_default(text);
        assert_eq!(chunks.len(), 3);
        for (chunk, start) in chunks.iter().zip(expected_starts) {
            assert!(chunk.content.starts_with(start));
        }
    }

    #[test]
    fn heading_attached_to_body_not_orphaned() {
        // The heading and the body lines that follow it must land in the
        // same chunk.
        let text = "# Title\nbody line one.\nbody line two.\n";
        let chunks = split_default(text);
        assert_eq!(chunks.len(), 1);
        for needle in ["# Title", "body line one", "body line two"] {
            assert!(chunks[0].content.contains(needle));
        }
    }

    #[test]
    fn deeper_headings_stay_inline_under_default_config() {
        // Level 4 is not among the default split levels.
        let text = "## Section\nintro.\n\n#### Sub-detail\ndetail body.\n";
        let chunks = split_default(text);
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("#### Sub-detail"));
    }

    #[test]
    fn narrowed_levels_skip_h2_split() {
        let text = "# A\nbody A.\n\n## B\nbody B.\n";
        let h1_only = MarkdownStructureSplitter::new().with_heading_levels([1]);
        let chunks = h1_only.split(&doc(text));
        assert_eq!(chunks.len(), 1);
        for needle in ["# A", "## B"] {
            assert!(chunks[0].content.contains(needle));
        }
    }

    #[test]
    fn lineage_carries_chunk_metadata() {
        let text = "# A\nbody.\n# B\nbody.\n";
        let chunks = split_default(text);
        assert_eq!(chunks.len(), 2);
        for (expected_index, chunk) in (0u32..).zip(&chunks) {
            let lineage = chunk.lineage.as_ref().unwrap();
            assert_eq!(lineage.chunk_index, expected_index);
            assert_eq!(lineage.total_chunks, 2);
            assert_eq!(lineage.splitter, "markdown-structure");
            assert_eq!(lineage.parent_id.as_str(), "doc");
        }
    }

    #[test]
    fn level_clamp_silently_ignores_invalid_levels() {
        // 0 and 7 are outside 1..=6 and must be dropped.
        let splitter = MarkdownStructureSplitter::new().with_heading_levels([0, 2, 7]);
        assert_eq!(splitter.heading_levels(), &[2]);
    }

    #[test]
    fn rejoined_chunks_reproduce_the_input() {
        // Splitting must not lose or alter any byte of the input.
        let text = "# A\nbody A.\n\n## B\nbody B.\n\n### C\nbody C.\nfinal.\n";
        let mut joined = String::new();
        for chunk in &split_default(text) {
            joined.push_str(chunk.content.as_str());
        }
        assert_eq!(joined, text);
    }

    #[test]
    fn child_id_carries_chunk_index_suffix() {
        let text = "# A\nbody.\n# B\nbody.\n";
        for (idx, chunk) in split_default(text).iter().enumerate() {
            assert_eq!(chunk.id.as_str(), format!("doc:{idx}"));
        }
    }

    #[test]
    fn heading_regex_round_trips_levels_1_through_6() {
        let cases = [
            ("# h1", 1),
            ("## h2", 2),
            ("### h3", 3),
            ("#### h4", 4),
            ("##### h5", 5),
            ("###### h6", 6),
        ];
        for (line, expected_level) in cases {
            let hash_count = heading_regex()
                .captures(line)
                .unwrap()
                .get(1)
                .unwrap()
                .as_str()
                .len();
            assert_eq!(hash_count, expected_level);
        }
        // Seven `#` characters is not a heading.
        assert!(!heading_regex().is_match("####### too deep"));
    }
}