entelix_rag/splitter/
markdown.rs

1//! `MarkdownStructureSplitter` — heading-aware splitter that
2//! preserves the document's logical sectioning.
3//!
//! Splits at ATX headings (`#`, `##`, `###`, …) so each chunk
//! corresponds to a meaningful section of the source document. The
//! heading line stays attached to its body (an "orphan heading"
//! chunk would lose retrieval context). Headings whose level is
//! *not* in the configured split set stay inline in the enclosing
//! chunk; a chunk ends at the next heading whose level *is*
//! configured, whatever its depth relative to the current heading.
10//!
11//! Operators tune the split granularity through the *heading
12//! levels* configuration: with `[1, 2]` only `#` and `##` start a
13//! new chunk, deeper headings stay inline. The default
14//! `[1, 2, 3]` covers the typical "split at major sections, keep
15//! sub-sections inline" shape.
16//!
17//! ## Algorithm
18//!
//! 1. Walk the input line by line. Lines matching the ATX heading
//!    regex `^(#{1,6})\s+\S` whose level appears in
//!    [`MarkdownStructureSplitter::heading_levels`] open a new section.
22//! 2. Buffer lines under the current heading until the next
23//!    matching heading (or end of input).
24//! 3. Emit one [`Document`] per accumulated section, preserving
25//!    [`Lineage`].
26//!
//! Setext headings (`===` / `---` underlines) are never split on.
//! The splitter is also not fence-aware: the regex is applied to
//! every line, so a `#`-prefixed line inside a triple-backtick code
//! block (for example a shell comment) is treated like a heading
//! and opens a new chunk when its level is configured. Both are
//! deliberate trade-offs that keep the implementation simple.
//! Documents relying on setext headings or carrying `#`-comments
//! inside code blocks need the recursive splitter or a future
//! markdown-fenced companion.
34
35use std::sync::OnceLock;
36
37use regex::Regex;
38
39use crate::document::{Document, Lineage};
40use crate::splitter::TextSplitter;
41
/// Default ATX heading levels that open a new chunk. `[1, 2, 3]`
/// splits at `#`, `##`, `###`; deeper sub-headings (`####+`) stay
/// inline.
///
/// This is the configuration installed by
/// [`MarkdownStructureSplitter::new`].
pub const DEFAULT_MARKDOWN_HEADING_LEVELS: &[u8] = &[1, 2, 3];
46
/// Stable identifier surfaced on every produced chunk's
/// [`Lineage::splitter`](crate::Lineage::splitter) field. The same
/// string is returned by this splitter's `TextSplitter::name`.
const SPLITTER_NAME: &str = "markdown-structure";
50
51/// ATX heading regex — `^` anchors at line start, `#{1,6}` matches
52/// 1-6 hashes, followed by required whitespace and the heading
53/// text. Compiled once via `OnceLock` so repeated splits don't pay
54/// regex compilation per call.
55fn heading_regex() -> &'static Regex {
56    static RE: OnceLock<Regex> = OnceLock::new();
57    RE.get_or_init(|| {
58        // Safe `unwrap` — pattern is a compile-time constant whose
59        // validity is verified by the regex round-trip test below.
60        Regex::new(r"^(#{1,6})\s+\S").expect("heading regex compiles")
61    })
62}
63
/// Heading-aware markdown splitter.
///
/// Construct via [`Self::new`] for the default `[1, 2, 3]`
/// configuration, or [`Self::with_heading_levels`] to widen / narrow
/// the split granularity.
#[derive(Clone, Debug)]
pub struct MarkdownStructureSplitter {
    /// ATX heading depths that open a new chunk. Held behind an
    /// `Arc`, so cloning the splitter shares the slice instead of
    /// copying it.
    heading_levels: std::sync::Arc<[u8]>,
}
73
74impl MarkdownStructureSplitter {
75    /// Build with the default `[1, 2, 3]` heading-levels
76    /// configuration.
77    #[must_use]
78    pub fn new() -> Self {
79        Self {
80            heading_levels: DEFAULT_MARKDOWN_HEADING_LEVELS.into(),
81        }
82    }
83
84    /// Override which ATX heading levels open a new chunk. Levels
85    /// outside `1..=6` are silently ignored at split time. Order
86    /// is irrelevant — duplicates are tolerated.
87    #[must_use]
88    pub fn with_heading_levels<I>(mut self, levels: I) -> Self
89    where
90        I: IntoIterator<Item = u8>,
91    {
92        self.heading_levels = levels.into_iter().filter(|l| (1..=6).contains(l)).collect();
93        self
94    }
95
96    /// Borrow the configured heading levels.
97    #[must_use]
98    pub fn heading_levels(&self) -> &[u8] {
99        &self.heading_levels
100    }
101
102    /// Whether `level` matches the configured split set.
103    fn matches_level(&self, level: u8) -> bool {
104        self.heading_levels.contains(&level)
105    }
106}
107
/// `Default` mirrors [`MarkdownStructureSplitter::new`].
impl Default for MarkdownStructureSplitter {
    /// Equivalent to calling [`Self::new`].
    fn default() -> Self {
        Self::new()
    }
}
113
114impl TextSplitter for MarkdownStructureSplitter {
115    fn name(&self) -> &'static str {
116        SPLITTER_NAME
117    }
118
119    fn split(&self, document: &Document) -> Vec<Document> {
120        let sections = collect_sections(self, &document.content);
121        let total = sections.len();
122        if total == 0 {
123            return Vec::new();
124        }
125        #[allow(clippy::cast_possible_truncation)]
126        let total_u32 = total.min(u32::MAX as usize) as u32;
127        sections
128            .into_iter()
129            .enumerate()
130            .map(|(idx, content)| {
131                #[allow(clippy::cast_possible_truncation)]
132                let idx_u32 = idx.min(u32::MAX as usize) as u32;
133                let lineage =
134                    Lineage::from_split(document.id.clone(), idx_u32, total_u32, SPLITTER_NAME);
135                document.child(content, lineage)
136            })
137            .collect()
138    }
139}
140
141/// Walk lines and accumulate sections. The opening heading line
142/// (if any) stays attached to the section body so retrieval hits
143/// land on a heading-anchored payload.
144fn collect_sections(splitter: &MarkdownStructureSplitter, text: &str) -> Vec<String> {
145    if text.is_empty() {
146        return Vec::new();
147    }
148    let mut sections: Vec<String> = Vec::new();
149    let mut current = String::new();
150    for line in text.split_inclusive('\n') {
151        if let Some(level) = matching_heading_level(splitter, line) {
152            // A section break — emit accumulated content and start a
153            // new section seeded with this heading line.
154            if !current.is_empty() {
155                sections.push(std::mem::take(&mut current));
156            }
157            current.push_str(line);
158            // `level` is the heading depth we matched on; bound by
159            // the regex (1..=6) so the `_` discard is safe.
160            let _ = level;
161        } else {
162            current.push_str(line);
163        }
164    }
165    if !current.is_empty() {
166        sections.push(current);
167    }
168    sections
169}
170
171/// Return the heading level of `line` when it matches a configured
172/// split level, `None` otherwise. Lines without a heading match —
173/// or with a level outside the configured set — return `None`.
174fn matching_heading_level(splitter: &MarkdownStructureSplitter, line: &str) -> Option<u8> {
175    let captures = heading_regex().captures(line.trim_end_matches('\n'))?;
176    // Group 1 is the run of `#` characters. Length is bounded by the
177    // regex repetition cap (`{1,6}`) so cast to u8 is exact.
178    #[allow(clippy::cast_possible_truncation)]
179    let level = captures.get(1)?.as_str().len() as u8;
180    splitter.matches_level(level).then_some(level)
181}
182
// Unit tests for the markdown structure splitter. `unwrap` and
// direct indexing are allowed here: a panic is the desired failure
// mode inside a test.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::indexing_slicing)]
mod tests {
    use super::*;
    use crate::document::Source;
    use entelix_memory::Namespace;

    // Fixed namespace shared by every test document.
    fn ns() -> Namespace {
        Namespace::new(entelix_core::TenantId::new("acme"))
    }

    // Build a root document with id "doc" wrapping `content`.
    fn doc(content: &str) -> Document {
        Document::root("doc", content, Source::now("test://", "test"), ns())
    }

    #[test]
    fn empty_input_produces_no_chunks() {
        let chunks = MarkdownStructureSplitter::new().split(&doc(""));
        assert!(chunks.is_empty());
    }

    #[test]
    fn no_headings_keeps_input_as_single_chunk() {
        let text = "Just a paragraph.\n\nAnother paragraph.\n";
        let chunks = MarkdownStructureSplitter::new().split(&doc(text));
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn h1_h2_split_at_default_levels() {
        // Every configured heading opens its own chunk, including an
        // `##` nested under an `#`.
        let text = "# Introduction\nIntro body.\n\n## Overview\nOverview body.\n\n## Details\nDetails body.\n";
        let chunks = MarkdownStructureSplitter::new().split(&doc(text));
        assert_eq!(chunks.len(), 3);
        assert!(chunks[0].content.starts_with("# Introduction"));
        assert!(chunks[1].content.starts_with("## Overview"));
        assert!(chunks[2].content.starts_with("## Details"));
    }

    #[test]
    fn heading_attached_to_body_not_orphaned() {
        // The first chunk's body must include both the heading line
        // and the paragraph below — an orphan heading chunk would
        // lose retrieval context.
        let text = "# Title\nbody line one.\nbody line two.\n";
        let chunks = MarkdownStructureSplitter::new().split(&doc(text));
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("# Title"));
        assert!(chunks[0].content.contains("body line one"));
        assert!(chunks[0].content.contains("body line two"));
    }

    #[test]
    fn deeper_headings_stay_inline_under_default_config() {
        // Default config splits at 1..=3; `####` stays attached to
        // its parent section.
        let text = "## Section\nintro.\n\n#### Sub-detail\ndetail body.\n";
        let chunks = MarkdownStructureSplitter::new().split(&doc(text));
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("#### Sub-detail"));
    }

    #[test]
    fn narrowed_levels_skip_h2_split() {
        let text = "# A\nbody A.\n\n## B\nbody B.\n";
        // Only split on H1 — H2 stays inline under its parent.
        let chunks = MarkdownStructureSplitter::new()
            .with_heading_levels([1])
            .split(&doc(text));
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("# A"));
        assert!(chunks[0].content.contains("## B"));
    }

    #[test]
    fn lineage_carries_chunk_metadata() {
        // Each chunk records its own index, the total chunk count,
        // the splitter name and the id of the document it came from.
        let text = "# A\nbody.\n# B\nbody.\n";
        let chunks = MarkdownStructureSplitter::new().split(&doc(text));
        assert_eq!(chunks.len(), 2);
        for (idx, chunk) in chunks.iter().enumerate() {
            let lineage = chunk.lineage.as_ref().unwrap();
            // `idx` is 0 or 1 here, so the cast cannot truncate.
            #[allow(clippy::cast_possible_truncation)]
            let idx_u32 = idx as u32;
            assert_eq!(lineage.chunk_index, idx_u32);
            assert_eq!(lineage.total_chunks, 2);
            assert_eq!(lineage.splitter, "markdown-structure");
            assert_eq!(lineage.parent_id.as_str(), "doc");
        }
    }

    #[test]
    fn level_clamp_silently_ignores_invalid_levels() {
        // Levels outside `1..=6` (regex max) are dropped at config
        // time — `0` and `7` here disappear, leaving only `2`.
        let splitter = MarkdownStructureSplitter::new().with_heading_levels([0, 2, 7]);
        assert_eq!(splitter.heading_levels(), &[2]);
    }

    #[test]
    fn rejoined_chunks_reproduce_the_input() {
        // Chunks concatenate back to the original — splitter is
        // lossless. Critical for downstream consumers that need
        // round-trip equality (e.g. replay, audit reconstruction).
        let text = "# A\nbody A.\n\n## B\nbody B.\n\n### C\nbody C.\nfinal.\n";
        let chunks = MarkdownStructureSplitter::new().split(&doc(text));
        let joined: String = chunks.iter().map(|c| c.content.as_str()).collect();
        assert_eq!(joined, text);
    }

    #[test]
    fn child_id_carries_chunk_index_suffix() {
        // Child ids are expected to take the form `<parent id>:<chunk index>`.
        let text = "# A\nbody.\n# B\nbody.\n";
        let chunks = MarkdownStructureSplitter::new().split(&doc(text));
        for (idx, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.id.as_str(), format!("doc:{idx}"));
        }
    }

    #[test]
    fn heading_regex_round_trips_levels_1_through_6() {
        // Test-time check that the heading regex compiles and
        // matches every ATX heading depth. Calling `heading_regex()`
        // exercises the `expect` inside it, and the loop validates
        // the level extraction from capture group 1.
        let cases = [
            ("# h1", 1),
            ("## h2", 2),
            ("### h3", 3),
            ("#### h4", 4),
            ("##### h5", 5),
            ("###### h6", 6),
        ];
        for (line, expected_level) in cases {
            let captures = heading_regex().captures(line).unwrap();
            // At most six `#` characters are captured, so the cast
            // cannot truncate.
            #[allow(clippy::cast_possible_truncation)]
            let level = captures.get(1).unwrap().as_str().len() as u8;
            assert_eq!(level, expected_level);
        }
        // 7 hashes does NOT match — markdown spec caps at 6.
        assert!(heading_regex().captures("####### too deep").is_none());
    }
}
entelix_rag/splitter/markdown.rs

entelix_rag/splitter/
markdown.rs