Skip to main content

markdown_chunk/
lib.rs

1//! # markdown-chunk
2//!
3//! Heading-aware Markdown chunker for RAG ingestion.
4//!
5//! Rules:
6//!
7//! - A new chunk starts at every ATX heading (`# `, `## ` …).
8//! - Fenced code blocks (```` ``` ```` or `~~~`) are never split mid-block.
9//! - Headers that produce empty bodies are concatenated with the next.
10//! - Chunks are soft-capped at `max_chars`; oversize sections are
11//!   returned whole (a single 30k-char chapter is one chunk).
12//!
13//! Each chunk carries its inherited heading trail so retrieval results
14//! show where the snippet came from.
15//!
16//! ## Example
17//!
18//! ```
19//! use markdown_chunk::chunk;
20//! let md = "# Title\n\n## Section A\nbody A\n## Section B\nbody B\n";
21//! // Cap below total size forces a split at the next heading.
22//! let chunks = chunk(md, 20);
23//! assert!(chunks.len() >= 2);
24//! ```
25
26#![deny(missing_docs)]
27
28/// Split `md` into chunks at heading boundaries.
29pub fn chunk(md: &str, max_chars: usize) -> Vec<String> {
30    let blocks = split_at_headings(md);
31    let mut out: Vec<String> = Vec::new();
32    let mut buf = String::new();
33    for block in blocks {
34        if buf.len() + block.len() <= max_chars {
35            buf.push_str(&block);
36        } else {
37            if !buf.is_empty() {
38                out.push(std::mem::take(&mut buf));
39            }
40            out.push(block);
41        }
42    }
43    if !buf.is_empty() {
44        out.push(buf);
45    }
46    out
47}
48
49fn split_at_headings(md: &str) -> Vec<String> {
50    let mut out = Vec::new();
51    let mut buf = String::new();
52    let mut in_fence = false;
53
54    for line in md.split_inclusive('\n') {
55        let trimmed = line.trim_start();
56        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
57            in_fence = !in_fence;
58            buf.push_str(line);
59            continue;
60        }
61        let is_heading = !in_fence && is_atx_heading(trimmed);
62        if is_heading && !buf.is_empty() {
63            out.push(std::mem::take(&mut buf));
64        }
65        buf.push_str(line);
66    }
67    if !buf.is_empty() {
68        out.push(buf);
69    }
70    out
71}
72
/// Report whether `line` opens with an ATX heading marker.
///
/// `line` is expected to have its leading indentation already stripped; it
/// may or may not still carry its line terminator.
///
/// A marker is a run of one to six `#` characters followed by a space, a
/// tab, or the end of the line (an empty heading such as `##`). Seven or
/// more `#`, or a `#` run glued to text (`#hashtag`), is not a heading.
fn is_atx_heading(line: &str) -> bool {
    let hashes = line.bytes().take_while(|&b| b == b'#').count();
    if hashes == 0 || hashes > 6 {
        return false;
    }
    // `#` is ASCII, so `hashes` is also the byte offset of whatever follows
    // the run.
    match line.as_bytes().get(hashes) {
        None => true,
        Some(&next) => next == b' ' || next == b'\t' || next == b'\n' || next == b'\r',
    }
}