talon_core/text/
chunker.rs

1//! Chunker module: semantic markdown segmentation using `text-splitter`.
2//!
3//! Body text (frontmatter already stripped by the indexer) is cleaned of
4//! Obsidian `%%...%%` comments, then split with [`MarkdownSplitter`] backed
5//! by a [`tokenx_rs`] length function.  Heading context is reconstructed
6//! from the text preceding each split point.  Trivial and sub-threshold
7//! chunks are discarded before returning.
8
9use std::sync::OnceLock;
10
11use regex::Regex;
12use sha2::{Digest, Sha256};
13use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};
14// Intentional divergence from OHS `chunker.ts:23-35` and `chunker.ts:110`:
15// `tokenx-rs` preserves Unicode-aware estimates for CJK/Hangul/Cyrillic/
16// fullwidth text, and we keep its overlap wiring instead of porting the
17// coarser OHS heuristic.
18use tokenx_rs::estimate_token_count;
19
20use crate::config::ChunkerConfig;
21
22#[derive(Debug, Clone, Copy)]
23struct TokenxSizer;
24
25impl ChunkSizer for TokenxSizer {
26    fn size(&self, chunk: &str) -> usize {
27        estimate_token_count(chunk)
28    }
29}
30
/// One embeddable slice of a note, as produced by [`chunk_markdown`].
///
/// All offsets and line numbers are measured against the text that was
/// actually split: the note body after Obsidian `%%…%%` comments have been
/// removed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoteChunk {
    /// Start of the chunk as a **byte** offset (despite the `char_` name).
    pub char_start: usize,
    /// End of the chunk as an exclusive **byte** offset; computed from the
    /// splitter's chunk before `text` is trimmed.
    pub char_end: usize,
    /// Lowercase hex SHA-256 digest of `text`.
    pub chunk_hash: String,
    /// Text handed to the embedder:
    /// `Title: …\nPath: …\nHeadings: …\n\n{text}`.
    pub embedding_text: String,
    /// Headings in effect where the chunk starts, outermost first.
    pub headings: Vec<String>,
    /// `headings` rendered as one string, separated by ` > `.
    pub heading_path: String,
    /// 1-based line number of `char_start`.
    pub line_start: u32,
    /// 1-based line number of `char_end`.
    pub line_end: u32,
    /// Chunk content with surrounding whitespace trimmed.
    pub text: String,
    /// `tokenx-rs` token estimate for `text`.
    pub token_estimate: usize,
}
55
/// Render a heading stack as a single breadcrumb string.
///
/// Entries are emitted in slice order, separated by ` > `. An empty slice
/// produces an empty string.
#[must_use]
pub fn build_heading_path(headings: &[String]) -> String {
    let mut breadcrumb = String::new();
    for (position, heading) in headings.iter().enumerate() {
        // The separator goes between entries, never before the first one.
        if position > 0 {
            breadcrumb.push_str(" > ");
        }
        breadcrumb.push_str(heading);
    }
    breadcrumb
}
61
62/// Build prefixed embedding text.
63#[must_use]
64pub fn build_embedding_text(title: &str, path: &str, headings: &[String], text: &str) -> String {
65    format!(
66        "Title: {}\nPath: {}\nHeadings: {}\n\n{}",
67        title,
68        path,
69        build_heading_path(headings),
70        text
71    )
72}
73
74/// SHA-256 hash of raw text.
75#[must_use]
76pub fn make_chunk_hash(text: &str) -> String {
77    let mut hasher = Sha256::new();
78    hasher.update(text.as_bytes());
79    format!("{:x}", hasher.finalize())
80}
81
82/// Chunk a note body (frontmatter already stripped) into [`NoteChunk`]s.
83///
84/// The caller is responsible for passing the body-only text — frontmatter
85/// must be stripped before calling this function so YAML keys/values never
86/// appear in chunk text or embedding text.
87///
88/// Pipeline:
89/// 1. Strip Obsidian `%%…%%` comments.
90/// 2. Split with `MarkdownSplitter` using a `tokenx-rs` token-count sizer.
91/// 3. Reconstruct heading context from the text before each split point.
92/// 4. Drop trivial chunks (heading-only, separator, single wikilink/embed).
93/// 5. Drop chunks below `config.chunk_min_tokens`.
94#[must_use]
95pub fn chunk_markdown(
96    body: &str,
97    title: &str,
98    path: &str,
99    config: &ChunkerConfig,
100) -> Vec<NoteChunk> {
101    let cleaned = strip_obsidian_comments(body);
102
103    let chunk_config = {
104        let base = ChunkConfig::new(config.chunk_tokens).with_sizer(TokenxSizer);
105        if config.chunk_overlap > 0 && config.chunk_overlap < config.chunk_tokens {
106            base.with_overlap(config.chunk_overlap)
107                .unwrap_or_else(|_| ChunkConfig::new(config.chunk_tokens).with_sizer(TokenxSizer))
108        } else {
109            base
110        }
111    };
112
113    let splitter = MarkdownSplitter::new(chunk_config);
114
115    splitter
116        .chunk_indices(&cleaned)
117        .filter_map(|(byte_offset, raw_chunk)| {
118            let text = raw_chunk.trim().to_string();
119
120            if is_trivial_chunk(&text) {
121                return None;
122            }
123
124            let token_estimate = estimate_token_count(&text);
125            if token_estimate < config.chunk_min_tokens {
126                return None;
127            }
128
129            let headings = headings_at_byte_offset(&cleaned, byte_offset);
130            let byte_end = byte_offset + raw_chunk.len();
131
132            let line_start = byte_offset_to_line(&cleaned, byte_offset);
133            let line_end = byte_offset_to_line(&cleaned, byte_end);
134
135            Some(NoteChunk {
136                char_start: byte_offset,
137                char_end: byte_end,
138                chunk_hash: make_chunk_hash(&text),
139                embedding_text: build_embedding_text(title, path, &headings, &text),
140                heading_path: build_heading_path(&headings),
141                headings,
142                line_start,
143                line_end,
144                text,
145                token_estimate,
146            })
147        })
148        .collect()
149}
150
151/// Strip Obsidian `%%inline%%` and `%%\nblock\n%%` comments.
152fn strip_obsidian_comments(body: &str) -> String {
153    static RE: OnceLock<Regex> = OnceLock::new();
154    let re = RE.get_or_init(|| Regex::new(r"(?s)%%.*?%%").unwrap_or_else(|_| unreachable!()));
155    re.replace_all(body, "").into_owned()
156}
157
158/// Walk the text up to `byte_offset` and return the active heading stack.
159fn headings_at_byte_offset(text: &str, byte_offset: usize) -> Vec<String> {
160    let before = &text[..floor_char_boundary(text, byte_offset)];
161    let mut headings: Vec<String> = Vec::new();
162    for line in before.lines() {
163        let level = line.bytes().take_while(|&b| b == b'#').count();
164        if level > 0 && level <= 6 {
165            let rest = &line[level..];
166            if let Some(heading_text) = rest.strip_prefix(' ') {
167                headings.truncate(level.saturating_sub(1));
168                headings.push(heading_text.trim().to_string());
169            }
170        }
171    }
172    headings
173}
174
175/// Return the 1-based line number for a byte offset within `text`.
176fn byte_offset_to_line(text: &str, byte_offset: usize) -> u32 {
177    let clamped = floor_char_boundary(text, byte_offset);
178    let newlines = text[..clamped].bytes().filter(|&b| b == b'\n').count();
179    u32::try_from(newlines)
180        .unwrap_or(u32::MAX)
181        .saturating_add(1)
182}
183
/// Clamp `byte_offset` to `text.len()` and round it down to the nearest
/// UTF-8 character boundary, so the result is always safe to slice at.
fn floor_char_boundary(text: &str, byte_offset: usize) -> usize {
    let upper = byte_offset.min(text.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=upper)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
191
192/// Return `true` for chunks that carry no meaningful content:
193/// - heading-only lines (`# …`)
194/// - horizontal separators (`---`, `***`, `___`)
195/// - a bare block ID (`^word`)
196/// - a single wikilink `[[…]]` or image embed `![[…]]`
197fn is_trivial_chunk(text: &str) -> bool {
198    if text.is_empty() {
199        return true;
200    }
201
202    let lines: Vec<&str> = text.lines().collect();
203
204    // Multi-line chunks: only trivial if every line is trivial
205    if lines.len() > 1 {
206        return lines.iter().all(|l| is_trivial_line(l.trim()));
207    }
208
209    let line = lines[0].trim();
210    is_trivial_line(line)
211}
212
/// Return `true` when a single, already-trimmed line carries no content
/// worth embedding:
/// - an empty line
/// - an ATX heading (`#` … `######` followed by a space)
/// - a thematic break: three or more of the same `-`, `*` or `_`, optionally
///   separated by spaces or tabs
/// - a bare block ID (`^word-or-hyphen`)
/// - a line made up solely of wikilinks `[[…]]`, embeds `![[…]]` and/or
///   Markdown images `![alt](target)`
///
/// A line that merely starts and ends with a link or image but has prose in
/// between (`[[A]] depends on [[B]]`) is *not* trivial.
fn is_trivial_line(line: &str) -> bool {
    line.is_empty()
        || is_atx_heading(line)
        || is_thematic_break(line)
        || is_block_id(line)
        || is_only_links_or_images(line)
}

/// `#` through `######` immediately followed by a space.
fn is_atx_heading(line: &str) -> bool {
    let level = line.bytes().take_while(|&b| b == b'#').count();
    (1..=6).contains(&level) && line[level..].starts_with(' ')
}

/// Three or more identical `-`, `*` or `_` markers with only spaces or tabs
/// between them.
fn is_thematic_break(line: &str) -> bool {
    let marker = match line.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };
    let mut markers = 0usize;
    for c in line.chars() {
        if c == marker {
            markers += 1;
        } else if c != ' ' && c != '\t' {
            return false;
        }
    }
    markers >= 3
}

/// A `^` followed only by alphanumerics and hyphens.
fn is_block_id(line: &str) -> bool {
    line.strip_prefix('^')
        .is_some_and(|id| id.chars().all(|c| c.is_alphanumeric() || c == '-'))
}

/// One or more wikilinks, embeds or Markdown images separated only by
/// whitespace, with nothing else on the line.
fn is_only_links_or_images(line: &str) -> bool {
    let mut rest = line.trim_start();
    if rest.is_empty() {
        return false;
    }
    while !rest.is_empty() {
        match strip_wikilink(rest).or_else(|| strip_markdown_image(rest)) {
            Some(after) => rest = after.trim_start(),
            None => return false,
        }
    }
    true
}

/// Consume a leading `[[…]]` or `![[…]]`, returning what follows it.
fn strip_wikilink(s: &str) -> Option<&str> {
    let body = s.strip_prefix('!').unwrap_or(s).strip_prefix("[[")?;
    let (target, rest) = body.split_once("]]")?;
    // A nested opener means the first `[[` was never properly closed.
    (!target.contains("[[")).then_some(rest)
}

/// Consume a leading `![alt](target)`, returning what follows it.
///
/// Parentheses inside the target may be balanced (as in many wiki URLs); the
/// image ends at the first unbalanced `)`.
fn strip_markdown_image(s: &str) -> Option<&str> {
    let (alt, tail) = s.strip_prefix("![")?.split_once("](")?;
    if alt.contains(']') {
        return None;
    }
    let mut depth = 0usize;
    for (idx, c) in tail.char_indices() {
        match c {
            '(' => depth += 1,
            ')' if depth == 0 => return Some(&tail[idx + 1..]),
            ')' => depth -= 1,
            _ => {}
        }
    }
    None
}
250
251#[cfg(test)]
252#[allow(clippy::unwrap_used, clippy::expect_used)]
253mod tests;
254
255#[cfg(test)]
256mod token_tests;
talon_core/text/chunker.rs

talon_core/text/
chunker.rs