talon_core/text/
chunker.rs1use std::sync::OnceLock;
10
11use regex::Regex;
12use sha2::{Digest, Sha256};
13use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};
14use tokenx_rs::estimate_token_count;
19
20use crate::config::ChunkerConfig;
21
22#[derive(Debug, Clone, Copy)]
23struct TokenxSizer;
24
25impl ChunkSizer for TokenxSizer {
26 fn size(&self, chunk: &str) -> usize {
27 estimate_token_count(chunk)
28 }
29}
30
/// One chunk of a note, together with the metadata needed to index and embed it.
///
/// All offsets and line numbers set by `chunk_markdown` refer to the note body
/// *after* Obsidian `%%…%%` comments have been stripped.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoteChunk {
    /// Start of the chunk. Despite the name, `chunk_markdown` stores a *byte*
    /// offset here, not a character count.
    pub char_start: usize,
    /// End of the chunk (exclusive). Also a byte offset: start plus the byte
    /// length of the untrimmed chunk.
    pub char_end: usize,
    /// Lowercase hex SHA-256 digest of [`NoteChunk::text`].
    pub chunk_hash: String,
    /// Text handed to the embedder: title, path and heading context followed by
    /// the chunk text.
    pub embedding_text: String,
    /// Headings in effect at the start of the chunk, outermost first.
    pub headings: Vec<String>,
    /// [`NoteChunk::headings`] joined with `" > "`.
    pub heading_path: String,
    /// 1-based line number on which the chunk starts.
    pub line_start: u32,
    /// 1-based line number of the chunk's end offset.
    pub line_end: u32,
    /// Chunk text with surrounding whitespace trimmed.
    pub text: String,
    /// Estimated token count of [`NoteChunk::text`].
    pub token_estimate: usize,
}
55
/// Renders a heading stack as a single breadcrumb string.
///
/// Headings are joined outermost-first with `" > "`; an empty slice yields an
/// empty string.
#[must_use]
pub fn build_heading_path(headings: &[String]) -> String {
    const SEPARATOR: &str = " > ";
    headings.join(SEPARATOR)
}
61
62#[must_use]
64pub fn build_embedding_text(title: &str, path: &str, headings: &[String], text: &str) -> String {
65 format!(
66 "Title: {}\nPath: {}\nHeadings: {}\n\n{}",
67 title,
68 path,
69 build_heading_path(headings),
70 text
71 )
72}
73
74#[must_use]
76pub fn make_chunk_hash(text: &str) -> String {
77 let mut hasher = Sha256::new();
78 hasher.update(text.as_bytes());
79 format!("{:x}", hasher.finalize())
80}
81
82#[must_use]
95pub fn chunk_markdown(
96 body: &str,
97 title: &str,
98 path: &str,
99 config: &ChunkerConfig,
100) -> Vec<NoteChunk> {
101 let cleaned = strip_obsidian_comments(body);
102
103 let chunk_config = {
104 let base = ChunkConfig::new(config.chunk_tokens).with_sizer(TokenxSizer);
105 if config.chunk_overlap > 0 && config.chunk_overlap < config.chunk_tokens {
106 base.with_overlap(config.chunk_overlap)
107 .unwrap_or_else(|_| ChunkConfig::new(config.chunk_tokens).with_sizer(TokenxSizer))
108 } else {
109 base
110 }
111 };
112
113 let splitter = MarkdownSplitter::new(chunk_config);
114
115 splitter
116 .chunk_indices(&cleaned)
117 .filter_map(|(byte_offset, raw_chunk)| {
118 let text = raw_chunk.trim().to_string();
119
120 if is_trivial_chunk(&text) {
121 return None;
122 }
123
124 let token_estimate = estimate_token_count(&text);
125 if token_estimate < config.chunk_min_tokens {
126 return None;
127 }
128
129 let headings = headings_at_byte_offset(&cleaned, byte_offset);
130 let byte_end = byte_offset + raw_chunk.len();
131
132 let line_start = byte_offset_to_line(&cleaned, byte_offset);
133 let line_end = byte_offset_to_line(&cleaned, byte_end);
134
135 Some(NoteChunk {
136 char_start: byte_offset,
137 char_end: byte_end,
138 chunk_hash: make_chunk_hash(&text),
139 embedding_text: build_embedding_text(title, path, &headings, &text),
140 heading_path: build_heading_path(&headings),
141 headings,
142 line_start,
143 line_end,
144 text,
145 token_estimate,
146 })
147 })
148 .collect()
149}
150
151fn strip_obsidian_comments(body: &str) -> String {
153 static RE: OnceLock<Regex> = OnceLock::new();
154 let re = RE.get_or_init(|| Regex::new(r"(?s)%%.*?%%").unwrap_or_else(|_| unreachable!()));
155 re.replace_all(body, "").into_owned()
156}
157
158fn headings_at_byte_offset(text: &str, byte_offset: usize) -> Vec<String> {
160 let before = &text[..floor_char_boundary(text, byte_offset)];
161 let mut headings: Vec<String> = Vec::new();
162 for line in before.lines() {
163 let level = line.bytes().take_while(|&b| b == b'#').count();
164 if level > 0 && level <= 6 {
165 let rest = &line[level..];
166 if let Some(heading_text) = rest.strip_prefix(' ') {
167 headings.truncate(level.saturating_sub(1));
168 headings.push(heading_text.trim().to_string());
169 }
170 }
171 }
172 headings
173}
174
175fn byte_offset_to_line(text: &str, byte_offset: usize) -> u32 {
177 let clamped = floor_char_boundary(text, byte_offset);
178 let newlines = text[..clamped].bytes().filter(|&b| b == b'\n').count();
179 u32::try_from(newlines)
180 .unwrap_or(u32::MAX)
181 .saturating_add(1)
182}
183
/// Returns the largest character boundary in `text` that is `<= byte_offset`.
///
/// Offsets past the end of `text` are clamped to `text.len()`.
fn floor_char_boundary(text: &str, byte_offset: usize) -> usize {
    let upper = byte_offset.min(text.len());
    // Index 0 is always a boundary, so the search cannot come up empty; the
    // fallback only satisfies the type.
    (0..=upper)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0)
}
191
192fn is_trivial_chunk(text: &str) -> bool {
198 if text.is_empty() {
199 return true;
200 }
201
202 let lines: Vec<&str> = text.lines().collect();
203
204 if lines.len() > 1 {
206 return lines.iter().all(|l| is_trivial_line(l.trim()));
207 }
208
209 let line = lines[0].trim();
210 is_trivial_line(line)
211}
212
/// Reports whether a single (already trimmed) line carries no indexable content.
///
/// A line is trivial when it is:
/// - empty,
/// - an ATX heading (`#` through `######` followed by a space),
/// - a thematic break (three or more `-`, `*` or `_`, optionally spaced),
/// - an Obsidian block id (`^` followed by alphanumerics or `-`),
/// - exactly one wikilink or embed (`[[…]]` / `![[…]]`), or
/// - exactly one inline image (`![alt](target)`).
///
/// Lines that merely start and end like a link or image but contain prose in
/// between (for example `[[A]] relates to [[B]]`) are not trivial.
fn is_trivial_line(line: &str) -> bool {
    line.is_empty()
        || is_heading_line(line)
        || is_thematic_break(line)
        || is_block_id(line)
        || is_lone_wikilink(line)
        || is_lone_image(line)
}

/// True for an ATX heading: one to six `#` characters followed by a space.
fn is_heading_line(line: &str) -> bool {
    let level = line.bytes().take_while(|&b| b == b'#').count();
    (1..=6).contains(&level) && line[level..].starts_with(' ')
}

/// True for a thematic break: at least three of the same marker (`-`, `*` or
/// `_`) with nothing else on the line except spaces or tabs.
fn is_thematic_break(line: &str) -> bool {
    let Some(marker) = line
        .chars()
        .next()
        .filter(|&c| matches!(c, '-' | '*' | '_'))
    else {
        return false;
    };

    let mut marker_count = 0usize;
    for c in line.chars() {
        if c == marker {
            marker_count += 1;
        } else if c != ' ' && c != '\t' {
            return false;
        }
    }
    marker_count >= 3
}

/// True for an Obsidian block id: `^` followed only by alphanumerics or `-`.
fn is_block_id(line: &str) -> bool {
    line.strip_prefix('^')
        .is_some_and(|id| id.chars().all(|c| c.is_alphanumeric() || c == '-'))
}

/// True when the whole line is a single `[[wikilink]]` or `![[embed]]`.
fn is_lone_wikilink(line: &str) -> bool {
    line.strip_prefix("![[")
        .or_else(|| line.strip_prefix("[["))
        .and_then(|rest| rest.strip_suffix("]]"))
        // A further `[[` or `]]` inside means there is more than one link,
        // possibly with prose between them.
        .is_some_and(|target| !target.contains("[[") && !target.contains("]]"))
}

/// True when the whole line is a single inline image, `![alt](target)`.
fn is_lone_image(line: &str) -> bool {
    let Some(inner) = line
        .strip_prefix("![")
        .and_then(|rest| rest.strip_suffix(')'))
    else {
        return false;
    };
    let Some((_alt, target)) = inner.split_once("](") else {
        return false;
    };

    // Parentheses inside the target must balance; an unmatched `)` means the
    // image ended earlier and the rest of the line is other content.
    let mut depth = 0usize;
    for c in target.chars() {
        match c {
            '(' => depth += 1,
            ')' => {
                if depth == 0 {
                    return false;
                }
                depth -= 1;
            }
            _ => {}
        }
    }
    depth == 0
}
250
251#[cfg(test)]
252#[allow(clippy::unwrap_used, clippy::expect_used)]
253mod tests;
254
255#[cfg(test)]
256mod token_tests;