use std::sync::OnceLock;
use regex::Regex;
use sha2::{Digest, Sha256};
use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};
use tokenx_rs::estimate_token_count;
use crate::config::ChunkerConfig;
/// Chunk sizer that measures text with `tokenx_rs::estimate_token_count`
/// so the splitter's capacity is expressed in estimated tokens.
#[derive(Debug, Clone, Copy)]
struct TokenxSizer;

impl ChunkSizer for TokenxSizer {
    /// Returns the estimated token count of `text`.
    fn size(&self, text: &str) -> usize {
        estimate_token_count(text)
    }
}
/// One indexable chunk of a note.
///
/// The field descriptions below reflect how `chunk_markdown` in this file
/// populates the struct; other constructors (if any) may differ.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoteChunk {
/// Start offset of the chunk as reported by the splitter. Despite the name,
/// `chunk_markdown` stores a byte offset into the comment-stripped body here.
pub char_start: usize,
/// End offset (exclusive): `char_start` plus the untrimmed chunk length in bytes.
pub char_end: usize,
/// Lowercase hex SHA-256 of `text` (see `make_chunk_hash`).
pub chunk_hash: String,
/// `text` prefixed with title/path/headings context (see `build_embedding_text`).
pub embedding_text: String,
/// Headings found before the chunk start, outermost first.
pub headings: Vec<String>,
/// `headings` joined with `" > "` (see `build_heading_path`).
pub heading_path: String,
/// 1-based line number of `char_start` within the comment-stripped body.
pub line_start: u32,
/// 1-based line number of `char_end` within the comment-stripped body.
pub line_end: u32,
/// Chunk content with surrounding whitespace trimmed.
pub text: String,
/// Estimated token count of `text`.
pub token_estimate: usize,
}
/// Renders a heading hierarchy as a single breadcrumb string.
///
/// Headings are emitted in order, separated by `" > "`. An empty slice
/// produces an empty string.
#[must_use]
pub fn build_heading_path(headings: &[String]) -> String {
    let mut breadcrumb = String::new();
    for (position, heading) in headings.iter().enumerate() {
        if position > 0 {
            breadcrumb.push_str(" > ");
        }
        breadcrumb.push_str(heading);
    }
    breadcrumb
}
/// Builds the text handed to the embedding model for one chunk.
///
/// The result is a three-line header (`Title:`, `Path:`, `Headings:`), a
/// blank line, and then `text` verbatim. The headings line is the breadcrumb
/// produced by [`build_heading_path`].
#[must_use]
pub fn build_embedding_text(title: &str, path: &str, headings: &[String], text: &str) -> String {
    let heading_path = build_heading_path(headings);
    format!("Title: {title}\nPath: {path}\nHeadings: {heading_path}\n\n{text}")
}
/// Returns the SHA-256 digest of `text` (its UTF-8 bytes) as lowercase hex.
#[must_use]
pub fn make_chunk_hash(text: &str) -> String {
    let digest = Sha256::digest(text.as_bytes());
    format!("{digest:x}")
}
/// Splits a markdown note body into [`NoteChunk`]s.
///
/// Obsidian `%%…%%` comments are removed first, then the remaining text is
/// split with a [`MarkdownSplitter`] sized by [`TokenxSizer`]. Chunks that are
/// trivial (see `is_trivial_chunk`) or whose estimated token count is below
/// `config.chunk_min_tokens` are dropped.
///
/// All offsets and line numbers in the returned chunks refer to the
/// comment-stripped text, not to the original `body`.
///
/// * `body` – raw markdown of the note.
/// * `title`, `path` – only used to build each chunk's `embedding_text`.
/// * `config` – chunk capacity, overlap and minimum size, all in estimated tokens.
#[must_use]
pub fn chunk_markdown(
    body: &str,
    title: &str,
    path: &str,
    config: &ChunkerConfig,
) -> Vec<NoteChunk> {
    let cleaned = strip_obsidian_comments(body);

    // Overlap is only applied when it is positive and strictly smaller than
    // the capacity; if the splitter still rejects it, fall back to no overlap.
    let plain_config = || ChunkConfig::new(config.chunk_tokens).with_sizer(TokenxSizer);
    let overlap_applies =
        config.chunk_overlap > 0 && config.chunk_overlap < config.chunk_tokens;
    let chunk_config = if overlap_applies {
        plain_config()
            .with_overlap(config.chunk_overlap)
            .unwrap_or_else(|_| plain_config())
    } else {
        plain_config()
    };

    let splitter = MarkdownSplitter::new(chunk_config);
    let mut chunks = Vec::new();
    for (start, piece) in splitter.chunk_indices(&cleaned) {
        let text = piece.trim().to_string();
        if is_trivial_chunk(&text) {
            continue;
        }
        let token_estimate = estimate_token_count(&text);
        if token_estimate < config.chunk_min_tokens {
            continue;
        }

        // The end offset uses the untrimmed piece so it stays aligned with `start`.
        let end = start + piece.len();
        let headings = headings_at_byte_offset(&cleaned, start);
        let heading_path = build_heading_path(&headings);
        let embedding_text = build_embedding_text(title, path, &headings, &text);
        let chunk_hash = make_chunk_hash(&text);

        chunks.push(NoteChunk {
            char_start: start,
            char_end: end,
            chunk_hash,
            embedding_text,
            headings,
            heading_path,
            line_start: byte_offset_to_line(&cleaned, start),
            line_end: byte_offset_to_line(&cleaned, end),
            text,
            token_estimate,
        });
    }
    chunks
}
/// Removes every `%%…%%` span from `body`.
///
/// Matching is non-greedy and spans newlines (`(?s)`), so both inline and
/// multi-line comments are removed. An unpaired `%%` is left untouched.
fn strip_obsidian_comments(body: &str) -> String {
    static COMMENT_PATTERN: OnceLock<Regex> = OnceLock::new();
    // The pattern is a constant, so compilation cannot fail at runtime.
    let pattern = COMMENT_PATTERN.get_or_init(|| match Regex::new(r"(?s)%%.*?%%") {
        Ok(compiled) => compiled,
        Err(_) => unreachable!(),
    });
    String::from(pattern.replace_all(body, ""))
}
/// Returns the heading hierarchy in effect at `byte_offset`, outermost first.
///
/// Only the text before the offset is scanned (the offset is clamped to the
/// text length and rounded down to a char boundary). A heading is a line that
/// starts with one to six `#` followed by a space.
///
/// Each new heading closes every open heading of the same or a deeper level,
/// so siblings replace each other even when the document skips levels or
/// starts below level one (`## A`, `## B` yields `["B"]`, not `["A", "B"]`).
/// Lines inside fenced code blocks (``` or ~~~) are ignored, so `# comment`
/// lines in code samples are not mistaken for headings.
fn headings_at_byte_offset(text: &str, byte_offset: usize) -> Vec<String> {
    // Recognises a code-fence line: up to three spaces of indent followed by a
    // run of at least three identical '`' or '~'. Returns the marker byte, the
    // run length, and whatever follows the run.
    fn fence_run(line: &str) -> Option<(u8, usize, &str)> {
        let indent = line.bytes().take_while(|&b| b == b' ').count();
        if indent > 3 {
            return None;
        }
        let body = &line[indent..];
        let marker = body.bytes().next().filter(|&b| b == b'`' || b == b'~')?;
        let run = body.bytes().take_while(|&b| b == marker).count();
        (run >= 3).then(|| (marker, run, &body[run..]))
    }

    let before = &text[..floor_char_boundary(text, byte_offset)];
    // Open headings as (level, title); levels are strictly increasing.
    let mut open_headings: Vec<(usize, String)> = Vec::new();
    let mut open_fence: Option<(u8, usize)> = None;

    for line in before.lines() {
        if let Some((marker, run, tail)) = fence_run(line) {
            match open_fence {
                Some((open_marker, open_run)) => {
                    // A fence closes only on the same marker, at least as
                    // long as the opener, with nothing but whitespace after.
                    if marker == open_marker && run >= open_run && tail.trim().is_empty() {
                        open_fence = None;
                    }
                    continue;
                }
                None => {
                    // A backtick fence's info string may not contain backticks;
                    // such a line is inline code, not a fence opener.
                    if marker != b'`' || !tail.contains('`') {
                        open_fence = Some((marker, run));
                        continue;
                    }
                }
            }
        }
        if open_fence.is_some() {
            continue;
        }

        let level = line.bytes().take_while(|&b| b == b'#').count();
        if !(1..=6).contains(&level) {
            continue;
        }
        let Some(title) = line[level..].strip_prefix(' ') else {
            continue;
        };

        // Close every heading that is not a strict ancestor of the new one.
        while open_headings
            .last()
            .is_some_and(|(open_level, _)| *open_level >= level)
        {
            open_headings.pop();
        }
        open_headings.push((level, title.trim().to_string()));
    }

    open_headings.into_iter().map(|(_, title)| title).collect()
}
/// Converts a byte offset into a 1-based line number.
///
/// The offset is clamped to the text and rounded down to a char boundary; the
/// line number is one more than the number of `\n` bytes before that point,
/// saturating at `u32::MAX`.
fn byte_offset_to_line(text: &str, byte_offset: usize) -> u32 {
    let end = floor_char_boundary(text, byte_offset);
    let preceding_newlines = text.as_bytes()[..end]
        .iter()
        .filter(|&&byte| byte == b'\n')
        .count();
    match u32::try_from(preceding_newlines) {
        Ok(count) => count.saturating_add(1),
        Err(_) => u32::MAX,
    }
}
/// Returns the largest char boundary in `text` that is `<= byte_offset`.
///
/// Offsets past the end are clamped to `text.len()`.
fn floor_char_boundary(text: &str, byte_offset: usize) -> usize {
    let upper = byte_offset.min(text.len());
    // Index 0 is always a boundary, so the search cannot come up empty.
    (0..=upper)
        .rev()
        .find(|&index| text.is_char_boundary(index))
        .unwrap_or(0)
}
fn is_trivial_chunk(text: &str) -> bool {
if text.is_empty() {
return true;
}
let lines: Vec<&str> = text.lines().collect();
if lines.len() > 1 {
return lines.iter().all(|l| is_trivial_line(l.trim()));
}
let line = lines[0].trim();
is_trivial_line(line)
}
/// Reports whether a single (already trimmed) line carries no indexable prose.
///
/// Trivial lines are:
/// * empty lines;
/// * ATX headings: one to six `#` followed by a space, or by nothing at all
///   (an empty heading such as `# ` arrives here as `#` once trimmed);
/// * thematic breaks: three or more of the same `-`, `*` or `_`, optionally
///   separated by spaces or tabs (`---`, `* * *`, `-----`);
/// * Obsidian block ids: `^` followed only by alphanumerics or `-`;
/// * lines made up solely of wikilinks (`[[…]]`), embeds (`![[…]]`) and
///   markdown images (`![alt](target)`), separated by whitespace.
///
/// A line that merely starts and ends with link syntax but has prose in
/// between (`[[A]] relates to [[B]]`) is not trivial.
fn is_trivial_line(line: &str) -> bool {
    // One to six '#' followed by a space or by the end of the line.
    fn is_atx_heading(line: &str) -> bool {
        let level = line.bytes().take_while(|&b| b == b'#').count();
        (1..=6).contains(&level) && (line.len() == level || line[level..].starts_with(' '))
    }

    // At least three identical '-', '*' or '_', ignoring spaces and tabs.
    fn is_thematic_break(line: &str) -> bool {
        let mut kind: Option<char> = None;
        let mut count = 0usize;
        for c in line.chars() {
            match c {
                ' ' | '\t' => {}
                '-' | '*' | '_' if kind.map_or(true, |k| k == c) => {
                    kind = Some(c);
                    count += 1;
                }
                _ => return false,
            }
        }
        count >= 3
    }

    // Strips one leading wikilink, embed or markdown image and returns what
    // follows it, or `None` when `rest` does not start with a complete one.
    fn strip_leading_link(rest: &str) -> Option<&str> {
        if let Some(inner) = rest.strip_prefix("![[").or_else(|| rest.strip_prefix("[[")) {
            let close = inner.find("]]")?;
            return Some(&inner[close + 2..]);
        }
        let after_bang = rest.strip_prefix("![")?;
        let target = &after_bang[after_bang.find("](")? + 2..];
        // Balance parentheses so targets like `a_(1).png` stay in one piece.
        let mut depth = 1usize;
        for (idx, byte) in target.bytes().enumerate() {
            match byte {
                b'(' => depth += 1,
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(&target[idx + 1..]);
                    }
                }
                _ => {}
            }
        }
        None
    }

    if line.is_empty() || is_atx_heading(line) || is_thematic_break(line) {
        return true;
    }
    if line
        .strip_prefix('^')
        .is_some_and(|id| id.chars().all(|c| c.is_alphanumeric() || c == '-'))
    {
        return true;
    }

    // Consume links one by one; the line is trivial only if nothing is left.
    let mut rest = line;
    while let Some(after) = strip_leading_link(rest) {
        rest = after.trim_start();
        if rest.is_empty() {
            return true;
        }
    }
    false
}
// Unit tests live in sibling module files and are only compiled for test builds.
// The `tests` module relaxes the clippy `unwrap_used` / `expect_used` lints,
// presumably because unwrapping is acceptable inside test code.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests;
#[cfg(test)]
mod token_tests;