pub mod splitter;
pub mod tokens;
use splitter::split_paragraphs;
use tokens::approx_token_count;
use crate::kb::{
canonicalize::md::heading_path_at,
model::{ChunkStatus, KbChunk, KbLocator, LogicalSourceId, chunk_id, hamming64, simhash64},
};
/// Approximate token budget per chunk: `chunk_markdown` flushes its paragraph buffer
/// before it would exceed this count and as soon as it reaches it.
pub const DEFAULT_TARGET_TOKENS: u32 = 512;
/// Default minimum chunk size, in approximate tokens.
/// NOTE(review): not referenced by the chunking code in this file — confirm where it is consumed.
pub const DEFAULT_MIN_TOKENS: u32 = 50;
/// Largest Hamming distance between two 64-bit simhashes at which the later chunk is
/// still dropped as a near-duplicate of the earlier one.
pub const SIMHASH_DEDUP_THRESHOLD: u32 = 3;
/// Borrowed description of one document to be chunked by `chunk_markdown`.
#[derive(Debug, Clone)]
pub struct ChunkerInput<'a> {
/// Logical source identity; passed to `chunk_id` and its inner value is copied onto every chunk.
pub logical_source_id: &'a LogicalSourceId,
/// Document identifier copied verbatim onto every emitted chunk.
pub doc_id: &'a str,
/// Document version copied verbatim onto every emitted chunk.
pub doc_version: u32,
/// Markdown text that is split into paragraphs and searched for heading paths.
pub markdown_body: &'a str,
/// Selects which `KbLocator` variant is attached to each chunk.
pub default_locator_kind: LocatorKind,
}
/// Which kind of locator the chunker attaches to the chunks it emits.
#[derive(Debug, Clone, Copy)]
pub enum LocatorKind {
/// Locate a chunk by its `start`/`end` offsets (`KbLocator::Offset`).
Offset,
/// Locate a chunk by its markdown heading path (`KbLocator::MdSection`).
MdSection,
}
/// Splits a markdown body into chunks of roughly `DEFAULT_TARGET_TOKENS` approximate tokens.
///
/// Paragraphs returned by `split_paragraphs` are accumulated into a buffer, joined by a
/// blank line. The buffer is flushed into a chunk
///
/// * before appending a paragraph that would push it past the target, and
/// * right after appending, as soon as it has reached the target,
///
/// so paragraphs are never split: an oversized paragraph simply becomes its own chunk.
/// Whatever remains in the buffer after the last paragraph is flushed as a final chunk.
///
/// Before returning, chunks whose simhash lies within `SIMHASH_DEDUP_THRESHOLD` bits of an
/// earlier chunk are removed. Sequence numbers are assigned at flush time, i.e. before
/// deduplication, so the returned chunks may have gaps in `seq`.
///
/// Returns the surviving chunks in document order; an input without paragraphs yields an
/// empty vector.
pub fn chunk_markdown(input: ChunkerInput<'_>) -> Vec<KbChunk> {
    let paras = split_paragraphs(input.markdown_body);
    let mut chunks: Vec<KbChunk> = Vec::new();
    let mut buf = String::new();
    // Range covered by `buf`. Both values are only meaningful while `buf` is non-empty;
    // `buf_start` is re-assigned whenever a paragraph starts a fresh buffer.
    let mut buf_start: usize = 0;
    let mut buf_end: usize = 0;
    let mut seq = 0u32;

    for p in &paras {
        // Close the current chunk first if this paragraph would overflow the budget.
        let tentative = approx_token_count(&buf) + approx_token_count(&p.text);
        if !buf.is_empty() && tentative > DEFAULT_TARGET_TOKENS {
            flush(&mut chunks, &mut seq, &input, buf_start, buf_end, &buf);
            buf.clear();
        }

        if buf.is_empty() {
            buf_start = p.start;
        } else {
            buf.push_str("\n\n");
        }
        buf.push_str(&p.text);
        buf_end = p.end;

        // A buffer that has reached the budget is emitted immediately.
        if approx_token_count(&buf) >= DEFAULT_TARGET_TOKENS {
            flush(&mut chunks, &mut seq, &input, buf_start, buf_end, &buf);
            buf.clear();
        }
    }

    // Trailing paragraphs that never reached the budget still form a chunk.
    if !buf.is_empty() {
        flush(&mut chunks, &mut seq, &input, buf_start, buf_end, &buf);
    }

    deduplicate(&mut chunks);
    chunks
}
/// Builds one `KbChunk` from the buffered text and appends it to `out`.
///
/// * `out` – destination vector for the new chunk.
/// * `seq` – running sequence counter; its current value is stored on the chunk and the
///   counter is then advanced by one.
/// * `input` – document-level data copied onto the chunk.
/// * `start`, `end` – range recorded as the chunk's `byte_offset` (and in the `Offset` locator).
/// * `body` – chunk text; used for the chunk id, the simhash and the indexed text.
///
/// The heading path is looked up at `end`. When it is non-empty, the indexed text is the
/// path joined with `" > "`, a blank line, then `body`; otherwise it is `body` alone.
fn flush(
    out: &mut Vec<KbChunk>,
    seq: &mut u32,
    input: &ChunkerInput<'_>,
    start: usize,
    end: usize,
    body: &str,
) {
    let heading_path = heading_path_at(input.markdown_body, end);

    let indexed_text = if !heading_path.is_empty() {
        let breadcrumb = heading_path.join(" > ");
        format!("{breadcrumb}\n\n{body}")
    } else {
        body.to_owned()
    };

    let ordinal = *seq;
    let id = chunk_id(input.logical_source_id, ordinal, body);
    let simhash = simhash64(body);

    let locator = match input.default_locator_kind {
        LocatorKind::MdSection => KbLocator::MdSection {
            // Cloned because the path itself is moved into the chunk below.
            heading_path: heading_path.clone(),
        },
        LocatorKind::Offset => KbLocator::Offset { start, end },
    };

    let chunk = KbChunk {
        id,
        doc_id: input.doc_id.to_string(),
        logical_source_id: input.logical_source_id.0.clone(),
        doc_version: input.doc_version,
        seq: ordinal,
        heading_path,
        byte_offset: (start as u64, end as u64),
        indexed_text,
        // The vector and embedder id are left empty here.
        vector: vec![],
        simhash,
        locator,
        status: ChunkStatus::Active,
        source_quality: 1.0,
        embedder_id: String::new(),
    };
    out.push(chunk);

    *seq = ordinal + 1;
}
/// Removes near-duplicate chunks in place, keeping the first occurrence.
///
/// A chunk is dropped when the Hamming distance between its simhash and the simhash of
/// any previously kept chunk is at most `SIMHASH_DEDUP_THRESHOLD`. The relative order of
/// the surviving chunks is unchanged.
fn deduplicate(chunks: &mut Vec<KbChunk>) {
    let mut seen: Vec<u64> = Vec::with_capacity(chunks.len());
    chunks.retain(|chunk| {
        let fingerprint = chunk.simhash;
        let is_near_dup = seen
            .iter()
            .any(|&prior| hamming64(prior, fingerprint) <= SIMHASH_DEDUP_THRESHOLD);
        if !is_near_dup {
            // Only kept chunks take part in later comparisons.
            seen.push(fingerprint);
        }
        !is_near_dup
    });
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Logical source identity used by every test.
    fn lsid() -> LogicalSourceId {
        LogicalSourceId::for_file("hash")
    }

    /// Assembles a `ChunkerInput` that borrows the given pieces.
    fn input_for<'a>(
        source: &'a LogicalSourceId,
        doc_id: &'a str,
        doc_version: u32,
        markdown_body: &'a str,
        default_locator_kind: LocatorKind,
    ) -> ChunkerInput<'a> {
        ChunkerInput {
            logical_source_id: source,
            doc_id,
            doc_version,
            markdown_body,
            default_locator_kind,
        }
    }

    /// A body far below the token budget yields exactly one chunk.
    #[test]
    fn short_doc_one_chunk() {
        let source = lsid();
        let chunks = chunk_markdown(input_for(&source, "d1", 1, "tiny.", LocatorKind::Offset));
        assert_eq!(chunks.len(), 1);
    }

    /// Many paragraphs yield more than one chunk.
    #[test]
    fn long_doc_multi_chunks() {
        let mut md = String::from("para one.\n\n");
        md.push_str(&"para text here.\n\n".repeat(500));

        let source = lsid();
        let chunks = chunk_markdown(input_for(&source, "d1", 1, &md, LocatorKind::Offset));
        assert!(chunks.len() > 1);
    }

    /// The heading path is both stored on the chunk and prefixed to its indexed text.
    #[test]
    fn heading_path_in_indexed_text() {
        let md = "# Mengniu\n## Recipe\n100g + 100ml.";
        let source = lsid();
        let chunks = chunk_markdown(input_for(&source, "d1", 1, md, LocatorKind::MdSection));

        let first = &chunks[0];
        assert!(first.indexed_text.starts_with("Mengniu > Recipe"));
        assert_eq!(
            first.heading_path,
            vec!["Mengniu".to_owned(), "Recipe".to_owned()]
        );
    }

    /// Chunk ids stay the same when only `doc_id` and `doc_version` differ.
    #[test]
    fn idempotent_chunk_ids() {
        let md = "hello.\n\nworld.";
        let (first_source, second_source) = (lsid(), lsid());

        let a = chunk_markdown(input_for(&first_source, "d1", 1, md, LocatorKind::Offset));
        let b = chunk_markdown(input_for(&second_source, "d2", 5, md, LocatorKind::Offset));

        assert_eq!(a.len(), b.len());
        for (x, y) in a.iter().zip(b.iter()) {
            assert_eq!(
                x.id, y.id,
                "chunk_id must NOT depend on doc_id/version, only on logical_source_id"
            );
        }
    }

    /// A body made of the same sentence twice produces a single chunk.
    #[test]
    fn near_dup_dedup() {
        let sentence = "the quick brown fox jumps over the lazy dog";
        let md = format!("{sentence}\n\n{sentence}");

        let source = lsid();
        let chunks = chunk_markdown(input_for(&source, "d1", 1, &md, LocatorKind::Offset));
        assert_eq!(chunks.len(), 1, "identical chunks must be deduped");
    }

    /// Every recorded byte range lies inside the body and covers non-blank text.
    #[test]
    fn byte_offsets_in_bounds() {
        let md = "first.\n\nsecond.";
        let source = lsid();
        let chunks = chunk_markdown(input_for(&source, "d1", 1, md, LocatorKind::Offset));

        for chunk in &chunks {
            let start = chunk.byte_offset.0 as usize;
            let end = chunk.byte_offset.1 as usize;
            assert!(end <= md.len());
            assert!(start < end);
            assert!(!md[start..end].trim().is_empty());
        }
    }
}