use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use crate::kb::model::{KbLocator, LogicalSourceId};
/// Derives a deterministic identifier for one chunk of a logical source.
///
/// The identifier is the first 32 lowercase hex characters (128 bits) of a
/// SHA-256 digest computed over, in order:
///
/// 1. the bytes of `lsid.as_str()`,
/// 2. a single NUL byte,
/// 3. `seq` as four big-endian bytes,
/// 4. a single NUL byte,
/// 5. the UTF-8 bytes of `content`.
///
/// The NUL bytes delimit the fields so that moving bytes from one field into
/// a neighbouring one changes the hashed stream.
///
/// The same `(lsid, seq, content)` triple always yields the same string.
pub fn chunk_id(lsid: &LogicalSourceId, seq: u32, content: &str) -> String {
    use std::fmt::Write;

    /// Number of hex characters kept from the digest.
    const ID_HEX_LEN: usize = 32;
    /// Byte fed to the hasher between adjacent fields.
    const FIELD_SEP: [u8; 1] = [0u8];

    let mut hasher = Sha256::new();
    hasher.update(lsid.as_str().as_bytes());
    hasher.update(FIELD_SEP);
    hasher.update(seq.to_be_bytes());
    hasher.update(FIELD_SEP);
    hasher.update(content.as_bytes());
    let digest = hasher.finalize();

    // Each byte renders as two hex characters, so only the leading
    // `ID_HEX_LEN / 2` digest bytes contribute to the identifier.
    let mut id = String::with_capacity(ID_HEX_LEN);
    for byte in digest.iter().take(ID_HEX_LEN / 2) {
        // Formatting into a `String` cannot fail; the result is ignored on purpose.
        let _ = write!(id, "{byte:02x}");
    }
    id
}
/// Lifecycle marker stored on every [`KbChunk`].
///
/// Serialized with snake_case variant names (`"active"`, `"tombstoned"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ChunkStatus {
    /// The chunk is live.
    Active,
    /// The chunk has been tombstoned.
    // NOTE(review): presumably a soft-delete marker that keeps the record
    // around while excluding it from results; confirm against the store/query code.
    Tombstoned,
}
/// One chunk record of a knowledge-base document, in its serializable form.
///
/// Field order is significant for the serialized representation and must be
/// kept as declared.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct KbChunk {
    /// Chunk identifier.
    // NOTE(review): expected to be the output of `chunk_id`; confirm at the construction sites.
    pub id: String,
    /// Identifier of the document this chunk belongs to.
    pub doc_id: String,
    /// String form of the logical source identifier.
    pub logical_source_id: String,
    /// Version number of the owning document.
    pub doc_version: u32,
    /// Sequence number of the chunk.
    // NOTE(review): presumably the chunk's position within its document; confirm.
    pub seq: u32,
    /// Heading path associated with the chunk.
    // NOTE(review): presumably outermost heading first; confirm.
    pub heading_path: Vec<String>,
    /// Pair of byte offsets for the chunk.
    // NOTE(review): presumably `(start, end)` into the source body; whether
    // `end` is exclusive is not visible here. Confirm.
    pub byte_offset: (u64, u64),
    /// Text that is indexed for this chunk.
    pub indexed_text: String,
    /// Vector of `f32` values attached to the chunk.
    // NOTE(review): presumably the embedding produced by `embedder_id`; confirm.
    pub vector: Vec<f32>,
    /// 64-bit simhash value stored for the chunk.
    pub simhash: u64,
    /// Locator describing where the chunk lives in its source.
    pub locator: KbLocator,
    /// Lifecycle status of the chunk.
    pub status: ChunkStatus,
    /// Quality score of the source.
    // NOTE(review): valid range is not visible here; confirm.
    pub source_quality: f32,
    /// Identifier of the embedder associated with this chunk.
    pub embedder_id: String,
}
#[cfg(test)]
mod tests {
    //! Unit tests for `chunk_id` and the serde round-trip of `KbChunk`.

    use super::*;

    /// Same inputs give the same 32-character hex identifier.
    #[test]
    fn chunk_id_deterministic() {
        let lsid = LogicalSourceId::for_file("abc");
        let a = chunk_id(&lsid, 0, "hello world");
        let b = chunk_id(&lsid, 0, "hello world");
        assert_eq!(a, b);
        assert_eq!(a.len(), 32);
        assert!(a.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// Changing only `seq` changes the identifier.
    #[test]
    fn chunk_id_varies_with_seq() {
        let lsid = LogicalSourceId::for_file("abc");
        assert_ne!(chunk_id(&lsid, 0, "x"), chunk_id(&lsid, 1, "x"));
    }

    /// Changing only `content` changes the identifier.
    #[test]
    fn chunk_id_varies_with_content() {
        let lsid = LogicalSourceId::for_file("abc");
        assert_ne!(chunk_id(&lsid, 0, "hello"), chunk_id(&lsid, 0, "world"));
    }

    /// Changing only the logical source changes the identifier.
    #[test]
    fn chunk_id_varies_with_logical_source() {
        let l1 = LogicalSourceId::for_file("abc");
        let l2 = LogicalSourceId::for_file("def");
        assert_ne!(chunk_id(&l1, 0, "x"), chunk_id(&l2, 0, "x"));
    }

    /// Re-ingesting identical content under the same source keeps the id stable.
    #[test]
    fn reingest_same_file_same_chunk_ids() {
        let lsid = LogicalSourceId::for_file("hash_of_file_contents");
        let body = "the actual canonical markdown body";
        let id_first = chunk_id(&lsid, 0, body);
        let id_second = chunk_id(&lsid, 0, body);
        assert_eq!(
            id_first, id_second,
            "re-ingest must produce same chunk_id (idempotency invariant)"
        );
    }

    /// Field separators keep bytes from migrating between adjacent fields.
    #[test]
    fn null_separators_avoid_field_collisions() {
        // Sanity pair: text shifted across the lsid/seq/content boundaries.
        let l1 = LogicalSourceId("ns1".into());
        let l2 = LogicalSourceId("ns".into());
        assert_ne!(chunk_id(&l1, 1, "d"), chunk_id(&l2, 12, "cd"));

        // `seq` is hashed as four big-endian bytes, so the pair above would
        // differ even without separators. The pair below is built so that the
        // *unseparated* byte streams are identical:
        //
        //   "a"  | 62 00 00 00 | 01 'x' 'y' 'z'
        //   "ab" | 00 00 00 01 |    'x' 'y' 'z'
        //
        // Both flatten to 61 62 00 00 00 01 78 79 7a, so only the separators
        // keep these two ids apart.
        let short = LogicalSourceId("a".into());
        let long = LogicalSourceId("ab".into());
        assert_ne!(
            chunk_id(&short, 0x6200_0000, "\u{1}xyz"),
            chunk_id(&long, 1, "xyz"),
            "inputs with identical unseparated byte streams must not collide"
        );
    }

    /// A fully populated `KbChunk` survives a JSON round-trip unchanged.
    #[test]
    fn struct_serde_roundtrip() {
        let lsid = LogicalSourceId::for_file("abc");
        let c = KbChunk {
            id: chunk_id(&lsid, 0, "hi"),
            doc_id: "doc1".into(),
            logical_source_id: lsid.0.clone(),
            doc_version: 1,
            seq: 0,
            heading_path: vec!["A".into()],
            byte_offset: (0, 2),
            indexed_text: "A\n\nhi".into(),
            vector: vec![],
            simhash: 0,
            locator: KbLocator::Offset { start: 0, end: 2 },
            status: ChunkStatus::Active,
            source_quality: 1.0,
            embedder_id: String::new(),
        };
        let s = serde_json::to_string(&c).unwrap();
        let back: KbChunk = serde_json::from_str(&s).unwrap();
        assert_eq!(c, back);
    }
}