use serde::{Deserialize, Serialize};
use std::time::{SystemTime, UNIX_EPOCH};
/// Snapshot of the inputs that determined how a document was indexed, plus
/// the time the snapshot was taken.
///
/// The derived `PartialEq` compares every field, including `indexed_at`; use
/// `needs_reindex` to compare only the three digests.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentFingerprint {
/// Digest of the raw document bytes, as returned by `blake3_hash`.
pub content_hash: [u8; 32],
/// Digest of the chunker settings, as returned by `ChunkerConfig::hash`.
pub chunker_config_hash: [u8; 32],
/// Caller-supplied digest identifying the embedding model; stored verbatim.
pub embedding_model_hash: [u8; 32],
/// Creation time in milliseconds since the Unix epoch (0 if the system
/// clock reported a time before the epoch).
pub indexed_at: u64,
}
impl DocumentFingerprint {
    /// Fingerprints `content` under the given chunker settings and embedding
    /// model digest, stamping the result with the current wall-clock time.
    ///
    /// `model_hash` is stored as-is; the other two digests are computed here.
    pub fn new(content: &[u8], chunker_config: &ChunkerConfig, model_hash: [u8; 32]) -> Self {
        let content_hash = blake3_hash(content);
        let chunker_config_hash = chunker_config.hash();
        let indexed_at = current_timestamp_ms();
        Self {
            content_hash,
            chunker_config_hash,
            embedding_model_hash: model_hash,
            indexed_at,
        }
    }

    /// Returns `true` when any of the three digests differs between `self`
    /// and `current`.
    ///
    /// `indexed_at` is deliberately left out of the comparison, so two
    /// fingerprints taken at different times over identical inputs match.
    pub fn needs_reindex(&self, current: &Self) -> bool {
        let unchanged = self.content_hash == current.content_hash
            && self.chunker_config_hash == current.chunker_config_hash
            && self.embedding_model_hash == current.embedding_model_hash;
        !unchanged
    }

    /// Whole seconds elapsed since `indexed_at`, rounded down.
    ///
    /// Yields 0 when `indexed_at` lies in the future relative to the current
    /// clock reading (the subtraction saturates instead of wrapping).
    pub fn age_seconds(&self) -> u64 {
        let elapsed_ms = current_timestamp_ms().saturating_sub(self.indexed_at);
        elapsed_ms / 1000
    }

    /// Returns `true` when the fingerprint is strictly older than
    /// `max_age_seconds`; an age exactly equal to the limit is not stale.
    pub fn is_stale(&self, max_age_seconds: u64) -> bool {
        max_age_seconds < self.age_seconds()
    }
}
/// Chunking parameters that contribute to a document's fingerprint.
///
/// Only a digest of the separator list is kept; the separators themselves
/// are not stored on this type.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkerConfig {
/// Target chunk size.
/// NOTE(review): the unit (bytes, chars, tokens) is not visible here — confirm against the chunker.
pub chunk_size: usize,
/// Overlap between consecutive chunks; presumably in the same unit as
/// `chunk_size` — TODO confirm.
pub chunk_overlap: usize,
/// Digest of the separator list passed to `ChunkerConfig::new`.
pub separators_hash: [u8; 32],
}
impl ChunkerConfig {
    /// Builds a configuration, digesting `separators` into `separators_hash`.
    ///
    /// Each separator is encoded as its byte length (little-endian `u64`)
    /// followed by its bytes. A plain `join("\n")` is ambiguous because the
    /// separators themselves routinely contain `"\n"`: `["\n\n", "\n"]` and
    /// `["\n", "\n\n"]` would both encode to `"\n\n\n\n"`, and `[]` would
    /// collide with `[""]`, so a changed separator list could go unnoticed.
    /// The length prefix makes the encoding uniquely decodable, so distinct
    /// lists always feed distinct byte strings to the hash.
    ///
    /// Digests produced by this encoding differ from those of the older
    /// newline-joined form, so fingerprints stored under the old form will
    /// compare as changed.
    pub fn new(chunk_size: usize, chunk_overlap: usize, separators: &[&str]) -> Self {
        const LEN_PREFIX_BYTES: usize = 8;
        let encoded_len: usize = separators
            .iter()
            .map(|sep| LEN_PREFIX_BYTES + sep.len())
            .sum();

        let mut sep_bytes: Vec<u8> = Vec::with_capacity(encoded_len);
        for sep in separators {
            // Widening cast: `usize` is never wider than 64 bits.
            sep_bytes.extend_from_slice(&(sep.len() as u64).to_le_bytes());
            sep_bytes.extend_from_slice(sep.as_bytes());
        }

        Self {
            chunk_size,
            chunk_overlap,
            separators_hash: blake3_hash(&sep_bytes),
        }
    }

    /// Digest over `chunk_size`, `chunk_overlap` and `separators_hash`, in
    /// that order.
    ///
    /// The two sizes are widened to `u64` before serialization so the hashed
    /// byte string is always 8 + 8 + 32 bytes. `usize::to_le_bytes` is 4 bytes
    /// on 32-bit targets, which would make a persisted hash disagree between
    /// platforms. On 64-bit targets the bytes are unchanged.
    pub fn hash(&self) -> [u8; 32] {
        let mut data = Vec::with_capacity(8 + 8 + self.separators_hash.len());
        // Widening casts: `usize` is never wider than 64 bits.
        data.extend_from_slice(&(self.chunk_size as u64).to_le_bytes());
        data.extend_from_slice(&(self.chunk_overlap as u64).to_le_bytes());
        data.extend_from_slice(&self.separators_hash);
        blake3_hash(&data)
    }
}
impl Default for ChunkerConfig {
    /// Chunk size 512 with overlap 64, splitting on (in this order) Markdown
    /// level-2 and level-3 headings, Rust `fn` / `impl` / `struct` items,
    /// blank lines, single newlines, and finally single spaces.
    fn default() -> Self {
        const DEFAULT_CHUNK_SIZE: usize = 512;
        const DEFAULT_CHUNK_OVERLAP: usize = 64;
        const DEFAULT_SEPARATORS: [&str; 8] = [
            "\n## ",
            "\n### ",
            "\nfn ",
            "\nimpl ",
            "\nstruct ",
            "\n\n",
            "\n",
            " ",
        ];
        Self::new(DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, &DEFAULT_SEPARATORS)
    }
}
/// Produces the 32-byte digest used throughout this file.
///
/// Despite the name, this is NOT BLAKE3 and is not cryptographic. The body
/// computes a 64-bit FNV-1a hash of `data` and then fills the output with
/// four little-endian 8-byte lanes holding `h`, `h + 1`, `h + 2` and `h + 3`
/// (wrapping). The result therefore carries only 64 bits of information, and
/// bytes 8..32 are fully determined by bytes 0..8.
///
/// NOTE(review): swapping in a real BLAKE3 implementation would change every
/// stored digest — coordinate with whatever persists fingerprints.
pub fn blake3_hash(data: &[u8]) -> [u8; 32] {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0100_0000_01b3;

    // FNV-1a step: xor the byte in first, then multiply by the prime.
    let digest = data.iter().fold(FNV_OFFSET_BASIS, |acc, &byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    let mut out = [0u8; 32];
    for (lane, slot) in out.chunks_exact_mut(8).enumerate() {
        slot.copy_from_slice(&digest.wrapping_add(lane as u64).to_le_bytes());
    }
    out
}
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Falls back to 0 when the system clock reports a time earlier than the
/// epoch, rather than propagating the error.
fn current_timestamp_ms() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        // `as_millis` yields u128; the cast keeps the low 64 bits.
        Ok(since_epoch) => since_epoch.as_millis() as u64,
        Err(_) => 0,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Model digest used by every test that does not vary the embedding model.
    const MODEL_HASH: [u8; 32] = [1u8; 32];

    /// Fingerprints `content` with the default chunker settings and `MODEL_HASH`.
    fn default_fingerprint(content: &[u8]) -> DocumentFingerprint {
        DocumentFingerprint::new(content, &ChunkerConfig::default(), MODEL_HASH)
    }

    #[test]
    fn test_fingerprint_creation() {
        let zeroed = [0u8; 32];
        let fp = default_fingerprint(b"Hello, World!");

        assert_ne!(fp.content_hash, zeroed);
        assert_ne!(fp.chunker_config_hash, zeroed);
        assert_eq!(fp.embedding_model_hash, MODEL_HASH);
        assert!(fp.indexed_at > 0);
    }

    #[test]
    fn test_fingerprint_content_change_detection() {
        let before = default_fingerprint(b"content v1");
        let after = default_fingerprint(b"content v2");

        assert!(before.needs_reindex(&after));
    }

    #[test]
    fn test_fingerprint_no_change() {
        let first = default_fingerprint(b"same content");
        let second = default_fingerprint(b"same content");

        assert_eq!(first.content_hash, second.content_hash);
        assert!(!first.needs_reindex(&second));
    }

    #[test]
    fn test_fingerprint_config_change_detection() {
        let large_chunks = ChunkerConfig::new(512, 64, &["\n\n"]);
        let small_chunks = ChunkerConfig::new(256, 32, &["\n\n"]);

        let with_large = DocumentFingerprint::new(b"same content", &large_chunks, MODEL_HASH);
        let with_small = DocumentFingerprint::new(b"same content", &small_chunks, MODEL_HASH);

        assert!(with_large.needs_reindex(&with_small));
    }

    #[test]
    fn test_fingerprint_model_change_detection() {
        let config = ChunkerConfig::default();
        let old_model = [1u8; 32];
        let new_model = [2u8; 32];

        let with_old = DocumentFingerprint::new(b"same content", &config, old_model);
        let with_new = DocumentFingerprint::new(b"same content", &config, new_model);

        assert!(with_old.needs_reindex(&with_new));
    }

    #[test]
    fn test_blake3_hash_deterministic() {
        let payload = b"test data";

        assert_eq!(blake3_hash(payload), blake3_hash(payload));
    }

    #[test]
    fn test_blake3_hash_different_inputs() {
        assert_ne!(blake3_hash(b"input 1"), blake3_hash(b"input 2"));
    }

    #[test]
    fn test_chunker_config_hash_deterministic() {
        let separators = ["\n\n", "\n"];
        let first = ChunkerConfig::new(512, 64, &separators);
        let second = ChunkerConfig::new(512, 64, &separators);

        assert_eq!(first.hash(), second.hash());
    }

    #[test]
    fn test_chunker_config_different_params() {
        let large_chunks = ChunkerConfig::new(512, 64, &["\n\n"]);
        let small_chunks = ChunkerConfig::new(256, 64, &["\n\n"]);

        assert_ne!(large_chunks.hash(), small_chunks.hash());
    }

    #[test]
    fn test_fingerprint_age() {
        let fp = default_fingerprint(b"content");

        // A fingerprint created just now should be under two seconds old.
        assert!(fp.age_seconds() < 2);
    }

    #[test]
    fn test_fingerprint_staleness() {
        let fp = default_fingerprint(b"content");

        assert!(!fp.is_stale(60));
    }
}