batuta/oracle/rag/
fingerprint.rs1use serde::{Deserialize, Serialize};
7use std::time::{SystemTime, UNIX_EPOCH};
8
9#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
13pub struct DocumentFingerprint {
14 pub content_hash: [u8; 32],
16 pub chunker_config_hash: [u8; 32],
18 pub embedding_model_hash: [u8; 32],
20 pub indexed_at: u64,
22}
23
24impl DocumentFingerprint {
25 pub fn new(content: &[u8], chunker_config: &ChunkerConfig, model_hash: [u8; 32]) -> Self {
27 Self {
28 content_hash: blake3_hash(content),
29 chunker_config_hash: chunker_config.hash(),
30 embedding_model_hash: model_hash,
31 indexed_at: current_timestamp_ms(),
32 }
33 }
34
35 pub fn needs_reindex(&self, current: &Self) -> bool {
42 self.content_hash != current.content_hash
43 || self.chunker_config_hash != current.chunker_config_hash
44 || self.embedding_model_hash != current.embedding_model_hash
45 }
46
47 pub fn age_seconds(&self) -> u64 {
49 let now = current_timestamp_ms();
50 (now.saturating_sub(self.indexed_at)) / 1000
51 }
52
53 pub fn is_stale(&self, max_age_seconds: u64) -> bool {
55 self.age_seconds() > max_age_seconds
56 }
57}
58
59#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
61pub struct ChunkerConfig {
62 pub chunk_size: usize,
64 pub chunk_overlap: usize,
66 pub separators_hash: [u8; 32],
68}
69
70impl ChunkerConfig {
71 pub fn new(chunk_size: usize, chunk_overlap: usize, separators: &[&str]) -> Self {
73 let sep_bytes: Vec<u8> = separators.join("\n").into_bytes();
74 Self { chunk_size, chunk_overlap, separators_hash: blake3_hash(&sep_bytes) }
75 }
76
77 pub fn hash(&self) -> [u8; 32] {
79 let mut data = Vec::new();
80 data.extend_from_slice(&self.chunk_size.to_le_bytes());
81 data.extend_from_slice(&self.chunk_overlap.to_le_bytes());
82 data.extend_from_slice(&self.separators_hash);
83 blake3_hash(&data)
84 }
85}
86
87impl Default for ChunkerConfig {
88 fn default() -> Self {
89 Self::new(512, 64, &["\n## ", "\n### ", "\nfn ", "\nimpl ", "\nstruct ", "\n\n", "\n", " "])
90 }
91}
92
/// Produces a deterministic 32-byte digest of `data`.
///
/// Despite its name this function does not compute BLAKE3. It runs a 64-bit
/// FNV-1a style pass (xor each byte, then multiply by the FNV prime) and
/// expands the resulting word into four little-endian 8-byte lanes, where
/// lane `i` holds `state + i` (wrapping). The output therefore carries only
/// 64 bits of state and is not collision-resistant; it is suitable for
/// change detection only.
pub fn blake3_hash(data: &[u8]) -> [u8; 32] {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0100_0000_01b3;

    let state = data
        .iter()
        .fold(FNV_OFFSET_BASIS, |acc, &byte| (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME));

    let mut digest = [0u8; 32];
    for (offset, lane) in (0u64..).zip(digest.chunks_exact_mut(8)) {
        lane.copy_from_slice(&state.wrapping_add(offset).to_le_bytes());
    }
    digest
}
120
/// Current wall-clock time in milliseconds since the Unix epoch.
///
/// Falls back to 0 when the system clock reports a time before the epoch,
/// so callers never have to handle an error.
fn current_timestamp_ms() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis() as u64,
        Err(_) => 0,
    }
}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Model hash used by every test that does not vary the embedding model.
    const MODEL_HASH: [u8; 32] = [1u8; 32];

    /// An all-zero digest, used to check that hashes were actually computed.
    const ZERO_HASH: [u8; 32] = [0u8; 32];

    /// Fingerprints `content` with the default chunker config and `MODEL_HASH`.
    fn default_fingerprint(content: &[u8]) -> DocumentFingerprint {
        DocumentFingerprint::new(content, &ChunkerConfig::default(), MODEL_HASH)
    }

    #[test]
    fn test_fingerprint_creation() {
        let fingerprint = default_fingerprint(b"Hello, World!");

        assert_ne!(fingerprint.content_hash, ZERO_HASH);
        assert_ne!(fingerprint.chunker_config_hash, ZERO_HASH);
        assert_eq!(fingerprint.embedding_model_hash, MODEL_HASH);
        assert!(fingerprint.indexed_at > 0);
    }

    #[test]
    fn test_fingerprint_content_change_detection() {
        let before = default_fingerprint(b"content v1");
        let after = default_fingerprint(b"content v2");

        assert!(before.needs_reindex(&after));
    }

    #[test]
    fn test_fingerprint_no_change() {
        let first = default_fingerprint(b"same content");
        let second = default_fingerprint(b"same content");

        assert_eq!(first.content_hash, second.content_hash);
        assert!(!first.needs_reindex(&second));
    }

    #[test]
    fn test_fingerprint_config_change_detection() {
        let large_chunks = ChunkerConfig::new(512, 64, &["\n\n"]);
        let small_chunks = ChunkerConfig::new(256, 32, &["\n\n"]);

        let before = DocumentFingerprint::new(b"same content", &large_chunks, MODEL_HASH);
        let after = DocumentFingerprint::new(b"same content", &small_chunks, MODEL_HASH);

        assert!(before.needs_reindex(&after));
    }

    #[test]
    fn test_fingerprint_model_change_detection() {
        let config = ChunkerConfig::default();
        let other_model_hash = [2u8; 32];

        let before = DocumentFingerprint::new(b"same content", &config, MODEL_HASH);
        let after = DocumentFingerprint::new(b"same content", &config, other_model_hash);

        assert!(before.needs_reindex(&after));
    }

    #[test]
    fn test_blake3_hash_deterministic() {
        let input = b"test data";
        assert_eq!(blake3_hash(input), blake3_hash(input));
    }

    #[test]
    fn test_blake3_hash_different_inputs() {
        assert_ne!(blake3_hash(b"input 1"), blake3_hash(b"input 2"));
    }

    #[test]
    fn test_chunker_config_hash_deterministic() {
        let separators = ["\n\n", "\n"];
        let first = ChunkerConfig::new(512, 64, &separators);
        let second = ChunkerConfig::new(512, 64, &separators);

        assert_eq!(first.hash(), second.hash());
    }

    #[test]
    fn test_chunker_config_different_params() {
        let wide = ChunkerConfig::new(512, 64, &["\n\n"]);
        let narrow = ChunkerConfig::new(256, 64, &["\n\n"]);

        assert_ne!(wide.hash(), narrow.hash());
    }

    #[test]
    fn test_fingerprint_age() {
        // A freshly created fingerprint must be well under two seconds old.
        let fingerprint = default_fingerprint(b"content");

        assert!(fingerprint.age_seconds() < 2);
    }

    #[test]
    fn test_fingerprint_staleness() {
        // A freshly created fingerprint cannot exceed a 60 second budget.
        let fingerprint = default_fingerprint(b"content");

        assert!(!fingerprint.is_stale(60));
    }
}