Skip to main content

batuta/oracle/rag/
fingerprint.rs

1//! Document Fingerprinting for Poka-Yoke Stale Detection
2//!
3//! Uses BLAKE3 content hashing for content-addressable index invalidation.
4//! Supports the Toyota Way principle of mistake-proofing (Poka-Yoke).
5
6use serde::{Deserialize, Serialize};
7use std::time::{SystemTime, UNIX_EPOCH};
8
9/// Document fingerprint for change detection (Poka-Yoke)
10///
11/// Content-addressable storage pattern from Quinlan & Dorward (2002).
12#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
13pub struct DocumentFingerprint {
14    /// BLAKE3 hash of document content
15    pub content_hash: [u8; 32],
16    /// Hash of chunking parameters (for reproducibility)
17    pub chunker_config_hash: [u8; 32],
18    /// Hash of embedding model weights
19    pub embedding_model_hash: [u8; 32],
20    /// Timestamp of last successful index (Unix epoch ms)
21    pub indexed_at: u64,
22}
23
24impl DocumentFingerprint {
25    /// Create a new fingerprint from content
26    pub fn new(content: &[u8], chunker_config: &ChunkerConfig, model_hash: [u8; 32]) -> Self {
27        Self {
28            content_hash: blake3_hash(content),
29            chunker_config_hash: chunker_config.hash(),
30            embedding_model_hash: model_hash,
31            indexed_at: current_timestamp_ms(),
32        }
33    }
34
35    /// Check if document needs reindexing (Poka-Yoke validation)
36    ///
37    /// Returns true if ANY component changed:
38    /// - Content changed
39    /// - Chunking config changed
40    /// - Embedding model changed
41    pub fn needs_reindex(&self, current: &Self) -> bool {
42        self.content_hash != current.content_hash
43            || self.chunker_config_hash != current.chunker_config_hash
44            || self.embedding_model_hash != current.embedding_model_hash
45    }
46
47    /// Get age in seconds since indexing
48    pub fn age_seconds(&self) -> u64 {
49        let now = current_timestamp_ms();
50        (now.saturating_sub(self.indexed_at)) / 1000
51    }
52
53    /// Check if fingerprint is stale (older than max_age_seconds)
54    pub fn is_stale(&self, max_age_seconds: u64) -> bool {
55        self.age_seconds() > max_age_seconds
56    }
57}
58
59/// Chunker configuration for reproducible chunking
60#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
61pub struct ChunkerConfig {
62    /// Chunk size in tokens
63    pub chunk_size: usize,
64    /// Overlap between chunks
65    pub chunk_overlap: usize,
66    /// Separator list hash
67    pub separators_hash: [u8; 32],
68}
69
70impl ChunkerConfig {
71    /// Create a new chunker config
72    pub fn new(chunk_size: usize, chunk_overlap: usize, separators: &[&str]) -> Self {
73        let sep_bytes: Vec<u8> = separators.join("\n").into_bytes();
74        Self { chunk_size, chunk_overlap, separators_hash: blake3_hash(&sep_bytes) }
75    }
76
77    /// Compute hash of this config
78    pub fn hash(&self) -> [u8; 32] {
79        let mut data = Vec::new();
80        data.extend_from_slice(&self.chunk_size.to_le_bytes());
81        data.extend_from_slice(&self.chunk_overlap.to_le_bytes());
82        data.extend_from_slice(&self.separators_hash);
83        blake3_hash(&data)
84    }
85}
86
87impl Default for ChunkerConfig {
88    fn default() -> Self {
89        Self::new(512, 64, &["\n## ", "\n### ", "\nfn ", "\nimpl ", "\nstruct ", "\n\n", "\n", " "])
90    }
91}
92
/// Compute a deterministic 32-byte digest of `data`.
///
/// NOTE: despite the name, this is currently a placeholder and does NOT
/// compute BLAKE3. It runs 64-bit FNV-1a over the input and stretches the
/// resulting word to 32 bytes, so the output carries at most 64 bits of
/// entropy and is not collision-resistant against adversarial input.
/// The intended production implementation is `blake3::hash(data).into()`.
///
/// BLAKE3 is the planned algorithm because of its:
/// - Speed: 4x faster than SHA-256
/// - Security: 256-bit security level
/// - Parallelism: Built-in SIMD acceleration
pub fn blake3_hash(data: &[u8]) -> [u8; 32] {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0100_0000_01b3;

    // FNV-1a: xor each byte into the state, then multiply by the prime.
    let digest = data
        .iter()
        .fold(FNV_OFFSET_BASIS, |acc, &b| (acc ^ u64::from(b)).wrapping_mul(FNV_PRIME));

    // Stretch to 32 bytes: the k-th 8-byte lane holds `digest + k` (little-endian).
    let mut out = [0u8; 32];
    for (lane, offset) in out.chunks_exact_mut(8).zip(0u64..) {
        lane.copy_from_slice(&digest.wrapping_add(offset).to_le_bytes());
    }
    out
}
120
/// Get the current wall-clock time in milliseconds since the Unix epoch.
///
/// Returns `0` if the system clock reports a time before the epoch. The
/// millisecond count is clamped to `u64::MAX` rather than silently truncated
/// should it ever exceed 64 bits.
fn current_timestamp_ms() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => {
            let millis = since_epoch.as_millis().min(u128::from(u64::MAX));
            // Lossless: `millis` was clamped into the `u64` range just above.
            millis as u64
        }
        // Clock is set before 1970-01-01; fall back to the epoch itself.
        Err(_) => 0,
    }
}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Model digest shared by every test that does not vary the embedding model.
    const MODEL: [u8; 32] = [1u8; 32];

    /// Fingerprint `content` using the default chunker config and `MODEL`.
    fn default_fp(content: &[u8]) -> DocumentFingerprint {
        DocumentFingerprint::new(content, &ChunkerConfig::default(), MODEL)
    }

    #[test]
    fn test_fingerprint_creation() {
        let zeroed = [0u8; 32];
        let fp = default_fp(b"Hello, World!");

        assert_ne!(fp.content_hash, zeroed);
        assert_ne!(fp.chunker_config_hash, zeroed);
        assert_eq!(fp.embedding_model_hash, MODEL);
        assert!(fp.indexed_at > 0);
    }

    #[test]
    fn test_fingerprint_content_change_detection() {
        let before = default_fp(b"content v1");
        let after = default_fp(b"content v2");

        assert!(before.needs_reindex(&after));
    }

    #[test]
    fn test_fingerprint_no_change() {
        let first = default_fp(b"same content");
        let second = default_fp(b"same content");

        // Identical bytes must yield identical content digests.
        assert_eq!(first.content_hash, second.content_hash);
        // Timestamps may differ, but needs_reindex looks at the digests only.
        assert!(!first.needs_reindex(&second));
    }

    #[test]
    fn test_fingerprint_config_change_detection() {
        // Same separators, different chunk size and overlap.
        let large = ChunkerConfig::new(512, 64, &["\n\n"]);
        let small = ChunkerConfig::new(256, 32, &["\n\n"]);

        let with_large = DocumentFingerprint::new(b"same content", &large, MODEL);
        let with_small = DocumentFingerprint::new(b"same content", &small, MODEL);

        assert!(with_large.needs_reindex(&with_small));
    }

    #[test]
    fn test_fingerprint_model_change_detection() {
        let config = ChunkerConfig::default();
        let other_model = [2u8; 32];

        let old_model_fp = DocumentFingerprint::new(b"same content", &config, MODEL);
        let new_model_fp = DocumentFingerprint::new(b"same content", &config, other_model);

        assert!(old_model_fp.needs_reindex(&new_model_fp));
    }

    #[test]
    fn test_blake3_hash_deterministic() {
        let payload = b"test data";
        assert_eq!(blake3_hash(payload), blake3_hash(payload));
    }

    #[test]
    fn test_blake3_hash_different_inputs() {
        assert_ne!(blake3_hash(b"input 1"), blake3_hash(b"input 2"));
    }

    #[test]
    fn test_chunker_config_hash_deterministic() {
        let separators = ["\n\n", "\n"];
        let first = ChunkerConfig::new(512, 64, &separators);
        let second = ChunkerConfig::new(512, 64, &separators);

        assert_eq!(first.hash(), second.hash());
    }

    #[test]
    fn test_chunker_config_different_params() {
        let wide = ChunkerConfig::new(512, 64, &["\n\n"]);
        let narrow = ChunkerConfig::new(256, 64, &["\n\n"]);

        assert_ne!(wide.hash(), narrow.hash());
    }

    #[test]
    fn test_fingerprint_age() {
        // A freshly created fingerprint should be at most a second or so old.
        let age = default_fp(b"content").age_seconds();
        assert!(age < 2);
    }

    #[test]
    fn test_fingerprint_staleness() {
        // A freshly created fingerprint is well inside a one-minute threshold.
        let one_minute = 60;
        assert!(!default_fp(b"content").is_stale(one_minute));
    }
}