Skip to main content

cognis_rag/
record_manager.rs

1//! Incremental indexing — track per-document fingerprints so re-indexing
2//! only re-embeds new or changed documents and removes deleted ones.
3//!
4//! The Rust-native shape:
5//! - [`RecordManager`] is a thin trait (4 methods, all async).
6//! - Fingerprints are arbitrary `String`s — typically a content hash.
7//! - The pipeline does the diffing; the record manager just stores state.
8
9use std::collections::HashMap;
10use std::sync::Mutex;
11
12use async_trait::async_trait;
13
14use cognis_core::{CognisError, Result};
15
16/// Per-key indexing state. Each `key` is a stable identifier for a
17/// document (path, URL, primary key); the `fingerprint` is whatever the
18/// caller computes from the doc — usually a content hash.
19#[async_trait]
20pub trait RecordManager: Send + Sync {
21    /// All keys currently tracked in `group`. Used to detect deletions.
22    async fn list_keys(&self, group: &str) -> Result<Vec<String>>;
23
24    /// Look up the fingerprint stored for `(group, key)`, if any.
25    async fn get_fingerprint(&self, group: &str, key: &str) -> Result<Option<String>>;
26
27    /// Record `fingerprint` for `(group, key)`. Replaces any existing.
28    async fn set_fingerprint(&self, group: &str, key: &str, fingerprint: &str) -> Result<()>;
29
30    /// Forget `(group, key)` pairs.
31    async fn delete(&self, group: &str, keys: &[String]) -> Result<()>;
32}
33
/// In-process record manager. Suitable for tests and single-process apps.
///
/// All state lives in a mutex-guarded map held by this value, so nothing
/// outlives it.
#[derive(Debug, Default)]
pub struct InMemoryRecordManager {
    /// Maps `(group, key)` to the fingerprint last recorded for that pair.
    inner: Mutex<HashMap<(String, String), String>>,
}
39
40impl InMemoryRecordManager {
41    /// Empty record manager.
42    pub fn new() -> Self {
43        Self::default()
44    }
45}
46
47#[async_trait]
48impl RecordManager for InMemoryRecordManager {
49    async fn list_keys(&self, group: &str) -> Result<Vec<String>> {
50        let inner = self
51            .inner
52            .lock()
53            .map_err(|e| CognisError::Internal(format!("record_manager mutex: {e}")))?;
54        Ok(inner
55            .keys()
56            .filter(|(g, _)| g == group)
57            .map(|(_, k)| k.clone())
58            .collect())
59    }
60    async fn get_fingerprint(&self, group: &str, key: &str) -> Result<Option<String>> {
61        let inner = self
62            .inner
63            .lock()
64            .map_err(|e| CognisError::Internal(format!("record_manager mutex: {e}")))?;
65        Ok(inner.get(&(group.to_string(), key.to_string())).cloned())
66    }
67    async fn set_fingerprint(&self, group: &str, key: &str, fingerprint: &str) -> Result<()> {
68        let mut inner = self
69            .inner
70            .lock()
71            .map_err(|e| CognisError::Internal(format!("record_manager mutex: {e}")))?;
72        inner.insert(
73            (group.to_string(), key.to_string()),
74            fingerprint.to_string(),
75        );
76        Ok(())
77    }
78    async fn delete(&self, group: &str, keys: &[String]) -> Result<()> {
79        let mut inner = self
80            .inner
81            .lock()
82            .map_err(|e| CognisError::Internal(format!("record_manager mutex: {e}")))?;
83        for k in keys {
84            inner.remove(&(group.to_string(), k.clone()));
85        }
86        Ok(())
87    }
88}
89
/// Stable content fingerprint — FNV-1a 128-bit, rendered as a 32-character
/// lowercase hex string (zero-padded).
///
/// With 128 bits of output, accidental collisions between non-adversarial
/// inputs only become likely around 2^64 distinct documents — far beyond
/// what change-detection over millions of docs needs. FNV is not a
/// cryptographic hash, so do not rely on it where an attacker controls the
/// content.
///
/// The algorithm is fixed (FNV-1a, the published 128-bit constants), so
/// fingerprints stored to disk stay valid across Rust toolchain
/// upgrades. Stays in-tree — no hash crate dependency.
///
/// Identical content always produces the same fingerprint; different
/// content produces a different one unless the two inputs collide. The
/// empty string hashes to the offset basis itself.
pub fn fingerprint(content: &str) -> String {
    // FNV-1a 128-bit constants (published, public-domain).
    // See http://www.isthe.com/chongo/tech/comp/fnv/
    const FNV_OFFSET_BASIS: u128 = 0x6c62272e07bb014262b821756295c58d;
    const FNV_PRIME: u128 = 0x0000000001000000000000000000013b;

    // FNV-1a: XOR the byte in first, then multiply. The multiply wraps
    // modulo 2^128 by definition of the algorithm.
    let digest = content.bytes().fold(FNV_OFFSET_BASIS, |state, byte| {
        (state ^ u128::from(byte)).wrapping_mul(FNV_PRIME)
    });
    format!("{digest:032x}")
}
112
#[cfg(test)]
mod tests {
    use super::*;

    // `fingerprint` is a plain synchronous function, so this test needs no
    // async runtime.
    #[test]
    fn fingerprint_is_deterministic() {
        assert_eq!(fingerprint("hello"), fingerprint("hello"));
        assert_ne!(fingerprint("hello"), fingerprint("world"));
    }

    #[test]
    fn fingerprint_is_stable_across_releases() {
        // Locked-in reference values. These are the FNV-1a 128-bit
        // outputs of the current implementation; if any change makes
        // this test fail, it means stored fingerprints in production
        // record managers will be invalidated — the algorithm change
        // must go through a versioned format migration, not a silent
        // bump.
        assert_eq!(
            fingerprint(""),
            "6c62272e07bb014262b821756295c58d",
            "empty input must equal the FNV-1a 128 offset basis"
        );
        assert_eq!(fingerprint("hello"), "e3e1efd54283d94f7081314b599d31b3");
        assert_eq!(
            fingerprint("the quick brown fox jumps over the lazy dog"),
            "577ea59947cc87c26ffa73dd35a3f550"
        );
    }

    #[test]
    fn fingerprint_is_32_hex_chars() {
        for s in ["", "a", "longer content with whitespace and digits 12345"] {
            let fp = fingerprint(s);
            assert_eq!(fp.len(), 32, "fp = {fp}");
            assert!(fp.chars().all(|c| c.is_ascii_hexdigit()));
        }
    }

    #[tokio::test]
    async fn record_manager_roundtrip() {
        let m = InMemoryRecordManager::new();
        m.set_fingerprint("g", "k1", "fp1").await.unwrap();
        m.set_fingerprint("g", "k2", "fp2").await.unwrap();
        m.set_fingerprint("other", "k1", "x").await.unwrap();

        assert_eq!(
            m.get_fingerprint("g", "k1").await.unwrap(),
            Some("fp1".into())
        );
        assert_eq!(m.get_fingerprint("g", "missing").await.unwrap(), None);

        // Key order is hash-map iteration order; sort before comparing.
        let mut keys = m.list_keys("g").await.unwrap();
        keys.sort();
        assert_eq!(keys, vec!["k1", "k2"]);

        m.delete("g", &["k1".into()]).await.unwrap();
        assert_eq!(m.get_fingerprint("g", "k1").await.unwrap(), None);
        // Same key under a different group must survive the delete.
        assert_eq!(
            m.get_fingerprint("other", "k1").await.unwrap(),
            Some("x".into())
        );
    }

    #[tokio::test]
    async fn set_fingerprint_replaces_existing_value() {
        let m = InMemoryRecordManager::new();
        m.set_fingerprint("g", "k", "old").await.unwrap();
        m.set_fingerprint("g", "k", "new").await.unwrap();

        assert_eq!(
            m.get_fingerprint("g", "k").await.unwrap(),
            Some("new".into())
        );
        // Overwriting must not create a second entry for the same key.
        assert_eq!(m.list_keys("g").await.unwrap(), vec!["k"]);
    }

    #[tokio::test]
    async fn delete_skips_unknown_keys_and_other_groups() {
        let m = InMemoryRecordManager::new();
        m.set_fingerprint("g", "k1", "fp1").await.unwrap();

        m.delete("g", &["missing".into()]).await.unwrap();
        m.delete("other", &["k1".into()]).await.unwrap();

        assert_eq!(
            m.get_fingerprint("g", "k1").await.unwrap(),
            Some("fp1".into())
        );

        m.delete("g", &["k1".into(), "missing".into()])
            .await
            .unwrap();
        assert!(m.list_keys("g").await.unwrap().is_empty());
    }
}