Skip to main content

ripvec_core/cache/
file_cache.rs

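//! Per-file cache entry storing chunks and their embeddings.
//!
//! Uses `rkyv` as the primary archive format. rkyv archives support
//! zero-copy access, but objects written by this module are zstd-compressed,
//! so they are decompressed and deserialized into an owned value on load
//! rather than being memory-mapped and read in place.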
5
6use crate::chunk::CodeChunk;
7
8/// Cached chunks and embeddings for a single source file.
9///
10/// Stored as an rkyv archive in the object store, keyed by the blake3
11/// hash of the source file content.
12#[derive(
13    Debug, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize, bitcode::Encode, bitcode::Decode,
14)]
15pub struct FileCache {
16    /// The semantic chunks extracted from this file.
17    pub chunks: Vec<CodeChunk>,
18    /// Flat embedding data: `[n_chunks × hidden_dim]` contiguous f32 values.
19    pub embeddings: Vec<f32>,
20    /// The embedding dimension (e.g., 384 for BGE-small, 768 for ModernBERT).
21    pub hidden_dim: usize,
22}
23
/// Leading bytes of a zstd frame, used to recognise compressed cache objects.
///
/// This is the 32-bit zstd frame magic number `0xFD2FB528` in its on-disk
/// little-endian byte order, i.e. `28 B5 2F FD`. Legacy uncompressed objects
/// are raw rkyv data and are assumed not to start with this sequence, which
/// is what lets the reader tell the two formats apart.
const ZSTD_MAGIC: [u8; 4] = 0xFD2F_B528_u32.to_le_bytes();
28
/// Prefix that marks a bitcode-encoded (portable) cache object.
///
/// The two ASCII bytes `B` `C` (for "bitcode"); the first byte differs from
/// the zstd frame magic (`0x28 0xB5 …`), so the two prefixes cannot be confused.
const BITCODE_MAGIC: [u8; 2] = *b"BC";
32
33impl FileCache {
34    /// Serialize to zstd-compressed rkyv bytes.
35    ///
36    /// Embedding vectors compress ~8:1 with zstd because most values cluster
37    /// near zero. Level 1 matches level 3 ratio on this data with faster compression.
38    ///
39    /// # Panics
40    ///
41    /// Panics if serialization fails (should not happen for valid data).
42    #[must_use]
43    pub fn to_bytes(&self) -> Vec<u8> {
44        let raw = rkyv::to_bytes::<rkyv::rancor::Error>(self)
45            .expect("FileCache serialization should never fail");
46        zstd::encode_all(raw.as_slice(), 1)
47            .expect("zstd compression should never fail on valid data")
48    }
49
50    /// Deserialize from (optionally zstd-compressed) rkyv bytes.
51    ///
52    /// Transparently handles both compressed and legacy uncompressed objects
53    /// by checking for the zstd magic number.
54    ///
55    /// # Errors
56    ///
57    /// Returns an error if the bytes are not a valid archive.
58    pub fn from_bytes(bytes: &[u8]) -> crate::Result<Self> {
59        let raw = if bytes.len() >= 4 && bytes[..4] == ZSTD_MAGIC {
60            zstd::decode_all(bytes).map_err(|e| {
61                crate::Error::Other(anyhow::anyhow!("zstd decompression failed: {e}"))
62            })?
63        } else {
64            // Legacy uncompressed format — pass through.
65            bytes.to_vec()
66        };
67        rkyv::from_bytes::<Self, rkyv::rancor::Error>(&raw)
68            .map_err(|e| crate::Error::Other(anyhow::anyhow!("rkyv deserialization failed: {e}")))
69    }
70
71    /// Serialize to portable zstd-compressed bitcode bytes.
72    ///
73    /// Unlike `to_bytes` (rkyv, architecture-dependent), this format is safe
74    /// to share across different CPU architectures (e.g., x86_64 CI → aarch64 Mac).
75    ///
76    /// # Panics
77    ///
78    /// Panics if zstd compression fails (should not happen for valid data).
79    #[must_use]
80    pub fn to_portable_bytes(&self) -> Vec<u8> {
81        let raw = bitcode::encode(self);
82        let compressed = zstd::encode_all(raw.as_slice(), 1)
83            .expect("zstd compression should never fail on valid data");
84        let mut out = Vec::with_capacity(BITCODE_MAGIC.len() + compressed.len());
85        out.extend_from_slice(&BITCODE_MAGIC);
86        out.extend_from_slice(&compressed);
87        out
88    }
89
90    /// Deserialize from portable zstd-compressed bitcode bytes.
91    ///
92    /// Expects the `BITCODE_MAGIC` prefix. Returns an error if the bytes
93    /// are not a valid portable archive.
94    ///
95    /// # Errors
96    ///
97    /// Returns an error if the magic prefix is missing, decompression fails,
98    /// or bitcode deserialization fails.
99    pub fn from_portable_bytes(bytes: &[u8]) -> crate::Result<Self> {
100        if bytes.len() < 2 || bytes[..2] != BITCODE_MAGIC {
101            return Err(crate::Error::Other(anyhow::anyhow!(
102                "not a portable bitcode cache object (missing magic)"
103            )));
104        }
105        let compressed = &bytes[2..];
106        let raw = zstd::decode_all(compressed)
107            .map_err(|e| crate::Error::Other(anyhow::anyhow!("zstd decompression failed: {e}")))?;
108        bitcode::decode(&raw).map_err(|e| {
109            crate::Error::Other(anyhow::anyhow!("bitcode deserialization failed: {e}"))
110        })
111    }
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a cache with a single chunk and one 4-dimensional embedding.
    fn sample_cache() -> FileCache {
        FileCache {
            chunks: vec![CodeChunk {
                file_path: "test.rs".into(),
                name: "foo".into(),
                kind: "function".into(),
                start_line: 1,
                end_line: 10,
                enriched_content: "fn foo() {}".into(),
                content: "fn foo() {}".into(),
            }],
            embeddings: vec![1.0, 2.0, 3.0, 4.0],
            hidden_dim: 4,
        }
    }

    /// Builds a cache with no chunks and no embeddings.
    fn empty_sample() -> FileCache {
        FileCache {
            chunks: vec![],
            embeddings: vec![],
            hidden_dim: 384,
        }
    }

    /// Checks that `loaded` carries the same data as `sample_cache()`.
    fn assert_matches_sample(loaded: &FileCache) {
        assert_eq!(loaded.chunks.len(), 1);
        assert_eq!(loaded.chunks[0].name, "foo");
        assert_eq!(loaded.embeddings.len(), 4);
        assert_eq!(loaded.embeddings, [1.0, 2.0, 3.0, 4.0]);
        assert_eq!(loaded.hidden_dim, 4);
    }

    /// Checks that `loaded` carries the same data as `empty_sample()`.
    fn assert_matches_empty(loaded: &FileCache) {
        assert_eq!(loaded.chunks.len(), 0);
        assert_eq!(loaded.embeddings.len(), 0);
        assert_eq!(loaded.hidden_dim, 384);
    }

    #[test]
    fn round_trip() {
        let bytes = sample_cache().to_bytes();
        let loaded = FileCache::from_bytes(&bytes).unwrap();
        assert_matches_sample(&loaded);
    }

    #[test]
    fn empty_cache() {
        let bytes = empty_sample().to_bytes();
        let loaded = FileCache::from_bytes(&bytes).unwrap();
        assert_matches_empty(&loaded);
    }

    #[test]
    fn invalid_bytes_returns_error() {
        let result = FileCache::from_bytes(b"garbage data");
        assert!(result.is_err());
    }

    #[test]
    fn to_bytes_starts_with_zstd_magic() {
        let bytes = sample_cache().to_bytes();
        assert!(bytes.starts_with(&ZSTD_MAGIC));
    }

    #[test]
    fn legacy_uncompressed_round_trip() {
        // Legacy objects are raw rkyv archives with no zstd wrapper.
        let raw = rkyv::to_bytes::<rkyv::rancor::Error>(&sample_cache()).unwrap();
        let loaded = FileCache::from_bytes(&raw).unwrap();
        assert_matches_sample(&loaded);
    }

    #[test]
    fn truncated_bytes_returns_error() {
        let bytes = sample_cache().to_bytes();
        let result = FileCache::from_bytes(&bytes[..bytes.len() / 2]);
        assert!(result.is_err());
    }

    #[test]
    fn portable_round_trip() {
        let bytes = sample_cache().to_portable_bytes();
        let loaded = FileCache::from_portable_bytes(&bytes).unwrap();
        assert_matches_sample(&loaded);
    }

    #[test]
    fn portable_empty_cache() {
        let bytes = empty_sample().to_portable_bytes();
        let loaded = FileCache::from_portable_bytes(&bytes).unwrap();
        assert_matches_empty(&loaded);
    }

    #[test]
    fn portable_invalid_bytes_returns_error() {
        let result = FileCache::from_portable_bytes(b"garbage data");
        assert!(result.is_err());
    }

    #[test]
    fn portable_bytes_start_with_bitcode_magic() {
        let bytes = sample_cache().to_portable_bytes();
        assert!(bytes.starts_with(&BITCODE_MAGIC));
    }

    #[test]
    fn portable_rejects_native_bytes() {
        // Native objects start with the zstd magic, not the bitcode prefix.
        let bytes = sample_cache().to_bytes();
        assert!(FileCache::from_portable_bytes(&bytes).is_err());
    }
}