// context_core/cache/invalidation.rs

use std::collections::{BTreeMap, BTreeSet};
use std::fs;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};

use chrono::Utc;
use sha2::{Digest, Sha256};
use thiserror::Error;

use crate::cache::cache::ContextCache;
use crate::cache::versioning::{CacheBuildConfig, CacheIndex, CacheManifest, ManifestDocumentEntry};
use crate::document::Document;
12
13#[derive(Debug, Error)]
14pub enum CacheBuildError {
15    #[error("IO error: {0}")]
16    Io(#[from] std::io::Error),
17    #[error("Serialization error: {0}")]
18    Serialization(#[from] serde_json::Error),
19    #[error("Output directory already exists: {0}")]
20    OutputExists(PathBuf),
21    #[error("Filename collision detected for hash fragment: {0}")]
22    FilenameCollision(String),
23    #[error("Duplicate document ID: {0}")]
24    DuplicateDocumentId(String),
25    #[error("Invalid version format: {0}")]
26    InvalidVersionFormat(String),
27}
28
29/// CacheBuilder is single-threaded and non-reentrant by design.
30pub struct CacheBuilder {
31    config: CacheBuildConfig,
32}
33
34impl CacheBuilder {
35    pub fn new(config: CacheBuildConfig) -> Self {
36        Self { config }
37    }
38
39    pub fn build(
40        &self,
41        documents: Vec<Document>,
42        output_dir: &Path,
43    ) -> Result<ContextCache, CacheBuildError> {
44        if output_dir.exists() {
45            return Err(CacheBuildError::OutputExists(output_dir.to_path_buf()));
46        }
47
48        // 1. Sort documents by ID to ensure determinism
49        let mut sorted_docs = documents;
50        sorted_docs.sort_by(|a, b| a.id.cmp(&b.id));
51
52        // 1b. Check for duplicate document IDs (adjacent after sort)
53        for pair in sorted_docs.windows(2) {
54            if pair[0].id == pair[1].id {
55                return Err(CacheBuildError::DuplicateDocumentId(
56                    pair[0].id.as_str().to_string(),
57                ));
58            }
59        }
60
61        // 2. Prepare structures and check for collisions
62        // We store pairs of (Document, ManifestEntry) to guarantee alignment explicitly
63        let mut doc_contexts = Vec::with_capacity(sorted_docs.len());
64        let mut index_entries = BTreeMap::new();
65        let mut seen_filenames = BTreeSet::new();
66
67        // Used for cache version computation
68        // "sorted(document_id + ":" + document_version)"
69        let mut version_hasher = Sha256::new();
70
71        // Hash the config
72        let config_json = serde_json::to_vec(&self.config)?;
73        version_hasher.update(&config_json);
74
75        for doc in &sorted_docs {
76            // Update cache version hash
77            let line = format!("{}:{}", doc.id.as_str(), doc.version.as_str());
78            version_hasher.update(line.as_bytes());
79
80            // Determine filename: first 12 chars of version hash (without prefix)
81            let full_hash = doc
82                .version
83                .as_str()
84                .strip_prefix("sha256:")
85                .ok_or_else(|| CacheBuildError::InvalidVersionFormat(doc.version.as_str().to_string()))?;
86
87            if full_hash.len() < 12 {
88                // Should not happen for sha256, but safe handling
89                return Err(CacheBuildError::FilenameCollision(full_hash.to_string()));
90            }
91            let filename_stem = &full_hash[..12];
92            let filename = format!("{}.json", filename_stem);
93
94            // Check collision
95            if seen_filenames.contains(filename_stem) {
96                return Err(CacheBuildError::FilenameCollision(filename_stem.to_string()));
97            }
98            seen_filenames.insert(filename_stem.to_string());
99
100            // Add to entries
101            let relative_path = format!("documents/{}", filename);
102
103            let entry = ManifestDocumentEntry {
104                id: doc.id.clone(),
105                version: doc.version.clone(),
106                file: relative_path.clone(),
107            };
108
109            index_entries.insert(doc.id.clone(), relative_path);
110            doc_contexts.push((doc, entry));
111        }
112
113        let hash_bytes = version_hasher.finalize();
114        let cache_version = format!("sha256:{}", hex::encode(hash_bytes));
115
116        // 3. Create Manifest
117        // Collect manifest documents from our aligned context
118        let mut manifest_documents: Vec<ManifestDocumentEntry> = doc_contexts
119            .iter()
120            .map(|(_, entry)| entry.clone())
121            .collect();
122
123        // Explicitly sort again just to be absolutely safe against refactors
124        manifest_documents.sort_by(|a, b| a.id.cmp(&b.id));
125
126        // Note: created_at is strictly informational
127        let manifest = CacheManifest {
128            cache_version: cache_version.clone(),
129            build_config: self.config.clone(),
130            created_at: Utc::now(),
131            document_count: sorted_docs.len(),
132            documents: manifest_documents,
133        };
134
135        let index = CacheIndex::new(index_entries);
136
137        // 4. Write to temp dir
138        // Use a deterministic-but-unique temp dir
139        // We use the first 12 chars of the new cache version to avoid collisions
140        // between different builds targeting the same parent dir (unlikely but safer)
141        let temp_suffix = format!("tmp.{}", &cache_version[7..19]);
142        let temp_dir = output_dir.with_extension(temp_suffix);
143
144        // Clean up any stale temp dir from a crashed previous run of THIS specific version
145        if temp_dir.exists() {
146            fs::remove_dir_all(&temp_dir)?;
147        }
148        fs::create_dir_all(&temp_dir)?;
149        fs::create_dir(temp_dir.join("documents"))?;
150
151        // Write documents
152        // doc_contexts guarantees alignment
153        for (doc, entry) in doc_contexts {
154            let path = temp_dir.join(&entry.file); // entry.file is "documents/..."
155            let f = fs::File::create(path)?;
156            serde_json::to_writer(&f, doc)?;
157            f.sync_all()?;
158        }
159
160        // Write index.json
161        let index_path = temp_dir.join("index.json");
162        let f_idx = fs::File::create(index_path)?;
163        // BTreeMap ensures lexicographical sort of keys
164        serde_json::to_writer_pretty(&f_idx, &index)?;
165        f_idx.sync_all()?;
166
167        // Write manifest.json
168        let manifest_path = temp_dir.join("manifest.json");
169        let f_man = fs::File::create(manifest_path)?;
170        serde_json::to_writer_pretty(&f_man, &manifest)?;
171        f_man.sync_all()?;
172
173        // 5. Atomic Rename
174        fs::rename(&temp_dir, output_dir)?;
175
176        Ok(ContextCache {
177            root: output_dir.to_path_buf(),
178            manifest,
179        })
180    }
181}