Skip to main content

sql_splitter/duckdb/
cache.rs

//! Cache manager for persistent DuckDB databases.
//!
//! Caches imported SQL dumps as DuckDB database files for fast repeated queries.
4
5use anyhow::{Context, Result};
6use sha2::{Digest, Sha256};
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::time::SystemTime;
10
11/// Cache entry metadata
12#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
13pub struct CacheEntry {
14    /// Original dump file path
15    pub dump_path: String,
16    /// SHA256 hash of (path + size + mtime)
17    pub cache_key: String,
18    /// Size of original dump file
19    pub dump_size: u64,
20    /// Modification time of dump file (as Unix timestamp)
21    pub dump_mtime: u64,
22    /// Size of cached DuckDB file
23    pub cache_size: u64,
24    /// When this cache entry was created
25    pub created_at: u64,
26    /// Number of tables in the cache
27    pub table_count: usize,
28    /// Total rows in the cache
29    pub row_count: u64,
30}
31
32/// Cache index containing all cache entries
33#[derive(Debug, Default, serde::Serialize, serde::Deserialize)]
34pub struct CacheIndex {
35    pub entries: Vec<CacheEntry>,
36}
37
/// Manager for cached DuckDB databases.
///
/// Owns nothing but the path of the directory that holds the cached
/// `<key>.duckdb` files and the `index.json` index describing them, so it is
/// cheap to clone.
#[derive(Debug, Clone)]
pub struct CacheManager {
    /// Directory containing the cached database files and the index
    cache_dir: PathBuf,
}
42
43impl CacheManager {
44    /// Create a new cache manager with the default cache directory
45    pub fn new() -> Result<Self> {
46        let cache_dir = Self::default_cache_dir()?;
47        fs::create_dir_all(&cache_dir).context("Failed to create cache directory")?;
48        Ok(Self { cache_dir })
49    }
50
51    /// Create a cache manager with a custom cache directory
52    pub fn with_dir(cache_dir: PathBuf) -> Result<Self> {
53        fs::create_dir_all(&cache_dir).context("Failed to create cache directory")?;
54        Ok(Self { cache_dir })
55    }
56
57    /// Get the default cache directory
58    pub fn default_cache_dir() -> Result<PathBuf> {
59        let cache_base = dirs::cache_dir()
60            .or_else(|| dirs::home_dir().map(|h| h.join(".cache")))
61            .context("Could not determine cache directory")?;
62
63        Ok(cache_base.join("sql-splitter").join("duckdb"))
64    }
65
66    /// Compute the cache key for a dump file
67    pub fn compute_cache_key(dump_path: &Path) -> Result<String> {
68        let canonical = dump_path
69            .canonicalize()
70            .with_context(|| format!("Failed to canonicalize path: {}", dump_path.display()))?;
71
72        let metadata = fs::metadata(&canonical)
73            .with_context(|| format!("Failed to read metadata: {}", dump_path.display()))?;
74
75        let mtime = metadata
76            .modified()
77            .unwrap_or(SystemTime::UNIX_EPOCH)
78            .duration_since(SystemTime::UNIX_EPOCH)
79            .map(|d| d.as_secs())
80            .unwrap_or(0);
81
82        let key_input = format!("{}:{}:{}", canonical.display(), metadata.len(), mtime);
83
84        let mut hasher = Sha256::new();
85        hasher.update(key_input.as_bytes());
86        let hash = hasher.finalize();
87
88        Ok(hex::encode(&hash[..16])) // Use first 16 bytes for shorter filename
89    }
90
91    /// Get the path where a cached database would be stored
92    pub fn cache_path(&self, cache_key: &str) -> PathBuf {
93        self.cache_dir.join(format!("{}.duckdb", cache_key))
94    }
95
96    /// Check if a valid cache exists for a dump file
97    pub fn has_valid_cache(&self, dump_path: &Path) -> Result<bool> {
98        let cache_key = Self::compute_cache_key(dump_path)?;
99        let cache_path = self.cache_path(&cache_key);
100
101        if !cache_path.exists() {
102            return Ok(false);
103        }
104
105        // Check if cache is newer than dump
106        let dump_mtime = fs::metadata(dump_path)?
107            .modified()
108            .unwrap_or(SystemTime::UNIX_EPOCH);
109        let cache_mtime = fs::metadata(&cache_path)?
110            .modified()
111            .unwrap_or(SystemTime::UNIX_EPOCH);
112
113        Ok(cache_mtime > dump_mtime)
114    }
115
116    /// Get the cache path for a dump file, if a valid cache exists
117    pub fn get_cache(&self, dump_path: &Path) -> Result<Option<PathBuf>> {
118        if self.has_valid_cache(dump_path)? {
119            let cache_key = Self::compute_cache_key(dump_path)?;
120            Ok(Some(self.cache_path(&cache_key)))
121        } else {
122            Ok(None)
123        }
124    }
125
126    /// Create a new cache entry for a dump file
127    pub fn create_cache(
128        &self,
129        dump_path: &Path,
130        table_count: usize,
131        row_count: u64,
132    ) -> Result<PathBuf> {
133        let cache_key = Self::compute_cache_key(dump_path)?;
134        let cache_path = self.cache_path(&cache_key);
135
136        // Update index
137        self.update_index(dump_path, &cache_key, table_count, row_count)?;
138
139        Ok(cache_path)
140    }
141
142    /// Update the cache index
143    fn update_index(
144        &self,
145        dump_path: &Path,
146        cache_key: &str,
147        table_count: usize,
148        row_count: u64,
149    ) -> Result<()> {
150        let mut index = self.load_index()?;
151
152        let metadata = fs::metadata(dump_path)?;
153        let dump_mtime = metadata
154            .modified()
155            .unwrap_or(SystemTime::UNIX_EPOCH)
156            .duration_since(SystemTime::UNIX_EPOCH)
157            .map(|d| d.as_secs())
158            .unwrap_or(0);
159
160        let cache_path = self.cache_path(cache_key);
161        let cache_size = fs::metadata(&cache_path).map(|m| m.len()).unwrap_or(0);
162
163        let entry = CacheEntry {
164            dump_path: dump_path.display().to_string(),
165            cache_key: cache_key.to_string(),
166            dump_size: metadata.len(),
167            dump_mtime,
168            cache_size,
169            created_at: SystemTime::now()
170                .duration_since(SystemTime::UNIX_EPOCH)
171                .map(|d| d.as_secs())
172                .unwrap_or(0),
173            table_count,
174            row_count,
175        };
176
177        // Remove old entry for this dump path
178        index
179            .entries
180            .retain(|e| e.dump_path != dump_path.display().to_string());
181        index.entries.push(entry);
182
183        self.save_index(&index)?;
184        Ok(())
185    }
186
187    /// Load the cache index
188    pub fn load_index(&self) -> Result<CacheIndex> {
189        let index_path = self.cache_dir.join("index.json");
190
191        if !index_path.exists() {
192            return Ok(CacheIndex::default());
193        }
194
195        let content = fs::read_to_string(&index_path).context("Failed to read cache index")?;
196        serde_json::from_str(&content).context("Failed to parse cache index")
197    }
198
199    /// Save the cache index
200    fn save_index(&self, index: &CacheIndex) -> Result<()> {
201        let index_path = self.cache_dir.join("index.json");
202        let content =
203            serde_json::to_string_pretty(index).context("Failed to serialize cache index")?;
204        fs::write(&index_path, content).context("Failed to write cache index")?;
205        Ok(())
206    }
207
208    /// List all cache entries
209    pub fn list_entries(&self) -> Result<Vec<CacheEntry>> {
210        let index = self.load_index()?;
211        Ok(index.entries)
212    }
213
214    /// Remove a specific cache entry
215    pub fn remove_cache(&self, cache_key: &str) -> Result<()> {
216        let cache_path = self.cache_path(cache_key);
217
218        if cache_path.exists() {
219            fs::remove_file(&cache_path).context("Failed to remove cache file")?;
220        }
221
222        // Also remove WAL file if it exists
223        let wal_path = cache_path.with_extension("duckdb.wal");
224        if wal_path.exists() {
225            fs::remove_file(&wal_path)?;
226        }
227
228        // Update index
229        let mut index = self.load_index()?;
230        index.entries.retain(|e| e.cache_key != cache_key);
231        self.save_index(&index)?;
232
233        Ok(())
234    }
235
236    /// Clear all cached databases
237    pub fn clear_all(&self) -> Result<usize> {
238        let entries = self.list_entries()?;
239        let count = entries.len();
240
241        for entry in entries {
242            self.remove_cache(&entry.cache_key)?;
243        }
244
245        Ok(count)
246    }
247
248    /// Get total cache size in bytes
249    pub fn total_size(&self) -> Result<u64> {
250        let entries = self.list_entries()?;
251        Ok(entries.iter().map(|e| e.cache_size).sum())
252    }
253
254    /// Get the cache directory path
255    pub fn cache_dir(&self) -> &Path {
256        &self.cache_dir
257    }
258}
259
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds a cache manager rooted in a fresh temporary directory.
    ///
    /// The `TempDir` is returned so the directory outlives the test body.
    fn setup_test_cache() -> (CacheManager, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_dir(temp_dir.path().to_path_buf()).unwrap();
        (cache_manager, temp_dir)
    }

    /// Writes a small dump file named `name` into `dir` and returns its path.
    fn write_dump(dir: &TempDir, name: &str, content: &str) -> PathBuf {
        let path = dir.path().join(name);
        fs::write(&path, content).unwrap();
        path
    }

    #[test]
    fn test_cache_key_computation() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = write_dump(&temp_dir, "test.sql", "SELECT 1;");

        let key1 = CacheManager::compute_cache_key(&test_file).unwrap();
        let key2 = CacheManager::compute_cache_key(&test_file).unwrap();

        assert_eq!(key1, key2);
        assert_eq!(key1.len(), 32); // 16 bytes hex encoded
    }

    #[test]
    fn test_cache_key_changes_with_content() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = write_dump(&temp_dir, "test.sql", "SELECT 1;");
        let key1 = CacheManager::compute_cache_key(&test_file).unwrap();

        // Modify the file with different size (which is always captured, unlike mtime)
        fs::write(&test_file, "SELECT 2; -- with extra content to change size").unwrap();
        let key2 = CacheManager::compute_cache_key(&test_file).unwrap();

        // Key should be different because size changed
        assert_ne!(key1, key2);
    }

    #[test]
    fn test_cache_key_missing_file_is_error() {
        let temp_dir = TempDir::new().unwrap();
        let missing = temp_dir.path().join("missing.sql");

        assert!(CacheManager::compute_cache_key(&missing).is_err());
    }

    #[test]
    fn test_cache_path() {
        let (cache_manager, _temp_dir) = setup_test_cache();
        let cache_path = cache_manager.cache_path("abc123");
        assert!(cache_path.to_string_lossy().ends_with("abc123.duckdb"));
    }

    #[test]
    fn test_has_valid_cache_when_missing() {
        let (cache_manager, temp_dir) = setup_test_cache();
        let test_file = write_dump(&temp_dir, "test.sql", "SELECT 1;");

        assert!(!cache_manager.has_valid_cache(&test_file).unwrap());
    }

    #[test]
    fn test_get_cache_when_missing() {
        let (cache_manager, temp_dir) = setup_test_cache();
        let test_file = write_dump(&temp_dir, "test.sql", "SELECT 1;");

        assert!(cache_manager.get_cache(&test_file).unwrap().is_none());
    }

    #[test]
    fn test_list_entries_empty() {
        let (cache_manager, _temp_dir) = setup_test_cache();
        let entries = cache_manager.list_entries().unwrap();
        assert!(entries.is_empty());
    }

    #[test]
    fn test_total_size_empty() {
        let (cache_manager, _temp_dir) = setup_test_cache();
        assert_eq!(cache_manager.total_size().unwrap(), 0);
    }

    #[test]
    fn test_create_cache_records_entry() {
        let (cache_manager, temp_dir) = setup_test_cache();
        let dump = write_dump(&temp_dir, "dump.sql", "SELECT 1;");

        let cache_path = cache_manager.create_cache(&dump, 3, 42).unwrap();
        let key = CacheManager::compute_cache_key(&dump).unwrap();
        assert_eq!(cache_path, cache_manager.cache_path(&key));

        let entries = cache_manager.list_entries().unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].cache_key, key);
        assert_eq!(entries[0].dump_size, 9); // "SELECT 1;" is 9 bytes
        assert_eq!(entries[0].table_count, 3);
        assert_eq!(entries[0].row_count, 42);
    }

    #[test]
    fn test_create_cache_replaces_entry_for_same_dump() {
        let (cache_manager, temp_dir) = setup_test_cache();
        let dump = write_dump(&temp_dir, "dump.sql", "SELECT 1;");

        cache_manager.create_cache(&dump, 1, 10).unwrap();
        cache_manager.create_cache(&dump, 2, 20).unwrap();

        let entries = cache_manager.list_entries().unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].table_count, 2);
        assert_eq!(entries[0].row_count, 20);
    }

    #[test]
    fn test_remove_cache_deletes_files_and_entry() {
        let (cache_manager, temp_dir) = setup_test_cache();
        let dump = write_dump(&temp_dir, "dump.sql", "SELECT 1;");

        let cache_path = cache_manager.create_cache(&dump, 1, 1).unwrap();
        let wal_path = cache_path.with_extension("duckdb.wal");
        fs::write(&cache_path, b"db").unwrap();
        fs::write(&wal_path, b"wal").unwrap();

        let key = CacheManager::compute_cache_key(&dump).unwrap();
        cache_manager.remove_cache(&key).unwrap();

        assert!(!cache_path.exists());
        assert!(!wal_path.exists());
        assert!(cache_manager.list_entries().unwrap().is_empty());
    }

    #[test]
    fn test_remove_cache_unknown_key_is_ok() {
        let (cache_manager, _temp_dir) = setup_test_cache();
        cache_manager.remove_cache("does-not-exist").unwrap();
        assert!(cache_manager.list_entries().unwrap().is_empty());
    }

    #[test]
    fn test_clear_all_removes_every_entry() {
        let (cache_manager, temp_dir) = setup_test_cache();
        let first = write_dump(&temp_dir, "first.sql", "SELECT 1;");
        let second = write_dump(&temp_dir, "second.sql", "SELECT 2; SELECT 3;");

        let first_cache = cache_manager.create_cache(&first, 1, 1).unwrap();
        let second_cache = cache_manager.create_cache(&second, 2, 2).unwrap();
        fs::write(&first_cache, b"db").unwrap();

        assert_eq!(cache_manager.clear_all().unwrap(), 2);

        assert!(!first_cache.exists());
        assert!(!second_cache.exists());
        assert!(cache_manager.list_entries().unwrap().is_empty());
        assert_eq!(cache_manager.total_size().unwrap(), 0);
    }

    #[test]
    fn test_clear_all_empty() {
        let (cache_manager, _temp_dir) = setup_test_cache();
        assert_eq!(cache_manager.clear_all().unwrap(), 0);
    }
}
328}