Skip to main content

chub_core/
cache.rs

1use std::fs;
2use std::io::{Read, Write};
3use std::path::{Path, PathBuf};
4
5use serde::{Deserialize, Serialize};
6
7use crate::config::{chub_dir, load_config, SourceConfig};
8use crate::types::{Registry, SearchIndex};
9
/// Cache size ceiling in bytes (100 MiB) applied when the caller supplies no explicit limit.
const DEFAULT_MAX_CACHE_BYTES: u64 = 100 << 20;

/// Content longer than this many bytes (10 KiB) is stored gzip-compressed.
const GZIP_THRESHOLD: usize = 10 << 10;
15
16/// Metadata stored alongside each cached source.
17#[derive(Debug, Clone, Serialize, Deserialize, Default)]
18pub struct SourceMeta {
19    #[serde(rename = "lastUpdated", default)]
20    pub last_updated: Option<u64>,
21    #[serde(rename = "fullBundle", default)]
22    pub full_bundle: bool,
23    #[serde(rename = "bundledSeed", default)]
24    pub bundled_seed: bool,
25}
26
27pub fn get_source_dir(source_name: &str) -> PathBuf {
28    chub_dir().join("sources").join(source_name)
29}
30
31pub fn get_source_data_dir(source_name: &str) -> PathBuf {
32    get_source_dir(source_name).join("data")
33}
34
35pub fn get_source_meta_path(source_name: &str) -> PathBuf {
36    get_source_dir(source_name).join("meta.json")
37}
38
39pub fn get_source_registry_path(source_name: &str) -> PathBuf {
40    get_source_dir(source_name).join("registry.json")
41}
42
43pub fn get_source_search_index_path(source_name: &str) -> PathBuf {
44    get_source_dir(source_name).join("search-index.json")
45}
46
47pub fn read_meta(source_name: &str) -> SourceMeta {
48    let path = get_source_meta_path(source_name);
49    fs::read_to_string(&path)
50        .ok()
51        .and_then(|s| serde_json::from_str(&s).ok())
52        .unwrap_or_default()
53}
54
55pub fn write_meta(source_name: &str, meta: &SourceMeta) {
56    let dir = get_source_dir(source_name);
57    let _ = fs::create_dir_all(&dir);
58    let data = serde_json::to_string_pretty(meta).unwrap_or_default();
59    let _ = crate::util::atomic_write(&get_source_meta_path(source_name), data.as_bytes());
60}
61
62pub fn is_source_cache_fresh(source_name: &str) -> bool {
63    let meta = read_meta(source_name);
64    let last = match meta.last_updated {
65        Some(ts) if ts > 0 => ts,
66        _ => return false,
67    };
68    let config = load_config();
69    let now = std::time::SystemTime::now()
70        .duration_since(std::time::UNIX_EPOCH)
71        .unwrap_or_default()
72        .as_millis() as u64;
73    let age_secs = (now.saturating_sub(last)) / 1000;
74    age_secs < config.refresh_interval
75}
76
77/// Returns true if we should fetch the remote registry for this source.
78/// Inverse of fresh check, but also returns true when no registry exists at all.
79pub fn should_fetch_remote_registry(source_name: &str) -> bool {
80    !is_source_cache_fresh(source_name) || !get_source_registry_path(source_name).exists()
81}
82
/// Current wall-clock time in milliseconds since the Unix epoch.
///
/// Yields `0` if the system clock reports a time before the epoch.
fn now_millis() -> u64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    match since_epoch {
        Ok(elapsed) => elapsed.as_millis() as u64,
        Err(_) => 0,
    }
}
89
90/// Load cached/local registry for a single source.
91pub fn load_source_registry(source: &SourceConfig) -> Option<Registry> {
92    let reg_path = if let Some(ref p) = source.path {
93        PathBuf::from(p).join("registry.json")
94    } else {
95        get_source_registry_path(&source.name)
96    };
97    if !reg_path.exists() {
98        return None;
99    }
100    let data = fs::read_to_string(&reg_path).ok()?;
101    serde_json::from_str(&data).ok()
102}
103
104/// Load BM25 search index for a single source.
105pub fn load_search_index(source: &SourceConfig) -> Option<SearchIndex> {
106    // For local sources, look in the source path
107    if let Some(ref p) = source.path {
108        let index_path = PathBuf::from(p).join("search-index.json");
109        if index_path.exists() {
110            return fs::read_to_string(&index_path)
111                .ok()
112                .and_then(|s| serde_json::from_str(&s).ok());
113        }
114        return None;
115    }
116    // For remote sources, check the per-source search index file
117    let index_path = get_source_search_index_path(&source.name);
118    if !index_path.exists() {
119        return None;
120    }
121    fs::read_to_string(&index_path)
122        .ok()
123        .and_then(|s| serde_json::from_str(&s).ok())
124}
125
126/// Cache stats for display.
127#[derive(Debug, Clone, Serialize)]
128pub struct CacheStats {
129    pub exists: bool,
130    pub sources: Vec<SourceStat>,
131}
132
133#[derive(Debug, Clone, Serialize)]
134#[serde(tag = "type")]
135pub enum SourceStat {
136    #[serde(rename = "local")]
137    Local { name: String, path: String },
138    #[serde(rename = "remote")]
139    Remote {
140        name: String,
141        #[serde(rename = "hasRegistry")]
142        has_registry: bool,
143        #[serde(rename = "lastUpdated")]
144        last_updated: Option<String>,
145        #[serde(rename = "fullBundle")]
146        full_bundle: bool,
147        #[serde(rename = "fileCount")]
148        file_count: usize,
149        #[serde(rename = "dataSize")]
150        data_size: u64,
151    },
152}
153
154pub fn get_cache_stats() -> CacheStats {
155    let chub = chub_dir();
156    if !chub.exists() {
157        return CacheStats {
158            exists: false,
159            sources: vec![],
160        };
161    }
162
163    let config = load_config();
164    let mut sources = Vec::new();
165
166    for source in &config.sources {
167        if let Some(ref p) = source.path {
168            sources.push(SourceStat::Local {
169                name: source.name.clone(),
170                path: p.clone(),
171            });
172            continue;
173        }
174
175        let meta = read_meta(&source.name);
176        let data_dir = get_source_data_dir(&source.name);
177        let (file_count, data_size) = dir_stats(&data_dir);
178
179        let last_updated = meta.last_updated.map(|ts| {
180            // Convert millis to ISO 8601
181            let secs = ts / 1000;
182            let days = secs / 86400;
183            let tod = secs % 86400;
184            let (y, m, d) = crate::util::days_to_date(days);
185            format!(
186                "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}.000Z",
187                y,
188                m,
189                d,
190                tod / 3600,
191                (tod % 3600) / 60,
192                tod % 60
193            )
194        });
195
196        sources.push(SourceStat::Remote {
197            name: source.name.clone(),
198            has_registry: get_source_registry_path(&source.name).exists(),
199            last_updated,
200            full_bundle: meta.full_bundle,
201            file_count,
202            data_size,
203        });
204    }
205
206    CacheStats {
207        exists: true,
208        sources,
209    }
210}
211
212fn dir_stats(dir: &Path) -> (usize, u64) {
213    let mut count = 0usize;
214    let mut size = 0u64;
215    if dir.exists() {
216        for entry in walkdir::WalkDir::new(dir)
217            .into_iter()
218            .filter_map(|e| e.ok())
219        {
220            if entry.file_type().is_file() {
221                count += 1;
222                size += entry.metadata().map(|m| m.len()).unwrap_or(0);
223            }
224        }
225    }
226    (count, size)
227}
228
229/// Clear the cache (preserves config.yaml).
230pub fn clear_cache() {
231    let chub = chub_dir();
232    let config_path = chub.join("config.yaml");
233    let config_content = fs::read_to_string(&config_path).ok();
234
235    let _ = fs::remove_dir_all(&chub);
236
237    if let Some(content) = config_content {
238        let _ = fs::create_dir_all(&chub);
239        let _ = fs::write(&config_path, content);
240    }
241}
242
243/// Save a fetched registry to the source cache.
244pub fn save_source_registry(source_name: &str, data: &str) {
245    let dir = get_source_dir(source_name);
246    let _ = fs::create_dir_all(&dir);
247    let _ = crate::util::atomic_write(&get_source_registry_path(source_name), data.as_bytes());
248}
249
250/// Update the last_updated timestamp for a source.
251pub fn touch_source_meta(source_name: &str) {
252    let mut meta = read_meta(source_name);
253    meta.last_updated = Some(now_millis());
254    write_meta(source_name, &meta);
255}
256
257/// Save a fetched doc to the source data cache.
258/// Content larger than 10 KB is gzip-compressed (saved as `.gz`).
259pub fn save_cached_doc(source_name: &str, doc_path: &str, content: &str) {
260    let base_path = get_source_data_dir(source_name).join(doc_path);
261    if let Some(parent) = base_path.parent() {
262        let _ = fs::create_dir_all(parent);
263    }
264
265    if content.len() > GZIP_THRESHOLD {
266        let gz_path = PathBuf::from(format!("{}.gz", base_path.display()));
267        if let Ok(file) = fs::File::create(&gz_path) {
268            let mut encoder = flate2::write::GzEncoder::new(file, flate2::Compression::fast());
269            let _ = encoder.write_all(content.as_bytes());
270            let _ = encoder.finish();
271            // Remove uncompressed version if it exists
272            let _ = fs::remove_file(&base_path);
273            return;
274        }
275    }
276    let _ = fs::write(&base_path, content);
277}
278
279/// Read a cached doc if it exists (handles both plain and gzip-compressed).
280pub fn read_cached_doc(source_name: &str, doc_path: &str) -> Option<String> {
281    let base_path = get_source_data_dir(source_name).join(doc_path);
282
283    // Check for gzip-compressed version first
284    let gz_path = PathBuf::from(format!("{}.gz", base_path.display()));
285    if gz_path.exists() {
286        if let Ok(file) = fs::File::open(&gz_path) {
287            let mut decoder = flate2::read::GzDecoder::new(file);
288            let mut content = String::new();
289            if decoder.read_to_string(&mut content).is_ok() {
290                return Some(content);
291            }
292        }
293    }
294
295    // Fall back to plain file
296    fs::read_to_string(&base_path).ok()
297}
298
299/// Evict cached data from the oldest sources until total cache size is under the limit.
300/// Returns the number of bytes freed.
301pub fn evict_lru_cache(max_bytes: Option<u64>) -> u64 {
302    let max = max_bytes.unwrap_or(DEFAULT_MAX_CACHE_BYTES);
303    let config = load_config();
304    let chub = chub_dir();
305
306    if !chub.exists() {
307        return 0;
308    }
309
310    // Collect (source_name, data_size, last_updated) for remote sources
311    let mut source_stats: Vec<(String, u64, u64)> = Vec::new();
312    let mut total_size: u64 = 0;
313
314    for source in &config.sources {
315        if source.path.is_some() {
316            continue;
317        }
318        let data_dir = get_source_data_dir(&source.name);
319        let (_, size) = dir_stats(&data_dir);
320        let meta = read_meta(&source.name);
321        let last = meta.last_updated.unwrap_or(0);
322        total_size += size;
323        source_stats.push((source.name.clone(), size, last));
324    }
325
326    if total_size <= max {
327        return 0;
328    }
329
330    // Sort by last_updated ascending (oldest first)
331    source_stats.sort_by_key(|s| s.2);
332
333    let mut freed: u64 = 0;
334    for (name, size, _) in &source_stats {
335        if total_size - freed <= max {
336            break;
337        }
338        let data_dir = get_source_data_dir(name);
339        if data_dir.exists() {
340            let _ = fs::remove_dir_all(&data_dir);
341            freed += size;
342        }
343    }
344
345    freed
346}
347
348/// Check if any source has a registry available.
349pub fn has_any_registry() -> bool {
350    let config = load_config();
351    for source in &config.sources {
352        if let Some(ref p) = source.path {
353            if PathBuf::from(p).join("registry.json").exists() {
354                return true;
355            }
356        } else if get_source_registry_path(&source.name).exists() {
357            return true;
358        }
359    }
360    false
361}