Skip to main content

chub_core/
cache.rs

1use std::fs;
2use std::io::{Read, Write};
3use std::path::{Path, PathBuf};
4
5use serde::{Deserialize, Serialize};
6
7use crate::config::{chub_dir, load_config, SourceConfig};
8use crate::types::{Registry, SearchIndex};
9
/// Cache size ceiling, in bytes, applied when the caller supplies no explicit limit (100 MiB).
const DEFAULT_MAX_CACHE_BYTES: u64 = 100 * (1 << 20);

/// Docs whose content is longer than this many bytes are stored gzip-compressed (10 KiB).
const GZIP_THRESHOLD: usize = 10 * (1 << 10);
15
16/// Metadata stored alongside each cached source.
17#[derive(Debug, Clone, Serialize, Deserialize, Default)]
18pub struct SourceMeta {
19    #[serde(rename = "lastUpdated", default)]
20    pub last_updated: Option<u64>,
21    #[serde(rename = "fullBundle", default)]
22    pub full_bundle: bool,
23    #[serde(rename = "bundledSeed", default)]
24    pub bundled_seed: bool,
25}
26
27pub fn get_source_dir(source_name: &str) -> PathBuf {
28    chub_dir().join("sources").join(source_name)
29}
30
31pub fn get_source_data_dir(source_name: &str) -> PathBuf {
32    get_source_dir(source_name).join("data")
33}
34
35pub fn get_source_meta_path(source_name: &str) -> PathBuf {
36    get_source_dir(source_name).join("meta.json")
37}
38
39pub fn get_source_registry_path(source_name: &str) -> PathBuf {
40    get_source_dir(source_name).join("registry.json")
41}
42
43pub fn get_source_search_index_path(source_name: &str) -> PathBuf {
44    get_source_dir(source_name).join("search-index.json")
45}
46
47pub fn read_meta(source_name: &str) -> SourceMeta {
48    let path = get_source_meta_path(source_name);
49    fs::read_to_string(&path)
50        .ok()
51        .and_then(|s| serde_json::from_str(&s).ok())
52        .unwrap_or_default()
53}
54
55pub fn write_meta(source_name: &str, meta: &SourceMeta) {
56    let dir = get_source_dir(source_name);
57    let _ = fs::create_dir_all(&dir);
58    let _ = fs::write(
59        get_source_meta_path(source_name),
60        serde_json::to_string_pretty(meta).unwrap_or_default(),
61    );
62}
63
64pub fn is_source_cache_fresh(source_name: &str) -> bool {
65    let meta = read_meta(source_name);
66    let last = match meta.last_updated {
67        Some(ts) if ts > 0 => ts,
68        _ => return false,
69    };
70    let config = load_config();
71    let now = std::time::SystemTime::now()
72        .duration_since(std::time::UNIX_EPOCH)
73        .unwrap_or_default()
74        .as_millis() as u64;
75    let age_secs = (now.saturating_sub(last)) / 1000;
76    age_secs < config.refresh_interval
77}
78
79/// Returns true if we should fetch the remote registry for this source.
80/// Inverse of fresh check, but also returns true when no registry exists at all.
81pub fn should_fetch_remote_registry(source_name: &str) -> bool {
82    !is_source_cache_fresh(source_name) || !get_source_registry_path(source_name).exists()
83}
84
/// Current wall-clock time in milliseconds since the Unix epoch.
///
/// Yields 0 if the system clock reports a time before the epoch.
fn now_millis() -> u64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis() as u64,
        Err(_) => 0,
    }
}
91
92/// Load cached/local registry for a single source.
93pub fn load_source_registry(source: &SourceConfig) -> Option<Registry> {
94    let reg_path = if let Some(ref p) = source.path {
95        PathBuf::from(p).join("registry.json")
96    } else {
97        get_source_registry_path(&source.name)
98    };
99    if !reg_path.exists() {
100        return None;
101    }
102    let data = fs::read_to_string(&reg_path).ok()?;
103    serde_json::from_str(&data).ok()
104}
105
106/// Load BM25 search index for a single source.
107pub fn load_search_index(source: &SourceConfig) -> Option<SearchIndex> {
108    // For local sources, look in the source path
109    if let Some(ref p) = source.path {
110        let index_path = PathBuf::from(p).join("search-index.json");
111        if index_path.exists() {
112            return fs::read_to_string(&index_path)
113                .ok()
114                .and_then(|s| serde_json::from_str(&s).ok());
115        }
116        return None;
117    }
118    // For remote sources, check the per-source search index file
119    let index_path = get_source_search_index_path(&source.name);
120    if !index_path.exists() {
121        return None;
122    }
123    fs::read_to_string(&index_path)
124        .ok()
125        .and_then(|s| serde_json::from_str(&s).ok())
126}
127
128/// Cache stats for display.
129#[derive(Debug, Clone, Serialize)]
130pub struct CacheStats {
131    pub exists: bool,
132    pub sources: Vec<SourceStat>,
133}
134
135#[derive(Debug, Clone, Serialize)]
136#[serde(tag = "type")]
137pub enum SourceStat {
138    #[serde(rename = "local")]
139    Local { name: String, path: String },
140    #[serde(rename = "remote")]
141    Remote {
142        name: String,
143        #[serde(rename = "hasRegistry")]
144        has_registry: bool,
145        #[serde(rename = "lastUpdated")]
146        last_updated: Option<String>,
147        #[serde(rename = "fullBundle")]
148        full_bundle: bool,
149        #[serde(rename = "fileCount")]
150        file_count: usize,
151        #[serde(rename = "dataSize")]
152        data_size: u64,
153    },
154}
155
156pub fn get_cache_stats() -> CacheStats {
157    let chub = chub_dir();
158    if !chub.exists() {
159        return CacheStats {
160            exists: false,
161            sources: vec![],
162        };
163    }
164
165    let config = load_config();
166    let mut sources = Vec::new();
167
168    for source in &config.sources {
169        if let Some(ref p) = source.path {
170            sources.push(SourceStat::Local {
171                name: source.name.clone(),
172                path: p.clone(),
173            });
174            continue;
175        }
176
177        let meta = read_meta(&source.name);
178        let data_dir = get_source_data_dir(&source.name);
179        let (file_count, data_size) = dir_stats(&data_dir);
180
181        let last_updated = meta.last_updated.map(|ts| {
182            // Convert millis to ISO 8601
183            let secs = ts / 1000;
184            let days = secs / 86400;
185            let tod = secs % 86400;
186            let (y, m, d) = crate::build::builder::days_to_date(days);
187            format!(
188                "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}.000Z",
189                y,
190                m,
191                d,
192                tod / 3600,
193                (tod % 3600) / 60,
194                tod % 60
195            )
196        });
197
198        sources.push(SourceStat::Remote {
199            name: source.name.clone(),
200            has_registry: get_source_registry_path(&source.name).exists(),
201            last_updated,
202            full_bundle: meta.full_bundle,
203            file_count,
204            data_size,
205        });
206    }
207
208    CacheStats {
209        exists: true,
210        sources,
211    }
212}
213
214fn dir_stats(dir: &Path) -> (usize, u64) {
215    let mut count = 0usize;
216    let mut size = 0u64;
217    if dir.exists() {
218        for entry in walkdir::WalkDir::new(dir)
219            .into_iter()
220            .filter_map(|e| e.ok())
221        {
222            if entry.file_type().is_file() {
223                count += 1;
224                size += entry.metadata().map(|m| m.len()).unwrap_or(0);
225            }
226        }
227    }
228    (count, size)
229}
230
231/// Clear the cache (preserves config.yaml).
232pub fn clear_cache() {
233    let chub = chub_dir();
234    let config_path = chub.join("config.yaml");
235    let config_content = fs::read_to_string(&config_path).ok();
236
237    let _ = fs::remove_dir_all(&chub);
238
239    if let Some(content) = config_content {
240        let _ = fs::create_dir_all(&chub);
241        let _ = fs::write(&config_path, content);
242    }
243}
244
245/// Save a fetched registry to the source cache.
246pub fn save_source_registry(source_name: &str, data: &str) {
247    let dir = get_source_dir(source_name);
248    let _ = fs::create_dir_all(&dir);
249    let _ = fs::write(get_source_registry_path(source_name), data);
250}
251
252/// Update the last_updated timestamp for a source.
253pub fn touch_source_meta(source_name: &str) {
254    let mut meta = read_meta(source_name);
255    meta.last_updated = Some(now_millis());
256    write_meta(source_name, &meta);
257}
258
259/// Save a fetched doc to the source data cache.
260/// Content larger than 10 KB is gzip-compressed (saved as `.gz`).
261pub fn save_cached_doc(source_name: &str, doc_path: &str, content: &str) {
262    let base_path = get_source_data_dir(source_name).join(doc_path);
263    if let Some(parent) = base_path.parent() {
264        let _ = fs::create_dir_all(parent);
265    }
266
267    if content.len() > GZIP_THRESHOLD {
268        let gz_path = PathBuf::from(format!("{}.gz", base_path.display()));
269        if let Ok(file) = fs::File::create(&gz_path) {
270            let mut encoder = flate2::write::GzEncoder::new(file, flate2::Compression::fast());
271            let _ = encoder.write_all(content.as_bytes());
272            let _ = encoder.finish();
273            // Remove uncompressed version if it exists
274            let _ = fs::remove_file(&base_path);
275            return;
276        }
277    }
278    let _ = fs::write(&base_path, content);
279}
280
281/// Read a cached doc if it exists (handles both plain and gzip-compressed).
282pub fn read_cached_doc(source_name: &str, doc_path: &str) -> Option<String> {
283    let base_path = get_source_data_dir(source_name).join(doc_path);
284
285    // Check for gzip-compressed version first
286    let gz_path = PathBuf::from(format!("{}.gz", base_path.display()));
287    if gz_path.exists() {
288        if let Ok(file) = fs::File::open(&gz_path) {
289            let mut decoder = flate2::read::GzDecoder::new(file);
290            let mut content = String::new();
291            if decoder.read_to_string(&mut content).is_ok() {
292                return Some(content);
293            }
294        }
295    }
296
297    // Fall back to plain file
298    fs::read_to_string(&base_path).ok()
299}
300
301/// Evict cached data from the oldest sources until total cache size is under the limit.
302/// Returns the number of bytes freed.
303pub fn evict_lru_cache(max_bytes: Option<u64>) -> u64 {
304    let max = max_bytes.unwrap_or(DEFAULT_MAX_CACHE_BYTES);
305    let config = load_config();
306    let chub = chub_dir();
307
308    if !chub.exists() {
309        return 0;
310    }
311
312    // Collect (source_name, data_size, last_updated) for remote sources
313    let mut source_stats: Vec<(String, u64, u64)> = Vec::new();
314    let mut total_size: u64 = 0;
315
316    for source in &config.sources {
317        if source.path.is_some() {
318            continue;
319        }
320        let data_dir = get_source_data_dir(&source.name);
321        let (_, size) = dir_stats(&data_dir);
322        let meta = read_meta(&source.name);
323        let last = meta.last_updated.unwrap_or(0);
324        total_size += size;
325        source_stats.push((source.name.clone(), size, last));
326    }
327
328    if total_size <= max {
329        return 0;
330    }
331
332    // Sort by last_updated ascending (oldest first)
333    source_stats.sort_by_key(|s| s.2);
334
335    let mut freed: u64 = 0;
336    for (name, size, _) in &source_stats {
337        if total_size - freed <= max {
338            break;
339        }
340        let data_dir = get_source_data_dir(name);
341        if data_dir.exists() {
342            let _ = fs::remove_dir_all(&data_dir);
343            freed += size;
344        }
345    }
346
347    freed
348}
349
350/// Check if any source has a registry available.
351pub fn has_any_registry() -> bool {
352    let config = load_config();
353    for source in &config.sources {
354        if let Some(ref p) = source.path {
355            if PathBuf::from(p).join("registry.json").exists() {
356                return true;
357            }
358        } else if get_source_registry_path(&source.name).exists() {
359            return true;
360        }
361    }
362    false
363}