// scirs2_datasets/cache.rs

1//! Dataset caching functionality
2
3use crate::error::{DatasetsError, Result};
4use scirs2_core::cache::{CacheBuilder, TTLSizedCache};
5use std::cell::RefCell;
6use std::fs::{self, File};
7use std::hash::{Hash, Hasher};
8use std::io::{Read, Write};
9use std::path::{Path, PathBuf};
10
/// The base directory name for caching datasets
const CACHE_DIR_NAME: &str = "scirs2-datasets";

/// Default size for the in-memory cache (passed to `CacheBuilder::with_size`)
const DEFAULT_CACHE_SIZE: usize = 100;

/// Default TTL for in-memory cache entries (in seconds)
const DEFAULT_CACHE_TTL: u64 = 3600; // 1 hour

/// Default maximum cache size on disk (in bytes) - 500 MB
const DEFAULT_MAX_CACHE_SIZE: u64 = 500 * 1024 * 1024;

/// Environment variable that overrides the cache directory location
const CACHE_DIR_ENV: &str = "SCIRS2_CACHE_DIR";
25
26/// Compute SHA256 hash of a file
27#[allow(dead_code)]
28pub fn sha256_hash_file(path: &Path) -> std::result::Result<String, String> {
29    use sha2::{Digest, Sha256};
30
31    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
32    let mut hasher = Sha256::new();
33    let mut buffer = [0; 8192];
34
35    loop {
36        let bytes_read = file
37            .read(&mut buffer)
38            .map_err(|e| format!("Failed to read file: {e}"))?;
39        if bytes_read == 0 {
40            break;
41        }
42        hasher.update(&buffer[..bytes_read]);
43    }
44
45    Ok(format!("{:x}", hasher.finalize()))
46}
47
/// Registry entry for dataset files
///
/// Associates a downloadable file with its expected checksum; consumed by
/// `fetch_data` to locate and verify downloads.
pub struct RegistryEntry {
    /// SHA256 hash of the file (an empty string disables verification)
    pub sha256: &'static str,
    /// URL to download the file from
    pub url: &'static str,
}
55
56/// Get the platform-specific cache directory for downloading and storing datasets
57///
58/// The cache directory is determined in the following order:
59/// 1. Environment variable `SCIRS2_CACHE_DIR` if set
60/// 2. Platform-specific cache directory:
61///    - Windows: `%LOCALAPPDATA%\scirs2-datasets`
62///    - macOS: `~/Library/Caches/scirs2-datasets`
63///    - Linux/Unix: `~/.cache/scirs2-datasets` (respects XDG_CACHE_HOME)
64/// 3. Fallback to `~/.scirs2-datasets` if platform-specific directory fails
65#[allow(dead_code)]
66pub fn get_cachedir() -> Result<PathBuf> {
67    // Check environment variable first
68    if let Ok(cachedir) = std::env::var(CACHE_DIR_ENV) {
69        let cachepath = PathBuf::from(cachedir);
70        ensuredirectory_exists(&cachepath)?;
71        return Ok(cachepath);
72    }
73
74    // Try platform-specific cache directory
75    if let Some(cachedir) = get_platform_cachedir() {
76        ensuredirectory_exists(&cachedir)?;
77        return Ok(cachedir);
78    }
79
80    // Fallback to home directory
81    let homedir = dirs::home_dir()
82        .ok_or_else(|| DatasetsError::CacheError("Could not find home directory".to_string()))?;
83    let cachedir = homedir.join(format!(".{CACHE_DIR_NAME}"));
84    ensuredirectory_exists(&cachedir)?;
85
86    Ok(cachedir)
87}
88
89/// Get platform-specific cache directory
90#[allow(dead_code)]
91fn get_platform_cachedir() -> Option<PathBuf> {
92    #[cfg(target_os = "windows")]
93    {
94        dirs::data_local_dir().map(|dir| dir.join(CACHE_DIR_NAME))
95    }
96    #[cfg(target_os = "macos")]
97    {
98        dirs::home_dir().map(|dir| dir.join("Library").join("Caches").join(CACHE_DIR_NAME))
99    }
100    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
101    {
102        // Linux/Unix: Use XDG cache directory
103        if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") {
104            Some(PathBuf::from(xdg_cache).join(CACHE_DIR_NAME))
105        } else {
106            dirs::home_dir().map(|home| home.join(".cache").join(CACHE_DIR_NAME))
107        }
108    }
109}
110
111/// Ensure a directory exists, creating it if necessary
112#[allow(dead_code)]
113fn ensuredirectory_exists(dir: &Path) -> Result<()> {
114    if !dir.exists() {
115        fs::create_dir_all(dir).map_err(|e| {
116            DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
117        })?;
118    }
119    Ok(())
120}
121
/// Fetch a dataset file from either cache or download it from the URL
///
/// This function will:
/// 1. Check if the file exists in the cache directory
/// 2. If not, download it from the URL in the registry entry
/// 3. Verify the SHA256 checksum when the registry entry provides one
/// 4. Store the file in the cache directory
/// 5. Return the path to the cached file
///
/// # Arguments
///
/// * `filename` - The name of the file to fetch
/// * `registry_entry` - Optional registry entry containing URL and SHA256 hash
///
/// # Returns
///
/// * `Ok(PathBuf)` - Path to the cached file
/// * `Err(String)` - Error message if fetching fails
#[cfg(feature = "download-sync")]
#[allow(dead_code)]
pub fn fetch_data(
    filename: &str,
    registry_entry: Option<&RegistryEntry>,
) -> std::result::Result<PathBuf, String> {
    // Get the cache directory
    let cachedir = get_cachedir().map_err(|e| format!("Failed to get cache directory: {e}"))?;

    // Fast path: the file is already cached on disk.
    let cachepath = cachedir.join(filename);
    if cachepath.exists() {
        return Ok(cachepath);
    }

    // Not cached: a registry entry is required to know where to download from.
    let entry = registry_entry.ok_or_else(|| format!("No registry entry found for {filename}"))?;

    // Create a temporary file to download to
    let tempdir = tempfile::tempdir().map_err(|e| format!("Failed to create temp dir: {e}"))?;
    let temp_file = tempdir.path().join(filename);

    // Download the file
    let response = ureq::get(entry.url)
        .call()
        .map_err(|e| format!("Failed to download {filename}: {e}"))?;

    // Read body into memory (ureq 3.x: use into_body which implements Read)
    let mut body = response.into_body();
    let bytes = body
        .read_to_vec()
        .map_err(|e| format!("Failed to read response body: {e}"))?;
    let mut file = std::fs::File::create(&temp_file)
        .map_err(|e| format!("Failed to create temp file: {e}"))?;
    file.write_all(&bytes)
        .map_err(|e| format!("Failed to write downloaded file: {e}"))?;

    // Verify the SHA256 hash of the downloaded file if provided
    if !entry.sha256.is_empty() {
        let computed_hash = sha256_hash_file(&temp_file)?;
        if computed_hash != entry.sha256 {
            return Err(format!(
                "SHA256 hash mismatch for {filename}: expected {}, got {computed_hash}",
                entry.sha256
            ));
        }
    }

    // Move the file into the cache. Creating cachepath's parent also creates
    // cachedir itself, so no separate create_dir_all(&cachedir) is needed.
    if let Some(parent) = cachepath.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    }

    fs::copy(&temp_file, &cachepath).map_err(|e| format!("Failed to copy to cache: {e}"))?;

    Ok(cachepath)
}
203
204/// Stub for fetch_data when download-sync feature is disabled
205#[cfg(not(feature = "download-sync"))]
206#[allow(dead_code)]
207pub fn fetch_data(
208    _filename: &str,
209    _registry_entry: Option<&RegistryEntry>,
210) -> std::result::Result<PathBuf, String> {
211    Err("Synchronous download feature is disabled. Enable 'download-sync' feature.".to_string())
212}
213
/// Cache key for dataset caching with configuration-aware hashing
///
/// Combines the dataset name with a hash of the loading configuration so the
/// same dataset loaded under different options maps to distinct cache entries.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct CacheKey {
    // Dataset name (first half of the on-disk key)
    name: String,
    // Hex-encoded hash of the configuration fields; see `CacheKey::new`
    config_hash: String,
}
220
221impl CacheKey {
222    /// Create a new cache key from dataset name and configuration
223    pub fn new(name: &str, config: &crate::real_world::RealWorldConfig) -> Self {
224        use std::collections::hash_map::DefaultHasher;
225        use std::hash::{Hash, Hasher};
226
227        let mut hasher = DefaultHasher::new();
228        config.use_cache.hash(&mut hasher);
229        config.download_if_missing.hash(&mut hasher);
230        config.return_preprocessed.hash(&mut hasher);
231        config.subset.hash(&mut hasher);
232        config.random_state.hash(&mut hasher);
233
234        Self {
235            name: name.to_string(),
236            config_hash: format!("{:x}", hasher.finish()),
237        }
238    }
239
240    /// Get the cache key as a string
241    pub fn as_string(&self) -> String {
242        format!("{}_{}", self.name, self.config_hash)
243    }
244}
245
/// File name wrapper used as the key type of the in-memory cache
///
/// `Hash` is derived: for a single-field tuple struct the derived impl hashes
/// the inner `String` directly, which is exactly what the previous manual
/// `impl Hash` did — so behavior is unchanged and the boilerplate is gone.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct FileCacheKey(String);
255
/// Manages caching of downloaded datasets, using both file-based and in-memory caching
///
/// This implementation uses scirs2-core::cache's TTLSizedCache for in-memory caching,
/// while maintaining the file-based persistence for long-term storage.
///
/// NOTE(review): the in-memory cache is held in a `RefCell`, so this type is
/// not `Sync`; confirm callers only use it from a single thread.
pub struct DatasetCache {
    /// Directory for file-based caching
    cachedir: PathBuf,
    /// In-memory cache for frequently accessed datasets
    mem_cache: RefCell<TTLSizedCache<FileCacheKey, Vec<u8>>>,
    /// Maximum cache size in bytes (0 means unlimited)
    max_cache_size: u64,
    /// Whether to operate in offline mode (no downloads)
    offline_mode: bool,
}
270
271impl Default for DatasetCache {
272    fn default() -> Self {
273        let cachedir = get_cachedir().expect("Could not get cache directory");
274
275        let mem_cache = RefCell::new(
276            CacheBuilder::new()
277                .with_size(DEFAULT_CACHE_SIZE)
278                .with_ttl(DEFAULT_CACHE_TTL)
279                .build_sized_cache(),
280        );
281
282        // Check if offline mode is enabled via environment variable
283        let offline_mode = std::env::var("SCIRS2_OFFLINE")
284            .map(|v| v.to_lowercase() == "true" || v == "1")
285            .unwrap_or(false);
286
287        DatasetCache {
288            cachedir,
289            mem_cache,
290            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
291            offline_mode,
292        }
293    }
294}
295
296impl DatasetCache {
297    /// Create a new dataset cache with the given cache directory and default memory cache
298    pub fn new(cachedir: PathBuf) -> Self {
299        let mem_cache = RefCell::new(
300            CacheBuilder::new()
301                .with_size(DEFAULT_CACHE_SIZE)
302                .with_ttl(DEFAULT_CACHE_TTL)
303                .build_sized_cache(),
304        );
305
306        let offline_mode = std::env::var("SCIRS2_OFFLINE")
307            .map(|v| v.to_lowercase() == "true" || v == "1")
308            .unwrap_or(false);
309
310        DatasetCache {
311            cachedir,
312            mem_cache,
313            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
314            offline_mode,
315        }
316    }
317
318    /// Create a new dataset cache with custom settings
319    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
320        let mem_cache = RefCell::new(
321            CacheBuilder::new()
322                .with_size(cache_size)
323                .with_ttl(ttl_seconds)
324                .build_sized_cache(),
325        );
326
327        let offline_mode = std::env::var("SCIRS2_OFFLINE")
328            .map(|v| v.to_lowercase() == "true" || v == "1")
329            .unwrap_or(false);
330
331        DatasetCache {
332            cachedir,
333            mem_cache,
334            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
335            offline_mode,
336        }
337    }
338
339    /// Create a new dataset cache with comprehensive configuration
340    pub fn with_full_config(
341        cachedir: PathBuf,
342        cache_size: usize,
343        ttl_seconds: u64,
344        max_cache_size: u64,
345        offline_mode: bool,
346    ) -> Self {
347        let mem_cache = RefCell::new(
348            CacheBuilder::new()
349                .with_size(cache_size)
350                .with_ttl(ttl_seconds)
351                .build_sized_cache(),
352        );
353
354        DatasetCache {
355            cachedir,
356            mem_cache,
357            max_cache_size,
358            offline_mode,
359        }
360    }
361
362    /// Create the cache directory if it doesn't exist
363    pub fn ensure_cachedir(&self) -> Result<()> {
364        if !self.cachedir.exists() {
365            fs::create_dir_all(&self.cachedir).map_err(|e| {
366                DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
367            })?;
368        }
369        Ok(())
370    }
371
372    /// Get the path to a cached file
373    pub fn get_cachedpath(&self, name: &str) -> PathBuf {
374        self.cachedir.join(name)
375    }
376
377    /// Check if a file is already cached (either in memory or on disk)
378    pub fn is_cached(&self, name: &str) -> bool {
379        // Check memory cache first
380        let key = FileCacheKey(name.to_string());
381        if self.mem_cache.borrow_mut().get(&key).is_some() {
382            return true;
383        }
384
385        // Then check file system
386        self.get_cachedpath(name).exists()
387    }
388
389    /// Read a cached file as bytes
390    ///
391    /// This method checks the in-memory cache first, and falls back to the file system if needed.
392    /// When reading from the file system, the result is also stored in the in-memory cache.
393    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
394        // Try memory cache first
395        let key = FileCacheKey(name.to_string());
396        if let Some(data) = self.mem_cache.borrow_mut().get(&key) {
397            return Ok(data);
398        }
399
400        // Fall back to file system cache
401        let path = self.get_cachedpath(name);
402        if !path.exists() {
403            return Err(DatasetsError::CacheError(format!(
404                "Cached file does not exist: {name}"
405            )));
406        }
407
408        let mut file = File::open(path)
409            .map_err(|e| DatasetsError::CacheError(format!("Failed to open cached file: {e}")))?;
410
411        let mut buffer = Vec::new();
412        file.read_to_end(&mut buffer)
413            .map_err(|e| DatasetsError::CacheError(format!("Failed to read cached file: {e}")))?;
414
415        // Update memory cache
416        self.mem_cache.borrow_mut().insert(key, buffer.clone());
417
418        Ok(buffer)
419    }
420
421    /// Write data to both the file cache and memory cache
422    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
423        self.ensure_cachedir()?;
424
425        // Check if writing this file would exceed cache size limit
426        if self.max_cache_size > 0 {
427            let current_size = self.get_cache_size_bytes()?;
428            let new_file_size = data.len() as u64;
429
430            if current_size + new_file_size > self.max_cache_size {
431                self.cleanup_cache_to_fit(new_file_size)?;
432            }
433        }
434
435        // Write to file system cache
436        let path = self.get_cachedpath(name);
437        let mut file = File::create(path)
438            .map_err(|e| DatasetsError::CacheError(format!("Failed to create cache file: {e}")))?;
439
440        file.write_all(data).map_err(|e| {
441            DatasetsError::CacheError(format!("Failed to write to cache file: {e}"))
442        })?;
443
444        // Update memory cache
445        let key = FileCacheKey(name.to_string());
446        self.mem_cache.borrow_mut().insert(key, data.to_vec());
447
448        Ok(())
449    }
450
451    /// Clear the entire cache (both memory and file-based)
452    pub fn clear_cache(&self) -> Result<()> {
453        // Clear file system cache
454        if self.cachedir.exists() {
455            fs::remove_dir_all(&self.cachedir)
456                .map_err(|e| DatasetsError::CacheError(format!("Failed to clear cache: {e}")))?;
457        }
458
459        // Clear memory cache
460        self.mem_cache.borrow_mut().clear();
461
462        Ok(())
463    }
464
465    /// Remove a specific cached file (from both memory and file system)
466    pub fn remove_cached(&self, name: &str) -> Result<()> {
467        // Remove from file system
468        let path = self.get_cachedpath(name);
469        if path.exists() {
470            fs::remove_file(path).map_err(|e| {
471                DatasetsError::CacheError(format!("Failed to remove cached file: {e}"))
472            })?;
473        }
474
475        // Remove from memory cache
476        let key = FileCacheKey(name.to_string());
477        self.mem_cache.borrow_mut().remove(&key);
478
479        Ok(())
480    }
481
482    /// Compute a hash for a filename or URL
483    pub fn hash_filename(name: &str) -> String {
484        let hash = blake3::hash(name.as_bytes());
485        hash.to_hex().to_string()
486    }
487
488    /// Get the total size of the cache in bytes
489    pub fn get_cache_size_bytes(&self) -> Result<u64> {
490        let mut total_size = 0u64;
491
492        if self.cachedir.exists() {
493            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
494                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
495            })?;
496
497            for entry in entries {
498                let entry = entry.map_err(|e| {
499                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
500                })?;
501
502                if let Ok(metadata) = entry.metadata() {
503                    if metadata.is_file() {
504                        total_size += metadata.len();
505                    }
506                }
507            }
508        }
509
510        Ok(total_size)
511    }
512
513    /// Clean up cache to fit a new file of specified size
514    ///
515    /// This method removes the oldest files first until there's enough space
516    /// for the new file plus some buffer space.
517    fn cleanup_cache_to_fit(&self, needed_size: u64) -> Result<()> {
518        if self.max_cache_size == 0 {
519            return Ok(()); // No _size limit
520        }
521
522        let current_size = self.get_cache_size_bytes()?;
523        let target_size = (self.max_cache_size as f64 * 0.8) as u64; // Leave 20% buffer
524        let total_needed = current_size + needed_size;
525
526        if total_needed <= target_size {
527            return Ok(()); // No cleanup needed
528        }
529
530        let size_to_free = total_needed - target_size;
531
532        // Get all files with their modification times
533        let mut files_with_times = Vec::new();
534
535        if self.cachedir.exists() {
536            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
537                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
538            })?;
539
540            for entry in entries {
541                let entry = entry.map_err(|e| {
542                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
543                })?;
544
545                if let Ok(metadata) = entry.metadata() {
546                    if metadata.is_file() {
547                        if let Ok(modified) = metadata.modified() {
548                            files_with_times.push((entry.path(), metadata.len(), modified));
549                        }
550                    }
551                }
552            }
553        }
554
555        // Sort by modification time (oldest first)
556        files_with_times.sort_by_key(|(_path, _size, modified)| *modified);
557
558        // Remove files until we've freed enough space
559        let mut freed_size = 0u64;
560        for (path, size, _modified) in files_with_times {
561            if freed_size >= size_to_free {
562                break;
563            }
564
565            // Remove from memory cache first
566            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
567                let key = FileCacheKey(filename.to_string());
568                self.mem_cache.borrow_mut().remove(&key);
569            }
570
571            // Remove file
572            if let Err(e) = fs::remove_file(&path) {
573                eprintln!("Warning: Failed to remove cache file {path:?}: {e}");
574            } else {
575                freed_size += size;
576            }
577        }
578
579        Ok(())
580    }
581
582    /// Set offline mode
583    pub fn set_offline_mode(&mut self, offline: bool) {
584        self.offline_mode = offline;
585    }
586
587    /// Check if cache is in offline mode
588    pub fn is_offline(&self) -> bool {
589        self.offline_mode
590    }
591
592    /// Set maximum cache size in bytes (0 for unlimited)
593    pub fn set_max_cache_size(&mut self, max_size: u64) {
594        self.max_cache_size = max_size;
595    }
596
597    /// Get maximum cache size in bytes
598    pub fn max_cache_size(&self) -> u64 {
599        self.max_cache_size
600    }
601
602    /// Put data into the cache (alias for write_cached)
603    pub fn put(&self, name: &str, data: &[u8]) -> Result<()> {
604        self.write_cached(name, data)
605    }
606
607    /// Get detailed cache information
608    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
609        let mut total_size = 0u64;
610        let mut file_count = 0usize;
611        let mut files = Vec::new();
612
613        if self.cachedir.exists() {
614            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
615                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
616            })?;
617
618            for entry in entries {
619                let entry = entry.map_err(|e| {
620                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
621                })?;
622
623                if let Ok(metadata) = entry.metadata() {
624                    if metadata.is_file() {
625                        let size = metadata.len();
626                        total_size += size;
627                        file_count += 1;
628
629                        if let Some(filename) = entry.file_name().to_str() {
630                            files.push(CacheFileInfo {
631                                name: filename.to_string(),
632                                size_bytes: size,
633                                modified: metadata.modified().ok(),
634                            });
635                        }
636                    }
637                }
638            }
639        }
640
641        // Sort files by size (largest first)
642        files.sort_by(|a, b| b.size_bytes.cmp(&a.size_bytes));
643
644        Ok(DetailedCacheStats {
645            total_size_bytes: total_size,
646            file_count,
647            cachedir: self.cachedir.clone(),
648            max_cache_size: self.max_cache_size,
649            offline_mode: self.offline_mode,
650            files,
651        })
652    }
653}
654
/// Downloads data from a URL and returns it as bytes, using the cache when possible
///
/// The cache key is a BLAKE3 hash of the URL; unless `force_download` is set,
/// a previously cached copy is returned without touching the network.
/// (The parameter is renamed from `_url` to `url`: it is used throughout, so
/// the underscore "intentionally unused" prefix was misleading.)
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn download_data(url: &str, force_download: bool) -> Result<Vec<u8>> {
    let cache = DatasetCache::default();
    let cache_key = DatasetCache::hash_filename(url);

    // Serve from cache unless the caller explicitly forces a re-download
    if !force_download && cache.is_cached(&cache_key) {
        return cache.read_cached(&cache_key);
    }

    // Download the data
    let response = reqwest::blocking::get(url)
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to download from {url}: {e}")))?;

    if !response.status().is_success() {
        return Err(DatasetsError::DownloadError(format!(
            "Failed to download from {url}: HTTP status {}",
            response.status()
        )));
    }

    let data = response
        .bytes()
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to read response data: {e}")))?;

    let data_vec = data.to_vec();

    // Cache the data (file cache + in-memory cache)
    cache.write_cached(&cache_key, &data_vec)?;

    Ok(data_vec)
}
690
691// Stub for when download feature is not enabled
692#[cfg(not(feature = "download"))]
693/// Downloads data from a URL or retrieves it from cache
694///
695/// This is a stub implementation when the download feature is not enabled.
696/// It returns an error informing the user to enable the download feature.
697///
698/// # Arguments
699///
700/// * `_url` - The URL to download from
701/// * `_force_download` - If true, force a new download instead of using cache
702///
703/// # Returns
704///
705/// * An error indicating that the download feature is not enabled
706#[allow(dead_code)]
707pub fn download_data(_url: &str, _force_download: bool) -> Result<Vec<u8>> {
708    Err(DatasetsError::Other(
709        "Download feature is not enabled. Recompile with --features download".to_string(),
710    ))
711}
712
/// Cache management utilities
///
/// Thin wrapper around `DatasetCache` that adds dataset-level JSON
/// (de)serialization and reporting helpers.
pub struct CacheManager {
    // Underlying dataset cache (file-based + in-memory)
    cache: DatasetCache,
}
717
impl CacheManager {
    /// Create a new cache manager with default settings
    ///
    /// Uses the platform cache directory (see `get_cachedir`) together with
    /// the default in-memory cache size and TTL; fails if the cache directory
    /// cannot be determined or created.
    pub fn new() -> Result<Self> {
        let cachedir = get_cachedir()?;
        Ok(Self {
            cache: DatasetCache::with_config(cachedir, DEFAULT_CACHE_SIZE, DEFAULT_CACHE_TTL),
        })
    }

    /// Create a new cache manager with custom settings
    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        Self {
            cache: DatasetCache::with_config(cachedir, cache_size, ttl_seconds),
        }
    }

    /// Get a dataset from cache using CacheKey
    ///
    /// Returns `Ok(None)` on a cache miss. Hits are deserialized from JSON;
    /// on deserialization failure the in-memory entry is invalidated (the
    /// on-disk file is left in place) and an error is returned.
    pub fn get(&self, key: &CacheKey) -> Result<Option<crate::utils::Dataset>> {
        let name = key.as_string();
        if self.cache.is_cached(&name) {
            match self.cache.read_cached(&name) {
                Ok(cached_data) => {
                    match serde_json::from_slice::<crate::utils::Dataset>(&cached_data) {
                        Ok(dataset) => Ok(Some(dataset)),
                        Err(e) => {
                            // If deserialization fails, consider the cache entry invalid
                            self.cache
                                .mem_cache
                                .borrow_mut()
                                .remove(&FileCacheKey(name.clone()));
                            Err(DatasetsError::CacheError(format!(
                                "Failed to deserialize cached dataset: {e}"
                            )))
                        }
                    }
                }
                Err(e) => Err(DatasetsError::CacheError(format!(
                    "Failed to read cached data: {e}"
                ))),
            }
        } else {
            Ok(None)
        }
    }

    /// Put a dataset into cache using CacheKey
    ///
    /// The dataset is serialized to JSON and written to both the file cache
    /// and the in-memory cache.
    pub fn put(&self, key: &CacheKey, dataset: &crate::utils::Dataset) -> Result<()> {
        let name = key.as_string();

        // Serialize the dataset to JSON bytes for caching
        let serialized = serde_json::to_vec(dataset)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to serialize dataset: {e}")))?;

        // Write the serialized data to cache
        self.cache
            .write_cached(&name, &serialized)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to write to cache: {e}")))
    }

    /// Create a cache manager with comprehensive configuration
    pub fn with_full_config(
        cachedir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        Self {
            cache: DatasetCache::with_full_config(
                cachedir,
                cache_size,
                ttl_seconds,
                max_cache_size,
                offline_mode,
            ),
        }
    }

    /// Get basic cache statistics
    ///
    /// Best-effort directory scan: entries that cannot be read are skipped
    /// rather than reported as errors.
    pub fn get_stats(&self) -> CacheStats {
        let cachedir = &self.cache.cachedir;
        let mut total_size = 0u64;
        let mut file_count = 0usize;

        if cachedir.exists() {
            if let Ok(entries) = fs::read_dir(cachedir) {
                for entry in entries.flatten() {
                    if let Ok(metadata) = entry.metadata() {
                        if metadata.is_file() {
                            total_size += metadata.len();
                            file_count += 1;
                        }
                    }
                }
            }
        }

        CacheStats {
            total_size_bytes: total_size,
            file_count,
            cachedir: cachedir.clone(),
        }
    }

    /// Get detailed cache statistics
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        self.cache.get_detailed_stats()
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.cache.set_offline_mode(offline);
    }

    /// Check if in offline mode
    pub fn is_offline(&self) -> bool {
        self.cache.is_offline()
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.cache.set_max_cache_size(max_size);
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.cache.max_cache_size()
    }

    /// Clear all cached data
    pub fn clear_all(&self) -> Result<()> {
        self.cache.clear_cache()
    }

    /// Remove specific cached file
    pub fn remove(&self, name: &str) -> Result<()> {
        self.cache.remove_cached(name)
    }

    /// Remove old files to free up space
    ///
    /// NOTE(review): `target_size` is forwarded as the "needed size" of an
    /// incoming file (see `DatasetCache::cleanup_cache_to_fit`), not as a
    /// final cache size — confirm callers expect that semantic.
    pub fn cleanup_old_files(&self, target_size: u64) -> Result<()> {
        self.cache.cleanup_cache_to_fit(target_size)
    }

    /// List all cached files
    ///
    /// Returns file names (not full paths), sorted alphabetically.
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        let cachedir = &self.cache.cachedir;
        let mut files = Vec::new();

        if cachedir.exists() {
            let entries = fs::read_dir(cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Some(filename) = entry.file_name().to_str() {
                    files.push(filename.to_string());
                }
            }
        }

        files.sort();
        Ok(files)
    }

    /// Get cache directory path
    pub fn cachedir(&self) -> &PathBuf {
        &self.cache.cachedir
    }

    /// Check if a file is cached
    pub fn is_cached(&self, name: &str) -> bool {
        self.cache.is_cached(name)
    }

    /// Print detailed cache report
    ///
    /// Writes a human-readable summary of the cache state to stdout.
    pub fn print_cache_report(&self) -> Result<()> {
        let stats = self.get_detailed_stats()?;

        println!("=== Cache Report ===");
        println!("Cache Directory: {}", stats.cachedir.display());
        println!(
            "Total Size: {} ({} files)",
            stats.formatted_size(),
            stats.file_count
        );
        println!("Max Size: {}", stats.formatted_max_size());

        if stats.max_cache_size > 0 {
            println!("Usage: {:.1}%", stats.usage_percentage() * 100.0);
        }

        println!(
            "Offline Mode: {}",
            if stats.offline_mode {
                "Enabled"
            } else {
                "Disabled"
            }
        );

        if !stats.files.is_empty() {
            println!("\nCached Files:");
            for file in &stats.files {
                println!(
                    "  {} - {} ({})",
                    file.name,
                    file.formatted_size(),
                    file.formatted_modified()
                );
            }
        }

        Ok(())
    }
}
938
/// Cache statistics
///
/// Lightweight summary of the on-disk cache (see `get_stats`). Derives
/// `Debug`/`Clone` for consistency with [`CacheFileInfo`], plus equality
/// for easy testing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
}
948
949/// Detailed cache statistics with file-level information
950pub struct DetailedCacheStats {
951    /// Total size of all cached files in bytes
952    pub total_size_bytes: u64,
953    /// Number of cached files
954    pub file_count: usize,
955    /// Cache directory path
956    pub cachedir: PathBuf,
957    /// Maximum cache size (0 = unlimited)
958    pub max_cache_size: u64,
959    /// Whether cache is in offline mode
960    pub offline_mode: bool,
961    /// Information about individual cached files
962    pub files: Vec<CacheFileInfo>,
963}
964
/// Information about a cached file
///
/// One entry per file, as carried in `DetailedCacheStats::files`.
#[derive(Debug, Clone)]
pub struct CacheFileInfo {
    /// Name of the cached file
    pub name: String,
    /// Size in bytes
    pub size_bytes: u64,
    /// Last modified time, if the filesystem reported one
    pub modified: Option<std::time::SystemTime>,
}
975
impl CacheStats {
    /// Get total size formatted as human-readable string
    ///
    /// Uses [`format_bytes`], e.g. `1024` renders as `"1.0 KB"`.
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }
}
982
983impl DetailedCacheStats {
984    /// Get total size formatted as human-readable string
985    pub fn formatted_size(&self) -> String {
986        format_bytes(self.total_size_bytes)
987    }
988
989    /// Get max cache size formatted as human-readable string
990    pub fn formatted_max_size(&self) -> String {
991        if self.max_cache_size == 0 {
992            "Unlimited".to_string()
993        } else {
994            format_bytes(self.max_cache_size)
995        }
996    }
997
998    /// Get cache usage percentage (0.0-1.0)
999    pub fn usage_percentage(&self) -> f64 {
1000        if self.max_cache_size == 0 {
1001            0.0
1002        } else {
1003            self.total_size_bytes as f64 / self.max_cache_size as f64
1004        }
1005    }
1006}
1007
1008impl CacheFileInfo {
1009    /// Get file size formatted as human-readable string
1010    pub fn formatted_size(&self) -> String {
1011        format_bytes(self.size_bytes)
1012    }
1013
1014    /// Get formatted modification time
1015    pub fn formatted_modified(&self) -> String {
1016        match &self.modified {
1017            Some(time) => {
1018                if let Ok(now) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)
1019                {
1020                    if let Ok(modified) = time.duration_since(std::time::UNIX_EPOCH) {
1021                        let diff_secs = now.as_secs().saturating_sub(modified.as_secs());
1022                        let days = diff_secs / 86400;
1023                        let hours = (diff_secs % 86400) / 3600;
1024                        let mins = (diff_secs % 3600) / 60;
1025
1026                        if days > 0 {
1027                            format!("{days} days ago")
1028                        } else if hours > 0 {
1029                            format!("{hours} hours ago")
1030                        } else if mins > 0 {
1031                            format!("{mins} minutes ago")
1032                        } else {
1033                            "Just now".to_string()
1034                        }
1035                    } else {
1036                        "Unknown".to_string()
1037                    }
1038                } else {
1039                    "Unknown".to_string()
1040                }
1041            }
1042            None => "Unknown".to_string(),
1043        }
1044    }
1045}
1046
/// Format bytes as human-readable string
///
/// Uses binary units (powers of 1024): values below 1 KB are shown as whole
/// bytes, larger values with one decimal place in KB/MB/GB.
#[allow(dead_code)]
fn format_bytes(bytes: u64) -> String {
    const KB: f64 = 1024.0;
    const MB: f64 = 1024.0 * KB;
    const GB: f64 = 1024.0 * MB;

    let size = bytes as f64;
    if size >= GB {
        format!("{:.1} GB", size / GB)
    } else if size >= MB {
        format!("{:.1} MB", size / MB)
    } else if size >= KB {
        format!("{:.1} KB", size / KB)
    } else {
        format!("{size} B")
    }
}
1061
/// Batch operation result containing success/failure information
///
/// Accumulated by the `BatchOperations` methods: per-item success/failure
/// counts, error details, byte totals, and wall-clock time for one run.
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// Number of successful operations
    pub success_count: usize,
    /// Number of failed operations
    pub failure_count: usize,
    /// List of failed items with error messages, as `(item_name, error)` pairs
    pub failures: Vec<(String, String)>,
    /// Total bytes processed
    pub total_bytes: u64,
    /// Total time taken for the batch operation
    pub elapsed_time: std::time::Duration,
}
1076
1077impl BatchResult {
1078    /// Create a new empty batch result
1079    pub fn new() -> Self {
1080        Self {
1081            success_count: 0,
1082            failure_count: 0,
1083            failures: Vec::new(),
1084            total_bytes: 0,
1085            elapsed_time: std::time::Duration::ZERO,
1086        }
1087    }
1088
1089    /// Check if all operations were successful
1090    pub fn is_all_success(&self) -> bool {
1091        self.failure_count == 0
1092    }
1093
1094    /// Get success rate as percentage
1095    pub fn success_rate(&self) -> f64 {
1096        let total = self.success_count + self.failure_count;
1097        if total == 0 {
1098            0.0
1099        } else {
1100            (self.success_count as f64 / total as f64) * 100.0
1101        }
1102    }
1103
1104    /// Get formatted summary
1105    pub fn summary(&self) -> String {
1106        format!(
1107            "Batch completed: {}/{} successful ({:.1}%), {} bytes processed in {:.2}s",
1108            self.success_count,
1109            self.success_count + self.failure_count,
1110            self.success_rate(),
1111            format_bytes(self.total_bytes),
1112            self.elapsed_time.as_secs_f64()
1113        )
1114    }
1115}
1116
impl Default for BatchResult {
    /// Equivalent to [`BatchResult::new`]: an empty result with all
    /// counters zeroed.
    fn default() -> Self {
        Self::new()
    }
}
1122
/// Batch operations manager for dataset caching
///
/// Wraps a `CacheManager` and layers batch download, integrity
/// verification, selective cleanup, and bulk processing on top of it.
pub struct BatchOperations {
    /// Underlying cache manager used for all disk/memory cache access.
    cache: CacheManager,
    /// When true, downloads and processing run on one thread per item.
    parallel: bool,
    /// Retries *after* the first attempt (each item is tried `max_retries + 1` times).
    max_retries: usize,
    /// Delay slept between download attempts.
    retry_delay: std::time::Duration,
}
1130
1131impl BatchOperations {
1132    /// Create a new batch operations manager
1133    pub fn new(cache: CacheManager) -> Self {
1134        Self {
1135            cache,
1136            parallel: true,
1137            max_retries: 3,
1138            retry_delay: std::time::Duration::from_millis(1000),
1139        }
1140    }
1141
1142    /// Configure parallel processing
1143    pub fn with_parallel(mut self, parallel: bool) -> Self {
1144        self.parallel = parallel;
1145        self
1146    }
1147
1148    /// Configure retry settings
1149    pub fn with_retry_config(
1150        mut self,
1151        max_retries: usize,
1152        retry_delay: std::time::Duration,
1153    ) -> Self {
1154        self.max_retries = max_retries;
1155        self.retry_delay = retry_delay;
1156        self
1157    }
1158
1159    /// Download multiple datasets in batch
1160    #[cfg(feature = "download")]
1161    pub fn batch_download(&self, urls_andnames: &[(&str, &str)]) -> BatchResult {
1162        let start_time = std::time::Instant::now();
1163        let mut result = BatchResult::new();
1164
1165        if self.parallel {
1166            self.batch_download_parallel(urls_andnames, &mut result)
1167        } else {
1168            self.batch_download_sequential(urls_andnames, &mut result)
1169        }
1170
1171        result.elapsed_time = start_time.elapsed();
1172        result
1173    }
1174
1175    #[cfg(feature = "download")]
1176    fn batch_download_parallel(&self, urls_andnames: &[(&str, &str)], result: &mut BatchResult) {
1177        use std::fs::File;
1178        use std::io::Write;
1179        use std::sync::{Arc, Mutex};
1180        use std::thread;
1181
1182        // Ensure cache directory exists before spawning threads
1183        if let Err(e) = self.cache.cache.ensure_cachedir() {
1184            result.failure_count += urls_andnames.len();
1185            for &(_, name) in urls_andnames {
1186                result
1187                    .failures
1188                    .push((name.to_string(), format!("Cache setup failed: {e}")));
1189            }
1190            return;
1191        }
1192
1193        let result_arc = Arc::new(Mutex::new(BatchResult::new()));
1194        let cachedir = self.cache.cache.cachedir.clone();
1195        let max_retries = self.max_retries;
1196        let retry_delay = self.retry_delay;
1197
1198        let handles: Vec<_> = urls_andnames
1199            .iter()
1200            .map(|&(url, name)| {
1201                let result_clone = Arc::clone(&result_arc);
1202                let url = url.to_string();
1203                let name = name.to_string();
1204                let cachedir = cachedir.clone();
1205
1206                thread::spawn(move || {
1207                    let mut success = false;
1208                    let mut last_error = String::new();
1209                    let mut downloaded_data = Vec::new();
1210
1211                    for attempt in 0..=max_retries {
1212                        match download_data(&url, false) {
1213                            Ok(data) => {
1214                                // Write directly to filesystem (bypassing RefCell memory cache)
1215                                let path = cachedir.join(&name);
1216                                match File::create(&path) {
1217                                    Ok(mut file) => match file.write_all(&data) {
1218                                        Ok(_) => {
1219                                            let mut r =
1220                                                result_clone.lock().expect("Operation failed");
1221                                            r.success_count += 1;
1222                                            r.total_bytes += data.len() as u64;
1223                                            downloaded_data = data;
1224                                            success = true;
1225                                            break;
1226                                        }
1227                                        Err(e) => {
1228                                            last_error = format!("Failed to write cache file: {e}");
1229                                        }
1230                                    },
1231                                    Err(e) => {
1232                                        last_error = format!("Failed to create cache file: {e}");
1233                                    }
1234                                }
1235                            }
1236                            Err(e) => {
1237                                last_error = format!("Download failed: {e}");
1238                                if attempt < max_retries {
1239                                    thread::sleep(retry_delay);
1240                                }
1241                            }
1242                        }
1243                    }
1244
1245                    if !success {
1246                        let mut r = result_clone.lock().expect("Operation failed");
1247                        r.failure_count += 1;
1248                        r.failures.push((name.clone(), last_error));
1249                    }
1250
1251                    (name, success, downloaded_data)
1252                })
1253            })
1254            .collect();
1255
1256        // Collect results and update memory cache for successful downloads
1257        let mut successful_downloads = Vec::new();
1258        for handle in handles {
1259            if let Ok((name, success, data)) = handle.join() {
1260                if success && !data.is_empty() {
1261                    successful_downloads.push((name, data));
1262                }
1263            }
1264        }
1265
1266        // Merge the results from the arc back into the original result
1267        if let Ok(arc_result) = result_arc.lock() {
1268            result.success_count += arc_result.success_count;
1269            result.failure_count += arc_result.failure_count;
1270            result.failures.extend(arc_result.failures.clone());
1271        }
1272
1273        // Update memory cache after all threads complete
1274        for (name, data) in successful_downloads {
1275            let key = FileCacheKey(name);
1276            self.cache.cache.mem_cache.borrow_mut().insert(key, data);
1277        }
1278    }
1279
1280    #[cfg(feature = "download")]
1281    fn batch_download_sequential(&self, urls_andnames: &[(&str, &str)], result: &mut BatchResult) {
1282        for &(url, name) in urls_andnames {
1283            let mut success = false;
1284            let mut last_error = String::new();
1285
1286            for attempt in 0..=self.max_retries {
1287                match download_data(url, false) {
1288                    Ok(data) => match self.cache.cache.write_cached(name, &data) {
1289                        Ok(_) => {
1290                            result.success_count += 1;
1291                            result.total_bytes += data.len() as u64;
1292                            success = true;
1293                            break;
1294                        }
1295                        Err(e) => {
1296                            last_error = format!("Cache write failed: {e}");
1297                        }
1298                    },
1299                    Err(e) => {
1300                        last_error = format!("Download failed: {e}");
1301                        if attempt < self.max_retries {
1302                            std::thread::sleep(self.retry_delay);
1303                        }
1304                    }
1305                }
1306            }
1307
1308            if !success {
1309                result.failure_count += 1;
1310                result.failures.push((name.to_string(), last_error));
1311            }
1312        }
1313    }
1314
1315    /// Verify integrity of multiple cached files
1316    pub fn batch_verify_integrity(&self, files_andhashes: &[(&str, &str)]) -> BatchResult {
1317        let start_time = std::time::Instant::now();
1318        let mut result = BatchResult::new();
1319
1320        for &(filename, expected_hash) in files_andhashes {
1321            match self.cache.cache.get_cachedpath(filename).exists() {
1322                true => match sha256_hash_file(&self.cache.cache.get_cachedpath(filename)) {
1323                    Ok(actual_hash) => {
1324                        if actual_hash == expected_hash {
1325                            result.success_count += 1;
1326                            if let Ok(metadata) =
1327                                std::fs::metadata(self.cache.cache.get_cachedpath(filename))
1328                            {
1329                                result.total_bytes += metadata.len();
1330                            }
1331                        } else {
1332                            result.failure_count += 1;
1333                            result.failures.push((
1334                                filename.to_string(),
1335                                format!(
1336                                    "Hash mismatch: expected {expected_hash}, got {actual_hash}"
1337                                ),
1338                            ));
1339                        }
1340                    }
1341                    Err(e) => {
1342                        result.failure_count += 1;
1343                        result.failures.push((
1344                            filename.to_string(),
1345                            format!("Hash computation failed: {e}"),
1346                        ));
1347                    }
1348                },
1349                false => {
1350                    result.failure_count += 1;
1351                    result
1352                        .failures
1353                        .push((filename.to_string(), "File not found in cache".to_string()));
1354                }
1355            }
1356        }
1357
1358        result.elapsed_time = start_time.elapsed();
1359        result
1360    }
1361
1362    /// Clean up cache selectively based on patterns
1363    pub fn selective_cleanup(
1364        &self,
1365        patterns: &[&str],
1366        max_age_days: Option<u32>,
1367    ) -> Result<BatchResult> {
1368        let start_time = std::time::Instant::now();
1369        let mut result = BatchResult::new();
1370
1371        let cached_files = self.cache.list_cached_files()?;
1372        let now = std::time::SystemTime::now();
1373
1374        for filename in cached_files {
1375            let should_remove = patterns.iter().any(|pattern| {
1376                filename.contains(pattern) || matches_glob_pattern(&filename, pattern)
1377            });
1378
1379            if should_remove {
1380                let filepath = self.cache.cache.get_cachedpath(&filename);
1381
1382                // Check age if max_age_days is specified
1383                let remove_due_to_age = if let Some(max_age) = max_age_days {
1384                    if let Ok(metadata) = std::fs::metadata(&filepath) {
1385                        if let Ok(modified) = metadata.modified() {
1386                            if let Ok(age) = now.duration_since(modified) {
1387                                age.as_secs() > (max_age as u64 * 24 * 3600)
1388                            } else {
1389                                false
1390                            }
1391                        } else {
1392                            false
1393                        }
1394                    } else {
1395                        false
1396                    }
1397                } else {
1398                    true // Remove regardless of age if no age limit specified
1399                };
1400
1401                if remove_due_to_age {
1402                    match self.cache.remove(&filename) {
1403                        Ok(_) => {
1404                            result.success_count += 1;
1405                            if let Ok(metadata) = std::fs::metadata(&filepath) {
1406                                result.total_bytes += metadata.len();
1407                            }
1408                        }
1409                        Err(e) => {
1410                            result.failure_count += 1;
1411                            result
1412                                .failures
1413                                .push((filename, format!("Removal failed: {e}")));
1414                        }
1415                    }
1416                }
1417            }
1418        }
1419
1420        result.elapsed_time = start_time.elapsed();
1421        Ok(result)
1422    }
1423
1424    /// Process multiple datasets with a given function
1425    pub fn batch_process<F, T, E>(&self, names: &[String], processor: F) -> BatchResult
1426    where
1427        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1428        E: std::fmt::Display,
1429        T: Send,
1430    {
1431        let start_time = std::time::Instant::now();
1432        let mut result = BatchResult::new();
1433
1434        if self.parallel {
1435            self.batch_process_parallel(names, processor, &mut result)
1436        } else {
1437            self.batch_process_sequential(names, processor, &mut result)
1438        }
1439
1440        result.elapsed_time = start_time.elapsed();
1441        result
1442    }
1443
1444    fn batch_process_parallel<F, T, E>(
1445        &self,
1446        names: &[String],
1447        processor: F,
1448        result: &mut BatchResult,
1449    ) where
1450        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1451        E: std::fmt::Display,
1452        T: Send,
1453    {
1454        // For thread safety with the current cache implementation,
1455        // we need to read all data first, then process in parallel
1456        let mut data_pairs = Vec::new();
1457
1458        // Sequential read phase
1459        for name in names {
1460            match self.cache.cache.read_cached(name) {
1461                Ok(data) => data_pairs.push((name.clone(), data)),
1462                Err(e) => {
1463                    result.failure_count += 1;
1464                    result
1465                        .failures
1466                        .push((name.clone(), format!("Cache read failed: {e}")));
1467                }
1468            }
1469        }
1470
1471        // Parallel processing phase
1472        if !data_pairs.is_empty() {
1473            use std::sync::{Arc, Mutex};
1474            use std::thread;
1475
1476            let parallel_result = Arc::new(Mutex::new(BatchResult::new()));
1477            let processor = Arc::new(processor);
1478
1479            let handles: Vec<_> = data_pairs
1480                .into_iter()
1481                .map(|(name, data)| {
1482                    let result_clone = Arc::clone(&parallel_result);
1483                    let processor_clone = Arc::clone(&processor);
1484
1485                    thread::spawn(move || match processor_clone(&name, &data) {
1486                        Ok(_) => {
1487                            let mut r = result_clone.lock().expect("Operation failed");
1488                            r.success_count += 1;
1489                            r.total_bytes += data.len() as u64;
1490                        }
1491                        Err(e) => {
1492                            let mut r = result_clone.lock().expect("Operation failed");
1493                            r.failure_count += 1;
1494                            r.failures.push((name, format!("Processing failed: {e}")));
1495                        }
1496                    })
1497                })
1498                .collect();
1499
1500            for handle in handles {
1501                let _ = handle.join();
1502            }
1503
1504            // Merge parallel results into main result
1505            let parallel_result = parallel_result.lock().expect("Operation failed");
1506            result.success_count += parallel_result.success_count;
1507            result.failure_count += parallel_result.failure_count;
1508            result.total_bytes += parallel_result.total_bytes;
1509            result.failures.extend(parallel_result.failures.clone());
1510        }
1511    }
1512
1513    fn batch_process_sequential<F, T, E>(
1514        &self,
1515        names: &[String],
1516        processor: F,
1517        result: &mut BatchResult,
1518    ) where
1519        F: Fn(&str, &[u8]) -> std::result::Result<T, E>,
1520        E: std::fmt::Display,
1521    {
1522        for name in names {
1523            match self.cache.cache.read_cached(name) {
1524                Ok(data) => match processor(name, &data) {
1525                    Ok(_) => {
1526                        result.success_count += 1;
1527                        result.total_bytes += data.len() as u64;
1528                    }
1529                    Err(e) => {
1530                        result.failure_count += 1;
1531                        result
1532                            .failures
1533                            .push((name.clone(), format!("Processing failed: {e}")));
1534                    }
1535                },
1536                Err(e) => {
1537                    result.failure_count += 1;
1538                    result
1539                        .failures
1540                        .push((name.clone(), format!("Cache read failed: {e}")));
1541                }
1542            }
1543        }
1544    }
1545
1546    /// Get access to the underlying cache manager
1547    pub fn cache_manager(&self) -> &CacheManager {
1548        &self.cache
1549    }
1550
1551    /// Write data to cache
1552    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
1553        self.cache.cache.write_cached(name, data)
1554    }
1555
1556    /// Read data from cache
1557    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
1558        self.cache.cache.read_cached(name)
1559    }
1560
1561    /// List cached files
1562    pub fn list_cached_files(&self) -> Result<Vec<String>> {
1563        self.cache.list_cached_files()
1564    }
1565
1566    /// Print cache report
1567    pub fn print_cache_report(&self) -> Result<()> {
1568        self.cache.print_cache_report()
1569    }
1570
1571    /// Get statistics about cached datasets
1572    pub fn get_cache_statistics(&self) -> Result<BatchResult> {
1573        let start_time = std::time::Instant::now();
1574        let mut result = BatchResult::new();
1575
1576        let cached_files = self.cache.list_cached_files()?;
1577
1578        for filename in cached_files {
1579            let filepath = self.cache.cache.get_cachedpath(&filename);
1580            match std::fs::metadata(&filepath) {
1581                Ok(metadata) => {
1582                    result.success_count += 1;
1583                    result.total_bytes += metadata.len();
1584                }
1585                Err(e) => {
1586                    result.failure_count += 1;
1587                    result
1588                        .failures
1589                        .push((filename, format!("Metadata read failed: {e}")));
1590                }
1591            }
1592        }
1593
1594        result.elapsed_time = start_time.elapsed();
1595        Ok(result)
1596    }
1597}
1598
/// Simple glob pattern matching for filenames
///
/// `*` matches any (possibly empty) run of characters; all other characters
/// match literally. A pattern without `*` must equal the filename exactly.
/// Any number of wildcards is supported, and the literal segments may not
/// overlap (e.g. `"a"` does NOT match `"a*a"`, but `"aa"` does).
#[allow(dead_code)]
fn matches_glob_pattern(filename: &str, pattern: &str) -> bool {
    // No wildcard: plain equality.
    if !pattern.contains('*') {
        return filename == pattern;
    }

    // Splitting on '*' yields the literal segments; since the pattern
    // contains at least one '*', there are at least two segments.
    let segments: Vec<&str> = pattern.split('*').collect();
    let n = segments.len();

    // The leading literal must be a prefix; consume it so its characters
    // cannot be reused by later segments (fixes prefix/suffix overlap).
    let mut remaining = match filename.strip_prefix(segments[0]) {
        Some(rest) => rest,
        None => return false,
    };

    // Each middle literal must appear, in order, in the unconsumed text.
    // Taking the leftmost occurrence leaves the largest possible tail for
    // the suffix, so this never causes false negatives.
    for segment in &segments[1..n - 1] {
        if segment.is_empty() {
            continue; // consecutive '*'s collapse into one
        }
        match remaining.find(segment) {
            Some(idx) => remaining = &remaining[idx + segment.len()..],
            None => return false,
        }
    }

    // The trailing literal must be a suffix of what is left.
    remaining.ends_with(segments[n - 1])
}
1617
1618#[cfg(test)]
1619mod tests {
1620    use super::*;
1621    use tempfile::TempDir;
1622
1623    #[test]
1624    fn test_batch_result() {
1625        let mut result = BatchResult::new();
1626        assert_eq!(result.success_count, 0);
1627        assert_eq!(result.failure_count, 0);
1628        assert!(result.is_all_success());
1629        assert_eq!(result.success_rate(), 0.0);
1630
1631        result.success_count = 8;
1632        result.failure_count = 2;
1633        result.total_bytes = 1024;
1634
1635        assert!(!result.is_all_success());
1636        assert_eq!(result.success_rate(), 80.0);
1637        assert!(result.summary().contains("8/10 successful"));
1638        assert!(result.summary().contains("80.0%"));
1639    }
1640
1641    #[test]
1642    fn test_batch_operations_creation() {
1643        let tempdir = TempDir::new().expect("Operation failed");
1644        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1645        let batch_ops = BatchOperations::new(cache_manager)
1646            .with_parallel(false)
1647            .with_retry_config(2, std::time::Duration::from_millis(500));
1648
1649        assert!(!batch_ops.parallel);
1650        assert_eq!(batch_ops.max_retries, 2);
1651    }
1652
1653    #[test]
1654    fn test_selective_cleanup() {
1655        let tempdir = TempDir::new().expect("Operation failed");
1656        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1657        let batch_ops = BatchOperations::new(cache_manager);
1658
1659        // Create some test files
1660        let test_data = vec![0u8; 100];
1661        batch_ops
1662            .cache
1663            .cache
1664            .write_cached("test1.csv", &test_data)
1665            .expect("Test: cache operation failed");
1666        batch_ops
1667            .cache
1668            .cache
1669            .write_cached("test2.csv", &test_data)
1670            .expect("Test: cache operation failed");
1671        batch_ops
1672            .cache
1673            .cache
1674            .write_cached("data.json", &test_data)
1675            .expect("Test: cache operation failed");
1676
1677        // Clean up files matching pattern
1678        let result = batch_ops
1679            .selective_cleanup(&["*.csv"], None)
1680            .expect("Operation failed");
1681
1682        assert_eq!(result.success_count, 2); // Should remove test1.csv and test2.csv
1683        assert!(!batch_ops.cache.is_cached("test1.csv"));
1684        assert!(!batch_ops.cache.is_cached("test2.csv"));
1685        assert!(batch_ops.cache.is_cached("data.json")); // Should remain
1686    }
1687
1688    #[test]
1689    fn test_batch_process() {
1690        let tempdir = TempDir::new().expect("Operation failed");
1691        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1692        let batch_ops = BatchOperations::new(cache_manager).with_parallel(false);
1693
1694        // Create test files
1695        let test_data1 = vec![1u8; 100];
1696        let test_data2 = vec![2u8; 200];
1697        batch_ops
1698            .cache
1699            .cache
1700            .write_cached("file1.dat", &test_data1)
1701            .expect("Test: cache operation failed");
1702        batch_ops
1703            .cache
1704            .cache
1705            .write_cached("file2.dat", &test_data2)
1706            .expect("Test: cache operation failed");
1707
1708        let files = vec!["file1.dat".to_string(), "file2.dat".to_string()];
1709
1710        // Process files (verify they're non-empty)
1711        let result = batch_ops.batch_process(&files, |_name, data| {
1712            if data.is_empty() {
1713                Err("Empty file")
1714            } else {
1715                Ok(data.len())
1716            }
1717        });
1718
1719        assert_eq!(result.success_count, 2);
1720        assert_eq!(result.failure_count, 0);
1721        assert_eq!(result.total_bytes, 300); // 100 + 200
1722    }
1723
1724    #[test]
1725    fn test_get_cache_statistics() {
1726        let tempdir = TempDir::new().expect("Operation failed");
1727        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1728        let batch_ops = BatchOperations::new(cache_manager);
1729
1730        // Start with empty cache
1731        let result = batch_ops.get_cache_statistics().expect("Operation failed");
1732        assert_eq!(result.success_count, 0);
1733
1734        // Add some files
1735        let test_data = vec![0u8; 500];
1736        batch_ops
1737            .cache
1738            .cache
1739            .write_cached("test1.dat", &test_data)
1740            .expect("Test: cache operation failed");
1741        batch_ops
1742            .cache
1743            .cache
1744            .write_cached("test2.dat", &test_data)
1745            .expect("Test: cache operation failed");
1746
1747        let result = batch_ops.get_cache_statistics().expect("Operation failed");
1748        assert_eq!(result.success_count, 2);
1749        assert_eq!(result.total_bytes, 1000);
1750    }
1751
1752    #[test]
1753    fn test_matches_glob_pattern() {
1754        assert!(matches_glob_pattern("test.csv", "*"));
1755        assert!(matches_glob_pattern("test.csv", "*.csv"));
1756        assert!(matches_glob_pattern("test.csv", "test.*"));
1757        assert!(matches_glob_pattern("test.csv", "test.csv"));
1758
1759        assert!(!matches_glob_pattern("test.json", "*.csv"));
1760        assert!(!matches_glob_pattern("other.csv", "test.*"));
1761    }
1762
1763    #[test]
1764    fn test_cache_manager_creation() {
1765        let tempdir = TempDir::new().expect("Operation failed");
1766        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1767        let stats = manager.get_stats();
1768        assert_eq!(stats.file_count, 0);
1769    }
1770
1771    #[test]
1772    fn test_cache_stats_formatting() {
1773        let tempdir = TempDir::new().expect("Operation failed");
1774        let stats = CacheStats {
1775            total_size_bytes: 1024,
1776            file_count: 1,
1777            cachedir: tempdir.path().to_path_buf(),
1778        };
1779
1780        assert_eq!(stats.formatted_size(), "1.0 KB");
1781
1782        let stats_large = CacheStats {
1783            total_size_bytes: 1024 * 1024 * 1024,
1784            file_count: 1,
1785            cachedir: tempdir.path().to_path_buf(),
1786        };
1787
1788        assert_eq!(stats_large.formatted_size(), "1.0 GB");
1789    }
1790
1791    #[test]
1792    fn test_hash_file_name() {
1793        let hash1 = DatasetCache::hash_filename("test.csv");
1794        let hash2 = DatasetCache::hash_filename("test.csv");
1795        let hash3 = DatasetCache::hash_filename("different.csv");
1796
1797        assert_eq!(hash1, hash2);
1798        assert_ne!(hash1, hash3);
1799        assert_eq!(hash1.len(), 64); // Blake3 produces 32-byte hashes = 64 hex chars
1800    }
1801
1802    #[test]
1803    fn test_platform_cachedir() {
1804        let cachedir = get_platform_cachedir();
1805        // Should work on any platform
1806        assert!(cachedir.is_some() || cfg!(target_os = "unknown"));
1807
1808        if let Some(dir) = cachedir {
1809            assert!(dir.to_string_lossy().contains("scirs2-datasets"));
1810        }
1811    }
1812
1813    #[test]
1814    fn test_cache_size_management() {
1815        let tempdir = TempDir::new().expect("Operation failed");
1816        let cache = DatasetCache::with_full_config(
1817            tempdir.path().to_path_buf(),
1818            10,
1819            3600,
1820            2048, // 2KB limit
1821            false,
1822        );
1823
1824        // Write multiple small files to approach the limit
1825        let small_data1 = vec![0u8; 400];
1826        cache
1827            .write_cached("small1.dat", &small_data1)
1828            .expect("Operation failed");
1829
1830        let small_data2 = vec![0u8; 400];
1831        cache
1832            .write_cached("small2.dat", &small_data2)
1833            .expect("Operation failed");
1834
1835        let small_data3 = vec![0u8; 400];
1836        cache
1837            .write_cached("small3.dat", &small_data3)
1838            .expect("Operation failed");
1839
1840        // Now write a file that should trigger cleanup
1841        let medium_data = vec![0u8; 800];
1842        cache
1843            .write_cached("medium.dat", &medium_data)
1844            .expect("Operation failed");
1845
1846        // The cache should have cleaned up to stay under the limit
1847        let stats = cache.get_detailed_stats().expect("Operation failed");
1848        assert!(stats.total_size_bytes <= cache.max_cache_size());
1849
1850        // The most recent file should still be cached
1851        assert!(cache.is_cached("medium.dat"));
1852    }
1853
1854    #[test]
1855    fn test_offline_mode() {
1856        let tempdir = TempDir::new().expect("Operation failed");
1857        let mut cache = DatasetCache::new(tempdir.path().to_path_buf());
1858
1859        assert!(!cache.is_offline());
1860        cache.set_offline_mode(true);
1861        assert!(cache.is_offline());
1862    }
1863
1864    #[test]
1865    fn test_detailed_stats() {
1866        let tempdir = TempDir::new().expect("Operation failed");
1867        let cache = DatasetCache::new(tempdir.path().to_path_buf());
1868
1869        let test_data = vec![1, 2, 3, 4, 5];
1870        cache
1871            .write_cached("test.dat", &test_data)
1872            .expect("Operation failed");
1873
1874        let stats = cache.get_detailed_stats().expect("Operation failed");
1875        assert_eq!(stats.file_count, 1);
1876        assert_eq!(stats.total_size_bytes, test_data.len() as u64);
1877        assert_eq!(stats.files.len(), 1);
1878        assert_eq!(stats.files[0].name, "test.dat");
1879        assert_eq!(stats.files[0].size_bytes, test_data.len() as u64);
1880    }
1881
1882    #[test]
1883    fn test_cache_manager() {
1884        let tempdir = TempDir::new().expect("Operation failed");
1885        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1886
1887        let stats = manager.get_stats();
1888        assert_eq!(stats.file_count, 0);
1889        assert_eq!(stats.total_size_bytes, 0);
1890
1891        assert_eq!(manager.cachedir(), &tempdir.path().to_path_buf());
1892    }
1893
1894    #[test]
1895    fn test_format_bytes() {
1896        assert_eq!(format_bytes(512), "512 B");
1897        assert_eq!(format_bytes(1024), "1.0 KB");
1898        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
1899        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.0 GB");
1900    }
1901}