scirs2_datasets/cache.rs

//! Dataset caching functionality

use crate::error::{DatasetsError, Result};
use scirs2_core::cache::{CacheBuilder, TTLSizedCache};
use std::cell::RefCell;
use std::fs::{self, File};
use std::hash::{Hash, Hasher};
use std::io::{Read, Write};
use std::path::{Path, PathBuf};

/// The base directory name for caching datasets
const CACHE_DIR_NAME: &str = "scirs2-datasets";

/// Default cache size for in-memory caching
const DEFAULT_CACHE_SIZE: usize = 100;

/// Default TTL for in-memory cache (in seconds)
const DEFAULT_CACHE_TTL: u64 = 3600; // 1 hour

/// Default maximum cache size on disk (in bytes) - 500 MB
const DEFAULT_MAX_CACHE_SIZE: u64 = 500 * 1024 * 1024;

/// Cache directory environment variable
const CACHE_DIR_ENV: &str = "SCIRS2_CACHE_DIR";

/// Compute SHA256 hash of a file
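///
/// # Examples
///
/// A minimal sketch (hypothetical path; the checksum comparison mirrors how
/// `fetch_data` below uses this function):
///
/// ```ignore
/// use std::path::Path;
///
/// // Hash a downloaded file and compare against a known checksum.
/// let hash = sha256_hash_file(Path::new("/tmp/iris.csv"))?;
/// assert_eq!(hash.len(), 64); // SHA-256 renders as 64 hex characters
/// ```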
#[allow(dead_code)]
pub fn sha256_hash_file(path: &Path) -> std::result::Result<String, String> {
    use sha2::{Digest, Sha256};

    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
    let mut hasher = Sha256::new();
    let mut buffer = [0; 8192];

    loop {
        let bytes_read = file
            .read(&mut buffer)
            .map_err(|e| format!("Failed to read file: {e}"))?;
        if bytes_read == 0 {
            break;
        }
        hasher.update(&buffer[..bytes_read]);
    }

    Ok(format!("{:x}", hasher.finalize()))
}

/// Registry entry for dataset files
pub struct RegistryEntry {
    /// SHA256 hash of the file
    pub sha256: &'static str,
    /// URL to download the file from
    pub url: &'static str,
}

/// Get the platform-specific cache directory for downloading and storing datasets
///
/// The cache directory is determined in the following order:
/// 1. Environment variable `SCIRS2_CACHE_DIR` if set
/// 2. Platform-specific cache directory:
///    - Windows: `%LOCALAPPDATA%\scirs2-datasets`
///    - macOS: `~/Library/Caches/scirs2-datasets`
///    - Linux/Unix: `~/.cache/scirs2-datasets` (respects `XDG_CACHE_HOME`)
/// 3. Fallback to `~/.scirs2-datasets` if the platform-specific directory fails
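///
/// # Examples
///
/// A sketch of overriding the cache location via the environment (the
/// variable name comes from `CACHE_DIR_ENV` above):
///
/// ```ignore
/// std::env::set_var("SCIRS2_CACHE_DIR", "/tmp/scirs2-cache");
/// let dir = get_cachedir()?;
/// assert_eq!(dir, std::path::PathBuf::from("/tmp/scirs2-cache"));
/// ```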
#[allow(dead_code)]
pub fn get_cachedir() -> Result<PathBuf> {
    // Check environment variable first
    if let Ok(cachedir) = std::env::var(CACHE_DIR_ENV) {
        let cachepath = PathBuf::from(cachedir);
        ensure_directory_exists(&cachepath)?;
        return Ok(cachepath);
    }

    // Try platform-specific cache directory
    if let Some(cachedir) = get_platform_cachedir() {
        ensure_directory_exists(&cachedir)?;
        return Ok(cachedir);
    }

    // Fallback to home directory
    let homedir = dirs::home_dir()
        .ok_or_else(|| DatasetsError::CacheError("Could not find home directory".to_string()))?;
    let cachedir = homedir.join(format!(".{CACHE_DIR_NAME}"));
    ensure_directory_exists(&cachedir)?;

    Ok(cachedir)
}

/// Get platform-specific cache directory
#[allow(dead_code)]
fn get_platform_cachedir() -> Option<PathBuf> {
    #[cfg(target_os = "windows")]
    {
        dirs::data_local_dir().map(|dir| dir.join(CACHE_DIR_NAME))
    }
    #[cfg(target_os = "macos")]
    {
        dirs::home_dir().map(|dir| dir.join("Library").join("Caches").join(CACHE_DIR_NAME))
    }
    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
    {
        // Linux/Unix: Use XDG cache directory
        if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") {
            Some(PathBuf::from(xdg_cache).join(CACHE_DIR_NAME))
        } else {
            dirs::home_dir().map(|home| home.join(".cache").join(CACHE_DIR_NAME))
        }
    }
}

/// Ensure a directory exists, creating it if necessary
#[allow(dead_code)]
fn ensure_directory_exists(dir: &Path) -> Result<()> {
    if !dir.exists() {
        fs::create_dir_all(dir).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
        })?;
    }
    Ok(())
}

/// Fetch a dataset file from the cache when present, otherwise download it from the URL
///
/// This function will:
/// 1. Check if the file exists in the cache directory
/// 2. If not, download it from the URL in the registry entry
/// 3. Store it in the cache directory
/// 4. Return the path to the cached file
///
/// # Arguments
///
/// * `filename` - The name of the file to fetch
/// * `registry_entry` - Optional registry entry containing URL and SHA256 hash
///
/// # Returns
///
/// * `Ok(PathBuf)` - Path to the cached file
/// * `Err(String)` - Error message if fetching fails
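///
/// # Examples
///
/// A sketch with a hypothetical registry entry (placeholder URL; an empty
/// `sha256` skips verification):
///
/// ```ignore
/// let entry = RegistryEntry {
///     sha256: "",
///     url: "https://example.com/datasets/iris.csv",
/// };
/// let path = fetch_data("iris.csv", Some(&entry))?;
/// assert!(path.exists());
/// ```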
#[allow(dead_code)]
pub fn fetch_data(
    filename: &str,
    registry_entry: Option<&RegistryEntry>,
) -> std::result::Result<PathBuf, String> {
    // Get the cache directory
    let cachedir = match get_cachedir() {
        Ok(dir) => dir,
        Err(e) => return Err(format!("Failed to get cache directory: {e}")),
    };

    // Check if file exists in cache
    let cachepath = cachedir.join(filename);
    if cachepath.exists() {
        return Ok(cachepath);
    }

    // If not in cache, fetch from the URL
    let entry = match registry_entry {
        Some(entry) => entry,
        None => return Err(format!("No registry entry found for {filename}")),
    };

    // Create a temporary file to download to
    let tempdir = tempfile::tempdir().map_err(|e| format!("Failed to create temp dir: {e}"))?;
    let temp_file = tempdir.path().join(filename);

    // Download the file
    let response = ureq::get(entry.url)
        .call()
        .map_err(|e| format!("Failed to download {filename}: {e}"))?;

    // Read body into memory (ureq 3.x: use into_body, which implements Read)
    let mut body = response.into_body();
    let bytes = body
        .read_to_vec()
        .map_err(|e| format!("Failed to read response body: {e}"))?;
    let mut file = std::fs::File::create(&temp_file)
        .map_err(|e| format!("Failed to create temp file: {e}"))?;
    file.write_all(&bytes)
        .map_err(|e| format!("Failed to write downloaded file: {e}"))?;

    // Verify the SHA256 hash of the downloaded file if provided
    if !entry.sha256.is_empty() {
        let computed_hash = sha256_hash_file(&temp_file)?;
        if computed_hash != entry.sha256 {
            return Err(format!(
                "SHA256 hash mismatch for {filename}: expected {}, got {computed_hash}",
                entry.sha256
            ));
        }
    }

    // Move the file to the cache
    fs::create_dir_all(&cachedir).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    if let Some(parent) = cachepath.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    }

    fs::copy(&temp_file, &cachepath).map_err(|e| format!("Failed to copy to cache: {e}"))?;

    Ok(cachepath)
}

/// Cache key for dataset caching with configuration-aware hashing
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct CacheKey {
    name: String,
    config_hash: String,
}

impl CacheKey {
    /// Create a new cache key from dataset name and configuration
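    ///
    /// # Examples
    ///
    /// A sketch, assuming `RealWorldConfig` implements `Default` (not verified
    /// here):
    ///
    /// ```ignore
    /// let config = crate::real_world::RealWorldConfig::default();
    /// let key = CacheKey::new("california_housing", &config);
    /// // `as_string` renders the key as "<name>_<config hash>".
    /// assert!(key.as_string().starts_with("california_housing_"));
    /// ```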
    pub fn new(name: &str, config: &crate::real_world::RealWorldConfig) -> Self {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        config.use_cache.hash(&mut hasher);
        config.download_if_missing.hash(&mut hasher);
        config.return_preprocessed.hash(&mut hasher);
        config.subset.hash(&mut hasher);
        config.random_state.hash(&mut hasher);

        Self {
            name: name.to_string(),
            config_hash: format!("{:x}", hasher.finish()),
        }
    }

    /// Get the cache key as a string
    pub fn as_string(&self) -> String {
        format!("{}_{}", self.name, self.config_hash)
    }
}

/// File path wrapper for hashing
#[derive(Clone, Debug, Eq, PartialEq)]
struct FileCacheKey(String);

impl Hash for FileCacheKey {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0.hash(state);
    }
}

/// Manages caching of downloaded datasets, using both file-based and in-memory caching
///
/// This implementation uses the `TTLSizedCache` from `scirs2_core::cache` for in-memory
/// caching, while maintaining file-based persistence for long-term storage.
pub struct DatasetCache {
    /// Directory for file-based caching
    cachedir: PathBuf,
    /// In-memory cache for frequently accessed datasets
    mem_cache: RefCell<TTLSizedCache<FileCacheKey, Vec<u8>>>,
    /// Maximum cache size in bytes (0 means unlimited)
    max_cache_size: u64,
    /// Whether to operate in offline mode (no downloads)
    offline_mode: bool,
}

impl Default for DatasetCache {
    fn default() -> Self {
        let cachedir = get_cachedir().expect("Could not get cache directory");

        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        // Check if offline mode is enabled via environment variable
        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }
}

impl DatasetCache {
    /// Create a new dataset cache with the given cache directory and default memory cache
    pub fn new(cachedir: PathBuf) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with custom settings
    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with comprehensive configuration
    pub fn with_full_config(
        cachedir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size,
            offline_mode,
        }
    }

    /// Create the cache directory if it doesn't exist
    pub fn ensure_cachedir(&self) -> Result<()> {
        if !self.cachedir.exists() {
            fs::create_dir_all(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
            })?;
        }
        Ok(())
    }

    /// Get the path to a cached file
    pub fn get_cachedpath(&self, name: &str) -> PathBuf {
        self.cachedir.join(name)
    }

    /// Check if a file is already cached (either in memory or on disk)
    pub fn is_cached(&self, name: &str) -> bool {
        // Check memory cache first
        let key = FileCacheKey(name.to_string());
        if self.mem_cache.borrow_mut().get(&key).is_some() {
            return true;
        }

        // Then check file system
        self.get_cachedpath(name).exists()
    }

    /// Read a cached file as bytes
    ///
    /// This method checks the in-memory cache first, and falls back to the file system if needed.
    /// When reading from the file system, the result is also stored in the in-memory cache.
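    ///
    /// # Examples
    ///
    /// A round-trip sketch against a temporary directory (uses the `tempfile`
    /// crate, as the tests at the bottom of this file do):
    ///
    /// ```ignore
    /// let dir = tempfile::tempdir()?;
    /// let cache = DatasetCache::new(dir.path().to_path_buf());
    /// cache.write_cached("example.bin", b"hello")?;
    /// assert_eq!(cache.read_cached("example.bin")?, b"hello".to_vec());
    /// ```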
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        // Try memory cache first
        let key = FileCacheKey(name.to_string());
        if let Some(data) = self.mem_cache.borrow_mut().get(&key) {
            return Ok(data);
        }

        // Fall back to file system cache
        let path = self.get_cachedpath(name);
        if !path.exists() {
            return Err(DatasetsError::CacheError(format!(
                "Cached file does not exist: {name}"
            )));
        }

        let mut file = File::open(path)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to open cached file: {e}")))?;

        let mut buffer = Vec::new();
        file.read_to_end(&mut buffer)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to read cached file: {e}")))?;

        // Update memory cache
        self.mem_cache.borrow_mut().insert(key, buffer.clone());

        Ok(buffer)
    }

    /// Write data to both the file cache and the memory cache
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.ensure_cachedir()?;

        // Check if writing this file would exceed the cache size limit
        if self.max_cache_size > 0 {
            let current_size = self.get_cache_size_bytes()?;
            let new_file_size = data.len() as u64;

            if current_size + new_file_size > self.max_cache_size {
                self.cleanup_cache_to_fit(new_file_size)?;
            }
        }

        // Write to file system cache
        let path = self.get_cachedpath(name);
        let mut file = File::create(path)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to create cache file: {e}")))?;

        file.write_all(data).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to write to cache file: {e}"))
        })?;

        // Update memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().insert(key, data.to_vec());

        Ok(())
    }

    /// Clear the entire cache (both memory and file-based)
    pub fn clear_cache(&self) -> Result<()> {
        // Clear file system cache
        if self.cachedir.exists() {
            fs::remove_dir_all(&self.cachedir)
                .map_err(|e| DatasetsError::CacheError(format!("Failed to clear cache: {e}")))?;
        }

        // Clear memory cache
        self.mem_cache.borrow_mut().clear();

        Ok(())
    }

    /// Remove a specific cached file (from both memory and file system)
    pub fn remove_cached(&self, name: &str) -> Result<()> {
        // Remove from file system
        let path = self.get_cachedpath(name);
        if path.exists() {
            fs::remove_file(path).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to remove cached file: {e}"))
            })?;
        }

        // Remove from memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().remove(&key);

        Ok(())
    }

    /// Compute a BLAKE3 hash of a filename or URL, for use as a cache key
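    ///
    /// # Examples
    ///
    /// A sketch showing that the digest is deterministic for a given input:
    ///
    /// ```ignore
    /// let a = DatasetCache::hash_filename("https://example.com/iris.csv");
    /// let b = DatasetCache::hash_filename("https://example.com/iris.csv");
    /// assert_eq!(a, b);
    /// assert_eq!(a.len(), 64); // BLAKE3 also renders as 64 hex characters
    /// ```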
    pub fn hash_filename(name: &str) -> String {
        let hash = blake3::hash(name.as_bytes());
        hash.to_hex().to_string()
    }

    /// Get the total size of the cache in bytes
    pub fn get_cache_size_bytes(&self) -> Result<u64> {
        let mut total_size = 0u64;

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        total_size += metadata.len();
                    }
                }
            }
        }

        Ok(total_size)
    }

    /// Clean up the cache to fit a new file of the specified size
    ///
    /// This method removes the oldest files first until there is enough space
    /// for the new file plus some buffer space.
    fn cleanup_cache_to_fit(&self, needed_size: u64) -> Result<()> {
        if self.max_cache_size == 0 {
            return Ok(()); // No size limit
        }

        let current_size = self.get_cache_size_bytes()?;
        let target_size = (self.max_cache_size as f64 * 0.8) as u64; // Leave a 20% buffer
        let total_needed = current_size + needed_size;

        if total_needed <= target_size {
            return Ok(()); // No cleanup needed
        }

        let size_to_free = total_needed - target_size;

        // Get all files with their modification times
        let mut files_with_times = Vec::new();

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        if let Ok(modified) = metadata.modified() {
                            files_with_times.push((entry.path(), metadata.len(), modified));
                        }
                    }
                }
            }
        }

        // Sort by modification time (oldest first)
        files_with_times.sort_by_key(|(_path, _size, modified)| *modified);

        // Remove files until we've freed enough space
        let mut freed_size = 0u64;
        for (path, size, _modified) in files_with_times {
            if freed_size >= size_to_free {
                break;
            }

            // Remove from memory cache first
            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
                let key = FileCacheKey(filename.to_string());
                self.mem_cache.borrow_mut().remove(&key);
            }

            // Remove file
            if let Err(e) = fs::remove_file(&path) {
                eprintln!("Warning: Failed to remove cache file {path:?}: {e}");
            } else {
                freed_size += size;
            }
        }

        Ok(())
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.offline_mode = offline;
    }

    /// Check if the cache is in offline mode
    pub fn is_offline(&self) -> bool {
        self.offline_mode
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.max_cache_size = max_size;
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.max_cache_size
    }

    /// Put data into the cache (alias for `write_cached`)
    pub fn put(&self, name: &str, data: &[u8]) -> Result<()> {
        self.write_cached(name, data)
    }

    /// Get detailed cache information
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        let mut total_size = 0u64;
        let mut file_count = 0usize;
        let mut files = Vec::new();

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        let size = metadata.len();
                        total_size += size;
                        file_count += 1;

                        if let Some(filename) = entry.file_name().to_str() {
                            files.push(CacheFileInfo {
                                name: filename.to_string(),
                                size_bytes: size,
                                modified: metadata.modified().ok(),
                            });
                        }
                    }
                }
            }
        }

        // Sort files by size (largest first)
        files.sort_by(|a, b| b.size_bytes.cmp(&a.size_bytes));

        Ok(DetailedCacheStats {
            total_size_bytes: total_size,
            file_count,
            cachedir: self.cachedir.clone(),
            max_cache_size: self.max_cache_size,
            offline_mode: self.offline_mode,
            files,
        })
    }
}

/// Downloads data from a URL and returns it as bytes, using the cache when possible
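///
/// # Examples
///
/// A sketch, only meaningful with the `download` feature enabled (placeholder
/// URL):
///
/// ```ignore
/// let bytes = download_data("https://example.com/iris.csv", false)?;
/// // A second call with `force_download = false` is served from the cache.
/// let cached = download_data("https://example.com/iris.csv", false)?;
/// assert_eq!(bytes, cached);
/// ```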
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn download_data(url: &str, force_download: bool) -> Result<Vec<u8>> {
    let cache = DatasetCache::default();
    let cache_key = DatasetCache::hash_filename(url);

    // Check if the data is already cached
    if !force_download && cache.is_cached(&cache_key) {
        return cache.read_cached(&cache_key);
    }

    // Download the data
    let response = reqwest::blocking::get(url).map_err(|e| {
        DatasetsError::DownloadError(format!("Failed to download from {url}: {e}"))
    })?;

    if !response.status().is_success() {
        return Err(DatasetsError::DownloadError(format!(
            "Failed to download from {url}: HTTP status {}",
            response.status()
        )));
    }

    let data = response
        .bytes()
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to read response data: {e}")))?;

    let data_vec = data.to_vec();

    // Cache the data
    cache.write_cached(&cache_key, &data_vec)?;

    Ok(data_vec)
}

// Stub for when the download feature is not enabled
#[cfg(not(feature = "download"))]
/// Downloads data from a URL or retrieves it from cache
///
/// This is a stub implementation for when the download feature is not enabled.
/// It returns an error informing the user to enable the download feature.
///
/// # Arguments
///
/// * `_url` - The URL to download from
/// * `_force_download` - If true, force a new download instead of using the cache
///
/// # Returns
///
/// * An error indicating that the download feature is not enabled
#[allow(dead_code)]
pub fn download_data(_url: &str, _force_download: bool) -> Result<Vec<u8>> {
    Err(DatasetsError::Other(
        "Download feature is not enabled. Recompile with --features download".to_string(),
    ))
}

/// Cache management utilities
pub struct CacheManager {
    cache: DatasetCache,
}

impl CacheManager {
    /// Create a new cache manager with default settings
    pub fn new() -> Result<Self> {
        let cachedir = get_cachedir()?;
        Ok(Self {
            cache: DatasetCache::with_config(cachedir, DEFAULT_CACHE_SIZE, DEFAULT_CACHE_TTL),
        })
    }

    /// Create a new cache manager with custom settings
    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        Self {
            cache: DatasetCache::with_config(cachedir, cache_size, ttl_seconds),
        }
    }

    /// Get a dataset from the cache using a `CacheKey`
    pub fn get(&self, key: &CacheKey) -> Result<Option<crate::utils::Dataset>> {
        let name = key.as_string();
        if self.cache.is_cached(&name) {
            match self.cache.read_cached(&name) {
                Ok(cached_data) => {
                    match serde_json::from_slice::<crate::utils::Dataset>(&cached_data) {
                        Ok(dataset) => Ok(Some(dataset)),
                        Err(e) => {
                            // If deserialization fails, consider the cache entry invalid
                            self.cache
                                .mem_cache
                                .borrow_mut()
                                .remove(&FileCacheKey(name.clone()));
                            Err(DatasetsError::CacheError(format!(
                                "Failed to deserialize cached dataset: {e}"
                            )))
                        }
                    }
                }
                Err(e) => Err(DatasetsError::CacheError(format!(
                    "Failed to read cached data: {e}"
                ))),
            }
        } else {
            Ok(None)
        }
    }

    /// Put a dataset into the cache using a `CacheKey`
    pub fn put(&self, key: &CacheKey, dataset: &crate::utils::Dataset) -> Result<()> {
        let name = key.as_string();

        // Serialize the dataset to JSON bytes for caching
        let serialized = serde_json::to_vec(dataset)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to serialize dataset: {e}")))?;

        // Write the serialized data to cache
        self.cache
            .write_cached(&name, &serialized)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to write to cache: {e}")))
    }

    /// Create a cache manager with comprehensive configuration
    pub fn with_full_config(
        cachedir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        Self {
            cache: DatasetCache::with_full_config(
                cachedir,
                cache_size,
                ttl_seconds,
                max_cache_size,
                offline_mode,
            ),
        }
    }

    /// Get basic cache statistics
    pub fn get_stats(&self) -> CacheStats {
        let cachedir = &self.cache.cachedir;
        let mut total_size = 0u64;
        let mut file_count = 0usize;

        if cachedir.exists() {
            if let Ok(entries) = fs::read_dir(cachedir) {
                for entry in entries.flatten() {
                    if let Ok(metadata) = entry.metadata() {
                        if metadata.is_file() {
                            total_size += metadata.len();
                            file_count += 1;
                        }
                    }
                }
            }
        }

        CacheStats {
            total_size_bytes: total_size,
            file_count,
            cachedir: cachedir.clone(),
        }
    }

    /// Get detailed cache statistics
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        self.cache.get_detailed_stats()
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.cache.set_offline_mode(offline);
    }

    /// Check if in offline mode
    pub fn is_offline(&self) -> bool {
        self.cache.is_offline()
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.cache.set_max_cache_size(max_size);
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.cache.max_cache_size()
    }

    /// Clear all cached data
    pub fn clear_all(&self) -> Result<()> {
        self.cache.clear_cache()
    }

    /// Remove a specific cached file
    pub fn remove(&self, name: &str) -> Result<()> {
        self.cache.remove_cached(name)
    }

    /// Remove old files to free up space
    pub fn cleanup_old_files(&self, target_size: u64) -> Result<()> {
        self.cache.cleanup_cache_to_fit(target_size)
    }

    /// List all cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        let cachedir = &self.cache.cachedir;
        let mut files = Vec::new();

        if cachedir.exists() {
            let entries = fs::read_dir(cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Some(filename) = entry.file_name().to_str() {
                    files.push(filename.to_string());
                }
            }
        }

        files.sort();
        Ok(files)
    }

    /// Get the cache directory path
    pub fn cachedir(&self) -> &PathBuf {
        &self.cache.cachedir
    }

    /// Check if a file is cached
    pub fn is_cached(&self, name: &str) -> bool {
        self.cache.is_cached(name)
    }

    /// Print a detailed cache report
    pub fn print_cache_report(&self) -> Result<()> {
        let stats = self.get_detailed_stats()?;

        println!("=== Cache Report ===");
        println!("Cache Directory: {}", stats.cachedir.display());
        println!(
            "Total Size: {} ({} files)",
            stats.formatted_size(),
            stats.file_count
        );
        println!("Max Size: {}", stats.formatted_max_size());

        if stats.max_cache_size > 0 {
            println!("Usage: {:.1}%", stats.usage_percentage() * 100.0);
        }

        println!(
            "Offline Mode: {}",
            if stats.offline_mode {
                "Enabled"
            } else {
                "Disabled"
            }
        );

        if !stats.files.is_empty() {
            println!("\nCached Files:");
            for file in &stats.files {
                println!(
                    "  {} - {} ({})",
                    file.name,
                    file.formatted_size(),
                    file.formatted_modified()
                );
            }
        }

        Ok(())
    }
}

/// Cache statistics
pub struct CacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
}

/// Detailed cache statistics with file-level information
pub struct DetailedCacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
    /// Maximum cache size (0 = unlimited)
    pub max_cache_size: u64,
    /// Whether the cache is in offline mode
    pub offline_mode: bool,
    /// Information about individual cached files
    pub files: Vec<CacheFileInfo>,
}

/// Information about a cached file
#[derive(Debug, Clone)]
pub struct CacheFileInfo {
    /// Name of the cached file
    pub name: String,
    /// Size in bytes
    pub size_bytes: u64,
    /// Last modified time
    pub modified: Option<std::time::SystemTime>,
}

impl CacheStats {
    /// Get the total size formatted as a human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }
}

impl DetailedCacheStats {
    /// Get the total size formatted as a human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }

    /// Get the max cache size formatted as a human-readable string
    pub fn formatted_max_size(&self) -> String {
        if self.max_cache_size == 0 {
            "Unlimited".to_string()
        } else {
            format_bytes(self.max_cache_size)
        }
    }

    /// Get cache usage as a fraction (0.0-1.0)
    pub fn usage_percentage(&self) -> f64 {
        if self.max_cache_size == 0 {
            0.0
        } else {
            self.total_size_bytes as f64 / self.max_cache_size as f64
        }
    }
}

impl CacheFileInfo {
    /// Get the file size formatted as a human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.size_bytes)
    }

    /// Get the formatted modification time
    pub fn formatted_modified(&self) -> String {
        match &self.modified {
            Some(time) => {
                if let Ok(now) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)
                {
                    if let Ok(modified) = time.duration_since(std::time::UNIX_EPOCH) {
                        let diff_secs = now.as_secs().saturating_sub(modified.as_secs());
                        let days = diff_secs / 86400;
                        let hours = (diff_secs % 86400) / 3600;
                        let mins = (diff_secs % 3600) / 60;

                        if days > 0 {
                            format!("{days} days ago")
                        } else if hours > 0 {
                            format!("{hours} hours ago")
                        } else if mins > 0 {
                            format!("{mins} minutes ago")
                        } else {
                            "Just now".to_string()
                        }
                    } else {
                        "Unknown".to_string()
                    }
                } else {
                    "Unknown".to_string()
                }
            }
            None => "Unknown".to_string(),
        }
    }
}

/// Format a byte count as a human-readable string
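///
/// # Examples
///
/// A sketch of the expected renderings (illustrative; the function is private
/// to this module):
///
/// ```ignore
/// assert_eq!(format_bytes(512), "512 B");
/// assert_eq!(format_bytes(2048), "2.0 KB");
/// assert_eq!(format_bytes(5 * 1024 * 1024), "5.0 MB");
/// ```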
#[allow(dead_code)]
fn format_bytes(bytes: u64) -> String {
    let size = bytes as f64;
    if size < 1024.0 {
        format!("{size} B")
    } else if size < 1024.0 * 1024.0 {
        format!("{:.1} KB", size / 1024.0)
    } else if size < 1024.0 * 1024.0 * 1024.0 {
        format!("{:.1} MB", size / (1024.0 * 1024.0))
    } else {
        format!("{:.1} GB", size / (1024.0 * 1024.0 * 1024.0))
    }
}

/// Batch operation result containing success/failure information
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// Number of successful operations
    pub success_count: usize,
    /// Number of failed operations
    pub failure_count: usize,
    /// List of failed items with error messages
    pub failures: Vec<(String, String)>,
    /// Total bytes processed
    pub total_bytes: u64,
    /// Total time taken for the batch operation
    pub elapsed_time: std::time::Duration,
}

impl BatchResult {
    /// Create a new empty batch result
    pub fn new() -> Self {
        Self {
            success_count: 0,
            failure_count: 0,
            failures: Vec::new(),
            total_bytes: 0,
            elapsed_time: std::time::Duration::ZERO,
        }
    }

    /// Check if all operations were successful
    pub fn is_all_success(&self) -> bool {
        self.failure_count == 0
    }

    /// Get the success rate as a percentage
    pub fn success_rate(&self) -> f64 {
        let total = self.success_count + self.failure_count;
        if total == 0 {
            0.0
        } else {
            (self.success_count as f64 / total as f64) * 100.0
        }
    }

    /// Get a formatted one-line summary
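    ///
    /// # Examples
    ///
    /// A sketch of the summary line for a partially successful batch:
    ///
    /// ```ignore
    /// let mut result = BatchResult::new();
    /// result.success_count = 8;
    /// result.failure_count = 2;
    /// result.total_bytes = 1024;
    /// assert!(result.summary().contains("8/10 successful (80.0%)"));
    /// ```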
    pub fn summary(&self) -> String {
        format!(
            "Batch completed: {}/{} successful ({:.1}%), {} processed in {:.2}s",
            self.success_count,
            self.success_count + self.failure_count,
            self.success_rate(),
            format_bytes(self.total_bytes),
            self.elapsed_time.as_secs_f64()
        )
    }
}

impl Default for BatchResult {
    fn default() -> Self {
        Self::new()
    }
}

/// Batch operations manager for dataset caching
pub struct BatchOperations {
    cache: CacheManager,
    parallel: bool,
    max_retries: usize,
    retry_delay: std::time::Duration,
}

impl BatchOperations {
    /// Create a new batch operations manager
    pub fn new(cache: CacheManager) -> Self {
        Self {
            cache,
            parallel: true,
            max_retries: 3,
            retry_delay: std::time::Duration::from_millis(1000),
        }
    }

    /// Configure parallel processing
    pub fn with_parallel(mut self, parallel: bool) -> Self {
        self.parallel = parallel;
        self
    }

    /// Configure retry settings
    pub fn with_retry_config(
        mut self,
        max_retries: usize,
        retry_delay: std::time::Duration,
    ) -> Self {
        self.max_retries = max_retries;
        self.retry_delay = retry_delay;
        self
    }

    /// Download multiple datasets in batch
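    ///
    /// # Examples
    ///
    /// A sketch with placeholder URLs; each tuple is `(url, cached filename)`:
    ///
    /// ```ignore
    /// let manager = CacheManager::new()?;
    /// let batch_ops = BatchOperations::new(manager);
    /// let downloads = [
    ///     ("https://example.com/iris.csv", "iris.csv"),
    ///     ("https://example.com/wine.csv", "wine.csv"),
    /// ];
    /// let result = batch_ops.batch_download(&downloads);
    /// println!("{}", result.summary());
    /// ```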
    #[cfg(feature = "download")]
    pub fn batch_download(&self, urls_and_names: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        if self.parallel {
            self.batch_download_parallel(urls_and_names, &mut result)
        } else {
            self.batch_download_sequential(urls_and_names, &mut result)
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    #[cfg(feature = "download")]
    fn batch_download_parallel(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        use std::fs::File;
        use std::io::Write;
        use std::sync::{Arc, Mutex};
        use std::thread;

        // Ensure the cache directory exists before spawning threads
        if let Err(e) = self.cache.cache.ensure_cachedir() {
            result.failure_count += urls_and_names.len();
            for &(_, name) in urls_and_names {
                result
                    .failures
                    .push((name.to_string(), format!("Cache setup failed: {e}")));
            }
            return;
        }

        let result_arc = Arc::new(Mutex::new(BatchResult::new()));
        let cachedir = self.cache.cache.cachedir.clone();
        let max_retries = self.max_retries;
        let retry_delay = self.retry_delay;

        let handles: Vec<_> = urls_and_names
            .iter()
            .map(|&(url, name)| {
                let result_clone = Arc::clone(&result_arc);
                let url = url.to_string();
                let name = name.to_string();
                let cachedir = cachedir.clone();

                thread::spawn(move || {
                    let mut success = false;
                    let mut last_error = String::new();
                    let mut downloaded_data = Vec::new();

                    for attempt in 0..=max_retries {
                        match download_data(&url, false) {
                            Ok(data) => {
                                // Write directly to the filesystem (bypassing the RefCell memory cache)
                                let path = cachedir.join(&name);
                                match File::create(&path) {
                                    Ok(mut file) => match file.write_all(&data) {
                                        Ok(_) => {
                                            let mut r = result_clone.lock().unwrap();
                                            r.success_count += 1;
                                            r.total_bytes += data.len() as u64;
                                            downloaded_data = data;
                                            success = true;
                                            break;
                                        }
                                        Err(e) => {
                                            last_error = format!("Failed to write cache file: {e}");
                                        }
                                    },
                                    Err(e) => {
                                        last_error = format!("Failed to create cache file: {e}");
                                    }
                                }
                            }
                            Err(e) => {
                                last_error = format!("Download failed: {e}");
                                if attempt < max_retries {
                                    thread::sleep(retry_delay);
                                }
                            }
                        }
                    }

                    if !success {
                        let mut r = result_clone.lock().unwrap();
                        r.failure_count += 1;
                        r.failures.push((name.clone(), last_error));
                    }

                    (name, success, downloaded_data)
                })
            })
            .collect();

        // Collect results and update the memory cache for successful downloads
        let mut successful_downloads = Vec::new();
        for handle in handles {
            if let Ok((name, success, data)) = handle.join() {
                if success && !data.is_empty() {
                    successful_downloads.push((name, data));
                }
            }
        }

        // Merge the results from the Arc back into the original result
        if let Ok(arc_result) = result_arc.lock() {
            result.success_count += arc_result.success_count;
            result.failure_count += arc_result.failure_count;
            result.failures.extend(arc_result.failures.clone());
        }

        // Update the memory cache after all threads complete
        for (name, data) in successful_downloads {
            let key = FileCacheKey(name);
            self.cache.cache.mem_cache.borrow_mut().insert(key, data);
        }
    }

    #[cfg(feature = "download")]
    fn batch_download_sequential(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        for &(url, name) in urls_and_names {
            let mut success = false;
            let mut last_error = String::new();

            for attempt in 0..=self.max_retries {
                match download_data(url, false) {
                    Ok(data) => match self.cache.cache.write_cached(name, &data) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += data.len() as u64;
                            success = true;
                            break;
                        }
                        Err(e) => {
                            last_error = format!("Cache write failed: {e}");
                        }
                    },
                    Err(e) => {
                        last_error = format!("Download failed: {e}");
                        if attempt < self.max_retries {
                            std::thread::sleep(self.retry_delay);
                        }
                    }
                }
            }

            if !success {
                result.failure_count += 1;
                result.failures.push((name.to_string(), last_error));
            }
        }
    }

    /// Verify the integrity of multiple cached files
    pub fn batch_verify_integrity(&self, files_and_hashes: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        for &(filename, expected_hash) in files_and_hashes {
            let path = self.cache.cache.get_cachedpath(filename);
            if path.exists() {
                match sha256_hash_file(&path) {
                    Ok(actual_hash) => {
                        if actual_hash == expected_hash {
                            result.success_count += 1;
                            if let Ok(metadata) = std::fs::metadata(&path) {
                                result.total_bytes += metadata.len();
                            }
                        } else {
                            result.failure_count += 1;
                            result.failures.push((
                                filename.to_string(),
                                format!(
                                    "Hash mismatch: expected {expected_hash}, got {actual_hash}"
                                ),
                            ));
                        }
                    }
                    Err(e) => {
                        result.failure_count += 1;
                        result.failures.push((
                            filename.to_string(),
                            format!("Hash computation failed: {e}"),
                        ));
                    }
                }
            } else {
                result.failure_count += 1;
                result
                    .failures
                    .push((filename.to_string(), "File not found in cache".to_string()));
            }
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    /// Clean up the cache selectively based on filename patterns
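    ///
    /// # Examples
    ///
    /// A sketch: remove cached CSV files older than 30 days (given some
    /// `batch_ops: BatchOperations`):
    ///
    /// ```ignore
    /// let result = batch_ops.selective_cleanup(&["*.csv"], Some(30))?;
    /// println!("removed {} files", result.success_count);
    /// ```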
    pub fn selective_cleanup(
        &self,
        patterns: &[&str],
        max_age_days: Option<u32>,
    ) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;
        let now = std::time::SystemTime::now();

        for filename in cached_files {
            let should_remove = patterns.iter().any(|pattern| {
                filename.contains(pattern) || matches_glob_pattern(&filename, pattern)
            });

            if should_remove {
                let filepath = self.cache.cache.get_cachedpath(&filename);

                // Check age if max_age_days is specified
                let remove_due_to_age = if let Some(max_age) = max_age_days {
                    if let Ok(metadata) = std::fs::metadata(&filepath) {
                        if let Ok(modified) = metadata.modified() {
                            if let Ok(age) = now.duration_since(modified) {
                                age.as_secs() > (max_age as u64 * 24 * 3600)
                            } else {
                                false
                            }
                        } else {
                            false
                        }
                    } else {
                        false
                    }
                } else {
                    true // Remove regardless of age if no age limit is specified
                };

                if remove_due_to_age {
                    // Record the size before removal; the metadata is gone afterwards
                    let file_size = std::fs::metadata(&filepath).map(|m| m.len()).unwrap_or(0);
                    match self.cache.remove(&filename) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += file_size;
                        }
                        Err(e) => {
                            result.failure_count += 1;
                            result
                                .failures
                                .push((filename, format!("Removal failed: {e}")));
                        }
                    }
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }

    /// Process multiple cached datasets with a given function
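    ///
    /// # Examples
    ///
    /// A sketch that checks each cached file is non-empty (given some
    /// `batch_ops: BatchOperations`):
    ///
    /// ```ignore
    /// let names = vec!["iris.csv".to_string(), "wine.csv".to_string()];
    /// let result = batch_ops.batch_process(&names, |name, bytes| {
    ///     if bytes.is_empty() {
    ///         Err(format!("{name} is empty"))
    ///     } else {
    ///         Ok(())
    ///     }
    /// });
    /// assert!(result.is_all_success());
    /// ```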
1413    pub fn batch_process<F, T, E>(&self, names: &[String], processor: F) -> BatchResult
1414    where
1415        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1416        E: std::fmt::Display,
1417        T: Send,
1418    {
1419        let start_time = std::time::Instant::now();
1420        let mut result = BatchResult::new();
1421
1422        if self.parallel {
1423            self.batch_process_parallel(names, processor, &mut result)
1424        } else {
1425            self.batch_process_sequential(names, processor, &mut result)
1426        }
1427
1428        result.elapsed_time = start_time.elapsed();
1429        result
1430    }
1431
1432    fn batch_process_parallel<F, T, E>(
1433        &self,
1434        names: &[String],
1435        processor: F,
1436        result: &mut BatchResult,
1437    ) where
1438        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1439        E: std::fmt::Display,
1440        T: Send,
1441    {
1442        // For thread safety with the current cache implementation,
1443        // we need to read all data first, then process in parallel
1444        let mut data_pairs = Vec::new();
1445
1446        // Sequential read phase
1447        for name in names {
1448            match self.cache.cache.read_cached(name) {
1449                Ok(data) => data_pairs.push((name.clone(), data)),
1450                Err(e) => {
1451                    result.failure_count += 1;
1452                    result
1453                        .failures
1454                        .push((name.clone(), format!("Cache read failed: {e}")));
1455                }
1456            }
1457        }
1458
1459        // Parallel processing phase
1460        if !data_pairs.is_empty() {
1461            use std::sync::{Arc, Mutex};
1462            use std::thread;
1463
1464            let parallel_result = Arc::new(Mutex::new(BatchResult::new()));
1465            let processor = Arc::new(processor);
1466
1467            let handles: Vec<_> = data_pairs
1468                .into_iter()
1469                .map(|(name, data)| {
1470                    let result_clone = Arc::clone(&parallel_result);
1471                    let processor_clone = Arc::clone(&processor);
1472
1473                    thread::spawn(move || match processor_clone(&name, &data) {
1474                        Ok(_) => {
1475                            let mut r = result_clone.lock().unwrap();
1476                            r.success_count += 1;
1477                            r.total_bytes += data.len() as u64;
1478                        }
1479                        Err(e) => {
1480                            let mut r = result_clone.lock().unwrap();
1481                            r.failure_count += 1;
1482                            r.failures.push((name, format!("Processing failed: {e}")));
1483                        }
1484                    })
1485                })
1486                .collect();
1487
1488            for handle in handles {
1489                let _ = handle.join();
1490            }
1491
1492            // Merge parallel results into main result
1493            let parallel_result = parallel_result.lock().unwrap();
1494            result.success_count += parallel_result.success_count;
1495            result.failure_count += parallel_result.failure_count;
1496            result.total_bytes += parallel_result.total_bytes;
1497            result.failures.extend(parallel_result.failures.clone());
1498        }
1499    }
1500
1501    fn batch_process_sequential<F, T, E>(
1502        &self,
1503        names: &[String],
1504        processor: F,
1505        result: &mut BatchResult,
1506    ) where
1507        F: Fn(&str, &[u8]) -> std::result::Result<T, E>,
1508        E: std::fmt::Display,
1509    {
1510        for name in names {
1511            match self.cache.cache.read_cached(name) {
1512                Ok(data) => match processor(name, &data) {
1513                    Ok(_) => {
1514                        result.success_count += 1;
1515                        result.total_bytes += data.len() as u64;
1516                    }
1517                    Err(e) => {
1518                        result.failure_count += 1;
1519                        result
1520                            .failures
1521                            .push((name.clone(), format!("Processing failed: {e}")));
1522                    }
1523                },
1524                Err(e) => {
1525                    result.failure_count += 1;
1526                    result
1527                        .failures
1528                        .push((name.clone(), format!("Cache read failed: {e}")));
1529                }
1530            }
1531        }
1532    }
1533
1534    /// Get access to the underlying cache manager
1535    pub fn cache_manager(&self) -> &CacheManager {
1536        &self.cache
1537    }
1538
1539    /// Write data to cache
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.cache.cache.write_cached(name, data)
    }

    /// Read data from cache
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        self.cache.cache.read_cached(name)
    }

    /// List cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        self.cache.list_cached_files()
    }

    /// Print cache report
    pub fn print_cache_report(&self) -> Result<()> {
        self.cache.print_cache_report()
    }

    /// Get statistics about cached datasets
    ///
    /// On success, `success_count` holds the number of files whose on-disk
    /// metadata could be read and `total_bytes` their combined size.
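    ///
    /// ```ignore
    /// // Hedged usage sketch: assumes `batch_ops` is a `BatchOperations`
    /// // constructed as in the tests below.
    /// let stats = batch_ops.get_cache_statistics()?;
    /// println!("{}", stats.summary());
    /// ```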
    pub fn get_cache_statistics(&self) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;

        for filename in cached_files {
            let filepath = self.cache.cache.get_cachedpath(&filename);
            match std::fs::metadata(&filepath) {
                Ok(metadata) => {
                    result.success_count += 1;
                    result.total_bytes += metadata.len();
                }
                Err(e) => {
                    result.failure_count += 1;
                    result
                        .failures
                        .push((filename, format!("Metadata read failed: {e}")));
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }
}

/// Simple glob pattern matching for filenames
///
/// Only `*` on its own and patterns containing a single `*` (prefix/suffix
/// match) are supported; anything else falls back to exact comparison.
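///
/// ```ignore
/// // Illustrative only (the function is private, so this is not compiled):
/// assert!(matches_glob_pattern("data.csv", "*.csv"));
/// assert!(matches_glob_pattern("data.csv", "data.*"));
/// assert!(!matches_glob_pattern("data.json", "*.csv"));
/// ```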
#[allow(dead_code)]
fn matches_glob_pattern(filename: &str, pattern: &str) -> bool {
    if pattern == "*" {
        return true;
    }

    if pattern.contains('*') {
        let parts: Vec<&str> = pattern.split('*').collect();
        if parts.len() == 2 {
            let prefix = parts[0];
            let suffix = parts[1];
            return filename.starts_with(prefix) && filename.ends_with(suffix);
        }
    }

    filename == pattern
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[test]
    fn test_batch_result() {
        let mut result = BatchResult::new();
        assert_eq!(result.success_count, 0);
        assert_eq!(result.failure_count, 0);
        assert!(result.is_all_success());
        assert_eq!(result.success_rate(), 0.0);

        result.success_count = 8;
        result.failure_count = 2;
        result.total_bytes = 1024;

        assert!(!result.is_all_success());
        assert_eq!(result.success_rate(), 80.0);
        assert!(result.summary().contains("8/10 successful"));
        assert!(result.summary().contains("80.0%"));
    }

    #[test]
    fn test_batch_operations_creation() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager)
            .with_parallel(false)
            .with_retry_config(2, std::time::Duration::from_millis(500));

        assert!(!batch_ops.parallel);
        assert_eq!(batch_ops.max_retries, 2);
    }

    #[test]
    fn test_selective_cleanup() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Create some test files
        let test_data = vec![0u8; 100];
        batch_ops
            .cache
            .cache
            .write_cached("test1.csv", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("test2.csv", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("data.json", &test_data)
            .unwrap();

        // Clean up files matching pattern
        let result = batch_ops.selective_cleanup(&["*.csv"], None).unwrap();

        assert_eq!(result.success_count, 2); // Should remove test1.csv and test2.csv
        assert!(!batch_ops.cache.is_cached("test1.csv"));
        assert!(!batch_ops.cache.is_cached("test2.csv"));
        assert!(batch_ops.cache.is_cached("data.json")); // Should remain
    }

    #[test]
    fn test_batch_process() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager).with_parallel(false);

        // Create test files
        let test_data1 = vec![1u8; 100];
        let test_data2 = vec![2u8; 200];
        batch_ops
            .cache
            .cache
            .write_cached("file1.dat", &test_data1)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("file2.dat", &test_data2)
            .unwrap();

        let files = vec!["file1.dat".to_string(), "file2.dat".to_string()];

        // Process files (verify they're non-empty)
        let result = batch_ops.batch_process(&files, |_name, data| {
            if data.is_empty() {
                Err("Empty file")
            } else {
                Ok(data.len())
            }
        });

        assert_eq!(result.success_count, 2);
        assert_eq!(result.failure_count, 0);
        assert_eq!(result.total_bytes, 300); // 100 + 200
    }

    #[test]
    fn test_get_cache_statistics() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Start with empty cache
        let result = batch_ops.get_cache_statistics().unwrap();
        assert_eq!(result.success_count, 0);

        // Add some files
        let test_data = vec![0u8; 500];
        batch_ops
            .cache
            .cache
            .write_cached("test1.dat", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("test2.dat", &test_data)
            .unwrap();

        let result = batch_ops.get_cache_statistics().unwrap();
        assert_eq!(result.success_count, 2);
        assert_eq!(result.total_bytes, 1000);
    }

    #[test]
    fn test_matches_glob_pattern() {
        assert!(matches_glob_pattern("test.csv", "*"));
        assert!(matches_glob_pattern("test.csv", "*.csv"));
        assert!(matches_glob_pattern("test.csv", "test.*"));
        assert!(matches_glob_pattern("test.csv", "test.csv"));

        assert!(!matches_glob_pattern("test.json", "*.csv"));
        assert!(!matches_glob_pattern("other.csv", "test.*"));
    }

    #[test]
    fn test_cache_manager_creation() {
        let tempdir = TempDir::new().unwrap();
        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
    }

    #[test]
    fn test_cache_stats_formatting() {
        let tempdir = TempDir::new().unwrap();
        let stats = CacheStats {
            total_size_bytes: 1024,
            file_count: 1,
            cachedir: tempdir.path().to_path_buf(),
        };

        assert_eq!(stats.formatted_size(), "1.0 KB");

        let stats_large = CacheStats {
            total_size_bytes: 1024 * 1024 * 1024,
            file_count: 1,
            cachedir: tempdir.path().to_path_buf(),
        };

        assert_eq!(stats_large.formatted_size(), "1.0 GB");
    }

    #[test]
    fn test_hash_file_name() {
        let hash1 = DatasetCache::hash_filename("test.csv");
        let hash2 = DatasetCache::hash_filename("test.csv");
        let hash3 = DatasetCache::hash_filename("different.csv");

        assert_eq!(hash1, hash2);
        assert_ne!(hash1, hash3);
        assert_eq!(hash1.len(), 64); // Blake3 produces 32-byte hashes = 64 hex chars
    }

    #[test]
    fn test_platform_cachedir() {
        let cachedir = get_platform_cachedir();
        // Should work on any platform
        assert!(cachedir.is_some() || cfg!(target_os = "unknown"));

        if let Some(dir) = cachedir {
            assert!(dir.to_string_lossy().contains("scirs2-datasets"));
        }
    }

    #[test]
    fn test_cache_size_management() {
        let tempdir = TempDir::new().unwrap();
        let cache = DatasetCache::with_full_config(
            tempdir.path().to_path_buf(),
            10,
            3600,
            2048, // 2KB limit
            false,
        );

        // Write multiple small files to approach the limit
        let small_data1 = vec![0u8; 400];
        cache.write_cached("small1.dat", &small_data1).unwrap();

        let small_data2 = vec![0u8; 400];
        cache.write_cached("small2.dat", &small_data2).unwrap();

        let small_data3 = vec![0u8; 400];
        cache.write_cached("small3.dat", &small_data3).unwrap();

        // Write a larger file; if the running total would exceed the 2KB
        // limit, the cache evicts older entries to make room
        let medium_data = vec![0u8; 800];
        cache.write_cached("medium.dat", &medium_data).unwrap();

        // Whether or not cleanup ran, the total size must stay under the limit
        let stats = cache.get_detailed_stats().unwrap();
        assert!(stats.total_size_bytes <= cache.max_cache_size());

        // The most recent file should still be cached
        assert!(cache.is_cached("medium.dat"));
    }

    #[test]
    fn test_offline_mode() {
        let tempdir = TempDir::new().unwrap();
        let mut cache = DatasetCache::new(tempdir.path().to_path_buf());

        assert!(!cache.is_offline());
        cache.set_offline_mode(true);
        assert!(cache.is_offline());
    }

    #[test]
    fn test_detailed_stats() {
        let tempdir = TempDir::new().unwrap();
        let cache = DatasetCache::new(tempdir.path().to_path_buf());

        let test_data = vec![1, 2, 3, 4, 5];
        cache.write_cached("test.dat", &test_data).unwrap();

        let stats = cache.get_detailed_stats().unwrap();
        assert_eq!(stats.file_count, 1);
        assert_eq!(stats.total_size_bytes, test_data.len() as u64);
        assert_eq!(stats.files.len(), 1);
        assert_eq!(stats.files[0].name, "test.dat");
        assert_eq!(stats.files[0].size_bytes, test_data.len() as u64);
    }

    #[test]
    fn test_cache_manager() {
        let tempdir = TempDir::new().unwrap();
        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);

        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
        assert_eq!(stats.total_size_bytes, 0);

        assert_eq!(manager.cachedir(), &tempdir.path().to_path_buf());
    }

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(512), "512 B");
        assert_eq!(format_bytes(1024), "1.0 KB");
        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.0 GB");
    }
}