scirs2_datasets/
cache.rs

//! Dataset caching functionality

use crate::error::{DatasetsError, Result};
use scirs2_core::cache::{CacheBuilder, TTLSizedCache};
use std::cell::RefCell;
use std::fs::{self, File};
use std::hash::{Hash, Hasher};
use std::io::{Read, Write};
use std::path::{Path, PathBuf};

/// The base directory name for caching datasets
const CACHE_DIR_NAME: &str = "scirs2-datasets";

/// Default capacity (number of entries) of the in-memory cache
const DEFAULT_CACHE_SIZE: usize = 100;

/// Default TTL for in-memory cache (in seconds)
const DEFAULT_CACHE_TTL: u64 = 3600; // 1 hour

/// Default maximum cache size on disk (in bytes) - 500 MB
const DEFAULT_MAX_CACHE_SIZE: u64 = 500 * 1024 * 1024;

/// Cache directory environment variable
const CACHE_DIR_ENV: &str = "SCIRS2_CACHE_DIR";

/// Compute SHA256 hash of a file
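///
/// A minimal usage sketch (marked `ignore`: it touches the filesystem, the path is
/// hypothetical, and it assumes this module is exported as `scirs2_datasets::cache`):
///
/// ```ignore
/// use scirs2_datasets::cache::sha256_hash_file;
/// use std::path::Path;
///
/// // Hash a previously downloaded file and sanity-check the digest length.
/// let digest = sha256_hash_file(Path::new("/tmp/iris.csv")).expect("hashing failed");
/// assert_eq!(digest.len(), 64); // SHA256 hex digests are 64 characters
/// ```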
#[allow(dead_code)]
pub fn sha256_hash_file(path: &Path) -> std::result::Result<String, String> {
    use sha2::{Digest, Sha256};

    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
    let mut hasher = Sha256::new();
    let mut buffer = [0; 8192];

    loop {
        let bytes_read = file
            .read(&mut buffer)
            .map_err(|e| format!("Failed to read file: {e}"))?;
        if bytes_read == 0 {
            break;
        }
        hasher.update(&buffer[..bytes_read]);
    }

    Ok(format!("{:x}", hasher.finalize()))
}

/// Registry entry for dataset files
pub struct RegistryEntry {
    /// SHA256 hash of the file
    pub sha256: &'static str,
    /// URL to download the file from
    pub url: &'static str,
}

/// Get the platform-specific cache directory for downloading and storing datasets
///
/// The cache directory is determined in the following order:
/// 1. Environment variable `SCIRS2_CACHE_DIR` if set
/// 2. Platform-specific cache directory:
///    - Windows: `%LOCALAPPDATA%\scirs2-datasets`
///    - macOS: `~/Library/Caches/scirs2-datasets`
///    - Linux/Unix: `~/.cache/scirs2-datasets` (respects `XDG_CACHE_HOME`)
/// 3. Fallback to `~/.scirs2-datasets` if the platform-specific directory fails
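///
/// A minimal sketch (marked `ignore`: the result depends on the environment, and it
/// assumes this module is exported as `scirs2_datasets::cache`):
///
/// ```ignore
/// use scirs2_datasets::cache::get_cachedir;
///
/// // Resolve (and create, if needed) the dataset cache directory.
/// let dir = get_cachedir().expect("failed to resolve cache directory");
/// println!("datasets are cached under {}", dir.display());
/// ```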
#[allow(dead_code)]
pub fn get_cachedir() -> Result<PathBuf> {
    // Check environment variable first
    if let Ok(cachedir) = std::env::var(CACHE_DIR_ENV) {
        let cachepath = PathBuf::from(cachedir);
        ensure_directory_exists(&cachepath)?;
        return Ok(cachepath);
    }

    // Try platform-specific cache directory
    if let Some(cachedir) = get_platform_cachedir() {
        ensure_directory_exists(&cachedir)?;
        return Ok(cachedir);
    }

    // Fallback to home directory
    let homedir = dirs::home_dir()
        .ok_or_else(|| DatasetsError::CacheError("Could not find home directory".to_string()))?;
    let cachedir = homedir.join(format!(".{CACHE_DIR_NAME}"));
    ensure_directory_exists(&cachedir)?;

    Ok(cachedir)
}

/// Get platform-specific cache directory
#[allow(dead_code)]
fn get_platform_cachedir() -> Option<PathBuf> {
    #[cfg(target_os = "windows")]
    {
        dirs::data_local_dir().map(|dir| dir.join(CACHE_DIR_NAME))
    }
    #[cfg(target_os = "macos")]
    {
        dirs::home_dir().map(|dir| dir.join("Library").join("Caches").join(CACHE_DIR_NAME))
    }
    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
    {
        // Linux/Unix: Use XDG cache directory
        if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") {
            Some(PathBuf::from(xdg_cache).join(CACHE_DIR_NAME))
        } else {
            dirs::home_dir().map(|home| home.join(".cache").join(CACHE_DIR_NAME))
        }
    }
}

/// Ensure a directory exists, creating it if necessary
#[allow(dead_code)]
fn ensure_directory_exists(dir: &Path) -> Result<()> {
    if !dir.exists() {
        fs::create_dir_all(dir).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
        })?;
    }
    Ok(())
}

/// Fetch a dataset file from either cache or download it from the URL
///
/// This function will:
/// 1. Check if the file exists in the cache directory
/// 2. If not, download it from the URL in the registry entry
/// 3. Store it in the cache directory
/// 4. Return the path to the cached file
///
/// # Arguments
///
/// * `filename` - The name of the file to fetch
/// * `registry_entry` - Optional registry entry containing URL and SHA256 hash
///
/// # Returns
///
/// * `Ok(PathBuf)` - Path to the cached file
/// * `Err(String)` - Error message if fetching fails
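///
/// A minimal sketch (marked `ignore`: it performs a network download and assumes this
/// module is exported as `scirs2_datasets::cache`; the URL is a placeholder, not a
/// real registry entry):
///
/// ```ignore
/// use scirs2_datasets::cache::{fetch_data, RegistryEntry};
///
/// // A hypothetical registry entry; an empty sha256 skips hash verification.
/// static ENTRY: RegistryEntry = RegistryEntry {
///     sha256: "",
///     url: "https://example.com/datasets/iris.csv",
/// };
///
/// let path = fetch_data("iris.csv", Some(&ENTRY)).expect("fetch failed");
/// assert!(path.exists());
/// ```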
#[allow(dead_code)]
pub fn fetch_data(
    filename: &str,
    registry_entry: Option<&RegistryEntry>,
) -> std::result::Result<PathBuf, String> {
    // Get the cache directory
    let cachedir = match get_cachedir() {
        Ok(dir) => dir,
        Err(e) => return Err(format!("Failed to get cache directory: {e}")),
    };

    // Check if file exists in cache
    let cachepath = cachedir.join(filename);
    if cachepath.exists() {
        return Ok(cachepath);
    }

    // If not in cache, fetch from the URL
    let entry = match registry_entry {
        Some(entry) => entry,
        None => return Err(format!("No registry entry found for {filename}")),
    };

    // Create a temporary file to download to
    let tempdir = tempfile::tempdir().map_err(|e| format!("Failed to create temp dir: {e}"))?;
    let temp_file = tempdir.path().join(filename);

    // Download the file
    let response = ureq::get(entry.url)
        .call()
        .map_err(|e| format!("Failed to download {filename}: {e}"))?;

    // Read body into memory (ureq 3.x: use into_body which implements Read)
    let mut body = response.into_body();
    let bytes = body
        .read_to_vec()
        .map_err(|e| format!("Failed to read response body: {e}"))?;
    let mut file = std::fs::File::create(&temp_file)
        .map_err(|e| format!("Failed to create temp file: {e}"))?;
    file.write_all(&bytes)
        .map_err(|e| format!("Failed to write downloaded file: {e}"))?;

    // Verify the SHA256 hash of the downloaded file if provided
    if !entry.sha256.is_empty() {
        let computed_hash = sha256_hash_file(&temp_file)?;
        if computed_hash != entry.sha256 {
            return Err(format!(
                "SHA256 hash mismatch for {filename}: expected {}, got {computed_hash}",
                entry.sha256
            ));
        }
    }

    // Move the file to the cache
    fs::create_dir_all(&cachedir).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    if let Some(parent) = cachepath.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    }

    fs::copy(&temp_file, &cachepath).map_err(|e| format!("Failed to copy to cache: {e}"))?;

    Ok(cachepath)
}

/// Cache key for dataset caching with configuration-aware hashing
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct CacheKey {
    name: String,
    config_hash: String,
}

impl CacheKey {
    /// Create a new cache key from dataset name and configuration
    pub fn new(name: &str, config: &crate::real_world::RealWorldConfig) -> Self {
        use std::collections::hash_map::DefaultHasher;

        let mut hasher = DefaultHasher::new();
        config.use_cache.hash(&mut hasher);
        config.download_if_missing.hash(&mut hasher);
        config.return_preprocessed.hash(&mut hasher);
        config.subset.hash(&mut hasher);
        config.random_state.hash(&mut hasher);

        Self {
            name: name.to_string(),
            config_hash: format!("{:x}", hasher.finish()),
        }
    }

    /// Get the cache key as a string
    pub fn as_string(&self) -> String {
        format!("{}_{}", self.name, self.config_hash)
    }
}

/// File path wrapper for hashing
#[derive(Clone, Debug, Eq, PartialEq)]
struct FileCacheKey(String);

impl Hash for FileCacheKey {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0.hash(state);
    }
}

/// Manages caching of downloaded datasets, using both file-based and in-memory caching
///
/// This implementation uses scirs2-core::cache's TTLSizedCache for in-memory caching,
/// while maintaining the file-based persistence for long-term storage.
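///
/// A minimal sketch of the read-through behavior (marked `ignore`: it writes to a
/// temporary directory and assumes this module is exported as
/// `scirs2_datasets::cache`):
///
/// ```ignore
/// use scirs2_datasets::cache::DatasetCache;
///
/// let tempdir = tempfile::tempdir().expect("failed to create temp dir");
/// let cache = DatasetCache::new(tempdir.path().to_path_buf());
///
/// // The first write goes to disk and to the in-memory cache; the read below
/// // is served from memory.
/// cache.write_cached("example.bin", b"hello").expect("write failed");
/// let bytes = cache.read_cached("example.bin").expect("read failed");
/// assert_eq!(bytes, b"hello");
/// ```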
pub struct DatasetCache {
    /// Directory for file-based caching
    cachedir: PathBuf,
    /// In-memory cache for frequently accessed datasets
    mem_cache: RefCell<TTLSizedCache<FileCacheKey, Vec<u8>>>,
    /// Maximum cache size in bytes (0 means unlimited)
    max_cache_size: u64,
    /// Whether to operate in offline mode (no downloads)
    offline_mode: bool,
}

impl Default for DatasetCache {
    fn default() -> Self {
        let cachedir = get_cachedir().expect("Could not get cache directory");

        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        // Check if offline mode is enabled via environment variable
        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }
}

impl DatasetCache {
    /// Create a new dataset cache with the given cache directory and default memory cache
    pub fn new(cachedir: PathBuf) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with custom settings
    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with comprehensive configuration
    pub fn with_full_config(
        cachedir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size,
            offline_mode,
        }
    }

    /// Create the cache directory if it doesn't exist
    pub fn ensure_cachedir(&self) -> Result<()> {
        if !self.cachedir.exists() {
            fs::create_dir_all(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
            })?;
        }
        Ok(())
    }

    /// Get the path to a cached file
    pub fn get_cachedpath(&self, name: &str) -> PathBuf {
        self.cachedir.join(name)
    }

    /// Check if a file is already cached (either in memory or on disk)
    pub fn is_cached(&self, name: &str) -> bool {
        // Check memory cache first
        let key = FileCacheKey(name.to_string());
        if self.mem_cache.borrow_mut().get(&key).is_some() {
            return true;
        }

        // Then check file system
        self.get_cachedpath(name).exists()
    }

    /// Read a cached file as bytes
    ///
    /// This method checks the in-memory cache first, and falls back to the file system if needed.
    /// When reading from the file system, the result is also stored in the in-memory cache.
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        // Try memory cache first
        let key = FileCacheKey(name.to_string());
        if let Some(data) = self.mem_cache.borrow_mut().get(&key) {
            return Ok(data);
        }

        // Fall back to file system cache
        let path = self.get_cachedpath(name);
        if !path.exists() {
            return Err(DatasetsError::CacheError(format!(
                "Cached file does not exist: {name}"
            )));
        }

        let mut file = File::open(path)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to open cached file: {e}")))?;

        let mut buffer = Vec::new();
        file.read_to_end(&mut buffer)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to read cached file: {e}")))?;

        // Update memory cache
        self.mem_cache.borrow_mut().insert(key, buffer.clone());

        Ok(buffer)
    }

    /// Write data to both the file cache and memory cache
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.ensure_cachedir()?;

        // Check if writing this file would exceed the cache size limit
        if self.max_cache_size > 0 {
            let current_size = self.get_cache_size_bytes()?;
            let new_file_size = data.len() as u64;

            if current_size + new_file_size > self.max_cache_size {
                self.cleanup_cache_to_fit(new_file_size)?;
            }
        }

        // Write to file system cache
        let path = self.get_cachedpath(name);
        let mut file = File::create(path)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to create cache file: {e}")))?;

        file.write_all(data).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to write to cache file: {e}"))
        })?;

        // Update memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().insert(key, data.to_vec());

        Ok(())
    }

    /// Clear the entire cache (both memory and file-based)
    pub fn clear_cache(&self) -> Result<()> {
        // Clear file system cache
        if self.cachedir.exists() {
            fs::remove_dir_all(&self.cachedir)
                .map_err(|e| DatasetsError::CacheError(format!("Failed to clear cache: {e}")))?;
        }

        // Clear memory cache
        self.mem_cache.borrow_mut().clear();

        Ok(())
    }

    /// Remove a specific cached file (from both memory and file system)
    pub fn remove_cached(&self, name: &str) -> Result<()> {
        // Remove from file system
        let path = self.get_cachedpath(name);
        if path.exists() {
            fs::remove_file(path).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to remove cached file: {e}"))
            })?;
        }

        // Remove from memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().remove(&key);

        Ok(())
    }

    /// Compute a hash for a filename or URL
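    ///
    /// A small sketch (marked `ignore`; relies on blake3 hex digests being
    /// 64 characters long):
    ///
    /// ```ignore
    /// let key = DatasetCache::hash_filename("https://example.com/iris.csv");
    /// assert_eq!(key.len(), 64); // blake3 hex digest
    /// ```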
    pub fn hash_filename(name: &str) -> String {
        let hash = blake3::hash(name.as_bytes());
        hash.to_hex().to_string()
    }

    /// Get the total size of the cache in bytes
    pub fn get_cache_size_bytes(&self) -> Result<u64> {
        let mut total_size = 0u64;

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        total_size += metadata.len();
                    }
                }
            }
        }

        Ok(total_size)
    }

    /// Clean up cache to fit a new file of specified size
    ///
    /// This method removes the oldest files first until there's enough space
    /// for the new file plus some buffer space.
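    ///
    /// As a worked illustration (the numbers are hypothetical): with a 500 MB
    /// limit, the target is 80% of that, i.e. 400 MB. If the cache currently
    /// holds 450 MB and a 100 MB file is incoming, the total needed is 550 MB,
    /// so 550 - 400 = 150 MB must be freed, oldest files first.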
    fn cleanup_cache_to_fit(&self, needed_size: u64) -> Result<()> {
        if self.max_cache_size == 0 {
            return Ok(()); // No size limit
        }

        let current_size = self.get_cache_size_bytes()?;
        let target_size = (self.max_cache_size as f64 * 0.8) as u64; // Leave 20% buffer
        let total_needed = current_size + needed_size;

        if total_needed <= target_size {
            return Ok(()); // No cleanup needed
        }

        let size_to_free = total_needed - target_size;

        // Get all files with their modification times
        let mut files_with_times = Vec::new();

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        if let Ok(modified) = metadata.modified() {
                            files_with_times.push((entry.path(), metadata.len(), modified));
                        }
                    }
                }
            }
        }

        // Sort by modification time (oldest first)
        files_with_times.sort_by_key(|(_path, _size, modified)| *modified);

        // Remove files until we've freed enough space
        let mut freed_size = 0u64;
        for (path, size, _modified) in files_with_times {
            if freed_size >= size_to_free {
                break;
            }

            // Remove from memory cache first
            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
                let key = FileCacheKey(filename.to_string());
                self.mem_cache.borrow_mut().remove(&key);
            }

            // Remove file
            if let Err(e) = fs::remove_file(&path) {
                eprintln!("Warning: Failed to remove cache file {path:?}: {e}");
            } else {
                freed_size += size;
            }
        }

        Ok(())
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.offline_mode = offline;
    }

    /// Check if cache is in offline mode
    pub fn is_offline(&self) -> bool {
        self.offline_mode
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.max_cache_size = max_size;
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.max_cache_size
    }

    /// Put data into the cache (alias for write_cached)
    pub fn put(&self, name: &str, data: &[u8]) -> Result<()> {
        self.write_cached(name, data)
    }

    /// Get detailed cache information
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        let mut total_size = 0u64;
        let mut file_count = 0usize;
        let mut files = Vec::new();

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        let size = metadata.len();
                        total_size += size;
                        file_count += 1;

                        if let Some(filename) = entry.file_name().to_str() {
                            files.push(CacheFileInfo {
                                name: filename.to_string(),
                                size_bytes: size,
                                modified: metadata.modified().ok(),
                            });
                        }
                    }
                }
            }
        }

        // Sort files by size (largest first)
        files.sort_by(|a, b| b.size_bytes.cmp(&a.size_bytes));

        Ok(DetailedCacheStats {
            total_size_bytes: total_size,
            file_count,
            cachedir: self.cachedir.clone(),
            max_cache_size: self.max_cache_size,
            offline_mode: self.offline_mode,
            files,
        })
    }
}

/// Downloads data from a URL and returns it as bytes, using the cache when possible
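///
/// A minimal sketch (marked `ignore`: it requires the `download` feature and network
/// access, and assumes this module is exported as `scirs2_datasets::cache`; the URL
/// is a placeholder):
///
/// ```ignore
/// use scirs2_datasets::cache::download_data;
///
/// // The second call is served from the cache instead of the network.
/// let bytes = download_data("https://example.com/iris.csv", false).expect("download failed");
/// let again = download_data("https://example.com/iris.csv", false).expect("cache read failed");
/// assert_eq!(bytes, again);
/// ```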
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn download_data(url: &str, force_download: bool) -> Result<Vec<u8>> {
    let cache = DatasetCache::default();
    let cache_key = DatasetCache::hash_filename(url);

    // Check if the data is already cached
    if !force_download && cache.is_cached(&cache_key) {
        return cache.read_cached(&cache_key);
    }

    // Download the data
    let response = reqwest::blocking::get(url).map_err(|e| {
        DatasetsError::DownloadError(format!("Failed to download from {url}: {e}"))
    })?;

    if !response.status().is_success() {
        return Err(DatasetsError::DownloadError(format!(
            "Failed to download from {url}: HTTP status {}",
            response.status()
        )));
    }

    let data = response
        .bytes()
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to read response data: {e}")))?;

    let data_vec = data.to_vec();

    // Cache the data
    cache.write_cached(&cache_key, &data_vec)?;

    Ok(data_vec)
}

// Stub for when the download feature is not enabled
#[cfg(not(feature = "download"))]
/// Downloads data from a URL or retrieves it from cache
///
/// This is a stub implementation when the download feature is not enabled.
/// It returns an error informing the user to enable the download feature.
///
/// # Arguments
///
/// * `_url` - The URL to download from
/// * `_force_download` - If true, force a new download instead of using cache
///
/// # Returns
///
/// * An error indicating that the download feature is not enabled
#[allow(dead_code)]
pub fn download_data(_url: &str, _force_download: bool) -> Result<Vec<u8>> {
    Err(DatasetsError::Other(
        "Download feature is not enabled. Recompile with --features download".to_string(),
    ))
}

/// Cache management utilities
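///
/// A minimal sketch (marked `ignore`: it touches the filesystem and assumes this
/// module is exported as `scirs2_datasets::cache`):
///
/// ```ignore
/// use scirs2_datasets::cache::CacheManager;
///
/// let manager = CacheManager::new().expect("failed to create cache manager");
/// let stats = manager.get_stats();
/// println!(
///     "{} files, {} bytes in {}",
///     stats.file_count,
///     stats.total_size_bytes,
///     stats.cachedir.display()
/// );
/// ```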
pub struct CacheManager {
    cache: DatasetCache,
}

impl CacheManager {
    /// Create a new cache manager with default settings
    pub fn new() -> Result<Self> {
        let cachedir = get_cachedir()?;
        Ok(Self {
            cache: DatasetCache::with_config(cachedir, DEFAULT_CACHE_SIZE, DEFAULT_CACHE_TTL),
        })
    }

    /// Create a new cache manager with custom settings
    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        Self {
            cache: DatasetCache::with_config(cachedir, cache_size, ttl_seconds),
        }
    }

    /// Get a dataset from cache using CacheKey
    pub fn get(&self, key: &CacheKey) -> Result<Option<crate::utils::Dataset>> {
        let name = key.as_string();
        if self.cache.is_cached(&name) {
            match self.cache.read_cached(&name) {
                Ok(cached_data) => {
                    match serde_json::from_slice::<crate::utils::Dataset>(&cached_data) {
                        Ok(dataset) => Ok(Some(dataset)),
                        Err(e) => {
                            // If deserialization fails, consider the cache entry invalid
                            self.cache
                                .mem_cache
                                .borrow_mut()
                                .remove(&FileCacheKey(name.clone()));
                            Err(DatasetsError::CacheError(format!(
                                "Failed to deserialize cached dataset: {e}"
                            )))
                        }
                    }
                }
                Err(e) => Err(DatasetsError::CacheError(format!(
                    "Failed to read cached data: {e}"
                ))),
            }
        } else {
            Ok(None)
        }
    }

    /// Put a dataset into cache using CacheKey
    pub fn put(&self, key: &CacheKey, dataset: &crate::utils::Dataset) -> Result<()> {
        let name = key.as_string();

        // Serialize the dataset to JSON bytes for caching
        let serialized = serde_json::to_vec(dataset)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to serialize dataset: {e}")))?;

        // Write the serialized data to cache
        self.cache
            .write_cached(&name, &serialized)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to write to cache: {e}")))
    }

    /// Create a cache manager with comprehensive configuration
    pub fn with_full_config(
        cachedir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        Self {
            cache: DatasetCache::with_full_config(
                cachedir,
                cache_size,
                ttl_seconds,
                max_cache_size,
                offline_mode,
            ),
        }
    }

    /// Get basic cache statistics
    pub fn get_stats(&self) -> CacheStats {
        let cachedir = &self.cache.cachedir;
        let mut total_size = 0u64;
        let mut file_count = 0usize;

        if cachedir.exists() {
            if let Ok(entries) = fs::read_dir(cachedir) {
                for entry in entries.flatten() {
                    if let Ok(metadata) = entry.metadata() {
                        if metadata.is_file() {
                            total_size += metadata.len();
                            file_count += 1;
                        }
                    }
                }
            }
        }

        CacheStats {
            total_size_bytes: total_size,
            file_count,
            cachedir: cachedir.clone(),
        }
    }

    /// Get detailed cache statistics
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        self.cache.get_detailed_stats()
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.cache.set_offline_mode(offline);
    }

    /// Check if in offline mode
    pub fn is_offline(&self) -> bool {
        self.cache.is_offline()
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.cache.set_max_cache_size(max_size);
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.cache.max_cache_size()
    }

    /// Clear all cached data
    pub fn clear_all(&self) -> Result<()> {
        self.cache.clear_cache()
    }

    /// Remove specific cached file
    pub fn remove(&self, name: &str) -> Result<()> {
        self.cache.remove_cached(name)
    }

    /// Remove old files to free up enough space for `needed_size` additional bytes
    pub fn cleanup_old_files(&self, needed_size: u64) -> Result<()> {
        self.cache.cleanup_cache_to_fit(needed_size)
    }

    /// List all cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        let cachedir = &self.cache.cachedir;
        let mut files = Vec::new();

        if cachedir.exists() {
            let entries = fs::read_dir(cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Some(filename) = entry.file_name().to_str() {
                    files.push(filename.to_string());
                }
            }
        }

        files.sort();
        Ok(files)
    }

    /// Get cache directory path
    pub fn cachedir(&self) -> &PathBuf {
        &self.cache.cachedir
    }

    /// Check if a file is cached
    pub fn is_cached(&self, name: &str) -> bool {
        self.cache.is_cached(name)
    }

    /// Print detailed cache report
    pub fn print_cache_report(&self) -> Result<()> {
        let stats = self.get_detailed_stats()?;

        println!("=== Cache Report ===");
        println!("Cache Directory: {}", stats.cachedir.display());
        println!(
            "Total Size: {} ({} files)",
            stats.formatted_size(),
            stats.file_count
        );
        println!("Max Size: {}", stats.formatted_max_size());

        if stats.max_cache_size > 0 {
            println!("Usage: {:.1}%", stats.usage_percentage() * 100.0);
        }

        println!(
            "Offline Mode: {}",
            if stats.offline_mode {
                "Enabled"
            } else {
                "Disabled"
            }
        );

        if !stats.files.is_empty() {
            println!("\nCached Files:");
            for file in &stats.files {
                println!(
                    "  {} - {} ({})",
                    file.name,
                    file.formatted_size(),
                    file.formatted_modified()
                );
            }
        }

        Ok(())
    }
}

/// Cache statistics
pub struct CacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
}

/// Detailed cache statistics with file-level information
pub struct DetailedCacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
    /// Maximum cache size (0 = unlimited)
    pub max_cache_size: u64,
    /// Whether cache is in offline mode
    pub offline_mode: bool,
    /// Information about individual cached files
    pub files: Vec<CacheFileInfo>,
}

/// Information about a cached file
#[derive(Debug, Clone)]
pub struct CacheFileInfo {
    /// Name of the cached file
    pub name: String,
    /// Size in bytes
    pub size_bytes: u64,
    /// Last modified time
    pub modified: Option<std::time::SystemTime>,
}

impl CacheStats {
    /// Get total size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }
}

impl DetailedCacheStats {
    /// Get total size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }

    /// Get max cache size formatted as human-readable string
    pub fn formatted_max_size(&self) -> String {
        if self.max_cache_size == 0 {
            "Unlimited".to_string()
        } else {
            format_bytes(self.max_cache_size)
        }
    }

    /// Get cache usage as a ratio (0.0-1.0)
    pub fn usage_percentage(&self) -> f64 {
        if self.max_cache_size == 0 {
            0.0
        } else {
            self.total_size_bytes as f64 / self.max_cache_size as f64
        }
    }
}

impl CacheFileInfo {
    /// Get file size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.size_bytes)
    }

    /// Get formatted modification time
    pub fn formatted_modified(&self) -> String {
        match &self.modified {
            Some(time) => {
                if let Ok(now) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)
                {
                    if let Ok(modified) = time.duration_since(std::time::UNIX_EPOCH) {
                        let diff_secs = now.as_secs().saturating_sub(modified.as_secs());
                        let days = diff_secs / 86400;
                        let hours = (diff_secs % 86400) / 3600;
                        let mins = (diff_secs % 3600) / 60;

                        if days > 0 {
                            format!("{days} days ago")
                        } else if hours > 0 {
                            format!("{hours} hours ago")
                        } else if mins > 0 {
                            format!("{mins} minutes ago")
                        } else {
                            "Just now".to_string()
                        }
                    } else {
                        "Unknown".to_string()
                    }
                } else {
                    "Unknown".to_string()
                }
            }
            None => "Unknown".to_string(),
        }
    }
}

/// Format bytes as human-readable string
#[allow(dead_code)]
fn format_bytes(bytes: u64) -> String {
    let size = bytes as f64;
    if size < 1024.0 {
        format!("{size} B")
    } else if size < 1024.0 * 1024.0 {
        format!("{:.1} KB", size / 1024.0)
    } else if size < 1024.0 * 1024.0 * 1024.0 {
        format!("{:.1} MB", size / (1024.0 * 1024.0))
    } else {
        format!("{:.1} GB", size / (1024.0 * 1024.0 * 1024.0))
    }
}

/// Batch operation result containing success/failure information
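///
/// A small sketch of how the counters combine (marked `ignore`; assumes this module
/// is exported as `scirs2_datasets::cache`):
///
/// ```ignore
/// use scirs2_datasets::cache::BatchResult;
///
/// let mut result = BatchResult::new();
/// result.success_count = 8;
/// result.failure_count = 2;
/// assert_eq!(result.success_rate(), 80.0);
/// assert!(!result.is_all_success());
/// println!("{}", result.summary());
/// ```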
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// Number of successful operations
    pub success_count: usize,
    /// Number of failed operations
    pub failure_count: usize,
    /// List of failed items with error messages
    pub failures: Vec<(String, String)>,
    /// Total bytes processed
    pub total_bytes: u64,
    /// Total time taken for the batch operation
    pub elapsed_time: std::time::Duration,
}

impl BatchResult {
    /// Create a new empty batch result
    pub fn new() -> Self {
        Self {
            success_count: 0,
            failure_count: 0,
            failures: Vec::new(),
            total_bytes: 0,
            elapsed_time: std::time::Duration::ZERO,
        }
    }

    /// Check if all operations were successful
    pub fn is_all_success(&self) -> bool {
        self.failure_count == 0
    }

    /// Get success rate as percentage
    pub fn success_rate(&self) -> f64 {
        let total = self.success_count + self.failure_count;
        if total == 0 {
            0.0
        } else {
            (self.success_count as f64 / total as f64) * 100.0
        }
    }

    /// Get formatted summary
    pub fn summary(&self) -> String {
        format!(
            "Batch completed: {}/{} successful ({:.1}%), {} processed in {:.2}s",
            self.success_count,
            self.success_count + self.failure_count,
            self.success_rate(),
            format_bytes(self.total_bytes),
            self.elapsed_time.as_secs_f64()
        )
    }
}

impl Default for BatchResult {
    fn default() -> Self {
        Self::new()
    }
}

/// Batch operations manager for dataset caching
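///
/// A minimal configuration sketch (marked `ignore`: it touches the filesystem and
/// assumes this module is exported as `scirs2_datasets::cache`):
///
/// ```ignore
/// use scirs2_datasets::cache::{BatchOperations, CacheManager};
/// use std::time::Duration;
///
/// let manager = CacheManager::new().expect("failed to create cache manager");
/// let batch_ops = BatchOperations::new(manager)
///     .with_parallel(false)
///     .with_retry_config(2, Duration::from_millis(500));
/// ```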
pub struct BatchOperations {
    cache: CacheManager,
    parallel: bool,
    max_retries: usize,
    retry_delay: std::time::Duration,
}

impl BatchOperations {
    /// Create a new batch operations manager
    pub fn new(cache: CacheManager) -> Self {
        Self {
            cache,
            parallel: true,
            max_retries: 3,
            retry_delay: std::time::Duration::from_millis(1000),
        }
    }

    /// Configure parallel processing
    pub fn with_parallel(mut self, parallel: bool) -> Self {
        self.parallel = parallel;
        self
    }

    /// Configure retry settings
    pub fn with_retry_config(
        mut self,
        max_retries: usize,
        retry_delay: std::time::Duration,
    ) -> Self {
        self.max_retries = max_retries;
        self.retry_delay = retry_delay;
        self
    }

    /// Download multiple datasets in batch
    #[cfg(feature = "download")]
    pub fn batch_download(&self, urls_and_names: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        if self.parallel {
            self.batch_download_parallel(urls_and_names, &mut result)
        } else {
            self.batch_download_sequential(urls_and_names, &mut result)
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    #[cfg(feature = "download")]
    fn batch_download_parallel(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        use std::fs::File;
        use std::io::Write;
        use std::sync::{Arc, Mutex};
        use std::thread;

        // Ensure cache directory exists before spawning threads
        if let Err(e) = self.cache.cache.ensure_cachedir() {
            result.failure_count += urls_and_names.len();
            for &(_, name) in urls_and_names {
                result
                    .failures
                    .push((name.to_string(), format!("Cache setup failed: {e}")));
            }
            return;
        }

        let result_arc = Arc::new(Mutex::new(BatchResult::new()));
        let cachedir = self.cache.cache.cachedir.clone();
        let max_retries = self.max_retries;
        let retry_delay = self.retry_delay;

        let handles: Vec<_> = urls_and_names
            .iter()
            .map(|&(url, name)| {
                let result_clone = Arc::clone(&result_arc);
                let url = url.to_string();
                let name = name.to_string();
                let cachedir = cachedir.clone();

                thread::spawn(move || {
                    let mut success = false;
                    let mut last_error = String::new();
                    let mut downloaded_data = Vec::new();

                    for attempt in 0..=max_retries {
                        match download_data(&url, false) {
                            Ok(data) => {
                                // Write directly to filesystem (bypassing RefCell memory cache)
                                let path = cachedir.join(&name);
                                match File::create(&path) {
                                    Ok(mut file) => match file.write_all(&data) {
                                        Ok(_) => {
                                            let mut r =
                                                result_clone.lock().expect("Operation failed");
                                            r.success_count += 1;
                                            r.total_bytes += data.len() as u64;
                                            downloaded_data = data;
                                            success = true;
                                            break;
                                        }
                                        Err(e) => {
                                            last_error = format!("Failed to write cache file: {e}");
                                        }
                                    },
                                    Err(e) => {
                                        last_error = format!("Failed to create cache file: {e}");
                                    }
                                }
                            }
                            Err(e) => {
                                last_error = format!("Download failed: {e}");
                                if attempt < max_retries {
                                    thread::sleep(retry_delay);
                                }
                            }
                        }
                    }

                    if !success {
                        let mut r = result_clone.lock().expect("Operation failed");
                        r.failure_count += 1;
                        r.failures.push((name.clone(), last_error));
                    }

                    (name, success, downloaded_data)
                })
            })
            .collect();

        // Collect results and update memory cache for successful downloads
        let mut successful_downloads = Vec::new();
        for handle in handles {
            if let Ok((name, success, data)) = handle.join() {
                if success && !data.is_empty() {
                    successful_downloads.push((name, data));
                }
            }
        }

        // Merge the results from the arc back into the original result
        if let Ok(arc_result) = result_arc.lock() {
            result.success_count += arc_result.success_count;
            result.failure_count += arc_result.failure_count;
            result.total_bytes += arc_result.total_bytes;
            result.failures.extend(arc_result.failures.clone());
        }

        // Update memory cache after all threads complete
        for (name, data) in successful_downloads {
            let key = FileCacheKey(name);
            self.cache.cache.mem_cache.borrow_mut().insert(key, data);
        }
    }

    #[cfg(feature = "download")]
    fn batch_download_sequential(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        for &(url, name) in urls_and_names {
            let mut success = false;
            let mut last_error = String::new();

            for attempt in 0..=self.max_retries {
                match download_data(url, false) {
                    Ok(data) => match self.cache.cache.write_cached(name, &data) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += data.len() as u64;
                            success = true;
                            break;
                        }
                        Err(e) => {
                            last_error = format!("Cache write failed: {e}");
                        }
                    },
                    Err(e) => {
                        last_error = format!("Download failed: {e}");
                        if attempt < self.max_retries {
                            std::thread::sleep(self.retry_delay);
                        }
                    }
                }
            }

            if !success {
                result.failure_count += 1;
                result.failures.push((name.to_string(), last_error));
            }
        }
    }

    /// Verify integrity of multiple cached files
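    ///
    /// A minimal sketch (marked `ignore`; assumes a `batch_ops: BatchOperations`
    /// built as in the struct-level example, and the filename/hash pair is a
    /// placeholder):
    ///
    /// ```ignore
    /// let expected = [("iris.csv", "0123abcd")]; // hypothetical sha256
    /// let report = batch_ops.batch_verify_integrity(&expected);
    /// if !report.is_all_success() {
    ///     for (file, err) in &report.failures {
    ///         eprintln!("{file}: {err}");
    ///     }
    /// }
    /// ```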
    pub fn batch_verify_integrity(&self, files_and_hashes: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        for &(filename, expected_hash) in files_and_hashes {
            let path = self.cache.cache.get_cachedpath(filename);
            if path.exists() {
                match sha256_hash_file(&path) {
                    Ok(actual_hash) => {
                        if actual_hash == expected_hash {
                            result.success_count += 1;
                            if let Ok(metadata) = std::fs::metadata(&path) {
                                result.total_bytes += metadata.len();
                            }
                        } else {
                            result.failure_count += 1;
                            result.failures.push((
                                filename.to_string(),
                                format!(
                                    "Hash mismatch: expected {expected_hash}, got {actual_hash}"
                                ),
                            ));
                        }
                    }
                    Err(e) => {
                        result.failure_count += 1;
                        result.failures.push((
                            filename.to_string(),
                            format!("Hash computation failed: {e}"),
                        ));
                    }
                }
            } else {
                result.failure_count += 1;
                result
                    .failures
                    .push((filename.to_string(), "File not found in cache".to_string()));
            }
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    /// Clean up cache selectively based on patterns
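    ///
    /// A minimal sketch (marked `ignore`; assumes a `batch_ops: BatchOperations`
    /// built as in the struct-level example, with illustrative pattern and age):
    ///
    /// ```ignore
    /// // Remove cached CSV files older than 30 days.
    /// let report = batch_ops.selective_cleanup(&["*.csv"], Some(30)).expect("cleanup failed");
    /// println!("{}", report.summary());
    /// ```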
    pub fn selective_cleanup(
        &self,
        patterns: &[&str],
        max_age_days: Option<u32>,
    ) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;
        let now = std::time::SystemTime::now();

        for filename in cached_files {
            let should_remove = patterns.iter().any(|pattern| {
                filename.contains(pattern) || matches_glob_pattern(&filename, pattern)
            });

            if should_remove {
                let filepath = self.cache.cache.get_cachedpath(&filename);

                // Check age if max_age_days is specified
                let remove_due_to_age = if let Some(max_age) = max_age_days {
                    if let Ok(metadata) = std::fs::metadata(&filepath) {
                        if let Ok(modified) = metadata.modified() {
                            if let Ok(age) = now.duration_since(modified) {
                                age.as_secs() > (max_age as u64 * 24 * 3600)
                            } else {
                                false
                            }
                        } else {
                            false
                        }
                    } else {
                        false
                    }
                } else {
                    true // Remove regardless of age if no age limit specified
                };

                if remove_due_to_age {
                    // Record the size before removal; the metadata is gone afterwards
                    let file_size = std::fs::metadata(&filepath).map(|m| m.len()).unwrap_or(0);
                    match self.cache.remove(&filename) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += file_size;
                        }
                        Err(e) => {
                            result.failure_count += 1;
                            result
                                .failures
                                .push((filename, format!("Removal failed: {e}")));
                        }
                    }
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }

    /// Process multiple datasets with a given function
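    ///
    /// A minimal sketch (marked `ignore`; assumes a `batch_ops: BatchOperations`
    /// built as in the struct-level example, with placeholder dataset names and a
    /// processor that just counts bytes):
    ///
    /// ```ignore
    /// let names = vec!["a.bin".to_string(), "b.bin".to_string()];
    /// let report = batch_ops.batch_process(&names, |name, data| {
    ///     println!("{name}: {} bytes", data.len());
    ///     Ok::<usize, String>(data.len())
    /// });
    /// println!("{}", report.summary());
    /// ```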
1414    pub fn batch_process<F, T, E>(&self, names: &[String], processor: F) -> BatchResult
1415    where
1416        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1417        E: std::fmt::Display,
1418        T: Send,
1419    {
1420        let start_time = std::time::Instant::now();
1421        let mut result = BatchResult::new();
1422
1423        if self.parallel {
1424            self.batch_process_parallel(names, processor, &mut result)
1425        } else {
1426            self.batch_process_sequential(names, processor, &mut result)
1427        }
1428
1429        result.elapsed_time = start_time.elapsed();
1430        result
1431    }
1432
1433    fn batch_process_parallel<F, T, E>(
1434        &self,
1435        names: &[String],
1436        processor: F,
1437        result: &mut BatchResult,
1438    ) where
1439        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1440        E: std::fmt::Display,
1441        T: Send,
1442    {
1443        // For thread safety with the current cache implementation,
1444        // we need to read all data first, then process in parallel
1445        let mut data_pairs = Vec::new();
1446
1447        // Sequential read phase
1448        for name in names {
1449            match self.cache.cache.read_cached(name) {
1450                Ok(data) => data_pairs.push((name.clone(), data)),
1451                Err(e) => {
1452                    result.failure_count += 1;
1453                    result
1454                        .failures
1455                        .push((name.clone(), format!("Cache read failed: {e}")));
1456                }
1457            }
1458        }
1459
1460        // Parallel processing phase
1461        if !data_pairs.is_empty() {
1462            use std::sync::{Arc, Mutex};
1463            use std::thread;
1464
1465            let parallel_result = Arc::new(Mutex::new(BatchResult::new()));
1466            let processor = Arc::new(processor);
1467
1468            let handles: Vec<_> = data_pairs
1469                .into_iter()
1470                .map(|(name, data)| {
1471                    let result_clone = Arc::clone(&parallel_result);
1472                    let processor_clone = Arc::clone(&processor);
1473
1474                    thread::spawn(move || match processor_clone(&name, &data) {
1475                        Ok(_) => {
1476                            let mut r = result_clone.lock().expect("Operation failed");
1477                            r.success_count += 1;
1478                            r.total_bytes += data.len() as u64;
1479                        }
1480                        Err(e) => {
1481                            let mut r = result_clone.lock().expect("Operation failed");
1482                            r.failure_count += 1;
1483                            r.failures.push((name, format!("Processing failed: {e}")));
1484                        }
1485                    })
1486                })
1487                .collect();
1488
1489            for handle in handles {
1490                let _ = handle.join();
1491            }
1492
1493            // Merge parallel results into main result
1494            let parallel_result = parallel_result.lock().expect("Operation failed");
1495            result.success_count += parallel_result.success_count;
1496            result.failure_count += parallel_result.failure_count;
1497            result.total_bytes += parallel_result.total_bytes;
1498            result.failures.extend(parallel_result.failures.clone());
1499        }
1500    }
1501
1502    fn batch_process_sequential<F, T, E>(
1503        &self,
1504        names: &[String],
1505        processor: F,
1506        result: &mut BatchResult,
1507    ) where
1508        F: Fn(&str, &[u8]) -> std::result::Result<T, E>,
1509        E: std::fmt::Display,
1510    {
1511        for name in names {
1512            match self.cache.cache.read_cached(name) {
1513                Ok(data) => match processor(name, &data) {
1514                    Ok(_) => {
1515                        result.success_count += 1;
1516                        result.total_bytes += data.len() as u64;
1517                    }
1518                    Err(e) => {
1519                        result.failure_count += 1;
1520                        result
1521                            .failures
1522                            .push((name.clone(), format!("Processing failed: {e}")));
1523                    }
1524                },
1525                Err(e) => {
1526                    result.failure_count += 1;
1527                    result
1528                        .failures
1529                        .push((name.clone(), format!("Cache read failed: {e}")));
1530                }
1531            }
1532        }
1533    }
1534
1535    /// Get access to the underlying cache manager
1536    pub fn cache_manager(&self) -> &CacheManager {
1537        &self.cache
1538    }
1539
1540    /// Write data to cache
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.cache.cache.write_cached(name, data)
    }

    /// Read data from cache
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        self.cache.cache.read_cached(name)
    }

    /// List cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        self.cache.list_cached_files()
    }

    /// Print cache report
    pub fn print_cache_report(&self) -> Result<()> {
        self.cache.print_cache_report()
    }

    /// Get statistics about cached datasets
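    ///
    /// Walks every cached file and accumulates the file count and total size
    /// into a `BatchResult`. A sketch of typical use (import path assumed,
    /// as above):
    ///
    /// ```ignore
    /// use scirs2_datasets::cache::{BatchOperations, CacheManager};
    ///
    /// let tempdir = tempfile::TempDir::new().unwrap();
    /// let batch_ops = BatchOperations::new(CacheManager::with_config(
    ///     tempdir.path().to_path_buf(),
    ///     10,
    ///     3600,
    /// ));
    /// let stats = batch_ops.get_cache_statistics().unwrap();
    /// println!("{}", stats.summary());
    /// ```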
    pub fn get_cache_statistics(&self) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;

        for filename in cached_files {
            let filepath = self.cache.cache.get_cachedpath(&filename);
            match std::fs::metadata(&filepath) {
                Ok(metadata) => {
                    result.success_count += 1;
                    result.total_bytes += metadata.len();
                }
                Err(e) => {
                    result.failure_count += 1;
                    result
                        .failures
                        .push((filename, format!("Metadata read failed: {e}")));
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }
}

/// Simple glob pattern matching for filenames
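///
/// Supports three pattern forms: a bare `*` (matches any filename), a
/// single-`*` pattern such as `*.csv` or `test.*` (prefix/suffix match), and
/// an exact filename. Patterns containing more than one `*` fall back to
/// exact comparison.
///
/// ```ignore
/// assert!(matches_glob_pattern("test.csv", "*.csv"));
/// assert!(!matches_glob_pattern("test.json", "*.csv"));
/// ```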
#[allow(dead_code)]
fn matches_glob_pattern(filename: &str, pattern: &str) -> bool {
    if pattern == "*" {
        return true;
    }

    if pattern.contains('*') {
        let parts: Vec<&str> = pattern.split('*').collect();
        if parts.len() == 2 {
            let prefix = parts[0];
            let suffix = parts[1];
            // Require the filename to be long enough that the prefix and
            // suffix cannot overlap (e.g. "a*a" must not match "a").
            return filename.len() >= prefix.len() + suffix.len()
                && filename.starts_with(prefix)
                && filename.ends_with(suffix);
        }
    }

    filename == pattern
}
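
// Note: for richer glob semantics (multiple `*`, `?`, character classes), a
// dedicated crate such as `glob` or `globset` could replace this helper; the
// hand-rolled version keeps the dependency footprint small.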

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[test]
    fn test_batch_result() {
        let mut result = BatchResult::new();
        assert_eq!(result.success_count, 0);
        assert_eq!(result.failure_count, 0);
        assert!(result.is_all_success());
        assert_eq!(result.success_rate(), 0.0);

        result.success_count = 8;
        result.failure_count = 2;
        result.total_bytes = 1024;

        assert!(!result.is_all_success());
        assert_eq!(result.success_rate(), 80.0);
        assert!(result.summary().contains("8/10 successful"));
        assert!(result.summary().contains("80.0%"));
    }

    #[test]
    fn test_batch_operations_creation() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager)
            .with_parallel(false)
            .with_retry_config(2, std::time::Duration::from_millis(500));

        assert!(!batch_ops.parallel);
        assert_eq!(batch_ops.max_retries, 2);
    }

    #[test]
    fn test_selective_cleanup() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Create some test files
        let test_data = vec![0u8; 100];
        batch_ops
            .cache
            .cache
            .write_cached("test1.csv", &test_data)
            .expect("Test: cache operation failed");
        batch_ops
            .cache
            .cache
            .write_cached("test2.csv", &test_data)
            .expect("Test: cache operation failed");
        batch_ops
            .cache
            .cache
            .write_cached("data.json", &test_data)
            .expect("Test: cache operation failed");

        // Clean up files matching pattern
        let result = batch_ops
            .selective_cleanup(&["*.csv"], None)
            .expect("selective cleanup failed");

        assert_eq!(result.success_count, 2); // Should remove test1.csv and test2.csv
        assert!(!batch_ops.cache.is_cached("test1.csv"));
        assert!(!batch_ops.cache.is_cached("test2.csv"));
        assert!(batch_ops.cache.is_cached("data.json")); // Should remain
    }

    #[test]
    fn test_batch_process() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager).with_parallel(false);

        // Create test files
        let test_data1 = vec![1u8; 100];
        let test_data2 = vec![2u8; 200];
        batch_ops
            .cache
            .cache
            .write_cached("file1.dat", &test_data1)
            .expect("Test: cache operation failed");
        batch_ops
            .cache
            .cache
            .write_cached("file2.dat", &test_data2)
            .expect("Test: cache operation failed");

        let files = vec!["file1.dat".to_string(), "file2.dat".to_string()];

        // Process files (verify they're non-empty)
        let result = batch_ops.batch_process(&files, |_name, data| {
            if data.is_empty() {
                Err("Empty file")
            } else {
                Ok(data.len())
            }
        });

        assert_eq!(result.success_count, 2);
        assert_eq!(result.failure_count, 0);
        assert_eq!(result.total_bytes, 300); // 100 + 200
    }

    #[test]
    fn test_get_cache_statistics() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Start with empty cache
        let result = batch_ops.get_cache_statistics().expect("statistics failed");
        assert_eq!(result.success_count, 0);

        // Add some files
        let test_data = vec![0u8; 500];
        batch_ops
            .cache
            .cache
            .write_cached("test1.dat", &test_data)
            .expect("Test: cache operation failed");
        batch_ops
            .cache
            .cache
            .write_cached("test2.dat", &test_data)
            .expect("Test: cache operation failed");

        let result = batch_ops.get_cache_statistics().expect("statistics failed");
        assert_eq!(result.success_count, 2);
        assert_eq!(result.total_bytes, 1000);
    }

    #[test]
    fn test_matches_glob_pattern() {
        assert!(matches_glob_pattern("test.csv", "*"));
        assert!(matches_glob_pattern("test.csv", "*.csv"));
        assert!(matches_glob_pattern("test.csv", "test.*"));
        assert!(matches_glob_pattern("test.csv", "test.csv"));

        assert!(!matches_glob_pattern("test.json", "*.csv"));
        assert!(!matches_glob_pattern("other.csv", "test.*"));
    }

    #[test]
    fn test_cache_manager_creation() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
    }

    #[test]
    fn test_cache_stats_formatting() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let stats = CacheStats {
            total_size_bytes: 1024,
            file_count: 1,
            cachedir: tempdir.path().to_path_buf(),
        };

        assert_eq!(stats.formatted_size(), "1.0 KB");

        let stats_large = CacheStats {
            total_size_bytes: 1024 * 1024 * 1024,
            file_count: 1,
            cachedir: tempdir.path().to_path_buf(),
        };

        assert_eq!(stats_large.formatted_size(), "1.0 GB");
    }

    #[test]
    fn test_hash_file_name() {
        let hash1 = DatasetCache::hash_filename("test.csv");
        let hash2 = DatasetCache::hash_filename("test.csv");
        let hash3 = DatasetCache::hash_filename("different.csv");

        assert_eq!(hash1, hash2);
        assert_ne!(hash1, hash3);
        assert_eq!(hash1.len(), 64); // Blake3 produces 32-byte hashes = 64 hex chars
    }

    #[test]
    fn test_platform_cachedir() {
        let cachedir = get_platform_cachedir();
        // get_platform_cachedir should return Some on all supported targets
        assert!(cachedir.is_some() || cfg!(target_os = "unknown"));

        if let Some(dir) = cachedir {
            assert!(dir.to_string_lossy().contains("scirs2-datasets"));
        }
    }

    #[test]
    fn test_cache_size_management() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let cache = DatasetCache::with_full_config(
            tempdir.path().to_path_buf(),
            10,
            3600,
            2048, // 2KB limit
            false,
        );

        // Write multiple small files to approach the limit
        let small_data1 = vec![0u8; 400];
        cache
            .write_cached("small1.dat", &small_data1)
            .expect("write failed");

        let small_data2 = vec![0u8; 400];
        cache
            .write_cached("small2.dat", &small_data2)
            .expect("write failed");

        let small_data3 = vec![0u8; 400];
        cache
            .write_cached("small3.dat", &small_data3)
            .expect("write failed");

        // Now write a file that should trigger cleanup
        let medium_data = vec![0u8; 800];
        cache
            .write_cached("medium.dat", &medium_data)
            .expect("write failed");

        // The cache should have cleaned up to stay under the limit
        let stats = cache.get_detailed_stats().expect("stats failed");
        assert!(stats.total_size_bytes <= cache.max_cache_size());

        // The most recent file should still be cached
        assert!(cache.is_cached("medium.dat"));
    }

    #[test]
    fn test_offline_mode() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let mut cache = DatasetCache::new(tempdir.path().to_path_buf());

        assert!(!cache.is_offline());
        cache.set_offline_mode(true);
        assert!(cache.is_offline());
    }

    #[test]
    fn test_detailed_stats() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let cache = DatasetCache::new(tempdir.path().to_path_buf());

        let test_data = vec![1, 2, 3, 4, 5];
        cache
            .write_cached("test.dat", &test_data)
            .expect("write failed");

        let stats = cache.get_detailed_stats().expect("stats failed");
        assert_eq!(stats.file_count, 1);
        assert_eq!(stats.total_size_bytes, test_data.len() as u64);
        assert_eq!(stats.files.len(), 1);
        assert_eq!(stats.files[0].name, "test.dat");
        assert_eq!(stats.files[0].size_bytes, test_data.len() as u64);
    }

    #[test]
    fn test_cache_manager() {
        let tempdir = TempDir::new().expect("failed to create temp dir");
        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);

        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
        assert_eq!(stats.total_size_bytes, 0);

        assert_eq!(manager.cachedir(), &tempdir.path().to_path_buf());
    }

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(512), "512 B");
        assert_eq!(format_bytes(1024), "1.0 KB");
        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.0 GB");
    }
}