// scirs2_datasets/cache.rs
1//! Dataset caching functionality
2
3use crate::error::{DatasetsError, Result};
4use scirs2_core::cache::{CacheBuilder, TTLSizedCache};
5use std::cell::RefCell;
6use std::fs::{self, File};
7use std::hash::{Hash, Hasher};
8use std::io::{Read, Write};
9use std::path::{Path, PathBuf};
10
/// The base directory name for caching datasets
const CACHE_DIR_NAME: &str = "scirs2-datasets";

/// Default maximum number of entries held in the in-memory cache
const DEFAULT_CACHE_SIZE: usize = 100;

/// Default TTL for in-memory cache entries (in seconds)
const DEFAULT_CACHE_TTL: u64 = 3600; // 1 hour

/// Default maximum cache size on disk (in bytes) - 500 MB
const DEFAULT_MAX_CACHE_SIZE: u64 = 500 * 1024 * 1024;

/// Environment variable that overrides the cache directory location
const CACHE_DIR_ENV: &str = "SCIRS2_CACHE_DIR";
25
26/// Compute SHA256 hash of a file
27#[allow(dead_code)]
28pub fn sha256_hash_file(path: &Path) -> std::result::Result<String, String> {
29    use sha2::{Digest, Sha256};
30
31    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
32    let mut hasher = Sha256::new();
33    let mut buffer = [0; 8192];
34
35    loop {
36        let bytes_read = file
37            .read(&mut buffer)
38            .map_err(|e| format!("Failed to read file: {e}"))?;
39        if bytes_read == 0 {
40            break;
41        }
42        hasher.update(&buffer[..bytes_read]);
43    }
44
45    Ok(format!("{:x}", hasher.finalize()))
46}
47
/// Registry entry for dataset files
///
/// Describes where a dataset file can be downloaded from and how to verify
/// its integrity; consumed by `fetch_data`.
pub struct RegistryEntry {
    /// SHA256 hash of the file (an empty string skips verification in `fetch_data`)
    pub sha256: &'static str,
    /// URL to download the file from
    pub url: &'static str,
}
55
56/// Get the platform-specific cache directory for downloading and storing datasets
57///
58/// The cache directory is determined in the following order:
59/// 1. Environment variable `SCIRS2_CACHE_DIR` if set
60/// 2. Platform-specific cache directory:
61///    - Windows: `%LOCALAPPDATA%\scirs2-datasets`
62///    - macOS: `~/Library/Caches/scirs2-datasets`
63///    - Linux/Unix: `~/.cache/scirs2-datasets` (respects XDG_CACHE_HOME)
64/// 3. Fallback to `~/.scirs2-datasets` if platform-specific directory fails
65#[allow(dead_code)]
66pub fn get_cachedir() -> Result<PathBuf> {
67    // Check environment variable first
68    if let Ok(cachedir) = std::env::var(CACHE_DIR_ENV) {
69        let cachepath = PathBuf::from(cachedir);
70        ensuredirectory_exists(&cachepath)?;
71        return Ok(cachepath);
72    }
73
74    // Try platform-specific cache directory
75    if let Some(cachedir) = get_platform_cachedir() {
76        ensuredirectory_exists(&cachedir)?;
77        return Ok(cachedir);
78    }
79
80    // Fallback to home directory
81    let homedir = crate::platform_dirs::home_dir()
82        .ok_or_else(|| DatasetsError::CacheError("Could not find home directory".to_string()))?;
83    let cachedir = homedir.join(format!(".{CACHE_DIR_NAME}"));
84    ensuredirectory_exists(&cachedir)?;
85
86    Ok(cachedir)
87}
88
89/// Get platform-specific cache directory
90#[allow(dead_code)]
91fn get_platform_cachedir() -> Option<PathBuf> {
92    #[cfg(target_os = "windows")]
93    {
94        crate::platform_dirs::data_local_dir().map(|dir| dir.join(CACHE_DIR_NAME))
95    }
96    #[cfg(target_os = "macos")]
97    {
98        crate::platform_dirs::home_dir()
99            .map(|dir| dir.join("Library").join("Caches").join(CACHE_DIR_NAME))
100    }
101    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
102    {
103        // Linux/Unix: Use XDG cache directory
104        if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") {
105            Some(PathBuf::from(xdg_cache).join(CACHE_DIR_NAME))
106        } else {
107            crate::platform_dirs::home_dir().map(|home| home.join(".cache").join(CACHE_DIR_NAME))
108        }
109    }
110}
111
112/// Ensure a directory exists, creating it if necessary
113#[allow(dead_code)]
114fn ensuredirectory_exists(dir: &Path) -> Result<()> {
115    if !dir.exists() {
116        fs::create_dir_all(dir).map_err(|e| {
117            DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
118        })?;
119    }
120    Ok(())
121}
122
/// Fetch a dataset file from either cache or download it from the URL
///
/// This function will:
/// 1. Check if the file exists in the cache directory
/// 2. If not, download it from the URL in the registry entry
/// 3. Verify its SHA256 hash (when the registry provides one)
/// 4. Store it in the cache directory and return the cached path
///
/// # Arguments
///
/// * `filename` - The name of the file to fetch
/// * `registry_entry` - Optional registry entry containing URL and SHA256 hash
///
/// # Returns
///
/// * `Ok(PathBuf)` - Path to the cached file
/// * `Err(String)` - Error message if fetching fails
#[cfg(feature = "download-sync")]
#[allow(dead_code)]
pub fn fetch_data(
    filename: &str,
    registry_entry: Option<&RegistryEntry>,
) -> std::result::Result<PathBuf, String> {
    // Get the cache directory
    let cachedir = get_cachedir().map_err(|e| format!("Failed to get cache directory: {e}"))?;

    // Fast path: already cached on disk.
    let cachepath = cachedir.join(filename);
    if cachepath.exists() {
        return Ok(cachepath);
    }

    // If not in cache we need a registry entry to know where to download from.
    // (The original message printed a literal "(unknown)" here instead of the
    // filename — fixed to include the actual name.)
    let entry = registry_entry.ok_or_else(|| format!("No registry entry found for {filename}"))?;

    // Download into a temporary directory first so a failed or partial
    // download never ends up in the cache.
    let tempdir = tempfile::tempdir().map_err(|e| format!("Failed to create temp dir: {e}"))?;
    let temp_file = tempdir.path().join(filename);

    // Download the file
    let response = ureq::get(entry.url)
        .call()
        .map_err(|e| format!("Failed to download {filename}: {e}"))?;

    // Read body into memory (ureq 3.x: use into_body which implements Read)
    let mut body = response.into_body();
    let bytes = body
        .read_to_vec()
        .map_err(|e| format!("Failed to read response body: {e}"))?;
    let mut file = std::fs::File::create(&temp_file)
        .map_err(|e| format!("Failed to create temp file: {e}"))?;
    file.write_all(&bytes)
        .map_err(|e| format!("Failed to write downloaded file: {e}"))?;

    // Verify the SHA256 hash of the downloaded file if provided
    if !entry.sha256.is_empty() {
        let computed_hash = sha256_hash_file(&temp_file)?;
        if computed_hash != entry.sha256 {
            return Err(format!(
                "SHA256 hash mismatch for {filename}: expected {}, got {computed_hash}",
                entry.sha256
            ));
        }
    }

    // Move the file to the cache. Creating the parent of `cachepath` also
    // creates `cachedir` itself, so a single create_dir_all suffices (the
    // original created `cachedir` and then the parent again redundantly).
    if let Some(parent) = cachepath.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    }

    fs::copy(&temp_file, &cachepath).map_err(|e| format!("Failed to copy to cache: {e}"))?;

    Ok(cachepath)
}
204
205/// Stub for fetch_data when download-sync feature is disabled
206#[cfg(not(feature = "download-sync"))]
207#[allow(dead_code)]
208pub fn fetch_data(
209    _filename: &str,
210    _registry_entry: Option<&RegistryEntry>,
211) -> std::result::Result<PathBuf, String> {
212    Err("Synchronous download feature is disabled. Enable 'download-sync' feature.".to_string())
213}
214
/// Cache key for dataset caching with configuration-aware hashing
///
/// Two loads of the same dataset with different configurations produce
/// different keys (see `CacheKey::new`), so their results are cached
/// independently.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct CacheKey {
    /// Dataset name supplied by the caller
    name: String,
    /// Hex-encoded hash of the configuration fields that affect the result
    config_hash: String,
}
221
222impl CacheKey {
223    /// Create a new cache key from dataset name and configuration
224    pub fn new(name: &str, config: &crate::real_world::RealWorldConfig) -> Self {
225        use std::collections::hash_map::DefaultHasher;
226        use std::hash::{Hash, Hasher};
227
228        let mut hasher = DefaultHasher::new();
229        config.use_cache.hash(&mut hasher);
230        config.download_if_missing.hash(&mut hasher);
231        config.return_preprocessed.hash(&mut hasher);
232        config.subset.hash(&mut hasher);
233        config.random_state.hash(&mut hasher);
234
235        Self {
236            name: name.to_string(),
237            config_hash: format!("{:x}", hasher.finish()),
238        }
239    }
240
241    /// Get the cache key as a string
242    pub fn as_string(&self) -> String {
243        format!("{}_{}", self.name, self.config_hash)
244    }
245}
246
/// File-name wrapper used as the in-memory cache key.
///
/// Deriving `Hash` on a single-field tuple struct hashes exactly the inner
/// `String`, matching the previous hand-written impl.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct FileCacheKey(String);
256
/// Manages caching of downloaded datasets, using both file-based and in-memory caching
///
/// This implementation uses scirs2-core::cache's TTLSizedCache for in-memory caching,
/// while maintaining the file-based persistence for long-term storage.
///
/// The memory cache sits behind a `RefCell` so `&self` methods can update it;
/// `RefCell` is not `Sync`, so a `DatasetCache` is single-thread only.
pub struct DatasetCache {
    /// Directory for file-based caching
    cachedir: PathBuf,
    /// In-memory cache for frequently accessed datasets
    mem_cache: RefCell<TTLSizedCache<FileCacheKey, Vec<u8>>>,
    /// Maximum cache size in bytes (0 means unlimited)
    max_cache_size: u64,
    /// Whether to operate in offline mode (no downloads)
    offline_mode: bool,
}
271
272impl Default for DatasetCache {
273    fn default() -> Self {
274        let cachedir = get_cachedir().expect("Could not get cache directory");
275
276        let mem_cache = RefCell::new(
277            CacheBuilder::new()
278                .with_size(DEFAULT_CACHE_SIZE)
279                .with_ttl(DEFAULT_CACHE_TTL)
280                .build_sized_cache(),
281        );
282
283        // Check if offline mode is enabled via environment variable
284        let offline_mode = std::env::var("SCIRS2_OFFLINE")
285            .map(|v| v.to_lowercase() == "true" || v == "1")
286            .unwrap_or(false);
287
288        DatasetCache {
289            cachedir,
290            mem_cache,
291            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
292            offline_mode,
293        }
294    }
295}
296
297impl DatasetCache {
298    /// Create a new dataset cache with the given cache directory and default memory cache
299    pub fn new(cachedir: PathBuf) -> Self {
300        let mem_cache = RefCell::new(
301            CacheBuilder::new()
302                .with_size(DEFAULT_CACHE_SIZE)
303                .with_ttl(DEFAULT_CACHE_TTL)
304                .build_sized_cache(),
305        );
306
307        let offline_mode = std::env::var("SCIRS2_OFFLINE")
308            .map(|v| v.to_lowercase() == "true" || v == "1")
309            .unwrap_or(false);
310
311        DatasetCache {
312            cachedir,
313            mem_cache,
314            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
315            offline_mode,
316        }
317    }
318
319    /// Create a new dataset cache with custom settings
320    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
321        let mem_cache = RefCell::new(
322            CacheBuilder::new()
323                .with_size(cache_size)
324                .with_ttl(ttl_seconds)
325                .build_sized_cache(),
326        );
327
328        let offline_mode = std::env::var("SCIRS2_OFFLINE")
329            .map(|v| v.to_lowercase() == "true" || v == "1")
330            .unwrap_or(false);
331
332        DatasetCache {
333            cachedir,
334            mem_cache,
335            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
336            offline_mode,
337        }
338    }
339
340    /// Create a new dataset cache with comprehensive configuration
341    pub fn with_full_config(
342        cachedir: PathBuf,
343        cache_size: usize,
344        ttl_seconds: u64,
345        max_cache_size: u64,
346        offline_mode: bool,
347    ) -> Self {
348        let mem_cache = RefCell::new(
349            CacheBuilder::new()
350                .with_size(cache_size)
351                .with_ttl(ttl_seconds)
352                .build_sized_cache(),
353        );
354
355        DatasetCache {
356            cachedir,
357            mem_cache,
358            max_cache_size,
359            offline_mode,
360        }
361    }
362
363    /// Create the cache directory if it doesn't exist
364    pub fn ensure_cachedir(&self) -> Result<()> {
365        if !self.cachedir.exists() {
366            fs::create_dir_all(&self.cachedir).map_err(|e| {
367                DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
368            })?;
369        }
370        Ok(())
371    }
372
373    /// Get the path to a cached file
374    pub fn get_cachedpath(&self, name: &str) -> PathBuf {
375        self.cachedir.join(name)
376    }
377
378    /// Check if a file is already cached (either in memory or on disk)
379    pub fn is_cached(&self, name: &str) -> bool {
380        // Check memory cache first
381        let key = FileCacheKey(name.to_string());
382        if self.mem_cache.borrow_mut().get(&key).is_some() {
383            return true;
384        }
385
386        // Then check file system
387        self.get_cachedpath(name).exists()
388    }
389
390    /// Read a cached file as bytes
391    ///
392    /// This method checks the in-memory cache first, and falls back to the file system if needed.
393    /// When reading from the file system, the result is also stored in the in-memory cache.
394    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
395        // Try memory cache first
396        let key = FileCacheKey(name.to_string());
397        if let Some(data) = self.mem_cache.borrow_mut().get(&key) {
398            return Ok(data);
399        }
400
401        // Fall back to file system cache
402        let path = self.get_cachedpath(name);
403        if !path.exists() {
404            return Err(DatasetsError::CacheError(format!(
405                "Cached file does not exist: {name}"
406            )));
407        }
408
409        let mut file = File::open(path)
410            .map_err(|e| DatasetsError::CacheError(format!("Failed to open cached file: {e}")))?;
411
412        let mut buffer = Vec::new();
413        file.read_to_end(&mut buffer)
414            .map_err(|e| DatasetsError::CacheError(format!("Failed to read cached file: {e}")))?;
415
416        // Update memory cache
417        self.mem_cache.borrow_mut().insert(key, buffer.clone());
418
419        Ok(buffer)
420    }
421
422    /// Write data to both the file cache and memory cache
423    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
424        self.ensure_cachedir()?;
425
426        // Check if writing this file would exceed cache size limit
427        if self.max_cache_size > 0 {
428            let current_size = self.get_cache_size_bytes()?;
429            let new_file_size = data.len() as u64;
430
431            if current_size + new_file_size > self.max_cache_size {
432                self.cleanup_cache_to_fit(new_file_size)?;
433            }
434        }
435
436        // Write to file system cache
437        let path = self.get_cachedpath(name);
438        let mut file = File::create(path)
439            .map_err(|e| DatasetsError::CacheError(format!("Failed to create cache file: {e}")))?;
440
441        file.write_all(data).map_err(|e| {
442            DatasetsError::CacheError(format!("Failed to write to cache file: {e}"))
443        })?;
444
445        // Update memory cache
446        let key = FileCacheKey(name.to_string());
447        self.mem_cache.borrow_mut().insert(key, data.to_vec());
448
449        Ok(())
450    }
451
452    /// Clear the entire cache (both memory and file-based)
453    pub fn clear_cache(&self) -> Result<()> {
454        // Clear file system cache
455        if self.cachedir.exists() {
456            fs::remove_dir_all(&self.cachedir)
457                .map_err(|e| DatasetsError::CacheError(format!("Failed to clear cache: {e}")))?;
458        }
459
460        // Clear memory cache
461        self.mem_cache.borrow_mut().clear();
462
463        Ok(())
464    }
465
466    /// Remove a specific cached file (from both memory and file system)
467    pub fn remove_cached(&self, name: &str) -> Result<()> {
468        // Remove from file system
469        let path = self.get_cachedpath(name);
470        if path.exists() {
471            fs::remove_file(path).map_err(|e| {
472                DatasetsError::CacheError(format!("Failed to remove cached file: {e}"))
473            })?;
474        }
475
476        // Remove from memory cache
477        let key = FileCacheKey(name.to_string());
478        self.mem_cache.borrow_mut().remove(&key);
479
480        Ok(())
481    }
482
483    /// Compute a hash for a filename or URL
484    pub fn hash_filename(name: &str) -> String {
485        let hash = blake3::hash(name.as_bytes());
486        hash.to_hex().to_string()
487    }
488
489    /// Get the total size of the cache in bytes
490    pub fn get_cache_size_bytes(&self) -> Result<u64> {
491        let mut total_size = 0u64;
492
493        if self.cachedir.exists() {
494            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
495                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
496            })?;
497
498            for entry in entries {
499                let entry = entry.map_err(|e| {
500                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
501                })?;
502
503                if let Ok(metadata) = entry.metadata() {
504                    if metadata.is_file() {
505                        total_size += metadata.len();
506                    }
507                }
508            }
509        }
510
511        Ok(total_size)
512    }
513
514    /// Clean up cache to fit a new file of specified size
515    ///
516    /// This method removes the oldest files first until there's enough space
517    /// for the new file plus some buffer space.
518    fn cleanup_cache_to_fit(&self, needed_size: u64) -> Result<()> {
519        if self.max_cache_size == 0 {
520            return Ok(()); // No _size limit
521        }
522
523        let current_size = self.get_cache_size_bytes()?;
524        let target_size = (self.max_cache_size as f64 * 0.8) as u64; // Leave 20% buffer
525        let total_needed = current_size + needed_size;
526
527        if total_needed <= target_size {
528            return Ok(()); // No cleanup needed
529        }
530
531        let size_to_free = total_needed - target_size;
532
533        // Get all files with their modification times
534        let mut files_with_times = Vec::new();
535
536        if self.cachedir.exists() {
537            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
538                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
539            })?;
540
541            for entry in entries {
542                let entry = entry.map_err(|e| {
543                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
544                })?;
545
546                if let Ok(metadata) = entry.metadata() {
547                    if metadata.is_file() {
548                        if let Ok(modified) = metadata.modified() {
549                            files_with_times.push((entry.path(), metadata.len(), modified));
550                        }
551                    }
552                }
553            }
554        }
555
556        // Sort by modification time (oldest first)
557        files_with_times.sort_by_key(|(_path, _size, modified)| *modified);
558
559        // Remove files until we've freed enough space
560        let mut freed_size = 0u64;
561        for (path, size, _modified) in files_with_times {
562            if freed_size >= size_to_free {
563                break;
564            }
565
566            // Remove from memory cache first
567            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
568                let key = FileCacheKey(filename.to_string());
569                self.mem_cache.borrow_mut().remove(&key);
570            }
571
572            // Remove file
573            if let Err(e) = fs::remove_file(&path) {
574                eprintln!("Warning: Failed to remove cache file {path:?}: {e}");
575            } else {
576                freed_size += size;
577            }
578        }
579
580        Ok(())
581    }
582
583    /// Set offline mode
584    pub fn set_offline_mode(&mut self, offline: bool) {
585        self.offline_mode = offline;
586    }
587
588    /// Check if cache is in offline mode
589    pub fn is_offline(&self) -> bool {
590        self.offline_mode
591    }
592
593    /// Set maximum cache size in bytes (0 for unlimited)
594    pub fn set_max_cache_size(&mut self, max_size: u64) {
595        self.max_cache_size = max_size;
596    }
597
598    /// Get maximum cache size in bytes
599    pub fn max_cache_size(&self) -> u64 {
600        self.max_cache_size
601    }
602
603    /// Put data into the cache (alias for write_cached)
604    pub fn put(&self, name: &str, data: &[u8]) -> Result<()> {
605        self.write_cached(name, data)
606    }
607
608    /// Get detailed cache information
609    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
610        let mut total_size = 0u64;
611        let mut file_count = 0usize;
612        let mut files = Vec::new();
613
614        if self.cachedir.exists() {
615            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
616                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
617            })?;
618
619            for entry in entries {
620                let entry = entry.map_err(|e| {
621                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
622                })?;
623
624                if let Ok(metadata) = entry.metadata() {
625                    if metadata.is_file() {
626                        let size = metadata.len();
627                        total_size += size;
628                        file_count += 1;
629
630                        if let Some(filename) = entry.file_name().to_str() {
631                            files.push(CacheFileInfo {
632                                name: filename.to_string(),
633                                size_bytes: size,
634                                modified: metadata.modified().ok(),
635                            });
636                        }
637                    }
638                }
639            }
640        }
641
642        // Sort files by size (largest first)
643        files.sort_by_key(|f| std::cmp::Reverse(f.size_bytes));
644
645        Ok(DetailedCacheStats {
646            total_size_bytes: total_size,
647            file_count,
648            cachedir: self.cachedir.clone(),
649            max_cache_size: self.max_cache_size,
650            offline_mode: self.offline_mode,
651            files,
652        })
653    }
654}
655
/// Downloads data from a URL and returns it as bytes, using the cache when possible.
///
/// # Arguments
///
/// * `url` - The URL to download from
/// * `force_download` - If true, bypass the cache and download again
///
/// # Errors
///
/// Returns `DatasetsError::DownloadError` on network failure or a non-success
/// HTTP status, and `DatasetsError::CacheError` if caching the result fails.
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn download_data(url: &str, force_download: bool) -> Result<Vec<u8>> {
    let cache = DatasetCache::default();
    // The cache key is a hash of the URL, not the URL itself, so it is
    // always a valid filename.
    let cache_key = DatasetCache::hash_filename(url);

    // Serve from cache unless the caller explicitly forces a re-download.
    if !force_download && cache.is_cached(&cache_key) {
        return cache.read_cached(&cache_key);
    }

    // Download the data
    let response = reqwest::blocking::get(url)
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to download from {url}: {e}")))?;

    if !response.status().is_success() {
        return Err(DatasetsError::DownloadError(format!(
            "Failed to download from {url}: HTTP status {}",
            response.status()
        )));
    }

    let data = response
        .bytes()
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to read response data: {e}")))?;

    let data_vec = data.to_vec();

    // Cache the data for subsequent calls.
    cache.write_cached(&cache_key, &data_vec)?;

    Ok(data_vec)
}
691
692// Stub for when download feature is not enabled
693#[cfg(not(feature = "download"))]
694/// Downloads data from a URL or retrieves it from cache
695///
696/// This is a stub implementation when the download feature is not enabled.
697/// It returns an error informing the user to enable the download feature.
698///
699/// # Arguments
700///
701/// * `_url` - The URL to download from
702/// * `_force_download` - If true, force a new download instead of using cache
703///
704/// # Returns
705///
706/// * An error indicating that the download feature is not enabled
707#[allow(dead_code)]
708pub fn download_data(_url: &str, _force_download: bool) -> Result<Vec<u8>> {
709    Err(DatasetsError::Other(
710        "Download feature is not enabled. Recompile with --features download".to_string(),
711    ))
712}
713
/// Cache management utilities
///
/// High-level wrapper around `DatasetCache` that adds dataset-level
/// (de)serialization via `CacheKey` plus reporting and maintenance helpers.
pub struct CacheManager {
    /// Underlying byte-level cache (file system + in-memory)
    cache: DatasetCache,
}
718
719impl CacheManager {
720    /// Create a new cache manager with default settings
721    pub fn new() -> Result<Self> {
722        let cachedir = get_cachedir()?;
723        Ok(Self {
724            cache: DatasetCache::with_config(cachedir, DEFAULT_CACHE_SIZE, DEFAULT_CACHE_TTL),
725        })
726    }
727
728    /// Create a new cache manager with custom settings
729    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
730        Self {
731            cache: DatasetCache::with_config(cachedir, cache_size, ttl_seconds),
732        }
733    }
734
735    /// Get a dataset from cache using CacheKey
736    pub fn get(&self, key: &CacheKey) -> Result<Option<crate::utils::Dataset>> {
737        let name = key.as_string();
738        if self.cache.is_cached(&name) {
739            match self.cache.read_cached(&name) {
740                Ok(cached_data) => {
741                    match serde_json::from_slice::<crate::utils::Dataset>(&cached_data) {
742                        Ok(dataset) => Ok(Some(dataset)),
743                        Err(e) => {
744                            // If deserialization fails, consider the cache entry invalid
745                            self.cache
746                                .mem_cache
747                                .borrow_mut()
748                                .remove(&FileCacheKey(name.clone()));
749                            Err(DatasetsError::CacheError(format!(
750                                "Failed to deserialize cached dataset: {e}"
751                            )))
752                        }
753                    }
754                }
755                Err(e) => Err(DatasetsError::CacheError(format!(
756                    "Failed to read cached data: {e}"
757                ))),
758            }
759        } else {
760            Ok(None)
761        }
762    }
763
764    /// Put a dataset into cache using CacheKey
765    pub fn put(&self, key: &CacheKey, dataset: &crate::utils::Dataset) -> Result<()> {
766        let name = key.as_string();
767
768        // Serialize the dataset to JSON bytes for caching
769        let serialized = serde_json::to_vec(dataset)
770            .map_err(|e| DatasetsError::CacheError(format!("Failed to serialize dataset: {e}")))?;
771
772        // Write the serialized data to cache
773        self.cache
774            .write_cached(&name, &serialized)
775            .map_err(|e| DatasetsError::CacheError(format!("Failed to write to cache: {e}")))
776    }
777
778    /// Create a cache manager with comprehensive configuration
779    pub fn with_full_config(
780        cachedir: PathBuf,
781        cache_size: usize,
782        ttl_seconds: u64,
783        max_cache_size: u64,
784        offline_mode: bool,
785    ) -> Self {
786        Self {
787            cache: DatasetCache::with_full_config(
788                cachedir,
789                cache_size,
790                ttl_seconds,
791                max_cache_size,
792                offline_mode,
793            ),
794        }
795    }
796
797    /// Get basic cache statistics
798    pub fn get_stats(&self) -> CacheStats {
799        let cachedir = &self.cache.cachedir;
800        let mut total_size = 0u64;
801        let mut file_count = 0usize;
802
803        if cachedir.exists() {
804            if let Ok(entries) = fs::read_dir(cachedir) {
805                for entry in entries.flatten() {
806                    if let Ok(metadata) = entry.metadata() {
807                        if metadata.is_file() {
808                            total_size += metadata.len();
809                            file_count += 1;
810                        }
811                    }
812                }
813            }
814        }
815
816        CacheStats {
817            total_size_bytes: total_size,
818            file_count,
819            cachedir: cachedir.clone(),
820        }
821    }
822
823    /// Get detailed cache statistics
824    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
825        self.cache.get_detailed_stats()
826    }
827
828    /// Set offline mode
829    pub fn set_offline_mode(&mut self, offline: bool) {
830        self.cache.set_offline_mode(offline);
831    }
832
833    /// Check if in offline mode
834    pub fn is_offline(&self) -> bool {
835        self.cache.is_offline()
836    }
837
838    /// Set maximum cache size in bytes (0 for unlimited)
839    pub fn set_max_cache_size(&mut self, max_size: u64) {
840        self.cache.set_max_cache_size(max_size);
841    }
842
843    /// Get maximum cache size in bytes
844    pub fn max_cache_size(&self) -> u64 {
845        self.cache.max_cache_size()
846    }
847
848    /// Clear all cached data
849    pub fn clear_all(&self) -> Result<()> {
850        self.cache.clear_cache()
851    }
852
853    /// Remove specific cached file
854    pub fn remove(&self, name: &str) -> Result<()> {
855        self.cache.remove_cached(name)
856    }
857
858    /// Remove old files to free up space
859    pub fn cleanup_old_files(&self, target_size: u64) -> Result<()> {
860        self.cache.cleanup_cache_to_fit(target_size)
861    }
862
863    /// List all cached files
864    pub fn list_cached_files(&self) -> Result<Vec<String>> {
865        let cachedir = &self.cache.cachedir;
866        let mut files = Vec::new();
867
868        if cachedir.exists() {
869            let entries = fs::read_dir(cachedir).map_err(|e| {
870                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
871            })?;
872
873            for entry in entries {
874                let entry = entry.map_err(|e| {
875                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
876                })?;
877
878                if let Some(filename) = entry.file_name().to_str() {
879                    files.push(filename.to_string());
880                }
881            }
882        }
883
884        files.sort();
885        Ok(files)
886    }
887
888    /// Get cache directory path
889    pub fn cachedir(&self) -> &PathBuf {
890        &self.cache.cachedir
891    }
892
893    /// Check if a file is cached
894    pub fn is_cached(&self, name: &str) -> bool {
895        self.cache.is_cached(name)
896    }
897
898    /// Print detailed cache report
899    pub fn print_cache_report(&self) -> Result<()> {
900        let stats = self.get_detailed_stats()?;
901
902        println!("=== Cache Report ===");
903        println!("Cache Directory: {}", stats.cachedir.display());
904        println!(
905            "Total Size: {} ({} files)",
906            stats.formatted_size(),
907            stats.file_count
908        );
909        println!("Max Size: {}", stats.formatted_max_size());
910
911        if stats.max_cache_size > 0 {
912            println!("Usage: {:.1}%", stats.usage_percentage() * 100.0);
913        }
914
915        println!(
916            "Offline Mode: {}",
917            if stats.offline_mode {
918                "Enabled"
919            } else {
920                "Disabled"
921            }
922        );
923
924        if !stats.files.is_empty() {
925            println!("\nCached Files:");
926            for file in &stats.files {
927                println!(
928                    "  {} - {} ({})",
929                    file.name,
930                    file.formatted_size(),
931                    file.formatted_modified()
932                );
933            }
934        }
935
936        Ok(())
937    }
938}
939
/// Cache statistics
///
/// Lightweight summary of the on-disk cache: aggregate size, file count, and
/// the directory the cache lives in. Derives `Debug`/`Clone` for consistency
/// with the other stats types in this module.
#[derive(Debug, Clone)]
pub struct CacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
}
949
950/// Detailed cache statistics with file-level information
951pub struct DetailedCacheStats {
952    /// Total size of all cached files in bytes
953    pub total_size_bytes: u64,
954    /// Number of cached files
955    pub file_count: usize,
956    /// Cache directory path
957    pub cachedir: PathBuf,
958    /// Maximum cache size (0 = unlimited)
959    pub max_cache_size: u64,
960    /// Whether cache is in offline mode
961    pub offline_mode: bool,
962    /// Information about individual cached files
963    pub files: Vec<CacheFileInfo>,
964}
965
/// Information about a cached file
#[derive(Debug, Clone)]
pub struct CacheFileInfo {
    /// Name of the cached file
    pub name: String,
    /// Size in bytes
    pub size_bytes: u64,
    /// Last modified time
    ///
    /// `None` when the modification time could not be determined.
    pub modified: Option<std::time::SystemTime>,
}
976
impl CacheStats {
    /// Get total size formatted as human-readable string
    ///
    /// Delegates to `format_bytes` (e.g. "1.0 KB", "2.3 MB").
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }
}
983
984impl DetailedCacheStats {
985    /// Get total size formatted as human-readable string
986    pub fn formatted_size(&self) -> String {
987        format_bytes(self.total_size_bytes)
988    }
989
990    /// Get max cache size formatted as human-readable string
991    pub fn formatted_max_size(&self) -> String {
992        if self.max_cache_size == 0 {
993            "Unlimited".to_string()
994        } else {
995            format_bytes(self.max_cache_size)
996        }
997    }
998
999    /// Get cache usage percentage (0.0-1.0)
1000    pub fn usage_percentage(&self) -> f64 {
1001        if self.max_cache_size == 0 {
1002            0.0
1003        } else {
1004            self.total_size_bytes as f64 / self.max_cache_size as f64
1005        }
1006    }
1007}
1008
1009impl CacheFileInfo {
1010    /// Get file size formatted as human-readable string
1011    pub fn formatted_size(&self) -> String {
1012        format_bytes(self.size_bytes)
1013    }
1014
1015    /// Get formatted modification time
1016    pub fn formatted_modified(&self) -> String {
1017        match &self.modified {
1018            Some(time) => {
1019                if let Ok(now) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)
1020                {
1021                    if let Ok(modified) = time.duration_since(std::time::UNIX_EPOCH) {
1022                        let diff_secs = now.as_secs().saturating_sub(modified.as_secs());
1023                        let days = diff_secs / 86400;
1024                        let hours = (diff_secs % 86400) / 3600;
1025                        let mins = (diff_secs % 3600) / 60;
1026
1027                        if days > 0 {
1028                            format!("{days} days ago")
1029                        } else if hours > 0 {
1030                            format!("{hours} hours ago")
1031                        } else if mins > 0 {
1032                            format!("{mins} minutes ago")
1033                        } else {
1034                            "Just now".to_string()
1035                        }
1036                    } else {
1037                        "Unknown".to_string()
1038                    }
1039                } else {
1040                    "Unknown".to_string()
1041                }
1042            }
1043            None => "Unknown".to_string(),
1044        }
1045    }
1046}
1047
/// Format bytes as human-readable string
///
/// Chooses the largest unit (B, KB, MB, GB) that keeps the value below the
/// next threshold; non-byte values are shown with one decimal place.
#[allow(dead_code)]
fn format_bytes(bytes: u64) -> String {
    const KB: f64 = 1024.0;
    const MB: f64 = KB * 1024.0;
    const GB: f64 = MB * 1024.0;

    let n = bytes as f64;
    if n < KB {
        format!("{n} B")
    } else if n < MB {
        format!("{:.1} KB", n / KB)
    } else if n < GB {
        format!("{:.1} MB", n / MB)
    } else {
        format!("{:.1} GB", n / GB)
    }
}
1062
/// Batch operation result containing success/failure information
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// Number of successful operations
    pub success_count: usize,
    /// Number of failed operations
    pub failure_count: usize,
    /// List of failed items with error messages
    ///
    /// Each entry is `(item_name, error_message)`.
    pub failures: Vec<(String, String)>,
    /// Total bytes processed
    pub total_bytes: u64,
    /// Total time taken for the batch operation
    pub elapsed_time: std::time::Duration,
}
1077
1078impl BatchResult {
1079    /// Create a new empty batch result
1080    pub fn new() -> Self {
1081        Self {
1082            success_count: 0,
1083            failure_count: 0,
1084            failures: Vec::new(),
1085            total_bytes: 0,
1086            elapsed_time: std::time::Duration::ZERO,
1087        }
1088    }
1089
1090    /// Check if all operations were successful
1091    pub fn is_all_success(&self) -> bool {
1092        self.failure_count == 0
1093    }
1094
1095    /// Get success rate as percentage
1096    pub fn success_rate(&self) -> f64 {
1097        let total = self.success_count + self.failure_count;
1098        if total == 0 {
1099            0.0
1100        } else {
1101            (self.success_count as f64 / total as f64) * 100.0
1102        }
1103    }
1104
1105    /// Get formatted summary
1106    pub fn summary(&self) -> String {
1107        format!(
1108            "Batch completed: {}/{} successful ({:.1}%), {} bytes processed in {:.2}s",
1109            self.success_count,
1110            self.success_count + self.failure_count,
1111            self.success_rate(),
1112            format_bytes(self.total_bytes),
1113            self.elapsed_time.as_secs_f64()
1114        )
1115    }
1116}
1117
impl Default for BatchResult {
    /// Equivalent to [`BatchResult::new`]: zeroed counters and no failures.
    fn default() -> Self {
        Self::new()
    }
}
1123
/// Batch operations manager for dataset caching
///
/// Wraps a `CacheManager` and runs multi-file operations (download,
/// verification, cleanup, processing) with configurable parallelism and
/// retry behavior.
pub struct BatchOperations {
    /// Underlying cache manager that all operations go through
    cache: CacheManager,
    /// Run batch work on multiple threads when true
    parallel: bool,
    /// Number of additional attempts after the first failure
    max_retries: usize,
    /// Pause between retry attempts
    retry_delay: std::time::Duration,
}
1131
1132impl BatchOperations {
1133    /// Create a new batch operations manager
1134    pub fn new(cache: CacheManager) -> Self {
1135        Self {
1136            cache,
1137            parallel: true,
1138            max_retries: 3,
1139            retry_delay: std::time::Duration::from_millis(1000),
1140        }
1141    }
1142
1143    /// Configure parallel processing
1144    pub fn with_parallel(mut self, parallel: bool) -> Self {
1145        self.parallel = parallel;
1146        self
1147    }
1148
1149    /// Configure retry settings
1150    pub fn with_retry_config(
1151        mut self,
1152        max_retries: usize,
1153        retry_delay: std::time::Duration,
1154    ) -> Self {
1155        self.max_retries = max_retries;
1156        self.retry_delay = retry_delay;
1157        self
1158    }
1159
1160    /// Download multiple datasets in batch
1161    #[cfg(feature = "download")]
1162    pub fn batch_download(&self, urls_andnames: &[(&str, &str)]) -> BatchResult {
1163        let start_time = std::time::Instant::now();
1164        let mut result = BatchResult::new();
1165
1166        if self.parallel {
1167            self.batch_download_parallel(urls_andnames, &mut result)
1168        } else {
1169            self.batch_download_sequential(urls_andnames, &mut result)
1170        }
1171
1172        result.elapsed_time = start_time.elapsed();
1173        result
1174    }
1175
1176    #[cfg(feature = "download")]
1177    fn batch_download_parallel(&self, urls_andnames: &[(&str, &str)], result: &mut BatchResult) {
1178        use std::fs::File;
1179        use std::io::Write;
1180        use std::sync::{Arc, Mutex};
1181        use std::thread;
1182
1183        // Ensure cache directory exists before spawning threads
1184        if let Err(e) = self.cache.cache.ensure_cachedir() {
1185            result.failure_count += urls_andnames.len();
1186            for &(_, name) in urls_andnames {
1187                result
1188                    .failures
1189                    .push((name.to_string(), format!("Cache setup failed: {e}")));
1190            }
1191            return;
1192        }
1193
1194        let result_arc = Arc::new(Mutex::new(BatchResult::new()));
1195        let cachedir = self.cache.cache.cachedir.clone();
1196        let max_retries = self.max_retries;
1197        let retry_delay = self.retry_delay;
1198
1199        let handles: Vec<_> = urls_andnames
1200            .iter()
1201            .map(|&(url, name)| {
1202                let result_clone = Arc::clone(&result_arc);
1203                let url = url.to_string();
1204                let name = name.to_string();
1205                let cachedir = cachedir.clone();
1206
1207                thread::spawn(move || {
1208                    let mut success = false;
1209                    let mut last_error = String::new();
1210                    let mut downloaded_data = Vec::new();
1211
1212                    for attempt in 0..=max_retries {
1213                        match download_data(&url, false) {
1214                            Ok(data) => {
1215                                // Write directly to filesystem (bypassing RefCell memory cache)
1216                                let path = cachedir.join(&name);
1217                                match File::create(&path) {
1218                                    Ok(mut file) => match file.write_all(&data) {
1219                                        Ok(_) => {
1220                                            let mut r =
1221                                                result_clone.lock().expect("Operation failed");
1222                                            r.success_count += 1;
1223                                            r.total_bytes += data.len() as u64;
1224                                            downloaded_data = data;
1225                                            success = true;
1226                                            break;
1227                                        }
1228                                        Err(e) => {
1229                                            last_error = format!("Failed to write cache file: {e}");
1230                                        }
1231                                    },
1232                                    Err(e) => {
1233                                        last_error = format!("Failed to create cache file: {e}");
1234                                    }
1235                                }
1236                            }
1237                            Err(e) => {
1238                                last_error = format!("Download failed: {e}");
1239                                if attempt < max_retries {
1240                                    thread::sleep(retry_delay);
1241                                }
1242                            }
1243                        }
1244                    }
1245
1246                    if !success {
1247                        let mut r = result_clone.lock().expect("Operation failed");
1248                        r.failure_count += 1;
1249                        r.failures.push((name.clone(), last_error));
1250                    }
1251
1252                    (name, success, downloaded_data)
1253                })
1254            })
1255            .collect();
1256
1257        // Collect results and update memory cache for successful downloads
1258        let mut successful_downloads = Vec::new();
1259        for handle in handles {
1260            if let Ok((name, success, data)) = handle.join() {
1261                if success && !data.is_empty() {
1262                    successful_downloads.push((name, data));
1263                }
1264            }
1265        }
1266
1267        // Merge the results from the arc back into the original result
1268        if let Ok(arc_result) = result_arc.lock() {
1269            result.success_count += arc_result.success_count;
1270            result.failure_count += arc_result.failure_count;
1271            result.failures.extend(arc_result.failures.clone());
1272        }
1273
1274        // Update memory cache after all threads complete
1275        for (name, data) in successful_downloads {
1276            let key = FileCacheKey(name);
1277            self.cache.cache.mem_cache.borrow_mut().insert(key, data);
1278        }
1279    }
1280
1281    #[cfg(feature = "download")]
1282    fn batch_download_sequential(&self, urls_andnames: &[(&str, &str)], result: &mut BatchResult) {
1283        for &(url, name) in urls_andnames {
1284            let mut success = false;
1285            let mut last_error = String::new();
1286
1287            for attempt in 0..=self.max_retries {
1288                match download_data(url, false) {
1289                    Ok(data) => match self.cache.cache.write_cached(name, &data) {
1290                        Ok(_) => {
1291                            result.success_count += 1;
1292                            result.total_bytes += data.len() as u64;
1293                            success = true;
1294                            break;
1295                        }
1296                        Err(e) => {
1297                            last_error = format!("Cache write failed: {e}");
1298                        }
1299                    },
1300                    Err(e) => {
1301                        last_error = format!("Download failed: {e}");
1302                        if attempt < self.max_retries {
1303                            std::thread::sleep(self.retry_delay);
1304                        }
1305                    }
1306                }
1307            }
1308
1309            if !success {
1310                result.failure_count += 1;
1311                result.failures.push((name.to_string(), last_error));
1312            }
1313        }
1314    }
1315
1316    /// Verify integrity of multiple cached files
1317    pub fn batch_verify_integrity(&self, files_andhashes: &[(&str, &str)]) -> BatchResult {
1318        let start_time = std::time::Instant::now();
1319        let mut result = BatchResult::new();
1320
1321        for &(filename, expected_hash) in files_andhashes {
1322            match self.cache.cache.get_cachedpath(filename).exists() {
1323                true => match sha256_hash_file(&self.cache.cache.get_cachedpath(filename)) {
1324                    Ok(actual_hash) => {
1325                        if actual_hash == expected_hash {
1326                            result.success_count += 1;
1327                            if let Ok(metadata) =
1328                                std::fs::metadata(self.cache.cache.get_cachedpath(filename))
1329                            {
1330                                result.total_bytes += metadata.len();
1331                            }
1332                        } else {
1333                            result.failure_count += 1;
1334                            result.failures.push((
1335                                filename.to_string(),
1336                                format!(
1337                                    "Hash mismatch: expected {expected_hash}, got {actual_hash}"
1338                                ),
1339                            ));
1340                        }
1341                    }
1342                    Err(e) => {
1343                        result.failure_count += 1;
1344                        result.failures.push((
1345                            filename.to_string(),
1346                            format!("Hash computation failed: {e}"),
1347                        ));
1348                    }
1349                },
1350                false => {
1351                    result.failure_count += 1;
1352                    result
1353                        .failures
1354                        .push((filename.to_string(), "File not found in cache".to_string()));
1355                }
1356            }
1357        }
1358
1359        result.elapsed_time = start_time.elapsed();
1360        result
1361    }
1362
1363    /// Clean up cache selectively based on patterns
1364    pub fn selective_cleanup(
1365        &self,
1366        patterns: &[&str],
1367        max_age_days: Option<u32>,
1368    ) -> Result<BatchResult> {
1369        let start_time = std::time::Instant::now();
1370        let mut result = BatchResult::new();
1371
1372        let cached_files = self.cache.list_cached_files()?;
1373        let now = std::time::SystemTime::now();
1374
1375        for filename in cached_files {
1376            let should_remove = patterns.iter().any(|pattern| {
1377                filename.contains(pattern) || matches_glob_pattern(&filename, pattern)
1378            });
1379
1380            if should_remove {
1381                let filepath = self.cache.cache.get_cachedpath(&filename);
1382
1383                // Check age if max_age_days is specified
1384                let remove_due_to_age = if let Some(max_age) = max_age_days {
1385                    if let Ok(metadata) = std::fs::metadata(&filepath) {
1386                        if let Ok(modified) = metadata.modified() {
1387                            if let Ok(age) = now.duration_since(modified) {
1388                                age.as_secs() > (max_age as u64 * 24 * 3600)
1389                            } else {
1390                                false
1391                            }
1392                        } else {
1393                            false
1394                        }
1395                    } else {
1396                        false
1397                    }
1398                } else {
1399                    true // Remove regardless of age if no age limit specified
1400                };
1401
1402                if remove_due_to_age {
1403                    match self.cache.remove(&filename) {
1404                        Ok(_) => {
1405                            result.success_count += 1;
1406                            if let Ok(metadata) = std::fs::metadata(&filepath) {
1407                                result.total_bytes += metadata.len();
1408                            }
1409                        }
1410                        Err(e) => {
1411                            result.failure_count += 1;
1412                            result
1413                                .failures
1414                                .push((filename, format!("Removal failed: {e}")));
1415                        }
1416                    }
1417                }
1418            }
1419        }
1420
1421        result.elapsed_time = start_time.elapsed();
1422        Ok(result)
1423    }
1424
1425    /// Process multiple datasets with a given function
1426    pub fn batch_process<F, T, E>(&self, names: &[String], processor: F) -> BatchResult
1427    where
1428        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1429        E: std::fmt::Display,
1430        T: Send,
1431    {
1432        let start_time = std::time::Instant::now();
1433        let mut result = BatchResult::new();
1434
1435        if self.parallel {
1436            self.batch_process_parallel(names, processor, &mut result)
1437        } else {
1438            self.batch_process_sequential(names, processor, &mut result)
1439        }
1440
1441        result.elapsed_time = start_time.elapsed();
1442        result
1443    }
1444
1445    fn batch_process_parallel<F, T, E>(
1446        &self,
1447        names: &[String],
1448        processor: F,
1449        result: &mut BatchResult,
1450    ) where
1451        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1452        E: std::fmt::Display,
1453        T: Send,
1454    {
1455        // For thread safety with the current cache implementation,
1456        // we need to read all data first, then process in parallel
1457        let mut data_pairs = Vec::new();
1458
1459        // Sequential read phase
1460        for name in names {
1461            match self.cache.cache.read_cached(name) {
1462                Ok(data) => data_pairs.push((name.clone(), data)),
1463                Err(e) => {
1464                    result.failure_count += 1;
1465                    result
1466                        .failures
1467                        .push((name.clone(), format!("Cache read failed: {e}")));
1468                }
1469            }
1470        }
1471
1472        // Parallel processing phase
1473        if !data_pairs.is_empty() {
1474            use std::sync::{Arc, Mutex};
1475            use std::thread;
1476
1477            let parallel_result = Arc::new(Mutex::new(BatchResult::new()));
1478            let processor = Arc::new(processor);
1479
1480            let handles: Vec<_> = data_pairs
1481                .into_iter()
1482                .map(|(name, data)| {
1483                    let result_clone = Arc::clone(&parallel_result);
1484                    let processor_clone = Arc::clone(&processor);
1485
1486                    thread::spawn(move || match processor_clone(&name, &data) {
1487                        Ok(_) => {
1488                            let mut r = result_clone.lock().expect("Operation failed");
1489                            r.success_count += 1;
1490                            r.total_bytes += data.len() as u64;
1491                        }
1492                        Err(e) => {
1493                            let mut r = result_clone.lock().expect("Operation failed");
1494                            r.failure_count += 1;
1495                            r.failures.push((name, format!("Processing failed: {e}")));
1496                        }
1497                    })
1498                })
1499                .collect();
1500
1501            for handle in handles {
1502                let _ = handle.join();
1503            }
1504
1505            // Merge parallel results into main result
1506            let parallel_result = parallel_result.lock().expect("Operation failed");
1507            result.success_count += parallel_result.success_count;
1508            result.failure_count += parallel_result.failure_count;
1509            result.total_bytes += parallel_result.total_bytes;
1510            result.failures.extend(parallel_result.failures.clone());
1511        }
1512    }
1513
1514    fn batch_process_sequential<F, T, E>(
1515        &self,
1516        names: &[String],
1517        processor: F,
1518        result: &mut BatchResult,
1519    ) where
1520        F: Fn(&str, &[u8]) -> std::result::Result<T, E>,
1521        E: std::fmt::Display,
1522    {
1523        for name in names {
1524            match self.cache.cache.read_cached(name) {
1525                Ok(data) => match processor(name, &data) {
1526                    Ok(_) => {
1527                        result.success_count += 1;
1528                        result.total_bytes += data.len() as u64;
1529                    }
1530                    Err(e) => {
1531                        result.failure_count += 1;
1532                        result
1533                            .failures
1534                            .push((name.clone(), format!("Processing failed: {e}")));
1535                    }
1536                },
1537                Err(e) => {
1538                    result.failure_count += 1;
1539                    result
1540                        .failures
1541                        .push((name.clone(), format!("Cache read failed: {e}")));
1542                }
1543            }
1544        }
1545    }
1546
1547    /// Get access to the underlying cache manager
1548    pub fn cache_manager(&self) -> &CacheManager {
1549        &self.cache
1550    }
1551
1552    /// Write data to cache
1553    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
1554        self.cache.cache.write_cached(name, data)
1555    }
1556
1557    /// Read data from cache
1558    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
1559        self.cache.cache.read_cached(name)
1560    }
1561
1562    /// List cached files
1563    pub fn list_cached_files(&self) -> Result<Vec<String>> {
1564        self.cache.list_cached_files()
1565    }
1566
1567    /// Print cache report
1568    pub fn print_cache_report(&self) -> Result<()> {
1569        self.cache.print_cache_report()
1570    }
1571
1572    /// Get statistics about cached datasets
1573    pub fn get_cache_statistics(&self) -> Result<BatchResult> {
1574        let start_time = std::time::Instant::now();
1575        let mut result = BatchResult::new();
1576
1577        let cached_files = self.cache.list_cached_files()?;
1578
1579        for filename in cached_files {
1580            let filepath = self.cache.cache.get_cachedpath(&filename);
1581            match std::fs::metadata(&filepath) {
1582                Ok(metadata) => {
1583                    result.success_count += 1;
1584                    result.total_bytes += metadata.len();
1585                }
1586                Err(e) => {
1587                    result.failure_count += 1;
1588                    result
1589                        .failures
1590                        .push((filename, format!("Metadata read failed: {e}")));
1591                }
1592            }
1593        }
1594
1595        result.elapsed_time = start_time.elapsed();
1596        Ok(result)
1597    }
1598}
1599
/// Simple glob pattern matching for filenames
///
/// Supports `*` as a wildcard matching any (possibly empty) run of
/// characters; any number of `*`s may appear in the pattern (the previous
/// version only handled a single `*` and let prefix/suffix overlap). A
/// pattern without `*` must equal the filename exactly.
#[allow(dead_code)]
fn matches_glob_pattern(filename: &str, pattern: &str) -> bool {
    // No wildcard: require an exact match.
    if !pattern.contains('*') {
        return filename == pattern;
    }

    let parts: Vec<&str> = pattern.split('*').collect();
    let prefix = parts[0];
    let suffix = parts[parts.len() - 1];

    // The leading literal must be a prefix of the filename; consume it so
    // later literals cannot overlap it.
    let mut remaining = match filename.strip_prefix(prefix) {
        Some(rest) => rest,
        None => return false,
    };

    // Interior literals must appear in order in what is left.
    for part in &parts[1..parts.len() - 1] {
        if part.is_empty() {
            continue; // consecutive '*'s add no constraint
        }
        match remaining.find(part) {
            Some(idx) => remaining = &remaining[idx + part.len()..],
            None => return false,
        }
    }

    // The trailing literal must be a suffix of whatever is left.
    remaining.ends_with(suffix)
}
1618
1619#[cfg(test)]
1620mod tests {
1621    use super::*;
1622    use tempfile::TempDir;
1623
1624    #[test]
1625    fn test_batch_result() {
1626        let mut result = BatchResult::new();
1627        assert_eq!(result.success_count, 0);
1628        assert_eq!(result.failure_count, 0);
1629        assert!(result.is_all_success());
1630        assert_eq!(result.success_rate(), 0.0);
1631
1632        result.success_count = 8;
1633        result.failure_count = 2;
1634        result.total_bytes = 1024;
1635
1636        assert!(!result.is_all_success());
1637        assert_eq!(result.success_rate(), 80.0);
1638        assert!(result.summary().contains("8/10 successful"));
1639        assert!(result.summary().contains("80.0%"));
1640    }
1641
1642    #[test]
1643    fn test_batch_operations_creation() {
1644        let tempdir = TempDir::new().expect("Operation failed");
1645        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1646        let batch_ops = BatchOperations::new(cache_manager)
1647            .with_parallel(false)
1648            .with_retry_config(2, std::time::Duration::from_millis(500));
1649
1650        assert!(!batch_ops.parallel);
1651        assert_eq!(batch_ops.max_retries, 2);
1652    }
1653
1654    #[test]
1655    fn test_selective_cleanup() {
1656        let tempdir = TempDir::new().expect("Operation failed");
1657        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1658        let batch_ops = BatchOperations::new(cache_manager);
1659
1660        // Create some test files
1661        let test_data = vec![0u8; 100];
1662        batch_ops
1663            .cache
1664            .cache
1665            .write_cached("test1.csv", &test_data)
1666            .expect("Test: cache operation failed");
1667        batch_ops
1668            .cache
1669            .cache
1670            .write_cached("test2.csv", &test_data)
1671            .expect("Test: cache operation failed");
1672        batch_ops
1673            .cache
1674            .cache
1675            .write_cached("data.json", &test_data)
1676            .expect("Test: cache operation failed");
1677
1678        // Clean up files matching pattern
1679        let result = batch_ops
1680            .selective_cleanup(&["*.csv"], None)
1681            .expect("Operation failed");
1682
1683        assert_eq!(result.success_count, 2); // Should remove test1.csv and test2.csv
1684        assert!(!batch_ops.cache.is_cached("test1.csv"));
1685        assert!(!batch_ops.cache.is_cached("test2.csv"));
1686        assert!(batch_ops.cache.is_cached("data.json")); // Should remain
1687    }
1688
1689    #[test]
1690    fn test_batch_process() {
1691        let tempdir = TempDir::new().expect("Operation failed");
1692        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1693        let batch_ops = BatchOperations::new(cache_manager).with_parallel(false);
1694
1695        // Create test files
1696        let test_data1 = vec![1u8; 100];
1697        let test_data2 = vec![2u8; 200];
1698        batch_ops
1699            .cache
1700            .cache
1701            .write_cached("file1.dat", &test_data1)
1702            .expect("Test: cache operation failed");
1703        batch_ops
1704            .cache
1705            .cache
1706            .write_cached("file2.dat", &test_data2)
1707            .expect("Test: cache operation failed");
1708
1709        let files = vec!["file1.dat".to_string(), "file2.dat".to_string()];
1710
1711        // Process files (verify they're non-empty)
1712        let result = batch_ops.batch_process(&files, |_name, data| {
1713            if data.is_empty() {
1714                Err("Empty file")
1715            } else {
1716                Ok(data.len())
1717            }
1718        });
1719
1720        assert_eq!(result.success_count, 2);
1721        assert_eq!(result.failure_count, 0);
1722        assert_eq!(result.total_bytes, 300); // 100 + 200
1723    }
1724
1725    #[test]
1726    fn test_get_cache_statistics() {
1727        let tempdir = TempDir::new().expect("Operation failed");
1728        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1729        let batch_ops = BatchOperations::new(cache_manager);
1730
1731        // Start with empty cache
1732        let result = batch_ops.get_cache_statistics().expect("Operation failed");
1733        assert_eq!(result.success_count, 0);
1734
1735        // Add some files
1736        let test_data = vec![0u8; 500];
1737        batch_ops
1738            .cache
1739            .cache
1740            .write_cached("test1.dat", &test_data)
1741            .expect("Test: cache operation failed");
1742        batch_ops
1743            .cache
1744            .cache
1745            .write_cached("test2.dat", &test_data)
1746            .expect("Test: cache operation failed");
1747
1748        let result = batch_ops.get_cache_statistics().expect("Operation failed");
1749        assert_eq!(result.success_count, 2);
1750        assert_eq!(result.total_bytes, 1000);
1751    }
1752
1753    #[test]
1754    fn test_matches_glob_pattern() {
1755        assert!(matches_glob_pattern("test.csv", "*"));
1756        assert!(matches_glob_pattern("test.csv", "*.csv"));
1757        assert!(matches_glob_pattern("test.csv", "test.*"));
1758        assert!(matches_glob_pattern("test.csv", "test.csv"));
1759
1760        assert!(!matches_glob_pattern("test.json", "*.csv"));
1761        assert!(!matches_glob_pattern("other.csv", "test.*"));
1762    }
1763
1764    #[test]
1765    fn test_cache_manager_creation() {
1766        let tempdir = TempDir::new().expect("Operation failed");
1767        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1768        let stats = manager.get_stats();
1769        assert_eq!(stats.file_count, 0);
1770    }
1771
1772    #[test]
1773    fn test_cache_stats_formatting() {
1774        let tempdir = TempDir::new().expect("Operation failed");
1775        let stats = CacheStats {
1776            total_size_bytes: 1024,
1777            file_count: 1,
1778            cachedir: tempdir.path().to_path_buf(),
1779        };
1780
1781        assert_eq!(stats.formatted_size(), "1.0 KB");
1782
1783        let stats_large = CacheStats {
1784            total_size_bytes: 1024 * 1024 * 1024,
1785            file_count: 1,
1786            cachedir: tempdir.path().to_path_buf(),
1787        };
1788
1789        assert_eq!(stats_large.formatted_size(), "1.0 GB");
1790    }
1791
1792    #[test]
1793    fn test_hash_file_name() {
1794        let hash1 = DatasetCache::hash_filename("test.csv");
1795        let hash2 = DatasetCache::hash_filename("test.csv");
1796        let hash3 = DatasetCache::hash_filename("different.csv");
1797
1798        assert_eq!(hash1, hash2);
1799        assert_ne!(hash1, hash3);
1800        assert_eq!(hash1.len(), 64); // Blake3 produces 32-byte hashes = 64 hex chars
1801    }
1802
1803    #[test]
1804    fn test_platform_cachedir() {
1805        let cachedir = get_platform_cachedir();
1806        // Should work on any platform
1807        assert!(cachedir.is_some() || cfg!(target_os = "unknown"));
1808
1809        if let Some(dir) = cachedir {
1810            assert!(dir.to_string_lossy().contains("scirs2-datasets"));
1811        }
1812    }
1813
1814    #[test]
1815    fn test_cache_size_management() {
1816        let tempdir = TempDir::new().expect("Operation failed");
1817        let cache = DatasetCache::with_full_config(
1818            tempdir.path().to_path_buf(),
1819            10,
1820            3600,
1821            2048, // 2KB limit
1822            false,
1823        );
1824
1825        // Write multiple small files to approach the limit
1826        let small_data1 = vec![0u8; 400];
1827        cache
1828            .write_cached("small1.dat", &small_data1)
1829            .expect("Operation failed");
1830
1831        let small_data2 = vec![0u8; 400];
1832        cache
1833            .write_cached("small2.dat", &small_data2)
1834            .expect("Operation failed");
1835
1836        let small_data3 = vec![0u8; 400];
1837        cache
1838            .write_cached("small3.dat", &small_data3)
1839            .expect("Operation failed");
1840
1841        // Now write a file that should trigger cleanup
1842        let medium_data = vec![0u8; 800];
1843        cache
1844            .write_cached("medium.dat", &medium_data)
1845            .expect("Operation failed");
1846
1847        // The cache should have cleaned up to stay under the limit
1848        let stats = cache.get_detailed_stats().expect("Operation failed");
1849        assert!(stats.total_size_bytes <= cache.max_cache_size());
1850
1851        // The most recent file should still be cached
1852        assert!(cache.is_cached("medium.dat"));
1853    }
1854
1855    #[test]
1856    fn test_offline_mode() {
1857        let tempdir = TempDir::new().expect("Operation failed");
1858        let mut cache = DatasetCache::new(tempdir.path().to_path_buf());
1859
1860        assert!(!cache.is_offline());
1861        cache.set_offline_mode(true);
1862        assert!(cache.is_offline());
1863    }
1864
1865    #[test]
1866    fn test_detailed_stats() {
1867        let tempdir = TempDir::new().expect("Operation failed");
1868        let cache = DatasetCache::new(tempdir.path().to_path_buf());
1869
1870        let test_data = vec![1, 2, 3, 4, 5];
1871        cache
1872            .write_cached("test.dat", &test_data)
1873            .expect("Operation failed");
1874
1875        let stats = cache.get_detailed_stats().expect("Operation failed");
1876        assert_eq!(stats.file_count, 1);
1877        assert_eq!(stats.total_size_bytes, test_data.len() as u64);
1878        assert_eq!(stats.files.len(), 1);
1879        assert_eq!(stats.files[0].name, "test.dat");
1880        assert_eq!(stats.files[0].size_bytes, test_data.len() as u64);
1881    }
1882
1883    #[test]
1884    fn test_cache_manager() {
1885        let tempdir = TempDir::new().expect("Operation failed");
1886        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1887
1888        let stats = manager.get_stats();
1889        assert_eq!(stats.file_count, 0);
1890        assert_eq!(stats.total_size_bytes, 0);
1891
1892        assert_eq!(manager.cachedir(), &tempdir.path().to_path_buf());
1893    }
1894
1895    #[test]
1896    fn test_format_bytes() {
1897        assert_eq!(format_bytes(512), "512 B");
1898        assert_eq!(format_bytes(1024), "1.0 KB");
1899        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
1900        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.0 GB");
1901    }
1902}