Skip to main content

scirs2_datasets/
cache.rs

1//! Dataset caching functionality
2
3use crate::error::{DatasetsError, Result};
4use scirs2_core::cache::{CacheBuilder, TTLSizedCache};
5use std::cell::RefCell;
6use std::fs::{self, File};
7use std::hash::{Hash, Hasher};
8use std::io::{Read, Write};
9use std::path::{Path, PathBuf};
10
/// Directory name appended to the platform cache location (and, dot-prefixed,
/// to the home directory in the last-resort fallback of `get_cachedir`).
const CACHE_DIR_NAME: &str = "scirs2-datasets";

/// Default size handed to `CacheBuilder::with_size` for the in-memory cache.
// NOTE(review): presumably a number of entries rather than bytes — confirm against scirs2-core.
const DEFAULT_CACHE_SIZE: usize = 100;

/// Default TTL handed to `CacheBuilder::with_ttl` for the in-memory cache (in seconds)
const DEFAULT_CACHE_TTL: u64 = 3600; // 1 hour

/// Default maximum cache size on disk (in bytes) - 500 MB
const DEFAULT_MAX_CACHE_SIZE: u64 = 500 * 1024 * 1024;

/// Name of the environment variable that overrides the cache directory
const CACHE_DIR_ENV: &str = "SCIRS2_CACHE_DIR";
25
26/// Compute SHA256 hash of a file
27#[allow(dead_code)]
28pub fn sha256_hash_file(path: &Path) -> std::result::Result<String, String> {
29    use sha2::{Digest, Sha256};
30
31    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
32    let mut hasher = Sha256::new();
33    let mut buffer = [0; 8192];
34
35    loop {
36        let bytes_read = file
37            .read(&mut buffer)
38            .map_err(|e| format!("Failed to read file: {e}"))?;
39        if bytes_read == 0 {
40            break;
41        }
42        hasher.update(&buffer[..bytes_read]);
43    }
44
45    Ok(hasher
46        .finalize()
47        .iter()
48        .map(|b| format!("{:02x}", b))
49        .collect())
50}
51
/// Registry entry describing where a dataset file lives and how to verify it.
///
/// Both fields are `'static` string slices, so entries can be declared in
/// constant tables.
#[derive(Debug, Clone)]
pub struct RegistryEntry {
    /// Expected SHA256 hash of the file as a hex string; `fetch_data` skips
    /// verification when this is empty
    pub sha256: &'static str,
    /// URL to download the file from
    pub url: &'static str,
}
59
60/// Get the platform-specific cache directory for downloading and storing datasets
61///
62/// The cache directory is determined in the following order:
63/// 1. Environment variable `SCIRS2_CACHE_DIR` if set
64/// 2. Platform-specific cache directory:
65///    - Windows: `%LOCALAPPDATA%\scirs2-datasets`
66///    - macOS: `~/Library/Caches/scirs2-datasets`
67///    - Linux/Unix: `~/.cache/scirs2-datasets` (respects XDG_CACHE_HOME)
68/// 3. Fallback to `~/.scirs2-datasets` if platform-specific directory fails
69#[allow(dead_code)]
70pub fn get_cachedir() -> Result<PathBuf> {
71    // Check environment variable first
72    if let Ok(cachedir) = std::env::var(CACHE_DIR_ENV) {
73        let cachepath = PathBuf::from(cachedir);
74        ensuredirectory_exists(&cachepath)?;
75        return Ok(cachepath);
76    }
77
78    // Try platform-specific cache directory
79    if let Some(cachedir) = get_platform_cachedir() {
80        ensuredirectory_exists(&cachedir)?;
81        return Ok(cachedir);
82    }
83
84    // Fallback to home directory
85    let homedir = crate::platform_dirs::home_dir()
86        .ok_or_else(|| DatasetsError::CacheError("Could not find home directory".to_string()))?;
87    let cachedir = homedir.join(format!(".{CACHE_DIR_NAME}"));
88    ensuredirectory_exists(&cachedir)?;
89
90    Ok(cachedir)
91}
92
93/// Get platform-specific cache directory
94#[allow(dead_code)]
95fn get_platform_cachedir() -> Option<PathBuf> {
96    #[cfg(target_os = "windows")]
97    {
98        crate::platform_dirs::data_local_dir().map(|dir| dir.join(CACHE_DIR_NAME))
99    }
100    #[cfg(target_os = "macos")]
101    {
102        crate::platform_dirs::home_dir()
103            .map(|dir| dir.join("Library").join("Caches").join(CACHE_DIR_NAME))
104    }
105    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
106    {
107        // Linux/Unix: Use XDG cache directory
108        if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") {
109            Some(PathBuf::from(xdg_cache).join(CACHE_DIR_NAME))
110        } else {
111            crate::platform_dirs::home_dir().map(|home| home.join(".cache").join(CACHE_DIR_NAME))
112        }
113    }
114}
115
116/// Ensure a directory exists, creating it if necessary
117#[allow(dead_code)]
118fn ensuredirectory_exists(dir: &Path) -> Result<()> {
119    if !dir.exists() {
120        fs::create_dir_all(dir).map_err(|e| {
121            DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
122        })?;
123    }
124    Ok(())
125}
126
/// Fetch a dataset file from either cache or download it from the URL
///
/// This function will:
/// 1. Check if the file exists in the cache directory
/// 2. If not, download it from the URL in the registry entry
/// 3. Verify its SHA256 hash when the registry entry provides one
/// 4. Store it in the cache directory
/// 5. Return the path to the cached file
///
/// # Arguments
///
/// * `filename` - The name of the file to fetch (may contain sub-directories)
/// * `registry_entry` - Optional registry entry containing URL and SHA256 hash
///
/// # Returns
///
/// * `Ok(PathBuf)` - Path to the cached file
/// * `Err(String)` - Error message if fetching fails
#[cfg(feature = "download-sync")]
#[allow(dead_code)]
pub fn fetch_data(
    filename: &str,
    registry_entry: Option<&RegistryEntry>,
) -> std::result::Result<PathBuf, String> {
    // Get the cache directory
    let cachedir = get_cachedir().map_err(|e| format!("Failed to get cache directory: {e}"))?;

    // Check if file exists in cache
    let cachepath = cachedir.join(filename);
    if cachepath.exists() {
        return Ok(cachepath);
    }

    // If not in cache, fetch from the URL
    let entry = registry_entry.ok_or_else(|| format!("No registry entry found for {filename}"))?;

    // Download into a fixed name inside a fresh temp dir. Joining `filename`
    // here would fail for names with sub-directories, because those
    // directories do not exist inside the temp dir.
    let tempdir = tempfile::tempdir().map_err(|e| format!("Failed to create temp dir: {e}"))?;
    let temp_file = tempdir.path().join("download");

    // Download the file
    let response = ureq::get(entry.url)
        .call()
        .map_err(|e| format!("Failed to download {filename}: {e}"))?;

    // Read body into memory (ureq 3.x: use into_body which implements Read)
    let mut body = response.into_body();
    let bytes = body
        .read_to_vec()
        .map_err(|e| format!("Failed to read response body: {e}"))?;
    {
        // Scoped so the handle is closed before the file is hashed and copied.
        let mut file = std::fs::File::create(&temp_file)
            .map_err(|e| format!("Failed to create temp file: {e}"))?;
        file.write_all(&bytes)
            .map_err(|e| format!("Failed to write downloaded file: {e}"))?;
    }

    // Verify the SHA256 hash of the downloaded file if provided.
    // Hex digests are compared case-insensitively.
    if !entry.sha256.is_empty() {
        let computed_hash = sha256_hash_file(&temp_file)?;
        if !computed_hash.eq_ignore_ascii_case(entry.sha256) {
            return Err(format!(
                "SHA256 hash mismatch for {filename}: expected {}, got {computed_hash}",
                entry.sha256
            ));
        }
    }

    // Make sure the destination directories exist
    fs::create_dir_all(&cachedir).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    if let Some(parent) = cachepath.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    }

    // Publish in two steps: copy to a sibling partial file, then rename it
    // onto the final path. The existence check at the top treats any file at
    // `cachepath` as a valid cache hit, so a half-copied file must never be
    // visible under that name.
    let mut partial_name = cachepath.clone().into_os_string();
    partial_name.push(format!(".{}.part", std::process::id()));
    let partial_path = PathBuf::from(partial_name);

    let published =
        fs::copy(&temp_file, &partial_path).and_then(|_| fs::rename(&partial_path, &cachepath));
    if let Err(e) = published {
        // Best-effort cleanup; the original error is the one worth reporting.
        let _ = fs::remove_file(&partial_path);
        return Err(format!("Failed to copy to cache: {e}"));
    }

    Ok(cachepath)
}
208
209/// Stub for fetch_data when download-sync feature is disabled
210#[cfg(not(feature = "download-sync"))]
211#[allow(dead_code)]
212pub fn fetch_data(
213    _filename: &str,
214    _registry_entry: Option<&RegistryEntry>,
215) -> std::result::Result<PathBuf, String> {
216    Err("Synchronous download feature is disabled. Enable 'download-sync' feature.".to_string())
217}
218
219/// Cache key for dataset caching with configuration-aware hashing
220#[derive(Clone, Debug, Eq, PartialEq, Hash)]
221pub struct CacheKey {
222    name: String,
223    config_hash: String,
224}
225
226impl CacheKey {
227    /// Create a new cache key from dataset name and configuration
228    pub fn new(name: &str, config: &crate::real_world::RealWorldConfig) -> Self {
229        use std::collections::hash_map::DefaultHasher;
230        use std::hash::{Hash, Hasher};
231
232        let mut hasher = DefaultHasher::new();
233        config.use_cache.hash(&mut hasher);
234        config.download_if_missing.hash(&mut hasher);
235        config.return_preprocessed.hash(&mut hasher);
236        config.subset.hash(&mut hasher);
237        config.random_state.hash(&mut hasher);
238
239        Self {
240            name: name.to_string(),
241            config_hash: format!("{:x}", hasher.finish()),
242        }
243    }
244
245    /// Get the cache key as a string
246    pub fn as_string(&self) -> String {
247        format!("{}_{}", self.name, self.config_hash)
248    }
249}
250
/// Key type of the in-memory cache: a cached file's name wrapped in a newtype
#[derive(Clone, Debug, Eq, PartialEq)]
struct FileCacheKey(String);

impl Hash for FileCacheKey {
    /// Hashes exactly like the wrapped `String`
    fn hash<H: Hasher>(&self, state: &mut H) {
        let FileCacheKey(inner) = self;
        Hash::hash(inner, state);
    }
}
260
/// Manages caching of downloaded datasets, using both file-based and in-memory caching
///
/// This implementation uses scirs2-core::cache's TTLSizedCache for in-memory caching,
/// while maintaining the file-based persistence for long-term storage.
///
/// Files are stored directly inside `cachedir` under the name passed to the
/// read/write methods; the same name is the key of the in-memory cache.
pub struct DatasetCache {
    /// Directory for file-based caching
    cachedir: PathBuf,
    /// In-memory cache for frequently accessed datasets, keyed by file name.
    /// Wrapped in a `RefCell` so that methods taking `&self` can update it.
    mem_cache: RefCell<TTLSizedCache<FileCacheKey, Vec<u8>>>,
    /// Maximum size of the on-disk cache in bytes (0 means unlimited)
    max_cache_size: u64,
    /// Whether to operate in offline mode (no downloads)
    offline_mode: bool,
}
275
276impl Default for DatasetCache {
277    fn default() -> Self {
278        let cachedir = get_cachedir().expect("Could not get cache directory");
279
280        let mem_cache = RefCell::new(
281            CacheBuilder::new()
282                .with_size(DEFAULT_CACHE_SIZE)
283                .with_ttl(DEFAULT_CACHE_TTL)
284                .build_sized_cache(),
285        );
286
287        // Check if offline mode is enabled via environment variable
288        let offline_mode = std::env::var("SCIRS2_OFFLINE")
289            .map(|v| v.to_lowercase() == "true" || v == "1")
290            .unwrap_or(false);
291
292        DatasetCache {
293            cachedir,
294            mem_cache,
295            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
296            offline_mode,
297        }
298    }
299}
300
301impl DatasetCache {
302    /// Create a new dataset cache with the given cache directory and default memory cache
303    pub fn new(cachedir: PathBuf) -> Self {
304        let mem_cache = RefCell::new(
305            CacheBuilder::new()
306                .with_size(DEFAULT_CACHE_SIZE)
307                .with_ttl(DEFAULT_CACHE_TTL)
308                .build_sized_cache(),
309        );
310
311        let offline_mode = std::env::var("SCIRS2_OFFLINE")
312            .map(|v| v.to_lowercase() == "true" || v == "1")
313            .unwrap_or(false);
314
315        DatasetCache {
316            cachedir,
317            mem_cache,
318            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
319            offline_mode,
320        }
321    }
322
323    /// Create a new dataset cache with custom settings
324    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
325        let mem_cache = RefCell::new(
326            CacheBuilder::new()
327                .with_size(cache_size)
328                .with_ttl(ttl_seconds)
329                .build_sized_cache(),
330        );
331
332        let offline_mode = std::env::var("SCIRS2_OFFLINE")
333            .map(|v| v.to_lowercase() == "true" || v == "1")
334            .unwrap_or(false);
335
336        DatasetCache {
337            cachedir,
338            mem_cache,
339            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
340            offline_mode,
341        }
342    }
343
344    /// Create a new dataset cache with comprehensive configuration
345    pub fn with_full_config(
346        cachedir: PathBuf,
347        cache_size: usize,
348        ttl_seconds: u64,
349        max_cache_size: u64,
350        offline_mode: bool,
351    ) -> Self {
352        let mem_cache = RefCell::new(
353            CacheBuilder::new()
354                .with_size(cache_size)
355                .with_ttl(ttl_seconds)
356                .build_sized_cache(),
357        );
358
359        DatasetCache {
360            cachedir,
361            mem_cache,
362            max_cache_size,
363            offline_mode,
364        }
365    }
366
367    /// Create the cache directory if it doesn't exist
368    pub fn ensure_cachedir(&self) -> Result<()> {
369        if !self.cachedir.exists() {
370            fs::create_dir_all(&self.cachedir).map_err(|e| {
371                DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
372            })?;
373        }
374        Ok(())
375    }
376
377    /// Get the path to a cached file
378    pub fn get_cachedpath(&self, name: &str) -> PathBuf {
379        self.cachedir.join(name)
380    }
381
382    /// Check if a file is already cached (either in memory or on disk)
383    pub fn is_cached(&self, name: &str) -> bool {
384        // Check memory cache first
385        let key = FileCacheKey(name.to_string());
386        if self.mem_cache.borrow_mut().get(&key).is_some() {
387            return true;
388        }
389
390        // Then check file system
391        self.get_cachedpath(name).exists()
392    }
393
394    /// Read a cached file as bytes
395    ///
396    /// This method checks the in-memory cache first, and falls back to the file system if needed.
397    /// When reading from the file system, the result is also stored in the in-memory cache.
398    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
399        // Try memory cache first
400        let key = FileCacheKey(name.to_string());
401        if let Some(data) = self.mem_cache.borrow_mut().get(&key) {
402            return Ok(data);
403        }
404
405        // Fall back to file system cache
406        let path = self.get_cachedpath(name);
407        if !path.exists() {
408            return Err(DatasetsError::CacheError(format!(
409                "Cached file does not exist: {name}"
410            )));
411        }
412
413        let mut file = File::open(path)
414            .map_err(|e| DatasetsError::CacheError(format!("Failed to open cached file: {e}")))?;
415
416        let mut buffer = Vec::new();
417        file.read_to_end(&mut buffer)
418            .map_err(|e| DatasetsError::CacheError(format!("Failed to read cached file: {e}")))?;
419
420        // Update memory cache
421        self.mem_cache.borrow_mut().insert(key, buffer.clone());
422
423        Ok(buffer)
424    }
425
426    /// Write data to both the file cache and memory cache
427    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
428        self.ensure_cachedir()?;
429
430        // Check if writing this file would exceed cache size limit
431        if self.max_cache_size > 0 {
432            let current_size = self.get_cache_size_bytes()?;
433            let new_file_size = data.len() as u64;
434
435            if current_size + new_file_size > self.max_cache_size {
436                self.cleanup_cache_to_fit(new_file_size)?;
437            }
438        }
439
440        // Write to file system cache
441        let path = self.get_cachedpath(name);
442        let mut file = File::create(path)
443            .map_err(|e| DatasetsError::CacheError(format!("Failed to create cache file: {e}")))?;
444
445        file.write_all(data).map_err(|e| {
446            DatasetsError::CacheError(format!("Failed to write to cache file: {e}"))
447        })?;
448
449        // Update memory cache
450        let key = FileCacheKey(name.to_string());
451        self.mem_cache.borrow_mut().insert(key, data.to_vec());
452
453        Ok(())
454    }
455
456    /// Clear the entire cache (both memory and file-based)
457    pub fn clear_cache(&self) -> Result<()> {
458        // Clear file system cache
459        if self.cachedir.exists() {
460            fs::remove_dir_all(&self.cachedir)
461                .map_err(|e| DatasetsError::CacheError(format!("Failed to clear cache: {e}")))?;
462        }
463
464        // Clear memory cache
465        self.mem_cache.borrow_mut().clear();
466
467        Ok(())
468    }
469
470    /// Remove a specific cached file (from both memory and file system)
471    pub fn remove_cached(&self, name: &str) -> Result<()> {
472        // Remove from file system
473        let path = self.get_cachedpath(name);
474        if path.exists() {
475            fs::remove_file(path).map_err(|e| {
476                DatasetsError::CacheError(format!("Failed to remove cached file: {e}"))
477            })?;
478        }
479
480        // Remove from memory cache
481        let key = FileCacheKey(name.to_string());
482        self.mem_cache.borrow_mut().remove(&key);
483
484        Ok(())
485    }
486
487    /// Compute a hash for a filename or URL
488    pub fn hash_filename(name: &str) -> String {
489        let hash = blake3::hash(name.as_bytes());
490        hash.to_hex().to_string()
491    }
492
493    /// Get the total size of the cache in bytes
494    pub fn get_cache_size_bytes(&self) -> Result<u64> {
495        let mut total_size = 0u64;
496
497        if self.cachedir.exists() {
498            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
499                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
500            })?;
501
502            for entry in entries {
503                let entry = entry.map_err(|e| {
504                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
505                })?;
506
507                if let Ok(metadata) = entry.metadata() {
508                    if metadata.is_file() {
509                        total_size += metadata.len();
510                    }
511                }
512            }
513        }
514
515        Ok(total_size)
516    }
517
518    /// Clean up cache to fit a new file of specified size
519    ///
520    /// This method removes the oldest files first until there's enough space
521    /// for the new file plus some buffer space.
522    fn cleanup_cache_to_fit(&self, needed_size: u64) -> Result<()> {
523        if self.max_cache_size == 0 {
524            return Ok(()); // No _size limit
525        }
526
527        let current_size = self.get_cache_size_bytes()?;
528        let target_size = (self.max_cache_size as f64 * 0.8) as u64; // Leave 20% buffer
529        let total_needed = current_size + needed_size;
530
531        if total_needed <= target_size {
532            return Ok(()); // No cleanup needed
533        }
534
535        let size_to_free = total_needed - target_size;
536
537        // Get all files with their modification times
538        let mut files_with_times = Vec::new();
539
540        if self.cachedir.exists() {
541            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
542                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
543            })?;
544
545            for entry in entries {
546                let entry = entry.map_err(|e| {
547                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
548                })?;
549
550                if let Ok(metadata) = entry.metadata() {
551                    if metadata.is_file() {
552                        if let Ok(modified) = metadata.modified() {
553                            files_with_times.push((entry.path(), metadata.len(), modified));
554                        }
555                    }
556                }
557            }
558        }
559
560        // Sort by modification time (oldest first)
561        files_with_times.sort_by_key(|(_path, _size, modified)| *modified);
562
563        // Remove files until we've freed enough space
564        let mut freed_size = 0u64;
565        for (path, size, _modified) in files_with_times {
566            if freed_size >= size_to_free {
567                break;
568            }
569
570            // Remove from memory cache first
571            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
572                let key = FileCacheKey(filename.to_string());
573                self.mem_cache.borrow_mut().remove(&key);
574            }
575
576            // Remove file
577            if let Err(e) = fs::remove_file(&path) {
578                eprintln!("Warning: Failed to remove cache file {path:?}: {e}");
579            } else {
580                freed_size += size;
581            }
582        }
583
584        Ok(())
585    }
586
    /// Turn offline mode on or off, overriding the value chosen at construction
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.offline_mode = offline;
    }
591
    /// Return `true` when the cache is in offline mode
    pub fn is_offline(&self) -> bool {
        self.offline_mode
    }
596
    /// Set maximum on-disk cache size in bytes (0 for unlimited).
    ///
    /// Only the stored limit changes; no files are removed by this call.
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.max_cache_size = max_size;
    }
601
    /// Get maximum on-disk cache size in bytes (0 means unlimited)
    pub fn max_cache_size(&self) -> u64 {
        self.max_cache_size
    }
606
    /// Put data into the cache (alias for `write_cached`).
    ///
    /// Forwards `name` and `data` unchanged and returns whatever
    /// `write_cached` returns.
    pub fn put(&self, name: &str, data: &[u8]) -> Result<()> {
        self.write_cached(name, data)
    }
611
612    /// Get detailed cache information
613    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
614        let mut total_size = 0u64;
615        let mut file_count = 0usize;
616        let mut files = Vec::new();
617
618        if self.cachedir.exists() {
619            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
620                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
621            })?;
622
623            for entry in entries {
624                let entry = entry.map_err(|e| {
625                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
626                })?;
627
628                if let Ok(metadata) = entry.metadata() {
629                    if metadata.is_file() {
630                        let size = metadata.len();
631                        total_size += size;
632                        file_count += 1;
633
634                        if let Some(filename) = entry.file_name().to_str() {
635                            files.push(CacheFileInfo {
636                                name: filename.to_string(),
637                                size_bytes: size,
638                                modified: metadata.modified().ok(),
639                            });
640                        }
641                    }
642                }
643            }
644        }
645
646        // Sort files by size (largest first)
647        files.sort_by_key(|f| std::cmp::Reverse(f.size_bytes));
648
649        Ok(DetailedCacheStats {
650            total_size_bytes: total_size,
651            file_count,
652            cachedir: self.cachedir.clone(),
653            max_cache_size: self.max_cache_size,
654            offline_mode: self.offline_mode,
655            files,
656        })
657    }
658}
659
/// Downloads data from a URL and returns it as bytes, using the cache when possible
///
/// The cache entry is named after the hash of the URL. Unless `force_download`
/// is set, an existing entry is returned without touching the network.
///
/// # Arguments
///
/// * `_url` - The URL to download from (used despite the underscore prefix)
/// * `force_download` - If true, force a new download instead of using cache
///
/// # Errors
///
/// * `DatasetsError::CacheError` when the cache directory is unavailable or the
///   cache cannot be read or written
/// * `DatasetsError::DownloadError` when a download is required but the cache is
///   in offline mode, the request fails, the server answers with a non-success
///   status, or the body cannot be read
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn download_data(_url: &str, force_download: bool) -> Result<Vec<u8>> {
    // Resolve the cache directory with `?` instead of going through
    // `DatasetCache::default()`, which panics when it cannot be determined.
    let cache = DatasetCache::new(get_cachedir()?);
    let cache_key = DatasetCache::hash_filename(_url);

    // Check if the data is already cached
    if !force_download && cache.is_cached(&cache_key) {
        return cache.read_cached(&cache_key);
    }

    // Offline mode means "no downloads": refuse before touching the network.
    if cache.is_offline() {
        return Err(DatasetsError::DownloadError(format!(
            "Cannot download from {_url}: offline mode is enabled"
        )));
    }

    // Download the data
    let response = reqwest::blocking::get(_url).map_err(|e| {
        DatasetsError::DownloadError(format!("Failed to download from {_url}: {e}"))
    })?;

    if !response.status().is_success() {
        return Err(DatasetsError::DownloadError(format!(
            "Failed to download from {_url}: HTTP status {}",
            response.status()
        )));
    }

    let data = response
        .bytes()
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to read response data: {e}")))?;

    let data_vec = data.to_vec();

    // Cache the data
    cache.write_cached(&cache_key, &data_vec)?;

    Ok(data_vec)
}
695
696// Stub for when download feature is not enabled
697#[cfg(not(feature = "download"))]
698/// Downloads data from a URL or retrieves it from cache
699///
700/// This is a stub implementation when the download feature is not enabled.
701/// It returns an error informing the user to enable the download feature.
702///
703/// # Arguments
704///
705/// * `_url` - The URL to download from
706/// * `_force_download` - If true, force a new download instead of using cache
707///
708/// # Returns
709///
710/// * An error indicating that the download feature is not enabled
711#[allow(dead_code)]
712pub fn download_data(_url: &str, _force_download: bool) -> Result<Vec<u8>> {
713    Err(DatasetsError::Other(
714        "Download feature is not enabled. Recompile with --features download".to_string(),
715    ))
716}
717
/// Cache management utilities
///
/// Thin wrapper around a `DatasetCache` that adds `Dataset`-level get/put
/// (serialized as JSON) plus statistics and reporting helpers.
pub struct CacheManager {
    /// The underlying file + in-memory cache all operations are forwarded to
    cache: DatasetCache,
}
722
723impl CacheManager {
724    /// Create a new cache manager with default settings
725    pub fn new() -> Result<Self> {
726        let cachedir = get_cachedir()?;
727        Ok(Self {
728            cache: DatasetCache::with_config(cachedir, DEFAULT_CACHE_SIZE, DEFAULT_CACHE_TTL),
729        })
730    }
731
732    /// Create a new cache manager with custom settings
733    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
734        Self {
735            cache: DatasetCache::with_config(cachedir, cache_size, ttl_seconds),
736        }
737    }
738
739    /// Get a dataset from cache using CacheKey
740    pub fn get(&self, key: &CacheKey) -> Result<Option<crate::utils::Dataset>> {
741        let name = key.as_string();
742        if self.cache.is_cached(&name) {
743            match self.cache.read_cached(&name) {
744                Ok(cached_data) => {
745                    match serde_json::from_slice::<crate::utils::Dataset>(&cached_data) {
746                        Ok(dataset) => Ok(Some(dataset)),
747                        Err(e) => {
748                            // If deserialization fails, consider the cache entry invalid
749                            self.cache
750                                .mem_cache
751                                .borrow_mut()
752                                .remove(&FileCacheKey(name.clone()));
753                            Err(DatasetsError::CacheError(format!(
754                                "Failed to deserialize cached dataset: {e}"
755                            )))
756                        }
757                    }
758                }
759                Err(e) => Err(DatasetsError::CacheError(format!(
760                    "Failed to read cached data: {e}"
761                ))),
762            }
763        } else {
764            Ok(None)
765        }
766    }
767
768    /// Put a dataset into cache using CacheKey
769    pub fn put(&self, key: &CacheKey, dataset: &crate::utils::Dataset) -> Result<()> {
770        let name = key.as_string();
771
772        // Serialize the dataset to JSON bytes for caching
773        let serialized = serde_json::to_vec(dataset)
774            .map_err(|e| DatasetsError::CacheError(format!("Failed to serialize dataset: {e}")))?;
775
776        // Write the serialized data to cache
777        self.cache
778            .write_cached(&name, &serialized)
779            .map_err(|e| DatasetsError::CacheError(format!("Failed to write to cache: {e}")))
780    }
781
782    /// Create a cache manager with comprehensive configuration
783    pub fn with_full_config(
784        cachedir: PathBuf,
785        cache_size: usize,
786        ttl_seconds: u64,
787        max_cache_size: u64,
788        offline_mode: bool,
789    ) -> Self {
790        Self {
791            cache: DatasetCache::with_full_config(
792                cachedir,
793                cache_size,
794                ttl_seconds,
795                max_cache_size,
796                offline_mode,
797            ),
798        }
799    }
800
801    /// Get basic cache statistics
802    pub fn get_stats(&self) -> CacheStats {
803        let cachedir = &self.cache.cachedir;
804        let mut total_size = 0u64;
805        let mut file_count = 0usize;
806
807        if cachedir.exists() {
808            if let Ok(entries) = fs::read_dir(cachedir) {
809                for entry in entries.flatten() {
810                    if let Ok(metadata) = entry.metadata() {
811                        if metadata.is_file() {
812                            total_size += metadata.len();
813                            file_count += 1;
814                        }
815                    }
816                }
817            }
818        }
819
820        CacheStats {
821            total_size_bytes: total_size,
822            file_count,
823            cachedir: cachedir.clone(),
824        }
825    }
826
    /// Get detailed cache statistics, including per-file information.
    ///
    /// Forwards to `DatasetCache::get_detailed_stats` and returns its result,
    /// including any error from listing the cache directory.
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        self.cache.get_detailed_stats()
    }
831
    /// Turn offline mode of the underlying cache on or off
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.cache.set_offline_mode(offline);
    }
836
    /// Return `true` when the underlying cache is in offline mode
    pub fn is_offline(&self) -> bool {
        self.cache.is_offline()
    }
841
    /// Set maximum on-disk cache size in bytes (0 for unlimited).
    ///
    /// Only the stored limit is updated; this call does not trigger a cleanup.
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.cache.set_max_cache_size(max_size);
    }
846
    /// Get maximum on-disk cache size in bytes (0 means unlimited)
    pub fn max_cache_size(&self) -> u64 {
        self.cache.max_cache_size()
    }
851
    /// Clear all cached data, on disk and in memory.
    ///
    /// Forwards to `DatasetCache::clear_cache`, which removes the cache
    /// directory itself along with its contents.
    pub fn clear_all(&self) -> Result<()> {
        self.cache.clear_cache()
    }
856
    /// Remove a specific cached file from disk and from the in-memory cache.
    ///
    /// `name` is the file name inside the cache directory (for entries stored
    /// through `put`, that is `CacheKey::as_string()`).
    pub fn remove(&self, name: &str) -> Result<()> {
        self.cache.remove_cached(name)
    }
861
    /// Remove old files to free up space
    ///
    /// Forwards `target_size` to `DatasetCache::cleanup_cache_to_fit`, which
    /// evicts the oldest files until the current cache size plus the given
    /// value is at most 80% of the configured maximum (and does nothing when
    /// the maximum is 0, i.e. unlimited).
    // NOTE(review): the argument is treated as the size of an incoming file, not
    // as a size to shrink the cache down to, which the parameter name
    // `target_size` suggests — confirm which behaviour callers expect.
    pub fn cleanup_old_files(&self, target_size: u64) -> Result<()> {
        self.cache.cleanup_cache_to_fit(target_size)
    }
866
867    /// List all cached files
868    pub fn list_cached_files(&self) -> Result<Vec<String>> {
869        let cachedir = &self.cache.cachedir;
870        let mut files = Vec::new();
871
872        if cachedir.exists() {
873            let entries = fs::read_dir(cachedir).map_err(|e| {
874                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
875            })?;
876
877            for entry in entries {
878                let entry = entry.map_err(|e| {
879                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
880                })?;
881
882                if let Some(filename) = entry.file_name().to_str() {
883                    files.push(filename.to_string());
884                }
885            }
886        }
887
888        files.sort();
889        Ok(files)
890    }
891
    /// Get the directory used by the underlying cache for on-disk storage
    pub fn cachedir(&self) -> &PathBuf {
        &self.cache.cachedir
    }
896
    /// Check if a file is cached, either in memory or on disk.
    ///
    /// `name` is the file name inside the cache directory.
    pub fn is_cached(&self, name: &str) -> bool {
        self.cache.is_cached(name)
    }
901
902    /// Print detailed cache report
903    pub fn print_cache_report(&self) -> Result<()> {
904        let stats = self.get_detailed_stats()?;
905
906        println!("=== Cache Report ===");
907        println!("Cache Directory: {}", stats.cachedir.display());
908        println!(
909            "Total Size: {} ({} files)",
910            stats.formatted_size(),
911            stats.file_count
912        );
913        println!("Max Size: {}", stats.formatted_max_size());
914
915        if stats.max_cache_size > 0 {
916            println!("Usage: {:.1}%", stats.usage_percentage() * 100.0);
917        }
918
919        println!(
920            "Offline Mode: {}",
921            if stats.offline_mode {
922                "Enabled"
923            } else {
924                "Disabled"
925            }
926        );
927
928        if !stats.files.is_empty() {
929            println!("\nCached Files:");
930            for file in &stats.files {
931                println!(
932                    "  {} - {} ({})",
933                    file.name,
934                    file.formatted_size(),
935                    file.formatted_modified()
936                );
937            }
938        }
939
940        Ok(())
941    }
942}
943
/// Cache statistics
///
/// Aggregate numbers only; see `DetailedCacheStats` for per-file data.
/// Derives `Debug` and `Clone` so values can be logged and copied, in line
/// with the other public record types in this module.
#[derive(Debug, Clone)]
pub struct CacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
}
953
954/// Detailed cache statistics with file-level information
955pub struct DetailedCacheStats {
956    /// Total size of all cached files in bytes
957    pub total_size_bytes: u64,
958    /// Number of cached files
959    pub file_count: usize,
960    /// Cache directory path
961    pub cachedir: PathBuf,
962    /// Maximum cache size (0 = unlimited)
963    pub max_cache_size: u64,
964    /// Whether cache is in offline mode
965    pub offline_mode: bool,
966    /// Information about individual cached files
967    pub files: Vec<CacheFileInfo>,
968}
969
/// Information about a cached file
///
/// Plain data record; the formatting helpers implemented on this type turn
/// `size_bytes` and `modified` into display strings.
#[derive(Debug, Clone)]
pub struct CacheFileInfo {
    /// Name of the cached file
    pub name: String,
    /// Size in bytes
    pub size_bytes: u64,
    /// Last modified time (`None` is rendered as "Unknown" by
    /// `formatted_modified`)
    pub modified: Option<std::time::SystemTime>,
}
980
impl CacheStats {
    /// Get total size formatted as human-readable string
    ///
    /// Passes `total_size_bytes` through `format_bytes`, which picks a
    /// B / KB / MB / GB unit based on magnitude.
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }
}
987
988impl DetailedCacheStats {
989    /// Get total size formatted as human-readable string
990    pub fn formatted_size(&self) -> String {
991        format_bytes(self.total_size_bytes)
992    }
993
994    /// Get max cache size formatted as human-readable string
995    pub fn formatted_max_size(&self) -> String {
996        if self.max_cache_size == 0 {
997            "Unlimited".to_string()
998        } else {
999            format_bytes(self.max_cache_size)
1000        }
1001    }
1002
1003    /// Get cache usage percentage (0.0-1.0)
1004    pub fn usage_percentage(&self) -> f64 {
1005        if self.max_cache_size == 0 {
1006            0.0
1007        } else {
1008            self.total_size_bytes as f64 / self.max_cache_size as f64
1009        }
1010    }
1011}
1012
1013impl CacheFileInfo {
1014    /// Get file size formatted as human-readable string
1015    pub fn formatted_size(&self) -> String {
1016        format_bytes(self.size_bytes)
1017    }
1018
1019    /// Get formatted modification time
1020    pub fn formatted_modified(&self) -> String {
1021        match &self.modified {
1022            Some(time) => {
1023                if let Ok(now) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)
1024                {
1025                    if let Ok(modified) = time.duration_since(std::time::UNIX_EPOCH) {
1026                        let diff_secs = now.as_secs().saturating_sub(modified.as_secs());
1027                        let days = diff_secs / 86400;
1028                        let hours = (diff_secs % 86400) / 3600;
1029                        let mins = (diff_secs % 3600) / 60;
1030
1031                        if days > 0 {
1032                            format!("{days} days ago")
1033                        } else if hours > 0 {
1034                            format!("{hours} hours ago")
1035                        } else if mins > 0 {
1036                            format!("{mins} minutes ago")
1037                        } else {
1038                            "Just now".to_string()
1039                        }
1040                    } else {
1041                        "Unknown".to_string()
1042                    }
1043                } else {
1044                    "Unknown".to_string()
1045                }
1046            }
1047            None => "Unknown".to_string(),
1048        }
1049    }
1050}
1051
/// Render a byte count using binary multiples (1 KB = 1024 B).
///
/// Counts below 1024 are printed as a whole number of bytes (`"512 B"`).
/// Larger counts are scaled to KB, MB or GB and printed with one decimal
/// place. GB is the largest unit, so very large counts stay in GB.
#[allow(dead_code)]
fn format_bytes(bytes: u64) -> String {
    const KIB: u64 = 1024;
    const MIB: u64 = KIB * 1024;
    const GIB: u64 = MIB * 1024;

    // Scale in floating point so the fractional part survives.
    let scaled = |unit: u64| bytes as f64 / unit as f64;

    match bytes {
        b if b < KIB => format!("{b} B"),
        b if b < MIB => format!("{:.1} KB", scaled(KIB)),
        b if b < GIB => format!("{:.1} MB", scaled(MIB)),
        _ => format!("{:.1} GB", scaled(GIB)),
    }
}
1066
/// Batch operation result containing success/failure information
///
/// The batch methods on `BatchOperations` fill one of these in as they work
/// through their items.
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// Number of successful operations
    pub success_count: usize,
    /// Number of failed operations
    pub failure_count: usize,
    /// List of failed items with error messages, as `(item name, message)`
    /// pairs
    pub failures: Vec<(String, String)>,
    /// Total bytes processed
    pub total_bytes: u64,
    /// Total time taken for the batch operation
    pub elapsed_time: std::time::Duration,
}
1081
1082impl BatchResult {
1083    /// Create a new empty batch result
1084    pub fn new() -> Self {
1085        Self {
1086            success_count: 0,
1087            failure_count: 0,
1088            failures: Vec::new(),
1089            total_bytes: 0,
1090            elapsed_time: std::time::Duration::ZERO,
1091        }
1092    }
1093
1094    /// Check if all operations were successful
1095    pub fn is_all_success(&self) -> bool {
1096        self.failure_count == 0
1097    }
1098
1099    /// Get success rate as percentage
1100    pub fn success_rate(&self) -> f64 {
1101        let total = self.success_count + self.failure_count;
1102        if total == 0 {
1103            0.0
1104        } else {
1105            (self.success_count as f64 / total as f64) * 100.0
1106        }
1107    }
1108
1109    /// Get formatted summary
1110    pub fn summary(&self) -> String {
1111        format!(
1112            "Batch completed: {}/{} successful ({:.1}%), {} bytes processed in {:.2}s",
1113            self.success_count,
1114            self.success_count + self.failure_count,
1115            self.success_rate(),
1116            format_bytes(self.total_bytes),
1117            self.elapsed_time.as_secs_f64()
1118        )
1119    }
1120}
1121
impl Default for BatchResult {
    /// Same as [`BatchResult::new`]: an empty result with every counter at
    /// zero.
    fn default() -> Self {
        Self::new()
    }
}
1127
/// Batch operations manager for dataset caching
pub struct BatchOperations {
    /// Cache manager that every batch operation reads from and writes to
    cache: CacheManager,
    /// When true, `batch_download` and `batch_process` take their parallel
    /// code path; otherwise they work through items one at a time
    parallel: bool,
    /// Number of additional download attempts after the first one fails
    /// (the retry loops iterate over `0..=max_retries`)
    max_retries: usize,
    /// Pause between a failed download attempt and the next attempt
    retry_delay: std::time::Duration,
}
1135
1136impl BatchOperations {
    /// Create a new batch operations manager
    ///
    /// Takes ownership of `cache`. Defaults: parallel processing enabled,
    /// 3 retries, and a 1000 ms delay between retries. Use `with_parallel`
    /// and `with_retry_config` to change them.
    pub fn new(cache: CacheManager) -> Self {
        Self {
            cache,
            parallel: true,
            max_retries: 3,
            retry_delay: std::time::Duration::from_millis(1000),
        }
    }
1146
    /// Configure parallel processing
    ///
    /// Builder-style setter: consumes `self`, stores `parallel`, and returns
    /// the updated manager so calls can be chained.
    pub fn with_parallel(mut self, parallel: bool) -> Self {
        self.parallel = parallel;
        self
    }
1152
    /// Configure retry settings
    ///
    /// Builder-style setter: consumes `self` and returns the updated manager.
    ///
    /// * `max_retries` - additional download attempts after the first failure
    /// * `retry_delay` - pause between a failed download attempt and the next
    pub fn with_retry_config(
        mut self,
        max_retries: usize,
        retry_delay: std::time::Duration,
    ) -> Self {
        self.max_retries = max_retries;
        self.retry_delay = retry_delay;
        self
    }
1163
1164    /// Download multiple datasets in batch
1165    #[cfg(feature = "download")]
1166    pub fn batch_download(&self, urls_andnames: &[(&str, &str)]) -> BatchResult {
1167        let start_time = std::time::Instant::now();
1168        let mut result = BatchResult::new();
1169
1170        if self.parallel {
1171            self.batch_download_parallel(urls_andnames, &mut result)
1172        } else {
1173            self.batch_download_sequential(urls_andnames, &mut result)
1174        }
1175
1176        result.elapsed_time = start_time.elapsed();
1177        result
1178    }
1179
1180    #[cfg(feature = "download")]
1181    fn batch_download_parallel(&self, urls_andnames: &[(&str, &str)], result: &mut BatchResult) {
1182        use std::fs::File;
1183        use std::io::Write;
1184        use std::sync::{Arc, Mutex};
1185        use std::thread;
1186
1187        // Ensure cache directory exists before spawning threads
1188        if let Err(e) = self.cache.cache.ensure_cachedir() {
1189            result.failure_count += urls_andnames.len();
1190            for &(_, name) in urls_andnames {
1191                result
1192                    .failures
1193                    .push((name.to_string(), format!("Cache setup failed: {e}")));
1194            }
1195            return;
1196        }
1197
1198        let result_arc = Arc::new(Mutex::new(BatchResult::new()));
1199        let cachedir = self.cache.cache.cachedir.clone();
1200        let max_retries = self.max_retries;
1201        let retry_delay = self.retry_delay;
1202
1203        let handles: Vec<_> = urls_andnames
1204            .iter()
1205            .map(|&(url, name)| {
1206                let result_clone = Arc::clone(&result_arc);
1207                let url = url.to_string();
1208                let name = name.to_string();
1209                let cachedir = cachedir.clone();
1210
1211                thread::spawn(move || {
1212                    let mut success = false;
1213                    let mut last_error = String::new();
1214                    let mut downloaded_data = Vec::new();
1215
1216                    for attempt in 0..=max_retries {
1217                        match download_data(&url, false) {
1218                            Ok(data) => {
1219                                // Write directly to filesystem (bypassing RefCell memory cache)
1220                                let path = cachedir.join(&name);
1221                                match File::create(&path) {
1222                                    Ok(mut file) => match file.write_all(&data) {
1223                                        Ok(_) => {
1224                                            let mut r =
1225                                                result_clone.lock().expect("Operation failed");
1226                                            r.success_count += 1;
1227                                            r.total_bytes += data.len() as u64;
1228                                            downloaded_data = data;
1229                                            success = true;
1230                                            break;
1231                                        }
1232                                        Err(e) => {
1233                                            last_error = format!("Failed to write cache file: {e}");
1234                                        }
1235                                    },
1236                                    Err(e) => {
1237                                        last_error = format!("Failed to create cache file: {e}");
1238                                    }
1239                                }
1240                            }
1241                            Err(e) => {
1242                                last_error = format!("Download failed: {e}");
1243                                if attempt < max_retries {
1244                                    thread::sleep(retry_delay);
1245                                }
1246                            }
1247                        }
1248                    }
1249
1250                    if !success {
1251                        let mut r = result_clone.lock().expect("Operation failed");
1252                        r.failure_count += 1;
1253                        r.failures.push((name.clone(), last_error));
1254                    }
1255
1256                    (name, success, downloaded_data)
1257                })
1258            })
1259            .collect();
1260
1261        // Collect results and update memory cache for successful downloads
1262        let mut successful_downloads = Vec::new();
1263        for handle in handles {
1264            if let Ok((name, success, data)) = handle.join() {
1265                if success && !data.is_empty() {
1266                    successful_downloads.push((name, data));
1267                }
1268            }
1269        }
1270
1271        // Merge the results from the arc back into the original result
1272        if let Ok(arc_result) = result_arc.lock() {
1273            result.success_count += arc_result.success_count;
1274            result.failure_count += arc_result.failure_count;
1275            result.failures.extend(arc_result.failures.clone());
1276        }
1277
1278        // Update memory cache after all threads complete
1279        for (name, data) in successful_downloads {
1280            let key = FileCacheKey(name);
1281            self.cache.cache.mem_cache.borrow_mut().insert(key, data);
1282        }
1283    }
1284
1285    #[cfg(feature = "download")]
1286    fn batch_download_sequential(&self, urls_andnames: &[(&str, &str)], result: &mut BatchResult) {
1287        for &(url, name) in urls_andnames {
1288            let mut success = false;
1289            let mut last_error = String::new();
1290
1291            for attempt in 0..=self.max_retries {
1292                match download_data(url, false) {
1293                    Ok(data) => match self.cache.cache.write_cached(name, &data) {
1294                        Ok(_) => {
1295                            result.success_count += 1;
1296                            result.total_bytes += data.len() as u64;
1297                            success = true;
1298                            break;
1299                        }
1300                        Err(e) => {
1301                            last_error = format!("Cache write failed: {e}");
1302                        }
1303                    },
1304                    Err(e) => {
1305                        last_error = format!("Download failed: {e}");
1306                        if attempt < self.max_retries {
1307                            std::thread::sleep(self.retry_delay);
1308                        }
1309                    }
1310                }
1311            }
1312
1313            if !success {
1314                result.failure_count += 1;
1315                result.failures.push((name.to_string(), last_error));
1316            }
1317        }
1318    }
1319
1320    /// Verify integrity of multiple cached files
1321    pub fn batch_verify_integrity(&self, files_andhashes: &[(&str, &str)]) -> BatchResult {
1322        let start_time = std::time::Instant::now();
1323        let mut result = BatchResult::new();
1324
1325        for &(filename, expected_hash) in files_andhashes {
1326            match self.cache.cache.get_cachedpath(filename).exists() {
1327                true => match sha256_hash_file(&self.cache.cache.get_cachedpath(filename)) {
1328                    Ok(actual_hash) => {
1329                        if actual_hash == expected_hash {
1330                            result.success_count += 1;
1331                            if let Ok(metadata) =
1332                                std::fs::metadata(self.cache.cache.get_cachedpath(filename))
1333                            {
1334                                result.total_bytes += metadata.len();
1335                            }
1336                        } else {
1337                            result.failure_count += 1;
1338                            result.failures.push((
1339                                filename.to_string(),
1340                                format!(
1341                                    "Hash mismatch: expected {expected_hash}, got {actual_hash}"
1342                                ),
1343                            ));
1344                        }
1345                    }
1346                    Err(e) => {
1347                        result.failure_count += 1;
1348                        result.failures.push((
1349                            filename.to_string(),
1350                            format!("Hash computation failed: {e}"),
1351                        ));
1352                    }
1353                },
1354                false => {
1355                    result.failure_count += 1;
1356                    result
1357                        .failures
1358                        .push((filename.to_string(), "File not found in cache".to_string()));
1359                }
1360            }
1361        }
1362
1363        result.elapsed_time = start_time.elapsed();
1364        result
1365    }
1366
1367    /// Clean up cache selectively based on patterns
1368    pub fn selective_cleanup(
1369        &self,
1370        patterns: &[&str],
1371        max_age_days: Option<u32>,
1372    ) -> Result<BatchResult> {
1373        let start_time = std::time::Instant::now();
1374        let mut result = BatchResult::new();
1375
1376        let cached_files = self.cache.list_cached_files()?;
1377        let now = std::time::SystemTime::now();
1378
1379        for filename in cached_files {
1380            let should_remove = patterns.iter().any(|pattern| {
1381                filename.contains(pattern) || matches_glob_pattern(&filename, pattern)
1382            });
1383
1384            if should_remove {
1385                let filepath = self.cache.cache.get_cachedpath(&filename);
1386
1387                // Check age if max_age_days is specified
1388                let remove_due_to_age = if let Some(max_age) = max_age_days {
1389                    if let Ok(metadata) = std::fs::metadata(&filepath) {
1390                        if let Ok(modified) = metadata.modified() {
1391                            if let Ok(age) = now.duration_since(modified) {
1392                                age.as_secs() > (max_age as u64 * 24 * 3600)
1393                            } else {
1394                                false
1395                            }
1396                        } else {
1397                            false
1398                        }
1399                    } else {
1400                        false
1401                    }
1402                } else {
1403                    true // Remove regardless of age if no age limit specified
1404                };
1405
1406                if remove_due_to_age {
1407                    match self.cache.remove(&filename) {
1408                        Ok(_) => {
1409                            result.success_count += 1;
1410                            if let Ok(metadata) = std::fs::metadata(&filepath) {
1411                                result.total_bytes += metadata.len();
1412                            }
1413                        }
1414                        Err(e) => {
1415                            result.failure_count += 1;
1416                            result
1417                                .failures
1418                                .push((filename, format!("Removal failed: {e}")));
1419                        }
1420                    }
1421                }
1422            }
1423        }
1424
1425        result.elapsed_time = start_time.elapsed();
1426        Ok(result)
1427    }
1428
1429    /// Process multiple datasets with a given function
1430    pub fn batch_process<F, T, E>(&self, names: &[String], processor: F) -> BatchResult
1431    where
1432        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1433        E: std::fmt::Display,
1434        T: Send,
1435    {
1436        let start_time = std::time::Instant::now();
1437        let mut result = BatchResult::new();
1438
1439        if self.parallel {
1440            self.batch_process_parallel(names, processor, &mut result)
1441        } else {
1442            self.batch_process_sequential(names, processor, &mut result)
1443        }
1444
1445        result.elapsed_time = start_time.elapsed();
1446        result
1447    }
1448
1449    fn batch_process_parallel<F, T, E>(
1450        &self,
1451        names: &[String],
1452        processor: F,
1453        result: &mut BatchResult,
1454    ) where
1455        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1456        E: std::fmt::Display,
1457        T: Send,
1458    {
1459        // For thread safety with the current cache implementation,
1460        // we need to read all data first, then process in parallel
1461        let mut data_pairs = Vec::new();
1462
1463        // Sequential read phase
1464        for name in names {
1465            match self.cache.cache.read_cached(name) {
1466                Ok(data) => data_pairs.push((name.clone(), data)),
1467                Err(e) => {
1468                    result.failure_count += 1;
1469                    result
1470                        .failures
1471                        .push((name.clone(), format!("Cache read failed: {e}")));
1472                }
1473            }
1474        }
1475
1476        // Parallel processing phase
1477        if !data_pairs.is_empty() {
1478            use std::sync::{Arc, Mutex};
1479            use std::thread;
1480
1481            let parallel_result = Arc::new(Mutex::new(BatchResult::new()));
1482            let processor = Arc::new(processor);
1483
1484            let handles: Vec<_> = data_pairs
1485                .into_iter()
1486                .map(|(name, data)| {
1487                    let result_clone = Arc::clone(&parallel_result);
1488                    let processor_clone = Arc::clone(&processor);
1489
1490                    thread::spawn(move || match processor_clone(&name, &data) {
1491                        Ok(_) => {
1492                            let mut r = result_clone.lock().expect("Operation failed");
1493                            r.success_count += 1;
1494                            r.total_bytes += data.len() as u64;
1495                        }
1496                        Err(e) => {
1497                            let mut r = result_clone.lock().expect("Operation failed");
1498                            r.failure_count += 1;
1499                            r.failures.push((name, format!("Processing failed: {e}")));
1500                        }
1501                    })
1502                })
1503                .collect();
1504
1505            for handle in handles {
1506                let _ = handle.join();
1507            }
1508
1509            // Merge parallel results into main result
1510            let parallel_result = parallel_result.lock().expect("Operation failed");
1511            result.success_count += parallel_result.success_count;
1512            result.failure_count += parallel_result.failure_count;
1513            result.total_bytes += parallel_result.total_bytes;
1514            result.failures.extend(parallel_result.failures.clone());
1515        }
1516    }
1517
1518    fn batch_process_sequential<F, T, E>(
1519        &self,
1520        names: &[String],
1521        processor: F,
1522        result: &mut BatchResult,
1523    ) where
1524        F: Fn(&str, &[u8]) -> std::result::Result<T, E>,
1525        E: std::fmt::Display,
1526    {
1527        for name in names {
1528            match self.cache.cache.read_cached(name) {
1529                Ok(data) => match processor(name, &data) {
1530                    Ok(_) => {
1531                        result.success_count += 1;
1532                        result.total_bytes += data.len() as u64;
1533                    }
1534                    Err(e) => {
1535                        result.failure_count += 1;
1536                        result
1537                            .failures
1538                            .push((name.clone(), format!("Processing failed: {e}")));
1539                    }
1540                },
1541                Err(e) => {
1542                    result.failure_count += 1;
1543                    result
1544                        .failures
1545                        .push((name.clone(), format!("Cache read failed: {e}")));
1546                }
1547            }
1548        }
1549    }
1550
    /// Get access to the underlying cache manager
    ///
    /// Returns a shared borrow of the `CacheManager` this instance was built
    /// with.
    pub fn cache_manager(&self) -> &CacheManager {
        &self.cache
    }
1555
    /// Write data to cache
    ///
    /// Forwards `name` and `data` to the inner cache's `write_cached` and
    /// returns its result unchanged.
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.cache.cache.write_cached(name, data)
    }
1560
    /// Read data from cache
    ///
    /// Forwards `name` to the inner cache's `read_cached` and returns its
    /// result (the entry's bytes, or the error) unchanged.
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        self.cache.cache.read_cached(name)
    }
1565
    /// List cached files
    ///
    /// Delegates to the cache manager's `list_cached_files` and returns its
    /// result unchanged.
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        self.cache.list_cached_files()
    }
1570
    /// Print cache report
    ///
    /// Delegates to the cache manager's `print_cache_report` and returns its
    /// result unchanged.
    pub fn print_cache_report(&self) -> Result<()> {
        self.cache.print_cache_report()
    }
1575
1576    /// Get statistics about cached datasets
1577    pub fn get_cache_statistics(&self) -> Result<BatchResult> {
1578        let start_time = std::time::Instant::now();
1579        let mut result = BatchResult::new();
1580
1581        let cached_files = self.cache.list_cached_files()?;
1582
1583        for filename in cached_files {
1584            let filepath = self.cache.cache.get_cachedpath(&filename);
1585            match std::fs::metadata(&filepath) {
1586                Ok(metadata) => {
1587                    result.success_count += 1;
1588                    result.total_bytes += metadata.len();
1589                }
1590                Err(e) => {
1591                    result.failure_count += 1;
1592                    result
1593                        .failures
1594                        .push((filename, format!("Metadata read failed: {e}")));
1595                }
1596            }
1597        }
1598
1599        result.elapsed_time = start_time.elapsed();
1600        Ok(result)
1601    }
1602}
1603
/// Simple glob pattern matching for filenames
///
/// The only wildcard is `*`, which matches any run of characters, including
/// an empty one. A pattern may contain any number of `*`. A pattern without
/// `*` must equal the filename exactly. There is no escaping, so a literal
/// `*` in a filename can only be matched by a wildcard.
///
/// The text before the first `*` must be a prefix of `filename`, the text
/// after the last `*` must be a suffix of what remains, and the literal
/// pieces between wildcards must appear in between, in order and without
/// overlapping. This means `"ab"` does not match `"ab*b"`.
#[allow(dead_code)]
fn matches_glob_pattern(filename: &str, pattern: &str) -> bool {
    let (prefix, tail) = match pattern.split_once('*') {
        Some(parts) => parts,
        // No wildcard at all: plain equality.
        None => return filename == pattern,
    };

    // Anchor the literal prefix, then work only on what follows it so the
    // suffix can never reuse characters already claimed by the prefix.
    let remaining = match filename.strip_prefix(prefix) {
        Some(rest) => rest,
        None => return false,
    };

    let (middle, suffix) = match tail.rsplit_once('*') {
        Some((middle, suffix)) => (Some(middle), suffix),
        None => (None, tail),
    };

    let mut remaining = match remaining.strip_suffix(suffix) {
        Some(rest) => rest,
        None => return false,
    };

    // Literal pieces between wildcards: taking the leftmost occurrence of
    // each leaves the most room for the pieces that follow.
    if let Some(middle) = middle {
        for piece in middle.split('*').filter(|piece| !piece.is_empty()) {
            match remaining.find(piece) {
                Some(at) => remaining = &remaining[at + piece.len()..],
                None => return false,
            }
        }
    }

    true
}
1622
1623#[cfg(test)]
1624mod tests {
1625    use super::*;
1626    use tempfile::TempDir;
1627
1628    #[test]
1629    fn test_batch_result() {
1630        let mut result = BatchResult::new();
1631        assert_eq!(result.success_count, 0);
1632        assert_eq!(result.failure_count, 0);
1633        assert!(result.is_all_success());
1634        assert_eq!(result.success_rate(), 0.0);
1635
1636        result.success_count = 8;
1637        result.failure_count = 2;
1638        result.total_bytes = 1024;
1639
1640        assert!(!result.is_all_success());
1641        assert_eq!(result.success_rate(), 80.0);
1642        assert!(result.summary().contains("8/10 successful"));
1643        assert!(result.summary().contains("80.0%"));
1644    }
1645
1646    #[test]
1647    fn test_batch_operations_creation() {
1648        let tempdir = TempDir::new().expect("Operation failed");
1649        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1650        let batch_ops = BatchOperations::new(cache_manager)
1651            .with_parallel(false)
1652            .with_retry_config(2, std::time::Duration::from_millis(500));
1653
1654        assert!(!batch_ops.parallel);
1655        assert_eq!(batch_ops.max_retries, 2);
1656    }
1657
1658    #[test]
1659    fn test_selective_cleanup() {
1660        let tempdir = TempDir::new().expect("Operation failed");
1661        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1662        let batch_ops = BatchOperations::new(cache_manager);
1663
1664        // Create some test files
1665        let test_data = vec![0u8; 100];
1666        batch_ops
1667            .cache
1668            .cache
1669            .write_cached("test1.csv", &test_data)
1670            .expect("Test: cache operation failed");
1671        batch_ops
1672            .cache
1673            .cache
1674            .write_cached("test2.csv", &test_data)
1675            .expect("Test: cache operation failed");
1676        batch_ops
1677            .cache
1678            .cache
1679            .write_cached("data.json", &test_data)
1680            .expect("Test: cache operation failed");
1681
1682        // Clean up files matching pattern
1683        let result = batch_ops
1684            .selective_cleanup(&["*.csv"], None)
1685            .expect("Operation failed");
1686
1687        assert_eq!(result.success_count, 2); // Should remove test1.csv and test2.csv
1688        assert!(!batch_ops.cache.is_cached("test1.csv"));
1689        assert!(!batch_ops.cache.is_cached("test2.csv"));
1690        assert!(batch_ops.cache.is_cached("data.json")); // Should remain
1691    }
1692
1693    #[test]
1694    fn test_batch_process() {
1695        let tempdir = TempDir::new().expect("Operation failed");
1696        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1697        let batch_ops = BatchOperations::new(cache_manager).with_parallel(false);
1698
1699        // Create test files
1700        let test_data1 = vec![1u8; 100];
1701        let test_data2 = vec![2u8; 200];
1702        batch_ops
1703            .cache
1704            .cache
1705            .write_cached("file1.dat", &test_data1)
1706            .expect("Test: cache operation failed");
1707        batch_ops
1708            .cache
1709            .cache
1710            .write_cached("file2.dat", &test_data2)
1711            .expect("Test: cache operation failed");
1712
1713        let files = vec!["file1.dat".to_string(), "file2.dat".to_string()];
1714
1715        // Process files (verify they're non-empty)
1716        let result = batch_ops.batch_process(&files, |_name, data| {
1717            if data.is_empty() {
1718                Err("Empty file")
1719            } else {
1720                Ok(data.len())
1721            }
1722        });
1723
1724        assert_eq!(result.success_count, 2);
1725        assert_eq!(result.failure_count, 0);
1726        assert_eq!(result.total_bytes, 300); // 100 + 200
1727    }
1728
1729    #[test]
1730    fn test_get_cache_statistics() {
1731        let tempdir = TempDir::new().expect("Operation failed");
1732        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1733        let batch_ops = BatchOperations::new(cache_manager);
1734
1735        // Start with empty cache
1736        let result = batch_ops.get_cache_statistics().expect("Operation failed");
1737        assert_eq!(result.success_count, 0);
1738
1739        // Add some files
1740        let test_data = vec![0u8; 500];
1741        batch_ops
1742            .cache
1743            .cache
1744            .write_cached("test1.dat", &test_data)
1745            .expect("Test: cache operation failed");
1746        batch_ops
1747            .cache
1748            .cache
1749            .write_cached("test2.dat", &test_data)
1750            .expect("Test: cache operation failed");
1751
1752        let result = batch_ops.get_cache_statistics().expect("Operation failed");
1753        assert_eq!(result.success_count, 2);
1754        assert_eq!(result.total_bytes, 1000);
1755    }
1756
1757    #[test]
1758    fn test_matches_glob_pattern() {
1759        assert!(matches_glob_pattern("test.csv", "*"));
1760        assert!(matches_glob_pattern("test.csv", "*.csv"));
1761        assert!(matches_glob_pattern("test.csv", "test.*"));
1762        assert!(matches_glob_pattern("test.csv", "test.csv"));
1763
1764        assert!(!matches_glob_pattern("test.json", "*.csv"));
1765        assert!(!matches_glob_pattern("other.csv", "test.*"));
1766    }
1767
1768    #[test]
1769    fn test_cache_manager_creation() {
1770        let tempdir = TempDir::new().expect("Operation failed");
1771        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1772        let stats = manager.get_stats();
1773        assert_eq!(stats.file_count, 0);
1774    }
1775
1776    #[test]
1777    fn test_cache_stats_formatting() {
1778        let tempdir = TempDir::new().expect("Operation failed");
1779        let stats = CacheStats {
1780            total_size_bytes: 1024,
1781            file_count: 1,
1782            cachedir: tempdir.path().to_path_buf(),
1783        };
1784
1785        assert_eq!(stats.formatted_size(), "1.0 KB");
1786
1787        let stats_large = CacheStats {
1788            total_size_bytes: 1024 * 1024 * 1024,
1789            file_count: 1,
1790            cachedir: tempdir.path().to_path_buf(),
1791        };
1792
1793        assert_eq!(stats_large.formatted_size(), "1.0 GB");
1794    }
1795
1796    #[test]
1797    fn test_hash_file_name() {
1798        let hash1 = DatasetCache::hash_filename("test.csv");
1799        let hash2 = DatasetCache::hash_filename("test.csv");
1800        let hash3 = DatasetCache::hash_filename("different.csv");
1801
1802        assert_eq!(hash1, hash2);
1803        assert_ne!(hash1, hash3);
1804        assert_eq!(hash1.len(), 64); // Blake3 produces 32-byte hashes = 64 hex chars
1805    }
1806
1807    #[test]
1808    fn test_platform_cachedir() {
1809        let cachedir = get_platform_cachedir();
1810        // Should work on any platform
1811        assert!(cachedir.is_some() || cfg!(target_os = "unknown"));
1812
1813        if let Some(dir) = cachedir {
1814            assert!(dir.to_string_lossy().contains("scirs2-datasets"));
1815        }
1816    }
1817
1818    #[test]
1819    fn test_cache_size_management() {
1820        let tempdir = TempDir::new().expect("Operation failed");
1821        let cache = DatasetCache::with_full_config(
1822            tempdir.path().to_path_buf(),
1823            10,
1824            3600,
1825            2048, // 2KB limit
1826            false,
1827        );
1828
1829        // Write multiple small files to approach the limit
1830        let small_data1 = vec![0u8; 400];
1831        cache
1832            .write_cached("small1.dat", &small_data1)
1833            .expect("Operation failed");
1834
1835        let small_data2 = vec![0u8; 400];
1836        cache
1837            .write_cached("small2.dat", &small_data2)
1838            .expect("Operation failed");
1839
1840        let small_data3 = vec![0u8; 400];
1841        cache
1842            .write_cached("small3.dat", &small_data3)
1843            .expect("Operation failed");
1844
1845        // Now write a file that should trigger cleanup
1846        let medium_data = vec![0u8; 800];
1847        cache
1848            .write_cached("medium.dat", &medium_data)
1849            .expect("Operation failed");
1850
1851        // The cache should have cleaned up to stay under the limit
1852        let stats = cache.get_detailed_stats().expect("Operation failed");
1853        assert!(stats.total_size_bytes <= cache.max_cache_size());
1854
1855        // The most recent file should still be cached
1856        assert!(cache.is_cached("medium.dat"));
1857    }
1858
1859    #[test]
1860    fn test_offline_mode() {
1861        let tempdir = TempDir::new().expect("Operation failed");
1862        let mut cache = DatasetCache::new(tempdir.path().to_path_buf());
1863
1864        assert!(!cache.is_offline());
1865        cache.set_offline_mode(true);
1866        assert!(cache.is_offline());
1867    }
1868
1869    #[test]
1870    fn test_detailed_stats() {
1871        let tempdir = TempDir::new().expect("Operation failed");
1872        let cache = DatasetCache::new(tempdir.path().to_path_buf());
1873
1874        let test_data = vec![1, 2, 3, 4, 5];
1875        cache
1876            .write_cached("test.dat", &test_data)
1877            .expect("Operation failed");
1878
1879        let stats = cache.get_detailed_stats().expect("Operation failed");
1880        assert_eq!(stats.file_count, 1);
1881        assert_eq!(stats.total_size_bytes, test_data.len() as u64);
1882        assert_eq!(stats.files.len(), 1);
1883        assert_eq!(stats.files[0].name, "test.dat");
1884        assert_eq!(stats.files[0].size_bytes, test_data.len() as u64);
1885    }
1886
1887    #[test]
1888    fn test_cache_manager() {
1889        let tempdir = TempDir::new().expect("Operation failed");
1890        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
1891
1892        let stats = manager.get_stats();
1893        assert_eq!(stats.file_count, 0);
1894        assert_eq!(stats.total_size_bytes, 0);
1895
1896        assert_eq!(manager.cachedir(), &tempdir.path().to_path_buf());
1897    }
1898
1899    #[test]
1900    fn test_format_bytes() {
1901        assert_eq!(format_bytes(512), "512 B");
1902        assert_eq!(format_bytes(1024), "1.0 KB");
1903        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
1904        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.0 GB");
1905    }
1906}