scirs2_datasets/cache.rs

//! Dataset caching functionality

use crate::error::{DatasetsError, Result};
use scirs2_core::cache::{CacheBuilder, TTLSizedCache};
use std::cell::RefCell;
use std::fs::{self, File};
use std::hash::{Hash, Hasher};
use std::io::{Read, Write};
use std::path::{Path, PathBuf};

/// The base directory name for caching datasets
const CACHE_DIR_NAME: &str = "scirs2-datasets";

/// Default cache size for in-memory caching
const DEFAULT_CACHE_SIZE: usize = 100;

/// Default TTL for in-memory cache (in seconds)
const DEFAULT_CACHE_TTL: u64 = 3600; // 1 hour

/// Default maximum cache size on disk (in bytes) - 500 MB
const DEFAULT_MAX_CACHE_SIZE: u64 = 500 * 1024 * 1024;

/// Cache directory environment variable
const CACHE_DIR_ENV: &str = "SCIRS2_CACHE_DIR";

/// Compute SHA256 hash of a file
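///
/// # Examples
///
/// A minimal usage sketch; the `scirs2_datasets::cache` path is an assumption
/// here, so adjust it to however the crate actually exposes this function:
///
/// ```ignore
/// use std::path::Path;
/// use scirs2_datasets::cache::sha256_hash_file;
///
/// let hash = sha256_hash_file(Path::new("data/iris.csv"))?;
/// assert_eq!(hash.len(), 64); // a SHA256 digest renders as 64 hex characters
/// ```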
#[allow(dead_code)]
pub fn sha256_hash_file(path: &Path) -> std::result::Result<String, String> {
    use sha2::{Digest, Sha256};

    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
    let mut hasher = Sha256::new();
    let mut buffer = [0; 8192];

    loop {
        let bytes_read = file
            .read(&mut buffer)
            .map_err(|e| format!("Failed to read file: {e}"))?;
        if bytes_read == 0 {
            break;
        }
        hasher.update(&buffer[..bytes_read]);
    }

    Ok(format!("{:x}", hasher.finalize()))
}

/// Registry entry for dataset files
pub struct RegistryEntry {
    /// SHA256 hash of the file
    pub sha256: &'static str,
    /// URL to download the file from
    pub url: &'static str,
}

/// Get the platform-specific cache directory for downloading and storing datasets
///
/// The cache directory is determined in the following order:
/// 1. Environment variable `SCIRS2_CACHE_DIR` if set
/// 2. Platform-specific cache directory:
///    - Windows: `%LOCALAPPDATA%\scirs2-datasets`
///    - macOS: `~/Library/Caches/scirs2-datasets`
///    - Linux/Unix: `~/.cache/scirs2-datasets` (respects `XDG_CACHE_HOME`)
/// 3. Fallback to `~/.scirs2-datasets` if the platform-specific directory fails
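///
/// # Examples
///
/// A sketch of overriding the cache location via the environment variable
/// (module path assumed):
///
/// ```ignore
/// std::env::set_var("SCIRS2_CACHE_DIR", "/tmp/my-scirs2-cache");
/// let dir = scirs2_datasets::cache::get_cachedir()?;
/// assert_eq!(dir, std::path::PathBuf::from("/tmp/my-scirs2-cache"));
/// ```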
#[allow(dead_code)]
pub fn get_cachedir() -> Result<PathBuf> {
    // Check environment variable first
    if let Ok(cachedir) = std::env::var(CACHE_DIR_ENV) {
        let cachepath = PathBuf::from(cachedir);
        ensure_directory_exists(&cachepath)?;
        return Ok(cachepath);
    }

    // Try platform-specific cache directory
    if let Some(cachedir) = get_platform_cachedir() {
        ensure_directory_exists(&cachedir)?;
        return Ok(cachedir);
    }

    // Fallback to home directory
    let homedir = dirs::home_dir()
        .ok_or_else(|| DatasetsError::CacheError("Could not find home directory".to_string()))?;
    let cachedir = homedir.join(format!(".{CACHE_DIR_NAME}"));
    ensure_directory_exists(&cachedir)?;

    Ok(cachedir)
}

/// Get platform-specific cache directory
#[allow(dead_code)]
fn get_platform_cachedir() -> Option<PathBuf> {
    #[cfg(target_os = "windows")]
    {
        dirs::data_local_dir().map(|dir| dir.join(CACHE_DIR_NAME))
    }
    #[cfg(target_os = "macos")]
    {
        dirs::home_dir().map(|dir| dir.join("Library").join("Caches").join(CACHE_DIR_NAME))
    }
    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
    {
        // Linux/Unix: Use XDG cache directory
        if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") {
            Some(PathBuf::from(xdg_cache).join(CACHE_DIR_NAME))
        } else {
            dirs::home_dir().map(|home| home.join(".cache").join(CACHE_DIR_NAME))
        }
    }
}

/// Ensure a directory exists, creating it if necessary
#[allow(dead_code)]
fn ensure_directory_exists(dir: &Path) -> Result<()> {
    if !dir.exists() {
        fs::create_dir_all(dir).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
        })?;
    }
    Ok(())
}

/// Fetch a dataset file from the cache, downloading it from its URL if missing
///
/// This function will:
/// 1. Check if the file exists in the cache directory
/// 2. If not, download it from the URL in the registry entry
/// 3. Store it in the cache directory
/// 4. Return the path to the cached file
///
/// # Arguments
///
/// * `filename` - The name of the file to fetch
/// * `registry_entry` - Optional registry entry containing URL and SHA256 hash
///
/// # Returns
///
/// * `Ok(PathBuf)` - Path to the cached file
/// * `Err(String)` - Error message if fetching fails
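///
/// # Examples
///
/// A hedged sketch; the URL below is a placeholder, not a real registry entry:
///
/// ```ignore
/// use scirs2_datasets::cache::{fetch_data, RegistryEntry};
///
/// let entry = RegistryEntry {
///     sha256: "", // an empty hash skips verification
///     url: "https://example.com/datasets/iris.csv",
/// };
/// let path = fetch_data("iris.csv", Some(&entry))?;
/// assert!(path.exists());
/// ```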
#[allow(dead_code)]
pub fn fetch_data(
    filename: &str,
    registry_entry: Option<&RegistryEntry>,
) -> std::result::Result<PathBuf, String> {
    // Get the cache directory
    let cachedir = match get_cachedir() {
        Ok(dir) => dir,
        Err(e) => return Err(format!("Failed to get cache directory: {e}")),
    };

    // Check if file exists in cache
    let cachepath = cachedir.join(filename);
    if cachepath.exists() {
        return Ok(cachepath);
    }

    // If not in cache, fetch from the URL
    let entry = match registry_entry {
        Some(entry) => entry,
        None => return Err(format!("No registry entry found for {filename}")),
    };

    // Create a temporary file to download to
    let tempdir = tempfile::tempdir().map_err(|e| format!("Failed to create temp dir: {e}"))?;
    let temp_file = tempdir.path().join(filename);

    // Download the file
    let response = ureq::get(entry.url)
        .call()
        .map_err(|e| format!("Failed to download {filename}: {e}"))?;

    let mut reader = response.into_reader();
    let mut file = std::fs::File::create(&temp_file)
        .map_err(|e| format!("Failed to create temp file: {e}"))?;

    std::io::copy(&mut reader, &mut file).map_err(|e| format!("Failed to download file: {e}"))?;

    // Verify the SHA256 hash of the downloaded file if provided
    if !entry.sha256.is_empty() {
        let computed_hash = sha256_hash_file(&temp_file)?;
        if computed_hash != entry.sha256 {
            return Err(format!(
                "SHA256 hash mismatch for {filename}: expected {}, got {computed_hash}",
                entry.sha256
            ));
        }
    }

    // Move the file to the cache
    fs::create_dir_all(&cachedir).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    if let Some(parent) = cachepath.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("Failed to create cache dir: {e}"))?;
    }

    fs::copy(&temp_file, &cachepath).map_err(|e| format!("Failed to copy to cache: {e}"))?;

    Ok(cachepath)
}

/// Cache key for dataset caching with configuration-aware hashing
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct CacheKey {
    name: String,
    config_hash: String,
}

impl CacheKey {
    /// Create a new cache key from dataset name and configuration
    pub fn new(name: &str, config: &crate::real_world::RealWorldConfig) -> Self {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        config.use_cache.hash(&mut hasher);
        config.download_if_missing.hash(&mut hasher);
        config.return_preprocessed.hash(&mut hasher);
        config.subset.hash(&mut hasher);
        config.random_state.hash(&mut hasher);

        Self {
            name: name.to_string(),
            config_hash: format!("{:x}", hasher.finish()),
        }
    }

    /// Get the cache key as a string
    pub fn as_string(&self) -> String {
        format!("{}_{}", self.name, self.config_hash)
    }
}

/// File path wrapper for hashing
#[derive(Clone, Debug, Eq, PartialEq)]
struct FileCacheKey(String);

impl Hash for FileCacheKey {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0.hash(state);
    }
}

/// Manages caching of downloaded datasets, using both file-based and in-memory caching
///
/// This implementation uses the `TTLSizedCache` from `scirs2_core::cache` for in-memory
/// caching, while maintaining file-based persistence for long-term storage.
pub struct DatasetCache {
    /// Directory for file-based caching
    cachedir: PathBuf,
    /// In-memory cache for frequently accessed datasets
    mem_cache: RefCell<TTLSizedCache<FileCacheKey, Vec<u8>>>,
    /// Maximum cache size in bytes (0 means unlimited)
    max_cache_size: u64,
    /// Whether to operate in offline mode (no downloads)
    offline_mode: bool,
}

impl Default for DatasetCache {
    fn default() -> Self {
        let cachedir = get_cachedir().expect("Could not get cache directory");

        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        // Check if offline mode is enabled via environment variable
        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }
}

impl DatasetCache {
    /// Create a new dataset cache with the given cache directory and default memory cache
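    ///
    /// # Examples
    ///
    /// A sketch using a temporary directory (assumes the `tempfile` crate is available):
    ///
    /// ```ignore
    /// let tmp = tempfile::tempdir()?;
    /// let cache = DatasetCache::new(tmp.path().to_path_buf());
    /// assert!(!cache.is_cached("nonexistent.csv"));
    /// ```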
    pub fn new(cachedir: PathBuf) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with custom settings
    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with comprehensive configuration
    pub fn with_full_config(
        cachedir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        DatasetCache {
            cachedir,
            mem_cache,
            max_cache_size,
            offline_mode,
        }
    }

    /// Create the cache directory if it doesn't exist
    pub fn ensure_cachedir(&self) -> Result<()> {
        if !self.cachedir.exists() {
            fs::create_dir_all(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to create cache directory: {e}"))
            })?;
        }
        Ok(())
    }

    /// Get the path to a cached file
    pub fn get_cachedpath(&self, name: &str) -> PathBuf {
        self.cachedir.join(name)
    }

    /// Check if a file is already cached (either in memory or on disk)
    pub fn is_cached(&self, name: &str) -> bool {
        // Check memory cache first
        let key = FileCacheKey(name.to_string());
        if self.mem_cache.borrow_mut().get(&key).is_some() {
            return true;
        }

        // Then check file system
        self.get_cachedpath(name).exists()
    }

    /// Read a cached file as bytes
    ///
    /// This method checks the in-memory cache first, and falls back to the file system if needed.
    /// When reading from the file system, the result is also stored in the in-memory cache.
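    ///
    /// # Examples
    ///
    /// A write/read round-trip sketch against a temporary cache directory
    /// (`tempfile` usage assumed):
    ///
    /// ```ignore
    /// let tmp = tempfile::tempdir()?;
    /// let cache = DatasetCache::new(tmp.path().to_path_buf());
    /// cache.write_cached("demo.bin", b"hello")?;
    /// assert_eq!(cache.read_cached("demo.bin")?, b"hello".to_vec());
    /// ```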
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        // Try memory cache first
        let key = FileCacheKey(name.to_string());
        if let Some(data) = self.mem_cache.borrow_mut().get(&key) {
            return Ok(data);
        }

        // Fall back to file system cache
        let path = self.get_cachedpath(name);
        if !path.exists() {
            return Err(DatasetsError::CacheError(format!(
                "Cached file does not exist: {name}"
            )));
        }

        let mut file = File::open(path)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to open cached file: {e}")))?;

        let mut buffer = Vec::new();
        file.read_to_end(&mut buffer)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to read cached file: {e}")))?;

        // Update memory cache
        self.mem_cache.borrow_mut().insert(key, buffer.clone());

        Ok(buffer)
    }

    /// Write data to both the file cache and memory cache
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.ensure_cachedir()?;

        // Check if writing this file would exceed the cache size limit
        if self.max_cache_size > 0 {
            let current_size = self.get_cache_size_bytes()?;
            let new_file_size = data.len() as u64;

            if current_size + new_file_size > self.max_cache_size {
                self.cleanup_cache_to_fit(new_file_size)?;
            }
        }

        // Write to file system cache
        let path = self.get_cachedpath(name);
        let mut file = File::create(path)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to create cache file: {e}")))?;

        file.write_all(data).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to write to cache file: {e}"))
        })?;

        // Update memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().insert(key, data.to_vec());

        Ok(())
    }

    /// Clear the entire cache (both memory and file-based)
    pub fn clear_cache(&self) -> Result<()> {
        // Clear file system cache
        if self.cachedir.exists() {
            fs::remove_dir_all(&self.cachedir)
                .map_err(|e| DatasetsError::CacheError(format!("Failed to clear cache: {e}")))?;
        }

        // Clear memory cache
        self.mem_cache.borrow_mut().clear();

        Ok(())
    }

    /// Remove a specific cached file (from both memory and file system)
    pub fn remove_cached(&self, name: &str) -> Result<()> {
        // Remove from file system
        let path = self.get_cachedpath(name);
        if path.exists() {
            fs::remove_file(path).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to remove cached file: {e}"))
            })?;
        }

        // Remove from memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().remove(&key);

        Ok(())
    }

    /// Compute a hash for a filename or URL
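    ///
    /// The hash is deterministic, so the same URL always maps to the same cache
    /// key. A sketch:
    ///
    /// ```ignore
    /// let a = DatasetCache::hash_filename("https://example.com/iris.csv");
    /// let b = DatasetCache::hash_filename("https://example.com/iris.csv");
    /// assert_eq!(a, b);
    /// assert_eq!(a.len(), 64); // a blake3 hex digest is 64 characters
    /// ```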
    pub fn hash_filename(name: &str) -> String {
        let hash = blake3::hash(name.as_bytes());
        hash.to_hex().to_string()
    }

    /// Get the total size of the cache in bytes
    pub fn get_cache_size_bytes(&self) -> Result<u64> {
        let mut total_size = 0u64;

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        total_size += metadata.len();
                    }
                }
            }
        }

        Ok(total_size)
    }

    /// Clean up the cache to fit a new file of the specified size
    ///
    /// This method removes the oldest files first until there's enough space
    /// for the new file plus some buffer space.
    fn cleanup_cache_to_fit(&self, needed_size: u64) -> Result<()> {
        if self.max_cache_size == 0 {
            return Ok(()); // No size limit
        }

        let current_size = self.get_cache_size_bytes()?;
        let target_size = (self.max_cache_size as f64 * 0.8) as u64; // Leave 20% buffer
        let total_needed = current_size + needed_size;

        if total_needed <= target_size {
            return Ok(()); // No cleanup needed
        }

        let size_to_free = total_needed - target_size;

        // Get all files with their modification times
        let mut files_with_times = Vec::new();

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        if let Ok(modified) = metadata.modified() {
                            files_with_times.push((entry.path(), metadata.len(), modified));
                        }
                    }
                }
            }
        }

        // Sort by modification time (oldest first)
        files_with_times.sort_by_key(|(_path, _size, modified)| *modified);

        // Remove files until we've freed enough space
        let mut freed_size = 0u64;
        for (path, size, _modified) in files_with_times {
            if freed_size >= size_to_free {
                break;
            }

            // Remove from memory cache first
            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
                let key = FileCacheKey(filename.to_string());
                self.mem_cache.borrow_mut().remove(&key);
            }

            // Remove file
            if let Err(e) = fs::remove_file(&path) {
                eprintln!("Warning: Failed to remove cache file {path:?}: {e}");
            } else {
                freed_size += size;
            }
        }

        Ok(())
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.offline_mode = offline;
    }

    /// Check if cache is in offline mode
    pub fn is_offline(&self) -> bool {
        self.offline_mode
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.max_cache_size = max_size;
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.max_cache_size
    }

    /// Put data into the cache (alias for write_cached)
    pub fn put(&self, name: &str, data: &[u8]) -> Result<()> {
        self.write_cached(name, data)
    }

    /// Get detailed cache information
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        let mut total_size = 0u64;
        let mut file_count = 0usize;
        let mut files = Vec::new();

        if self.cachedir.exists() {
            let entries = fs::read_dir(&self.cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        let size = metadata.len();
                        total_size += size;
                        file_count += 1;

                        if let Some(filename) = entry.file_name().to_str() {
                            files.push(CacheFileInfo {
                                name: filename.to_string(),
                                size_bytes: size,
                                modified: metadata.modified().ok(),
                            });
                        }
                    }
                }
            }
        }

        // Sort files by size (largest first)
        files.sort_by(|a, b| b.size_bytes.cmp(&a.size_bytes));

        Ok(DetailedCacheStats {
            total_size_bytes: total_size,
            file_count,
            cachedir: self.cachedir.clone(),
            max_cache_size: self.max_cache_size,
            offline_mode: self.offline_mode,
            files,
        })
    }
}

/// Downloads data from a URL and returns it as bytes, using the cache when possible
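///
/// # Examples
///
/// A sketch (requires the `download` feature; the URL is a placeholder):
///
/// ```ignore
/// let bytes = scirs2_datasets::cache::download_data(
///     "https://example.com/datasets/iris.csv",
///     false, // reuse a cached copy if one exists
/// )?;
/// println!("fetched {} bytes", bytes.len());
/// ```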
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn download_data(url: &str, force_download: bool) -> Result<Vec<u8>> {
    let cache = DatasetCache::default();
    let cache_key = DatasetCache::hash_filename(url);

    // Check if the data is already cached
    if !force_download && cache.is_cached(&cache_key) {
        return cache.read_cached(&cache_key);
    }

    // Download the data
    let response = reqwest::blocking::get(url).map_err(|e| {
        DatasetsError::DownloadError(format!("Failed to download from {url}: {e}"))
    })?;

    if !response.status().is_success() {
        return Err(DatasetsError::DownloadError(format!(
            "Failed to download from {url}: HTTP status {}",
            response.status()
        )));
    }

    let data = response
        .bytes()
        .map_err(|e| DatasetsError::DownloadError(format!("Failed to read response data: {e}")))?;

    let data_vec = data.to_vec();

    // Cache the data
    cache.write_cached(&cache_key, &data_vec)?;

    Ok(data_vec)
}

// Stub for when the download feature is not enabled
#[cfg(not(feature = "download"))]
/// Downloads data from a URL or retrieves it from cache
///
/// This is a stub implementation for when the download feature is not enabled.
/// It returns an error informing the user to enable the download feature.
///
/// # Arguments
///
/// * `_url` - The URL to download from
/// * `_force_download` - If true, force a new download instead of using cache
///
/// # Returns
///
/// * An error indicating that the download feature is not enabled
#[allow(dead_code)]
pub fn download_data(_url: &str, _force_download: bool) -> Result<Vec<u8>> {
    Err(DatasetsError::Other(
        "Download feature is not enabled. Recompile with --features download".to_string(),
    ))
}

/// Cache management utilities
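///
/// # Examples
///
/// A sketch of inspecting the default cache (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::cache::CacheManager;
///
/// let manager = CacheManager::new()?;
/// let stats = manager.get_stats();
/// println!("{} files, {} bytes", stats.file_count, stats.total_size_bytes);
/// ```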
pub struct CacheManager {
    cache: DatasetCache,
}

impl CacheManager {
    /// Create a new cache manager with default settings
    pub fn new() -> Result<Self> {
        let cachedir = get_cachedir()?;
        Ok(Self {
            cache: DatasetCache::with_config(cachedir, DEFAULT_CACHE_SIZE, DEFAULT_CACHE_TTL),
        })
    }

    /// Create a new cache manager with custom settings
    pub fn with_config(cachedir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        Self {
            cache: DatasetCache::with_config(cachedir, cache_size, ttl_seconds),
        }
    }

    /// Get a dataset from cache using CacheKey
    pub fn get(&self, key: &CacheKey) -> Result<Option<crate::utils::Dataset>> {
        let name = key.as_string();
        if self.cache.is_cached(&name) {
            match self.cache.read_cached(&name) {
                Ok(cached_data) => {
                    match serde_json::from_slice::<crate::utils::Dataset>(&cached_data) {
                        Ok(dataset) => Ok(Some(dataset)),
                        Err(e) => {
                            // If deserialization fails, treat the entry as invalid:
                            // drop the stale in-memory copy and report an error
                            self.cache
                                .mem_cache
                                .borrow_mut()
                                .remove(&FileCacheKey(name.clone()));
                            Err(DatasetsError::CacheError(format!(
                                "Failed to deserialize cached dataset: {e}"
                            )))
                        }
                    }
                }
                Err(e) => Err(DatasetsError::CacheError(format!(
                    "Failed to read cached data: {e}"
                ))),
            }
        } else {
            Ok(None)
        }
    }

    /// Put a dataset into cache using CacheKey
    pub fn put(&self, key: &CacheKey, dataset: &crate::utils::Dataset) -> Result<()> {
        let name = key.as_string();

        // Serialize the dataset to JSON bytes for caching
        let serialized = serde_json::to_vec(dataset)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to serialize dataset: {e}")))?;

        // Write the serialized data to cache
        self.cache
            .write_cached(&name, &serialized)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to write to cache: {e}")))
    }

    /// Create a cache manager with comprehensive configuration
    pub fn with_full_config(
        cachedir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        Self {
            cache: DatasetCache::with_full_config(
                cachedir,
                cache_size,
                ttl_seconds,
                max_cache_size,
                offline_mode,
            ),
        }
    }

    /// Get basic cache statistics
    pub fn get_stats(&self) -> CacheStats {
        let cachedir = &self.cache.cachedir;
        let mut total_size = 0u64;
        let mut file_count = 0usize;

        if cachedir.exists() {
            if let Ok(entries) = fs::read_dir(cachedir) {
                for entry in entries.flatten() {
                    if let Ok(metadata) = entry.metadata() {
                        if metadata.is_file() {
                            total_size += metadata.len();
                            file_count += 1;
                        }
                    }
                }
            }
        }

        CacheStats {
            total_size_bytes: total_size,
            file_count,
            cachedir: cachedir.clone(),
        }
    }

    /// Get detailed cache statistics
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        self.cache.get_detailed_stats()
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.cache.set_offline_mode(offline);
    }

    /// Check if in offline mode
    pub fn is_offline(&self) -> bool {
        self.cache.is_offline()
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.cache.set_max_cache_size(max_size);
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.cache.max_cache_size()
    }

    /// Clear all cached data
    pub fn clear_all(&self) -> Result<()> {
        self.cache.clear_cache()
    }

    /// Remove specific cached file
    pub fn remove(&self, name: &str) -> Result<()> {
        self.cache.remove_cached(name)
    }

    /// Remove old files to free up space
    pub fn cleanup_old_files(&self, target_size: u64) -> Result<()> {
        self.cache.cleanup_cache_to_fit(target_size)
    }

    /// List all cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        let cachedir = &self.cache.cachedir;
        let mut files = Vec::new();

        if cachedir.exists() {
            let entries = fs::read_dir(cachedir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {e}"))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {e}"))
                })?;

                if let Some(filename) = entry.file_name().to_str() {
                    files.push(filename.to_string());
                }
            }
        }

        files.sort();
        Ok(files)
    }

    /// Get cache directory path
    pub fn cachedir(&self) -> &PathBuf {
        &self.cache.cachedir
    }

    /// Check if a file is cached
    pub fn is_cached(&self, name: &str) -> bool {
        self.cache.is_cached(name)
    }

    /// Print detailed cache report
    pub fn print_cache_report(&self) -> Result<()> {
        let stats = self.get_detailed_stats()?;

        println!("=== Cache Report ===");
        println!("Cache Directory: {}", stats.cachedir.display());
        println!(
            "Total Size: {} ({} files)",
            stats.formatted_size(),
            stats.file_count
        );
        println!("Max Size: {}", stats.formatted_max_size());

        if stats.max_cache_size > 0 {
            println!("Usage: {:.1}%", stats.usage_percentage() * 100.0);
        }

        println!(
            "Offline Mode: {}",
            if stats.offline_mode {
                "Enabled"
            } else {
                "Disabled"
            }
        );

        if !stats.files.is_empty() {
            println!("\nCached Files:");
            for file in &stats.files {
                println!(
                    "  {} - {} ({})",
                    file.name,
                    file.formatted_size(),
                    file.formatted_modified()
                );
            }
        }

        Ok(())
    }
}

/// Cache statistics
pub struct CacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
}

/// Detailed cache statistics with file-level information
pub struct DetailedCacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cachedir: PathBuf,
    /// Maximum cache size (0 = unlimited)
    pub max_cache_size: u64,
    /// Whether cache is in offline mode
    pub offline_mode: bool,
    /// Information about individual cached files
    pub files: Vec<CacheFileInfo>,
}

/// Information about a cached file
#[derive(Debug, Clone)]
pub struct CacheFileInfo {
    /// Name of the cached file
    pub name: String,
    /// Size in bytes
    pub size_bytes: u64,
    /// Last modified time
    pub modified: Option<std::time::SystemTime>,
}

impl CacheStats {
    /// Get total size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }
}

impl DetailedCacheStats {
    /// Get total size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }

    /// Get max cache size formatted as human-readable string
    pub fn formatted_max_size(&self) -> String {
        if self.max_cache_size == 0 {
            "Unlimited".to_string()
        } else {
            format_bytes(self.max_cache_size)
        }
    }

    /// Get cache usage percentage (0.0-1.0)
    pub fn usage_percentage(&self) -> f64 {
        if self.max_cache_size == 0 {
            0.0
        } else {
            self.total_size_bytes as f64 / self.max_cache_size as f64
        }
    }
}

impl CacheFileInfo {
    /// Get file size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.size_bytes)
    }

    /// Get formatted modification time
    pub fn formatted_modified(&self) -> String {
        match &self.modified {
            Some(time) => {
                if let Ok(now) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)
                {
                    if let Ok(modified) = time.duration_since(std::time::UNIX_EPOCH) {
                        let diff_secs = now.as_secs().saturating_sub(modified.as_secs());
                        let days = diff_secs / 86400;
                        let hours = (diff_secs % 86400) / 3600;
                        let mins = (diff_secs % 3600) / 60;

                        if days > 0 {
                            format!("{days} days ago")
                        } else if hours > 0 {
                            format!("{hours} hours ago")
                        } else if mins > 0 {
                            format!("{mins} minutes ago")
                        } else {
                            "Just now".to_string()
                        }
                    } else {
                        "Unknown".to_string()
                    }
                } else {
                    "Unknown".to_string()
                }
            }
            None => "Unknown".to_string(),
        }
    }
}

/// Format bytes as human-readable string
#[allow(dead_code)]
fn format_bytes(bytes: u64) -> String {
    let size = bytes as f64;
    if size < 1024.0 {
        format!("{size} B")
    } else if size < 1024.0 * 1024.0 {
        format!("{:.1} KB", size / 1024.0)
    } else if size < 1024.0 * 1024.0 * 1024.0 {
        format!("{:.1} MB", size / (1024.0 * 1024.0))
    } else {
        format!("{:.1} GB", size / (1024.0 * 1024.0 * 1024.0))
    }
}

/// Batch operation result containing success/failure information
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// Number of successful operations
    pub success_count: usize,
    /// Number of failed operations
    pub failure_count: usize,
    /// List of failed items with error messages
    pub failures: Vec<(String, String)>,
    /// Total bytes processed
    pub total_bytes: u64,
    /// Total time taken for the batch operation
    pub elapsed_time: std::time::Duration,
}

impl BatchResult {
    /// Create a new empty batch result
    pub fn new() -> Self {
        Self {
            success_count: 0,
            failure_count: 0,
            failures: Vec::new(),
            total_bytes: 0,
            elapsed_time: std::time::Duration::ZERO,
        }
    }

    /// Check if all operations were successful
    pub fn is_all_success(&self) -> bool {
        self.failure_count == 0
    }

    /// Get success rate as a percentage
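    ///
    /// # Examples
    ///
    /// ```ignore
    /// let mut result = BatchResult::new();
    /// result.success_count = 8;
    /// result.failure_count = 2;
    /// assert_eq!(result.success_rate(), 80.0);
    /// ```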
    pub fn success_rate(&self) -> f64 {
        let total = self.success_count + self.failure_count;
        if total == 0 {
            0.0
        } else {
            (self.success_count as f64 / total as f64) * 100.0
        }
    }

    /// Get a formatted summary
    pub fn summary(&self) -> String {
        format!(
            "Batch completed: {}/{} successful ({:.1}%), {} processed in {:.2}s",
            self.success_count,
            self.success_count + self.failure_count,
            self.success_rate(),
            format_bytes(self.total_bytes),
            self.elapsed_time.as_secs_f64()
        )
    }
}

impl Default for BatchResult {
    fn default() -> Self {
        Self::new()
    }
}

/// Batch operations manager for dataset caching
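///
/// # Examples
///
/// A builder-style configuration sketch (module path assumed):
///
/// ```ignore
/// use scirs2_datasets::cache::{BatchOperations, CacheManager};
///
/// let ops = BatchOperations::new(CacheManager::new()?)
///     .with_parallel(false)
///     .with_retry_config(2, std::time::Duration::from_millis(500));
/// ```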
pub struct BatchOperations {
    cache: CacheManager,
    parallel: bool,
    max_retries: usize,
    retry_delay: std::time::Duration,
}

impl BatchOperations {
    /// Create a new batch operations manager
    pub fn new(cache: CacheManager) -> Self {
        Self {
            cache,
            parallel: true,
            max_retries: 3,
            retry_delay: std::time::Duration::from_millis(1000),
        }
    }

    /// Configure parallel processing
    pub fn with_parallel(mut self, parallel: bool) -> Self {
        self.parallel = parallel;
        self
    }

    /// Configure retry settings
    pub fn with_retry_config(
        mut self,
        max_retries: usize,
        retry_delay: std::time::Duration,
    ) -> Self {
        self.max_retries = max_retries;
        self.retry_delay = retry_delay;
        self
    }

    /// Download multiple datasets in batch
    #[cfg(feature = "download")]
    pub fn batch_download(&self, urls_and_names: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        if self.parallel {
            self.batch_download_parallel(urls_and_names, &mut result)
        } else {
            self.batch_download_sequential(urls_and_names, &mut result)
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    #[cfg(feature = "download")]
    fn batch_download_parallel(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        use std::fs::File;
        use std::io::Write;
        use std::sync::{Arc, Mutex};
        use std::thread;

        // Ensure the cache directory exists before spawning threads
        if let Err(e) = self.cache.cache.ensure_cachedir() {
            result.failure_count += urls_and_names.len();
            for &(_, name) in urls_and_names {
                result
                    .failures
                    .push((name.to_string(), format!("Cache setup failed: {e}")));
            }
            return;
        }

        let result_arc = Arc::new(Mutex::new(BatchResult::new()));
        let cachedir = self.cache.cache.cachedir.clone();
        let max_retries = self.max_retries;
        let retry_delay = self.retry_delay;

        let handles: Vec<_> = urls_and_names
            .iter()
            .map(|&(url, name)| {
                let result_clone = Arc::clone(&result_arc);
                let url = url.to_string();
                let name = name.to_string();
                let cachedir = cachedir.clone();

                thread::spawn(move || {
                    let mut success = false;
                    let mut last_error = String::new();
                    let mut downloaded_data = Vec::new();

                    for attempt in 0..=max_retries {
                        match download_data(&url, false) {
                            Ok(data) => {
                                // Write directly to the filesystem (bypassing the RefCell memory cache)
                                let path = cachedir.join(&name);
                                match File::create(&path) {
                                    Ok(mut file) => match file.write_all(&data) {
                                        Ok(_) => {
                                            let mut r = result_clone.lock().unwrap();
                                            r.success_count += 1;
                                            r.total_bytes += data.len() as u64;
                                            downloaded_data = data;
                                            success = true;
                                            break;
                                        }
                                        Err(e) => {
                                            last_error = format!("Failed to write cache file: {e}");
                                        }
                                    },
                                    Err(e) => {
                                        last_error = format!("Failed to create cache file: {e}");
                                    }
                                }
                            }
                            Err(e) => {
                                last_error = format!("Download failed: {e}");
                                if attempt < max_retries {
                                    thread::sleep(retry_delay);
                                }
                            }
                        }
                    }

                    if !success {
                        let mut r = result_clone.lock().unwrap();
                        r.failure_count += 1;
                        r.failures.push((name.clone(), last_error));
                    }

                    (name, success, downloaded_data)
                })
            })
            .collect();

        // Collect results and remember successful downloads so the memory cache
        // can be updated from the main thread
        let mut successful_downloads = Vec::new();
        for handle in handles {
            if let Ok((name, success, data)) = handle.join() {
                if success && !data.is_empty() {
                    successful_downloads.push((name, data));
                }
            }
        }

        // Merge the results from the Arc back into the original result
        if let Ok(arc_result) = result_arc.lock() {
            result.success_count += arc_result.success_count;
            result.failure_count += arc_result.failure_count;
            result.failures.extend(arc_result.failures.clone());
        }

        // Update the memory cache after all threads complete
        for (name, data) in successful_downloads {
            let key = FileCacheKey(name);
            self.cache.cache.mem_cache.borrow_mut().insert(key, data);
        }
    }

    #[cfg(feature = "download")]
    fn batch_download_sequential(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        for &(url, name) in urls_and_names {
            let mut success = false;
            let mut last_error = String::new();

            for attempt in 0..=self.max_retries {
                match download_data(url, false) {
                    Ok(data) => match self.cache.cache.write_cached(name, &data) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += data.len() as u64;
                            success = true;
                            break;
                        }
                        Err(e) => {
                            last_error = format!("Cache write failed: {e}");
                        }
                    },
                    Err(e) => {
                        last_error = format!("Download failed: {e}");
                        if attempt < self.max_retries {
                            std::thread::sleep(self.retry_delay);
                        }
                    }
                }
            }

            if !success {
                result.failure_count += 1;
                result.failures.push((name.to_string(), last_error));
            }
        }
    }

    /// Verify integrity of multiple cached files
    pub fn batch_verify_integrity(&self, files_and_hashes: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        for &(filename, expected_hash) in files_and_hashes {
            let path = self.cache.cache.get_cachedpath(filename);
            if !path.exists() {
                result.failure_count += 1;
                result
                    .failures
                    .push((filename.to_string(), "File not found in cache".to_string()));
                continue;
            }

            match sha256_hash_file(&path) {
                Ok(actual_hash) => {
                    if actual_hash == expected_hash {
                        result.success_count += 1;
                        if let Ok(metadata) = std::fs::metadata(&path) {
                            result.total_bytes += metadata.len();
                        }
                    } else {
                        result.failure_count += 1;
                        result.failures.push((
                            filename.to_string(),
                            format!("Hash mismatch: expected {expected_hash}, got {actual_hash}"),
                        ));
                    }
                }
                Err(e) => {
                    result.failure_count += 1;
                    result.failures.push((
                        filename.to_string(),
                        format!("Hash computation failed: {e}"),
                    ));
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    /// Clean up the cache selectively based on filename patterns
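    ///
    /// # Examples
    ///
    /// Remove cached `.csv` files older than 30 days (a sketch; `ops` is an
    /// existing `BatchOperations` value):
    ///
    /// ```ignore
    /// let report = ops.selective_cleanup(&["*.csv"], Some(30))?;
    /// println!("{}", report.summary());
    /// ```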
    pub fn selective_cleanup(
        &self,
        patterns: &[&str],
        max_age_days: Option<u32>,
    ) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;
        let now = std::time::SystemTime::now();

        for filename in cached_files {
            let should_remove = patterns.iter().any(|pattern| {
                filename.contains(pattern) || matches_glob_pattern(&filename, pattern)
            });

            if should_remove {
                let filepath = self.cache.cache.get_cachedpath(&filename);

                // Check age if max_age_days is specified
                let remove_due_to_age = if let Some(max_age) = max_age_days {
                    if let Ok(metadata) = std::fs::metadata(&filepath) {
                        if let Ok(modified) = metadata.modified() {
                            if let Ok(age) = now.duration_since(modified) {
                                age.as_secs() > (max_age as u64 * 24 * 3600)
                            } else {
                                false
                            }
                        } else {
                            false
                        }
                    } else {
                        false
                    }
                } else {
                    true // Remove regardless of age if no age limit specified
                };

                if remove_due_to_age {
                    // Capture the file size before removal; afterwards the metadata is gone
                    let file_size = std::fs::metadata(&filepath).map(|m| m.len()).unwrap_or(0);
                    match self.cache.remove(&filename) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += file_size;
                        }
                        Err(e) => {
                            result.failure_count += 1;
                            result
                                .failures
                                .push((filename, format!("Removal failed: {e}")));
                        }
                    }
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }

    /// Process multiple datasets with a given function
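    ///
    /// # Examples
    ///
    /// Validate that every cached file is non-empty (a sketch; `ops` is an
    /// existing `BatchOperations` value):
    ///
    /// ```ignore
    /// let names = ops.list_cached_files()?;
    /// let report = ops.batch_process(&names, |name, data| {
    ///     if data.is_empty() {
    ///         Err(format!("{name} is empty"))
    ///     } else {
    ///         Ok(())
    ///     }
    /// });
    /// assert!(report.is_all_success());
    /// ```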
1409    pub fn batch_process<F, T, E>(&self, names: &[String], processor: F) -> BatchResult
1410    where
1411        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1412        E: std::fmt::Display,
1413        T: Send,
1414    {
1415        let start_time = std::time::Instant::now();
1416        let mut result = BatchResult::new();
1417
1418        if self.parallel {
1419            self.batch_process_parallel(names, processor, &mut result)
1420        } else {
1421            self.batch_process_sequential(names, processor, &mut result)
1422        }
1423
1424        result.elapsed_time = start_time.elapsed();
1425        result
1426    }
1427
1428    fn batch_process_parallel<F, T, E>(
1429        &self,
1430        names: &[String],
1431        processor: F,
1432        result: &mut BatchResult,
1433    ) where
1434        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
1435        E: std::fmt::Display,
1436        T: Send,
1437    {
1438        // For thread safety with the current cache implementation,
1439        // we need to read all data first, then process in parallel
1440        let mut data_pairs = Vec::new();
1441
1442        // Sequential read phase
1443        for name in names {
1444            match self.cache.cache.read_cached(name) {
1445                Ok(data) => data_pairs.push((name.clone(), data)),
1446                Err(e) => {
1447                    result.failure_count += 1;
1448                    result
1449                        .failures
1450                        .push((name.clone(), format!("Cache read failed: {e}")));
1451                }
1452            }
1453        }
1454
1455        // Parallel processing phase
1456        if !data_pairs.is_empty() {
1457            use std::sync::{Arc, Mutex};
1458            use std::thread;
1459
1460            let parallel_result = Arc::new(Mutex::new(BatchResult::new()));
1461            let processor = Arc::new(processor);
1462
1463            let handles: Vec<_> = data_pairs
1464                .into_iter()
1465                .map(|(name, data)| {
1466                    let result_clone = Arc::clone(&parallel_result);
1467                    let processor_clone = Arc::clone(&processor);
1468
1469                    thread::spawn(move || match processor_clone(&name, &data) {
1470                        Ok(_) => {
1471                            let mut r = result_clone.lock().unwrap();
1472                            r.success_count += 1;
1473                            r.total_bytes += data.len() as u64;
1474                        }
1475                        Err(e) => {
1476                            let mut r = result_clone.lock().unwrap();
1477                            r.failure_count += 1;
1478                            r.failures.push((name, format!("Processing failed: {e}")));
1479                        }
1480                    })
1481                })
1482                .collect();
1483
1484            for handle in handles {
1485                let _ = handle.join();
1486            }
1487
1488            // Merge parallel results into main result
1489            let parallel_result = parallel_result.lock().unwrap();
1490            result.success_count += parallel_result.success_count;
1491            result.failure_count += parallel_result.failure_count;
1492            result.total_bytes += parallel_result.total_bytes;
1493            result.failures.extend(parallel_result.failures.clone());
1494        }
1495    }
1496
1497    fn batch_process_sequential<F, T, E>(
1498        &self,
1499        names: &[String],
1500        processor: F,
1501        result: &mut BatchResult,
1502    ) where
1503        F: Fn(&str, &[u8]) -> std::result::Result<T, E>,
1504        E: std::fmt::Display,
1505    {
1506        for name in names {
1507            match self.cache.cache.read_cached(name) {
1508                Ok(data) => match processor(name, &data) {
1509                    Ok(_) => {
1510                        result.success_count += 1;
1511                        result.total_bytes += data.len() as u64;
1512                    }
1513                    Err(e) => {
1514                        result.failure_count += 1;
1515                        result
1516                            .failures
1517                            .push((name.clone(), format!("Processing failed: {e}")));
1518                    }
1519                },
1520                Err(e) => {
1521                    result.failure_count += 1;
1522                    result
1523                        .failures
1524                        .push((name.clone(), format!("Cache read failed: {e}")));
1525                }
1526            }
1527        }
1528    }
1529
1530    /// Get access to the underlying cache manager
1531    pub fn cache_manager(&self) -> &CacheManager {
1532        &self.cache
1533    }
1534
1535    /// Write data to cache
1536    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
1537        self.cache.cache.write_cached(name, data)
1538    }
1539
1540    /// Read data from cache
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        self.cache.cache.read_cached(name)
    }

    /// List cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        self.cache.list_cached_files()
    }

    /// Print cache report
    pub fn print_cache_report(&self) -> Result<()> {
        self.cache.print_cache_report()
    }

    /// Get statistics about cached datasets
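    ///
    /// The returned [`BatchResult`] reuses the batch bookkeeping fields:
    /// `success_count` counts files whose metadata could be read,
    /// `total_bytes` is their combined on-disk size, and `failures` lists
    /// files whose metadata lookup failed.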
    pub fn get_cache_statistics(&self) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;

        for filename in cached_files {
            let filepath = self.cache.cache.get_cachedpath(&filename);
            match std::fs::metadata(&filepath) {
                Ok(metadata) => {
                    result.success_count += 1;
                    result.total_bytes += metadata.len();
                }
                Err(e) => {
                    result.failure_count += 1;
                    result
                        .failures
                        .push((filename, format!("Metadata read failed: {e}")));
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }
}

/// Simple glob pattern matching for filenames
///
/// Supports three forms: a bare `*` (matches everything), a pattern with a
/// single `*` (prefix/suffix match), and a literal filename. Patterns with
/// more than one `*` are not expanded and fall back to an exact comparison.
#[allow(dead_code)]
fn matches_glob_pattern(filename: &str, pattern: &str) -> bool {
    if pattern == "*" {
        return true;
    }

    if pattern.contains('*') {
        let parts: Vec<&str> = pattern.split('*').collect();
        if parts.len() == 2 {
            let prefix = parts[0];
            let suffix = parts[1];
            // The filename must be long enough for the prefix and suffix to
            // match without overlapping (e.g. "tet" must not match "te*et").
            return filename.len() >= prefix.len() + suffix.len()
                && filename.starts_with(prefix)
                && filename.ends_with(suffix);
        }
    }

    filename == pattern
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    #[test]
    fn test_batch_result() {
        let mut result = BatchResult::new();
        assert_eq!(result.success_count, 0);
        assert_eq!(result.failure_count, 0);
        assert!(result.is_all_success());
        assert_eq!(result.success_rate(), 0.0);

        result.success_count = 8;
        result.failure_count = 2;
        result.total_bytes = 1024;

        assert!(!result.is_all_success());
        assert_eq!(result.success_rate(), 80.0);
        assert!(result.summary().contains("8/10 successful"));
        assert!(result.summary().contains("80.0%"));
    }

    #[test]
    fn test_batch_operations_creation() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager)
            .with_parallel(false)
            .with_retry_config(2, std::time::Duration::from_millis(500));

        assert!(!batch_ops.parallel);
        assert_eq!(batch_ops.max_retries, 2);
    }

    #[test]
    fn test_selective_cleanup() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Create some test files
        let test_data = vec![0u8; 100];
        batch_ops
            .cache
            .cache
            .write_cached("test1.csv", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("test2.csv", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("data.json", &test_data)
            .unwrap();

        // Clean up files matching pattern
        let result = batch_ops.selective_cleanup(&["*.csv"], None).unwrap();

        assert_eq!(result.success_count, 2); // Should remove test1.csv and test2.csv
        assert!(!batch_ops.cache.is_cached("test1.csv"));
        assert!(!batch_ops.cache.is_cached("test2.csv"));
        assert!(batch_ops.cache.is_cached("data.json")); // Should remain
    }

    #[test]
    fn test_batch_process() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager).with_parallel(false);

        // Create test files
        let test_data1 = vec![1u8; 100];
        let test_data2 = vec![2u8; 200];
        batch_ops
            .cache
            .cache
            .write_cached("file1.dat", &test_data1)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("file2.dat", &test_data2)
            .unwrap();

        let files = vec!["file1.dat".to_string(), "file2.dat".to_string()];

        // Process files (verify they're non-empty)
        let result = batch_ops.batch_process(&files, |_name, data| {
            if data.is_empty() {
                Err("Empty file")
            } else {
                Ok(data.len())
            }
        });

        assert_eq!(result.success_count, 2);
        assert_eq!(result.failure_count, 0);
        assert_eq!(result.total_bytes, 300); // 100 + 200
    }

    #[test]
    fn test_get_cache_statistics() {
        let tempdir = TempDir::new().unwrap();
        let cache_manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Start with empty cache
        let result = batch_ops.get_cache_statistics().unwrap();
        assert_eq!(result.success_count, 0);

        // Add some files
        let test_data = vec![0u8; 500];
        batch_ops
            .cache
            .cache
            .write_cached("test1.dat", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("test2.dat", &test_data)
            .unwrap();

        let result = batch_ops.get_cache_statistics().unwrap();
        assert_eq!(result.success_count, 2);
        assert_eq!(result.total_bytes, 1000);
    }

    #[test]
    fn test_matches_glob_pattern() {
        assert!(matches_glob_pattern("test.csv", "*"));
        assert!(matches_glob_pattern("test.csv", "*.csv"));
        assert!(matches_glob_pattern("test.csv", "test.*"));
        assert!(matches_glob_pattern("test.csv", "test.csv"));

        assert!(!matches_glob_pattern("test.json", "*.csv"));
        assert!(!matches_glob_pattern("other.csv", "test.*"));
    }

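    // Added check for the non-overlap length guard in `matches_glob_pattern`:
    // a filename shorter than the pattern's prefix and suffix combined must
    // not match, even when the prefix and suffix individually match.
    #[test]
    fn test_glob_pattern_no_overlap() {
        assert!(!matches_glob_pattern("tet", "te*et"));
        assert!(matches_glob_pattern("tenet", "te*et"));
    }
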
    #[test]
    fn test_cache_manager_creation() {
        let tempdir = TempDir::new().unwrap();
        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);
        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
    }

    #[test]
    fn test_cache_stats_formatting() {
        let tempdir = TempDir::new().unwrap();
        let stats = CacheStats {
            total_size_bytes: 1024,
            file_count: 1,
            cachedir: tempdir.path().to_path_buf(),
        };

        assert_eq!(stats.formatted_size(), "1.0 KB");

        let stats_large = CacheStats {
            total_size_bytes: 1024 * 1024 * 1024,
            file_count: 1,
            cachedir: tempdir.path().to_path_buf(),
        };

        assert_eq!(stats_large.formatted_size(), "1.0 GB");
    }

    #[test]
    fn test_hash_file_name() {
        let hash1 = DatasetCache::hash_filename("test.csv");
        let hash2 = DatasetCache::hash_filename("test.csv");
        let hash3 = DatasetCache::hash_filename("different.csv");

        assert_eq!(hash1, hash2);
        assert_ne!(hash1, hash3);
        assert_eq!(hash1.len(), 64); // Blake3 produces 32-byte hashes = 64 hex chars
    }

    #[test]
    fn test_platform_cachedir() {
        let cachedir = get_platform_cachedir();
        // Should return Some on every supported platform
        assert!(cachedir.is_some() || cfg!(target_os = "unknown"));

        if let Some(dir) = cachedir {
            assert!(dir.to_string_lossy().contains("scirs2-datasets"));
        }
    }

    #[test]
    fn test_cache_size_management() {
        let tempdir = TempDir::new().unwrap();
        let cache = DatasetCache::with_full_config(
            tempdir.path().to_path_buf(),
            10,
            3600,
            2048, // 2KB limit
            false,
        );

        // Write multiple small files to approach the limit
        let small_data1 = vec![0u8; 400];
        cache.write_cached("small1.dat", &small_data1).unwrap();

        let small_data2 = vec![0u8; 400];
        cache.write_cached("small2.dat", &small_data2).unwrap();

        let small_data3 = vec![0u8; 400];
        cache.write_cached("small3.dat", &small_data3).unwrap();

        // Now write a file that should trigger cleanup
        let medium_data = vec![0u8; 800];
        cache.write_cached("medium.dat", &medium_data).unwrap();

        // The cache should have cleaned up to stay under the limit
        let stats = cache.get_detailed_stats().unwrap();
        assert!(stats.total_size_bytes <= cache.max_cache_size());

        // The most recent file should still be cached
        assert!(cache.is_cached("medium.dat"));
    }

    #[test]
    fn test_offline_mode() {
        let tempdir = TempDir::new().unwrap();
        let mut cache = DatasetCache::new(tempdir.path().to_path_buf());

        assert!(!cache.is_offline());
        cache.set_offline_mode(true);
        assert!(cache.is_offline());
    }

    #[test]
    fn test_detailed_stats() {
        let tempdir = TempDir::new().unwrap();
        let cache = DatasetCache::new(tempdir.path().to_path_buf());

        let test_data = vec![1, 2, 3, 4, 5];
        cache.write_cached("test.dat", &test_data).unwrap();

        let stats = cache.get_detailed_stats().unwrap();
        assert_eq!(stats.file_count, 1);
        assert_eq!(stats.total_size_bytes, test_data.len() as u64);
        assert_eq!(stats.files.len(), 1);
        assert_eq!(stats.files[0].name, "test.dat");
        assert_eq!(stats.files[0].size_bytes, test_data.len() as u64);
    }

    #[test]
    fn test_cache_manager() {
        let tempdir = TempDir::new().unwrap();
        let manager = CacheManager::with_config(tempdir.path().to_path_buf(), 10, 3600);

        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
        assert_eq!(stats.total_size_bytes, 0);

        assert_eq!(manager.cachedir(), &tempdir.path().to_path_buf());
    }

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(512), "512 B");
        assert_eq!(format_bytes(1024), "1.0 KB");
        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.0 GB");
    }
}