scirs2_datasets/cache.rs

//! Dataset caching functionality

use crate::error::{DatasetsError, Result};
use scirs2_core::cache::{CacheBuilder, TTLSizedCache};
use std::cell::RefCell;
use std::fs::{self, File};
use std::hash::{Hash, Hasher};
use std::io::{Read, Write};
use std::path::{Path, PathBuf};

/// The base directory name for caching datasets
const CACHE_DIR_NAME: &str = "scirs2-datasets";

/// Default cache size for in-memory caching
const DEFAULT_CACHE_SIZE: usize = 100;

/// Default TTL for in-memory cache (in seconds)
const DEFAULT_CACHE_TTL: u64 = 3600; // 1 hour

/// Default maximum cache size on disk (in bytes) - 500 MB
const DEFAULT_MAX_CACHE_SIZE: u64 = 500 * 1024 * 1024;

/// Cache directory environment variable
const CACHE_DIR_ENV: &str = "SCIRS2_CACHE_DIR";

/// Compute SHA256 hash of a file
fn sha256_hash_file(path: &Path) -> std::result::Result<String, String> {
    use sha2::{Digest, Sha256};

    let mut file = File::open(path).map_err(|e| format!("Failed to open file: {}", e))?;
    let mut hasher = Sha256::new();
    let mut buffer = [0; 8192];

    loop {
        let bytes_read = file
            .read(&mut buffer)
            .map_err(|e| format!("Failed to read file: {}", e))?;
        if bytes_read == 0 {
            break;
        }
        hasher.update(&buffer[..bytes_read]);
    }

    Ok(format!("{:x}", hasher.finalize()))
}

/// Registry entry for dataset files
pub struct RegistryEntry {
    /// SHA256 hash of the file
    pub sha256: &'static str,
    /// URL to download the file from
    pub url: &'static str,
}

/// Get the platform-specific cache directory for downloading and storing datasets
///
/// The cache directory is determined in the following order:
/// 1. Environment variable `SCIRS2_CACHE_DIR` if set
/// 2. Platform-specific cache directory:
///    - Windows: `%LOCALAPPDATA%\scirs2-datasets`
///    - macOS: `~/Library/Caches/scirs2-datasets`
///    - Linux/Unix: `~/.cache/scirs2-datasets` (respects XDG_CACHE_HOME)
/// 3. Fallback to `~/.scirs2-datasets` if platform-specific directory fails
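///
/// # Example
///
/// A minimal usage sketch (marked `ignore`: it touches the file system and
/// assumes this function is reachable at `scirs2_datasets::cache`):
///
/// ```ignore
/// use scirs2_datasets::cache::get_cache_dir;
///
/// // Resolves (and creates, if needed) the cache directory.
/// let dir = get_cache_dir().expect("cache directory should be resolvable");
/// println!("datasets are cached under {}", dir.display());
/// ```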
pub fn get_cache_dir() -> Result<PathBuf> {
    // Check environment variable first
    if let Ok(cache_dir) = std::env::var(CACHE_DIR_ENV) {
        let cache_path = PathBuf::from(cache_dir);
        ensure_directory_exists(&cache_path)?;
        return Ok(cache_path);
    }

    // Try platform-specific cache directory
    if let Some(cache_dir) = get_platform_cache_dir() {
        ensure_directory_exists(&cache_dir)?;
        return Ok(cache_dir);
    }

    // Fallback to home directory
    let home_dir = dirs::home_dir()
        .ok_or_else(|| DatasetsError::CacheError("Could not find home directory".to_string()))?;
    let cache_dir = home_dir.join(format!(".{}", CACHE_DIR_NAME));
    ensure_directory_exists(&cache_dir)?;

    Ok(cache_dir)
}

/// Get platform-specific cache directory
fn get_platform_cache_dir() -> Option<PathBuf> {
    #[cfg(target_os = "windows")]
    {
        dirs::data_local_dir().map(|dir| dir.join(CACHE_DIR_NAME))
    }
    #[cfg(target_os = "macos")]
    {
        dirs::home_dir().map(|dir| dir.join("Library").join("Caches").join(CACHE_DIR_NAME))
    }
    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
    {
        // Linux/Unix: Use XDG cache directory
        if let Ok(xdg_cache) = std::env::var("XDG_CACHE_HOME") {
            Some(PathBuf::from(xdg_cache).join(CACHE_DIR_NAME))
        } else {
            dirs::home_dir().map(|home| home.join(".cache").join(CACHE_DIR_NAME))
        }
    }
}

/// Ensure a directory exists, creating it if necessary
fn ensure_directory_exists(dir: &Path) -> Result<()> {
    if !dir.exists() {
        fs::create_dir_all(dir).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to create cache directory: {}", e))
        })?;
    }
    Ok(())
}

/// Fetch a dataset file from the cache, downloading it from its registry URL if necessary
///
/// This function will:
/// 1. Check if the file exists in the cache directory
/// 2. If not, download it from the URL in the registry entry
/// 3. Store it in the cache directory
/// 4. Return the path to the cached file
///
/// # Arguments
///
/// * `filename` - The name of the file to fetch
/// * `registry_entry` - Optional registry entry containing URL and SHA256 hash
///
/// # Returns
///
/// * `Ok(PathBuf)` - Path to the cached file
/// * `Err(String)` - Error message if fetching fails
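///
/// # Example
///
/// A minimal sketch (marked `ignore`: it would hit the network; the URL and
/// entry are placeholders, not a real registry):
///
/// ```ignore
/// use scirs2_datasets::cache::{fetch_data, RegistryEntry};
///
/// const ENTRY: RegistryEntry = RegistryEntry {
///     sha256: "", // an empty hash skips verification
///     url: "https://example.com/iris.csv",
/// };
///
/// let path = fetch_data("iris.csv", Some(&ENTRY)).expect("fetch should succeed");
/// println!("cached at {}", path.display());
/// ```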
pub fn fetch_data(
    filename: &str,
    registry_entry: Option<&RegistryEntry>,
) -> std::result::Result<PathBuf, String> {
    // Get the cache directory
    let cache_dir = match get_cache_dir() {
        Ok(dir) => dir,
        Err(e) => return Err(format!("Failed to get cache directory: {}", e)),
    };

    // Check if file exists in cache
    let cache_path = cache_dir.join(filename);
    if cache_path.exists() {
        return Ok(cache_path);
    }

    // If not in cache, fetch from the URL
    let entry = match registry_entry {
        Some(entry) => entry,
        None => return Err(format!("No registry entry found for {}", filename)),
    };

    // Create a temporary file to download to
    let temp_dir = tempfile::tempdir().map_err(|e| format!("Failed to create temp dir: {}", e))?;
    let temp_file = temp_dir.path().join(filename);

    // Download the file
    let response = ureq::get(entry.url)
        .call()
        .map_err(|e| format!("Failed to download {}: {}", filename, e))?;

    let mut reader = response.into_reader();
    let mut file = std::fs::File::create(&temp_file)
        .map_err(|e| format!("Failed to create temp file: {}", e))?;

    std::io::copy(&mut reader, &mut file).map_err(|e| format!("Failed to download file: {}", e))?;

    // Verify the SHA256 hash of the downloaded file if provided
    if !entry.sha256.is_empty() {
        let computed_hash = sha256_hash_file(&temp_file)?;
        if computed_hash != entry.sha256 {
            return Err(format!(
                "SHA256 hash mismatch for {}: expected {}, got {}",
                filename, entry.sha256, computed_hash
            ));
        }
    }

    // Copy the downloaded file into the cache
    fs::create_dir_all(&cache_dir).map_err(|e| format!("Failed to create cache dir: {}", e))?;
    if let Some(parent) = cache_path.parent() {
        fs::create_dir_all(parent).map_err(|e| format!("Failed to create cache dir: {}", e))?;
    }

    fs::copy(&temp_file, &cache_path).map_err(|e| format!("Failed to copy to cache: {}", e))?;

    Ok(cache_path)
}

/// File path wrapper for hashing
#[derive(Clone, Debug, Eq, PartialEq)]
struct FileCacheKey(String);

impl Hash for FileCacheKey {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0.hash(state);
    }
}

/// Manages caching of downloaded datasets, using both file-based and in-memory caching
///
/// This implementation uses `scirs2_core::cache`'s `TTLSizedCache` for in-memory caching,
/// while maintaining the file-based persistence for long-term storage.
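///
/// # Example
///
/// A minimal sketch of the two-level cache (marked `ignore`: it touches the
/// file system and assumes the type is reachable at `scirs2_datasets::cache`;
/// the directory and file name are placeholders):
///
/// ```ignore
/// use scirs2_datasets::cache::DatasetCache;
///
/// let cache = DatasetCache::new(std::env::temp_dir().join("scirs2-example"));
/// cache.write_cached("iris.csv", b"sepal_length,sepal_width").unwrap();
/// // Found in the in-memory cache without touching the disk again.
/// assert!(cache.is_cached("iris.csv"));
/// ```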
pub struct DatasetCache {
    /// Directory for file-based caching
    cache_dir: PathBuf,
    /// In-memory cache for frequently accessed datasets
    mem_cache: RefCell<TTLSizedCache<FileCacheKey, Vec<u8>>>,
    /// Maximum cache size in bytes (0 means unlimited)
    max_cache_size: u64,
    /// Whether to operate in offline mode (no downloads)
    offline_mode: bool,
}

impl Default for DatasetCache {
    fn default() -> Self {
        let cache_dir = get_cache_dir().expect("Could not get cache directory");

        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        // Check if offline mode is enabled via environment variable
        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cache_dir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }
}

impl DatasetCache {
    /// Create a new dataset cache with the given cache directory and default memory cache
    pub fn new(cache_dir: PathBuf) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(DEFAULT_CACHE_SIZE)
                .with_ttl(DEFAULT_CACHE_TTL)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cache_dir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with custom settings
    pub fn with_config(cache_dir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        let offline_mode = std::env::var("SCIRS2_OFFLINE")
            .map(|v| v.to_lowercase() == "true" || v == "1")
            .unwrap_or(false);

        DatasetCache {
            cache_dir,
            mem_cache,
            max_cache_size: DEFAULT_MAX_CACHE_SIZE,
            offline_mode,
        }
    }

    /// Create a new dataset cache with comprehensive configuration
    pub fn with_full_config(
        cache_dir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        let mem_cache = RefCell::new(
            CacheBuilder::new()
                .with_size(cache_size)
                .with_ttl(ttl_seconds)
                .build_sized_cache(),
        );

        DatasetCache {
            cache_dir,
            mem_cache,
            max_cache_size,
            offline_mode,
        }
    }

    /// Create the cache directory if it doesn't exist
    pub fn ensure_cache_dir(&self) -> Result<()> {
        if !self.cache_dir.exists() {
            fs::create_dir_all(&self.cache_dir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to create cache directory: {}", e))
            })?;
        }
        Ok(())
    }

    /// Get the path to a cached file
    pub fn get_cached_path(&self, name: &str) -> PathBuf {
        self.cache_dir.join(name)
    }

    /// Check if a file is already cached (either in memory or on disk)
    pub fn is_cached(&self, name: &str) -> bool {
        // Check memory cache first
        let key = FileCacheKey(name.to_string());
        if self.mem_cache.borrow_mut().get(&key).is_some() {
            return true;
        }

        // Then check file system
        self.get_cached_path(name).exists()
    }

    /// Read a cached file as bytes
    ///
    /// This method checks the in-memory cache first, and falls back to the file system if needed.
    /// When reading from the file system, the result is also stored in the in-memory cache.
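    ///
    /// # Example
    ///
    /// A minimal write-then-read round trip (marked `ignore`: it touches the
    /// file system; the cache name is a placeholder):
    ///
    /// ```ignore
    /// use scirs2_datasets::cache::DatasetCache;
    ///
    /// let cache = DatasetCache::default();
    /// cache.write_cached("example.bin", b"hello").unwrap();
    /// assert_eq!(cache.read_cached("example.bin").unwrap(), b"hello");
    /// ```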
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        // Try memory cache first
        let key = FileCacheKey(name.to_string());
        if let Some(data) = self.mem_cache.borrow_mut().get(&key) {
            return Ok(data);
        }

        // Fall back to file system cache
        let path = self.get_cached_path(name);
        if !path.exists() {
            return Err(DatasetsError::CacheError(format!(
                "Cached file does not exist: {}",
                name
            )));
        }

        let mut file = File::open(path)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to open cached file: {}", e)))?;

        let mut buffer = Vec::new();
        file.read_to_end(&mut buffer)
            .map_err(|e| DatasetsError::CacheError(format!("Failed to read cached file: {}", e)))?;

        // Update memory cache
        self.mem_cache.borrow_mut().insert(key, buffer.clone());

        Ok(buffer)
    }

    /// Write data to both the file cache and memory cache
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.ensure_cache_dir()?;

        // Check if writing this file would exceed cache size limit
        if self.max_cache_size > 0 {
            let current_size = self.get_cache_size_bytes()?;
            let new_file_size = data.len() as u64;

            if current_size + new_file_size > self.max_cache_size {
                self.cleanup_cache_to_fit(new_file_size)?;
            }
        }

        // Write to file system cache
        let path = self.get_cached_path(name);
        let mut file = File::create(path).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to create cache file: {}", e))
        })?;

        file.write_all(data).map_err(|e| {
            DatasetsError::CacheError(format!("Failed to write to cache file: {}", e))
        })?;

        // Update memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().insert(key, data.to_vec());

        Ok(())
    }

    /// Clear the entire cache (both memory and file-based)
    pub fn clear_cache(&self) -> Result<()> {
        // Clear file system cache
        if self.cache_dir.exists() {
            fs::remove_dir_all(&self.cache_dir)
                .map_err(|e| DatasetsError::CacheError(format!("Failed to clear cache: {}", e)))?;
        }

        // Clear memory cache
        self.mem_cache.borrow_mut().clear();

        Ok(())
    }

    /// Remove a specific cached file (from both memory and file system)
    pub fn remove_cached(&self, name: &str) -> Result<()> {
        // Remove from file system
        let path = self.get_cached_path(name);
        if path.exists() {
            fs::remove_file(path).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to remove cached file: {}", e))
            })?;
        }

        // Remove from memory cache
        let key = FileCacheKey(name.to_string());
        self.mem_cache.borrow_mut().remove(&key);

        Ok(())
    }

    /// Compute a hash for a filename or URL
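    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; the URL is a placeholder, and the
    /// type is assumed reachable at `scirs2_datasets::cache`):
    ///
    /// ```ignore
    /// use scirs2_datasets::cache::DatasetCache;
    ///
    /// // The same input always maps to the same filesystem-safe hex key.
    /// let key = DatasetCache::hash_filename("https://example.com/data.csv");
    /// assert_eq!(key, DatasetCache::hash_filename("https://example.com/data.csv"));
    /// ```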
    pub fn hash_filename(name: &str) -> String {
        let hash = blake3::hash(name.as_bytes());
        hash.to_hex().to_string()
    }

    /// Get the total size of the cache in bytes
    pub fn get_cache_size_bytes(&self) -> Result<u64> {
        let mut total_size = 0u64;

        if self.cache_dir.exists() {
            let entries = fs::read_dir(&self.cache_dir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {}", e))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {}", e))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        total_size += metadata.len();
                    }
                }
            }
        }

        Ok(total_size)
    }

    /// Clean up cache to fit a new file of specified size
    ///
    /// This method removes the oldest files first until there's enough space
    /// for the new file plus some buffer space.
    fn cleanup_cache_to_fit(&self, needed_size: u64) -> Result<()> {
        if self.max_cache_size == 0 {
            return Ok(()); // No size limit
        }

        let current_size = self.get_cache_size_bytes()?;
        let target_size = (self.max_cache_size as f64 * 0.8) as u64; // Leave 20% buffer
        let total_needed = current_size + needed_size;

        if total_needed <= target_size {
            return Ok(()); // No cleanup needed
        }

        let size_to_free = total_needed - target_size;

        // Get all files with their modification times
        let mut files_with_times = Vec::new();

        if self.cache_dir.exists() {
            let entries = fs::read_dir(&self.cache_dir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {}", e))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {}", e))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        if let Ok(modified) = metadata.modified() {
                            files_with_times.push((entry.path(), metadata.len(), modified));
                        }
                    }
                }
            }
        }

        // Sort by modification time (oldest first)
        files_with_times.sort_by_key(|(_, _, modified)| *modified);

        // Remove files until we've freed enough space
        let mut freed_size = 0u64;
        for (path, size, _) in files_with_times {
            if freed_size >= size_to_free {
                break;
            }

            // Remove from memory cache first
            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
                let key = FileCacheKey(filename.to_string());
                self.mem_cache.borrow_mut().remove(&key);
            }

            // Remove file
            if let Err(e) = fs::remove_file(&path) {
                eprintln!("Warning: Failed to remove cache file {:?}: {}", path, e);
            } else {
                freed_size += size;
            }
        }

        Ok(())
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.offline_mode = offline;
    }

    /// Check if cache is in offline mode
    pub fn is_offline(&self) -> bool {
        self.offline_mode
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.max_cache_size = max_size;
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.max_cache_size
    }

    /// Get detailed cache information
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        let mut total_size = 0u64;
        let mut file_count = 0usize;
        let mut files = Vec::new();

        if self.cache_dir.exists() {
            let entries = fs::read_dir(&self.cache_dir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {}", e))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {}", e))
                })?;

                if let Ok(metadata) = entry.metadata() {
                    if metadata.is_file() {
                        let size = metadata.len();
                        total_size += size;
                        file_count += 1;

                        if let Some(filename) = entry.file_name().to_str() {
                            files.push(CacheFileInfo {
                                name: filename.to_string(),
                                size_bytes: size,
                                modified: metadata.modified().ok(),
                            });
                        }
                    }
                }
            }
        }

        // Sort files by size (largest first)
        files.sort_by(|a, b| b.size_bytes.cmp(&a.size_bytes));

        Ok(DetailedCacheStats {
            total_size_bytes: total_size,
            file_count,
            cache_dir: self.cache_dir.clone(),
            max_cache_size: self.max_cache_size,
            offline_mode: self.offline_mode,
            files,
        })
    }
}

/// Downloads data from a URL and returns it as bytes, using the cache when possible
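///
/// # Example
///
/// A minimal sketch (marked `ignore`: requires the `download` feature and
/// network access, and assumes the function is reachable at
/// `scirs2_datasets::cache`; the URL is a placeholder):
///
/// ```ignore
/// use scirs2_datasets::cache::download_data;
///
/// // The first call downloads and caches; later calls are served from the cache.
/// let bytes = download_data("https://example.com/data.csv", false).unwrap();
/// println!("fetched {} bytes", bytes.len());
/// ```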
#[cfg(feature = "download")]
pub fn download_data(url: &str, force_download: bool) -> Result<Vec<u8>> {
    let cache = DatasetCache::default();
    let cache_key = DatasetCache::hash_filename(url);

    // Check if the data is already cached
    if !force_download && cache.is_cached(&cache_key) {
        return cache.read_cached(&cache_key);
    }

    // Download the data
    let response = reqwest::blocking::get(url).map_err(|e| {
        DatasetsError::DownloadError(format!("Failed to download from {}: {}", url, e))
    })?;

    if !response.status().is_success() {
        return Err(DatasetsError::DownloadError(format!(
            "Failed to download from {}: HTTP status {}",
            url,
            response.status()
        )));
    }

    let data = response.bytes().map_err(|e| {
        DatasetsError::DownloadError(format!("Failed to read response data: {}", e))
    })?;

    let data_vec = data.to_vec();

    // Cache the data
    cache.write_cached(&cache_key, &data_vec)?;

    Ok(data_vec)
}

// Stub for when the download feature is not enabled
#[cfg(not(feature = "download"))]
/// Downloads data from a URL or retrieves it from cache
///
/// This is a stub implementation when the download feature is not enabled.
/// It returns an error informing the user to enable the download feature.
///
/// # Arguments
///
/// * `_url` - The URL to download from
/// * `_force_download` - If true, force a new download instead of using cache
///
/// # Returns
///
/// * An error indicating that the download feature is not enabled
pub fn download_data(_url: &str, _force_download: bool) -> Result<Vec<u8>> {
    Err(DatasetsError::Other(
        "Download feature is not enabled. Recompile with --features download".to_string(),
    ))
}

/// Cache management utilities
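///
/// # Example
///
/// A minimal sketch (marked `ignore`: it touches the file system; the
/// directory, size, and TTL values are placeholders):
///
/// ```ignore
/// use scirs2_datasets::cache::CacheManager;
///
/// let manager = CacheManager::new(std::env::temp_dir().join("scirs2-example"), 10, 3600);
/// let stats = manager.get_stats();
/// println!("{} files, {} bytes", stats.file_count, stats.total_size_bytes);
/// ```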
#[derive(Default)]
pub struct CacheManager {
    cache: DatasetCache,
}

impl CacheManager {
    /// Create a new cache manager with custom settings
    pub fn new(cache_dir: PathBuf, cache_size: usize, ttl_seconds: u64) -> Self {
        Self {
            cache: DatasetCache::with_config(cache_dir, cache_size, ttl_seconds),
        }
    }

    /// Create a cache manager with comprehensive configuration
    pub fn with_full_config(
        cache_dir: PathBuf,
        cache_size: usize,
        ttl_seconds: u64,
        max_cache_size: u64,
        offline_mode: bool,
    ) -> Self {
        Self {
            cache: DatasetCache::with_full_config(
                cache_dir,
                cache_size,
                ttl_seconds,
                max_cache_size,
                offline_mode,
            ),
        }
    }

    /// Get basic cache statistics
    pub fn get_stats(&self) -> CacheStats {
        let cache_dir = &self.cache.cache_dir;
        let mut total_size = 0u64;
        let mut file_count = 0usize;

        if cache_dir.exists() {
            if let Ok(entries) = fs::read_dir(cache_dir) {
                for entry in entries.flatten() {
                    if let Ok(metadata) = entry.metadata() {
                        if metadata.is_file() {
                            total_size += metadata.len();
                            file_count += 1;
                        }
                    }
                }
            }
        }

        CacheStats {
            total_size_bytes: total_size,
            file_count,
            cache_dir: cache_dir.clone(),
        }
    }

    /// Get detailed cache statistics
    pub fn get_detailed_stats(&self) -> Result<DetailedCacheStats> {
        self.cache.get_detailed_stats()
    }

    /// Set offline mode
    pub fn set_offline_mode(&mut self, offline: bool) {
        self.cache.set_offline_mode(offline);
    }

    /// Check if in offline mode
    pub fn is_offline(&self) -> bool {
        self.cache.is_offline()
    }

    /// Set maximum cache size in bytes (0 for unlimited)
    pub fn set_max_cache_size(&mut self, max_size: u64) {
        self.cache.set_max_cache_size(max_size);
    }

    /// Get maximum cache size in bytes
    pub fn max_cache_size(&self) -> u64 {
        self.cache.max_cache_size()
    }

    /// Clear all cached data
    pub fn clear_all(&self) -> Result<()> {
        self.cache.clear_cache()
    }

    /// Remove specific cached file
    pub fn remove(&self, name: &str) -> Result<()> {
        self.cache.remove_cached(name)
    }

    /// Remove the oldest cached files so that `target_size` additional bytes can fit
    pub fn cleanup_old_files(&self, target_size: u64) -> Result<()> {
        self.cache.cleanup_cache_to_fit(target_size)
    }

    /// List all cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        let cache_dir = &self.cache.cache_dir;
        let mut files = Vec::new();

        if cache_dir.exists() {
            let entries = fs::read_dir(cache_dir).map_err(|e| {
                DatasetsError::CacheError(format!("Failed to read cache directory: {}", e))
            })?;

            for entry in entries {
                let entry = entry.map_err(|e| {
                    DatasetsError::CacheError(format!("Failed to read directory entry: {}", e))
                })?;

                if let Some(filename) = entry.file_name().to_str() {
                    files.push(filename.to_string());
                }
            }
        }

        files.sort();
        Ok(files)
    }

    /// Get cache directory path
    pub fn cache_dir(&self) -> &PathBuf {
        &self.cache.cache_dir
    }

    /// Check if a file is cached
    pub fn is_cached(&self, name: &str) -> bool {
        self.cache.is_cached(name)
    }

    /// Print detailed cache report
    pub fn print_cache_report(&self) -> Result<()> {
        let stats = self.get_detailed_stats()?;

        println!("=== Cache Report ===");
        println!("Cache Directory: {}", stats.cache_dir.display());
        println!(
            "Total Size: {} ({} files)",
            stats.formatted_size(),
            stats.file_count
        );
        println!("Max Size: {}", stats.formatted_max_size());

        if stats.max_cache_size > 0 {
            println!("Usage: {:.1}%", stats.usage_percentage() * 100.0);
        }

        println!(
            "Offline Mode: {}",
            if stats.offline_mode {
                "Enabled"
            } else {
                "Disabled"
            }
        );

        if !stats.files.is_empty() {
            println!("\nCached Files:");
            for file in &stats.files {
                println!(
                    "  {} - {} ({})",
                    file.name,
                    file.formatted_size(),
                    file.formatted_modified()
                );
            }
        }

        Ok(())
    }
}

/// Cache statistics
pub struct CacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cache_dir: PathBuf,
}

/// Detailed cache statistics with file-level information
pub struct DetailedCacheStats {
    /// Total size of all cached files in bytes
    pub total_size_bytes: u64,
    /// Number of cached files
    pub file_count: usize,
    /// Cache directory path
    pub cache_dir: PathBuf,
    /// Maximum cache size (0 = unlimited)
    pub max_cache_size: u64,
    /// Whether cache is in offline mode
    pub offline_mode: bool,
    /// Information about individual cached files
    pub files: Vec<CacheFileInfo>,
}

/// Information about a cached file
#[derive(Debug, Clone)]
pub struct CacheFileInfo {
    /// Name of the cached file
    pub name: String,
    /// Size in bytes
    pub size_bytes: u64,
    /// Last modified time
    pub modified: Option<std::time::SystemTime>,
}

impl CacheStats {
    /// Get total size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }
}

impl DetailedCacheStats {
    /// Get total size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.total_size_bytes)
    }

    /// Get max cache size formatted as human-readable string
    pub fn formatted_max_size(&self) -> String {
        if self.max_cache_size == 0 {
            "Unlimited".to_string()
        } else {
            format_bytes(self.max_cache_size)
        }
    }

    /// Get cache usage percentage (0.0-1.0)
    pub fn usage_percentage(&self) -> f64 {
        if self.max_cache_size == 0 {
            0.0
        } else {
            self.total_size_bytes as f64 / self.max_cache_size as f64
        }
    }
}

impl CacheFileInfo {
    /// Get file size formatted as human-readable string
    pub fn formatted_size(&self) -> String {
        format_bytes(self.size_bytes)
    }

    /// Get formatted modification time
    pub fn formatted_modified(&self) -> String {
        match &self.modified {
            Some(time) => {
                if let Ok(now) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)
                {
                    if let Ok(modified) = time.duration_since(std::time::UNIX_EPOCH) {
                        let diff_secs = now.as_secs().saturating_sub(modified.as_secs());
                        let days = diff_secs / 86400;
                        let hours = (diff_secs % 86400) / 3600;
                        let mins = (diff_secs % 3600) / 60;

                        if days > 0 {
                            format!("{} days ago", days)
                        } else if hours > 0 {
                            format!("{} hours ago", hours)
                        } else if mins > 0 {
                            format!("{} minutes ago", mins)
                        } else {
                            "Just now".to_string()
                        }
                    } else {
                        "Unknown".to_string()
                    }
                } else {
                    "Unknown".to_string()
                }
            }
            None => "Unknown".to_string(),
        }
    }
}

/// Format bytes as human-readable string
fn format_bytes(bytes: u64) -> String {
    let size = bytes as f64;
    if size < 1024.0 {
        format!("{} B", bytes)
    } else if size < 1024.0 * 1024.0 {
        format!("{:.1} KB", size / 1024.0)
    } else if size < 1024.0 * 1024.0 * 1024.0 {
        format!("{:.1} MB", size / (1024.0 * 1024.0))
    } else {
        format!("{:.1} GB", size / (1024.0 * 1024.0 * 1024.0))
    }
}

/// Batch operation result containing success/failure information
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// Number of successful operations
    pub success_count: usize,
    /// Number of failed operations
    pub failure_count: usize,
    /// List of failed items with error messages
    pub failures: Vec<(String, String)>,
    /// Total bytes processed
    pub total_bytes: u64,
    /// Total time taken for the batch operation
    pub elapsed_time: std::time::Duration,
}

impl BatchResult {
    /// Create a new empty batch result
    pub fn new() -> Self {
        Self {
            success_count: 0,
            failure_count: 0,
            failures: Vec::new(),
            total_bytes: 0,
            elapsed_time: std::time::Duration::ZERO,
        }
    }

    /// Check if all operations were successful
    pub fn is_all_success(&self) -> bool {
        self.failure_count == 0
    }

    /// Get success rate as percentage
    pub fn success_rate(&self) -> f64 {
        let total = self.success_count + self.failure_count;
        if total == 0 {
            0.0
        } else {
            (self.success_count as f64 / total as f64) * 100.0
        }
    }

    /// Get formatted summary
    pub fn summary(&self) -> String {
        format!(
            "Batch completed: {}/{} successful ({:.1}%), {} processed in {:.2}s",
            self.success_count,
            self.success_count + self.failure_count,
            self.success_rate(),
            format_bytes(self.total_bytes),
            self.elapsed_time.as_secs_f64()
        )
    }
}

impl Default for BatchResult {
    fn default() -> Self {
        Self::new()
    }
}

/// Batch operations manager for dataset caching
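///
/// # Example
///
/// A minimal configuration sketch (marked `ignore`: it touches the file
/// system; directory and retry values are placeholders):
///
/// ```ignore
/// use std::time::Duration;
/// use scirs2_datasets::cache::{BatchOperations, CacheManager};
///
/// let manager = CacheManager::new(std::env::temp_dir().join("scirs2-example"), 10, 3600);
/// let batch = BatchOperations::new(manager)
///     .with_parallel(false)
///     .with_retry_config(2, Duration::from_millis(500));
/// ```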
pub struct BatchOperations {
    cache: CacheManager,
    parallel: bool,
    max_retries: usize,
    retry_delay: std::time::Duration,
}

impl BatchOperations {
    /// Create a new batch operations manager
    pub fn new(cache: CacheManager) -> Self {
        Self {
            cache,
            parallel: true,
            max_retries: 3,
            retry_delay: std::time::Duration::from_millis(1000),
        }
    }

    /// Configure parallel processing
    pub fn with_parallel(mut self, parallel: bool) -> Self {
        self.parallel = parallel;
        self
    }

    /// Configure retry settings
    pub fn with_retry_config(
        mut self,
        max_retries: usize,
        retry_delay: std::time::Duration,
    ) -> Self {
        self.max_retries = max_retries;
        self.retry_delay = retry_delay;
        self
    }

    /// Download multiple datasets in batch
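    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`: requires the `download` feature and
    /// network access; URLs and names are placeholders, and `batch` is built
    /// as in the `BatchOperations` example):
    ///
    /// ```ignore
    /// let result = batch.batch_download(&[
    ///     ("https://example.com/a.csv", "a.csv"),
    ///     ("https://example.com/b.csv", "b.csv"),
    /// ]);
    /// println!("{}", result.summary());
    /// ```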
    #[cfg(feature = "download")]
    pub fn batch_download(&self, urls_and_names: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        if self.parallel {
            self.batch_download_parallel(urls_and_names, &mut result)
        } else {
            self.batch_download_sequential(urls_and_names, &mut result)
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    #[cfg(feature = "download")]
    fn batch_download_parallel(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        use std::fs::File;
        use std::io::Write;
        use std::sync::{Arc, Mutex};
        use std::thread;

        // Ensure cache directory exists before spawning threads
        if let Err(e) = self.cache.cache.ensure_cache_dir() {
            result.failure_count += urls_and_names.len();
            for &(_, name) in urls_and_names {
                result
                    .failures
                    .push((name.to_string(), format!("Cache setup failed: {}", e)));
            }
            return;
        }

        let result_arc = Arc::new(Mutex::new(BatchResult::new()));
        let cache_dir = self.cache.cache.cache_dir.clone();
        let max_retries = self.max_retries;
        let retry_delay = self.retry_delay;

        let handles: Vec<_> = urls_and_names
            .iter()
            .map(|&(url, name)| {
                let result_clone = Arc::clone(&result_arc);
                let url = url.to_string();
                let name = name.to_string();
                let cache_dir = cache_dir.clone();

                thread::spawn(move || {
                    let mut success = false;
                    let mut last_error = String::new();
                    let mut downloaded_data = Vec::new();

                    for attempt in 0..=max_retries {
                        match download_data(&url, false) {
                            Ok(data) => {
                                // Write directly to filesystem (bypassing RefCell memory cache)
                                let path = cache_dir.join(&name);
                                match File::create(&path) {
                                    Ok(mut file) => match file.write_all(&data) {
                                        Ok(_) => {
                                            let mut r = result_clone.lock().unwrap();
                                            r.success_count += 1;
                                            r.total_bytes += data.len() as u64;
                                            downloaded_data = data;
                                            success = true;
                                            break;
                                        }
                                        Err(e) => {
                                            last_error =
                                                format!("Failed to write cache file: {}", e);
                                        }
                                    },
                                    Err(e) => {
                                        last_error = format!("Failed to create cache file: {}", e);
                                    }
                                }
                            }
                            Err(e) => {
                                last_error = format!("Download failed: {}", e);
                                if attempt < max_retries {
                                    thread::sleep(retry_delay);
                                }
                            }
                        }
                    }

                    if !success {
                        let mut r = result_clone.lock().unwrap();
                        r.failure_count += 1;
                        r.failures.push((name.clone(), last_error));
                    }

                    (name, success, downloaded_data)
                })
            })
            .collect();

        // Collect results and update memory cache for successful downloads
        let mut successful_downloads = Vec::new();
        for handle in handles {
            if let Ok((name, success, data)) = handle.join() {
                if success && !data.is_empty() {
                    successful_downloads.push((name, data));
                }
            }
        }

        // Merge the results from the arc back into the original result,
        // including total_bytes, which was accumulated inside the threads
        if let Ok(arc_result) = result_arc.lock() {
            result.success_count += arc_result.success_count;
            result.failure_count += arc_result.failure_count;
            result.total_bytes += arc_result.total_bytes;
            result.failures.extend(arc_result.failures.clone());
        }

        // Update memory cache after all threads complete
        for (name, data) in successful_downloads {
            let key = FileCacheKey(name);
            self.cache.cache.mem_cache.borrow_mut().insert(key, data);
        }
    }

    #[cfg(feature = "download")]
    fn batch_download_sequential(&self, urls_and_names: &[(&str, &str)], result: &mut BatchResult) {
        for &(url, name) in urls_and_names {
            let mut success = false;
            let mut last_error = String::new();

            for attempt in 0..=self.max_retries {
                match download_data(url, false) {
                    Ok(data) => match self.cache.cache.write_cached(name, &data) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += data.len() as u64;
                            success = true;
                            break;
                        }
                        Err(e) => {
                            last_error = format!("Cache write failed: {}", e);
                        }
                    },
                    Err(e) => {
                        last_error = format!("Download failed: {}", e);
                        if attempt < self.max_retries {
                            std::thread::sleep(self.retry_delay);
                        }
                    }
                }
            }

            if !success {
                result.failure_count += 1;
                result.failures.push((name.to_string(), last_error));
            }
        }
    }

    /// Verify integrity of multiple cached files
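    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; the filename and digest are
    /// placeholders, and `batch` is built as in the `BatchOperations` example):
    ///
    /// ```ignore
    /// let result = batch.batch_verify_integrity(&[
    ///     ("iris.csv", "e3b0c44298fc1c14..."), // expected SHA256 hex digest (truncated placeholder)
    /// ]);
    /// assert!(result.is_all_success());
    /// ```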
    pub fn batch_verify_integrity(&self, files_and_hashes: &[(&str, &str)]) -> BatchResult {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        for &(filename, expected_hash) in files_and_hashes {
            match self.cache.cache.get_cached_path(filename).exists() {
                true => match sha256_hash_file(&self.cache.cache.get_cached_path(filename)) {
                    Ok(actual_hash) => {
                        if actual_hash == expected_hash {
                            result.success_count += 1;
                            if let Ok(metadata) =
                                std::fs::metadata(self.cache.cache.get_cached_path(filename))
                            {
                                result.total_bytes += metadata.len();
                            }
                        } else {
                            result.failure_count += 1;
                            result.failures.push((
                                filename.to_string(),
                                format!(
                                    "Hash mismatch: expected {}, got {}",
                                    expected_hash, actual_hash
                                ),
                            ));
                        }
                    }
                    Err(e) => {
                        result.failure_count += 1;
                        result.failures.push((
                            filename.to_string(),
                            format!("Hash computation failed: {}", e),
                        ));
                    }
                },
                false => {
                    result.failure_count += 1;
                    result
                        .failures
                        .push((filename.to_string(), "File not found in cache".to_string()));
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    /// Clean up cache selectively based on patterns
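    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; `batch` is built as in the
    /// `BatchOperations` example, and the pattern/age values are placeholders):
    ///
    /// ```ignore
    /// // Remove cached CSV files older than 30 days.
    /// let removed = batch.selective_cleanup(&["*.csv"], Some(30)).unwrap();
    /// println!("removed {} files", removed.success_count);
    /// ```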
    pub fn selective_cleanup(
        &self,
        patterns: &[&str],
        max_age_days: Option<u32>,
    ) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;
        let now = std::time::SystemTime::now();

        for filename in cached_files {
            let should_remove = patterns.iter().any(|pattern| {
                filename.contains(pattern) || matches_glob_pattern(&filename, pattern)
            });

            if should_remove {
                let file_path = self.cache.cache.get_cached_path(&filename);

                // Check age if max_age_days is specified
                let remove_due_to_age = if let Some(max_age) = max_age_days {
                    if let Ok(metadata) = std::fs::metadata(&file_path) {
                        if let Ok(modified) = metadata.modified() {
                            if let Ok(age) = now.duration_since(modified) {
                                age.as_secs() > (max_age as u64 * 24 * 3600)
                            } else {
                                false
                            }
                        } else {
                            false
                        }
                    } else {
                        false
                    }
                } else {
                    true // Remove regardless of age if no age limit specified
                };

                if remove_due_to_age {
                    // Record the file size before removal; reading metadata
                    // afterwards would fail because the file no longer exists.
                    let file_size = std::fs::metadata(&file_path).map(|m| m.len()).unwrap_or(0);
                    match self.cache.remove(&filename) {
                        Ok(_) => {
                            result.success_count += 1;
                            result.total_bytes += file_size;
                        }
                        Err(e) => {
                            result.failure_count += 1;
                            result
                                .failures
                                .push((filename, format!("Removal failed: {}", e)));
                        }
                    }
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }

    /// Process multiple datasets with a given function
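    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; `batch` is built as in the
    /// `BatchOperations` example and the named files are assumed cached):
    ///
    /// ```ignore
    /// let names = vec!["a.csv".to_string(), "b.csv".to_string()];
    /// let result = batch.batch_process(&names, |name, data| {
    ///     if data.is_empty() {
    ///         Err(format!("{} is empty", name))
    ///     } else {
    ///         Ok(data.len())
    ///     }
    /// });
    /// assert!(result.is_all_success());
    /// ```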
    pub fn batch_process<F, T, E>(&self, names: &[String], processor: F) -> BatchResult
    where
        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
        E: std::fmt::Display,
        T: Send,
    {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        if self.parallel {
            self.batch_process_parallel(names, processor, &mut result)
        } else {
            self.batch_process_sequential(names, processor, &mut result)
        }

        result.elapsed_time = start_time.elapsed();
        result
    }

    fn batch_process_parallel<F, T, E>(
        &self,
        names: &[String],
        processor: F,
        result: &mut BatchResult,
    ) where
        F: Fn(&str, &[u8]) -> std::result::Result<T, E> + Sync + Send + 'static,
        E: std::fmt::Display,
        T: Send,
    {
        // For thread safety with the current cache implementation,
        // we need to read all data first, then process in parallel
        let mut data_pairs = Vec::new();

        // Sequential read phase
        for name in names {
            match self.cache.cache.read_cached(name) {
                Ok(data) => data_pairs.push((name.clone(), data)),
                Err(e) => {
                    result.failure_count += 1;
                    result
                        .failures
                        .push((name.clone(), format!("Cache read failed: {}", e)));
                }
            }
        }

        // Parallel processing phase
        if !data_pairs.is_empty() {
            use std::sync::{Arc, Mutex};
            use std::thread;

            let parallel_result = Arc::new(Mutex::new(BatchResult::new()));
            let processor = Arc::new(processor);

            let handles: Vec<_> = data_pairs
                .into_iter()
                .map(|(name, data)| {
                    let result_clone = Arc::clone(&parallel_result);
                    let processor_clone = Arc::clone(&processor);

                    thread::spawn(move || match processor_clone(&name, &data) {
                        Ok(_) => {
                            let mut r = result_clone.lock().unwrap();
                            r.success_count += 1;
                            r.total_bytes += data.len() as u64;
                        }
                        Err(e) => {
                            let mut r = result_clone.lock().unwrap();
                            r.failure_count += 1;
                            r.failures.push((name, format!("Processing failed: {}", e)));
                        }
                    })
                })
                .collect();

            for handle in handles {
                let _ = handle.join();
            }

            // Merge parallel results into main result
            let parallel_result = parallel_result.lock().unwrap();
            result.success_count += parallel_result.success_count;
            result.failure_count += parallel_result.failure_count;
            result.total_bytes += parallel_result.total_bytes;
            result.failures.extend(parallel_result.failures.clone());
        }
    }

    fn batch_process_sequential<F, T, E>(
        &self,
        names: &[String],
        processor: F,
        result: &mut BatchResult,
    ) where
        F: Fn(&str, &[u8]) -> std::result::Result<T, E>,
        E: std::fmt::Display,
    {
        for name in names {
            match self.cache.cache.read_cached(name) {
                Ok(data) => match processor(name, &data) {
                    Ok(_) => {
                        result.success_count += 1;
                        result.total_bytes += data.len() as u64;
                    }
                    Err(e) => {
                        result.failure_count += 1;
                        result
                            .failures
                            .push((name.clone(), format!("Processing failed: {}", e)));
                    }
                },
                Err(e) => {
                    result.failure_count += 1;
                    result
                        .failures
                        .push((name.clone(), format!("Cache read failed: {}", e)));
                }
            }
        }
    }

    /// Get access to the underlying cache manager
    pub fn cache_manager(&self) -> &CacheManager {
        &self.cache
    }

    /// Write data to cache
    pub fn write_cached(&self, name: &str, data: &[u8]) -> Result<()> {
        self.cache.cache.write_cached(name, data)
    }

    /// Read data from cache
    pub fn read_cached(&self, name: &str) -> Result<Vec<u8>> {
        self.cache.cache.read_cached(name)
    }

    /// List cached files
    pub fn list_cached_files(&self) -> Result<Vec<String>> {
        self.cache.list_cached_files()
    }

    /// Print cache report
    pub fn print_cache_report(&self) -> Result<()> {
        self.cache.print_cache_report()
    }

    /// Get statistics about cached datasets
    pub fn get_cache_statistics(&self) -> Result<BatchResult> {
        let start_time = std::time::Instant::now();
        let mut result = BatchResult::new();

        let cached_files = self.cache.list_cached_files()?;

        for filename in cached_files {
            let file_path = self.cache.cache.get_cached_path(&filename);
            match std::fs::metadata(&file_path) {
                Ok(metadata) => {
                    result.success_count += 1;
                    result.total_bytes += metadata.len();
                }
                Err(e) => {
                    result.failure_count += 1;
                    result
                        .failures
                        .push((filename, format!("Metadata read failed: {}", e)));
                }
            }
        }

        result.elapsed_time = start_time.elapsed();
        Ok(result)
    }
}

/// Simple glob pattern matching for filenames (supports at most a single `*` wildcard)
1494fn matches_glob_pattern(filename: &str, pattern: &str) -> bool {
1495    if pattern == "*" {
1496        return true;
1497    }
1498
1499    if pattern.contains('*') {
1500        let parts: Vec<&str> = pattern.split('*').collect();
1501        if parts.len() == 2 {
1502            let prefix = parts[0];
1503            let suffix = parts[1];
1504            return filename.starts_with(prefix) && filename.ends_with(suffix);
1505        }
1506    }
1507
1508    filename == pattern
1509}
1510
1511#[cfg(test)]
1512mod tests {
1513    use super::*;
1514    use tempfile::TempDir;
1515
1516    #[test]
1517    fn test_batch_result() {
1518        let mut result = BatchResult::new();
1519        assert_eq!(result.success_count, 0);
1520        assert_eq!(result.failure_count, 0);
1521        assert!(result.is_all_success());
1522        assert_eq!(result.success_rate(), 0.0);
1523
1524        result.success_count = 8;
1525        result.failure_count = 2;
1526        result.total_bytes = 1024;
1527
1528        assert!(!result.is_all_success());
1529        assert_eq!(result.success_rate(), 80.0);
1530        assert!(result.summary().contains("8/10 successful"));
1531        assert!(result.summary().contains("80.0%"));
1532    }
1533
1534    #[test]
1535    fn test_batch_operations_creation() {
1536        let temp_dir = TempDir::new().unwrap();
1537        let cache_manager = CacheManager::new(temp_dir.path().to_path_buf(), 10, 3600);
1538        let batch_ops = BatchOperations::new(cache_manager)
1539            .with_parallel(false)
1540            .with_retry_config(2, std::time::Duration::from_millis(500));
1541
1542        assert!(!batch_ops.parallel);
1543        assert_eq!(batch_ops.max_retries, 2);
1544    }

    #[test]
    fn test_selective_cleanup() {
        let temp_dir = TempDir::new().unwrap();
        let cache_manager = CacheManager::new(temp_dir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Create some test files, writing through the manager's inner
        // `DatasetCache` (hence the double `.cache.cache`).
        let test_data = vec![0u8; 100];
        batch_ops
            .cache
            .cache
            .write_cached("test1.csv", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("test2.csv", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("data.json", &test_data)
            .unwrap();

        // Clean up files matching the pattern
        let result = batch_ops.selective_cleanup(&["*.csv"], None).unwrap();

        assert_eq!(result.success_count, 2); // Should remove test1.csv and test2.csv
        assert!(!batch_ops.cache.is_cached("test1.csv"));
        assert!(!batch_ops.cache.is_cached("test2.csv"));
        assert!(batch_ops.cache.is_cached("data.json")); // Should remain
    }

    #[test]
    fn test_batch_process() {
        let temp_dir = TempDir::new().unwrap();
        let cache_manager = CacheManager::new(temp_dir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager).with_parallel(false);

        // Create test files
        let test_data1 = vec![1u8; 100];
        let test_data2 = vec![2u8; 200];
        batch_ops
            .cache
            .cache
            .write_cached("file1.dat", &test_data1)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("file2.dat", &test_data2)
            .unwrap();

        let files = vec!["file1.dat".to_string(), "file2.dat".to_string()];

        // Process files (verify they're non-empty)
        let result = batch_ops.batch_process(&files, |_name, data| {
            if data.is_empty() {
                Err("Empty file")
            } else {
                Ok(data.len())
            }
        });

        assert_eq!(result.success_count, 2);
        assert_eq!(result.failure_count, 0);
        assert_eq!(result.total_bytes, 300); // 100 + 200
    }

    #[test]
    fn test_get_cache_statistics() {
        let temp_dir = TempDir::new().unwrap();
        let cache_manager = CacheManager::new(temp_dir.path().to_path_buf(), 10, 3600);
        let batch_ops = BatchOperations::new(cache_manager);

        // Start with empty cache
        let result = batch_ops.get_cache_statistics().unwrap();
        assert_eq!(result.success_count, 0);

        // Add some files
        let test_data = vec![0u8; 500];
        batch_ops
            .cache
            .cache
            .write_cached("test1.dat", &test_data)
            .unwrap();
        batch_ops
            .cache
            .cache
            .write_cached("test2.dat", &test_data)
            .unwrap();

        let result = batch_ops.get_cache_statistics().unwrap();
        assert_eq!(result.success_count, 2);
        assert_eq!(result.total_bytes, 1000);
    }

    #[test]
    fn test_matches_glob_pattern() {
        assert!(matches_glob_pattern("test.csv", "*"));
        assert!(matches_glob_pattern("test.csv", "*.csv"));
        assert!(matches_glob_pattern("test.csv", "test.*"));
        assert!(matches_glob_pattern("test.csv", "test.csv"));

        assert!(!matches_glob_pattern("test.json", "*.csv"));
        assert!(!matches_glob_pattern("other.csv", "test.*"));
        // The prefix and suffix must not overlap in the candidate name.
        assert!(!matches_glob_pattern("a", "a*a"));
    }

    #[test]
    fn test_cache_manager_creation() {
        let temp_dir = TempDir::new().unwrap();
        let manager = CacheManager::new(temp_dir.path().to_path_buf(), 10, 3600);
        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
    }

    #[test]
    fn test_cache_stats_formatting() {
        let temp_dir = TempDir::new().unwrap();
        let stats = CacheStats {
            total_size_bytes: 1024,
            file_count: 1,
            cache_dir: temp_dir.path().to_path_buf(),
        };

        assert_eq!(stats.formatted_size(), "1.0 KB");

        let stats_large = CacheStats {
            total_size_bytes: 1024 * 1024 * 1024,
            file_count: 1,
            cache_dir: temp_dir.path().to_path_buf(),
        };

        assert_eq!(stats_large.formatted_size(), "1.0 GB");
    }

    #[test]
    fn test_hash_filename() {
        let hash1 = DatasetCache::hash_filename("test.csv");
        let hash2 = DatasetCache::hash_filename("test.csv");
        let hash3 = DatasetCache::hash_filename("different.csv");

        assert_eq!(hash1, hash2);
        assert_ne!(hash1, hash3);
        assert_eq!(hash1.len(), 64); // Blake3 produces 32-byte hashes = 64 hex chars
    }

    #[test]
    fn test_platform_cache_dir() {
        let cache_dir = get_platform_cache_dir();
        // Should resolve on any mainstream platform; the `unknown` target
        // escape hatch keeps this from failing on exotic builds.
        assert!(cache_dir.is_some() || cfg!(target_os = "unknown"));

        if let Some(dir) = cache_dir {
            assert!(dir.to_string_lossy().contains("scirs2-datasets"));
        }
    }

    #[test]
    fn test_cache_size_management() {
        let temp_dir = TempDir::new().unwrap();
        let cache = DatasetCache::with_full_config(
            temp_dir.path().to_path_buf(),
            10,
            3600,
            2048, // 2KB limit
            false,
        );

        // Write multiple small files to approach the limit
        let small_data1 = vec![0u8; 400];
        cache.write_cached("small1.dat", &small_data1).unwrap();

        let small_data2 = vec![0u8; 400];
        cache.write_cached("small2.dat", &small_data2).unwrap();

        let small_data3 = vec![0u8; 400];
        cache.write_cached("small3.dat", &small_data3).unwrap();

        // Now write a file that should trigger cleanup
        let medium_data = vec![0u8; 800];
        cache.write_cached("medium.dat", &medium_data).unwrap();

        // The cache should have cleaned up to stay under the limit
        let stats = cache.get_detailed_stats().unwrap();
        assert!(stats.total_size_bytes <= cache.max_cache_size());

        // The most recent file should still be cached
        assert!(cache.is_cached("medium.dat"));
    }

    #[test]
    fn test_offline_mode() {
        let temp_dir = TempDir::new().unwrap();
        let mut cache = DatasetCache::new(temp_dir.path().to_path_buf());

        assert!(!cache.is_offline());
        cache.set_offline_mode(true);
        assert!(cache.is_offline());
    }

    #[test]
    fn test_detailed_stats() {
        let temp_dir = TempDir::new().unwrap();
        let cache = DatasetCache::new(temp_dir.path().to_path_buf());

        let test_data = vec![1, 2, 3, 4, 5];
        cache.write_cached("test.dat", &test_data).unwrap();

        let stats = cache.get_detailed_stats().unwrap();
        assert_eq!(stats.file_count, 1);
        assert_eq!(stats.total_size_bytes, test_data.len() as u64);
        assert_eq!(stats.files.len(), 1);
        assert_eq!(stats.files[0].name, "test.dat");
        assert_eq!(stats.files[0].size_bytes, test_data.len() as u64);
    }

    #[test]
    fn test_cache_manager() {
        let temp_dir = TempDir::new().unwrap();
        let manager = CacheManager::new(temp_dir.path().to_path_buf(), 10, 3600);

        let stats = manager.get_stats();
        assert_eq!(stats.file_count, 0);
        assert_eq!(stats.total_size_bytes, 0);

        assert_eq!(manager.cache_dir(), &temp_dir.path().to_path_buf());
    }

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(512), "512 B");
        assert_eq!(format_bytes(1024), "1.0 KB");
        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.0 GB");
    }
}