Skip to main content

datasynth_core/
disk_guard.rs

1//! Disk space management and guardrails for preventing disk exhaustion.
2//!
3//! This module provides disk space tracking and enforcement across different platforms,
4//! with configurable minimum free space limits and pre-write checks.
5
6use std::path::{Path, PathBuf};
7use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
8use std::sync::Arc;
9
/// Point-in-time snapshot of disk usage plus the counters kept by a `DiskSpaceGuard`.
///
/// `Default` yields an all-zero snapshot with `hard_limit_exceeded == false`.
#[derive(Clone, Debug, Default)]
pub struct DiskStats {
    /// Total size of the monitored filesystem, in bytes.
    pub total_bytes: u64,
    /// Space currently available on the monitored filesystem, in bytes.
    pub available_bytes: u64,
    /// Space in use, in bytes (total minus available).
    pub used_bytes: u64,
    /// Value of the guard's operation counter, i.e. how many `check()` calls were counted.
    pub checks_performed: u64,
    /// How many checks found the free space below the soft limit.
    pub soft_limit_warnings: u64,
    /// `true` once any check has failed the hard limit.
    pub hard_limit_exceeded: bool,
    /// Running total of bytes reported as written during this session.
    pub estimated_bytes_written: u64,
}
28
/// Disk space guard configuration.
///
/// All sizes are expressed in megabytes. Setting `hard_limit_mb` to `0` disables
/// the guard's checks entirely.
#[derive(Debug, Clone)]
pub struct DiskSpaceGuardConfig {
    /// Minimum free space required in MB (hard limit); `0` disables checking.
    pub hard_limit_mb: usize,
    /// Warning threshold in MB (soft limit); falling below it only bumps a counter.
    pub soft_limit_mb: usize,
    /// Check interval: the disk is actually queried on every N-th `check()` call.
    pub check_interval: usize,
    /// Extra headroom in MB that is added on top of the hard limit.
    pub reserve_buffer_mb: usize,
    /// Path whose filesystem is monitored; when `None`, the current directory (`.`) is used.
    pub monitor_path: Option<PathBuf>,
}

impl Default for DiskSpaceGuardConfig {
    /// Defaults: 100 MB hard limit, 500 MB soft limit, a disk query every 500
    /// operations, a 50 MB reserve, and no explicit monitor path.
    fn default() -> Self {
        Self {
            hard_limit_mb: 100,    // Require at least 100 MB free
            soft_limit_mb: 500,    // Warn when below 500 MB
            check_interval: 500,   // Check every 500 operations
            reserve_buffer_mb: 50, // Keep 50 MB buffer
            monitor_path: None,
        }
    }
}

impl DiskSpaceGuardConfig {
    /// Create config with the specified minimum free space.
    ///
    /// The soft limit is derived as five times the hard limit; every other field
    /// keeps its default value.
    pub fn with_min_free_mb(hard_limit_mb: usize) -> Self {
        Self {
            hard_limit_mb,
            // Saturate rather than overflow: a wrapped product would silently turn a
            // huge hard limit into a tiny soft limit (or panic in debug builds).
            soft_limit_mb: hard_limit_mb.saturating_mul(5),
            ..Self::default()
        }
    }

    /// Set the path to monitor for disk space.
    pub fn with_path<P: AsRef<Path>>(mut self, path: P) -> Self {
        self.monitor_path = Some(path.as_ref().to_path_buf());
        self
    }

    /// Set the reserve buffer, in MB.
    pub fn with_reserve(mut self, reserve_mb: usize) -> Self {
        self.reserve_buffer_mb = reserve_mb;
        self
    }
}
78
/// Error describing a failed disk space check.
///
/// Its `Display` output is exactly the `message` field.
#[derive(Clone, Debug)]
pub struct DiskSpaceExhausted {
    /// Free space observed when the check ran, in MB.
    pub available_mb: usize,
    /// Free space the check required, in MB.
    pub required_mb: usize,
    /// Flags a soft-limit (warning) condition as opposed to a hard-limit failure.
    pub is_soft_limit: bool,
    /// Human-readable description of the failure.
    pub message: String,
}

impl std::fmt::Display for DiskSpaceExhausted {
    /// Writes the stored message verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)
    }
}

impl std::error::Error for DiskSpaceExhausted {}
95
96/// Thread-safe disk space guard for monitoring and enforcing disk limits.
97#[derive(Debug)]
98pub struct DiskSpaceGuard {
99    config: DiskSpaceGuardConfig,
100    operation_counter: AtomicU64,
101    soft_warnings_count: AtomicU64,
102    hard_limit_exceeded: AtomicBool,
103    bytes_written_estimate: AtomicU64,
104    last_available_mb: AtomicUsize,
105}
106
107impl DiskSpaceGuard {
108    /// Create a new disk space guard with the given configuration.
109    pub fn new(config: DiskSpaceGuardConfig) -> Self {
110        Self {
111            config,
112            operation_counter: AtomicU64::new(0),
113            soft_warnings_count: AtomicU64::new(0),
114            hard_limit_exceeded: AtomicBool::new(false),
115            bytes_written_estimate: AtomicU64::new(0),
116            last_available_mb: AtomicUsize::new(0),
117        }
118    }
119
120    /// Create a disk space guard with default configuration.
121    pub fn default_guard() -> Self {
122        Self::new(DiskSpaceGuardConfig::default())
123    }
124
125    /// Create a disk space guard with specified minimum free space.
126    pub fn with_min_free(min_free_mb: usize) -> Self {
127        Self::new(DiskSpaceGuardConfig::with_min_free_mb(min_free_mb))
128    }
129
130    /// Create an Arc-wrapped disk space guard for sharing across threads.
131    pub fn shared(config: DiskSpaceGuardConfig) -> Arc<Self> {
132        Arc::new(Self::new(config))
133    }
134
135    /// Check disk space limit (returns error if hard limit exceeded).
136    ///
137    /// This should be called periodically during file writes.
138    /// It's designed to be efficient - actual disk checks only happen
139    /// at the configured interval.
140    pub fn check(&self) -> Result<(), DiskSpaceExhausted> {
141        // Disabled if no limits set
142        if self.config.hard_limit_mb == 0 {
143            return Ok(());
144        }
145
146        let count = self.operation_counter.fetch_add(1, Ordering::Relaxed);
147
148        // Only check at intervals to minimize overhead
149        if !count.is_multiple_of(self.config.check_interval as u64) {
150            return Ok(());
151        }
152
153        self.check_now()
154    }
155
156    /// Force an immediate disk space check (bypasses interval).
157    pub fn check_now(&self) -> Result<(), DiskSpaceExhausted> {
158        if self.config.hard_limit_mb == 0 {
159            return Ok(());
160        }
161
162        let path = self
163            .config
164            .monitor_path
165            .as_deref()
166            .unwrap_or(Path::new("."));
167
168        let available_mb = get_available_space_mb(path).unwrap_or(usize::MAX);
169        self.last_available_mb
170            .store(available_mb, Ordering::Relaxed);
171
172        // Check hard limit (minimum free space required)
173        let required_mb = self.config.hard_limit_mb + self.config.reserve_buffer_mb;
174        if available_mb < required_mb {
175            self.hard_limit_exceeded.store(true, Ordering::Relaxed);
176            return Err(DiskSpaceExhausted {
177                available_mb,
178                required_mb,
179                is_soft_limit: false,
180                message: format!(
181                    "Disk space exhausted: only {available_mb} MB available, need at least {required_mb} MB. \
182                     Free up disk space or reduce output volume."
183                ),
184            });
185        }
186
187        // Check soft limit (warning only)
188        if available_mb < self.config.soft_limit_mb {
189            self.soft_warnings_count.fetch_add(1, Ordering::Relaxed);
190            // Soft limit exceeded - consumer should check stats for warning count
191        }
192
193        Ok(())
194    }
195
196    /// Pre-check if there's enough space for an estimated write.
197    pub fn check_before_write(&self, estimated_bytes: u64) -> Result<(), DiskSpaceExhausted> {
198        if self.config.hard_limit_mb == 0 {
199            return Ok(());
200        }
201
202        let path = self
203            .config
204            .monitor_path
205            .as_deref()
206            .unwrap_or(Path::new("."));
207
208        let available_mb = get_available_space_mb(path).unwrap_or(usize::MAX);
209        let estimated_mb = (estimated_bytes / (1024 * 1024)) as usize;
210        let required_mb = self.config.hard_limit_mb + self.config.reserve_buffer_mb + estimated_mb;
211
212        if available_mb < required_mb {
213            return Err(DiskSpaceExhausted {
214                available_mb,
215                required_mb,
216                is_soft_limit: false,
217                message: format!(
218                    "Insufficient disk space for write: {} MB available, need {} MB \
219                     (estimated write: {} MB, reserve: {} MB).",
220                    available_mb, required_mb, estimated_mb, self.config.reserve_buffer_mb
221                ),
222            });
223        }
224
225        Ok(())
226    }
227
228    /// Record bytes written (for estimation).
229    pub fn record_write(&self, bytes: u64) {
230        self.bytes_written_estimate
231            .fetch_add(bytes, Ordering::Relaxed);
232    }
233
234    /// Get current disk space statistics.
235    pub fn stats(&self) -> DiskStats {
236        let path = self
237            .config
238            .monitor_path
239            .as_deref()
240            .unwrap_or(Path::new("."));
241
242        let (total, available) = get_disk_space(path).unwrap_or((0, 0));
243
244        DiskStats {
245            total_bytes: total,
246            available_bytes: available,
247            used_bytes: total.saturating_sub(available),
248            checks_performed: self.operation_counter.load(Ordering::Relaxed),
249            soft_limit_warnings: self.soft_warnings_count.load(Ordering::Relaxed),
250            hard_limit_exceeded: self.hard_limit_exceeded.load(Ordering::Relaxed),
251            estimated_bytes_written: self.bytes_written_estimate.load(Ordering::Relaxed),
252        }
253    }
254
255    /// Get current available space in MB.
256    pub fn available_space_mb(&self) -> usize {
257        let path = self
258            .config
259            .monitor_path
260            .as_deref()
261            .unwrap_or(Path::new("."));
262        get_available_space_mb(path).unwrap_or(0)
263    }
264
265    /// Check if disk space tracking is available on this platform.
266    pub fn is_available() -> bool {
267        get_available_space_mb(Path::new(".")).is_some()
268    }
269
270    /// Reset statistics (for testing).
271    pub fn reset_stats(&self) {
272        self.operation_counter.store(0, Ordering::Relaxed);
273        self.soft_warnings_count.store(0, Ordering::Relaxed);
274        self.hard_limit_exceeded.store(false, Ordering::Relaxed);
275        self.bytes_written_estimate.store(0, Ordering::Relaxed);
276    }
277}
278
279impl Default for DiskSpaceGuard {
280    fn default() -> Self {
281        Self::default_guard()
282    }
283}
284
285/// Get available disk space in MB (Linux/macOS implementation using statvfs).
286#[cfg(unix)]
287#[allow(clippy::unnecessary_cast)] // Casts needed for cross-platform compatibility
288pub fn get_available_space_mb(path: &Path) -> Option<usize> {
289    use std::ffi::CString;
290    use std::os::unix::ffi::OsStrExt;
291
292    let path_cstr = CString::new(path.as_os_str().as_bytes()).ok()?;
293
294    #[repr(C)]
295    struct Statvfs {
296        f_bsize: libc::c_ulong,
297        f_frsize: libc::c_ulong,
298        f_blocks: libc::fsblkcnt_t,
299        f_bfree: libc::fsblkcnt_t,
300        f_bavail: libc::fsblkcnt_t,
301        // Remaining fields are not needed
302        _rest: [u8; 128],
303    }
304
305    let mut stat: Statvfs = unsafe { std::mem::zeroed() };
306
307    let result = unsafe { libc::statvfs(path_cstr.as_ptr(), &mut stat as *mut _ as *mut _) };
308
309    if result == 0 {
310        let block_size = stat.f_frsize as u64;
311        let available_blocks = stat.f_bavail as u64;
312        let available_bytes = available_blocks * block_size;
313        Some((available_bytes / (1024 * 1024)) as usize)
314    } else {
315        None
316    }
317}
318
319/// Get total and available disk space in bytes (Linux/macOS).
320#[cfg(unix)]
321#[allow(clippy::unnecessary_cast)] // Casts needed for cross-platform compatibility
322pub fn get_disk_space(path: &Path) -> Option<(u64, u64)> {
323    use std::ffi::CString;
324    use std::os::unix::ffi::OsStrExt;
325
326    let path_cstr = CString::new(path.as_os_str().as_bytes()).ok()?;
327
328    #[repr(C)]
329    struct Statvfs {
330        f_bsize: libc::c_ulong,
331        f_frsize: libc::c_ulong,
332        f_blocks: libc::fsblkcnt_t,
333        f_bfree: libc::fsblkcnt_t,
334        f_bavail: libc::fsblkcnt_t,
335        _rest: [u8; 128],
336    }
337
338    let mut stat: Statvfs = unsafe { std::mem::zeroed() };
339
340    let result = unsafe { libc::statvfs(path_cstr.as_ptr(), &mut stat as *mut _ as *mut _) };
341
342    if result == 0 {
343        let block_size = stat.f_frsize as u64;
344        let total = stat.f_blocks as u64 * block_size;
345        let available = stat.f_bavail as u64 * block_size;
346        Some((total, available))
347    } else {
348        None
349    }
350}
351
/// Get available disk space in MB (Windows implementation).
///
/// Returns `None` when the underlying `GetDiskFreeSpaceExW` call fails. The value
/// is the space available to the calling user, rounded down to whole megabytes.
#[cfg(target_os = "windows")]
pub fn get_available_space_mb(path: &Path) -> Option<usize> {
    let (_, available_bytes) = get_disk_space(path)?;
    // Saturate instead of truncating on targets where `usize` is narrower than `u64`.
    Some(usize::try_from(available_bytes / (1024 * 1024)).unwrap_or(usize::MAX))
}

/// Get total and available disk space in bytes (Windows).
///
/// Returns `Some((total_bytes, bytes_available_to_caller))`, or `None` when
/// `GetDiskFreeSpaceExW` reports failure.
#[cfg(target_os = "windows")]
pub fn get_disk_space(path: &Path) -> Option<(u64, u64)> {
    use std::os::windows::ffi::OsStrExt;

    #[link(name = "kernel32")]
    extern "system" {
        fn GetDiskFreeSpaceExW(
            lpDirectoryName: *const u16,
            lpFreeBytesAvailableToCaller: *mut u64,
            lpTotalNumberOfBytes: *mut u64,
            lpTotalNumberOfFreeBytes: *mut u64,
        ) -> i32;
    }

    // Convert path to a NUL-terminated wide string
    let wide_path: Vec<u16> = path
        .as_os_str()
        .encode_wide()
        .chain(std::iter::once(0))
        .collect();

    let mut free_bytes_available: u64 = 0;
    let mut total_bytes: u64 = 0;
    let mut total_free_bytes: u64 = 0;

    // SAFETY: `wide_path` is NUL-terminated and outlives the call; the three out
    // pointers refer to live, writable `u64` locals.
    let result = unsafe {
        GetDiskFreeSpaceExW(
            wide_path.as_ptr(),
            &mut free_bytes_available,
            &mut total_bytes,
            &mut total_free_bytes,
        )
    };

    // The API signals success with a non-zero return value.
    (result != 0).then_some((total_bytes, free_bytes_available))
}
434
/// Fallback for unsupported platforms.
///
/// Compiled only when the target is neither Unix nor Windows. Always returns
/// `None`, i.e. the available space is unknown; `_path` is ignored.
#[cfg(not(any(unix, target_os = "windows")))]
pub fn get_available_space_mb(_path: &Path) -> Option<usize> {
    None
}

/// Fallback for unsupported platforms.
///
/// Compiled only when the target is neither Unix nor Windows. Always returns
/// `None`, i.e. total and available space are unknown; `_path` is ignored.
#[cfg(not(any(unix, target_os = "windows")))]
pub fn get_disk_space(_path: &Path) -> Option<(u64, u64)> {
    None
}
445
/// Estimate output size in MB for planned generation.
///
/// For every requested format the entry count is multiplied by a fixed per-entry
/// byte estimate; the sum is divided by 5 when `compression` is set, inflated by
/// 30% for overhead, and rounded up to whole megabytes.
///
/// The arithmetic saturates, so absurdly large entry counts yield a very large
/// estimate instead of overflowing. An empty `formats` slice yields `0`.
pub fn estimate_output_size_mb(
    num_entries: usize,
    formats: &[OutputFormat],
    compression: bool,
) -> usize {
    /// Average bytes per journal entry by format.
    fn bytes_per_entry(format: OutputFormat) -> usize {
        match format {
            OutputFormat::Csv => 400,     // CSV is compact
            OutputFormat::Json => 800,    // JSON has field names
            OutputFormat::Parquet => 200, // Parquet is compressed columnar
        }
    }

    // Saturating multiply/add: a plain `*` / `sum()` would panic in debug builds and
    // wrap to a misleadingly small estimate in release builds.
    let raw_bytes = formats
        .iter()
        .map(|&format| num_entries.saturating_mul(bytes_per_entry(format)))
        .fold(0usize, usize::saturating_add);

    let payload_bytes = if compression {
        raw_bytes / 5 // ~5x compression ratio
    } else {
        raw_bytes
    };

    // Add overhead for master data, indexes, etc. The float-to-int `as` cast
    // saturates at `usize::MAX`, so this step cannot overflow either.
    let with_overhead = (payload_bytes as f64 * 1.3) as usize;

    with_overhead.div_ceil(1024 * 1024)
}

/// Output format for size estimation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputFormat {
    /// Comma-separated values.
    Csv,
    /// JSON.
    Json,
    /// Apache Parquet.
    Parquet,
}
485
486/// Check if there's enough disk space for planned output.
487pub fn check_sufficient_disk_space(
488    path: &Path,
489    planned_entries: usize,
490    formats: &[OutputFormat],
491    compression: bool,
492    min_free_mb: usize,
493) -> Result<(), String> {
494    let estimated = estimate_output_size_mb(planned_entries, formats, compression);
495    let available = get_available_space_mb(path)
496        .ok_or_else(|| "Unable to determine available disk space on this platform".to_string())?;
497
498    let required = estimated + min_free_mb;
499
500    if available < required {
501        Err(format!(
502            "Insufficient disk space: {available} MB available, need {required} MB \
503             (estimated output: {estimated} MB, minimum free: {min_free_mb} MB). \
504             Reduce output volume or free up disk space."
505        ))
506    } else {
507        Ok(())
508    }
509}
510
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// `with_min_free` derives the soft limit as five times the hard limit.
    #[test]
    fn test_disk_guard_creation() {
        let DiskSpaceGuard { config, .. } = DiskSpaceGuard::with_min_free(100);
        assert_eq!(config.hard_limit_mb, 100);
        assert_eq!(config.soft_limit_mb, 500);
    }

    /// A hard limit of zero turns every check into a no-op success.
    #[test]
    fn test_disk_guard_disabled() {
        let guard = DiskSpaceGuard::new(DiskSpaceGuardConfig {
            hard_limit_mb: 0,
            ..Default::default()
        });
        assert!(guard.check().is_ok());
        assert!(guard.check_now().is_ok());
    }

    /// Estimates are positive, small for small inputs, and shrink with compression.
    #[test]
    fn test_output_size_estimation() {
        let formats = [OutputFormat::Csv, OutputFormat::Json];

        let plain = estimate_output_size_mb(1000, &formats, false);
        assert!(plain > 0);
        assert!(plain < 10); // Should be reasonable for 1000 entries

        let compressed = estimate_output_size_mb(1000, &formats, true);
        assert!(compressed < plain); // Compressed should be smaller
    }

    /// Counters exposed through `stats()` reflect checks and recorded writes.
    #[test]
    fn test_stats_tracking() {
        let guard = DiskSpaceGuard::with_min_free(1);

        (0..1000).for_each(|_| {
            let _ = guard.check();
        });

        let one_mb: u64 = 1024 * 1024;
        guard.record_write(one_mb);

        let snapshot = guard.stats();
        assert!(snapshot.checks_performed > 0);
        assert_eq!(snapshot.estimated_bytes_written, one_mb);
    }

    /// Disk space tracking must work on Unix targets.
    #[test]
    fn test_is_available() {
        #[cfg(unix)]
        assert!(DiskSpaceGuard::is_available());
    }

    /// A tiny write should pass the pre-check on most systems.
    #[test]
    fn test_check_before_write() {
        let guard = DiskSpaceGuard::with_min_free(1);
        assert!(guard.check_before_write(1024).is_ok());
    }
}
570        // This should succeed for a small write on most systems
571        let result = guard.check_before_write(1024);
572        assert!(result.is_ok());
573    }
574}