datasynth_core/
disk_guard.rs

1//! Disk space management and guardrails for preventing disk exhaustion.
2//!
3//! This module provides disk space tracking and enforcement across different platforms,
4//! with configurable minimum free space limits and pre-write checks.
5
6use std::path::{Path, PathBuf};
7use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
8use std::sync::Arc;
9
/// Disk space usage statistics.
///
/// A snapshot returned by `DiskSpaceGuard::stats`: the filesystem figures are
/// queried at the moment of the call, the remaining fields are copied from the
/// guard's internal counters.
#[derive(Debug, Clone, Default)]
pub struct DiskStats {
    /// Total disk space in bytes (0 when the platform query fails)
    pub total_bytes: u64,
    /// Available disk space in bytes (0 when the platform query fails)
    pub available_bytes: u64,
    /// Used disk space in bytes, computed as total minus available (saturating at 0)
    pub used_bytes: u64,
    /// Number of disk space checks performed; taken from the guard's operation
    /// counter, which advances on every `DiskSpaceGuard::check` call while the
    /// guard is enabled
    pub checks_performed: u64,
    /// Number of soft limit warnings
    pub soft_limit_warnings: u64,
    /// Whether hard limit was ever exceeded
    pub hard_limit_exceeded: bool,
    /// Estimated bytes written this session (sum of `DiskSpaceGuard::record_write` calls)
    pub estimated_bytes_written: u64,
}
28
/// Disk space guard configuration.
///
/// All sizes are in MB, where one MB is `1024 * 1024` bytes. Setting
/// `hard_limit_mb` to `0` disables the guard's checks entirely.
#[derive(Debug, Clone)]
pub struct DiskSpaceGuardConfig {
    /// Minimum free space required in MB (hard limit); `0` disables checking
    pub hard_limit_mb: usize,
    /// Warning threshold in MB (soft limit); dropping below it only bumps a warning counter
    pub soft_limit_mb: usize,
    /// Check interval (every N write operations)
    pub check_interval: usize,
    /// Reserve buffer in MB, added on top of the hard limit when computing required free space
    pub reserve_buffer_mb: usize,
    /// Path whose filesystem is monitored; when `None` the guard queries `"."`
    /// (the current working directory)
    pub monitor_path: Option<PathBuf>,
}
43
44impl Default for DiskSpaceGuardConfig {
45    fn default() -> Self {
46        Self {
47            hard_limit_mb: 100,    // Require at least 100 MB free
48            soft_limit_mb: 500,    // Warn when below 500 MB
49            check_interval: 500,   // Check every 500 operations
50            reserve_buffer_mb: 50, // Keep 50 MB buffer
51            monitor_path: None,
52        }
53    }
54}
55
56impl DiskSpaceGuardConfig {
57    /// Create config with specified minimum free space.
58    pub fn with_min_free_mb(hard_limit_mb: usize) -> Self {
59        Self {
60            hard_limit_mb,
61            soft_limit_mb: hard_limit_mb * 5, // Soft limit at 5x hard limit
62            ..Default::default()
63        }
64    }
65
66    /// Set the path to monitor for disk space.
67    pub fn with_path<P: AsRef<Path>>(mut self, path: P) -> Self {
68        self.monitor_path = Some(path.as_ref().to_path_buf());
69        self
70    }
71
72    /// Set the reserve buffer.
73    pub fn with_reserve(mut self, reserve_mb: usize) -> Self {
74        self.reserve_buffer_mb = reserve_mb;
75        self
76    }
77}
78
/// Disk space limit exceeded error.
///
/// Returned by the guard's check methods when the free space on the monitored
/// filesystem is below what is required.
#[derive(Debug, Clone)]
pub struct DiskSpaceExhausted {
    /// Free space observed at the time of the failed check, in MB
    pub available_mb: usize,
    /// Free space that would have been needed for the check to pass, in MB
    pub required_mb: usize,
    /// `true` for a soft (warning-level) limit, `false` for a hard limit
    pub is_soft_limit: bool,
    /// Human-readable description; this is what `Display` prints
    pub message: String,
}
87
88impl std::fmt::Display for DiskSpaceExhausted {
89    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
90        write!(f, "{}", self.message)
91    }
92}
93
// Marker impl: `Debug` (derived) and `Display` already provide everything the
// `Error` trait requires, so the default method implementations are used as-is.
impl std::error::Error for DiskSpaceExhausted {}
95
/// Thread-safe disk space guard for monitoring and enforcing disk limits.
///
/// All mutable state lives in atomics, so every method takes `&self` and the
/// guard can be shared (for example behind an `Arc`) without extra locking.
#[derive(Debug)]
pub struct DiskSpaceGuard {
    /// Limits, check interval and monitored path
    config: DiskSpaceGuardConfig,
    /// Incremented by each `check` call while the guard is enabled; drives the check interval
    operation_counter: AtomicU64,
    /// Number of checks that found free space below the soft limit
    soft_warnings_count: AtomicU64,
    /// Set once a check finds free space below the hard limit; cleared only by `reset_stats`
    hard_limit_exceeded: AtomicBool,
    /// Running total of bytes reported through `record_write`
    bytes_written_estimate: AtomicU64,
    /// Free space in MB recorded by the most recent `check_now`
    last_available_mb: AtomicUsize,
}
106
107impl DiskSpaceGuard {
108    /// Create a new disk space guard with the given configuration.
109    pub fn new(config: DiskSpaceGuardConfig) -> Self {
110        Self {
111            config,
112            operation_counter: AtomicU64::new(0),
113            soft_warnings_count: AtomicU64::new(0),
114            hard_limit_exceeded: AtomicBool::new(false),
115            bytes_written_estimate: AtomicU64::new(0),
116            last_available_mb: AtomicUsize::new(0),
117        }
118    }
119
120    /// Create a disk space guard with default configuration.
121    pub fn default_guard() -> Self {
122        Self::new(DiskSpaceGuardConfig::default())
123    }
124
125    /// Create a disk space guard with specified minimum free space.
126    pub fn with_min_free(min_free_mb: usize) -> Self {
127        Self::new(DiskSpaceGuardConfig::with_min_free_mb(min_free_mb))
128    }
129
130    /// Create an Arc-wrapped disk space guard for sharing across threads.
131    pub fn shared(config: DiskSpaceGuardConfig) -> Arc<Self> {
132        Arc::new(Self::new(config))
133    }
134
135    /// Check disk space limit (returns error if hard limit exceeded).
136    ///
137    /// This should be called periodically during file writes.
138    /// It's designed to be efficient - actual disk checks only happen
139    /// at the configured interval.
140    pub fn check(&self) -> Result<(), DiskSpaceExhausted> {
141        // Disabled if no limits set
142        if self.config.hard_limit_mb == 0 {
143            return Ok(());
144        }
145
146        let count = self.operation_counter.fetch_add(1, Ordering::Relaxed);
147
148        // Only check at intervals to minimize overhead
149        if count % self.config.check_interval as u64 != 0 {
150            return Ok(());
151        }
152
153        self.check_now()
154    }
155
156    /// Force an immediate disk space check (bypasses interval).
157    pub fn check_now(&self) -> Result<(), DiskSpaceExhausted> {
158        if self.config.hard_limit_mb == 0 {
159            return Ok(());
160        }
161
162        let path = self
163            .config
164            .monitor_path
165            .as_deref()
166            .unwrap_or(Path::new("."));
167
168        let available_mb = get_available_space_mb(path).unwrap_or(usize::MAX);
169        self.last_available_mb
170            .store(available_mb, Ordering::Relaxed);
171
172        // Check hard limit (minimum free space required)
173        let required_mb = self.config.hard_limit_mb + self.config.reserve_buffer_mb;
174        if available_mb < required_mb {
175            self.hard_limit_exceeded.store(true, Ordering::Relaxed);
176            return Err(DiskSpaceExhausted {
177                available_mb,
178                required_mb,
179                is_soft_limit: false,
180                message: format!(
181                    "Disk space exhausted: only {} MB available, need at least {} MB. \
182                     Free up disk space or reduce output volume.",
183                    available_mb, required_mb
184                ),
185            });
186        }
187
188        // Check soft limit (warning only)
189        if available_mb < self.config.soft_limit_mb {
190            self.soft_warnings_count.fetch_add(1, Ordering::Relaxed);
191            // Soft limit exceeded - consumer should check stats for warning count
192        }
193
194        Ok(())
195    }
196
197    /// Pre-check if there's enough space for an estimated write.
198    pub fn check_before_write(&self, estimated_bytes: u64) -> Result<(), DiskSpaceExhausted> {
199        if self.config.hard_limit_mb == 0 {
200            return Ok(());
201        }
202
203        let path = self
204            .config
205            .monitor_path
206            .as_deref()
207            .unwrap_or(Path::new("."));
208
209        let available_mb = get_available_space_mb(path).unwrap_or(usize::MAX);
210        let estimated_mb = (estimated_bytes / (1024 * 1024)) as usize;
211        let required_mb = self.config.hard_limit_mb + self.config.reserve_buffer_mb + estimated_mb;
212
213        if available_mb < required_mb {
214            return Err(DiskSpaceExhausted {
215                available_mb,
216                required_mb,
217                is_soft_limit: false,
218                message: format!(
219                    "Insufficient disk space for write: {} MB available, need {} MB \
220                     (estimated write: {} MB, reserve: {} MB).",
221                    available_mb, required_mb, estimated_mb, self.config.reserve_buffer_mb
222                ),
223            });
224        }
225
226        Ok(())
227    }
228
229    /// Record bytes written (for estimation).
230    pub fn record_write(&self, bytes: u64) {
231        self.bytes_written_estimate
232            .fetch_add(bytes, Ordering::Relaxed);
233    }
234
235    /// Get current disk space statistics.
236    pub fn stats(&self) -> DiskStats {
237        let path = self
238            .config
239            .monitor_path
240            .as_deref()
241            .unwrap_or(Path::new("."));
242
243        let (total, available) = get_disk_space(path).unwrap_or((0, 0));
244
245        DiskStats {
246            total_bytes: total,
247            available_bytes: available,
248            used_bytes: total.saturating_sub(available),
249            checks_performed: self.operation_counter.load(Ordering::Relaxed),
250            soft_limit_warnings: self.soft_warnings_count.load(Ordering::Relaxed),
251            hard_limit_exceeded: self.hard_limit_exceeded.load(Ordering::Relaxed),
252            estimated_bytes_written: self.bytes_written_estimate.load(Ordering::Relaxed),
253        }
254    }
255
256    /// Get current available space in MB.
257    pub fn available_space_mb(&self) -> usize {
258        let path = self
259            .config
260            .monitor_path
261            .as_deref()
262            .unwrap_or(Path::new("."));
263        get_available_space_mb(path).unwrap_or(0)
264    }
265
266    /// Check if disk space tracking is available on this platform.
267    pub fn is_available() -> bool {
268        get_available_space_mb(Path::new(".")).is_some()
269    }
270
271    /// Reset statistics (for testing).
272    pub fn reset_stats(&self) {
273        self.operation_counter.store(0, Ordering::Relaxed);
274        self.soft_warnings_count.store(0, Ordering::Relaxed);
275        self.hard_limit_exceeded.store(false, Ordering::Relaxed);
276        self.bytes_written_estimate.store(0, Ordering::Relaxed);
277    }
278}
279
280impl Default for DiskSpaceGuard {
281    fn default() -> Self {
282        Self::default_guard()
283    }
284}
285
286/// Get available disk space in MB (Linux/macOS implementation using statvfs).
287#[cfg(unix)]
288#[allow(clippy::unnecessary_cast)] // Casts needed for cross-platform compatibility
289pub fn get_available_space_mb(path: &Path) -> Option<usize> {
290    use std::ffi::CString;
291    use std::os::unix::ffi::OsStrExt;
292
293    let path_cstr = CString::new(path.as_os_str().as_bytes()).ok()?;
294
295    #[repr(C)]
296    struct Statvfs {
297        f_bsize: libc::c_ulong,
298        f_frsize: libc::c_ulong,
299        f_blocks: libc::fsblkcnt_t,
300        f_bfree: libc::fsblkcnt_t,
301        f_bavail: libc::fsblkcnt_t,
302        // Remaining fields are not needed
303        _rest: [u8; 128],
304    }
305
306    let mut stat: Statvfs = unsafe { std::mem::zeroed() };
307
308    let result = unsafe { libc::statvfs(path_cstr.as_ptr(), &mut stat as *mut _ as *mut _) };
309
310    if result == 0 {
311        let block_size = stat.f_frsize as u64;
312        let available_blocks = stat.f_bavail as u64;
313        let available_bytes = available_blocks * block_size;
314        Some((available_bytes / (1024 * 1024)) as usize)
315    } else {
316        None
317    }
318}
319
320/// Get total and available disk space in bytes (Linux/macOS).
321#[cfg(unix)]
322#[allow(clippy::unnecessary_cast)] // Casts needed for cross-platform compatibility
323pub fn get_disk_space(path: &Path) -> Option<(u64, u64)> {
324    use std::ffi::CString;
325    use std::os::unix::ffi::OsStrExt;
326
327    let path_cstr = CString::new(path.as_os_str().as_bytes()).ok()?;
328
329    #[repr(C)]
330    struct Statvfs {
331        f_bsize: libc::c_ulong,
332        f_frsize: libc::c_ulong,
333        f_blocks: libc::fsblkcnt_t,
334        f_bfree: libc::fsblkcnt_t,
335        f_bavail: libc::fsblkcnt_t,
336        _rest: [u8; 128],
337    }
338
339    let mut stat: Statvfs = unsafe { std::mem::zeroed() };
340
341    let result = unsafe { libc::statvfs(path_cstr.as_ptr(), &mut stat as *mut _ as *mut _) };
342
343    if result == 0 {
344        let block_size = stat.f_frsize as u64;
345        let total = stat.f_blocks as u64 * block_size;
346        let available = stat.f_bavail as u64 * block_size;
347        Some((total, available))
348    } else {
349        None
350    }
351}
352
/// Get available disk space in MB (Windows implementation).
///
/// Asks `GetDiskFreeSpaceExW` for the bytes available to the caller and
/// converts them to whole MB (`1024 * 1024` bytes, rounded down). Returns
/// `None` when the call reports failure.
#[cfg(target_os = "windows")]
pub fn get_available_space_mb(path: &Path) -> Option<usize> {
    use std::os::windows::ffi::OsStrExt;

    #[link(name = "kernel32")]
    extern "system" {
        fn GetDiskFreeSpaceExW(
            lpDirectoryName: *const u16,
            lpFreeBytesAvailableToCaller: *mut u64,
            lpTotalNumberOfBytes: *mut u64,
            lpTotalNumberOfFreeBytes: *mut u64,
        ) -> i32;
    }

    const BYTES_PER_MB: u64 = 1024 * 1024;

    // The API wants a NUL-terminated UTF-16 string.
    let mut dir_utf16: Vec<u16> = path.as_os_str().encode_wide().collect();
    dir_utf16.push(0);

    let (mut caller_free, mut volume_total, mut volume_free) = (0u64, 0u64, 0u64);

    // SAFETY: `dir_utf16` is NUL-terminated and alive for the whole call, and
    // the three out-pointers refer to distinct, writable `u64` locals.
    let status = unsafe {
        GetDiskFreeSpaceExW(
            dir_utf16.as_ptr(),
            &mut caller_free,
            &mut volume_total,
            &mut volume_free,
        )
    };

    // A zero return value signals failure.
    if status == 0 {
        return None;
    }
    Some((caller_free / BYTES_PER_MB) as usize)
}
394
/// Get total and available disk space in bytes (Windows).
///
/// Returns `(total, available)` as reported by `GetDiskFreeSpaceExW`, where
/// `available` is the number of bytes available to the caller. Returns `None`
/// when the call reports failure.
#[cfg(target_os = "windows")]
pub fn get_disk_space(path: &Path) -> Option<(u64, u64)> {
    use std::os::windows::ffi::OsStrExt;

    #[link(name = "kernel32")]
    extern "system" {
        fn GetDiskFreeSpaceExW(
            lpDirectoryName: *const u16,
            lpFreeBytesAvailableToCaller: *mut u64,
            lpTotalNumberOfBytes: *mut u64,
            lpTotalNumberOfFreeBytes: *mut u64,
        ) -> i32;
    }

    // The API wants a NUL-terminated UTF-16 string.
    let mut dir_utf16: Vec<u16> = path.as_os_str().encode_wide().collect();
    dir_utf16.push(0);

    let (mut caller_free, mut volume_total, mut volume_free) = (0u64, 0u64, 0u64);

    // SAFETY: `dir_utf16` is NUL-terminated and alive for the whole call, and
    // the three out-pointers refer to distinct, writable `u64` locals.
    let status = unsafe {
        GetDiskFreeSpaceExW(
            dir_utf16.as_ptr(),
            &mut caller_free,
            &mut volume_total,
            &mut volume_free,
        )
    };

    // A zero return value signals failure.
    if status == 0 {
        return None;
    }
    Some((volume_total, caller_free))
}
435
/// Fallback for unsupported platforms.
///
/// Compiled only when the target is neither Unix nor Windows. Always returns
/// `None`, meaning the available space is unknown.
#[cfg(not(any(unix, target_os = "windows")))]
pub fn get_available_space_mb(_path: &Path) -> Option<usize> {
    None
}
441
/// Fallback for unsupported platforms.
///
/// Compiled only when the target is neither Unix nor Windows. Always returns
/// `None`, meaning total and available space are unknown.
#[cfg(not(any(unix, target_os = "windows")))]
pub fn get_disk_space(_path: &Path) -> Option<(u64, u64)> {
    None
}
446
447/// Estimate output size in MB for planned generation.
448pub fn estimate_output_size_mb(
449    num_entries: usize,
450    formats: &[OutputFormat],
451    compression: bool,
452) -> usize {
453    // Average bytes per journal entry by format
454    let base_bytes_per_entry = |format: &OutputFormat| -> usize {
455        match format {
456            OutputFormat::Csv => 400,     // CSV is compact
457            OutputFormat::Json => 800,    // JSON has field names
458            OutputFormat::Parquet => 200, // Parquet is compressed columnar
459        }
460    };
461
462    let total: usize = formats
463        .iter()
464        .map(|f| num_entries * base_bytes_per_entry(f))
465        .sum();
466
467    let with_compression = if compression {
468        total / 5 // ~5x compression ratio
469    } else {
470        total
471    };
472
473    // Add overhead for master data, indexes, etc.
474    let with_overhead = (with_compression as f64 * 1.3) as usize;
475
476    with_overhead.div_ceil(1024 * 1024)
477}
478
/// Output format for size estimation.
///
/// Each variant maps to a fixed average per-entry byte cost inside
/// `estimate_output_size_mb`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputFormat {
    /// Comma-separated values
    Csv,
    /// JSON
    Json,
    /// Parquet
    Parquet,
}
486
487/// Check if there's enough disk space for planned output.
488pub fn check_sufficient_disk_space(
489    path: &Path,
490    planned_entries: usize,
491    formats: &[OutputFormat],
492    compression: bool,
493    min_free_mb: usize,
494) -> Result<(), String> {
495    let estimated = estimate_output_size_mb(planned_entries, formats, compression);
496    let available = get_available_space_mb(path)
497        .ok_or_else(|| "Unable to determine available disk space on this platform".to_string())?;
498
499    let required = estimated + min_free_mb;
500
501    if available < required {
502        Err(format!(
503            "Insufficient disk space: {} MB available, need {} MB \
504             (estimated output: {} MB, minimum free: {} MB). \
505             Reduce output volume or free up disk space.",
506            available, required, estimated, min_free_mb
507        ))
508    } else {
509        Ok(())
510    }
511}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// `with_min_free` sets the hard limit and derives the soft limit from it.
    #[test]
    fn test_disk_guard_creation() {
        let cfg = DiskSpaceGuard::with_min_free(100).config;
        assert_eq!(cfg.hard_limit_mb, 100);
        assert_eq!(cfg.soft_limit_mb, 500);
    }

    /// A hard limit of zero turns every check into a no-op success.
    #[test]
    fn test_disk_guard_disabled() {
        let guard = DiskSpaceGuard::new(DiskSpaceGuardConfig {
            hard_limit_mb: 0,
            ..Default::default()
        });
        // Should always succeed when disabled
        assert!(guard.check().is_ok());
        assert!(guard.check_now().is_ok());
    }

    /// Estimates are positive, small for small inputs, and shrink with compression.
    #[test]
    fn test_output_size_estimation() {
        let formats = [OutputFormat::Csv, OutputFormat::Json];

        let plain = estimate_output_size_mb(1000, &formats, false);
        assert!(plain > 0);
        assert!(plain < 10); // Should be reasonable for 1000 entries

        let compressed = estimate_output_size_mb(1000, &formats, true);
        assert!(compressed < plain); // Compressed should be smaller
    }

    /// Counters advance with `check` calls and `record_write` is reflected in the stats.
    #[test]
    fn test_stats_tracking() {
        let guard = DiskSpaceGuard::with_min_free(1);

        (0..1000).for_each(|_| {
            let _ = guard.check();
        });

        let one_mb: u64 = 1024 * 1024;
        guard.record_write(one_mb);

        let stats = guard.stats();
        assert!(stats.checks_performed > 0);
        assert_eq!(stats.estimated_bytes_written, one_mb);
    }

    /// Disk space tracking must be available on Unix targets.
    #[test]
    fn test_is_available() {
        #[cfg(unix)]
        assert!(DiskSpaceGuard::is_available());
    }

    /// A tiny planned write passes the pre-write check.
    #[test]
    fn test_check_before_write() {
        // This should succeed for a small write on most systems
        let outcome = DiskSpaceGuard::with_min_free(1).check_before_write(1024);
        assert!(outcome.is_ok());
    }
}
573        assert!(result.is_ok());
574    }
575}