Skip to main content

dataprof_core/sampling/
chunk_size.rs

1use sysinfo::System;
2
/// Strategy for deciding how many rows are processed per chunk.
///
/// The default strategy is [`ChunkSize::Adaptive`].
#[derive(Debug, Clone, Default)]
pub enum ChunkSize {
    /// Fixed chunk size in rows.
    Fixed(usize),

    /// Adaptive sizing based on the memory currently available on the host.
    #[default]
    Adaptive,

    /// Custom sizing function, called with the file size in bytes and
    /// returning the chunk size in rows.
    ///
    /// This is a plain `fn` pointer, which implements `Debug` and `Clone`, so
    /// the derives above cover this variant too. The trade-off is that only
    /// free functions and non-capturing closures can be stored here.
    Custom(fn(u64) -> usize),
}
15
16impl ChunkSize {
17    /// Calculate optimal chunk size based on available memory and file size
18    pub fn calculate(&self, file_size_bytes: u64) -> usize {
19        match self {
20            ChunkSize::Fixed(size) => *size,
21            ChunkSize::Adaptive => self.adaptive_size(file_size_bytes),
22            ChunkSize::Custom(func) => func(file_size_bytes),
23        }
24    }
25
26    fn adaptive_size(&self, file_size_bytes: u64) -> usize {
27        let mut system = System::new_all();
28        system.refresh_memory();
29
30        let available_memory = system.available_memory();
31
32        // Use max 10% of available memory for each chunk
33        let memory_per_chunk = (available_memory / 10).max(64 * 1024 * 1024); // Min 64MB
34
35        // Estimate rows per MB (rough heuristic: 1KB per row average)
36        let estimated_row_size = 1024;
37        let rows_per_chunk = (memory_per_chunk / estimated_row_size) as usize;
38
39        // Adjust based on file size
40        let file_size_mb = file_size_bytes / (1024 * 1024);
41
42        if file_size_mb < 100 {
43            // Small files: process all at once
44            rows_per_chunk * 10
45        } else if file_size_mb > 10_000 {
46            // Very large files: smaller chunks to avoid memory pressure
47            rows_per_chunk / 2
48        } else {
49            rows_per_chunk
50        }
51    }
52}