// sqry_core/io/file_reader.rs

//! Efficient file reading abstraction with memory-mapping support.
//!
//! Provides `FileReader`, which automatically chooses between memory-mapped
//! and buffered reading based on file size and platform capabilities.
5
6use anyhow::{Context, Result, bail};
7use memmap2::Mmap;
8use std::fs::File;
9use std::io::Read;
10use std::path::Path;
11
// P1-17: Use the configurable mmap threshold from `config::buffers`.
// RR-10: Use the configurable maximum file size for DoS prevention.
14use crate::config::buffers::{max_source_file_size, mmap_threshold};
15
/// Policy for choosing the file reading strategy.
///
/// Passed to `FileReader::open_with_policy` to decide whether a file is
/// memory-mapped or read into a heap buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReaderPolicy {
    /// Always use buffered reading.
    Buffered,
    /// Always attempt memory-mapping (falls back to buffered reading on failure).
    Mmap,
    /// Automatically choose based on file size: files whose size is at least
    /// `threshold` bytes are memory-mapped, smaller files are buffered.
    Auto {
        /// Threshold in bytes at which reading switches to mmap.
        threshold: u64,
    },
}
29
30impl Default for ReaderPolicy {
31    fn default() -> Self {
32        // P1-17: Use configurable threshold (respects SQRY_MMAP_THRESHOLD)
33        Self::Auto {
34            threshold: mmap_threshold(),
35        }
36    }
37}
38
39/// File reader that supports both memory-mapped and buffered reading
40pub enum FileReader {
41    /// Memory-mapped file
42    Mmap {
43        /// File handle (kept alive to ensure mmap validity)
44        #[allow(dead_code)]
45        file: File,
46        /// Memory-mapped region
47        mmap: Mmap,
48    },
49    /// Buffered file data
50    Buffered {
51        /// File contents loaded into memory
52        data: Vec<u8>,
53    },
54}
55
56impl FileReader {
57    /// Open a file using the default policy (auto with 10MB threshold)
58    ///
59    /// # Errors
60    ///
61    /// Returns [`anyhow::Error`] when the file cannot be opened, memory-mapped, or read.
62    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
63        Self::open_with_policy(path, ReaderPolicy::default())
64    }
65
66    /// Open a file with a specific reading policy
67    ///
68    /// # Errors
69    ///
70    /// Returns [`anyhow::Error`] when file metadata access, mmap, or buffered reads fail.
71    /// Also returns error if file exceeds the maximum source file size limit (RR-10 `DoS` prevention).
72    pub fn open_with_policy<P: AsRef<Path>>(path: P, policy: ReaderPolicy) -> Result<Self> {
73        let path = path.as_ref();
74        let file =
75            File::open(path).with_context(|| format!("Failed to open file: {}", path.display()))?;
76
77        let metadata = file
78            .metadata()
79            .with_context(|| format!("Failed to read file metadata: {}", path.display()))?;
80
81        let file_size = metadata.len();
82
83        // RR-10 Gap #1: Enforce maximum file size to prevent DoS via huge files
84        let max_size = max_source_file_size();
85        if file_size > max_size {
86            bail!(
87                "File too large to index: {} ({} MB exceeds {} MB limit). \
88                 Adjust SQRY_MAX_SOURCE_FILE_SIZE environment variable if needed.",
89                path.display(),
90                file_size / (1024 * 1024),
91                max_size / (1024 * 1024)
92            );
93        }
94
95        // Decide reading strategy
96        let use_mmap = match policy {
97            ReaderPolicy::Buffered => false,
98            ReaderPolicy::Mmap => true,
99            ReaderPolicy::Auto { threshold } => file_size >= threshold,
100        };
101
102        if use_mmap {
103            // Try memory-mapping first
104            match Self::try_mmap(file, path) {
105                Ok(reader) => Ok(reader),
106                Err(_e) => {
107                    // Mmap failed, fallback to buffered reading
108                    // Reopen file since we consumed it
109                    let mut file = File::open(path)?;
110                    Self::read_buffered(&mut file, path)
111                }
112            }
113        } else {
114            let mut file_for_read = file;
115            Self::read_buffered(&mut file_for_read, path)
116        }
117    }
118
119    /// Attempt to create a memory-mapped reader
120    fn try_mmap(file: File, path: &Path) -> Result<Self> {
121        // Safety: We're only reading the file, and we keep the File handle alive
122        // to ensure the mapping remains valid
123        let mmap = unsafe {
124            Mmap::map(&file).with_context(|| format!("Failed to mmap file: {}", path.display()))?
125        };
126
127        Ok(FileReader::Mmap { file, mmap })
128    }
129
130    /// Read file contents into a buffer
131    fn read_buffered(file: &mut File, path: &Path) -> Result<Self> {
132        let mut data = Vec::new();
133        file.read_to_end(&mut data)
134            .with_context(|| format!("Failed to read file: {}", path.display()))?;
135
136        Ok(FileReader::Buffered { data })
137    }
138
139    /// Get a slice of the file contents
140    #[must_use]
141    pub fn as_slice(&self) -> &[u8] {
142        match self {
143            FileReader::Mmap { mmap, .. } => &mmap[..],
144            FileReader::Buffered { data } => &data[..],
145        }
146    }
147
148    /// Get the size of the file in bytes
149    #[must_use]
150    pub fn len(&self) -> usize {
151        self.as_slice().len()
152    }
153
154    /// Check if the file is empty
155    #[must_use]
156    pub fn is_empty(&self) -> bool {
157        self.len() == 0
158    }
159
160    /// Iterate over chunks of the file
161    pub fn chunks(&self, chunk_size: usize) -> impl Iterator<Item = &[u8]> {
162        self.as_slice().chunks(chunk_size)
163    }
164}
165
166impl AsRef<[u8]> for FileReader {
167    fn as_ref(&self) -> &[u8] {
168        self.as_slice()
169    }
170}
171
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a temp file of `size` bytes filled with the repeating pattern
    /// `0, 1, ..., 255, 0, ...` and return it together with the expected contents.
    fn create_temp_file(size: usize) -> (NamedTempFile, Vec<u8>) {
        let mut file = NamedTempFile::new().expect("Failed to create temp file");
        // Cycling a `u8` range yields the pattern without any truncating cast.
        let data: Vec<u8> = (0..=u8::MAX).cycle().take(size).collect();
        file.write_all(&data).expect("Failed to write temp file");
        file.flush().expect("Failed to flush temp file");
        (file, data)
    }

    #[test]
    fn test_buffered_small_file() {
        let (file, expected_data) = create_temp_file(1024);

        let reader = FileReader::open_with_policy(file.path(), ReaderPolicy::Buffered)
            .expect("Failed to open file");

        // The `Buffered` policy never attempts mmap, so the variant is deterministic.
        assert!(matches!(reader, FileReader::Buffered { .. }));
        assert_eq!(reader.as_slice(), &expected_data[..]);
        assert_eq!(reader.len(), 1024);
        assert!(!reader.is_empty());
    }

    #[test]
    fn test_mmap_large_file() {
        let size = 15 * 1024 * 1024; // 15 MB
        let (file, expected_data) = create_temp_file(size);

        // The variant is not asserted: mmap may legitimately fall back to buffered.
        let reader = FileReader::open_with_policy(file.path(), ReaderPolicy::Mmap)
            .expect("Failed to open file");

        assert_eq!(reader.as_slice(), &expected_data[..]);
        assert_eq!(reader.len(), size);
    }

    #[test]
    fn test_auto_policy_small_file() {
        let (file, expected_data) = create_temp_file(1024);

        let reader = FileReader::open_with_policy(
            file.path(),
            ReaderPolicy::Auto {
                threshold: 10 * 1024 * 1024,
            },
        )
        .expect("Failed to open file");

        // Below the threshold the reader must not use mmap.
        assert!(matches!(reader, FileReader::Buffered { .. }));
        assert_eq!(reader.as_slice(), &expected_data[..]);
    }

    #[test]
    fn test_auto_policy_large_file() {
        let size = 15 * 1024 * 1024; // 15 MB
        let (file, expected_data) = create_temp_file(size);

        let reader = FileReader::open_with_policy(
            file.path(),
            ReaderPolicy::Auto {
                threshold: 10 * 1024 * 1024,
            },
        )
        .expect("Failed to open file");

        assert_eq!(reader.as_slice(), &expected_data[..]);
        assert_eq!(reader.len(), size);
    }

    #[test]
    fn test_chunks_iteration() {
        let (file, _) = create_temp_file(1000);

        let reader = FileReader::open(file.path()).expect("Failed to open file");

        let chunks: Vec<_> = reader.chunks(100).collect();
        assert_eq!(chunks.len(), 10);
        assert_eq!(chunks[0].len(), 100);
        assert_eq!(chunks[9].len(), 100);
    }

    #[test]
    fn test_empty_file() {
        let file = NamedTempFile::new().expect("Failed to create temp file");

        let reader = FileReader::open(file.path()).expect("Failed to open file");

        assert!(reader.is_empty());
        assert_eq!(reader.len(), 0);
    }

    #[test]
    fn test_missing_file_is_error() {
        // Dropping the `NamedTempFile` deletes it, leaving a path that no longer exists.
        let file = NamedTempFile::new().expect("Failed to create temp file");
        let path = file.path().to_path_buf();
        drop(file);

        assert!(FileReader::open(&path).is_err());
    }

    #[test]
    fn test_threshold_boundary() {
        const THRESHOLD_BYTES: usize = 5 * 1024; // 5 KB
        let threshold = u64::try_from(THRESHOLD_BYTES).expect("5 KB fits in u64");

        // Just below, exactly at, and just above the threshold.
        for size in [THRESHOLD_BYTES - 1, THRESHOLD_BYTES, THRESHOLD_BYTES + 1] {
            let (file, expected_data) = create_temp_file(size);
            let reader =
                FileReader::open_with_policy(file.path(), ReaderPolicy::Auto { threshold })
                    .unwrap_or_else(|e| panic!("Failed to open {}-byte file: {:#}", size, e));

            assert_eq!(reader.len(), size);
            assert_eq!(reader.as_slice(), &expected_data[..]);

            // Only the below-threshold case has a deterministic variant; at or
            // above the threshold mmap is attempted but may fall back.
            if size < THRESHOLD_BYTES {
                assert!(matches!(reader, FileReader::Buffered { .. }));
            }
        }
    }
}