fast_yaml_parallel/io/
reader.rs

1//! Smart file reading with automatic strategy selection based on file size.
2
3use std::fs::File;
4use std::path::Path;
5
6use memmap2::Mmap;
7
8use crate::error::{Error, Result};
9
/// Default size cutoff, in bytes, at which reading switches to memory mapping: 512 KiB.
const MMAP_THRESHOLD: u64 = 512 << 10;
12
13/// File content holder that abstracts over in-memory strings and memory-mapped files.
14#[derive(Debug)]
15pub enum FileContent {
16    /// Content loaded into memory as a String
17    String(String),
18    /// Content accessed via memory-mapped file
19    Mmap(Mmap),
20}
21
22impl FileContent {
23    /// Returns the content as a string slice.
24    ///
25    /// For String variant, returns the string directly.
26    /// For Mmap variant, validates UTF-8 encoding first.
27    pub fn as_str(&self) -> Result<&str> {
28        match self {
29            Self::String(s) => Ok(s),
30            Self::Mmap(mmap) => std::str::from_utf8(mmap).map_err(|source| Error::Utf8 { source }),
31        }
32    }
33
34    /// Returns true if content is memory-mapped
35    pub const fn is_mmap(&self) -> bool {
36        matches!(self, Self::Mmap(_))
37    }
38
39    /// Returns the size of the content in bytes
40    pub fn len(&self) -> usize {
41        match self {
42            Self::String(s) => s.len(),
43            Self::Mmap(mmap) => mmap.len(),
44        }
45    }
46
47    /// Returns true if the content is empty
48    pub fn is_empty(&self) -> bool {
49        self.len() == 0
50    }
51}
52
53/// Smart file reader that chooses optimal reading strategy based on file size.
54///
55/// For files smaller than the threshold, uses `std::fs::read_to_string` for simplicity.
56/// For larger files, uses memory-mapped files to avoid loading entire content into heap.
57#[derive(Debug)]
58pub struct SmartReader {
59    mmap_threshold: u64,
60}
61
62impl SmartReader {
63    /// Creates a new `SmartReader` with the default threshold (512KB).
64    ///
65    /// # Examples
66    ///
67    /// ```
68    /// use fast_yaml_parallel::SmartReader;
69    /// use std::path::Path;
70    ///
71    /// let reader = SmartReader::new();
72    /// # let temp_file = tempfile::NamedTempFile::new().unwrap();
73    /// # std::fs::write(temp_file.path(), "key: value\n").unwrap();
74    /// let content = reader.read(temp_file.path())?;
75    /// let yaml = content.as_str()?;
76    /// assert!(yaml.contains("key"));
77    /// # Ok::<(), fast_yaml_parallel::Error>(())
78    /// ```
79    pub const fn new() -> Self {
80        Self::with_threshold(MMAP_THRESHOLD)
81    }
82
83    /// Creates a new `SmartReader` with a custom threshold.
84    ///
85    /// # Examples
86    ///
87    /// ```
88    /// use fast_yaml_parallel::SmartReader;
89    ///
90    /// // Use mmap for files larger than 1MB
91    /// let reader = SmartReader::with_threshold(1024 * 1024);
92    /// ```
93    pub const fn with_threshold(threshold: u64) -> Self {
94        Self {
95            mmap_threshold: threshold,
96        }
97    }
98
99    /// Reads file content using the optimal strategy based on file size.
100    ///
101    /// Returns `FileContent` and automatically chooses between:
102    /// - `read_to_string` for files < threshold
103    /// - `mmap` for files >= threshold
104    ///
105    /// Falls back to `read_to_string` if mmap fails.
106    ///
107    /// # Errors
108    ///
109    /// Returns `Error::Io` if:
110    /// - Path does not exist
111    /// - Path is a directory
112    /// - Insufficient permissions
113    pub fn read(&self, path: &Path) -> Result<FileContent> {
114        let metadata = std::fs::metadata(path).map_err(|source| Error::Io {
115            path: path.to_path_buf(),
116            source,
117        })?;
118
119        if metadata.is_dir() {
120            return Err(Error::Io {
121                path: path.to_path_buf(),
122                source: std::io::Error::new(
123                    std::io::ErrorKind::InvalidInput,
124                    "path is a directory, not a file",
125                ),
126            });
127        }
128
129        let size = metadata.len();
130
131        if size >= self.mmap_threshold {
132            Self::read_mmap(path).or_else(|_| {
133                // Fallback to read_to_string if mmap fails
134                Self::read_string(path)
135            })
136        } else {
137            Self::read_string(path)
138        }
139    }
140
141    /// Reads file into memory as a String
142    fn read_string(path: &Path) -> Result<FileContent> {
143        let content = std::fs::read_to_string(path).map_err(|source| Error::Io {
144            path: path.to_path_buf(),
145            source,
146        })?;
147        Ok(FileContent::String(content))
148    }
149
150    /// Reads file using memory-mapped file
151    #[allow(unsafe_code)]
152    fn read_mmap(path: &Path) -> Result<FileContent> {
153        let file = File::open(path).map_err(|source| Error::Io {
154            path: path.to_path_buf(),
155            source,
156        })?;
157
158        // SAFETY: We're opening the file read-only and mapping it.
159        // The file could be modified by another process during reading,
160        // but this is acceptable for a parser tool:
161        // - If modified, worst case is a parse error (which is handled)
162        // - User expectation is that files aren't modified during parsing
163        // - Same race condition exists with read_to_string
164        // - The mmap is read-only, so we won't write to mapped memory
165        // - Mmap type ensures memory is unmapped when dropped
166        let mmap = unsafe {
167            Mmap::map(&file).map_err(|source| Error::Io {
168                path: path.to_path_buf(),
169                source,
170            })?
171        };
172
173        Ok(FileContent::Mmap(mmap))
174    }
175}
176
177impl Default for SmartReader {
178    fn default() -> Self {
179        Self::new()
180    }
181}
182
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Byte count of the default mmap threshold, used to build boundary-sized files.
    const DEFAULT_THRESHOLD_BYTES: usize = 512 * 1024;

    /// Creates a named temporary file containing exactly `text`.
    fn temp_file_with(text: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(text.as_bytes()).unwrap();
        file
    }

    /// Creates a named temporary file filled with `len` ASCII `x` bytes.
    fn temp_file_of_len(len: usize) -> NamedTempFile {
        temp_file_with(&"x".repeat(len))
    }

    #[test]
    fn test_file_content_as_str_string() {
        let content = FileContent::String(String::from("test content"));

        assert_eq!(content.as_str().unwrap(), "test content");
        assert!(!content.is_mmap());
        assert_eq!(content.len(), 12);
        assert!(!content.is_empty());
    }

    #[test]
    fn test_file_content_is_empty() {
        assert!(FileContent::String(String::new()).is_empty());
    }

    #[test]
    fn test_reader_small_file_uses_string() {
        let file = temp_file_with("small: content");

        let content = SmartReader::new().read(file.path()).unwrap();

        assert!(!content.is_mmap());
        assert_eq!(content.as_str().unwrap(), "small: content");
    }

    #[test]
    fn test_reader_large_file_uses_mmap() {
        // 600KB is above the 512KB default threshold.
        let expected_len = 600 * 1024;
        let file = temp_file_of_len(expected_len);

        let content = SmartReader::new().read(file.path()).unwrap();

        assert!(content.is_mmap());
        assert_eq!(content.len(), expected_len);
    }

    #[test]
    fn test_reader_custom_threshold() {
        let file = temp_file_with("test content");

        // The 12-byte file exceeds a 5-byte threshold, so it must be mapped.
        let content = SmartReader::with_threshold(5).read(file.path()).unwrap();

        assert!(content.is_mmap());
    }

    #[test]
    fn test_reader_default_equals_new() {
        let from_new = SmartReader::new();
        let from_default = SmartReader::default();

        assert_eq!(from_new.mmap_threshold, from_default.mmap_threshold);
    }

    #[test]
    fn test_read_nonexistent_file() {
        let missing = Path::new("/nonexistent/file.yaml");

        assert!(SmartReader::new().read(missing).is_err());
    }

    #[test]
    fn test_file_content_len() {
        assert_eq!(FileContent::String(String::from("hello")).len(), 5);
    }

    #[test]
    fn test_read_utf8_validation_with_mmap() {
        // Valid UTF-8 payload that is larger than the default threshold.
        let file = temp_file_with(&"valid: utf8 content\n".repeat(30_000));

        let file_content = SmartReader::new().read(file.path()).unwrap();

        assert!(file_content.is_mmap());
        assert!(file_content.as_str().is_ok());
    }

    #[test]
    #[cfg(unix)]
    fn test_symlink_handling() {
        use std::os::unix::fs::symlink;

        let temp_dir = tempfile::tempdir().unwrap();
        let target = temp_dir.path().join("target.yaml");
        let link = temp_dir.path().join("link.yaml");
        std::fs::write(&target, "key: value\n").unwrap();
        symlink(&target, &link).unwrap();

        // The reader is expected to follow the link to the target's content.
        let content = SmartReader::new().read(&link).unwrap();

        assert_eq!(content.as_str().unwrap(), "key: value\n");
    }

    #[test]
    #[cfg(unix)]
    fn test_broken_symlink_error() {
        use std::os::unix::fs::symlink;

        let temp_dir = tempfile::tempdir().unwrap();
        let nonexistent = temp_dir.path().join("nonexistent.yaml");
        let link = temp_dir.path().join("broken_link.yaml");
        // The link target is never created, so the link dangles.
        symlink(&nonexistent, &link).unwrap();

        let result = SmartReader::new().read(&link);

        assert!(result.is_err());
    }

    #[test]
    fn test_file_exactly_at_threshold() {
        let file = temp_file_of_len(DEFAULT_THRESHOLD_BYTES);

        let file_content = SmartReader::new().read(file.path()).unwrap();

        // A size equal to the threshold takes the mmap path.
        assert!(file_content.is_mmap());
        assert_eq!(file_content.len(), DEFAULT_THRESHOLD_BYTES);
    }

    #[test]
    fn test_file_just_below_threshold() {
        let file = temp_file_of_len(DEFAULT_THRESHOLD_BYTES - 1);

        let file_content = SmartReader::new().read(file.path()).unwrap();

        // One byte under the threshold stays on the String path.
        assert!(!file_content.is_mmap());
        assert_eq!(file_content.len(), DEFAULT_THRESHOLD_BYTES - 1);
    }

    #[test]
    fn test_file_just_above_threshold() {
        let file = temp_file_of_len(DEFAULT_THRESHOLD_BYTES + 1);

        let file_content = SmartReader::new().read(file.path()).unwrap();

        // One byte over the threshold takes the mmap path.
        assert!(file_content.is_mmap());
        assert_eq!(file_content.len(), DEFAULT_THRESHOLD_BYTES + 1);
    }

    #[test]
    fn test_zero_length_file() {
        // Nothing is written, so the file stays empty.
        let file = NamedTempFile::new().unwrap();

        let content = SmartReader::new().read(file.path()).unwrap();

        assert!(content.is_empty());
        assert_eq!(content.len(), 0);
        assert_eq!(content.as_str().unwrap(), "");
    }

    #[test]
    fn test_directory_instead_of_file() {
        let temp_dir = tempfile::tempdir().unwrap();

        let result = SmartReader::new().read(temp_dir.path());

        assert!(result.is_err());
    }

    #[test]
    fn test_invalid_utf8_with_string() {
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("invalid.bin");
        // Small non-UTF-8 payload, so the String path is taken.
        std::fs::write(&path, b"\xFF\xFE invalid utf8").unwrap();

        let result = SmartReader::new().read(&path);

        // The read itself fails on UTF-8 validation.
        assert!(result.is_err());
    }

    #[test]
    fn test_invalid_utf8_with_mmap() {
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("invalid_large.bin");
        // Large non-UTF-8 payload, so the mmap path is taken.
        let mut payload = vec![0xFF_u8; 600 * 1024];
        payload.extend_from_slice(b" invalid utf8");
        std::fs::write(&path, payload).unwrap();

        let file_content = SmartReader::new().read(&path).unwrap();

        // Creating the mapping succeeds; validation only fails in as_str().
        assert!(file_content.is_mmap());
        assert!(file_content.as_str().is_err());
    }

    #[test]
    fn test_empty_mmap_file() {
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("empty.yaml");
        std::fs::write(&path, "").unwrap();

        // A threshold of 0 requests mmap even for an empty file. Either
        // backing variant is acceptable here; only the content is checked.
        let content = SmartReader::with_threshold(0).read(&path).unwrap();

        assert!(content.is_empty());
        assert_eq!(content.as_str().unwrap(), "");
    }

    #[test]
    fn test_file_content_mmap_is_mmap() {
        let file = temp_file_of_len(600 * 1024);

        let file_content = SmartReader::new().read(file.path()).unwrap();

        assert!(file_content.is_mmap());
        assert_eq!(file_content.len(), 600 * 1024);
    }

    #[test]
    #[cfg(unix)]
    fn test_directory_symlink_rejection() {
        use std::os::unix::fs::symlink;

        let temp_dir = tempfile::tempdir().unwrap();
        let target_dir = temp_dir.path().join("target_dir");
        let link = temp_dir.path().join("dir_link");
        std::fs::create_dir(&target_dir).unwrap();
        symlink(&target_dir, &link).unwrap();

        // A link that resolves to a directory must be rejected.
        let result = SmartReader::new().read(&link);

        assert!(result.is_err());
        match result {
            Err(Error::Io { source, .. }) => {
                assert_eq!(source.kind(), std::io::ErrorKind::InvalidInput);
            }
            _ => panic!("expected Io error"),
        }
    }
}
487        }
488    }
489}