Skip to main content

rlm_rs/io/
reader.rs

1//! File reading utilities with memory mapping support.
2//!
3//! Provides efficient file reading for both small and large files,
4//! with automatic detection of when to use memory mapping.
5
// Memory mapping requires `unsafe`: the maps created here are read-only, but a mapping is
// only sound while the underlying file is not truncated or modified by someone else.
7#![allow(unsafe_code)]
8
9use crate::error::{IoError, Result};
10use memmap2::Mmap;
11use std::fs::File;
12use std::io::Read;
13use std::path::Path;
14
/// File size in bytes at or above which reads go through memory mapping (1 MiB).
const MMAP_THRESHOLD: u64 = 1 << 20;

/// Largest file size in bytes that `FileReader::open` accepts (1 GiB).
const MAX_FILE_SIZE: u64 = 1 << 30;
20
/// File reader with support for memory mapping.
///
/// Automatically chooses the best reading strategy based on file size:
/// - Small files (< 1MB): Read directly into memory
/// - Large files (>= 1MB): Use memory mapping
///
/// The size is sampled once when the reader is opened; it is not refreshed
/// if the file changes afterwards.
///
/// # Examples
///
/// ```no_run
/// use rlm_rs::io::FileReader;
///
/// let reader = FileReader::open("large_file.txt").unwrap();
/// let content = reader.read_to_string().unwrap();
/// ```
#[derive(Debug)]
pub struct FileReader {
    /// Open, read-only file handle.
    file: File,
    /// File size in bytes, as reported by the file's metadata at open time.
    size: u64,
    /// File path (lossily converted to UTF-8), used in error messages.
    path: String,
}
43
44impl FileReader {
45    /// Opens a file for reading.
46    ///
47    /// # Arguments
48    ///
49    /// * `path` - Path to the file.
50    ///
51    /// # Errors
52    ///
53    /// Returns an error if the file doesn't exist or can't be opened.
54    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
55        let path_ref = path.as_ref();
56        let path_str = path_ref.to_string_lossy().to_string();
57
58        if !path_ref.exists() {
59            return Err(IoError::FileNotFound { path: path_str }.into());
60        }
61
62        let file = File::open(path_ref).map_err(|e| IoError::ReadFailed {
63            path: path_str.clone(),
64            reason: e.to_string(),
65        })?;
66
67        let metadata = file.metadata().map_err(|e| IoError::ReadFailed {
68            path: path_str.clone(),
69            reason: e.to_string(),
70        })?;
71
72        let size = metadata.len();
73
74        if size > MAX_FILE_SIZE {
75            return Err(IoError::ReadFailed {
76                path: path_str,
77                reason: format!("file too large: {size} bytes (max: {MAX_FILE_SIZE} bytes)"),
78            }
79            .into());
80        }
81
82        Ok(Self {
83            file,
84            size,
85            path: path_str,
86        })
87    }
88
89    /// Returns the file size in bytes.
90    #[must_use]
91    pub const fn size(&self) -> u64 {
92        self.size
93    }
94
95    /// Returns the file path.
96    #[must_use]
97    pub fn path(&self) -> &str {
98        &self.path
99    }
100
101    /// Reads the file content as a string.
102    ///
103    /// Uses memory mapping for large files.
104    ///
105    /// # Errors
106    ///
107    /// Returns an error if reading fails or content is not valid UTF-8.
108    pub fn read_to_string(&self) -> Result<String> {
109        if self.size >= MMAP_THRESHOLD {
110            self.read_mmap()
111        } else {
112            self.read_direct()
113        }
114    }
115
116    /// Reads the file content as bytes.
117    ///
118    /// # Errors
119    ///
120    /// Returns an error if reading fails.
121    pub fn read_to_bytes(&self) -> Result<Vec<u8>> {
122        if self.size >= MMAP_THRESHOLD {
123            self.read_mmap_bytes()
124        } else {
125            self.read_direct_bytes()
126        }
127    }
128
129    /// Reads using memory mapping.
130    fn read_mmap(&self) -> Result<String> {
131        let bytes = self.read_mmap_bytes()?;
132        String::from_utf8(bytes).map_err(|e| {
133            IoError::ReadFailed {
134                path: self.path.clone(),
135                reason: format!("invalid UTF-8: {e}"),
136            }
137            .into()
138        })
139    }
140
141    /// Reads bytes using memory mapping.
142    fn read_mmap_bytes(&self) -> Result<Vec<u8>> {
143        // Safety: We're only reading from the file, which is safe
144        let mmap = unsafe {
145            Mmap::map(&self.file).map_err(|e| IoError::MmapFailed {
146                path: self.path.clone(),
147                reason: e.to_string(),
148            })?
149        };
150
151        Ok(mmap.to_vec())
152    }
153
154    /// Reads directly into memory.
155    fn read_direct(&self) -> Result<String> {
156        let bytes = self.read_direct_bytes()?;
157        String::from_utf8(bytes).map_err(|e| {
158            IoError::ReadFailed {
159                path: self.path.clone(),
160                reason: format!("invalid UTF-8: {e}"),
161            }
162            .into()
163        })
164    }
165
166    /// Reads bytes directly into memory.
167    #[allow(clippy::cast_possible_truncation)]
168    fn read_direct_bytes(&self) -> Result<Vec<u8>> {
169        let mut file = &self.file;
170        let mut buffer = Vec::with_capacity(self.size as usize);
171        file.read_to_end(&mut buffer)
172            .map_err(|e| IoError::ReadFailed {
173                path: self.path.clone(),
174                reason: e.to_string(),
175            })?;
176        Ok(buffer)
177    }
178
179    /// Creates a memory-mapped view of the file.
180    ///
181    /// Useful when you need to access the file content multiple times
182    /// without copying.
183    ///
184    /// # Errors
185    ///
186    /// Returns an error if memory mapping fails.
187    pub fn mmap(&self) -> Result<Mmap> {
188        // Safety: We're only reading from the file
189        unsafe {
190            Mmap::map(&self.file).map_err(|e| {
191                IoError::MmapFailed {
192                    path: self.path.clone(),
193                    reason: e.to_string(),
194                }
195                .into()
196            })
197        }
198    }
199}
200
201/// Reads a file to string, automatically choosing the best method.
202///
203/// # Arguments
204///
205/// * `path` - Path to the file.
206///
207/// # Errors
208///
209/// Returns an error if the file cannot be read or is not valid UTF-8.
210///
211/// # Examples
212///
213/// ```no_run
214/// use rlm_rs::io::read_file;
215///
216/// let content = read_file("example.txt").unwrap();
217/// ```
218pub fn read_file<P: AsRef<Path>>(path: P) -> Result<String> {
219    FileReader::open(path)?.read_to_string()
220}
221
222/// Reads a file using memory mapping.
223///
224/// This is useful for very large files that shouldn't be fully loaded
225/// into memory.
226///
227/// # Arguments
228///
229/// * `path` - Path to the file.
230///
231/// # Errors
232///
233/// Returns an error if the file cannot be opened or memory mapping fails.
234pub fn read_file_mmap<P: AsRef<Path>>(path: P) -> Result<Mmap> {
235    FileReader::open(path)?.mmap()
236}
237
238/// Writes content to a file, creating parent directories if needed.
239///
240/// # Arguments
241///
242/// * `path` - Path to the file.
243/// * `content` - Content to write.
244///
245/// # Errors
246///
247/// Returns an error if directory creation or file writing fails.
248pub fn write_file<P: AsRef<Path>>(path: P, content: &str) -> Result<()> {
249    let path_ref = path.as_ref();
250    let path_str = path_ref.to_string_lossy().to_string();
251
252    // Create parent directories
253    if let Some(parent) = path_ref.parent()
254        && !parent.exists()
255    {
256        std::fs::create_dir_all(parent).map_err(|e| IoError::DirectoryFailed {
257            path: parent.to_string_lossy().to_string(),
258            reason: e.to_string(),
259        })?;
260    }
261
262    std::fs::write(path_ref, content).map_err(|e| IoError::WriteFailed {
263        path: path_str,
264        reason: e.to_string(),
265    })?;
266
267    Ok(())
268}
269
270/// Writes chunks to individual files in a directory.
271///
272/// # Arguments
273///
274/// * `out_dir` - Directory to write chunks to.
275/// * `chunks` - Iterator of (index, content) pairs.
276/// * `prefix` - Filename prefix (e.g., "chunk").
277///
278/// # Returns
279///
280/// Vector of paths to the written files.
281///
282/// # Errors
283///
284/// Returns an error if directory creation or file writing fails.
285pub fn write_chunks<'a, P, I>(out_dir: P, chunks: I, prefix: &str) -> Result<Vec<String>>
286where
287    P: AsRef<Path>,
288    I: Iterator<Item = (usize, &'a str)>,
289{
290    let out_path = out_dir.as_ref();
291    let out_str = out_path.to_string_lossy().to_string();
292
293    // Create output directory
294    if !out_path.exists() {
295        std::fs::create_dir_all(out_path).map_err(|e| IoError::DirectoryFailed {
296            path: out_str.clone(),
297            reason: e.to_string(),
298        })?;
299    }
300
301    let mut paths = Vec::new();
302
303    for (index, content) in chunks {
304        let filename = format!("{prefix}_{index:04}.txt");
305        let file_path = out_path.join(&filename);
306        let file_str = file_path.to_string_lossy().to_string();
307
308        std::fs::write(&file_path, content).map_err(|e| IoError::WriteFailed {
309            path: file_str.clone(),
310            reason: e.to_string(),
311        })?;
312
313        paths.push(file_str);
314    }
315
316    Ok(paths)
317}
318
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// Byte length of fixtures that must exceed `MMAP_THRESHOLD`.
    const LARGE_LEN: usize = 1024 * 1024 + 100;

    /// Creates a fresh temporary directory containing a file `name` with `content`.
    ///
    /// The `TempDir` guard is returned alongside the path so the caller can
    /// keep it alive for as long as the path is in use.
    fn fixture(name: &str, content: impl AsRef<[u8]>) -> (TempDir, PathBuf) {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join(name);
        std::fs::write(&path, content).unwrap();
        (dir, path)
    }

    #[test]
    fn test_read_small_file() {
        let (_dir, path) = fixture("small.txt", "Hello, world!");

        let content = read_file(&path).unwrap();
        assert_eq!(content, "Hello, world!");
    }

    #[test]
    fn test_read_nonexistent_file() {
        let result = read_file("/nonexistent/path/file.txt");
        assert!(result.is_err());
    }

    #[test]
    fn test_file_reader_size() {
        let (_dir, path) = fixture("test.txt", "Hello");

        let reader = FileReader::open(&path).unwrap();
        assert_eq!(reader.size(), 5);
    }

    #[test]
    fn test_file_reader_path() {
        let (_dir, path) = fixture("test.txt", "Hello");

        let reader = FileReader::open(&path).unwrap();
        assert!(reader.path().contains("test.txt"));
    }

    #[test]
    fn test_write_file() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("subdir/output.txt");

        write_file(&file_path, "Test content").unwrap();

        let content = std::fs::read_to_string(&file_path).unwrap();
        assert_eq!(content, "Test content");
    }

    #[test]
    fn test_write_file_existing_dir() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("output.txt");

        write_file(&file_path, "Test content").unwrap();

        let content = std::fs::read_to_string(&file_path).unwrap();
        assert_eq!(content, "Test content");
    }

    #[test]
    fn test_write_chunks() {
        let temp_dir = TempDir::new().unwrap();
        let out_dir = temp_dir.path().join("chunks");

        let chunks = vec![(0, "First chunk"), (1, "Second chunk")];
        let paths = write_chunks(&out_dir, chunks.into_iter(), "chunk").unwrap();

        assert_eq!(paths.len(), 2);

        let content0 = std::fs::read_to_string(&paths[0]).unwrap();
        let content1 = std::fs::read_to_string(&paths[1]).unwrap();
        assert_eq!(content0, "First chunk");
        assert_eq!(content1, "Second chunk");
    }

    #[test]
    fn test_write_chunks_existing_dir() {
        let temp_dir = TempDir::new().unwrap();
        let out_dir = temp_dir.path().join("existing");
        std::fs::create_dir_all(&out_dir).unwrap();

        let chunks = vec![(0, "Content")];
        let paths = write_chunks(&out_dir, chunks.into_iter(), "data").unwrap();

        assert_eq!(paths.len(), 1);
        assert!(paths[0].contains("data_0000.txt"));
    }

    #[test]
    fn test_read_utf8_file() {
        // Multi-byte content: CJK characters and an emoji outside the BMP.
        let text = "Hello, 世界! 🌍";
        let (_dir, path) = fixture("unicode.txt", text);

        let content = read_file(&path).unwrap();
        assert_eq!(content, text);
    }

    #[test]
    fn test_read_to_bytes() {
        let (_dir, path) = fixture("bytes.bin", b"binary\x00data");

        let reader = FileReader::open(&path).unwrap();
        let bytes = reader.read_to_bytes().unwrap();
        assert_eq!(bytes, b"binary\x00data");
    }

    #[test]
    fn test_read_file_mmap() {
        let (_dir, path) = fixture("mmap.txt", "Memory mapped content");

        let mmap = read_file_mmap(&path).unwrap();
        assert_eq!(&mmap[..], b"Memory mapped content");
    }

    #[test]
    fn test_file_reader_mmap() {
        let (_dir, path) = fixture("mmap.txt", "Test content for mmap");

        let reader = FileReader::open(&path).unwrap();
        let mmap = reader.mmap().unwrap();
        assert_eq!(&mmap[..], b"Test content for mmap");
    }

    #[test]
    fn test_read_empty_file() {
        let (_dir, path) = fixture("empty.txt", "");

        let content = read_file(&path).unwrap();
        assert!(content.is_empty());
    }

    #[test]
    fn test_read_large_file_mmap_path() {
        // 1.5MB of content, above MMAP_THRESHOLD, so the mmap path is taken.
        let large_content = "x".repeat(1024 * 1024 + 512 * 1024);
        let (_dir, path) = fixture("large.txt", &large_content);

        let reader = FileReader::open(&path).unwrap();
        assert!(reader.size() >= MMAP_THRESHOLD);

        // `assert!` rather than `assert_eq!` so a failure does not dump megabytes.
        let content = reader.read_to_string().unwrap();
        assert!(content == large_content, "string content mismatch");

        let bytes = FileReader::open(&path).unwrap().read_to_bytes().unwrap();
        assert!(bytes == large_content.as_bytes(), "byte content mismatch");
    }

    #[test]
    fn test_read_invalid_utf8() {
        // 0xff can never appear in well-formed UTF-8.
        let (_dir, path) = fixture("invalid.bin", [0xff, 0xfe, 0x00, 0x01]);

        let reader = FileReader::open(&path).unwrap();
        let result = reader.read_to_string();
        assert!(result.is_err());
    }

    #[test]
    fn test_read_invalid_utf8_via_mmap() {
        // Larger than MMAP_THRESHOLD, with an invalid byte at the start.
        let mut content = vec![b'x'; LARGE_LEN];
        content[0] = 0xff;
        let (_dir, path) = fixture("large_invalid.bin", &content);

        let reader = FileReader::open(&path).unwrap();
        assert!(reader.size() >= MMAP_THRESHOLD);
        let result = reader.read_to_string();
        assert!(result.is_err());
    }

    #[test]
    fn test_write_chunks_empty() {
        let temp_dir = TempDir::new().unwrap();
        let out_dir = temp_dir.path().join("empty_chunks");

        let chunks: Vec<(usize, &str)> = vec![];
        let paths = write_chunks(&out_dir, chunks.into_iter(), "chunk").unwrap();

        assert!(paths.is_empty());
    }

    #[test]
    fn test_file_reader_read_to_string_small() {
        // Direct read path (file smaller than MMAP_THRESHOLD).
        let content = "Small file content for direct read";
        let (_dir, path) = fixture("small_string.txt", content);

        let reader = FileReader::open(&path).unwrap();
        assert!(reader.size() < MMAP_THRESHOLD);
        let result = reader.read_to_string().unwrap();
        assert_eq!(result, content);
    }

    #[test]
    fn test_file_reader_read_to_bytes_small() {
        // Direct bytes read path (file smaller than MMAP_THRESHOLD).
        let content = b"Small binary content";
        let (_dir, path) = fixture("small_bytes.bin", content);

        let reader = FileReader::open(&path).unwrap();
        assert!(reader.size() < MMAP_THRESHOLD);
        let result = reader.read_to_bytes().unwrap();
        assert_eq!(result, content);
    }

    #[test]
    fn test_write_file_to_nested_dirs() {
        // `write_file` must create several levels of missing parent directories.
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("a/b/c/deep.txt");

        write_file(&file_path, "Deep content").unwrap();

        let content = std::fs::read_to_string(&file_path).unwrap();
        assert_eq!(content, "Deep content");
    }

    #[test]
    fn test_write_chunks_creates_directory() {
        // `write_chunks` must create the output directory when it is missing.
        let temp_dir = TempDir::new().unwrap();
        let out_dir = temp_dir.path().join("new_chunks_dir");

        assert!(!out_dir.exists());

        let chunks = vec![(0, "Chunk content")];
        let paths = write_chunks(&out_dir, chunks.into_iter(), "test").unwrap();

        assert!(out_dir.exists());
        assert_eq!(paths.len(), 1);
    }

    #[test]
    fn test_file_reader_read_to_bytes_binary() {
        let content = b"\x00\x01\x02\x03\x04";
        let (_dir, path) = fixture("bytes.bin", content);

        let reader = FileReader::open(&path).unwrap();
        let bytes = reader.read_to_bytes().unwrap();
        assert_eq!(bytes, content);
    }

    #[test]
    fn test_file_reader_read_to_bytes_large() {
        // Mmap path: larger than MMAP_THRESHOLD, cycling through every byte value.
        let large_content: Vec<u8> = (0..=255u8).cycle().take(LARGE_LEN).collect();
        let (_dir, path) = fixture("large_bytes.bin", &large_content);

        let reader = FileReader::open(&path).unwrap();
        assert!(reader.size() >= MMAP_THRESHOLD);
        let bytes = reader.read_to_bytes().unwrap();
        assert!(bytes == large_content, "byte content mismatch");
    }

    #[test]
    fn test_read_file_mmap_nonexistent() {
        let result = read_file_mmap("/nonexistent/path/file.txt");
        assert!(result.is_err());
    }

    #[test]
    fn test_file_reader_open_nonexistent() {
        let result = FileReader::open("/nonexistent/path/file.bin");
        assert!(result.is_err());
    }
}