rust_expect/expect/
large_buffer.rs

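//! Large buffer support using file-backed storage.
//!
//! This module provides a buffer implementation for handling very large
//! outputs (>10 MB) by keeping them in a temporary file instead of in memory.
//! The implementation in this file accesses that file with ordinary
//! seek/read/write calls; it does not memory-map the file, even though some
//! names (such as `MMAP_THRESHOLD`) still refer to memory mapping.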
5
6use std::fs::{File, OpenOptions};
7use std::io::{self, Read, Seek, SeekFrom, Write};
8use std::path::{Path, PathBuf};
9
/// Default size, in bytes, above which buffered data should move to
/// file-backed storage: 10 MiB (10 × 2^20 bytes).
pub const MMAP_THRESHOLD: usize = 10 << 20;
12
13/// A large buffer backed by a temporary file.
14///
15/// This buffer uses a file to store data, with optional memory mapping
16/// for efficient access to large datasets.
17pub struct LargeBuffer {
18    /// The backing file.
19    file: File,
20    /// Path to the temporary file.
21    path: PathBuf,
22    /// Current size of the buffer.
23    size: usize,
24    /// Whether to delete the file on drop.
25    cleanup: bool,
26    /// Read position for streaming.
27    read_pos: usize,
28}
29
30impl LargeBuffer {
31    /// Create a new large buffer with a temporary file.
32    ///
33    /// # Errors
34    ///
35    /// Returns an error if the temporary file cannot be created.
36    pub fn new() -> io::Result<Self> {
37        let path = std::env::temp_dir().join(format!(
38            "rust_expect_buffer_{}_{}",
39            std::process::id(),
40            std::time::SystemTime::now()
41                .duration_since(std::time::UNIX_EPOCH)
42                .map(|d| d.as_nanos())
43                .unwrap_or(0)
44        ));
45        Self::with_path(&path)
46    }
47
48    /// Create a new large buffer at the specified path.
49    ///
50    /// # Errors
51    ///
52    /// Returns an error if the file cannot be created.
53    pub fn with_path(path: &Path) -> io::Result<Self> {
54        let file = OpenOptions::new()
55            .read(true)
56            .write(true)
57            .create(true)
58            .truncate(true)
59            .open(path)?;
60
61        Ok(Self {
62            file,
63            path: path.to_path_buf(),
64            size: 0,
65            cleanup: true,
66            read_pos: 0,
67        })
68    }
69
70    /// Set whether to delete the file on drop.
71    pub const fn set_cleanup(&mut self, cleanup: bool) {
72        self.cleanup = cleanup;
73    }
74
75    /// Get the path to the backing file.
76    #[must_use]
77    pub fn path(&self) -> &Path {
78        &self.path
79    }
80
81    /// Append data to the buffer.
82    ///
83    /// # Errors
84    ///
85    /// Returns an error if the write fails.
86    pub fn append(&mut self, data: &[u8]) -> io::Result<()> {
87        self.file.seek(SeekFrom::End(0))?;
88        self.file.write_all(data)?;
89        self.size += data.len();
90        Ok(())
91    }
92
93    /// Get the current size of the buffer.
94    #[must_use]
95    pub const fn len(&self) -> usize {
96        self.size
97    }
98
99    /// Check if the buffer is empty.
100    #[must_use]
101    pub const fn is_empty(&self) -> bool {
102        self.size == 0
103    }
104
105    /// Read a range of bytes from the buffer.
106    ///
107    /// # Errors
108    ///
109    /// Returns an error if the read fails.
110    pub fn read_range(&mut self, start: usize, len: usize) -> io::Result<Vec<u8>> {
111        if start >= self.size {
112            return Ok(Vec::new());
113        }
114
115        let actual_len = len.min(self.size - start);
116        let mut buf = vec![0u8; actual_len];
117
118        self.file.seek(SeekFrom::Start(start as u64))?;
119        self.file.read_exact(&mut buf)?;
120
121        Ok(buf)
122    }
123
124    /// Read all data from the buffer.
125    ///
126    /// # Errors
127    ///
128    /// Returns an error if the read fails.
129    pub fn read_all(&mut self) -> io::Result<Vec<u8>> {
130        self.read_range(0, self.size)
131    }
132
133    /// Read the last N bytes from the buffer.
134    ///
135    /// # Errors
136    ///
137    /// Returns an error if the read fails.
138    pub fn tail(&mut self, n: usize) -> io::Result<Vec<u8>> {
139        let start = self.size.saturating_sub(n);
140        self.read_range(start, n)
141    }
142
143    /// Read the first N bytes from the buffer.
144    ///
145    /// # Errors
146    ///
147    /// Returns an error if the read fails.
148    pub fn head(&mut self, n: usize) -> io::Result<Vec<u8>> {
149        self.read_range(0, n)
150    }
151
152    /// Clear the buffer.
153    ///
154    /// # Errors
155    ///
156    /// Returns an error if the truncation fails.
157    pub fn clear(&mut self) -> io::Result<()> {
158        self.file.set_len(0)?;
159        self.size = 0;
160        self.read_pos = 0;
161        Ok(())
162    }
163
164    /// Find a byte sequence in the buffer.
165    ///
166    /// This performs a linear search through the file.
167    ///
168    /// # Errors
169    ///
170    /// Returns an error if reading fails.
171    pub fn find(&mut self, needle: &[u8]) -> io::Result<Option<usize>> {
172        // Read in chunks for efficiency
173        const CHUNK_SIZE: usize = 64 * 1024;
174
175        if needle.is_empty() {
176            return Ok(Some(0));
177        }
178        if needle.len() > self.size {
179            return Ok(None);
180        }
181
182        let mut pos = 0;
183        let mut overlap = Vec::new();
184
185        self.file.seek(SeekFrom::Start(0))?;
186
187        while pos < self.size {
188            let read_size = CHUNK_SIZE.min(self.size - pos);
189            let mut chunk = vec![0u8; read_size];
190            self.file.read_exact(&mut chunk)?;
191
192            // Prepend overlap from previous chunk
193            let search_data = if overlap.is_empty() {
194                chunk.clone()
195            } else {
196                let mut combined = overlap.clone();
197                combined.extend(&chunk);
198                combined
199            };
200
201            // Search in combined data
202            if let Some(idx) = find_subsequence(&search_data, needle) {
203                let actual_pos = if overlap.is_empty() {
204                    pos + idx
205                } else {
206                    pos - overlap.len() + idx
207                };
208                return Ok(Some(actual_pos));
209            }
210
211            // Keep overlap for next iteration (to handle matches across chunks)
212            overlap = if chunk.len() >= needle.len() - 1 {
213                chunk[chunk.len() - (needle.len() - 1)..].to_vec()
214            } else {
215                chunk
216            };
217
218            pos += read_size;
219        }
220
221        Ok(None)
222    }
223
224    /// Find a string in the buffer.
225    ///
226    /// # Errors
227    ///
228    /// Returns an error if reading fails.
229    pub fn find_str(&mut self, needle: &str) -> io::Result<Option<usize>> {
230        self.find(needle.as_bytes())
231    }
232
233    /// Read data as a string (lossy UTF-8 conversion).
234    ///
235    /// # Errors
236    ///
237    /// Returns an error if reading fails.
238    pub fn as_str_lossy(&mut self) -> io::Result<String> {
239        let data = self.read_all()?;
240        Ok(String::from_utf8_lossy(&data).into_owned())
241    }
242
243    /// Consume data from the beginning of the buffer.
244    ///
245    /// This is expensive for large buffers as it requires rewriting the file.
246    ///
247    /// # Errors
248    ///
249    /// Returns an error if the operation fails.
250    pub fn consume(&mut self, len: usize) -> io::Result<Vec<u8>> {
251        if len == 0 {
252            return Ok(Vec::new());
253        }
254
255        let consume_len = len.min(self.size);
256
257        // Read the data to consume
258        let consumed = self.read_range(0, consume_len)?;
259
260        // Read remaining data
261        let remaining = self.read_range(consume_len, self.size - consume_len)?;
262
263        // Rewrite the file with remaining data
264        self.file.seek(SeekFrom::Start(0))?;
265        self.file.set_len(0)?;
266        self.file.write_all(&remaining)?;
267        self.size = remaining.len();
268
269        Ok(consumed)
270    }
271
272    /// Sync the buffer to disk.
273    ///
274    /// # Errors
275    ///
276    /// Returns an error if sync fails.
277    pub fn sync(&self) -> io::Result<()> {
278        self.file.sync_all()
279    }
280}
281
282impl Drop for LargeBuffer {
283    fn drop(&mut self) {
284        if self.cleanup {
285            let _ = std::fs::remove_file(&self.path);
286        }
287    }
288}
289
290impl std::fmt::Debug for LargeBuffer {
291    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
292        f.debug_struct("LargeBuffer")
293            .field("path", &self.path)
294            .field("size", &self.size)
295            .field("cleanup", &self.cleanup)
296            .finish()
297    }
298}
299
/// Find a subsequence in a slice.
///
/// Returns the index of the first occurrence of `needle` in `haystack`, or
/// `None` if it does not occur. An empty `needle` matches at index 0.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        // `slice::windows` panics for a window size of 0, so the empty
        // needle is answered directly.
        return Some(0);
    }

    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
306
307/// Adaptive buffer that switches between in-memory and file-backed storage.
308pub enum AdaptiveBuffer {
309    /// In-memory buffer for small data.
310    Memory(Vec<u8>),
311    /// File-backed buffer for large data.
312    File(LargeBuffer),
313}
314
315impl AdaptiveBuffer {
316    /// Create a new adaptive buffer.
317    #[must_use]
318    pub const fn new() -> Self {
319        Self::Memory(Vec::new())
320    }
321
322    /// Create a new adaptive buffer with a custom threshold.
323    #[must_use]
324    pub const fn with_threshold(_threshold: usize) -> Self {
325        // Threshold stored elsewhere, just create memory buffer
326        Self::Memory(Vec::new())
327    }
328
329    /// Append data to the buffer.
330    ///
331    /// # Errors
332    ///
333    /// Returns an error if file operations fail when using file-backed storage.
334    pub fn append(&mut self, data: &[u8], threshold: usize) -> io::Result<()> {
335        match self {
336            Self::Memory(buf) => {
337                if buf.len() + data.len() > threshold {
338                    // Switch to file-backed storage
339                    let mut large = LargeBuffer::new()?;
340                    large.append(buf)?;
341                    large.append(data)?;
342                    *self = Self::File(large);
343                } else {
344                    buf.extend_from_slice(data);
345                }
346            }
347            Self::File(large) => {
348                large.append(data)?;
349            }
350        }
351        Ok(())
352    }
353
354    /// Get the current size of the buffer.
355    #[must_use]
356    pub const fn len(&self) -> usize {
357        match self {
358            Self::Memory(buf) => buf.len(),
359            Self::File(large) => large.len(),
360        }
361    }
362
363    /// Check if the buffer is empty.
364    #[must_use]
365    pub const fn is_empty(&self) -> bool {
366        self.len() == 0
367    }
368
369    /// Check if the buffer is using file-backed storage.
370    #[must_use]
371    pub const fn is_file_backed(&self) -> bool {
372        matches!(self, Self::File(_))
373    }
374
375    /// Read all data from the buffer.
376    ///
377    /// # Errors
378    ///
379    /// Returns an error if file operations fail.
380    pub fn read_all(&mut self) -> io::Result<Vec<u8>> {
381        match self {
382            Self::Memory(buf) => Ok(buf.clone()),
383            Self::File(large) => large.read_all(),
384        }
385    }
386
387    /// Read the last N bytes.
388    ///
389    /// # Errors
390    ///
391    /// Returns an error if file operations fail.
392    pub fn tail(&mut self, n: usize) -> io::Result<Vec<u8>> {
393        match self {
394            Self::Memory(buf) => {
395                let start = buf.len().saturating_sub(n);
396                Ok(buf[start..].to_vec())
397            }
398            Self::File(large) => large.tail(n),
399        }
400    }
401
402    /// Clear the buffer.
403    ///
404    /// # Errors
405    ///
406    /// Returns an error if file operations fail.
407    pub fn clear(&mut self) -> io::Result<()> {
408        match self {
409            Self::Memory(buf) => {
410                buf.clear();
411                Ok(())
412            }
413            Self::File(large) => large.clear(),
414        }
415    }
416
417    /// Get the data as a string (lossy UTF-8).
418    ///
419    /// # Errors
420    ///
421    /// Returns an error if file operations fail.
422    pub fn as_str_lossy(&mut self) -> io::Result<String> {
423        match self {
424            Self::Memory(buf) => Ok(String::from_utf8_lossy(buf).into_owned()),
425            Self::File(large) => large.as_str_lossy(),
426        }
427    }
428}
429
430impl Default for AdaptiveBuffer {
431    fn default() -> Self {
432        Self::new()
433    }
434}
435
436impl std::fmt::Debug for AdaptiveBuffer {
437    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
438        match self {
439            Self::Memory(buf) => f.debug_tuple("Memory").field(&buf.len()).finish(),
440            Self::File(large) => f.debug_tuple("File").field(large).finish(),
441        }
442    }
443}
444
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a file-backed buffer that already contains `contents`.
    fn filled(contents: &[u8]) -> LargeBuffer {
        let mut buffer = LargeBuffer::new().unwrap();
        buffer.append(contents).unwrap();
        buffer
    }

    // Appended bytes are counted and can be read back unchanged.
    #[test]
    fn large_buffer_basic() {
        let mut buffer = filled(b"hello world");

        assert_eq!(buffer.len(), 11);
        assert_eq!(buffer.read_all().unwrap(), b"hello world");
    }

    // `find` reports the offset of a present needle and `None` otherwise.
    #[test]
    fn large_buffer_find() {
        let mut buffer = filled(b"the quick brown fox");

        let hit = buffer.find(b"quick").unwrap();
        let miss = buffer.find(b"lazy").unwrap();

        assert_eq!(hit, Some(4));
        assert_eq!(miss, None);
    }

    // `tail` returns the trailing bytes.
    #[test]
    fn large_buffer_tail() {
        let mut buffer = filled(b"hello world");

        assert_eq!(buffer.tail(5).unwrap(), b"world");
    }

    // `consume` returns the prefix and leaves the rest in the buffer.
    #[test]
    fn large_buffer_consume() {
        let mut buffer = filled(b"hello world");

        let prefix = buffer.consume(6).unwrap();

        assert_eq!(prefix, b"hello ");
        assert_eq!(buffer.read_all().unwrap(), b"world");
    }

    // Data below the threshold stays in memory.
    #[test]
    fn adaptive_buffer_stays_memory() {
        let mut adaptive = AdaptiveBuffer::new();
        adaptive.append(b"small data", MMAP_THRESHOLD).unwrap();

        assert!(!adaptive.is_file_backed());
    }

    // Data above the threshold moves the buffer to file-backed storage.
    #[test]
    fn adaptive_buffer_switches_to_file() {
        let limit = 100;
        let payload = vec![b'x'; 150];

        let mut adaptive = AdaptiveBuffer::new();
        adaptive.append(&payload, limit).unwrap();

        assert!(adaptive.is_file_backed());
        assert_eq!(adaptive.len(), 150);
    }
}