Skip to main content

pure_magic/
readers.rs

1//! Data readers for magic number detection.
2//!
3//! Provides efficient readers for different data sources with caching and buffering
4//! strategies optimized for file format identification.
5//!
6//! # Types
7//!
8//! - [`DataReader`] - A generic reader enum supporting slices, vectors, and files.
9//! - [`BufReader`] - A buffered reader for in-memory byte slices.
10//! - [`LazyCache`] - A lazy-loading cache reader for files with multi-tiered caching.
11//!
12//! # Traits
13//!
14//! - [`DataRead`] - Extended read operations for magic number detection.
15
16use std::{
17    fs::File,
18    io::{self, SeekFrom},
19    ops::Range,
20};
21
22mod cache;
23pub use cache::LazyCache;
24
25mod slice;
26pub use slice::BufReader;
27
28use crate::FILE_BYTES_MAX;
29
/// Random-access reading over a byte source with a tracked cursor.
///
/// Implementors supply the primitive operations (position, size, seeking,
/// range reads and the delimiter-based scans); the trait layers the
/// count-based and "exact" variants on top of them. It is intended for file
/// magic detection over both in-memory and file-backed data.
pub trait DataRead {
    /// Returns the current position in the data stream.
    fn stream_position(&self) -> u64;

    /// Resolves a [`SeekFrom`] into an absolute byte offset.
    ///
    /// `Start` is returned unchanged. `Current` and `End` are taken relative
    /// to [`stream_position`](Self::stream_position) and
    /// [`data_size`](Self::data_size) respectively, and the result saturates
    /// to the `0..=u64::MAX` range instead of wrapping.
    #[inline]
    fn offset_from_start(&self, pos: SeekFrom) -> u64 {
        let (base, delta) = match pos {
            SeekFrom::Start(s) => return s,
            SeekFrom::Current(p) => (self.stream_position(), p),
            SeekFrom::End(e) => (self.data_size(), e),
        };
        // i128 represents every `u64 + i64` sum exactly, so the clamp is the
        // only place where the value can be altered.
        (i128::from(base) + i128::from(delta)).clamp(0, i128::from(u64::MAX)) as u64
    }

    /// Reads a range of bytes from the data.
    ///
    /// Returns an empty slice if the range is beyond the end of data.
    fn read_range(&mut self, range: Range<u64>) -> Result<&[u8], io::Error>;

    /// Reads up to `count` bytes from the current position.
    ///
    /// Returns fewer bytes if the end of data is reached. The end of the
    /// requested range saturates at `u64::MAX`.
    #[inline]
    fn read_count(&mut self, count: u64) -> Result<&[u8], io::Error> {
        let start = self.stream_position();
        self.read_range(start..start.saturating_add(count))
    }

    /// Reads exactly the specified byte range.
    ///
    /// # Errors
    ///
    /// Returns [`io::ErrorKind::UnexpectedEof`] if the range extends beyond
    /// the available data, or if the range is inverted
    /// (`range.start > range.end`) and therefore can never be satisfied.
    /// Errors from [`read_range`](Self::read_range) are propagated.
    fn read_exact_range(&mut self, range: Range<u64>) -> Result<&[u8], io::Error> {
        // A plain `end - start` overflows for an inverted range: a panic in
        // overflow-checked builds, a bogus huge length otherwise.
        let expected = match range.end.checked_sub(range.start) {
            Some(len) => len,
            None => return Err(io::ErrorKind::UnexpectedEof.into()),
        };
        let bytes = self.read_range(range)?;
        if bytes.len() as u64 == expected {
            Ok(bytes)
        } else {
            Err(io::ErrorKind::UnexpectedEof.into())
        }
    }

    /// Reads exactly `count` bytes from the current position.
    ///
    /// # Errors
    ///
    /// Returns [`io::ErrorKind::UnexpectedEof`] if fewer than `count` bytes
    /// are available. Errors from [`read_count`](Self::read_count) are
    /// propagated.
    fn read_exact_count(&mut self, count: u64) -> Result<&[u8], io::Error> {
        let bytes = self.read_count(count)?;
        // Compare in u64: narrowing `count` to usize could truncate on
        // targets where usize is smaller than 64 bits.
        debug_assert!(bytes.len() as u64 <= count);
        if bytes.len() as u64 == count {
            Ok(bytes)
        } else {
            Err(io::ErrorKind::UnexpectedEof.into())
        }
    }

    /// Reads exactly enough bytes to fill `buf`.
    ///
    /// # Errors
    ///
    /// Returns an error if fewer than `buf.len()` bytes are available.
    fn read_exact_into(&mut self, buf: &mut [u8]) -> Result<(), io::Error> {
        let bytes = self.read_exact_count(buf.len() as u64)?;
        // `copy_from_slice` panics on a length mismatch, which cannot happen
        // here: `read_exact_count` only succeeds with exactly `buf.len()` bytes.
        buf.copy_from_slice(bytes);
        Ok(())
    }

    /// Reads bytes until any of the delimiters or `limit` bytes is reached.
    ///
    /// The delimiter byte is included in the returned slice.
    fn read_until_any_delim_or_limit(
        &mut self,
        delims: &[u8],
        limit: u64,
    ) -> Result<&[u8], io::Error>;

    /// Reads bytes until `byte` or `limit` bytes is reached.
    ///
    /// The delimiter byte is included in the returned slice.
    fn read_until_or_limit(&mut self, byte: u8, limit: u64) -> Result<&[u8], io::Error>;

    /// Reads bytes while `f` returns `true` or until `limit` bytes is reached.
    ///
    /// The byte that caused `f` to return `false` is not included.
    fn read_while_or_limit<F>(&mut self, f: F, limit: u64) -> Result<&[u8], io::Error>
    where
        F: Fn(u8) -> bool;

    /// Reads bytes until a UTF-16 character or `limit` bytes is reached.
    ///
    /// The UTF-16 character is included in the returned slice.
    fn read_until_utf16_or_limit(
        &mut self,
        utf16_char: &[u8; 2],
        limit: u64,
    ) -> Result<&[u8], io::Error>;

    /// Returns the total size of the data in bytes.
    fn data_size(&self) -> u64;

    /// Sets the position for future reads.
    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64>;
}
146
147/// A generic reader for data backed by different sources.
148///
149/// Provides a uniform interface for reading from in-memory buffers or files.
150pub enum DataReader<'b> {
151    /// A reader backed by a borrowed byte slice.
152    ///
153    /// Useful for zero-copy reads from existing in-memory data.
154    Slice(BufReader<&'b [u8]>),
155    /// A reader backed by an owned byte vector.
156    ///
157    /// Useful when the data needs to be owned.
158    Vec(BufReader<Vec<u8>>),
159    /// A reader backed by a file with lazy caching.
160    ///
161    /// Uses [`LazyCache`] for efficient disk I/O.
162    File(LazyCache<File>),
163}
164
165impl DataReader<'_> {
166    /// Creates a new `DataReader` backed by a file with lazy caching.
167    ///
168    /// The file is wrapped in a [`LazyCache`] with:
169    /// - A hot cache of 14 MiB (2 × [`FILE_BYTES_MAX`])
170    /// - A warm cache of 100 MiB
171    ///
172    /// This configuration is optimized for file-based magic number detection,
173    /// balancing memory usage with I/O efficiency.
174    ///
175    /// # Errors
176    ///
177    /// Returns an error if the file cannot be read or if cache initialization fails.
178    pub fn from_file(r: File) -> Result<Self, io::Error> {
179        let x = LazyCache::<File>::from_read_seek(r)
180            .and_then(|lc| lc.with_hot_cache(2 * FILE_BYTES_MAX))
181            .map(|lc| lc.with_warm_cache(100 << 20))?;
182        Ok(Self::File(x))
183    }
184}
185
186impl<'b> DataReader<'b> {
187    /// Creates a new `DataReader` backed by a borrowed byte slice.
188    ///
189    /// This is a zero-copy constructor that wraps the slice in a [`BufReader`].
190    /// The lifetime of the returned reader is tied to the input slice.
191    ///
192    /// # Examples
193    ///
194    /// ```
195    /// use pure_magic::readers::{DataReader, DataRead};
196    ///
197    /// let data = b"hello world";
198    /// let reader = DataReader::from_slice(data);
199    /// assert_eq!(reader.data_size(), data.len() as u64);
200    /// ```
201    pub fn from_slice(s: &'b [u8]) -> Self {
202        Self::Slice(BufReader::from_slice(s))
203    }
204}
205
206impl DataReader<'_> {
207    /// Creates a new `DataReader` backed by an owned byte vector.
208    ///
209    /// The vector is wrapped in a [`BufReader`], allowing the data to be owned
210    /// independently of any borrow.
211    ///
212    /// # Examples
213    ///
214    /// ```
215    /// use pure_magic::readers::{DataReader, DataRead};
216    ///
217    /// let data = vec![1u8, 2, 3, 4, 5];
218    /// let reader = DataReader::from_vec(data);
219    /// assert_eq!(reader.data_size(), 5);
220    /// ```
221    pub fn from_vec(v: Vec<u8>) -> Self {
222        Self::Vec(BufReader::from_slice(v))
223    }
224}
225
226impl DataRead for DataReader<'_> {
227    fn stream_position(&self) -> u64 {
228        match self {
229            DataReader::Slice(b) => b.stream_position(),
230            DataReader::Vec(v) => v.stream_position(),
231            DataReader::File(f) => f.stream_position(),
232        }
233    }
234
235    fn offset_from_start(&self, pos: SeekFrom) -> u64 {
236        match self {
237            DataReader::Slice(b) => b.offset_from_start(pos),
238            DataReader::Vec(v) => v.offset_from_start(pos),
239            DataReader::File(f) => f.offset_from_start(pos),
240        }
241    }
242
243    fn read_range(&mut self, range: Range<u64>) -> Result<&[u8], io::Error> {
244        match self {
245            DataReader::Slice(b) => b.read_range(range),
246            DataReader::Vec(v) => v.read_range(range),
247            DataReader::File(f) => f.read_range(range),
248        }
249    }
250
251    fn read_count(&mut self, count: u64) -> Result<&[u8], io::Error> {
252        match self {
253            DataReader::Slice(b) => b.read_count(count),
254            DataReader::Vec(v) => v.read_count(count),
255            DataReader::File(f) => f.read_count(count),
256        }
257    }
258
259    fn read_exact_range(&mut self, range: Range<u64>) -> Result<&[u8], io::Error> {
260        match self {
261            DataReader::Slice(b) => b.read_exact_range(range),
262            DataReader::Vec(v) => v.read_exact_range(range),
263            DataReader::File(f) => f.read_exact_range(range),
264        }
265    }
266
267    fn read_exact_count(&mut self, count: u64) -> Result<&[u8], io::Error> {
268        match self {
269            DataReader::Slice(b) => b.read_exact_count(count),
270            DataReader::Vec(v) => v.read_exact_count(count),
271            DataReader::File(f) => f.read_exact_count(count),
272        }
273    }
274
275    fn read_exact_into(&mut self, buf: &mut [u8]) -> Result<(), io::Error> {
276        match self {
277            DataReader::Slice(b) => b.read_exact_into(buf),
278            DataReader::Vec(v) => v.read_exact_into(buf),
279            DataReader::File(f) => f.read_exact_into(buf),
280        }
281    }
282
283    fn read_until_any_delim_or_limit(
284        &mut self,
285        delims: &[u8],
286        limit: u64,
287    ) -> Result<&[u8], io::Error> {
288        match self {
289            DataReader::Slice(b) => b.read_until_any_delim_or_limit(delims, limit),
290            DataReader::Vec(v) => v.read_until_any_delim_or_limit(delims, limit),
291            DataReader::File(f) => f.read_until_any_delim_or_limit(delims, limit),
292        }
293    }
294
295    fn read_until_or_limit(&mut self, byte: u8, limit: u64) -> Result<&[u8], io::Error> {
296        match self {
297            DataReader::Slice(b) => b.read_until_or_limit(byte, limit),
298            DataReader::Vec(v) => v.read_until_or_limit(byte, limit),
299            DataReader::File(f) => f.read_until_or_limit(byte, limit),
300        }
301    }
302
303    fn read_while_or_limit<F>(&mut self, f: F, limit: u64) -> Result<&[u8], io::Error>
304    where
305        F: Fn(u8) -> bool,
306    {
307        match self {
308            DataReader::Slice(b) => b.read_while_or_limit(f, limit),
309            DataReader::Vec(v) => v.read_while_or_limit(f, limit),
310            DataReader::File(l) => l.read_while_or_limit(f, limit),
311        }
312    }
313
314    fn read_until_utf16_or_limit(
315        &mut self,
316        utf16_char: &[u8; 2],
317        limit: u64,
318    ) -> Result<&[u8], io::Error> {
319        match self {
320            DataReader::Slice(b) => b.read_until_utf16_or_limit(utf16_char, limit),
321            DataReader::Vec(v) => v.read_until_utf16_or_limit(utf16_char, limit),
322            DataReader::File(f) => f.read_until_utf16_or_limit(utf16_char, limit),
323        }
324    }
325
326    fn data_size(&self) -> u64 {
327        match self {
328            DataReader::Slice(b) => b.data_size(),
329            DataReader::Vec(v) => v.data_size(),
330            DataReader::File(f) => f.data_size(),
331        }
332    }
333
334    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
335        match self {
336            DataReader::Slice(b) => b.seek(pos),
337            DataReader::Vec(v) => v.seek(pos),
338            DataReader::File(f) => f.seek(pos),
339        }
340    }
341}