rsonpath/input/
mmap.rs

1//! Uses [`Mmap`](memmap2) to map a file into memory with kernel support.
2//!
3//! Choose this implementation if:
4//!
5//! 1. Your platform supports memory maps.
6//! 2. The input data is in a file or comes from standard input:
7//!    a) if from a file, then you can guarantee that the file is not going to be modified
8//!       in or out of process while the input is alive;
//!    b) if from stdin, then you can guarantee that the input lives in memory
//!       (for example, it comes via a pipe); input from a tty is not memory-mappable.
11//!
12//! ## Performance characteristics
13//!
14//! A memory map is by far the fastest way to process a file. For some queries it is faster
15//! by an order of magnitude to execute the query on a memory map than it is to simply read the
16//! file into main memory.
17
18use super::{
19    borrowed::BorrowedBytesBlockIterator,
20    error::{Infallible, InputError},
21    padding::PaddedBlock,
22    Input, SliceSeekable, MAX_BLOCK_SIZE,
23};
24use crate::{input::padding::EndPaddedInput, result::InputRecorder, string_pattern::StringPattern};
25use memmap2::{Mmap, MmapAsRawDesc};
26
/// Input wrapping a memory mapped file.
pub struct MmapInput {
    /// The memory map the input bytes are read from.
    mmap: Mmap,
    /// Length of `mmap` rounded down to a multiple of `MAX_BLOCK_SIZE`;
    /// the bytes of `mmap` from this offset onwards are the source of `last_block`.
    last_block_start: usize,
    /// Result of `PaddedBlock::pad_last_block` applied to `mmap[last_block_start..]`.
    last_block: PaddedBlock,
}
33
34impl MmapInput {
35    /// Map a file to memory.
36    ///
37    /// # Safety
38    ///
39    /// This operation is inherently unsafe, since the file can be modified
40    /// in or out of process. See [Mmap documentation](https://docs.rs/memmap2/latest/memmap2/struct.Mmap.html).
41    ///
42    /// # Errors
43    ///
44    /// Calling mmap might result in an IO error.
45    #[inline]
46    pub unsafe fn map_file<D: MmapAsRawDesc>(file_desc: D) -> Result<Self, InputError> {
47        match Mmap::map(file_desc) {
48            Ok(mmap) => {
49                let last_block_start = (mmap.len() / MAX_BLOCK_SIZE) * MAX_BLOCK_SIZE;
50                let last_block = PaddedBlock::pad_last_block(&mmap[last_block_start..]);
51                Ok(Self {
52                    mmap,
53                    last_block_start,
54                    last_block,
55                })
56            }
57            Err(err) => Err(err.into()),
58        }
59    }
60
61    pub(super) fn as_padded_input(&self) -> EndPaddedInput {
62        let middle = &self.mmap.as_ref()[..self.last_block_start];
63        EndPaddedInput::new(middle, &self.last_block)
64    }
65}
66
67impl Input for MmapInput {
68    type BlockIterator<'a, 'r, R, const N: usize>
69        = BorrowedBytesBlockIterator<'r, EndPaddedInput<'a>, R, N>
70    where
71        R: InputRecorder<&'a [u8]> + 'r;
72
73    type Error = Infallible;
74    type Block<'a, const N: usize> = &'a [u8];
75
76    #[inline(always)]
77    fn leading_padding_len(&self) -> usize {
78        0
79    }
80
81    #[inline(always)]
82    fn trailing_padding_len(&self) -> usize {
83        self.last_block.padding_len()
84    }
85
86    #[inline(always)]
87    fn len_hint(&self) -> Option<usize> {
88        Some((self.mmap.len() / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE)
89    }
90
91    #[inline(always)]
92    fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, R, N>
93    where
94        R: InputRecorder<&'a [u8]>,
95    {
96        let padded_input = EndPaddedInput::new(&self.mmap[..self.last_block_start], &self.last_block);
97
98        BorrowedBytesBlockIterator::new(padded_input, recorder)
99    }
100
101    #[inline]
102    fn seek_backward(&self, from: usize, needle: u8) -> Option<usize> {
103        if from < self.last_block_start {
104            self.mmap.seek_backward(from, needle)
105        } else {
106            self.as_padded_input().seek_backward(from, needle)
107        }
108    }
109
110    #[inline]
111    fn seek_forward<const N: usize>(&self, from: usize, needles: [u8; N]) -> Result<Option<(usize, u8)>, Infallible> {
112        return Ok(if from < self.last_block_start {
113            self.mmap
114                .seek_forward(from, needles)
115                .or_else(|| handle_last(&self.last_block, self.last_block_start, needles))
116        } else {
117            self.as_padded_input().seek_forward(from, needles)
118        });
119
120        #[cold]
121        #[inline(never)]
122        fn handle_last<const N: usize>(
123            last_block: &PaddedBlock,
124            offset: usize,
125            needles: [u8; N],
126        ) -> Option<(usize, u8)> {
127            last_block
128                .bytes()
129                .seek_forward(0, needles)
130                .map(|(x, y)| (x + offset, y))
131        }
132    }
133
134    #[inline]
135    fn seek_non_whitespace_forward(&self, from: usize) -> Result<Option<(usize, u8)>, Infallible> {
136        return Ok(if from < self.last_block_start {
137            self.mmap
138                .seek_non_whitespace_forward(from)
139                .or_else(|| handle_last(&self.last_block, self.last_block_start))
140        } else {
141            self.as_padded_input().seek_non_whitespace_forward(from)
142        });
143
144        #[cold]
145        #[inline(never)]
146        fn handle_last(last_block: &PaddedBlock, offset: usize) -> Option<(usize, u8)> {
147            last_block
148                .bytes()
149                .seek_non_whitespace_forward(0)
150                .map(|(x, y)| (x + offset, y))
151        }
152    }
153
154    #[inline]
155    fn seek_non_whitespace_backward(&self, from: usize) -> Option<(usize, u8)> {
156        if from < self.last_block_start {
157            self.mmap.seek_non_whitespace_backward(from)
158        } else {
159            self.as_padded_input().seek_non_whitespace_backward(from)
160        }
161    }
162
163    #[inline]
164    fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result<bool, Self::Error> {
165        debug_assert!(from < to);
166        // The hot path is when we're checking fully within the middle section.
167        // This has to be as fast as possible, so the "cold" path referring to the TwoSidesPaddedInput
168        // impl is explicitly marked with #[cold].
169        if to < self.last_block_start {
170            // This is the hot path -- do the bounds check and memcmp.
171            let bytes = &self.mmap;
172            let slice = &bytes[from..to];
173            Ok(member.quoted() == slice && (from == 0 || bytes[from - 1] != b'\\'))
174        } else {
175            // This is a very expensive, cold path.
176            Ok(self.as_padded_input().is_member_match(from, to, member))
177        }
178    }
179}