1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
//! Uses [`Mmap`](memmap2) to map a file into memory with kernel support.
//!
//! Choose this implementation if:
//!
//! 1. Your platform supports memory maps.
//! 2. The input data is in a file or comes from standard input:
//!   a) if from a file, then you can guarantee that the file is not going to be modified
//!      in or out of process while the input is alive;
//!   b) if from stdin, then you can guarantee that the input lives in memory
//!      (for example, it comes via a pipe); input from a tty is not memory-mappable.
//!
//! ## Performance characteristics
//!
//! A memory map is by far the fastest way to process a file. For some queries it is faster
//! by an order of magnitude to execute the query on a memory map than it is to simply read the
//! file into main memory.

use super::{
    borrowed::BorrowedBytesBlockIterator,
    error::{Infallible, InputError},
    padding::PaddedBlock,
    Input, SliceSeekable, MAX_BLOCK_SIZE,
};
use crate::{input::padding::EndPaddedInput, query::JsonString, result::InputRecorder};
use memmap2::{Mmap, MmapAsRawDesc};

/// Input wrapping a memory mapped file.
///
/// The mapped bytes are split at `last_block_start`: `as_padded_input` and `iter_blocks`
/// pair the bytes before that offset with `last_block`, which is built once in `map_file`.
pub struct MmapInput {
    /// Memory map of the underlying file, as returned by `Mmap::map`.
    mmap: Mmap,
    /// Length of `mmap` rounded down to a multiple of `MAX_BLOCK_SIZE`.
    last_block_start: usize,
    /// Result of `PaddedBlock::pad_last_block` applied to the bytes of `mmap`
    /// from `last_block_start` to the end of the map.
    last_block: PaddedBlock,
}

impl MmapInput {
    /// Create an [`MmapInput`] by memory-mapping the file behind `file_desc`.
    ///
    /// The tail of the map past the last multiple of `MAX_BLOCK_SIZE` is handed
    /// to `PaddedBlock::pad_last_block` once, at construction time.
    ///
    /// # Safety
    ///
    /// Mapping a file is inherently unsafe, because the file may be modified
    /// in or out of process while the map is alive.
    /// See [Mmap documentation](https://docs.rs/memmap2/latest/memmap2/struct.Mmap.html).
    ///
    /// # Errors
    ///
    /// The underlying mmap call may fail with an IO error, which is returned
    /// converted into an [`InputError`].
    #[inline]
    pub unsafe fn map_file<D: MmapAsRawDesc>(file_desc: D) -> Result<Self, InputError> {
        // The caller's safety contract is forwarded to `Mmap::map` unchanged.
        let mmap = Mmap::map(file_desc)?;

        // Everything from this offset onwards does not fill a whole block.
        let last_block_start = (mmap.len() / MAX_BLOCK_SIZE) * MAX_BLOCK_SIZE;
        let tail = &mmap[last_block_start..];
        let last_block = PaddedBlock::pad_last_block(tail);

        Ok(Self {
            mmap,
            last_block_start,
            last_block,
        })
    }

    /// Build an [`EndPaddedInput`] from the mapped bytes before `last_block_start`
    /// and the separately stored last block.
    pub(super) fn as_padded_input(&self) -> EndPaddedInput {
        EndPaddedInput::new(&self.mmap[..self.last_block_start], &self.last_block)
    }
}

impl Input for MmapInput {
    type Error = Infallible;
    type Block<'a, const N: usize> = &'a [u8];

    type BlockIterator<'a, 'r, R, const N: usize> = BorrowedBytesBlockIterator<'r, EndPaddedInput<'a>, R, N>
    where
        R: InputRecorder<&'a [u8]> + 'r;

    #[inline(always)]
    fn leading_padding_len(&self) -> usize {
        // This input reports no leading padding.
        0
    }

    #[inline(always)]
    fn trailing_padding_len(&self) -> usize {
        // Trailing padding is whatever the last block reports.
        self.last_block.padding_len()
    }

    #[inline(always)]
    fn len_hint(&self) -> Option<usize> {
        // Number of whole blocks in the map, plus one more block for the tail.
        let full_blocks = self.mmap.len() / MAX_BLOCK_SIZE;
        Some((full_blocks + 1) * MAX_BLOCK_SIZE)
    }

    #[inline(always)]
    fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, R, N>
    where
        R: InputRecorder<&'a [u8]>,
    {
        // Iterate over the mapped bytes before the last block, paired with the last block itself.
        let middle = &self.mmap[..self.last_block_start];
        let padded_input = EndPaddedInput::new(middle, &self.last_block);

        BorrowedBytesBlockIterator::new(padded_input, recorder)
    }

    #[inline]
    fn seek_backward(&self, from: usize, needle: u8) -> Option<usize> {
        if from < self.last_block_start {
            // Start offset lies before the last block: search the map directly.
            self.mmap.seek_backward(from, needle)
        } else {
            // Start offset lies in the last block: go through the padded view.
            self.as_padded_input().seek_backward(from, needle)
        }
    }

    #[inline]
    fn seek_forward<const N: usize>(&self, from: usize, needles: [u8; N]) -> Result<Option<(usize, u8)>, Infallible> {
        // Fallback for when the search over the map finds nothing: scan the last block
        // from its beginning and shift the hit by the block's offset in the input.
        #[cold]
        #[inline(never)]
        fn scan_last_block<const N: usize>(
            last_block: &PaddedBlock,
            base: usize,
            needles: [u8; N],
        ) -> Option<(usize, u8)> {
            let (idx, byte) = last_block.bytes().seek_forward(0, needles)?;
            Some((idx + base, byte))
        }

        let found = if from < self.last_block_start {
            self.mmap
                .seek_forward(from, needles)
                .or_else(|| scan_last_block(&self.last_block, self.last_block_start, needles))
        } else {
            self.as_padded_input().seek_forward(from, needles)
        };

        Ok(found)
    }

    #[inline]
    fn seek_non_whitespace_forward(&self, from: usize) -> Result<Option<(usize, u8)>, Infallible> {
        // Fallback for when the search over the map finds nothing: scan the last block
        // from its beginning and shift the hit by the block's offset in the input.
        #[cold]
        #[inline(never)]
        fn scan_last_block(last_block: &PaddedBlock, base: usize) -> Option<(usize, u8)> {
            let (idx, byte) = last_block.bytes().seek_non_whitespace_forward(0)?;
            Some((idx + base, byte))
        }

        let found = if from < self.last_block_start {
            self.mmap
                .seek_non_whitespace_forward(from)
                .or_else(|| scan_last_block(&self.last_block, self.last_block_start))
        } else {
            self.as_padded_input().seek_non_whitespace_forward(from)
        };

        Ok(found)
    }

    #[inline]
    fn seek_non_whitespace_backward(&self, from: usize) -> Option<(usize, u8)> {
        if from < self.last_block_start {
            // Start offset lies before the last block: search the map directly.
            self.mmap.seek_non_whitespace_backward(from)
        } else {
            // Start offset lies in the last block: go through the padded view.
            self.as_padded_input().seek_non_whitespace_backward(from)
        }
    }

    #[inline]
    fn is_member_match(&self, from: usize, to: usize, member: &JsonString) -> Result<bool, Self::Error> {
        debug_assert!(from < to);

        // Hot path: the candidate range ends before the last block, so it can be
        // compared against the mapped bytes directly. Keep this branch as cheap as possible.
        if to < self.last_block_start {
            let bytes = &self.mmap;
            let candidate = &bytes[from..to];
            // A match does not count if the byte just before it is a backslash.
            // The preceding byte is only inspected after the comparison succeeds.
            let matches = member.bytes_with_quotes() == candidate && (from == 0 || bytes[from - 1] != b'\\');
            return Ok(matches);
        }

        // Slow path: the range reaches into the last block, so defer to the
        // `EndPaddedInput` implementation.
        Ok(self.as_padded_input().is_member_match(from, to, member))
    }
}