bvr_core/buf/
segment.rs

1use crate::Result;
2use memmap2::{Mmap, MmapMut};
3use std::{borrow::Cow, ops::Range, ptr::NonNull, sync::Arc};
4
5#[cfg(unix)]
6pub(crate) use std::os::fd::AsRawFd as Mmappable;
7#[cfg(windows)]
8pub(crate) use std::os::windows::io::AsRawHandle as Mmappable;
9
/// A chunk of buffer data paired with the absolute byte range it covers.
///
/// `Buf` is the backing storage; [`Segment`] and [`SegmentMut`] are the
/// memory-mapped instantiations.
pub struct SegmentRaw<Buf> {
    /// Backing storage holding this segment's bytes.
    data: Buf,
    /// Absolute byte range covered by this segment.
    range: Range<u64>,
}
14
15pub type SegmentMut = SegmentRaw<MmapMut>;
16pub type Segment = SegmentRaw<Mmap>;
17
18impl<Buf> SegmentRaw<Buf>
19where
20    Buf: AsRef<[u8]>,
21{
22    pub const MAX_SIZE: u64 = 1 << 20;
23
24    #[inline]
25    pub fn start(&self) -> u64 {
26        self.range.start
27    }
28
29    #[inline]
30    pub fn translate_inner_data_index(&self, start: u64) -> u64 {
31        debug_assert!(self.range.start <= start);
32        // TODO: make this better... i don't like that its <=
33        //       but technically its fine as long as start
34        //       is the end of the buffer
35        debug_assert!(start <= self.range.end);
36        start - self.range.start
37    }
38
39    #[inline]
40    pub fn translate_inner_data_range(&self, start: u64, end: u64) -> Range<u64> {
41        self.translate_inner_data_index(start)..self.translate_inner_data_index(end)
42    }
43
44    #[inline]
45    pub fn id_of_data(start: u64) -> usize {
46        (start / Self::MAX_SIZE) as usize
47    }
48
49    #[inline]
50    pub fn data_range_of_id(id: usize) -> Range<u64> {
51        let start = id as u64 * Self::MAX_SIZE;
52        start..start + Self::MAX_SIZE
53    }
54}
55
56impl<Buf> std::ops::Deref for SegmentRaw<Buf>
57where
58    Buf: std::ops::Deref<Target = [u8]>,
59{
60    type Target = [u8];
61
62    fn deref(&self) -> &Self::Target {
63        &self.data
64    }
65}
66
67impl<Buf> std::ops::DerefMut for SegmentRaw<Buf>
68where
69    Buf: std::ops::DerefMut<Target = [u8]>,
70{
71    fn deref_mut(&mut self) -> &mut Self::Target {
72        &mut self.data
73    }
74}
75
76impl SegmentMut {
77    pub(crate) fn new(start: u64) -> Result<Self> {
78        let data = memmap2::MmapOptions::new()
79            .len(Self::MAX_SIZE as usize)
80            .map_anon()?;
81        #[cfg(unix)]
82        data.advise(memmap2::Advice::Sequential)?;
83        Ok(Self {
84            data,
85            range: start..start + Self::MAX_SIZE,
86        })
87    }
88
89    pub fn into_read_only(self) -> Result<Segment> {
90        Ok(Segment {
91            data: self.data.make_read_only()?,
92            range: self.range,
93        })
94    }
95}
96
97impl Segment {
98    pub(crate) fn map_file<F: Mmappable>(range: Range<u64>, file: &F) -> Result<Self> {
99        let size = range.end - range.start;
100        debug_assert!(size <= Self::MAX_SIZE);
101        let data = unsafe {
102            memmap2::MmapOptions::new()
103                .offset(range.start)
104                .len(size as usize)
105                .map(file)?
106        };
107        #[cfg(unix)]
108        data.advise(memmap2::Advice::WillNeed)?;
109        Ok(Self { data, range })
110    }
111
112    #[inline]
113    pub fn get_line(self: &Arc<Self>, range: Range<u64>) -> SegStr {
114        SegStr::from_bytes(self.get_bytes(range))
115    }
116
117    #[inline]
118    pub fn get_bytes(self: &Arc<Self>, range: Range<u64>) -> SegBytes {
119        SegBytes::new_borrow(self.clone(), range)
120    }
121}
122
/// Line buffer (raw bytes) that comes from a [Segment].
///
/// If the [SegBytes] borrows from the segment, the segment will not be dropped until
/// all of its referents are dropped.
///
/// This structure avoids cloning unnecessarily.
pub struct SegBytes(SegBytesRepr);
130
/// Internal representation of [SegBytes].
enum SegBytesRepr {
    /// Bytes that live inside a ref-counted [Segment].
    Borrowed {
        // This field refs the segment so its data does not get munmap'd and remains valid.
        _ref: Arc<Segment>,
        // This pointer points into the data of the ref-counted `_ref` field.
        // Maybe if polonius supports self-referential slices one day, this
        // spicy unsafe code can be dropped.
        ptr: NonNull<u8>,
        // Number of bytes readable starting at `ptr`.
        len: usize,
    },
    /// Bytes held in their own heap allocation.
    Owned(Vec<u8>),
}
144
145impl SegBytes {
146    /// Constructs a string that might borrows data from a [Segment]. If the data
147    /// is invalid utf-8, it will be converted into an owned [String] using `String::from_utf8_lossy`.
148    ///
149    /// # Safety
150    ///
151    /// 1. The provided slice must point to data that lives inside the ref-counted [Segment].
152    /// 2. The length must encompass a valid range of data inside the [Segment].
153    fn new_borrow(origin: Arc<Segment>, range: Range<u64>) -> Self {
154        // Safety: This ptr came from a slice that we prevent from
155        //         being dropped by having it inside a ref counter
156        // Safety: The length is computed by a (assumed to be correct)
157        //         index. It is undefined behavior if the file changes
158        //         in a non-appending way after the index is created.
159        let data = &origin.data[range.start as usize..range.end as usize];
160        Self(SegBytesRepr::Borrowed {
161            ptr: unsafe { NonNull::new(data.as_ptr().cast_mut()).unwrap_unchecked() },
162            len: data.len(),
163            _ref: origin,
164        })
165    }
166
167    /// Constructs a string that owns its data.
168    #[inline]
169    pub fn new_owned(s: Vec<u8>) -> Self {
170        Self(SegBytesRepr::Owned(s))
171    }
172
173    /// Returns a byte slice of this [SegBytes]'s components.
174    #[inline]
175    pub fn as_bytes(&self) -> &[u8] {
176        // Safety: We have already checked in the constructor.
177        match &self.0 {
178            SegBytesRepr::Borrowed { ptr, len, .. } => unsafe {
179                std::slice::from_raw_parts(ptr.as_ptr(), *len)
180            },
181            SegBytesRepr::Owned(s) => s.as_slice(),
182        }
183    }
184}
185
186impl std::borrow::Borrow<[u8]> for SegBytes {
187    #[inline]
188    fn borrow(&self) -> &[u8] {
189        self
190    }
191}
192
193impl std::ops::Deref for SegBytes {
194    type Target = [u8];
195
196    #[inline]
197    fn deref(&self) -> &Self::Target {
198        self.as_bytes()
199    }
200}
201
202impl std::convert::AsRef<[u8]> for SegBytes {
203    #[inline]
204    fn as_ref(&self) -> &[u8] {
205        self.as_bytes()
206    }
207}
208
/// Line string that comes from a [Segment].
///
/// If the [SegStr] borrows from the segment, the segment will not be dropped until
/// all of its referents are dropped.
///
/// This structure avoids cloning unnecessarily.
#[derive(Clone)]
pub struct SegStr(SegStrRepr);
217
/// Internal representation of [SegStr].
#[derive(Clone)]
enum SegStrRepr {
    /// UTF-8 text that lives inside a ref-counted [Segment].
    Borrowed {
        // This field refs the segment so its data does not get munmap'd and remains valid.
        _ref: Arc<Segment>,
        // This pointer points into the data of the ref-counted `_ref` field.
        // Maybe if polonius supports self-referential slices one day, this
        // spicy unsafe code can be dropped.
        ptr: NonNull<u8>,
        // Number of bytes readable starting at `ptr`.
        len: usize,
    },
    /// Text held in its own heap allocation.
    Owned(String),
}
232
233impl SegStr {
234    /// Constructs a string that might borrows data from a [Segment]. If the data
235    /// is invalid utf-8, it will be converted into an owned [String] using `String::from_utf8_lossy`.
236    pub fn from_bytes(bytes: SegBytes) -> Self {
237        match bytes.0 {
238            SegBytesRepr::Borrowed { _ref, ptr, len } => {
239                // Safety: by construction of SegBytes
240                let data = unsafe { std::slice::from_raw_parts(ptr.as_ptr(), len) };
241                match String::from_utf8_lossy(data) {
242                    Cow::Owned(s) => Self(SegStrRepr::Owned(s)),
243                    Cow::Borrowed(_) => Self(SegStrRepr::Borrowed { ptr, len, _ref }),
244                }
245            }
246            SegBytesRepr::Owned(b) => match String::from_utf8_lossy(&b) {
247                Cow::Owned(s) => Self(SegStrRepr::Owned(s)),
248                Cow::Borrowed(_) => {
249                    // Safety: We already checked that the data is valid utf-8
250                    //         in the `String::from_utf8_lossy` call.
251                    Self(SegStrRepr::Owned(unsafe { String::from_utf8_unchecked(b) }))
252                }
253            },
254        }
255    }
256
257    /// Returns a byte slice of this [SegStr]'s components.
258    #[inline]
259    pub fn as_bytes(&self) -> &[u8] {
260        // Safety: We have already checked in the constructor.
261        match &self.0 {
262            SegStrRepr::Borrowed { ptr, len, .. } => unsafe {
263                std::slice::from_raw_parts(ptr.as_ptr(), *len)
264            },
265            SegStrRepr::Owned(s) => s.as_bytes(),
266        }
267    }
268
269    /// Extract a [str] slice backed by the pinned segment data or owned data.
270    #[inline]
271    pub fn as_str(&self) -> &str {
272        // Safety: we already did utf-8 checking
273        unsafe { std::str::from_utf8_unchecked(self.as_bytes()) }
274    }
275}
276
277impl std::borrow::Borrow<str> for SegStr {
278    #[inline]
279    fn borrow(&self) -> &str {
280        self
281    }
282}
283
284impl std::ops::Deref for SegStr {
285    type Target = str;
286
287    #[inline]
288    fn deref(&self) -> &Self::Target {
289        self.as_str()
290    }
291}
292
293impl std::convert::AsRef<str> for SegStr {
294    #[inline]
295    fn as_ref(&self) -> &str {
296        self.as_str()
297    }
298}
299
300impl std::fmt::Debug for SegStr {
301    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
302        std::fmt::Debug::fmt(self.as_str(), f)
303    }
304}