mmap_vec/
lib.rs

1#![allow(clippy::partialeq_ne_impl)]
2#![warn(missing_docs)]
3#![deny(clippy::unwrap_used)]
4
5/*! # Rust memory mapped vector
6
7[![CI Status](https://github.com/arthurlm/mmap-vec/workflows/Test/badge.svg)](https://github.com/arthurlm/mmap-vec/actions/)
8[![codecov](https://codecov.io/gh/arthurlm/mmap-vec/graph/badge.svg?token=1TXRTK3C3Q)](https://codecov.io/gh/arthurlm/mmap-vec)
9[![docs.rs](https://docs.rs/mmap-vec/badge.svg)](https://docs.rs/mmap-vec/)
10[![Crates.io](https://img.shields.io/crates/v/mmap-vec)](https://crates.io/crates/mmap-vec)
11[![LICENSE](https://img.shields.io/crates/l/mmap-vec)](https://raw.githubusercontent.com/arthurlm/mmap-vec/main/LICENSE)
12
13This crate contains implementation / helper to create data struct that are memory mapped.
14
Sometimes you have to deal with vectors / data that cannot fit in memory.
16Moving them to disk and memory map them is a good way to deal with this problem.
17
18## How to use it ?
19
20That is so simple !
21
22```rust
23use mmap_vec::MmapVec;
24
25#[derive(Debug, PartialEq, Clone, Copy)]
26struct Row {
27    id: usize,
28    age: u8,
29}
30
31let row1 = Row { id: 42, age: 18 };
32let row2 = Row { id: 894, age: 99 };
33
34// Create a memory mapped vec 😎
35let mut v = MmapVec::<Row>::new();
36
37// Push can trigger new mmap segment creation, so it can fail.
38v.push(row1).unwrap();
39v.push(row2).unwrap();
40
41// Check the content
42assert_eq!(v[0], row1);
43assert_eq!(&v[..], &[row1, row2]);
44
45// Pop content
46assert_eq!(v.pop(), Some(row2));
47assert_eq!(v.pop(), Some(row1));
48```
49
50Check the unit tests for more example.
51
52## How it works ?
53
54The main idea here is to provide a basic `struct Segment`.
55
56This struct provides constant size memory mapped array of type `T`.
57Wrapping `Segment` into a new struct `MmapVec` that handle segment growth / shrink does the trick.
58
### Where are the segments stored on disk ?
60
61For now data are stored in `.cache` (if using 'cache-dirs' feature) or `/tmp` under a dedicated folder.
62
63UUID V4 are generated in order to avoid collision when creating segment.
64
65```text
66❯ ls /tmp/mmap-vec-rs -1
67/tmp/mmap-vec-rs/00d977bf-b556-475e-8de5-d35e7baaa39d.seg
68/tmp/mmap-vec-rs/6cb81228-9cf3-4918-a3ef-863907b32830.seg
69/tmp/mmap-vec-rs/8a86eeaa-1fa8-4535-9e23-6c59e0c9c376.seg
70/tmp/mmap-vec-rs/de62bde3-6524-4c4b-b514-24f6a44d6323.seg
71```
72
### Is segment creation configurable ?
74
75Yes ! Check out `test_custom_segment_creator::test_custom_segment_builder` for example.
76
Since segment creation is managed through a trait, you are free to configure it the way you want.
78
79### Does this work on Windows ?
80
81__Nope__. I am not targeting this OS and would like to keep this crate as simple as possible.
82
83I also would like to reduce dependencies as much as possible.
84
85```text
86❯ cargo tree
87mmap-vec v0.1.1
88├── libc v0.2.147
89├── uuid v1.4.1
90|   └── getrandom v0.2.10
91|       ├── cfg-if v1.0.0
92|       └── libc v0.2.147
93# Optional using 'cache-dir' feature
94├── dirs v5.0.1
95│   └── dirs-sys v0.4.1
96│       ├── libc v0.2.147
97│       └── option-ext v0.2.0
98[dev-dependencies]
99└── glob v0.3.1
100```
101
102### Is this crate production ready ?
103
104Yes 😁 !
Since v0.1.1. But features are a little bit limited for now ...
106
GitHub PRs to help on this are welcome !
108
109Prefetching API is not fully stable for now and may change in the future.
110
111## Ideas / new features ?
112
113- Implement custom `std::alloc::Allocator` to use with `std::vec::Vec`
114 */
115
116use std::{
117    fs, io, mem,
118    ops::{Deref, DerefMut},
119    path::PathBuf,
120};
121
122#[cfg(feature = "serde")]
123use std::marker::PhantomData;
124
125pub use segment::Segment;
126pub use segment_builder::{DefaultSegmentBuilder, SegmentBuilder};
127pub use stats::MmapStats;
128use utils::check_zst;
129pub use vec_builder::MmapVecBuilder;
130
131#[cfg(feature = "serde")]
132use serde::{
133    de::{SeqAccess, Visitor},
134    ser::SerializeSeq,
135    Deserialize, Deserializer, Serialize, Serializer,
136};
137
138use crate::utils::page_size;
139
140mod segment;
141mod segment_builder;
142mod stats;
143mod utils;
144mod vec_builder;
145
/// A disk memory mapped vector.
#[derive(Debug)]
pub struct MmapVec<T, B: SegmentBuilder = DefaultSegmentBuilder> {
    /// Memory mapped region that owns the vec content.
    pub(crate) segment: Segment<T>,
    /// Builder used to pick paths for new segments (e.g. in `try_clone`).
    pub(crate) builder: B,
    /// On-disk file backing `segment`; removed when the vec is dropped.
    pub(crate) path: PathBuf,
}
153
154impl<T, B> MmapVec<T, B>
155where
156    B: SegmentBuilder,
157{
158    /// Create a zero size mmap vec.
159    #[inline(always)]
160    pub fn new() -> Self {
161        check_zst::<T>();
162
163        let builder = B::default();
164        let path = builder.new_segment_path();
165        Self {
166            segment: Segment::null(),
167            builder,
168            path,
169        }
170    }
171
172    /// Create a mmap vec with a given capacity.
173    ///
174    /// This function can fail if FS / IO failed.
175    #[inline(always)]
176    pub fn with_capacity(capacity: usize) -> io::Result<Self> {
177        MmapVecBuilder::new().capacity(capacity).try_build()
178    }
179
180    /// Currently used vec size.
181    #[inline(always)]
182    pub fn capacity(&self) -> usize {
183        self.segment.capacity()
184    }
185
186    /// Bytes use on disk for this vec.
187    #[inline(always)]
188    pub fn disk_size(&self) -> usize {
189        self.segment.disk_size()
190    }
191
192    /// Shortens the vec, keeping the first `new_len` elements and dropping
193    /// the rest.
194    #[inline(always)]
195    pub fn truncate(&mut self, new_len: usize) {
196        self.segment.truncate(new_len);
197    }
198
199    /// Remove `delete_count` element at beginning of the vec.
200    ///
201    /// Element will be drop in place.
202    ///
203    /// If delete count is greater than the segment len, then this call will be
204    /// equivalent to calling `clear` function.
205    ///
206    /// Example:
207    /// ```rust
208    /// # use mmap_vec::MmapVec;
209    /// let mut v = MmapVec::<u8>::new();
210    /// assert!(v.push(8).is_ok());
211    /// assert!(v.push(5).is_ok());
212    /// assert!(v.push(3).is_ok());
213    /// assert!(v.push(12).is_ok());
214    /// assert_eq!(&v[..], &[8, 5, 3, 12]);
215    ///
216    /// v.truncate_first(2);
217    /// assert_eq!(&v[..], [3, 12]);
218    ///
219    /// v.truncate_first(100);
220    /// assert_eq!(&v[..], []);
221    /// ```
222    #[inline(always)]
223    pub fn truncate_first(&mut self, delete_count: usize) {
224        self.segment.truncate_first(delete_count);
225    }
226
227    /// Clears the vec, removing all values.
228    #[inline(always)]
229    pub fn clear(&mut self) {
230        self.segment.clear();
231    }
232
233    /// Remove last value of the vec.
234    ///
235    /// Value will be return if data structure is not empty.
236    #[inline(always)]
237    pub fn pop(&mut self) -> Option<T> {
238        self.segment.pop()
239    }
240
241    /// Append a value to the vec.
242    ///
243    /// If vec is too small:
244    /// - new segment may be created.
245    /// - current segment may be resize.
246    ///
247    /// This is why this function can fail, because it depends on FS / IO calls.
248    pub fn push(&mut self, value: T) -> Result<(), io::Error> {
249        // Reserve some space if vec is full.
250        if self.capacity() == self.len() {
251            let min_capacity = page_size() / mem::size_of::<T>();
252            self.reserve(std::cmp::max(self.len(), min_capacity))?;
253        }
254
255        // Add new value to vec.
256        assert!(
257            self.push_within_capacity(value).is_ok(),
258            "Fail to push to newly created segment"
259        );
260
261        Ok(())
262    }
263
264    /// Try to push a new value to the data structure.
265    ///
266    /// If vec is too small, value will be return as an `Err`.
267    #[inline(always)]
268    pub fn push_within_capacity(&mut self, value: T) -> Result<(), T> {
269        self.segment.push_within_capacity(value)
270    }
271
272    /// Resize the vec without copying data.
273    ///
274    /// # How it works ?
275    ///
276    /// 1. It first check we need to grow the segment.
277    /// 2. Call `Segment::<T>::open_rw` with a bigger capacity that what we already reserve.
278    ///    At this point, the file is mmap twice.
279    /// 3. Replace `self.segment` we newly mapped segment if there is no error.
280    /// 4. Update segment len to avoid calling drop on unwanted data.
281    pub fn reserve(&mut self, additional: usize) -> Result<(), io::Error> {
282        let current_len = self.len();
283        let mut new_capacity = current_len + additional;
284
285        if self.capacity() < new_capacity {
286            // Round to upper page new capacity
287            let page_size = page_size();
288            let page_capacity = page_size / mem::size_of::<T>();
289            if new_capacity % page_capacity != 0 {
290                new_capacity += page_capacity - (new_capacity % page_capacity);
291            }
292            assert!(new_capacity > self.segment.capacity());
293
294            // Map again path with a new segment but with bigger capacity.
295            let new_segment = Segment::<T>::open_rw(&self.path, new_capacity)?;
296            debug_assert!(new_segment.capacity() > self.segment.capacity());
297
298            // At this point we cannot panic anymore !
299            // We have to carefully unmap region to avoid calling multiple times drop
300            let mut old_segment = mem::replace(&mut self.segment, new_segment);
301            assert_ne!(old_segment.addr, self.segment.addr);
302
303            // Update capacity to nothing should be dropped twice.
304            unsafe {
305                old_segment.set_len(0);
306                self.segment.set_len(current_len);
307            }
308        }
309
310        Ok(())
311    }
312
313    /// Inform the kernel that the complete segment will be access in a near future.
314    #[inline(always)]
315    pub fn advice_prefetch_all_pages(&self) {
316        self.segment.advice_prefetch_all_pages()
317    }
318
319    /// Inform the kernel that underlying page for `index` will be access in a near future.
320    #[inline(always)]
321    pub fn advice_prefetch_page_at(&self, index: usize) {
322        self.segment.advice_prefetch_page_at(index)
323    }
324
325    /// Get underlying file path.
326    pub fn path(&self) -> PathBuf {
327        self.path.clone()
328    }
329}
330
331impl<T, B> MmapVec<T, B>
332where
333    B: SegmentBuilder + Clone,
334    T: Clone,
335{
336    /// Try cloning the vector.
337    ///
338    /// A new segment will be created for output vec.
339    /// Capacity of the new vec will be the same as source vec.
340    pub fn try_clone(&self) -> io::Result<Self> {
341        if self.len() == 0 {
342            return Ok(Self::default());
343        }
344
345        let other_path = self.builder.new_segment_path();
346        let mut other_segment = Segment::open_rw(&other_path, self.capacity())?;
347
348        // Bellow code could be optimize, but we have to deal with Clone implementation that can panic ...
349        for row in &self[..] {
350            // It is "safe" here to call panic on error since we already have reserved correct segment capacity.
351            assert!(
352                other_segment.push_within_capacity(row.clone()).is_ok(),
353                "Fail to push to newly cloned segment"
354            );
355        }
356
357        Ok(Self {
358            builder: self.builder.clone(),
359            segment: other_segment,
360            path: other_path,
361        })
362    }
363}
364
365impl<T, B> Default for MmapVec<T, B>
366where
367    B: SegmentBuilder,
368{
369    #[inline(always)]
370    fn default() -> Self {
371        Self::new()
372    }
373}
374
375impl<T, B> Deref for MmapVec<T, B>
376where
377    B: SegmentBuilder,
378{
379    type Target = [T];
380
381    #[inline(always)]
382    fn deref(&self) -> &Self::Target {
383        self.segment.deref()
384    }
385}
386
387impl<T, B> DerefMut for MmapVec<T, B>
388where
389    B: SegmentBuilder,
390{
391    #[inline(always)]
392    fn deref_mut(&mut self) -> &mut Self::Target {
393        self.segment.deref_mut()
394    }
395}
396
397impl<T, U, B1, B2> PartialEq<MmapVec<U, B2>> for MmapVec<T, B1>
398where
399    B1: SegmentBuilder,
400    B2: SegmentBuilder,
401    T: PartialEq<U>,
402{
403    #[inline(always)]
404    fn eq(&self, other: &MmapVec<U, B2>) -> bool {
405        self[..] == other[..]
406    }
407
408    #[inline(always)]
409    fn ne(&self, other: &MmapVec<U, B2>) -> bool {
410        self[..] != other[..]
411    }
412}
413
// Marker impl: slice equality is a total equivalence relation when `T: Eq`.
impl<T, B> Eq for MmapVec<T, B>
where
    B: SegmentBuilder,
    T: Eq,
{
}
420
421impl<T, B> Drop for MmapVec<T, B>
422where
423    B: SegmentBuilder,
424{
425    fn drop(&mut self) {
426        let _ = fs::remove_file(&self.path);
427    }
428}
429
/// Out-of-line panic helper: keeps callers small and marks the panic
/// path cold for better branch layout.
///
/// Note: fixes the "was build" typo in the panic message.
#[inline(never)]
#[cold]
fn panic_bad_capacity() {
    panic!("MmapVec was built with bad capacity");
}
435
436impl<T, B, const N: usize> TryFrom<[T; N]> for MmapVec<T, B>
437where
438    B: SegmentBuilder,
439{
440    type Error = io::Error;
441
442    fn try_from(values: [T; N]) -> Result<Self, Self::Error> {
443        let mut out = Self::with_capacity(N)?;
444        for val in values {
445            if out.push_within_capacity(val).is_err() {
446                panic_bad_capacity();
447            }
448        }
449        Ok(out)
450    }
451}
452
453impl<T, B> TryFrom<&[T]> for MmapVec<T, B>
454where
455    T: Clone,
456    B: SegmentBuilder,
457{
458    type Error = io::Error;
459
460    fn try_from(values: &[T]) -> Result<Self, Self::Error> {
461        let mut out = Self::with_capacity(values.len())?;
462        for val in values {
463            if out.push_within_capacity(val.clone()).is_err() {
464                panic_bad_capacity();
465            }
466        }
467        Ok(out)
468    }
469}
470
471impl<T, B> TryFrom<Vec<T>> for MmapVec<T, B>
472where
473    B: SegmentBuilder,
474{
475    type Error = io::Error;
476
477    fn try_from(values: Vec<T>) -> Result<Self, Self::Error> {
478        let mut out = Self::with_capacity(values.len())?;
479        for val in values {
480            if out.push_within_capacity(val).is_err() {
481                panic_bad_capacity();
482            }
483        }
484        Ok(out)
485    }
486}
487
#[cfg(feature = "serde")]
impl<T, B> Serialize for MmapVec<T, B>
where
    T: Serialize,
    B: SegmentBuilder,
{
    /// Serialize the vec as a sequence of its elements.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let mut state = serializer.serialize_seq(Some(self.len()))?;
        for item in self.iter() {
            state.serialize_element(item)?;
        }
        state.end()
    }
}
505
#[cfg(feature = "serde")]
/// Serde visitor that deserializes a sequence into a `MmapVec`.
struct MmapVecVisitor<T, B: SegmentBuilder> {
    // `fn() -> …` marker: ties the visitor to its output type without
    // owning a `T` or `B` value.
    _marker: PhantomData<fn() -> MmapVec<T, B>>,
}
510
#[cfg(feature = "serde")]
impl<T, B: SegmentBuilder> MmapVecVisitor<T, B> {
    /// Create a fresh visitor.
    fn new() -> Self {
        MmapVecVisitor {
            _marker: PhantomData,
        }
    }
}
519
#[cfg(feature = "serde")]
impl<'de, T, B> Visitor<'de> for MmapVecVisitor<T, B>
where
    T: Deserialize<'de>,
    B: SegmentBuilder,
{
    type Value = MmapVec<T, B>;

    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("expected sequence of element")
    }

    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
    where
        A: SeqAccess<'de>,
    {
        use serde::de::Error;

        // Pre-allocate from the hint when available; `push` can still grow
        // the segment if the hint under-estimates.
        let hint = seq.size_hint().unwrap_or(0);
        let mut vec = MmapVec::<T, B>::with_capacity(hint).map_err(Error::custom)?;

        while let Some(item) = seq.next_element()? {
            vec.push(item).map_err(Error::custom)?;
        }

        Ok(vec)
    }
}
548
#[cfg(feature = "serde")]
impl<'de, T, B> Deserialize<'de> for MmapVec<T, B>
where
    T: Deserialize<'de>,
    B: SegmentBuilder,
{
    /// Deserialize from any sequence-shaped input.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_seq(MmapVecVisitor::<T, B>::new())
    }
}