Skip to main content

ferray_io/
memmap.rs

1// ferray-io: Memory-mapped array I/O
2//
3// REQ-10: memmap::<T>(path, mode) with MemmapMode::ReadOnly, ReadWrite, CopyOnWrite
4// REQ-11: Memory-mapped arrays are views into file memory, not owned copies
5
6use std::fs::{File, OpenOptions};
7use std::io::{BufReader, Seek};
8use std::marker::PhantomData;
9use std::path::Path;
10
11use memmap2::{Mmap, MmapMut, MmapOptions};
12
13use ferray_core::Array;
14use ferray_core::array::view::ArrayView;
15use ferray_core::dimension::IxDyn;
16use ferray_core::dtype::Element;
17use ferray_core::error::{FerrayError, FerrayResult};
18
19use crate::format::MemmapMode;
20use crate::npy::NpyElement;
21use crate::npy::checked_total_elements;
22use crate::npy::header::{self, NpyHeader};
23
24/// A read-only memory-mapped array backed by a `.npy` file.
25///
26/// The array data is mapped directly from the file. No copy is made.
27/// The data remains valid as long as this struct is alive.
28pub struct MemmapArray<T: Element> {
29    /// The underlying memory map.
30    _mmap: Mmap,
31    /// Pointer to the start of element data.
32    data_ptr: *const T,
33    /// Shape of the array.
34    shape: Vec<usize>,
35    /// Number of elements.
36    len: usize,
37    /// Marker for the element type.
38    _marker: PhantomData<T>,
39}
40
41// SAFETY: The underlying Mmap is Send + Sync and the data pointer
42// is derived from it. We only provide read access to the data.
43unsafe impl<T: Element> Send for MemmapArray<T> {}
44unsafe impl<T: Element> Sync for MemmapArray<T> {}
45
46impl<T: Element> MemmapArray<T> {
47    /// Return the shape of the mapped array.
48    #[must_use]
49    pub fn shape(&self) -> &[usize] {
50        &self.shape
51    }
52
53    /// Return the mapped data as a slice.
54    #[must_use]
55    pub const fn as_slice(&self) -> &[T] {
56        // SAFETY: data_ptr points to properly aligned, initialized data
57        // within the mmap region, and self.len is validated during construction.
58        unsafe { std::slice::from_raw_parts(self.data_ptr, self.len) }
59    }
60
61    /// Copy the memory-mapped data into an owned `Array`.
62    pub fn to_array(&self) -> FerrayResult<Array<T, IxDyn>> {
63        let data = self.as_slice().to_vec();
64        Array::from_vec(IxDyn::new(&self.shape), data)
65    }
66
67    /// Borrow the memory-mapped data as an `ArrayView<T, IxDyn>` so it
68    /// can be passed directly to ferray functions that expect an
69    /// `&Array<_>` or `ArrayView<_>` (#496). The view is C-contiguous
70    /// row-major and lives as long as the underlying mmap.
71    #[must_use]
72    pub fn view(&self) -> ArrayView<'_, T, IxDyn> {
73        // Row-major strides for the shape: stride[i] = product(shape[i+1..]).
74        let ndim = self.shape.len();
75        let mut strides = vec![1usize; ndim];
76        for i in (0..ndim.saturating_sub(1)).rev() {
77            strides[i] = strides[i + 1] * self.shape[i + 1];
78        }
79        // SAFETY: `data_ptr` is valid for reads of `len * size_of::<T>()`
80        // bytes for the lifetime of `self._mmap`, which transitively
81        // outlives the borrow `&self`. Strides describe the same
82        // C-contiguous layout the data was written in.
83        unsafe { ArrayView::from_shape_ptr(self.data_ptr, &self.shape, &strides) }
84    }
85}
86
87/// A read-write memory-mapped array backed by a `.npy` file.
88///
89/// Modifications to the array data are written back to the underlying file.
90pub struct MemmapArrayMut<T: Element> {
91    /// The underlying mutable memory map.
92    mmap: MmapMut,
93    /// Pointer to the start of element data.
94    data_ptr: *mut T,
95    /// Shape of the array.
96    shape: Vec<usize>,
97    /// Number of elements.
98    len: usize,
99    /// Marker for the element type.
100    _marker: PhantomData<T>,
101}
102
103unsafe impl<T: Element> Send for MemmapArrayMut<T> {}
104unsafe impl<T: Element> Sync for MemmapArrayMut<T> {}
105
106impl<T: Element> MemmapArrayMut<T> {
107    /// Return the shape of the mapped array.
108    #[must_use]
109    pub fn shape(&self) -> &[usize] {
110        &self.shape
111    }
112
113    /// Return the mapped data as a slice.
114    #[must_use]
115    pub const fn as_slice(&self) -> &[T] {
116        unsafe { std::slice::from_raw_parts(self.data_ptr, self.len) }
117    }
118
119    /// Return the mapped data as a mutable slice.
120    ///
121    /// Modifications will be persisted to the file (for `ReadWrite` mode)
122    /// or kept in memory only (for `CopyOnWrite` mode).
123    pub const fn as_slice_mut(&mut self) -> &mut [T] {
124        unsafe { std::slice::from_raw_parts_mut(self.data_ptr, self.len) }
125    }
126
127    /// Copy the memory-mapped data into an owned `Array`.
128    pub fn to_array(&self) -> FerrayResult<Array<T, IxDyn>> {
129        let data = self.as_slice().to_vec();
130        Array::from_vec(IxDyn::new(&self.shape), data)
131    }
132
133    /// Borrow the memory-mapped data as an immutable
134    /// `ArrayView<T, IxDyn>`. See [`MemmapArray::view`] for the
135    /// rationale (#496).
136    #[must_use]
137    pub fn view(&self) -> ArrayView<'_, T, IxDyn> {
138        let ndim = self.shape.len();
139        let mut strides = vec![1usize; ndim];
140        for i in (0..ndim.saturating_sub(1)).rev() {
141            strides[i] = strides[i + 1] * self.shape[i + 1];
142        }
143        // SAFETY: same invariants as MemmapArray::view; the *mut T is
144        // immediately demoted to *const T.
145        unsafe { ArrayView::from_shape_ptr(self.data_ptr.cast_const(), &self.shape, &strides) }
146    }
147
148    /// Flush changes to disk (only meaningful for `ReadWrite` mode).
149    pub fn flush(&self) -> FerrayResult<()> {
150        self.mmap
151            .flush()
152            .map_err(|e| FerrayError::io_error(format!("failed to flush mmap: {e}")))
153    }
154}
155
156/// Open a `.npy` file as a read-only memory-mapped array.
157///
158/// The file must contain data in native byte order and C-contiguous layout.
159///
160/// # Errors
161/// - `FerrayError::InvalidDtype` if the file dtype doesn't match `T`.
162/// - `FerrayError::IoError` on file or mapping failures.
163pub fn memmap_readonly<T: Element + NpyElement, P: AsRef<Path>>(
164    path: P,
165) -> FerrayResult<MemmapArray<T>> {
166    let (header, data_offset) = read_npy_header_with_offset(path.as_ref())?;
167    validate_dtype::<T>(&header)?;
168    validate_native_endian(&header)?;
169
170    let len = checked_total_elements(&header.shape)?;
171    let file = File::open(path.as_ref())?;
172    let mmap = unsafe {
173        MmapOptions::new()
174            .offset(data_offset as u64)
175            .len(len * std::mem::size_of::<T>())
176            .map(&file)
177            .map_err(|e| FerrayError::io_error(format!("mmap failed: {e}")))?
178    };
179    let data_ptr = mmap.as_ptr().cast::<T>();
180
181    // Validate alignment
182    if (data_ptr as usize) % std::mem::align_of::<T>() != 0 {
183        return Err(FerrayError::io_error(
184            "memory-mapped data is not properly aligned for the element type",
185        ));
186    }
187
188    Ok(MemmapArray {
189        _mmap: mmap,
190        data_ptr,
191        shape: header.shape,
192        len,
193        _marker: PhantomData,
194    })
195}
196
197/// Open a `.npy` file as a mutable memory-mapped array.
198///
199/// # Arguments
200/// - `mode`: `MemmapMode::ReadWrite` persists changes to disk.
201///   `MemmapMode::CopyOnWrite` keeps changes in memory only.
202///
203/// # Errors
204/// - `FerrayError::InvalidDtype` if the file dtype doesn't match `T`.
205/// - `FerrayError::IoError` on file or mapping failures.
206/// - `FerrayError::InvalidValue` if `mode` is `ReadOnly` (use `memmap_readonly` instead).
207pub fn memmap_mut<T: Element + NpyElement, P: AsRef<Path>>(
208    path: P,
209    mode: MemmapMode,
210) -> FerrayResult<MemmapArrayMut<T>> {
211    if mode == MemmapMode::ReadOnly {
212        return Err(FerrayError::invalid_value(
213            "use memmap_readonly for read-only access",
214        ));
215    }
216
217    let (header, data_offset) = read_npy_header_with_offset(path.as_ref())?;
218    validate_dtype::<T>(&header)?;
219    validate_native_endian(&header)?;
220
221    let len = checked_total_elements(&header.shape)?;
222    let data_bytes = len * std::mem::size_of::<T>();
223
224    let mmap = match mode {
225        MemmapMode::ReadWrite => {
226            let file = OpenOptions::new()
227                .read(true)
228                .write(true)
229                .open(path.as_ref())?;
230            unsafe {
231                MmapOptions::new()
232                    .offset(data_offset as u64)
233                    .len(data_bytes)
234                    .map_mut(&file)
235                    .map_err(|e| FerrayError::io_error(format!("mmap_mut failed: {e}")))?
236            }
237        }
238        MemmapMode::CopyOnWrite => {
239            let file = File::open(path.as_ref())?;
240            unsafe {
241                MmapOptions::new()
242                    .offset(data_offset as u64)
243                    .len(data_bytes)
244                    .map_copy(&file)
245                    .map_err(|e| FerrayError::io_error(format!("mmap copy-on-write failed: {e}")))?
246            }
247        }
248        MemmapMode::ReadOnly => unreachable!(),
249    };
250
251    let data_ptr = mmap.as_ptr().cast::<T>().cast_mut();
252
253    if (data_ptr as usize) % std::mem::align_of::<T>() != 0 {
254        return Err(FerrayError::io_error(
255            "memory-mapped data is not properly aligned for the element type",
256        ));
257    }
258
259    Ok(MemmapArrayMut {
260        mmap,
261        data_ptr,
262        shape: header.shape,
263        len,
264        _marker: PhantomData,
265    })
266}
267
268/// Combined entry point matching `NumPy`'s `memmap` function signature.
269///
270/// Dispatches to `memmap_readonly` or `memmap_mut` based on `mode`,
271/// then copies the mapped data into an owned `Array<T, IxDyn>`.
272///
273/// **This always copies** because `Array<T, IxDyn>` owns its buffer.
274/// For zero-copy access, use [`memmap_readonly`] or [`memmap_mut`]
275/// directly and call `.view()` on the result to get an `ArrayView`
276/// backed by the mmap (#239, #496).
277pub fn open_memmap<T: Element + NpyElement, P: AsRef<Path>>(
278    path: P,
279    mode: MemmapMode,
280) -> FerrayResult<Array<T, IxDyn>> {
281    if mode == MemmapMode::ReadOnly {
282        let mapped = memmap_readonly::<T, _>(path)?;
283        mapped.to_array()
284    } else {
285        let mapped = memmap_mut::<T, _>(path, mode)?;
286        mapped.to_array()
287    }
288}
289
290/// Read the npy header and compute the data byte offset.
291///
292/// Uses `stream_position()` after parsing the header to determine the data
293/// offset in a single open, avoiding a TOCTOU race from re-opening the file.
294fn read_npy_header_with_offset(path: &Path) -> FerrayResult<(NpyHeader, usize)> {
295    let file = File::open(path)?;
296    let mut reader = BufReader::new(file);
297    let hdr = header::read_header(&mut reader)?;
298
299    // read_header consumes exactly the header bytes; the reader is now
300    // positioned at the start of the data section.
301    let data_offset = reader
302        .stream_position()
303        .map_err(|e| FerrayError::io_error(format!("failed to get stream position: {e}")))?
304        as usize;
305
306    Ok((hdr, data_offset))
307}
308
309fn validate_dtype<T: Element>(header: &NpyHeader) -> FerrayResult<()> {
310    if header.dtype != T::dtype() {
311        return Err(FerrayError::invalid_dtype(format!(
312            "expected dtype {:?} for type {}, but file has {:?}",
313            T::dtype(),
314            std::any::type_name::<T>(),
315            header.dtype,
316        )));
317    }
318    Ok(())
319}
320
321fn validate_native_endian(header: &NpyHeader) -> FerrayResult<()> {
322    if header.endianness.needs_swap() {
323        return Err(FerrayError::io_error(
324            "memory-mapped arrays require native byte order; file has non-native endianness",
325        ));
326    }
327    Ok(())
328}
329
#[cfg(test)]
#[allow(clippy::float_cmp)] // Exact equality is intended: the hand-picked values must roundtrip unchanged.
mod tests {
    use super::*;
    use crate::npy;
    use ferray_core::dimension::Ix1;

    /// Per-process scratch directory under the system temp dir.
    fn test_dir() -> std::path::PathBuf {
        let scratch = std::env::temp_dir().join(format!("ferray_io_mmap_{}", std::process::id()));
        // Best effort: a failure here surfaces later when the file is saved.
        let _ = std::fs::create_dir_all(&scratch);
        scratch
    }

    /// Path of a named fixture file inside the scratch directory.
    fn test_file(name: &str) -> std::path::PathBuf {
        test_dir().join(name)
    }

    /// A read-only mapping reports the saved shape and values.
    #[test]
    fn memmap_readonly_f64() {
        let values = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0];
        let source = Array::<f64, Ix1>::from_vec(Ix1::new([5]), values.clone()).unwrap();

        let npy_path = test_file("mm_ro_f64.npy");
        npy::save(&npy_path, &source).unwrap();

        let mapped = memmap_readonly::<f64, _>(&npy_path).unwrap();
        assert_eq!(mapped.shape(), &[5]);
        assert_eq!(mapped.as_slice(), values.as_slice());
        let _ = std::fs::remove_file(&npy_path);
    }

    /// `to_array` produces an owned copy with the same shape and values.
    #[test]
    fn memmap_to_array() {
        let values = vec![10i32, 20, 30];
        let source = Array::<i32, Ix1>::from_vec(Ix1::new([3]), values.clone()).unwrap();

        let npy_path = test_file("mm_to_arr.npy");
        npy::save(&npy_path, &source).unwrap();

        let owned = memmap_readonly::<i32, _>(&npy_path)
            .unwrap()
            .to_array()
            .unwrap();
        assert_eq!(owned.shape(), &[3]);
        assert_eq!(owned.as_slice().unwrap(), values.as_slice());
        let _ = std::fs::remove_file(&npy_path);
    }

    /// Writes through a `ReadWrite` mapping are visible when the file is reloaded.
    #[test]
    fn memmap_readwrite_persist() {
        let source = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0_f64, 2.0, 3.0]).unwrap();

        let npy_path = test_file("mm_rw.npy");
        npy::save(&npy_path, &source).unwrap();

        // Scope the mapping so it is dropped before the file is read back.
        {
            let mut mapped = memmap_mut::<f64, _>(&npy_path, MemmapMode::ReadWrite).unwrap();
            mapped.as_slice_mut()[0] = 999.0;
            mapped.flush().unwrap();
        }

        let reloaded: Array<f64, Ix1> = npy::load(&npy_path).unwrap();
        let persisted = reloaded.as_slice().unwrap();
        assert_eq!(persisted[0], 999.0);
        assert_eq!(persisted[1], 2.0);
        assert_eq!(persisted[2], 3.0);
        let _ = std::fs::remove_file(&npy_path);
    }

    /// Writes through a `CopyOnWrite` mapping are visible in memory but leave the file untouched.
    #[test]
    fn memmap_copy_on_write() {
        let source = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0_f64, 2.0, 3.0]).unwrap();

        let npy_path = test_file("mm_cow.npy");
        npy::save(&npy_path, &source).unwrap();

        // Scope the mapping so it is dropped before the file is read back.
        {
            let mut mapped = memmap_mut::<f64, _>(&npy_path, MemmapMode::CopyOnWrite).unwrap();
            mapped.as_slice_mut()[0] = 999.0;
            assert_eq!(mapped.as_slice()[0], 999.0);
        }

        let reloaded: Array<f64, Ix1> = npy::load(&npy_path).unwrap();
        assert_eq!(reloaded.as_slice().unwrap()[0], 1.0);
        let _ = std::fs::remove_file(&npy_path);
    }

    /// Mapping an `f64` file as `f32` is rejected.
    #[test]
    fn memmap_wrong_dtype_error() {
        let source = Array::<f64, Ix1>::from_vec(Ix1::new([2]), vec![1.0_f64, 2.0]).unwrap();

        let npy_path = test_file("mm_wrong_dt.npy");
        npy::save(&npy_path, &source).unwrap();

        let outcome = memmap_readonly::<f32, _>(&npy_path);
        assert!(outcome.is_err());
        let _ = std::fs::remove_file(&npy_path);
    }

    /// `open_memmap` in `ReadOnly` mode returns an owned array equal to the saved one.
    #[test]
    fn open_memmap_readonly() {
        let values = vec![1.0_f64, 2.0, 3.0];
        let source = Array::<f64, Ix1>::from_vec(Ix1::new([3]), values.clone()).unwrap();

        let npy_path = test_file("mm_open_ro.npy");
        npy::save(&npy_path, &source).unwrap();

        let owned = open_memmap::<f64, _>(&npy_path, MemmapMode::ReadOnly).unwrap();
        assert_eq!(owned.shape(), &[3]);
        assert_eq!(owned.as_slice().unwrap(), values.as_slice());
        let _ = std::fs::remove_file(&npy_path);
    }

    /// Issue #496: a mapping exposes an `ArrayView`, so it can be passed to
    /// ferray functions taking `&Array` / `ArrayView` without an
    /// intermediate copy.
    #[test]
    fn memmap_view_borrows_underlying_data() {
        use ferray_core::dimension::Ix2;

        let values = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0];
        let source = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), values.clone()).unwrap();

        let npy_path = test_file("mm_view.npy");
        npy::save(&npy_path, &source).unwrap();

        let mapped = memmap_readonly::<f64, _>(&npy_path).unwrap();
        let view = mapped.view();
        assert_eq!(view.shape(), &[2, 3]);
        let flattened: Vec<f64> = view.iter().copied().collect();
        assert_eq!(flattened, values);
        let _ = std::fs::remove_file(&npy_path);
    }
}
469}