Skip to main content

ferray_io/
memmap.rs

1// ferray-io: Memory-mapped array I/O
2//
3// REQ-10: memmap::<T>(path, mode) with MemmapMode::ReadOnly, ReadWrite, CopyOnWrite
4// REQ-11: Memory-mapped arrays are views into file memory, not owned copies
5
6use std::fs::{File, OpenOptions};
7use std::io::{BufReader, Read};
8use std::marker::PhantomData;
9use std::path::Path;
10
11use memmap2::{Mmap, MmapMut, MmapOptions};
12
13use ferray_core::Array;
14use ferray_core::dimension::IxDyn;
15use ferray_core::dtype::Element;
16use ferray_core::error::{FerrayError, FerrayResult};
17
18use crate::format::MemmapMode;
19use crate::npy::NpyElement;
20use crate::npy::header::{self, NpyHeader};
21
22/// A read-only memory-mapped array backed by a `.npy` file.
23///
24/// The array data is mapped directly from the file. No copy is made.
25/// The data remains valid as long as this struct is alive.
26pub struct MemmapArray<T: Element> {
27    /// The underlying memory map.
28    _mmap: Mmap,
29    /// Pointer to the start of element data.
30    data_ptr: *const T,
31    /// Shape of the array.
32    shape: Vec<usize>,
33    /// Number of elements.
34    len: usize,
35    /// Marker for the element type.
36    _marker: PhantomData<T>,
37}
38
39// SAFETY: The underlying Mmap is Send + Sync and the data pointer
40// is derived from it. We only provide read access to the data.
41unsafe impl<T: Element> Send for MemmapArray<T> {}
42unsafe impl<T: Element> Sync for MemmapArray<T> {}
43
44impl<T: Element> MemmapArray<T> {
45    /// Return the shape of the mapped array.
46    pub fn shape(&self) -> &[usize] {
47        &self.shape
48    }
49
50    /// Return the mapped data as a slice.
51    pub fn as_slice(&self) -> &[T] {
52        // SAFETY: data_ptr points to properly aligned, initialized data
53        // within the mmap region, and self.len is validated during construction.
54        unsafe { std::slice::from_raw_parts(self.data_ptr, self.len) }
55    }
56
57    /// Copy the memory-mapped data into an owned `Array`.
58    pub fn to_array(&self) -> FerrayResult<Array<T, IxDyn>> {
59        let data = self.as_slice().to_vec();
60        Array::from_vec(IxDyn::new(&self.shape), data)
61    }
62}
63
64/// A read-write memory-mapped array backed by a `.npy` file.
65///
66/// Modifications to the array data are written back to the underlying file.
67pub struct MemmapArrayMut<T: Element> {
68    /// The underlying mutable memory map.
69    _mmap: MmapMut,
70    /// Pointer to the start of element data.
71    data_ptr: *mut T,
72    /// Shape of the array.
73    shape: Vec<usize>,
74    /// Number of elements.
75    len: usize,
76    /// Marker for the element type.
77    _marker: PhantomData<T>,
78}
79
80unsafe impl<T: Element> Send for MemmapArrayMut<T> {}
81unsafe impl<T: Element> Sync for MemmapArrayMut<T> {}
82
83impl<T: Element> MemmapArrayMut<T> {
84    /// Return the shape of the mapped array.
85    pub fn shape(&self) -> &[usize] {
86        &self.shape
87    }
88
89    /// Return the mapped data as a slice.
90    pub fn as_slice(&self) -> &[T] {
91        unsafe { std::slice::from_raw_parts(self.data_ptr, self.len) }
92    }
93
94    /// Return the mapped data as a mutable slice.
95    ///
96    /// Modifications will be persisted to the file (for ReadWrite mode)
97    /// or kept in memory only (for CopyOnWrite mode).
98    pub fn as_slice_mut(&mut self) -> &mut [T] {
99        unsafe { std::slice::from_raw_parts_mut(self.data_ptr, self.len) }
100    }
101
102    /// Copy the memory-mapped data into an owned `Array`.
103    pub fn to_array(&self) -> FerrayResult<Array<T, IxDyn>> {
104        let data = self.as_slice().to_vec();
105        Array::from_vec(IxDyn::new(&self.shape), data)
106    }
107
108    /// Flush changes to disk (only meaningful for ReadWrite mode).
109    pub fn flush(&self) -> FerrayResult<()> {
110        self._mmap
111            .flush()
112            .map_err(|e| FerrayError::io_error(format!("failed to flush mmap: {e}")))
113    }
114}
115
116/// Open a `.npy` file as a read-only memory-mapped array.
117///
118/// The file must contain data in native byte order and C-contiguous layout.
119///
120/// # Errors
121/// - `FerrayError::InvalidDtype` if the file dtype doesn't match `T`.
122/// - `FerrayError::IoError` on file or mapping failures.
123pub fn memmap_readonly<T: Element + NpyElement, P: AsRef<Path>>(
124    path: P,
125) -> FerrayResult<MemmapArray<T>> {
126    let (header, data_offset) = read_npy_header_with_offset(path.as_ref())?;
127    validate_dtype::<T>(&header)?;
128    validate_native_endian(&header)?;
129
130    let file = File::open(path.as_ref())?;
131    let mmap = unsafe {
132        MmapOptions::new()
133            .offset(data_offset as u64)
134            .len(header.shape.iter().product::<usize>() * std::mem::size_of::<T>())
135            .map(&file)
136            .map_err(|e| FerrayError::io_error(format!("mmap failed: {e}")))?
137    };
138
139    let len: usize = header.shape.iter().product();
140    let data_ptr = mmap.as_ptr() as *const T;
141
142    // Validate alignment
143    if (data_ptr as usize) % std::mem::align_of::<T>() != 0 {
144        return Err(FerrayError::io_error(
145            "memory-mapped data is not properly aligned for the element type",
146        ));
147    }
148
149    Ok(MemmapArray {
150        _mmap: mmap,
151        data_ptr,
152        shape: header.shape,
153        len,
154        _marker: PhantomData,
155    })
156}
157
158/// Open a `.npy` file as a mutable memory-mapped array.
159///
160/// # Arguments
161/// - `mode`: `MemmapMode::ReadWrite` persists changes to disk.
162///   `MemmapMode::CopyOnWrite` keeps changes in memory only.
163///
164/// # Errors
165/// - `FerrayError::InvalidDtype` if the file dtype doesn't match `T`.
166/// - `FerrayError::IoError` on file or mapping failures.
167/// - `FerrayError::InvalidValue` if `mode` is `ReadOnly` (use `memmap_readonly` instead).
168pub fn memmap_mut<T: Element + NpyElement, P: AsRef<Path>>(
169    path: P,
170    mode: MemmapMode,
171) -> FerrayResult<MemmapArrayMut<T>> {
172    if mode == MemmapMode::ReadOnly {
173        return Err(FerrayError::invalid_value(
174            "use memmap_readonly for read-only access",
175        ));
176    }
177
178    let (header, data_offset) = read_npy_header_with_offset(path.as_ref())?;
179    validate_dtype::<T>(&header)?;
180    validate_native_endian(&header)?;
181
182    let len: usize = header.shape.iter().product();
183    let data_bytes = len * std::mem::size_of::<T>();
184
185    let mmap = match mode {
186        MemmapMode::ReadWrite => {
187            let file = OpenOptions::new()
188                .read(true)
189                .write(true)
190                .open(path.as_ref())?;
191            unsafe {
192                MmapOptions::new()
193                    .offset(data_offset as u64)
194                    .len(data_bytes)
195                    .map_mut(&file)
196                    .map_err(|e| FerrayError::io_error(format!("mmap_mut failed: {e}")))?
197            }
198        }
199        MemmapMode::CopyOnWrite => {
200            let file = File::open(path.as_ref())?;
201            unsafe {
202                MmapOptions::new()
203                    .offset(data_offset as u64)
204                    .len(data_bytes)
205                    .map_copy(&file)
206                    .map_err(|e| FerrayError::io_error(format!("mmap copy-on-write failed: {e}")))?
207            }
208        }
209        MemmapMode::ReadOnly => unreachable!(),
210    };
211
212    let data_ptr = mmap.as_ptr() as *mut T;
213
214    if (data_ptr as usize) % std::mem::align_of::<T>() != 0 {
215        return Err(FerrayError::io_error(
216            "memory-mapped data is not properly aligned for the element type",
217        ));
218    }
219
220    Ok(MemmapArrayMut {
221        _mmap: mmap,
222        data_ptr,
223        shape: header.shape,
224        len,
225        _marker: PhantomData,
226    })
227}
228
229/// Combined entry point matching NumPy's `memmap` function signature.
230///
231/// Dispatches to `memmap_readonly` or `memmap_mut` based on `mode`.
232/// For ReadOnly mode, copies the data to an owned array (since the return type
233/// must be uniform). For mutable modes, returns the data copied into an owned array
234/// after applying the mapping.
235///
236/// For zero-copy access, use `memmap_readonly` or `memmap_mut` directly.
237pub fn open_memmap<T: Element + NpyElement, P: AsRef<Path>>(
238    path: P,
239    mode: MemmapMode,
240) -> FerrayResult<Array<T, IxDyn>> {
241    match mode {
242        MemmapMode::ReadOnly => {
243            let mapped = memmap_readonly::<T, _>(path)?;
244            mapped.to_array()
245        }
246        _ => {
247            let mapped = memmap_mut::<T, _>(path, mode)?;
248            mapped.to_array()
249        }
250    }
251}
252
253/// Read the npy header and compute the data byte offset.
254fn read_npy_header_with_offset(path: &Path) -> FerrayResult<(NpyHeader, usize)> {
255    let file = File::open(path)?;
256    let mut reader = BufReader::new(file);
257    let hdr = header::read_header(&mut reader)?;
258
259    // Compute data offset: the reader has consumed the header, so we compute
260    // the offset from the version and header_len.
261    let preamble_len = crate::format::NPY_MAGIC_LEN + 2; // magic + version bytes
262    let header_len_field_size = if hdr.version.0 == 1 { 2 } else { 4 };
263
264    // We need to figure out the total bytes consumed.
265    // Re-read the file to get the header length from the raw bytes.
266    let file2 = File::open(path)?;
267    let mut reader2 = BufReader::new(file2);
268    let mut skip = vec![0u8; preamble_len + header_len_field_size];
269    reader2.read_exact(&mut skip)?;
270
271    let header_len = if hdr.version.0 == 1 {
272        u16::from_le_bytes([skip[preamble_len], skip[preamble_len + 1]]) as usize
273    } else {
274        u32::from_le_bytes([
275            skip[preamble_len],
276            skip[preamble_len + 1],
277            skip[preamble_len + 2],
278            skip[preamble_len + 3],
279        ]) as usize
280    };
281
282    let data_offset = preamble_len + header_len_field_size + header_len;
283
284    Ok((hdr, data_offset))
285}
286
287fn validate_dtype<T: Element>(header: &NpyHeader) -> FerrayResult<()> {
288    if header.dtype != T::dtype() {
289        return Err(FerrayError::invalid_dtype(format!(
290            "expected dtype {:?} for type {}, but file has {:?}",
291            T::dtype(),
292            std::any::type_name::<T>(),
293            header.dtype,
294        )));
295    }
296    Ok(())
297}
298
299fn validate_native_endian(header: &NpyHeader) -> FerrayResult<()> {
300    if header.endianness.needs_swap() {
301        return Err(FerrayError::io_error(
302            "memory-mapped arrays require native byte order; file has non-native endianness",
303        ));
304    }
305    Ok(())
306}
307
#[cfg(test)]
mod tests {
    use super::*;
    use crate::npy;
    use ferray_core::dimension::Ix1;
    use std::path::PathBuf;

    /// Per-process scratch directory under the system temp dir.
    fn test_dir() -> PathBuf {
        let dir = std::env::temp_dir().join(format!("ferray_io_mmap_{}", std::process::id()));
        let _ = std::fs::create_dir_all(&dir);
        dir
    }

    /// Path of a named fixture file inside the scratch directory.
    fn test_file(name: &str) -> PathBuf {
        test_dir().join(name)
    }

    /// Save `values` as a 1-D `f64` `.npy` fixture and return its path.
    fn save_f64(name: &str, values: &[f64]) -> PathBuf {
        let arr =
            Array::<f64, Ix1>::from_vec(Ix1::new([values.len()]), values.to_vec()).unwrap();
        let path = test_file(name);
        npy::save(&path, &arr).unwrap();
        path
    }

    #[test]
    fn memmap_readonly_f64() {
        let data = [1.0_f64, 2.0, 3.0, 4.0, 5.0];
        let path = save_f64("mm_ro_f64.npy", &data);

        let mapped = memmap_readonly::<f64, _>(&path).unwrap();
        assert_eq!(mapped.shape(), &[5]);
        assert_eq!(mapped.as_slice(), &data[..]);
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn memmap_to_array() {
        let data = vec![10i32, 20, 30];
        let arr = Array::<i32, Ix1>::from_vec(Ix1::new([3]), data.clone()).unwrap();

        let path = test_file("mm_to_arr.npy");
        npy::save(&path, &arr).unwrap();

        let owned = memmap_readonly::<i32, _>(&path)
            .unwrap()
            .to_array()
            .unwrap();
        assert_eq!(owned.shape(), &[3]);
        assert_eq!(owned.as_slice().unwrap(), &data[..]);
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn memmap_readwrite_persist() {
        let path = save_f64("mm_rw.npy", &[1.0, 2.0, 3.0]);

        // Write through the mapping, then drop it before re-reading the file.
        {
            let mut mapped = memmap_mut::<f64, _>(&path, MemmapMode::ReadWrite).unwrap();
            mapped.as_slice_mut()[0] = 999.0;
            mapped.flush().unwrap();
        }

        // The first element must have changed on disk; the rest must not.
        let loaded: Array<f64, Ix1> = npy::load(&path).unwrap();
        let values = loaded.as_slice().unwrap();
        assert_eq!(values[0], 999.0);
        assert_eq!(values[1], 2.0);
        assert_eq!(values[2], 3.0);
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn memmap_copy_on_write() {
        let path = save_f64("mm_cow.npy", &[1.0, 2.0, 3.0]);

        // The write is visible through the mapping itself...
        {
            let mut mapped = memmap_mut::<f64, _>(&path, MemmapMode::CopyOnWrite).unwrap();
            mapped.as_slice_mut()[0] = 999.0;
            assert_eq!(mapped.as_slice()[0], 999.0);
        }

        // ...but the file on disk keeps its original contents.
        let loaded: Array<f64, Ix1> = npy::load(&path).unwrap();
        assert_eq!(loaded.as_slice().unwrap()[0], 1.0);
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn memmap_wrong_dtype_error() {
        let path = save_f64("mm_wrong_dt.npy", &[1.0, 2.0]);

        // The file holds f64 data, so mapping it as f32 must be rejected.
        assert!(memmap_readonly::<f32, _>(&path).is_err());
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn open_memmap_readonly() {
        let data = [1.0_f64, 2.0, 3.0];
        let path = save_f64("mm_open_ro.npy", &data);

        let loaded = open_memmap::<f64, _>(&path, MemmapMode::ReadOnly).unwrap();
        assert_eq!(loaded.shape(), &[3]);
        assert_eq!(loaded.as_slice().unwrap(), &data[..]);
        let _ = std::fs::remove_file(&path);
    }
}