//! `simd_r_drive_entry_handle/entry_handle.rs`
//!
//! Zero-copy handle over a payload stored in a shared memory-mapped file.
use super::constants::METADATA_SIZE;
use crate::EntryMetadata;
use memmap2::{Mmap, MmapMut};
use std::ops::Range;
use std::sync::Arc;
6
/// Zero-copy owner of a sub-slice in an `Arc<Mmap>`.
///
/// Provides access to the bytes of an entry as long as this struct is alive.
/// Cloning is cheap: it bumps the `Arc` refcount and copies the range and
/// metadata; the mapped bytes themselves are never duplicated.
#[derive(Debug, Clone)]
pub struct EntryHandle {
    /// The underlying memory map, shared with every other handle into the
    /// same file. Kept alive as long as any handle exists.
    pub mmap_arc: Arc<Mmap>,

    /// The range of bytes within the memory-mapped file corresponding to the
    /// payload (metadata bytes are *not* included in this range).
    pub range: Range<usize>,

    /// Metadata associated with the entry, including key hash and checksum.
    pub metadata: EntryMetadata,
}
20
21impl EntryHandle {
22    /// Provides access to the raw pointer of the memory-mapped file for testing.
23    ///
24    /// This method allows unit tests to verify that multiple `EntryHandle` instances
25    /// share the same underlying memory map, ensuring zero-copy behavior.
26    ///
27    /// # Returns
28    /// - A raw pointer to the underlying `Mmap`.
29    #[cfg(test)]
30    pub fn arc_ptr(&self) -> *const Mmap {
31        Arc::as_ptr(&self.mmap_arc)
32    }
33}
34
35/// Enable `*entry_handle` to act like a `&[u8]`
36impl std::ops::Deref for EntryHandle {
37    type Target = [u8];
38
39    fn deref(&self) -> &Self::Target {
40        self.as_slice()
41    }
42}
43
44/// Let us do: `assert_eq!(entry_handle, b"some bytes")`
45impl PartialEq<[u8]> for EntryHandle {
46    fn eq(&self, other: &[u8]) -> bool {
47        self.as_slice() == other
48    }
49}
50
51/// Allow comparisons with `&[u8]`
52impl PartialEq<&[u8]> for EntryHandle {
53    fn eq(&self, other: &&[u8]) -> bool {
54        self.as_slice() == *other
55    }
56}
57
58/// Allow comparisons with `Vec<u8>`
59impl PartialEq<Vec<u8>> for EntryHandle {
60    fn eq(&self, other: &Vec<u8>) -> bool {
61        self.as_slice() == other.as_slice()
62    }
63}
64
65impl EntryHandle {
66    /// Construct an in-memory, read-only entry backed by an anonymous mmap.
67    ///
68    /// This copies `bytes` **once** into an anonymous `MmapMut`, then seals it
69    /// to a read-only `Mmap`. The result behaves like a file-backed entry
70    /// (zero-copy reads via `as_slice()`), but never touches the filesystem.
71    ///
72    /// The `EntryMetadata` is populated using the supplied `key_hash`, a
73    /// `prev_offset` of `0` (not used for in-memory entries), and a 32-bit
74    /// checksum computed by the same algorithm used in `is_valid_checksum()`.
75    ///
76    /// # When to use
77    /// - Unit tests and benchmarks.
78    /// - Backends that ingest bytes from the network or RAM but still want an
79    ///   `EntryHandle` with mmap-like semantics.
80    ///
81    /// # Cost
82    /// - One O(len) copy into the anonymous mapping.
83    ///
84    /// # Errors
85    /// - Returns `std::io::Error` if the platform cannot create an anonymous
86    ///   mapping or the mapping fails.
87    pub fn from_owned_bytes_anon(bytes: &[u8], key_hash: u64) -> std::io::Result<Self> {
88        // 1) anon mmap (writable)
89        let mut mm = MmapMut::map_anon(bytes.len())?;
90        // 2) copy once
91        mm[..bytes.len()].copy_from_slice(bytes);
92        // 3) freeze to read-only Mmap
93        let ro: Mmap = mm.make_read_only()?;
94        // 4) compute checksum the same way your store does
95        let checksum = {
96            let mut hasher = crc32fast::Hasher::new();
97            hasher.update(bytes);
98            hasher.finalize().to_le_bytes()
99        };
100
101        // 5) fill metadata; set prev_offset to 0 (unused for in-memory)
102        let metadata = EntryMetadata {
103            key_hash,
104            prev_offset: 0,
105            checksum,
106        };
107
108        Ok(Self {
109            mmap_arc: Arc::new(ro),
110            range: 0..bytes.len(),
111            metadata,
112        })
113    }
114
115    /// Wrap a region in an existing `Arc<Mmap)` without copying.
116    ///
117    /// The caller provides the shared mapping, a `range` within that mapping
118    /// that contains the payload bytes, and the `EntryMetadata` corresponding
119    /// to those bytes.
120    ///
121    /// ### Safety & Correctness
122    /// - **Bounds:** `range` must lie entirely within the mapping.
123    /// - **Lifetime:** The `Arc<Mmap>` is cloned and keeps the mapping alive as
124    ///   long as any `EntryHandle` exists.
125    /// - **Integrity:** `metadata.checksum` should match the bytes in `range`
126    ///   (use `is_valid_checksum()` to verify).
127    ///
128    /// This is the zero-copy path used by file-backed stores.
129    pub fn from_arc_mmap(
130        mmap_arc: Arc<Mmap>,
131        range: Range<usize>,
132        metadata: EntryMetadata,
133    ) -> Self {
134        Self {
135            mmap_arc,
136            range,
137            metadata,
138        }
139    }
140
141    /// Returns a zero-copy reference to the sub-slice of bytes corresponding to the entry.
142    ///
143    /// This method ensures **no additional allocations** occur by referencing the memory-mapped
144    /// region instead of copying data.
145    ///
146    /// # Returns
147    /// - A byte slice (`&[u8]`) referencing the original data.
148    ///
149    /// # Zero-Copy Guarantee
150    /// - The returned slice directly references the **underlying memory-mapped file**.
151    pub fn as_slice(&self) -> &[u8] {
152        // Returning a *cloned reference* to the memory-mapped data rather than
153        // cloning the values. This is expected behavior for zero-copy access.
154        &self.mmap_arc[self.range.clone()]
155    }
156
157    /// Creates a new `EntryHandle` with the same memory-mapped reference.
158    ///
159    /// This method provides a way to duplicate an `EntryHandle` **without cloning the underlying data**.
160    /// Instead, it increments the reference count on the `Arc<Mmap>`, ensuring that the same memory-mapped
161    /// file remains accessible across multiple handles.
162    ///
163    /// # Usage
164    ///
165    /// - This is useful when multiple parts of the system need to access the same entry
166    ///   without creating redundant copies.
167    /// - Unlike `Clone`, which is not implemented for `EntryHandle`, this method allows controlled
168    ///   duplication without unnecessary allocations.
169    ///
170    /// # Returns
171    /// - A new `EntryHandle` referencing the same underlying data and metadata.
172    ///
173    /// # Zero-Copy Guarantee
174    /// - Both the original and cloned handle will refer to the same memory-mapped region.
175    /// - The `Arc<Mmap>` ensures the mapped file stays valid as long as any handle is in scope.
176    ///
177    /// # Safety Considerations
178    /// - Do **not** use this method if you need to modify data, as all handles share the same immutable mapping.
179    pub fn clone_arc(&self) -> Self {
180        Self {
181            mmap_arc: Arc::clone(&self.mmap_arc), // Keeps same mmap reference
182            range: self.range.clone(),
183            metadata: self.metadata.clone(),
184        }
185    }
186
187    /// Returns a reference to the entry’s parsed metadata.
188    ///
189    /// This metadata includes:
190    /// - `key_hash`: The hash of the key.
191    /// - `prev_offset`: The offset of the previous entry.
192    /// - `checksum`: A checksum for verifying data integrity.
193    ///
194    /// # Returns
195    /// - A reference to the `EntryMetadata` struct.
196    pub fn metadata(&self) -> &EntryMetadata {
197        &self.metadata
198    }
199
200    /// Returns the payload size of the entry.
201    ///
202    /// # Returns
203    /// - The size of the payload in bytes.
204    pub fn size(&self) -> usize {
205        self.range.len()
206    }
207
208    /// Returns the total size of the entry, including metadata.
209    ///
210    /// # Returns
211    /// - The size of the payload plus metadata in bytes.
212    pub fn file_size(&self) -> usize {
213        self.range.len() + METADATA_SIZE
214    }
215
216    /// Returns the 64-bit hash of this entry’s key.
217    ///
218    /// The value is read from the entry’s metadata exactly as it was written:
219    /// for APIs that accept raw keys it is `compute_hash(key)`; for APIs that
220    /// accept pre-hashed keys (e.g. `write_with_key_hash`, `batch_write_with_key_hashes`)
221    /// it is the caller-supplied hash. No hashing is performed when reading.
222    ///
223    /// This hash is used by the index for fast lookup and collision checks.
224    ///
225    /// # Returns
226    /// - A 64-bit unsigned integer representing the key hash.
227    pub fn key_hash(&self) -> u64 {
228        self.metadata.key_hash
229    }
230
231    /// Returns the checksum of the entry's payload.
232    ///
233    /// The checksum is a 32-bit value used for data integrity verification.
234    ///
235    /// # Returns
236    /// - A 32-bit unsigned integer representing the checksum.
237    pub fn checksum(&self) -> u32 {
238        u32::from_le_bytes(self.metadata.checksum)
239    }
240
241    /// Returns the raw checksum bytes of the entry.
242    ///
243    /// This method provides direct access to the checksum bytes for additional processing.
244    ///
245    /// # Returns
246    /// - A `[u8; 4]` array containing the raw checksum.
247    pub fn raw_checksum(&self) -> [u8; 4] {
248        self.metadata.checksum
249    }
250
251    /// Validates the integrity of the entry using its stored checksum.
252    ///
253    /// This method computes the checksum of the payload **in chunks** (streaming)
254    /// to match how it was originally computed during writes. This ensures that
255    /// large entries and small entries are handled consistently.
256    ///
257    /// # Returns
258    /// - `true` if the computed checksum matches the stored value.
259    /// - `false` if the data has been corrupted.
260    pub fn is_valid_checksum(&self) -> bool {
261        let mut hasher = crc32fast::Hasher::new();
262        let chunk_size = 4096; // Process in 4KB chunks
263        let data = self.as_slice();
264
265        // Compute checksum in a streaming manner
266        let mut offset = 0;
267        while offset < data.len() {
268            let end = std::cmp::min(offset + chunk_size, data.len());
269            hasher.update(&data[offset..end]);
270            offset = end;
271        }
272
273        let computed = hasher.finalize().to_le_bytes();
274        self.metadata.checksum == computed
275    }
276
277    /// Returns the absolute start byte offset within the mapped file.
278    ///
279    /// This offset represents where the payload begins in the memory-mapped storage.
280    ///
281    /// # Returns
282    /// - A `usize` representing the start offset.
283    pub fn start_offset(&self) -> usize {
284        self.range.start
285    }
286
287    /// Returns the absolute end byte offset within the mapped file.
288    ///
289    /// This offset represents where the payload ends in the memory-mapped storage.
290    ///
291    /// # Returns
292    /// - A `usize` representing the end offset.
293    pub fn end_offset(&self) -> usize {
294        self.range.end
295    }
296
297    /// Returns the byte offset range for the entry within the mapped file.
298    ///
299    /// This provides a structured way to access the start and end offsets.
300    ///
301    /// # Returns
302    /// - A `Range<usize>` representing the byte range of the entry.
303    pub fn offset_range(&self) -> Range<usize> {
304        self.range.clone()
305    }
306
307    /// Returns the pointer range in the current process's memory.
308    ///
309    /// This is the actual *virtual address* space that the entry occupies.
310    /// - The `start_ptr` points to the beginning of the payload in memory.
311    /// - The `end_ptr` is `start_ptr + payload_length`.
312    ///
313    /// **Note**: These addresses are valid only in this process and can become
314    /// invalid if the memory map is remapped or unmapped.
315    pub fn address_range(&self) -> std::ops::Range<*const u8> {
316        let slice = self.as_slice();
317        let start_ptr = slice.as_ptr();
318        let end_ptr = unsafe { start_ptr.add(slice.len()) };
319        start_ptr..end_ptr
320    }
321
322    /// Returns a reference to the shared memory-mapped file.
323    ///
324    /// This exposes the underlying `Arc<Mmap>` used to back the entry's data.
325    ///
326    /// # Returns
327    /// - A reference to the `Arc<Mmap>` instance holding the memory-mapped file.
328    ///
329    /// # Use Cases
330    /// - Verifying that two `EntryHandle`s share the same `Mmap` backing.
331    /// - Providing foreign-language bindings (e.g., Python) access to shared memory.
332    /// - Internal testing or diagnostics (e.g., checking refcounts).
333    ///
334    /// # Safety Considerations
335    /// - Do **not** attempt to unmap, remap, or modify the memory manually.
336    /// - The returned mapping is shared and valid only as long as an `Arc` exists.
337    ///
338    /// # Feature Flag
339    /// This method is gated behind the `expose-internal-api` Cargo feature:
340    ///
341    /// ```toml
342    /// [features]
343    /// expose-internal-api = []
344    /// ```
345    ///
346    /// It is **not part of the stable public API** and may be changed or removed
347    /// in future versions. It is intended for internal or FFI-bound use only.
348    #[cfg(feature = "expose-internal-api")]
349    pub fn mmap_arc(&self) -> &Arc<Mmap> {
350        &self.mmap_arc
351    }
352}
353
/// Zero-copy Arrow Buffer views over this entry.
///
/// Safety: the pointer comes from an `Arc<Mmap>` and stays valid for the
/// life of the returned `Buffer` via the captured owner. The owner is an
/// `Arc<EntryHandle>`, which keeps the underlying `Arc<Mmap>` alive.
#[cfg(feature = "arrow")]
impl EntryHandle {
    /// View the payload as an Arrow `Buffer` without copying.
    ///
    /// Feature: `arrow`
    ///
    /// Returns a zero-copy `arrow::buffer::Buffer` whose contents point at
    /// the same bytes as `self.as_slice()`. The returned `Buffer` captures
    /// an `Arc<EntryHandle>` internally, which keeps the `Arc<Mmap>` alive
    /// for the lifetime of the `Buffer`.
    ///
    /// No allocation or memcpy of the payload occurs. The only work here is
    /// constructing the `Buffer` and cloning the `Arc` owner.
    ///
    /// Safety
    /// ------
    /// Internally uses `Buffer::from_custom_allocation`, which assumes:
    /// - `self.as_slice().as_ptr()` is valid for `self.size()` bytes.
    /// - The memory remains valid and immutable for the `Buffer` lifetime.
    /// - The pointer is suitably aligned for `u8`.
    ///
    /// Panics
    /// ------
    /// Rust guarantees `&[u8]::as_ptr()` is non-null, even for empty slices.
    /// The `NonNull::new(...).expect(...)` check is defensive and should
    /// never panic.
    pub fn as_arrow_buffer(&self) -> arrow::buffer::Buffer {
        use arrow::buffer::Buffer;
        use std::ptr::NonNull;
        use std::sync::Arc;

        let slice = self.as_slice();
        // Debug-only alignment diagnostics; compiled out of release builds
        // unless running under `cfg(test)`.
        #[cfg(any(test, debug_assertions))]
        {
            use crate::{
                constants::PAYLOAD_ALIGNMENT, debug_assert_aligned, debug_assert_aligned_offset,
            };
            // Assert actual pointer alignment.
            debug_assert_aligned(slice.as_ptr(), PAYLOAD_ALIGNMENT as usize);
            // Assert derived file offset alignment.
            debug_assert_aligned_offset(self.range.start as u64);
        }

        // Cast to *mut only to satisfy `NonNull`; the buffer is never mutated.
        let ptr = NonNull::new(slice.as_ptr() as *mut u8).expect("non-null slice ptr");
        // SAFETY: `ptr` is valid for `slice.len()` bytes, and the cloned
        // `Arc<EntryHandle>` owner keeps the backing mmap alive and immutable
        // for the Buffer's lifetime.
        unsafe { Buffer::from_custom_allocation(ptr, slice.len(), Arc::new(self.clone())) }
    }

    /// Convert this handle into an Arrow `Buffer` without copying.
    ///
    /// Feature: `arrow`
    ///
    /// Like [`as_arrow_buffer`](Self::as_arrow_buffer) but consumes `self`
    /// to avoid one extra `Arc` clone. This is otherwise identical to the
    /// borrowing variant and still performs zero copies of the payload.
    ///
    /// Safety
    /// ------
    /// Same assumptions as [`as_arrow_buffer`](Self::as_arrow_buffer):
    /// - Pointer is valid for `len` bytes and remains immutable while the
    ///   `Buffer` lives.
    /// - Alignment is suitable for `u8`.
    ///
    /// Panics
    /// ------
    /// See [`as_arrow_buffer`](Self::as_arrow_buffer). The check is
    /// defensive and should never panic.
    pub fn into_arrow_buffer(self) -> arrow::buffer::Buffer {
        use arrow::buffer::Buffer;
        use std::ptr::NonNull;
        use std::sync::Arc;

        let slice = self.as_slice();
        // Debug-only alignment diagnostics; mirrors `as_arrow_buffer`.
        #[cfg(any(test, debug_assertions))]
        {
            use crate::{
                constants::PAYLOAD_ALIGNMENT, debug_assert_aligned, debug_assert_aligned_offset,
            };
            // Assert actual pointer alignment.
            debug_assert_aligned(slice.as_ptr(), PAYLOAD_ALIGNMENT as usize);
            // Assert derived file offset alignment.
            debug_assert_aligned_offset(self.range.start as u64);
        }

        // Cast to *mut only to satisfy `NonNull`; the buffer is never mutated.
        let ptr = NonNull::new(slice.as_ptr() as *mut u8).expect("non-null slice ptr");
        // SAFETY: `ptr` is valid for `slice.len()` bytes; moving `self` into
        // the owning `Arc` keeps the backing mmap alive for the Buffer.
        unsafe { Buffer::from_custom_allocation(ptr, slice.len(), Arc::new(self)) }
    }
}
446
447impl AsRef<[u8]> for EntryHandle {
448    fn as_ref(&self) -> &[u8] {
449        self.as_slice()
450    }
451}