//! simd_r_drive_entry_handle/entry_handle.rs
1use super::constants::METADATA_SIZE;
2use crate::EntryMetadata;
3use memmap2::{Mmap, MmapMut};
4use std::ops::Range;
5use std::sync::Arc;
6
/// Zero-copy owner of a sub-slice in an `Arc<Mmap>`.
/// Provides access to the bytes of an entry as long as this struct is alive.
///
/// Cloning is cheap: the derived `Clone` bumps the `Arc` refcount and copies
/// only the range and metadata — the mapped bytes themselves are never copied.
#[derive(Debug, Clone)]
pub struct EntryHandle {
    /// The underlying memory map. Shared; kept alive as long as any handle
    /// (or any `Arc` clone) exists.
    pub mmap_arc: Arc<Mmap>,

    /// The range of bytes within the memory-mapped file corresponding to the payload.
    pub range: Range<usize>,

    /// Metadata associated with the entry, including key hash and checksum.
    pub metadata: EntryMetadata,
}
20
21impl EntryHandle {
22 /// Provides access to the raw pointer of the memory-mapped file for testing.
23 ///
24 /// This method allows unit tests to verify that multiple `EntryHandle` instances
25 /// share the same underlying memory map, ensuring zero-copy behavior.
26 ///
27 /// # Returns
28 /// - A raw pointer to the underlying `Mmap`.
29 #[cfg(test)]
30 pub fn arc_ptr(&self) -> *const Mmap {
31 Arc::as_ptr(&self.mmap_arc)
32 }
33}
34
35/// Enable `*entry_handle` to act like a `&[u8]`
36impl std::ops::Deref for EntryHandle {
37 type Target = [u8];
38
39 fn deref(&self) -> &Self::Target {
40 self.as_slice()
41 }
42}
43
44/// Let us do: `assert_eq!(entry_handle, b"some bytes")`
45impl PartialEq<[u8]> for EntryHandle {
46 fn eq(&self, other: &[u8]) -> bool {
47 self.as_slice() == other
48 }
49}
50
51/// Allow comparisons with `&[u8]`
52impl PartialEq<&[u8]> for EntryHandle {
53 fn eq(&self, other: &&[u8]) -> bool {
54 self.as_slice() == *other
55 }
56}
57
58/// Allow comparisons with `Vec<u8>`
59impl PartialEq<Vec<u8>> for EntryHandle {
60 fn eq(&self, other: &Vec<u8>) -> bool {
61 self.as_slice() == other.as_slice()
62 }
63}
64
65impl EntryHandle {
66 /// Construct an in-memory, read-only entry backed by an anonymous mmap.
67 ///
68 /// This copies `bytes` **once** into an anonymous `MmapMut`, then seals it
69 /// to a read-only `Mmap`. The result behaves like a file-backed entry
70 /// (zero-copy reads via `as_slice()`), but never touches the filesystem.
71 ///
72 /// The `EntryMetadata` is populated using the supplied `key_hash`, a
73 /// `prev_offset` of `0` (not used for in-memory entries), and a 32-bit
74 /// checksum computed by the same algorithm used in `is_valid_checksum()`.
75 ///
76 /// # When to use
77 /// - Unit tests and benchmarks.
78 /// - Backends that ingest bytes from the network or RAM but still want an
79 /// `EntryHandle` with mmap-like semantics.
80 ///
81 /// # Cost
82 /// - One O(len) copy into the anonymous mapping.
83 ///
84 /// # Errors
85 /// - Returns `std::io::Error` if the platform cannot create an anonymous
86 /// mapping or the mapping fails.
87 pub fn from_owned_bytes_anon(bytes: &[u8], key_hash: u64) -> std::io::Result<Self> {
88 // 1) anon mmap (writable)
89 let mut mm = MmapMut::map_anon(bytes.len())?;
90 // 2) copy once
91 mm[..bytes.len()].copy_from_slice(bytes);
92 // 3) freeze to read-only Mmap
93 let ro: Mmap = mm.make_read_only()?;
94 // 4) compute checksum the same way your store does
95 let checksum = {
96 let mut hasher = crc32fast::Hasher::new();
97 hasher.update(bytes);
98 hasher.finalize().to_le_bytes()
99 };
100
101 // 5) fill metadata; set prev_offset to 0 (unused for in-memory)
102 let metadata = EntryMetadata {
103 key_hash,
104 prev_offset: 0,
105 checksum,
106 };
107
108 Ok(Self {
109 mmap_arc: Arc::new(ro),
110 range: 0..bytes.len(),
111 metadata,
112 })
113 }
114
115 /// Wrap a region in an existing `Arc<Mmap)` without copying.
116 ///
117 /// The caller provides the shared mapping, a `range` within that mapping
118 /// that contains the payload bytes, and the `EntryMetadata` corresponding
119 /// to those bytes.
120 ///
121 /// ### Safety & Correctness
122 /// - **Bounds:** `range` must lie entirely within the mapping.
123 /// - **Lifetime:** The `Arc<Mmap>` is cloned and keeps the mapping alive as
124 /// long as any `EntryHandle` exists.
125 /// - **Integrity:** `metadata.checksum` should match the bytes in `range`
126 /// (use `is_valid_checksum()` to verify).
127 ///
128 /// This is the zero-copy path used by file-backed stores.
129 pub fn from_arc_mmap(
130 mmap_arc: Arc<Mmap>,
131 range: Range<usize>,
132 metadata: EntryMetadata,
133 ) -> Self {
134 Self {
135 mmap_arc,
136 range,
137 metadata,
138 }
139 }
140
141 /// Returns a zero-copy reference to the sub-slice of bytes corresponding to the entry.
142 ///
143 /// This method ensures **no additional allocations** occur by referencing the memory-mapped
144 /// region instead of copying data.
145 ///
146 /// # Returns
147 /// - A byte slice (`&[u8]`) referencing the original data.
148 ///
149 /// # Zero-Copy Guarantee
150 /// - The returned slice directly references the **underlying memory-mapped file**.
151 pub fn as_slice(&self) -> &[u8] {
152 // Returning a *cloned reference* to the memory-mapped data rather than
153 // cloning the values. This is expected behavior for zero-copy access.
154 &self.mmap_arc[self.range.clone()]
155 }
156
157 /// Creates a new `EntryHandle` with the same memory-mapped reference.
158 ///
159 /// This method provides a way to duplicate an `EntryHandle` **without cloning the underlying data**.
160 /// Instead, it increments the reference count on the `Arc<Mmap>`, ensuring that the same memory-mapped
161 /// file remains accessible across multiple handles.
162 ///
163 /// # Usage
164 ///
165 /// - This is useful when multiple parts of the system need to access the same entry
166 /// without creating redundant copies.
167 /// - Unlike `Clone`, which is not implemented for `EntryHandle`, this method allows controlled
168 /// duplication without unnecessary allocations.
169 ///
170 /// # Returns
171 /// - A new `EntryHandle` referencing the same underlying data and metadata.
172 ///
173 /// # Zero-Copy Guarantee
174 /// - Both the original and cloned handle will refer to the same memory-mapped region.
175 /// - The `Arc<Mmap>` ensures the mapped file stays valid as long as any handle is in scope.
176 ///
177 /// # Safety Considerations
178 /// - Do **not** use this method if you need to modify data, as all handles share the same immutable mapping.
179 pub fn clone_arc(&self) -> Self {
180 Self {
181 mmap_arc: Arc::clone(&self.mmap_arc), // Keeps same mmap reference
182 range: self.range.clone(),
183 metadata: self.metadata.clone(),
184 }
185 }
186
187 /// Returns a reference to the entry’s parsed metadata.
188 ///
189 /// This metadata includes:
190 /// - `key_hash`: The hash of the key.
191 /// - `prev_offset`: The offset of the previous entry.
192 /// - `checksum`: A checksum for verifying data integrity.
193 ///
194 /// # Returns
195 /// - A reference to the `EntryMetadata` struct.
196 pub fn metadata(&self) -> &EntryMetadata {
197 &self.metadata
198 }
199
200 /// Returns the payload size of the entry.
201 ///
202 /// # Returns
203 /// - The size of the payload in bytes.
204 pub fn size(&self) -> usize {
205 self.range.len()
206 }
207
208 /// Returns the total size of the entry, including metadata.
209 ///
210 /// # Returns
211 /// - The size of the payload plus metadata in bytes.
212 pub fn file_size(&self) -> usize {
213 self.range.len() + METADATA_SIZE
214 }
215
216 /// Returns the 64-bit hash of this entry’s key.
217 ///
218 /// The value is read from the entry’s metadata exactly as it was written:
219 /// for APIs that accept raw keys it is `compute_hash(key)`; for APIs that
220 /// accept pre-hashed keys (e.g. `write_with_key_hash`, `batch_write_with_key_hashes`)
221 /// it is the caller-supplied hash. No hashing is performed when reading.
222 ///
223 /// This hash is used by the index for fast lookup and collision checks.
224 ///
225 /// # Returns
226 /// - A 64-bit unsigned integer representing the key hash.
227 pub fn key_hash(&self) -> u64 {
228 self.metadata.key_hash
229 }
230
231 /// Returns the checksum of the entry's payload.
232 ///
233 /// The checksum is a 32-bit value used for data integrity verification.
234 ///
235 /// # Returns
236 /// - A 32-bit unsigned integer representing the checksum.
237 pub fn checksum(&self) -> u32 {
238 u32::from_le_bytes(self.metadata.checksum)
239 }
240
241 /// Returns the raw checksum bytes of the entry.
242 ///
243 /// This method provides direct access to the checksum bytes for additional processing.
244 ///
245 /// # Returns
246 /// - A `[u8; 4]` array containing the raw checksum.
247 pub fn raw_checksum(&self) -> [u8; 4] {
248 self.metadata.checksum
249 }
250
251 /// Validates the integrity of the entry using its stored checksum.
252 ///
253 /// This method computes the checksum of the payload **in chunks** (streaming)
254 /// to match how it was originally computed during writes. This ensures that
255 /// large entries and small entries are handled consistently.
256 ///
257 /// # Returns
258 /// - `true` if the computed checksum matches the stored value.
259 /// - `false` if the data has been corrupted.
260 pub fn is_valid_checksum(&self) -> bool {
261 let mut hasher = crc32fast::Hasher::new();
262 let chunk_size = 4096; // Process in 4KB chunks
263 let data = self.as_slice();
264
265 // Compute checksum in a streaming manner
266 let mut offset = 0;
267 while offset < data.len() {
268 let end = std::cmp::min(offset + chunk_size, data.len());
269 hasher.update(&data[offset..end]);
270 offset = end;
271 }
272
273 let computed = hasher.finalize().to_le_bytes();
274 self.metadata.checksum == computed
275 }
276
277 /// Returns the absolute start byte offset within the mapped file.
278 ///
279 /// This offset represents where the payload begins in the memory-mapped storage.
280 ///
281 /// # Returns
282 /// - A `usize` representing the start offset.
283 pub fn start_offset(&self) -> usize {
284 self.range.start
285 }
286
287 /// Returns the absolute end byte offset within the mapped file.
288 ///
289 /// This offset represents where the payload ends in the memory-mapped storage.
290 ///
291 /// # Returns
292 /// - A `usize` representing the end offset.
293 pub fn end_offset(&self) -> usize {
294 self.range.end
295 }
296
297 /// Returns the byte offset range for the entry within the mapped file.
298 ///
299 /// This provides a structured way to access the start and end offsets.
300 ///
301 /// # Returns
302 /// - A `Range<usize>` representing the byte range of the entry.
303 pub fn offset_range(&self) -> Range<usize> {
304 self.range.clone()
305 }
306
307 /// Returns the pointer range in the current process's memory.
308 ///
309 /// This is the actual *virtual address* space that the entry occupies.
310 /// - The `start_ptr` points to the beginning of the payload in memory.
311 /// - The `end_ptr` is `start_ptr + payload_length`.
312 ///
313 /// **Note**: These addresses are valid only in this process and can become
314 /// invalid if the memory map is remapped or unmapped.
315 pub fn address_range(&self) -> std::ops::Range<*const u8> {
316 let slice = self.as_slice();
317 let start_ptr = slice.as_ptr();
318 let end_ptr = unsafe { start_ptr.add(slice.len()) };
319 start_ptr..end_ptr
320 }
321
    /// Returns a reference to the shared memory-mapped file.
    ///
    /// This exposes the underlying `Arc<Mmap>` used to back the entry's data.
    ///
    /// # Returns
    /// - A reference to the `Arc<Mmap>` instance holding the memory-mapped file.
    ///
    /// # Use Cases
    /// - Verifying that two `EntryHandle`s share the same `Mmap` backing.
    /// - Providing foreign-language bindings (e.g., Python) access to shared memory.
    /// - Internal testing or diagnostics (e.g., checking refcounts).
    ///
    /// # Safety Considerations
    /// - Do **not** attempt to unmap, remap, or modify the memory manually.
    /// - The returned mapping is shared and valid only as long as an `Arc` exists.
    ///
    /// # Feature Flag
    /// This method is gated behind the `expose-internal-api` Cargo feature:
    ///
    /// ```toml
    /// [features]
    /// expose-internal-api = []
    /// ```
    ///
    /// It is **not part of the stable public API** and may be changed or removed
    /// in future versions. It is intended for internal or FFI-bound use only.
    #[cfg(feature = "expose-internal-api")]
    pub fn mmap_arc(&self) -> &Arc<Mmap> {
        &self.mmap_arc
    }
352}
353
/// Zero-copy Arrow Buffer views over this entry.
///
/// Safety: the pointer comes from an `Arc<Mmap>` and stays valid for the
/// life of the returned `Buffer` via the captured owner. The owner is an
/// `Arc<EntryHandle>`, which keeps the underlying `Arc<Mmap>` alive.
#[cfg(feature = "arrow")]
impl EntryHandle {
    /// View the payload as an Arrow `Buffer` without copying.
    ///
    /// Feature: `arrow`
    ///
    /// Returns a zero-copy `arrow::buffer::Buffer` whose contents point at
    /// the same bytes as `self.as_slice()`. The returned `Buffer` captures
    /// an `Arc<EntryHandle>` internally, which keeps the `Arc<Mmap>` alive
    /// for the lifetime of the `Buffer`.
    ///
    /// No allocation or memcpy of the payload occurs. The only work here is
    /// constructing the `Buffer` and cloning the `Arc` owner.
    ///
    /// Safety
    /// ------
    /// Internally uses `Buffer::from_custom_allocation`, which assumes:
    /// - `self.as_slice().as_ptr()` is valid for `self.size()` bytes.
    /// - The memory remains valid and immutable for the `Buffer` lifetime.
    /// - The pointer is suitably aligned for `u8`.
    ///
    /// Panics
    /// ------
    /// Rust guarantees `&[u8]::as_ptr()` is non-null, even for empty slices.
    /// The `NonNull::new(...).expect(...)` check is defensive and should
    /// never panic.
    pub fn as_arrow_buffer(&self) -> arrow::buffer::Buffer {
        use arrow::buffer::Buffer;
        use std::ptr::NonNull;
        use std::sync::Arc;

        let slice = self.as_slice();
        // Debug-only sanity checks: payload pointers and file offsets are
        // expected to honor PAYLOAD_ALIGNMENT for downstream zero-copy users.
        #[cfg(any(test, debug_assertions))]
        {
            use crate::{
                constants::PAYLOAD_ALIGNMENT, debug_assert_aligned, debug_assert_aligned_offset,
            };
            // Assert actual pointer alignment.
            debug_assert_aligned(slice.as_ptr(), PAYLOAD_ALIGNMENT as usize);
            // Assert derived file offset alignment.
            debug_assert_aligned_offset(self.range.start as u64);
        }

        let ptr = NonNull::new(slice.as_ptr() as *mut u8).expect("non-null slice ptr");
        // SAFETY: `ptr` is valid for `slice.len()` bytes, immutable, and kept
        // alive by the cloned `Arc<EntryHandle>` owner captured by the Buffer.
        unsafe { Buffer::from_custom_allocation(ptr, slice.len(), Arc::new(self.clone())) }
    }

    /// Convert this handle into an Arrow `Buffer` without copying.
    ///
    /// Feature: `arrow`
    ///
    /// Like [`as_arrow_buffer`](Self::as_arrow_buffer) but consumes `self`
    /// to avoid one extra `Arc` clone. This is otherwise identical to the
    /// borrowing variant and still performs zero copies of the payload.
    ///
    /// Safety
    /// ------
    /// Same assumptions as [`as_arrow_buffer`](Self::as_arrow_buffer):
    /// - Pointer is valid for `len` bytes and remains immutable while the
    ///   `Buffer` lives.
    /// - Alignment is suitable for `u8`.
    ///
    /// Panics
    /// ------
    /// See [`as_arrow_buffer`](Self::as_arrow_buffer). The check is
    /// defensive and should never panic.
    pub fn into_arrow_buffer(self) -> arrow::buffer::Buffer {
        use arrow::buffer::Buffer;
        use std::ptr::NonNull;
        use std::sync::Arc;

        let slice = self.as_slice();
        // Debug-only sanity checks (same as the borrowing variant).
        #[cfg(any(test, debug_assertions))]
        {
            use crate::{
                constants::PAYLOAD_ALIGNMENT, debug_assert_aligned, debug_assert_aligned_offset,
            };
            // Assert actual pointer alignment.
            debug_assert_aligned(slice.as_ptr(), PAYLOAD_ALIGNMENT as usize);
            // Assert derived file offset alignment.
            debug_assert_aligned_offset(self.range.start as u64);
        }

        let ptr = NonNull::new(slice.as_ptr() as *mut u8).expect("non-null slice ptr");
        // SAFETY: `ptr` is valid for `slice.len()` bytes, immutable, and kept
        // alive by moving `self` into the Arc owner captured by the Buffer.
        unsafe { Buffer::from_custom_allocation(ptr, slice.len(), Arc::new(self)) }
    }
}
446
impl AsRef<[u8]> for EntryHandle {
    /// Borrow the payload bytes; zero-copy, identical to `as_slice()`.
    fn as_ref(&self) -> &[u8] {
        self.as_slice()
    }
}