obj-core 1.1.0

Storage engine internals for the obj embedded document database (pager, WAL, B-tree, codec, catalog).
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
//! Cross-process byte-range file locking.
//!
//! M6 issue #44. POSIX uses OFD `fcntl` locks (`F_OFD_SETLK` /
//! `F_OFD_SETLKW`) — kernel-tracked per-fd, fork-safe, automatically
//! released on process exit. Windows uses `LockFileEx` /
//! `UnlockFileEx`.
//!
//! Locks anchor against a dedicated `<db>.obj-lock` sidecar file
//! created by `Db::open` next to the main database (mirroring the
//! existing `<db>.obj-wal` sidecar convention). Using a sidecar
//! decouples the lock-byte range from any region the pager may
//! read or write, so the lock byte offsets can be the same on
//! every platform and need not be placed past EOF:
//!
//! - [`WRITER_LOCK_OFFSET`] = 96 (exclusive, 1 byte).
//! - [`READER_LOCK_RANGE_OFFSET`] = 97..128 (shared, 31 slots).
//!
//! On Windows `LockFileEx` produces *mandatory* byte-range locks.
//! Issue #1: prior versions placed the Windows lock anchor at
//! `0x4000_0000` (past EOF of an empty file) so that pager I/O
//! could not overlap the locked range. That assumption broke when
//! the main DB file grew past 1 GiB — any page write whose offset
//! crossed `0x4000_0000` failed with `ERROR_LOCK_VIOLATION`. The
//! sidecar fixes the hazard structurally: the lock handle and the
//! pager handle target *different files*, so no pager I/O can
//! ever overlap a lock byte regardless of how large the DB grows.
//! See `docs/format.md` § File locking.
//!
//! The lock state lives in the OS kernel's per-fd lock table — the
//! bytes on disk are never read or written by obj. See
//! `docs/format.md` § File locking and § Reader snapshots (MVCC)
//! for the user-visible protocol.
//!
//! # `unsafe` policy
//!
//! `rustix::fs::fcntl_lock` does whole-file locking with `F_SETLK*`,
//! not OFD locks. We therefore call `libc::fcntl` directly with the
//! OFD command IDs. On Windows we call `LockFileEx` /
//! `UnlockFileEx` via `windows-sys`. Every `unsafe` block carries a
//! `// SAFETY:` comment per power-of-ten Rule 8.

// Re-introduce unsafe inside this submodule.  The parent
// `platform/mod.rs` is `#![deny(unsafe_code)]`; we override the
// deny here because the OS-side lock syscalls are pointer-based
// and `rustix` does not expose the OFD variants.
#![allow(unsafe_code)]

use std::os::raw::c_int;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

use crate::error::{Error, LockKind, Result};
use crate::platform::FileHandle;

/// Byte offset of the `WRITER_LOCK` (1 byte, exclusive) inside the
/// `<db>.obj-lock` sidecar file.
///
/// The lock anchor lives at the same offset on every platform
/// because the sidecar file is never read or written by the pager
/// — its only purpose is to carry kernel-side lock metadata. On
/// POSIX this byte exists inside a 128-byte sidecar (see
/// `Db::open`'s `set_len(128)` on the sidecar). OFD locks are
/// advisory and would tolerate locks past EOF, but giving the byte
/// a physical existence is the conservative choice across kernels.
/// On Windows `LockFileEx` produces **mandatory** byte-range
/// locks — the sidecar guarantees pager I/O cannot overlap the
/// locked region regardless of how large the main DB grows
/// (issue #1; the previous past-EOF strategy broke at >1 GiB).
pub const WRITER_LOCK_OFFSET: u64 = 96;
/// Byte offset of the first reader-lock slot inside the
/// `<db>.obj-lock` sidecar. See [`WRITER_LOCK_OFFSET`] for the
/// sidecar rationale. The slot range starts on the byte right
/// after the writer byte, so the two never overlap.
pub const READER_LOCK_RANGE_OFFSET: u64 = 97;
/// Length of the reader-lock byte range. 31 slots, one byte each,
/// covering offsets `97..=127`, i.e.
/// `READER_LOCK_RANGE_OFFSET .. READER_LOCK_RANGE_OFFSET + READER_LOCK_RANGE_LEN`.
pub const READER_LOCK_RANGE_LEN: u64 = 31;

/// Initial backoff between busy-loop retries.  Power-of-ten Rule 2:
/// the retry loop is bounded by `deadline / INITIAL_BACKOFF` so an
/// exhausted budget surfaces deterministically. `retry_until_acquired`
/// derives its iteration cap from the timeout expressed in
/// milliseconds, which matches this 1 ms starting step.
const INITIAL_BACKOFF: Duration = Duration::from_millis(1);
/// Cap on the per-retry sleep so a long timeout stays responsive.
/// The backoff doubles after every failed attempt until it reaches
/// this value.
const MAX_BACKOFF: Duration = Duration::from_millis(100);

/// RAII guard for a held `WRITER_LOCK` byte. Dropping the guard
/// releases the OS-side lock; [`WriterLock::release`] does the same
/// but lets the caller see a failed unlock.
///
/// The guard neither owns nor borrows the file handle — it only
/// keeps a copy of the raw descriptor the lock was taken on. The
/// underlying lock is per-fd, so as long as the fd survives,
/// releasing from any thread is sound.
///
/// NOTE(review): nothing in this type ties its lifetime to the
/// originating `FileHandle`. Callers presumably keep the handle
/// open for as long as the guard lives — confirm, because an unlock
/// issued through a closed (or recycled) descriptor would not hit
/// the intended file.
#[derive(Debug)]
#[must_use = "WriterLock releases the OS-side lock when dropped"]
pub struct WriterLock {
    /// Raw descriptor returned by `FileHandle::raw_fd` at
    /// acquisition time (a POSIX fd, or the Windows HANDLE carried
    /// through `c_int`).
    fd: c_int,
    /// Set by `release` before it unlocks so that `Drop` does not
    /// issue a second unlock.
    released: bool,
}

impl WriterLock {
    /// Release the writer lock now instead of waiting for `Drop`.
    ///
    /// Unlike `Drop`, which has to discard failures (panicking in a
    /// destructor is toxic), this hands any unlock error back to
    /// the caller.
    ///
    /// # Errors
    ///
    /// Returns `Error::Io` on the unlikely event that the OS
    /// rejects the unlock syscall.
    pub fn release(mut self) -> Result<()> {
        // Mark the guard as released *before* unlocking: `self` is
        // dropped when this function returns, and the flag keeps the
        // `Drop` impl from unlocking a second time.
        let already_released = std::mem::replace(&mut self.released, true);
        if already_released {
            Ok(())
        } else {
            unlock_range(self.fd, WRITER_LOCK_OFFSET, 1)
        }
    }
}

impl Drop for WriterLock {
    /// Best-effort release of the writer byte if `release` was not
    /// called. Errors are discarded because they cannot be reported
    /// from a destructor.
    fn drop(&mut self) {
        if self.released {
            return;
        }
        let _ = unlock_range(self.fd, WRITER_LOCK_OFFSET, 1);
    }
}

/// RAII guard for a held reader-lock byte. Dropping the guard
/// releases the OS-side lock; [`ReaderLock::release`] does the same
/// but lets the caller see a failed unlock.
///
/// NOTE(review): like `WriterLock`, the guard only stores a raw
/// descriptor and is not lifetime-bound to the `FileHandle` it came
/// from — presumably callers keep that handle open while the guard
/// is alive; confirm.
#[derive(Debug)]
#[must_use = "ReaderLock releases the OS-side lock when dropped"]
pub struct ReaderLock {
    /// Raw descriptor returned by `FileHandle::raw_fd` at
    /// acquisition time.
    fd: c_int,
    /// Absolute byte offset of the held slot, i.e.
    /// `READER_LOCK_RANGE_OFFSET` plus the slot index.
    slot: u64,
    /// Set by `release` before it unlocks so that `Drop` does not
    /// issue a second unlock.
    released: bool,
}

impl ReaderLock {
    /// Absolute byte offset (inside the lock file) of the reader
    /// slot held by this guard.  Intended for diagnostics.
    #[must_use]
    pub fn slot(&self) -> u64 {
        self.slot
    }

    /// Release the reader slot now instead of waiting for `Drop`,
    /// handing any unlock failure back to the caller.
    ///
    /// # Errors
    ///
    /// Returns `Error::Io` on the unlikely event that the OS
    /// rejects the unlock syscall.
    pub fn release(mut self) -> Result<()> {
        // Flip the flag first so the `Drop` that runs when `self`
        // goes out of scope does not unlock again.
        let already_released = std::mem::replace(&mut self.released, true);
        if already_released {
            Ok(())
        } else {
            unlock_range(self.fd, self.slot, 1)
        }
    }
}

impl Drop for ReaderLock {
    /// Best-effort release of the held slot if `release` was not
    /// called. Errors are discarded because they cannot be reported
    /// from a destructor.
    fn drop(&mut self) {
        if self.released {
            return;
        }
        let _ = unlock_range(self.fd, self.slot, 1);
    }
}

impl FileHandle {
    /// Make a single, non-blocking attempt at the `WRITER_LOCK`.
    ///
    /// Yields `Ok(Some(guard))` when the lock was taken and
    /// `Ok(None)` when another holder already has it.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Io`] on syscall failure other than
    /// "would-block / already-locked", including the "unsupported
    /// target" error from the OFD gate.
    pub fn try_lock_writer(&self) -> Result<Option<WriterLock>> {
        ensure_ofd_locks_supported()?;
        let fd = self.raw_fd();
        let acquired = try_lock_range(fd, WRITER_LOCK_OFFSET, 1, LockMode::Exclusive)?;
        // Build the guard lazily: constructing one on the failure
        // path would run its `Drop` and issue an unlock for a lock
        // this handle never obtained.
        Ok(acquired.then(|| WriterLock {
            fd,
            released: false,
        }))
    }

    /// Acquire the `WRITER_LOCK`, polling with bounded exponential
    /// backoff until it is obtained or `timeout` runs out.
    ///
    /// # Errors
    ///
    /// - [`Error::Busy`] with `LockKind::Writer` on timeout.
    /// - [`Error::Io`] on any non-"would-block" syscall failure.
    pub fn lock_writer(&self, timeout: Duration) -> Result<WriterLock> {
        ensure_ofd_locks_supported()?;
        let fd = self.raw_fd();
        let attempt = || try_lock_range(fd, WRITER_LOCK_OFFSET, 1, LockMode::Exclusive);
        retry_until_acquired(timeout, LockKind::Writer, attempt).map(|()| WriterLock {
            fd,
            released: false,
        })
    }

    /// Acquire any one of the 31 reader-lock slots in shared mode,
    /// retrying with bounded backoff until either acquired or
    /// `timeout` elapses.
    ///
    /// The slot is chosen with a per-process round-robin counter so
    /// concurrent readers in the same process do not all race for
    /// the same byte.  Shared locks compose, so falling on the same
    /// byte is not a correctness bug — just a hot-spot the spread
    /// avoids in practice.
    ///
    /// # Errors
    ///
    /// - [`Error::Busy`] with `LockKind::Reader` on timeout (very
    ///   rare — shared locks rarely contend).
    /// - [`Error::Io`] on syscall failure.
    pub fn lock_reader(&self, timeout: Duration) -> Result<ReaderLock> {
        ensure_ofd_locks_supported()?;
        let fd = self.raw_fd();
        let start_slot = next_reader_slot();
        // Try every slot once round-robin; if all 31 slots are
        // contended (very rare), fall back to bounded retry on the
        // start slot until the deadline expires.
        let mut last_err: Option<Error> = None;
        for offset in 0..READER_LOCK_RANGE_LEN {
            let slot = READER_LOCK_RANGE_OFFSET + ((start_slot + offset) % READER_LOCK_RANGE_LEN);
            match try_lock_range(fd, slot, 1, LockMode::Shared) {
                Ok(true) => {
                    return Ok(ReaderLock {
                        fd,
                        slot,
                        released: false,
                    });
                }
                Ok(false) => {}
                Err(e) => last_err = Some(e),
            }
        }
        if let Some(err) = last_err {
            return Err(err);
        }
        // All slots reported "would-block" — fall back to busy-wait
        // on the start slot with the caller's timeout.
        let slot = READER_LOCK_RANGE_OFFSET + start_slot;
        retry_until_acquired(timeout, LockKind::Reader, || {
            try_lock_range(fd, slot, 1, LockMode::Shared)
        })?;
        Ok(ReaderLock {
            fd,
            slot,
            released: false,
        })
    }

    /// Raw descriptor of the underlying file: the fd on POSIX, or
    /// the HANDLE (obtained via `as_raw_handle`) on Windows.
    /// Internal to the platform layer.
    #[cfg(unix)]
    fn raw_fd(&self) -> c_int {
        std::os::unix::io::AsRawFd::as_raw_fd(self.file_ref())
    }

    #[cfg(windows)]
    fn raw_fd(&self) -> c_int {
        use std::os::windows::io::AsRawHandle;
        // Carry the HANDLE through the `c_int` slot.  The lock
        // syscalls cast it back to HANDLE before use; `c_int` is
        // chosen so the cross-platform signature stays uniform.
        //
        // NOTE(review): on 64-bit Windows a HANDLE is pointer-sized
        // while `c_int` is 32 bits, so this `as` cast keeps only the
        // low 32 bits and the later `fd as HANDLE` sign-extends them.
        // This presumably relies on Windows handles carrying only 32
        // significant bits — confirm, or widen the carrier type.
        self.file_ref().as_raw_handle() as c_int
    }
}

// ---------- internal helpers --------------------------------------

/// Per-process ticket counter used to spread threads of the same
/// process over different reader-slot bytes by default.  The counter
/// itself just keeps counting up; `next_reader_slot` reduces each
/// ticket modulo `READER_LOCK_RANGE_LEN`.
static READER_ROUND_ROBIN: AtomicU64 = AtomicU64::new(0);

/// Draw the next ticket and map it to a slot index in
/// `0..READER_LOCK_RANGE_LEN`.
fn next_reader_slot() -> u64 {
    let ticket = READER_ROUND_ROBIN.fetch_add(1, Ordering::Relaxed);
    ticket % READER_LOCK_RANGE_LEN
}

/// Kind of byte-range lock requested from the platform primitives.
#[derive(Debug, Clone, Copy)]
enum LockMode {
    /// Exclusive lock: `F_WRLCK` on POSIX, `LOCKFILE_EXCLUSIVE_LOCK`
    /// on Windows. Used for the writer byte.
    Exclusive,
    /// Shared lock: `F_RDLCK` on POSIX, no exclusive flag on
    /// Windows. Used for the reader slots.
    Shared,
}

/// Bounded retry harness shared by `lock_writer` / `lock_reader`.
/// Power-of-ten Rule 2: the loop's upper bound is
/// `deadline.elapsed() < timeout`; once `Instant::now() >= deadline`
/// the function returns `Err(Error::Busy)`.
fn retry_until_acquired<F>(timeout: Duration, kind: LockKind, mut once: F) -> Result<()>
where
    F: FnMut() -> Result<bool>,
{
    let start = Instant::now();
    let mut backoff = INITIAL_BACKOFF;
    // Rule-2 upper bound on iteration count: `timeout` / 1 ms.  With
    // exponential backoff capped at 100 ms, the real count is far
    // lower than this; the explicit bound is defensive.
    let timeout_millis = u64::try_from(timeout.as_millis()).unwrap_or(u64::MAX);
    let max_iters: u64 = timeout_millis.saturating_add(2);
    let mut iters: u64 = 0;
    loop {
        iters = iters.saturating_add(1);
        if iters > max_iters.saturating_add(64) {
            return Err(Error::Busy { kind });
        }
        if once()? {
            return Ok(());
        }
        if start.elapsed() >= timeout {
            return Err(Error::Busy { kind });
        }
        std::thread::sleep(backoff);
        backoff = (backoff * 2).min(MAX_BACKOFF);
    }
}

// ---------- platform-specific lock primitives ---------------------

/// Build a POSIX `struct flock` for the given byte range.  The
/// numeric type of `l_type` / `l_whence` differs per platform
/// (`i16` on Linux/macOS, `c_short` typedef elsewhere); we use
/// `try_from` rather than `as` so clippy's pedantic
/// `cast_possible_truncation` lint stays clean (Rule 10).
#[cfg(unix)]
fn build_flock(l_type: i32, offset: u64, len: u64) -> Result<libc::flock> {
    // `libc::flock.l_type` has type `libc::c_short` on every POSIX
    // target.  The narrowing here is exact because every libc
    // constant we pass (`F_WRLCK`, `F_RDLCK`, `F_UNLCK`) fits in
    // `i16` on every supported target.  `libc::F_WRLCK` is typed
    // `i32` on Linux but `i16` on macOS; we widen at the call site
    // and narrow here so the common helper has one signature.
    let l_type_short =
        libc::c_short::try_from(l_type).map_err(|_| Error::InvalidArgument("lock l_type"))?;
    let l_whence_short = libc::c_short::try_from(libc::SEEK_SET)
        .map_err(|_| Error::InvalidArgument("lock l_whence"))?;
    Ok(libc::flock {
        l_type: l_type_short,
        l_whence: l_whence_short,
        l_start: offset_to_off_t(offset)?,
        l_len: offset_to_off_t(len)?,
        l_pid: 0,
        #[cfg(target_os = "freebsd")]
        l_sysid: 0,
    })
}

#[cfg(unix)]
fn try_lock_range(fd: c_int, offset: u64, len: u64, mode: LockMode) -> Result<bool> {
    // POSIX OFD lock: kernel-tracked per-fd, fork-safe, released on
    // exit.  Linux ≥ 3.15, macOS ≥ 10.14, FreeBSD ≥ 12.
    // libc::F_*LCK is `i16` on macOS, `i32` on Linux; the `.into()`
    // widens on macOS and is a no-op on Linux.
    #[allow(clippy::useless_conversion)]
    let l_type: i32 = match mode {
        LockMode::Exclusive => libc::F_WRLCK.into(),
        LockMode::Shared => libc::F_RDLCK.into(),
    };
    let flock = build_flock(l_type, offset, len)?;
    // SAFETY: `fd` is a borrowed fd from the FileHandle that
    // outlives this call (we never store the fd beyond the
    // call). The third argument matches the kernel's expected
    // `struct flock*` for F_OFD_SETLK.  The kernel writes nothing
    // through the pointer for the SETLK variant.
    let ret = unsafe { libc::fcntl(fd, ofd_setlk_cmd(), &raw const flock) };
    if ret == 0 {
        return Ok(true);
    }
    // SAFETY: errno is a thread-local set by the libc call we just
    // made; calling `__errno_location()` (and friends) is sound on
    // every POSIX target libc supports.
    let errno = unsafe { *libc_errno() };
    if errno == libc::EAGAIN || errno == libc::EACCES {
        // POSIX permits either EAGAIN or EACCES for "would-block".
        return Ok(false);
    }
    Err(Error::Io(std::io::Error::from_raw_os_error(errno)))
}

#[cfg(unix)]
fn unlock_range(fd: c_int, offset: u64, len: u64) -> Result<()> {
    // libc::F_UNLCK is `i16` on macOS, `i32` on Linux; see try_lock_range.
    #[allow(clippy::useless_conversion)]
    let flock = build_flock(libc::F_UNLCK.into(), offset, len)?;
    // SAFETY: same contract as `try_lock_range` above. F_OFD_SETLK
    // with l_type = F_UNLCK is the standard release primitive.
    let ret = unsafe { libc::fcntl(fd, ofd_setlk_cmd(), &raw const flock) };
    if ret == 0 {
        return Ok(());
    }
    // SAFETY: see comment in try_lock_range.
    let errno = unsafe { *libc_errno() };
    Err(Error::Io(std::io::Error::from_raw_os_error(errno)))
}

/// `true` iff the build target provides OFD (open-file-description)
/// `fcntl` locks — `F_OFD_SETLK` / `F_OFD_SETLKW`. These are the
/// only POSIX lock primitive obj's concurrency model can rely on:
/// they are tracked PER-fd, so two `Db` handles to the same file in
/// one process correctly exclude each other, and they are released
/// on the owning fd's close rather than coalescing across the whole
/// process.
///
/// Classic POSIX `F_SETLK` locks are tracked PER-PROCESS: a second
/// `Db` handle in the same process would silently share (and on the
/// first handle's close, silently drop) the first handle's lock,
/// breaking the single-writer invariant without any error. Rather
/// than fall back to that unsound primitive (#30, #44), obj refuses
/// to lock on a target without OFD locks — see
/// [`ensure_ofd_locks_supported`].
///
/// # Supported-target matrix
///
/// | Target | OFD locks | obj locking |
/// |---|---|---|
/// | Linux ≥ 3.15 / Android | yes (`F_OFD_SETLK` = 37) | supported |
/// | macOS ≥ 10.14 / iOS (Apple) | yes (`F_OFD_SETLK` = 90) | supported |
/// | Windows | n/a (`LockFileEx`) | supported (separate path) |
/// | FreeBSD / other POSIX | not exported by `libc` | **refused at open** |
///
/// Windows uses a completely separate `LockFileEx` path and never
/// consults this constant; it is only meaningful on `unix`.
///
/// The `cfg` predicate below must stay in sync with the target arms
/// of `ofd_setlk_cmd`, which supplies the numeric command id for
/// exactly these targets.
#[cfg(unix)]
const TARGET_HAS_OFD_LOCKS: bool = cfg!(any(
    target_os = "linux",
    target_os = "android",
    target_vendor = "apple",
));

/// Hard, documented gate for the non-OFD targets (#30). Called at
/// the head of every lock-acquisition entry point so the failure
/// surfaces at `Db::open` time rather than as silent, per-process
/// lock coalescing later.
///
/// On Linux/macOS this is a compile-time-`true` check the optimiser
/// erases — the lock fast path is byte-for-byte unchanged. On a
/// target without OFD locks it returns
/// [`std::io::ErrorKind::Unsupported`] wrapped in [`Error::Io`].
///
/// # Errors
///
/// Returns [`Error::Io`] with `ErrorKind::Unsupported` when the
/// build target lacks OFD `fcntl` locks.
#[cfg(unix)]
fn ensure_ofd_locks_supported() -> Result<()> {
    if TARGET_HAS_OFD_LOCKS {
        return Ok(());
    }
    Err(Error::Io(std::io::Error::new(
        std::io::ErrorKind::Unsupported,
        "obj requires OFD (open-file-description) fcntl locks, which \
         this target does not provide; classic POSIX F_SETLK locks are \
         per-process and would silently break same-process multi-handle \
         exclusion (see obj-core platform::lock supported-target matrix)",
    )))
}

/// Numeric `fcntl` command id for `F_OFD_SETLK` on this target.
///
/// Linux and Apple define it in `<fcntl.h>` (`37` on Linux, `90` on
/// macOS 10.14+).  The values are hard-coded because `libc` does not
/// export the constant on every target.
///
/// Every public lock entry point calls
/// [`ensure_ofd_locks_supported`] first, so on targets without OFD
/// locks this function is never reached in practice. The final
/// branch exists only so the function is defined on every `unix`
/// target; it yields a deliberately invalid command id (`-1`) so
/// that, should a future refactor call this without the guard, the
/// `fcntl` fails with `EINVAL` instead of silently installing a
/// per-process lock (#30).
#[cfg(unix)]
fn ofd_setlk_cmd() -> c_int {
    if cfg!(any(target_os = "linux", target_os = "android")) {
        // F_OFD_SETLK on Linux
        37
    } else if cfg!(target_vendor = "apple") {
        // F_OFD_SETLK on macOS 10.14+
        90
    } else {
        // Unreachable in practice: `ensure_ofd_locks_supported`
        // rejects these targets at open. `-1` is an invalid `fcntl`
        // command (EINVAL) — never the unsound `F_SETLK` fallback.
        -1
    }
}

#[cfg(unix)]
fn offset_to_off_t(v: u64) -> Result<libc::off_t> {
    libc::off_t::try_from(v).map_err(|_| Error::InvalidArgument("lock offset overflow"))
}

/// Pointer to the calling thread's `errno`.
///
/// The accessor's name differs between libcs, so exactly one of the
/// `cfg` arms below is compiled in. Callers must read the value
/// right after the failing libc call, before anything else can
/// overwrite it.
#[cfg(unix)]
fn libc_errno() -> *mut c_int {
    // Apple platforms, FreeBSD and DragonFly expose `__error()`.
    #[cfg(any(
        target_vendor = "apple",
        target_os = "freebsd",
        target_os = "dragonfly",
    ))]
    // SAFETY: libc-provided extern "C" function with C linkage;
    // calling without arguments is always sound and returns a
    // thread-local pointer.
    unsafe {
        libc::__error()
    }
    // OpenBSD and NetBSD expose `__errno()`.
    #[cfg(any(target_os = "openbsd", target_os = "netbsd"))]
    // SAFETY: libc-provided extern "C" function with C linkage;
    // calling without arguments is always sound and returns a
    // thread-local pointer.
    unsafe {
        libc::__errno()
    }
    // Linux, Android and every remaining unix target use
    // `__errno_location()`.
    // NOTE(review): for targets other than Linux/Android this assumes
    // `libc` exports `__errno_location` — confirm per target.
    #[cfg(not(any(
        target_vendor = "apple",
        target_os = "freebsd",
        target_os = "dragonfly",
        target_os = "openbsd",
        target_os = "netbsd",
    )))]
    // SAFETY: libc-provided extern "C" function with C linkage;
    // calling without arguments is always sound and returns a
    // thread-local pointer.
    unsafe {
        libc::__errno_location()
    }
}

// ---------- Windows -----------------------------------------------

/// Windows counterpart to the unix [`ensure_ofd_locks_supported`]
/// gate (#30). Windows acquires byte-range locks via `LockFileEx`,
/// which are mandatory PER-handle — they do not have the
/// per-process coalescing hazard the POSIX `F_SETLK` fallback has —
/// so the Windows lock path is always supported and this gate is an
/// unconditional `Ok(())`. Defined so the cross-platform
/// `FileHandle::{try_lock_writer, lock_writer, lock_reader}` entry
/// points can call one guard regardless of target.
///
/// # Errors
///
/// Never fails on Windows; the `Result` return type exists only to
/// match the unix signature.
#[cfg(windows)]
// The unconditional `Ok(())` wrap is deliberate: a single `Result<()>`
// signature is shared across platforms so the lock entry points `?` one
// guard on every target. The unix counterpart genuinely can `Err`.
#[allow(clippy::unnecessary_wraps)]
fn ensure_ofd_locks_supported() -> Result<()> {
    Ok(())
}

/// Break a `u64` into its `(low, high)` 32-bit halves, the layout
/// the Windows `OVERLAPPED` ABI and `LockFileEx` length arguments
/// expect.
#[cfg(windows)]
// Truncation is the whole point here — each half keeps 32 distinct
// bits of the input — so the lint is silenced for this helper only.
#[allow(clippy::cast_possible_truncation)]
fn split_u64(v: u64) -> (u32, u32) {
    let low = v as u32;
    let high = (v >> 32) as u32;
    (low, high)
}

/// One non-blocking `LockFileEx` attempt of the given `mode` on
/// `len` bytes at `offset` of the handle carried in `fd`.
///
/// Returns `Ok(true)` when the lock was granted and `Ok(false)` when
/// Windows reported `ERROR_LOCK_VIOLATION` or `ERROR_IO_PENDING`.
///
/// # Errors
///
/// Returns `Error::Io` for any other last-error code.
#[cfg(windows)]
fn try_lock_range(fd: c_int, offset: u64, len: u64, mode: LockMode) -> Result<bool> {
    use windows_sys::Win32::Foundation::{
        GetLastError, ERROR_IO_PENDING, ERROR_LOCK_VIOLATION, HANDLE,
    };
    use windows_sys::Win32::Storage::FileSystem::{
        LockFileEx, LOCKFILE_EXCLUSIVE_LOCK, LOCKFILE_FAIL_IMMEDIATELY,
    };
    use windows_sys::Win32::System::IO::OVERLAPPED;

    // Always fail fast; shared mode is simply the absence of the
    // exclusive flag.
    let flags = match mode {
        LockMode::Exclusive => LOCKFILE_FAIL_IMMEDIATELY | LOCKFILE_EXCLUSIVE_LOCK,
        LockMode::Shared => LOCKFILE_FAIL_IMMEDIATELY,
    };
    let (offset_low, offset_high) = split_u64(offset);
    let (len_low, len_high) = split_u64(len);

    // SAFETY: `OVERLAPPED` is a plain-data Win32 struct (no pointers
    // we set later); an all-zero bit pattern is a valid initialised
    // value per the Windows SDK header.
    let mut request: OVERLAPPED = unsafe { std::mem::zeroed() };
    request.Anonymous.Anonymous.Offset = offset_low;
    request.Anonymous.Anonymous.OffsetHigh = offset_high;

    // SAFETY: `fd` was obtained from `AsRawHandle::as_raw_handle()`
    // on a still-open FileHandle that outlives this call. The
    // OVERLAPPED struct is owned and zeroed; LockFileEx only reads
    // `Offset`/`OffsetHigh` (and writes nothing to the rest, per
    // its docs, because we do not pass an event handle).
    let granted =
        unsafe { LockFileEx(fd as HANDLE, flags, 0, len_low, len_high, &raw mut request) };
    if granted != 0 {
        return Ok(true);
    }
    // SAFETY: GetLastError reads the thread-local last-error slot
    // and never writes.
    let code = unsafe { GetLastError() };
    match code {
        ERROR_LOCK_VIOLATION | ERROR_IO_PENDING => Ok(false),
        other => Err(Error::Io(std::io::Error::from_raw_os_error(
            other.cast_signed(),
        ))),
    }
}

/// Release, via `UnlockFileEx`, the lock on `len` bytes at `offset`
/// of the handle carried in `fd`.
///
/// # Errors
///
/// Returns `Error::Io` carrying the Windows last-error code if the
/// unlock call fails.
#[cfg(windows)]
fn unlock_range(fd: c_int, offset: u64, len: u64) -> Result<()> {
    use windows_sys::Win32::Foundation::{GetLastError, HANDLE};
    use windows_sys::Win32::Storage::FileSystem::UnlockFileEx;
    use windows_sys::Win32::System::IO::OVERLAPPED;

    let (offset_low, offset_high) = split_u64(offset);
    let (len_low, len_high) = split_u64(len);

    // SAFETY: `OVERLAPPED` is a plain-data Win32 struct; an all-zero
    // bit pattern is a valid initialised value per the Windows SDK
    // header. Same rationale as `try_lock_range` above.
    let mut request: OVERLAPPED = unsafe { std::mem::zeroed() };
    request.Anonymous.Anonymous.Offset = offset_low;
    request.Anonymous.Anonymous.OffsetHigh = offset_high;

    // SAFETY: same contract as try_lock_range above.
    let released = unsafe { UnlockFileEx(fd as HANDLE, 0, len_low, len_high, &raw mut request) };
    if released == 0 {
        // SAFETY: GetLastError reads a thread-local slot.
        let code = unsafe { GetLastError() };
        return Err(Error::Io(std::io::Error::from_raw_os_error(
            code.cast_signed(),
        )));
    }
    Ok(())
}

// ---------- internal access to the FileHandle's inner File --------

impl FileHandle {
    /// Returns a shared reference to the wrapped `std::fs::File`.
    ///
    /// Private to the platform layer: the lock submodule is the only
    /// consumer, and it needs the file solely to reach the raw
    /// fd / handle.
    fn file_ref(&self) -> &std::fs::File {
        let Self { file, .. } = self;
        file
    }
}

#[cfg(test)]
mod tests {
    #[cfg(unix)]
    use super::*;
    #[cfg(unix)]
    use tempfile::TempDir;

    /// Open (creating if necessary) `name` inside `dir` and size it to
    /// 4 KiB, so that the lock byte offsets (96 and 97..128) fall
    /// inside the file.
    ///
    /// Gated on `cfg(unix)` because all of its callers are; the
    /// Windows lock tests live elsewhere, so there the helper would
    /// only be dead code.
    #[cfg(unix)]
    fn fresh_handle(dir: &TempDir, name: &str) -> FileHandle {
        let handle = FileHandle::open_or_create(dir.path().join(name)).expect("open");
        handle.set_len(4096).expect("extend");
        handle
    }

    /// While one handle holds the writer lock a second handle is
    /// refused; once the guard is dropped the second handle acquires.
    #[cfg(unix)]
    #[test]
    fn writer_lock_excludes_writers() {
        let dir = TempDir::new().expect("tmp");
        let path = dir.path().join("lock.obj");
        {
            // Create and size the file, then close this handle before
            // the two contenders are opened.
            let seed = FileHandle::open_or_create(&path).expect("init");
            seed.set_len(4096).expect("len");
        }

        let h1 = FileHandle::open_or_create(&path).expect("h1");
        let h2 = FileHandle::open_or_create(&path).expect("h2");

        let first_attempt = h1.try_lock_writer().expect("try lock h1");
        let guard = first_attempt.expect("must acquire");
        assert!(
            h2.try_lock_writer().expect("try lock h2").is_none(),
            "second writer lock must be refused",
        );
        drop(guard);

        let second_attempt = h2.try_lock_writer().expect("try lock h2 again");
        let _g2 = second_attempt.expect("now acquires");
    }

    /// A blocking writer acquisition against a held lock must give up
    /// with `Error::Busy { kind: LockKind::Writer }`, and only after
    /// waiting roughly the requested timeout.
    #[cfg(unix)]
    #[test]
    fn writer_busy_timeout_returns_err_busy() {
        let dir = TempDir::new().expect("tmp");
        let _seed = fresh_handle(&dir, "lock.obj");
        let lock_path = dir.path().join("lock.obj");

        let h1 = FileHandle::open_or_create(&lock_path).expect("h1");
        let h2 = FileHandle::open_or_create(&lock_path).expect("h2");
        let _g1 = h1
            .try_lock_writer()
            .expect("h1 lock")
            .expect("h1 must acquire");

        let timeout = Duration::from_millis(50);
        let start = std::time::Instant::now();
        let err = h2.lock_writer(timeout).expect_err("must time out");
        let elapsed = start.elapsed();

        assert!(matches!(
            err,
            Error::Busy {
                kind: LockKind::Writer
            }
        ));
        // The floor sits slightly below the 50 ms timeout to leave a
        // little slack for jitter.
        let floor = Duration::from_millis(45);
        assert!(
            elapsed >= floor,
            "must wait at least the timeout (~50 ms); got {elapsed:?}",
        );
    }

    /// Three handles can each hold a reader lock at the same time.
    #[cfg(unix)]
    #[test]
    fn many_readers_can_coexist() {
        let dir = TempDir::new().expect("tmp");
        let _seed = fresh_handle(&dir, "lock.obj");
        let lock_path = dir.path().join("lock.obj");
        let timeout = Duration::from_millis(50);

        let h1 = FileHandle::open_or_create(&lock_path).expect("h1");
        let h2 = FileHandle::open_or_create(&lock_path).expect("h2");
        let h3 = FileHandle::open_or_create(&lock_path).expect("h3");

        let g1 = h1.lock_reader(timeout).expect("r1");
        let g2 = h2.lock_reader(timeout).expect("r2");
        let g3 = h3.lock_reader(timeout).expect("r3");

        // Whether the guards landed on the same slot or on different
        // ones is not part of the contract; all that matters is that
        // every acquisition succeeded while the others were held.
        drop(g1);
        drop(g2);
        drop(g3);
    }

    /// A held writer lock must not stop a reader from taking a slot.
    ///
    /// WRITER_LOCK sits at byte 96 and the reader range is 97..128.
    /// Reader/writer exclusion is the TXN layer's responsibility; the
    /// byte layout deliberately keeps the two on separate anchors so
    /// that `open_readonly` callers never wait on an in-flight writer.
    #[cfg(unix)]
    #[test]
    fn reader_and_writer_dont_collide_on_separate_anchors() {
        let dir = TempDir::new().expect("tmp");
        let _seed = fresh_handle(&dir, "lock.obj");
        let lock_path = dir.path().join("lock.obj");
        let timeout = Duration::from_millis(50);

        let h1 = FileHandle::open_or_create(&lock_path).expect("h1");
        let h2 = FileHandle::open_or_create(&lock_path).expect("h2");

        let _wg = h1.lock_writer(timeout).expect("writer");
        let _rg = h2.lock_reader(timeout).expect("reader must not collide");
    }

    /// Explicitly releasing a writer guard reports success and leaves
    /// the lock free for the same handle to take again.
    #[cfg(unix)]
    #[test]
    fn explicit_release_returns_ok() {
        let dir = TempDir::new().expect("tmp");
        let _seed = fresh_handle(&dir, "lock.obj");
        let lock_path = dir.path().join("lock.obj");
        let timeout = Duration::from_millis(50);

        let h = FileHandle::open_or_create(&lock_path).expect("h");
        let guard = h.lock_writer(timeout).expect("lock");
        guard.release().expect("release ok");

        // The very same handle must be able to lock again.
        let _g2 = h.lock_writer(timeout).expect("relock");
    }

    /// Compile-only smoke test: the `#[must_use]` annotation must not
    /// get in the way when the caller really does consume the guard.
    #[cfg(unix)]
    #[test]
    fn lock_methods_compile_when_dropped() {
        let dir = TempDir::new().expect("tmp");
        let _seed = fresh_handle(&dir, "lock.obj");
        let lock_path = dir.path().join("lock.obj");

        let h = FileHandle::open_or_create(&lock_path).expect("h");
        drop(h.lock_reader(Duration::from_millis(10)).expect("rlock"));
    }

    /// #30: the OFD-capability gate has to agree with the build target.
    ///
    /// On the targets obj supports (Linux/Android/Apple) the gate is
    /// `Ok` and per-fd locking uses OFD locks. The classic-`F_SETLK`
    /// fallback, which silently broke same-process multi-fd exclusion
    /// on FreeBSD / unknown POSIX, no longer exists: such targets now
    /// hard-error in `ensure_ofd_locks_supported`.
    #[cfg(unix)]
    #[test]
    fn ofd_capability_gate_matches_target() {
        // CI covers Linux + macOS, both of which have OFD locks. A
        // failure here means obj is being built for a target whose
        // lock soundness has not been established; the gate checked
        // further down will (correctly) refuse to lock there.
        let expected_support = cfg!(any(
            target_os = "linux",
            target_os = "android",
            target_vendor = "apple",
        ));
        assert_eq!(
            TARGET_HAS_OFD_LOCKS, expected_support,
            "OFD capability constant must track the supported-target set",
        );

        let gate = ensure_ofd_locks_supported();
        if TARGET_HAS_OFD_LOCKS {
            gate.expect("supported targets must pass the gate");
            return;
        }

        // Unsupported target: the gate has to refuse with
        // ErrorKind::Unsupported instead of falling back to a
        // per-process F_SETLK lock.
        match gate {
            Err(Error::Io(e)) => assert_eq!(e.kind(), std::io::ErrorKind::Unsupported),
            other => panic!("expected Io(Unsupported), got {other:?}"),
        }
    }

    /// #30 regression: two `FileHandle`s onto the SAME file within the
    /// SAME process must exclude one another for the writer lock.
    ///
    /// This is precisely the invariant the classic per-process
    /// `F_SETLK` fallback silently broke: a second in-process open
    /// would "succeed" because the lock coalesces per process. OFD
    /// locks are per-fd, so the second handle is correctly refused.
    #[cfg(unix)]
    #[test]
    fn same_process_multi_fd_writer_exclusion_holds() {
        let dir = TempDir::new().expect("tmp");
        let _seed = fresh_handle(&dir, "lock.obj");
        let lock_path = dir.path().join("lock.obj");

        let h1 = FileHandle::open_or_create(&lock_path).expect("h1");
        let h2 = FileHandle::open_or_create(&lock_path).expect("h2");

        let g1 = h1.try_lock_writer().expect("h1 try").expect("h1 acquires");
        // Different fd, same process: the lock must not coalesce.
        let second = h2.try_lock_writer().expect("h2 try");
        assert!(
            second.is_none(),
            "per-fd OFD lock must refuse a second in-process handle; a \
             per-process F_SETLK fallback would wrongly grant this",
        );
        drop(g1);
    }
}