lsm_tree/seqno.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5use alloc::sync::Arc;
6use core::fmt::Debug;
7use core::sync::atomic::Ordering::{AcqRel, Acquire, Release};
8
9use crate::SeqNo;
10use portable_atomic::AtomicU64;
11
12/// The maximum allowed sequence number value.
13///
14/// The MSB (`0x8000_0000_0000_0000`) is reserved for internal use by the
15/// transaction layer, so all sequence numbers must be strictly less than
16/// this boundary. Values returned by [`SequenceNumberGenerator::next`]
17/// must be at most `MAX_SEQNO - 1` (`0x7FFF_FFFF_FFFF_FFFE`), leaving
18/// room for `seqno + 1` operations used internally by the engine.
19pub const MAX_SEQNO: SeqNo = 0x7FFF_FFFF_FFFF_FFFF;
20
21/// Trait for custom sequence number generation.
22///
23/// Implementations must be thread-safe and provide atomic operations
24/// for sequence number management.
25///
26/// # Invariants
27///
28/// Implementors **must** respect the following invariants, which are
29/// relied upon by the storage engine (e.g., for MVCC and transactions):
30///
31/// - The most-significant bit (MSB) of a sequence number is reserved
32/// for internal use by the transaction layer.
33/// - All sequence numbers produced by this generator **must** therefore
34/// be strictly less than `0x8000_0000_0000_0000`.
35/// - Calls to [`SequenceNumberGenerator::next`] on a given generator
36/// instance must return monotonically increasing values (each `next`
37/// returns a value that is strictly greater than any value previously
38/// returned by `next` on the same instance) until the reserved MSB
39/// boundary is reached.
40/// - In practice, this means `next`-generated sequence numbers are
41/// unique for the lifetime of the generator (modulo wrap-around, which
42/// must not occur within the reserved MSB range).
43///
44/// Implementors are also responsible for ensuring that [`Self::get`] does not
45/// return values that violate these invariants, and that [`Self::set`] and
46/// [`Self::fetch_max`] do not violate them (e.g., by setting the counter to a
47/// value at or above `0x8000_0000_0000_0000`, or by moving the counter
48/// backwards such that subsequent calls to [`Self::next`] would observe
49/// non-monotonic sequence numbers).
50///
51/// The [`Self::set`] method is a special-case escape hatch intended for use
52/// during initialization or recovery. It may move the counter forwards
53/// or backwards and therefore **may** cause subsequent calls to
54/// [`Self::next`] to observe non-monotonic sequence numbers. This is
55/// permitted; callers are responsible for using [`Self::fetch_max`] (or other
56/// application-level logic) after `set` if they need to reestablish any
57/// monotonicity guarantees.
58///
59/// # Supertraits
60///
61/// `UnwindSafe + RefUnwindSafe` are required because generators may be
62/// captured inside `catch_unwind` (e.g., during ingestion error recovery).
63///
64/// The default implementation is [`SequenceNumberCounter`], which uses
65/// an `Arc<AtomicU64>` for lock-free atomic operations. It enforces the
66/// MSB boundary in `new`, `set`, and `fetch_max` (via assert/clamp),
67/// and panics in `next` if the counter reaches the reserved range.
68pub trait SequenceNumberGenerator:
69 Send + Sync + Debug + core::panic::UnwindSafe + core::panic::RefUnwindSafe + 'static
70{
71 /// Gets the next sequence number, atomically incrementing the counter.
72 ///
73 /// This must:
74 /// - Return a value strictly greater than any value previously returned
75 /// by `next` on the same generator instance (monotonic increase).
76 /// - Never return a value greater than `MAX_SEQNO - 1`, leaving room
77 /// for internal `seqno + 1` operations that must remain strictly
78 /// less than [`MAX_SEQNO`].
79 #[must_use]
80 fn next(&self) -> SeqNo;
81
82 /// Gets the current sequence number without incrementing.
83 ///
84 /// This should be consistent with the value that will be used as the
85 /// basis for the next call to [`Self::next`], and must not itself alter the
86 /// monotonicity guarantees.
87 #[must_use]
88 fn get(&self) -> SeqNo;
89
90 /// Sets the sequence number to the given value.
91 ///
92 /// Implementations must ensure that `value` is strictly less than
93 /// `0x8000_0000_0000_0000` (the MSB is reserved).
94 ///
95 /// This method is intended for direct overrides (e.g., during
96 /// initialization or recovery) and is **not** required to preserve
97 /// monotonicity with respect to values previously returned by
98 /// [`Self::next`]. In particular, calling `set` with a value lower than a
99 /// previously returned `next` value may cause subsequent calls to
100 /// [`Self::next`] to observe non-monotonic sequence numbers. This is
101 /// allowed; callers that need to advance the sequence number in a
102 /// monotonic fashion should prefer [`Self::fetch_max`] instead of `set`.
103 fn set(&self, value: SeqNo);
104
105 /// Atomically updates the sequence number to the maximum of
106 /// the current value and the given value.
107 ///
108 /// The resulting stored value must:
109 /// - Remain strictly less than `0x8000_0000_0000_0000`.
110 /// - Preserve the monotonicity guarantees expected by [`Self::next`].
111 fn fetch_max(&self, value: SeqNo);
112}
113
114/// A shared, cloneable sequence number generator.
115pub type SharedSequenceNumberGenerator = Arc<dyn SequenceNumberGenerator>;
116
117/// Thread-safe sequence number generator
118///
119/// # Examples
120///
121/// ```
122/// # use lsm_tree::{AbstractTree, Config, SequenceNumberCounter};
123/// #
124/// # let path = tempfile::tempdir()?;
125///
126/// let seqno = SequenceNumberCounter::default();
127/// let visible_seqno = SequenceNumberCounter::default();
128///
129/// let tree = Config::new(path, seqno.clone(), visible_seqno.clone()).open()?;
130///
131/// // Do some inserts...
132/// for k in [b"a", b"b", b"c"] {
133/// let batch_seqno = seqno.next();
134/// tree.insert(k, "abc", batch_seqno);
135/// visible_seqno.fetch_max(batch_seqno + 1);
136/// }
137///
138/// // Create a batch
139/// let batch_seqno = seqno.next();
140/// tree.remove("a".as_bytes(), batch_seqno);
141/// tree.remove("b".as_bytes(), batch_seqno);
142/// tree.remove("c".as_bytes(), batch_seqno);
143/// visible_seqno.fetch_max(batch_seqno + 1);
144/// #
145/// # assert!(tree.is_empty(visible_seqno.get(), None)?);
146/// # Ok::<(), lsm_tree::Error>(())
147/// ```
148#[derive(Clone, Default, Debug)]
149pub struct SequenceNumberCounter(Arc<AtomicU64>);
150
151impl SequenceNumberCounter {
152 /// Creates a new counter, setting it to some previous value.
153 ///
154 /// # Panics
155 ///
156 /// Panics if `prev > MAX_SEQNO` (reserved MSB range).
157 ///
158 /// Note: `prev == MAX_SEQNO` is allowed but means the counter is
159 /// exhausted — any subsequent call to [`next()`](Self::next) will panic.
160 #[must_use]
161 pub fn new(prev: SeqNo) -> Self {
162 assert!(
163 prev <= MAX_SEQNO,
164 "Sequence number must not use the reserved MSB"
165 );
166 Self(Arc::new(AtomicU64::new(prev)))
167 }
168
169 /// Allocates and returns the next sequence number, atomically
170 /// incrementing the internal counter.
171 ///
172 /// The returned value is the counter's value **before** the increment
173 /// (i.e., pre-increment / fetch-then-add semantics).
174 ///
175 /// # Panics
176 ///
177 /// Panics if the current value is already [`MAX_SEQNO`] (i.e., when
178 /// advancing past it would enter the reserved MSB range).
179 #[must_use]
180 pub fn next(&self) -> SeqNo {
181 <Self as SequenceNumberGenerator>::next(self)
182 }
183
184 /// Returns the current internal counter value without incrementing.
185 ///
186 /// This is the value that the next call to [`next()`](Self::next) will
187 /// return (and then advance past).
188 #[must_use]
189 pub fn get(&self) -> SeqNo {
190 <Self as SequenceNumberGenerator>::get(self)
191 }
192
193 /// Sets the sequence number to the given value.
194 ///
195 /// # Panics
196 ///
197 /// Panics if `value > MAX_SEQNO` (reserved MSB range).
198 pub fn set(&self, value: SeqNo) {
199 <Self as SequenceNumberGenerator>::set(self, value);
200 }
201
202 /// Atomically updates the sequence number to the maximum of
203 /// the current value and the provided value.
204 pub fn fetch_max(&self, value: SeqNo) {
205 <Self as SequenceNumberGenerator>::fetch_max(self, value);
206 }
207}
208
209impl SequenceNumberGenerator for SequenceNumberCounter {
210 fn next(&self) -> SeqNo {
211 // We use fetch_update (CAS loop) instead of fetch_add to ensure the
212 // internal counter never enters the reserved MSB range. With fetch_add,
213 // a caught panic (via catch_unwind, which the trait requires via
214 // UnwindSafe) would leave the counter permanently stuck at an invalid
215 // value. fetch_update only stores the new value on success.
216 match self.0.fetch_update(AcqRel, Acquire, |current| {
217 if current >= MAX_SEQNO {
218 None
219 } else {
220 Some(current + 1)
221 }
222 }) {
223 Ok(seqno) => seqno,
224 Err(current) => panic!("Ran out of sequence numbers (current: {current})"),
225 }
226 }
227
228 fn get(&self) -> SeqNo {
229 self.0.load(Acquire)
230 }
231
232 fn set(&self, value: SeqNo) {
233 assert!(
234 value <= MAX_SEQNO,
235 "Sequence number must not use the reserved MSB"
236 );
237 self.0.store(value, Release);
238 }
239
240 fn fetch_max(&self, value: SeqNo) {
241 // Clamp to the maximum allowed sequence number to avoid storing
242 // a value in the reserved MSB range.
243 let clamped = value.min(MAX_SEQNO);
244 self.0.fetch_max(clamped, AcqRel);
245 }
246}
247
248// Orphan rules satisfied: SequenceNumberGenerator is a local trait,
249// so Arc<dyn SequenceNumberGenerator> is covered by a local type parameter.
250impl From<SequenceNumberCounter> for SharedSequenceNumberGenerator {
251 fn from(counter: SequenceNumberCounter) -> Self {
252 Arc::new(counter)
253 }
254}
255
256#[cfg(test)]
257mod tests;