// fsys/journal/backend.rs
//! Journal backend abstraction (1.1.0).
//!
//! 1.0 shipped a single journal implementation: the kernel + `io_uring`
//! path that lives inline in [`super::JournalHandle`]. 1.1.0 introduces
//! pluggable backends so SPDK (kernel-bypass, user-space NVMe access)
//! can plug in beside the kernel path without duplicating the journal
//! contract.
//!
//! The observability types below — not the [`JournalBackend`] trait,
//! whose shape is still provisional — are the **stable public
//! observability surface** in 1.1.0:
//!
//! - [`JournalBackendKind`] — which backend is live for a journal.
//! - [`JournalBackendHealth`] — running counters for monitoring.
//! - [`JournalBackendInfo`] — startup-time selection trail showing
//!   which backends were considered, which was chosen, and why.
//!
//! ## Status of the trait abstraction
//!
//! 1.1.0 defines the trait and the observability types, and surfaces
//! the observability types through
//! [`super::JournalHandle::backend_kind`] /
//! [`super::JournalHandle::backend_health`] /
//! [`super::JournalHandle::backend_info`]. The existing
//! [`super::JournalHandle`] implementation is the *current*
//! kernel-path backend and reports its own backend identity
//! honestly via those accessors — there is no internal indirection
//! through `Box<dyn JournalBackend>` yet.
//!
//! Trait extraction (moving the in-line append/sync paths behind a
//! `Box<dyn JournalBackend>` so the kernel and SPDK implementations
//! are interchangeable at the type level) lands in a follow-up
//! release (planned for 1.2) — that refactor touches every method on
//! the load-bearing journal hot path and demands careful, isolated
//! testing against the existing emdb production workload. Splitting
//! it from the observability + capability work in 1.1.0 keeps the
//! public 1.x surface frozen while the internal restructuring happens
//! with regression discipline.
//!
//! Consumers can write **forward-compatible** code today against the
//! public types here — they will continue to work unchanged when the
//! trait extraction completes.

use crate::journal::Lsn;
use crate::Result;
use std::fmt;
use std::time::SystemTime;

46/// The interface every journal backend implements.
47///
48/// **Trait stability status:** the trait shape itself is provisional
49/// until the kernel-path extraction completes (see module-level
50/// comment). Public types referenced by the trait
51/// ([`JournalBackendKind`], [`JournalBackendHealth`],
52/// [`JournalBackendInfo`]) and the accessor methods on
53/// [`super::JournalHandle`] that surface them are stable in 1.x —
54/// they are what consumers should depend on. The trait method
55/// signatures may be refined in 1.2 / 1.3 as additional backends
56/// land; we will go through the deprecation cycle described in
57/// `docs/STABILITY-1.0.md` if any signature changes.
58///
59/// # Implementations
60///
61/// - `KernelJournalBackend` (planned for the 1.2 refactor) — the
62/// existing in-line implementation, extracted behind the trait.
63/// - `SpdkJournalBackend` (planned for the 1.1.x companion crate
64/// `fsys-spdk`) — the kernel-bypass NVMe path.
65pub trait JournalBackend: Send + Sync {
66 /// Append a single record. Returns the LSN assigned by the
67 /// backend.
68 ///
69 /// # Errors
70 ///
71 /// Returns whatever the backend's underlying append path
72 /// produces — typically [`crate::Error::Io`],
73 /// [`crate::Error::ShutdownInProgress`], or a backend-specific
74 /// variant.
75 fn append(&self, record: &[u8]) -> Result<Lsn>;
76
77 /// Append a batch of records. All records become durable together
78 /// at the next [`Self::flush`] call against any of the returned
79 /// LSNs (or any later LSN).
80 ///
81 /// # Errors
82 ///
83 /// Same shape as [`Self::append`].
84 fn append_batch(&self, records: &[&[u8]]) -> Result<Vec<Lsn>>;
85
86 /// Force durability of every record up to and including `up_to`.
87 /// Concurrent calls coalesce into one platform syscall via the
88 /// backend's group-commit coordinator.
89 ///
90 /// # Errors
91 ///
92 /// Surfaces the backend's underlying sync failure when the
93 /// platform's durability primitive returns an error.
94 fn flush(&self, up_to: Lsn) -> Result<()>;
95
96 /// Reads a single record at the given LSN. Used during recovery
97 /// and by `JournalReader::read_at_lsn`.
98 ///
99 /// # Errors
100 ///
101 /// Returns [`crate::Error::Io`] for backend-level IO failures.
102 /// Returns the backend's record-decode error variant (typically
103 /// surfaced via the per-backend frame format) on a malformed
104 /// record.
105 fn read(&self, lsn: Lsn) -> Result<Vec<u8>>;
106
107 /// Backend identity for observability.
108 fn backend_kind(&self) -> JournalBackendKind;
109
110 /// Running health counters for monitoring.
111 fn health(&self) -> JournalBackendHealth;
112}
113
/// Concrete backend identity reported via [`JournalBackend::backend_kind`].
///
/// The enum is `#[non_exhaustive]` so adding new backend identities
/// in a future minor release (PMEM, RDMA, etc.) is non-breaking.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum JournalBackendKind {
    /// Kernel path with `io_uring` submission + completion (Linux
    /// `Method::Direct` + `Method::Auto` selection).
    KernelIoUring,
    /// Kernel path with `O_DIRECT` + synchronous `pwrite` + manual
    /// `fdatasync` (Linux without `io_uring`, macOS, Windows
    /// `FILE_FLAG_NO_BUFFERING`).
    KernelDirect,
    /// Kernel path with buffered IO + `fdatasync` / equivalent
    /// (the universal fallback; default mode of [`super::JournalHandle`]
    /// when no `JournalOptions::direct(true)` is set).
    KernelBuffered,
    /// SPDK kernel-bypass backend — Linux + `fsys-spdk` feature.
    /// Not selectable in 1.1.0 (the implementation lives in a
    /// companion crate that is in scaffold state); reserved here
    /// so consumers can pattern-match against it today.
    Spdk,
}

impl JournalBackendKind {
    /// Returns the canonical lowercase name for the backend. Used
    /// in selection-reason strings on [`JournalBackendInfo`] and in
    /// the observability accessor on [`super::JournalHandle`].
    #[must_use]
    #[inline]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::KernelIoUring => "kernel-io-uring",
            Self::KernelDirect => "kernel-direct",
            Self::KernelBuffered => "kernel-buffered",
            Self::Spdk => "spdk",
        }
    }

    /// Returns `true` when the backend is a kernel-path backend
    /// (anything except [`JournalBackendKind::Spdk`]).
    #[must_use]
    #[inline]
    pub const fn is_kernel(self) -> bool {
        // Exhaustive on purpose: adding a variant must force an explicit
        // kernel / non-kernel decision here instead of silently
        // defaulting to `false`.
        match self {
            Self::KernelIoUring | Self::KernelDirect | Self::KernelBuffered => true,
            Self::Spdk => false,
        }
    }
}

/// Renders the same text as [`JournalBackendKind::as_str`].
impl fmt::Display for JournalBackendKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `pad` (unlike `write_str`) honours width / fill / alignment /
        // precision, so `{:<16}` lines up columns in tabular output.
        // Plain `{}` output is unchanged.
        f.pad(self.as_str())
    }
}

174/// Running health counters for a journal backend.
175///
176/// Returned by [`super::JournalHandle::backend_health`] and
177/// [`JournalBackend::health`]. The counters are snapshots; consumers
178/// that want trend data should poll on a fixed interval and diff
179/// against the previous snapshot.
180///
181/// All counters except the latency percentiles are monotonically
182/// non-decreasing for the lifetime of the journal. Latency
183/// percentiles are sliding-window estimates and may decrease as
184/// older samples roll out of the window.
185#[derive(Debug, Clone)]
186#[non_exhaustive]
187pub struct JournalBackendHealth {
188 /// Which backend produced this snapshot.
189 pub backend: JournalBackendKind,
190 /// Current in-flight submission count (records appended but not
191 /// yet acknowledged as durable by the backend's storage stack).
192 pub queue_depth_current: usize,
193 /// Maximum value [`Self::queue_depth_current`] has reached
194 /// since the journal opened. Useful for sizing.
195 pub queue_depth_max: usize,
196 /// Recent appends-per-second rate. Sliding-window estimate; the
197 /// exact window size is backend-specific. May be `0` when the
198 /// journal has just been opened or has been idle.
199 pub appends_per_second: u64,
200 /// Average append latency in microseconds, over the same sliding
201 /// window as [`Self::appends_per_second`].
202 pub avg_append_latency_us: u64,
203 /// p99 append latency in microseconds.
204 pub p99_append_latency_us: u64,
205 /// Number of append calls that returned `Err`. Includes both
206 /// transient errors (e.g. `EAGAIN`) and durable errors. Monotonic.
207 pub failed_appends: u64,
208}
209
210impl JournalBackendHealth {
211 /// Constructs a zero-counter health snapshot for `backend`.
212 ///
213 /// Used by backends that don't yet emit detailed counters (the
214 /// kernel path's pre-instrumented baseline) so the accessor on
215 /// [`super::JournalHandle`] always has a well-formed value to
216 /// return rather than `Option<_>`.
217 #[must_use]
218 #[inline]
219 pub const fn empty(backend: JournalBackendKind) -> Self {
220 Self {
221 backend,
222 queue_depth_current: 0,
223 queue_depth_max: 0,
224 appends_per_second: 0,
225 avg_append_latency_us: 0,
226 p99_append_latency_us: 0,
227 failed_appends: 0,
228 }
229 }
230}
231
232/// Verbose selection trail produced at journal-open time.
233///
234/// Returned by [`super::JournalHandle::backend_info`]. Operators
235/// **must** be able to verify which backend is actually serving a
236/// journal — silently falling through to the kernel path when SPDK
237/// was requested would invalidate downstream performance
238/// expectations. This struct is that verification surface.
239#[derive(Debug, Clone)]
240#[non_exhaustive]
241pub struct JournalBackendInfo {
242 /// The backend currently serving this journal.
243 pub selected: JournalBackendKind,
244 /// Plain-English description of why this backend was chosen
245 /// (e.g. `"Method::Auto resolved to kernel-io-uring (Linux 6.8,
246 /// io_uring available)"`).
247 pub selection_reason: String,
248 /// Backends considered during selection and skipped, with the
249 /// reason. Allows ops to verify that SPDK was tried before
250 /// falling through to the kernel path (and to see exactly which
251 /// SPDK precondition failed).
252 pub fallbacks_skipped: Vec<(JournalBackendKind, String)>,
253 /// Wall-clock time the journal opened. Useful for correlating
254 /// with system logs and metric backends.
255 pub opened_at: SystemTime,
256}
257
258impl JournalBackendInfo {
259 /// Constructs a single-selection info record with no fallback
260 /// trail. Used by the kernel-path implementation when no
261 /// alternative was considered (e.g. user explicitly passed a
262 /// non-`Auto` method).
263 #[must_use]
264 pub fn single(selected: JournalBackendKind, selection_reason: impl Into<String>) -> Self {
265 Self {
266 selected,
267 selection_reason: selection_reason.into(),
268 fallbacks_skipped: Vec::new(),
269 opened_at: SystemTime::now(),
270 }
271 }
272}
273
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Every variant declared today, with its canonical label and
    /// whether it is a kernel-path backend. Shared by the tests below
    /// so a new variant only has to be added in one place.
    const KINDS: [(JournalBackendKind, &str, bool); 4] = [
        (JournalBackendKind::KernelIoUring, "kernel-io-uring", true),
        (JournalBackendKind::KernelDirect, "kernel-direct", true),
        (JournalBackendKind::KernelBuffered, "kernel-buffered", true),
        (JournalBackendKind::Spdk, "spdk", false),
    ];

    #[test]
    fn test_backend_kind_as_str_round_trip_via_match() {
        for (kind, label, _) in KINDS {
            assert_eq!(kind.as_str(), label);
        }
    }

    #[test]
    fn test_backend_kind_display_matches_as_str() {
        for (kind, _, _) in KINDS {
            assert_eq!(kind.to_string(), kind.as_str());
        }
    }

    #[test]
    fn test_backend_kind_is_kernel_classifies_correctly() {
        for (kind, _, is_kernel) in KINDS {
            assert_eq!(kind.is_kernel(), is_kernel, "{kind:?}");
        }
    }

    #[test]
    fn test_backend_health_empty_constructor_zeroes_counters() {
        let h = JournalBackendHealth::empty(JournalBackendKind::KernelBuffered);
        assert_eq!(h.backend, JournalBackendKind::KernelBuffered);
        assert_eq!(h.queue_depth_current, 0);
        assert_eq!(h.queue_depth_max, 0);
        assert_eq!(h.appends_per_second, 0);
        assert_eq!(h.avg_append_latency_us, 0);
        assert_eq!(h.p99_append_latency_us, 0);
        assert_eq!(h.failed_appends, 0);
    }

    #[test]
    fn test_backend_info_single_no_fallbacks() {
        let info = JournalBackendInfo::single(
            JournalBackendKind::KernelBuffered,
            "explicit Method::Sync request",
        );
        assert_eq!(info.selected, JournalBackendKind::KernelBuffered);
        assert!(info.selection_reason.contains("Method::Sync"));
        assert!(info.fallbacks_skipped.is_empty());
        // `opened_at` must be recent (stamped less than two seconds ago).
        // `elapsed` only errs when the wall clock stepped backwards since
        // the stamp was taken; treat that as zero elapsed time.
        let elapsed = info.opened_at.elapsed().unwrap_or_default();
        assert!(elapsed < Duration::from_secs(2));
    }
}