Skip to main content

fsys/journal/
backend.rs

1//! Journal backend abstraction (1.1.0).
2//!
3//! 1.0 shipped a single journal implementation: the kernel + io_uring
4//! path that lives inline in [`super::JournalHandle`]. 1.1.0 introduces
5//! pluggable backends so SPDK (kernel-bypass, user-space NVMe access)
6//! can plug in beside the kernel path without duplicating the journal
7//! contract.
8//!
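//! The observability types defined alongside the trait are the
//! **stable public surface** in 1.1.0 (the trait's own method
//! signatures remain provisional — see [`JournalBackend`]):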
11//!
12//! - [`JournalBackendKind`] — which backend is live for a journal.
13//! - [`JournalBackendHealth`] — running counters for monitoring.
14//! - [`JournalBackendInfo`] — startup-time selection trail showing
15//!   which backends were considered, which was chosen, and why.
16//!
17//! ## Status of the trait abstraction
18//!
19//! 1.1.0 defines the trait + observability types and surfaces them
20//! through [`super::JournalHandle::backend_kind`] /
21//! [`super::JournalHandle::backend_health`] /
22//! [`super::JournalHandle::backend_info`]. The existing
23//! [`super::JournalHandle`] implementation is the *current*
24//! kernel-path backend and reports its own backend identity
25//! honestly via those accessors — there is no internal indirection
26//! through `Box<dyn JournalBackend>` yet.
27//!
28//! Trait extraction (moving the in-line append/sync paths behind a
29//! `Box<dyn JournalBackend>` so the kernel and SPDK implementations
30//! are interchangeable at the type level) lands in a follow-up
31//! session — that refactor touches every method on the load-bearing
32//! journal hot path and demands careful, isolated testing against
33//! the existing emdb production workload. Splitting it from the
34//! observability + capability work in 1.1.0 keeps the public 1.x
35//! surface frozen while the internal restructuring happens with
36//! regression discipline.
37//!
38//! Consumers can write **forward-compatible** code today against the
39//! public types here — they will continue to work unchanged when the
40//! trait extraction completes.
41
42use crate::journal::Lsn;
43use crate::Result;
44use std::time::SystemTime;
45
/// The interface every journal backend implements.
///
/// The trait requires `Send + Sync`, every method takes `&self`, and
/// none of the methods is generic, so a single backend value can be
/// shared across threads and used as a `Box<dyn JournalBackend>`
/// trait object (the shape the module-level comment describes).
///
/// **Trait stability status:** the trait shape itself is provisional
/// until the kernel-path extraction completes (see module-level
/// comment). Public types referenced by the trait
/// ([`JournalBackendKind`], [`JournalBackendHealth`],
/// [`JournalBackendInfo`]) and the accessor methods on
/// [`super::JournalHandle`] that surface them are stable in 1.x —
/// they are what consumers should depend on. The trait method
/// signatures may be refined in 1.2 / 1.3 as additional backends
/// land; we will go through the deprecation cycle described in
/// `docs/STABILITY-1.0.md` if any signature changes.
///
/// # Implementations
///
/// No implementation lives in this module; both are planned:
///
/// - `KernelJournalBackend` (planned for the 1.2 refactor) — the
///   existing in-line implementation, extracted behind the trait.
/// - `SpdkJournalBackend` (planned for the 1.1.x companion crate
///   `fsys-spdk`) — the kernel-bypass NVMe path.
pub trait JournalBackend: Send + Sync {
    /// Append a single record. Returns the LSN assigned by the
    /// backend.
    ///
    /// The record is passed as an opaque byte slice; any framing is
    /// the backend's concern.
    ///
    /// # Errors
    ///
    /// Returns whatever the backend's underlying append path
    /// produces — typically [`crate::Error::Io`],
    /// [`crate::Error::ShutdownInProgress`], or a backend-specific
    /// variant.
    fn append(&self, record: &[u8]) -> Result<Lsn>;

    /// Append a batch of records. All records become durable together
    /// at the next [`Self::flush`] call against any of the returned
    /// LSNs (or any later LSN).
    ///
    /// NOTE(review): presumably the returned vector holds one LSN per
    /// input record, in input order — confirm against the first
    /// concrete backend, since the signature alone does not pin it.
    ///
    /// # Errors
    ///
    /// Same shape as [`Self::append`].
    fn append_batch(&self, records: &[&[u8]]) -> Result<Vec<Lsn>>;

    /// Force durability of every record up to and including `up_to`.
    /// Concurrent calls coalesce into one platform syscall via the
    /// backend's group-commit coordinator.
    ///
    /// # Errors
    ///
    /// Surfaces the backend's underlying sync failure when the
    /// platform's durability primitive returns an error.
    fn flush(&self, up_to: Lsn) -> Result<()>;

    /// Reads a single record at the given LSN. Used during recovery
    /// and by `JournalReader::read_at_lsn`.
    ///
    /// The record comes back as an owned byte vector.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Io`] for backend-level IO failures.
    /// Returns the backend's record-decode error variant (typically
    /// surfaced via the per-backend frame format) on a malformed
    /// record.
    fn read(&self, lsn: Lsn) -> Result<Vec<u8>>;

    /// Backend identity for observability.
    ///
    /// Infallible by signature: returns a [`JournalBackendKind`]
    /// directly rather than a `Result`.
    fn backend_kind(&self) -> JournalBackendKind;

    /// Running health counters for monitoring.
    ///
    /// Returns an owned [`JournalBackendHealth`] snapshot; callers
    /// that want trends poll and diff successive snapshots.
    fn health(&self) -> JournalBackendHealth;
}
113
/// Concrete backend identity reported via [`JournalBackend::backend_kind`].
///
/// A fieldless, `Copy` enum: values are cheap to pass by value, and
/// the `Eq` + `Hash` derives let them key a map or set (for example
/// per-backend metrics).
///
/// The enum is `#[non_exhaustive]` so adding new backend identities
/// in a future minor release (PMEM, RDMA, etc.) is non-breaking.
/// Downstream `match` expressions therefore need a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum JournalBackendKind {
    /// Kernel path with `io_uring` submission + completion (Linux
    /// `Method::Direct` + `Method::Auto` selection).
    KernelIoUring,
    /// Kernel path with `O_DIRECT` + synchronous `pwrite` + manual
    /// `fdatasync` (Linux without `io_uring`, macOS, Windows
    /// `FILE_FLAG_NO_BUFFERING`).
    ///
    /// NOTE(review): macOS has no `O_DIRECT` open flag (uncached IO
    /// is requested through `fcntl(F_NOCACHE)`); confirm this wording
    /// against the macOS implementation.
    KernelDirect,
    /// Kernel path with buffered IO + `fdatasync` / equivalent
    /// (the universal fallback; default mode of [`super::JournalHandle`]
    /// when no `JournalOptions::direct(true)` is set).
    KernelBuffered,
    /// SPDK kernel-bypass backend — Linux + `fsys-spdk` feature.
    /// Not selectable in 1.1.0 (the implementation lives in a
    /// companion crate that is in scaffold state); reserved here
    /// so consumers can pattern-match against it today.
    Spdk,
}
138
139impl JournalBackendKind {
140    /// Returns the canonical lowercase name for the backend. Used
141    /// in selection-reason strings on [`JournalBackendInfo`] and in
142    /// the observability accessor on [`super::JournalHandle`].
143    #[must_use]
144    #[inline]
145    pub const fn as_str(self) -> &'static str {
146        match self {
147            JournalBackendKind::KernelIoUring => "kernel-io-uring",
148            JournalBackendKind::KernelDirect => "kernel-direct",
149            JournalBackendKind::KernelBuffered => "kernel-buffered",
150            JournalBackendKind::Spdk => "spdk",
151        }
152    }
153
154    /// Returns `true` when the backend is a kernel-path backend
155    /// (anything except [`JournalBackendKind::Spdk`]).
156    #[must_use]
157    #[inline]
158    pub const fn is_kernel(self) -> bool {
159        matches!(
160            self,
161            JournalBackendKind::KernelIoUring
162                | JournalBackendKind::KernelDirect
163                | JournalBackendKind::KernelBuffered
164        )
165    }
166}
167
168impl std::fmt::Display for JournalBackendKind {
169    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170        f.write_str(self.as_str())
171    }
172}
173
/// Running health counters for a journal backend.
///
/// Returned by [`super::JournalHandle::backend_health`] and
/// [`JournalBackend::health`]. The counters are snapshots; consumers
/// that want trend data should poll on a fixed interval and diff
/// against the previous snapshot.
///
/// Only the cumulative fields — [`Self::queue_depth_max`] and
/// [`Self::failed_appends`] — are monotonically non-decreasing for
/// the lifetime of the journal. [`Self::queue_depth_current`] is an
/// instantaneous gauge, and [`Self::appends_per_second`] and the
/// latency figures are sliding-window estimates; all of these may
/// decrease between snapshots (for example as older samples roll
/// out of the window).
///
/// The struct is `#[non_exhaustive]`: code outside this crate cannot
/// build it with a struct literal and should obtain values from the
/// accessors above or from [`Self::empty`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct JournalBackendHealth {
    /// Which backend produced this snapshot.
    pub backend: JournalBackendKind,
    /// Current in-flight submission count (records appended but not
    /// yet acknowledged as durable by the backend's storage stack).
    /// A gauge: it rises and falls with load.
    pub queue_depth_current: usize,
    /// Maximum value [`Self::queue_depth_current`] has reached
    /// since the journal opened. Useful for sizing.
    pub queue_depth_max: usize,
    /// Recent appends-per-second rate. Sliding-window estimate; the
    /// exact window size is backend-specific. May be `0` when the
    /// journal has just been opened or has been idle.
    pub appends_per_second: u64,
    /// Average append latency in microseconds, over the same sliding
    /// window as [`Self::appends_per_second`].
    pub avg_append_latency_us: u64,
    /// p99 append latency in microseconds. Sliding-window estimate,
    /// so it may decrease from one snapshot to the next.
    pub p99_append_latency_us: u64,
    /// Number of append calls that returned `Err`. Includes both
    /// transient errors (e.g. `EAGAIN`) and durable errors. Monotonic.
    pub failed_appends: u64,
}
209
210impl JournalBackendHealth {
211    /// Constructs a zero-counter health snapshot for `backend`.
212    ///
213    /// Used by backends that don't yet emit detailed counters (the
214    /// kernel path's pre-instrumented baseline) so the accessor on
215    /// [`super::JournalHandle`] always has a well-formed value to
216    /// return rather than `Option<_>`.
217    #[must_use]
218    #[inline]
219    pub const fn empty(backend: JournalBackendKind) -> Self {
220        Self {
221            backend,
222            queue_depth_current: 0,
223            queue_depth_max: 0,
224            appends_per_second: 0,
225            avg_append_latency_us: 0,
226            p99_append_latency_us: 0,
227            failed_appends: 0,
228        }
229    }
230}
231
/// Verbose selection trail produced at journal-open time.
///
/// Returned by [`super::JournalHandle::backend_info`]. Operators
/// **must** be able to verify which backend is actually serving a
/// journal — silently falling through to the kernel path when SPDK
/// was requested would invalidate downstream performance
/// expectations. This struct is that verification surface.
///
/// The struct is `#[non_exhaustive]`: code outside this crate cannot
/// build it with a struct literal; [`Self::single`] is the
/// constructor provided here.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct JournalBackendInfo {
    /// The backend currently serving this journal.
    pub selected: JournalBackendKind,
    /// Plain-English description of why this backend was chosen
    /// (e.g. `"Method::Auto resolved to kernel-io-uring (Linux 6.8,
    /// io_uring available)"`). Free-form text meant for humans.
    pub selection_reason: String,
    /// Backends considered during selection and skipped, with the
    /// reason. Allows ops to verify that SPDK was tried before
    /// falling through to the kernel path (and to see exactly which
    /// SPDK precondition failed). Each entry pairs the skipped
    /// backend with its skip reason; empty when nothing was skipped.
    pub fallbacks_skipped: Vec<(JournalBackendKind, String)>,
    /// Wall-clock time the journal opened. Useful for correlating
    /// with system logs and metric backends. [`Self::single`] stamps
    /// it with `SystemTime::now()` at the moment the record is built.
    pub opened_at: SystemTime,
}
257
258impl JournalBackendInfo {
259    /// Constructs a single-selection info record with no fallback
260    /// trail. Used by the kernel-path implementation when no
261    /// alternative was considered (e.g. user explicitly passed a
262    /// non-`Auto` method).
263    #[must_use]
264    pub fn single(selected: JournalBackendKind, selection_reason: impl Into<String>) -> Self {
265        Self {
266            selected,
267            selection_reason: selection_reason.into(),
268            fallbacks_skipped: Vec::new(),
269            opened_at: SystemTime::now(),
270        }
271    }
272}
273
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Every variant declared today, paired with the label `as_str`
    /// is expected to produce for it.
    const KIND_LABELS: [(JournalBackendKind, &str); 4] = [
        (JournalBackendKind::KernelIoUring, "kernel-io-uring"),
        (JournalBackendKind::KernelDirect, "kernel-direct"),
        (JournalBackendKind::KernelBuffered, "kernel-buffered"),
        (JournalBackendKind::Spdk, "spdk"),
    ];

    #[test]
    fn test_backend_kind_as_str_round_trip_via_match() {
        for (kind, expected) in KIND_LABELS {
            assert_eq!(kind.as_str(), expected);
        }
    }

    #[test]
    fn test_backend_kind_display_matches_as_str() {
        for (kind, _) in KIND_LABELS {
            assert_eq!(kind.to_string(), kind.as_str());
        }
    }

    #[test]
    fn test_backend_kind_is_kernel_classifies_correctly() {
        for (kind, _) in KIND_LABELS {
            // Spdk is the only variant that is not a kernel-path backend.
            let expect_kernel = kind != JournalBackendKind::Spdk;
            assert_eq!(kind.is_kernel(), expect_kernel);
        }
    }

    #[test]
    fn test_backend_health_empty_constructor_zeroes_counters() {
        let snapshot = JournalBackendHealth::empty(JournalBackendKind::KernelBuffered);
        assert_eq!(snapshot.backend, JournalBackendKind::KernelBuffered);

        let depth_gauges = [snapshot.queue_depth_current, snapshot.queue_depth_max];
        assert_eq!(depth_gauges, [0, 0]);

        let wide_counters = [
            snapshot.appends_per_second,
            snapshot.avg_append_latency_us,
            snapshot.p99_append_latency_us,
            snapshot.failed_appends,
        ];
        assert_eq!(wide_counters, [0, 0, 0, 0]);
    }

    #[test]
    fn test_backend_info_single_no_fallbacks() {
        let record = JournalBackendInfo::single(
            JournalBackendKind::KernelBuffered,
            "explicit Method::Sync request",
        );
        assert_eq!(record.selected, JournalBackendKind::KernelBuffered);
        assert!(record.selection_reason.contains("Method::Sync"));
        assert!(record.fallbacks_skipped.is_empty());

        // `opened_at` must be recent: under two seconds old. A wall clock
        // that stepped backwards makes `elapsed` fail; count that as zero.
        let age = record.opened_at.elapsed().unwrap_or(Duration::from_secs(0));
        assert!(age < Duration::from_secs(2));
    }
}