kevy_rt/
lib.rs

1//! kevy-rt — shared-nothing, thread-per-core runtime.
2//!
3//! Each core runs its own reactor (kqueue/epoll) and owns one **shard** of the
4//! keyspace (`hash(key) % nshards`). There is no shared mutable state and no
5//! lock on the hot path — cores communicate only by message passing over
6//! channels, woken via a self-pipe ([`kevy_sys::Waker`]). Connections are spread
7//! across cores by `SO_REUSEPORT`; a command whose key lives on another core is
8//! forwarded to that core, executed there, and the reply routed back to the
9//! originating connection.
10//!
11//! Per-connection reply ordering is preserved (RESP is pipelined): each command
12//! gets a monotonic seq; replies are emitted only in contiguous seq order, so an
13//! async cross-core reply never overtakes an earlier one.
14//!
15//! The cross-core channel currently uses `std::sync::mpsc` (pure Rust, zero
16//! deps); swapping in a lock-free SPSC/MPSC ring is a perf-polish item.
17//! Command semantics are injected via the [`Commands`] trait, keeping the
18//! runtime independent of the concrete command set. Part of the [kevy] server.
19//!
20//! [kevy]: https://crates.io/crates/kevy
21//!
22//! # Module map
23//!
24//! - [`Runtime`] (in `runtime`) — public entry point; spawns one `shard` per core.
25//! - `shard` — the per-core reactor: sockets, the inbound queue, reply flushing.
26//! - `exec` — command semantics: routing, execution, and result reduction.
27//! - `message` — internal cross-core work/result types.
28//! - `conn` — per-connection state (input/output, seq ring, subscriptions).
29//! - `reduce` — reply reduction (`materialize`) and pure helpers (set algebra,
30//!   shard hashing, pub/sub framing).
31//!
32//! # Example
33//!
34//! Implement [`Commands`] for your command set and run it. ([`Store`] is
35//! re-exported so you don't need a separate dependency.)
36//!
37//! ```no_run
38//! use kevy_rt::{ArgvView, Commands, Route, Runtime, Store, TxnKind};
39//! use std::sync::Arc;
40//! use std::sync::atomic::AtomicBool;
41//!
42//! #[derive(Clone)]
43//! struct MyCommands;
44//! impl Commands for MyCommands {
45//!     fn route<A: ArgvView + ?Sized>(&self, args: &A) -> Route {
46//!         if args.len() >= 2 { Route::Single(1) } else { Route::Local }
47//!     }
48//!     fn dispatch<A: ArgvView + ?Sized>(&self, _store: &mut Store, _args: &A) -> Vec<u8> {
49//!         b"+OK\r\n".to_vec()
50//!     }
51//!     fn is_quit<A: ArgvView + ?Sized>(&self, args: &A) -> bool {
52//!         args.first().is_some_and(|c| c.eq_ignore_ascii_case(b"QUIT"))
53//!     }
54//!     fn is_write<A: ArgvView + ?Sized>(&self, _args: &A) -> bool { false }
55//!     fn txn_kind<A: ArgvView + ?Sized>(&self, _args: &A) -> TxnKind { TxnKind::Other }
56//! }
57//!
58//! // One shard per core, listening on 127.0.0.1:6379, until `stop` is set.
59//! let rt = Runtime::new([127, 0, 0, 1], 6379, 4, MyCommands);
60//! rt.run(Arc::new(AtomicBool::new(false))).unwrap();
61//! ```
62// Almost entirely safe: the only `unsafe` is in `uring_reactor` (Linux io_uring),
63// which needs raw buffer pointers for zero-allocation completion I/O — on the hot
64// path toward kevy's disk-I/O-ceiling goal, where a buffer-ownership safe wrapper
65// would add per-op cost. Each such block documents its invariant; the
66// epoll/kqueue path and every other module stay safe, and all libc lives in
67// kevy-sys.
68#![deny(unsafe_op_in_unsafe_fn)]
69
70mod bio;
71mod block_xshard;
72mod blocked;
73mod lua_wake_bridge;
74mod cache_padded;
75mod cluster;
76mod conn;
77mod exec;
78mod exec_build;
79mod exec_client_intercept;
80mod exec_crossslot;
81mod exec_dispatch;
82mod exec_notify;
83mod exec_op;
84mod exec_pubsub;
85mod exec_pubsub_pattern;
86mod exec_rename;
87mod exec_slowlog;
88mod exec_watch;
89mod inbox;
90mod persist_worker;
91mod message;
92mod reduce;
93mod replica_inbox;
94mod replication;
95mod replication_apply;
96mod replication_gate;
97mod replication_io;
98mod replication_pump;
99mod reshard;
100mod route;
101mod runtime;
102mod runtime_builders;
103mod shard;
104mod shard_flush;
105mod shard_lifecycle;
106mod shard_tick;
107#[cfg(target_os = "linux")]
108mod uring_arm;
109#[cfg(target_os = "linux")]
110mod uring_bigbulk;
111#[cfg(target_os = "linux")]
112mod uring_bigbulk_b2alt;
113#[cfg(target_os = "linux")]
114mod uring_bigbulk_probe;
115#[cfg(target_os = "linux")]
116mod uring_conn;
117#[cfg(target_os = "linux")]
118mod uring_inbox;
119#[cfg(target_os = "linux")]
120mod uring_io;
121#[cfg(target_os = "linux")]
122mod uring_park;
123#[cfg(target_os = "linux")]
124mod uring_reactor;
125
126pub use blocked::{BlockHint, BlockKind};
127pub use lua_wake_bridge::push_lua_wake_key;
128pub use reduce::shard_of as shard_of_key;
129pub use cluster::shard_slot_range;
130pub use exec_slowlog::{SlowlogSub, parse_slowlog_sub};
131pub use kevy_config::NotificationFlags;
132pub use kevy_persist::Fsync;
133pub use kevy_resp::{Argv, ArgvBorrowed, ArgvView, RespVersion};
134pub use kevy_store::Store;
135pub use replica_inbox::{ReplicaApply, ReplicaInboxReceiver, ReplicaInboxSender, replica_inbox_pair};
136pub use replication_gate::ReplicatedApplyGuard;
137pub use route::{Route, XGroupCtx};
138pub use runtime::Runtime;
139
140/// Command-set semantics injected into the runtime. Cloned to every core, so it
141/// must be cheap/stateless to clone.
142pub trait Commands: Clone + Send + 'static {
143    /// Classify how a command is routed across shards.
144    fn route<A: ArgvView + ?Sized>(&self, args: &A) -> Route;
145    /// Execute a full command against one shard's store, returning RESP bytes.
146    fn dispatch<A: ArgvView + ?Sized>(&self, store: &mut Store, args: &A) -> Vec<u8>;
147    /// RESP3 variant of [`Self::dispatch`] — called when the connection
148    /// has negotiated `HELLO 3`. Default: delegate to the RESP2 path
149    /// (the cross-shard forward carries a per-cmd `RespVersion`
150    /// so a V2 client and a V3 client can share the owning shard).
151    fn dispatch_resp3<A: ArgvView + ?Sized>(&self, store: &mut Store, args: &A) -> Vec<u8> {
152        self.dispatch(store, args)
153    }
154    /// Execute a command, appending the RESP reply to `out`. The in-order local
155    /// fast path uses this to write straight into the connection's output buffer
156    /// (no per-command reply `Vec`). Default: delegate to [`dispatch`](Self::dispatch).
157    fn dispatch_into<A: ArgvView + ?Sized>(&self, store: &mut Store, args: &A, out: &mut Vec<u8>) {
158        out.extend_from_slice(&self.dispatch(store, args));
159    }
160    /// RESP3 variant of [`Self::dispatch_into`] — called when the
161    /// connection has negotiated `HELLO 3`. Default: delegate to the
162    /// RESP2 path (so a server that hasn't migrated any replies still
163    /// works correctly with a RESP3 client, per spec). Override per
164    /// command to emit RESP3 shapes (Map / Set / Double / …).
165    fn dispatch_into_resp3<A: ArgvView + ?Sized>(
166        &self,
167        store: &mut Store,
168        args: &A,
169        out: &mut Vec<u8>,
170    ) {
171        self.dispatch_into(store, args, out);
172    }
173    /// Classify a command for keyspace notifications. Returns `Some`
174    /// for write commands that should fire a notification when the
175    /// corresponding flag is enabled; `None` for read-only / no-op /
176    /// not-yet-classified commands (those never publish). Default
177    /// `None` so non-kevy embedders pay nothing.
178    fn notify_class<A: ArgvView + ?Sized>(&self, _args: &A) -> Option<NotifyClass> {
179        None
180    }
181
182    /// Handle `HELLO` — return the new connection protocol version + the
183    /// reply bytes. The runtime applies the new version to the conn
184    /// before scheduling the reply, so a `HELLO 3` ack itself comes out
185    /// shaped as a RESP3 Map (the new protocol is in effect for its own
186    /// reply).
187    ///
188    /// Default: ignore the args, keep `current_proto`, emit a minimal
189    /// RESP2 +OK so embedders that don't care still see a sane reply.
190    /// kevy's own impl in `kevy::KevyCommands` parses the optional
191    /// protover and emits the full server-info shape.
192    fn hello_reply<A: ArgvView + ?Sized>(
193        &self,
194        _args: &A,
195        current_proto: RespVersion,
196    ) -> (RespVersion, Vec<u8>) {
197        (current_proto, b"+OK\r\n".to_vec())
198    }
199    /// Whether this command should close the connection (QUIT).
200    fn is_quit<A: ArgvView + ?Sized>(&self, args: &A) -> bool;
201    /// Whether this command mutates the keyspace (so it must be logged to the AOF).
202    fn is_write<A: ArgvView + ?Sized>(&self, args: &A) -> bool;
203    /// Transaction-control classification (MULTI/EXEC/DISCARD vs anything else).
204    fn txn_kind<A: ArgvView + ?Sized>(&self, args: &A) -> TxnKind;
205    /// Called once per shard, immediately after [`Store::new`], before the
206    /// reactor enters its event loop. Implementations install per-shard
207    /// configuration that the runtime doesn't know about — currently the
208    /// `maxmemory` + eviction-policy pair, which kevy ships via its own
209    /// process-wide config snapshot. Default: no-op so non-kevy embedders
210    /// aren't forced to override.
211    fn on_shard_init(&self, _store: &mut Store) {}
212
213    /// Called once on the shard's own thread, first thing in the reactor
214    /// entry (both reactors), before restore/replay. Implementations that
215    /// need per-shard identity at dispatch time (e.g. kevy's `CLUSTER MYID`
216    /// / `CLUSTER NODES` `myself` flag) stash `shard` in a thread-local here
217    /// — in a thread-per-core runtime the current thread *is* the shard.
218    /// Default: no-op.
219    fn on_shard_start(&self, _shard: usize) {}
220
221    /// Per-tick persistence-stats publication: whether this shard has a
222    /// background save/rewrite in flight and how many AOF rewrites have
223    /// completed since open. Command layers that serve `INFO persistence`
224    /// stash these in a thread-local (thread-per-core: the answering
225    /// thread *is* the shard, same pattern as [`Self::on_shard_start`]).
226    /// Default: no-op.
227    fn on_persist_stats(&self, _in_flight: bool, _aof_rewrites_total: u64) {}
228
229    /// Per-tick replication-view publication: the answering shard's
230    /// current `master_repl_offset` (== `ReplicationSource::next_offset()`)
231    /// plus the per-replica `(ipv4, port, sent_offset)` triple for
232    /// every handshake-complete replica (in `AckSent`, `Streaming`,
233    /// or `SnapshotShipping`). `connected_slaves` for `INFO` /
234    /// `ROLE` is derived as `replicas.len()`.
235    /// Only called when this shard has a `ReplicationSource`
236    /// installed (i.e. `Runtime::with_replication(true, ...)` was
237    /// requested); standalone setups pay nothing. Command layers
238    /// that serve `ROLE` / `INFO replication` stash the values in a
239    /// thread-local (thread-per-core: the answering thread *is* the
240    /// shard, same pattern as [`Self::on_persist_stats`]). Default
241    /// no-op.
242    fn on_replication_view(
243        &self,
244        _master_repl_offset: u64,
245        _replicas: Vec<(std::net::Ipv4Addr, u16, u64)>,
246    ) {}
247
248    /// Periodic shard housekeeping (the equivalent of Redis's `serverCron`).
249    /// kevy uses this to run [`Store::tick_expire`] at the configured
250    /// `[expiry].hz`. Default no-op so non-kevy embedders / runtimes can
251    /// ignore it.
252    fn on_shard_tick(&self, _store: &mut Store) {}
253
254    /// Called once per client command at dispatch entry (before routing /
255    /// fan-out, so a multi-key command counts once). kevy uses it for
256    /// `INFO stats: total_commands_processed`. Hot path — keep it to a single
257    /// thread-local bump. Default no-op so non-kevy embedders pay nothing.
258    fn on_command(&self) {}
259
260    /// Called once per accepted client connection. kevy uses it for
261    /// `INFO stats: total_connections_received`. Default no-op.
262    fn on_connection(&self) {}
263
264    /// Interval between [`Self::on_shard_tick`] calls. Default 100 ms
265    /// (matching Redis's `hz = 10`). `0` disables ticking entirely.
266    fn shard_tick_interval_ms(&self) -> u64 {
267        100
268    }
269
270    /// Snapshot of the runtime-owned knobs that can be hot-modified
271    /// (the kevy server wires this to `CONFIG SET`). Called once per
272    /// shard tick — each `Some` value is applied to the shard's live
273    /// state; each `None` keeps the existing setting untouched.
274    ///
275    /// Default returns all-None so embedders that never hot-swap config
276    /// pay nothing beyond one struct-build per tick. The cost lives in
277    /// the impl's read of its own config source.
278    fn live_runtime_config(&self) -> LiveRuntimeConfig {
279        LiveRuntimeConfig::default()
280    }
281
282    /// Index into `args` of the key whose write may wake a blocked waiter
283    /// (`LPUSH` / `RPUSH` feed `BLPOP` / `BRPOP`; `XADD` feeds the stream
284    /// blocks). `Some(1)` for those verbs, `None` for everything else. The
285    /// in-shard fast path reads this off [`ResolvedCmd::wake_idx`]; the
286    /// cross-shard write path (`exec_op`, where a forwarded write
287    /// lands on the key's owning shard) re-derives it via this method since
288    /// the forwarded envelope doesn't carry the resolved hint. Default
289    /// `None` so non-blocking embedders pay nothing.
290    fn wake_idx<A: ArgvView + ?Sized>(&self, _args: &A) -> Option<u8> {
291        None
292    }
293
294    /// Classify a command for blocking semantics. `BlockHint::None`
295    /// (default) is the zero-cost answer for every non-blocking verb;
296    /// the dispatcher only registers a waiter when this returns
297    /// `BlockHint::Block` *and* the command's `dispatch_into` produced no
298    /// reply (i.e. it could not satisfy itself immediately — e.g. BLPOP
299    /// on an empty list). Concrete impls should fold this into their
300    /// override of [`Self::resolve`] so the verb-table lookup happens
301    /// once per command.
302    fn block_hint<A: ArgvView + ?Sized>(&self, _args: &A) -> BlockHint {
303        BlockHint::None
304    }
305
306    /// Rewrite `args` into the owned [`Argv`] that the dispatcher will
307    /// store as the parked waiter's command and replay on wake. Lets a
308    /// command set normalise positional ID / cursor arguments that would
309    /// otherwise re-resolve to a different value on retry — most notably
310    /// `XREAD BLOCK ... STREAMS k $`, where leaving `$` literal in the
311    /// retried argv causes a fresh re-resolve to the post-`XADD` last_id
312    /// and zero matching entries (the wake hangs).
313    ///
314    /// Default: just materialise the argv unchanged. Concrete impls only
315    /// need to override when a registered command carries an arg whose
316    /// meaning depends on store state at park time (`XREAD $`, the
317    /// classic case).
318    ///
319    /// For the cross-shard arbiter this runs on the **target** shard (the
320    /// one that owns the key) when the waiter is armed, so `$` snapshots
321    /// the target's real `last_id` — not the origin shard's (which may not
322    /// hold the stream at all).
323    fn resolve_block_argv<A: ArgvView + ?Sized>(
324        &self,
325        _store: &mut Store,
326        args: &A,
327        _kind: BlockKind,
328    ) -> Argv {
329        args.to_argv()
330    }
331
332    /// Build the **single-key** command the dispatcher will replay to
333    /// satisfy one watched `key` of a (possibly multi-key) blocking
334    /// command. `args` is the original command; `key` is one of its
335    /// watched keys. Returns an [`Argv`] that, when dispatched, pops /
336    /// reads only `key` — e.g. `BLPOP k1 k2 0` watching `k2` yields
337    /// `BLPOP k2 0`; `XREAD … STREAMS s1 s2 id1 id2` watching `s2`
338    /// yields `XREAD … STREAMS s2 id2`.
339    ///
340    /// Any state-dependent positional arg (`$`) is left **literal** here —
341    /// it's frozen later by [`Self::resolve_block_argv`] on the key's
342    /// owning shard. No store access needed (pure argv slicing). Default:
343    /// the unchanged argv (single-key blocking commands need no rewrite).
344    fn block_serve_argv<A: ArgvView + ?Sized>(
345        &self,
346        args: &A,
347        _kind: BlockKind,
348        _key: &[u8],
349    ) -> Argv {
350        args.to_argv()
351    }
352
353    /// Non-destructive readiness peek for a parked waiter: would replaying
354    /// `serve_argv` (built by [`Self::block_serve_argv`], `$` already
355    /// frozen) produce a reply right now? Runs on the key's owning shard
356    /// when arming and is the gate for emitting a cross-shard wake. Must
357    /// NOT mutate the store (no pop / no group-cursor advance). Default
358    /// `false` so non-blocking embedders never spuriously wake.
359    fn block_ready<A: ArgvView + ?Sized>(
360        &self,
361        _store: &mut Store,
362        _serve_argv: &A,
363        _kind: BlockKind,
364    ) -> bool {
365        false
366    }
367
368    /// Resolve all verb-dependent attributes in **one** verb-table lookup.
369    /// The default implementation calls the per-attribute methods above
370    /// (five upper_verb scans + matches); concrete impls SHOULD override
371    /// this with a single match so the reactor's hot path pays the verb-
372    /// resolution cost only once per command.
373    fn resolve<A: ArgvView + ?Sized>(&self, args: &A) -> ResolvedCmd {
374        ResolvedCmd {
375            txn_kind: self.txn_kind(args),
376            route: self.route(args),
377            is_quit: self.is_quit(args),
378            is_write: self.is_write(args),
379            block_hint: self.block_hint(args),
380            wake_idx: None,
381        }
382    }
383}
384
385/// Per-command verb-resolution result. Produced once by [`Commands::resolve`]
386/// in the reactor's parse-then-dispatch loop, reused for routing decisions,
387/// AOF logging, and the QUIT branch — so the per-cmd `upper_verb` cost goes
388/// from 4× down to 1×.
389pub struct ResolvedCmd {
390    pub txn_kind: TxnKind,
391    pub route: Route,
392    pub is_quit: bool,
393    pub is_write: bool,
394    /// Blocking-command classification (see [`Commands::block_hint`]).
395    /// `BlockHint::None` for every non-blocking verb.
396    pub block_hint: BlockHint,
397    /// Index into `args` whose write may wake a `BLPOP` / `XREAD BLOCK`
398    /// waiter parked on that key — `Some(1)` for `LPUSH` / `RPUSH` /
399    /// `XADD`, `None` for every other command (including reads). The
400    /// dispatcher's wake hook is gated on both this being `Some` *and*
401    /// the per-shard `BlockedClients` registry being non-empty, so the
402    /// steady-state cost when nobody is parked is one `is_empty()` check.
403    pub wake_idx: Option<u8>,
404}
405
/// Category of keyspace-notification event a write command falls into.
/// The runtime matches it against the per-conn `notify_keyspace_events`
/// flags before publishing anything. Each variant's doc leads with the
/// flag letter it corresponds to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NotifyClass {
    /// `g` — generic key commands (DEL / EXPIRE / PERSIST / RENAME / TYPE).
    Generic,
    /// `$` — string commands (SET / GETSET / INCR / APPEND / MSET).
    String,
    /// `l` — list commands (LPUSH / RPUSH / LPOP / LREM / LTRIM / …).
    List,
    /// `s` — set commands (SADD / SREM / SPOP / …).
    Set,
    /// `h` — hash commands (HSET / HDEL / HINCRBY / …).
    Hash,
    /// `z` — sorted-set commands (ZADD / ZREM / ZINCRBY / …).
    Zset,
    /// `t` — stream commands (XADD / XDEL / XTRIM / XGROUP / XACK /
    /// XCLAIM / XREADGROUP / …), matching Redis's `t` class.
    Stream,
}
427
428impl NotifyClass {
429    /// Whether `flags` enables this event class.
430    #[inline]
431    pub fn enabled_in(self, flags: &NotificationFlags) -> bool {
432        match self {
433            NotifyClass::Generic => flags.generic,
434            NotifyClass::String => flags.string,
435            NotifyClass::List => flags.list,
436            NotifyClass::Set => flags.set,
437            NotifyClass::Hash => flags.hash,
438            NotifyClass::Zset => flags.zset,
439            NotifyClass::Stream => flags.stream,
440        }
441    }
442}
443
/// How a command relates to transaction control: one of the
/// transaction verbs, or anything else.
pub enum TxnKind {
    /// The `MULTI` command.
    Multi,
    /// The `EXEC` command.
    Exec,
    /// The `DISCARD` command.
    Discard,
    /// The `WATCH` command. Outside MULTI it runs the fan-out; inside MULTI
    /// it is rejected with an error (Redis semantics: `WATCH inside MULTI
    /// is not allowed`). Note that `UNWATCH` is *not* classified here — it
    /// is plain [`Self::Other`]: outside MULTI it routes to
    /// [`Route::Unwatch`] (clear + OK), and inside MULTI it queues as a
    /// no-op that dispatch resolves to +OK at EXEC time.
    Watch,
    /// Any command that is not one of the transaction-control verbs above.
    Other,
}
457
458/// Live snapshot of the runtime-owned knobs that may have been changed
459/// since this shard's last tick. Built by the [`Commands`] impl from
460/// its own config source (e.g. kevy reads `config_global`). Each
461/// `Some(_)` is applied to the shard; each `None` leaves the existing
462/// setting alone.
463///
464/// One snapshot is built per tick (every 100 ms by default), so its
465/// cost is amortised across thousands of commands.
466#[derive(Debug, Default, Clone, Copy)]
467pub struct LiveRuntimeConfig {
468    /// AOF fsync policy. Applied via `Aof::set_fsync` — switching to
469    /// `Always` mid-flight also flushes any buffered bytes so the new
470    /// "every write is on disk before reply" contract is honoured from
471    /// the next append onward.
472    pub appendfsync: Option<Fsync>,
473    /// `auto_aof_rewrite_percentage`. `0` disables the auto-trigger.
474    pub auto_aof_rewrite_pct: Option<u32>,
475    /// `auto_aof_rewrite_min_size` in bytes.
476    pub auto_aof_rewrite_min_size: Option<u64>,
477    /// New tick interval in ms (`1000/hz`). `0` disables ticking
478    /// entirely — note that disabling also turns off active TTL
479    /// expiry and the auto-rewrite tick path. Lazy expiry on access
480    /// always still works.
481    pub tick_interval_ms: Option<u64>,
482    /// `notify_keyspace_events` flags. Parsed by the [`Commands`]
483    /// impl from its config source (e.g. kevy reads
484    /// `config_global` + [`kevy_config::parse_notification_flags`]).
485    /// Default-empty flags mean OFF — writes pay one bool-OR check
486    /// and skip every per-key keyspace notification publish.
487    pub notify_flags: Option<NotificationFlags>,
488    /// `[slowlog].slower_than_micros` — `-1` disables, `0` records all,
489    /// `>0` is the strict micros threshold. `None` keeps the existing
490    /// shard setting (set by the [`Runtime`] builder at startup).
491    pub slowlog_slower_than_micros: Option<i64>,
492    /// `[slowlog].max_len` — ring cap per shard. Shrinking trims the
493    /// oldest entries on the next tick application.
494    pub slowlog_max_len: Option<u32>,
495}
kevy_rt/lib.rs

kevy_rt/
lib.rs