Skip to main content

ursula_runtime/
lib.rs

1use std::collections::{HashMap, HashSet, VecDeque};
2use std::fmt;
3use std::fs::{self, File, OpenOptions};
4use std::future::Future;
5use std::io;
6use std::io::{BufRead, BufReader, Write};
7use std::path::{Path, PathBuf};
8use std::pin::Pin;
9use std::sync::Arc;
10use std::sync::atomic::{AtomicU64, Ordering};
11use std::time::{Instant, SystemTime, UNIX_EPOCH};
12
13use bytes::Bytes;
14use opendal::{Operator, Scheme};
15use serde::{Deserialize, Serialize};
16use tokio::sync::{Semaphore, mpsc, oneshot};
17use tokio::task::JoinSet;
18use ursula_shard::{
19    BucketStreamId, CoreId, RaftGroupId, ShardId, ShardMapError, ShardPlacement, StaticShardMap,
20};
21use ursula_stream::{
22    AppendStreamInput, ObjectPayloadRef, StreamCommand, StreamMessageRecord, StreamReadPlan,
23    StreamReadSegment, StreamResponse, StreamSnapshot, StreamStateMachine,
24};
25pub use ursula_stream::{
26    ColdChunkRef, ColdFlushCandidate, ExternalPayloadRef, ProducerRequest, StreamErrorCode,
27};
28
/// Content type assigned by the `AppendRequest` and `AppendBatchRequest`
/// constructors in this file.
const DEFAULT_CONTENT_TYPE: &str = "application/octet-stream";
/// Process-wide counter appended to generated cold-object paths
/// (`new_cold_chunk_path`, `new_external_payload_path`) next to the clock value.
static COLD_CHUNK_SEQUENCE: AtomicU64 = AtomicU64::new(0);
31
/// Wrapper around an OpenDAL [`Operator`] through which cold chunks/objects
/// are written, read and deleted.
///
/// Instances come from `memory`, `fs`, `s3_from_env`, `s3_from_env_with_root`
/// or `from_env`.
#[derive(Clone, Debug)]
pub struct ColdStore {
    /// Operator every storage call in `impl ColdStore` is routed through.
    operator: Operator,
}
36
/// Reference-counted handle to a [`ColdStore`]; this is what `ColdStore::from_env` returns.
pub type ColdStoreHandle = Arc<ColdStore>;
38
39impl ColdStore {
40    pub fn memory() -> io::Result<Self> {
41        let operator = Operator::via_iter(Scheme::Memory, [])
42            .map_err(|err| io::Error::other(err.to_string()))?;
43        Ok(Self { operator })
44    }
45
46    pub fn fs(root: impl AsRef<Path>) -> io::Result<Self> {
47        let root = root.as_ref();
48        fs::create_dir_all(root)?;
49        let operator = Operator::via_iter(
50            Scheme::Fs,
51            [("root".to_owned(), root.to_string_lossy().to_string())],
52        )
53        .map_err(|err| io::Error::other(err.to_string()))?;
54        Ok(Self { operator })
55    }
56
57    pub fn s3_from_env() -> io::Result<Self> {
58        Self::s3_from_env_with_root(None)
59    }
60
61    pub fn s3_from_env_with_root(root_override: Option<&str>) -> io::Result<Self> {
62        let bucket = std::env::var("URSULA_COLD_S3_BUCKET").map_err(|_| {
63            io::Error::new(
64                io::ErrorKind::InvalidInput,
65                "URSULA_COLD_S3_BUCKET is required when URSULA_COLD_BACKEND=s3",
66            )
67        })?;
68        if bucket.trim().is_empty() {
69            return Err(io::Error::new(
70                io::ErrorKind::InvalidInput,
71                "URSULA_COLD_S3_BUCKET must not be empty",
72            ));
73        }
74
75        let mut builder = opendal::services::S3::default().bucket(&bucket);
76        if let Some(root) = root_override {
77            if !root.trim().is_empty() {
78                builder = builder.root(root);
79            }
80        } else if let Ok(root) = std::env::var("URSULA_COLD_ROOT")
81            && !root.trim().is_empty()
82        {
83            builder = builder.root(&root);
84        }
85        if let Ok(region) = std::env::var("URSULA_COLD_S3_REGION")
86            && !region.trim().is_empty()
87        {
88            builder = builder.region(&region);
89        }
90        if let Ok(endpoint) = std::env::var("URSULA_COLD_S3_ENDPOINT")
91            && !endpoint.trim().is_empty()
92        {
93            builder = builder.endpoint(&endpoint);
94        }
95        if let Ok(access_key_id) = std::env::var("URSULA_COLD_S3_ACCESS_KEY_ID")
96            && !access_key_id.trim().is_empty()
97        {
98            builder = builder.access_key_id(&access_key_id);
99        }
100        if let Ok(secret_access_key) = std::env::var("URSULA_COLD_S3_SECRET_ACCESS_KEY")
101            && !secret_access_key.trim().is_empty()
102        {
103            builder = builder.secret_access_key(&secret_access_key);
104        }
105        if let Ok(session_token) = std::env::var("URSULA_COLD_S3_SESSION_TOKEN")
106            && !session_token.trim().is_empty()
107        {
108            builder = builder.session_token(&session_token);
109        }
110
111        Ok(Self {
112            operator: Operator::new(builder)
113                .map_err(|err| io::Error::other(err.to_string()))?
114                .finish(),
115        })
116    }
117
118    pub fn from_env() -> io::Result<Option<ColdStoreHandle>> {
119        let backend = std::env::var("URSULA_COLD_BACKEND")
120            .unwrap_or_else(|_| "none".to_owned())
121            .to_ascii_lowercase();
122        let store = match backend.as_str() {
123            "none" | "disabled" | "off" => return Ok(None),
124            "memory" | "mem" | "inmem" => Self::memory()?,
125            "fs" => {
126                let root =
127                    std::env::var("URSULA_COLD_ROOT").unwrap_or_else(|_| "data/cold".to_owned());
128                Self::fs(root)?
129            }
130            "s3" => Self::s3_from_env()?,
131            other => {
132                return Err(io::Error::new(
133                    io::ErrorKind::InvalidInput,
134                    format!("unsupported URSULA_COLD_BACKEND '{other}'"),
135                ));
136            }
137        };
138        Ok(Some(Arc::new(store)))
139    }
140
141    pub async fn write_chunk(&self, path: &str, payload: &[u8]) -> io::Result<u64> {
142        if path.trim().is_empty() {
143            return Err(io::Error::new(
144                io::ErrorKind::InvalidInput,
145                "cold chunk path must not be empty",
146            ));
147        }
148        self.operator
149            .write(path, payload.to_vec())
150            .await
151            .map_err(|err| cold_store_io_error(path, err))?;
152        Ok(u64::try_from(payload.len()).expect("payload len fits u64"))
153    }
154
155    pub async fn delete_chunk(&self, path: &str) -> io::Result<()> {
156        if path.trim().is_empty() {
157            return Err(io::Error::new(
158                io::ErrorKind::InvalidInput,
159                "cold chunk path must not be empty",
160            ));
161        }
162        self.operator
163            .delete(path)
164            .await
165            .map_err(|err| cold_store_io_error(path, err))
166    }
167
168    pub async fn remove_all(&self, path: &str) -> io::Result<()> {
169        self.operator
170            .remove_all(path)
171            .await
172            .map_err(|err| cold_store_io_error(path, err))
173    }
174
175    pub async fn read_chunk_range(
176        &self,
177        chunk: &ColdChunkRef,
178        read_start_offset: u64,
179        len: usize,
180    ) -> io::Result<Vec<u8>> {
181        let object = ObjectPayloadRef {
182            start_offset: chunk.start_offset,
183            end_offset: chunk.end_offset,
184            s3_path: chunk.s3_path.clone(),
185            object_size: chunk.object_size,
186        };
187        self.read_object_range(&object, read_start_offset, len)
188            .await
189    }
190
191    pub async fn read_object_range(
192        &self,
193        object: &ObjectPayloadRef,
194        read_start_offset: u64,
195        len: usize,
196    ) -> io::Result<Vec<u8>> {
197        if len == 0 {
198            return Ok(Vec::new());
199        }
200        let len_u64 = u64::try_from(len).map_err(|_| {
201            io::Error::new(io::ErrorKind::InvalidInput, "cold read length exceeds u64")
202        })?;
203        let read_end = read_start_offset.checked_add(len_u64).ok_or_else(|| {
204            io::Error::new(io::ErrorKind::InvalidInput, "cold read range overflow")
205        })?;
206        if read_start_offset < object.start_offset || read_end > object.end_offset {
207            return Err(io::Error::new(
208                io::ErrorKind::InvalidInput,
209                format!(
210                    "cold read range [{read_start_offset}..{read_end}) is outside object segment [{}..{})",
211                    object.start_offset, object.end_offset
212                ),
213            ));
214        }
215        let object_start = read_start_offset - object.start_offset;
216        let object_end = object_start.checked_add(len_u64).ok_or_else(|| {
217            io::Error::new(io::ErrorKind::InvalidInput, "cold read range overflow")
218        })?;
219        if object_end > object.object_size {
220            return Err(io::Error::new(
221                io::ErrorKind::InvalidData,
222                format!(
223                    "cold read range [{object_start}..{object_end}) is outside object '{}' size {}",
224                    object.s3_path, object.object_size
225                ),
226            ));
227        }
228        let bytes = self
229            .operator
230            .read_with(&object.s3_path)
231            .range(object_start..object_end)
232            .await
233            .map_err(|err| cold_store_io_error(&object.s3_path, err))?
234            .to_bytes();
235        if bytes.len() != len {
236            return Err(io::Error::new(
237                io::ErrorKind::InvalidData,
238                format!(
239                    "cold object '{}' returned {} bytes for requested range [{}..{})",
240                    object.s3_path,
241                    bytes.len(),
242                    object_start,
243                    object_end
244                ),
245            ));
246        }
247        Ok(bytes.to_vec())
248    }
249}
250
251fn cold_store_io_error(path: &str, err: opendal::Error) -> io::Error {
252    io::Error::other(format!("cold object '{path}': {err}"))
253}
254
255pub fn new_cold_chunk_path(
256    stream_id: &BucketStreamId,
257    start_offset: u64,
258    end_offset: u64,
259) -> String {
260    let unix_nanos = SystemTime::now()
261        .duration_since(UNIX_EPOCH)
262        .map(|duration| duration.as_nanos())
263        .unwrap_or(0);
264    let sequence = COLD_CHUNK_SEQUENCE.fetch_add(1, Ordering::Relaxed);
265    format!(
266        "{stream_id}/chunks/{start_offset:016x}-{end_offset:016x}-{unix_nanos:032x}-{sequence:016x}.bin"
267    )
268}
269
270pub fn new_external_payload_path(stream_id: &BucketStreamId) -> String {
271    let unix_nanos = SystemTime::now()
272        .duration_since(UNIX_EPOCH)
273        .map(|duration| duration.as_nanos())
274        .unwrap_or(0);
275    let sequence = COLD_CHUNK_SEQUENCE.fetch_add(1, Ordering::Relaxed);
276    format!("{stream_id}/external/{unix_nanos:032x}-{sequence:016x}.bin")
277}
278
/// Parameters for creating a stream with an inline initial payload.
///
/// The `From` impls in this file copy every field except
/// `content_type_explicit` into `GroupWriteCommand::CreateStream`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CreateStreamRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    /// Set to `true` by `CreateStreamRequest::new`.
    /// NOTE(review): presumably marks a caller-supplied (vs. defaulted)
    /// content type — confirm with the code that reads it.
    pub content_type_explicit: bool,
    /// Inline payload; `new` starts it empty.
    pub initial_payload: Bytes,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub stream_ttl_seconds: Option<u64>,
    pub stream_expires_at_ms: Option<u64>,
    pub forked_from: Option<BucketStreamId>,
    pub fork_offset: Option<u64>,
    /// Caller-supplied timestamp; `new` initialises it to 0.
    /// Presumably Unix epoch milliseconds — TODO confirm.
    pub now_ms: u64,
}
294
/// Stream-creation parameters whose initial payload is an
/// `ExternalPayloadRef` instead of inline bytes.
///
/// Built from a `CreateStreamRequest` by `from_create_request` and converted
/// into `GroupWriteCommand::CreateExternal` by the `From` impls in this file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CreateStreamExternalRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    /// Reference to the payload stored outside the command.
    pub initial_payload: ExternalPayloadRef,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub stream_ttl_seconds: Option<u64>,
    pub stream_expires_at_ms: Option<u64>,
    pub forked_from: Option<BucketStreamId>,
    pub fork_offset: Option<u64>,
    /// Caller-supplied timestamp; presumably Unix epoch milliseconds — TODO confirm.
    pub now_ms: u64,
}
309
310impl CreateStreamExternalRequest {
311    pub fn from_create_request(
312        request: CreateStreamRequest,
313        initial_payload: ExternalPayloadRef,
314    ) -> Self {
315        Self {
316            stream_id: request.stream_id,
317            content_type: request.content_type,
318            initial_payload,
319            close_after: request.close_after,
320            stream_seq: request.stream_seq,
321            producer: request.producer,
322            stream_ttl_seconds: request.stream_ttl_seconds,
323            stream_expires_at_ms: request.stream_expires_at_ms,
324            forked_from: request.forked_from,
325            fork_offset: request.fork_offset,
326            now_ms: request.now_ms,
327        }
328    }
329}
330
331impl CreateStreamRequest {
332    pub fn new(stream_id: BucketStreamId, content_type: impl Into<String>) -> Self {
333        Self {
334            stream_id,
335            content_type: content_type.into(),
336            content_type_explicit: true,
337            initial_payload: Bytes::new(),
338            close_after: false,
339            stream_seq: None,
340            producer: None,
341            stream_ttl_seconds: None,
342            stream_expires_at_ms: None,
343            forked_from: None,
344            fork_offset: None,
345            now_ms: 0,
346        }
347    }
348}
349
/// Serializable result of a stream-create operation.
///
/// NOTE(review): the code that fills this in is outside this chunk; field
/// meanings below are read from the names only.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CreateStreamResponse {
    pub placement: ShardPlacement,
    pub next_offset: u64,
    pub closed: bool,
    // Presumably true when the stream was already present — confirm.
    pub already_exists: bool,
    pub group_commit_index: u64,
}
358
/// Input for a stream head lookup: the target stream plus a caller-supplied
/// timestamp.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HeadStreamRequest {
    pub stream_id: BucketStreamId,
    // Presumably Unix epoch milliseconds — TODO confirm.
    pub now_ms: u64,
}
364
/// Stream metadata returned by a head lookup.
///
/// NOTE(review): the producer of this value is outside this chunk; field
/// meanings are read from the names only.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HeadStreamResponse {
    pub placement: ShardPlacement,
    pub content_type: String,
    pub tail_offset: u64,
    pub closed: bool,
    pub stream_ttl_seconds: Option<u64>,
    pub stream_expires_at_ms: Option<u64>,
    pub snapshot_offset: Option<u64>,
}
375
/// Input for reading from a stream: target stream, starting offset, a length
/// cap and a caller-supplied timestamp.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadStreamRequest {
    pub stream_id: BucketStreamId,
    pub offset: u64,
    // Presumably an upper bound on returned payload bytes — confirm.
    pub max_len: usize,
    pub now_ms: u64,
}
383
/// Fully materialised read result.
///
/// Produced by `GroupReadStreamParts::into_response` and accepted back by
/// `GroupReadStreamParts::from_response`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadStreamResponse {
    pub placement: ShardPlacement,
    pub offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    /// The bytes read.
    pub payload: Vec<u8>,
    pub up_to_date: bool,
    pub closed: bool,
}
394
/// Payload side of a group read: either bytes already in memory or a read
/// plan that `GroupReadStreamParts::into_response` resolves later.
pub enum GroupReadStreamBody {
    /// Payload bytes that are already available.
    Materialized(Vec<u8>),
    /// Deferred read; resolved through
    /// `InMemoryGroupEngine::read_payload_from_plan` with the optional cold store.
    Planned {
        stream_id: BucketStreamId,
        plan: StreamReadPlan,
        cold_store: Option<ColdStoreHandle>,
    },
    /// Test-only body: `into_response` notifies `entered`, waits for
    /// `release`, then yields `payload`.
    #[cfg(test)]
    Blocking {
        entered: Arc<tokio::sync::Notify>,
        release: Arc<tokio::sync::Notify>,
        payload: Vec<u8>,
    },
}
409
/// Read-response metadata paired with a [`GroupReadStreamBody`].
///
/// `into_response` turns this into a `ReadStreamResponse` by resolving the
/// body into bytes.
pub struct GroupReadStreamParts {
    pub placement: ShardPlacement,
    pub offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    pub up_to_date: bool,
    pub closed: bool,
    /// Materialised or still-planned payload.
    pub body: GroupReadStreamBody,
}
419
420impl GroupReadStreamParts {
421    pub fn from_response(response: ReadStreamResponse) -> Self {
422        Self {
423            placement: response.placement,
424            offset: response.offset,
425            next_offset: response.next_offset,
426            content_type: response.content_type,
427            up_to_date: response.up_to_date,
428            closed: response.closed,
429            body: GroupReadStreamBody::Materialized(response.payload),
430        }
431    }
432
433    pub fn from_plan(
434        placement: ShardPlacement,
435        stream_id: BucketStreamId,
436        plan: StreamReadPlan,
437        cold_store: Option<ColdStoreHandle>,
438    ) -> Self {
439        Self {
440            placement,
441            offset: plan.offset,
442            next_offset: plan.next_offset,
443            content_type: plan.content_type.clone(),
444            up_to_date: plan.up_to_date,
445            closed: plan.closed,
446            body: GroupReadStreamBody::Planned {
447                stream_id,
448                plan,
449                cold_store,
450            },
451        }
452    }
453
454    pub async fn into_response(self) -> Result<ReadStreamResponse, GroupEngineError> {
455        let payload = match &self.body {
456            GroupReadStreamBody::Materialized(payload) => payload.clone(),
457            GroupReadStreamBody::Planned {
458                stream_id,
459                plan,
460                cold_store,
461            } => {
462                InMemoryGroupEngine::read_payload_from_plan(cold_store.as_ref(), stream_id, plan)
463                    .await?
464            }
465            #[cfg(test)]
466            GroupReadStreamBody::Blocking {
467                entered,
468                release,
469                payload,
470            } => {
471                entered.notify_one();
472                release.notified().await;
473                payload.clone()
474            }
475        };
476        Ok(ReadStreamResponse {
477            placement: self.placement,
478            offset: self.offset,
479            next_offset: self.next_offset,
480            content_type: self.content_type,
481            payload,
482            up_to_date: self.up_to_date,
483            closed: self.closed,
484        })
485    }
486
487    fn payload_is_empty(&self) -> bool {
488        match &self.body {
489            GroupReadStreamBody::Materialized(payload) => payload.is_empty(),
490            GroupReadStreamBody::Planned { plan, .. } => {
491                plan.segments.iter().all(|segment| match segment {
492                    StreamReadSegment::Hot(payload) => payload.is_empty(),
493                    StreamReadSegment::Object(segment) => segment.len == 0,
494                })
495            }
496            #[cfg(test)]
497            GroupReadStreamBody::Blocking { payload, .. } => payload.is_empty(),
498        }
499    }
500}
501
/// Input for publishing a snapshot of a stream: the snapshot's offset,
/// content type and payload, plus a caller-supplied timestamp.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PublishSnapshotRequest {
    pub stream_id: BucketStreamId,
    pub snapshot_offset: u64,
    pub content_type: String,
    pub payload: Bytes,
    pub now_ms: u64,
}
510
/// Serializable result of publishing a snapshot.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PublishSnapshotResponse {
    pub placement: ShardPlacement,
    pub snapshot_offset: u64,
    pub group_commit_index: u64,
}
517
/// Input for reading a snapshot of a stream.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadSnapshotRequest {
    pub stream_id: BucketStreamId,
    // NOTE(review): `None` presumably selects the latest snapshot — confirm
    // against the handler, which is outside this chunk.
    pub snapshot_offset: Option<u64>,
    pub now_ms: u64,
}
524
/// Snapshot bytes together with their offset, content type and stream
/// position metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadSnapshotResponse {
    pub placement: ShardPlacement,
    pub snapshot_offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    pub payload: Vec<u8>,
    pub up_to_date: bool,
}
534
/// Input for deleting the snapshot at `snapshot_offset` of a stream.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeleteSnapshotRequest {
    pub stream_id: BucketStreamId,
    pub snapshot_offset: u64,
    pub now_ms: u64,
}
541
/// Input for a stream bootstrap: the target stream plus a caller-supplied
/// timestamp. The reply type is presumably `BootstrapStreamResponse`
/// (pairing inferred from naming — confirm).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BootstrapStreamRequest {
    pub stream_id: BucketStreamId,
    pub now_ms: u64,
}
547
/// One update entry inside a `BootstrapStreamResponse`: an offset range,
/// content type and payload bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BootstrapUpdate {
    pub start_offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    pub payload: Vec<u8>,
}
555
/// Bootstrap result: optional snapshot information plus a list of
/// `BootstrapUpdate` entries and the stream position flags.
///
/// NOTE(review): how `snapshot_content_type` / `snapshot_payload` are filled
/// when `snapshot_offset` is `None` is not visible in this chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BootstrapStreamResponse {
    pub placement: ShardPlacement,
    pub snapshot_offset: Option<u64>,
    pub snapshot_content_type: String,
    pub snapshot_payload: Vec<u8>,
    pub updates: Vec<BootstrapUpdate>,
    pub next_offset: u64,
    pub up_to_date: bool,
    pub closed: bool,
}
567
/// Input for closing a stream. Carries the same fields as
/// `GroupWriteCommand::CloseStream`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CloseStreamRequest {
    pub stream_id: BucketStreamId,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub now_ms: u64,
}
575
/// Serializable result of closing a stream.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CloseStreamResponse {
    pub placement: ShardPlacement,
    pub next_offset: u64,
    pub group_commit_index: u64,
    // Presumably set when the close was recognised as a producer retry — confirm.
    pub deduplicated: bool,
}
583
/// Input for deleting a stream; carries only the stream id, like
/// `GroupWriteCommand::DeleteStream`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeleteStreamRequest {
    pub stream_id: BucketStreamId,
}
588
/// Serializable result of deleting a stream.
///
/// NOTE(review): field meanings are read from the names; the producing code
/// is outside this chunk.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DeleteStreamResponse {
    pub placement: ShardPlacement,
    pub group_commit_index: u64,
    pub hard_deleted: bool,
    // Presumably a fork parent whose reference should be released next — confirm.
    pub parent_to_release: Option<BucketStreamId>,
}
596
/// Serializable result of a fork-reference change.
///
/// NOTE(review): presumably returned for `GroupWriteCommand::AddForkRef` /
/// `ReleaseForkRef` — confirm; the producing code is outside this chunk.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ForkRefResponse {
    pub placement: ShardPlacement,
    pub fork_ref_count: u64,
    pub hard_deleted: bool,
    pub parent_to_release: Option<BucketStreamId>,
    pub group_commit_index: u64,
}
605
/// Input for recording a cold chunk against a stream. Carries the same
/// fields as `GroupWriteCommand::FlushCold`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FlushColdRequest {
    pub stream_id: BucketStreamId,
    pub chunk: ColdChunkRef,
}
611
/// Serializable result of a cold flush.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FlushColdResponse {
    pub placement: ShardPlacement,
    // Presumably the first offset still held in hot storage afterwards — confirm.
    pub hot_start_offset: u64,
    pub group_commit_index: u64,
}
618
/// Serializable result of a stream access touch.
///
/// NOTE(review): presumably the reply to
/// `GroupWriteCommand::TouchStreamAccess` — confirm.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TouchStreamAccessResponse {
    pub placement: ShardPlacement,
    pub changed: bool,
    pub expired: bool,
    pub group_commit_index: u64,
}
626
/// Input for planning a cold flush of one stream, bounded by two byte counts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlanColdFlushRequest {
    pub stream_id: BucketStreamId,
    // Presumably the hot-byte threshold below which nothing is planned — confirm.
    pub min_hot_bytes: usize,
    // Presumably the cap on bytes in a single planned flush — confirm.
    pub max_flush_bytes: usize,
}
633
/// Same byte bounds as `PlanColdFlushRequest` but without a stream id.
/// NOTE(review): presumably applies to a whole group — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlanGroupColdFlushRequest {
    pub min_hot_bytes: usize,
    pub max_flush_bytes: usize,
}
639
/// Hot-byte counts for one stream and for its group.
///
/// NOTE(review): units and producer are not visible in this chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ColdHotBacklog {
    pub stream_id: BucketStreamId,
    pub stream_hot_bytes: u64,
    pub group_hot_bytes: u64,
}
646
/// Write-admission settings tied to the amount of hot data per group.
///
/// The default value has no limit configured.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ColdWriteAdmission {
    /// Optional per-group hot-byte limit; `None` means no limit is configured.
    pub max_hot_bytes_per_group: Option<u64>,
}

impl ColdWriteAdmission {
    /// Returns `true` exactly when a per-group limit is configured.
    fn is_enabled(self) -> bool {
        let Self {
            max_hot_bytes_per_group: limit,
        } = self;
        limit.is_some()
    }
}
657
/// Parameters for appending an inline payload to a stream.
///
/// Converted field-for-field into `GroupWriteCommand::Append` by the `From`
/// impls in this file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AppendRequest {
    pub stream_id: BucketStreamId,
    /// The constructors set this to `DEFAULT_CONTENT_TYPE`.
    pub content_type: String,
    pub payload: Bytes,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    /// Caller-supplied timestamp; the constructors initialise it to 0.
    pub now_ms: u64,
}
668
/// Append parameters whose payload is an `ExternalPayloadRef` instead of
/// inline bytes.
///
/// Built by `from_append_request` and converted into
/// `GroupWriteCommand::AppendExternal` by the `From` impls in this file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AppendExternalRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    pub payload: ExternalPayloadRef,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub now_ms: u64,
}
679
680impl AppendExternalRequest {
681    pub fn from_append_request(request: AppendRequest, payload: ExternalPayloadRef) -> Self {
682        Self {
683            stream_id: request.stream_id,
684            content_type: request.content_type,
685            payload,
686            close_after: request.close_after,
687            stream_seq: request.stream_seq,
688            producer: request.producer,
689            now_ms: request.now_ms,
690        }
691    }
692}
693
694impl AppendRequest {
695    pub fn new(stream_id: BucketStreamId, payload_len: u64) -> Self {
696        Self {
697            stream_id,
698            content_type: DEFAULT_CONTENT_TYPE.to_owned(),
699            payload: Bytes::from(vec![
700                0;
701                usize::try_from(payload_len)
702                    .expect("payload_len fits usize")
703            ]),
704            close_after: false,
705            stream_seq: None,
706            producer: None,
707            now_ms: 0,
708        }
709    }
710
711    pub fn from_bytes(stream_id: BucketStreamId, payload: impl Into<Bytes>) -> Self {
712        Self {
713            stream_id,
714            content_type: DEFAULT_CONTENT_TYPE.to_owned(),
715            payload: payload.into(),
716            close_after: false,
717            stream_seq: None,
718            producer: None,
719            now_ms: 0,
720        }
721    }
722
723    pub fn payload_len(&self) -> u64 {
724        u64::try_from(self.payload.len()).expect("payload len fits u64")
725    }
726}
727
/// Parameters for appending several payloads to one stream in a single
/// request. Carries the same fields as `GroupWriteCommand::AppendBatch`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AppendBatchRequest {
    pub stream_id: BucketStreamId,
    /// `new` sets this to `DEFAULT_CONTENT_TYPE`.
    pub content_type: String,
    pub payloads: Vec<Bytes>,
    pub producer: Option<ProducerRequest>,
    /// Caller-supplied timestamp; `new` initialises it to 0.
    pub now_ms: u64,
}
736
737impl AppendBatchRequest {
738    pub fn new<P>(stream_id: BucketStreamId, payloads: Vec<P>) -> Self
739    where
740        P: Into<Bytes>,
741    {
742        Self {
743            stream_id,
744            content_type: DEFAULT_CONTENT_TYPE.to_owned(),
745            payloads: payloads.into_iter().map(Into::into).collect(),
746            producer: None,
747            now_ms: 0,
748        }
749    }
750}
751
/// Serializable result of a single append.
///
/// NOTE(review): the producing code is outside this chunk; field meanings
/// are read from the names only.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AppendResponse {
    pub placement: ShardPlacement,
    pub start_offset: u64,
    pub next_offset: u64,
    pub stream_append_count: u64,
    pub group_commit_index: u64,
    pub closed: bool,
    // Presumably set when the append was recognised as a producer retry — confirm.
    pub deduplicated: bool,
    pub producer: Option<ProducerRequest>,
}
763
/// Result of a batch append: one success-or-error entry per item.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AppendBatchResponse {
    pub placement: ShardPlacement,
    // NOTE(review): presumably index-aligned with
    // `AppendBatchRequest::payloads` — confirm.
    pub items: Vec<Result<AppendResponse, RuntimeError>>,
}
769
/// Serializable pairing of a stream id with its append count; used as the
/// element type of `GroupSnapshot::stream_append_counts`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StreamAppendCount {
    pub stream_id: BucketStreamId,
    pub append_count: u64,
}
775
/// Serializable snapshot bundle for a group: placement, commit index, the
/// `StreamSnapshot`, and per-stream append counts.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GroupSnapshot {
    pub placement: ShardPlacement,
    pub group_commit_index: u64,
    pub stream_snapshot: StreamSnapshot,
    pub stream_append_counts: Vec<StreamAppendCount>,
}
783
/// Write commands for a group.
///
/// Serialised with serde as an internally tagged enum: the variant name is
/// written in snake_case under the `"command"` key. The `From` impls in this
/// file build `CreateStream`, `CreateExternal`, `Append` and `AppendExternal`
/// from the matching request structs.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "command", rename_all = "snake_case")]
pub enum GroupWriteCommand {
    /// Stream creation with an inline initial payload
    /// (mirrors `CreateStreamRequest` minus `content_type_explicit`).
    CreateStream {
        stream_id: BucketStreamId,
        content_type: String,
        initial_payload: Bytes,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        stream_ttl_seconds: Option<u64>,
        stream_expires_at_ms: Option<u64>,
        forked_from: Option<BucketStreamId>,
        fork_offset: Option<u64>,
        now_ms: u64,
    },
    /// Stream creation whose initial payload is an `ExternalPayloadRef`
    /// (mirrors `CreateStreamExternalRequest`).
    CreateExternal {
        stream_id: BucketStreamId,
        content_type: String,
        initial_payload: ExternalPayloadRef,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        stream_ttl_seconds: Option<u64>,
        stream_expires_at_ms: Option<u64>,
        forked_from: Option<BucketStreamId>,
        fork_offset: Option<u64>,
        now_ms: u64,
    },
    /// Append with an inline payload (mirrors `AppendRequest`).
    Append {
        stream_id: BucketStreamId,
        content_type: String,
        payload: Bytes,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Append whose payload is an `ExternalPayloadRef`
    /// (mirrors `AppendExternalRequest`).
    AppendExternal {
        stream_id: BucketStreamId,
        content_type: String,
        payload: ExternalPayloadRef,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Append of several payloads (same fields as `AppendBatchRequest`).
    AppendBatch {
        stream_id: BucketStreamId,
        content_type: String,
        payloads: Vec<Bytes>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Snapshot publication (same fields as `PublishSnapshotRequest`).
    PublishSnapshot {
        stream_id: BucketStreamId,
        snapshot_offset: u64,
        content_type: String,
        payload: Bytes,
        now_ms: u64,
    },
    /// Access touch for a stream.
    /// NOTE(review): `renew_ttl` presumably controls TTL extension — confirm.
    TouchStreamAccess {
        stream_id: BucketStreamId,
        now_ms: u64,
        renew_ttl: bool,
    },
    /// Adds a fork reference to a stream.
    AddForkRef {
        stream_id: BucketStreamId,
        now_ms: u64,
    },
    /// Releases a fork reference on a stream.
    ReleaseForkRef {
        stream_id: BucketStreamId,
    },
    /// Records a cold chunk for a stream (same fields as `FlushColdRequest`).
    FlushCold {
        stream_id: BucketStreamId,
        chunk: ColdChunkRef,
    },
    /// Stream close (same fields as `CloseStreamRequest`).
    CloseStream {
        stream_id: BucketStreamId,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Stream deletion (same fields as `DeleteStreamRequest`).
    DeleteStream {
        stream_id: BucketStreamId,
    },
    /// A list of nested commands carried as one command.
    Batch {
        commands: Vec<GroupWriteCommand>,
    },
}
874
875impl From<CreateStreamRequest> for GroupWriteCommand {
876    fn from(request: CreateStreamRequest) -> Self {
877        Self::CreateStream {
878            stream_id: request.stream_id,
879            content_type: request.content_type,
880            initial_payload: request.initial_payload,
881            close_after: request.close_after,
882            stream_seq: request.stream_seq,
883            producer: request.producer,
884            stream_ttl_seconds: request.stream_ttl_seconds,
885            stream_expires_at_ms: request.stream_expires_at_ms,
886            forked_from: request.forked_from,
887            fork_offset: request.fork_offset,
888            now_ms: request.now_ms,
889        }
890    }
891}
892
893impl From<&CreateStreamRequest> for GroupWriteCommand {
894    fn from(request: &CreateStreamRequest) -> Self {
895        Self::CreateStream {
896            stream_id: request.stream_id.clone(),
897            content_type: request.content_type.clone(),
898            initial_payload: request.initial_payload.clone(),
899            close_after: request.close_after,
900            stream_seq: request.stream_seq.clone(),
901            producer: request.producer.clone(),
902            stream_ttl_seconds: request.stream_ttl_seconds,
903            stream_expires_at_ms: request.stream_expires_at_ms,
904            forked_from: request.forked_from.clone(),
905            fork_offset: request.fork_offset,
906            now_ms: request.now_ms,
907        }
908    }
909}
910
911impl From<CreateStreamExternalRequest> for GroupWriteCommand {
912    fn from(request: CreateStreamExternalRequest) -> Self {
913        Self::CreateExternal {
914            stream_id: request.stream_id,
915            content_type: request.content_type,
916            initial_payload: request.initial_payload,
917            close_after: request.close_after,
918            stream_seq: request.stream_seq,
919            producer: request.producer,
920            stream_ttl_seconds: request.stream_ttl_seconds,
921            stream_expires_at_ms: request.stream_expires_at_ms,
922            forked_from: request.forked_from,
923            fork_offset: request.fork_offset,
924            now_ms: request.now_ms,
925        }
926    }
927}
928
929impl From<&CreateStreamExternalRequest> for GroupWriteCommand {
930    fn from(request: &CreateStreamExternalRequest) -> Self {
931        Self::CreateExternal {
932            stream_id: request.stream_id.clone(),
933            content_type: request.content_type.clone(),
934            initial_payload: request.initial_payload.clone(),
935            close_after: request.close_after,
936            stream_seq: request.stream_seq.clone(),
937            producer: request.producer.clone(),
938            stream_ttl_seconds: request.stream_ttl_seconds,
939            stream_expires_at_ms: request.stream_expires_at_ms,
940            forked_from: request.forked_from.clone(),
941            fork_offset: request.fork_offset,
942            now_ms: request.now_ms,
943        }
944    }
945}
946
947impl From<AppendRequest> for GroupWriteCommand {
948    fn from(request: AppendRequest) -> Self {
949        Self::Append {
950            stream_id: request.stream_id,
951            content_type: request.content_type,
952            payload: request.payload,
953            close_after: request.close_after,
954            stream_seq: request.stream_seq,
955            producer: request.producer,
956            now_ms: request.now_ms,
957        }
958    }
959}
960
961impl From<&AppendRequest> for GroupWriteCommand {
962    fn from(request: &AppendRequest) -> Self {
963        Self::Append {
964            stream_id: request.stream_id.clone(),
965            content_type: request.content_type.clone(),
966            payload: request.payload.clone(),
967            close_after: request.close_after,
968            stream_seq: request.stream_seq.clone(),
969            producer: request.producer.clone(),
970            now_ms: request.now_ms,
971        }
972    }
973}
974
975impl From<AppendExternalRequest> for GroupWriteCommand {
976    fn from(request: AppendExternalRequest) -> Self {
977        Self::AppendExternal {
978            stream_id: request.stream_id,
979            content_type: request.content_type,
980            payload: request.payload,
981            close_after: request.close_after,
982            stream_seq: request.stream_seq,
983            producer: request.producer,
984            now_ms: request.now_ms,
985        }
986    }
987}
988
989impl From<&AppendExternalRequest> for GroupWriteCommand {
990    fn from(request: &AppendExternalRequest) -> Self {
991        Self::AppendExternal {
992            stream_id: request.stream_id.clone(),
993            content_type: request.content_type.clone(),
994            payload: request.payload.clone(),
995            close_after: request.close_after,
996            stream_seq: request.stream_seq.clone(),
997            producer: request.producer.clone(),
998            now_ms: request.now_ms,
999        }
1000    }
1001}
1002
1003impl From<AppendBatchRequest> for GroupWriteCommand {
1004    fn from(request: AppendBatchRequest) -> Self {
1005        Self::AppendBatch {
1006            stream_id: request.stream_id,
1007            content_type: request.content_type,
1008            payloads: request.payloads,
1009            producer: request.producer,
1010            now_ms: request.now_ms,
1011        }
1012    }
1013}
1014
1015impl From<&AppendBatchRequest> for GroupWriteCommand {
1016    fn from(request: &AppendBatchRequest) -> Self {
1017        Self::AppendBatch {
1018            stream_id: request.stream_id.clone(),
1019            content_type: request.content_type.clone(),
1020            payloads: request.payloads.clone(),
1021            producer: request.producer.clone(),
1022            now_ms: request.now_ms,
1023        }
1024    }
1025}
1026
1027impl From<PublishSnapshotRequest> for GroupWriteCommand {
1028    fn from(request: PublishSnapshotRequest) -> Self {
1029        Self::PublishSnapshot {
1030            stream_id: request.stream_id,
1031            snapshot_offset: request.snapshot_offset,
1032            content_type: request.content_type,
1033            payload: request.payload,
1034            now_ms: request.now_ms,
1035        }
1036    }
1037}
1038
1039impl From<&PublishSnapshotRequest> for GroupWriteCommand {
1040    fn from(request: &PublishSnapshotRequest) -> Self {
1041        Self::PublishSnapshot {
1042            stream_id: request.stream_id.clone(),
1043            snapshot_offset: request.snapshot_offset,
1044            content_type: request.content_type.clone(),
1045            payload: request.payload.clone(),
1046            now_ms: request.now_ms,
1047        }
1048    }
1049}
1050
1051impl From<CloseStreamRequest> for GroupWriteCommand {
1052    fn from(request: CloseStreamRequest) -> Self {
1053        Self::CloseStream {
1054            stream_id: request.stream_id,
1055            stream_seq: request.stream_seq,
1056            producer: request.producer,
1057            now_ms: request.now_ms,
1058        }
1059    }
1060}
1061
1062impl From<&CloseStreamRequest> for GroupWriteCommand {
1063    fn from(request: &CloseStreamRequest) -> Self {
1064        Self::CloseStream {
1065            stream_id: request.stream_id.clone(),
1066            stream_seq: request.stream_seq.clone(),
1067            producer: request.producer.clone(),
1068            now_ms: request.now_ms,
1069        }
1070    }
1071}
1072
1073impl From<DeleteStreamRequest> for GroupWriteCommand {
1074    fn from(request: DeleteStreamRequest) -> Self {
1075        Self::DeleteStream {
1076            stream_id: request.stream_id,
1077        }
1078    }
1079}
1080
1081impl From<&DeleteStreamRequest> for GroupWriteCommand {
1082    fn from(request: &DeleteStreamRequest) -> Self {
1083        Self::DeleteStream {
1084            stream_id: request.stream_id.clone(),
1085        }
1086    }
1087}
1088
1089impl From<FlushColdRequest> for GroupWriteCommand {
1090    fn from(request: FlushColdRequest) -> Self {
1091        Self::FlushCold {
1092            stream_id: request.stream_id,
1093            chunk: request.chunk,
1094        }
1095    }
1096}
1097
1098impl From<&FlushColdRequest> for GroupWriteCommand {
1099    fn from(request: &FlushColdRequest) -> Self {
1100        Self::FlushCold {
1101            stream_id: request.stream_id.clone(),
1102            chunk: request.chunk.clone(),
1103        }
1104    }
1105}
1106
1107impl fmt::Display for GroupWriteCommand {
1108    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1109        match self {
1110            Self::CreateStream { stream_id, .. } => {
1111                write!(f, "create_stream:{stream_id}")
1112            }
1113            Self::CreateExternal {
1114                stream_id,
1115                initial_payload,
1116                ..
1117            } => {
1118                write!(
1119                    f,
1120                    "create_external:{stream_id}:{} bytes",
1121                    initial_payload.payload_len
1122                )
1123            }
1124            Self::Append {
1125                stream_id, payload, ..
1126            } => {
1127                write!(f, "append:{stream_id}:{} bytes", payload.len())
1128            }
1129            Self::AppendExternal {
1130                stream_id, payload, ..
1131            } => {
1132                write!(
1133                    f,
1134                    "append_external:{stream_id}:{} bytes",
1135                    payload.payload_len
1136                )
1137            }
1138            Self::AppendBatch {
1139                stream_id,
1140                payloads,
1141                ..
1142            } => {
1143                write!(f, "append_batch:{stream_id}:{} items", payloads.len())
1144            }
1145            Self::PublishSnapshot {
1146                stream_id,
1147                snapshot_offset,
1148                payload,
1149                ..
1150            } => {
1151                write!(
1152                    f,
1153                    "publish_snapshot:{stream_id}:{snapshot_offset}:{} bytes",
1154                    payload.len()
1155                )
1156            }
1157            Self::TouchStreamAccess {
1158                stream_id,
1159                renew_ttl,
1160                ..
1161            } => {
1162                write!(f, "touch_stream_access:{stream_id}:renew_ttl={renew_ttl}")
1163            }
1164            Self::AddForkRef { stream_id, .. } => {
1165                write!(f, "add_fork_ref:{stream_id}")
1166            }
1167            Self::ReleaseForkRef { stream_id } => {
1168                write!(f, "release_fork_ref:{stream_id}")
1169            }
1170            Self::FlushCold { stream_id, chunk } => {
1171                write!(
1172                    f,
1173                    "flush_cold:{stream_id}:{}..{}",
1174                    chunk.start_offset, chunk.end_offset
1175                )
1176            }
1177            Self::CloseStream { stream_id, .. } => {
1178                write!(f, "close_stream:{stream_id}")
1179            }
1180            Self::DeleteStream { stream_id } => {
1181                write!(f, "delete_stream:{stream_id}")
1182            }
1183            Self::Batch { commands } => {
1184                write!(f, "batch:{} commands", commands.len())
1185            }
1186        }
1187    }
1188}
1189
/// Errors surfaced by the shard runtime.
///
/// The `Display` impl for this type defines the human-readable text of each variant.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RuntimeError {
    /// Wraps a [`ShardMapError`]; also produced by `From<ShardMapError>`.
    InvalidConfig(ShardMapError),
    /// `raft_group_id` lies outside the configured range `0..raft_group_count`.
    InvalidRaftGroup {
        raft_group_id: RaftGroupId,
        raft_group_count: u32,
    },
    /// A snapshot's placement (`actual`) does not match the placement that was expected.
    SnapshotPlacementMismatch {
        expected: ShardPlacement,
        actual: ShardPlacement,
    },
    /// An append was attempted with an empty payload.
    EmptyAppend,
    /// The cold store configuration is invalid; `message` carries the detail.
    ColdStoreConfig {
        message: String,
    },
    /// A cold store I/O operation failed; `message` carries the detail.
    ColdStoreIo {
        message: String,
    },
    /// Admitting another live-read waiter on `core_id` would exceed `limit`.
    LiveReadBackpressure {
        core_id: CoreId,
        current_waiters: u64,
        limit: u64,
    },
    /// A group engine call failed. Built by `RuntimeError::group_engine`, which copies
    /// `message`, `next_offset` and `leader_hint` out of the underlying `GroupEngineError`.
    GroupEngine {
        core_id: CoreId,
        raft_group_id: RaftGroupId,
        message: String,
        next_offset: Option<u64>,
        leader_hint: Option<GroupLeaderHint>,
    },
    /// The mailbox of `core_id` is closed.
    MailboxClosed {
        core_id: CoreId,
    },
    /// `core_id` dropped the response before it was delivered.
    ResponseDropped {
        core_id: CoreId,
    },
    /// Spawning the thread for `core_id` failed; `message` carries the detail.
    SpawnCoreThread {
        core_id: CoreId,
        message: String,
    },
}
1231
1232impl RuntimeError {
1233    fn group_engine(placement: ShardPlacement, err: GroupEngineError) -> Self {
1234        Self::GroupEngine {
1235            core_id: placement.core_id,
1236            raft_group_id: placement.raft_group_id,
1237            message: err.message().to_owned(),
1238            next_offset: err.next_offset(),
1239            leader_hint: err.leader_hint().cloned(),
1240        }
1241    }
1242}
1243
1244impl std::fmt::Display for RuntimeError {
1245    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1246        match self {
1247            Self::InvalidConfig(err) => write!(f, "invalid shard runtime config: {err}"),
1248            Self::InvalidRaftGroup {
1249                raft_group_id,
1250                raft_group_count,
1251            } => write!(
1252                f,
1253                "raft group {} is outside configured range 0..{}",
1254                raft_group_id.0, raft_group_count
1255            ),
1256            Self::SnapshotPlacementMismatch { expected, actual } => write!(
1257                f,
1258                "snapshot placement for raft group {} is core {}, expected core {}",
1259                actual.raft_group_id.0, actual.core_id.0, expected.core_id.0
1260            ),
1261            Self::EmptyAppend => f.write_str("append payload must be non-empty"),
1262            Self::ColdStoreConfig { message } => {
1263                write!(f, "invalid cold store config: {message}")
1264            }
1265            Self::ColdStoreIo { message } => write!(f, "cold store IO error: {message}"),
1266            Self::LiveReadBackpressure {
1267                core_id,
1268                current_waiters,
1269                limit,
1270            } => write!(
1271                f,
1272                "core {} live read waiters at {} would exceed limit {}",
1273                core_id.0, current_waiters, limit
1274            ),
1275            Self::GroupEngine {
1276                core_id,
1277                raft_group_id,
1278                message,
1279                ..
1280            } => write!(
1281                f,
1282                "core {} raft group {} append failed: {message}",
1283                core_id.0, raft_group_id.0
1284            ),
1285            Self::MailboxClosed { core_id } => {
1286                write!(f, "core {} mailbox is closed", core_id.0)
1287            }
1288            Self::ResponseDropped { core_id } => {
1289                write!(f, "core {} dropped append response", core_id.0)
1290            }
1291            Self::SpawnCoreThread { core_id, message } => {
1292                write!(f, "failed to spawn core {} thread: {message}", core_id.0)
1293            }
1294        }
1295    }
1296}
1297
// Empty impl: every `std::error::Error` method keeps its default, so no `source()` is chained.
impl std::error::Error for RuntimeError {}
1299
1300impl From<ShardMapError> for RuntimeError {
1301    fn from(value: ShardMapError) -> Self {
1302        Self::InvalidConfig(value)
1303    }
1304}
1305
1306fn map_fork_source_ref_error(err: RuntimeError, placement: ShardPlacement) -> RuntimeError {
1307    if let RuntimeError::GroupEngine { message, .. } = &err
1308        && message.contains("StreamGone")
1309    {
1310        return RuntimeError::group_engine(
1311            placement,
1312            GroupEngineError::stream(
1313                StreamErrorCode::StreamAlreadyExistsConflict,
1314                "source stream is gone and cannot be forked",
1315            ),
1316        );
1317    }
1318    err
1319}
1320
// Boxed-future aliases used as return types by the `GroupEngine` trait. Each one is a pinned,
// heap-allocated `Send` future bounded by `'a` that resolves to `Result<_, GroupEngineError>`.

/// Resolves to an [`AppendResponse`]; returned by the `GroupEngine` append methods.
pub type GroupAppendFuture<'a> =
    Pin<Box<dyn Future<Output = Result<AppendResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`GroupAppendBatchResponse`]; returned by the batch-append methods.
pub type GroupAppendBatchFuture<'a> =
    Pin<Box<dyn Future<Output = Result<GroupAppendBatchResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`FlushColdResponse`]; returned by `GroupEngine::flush_cold`.
pub type GroupFlushColdFuture<'a> =
    Pin<Box<dyn Future<Output = Result<FlushColdResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to an optional [`ColdFlushCandidate`]; returned by `GroupEngine::plan_cold_flush`.
pub type GroupPlanColdFlushFuture<'a> =
    Pin<Box<dyn Future<Output = Result<Option<ColdFlushCandidate>, GroupEngineError>> + Send + 'a>>;
/// Resolves to an optional [`ColdFlushCandidate`]; returned by
/// `GroupEngine::plan_next_cold_flush`.
pub type GroupPlanNextColdFlushFuture<'a> =
    Pin<Box<dyn Future<Output = Result<Option<ColdFlushCandidate>, GroupEngineError>> + Send + 'a>>;
/// Resolves to a list of [`ColdFlushCandidate`]s; returned by
/// `GroupEngine::plan_next_cold_flush_batch`.
pub type GroupPlanNextColdFlushBatchFuture<'a> =
    Pin<Box<dyn Future<Output = Result<Vec<ColdFlushCandidate>, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`ColdHotBacklog`]; returned by `GroupEngine::cold_hot_backlog`.
pub type GroupColdHotBacklogFuture<'a> =
    Pin<Box<dyn Future<Output = Result<ColdHotBacklog, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`CreateStreamResponse`]; returned by the stream-creation methods.
pub type GroupCreateStreamFuture<'a> =
    Pin<Box<dyn Future<Output = Result<CreateStreamResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`HeadStreamResponse`]; returned by `GroupEngine::head_stream`.
pub type GroupHeadStreamFuture<'a> =
    Pin<Box<dyn Future<Output = Result<HeadStreamResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`ReadStreamResponse`]; returned by `GroupEngine::read_stream`.
pub type GroupReadStreamFuture<'a> =
    Pin<Box<dyn Future<Output = Result<ReadStreamResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to [`GroupReadStreamParts`]; returned by `GroupEngine::read_stream_parts`.
pub type GroupReadStreamPartsFuture<'a> =
    Pin<Box<dyn Future<Output = Result<GroupReadStreamParts, GroupEngineError>> + Send + 'a>>;
/// Resolves to `()`; returned by `GroupEngine::require_local_live_read_owner`.
pub type GroupRequireLiveReadOwnerFuture<'a> =
    Pin<Box<dyn Future<Output = Result<(), GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`PublishSnapshotResponse`]; returned by `GroupEngine::publish_snapshot`.
pub type GroupPublishSnapshotFuture<'a> =
    Pin<Box<dyn Future<Output = Result<PublishSnapshotResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`ReadSnapshotResponse`]; returned by `GroupEngine::read_snapshot`.
pub type GroupReadSnapshotFuture<'a> =
    Pin<Box<dyn Future<Output = Result<ReadSnapshotResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to `()`; returned by `GroupEngine::delete_snapshot`.
pub type GroupDeleteSnapshotFuture<'a> =
    Pin<Box<dyn Future<Output = Result<(), GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`BootstrapStreamResponse`]; returned by `GroupEngine::bootstrap_stream`.
pub type GroupBootstrapStreamFuture<'a> =
    Pin<Box<dyn Future<Output = Result<BootstrapStreamResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`TouchStreamAccessResponse`]; returned by `GroupEngine::touch_stream_access`.
pub type GroupTouchStreamAccessFuture<'a> =
    Pin<Box<dyn Future<Output = Result<TouchStreamAccessResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`CloseStreamResponse`]; returned by `GroupEngine::close_stream`.
pub type GroupCloseStreamFuture<'a> =
    Pin<Box<dyn Future<Output = Result<CloseStreamResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`DeleteStreamResponse`]; returned by `GroupEngine::delete_stream`.
pub type GroupDeleteStreamFuture<'a> =
    Pin<Box<dyn Future<Output = Result<DeleteStreamResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`ForkRefResponse`]; returned by `add_fork_ref` and `release_fork_ref`.
pub type GroupForkRefFuture<'a> =
    Pin<Box<dyn Future<Output = Result<ForkRefResponse, GroupEngineError>> + Send + 'a>>;
/// Resolves to a [`GroupSnapshot`]; returned by `GroupEngine::snapshot`.
pub type GroupSnapshotFuture<'a> =
    Pin<Box<dyn Future<Output = Result<GroupSnapshot, GroupEngineError>> + Send + 'a>>;
/// Resolves to `()`; returned by `GroupEngine::install_snapshot`.
pub type GroupInstallSnapshotFuture<'a> =
    Pin<Box<dyn Future<Output = Result<(), GroupEngineError>> + Send + 'a>>;
/// Resolves to one `Result` per submitted command. The outer `Result` is the outcome of the
/// call as a whole; the inner ones are the per-command outcomes. Returned by
/// `GroupEngine::write_batch` and `GroupEngine::append_batch_many_with_cold_admission`.
pub type GroupWriteBatchFuture<'a> = Pin<
    Box<
        dyn Future<
                Output = Result<
                    Vec<Result<GroupWriteResponse, GroupEngineError>>,
                    GroupEngineError,
                >,
            > + Send
            + 'a,
    >,
>;
/// Resolves to a boxed [`GroupEngine`] trait object.
pub type GroupEngineCreateFuture<'a> =
    Pin<Box<dyn Future<Output = Result<Box<dyn GroupEngine>, GroupEngineError>> + Send + 'a>>;
1378
/// Result of a batch append against one group: the placement it ran on plus one outcome per
/// item, so individual items can fail independently.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GroupAppendBatchResponse {
    /// Placement associated with the batch.
    pub placement: ShardPlacement,
    /// Per-item outcomes (presumably in request order — confirm against the engine).
    pub items: Vec<Result<AppendResponse, GroupEngineError>>,
}
1384
/// Response to a single [`GroupWriteCommand`].
///
/// In the default `GroupEngine::write_batch`, both `CreateStream` and `CreateExternal`
/// commands answer with `CreateStream`, and both `Append` and `AppendExternal` answer with
/// `Append`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum GroupWriteResponse {
    /// Outcome of a stream creation (inline or external payload).
    CreateStream(CreateStreamResponse),
    /// Outcome of a single append (inline or external payload).
    Append(AppendResponse),
    /// Outcome of a batch append, with per-item results.
    AppendBatch(GroupAppendBatchResponse),
    /// Outcome of publishing a snapshot.
    PublishSnapshot(PublishSnapshotResponse),
    /// Outcome of a stream access touch.
    TouchStreamAccess(TouchStreamAccessResponse),
    /// Outcome of adding a fork reference.
    AddForkRef(ForkRefResponse),
    /// Outcome of releasing a fork reference.
    ReleaseForkRef(ForkRefResponse),
    /// Outcome of a cold flush.
    FlushCold(FlushColdResponse),
    /// Outcome of closing a stream.
    CloseStream(CloseStreamResponse),
    /// Outcome of deleting a stream.
    DeleteStream(DeleteStreamResponse),
    /// Per-command outcomes of a nested batch (presumably the answer to
    /// `GroupWriteCommand::Batch` — confirm).
    Batch(Vec<Result<GroupWriteResponse, GroupEngineError>>),
}
1399
1400pub trait GroupEngine: Send + 'static {
    /// Reports whether this engine accepts writes locally.
    ///
    /// The default is `true`; implementations may override it.
    fn accepts_local_writes(&self) -> bool {
        true
    }
1404
    /// Required: handles a [`CreateStreamRequest`] for the group at `placement` and resolves
    /// to a [`CreateStreamResponse`].
    ///
    /// The default `create_stream_with_cold_admission` and `write_batch` delegate here.
    fn create_stream<'a>(
        &'a mut self,
        request: CreateStreamRequest,
        placement: ShardPlacement,
    ) -> GroupCreateStreamFuture<'a>;
1410
1411    fn create_stream_external<'a>(
1412        &'a mut self,
1413        request: CreateStreamExternalRequest,
1414        _placement: ShardPlacement,
1415    ) -> GroupCreateStreamFuture<'a> {
1416        Box::pin(async move {
1417            Err(GroupEngineError::new(format!(
1418                "external stream create is not supported for stream '{}'",
1419                request.stream_id
1420            )))
1421        })
1422    }
1423
    /// Required: handles a [`HeadStreamRequest`] for the group at `placement` and resolves to
    /// a [`HeadStreamResponse`].
    fn head_stream<'a>(
        &'a mut self,
        request: HeadStreamRequest,
        placement: ShardPlacement,
    ) -> GroupHeadStreamFuture<'a>;
1429
    /// Required: handles a [`ReadStreamRequest`] for the group at `placement` and resolves to
    /// a [`ReadStreamResponse`].
    ///
    /// The default `read_stream_parts` is built on top of this method.
    fn read_stream<'a>(
        &'a mut self,
        request: ReadStreamRequest,
        placement: ShardPlacement,
    ) -> GroupReadStreamFuture<'a>;
1435
1436    fn read_stream_parts<'a>(
1437        &'a mut self,
1438        request: ReadStreamRequest,
1439        placement: ShardPlacement,
1440    ) -> GroupReadStreamPartsFuture<'a> {
1441        Box::pin(async move {
1442            let response = self.read_stream(request, placement).await?;
1443            Ok(GroupReadStreamParts::from_response(response))
1444        })
1445    }
1446
1447    fn require_local_live_read_owner<'a>(
1448        &'a mut self,
1449        _placement: ShardPlacement,
1450    ) -> GroupRequireLiveReadOwnerFuture<'a> {
1451        Box::pin(async { Ok(()) })
1452    }
1453
1454    fn publish_snapshot<'a>(
1455        &'a mut self,
1456        request: PublishSnapshotRequest,
1457        _placement: ShardPlacement,
1458    ) -> GroupPublishSnapshotFuture<'a> {
1459        Box::pin(async move {
1460            Err(GroupEngineError::new(format!(
1461                "snapshot publish is not supported for stream '{}'",
1462                request.stream_id
1463            )))
1464        })
1465    }
1466
1467    fn read_snapshot<'a>(
1468        &'a mut self,
1469        request: ReadSnapshotRequest,
1470        _placement: ShardPlacement,
1471    ) -> GroupReadSnapshotFuture<'a> {
1472        Box::pin(async move {
1473            Err(GroupEngineError::new(format!(
1474                "snapshot read is not supported for stream '{}'",
1475                request.stream_id
1476            )))
1477        })
1478    }
1479
1480    fn delete_snapshot<'a>(
1481        &'a mut self,
1482        request: DeleteSnapshotRequest,
1483        _placement: ShardPlacement,
1484    ) -> GroupDeleteSnapshotFuture<'a> {
1485        Box::pin(async move {
1486            Err(GroupEngineError::new(format!(
1487                "snapshot delete is not supported for stream '{}'",
1488                request.stream_id
1489            )))
1490        })
1491    }
1492
1493    fn bootstrap_stream<'a>(
1494        &'a mut self,
1495        request: BootstrapStreamRequest,
1496        _placement: ShardPlacement,
1497    ) -> GroupBootstrapStreamFuture<'a> {
1498        Box::pin(async move {
1499            Err(GroupEngineError::new(format!(
1500                "bootstrap is not supported for stream '{}'",
1501                request.stream_id
1502            )))
1503        })
1504    }
1505
    /// Required: records an access to `stream_id` at `now_ms` for the group at `placement`,
    /// resolving to a [`TouchStreamAccessResponse`].
    ///
    /// `renew_ttl` is forwarded from `GroupWriteCommand::TouchStreamAccess`; presumably it
    /// controls whether the stream TTL is extended — confirm against implementations.
    fn touch_stream_access<'a>(
        &'a mut self,
        stream_id: BucketStreamId,
        now_ms: u64,
        renew_ttl: bool,
        placement: ShardPlacement,
    ) -> GroupTouchStreamAccessFuture<'a>;
1513
    /// Required: adds a fork reference to `stream_id` at `now_ms` for the group at
    /// `placement`, resolving to a [`ForkRefResponse`].
    fn add_fork_ref<'a>(
        &'a mut self,
        stream_id: BucketStreamId,
        now_ms: u64,
        placement: ShardPlacement,
    ) -> GroupForkRefFuture<'a>;
1520
    /// Required: releases a fork reference on `stream_id` for the group at `placement`,
    /// resolving to a [`ForkRefResponse`].
    fn release_fork_ref<'a>(
        &'a mut self,
        stream_id: BucketStreamId,
        placement: ShardPlacement,
    ) -> GroupForkRefFuture<'a>;
1526
    /// Required: handles a [`CloseStreamRequest`] for the group at `placement` and resolves to
    /// a [`CloseStreamResponse`].
    fn close_stream<'a>(
        &'a mut self,
        request: CloseStreamRequest,
        placement: ShardPlacement,
    ) -> GroupCloseStreamFuture<'a>;
1532
    /// Required: handles a [`DeleteStreamRequest`] for the group at `placement` and resolves
    /// to a [`DeleteStreamResponse`].
    fn delete_stream<'a>(
        &'a mut self,
        request: DeleteStreamRequest,
        placement: ShardPlacement,
    ) -> GroupDeleteStreamFuture<'a>;
1538
    /// Required: handles an [`AppendRequest`] for the group at `placement` and resolves to an
    /// [`AppendResponse`].
    ///
    /// The default `append_with_cold_admission` and `write_batch` delegate here.
    fn append<'a>(
        &'a mut self,
        request: AppendRequest,
        placement: ShardPlacement,
    ) -> GroupAppendFuture<'a>;
1544
1545    fn append_external<'a>(
1546        &'a mut self,
1547        request: AppendExternalRequest,
1548        _placement: ShardPlacement,
1549    ) -> GroupAppendFuture<'a> {
1550        Box::pin(async move {
1551            Err(GroupEngineError::new(format!(
1552                "external append is not supported for stream '{}'",
1553                request.stream_id
1554            )))
1555        })
1556    }
1557
    /// Required: handles an [`AppendBatchRequest`] for the group at `placement` and resolves
    /// to a [`GroupAppendBatchResponse`] carrying per-item outcomes.
    ///
    /// The default `append_batch_with_cold_admission` and `write_batch` delegate here.
    fn append_batch<'a>(
        &'a mut self,
        request: AppendBatchRequest,
        placement: ShardPlacement,
    ) -> GroupAppendBatchFuture<'a>;
1563
1564    fn create_stream_with_cold_admission<'a>(
1565        &'a mut self,
1566        request: CreateStreamRequest,
1567        placement: ShardPlacement,
1568        _admission: ColdWriteAdmission,
1569    ) -> GroupCreateStreamFuture<'a> {
1570        self.create_stream(request, placement)
1571    }
1572
1573    fn append_with_cold_admission<'a>(
1574        &'a mut self,
1575        request: AppendRequest,
1576        placement: ShardPlacement,
1577        _admission: ColdWriteAdmission,
1578    ) -> GroupAppendFuture<'a> {
1579        self.append(request, placement)
1580    }
1581
1582    fn append_batch_with_cold_admission<'a>(
1583        &'a mut self,
1584        request: AppendBatchRequest,
1585        placement: ShardPlacement,
1586        _admission: ColdWriteAdmission,
1587    ) -> GroupAppendBatchFuture<'a> {
1588        self.append_batch(request, placement)
1589    }
1590
1591    fn append_batch_many_with_cold_admission<'a>(
1592        &'a mut self,
1593        requests: Vec<AppendBatchRequest>,
1594        placement: ShardPlacement,
1595        admission: ColdWriteAdmission,
1596    ) -> GroupWriteBatchFuture<'a> {
1597        Box::pin(async move {
1598            let mut responses = Vec::with_capacity(requests.len());
1599            for request in requests {
1600                let response = self
1601                    .append_batch_with_cold_admission(request, placement, admission)
1602                    .await
1603                    .map(GroupWriteResponse::AppendBatch);
1604                responses.push(response);
1605            }
1606            Ok(responses)
1607        })
1608    }
1609
1610    fn flush_cold<'a>(
1611        &'a mut self,
1612        request: FlushColdRequest,
1613        _placement: ShardPlacement,
1614    ) -> GroupFlushColdFuture<'a> {
1615        Box::pin(async move {
1616            Err(GroupEngineError::new(format!(
1617                "cold flush is not supported for stream '{}'",
1618                request.stream_id
1619            )))
1620        })
1621    }
1622
1623    fn plan_cold_flush<'a>(
1624        &'a mut self,
1625        request: PlanColdFlushRequest,
1626        _placement: ShardPlacement,
1627    ) -> GroupPlanColdFlushFuture<'a> {
1628        Box::pin(async move {
1629            Err(GroupEngineError::new(format!(
1630                "cold flush planning is not supported for stream '{}'",
1631                request.stream_id
1632            )))
1633        })
1634    }
1635
1636    fn plan_next_cold_flush<'a>(
1637        &'a mut self,
1638        _request: PlanGroupColdFlushRequest,
1639        _placement: ShardPlacement,
1640    ) -> GroupPlanNextColdFlushFuture<'a> {
1641        Box::pin(async move {
1642            Err(GroupEngineError::new(
1643                "group cold flush planning is not supported",
1644            ))
1645        })
1646    }
1647
1648    fn plan_next_cold_flush_batch<'a>(
1649        &'a mut self,
1650        request: PlanGroupColdFlushRequest,
1651        placement: ShardPlacement,
1652        max_candidates: usize,
1653    ) -> GroupPlanNextColdFlushBatchFuture<'a> {
1654        Box::pin(async move {
1655            match self.plan_next_cold_flush(request, placement).await? {
1656                Some(candidate) if max_candidates > 0 => Ok(vec![candidate]),
1657                _ => Ok(Vec::new()),
1658            }
1659        })
1660    }
1661
1662    fn cold_hot_backlog<'a>(
1663        &'a mut self,
1664        stream_id: BucketStreamId,
1665        _placement: ShardPlacement,
1666    ) -> GroupColdHotBacklogFuture<'a> {
1667        Box::pin(async move {
1668            Err(GroupEngineError::new(format!(
1669                "cold hot backlog is not supported for stream '{stream_id}'"
1670            )))
1671        })
1672    }
1673
    /// Required: produces a [`GroupSnapshot`] for the group at `placement`.
    fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a>;
1675
    /// Required: installs a previously captured [`GroupSnapshot`] into this engine, resolving
    /// to `()` on success.
    fn install_snapshot<'a>(
        &'a mut self,
        snapshot: GroupSnapshot,
    ) -> GroupInstallSnapshotFuture<'a>;
1680
1681    fn write_batch<'a>(
1682        &'a mut self,
1683        commands: Vec<GroupWriteCommand>,
1684        placement: ShardPlacement,
1685    ) -> GroupWriteBatchFuture<'a> {
1686        Box::pin(async move {
1687            let mut responses = Vec::with_capacity(commands.len());
1688            for command in commands {
1689                let response = match command {
1690                    GroupWriteCommand::CreateStream {
1691                        stream_id,
1692                        content_type,
1693                        initial_payload,
1694                        close_after,
1695                        stream_seq,
1696                        producer,
1697                        stream_ttl_seconds,
1698                        stream_expires_at_ms,
1699                        forked_from,
1700                        fork_offset,
1701                        now_ms,
1702                    } => self
1703                        .create_stream(
1704                            CreateStreamRequest {
1705                                stream_id,
1706                                content_type,
1707                                content_type_explicit: true,
1708                                initial_payload,
1709                                close_after,
1710                                stream_seq,
1711                                producer,
1712                                stream_ttl_seconds,
1713                                stream_expires_at_ms,
1714                                forked_from,
1715                                fork_offset,
1716                                now_ms,
1717                            },
1718                            placement,
1719                        )
1720                        .await
1721                        .map(GroupWriteResponse::CreateStream),
1722                    GroupWriteCommand::CreateExternal {
1723                        stream_id,
1724                        content_type,
1725                        initial_payload,
1726                        close_after,
1727                        stream_seq,
1728                        producer,
1729                        stream_ttl_seconds,
1730                        stream_expires_at_ms,
1731                        forked_from,
1732                        fork_offset,
1733                        now_ms,
1734                    } => self
1735                        .create_stream_external(
1736                            CreateStreamExternalRequest {
1737                                stream_id,
1738                                content_type,
1739                                initial_payload,
1740                                close_after,
1741                                stream_seq,
1742                                producer,
1743                                stream_ttl_seconds,
1744                                stream_expires_at_ms,
1745                                forked_from,
1746                                fork_offset,
1747                                now_ms,
1748                            },
1749                            placement,
1750                        )
1751                        .await
1752                        .map(GroupWriteResponse::CreateStream),
1753                    GroupWriteCommand::Append {
1754                        stream_id,
1755                        content_type,
1756                        payload,
1757                        close_after,
1758                        stream_seq,
1759                        producer,
1760                        now_ms,
1761                    } => self
1762                        .append(
1763                            AppendRequest {
1764                                stream_id,
1765                                content_type,
1766                                payload,
1767                                close_after,
1768                                stream_seq,
1769                                producer,
1770                                now_ms,
1771                            },
1772                            placement,
1773                        )
1774                        .await
1775                        .map(GroupWriteResponse::Append),
1776                    GroupWriteCommand::AppendExternal {
1777                        stream_id,
1778                        content_type,
1779                        payload,
1780                        close_after,
1781                        stream_seq,
1782                        producer,
1783                        now_ms,
1784                    } => self
1785                        .append_external(
1786                            AppendExternalRequest {
1787                                stream_id,
1788                                content_type,
1789                                payload,
1790                                close_after,
1791                                stream_seq,
1792                                producer,
1793                                now_ms,
1794                            },
1795                            placement,
1796                        )
1797                        .await
1798                        .map(GroupWriteResponse::Append),
1799                    GroupWriteCommand::AppendBatch {
1800                        stream_id,
1801                        content_type,
1802                        payloads,
1803                        producer,
1804                        now_ms,
1805                    } => self
1806                        .append_batch(
1807                            AppendBatchRequest {
1808                                stream_id,
1809                                content_type,
1810                                payloads,
1811                                producer,
1812                                now_ms,
1813                            },
1814                            placement,
1815                        )
1816                        .await
1817                        .map(GroupWriteResponse::AppendBatch),
1818                    GroupWriteCommand::PublishSnapshot {
1819                        stream_id,
1820                        snapshot_offset,
1821                        content_type,
1822                        payload,
1823                        now_ms,
1824                    } => self
1825                        .publish_snapshot(
1826                            PublishSnapshotRequest {
1827                                stream_id,
1828                                snapshot_offset,
1829                                content_type,
1830                                payload,
1831                                now_ms,
1832                            },
1833                            placement,
1834                        )
1835                        .await
1836                        .map(GroupWriteResponse::PublishSnapshot),
1837                    GroupWriteCommand::TouchStreamAccess {
1838                        stream_id,
1839                        now_ms,
1840                        renew_ttl,
1841                    } => self
1842                        .touch_stream_access(stream_id, now_ms, renew_ttl, placement)
1843                        .await
1844                        .map(GroupWriteResponse::TouchStreamAccess),
1845                    GroupWriteCommand::AddForkRef { stream_id, now_ms } => self
1846                        .add_fork_ref(stream_id, now_ms, placement)
1847                        .await
1848                        .map(GroupWriteResponse::AddForkRef),
1849                    GroupWriteCommand::ReleaseForkRef { stream_id } => self
1850                        .release_fork_ref(stream_id, placement)
1851                        .await
1852                        .map(GroupWriteResponse::ReleaseForkRef),
1853                    GroupWriteCommand::FlushCold { stream_id, chunk } => self
1854                        .flush_cold(FlushColdRequest { stream_id, chunk }, placement)
1855                        .await
1856                        .map(GroupWriteResponse::FlushCold),
1857                    GroupWriteCommand::CloseStream {
1858                        stream_id,
1859                        stream_seq,
1860                        producer,
1861                        now_ms,
1862                    } => self
1863                        .close_stream(
1864                            CloseStreamRequest {
1865                                stream_id,
1866                                stream_seq,
1867                                producer,
1868                                now_ms,
1869                            },
1870                            placement,
1871                        )
1872                        .await
1873                        .map(GroupWriteResponse::CloseStream),
1874                    GroupWriteCommand::DeleteStream { stream_id } => self
1875                        .delete_stream(DeleteStreamRequest { stream_id }, placement)
1876                        .await
1877                        .map(GroupWriteResponse::DeleteStream),
1878                    GroupWriteCommand::Batch { commands } => self
1879                        .write_batch(commands, placement)
1880                        .await
1881                        .map(GroupWriteResponse::Batch),
1882                };
1883                responses.push(response);
1884            }
1885            Ok(responses)
1886        })
1887    }
1888}
1889
1890pub trait GroupEngineFactory: Send + Sync + 'static {
1891    fn create<'a>(
1892        &'a self,
1893        placement: ShardPlacement,
1894        metrics: GroupEngineMetrics,
1895    ) -> GroupEngineCreateFuture<'a>;
1896}
1897
1898#[derive(Debug, Clone)]
1899pub struct GroupEngineMetrics {
1900    inner: Arc<RuntimeMetricsInner>,
1901}
1902
1903impl GroupEngineMetrics {
1904    pub fn record_wal_batch(
1905        &self,
1906        placement: ShardPlacement,
1907        record_count: usize,
1908        write_ns: u64,
1909        sync_ns: u64,
1910    ) {
1911        self.inner.record_wal_batch(
1912            placement.core_id,
1913            placement.raft_group_id,
1914            u64::try_from(record_count).expect("record count fits u64"),
1915            write_ns,
1916            sync_ns,
1917        );
1918    }
1919
1920    pub fn record_raft_write_many(
1921        &self,
1922        placement: ShardPlacement,
1923        command_count: usize,
1924        logical_command_count: usize,
1925        response_count: usize,
1926        submit_ns: u64,
1927        response_ns: u64,
1928    ) {
1929        self.inner.record_raft_write_many(
1930            placement.core_id,
1931            placement.raft_group_id,
1932            RaftWriteManySample {
1933                command_count: u64::try_from(command_count).expect("command count fits u64"),
1934                logical_command_count: u64::try_from(logical_command_count)
1935                    .expect("logical command count fits u64"),
1936                response_count: u64::try_from(response_count).expect("response count fits u64"),
1937                submit_ns,
1938                response_ns,
1939            },
1940        );
1941    }
1942
1943    pub fn record_raft_apply_batch(
1944        &self,
1945        placement: ShardPlacement,
1946        entry_count: usize,
1947        apply_ns: u64,
1948    ) {
1949        self.inner.record_raft_apply_batch(
1950            placement.core_id,
1951            placement.raft_group_id,
1952            u64::try_from(entry_count).expect("entry count fits u64"),
1953            apply_ns,
1954        );
1955    }
1956}
1957
1958#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1959pub struct GroupLeaderHint {
1960    pub node_id: Option<u64>,
1961    pub address: Option<String>,
1962}
1963
1964#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1965pub struct GroupEngineError {
1966    message: String,
1967    code: Option<StreamErrorCode>,
1968    next_offset: Option<u64>,
1969    #[serde(default, skip_serializing_if = "Option::is_none")]
1970    leader_hint: Option<GroupLeaderHint>,
1971}
1972
1973impl GroupEngineError {
1974    pub fn new(message: impl Into<String>) -> Self {
1975        Self {
1976            message: message.into(),
1977            code: None,
1978            next_offset: None,
1979            leader_hint: None,
1980        }
1981    }
1982
1983    pub fn stream(code: StreamErrorCode, message: impl Into<String>) -> Self {
1984        Self::stream_with_next_offset(code, message, None)
1985    }
1986
1987    pub fn stream_with_next_offset(
1988        code: StreamErrorCode,
1989        message: impl Into<String>,
1990        next_offset: Option<u64>,
1991    ) -> Self {
1992        Self {
1993            message: format!("{code:?}: {}", message.into()),
1994            code: Some(code),
1995            next_offset,
1996            leader_hint: None,
1997        }
1998    }
1999
2000    pub fn forward_to_leader(
2001        message: impl Into<String>,
2002        node_id: Option<u64>,
2003        address: Option<String>,
2004    ) -> Self {
2005        Self {
2006            message: message.into(),
2007            code: None,
2008            next_offset: None,
2009            leader_hint: Some(GroupLeaderHint { node_id, address }),
2010        }
2011    }
2012
2013    pub fn from_replicated_parts(
2014        message: impl Into<String>,
2015        code: Option<StreamErrorCode>,
2016        next_offset: Option<u64>,
2017        leader_hint: Option<GroupLeaderHint>,
2018    ) -> Self {
2019        Self {
2020            message: message.into(),
2021            code,
2022            next_offset,
2023            leader_hint,
2024        }
2025    }
2026
2027    pub fn message(&self) -> &str {
2028        &self.message
2029    }
2030
2031    pub fn code(&self) -> Option<StreamErrorCode> {
2032        self.code
2033    }
2034
2035    pub fn next_offset(&self) -> Option<u64> {
2036        self.next_offset
2037    }
2038
2039    pub fn leader_hint(&self) -> Option<&GroupLeaderHint> {
2040        self.leader_hint.as_ref()
2041    }
2042}
2043
2044impl std::fmt::Display for GroupEngineError {
2045    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2046        f.write_str(&self.message)
2047    }
2048}
2049
2050impl std::error::Error for GroupEngineError {}
2051
2052struct AppendPayloadInput<'a> {
2053    stream_id: BucketStreamId,
2054    content_type: Option<&'a str>,
2055    payload: &'a [u8],
2056    close_after: bool,
2057    stream_seq: Option<String>,
2058    producer: Option<ProducerRequest>,
2059    now_ms: u64,
2060}
2061
2062#[derive(Debug, Clone, Default)]
2063pub struct InMemoryGroupEngine {
2064    commit_index: u64,
2065    state_machine: StreamStateMachine,
2066    stream_append_counts: HashMap<BucketStreamId, u64>,
2067    cold_store: Option<ColdStoreHandle>,
2068}
2069
2070impl InMemoryGroupEngine {
2071    pub fn with_cold_store(cold_store: ColdStoreHandle) -> Self {
2072        Self {
2073            cold_store: Some(cold_store),
2074            ..Self::default()
2075        }
2076    }
2077
2078    pub fn cold_store(&self) -> Option<ColdStoreHandle> {
2079        self.cold_store.clone()
2080    }
2081
2082    pub fn apply_committed_write(
2083        &mut self,
2084        command: GroupWriteCommand,
2085        placement: ShardPlacement,
2086    ) -> Result<GroupWriteResponse, GroupEngineError> {
2087        match command {
2088            GroupWriteCommand::CreateStream {
2089                stream_id,
2090                content_type,
2091                initial_payload,
2092                close_after,
2093                stream_seq,
2094                producer,
2095                stream_ttl_seconds,
2096                stream_expires_at_ms,
2097                forked_from,
2098                fork_offset,
2099                now_ms,
2100            } => {
2101                ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2102                let response = self.state_machine.apply(StreamCommand::CreateStream {
2103                    stream_id,
2104                    content_type,
2105                    initial_payload: initial_payload.to_vec(),
2106                    close_after,
2107                    stream_seq,
2108                    producer,
2109                    stream_ttl_seconds,
2110                    stream_expires_at_ms,
2111                    forked_from,
2112                    fork_offset,
2113                    now_ms,
2114                });
2115                match response {
2116                    StreamResponse::Created {
2117                        next_offset,
2118                        closed,
2119                        ..
2120                    } => {
2121                        self.commit_index += 1;
2122                        Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2123                            placement,
2124                            next_offset,
2125                            closed,
2126                            already_exists: false,
2127                            group_commit_index: self.commit_index,
2128                        }))
2129                    }
2130                    StreamResponse::AlreadyExists {
2131                        next_offset,
2132                        closed,
2133                        ..
2134                    } => Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2135                        placement,
2136                        next_offset,
2137                        closed,
2138                        already_exists: true,
2139                        group_commit_index: self.commit_index,
2140                    })),
2141                    StreamResponse::Error {
2142                        code,
2143                        message,
2144                        next_offset,
2145                    } => Err(GroupEngineError::stream_with_next_offset(
2146                        code,
2147                        message,
2148                        next_offset,
2149                    )),
2150                    other => Err(GroupEngineError::new(format!(
2151                        "unexpected create stream response: {other:?}"
2152                    ))),
2153                }
2154            }
2155            GroupWriteCommand::CreateExternal {
2156                stream_id,
2157                content_type,
2158                initial_payload,
2159                close_after,
2160                stream_seq,
2161                producer,
2162                stream_ttl_seconds,
2163                stream_expires_at_ms,
2164                forked_from,
2165                fork_offset,
2166                now_ms,
2167            } => {
2168                ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2169                let response = self.state_machine.apply(StreamCommand::CreateExternal {
2170                    stream_id,
2171                    content_type,
2172                    initial_payload,
2173                    close_after,
2174                    stream_seq,
2175                    producer,
2176                    stream_ttl_seconds,
2177                    stream_expires_at_ms,
2178                    forked_from,
2179                    fork_offset,
2180                    now_ms,
2181                });
2182                match response {
2183                    StreamResponse::Created {
2184                        next_offset,
2185                        closed,
2186                        ..
2187                    } => {
2188                        self.commit_index += 1;
2189                        Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2190                            placement,
2191                            next_offset,
2192                            closed,
2193                            already_exists: false,
2194                            group_commit_index: self.commit_index,
2195                        }))
2196                    }
2197                    StreamResponse::AlreadyExists {
2198                        next_offset,
2199                        closed,
2200                        ..
2201                    } => Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2202                        placement,
2203                        next_offset,
2204                        closed,
2205                        already_exists: true,
2206                        group_commit_index: self.commit_index,
2207                    })),
2208                    StreamResponse::Error {
2209                        code,
2210                        message,
2211                        next_offset,
2212                    } => Err(GroupEngineError::stream_with_next_offset(
2213                        code,
2214                        message,
2215                        next_offset,
2216                    )),
2217                    other => Err(GroupEngineError::new(format!(
2218                        "unexpected create external stream response: {other:?}"
2219                    ))),
2220                }
2221            }
2222            GroupWriteCommand::Append {
2223                stream_id,
2224                content_type,
2225                payload,
2226                close_after,
2227                stream_seq,
2228                producer,
2229                now_ms,
2230            } => self
2231                .append_payload(
2232                    AppendPayloadInput {
2233                        stream_id,
2234                        content_type: Some(&content_type),
2235                        payload: &payload,
2236                        close_after,
2237                        stream_seq,
2238                        producer,
2239                        now_ms,
2240                    },
2241                    placement,
2242                )
2243                .map(GroupWriteResponse::Append),
2244            GroupWriteCommand::AppendExternal {
2245                stream_id,
2246                content_type,
2247                payload,
2248                close_after,
2249                stream_seq,
2250                producer,
2251                now_ms,
2252            } => {
2253                let response = self.state_machine.apply(StreamCommand::AppendExternal {
2254                    stream_id: stream_id.clone(),
2255                    content_type: Some(content_type),
2256                    payload,
2257                    close_after,
2258                    stream_seq,
2259                    producer,
2260                    now_ms,
2261                });
2262                match response {
2263                    StreamResponse::Appended {
2264                        offset,
2265                        next_offset,
2266                        closed,
2267                        deduplicated,
2268                        producer,
2269                        ..
2270                    } => {
2271                        let stream_append_count =
2272                            self.stream_append_counts.entry(stream_id).or_insert(0);
2273                        if !deduplicated {
2274                            self.commit_index += 1;
2275                            *stream_append_count += 1;
2276                        }
2277                        Ok(GroupWriteResponse::Append(AppendResponse {
2278                            placement,
2279                            start_offset: offset,
2280                            next_offset,
2281                            stream_append_count: *stream_append_count,
2282                            group_commit_index: self.commit_index,
2283                            closed,
2284                            deduplicated,
2285                            producer,
2286                        }))
2287                    }
2288                    StreamResponse::Error {
2289                        code,
2290                        message,
2291                        next_offset,
2292                    } => Err(GroupEngineError::stream_with_next_offset(
2293                        code,
2294                        message,
2295                        next_offset,
2296                    )),
2297                    other => Err(GroupEngineError::new(format!(
2298                        "unexpected append external response: {other:?}"
2299                    ))),
2300                }
2301            }
2302            GroupWriteCommand::AppendBatch {
2303                stream_id,
2304                content_type,
2305                payloads,
2306                producer,
2307                now_ms,
2308            } => {
2309                if producer.is_some() {
2310                    let payload_refs = payloads.iter().map(Bytes::as_ref).collect::<Vec<_>>();
2311                    let batch = self
2312                        .state_machine
2313                        .append_batch_borrowed(
2314                            stream_id.clone(),
2315                            Some(&content_type),
2316                            &payload_refs,
2317                            producer,
2318                            now_ms,
2319                        )
2320                        .map_err(stream_response_error)?;
2321                    let old_commit_index = self.commit_index;
2322                    let old_append_count = *self.stream_append_counts.get(&stream_id).unwrap_or(&0);
2323                    if !batch.deduplicated {
2324                        let count = u64::try_from(batch.items.len()).expect("item count fits u64");
2325                        self.commit_index += count;
2326                        *self.stream_append_counts.entry(stream_id).or_insert(0) += count;
2327                    }
2328                    let items = batch
2329                        .items
2330                        .into_iter()
2331                        .enumerate()
2332                        .map(|(index, item)| {
2333                            let item_index = u64::try_from(index + 1).expect("item index fits u64");
2334                            Ok(AppendResponse {
2335                                placement,
2336                                start_offset: item.offset,
2337                                next_offset: item.next_offset,
2338                                stream_append_count: if item.deduplicated {
2339                                    old_append_count
2340                                } else {
2341                                    old_append_count + item_index
2342                                },
2343                                group_commit_index: if item.deduplicated {
2344                                    old_commit_index
2345                                } else {
2346                                    old_commit_index + item_index
2347                                },
2348                                closed: item.closed,
2349                                deduplicated: item.deduplicated,
2350                                producer: None,
2351                            })
2352                        })
2353                        .collect();
2354                    return Ok(GroupWriteResponse::AppendBatch(GroupAppendBatchResponse {
2355                        placement,
2356                        items,
2357                    }));
2358                }
2359
2360                let mut items = Vec::with_capacity(payloads.len());
2361                for payload in payloads {
2362                    if payload.is_empty() {
2363                        items.push(Err(GroupEngineError::stream(
2364                            StreamErrorCode::EmptyAppend,
2365                            "append payload must be non-empty",
2366                        )));
2367                        continue;
2368                    }
2369                    items.push(self.append_payload(
2370                        AppendPayloadInput {
2371                            stream_id: stream_id.clone(),
2372                            content_type: Some(&content_type),
2373                            payload: &payload,
2374                            close_after: false,
2375                            stream_seq: None,
2376                            producer: None,
2377                            now_ms,
2378                        },
2379                        placement,
2380                    ));
2381                }
2382                Ok(GroupWriteResponse::AppendBatch(GroupAppendBatchResponse {
2383                    placement,
2384                    items,
2385                }))
2386            }
2387            GroupWriteCommand::PublishSnapshot {
2388                stream_id,
2389                snapshot_offset,
2390                content_type,
2391                payload,
2392                now_ms,
2393            } => {
2394                let response = self.state_machine.apply(StreamCommand::PublishSnapshot {
2395                    stream_id,
2396                    snapshot_offset,
2397                    content_type,
2398                    payload: payload.to_vec(),
2399                    now_ms,
2400                });
2401                match response {
2402                    StreamResponse::SnapshotPublished { snapshot_offset } => {
2403                        self.commit_index += 1;
2404                        Ok(GroupWriteResponse::PublishSnapshot(
2405                            PublishSnapshotResponse {
2406                                placement,
2407                                snapshot_offset,
2408                                group_commit_index: self.commit_index,
2409                            },
2410                        ))
2411                    }
2412                    StreamResponse::Error {
2413                        code,
2414                        message,
2415                        next_offset,
2416                    } => Err(GroupEngineError::stream_with_next_offset(
2417                        code,
2418                        message,
2419                        next_offset,
2420                    )),
2421                    other => Err(GroupEngineError::new(format!(
2422                        "unexpected publish snapshot response: {other:?}"
2423                    ))),
2424                }
2425            }
2426            GroupWriteCommand::TouchStreamAccess {
2427                stream_id,
2428                now_ms,
2429                renew_ttl,
2430            } => {
2431                let response = self.state_machine.apply(StreamCommand::TouchStreamAccess {
2432                    stream_id,
2433                    now_ms,
2434                    renew_ttl,
2435                });
2436                match response {
2437                    StreamResponse::Accessed { changed, expired } => {
2438                        if changed || expired {
2439                            self.commit_index += 1;
2440                        }
2441                        Ok(GroupWriteResponse::TouchStreamAccess(
2442                            TouchStreamAccessResponse {
2443                                placement,
2444                                changed,
2445                                expired,
2446                                group_commit_index: self.commit_index,
2447                            },
2448                        ))
2449                    }
2450                    StreamResponse::Error {
2451                        code,
2452                        message,
2453                        next_offset,
2454                    } => Err(GroupEngineError::stream_with_next_offset(
2455                        code,
2456                        message,
2457                        next_offset,
2458                    )),
2459                    other => Err(GroupEngineError::new(format!(
2460                        "unexpected touch stream access response: {other:?}"
2461                    ))),
2462                }
2463            }
2464            GroupWriteCommand::AddForkRef { stream_id, now_ms } => {
2465                let response = self
2466                    .state_machine
2467                    .apply(StreamCommand::AddForkRef { stream_id, now_ms });
2468                match response {
2469                    StreamResponse::ForkRefAdded { fork_ref_count } => {
2470                        self.commit_index += 1;
2471                        Ok(GroupWriteResponse::AddForkRef(ForkRefResponse {
2472                            placement,
2473                            fork_ref_count,
2474                            hard_deleted: false,
2475                            parent_to_release: None,
2476                            group_commit_index: self.commit_index,
2477                        }))
2478                    }
2479                    StreamResponse::Error {
2480                        code,
2481                        message,
2482                        next_offset,
2483                    } => Err(GroupEngineError::stream_with_next_offset(
2484                        code,
2485                        message,
2486                        next_offset,
2487                    )),
2488                    other => Err(GroupEngineError::new(format!(
2489                        "unexpected add fork ref response: {other:?}"
2490                    ))),
2491                }
2492            }
2493            GroupWriteCommand::ReleaseForkRef { stream_id } => {
2494                let response = self
2495                    .state_machine
2496                    .apply(StreamCommand::ReleaseForkRef { stream_id });
2497                match response {
2498                    StreamResponse::ForkRefReleased {
2499                        hard_deleted,
2500                        fork_ref_count,
2501                        parent_to_release,
2502                    } => {
2503                        self.commit_index += 1;
2504                        Ok(GroupWriteResponse::ReleaseForkRef(ForkRefResponse {
2505                            placement,
2506                            fork_ref_count,
2507                            hard_deleted,
2508                            parent_to_release,
2509                            group_commit_index: self.commit_index,
2510                        }))
2511                    }
2512                    StreamResponse::Error {
2513                        code,
2514                        message,
2515                        next_offset,
2516                    } => Err(GroupEngineError::stream_with_next_offset(
2517                        code,
2518                        message,
2519                        next_offset,
2520                    )),
2521                    other => Err(GroupEngineError::new(format!(
2522                        "unexpected release fork ref response: {other:?}"
2523                    ))),
2524                }
2525            }
2526            GroupWriteCommand::FlushCold { stream_id, chunk } => {
2527                let response = self
2528                    .state_machine
2529                    .apply(StreamCommand::FlushCold { stream_id, chunk });
2530                match response {
2531                    StreamResponse::ColdFlushed { hot_start_offset } => {
2532                        self.commit_index += 1;
2533                        Ok(GroupWriteResponse::FlushCold(FlushColdResponse {
2534                            placement,
2535                            hot_start_offset,
2536                            group_commit_index: self.commit_index,
2537                        }))
2538                    }
2539                    StreamResponse::Error {
2540                        code,
2541                        message,
2542                        next_offset,
2543                    } => Err(GroupEngineError::stream_with_next_offset(
2544                        code,
2545                        message,
2546                        next_offset,
2547                    )),
2548                    other => Err(GroupEngineError::new(format!(
2549                        "unexpected flush cold response: {other:?}"
2550                    ))),
2551                }
2552            }
2553            GroupWriteCommand::CloseStream {
2554                stream_id,
2555                stream_seq,
2556                producer,
2557                now_ms,
2558            } => {
2559                let response = self.state_machine.apply(StreamCommand::Close {
2560                    stream_id,
2561                    stream_seq,
2562                    producer,
2563                    now_ms,
2564                });
2565                match response {
2566                    StreamResponse::Closed {
2567                        next_offset,
2568                        deduplicated,
2569                        ..
2570                    } => {
2571                        if !deduplicated {
2572                            self.commit_index += 1;
2573                        }
2574                        Ok(GroupWriteResponse::CloseStream(CloseStreamResponse {
2575                            placement,
2576                            next_offset,
2577                            group_commit_index: self.commit_index,
2578                            deduplicated,
2579                        }))
2580                    }
2581                    StreamResponse::Error {
2582                        code,
2583                        message,
2584                        next_offset,
2585                    } => Err(GroupEngineError::stream_with_next_offset(
2586                        code,
2587                        message,
2588                        next_offset,
2589                    )),
2590                    other => Err(GroupEngineError::new(format!(
2591                        "unexpected close stream response: {other:?}"
2592                    ))),
2593                }
2594            }
2595            GroupWriteCommand::DeleteStream { stream_id } => {
2596                let response = self
2597                    .state_machine
2598                    .apply(StreamCommand::DeleteStream { stream_id });
2599                match response {
2600                    StreamResponse::Deleted {
2601                        hard_deleted,
2602                        parent_to_release,
2603                    } => {
2604                        self.commit_index += 1;
2605                        Ok(GroupWriteResponse::DeleteStream(DeleteStreamResponse {
2606                            placement,
2607                            group_commit_index: self.commit_index,
2608                            hard_deleted,
2609                            parent_to_release,
2610                        }))
2611                    }
2612                    StreamResponse::Error {
2613                        code,
2614                        message,
2615                        next_offset,
2616                    } => Err(GroupEngineError::stream_with_next_offset(
2617                        code,
2618                        message,
2619                        next_offset,
2620                    )),
2621                    other => Err(GroupEngineError::new(format!(
2622                        "unexpected delete stream response: {other:?}"
2623                    ))),
2624                }
2625            }
2626            GroupWriteCommand::Batch { commands } => Ok(GroupWriteResponse::Batch(
2627                self.apply_committed_write_batch(commands, placement),
2628            )),
2629        }
2630    }
2631
2632    fn cold_hot_backlog_for(
2633        &self,
2634        stream_id: BucketStreamId,
2635    ) -> Result<ColdHotBacklog, GroupEngineError> {
2636        let stream_hot_bytes = self.state_machine.hot_payload_len(&stream_id).unwrap_or(0);
2637        Ok(ColdHotBacklog {
2638            stream_id,
2639            stream_hot_bytes,
2640            group_hot_bytes: self.state_machine.total_hot_payload_bytes(),
2641        })
2642    }
2643
2644    fn enforce_cold_write_admission(
2645        &self,
2646        stream_id: &BucketStreamId,
2647        admission: ColdWriteAdmission,
2648        before_group_hot_bytes: u64,
2649        after_group_hot_bytes: u64,
2650        mutating: bool,
2651    ) -> Result<(), GroupEngineError> {
2652        let Some(limit) = admission.max_hot_bytes_per_group else {
2653            return Ok(());
2654        };
2655        if !mutating || after_group_hot_bytes <= limit {
2656            return Ok(());
2657        }
2658        Err(GroupEngineError::new(format!(
2659            "ColdBackpressure: stream '{stream_id}' would raise group hot bytes from {before_group_hot_bytes} to {after_group_hot_bytes}, above limit {limit}"
2660        )))
2661    }
2662
2663    fn create_stream_with_admission_inner(
2664        &mut self,
2665        request: CreateStreamRequest,
2666        placement: ShardPlacement,
2667        admission: ColdWriteAdmission,
2668    ) -> Result<CreateStreamResponse, GroupEngineError> {
2669        let stream_id = request.stream_id.clone();
2670        let command = GroupWriteCommand::from(request);
2671        let before = self.state_machine.total_hot_payload_bytes();
2672        let mut preview = self.clone();
2673        let response = match preview.apply_committed_write(command, placement)? {
2674            GroupWriteResponse::CreateStream(response) => response,
2675            other => {
2676                return Err(GroupEngineError::new(format!(
2677                    "unexpected create stream write response: {other:?}"
2678                )));
2679            }
2680        };
2681        preview.enforce_cold_write_admission(
2682            &stream_id,
2683            admission,
2684            before,
2685            preview.state_machine.total_hot_payload_bytes(),
2686            !response.already_exists,
2687        )?;
2688        *self = preview;
2689        Ok(response)
2690    }
2691
2692    fn append_with_admission_inner(
2693        &mut self,
2694        request: AppendRequest,
2695        placement: ShardPlacement,
2696        admission: ColdWriteAdmission,
2697    ) -> Result<AppendResponse, GroupEngineError> {
2698        let stream_id = request.stream_id.clone();
2699        let command = GroupWriteCommand::from(request);
2700        let before = self.state_machine.total_hot_payload_bytes();
2701        let mut preview = self.clone();
2702        let response = match preview.apply_committed_write(command, placement)? {
2703            GroupWriteResponse::Append(response) => response,
2704            other => {
2705                return Err(GroupEngineError::new(format!(
2706                    "unexpected append write response: {other:?}"
2707                )));
2708            }
2709        };
2710        preview.enforce_cold_write_admission(
2711            &stream_id,
2712            admission,
2713            before,
2714            preview.state_machine.total_hot_payload_bytes(),
2715            !response.deduplicated,
2716        )?;
2717        *self = preview;
2718        Ok(response)
2719    }
2720
2721    fn append_batch_with_admission_inner(
2722        &mut self,
2723        request: AppendBatchRequest,
2724        placement: ShardPlacement,
2725        admission: ColdWriteAdmission,
2726    ) -> Result<GroupAppendBatchResponse, GroupEngineError> {
2727        let stream_id = request.stream_id.clone();
2728        let command = GroupWriteCommand::from(request);
2729        let before = self.state_machine.total_hot_payload_bytes();
2730        let mut preview = self.clone();
2731        let response = match preview.apply_committed_write(command, placement)? {
2732            GroupWriteResponse::AppendBatch(response) => response,
2733            other => {
2734                return Err(GroupEngineError::new(format!(
2735                    "unexpected append batch write response: {other:?}"
2736                )));
2737            }
2738        };
2739        let mutating = response
2740            .items
2741            .iter()
2742            .any(|item| matches!(item, Ok(response) if !response.deduplicated));
2743        preview.enforce_cold_write_admission(
2744            &stream_id,
2745            admission,
2746            before,
2747            preview.state_machine.total_hot_payload_bytes(),
2748            mutating,
2749        )?;
2750        *self = preview;
2751        Ok(response)
2752    }
2753
2754    pub fn access_requires_write(
2755        &self,
2756        stream_id: &BucketStreamId,
2757        now_ms: u64,
2758        renew_ttl: bool,
2759    ) -> Result<bool, GroupEngineError> {
2760        self.state_machine
2761            .access_requires_write(stream_id, now_ms, renew_ttl)
2762            .map_err(stream_response_error)
2763    }
2764
2765    fn apply_access_command(
2766        &mut self,
2767        stream_id: BucketStreamId,
2768        now_ms: u64,
2769        renew_ttl: bool,
2770        placement: ShardPlacement,
2771    ) -> Result<TouchStreamAccessResponse, GroupEngineError> {
2772        match self.apply_committed_write(
2773            GroupWriteCommand::TouchStreamAccess {
2774                stream_id,
2775                now_ms,
2776                renew_ttl,
2777            },
2778            placement,
2779        )? {
2780            GroupWriteResponse::TouchStreamAccess(response) => Ok(response),
2781            other => Err(GroupEngineError::new(format!(
2782                "unexpected touch stream access write response: {other:?}"
2783            ))),
2784        }
2785    }
2786
2787    fn ensure_stream_access(
2788        &mut self,
2789        stream_id: &BucketStreamId,
2790        now_ms: u64,
2791        renew_ttl: bool,
2792        placement: ShardPlacement,
2793    ) -> Result<Option<TouchStreamAccessResponse>, GroupEngineError> {
2794        if !self.access_requires_write(stream_id, now_ms, renew_ttl)? {
2795            return Ok(None);
2796        }
2797        let response =
2798            self.apply_access_command(stream_id.clone(), now_ms, renew_ttl, placement)?;
2799        if response.expired {
2800            return Err(GroupEngineError::stream(
2801                StreamErrorCode::StreamNotFound,
2802                format!("stream '{stream_id}' does not exist"),
2803            ));
2804        }
2805        Ok(Some(response))
2806    }
2807
2808    pub fn apply_committed_write_batch(
2809        &mut self,
2810        commands: Vec<GroupWriteCommand>,
2811        placement: ShardPlacement,
2812    ) -> Vec<Result<GroupWriteResponse, GroupEngineError>> {
2813        commands
2814            .into_iter()
2815            .map(|command| self.apply_committed_write(command, placement))
2816            .collect()
2817    }
2818
2819    fn apply_replayed_write_command(
2820        &mut self,
2821        command: GroupWriteCommand,
2822    ) -> Result<(), GroupEngineError> {
2823        let placement = ShardPlacement {
2824            core_id: CoreId(0),
2825            shard_id: ShardId(0),
2826            raft_group_id: RaftGroupId(0),
2827        };
2828        self.apply_committed_write(command, placement).map(|_| ())
2829    }
2830
2831    fn apply_replayed_command(&mut self, command: StreamCommand) -> Result<(), GroupEngineError> {
2832        match command {
2833            StreamCommand::CreateBucket { bucket_id } => {
2834                match self
2835                    .state_machine
2836                    .apply(StreamCommand::CreateBucket { bucket_id })
2837                {
2838                    StreamResponse::BucketCreated { .. } => {
2839                        self.commit_index += 1;
2840                        Ok(())
2841                    }
2842                    StreamResponse::BucketAlreadyExists { .. } => Ok(()),
2843                    StreamResponse::Error {
2844                        code,
2845                        message,
2846                        next_offset,
2847                    } => Err(GroupEngineError::stream_with_next_offset(
2848                        code,
2849                        message,
2850                        next_offset,
2851                    )),
2852                    other => Err(GroupEngineError::new(format!(
2853                        "unexpected replay create bucket response: {other:?}"
2854                    ))),
2855                }
2856            }
2857            StreamCommand::DeleteBucket { bucket_id } => {
2858                match self
2859                    .state_machine
2860                    .apply(StreamCommand::DeleteBucket { bucket_id })
2861                {
2862                    StreamResponse::BucketDeleted { .. } => {
2863                        self.commit_index += 1;
2864                        Ok(())
2865                    }
2866                    StreamResponse::Error {
2867                        code,
2868                        message,
2869                        next_offset,
2870                    } => Err(GroupEngineError::stream_with_next_offset(
2871                        code,
2872                        message,
2873                        next_offset,
2874                    )),
2875                    other => Err(GroupEngineError::new(format!(
2876                        "unexpected replay delete bucket response: {other:?}"
2877                    ))),
2878                }
2879            }
2880            StreamCommand::CreateStream {
2881                stream_id,
2882                content_type,
2883                initial_payload,
2884                close_after,
2885                stream_seq,
2886                producer,
2887                stream_ttl_seconds,
2888                stream_expires_at_ms,
2889                forked_from,
2890                fork_offset,
2891                now_ms,
2892            } => {
2893                ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2894                let response = self.state_machine.apply(StreamCommand::CreateStream {
2895                    stream_id,
2896                    content_type,
2897                    initial_payload,
2898                    close_after,
2899                    stream_seq,
2900                    producer,
2901                    stream_ttl_seconds,
2902                    stream_expires_at_ms,
2903                    forked_from,
2904                    fork_offset,
2905                    now_ms,
2906                });
2907                match response {
2908                    StreamResponse::Created { .. } => {
2909                        self.commit_index += 1;
2910                        Ok(())
2911                    }
2912                    StreamResponse::AlreadyExists { .. } => Ok(()),
2913                    StreamResponse::Error {
2914                        code,
2915                        message,
2916                        next_offset,
2917                    } => Err(GroupEngineError::stream_with_next_offset(
2918                        code,
2919                        message,
2920                        next_offset,
2921                    )),
2922                    other => Err(GroupEngineError::new(format!(
2923                        "unexpected replay create stream response: {other:?}"
2924                    ))),
2925                }
2926            }
2927            StreamCommand::CreateExternal {
2928                stream_id,
2929                content_type,
2930                initial_payload,
2931                close_after,
2932                stream_seq,
2933                producer,
2934                stream_ttl_seconds,
2935                stream_expires_at_ms,
2936                forked_from,
2937                fork_offset,
2938                now_ms,
2939            } => {
2940                ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2941                let response = self.state_machine.apply(StreamCommand::CreateExternal {
2942                    stream_id,
2943                    content_type,
2944                    initial_payload,
2945                    close_after,
2946                    stream_seq,
2947                    producer,
2948                    stream_ttl_seconds,
2949                    stream_expires_at_ms,
2950                    forked_from,
2951                    fork_offset,
2952                    now_ms,
2953                });
2954                match response {
2955                    StreamResponse::Created { .. } => {
2956                        self.commit_index += 1;
2957                        Ok(())
2958                    }
2959                    StreamResponse::AlreadyExists { .. } => Ok(()),
2960                    StreamResponse::Error {
2961                        code,
2962                        message,
2963                        next_offset,
2964                    } => Err(GroupEngineError::stream_with_next_offset(
2965                        code,
2966                        message,
2967                        next_offset,
2968                    )),
2969                    other => Err(GroupEngineError::new(format!(
2970                        "unexpected replay external create stream response: {other:?}"
2971                    ))),
2972                }
2973            }
2974            StreamCommand::Append {
2975                stream_id,
2976                content_type,
2977                payload,
2978                close_after,
2979                stream_seq,
2980                producer,
2981                now_ms,
2982            } => {
2983                let stream_count_key = stream_id.clone();
2984                let response = self.state_machine.apply(StreamCommand::Append {
2985                    stream_id,
2986                    content_type,
2987                    payload,
2988                    close_after,
2989                    stream_seq,
2990                    producer,
2991                    now_ms,
2992                });
2993                match response {
2994                    StreamResponse::Appended { deduplicated, .. } => {
2995                        if !deduplicated {
2996                            self.commit_index += 1;
2997                            *self
2998                                .stream_append_counts
2999                                .entry(stream_count_key)
3000                                .or_insert(0) += 1;
3001                        }
3002                        Ok(())
3003                    }
3004                    StreamResponse::Closed { deduplicated, .. } => {
3005                        if !deduplicated {
3006                            self.commit_index += 1;
3007                        }
3008                        Ok(())
3009                    }
3010                    StreamResponse::Error {
3011                        code,
3012                        message,
3013                        next_offset,
3014                    } => Err(GroupEngineError::stream_with_next_offset(
3015                        code,
3016                        message,
3017                        next_offset,
3018                    )),
3019                    other => Err(GroupEngineError::new(format!(
3020                        "unexpected replay append response: {other:?}"
3021                    ))),
3022                }
3023            }
3024            StreamCommand::AppendExternal {
3025                stream_id,
3026                content_type,
3027                payload,
3028                close_after,
3029                stream_seq,
3030                producer,
3031                now_ms,
3032            } => {
3033                let stream_count_key = stream_id.clone();
3034                let response = self.state_machine.apply(StreamCommand::AppendExternal {
3035                    stream_id,
3036                    content_type,
3037                    payload,
3038                    close_after,
3039                    stream_seq,
3040                    producer,
3041                    now_ms,
3042                });
3043                match response {
3044                    StreamResponse::Appended { deduplicated, .. } => {
3045                        if !deduplicated {
3046                            self.commit_index += 1;
3047                            *self
3048                                .stream_append_counts
3049                                .entry(stream_count_key)
3050                                .or_insert(0) += 1;
3051                        }
3052                        Ok(())
3053                    }
3054                    StreamResponse::Error {
3055                        code,
3056                        message,
3057                        next_offset,
3058                    } => Err(GroupEngineError::stream_with_next_offset(
3059                        code,
3060                        message,
3061                        next_offset,
3062                    )),
3063                    other => Err(GroupEngineError::new(format!(
3064                        "unexpected replay external append response: {other:?}"
3065                    ))),
3066                }
3067            }
3068            StreamCommand::AppendBatch {
3069                stream_id,
3070                content_type,
3071                payloads,
3072                producer,
3073                now_ms,
3074            } => {
3075                let stream_count_key = stream_id.clone();
3076                let payload_refs = payloads.iter().map(Vec::as_slice).collect::<Vec<_>>();
3077                let response = self
3078                    .state_machine
3079                    .append_batch_borrowed(
3080                        stream_id,
3081                        content_type.as_deref(),
3082                        &payload_refs,
3083                        producer,
3084                        now_ms,
3085                    )
3086                    .map_err(stream_response_error)?;
3087                if !response.deduplicated {
3088                    let count = u64::try_from(response.items.len()).expect("item count fits u64");
3089                    self.commit_index += count;
3090                    *self
3091                        .stream_append_counts
3092                        .entry(stream_count_key)
3093                        .or_insert(0) += count;
3094                }
3095                Ok(())
3096            }
3097            StreamCommand::PublishSnapshot {
3098                stream_id,
3099                snapshot_offset,
3100                content_type,
3101                payload,
3102                now_ms,
3103            } => {
3104                let response = self.state_machine.apply(StreamCommand::PublishSnapshot {
3105                    stream_id,
3106                    snapshot_offset,
3107                    content_type,
3108                    payload,
3109                    now_ms,
3110                });
3111                match response {
3112                    StreamResponse::SnapshotPublished { .. } => {
3113                        self.commit_index += 1;
3114                        Ok(())
3115                    }
3116                    StreamResponse::Error {
3117                        code,
3118                        message,
3119                        next_offset,
3120                    } => Err(GroupEngineError::stream_with_next_offset(
3121                        code,
3122                        message,
3123                        next_offset,
3124                    )),
3125                    other => Err(GroupEngineError::new(format!(
3126                        "unexpected replay publish snapshot response: {other:?}"
3127                    ))),
3128                }
3129            }
3130            StreamCommand::TouchStreamAccess {
3131                stream_id,
3132                now_ms,
3133                renew_ttl,
3134            } => {
3135                let response = self.state_machine.apply(StreamCommand::TouchStreamAccess {
3136                    stream_id,
3137                    now_ms,
3138                    renew_ttl,
3139                });
3140                match response {
3141                    StreamResponse::Accessed { changed, expired } => {
3142                        if changed || expired {
3143                            self.commit_index += 1;
3144                        }
3145                        Ok(())
3146                    }
3147                    StreamResponse::Error {
3148                        code,
3149                        message,
3150                        next_offset,
3151                    } => Err(GroupEngineError::stream_with_next_offset(
3152                        code,
3153                        message,
3154                        next_offset,
3155                    )),
3156                    other => Err(GroupEngineError::new(format!(
3157                        "unexpected replay touch stream access response: {other:?}"
3158                    ))),
3159                }
3160            }
3161            StreamCommand::AddForkRef { stream_id, now_ms } => {
3162                let response = self
3163                    .state_machine
3164                    .apply(StreamCommand::AddForkRef { stream_id, now_ms });
3165                match response {
3166                    StreamResponse::ForkRefAdded { .. } => {
3167                        self.commit_index += 1;
3168                        Ok(())
3169                    }
3170                    StreamResponse::Error {
3171                        code,
3172                        message,
3173                        next_offset,
3174                    } => Err(GroupEngineError::stream_with_next_offset(
3175                        code,
3176                        message,
3177                        next_offset,
3178                    )),
3179                    other => Err(GroupEngineError::new(format!(
3180                        "unexpected replay add fork ref response: {other:?}"
3181                    ))),
3182                }
3183            }
3184            StreamCommand::ReleaseForkRef { stream_id } => {
3185                let response = self
3186                    .state_machine
3187                    .apply(StreamCommand::ReleaseForkRef { stream_id });
3188                match response {
3189                    StreamResponse::ForkRefReleased { .. } => {
3190                        self.commit_index += 1;
3191                        Ok(())
3192                    }
3193                    StreamResponse::Error {
3194                        code,
3195                        message,
3196                        next_offset,
3197                    } => Err(GroupEngineError::stream_with_next_offset(
3198                        code,
3199                        message,
3200                        next_offset,
3201                    )),
3202                    other => Err(GroupEngineError::new(format!(
3203                        "unexpected replay release fork ref response: {other:?}"
3204                    ))),
3205                }
3206            }
3207            StreamCommand::FlushCold { stream_id, chunk } => {
3208                let response = self
3209                    .state_machine
3210                    .apply(StreamCommand::FlushCold { stream_id, chunk });
3211                match response {
3212                    StreamResponse::ColdFlushed { .. } => {
3213                        self.commit_index += 1;
3214                        Ok(())
3215                    }
3216                    StreamResponse::Error {
3217                        code,
3218                        message,
3219                        next_offset,
3220                    } => Err(GroupEngineError::stream_with_next_offset(
3221                        code,
3222                        message,
3223                        next_offset,
3224                    )),
3225                    other => Err(GroupEngineError::new(format!(
3226                        "unexpected replay flush cold response: {other:?}"
3227                    ))),
3228                }
3229            }
3230            StreamCommand::Close {
3231                stream_id,
3232                stream_seq,
3233                producer,
3234                now_ms,
3235            } => {
3236                let response = self.state_machine.apply(StreamCommand::Close {
3237                    stream_id,
3238                    stream_seq,
3239                    producer,
3240                    now_ms,
3241                });
3242                match response {
3243                    StreamResponse::Closed { deduplicated, .. } => {
3244                        if !deduplicated {
3245                            self.commit_index += 1;
3246                        }
3247                        Ok(())
3248                    }
3249                    StreamResponse::Error {
3250                        code,
3251                        message,
3252                        next_offset,
3253                    } => Err(GroupEngineError::stream_with_next_offset(
3254                        code,
3255                        message,
3256                        next_offset,
3257                    )),
3258                    other => Err(GroupEngineError::new(format!(
3259                        "unexpected replay close stream response: {other:?}"
3260                    ))),
3261                }
3262            }
3263            StreamCommand::DeleteStream { stream_id } => {
3264                let response = self
3265                    .state_machine
3266                    .apply(StreamCommand::DeleteStream { stream_id });
3267                match response {
3268                    StreamResponse::Deleted { .. } => {
3269                        self.commit_index += 1;
3270                        Ok(())
3271                    }
3272                    StreamResponse::Error {
3273                        code,
3274                        message,
3275                        next_offset,
3276                    } => Err(GroupEngineError::stream_with_next_offset(
3277                        code,
3278                        message,
3279                        next_offset,
3280                    )),
3281                    other => Err(GroupEngineError::new(format!(
3282                        "unexpected replay delete stream response: {other:?}"
3283                    ))),
3284                }
3285            }
3286        }
3287    }
3288
3289    fn append_payload(
3290        &mut self,
3291        input: AppendPayloadInput<'_>,
3292        placement: ShardPlacement,
3293    ) -> Result<AppendResponse, GroupEngineError> {
3294        let AppendPayloadInput {
3295            stream_id,
3296            content_type,
3297            payload,
3298            close_after,
3299            stream_seq,
3300            producer,
3301            now_ms,
3302        } = input;
3303        let stream_count_key = stream_id.clone();
3304        let response = self.state_machine.append_borrowed(AppendStreamInput {
3305            stream_id,
3306            content_type,
3307            payload,
3308            close_after,
3309            stream_seq,
3310            producer,
3311            now_ms,
3312        });
3313        match response {
3314            StreamResponse::Appended {
3315                offset,
3316                next_offset,
3317                closed,
3318                deduplicated,
3319                producer,
3320                ..
3321            } => {
3322                let stream_append_count = self
3323                    .stream_append_counts
3324                    .entry(stream_count_key)
3325                    .or_insert(0);
3326                if !deduplicated {
3327                    self.commit_index += 1;
3328                    *stream_append_count += 1;
3329                }
3330                Ok(AppendResponse {
3331                    placement,
3332                    start_offset: offset,
3333                    next_offset,
3334                    stream_append_count: *stream_append_count,
3335                    group_commit_index: self.commit_index,
3336                    closed,
3337                    deduplicated,
3338                    producer,
3339                })
3340            }
3341            StreamResponse::Error {
3342                code,
3343                message,
3344                next_offset,
3345            } => Err(GroupEngineError::stream_with_next_offset(
3346                code,
3347                message,
3348                next_offset,
3349            )),
3350            other => Err(GroupEngineError::new(format!(
3351                "unexpected append response: {other:?}"
3352            ))),
3353        }
3354    }
3355
3356    pub fn read_stream_plan(
3357        &mut self,
3358        request: &ReadStreamRequest,
3359        placement: ShardPlacement,
3360    ) -> Result<StreamReadPlan, GroupEngineError> {
3361        self.ensure_stream_access(&request.stream_id, request.now_ms, true, placement)?;
3362        self.read_stream_plan_after_access(request)
3363    }
3364
3365    pub fn read_stream_plan_after_access(
3366        &self,
3367        request: &ReadStreamRequest,
3368    ) -> Result<StreamReadPlan, GroupEngineError> {
3369        self.state_machine
3370            .read_plan_at(
3371                &request.stream_id,
3372                request.offset,
3373                request.max_len,
3374                request.now_ms,
3375            )
3376            .map_err(stream_response_error)
3377    }
3378
3379    pub async fn read_payload_from_plan(
3380        cold_store: Option<&ColdStoreHandle>,
3381        stream_id: &BucketStreamId,
3382        plan: &StreamReadPlan,
3383    ) -> Result<Vec<u8>, GroupEngineError> {
3384        let mut payload = Vec::new();
3385        for segment in &plan.segments {
3386            match segment {
3387                StreamReadSegment::Hot(bytes) => payload.extend_from_slice(bytes),
3388                StreamReadSegment::Object(segment) => {
3389                    let Some(cold_store) = cold_store else {
3390                        return Err(GroupEngineError::stream_with_next_offset(
3391                            StreamErrorCode::InvalidColdFlush,
3392                            format!("stream '{stream_id}' read requires object payload store"),
3393                            Some(plan.next_offset),
3394                        ));
3395                    };
3396                    let bytes = cold_store
3397                        .read_object_range(&segment.object, segment.read_start_offset, segment.len)
3398                        .await
3399                        .map_err(|err| GroupEngineError::new(err.to_string()))?;
3400                    payload.extend_from_slice(&bytes);
3401                }
3402            }
3403        }
3404        Ok(payload)
3405    }
3406
3407    async fn read_own_payload_from_plan(
3408        &self,
3409        stream_id: &BucketStreamId,
3410        plan: &StreamReadPlan,
3411    ) -> Result<Vec<u8>, GroupEngineError> {
3412        Self::read_payload_from_plan(self.cold_store.as_ref(), stream_id, plan).await
3413    }
3414
3415    async fn bootstrap_updates(
3416        &self,
3417        stream_id: &BucketStreamId,
3418        records: &[StreamMessageRecord],
3419        content_type: &str,
3420        now_ms: u64,
3421    ) -> Result<Vec<BootstrapUpdate>, GroupEngineError> {
3422        let mut updates = Vec::with_capacity(records.len());
3423        for record in records {
3424            let len = usize::try_from(record.end_offset - record.start_offset).map_err(|_| {
3425                GroupEngineError::stream(
3426                    StreamErrorCode::InvalidSnapshot,
3427                    format!(
3428                        "bootstrap message [{}..{}) for stream '{stream_id}' is too large",
3429                        record.start_offset, record.end_offset
3430                    ),
3431                )
3432            })?;
3433            let plan = self
3434                .state_machine
3435                .read_plan_at(stream_id, record.start_offset, len, now_ms)
3436                .map_err(stream_response_error)?;
3437            let payload = self.read_own_payload_from_plan(stream_id, &plan).await?;
3438            updates.push(BootstrapUpdate {
3439                start_offset: record.start_offset,
3440                next_offset: record.end_offset,
3441                content_type: content_type.to_owned(),
3442                payload,
3443            });
3444        }
3445        Ok(updates)
3446    }
3447
3448    fn build_snapshot(&self, placement: ShardPlacement) -> GroupSnapshot {
3449        GroupSnapshot {
3450            placement,
3451            group_commit_index: self.commit_index,
3452            stream_snapshot: self.state_machine.snapshot(),
3453            stream_append_counts: self.stream_append_counts_snapshot(),
3454        }
3455    }
3456
3457    fn stream_append_counts_snapshot(&self) -> Vec<StreamAppendCount> {
3458        let mut counts = self
3459            .stream_append_counts
3460            .iter()
3461            .map(|(stream_id, append_count)| StreamAppendCount {
3462                stream_id: stream_id.clone(),
3463                append_count: *append_count,
3464            })
3465            .collect::<Vec<_>>();
3466        counts.sort_by(|left, right| compare_stream_ids(&left.stream_id, &right.stream_id));
3467        counts
3468    }
3469
3470    fn install_snapshot_inner(&mut self, snapshot: GroupSnapshot) -> Result<(), GroupEngineError> {
3471        let GroupSnapshot {
3472            placement: _,
3473            group_commit_index,
3474            stream_snapshot,
3475            stream_append_counts,
3476        } = snapshot;
3477        self.install_snapshot_parts(group_commit_index, stream_snapshot, stream_append_counts)
3478    }
3479
3480    fn install_snapshot_parts(
3481        &mut self,
3482        group_commit_index: u64,
3483        stream_snapshot: StreamSnapshot,
3484        stream_append_counts: Vec<StreamAppendCount>,
3485    ) -> Result<(), GroupEngineError> {
3486        let stream_ids = stream_snapshot
3487            .streams
3488            .iter()
3489            .map(|entry| entry.metadata.stream_id.clone())
3490            .collect::<HashSet<_>>();
3491        let state_machine = StreamStateMachine::restore(stream_snapshot)
3492            .map_err(|err| GroupEngineError::new(format!("restore stream snapshot: {err}")))?;
3493        let stream_append_counts = restore_stream_append_counts(stream_append_counts, &stream_ids)?;
3494
3495        self.commit_index = group_commit_index;
3496        self.state_machine = state_machine;
3497        self.stream_append_counts = stream_append_counts;
3498        Ok(())
3499    }
3500}
3501
3502impl GroupEngine for InMemoryGroupEngine {
3503    fn create_stream<'a>(
3504        &'a mut self,
3505        request: CreateStreamRequest,
3506        placement: ShardPlacement,
3507    ) -> GroupCreateStreamFuture<'a> {
3508        let command = GroupWriteCommand::from(request);
3509        Box::pin(async move {
3510            match self.apply_committed_write(command, placement)? {
3511                GroupWriteResponse::CreateStream(response) => Ok(response),
3512                other => Err(GroupEngineError::new(format!(
3513                    "unexpected create stream write response: {other:?}"
3514                ))),
3515            }
3516        })
3517    }
3518
3519    fn create_stream_with_cold_admission<'a>(
3520        &'a mut self,
3521        request: CreateStreamRequest,
3522        placement: ShardPlacement,
3523        admission: ColdWriteAdmission,
3524    ) -> GroupCreateStreamFuture<'a> {
3525        if !admission.is_enabled() {
3526            return self.create_stream(request, placement);
3527        }
3528        Box::pin(
3529            async move { self.create_stream_with_admission_inner(request, placement, admission) },
3530        )
3531    }
3532
3533    fn create_stream_external<'a>(
3534        &'a mut self,
3535        request: CreateStreamExternalRequest,
3536        placement: ShardPlacement,
3537    ) -> GroupCreateStreamFuture<'a> {
3538        let command = GroupWriteCommand::from(request);
3539        Box::pin(async move {
3540            match self.apply_committed_write(command, placement)? {
3541                GroupWriteResponse::CreateStream(response) => Ok(response),
3542                other => Err(GroupEngineError::new(format!(
3543                    "unexpected external create stream write response: {other:?}"
3544                ))),
3545            }
3546        })
3547    }
3548
3549    fn read_stream<'a>(
3550        &'a mut self,
3551        request: ReadStreamRequest,
3552        placement: ShardPlacement,
3553    ) -> GroupReadStreamFuture<'a> {
3554        Box::pin(async move {
3555            self.read_stream_parts(request, placement)
3556                .await?
3557                .into_response()
3558                .await
3559        })
3560    }
3561
3562    fn read_stream_parts<'a>(
3563        &'a mut self,
3564        request: ReadStreamRequest,
3565        placement: ShardPlacement,
3566    ) -> GroupReadStreamPartsFuture<'a> {
3567        Box::pin(async move {
3568            let stream_id = request.stream_id.clone();
3569            let plan = self.read_stream_plan(&request, placement)?;
3570            Ok(GroupReadStreamParts::from_plan(
3571                placement,
3572                stream_id,
3573                plan,
3574                self.cold_store(),
3575            ))
3576        })
3577    }
3578
3579    fn publish_snapshot<'a>(
3580        &'a mut self,
3581        request: PublishSnapshotRequest,
3582        placement: ShardPlacement,
3583    ) -> GroupPublishSnapshotFuture<'a> {
3584        Box::pin(async move {
3585            self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3586            let command = GroupWriteCommand::from(request);
3587            match self.apply_committed_write(command, placement)? {
3588                GroupWriteResponse::PublishSnapshot(response) => Ok(response),
3589                other => Err(GroupEngineError::new(format!(
3590                    "unexpected publish snapshot write response: {other:?}"
3591                ))),
3592            }
3593        })
3594    }
3595
3596    fn read_snapshot<'a>(
3597        &'a mut self,
3598        request: ReadSnapshotRequest,
3599        placement: ShardPlacement,
3600    ) -> GroupReadSnapshotFuture<'a> {
3601        Box::pin(async move {
3602            self.ensure_stream_access(&request.stream_id, request.now_ms, true, placement)?;
3603            let snapshot = match request.snapshot_offset {
3604                Some(offset) => self
3605                    .state_machine
3606                    .read_snapshot(&request.stream_id, offset)
3607                    .map_err(stream_response_error)?,
3608                None => self
3609                    .state_machine
3610                    .latest_snapshot(&request.stream_id)
3611                    .map_err(stream_response_error)?
3612                    .ok_or_else(|| {
3613                        GroupEngineError::stream(
3614                            StreamErrorCode::SnapshotNotFound,
3615                            format!("stream '{}' has no visible snapshot", request.stream_id),
3616                        )
3617                    })?,
3618            };
3619            let tail_offset = self
3620                .state_machine
3621                .head_at(&request.stream_id, request.now_ms)
3622                .map(|metadata| metadata.tail_offset)
3623                .unwrap_or(snapshot.offset);
3624            Ok(ReadSnapshotResponse {
3625                placement,
3626                snapshot_offset: snapshot.offset,
3627                next_offset: snapshot.offset,
3628                content_type: snapshot.content_type,
3629                payload: snapshot.payload,
3630                up_to_date: snapshot.offset == tail_offset,
3631            })
3632        })
3633    }
3634
3635    fn delete_snapshot<'a>(
3636        &'a mut self,
3637        request: DeleteSnapshotRequest,
3638        placement: ShardPlacement,
3639    ) -> GroupDeleteSnapshotFuture<'a> {
3640        Box::pin(async move {
3641            self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3642            match self
3643                .state_machine
3644                .delete_snapshot(&request.stream_id, request.snapshot_offset)
3645            {
3646                StreamResponse::Error {
3647                    code,
3648                    message,
3649                    next_offset,
3650                } => Err(GroupEngineError::stream_with_next_offset(
3651                    code,
3652                    message,
3653                    next_offset,
3654                )),
3655                other => Err(GroupEngineError::new(format!(
3656                    "unexpected delete snapshot response: {other:?}"
3657                ))),
3658            }
3659        })
3660    }
3661
3662    fn bootstrap_stream<'a>(
3663        &'a mut self,
3664        request: BootstrapStreamRequest,
3665        placement: ShardPlacement,
3666    ) -> GroupBootstrapStreamFuture<'a> {
3667        Box::pin(async move {
3668            self.ensure_stream_access(&request.stream_id, request.now_ms, true, placement)?;
3669            let plan = self
3670                .state_machine
3671                .bootstrap_plan(&request.stream_id)
3672                .map_err(stream_response_error)?;
3673            let snapshot_offset = plan.snapshot.as_ref().map(|snapshot| snapshot.offset);
3674            let snapshot_content_type = plan
3675                .snapshot
3676                .as_ref()
3677                .map(|snapshot| snapshot.content_type.clone())
3678                .unwrap_or_else(|| DEFAULT_CONTENT_TYPE.to_owned());
3679            let snapshot_payload = plan
3680                .snapshot
3681                .as_ref()
3682                .map(|snapshot| snapshot.payload.clone())
3683                .unwrap_or_default();
3684            let updates = self
3685                .bootstrap_updates(
3686                    &request.stream_id,
3687                    &plan.updates,
3688                    &plan.content_type,
3689                    request.now_ms,
3690                )
3691                .await?;
3692            Ok(BootstrapStreamResponse {
3693                placement,
3694                snapshot_offset,
3695                snapshot_content_type,
3696                snapshot_payload,
3697                updates,
3698                next_offset: plan.next_offset,
3699                up_to_date: plan.up_to_date,
3700                closed: plan.closed,
3701            })
3702        })
3703    }
3704
3705    fn touch_stream_access<'a>(
3706        &'a mut self,
3707        stream_id: BucketStreamId,
3708        now_ms: u64,
3709        renew_ttl: bool,
3710        placement: ShardPlacement,
3711    ) -> GroupTouchStreamAccessFuture<'a> {
3712        Box::pin(async move { self.apply_access_command(stream_id, now_ms, renew_ttl, placement) })
3713    }
3714
3715    fn add_fork_ref<'a>(
3716        &'a mut self,
3717        stream_id: BucketStreamId,
3718        now_ms: u64,
3719        placement: ShardPlacement,
3720    ) -> GroupForkRefFuture<'a> {
3721        Box::pin(async move {
3722            match self.apply_committed_write(
3723                GroupWriteCommand::AddForkRef { stream_id, now_ms },
3724                placement,
3725            )? {
3726                GroupWriteResponse::AddForkRef(response) => Ok(response),
3727                other => Err(GroupEngineError::new(format!(
3728                    "unexpected add fork ref write response: {other:?}"
3729                ))),
3730            }
3731        })
3732    }
3733
3734    fn release_fork_ref<'a>(
3735        &'a mut self,
3736        stream_id: BucketStreamId,
3737        placement: ShardPlacement,
3738    ) -> GroupForkRefFuture<'a> {
3739        Box::pin(async move {
3740            match self
3741                .apply_committed_write(GroupWriteCommand::ReleaseForkRef { stream_id }, placement)?
3742            {
3743                GroupWriteResponse::ReleaseForkRef(response) => Ok(response),
3744                other => Err(GroupEngineError::new(format!(
3745                    "unexpected release fork ref write response: {other:?}"
3746                ))),
3747            }
3748        })
3749    }
3750
3751    fn head_stream<'a>(
3752        &'a mut self,
3753        request: HeadStreamRequest,
3754        placement: ShardPlacement,
3755    ) -> GroupHeadStreamFuture<'a> {
3756        Box::pin(async move {
3757            self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3758            let Some(metadata) = self
3759                .state_machine
3760                .head_at(&request.stream_id, request.now_ms)
3761            else {
3762                return Err(GroupEngineError::stream(
3763                    StreamErrorCode::StreamNotFound,
3764                    format!("stream '{}' does not exist", request.stream_id),
3765                ));
3766            };
3767            Ok(HeadStreamResponse {
3768                placement,
3769                content_type: metadata.content_type.clone(),
3770                tail_offset: metadata.tail_offset,
3771                closed: metadata.status == ursula_stream::StreamStatus::Closed,
3772                stream_ttl_seconds: metadata.stream_ttl_seconds,
3773                stream_expires_at_ms: metadata.stream_expires_at_ms,
3774                snapshot_offset: self
3775                    .state_machine
3776                    .latest_snapshot(&request.stream_id)
3777                    .map_err(stream_response_error)?
3778                    .map(|snapshot| snapshot.offset),
3779            })
3780        })
3781    }
3782
3783    fn close_stream<'a>(
3784        &'a mut self,
3785        request: CloseStreamRequest,
3786        placement: ShardPlacement,
3787    ) -> GroupCloseStreamFuture<'a> {
3788        Box::pin(async move {
3789            self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3790            let command = GroupWriteCommand::from(request);
3791            match self.apply_committed_write(command, placement)? {
3792                GroupWriteResponse::CloseStream(response) => Ok(response),
3793                other => Err(GroupEngineError::new(format!(
3794                    "unexpected close stream write response: {other:?}"
3795                ))),
3796            }
3797        })
3798    }
3799
3800    fn delete_stream<'a>(
3801        &'a mut self,
3802        request: DeleteStreamRequest,
3803        placement: ShardPlacement,
3804    ) -> GroupDeleteStreamFuture<'a> {
3805        let command = GroupWriteCommand::from(request);
3806        Box::pin(async move {
3807            match self.apply_committed_write(command, placement)? {
3808                GroupWriteResponse::DeleteStream(response) => Ok(response),
3809                other => Err(GroupEngineError::new(format!(
3810                    "unexpected delete stream write response: {other:?}"
3811                ))),
3812            }
3813        })
3814    }
3815
3816    fn append<'a>(
3817        &'a mut self,
3818        request: AppendRequest,
3819        placement: ShardPlacement,
3820    ) -> GroupAppendFuture<'a> {
3821        Box::pin(async move {
3822            self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3823            let command = GroupWriteCommand::from(request);
3824            match self.apply_committed_write(command, placement)? {
3825                GroupWriteResponse::Append(response) => Ok(response),
3826                other => Err(GroupEngineError::new(format!(
3827                    "unexpected append write response: {other:?}"
3828                ))),
3829            }
3830        })
3831    }
3832
3833    fn append_with_cold_admission<'a>(
3834        &'a mut self,
3835        request: AppendRequest,
3836        placement: ShardPlacement,
3837        admission: ColdWriteAdmission,
3838    ) -> GroupAppendFuture<'a> {
3839        if !admission.is_enabled() {
3840            return self.append(request, placement);
3841        }
3842        Box::pin(async move { self.append_with_admission_inner(request, placement, admission) })
3843    }
3844
3845    fn append_external<'a>(
3846        &'a mut self,
3847        request: AppendExternalRequest,
3848        placement: ShardPlacement,
3849    ) -> GroupAppendFuture<'a> {
3850        Box::pin(async move {
3851            self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3852            let command = GroupWriteCommand::from(request);
3853            match self.apply_committed_write(command, placement)? {
3854                GroupWriteResponse::Append(response) => Ok(response),
3855                other => Err(GroupEngineError::new(format!(
3856                    "unexpected external append write response: {other:?}"
3857                ))),
3858            }
3859        })
3860    }
3861
3862    fn append_batch<'a>(
3863        &'a mut self,
3864        request: AppendBatchRequest,
3865        placement: ShardPlacement,
3866    ) -> GroupAppendBatchFuture<'a> {
3867        Box::pin(async move {
3868            self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3869            let command = GroupWriteCommand::from(request);
3870            match self.apply_committed_write(command, placement)? {
3871                GroupWriteResponse::AppendBatch(response) => Ok(response),
3872                other => Err(GroupEngineError::new(format!(
3873                    "unexpected append batch write response: {other:?}"
3874                ))),
3875            }
3876        })
3877    }
3878
3879    fn append_batch_with_cold_admission<'a>(
3880        &'a mut self,
3881        request: AppendBatchRequest,
3882        placement: ShardPlacement,
3883        admission: ColdWriteAdmission,
3884    ) -> GroupAppendBatchFuture<'a> {
3885        if !admission.is_enabled() {
3886            return self.append_batch(request, placement);
3887        }
3888        Box::pin(
3889            async move { self.append_batch_with_admission_inner(request, placement, admission) },
3890        )
3891    }
3892
3893    fn flush_cold<'a>(
3894        &'a mut self,
3895        request: FlushColdRequest,
3896        placement: ShardPlacement,
3897    ) -> GroupFlushColdFuture<'a> {
3898        let command = GroupWriteCommand::from(request);
3899        Box::pin(async move {
3900            match self.apply_committed_write(command, placement)? {
3901                GroupWriteResponse::FlushCold(response) => Ok(response),
3902                other => Err(GroupEngineError::new(format!(
3903                    "unexpected flush cold write response: {other:?}"
3904                ))),
3905            }
3906        })
3907    }
3908
3909    fn plan_cold_flush<'a>(
3910        &'a mut self,
3911        request: PlanColdFlushRequest,
3912        _placement: ShardPlacement,
3913    ) -> GroupPlanColdFlushFuture<'a> {
3914        Box::pin(async move {
3915            self.state_machine
3916                .plan_cold_flush(
3917                    &request.stream_id,
3918                    request.min_hot_bytes,
3919                    request.max_flush_bytes,
3920                )
3921                .map_err(stream_response_error)
3922        })
3923    }
3924
3925    fn plan_next_cold_flush<'a>(
3926        &'a mut self,
3927        request: PlanGroupColdFlushRequest,
3928        _placement: ShardPlacement,
3929    ) -> GroupPlanNextColdFlushFuture<'a> {
3930        Box::pin(async move {
3931            self.state_machine
3932                .plan_next_cold_flush(request.min_hot_bytes, request.max_flush_bytes)
3933                .map_err(stream_response_error)
3934        })
3935    }
3936
3937    fn plan_next_cold_flush_batch<'a>(
3938        &'a mut self,
3939        request: PlanGroupColdFlushRequest,
3940        _placement: ShardPlacement,
3941        max_candidates: usize,
3942    ) -> GroupPlanNextColdFlushBatchFuture<'a> {
3943        Box::pin(async move {
3944            self.state_machine
3945                .plan_next_cold_flush_batch(
3946                    request.min_hot_bytes,
3947                    request.max_flush_bytes,
3948                    max_candidates,
3949                )
3950                .map_err(stream_response_error)
3951        })
3952    }
3953
3954    fn cold_hot_backlog<'a>(
3955        &'a mut self,
3956        stream_id: BucketStreamId,
3957        _placement: ShardPlacement,
3958    ) -> GroupColdHotBacklogFuture<'a> {
3959        Box::pin(async move { self.cold_hot_backlog_for(stream_id) })
3960    }
3961
3962    fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
3963        Box::pin(async move { Ok(self.build_snapshot(placement)) })
3964    }
3965
3966    fn install_snapshot<'a>(
3967        &'a mut self,
3968        snapshot: GroupSnapshot,
3969    ) -> GroupInstallSnapshotFuture<'a> {
3970        Box::pin(async move { self.install_snapshot_inner(snapshot) })
3971    }
3972}
3973
3974#[derive(Debug, Clone, Default)]
3975pub struct InMemoryGroupEngineFactory {
3976    cold_store: Option<ColdStoreHandle>,
3977}
3978
3979impl InMemoryGroupEngineFactory {
3980    pub fn new() -> Self {
3981        Self::default()
3982    }
3983
3984    pub fn with_cold_store(cold_store: Option<ColdStoreHandle>) -> Self {
3985        Self { cold_store }
3986    }
3987}
3988
3989impl GroupEngineFactory for InMemoryGroupEngineFactory {
3990    fn create<'a>(
3991        &'a self,
3992        _placement: ShardPlacement,
3993        _metrics: GroupEngineMetrics,
3994    ) -> GroupEngineCreateFuture<'a> {
3995        Box::pin(async move {
3996            let engine = InMemoryGroupEngine {
3997                cold_store: self.cold_store.clone(),
3998                ..InMemoryGroupEngine::default()
3999            };
4000            let engine: Box<dyn GroupEngine> = Box::new(engine);
4001            Ok(engine)
4002        })
4003    }
4004}
4005
4006#[derive(Debug, Clone)]
4007pub struct WalGroupEngineFactory {
4008    root: PathBuf,
4009    cold_store: Option<ColdStoreHandle>,
4010}
4011
4012impl WalGroupEngineFactory {
4013    pub fn new(root: impl Into<PathBuf>) -> Self {
4014        Self {
4015            root: root.into(),
4016            cold_store: None,
4017        }
4018    }
4019
4020    pub fn with_cold_store(root: impl Into<PathBuf>, cold_store: Option<ColdStoreHandle>) -> Self {
4021        Self {
4022            root: root.into(),
4023            cold_store,
4024        }
4025    }
4026}
4027
4028impl GroupEngineFactory for WalGroupEngineFactory {
4029    fn create<'a>(
4030        &'a self,
4031        placement: ShardPlacement,
4032        metrics: GroupEngineMetrics,
4033    ) -> GroupEngineCreateFuture<'a> {
4034        Box::pin(async move {
4035            let engine: Box<dyn GroupEngine> = Box::new(WalGroupEngine::open(
4036                &self.root,
4037                placement,
4038                metrics,
4039                self.cold_store.clone(),
4040            ));
4041            Ok(engine)
4042        })
4043    }
4044}
4045
4046pub struct WalGroupEngine {
4047    inner: InMemoryGroupEngine,
4048    log_path: PathBuf,
4049    placement: ShardPlacement,
4050    metrics: GroupEngineMetrics,
4051    init_error: Option<String>,
4052}
4053
4054#[derive(Debug, Clone, Serialize, Deserialize)]
4055#[serde(tag = "wal_record", rename_all = "snake_case")]
4056enum WalRecord {
4057    Command {
4058        command: Box<GroupWriteCommand>,
4059    },
4060    Snapshot {
4061        group_commit_index: u64,
4062        stream_snapshot: StreamSnapshot,
4063        stream_append_counts: Vec<StreamAppendCount>,
4064    },
4065}
4066
4067impl WalGroupEngine {
4068    fn open(
4069        root: &Path,
4070        placement: ShardPlacement,
4071        metrics: GroupEngineMetrics,
4072        cold_store: Option<ColdStoreHandle>,
4073    ) -> Self {
4074        let log_path = group_log_path(root, placement);
4075        match replay_group_log(&log_path) {
4076            Ok(mut inner) => {
4077                inner.cold_store = cold_store;
4078                Self {
4079                    inner,
4080                    log_path,
4081                    placement,
4082                    metrics,
4083                    init_error: None,
4084                }
4085            }
4086            Err(err) => Self {
4087                inner: InMemoryGroupEngine {
4088                    cold_store,
4089                    ..InMemoryGroupEngine::default()
4090                },
4091                log_path,
4092                placement,
4093                metrics,
4094                init_error: Some(err.message().to_owned()),
4095            },
4096        }
4097    }
4098
4099    fn ensure_ready(&self) -> Result<(), GroupEngineError> {
4100        match &self.init_error {
4101            Some(message) => Err(GroupEngineError::new(message.clone())),
4102            None => Ok(()),
4103        }
4104    }
4105
4106    fn append_record(&self, command: &GroupWriteCommand) -> Result<(), GroupEngineError> {
4107        self.append_records(std::slice::from_ref(command))
4108    }
4109
4110    fn append_records(&self, commands: &[GroupWriteCommand]) -> Result<(), GroupEngineError> {
4111        if commands.is_empty() {
4112            return Ok(());
4113        }
4114        let Some(parent) = self.log_path.parent() else {
4115            return Err(GroupEngineError::new(format!(
4116                "WAL path '{}' has no parent directory",
4117                self.log_path.display()
4118            )));
4119        };
4120        fs::create_dir_all(parent).map_err(|err| {
4121            GroupEngineError::new(format!("create WAL dir '{}': {err}", parent.display()))
4122        })?;
4123        let write_started_at = Instant::now();
4124        let mut file = OpenOptions::new()
4125            .create(true)
4126            .append(true)
4127            .open(&self.log_path)
4128            .map_err(|err| {
4129                GroupEngineError::new(format!("open WAL '{}': {err}", self.log_path.display()))
4130            })?;
4131        for command in commands {
4132            let record = WalRecord::Command {
4133                command: Box::new(command.clone()),
4134            };
4135            serde_json::to_writer(&mut file, &record).map_err(|err| {
4136                GroupEngineError::new(format!("encode WAL '{}': {err}", self.log_path.display()))
4137            })?;
4138            file.write_all(b"\n").map_err(|err| {
4139                GroupEngineError::new(format!("write WAL '{}': {err}", self.log_path.display()))
4140            })?;
4141        }
4142        let write_ns = elapsed_ns(write_started_at);
4143        let sync_started_at = Instant::now();
4144        file.sync_data().map_err(|err| {
4145            GroupEngineError::new(format!("sync WAL '{}': {err}", self.log_path.display()))
4146        })?;
4147        self.metrics.record_wal_batch(
4148            self.placement,
4149            commands.len(),
4150            write_ns,
4151            elapsed_ns(sync_started_at),
4152        );
4153        Ok(())
4154    }
4155
4156    fn append_snapshot_record(&self, snapshot: &GroupSnapshot) -> Result<(), GroupEngineError> {
4157        let record = WalRecord::Snapshot {
4158            group_commit_index: snapshot.group_commit_index,
4159            stream_snapshot: snapshot.stream_snapshot.clone(),
4160            stream_append_counts: snapshot.stream_append_counts.clone(),
4161        };
4162        let Some(parent) = self.log_path.parent() else {
4163            return Err(GroupEngineError::new(format!(
4164                "WAL path '{}' has no parent directory",
4165                self.log_path.display()
4166            )));
4167        };
4168        fs::create_dir_all(parent).map_err(|err| {
4169            GroupEngineError::new(format!("create WAL dir '{}': {err}", parent.display()))
4170        })?;
4171        let write_started_at = Instant::now();
4172        let mut file = OpenOptions::new()
4173            .create(true)
4174            .append(true)
4175            .open(&self.log_path)
4176            .map_err(|err| {
4177                GroupEngineError::new(format!("open WAL '{}': {err}", self.log_path.display()))
4178            })?;
4179        serde_json::to_writer(&mut file, &record).map_err(|err| {
4180            GroupEngineError::new(format!("encode WAL '{}': {err}", self.log_path.display()))
4181        })?;
4182        file.write_all(b"\n").map_err(|err| {
4183            GroupEngineError::new(format!("write WAL '{}': {err}", self.log_path.display()))
4184        })?;
4185        let write_ns = elapsed_ns(write_started_at);
4186        let sync_started_at = Instant::now();
4187        file.sync_data().map_err(|err| {
4188            GroupEngineError::new(format!("sync WAL '{}': {err}", self.log_path.display()))
4189        })?;
4190        self.metrics
4191            .record_wal_batch(self.placement, 1, write_ns, elapsed_ns(sync_started_at));
4192        Ok(())
4193    }
4194
4195    fn commit_access_if_needed(
4196        &mut self,
4197        stream_id: &BucketStreamId,
4198        now_ms: u64,
4199        renew_ttl: bool,
4200        placement: ShardPlacement,
4201    ) -> Result<Option<TouchStreamAccessResponse>, GroupEngineError> {
4202        if !self
4203            .inner
4204            .access_requires_write(stream_id, now_ms, renew_ttl)?
4205        {
4206            return Ok(None);
4207        }
4208        let command = GroupWriteCommand::TouchStreamAccess {
4209            stream_id: stream_id.clone(),
4210            now_ms,
4211            renew_ttl,
4212        };
4213        let mut preview = self.inner.clone();
4214        let response = match preview.apply_committed_write(command.clone(), placement)? {
4215            GroupWriteResponse::TouchStreamAccess(response) => response,
4216            other => {
4217                return Err(GroupEngineError::new(format!(
4218                    "unexpected touch stream access write response: {other:?}"
4219                )));
4220            }
4221        };
4222        if response.changed || response.expired {
4223            self.append_record(&command)?;
4224        }
4225        self.inner = preview;
4226        if response.expired {
4227            return Err(GroupEngineError::stream(
4228                StreamErrorCode::StreamNotFound,
4229                format!("stream '{stream_id}' does not exist"),
4230            ));
4231        }
4232        Ok(Some(response))
4233    }
4234}
4235
4236impl GroupEngine for WalGroupEngine {
4237    fn create_stream<'a>(
4238        &'a mut self,
4239        request: CreateStreamRequest,
4240        placement: ShardPlacement,
4241    ) -> GroupCreateStreamFuture<'a> {
4242        Box::pin(async move {
4243            self.ensure_ready()?;
4244            let command = GroupWriteCommand::from(request);
4245            let mut preview = self.inner.clone();
4246            let response = match preview.apply_committed_write(command.clone(), placement)? {
4247                GroupWriteResponse::CreateStream(response) => response,
4248                other => {
4249                    return Err(GroupEngineError::new(format!(
4250                        "unexpected create stream write response: {other:?}"
4251                    )));
4252                }
4253            };
4254            if !response.already_exists {
4255                self.append_record(&command)?;
4256            }
4257            self.inner = preview;
4258            Ok(response)
4259        })
4260    }
4261
4262    fn create_stream_with_cold_admission<'a>(
4263        &'a mut self,
4264        request: CreateStreamRequest,
4265        placement: ShardPlacement,
4266        admission: ColdWriteAdmission,
4267    ) -> GroupCreateStreamFuture<'a> {
4268        if !admission.is_enabled() {
4269            return self.create_stream(request, placement);
4270        }
4271        Box::pin(async move {
4272            self.ensure_ready()?;
4273            let command = GroupWriteCommand::from(request.clone());
4274            let mut preview = self.inner.clone();
4275            let response =
4276                preview.create_stream_with_admission_inner(request, placement, admission)?;
4277            if !response.already_exists {
4278                self.append_record(&command)?;
4279            }
4280            self.inner = preview;
4281            Ok(response)
4282        })
4283    }
4284
4285    fn head_stream<'a>(
4286        &'a mut self,
4287        request: HeadStreamRequest,
4288        placement: ShardPlacement,
4289    ) -> GroupHeadStreamFuture<'a> {
4290        Box::pin(async move {
4291            self.ensure_ready()?;
4292            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4293            self.inner.head_stream(request, placement).await
4294        })
4295    }
4296
4297    fn read_stream<'a>(
4298        &'a mut self,
4299        request: ReadStreamRequest,
4300        placement: ShardPlacement,
4301    ) -> GroupReadStreamFuture<'a> {
4302        Box::pin(async move {
4303            self.ensure_ready()?;
4304            self.commit_access_if_needed(&request.stream_id, request.now_ms, true, placement)?;
4305            self.inner.read_stream(request, placement).await
4306        })
4307    }
4308
4309    fn publish_snapshot<'a>(
4310        &'a mut self,
4311        request: PublishSnapshotRequest,
4312        placement: ShardPlacement,
4313    ) -> GroupPublishSnapshotFuture<'a> {
4314        Box::pin(async move {
4315            self.ensure_ready()?;
4316            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4317            let command = GroupWriteCommand::from(request);
4318            let mut preview = self.inner.clone();
4319            let response = match preview.apply_committed_write(command.clone(), placement)? {
4320                GroupWriteResponse::PublishSnapshot(response) => response,
4321                other => {
4322                    return Err(GroupEngineError::new(format!(
4323                        "unexpected publish snapshot write response: {other:?}"
4324                    )));
4325                }
4326            };
4327            self.append_record(&command)?;
4328            self.inner = preview;
4329            Ok(response)
4330        })
4331    }
4332
4333    fn read_snapshot<'a>(
4334        &'a mut self,
4335        request: ReadSnapshotRequest,
4336        placement: ShardPlacement,
4337    ) -> GroupReadSnapshotFuture<'a> {
4338        Box::pin(async move {
4339            self.ensure_ready()?;
4340            self.commit_access_if_needed(&request.stream_id, request.now_ms, true, placement)?;
4341            self.inner.read_snapshot(request, placement).await
4342        })
4343    }
4344
4345    fn delete_snapshot<'a>(
4346        &'a mut self,
4347        request: DeleteSnapshotRequest,
4348        placement: ShardPlacement,
4349    ) -> GroupDeleteSnapshotFuture<'a> {
4350        Box::pin(async move {
4351            self.ensure_ready()?;
4352            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4353            self.inner.delete_snapshot(request, placement).await
4354        })
4355    }
4356
4357    fn bootstrap_stream<'a>(
4358        &'a mut self,
4359        request: BootstrapStreamRequest,
4360        placement: ShardPlacement,
4361    ) -> GroupBootstrapStreamFuture<'a> {
4362        Box::pin(async move {
4363            self.ensure_ready()?;
4364            self.commit_access_if_needed(&request.stream_id, request.now_ms, true, placement)?;
4365            self.inner.bootstrap_stream(request, placement).await
4366        })
4367    }
4368
4369    fn touch_stream_access<'a>(
4370        &'a mut self,
4371        stream_id: BucketStreamId,
4372        now_ms: u64,
4373        renew_ttl: bool,
4374        placement: ShardPlacement,
4375    ) -> GroupTouchStreamAccessFuture<'a> {
4376        Box::pin(async move {
4377            self.ensure_ready()?;
4378            let command = GroupWriteCommand::TouchStreamAccess {
4379                stream_id,
4380                now_ms,
4381                renew_ttl,
4382            };
4383            let mut preview = self.inner.clone();
4384            let response = match preview.apply_committed_write(command.clone(), placement)? {
4385                GroupWriteResponse::TouchStreamAccess(response) => response,
4386                other => {
4387                    return Err(GroupEngineError::new(format!(
4388                        "unexpected touch stream access write response: {other:?}"
4389                    )));
4390                }
4391            };
4392            if response.changed || response.expired {
4393                self.append_record(&command)?;
4394            }
4395            self.inner = preview;
4396            Ok(response)
4397        })
4398    }
4399
4400    fn add_fork_ref<'a>(
4401        &'a mut self,
4402        stream_id: BucketStreamId,
4403        now_ms: u64,
4404        placement: ShardPlacement,
4405    ) -> GroupForkRefFuture<'a> {
4406        Box::pin(async move {
4407            self.ensure_ready()?;
4408            let command = GroupWriteCommand::AddForkRef { stream_id, now_ms };
4409            let mut preview = self.inner.clone();
4410            let response = match preview.apply_committed_write(command.clone(), placement)? {
4411                GroupWriteResponse::AddForkRef(response) => response,
4412                other => {
4413                    return Err(GroupEngineError::new(format!(
4414                        "unexpected add fork ref write response: {other:?}"
4415                    )));
4416                }
4417            };
4418            self.append_record(&command)?;
4419            self.inner = preview;
4420            Ok(response)
4421        })
4422    }
4423
4424    fn release_fork_ref<'a>(
4425        &'a mut self,
4426        stream_id: BucketStreamId,
4427        placement: ShardPlacement,
4428    ) -> GroupForkRefFuture<'a> {
4429        Box::pin(async move {
4430            self.ensure_ready()?;
4431            let command = GroupWriteCommand::ReleaseForkRef { stream_id };
4432            let mut preview = self.inner.clone();
4433            let response = match preview.apply_committed_write(command.clone(), placement)? {
4434                GroupWriteResponse::ReleaseForkRef(response) => response,
4435                other => {
4436                    return Err(GroupEngineError::new(format!(
4437                        "unexpected release fork ref write response: {other:?}"
4438                    )));
4439                }
4440            };
4441            self.append_record(&command)?;
4442            self.inner = preview;
4443            Ok(response)
4444        })
4445    }
4446
4447    fn close_stream<'a>(
4448        &'a mut self,
4449        request: CloseStreamRequest,
4450        placement: ShardPlacement,
4451    ) -> GroupCloseStreamFuture<'a> {
4452        Box::pin(async move {
4453            self.ensure_ready()?;
4454            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4455            let command = GroupWriteCommand::from(request);
4456            let mut preview = self.inner.clone();
4457            let response = match preview.apply_committed_write(command.clone(), placement)? {
4458                GroupWriteResponse::CloseStream(response) => response,
4459                other => {
4460                    return Err(GroupEngineError::new(format!(
4461                        "unexpected close stream write response: {other:?}"
4462                    )));
4463                }
4464            };
4465            self.append_record(&command)?;
4466            self.inner = preview;
4467            Ok(response)
4468        })
4469    }
4470
4471    fn delete_stream<'a>(
4472        &'a mut self,
4473        request: DeleteStreamRequest,
4474        placement: ShardPlacement,
4475    ) -> GroupDeleteStreamFuture<'a> {
4476        Box::pin(async move {
4477            self.ensure_ready()?;
4478            let command = GroupWriteCommand::from(request);
4479            let mut preview = self.inner.clone();
4480            let response = match preview.apply_committed_write(command.clone(), placement)? {
4481                GroupWriteResponse::DeleteStream(response) => response,
4482                other => {
4483                    return Err(GroupEngineError::new(format!(
4484                        "unexpected delete stream write response: {other:?}"
4485                    )));
4486                }
4487            };
4488            self.append_record(&command)?;
4489            self.inner = preview;
4490            Ok(response)
4491        })
4492    }
4493
4494    fn append<'a>(
4495        &'a mut self,
4496        request: AppendRequest,
4497        placement: ShardPlacement,
4498    ) -> GroupAppendFuture<'a> {
4499        Box::pin(async move {
4500            self.ensure_ready()?;
4501            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4502            let command = GroupWriteCommand::from(request);
4503            let mut preview = self.inner.clone();
4504            let response = match preview.apply_committed_write(command.clone(), placement)? {
4505                GroupWriteResponse::Append(response) => response,
4506                other => {
4507                    return Err(GroupEngineError::new(format!(
4508                        "unexpected append write response: {other:?}"
4509                    )));
4510                }
4511            };
4512            self.append_record(&command)?;
4513            self.inner = preview;
4514            Ok(response)
4515        })
4516    }
4517
4518    fn append_with_cold_admission<'a>(
4519        &'a mut self,
4520        request: AppendRequest,
4521        placement: ShardPlacement,
4522        admission: ColdWriteAdmission,
4523    ) -> GroupAppendFuture<'a> {
4524        if !admission.is_enabled() {
4525            return self.append(request, placement);
4526        }
4527        Box::pin(async move {
4528            self.ensure_ready()?;
4529            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4530            let command = GroupWriteCommand::from(request.clone());
4531            let mut preview = self.inner.clone();
4532            let response = preview.append_with_admission_inner(request, placement, admission)?;
4533            if !response.deduplicated {
4534                self.append_record(&command)?;
4535            }
4536            self.inner = preview;
4537            Ok(response)
4538        })
4539    }
4540
4541    fn append_batch<'a>(
4542        &'a mut self,
4543        request: AppendBatchRequest,
4544        placement: ShardPlacement,
4545    ) -> GroupAppendBatchFuture<'a> {
4546        Box::pin(async move {
4547            self.ensure_ready()?;
4548            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4549            let command = GroupWriteCommand::from(request);
4550            let mut preview = self.inner.clone();
4551            let response = match preview.apply_committed_write(command.clone(), placement)? {
4552                GroupWriteResponse::AppendBatch(response) => response,
4553                other => {
4554                    return Err(GroupEngineError::new(format!(
4555                        "unexpected append batch write response: {other:?}"
4556                    )));
4557                }
4558            };
4559            if response
4560                .items
4561                .iter()
4562                .any(|item| matches!(item, Ok(response) if !response.deduplicated))
4563            {
4564                self.append_record(&command)?;
4565            }
4566            self.inner = preview;
4567            Ok(response)
4568        })
4569    }
4570
4571    fn append_batch_with_cold_admission<'a>(
4572        &'a mut self,
4573        request: AppendBatchRequest,
4574        placement: ShardPlacement,
4575        admission: ColdWriteAdmission,
4576    ) -> GroupAppendBatchFuture<'a> {
4577        if !admission.is_enabled() {
4578            return self.append_batch(request, placement);
4579        }
4580        Box::pin(async move {
4581            self.ensure_ready()?;
4582            self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4583            let command = GroupWriteCommand::from(request.clone());
4584            let mut preview = self.inner.clone();
4585            let response =
4586                preview.append_batch_with_admission_inner(request, placement, admission)?;
4587            if response
4588                .items
4589                .iter()
4590                .any(|item| matches!(item, Ok(response) if !response.deduplicated))
4591            {
4592                self.append_record(&command)?;
4593            }
4594            self.inner = preview;
4595            Ok(response)
4596        })
4597    }
4598
4599    fn flush_cold<'a>(
4600        &'a mut self,
4601        request: FlushColdRequest,
4602        placement: ShardPlacement,
4603    ) -> GroupFlushColdFuture<'a> {
4604        Box::pin(async move {
4605            self.ensure_ready()?;
4606            let command = GroupWriteCommand::from(request);
4607            let mut preview = self.inner.clone();
4608            let response = match preview.apply_committed_write(command.clone(), placement)? {
4609                GroupWriteResponse::FlushCold(response) => response,
4610                other => {
4611                    return Err(GroupEngineError::new(format!(
4612                        "unexpected flush cold write response: {other:?}"
4613                    )));
4614                }
4615            };
4616            self.append_record(&command)?;
4617            self.inner = preview;
4618            Ok(response)
4619        })
4620    }
4621
4622    fn plan_cold_flush<'a>(
4623        &'a mut self,
4624        request: PlanColdFlushRequest,
4625        placement: ShardPlacement,
4626    ) -> GroupPlanColdFlushFuture<'a> {
4627        Box::pin(async move {
4628            self.ensure_ready()?;
4629            self.inner.plan_cold_flush(request, placement).await
4630        })
4631    }
4632
4633    fn plan_next_cold_flush<'a>(
4634        &'a mut self,
4635        request: PlanGroupColdFlushRequest,
4636        placement: ShardPlacement,
4637    ) -> GroupPlanNextColdFlushFuture<'a> {
4638        Box::pin(async move {
4639            self.ensure_ready()?;
4640            self.inner.plan_next_cold_flush(request, placement).await
4641        })
4642    }
4643
4644    fn plan_next_cold_flush_batch<'a>(
4645        &'a mut self,
4646        request: PlanGroupColdFlushRequest,
4647        placement: ShardPlacement,
4648        max_candidates: usize,
4649    ) -> GroupPlanNextColdFlushBatchFuture<'a> {
4650        Box::pin(async move {
4651            self.ensure_ready()?;
4652            self.inner
4653                .plan_next_cold_flush_batch(request, placement, max_candidates)
4654                .await
4655        })
4656    }
4657
4658    fn cold_hot_backlog<'a>(
4659        &'a mut self,
4660        stream_id: BucketStreamId,
4661        placement: ShardPlacement,
4662    ) -> GroupColdHotBacklogFuture<'a> {
4663        Box::pin(async move {
4664            self.ensure_ready()?;
4665            self.inner.cold_hot_backlog(stream_id, placement).await
4666        })
4667    }
4668
4669    fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
4670        Box::pin(async move {
4671            self.ensure_ready()?;
4672            self.inner.snapshot(placement).await
4673        })
4674    }
4675
4676    fn install_snapshot<'a>(
4677        &'a mut self,
4678        snapshot: GroupSnapshot,
4679    ) -> GroupInstallSnapshotFuture<'a> {
4680        Box::pin(async move {
4681            self.ensure_ready()?;
4682            let mut preview = self.inner.clone();
4683            preview.install_snapshot(snapshot.clone()).await?;
4684            self.append_snapshot_record(&snapshot)?;
4685            self.inner = preview;
4686            Ok(())
4687        })
4688    }
4689}
4690
4691fn group_log_path(root: &Path, placement: ShardPlacement) -> PathBuf {
4692    root.join(format!("core-{}", placement.core_id.0))
4693        .join(format!("group-{}.jsonl", placement.raft_group_id.0))
4694}
4695
4696fn replay_group_log(log_path: &Path) -> Result<InMemoryGroupEngine, GroupEngineError> {
4697    if !log_path.exists() {
4698        return Ok(InMemoryGroupEngine::default());
4699    }
4700
4701    let file = File::open(log_path).map_err(|err| {
4702        GroupEngineError::new(format!("open WAL '{}': {err}", log_path.display()))
4703    })?;
4704    let reader = BufReader::new(file);
4705    let mut inner = InMemoryGroupEngine::default();
4706    for (line_index, line) in reader.lines().enumerate() {
4707        let line = line.map_err(|err| {
4708            GroupEngineError::new(format!(
4709                "read WAL '{}' line {}: {err}",
4710                log_path.display(),
4711                line_index + 1
4712            ))
4713        })?;
4714        if line.trim().is_empty() {
4715            continue;
4716        }
4717        if let Ok(record) = serde_json::from_str::<WalRecord>(&line) {
4718            match record {
4719                WalRecord::Command { command } => inner
4720                    .apply_replayed_write_command(*command)
4721                    .map_err(|err| {
4722                        GroupEngineError::new(format!(
4723                            "replay WAL command '{}' line {}: {err}",
4724                            log_path.display(),
4725                            line_index + 1
4726                        ))
4727                    })?,
4728                WalRecord::Snapshot {
4729                    group_commit_index,
4730                    stream_snapshot,
4731                    stream_append_counts,
4732                } => inner
4733                    .install_snapshot_parts(
4734                        group_commit_index,
4735                        stream_snapshot,
4736                        stream_append_counts,
4737                    )
4738                    .map_err(|err| {
4739                        GroupEngineError::new(format!(
4740                            "replay WAL snapshot '{}' line {}: {err}",
4741                            log_path.display(),
4742                            line_index + 1
4743                        ))
4744                    })?,
4745            }
4746            continue;
4747        }
4748
4749        let command = serde_json::from_str::<StreamCommand>(&line).map_err(|err| {
4750            GroupEngineError::new(format!(
4751                "decode WAL '{}' line {}: {err}",
4752                log_path.display(),
4753                line_index + 1
4754            ))
4755        })?;
4756        inner.apply_replayed_command(command).map_err(|err| {
4757            GroupEngineError::new(format!(
4758                "replay WAL '{}' line {}: {err}",
4759                log_path.display(),
4760                line_index + 1
4761            ))
4762        })?;
4763    }
4764    Ok(inner)
4765}
4766
4767fn ensure_bucket_exists(
4768    state_machine: &mut StreamStateMachine,
4769    stream_id: &BucketStreamId,
4770) -> Result<(), GroupEngineError> {
4771    if state_machine.bucket_exists(&stream_id.bucket_id) {
4772        return Ok(());
4773    }
4774
4775    match state_machine.apply(StreamCommand::CreateBucket {
4776        bucket_id: stream_id.bucket_id.clone(),
4777    }) {
4778        StreamResponse::BucketCreated { .. } | StreamResponse::BucketAlreadyExists { .. } => Ok(()),
4779        StreamResponse::Error {
4780            code,
4781            message,
4782            next_offset,
4783        } => Err(GroupEngineError::stream_with_next_offset(
4784            code,
4785            message,
4786            next_offset,
4787        )),
4788        other => Err(GroupEngineError::new(format!(
4789            "unexpected create bucket response: {other:?}"
4790        ))),
4791    }
4792}
4793
4794fn stream_response_error(response: StreamResponse) -> GroupEngineError {
4795    match response {
4796        StreamResponse::Error {
4797            code,
4798            message,
4799            next_offset,
4800        } => GroupEngineError::stream_with_next_offset(code, message, next_offset),
4801        other => GroupEngineError::new(format!("unexpected stream response error: {other:?}")),
4802    }
4803}
4804
4805fn restore_stream_append_counts(
4806    counts: Vec<StreamAppendCount>,
4807    snapshot_stream_ids: &HashSet<BucketStreamId>,
4808) -> Result<HashMap<BucketStreamId, u64>, GroupEngineError> {
4809    let mut restored = HashMap::with_capacity(counts.len());
4810    for count in counts {
4811        if !snapshot_stream_ids.contains(&count.stream_id) {
4812            return Err(GroupEngineError::new(format!(
4813                "append count references missing snapshot stream '{}'",
4814                count.stream_id
4815            )));
4816        }
4817        if restored
4818            .insert(count.stream_id.clone(), count.append_count)
4819            .is_some()
4820        {
4821            return Err(GroupEngineError::new(format!(
4822                "snapshot contains duplicate append count for stream '{}'",
4823                count.stream_id
4824            )));
4825        }
4826    }
4827    Ok(restored)
4828}
4829
4830fn compare_stream_ids(left: &BucketStreamId, right: &BucketStreamId) -> std::cmp::Ordering {
4831    left.bucket_id
4832        .cmp(&right.bucket_id)
4833        .then_with(|| left.stream_id.cmp(&right.stream_id))
4834}
4835
4836#[derive(Debug, Clone)]
4837pub struct RuntimeConfig {
4838    pub core_count: usize,
4839    pub raft_group_count: usize,
4840    pub mailbox_capacity: usize,
4841    pub threading: RuntimeThreading,
4842    pub cold_max_hot_bytes_per_group: Option<u64>,
4843    pub live_read_max_waiters_per_core: Option<u64>,
4844}
4845
4846impl RuntimeConfig {
4847    pub fn new(core_count: usize, raft_group_count: usize) -> Self {
4848        Self {
4849            core_count,
4850            raft_group_count,
4851            mailbox_capacity: 1024,
4852            threading: RuntimeThreading::ThreadPerCore,
4853            cold_max_hot_bytes_per_group: None,
4854            live_read_max_waiters_per_core: Some(65_536),
4855        }
4856    }
4857
4858    pub fn with_cold_max_hot_bytes_per_group(mut self, value: Option<u64>) -> Self {
4859        self.cold_max_hot_bytes_per_group = value;
4860        self
4861    }
4862
4863    pub fn with_live_read_max_waiters_per_core(mut self, value: Option<u64>) -> Self {
4864        self.live_read_max_waiters_per_core = value;
4865        self
4866    }
4867}
4868
/// Selects how core workers are spawned; the value is passed to
/// `spawn_core_worker` for every core.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RuntimeThreading {
    /// Default used by `RuntimeConfig::new`.
    // NOTE(review): presumably one dedicated OS thread per core worker -- confirm
    // against `spawn_core_worker`.
    ThreadPerCore,
    // NOTE(review): presumably runs workers as tasks on a caller-provided Tokio
    // runtime -- confirm against `spawn_core_worker`.
    HostedTokio,
}
4874
/// Cloneable handle to the spawned core workers.
#[derive(Debug, Clone)]
pub struct ShardRuntime {
    /// Maps a stream id to its placement (core and raft group).
    shard_map: StaticShardMap,
    /// One mailbox per core, pushed in core-id order and indexed by core id.
    mailboxes: Vec<CoreMailbox>,
    /// Metrics shared with every core worker.
    metrics: Arc<RuntimeMetricsInner>,
    /// Shared counter, initialised to 1 when the runtime is spawned.
    next_waiter_id: Arc<AtomicU64>,
    /// Optional cold store handle supplied at spawn time.
    cold_store: Option<ColdStoreHandle>,
}
4883
4884impl ShardRuntime {
4885    pub fn spawn(config: RuntimeConfig) -> Result<Self, RuntimeError> {
4886        Self::spawn_with_engine_factory(config, InMemoryGroupEngineFactory::default())
4887    }
4888
4889    pub fn spawn_with_engine_factory(
4890        config: RuntimeConfig,
4891        engine_factory: impl GroupEngineFactory,
4892    ) -> Result<Self, RuntimeError> {
4893        Self::spawn_with_engine_factory_and_cold_store(config, engine_factory, None)
4894    }
4895
4896    pub fn spawn_with_engine_factory_and_cold_store(
4897        config: RuntimeConfig,
4898        engine_factory: impl GroupEngineFactory,
4899        cold_store: Option<ColdStoreHandle>,
4900    ) -> Result<Self, RuntimeError> {
4901        let shard_map = StaticShardMap::new(config.core_count, config.raft_group_count)?;
4902        let metrics = Arc::new(RuntimeMetricsInner::new(
4903            usize::from(shard_map.core_count()),
4904            usize::try_from(shard_map.raft_group_count()).expect("u32 fits usize"),
4905        ));
4906        let cold_write_admission = ColdWriteAdmission {
4907            max_hot_bytes_per_group: config.cold_max_hot_bytes_per_group,
4908        };
4909        let engine_factory: Arc<dyn GroupEngineFactory> = Arc::new(engine_factory);
4910        let read_materialization = Arc::new(Semaphore::new(config.mailbox_capacity.max(1)));
4911        let mut mailboxes = Vec::with_capacity(usize::from(shard_map.core_count()));
4912        for raw_core_id in 0..shard_map.core_count() {
4913            let core_id = CoreId(raw_core_id);
4914            let (tx, rx) = mpsc::channel(config.mailbox_capacity.max(1));
4915            let worker = CoreWorker {
4916                core_id,
4917                rx,
4918                engine_factory: engine_factory.clone(),
4919                groups: HashMap::new(),
4920                metrics: metrics.clone(),
4921                group_mailbox_capacity: config.mailbox_capacity.max(1),
4922                cold_write_admission,
4923                live_read_max_waiters_per_core: config.live_read_max_waiters_per_core,
4924                read_materialization: read_materialization.clone(),
4925            };
4926            spawn_core_worker(config.threading, worker)?;
4927            mailboxes.push(CoreMailbox { core_id, tx });
4928        }
4929        Ok(Self {
4930            shard_map,
4931            mailboxes,
4932            metrics,
4933            next_waiter_id: Arc::new(AtomicU64::new(1)),
4934            cold_store,
4935        })
4936    }
4937
4938    pub fn locate(&self, stream_id: &BucketStreamId) -> ShardPlacement {
4939        self.shard_map.locate(stream_id)
4940    }
4941
4942    pub fn has_cold_store(&self) -> bool {
4943        self.cold_store.is_some()
4944    }
4945
4946    pub fn cold_store(&self) -> Option<ColdStoreHandle> {
4947        self.cold_store.clone()
4948    }
4949
4950    pub async fn create_stream(
4951        &self,
4952        request: CreateStreamRequest,
4953    ) -> Result<CreateStreamResponse, RuntimeError> {
4954        if request.forked_from.is_some() {
4955            return self.create_fork_stream(request).await;
4956        }
4957        self.create_stream_on_owner(request).await
4958    }
4959
4960    pub async fn create_stream_external(
4961        &self,
4962        request: CreateStreamExternalRequest,
4963    ) -> Result<CreateStreamResponse, RuntimeError> {
4964        let placement = self.shard_map.locate(&request.stream_id);
4965        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
4966        let (response_tx, response_rx) = oneshot::channel();
4967        self.send_core_command(
4968            mailbox,
4969            CoreCommand::CreateExternal {
4970                request,
4971                placement,
4972                response_tx,
4973            },
4974            response_rx,
4975        )
4976        .await
4977    }
4978
4979    async fn create_stream_on_owner(
4980        &self,
4981        request: CreateStreamRequest,
4982    ) -> Result<CreateStreamResponse, RuntimeError> {
4983        let placement = self.shard_map.locate(&request.stream_id);
4984        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
4985        let (response_tx, response_rx) = oneshot::channel();
4986        self.send_core_command(
4987            mailbox,
4988            CoreCommand::CreateStream {
4989                request,
4990                placement,
4991                response_tx,
4992            },
4993            response_rx,
4994        )
4995        .await
4996    }
4997
4998    async fn create_fork_stream(
4999        &self,
5000        mut request: CreateStreamRequest,
5001    ) -> Result<CreateStreamResponse, RuntimeError> {
5002        let source_id = request
5003            .forked_from
5004            .clone()
5005            .expect("forked_from checked before create_fork_stream");
5006        let now_ms = request.now_ms;
5007        let source_placement = self.shard_map.locate(&source_id);
5008        let source_head = self
5009            .head_stream(HeadStreamRequest {
5010                stream_id: source_id.clone(),
5011                now_ms,
5012            })
5013            .await
5014            .map_err(|err| map_fork_source_ref_error(err, source_placement))?;
5015
5016        if request.content_type_explicit {
5017            if request.content_type != source_head.content_type {
5018                return Err(RuntimeError::group_engine(
5019                    source_placement,
5020                    GroupEngineError::stream(
5021                        StreamErrorCode::ContentTypeMismatch,
5022                        format!(
5023                            "fork content type '{}' does not match source content type '{}'",
5024                            request.content_type, source_head.content_type
5025                        ),
5026                    ),
5027                ));
5028            }
5029        } else {
5030            request.content_type.clone_from(&source_head.content_type);
5031        }
5032
5033        let fork_offset = request.fork_offset.unwrap_or(source_head.tail_offset);
5034        if fork_offset > source_head.tail_offset {
5035            return Err(RuntimeError::group_engine(
5036                source_placement,
5037                GroupEngineError::stream(
5038                    StreamErrorCode::InvalidFork,
5039                    format!(
5040                        "fork offset {fork_offset} is beyond source stream '{}' tail {}",
5041                        source_id, source_head.tail_offset
5042                    ),
5043                ),
5044            ));
5045        }
5046
5047        let max_len = usize::try_from(fork_offset).map_err(|_| {
5048            RuntimeError::group_engine(
5049                source_placement,
5050                GroupEngineError::stream(
5051                    StreamErrorCode::InvalidFork,
5052                    format!("fork offset {fork_offset} cannot fit in memory on this host"),
5053                ),
5054            )
5055        })?;
5056        request.initial_payload = if fork_offset == 0 {
5057            Bytes::new()
5058        } else {
5059            self.read_stream(ReadStreamRequest {
5060                stream_id: source_id.clone(),
5061                offset: 0,
5062                max_len,
5063                now_ms,
5064            })
5065            .await?
5066            .payload
5067            .into()
5068        };
5069        self.add_fork_ref_on_owner(source_id.clone(), now_ms)
5070            .await
5071            .map_err(|err| map_fork_source_ref_error(err, source_placement))?;
5072        request.close_after = false;
5073        request.stream_seq = None;
5074        request.producer = None;
5075        if request.stream_ttl_seconds.is_none() && request.stream_expires_at_ms.is_none() {
5076            request.stream_ttl_seconds = source_head.stream_ttl_seconds;
5077            request.stream_expires_at_ms = source_head.stream_expires_at_ms;
5078        }
5079        request.fork_offset = Some(fork_offset);
5080        match self.create_stream_on_owner(request).await {
5081            Ok(response) if response.already_exists => {
5082                self.release_fork_ref_cascade(source_id).await?;
5083                Ok(response)
5084            }
5085            Ok(response) => Ok(response),
5086            Err(err) => {
5087                let _ = self.release_fork_ref_cascade(source_id).await;
5088                Err(err)
5089            }
5090        }
5091    }
5092
5093    pub async fn head_stream(
5094        &self,
5095        request: HeadStreamRequest,
5096    ) -> Result<HeadStreamResponse, RuntimeError> {
5097        let placement = self.shard_map.locate(&request.stream_id);
5098        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5099        let (response_tx, response_rx) = oneshot::channel();
5100        self.send_core_command(
5101            mailbox,
5102            CoreCommand::HeadStream {
5103                request,
5104                placement,
5105                response_tx,
5106            },
5107            response_rx,
5108        )
5109        .await
5110    }
5111
5112    pub async fn read_stream(
5113        &self,
5114        request: ReadStreamRequest,
5115    ) -> Result<ReadStreamResponse, RuntimeError> {
5116        let placement = self.shard_map.locate(&request.stream_id);
5117        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5118        let (response_tx, response_rx) = oneshot::channel();
5119        self.send_core_command(
5120            mailbox,
5121            CoreCommand::ReadStream {
5122                request,
5123                placement,
5124                response_tx,
5125            },
5126            response_rx,
5127        )
5128        .await
5129    }
5130
5131    pub async fn publish_snapshot(
5132        &self,
5133        request: PublishSnapshotRequest,
5134    ) -> Result<PublishSnapshotResponse, RuntimeError> {
5135        let placement = self.shard_map.locate(&request.stream_id);
5136        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5137        let (response_tx, response_rx) = oneshot::channel();
5138        self.send_core_command(
5139            mailbox,
5140            CoreCommand::PublishSnapshot {
5141                request,
5142                placement,
5143                response_tx,
5144            },
5145            response_rx,
5146        )
5147        .await
5148    }
5149
5150    pub async fn read_snapshot(
5151        &self,
5152        request: ReadSnapshotRequest,
5153    ) -> Result<ReadSnapshotResponse, RuntimeError> {
5154        let placement = self.shard_map.locate(&request.stream_id);
5155        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5156        let (response_tx, response_rx) = oneshot::channel();
5157        self.send_core_command(
5158            mailbox,
5159            CoreCommand::ReadSnapshot {
5160                request,
5161                placement,
5162                response_tx,
5163            },
5164            response_rx,
5165        )
5166        .await
5167    }
5168
5169    pub async fn delete_snapshot(
5170        &self,
5171        request: DeleteSnapshotRequest,
5172    ) -> Result<(), RuntimeError> {
5173        let placement = self.shard_map.locate(&request.stream_id);
5174        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5175        let (response_tx, response_rx) = oneshot::channel();
5176        self.send_core_command(
5177            mailbox,
5178            CoreCommand::DeleteSnapshot {
5179                request,
5180                placement,
5181                response_tx,
5182            },
5183            response_rx,
5184        )
5185        .await
5186    }
5187
5188    pub async fn bootstrap_stream(
5189        &self,
5190        request: BootstrapStreamRequest,
5191    ) -> Result<BootstrapStreamResponse, RuntimeError> {
5192        let placement = self.shard_map.locate(&request.stream_id);
5193        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5194        let (response_tx, response_rx) = oneshot::channel();
5195        self.send_core_command(
5196            mailbox,
5197            CoreCommand::BootstrapStream {
5198                request,
5199                placement,
5200                response_tx,
5201            },
5202            response_rx,
5203        )
5204        .await
5205    }
5206
5207    pub async fn wait_read_stream(
5208        &self,
5209        request: ReadStreamRequest,
5210    ) -> Result<ReadStreamResponse, RuntimeError> {
5211        let placement = self.shard_map.locate(&request.stream_id);
5212        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5213        let waiter_id = self.next_waiter_id.fetch_add(1, Ordering::Relaxed);
5214        let stream_id = request.stream_id.clone();
5215        let (response_tx, response_rx) = oneshot::channel();
5216        self.enqueue_core_command(
5217            mailbox,
5218            CoreCommand::WaitRead {
5219                request,
5220                placement,
5221                waiter_id,
5222                response_tx,
5223            },
5224        )
5225        .await?;
5226        let mut cancel = WaitReadCancel::new(mailbox.tx.clone(), stream_id, placement, waiter_id);
5227        let response = response_rx
5228            .await
5229            .map_err(|_| RuntimeError::ResponseDropped {
5230                core_id: mailbox.core_id,
5231            })?;
5232        cancel.disarm();
5233        response
5234    }
5235
5236    pub async fn require_local_live_read_owner(
5237        &self,
5238        stream_id: &BucketStreamId,
5239    ) -> Result<(), RuntimeError> {
5240        let placement = self.shard_map.locate(stream_id);
5241        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5242        let (response_tx, response_rx) = oneshot::channel();
5243        self.send_core_command(
5244            mailbox,
5245            CoreCommand::RequireLiveReadOwner {
5246                placement,
5247                response_tx,
5248            },
5249            response_rx,
5250        )
5251        .await
5252    }
5253
5254    pub async fn close_stream(
5255        &self,
5256        request: CloseStreamRequest,
5257    ) -> Result<CloseStreamResponse, RuntimeError> {
5258        let placement = self.shard_map.locate(&request.stream_id);
5259        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5260        let (response_tx, response_rx) = oneshot::channel();
5261        self.send_core_command(
5262            mailbox,
5263            CoreCommand::CloseStream {
5264                request,
5265                placement,
5266                response_tx,
5267            },
5268            response_rx,
5269        )
5270        .await
5271    }
5272
5273    pub async fn delete_stream(
5274        &self,
5275        request: DeleteStreamRequest,
5276    ) -> Result<DeleteStreamResponse, RuntimeError> {
5277        let response = self.delete_stream_on_owner(request).await?;
5278        if let Some(parent_to_release) = response.parent_to_release.clone() {
5279            self.release_fork_ref_cascade(parent_to_release).await?;
5280        }
5281        Ok(response)
5282    }
5283
5284    async fn delete_stream_on_owner(
5285        &self,
5286        request: DeleteStreamRequest,
5287    ) -> Result<DeleteStreamResponse, RuntimeError> {
5288        let placement = self.shard_map.locate(&request.stream_id);
5289        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5290        let (response_tx, response_rx) = oneshot::channel();
5291        self.send_core_command(
5292            mailbox,
5293            CoreCommand::DeleteStream {
5294                request,
5295                placement,
5296                response_tx,
5297            },
5298            response_rx,
5299        )
5300        .await
5301    }
5302
5303    async fn add_fork_ref_on_owner(
5304        &self,
5305        stream_id: BucketStreamId,
5306        now_ms: u64,
5307    ) -> Result<ForkRefResponse, RuntimeError> {
5308        let placement = self.shard_map.locate(&stream_id);
5309        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5310        let (response_tx, response_rx) = oneshot::channel();
5311        self.send_core_command(
5312            mailbox,
5313            CoreCommand::AddForkRef {
5314                stream_id,
5315                now_ms,
5316                placement,
5317                response_tx,
5318            },
5319            response_rx,
5320        )
5321        .await
5322    }
5323
5324    async fn release_fork_ref_on_owner(
5325        &self,
5326        stream_id: BucketStreamId,
5327    ) -> Result<ForkRefResponse, RuntimeError> {
5328        let placement = self.shard_map.locate(&stream_id);
5329        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5330        let (response_tx, response_rx) = oneshot::channel();
5331        self.send_core_command(
5332            mailbox,
5333            CoreCommand::ReleaseForkRef {
5334                stream_id,
5335                placement,
5336                response_tx,
5337            },
5338            response_rx,
5339        )
5340        .await
5341    }
5342
5343    async fn release_fork_ref_cascade(
5344        &self,
5345        stream_id: BucketStreamId,
5346    ) -> Result<(), RuntimeError> {
5347        let mut next = Some(stream_id);
5348        while let Some(current) = next {
5349            let response = self.release_fork_ref_on_owner(current).await?;
5350            next = response.parent_to_release;
5351        }
5352        Ok(())
5353    }
5354
5355    pub async fn flush_cold(
5356        &self,
5357        request: FlushColdRequest,
5358    ) -> Result<FlushColdResponse, RuntimeError> {
5359        let placement = self.shard_map.locate(&request.stream_id);
5360        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5361        let (response_tx, response_rx) = oneshot::channel();
5362        self.send_core_command(
5363            mailbox,
5364            CoreCommand::FlushCold {
5365                request,
5366                placement,
5367                response_tx,
5368            },
5369            response_rx,
5370        )
5371        .await
5372    }
5373
5374    pub async fn append_external(
5375        &self,
5376        request: AppendExternalRequest,
5377    ) -> Result<AppendResponse, RuntimeError> {
5378        let placement = self.shard_map.locate(&request.stream_id);
5379        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5380        let (response_tx, response_rx) = oneshot::channel();
5381        self.send_core_command(
5382            mailbox,
5383            CoreCommand::AppendExternal {
5384                request,
5385                placement,
5386                response_tx,
5387            },
5388            response_rx,
5389        )
5390        .await
5391    }
5392
5393    pub async fn plan_cold_flush(
5394        &self,
5395        request: PlanColdFlushRequest,
5396    ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
5397        let placement = self.shard_map.locate(&request.stream_id);
5398        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5399        let (response_tx, response_rx) = oneshot::channel();
5400        self.send_core_command(
5401            mailbox,
5402            CoreCommand::PlanColdFlush {
5403                request,
5404                placement,
5405                response_tx,
5406            },
5407            response_rx,
5408        )
5409        .await
5410    }
5411
5412    pub async fn flush_cold_once(
5413        &self,
5414        request: PlanColdFlushRequest,
5415    ) -> Result<Option<FlushColdResponse>, RuntimeError> {
5416        let Some(candidate) = self.plan_cold_flush(request).await? else {
5417            return Ok(None);
5418        };
5419        self.flush_cold_candidate(candidate).await.map(Some)
5420    }
5421
5422    pub async fn plan_next_cold_flush(
5423        &self,
5424        raft_group_id: RaftGroupId,
5425        request: PlanGroupColdFlushRequest,
5426    ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
5427        let placement = self.placement_for_group(raft_group_id)?;
5428        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5429        let (response_tx, response_rx) = oneshot::channel();
5430        self.send_core_command(
5431            mailbox,
5432            CoreCommand::PlanNextColdFlush {
5433                request,
5434                placement,
5435                response_tx,
5436            },
5437            response_rx,
5438        )
5439        .await
5440    }
5441
5442    pub async fn plan_next_cold_flush_batch(
5443        &self,
5444        raft_group_id: RaftGroupId,
5445        request: PlanGroupColdFlushRequest,
5446        max_candidates: usize,
5447    ) -> Result<Vec<ColdFlushCandidate>, RuntimeError> {
5448        let placement = self.placement_for_group(raft_group_id)?;
5449        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5450        let (response_tx, response_rx) = oneshot::channel();
5451        self.send_core_command(
5452            mailbox,
5453            CoreCommand::PlanNextColdFlushBatch {
5454                request,
5455                placement,
5456                max_candidates,
5457                response_tx,
5458            },
5459            response_rx,
5460        )
5461        .await
5462    }
5463
5464    pub async fn flush_cold_group_once(
5465        &self,
5466        raft_group_id: RaftGroupId,
5467        request: PlanGroupColdFlushRequest,
5468    ) -> Result<Option<FlushColdResponse>, RuntimeError> {
5469        let Some(candidate) = self.plan_next_cold_flush(raft_group_id, request).await? else {
5470            return Ok(None);
5471        };
5472        match self.flush_cold_candidate(candidate).await {
5473            Ok(response) => Ok(Some(response)),
5474            Err(err) if is_stale_cold_flush_candidate_error(&err) => Ok(None),
5475            Err(err) => Err(err),
5476        }
5477    }
5478
5479    pub async fn flush_cold_group_batch_once(
5480        &self,
5481        raft_group_id: RaftGroupId,
5482        request: PlanGroupColdFlushRequest,
5483        max_candidates: usize,
5484    ) -> Result<Vec<FlushColdResponse>, RuntimeError> {
5485        let candidates = self
5486            .plan_next_cold_flush_batch(raft_group_id, request, max_candidates)
5487            .await?;
5488        if candidates.is_empty() {
5489            return Ok(Vec::new());
5490        }
5491        match self.flush_cold_candidates_batch(candidates).await {
5492            Ok(responses) => Ok(responses),
5493            Err(err) if is_stale_cold_flush_candidate_error(&err) => Ok(Vec::new()),
5494            Err(err) => Err(err),
5495        }
5496    }
5497
5498    async fn flush_cold_candidate(
5499        &self,
5500        candidate: ColdFlushCandidate,
5501    ) -> Result<FlushColdResponse, RuntimeError> {
5502        let Some(cold_store) = self.cold_store.as_ref() else {
5503            return Err(RuntimeError::ColdStoreConfig {
5504                message: "URSULA_COLD_BACKEND must be configured before flushing cold chunks"
5505                    .to_owned(),
5506            });
5507        };
5508        let path = new_cold_chunk_path(
5509            &candidate.stream_id,
5510            candidate.start_offset,
5511            candidate.end_offset,
5512        );
5513        let upload_started_at = Instant::now();
5514        let object_size = cold_store
5515            .write_chunk(&path, &candidate.payload)
5516            .await
5517            .map_err(|err| RuntimeError::ColdStoreIo {
5518                message: err.to_string(),
5519            })?;
5520        self.metrics
5521            .record_cold_upload(object_size, elapsed_ns(upload_started_at));
5522        let publish_started_at = Instant::now();
5523        let publish = self
5524            .flush_cold(FlushColdRequest {
5525                stream_id: candidate.stream_id,
5526                chunk: ColdChunkRef {
5527                    start_offset: candidate.start_offset,
5528                    end_offset: candidate.end_offset,
5529                    s3_path: path.clone(),
5530                    object_size,
5531                },
5532            })
5533            .await;
5534        match publish {
5535            Ok(response) => {
5536                self.metrics
5537                    .record_cold_publish(object_size, elapsed_ns(publish_started_at));
5538                Ok(response)
5539            }
5540            Err(err) => {
5541                let cleanup_failed = cold_store.delete_chunk(&path).await.is_err();
5542                self.metrics
5543                    .record_cold_orphan_cleanup(object_size, cleanup_failed);
5544                Err(err)
5545            }
5546        }
5547    }
5548
5549    async fn flush_cold_candidates_batch(
5550        &self,
5551        candidates: Vec<ColdFlushCandidate>,
5552    ) -> Result<Vec<FlushColdResponse>, RuntimeError> {
5553        let Some(cold_store) = self.cold_store.as_ref() else {
5554            return Err(RuntimeError::ColdStoreConfig {
5555                message: "URSULA_COLD_BACKEND must be configured before flushing cold chunks"
5556                    .to_owned(),
5557            });
5558        };
5559        let mut requests = Vec::with_capacity(candidates.len());
5560        let mut uploaded = Vec::with_capacity(candidates.len());
5561        for candidate in candidates {
5562            let path = new_cold_chunk_path(
5563                &candidate.stream_id,
5564                candidate.start_offset,
5565                candidate.end_offset,
5566            );
5567            let upload_started_at = Instant::now();
5568            let object_size = cold_store
5569                .write_chunk(&path, &candidate.payload)
5570                .await
5571                .map_err(|err| RuntimeError::ColdStoreIo {
5572                    message: err.to_string(),
5573                })?;
5574            self.metrics
5575                .record_cold_upload(object_size, elapsed_ns(upload_started_at));
5576            uploaded.push((path.clone(), object_size));
5577            requests.push(FlushColdRequest {
5578                stream_id: candidate.stream_id,
5579                chunk: ColdChunkRef {
5580                    start_offset: candidate.start_offset,
5581                    end_offset: candidate.end_offset,
5582                    s3_path: path,
5583                    object_size,
5584                },
5585            });
5586        }
5587
5588        let placement = self.shard_map.locate(&requests[0].stream_id);
5589        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5590        let (response_tx, response_rx) = oneshot::channel();
5591        let publish_started_at = Instant::now();
5592        let publish = self
5593            .send_core_command(
5594                mailbox,
5595                CoreCommand::FlushColdBatch {
5596                    requests,
5597                    placement,
5598                    response_tx,
5599                },
5600                response_rx,
5601            )
5602            .await;
5603        match publish {
5604            Ok(responses) => {
5605                let publish_ns = elapsed_ns(publish_started_at);
5606                let per_chunk_publish_ns =
5607                    publish_ns / u64::try_from(uploaded.len()).expect("uploaded len fits u64");
5608                for (_, object_size) in &uploaded {
5609                    self.metrics
5610                        .record_cold_publish(*object_size, per_chunk_publish_ns);
5611                }
5612                Ok(responses)
5613            }
5614            Err(err) => {
5615                for (path, object_size) in uploaded {
5616                    let cleanup_failed = cold_store.delete_chunk(&path).await.is_err();
5617                    self.metrics
5618                        .record_cold_orphan_cleanup(object_size, cleanup_failed);
5619                }
5620                Err(err)
5621            }
5622        }
5623    }
5624
5625    pub async fn flush_cold_all_groups_once(
5626        &self,
5627        request: PlanGroupColdFlushRequest,
5628    ) -> Result<usize, RuntimeError> {
5629        self.flush_cold_all_groups_once_bounded(request, 1).await
5630    }
5631
5632    pub async fn flush_cold_all_groups_once_bounded(
5633        &self,
5634        request: PlanGroupColdFlushRequest,
5635        max_concurrency: usize,
5636    ) -> Result<usize, RuntimeError> {
5637        let max_concurrency = max_concurrency.max(1);
5638        if max_concurrency == 1 {
5639            return self.flush_cold_all_groups_once_serial(request).await;
5640        }
5641        let mut flushed = 0;
5642        let mut next_group_id = 0;
5643        let group_count = self.shard_map.raft_group_count();
5644        let mut tasks = JoinSet::new();
5645
5646        while next_group_id < group_count || !tasks.is_empty() {
5647            while next_group_id < group_count && tasks.len() < max_concurrency {
5648                let runtime = self.clone();
5649                let request = request.clone();
5650                let group_id = RaftGroupId(next_group_id);
5651                next_group_id += 1;
5652                tasks.spawn(async move {
5653                    runtime
5654                        .flush_cold_group_batch_once(
5655                            group_id,
5656                            request,
5657                            COLD_FLUSH_GROUP_BATCH_MAX_CHUNKS,
5658                        )
5659                        .await
5660                        .map(|responses| responses.len())
5661                });
5662            }
5663            if let Some(result) = tasks.join_next().await {
5664                match result {
5665                    Ok(Ok(count)) => flushed += count,
5666                    Ok(Err(err)) => return Err(err),
5667                    Err(err) => {
5668                        return Err(RuntimeError::ColdStoreIo {
5669                            message: format!("cold flush task failed: {err}"),
5670                        });
5671                    }
5672                }
5673            }
5674        }
5675        Ok(flushed)
5676    }
5677
5678    async fn flush_cold_all_groups_once_serial(
5679        &self,
5680        request: PlanGroupColdFlushRequest,
5681    ) -> Result<usize, RuntimeError> {
5682        let mut flushed = 0;
5683        for group_id in 0..self.shard_map.raft_group_count() {
5684            flushed += self
5685                .flush_cold_group_batch_once(
5686                    RaftGroupId(group_id),
5687                    request.clone(),
5688                    COLD_FLUSH_GROUP_BATCH_MAX_CHUNKS,
5689                )
5690                .await?
5691                .len();
5692        }
5693        Ok(flushed)
5694    }
5695
5696    pub async fn append(&self, request: AppendRequest) -> Result<AppendResponse, RuntimeError> {
5697        if request.payload.is_empty() {
5698            return Err(RuntimeError::EmptyAppend);
5699        }
5700        let placement = self.shard_map.locate(&request.stream_id);
5701        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5702        let (response_tx, response_rx) = oneshot::channel();
5703        self.send_core_command(
5704            mailbox,
5705            CoreCommand::Append {
5706                request,
5707                placement,
5708                response_tx,
5709            },
5710            response_rx,
5711        )
5712        .await
5713    }
5714
5715    pub async fn append_batch(
5716        &self,
5717        request: AppendBatchRequest,
5718    ) -> Result<AppendBatchResponse, RuntimeError> {
5719        if request.payloads.is_empty() {
5720            return Err(RuntimeError::EmptyAppend);
5721        }
5722        let placement = self.shard_map.locate(&request.stream_id);
5723        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5724        let (response_tx, response_rx) = oneshot::channel();
5725        self.send_core_command(
5726            mailbox,
5727            CoreCommand::AppendBatch {
5728                request,
5729                placement,
5730                response_tx,
5731            },
5732            response_rx,
5733        )
5734        .await
5735    }
5736
5737    pub async fn snapshot_group(
5738        &self,
5739        raft_group_id: RaftGroupId,
5740    ) -> Result<GroupSnapshot, RuntimeError> {
5741        let placement = self.placement_for_group(raft_group_id)?;
5742        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5743        let (response_tx, response_rx) = oneshot::channel();
5744        self.send_core_command(
5745            mailbox,
5746            CoreCommand::SnapshotGroup {
5747                placement,
5748                response_tx,
5749            },
5750            response_rx,
5751        )
5752        .await
5753    }
5754
5755    pub async fn install_group_snapshot(
5756        &self,
5757        snapshot: GroupSnapshot,
5758    ) -> Result<(), RuntimeError> {
5759        let expected = self.placement_for_group(snapshot.placement.raft_group_id)?;
5760        if snapshot.placement != expected {
5761            return Err(RuntimeError::SnapshotPlacementMismatch {
5762                expected,
5763                actual: snapshot.placement,
5764            });
5765        }
5766        let mailbox = &self.mailboxes[usize::from(expected.core_id.0)];
5767        let (response_tx, response_rx) = oneshot::channel();
5768        self.send_core_command(
5769            mailbox,
5770            CoreCommand::InstallGroupSnapshot {
5771                snapshot,
5772                response_tx,
5773            },
5774            response_rx,
5775        )
5776        .await
5777    }
5778
5779    pub async fn warm_group(
5780        &self,
5781        raft_group_id: RaftGroupId,
5782    ) -> Result<ShardPlacement, RuntimeError> {
5783        let placement = self.placement_for_group(raft_group_id)?;
5784        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5785        let (response_tx, response_rx) = oneshot::channel();
5786        self.send_core_command(
5787            mailbox,
5788            CoreCommand::WarmGroup {
5789                placement,
5790                response_tx,
5791            },
5792            response_rx,
5793        )
5794        .await
5795    }
5796
5797    pub async fn warm_all_groups(&self) -> Result<(), RuntimeError> {
5798        for raw_group_id in 0..self.shard_map.raft_group_count() {
5799            self.warm_group(RaftGroupId(raw_group_id)).await?;
5800        }
5801        Ok(())
5802    }
5803
5804    fn placement_for_group(
5805        &self,
5806        raft_group_id: RaftGroupId,
5807    ) -> Result<ShardPlacement, RuntimeError> {
5808        if raft_group_id.0 >= self.shard_map.raft_group_count() {
5809            return Err(RuntimeError::InvalidRaftGroup {
5810                raft_group_id,
5811                raft_group_count: self.shard_map.raft_group_count(),
5812            });
5813        }
5814        Ok(ShardPlacement {
5815            core_id: CoreId(
5816                (raft_group_id.0 % u32::from(self.shard_map.core_count()))
5817                    .try_into()
5818                    .expect("core id fits u16"),
5819            ),
5820            shard_id: ShardId(raft_group_id.0),
5821            raft_group_id,
5822        })
5823    }
5824
5825    async fn send_core_command<T>(
5826        &self,
5827        mailbox: &CoreMailbox,
5828        command: CoreCommand,
5829        response_rx: oneshot::Receiver<Result<T, RuntimeError>>,
5830    ) -> Result<T, RuntimeError> {
5831        self.enqueue_core_command(mailbox, command).await?;
5832        response_rx
5833            .await
5834            .map_err(|_| RuntimeError::ResponseDropped {
5835                core_id: mailbox.core_id,
5836            })?
5837    }
5838
5839    async fn enqueue_core_command(
5840        &self,
5841        mailbox: &CoreMailbox,
5842        command: CoreCommand,
5843    ) -> Result<(), RuntimeError> {
5844        if mailbox.tx.capacity() == 0 {
5845            self.metrics.record_mailbox_full(mailbox.core_id);
5846        }
5847        let started_at = Instant::now();
5848        mailbox
5849            .tx
5850            .send(command)
5851            .await
5852            .map_err(|_| RuntimeError::MailboxClosed {
5853                core_id: mailbox.core_id,
5854            })?;
5855        self.metrics
5856            .record_routed_request(mailbox.core_id, elapsed_ns(started_at));
5857        Ok(())
5858    }
5859
5860    pub fn metrics(&self) -> RuntimeMetrics {
5861        RuntimeMetrics {
5862            inner: self.metrics.clone(),
5863        }
5864    }
5865
5866    pub fn mailbox_snapshot(&self) -> RuntimeMailboxSnapshot {
5867        let depths = self
5868            .mailboxes
5869            .iter()
5870            .map(CoreMailbox::depth)
5871            .collect::<Vec<_>>();
5872        let capacities = self
5873            .mailboxes
5874            .iter()
5875            .map(CoreMailbox::capacity)
5876            .collect::<Vec<_>>();
5877        RuntimeMailboxSnapshot { depths, capacities }
5878    }
5879}
5880
5881fn spawn_core_worker(threading: RuntimeThreading, worker: CoreWorker) -> Result<(), RuntimeError> {
5882    let core_id = worker.core_id;
5883    match threading {
5884        RuntimeThreading::HostedTokio => {
5885            tokio::spawn(worker.run());
5886            Ok(())
5887        }
5888        RuntimeThreading::ThreadPerCore => std::thread::Builder::new()
5889            .name(format!("ursula-core-{}", core_id.0))
5890            .spawn(move || {
5891                let runtime = tokio::runtime::Builder::new_current_thread()
5892                    .enable_all()
5893                    .build()
5894                    .expect("build per-core tokio runtime");
5895                runtime.block_on(worker.run());
5896            })
5897            .map(|_| ())
5898            .map_err(|err| RuntimeError::SpawnCoreThread {
5899                core_id,
5900                message: err.to_string(),
5901            }),
5902    }
5903}
5904
5905#[derive(Debug, Clone)]
5906struct CoreMailbox {
5907    core_id: CoreId,
5908    tx: mpsc::Sender<CoreCommand>,
5909}
5910
5911impl CoreMailbox {
5912    fn depth(&self) -> usize {
5913        self.tx.max_capacity() - self.tx.capacity()
5914    }
5915
5916    fn capacity(&self) -> usize {
5917        self.tx.max_capacity()
5918    }
5919}
5920
/// Commands carried by a core's mailbox (`CoreMailbox::tx` sends them and
/// `CoreWorker::rx` receives them).
///
/// Most variants bundle three things: the request payload, the
/// `ShardPlacement` the command targets, and a oneshot sender on which the
/// `Result` is returned to the caller. The exceptions are called out on the
/// individual variants.
#[derive(Debug)]
enum CoreCommand {
    CreateStream {
        request: CreateStreamRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
    },
    /// Shares its response type with `CreateStream`.
    CreateExternal {
        request: CreateStreamExternalRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
    },
    HeadStream {
        request: HeadStreamRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<HeadStreamResponse, RuntimeError>>,
    },
    ReadStream {
        request: ReadStreamRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
    },
    PublishSnapshot {
        request: PublishSnapshotRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<PublishSnapshotResponse, RuntimeError>>,
    },
    ReadSnapshot {
        request: ReadSnapshotRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<ReadSnapshotResponse, RuntimeError>>,
    },
    DeleteSnapshot {
        request: DeleteSnapshotRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<(), RuntimeError>>,
    },
    BootstrapStream {
        request: BootstrapStreamRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<BootstrapStreamResponse, RuntimeError>>,
    },
    /// Same request and response types as `ReadStream`, plus a `waiter_id`.
    /// `CancelWaitRead` carries a `waiter_id` as well.
    WaitRead {
        request: ReadStreamRequest,
        placement: ShardPlacement,
        waiter_id: u64,
        response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
    },
    /// Carries no request payload, only the placement and a unit-result reply.
    RequireLiveReadOwner {
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<(), RuntimeError>>,
    },
    /// The only variant without a response channel.
    CancelWaitRead {
        stream_id: BucketStreamId,
        placement: ShardPlacement,
        waiter_id: u64,
    },
    CloseStream {
        request: CloseStreamRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<CloseStreamResponse, RuntimeError>>,
    },
    AddForkRef {
        stream_id: BucketStreamId,
        now_ms: u64,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
    },
    ReleaseForkRef {
        stream_id: BucketStreamId,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
    },
    DeleteStream {
        request: DeleteStreamRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<DeleteStreamResponse, RuntimeError>>,
    },
    FlushCold {
        request: FlushColdRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<FlushColdResponse, RuntimeError>>,
    },
    /// Batched form of `FlushCold`: a list of requests answered with a list of
    /// responses.
    FlushColdBatch {
        requests: Vec<FlushColdRequest>,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<Vec<FlushColdResponse>, RuntimeError>>,
    },
    /// The reply is optional: `None` is a valid successful answer.
    PlanColdFlush {
        request: PlanColdFlushRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
    },
    /// Takes a group-level request and replies with at most one candidate.
    PlanNextColdFlush {
        request: PlanGroupColdFlushRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
    },
    /// Takes the same request type as `PlanNextColdFlush` plus `max_candidates`,
    /// and replies with a list of candidates.
    PlanNextColdFlushBatch {
        request: PlanGroupColdFlushRequest,
        placement: ShardPlacement,
        max_candidates: usize,
        response_tx: oneshot::Sender<Result<Vec<ColdFlushCandidate>, RuntimeError>>,
    },
    Append {
        request: AppendRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
    },
    /// Shares its response type with `Append`.
    AppendExternal {
        request: AppendExternalRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
    },
    AppendBatch {
        request: AppendBatchRequest,
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
    },
    /// Carries no request payload; the reply echoes a `ShardPlacement`.
    WarmGroup {
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<ShardPlacement, RuntimeError>>,
    },
    SnapshotGroup {
        placement: ShardPlacement,
        response_tx: oneshot::Sender<Result<GroupSnapshot, RuntimeError>>,
    },
    /// Has no `placement` field.
    /// NOTE(review): presumably the target group is taken from `snapshot` —
    /// confirm against the handler.
    InstallGroupSnapshot {
        snapshot: GroupSnapshot,
        response_tx: oneshot::Sender<Result<(), RuntimeError>>,
    },
}
6053
/// State owned by the worker that serves one core.
///
/// `spawn_core_worker` consumes a `CoreWorker` and drives its `run` future,
/// either as a tokio task or on a dedicated thread.
struct CoreWorker {
    /// Identity of the core; also used to name the per-core thread.
    core_id: CoreId,
    /// Incoming `CoreCommand`s for this core.
    rx: mpsc::Receiver<CoreCommand>,
    /// NOTE(review): presumably used to build a `GroupEngine` per raft group —
    /// the call site is outside this view.
    engine_factory: Arc<dyn GroupEngineFactory>,
    /// Mailboxes of the groups known to this worker, keyed by raft group id.
    groups: HashMap<RaftGroupId, GroupMailbox>,
    /// Shared metrics sink.
    metrics: Arc<RuntimeMetricsInner>,
    /// NOTE(review): presumably the channel bound used when a group mailbox is
    /// created — confirm.
    group_mailbox_capacity: usize,
    /// Cold-write admission setting; `GroupActor` holds a field of the same
    /// type.
    cold_write_admission: ColdWriteAdmission,
    /// Optional limit on live-read waiters; `GroupActor` holds a field of the
    /// same name and type.
    live_read_max_waiters_per_core: Option<u64>,
    /// Shared semaphore; `GroupActor` holds a field of the same name and type.
    read_materialization: Arc<Semaphore>,
}
6065
/// Bundle of shared handles passed to the batched-append helpers.
///
/// `GroupActor::run` fills it from the actor's own `metrics`,
/// `read_materialization` and `placement` before applying an append batch.
#[derive(Clone)]
struct AppendBatchRuntime {
    /// Shared metrics sink.
    metrics: Arc<RuntimeMetricsInner>,
    /// Shared read-materialization semaphore.
    read_materialization: Arc<Semaphore>,
    /// Placement of the group the batch is applied to.
    placement: ShardPlacement,
}
6072
/// Registered read watchers, grouped by the stream they are waiting on.
type ReadWatchers = HashMap<BucketStreamId, Vec<ReadWatcher>>;
/// NOTE(review): presumably the upper bound on how many write commands a group
/// actor coalesces into one batch — usage is outside this view, confirm.
const GROUP_ACTOR_MAX_WRITE_BATCH: usize = 64;
/// NOTE(review): presumably the upper bound on chunks per group-level cold
/// flush batch — usage is outside this view, confirm.
const COLD_FLUSH_GROUP_BATCH_MAX_CHUNKS: usize = 64;
6076
6077#[derive(Clone)]
6078struct GroupMailbox {
6079    group_id: RaftGroupId,
6080    tx: mpsc::Sender<GroupCommand>,
6081    metrics: Arc<RuntimeMetricsInner>,
6082}
6083
6084impl GroupMailbox {
6085    async fn send(&self, command: GroupCommand) -> Result<(), Box<GroupCommand>> {
6086        match self.tx.try_send(command) {
6087            Ok(()) => {
6088                self.metrics.record_group_mailbox_enqueued(self.group_id);
6089                Ok(())
6090            }
6091            Err(mpsc::error::TrySendError::Full(command)) => {
6092                self.metrics.record_group_mailbox_full(self.group_id);
6093                match self.tx.send(command).await {
6094                    Ok(()) => {
6095                        self.metrics.record_group_mailbox_enqueued(self.group_id);
6096                        Ok(())
6097                    }
6098                    Err(err) => Err(Box::new(err.0)),
6099                }
6100            }
6101            Err(mpsc::error::TrySendError::Closed(command)) => Err(Box::new(command)),
6102        }
6103    }
6104}
6105
/// Bookkeeping for one `AppendBatch` request whose reply has not been sent yet.
///
/// NOTE(review): presumably produced by the `prepare_append_batch_*` helpers
/// and consumed when the batch result is distributed — confirm; both are
/// outside this view.
struct PendingAppendBatch {
    /// Stream the batch targets.
    stream_id: BucketStreamId,
    /// NOTE(review): presumably the payload size of the request in bytes —
    /// confirm.
    incoming_bytes: u64,
    /// Channel on which the batch result is returned to the requester.
    response_tx: oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
    /// NOTE(review): presumably the start time used for latency metrics —
    /// confirm.
    started_at: Instant,
}
6112
/// Commands handled by a `GroupActor` (sent through `GroupMailbox::tx`).
///
/// The variants mirror `CoreCommand` but carry no `ShardPlacement`; the actor
/// supplies its own placement when it handles them. Every variant except
/// `CancelWaitRead` owns a oneshot sender for the reply, which is what
/// `GroupCommand::send_error` uses to fail a command.
#[derive(Debug)]
enum GroupCommand {
    CreateStream {
        request: CreateStreamRequest,
        response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
    },
    /// Shares its response type with `CreateStream`.
    CreateExternal {
        request: CreateStreamExternalRequest,
        response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
    },
    HeadStream {
        request: HeadStreamRequest,
        response_tx: oneshot::Sender<Result<HeadStreamResponse, RuntimeError>>,
    },
    ReadStream {
        request: ReadStreamRequest,
        response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
    },
    PublishSnapshot {
        request: PublishSnapshotRequest,
        response_tx: oneshot::Sender<Result<PublishSnapshotResponse, RuntimeError>>,
    },
    ReadSnapshot {
        request: ReadSnapshotRequest,
        response_tx: oneshot::Sender<Result<ReadSnapshotResponse, RuntimeError>>,
    },
    DeleteSnapshot {
        request: DeleteSnapshotRequest,
        response_tx: oneshot::Sender<Result<(), RuntimeError>>,
    },
    BootstrapStream {
        request: BootstrapStreamRequest,
        response_tx: oneshot::Sender<Result<BootstrapStreamResponse, RuntimeError>>,
    },
    /// The actor wraps these fields into a `ReadWatcher` before handling the
    /// wait; `waiter_id` is what `CancelWaitRead` refers to.
    WaitRead {
        request: ReadStreamRequest,
        waiter_id: u64,
        response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
    },
    /// Cancels the watcher identified by `stream_id` and `waiter_id`. It has
    /// no response channel, so failures cannot be reported back.
    CancelWaitRead {
        stream_id: BucketStreamId,
        waiter_id: u64,
    },
    /// Answered directly from the engine's `require_local_live_read_owner`.
    RequireLiveReadOwner {
        response_tx: oneshot::Sender<Result<(), RuntimeError>>,
    },
    CloseStream {
        request: CloseStreamRequest,
        response_tx: oneshot::Sender<Result<CloseStreamResponse, RuntimeError>>,
    },
    AddForkRef {
        stream_id: BucketStreamId,
        now_ms: u64,
        response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
    },
    ReleaseForkRef {
        stream_id: BucketStreamId,
        response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
    },
    DeleteStream {
        request: DeleteStreamRequest,
        response_tx: oneshot::Sender<Result<DeleteStreamResponse, RuntimeError>>,
    },
    FlushCold {
        request: FlushColdRequest,
        response_tx: oneshot::Sender<Result<FlushColdResponse, RuntimeError>>,
    },
    /// Batched form of `FlushCold`: a list of requests answered with a list of
    /// responses.
    FlushColdBatch {
        requests: Vec<FlushColdRequest>,
        response_tx: oneshot::Sender<Result<Vec<FlushColdResponse>, RuntimeError>>,
    },
    /// The reply is optional: `None` is a valid successful answer.
    PlanColdFlush {
        request: PlanColdFlushRequest,
        response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
    },
    PlanNextColdFlush {
        request: PlanGroupColdFlushRequest,
        response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
    },
    /// Takes the same request type as `PlanNextColdFlush` plus `max_candidates`,
    /// and replies with a list of candidates.
    PlanNextColdFlushBatch {
        request: PlanGroupColdFlushRequest,
        max_candidates: usize,
        response_tx: oneshot::Sender<Result<Vec<ColdFlushCandidate>, RuntimeError>>,
    },
    Append {
        request: AppendRequest,
        response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
    },
    /// Shares its response type with `Append`.
    AppendExternal {
        request: AppendExternalRequest,
        response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
    },
    /// The actor handles this by starting a batch with this command and
    /// passing it to `collect_append_batch_commands` before applying it.
    AppendBatch {
        request: AppendBatchRequest,
        response_tx: oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
    },
    SnapshotGroup {
        response_tx: oneshot::Sender<Result<GroupSnapshot, RuntimeError>>,
    },
    InstallGroupSnapshot {
        snapshot: GroupSnapshot,
        response_tx: oneshot::Sender<Result<(), RuntimeError>>,
    },
}
6217
6218impl GroupCommand {
6219    fn send_error(self, err: RuntimeError) {
6220        match self {
6221            Self::CreateStream { response_tx, .. } => {
6222                let _ = response_tx.send(Err(err));
6223            }
6224            Self::CreateExternal { response_tx, .. } => {
6225                let _ = response_tx.send(Err(err));
6226            }
6227            Self::HeadStream { response_tx, .. } => {
6228                let _ = response_tx.send(Err(err));
6229            }
6230            Self::ReadStream { response_tx, .. } | Self::WaitRead { response_tx, .. } => {
6231                let _ = response_tx.send(Err(err));
6232            }
6233            Self::CancelWaitRead { .. } => {}
6234            Self::RequireLiveReadOwner { response_tx } => {
6235                let _ = response_tx.send(Err(err));
6236            }
6237            Self::PublishSnapshot { response_tx, .. } => {
6238                let _ = response_tx.send(Err(err));
6239            }
6240            Self::ReadSnapshot { response_tx, .. } => {
6241                let _ = response_tx.send(Err(err));
6242            }
6243            Self::DeleteSnapshot { response_tx, .. } => {
6244                let _ = response_tx.send(Err(err));
6245            }
6246            Self::BootstrapStream { response_tx, .. } => {
6247                let _ = response_tx.send(Err(err));
6248            }
6249            Self::CloseStream { response_tx, .. } => {
6250                let _ = response_tx.send(Err(err));
6251            }
6252            Self::AddForkRef { response_tx, .. } | Self::ReleaseForkRef { response_tx, .. } => {
6253                let _ = response_tx.send(Err(err));
6254            }
6255            Self::DeleteStream { response_tx, .. } => {
6256                let _ = response_tx.send(Err(err));
6257            }
6258            Self::FlushCold { response_tx, .. } => {
6259                let _ = response_tx.send(Err(err));
6260            }
6261            Self::FlushColdBatch { response_tx, .. } => {
6262                let _ = response_tx.send(Err(err));
6263            }
6264            Self::PlanColdFlush { response_tx, .. } => {
6265                let _ = response_tx.send(Err(err));
6266            }
6267            Self::PlanNextColdFlush { response_tx, .. } => {
6268                let _ = response_tx.send(Err(err));
6269            }
6270            Self::PlanNextColdFlushBatch { response_tx, .. } => {
6271                let _ = response_tx.send(Err(err));
6272            }
6273            Self::Append { response_tx, .. } => {
6274                let _ = response_tx.send(Err(err));
6275            }
6276            Self::AppendExternal { response_tx, .. } => {
6277                let _ = response_tx.send(Err(err));
6278            }
6279            Self::AppendBatch { response_tx, .. } => {
6280                let _ = response_tx.send(Err(err));
6281            }
6282            Self::SnapshotGroup { response_tx } => {
6283                let _ = response_tx.send(Err(err));
6284            }
6285            Self::InstallGroupSnapshot { response_tx, .. } => {
6286                let _ = response_tx.send(Err(err));
6287            }
6288        }
6289    }
6290}
6291
/// Actor that owns one raft group's engine and processes its `GroupCommand`s
/// sequentially (see `GroupActor::run`).
struct GroupActor {
    /// Placement of the group; passed to every handler.
    placement: ShardPlacement,
    /// Engine that the handlers operate on.
    engine: Box<dyn GroupEngine>,
    /// Incoming commands for this group.
    rx: mpsc::Receiver<GroupCommand>,
    /// Read watchers registered by `WaitRead`; also passed to the write-side
    /// handlers.
    read_watchers: ReadWatchers,
    /// Shared metrics sink.
    metrics: Arc<RuntimeMetricsInner>,
    /// Cold-write admission setting; when enabled, append batches take the
    /// cold-admission path.
    cold_write_admission: ColdWriteAdmission,
    /// Optional waiter limit forwarded to `CoreWorker::wait_read_stream`.
    live_read_max_waiters_per_core: Option<u64>,
    /// Shared semaphore forwarded to the read and write handlers.
    read_materialization: Arc<Semaphore>,
}
6302
6303impl GroupActor {
6304    async fn run(mut self) {
6305        let mut pending = VecDeque::new();
6306        loop {
6307            let Some(command) = self.next_command(&mut pending).await else {
6308                break;
6309            };
6310            match command {
6311                GroupCommand::CreateStream {
6312                    request,
6313                    response_tx,
6314                } => {
6315                    let response = CoreWorker::create_stream(
6316                        &mut self.engine,
6317                        self.metrics.clone(),
6318                        request,
6319                        self.placement,
6320                        self.cold_write_admission,
6321                    )
6322                    .await;
6323                    let _ = response_tx.send(response);
6324                }
6325                GroupCommand::CreateExternal {
6326                    request,
6327                    response_tx,
6328                } => {
6329                    let response = CoreWorker::create_stream_external(
6330                        &mut self.engine,
6331                        self.metrics.clone(),
6332                        request,
6333                        self.placement,
6334                    )
6335                    .await;
6336                    let _ = response_tx.send(response);
6337                }
6338                GroupCommand::HeadStream {
6339                    request,
6340                    response_tx,
6341                } => {
6342                    let response = CoreWorker::head_stream(
6343                        &mut self.engine,
6344                        self.metrics.clone(),
6345                        request,
6346                        self.placement,
6347                    )
6348                    .await;
6349                    let _ = response_tx.send(response);
6350                }
6351                GroupCommand::ReadStream {
6352                    request,
6353                    response_tx,
6354                } => {
6355                    CoreWorker::read_stream(
6356                        &mut self.engine,
6357                        self.metrics.clone(),
6358                        self.read_materialization.clone(),
6359                        request,
6360                        self.placement,
6361                        response_tx,
6362                    )
6363                    .await;
6364                }
6365                GroupCommand::PublishSnapshot {
6366                    request,
6367                    response_tx,
6368                } => {
6369                    let response = CoreWorker::publish_snapshot(
6370                        &mut self.engine,
6371                        self.metrics.clone(),
6372                        self.read_materialization.clone(),
6373                        &mut self.read_watchers,
6374                        request,
6375                        self.placement,
6376                    )
6377                    .await;
6378                    let _ = response_tx.send(response);
6379                }
6380                GroupCommand::ReadSnapshot {
6381                    request,
6382                    response_tx,
6383                } => {
6384                    let response = CoreWorker::read_snapshot(
6385                        &mut self.engine,
6386                        self.metrics.clone(),
6387                        request,
6388                        self.placement,
6389                    )
6390                    .await;
6391                    let _ = response_tx.send(response);
6392                }
6393                GroupCommand::DeleteSnapshot {
6394                    request,
6395                    response_tx,
6396                } => {
6397                    let response = CoreWorker::delete_snapshot(
6398                        &mut self.engine,
6399                        self.metrics.clone(),
6400                        request,
6401                        self.placement,
6402                    )
6403                    .await;
6404                    let _ = response_tx.send(response);
6405                }
6406                GroupCommand::BootstrapStream {
6407                    request,
6408                    response_tx,
6409                } => {
6410                    let response = CoreWorker::bootstrap_stream(
6411                        &mut self.engine,
6412                        self.metrics.clone(),
6413                        request,
6414                        self.placement,
6415                    )
6416                    .await;
6417                    let _ = response_tx.send(response);
6418                }
6419                GroupCommand::WaitRead {
6420                    request,
6421                    waiter_id,
6422                    response_tx,
6423                } => {
6424                    let watcher = ReadWatcher {
6425                        waiter_id,
6426                        request,
6427                        response_tx,
6428                    };
6429                    CoreWorker::wait_read_stream(
6430                        &mut self.engine,
6431                        self.metrics.clone(),
6432                        self.read_materialization.clone(),
6433                        &mut self.read_watchers,
6434                        self.placement,
6435                        watcher,
6436                        self.live_read_max_waiters_per_core,
6437                    )
6438                    .await;
6439                }
6440                GroupCommand::CancelWaitRead {
6441                    stream_id,
6442                    waiter_id,
6443                } => {
6444                    CoreWorker::cancel_read_watcher(
6445                        &mut self.read_watchers,
6446                        self.metrics.clone(),
6447                        self.placement.core_id,
6448                        stream_id,
6449                        waiter_id,
6450                    );
6451                }
6452                GroupCommand::RequireLiveReadOwner { response_tx } => {
6453                    let response = self
6454                        .engine
6455                        .require_local_live_read_owner(self.placement)
6456                        .await
6457                        .map_err(|err| RuntimeError::group_engine(self.placement, err));
6458                    let _ = response_tx.send(response);
6459                }
6460                GroupCommand::CloseStream {
6461                    request,
6462                    response_tx,
6463                } => {
6464                    let response = CoreWorker::close_stream(
6465                        &mut self.engine,
6466                        self.metrics.clone(),
6467                        self.read_materialization.clone(),
6468                        &mut self.read_watchers,
6469                        request,
6470                        self.placement,
6471                    )
6472                    .await;
6473                    let _ = response_tx.send(response);
6474                }
6475                GroupCommand::AddForkRef {
6476                    stream_id,
6477                    now_ms,
6478                    response_tx,
6479                } => {
6480                    let response = CoreWorker::add_fork_ref(
6481                        &mut self.engine,
6482                        self.metrics.clone(),
6483                        stream_id,
6484                        now_ms,
6485                        self.placement,
6486                    )
6487                    .await;
6488                    let _ = response_tx.send(response);
6489                }
6490                GroupCommand::ReleaseForkRef {
6491                    stream_id,
6492                    response_tx,
6493                } => {
6494                    let response = CoreWorker::release_fork_ref(
6495                        &mut self.engine,
6496                        self.metrics.clone(),
6497                        self.read_materialization.clone(),
6498                        &mut self.read_watchers,
6499                        stream_id,
6500                        self.placement,
6501                    )
6502                    .await;
6503                    let _ = response_tx.send(response);
6504                }
6505                GroupCommand::DeleteStream {
6506                    request,
6507                    response_tx,
6508                } => {
6509                    let response = CoreWorker::delete_stream(
6510                        &mut self.engine,
6511                        self.metrics.clone(),
6512                        self.read_materialization.clone(),
6513                        &mut self.read_watchers,
6514                        request,
6515                        self.placement,
6516                    )
6517                    .await;
6518                    let _ = response_tx.send(response);
6519                }
6520                GroupCommand::FlushCold {
6521                    request,
6522                    response_tx,
6523                } => {
6524                    let response = CoreWorker::flush_cold(
6525                        &mut self.engine,
6526                        self.metrics.clone(),
6527                        self.read_materialization.clone(),
6528                        &mut self.read_watchers,
6529                        request,
6530                        self.placement,
6531                    )
6532                    .await;
6533                    let _ = response_tx.send(response);
6534                }
6535                GroupCommand::FlushColdBatch {
6536                    requests,
6537                    response_tx,
6538                } => {
6539                    let response = CoreWorker::flush_cold_batch(
6540                        &mut self.engine,
6541                        self.metrics.clone(),
6542                        self.read_materialization.clone(),
6543                        &mut self.read_watchers,
6544                        requests,
6545                        self.placement,
6546                    )
6547                    .await;
6548                    let _ = response_tx.send(response);
6549                }
6550                GroupCommand::PlanColdFlush {
6551                    request,
6552                    response_tx,
6553                } => {
6554                    let response = CoreWorker::plan_cold_flush(
6555                        &mut self.engine,
6556                        self.metrics.clone(),
6557                        request,
6558                        self.placement,
6559                    )
6560                    .await;
6561                    let _ = response_tx.send(response);
6562                }
6563                GroupCommand::PlanNextColdFlush {
6564                    request,
6565                    response_tx,
6566                } => {
6567                    let response = CoreWorker::plan_next_cold_flush(
6568                        &mut self.engine,
6569                        self.metrics.clone(),
6570                        request,
6571                        self.placement,
6572                    )
6573                    .await;
6574                    let _ = response_tx.send(response);
6575                }
6576                GroupCommand::PlanNextColdFlushBatch {
6577                    request,
6578                    max_candidates,
6579                    response_tx,
6580                } => {
6581                    let response = CoreWorker::plan_next_cold_flush_batch(
6582                        &mut self.engine,
6583                        self.metrics.clone(),
6584                        request,
6585                        self.placement,
6586                        max_candidates,
6587                    )
6588                    .await;
6589                    let _ = response_tx.send(response);
6590                }
6591                GroupCommand::Append {
6592                    request,
6593                    response_tx,
6594                } => {
6595                    let response = CoreWorker::apply_append(
6596                        &mut self.engine,
6597                        self.metrics.clone(),
6598                        self.read_materialization.clone(),
6599                        &mut self.read_watchers,
6600                        request,
6601                        self.placement,
6602                        self.cold_write_admission,
6603                    )
6604                    .await;
6605                    let _ = response_tx.send(response);
6606                }
6607                GroupCommand::AppendExternal {
6608                    request,
6609                    response_tx,
6610                } => {
6611                    let response = CoreWorker::apply_append_external(
6612                        &mut self.engine,
6613                        self.metrics.clone(),
6614                        self.read_materialization.clone(),
6615                        &mut self.read_watchers,
6616                        request,
6617                        self.placement,
6618                    )
6619                    .await;
6620                    let _ = response_tx.send(response);
6621                }
6622                GroupCommand::AppendBatch {
6623                    request,
6624                    response_tx,
6625                } => {
6626                    let mut batch = vec![(request, response_tx)];
6627                    self.collect_append_batch_commands(&mut pending, &mut batch);
6628                    if self.cold_write_admission.is_enabled() {
6629                        let (requests, pending_batch) =
6630                            CoreWorker::prepare_append_batch_requests(batch);
6631                        CoreWorker::apply_prepared_append_batch_requests_with_cold_admission(
6632                            &mut self.engine,
6633                            AppendBatchRuntime {
6634                                metrics: self.metrics.clone(),
6635                                read_materialization: self.read_materialization.clone(),
6636                                placement: self.placement,
6637                            },
6638                            &mut self.read_watchers,
6639                            pending_batch,
6640                            requests,
6641                            self.cold_write_admission,
6642                        )
6643                        .await;
6644                    } else {
6645                        let (commands, pending_batch) =
6646                            CoreWorker::prepare_append_batch_commands(batch);
6647                        CoreWorker::apply_prepared_append_batch_commands(
6648                            &mut self.engine,
6649                            AppendBatchRuntime {
6650                                metrics: self.metrics.clone(),
6651                                read_materialization: self.read_materialization.clone(),
6652                                placement: self.placement,
6653                            },
6654                            &mut self.read_watchers,
6655                            pending_batch,
6656                            commands,
6657                        )
6658                        .await;
6659                    }
6660                }
6661                GroupCommand::SnapshotGroup { response_tx } => {
6662                    let response = CoreWorker::snapshot_group(
6663                        &mut self.engine,
6664                        self.metrics.clone(),
6665                        self.placement,
6666                    )
6667                    .await;
6668                    let _ = response_tx.send(response);
6669                }
6670                GroupCommand::InstallGroupSnapshot {
6671                    snapshot,
6672                    response_tx,
6673                } => {
6674                    let response = CoreWorker::install_group_snapshot(
6675                        &mut self.engine,
6676                        self.metrics.clone(),
6677                        snapshot,
6678                    )
6679                    .await;
6680                    let _ = response_tx.send(response);
6681                }
6682            }
6683        }
6684    }
6685
6686    async fn next_command(&mut self, pending: &mut VecDeque<GroupCommand>) -> Option<GroupCommand> {
6687        match pending.pop_front() {
6688            Some(command) => Some(command),
6689            None => {
6690                let command = self.rx.recv().await;
6691                if command.is_some() {
6692                    self.metrics
6693                        .record_group_mailbox_dequeued(self.placement.raft_group_id);
6694                }
6695                command
6696            }
6697        }
6698    }
6699
6700    fn collect_append_batch_commands(
6701        &mut self,
6702        pending: &mut VecDeque<GroupCommand>,
6703        batch: &mut Vec<(
6704            AppendBatchRequest,
6705            oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
6706        )>,
6707    ) {
6708        while batch.len() < GROUP_ACTOR_MAX_WRITE_BATCH {
6709            let command = match pending.pop_front() {
6710                Some(command) => Some(command),
6711                None => match self.rx.try_recv() {
6712                    Ok(command) => {
6713                        self.metrics
6714                            .record_group_mailbox_dequeued(self.placement.raft_group_id);
6715                        Some(command)
6716                    }
6717                    Err(_) => None,
6718                },
6719            };
6720            match command {
6721                Some(GroupCommand::AppendBatch {
6722                    request,
6723                    response_tx,
6724                }) => batch.push((request, response_tx)),
6725                Some(other) => {
6726                    pending.push_front(other);
6727                    break;
6728                }
6729                None => break,
6730            }
6731        }
6732    }
6733}
6734
/// A parked read request together with the channel used to answer it.
struct ReadWatcher {
    /// Identifier matched against when a specific waiter is cancelled.
    waiter_id: u64,
    /// The read being waited on; its `stream_id` keys the watcher map.
    request: ReadStreamRequest,
    /// One-shot channel on which the read result (or error) is delivered.
    response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
}
6740
6741fn live_read_watcher_count(read_watchers: &HashMap<BucketStreamId, Vec<ReadWatcher>>) -> u64 {
6742    read_watchers
6743        .values()
6744        .map(|watchers| u64::try_from(watchers.len()).expect("watcher count fits u64"))
6745        .sum()
6746}
6747
/// Drop guard that asks the owning core to cancel a waiting read.
///
/// While `stream_id` is `Some`, dropping the guard attempts to enqueue a
/// `CoreCommand::CancelWaitRead`; `disarm` clears it so the drop does nothing.
struct WaitReadCancel {
    /// Sender for the core mailbox that receives the cancel command.
    tx: mpsc::Sender<CoreCommand>,
    /// Stream the waiter is registered on; `None` once disarmed.
    stream_id: Option<BucketStreamId>,
    /// Placement forwarded with the cancel command.
    placement: ShardPlacement,
    /// Identifier of the waiter to cancel.
    waiter_id: u64,
}
6754
6755impl WaitReadCancel {
6756    fn new(
6757        tx: mpsc::Sender<CoreCommand>,
6758        stream_id: BucketStreamId,
6759        placement: ShardPlacement,
6760        waiter_id: u64,
6761    ) -> Self {
6762        Self {
6763            tx,
6764            stream_id: Some(stream_id),
6765            placement,
6766            waiter_id,
6767        }
6768    }
6769
6770    fn disarm(&mut self) {
6771        self.stream_id = None;
6772    }
6773}
6774
6775impl Drop for WaitReadCancel {
6776    fn drop(&mut self) {
6777        if let Some(stream_id) = self.stream_id.take() {
6778            // Drop cannot await. If the owner mailbox is full, the stale
6779            // waiter is still removed when the next stream notification
6780            // consumes the closed oneshot sender.
6781            let _ = self.tx.try_send(CoreCommand::CancelWaitRead {
6782                stream_id,
6783                placement: self.placement,
6784                waiter_id: self.waiter_id,
6785            });
6786        }
6787    }
6788}
6789
6790impl CoreWorker {
6791    async fn run(mut self) {
6792        while let Some(command) = self.rx.recv().await {
6793            match command {
6794                CoreCommand::CreateStream {
6795                    request,
6796                    placement,
6797                    response_tx,
6798                } => {
6799                    debug_assert_eq!(placement.core_id, self.core_id);
6800                    self.send_group_command(
6801                        placement,
6802                        GroupCommand::CreateStream {
6803                            request,
6804                            response_tx,
6805                        },
6806                    )
6807                    .await;
6808                }
6809                CoreCommand::CreateExternal {
6810                    request,
6811                    placement,
6812                    response_tx,
6813                } => {
6814                    debug_assert_eq!(placement.core_id, self.core_id);
6815                    self.send_group_command(
6816                        placement,
6817                        GroupCommand::CreateExternal {
6818                            request,
6819                            response_tx,
6820                        },
6821                    )
6822                    .await;
6823                }
6824                CoreCommand::HeadStream {
6825                    request,
6826                    placement,
6827                    response_tx,
6828                } => {
6829                    debug_assert_eq!(placement.core_id, self.core_id);
6830                    self.send_group_command(
6831                        placement,
6832                        GroupCommand::HeadStream {
6833                            request,
6834                            response_tx,
6835                        },
6836                    )
6837                    .await;
6838                }
6839                CoreCommand::ReadStream {
6840                    request,
6841                    placement,
6842                    response_tx,
6843                } => {
6844                    debug_assert_eq!(placement.core_id, self.core_id);
6845                    self.send_group_command(
6846                        placement,
6847                        GroupCommand::ReadStream {
6848                            request,
6849                            response_tx,
6850                        },
6851                    )
6852                    .await;
6853                }
6854                CoreCommand::PublishSnapshot {
6855                    request,
6856                    placement,
6857                    response_tx,
6858                } => {
6859                    debug_assert_eq!(placement.core_id, self.core_id);
6860                    self.send_group_command(
6861                        placement,
6862                        GroupCommand::PublishSnapshot {
6863                            request,
6864                            response_tx,
6865                        },
6866                    )
6867                    .await;
6868                }
6869                CoreCommand::ReadSnapshot {
6870                    request,
6871                    placement,
6872                    response_tx,
6873                } => {
6874                    debug_assert_eq!(placement.core_id, self.core_id);
6875                    self.send_group_command(
6876                        placement,
6877                        GroupCommand::ReadSnapshot {
6878                            request,
6879                            response_tx,
6880                        },
6881                    )
6882                    .await;
6883                }
6884                CoreCommand::DeleteSnapshot {
6885                    request,
6886                    placement,
6887                    response_tx,
6888                } => {
6889                    debug_assert_eq!(placement.core_id, self.core_id);
6890                    self.send_group_command(
6891                        placement,
6892                        GroupCommand::DeleteSnapshot {
6893                            request,
6894                            response_tx,
6895                        },
6896                    )
6897                    .await;
6898                }
6899                CoreCommand::BootstrapStream {
6900                    request,
6901                    placement,
6902                    response_tx,
6903                } => {
6904                    debug_assert_eq!(placement.core_id, self.core_id);
6905                    self.send_group_command(
6906                        placement,
6907                        GroupCommand::BootstrapStream {
6908                            request,
6909                            response_tx,
6910                        },
6911                    )
6912                    .await;
6913                }
6914                CoreCommand::WaitRead {
6915                    request,
6916                    placement,
6917                    waiter_id,
6918                    response_tx,
6919                } => {
6920                    debug_assert_eq!(placement.core_id, self.core_id);
6921                    self.send_group_command(
6922                        placement,
6923                        GroupCommand::WaitRead {
6924                            request,
6925                            waiter_id,
6926                            response_tx,
6927                        },
6928                    )
6929                    .await;
6930                }
6931                CoreCommand::RequireLiveReadOwner {
6932                    placement,
6933                    response_tx,
6934                } => {
6935                    debug_assert_eq!(placement.core_id, self.core_id);
6936                    self.send_group_command(
6937                        placement,
6938                        GroupCommand::RequireLiveReadOwner { response_tx },
6939                    )
6940                    .await;
6941                }
6942                CoreCommand::CancelWaitRead {
6943                    stream_id,
6944                    placement,
6945                    waiter_id,
6946                } => {
6947                    debug_assert_eq!(placement.core_id, self.core_id);
6948                    self.send_group_command(
6949                        placement,
6950                        GroupCommand::CancelWaitRead {
6951                            stream_id,
6952                            waiter_id,
6953                        },
6954                    )
6955                    .await;
6956                }
6957                CoreCommand::CloseStream {
6958                    request,
6959                    placement,
6960                    response_tx,
6961                } => {
6962                    debug_assert_eq!(placement.core_id, self.core_id);
6963                    self.send_group_command(
6964                        placement,
6965                        GroupCommand::CloseStream {
6966                            request,
6967                            response_tx,
6968                        },
6969                    )
6970                    .await;
6971                }
6972                CoreCommand::AddForkRef {
6973                    stream_id,
6974                    now_ms,
6975                    placement,
6976                    response_tx,
6977                } => {
6978                    debug_assert_eq!(placement.core_id, self.core_id);
6979                    self.send_group_command(
6980                        placement,
6981                        GroupCommand::AddForkRef {
6982                            stream_id,
6983                            now_ms,
6984                            response_tx,
6985                        },
6986                    )
6987                    .await;
6988                }
6989                CoreCommand::ReleaseForkRef {
6990                    stream_id,
6991                    placement,
6992                    response_tx,
6993                } => {
6994                    debug_assert_eq!(placement.core_id, self.core_id);
6995                    self.send_group_command(
6996                        placement,
6997                        GroupCommand::ReleaseForkRef {
6998                            stream_id,
6999                            response_tx,
7000                        },
7001                    )
7002                    .await;
7003                }
7004                CoreCommand::DeleteStream {
7005                    request,
7006                    placement,
7007                    response_tx,
7008                } => {
7009                    debug_assert_eq!(placement.core_id, self.core_id);
7010                    self.send_group_command(
7011                        placement,
7012                        GroupCommand::DeleteStream {
7013                            request,
7014                            response_tx,
7015                        },
7016                    )
7017                    .await;
7018                }
7019                CoreCommand::FlushCold {
7020                    request,
7021                    placement,
7022                    response_tx,
7023                } => {
7024                    debug_assert_eq!(placement.core_id, self.core_id);
7025                    self.send_group_command(
7026                        placement,
7027                        GroupCommand::FlushCold {
7028                            request,
7029                            response_tx,
7030                        },
7031                    )
7032                    .await;
7033                }
7034                CoreCommand::FlushColdBatch {
7035                    requests,
7036                    placement,
7037                    response_tx,
7038                } => {
7039                    debug_assert_eq!(placement.core_id, self.core_id);
7040                    self.send_group_command(
7041                        placement,
7042                        GroupCommand::FlushColdBatch {
7043                            requests,
7044                            response_tx,
7045                        },
7046                    )
7047                    .await;
7048                }
7049                CoreCommand::PlanColdFlush {
7050                    request,
7051                    placement,
7052                    response_tx,
7053                } => {
7054                    debug_assert_eq!(placement.core_id, self.core_id);
7055                    self.send_group_command(
7056                        placement,
7057                        GroupCommand::PlanColdFlush {
7058                            request,
7059                            response_tx,
7060                        },
7061                    )
7062                    .await;
7063                }
7064                CoreCommand::PlanNextColdFlush {
7065                    request,
7066                    placement,
7067                    response_tx,
7068                } => {
7069                    debug_assert_eq!(placement.core_id, self.core_id);
7070                    self.send_group_command(
7071                        placement,
7072                        GroupCommand::PlanNextColdFlush {
7073                            request,
7074                            response_tx,
7075                        },
7076                    )
7077                    .await;
7078                }
7079                CoreCommand::PlanNextColdFlushBatch {
7080                    request,
7081                    placement,
7082                    max_candidates,
7083                    response_tx,
7084                } => {
7085                    debug_assert_eq!(placement.core_id, self.core_id);
7086                    self.send_group_command(
7087                        placement,
7088                        GroupCommand::PlanNextColdFlushBatch {
7089                            request,
7090                            max_candidates,
7091                            response_tx,
7092                        },
7093                    )
7094                    .await;
7095                }
7096                CoreCommand::Append {
7097                    request,
7098                    placement,
7099                    response_tx,
7100                } => {
7101                    debug_assert_eq!(placement.core_id, self.core_id);
7102                    self.send_group_command(
7103                        placement,
7104                        GroupCommand::Append {
7105                            request,
7106                            response_tx,
7107                        },
7108                    )
7109                    .await;
7110                }
7111                CoreCommand::AppendExternal {
7112                    request,
7113                    placement,
7114                    response_tx,
7115                } => {
7116                    debug_assert_eq!(placement.core_id, self.core_id);
7117                    self.send_group_command(
7118                        placement,
7119                        GroupCommand::AppendExternal {
7120                            request,
7121                            response_tx,
7122                        },
7123                    )
7124                    .await;
7125                }
7126                CoreCommand::AppendBatch {
7127                    request,
7128                    placement,
7129                    response_tx,
7130                } => {
7131                    debug_assert_eq!(placement.core_id, self.core_id);
7132                    self.send_group_command(
7133                        placement,
7134                        GroupCommand::AppendBatch {
7135                            request,
7136                            response_tx,
7137                        },
7138                    )
7139                    .await;
7140                }
7141                CoreCommand::WarmGroup {
7142                    placement,
7143                    response_tx,
7144                } => {
7145                    debug_assert_eq!(placement.core_id, self.core_id);
7146                    let response = self.group(placement).await.map(|_| placement);
7147                    let _ = response_tx.send(response);
7148                }
7149                CoreCommand::SnapshotGroup {
7150                    placement,
7151                    response_tx,
7152                } => {
7153                    debug_assert_eq!(placement.core_id, self.core_id);
7154                    self.send_group_command(placement, GroupCommand::SnapshotGroup { response_tx })
7155                        .await;
7156                }
7157                CoreCommand::InstallGroupSnapshot {
7158                    snapshot,
7159                    response_tx,
7160                } => {
7161                    debug_assert_eq!(snapshot.placement.core_id, self.core_id);
7162                    self.send_group_command(
7163                        snapshot.placement,
7164                        GroupCommand::InstallGroupSnapshot {
7165                            snapshot,
7166                            response_tx,
7167                        },
7168                    )
7169                    .await;
7170                }
7171            }
7172        }
7173    }
7174
7175    async fn send_group_command(&mut self, placement: ShardPlacement, command: GroupCommand) {
7176        let core_id = placement.core_id;
7177        match self.group(placement).await {
7178            Ok(group) => {
7179                if let Err(command) = group.send(command).await {
7180                    (*command).send_error(RuntimeError::MailboxClosed { core_id });
7181                }
7182            }
7183            Err(err) => command.send_error(err),
7184        }
7185    }
7186
7187    async fn group(&mut self, placement: ShardPlacement) -> Result<GroupMailbox, RuntimeError> {
7188        if !self.groups.contains_key(&placement.raft_group_id) {
7189            let engine_factory = self.engine_factory.clone();
7190            let metrics = GroupEngineMetrics {
7191                inner: self.metrics.clone(),
7192            };
7193            let engine = engine_factory
7194                .create(placement, metrics)
7195                .await
7196                .map_err(|err| RuntimeError::group_engine(placement, err))?;
7197            let (tx, rx) = mpsc::channel(self.group_mailbox_capacity);
7198            let actor = GroupActor {
7199                placement,
7200                engine,
7201                rx,
7202                read_watchers: HashMap::new(),
7203                metrics: self.metrics.clone(),
7204                cold_write_admission: self.cold_write_admission,
7205                live_read_max_waiters_per_core: self.live_read_max_waiters_per_core,
7206                read_materialization: self.read_materialization.clone(),
7207            };
7208            tokio::spawn(actor.run());
7209            self.groups.insert(
7210                placement.raft_group_id,
7211                GroupMailbox {
7212                    group_id: placement.raft_group_id,
7213                    tx,
7214                    metrics: self.metrics.clone(),
7215                },
7216            );
7217        }
7218        Ok(self
7219            .groups
7220            .get(&placement.raft_group_id)
7221            .expect("group was just inserted")
7222            .clone())
7223    }
7224
7225    async fn read_stream(
7226        group: &mut Box<dyn GroupEngine>,
7227        metrics: Arc<RuntimeMetricsInner>,
7228        read_materialization: Arc<Semaphore>,
7229        request: ReadStreamRequest,
7230        placement: ShardPlacement,
7231        response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
7232    ) {
7233        let exec_started_at = Instant::now();
7234        let parts = group
7235            .read_stream_parts(request, placement)
7236            .await
7237            .map_err(|err| RuntimeError::group_engine(placement, err));
7238        metrics.record_group_engine_exec(
7239            placement.core_id,
7240            placement.raft_group_id,
7241            elapsed_ns(exec_started_at),
7242        );
7243        match parts {
7244            Ok(parts) => {
7245                Self::send_read_parts_response(placement, read_materialization, parts, response_tx);
7246            }
7247            Err(err) => {
7248                let _ = response_tx.send(Err(err));
7249            }
7250        }
7251    }
7252
7253    fn send_read_parts_response(
7254        placement: ShardPlacement,
7255        read_materialization: Arc<Semaphore>,
7256        parts: GroupReadStreamParts,
7257        response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
7258    ) {
7259        tokio::spawn(async move {
7260            let response = match read_materialization.acquire_owned().await {
7261                Ok(_permit) => parts
7262                    .into_response()
7263                    .await
7264                    .map_err(|err| RuntimeError::group_engine(placement, err)),
7265                Err(_) => Err(RuntimeError::MailboxClosed {
7266                    core_id: placement.core_id,
7267                }),
7268            };
7269            let _ = response_tx.send(response);
7270        });
7271    }
7272
7273    fn send_read_parts_to_watchers(
7274        placement: ShardPlacement,
7275        read_materialization: Arc<Semaphore>,
7276        parts: GroupReadStreamParts,
7277        watchers: Vec<ReadWatcher>,
7278    ) {
7279        tokio::spawn(async move {
7280            let response = match read_materialization.acquire_owned().await {
7281                Ok(_permit) => parts
7282                    .into_response()
7283                    .await
7284                    .map_err(|err| RuntimeError::group_engine(placement, err)),
7285                Err(_) => Err(RuntimeError::MailboxClosed {
7286                    core_id: placement.core_id,
7287                }),
7288            };
7289            for watcher in watchers {
7290                let _ = watcher.response_tx.send(response.clone());
7291            }
7292        });
7293    }
7294
7295    async fn publish_snapshot(
7296        group: &mut Box<dyn GroupEngine>,
7297        metrics: Arc<RuntimeMetricsInner>,
7298        read_materialization: Arc<Semaphore>,
7299        read_watchers: &mut ReadWatchers,
7300        request: PublishSnapshotRequest,
7301        placement: ShardPlacement,
7302    ) -> Result<PublishSnapshotResponse, RuntimeError> {
7303        let stream_id = request.stream_id.clone();
7304        let started_at = Instant::now();
7305        let exec_started_at = Instant::now();
7306        let response = group
7307            .publish_snapshot(request, placement)
7308            .await
7309            .map_err(|err| RuntimeError::group_engine(placement, err));
7310        metrics.record_group_engine_exec(
7311            placement.core_id,
7312            placement.raft_group_id,
7313            elapsed_ns(exec_started_at),
7314        );
7315        if response.is_ok() {
7316            metrics.record_applied_mutation(
7317                placement.core_id,
7318                placement.raft_group_id,
7319                elapsed_ns(started_at),
7320            );
7321            record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7322            Self::notify_read_watchers(
7323                group,
7324                metrics,
7325                read_materialization,
7326                read_watchers,
7327                &stream_id,
7328                placement,
7329            )
7330            .await;
7331        }
7332        response
7333    }
7334
7335    async fn read_snapshot(
7336        group: &mut Box<dyn GroupEngine>,
7337        metrics: Arc<RuntimeMetricsInner>,
7338        request: ReadSnapshotRequest,
7339        placement: ShardPlacement,
7340    ) -> Result<ReadSnapshotResponse, RuntimeError> {
7341        let exec_started_at = Instant::now();
7342        let response = group
7343            .read_snapshot(request, placement)
7344            .await
7345            .map_err(|err| RuntimeError::group_engine(placement, err));
7346        metrics.record_group_engine_exec(
7347            placement.core_id,
7348            placement.raft_group_id,
7349            elapsed_ns(exec_started_at),
7350        );
7351        response
7352    }
7353
7354    async fn delete_snapshot(
7355        group: &mut Box<dyn GroupEngine>,
7356        metrics: Arc<RuntimeMetricsInner>,
7357        request: DeleteSnapshotRequest,
7358        placement: ShardPlacement,
7359    ) -> Result<(), RuntimeError> {
7360        let exec_started_at = Instant::now();
7361        let response = group
7362            .delete_snapshot(request, placement)
7363            .await
7364            .map_err(|err| RuntimeError::group_engine(placement, err));
7365        metrics.record_group_engine_exec(
7366            placement.core_id,
7367            placement.raft_group_id,
7368            elapsed_ns(exec_started_at),
7369        );
7370        response
7371    }
7372
7373    async fn bootstrap_stream(
7374        group: &mut Box<dyn GroupEngine>,
7375        metrics: Arc<RuntimeMetricsInner>,
7376        request: BootstrapStreamRequest,
7377        placement: ShardPlacement,
7378    ) -> Result<BootstrapStreamResponse, RuntimeError> {
7379        let exec_started_at = Instant::now();
7380        let response = group
7381            .bootstrap_stream(request, placement)
7382            .await
7383            .map_err(|err| RuntimeError::group_engine(placement, err));
7384        metrics.record_group_engine_exec(
7385            placement.core_id,
7386            placement.raft_group_id,
7387            elapsed_ns(exec_started_at),
7388        );
7389        response
7390    }
7391
7392    async fn wait_read_stream(
7393        group: &mut Box<dyn GroupEngine>,
7394        metrics: Arc<RuntimeMetricsInner>,
7395        read_materialization: Arc<Semaphore>,
7396        read_watchers: &mut ReadWatchers,
7397        placement: ShardPlacement,
7398        watcher: ReadWatcher,
7399        live_read_max_waiters_per_core: Option<u64>,
7400    ) {
7401        let exec_started_at = Instant::now();
7402        let parts = group
7403            .read_stream_parts(watcher.request.clone(), placement)
7404            .await
7405            .map_err(|err| RuntimeError::group_engine(placement, err));
7406        metrics.record_group_engine_exec(
7407            placement.core_id,
7408            placement.raft_group_id,
7409            elapsed_ns(exec_started_at),
7410        );
7411        match parts {
7412            Ok(parts) if parts.payload_is_empty() && parts.up_to_date && !parts.closed => {
7413                if watcher.response_tx.is_closed() {
7414                    return;
7415                }
7416                let current_waiters = live_read_watcher_count(read_watchers);
7417                if let Some(limit) = live_read_max_waiters_per_core
7418                    && current_waiters >= limit
7419                {
7420                    metrics.record_live_read_backpressure(placement.core_id);
7421                    let _ = watcher
7422                        .response_tx
7423                        .send(Err(RuntimeError::LiveReadBackpressure {
7424                            core_id: placement.core_id,
7425                            current_waiters,
7426                            limit,
7427                        }));
7428                    return;
7429                }
7430                metrics.record_read_watcher_added(placement.core_id);
7431                read_watchers
7432                    .entry(watcher.request.stream_id.clone())
7433                    .or_default()
7434                    .push(watcher);
7435            }
7436            Ok(parts) => {
7437                Self::send_read_parts_response(
7438                    placement,
7439                    read_materialization.clone(),
7440                    parts,
7441                    watcher.response_tx,
7442                );
7443            }
7444            Err(err) => {
7445                let _ = watcher.response_tx.send(Err(err));
7446            }
7447        }
7448    }
7449
7450    fn cancel_read_watcher(
7451        read_watchers: &mut ReadWatchers,
7452        metrics: Arc<RuntimeMetricsInner>,
7453        core_id: CoreId,
7454        stream_id: BucketStreamId,
7455        waiter_id: u64,
7456    ) {
7457        let Some(watchers) = read_watchers.get_mut(&stream_id) else {
7458            return;
7459        };
7460        let before = watchers.len();
7461        watchers.retain(|watcher| watcher.waiter_id != waiter_id);
7462        let removed = before - watchers.len();
7463        let is_empty = watchers.is_empty();
7464        if removed > 0 {
7465            metrics.record_read_watchers_removed(core_id, removed);
7466        }
7467        if is_empty {
7468            read_watchers.remove(&stream_id);
7469        }
7470    }
7471
7472    async fn close_stream(
7473        group: &mut Box<dyn GroupEngine>,
7474        metrics: Arc<RuntimeMetricsInner>,
7475        read_materialization: Arc<Semaphore>,
7476        read_watchers: &mut ReadWatchers,
7477        request: CloseStreamRequest,
7478        placement: ShardPlacement,
7479    ) -> Result<CloseStreamResponse, RuntimeError> {
7480        let stream_id = request.stream_id.clone();
7481        let started_at = Instant::now();
7482        let exec_started_at = Instant::now();
7483        let response = group
7484            .close_stream(request, placement)
7485            .await
7486            .map_err(|err| RuntimeError::group_engine(placement, err));
7487        metrics.record_group_engine_exec(
7488            placement.core_id,
7489            placement.raft_group_id,
7490            elapsed_ns(exec_started_at),
7491        );
7492        if response
7493            .as_ref()
7494            .is_ok_and(|response| !response.deduplicated)
7495        {
7496            metrics.record_applied_mutation(
7497                placement.core_id,
7498                placement.raft_group_id,
7499                elapsed_ns(started_at),
7500            );
7501            Self::notify_read_watchers(
7502                group,
7503                metrics,
7504                read_materialization,
7505                read_watchers,
7506                &stream_id,
7507                placement,
7508            )
7509            .await;
7510        }
7511        response
7512    }
7513
7514    async fn add_fork_ref(
7515        group: &mut Box<dyn GroupEngine>,
7516        metrics: Arc<RuntimeMetricsInner>,
7517        stream_id: BucketStreamId,
7518        now_ms: u64,
7519        placement: ShardPlacement,
7520    ) -> Result<ForkRefResponse, RuntimeError> {
7521        let started_at = Instant::now();
7522        let exec_started_at = Instant::now();
7523        let response = group
7524            .add_fork_ref(stream_id, now_ms, placement)
7525            .await
7526            .map_err(|err| RuntimeError::group_engine(placement, err));
7527        metrics.record_group_engine_exec(
7528            placement.core_id,
7529            placement.raft_group_id,
7530            elapsed_ns(exec_started_at),
7531        );
7532        if response.is_ok() {
7533            metrics.record_applied_mutation(
7534                placement.core_id,
7535                placement.raft_group_id,
7536                elapsed_ns(started_at),
7537            );
7538        }
7539        response
7540    }
7541
7542    async fn release_fork_ref(
7543        group: &mut Box<dyn GroupEngine>,
7544        metrics: Arc<RuntimeMetricsInner>,
7545        read_materialization: Arc<Semaphore>,
7546        read_watchers: &mut ReadWatchers,
7547        stream_id: BucketStreamId,
7548        placement: ShardPlacement,
7549    ) -> Result<ForkRefResponse, RuntimeError> {
7550        let started_at = Instant::now();
7551        let exec_started_at = Instant::now();
7552        let response = group
7553            .release_fork_ref(stream_id.clone(), placement)
7554            .await
7555            .map_err(|err| RuntimeError::group_engine(placement, err));
7556        metrics.record_group_engine_exec(
7557            placement.core_id,
7558            placement.raft_group_id,
7559            elapsed_ns(exec_started_at),
7560        );
7561        if response.is_ok() {
7562            metrics.record_applied_mutation(
7563                placement.core_id,
7564                placement.raft_group_id,
7565                elapsed_ns(started_at),
7566            );
7567            Self::notify_read_watchers(
7568                group,
7569                metrics,
7570                read_materialization,
7571                read_watchers,
7572                &stream_id,
7573                placement,
7574            )
7575            .await;
7576        }
7577        response
7578    }
7579
7580    async fn delete_stream(
7581        group: &mut Box<dyn GroupEngine>,
7582        metrics: Arc<RuntimeMetricsInner>,
7583        read_materialization: Arc<Semaphore>,
7584        read_watchers: &mut ReadWatchers,
7585        request: DeleteStreamRequest,
7586        placement: ShardPlacement,
7587    ) -> Result<DeleteStreamResponse, RuntimeError> {
7588        let stream_id = request.stream_id.clone();
7589        let started_at = Instant::now();
7590        let exec_started_at = Instant::now();
7591        let response = group
7592            .delete_stream(request, placement)
7593            .await
7594            .map_err(|err| RuntimeError::group_engine(placement, err));
7595        metrics.record_group_engine_exec(
7596            placement.core_id,
7597            placement.raft_group_id,
7598            elapsed_ns(exec_started_at),
7599        );
7600        if response.is_ok() {
7601            metrics.record_applied_mutation(
7602                placement.core_id,
7603                placement.raft_group_id,
7604                elapsed_ns(started_at),
7605            );
7606            record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7607            Self::notify_read_watchers(
7608                group,
7609                metrics,
7610                read_materialization,
7611                read_watchers,
7612                &stream_id,
7613                placement,
7614            )
7615            .await;
7616        }
7617        response
7618    }
7619
7620    async fn flush_cold(
7621        group: &mut Box<dyn GroupEngine>,
7622        metrics: Arc<RuntimeMetricsInner>,
7623        read_materialization: Arc<Semaphore>,
7624        read_watchers: &mut ReadWatchers,
7625        request: FlushColdRequest,
7626        placement: ShardPlacement,
7627    ) -> Result<FlushColdResponse, RuntimeError> {
7628        let stream_id = request.stream_id.clone();
7629        let started_at = Instant::now();
7630        let exec_started_at = Instant::now();
7631        let response = group
7632            .flush_cold(request, placement)
7633            .await
7634            .map_err(|err| RuntimeError::group_engine(placement, err));
7635        metrics.record_group_engine_exec(
7636            placement.core_id,
7637            placement.raft_group_id,
7638            elapsed_ns(exec_started_at),
7639        );
7640        if response.is_ok() {
7641            metrics.record_applied_mutation(
7642                placement.core_id,
7643                placement.raft_group_id,
7644                elapsed_ns(started_at),
7645            );
7646            record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7647            Self::notify_read_watchers(
7648                group,
7649                metrics,
7650                read_materialization,
7651                read_watchers,
7652                &stream_id,
7653                placement,
7654            )
7655            .await;
7656        }
7657        response
7658    }
7659
7660    async fn flush_cold_batch(
7661        group: &mut Box<dyn GroupEngine>,
7662        metrics: Arc<RuntimeMetricsInner>,
7663        read_materialization: Arc<Semaphore>,
7664        read_watchers: &mut ReadWatchers,
7665        requests: Vec<FlushColdRequest>,
7666        placement: ShardPlacement,
7667    ) -> Result<Vec<FlushColdResponse>, RuntimeError> {
7668        if requests.is_empty() {
7669            return Ok(Vec::new());
7670        }
7671        let stream_ids = requests
7672            .iter()
7673            .map(|request| request.stream_id.clone())
7674            .collect::<Vec<_>>();
7675        let commands = requests
7676            .into_iter()
7677            .map(GroupWriteCommand::from)
7678            .collect::<Vec<_>>();
7679        let started_at = Instant::now();
7680        let exec_started_at = Instant::now();
7681        let response = group
7682            .write_batch(vec![GroupWriteCommand::Batch { commands }], placement)
7683            .await
7684            .map_err(|err| RuntimeError::group_engine(placement, err));
7685        metrics.record_group_engine_exec(
7686            placement.core_id,
7687            placement.raft_group_id,
7688            elapsed_ns(exec_started_at),
7689        );
7690        let mut outer = response?;
7691        let Some(batch_response) = outer.pop() else {
7692            return Err(RuntimeError::group_engine(
7693                placement,
7694                GroupEngineError::new("cold flush batch returned no response"),
7695            ));
7696        };
7697        let items =
7698            match batch_response.map_err(|err| RuntimeError::group_engine(placement, err))? {
7699                GroupWriteResponse::Batch(items) => items,
7700                other => {
7701                    return Err(RuntimeError::group_engine(
7702                        placement,
7703                        GroupEngineError::new(format!(
7704                            "unexpected cold flush batch response: {other:?}"
7705                        )),
7706                    ));
7707                }
7708            };
7709        let mut responses = Vec::with_capacity(items.len());
7710        let mutation_ns = elapsed_ns(started_at);
7711        for (index, item) in items.into_iter().enumerate() {
7712            match item.map_err(|err| RuntimeError::group_engine(placement, err))? {
7713                GroupWriteResponse::FlushCold(response) => {
7714                    metrics.record_applied_mutation(
7715                        placement.core_id,
7716                        placement.raft_group_id,
7717                        mutation_ns,
7718                    );
7719                    if let Some(stream_id) = stream_ids.get(index) {
7720                        record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement)
7721                            .await;
7722                        Self::notify_read_watchers(
7723                            group,
7724                            metrics.clone(),
7725                            read_materialization.clone(),
7726                            read_watchers,
7727                            stream_id,
7728                            placement,
7729                        )
7730                        .await;
7731                    }
7732                    responses.push(response);
7733                }
7734                other => {
7735                    return Err(RuntimeError::group_engine(
7736                        placement,
7737                        GroupEngineError::new(format!(
7738                            "unexpected cold flush batch item response: {other:?}"
7739                        )),
7740                    ));
7741                }
7742            }
7743        }
7744        Ok(responses)
7745    }
7746
7747    async fn plan_cold_flush(
7748        group: &mut Box<dyn GroupEngine>,
7749        metrics: Arc<RuntimeMetricsInner>,
7750        request: PlanColdFlushRequest,
7751        placement: ShardPlacement,
7752    ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
7753        let exec_started_at = Instant::now();
7754        let response = group
7755            .plan_cold_flush(request, placement)
7756            .await
7757            .map_err(|err| RuntimeError::group_engine(placement, err));
7758        metrics.record_group_engine_exec(
7759            placement.core_id,
7760            placement.raft_group_id,
7761            elapsed_ns(exec_started_at),
7762        );
7763        response
7764    }
7765
7766    async fn plan_next_cold_flush(
7767        group: &mut Box<dyn GroupEngine>,
7768        metrics: Arc<RuntimeMetricsInner>,
7769        request: PlanGroupColdFlushRequest,
7770        placement: ShardPlacement,
7771    ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
7772        if !group.accepts_local_writes() {
7773            return Ok(None);
7774        }
7775        let exec_started_at = Instant::now();
7776        let response = group
7777            .plan_next_cold_flush(request, placement)
7778            .await
7779            .map_err(|err| RuntimeError::group_engine(placement, err));
7780        metrics.record_group_engine_exec(
7781            placement.core_id,
7782            placement.raft_group_id,
7783            elapsed_ns(exec_started_at),
7784        );
7785        response
7786    }
7787
7788    async fn plan_next_cold_flush_batch(
7789        group: &mut Box<dyn GroupEngine>,
7790        metrics: Arc<RuntimeMetricsInner>,
7791        request: PlanGroupColdFlushRequest,
7792        placement: ShardPlacement,
7793        max_candidates: usize,
7794    ) -> Result<Vec<ColdFlushCandidate>, RuntimeError> {
7795        if !group.accepts_local_writes() {
7796            return Ok(Vec::new());
7797        }
7798        let exec_started_at = Instant::now();
7799        let response = group
7800            .plan_next_cold_flush_batch(request, placement, max_candidates)
7801            .await
7802            .map_err(|err| RuntimeError::group_engine(placement, err));
7803        metrics.record_group_engine_exec(
7804            placement.core_id,
7805            placement.raft_group_id,
7806            elapsed_ns(exec_started_at),
7807        );
7808        response
7809    }
7810
7811    async fn head_stream(
7812        group: &mut Box<dyn GroupEngine>,
7813        metrics: Arc<RuntimeMetricsInner>,
7814        request: HeadStreamRequest,
7815        placement: ShardPlacement,
7816    ) -> Result<HeadStreamResponse, RuntimeError> {
7817        let exec_started_at = Instant::now();
7818        let response = group
7819            .head_stream(request, placement)
7820            .await
7821            .map_err(|err| RuntimeError::group_engine(placement, err));
7822        metrics.record_group_engine_exec(
7823            placement.core_id,
7824            placement.raft_group_id,
7825            elapsed_ns(exec_started_at),
7826        );
7827        response
7828    }
7829
7830    async fn snapshot_group(
7831        group: &mut Box<dyn GroupEngine>,
7832        metrics: Arc<RuntimeMetricsInner>,
7833        placement: ShardPlacement,
7834    ) -> Result<GroupSnapshot, RuntimeError> {
7835        let exec_started_at = Instant::now();
7836        let response = group
7837            .snapshot(placement)
7838            .await
7839            .map_err(|err| RuntimeError::group_engine(placement, err));
7840        metrics.record_group_engine_exec(
7841            placement.core_id,
7842            placement.raft_group_id,
7843            elapsed_ns(exec_started_at),
7844        );
7845        response
7846    }
7847
7848    async fn install_group_snapshot(
7849        group: &mut Box<dyn GroupEngine>,
7850        metrics: Arc<RuntimeMetricsInner>,
7851        snapshot: GroupSnapshot,
7852    ) -> Result<(), RuntimeError> {
7853        let placement = snapshot.placement;
7854        let exec_started_at = Instant::now();
7855        let response = group
7856            .install_snapshot(snapshot)
7857            .await
7858            .map_err(|err| RuntimeError::group_engine(placement, err));
7859        metrics.record_group_engine_exec(
7860            placement.core_id,
7861            placement.raft_group_id,
7862            elapsed_ns(exec_started_at),
7863        );
7864        response
7865    }
7866
7867    async fn create_stream(
7868        group: &mut Box<dyn GroupEngine>,
7869        metrics: Arc<RuntimeMetricsInner>,
7870        request: CreateStreamRequest,
7871        placement: ShardPlacement,
7872        admission: ColdWriteAdmission,
7873    ) -> Result<CreateStreamResponse, RuntimeError> {
7874        let stream_id = request.stream_id.clone();
7875        let incoming_bytes =
7876            u64::try_from(request.initial_payload.len()).expect("payload len fits u64");
7877        let started_at = Instant::now();
7878        let exec_started_at = Instant::now();
7879        let response = group
7880            .create_stream_with_cold_admission(request, placement, admission)
7881            .await
7882            .map_err(|err| {
7883                record_cold_backpressure_error(
7884                    &metrics,
7885                    placement,
7886                    incoming_bytes,
7887                    admission,
7888                    &err,
7889                );
7890                RuntimeError::group_engine(placement, err)
7891            })?;
7892        metrics.record_group_engine_exec(
7893            placement.core_id,
7894            placement.raft_group_id,
7895            elapsed_ns(exec_started_at),
7896        );
7897        if !response.already_exists {
7898            metrics.record_applied_mutation(
7899                placement.core_id,
7900                placement.raft_group_id,
7901                elapsed_ns(started_at),
7902            );
7903            record_cold_hot_backlog(group, &metrics, stream_id, placement).await;
7904        }
7905        Ok(response)
7906    }
7907
7908    async fn create_stream_external(
7909        group: &mut Box<dyn GroupEngine>,
7910        metrics: Arc<RuntimeMetricsInner>,
7911        request: CreateStreamExternalRequest,
7912        placement: ShardPlacement,
7913    ) -> Result<CreateStreamResponse, RuntimeError> {
7914        let stream_id = request.stream_id.clone();
7915        let started_at = Instant::now();
7916        let exec_started_at = Instant::now();
7917        let response = group
7918            .create_stream_external(request, placement)
7919            .await
7920            .map_err(|err| RuntimeError::group_engine(placement, err))?;
7921        metrics.record_group_engine_exec(
7922            placement.core_id,
7923            placement.raft_group_id,
7924            elapsed_ns(exec_started_at),
7925        );
7926        if !response.already_exists {
7927            metrics.record_applied_mutation(
7928                placement.core_id,
7929                placement.raft_group_id,
7930                elapsed_ns(started_at),
7931            );
7932            record_cold_hot_backlog(group, &metrics, stream_id, placement).await;
7933        }
7934        Ok(response)
7935    }
7936
7937    async fn apply_append(
7938        group: &mut Box<dyn GroupEngine>,
7939        metrics: Arc<RuntimeMetricsInner>,
7940        read_materialization: Arc<Semaphore>,
7941        read_watchers: &mut ReadWatchers,
7942        request: AppendRequest,
7943        placement: ShardPlacement,
7944        admission: ColdWriteAdmission,
7945    ) -> Result<AppendResponse, RuntimeError> {
7946        let stream_id = request.stream_id.clone();
7947        let incoming_bytes = request.payload_len();
7948        let started_at = Instant::now();
7949        let exec_started_at = Instant::now();
7950        let response = group
7951            .append_with_cold_admission(request, placement, admission)
7952            .await
7953            .map_err(|err| {
7954                record_cold_backpressure_error(
7955                    &metrics,
7956                    placement,
7957                    incoming_bytes,
7958                    admission,
7959                    &err,
7960                );
7961                RuntimeError::group_engine(placement, err)
7962            })?;
7963        metrics.record_group_engine_exec(
7964            placement.core_id,
7965            placement.raft_group_id,
7966            elapsed_ns(exec_started_at),
7967        );
7968
7969        if !response.deduplicated {
7970            metrics.record_append(placement.core_id, placement.raft_group_id);
7971            metrics.record_applied_mutation(
7972                placement.core_id,
7973                placement.raft_group_id,
7974                elapsed_ns(started_at),
7975            );
7976            record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7977            Self::notify_read_watchers(
7978                group,
7979                metrics,
7980                read_materialization,
7981                read_watchers,
7982                &stream_id,
7983                placement,
7984            )
7985            .await;
7986        }
7987        Ok(response)
7988    }
7989
7990    async fn apply_append_external(
7991        group: &mut Box<dyn GroupEngine>,
7992        metrics: Arc<RuntimeMetricsInner>,
7993        read_materialization: Arc<Semaphore>,
7994        read_watchers: &mut ReadWatchers,
7995        request: AppendExternalRequest,
7996        placement: ShardPlacement,
7997    ) -> Result<AppendResponse, RuntimeError> {
7998        let stream_id = request.stream_id.clone();
7999        let started_at = Instant::now();
8000        let exec_started_at = Instant::now();
8001        let response = group
8002            .append_external(request, placement)
8003            .await
8004            .map_err(|err| RuntimeError::group_engine(placement, err))?;
8005        metrics.record_group_engine_exec(
8006            placement.core_id,
8007            placement.raft_group_id,
8008            elapsed_ns(exec_started_at),
8009        );
8010
8011        if !response.deduplicated {
8012            metrics.record_append(placement.core_id, placement.raft_group_id);
8013            metrics.record_applied_mutation(
8014                placement.core_id,
8015                placement.raft_group_id,
8016                elapsed_ns(started_at),
8017            );
8018            record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
8019            Self::notify_read_watchers(
8020                group,
8021                metrics,
8022                read_materialization,
8023                read_watchers,
8024                &stream_id,
8025                placement,
8026            )
8027            .await;
8028        }
8029        Ok(response)
8030    }
8031
8032    fn prepare_append_batch_commands(
8033        batch: Vec<(
8034            AppendBatchRequest,
8035            oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
8036        )>,
8037    ) -> (Vec<GroupWriteCommand>, Vec<PendingAppendBatch>) {
8038        let mut commands = Vec::with_capacity(batch.len());
8039        let mut pending = Vec::with_capacity(batch.len());
8040        for (request, response_tx) in batch {
8041            pending.push(PendingAppendBatch {
8042                stream_id: request.stream_id.clone(),
8043                incoming_bytes: append_batch_payload_bytes(&request),
8044                response_tx,
8045                started_at: Instant::now(),
8046            });
8047            commands.push(GroupWriteCommand::from(request));
8048        }
8049        (commands, pending)
8050    }
8051
8052    fn prepare_append_batch_requests(
8053        batch: Vec<(
8054            AppendBatchRequest,
8055            oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
8056        )>,
8057    ) -> (Vec<AppendBatchRequest>, Vec<PendingAppendBatch>) {
8058        let mut requests = Vec::with_capacity(batch.len());
8059        let mut pending = Vec::with_capacity(batch.len());
8060        for (request, response_tx) in batch {
8061            pending.push(PendingAppendBatch {
8062                stream_id: request.stream_id.clone(),
8063                incoming_bytes: append_batch_payload_bytes(&request),
8064                response_tx,
8065                started_at: Instant::now(),
8066            });
8067            requests.push(request);
8068        }
8069        (requests, pending)
8070    }
8071
8072    async fn apply_prepared_append_batch_requests_with_cold_admission(
8073        group: &mut Box<dyn GroupEngine>,
8074        runtime: AppendBatchRuntime,
8075        read_watchers: &mut ReadWatchers,
8076        pending: Vec<PendingAppendBatch>,
8077        requests: Vec<AppendBatchRequest>,
8078        admission: ColdWriteAdmission,
8079    ) {
8080        let exec_started_at = Instant::now();
8081        let responses = group
8082            .append_batch_many_with_cold_admission(requests, runtime.placement, admission)
8083            .await
8084            .map_err(|err| RuntimeError::group_engine(runtime.placement, err));
8085        runtime.metrics.record_group_engine_exec(
8086            runtime.placement.core_id,
8087            runtime.placement.raft_group_id,
8088            elapsed_ns(exec_started_at),
8089        );
8090        Self::finish_append_batch_commands(
8091            group,
8092            runtime,
8093            read_watchers,
8094            pending,
8095            responses,
8096            Some(admission),
8097        )
8098        .await;
8099    }
8100
8101    async fn apply_prepared_append_batch_commands(
8102        group: &mut Box<dyn GroupEngine>,
8103        runtime: AppendBatchRuntime,
8104        read_watchers: &mut ReadWatchers,
8105        pending: Vec<PendingAppendBatch>,
8106        commands: Vec<GroupWriteCommand>,
8107    ) {
8108        let exec_started_at = Instant::now();
8109        let responses = group
8110            .write_batch(commands, runtime.placement)
8111            .await
8112            .map_err(|err| RuntimeError::group_engine(runtime.placement, err));
8113        runtime.metrics.record_group_engine_exec(
8114            runtime.placement.core_id,
8115            runtime.placement.raft_group_id,
8116            elapsed_ns(exec_started_at),
8117        );
8118        Self::finish_append_batch_commands(group, runtime, read_watchers, pending, responses, None)
8119            .await;
8120    }
8121
8122    async fn finish_append_batch_commands(
8123        group: &mut Box<dyn GroupEngine>,
8124        runtime: AppendBatchRuntime,
8125        read_watchers: &mut ReadWatchers,
8126        pending: Vec<PendingAppendBatch>,
8127        responses: Result<Vec<Result<GroupWriteResponse, GroupEngineError>>, RuntimeError>,
8128        admission: Option<ColdWriteAdmission>,
8129    ) {
8130        let placement = runtime.placement;
8131        let responses = match responses {
8132            Ok(responses) => responses,
8133            Err(err) => {
8134                for pending in pending {
8135                    if let Some(admission) = admission
8136                        && let RuntimeError::GroupEngine { message, .. } = &err
8137                        && message.contains("ColdBackpressure")
8138                    {
8139                        runtime.metrics.record_cold_backpressure(
8140                            placement.core_id,
8141                            placement.raft_group_id,
8142                            pending.incoming_bytes,
8143                            admission.max_hot_bytes_per_group.unwrap_or(0),
8144                        );
8145                    }
8146                    let _ = pending.response_tx.send(Err(err.clone()));
8147                }
8148                return;
8149            }
8150        };
8151
8152        if responses.len() != pending.len() {
8153            let err = RuntimeError::GroupEngine {
8154                core_id: placement.core_id,
8155                raft_group_id: placement.raft_group_id,
8156                message: format!(
8157                    "batched append response count {} does not match request count {}",
8158                    responses.len(),
8159                    pending.len()
8160                ),
8161                next_offset: None,
8162                leader_hint: None,
8163            };
8164            for pending in pending {
8165                let _ = pending.response_tx.send(Err(err.clone()));
8166            }
8167            return;
8168        }
8169
8170        for (pending, response) in pending.into_iter().zip(responses) {
8171            let response = match response {
8172                Ok(GroupWriteResponse::AppendBatch(response)) => Ok(response),
8173                Ok(other) => Err(RuntimeError::GroupEngine {
8174                    core_id: placement.core_id,
8175                    raft_group_id: placement.raft_group_id,
8176                    message: format!("unexpected batched append response: {other:?}"),
8177                    next_offset: None,
8178                    leader_hint: None,
8179                }),
8180                Err(err) => Err(RuntimeError::group_engine(placement, err)),
8181            };
8182
8183            match response {
8184                Ok(response) => {
8185                    let success_count = response
8186                        .items
8187                        .iter()
8188                        .filter(|item| matches!(item, Ok(response) if !response.deduplicated))
8189                        .count();
8190                    if success_count > 0 {
8191                        let success_count = u64::try_from(success_count).expect("count fits u64");
8192                        runtime.metrics.record_append_batch(
8193                            placement.core_id,
8194                            placement.raft_group_id,
8195                            success_count,
8196                        );
8197                        runtime.metrics.record_applied_mutation_batch(
8198                            placement.core_id,
8199                            placement.raft_group_id,
8200                            success_count,
8201                            elapsed_ns(pending.started_at),
8202                        );
8203                        Self::notify_read_watchers(
8204                            group,
8205                            runtime.metrics.clone(),
8206                            runtime.read_materialization.clone(),
8207                            read_watchers,
8208                            &pending.stream_id,
8209                            placement,
8210                        )
8211                        .await;
8212                    }
8213
8214                    let items = response
8215                        .items
8216                        .into_iter()
8217                        .map(|item| item.map_err(|err| RuntimeError::group_engine(placement, err)))
8218                        .collect();
8219                    let _ = pending
8220                        .response_tx
8221                        .send(Ok(AppendBatchResponse { placement, items }));
8222                }
8223                Err(err) => {
8224                    if let Some(admission) = admission
8225                        && let RuntimeError::GroupEngine { message, .. } = &err
8226                        && message.contains("ColdBackpressure")
8227                    {
8228                        runtime.metrics.record_cold_backpressure(
8229                            placement.core_id,
8230                            placement.raft_group_id,
8231                            pending.incoming_bytes,
8232                            admission.max_hot_bytes_per_group.unwrap_or(0),
8233                        );
8234                    }
8235                    let _ = pending.response_tx.send(Err(err));
8236                }
8237            }
8238        }
8239    }
8240
8241    async fn notify_read_watchers(
8242        group: &mut Box<dyn GroupEngine>,
8243        metrics: Arc<RuntimeMetricsInner>,
8244        read_materialization: Arc<Semaphore>,
8245        read_watchers: &mut ReadWatchers,
8246        stream_id: &BucketStreamId,
8247        placement: ShardPlacement,
8248    ) {
8249        let Some(watchers) = read_watchers.remove(stream_id) else {
8250            return;
8251        };
8252        metrics.record_read_watchers_removed(placement.core_id, watchers.len());
8253
8254        let mut request_groups: Vec<(ReadStreamRequest, Vec<ReadWatcher>)> = Vec::new();
8255        for watcher in watchers {
8256            if let Some((_, grouped)) = request_groups
8257                .iter_mut()
8258                .find(|(request, _)| *request == watcher.request)
8259            {
8260                grouped.push(watcher);
8261            } else {
8262                request_groups.push((watcher.request.clone(), vec![watcher]));
8263            }
8264        }
8265
8266        let mut pending = Vec::new();
8267        for (request, watchers) in request_groups {
8268            let parts = group
8269                .read_stream_parts(request, placement)
8270                .await
8271                .map_err(|err| RuntimeError::group_engine(placement, err));
8272            match parts {
8273                Ok(parts) if parts.payload_is_empty() && parts.up_to_date && !parts.closed => {
8274                    pending.extend(watchers);
8275                }
8276                Ok(parts) => {
8277                    Self::send_read_parts_to_watchers(
8278                        placement,
8279                        read_materialization.clone(),
8280                        parts,
8281                        watchers,
8282                    );
8283                }
8284                Err(err) => {
8285                    for watcher in watchers {
8286                        let _ = watcher.response_tx.send(Err(err.clone()));
8287                    }
8288                }
8289            }
8290        }
8291
8292        if !pending.is_empty() {
8293            metrics.record_read_watchers_added(placement.core_id, pending.len());
8294            read_watchers
8295                .entry(stream_id.clone())
8296                .or_default()
8297                .extend(pending);
8298        }
8299    }
8300}
8301
/// Public handle to the runtime's metric counters.
///
/// The counters live in a shared [`RuntimeMetricsInner`] behind an `Arc`, so
/// cloning this handle yields another view of the same counters.
#[derive(Debug, Clone)]
pub struct RuntimeMetrics {
    // Shared counter storage read by `snapshot`.
    inner: Arc<RuntimeMetricsInner>,
}
8306
8307impl RuntimeMetrics {
8308    pub fn snapshot(&self) -> RuntimeMetricsSnapshot {
8309        let per_core_appends = self
8310            .inner
8311            .per_core_appends
8312            .iter()
8313            .map(PaddedAtomicU64::load_relaxed)
8314            .collect::<Vec<_>>();
8315        let accepted_appends = per_core_appends.iter().sum();
8316        let per_group_appends = self
8317            .inner
8318            .per_group_appends
8319            .iter()
8320            .map(PaddedAtomicU64::load_relaxed)
8321            .collect();
8322        let per_core_applied_mutations = self
8323            .inner
8324            .per_core_applied_mutations
8325            .iter()
8326            .map(PaddedAtomicU64::load_relaxed)
8327            .collect::<Vec<_>>();
8328        let applied_mutations = per_core_applied_mutations.iter().sum();
8329        let per_group_applied_mutations = self
8330            .inner
8331            .per_group_applied_mutations
8332            .iter()
8333            .map(PaddedAtomicU64::load_relaxed)
8334            .collect();
8335        let per_core_mutation_apply_ns = self
8336            .inner
8337            .per_core_mutation_apply_ns
8338            .iter()
8339            .map(PaddedAtomicU64::load_relaxed)
8340            .collect::<Vec<_>>();
8341        let mutation_apply_ns = per_core_mutation_apply_ns.iter().sum();
8342        let per_group_mutation_apply_ns = self
8343            .inner
8344            .per_group_mutation_apply_ns
8345            .iter()
8346            .map(PaddedAtomicU64::load_relaxed)
8347            .collect();
8348        let per_core_group_lock_wait_ns = self
8349            .inner
8350            .per_core_group_lock_wait_ns
8351            .iter()
8352            .map(PaddedAtomicU64::load_relaxed)
8353            .collect::<Vec<_>>();
8354        let group_lock_wait_ns = per_core_group_lock_wait_ns.iter().sum();
8355        let per_group_group_lock_wait_ns = self
8356            .inner
8357            .per_group_group_lock_wait_ns
8358            .iter()
8359            .map(PaddedAtomicU64::load_relaxed)
8360            .collect();
8361        let per_core_group_engine_exec_ns = self
8362            .inner
8363            .per_core_group_engine_exec_ns
8364            .iter()
8365            .map(PaddedAtomicU64::load_relaxed)
8366            .collect::<Vec<_>>();
8367        let group_engine_exec_ns = per_core_group_engine_exec_ns.iter().sum();
8368        let per_group_group_engine_exec_ns = self
8369            .inner
8370            .per_group_group_engine_exec_ns
8371            .iter()
8372            .map(PaddedAtomicU64::load_relaxed)
8373            .collect();
8374        let per_group_group_mailbox_depth = self
8375            .inner
8376            .per_group_group_mailbox_depth
8377            .iter()
8378            .map(PaddedAtomicU64::load_relaxed)
8379            .collect::<Vec<_>>();
8380        let group_mailbox_depth = per_group_group_mailbox_depth.iter().sum();
8381        let per_group_group_mailbox_max_depth = self
8382            .inner
8383            .per_group_group_mailbox_max_depth
8384            .iter()
8385            .map(PaddedAtomicU64::load_relaxed)
8386            .collect::<Vec<_>>();
8387        let group_mailbox_max_depth = per_group_group_mailbox_max_depth
8388            .iter()
8389            .copied()
8390            .max()
8391            .unwrap_or(0);
8392        let per_group_group_mailbox_full_events = self
8393            .inner
8394            .per_group_group_mailbox_full_events
8395            .iter()
8396            .map(PaddedAtomicU64::load_relaxed)
8397            .collect::<Vec<_>>();
8398        let group_mailbox_full_events = per_group_group_mailbox_full_events.iter().sum();
8399        let per_core_raft_write_many_batches = self
8400            .inner
8401            .per_core_raft_write_many_batches
8402            .iter()
8403            .map(PaddedAtomicU64::load_relaxed)
8404            .collect::<Vec<_>>();
8405        let raft_write_many_batches = per_core_raft_write_many_batches.iter().sum();
8406        let per_group_raft_write_many_batches = self
8407            .inner
8408            .per_group_raft_write_many_batches
8409            .iter()
8410            .map(PaddedAtomicU64::load_relaxed)
8411            .collect();
8412        let per_core_raft_write_many_commands = self
8413            .inner
8414            .per_core_raft_write_many_commands
8415            .iter()
8416            .map(PaddedAtomicU64::load_relaxed)
8417            .collect::<Vec<_>>();
8418        let raft_write_many_commands = per_core_raft_write_many_commands.iter().sum();
8419        let per_group_raft_write_many_commands = self
8420            .inner
8421            .per_group_raft_write_many_commands
8422            .iter()
8423            .map(PaddedAtomicU64::load_relaxed)
8424            .collect();
8425        let per_core_raft_write_many_logical_commands = self
8426            .inner
8427            .per_core_raft_write_many_logical_commands
8428            .iter()
8429            .map(PaddedAtomicU64::load_relaxed)
8430            .collect::<Vec<_>>();
8431        let raft_write_many_logical_commands =
8432            per_core_raft_write_many_logical_commands.iter().sum();
8433        let per_group_raft_write_many_logical_commands = self
8434            .inner
8435            .per_group_raft_write_many_logical_commands
8436            .iter()
8437            .map(PaddedAtomicU64::load_relaxed)
8438            .collect();
8439        let per_core_raft_write_many_responses = self
8440            .inner
8441            .per_core_raft_write_many_responses
8442            .iter()
8443            .map(PaddedAtomicU64::load_relaxed)
8444            .collect::<Vec<_>>();
8445        let raft_write_many_responses = per_core_raft_write_many_responses.iter().sum();
8446        let per_group_raft_write_many_responses = self
8447            .inner
8448            .per_group_raft_write_many_responses
8449            .iter()
8450            .map(PaddedAtomicU64::load_relaxed)
8451            .collect();
8452        let per_core_raft_write_many_submit_ns = self
8453            .inner
8454            .per_core_raft_write_many_submit_ns
8455            .iter()
8456            .map(PaddedAtomicU64::load_relaxed)
8457            .collect::<Vec<_>>();
8458        let raft_write_many_submit_ns = per_core_raft_write_many_submit_ns.iter().sum();
8459        let per_group_raft_write_many_submit_ns = self
8460            .inner
8461            .per_group_raft_write_many_submit_ns
8462            .iter()
8463            .map(PaddedAtomicU64::load_relaxed)
8464            .collect();
8465        let per_core_raft_write_many_response_ns = self
8466            .inner
8467            .per_core_raft_write_many_response_ns
8468            .iter()
8469            .map(PaddedAtomicU64::load_relaxed)
8470            .collect::<Vec<_>>();
8471        let raft_write_many_response_ns = per_core_raft_write_many_response_ns.iter().sum();
8472        let per_group_raft_write_many_response_ns = self
8473            .inner
8474            .per_group_raft_write_many_response_ns
8475            .iter()
8476            .map(PaddedAtomicU64::load_relaxed)
8477            .collect();
8478        let per_core_raft_apply_entries = self
8479            .inner
8480            .per_core_raft_apply_entries
8481            .iter()
8482            .map(PaddedAtomicU64::load_relaxed)
8483            .collect::<Vec<_>>();
8484        let raft_apply_entries = per_core_raft_apply_entries.iter().sum();
8485        let per_group_raft_apply_entries = self
8486            .inner
8487            .per_group_raft_apply_entries
8488            .iter()
8489            .map(PaddedAtomicU64::load_relaxed)
8490            .collect();
8491        let per_core_raft_apply_ns = self
8492            .inner
8493            .per_core_raft_apply_ns
8494            .iter()
8495            .map(PaddedAtomicU64::load_relaxed)
8496            .collect::<Vec<_>>();
8497        let raft_apply_ns = per_core_raft_apply_ns.iter().sum();
8498        let per_group_raft_apply_ns = self
8499            .inner
8500            .per_group_raft_apply_ns
8501            .iter()
8502            .map(PaddedAtomicU64::load_relaxed)
8503            .collect();
8504        let per_core_live_read_waiters = self
8505            .inner
8506            .per_core_live_read_waiters
8507            .iter()
8508            .map(PaddedAtomicU64::load_relaxed)
8509            .collect::<Vec<_>>();
8510        let live_read_waiters = per_core_live_read_waiters.iter().sum();
8511        let per_core_live_read_backpressure_events = self
8512            .inner
8513            .per_core_live_read_backpressure_events
8514            .iter()
8515            .map(PaddedAtomicU64::load_relaxed)
8516            .collect::<Vec<_>>();
8517        let live_read_backpressure_events = per_core_live_read_backpressure_events.iter().sum();
8518        let per_core_routed_requests = self
8519            .inner
8520            .per_core_routed_requests
8521            .iter()
8522            .map(PaddedAtomicU64::load_relaxed)
8523            .collect::<Vec<_>>();
8524        let routed_requests = per_core_routed_requests.iter().sum();
8525        let per_core_mailbox_send_wait_ns = self
8526            .inner
8527            .per_core_mailbox_send_wait_ns
8528            .iter()
8529            .map(PaddedAtomicU64::load_relaxed)
8530            .collect::<Vec<_>>();
8531        let mailbox_send_wait_ns = per_core_mailbox_send_wait_ns.iter().sum();
8532        let per_core_mailbox_full_events = self
8533            .inner
8534            .per_core_mailbox_full_events
8535            .iter()
8536            .map(PaddedAtomicU64::load_relaxed)
8537            .collect::<Vec<_>>();
8538        let mailbox_full_events = per_core_mailbox_full_events.iter().sum();
8539        let per_core_wal_batches = self
8540            .inner
8541            .per_core_wal_batches
8542            .iter()
8543            .map(PaddedAtomicU64::load_relaxed)
8544            .collect::<Vec<_>>();
8545        let wal_batches = per_core_wal_batches.iter().sum();
8546        let per_group_wal_batches = self
8547            .inner
8548            .per_group_wal_batches
8549            .iter()
8550            .map(PaddedAtomicU64::load_relaxed)
8551            .collect();
8552        let per_core_wal_records = self
8553            .inner
8554            .per_core_wal_records
8555            .iter()
8556            .map(PaddedAtomicU64::load_relaxed)
8557            .collect::<Vec<_>>();
8558        let wal_records = per_core_wal_records.iter().sum();
8559        let per_group_wal_records = self
8560            .inner
8561            .per_group_wal_records
8562            .iter()
8563            .map(PaddedAtomicU64::load_relaxed)
8564            .collect();
8565        let per_core_wal_write_ns = self
8566            .inner
8567            .per_core_wal_write_ns
8568            .iter()
8569            .map(PaddedAtomicU64::load_relaxed)
8570            .collect::<Vec<_>>();
8571        let wal_write_ns = per_core_wal_write_ns.iter().sum();
8572        let per_group_wal_write_ns = self
8573            .inner
8574            .per_group_wal_write_ns
8575            .iter()
8576            .map(PaddedAtomicU64::load_relaxed)
8577            .collect();
8578        let per_core_wal_sync_ns = self
8579            .inner
8580            .per_core_wal_sync_ns
8581            .iter()
8582            .map(PaddedAtomicU64::load_relaxed)
8583            .collect::<Vec<_>>();
8584        let wal_sync_ns = per_core_wal_sync_ns.iter().sum();
8585        let per_group_wal_sync_ns = self
8586            .inner
8587            .per_group_wal_sync_ns
8588            .iter()
8589            .map(PaddedAtomicU64::load_relaxed)
8590            .collect();
8591        let cold_flush_uploads = self.inner.cold_flush_uploads.load_relaxed();
8592        let cold_flush_upload_bytes = self.inner.cold_flush_upload_bytes.load_relaxed();
8593        let cold_flush_upload_ns = self.inner.cold_flush_upload_ns.load_relaxed();
8594        let cold_flush_publishes = self.inner.cold_flush_publishes.load_relaxed();
8595        let cold_flush_publish_bytes = self.inner.cold_flush_publish_bytes.load_relaxed();
8596        let cold_flush_publish_ns = self.inner.cold_flush_publish_ns.load_relaxed();
8597        let cold_orphan_cleanup_attempts = self.inner.cold_orphan_cleanup_attempts.load_relaxed();
8598        let cold_orphan_cleanup_errors = self.inner.cold_orphan_cleanup_errors.load_relaxed();
8599        let cold_orphan_bytes = self.inner.cold_orphan_bytes.load_relaxed();
8600        let per_group_cold_hot_bytes = self
8601            .inner
8602            .per_group_cold_hot_bytes
8603            .iter()
8604            .map(PaddedAtomicU64::load_relaxed)
8605            .collect::<Vec<_>>();
8606        let cold_hot_bytes = per_group_cold_hot_bytes.iter().sum();
8607        let per_group_cold_hot_bytes_max = self
8608            .inner
8609            .per_group_cold_hot_bytes_max
8610            .iter()
8611            .map(PaddedAtomicU64::load_relaxed)
8612            .collect::<Vec<_>>();
8613        let cold_hot_group_bytes_max = per_group_cold_hot_bytes_max
8614            .iter()
8615            .copied()
8616            .max()
8617            .unwrap_or(0);
8618        let cold_hot_stream_bytes_max = self.inner.cold_hot_stream_bytes_max.load_relaxed();
8619        let per_core_cold_backpressure_events = self
8620            .inner
8621            .per_core_cold_backpressure_events
8622            .iter()
8623            .map(PaddedAtomicU64::load_relaxed)
8624            .collect::<Vec<_>>();
8625        let cold_backpressure_events = per_core_cold_backpressure_events.iter().sum();
8626        let per_group_cold_backpressure_events = self
8627            .inner
8628            .per_group_cold_backpressure_events
8629            .iter()
8630            .map(PaddedAtomicU64::load_relaxed)
8631            .collect();
8632        let cold_backpressure_bytes = self.inner.cold_backpressure_bytes.load_relaxed();
8633
8634        RuntimeMetricsSnapshot {
8635            accepted_appends,
8636            per_core_appends,
8637            per_group_appends,
8638            applied_mutations,
8639            per_core_applied_mutations,
8640            per_group_applied_mutations,
8641            mutation_apply_ns,
8642            per_core_mutation_apply_ns,
8643            per_group_mutation_apply_ns,
8644            group_lock_wait_ns,
8645            per_core_group_lock_wait_ns,
8646            per_group_group_lock_wait_ns,
8647            group_engine_exec_ns,
8648            per_core_group_engine_exec_ns,
8649            per_group_group_engine_exec_ns,
8650            group_mailbox_depth,
8651            per_group_group_mailbox_depth,
8652            group_mailbox_max_depth,
8653            per_group_group_mailbox_max_depth,
8654            group_mailbox_full_events,
8655            per_group_group_mailbox_full_events,
8656            raft_write_many_batches,
8657            per_core_raft_write_many_batches,
8658            per_group_raft_write_many_batches,
8659            raft_write_many_commands,
8660            per_core_raft_write_many_commands,
8661            per_group_raft_write_many_commands,
8662            raft_write_many_logical_commands,
8663            per_core_raft_write_many_logical_commands,
8664            per_group_raft_write_many_logical_commands,
8665            raft_write_many_responses,
8666            per_core_raft_write_many_responses,
8667            per_group_raft_write_many_responses,
8668            raft_write_many_submit_ns,
8669            per_core_raft_write_many_submit_ns,
8670            per_group_raft_write_many_submit_ns,
8671            raft_write_many_response_ns,
8672            per_core_raft_write_many_response_ns,
8673            per_group_raft_write_many_response_ns,
8674            raft_apply_entries,
8675            per_core_raft_apply_entries,
8676            per_group_raft_apply_entries,
8677            raft_apply_ns,
8678            per_core_raft_apply_ns,
8679            per_group_raft_apply_ns,
8680            live_read_waiters,
8681            per_core_live_read_waiters,
8682            live_read_backpressure_events,
8683            per_core_live_read_backpressure_events,
8684            routed_requests,
8685            per_core_routed_requests,
8686            mailbox_send_wait_ns,
8687            per_core_mailbox_send_wait_ns,
8688            mailbox_full_events,
8689            per_core_mailbox_full_events,
8690            wal_batches,
8691            per_core_wal_batches,
8692            per_group_wal_batches,
8693            wal_records,
8694            per_core_wal_records,
8695            per_group_wal_records,
8696            wal_write_ns,
8697            per_core_wal_write_ns,
8698            per_group_wal_write_ns,
8699            wal_sync_ns,
8700            per_core_wal_sync_ns,
8701            per_group_wal_sync_ns,
8702            cold_flush_uploads,
8703            cold_flush_upload_bytes,
8704            cold_flush_upload_ns,
8705            cold_flush_publishes,
8706            cold_flush_publish_bytes,
8707            cold_flush_publish_ns,
8708            cold_orphan_cleanup_attempts,
8709            cold_orphan_cleanup_errors,
8710            cold_orphan_bytes,
8711            cold_hot_bytes,
8712            per_group_cold_hot_bytes,
8713            cold_hot_group_bytes_max,
8714            per_group_cold_hot_bytes_max,
8715            cold_hot_stream_bytes_max,
8716            cold_backpressure_events,
8717            per_core_cold_backpressure_events,
8718            per_group_cold_backpressure_events,
8719            cold_backpressure_bytes,
8720        }
8721    }
8722}
8723
/// Plain-value copy of the runtime counters, as produced by
/// `RuntimeMetrics::snapshot`.
///
/// Fields come in families sharing a base name:
///
/// * `per_core_<name>` / `per_group_<name>`: one loaded value per counter
///   slot, in slot order.
/// * `<name>` (no prefix): an aggregate computed by `snapshot`. It is the sum
///   of the `per_core_*` vector when the family has one, otherwise the sum of
///   the `per_group_*` vector. The two exceptions are noted on the fields.
/// * `cold_flush_*`, `cold_orphan_*`, `cold_hot_stream_bytes_max` and
///   `cold_backpressure_bytes` are copied from single counters.
///
/// NOTE(review): the `_ns` / `_bytes` suffixes presumably denote nanoseconds
/// and bytes; the recording sites are not part of this definition.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuntimeMetricsSnapshot {
    // `accepted_appends` is the sum of `per_core_appends` (note the different base name).
    pub accepted_appends: u64,
    pub per_core_appends: Vec<u64>,
    pub per_group_appends: Vec<u64>,
    pub applied_mutations: u64,
    pub per_core_applied_mutations: Vec<u64>,
    pub per_group_applied_mutations: Vec<u64>,
    pub mutation_apply_ns: u64,
    pub per_core_mutation_apply_ns: Vec<u64>,
    pub per_group_mutation_apply_ns: Vec<u64>,
    pub group_lock_wait_ns: u64,
    pub per_core_group_lock_wait_ns: Vec<u64>,
    pub per_group_group_lock_wait_ns: Vec<u64>,
    pub group_engine_exec_ns: u64,
    pub per_core_group_engine_exec_ns: Vec<u64>,
    pub per_group_group_engine_exec_ns: Vec<u64>,
    // Sum of `per_group_group_mailbox_depth`.
    pub group_mailbox_depth: u64,
    pub per_group_group_mailbox_depth: Vec<u64>,
    // Maximum (not sum) of `per_group_group_mailbox_max_depth`; 0 when empty.
    pub group_mailbox_max_depth: u64,
    pub per_group_group_mailbox_max_depth: Vec<u64>,
    // Sum of `per_group_group_mailbox_full_events`.
    pub group_mailbox_full_events: u64,
    pub per_group_group_mailbox_full_events: Vec<u64>,
    pub raft_write_many_batches: u64,
    pub per_core_raft_write_many_batches: Vec<u64>,
    pub per_group_raft_write_many_batches: Vec<u64>,
    pub raft_write_many_commands: u64,
    pub per_core_raft_write_many_commands: Vec<u64>,
    pub per_group_raft_write_many_commands: Vec<u64>,
    pub raft_write_many_logical_commands: u64,
    pub per_core_raft_write_many_logical_commands: Vec<u64>,
    pub per_group_raft_write_many_logical_commands: Vec<u64>,
    pub raft_write_many_responses: u64,
    pub per_core_raft_write_many_responses: Vec<u64>,
    pub per_group_raft_write_many_responses: Vec<u64>,
    pub raft_write_many_submit_ns: u64,
    pub per_core_raft_write_many_submit_ns: Vec<u64>,
    pub per_group_raft_write_many_submit_ns: Vec<u64>,
    pub raft_write_many_response_ns: u64,
    pub per_core_raft_write_many_response_ns: Vec<u64>,
    pub per_group_raft_write_many_response_ns: Vec<u64>,
    pub raft_apply_entries: u64,
    pub per_core_raft_apply_entries: Vec<u64>,
    pub per_group_raft_apply_entries: Vec<u64>,
    pub raft_apply_ns: u64,
    pub per_core_raft_apply_ns: Vec<u64>,
    pub per_group_raft_apply_ns: Vec<u64>,
    pub live_read_waiters: u64,
    pub per_core_live_read_waiters: Vec<u64>,
    pub live_read_backpressure_events: u64,
    pub per_core_live_read_backpressure_events: Vec<u64>,
    pub routed_requests: u64,
    pub per_core_routed_requests: Vec<u64>,
    pub mailbox_send_wait_ns: u64,
    pub per_core_mailbox_send_wait_ns: Vec<u64>,
    pub mailbox_full_events: u64,
    pub per_core_mailbox_full_events: Vec<u64>,
    pub wal_batches: u64,
    pub per_core_wal_batches: Vec<u64>,
    pub per_group_wal_batches: Vec<u64>,
    pub wal_records: u64,
    pub per_core_wal_records: Vec<u64>,
    pub per_group_wal_records: Vec<u64>,
    pub wal_write_ns: u64,
    pub per_core_wal_write_ns: Vec<u64>,
    pub per_group_wal_write_ns: Vec<u64>,
    pub wal_sync_ns: u64,
    pub per_core_wal_sync_ns: Vec<u64>,
    pub per_group_wal_sync_ns: Vec<u64>,
    // Single (not per-core / per-group) cold-tier counters.
    pub cold_flush_uploads: u64,
    pub cold_flush_upload_bytes: u64,
    pub cold_flush_upload_ns: u64,
    pub cold_flush_publishes: u64,
    pub cold_flush_publish_bytes: u64,
    pub cold_flush_publish_ns: u64,
    pub cold_orphan_cleanup_attempts: u64,
    pub cold_orphan_cleanup_errors: u64,
    pub cold_orphan_bytes: u64,
    // Sum of `per_group_cold_hot_bytes`.
    pub cold_hot_bytes: u64,
    pub per_group_cold_hot_bytes: Vec<u64>,
    // Maximum (not sum) of `per_group_cold_hot_bytes_max`; 0 when empty.
    pub cold_hot_group_bytes_max: u64,
    pub per_group_cold_hot_bytes_max: Vec<u64>,
    // Copied from a single counter.
    pub cold_hot_stream_bytes_max: u64,
    // Sum of `per_core_cold_backpressure_events`.
    pub cold_backpressure_events: u64,
    pub per_core_cold_backpressure_events: Vec<u64>,
    pub per_group_cold_backpressure_events: Vec<u64>,
    // Copied from a single counter.
    pub cold_backpressure_bytes: u64,
}
8812
/// Point-in-time view of mailbox occupancy.
///
/// NOTE(review): presumably `depths[i]` and `capacities[i]` describe the same
/// mailbox (one entry per core); the code that fills this struct is not
/// visible here, so confirm against the construction site.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuntimeMailboxSnapshot {
    // Observed queue depth per mailbox.
    pub depths: Vec<usize>,
    // Configured capacity per mailbox.
    pub capacities: Vec<usize>,
}
8818
/// Shared counter storage behind `RuntimeMetrics`.
///
/// `per_core_*` vectors are created by `RuntimeMetricsInner::new` with
/// `core_count` slots and `per_group_*` vectors with `raft_group_count` slots,
/// every slot starting at 0. The remaining fields are single counters.
/// `RuntimeMetrics::snapshot` reads all of them with `load_relaxed`.
///
/// NOTE(review): `PaddedAtomicU64` is defined elsewhere; presumably it is an
/// `AtomicU64` padded to avoid false sharing between slots. Confirm there.
#[derive(Debug)]
struct RuntimeMetricsInner {
    // Append, mutation and group-engine counters.
    per_core_appends: Vec<PaddedAtomicU64>,
    per_group_appends: Vec<PaddedAtomicU64>,
    per_core_applied_mutations: Vec<PaddedAtomicU64>,
    per_group_applied_mutations: Vec<PaddedAtomicU64>,
    per_core_mutation_apply_ns: Vec<PaddedAtomicU64>,
    per_group_mutation_apply_ns: Vec<PaddedAtomicU64>,
    per_core_group_lock_wait_ns: Vec<PaddedAtomicU64>,
    per_group_group_lock_wait_ns: Vec<PaddedAtomicU64>,
    per_core_group_engine_exec_ns: Vec<PaddedAtomicU64>,
    per_group_group_engine_exec_ns: Vec<PaddedAtomicU64>,
    // Group mailbox counters (tracked per group only).
    per_group_group_mailbox_depth: Vec<PaddedAtomicU64>,
    per_group_group_mailbox_max_depth: Vec<PaddedAtomicU64>,
    per_group_group_mailbox_full_events: Vec<PaddedAtomicU64>,
    // Raft write-many and apply counters.
    per_core_raft_write_many_batches: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_batches: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_commands: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_commands: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_logical_commands: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_logical_commands: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_responses: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_responses: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_submit_ns: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_submit_ns: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_response_ns: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_response_ns: Vec<PaddedAtomicU64>,
    per_core_raft_apply_entries: Vec<PaddedAtomicU64>,
    per_group_raft_apply_entries: Vec<PaddedAtomicU64>,
    per_core_raft_apply_ns: Vec<PaddedAtomicU64>,
    per_group_raft_apply_ns: Vec<PaddedAtomicU64>,
    // Live-read, routing and core-mailbox counters (tracked per core only).
    per_core_live_read_waiters: Vec<PaddedAtomicU64>,
    per_core_live_read_backpressure_events: Vec<PaddedAtomicU64>,
    per_core_routed_requests: Vec<PaddedAtomicU64>,
    per_core_mailbox_send_wait_ns: Vec<PaddedAtomicU64>,
    per_core_mailbox_full_events: Vec<PaddedAtomicU64>,
    // WAL counters.
    per_core_wal_batches: Vec<PaddedAtomicU64>,
    per_group_wal_batches: Vec<PaddedAtomicU64>,
    per_core_wal_records: Vec<PaddedAtomicU64>,
    per_group_wal_records: Vec<PaddedAtomicU64>,
    per_core_wal_write_ns: Vec<PaddedAtomicU64>,
    per_group_wal_write_ns: Vec<PaddedAtomicU64>,
    per_core_wal_sync_ns: Vec<PaddedAtomicU64>,
    per_group_wal_sync_ns: Vec<PaddedAtomicU64>,
    // Cold-tier counters; the non-`per_*` ones are single counters.
    cold_flush_uploads: PaddedAtomicU64,
    cold_flush_upload_bytes: PaddedAtomicU64,
    cold_flush_upload_ns: PaddedAtomicU64,
    cold_flush_publishes: PaddedAtomicU64,
    cold_flush_publish_bytes: PaddedAtomicU64,
    cold_flush_publish_ns: PaddedAtomicU64,
    cold_orphan_cleanup_attempts: PaddedAtomicU64,
    cold_orphan_cleanup_errors: PaddedAtomicU64,
    cold_orphan_bytes: PaddedAtomicU64,
    per_group_cold_hot_bytes: Vec<PaddedAtomicU64>,
    per_group_cold_hot_bytes_max: Vec<PaddedAtomicU64>,
    cold_hot_stream_bytes_max: PaddedAtomicU64,
    per_core_cold_backpressure_events: Vec<PaddedAtomicU64>,
    per_group_cold_backpressure_events: Vec<PaddedAtomicU64>,
    cold_backpressure_bytes: PaddedAtomicU64,
}
8879
/// One measurement of a raft write-many operation: three counts and two
/// durations.
///
/// NOTE(review): presumably folded into the `raft_write_many_*` counters of
/// `RuntimeMetricsInner`; the recording code is not visible here, so confirm
/// the field-to-counter mapping at the use site.
#[derive(Debug, Clone, Copy)]
struct RaftWriteManySample {
    command_count: u64,
    logical_command_count: u64,
    response_count: u64,
    // Durations; the `_ns` suffix suggests nanoseconds (TODO confirm at the call site).
    submit_ns: u64,
    response_ns: u64,
}
8888
8889impl RuntimeMetricsInner {
8890    fn new(core_count: usize, raft_group_count: usize) -> Self {
8891        Self {
8892            per_core_appends: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8893            per_group_appends: (0..raft_group_count)
8894                .map(|_| PaddedAtomicU64::new(0))
8895                .collect(),
8896            per_core_applied_mutations: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8897            per_group_applied_mutations: (0..raft_group_count)
8898                .map(|_| PaddedAtomicU64::new(0))
8899                .collect(),
8900            per_core_mutation_apply_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8901            per_group_mutation_apply_ns: (0..raft_group_count)
8902                .map(|_| PaddedAtomicU64::new(0))
8903                .collect(),
8904            per_core_group_lock_wait_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8905            per_group_group_lock_wait_ns: (0..raft_group_count)
8906                .map(|_| PaddedAtomicU64::new(0))
8907                .collect(),
8908            per_core_group_engine_exec_ns: (0..core_count)
8909                .map(|_| PaddedAtomicU64::new(0))
8910                .collect(),
8911            per_group_group_engine_exec_ns: (0..raft_group_count)
8912                .map(|_| PaddedAtomicU64::new(0))
8913                .collect(),
8914            per_group_group_mailbox_depth: (0..raft_group_count)
8915                .map(|_| PaddedAtomicU64::new(0))
8916                .collect(),
8917            per_group_group_mailbox_max_depth: (0..raft_group_count)
8918                .map(|_| PaddedAtomicU64::new(0))
8919                .collect(),
8920            per_group_group_mailbox_full_events: (0..raft_group_count)
8921                .map(|_| PaddedAtomicU64::new(0))
8922                .collect(),
8923            per_core_raft_write_many_batches: (0..core_count)
8924                .map(|_| PaddedAtomicU64::new(0))
8925                .collect(),
8926            per_group_raft_write_many_batches: (0..raft_group_count)
8927                .map(|_| PaddedAtomicU64::new(0))
8928                .collect(),
8929            per_core_raft_write_many_commands: (0..core_count)
8930                .map(|_| PaddedAtomicU64::new(0))
8931                .collect(),
8932            per_group_raft_write_many_commands: (0..raft_group_count)
8933                .map(|_| PaddedAtomicU64::new(0))
8934                .collect(),
8935            per_core_raft_write_many_logical_commands: (0..core_count)
8936                .map(|_| PaddedAtomicU64::new(0))
8937                .collect(),
8938            per_group_raft_write_many_logical_commands: (0..raft_group_count)
8939                .map(|_| PaddedAtomicU64::new(0))
8940                .collect(),
8941            per_core_raft_write_many_responses: (0..core_count)
8942                .map(|_| PaddedAtomicU64::new(0))
8943                .collect(),
8944            per_group_raft_write_many_responses: (0..raft_group_count)
8945                .map(|_| PaddedAtomicU64::new(0))
8946                .collect(),
8947            per_core_raft_write_many_submit_ns: (0..core_count)
8948                .map(|_| PaddedAtomicU64::new(0))
8949                .collect(),
8950            per_group_raft_write_many_submit_ns: (0..raft_group_count)
8951                .map(|_| PaddedAtomicU64::new(0))
8952                .collect(),
8953            per_core_raft_write_many_response_ns: (0..core_count)
8954                .map(|_| PaddedAtomicU64::new(0))
8955                .collect(),
8956            per_group_raft_write_many_response_ns: (0..raft_group_count)
8957                .map(|_| PaddedAtomicU64::new(0))
8958                .collect(),
8959            per_core_raft_apply_entries: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8960            per_group_raft_apply_entries: (0..raft_group_count)
8961                .map(|_| PaddedAtomicU64::new(0))
8962                .collect(),
8963            per_core_raft_apply_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8964            per_group_raft_apply_ns: (0..raft_group_count)
8965                .map(|_| PaddedAtomicU64::new(0))
8966                .collect(),
8967            per_core_live_read_waiters: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8968            per_core_live_read_backpressure_events: (0..core_count)
8969                .map(|_| PaddedAtomicU64::new(0))
8970                .collect(),
8971            per_core_routed_requests: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8972            per_core_mailbox_send_wait_ns: (0..core_count)
8973                .map(|_| PaddedAtomicU64::new(0))
8974                .collect(),
8975            per_core_mailbox_full_events: (0..core_count)
8976                .map(|_| PaddedAtomicU64::new(0))
8977                .collect(),
8978            per_core_wal_batches: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8979            per_group_wal_batches: (0..raft_group_count)
8980                .map(|_| PaddedAtomicU64::new(0))
8981                .collect(),
8982            per_core_wal_records: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8983            per_group_wal_records: (0..raft_group_count)
8984                .map(|_| PaddedAtomicU64::new(0))
8985                .collect(),
8986            per_core_wal_write_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8987            per_group_wal_write_ns: (0..raft_group_count)
8988                .map(|_| PaddedAtomicU64::new(0))
8989                .collect(),
8990            per_core_wal_sync_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8991            per_group_wal_sync_ns: (0..raft_group_count)
8992                .map(|_| PaddedAtomicU64::new(0))
8993                .collect(),
8994            cold_flush_uploads: PaddedAtomicU64::new(0),
8995            cold_flush_upload_bytes: PaddedAtomicU64::new(0),
8996            cold_flush_upload_ns: PaddedAtomicU64::new(0),
8997            cold_flush_publishes: PaddedAtomicU64::new(0),
8998            cold_flush_publish_bytes: PaddedAtomicU64::new(0),
8999            cold_flush_publish_ns: PaddedAtomicU64::new(0),
9000            cold_orphan_cleanup_attempts: PaddedAtomicU64::new(0),
9001            cold_orphan_cleanup_errors: PaddedAtomicU64::new(0),
9002            cold_orphan_bytes: PaddedAtomicU64::new(0),
9003            per_group_cold_hot_bytes: (0..raft_group_count)
9004                .map(|_| PaddedAtomicU64::new(0))
9005                .collect(),
9006            per_group_cold_hot_bytes_max: (0..raft_group_count)
9007                .map(|_| PaddedAtomicU64::new(0))
9008                .collect(),
9009            cold_hot_stream_bytes_max: PaddedAtomicU64::new(0),
9010            per_core_cold_backpressure_events: (0..core_count)
9011                .map(|_| PaddedAtomicU64::new(0))
9012                .collect(),
9013            per_group_cold_backpressure_events: (0..raft_group_count)
9014                .map(|_| PaddedAtomicU64::new(0))
9015                .collect(),
9016            cold_backpressure_bytes: PaddedAtomicU64::new(0),
9017        }
9018    }
9019
9020    fn record_routed_request(&self, core_id: CoreId, mailbox_send_wait_ns: u64) {
9021        let index = usize::from(core_id.0);
9022        self.per_core_routed_requests[index].fetch_add_relaxed(1);
9023        self.per_core_mailbox_send_wait_ns[index].fetch_add_relaxed(mailbox_send_wait_ns);
9024    }
9025
9026    fn record_mailbox_full(&self, core_id: CoreId) {
9027        self.per_core_mailbox_full_events[usize::from(core_id.0)].fetch_add_relaxed(1);
9028    }
9029
9030    fn record_append(&self, core_id: CoreId, group_id: RaftGroupId) {
9031        self.record_append_batch(core_id, group_id, 1);
9032    }
9033
9034    fn record_append_batch(&self, core_id: CoreId, group_id: RaftGroupId, count: u64) {
9035        self.per_core_appends[usize::from(core_id.0)].fetch_add_relaxed(count);
9036        self.per_group_appends[usize::try_from(group_id.0).expect("u32 fits usize")]
9037            .fetch_add_relaxed(count);
9038    }
9039
9040    fn record_applied_mutation(&self, core_id: CoreId, group_id: RaftGroupId, apply_ns: u64) {
9041        self.record_applied_mutation_batch(core_id, group_id, 1, apply_ns);
9042    }
9043
9044    fn record_applied_mutation_batch(
9045        &self,
9046        core_id: CoreId,
9047        group_id: RaftGroupId,
9048        count: u64,
9049        apply_ns: u64,
9050    ) {
9051        let core_index = usize::from(core_id.0);
9052        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9053        self.per_core_applied_mutations[core_index].fetch_add_relaxed(count);
9054        self.per_group_applied_mutations[group_index].fetch_add_relaxed(count);
9055        self.per_core_mutation_apply_ns[core_index].fetch_add_relaxed(apply_ns);
9056        self.per_group_mutation_apply_ns[group_index].fetch_add_relaxed(apply_ns);
9057    }
9058
9059    fn record_group_engine_exec(&self, core_id: CoreId, group_id: RaftGroupId, exec_ns: u64) {
9060        let core_index = usize::from(core_id.0);
9061        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9062        self.per_core_group_engine_exec_ns[core_index].fetch_add_relaxed(exec_ns);
9063        self.per_group_group_engine_exec_ns[group_index].fetch_add_relaxed(exec_ns);
9064    }
9065
9066    fn record_group_mailbox_enqueued(&self, group_id: RaftGroupId) {
9067        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9068        let depth = self.per_group_group_mailbox_depth[group_index].fetch_add_relaxed(1) + 1;
9069        self.per_group_group_mailbox_max_depth[group_index].fetch_max_relaxed(depth);
9070    }
9071
9072    fn record_group_mailbox_dequeued(&self, group_id: RaftGroupId) {
9073        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9074        self.per_group_group_mailbox_depth[group_index].fetch_sub_relaxed(1);
9075    }
9076
9077    fn record_group_mailbox_full(&self, group_id: RaftGroupId) {
9078        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9079        self.per_group_group_mailbox_full_events[group_index].fetch_add_relaxed(1);
9080    }
9081
9082    fn record_raft_write_many(
9083        &self,
9084        core_id: CoreId,
9085        group_id: RaftGroupId,
9086        sample: RaftWriteManySample,
9087    ) {
9088        let core_index = usize::from(core_id.0);
9089        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9090        self.per_core_raft_write_many_batches[core_index].fetch_add_relaxed(1);
9091        self.per_group_raft_write_many_batches[group_index].fetch_add_relaxed(1);
9092        self.per_core_raft_write_many_commands[core_index].fetch_add_relaxed(sample.command_count);
9093        self.per_group_raft_write_many_commands[group_index]
9094            .fetch_add_relaxed(sample.command_count);
9095        self.per_core_raft_write_many_logical_commands[core_index]
9096            .fetch_add_relaxed(sample.logical_command_count);
9097        self.per_group_raft_write_many_logical_commands[group_index]
9098            .fetch_add_relaxed(sample.logical_command_count);
9099        self.per_core_raft_write_many_responses[core_index]
9100            .fetch_add_relaxed(sample.response_count);
9101        self.per_group_raft_write_many_responses[group_index]
9102            .fetch_add_relaxed(sample.response_count);
9103        self.per_core_raft_write_many_submit_ns[core_index].fetch_add_relaxed(sample.submit_ns);
9104        self.per_group_raft_write_many_submit_ns[group_index].fetch_add_relaxed(sample.submit_ns);
9105        self.per_core_raft_write_many_response_ns[core_index].fetch_add_relaxed(sample.response_ns);
9106        self.per_group_raft_write_many_response_ns[group_index]
9107            .fetch_add_relaxed(sample.response_ns);
9108    }
9109
9110    fn record_raft_apply_batch(
9111        &self,
9112        core_id: CoreId,
9113        group_id: RaftGroupId,
9114        entry_count: u64,
9115        apply_ns: u64,
9116    ) {
9117        let core_index = usize::from(core_id.0);
9118        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9119        self.per_core_raft_apply_entries[core_index].fetch_add_relaxed(entry_count);
9120        self.per_group_raft_apply_entries[group_index].fetch_add_relaxed(entry_count);
9121        self.per_core_raft_apply_ns[core_index].fetch_add_relaxed(apply_ns);
9122        self.per_group_raft_apply_ns[group_index].fetch_add_relaxed(apply_ns);
9123    }
9124
9125    fn record_wal_batch(
9126        &self,
9127        core_id: CoreId,
9128        group_id: RaftGroupId,
9129        record_count: u64,
9130        write_ns: u64,
9131        sync_ns: u64,
9132    ) {
9133        let core_index = usize::from(core_id.0);
9134        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9135        self.per_core_wal_batches[core_index].fetch_add_relaxed(1);
9136        self.per_group_wal_batches[group_index].fetch_add_relaxed(1);
9137        self.per_core_wal_records[core_index].fetch_add_relaxed(record_count);
9138        self.per_group_wal_records[group_index].fetch_add_relaxed(record_count);
9139        self.per_core_wal_write_ns[core_index].fetch_add_relaxed(write_ns);
9140        self.per_group_wal_write_ns[group_index].fetch_add_relaxed(write_ns);
9141        self.per_core_wal_sync_ns[core_index].fetch_add_relaxed(sync_ns);
9142        self.per_group_wal_sync_ns[group_index].fetch_add_relaxed(sync_ns);
9143    }
9144
9145    fn record_cold_upload(&self, bytes: u64, upload_ns: u64) {
9146        self.cold_flush_uploads.fetch_add_relaxed(1);
9147        self.cold_flush_upload_bytes.fetch_add_relaxed(bytes);
9148        self.cold_flush_upload_ns.fetch_add_relaxed(upload_ns);
9149    }
9150
9151    fn record_cold_publish(&self, bytes: u64, publish_ns: u64) {
9152        self.cold_flush_publishes.fetch_add_relaxed(1);
9153        self.cold_flush_publish_bytes.fetch_add_relaxed(bytes);
9154        self.cold_flush_publish_ns.fetch_add_relaxed(publish_ns);
9155    }
9156
9157    fn record_cold_orphan_cleanup(&self, bytes: u64, cleanup_failed: bool) {
9158        self.cold_orphan_cleanup_attempts.fetch_add_relaxed(1);
9159        if cleanup_failed {
9160            self.cold_orphan_cleanup_errors.fetch_add_relaxed(1);
9161            self.cold_orphan_bytes.fetch_add_relaxed(bytes);
9162        }
9163    }
9164
9165    fn record_cold_hot_backlog(
9166        &self,
9167        group_id: RaftGroupId,
9168        stream_hot_bytes: u64,
9169        group_hot_bytes: u64,
9170    ) {
9171        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9172        self.per_group_cold_hot_bytes[group_index].store_relaxed(group_hot_bytes);
9173        self.per_group_cold_hot_bytes_max[group_index].fetch_max_relaxed(group_hot_bytes);
9174        self.cold_hot_stream_bytes_max
9175            .fetch_max_relaxed(stream_hot_bytes);
9176    }
9177
9178    fn record_cold_backpressure(
9179        &self,
9180        core_id: CoreId,
9181        group_id: RaftGroupId,
9182        incoming_bytes: u64,
9183        _limit: u64,
9184    ) {
9185        let core_index = usize::from(core_id.0);
9186        let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9187        self.per_core_cold_backpressure_events[core_index].fetch_add_relaxed(1);
9188        self.per_group_cold_backpressure_events[group_index].fetch_add_relaxed(1);
9189        self.cold_backpressure_bytes
9190            .fetch_add_relaxed(incoming_bytes);
9191    }
9192
9193    fn record_read_watcher_added(&self, core_id: CoreId) {
9194        self.record_read_watchers_added(core_id, 1);
9195    }
9196
9197    fn record_read_watchers_added(&self, core_id: CoreId, count: usize) {
9198        self.per_core_live_read_waiters[usize::from(core_id.0)]
9199            .fetch_add_relaxed(u64::try_from(count).expect("watcher count fits u64"));
9200    }
9201
9202    fn record_read_watchers_removed(&self, core_id: CoreId, count: usize) {
9203        self.per_core_live_read_waiters[usize::from(core_id.0)]
9204            .fetch_sub_relaxed(u64::try_from(count).expect("watcher count fits u64"));
9205    }
9206
9207    fn record_live_read_backpressure(&self, core_id: CoreId) {
9208        self.per_core_live_read_backpressure_events[usize::from(core_id.0)].fetch_add_relaxed(1);
9209    }
9210}
9211
/// Returns the nanoseconds elapsed since `started_at`, saturating at
/// `u64::MAX` when the elapsed time does not fit in a `u64`.
fn elapsed_ns(started_at: Instant) -> u64 {
    let nanos: u128 = started_at.elapsed().as_nanos();
    match u64::try_from(nanos) {
        Ok(fits) => fits,
        Err(_) => u64::MAX,
    }
}
9215
9216fn append_batch_payload_bytes(request: &AppendBatchRequest) -> u64 {
9217    request
9218        .payloads
9219        .iter()
9220        .map(|payload| u64::try_from(payload.len()).expect("payload len fits u64"))
9221        .sum()
9222}
9223
9224fn record_cold_backpressure_error(
9225    metrics: &RuntimeMetricsInner,
9226    placement: ShardPlacement,
9227    incoming_bytes: u64,
9228    admission: ColdWriteAdmission,
9229    err: &GroupEngineError,
9230) {
9231    if !err.message().contains("ColdBackpressure") {
9232        return;
9233    }
9234    metrics.record_cold_backpressure(
9235        placement.core_id,
9236        placement.raft_group_id,
9237        incoming_bytes,
9238        admission.max_hot_bytes_per_group.unwrap_or(0),
9239    );
9240}
9241
9242fn is_stale_cold_flush_candidate_error(err: &RuntimeError) -> bool {
9243    let RuntimeError::GroupEngine { message, .. } = err else {
9244        return false;
9245    };
9246    message.contains("StreamGone")
9247        || message.contains("StreamNotFound")
9248        || (message.contains("InvalidColdFlush")
9249            && (message.contains("beyond stream")
9250                || message.contains("does not match the start of a hot payload segment")
9251                || message.contains("does not cover contiguous hot payload segments")
9252                || message.contains("exceeds stream")
9253                || message.contains("non-contiguous hot payload metadata")))
9254}
9255
9256async fn record_cold_hot_backlog(
9257    group: &mut Box<dyn GroupEngine>,
9258    metrics: &RuntimeMetricsInner,
9259    stream_id: BucketStreamId,
9260    placement: ShardPlacement,
9261) {
9262    if let Ok(backlog) = group.cold_hot_backlog(stream_id, placement).await {
9263        metrics.record_cold_hot_backlog(
9264            placement.raft_group_id,
9265            backlog.stream_hot_bytes,
9266            backlog.group_hot_bytes,
9267        );
9268    }
9269}
9270
/// A `u64` atomic aligned to 128 bytes whose helpers all use relaxed ordering.
///
/// NOTE(review): the 128-byte alignment presumably keeps neighbouring counters
/// on separate cache lines to avoid false sharing — confirm the intent.
#[derive(Debug)]
#[repr(align(128))]
struct PaddedAtomicU64 {
    value: AtomicU64,
}

impl PaddedAtomicU64 {
    /// Memory ordering shared by every operation on this type.
    const RELAXED: Ordering = Ordering::Relaxed;

    /// Creates a counter holding `value`.
    fn new(value: u64) -> Self {
        let value = AtomicU64::new(value);
        Self { value }
    }

    /// Reads the current value.
    fn load_relaxed(&self) -> u64 {
        self.value.load(Self::RELAXED)
    }

    /// Adds `value` (wrapping on overflow) and returns the previous value.
    fn fetch_add_relaxed(&self, value: u64) -> u64 {
        self.value.fetch_add(value, Self::RELAXED)
    }

    /// Subtracts `value` (wrapping on underflow); the previous value is discarded.
    fn fetch_sub_relaxed(&self, value: u64) {
        let _previous = self.value.fetch_sub(value, Self::RELAXED);
    }

    /// Raises the stored value to `value` if it is currently smaller.
    fn fetch_max_relaxed(&self, value: u64) {
        let _previous = self.value.fetch_max(value, Self::RELAXED);
    }

    /// Overwrites the stored value with `value`.
    fn store_relaxed(&self, value: u64) {
        self.value.store(value, Self::RELAXED);
    }
}
9304
9305#[cfg(test)]
9306mod tests {
9307    use super::*;
9308    use std::collections::HashSet;
9309    use std::sync::Mutex;
9310    use std::sync::atomic::AtomicBool;
9311    use tokio::sync::Notify;
9312
9313    fn runtime(core_count: usize, group_count: usize) -> ShardRuntime {
9314        ShardRuntime::spawn(RuntimeConfig {
9315            core_count,
9316            raft_group_count: group_count,
9317            mailbox_capacity: 128,
9318            threading: RuntimeThreading::HostedTokio,
9319            cold_max_hot_bytes_per_group: None,
9320            live_read_max_waiters_per_core: Some(65_536),
9321        })
9322        .expect("spawn runtime")
9323    }
9324
9325    fn stream_on_group(
9326        runtime: &ShardRuntime,
9327        group_id: RaftGroupId,
9328        prefix: &str,
9329    ) -> BucketStreamId {
9330        for index in 0..10_000 {
9331            let stream = BucketStreamId::new("benchcmp", format!("{prefix}-{index}"));
9332            if runtime.locate(&stream).raft_group_id == group_id {
9333                return stream;
9334            }
9335        }
9336        panic!("could not find stream for group {}", group_id.0);
9337    }
9338
9339    async fn create_stream(
9340        runtime: &ShardRuntime,
9341        stream: &BucketStreamId,
9342    ) -> CreateStreamResponse {
9343        runtime
9344            .create_stream(CreateStreamRequest::new(
9345                stream.clone(),
9346                DEFAULT_CONTENT_TYPE,
9347            ))
9348            .await
9349            .expect("create stream")
9350    }
9351
9352    fn producer(id: &str, epoch: u64, seq: u64) -> ProducerRequest {
9353        ProducerRequest {
9354            producer_id: id.to_owned(),
9355            producer_epoch: epoch,
9356            producer_seq: seq,
9357        }
9358    }
9359
9360    fn placement() -> ShardPlacement {
9361        ShardPlacement {
9362            core_id: CoreId(0),
9363            shard_id: ShardId(0),
9364            raft_group_id: RaftGroupId(0),
9365        }
9366    }
9367
9368    #[test]
9369    fn group_write_command_round_trips_as_log_payload() {
9370        let command = GroupWriteCommand::AppendBatch {
9371            stream_id: BucketStreamId::new("benchcmp", "raft-log"),
9372            content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9373            payloads: vec![Bytes::from_static(b"ab"), Bytes::from_static(b"cd")],
9374            producer: Some(producer("writer-1", 7, 42)),
9375            now_ms: 0,
9376        };
9377
9378        let encoded = serde_json::to_vec(&command).expect("encode command");
9379        let decoded =
9380            serde_json::from_slice::<GroupWriteCommand>(&encoded).expect("decode command");
9381
9382        assert_eq!(decoded, command);
9383    }
9384
9385    #[test]
9386    fn committed_write_command_is_state_machine_apply_boundary() {
9387        let placement = ShardPlacement {
9388            core_id: CoreId(0),
9389            shard_id: ShardId(0),
9390            raft_group_id: RaftGroupId(0),
9391        };
9392        let stream = BucketStreamId::new("benchcmp", "apply-command");
9393        let mut engine = InMemoryGroupEngine::default();
9394
9395        let created = engine
9396            .apply_committed_write(
9397                GroupWriteCommand::CreateStream {
9398                    stream_id: stream.clone(),
9399                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9400                    initial_payload: Bytes::new(),
9401                    close_after: false,
9402                    stream_seq: None,
9403                    producer: None,
9404                    stream_ttl_seconds: None,
9405                    stream_expires_at_ms: None,
9406                    forked_from: None,
9407                    fork_offset: None,
9408                    now_ms: 0,
9409                },
9410                placement,
9411            )
9412            .expect("create stream");
9413        assert_eq!(
9414            created,
9415            GroupWriteResponse::CreateStream(CreateStreamResponse {
9416                placement,
9417                next_offset: 0,
9418                closed: false,
9419                already_exists: false,
9420                group_commit_index: 1,
9421            })
9422        );
9423
9424        let appended = engine
9425            .apply_committed_write(
9426                GroupWriteCommand::Append {
9427                    stream_id: stream.clone(),
9428                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9429                    payload: Bytes::from_static(b"abc"),
9430                    close_after: false,
9431                    stream_seq: None,
9432                    producer: None,
9433                    now_ms: 0,
9434                },
9435                placement,
9436            )
9437            .expect("append");
9438        assert_eq!(
9439            appended,
9440            GroupWriteResponse::Append(AppendResponse {
9441                placement,
9442                start_offset: 0,
9443                next_offset: 3,
9444                stream_append_count: 1,
9445                group_commit_index: 2,
9446                closed: false,
9447                deduplicated: false,
9448                producer: None,
9449            })
9450        );
9451
9452        let flushed = engine
9453            .apply_committed_write(
9454                GroupWriteCommand::FlushCold {
9455                    stream_id: stream.clone(),
9456                    chunk: ColdChunkRef {
9457                        start_offset: 0,
9458                        end_offset: 2,
9459                        s3_path: "s3://bucket/apply-command/000000".to_owned(),
9460                        object_size: 2,
9461                    },
9462                },
9463                placement,
9464            )
9465            .expect("flush cold");
9466        assert_eq!(
9467            flushed,
9468            GroupWriteResponse::FlushCold(FlushColdResponse {
9469                placement,
9470                hot_start_offset: 2,
9471                group_commit_index: 3,
9472            })
9473        );
9474
9475        let read = engine
9476            .state_machine
9477            .read(&stream, 2, 16)
9478            .expect("read applied command");
9479        assert_eq!(read.payload, b"c");
9480        let plan = engine
9481            .state_machine
9482            .read_plan(&stream, 0, 16)
9483            .expect("read plan");
9484        assert_eq!(plan.segments.len(), 2);
9485        assert!(matches!(plan.segments[0], StreamReadSegment::Object(_)));
9486        assert_eq!(plan.segments[1], StreamReadSegment::Hot(b"c".to_vec()));
9487    }
9488
9489    #[tokio::test]
9490    async fn cold_store_read_reassembles_cold_and_hot_segments() {
9491        let placement = placement();
9492        let stream = BucketStreamId::new("benchcmp", "cold-read");
9493        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
9494        cold_store
9495            .write_chunk("benchcmp/cold-read/chunks/000000.bin", b"abcd")
9496            .await
9497            .expect("write cold object");
9498        let mut engine = InMemoryGroupEngine::with_cold_store(cold_store);
9499
9500        engine
9501            .apply_committed_write(
9502                GroupWriteCommand::CreateStream {
9503                    stream_id: stream.clone(),
9504                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9505                    initial_payload: Bytes::new(),
9506                    close_after: false,
9507                    stream_seq: None,
9508                    producer: None,
9509                    stream_ttl_seconds: None,
9510                    stream_expires_at_ms: None,
9511                    forked_from: None,
9512                    fork_offset: None,
9513                    now_ms: 0,
9514                },
9515                placement,
9516            )
9517            .expect("create stream");
9518        engine
9519            .apply_committed_write(
9520                GroupWriteCommand::Append {
9521                    stream_id: stream.clone(),
9522                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9523                    payload: Bytes::from_static(b"abcdef"),
9524                    close_after: false,
9525                    stream_seq: None,
9526                    producer: None,
9527                    now_ms: 0,
9528                },
9529                placement,
9530            )
9531            .expect("append");
9532        engine
9533            .apply_committed_write(
9534                GroupWriteCommand::FlushCold {
9535                    stream_id: stream.clone(),
9536                    chunk: ColdChunkRef {
9537                        start_offset: 0,
9538                        end_offset: 4,
9539                        s3_path: "benchcmp/cold-read/chunks/000000.bin".to_owned(),
9540                        object_size: 4,
9541                    },
9542                },
9543                placement,
9544            )
9545            .expect("flush cold");
9546
9547        let read = engine
9548            .read_stream(
9549                ReadStreamRequest {
9550                    stream_id: stream,
9551                    offset: 2,
9552                    max_len: 4,
9553                    now_ms: 0,
9554                },
9555                placement,
9556            )
9557            .await
9558            .expect("read cold and hot segments");
9559        assert_eq!(read.payload, b"cdef");
9560        assert_eq!(read.next_offset, 6);
9561        assert!(read.up_to_date);
9562    }
9563
9564    #[tokio::test]
9565    async fn bootstrap_reads_retained_updates_from_cold_chunk_after_snapshot() {
9566        let placement = placement();
9567        let stream = BucketStreamId::new("benchcmp", "cold-bootstrap");
9568        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
9569        cold_store
9570            .write_chunk("benchcmp/cold-bootstrap/chunks/000000.bin", b"abcde")
9571            .await
9572            .expect("write cold object");
9573        let mut engine = InMemoryGroupEngine::with_cold_store(cold_store);
9574
9575        engine
9576            .create_stream(
9577                CreateStreamRequest::new(stream.clone(), DEFAULT_CONTENT_TYPE),
9578                placement,
9579            )
9580            .await
9581            .expect("create stream");
9582        engine
9583            .append(
9584                AppendRequest::from_bytes(stream.clone(), b"abc".to_vec()),
9585                placement,
9586            )
9587            .await
9588            .expect("append first message");
9589        engine
9590            .append(
9591                AppendRequest::from_bytes(stream.clone(), b"de".to_vec()),
9592                placement,
9593            )
9594            .await
9595            .expect("append second message");
9596        engine
9597            .flush_cold(
9598                FlushColdRequest {
9599                    stream_id: stream.clone(),
9600                    chunk: ColdChunkRef {
9601                        start_offset: 0,
9602                        end_offset: 5,
9603                        s3_path: "benchcmp/cold-bootstrap/chunks/000000.bin".to_owned(),
9604                        object_size: 5,
9605                    },
9606                },
9607                placement,
9608            )
9609            .await
9610            .expect("flush all hot bytes");
9611        engine
9612            .publish_snapshot(
9613                PublishSnapshotRequest {
9614                    stream_id: stream.clone(),
9615                    snapshot_offset: 3,
9616                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9617                    payload: Bytes::from_static(b"abc-state"),
9618                    now_ms: 0,
9619                },
9620                placement,
9621            )
9622            .await
9623            .expect("publish snapshot");
9624
9625        let read = engine
9626            .read_stream(
9627                ReadStreamRequest {
9628                    stream_id: stream.clone(),
9629                    offset: 3,
9630                    max_len: 2,
9631                    now_ms: 0,
9632                },
9633                placement,
9634            )
9635            .await
9636            .expect("read retained update from cold chunk");
9637        assert_eq!(read.payload, b"de");
9638
9639        let bootstrap = engine
9640            .bootstrap_stream(
9641                BootstrapStreamRequest {
9642                    stream_id: stream,
9643                    now_ms: 0,
9644                },
9645                placement,
9646            )
9647            .await
9648            .expect("bootstrap");
9649        assert_eq!(bootstrap.snapshot_offset, Some(3));
9650        assert_eq!(bootstrap.snapshot_payload, b"abc-state");
9651        assert_eq!(bootstrap.next_offset, 5);
9652        assert_eq!(bootstrap.updates.len(), 1);
9653        assert_eq!(bootstrap.updates[0].start_offset, 3);
9654        assert_eq!(bootstrap.updates[0].next_offset, 5);
9655        assert_eq!(bootstrap.updates[0].payload, b"de");
9656    }
9657
9658    #[tokio::test]
9659    async fn cold_store_reads_only_requested_range() {
9660        let cold_store = ColdStore::memory().expect("memory cold store");
9661        cold_store
9662            .write_chunk("benchcmp/cold-range/chunks/000000.bin", b"abcdefgh")
9663            .await
9664            .expect("write cold object");
9665        let bytes = cold_store
9666            .read_chunk_range(
9667                &ColdChunkRef {
9668                    start_offset: 10,
9669                    end_offset: 18,
9670                    s3_path: "benchcmp/cold-range/chunks/000000.bin".to_owned(),
9671                    object_size: 8,
9672                },
9673                12,
9674                3,
9675            )
9676            .await
9677            .expect("read range");
9678        assert_eq!(bytes, b"cde");
9679    }
9680
9681    #[tokio::test]
9682    async fn ttl_read_access_is_committed_and_expiry_removes_stream() {
9683        let placement = placement();
9684        let stream = BucketStreamId::new("benchcmp", "runtime-ttl");
9685        let mut engine = InMemoryGroupEngine::default();
9686
9687        let mut create = CreateStreamRequest::new(stream.clone(), DEFAULT_CONTENT_TYPE);
9688        create.initial_payload = Bytes::from_static(b"abc");
9689        create.stream_ttl_seconds = Some(1);
9690        create.now_ms = 1_000;
9691        engine
9692            .create_stream(create, placement)
9693            .await
9694            .expect("create ttl stream");
9695
9696        let read = engine
9697            .read_stream(
9698                ReadStreamRequest {
9699                    stream_id: stream.clone(),
9700                    offset: 0,
9701                    max_len: 16,
9702                    now_ms: 1_500,
9703                },
9704                placement,
9705            )
9706            .await
9707            .expect("read renews ttl");
9708        assert_eq!(read.payload, b"abc");
9709        assert_eq!(
9710            engine
9711                .snapshot(placement)
9712                .await
9713                .expect("snapshot")
9714                .group_commit_index,
9715            2
9716        );
9717
9718        engine
9719            .head_stream(
9720                HeadStreamRequest {
9721                    stream_id: stream.clone(),
9722                    now_ms: 2_499,
9723                },
9724                placement,
9725            )
9726            .await
9727            .expect("head does not renew but stream is still live");
9728        assert_eq!(
9729            engine
9730                .snapshot(placement)
9731                .await
9732                .expect("snapshot")
9733                .group_commit_index,
9734            2
9735        );
9736
9737        let err = engine
9738            .read_stream(
9739                ReadStreamRequest {
9740                    stream_id: stream.clone(),
9741                    offset: 0,
9742                    max_len: 16,
9743                    now_ms: 2_500,
9744                },
9745                placement,
9746            )
9747            .await
9748            .expect_err("expired stream read is not found");
9749        assert_eq!(err.code(), Some(StreamErrorCode::StreamNotFound));
9750        assert_eq!(
9751            engine
9752                .snapshot(placement)
9753                .await
9754                .expect("snapshot")
9755                .group_commit_index,
9756            3
9757        );
9758
9759        let mut recreate = CreateStreamRequest::new(stream, "text/plain");
9760        recreate.now_ms = 2_501;
9761        let recreated = engine
9762            .create_stream(recreate, placement)
9763            .await
9764            .expect("recreate expired stream");
9765        assert!(!recreated.already_exists);
9766    }
9767
9768    #[test]
9769    fn committed_write_batch_preserves_logical_command_responses() {
9770        let placement = placement();
9771        let stream = BucketStreamId::new("benchcmp", "apply-command-batch");
9772        let mut engine = InMemoryGroupEngine::default();
9773
9774        let response = engine
9775            .apply_committed_write(
9776                GroupWriteCommand::Batch {
9777                    commands: vec![
9778                        GroupWriteCommand::from(CreateStreamRequest::new(
9779                            stream.clone(),
9780                            DEFAULT_CONTENT_TYPE,
9781                        )),
9782                        GroupWriteCommand::from(AppendBatchRequest::new(
9783                            stream.clone(),
9784                            vec![Bytes::from_static(b"ab"), Bytes::from_static(b"cd")],
9785                        )),
9786                    ],
9787                },
9788                placement,
9789            )
9790            .expect("apply command batch");
9791
9792        let GroupWriteResponse::Batch(items) = response else {
9793            panic!("unexpected batch response: {response:?}");
9794        };
9795        assert_eq!(items.len(), 2);
9796        assert!(matches!(
9797            &items[0],
9798            Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
9799                group_commit_index: 1,
9800                ..
9801            }))
9802        ));
9803        match &items[1] {
9804            Ok(GroupWriteResponse::AppendBatch(response)) => {
9805                assert_eq!(response.items.len(), 2);
9806                assert_eq!(
9807                    response.items[0].as_ref().expect("first item").start_offset,
9808                    0
9809                );
9810                assert_eq!(
9811                    response.items[1]
9812                        .as_ref()
9813                        .expect("second item")
9814                        .start_offset,
9815                    2
9816                );
9817                assert_eq!(
9818                    response.items[1]
9819                        .as_ref()
9820                        .expect("second item")
9821                        .group_commit_index,
9822                    3
9823                );
9824            }
9825            other => panic!("unexpected append batch response: {other:?}"),
9826        }
9827
9828        let read = engine
9829            .state_machine
9830            .read(&stream, 0, 16)
9831            .expect("read applied command batch");
9832        assert_eq!(read.payload, b"abcd");
9833    }
9834
9835    async fn wait_for_live_waiters(runtime: &ShardRuntime, expected: u64) {
9836        for _ in 0..100 {
9837            if runtime.metrics().snapshot().live_read_waiters == expected {
9838                return;
9839            }
9840            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9841        }
9842        panic!(
9843            "expected {expected} live waiters, got {}",
9844            runtime.metrics().snapshot().live_read_waiters
9845        );
9846    }
9847
9848    async fn wait_for_mailbox_depth(runtime: &ShardRuntime, core_index: usize, expected: usize) {
9849        for _ in 0..100 {
9850            if runtime.mailbox_snapshot().depths[core_index] == expected {
9851                return;
9852            }
9853            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9854        }
9855        panic!(
9856            "expected core {core_index} mailbox depth {expected}, got {}",
9857            runtime.mailbox_snapshot().depths[core_index]
9858        );
9859    }
9860
9861    async fn wait_for_mailbox_full_events(runtime: &ShardRuntime, expected: u64) {
9862        for _ in 0..100 {
9863            if runtime.metrics().snapshot().mailbox_full_events == expected {
9864                return;
9865            }
9866            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9867        }
9868        panic!(
9869            "expected {expected} mailbox full events, got {}",
9870            runtime.metrics().snapshot().mailbox_full_events
9871        );
9872    }
9873
9874    async fn wait_for_group_mailbox_full_events(runtime: &ShardRuntime, expected: u64) {
9875        for _ in 0..100 {
9876            if runtime.metrics().snapshot().group_mailbox_full_events == expected {
9877                return;
9878            }
9879            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9880        }
9881        panic!(
9882            "expected {expected} group mailbox full events, got {}",
9883            runtime.metrics().snapshot().group_mailbox_full_events
9884        );
9885    }
9886
9887    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
9888    async fn repeated_appends_to_one_stream_are_ordered() {
9889        let runtime = runtime(4, 32);
9890        let stream = BucketStreamId::new("benchcmp", "one-stream");
9891        create_stream(&runtime, &stream).await;
9892        for index in 0..100 {
9893            let response = runtime
9894                .append(AppendRequest::new(stream.clone(), 7))
9895                .await
9896                .expect("append");
9897            assert_eq!(response.start_offset, index * 7);
9898            assert_eq!(response.next_offset, (index + 1) * 7);
9899            assert_eq!(response.stream_append_count, index + 1);
9900        }
9901    }
9902
9903    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
9904    async fn independent_streams_reach_all_cores_and_many_groups() {
9905        let runtime = runtime(4, 64);
9906        let mut tasks = Vec::new();
9907        for index in 0..4096 {
9908            let runtime = runtime.clone();
9909            tasks.push(tokio::spawn(async move {
9910                let stream = BucketStreamId::new("benchcmp", format!("stream-{index}"));
9911                create_stream(&runtime, &stream).await;
9912                runtime
9913                    .append(AppendRequest::new(stream, 1))
9914                    .await
9915                    .expect("append")
9916            }));
9917        }
9918
9919        for task in tasks {
9920            let response = task.await.expect("task");
9921            assert_eq!(response.start_offset, 0);
9922            assert_eq!(response.next_offset, 1);
9923        }
9924
9925        let snapshot = runtime.metrics().snapshot();
9926        assert_eq!(snapshot.accepted_appends, 4096);
9927        assert!(snapshot.per_core_appends.iter().all(|value| *value > 0));
9928        let active_groups = snapshot
9929            .per_group_appends
9930            .iter()
9931            .filter(|value| **value > 0)
9932            .count();
9933        assert!(active_groups > 48, "active_groups={active_groups}");
9934    }
9935
9936    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
9937    async fn empty_append_is_rejected_before_routing() {
9938        let runtime = runtime(2, 8);
9939        let err = runtime
9940            .append(AppendRequest::new(BucketStreamId::new("b", "s"), 0))
9941            .await
9942            .expect_err("empty append rejected");
9943        assert_eq!(err, RuntimeError::EmptyAppend);
9944        assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
9945    }
9946
9947    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
9948    async fn append_batch_routes_once_and_applies_each_payload_on_owner_core() {
9949        let runtime = runtime(2, 8);
9950        let stream = BucketStreamId::new("benchcmp", "batch-runtime");
9951        let owner_core = usize::from(runtime.locate(&stream).core_id.0);
9952        let owner_group =
9953            usize::try_from(runtime.locate(&stream).raft_group_id.0).expect("u32 fits usize");
9954
9955        create_stream(&runtime, &stream).await;
9956        let response = runtime
9957            .append_batch(AppendBatchRequest::new(
9958                stream.clone(),
9959                vec![b"ab".to_vec(), b"c".to_vec(), b"def".to_vec()],
9960            ))
9961            .await
9962            .expect("append batch");
9963        assert_eq!(response.items.len(), 3);
9964        assert_eq!(response.items[0].as_ref().expect("first").start_offset, 0);
9965        assert_eq!(response.items[1].as_ref().expect("second").start_offset, 2);
9966        assert_eq!(response.items[2].as_ref().expect("third").start_offset, 3);
9967
9968        let read = runtime
9969            .read_stream(ReadStreamRequest {
9970                stream_id: stream.clone(),
9971                offset: 0,
9972                max_len: 16,
9973                now_ms: 0,
9974            })
9975            .await
9976            .expect("read");
9977        assert_eq!(read.payload, b"abcdef");
9978
9979        let snapshot = runtime.metrics().snapshot();
9980        assert_eq!(snapshot.accepted_appends, 3);
9981        assert_eq!(snapshot.applied_mutations, 4);
9982        assert_eq!(snapshot.routed_requests, 3);
9983        assert_eq!(snapshot.per_core_appends[owner_core], 3);
9984        assert_eq!(snapshot.per_group_appends[owner_group], 3);
9985        assert_eq!(snapshot.per_core_applied_mutations[owner_core], 4);
9986        assert_eq!(snapshot.per_group_applied_mutations[owner_group], 4);
9987    }
9988
9989    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
9990    async fn append_batch_reports_item_errors_without_stopping_later_payloads() {
9991        let runtime = runtime(2, 8);
9992        let stream = BucketStreamId::new("benchcmp", "batch-partial");
9993        create_stream(&runtime, &stream).await;
9994
9995        let response = runtime
9996            .append_batch(AppendBatchRequest::new(
9997                stream.clone(),
9998                vec![b"a".to_vec(), Vec::new(), b"b".to_vec()],
9999            ))
10000            .await
10001            .expect("append batch");
10002        assert!(response.items[0].is_ok());
10003        assert!(response.items[1].is_err());
10004        assert!(response.items[2].is_ok());
10005        assert_eq!(response.items[0].as_ref().expect("first").start_offset, 0);
10006        assert_eq!(response.items[2].as_ref().expect("third").start_offset, 1);
10007
10008        let read = runtime
10009            .read_stream(ReadStreamRequest {
10010                stream_id: stream,
10011                offset: 0,
10012                max_len: 16,
10013                now_ms: 0,
10014            })
10015            .await
10016            .expect("read");
10017        assert_eq!(read.payload, b"ab");
10018
10019        let snapshot = runtime.metrics().snapshot();
10020        assert_eq!(snapshot.accepted_appends, 2);
10021        assert_eq!(snapshot.applied_mutations, 3);
10022        assert_eq!(snapshot.routed_requests, 3);
10023    }
10024
10025    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10026    async fn producer_duplicate_append_returns_prior_offsets_without_mutating_metrics() {
10027        let runtime = runtime(2, 8);
10028        let stream = BucketStreamId::new("benchcmp", "producer-runtime");
10029        create_stream(&runtime, &stream).await;
10030
10031        let mut first = AppendRequest::from_bytes(stream.clone(), b"a".to_vec());
10032        first.producer = Some(producer("writer-1", 0, 0));
10033        let first = runtime.append(first).await.expect("first append");
10034        assert_eq!(first.start_offset, 0);
10035        assert_eq!(first.next_offset, 1);
10036        assert_eq!(first.stream_append_count, 1);
10037        assert!(!first.deduplicated);
10038
10039        let mut duplicate = AppendRequest::from_bytes(stream.clone(), b"ignored".to_vec());
10040        duplicate.producer = Some(producer("writer-1", 0, 0));
10041        let duplicate = runtime.append(duplicate).await.expect("duplicate append");
10042        assert_eq!(duplicate.start_offset, 0);
10043        assert_eq!(duplicate.next_offset, 1);
10044        assert_eq!(duplicate.stream_append_count, 1);
10045        assert!(duplicate.deduplicated);
10046
10047        let mut next = AppendRequest::from_bytes(stream.clone(), b"b".to_vec());
10048        next.producer = Some(producer("writer-1", 0, 1));
10049        let next = runtime.append(next).await.expect("next append");
10050        assert_eq!(next.start_offset, 1);
10051        assert_eq!(next.next_offset, 2);
10052        assert_eq!(next.stream_append_count, 2);
10053        assert!(!next.deduplicated);
10054
10055        let read = runtime
10056            .read_stream(ReadStreamRequest {
10057                stream_id: stream,
10058                offset: 0,
10059                max_len: 16,
10060                now_ms: 0,
10061            })
10062            .await
10063            .expect("read");
10064        assert_eq!(read.payload, b"ab");
10065
10066        let metrics = runtime.metrics().snapshot();
10067        assert_eq!(metrics.accepted_appends, 2);
10068        assert_eq!(metrics.applied_mutations, 3);
10069        assert_eq!(metrics.routed_requests, 5);
10070    }
10071
10072    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10073    async fn producer_duplicate_append_batch_returns_prior_offsets_without_mutating_metrics() {
10074        let runtime = runtime(2, 8);
10075        let stream = BucketStreamId::new("benchcmp", "producer-batch-runtime");
10076        create_stream(&runtime, &stream).await;
10077
10078        let mut first =
10079            AppendBatchRequest::new(stream.clone(), vec![b"ab".to_vec(), b"c".to_vec()]);
10080        first.producer = Some(producer("writer-1", 0, 0));
10081        let first = runtime.append_batch(first).await.expect("first batch");
10082        assert_eq!(first.items.len(), 2);
10083        let first_item = first.items[0].as_ref().expect("first item");
10084        let second_item = first.items[1].as_ref().expect("second item");
10085        assert_eq!(first_item.start_offset, 0);
10086        assert_eq!(first_item.next_offset, 2);
10087        assert_eq!(first_item.stream_append_count, 1);
10088        assert!(!first_item.deduplicated);
10089        assert_eq!(second_item.start_offset, 2);
10090        assert_eq!(second_item.next_offset, 3);
10091        assert_eq!(second_item.stream_append_count, 2);
10092        assert!(!second_item.deduplicated);
10093
10094        let mut duplicate =
10095            AppendBatchRequest::new(stream.clone(), vec![b"ignored".to_vec(), b"body".to_vec()]);
10096        duplicate.producer = Some(producer("writer-1", 0, 0));
10097        let duplicate = runtime
10098            .append_batch(duplicate)
10099            .await
10100            .expect("duplicate batch");
10101        assert_eq!(duplicate.items.len(), 2);
10102        assert!(
10103            duplicate
10104                .items
10105                .iter()
10106                .all(|item| { item.as_ref().expect("deduplicated item").deduplicated })
10107        );
10108        assert_eq!(
10109            duplicate.items[0]
10110                .as_ref()
10111                .expect("first duplicate")
10112                .start_offset,
10113            0
10114        );
10115        assert_eq!(
10116            duplicate.items[1]
10117                .as_ref()
10118                .expect("second duplicate")
10119                .next_offset,
10120            3
10121        );
10122
10123        let mut next = AppendBatchRequest::new(stream.clone(), vec![b"d".to_vec()]);
10124        next.producer = Some(producer("writer-1", 0, 1));
10125        let next = runtime.append_batch(next).await.expect("next batch");
10126        let next_item = next.items[0].as_ref().expect("next item");
10127        assert_eq!(next_item.start_offset, 3);
10128        assert_eq!(next_item.next_offset, 4);
10129        assert_eq!(next_item.stream_append_count, 3);
10130        assert!(!next_item.deduplicated);
10131
10132        let read = runtime
10133            .read_stream(ReadStreamRequest {
10134                stream_id: stream,
10135                offset: 0,
10136                max_len: 16,
10137                now_ms: 0,
10138            })
10139            .await
10140            .expect("read");
10141        assert_eq!(read.payload, b"abcd");
10142
10143        let metrics = runtime.metrics().snapshot();
10144        assert_eq!(metrics.accepted_appends, 3);
10145        assert_eq!(metrics.applied_mutations, 4);
10146        assert_eq!(metrics.routed_requests, 5);
10147    }
10148
10149    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10150    async fn snapshot_group_routes_to_owner_core_and_captures_only_group_state() {
10151        let runtime = runtime(2, 8);
10152        let first_stream = BucketStreamId::new("benchcmp", "snapshot-first");
10153        let first_placement = runtime.locate(&first_stream);
10154        let second_stream = (0..512)
10155            .map(|index| BucketStreamId::new("benchcmp", format!("snapshot-other-{index}")))
10156            .find(|stream| runtime.locate(stream).core_id != first_placement.core_id)
10157            .expect("stream on another core");
10158
10159        create_stream(&runtime, &first_stream).await;
10160        runtime
10161            .append(AppendRequest::from_bytes(
10162                first_stream.clone(),
10163                b"first".to_vec(),
10164            ))
10165            .await
10166            .expect("append first stream");
10167        create_stream(&runtime, &second_stream).await;
10168        runtime
10169            .append(AppendRequest::from_bytes(
10170                second_stream.clone(),
10171                b"second".to_vec(),
10172            ))
10173            .await
10174            .expect("append second stream");
10175
10176        let snapshot = runtime
10177            .snapshot_group(first_placement.raft_group_id)
10178            .await
10179            .expect("snapshot group");
10180        assert_eq!(snapshot.placement, first_placement);
10181        assert_eq!(snapshot.group_commit_index, 2);
10182        assert_eq!(snapshot.stream_snapshot.buckets, vec!["benchcmp"]);
10183        assert_eq!(
10184            snapshot
10185                .stream_snapshot
10186                .streams
10187                .iter()
10188                .map(|entry| entry.metadata.stream_id.clone())
10189                .collect::<Vec<_>>(),
10190            vec![first_stream.clone()]
10191        );
10192
10193        let restored =
10194            StreamStateMachine::restore(snapshot.stream_snapshot).expect("restore group snapshot");
10195        let read = restored
10196            .read(&first_stream, 0, 16)
10197            .expect("read restored snapshot");
10198        assert_eq!(read.payload, b"first");
10199        assert_eq!(read.next_offset, 5);
10200        assert!(restored.read(&second_stream, 0, 16).is_err());
10201
10202        let metrics = runtime.metrics().snapshot();
10203        assert_eq!(metrics.routed_requests, 5);
10204        assert_eq!(
10205            metrics.per_core_routed_requests[usize::from(first_placement.core_id.0)],
10206            3
10207        );
10208    }
10209
10210    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10211    async fn snapshot_group_rejects_out_of_range_group_before_routing() {
10212        let runtime = runtime(2, 8);
10213        let err = runtime
10214            .snapshot_group(RaftGroupId(8))
10215            .await
10216            .expect_err("invalid group");
10217        assert_eq!(
10218            err,
10219            RuntimeError::InvalidRaftGroup {
10220                raft_group_id: RaftGroupId(8),
10221                raft_group_count: 8,
10222            }
10223        );
10224        assert_eq!(runtime.metrics().snapshot().routed_requests, 0);
10225    }
10226
10227    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10228    async fn install_group_snapshot_restores_group_state_and_append_counts() {
10229        let source = runtime(2, 8);
10230        let stream = BucketStreamId::new("benchcmp", "install-snapshot");
10231        let placement = source.locate(&stream);
10232        create_stream(&source, &stream).await;
10233        source
10234            .append(AppendRequest::from_bytes(stream.clone(), b"ab".to_vec()))
10235            .await
10236            .expect("append first");
10237        source
10238            .append(AppendRequest::from_bytes(stream.clone(), b"cd".to_vec()))
10239            .await
10240            .expect("append second");
10241
10242        let snapshot = source
10243            .snapshot_group(placement.raft_group_id)
10244            .await
10245            .expect("snapshot group");
10246        assert_eq!(snapshot.group_commit_index, 3);
10247        assert_eq!(
10248            snapshot.stream_append_counts,
10249            vec![StreamAppendCount {
10250                stream_id: stream.clone(),
10251                append_count: 2,
10252            }]
10253        );
10254
10255        let target = runtime(2, 8);
10256        target
10257            .install_group_snapshot(snapshot)
10258            .await
10259            .expect("install snapshot");
10260
10261        let read = target
10262            .read_stream(ReadStreamRequest {
10263                stream_id: stream.clone(),
10264                offset: 0,
10265                max_len: 16,
10266                now_ms: 0,
10267            })
10268            .await
10269            .expect("read restored stream");
10270        assert_eq!(read.placement, placement);
10271        assert_eq!(read.payload, b"abcd");
10272        assert_eq!(read.next_offset, 4);
10273
10274        let appended = target
10275            .append(AppendRequest::from_bytes(stream, b"ef".to_vec()))
10276            .await
10277            .expect("append after restore");
10278        assert_eq!(appended.start_offset, 4);
10279        assert_eq!(appended.next_offset, 6);
10280        assert_eq!(appended.stream_append_count, 3);
10281        assert_eq!(appended.group_commit_index, 4);
10282    }
10283
10284    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10285    async fn install_group_snapshot_rejects_mismatched_placement_before_routing() {
10286        let runtime = runtime(2, 8);
10287        let snapshot = GroupSnapshot {
10288            placement: ShardPlacement {
10289                core_id: CoreId(1),
10290                shard_id: ShardId(0),
10291                raft_group_id: RaftGroupId(0),
10292            },
10293            group_commit_index: 0,
10294            stream_snapshot: StreamSnapshot {
10295                buckets: Vec::new(),
10296                streams: Vec::new(),
10297            },
10298            stream_append_counts: Vec::new(),
10299        };
10300
10301        let err = runtime
10302            .install_group_snapshot(snapshot)
10303            .await
10304            .expect_err("mismatched placement rejected");
10305        assert_eq!(
10306            err,
10307            RuntimeError::SnapshotPlacementMismatch {
10308                expected: ShardPlacement {
10309                    core_id: CoreId(0),
10310                    shard_id: ShardId(0),
10311                    raft_group_id: RaftGroupId(0),
10312                },
10313                actual: ShardPlacement {
10314                    core_id: CoreId(1),
10315                    shard_id: ShardId(0),
10316                    raft_group_id: RaftGroupId(0),
10317                },
10318            }
10319        );
10320        assert_eq!(runtime.metrics().snapshot().routed_requests, 0);
10321    }
10322
10323    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10324    async fn mailbox_snapshot_reports_per_core_depths_and_capacities() {
10325        let runtime = ShardRuntime::spawn(RuntimeConfig {
10326            core_count: 3,
10327            raft_group_count: 9,
10328            mailbox_capacity: 7,
10329            threading: RuntimeThreading::HostedTokio,
10330            cold_max_hot_bytes_per_group: None,
10331            live_read_max_waiters_per_core: Some(65_536),
10332        })
10333        .expect("spawn runtime");
10334
10335        let snapshot = runtime.mailbox_snapshot();
10336        assert_eq!(snapshot.depths, vec![0, 0, 0]);
10337        assert_eq!(snapshot.capacities, vec![7, 7, 7]);
10338    }
10339
10340    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10341    async fn runtime_metrics_track_owner_core_routing_and_mailbox_wait() {
10342        let runtime = runtime(2, 8);
10343        let stream = BucketStreamId::new("benchcmp", "routing-metrics");
10344        let owner_core = usize::from(runtime.locate(&stream).core_id.0);
10345
10346        create_stream(&runtime, &stream).await;
10347        runtime
10348            .append(AppendRequest::from_bytes(stream.clone(), b"hello".to_vec()))
10349            .await
10350            .expect("append");
10351        runtime
10352            .read_stream(ReadStreamRequest {
10353                stream_id: stream.clone(),
10354                offset: 0,
10355                max_len: 16,
10356                now_ms: 0,
10357            })
10358            .await
10359            .expect("read");
10360
10361        let snapshot = runtime.metrics().snapshot();
10362        assert_eq!(snapshot.accepted_appends, 1);
10363        assert_eq!(snapshot.applied_mutations, 2);
10364        assert_eq!(snapshot.routed_requests, 3);
10365        assert_eq!(snapshot.per_core_routed_requests.len(), 2);
10366        assert_eq!(snapshot.per_core_routed_requests[owner_core], 3);
10367        assert_eq!(snapshot.per_core_applied_mutations[owner_core], 2);
10368        assert_eq!(
10369            snapshot.per_group_applied_mutations
10370                [usize::try_from(runtime.locate(&stream).raft_group_id.0).expect("u32 fits usize")],
10371            2
10372        );
10373        assert_eq!(
10374            snapshot.mutation_apply_ns,
10375            snapshot.per_core_mutation_apply_ns.iter().sum::<u64>()
10376        );
10377        assert_eq!(
10378            snapshot.mutation_apply_ns,
10379            snapshot.per_group_mutation_apply_ns.iter().sum::<u64>()
10380        );
10381        assert_eq!(
10382            snapshot.group_lock_wait_ns,
10383            snapshot.per_core_group_lock_wait_ns.iter().sum::<u64>()
10384        );
10385        assert_eq!(
10386            snapshot.group_lock_wait_ns,
10387            snapshot.per_group_group_lock_wait_ns.iter().sum::<u64>()
10388        );
10389        assert_eq!(
10390            snapshot.group_engine_exec_ns,
10391            snapshot.per_core_group_engine_exec_ns.iter().sum::<u64>()
10392        );
10393        assert_eq!(
10394            snapshot.group_engine_exec_ns,
10395            snapshot.per_group_group_engine_exec_ns.iter().sum::<u64>()
10396        );
10397        assert_eq!(
10398            snapshot.raft_write_many_batches,
10399            snapshot
10400                .per_core_raft_write_many_batches
10401                .iter()
10402                .sum::<u64>()
10403        );
10404        assert_eq!(
10405            snapshot.raft_write_many_batches,
10406            snapshot
10407                .per_group_raft_write_many_batches
10408                .iter()
10409                .sum::<u64>()
10410        );
10411        assert_eq!(
10412            snapshot.raft_write_many_commands,
10413            snapshot
10414                .per_core_raft_write_many_commands
10415                .iter()
10416                .sum::<u64>()
10417        );
10418        assert_eq!(
10419            snapshot.raft_write_many_commands,
10420            snapshot
10421                .per_group_raft_write_many_commands
10422                .iter()
10423                .sum::<u64>()
10424        );
10425        assert_eq!(
10426            snapshot.raft_write_many_logical_commands,
10427            snapshot
10428                .per_core_raft_write_many_logical_commands
10429                .iter()
10430                .sum::<u64>()
10431        );
10432        assert_eq!(
10433            snapshot.raft_write_many_logical_commands,
10434            snapshot
10435                .per_group_raft_write_many_logical_commands
10436                .iter()
10437                .sum::<u64>()
10438        );
10439        assert_eq!(
10440            snapshot.raft_write_many_responses,
10441            snapshot
10442                .per_core_raft_write_many_responses
10443                .iter()
10444                .sum::<u64>()
10445        );
10446        assert_eq!(
10447            snapshot.raft_write_many_responses,
10448            snapshot
10449                .per_group_raft_write_many_responses
10450                .iter()
10451                .sum::<u64>()
10452        );
10453        assert_eq!(
10454            snapshot.raft_write_many_submit_ns,
10455            snapshot
10456                .per_core_raft_write_many_submit_ns
10457                .iter()
10458                .sum::<u64>()
10459        );
10460        assert_eq!(
10461            snapshot.raft_write_many_submit_ns,
10462            snapshot
10463                .per_group_raft_write_many_submit_ns
10464                .iter()
10465                .sum::<u64>()
10466        );
10467        assert_eq!(
10468            snapshot.raft_write_many_response_ns,
10469            snapshot
10470                .per_core_raft_write_many_response_ns
10471                .iter()
10472                .sum::<u64>()
10473        );
10474        assert_eq!(
10475            snapshot.raft_write_many_response_ns,
10476            snapshot
10477                .per_group_raft_write_many_response_ns
10478                .iter()
10479                .sum::<u64>()
10480        );
10481        assert_eq!(
10482            snapshot.raft_apply_entries,
10483            snapshot.per_core_raft_apply_entries.iter().sum::<u64>()
10484        );
10485        assert_eq!(
10486            snapshot.raft_apply_entries,
10487            snapshot.per_group_raft_apply_entries.iter().sum::<u64>()
10488        );
10489        assert_eq!(
10490            snapshot.raft_apply_ns,
10491            snapshot.per_core_raft_apply_ns.iter().sum::<u64>()
10492        );
10493        assert_eq!(
10494            snapshot.raft_apply_ns,
10495            snapshot.per_group_raft_apply_ns.iter().sum::<u64>()
10496        );
10497        assert_eq!(
10498            snapshot.mailbox_send_wait_ns,
10499            snapshot.per_core_mailbox_send_wait_ns.iter().sum::<u64>()
10500        );
10501    }
10502
10503    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10504    async fn append_before_stream_setup_uses_stream_state_machine_error() {
10505        let runtime = runtime(2, 8);
10506        let stream = BucketStreamId::new("benchcmp", "missing-stream");
10507        let placement = runtime.locate(&stream);
10508        let err = runtime
10509            .append(AppendRequest::new(stream, 1))
10510            .await
10511            .expect_err("missing stream rejected");
10512
10513        match err {
10514            RuntimeError::GroupEngine {
10515                core_id,
10516                raft_group_id,
10517                message,
10518                ..
10519            } => {
10520                assert_eq!(core_id, placement.core_id);
10521                assert_eq!(raft_group_id, placement.raft_group_id);
10522                assert!(message.contains("BucketNotFound"), "message={message}");
10523            }
10524            other => panic!("expected group engine error, got {other:?}"),
10525        }
10526        assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
10527    }
10528
10529    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10530    async fn create_stream_is_routed_and_idempotent_for_matching_metadata() {
10531        let runtime = runtime(2, 8);
10532        let stream = BucketStreamId::new("benchcmp", "create-stream");
10533        let placement = runtime.locate(&stream);
10534
10535        let created = create_stream(&runtime, &stream).await;
10536        assert_eq!(created.placement, placement);
10537        assert_eq!(created.next_offset, 0);
10538        assert!(!created.closed);
10539        assert!(!created.already_exists);
10540
10541        let existing = create_stream(&runtime, &stream).await;
10542        assert_eq!(existing.placement, placement);
10543        assert_eq!(existing.next_offset, 0);
10544        assert!(!existing.closed);
10545        assert!(existing.already_exists);
10546    }
10547
10548    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10549    async fn head_stream_reflects_append_and_closed_state_on_owner_group() {
10550        let runtime = runtime(2, 8);
10551        let stream = BucketStreamId::new("benchcmp", "head-stream");
10552        let placement = runtime.locate(&stream);
10553        runtime
10554            .create_stream(CreateStreamRequest::new(stream.clone(), "text/plain"))
10555            .await
10556            .expect("create stream");
10557
10558        let mut append = AppendRequest::new(stream.clone(), 3);
10559        append.content_type = "text/plain".to_owned();
10560        append.close_after = true;
10561        let response = runtime.append(append).await.expect("append");
10562        assert_eq!(response.start_offset, 0);
10563        assert_eq!(response.next_offset, 3);
10564
10565        let head = runtime
10566            .head_stream(HeadStreamRequest {
10567                stream_id: stream,
10568                now_ms: 0,
10569            })
10570            .await
10571            .expect("head stream");
10572        assert_eq!(head.placement, placement);
10573        assert_eq!(head.content_type, "text/plain");
10574        assert_eq!(head.tail_offset, 3);
10575        assert!(head.closed);
10576    }
10577
10578    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10579    async fn read_stream_returns_payload_slice_from_owner_group() {
10580        let runtime = runtime(2, 8);
10581        let stream = BucketStreamId::new("benchcmp", "read-stream");
10582        let placement = runtime.locate(&stream);
10583        create_stream(&runtime, &stream).await;
10584        runtime
10585            .append(AppendRequest::from_bytes(
10586                stream.clone(),
10587                b"abcdefg".to_vec(),
10588            ))
10589            .await
10590            .expect("append");
10591
10592        let read = runtime
10593            .read_stream(ReadStreamRequest {
10594                stream_id: stream.clone(),
10595                offset: 2,
10596                max_len: 3,
10597                now_ms: 0,
10598            })
10599            .await
10600            .expect("read stream");
10601        assert_eq!(read.placement, placement);
10602        assert_eq!(read.offset, 2);
10603        assert_eq!(read.next_offset, 5);
10604        assert_eq!(read.payload, b"cde");
10605        assert!(!read.up_to_date);
10606        assert!(!read.closed);
10607
10608        let tail = runtime
10609            .read_stream(ReadStreamRequest {
10610                stream_id: stream,
10611                offset: 7,
10612                max_len: 16,
10613                now_ms: 0,
10614            })
10615            .await
10616            .expect("tail read");
10617        assert_eq!(tail.next_offset, 7);
10618        assert!(tail.payload.is_empty());
10619        assert!(tail.up_to_date);
10620    }
10621
10622    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10623    async fn flush_cold_publishes_chunk_metadata_on_owner_group() {
10624        let runtime = runtime(2, 8);
10625        let stream = BucketStreamId::new("benchcmp", "cold-runtime");
10626        let placement = runtime.locate(&stream);
10627        create_stream(&runtime, &stream).await;
10628        runtime
10629            .append(AppendRequest::from_bytes(
10630                stream.clone(),
10631                b"abcdef".to_vec(),
10632            ))
10633            .await
10634            .expect("append");
10635
10636        let flushed = runtime
10637            .flush_cold(FlushColdRequest {
10638                stream_id: stream.clone(),
10639                chunk: ColdChunkRef {
10640                    start_offset: 0,
10641                    end_offset: 4,
10642                    s3_path: "s3://bucket/cold-runtime/000000".to_owned(),
10643                    object_size: 4,
10644                },
10645            })
10646            .await
10647            .expect("flush cold");
10648        assert_eq!(flushed.placement, placement);
10649        assert_eq!(flushed.hot_start_offset, 4);
10650
10651        let hot = runtime
10652            .read_stream(ReadStreamRequest {
10653                stream_id: stream.clone(),
10654                offset: 4,
10655                max_len: 16,
10656                now_ms: 0,
10657            })
10658            .await
10659            .expect("hot read");
10660        assert_eq!(hot.payload, b"ef");
10661
10662        let err = runtime
10663            .read_stream(ReadStreamRequest {
10664                stream_id: stream,
10665                offset: 0,
10666                max_len: 16,
10667                now_ms: 0,
10668            })
10669            .await
10670            .expect_err("cold read needs store");
10671        match err {
10672            RuntimeError::GroupEngine {
10673                message,
10674                next_offset: Some(6),
10675                ..
10676            } if message.contains("InvalidColdFlush") => {}
10677            other => panic!("expected cold read error, got {other:?}"),
10678        }
10679    }
10680
10681    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10682    async fn flush_cold_once_uploads_outside_group_and_reads_back() {
10683        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10684        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10685            RuntimeConfig::new(2, 8),
10686            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10687            Some(cold_store),
10688        )
10689        .expect("spawn runtime");
10690        let stream = BucketStreamId::new("benchcmp", "cold-once");
10691        create_stream(&runtime, &stream).await;
10692        runtime
10693            .append(AppendRequest::from_bytes(
10694                stream.clone(),
10695                b"abcdef".to_vec(),
10696            ))
10697            .await
10698            .expect("append");
10699
10700        let flushed = runtime
10701            .flush_cold_once(PlanColdFlushRequest {
10702                stream_id: stream.clone(),
10703                min_hot_bytes: 4,
10704                max_flush_bytes: 4,
10705            })
10706            .await
10707            .expect("flush once")
10708            .expect("candidate flushed");
10709        assert_eq!(flushed.hot_start_offset, 4);
10710        let metrics = runtime.metrics().snapshot();
10711        assert_eq!(metrics.cold_flush_uploads, 1);
10712        assert_eq!(metrics.cold_flush_upload_bytes, 4);
10713        assert_eq!(metrics.cold_flush_publishes, 1);
10714        assert_eq!(metrics.cold_flush_publish_bytes, 4);
10715        assert_eq!(metrics.cold_orphan_cleanup_attempts, 0);
10716
10717        let read = runtime
10718            .read_stream(ReadStreamRequest {
10719                stream_id: stream,
10720                offset: 0,
10721                max_len: 6,
10722                now_ms: 0,
10723            })
10724            .await
10725            .expect("read cold and hot");
10726        assert_eq!(read.payload, b"abcdef");
10727        assert_eq!(read.next_offset, 6);
10728    }
10729
10730    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10731    async fn flush_cold_group_batch_once_publishes_multiple_chunks() {
10732        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10733        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10734            RuntimeConfig::new(2, 8),
10735            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10736            Some(cold_store),
10737        )
10738        .expect("spawn runtime");
10739        let stream = BucketStreamId::new("benchcmp", "cold-batch");
10740        let placement = runtime.locate(&stream);
10741        create_stream(&runtime, &stream).await;
10742        runtime
10743            .append(AppendRequest::from_bytes(stream.clone(), b"abcd".to_vec()))
10744            .await
10745            .expect("append");
10746
10747        let flushed = runtime
10748            .flush_cold_group_batch_once(
10749                placement.raft_group_id,
10750                PlanGroupColdFlushRequest {
10751                    min_hot_bytes: 1,
10752                    max_flush_bytes: 1,
10753                },
10754                4,
10755            )
10756            .await
10757            .expect("flush batch");
10758        assert_eq!(flushed.len(), 4);
10759        assert!(
10760            flushed
10761                .iter()
10762                .all(|response| response.placement == placement)
10763        );
10764        assert_eq!(
10765            flushed
10766                .iter()
10767                .map(|response| response.hot_start_offset)
10768                .collect::<Vec<_>>(),
10769            vec![1, 2, 3, 0]
10770        );
10771
10772        let metrics = runtime.metrics().snapshot();
10773        assert_eq!(metrics.cold_flush_uploads, 4);
10774        assert_eq!(metrics.cold_flush_upload_bytes, 4);
10775        assert_eq!(metrics.cold_flush_publishes, 4);
10776        assert_eq!(metrics.cold_flush_publish_bytes, 4);
10777        assert_eq!(metrics.cold_hot_bytes, 0);
10778
10779        let snapshot = runtime
10780            .snapshot_group(placement.raft_group_id)
10781            .await
10782            .expect("snapshot group");
10783        let entry = snapshot
10784            .stream_snapshot
10785            .streams
10786            .iter()
10787            .find(|entry| entry.metadata.stream_id == stream)
10788            .expect("stream snapshot");
10789        assert_eq!(entry.cold_chunks.len(), 4);
10790        assert!(entry.payload.is_empty());
10791
10792        let read = runtime
10793            .read_stream(ReadStreamRequest {
10794                stream_id: stream,
10795                offset: 0,
10796                max_len: 4,
10797                now_ms: 0,
10798            })
10799            .await
10800            .expect("read cold chunks");
10801        assert_eq!(read.payload, b"abcd");
10802        assert_eq!(read.next_offset, 4);
10803    }
10804
10805    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10806    async fn stale_cold_flush_batch_after_delete_recreate_is_classified_for_cleanup() {
10807        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10808        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10809            RuntimeConfig::new(2, 8),
10810            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10811            Some(cold_store),
10812        )
10813        .expect("spawn runtime");
10814        let stream = BucketStreamId::new("benchcmp", "stale-cold-runtime");
10815        let placement = runtime.locate(&stream);
10816        create_stream(&runtime, &stream).await;
10817        runtime
10818            .append(AppendRequest::from_bytes(
10819                stream.clone(),
10820                b"abcdefghijklmnopqr".to_vec(),
10821            ))
10822            .await
10823            .expect("append old stream");
10824        let candidates = runtime
10825            .plan_next_cold_flush_batch(
10826                placement.raft_group_id,
10827                PlanGroupColdFlushRequest {
10828                    min_hot_bytes: 18,
10829                    max_flush_bytes: 18,
10830                },
10831                1,
10832            )
10833            .await
10834            .expect("plan candidate");
10835        assert_eq!(candidates.len(), 1);
10836
10837        runtime
10838            .delete_stream(DeleteStreamRequest {
10839                stream_id: stream.clone(),
10840            })
10841            .await
10842            .expect("delete old stream");
10843        create_stream(&runtime, &stream).await;
10844        runtime
10845            .append(AppendRequest::from_bytes(
10846                stream.clone(),
10847                b"abcdefghijklmnopq".to_vec(),
10848            ))
10849            .await
10850            .expect("append recreated stream");
10851
10852        let err = runtime
10853            .flush_cold_candidates_batch(candidates)
10854            .await
10855            .expect_err("stale candidate should fail publish");
10856        assert!(is_stale_cold_flush_candidate_error(&err));
10857        let metrics = runtime.metrics().snapshot();
10858        assert_eq!(metrics.cold_flush_uploads, 1);
10859        assert_eq!(metrics.cold_flush_publishes, 0);
10860        assert_eq!(metrics.cold_orphan_cleanup_attempts, 1);
10861        assert_eq!(metrics.cold_orphan_cleanup_errors, 0);
10862
10863        let read = runtime
10864            .read_stream(ReadStreamRequest {
10865                stream_id: stream,
10866                offset: 0,
10867                max_len: 32,
10868                now_ms: 0,
10869            })
10870            .await
10871            .expect("read recreated stream");
10872        assert_eq!(read.payload, b"abcdefghijklmnopq");
10873        assert_eq!(read.next_offset, 17);
10874    }
10875
10876    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10877    async fn cold_write_admission_rejects_new_bytes_until_flush_catches_up() {
10878        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10879        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10880            RuntimeConfig::new(2, 8).with_cold_max_hot_bytes_per_group(Some(4)),
10881            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10882            Some(cold_store),
10883        )
10884        .expect("spawn runtime");
10885        let stream = BucketStreamId::new("benchcmp", "cold-admission");
10886        create_stream(&runtime, &stream).await;
10887        runtime
10888            .append(AppendRequest::from_bytes(stream.clone(), b"abcd".to_vec()))
10889            .await
10890            .expect("append below limit");
10891
10892        let err = runtime
10893            .append(AppendRequest::from_bytes(stream.clone(), b"e".to_vec()))
10894            .await
10895            .expect_err("append should be backpressured");
10896        match err {
10897            RuntimeError::GroupEngine { message, .. } if message.contains("ColdBackpressure") => {}
10898            other => panic!("expected cold backpressure, got {other:?}"),
10899        }
10900        let metrics = runtime.metrics().snapshot();
10901        let group_index = usize::try_from(runtime.locate(&stream).raft_group_id.0).unwrap();
10902        assert_eq!(metrics.accepted_appends, 1);
10903        assert_eq!(metrics.cold_hot_bytes, 4);
10904        assert_eq!(metrics.per_group_cold_hot_bytes[group_index], 4);
10905        assert_eq!(metrics.cold_hot_group_bytes_max, 4);
10906        assert_eq!(metrics.cold_hot_stream_bytes_max, 4);
10907        assert_eq!(metrics.cold_backpressure_events, 1);
10908        assert_eq!(metrics.per_group_cold_backpressure_events[group_index], 1);
10909        assert_eq!(metrics.cold_backpressure_bytes, 1);
10910
10911        runtime
10912            .flush_cold_once(PlanColdFlushRequest {
10913                stream_id: stream.clone(),
10914                min_hot_bytes: 4,
10915                max_flush_bytes: 4,
10916            })
10917            .await
10918            .expect("flush once")
10919            .expect("candidate flushed");
10920        assert_eq!(runtime.metrics().snapshot().cold_hot_bytes, 0);
10921
10922        runtime
10923            .append(AppendRequest::from_bytes(stream.clone(), b"e".to_vec()))
10924            .await
10925            .expect("append after flush");
10926        let read = runtime
10927            .read_stream(ReadStreamRequest {
10928                stream_id: stream,
10929                offset: 0,
10930                max_len: 5,
10931                now_ms: 0,
10932            })
10933            .await
10934            .expect("read cold and hot");
10935        assert_eq!(read.payload, b"abcde");
10936    }
10937
10938    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10939    async fn cold_write_admission_rejects_append_batch_without_partial_mutation() {
10940        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10941        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10942            RuntimeConfig::new(2, 8).with_cold_max_hot_bytes_per_group(Some(4)),
10943            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10944            Some(cold_store),
10945        )
10946        .expect("spawn runtime");
10947        let stream = BucketStreamId::new("benchcmp", "cold-admission-batch");
10948        create_stream(&runtime, &stream).await;
10949        runtime
10950            .append(AppendRequest::from_bytes(stream.clone(), b"abc".to_vec()))
10951            .await
10952            .expect("append below limit");
10953
10954        let err = runtime
10955            .append_batch(AppendBatchRequest::new(
10956                stream.clone(),
10957                vec![b"d".to_vec(), b"e".to_vec()],
10958            ))
10959            .await
10960            .expect_err("batch should be backpressured");
10961        match err {
10962            RuntimeError::GroupEngine { message, .. } if message.contains("ColdBackpressure") => {}
10963            other => panic!("expected cold backpressure, got {other:?}"),
10964        }
10965        let read = runtime
10966            .read_stream(ReadStreamRequest {
10967                stream_id: stream.clone(),
10968                offset: 0,
10969                max_len: 8,
10970                now_ms: 0,
10971            })
10972            .await
10973            .expect("read");
10974        assert_eq!(read.payload, b"abc");
10975        let metrics = runtime.metrics().snapshot();
10976        assert_eq!(metrics.accepted_appends, 1);
10977        assert_eq!(metrics.cold_backpressure_events, 1);
10978        assert_eq!(metrics.cold_backpressure_bytes, 2);
10979    }
10980
10981    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10982    async fn flush_cold_group_once_selects_stream_inside_owner_group() {
10983        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10984        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10985            RuntimeConfig::new(2, 8),
10986            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10987            Some(cold_store),
10988        )
10989        .expect("spawn runtime");
10990        let group_id = RaftGroupId(3);
10991        let stream = stream_on_group(&runtime, group_id, "cold-group");
10992        create_stream(&runtime, &stream).await;
10993        runtime
10994            .append(AppendRequest::from_bytes(
10995                stream.clone(),
10996                b"abcdef".to_vec(),
10997            ))
10998            .await
10999            .expect("append");
11000
11001        let flushed = runtime
11002            .flush_cold_group_once(
11003                group_id,
11004                PlanGroupColdFlushRequest {
11005                    min_hot_bytes: 4,
11006                    max_flush_bytes: 4,
11007                },
11008            )
11009            .await
11010            .expect("flush group")
11011            .expect("candidate flushed");
11012        assert_eq!(flushed.hot_start_offset, 4);
11013
11014        let read = runtime
11015            .read_stream(ReadStreamRequest {
11016                stream_id: stream,
11017                offset: 0,
11018                max_len: 6,
11019                now_ms: 0,
11020            })
11021            .await
11022            .expect("read cold and hot");
11023        assert_eq!(read.payload, b"abcdef");
11024    }
11025
11026    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11027    async fn flush_cold_all_groups_once_bounded_flushes_multiple_groups() {
11028        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
11029        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
11030            RuntimeConfig::new(2, 8),
11031            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
11032            Some(cold_store),
11033        )
11034        .expect("spawn runtime");
11035        let first = stream_on_group(&runtime, RaftGroupId(1), "cold-bounded-a");
11036        let second = stream_on_group(&runtime, RaftGroupId(6), "cold-bounded-b");
11037        for stream in [&first, &second] {
11038            create_stream(&runtime, stream).await;
11039            runtime
11040                .append(AppendRequest::from_bytes(
11041                    stream.clone(),
11042                    b"abcdef".to_vec(),
11043                ))
11044                .await
11045                .expect("append");
11046        }
11047
11048        let flushed = runtime
11049            .flush_cold_all_groups_once_bounded(
11050                PlanGroupColdFlushRequest {
11051                    min_hot_bytes: 4,
11052                    max_flush_bytes: 4,
11053                },
11054                2,
11055            )
11056            .await
11057            .expect("flush all bounded");
11058        assert_eq!(flushed, 2);
11059        let metrics = runtime.metrics().snapshot();
11060        assert_eq!(metrics.cold_flush_uploads, 2);
11061        assert_eq!(metrics.cold_flush_upload_bytes, 8);
11062        assert_eq!(metrics.cold_flush_publishes, 2);
11063        assert_eq!(metrics.cold_flush_publish_bytes, 8);
11064    }
11065
11066    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11067    async fn repeated_cold_flush_keeps_hot_bytes_bounded_while_writes_continue() {
11068        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
11069        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
11070            RuntimeConfig::new(2, 8).with_cold_max_hot_bytes_per_group(Some(16)),
11071            InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
11072            Some(cold_store),
11073        )
11074        .expect("spawn runtime");
11075        let streams = [
11076            stream_on_group(&runtime, RaftGroupId(0), "cold-steady-a"),
11077            stream_on_group(&runtime, RaftGroupId(3), "cold-steady-b"),
11078            stream_on_group(&runtime, RaftGroupId(5), "cold-steady-c"),
11079            stream_on_group(&runtime, RaftGroupId(7), "cold-steady-d"),
11080        ];
11081        for stream in &streams {
11082            create_stream(&runtime, stream).await;
11083        }
11084
11085        let mut expected = Vec::new();
11086        for round in 0..8u8 {
11087            let payload = vec![b'a' + round; 4];
11088            expected.extend_from_slice(&payload);
11089            for stream in &streams {
11090                runtime
11091                    .append(AppendRequest::from_bytes(stream.clone(), payload.clone()))
11092                    .await
11093                    .expect("append while cold worker keeps up");
11094            }
11095
11096            let metrics_before_flush = runtime.metrics().snapshot();
11097            assert!(
11098                metrics_before_flush.cold_hot_bytes <= 64,
11099                "hot bytes should stay within one unflushed batch per group before flush: {}",
11100                metrics_before_flush.cold_hot_bytes
11101            );
11102
11103            let flushed = runtime
11104                .flush_cold_all_groups_once_bounded(
11105                    PlanGroupColdFlushRequest {
11106                        min_hot_bytes: 4,
11107                        max_flush_bytes: 4,
11108                    },
11109                    streams.len(),
11110                )
11111                .await
11112                .expect("flush all bounded");
11113            assert_eq!(flushed, streams.len());
11114            let metrics_after_flush = runtime.metrics().snapshot();
11115            assert_eq!(
11116                metrics_after_flush.cold_hot_bytes, 0,
11117                "all newly appended bytes should be offloaded after round {round}"
11118            );
11119            assert_eq!(
11120                metrics_after_flush.cold_flush_uploads,
11121                u64::try_from((usize::from(round) + 1) * streams.len()).expect("count fits u64")
11122            );
11123            assert_eq!(metrics_after_flush.cold_orphan_cleanup_attempts, 0);
11124            assert_eq!(metrics_after_flush.cold_backpressure_events, 0);
11125        }
11126
11127        for stream in streams {
11128            let read = runtime
11129                .read_stream(ReadStreamRequest {
11130                    stream_id: stream,
11131                    offset: 0,
11132                    max_len: expected.len(),
11133                    now_ms: 0,
11134                })
11135                .await
11136                .expect("read cold-backed stream");
11137            assert_eq!(read.payload, expected);
11138            assert_eq!(read.next_offset, u64::try_from(expected.len()).unwrap());
11139        }
11140    }
11141
11142    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11143    async fn wait_read_stream_completes_after_owner_append() {
11144        let runtime = runtime(2, 8);
11145        let stream = BucketStreamId::new("benchcmp", "wait-read");
11146        create_stream(&runtime, &stream).await;
11147
11148        let wait = {
11149            let runtime = runtime.clone();
11150            let stream = stream.clone();
11151            tokio::spawn(async move {
11152                runtime
11153                    .wait_read_stream(ReadStreamRequest {
11154                        stream_id: stream,
11155                        offset: 0,
11156                        max_len: 16,
11157                        now_ms: 0,
11158                    })
11159                    .await
11160                    .expect("wait read")
11161            })
11162        };
11163        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
11164        runtime
11165            .append(AppendRequest::from_bytes(stream.clone(), b"hello".to_vec()))
11166            .await
11167            .expect("append");
11168
11169        let read = tokio::time::timeout(std::time::Duration::from_secs(1), wait)
11170            .await
11171            .expect("wait read timeout")
11172            .expect("wait task");
11173        assert_eq!(read.payload, b"hello");
11174        assert_eq!(read.next_offset, 5);
11175        assert!(read.up_to_date);
11176        assert!(!read.closed);
11177        assert_eq!(runtime.metrics().snapshot().live_read_waiters, 0);
11178    }
11179
11180    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11181    async fn wait_read_stream_completes_on_close_at_tail() {
11182        let runtime = runtime(2, 8);
11183        let stream = BucketStreamId::new("benchcmp", "wait-close");
11184        create_stream(&runtime, &stream).await;
11185
11186        let wait = {
11187            let runtime = runtime.clone();
11188            let stream = stream.clone();
11189            tokio::spawn(async move {
11190                runtime
11191                    .wait_read_stream(ReadStreamRequest {
11192                        stream_id: stream,
11193                        offset: 0,
11194                        max_len: 16,
11195                        now_ms: 0,
11196                    })
11197                    .await
11198                    .expect("wait read")
11199            })
11200        };
11201        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
11202        runtime
11203            .close_stream(CloseStreamRequest {
11204                stream_id: stream,
11205                stream_seq: None,
11206                producer: None,
11207                now_ms: 0,
11208            })
11209            .await
11210            .expect("close stream");
11211
11212        let read = tokio::time::timeout(std::time::Duration::from_secs(1), wait)
11213            .await
11214            .expect("wait read timeout")
11215            .expect("wait task");
11216        assert!(read.payload.is_empty());
11217        assert_eq!(read.next_offset, 0);
11218        assert!(read.up_to_date);
11219        assert!(read.closed);
11220        assert_eq!(runtime.metrics().snapshot().live_read_waiters, 0);
11221    }
11222
11223    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11224    async fn canceled_wait_read_stream_removes_owner_waiter() {
11225        let runtime = runtime(2, 8);
11226        let stream = BucketStreamId::new("benchcmp", "wait-cancel");
11227        create_stream(&runtime, &stream).await;
11228
11229        let wait = {
11230            let runtime = runtime.clone();
11231            let stream = stream.clone();
11232            tokio::spawn(async move {
11233                runtime
11234                    .wait_read_stream(ReadStreamRequest {
11235                        stream_id: stream,
11236                        offset: 0,
11237                        max_len: 16,
11238                        now_ms: 0,
11239                    })
11240                    .await
11241            })
11242        };
11243        wait_for_live_waiters(&runtime, 1).await;
11244        wait.abort();
11245        let _ = wait.await;
11246        wait_for_live_waiters(&runtime, 0).await;
11247    }
11248
11249    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11250    async fn live_read_waiter_limit_rejects_excess_waiters_on_owner_core() {
11251        let runtime = ShardRuntime::spawn(
11252            RuntimeConfig::new(1, 1).with_live_read_max_waiters_per_core(Some(1)),
11253        )
11254        .expect("spawn runtime");
11255        let stream = BucketStreamId::new("benchcmp", "wait-limit");
11256        create_stream(&runtime, &stream).await;
11257
11258        let first = {
11259            let runtime = runtime.clone();
11260            let stream = stream.clone();
11261            tokio::spawn(async move {
11262                runtime
11263                    .wait_read_stream(ReadStreamRequest {
11264                        stream_id: stream,
11265                        offset: 0,
11266                        max_len: 16,
11267                        now_ms: 0,
11268                    })
11269                    .await
11270            })
11271        };
11272        wait_for_live_waiters(&runtime, 1).await;
11273
11274        let err = runtime
11275            .wait_read_stream(ReadStreamRequest {
11276                stream_id: stream.clone(),
11277                offset: 0,
11278                max_len: 16,
11279                now_ms: 0,
11280            })
11281            .await
11282            .expect_err("second waiter should hit owner-core limit");
11283        assert_eq!(
11284            err,
11285            RuntimeError::LiveReadBackpressure {
11286                core_id: CoreId(0),
11287                current_waiters: 1,
11288                limit: 1,
11289            }
11290        );
11291        let snapshot = runtime.metrics().snapshot();
11292        assert_eq!(snapshot.live_read_waiters, 1);
11293        assert_eq!(snapshot.live_read_backpressure_events, 1);
11294        assert_eq!(snapshot.per_core_live_read_backpressure_events, vec![1]);
11295
11296        first.abort();
11297        let _ = first.await;
11298        wait_for_live_waiters(&runtime, 0).await;
11299    }
11300
11301    #[test]
11302    fn cancel_read_watcher_removes_group_local_waiter() {
11303        let stream = BucketStreamId::new("benchcmp", "watcher-cancel-local");
11304        let mut read_watchers = ReadWatchers::new();
11305        let (first_tx, _first_rx) = oneshot::channel();
11306        let (second_tx, _second_rx) = oneshot::channel();
11307        read_watchers.insert(
11308            stream.clone(),
11309            vec![
11310                ReadWatcher {
11311                    waiter_id: 1,
11312                    request: ReadStreamRequest {
11313                        stream_id: stream.clone(),
11314                        offset: 0,
11315                        max_len: 16,
11316                        now_ms: 0,
11317                    },
11318                    response_tx: first_tx,
11319                },
11320                ReadWatcher {
11321                    waiter_id: 2,
11322                    request: ReadStreamRequest {
11323                        stream_id: stream.clone(),
11324                        offset: 0,
11325                        max_len: 16,
11326                        now_ms: 0,
11327                    },
11328                    response_tx: second_tx,
11329                },
11330            ],
11331        );
11332
11333        let metrics = Arc::new(RuntimeMetricsInner::new(1, 1));
11334        metrics.record_read_watchers_added(CoreId(0), 2);
11335        CoreWorker::cancel_read_watcher(
11336            &mut read_watchers,
11337            metrics.clone(),
11338            CoreId(0),
11339            stream.clone(),
11340            1,
11341        );
11342
11343        let watcher_ids = read_watchers
11344            .get(&stream)
11345            .expect("one watcher remains")
11346            .iter()
11347            .map(|watcher| watcher.waiter_id)
11348            .collect::<Vec<_>>();
11349        assert_eq!(watcher_ids, vec![2]);
11350        assert_eq!(metrics.per_core_live_read_waiters[0].load_relaxed(), 1);
11351    }
11352
11353    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11354    async fn notify_read_watchers_shares_identical_reads_across_watchers() {
11355        let factory = BlockingReadFactory::default();
11356        let runtime = ShardRuntime::spawn_with_engine_factory(
11357            RuntimeConfig {
11358                core_count: 1,
11359                raft_group_count: 1,
11360                mailbox_capacity: 8,
11361                threading: RuntimeThreading::HostedTokio,
11362                cold_max_hot_bytes_per_group: None,
11363                live_read_max_waiters_per_core: Some(65_536),
11364            },
11365            factory.clone(),
11366        )
11367        .expect("spawn runtime");
11368        let stream = BucketStreamId::new("benchcmp", "watcher-shared-read");
11369        let placement = runtime.locate(&stream);
11370        let request = ReadStreamRequest {
11371            stream_id: stream.clone(),
11372            offset: 0,
11373            max_len: 16,
11374            now_ms: 0,
11375        };
11376        let mut read_watchers = ReadWatchers::new();
11377        let (first_tx, _first_rx) = oneshot::channel();
11378        let (second_tx, _second_rx) = oneshot::channel();
11379        read_watchers.insert(
11380            stream.clone(),
11381            vec![
11382                ReadWatcher {
11383                    waiter_id: 1,
11384                    request: request.clone(),
11385                    response_tx: first_tx,
11386                },
11387                ReadWatcher {
11388                    waiter_id: 2,
11389                    request,
11390                    response_tx: second_tx,
11391                },
11392            ],
11393        );
11394
11395        let metrics = Arc::new(RuntimeMetricsInner::new(1, 1));
11396        let mut engine = factory
11397            .create(
11398                placement,
11399                GroupEngineMetrics {
11400                    inner: metrics.clone(),
11401                },
11402            )
11403            .await
11404            .expect("create engine");
11405        let notify = {
11406            let stream = stream.clone();
11407            tokio::spawn(async move {
11408                CoreWorker::notify_read_watchers(
11409                    &mut engine,
11410                    metrics,
11411                    Arc::new(Semaphore::new(8)),
11412                    &mut read_watchers,
11413                    &stream,
11414                    placement,
11415                )
11416                .await;
11417                read_watchers
11418            })
11419        };
11420        tokio::time::timeout(
11421            std::time::Duration::from_secs(1),
11422            factory.entered.notified(),
11423        )
11424        .await
11425        .expect("notify issued one grouped read");
11426        factory.release.notify_one();
11427        let read_watchers = tokio::time::timeout(std::time::Duration::from_secs(1), notify)
11428            .await
11429            .expect("notify should finish after one read")
11430            .expect("notify task");
11431
11432        let watcher_ids = read_watchers
11433            .get(&stream)
11434            .expect("pending watchers reinserted")
11435            .iter()
11436            .map(|watcher| watcher.waiter_id)
11437            .collect::<Vec<_>>();
11438        assert_eq!(watcher_ids, vec![1, 2]);
11439        assert_eq!(factory.read_count.load(Ordering::Relaxed), 1);
11440    }
11441
11442    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11443    async fn close_stream_allows_close_only_and_rejects_later_appends() {
11444        let runtime = runtime(2, 8);
11445        let stream = BucketStreamId::new("benchcmp", "close-only");
11446        let placement = runtime.locate(&stream);
11447        create_stream(&runtime, &stream).await;
11448
11449        let closed = runtime
11450            .close_stream(CloseStreamRequest {
11451                stream_id: stream.clone(),
11452                stream_seq: None,
11453                producer: None,
11454                now_ms: 0,
11455            })
11456            .await
11457            .expect("close stream");
11458        assert_eq!(closed.placement, placement);
11459        assert_eq!(closed.next_offset, 0);
11460
11461        let err = runtime
11462            .append(AppendRequest::new(stream.clone(), 1))
11463            .await
11464            .expect_err("append after close rejected");
11465        match err {
11466            RuntimeError::GroupEngine { message, .. } => {
11467                assert!(message.contains("StreamClosed"), "message={message}");
11468            }
11469            other => panic!("expected group engine error, got {other:?}"),
11470        }
11471
11472        let head = runtime
11473            .head_stream(HeadStreamRequest {
11474                stream_id: stream,
11475                now_ms: 0,
11476            })
11477            .await
11478            .expect("head stream");
11479        assert_eq!(head.tail_offset, 0);
11480        assert!(head.closed);
11481        assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
11482    }
11483
11484    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11485    async fn delete_stream_removes_state_on_owner_group() {
11486        let runtime = runtime(2, 8);
11487        let stream = BucketStreamId::new("benchcmp", "delete-stream");
11488        let placement = runtime.locate(&stream);
11489        create_stream(&runtime, &stream).await;
11490        runtime
11491            .append(AppendRequest::from_bytes(
11492                stream.clone(),
11493                b"payload".to_vec(),
11494            ))
11495            .await
11496            .expect("append");
11497
11498        let deleted = runtime
11499            .delete_stream(DeleteStreamRequest {
11500                stream_id: stream.clone(),
11501            })
11502            .await
11503            .expect("delete stream");
11504        assert_eq!(deleted.placement, placement);
11505
11506        let err = runtime
11507            .head_stream(HeadStreamRequest {
11508                stream_id: stream.clone(),
11509                now_ms: 0,
11510            })
11511            .await
11512            .expect_err("head after delete rejected");
11513        match err {
11514            RuntimeError::GroupEngine { message, .. } => {
11515                assert!(message.contains("StreamNotFound"), "message={message}");
11516            }
11517            other => panic!("expected group engine error, got {other:?}"),
11518        }
11519
11520        let err = runtime
11521            .append(AppendRequest::new(stream, 1))
11522            .await
11523            .expect_err("append after delete rejected");
11524        match err {
11525            RuntimeError::GroupEngine { message, .. } => {
11526                assert!(message.contains("StreamNotFound"), "message={message}");
11527            }
11528            other => panic!("expected group engine error, got {other:?}"),
11529        }
11530    }
11531
11532    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11533    async fn fork_ref_keeps_deleted_source_gone_until_last_fork_delete() {
11534        let runtime = runtime(2, 8);
11535        let source = BucketStreamId::new("benchcmp", "fork-ref-source");
11536        let fork = BucketStreamId::new("benchcmp", "fork-ref-child");
11537        let mut source_create = CreateStreamRequest::new(source.clone(), DEFAULT_CONTENT_TYPE);
11538        source_create.initial_payload = Bytes::from_static(b"abc");
11539        runtime
11540            .create_stream(source_create)
11541            .await
11542            .expect("create source");
11543
11544        let mut fork_create = CreateStreamRequest::new(fork.clone(), DEFAULT_CONTENT_TYPE);
11545        fork_create.forked_from = Some(source.clone());
11546        runtime
11547            .create_stream(fork_create)
11548            .await
11549            .expect("create fork");
11550
11551        runtime
11552            .delete_stream(DeleteStreamRequest {
11553                stream_id: source.clone(),
11554            })
11555            .await
11556            .expect("delete source");
11557        let err = runtime
11558            .head_stream(HeadStreamRequest {
11559                stream_id: source.clone(),
11560                now_ms: 0,
11561            })
11562            .await
11563            .expect_err("soft-deleted source is gone");
11564        match err {
11565            RuntimeError::GroupEngine { message, .. } => {
11566                assert!(message.contains("StreamGone"), "message={message}");
11567            }
11568            other => panic!("expected group engine error, got {other:?}"),
11569        }
11570
11571        let fork_read = runtime
11572            .read_stream(ReadStreamRequest {
11573                stream_id: fork.clone(),
11574                offset: 0,
11575                max_len: 16,
11576                now_ms: 0,
11577            })
11578            .await
11579            .expect("fork remains readable");
11580        assert_eq!(fork_read.payload, b"abc");
11581
11582        runtime
11583            .delete_stream(DeleteStreamRequest { stream_id: fork })
11584            .await
11585            .expect("delete fork");
11586        let err = runtime
11587            .head_stream(HeadStreamRequest {
11588                stream_id: source,
11589                now_ms: 0,
11590            })
11591            .await
11592            .expect_err("source is hard-deleted after last fork");
11593        match err {
11594            RuntimeError::GroupEngine { message, .. } => {
11595                assert!(message.contains("StreamNotFound"), "message={message}");
11596            }
11597            other => panic!("expected group engine error, got {other:?}"),
11598        }
11599    }
11600
11601    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
11602    async fn thread_per_core_runtime_reaches_all_configured_cores() {
11603        let mut config = RuntimeConfig::new(4, 32);
11604        config.mailbox_capacity = 128;
11605        assert_eq!(config.threading, RuntimeThreading::ThreadPerCore);
11606        let runtime = ShardRuntime::spawn(config).expect("spawn runtime");
11607
11608        let mut tasks = Vec::new();
11609        for index in 0..1024 {
11610            let runtime = runtime.clone();
11611            tasks.push(tokio::spawn(async move {
11612                let stream = BucketStreamId::new("benchcmp", format!("thread-core-{index}"));
11613                create_stream(&runtime, &stream).await;
11614                runtime
11615                    .append(AppendRequest::new(stream, 1))
11616                    .await
11617                    .expect("append");
11618            }));
11619        }
11620
11621        for task in tasks {
11622            task.await.expect("task");
11623        }
11624
11625        let snapshot = runtime.metrics().snapshot();
11626        assert_eq!(snapshot.accepted_appends, 1024);
11627        assert!(snapshot.per_core_appends.iter().all(|value| *value > 0));
11628    }
11629
11630    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
11631    async fn custom_group_engine_is_created_once_per_touched_group_on_owner_core() {
11632        let factory = RecordingFactory::default();
11633        let runtime = ShardRuntime::spawn_with_engine_factory(
11634            RuntimeConfig {
11635                core_count: 4,
11636                raft_group_count: 32,
11637                mailbox_capacity: 128,
11638                threading: RuntimeThreading::HostedTokio,
11639                cold_max_hot_bytes_per_group: None,
11640                live_read_max_waiters_per_core: Some(65_536),
11641            },
11642            factory.clone(),
11643        )
11644        .expect("spawn runtime");
11645
11646        let mut touched_groups = HashSet::new();
11647        for index in 0..4096 {
11648            let stream = BucketStreamId::new("benchcmp", format!("engine-{index}"));
11649            let placement = runtime.locate(&stream);
11650            runtime
11651                .create_stream(CreateStreamRequest::new(stream, DEFAULT_CONTENT_TYPE))
11652                .await
11653                .expect("create stream");
11654            touched_groups.insert(placement.raft_group_id);
11655            if touched_groups.len() == 16 {
11656                break;
11657            }
11658        }
11659
11660        let created = factory.created();
11661        let created_groups = created
11662            .iter()
11663            .map(|placement| placement.raft_group_id)
11664            .collect::<HashSet<_>>();
11665        assert_eq!(created_groups, touched_groups);
11666        for placement in created {
11667            assert_eq!(
11668                u32::from(placement.core_id.0),
11669                placement.raft_group_id.0 % 4
11670            );
11671        }
11672    }
11673
11674    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11675    async fn background_cold_flush_skips_groups_that_cannot_accept_local_writes() {
11676        let factory = RecordingFactory::without_local_writes();
11677        let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
11678        let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
11679            RuntimeConfig {
11680                core_count: 2,
11681                raft_group_count: 4,
11682                mailbox_capacity: 128,
11683                threading: RuntimeThreading::HostedTokio,
11684                cold_max_hot_bytes_per_group: None,
11685                live_read_max_waiters_per_core: Some(65_536),
11686            },
11687            factory.clone(),
11688            Some(cold_store),
11689        )
11690        .expect("spawn runtime");
11691
11692        let flushed = runtime
11693            .flush_cold_all_groups_once_bounded(
11694                PlanGroupColdFlushRequest {
11695                    min_hot_bytes: 1,
11696                    max_flush_bytes: 1,
11697                },
11698                4,
11699            )
11700            .await
11701            .expect("flush all groups");
11702
11703        assert_eq!(flushed, 0);
11704        assert_eq!(factory.created().len(), 4);
11705        let metrics = runtime.metrics().snapshot();
11706        assert_eq!(metrics.cold_flush_uploads, 0);
11707        assert_eq!(metrics.cold_flush_publishes, 0);
11708    }
11709
11710    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11711    async fn warm_group_instantiates_engine_on_owner_core_without_stream_mutation() {
11712        let factory = RecordingFactory::default();
11713        let runtime = ShardRuntime::spawn_with_engine_factory(
11714            RuntimeConfig {
11715                core_count: 2,
11716                raft_group_count: 4,
11717                mailbox_capacity: 128,
11718                threading: RuntimeThreading::HostedTokio,
11719                cold_max_hot_bytes_per_group: None,
11720                live_read_max_waiters_per_core: Some(65_536),
11721            },
11722            factory.clone(),
11723        )
11724        .expect("spawn runtime");
11725
11726        let warmed = runtime
11727            .warm_group(RaftGroupId(3))
11728            .await
11729            .expect("warm group");
11730        assert_eq!(warmed.core_id, CoreId(1));
11731        assert_eq!(warmed.raft_group_id, RaftGroupId(3));
11732
11733        runtime
11734            .warm_group(RaftGroupId(3))
11735            .await
11736            .expect("second warm is idempotent");
11737
11738        let created = factory.created();
11739        assert_eq!(created, vec![warmed]);
11740
11741        runtime.warm_all_groups().await.expect("warm all groups");
11742        let created_groups = factory
11743            .created()
11744            .into_iter()
11745            .map(|placement| placement.raft_group_id)
11746            .collect::<HashSet<_>>();
11747        assert_eq!(
11748            created_groups,
11749            [
11750                RaftGroupId(0),
11751                RaftGroupId(1),
11752                RaftGroupId(2),
11753                RaftGroupId(3)
11754            ]
11755            .into_iter()
11756            .collect()
11757        );
11758    }
11759
11760    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11761    async fn core_worker_dispatches_other_groups_while_one_group_waits() {
11762        let factory = BlockingFirstCreateEngineFactory::default();
11763        let runtime = ShardRuntime::spawn_with_engine_factory(
11764            RuntimeConfig {
11765                core_count: 1,
11766                raft_group_count: 2,
11767                mailbox_capacity: 128,
11768                threading: RuntimeThreading::HostedTokio,
11769                cold_max_hot_bytes_per_group: None,
11770                live_read_max_waiters_per_core: Some(65_536),
11771            },
11772            factory.clone(),
11773        )
11774        .expect("spawn runtime");
11775
11776        let blocked_stream = stream_on_group(&runtime, RaftGroupId(0), "blocked-group");
11777        let free_stream = stream_on_group(&runtime, RaftGroupId(1), "free-group");
11778        let entered_wait = factory.entered.notified();
11779        let blocked_runtime = runtime.clone();
11780        let blocked =
11781            tokio::spawn(async move { create_stream(&blocked_runtime, &blocked_stream).await });
11782
11783        tokio::time::timeout(std::time::Duration::from_secs(1), entered_wait)
11784            .await
11785            .expect("first group entered blocking create");
11786
11787        let completed = tokio::time::timeout(
11788            std::time::Duration::from_secs(1),
11789            create_stream(&runtime, &free_stream),
11790        )
11791        .await
11792        .expect("other group should complete while first group is blocked");
11793        assert_eq!(completed.placement.raft_group_id, RaftGroupId(1));
11794
11795        factory.release.notify_one();
11796        blocked.await.expect("blocked task");
11797    }
11798
11799    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11800    async fn runtime_read_uses_group_read_parts_fast_path() {
11801        let factory = BlockingReadFactory::default();
11802        let runtime = ShardRuntime::spawn_with_engine_factory(
11803            RuntimeConfig {
11804                core_count: 1,
11805                raft_group_count: 1,
11806                mailbox_capacity: 128,
11807                threading: RuntimeThreading::HostedTokio,
11808                cold_max_hot_bytes_per_group: None,
11809                live_read_max_waiters_per_core: Some(65_536),
11810            },
11811            factory.clone(),
11812        )
11813        .expect("spawn runtime");
11814        let stream = BucketStreamId::new("benchcmp", "read-offload");
11815        create_stream(&runtime, &stream).await;
11816
11817        let read = tokio::time::timeout(
11818            std::time::Duration::from_secs(1),
11819            runtime.read_stream(ReadStreamRequest {
11820                stream_id: stream.clone(),
11821                offset: 0,
11822                max_len: 16,
11823                now_ms: 0,
11824            }),
11825        )
11826        .await
11827        .expect("runtime read should not use blocking legacy read_stream")
11828        .expect("read stream");
11829        assert_eq!(read.placement.raft_group_id, RaftGroupId(0));
11830        assert_eq!(factory.read_count.load(Ordering::Relaxed), 1);
11831
11832        let head = runtime
11833            .head_stream(HeadStreamRequest {
11834                stream_id: stream,
11835                now_ms: 0,
11836            })
11837            .await
11838            .expect("head stream");
11839        assert_eq!(head.placement.raft_group_id, RaftGroupId(0));
11840    }
11841
11842    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11843    async fn read_materialization_is_bounded_without_blocking_group_actor() {
11844        let factory = BlockingReadFactory::block_materialization();
11845        let mut config = RuntimeConfig::new(1, 1);
11846        config.mailbox_capacity = 1;
11847        config.threading = RuntimeThreading::HostedTokio;
11848        let runtime = ShardRuntime::spawn_with_engine_factory(config, factory.clone())
11849            .expect("spawn runtime");
11850        let first_stream = BucketStreamId::new("benchcmp", "materialize-bound-1");
11851        let second_stream = BucketStreamId::new("benchcmp", "materialize-bound-2");
11852        create_stream(&runtime, &first_stream).await;
11853        create_stream(&runtime, &second_stream).await;
11854
11855        let first_runtime = runtime.clone();
11856        let first_stream_for_read = first_stream.clone();
11857        let first_read = tokio::spawn(async move {
11858            first_runtime
11859                .read_stream(ReadStreamRequest {
11860                    stream_id: first_stream_for_read,
11861                    offset: 0,
11862                    max_len: 16,
11863                    now_ms: 0,
11864                })
11865                .await
11866        });
11867        tokio::time::timeout(
11868            std::time::Duration::from_secs(1),
11869            factory.entered.notified(),
11870        )
11871        .await
11872        .expect("first materialization acquired the only permit");
11873
11874        let second_runtime = runtime.clone();
11875        let second_stream_for_read = second_stream.clone();
11876        let second_read = tokio::spawn(async move {
11877            second_runtime
11878                .read_stream(ReadStreamRequest {
11879                    stream_id: second_stream_for_read,
11880                    offset: 0,
11881                    max_len: 16,
11882                    now_ms: 0,
11883                })
11884                .await
11885        });
11886
11887        let head = tokio::time::timeout(
11888            std::time::Duration::from_secs(1),
11889            runtime.head_stream(HeadStreamRequest {
11890                stream_id: first_stream,
11891                now_ms: 0,
11892            }),
11893        )
11894        .await
11895        .expect("group actor should keep serving metadata while materialization waits")
11896        .expect("head stream");
11897        assert_eq!(head.placement.raft_group_id, RaftGroupId(0));
11898        assert!(!second_read.is_finished());
11899
11900        factory.release.notify_one();
11901        let first = first_read
11902            .await
11903            .expect("first read task")
11904            .expect("first read");
11905        assert_eq!(first.payload, b"ready");
11906        tokio::time::timeout(
11907            std::time::Duration::from_secs(1),
11908            factory.entered.notified(),
11909        )
11910        .await
11911        .expect("second materialization acquired permit after first released it");
11912        factory.release.notify_one();
11913        let second = second_read
11914            .await
11915            .expect("second read task")
11916            .expect("second read");
11917        assert_eq!(second.payload, b"ready");
11918    }
11919
11920    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11921    async fn group_engine_errors_include_group_context_and_do_not_record_success_metrics() {
11922        let runtime = ShardRuntime::spawn_with_engine_factory(
11923            RuntimeConfig {
11924                core_count: 2,
11925                raft_group_count: 8,
11926                mailbox_capacity: 128,
11927                threading: RuntimeThreading::HostedTokio,
11928                cold_max_hot_bytes_per_group: None,
11929                live_read_max_waiters_per_core: Some(65_536),
11930            },
11931            FailingFactory,
11932        )
11933        .expect("spawn runtime");
11934
11935        let stream = BucketStreamId::new("benchcmp", "failing-stream");
11936        let placement = runtime.locate(&stream);
11937        let err = runtime
11938            .append(AppendRequest::new(stream, 1))
11939            .await
11940            .expect_err("engine failure");
11941
11942        assert_eq!(
11943            err,
11944            RuntimeError::GroupEngine {
11945                core_id: placement.core_id,
11946                raft_group_id: placement.raft_group_id,
11947                message: "proposal rejected".to_owned(),
11948                next_offset: None,
11949                leader_hint: None,
11950            }
11951        );
11952        assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
11953    }
11954
11955    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11956    async fn mailbox_full_events_record_owner_core_backpressure() {
11957        let factory = BlockingOnceFactory::default();
11958        let runtime = ShardRuntime::spawn_with_engine_factory(
11959            RuntimeConfig {
11960                core_count: 1,
11961                raft_group_count: 1,
11962                mailbox_capacity: 1,
11963                threading: RuntimeThreading::HostedTokio,
11964                cold_max_hot_bytes_per_group: None,
11965                live_read_max_waiters_per_core: Some(65_536),
11966            },
11967            factory.clone(),
11968        )
11969        .expect("spawn runtime");
11970
11971        let entered = factory.entered.clone();
11972        let entered_wait = entered.notified();
11973        let first_runtime = runtime.clone();
11974        let first = tokio::spawn(async move {
11975            create_stream(
11976                &first_runtime,
11977                &BucketStreamId::new("benchcmp", "backpressure-1"),
11978            )
11979            .await
11980        });
11981        tokio::time::timeout(std::time::Duration::from_secs(1), entered_wait)
11982            .await
11983            .expect("first create entered blocking engine factory");
11984
11985        let second_runtime = runtime.clone();
11986        let second = tokio::spawn(async move {
11987            create_stream(
11988                &second_runtime,
11989                &BucketStreamId::new("benchcmp", "backpressure-2"),
11990            )
11991            .await
11992        });
11993        wait_for_mailbox_depth(&runtime, 0, 1).await;
11994
11995        let third_runtime = runtime.clone();
11996        let third = tokio::spawn(async move {
11997            create_stream(
11998                &third_runtime,
11999                &BucketStreamId::new("benchcmp", "backpressure-3"),
12000            )
12001            .await
12002        });
12003        wait_for_mailbox_full_events(&runtime, 1).await;
12004        assert_eq!(
12005            runtime.metrics().snapshot().per_core_mailbox_full_events[0],
12006            1
12007        );
12008
12009        factory.release.notify_one();
12010        first.await.expect("first task");
12011        second.await.expect("second task");
12012        third.await.expect("third task");
12013    }
12014
12015    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn group_mailbox_full_events_record_inner_actor_backpressure() {
        // Scenario: with a single group whose mailbox holds one message, three
        // concurrent stream creations are issued and the test expects exactly one
        // "group mailbox full" event to be recorded for group 0.
        // NOTE(review): `BlockingFirstCreateEngineFactory` presumably parks the first
        // engine call until `release` is notified -- confirm against its definition.
        let factory = BlockingFirstCreateEngineFactory::default();
        let runtime = ShardRuntime::spawn_with_engine_factory(
            RuntimeConfig {
                core_count: 1,
                raft_group_count: 1,
                // Capacity of one, so a single queued message fills the mailbox.
                mailbox_capacity: 1,
                threading: RuntimeThreading::HostedTokio,
                cold_max_hot_bytes_per_group: None,
                live_read_max_waiters_per_core: Some(65_536),
            },
            factory.clone(),
        )
        .expect("spawn runtime");

        // First request: wait (at most one second) until the factory signals `entered`.
        let first_runtime = runtime.clone();
        let first = tokio::spawn(async move {
            create_stream(
                &first_runtime,
                &BucketStreamId::new("benchcmp", "group-backpressure-1"),
            )
            .await
        });
        tokio::time::timeout(
            std::time::Duration::from_secs(1),
            factory.entered.notified(),
        )
        .await
        .expect("first append entered blocking group engine");

        // Second request: issued while the first is still held inside the engine.
        let second_runtime = runtime.clone();
        let second = tokio::spawn(async move {
            create_stream(
                &second_runtime,
                &BucketStreamId::new("benchcmp", "group-backpressure-2"),
            )
            .await
        });
        // Best-effort poll (up to 100 x 10ms) for the reported mailbox depth to reach 1.
        // The loop does not fail the test if the depth is never observed.
        for _ in 0..100 {
            if runtime.metrics().snapshot().group_mailbox_depth == 1 {
                break;
            }
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
        }

        // Third request: expected to be the one that produces the mailbox-full event.
        let third_runtime = runtime.clone();
        let third = tokio::spawn(async move {
            create_stream(
                &third_runtime,
                &BucketStreamId::new("benchcmp", "group-backpressure-3"),
            )
            .await
        });
        wait_for_group_mailbox_full_events(&runtime, 1).await;
        assert_eq!(
            runtime
                .metrics()
                .snapshot()
                .per_group_group_mailbox_full_events[0],
            1
        );

        // Release the factory and make sure none of the spawned tasks panicked.
        factory.release.notify_one();
        first.await.expect("first task");
        second.await.expect("second task");
        third.await.expect("third task");
    }
12083
12084    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12085    async fn wal_group_engine_recovers_multiple_groups_from_per_group_logs() {
12086        let wal_root = std::env::temp_dir().join(format!(
12087            "ursula-wal-test-{}-{}",
12088            std::process::id(),
12089            std::time::SystemTime::now()
12090                .duration_since(std::time::UNIX_EPOCH)
12091                .expect("system time after unix epoch")
12092                .as_nanos()
12093        ));
12094        let _ = std::fs::remove_dir_all(&wal_root);
12095        let config = RuntimeConfig {
12096            core_count: 2,
12097            raft_group_count: 8,
12098            mailbox_capacity: 128,
12099            threading: RuntimeThreading::HostedTokio,
12100            cold_max_hot_bytes_per_group: None,
12101            live_read_max_waiters_per_core: Some(65_536),
12102        };
12103
12104        let (first_stream, second_stream) = {
12105            let runtime = ShardRuntime::spawn_with_engine_factory(
12106                config.clone(),
12107                WalGroupEngineFactory::new(&wal_root),
12108            )
12109            .expect("spawn runtime");
12110
12111            let mut seen_groups = HashSet::new();
12112            let mut streams = Vec::new();
12113            for index in 0..256 {
12114                let stream = BucketStreamId::new("benchcmp", format!("wal-{index}"));
12115                if seen_groups.insert(runtime.locate(&stream).raft_group_id) {
12116                    streams.push(stream);
12117                }
12118                if streams.len() == 2 {
12119                    break;
12120                }
12121            }
12122            assert_eq!(streams.len(), 2, "expected streams on two groups");
12123            let first_stream = streams[0].clone();
12124            let second_stream = streams[1].clone();
12125
12126            create_stream(&runtime, &first_stream).await;
12127            runtime
12128                .append(AppendRequest::from_bytes(
12129                    first_stream.clone(),
12130                    b"first-payload".to_vec(),
12131                ))
12132                .await
12133                .expect("append first stream");
12134
12135            create_stream(&runtime, &second_stream).await;
12136            let mut append_second =
12137                AppendRequest::from_bytes(second_stream.clone(), b"second-payload".to_vec());
12138            append_second.close_after = true;
12139            runtime
12140                .append(append_second)
12141                .await
12142                .expect("append second stream");
12143
12144            (first_stream, second_stream)
12145        };
12146
12147        let recovered =
12148            ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12149                .expect("spawn recovered runtime");
12150
12151        let first_read = recovered
12152            .read_stream(ReadStreamRequest {
12153                stream_id: first_stream.clone(),
12154                offset: 0,
12155                max_len: 128,
12156                now_ms: 0,
12157            })
12158            .await
12159            .expect("read recovered first stream");
12160        assert_eq!(first_read.payload, b"first-payload");
12161        assert!(!first_read.closed);
12162
12163        let second_read = recovered
12164            .read_stream(ReadStreamRequest {
12165                stream_id: second_stream.clone(),
12166                offset: 0,
12167                max_len: 128,
12168                now_ms: 0,
12169            })
12170            .await
12171            .expect("read recovered second stream");
12172        assert_eq!(second_read.payload, b"second-payload");
12173        assert!(second_read.closed);
12174
12175        let mut wal_file_count = 0;
12176        for core_entry in std::fs::read_dir(&wal_root).expect("read WAL root") {
12177            let core_entry = core_entry.expect("read core WAL dir");
12178            for group_entry in std::fs::read_dir(core_entry.path()).expect("read group WAL dir") {
12179                let group_entry = group_entry.expect("read group WAL file");
12180                if group_entry
12181                    .path()
12182                    .extension()
12183                    .is_some_and(|ext| ext == "jsonl")
12184                {
12185                    wal_file_count += 1;
12186                }
12187            }
12188        }
12189        assert_eq!(wal_file_count, 2);
12190
12191        drop(recovered);
12192        std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12193    }
12194
12195    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12196    async fn wal_group_engine_batches_append_records_and_recovers() {
12197        let wal_root = std::env::temp_dir().join(format!(
12198            "ursula-wal-batch-test-{}-{}",
12199            std::process::id(),
12200            std::time::SystemTime::now()
12201                .duration_since(std::time::UNIX_EPOCH)
12202                .expect("system time after unix epoch")
12203                .as_nanos()
12204        ));
12205        let _ = std::fs::remove_dir_all(&wal_root);
12206        let config = RuntimeConfig {
12207            core_count: 2,
12208            raft_group_count: 8,
12209            mailbox_capacity: 128,
12210            threading: RuntimeThreading::HostedTokio,
12211            cold_max_hot_bytes_per_group: None,
12212            live_read_max_waiters_per_core: Some(65_536),
12213        };
12214        let stream = BucketStreamId::new("benchcmp", "wal-batch");
12215        let placement;
12216
12217        {
12218            let runtime = ShardRuntime::spawn_with_engine_factory(
12219                config.clone(),
12220                WalGroupEngineFactory::new(&wal_root),
12221            )
12222            .expect("spawn runtime");
12223            placement = runtime.locate(&stream);
12224            create_stream(&runtime, &stream).await;
12225            let response = runtime
12226                .append_batch(AppendBatchRequest::new(
12227                    stream.clone(),
12228                    vec![b"ab".to_vec(), b"cd".to_vec(), b"ef".to_vec()],
12229                ))
12230                .await
12231                .expect("append batch");
12232            assert_eq!(response.items.len(), 3);
12233            assert!(response.items.iter().all(Result::is_ok));
12234
12235            let read = runtime
12236                .read_stream(ReadStreamRequest {
12237                    stream_id: stream.clone(),
12238                    offset: 0,
12239                    max_len: 16,
12240                    now_ms: 0,
12241                })
12242                .await
12243                .expect("read");
12244            assert_eq!(read.payload, b"abcdef");
12245
12246            let snapshot = runtime.metrics().snapshot();
12247            let core_index = usize::from(placement.core_id.0);
12248            let group_index = usize::try_from(placement.raft_group_id.0).expect("u32 fits usize");
12249            assert_eq!(snapshot.wal_batches, 2);
12250            assert_eq!(snapshot.wal_records, 2);
12251            assert_eq!(snapshot.per_core_wal_batches[core_index], 2);
12252            assert_eq!(snapshot.per_group_wal_batches[group_index], 2);
12253            assert_eq!(snapshot.per_core_wal_records[core_index], 2);
12254            assert_eq!(snapshot.per_group_wal_records[group_index], 2);
12255            assert!(snapshot.wal_write_ns > 0);
12256            assert!(snapshot.wal_sync_ns > 0);
12257            assert_eq!(
12258                snapshot.wal_write_ns,
12259                snapshot.per_core_wal_write_ns.iter().sum::<u64>()
12260            );
12261            assert_eq!(
12262                snapshot.wal_sync_ns,
12263                snapshot.per_group_wal_sync_ns.iter().sum::<u64>()
12264            );
12265        }
12266
12267        let log_path = group_log_path(&wal_root, placement);
12268        let line_count = std::fs::read_to_string(&log_path)
12269            .expect("read WAL log")
12270            .lines()
12271            .count();
12272        assert_eq!(line_count, 2);
12273
12274        let recovered =
12275            ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12276                .expect("spawn recovered runtime");
12277        let read = recovered
12278            .read_stream(ReadStreamRequest {
12279                stream_id: stream,
12280                offset: 0,
12281                max_len: 16,
12282                now_ms: 0,
12283            })
12284            .await
12285            .expect("read recovered batch");
12286        assert_eq!(read.payload, b"abcdef");
12287
12288        drop(recovered);
12289        std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12290    }
12291
12292    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12293    async fn wal_group_engine_persists_installed_snapshot() {
12294        let wal_root = std::env::temp_dir().join(format!(
12295            "ursula-wal-install-snapshot-test-{}-{}",
12296            std::process::id(),
12297            std::time::SystemTime::now()
12298                .duration_since(std::time::UNIX_EPOCH)
12299                .expect("system time after unix epoch")
12300                .as_nanos()
12301        ));
12302        let _ = std::fs::remove_dir_all(&wal_root);
12303        let config = RuntimeConfig {
12304            core_count: 2,
12305            raft_group_count: 8,
12306            mailbox_capacity: 128,
12307            threading: RuntimeThreading::HostedTokio,
12308            cold_max_hot_bytes_per_group: None,
12309            live_read_max_waiters_per_core: Some(65_536),
12310        };
12311        let stream = BucketStreamId::new("benchcmp", "wal-installed-snapshot");
12312        let source = runtime(2, 8);
12313        let placement = source.locate(&stream);
12314        create_stream(&source, &stream).await;
12315        source
12316            .append(AppendRequest::from_bytes(
12317                stream.clone(),
12318                b"snapshot-payload".to_vec(),
12319            ))
12320            .await
12321            .expect("append source");
12322        let snapshot = source
12323            .snapshot_group(placement.raft_group_id)
12324            .await
12325            .expect("snapshot source");
12326
12327        {
12328            let target = ShardRuntime::spawn_with_engine_factory(
12329                config.clone(),
12330                WalGroupEngineFactory::new(&wal_root),
12331            )
12332            .expect("spawn WAL runtime");
12333            target
12334                .install_group_snapshot(snapshot)
12335                .await
12336                .expect("install snapshot");
12337        }
12338
12339        let recovered =
12340            ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12341                .expect("spawn recovered WAL runtime");
12342        let read = recovered
12343            .read_stream(ReadStreamRequest {
12344                stream_id: stream.clone(),
12345                offset: 0,
12346                max_len: 32,
12347                now_ms: 0,
12348            })
12349            .await
12350            .expect("read recovered snapshot");
12351        assert_eq!(read.payload, b"snapshot-payload");
12352
12353        let appended = recovered
12354            .append(AppendRequest::from_bytes(stream, b"-next".to_vec()))
12355            .await
12356            .expect("append after recovered snapshot");
12357        assert_eq!(appended.start_offset, 16);
12358        assert_eq!(appended.stream_append_count, 2);
12359
12360        drop(recovered);
12361        std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12362    }
12363
12364    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12365    async fn wal_group_engine_recovers_producer_dedup_state() {
12366        let wal_root = std::env::temp_dir().join(format!(
12367            "ursula-wal-producer-test-{}-{}",
12368            std::process::id(),
12369            std::time::SystemTime::now()
12370                .duration_since(std::time::UNIX_EPOCH)
12371                .expect("system time after unix epoch")
12372                .as_nanos()
12373        ));
12374        let _ = std::fs::remove_dir_all(&wal_root);
12375        let config = RuntimeConfig {
12376            core_count: 2,
12377            raft_group_count: 8,
12378            mailbox_capacity: 128,
12379            threading: RuntimeThreading::HostedTokio,
12380            cold_max_hot_bytes_per_group: None,
12381            live_read_max_waiters_per_core: Some(65_536),
12382        };
12383        let stream = BucketStreamId::new("benchcmp", "wal-producer");
12384
12385        {
12386            let runtime = ShardRuntime::spawn_with_engine_factory(
12387                config.clone(),
12388                WalGroupEngineFactory::new(&wal_root),
12389            )
12390            .expect("spawn WAL runtime");
12391            create_stream(&runtime, &stream).await;
12392            let mut append = AppendRequest::from_bytes(stream.clone(), b"a".to_vec());
12393            append.producer = Some(producer("writer-1", 0, 0));
12394            runtime.append(append).await.expect("append");
12395        }
12396
12397        let recovered =
12398            ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12399                .expect("spawn recovered runtime");
12400        let mut duplicate = AppendRequest::from_bytes(stream.clone(), b"ignored".to_vec());
12401        duplicate.producer = Some(producer("writer-1", 0, 0));
12402        let duplicate = recovered
12403            .append(duplicate)
12404            .await
12405            .expect("deduplicated retry");
12406        assert!(duplicate.deduplicated);
12407        assert_eq!(duplicate.start_offset, 0);
12408        assert_eq!(duplicate.next_offset, 1);
12409        assert_eq!(duplicate.stream_append_count, 1);
12410
12411        let mut next = AppendRequest::from_bytes(stream.clone(), b"b".to_vec());
12412        next.producer = Some(producer("writer-1", 0, 1));
12413        let next = recovered.append(next).await.expect("next append");
12414        assert_eq!(next.start_offset, 1);
12415        assert_eq!(next.next_offset, 2);
12416        assert_eq!(next.stream_append_count, 2);
12417
12418        let read = recovered
12419            .read_stream(ReadStreamRequest {
12420                stream_id: stream,
12421                offset: 0,
12422                max_len: 16,
12423                now_ms: 0,
12424            })
12425            .await
12426            .expect("read");
12427        assert_eq!(read.payload, b"ab");
12428
12429        drop(recovered);
12430        std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12431    }
12432
12433    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12434    async fn wal_group_engine_recovers_producer_append_batch_dedup_state() {
12435        let wal_root = std::env::temp_dir().join(format!(
12436            "ursula-wal-producer-batch-test-{}-{}",
12437            std::process::id(),
12438            std::time::SystemTime::now()
12439                .duration_since(std::time::UNIX_EPOCH)
12440                .expect("system time after unix epoch")
12441                .as_nanos()
12442        ));
12443        let _ = std::fs::remove_dir_all(&wal_root);
12444        let config = RuntimeConfig {
12445            core_count: 2,
12446            raft_group_count: 8,
12447            mailbox_capacity: 128,
12448            threading: RuntimeThreading::HostedTokio,
12449            cold_max_hot_bytes_per_group: None,
12450            live_read_max_waiters_per_core: Some(65_536),
12451        };
12452        let stream = BucketStreamId::new("benchcmp", "wal-producer-batch");
12453        let placement;
12454
12455        {
12456            let runtime = ShardRuntime::spawn_with_engine_factory(
12457                config.clone(),
12458                WalGroupEngineFactory::new(&wal_root),
12459            )
12460            .expect("spawn WAL runtime");
12461            placement = runtime.locate(&stream);
12462            create_stream(&runtime, &stream).await;
12463
12464            let mut first =
12465                AppendBatchRequest::new(stream.clone(), vec![b"a".to_vec(), b"b".to_vec()]);
12466            first.producer = Some(producer("writer-1", 0, 0));
12467            let first = runtime.append_batch(first).await.expect("first batch");
12468            assert!(first.items.iter().all(Result::is_ok));
12469
12470            let mut duplicate = AppendBatchRequest::new(stream.clone(), vec![b"ignored".to_vec()]);
12471            duplicate.producer = Some(producer("writer-1", 0, 0));
12472            let duplicate = runtime
12473                .append_batch(duplicate)
12474                .await
12475                .expect("duplicate batch");
12476            assert!(
12477                duplicate
12478                    .items
12479                    .iter()
12480                    .all(|item| { item.as_ref().expect("deduplicated item").deduplicated })
12481            );
12482        }
12483
12484        let log_path = group_log_path(&wal_root, placement);
12485        let line_count = std::fs::read_to_string(&log_path)
12486            .expect("read WAL log")
12487            .lines()
12488            .count();
12489        assert_eq!(line_count, 2);
12490
12491        let recovered =
12492            ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12493                .expect("spawn recovered runtime");
12494        let mut duplicate = AppendBatchRequest::new(stream.clone(), vec![b"retry".to_vec()]);
12495        duplicate.producer = Some(producer("writer-1", 0, 0));
12496        let duplicate = recovered
12497            .append_batch(duplicate)
12498            .await
12499            .expect("deduplicated retry");
12500        assert_eq!(duplicate.items.len(), 2);
12501        assert!(
12502            duplicate
12503                .items
12504                .iter()
12505                .all(|item| { item.as_ref().expect("deduplicated item").deduplicated })
12506        );
12507
12508        let mut next = AppendBatchRequest::new(stream.clone(), vec![b"c".to_vec()]);
12509        next.producer = Some(producer("writer-1", 0, 1));
12510        let next = recovered.append_batch(next).await.expect("next batch");
12511        assert_eq!(next.items[0].as_ref().expect("next item").start_offset, 2);
12512
12513        let read = recovered
12514            .read_stream(ReadStreamRequest {
12515                stream_id: stream,
12516                offset: 0,
12517                max_len: 16,
12518                now_ms: 0,
12519            })
12520            .await
12521            .expect("read");
12522        assert_eq!(read.payload, b"abc");
12523
12524        drop(recovered);
12525        std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12526    }
12527
    /// Test engine factory that remembers every placement it was asked to build an
    /// engine for and hands out `RecordingEngine` instances.
    #[derive(Debug, Clone)]
    struct RecordingFactory {
        // Placements passed to `create`, in call order; shared between clones.
        created: Arc<Mutex<Vec<ShardPlacement>>>,
        // Copied into every engine this factory creates.
        accepts_local_writes: bool,
    }
12533
12534    impl Default for RecordingFactory {
12535        fn default() -> Self {
12536            Self {
12537                created: Arc::default(),
12538                accepts_local_writes: true,
12539            }
12540        }
12541    }
12542
12543    impl RecordingFactory {
12544        fn without_local_writes() -> Self {
12545            Self {
12546                accepts_local_writes: false,
12547                ..Self::default()
12548            }
12549        }
12550
12551        fn created(&self) -> Vec<ShardPlacement> {
12552            self.created.lock().expect("lock created groups").clone()
12553        }
12554    }
12555
12556    impl GroupEngineFactory for RecordingFactory {
12557        fn create<'a>(
12558            &'a self,
12559            placement: ShardPlacement,
12560            _metrics: GroupEngineMetrics,
12561        ) -> GroupEngineCreateFuture<'a> {
12562            Box::pin(async move {
12563                self.created
12564                    .lock()
12565                    .expect("lock created groups")
12566                    .push(placement);
12567                let engine: Box<dyn GroupEngine> = Box::new(RecordingEngine {
12568                    placement,
12569                    commit_index: 0,
12570                    accepts_local_writes: self.accepts_local_writes,
12571                });
12572                Ok(engine)
12573            })
12574        }
12575    }
12576
    /// Engine produced by `RecordingFactory`.
    struct RecordingEngine {
        // Placement this engine was created for; engine calls assert against it.
        placement: ShardPlacement,
        // Counter reported to callers as `group_commit_index`.
        commit_index: u64,
        // Value returned from `accepts_local_writes()`.
        accepts_local_writes: bool,
    }
12582
    /// Test engine factory whose engines (`BlockingReadEngine`) share the
    /// synchronisation handles below with the test body.
    #[derive(Clone)]
    struct BlockingReadFactory {
        // Notified by the engine each time a read call is entered.
        entered: Arc<Notify>,
        // Awaited by `read_stream` before it returns.
        release: Arc<Notify>,
        // Number of read calls the engines have received.
        read_count: Arc<AtomicU64>,
        // When true, `read_stream_parts` returns a `Blocking` body instead of an
        // empty response.
        block_parts: bool,
    }
12590
12591    impl Default for BlockingReadFactory {
12592        fn default() -> Self {
12593            Self {
12594                entered: Arc::new(Notify::new()),
12595                release: Arc::new(Notify::new()),
12596                read_count: Arc::new(AtomicU64::new(0)),
12597                block_parts: false,
12598            }
12599        }
12600    }
12601
12602    impl BlockingReadFactory {
12603        fn block_materialization() -> Self {
12604            Self {
12605                block_parts: true,
12606                ..Self::default()
12607            }
12608        }
12609    }
12610
12611    impl GroupEngineFactory for BlockingReadFactory {
12612        fn create<'a>(
12613            &'a self,
12614            placement: ShardPlacement,
12615            _metrics: GroupEngineMetrics,
12616        ) -> GroupEngineCreateFuture<'a> {
12617            Box::pin(async move {
12618                let engine: Box<dyn GroupEngine> = Box::new(BlockingReadEngine {
12619                    inner: InMemoryGroupEngine::default(),
12620                    placement,
12621                    entered: self.entered.clone(),
12622                    release: self.release.clone(),
12623                    read_count: self.read_count.clone(),
12624                    block_parts: self.block_parts,
12625                });
12626                Ok(engine)
12627            })
12628        }
12629    }
12630
    /// Engine produced by `BlockingReadFactory`: non-read operations are forwarded to
    /// `inner`, while reads signal `entered` and bump `read_count`.
    struct BlockingReadEngine {
        // Backing engine that the forwarded operations are delegated to.
        inner: InMemoryGroupEngine,
        // Placement this engine was created for; read calls assert against it.
        placement: ShardPlacement,
        // Notified whenever a read call is entered.
        entered: Arc<Notify>,
        // Awaited by `read_stream` before it produces its response.
        release: Arc<Notify>,
        // Incremented once per read call.
        read_count: Arc<AtomicU64>,
        // Selects the `Blocking` body variant in `read_stream_parts`.
        block_parts: bool,
    }
12639
12640    impl GroupEngine for BlockingReadEngine {
12641        fn create_stream<'a>(
12642            &'a mut self,
12643            request: CreateStreamRequest,
12644            placement: ShardPlacement,
12645        ) -> GroupCreateStreamFuture<'a> {
12646            self.inner.create_stream(request, placement)
12647        }
12648
12649        fn head_stream<'a>(
12650            &'a mut self,
12651            request: HeadStreamRequest,
12652            placement: ShardPlacement,
12653        ) -> GroupHeadStreamFuture<'a> {
12654            self.inner.head_stream(request, placement)
12655        }
12656
12657        fn read_stream<'a>(
12658            &'a mut self,
12659            request: ReadStreamRequest,
12660            placement: ShardPlacement,
12661        ) -> GroupReadStreamFuture<'a> {
12662            let entered = self.entered.clone();
12663            let release = self.release.clone();
12664            let read_count = self.read_count.clone();
12665            Box::pin(async move {
12666                assert_eq!(placement, self.placement);
12667                read_count.fetch_add(1, Ordering::Relaxed);
12668                entered.notify_one();
12669                release.notified().await;
12670                Ok(ReadStreamResponse {
12671                    placement,
12672                    offset: request.offset,
12673                    next_offset: request.offset,
12674                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12675                    payload: Vec::new(),
12676                    up_to_date: true,
12677                    closed: false,
12678                })
12679            })
12680        }
12681
12682        fn read_stream_parts<'a>(
12683            &'a mut self,
12684            request: ReadStreamRequest,
12685            placement: ShardPlacement,
12686        ) -> GroupReadStreamPartsFuture<'a> {
12687            let entered = self.entered.clone();
12688            let read_count = self.read_count.clone();
12689            Box::pin(async move {
12690                assert_eq!(placement, self.placement);
12691                read_count.fetch_add(1, Ordering::Relaxed);
12692                entered.notify_one();
12693                if self.block_parts {
12694                    return Ok(GroupReadStreamParts {
12695                        placement,
12696                        offset: request.offset,
12697                        next_offset: request.offset
12698                            + u64::try_from(b"ready".len()).expect("payload len fits u64"),
12699                        content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12700                        up_to_date: true,
12701                        closed: false,
12702                        body: GroupReadStreamBody::Blocking {
12703                            entered: self.entered.clone(),
12704                            release: self.release.clone(),
12705                            payload: b"ready".to_vec(),
12706                        },
12707                    });
12708                }
12709                let response = ReadStreamResponse {
12710                    placement,
12711                    offset: request.offset,
12712                    next_offset: request.offset,
12713                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12714                    payload: Vec::new(),
12715                    up_to_date: true,
12716                    closed: false,
12717                };
12718                Ok(GroupReadStreamParts::from_response(response))
12719            })
12720        }
12721
12722        fn touch_stream_access<'a>(
12723            &'a mut self,
12724            stream_id: BucketStreamId,
12725            now_ms: u64,
12726            renew_ttl: bool,
12727            placement: ShardPlacement,
12728        ) -> GroupTouchStreamAccessFuture<'a> {
12729            self.inner
12730                .touch_stream_access(stream_id, now_ms, renew_ttl, placement)
12731        }
12732
12733        fn add_fork_ref<'a>(
12734            &'a mut self,
12735            stream_id: BucketStreamId,
12736            now_ms: u64,
12737            placement: ShardPlacement,
12738        ) -> GroupForkRefFuture<'a> {
12739            self.inner.add_fork_ref(stream_id, now_ms, placement)
12740        }
12741
12742        fn release_fork_ref<'a>(
12743            &'a mut self,
12744            stream_id: BucketStreamId,
12745            placement: ShardPlacement,
12746        ) -> GroupForkRefFuture<'a> {
12747            self.inner.release_fork_ref(stream_id, placement)
12748        }
12749
12750        fn close_stream<'a>(
12751            &'a mut self,
12752            request: CloseStreamRequest,
12753            placement: ShardPlacement,
12754        ) -> GroupCloseStreamFuture<'a> {
12755            self.inner.close_stream(request, placement)
12756        }
12757
12758        fn delete_stream<'a>(
12759            &'a mut self,
12760            request: DeleteStreamRequest,
12761            placement: ShardPlacement,
12762        ) -> GroupDeleteStreamFuture<'a> {
12763            self.inner.delete_stream(request, placement)
12764        }
12765
12766        fn append<'a>(
12767            &'a mut self,
12768            request: AppendRequest,
12769            placement: ShardPlacement,
12770        ) -> GroupAppendFuture<'a> {
12771            self.inner.append(request, placement)
12772        }
12773
12774        fn append_batch<'a>(
12775            &'a mut self,
12776            request: AppendBatchRequest,
12777            placement: ShardPlacement,
12778        ) -> GroupAppendBatchFuture<'a> {
12779            self.inner.append_batch(request, placement)
12780        }
12781
12782        fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
12783            Box::pin(async move {
12784                Ok(GroupSnapshot {
12785                    placement,
12786                    group_commit_index: 0,
12787                    stream_snapshot: StreamSnapshot {
12788                        buckets: Vec::new(),
12789                        streams: Vec::new(),
12790                    },
12791                    stream_append_counts: Vec::new(),
12792                })
12793            })
12794        }
12795
12796        fn install_snapshot<'a>(
12797            &'a mut self,
12798            _snapshot: GroupSnapshot,
12799        ) -> GroupInstallSnapshotFuture<'a> {
12800            Box::pin(async { Ok(()) })
12801        }
12802    }
12803
12804    impl GroupEngine for RecordingEngine {
        /// Reports the flag this engine was constructed with.
        fn accepts_local_writes(&self) -> bool {
            self.accepts_local_writes
        }
12808
12809        fn create_stream<'a>(
12810            &'a mut self,
12811            request: CreateStreamRequest,
12812            placement: ShardPlacement,
12813        ) -> GroupCreateStreamFuture<'a> {
12814            Box::pin(async move {
12815                assert_eq!(placement, self.placement);
12816                self.commit_index += 1;
12817                Ok(CreateStreamResponse {
12818                    placement,
12819                    next_offset: u64::try_from(request.initial_payload.len())
12820                        .expect("payload len fits u64"),
12821                    closed: request.close_after,
12822                    already_exists: false,
12823                    group_commit_index: self.commit_index,
12824                })
12825            })
12826        }
12827
12828        fn head_stream<'a>(
12829            &'a mut self,
12830            request: HeadStreamRequest,
12831            placement: ShardPlacement,
12832        ) -> GroupHeadStreamFuture<'a> {
12833            Box::pin(async move {
12834                assert_eq!(placement, self.placement);
12835                Ok(HeadStreamResponse {
12836                    placement,
12837                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12838                    tail_offset: request.stream_id.stream_id.len() as u64,
12839                    closed: false,
12840                    stream_ttl_seconds: None,
12841                    stream_expires_at_ms: None,
12842                    snapshot_offset: None,
12843                })
12844            })
12845        }
12846
12847        fn read_stream<'a>(
12848            &'a mut self,
12849            request: ReadStreamRequest,
12850            placement: ShardPlacement,
12851        ) -> GroupReadStreamFuture<'a> {
12852            Box::pin(async move {
12853                assert_eq!(placement, self.placement);
12854                Ok(ReadStreamResponse {
12855                    placement,
12856                    offset: request.offset,
12857                    next_offset: request.offset,
12858                    content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12859                    payload: Vec::new(),
12860                    up_to_date: true,
12861                    closed: false,
12862                })
12863            })
12864        }
12865
12866        fn touch_stream_access<'a>(
12867            &'a mut self,
12868            _stream_id: BucketStreamId,
12869            _now_ms: u64,
12870            _renew_ttl: bool,
12871            placement: ShardPlacement,
12872        ) -> GroupTouchStreamAccessFuture<'a> {
12873            Box::pin(async move {
12874                assert_eq!(placement, self.placement);
12875                Ok(TouchStreamAccessResponse {
12876                    placement,
12877                    changed: false,
12878                    expired: false,
12879                    group_commit_index: self.commit_index,
12880                })
12881            })
12882        }
12883
12884        fn add_fork_ref<'a>(
12885            &'a mut self,
12886            _stream_id: BucketStreamId,
12887            _now_ms: u64,
12888            placement: ShardPlacement,
12889        ) -> GroupForkRefFuture<'a> {
12890            Box::pin(async move {
12891                assert_eq!(placement, self.placement);
12892                self.commit_index += 1;
12893                Ok(ForkRefResponse {
12894                    placement,
12895                    fork_ref_count: 1,
12896                    hard_deleted: false,
12897                    parent_to_release: None,
12898                    group_commit_index: self.commit_index,
12899                })
12900            })
12901        }
12902
12903        fn release_fork_ref<'a>(
12904            &'a mut self,
12905            _stream_id: BucketStreamId,
12906            placement: ShardPlacement,
12907        ) -> GroupForkRefFuture<'a> {
12908            Box::pin(async move {
12909                assert_eq!(placement, self.placement);
12910                self.commit_index += 1;
12911                Ok(ForkRefResponse {
12912                    placement,
12913                    fork_ref_count: 0,
12914                    hard_deleted: false,
12915                    parent_to_release: None,
12916                    group_commit_index: self.commit_index,
12917                })
12918            })
12919        }
12920
12921        fn close_stream<'a>(
12922            &'a mut self,
12923            _request: CloseStreamRequest,
12924            placement: ShardPlacement,
12925        ) -> GroupCloseStreamFuture<'a> {
12926            Box::pin(async move {
12927                assert_eq!(placement, self.placement);
12928                self.commit_index += 1;
12929                Ok(CloseStreamResponse {
12930                    placement,
12931                    next_offset: self.commit_index,
12932                    group_commit_index: self.commit_index,
12933                    deduplicated: false,
12934                })
12935            })
12936        }
12937
12938        fn delete_stream<'a>(
12939            &'a mut self,
12940            _request: DeleteStreamRequest,
12941            placement: ShardPlacement,
12942        ) -> GroupDeleteStreamFuture<'a> {
12943            Box::pin(async move {
12944                assert_eq!(placement, self.placement);
12945                self.commit_index += 1;
12946                Ok(DeleteStreamResponse {
12947                    placement,
12948                    group_commit_index: self.commit_index,
12949                    hard_deleted: true,
12950                    parent_to_release: None,
12951                })
12952            })
12953        }
12954
12955        fn append<'a>(
12956            &'a mut self,
12957            request: AppendRequest,
12958            placement: ShardPlacement,
12959        ) -> GroupAppendFuture<'a> {
12960            Box::pin(async move {
12961                assert_eq!(placement, self.placement);
12962                let start_offset = self.commit_index;
12963                let next_offset = start_offset + request.payload_len();
12964                self.commit_index += 1;
12965                Ok(AppendResponse {
12966                    placement,
12967                    start_offset,
12968                    next_offset,
12969                    stream_append_count: self.commit_index,
12970                    group_commit_index: self.commit_index,
12971                    closed: request.close_after,
12972                    deduplicated: false,
12973                    producer: request.producer,
12974                })
12975            })
12976        }
12977
12978        fn append_batch<'a>(
12979            &'a mut self,
12980            request: AppendBatchRequest,
12981            placement: ShardPlacement,
12982        ) -> GroupAppendBatchFuture<'a> {
12983            Box::pin(async move {
12984                assert_eq!(placement, self.placement);
12985                let AppendBatchRequest {
12986                    stream_id: _,
12987                    content_type: _,
12988                    payloads,
12989                    producer: _,
12990                    ..
12991                } = request;
12992                let mut items = Vec::with_capacity(payloads.len());
12993                for payload in payloads {
12994                    let start_offset = self.commit_index;
12995                    let next_offset =
12996                        start_offset + u64::try_from(payload.len()).expect("payload len fits u64");
12997                    self.commit_index += 1;
12998                    items.push(Ok(AppendResponse {
12999                        placement,
13000                        start_offset,
13001                        next_offset,
13002                        stream_append_count: self.commit_index,
13003                        group_commit_index: self.commit_index,
13004                        closed: false,
13005                        deduplicated: false,
13006                        producer: None,
13007                    }));
13008                }
13009                Ok(GroupAppendBatchResponse { placement, items })
13010            })
13011        }
13012
13013        fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
13014            Box::pin(async move {
13015                assert_eq!(placement, self.placement);
13016                Ok(GroupSnapshot {
13017                    placement,
13018                    group_commit_index: self.commit_index,
13019                    stream_snapshot: StreamSnapshot {
13020                        buckets: Vec::new(),
13021                        streams: Vec::new(),
13022                    },
13023                    stream_append_counts: Vec::new(),
13024                })
13025            })
13026        }
13027
13028        fn install_snapshot<'a>(
13029            &'a mut self,
13030            snapshot: GroupSnapshot,
13031        ) -> GroupInstallSnapshotFuture<'a> {
13032            Box::pin(async move {
13033                assert_eq!(snapshot.placement, self.placement);
13034                self.commit_index = snapshot.group_commit_index;
13035                Ok(())
13036            })
13037        }
13038    }
13039
13040    #[derive(Debug, Clone)]
13041    struct BlockingFirstCreateEngineFactory {
13042        first_create_blocks: Arc<AtomicBool>,
13043        entered: Arc<Notify>,
13044        release: Arc<Notify>,
13045    }
13046
13047    impl Default for BlockingFirstCreateEngineFactory {
13048        fn default() -> Self {
13049            Self {
13050                first_create_blocks: Arc::new(AtomicBool::new(true)),
13051                entered: Arc::new(Notify::new()),
13052                release: Arc::new(Notify::new()),
13053            }
13054        }
13055    }
13056
13057    impl GroupEngineFactory for BlockingFirstCreateEngineFactory {
13058        fn create<'a>(
13059            &'a self,
13060            _placement: ShardPlacement,
13061            _metrics: GroupEngineMetrics,
13062        ) -> GroupEngineCreateFuture<'a> {
13063            Box::pin(async move {
13064                let engine: Box<dyn GroupEngine> = Box::new(BlockingFirstCreateEngine {
13065                    inner: InMemoryGroupEngine::default(),
13066                    first_create_blocks: self.first_create_blocks.clone(),
13067                    entered: self.entered.clone(),
13068                    release: self.release.clone(),
13069                });
13070                Ok(engine)
13071            })
13072        }
13073    }
13074
13075    struct BlockingFirstCreateEngine {
13076        inner: InMemoryGroupEngine,
13077        first_create_blocks: Arc<AtomicBool>,
13078        entered: Arc<Notify>,
13079        release: Arc<Notify>,
13080    }
13081
13082    impl GroupEngine for BlockingFirstCreateEngine {
13083        fn create_stream<'a>(
13084            &'a mut self,
13085            request: CreateStreamRequest,
13086            placement: ShardPlacement,
13087        ) -> GroupCreateStreamFuture<'a> {
13088            let should_block = self.first_create_blocks.swap(false, Ordering::SeqCst);
13089            let entered = self.entered.clone();
13090            let release = self.release.clone();
13091            Box::pin(async move {
13092                if should_block {
13093                    entered.notify_one();
13094                    release.notified().await;
13095                }
13096                self.inner.create_stream(request, placement).await
13097            })
13098        }
13099
13100        fn head_stream<'a>(
13101            &'a mut self,
13102            request: HeadStreamRequest,
13103            placement: ShardPlacement,
13104        ) -> GroupHeadStreamFuture<'a> {
13105            self.inner.head_stream(request, placement)
13106        }
13107
13108        fn read_stream<'a>(
13109            &'a mut self,
13110            request: ReadStreamRequest,
13111            placement: ShardPlacement,
13112        ) -> GroupReadStreamFuture<'a> {
13113            self.inner.read_stream(request, placement)
13114        }
13115
13116        fn touch_stream_access<'a>(
13117            &'a mut self,
13118            stream_id: BucketStreamId,
13119            now_ms: u64,
13120            renew_ttl: bool,
13121            placement: ShardPlacement,
13122        ) -> GroupTouchStreamAccessFuture<'a> {
13123            self.inner
13124                .touch_stream_access(stream_id, now_ms, renew_ttl, placement)
13125        }
13126
13127        fn add_fork_ref<'a>(
13128            &'a mut self,
13129            stream_id: BucketStreamId,
13130            now_ms: u64,
13131            placement: ShardPlacement,
13132        ) -> GroupForkRefFuture<'a> {
13133            self.inner.add_fork_ref(stream_id, now_ms, placement)
13134        }
13135
13136        fn release_fork_ref<'a>(
13137            &'a mut self,
13138            stream_id: BucketStreamId,
13139            placement: ShardPlacement,
13140        ) -> GroupForkRefFuture<'a> {
13141            self.inner.release_fork_ref(stream_id, placement)
13142        }
13143
13144        fn close_stream<'a>(
13145            &'a mut self,
13146            request: CloseStreamRequest,
13147            placement: ShardPlacement,
13148        ) -> GroupCloseStreamFuture<'a> {
13149            self.inner.close_stream(request, placement)
13150        }
13151
13152        fn delete_stream<'a>(
13153            &'a mut self,
13154            request: DeleteStreamRequest,
13155            placement: ShardPlacement,
13156        ) -> GroupDeleteStreamFuture<'a> {
13157            self.inner.delete_stream(request, placement)
13158        }
13159
13160        fn append<'a>(
13161            &'a mut self,
13162            request: AppendRequest,
13163            placement: ShardPlacement,
13164        ) -> GroupAppendFuture<'a> {
13165            self.inner.append(request, placement)
13166        }
13167
13168        fn append_batch<'a>(
13169            &'a mut self,
13170            request: AppendBatchRequest,
13171            placement: ShardPlacement,
13172        ) -> GroupAppendBatchFuture<'a> {
13173            self.inner.append_batch(request, placement)
13174        }
13175
13176        fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
13177            self.inner.snapshot(placement)
13178        }
13179
13180        fn install_snapshot<'a>(
13181            &'a mut self,
13182            snapshot: GroupSnapshot,
13183        ) -> GroupInstallSnapshotFuture<'a> {
13184            self.inner.install_snapshot(snapshot)
13185        }
13186    }
13187
13188    #[derive(Debug, Clone)]
13189    struct BlockingOnceFactory {
13190        first_create_blocks: Arc<AtomicBool>,
13191        entered: Arc<Notify>,
13192        release: Arc<Notify>,
13193    }
13194
13195    impl Default for BlockingOnceFactory {
13196        fn default() -> Self {
13197            Self {
13198                first_create_blocks: Arc::new(AtomicBool::new(true)),
13199                entered: Arc::new(Notify::new()),
13200                release: Arc::new(Notify::new()),
13201            }
13202        }
13203    }
13204
13205    impl GroupEngineFactory for BlockingOnceFactory {
13206        fn create<'a>(
13207            &'a self,
13208            _placement: ShardPlacement,
13209            _metrics: GroupEngineMetrics,
13210        ) -> GroupEngineCreateFuture<'a> {
13211            Box::pin(async move {
13212                if self.first_create_blocks.swap(false, Ordering::SeqCst) {
13213                    self.entered.notify_one();
13214                    self.release.notified().await;
13215                }
13216                let engine: Box<dyn GroupEngine> = Box::new(InMemoryGroupEngine::default());
13217                Ok(engine)
13218            })
13219        }
13220    }
13221
/// Stateless factory that hands out [`FailingEngine`] instances.
#[derive(Clone, Copy, Debug)]
struct FailingFactory;
13224
13225    impl GroupEngineFactory for FailingFactory {
13226        fn create<'a>(
13227            &'a self,
13228            _placement: ShardPlacement,
13229            _metrics: GroupEngineMetrics,
13230        ) -> GroupEngineCreateFuture<'a> {
13231            Box::pin(async {
13232                let engine: Box<dyn GroupEngine> = Box::new(FailingEngine);
13233                Ok(engine)
13234            })
13235        }
13236    }
13237
13238    struct FailingEngine;
13239
13240    impl GroupEngine for FailingEngine {
13241        fn create_stream<'a>(
13242            &'a mut self,
13243            _request: CreateStreamRequest,
13244            _placement: ShardPlacement,
13245        ) -> GroupCreateStreamFuture<'a> {
13246            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13247        }
13248
13249        fn head_stream<'a>(
13250            &'a mut self,
13251            _request: HeadStreamRequest,
13252            _placement: ShardPlacement,
13253        ) -> GroupHeadStreamFuture<'a> {
13254            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13255        }
13256
13257        fn read_stream<'a>(
13258            &'a mut self,
13259            _request: ReadStreamRequest,
13260            _placement: ShardPlacement,
13261        ) -> GroupReadStreamFuture<'a> {
13262            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13263        }
13264
13265        fn touch_stream_access<'a>(
13266            &'a mut self,
13267            _stream_id: BucketStreamId,
13268            _now_ms: u64,
13269            _renew_ttl: bool,
13270            _placement: ShardPlacement,
13271        ) -> GroupTouchStreamAccessFuture<'a> {
13272            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13273        }
13274
13275        fn add_fork_ref<'a>(
13276            &'a mut self,
13277            _stream_id: BucketStreamId,
13278            _now_ms: u64,
13279            _placement: ShardPlacement,
13280        ) -> GroupForkRefFuture<'a> {
13281            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13282        }
13283
13284        fn release_fork_ref<'a>(
13285            &'a mut self,
13286            _stream_id: BucketStreamId,
13287            _placement: ShardPlacement,
13288        ) -> GroupForkRefFuture<'a> {
13289            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13290        }
13291
13292        fn close_stream<'a>(
13293            &'a mut self,
13294            _request: CloseStreamRequest,
13295            _placement: ShardPlacement,
13296        ) -> GroupCloseStreamFuture<'a> {
13297            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13298        }
13299
13300        fn delete_stream<'a>(
13301            &'a mut self,
13302            _request: DeleteStreamRequest,
13303            _placement: ShardPlacement,
13304        ) -> GroupDeleteStreamFuture<'a> {
13305            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13306        }
13307
13308        fn append<'a>(
13309            &'a mut self,
13310            _request: AppendRequest,
13311            _placement: ShardPlacement,
13312        ) -> GroupAppendFuture<'a> {
13313            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13314        }
13315
13316        fn append_batch<'a>(
13317            &'a mut self,
13318            _request: AppendBatchRequest,
13319            _placement: ShardPlacement,
13320        ) -> GroupAppendBatchFuture<'a> {
13321            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13322        }
13323
13324        fn snapshot<'a>(&'a mut self, _placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
13325            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13326        }
13327
13328        fn install_snapshot<'a>(
13329            &'a mut self,
13330            _snapshot: GroupSnapshot,
13331        ) -> GroupInstallSnapshotFuture<'a> {
13332            Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13333        }
13334    }
13335}