1use std::collections::{HashMap, HashSet, VecDeque};
2use std::fmt;
3use std::fs::{self, File, OpenOptions};
4use std::future::Future;
5use std::io;
6use std::io::{BufRead, BufReader, Write};
7use std::path::{Path, PathBuf};
8use std::pin::Pin;
9use std::sync::Arc;
10use std::sync::atomic::{AtomicU64, Ordering};
11use std::time::{Instant, SystemTime, UNIX_EPOCH};
12
13use bytes::Bytes;
14use opendal::{Operator, Scheme};
15use serde::{Deserialize, Serialize};
16use tokio::sync::{Semaphore, mpsc, oneshot};
17use tokio::task::JoinSet;
18use ursula_shard::{
19 BucketStreamId, CoreId, RaftGroupId, ShardId, ShardMapError, ShardPlacement, StaticShardMap,
20};
21use ursula_stream::{
22 AppendStreamInput, ObjectPayloadRef, StreamCommand, StreamMessageRecord, StreamReadPlan,
23 StreamReadSegment, StreamResponse, StreamSnapshot, StreamStateMachine,
24};
25pub use ursula_stream::{
26 ColdChunkRef, ColdFlushCandidate, ExternalPayloadRef, ProducerRequest, StreamErrorCode,
27};
28
/// Content type filled in by the `AppendRequest` and `AppendBatchRequest`
/// constructors, which do not take a content type argument.
const DEFAULT_CONTENT_TYPE: &str = "application/octet-stream";
/// Process-wide counter mixed (alongside a timestamp) into the object paths
/// produced by `new_cold_chunk_path` and `new_external_payload_path`.
static COLD_CHUNK_SEQUENCE: AtomicU64 = AtomicU64::new(0);
31
/// Cold object storage accessor.
///
/// Wraps the OpenDAL [`Operator`] through which every chunk write, delete and
/// ranged read in this type's methods is issued.
#[derive(Clone, Debug)]
pub struct ColdStore {
    // Backend-specific operator (memory, fs or S3 depending on the constructor used).
    operator: Operator,
}
36
/// Reference-counted, shareable handle to a [`ColdStore`].
pub type ColdStoreHandle = Arc<ColdStore>;
38
39impl ColdStore {
40 pub fn memory() -> io::Result<Self> {
41 let operator = Operator::via_iter(Scheme::Memory, [])
42 .map_err(|err| io::Error::other(err.to_string()))?;
43 Ok(Self { operator })
44 }
45
46 pub fn fs(root: impl AsRef<Path>) -> io::Result<Self> {
47 let root = root.as_ref();
48 fs::create_dir_all(root)?;
49 let operator = Operator::via_iter(
50 Scheme::Fs,
51 [("root".to_owned(), root.to_string_lossy().to_string())],
52 )
53 .map_err(|err| io::Error::other(err.to_string()))?;
54 Ok(Self { operator })
55 }
56
57 pub fn s3_from_env() -> io::Result<Self> {
58 Self::s3_from_env_with_root(None)
59 }
60
61 pub fn s3_from_env_with_root(root_override: Option<&str>) -> io::Result<Self> {
62 let bucket = std::env::var("URSULA_COLD_S3_BUCKET").map_err(|_| {
63 io::Error::new(
64 io::ErrorKind::InvalidInput,
65 "URSULA_COLD_S3_BUCKET is required when URSULA_COLD_BACKEND=s3",
66 )
67 })?;
68 if bucket.trim().is_empty() {
69 return Err(io::Error::new(
70 io::ErrorKind::InvalidInput,
71 "URSULA_COLD_S3_BUCKET must not be empty",
72 ));
73 }
74
75 let mut builder = opendal::services::S3::default().bucket(&bucket);
76 if let Some(root) = root_override {
77 if !root.trim().is_empty() {
78 builder = builder.root(root);
79 }
80 } else if let Ok(root) = std::env::var("URSULA_COLD_ROOT")
81 && !root.trim().is_empty()
82 {
83 builder = builder.root(&root);
84 }
85 if let Ok(region) = std::env::var("URSULA_COLD_S3_REGION")
86 && !region.trim().is_empty()
87 {
88 builder = builder.region(®ion);
89 }
90 if let Ok(endpoint) = std::env::var("URSULA_COLD_S3_ENDPOINT")
91 && !endpoint.trim().is_empty()
92 {
93 builder = builder.endpoint(&endpoint);
94 }
95 if let Ok(access_key_id) = std::env::var("URSULA_COLD_S3_ACCESS_KEY_ID")
96 && !access_key_id.trim().is_empty()
97 {
98 builder = builder.access_key_id(&access_key_id);
99 }
100 if let Ok(secret_access_key) = std::env::var("URSULA_COLD_S3_SECRET_ACCESS_KEY")
101 && !secret_access_key.trim().is_empty()
102 {
103 builder = builder.secret_access_key(&secret_access_key);
104 }
105 if let Ok(session_token) = std::env::var("URSULA_COLD_S3_SESSION_TOKEN")
106 && !session_token.trim().is_empty()
107 {
108 builder = builder.session_token(&session_token);
109 }
110
111 Ok(Self {
112 operator: Operator::new(builder)
113 .map_err(|err| io::Error::other(err.to_string()))?
114 .finish(),
115 })
116 }
117
118 pub fn from_env() -> io::Result<Option<ColdStoreHandle>> {
119 let backend = std::env::var("URSULA_COLD_BACKEND")
120 .unwrap_or_else(|_| "none".to_owned())
121 .to_ascii_lowercase();
122 let store = match backend.as_str() {
123 "none" | "disabled" | "off" => return Ok(None),
124 "memory" | "mem" | "inmem" => Self::memory()?,
125 "fs" => {
126 let root =
127 std::env::var("URSULA_COLD_ROOT").unwrap_or_else(|_| "data/cold".to_owned());
128 Self::fs(root)?
129 }
130 "s3" => Self::s3_from_env()?,
131 other => {
132 return Err(io::Error::new(
133 io::ErrorKind::InvalidInput,
134 format!("unsupported URSULA_COLD_BACKEND '{other}'"),
135 ));
136 }
137 };
138 Ok(Some(Arc::new(store)))
139 }
140
141 pub async fn write_chunk(&self, path: &str, payload: &[u8]) -> io::Result<u64> {
142 if path.trim().is_empty() {
143 return Err(io::Error::new(
144 io::ErrorKind::InvalidInput,
145 "cold chunk path must not be empty",
146 ));
147 }
148 self.operator
149 .write(path, payload.to_vec())
150 .await
151 .map_err(|err| cold_store_io_error(path, err))?;
152 Ok(u64::try_from(payload.len()).expect("payload len fits u64"))
153 }
154
155 pub async fn delete_chunk(&self, path: &str) -> io::Result<()> {
156 if path.trim().is_empty() {
157 return Err(io::Error::new(
158 io::ErrorKind::InvalidInput,
159 "cold chunk path must not be empty",
160 ));
161 }
162 self.operator
163 .delete(path)
164 .await
165 .map_err(|err| cold_store_io_error(path, err))
166 }
167
168 pub async fn remove_all(&self, path: &str) -> io::Result<()> {
169 self.operator
170 .remove_all(path)
171 .await
172 .map_err(|err| cold_store_io_error(path, err))
173 }
174
175 pub async fn read_chunk_range(
176 &self,
177 chunk: &ColdChunkRef,
178 read_start_offset: u64,
179 len: usize,
180 ) -> io::Result<Vec<u8>> {
181 let object = ObjectPayloadRef {
182 start_offset: chunk.start_offset,
183 end_offset: chunk.end_offset,
184 s3_path: chunk.s3_path.clone(),
185 object_size: chunk.object_size,
186 };
187 self.read_object_range(&object, read_start_offset, len)
188 .await
189 }
190
191 pub async fn read_object_range(
192 &self,
193 object: &ObjectPayloadRef,
194 read_start_offset: u64,
195 len: usize,
196 ) -> io::Result<Vec<u8>> {
197 if len == 0 {
198 return Ok(Vec::new());
199 }
200 let len_u64 = u64::try_from(len).map_err(|_| {
201 io::Error::new(io::ErrorKind::InvalidInput, "cold read length exceeds u64")
202 })?;
203 let read_end = read_start_offset.checked_add(len_u64).ok_or_else(|| {
204 io::Error::new(io::ErrorKind::InvalidInput, "cold read range overflow")
205 })?;
206 if read_start_offset < object.start_offset || read_end > object.end_offset {
207 return Err(io::Error::new(
208 io::ErrorKind::InvalidInput,
209 format!(
210 "cold read range [{read_start_offset}..{read_end}) is outside object segment [{}..{})",
211 object.start_offset, object.end_offset
212 ),
213 ));
214 }
215 let object_start = read_start_offset - object.start_offset;
216 let object_end = object_start.checked_add(len_u64).ok_or_else(|| {
217 io::Error::new(io::ErrorKind::InvalidInput, "cold read range overflow")
218 })?;
219 if object_end > object.object_size {
220 return Err(io::Error::new(
221 io::ErrorKind::InvalidData,
222 format!(
223 "cold read range [{object_start}..{object_end}) is outside object '{}' size {}",
224 object.s3_path, object.object_size
225 ),
226 ));
227 }
228 let bytes = self
229 .operator
230 .read_with(&object.s3_path)
231 .range(object_start..object_end)
232 .await
233 .map_err(|err| cold_store_io_error(&object.s3_path, err))?
234 .to_bytes();
235 if bytes.len() != len {
236 return Err(io::Error::new(
237 io::ErrorKind::InvalidData,
238 format!(
239 "cold object '{}' returned {} bytes for requested range [{}..{})",
240 object.s3_path,
241 bytes.len(),
242 object_start,
243 object_end
244 ),
245 ));
246 }
247 Ok(bytes.to_vec())
248 }
249}
250
251fn cold_store_io_error(path: &str, err: opendal::Error) -> io::Error {
252 io::Error::other(format!("cold object '{path}': {err}"))
253}
254
255pub fn new_cold_chunk_path(
256 stream_id: &BucketStreamId,
257 start_offset: u64,
258 end_offset: u64,
259) -> String {
260 let unix_nanos = SystemTime::now()
261 .duration_since(UNIX_EPOCH)
262 .map(|duration| duration.as_nanos())
263 .unwrap_or(0);
264 let sequence = COLD_CHUNK_SEQUENCE.fetch_add(1, Ordering::Relaxed);
265 format!(
266 "{stream_id}/chunks/{start_offset:016x}-{end_offset:016x}-{unix_nanos:032x}-{sequence:016x}.bin"
267 )
268}
269
270pub fn new_external_payload_path(stream_id: &BucketStreamId) -> String {
271 let unix_nanos = SystemTime::now()
272 .duration_since(UNIX_EPOCH)
273 .map(|duration| duration.as_nanos())
274 .unwrap_or(0);
275 let sequence = COLD_CHUNK_SEQUENCE.fetch_add(1, Ordering::Relaxed);
276 format!("{stream_id}/external/{unix_nanos:032x}-{sequence:016x}.bin")
277}
278
/// Parameters for creating a stream with an inline initial payload.
///
/// Converts into `GroupWriteCommand::CreateStream`; every field except
/// `content_type_explicit` is carried into that command.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CreateStreamRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    // Set to `true` by `CreateStreamRequest::new`.
    // NOTE(review): presumably records whether the caller supplied the content type — confirm.
    pub content_type_explicit: bool,
    pub initial_payload: Bytes,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub stream_ttl_seconds: Option<u64>,
    pub stream_expires_at_ms: Option<u64>,
    pub forked_from: Option<BucketStreamId>,
    pub fork_offset: Option<u64>,
    pub now_ms: u64,
}
294
/// Create-stream parameters whose initial payload is an `ExternalPayloadRef`
/// rather than inline bytes.
///
/// Converts into `GroupWriteCommand::CreateExternal`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CreateStreamExternalRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    pub initial_payload: ExternalPayloadRef,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub stream_ttl_seconds: Option<u64>,
    pub stream_expires_at_ms: Option<u64>,
    pub forked_from: Option<BucketStreamId>,
    pub fork_offset: Option<u64>,
    pub now_ms: u64,
}
309
310impl CreateStreamExternalRequest {
311 pub fn from_create_request(
312 request: CreateStreamRequest,
313 initial_payload: ExternalPayloadRef,
314 ) -> Self {
315 Self {
316 stream_id: request.stream_id,
317 content_type: request.content_type,
318 initial_payload,
319 close_after: request.close_after,
320 stream_seq: request.stream_seq,
321 producer: request.producer,
322 stream_ttl_seconds: request.stream_ttl_seconds,
323 stream_expires_at_ms: request.stream_expires_at_ms,
324 forked_from: request.forked_from,
325 fork_offset: request.fork_offset,
326 now_ms: request.now_ms,
327 }
328 }
329}
330
331impl CreateStreamRequest {
332 pub fn new(stream_id: BucketStreamId, content_type: impl Into<String>) -> Self {
333 Self {
334 stream_id,
335 content_type: content_type.into(),
336 content_type_explicit: true,
337 initial_payload: Bytes::new(),
338 close_after: false,
339 stream_seq: None,
340 producer: None,
341 stream_ttl_seconds: None,
342 stream_expires_at_ms: None,
343 forked_from: None,
344 fork_offset: None,
345 now_ms: 0,
346 }
347 }
348}
349
/// Response to a create-stream request.
// NOTE(review): field meanings below are taken from their names only — confirm
// against the engine that populates them.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CreateStreamResponse {
    pub placement: ShardPlacement,
    pub next_offset: u64,
    pub closed: bool,
    pub already_exists: bool,
    pub group_commit_index: u64,
}
358
/// Request for a stream's metadata (answered with `HeadStreamResponse`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HeadStreamRequest {
    pub stream_id: BucketStreamId,
    // Caller-supplied timestamp; presumably milliseconds since the Unix epoch — TODO confirm.
    pub now_ms: u64,
}
364
/// Stream metadata: placement, content type, tail offset, closed flag,
/// TTL/expiry settings and an optional snapshot offset.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HeadStreamResponse {
    pub placement: ShardPlacement,
    pub content_type: String,
    pub tail_offset: u64,
    pub closed: bool,
    pub stream_ttl_seconds: Option<u64>,
    pub stream_expires_at_ms: Option<u64>,
    pub snapshot_offset: Option<u64>,
}
375
/// Request to read from a stream starting at `offset`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadStreamRequest {
    pub stream_id: BucketStreamId,
    pub offset: u64,
    // Presumably an upper bound on the returned payload size in bytes — TODO confirm.
    pub max_len: usize,
    pub now_ms: u64,
}
383
/// Fully materialized result of a stream read.
///
/// Produced by `GroupReadStreamParts::into_response` and accepted by
/// `GroupReadStreamParts::from_response`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadStreamResponse {
    pub placement: ShardPlacement,
    pub offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    pub payload: Vec<u8>,
    pub up_to_date: bool,
    pub closed: bool,
}
394
/// Body of a group read, either already in memory or still to be fetched.
pub enum GroupReadStreamBody {
    /// Payload bytes that are already available.
    Materialized(Vec<u8>),
    /// A read plan that `GroupReadStreamParts::into_response` resolves into
    /// bytes, optionally using the given cold store.
    Planned {
        stream_id: BucketStreamId,
        plan: StreamReadPlan,
        cold_store: Option<ColdStoreHandle>,
    },
    /// Test-only body: `into_response` notifies `entered`, waits for
    /// `release`, then yields `payload`.
    #[cfg(test)]
    Blocking {
        entered: Arc<tokio::sync::Notify>,
        release: Arc<tokio::sync::Notify>,
        payload: Vec<u8>,
    },
}
409
/// Read result split into its metadata and a (possibly deferred) body.
///
/// Carries the same metadata fields as `ReadStreamResponse`, with the payload
/// replaced by a `GroupReadStreamBody`.
pub struct GroupReadStreamParts {
    pub placement: ShardPlacement,
    pub offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    pub up_to_date: bool,
    pub closed: bool,
    pub body: GroupReadStreamBody,
}
419
420impl GroupReadStreamParts {
421 pub fn from_response(response: ReadStreamResponse) -> Self {
422 Self {
423 placement: response.placement,
424 offset: response.offset,
425 next_offset: response.next_offset,
426 content_type: response.content_type,
427 up_to_date: response.up_to_date,
428 closed: response.closed,
429 body: GroupReadStreamBody::Materialized(response.payload),
430 }
431 }
432
433 pub fn from_plan(
434 placement: ShardPlacement,
435 stream_id: BucketStreamId,
436 plan: StreamReadPlan,
437 cold_store: Option<ColdStoreHandle>,
438 ) -> Self {
439 Self {
440 placement,
441 offset: plan.offset,
442 next_offset: plan.next_offset,
443 content_type: plan.content_type.clone(),
444 up_to_date: plan.up_to_date,
445 closed: plan.closed,
446 body: GroupReadStreamBody::Planned {
447 stream_id,
448 plan,
449 cold_store,
450 },
451 }
452 }
453
454 pub async fn into_response(self) -> Result<ReadStreamResponse, GroupEngineError> {
455 let payload = match &self.body {
456 GroupReadStreamBody::Materialized(payload) => payload.clone(),
457 GroupReadStreamBody::Planned {
458 stream_id,
459 plan,
460 cold_store,
461 } => {
462 InMemoryGroupEngine::read_payload_from_plan(cold_store.as_ref(), stream_id, plan)
463 .await?
464 }
465 #[cfg(test)]
466 GroupReadStreamBody::Blocking {
467 entered,
468 release,
469 payload,
470 } => {
471 entered.notify_one();
472 release.notified().await;
473 payload.clone()
474 }
475 };
476 Ok(ReadStreamResponse {
477 placement: self.placement,
478 offset: self.offset,
479 next_offset: self.next_offset,
480 content_type: self.content_type,
481 payload,
482 up_to_date: self.up_to_date,
483 closed: self.closed,
484 })
485 }
486
487 fn payload_is_empty(&self) -> bool {
488 match &self.body {
489 GroupReadStreamBody::Materialized(payload) => payload.is_empty(),
490 GroupReadStreamBody::Planned { plan, .. } => {
491 plan.segments.iter().all(|segment| match segment {
492 StreamReadSegment::Hot(payload) => payload.is_empty(),
493 StreamReadSegment::Object(segment) => segment.len == 0,
494 })
495 }
496 #[cfg(test)]
497 GroupReadStreamBody::Blocking { payload, .. } => payload.is_empty(),
498 }
499 }
500}
501
/// Parameters for publishing a snapshot payload at `snapshot_offset`.
///
/// Converts into `GroupWriteCommand::PublishSnapshot`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PublishSnapshotRequest {
    pub stream_id: BucketStreamId,
    pub snapshot_offset: u64,
    pub content_type: String,
    pub payload: Bytes,
    pub now_ms: u64,
}
510
/// Response to a publish-snapshot request.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PublishSnapshotResponse {
    pub placement: ShardPlacement,
    pub snapshot_offset: u64,
    pub group_commit_index: u64,
}
517
/// Request to read a snapshot of a stream.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadSnapshotRequest {
    pub stream_id: BucketStreamId,
    // NOTE(review): `None` presumably selects the latest snapshot — confirm.
    pub snapshot_offset: Option<u64>,
    pub now_ms: u64,
}
524
/// Snapshot payload together with its offset, content type and read position.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadSnapshotResponse {
    pub placement: ShardPlacement,
    pub snapshot_offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    pub payload: Vec<u8>,
    pub up_to_date: bool,
}
534
/// Request to delete the snapshot of `stream_id` at `snapshot_offset`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeleteSnapshotRequest {
    pub stream_id: BucketStreamId,
    pub snapshot_offset: u64,
    pub now_ms: u64,
}
541
/// Request for a stream's bootstrap data (answered with `BootstrapStreamResponse`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BootstrapStreamRequest {
    pub stream_id: BucketStreamId,
    pub now_ms: u64,
}
547
/// One update entry of a `BootstrapStreamResponse`: an offset range, a
/// content type and the payload bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BootstrapUpdate {
    pub start_offset: u64,
    pub next_offset: u64,
    pub content_type: String,
    pub payload: Vec<u8>,
}
555
/// Bootstrap data for a stream: optional snapshot information plus a list of
/// `BootstrapUpdate` entries and the resulting read position.
// NOTE(review): presumably `updates` are the records following the snapshot — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BootstrapStreamResponse {
    pub placement: ShardPlacement,
    pub snapshot_offset: Option<u64>,
    pub snapshot_content_type: String,
    pub snapshot_payload: Vec<u8>,
    pub updates: Vec<BootstrapUpdate>,
    pub next_offset: u64,
    pub up_to_date: bool,
    pub closed: bool,
}
567
/// Parameters for closing a stream.
///
/// Converts into `GroupWriteCommand::CloseStream`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CloseStreamRequest {
    pub stream_id: BucketStreamId,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub now_ms: u64,
}
575
/// Response to a close-stream request.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CloseStreamResponse {
    pub placement: ShardPlacement,
    pub next_offset: u64,
    pub group_commit_index: u64,
    pub deduplicated: bool,
}
583
/// Identifies the stream to delete.
///
/// Converts into `GroupWriteCommand::DeleteStream`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeleteStreamRequest {
    pub stream_id: BucketStreamId,
}
588
/// Response to a delete-stream request.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DeleteStreamResponse {
    pub placement: ShardPlacement,
    pub group_commit_index: u64,
    pub hard_deleted: bool,
    // NOTE(review): presumably the fork parent whose reference should be released next — confirm.
    pub parent_to_release: Option<BucketStreamId>,
}
596
/// Response to a fork-reference change.
// NOTE(review): presumably returned for `AddForkRef` / `ReleaseForkRef` commands — confirm.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ForkRefResponse {
    pub placement: ShardPlacement,
    pub fork_ref_count: u64,
    pub hard_deleted: bool,
    pub parent_to_release: Option<BucketStreamId>,
    pub group_commit_index: u64,
}
605
/// Pairs a stream with the cold chunk reference to record for it.
///
/// Converts into `GroupWriteCommand::FlushCold`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FlushColdRequest {
    pub stream_id: BucketStreamId,
    pub chunk: ColdChunkRef,
}
611
/// Response to a flush-cold request.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FlushColdResponse {
    pub placement: ShardPlacement,
    pub hot_start_offset: u64,
    pub group_commit_index: u64,
}
618
/// Response to a touch-stream-access operation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TouchStreamAccessResponse {
    pub placement: ShardPlacement,
    pub changed: bool,
    pub expired: bool,
    pub group_commit_index: u64,
}
626
/// Thresholds for planning a cold flush of a single stream.
// NOTE(review): both limits are presumably byte counts (minimum hot data before
// flushing, maximum bytes per flush) — confirm against the planner.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlanColdFlushRequest {
    pub stream_id: BucketStreamId,
    pub min_hot_bytes: usize,
    pub max_flush_bytes: usize,
}
633
/// Thresholds for planning a cold flush without naming a specific stream.
// NOTE(review): presumably applied across a whole group — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlanGroupColdFlushRequest {
    pub min_hot_bytes: usize,
    pub max_flush_bytes: usize,
}
639
/// Hot-byte counters reported for a stream and for its group.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ColdHotBacklog {
    pub stream_id: BucketStreamId,
    pub stream_hot_bytes: u64,
    pub group_hot_bytes: u64,
}
646
/// Write-admission setting tied to hot bytes per group.
///
/// Counts as enabled exactly when `max_hot_bytes_per_group` is `Some`; the
/// derived default is `None`, i.e. disabled.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ColdWriteAdmission {
    pub max_hot_bytes_per_group: Option<u64>,
}
651
652impl ColdWriteAdmission {
653 fn is_enabled(self) -> bool {
654 self.max_hot_bytes_per_group.is_some()
655 }
656}
657
/// Parameters for appending an inline payload to a stream.
///
/// Converts into `GroupWriteCommand::Append`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AppendRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    pub payload: Bytes,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub now_ms: u64,
}
668
/// Append parameters whose payload is an `ExternalPayloadRef` rather than
/// inline bytes.
///
/// Converts into `GroupWriteCommand::AppendExternal`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AppendExternalRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    pub payload: ExternalPayloadRef,
    pub close_after: bool,
    pub stream_seq: Option<String>,
    pub producer: Option<ProducerRequest>,
    pub now_ms: u64,
}
679
680impl AppendExternalRequest {
681 pub fn from_append_request(request: AppendRequest, payload: ExternalPayloadRef) -> Self {
682 Self {
683 stream_id: request.stream_id,
684 content_type: request.content_type,
685 payload,
686 close_after: request.close_after,
687 stream_seq: request.stream_seq,
688 producer: request.producer,
689 now_ms: request.now_ms,
690 }
691 }
692}
693
694impl AppendRequest {
695 pub fn new(stream_id: BucketStreamId, payload_len: u64) -> Self {
696 Self {
697 stream_id,
698 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
699 payload: Bytes::from(vec![
700 0;
701 usize::try_from(payload_len)
702 .expect("payload_len fits usize")
703 ]),
704 close_after: false,
705 stream_seq: None,
706 producer: None,
707 now_ms: 0,
708 }
709 }
710
711 pub fn from_bytes(stream_id: BucketStreamId, payload: impl Into<Bytes>) -> Self {
712 Self {
713 stream_id,
714 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
715 payload: payload.into(),
716 close_after: false,
717 stream_seq: None,
718 producer: None,
719 now_ms: 0,
720 }
721 }
722
723 pub fn payload_len(&self) -> u64 {
724 u64::try_from(self.payload.len()).expect("payload len fits u64")
725 }
726}
727
/// Parameters for appending several payloads to one stream in a single request.
///
/// Converts into `GroupWriteCommand::AppendBatch`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AppendBatchRequest {
    pub stream_id: BucketStreamId,
    pub content_type: String,
    pub payloads: Vec<Bytes>,
    pub producer: Option<ProducerRequest>,
    pub now_ms: u64,
}
736
737impl AppendBatchRequest {
738 pub fn new<P>(stream_id: BucketStreamId, payloads: Vec<P>) -> Self
739 where
740 P: Into<Bytes>,
741 {
742 Self {
743 stream_id,
744 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
745 payloads: payloads.into_iter().map(Into::into).collect(),
746 producer: None,
747 now_ms: 0,
748 }
749 }
750}
751
/// Response to an append request; also the per-item success type in
/// `AppendBatchResponse`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AppendResponse {
    pub placement: ShardPlacement,
    pub start_offset: u64,
    pub next_offset: u64,
    pub stream_append_count: u64,
    pub group_commit_index: u64,
    pub closed: bool,
    pub deduplicated: bool,
    pub producer: Option<ProducerRequest>,
}
763
/// Response to a batch append: one `Result` per item, each either an
/// `AppendResponse` or a `RuntimeError`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AppendBatchResponse {
    pub placement: ShardPlacement,
    pub items: Vec<Result<AppendResponse, RuntimeError>>,
}
769
/// Append counter for a single stream, as stored in `GroupSnapshot`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StreamAppendCount {
    pub stream_id: BucketStreamId,
    pub append_count: u64,
}
775
/// Serializable snapshot of a group: its placement, commit index, the stream
/// state machine snapshot and per-stream append counts.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GroupSnapshot {
    pub placement: ShardPlacement,
    pub group_commit_index: u64,
    pub stream_snapshot: StreamSnapshot,
    pub stream_append_counts: Vec<StreamAppendCount>,
}
783
/// Write command applied to a group.
///
/// Serialized by serde as an internally tagged enum: the variant name is
/// written in snake_case under the `command` key. The `Display`
/// implementation renders a short one-line summary of each variant.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "command", rename_all = "snake_case")]
pub enum GroupWriteCommand {
    /// Create a stream with an inline initial payload (built from `CreateStreamRequest`).
    CreateStream {
        stream_id: BucketStreamId,
        content_type: String,
        initial_payload: Bytes,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        stream_ttl_seconds: Option<u64>,
        stream_expires_at_ms: Option<u64>,
        forked_from: Option<BucketStreamId>,
        fork_offset: Option<u64>,
        now_ms: u64,
    },
    /// Create a stream whose initial payload is an external reference
    /// (built from `CreateStreamExternalRequest`).
    CreateExternal {
        stream_id: BucketStreamId,
        content_type: String,
        initial_payload: ExternalPayloadRef,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        stream_ttl_seconds: Option<u64>,
        stream_expires_at_ms: Option<u64>,
        forked_from: Option<BucketStreamId>,
        fork_offset: Option<u64>,
        now_ms: u64,
    },
    /// Append an inline payload (built from `AppendRequest`).
    Append {
        stream_id: BucketStreamId,
        content_type: String,
        payload: Bytes,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Append an externally stored payload (built from `AppendExternalRequest`).
    AppendExternal {
        stream_id: BucketStreamId,
        content_type: String,
        payload: ExternalPayloadRef,
        close_after: bool,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Append several inline payloads to one stream (built from `AppendBatchRequest`).
    AppendBatch {
        stream_id: BucketStreamId,
        content_type: String,
        payloads: Vec<Bytes>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Publish a snapshot payload at an offset (built from `PublishSnapshotRequest`).
    PublishSnapshot {
        stream_id: BucketStreamId,
        snapshot_offset: u64,
        content_type: String,
        payload: Bytes,
        now_ms: u64,
    },
    /// Record an access to a stream.
    // NOTE(review): `renew_ttl` presumably controls whether the TTL is extended — confirm.
    TouchStreamAccess {
        stream_id: BucketStreamId,
        now_ms: u64,
        renew_ttl: bool,
    },
    /// Add a fork reference to a stream.
    AddForkRef {
        stream_id: BucketStreamId,
        now_ms: u64,
    },
    /// Release a fork reference held on a stream.
    ReleaseForkRef {
        stream_id: BucketStreamId,
    },
    /// Record a cold chunk for a stream (built from `FlushColdRequest`).
    FlushCold {
        stream_id: BucketStreamId,
        chunk: ColdChunkRef,
    },
    /// Close a stream (built from `CloseStreamRequest`).
    CloseStream {
        stream_id: BucketStreamId,
        stream_seq: Option<String>,
        producer: Option<ProducerRequest>,
        now_ms: u64,
    },
    /// Delete a stream (built from `DeleteStreamRequest`).
    DeleteStream {
        stream_id: BucketStreamId,
    },
    /// A list of nested commands carried as a single command.
    Batch {
        commands: Vec<GroupWriteCommand>,
    },
}
874
875impl From<CreateStreamRequest> for GroupWriteCommand {
876 fn from(request: CreateStreamRequest) -> Self {
877 Self::CreateStream {
878 stream_id: request.stream_id,
879 content_type: request.content_type,
880 initial_payload: request.initial_payload,
881 close_after: request.close_after,
882 stream_seq: request.stream_seq,
883 producer: request.producer,
884 stream_ttl_seconds: request.stream_ttl_seconds,
885 stream_expires_at_ms: request.stream_expires_at_ms,
886 forked_from: request.forked_from,
887 fork_offset: request.fork_offset,
888 now_ms: request.now_ms,
889 }
890 }
891}
892
893impl From<&CreateStreamRequest> for GroupWriteCommand {
894 fn from(request: &CreateStreamRequest) -> Self {
895 Self::CreateStream {
896 stream_id: request.stream_id.clone(),
897 content_type: request.content_type.clone(),
898 initial_payload: request.initial_payload.clone(),
899 close_after: request.close_after,
900 stream_seq: request.stream_seq.clone(),
901 producer: request.producer.clone(),
902 stream_ttl_seconds: request.stream_ttl_seconds,
903 stream_expires_at_ms: request.stream_expires_at_ms,
904 forked_from: request.forked_from.clone(),
905 fork_offset: request.fork_offset,
906 now_ms: request.now_ms,
907 }
908 }
909}
910
911impl From<CreateStreamExternalRequest> for GroupWriteCommand {
912 fn from(request: CreateStreamExternalRequest) -> Self {
913 Self::CreateExternal {
914 stream_id: request.stream_id,
915 content_type: request.content_type,
916 initial_payload: request.initial_payload,
917 close_after: request.close_after,
918 stream_seq: request.stream_seq,
919 producer: request.producer,
920 stream_ttl_seconds: request.stream_ttl_seconds,
921 stream_expires_at_ms: request.stream_expires_at_ms,
922 forked_from: request.forked_from,
923 fork_offset: request.fork_offset,
924 now_ms: request.now_ms,
925 }
926 }
927}
928
929impl From<&CreateStreamExternalRequest> for GroupWriteCommand {
930 fn from(request: &CreateStreamExternalRequest) -> Self {
931 Self::CreateExternal {
932 stream_id: request.stream_id.clone(),
933 content_type: request.content_type.clone(),
934 initial_payload: request.initial_payload.clone(),
935 close_after: request.close_after,
936 stream_seq: request.stream_seq.clone(),
937 producer: request.producer.clone(),
938 stream_ttl_seconds: request.stream_ttl_seconds,
939 stream_expires_at_ms: request.stream_expires_at_ms,
940 forked_from: request.forked_from.clone(),
941 fork_offset: request.fork_offset,
942 now_ms: request.now_ms,
943 }
944 }
945}
946
947impl From<AppendRequest> for GroupWriteCommand {
948 fn from(request: AppendRequest) -> Self {
949 Self::Append {
950 stream_id: request.stream_id,
951 content_type: request.content_type,
952 payload: request.payload,
953 close_after: request.close_after,
954 stream_seq: request.stream_seq,
955 producer: request.producer,
956 now_ms: request.now_ms,
957 }
958 }
959}
960
961impl From<&AppendRequest> for GroupWriteCommand {
962 fn from(request: &AppendRequest) -> Self {
963 Self::Append {
964 stream_id: request.stream_id.clone(),
965 content_type: request.content_type.clone(),
966 payload: request.payload.clone(),
967 close_after: request.close_after,
968 stream_seq: request.stream_seq.clone(),
969 producer: request.producer.clone(),
970 now_ms: request.now_ms,
971 }
972 }
973}
974
975impl From<AppendExternalRequest> for GroupWriteCommand {
976 fn from(request: AppendExternalRequest) -> Self {
977 Self::AppendExternal {
978 stream_id: request.stream_id,
979 content_type: request.content_type,
980 payload: request.payload,
981 close_after: request.close_after,
982 stream_seq: request.stream_seq,
983 producer: request.producer,
984 now_ms: request.now_ms,
985 }
986 }
987}
988
989impl From<&AppendExternalRequest> for GroupWriteCommand {
990 fn from(request: &AppendExternalRequest) -> Self {
991 Self::AppendExternal {
992 stream_id: request.stream_id.clone(),
993 content_type: request.content_type.clone(),
994 payload: request.payload.clone(),
995 close_after: request.close_after,
996 stream_seq: request.stream_seq.clone(),
997 producer: request.producer.clone(),
998 now_ms: request.now_ms,
999 }
1000 }
1001}
1002
1003impl From<AppendBatchRequest> for GroupWriteCommand {
1004 fn from(request: AppendBatchRequest) -> Self {
1005 Self::AppendBatch {
1006 stream_id: request.stream_id,
1007 content_type: request.content_type,
1008 payloads: request.payloads,
1009 producer: request.producer,
1010 now_ms: request.now_ms,
1011 }
1012 }
1013}
1014
1015impl From<&AppendBatchRequest> for GroupWriteCommand {
1016 fn from(request: &AppendBatchRequest) -> Self {
1017 Self::AppendBatch {
1018 stream_id: request.stream_id.clone(),
1019 content_type: request.content_type.clone(),
1020 payloads: request.payloads.clone(),
1021 producer: request.producer.clone(),
1022 now_ms: request.now_ms,
1023 }
1024 }
1025}
1026
1027impl From<PublishSnapshotRequest> for GroupWriteCommand {
1028 fn from(request: PublishSnapshotRequest) -> Self {
1029 Self::PublishSnapshot {
1030 stream_id: request.stream_id,
1031 snapshot_offset: request.snapshot_offset,
1032 content_type: request.content_type,
1033 payload: request.payload,
1034 now_ms: request.now_ms,
1035 }
1036 }
1037}
1038
1039impl From<&PublishSnapshotRequest> for GroupWriteCommand {
1040 fn from(request: &PublishSnapshotRequest) -> Self {
1041 Self::PublishSnapshot {
1042 stream_id: request.stream_id.clone(),
1043 snapshot_offset: request.snapshot_offset,
1044 content_type: request.content_type.clone(),
1045 payload: request.payload.clone(),
1046 now_ms: request.now_ms,
1047 }
1048 }
1049}
1050
1051impl From<CloseStreamRequest> for GroupWriteCommand {
1052 fn from(request: CloseStreamRequest) -> Self {
1053 Self::CloseStream {
1054 stream_id: request.stream_id,
1055 stream_seq: request.stream_seq,
1056 producer: request.producer,
1057 now_ms: request.now_ms,
1058 }
1059 }
1060}
1061
1062impl From<&CloseStreamRequest> for GroupWriteCommand {
1063 fn from(request: &CloseStreamRequest) -> Self {
1064 Self::CloseStream {
1065 stream_id: request.stream_id.clone(),
1066 stream_seq: request.stream_seq.clone(),
1067 producer: request.producer.clone(),
1068 now_ms: request.now_ms,
1069 }
1070 }
1071}
1072
1073impl From<DeleteStreamRequest> for GroupWriteCommand {
1074 fn from(request: DeleteStreamRequest) -> Self {
1075 Self::DeleteStream {
1076 stream_id: request.stream_id,
1077 }
1078 }
1079}
1080
1081impl From<&DeleteStreamRequest> for GroupWriteCommand {
1082 fn from(request: &DeleteStreamRequest) -> Self {
1083 Self::DeleteStream {
1084 stream_id: request.stream_id.clone(),
1085 }
1086 }
1087}
1088
1089impl From<FlushColdRequest> for GroupWriteCommand {
1090 fn from(request: FlushColdRequest) -> Self {
1091 Self::FlushCold {
1092 stream_id: request.stream_id,
1093 chunk: request.chunk,
1094 }
1095 }
1096}
1097
1098impl From<&FlushColdRequest> for GroupWriteCommand {
1099 fn from(request: &FlushColdRequest) -> Self {
1100 Self::FlushCold {
1101 stream_id: request.stream_id.clone(),
1102 chunk: request.chunk.clone(),
1103 }
1104 }
1105}
1106
1107impl fmt::Display for GroupWriteCommand {
1108 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1109 match self {
1110 Self::CreateStream { stream_id, .. } => {
1111 write!(f, "create_stream:{stream_id}")
1112 }
1113 Self::CreateExternal {
1114 stream_id,
1115 initial_payload,
1116 ..
1117 } => {
1118 write!(
1119 f,
1120 "create_external:{stream_id}:{} bytes",
1121 initial_payload.payload_len
1122 )
1123 }
1124 Self::Append {
1125 stream_id, payload, ..
1126 } => {
1127 write!(f, "append:{stream_id}:{} bytes", payload.len())
1128 }
1129 Self::AppendExternal {
1130 stream_id, payload, ..
1131 } => {
1132 write!(
1133 f,
1134 "append_external:{stream_id}:{} bytes",
1135 payload.payload_len
1136 )
1137 }
1138 Self::AppendBatch {
1139 stream_id,
1140 payloads,
1141 ..
1142 } => {
1143 write!(f, "append_batch:{stream_id}:{} items", payloads.len())
1144 }
1145 Self::PublishSnapshot {
1146 stream_id,
1147 snapshot_offset,
1148 payload,
1149 ..
1150 } => {
1151 write!(
1152 f,
1153 "publish_snapshot:{stream_id}:{snapshot_offset}:{} bytes",
1154 payload.len()
1155 )
1156 }
1157 Self::TouchStreamAccess {
1158 stream_id,
1159 renew_ttl,
1160 ..
1161 } => {
1162 write!(f, "touch_stream_access:{stream_id}:renew_ttl={renew_ttl}")
1163 }
1164 Self::AddForkRef { stream_id, .. } => {
1165 write!(f, "add_fork_ref:{stream_id}")
1166 }
1167 Self::ReleaseForkRef { stream_id } => {
1168 write!(f, "release_fork_ref:{stream_id}")
1169 }
1170 Self::FlushCold { stream_id, chunk } => {
1171 write!(
1172 f,
1173 "flush_cold:{stream_id}:{}..{}",
1174 chunk.start_offset, chunk.end_offset
1175 )
1176 }
1177 Self::CloseStream { stream_id, .. } => {
1178 write!(f, "close_stream:{stream_id}")
1179 }
1180 Self::DeleteStream { stream_id } => {
1181 write!(f, "delete_stream:{stream_id}")
1182 }
1183 Self::Batch { commands } => {
1184 write!(f, "batch:{} commands", commands.len())
1185 }
1186 }
1187 }
1188}
1189
/// Errors surfaced by the shard runtime.
///
/// The user-facing text for each variant is produced by the `Display`
/// implementation for this type.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RuntimeError {
    /// The shard map configuration was rejected; wraps the underlying
    /// `ShardMapError` (see the `From<ShardMapError>` conversion).
    InvalidConfig(ShardMapError),
    /// A raft group id fell outside the configured range
    /// `0..raft_group_count`.
    InvalidRaftGroup {
        raft_group_id: RaftGroupId,
        raft_group_count: u32,
    },
    /// A snapshot's placement does not match the placement that was expected.
    SnapshotPlacementMismatch {
        expected: ShardPlacement,
        actual: ShardPlacement,
    },
    /// An append was attempted with an empty payload.
    EmptyAppend,
    /// The cold store configuration is invalid.
    ColdStoreConfig {
        message: String,
    },
    /// A cold store I/O operation failed.
    ColdStoreIo {
        message: String,
    },
    /// Admitting another live-read waiter on `core_id` would exceed `limit`.
    LiveReadBackpressure {
        core_id: CoreId,
        current_waiters: u64,
        limit: u64,
    },
    /// A group engine reported an error for the given core and raft group.
    ///
    /// `message`, `next_offset` and `leader_hint` are copied from the
    /// originating `GroupEngineError` by `RuntimeError::group_engine`.
    GroupEngine {
        core_id: CoreId,
        raft_group_id: RaftGroupId,
        message: String,
        next_offset: Option<u64>,
        leader_hint: Option<GroupLeaderHint>,
    },
    /// The mailbox of `core_id` is closed.
    MailboxClosed {
        core_id: CoreId,
    },
    /// `core_id` dropped a response before delivering it.
    ResponseDropped {
        core_id: CoreId,
    },
    /// The thread for `core_id` could not be spawned.
    SpawnCoreThread {
        core_id: CoreId,
        message: String,
    },
}
1231
1232impl RuntimeError {
1233 fn group_engine(placement: ShardPlacement, err: GroupEngineError) -> Self {
1234 Self::GroupEngine {
1235 core_id: placement.core_id,
1236 raft_group_id: placement.raft_group_id,
1237 message: err.message().to_owned(),
1238 next_offset: err.next_offset(),
1239 leader_hint: err.leader_hint().cloned(),
1240 }
1241 }
1242}
1243
1244impl std::fmt::Display for RuntimeError {
1245 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1246 match self {
1247 Self::InvalidConfig(err) => write!(f, "invalid shard runtime config: {err}"),
1248 Self::InvalidRaftGroup {
1249 raft_group_id,
1250 raft_group_count,
1251 } => write!(
1252 f,
1253 "raft group {} is outside configured range 0..{}",
1254 raft_group_id.0, raft_group_count
1255 ),
1256 Self::SnapshotPlacementMismatch { expected, actual } => write!(
1257 f,
1258 "snapshot placement for raft group {} is core {}, expected core {}",
1259 actual.raft_group_id.0, actual.core_id.0, expected.core_id.0
1260 ),
1261 Self::EmptyAppend => f.write_str("append payload must be non-empty"),
1262 Self::ColdStoreConfig { message } => {
1263 write!(f, "invalid cold store config: {message}")
1264 }
1265 Self::ColdStoreIo { message } => write!(f, "cold store IO error: {message}"),
1266 Self::LiveReadBackpressure {
1267 core_id,
1268 current_waiters,
1269 limit,
1270 } => write!(
1271 f,
1272 "core {} live read waiters at {} would exceed limit {}",
1273 core_id.0, current_waiters, limit
1274 ),
1275 Self::GroupEngine {
1276 core_id,
1277 raft_group_id,
1278 message,
1279 ..
1280 } => write!(
1281 f,
1282 "core {} raft group {} append failed: {message}",
1283 core_id.0, raft_group_id.0
1284 ),
1285 Self::MailboxClosed { core_id } => {
1286 write!(f, "core {} mailbox is closed", core_id.0)
1287 }
1288 Self::ResponseDropped { core_id } => {
1289 write!(f, "core {} dropped append response", core_id.0)
1290 }
1291 Self::SpawnCoreThread { core_id, message } => {
1292 write!(f, "failed to spawn core {} thread: {message}", core_id.0)
1293 }
1294 }
1295 }
1296}
1297
// Empty impl: `RuntimeError` relies on the `Error` trait's default methods,
// so no underlying `source()` is exposed.
impl std::error::Error for RuntimeError {}
1299
1300impl From<ShardMapError> for RuntimeError {
1301 fn from(value: ShardMapError) -> Self {
1302 Self::InvalidConfig(value)
1303 }
1304}
1305
1306fn map_fork_source_ref_error(err: RuntimeError, placement: ShardPlacement) -> RuntimeError {
1307 if let RuntimeError::GroupEngine { message, .. } = &err
1308 && message.contains("StreamGone")
1309 {
1310 return RuntimeError::group_engine(
1311 placement,
1312 GroupEngineError::stream(
1313 StreamErrorCode::StreamAlreadyExistsConflict,
1314 "source stream is gone and cannot be forked",
1315 ),
1316 );
1317 }
1318 err
1319}
1320
1321pub type GroupAppendFuture<'a> =
1322 Pin<Box<dyn Future<Output = Result<AppendResponse, GroupEngineError>> + Send + 'a>>;
1323pub type GroupAppendBatchFuture<'a> =
1324 Pin<Box<dyn Future<Output = Result<GroupAppendBatchResponse, GroupEngineError>> + Send + 'a>>;
1325pub type GroupFlushColdFuture<'a> =
1326 Pin<Box<dyn Future<Output = Result<FlushColdResponse, GroupEngineError>> + Send + 'a>>;
1327pub type GroupPlanColdFlushFuture<'a> =
1328 Pin<Box<dyn Future<Output = Result<Option<ColdFlushCandidate>, GroupEngineError>> + Send + 'a>>;
1329pub type GroupPlanNextColdFlushFuture<'a> =
1330 Pin<Box<dyn Future<Output = Result<Option<ColdFlushCandidate>, GroupEngineError>> + Send + 'a>>;
1331pub type GroupPlanNextColdFlushBatchFuture<'a> =
1332 Pin<Box<dyn Future<Output = Result<Vec<ColdFlushCandidate>, GroupEngineError>> + Send + 'a>>;
1333pub type GroupColdHotBacklogFuture<'a> =
1334 Pin<Box<dyn Future<Output = Result<ColdHotBacklog, GroupEngineError>> + Send + 'a>>;
1335pub type GroupCreateStreamFuture<'a> =
1336 Pin<Box<dyn Future<Output = Result<CreateStreamResponse, GroupEngineError>> + Send + 'a>>;
1337pub type GroupHeadStreamFuture<'a> =
1338 Pin<Box<dyn Future<Output = Result<HeadStreamResponse, GroupEngineError>> + Send + 'a>>;
1339pub type GroupReadStreamFuture<'a> =
1340 Pin<Box<dyn Future<Output = Result<ReadStreamResponse, GroupEngineError>> + Send + 'a>>;
1341pub type GroupReadStreamPartsFuture<'a> =
1342 Pin<Box<dyn Future<Output = Result<GroupReadStreamParts, GroupEngineError>> + Send + 'a>>;
1343pub type GroupRequireLiveReadOwnerFuture<'a> =
1344 Pin<Box<dyn Future<Output = Result<(), GroupEngineError>> + Send + 'a>>;
1345pub type GroupPublishSnapshotFuture<'a> =
1346 Pin<Box<dyn Future<Output = Result<PublishSnapshotResponse, GroupEngineError>> + Send + 'a>>;
1347pub type GroupReadSnapshotFuture<'a> =
1348 Pin<Box<dyn Future<Output = Result<ReadSnapshotResponse, GroupEngineError>> + Send + 'a>>;
1349pub type GroupDeleteSnapshotFuture<'a> =
1350 Pin<Box<dyn Future<Output = Result<(), GroupEngineError>> + Send + 'a>>;
1351pub type GroupBootstrapStreamFuture<'a> =
1352 Pin<Box<dyn Future<Output = Result<BootstrapStreamResponse, GroupEngineError>> + Send + 'a>>;
1353pub type GroupTouchStreamAccessFuture<'a> =
1354 Pin<Box<dyn Future<Output = Result<TouchStreamAccessResponse, GroupEngineError>> + Send + 'a>>;
1355pub type GroupCloseStreamFuture<'a> =
1356 Pin<Box<dyn Future<Output = Result<CloseStreamResponse, GroupEngineError>> + Send + 'a>>;
1357pub type GroupDeleteStreamFuture<'a> =
1358 Pin<Box<dyn Future<Output = Result<DeleteStreamResponse, GroupEngineError>> + Send + 'a>>;
1359pub type GroupForkRefFuture<'a> =
1360 Pin<Box<dyn Future<Output = Result<ForkRefResponse, GroupEngineError>> + Send + 'a>>;
1361pub type GroupSnapshotFuture<'a> =
1362 Pin<Box<dyn Future<Output = Result<GroupSnapshot, GroupEngineError>> + Send + 'a>>;
1363pub type GroupInstallSnapshotFuture<'a> =
1364 Pin<Box<dyn Future<Output = Result<(), GroupEngineError>> + Send + 'a>>;
1365pub type GroupWriteBatchFuture<'a> = Pin<
1366 Box<
1367 dyn Future<
1368 Output = Result<
1369 Vec<Result<GroupWriteResponse, GroupEngineError>>,
1370 GroupEngineError,
1371 >,
1372 > + Send
1373 + 'a,
1374 >,
1375>;
1376pub type GroupEngineCreateFuture<'a> =
1377 Pin<Box<dyn Future<Output = Result<Box<dyn GroupEngine>, GroupEngineError>> + Send + 'a>>;
1378
/// Result of a batched append against one raft group.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GroupAppendBatchResponse {
    /// Placement the batch was executed against.
    pub placement: ShardPlacement,
    /// One entry per appended item; each item succeeds or fails on its own.
    pub items: Vec<Result<AppendResponse, GroupEngineError>>,
}
1384
/// Successful result of executing a `GroupWriteCommand`.
///
/// There is no dedicated variant for the external-payload commands: the
/// default `GroupEngine::write_batch` reports `CreateExternal` through
/// `CreateStream` and `AppendExternal` through `Append`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum GroupWriteResponse {
    CreateStream(CreateStreamResponse),
    Append(AppendResponse),
    AppendBatch(GroupAppendBatchResponse),
    PublishSnapshot(PublishSnapshotResponse),
    TouchStreamAccess(TouchStreamAccessResponse),
    AddForkRef(ForkRefResponse),
    ReleaseForkRef(ForkRefResponse),
    FlushCold(FlushColdResponse),
    CloseStream(CloseStreamResponse),
    DeleteStream(DeleteStreamResponse),
    /// Per-command results of a nested `Batch` command.
    Batch(Vec<Result<GroupWriteResponse, GroupEngineError>>),
}
1399
1400pub trait GroupEngine: Send + 'static {
1401 fn accepts_local_writes(&self) -> bool {
1402 true
1403 }
1404
1405 fn create_stream<'a>(
1406 &'a mut self,
1407 request: CreateStreamRequest,
1408 placement: ShardPlacement,
1409 ) -> GroupCreateStreamFuture<'a>;
1410
1411 fn create_stream_external<'a>(
1412 &'a mut self,
1413 request: CreateStreamExternalRequest,
1414 _placement: ShardPlacement,
1415 ) -> GroupCreateStreamFuture<'a> {
1416 Box::pin(async move {
1417 Err(GroupEngineError::new(format!(
1418 "external stream create is not supported for stream '{}'",
1419 request.stream_id
1420 )))
1421 })
1422 }
1423
1424 fn head_stream<'a>(
1425 &'a mut self,
1426 request: HeadStreamRequest,
1427 placement: ShardPlacement,
1428 ) -> GroupHeadStreamFuture<'a>;
1429
1430 fn read_stream<'a>(
1431 &'a mut self,
1432 request: ReadStreamRequest,
1433 placement: ShardPlacement,
1434 ) -> GroupReadStreamFuture<'a>;
1435
1436 fn read_stream_parts<'a>(
1437 &'a mut self,
1438 request: ReadStreamRequest,
1439 placement: ShardPlacement,
1440 ) -> GroupReadStreamPartsFuture<'a> {
1441 Box::pin(async move {
1442 let response = self.read_stream(request, placement).await?;
1443 Ok(GroupReadStreamParts::from_response(response))
1444 })
1445 }
1446
1447 fn require_local_live_read_owner<'a>(
1448 &'a mut self,
1449 _placement: ShardPlacement,
1450 ) -> GroupRequireLiveReadOwnerFuture<'a> {
1451 Box::pin(async { Ok(()) })
1452 }
1453
1454 fn publish_snapshot<'a>(
1455 &'a mut self,
1456 request: PublishSnapshotRequest,
1457 _placement: ShardPlacement,
1458 ) -> GroupPublishSnapshotFuture<'a> {
1459 Box::pin(async move {
1460 Err(GroupEngineError::new(format!(
1461 "snapshot publish is not supported for stream '{}'",
1462 request.stream_id
1463 )))
1464 })
1465 }
1466
1467 fn read_snapshot<'a>(
1468 &'a mut self,
1469 request: ReadSnapshotRequest,
1470 _placement: ShardPlacement,
1471 ) -> GroupReadSnapshotFuture<'a> {
1472 Box::pin(async move {
1473 Err(GroupEngineError::new(format!(
1474 "snapshot read is not supported for stream '{}'",
1475 request.stream_id
1476 )))
1477 })
1478 }
1479
1480 fn delete_snapshot<'a>(
1481 &'a mut self,
1482 request: DeleteSnapshotRequest,
1483 _placement: ShardPlacement,
1484 ) -> GroupDeleteSnapshotFuture<'a> {
1485 Box::pin(async move {
1486 Err(GroupEngineError::new(format!(
1487 "snapshot delete is not supported for stream '{}'",
1488 request.stream_id
1489 )))
1490 })
1491 }
1492
1493 fn bootstrap_stream<'a>(
1494 &'a mut self,
1495 request: BootstrapStreamRequest,
1496 _placement: ShardPlacement,
1497 ) -> GroupBootstrapStreamFuture<'a> {
1498 Box::pin(async move {
1499 Err(GroupEngineError::new(format!(
1500 "bootstrap is not supported for stream '{}'",
1501 request.stream_id
1502 )))
1503 })
1504 }
1505
1506 fn touch_stream_access<'a>(
1507 &'a mut self,
1508 stream_id: BucketStreamId,
1509 now_ms: u64,
1510 renew_ttl: bool,
1511 placement: ShardPlacement,
1512 ) -> GroupTouchStreamAccessFuture<'a>;
1513
1514 fn add_fork_ref<'a>(
1515 &'a mut self,
1516 stream_id: BucketStreamId,
1517 now_ms: u64,
1518 placement: ShardPlacement,
1519 ) -> GroupForkRefFuture<'a>;
1520
1521 fn release_fork_ref<'a>(
1522 &'a mut self,
1523 stream_id: BucketStreamId,
1524 placement: ShardPlacement,
1525 ) -> GroupForkRefFuture<'a>;
1526
1527 fn close_stream<'a>(
1528 &'a mut self,
1529 request: CloseStreamRequest,
1530 placement: ShardPlacement,
1531 ) -> GroupCloseStreamFuture<'a>;
1532
1533 fn delete_stream<'a>(
1534 &'a mut self,
1535 request: DeleteStreamRequest,
1536 placement: ShardPlacement,
1537 ) -> GroupDeleteStreamFuture<'a>;
1538
1539 fn append<'a>(
1540 &'a mut self,
1541 request: AppendRequest,
1542 placement: ShardPlacement,
1543 ) -> GroupAppendFuture<'a>;
1544
1545 fn append_external<'a>(
1546 &'a mut self,
1547 request: AppendExternalRequest,
1548 _placement: ShardPlacement,
1549 ) -> GroupAppendFuture<'a> {
1550 Box::pin(async move {
1551 Err(GroupEngineError::new(format!(
1552 "external append is not supported for stream '{}'",
1553 request.stream_id
1554 )))
1555 })
1556 }
1557
1558 fn append_batch<'a>(
1559 &'a mut self,
1560 request: AppendBatchRequest,
1561 placement: ShardPlacement,
1562 ) -> GroupAppendBatchFuture<'a>;
1563
1564 fn create_stream_with_cold_admission<'a>(
1565 &'a mut self,
1566 request: CreateStreamRequest,
1567 placement: ShardPlacement,
1568 _admission: ColdWriteAdmission,
1569 ) -> GroupCreateStreamFuture<'a> {
1570 self.create_stream(request, placement)
1571 }
1572
1573 fn append_with_cold_admission<'a>(
1574 &'a mut self,
1575 request: AppendRequest,
1576 placement: ShardPlacement,
1577 _admission: ColdWriteAdmission,
1578 ) -> GroupAppendFuture<'a> {
1579 self.append(request, placement)
1580 }
1581
1582 fn append_batch_with_cold_admission<'a>(
1583 &'a mut self,
1584 request: AppendBatchRequest,
1585 placement: ShardPlacement,
1586 _admission: ColdWriteAdmission,
1587 ) -> GroupAppendBatchFuture<'a> {
1588 self.append_batch(request, placement)
1589 }
1590
1591 fn append_batch_many_with_cold_admission<'a>(
1592 &'a mut self,
1593 requests: Vec<AppendBatchRequest>,
1594 placement: ShardPlacement,
1595 admission: ColdWriteAdmission,
1596 ) -> GroupWriteBatchFuture<'a> {
1597 Box::pin(async move {
1598 let mut responses = Vec::with_capacity(requests.len());
1599 for request in requests {
1600 let response = self
1601 .append_batch_with_cold_admission(request, placement, admission)
1602 .await
1603 .map(GroupWriteResponse::AppendBatch);
1604 responses.push(response);
1605 }
1606 Ok(responses)
1607 })
1608 }
1609
1610 fn flush_cold<'a>(
1611 &'a mut self,
1612 request: FlushColdRequest,
1613 _placement: ShardPlacement,
1614 ) -> GroupFlushColdFuture<'a> {
1615 Box::pin(async move {
1616 Err(GroupEngineError::new(format!(
1617 "cold flush is not supported for stream '{}'",
1618 request.stream_id
1619 )))
1620 })
1621 }
1622
1623 fn plan_cold_flush<'a>(
1624 &'a mut self,
1625 request: PlanColdFlushRequest,
1626 _placement: ShardPlacement,
1627 ) -> GroupPlanColdFlushFuture<'a> {
1628 Box::pin(async move {
1629 Err(GroupEngineError::new(format!(
1630 "cold flush planning is not supported for stream '{}'",
1631 request.stream_id
1632 )))
1633 })
1634 }
1635
1636 fn plan_next_cold_flush<'a>(
1637 &'a mut self,
1638 _request: PlanGroupColdFlushRequest,
1639 _placement: ShardPlacement,
1640 ) -> GroupPlanNextColdFlushFuture<'a> {
1641 Box::pin(async move {
1642 Err(GroupEngineError::new(
1643 "group cold flush planning is not supported",
1644 ))
1645 })
1646 }
1647
1648 fn plan_next_cold_flush_batch<'a>(
1649 &'a mut self,
1650 request: PlanGroupColdFlushRequest,
1651 placement: ShardPlacement,
1652 max_candidates: usize,
1653 ) -> GroupPlanNextColdFlushBatchFuture<'a> {
1654 Box::pin(async move {
1655 match self.plan_next_cold_flush(request, placement).await? {
1656 Some(candidate) if max_candidates > 0 => Ok(vec![candidate]),
1657 _ => Ok(Vec::new()),
1658 }
1659 })
1660 }
1661
1662 fn cold_hot_backlog<'a>(
1663 &'a mut self,
1664 stream_id: BucketStreamId,
1665 _placement: ShardPlacement,
1666 ) -> GroupColdHotBacklogFuture<'a> {
1667 Box::pin(async move {
1668 Err(GroupEngineError::new(format!(
1669 "cold hot backlog is not supported for stream '{stream_id}'"
1670 )))
1671 })
1672 }
1673
1674 fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a>;
1675
1676 fn install_snapshot<'a>(
1677 &'a mut self,
1678 snapshot: GroupSnapshot,
1679 ) -> GroupInstallSnapshotFuture<'a>;
1680
1681 fn write_batch<'a>(
1682 &'a mut self,
1683 commands: Vec<GroupWriteCommand>,
1684 placement: ShardPlacement,
1685 ) -> GroupWriteBatchFuture<'a> {
1686 Box::pin(async move {
1687 let mut responses = Vec::with_capacity(commands.len());
1688 for command in commands {
1689 let response = match command {
1690 GroupWriteCommand::CreateStream {
1691 stream_id,
1692 content_type,
1693 initial_payload,
1694 close_after,
1695 stream_seq,
1696 producer,
1697 stream_ttl_seconds,
1698 stream_expires_at_ms,
1699 forked_from,
1700 fork_offset,
1701 now_ms,
1702 } => self
1703 .create_stream(
1704 CreateStreamRequest {
1705 stream_id,
1706 content_type,
1707 content_type_explicit: true,
1708 initial_payload,
1709 close_after,
1710 stream_seq,
1711 producer,
1712 stream_ttl_seconds,
1713 stream_expires_at_ms,
1714 forked_from,
1715 fork_offset,
1716 now_ms,
1717 },
1718 placement,
1719 )
1720 .await
1721 .map(GroupWriteResponse::CreateStream),
1722 GroupWriteCommand::CreateExternal {
1723 stream_id,
1724 content_type,
1725 initial_payload,
1726 close_after,
1727 stream_seq,
1728 producer,
1729 stream_ttl_seconds,
1730 stream_expires_at_ms,
1731 forked_from,
1732 fork_offset,
1733 now_ms,
1734 } => self
1735 .create_stream_external(
1736 CreateStreamExternalRequest {
1737 stream_id,
1738 content_type,
1739 initial_payload,
1740 close_after,
1741 stream_seq,
1742 producer,
1743 stream_ttl_seconds,
1744 stream_expires_at_ms,
1745 forked_from,
1746 fork_offset,
1747 now_ms,
1748 },
1749 placement,
1750 )
1751 .await
1752 .map(GroupWriteResponse::CreateStream),
1753 GroupWriteCommand::Append {
1754 stream_id,
1755 content_type,
1756 payload,
1757 close_after,
1758 stream_seq,
1759 producer,
1760 now_ms,
1761 } => self
1762 .append(
1763 AppendRequest {
1764 stream_id,
1765 content_type,
1766 payload,
1767 close_after,
1768 stream_seq,
1769 producer,
1770 now_ms,
1771 },
1772 placement,
1773 )
1774 .await
1775 .map(GroupWriteResponse::Append),
1776 GroupWriteCommand::AppendExternal {
1777 stream_id,
1778 content_type,
1779 payload,
1780 close_after,
1781 stream_seq,
1782 producer,
1783 now_ms,
1784 } => self
1785 .append_external(
1786 AppendExternalRequest {
1787 stream_id,
1788 content_type,
1789 payload,
1790 close_after,
1791 stream_seq,
1792 producer,
1793 now_ms,
1794 },
1795 placement,
1796 )
1797 .await
1798 .map(GroupWriteResponse::Append),
1799 GroupWriteCommand::AppendBatch {
1800 stream_id,
1801 content_type,
1802 payloads,
1803 producer,
1804 now_ms,
1805 } => self
1806 .append_batch(
1807 AppendBatchRequest {
1808 stream_id,
1809 content_type,
1810 payloads,
1811 producer,
1812 now_ms,
1813 },
1814 placement,
1815 )
1816 .await
1817 .map(GroupWriteResponse::AppendBatch),
1818 GroupWriteCommand::PublishSnapshot {
1819 stream_id,
1820 snapshot_offset,
1821 content_type,
1822 payload,
1823 now_ms,
1824 } => self
1825 .publish_snapshot(
1826 PublishSnapshotRequest {
1827 stream_id,
1828 snapshot_offset,
1829 content_type,
1830 payload,
1831 now_ms,
1832 },
1833 placement,
1834 )
1835 .await
1836 .map(GroupWriteResponse::PublishSnapshot),
1837 GroupWriteCommand::TouchStreamAccess {
1838 stream_id,
1839 now_ms,
1840 renew_ttl,
1841 } => self
1842 .touch_stream_access(stream_id, now_ms, renew_ttl, placement)
1843 .await
1844 .map(GroupWriteResponse::TouchStreamAccess),
1845 GroupWriteCommand::AddForkRef { stream_id, now_ms } => self
1846 .add_fork_ref(stream_id, now_ms, placement)
1847 .await
1848 .map(GroupWriteResponse::AddForkRef),
1849 GroupWriteCommand::ReleaseForkRef { stream_id } => self
1850 .release_fork_ref(stream_id, placement)
1851 .await
1852 .map(GroupWriteResponse::ReleaseForkRef),
1853 GroupWriteCommand::FlushCold { stream_id, chunk } => self
1854 .flush_cold(FlushColdRequest { stream_id, chunk }, placement)
1855 .await
1856 .map(GroupWriteResponse::FlushCold),
1857 GroupWriteCommand::CloseStream {
1858 stream_id,
1859 stream_seq,
1860 producer,
1861 now_ms,
1862 } => self
1863 .close_stream(
1864 CloseStreamRequest {
1865 stream_id,
1866 stream_seq,
1867 producer,
1868 now_ms,
1869 },
1870 placement,
1871 )
1872 .await
1873 .map(GroupWriteResponse::CloseStream),
1874 GroupWriteCommand::DeleteStream { stream_id } => self
1875 .delete_stream(DeleteStreamRequest { stream_id }, placement)
1876 .await
1877 .map(GroupWriteResponse::DeleteStream),
1878 GroupWriteCommand::Batch { commands } => self
1879 .write_batch(commands, placement)
1880 .await
1881 .map(GroupWriteResponse::Batch),
1882 };
1883 responses.push(response);
1884 }
1885 Ok(responses)
1886 })
1887 }
1888}
1889
/// Factory that asynchronously produces a boxed [`GroupEngine`].
pub trait GroupEngineFactory: Send + Sync + 'static {
    /// Creates the engine for `placement`.
    ///
    /// `metrics` is handed to the factory alongside the placement; presumably
    /// the created engine uses it to record samples (confirm against
    /// implementors). The returned future borrows the factory for `'a`.
    fn create<'a>(
        &'a self,
        placement: ShardPlacement,
        metrics: GroupEngineMetrics,
    ) -> GroupEngineCreateFuture<'a>;
}
1897
/// Cloneable handle through which group engines report metrics samples.
///
/// All clones share the same `RuntimeMetricsInner` via an `Arc`.
#[derive(Debug, Clone)]
pub struct GroupEngineMetrics {
    // Shared sink that every `record_*` method forwards to.
    inner: Arc<RuntimeMetricsInner>,
}
1902
1903impl GroupEngineMetrics {
1904 pub fn record_wal_batch(
1905 &self,
1906 placement: ShardPlacement,
1907 record_count: usize,
1908 write_ns: u64,
1909 sync_ns: u64,
1910 ) {
1911 self.inner.record_wal_batch(
1912 placement.core_id,
1913 placement.raft_group_id,
1914 u64::try_from(record_count).expect("record count fits u64"),
1915 write_ns,
1916 sync_ns,
1917 );
1918 }
1919
1920 pub fn record_raft_write_many(
1921 &self,
1922 placement: ShardPlacement,
1923 command_count: usize,
1924 logical_command_count: usize,
1925 response_count: usize,
1926 submit_ns: u64,
1927 response_ns: u64,
1928 ) {
1929 self.inner.record_raft_write_many(
1930 placement.core_id,
1931 placement.raft_group_id,
1932 RaftWriteManySample {
1933 command_count: u64::try_from(command_count).expect("command count fits u64"),
1934 logical_command_count: u64::try_from(logical_command_count)
1935 .expect("logical command count fits u64"),
1936 response_count: u64::try_from(response_count).expect("response count fits u64"),
1937 submit_ns,
1938 response_ns,
1939 },
1940 );
1941 }
1942
1943 pub fn record_raft_apply_batch(
1944 &self,
1945 placement: ShardPlacement,
1946 entry_count: usize,
1947 apply_ns: u64,
1948 ) {
1949 self.inner.record_raft_apply_batch(
1950 placement.core_id,
1951 placement.raft_group_id,
1952 u64::try_from(entry_count).expect("entry count fits u64"),
1953 apply_ns,
1954 );
1955 }
1956}
1957
/// Hint about which node to forward a request to, as attached to a
/// `GroupEngineError` by `GroupEngineError::forward_to_leader`.
///
/// Either field may be absent.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GroupLeaderHint {
    /// Node id of the leader, when available.
    pub node_id: Option<u64>,
    /// Address of the leader, when available.
    pub address: Option<String>,
}
1963
/// Error returned by group engine operations.
///
/// The fields are private; build values through the constructors and read
/// them through the accessor methods.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GroupEngineError {
    // Text shown by `Display`. For stream errors it is prefixed with the
    // `Debug` form of the code (see `stream_with_next_offset`).
    message: String,
    // Stream-level error code; `None` for plain and forward-to-leader errors.
    code: Option<StreamErrorCode>,
    // Optional offset supplied together with a stream error.
    next_offset: Option<u64>,
    // Optional leader hint. It is left out of the serialized form when `None`
    // and defaults to `None` when missing on deserialization.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    leader_hint: Option<GroupLeaderHint>,
}
1972
1973impl GroupEngineError {
1974 pub fn new(message: impl Into<String>) -> Self {
1975 Self {
1976 message: message.into(),
1977 code: None,
1978 next_offset: None,
1979 leader_hint: None,
1980 }
1981 }
1982
1983 pub fn stream(code: StreamErrorCode, message: impl Into<String>) -> Self {
1984 Self::stream_with_next_offset(code, message, None)
1985 }
1986
1987 pub fn stream_with_next_offset(
1988 code: StreamErrorCode,
1989 message: impl Into<String>,
1990 next_offset: Option<u64>,
1991 ) -> Self {
1992 Self {
1993 message: format!("{code:?}: {}", message.into()),
1994 code: Some(code),
1995 next_offset,
1996 leader_hint: None,
1997 }
1998 }
1999
2000 pub fn forward_to_leader(
2001 message: impl Into<String>,
2002 node_id: Option<u64>,
2003 address: Option<String>,
2004 ) -> Self {
2005 Self {
2006 message: message.into(),
2007 code: None,
2008 next_offset: None,
2009 leader_hint: Some(GroupLeaderHint { node_id, address }),
2010 }
2011 }
2012
2013 pub fn from_replicated_parts(
2014 message: impl Into<String>,
2015 code: Option<StreamErrorCode>,
2016 next_offset: Option<u64>,
2017 leader_hint: Option<GroupLeaderHint>,
2018 ) -> Self {
2019 Self {
2020 message: message.into(),
2021 code,
2022 next_offset,
2023 leader_hint,
2024 }
2025 }
2026
2027 pub fn message(&self) -> &str {
2028 &self.message
2029 }
2030
2031 pub fn code(&self) -> Option<StreamErrorCode> {
2032 self.code
2033 }
2034
2035 pub fn next_offset(&self) -> Option<u64> {
2036 self.next_offset
2037 }
2038
2039 pub fn leader_hint(&self) -> Option<&GroupLeaderHint> {
2040 self.leader_hint.as_ref()
2041 }
2042}
2043
2044impl std::fmt::Display for GroupEngineError {
2045 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2046 f.write_str(&self.message)
2047 }
2048}
2049
// Empty impl: `GroupEngineError` relies on the `Error` trait's default
// methods, so no underlying `source()` is exposed.
impl std::error::Error for GroupEngineError {}
2051
/// Arguments for a single payload append, as passed to
/// `InMemoryGroupEngine::append_payload`.
///
/// The content type and payload are borrowed for `'a`; the remaining fields
/// are owned.
struct AppendPayloadInput<'a> {
    stream_id: BucketStreamId,
    content_type: Option<&'a str>,
    payload: &'a [u8],
    close_after: bool,
    stream_seq: Option<String>,
    producer: Option<ProducerRequest>,
    now_ms: u64,
}
2061
/// Group engine state held entirely in memory.
///
/// The default value has an empty state machine, zeroed counters and no cold
/// store.
#[derive(Debug, Clone, Default)]
pub struct InMemoryGroupEngine {
    // Counter reported as `group_commit_index` in write responses.
    commit_index: u64,
    // Stream state machine that committed commands are applied to.
    state_machine: StreamStateMachine,
    // Per-stream append counter reported as `stream_append_count`.
    stream_append_counts: HashMap<BucketStreamId, u64>,
    // Optional cold store handle; set via `with_cold_store`.
    cold_store: Option<ColdStoreHandle>,
}
2069
2070impl InMemoryGroupEngine {
2071 pub fn with_cold_store(cold_store: ColdStoreHandle) -> Self {
2072 Self {
2073 cold_store: Some(cold_store),
2074 ..Self::default()
2075 }
2076 }
2077
2078 pub fn cold_store(&self) -> Option<ColdStoreHandle> {
2079 self.cold_store.clone()
2080 }
2081
2082 pub fn apply_committed_write(
2083 &mut self,
2084 command: GroupWriteCommand,
2085 placement: ShardPlacement,
2086 ) -> Result<GroupWriteResponse, GroupEngineError> {
2087 match command {
2088 GroupWriteCommand::CreateStream {
2089 stream_id,
2090 content_type,
2091 initial_payload,
2092 close_after,
2093 stream_seq,
2094 producer,
2095 stream_ttl_seconds,
2096 stream_expires_at_ms,
2097 forked_from,
2098 fork_offset,
2099 now_ms,
2100 } => {
2101 ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2102 let response = self.state_machine.apply(StreamCommand::CreateStream {
2103 stream_id,
2104 content_type,
2105 initial_payload: initial_payload.to_vec(),
2106 close_after,
2107 stream_seq,
2108 producer,
2109 stream_ttl_seconds,
2110 stream_expires_at_ms,
2111 forked_from,
2112 fork_offset,
2113 now_ms,
2114 });
2115 match response {
2116 StreamResponse::Created {
2117 next_offset,
2118 closed,
2119 ..
2120 } => {
2121 self.commit_index += 1;
2122 Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2123 placement,
2124 next_offset,
2125 closed,
2126 already_exists: false,
2127 group_commit_index: self.commit_index,
2128 }))
2129 }
2130 StreamResponse::AlreadyExists {
2131 next_offset,
2132 closed,
2133 ..
2134 } => Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2135 placement,
2136 next_offset,
2137 closed,
2138 already_exists: true,
2139 group_commit_index: self.commit_index,
2140 })),
2141 StreamResponse::Error {
2142 code,
2143 message,
2144 next_offset,
2145 } => Err(GroupEngineError::stream_with_next_offset(
2146 code,
2147 message,
2148 next_offset,
2149 )),
2150 other => Err(GroupEngineError::new(format!(
2151 "unexpected create stream response: {other:?}"
2152 ))),
2153 }
2154 }
2155 GroupWriteCommand::CreateExternal {
2156 stream_id,
2157 content_type,
2158 initial_payload,
2159 close_after,
2160 stream_seq,
2161 producer,
2162 stream_ttl_seconds,
2163 stream_expires_at_ms,
2164 forked_from,
2165 fork_offset,
2166 now_ms,
2167 } => {
2168 ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2169 let response = self.state_machine.apply(StreamCommand::CreateExternal {
2170 stream_id,
2171 content_type,
2172 initial_payload,
2173 close_after,
2174 stream_seq,
2175 producer,
2176 stream_ttl_seconds,
2177 stream_expires_at_ms,
2178 forked_from,
2179 fork_offset,
2180 now_ms,
2181 });
2182 match response {
2183 StreamResponse::Created {
2184 next_offset,
2185 closed,
2186 ..
2187 } => {
2188 self.commit_index += 1;
2189 Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2190 placement,
2191 next_offset,
2192 closed,
2193 already_exists: false,
2194 group_commit_index: self.commit_index,
2195 }))
2196 }
2197 StreamResponse::AlreadyExists {
2198 next_offset,
2199 closed,
2200 ..
2201 } => Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
2202 placement,
2203 next_offset,
2204 closed,
2205 already_exists: true,
2206 group_commit_index: self.commit_index,
2207 })),
2208 StreamResponse::Error {
2209 code,
2210 message,
2211 next_offset,
2212 } => Err(GroupEngineError::stream_with_next_offset(
2213 code,
2214 message,
2215 next_offset,
2216 )),
2217 other => Err(GroupEngineError::new(format!(
2218 "unexpected create external stream response: {other:?}"
2219 ))),
2220 }
2221 }
2222 GroupWriteCommand::Append {
2223 stream_id,
2224 content_type,
2225 payload,
2226 close_after,
2227 stream_seq,
2228 producer,
2229 now_ms,
2230 } => self
2231 .append_payload(
2232 AppendPayloadInput {
2233 stream_id,
2234 content_type: Some(&content_type),
2235 payload: &payload,
2236 close_after,
2237 stream_seq,
2238 producer,
2239 now_ms,
2240 },
2241 placement,
2242 )
2243 .map(GroupWriteResponse::Append),
2244 GroupWriteCommand::AppendExternal {
2245 stream_id,
2246 content_type,
2247 payload,
2248 close_after,
2249 stream_seq,
2250 producer,
2251 now_ms,
2252 } => {
2253 let response = self.state_machine.apply(StreamCommand::AppendExternal {
2254 stream_id: stream_id.clone(),
2255 content_type: Some(content_type),
2256 payload,
2257 close_after,
2258 stream_seq,
2259 producer,
2260 now_ms,
2261 });
2262 match response {
2263 StreamResponse::Appended {
2264 offset,
2265 next_offset,
2266 closed,
2267 deduplicated,
2268 producer,
2269 ..
2270 } => {
2271 let stream_append_count =
2272 self.stream_append_counts.entry(stream_id).or_insert(0);
2273 if !deduplicated {
2274 self.commit_index += 1;
2275 *stream_append_count += 1;
2276 }
2277 Ok(GroupWriteResponse::Append(AppendResponse {
2278 placement,
2279 start_offset: offset,
2280 next_offset,
2281 stream_append_count: *stream_append_count,
2282 group_commit_index: self.commit_index,
2283 closed,
2284 deduplicated,
2285 producer,
2286 }))
2287 }
2288 StreamResponse::Error {
2289 code,
2290 message,
2291 next_offset,
2292 } => Err(GroupEngineError::stream_with_next_offset(
2293 code,
2294 message,
2295 next_offset,
2296 )),
2297 other => Err(GroupEngineError::new(format!(
2298 "unexpected append external response: {other:?}"
2299 ))),
2300 }
2301 }
2302 GroupWriteCommand::AppendBatch {
2303 stream_id,
2304 content_type,
2305 payloads,
2306 producer,
2307 now_ms,
2308 } => {
2309 if producer.is_some() {
2310 let payload_refs = payloads.iter().map(Bytes::as_ref).collect::<Vec<_>>();
2311 let batch = self
2312 .state_machine
2313 .append_batch_borrowed(
2314 stream_id.clone(),
2315 Some(&content_type),
2316 &payload_refs,
2317 producer,
2318 now_ms,
2319 )
2320 .map_err(stream_response_error)?;
2321 let old_commit_index = self.commit_index;
2322 let old_append_count = *self.stream_append_counts.get(&stream_id).unwrap_or(&0);
2323 if !batch.deduplicated {
2324 let count = u64::try_from(batch.items.len()).expect("item count fits u64");
2325 self.commit_index += count;
2326 *self.stream_append_counts.entry(stream_id).or_insert(0) += count;
2327 }
2328 let items = batch
2329 .items
2330 .into_iter()
2331 .enumerate()
2332 .map(|(index, item)| {
2333 let item_index = u64::try_from(index + 1).expect("item index fits u64");
2334 Ok(AppendResponse {
2335 placement,
2336 start_offset: item.offset,
2337 next_offset: item.next_offset,
2338 stream_append_count: if item.deduplicated {
2339 old_append_count
2340 } else {
2341 old_append_count + item_index
2342 },
2343 group_commit_index: if item.deduplicated {
2344 old_commit_index
2345 } else {
2346 old_commit_index + item_index
2347 },
2348 closed: item.closed,
2349 deduplicated: item.deduplicated,
2350 producer: None,
2351 })
2352 })
2353 .collect();
2354 return Ok(GroupWriteResponse::AppendBatch(GroupAppendBatchResponse {
2355 placement,
2356 items,
2357 }));
2358 }
2359
2360 let mut items = Vec::with_capacity(payloads.len());
2361 for payload in payloads {
2362 if payload.is_empty() {
2363 items.push(Err(GroupEngineError::stream(
2364 StreamErrorCode::EmptyAppend,
2365 "append payload must be non-empty",
2366 )));
2367 continue;
2368 }
2369 items.push(self.append_payload(
2370 AppendPayloadInput {
2371 stream_id: stream_id.clone(),
2372 content_type: Some(&content_type),
2373 payload: &payload,
2374 close_after: false,
2375 stream_seq: None,
2376 producer: None,
2377 now_ms,
2378 },
2379 placement,
2380 ));
2381 }
2382 Ok(GroupWriteResponse::AppendBatch(GroupAppendBatchResponse {
2383 placement,
2384 items,
2385 }))
2386 }
2387 GroupWriteCommand::PublishSnapshot {
2388 stream_id,
2389 snapshot_offset,
2390 content_type,
2391 payload,
2392 now_ms,
2393 } => {
2394 let response = self.state_machine.apply(StreamCommand::PublishSnapshot {
2395 stream_id,
2396 snapshot_offset,
2397 content_type,
2398 payload: payload.to_vec(),
2399 now_ms,
2400 });
2401 match response {
2402 StreamResponse::SnapshotPublished { snapshot_offset } => {
2403 self.commit_index += 1;
2404 Ok(GroupWriteResponse::PublishSnapshot(
2405 PublishSnapshotResponse {
2406 placement,
2407 snapshot_offset,
2408 group_commit_index: self.commit_index,
2409 },
2410 ))
2411 }
2412 StreamResponse::Error {
2413 code,
2414 message,
2415 next_offset,
2416 } => Err(GroupEngineError::stream_with_next_offset(
2417 code,
2418 message,
2419 next_offset,
2420 )),
2421 other => Err(GroupEngineError::new(format!(
2422 "unexpected publish snapshot response: {other:?}"
2423 ))),
2424 }
2425 }
2426 GroupWriteCommand::TouchStreamAccess {
2427 stream_id,
2428 now_ms,
2429 renew_ttl,
2430 } => {
2431 let response = self.state_machine.apply(StreamCommand::TouchStreamAccess {
2432 stream_id,
2433 now_ms,
2434 renew_ttl,
2435 });
2436 match response {
2437 StreamResponse::Accessed { changed, expired } => {
2438 if changed || expired {
2439 self.commit_index += 1;
2440 }
2441 Ok(GroupWriteResponse::TouchStreamAccess(
2442 TouchStreamAccessResponse {
2443 placement,
2444 changed,
2445 expired,
2446 group_commit_index: self.commit_index,
2447 },
2448 ))
2449 }
2450 StreamResponse::Error {
2451 code,
2452 message,
2453 next_offset,
2454 } => Err(GroupEngineError::stream_with_next_offset(
2455 code,
2456 message,
2457 next_offset,
2458 )),
2459 other => Err(GroupEngineError::new(format!(
2460 "unexpected touch stream access response: {other:?}"
2461 ))),
2462 }
2463 }
2464 GroupWriteCommand::AddForkRef { stream_id, now_ms } => {
2465 let response = self
2466 .state_machine
2467 .apply(StreamCommand::AddForkRef { stream_id, now_ms });
2468 match response {
2469 StreamResponse::ForkRefAdded { fork_ref_count } => {
2470 self.commit_index += 1;
2471 Ok(GroupWriteResponse::AddForkRef(ForkRefResponse {
2472 placement,
2473 fork_ref_count,
2474 hard_deleted: false,
2475 parent_to_release: None,
2476 group_commit_index: self.commit_index,
2477 }))
2478 }
2479 StreamResponse::Error {
2480 code,
2481 message,
2482 next_offset,
2483 } => Err(GroupEngineError::stream_with_next_offset(
2484 code,
2485 message,
2486 next_offset,
2487 )),
2488 other => Err(GroupEngineError::new(format!(
2489 "unexpected add fork ref response: {other:?}"
2490 ))),
2491 }
2492 }
2493 GroupWriteCommand::ReleaseForkRef { stream_id } => {
2494 let response = self
2495 .state_machine
2496 .apply(StreamCommand::ReleaseForkRef { stream_id });
2497 match response {
2498 StreamResponse::ForkRefReleased {
2499 hard_deleted,
2500 fork_ref_count,
2501 parent_to_release,
2502 } => {
2503 self.commit_index += 1;
2504 Ok(GroupWriteResponse::ReleaseForkRef(ForkRefResponse {
2505 placement,
2506 fork_ref_count,
2507 hard_deleted,
2508 parent_to_release,
2509 group_commit_index: self.commit_index,
2510 }))
2511 }
2512 StreamResponse::Error {
2513 code,
2514 message,
2515 next_offset,
2516 } => Err(GroupEngineError::stream_with_next_offset(
2517 code,
2518 message,
2519 next_offset,
2520 )),
2521 other => Err(GroupEngineError::new(format!(
2522 "unexpected release fork ref response: {other:?}"
2523 ))),
2524 }
2525 }
2526 GroupWriteCommand::FlushCold { stream_id, chunk } => {
2527 let response = self
2528 .state_machine
2529 .apply(StreamCommand::FlushCold { stream_id, chunk });
2530 match response {
2531 StreamResponse::ColdFlushed { hot_start_offset } => {
2532 self.commit_index += 1;
2533 Ok(GroupWriteResponse::FlushCold(FlushColdResponse {
2534 placement,
2535 hot_start_offset,
2536 group_commit_index: self.commit_index,
2537 }))
2538 }
2539 StreamResponse::Error {
2540 code,
2541 message,
2542 next_offset,
2543 } => Err(GroupEngineError::stream_with_next_offset(
2544 code,
2545 message,
2546 next_offset,
2547 )),
2548 other => Err(GroupEngineError::new(format!(
2549 "unexpected flush cold response: {other:?}"
2550 ))),
2551 }
2552 }
2553 GroupWriteCommand::CloseStream {
2554 stream_id,
2555 stream_seq,
2556 producer,
2557 now_ms,
2558 } => {
2559 let response = self.state_machine.apply(StreamCommand::Close {
2560 stream_id,
2561 stream_seq,
2562 producer,
2563 now_ms,
2564 });
2565 match response {
2566 StreamResponse::Closed {
2567 next_offset,
2568 deduplicated,
2569 ..
2570 } => {
2571 if !deduplicated {
2572 self.commit_index += 1;
2573 }
2574 Ok(GroupWriteResponse::CloseStream(CloseStreamResponse {
2575 placement,
2576 next_offset,
2577 group_commit_index: self.commit_index,
2578 deduplicated,
2579 }))
2580 }
2581 StreamResponse::Error {
2582 code,
2583 message,
2584 next_offset,
2585 } => Err(GroupEngineError::stream_with_next_offset(
2586 code,
2587 message,
2588 next_offset,
2589 )),
2590 other => Err(GroupEngineError::new(format!(
2591 "unexpected close stream response: {other:?}"
2592 ))),
2593 }
2594 }
2595 GroupWriteCommand::DeleteStream { stream_id } => {
2596 let response = self
2597 .state_machine
2598 .apply(StreamCommand::DeleteStream { stream_id });
2599 match response {
2600 StreamResponse::Deleted {
2601 hard_deleted,
2602 parent_to_release,
2603 } => {
2604 self.commit_index += 1;
2605 Ok(GroupWriteResponse::DeleteStream(DeleteStreamResponse {
2606 placement,
2607 group_commit_index: self.commit_index,
2608 hard_deleted,
2609 parent_to_release,
2610 }))
2611 }
2612 StreamResponse::Error {
2613 code,
2614 message,
2615 next_offset,
2616 } => Err(GroupEngineError::stream_with_next_offset(
2617 code,
2618 message,
2619 next_offset,
2620 )),
2621 other => Err(GroupEngineError::new(format!(
2622 "unexpected delete stream response: {other:?}"
2623 ))),
2624 }
2625 }
2626 GroupWriteCommand::Batch { commands } => Ok(GroupWriteResponse::Batch(
2627 self.apply_committed_write_batch(commands, placement),
2628 )),
2629 }
2630 }
2631
2632 fn cold_hot_backlog_for(
2633 &self,
2634 stream_id: BucketStreamId,
2635 ) -> Result<ColdHotBacklog, GroupEngineError> {
2636 let stream_hot_bytes = self.state_machine.hot_payload_len(&stream_id).unwrap_or(0);
2637 Ok(ColdHotBacklog {
2638 stream_id,
2639 stream_hot_bytes,
2640 group_hot_bytes: self.state_machine.total_hot_payload_bytes(),
2641 })
2642 }
2643
2644 fn enforce_cold_write_admission(
2645 &self,
2646 stream_id: &BucketStreamId,
2647 admission: ColdWriteAdmission,
2648 before_group_hot_bytes: u64,
2649 after_group_hot_bytes: u64,
2650 mutating: bool,
2651 ) -> Result<(), GroupEngineError> {
2652 let Some(limit) = admission.max_hot_bytes_per_group else {
2653 return Ok(());
2654 };
2655 if !mutating || after_group_hot_bytes <= limit {
2656 return Ok(());
2657 }
2658 Err(GroupEngineError::new(format!(
2659 "ColdBackpressure: stream '{stream_id}' would raise group hot bytes from {before_group_hot_bytes} to {after_group_hot_bytes}, above limit {limit}"
2660 )))
2661 }
2662
2663 fn create_stream_with_admission_inner(
2664 &mut self,
2665 request: CreateStreamRequest,
2666 placement: ShardPlacement,
2667 admission: ColdWriteAdmission,
2668 ) -> Result<CreateStreamResponse, GroupEngineError> {
2669 let stream_id = request.stream_id.clone();
2670 let command = GroupWriteCommand::from(request);
2671 let before = self.state_machine.total_hot_payload_bytes();
2672 let mut preview = self.clone();
2673 let response = match preview.apply_committed_write(command, placement)? {
2674 GroupWriteResponse::CreateStream(response) => response,
2675 other => {
2676 return Err(GroupEngineError::new(format!(
2677 "unexpected create stream write response: {other:?}"
2678 )));
2679 }
2680 };
2681 preview.enforce_cold_write_admission(
2682 &stream_id,
2683 admission,
2684 before,
2685 preview.state_machine.total_hot_payload_bytes(),
2686 !response.already_exists,
2687 )?;
2688 *self = preview;
2689 Ok(response)
2690 }
2691
2692 fn append_with_admission_inner(
2693 &mut self,
2694 request: AppendRequest,
2695 placement: ShardPlacement,
2696 admission: ColdWriteAdmission,
2697 ) -> Result<AppendResponse, GroupEngineError> {
2698 let stream_id = request.stream_id.clone();
2699 let command = GroupWriteCommand::from(request);
2700 let before = self.state_machine.total_hot_payload_bytes();
2701 let mut preview = self.clone();
2702 let response = match preview.apply_committed_write(command, placement)? {
2703 GroupWriteResponse::Append(response) => response,
2704 other => {
2705 return Err(GroupEngineError::new(format!(
2706 "unexpected append write response: {other:?}"
2707 )));
2708 }
2709 };
2710 preview.enforce_cold_write_admission(
2711 &stream_id,
2712 admission,
2713 before,
2714 preview.state_machine.total_hot_payload_bytes(),
2715 !response.deduplicated,
2716 )?;
2717 *self = preview;
2718 Ok(response)
2719 }
2720
2721 fn append_batch_with_admission_inner(
2722 &mut self,
2723 request: AppendBatchRequest,
2724 placement: ShardPlacement,
2725 admission: ColdWriteAdmission,
2726 ) -> Result<GroupAppendBatchResponse, GroupEngineError> {
2727 let stream_id = request.stream_id.clone();
2728 let command = GroupWriteCommand::from(request);
2729 let before = self.state_machine.total_hot_payload_bytes();
2730 let mut preview = self.clone();
2731 let response = match preview.apply_committed_write(command, placement)? {
2732 GroupWriteResponse::AppendBatch(response) => response,
2733 other => {
2734 return Err(GroupEngineError::new(format!(
2735 "unexpected append batch write response: {other:?}"
2736 )));
2737 }
2738 };
2739 let mutating = response
2740 .items
2741 .iter()
2742 .any(|item| matches!(item, Ok(response) if !response.deduplicated));
2743 preview.enforce_cold_write_admission(
2744 &stream_id,
2745 admission,
2746 before,
2747 preview.state_machine.total_hot_payload_bytes(),
2748 mutating,
2749 )?;
2750 *self = preview;
2751 Ok(response)
2752 }
2753
2754 pub fn access_requires_write(
2755 &self,
2756 stream_id: &BucketStreamId,
2757 now_ms: u64,
2758 renew_ttl: bool,
2759 ) -> Result<bool, GroupEngineError> {
2760 self.state_machine
2761 .access_requires_write(stream_id, now_ms, renew_ttl)
2762 .map_err(stream_response_error)
2763 }
2764
2765 fn apply_access_command(
2766 &mut self,
2767 stream_id: BucketStreamId,
2768 now_ms: u64,
2769 renew_ttl: bool,
2770 placement: ShardPlacement,
2771 ) -> Result<TouchStreamAccessResponse, GroupEngineError> {
2772 match self.apply_committed_write(
2773 GroupWriteCommand::TouchStreamAccess {
2774 stream_id,
2775 now_ms,
2776 renew_ttl,
2777 },
2778 placement,
2779 )? {
2780 GroupWriteResponse::TouchStreamAccess(response) => Ok(response),
2781 other => Err(GroupEngineError::new(format!(
2782 "unexpected touch stream access write response: {other:?}"
2783 ))),
2784 }
2785 }
2786
2787 fn ensure_stream_access(
2788 &mut self,
2789 stream_id: &BucketStreamId,
2790 now_ms: u64,
2791 renew_ttl: bool,
2792 placement: ShardPlacement,
2793 ) -> Result<Option<TouchStreamAccessResponse>, GroupEngineError> {
2794 if !self.access_requires_write(stream_id, now_ms, renew_ttl)? {
2795 return Ok(None);
2796 }
2797 let response =
2798 self.apply_access_command(stream_id.clone(), now_ms, renew_ttl, placement)?;
2799 if response.expired {
2800 return Err(GroupEngineError::stream(
2801 StreamErrorCode::StreamNotFound,
2802 format!("stream '{stream_id}' does not exist"),
2803 ));
2804 }
2805 Ok(Some(response))
2806 }
2807
2808 pub fn apply_committed_write_batch(
2809 &mut self,
2810 commands: Vec<GroupWriteCommand>,
2811 placement: ShardPlacement,
2812 ) -> Vec<Result<GroupWriteResponse, GroupEngineError>> {
2813 commands
2814 .into_iter()
2815 .map(|command| self.apply_committed_write(command, placement))
2816 .collect()
2817 }
2818
2819 fn apply_replayed_write_command(
2820 &mut self,
2821 command: GroupWriteCommand,
2822 ) -> Result<(), GroupEngineError> {
2823 let placement = ShardPlacement {
2824 core_id: CoreId(0),
2825 shard_id: ShardId(0),
2826 raft_group_id: RaftGroupId(0),
2827 };
2828 self.apply_committed_write(command, placement).map(|_| ())
2829 }
2830
2831 fn apply_replayed_command(&mut self, command: StreamCommand) -> Result<(), GroupEngineError> {
2832 match command {
2833 StreamCommand::CreateBucket { bucket_id } => {
2834 match self
2835 .state_machine
2836 .apply(StreamCommand::CreateBucket { bucket_id })
2837 {
2838 StreamResponse::BucketCreated { .. } => {
2839 self.commit_index += 1;
2840 Ok(())
2841 }
2842 StreamResponse::BucketAlreadyExists { .. } => Ok(()),
2843 StreamResponse::Error {
2844 code,
2845 message,
2846 next_offset,
2847 } => Err(GroupEngineError::stream_with_next_offset(
2848 code,
2849 message,
2850 next_offset,
2851 )),
2852 other => Err(GroupEngineError::new(format!(
2853 "unexpected replay create bucket response: {other:?}"
2854 ))),
2855 }
2856 }
2857 StreamCommand::DeleteBucket { bucket_id } => {
2858 match self
2859 .state_machine
2860 .apply(StreamCommand::DeleteBucket { bucket_id })
2861 {
2862 StreamResponse::BucketDeleted { .. } => {
2863 self.commit_index += 1;
2864 Ok(())
2865 }
2866 StreamResponse::Error {
2867 code,
2868 message,
2869 next_offset,
2870 } => Err(GroupEngineError::stream_with_next_offset(
2871 code,
2872 message,
2873 next_offset,
2874 )),
2875 other => Err(GroupEngineError::new(format!(
2876 "unexpected replay delete bucket response: {other:?}"
2877 ))),
2878 }
2879 }
2880 StreamCommand::CreateStream {
2881 stream_id,
2882 content_type,
2883 initial_payload,
2884 close_after,
2885 stream_seq,
2886 producer,
2887 stream_ttl_seconds,
2888 stream_expires_at_ms,
2889 forked_from,
2890 fork_offset,
2891 now_ms,
2892 } => {
2893 ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2894 let response = self.state_machine.apply(StreamCommand::CreateStream {
2895 stream_id,
2896 content_type,
2897 initial_payload,
2898 close_after,
2899 stream_seq,
2900 producer,
2901 stream_ttl_seconds,
2902 stream_expires_at_ms,
2903 forked_from,
2904 fork_offset,
2905 now_ms,
2906 });
2907 match response {
2908 StreamResponse::Created { .. } => {
2909 self.commit_index += 1;
2910 Ok(())
2911 }
2912 StreamResponse::AlreadyExists { .. } => Ok(()),
2913 StreamResponse::Error {
2914 code,
2915 message,
2916 next_offset,
2917 } => Err(GroupEngineError::stream_with_next_offset(
2918 code,
2919 message,
2920 next_offset,
2921 )),
2922 other => Err(GroupEngineError::new(format!(
2923 "unexpected replay create stream response: {other:?}"
2924 ))),
2925 }
2926 }
2927 StreamCommand::CreateExternal {
2928 stream_id,
2929 content_type,
2930 initial_payload,
2931 close_after,
2932 stream_seq,
2933 producer,
2934 stream_ttl_seconds,
2935 stream_expires_at_ms,
2936 forked_from,
2937 fork_offset,
2938 now_ms,
2939 } => {
2940 ensure_bucket_exists(&mut self.state_machine, &stream_id)?;
2941 let response = self.state_machine.apply(StreamCommand::CreateExternal {
2942 stream_id,
2943 content_type,
2944 initial_payload,
2945 close_after,
2946 stream_seq,
2947 producer,
2948 stream_ttl_seconds,
2949 stream_expires_at_ms,
2950 forked_from,
2951 fork_offset,
2952 now_ms,
2953 });
2954 match response {
2955 StreamResponse::Created { .. } => {
2956 self.commit_index += 1;
2957 Ok(())
2958 }
2959 StreamResponse::AlreadyExists { .. } => Ok(()),
2960 StreamResponse::Error {
2961 code,
2962 message,
2963 next_offset,
2964 } => Err(GroupEngineError::stream_with_next_offset(
2965 code,
2966 message,
2967 next_offset,
2968 )),
2969 other => Err(GroupEngineError::new(format!(
2970 "unexpected replay external create stream response: {other:?}"
2971 ))),
2972 }
2973 }
2974 StreamCommand::Append {
2975 stream_id,
2976 content_type,
2977 payload,
2978 close_after,
2979 stream_seq,
2980 producer,
2981 now_ms,
2982 } => {
2983 let stream_count_key = stream_id.clone();
2984 let response = self.state_machine.apply(StreamCommand::Append {
2985 stream_id,
2986 content_type,
2987 payload,
2988 close_after,
2989 stream_seq,
2990 producer,
2991 now_ms,
2992 });
2993 match response {
2994 StreamResponse::Appended { deduplicated, .. } => {
2995 if !deduplicated {
2996 self.commit_index += 1;
2997 *self
2998 .stream_append_counts
2999 .entry(stream_count_key)
3000 .or_insert(0) += 1;
3001 }
3002 Ok(())
3003 }
3004 StreamResponse::Closed { deduplicated, .. } => {
3005 if !deduplicated {
3006 self.commit_index += 1;
3007 }
3008 Ok(())
3009 }
3010 StreamResponse::Error {
3011 code,
3012 message,
3013 next_offset,
3014 } => Err(GroupEngineError::stream_with_next_offset(
3015 code,
3016 message,
3017 next_offset,
3018 )),
3019 other => Err(GroupEngineError::new(format!(
3020 "unexpected replay append response: {other:?}"
3021 ))),
3022 }
3023 }
3024 StreamCommand::AppendExternal {
3025 stream_id,
3026 content_type,
3027 payload,
3028 close_after,
3029 stream_seq,
3030 producer,
3031 now_ms,
3032 } => {
3033 let stream_count_key = stream_id.clone();
3034 let response = self.state_machine.apply(StreamCommand::AppendExternal {
3035 stream_id,
3036 content_type,
3037 payload,
3038 close_after,
3039 stream_seq,
3040 producer,
3041 now_ms,
3042 });
3043 match response {
3044 StreamResponse::Appended { deduplicated, .. } => {
3045 if !deduplicated {
3046 self.commit_index += 1;
3047 *self
3048 .stream_append_counts
3049 .entry(stream_count_key)
3050 .or_insert(0) += 1;
3051 }
3052 Ok(())
3053 }
3054 StreamResponse::Error {
3055 code,
3056 message,
3057 next_offset,
3058 } => Err(GroupEngineError::stream_with_next_offset(
3059 code,
3060 message,
3061 next_offset,
3062 )),
3063 other => Err(GroupEngineError::new(format!(
3064 "unexpected replay external append response: {other:?}"
3065 ))),
3066 }
3067 }
3068 StreamCommand::AppendBatch {
3069 stream_id,
3070 content_type,
3071 payloads,
3072 producer,
3073 now_ms,
3074 } => {
3075 let stream_count_key = stream_id.clone();
3076 let payload_refs = payloads.iter().map(Vec::as_slice).collect::<Vec<_>>();
3077 let response = self
3078 .state_machine
3079 .append_batch_borrowed(
3080 stream_id,
3081 content_type.as_deref(),
3082 &payload_refs,
3083 producer,
3084 now_ms,
3085 )
3086 .map_err(stream_response_error)?;
3087 if !response.deduplicated {
3088 let count = u64::try_from(response.items.len()).expect("item count fits u64");
3089 self.commit_index += count;
3090 *self
3091 .stream_append_counts
3092 .entry(stream_count_key)
3093 .or_insert(0) += count;
3094 }
3095 Ok(())
3096 }
3097 StreamCommand::PublishSnapshot {
3098 stream_id,
3099 snapshot_offset,
3100 content_type,
3101 payload,
3102 now_ms,
3103 } => {
3104 let response = self.state_machine.apply(StreamCommand::PublishSnapshot {
3105 stream_id,
3106 snapshot_offset,
3107 content_type,
3108 payload,
3109 now_ms,
3110 });
3111 match response {
3112 StreamResponse::SnapshotPublished { .. } => {
3113 self.commit_index += 1;
3114 Ok(())
3115 }
3116 StreamResponse::Error {
3117 code,
3118 message,
3119 next_offset,
3120 } => Err(GroupEngineError::stream_with_next_offset(
3121 code,
3122 message,
3123 next_offset,
3124 )),
3125 other => Err(GroupEngineError::new(format!(
3126 "unexpected replay publish snapshot response: {other:?}"
3127 ))),
3128 }
3129 }
3130 StreamCommand::TouchStreamAccess {
3131 stream_id,
3132 now_ms,
3133 renew_ttl,
3134 } => {
3135 let response = self.state_machine.apply(StreamCommand::TouchStreamAccess {
3136 stream_id,
3137 now_ms,
3138 renew_ttl,
3139 });
3140 match response {
3141 StreamResponse::Accessed { changed, expired } => {
3142 if changed || expired {
3143 self.commit_index += 1;
3144 }
3145 Ok(())
3146 }
3147 StreamResponse::Error {
3148 code,
3149 message,
3150 next_offset,
3151 } => Err(GroupEngineError::stream_with_next_offset(
3152 code,
3153 message,
3154 next_offset,
3155 )),
3156 other => Err(GroupEngineError::new(format!(
3157 "unexpected replay touch stream access response: {other:?}"
3158 ))),
3159 }
3160 }
3161 StreamCommand::AddForkRef { stream_id, now_ms } => {
3162 let response = self
3163 .state_machine
3164 .apply(StreamCommand::AddForkRef { stream_id, now_ms });
3165 match response {
3166 StreamResponse::ForkRefAdded { .. } => {
3167 self.commit_index += 1;
3168 Ok(())
3169 }
3170 StreamResponse::Error {
3171 code,
3172 message,
3173 next_offset,
3174 } => Err(GroupEngineError::stream_with_next_offset(
3175 code,
3176 message,
3177 next_offset,
3178 )),
3179 other => Err(GroupEngineError::new(format!(
3180 "unexpected replay add fork ref response: {other:?}"
3181 ))),
3182 }
3183 }
3184 StreamCommand::ReleaseForkRef { stream_id } => {
3185 let response = self
3186 .state_machine
3187 .apply(StreamCommand::ReleaseForkRef { stream_id });
3188 match response {
3189 StreamResponse::ForkRefReleased { .. } => {
3190 self.commit_index += 1;
3191 Ok(())
3192 }
3193 StreamResponse::Error {
3194 code,
3195 message,
3196 next_offset,
3197 } => Err(GroupEngineError::stream_with_next_offset(
3198 code,
3199 message,
3200 next_offset,
3201 )),
3202 other => Err(GroupEngineError::new(format!(
3203 "unexpected replay release fork ref response: {other:?}"
3204 ))),
3205 }
3206 }
3207 StreamCommand::FlushCold { stream_id, chunk } => {
3208 let response = self
3209 .state_machine
3210 .apply(StreamCommand::FlushCold { stream_id, chunk });
3211 match response {
3212 StreamResponse::ColdFlushed { .. } => {
3213 self.commit_index += 1;
3214 Ok(())
3215 }
3216 StreamResponse::Error {
3217 code,
3218 message,
3219 next_offset,
3220 } => Err(GroupEngineError::stream_with_next_offset(
3221 code,
3222 message,
3223 next_offset,
3224 )),
3225 other => Err(GroupEngineError::new(format!(
3226 "unexpected replay flush cold response: {other:?}"
3227 ))),
3228 }
3229 }
3230 StreamCommand::Close {
3231 stream_id,
3232 stream_seq,
3233 producer,
3234 now_ms,
3235 } => {
3236 let response = self.state_machine.apply(StreamCommand::Close {
3237 stream_id,
3238 stream_seq,
3239 producer,
3240 now_ms,
3241 });
3242 match response {
3243 StreamResponse::Closed { deduplicated, .. } => {
3244 if !deduplicated {
3245 self.commit_index += 1;
3246 }
3247 Ok(())
3248 }
3249 StreamResponse::Error {
3250 code,
3251 message,
3252 next_offset,
3253 } => Err(GroupEngineError::stream_with_next_offset(
3254 code,
3255 message,
3256 next_offset,
3257 )),
3258 other => Err(GroupEngineError::new(format!(
3259 "unexpected replay close stream response: {other:?}"
3260 ))),
3261 }
3262 }
3263 StreamCommand::DeleteStream { stream_id } => {
3264 let response = self
3265 .state_machine
3266 .apply(StreamCommand::DeleteStream { stream_id });
3267 match response {
3268 StreamResponse::Deleted { .. } => {
3269 self.commit_index += 1;
3270 Ok(())
3271 }
3272 StreamResponse::Error {
3273 code,
3274 message,
3275 next_offset,
3276 } => Err(GroupEngineError::stream_with_next_offset(
3277 code,
3278 message,
3279 next_offset,
3280 )),
3281 other => Err(GroupEngineError::new(format!(
3282 "unexpected replay delete stream response: {other:?}"
3283 ))),
3284 }
3285 }
3286 }
3287 }
3288
3289 fn append_payload(
3290 &mut self,
3291 input: AppendPayloadInput<'_>,
3292 placement: ShardPlacement,
3293 ) -> Result<AppendResponse, GroupEngineError> {
3294 let AppendPayloadInput {
3295 stream_id,
3296 content_type,
3297 payload,
3298 close_after,
3299 stream_seq,
3300 producer,
3301 now_ms,
3302 } = input;
3303 let stream_count_key = stream_id.clone();
3304 let response = self.state_machine.append_borrowed(AppendStreamInput {
3305 stream_id,
3306 content_type,
3307 payload,
3308 close_after,
3309 stream_seq,
3310 producer,
3311 now_ms,
3312 });
3313 match response {
3314 StreamResponse::Appended {
3315 offset,
3316 next_offset,
3317 closed,
3318 deduplicated,
3319 producer,
3320 ..
3321 } => {
3322 let stream_append_count = self
3323 .stream_append_counts
3324 .entry(stream_count_key)
3325 .or_insert(0);
3326 if !deduplicated {
3327 self.commit_index += 1;
3328 *stream_append_count += 1;
3329 }
3330 Ok(AppendResponse {
3331 placement,
3332 start_offset: offset,
3333 next_offset,
3334 stream_append_count: *stream_append_count,
3335 group_commit_index: self.commit_index,
3336 closed,
3337 deduplicated,
3338 producer,
3339 })
3340 }
3341 StreamResponse::Error {
3342 code,
3343 message,
3344 next_offset,
3345 } => Err(GroupEngineError::stream_with_next_offset(
3346 code,
3347 message,
3348 next_offset,
3349 )),
3350 other => Err(GroupEngineError::new(format!(
3351 "unexpected append response: {other:?}"
3352 ))),
3353 }
3354 }
3355
3356 pub fn read_stream_plan(
3357 &mut self,
3358 request: &ReadStreamRequest,
3359 placement: ShardPlacement,
3360 ) -> Result<StreamReadPlan, GroupEngineError> {
3361 self.ensure_stream_access(&request.stream_id, request.now_ms, true, placement)?;
3362 self.read_stream_plan_after_access(request)
3363 }
3364
3365 pub fn read_stream_plan_after_access(
3366 &self,
3367 request: &ReadStreamRequest,
3368 ) -> Result<StreamReadPlan, GroupEngineError> {
3369 self.state_machine
3370 .read_plan_at(
3371 &request.stream_id,
3372 request.offset,
3373 request.max_len,
3374 request.now_ms,
3375 )
3376 .map_err(stream_response_error)
3377 }
3378
3379 pub async fn read_payload_from_plan(
3380 cold_store: Option<&ColdStoreHandle>,
3381 stream_id: &BucketStreamId,
3382 plan: &StreamReadPlan,
3383 ) -> Result<Vec<u8>, GroupEngineError> {
3384 let mut payload = Vec::new();
3385 for segment in &plan.segments {
3386 match segment {
3387 StreamReadSegment::Hot(bytes) => payload.extend_from_slice(bytes),
3388 StreamReadSegment::Object(segment) => {
3389 let Some(cold_store) = cold_store else {
3390 return Err(GroupEngineError::stream_with_next_offset(
3391 StreamErrorCode::InvalidColdFlush,
3392 format!("stream '{stream_id}' read requires object payload store"),
3393 Some(plan.next_offset),
3394 ));
3395 };
3396 let bytes = cold_store
3397 .read_object_range(&segment.object, segment.read_start_offset, segment.len)
3398 .await
3399 .map_err(|err| GroupEngineError::new(err.to_string()))?;
3400 payload.extend_from_slice(&bytes);
3401 }
3402 }
3403 }
3404 Ok(payload)
3405 }
3406
3407 async fn read_own_payload_from_plan(
3408 &self,
3409 stream_id: &BucketStreamId,
3410 plan: &StreamReadPlan,
3411 ) -> Result<Vec<u8>, GroupEngineError> {
3412 Self::read_payload_from_plan(self.cold_store.as_ref(), stream_id, plan).await
3413 }
3414
3415 async fn bootstrap_updates(
3416 &self,
3417 stream_id: &BucketStreamId,
3418 records: &[StreamMessageRecord],
3419 content_type: &str,
3420 now_ms: u64,
3421 ) -> Result<Vec<BootstrapUpdate>, GroupEngineError> {
3422 let mut updates = Vec::with_capacity(records.len());
3423 for record in records {
3424 let len = usize::try_from(record.end_offset - record.start_offset).map_err(|_| {
3425 GroupEngineError::stream(
3426 StreamErrorCode::InvalidSnapshot,
3427 format!(
3428 "bootstrap message [{}..{}) for stream '{stream_id}' is too large",
3429 record.start_offset, record.end_offset
3430 ),
3431 )
3432 })?;
3433 let plan = self
3434 .state_machine
3435 .read_plan_at(stream_id, record.start_offset, len, now_ms)
3436 .map_err(stream_response_error)?;
3437 let payload = self.read_own_payload_from_plan(stream_id, &plan).await?;
3438 updates.push(BootstrapUpdate {
3439 start_offset: record.start_offset,
3440 next_offset: record.end_offset,
3441 content_type: content_type.to_owned(),
3442 payload,
3443 });
3444 }
3445 Ok(updates)
3446 }
3447
3448 fn build_snapshot(&self, placement: ShardPlacement) -> GroupSnapshot {
3449 GroupSnapshot {
3450 placement,
3451 group_commit_index: self.commit_index,
3452 stream_snapshot: self.state_machine.snapshot(),
3453 stream_append_counts: self.stream_append_counts_snapshot(),
3454 }
3455 }
3456
3457 fn stream_append_counts_snapshot(&self) -> Vec<StreamAppendCount> {
3458 let mut counts = self
3459 .stream_append_counts
3460 .iter()
3461 .map(|(stream_id, append_count)| StreamAppendCount {
3462 stream_id: stream_id.clone(),
3463 append_count: *append_count,
3464 })
3465 .collect::<Vec<_>>();
3466 counts.sort_by(|left, right| compare_stream_ids(&left.stream_id, &right.stream_id));
3467 counts
3468 }
3469
3470 fn install_snapshot_inner(&mut self, snapshot: GroupSnapshot) -> Result<(), GroupEngineError> {
3471 let GroupSnapshot {
3472 placement: _,
3473 group_commit_index,
3474 stream_snapshot,
3475 stream_append_counts,
3476 } = snapshot;
3477 self.install_snapshot_parts(group_commit_index, stream_snapshot, stream_append_counts)
3478 }
3479
3480 fn install_snapshot_parts(
3481 &mut self,
3482 group_commit_index: u64,
3483 stream_snapshot: StreamSnapshot,
3484 stream_append_counts: Vec<StreamAppendCount>,
3485 ) -> Result<(), GroupEngineError> {
3486 let stream_ids = stream_snapshot
3487 .streams
3488 .iter()
3489 .map(|entry| entry.metadata.stream_id.clone())
3490 .collect::<HashSet<_>>();
3491 let state_machine = StreamStateMachine::restore(stream_snapshot)
3492 .map_err(|err| GroupEngineError::new(format!("restore stream snapshot: {err}")))?;
3493 let stream_append_counts = restore_stream_append_counts(stream_append_counts, &stream_ids)?;
3494
3495 self.commit_index = group_commit_index;
3496 self.state_machine = state_machine;
3497 self.stream_append_counts = stream_append_counts;
3498 Ok(())
3499 }
3500}
3501
3502impl GroupEngine for InMemoryGroupEngine {
3503 fn create_stream<'a>(
3504 &'a mut self,
3505 request: CreateStreamRequest,
3506 placement: ShardPlacement,
3507 ) -> GroupCreateStreamFuture<'a> {
3508 let command = GroupWriteCommand::from(request);
3509 Box::pin(async move {
3510 match self.apply_committed_write(command, placement)? {
3511 GroupWriteResponse::CreateStream(response) => Ok(response),
3512 other => Err(GroupEngineError::new(format!(
3513 "unexpected create stream write response: {other:?}"
3514 ))),
3515 }
3516 })
3517 }
3518
3519 fn create_stream_with_cold_admission<'a>(
3520 &'a mut self,
3521 request: CreateStreamRequest,
3522 placement: ShardPlacement,
3523 admission: ColdWriteAdmission,
3524 ) -> GroupCreateStreamFuture<'a> {
3525 if !admission.is_enabled() {
3526 return self.create_stream(request, placement);
3527 }
3528 Box::pin(
3529 async move { self.create_stream_with_admission_inner(request, placement, admission) },
3530 )
3531 }
3532
3533 fn create_stream_external<'a>(
3534 &'a mut self,
3535 request: CreateStreamExternalRequest,
3536 placement: ShardPlacement,
3537 ) -> GroupCreateStreamFuture<'a> {
3538 let command = GroupWriteCommand::from(request);
3539 Box::pin(async move {
3540 match self.apply_committed_write(command, placement)? {
3541 GroupWriteResponse::CreateStream(response) => Ok(response),
3542 other => Err(GroupEngineError::new(format!(
3543 "unexpected external create stream write response: {other:?}"
3544 ))),
3545 }
3546 })
3547 }
3548
3549 fn read_stream<'a>(
3550 &'a mut self,
3551 request: ReadStreamRequest,
3552 placement: ShardPlacement,
3553 ) -> GroupReadStreamFuture<'a> {
3554 Box::pin(async move {
3555 self.read_stream_parts(request, placement)
3556 .await?
3557 .into_response()
3558 .await
3559 })
3560 }
3561
3562 fn read_stream_parts<'a>(
3563 &'a mut self,
3564 request: ReadStreamRequest,
3565 placement: ShardPlacement,
3566 ) -> GroupReadStreamPartsFuture<'a> {
3567 Box::pin(async move {
3568 let stream_id = request.stream_id.clone();
3569 let plan = self.read_stream_plan(&request, placement)?;
3570 Ok(GroupReadStreamParts::from_plan(
3571 placement,
3572 stream_id,
3573 plan,
3574 self.cold_store(),
3575 ))
3576 })
3577 }
3578
3579 fn publish_snapshot<'a>(
3580 &'a mut self,
3581 request: PublishSnapshotRequest,
3582 placement: ShardPlacement,
3583 ) -> GroupPublishSnapshotFuture<'a> {
3584 Box::pin(async move {
3585 self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3586 let command = GroupWriteCommand::from(request);
3587 match self.apply_committed_write(command, placement)? {
3588 GroupWriteResponse::PublishSnapshot(response) => Ok(response),
3589 other => Err(GroupEngineError::new(format!(
3590 "unexpected publish snapshot write response: {other:?}"
3591 ))),
3592 }
3593 })
3594 }
3595
3596 fn read_snapshot<'a>(
3597 &'a mut self,
3598 request: ReadSnapshotRequest,
3599 placement: ShardPlacement,
3600 ) -> GroupReadSnapshotFuture<'a> {
3601 Box::pin(async move {
3602 self.ensure_stream_access(&request.stream_id, request.now_ms, true, placement)?;
3603 let snapshot = match request.snapshot_offset {
3604 Some(offset) => self
3605 .state_machine
3606 .read_snapshot(&request.stream_id, offset)
3607 .map_err(stream_response_error)?,
3608 None => self
3609 .state_machine
3610 .latest_snapshot(&request.stream_id)
3611 .map_err(stream_response_error)?
3612 .ok_or_else(|| {
3613 GroupEngineError::stream(
3614 StreamErrorCode::SnapshotNotFound,
3615 format!("stream '{}' has no visible snapshot", request.stream_id),
3616 )
3617 })?,
3618 };
3619 let tail_offset = self
3620 .state_machine
3621 .head_at(&request.stream_id, request.now_ms)
3622 .map(|metadata| metadata.tail_offset)
3623 .unwrap_or(snapshot.offset);
3624 Ok(ReadSnapshotResponse {
3625 placement,
3626 snapshot_offset: snapshot.offset,
3627 next_offset: snapshot.offset,
3628 content_type: snapshot.content_type,
3629 payload: snapshot.payload,
3630 up_to_date: snapshot.offset == tail_offset,
3631 })
3632 })
3633 }
3634
3635 fn delete_snapshot<'a>(
3636 &'a mut self,
3637 request: DeleteSnapshotRequest,
3638 placement: ShardPlacement,
3639 ) -> GroupDeleteSnapshotFuture<'a> {
3640 Box::pin(async move {
3641 self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3642 match self
3643 .state_machine
3644 .delete_snapshot(&request.stream_id, request.snapshot_offset)
3645 {
3646 StreamResponse::Error {
3647 code,
3648 message,
3649 next_offset,
3650 } => Err(GroupEngineError::stream_with_next_offset(
3651 code,
3652 message,
3653 next_offset,
3654 )),
3655 other => Err(GroupEngineError::new(format!(
3656 "unexpected delete snapshot response: {other:?}"
3657 ))),
3658 }
3659 })
3660 }
3661
3662 fn bootstrap_stream<'a>(
3663 &'a mut self,
3664 request: BootstrapStreamRequest,
3665 placement: ShardPlacement,
3666 ) -> GroupBootstrapStreamFuture<'a> {
3667 Box::pin(async move {
3668 self.ensure_stream_access(&request.stream_id, request.now_ms, true, placement)?;
3669 let plan = self
3670 .state_machine
3671 .bootstrap_plan(&request.stream_id)
3672 .map_err(stream_response_error)?;
3673 let snapshot_offset = plan.snapshot.as_ref().map(|snapshot| snapshot.offset);
3674 let snapshot_content_type = plan
3675 .snapshot
3676 .as_ref()
3677 .map(|snapshot| snapshot.content_type.clone())
3678 .unwrap_or_else(|| DEFAULT_CONTENT_TYPE.to_owned());
3679 let snapshot_payload = plan
3680 .snapshot
3681 .as_ref()
3682 .map(|snapshot| snapshot.payload.clone())
3683 .unwrap_or_default();
3684 let updates = self
3685 .bootstrap_updates(
3686 &request.stream_id,
3687 &plan.updates,
3688 &plan.content_type,
3689 request.now_ms,
3690 )
3691 .await?;
3692 Ok(BootstrapStreamResponse {
3693 placement,
3694 snapshot_offset,
3695 snapshot_content_type,
3696 snapshot_payload,
3697 updates,
3698 next_offset: plan.next_offset,
3699 up_to_date: plan.up_to_date,
3700 closed: plan.closed,
3701 })
3702 })
3703 }
3704
3705 fn touch_stream_access<'a>(
3706 &'a mut self,
3707 stream_id: BucketStreamId,
3708 now_ms: u64,
3709 renew_ttl: bool,
3710 placement: ShardPlacement,
3711 ) -> GroupTouchStreamAccessFuture<'a> {
3712 Box::pin(async move { self.apply_access_command(stream_id, now_ms, renew_ttl, placement) })
3713 }
3714
3715 fn add_fork_ref<'a>(
3716 &'a mut self,
3717 stream_id: BucketStreamId,
3718 now_ms: u64,
3719 placement: ShardPlacement,
3720 ) -> GroupForkRefFuture<'a> {
3721 Box::pin(async move {
3722 match self.apply_committed_write(
3723 GroupWriteCommand::AddForkRef { stream_id, now_ms },
3724 placement,
3725 )? {
3726 GroupWriteResponse::AddForkRef(response) => Ok(response),
3727 other => Err(GroupEngineError::new(format!(
3728 "unexpected add fork ref write response: {other:?}"
3729 ))),
3730 }
3731 })
3732 }
3733
3734 fn release_fork_ref<'a>(
3735 &'a mut self,
3736 stream_id: BucketStreamId,
3737 placement: ShardPlacement,
3738 ) -> GroupForkRefFuture<'a> {
3739 Box::pin(async move {
3740 match self
3741 .apply_committed_write(GroupWriteCommand::ReleaseForkRef { stream_id }, placement)?
3742 {
3743 GroupWriteResponse::ReleaseForkRef(response) => Ok(response),
3744 other => Err(GroupEngineError::new(format!(
3745 "unexpected release fork ref write response: {other:?}"
3746 ))),
3747 }
3748 })
3749 }
3750
3751 fn head_stream<'a>(
3752 &'a mut self,
3753 request: HeadStreamRequest,
3754 placement: ShardPlacement,
3755 ) -> GroupHeadStreamFuture<'a> {
3756 Box::pin(async move {
3757 self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3758 let Some(metadata) = self
3759 .state_machine
3760 .head_at(&request.stream_id, request.now_ms)
3761 else {
3762 return Err(GroupEngineError::stream(
3763 StreamErrorCode::StreamNotFound,
3764 format!("stream '{}' does not exist", request.stream_id),
3765 ));
3766 };
3767 Ok(HeadStreamResponse {
3768 placement,
3769 content_type: metadata.content_type.clone(),
3770 tail_offset: metadata.tail_offset,
3771 closed: metadata.status == ursula_stream::StreamStatus::Closed,
3772 stream_ttl_seconds: metadata.stream_ttl_seconds,
3773 stream_expires_at_ms: metadata.stream_expires_at_ms,
3774 snapshot_offset: self
3775 .state_machine
3776 .latest_snapshot(&request.stream_id)
3777 .map_err(stream_response_error)?
3778 .map(|snapshot| snapshot.offset),
3779 })
3780 })
3781 }
3782
3783 fn close_stream<'a>(
3784 &'a mut self,
3785 request: CloseStreamRequest,
3786 placement: ShardPlacement,
3787 ) -> GroupCloseStreamFuture<'a> {
3788 Box::pin(async move {
3789 self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3790 let command = GroupWriteCommand::from(request);
3791 match self.apply_committed_write(command, placement)? {
3792 GroupWriteResponse::CloseStream(response) => Ok(response),
3793 other => Err(GroupEngineError::new(format!(
3794 "unexpected close stream write response: {other:?}"
3795 ))),
3796 }
3797 })
3798 }
3799
3800 fn delete_stream<'a>(
3801 &'a mut self,
3802 request: DeleteStreamRequest,
3803 placement: ShardPlacement,
3804 ) -> GroupDeleteStreamFuture<'a> {
3805 let command = GroupWriteCommand::from(request);
3806 Box::pin(async move {
3807 match self.apply_committed_write(command, placement)? {
3808 GroupWriteResponse::DeleteStream(response) => Ok(response),
3809 other => Err(GroupEngineError::new(format!(
3810 "unexpected delete stream write response: {other:?}"
3811 ))),
3812 }
3813 })
3814 }
3815
3816 fn append<'a>(
3817 &'a mut self,
3818 request: AppendRequest,
3819 placement: ShardPlacement,
3820 ) -> GroupAppendFuture<'a> {
3821 Box::pin(async move {
3822 self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3823 let command = GroupWriteCommand::from(request);
3824 match self.apply_committed_write(command, placement)? {
3825 GroupWriteResponse::Append(response) => Ok(response),
3826 other => Err(GroupEngineError::new(format!(
3827 "unexpected append write response: {other:?}"
3828 ))),
3829 }
3830 })
3831 }
3832
3833 fn append_with_cold_admission<'a>(
3834 &'a mut self,
3835 request: AppendRequest,
3836 placement: ShardPlacement,
3837 admission: ColdWriteAdmission,
3838 ) -> GroupAppendFuture<'a> {
3839 if !admission.is_enabled() {
3840 return self.append(request, placement);
3841 }
3842 Box::pin(async move { self.append_with_admission_inner(request, placement, admission) })
3843 }
3844
3845 fn append_external<'a>(
3846 &'a mut self,
3847 request: AppendExternalRequest,
3848 placement: ShardPlacement,
3849 ) -> GroupAppendFuture<'a> {
3850 Box::pin(async move {
3851 self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3852 let command = GroupWriteCommand::from(request);
3853 match self.apply_committed_write(command, placement)? {
3854 GroupWriteResponse::Append(response) => Ok(response),
3855 other => Err(GroupEngineError::new(format!(
3856 "unexpected external append write response: {other:?}"
3857 ))),
3858 }
3859 })
3860 }
3861
3862 fn append_batch<'a>(
3863 &'a mut self,
3864 request: AppendBatchRequest,
3865 placement: ShardPlacement,
3866 ) -> GroupAppendBatchFuture<'a> {
3867 Box::pin(async move {
3868 self.ensure_stream_access(&request.stream_id, request.now_ms, false, placement)?;
3869 let command = GroupWriteCommand::from(request);
3870 match self.apply_committed_write(command, placement)? {
3871 GroupWriteResponse::AppendBatch(response) => Ok(response),
3872 other => Err(GroupEngineError::new(format!(
3873 "unexpected append batch write response: {other:?}"
3874 ))),
3875 }
3876 })
3877 }
3878
3879 fn append_batch_with_cold_admission<'a>(
3880 &'a mut self,
3881 request: AppendBatchRequest,
3882 placement: ShardPlacement,
3883 admission: ColdWriteAdmission,
3884 ) -> GroupAppendBatchFuture<'a> {
3885 if !admission.is_enabled() {
3886 return self.append_batch(request, placement);
3887 }
3888 Box::pin(
3889 async move { self.append_batch_with_admission_inner(request, placement, admission) },
3890 )
3891 }
3892
3893 fn flush_cold<'a>(
3894 &'a mut self,
3895 request: FlushColdRequest,
3896 placement: ShardPlacement,
3897 ) -> GroupFlushColdFuture<'a> {
3898 let command = GroupWriteCommand::from(request);
3899 Box::pin(async move {
3900 match self.apply_committed_write(command, placement)? {
3901 GroupWriteResponse::FlushCold(response) => Ok(response),
3902 other => Err(GroupEngineError::new(format!(
3903 "unexpected flush cold write response: {other:?}"
3904 ))),
3905 }
3906 })
3907 }
3908
3909 fn plan_cold_flush<'a>(
3910 &'a mut self,
3911 request: PlanColdFlushRequest,
3912 _placement: ShardPlacement,
3913 ) -> GroupPlanColdFlushFuture<'a> {
3914 Box::pin(async move {
3915 self.state_machine
3916 .plan_cold_flush(
3917 &request.stream_id,
3918 request.min_hot_bytes,
3919 request.max_flush_bytes,
3920 )
3921 .map_err(stream_response_error)
3922 })
3923 }
3924
3925 fn plan_next_cold_flush<'a>(
3926 &'a mut self,
3927 request: PlanGroupColdFlushRequest,
3928 _placement: ShardPlacement,
3929 ) -> GroupPlanNextColdFlushFuture<'a> {
3930 Box::pin(async move {
3931 self.state_machine
3932 .plan_next_cold_flush(request.min_hot_bytes, request.max_flush_bytes)
3933 .map_err(stream_response_error)
3934 })
3935 }
3936
3937 fn plan_next_cold_flush_batch<'a>(
3938 &'a mut self,
3939 request: PlanGroupColdFlushRequest,
3940 _placement: ShardPlacement,
3941 max_candidates: usize,
3942 ) -> GroupPlanNextColdFlushBatchFuture<'a> {
3943 Box::pin(async move {
3944 self.state_machine
3945 .plan_next_cold_flush_batch(
3946 request.min_hot_bytes,
3947 request.max_flush_bytes,
3948 max_candidates,
3949 )
3950 .map_err(stream_response_error)
3951 })
3952 }
3953
3954 fn cold_hot_backlog<'a>(
3955 &'a mut self,
3956 stream_id: BucketStreamId,
3957 _placement: ShardPlacement,
3958 ) -> GroupColdHotBacklogFuture<'a> {
3959 Box::pin(async move { self.cold_hot_backlog_for(stream_id) })
3960 }
3961
3962 fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
3963 Box::pin(async move { Ok(self.build_snapshot(placement)) })
3964 }
3965
3966 fn install_snapshot<'a>(
3967 &'a mut self,
3968 snapshot: GroupSnapshot,
3969 ) -> GroupInstallSnapshotFuture<'a> {
3970 Box::pin(async move { self.install_snapshot_inner(snapshot) })
3971 }
3972}
3973
/// Factory for `InMemoryGroupEngine` instances.
///
/// The default value has no cold store.
#[derive(Debug, Clone, Default)]
pub struct InMemoryGroupEngineFactory {
    /// Optional cold store handle; a clone of it is given to every engine this
    /// factory creates.
    cold_store: Option<ColdStoreHandle>,
}
3978
3979impl InMemoryGroupEngineFactory {
3980 pub fn new() -> Self {
3981 Self::default()
3982 }
3983
3984 pub fn with_cold_store(cold_store: Option<ColdStoreHandle>) -> Self {
3985 Self { cold_store }
3986 }
3987}
3988
3989impl GroupEngineFactory for InMemoryGroupEngineFactory {
3990 fn create<'a>(
3991 &'a self,
3992 _placement: ShardPlacement,
3993 _metrics: GroupEngineMetrics,
3994 ) -> GroupEngineCreateFuture<'a> {
3995 Box::pin(async move {
3996 let engine = InMemoryGroupEngine {
3997 cold_store: self.cold_store.clone(),
3998 ..InMemoryGroupEngine::default()
3999 };
4000 let engine: Box<dyn GroupEngine> = Box::new(engine);
4001 Ok(engine)
4002 })
4003 }
4004}
4005
/// Factory for `WalGroupEngine` instances whose logs live under a common root
/// directory.
#[derive(Debug, Clone)]
pub struct WalGroupEngineFactory {
    /// Root directory passed to `WalGroupEngine::open` for every engine.
    root: PathBuf,
    /// Optional cold store handle; a clone of it is given to every engine this
    /// factory creates.
    cold_store: Option<ColdStoreHandle>,
}
4011
4012impl WalGroupEngineFactory {
4013 pub fn new(root: impl Into<PathBuf>) -> Self {
4014 Self {
4015 root: root.into(),
4016 cold_store: None,
4017 }
4018 }
4019
4020 pub fn with_cold_store(root: impl Into<PathBuf>, cold_store: Option<ColdStoreHandle>) -> Self {
4021 Self {
4022 root: root.into(),
4023 cold_store,
4024 }
4025 }
4026}
4027
4028impl GroupEngineFactory for WalGroupEngineFactory {
4029 fn create<'a>(
4030 &'a self,
4031 placement: ShardPlacement,
4032 metrics: GroupEngineMetrics,
4033 ) -> GroupEngineCreateFuture<'a> {
4034 Box::pin(async move {
4035 let engine: Box<dyn GroupEngine> = Box::new(WalGroupEngine::open(
4036 &self.root,
4037 placement,
4038 metrics,
4039 self.cold_store.clone(),
4040 ));
4041 Ok(engine)
4042 })
4043 }
4044}
4045
/// Group engine that keeps its state in an `InMemoryGroupEngine` and appends
/// write commands and snapshots to a JSON-lines log file.
pub struct WalGroupEngine {
    /// In-memory state; rebuilt from the log when the engine is opened.
    inner: InMemoryGroupEngine,
    /// Path of this group's log file.
    log_path: PathBuf,
    /// Placement this engine was opened for; passed along when recording WAL
    /// metrics.
    placement: ShardPlacement,
    /// Metrics sink that receives WAL batch timings.
    metrics: GroupEngineMetrics,
    /// Message of the log replay failure, if replay failed at open. While set,
    /// `ensure_ready` fails with this message.
    init_error: Option<String>,
}
4053
/// One line of the group log.
///
/// Serialized with an internal `wal_record` tag whose value is the snake_case
/// variant name.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "wal_record", rename_all = "snake_case")]
enum WalRecord {
    /// A single write command.
    Command {
        command: Box<GroupWriteCommand>,
    },
    /// A full group snapshot: commit index, stream state, and per-stream
    /// append counts, copied from a `GroupSnapshot`.
    Snapshot {
        group_commit_index: u64,
        stream_snapshot: StreamSnapshot,
        stream_append_counts: Vec<StreamAppendCount>,
    },
}
4066
4067impl WalGroupEngine {
4068 fn open(
4069 root: &Path,
4070 placement: ShardPlacement,
4071 metrics: GroupEngineMetrics,
4072 cold_store: Option<ColdStoreHandle>,
4073 ) -> Self {
4074 let log_path = group_log_path(root, placement);
4075 match replay_group_log(&log_path) {
4076 Ok(mut inner) => {
4077 inner.cold_store = cold_store;
4078 Self {
4079 inner,
4080 log_path,
4081 placement,
4082 metrics,
4083 init_error: None,
4084 }
4085 }
4086 Err(err) => Self {
4087 inner: InMemoryGroupEngine {
4088 cold_store,
4089 ..InMemoryGroupEngine::default()
4090 },
4091 log_path,
4092 placement,
4093 metrics,
4094 init_error: Some(err.message().to_owned()),
4095 },
4096 }
4097 }
4098
4099 fn ensure_ready(&self) -> Result<(), GroupEngineError> {
4100 match &self.init_error {
4101 Some(message) => Err(GroupEngineError::new(message.clone())),
4102 None => Ok(()),
4103 }
4104 }
4105
4106 fn append_record(&self, command: &GroupWriteCommand) -> Result<(), GroupEngineError> {
4107 self.append_records(std::slice::from_ref(command))
4108 }
4109
4110 fn append_records(&self, commands: &[GroupWriteCommand]) -> Result<(), GroupEngineError> {
4111 if commands.is_empty() {
4112 return Ok(());
4113 }
4114 let Some(parent) = self.log_path.parent() else {
4115 return Err(GroupEngineError::new(format!(
4116 "WAL path '{}' has no parent directory",
4117 self.log_path.display()
4118 )));
4119 };
4120 fs::create_dir_all(parent).map_err(|err| {
4121 GroupEngineError::new(format!("create WAL dir '{}': {err}", parent.display()))
4122 })?;
4123 let write_started_at = Instant::now();
4124 let mut file = OpenOptions::new()
4125 .create(true)
4126 .append(true)
4127 .open(&self.log_path)
4128 .map_err(|err| {
4129 GroupEngineError::new(format!("open WAL '{}': {err}", self.log_path.display()))
4130 })?;
4131 for command in commands {
4132 let record = WalRecord::Command {
4133 command: Box::new(command.clone()),
4134 };
4135 serde_json::to_writer(&mut file, &record).map_err(|err| {
4136 GroupEngineError::new(format!("encode WAL '{}': {err}", self.log_path.display()))
4137 })?;
4138 file.write_all(b"\n").map_err(|err| {
4139 GroupEngineError::new(format!("write WAL '{}': {err}", self.log_path.display()))
4140 })?;
4141 }
4142 let write_ns = elapsed_ns(write_started_at);
4143 let sync_started_at = Instant::now();
4144 file.sync_data().map_err(|err| {
4145 GroupEngineError::new(format!("sync WAL '{}': {err}", self.log_path.display()))
4146 })?;
4147 self.metrics.record_wal_batch(
4148 self.placement,
4149 commands.len(),
4150 write_ns,
4151 elapsed_ns(sync_started_at),
4152 );
4153 Ok(())
4154 }
4155
4156 fn append_snapshot_record(&self, snapshot: &GroupSnapshot) -> Result<(), GroupEngineError> {
4157 let record = WalRecord::Snapshot {
4158 group_commit_index: snapshot.group_commit_index,
4159 stream_snapshot: snapshot.stream_snapshot.clone(),
4160 stream_append_counts: snapshot.stream_append_counts.clone(),
4161 };
4162 let Some(parent) = self.log_path.parent() else {
4163 return Err(GroupEngineError::new(format!(
4164 "WAL path '{}' has no parent directory",
4165 self.log_path.display()
4166 )));
4167 };
4168 fs::create_dir_all(parent).map_err(|err| {
4169 GroupEngineError::new(format!("create WAL dir '{}': {err}", parent.display()))
4170 })?;
4171 let write_started_at = Instant::now();
4172 let mut file = OpenOptions::new()
4173 .create(true)
4174 .append(true)
4175 .open(&self.log_path)
4176 .map_err(|err| {
4177 GroupEngineError::new(format!("open WAL '{}': {err}", self.log_path.display()))
4178 })?;
4179 serde_json::to_writer(&mut file, &record).map_err(|err| {
4180 GroupEngineError::new(format!("encode WAL '{}': {err}", self.log_path.display()))
4181 })?;
4182 file.write_all(b"\n").map_err(|err| {
4183 GroupEngineError::new(format!("write WAL '{}': {err}", self.log_path.display()))
4184 })?;
4185 let write_ns = elapsed_ns(write_started_at);
4186 let sync_started_at = Instant::now();
4187 file.sync_data().map_err(|err| {
4188 GroupEngineError::new(format!("sync WAL '{}': {err}", self.log_path.display()))
4189 })?;
4190 self.metrics
4191 .record_wal_batch(self.placement, 1, write_ns, elapsed_ns(sync_started_at));
4192 Ok(())
4193 }
4194
4195 fn commit_access_if_needed(
4196 &mut self,
4197 stream_id: &BucketStreamId,
4198 now_ms: u64,
4199 renew_ttl: bool,
4200 placement: ShardPlacement,
4201 ) -> Result<Option<TouchStreamAccessResponse>, GroupEngineError> {
4202 if !self
4203 .inner
4204 .access_requires_write(stream_id, now_ms, renew_ttl)?
4205 {
4206 return Ok(None);
4207 }
4208 let command = GroupWriteCommand::TouchStreamAccess {
4209 stream_id: stream_id.clone(),
4210 now_ms,
4211 renew_ttl,
4212 };
4213 let mut preview = self.inner.clone();
4214 let response = match preview.apply_committed_write(command.clone(), placement)? {
4215 GroupWriteResponse::TouchStreamAccess(response) => response,
4216 other => {
4217 return Err(GroupEngineError::new(format!(
4218 "unexpected touch stream access write response: {other:?}"
4219 )));
4220 }
4221 };
4222 if response.changed || response.expired {
4223 self.append_record(&command)?;
4224 }
4225 self.inner = preview;
4226 if response.expired {
4227 return Err(GroupEngineError::stream(
4228 StreamErrorCode::StreamNotFound,
4229 format!("stream '{stream_id}' does not exist"),
4230 ));
4231 }
4232 Ok(Some(response))
4233 }
4234}
4235
4236impl GroupEngine for WalGroupEngine {
4237 fn create_stream<'a>(
4238 &'a mut self,
4239 request: CreateStreamRequest,
4240 placement: ShardPlacement,
4241 ) -> GroupCreateStreamFuture<'a> {
4242 Box::pin(async move {
4243 self.ensure_ready()?;
4244 let command = GroupWriteCommand::from(request);
4245 let mut preview = self.inner.clone();
4246 let response = match preview.apply_committed_write(command.clone(), placement)? {
4247 GroupWriteResponse::CreateStream(response) => response,
4248 other => {
4249 return Err(GroupEngineError::new(format!(
4250 "unexpected create stream write response: {other:?}"
4251 )));
4252 }
4253 };
4254 if !response.already_exists {
4255 self.append_record(&command)?;
4256 }
4257 self.inner = preview;
4258 Ok(response)
4259 })
4260 }
4261
4262 fn create_stream_with_cold_admission<'a>(
4263 &'a mut self,
4264 request: CreateStreamRequest,
4265 placement: ShardPlacement,
4266 admission: ColdWriteAdmission,
4267 ) -> GroupCreateStreamFuture<'a> {
4268 if !admission.is_enabled() {
4269 return self.create_stream(request, placement);
4270 }
4271 Box::pin(async move {
4272 self.ensure_ready()?;
4273 let command = GroupWriteCommand::from(request.clone());
4274 let mut preview = self.inner.clone();
4275 let response =
4276 preview.create_stream_with_admission_inner(request, placement, admission)?;
4277 if !response.already_exists {
4278 self.append_record(&command)?;
4279 }
4280 self.inner = preview;
4281 Ok(response)
4282 })
4283 }
4284
4285 fn head_stream<'a>(
4286 &'a mut self,
4287 request: HeadStreamRequest,
4288 placement: ShardPlacement,
4289 ) -> GroupHeadStreamFuture<'a> {
4290 Box::pin(async move {
4291 self.ensure_ready()?;
4292 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4293 self.inner.head_stream(request, placement).await
4294 })
4295 }
4296
4297 fn read_stream<'a>(
4298 &'a mut self,
4299 request: ReadStreamRequest,
4300 placement: ShardPlacement,
4301 ) -> GroupReadStreamFuture<'a> {
4302 Box::pin(async move {
4303 self.ensure_ready()?;
4304 self.commit_access_if_needed(&request.stream_id, request.now_ms, true, placement)?;
4305 self.inner.read_stream(request, placement).await
4306 })
4307 }
4308
4309 fn publish_snapshot<'a>(
4310 &'a mut self,
4311 request: PublishSnapshotRequest,
4312 placement: ShardPlacement,
4313 ) -> GroupPublishSnapshotFuture<'a> {
4314 Box::pin(async move {
4315 self.ensure_ready()?;
4316 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4317 let command = GroupWriteCommand::from(request);
4318 let mut preview = self.inner.clone();
4319 let response = match preview.apply_committed_write(command.clone(), placement)? {
4320 GroupWriteResponse::PublishSnapshot(response) => response,
4321 other => {
4322 return Err(GroupEngineError::new(format!(
4323 "unexpected publish snapshot write response: {other:?}"
4324 )));
4325 }
4326 };
4327 self.append_record(&command)?;
4328 self.inner = preview;
4329 Ok(response)
4330 })
4331 }
4332
4333 fn read_snapshot<'a>(
4334 &'a mut self,
4335 request: ReadSnapshotRequest,
4336 placement: ShardPlacement,
4337 ) -> GroupReadSnapshotFuture<'a> {
4338 Box::pin(async move {
4339 self.ensure_ready()?;
4340 self.commit_access_if_needed(&request.stream_id, request.now_ms, true, placement)?;
4341 self.inner.read_snapshot(request, placement).await
4342 })
4343 }
4344
4345 fn delete_snapshot<'a>(
4346 &'a mut self,
4347 request: DeleteSnapshotRequest,
4348 placement: ShardPlacement,
4349 ) -> GroupDeleteSnapshotFuture<'a> {
4350 Box::pin(async move {
4351 self.ensure_ready()?;
4352 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4353 self.inner.delete_snapshot(request, placement).await
4354 })
4355 }
4356
4357 fn bootstrap_stream<'a>(
4358 &'a mut self,
4359 request: BootstrapStreamRequest,
4360 placement: ShardPlacement,
4361 ) -> GroupBootstrapStreamFuture<'a> {
4362 Box::pin(async move {
4363 self.ensure_ready()?;
4364 self.commit_access_if_needed(&request.stream_id, request.now_ms, true, placement)?;
4365 self.inner.bootstrap_stream(request, placement).await
4366 })
4367 }
4368
4369 fn touch_stream_access<'a>(
4370 &'a mut self,
4371 stream_id: BucketStreamId,
4372 now_ms: u64,
4373 renew_ttl: bool,
4374 placement: ShardPlacement,
4375 ) -> GroupTouchStreamAccessFuture<'a> {
4376 Box::pin(async move {
4377 self.ensure_ready()?;
4378 let command = GroupWriteCommand::TouchStreamAccess {
4379 stream_id,
4380 now_ms,
4381 renew_ttl,
4382 };
4383 let mut preview = self.inner.clone();
4384 let response = match preview.apply_committed_write(command.clone(), placement)? {
4385 GroupWriteResponse::TouchStreamAccess(response) => response,
4386 other => {
4387 return Err(GroupEngineError::new(format!(
4388 "unexpected touch stream access write response: {other:?}"
4389 )));
4390 }
4391 };
4392 if response.changed || response.expired {
4393 self.append_record(&command)?;
4394 }
4395 self.inner = preview;
4396 Ok(response)
4397 })
4398 }
4399
4400 fn add_fork_ref<'a>(
4401 &'a mut self,
4402 stream_id: BucketStreamId,
4403 now_ms: u64,
4404 placement: ShardPlacement,
4405 ) -> GroupForkRefFuture<'a> {
4406 Box::pin(async move {
4407 self.ensure_ready()?;
4408 let command = GroupWriteCommand::AddForkRef { stream_id, now_ms };
4409 let mut preview = self.inner.clone();
4410 let response = match preview.apply_committed_write(command.clone(), placement)? {
4411 GroupWriteResponse::AddForkRef(response) => response,
4412 other => {
4413 return Err(GroupEngineError::new(format!(
4414 "unexpected add fork ref write response: {other:?}"
4415 )));
4416 }
4417 };
4418 self.append_record(&command)?;
4419 self.inner = preview;
4420 Ok(response)
4421 })
4422 }
4423
4424 fn release_fork_ref<'a>(
4425 &'a mut self,
4426 stream_id: BucketStreamId,
4427 placement: ShardPlacement,
4428 ) -> GroupForkRefFuture<'a> {
4429 Box::pin(async move {
4430 self.ensure_ready()?;
4431 let command = GroupWriteCommand::ReleaseForkRef { stream_id };
4432 let mut preview = self.inner.clone();
4433 let response = match preview.apply_committed_write(command.clone(), placement)? {
4434 GroupWriteResponse::ReleaseForkRef(response) => response,
4435 other => {
4436 return Err(GroupEngineError::new(format!(
4437 "unexpected release fork ref write response: {other:?}"
4438 )));
4439 }
4440 };
4441 self.append_record(&command)?;
4442 self.inner = preview;
4443 Ok(response)
4444 })
4445 }
4446
4447 fn close_stream<'a>(
4448 &'a mut self,
4449 request: CloseStreamRequest,
4450 placement: ShardPlacement,
4451 ) -> GroupCloseStreamFuture<'a> {
4452 Box::pin(async move {
4453 self.ensure_ready()?;
4454 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4455 let command = GroupWriteCommand::from(request);
4456 let mut preview = self.inner.clone();
4457 let response = match preview.apply_committed_write(command.clone(), placement)? {
4458 GroupWriteResponse::CloseStream(response) => response,
4459 other => {
4460 return Err(GroupEngineError::new(format!(
4461 "unexpected close stream write response: {other:?}"
4462 )));
4463 }
4464 };
4465 self.append_record(&command)?;
4466 self.inner = preview;
4467 Ok(response)
4468 })
4469 }
4470
4471 fn delete_stream<'a>(
4472 &'a mut self,
4473 request: DeleteStreamRequest,
4474 placement: ShardPlacement,
4475 ) -> GroupDeleteStreamFuture<'a> {
4476 Box::pin(async move {
4477 self.ensure_ready()?;
4478 let command = GroupWriteCommand::from(request);
4479 let mut preview = self.inner.clone();
4480 let response = match preview.apply_committed_write(command.clone(), placement)? {
4481 GroupWriteResponse::DeleteStream(response) => response,
4482 other => {
4483 return Err(GroupEngineError::new(format!(
4484 "unexpected delete stream write response: {other:?}"
4485 )));
4486 }
4487 };
4488 self.append_record(&command)?;
4489 self.inner = preview;
4490 Ok(response)
4491 })
4492 }
4493
4494 fn append<'a>(
4495 &'a mut self,
4496 request: AppendRequest,
4497 placement: ShardPlacement,
4498 ) -> GroupAppendFuture<'a> {
4499 Box::pin(async move {
4500 self.ensure_ready()?;
4501 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4502 let command = GroupWriteCommand::from(request);
4503 let mut preview = self.inner.clone();
4504 let response = match preview.apply_committed_write(command.clone(), placement)? {
4505 GroupWriteResponse::Append(response) => response,
4506 other => {
4507 return Err(GroupEngineError::new(format!(
4508 "unexpected append write response: {other:?}"
4509 )));
4510 }
4511 };
4512 self.append_record(&command)?;
4513 self.inner = preview;
4514 Ok(response)
4515 })
4516 }
4517
4518 fn append_with_cold_admission<'a>(
4519 &'a mut self,
4520 request: AppendRequest,
4521 placement: ShardPlacement,
4522 admission: ColdWriteAdmission,
4523 ) -> GroupAppendFuture<'a> {
4524 if !admission.is_enabled() {
4525 return self.append(request, placement);
4526 }
4527 Box::pin(async move {
4528 self.ensure_ready()?;
4529 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4530 let command = GroupWriteCommand::from(request.clone());
4531 let mut preview = self.inner.clone();
4532 let response = preview.append_with_admission_inner(request, placement, admission)?;
4533 if !response.deduplicated {
4534 self.append_record(&command)?;
4535 }
4536 self.inner = preview;
4537 Ok(response)
4538 })
4539 }
4540
4541 fn append_batch<'a>(
4542 &'a mut self,
4543 request: AppendBatchRequest,
4544 placement: ShardPlacement,
4545 ) -> GroupAppendBatchFuture<'a> {
4546 Box::pin(async move {
4547 self.ensure_ready()?;
4548 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4549 let command = GroupWriteCommand::from(request);
4550 let mut preview = self.inner.clone();
4551 let response = match preview.apply_committed_write(command.clone(), placement)? {
4552 GroupWriteResponse::AppendBatch(response) => response,
4553 other => {
4554 return Err(GroupEngineError::new(format!(
4555 "unexpected append batch write response: {other:?}"
4556 )));
4557 }
4558 };
4559 if response
4560 .items
4561 .iter()
4562 .any(|item| matches!(item, Ok(response) if !response.deduplicated))
4563 {
4564 self.append_record(&command)?;
4565 }
4566 self.inner = preview;
4567 Ok(response)
4568 })
4569 }
4570
4571 fn append_batch_with_cold_admission<'a>(
4572 &'a mut self,
4573 request: AppendBatchRequest,
4574 placement: ShardPlacement,
4575 admission: ColdWriteAdmission,
4576 ) -> GroupAppendBatchFuture<'a> {
4577 if !admission.is_enabled() {
4578 return self.append_batch(request, placement);
4579 }
4580 Box::pin(async move {
4581 self.ensure_ready()?;
4582 self.commit_access_if_needed(&request.stream_id, request.now_ms, false, placement)?;
4583 let command = GroupWriteCommand::from(request.clone());
4584 let mut preview = self.inner.clone();
4585 let response =
4586 preview.append_batch_with_admission_inner(request, placement, admission)?;
4587 if response
4588 .items
4589 .iter()
4590 .any(|item| matches!(item, Ok(response) if !response.deduplicated))
4591 {
4592 self.append_record(&command)?;
4593 }
4594 self.inner = preview;
4595 Ok(response)
4596 })
4597 }
4598
4599 fn flush_cold<'a>(
4600 &'a mut self,
4601 request: FlushColdRequest,
4602 placement: ShardPlacement,
4603 ) -> GroupFlushColdFuture<'a> {
4604 Box::pin(async move {
4605 self.ensure_ready()?;
4606 let command = GroupWriteCommand::from(request);
4607 let mut preview = self.inner.clone();
4608 let response = match preview.apply_committed_write(command.clone(), placement)? {
4609 GroupWriteResponse::FlushCold(response) => response,
4610 other => {
4611 return Err(GroupEngineError::new(format!(
4612 "unexpected flush cold write response: {other:?}"
4613 )));
4614 }
4615 };
4616 self.append_record(&command)?;
4617 self.inner = preview;
4618 Ok(response)
4619 })
4620 }
4621
4622 fn plan_cold_flush<'a>(
4623 &'a mut self,
4624 request: PlanColdFlushRequest,
4625 placement: ShardPlacement,
4626 ) -> GroupPlanColdFlushFuture<'a> {
4627 Box::pin(async move {
4628 self.ensure_ready()?;
4629 self.inner.plan_cold_flush(request, placement).await
4630 })
4631 }
4632
4633 fn plan_next_cold_flush<'a>(
4634 &'a mut self,
4635 request: PlanGroupColdFlushRequest,
4636 placement: ShardPlacement,
4637 ) -> GroupPlanNextColdFlushFuture<'a> {
4638 Box::pin(async move {
4639 self.ensure_ready()?;
4640 self.inner.plan_next_cold_flush(request, placement).await
4641 })
4642 }
4643
4644 fn plan_next_cold_flush_batch<'a>(
4645 &'a mut self,
4646 request: PlanGroupColdFlushRequest,
4647 placement: ShardPlacement,
4648 max_candidates: usize,
4649 ) -> GroupPlanNextColdFlushBatchFuture<'a> {
4650 Box::pin(async move {
4651 self.ensure_ready()?;
4652 self.inner
4653 .plan_next_cold_flush_batch(request, placement, max_candidates)
4654 .await
4655 })
4656 }
4657
4658 fn cold_hot_backlog<'a>(
4659 &'a mut self,
4660 stream_id: BucketStreamId,
4661 placement: ShardPlacement,
4662 ) -> GroupColdHotBacklogFuture<'a> {
4663 Box::pin(async move {
4664 self.ensure_ready()?;
4665 self.inner.cold_hot_backlog(stream_id, placement).await
4666 })
4667 }
4668
4669 fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
4670 Box::pin(async move {
4671 self.ensure_ready()?;
4672 self.inner.snapshot(placement).await
4673 })
4674 }
4675
4676 fn install_snapshot<'a>(
4677 &'a mut self,
4678 snapshot: GroupSnapshot,
4679 ) -> GroupInstallSnapshotFuture<'a> {
4680 Box::pin(async move {
4681 self.ensure_ready()?;
4682 let mut preview = self.inner.clone();
4683 preview.install_snapshot(snapshot.clone()).await?;
4684 self.append_snapshot_record(&snapshot)?;
4685 self.inner = preview;
4686 Ok(())
4687 })
4688 }
4689}
4690
4691fn group_log_path(root: &Path, placement: ShardPlacement) -> PathBuf {
4692 root.join(format!("core-{}", placement.core_id.0))
4693 .join(format!("group-{}.jsonl", placement.raft_group_id.0))
4694}
4695
4696fn replay_group_log(log_path: &Path) -> Result<InMemoryGroupEngine, GroupEngineError> {
4697 if !log_path.exists() {
4698 return Ok(InMemoryGroupEngine::default());
4699 }
4700
4701 let file = File::open(log_path).map_err(|err| {
4702 GroupEngineError::new(format!("open WAL '{}': {err}", log_path.display()))
4703 })?;
4704 let reader = BufReader::new(file);
4705 let mut inner = InMemoryGroupEngine::default();
4706 for (line_index, line) in reader.lines().enumerate() {
4707 let line = line.map_err(|err| {
4708 GroupEngineError::new(format!(
4709 "read WAL '{}' line {}: {err}",
4710 log_path.display(),
4711 line_index + 1
4712 ))
4713 })?;
4714 if line.trim().is_empty() {
4715 continue;
4716 }
4717 if let Ok(record) = serde_json::from_str::<WalRecord>(&line) {
4718 match record {
4719 WalRecord::Command { command } => inner
4720 .apply_replayed_write_command(*command)
4721 .map_err(|err| {
4722 GroupEngineError::new(format!(
4723 "replay WAL command '{}' line {}: {err}",
4724 log_path.display(),
4725 line_index + 1
4726 ))
4727 })?,
4728 WalRecord::Snapshot {
4729 group_commit_index,
4730 stream_snapshot,
4731 stream_append_counts,
4732 } => inner
4733 .install_snapshot_parts(
4734 group_commit_index,
4735 stream_snapshot,
4736 stream_append_counts,
4737 )
4738 .map_err(|err| {
4739 GroupEngineError::new(format!(
4740 "replay WAL snapshot '{}' line {}: {err}",
4741 log_path.display(),
4742 line_index + 1
4743 ))
4744 })?,
4745 }
4746 continue;
4747 }
4748
4749 let command = serde_json::from_str::<StreamCommand>(&line).map_err(|err| {
4750 GroupEngineError::new(format!(
4751 "decode WAL '{}' line {}: {err}",
4752 log_path.display(),
4753 line_index + 1
4754 ))
4755 })?;
4756 inner.apply_replayed_command(command).map_err(|err| {
4757 GroupEngineError::new(format!(
4758 "replay WAL '{}' line {}: {err}",
4759 log_path.display(),
4760 line_index + 1
4761 ))
4762 })?;
4763 }
4764 Ok(inner)
4765}
4766
4767fn ensure_bucket_exists(
4768 state_machine: &mut StreamStateMachine,
4769 stream_id: &BucketStreamId,
4770) -> Result<(), GroupEngineError> {
4771 if state_machine.bucket_exists(&stream_id.bucket_id) {
4772 return Ok(());
4773 }
4774
4775 match state_machine.apply(StreamCommand::CreateBucket {
4776 bucket_id: stream_id.bucket_id.clone(),
4777 }) {
4778 StreamResponse::BucketCreated { .. } | StreamResponse::BucketAlreadyExists { .. } => Ok(()),
4779 StreamResponse::Error {
4780 code,
4781 message,
4782 next_offset,
4783 } => Err(GroupEngineError::stream_with_next_offset(
4784 code,
4785 message,
4786 next_offset,
4787 )),
4788 other => Err(GroupEngineError::new(format!(
4789 "unexpected create bucket response: {other:?}"
4790 ))),
4791 }
4792}
4793
4794fn stream_response_error(response: StreamResponse) -> GroupEngineError {
4795 match response {
4796 StreamResponse::Error {
4797 code,
4798 message,
4799 next_offset,
4800 } => GroupEngineError::stream_with_next_offset(code, message, next_offset),
4801 other => GroupEngineError::new(format!("unexpected stream response error: {other:?}")),
4802 }
4803}
4804
4805fn restore_stream_append_counts(
4806 counts: Vec<StreamAppendCount>,
4807 snapshot_stream_ids: &HashSet<BucketStreamId>,
4808) -> Result<HashMap<BucketStreamId, u64>, GroupEngineError> {
4809 let mut restored = HashMap::with_capacity(counts.len());
4810 for count in counts {
4811 if !snapshot_stream_ids.contains(&count.stream_id) {
4812 return Err(GroupEngineError::new(format!(
4813 "append count references missing snapshot stream '{}'",
4814 count.stream_id
4815 )));
4816 }
4817 if restored
4818 .insert(count.stream_id.clone(), count.append_count)
4819 .is_some()
4820 {
4821 return Err(GroupEngineError::new(format!(
4822 "snapshot contains duplicate append count for stream '{}'",
4823 count.stream_id
4824 )));
4825 }
4826 }
4827 Ok(restored)
4828}
4829
4830fn compare_stream_ids(left: &BucketStreamId, right: &BucketStreamId) -> std::cmp::Ordering {
4831 left.bucket_id
4832 .cmp(&right.bucket_id)
4833 .then_with(|| left.stream_id.cmp(&right.stream_id))
4834}
4835
/// Settings consumed by `ShardRuntime::spawn*` when building the runtime.
#[derive(Debug, Clone)]
pub struct RuntimeConfig {
    /// Number of cores; passed to `StaticShardMap::new`.
    pub core_count: usize,
    /// Number of raft groups; passed to `StaticShardMap::new`.
    pub raft_group_count: usize,
    /// Capacity used for each core mailbox channel, the per-group mailbox capacity handed to
    /// core workers, and the read-materialization semaphore. Values below 1 are treated as 1.
    pub mailbox_capacity: usize,
    /// Threading mode handed to `spawn_core_worker` for every core worker.
    pub threading: RuntimeThreading,
    /// Becomes `ColdWriteAdmission::max_hot_bytes_per_group`.
    /// NOTE(review): `None` presumably disables cold-write admission — confirm against
    /// `ColdWriteAdmission::is_enabled`.
    pub cold_max_hot_bytes_per_group: Option<u64>,
    /// Passed unchanged to every core worker.
    /// NOTE(review): presumably caps concurrent live-read waiters per core, with `None`
    /// meaning unlimited — confirm in `CoreWorker`.
    pub live_read_max_waiters_per_core: Option<u64>,
}
4845
4846impl RuntimeConfig {
4847 pub fn new(core_count: usize, raft_group_count: usize) -> Self {
4848 Self {
4849 core_count,
4850 raft_group_count,
4851 mailbox_capacity: 1024,
4852 threading: RuntimeThreading::ThreadPerCore,
4853 cold_max_hot_bytes_per_group: None,
4854 live_read_max_waiters_per_core: Some(65_536),
4855 }
4856 }
4857
4858 pub fn with_cold_max_hot_bytes_per_group(mut self, value: Option<u64>) -> Self {
4859 self.cold_max_hot_bytes_per_group = value;
4860 self
4861 }
4862
4863 pub fn with_live_read_max_waiters_per_core(mut self, value: Option<u64>) -> Self {
4864 self.live_read_max_waiters_per_core = value;
4865 self
4866 }
4867}
4868
/// Selects how core workers are started; the value is handed to `spawn_core_worker`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RuntimeThreading {
    /// Default chosen by `RuntimeConfig::new`.
    /// NOTE(review): presumably runs each core worker on its own dedicated thread — confirm in
    /// `spawn_core_worker`.
    ThreadPerCore,
    /// NOTE(review): presumably runs core workers as tasks on the surrounding Tokio runtime —
    /// confirm in `spawn_core_worker`.
    HostedTokio,
}
4874
/// Cloneable handle used to send commands to the per-core workers.
#[derive(Debug, Clone)]
pub struct ShardRuntime {
    /// Maps stream ids (and raft groups) to their placement.
    shard_map: StaticShardMap,
    /// One mailbox per core, indexed by `placement.core_id.0`.
    mailboxes: Vec<CoreMailbox>,
    /// Metrics shared with every core worker.
    metrics: Arc<RuntimeMetricsInner>,
    /// Source of waiter ids for `wait_read_stream`; starts at 1.
    next_waiter_id: Arc<AtomicU64>,
    /// Optional cold store; required by the cold-flush upload paths.
    cold_store: Option<ColdStoreHandle>,
}
4883
4884impl ShardRuntime {
4885 pub fn spawn(config: RuntimeConfig) -> Result<Self, RuntimeError> {
4886 Self::spawn_with_engine_factory(config, InMemoryGroupEngineFactory::default())
4887 }
4888
4889 pub fn spawn_with_engine_factory(
4890 config: RuntimeConfig,
4891 engine_factory: impl GroupEngineFactory,
4892 ) -> Result<Self, RuntimeError> {
4893 Self::spawn_with_engine_factory_and_cold_store(config, engine_factory, None)
4894 }
4895
4896 pub fn spawn_with_engine_factory_and_cold_store(
4897 config: RuntimeConfig,
4898 engine_factory: impl GroupEngineFactory,
4899 cold_store: Option<ColdStoreHandle>,
4900 ) -> Result<Self, RuntimeError> {
4901 let shard_map = StaticShardMap::new(config.core_count, config.raft_group_count)?;
4902 let metrics = Arc::new(RuntimeMetricsInner::new(
4903 usize::from(shard_map.core_count()),
4904 usize::try_from(shard_map.raft_group_count()).expect("u32 fits usize"),
4905 ));
4906 let cold_write_admission = ColdWriteAdmission {
4907 max_hot_bytes_per_group: config.cold_max_hot_bytes_per_group,
4908 };
4909 let engine_factory: Arc<dyn GroupEngineFactory> = Arc::new(engine_factory);
4910 let read_materialization = Arc::new(Semaphore::new(config.mailbox_capacity.max(1)));
4911 let mut mailboxes = Vec::with_capacity(usize::from(shard_map.core_count()));
4912 for raw_core_id in 0..shard_map.core_count() {
4913 let core_id = CoreId(raw_core_id);
4914 let (tx, rx) = mpsc::channel(config.mailbox_capacity.max(1));
4915 let worker = CoreWorker {
4916 core_id,
4917 rx,
4918 engine_factory: engine_factory.clone(),
4919 groups: HashMap::new(),
4920 metrics: metrics.clone(),
4921 group_mailbox_capacity: config.mailbox_capacity.max(1),
4922 cold_write_admission,
4923 live_read_max_waiters_per_core: config.live_read_max_waiters_per_core,
4924 read_materialization: read_materialization.clone(),
4925 };
4926 spawn_core_worker(config.threading, worker)?;
4927 mailboxes.push(CoreMailbox { core_id, tx });
4928 }
4929 Ok(Self {
4930 shard_map,
4931 mailboxes,
4932 metrics,
4933 next_waiter_id: Arc::new(AtomicU64::new(1)),
4934 cold_store,
4935 })
4936 }
4937
4938 pub fn locate(&self, stream_id: &BucketStreamId) -> ShardPlacement {
4939 self.shard_map.locate(stream_id)
4940 }
4941
4942 pub fn has_cold_store(&self) -> bool {
4943 self.cold_store.is_some()
4944 }
4945
4946 pub fn cold_store(&self) -> Option<ColdStoreHandle> {
4947 self.cold_store.clone()
4948 }
4949
4950 pub async fn create_stream(
4951 &self,
4952 request: CreateStreamRequest,
4953 ) -> Result<CreateStreamResponse, RuntimeError> {
4954 if request.forked_from.is_some() {
4955 return self.create_fork_stream(request).await;
4956 }
4957 self.create_stream_on_owner(request).await
4958 }
4959
4960 pub async fn create_stream_external(
4961 &self,
4962 request: CreateStreamExternalRequest,
4963 ) -> Result<CreateStreamResponse, RuntimeError> {
4964 let placement = self.shard_map.locate(&request.stream_id);
4965 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
4966 let (response_tx, response_rx) = oneshot::channel();
4967 self.send_core_command(
4968 mailbox,
4969 CoreCommand::CreateExternal {
4970 request,
4971 placement,
4972 response_tx,
4973 },
4974 response_rx,
4975 )
4976 .await
4977 }
4978
4979 async fn create_stream_on_owner(
4980 &self,
4981 request: CreateStreamRequest,
4982 ) -> Result<CreateStreamResponse, RuntimeError> {
4983 let placement = self.shard_map.locate(&request.stream_id);
4984 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
4985 let (response_tx, response_rx) = oneshot::channel();
4986 self.send_core_command(
4987 mailbox,
4988 CoreCommand::CreateStream {
4989 request,
4990 placement,
4991 response_tx,
4992 },
4993 response_rx,
4994 )
4995 .await
4996 }
4997
    /// Creates a stream forked from `request.forked_from`.
    ///
    /// Steps, in order:
    /// 1. Read the source stream's head to validate content type and fork offset.
    /// 2. Read the source bytes `[0, fork_offset)` as the fork's initial payload.
    /// 3. Add a fork reference on the source stream.
    /// 4. Create the fork on its owner; release the fork reference again when the fork
    ///    already existed or creation failed.
    ///
    /// # Panics
    ///
    /// Panics if `request.forked_from` is `None`; `create_stream` checks this before calling.
    async fn create_fork_stream(
        &self,
        mut request: CreateStreamRequest,
    ) -> Result<CreateStreamResponse, RuntimeError> {
        let source_id = request
            .forked_from
            .clone()
            .expect("forked_from checked before create_fork_stream");
        let now_ms = request.now_ms;
        let source_placement = self.shard_map.locate(&source_id);
        let source_head = self
            .head_stream(HeadStreamRequest {
                stream_id: source_id.clone(),
                now_ms,
            })
            .await
            .map_err(|err| map_fork_source_ref_error(err, source_placement))?;

        // An explicitly requested content type must match the source; otherwise inherit it.
        if request.content_type_explicit {
            if request.content_type != source_head.content_type {
                return Err(RuntimeError::group_engine(
                    source_placement,
                    GroupEngineError::stream(
                        StreamErrorCode::ContentTypeMismatch,
                        format!(
                            "fork content type '{}' does not match source content type '{}'",
                            request.content_type, source_head.content_type
                        ),
                    ),
                ));
            }
        } else {
            request.content_type.clone_from(&source_head.content_type);
        }

        // Default to forking at the source's current tail; never fork beyond it.
        let fork_offset = request.fork_offset.unwrap_or(source_head.tail_offset);
        if fork_offset > source_head.tail_offset {
            return Err(RuntimeError::group_engine(
                source_placement,
                GroupEngineError::stream(
                    StreamErrorCode::InvalidFork,
                    format!(
                        "fork offset {fork_offset} is beyond source stream '{}' tail {}",
                        source_id, source_head.tail_offset
                    ),
                ),
            ));
        }

        // The copied prefix is read in a single request, so its length must fit in `usize`.
        let max_len = usize::try_from(fork_offset).map_err(|_| {
            RuntimeError::group_engine(
                source_placement,
                GroupEngineError::stream(
                    StreamErrorCode::InvalidFork,
                    format!("fork offset {fork_offset} cannot fit in memory on this host"),
                ),
            )
        })?;
        request.initial_payload = if fork_offset == 0 {
            Bytes::new()
        } else {
            // NOTE(review): assumes one read returns the full `[0, fork_offset)` prefix —
            // confirm `read_stream` does not cap the response below `max_len`.
            self.read_stream(ReadStreamRequest {
                stream_id: source_id.clone(),
                offset: 0,
                max_len,
                now_ms,
            })
            .await?
            .payload
            .into()
        };
        // Take the fork reference before creating the fork; it is released below on the
        // paths where no new fork ends up referencing the source.
        self.add_fork_ref_on_owner(source_id.clone(), now_ms)
            .await
            .map_err(|err| map_fork_source_ref_error(err, source_placement))?;
        // These request fields are always reset for a fork, whatever the caller supplied.
        request.close_after = false;
        request.stream_seq = None;
        request.producer = None;
        // Inherit the source's TTL/expiry only when the caller specified neither.
        if request.stream_ttl_seconds.is_none() && request.stream_expires_at_ms.is_none() {
            request.stream_ttl_seconds = source_head.stream_ttl_seconds;
            request.stream_expires_at_ms = source_head.stream_expires_at_ms;
        }
        request.fork_offset = Some(fork_offset);
        match self.create_stream_on_owner(request).await {
            // The fork already existed, so the reference taken above is surplus.
            Ok(response) if response.already_exists => {
                self.release_fork_ref_cascade(source_id).await?;
                Ok(response)
            }
            Ok(response) => Ok(response),
            Err(err) => {
                // Best-effort rollback: the creation error is returned even if the release
                // fails.
                let _ = self.release_fork_ref_cascade(source_id).await;
                Err(err)
            }
        }
    }
5092
5093 pub async fn head_stream(
5094 &self,
5095 request: HeadStreamRequest,
5096 ) -> Result<HeadStreamResponse, RuntimeError> {
5097 let placement = self.shard_map.locate(&request.stream_id);
5098 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5099 let (response_tx, response_rx) = oneshot::channel();
5100 self.send_core_command(
5101 mailbox,
5102 CoreCommand::HeadStream {
5103 request,
5104 placement,
5105 response_tx,
5106 },
5107 response_rx,
5108 )
5109 .await
5110 }
5111
5112 pub async fn read_stream(
5113 &self,
5114 request: ReadStreamRequest,
5115 ) -> Result<ReadStreamResponse, RuntimeError> {
5116 let placement = self.shard_map.locate(&request.stream_id);
5117 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5118 let (response_tx, response_rx) = oneshot::channel();
5119 self.send_core_command(
5120 mailbox,
5121 CoreCommand::ReadStream {
5122 request,
5123 placement,
5124 response_tx,
5125 },
5126 response_rx,
5127 )
5128 .await
5129 }
5130
5131 pub async fn publish_snapshot(
5132 &self,
5133 request: PublishSnapshotRequest,
5134 ) -> Result<PublishSnapshotResponse, RuntimeError> {
5135 let placement = self.shard_map.locate(&request.stream_id);
5136 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5137 let (response_tx, response_rx) = oneshot::channel();
5138 self.send_core_command(
5139 mailbox,
5140 CoreCommand::PublishSnapshot {
5141 request,
5142 placement,
5143 response_tx,
5144 },
5145 response_rx,
5146 )
5147 .await
5148 }
5149
5150 pub async fn read_snapshot(
5151 &self,
5152 request: ReadSnapshotRequest,
5153 ) -> Result<ReadSnapshotResponse, RuntimeError> {
5154 let placement = self.shard_map.locate(&request.stream_id);
5155 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5156 let (response_tx, response_rx) = oneshot::channel();
5157 self.send_core_command(
5158 mailbox,
5159 CoreCommand::ReadSnapshot {
5160 request,
5161 placement,
5162 response_tx,
5163 },
5164 response_rx,
5165 )
5166 .await
5167 }
5168
5169 pub async fn delete_snapshot(
5170 &self,
5171 request: DeleteSnapshotRequest,
5172 ) -> Result<(), RuntimeError> {
5173 let placement = self.shard_map.locate(&request.stream_id);
5174 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5175 let (response_tx, response_rx) = oneshot::channel();
5176 self.send_core_command(
5177 mailbox,
5178 CoreCommand::DeleteSnapshot {
5179 request,
5180 placement,
5181 response_tx,
5182 },
5183 response_rx,
5184 )
5185 .await
5186 }
5187
5188 pub async fn bootstrap_stream(
5189 &self,
5190 request: BootstrapStreamRequest,
5191 ) -> Result<BootstrapStreamResponse, RuntimeError> {
5192 let placement = self.shard_map.locate(&request.stream_id);
5193 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5194 let (response_tx, response_rx) = oneshot::channel();
5195 self.send_core_command(
5196 mailbox,
5197 CoreCommand::BootstrapStream {
5198 request,
5199 placement,
5200 response_tx,
5201 },
5202 response_rx,
5203 )
5204 .await
5205 }
5206
    /// Registers a waiting read on the owning core and awaits its reply.
    ///
    /// Unlike the plain request methods, the command is only enqueued
    /// (`enqueue_core_command`) and the reply is awaited here, so that a cancel guard can be
    /// held while waiting.
    ///
    /// # Errors
    ///
    /// Returns `RuntimeError::ResponseDropped` when the reply channel is closed without a
    /// reply, any error from enqueueing the command, or the error carried in the reply.
    pub async fn wait_read_stream(
        &self,
        request: ReadStreamRequest,
    ) -> Result<ReadStreamResponse, RuntimeError> {
        let placement = self.shard_map.locate(&request.stream_id);
        let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
        // Waiter ids only need to be distinct, hence the relaxed ordering.
        let waiter_id = self.next_waiter_id.fetch_add(1, Ordering::Relaxed);
        // Cloned because `request` moves into the command but the guard needs the id too.
        let stream_id = request.stream_id.clone();
        let (response_tx, response_rx) = oneshot::channel();
        self.enqueue_core_command(
            mailbox,
            CoreCommand::WaitRead {
                request,
                placement,
                waiter_id,
                response_tx,
            },
        )
        .await?;
        // NOTE(review): `WaitReadCancel` presumably tells the core to drop this waiter if the
        // guard is dropped while still armed (caller cancellation, or the early return
        // below) — confirm in its `Drop` impl.
        let mut cancel = WaitReadCancel::new(mailbox.tx.clone(), stream_id, placement, waiter_id);
        let response = response_rx
            .await
            .map_err(|_| RuntimeError::ResponseDropped {
                core_id: mailbox.core_id,
            })?;
        // A reply arrived, so the guard is disarmed before returning.
        cancel.disarm();
        response
    }
5235
5236 pub async fn require_local_live_read_owner(
5237 &self,
5238 stream_id: &BucketStreamId,
5239 ) -> Result<(), RuntimeError> {
5240 let placement = self.shard_map.locate(stream_id);
5241 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5242 let (response_tx, response_rx) = oneshot::channel();
5243 self.send_core_command(
5244 mailbox,
5245 CoreCommand::RequireLiveReadOwner {
5246 placement,
5247 response_tx,
5248 },
5249 response_rx,
5250 )
5251 .await
5252 }
5253
5254 pub async fn close_stream(
5255 &self,
5256 request: CloseStreamRequest,
5257 ) -> Result<CloseStreamResponse, RuntimeError> {
5258 let placement = self.shard_map.locate(&request.stream_id);
5259 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5260 let (response_tx, response_rx) = oneshot::channel();
5261 self.send_core_command(
5262 mailbox,
5263 CoreCommand::CloseStream {
5264 request,
5265 placement,
5266 response_tx,
5267 },
5268 response_rx,
5269 )
5270 .await
5271 }
5272
5273 pub async fn delete_stream(
5274 &self,
5275 request: DeleteStreamRequest,
5276 ) -> Result<DeleteStreamResponse, RuntimeError> {
5277 let response = self.delete_stream_on_owner(request).await?;
5278 if let Some(parent_to_release) = response.parent_to_release.clone() {
5279 self.release_fork_ref_cascade(parent_to_release).await?;
5280 }
5281 Ok(response)
5282 }
5283
5284 async fn delete_stream_on_owner(
5285 &self,
5286 request: DeleteStreamRequest,
5287 ) -> Result<DeleteStreamResponse, RuntimeError> {
5288 let placement = self.shard_map.locate(&request.stream_id);
5289 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5290 let (response_tx, response_rx) = oneshot::channel();
5291 self.send_core_command(
5292 mailbox,
5293 CoreCommand::DeleteStream {
5294 request,
5295 placement,
5296 response_tx,
5297 },
5298 response_rx,
5299 )
5300 .await
5301 }
5302
5303 async fn add_fork_ref_on_owner(
5304 &self,
5305 stream_id: BucketStreamId,
5306 now_ms: u64,
5307 ) -> Result<ForkRefResponse, RuntimeError> {
5308 let placement = self.shard_map.locate(&stream_id);
5309 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5310 let (response_tx, response_rx) = oneshot::channel();
5311 self.send_core_command(
5312 mailbox,
5313 CoreCommand::AddForkRef {
5314 stream_id,
5315 now_ms,
5316 placement,
5317 response_tx,
5318 },
5319 response_rx,
5320 )
5321 .await
5322 }
5323
5324 async fn release_fork_ref_on_owner(
5325 &self,
5326 stream_id: BucketStreamId,
5327 ) -> Result<ForkRefResponse, RuntimeError> {
5328 let placement = self.shard_map.locate(&stream_id);
5329 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5330 let (response_tx, response_rx) = oneshot::channel();
5331 self.send_core_command(
5332 mailbox,
5333 CoreCommand::ReleaseForkRef {
5334 stream_id,
5335 placement,
5336 response_tx,
5337 },
5338 response_rx,
5339 )
5340 .await
5341 }
5342
5343 async fn release_fork_ref_cascade(
5344 &self,
5345 stream_id: BucketStreamId,
5346 ) -> Result<(), RuntimeError> {
5347 let mut next = Some(stream_id);
5348 while let Some(current) = next {
5349 let response = self.release_fork_ref_on_owner(current).await?;
5350 next = response.parent_to_release;
5351 }
5352 Ok(())
5353 }
5354
5355 pub async fn flush_cold(
5356 &self,
5357 request: FlushColdRequest,
5358 ) -> Result<FlushColdResponse, RuntimeError> {
5359 let placement = self.shard_map.locate(&request.stream_id);
5360 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5361 let (response_tx, response_rx) = oneshot::channel();
5362 self.send_core_command(
5363 mailbox,
5364 CoreCommand::FlushCold {
5365 request,
5366 placement,
5367 response_tx,
5368 },
5369 response_rx,
5370 )
5371 .await
5372 }
5373
5374 pub async fn append_external(
5375 &self,
5376 request: AppendExternalRequest,
5377 ) -> Result<AppendResponse, RuntimeError> {
5378 let placement = self.shard_map.locate(&request.stream_id);
5379 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5380 let (response_tx, response_rx) = oneshot::channel();
5381 self.send_core_command(
5382 mailbox,
5383 CoreCommand::AppendExternal {
5384 request,
5385 placement,
5386 response_tx,
5387 },
5388 response_rx,
5389 )
5390 .await
5391 }
5392
5393 pub async fn plan_cold_flush(
5394 &self,
5395 request: PlanColdFlushRequest,
5396 ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
5397 let placement = self.shard_map.locate(&request.stream_id);
5398 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5399 let (response_tx, response_rx) = oneshot::channel();
5400 self.send_core_command(
5401 mailbox,
5402 CoreCommand::PlanColdFlush {
5403 request,
5404 placement,
5405 response_tx,
5406 },
5407 response_rx,
5408 )
5409 .await
5410 }
5411
5412 pub async fn flush_cold_once(
5413 &self,
5414 request: PlanColdFlushRequest,
5415 ) -> Result<Option<FlushColdResponse>, RuntimeError> {
5416 let Some(candidate) = self.plan_cold_flush(request).await? else {
5417 return Ok(None);
5418 };
5419 self.flush_cold_candidate(candidate).await.map(Some)
5420 }
5421
5422 pub async fn plan_next_cold_flush(
5423 &self,
5424 raft_group_id: RaftGroupId,
5425 request: PlanGroupColdFlushRequest,
5426 ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
5427 let placement = self.placement_for_group(raft_group_id)?;
5428 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5429 let (response_tx, response_rx) = oneshot::channel();
5430 self.send_core_command(
5431 mailbox,
5432 CoreCommand::PlanNextColdFlush {
5433 request,
5434 placement,
5435 response_tx,
5436 },
5437 response_rx,
5438 )
5439 .await
5440 }
5441
5442 pub async fn plan_next_cold_flush_batch(
5443 &self,
5444 raft_group_id: RaftGroupId,
5445 request: PlanGroupColdFlushRequest,
5446 max_candidates: usize,
5447 ) -> Result<Vec<ColdFlushCandidate>, RuntimeError> {
5448 let placement = self.placement_for_group(raft_group_id)?;
5449 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5450 let (response_tx, response_rx) = oneshot::channel();
5451 self.send_core_command(
5452 mailbox,
5453 CoreCommand::PlanNextColdFlushBatch {
5454 request,
5455 placement,
5456 max_candidates,
5457 response_tx,
5458 },
5459 response_rx,
5460 )
5461 .await
5462 }
5463
5464 pub async fn flush_cold_group_once(
5465 &self,
5466 raft_group_id: RaftGroupId,
5467 request: PlanGroupColdFlushRequest,
5468 ) -> Result<Option<FlushColdResponse>, RuntimeError> {
5469 let Some(candidate) = self.plan_next_cold_flush(raft_group_id, request).await? else {
5470 return Ok(None);
5471 };
5472 match self.flush_cold_candidate(candidate).await {
5473 Ok(response) => Ok(Some(response)),
5474 Err(err) if is_stale_cold_flush_candidate_error(&err) => Ok(None),
5475 Err(err) => Err(err),
5476 }
5477 }
5478
5479 pub async fn flush_cold_group_batch_once(
5480 &self,
5481 raft_group_id: RaftGroupId,
5482 request: PlanGroupColdFlushRequest,
5483 max_candidates: usize,
5484 ) -> Result<Vec<FlushColdResponse>, RuntimeError> {
5485 let candidates = self
5486 .plan_next_cold_flush_batch(raft_group_id, request, max_candidates)
5487 .await?;
5488 if candidates.is_empty() {
5489 return Ok(Vec::new());
5490 }
5491 match self.flush_cold_candidates_batch(candidates).await {
5492 Ok(responses) => Ok(responses),
5493 Err(err) if is_stale_cold_flush_candidate_error(&err) => Ok(Vec::new()),
5494 Err(err) => Err(err),
5495 }
5496 }
5497
5498 async fn flush_cold_candidate(
5499 &self,
5500 candidate: ColdFlushCandidate,
5501 ) -> Result<FlushColdResponse, RuntimeError> {
5502 let Some(cold_store) = self.cold_store.as_ref() else {
5503 return Err(RuntimeError::ColdStoreConfig {
5504 message: "URSULA_COLD_BACKEND must be configured before flushing cold chunks"
5505 .to_owned(),
5506 });
5507 };
5508 let path = new_cold_chunk_path(
5509 &candidate.stream_id,
5510 candidate.start_offset,
5511 candidate.end_offset,
5512 );
5513 let upload_started_at = Instant::now();
5514 let object_size = cold_store
5515 .write_chunk(&path, &candidate.payload)
5516 .await
5517 .map_err(|err| RuntimeError::ColdStoreIo {
5518 message: err.to_string(),
5519 })?;
5520 self.metrics
5521 .record_cold_upload(object_size, elapsed_ns(upload_started_at));
5522 let publish_started_at = Instant::now();
5523 let publish = self
5524 .flush_cold(FlushColdRequest {
5525 stream_id: candidate.stream_id,
5526 chunk: ColdChunkRef {
5527 start_offset: candidate.start_offset,
5528 end_offset: candidate.end_offset,
5529 s3_path: path.clone(),
5530 object_size,
5531 },
5532 })
5533 .await;
5534 match publish {
5535 Ok(response) => {
5536 self.metrics
5537 .record_cold_publish(object_size, elapsed_ns(publish_started_at));
5538 Ok(response)
5539 }
5540 Err(err) => {
5541 let cleanup_failed = cold_store.delete_chunk(&path).await.is_err();
5542 self.metrics
5543 .record_cold_orphan_cleanup(object_size, cleanup_failed);
5544 Err(err)
5545 }
5546 }
5547 }
5548
5549 async fn flush_cold_candidates_batch(
5550 &self,
5551 candidates: Vec<ColdFlushCandidate>,
5552 ) -> Result<Vec<FlushColdResponse>, RuntimeError> {
5553 let Some(cold_store) = self.cold_store.as_ref() else {
5554 return Err(RuntimeError::ColdStoreConfig {
5555 message: "URSULA_COLD_BACKEND must be configured before flushing cold chunks"
5556 .to_owned(),
5557 });
5558 };
5559 let mut requests = Vec::with_capacity(candidates.len());
5560 let mut uploaded = Vec::with_capacity(candidates.len());
5561 for candidate in candidates {
5562 let path = new_cold_chunk_path(
5563 &candidate.stream_id,
5564 candidate.start_offset,
5565 candidate.end_offset,
5566 );
5567 let upload_started_at = Instant::now();
5568 let object_size = cold_store
5569 .write_chunk(&path, &candidate.payload)
5570 .await
5571 .map_err(|err| RuntimeError::ColdStoreIo {
5572 message: err.to_string(),
5573 })?;
5574 self.metrics
5575 .record_cold_upload(object_size, elapsed_ns(upload_started_at));
5576 uploaded.push((path.clone(), object_size));
5577 requests.push(FlushColdRequest {
5578 stream_id: candidate.stream_id,
5579 chunk: ColdChunkRef {
5580 start_offset: candidate.start_offset,
5581 end_offset: candidate.end_offset,
5582 s3_path: path,
5583 object_size,
5584 },
5585 });
5586 }
5587
5588 let placement = self.shard_map.locate(&requests[0].stream_id);
5589 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5590 let (response_tx, response_rx) = oneshot::channel();
5591 let publish_started_at = Instant::now();
5592 let publish = self
5593 .send_core_command(
5594 mailbox,
5595 CoreCommand::FlushColdBatch {
5596 requests,
5597 placement,
5598 response_tx,
5599 },
5600 response_rx,
5601 )
5602 .await;
5603 match publish {
5604 Ok(responses) => {
5605 let publish_ns = elapsed_ns(publish_started_at);
5606 let per_chunk_publish_ns =
5607 publish_ns / u64::try_from(uploaded.len()).expect("uploaded len fits u64");
5608 for (_, object_size) in &uploaded {
5609 self.metrics
5610 .record_cold_publish(*object_size, per_chunk_publish_ns);
5611 }
5612 Ok(responses)
5613 }
5614 Err(err) => {
5615 for (path, object_size) in uploaded {
5616 let cleanup_failed = cold_store.delete_chunk(&path).await.is_err();
5617 self.metrics
5618 .record_cold_orphan_cleanup(object_size, cleanup_failed);
5619 }
5620 Err(err)
5621 }
5622 }
5623 }
5624
5625 pub async fn flush_cold_all_groups_once(
5626 &self,
5627 request: PlanGroupColdFlushRequest,
5628 ) -> Result<usize, RuntimeError> {
5629 self.flush_cold_all_groups_once_bounded(request, 1).await
5630 }
5631
5632 pub async fn flush_cold_all_groups_once_bounded(
5633 &self,
5634 request: PlanGroupColdFlushRequest,
5635 max_concurrency: usize,
5636 ) -> Result<usize, RuntimeError> {
5637 let max_concurrency = max_concurrency.max(1);
5638 if max_concurrency == 1 {
5639 return self.flush_cold_all_groups_once_serial(request).await;
5640 }
5641 let mut flushed = 0;
5642 let mut next_group_id = 0;
5643 let group_count = self.shard_map.raft_group_count();
5644 let mut tasks = JoinSet::new();
5645
5646 while next_group_id < group_count || !tasks.is_empty() {
5647 while next_group_id < group_count && tasks.len() < max_concurrency {
5648 let runtime = self.clone();
5649 let request = request.clone();
5650 let group_id = RaftGroupId(next_group_id);
5651 next_group_id += 1;
5652 tasks.spawn(async move {
5653 runtime
5654 .flush_cold_group_batch_once(
5655 group_id,
5656 request,
5657 COLD_FLUSH_GROUP_BATCH_MAX_CHUNKS,
5658 )
5659 .await
5660 .map(|responses| responses.len())
5661 });
5662 }
5663 if let Some(result) = tasks.join_next().await {
5664 match result {
5665 Ok(Ok(count)) => flushed += count,
5666 Ok(Err(err)) => return Err(err),
5667 Err(err) => {
5668 return Err(RuntimeError::ColdStoreIo {
5669 message: format!("cold flush task failed: {err}"),
5670 });
5671 }
5672 }
5673 }
5674 }
5675 Ok(flushed)
5676 }
5677
5678 async fn flush_cold_all_groups_once_serial(
5679 &self,
5680 request: PlanGroupColdFlushRequest,
5681 ) -> Result<usize, RuntimeError> {
5682 let mut flushed = 0;
5683 for group_id in 0..self.shard_map.raft_group_count() {
5684 flushed += self
5685 .flush_cold_group_batch_once(
5686 RaftGroupId(group_id),
5687 request.clone(),
5688 COLD_FLUSH_GROUP_BATCH_MAX_CHUNKS,
5689 )
5690 .await?
5691 .len();
5692 }
5693 Ok(flushed)
5694 }
5695
5696 pub async fn append(&self, request: AppendRequest) -> Result<AppendResponse, RuntimeError> {
5697 if request.payload.is_empty() {
5698 return Err(RuntimeError::EmptyAppend);
5699 }
5700 let placement = self.shard_map.locate(&request.stream_id);
5701 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5702 let (response_tx, response_rx) = oneshot::channel();
5703 self.send_core_command(
5704 mailbox,
5705 CoreCommand::Append {
5706 request,
5707 placement,
5708 response_tx,
5709 },
5710 response_rx,
5711 )
5712 .await
5713 }
5714
5715 pub async fn append_batch(
5716 &self,
5717 request: AppendBatchRequest,
5718 ) -> Result<AppendBatchResponse, RuntimeError> {
5719 if request.payloads.is_empty() {
5720 return Err(RuntimeError::EmptyAppend);
5721 }
5722 let placement = self.shard_map.locate(&request.stream_id);
5723 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5724 let (response_tx, response_rx) = oneshot::channel();
5725 self.send_core_command(
5726 mailbox,
5727 CoreCommand::AppendBatch {
5728 request,
5729 placement,
5730 response_tx,
5731 },
5732 response_rx,
5733 )
5734 .await
5735 }
5736
5737 pub async fn snapshot_group(
5738 &self,
5739 raft_group_id: RaftGroupId,
5740 ) -> Result<GroupSnapshot, RuntimeError> {
5741 let placement = self.placement_for_group(raft_group_id)?;
5742 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5743 let (response_tx, response_rx) = oneshot::channel();
5744 self.send_core_command(
5745 mailbox,
5746 CoreCommand::SnapshotGroup {
5747 placement,
5748 response_tx,
5749 },
5750 response_rx,
5751 )
5752 .await
5753 }
5754
5755 pub async fn install_group_snapshot(
5756 &self,
5757 snapshot: GroupSnapshot,
5758 ) -> Result<(), RuntimeError> {
5759 let expected = self.placement_for_group(snapshot.placement.raft_group_id)?;
5760 if snapshot.placement != expected {
5761 return Err(RuntimeError::SnapshotPlacementMismatch {
5762 expected,
5763 actual: snapshot.placement,
5764 });
5765 }
5766 let mailbox = &self.mailboxes[usize::from(expected.core_id.0)];
5767 let (response_tx, response_rx) = oneshot::channel();
5768 self.send_core_command(
5769 mailbox,
5770 CoreCommand::InstallGroupSnapshot {
5771 snapshot,
5772 response_tx,
5773 },
5774 response_rx,
5775 )
5776 .await
5777 }
5778
5779 pub async fn warm_group(
5780 &self,
5781 raft_group_id: RaftGroupId,
5782 ) -> Result<ShardPlacement, RuntimeError> {
5783 let placement = self.placement_for_group(raft_group_id)?;
5784 let mailbox = &self.mailboxes[usize::from(placement.core_id.0)];
5785 let (response_tx, response_rx) = oneshot::channel();
5786 self.send_core_command(
5787 mailbox,
5788 CoreCommand::WarmGroup {
5789 placement,
5790 response_tx,
5791 },
5792 response_rx,
5793 )
5794 .await
5795 }
5796
5797 pub async fn warm_all_groups(&self) -> Result<(), RuntimeError> {
5798 for raw_group_id in 0..self.shard_map.raft_group_count() {
5799 self.warm_group(RaftGroupId(raw_group_id)).await?;
5800 }
5801 Ok(())
5802 }
5803
5804 fn placement_for_group(
5805 &self,
5806 raft_group_id: RaftGroupId,
5807 ) -> Result<ShardPlacement, RuntimeError> {
5808 if raft_group_id.0 >= self.shard_map.raft_group_count() {
5809 return Err(RuntimeError::InvalidRaftGroup {
5810 raft_group_id,
5811 raft_group_count: self.shard_map.raft_group_count(),
5812 });
5813 }
5814 Ok(ShardPlacement {
5815 core_id: CoreId(
5816 (raft_group_id.0 % u32::from(self.shard_map.core_count()))
5817 .try_into()
5818 .expect("core id fits u16"),
5819 ),
5820 shard_id: ShardId(raft_group_id.0),
5821 raft_group_id,
5822 })
5823 }
5824
5825 async fn send_core_command<T>(
5826 &self,
5827 mailbox: &CoreMailbox,
5828 command: CoreCommand,
5829 response_rx: oneshot::Receiver<Result<T, RuntimeError>>,
5830 ) -> Result<T, RuntimeError> {
5831 self.enqueue_core_command(mailbox, command).await?;
5832 response_rx
5833 .await
5834 .map_err(|_| RuntimeError::ResponseDropped {
5835 core_id: mailbox.core_id,
5836 })?
5837 }
5838
5839 async fn enqueue_core_command(
5840 &self,
5841 mailbox: &CoreMailbox,
5842 command: CoreCommand,
5843 ) -> Result<(), RuntimeError> {
5844 if mailbox.tx.capacity() == 0 {
5845 self.metrics.record_mailbox_full(mailbox.core_id);
5846 }
5847 let started_at = Instant::now();
5848 mailbox
5849 .tx
5850 .send(command)
5851 .await
5852 .map_err(|_| RuntimeError::MailboxClosed {
5853 core_id: mailbox.core_id,
5854 })?;
5855 self.metrics
5856 .record_routed_request(mailbox.core_id, elapsed_ns(started_at));
5857 Ok(())
5858 }
5859
5860 pub fn metrics(&self) -> RuntimeMetrics {
5861 RuntimeMetrics {
5862 inner: self.metrics.clone(),
5863 }
5864 }
5865
5866 pub fn mailbox_snapshot(&self) -> RuntimeMailboxSnapshot {
5867 let depths = self
5868 .mailboxes
5869 .iter()
5870 .map(CoreMailbox::depth)
5871 .collect::<Vec<_>>();
5872 let capacities = self
5873 .mailboxes
5874 .iter()
5875 .map(CoreMailbox::capacity)
5876 .collect::<Vec<_>>();
5877 RuntimeMailboxSnapshot { depths, capacities }
5878 }
5879}
5880
5881fn spawn_core_worker(threading: RuntimeThreading, worker: CoreWorker) -> Result<(), RuntimeError> {
5882 let core_id = worker.core_id;
5883 match threading {
5884 RuntimeThreading::HostedTokio => {
5885 tokio::spawn(worker.run());
5886 Ok(())
5887 }
5888 RuntimeThreading::ThreadPerCore => std::thread::Builder::new()
5889 .name(format!("ursula-core-{}", core_id.0))
5890 .spawn(move || {
5891 let runtime = tokio::runtime::Builder::new_current_thread()
5892 .enable_all()
5893 .build()
5894 .expect("build per-core tokio runtime");
5895 runtime.block_on(worker.run());
5896 })
5897 .map(|_| ())
5898 .map_err(|err| RuntimeError::SpawnCoreThread {
5899 core_id,
5900 message: err.to_string(),
5901 }),
5902 }
5903}
5904
5905#[derive(Debug, Clone)]
5906struct CoreMailbox {
5907 core_id: CoreId,
5908 tx: mpsc::Sender<CoreCommand>,
5909}
5910
5911impl CoreMailbox {
5912 fn depth(&self) -> usize {
5913 self.tx.max_capacity() - self.tx.capacity()
5914 }
5915
5916 fn capacity(&self) -> usize {
5917 self.tx.max_capacity()
5918 }
5919}
5920
/// Commands sent over a core's mailbox (`CoreMailbox::tx`) and received by `CoreWorker::run`.
///
/// Field conventions shared by the variants:
/// * `placement` — the `ShardPlacement` the command targets. `CoreWorker::run` debug-asserts
///   that `placement.core_id` equals the receiving worker's own core id.
/// * `response_tx` — oneshot sender on which the result of the operation is delivered.
///   `CancelWaitRead` is the only variant without one (fire-and-forget).
/// * `request` / `requests` / `snapshot` — the operation payload.
///
/// The variants mirror `GroupCommand` one-to-one except for `WarmGroup`, which has no
/// group-level counterpart.
5921#[derive(Debug)]
5922enum CoreCommand {
5923 CreateStream {
5924 request: CreateStreamRequest,
5925 placement: ShardPlacement,
5926 response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
5927 },
    /// Same response type as `CreateStream`, but driven by a `CreateStreamExternalRequest`.
5928 CreateExternal {
5929 request: CreateStreamExternalRequest,
5930 placement: ShardPlacement,
5931 response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
5932 },
5933 HeadStream {
5934 request: HeadStreamRequest,
5935 placement: ShardPlacement,
5936 response_tx: oneshot::Sender<Result<HeadStreamResponse, RuntimeError>>,
5937 },
5938 ReadStream {
5939 request: ReadStreamRequest,
5940 placement: ShardPlacement,
5941 response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
5942 },
5943 PublishSnapshot {
5944 request: PublishSnapshotRequest,
5945 placement: ShardPlacement,
5946 response_tx: oneshot::Sender<Result<PublishSnapshotResponse, RuntimeError>>,
5947 },
5948 ReadSnapshot {
5949 request: ReadSnapshotRequest,
5950 placement: ShardPlacement,
5951 response_tx: oneshot::Sender<Result<ReadSnapshotResponse, RuntimeError>>,
5952 },
5953 DeleteSnapshot {
5954 request: DeleteSnapshotRequest,
5955 placement: ShardPlacement,
5956 response_tx: oneshot::Sender<Result<(), RuntimeError>>,
5957 },
5958 BootstrapStream {
5959 request: BootstrapStreamRequest,
5960 placement: ShardPlacement,
5961 response_tx: oneshot::Sender<Result<BootstrapStreamResponse, RuntimeError>>,
5962 },
    /// A read registered under `waiter_id`; the same id is used by `CancelWaitRead` to
    /// withdraw it. Shares the request and response types of `ReadStream`.
5963 WaitRead {
5964 request: ReadStreamRequest,
5965 placement: ShardPlacement,
5966 waiter_id: u64,
5967 response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
5968 },
5969 RequireLiveReadOwner {
5970 placement: ShardPlacement,
5971 response_tx: oneshot::Sender<Result<(), RuntimeError>>,
5972 },
    /// Withdraws the waiter identified by (`stream_id`, `waiter_id`). No reply is sent;
    /// `WaitReadCancel::drop` issues this with a non-blocking `try_send`.
5973 CancelWaitRead {
5974 stream_id: BucketStreamId,
5975 placement: ShardPlacement,
5976 waiter_id: u64,
5977 },
5978 CloseStream {
5979 request: CloseStreamRequest,
5980 placement: ShardPlacement,
5981 response_tx: oneshot::Sender<Result<CloseStreamResponse, RuntimeError>>,
5982 },
    /// `now_ms` is supplied by the caller — presumably a wall-clock timestamp in
    /// milliseconds; confirm against the sender.
5983 AddForkRef {
5984 stream_id: BucketStreamId,
5985 now_ms: u64,
5986 placement: ShardPlacement,
5987 response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
5988 },
5989 ReleaseForkRef {
5990 stream_id: BucketStreamId,
5991 placement: ShardPlacement,
5992 response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
5993 },
5994 DeleteStream {
5995 request: DeleteStreamRequest,
5996 placement: ShardPlacement,
5997 response_tx: oneshot::Sender<Result<DeleteStreamResponse, RuntimeError>>,
5998 },
5999 FlushCold {
6000 request: FlushColdRequest,
6001 placement: ShardPlacement,
6002 response_tx: oneshot::Sender<Result<FlushColdResponse, RuntimeError>>,
6003 },
    /// Several flush requests submitted together; the reply is a `Vec` of flush responses.
6004 FlushColdBatch {
6005 requests: Vec<FlushColdRequest>,
6006 placement: ShardPlacement,
6007 response_tx: oneshot::Sender<Result<Vec<FlushColdResponse>, RuntimeError>>,
6008 },
    /// Replies with `None` or a single `ColdFlushCandidate`.
6009 PlanColdFlush {
6010 request: PlanColdFlushRequest,
6011 placement: ShardPlacement,
6012 response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
6013 },
    /// Like `PlanColdFlush`, but takes a group-level `PlanGroupColdFlushRequest`.
6014 PlanNextColdFlush {
6015 request: PlanGroupColdFlushRequest,
6016 placement: ShardPlacement,
6017 response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
6018 },
    /// Batch form of `PlanNextColdFlush`; `max_candidates` is passed through to the planner
    /// (presumably an upper bound on the returned `Vec` — confirm in the planner).
6019 PlanNextColdFlushBatch {
6020 request: PlanGroupColdFlushRequest,
6021 placement: ShardPlacement,
6022 max_candidates: usize,
6023 response_tx: oneshot::Sender<Result<Vec<ColdFlushCandidate>, RuntimeError>>,
6024 },
6025 Append {
6026 request: AppendRequest,
6027 placement: ShardPlacement,
6028 response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
6029 },
6030 AppendExternal {
6031 request: AppendExternalRequest,
6032 placement: ShardPlacement,
6033 response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
6034 },
6035 AppendBatch {
6036 request: AppendBatchRequest,
6037 placement: ShardPlacement,
6038 response_tx: oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
6039 },
    /// Replies with a `ShardPlacement`.
6040 WarmGroup {
6041 placement: ShardPlacement,
6042 response_tx: oneshot::Sender<Result<ShardPlacement, RuntimeError>>,
6043 },
6044 SnapshotGroup {
6045 placement: ShardPlacement,
6046 response_tx: oneshot::Sender<Result<GroupSnapshot, RuntimeError>>,
6047 },
    /// Carries no separate `placement` field; the snapshot has its own `placement`, which
    /// the public `install_group_snapshot` entry point validates before sending.
6048 InstallGroupSnapshot {
6049 snapshot: GroupSnapshot,
6050 response_tx: oneshot::Sender<Result<(), RuntimeError>>,
6051 },
6052}
6053
/// Per-core worker: receives `CoreCommand`s from its mailbox and hands them to the
/// per-group mailboxes it keeps in `groups`.
///
/// Several fields (`metrics`, `cold_write_admission`, `live_read_max_waiters_per_core`,
/// `read_materialization`) have same-named counterparts on `GroupActor`.
6054struct CoreWorker {
    /// Identity of this core; incoming commands are debug-asserted to target it.
6055 core_id: CoreId,
    /// Receiving half of this core's mailbox.
6056 rx: mpsc::Receiver<CoreCommand>,
    /// Factory for group engines — presumably used when a group is first touched; confirm
    /// in the part of `impl CoreWorker` that creates group actors.
6057 engine_factory: Arc<dyn GroupEngineFactory>,
    /// Mailboxes of the raft groups known to this worker, keyed by raft group id.
6058 groups: HashMap<RaftGroupId, GroupMailbox>,
6059 metrics: Arc<RuntimeMetricsInner>,
    /// NOTE(review): presumably the channel capacity for newly created group mailboxes —
    /// confirm at the creation site.
6060 group_mailbox_capacity: usize,
6061 cold_write_admission: ColdWriteAdmission,
6062 live_read_max_waiters_per_core: Option<u64>,
6063 read_materialization: Arc<Semaphore>,
6064}
6065
/// Bundle of shared handles that `GroupActor` passes to the `CoreWorker` append-batch
/// functions: metrics, the read-materialization semaphore, and the group's placement.
6066#[derive(Clone)]
6067struct AppendBatchRuntime {
6068 metrics: Arc<RuntimeMetricsInner>,
6069 read_materialization: Arc<Semaphore>,
6070 placement: ShardPlacement,
6071}
6072
/// Pending read watchers of a group, keyed by the stream they wait on.
6073type ReadWatchers = HashMap<BucketStreamId, Vec<ReadWatcher>>;
/// Upper bound on how many `AppendBatch` commands `GroupActor` coalesces into one batch.
6074const GROUP_ACTOR_MAX_WRITE_BATCH: usize = 64;
/// Chunk limit passed to `flush_cold_group_batch_once` for each group flush.
6075const COLD_FLUSH_GROUP_BATCH_MAX_CHUNKS: usize = 64;
6076
6077#[derive(Clone)]
6078struct GroupMailbox {
6079 group_id: RaftGroupId,
6080 tx: mpsc::Sender<GroupCommand>,
6081 metrics: Arc<RuntimeMetricsInner>,
6082}
6083
6084impl GroupMailbox {
6085 async fn send(&self, command: GroupCommand) -> Result<(), Box<GroupCommand>> {
6086 match self.tx.try_send(command) {
6087 Ok(()) => {
6088 self.metrics.record_group_mailbox_enqueued(self.group_id);
6089 Ok(())
6090 }
6091 Err(mpsc::error::TrySendError::Full(command)) => {
6092 self.metrics.record_group_mailbox_full(self.group_id);
6093 match self.tx.send(command).await {
6094 Ok(()) => {
6095 self.metrics.record_group_mailbox_enqueued(self.group_id);
6096 Ok(())
6097 }
6098 Err(err) => Err(Box::new(err.0)),
6099 }
6100 }
6101 Err(mpsc::error::TrySendError::Closed(command)) => Err(Box::new(command)),
6102 }
6103 }
6104}
6105
/// Bookkeeping for one append-batch request whose reply has not been sent yet.
///
/// NOTE(review): the code that fills and consumes this struct is outside this section;
/// the field notes below are read off the types and names only.
6106struct PendingAppendBatch {
6107 stream_id: BucketStreamId,
    /// Presumably the total payload size of the request in bytes — confirm at construction.
6108 incoming_bytes: u64,
    /// Oneshot sender for the caller's `AppendBatchResponse`.
6109 response_tx: oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
    /// Presumably captured when processing began, for latency metrics — confirm.
6110 started_at: Instant,
6111}
6112
6113#[derive(Debug)]
6114enum GroupCommand {
6115 CreateStream {
6116 request: CreateStreamRequest,
6117 response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
6118 },
6119 CreateExternal {
6120 request: CreateStreamExternalRequest,
6121 response_tx: oneshot::Sender<Result<CreateStreamResponse, RuntimeError>>,
6122 },
6123 HeadStream {
6124 request: HeadStreamRequest,
6125 response_tx: oneshot::Sender<Result<HeadStreamResponse, RuntimeError>>,
6126 },
6127 ReadStream {
6128 request: ReadStreamRequest,
6129 response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
6130 },
6131 PublishSnapshot {
6132 request: PublishSnapshotRequest,
6133 response_tx: oneshot::Sender<Result<PublishSnapshotResponse, RuntimeError>>,
6134 },
6135 ReadSnapshot {
6136 request: ReadSnapshotRequest,
6137 response_tx: oneshot::Sender<Result<ReadSnapshotResponse, RuntimeError>>,
6138 },
6139 DeleteSnapshot {
6140 request: DeleteSnapshotRequest,
6141 response_tx: oneshot::Sender<Result<(), RuntimeError>>,
6142 },
6143 BootstrapStream {
6144 request: BootstrapStreamRequest,
6145 response_tx: oneshot::Sender<Result<BootstrapStreamResponse, RuntimeError>>,
6146 },
6147 WaitRead {
6148 request: ReadStreamRequest,
6149 waiter_id: u64,
6150 response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
6151 },
6152 CancelWaitRead {
6153 stream_id: BucketStreamId,
6154 waiter_id: u64,
6155 },
6156 RequireLiveReadOwner {
6157 response_tx: oneshot::Sender<Result<(), RuntimeError>>,
6158 },
6159 CloseStream {
6160 request: CloseStreamRequest,
6161 response_tx: oneshot::Sender<Result<CloseStreamResponse, RuntimeError>>,
6162 },
6163 AddForkRef {
6164 stream_id: BucketStreamId,
6165 now_ms: u64,
6166 response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
6167 },
6168 ReleaseForkRef {
6169 stream_id: BucketStreamId,
6170 response_tx: oneshot::Sender<Result<ForkRefResponse, RuntimeError>>,
6171 },
6172 DeleteStream {
6173 request: DeleteStreamRequest,
6174 response_tx: oneshot::Sender<Result<DeleteStreamResponse, RuntimeError>>,
6175 },
6176 FlushCold {
6177 request: FlushColdRequest,
6178 response_tx: oneshot::Sender<Result<FlushColdResponse, RuntimeError>>,
6179 },
6180 FlushColdBatch {
6181 requests: Vec<FlushColdRequest>,
6182 response_tx: oneshot::Sender<Result<Vec<FlushColdResponse>, RuntimeError>>,
6183 },
6184 PlanColdFlush {
6185 request: PlanColdFlushRequest,
6186 response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
6187 },
6188 PlanNextColdFlush {
6189 request: PlanGroupColdFlushRequest,
6190 response_tx: oneshot::Sender<Result<Option<ColdFlushCandidate>, RuntimeError>>,
6191 },
6192 PlanNextColdFlushBatch {
6193 request: PlanGroupColdFlushRequest,
6194 max_candidates: usize,
6195 response_tx: oneshot::Sender<Result<Vec<ColdFlushCandidate>, RuntimeError>>,
6196 },
6197 Append {
6198 request: AppendRequest,
6199 response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
6200 },
6201 AppendExternal {
6202 request: AppendExternalRequest,
6203 response_tx: oneshot::Sender<Result<AppendResponse, RuntimeError>>,
6204 },
6205 AppendBatch {
6206 request: AppendBatchRequest,
6207 response_tx: oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
6208 },
6209 SnapshotGroup {
6210 response_tx: oneshot::Sender<Result<GroupSnapshot, RuntimeError>>,
6211 },
6212 InstallGroupSnapshot {
6213 snapshot: GroupSnapshot,
6214 response_tx: oneshot::Sender<Result<(), RuntimeError>>,
6215 },
6216}
6217
6218impl GroupCommand {
6219 fn send_error(self, err: RuntimeError) {
6220 match self {
6221 Self::CreateStream { response_tx, .. } => {
6222 let _ = response_tx.send(Err(err));
6223 }
6224 Self::CreateExternal { response_tx, .. } => {
6225 let _ = response_tx.send(Err(err));
6226 }
6227 Self::HeadStream { response_tx, .. } => {
6228 let _ = response_tx.send(Err(err));
6229 }
6230 Self::ReadStream { response_tx, .. } | Self::WaitRead { response_tx, .. } => {
6231 let _ = response_tx.send(Err(err));
6232 }
6233 Self::CancelWaitRead { .. } => {}
6234 Self::RequireLiveReadOwner { response_tx } => {
6235 let _ = response_tx.send(Err(err));
6236 }
6237 Self::PublishSnapshot { response_tx, .. } => {
6238 let _ = response_tx.send(Err(err));
6239 }
6240 Self::ReadSnapshot { response_tx, .. } => {
6241 let _ = response_tx.send(Err(err));
6242 }
6243 Self::DeleteSnapshot { response_tx, .. } => {
6244 let _ = response_tx.send(Err(err));
6245 }
6246 Self::BootstrapStream { response_tx, .. } => {
6247 let _ = response_tx.send(Err(err));
6248 }
6249 Self::CloseStream { response_tx, .. } => {
6250 let _ = response_tx.send(Err(err));
6251 }
6252 Self::AddForkRef { response_tx, .. } | Self::ReleaseForkRef { response_tx, .. } => {
6253 let _ = response_tx.send(Err(err));
6254 }
6255 Self::DeleteStream { response_tx, .. } => {
6256 let _ = response_tx.send(Err(err));
6257 }
6258 Self::FlushCold { response_tx, .. } => {
6259 let _ = response_tx.send(Err(err));
6260 }
6261 Self::FlushColdBatch { response_tx, .. } => {
6262 let _ = response_tx.send(Err(err));
6263 }
6264 Self::PlanColdFlush { response_tx, .. } => {
6265 let _ = response_tx.send(Err(err));
6266 }
6267 Self::PlanNextColdFlush { response_tx, .. } => {
6268 let _ = response_tx.send(Err(err));
6269 }
6270 Self::PlanNextColdFlushBatch { response_tx, .. } => {
6271 let _ = response_tx.send(Err(err));
6272 }
6273 Self::Append { response_tx, .. } => {
6274 let _ = response_tx.send(Err(err));
6275 }
6276 Self::AppendExternal { response_tx, .. } => {
6277 let _ = response_tx.send(Err(err));
6278 }
6279 Self::AppendBatch { response_tx, .. } => {
6280 let _ = response_tx.send(Err(err));
6281 }
6282 Self::SnapshotGroup { response_tx } => {
6283 let _ = response_tx.send(Err(err));
6284 }
6285 Self::InstallGroupSnapshot { response_tx, .. } => {
6286 let _ = response_tx.send(Err(err));
6287 }
6288 }
6289 }
6290}
6291
6292struct GroupActor {
6293 placement: ShardPlacement,
6294 engine: Box<dyn GroupEngine>,
6295 rx: mpsc::Receiver<GroupCommand>,
6296 read_watchers: ReadWatchers,
6297 metrics: Arc<RuntimeMetricsInner>,
6298 cold_write_admission: ColdWriteAdmission,
6299 live_read_max_waiters_per_core: Option<u64>,
6300 read_materialization: Arc<Semaphore>,
6301}
6302
6303impl GroupActor {
6304 async fn run(mut self) {
6305 let mut pending = VecDeque::new();
6306 loop {
6307 let Some(command) = self.next_command(&mut pending).await else {
6308 break;
6309 };
6310 match command {
6311 GroupCommand::CreateStream {
6312 request,
6313 response_tx,
6314 } => {
6315 let response = CoreWorker::create_stream(
6316 &mut self.engine,
6317 self.metrics.clone(),
6318 request,
6319 self.placement,
6320 self.cold_write_admission,
6321 )
6322 .await;
6323 let _ = response_tx.send(response);
6324 }
6325 GroupCommand::CreateExternal {
6326 request,
6327 response_tx,
6328 } => {
6329 let response = CoreWorker::create_stream_external(
6330 &mut self.engine,
6331 self.metrics.clone(),
6332 request,
6333 self.placement,
6334 )
6335 .await;
6336 let _ = response_tx.send(response);
6337 }
6338 GroupCommand::HeadStream {
6339 request,
6340 response_tx,
6341 } => {
6342 let response = CoreWorker::head_stream(
6343 &mut self.engine,
6344 self.metrics.clone(),
6345 request,
6346 self.placement,
6347 )
6348 .await;
6349 let _ = response_tx.send(response);
6350 }
6351 GroupCommand::ReadStream {
6352 request,
6353 response_tx,
6354 } => {
6355 CoreWorker::read_stream(
6356 &mut self.engine,
6357 self.metrics.clone(),
6358 self.read_materialization.clone(),
6359 request,
6360 self.placement,
6361 response_tx,
6362 )
6363 .await;
6364 }
6365 GroupCommand::PublishSnapshot {
6366 request,
6367 response_tx,
6368 } => {
6369 let response = CoreWorker::publish_snapshot(
6370 &mut self.engine,
6371 self.metrics.clone(),
6372 self.read_materialization.clone(),
6373 &mut self.read_watchers,
6374 request,
6375 self.placement,
6376 )
6377 .await;
6378 let _ = response_tx.send(response);
6379 }
6380 GroupCommand::ReadSnapshot {
6381 request,
6382 response_tx,
6383 } => {
6384 let response = CoreWorker::read_snapshot(
6385 &mut self.engine,
6386 self.metrics.clone(),
6387 request,
6388 self.placement,
6389 )
6390 .await;
6391 let _ = response_tx.send(response);
6392 }
6393 GroupCommand::DeleteSnapshot {
6394 request,
6395 response_tx,
6396 } => {
6397 let response = CoreWorker::delete_snapshot(
6398 &mut self.engine,
6399 self.metrics.clone(),
6400 request,
6401 self.placement,
6402 )
6403 .await;
6404 let _ = response_tx.send(response);
6405 }
6406 GroupCommand::BootstrapStream {
6407 request,
6408 response_tx,
6409 } => {
6410 let response = CoreWorker::bootstrap_stream(
6411 &mut self.engine,
6412 self.metrics.clone(),
6413 request,
6414 self.placement,
6415 )
6416 .await;
6417 let _ = response_tx.send(response);
6418 }
6419 GroupCommand::WaitRead {
6420 request,
6421 waiter_id,
6422 response_tx,
6423 } => {
6424 let watcher = ReadWatcher {
6425 waiter_id,
6426 request,
6427 response_tx,
6428 };
6429 CoreWorker::wait_read_stream(
6430 &mut self.engine,
6431 self.metrics.clone(),
6432 self.read_materialization.clone(),
6433 &mut self.read_watchers,
6434 self.placement,
6435 watcher,
6436 self.live_read_max_waiters_per_core,
6437 )
6438 .await;
6439 }
6440 GroupCommand::CancelWaitRead {
6441 stream_id,
6442 waiter_id,
6443 } => {
6444 CoreWorker::cancel_read_watcher(
6445 &mut self.read_watchers,
6446 self.metrics.clone(),
6447 self.placement.core_id,
6448 stream_id,
6449 waiter_id,
6450 );
6451 }
6452 GroupCommand::RequireLiveReadOwner { response_tx } => {
6453 let response = self
6454 .engine
6455 .require_local_live_read_owner(self.placement)
6456 .await
6457 .map_err(|err| RuntimeError::group_engine(self.placement, err));
6458 let _ = response_tx.send(response);
6459 }
6460 GroupCommand::CloseStream {
6461 request,
6462 response_tx,
6463 } => {
6464 let response = CoreWorker::close_stream(
6465 &mut self.engine,
6466 self.metrics.clone(),
6467 self.read_materialization.clone(),
6468 &mut self.read_watchers,
6469 request,
6470 self.placement,
6471 )
6472 .await;
6473 let _ = response_tx.send(response);
6474 }
6475 GroupCommand::AddForkRef {
6476 stream_id,
6477 now_ms,
6478 response_tx,
6479 } => {
6480 let response = CoreWorker::add_fork_ref(
6481 &mut self.engine,
6482 self.metrics.clone(),
6483 stream_id,
6484 now_ms,
6485 self.placement,
6486 )
6487 .await;
6488 let _ = response_tx.send(response);
6489 }
6490 GroupCommand::ReleaseForkRef {
6491 stream_id,
6492 response_tx,
6493 } => {
6494 let response = CoreWorker::release_fork_ref(
6495 &mut self.engine,
6496 self.metrics.clone(),
6497 self.read_materialization.clone(),
6498 &mut self.read_watchers,
6499 stream_id,
6500 self.placement,
6501 )
6502 .await;
6503 let _ = response_tx.send(response);
6504 }
6505 GroupCommand::DeleteStream {
6506 request,
6507 response_tx,
6508 } => {
6509 let response = CoreWorker::delete_stream(
6510 &mut self.engine,
6511 self.metrics.clone(),
6512 self.read_materialization.clone(),
6513 &mut self.read_watchers,
6514 request,
6515 self.placement,
6516 )
6517 .await;
6518 let _ = response_tx.send(response);
6519 }
6520 GroupCommand::FlushCold {
6521 request,
6522 response_tx,
6523 } => {
6524 let response = CoreWorker::flush_cold(
6525 &mut self.engine,
6526 self.metrics.clone(),
6527 self.read_materialization.clone(),
6528 &mut self.read_watchers,
6529 request,
6530 self.placement,
6531 )
6532 .await;
6533 let _ = response_tx.send(response);
6534 }
6535 GroupCommand::FlushColdBatch {
6536 requests,
6537 response_tx,
6538 } => {
6539 let response = CoreWorker::flush_cold_batch(
6540 &mut self.engine,
6541 self.metrics.clone(),
6542 self.read_materialization.clone(),
6543 &mut self.read_watchers,
6544 requests,
6545 self.placement,
6546 )
6547 .await;
6548 let _ = response_tx.send(response);
6549 }
6550 GroupCommand::PlanColdFlush {
6551 request,
6552 response_tx,
6553 } => {
6554 let response = CoreWorker::plan_cold_flush(
6555 &mut self.engine,
6556 self.metrics.clone(),
6557 request,
6558 self.placement,
6559 )
6560 .await;
6561 let _ = response_tx.send(response);
6562 }
6563 GroupCommand::PlanNextColdFlush {
6564 request,
6565 response_tx,
6566 } => {
6567 let response = CoreWorker::plan_next_cold_flush(
6568 &mut self.engine,
6569 self.metrics.clone(),
6570 request,
6571 self.placement,
6572 )
6573 .await;
6574 let _ = response_tx.send(response);
6575 }
6576 GroupCommand::PlanNextColdFlushBatch {
6577 request,
6578 max_candidates,
6579 response_tx,
6580 } => {
6581 let response = CoreWorker::plan_next_cold_flush_batch(
6582 &mut self.engine,
6583 self.metrics.clone(),
6584 request,
6585 self.placement,
6586 max_candidates,
6587 )
6588 .await;
6589 let _ = response_tx.send(response);
6590 }
6591 GroupCommand::Append {
6592 request,
6593 response_tx,
6594 } => {
6595 let response = CoreWorker::apply_append(
6596 &mut self.engine,
6597 self.metrics.clone(),
6598 self.read_materialization.clone(),
6599 &mut self.read_watchers,
6600 request,
6601 self.placement,
6602 self.cold_write_admission,
6603 )
6604 .await;
6605 let _ = response_tx.send(response);
6606 }
6607 GroupCommand::AppendExternal {
6608 request,
6609 response_tx,
6610 } => {
6611 let response = CoreWorker::apply_append_external(
6612 &mut self.engine,
6613 self.metrics.clone(),
6614 self.read_materialization.clone(),
6615 &mut self.read_watchers,
6616 request,
6617 self.placement,
6618 )
6619 .await;
6620 let _ = response_tx.send(response);
6621 }
6622 GroupCommand::AppendBatch {
6623 request,
6624 response_tx,
6625 } => {
6626 let mut batch = vec![(request, response_tx)];
6627 self.collect_append_batch_commands(&mut pending, &mut batch);
6628 if self.cold_write_admission.is_enabled() {
6629 let (requests, pending_batch) =
6630 CoreWorker::prepare_append_batch_requests(batch);
6631 CoreWorker::apply_prepared_append_batch_requests_with_cold_admission(
6632 &mut self.engine,
6633 AppendBatchRuntime {
6634 metrics: self.metrics.clone(),
6635 read_materialization: self.read_materialization.clone(),
6636 placement: self.placement,
6637 },
6638 &mut self.read_watchers,
6639 pending_batch,
6640 requests,
6641 self.cold_write_admission,
6642 )
6643 .await;
6644 } else {
6645 let (commands, pending_batch) =
6646 CoreWorker::prepare_append_batch_commands(batch);
6647 CoreWorker::apply_prepared_append_batch_commands(
6648 &mut self.engine,
6649 AppendBatchRuntime {
6650 metrics: self.metrics.clone(),
6651 read_materialization: self.read_materialization.clone(),
6652 placement: self.placement,
6653 },
6654 &mut self.read_watchers,
6655 pending_batch,
6656 commands,
6657 )
6658 .await;
6659 }
6660 }
6661 GroupCommand::SnapshotGroup { response_tx } => {
6662 let response = CoreWorker::snapshot_group(
6663 &mut self.engine,
6664 self.metrics.clone(),
6665 self.placement,
6666 )
6667 .await;
6668 let _ = response_tx.send(response);
6669 }
6670 GroupCommand::InstallGroupSnapshot {
6671 snapshot,
6672 response_tx,
6673 } => {
6674 let response = CoreWorker::install_group_snapshot(
6675 &mut self.engine,
6676 self.metrics.clone(),
6677 snapshot,
6678 )
6679 .await;
6680 let _ = response_tx.send(response);
6681 }
6682 }
6683 }
6684 }
6685
6686 async fn next_command(&mut self, pending: &mut VecDeque<GroupCommand>) -> Option<GroupCommand> {
6687 match pending.pop_front() {
6688 Some(command) => Some(command),
6689 None => {
6690 let command = self.rx.recv().await;
6691 if command.is_some() {
6692 self.metrics
6693 .record_group_mailbox_dequeued(self.placement.raft_group_id);
6694 }
6695 command
6696 }
6697 }
6698 }
6699
6700 fn collect_append_batch_commands(
6701 &mut self,
6702 pending: &mut VecDeque<GroupCommand>,
6703 batch: &mut Vec<(
6704 AppendBatchRequest,
6705 oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
6706 )>,
6707 ) {
6708 while batch.len() < GROUP_ACTOR_MAX_WRITE_BATCH {
6709 let command = match pending.pop_front() {
6710 Some(command) => Some(command),
6711 None => match self.rx.try_recv() {
6712 Ok(command) => {
6713 self.metrics
6714 .record_group_mailbox_dequeued(self.placement.raft_group_id);
6715 Some(command)
6716 }
6717 Err(_) => None,
6718 },
6719 };
6720 match command {
6721 Some(GroupCommand::AppendBatch {
6722 request,
6723 response_tx,
6724 }) => batch.push((request, response_tx)),
6725 Some(other) => {
6726 pending.push_front(other);
6727 break;
6728 }
6729 None => break,
6730 }
6731 }
6732 }
6733}
6734
/// A parked read: built by `GroupActor` from a `WaitRead` command and stored in
/// `ReadWatchers` under the stream it waits on.
6735struct ReadWatcher {
    /// Caller-chosen id; `CancelWaitRead` refers to the watcher by this value.
6736 waiter_id: u64,
    /// The original read request.
6737 request: ReadStreamRequest,
    /// Channel on which the read result is eventually delivered.
6738 response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
6739}
6740
6741fn live_read_watcher_count(read_watchers: &HashMap<BucketStreamId, Vec<ReadWatcher>>) -> u64 {
6742 read_watchers
6743 .values()
6744 .map(|watchers| u64::try_from(watchers.len()).expect("watcher count fits u64"))
6745 .sum()
6746}
6747
6748struct WaitReadCancel {
6749 tx: mpsc::Sender<CoreCommand>,
6750 stream_id: Option<BucketStreamId>,
6751 placement: ShardPlacement,
6752 waiter_id: u64,
6753}
6754
6755impl WaitReadCancel {
6756 fn new(
6757 tx: mpsc::Sender<CoreCommand>,
6758 stream_id: BucketStreamId,
6759 placement: ShardPlacement,
6760 waiter_id: u64,
6761 ) -> Self {
6762 Self {
6763 tx,
6764 stream_id: Some(stream_id),
6765 placement,
6766 waiter_id,
6767 }
6768 }
6769
6770 fn disarm(&mut self) {
6771 self.stream_id = None;
6772 }
6773}
6774
6775impl Drop for WaitReadCancel {
6776 fn drop(&mut self) {
6777 if let Some(stream_id) = self.stream_id.take() {
6778 let _ = self.tx.try_send(CoreCommand::CancelWaitRead {
6782 stream_id,
6783 placement: self.placement,
6784 waiter_id: self.waiter_id,
6785 });
6786 }
6787 }
6788}
6789
6790impl CoreWorker {
6791 async fn run(mut self) {
6792 while let Some(command) = self.rx.recv().await {
6793 match command {
6794 CoreCommand::CreateStream {
6795 request,
6796 placement,
6797 response_tx,
6798 } => {
6799 debug_assert_eq!(placement.core_id, self.core_id);
6800 self.send_group_command(
6801 placement,
6802 GroupCommand::CreateStream {
6803 request,
6804 response_tx,
6805 },
6806 )
6807 .await;
6808 }
6809 CoreCommand::CreateExternal {
6810 request,
6811 placement,
6812 response_tx,
6813 } => {
6814 debug_assert_eq!(placement.core_id, self.core_id);
6815 self.send_group_command(
6816 placement,
6817 GroupCommand::CreateExternal {
6818 request,
6819 response_tx,
6820 },
6821 )
6822 .await;
6823 }
6824 CoreCommand::HeadStream {
6825 request,
6826 placement,
6827 response_tx,
6828 } => {
6829 debug_assert_eq!(placement.core_id, self.core_id);
6830 self.send_group_command(
6831 placement,
6832 GroupCommand::HeadStream {
6833 request,
6834 response_tx,
6835 },
6836 )
6837 .await;
6838 }
6839 CoreCommand::ReadStream {
6840 request,
6841 placement,
6842 response_tx,
6843 } => {
6844 debug_assert_eq!(placement.core_id, self.core_id);
6845 self.send_group_command(
6846 placement,
6847 GroupCommand::ReadStream {
6848 request,
6849 response_tx,
6850 },
6851 )
6852 .await;
6853 }
6854 CoreCommand::PublishSnapshot {
6855 request,
6856 placement,
6857 response_tx,
6858 } => {
6859 debug_assert_eq!(placement.core_id, self.core_id);
6860 self.send_group_command(
6861 placement,
6862 GroupCommand::PublishSnapshot {
6863 request,
6864 response_tx,
6865 },
6866 )
6867 .await;
6868 }
6869 CoreCommand::ReadSnapshot {
6870 request,
6871 placement,
6872 response_tx,
6873 } => {
6874 debug_assert_eq!(placement.core_id, self.core_id);
6875 self.send_group_command(
6876 placement,
6877 GroupCommand::ReadSnapshot {
6878 request,
6879 response_tx,
6880 },
6881 )
6882 .await;
6883 }
6884 CoreCommand::DeleteSnapshot {
6885 request,
6886 placement,
6887 response_tx,
6888 } => {
6889 debug_assert_eq!(placement.core_id, self.core_id);
6890 self.send_group_command(
6891 placement,
6892 GroupCommand::DeleteSnapshot {
6893 request,
6894 response_tx,
6895 },
6896 )
6897 .await;
6898 }
6899 CoreCommand::BootstrapStream {
6900 request,
6901 placement,
6902 response_tx,
6903 } => {
6904 debug_assert_eq!(placement.core_id, self.core_id);
6905 self.send_group_command(
6906 placement,
6907 GroupCommand::BootstrapStream {
6908 request,
6909 response_tx,
6910 },
6911 )
6912 .await;
6913 }
6914 CoreCommand::WaitRead {
6915 request,
6916 placement,
6917 waiter_id,
6918 response_tx,
6919 } => {
6920 debug_assert_eq!(placement.core_id, self.core_id);
6921 self.send_group_command(
6922 placement,
6923 GroupCommand::WaitRead {
6924 request,
6925 waiter_id,
6926 response_tx,
6927 },
6928 )
6929 .await;
6930 }
6931 CoreCommand::RequireLiveReadOwner {
6932 placement,
6933 response_tx,
6934 } => {
6935 debug_assert_eq!(placement.core_id, self.core_id);
6936 self.send_group_command(
6937 placement,
6938 GroupCommand::RequireLiveReadOwner { response_tx },
6939 )
6940 .await;
6941 }
6942 CoreCommand::CancelWaitRead {
6943 stream_id,
6944 placement,
6945 waiter_id,
6946 } => {
6947 debug_assert_eq!(placement.core_id, self.core_id);
6948 self.send_group_command(
6949 placement,
6950 GroupCommand::CancelWaitRead {
6951 stream_id,
6952 waiter_id,
6953 },
6954 )
6955 .await;
6956 }
6957 CoreCommand::CloseStream {
6958 request,
6959 placement,
6960 response_tx,
6961 } => {
6962 debug_assert_eq!(placement.core_id, self.core_id);
6963 self.send_group_command(
6964 placement,
6965 GroupCommand::CloseStream {
6966 request,
6967 response_tx,
6968 },
6969 )
6970 .await;
6971 }
6972 CoreCommand::AddForkRef {
6973 stream_id,
6974 now_ms,
6975 placement,
6976 response_tx,
6977 } => {
6978 debug_assert_eq!(placement.core_id, self.core_id);
6979 self.send_group_command(
6980 placement,
6981 GroupCommand::AddForkRef {
6982 stream_id,
6983 now_ms,
6984 response_tx,
6985 },
6986 )
6987 .await;
6988 }
6989 CoreCommand::ReleaseForkRef {
6990 stream_id,
6991 placement,
6992 response_tx,
6993 } => {
6994 debug_assert_eq!(placement.core_id, self.core_id);
6995 self.send_group_command(
6996 placement,
6997 GroupCommand::ReleaseForkRef {
6998 stream_id,
6999 response_tx,
7000 },
7001 )
7002 .await;
7003 }
7004 CoreCommand::DeleteStream {
7005 request,
7006 placement,
7007 response_tx,
7008 } => {
7009 debug_assert_eq!(placement.core_id, self.core_id);
7010 self.send_group_command(
7011 placement,
7012 GroupCommand::DeleteStream {
7013 request,
7014 response_tx,
7015 },
7016 )
7017 .await;
7018 }
7019 CoreCommand::FlushCold {
7020 request,
7021 placement,
7022 response_tx,
7023 } => {
7024 debug_assert_eq!(placement.core_id, self.core_id);
7025 self.send_group_command(
7026 placement,
7027 GroupCommand::FlushCold {
7028 request,
7029 response_tx,
7030 },
7031 )
7032 .await;
7033 }
7034 CoreCommand::FlushColdBatch {
7035 requests,
7036 placement,
7037 response_tx,
7038 } => {
7039 debug_assert_eq!(placement.core_id, self.core_id);
7040 self.send_group_command(
7041 placement,
7042 GroupCommand::FlushColdBatch {
7043 requests,
7044 response_tx,
7045 },
7046 )
7047 .await;
7048 }
7049 CoreCommand::PlanColdFlush {
7050 request,
7051 placement,
7052 response_tx,
7053 } => {
7054 debug_assert_eq!(placement.core_id, self.core_id);
7055 self.send_group_command(
7056 placement,
7057 GroupCommand::PlanColdFlush {
7058 request,
7059 response_tx,
7060 },
7061 )
7062 .await;
7063 }
7064 CoreCommand::PlanNextColdFlush {
7065 request,
7066 placement,
7067 response_tx,
7068 } => {
7069 debug_assert_eq!(placement.core_id, self.core_id);
7070 self.send_group_command(
7071 placement,
7072 GroupCommand::PlanNextColdFlush {
7073 request,
7074 response_tx,
7075 },
7076 )
7077 .await;
7078 }
7079 CoreCommand::PlanNextColdFlushBatch {
7080 request,
7081 placement,
7082 max_candidates,
7083 response_tx,
7084 } => {
7085 debug_assert_eq!(placement.core_id, self.core_id);
7086 self.send_group_command(
7087 placement,
7088 GroupCommand::PlanNextColdFlushBatch {
7089 request,
7090 max_candidates,
7091 response_tx,
7092 },
7093 )
7094 .await;
7095 }
7096 CoreCommand::Append {
7097 request,
7098 placement,
7099 response_tx,
7100 } => {
7101 debug_assert_eq!(placement.core_id, self.core_id);
7102 self.send_group_command(
7103 placement,
7104 GroupCommand::Append {
7105 request,
7106 response_tx,
7107 },
7108 )
7109 .await;
7110 }
7111 CoreCommand::AppendExternal {
7112 request,
7113 placement,
7114 response_tx,
7115 } => {
7116 debug_assert_eq!(placement.core_id, self.core_id);
7117 self.send_group_command(
7118 placement,
7119 GroupCommand::AppendExternal {
7120 request,
7121 response_tx,
7122 },
7123 )
7124 .await;
7125 }
7126 CoreCommand::AppendBatch {
7127 request,
7128 placement,
7129 response_tx,
7130 } => {
7131 debug_assert_eq!(placement.core_id, self.core_id);
7132 self.send_group_command(
7133 placement,
7134 GroupCommand::AppendBatch {
7135 request,
7136 response_tx,
7137 },
7138 )
7139 .await;
7140 }
7141 CoreCommand::WarmGroup {
7142 placement,
7143 response_tx,
7144 } => {
7145 debug_assert_eq!(placement.core_id, self.core_id);
7146 let response = self.group(placement).await.map(|_| placement);
7147 let _ = response_tx.send(response);
7148 }
7149 CoreCommand::SnapshotGroup {
7150 placement,
7151 response_tx,
7152 } => {
7153 debug_assert_eq!(placement.core_id, self.core_id);
7154 self.send_group_command(placement, GroupCommand::SnapshotGroup { response_tx })
7155 .await;
7156 }
7157 CoreCommand::InstallGroupSnapshot {
7158 snapshot,
7159 response_tx,
7160 } => {
7161 debug_assert_eq!(snapshot.placement.core_id, self.core_id);
7162 self.send_group_command(
7163 snapshot.placement,
7164 GroupCommand::InstallGroupSnapshot {
7165 snapshot,
7166 response_tx,
7167 },
7168 )
7169 .await;
7170 }
7171 }
7172 }
7173 }
7174
7175 async fn send_group_command(&mut self, placement: ShardPlacement, command: GroupCommand) {
7176 let core_id = placement.core_id;
7177 match self.group(placement).await {
7178 Ok(group) => {
7179 if let Err(command) = group.send(command).await {
7180 (*command).send_error(RuntimeError::MailboxClosed { core_id });
7181 }
7182 }
7183 Err(err) => command.send_error(err),
7184 }
7185 }
7186
7187 async fn group(&mut self, placement: ShardPlacement) -> Result<GroupMailbox, RuntimeError> {
7188 if !self.groups.contains_key(&placement.raft_group_id) {
7189 let engine_factory = self.engine_factory.clone();
7190 let metrics = GroupEngineMetrics {
7191 inner: self.metrics.clone(),
7192 };
7193 let engine = engine_factory
7194 .create(placement, metrics)
7195 .await
7196 .map_err(|err| RuntimeError::group_engine(placement, err))?;
7197 let (tx, rx) = mpsc::channel(self.group_mailbox_capacity);
7198 let actor = GroupActor {
7199 placement,
7200 engine,
7201 rx,
7202 read_watchers: HashMap::new(),
7203 metrics: self.metrics.clone(),
7204 cold_write_admission: self.cold_write_admission,
7205 live_read_max_waiters_per_core: self.live_read_max_waiters_per_core,
7206 read_materialization: self.read_materialization.clone(),
7207 };
7208 tokio::spawn(actor.run());
7209 self.groups.insert(
7210 placement.raft_group_id,
7211 GroupMailbox {
7212 group_id: placement.raft_group_id,
7213 tx,
7214 metrics: self.metrics.clone(),
7215 },
7216 );
7217 }
7218 Ok(self
7219 .groups
7220 .get(&placement.raft_group_id)
7221 .expect("group was just inserted")
7222 .clone())
7223 }
7224
7225 async fn read_stream(
7226 group: &mut Box<dyn GroupEngine>,
7227 metrics: Arc<RuntimeMetricsInner>,
7228 read_materialization: Arc<Semaphore>,
7229 request: ReadStreamRequest,
7230 placement: ShardPlacement,
7231 response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
7232 ) {
7233 let exec_started_at = Instant::now();
7234 let parts = group
7235 .read_stream_parts(request, placement)
7236 .await
7237 .map_err(|err| RuntimeError::group_engine(placement, err));
7238 metrics.record_group_engine_exec(
7239 placement.core_id,
7240 placement.raft_group_id,
7241 elapsed_ns(exec_started_at),
7242 );
7243 match parts {
7244 Ok(parts) => {
7245 Self::send_read_parts_response(placement, read_materialization, parts, response_tx);
7246 }
7247 Err(err) => {
7248 let _ = response_tx.send(Err(err));
7249 }
7250 }
7251 }
7252
7253 fn send_read_parts_response(
7254 placement: ShardPlacement,
7255 read_materialization: Arc<Semaphore>,
7256 parts: GroupReadStreamParts,
7257 response_tx: oneshot::Sender<Result<ReadStreamResponse, RuntimeError>>,
7258 ) {
7259 tokio::spawn(async move {
7260 let response = match read_materialization.acquire_owned().await {
7261 Ok(_permit) => parts
7262 .into_response()
7263 .await
7264 .map_err(|err| RuntimeError::group_engine(placement, err)),
7265 Err(_) => Err(RuntimeError::MailboxClosed {
7266 core_id: placement.core_id,
7267 }),
7268 };
7269 let _ = response_tx.send(response);
7270 });
7271 }
7272
7273 fn send_read_parts_to_watchers(
7274 placement: ShardPlacement,
7275 read_materialization: Arc<Semaphore>,
7276 parts: GroupReadStreamParts,
7277 watchers: Vec<ReadWatcher>,
7278 ) {
7279 tokio::spawn(async move {
7280 let response = match read_materialization.acquire_owned().await {
7281 Ok(_permit) => parts
7282 .into_response()
7283 .await
7284 .map_err(|err| RuntimeError::group_engine(placement, err)),
7285 Err(_) => Err(RuntimeError::MailboxClosed {
7286 core_id: placement.core_id,
7287 }),
7288 };
7289 for watcher in watchers {
7290 let _ = watcher.response_tx.send(response.clone());
7291 }
7292 });
7293 }
7294
7295 async fn publish_snapshot(
7296 group: &mut Box<dyn GroupEngine>,
7297 metrics: Arc<RuntimeMetricsInner>,
7298 read_materialization: Arc<Semaphore>,
7299 read_watchers: &mut ReadWatchers,
7300 request: PublishSnapshotRequest,
7301 placement: ShardPlacement,
7302 ) -> Result<PublishSnapshotResponse, RuntimeError> {
7303 let stream_id = request.stream_id.clone();
7304 let started_at = Instant::now();
7305 let exec_started_at = Instant::now();
7306 let response = group
7307 .publish_snapshot(request, placement)
7308 .await
7309 .map_err(|err| RuntimeError::group_engine(placement, err));
7310 metrics.record_group_engine_exec(
7311 placement.core_id,
7312 placement.raft_group_id,
7313 elapsed_ns(exec_started_at),
7314 );
7315 if response.is_ok() {
7316 metrics.record_applied_mutation(
7317 placement.core_id,
7318 placement.raft_group_id,
7319 elapsed_ns(started_at),
7320 );
7321 record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7322 Self::notify_read_watchers(
7323 group,
7324 metrics,
7325 read_materialization,
7326 read_watchers,
7327 &stream_id,
7328 placement,
7329 )
7330 .await;
7331 }
7332 response
7333 }
7334
7335 async fn read_snapshot(
7336 group: &mut Box<dyn GroupEngine>,
7337 metrics: Arc<RuntimeMetricsInner>,
7338 request: ReadSnapshotRequest,
7339 placement: ShardPlacement,
7340 ) -> Result<ReadSnapshotResponse, RuntimeError> {
7341 let exec_started_at = Instant::now();
7342 let response = group
7343 .read_snapshot(request, placement)
7344 .await
7345 .map_err(|err| RuntimeError::group_engine(placement, err));
7346 metrics.record_group_engine_exec(
7347 placement.core_id,
7348 placement.raft_group_id,
7349 elapsed_ns(exec_started_at),
7350 );
7351 response
7352 }
7353
7354 async fn delete_snapshot(
7355 group: &mut Box<dyn GroupEngine>,
7356 metrics: Arc<RuntimeMetricsInner>,
7357 request: DeleteSnapshotRequest,
7358 placement: ShardPlacement,
7359 ) -> Result<(), RuntimeError> {
7360 let exec_started_at = Instant::now();
7361 let response = group
7362 .delete_snapshot(request, placement)
7363 .await
7364 .map_err(|err| RuntimeError::group_engine(placement, err));
7365 metrics.record_group_engine_exec(
7366 placement.core_id,
7367 placement.raft_group_id,
7368 elapsed_ns(exec_started_at),
7369 );
7370 response
7371 }
7372
7373 async fn bootstrap_stream(
7374 group: &mut Box<dyn GroupEngine>,
7375 metrics: Arc<RuntimeMetricsInner>,
7376 request: BootstrapStreamRequest,
7377 placement: ShardPlacement,
7378 ) -> Result<BootstrapStreamResponse, RuntimeError> {
7379 let exec_started_at = Instant::now();
7380 let response = group
7381 .bootstrap_stream(request, placement)
7382 .await
7383 .map_err(|err| RuntimeError::group_engine(placement, err));
7384 metrics.record_group_engine_exec(
7385 placement.core_id,
7386 placement.raft_group_id,
7387 elapsed_ns(exec_started_at),
7388 );
7389 response
7390 }
7391
7392 async fn wait_read_stream(
7393 group: &mut Box<dyn GroupEngine>,
7394 metrics: Arc<RuntimeMetricsInner>,
7395 read_materialization: Arc<Semaphore>,
7396 read_watchers: &mut ReadWatchers,
7397 placement: ShardPlacement,
7398 watcher: ReadWatcher,
7399 live_read_max_waiters_per_core: Option<u64>,
7400 ) {
7401 let exec_started_at = Instant::now();
7402 let parts = group
7403 .read_stream_parts(watcher.request.clone(), placement)
7404 .await
7405 .map_err(|err| RuntimeError::group_engine(placement, err));
7406 metrics.record_group_engine_exec(
7407 placement.core_id,
7408 placement.raft_group_id,
7409 elapsed_ns(exec_started_at),
7410 );
7411 match parts {
7412 Ok(parts) if parts.payload_is_empty() && parts.up_to_date && !parts.closed => {
7413 if watcher.response_tx.is_closed() {
7414 return;
7415 }
7416 let current_waiters = live_read_watcher_count(read_watchers);
7417 if let Some(limit) = live_read_max_waiters_per_core
7418 && current_waiters >= limit
7419 {
7420 metrics.record_live_read_backpressure(placement.core_id);
7421 let _ = watcher
7422 .response_tx
7423 .send(Err(RuntimeError::LiveReadBackpressure {
7424 core_id: placement.core_id,
7425 current_waiters,
7426 limit,
7427 }));
7428 return;
7429 }
7430 metrics.record_read_watcher_added(placement.core_id);
7431 read_watchers
7432 .entry(watcher.request.stream_id.clone())
7433 .or_default()
7434 .push(watcher);
7435 }
7436 Ok(parts) => {
7437 Self::send_read_parts_response(
7438 placement,
7439 read_materialization.clone(),
7440 parts,
7441 watcher.response_tx,
7442 );
7443 }
7444 Err(err) => {
7445 let _ = watcher.response_tx.send(Err(err));
7446 }
7447 }
7448 }
7449
7450 fn cancel_read_watcher(
7451 read_watchers: &mut ReadWatchers,
7452 metrics: Arc<RuntimeMetricsInner>,
7453 core_id: CoreId,
7454 stream_id: BucketStreamId,
7455 waiter_id: u64,
7456 ) {
7457 let Some(watchers) = read_watchers.get_mut(&stream_id) else {
7458 return;
7459 };
7460 let before = watchers.len();
7461 watchers.retain(|watcher| watcher.waiter_id != waiter_id);
7462 let removed = before - watchers.len();
7463 let is_empty = watchers.is_empty();
7464 if removed > 0 {
7465 metrics.record_read_watchers_removed(core_id, removed);
7466 }
7467 if is_empty {
7468 read_watchers.remove(&stream_id);
7469 }
7470 }
7471
7472 async fn close_stream(
7473 group: &mut Box<dyn GroupEngine>,
7474 metrics: Arc<RuntimeMetricsInner>,
7475 read_materialization: Arc<Semaphore>,
7476 read_watchers: &mut ReadWatchers,
7477 request: CloseStreamRequest,
7478 placement: ShardPlacement,
7479 ) -> Result<CloseStreamResponse, RuntimeError> {
7480 let stream_id = request.stream_id.clone();
7481 let started_at = Instant::now();
7482 let exec_started_at = Instant::now();
7483 let response = group
7484 .close_stream(request, placement)
7485 .await
7486 .map_err(|err| RuntimeError::group_engine(placement, err));
7487 metrics.record_group_engine_exec(
7488 placement.core_id,
7489 placement.raft_group_id,
7490 elapsed_ns(exec_started_at),
7491 );
7492 if response
7493 .as_ref()
7494 .is_ok_and(|response| !response.deduplicated)
7495 {
7496 metrics.record_applied_mutation(
7497 placement.core_id,
7498 placement.raft_group_id,
7499 elapsed_ns(started_at),
7500 );
7501 Self::notify_read_watchers(
7502 group,
7503 metrics,
7504 read_materialization,
7505 read_watchers,
7506 &stream_id,
7507 placement,
7508 )
7509 .await;
7510 }
7511 response
7512 }
7513
7514 async fn add_fork_ref(
7515 group: &mut Box<dyn GroupEngine>,
7516 metrics: Arc<RuntimeMetricsInner>,
7517 stream_id: BucketStreamId,
7518 now_ms: u64,
7519 placement: ShardPlacement,
7520 ) -> Result<ForkRefResponse, RuntimeError> {
7521 let started_at = Instant::now();
7522 let exec_started_at = Instant::now();
7523 let response = group
7524 .add_fork_ref(stream_id, now_ms, placement)
7525 .await
7526 .map_err(|err| RuntimeError::group_engine(placement, err));
7527 metrics.record_group_engine_exec(
7528 placement.core_id,
7529 placement.raft_group_id,
7530 elapsed_ns(exec_started_at),
7531 );
7532 if response.is_ok() {
7533 metrics.record_applied_mutation(
7534 placement.core_id,
7535 placement.raft_group_id,
7536 elapsed_ns(started_at),
7537 );
7538 }
7539 response
7540 }
7541
7542 async fn release_fork_ref(
7543 group: &mut Box<dyn GroupEngine>,
7544 metrics: Arc<RuntimeMetricsInner>,
7545 read_materialization: Arc<Semaphore>,
7546 read_watchers: &mut ReadWatchers,
7547 stream_id: BucketStreamId,
7548 placement: ShardPlacement,
7549 ) -> Result<ForkRefResponse, RuntimeError> {
7550 let started_at = Instant::now();
7551 let exec_started_at = Instant::now();
7552 let response = group
7553 .release_fork_ref(stream_id.clone(), placement)
7554 .await
7555 .map_err(|err| RuntimeError::group_engine(placement, err));
7556 metrics.record_group_engine_exec(
7557 placement.core_id,
7558 placement.raft_group_id,
7559 elapsed_ns(exec_started_at),
7560 );
7561 if response.is_ok() {
7562 metrics.record_applied_mutation(
7563 placement.core_id,
7564 placement.raft_group_id,
7565 elapsed_ns(started_at),
7566 );
7567 Self::notify_read_watchers(
7568 group,
7569 metrics,
7570 read_materialization,
7571 read_watchers,
7572 &stream_id,
7573 placement,
7574 )
7575 .await;
7576 }
7577 response
7578 }
7579
7580 async fn delete_stream(
7581 group: &mut Box<dyn GroupEngine>,
7582 metrics: Arc<RuntimeMetricsInner>,
7583 read_materialization: Arc<Semaphore>,
7584 read_watchers: &mut ReadWatchers,
7585 request: DeleteStreamRequest,
7586 placement: ShardPlacement,
7587 ) -> Result<DeleteStreamResponse, RuntimeError> {
7588 let stream_id = request.stream_id.clone();
7589 let started_at = Instant::now();
7590 let exec_started_at = Instant::now();
7591 let response = group
7592 .delete_stream(request, placement)
7593 .await
7594 .map_err(|err| RuntimeError::group_engine(placement, err));
7595 metrics.record_group_engine_exec(
7596 placement.core_id,
7597 placement.raft_group_id,
7598 elapsed_ns(exec_started_at),
7599 );
7600 if response.is_ok() {
7601 metrics.record_applied_mutation(
7602 placement.core_id,
7603 placement.raft_group_id,
7604 elapsed_ns(started_at),
7605 );
7606 record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7607 Self::notify_read_watchers(
7608 group,
7609 metrics,
7610 read_materialization,
7611 read_watchers,
7612 &stream_id,
7613 placement,
7614 )
7615 .await;
7616 }
7617 response
7618 }
7619
7620 async fn flush_cold(
7621 group: &mut Box<dyn GroupEngine>,
7622 metrics: Arc<RuntimeMetricsInner>,
7623 read_materialization: Arc<Semaphore>,
7624 read_watchers: &mut ReadWatchers,
7625 request: FlushColdRequest,
7626 placement: ShardPlacement,
7627 ) -> Result<FlushColdResponse, RuntimeError> {
7628 let stream_id = request.stream_id.clone();
7629 let started_at = Instant::now();
7630 let exec_started_at = Instant::now();
7631 let response = group
7632 .flush_cold(request, placement)
7633 .await
7634 .map_err(|err| RuntimeError::group_engine(placement, err));
7635 metrics.record_group_engine_exec(
7636 placement.core_id,
7637 placement.raft_group_id,
7638 elapsed_ns(exec_started_at),
7639 );
7640 if response.is_ok() {
7641 metrics.record_applied_mutation(
7642 placement.core_id,
7643 placement.raft_group_id,
7644 elapsed_ns(started_at),
7645 );
7646 record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7647 Self::notify_read_watchers(
7648 group,
7649 metrics,
7650 read_materialization,
7651 read_watchers,
7652 &stream_id,
7653 placement,
7654 )
7655 .await;
7656 }
7657 response
7658 }
7659
7660 async fn flush_cold_batch(
7661 group: &mut Box<dyn GroupEngine>,
7662 metrics: Arc<RuntimeMetricsInner>,
7663 read_materialization: Arc<Semaphore>,
7664 read_watchers: &mut ReadWatchers,
7665 requests: Vec<FlushColdRequest>,
7666 placement: ShardPlacement,
7667 ) -> Result<Vec<FlushColdResponse>, RuntimeError> {
7668 if requests.is_empty() {
7669 return Ok(Vec::new());
7670 }
7671 let stream_ids = requests
7672 .iter()
7673 .map(|request| request.stream_id.clone())
7674 .collect::<Vec<_>>();
7675 let commands = requests
7676 .into_iter()
7677 .map(GroupWriteCommand::from)
7678 .collect::<Vec<_>>();
7679 let started_at = Instant::now();
7680 let exec_started_at = Instant::now();
7681 let response = group
7682 .write_batch(vec![GroupWriteCommand::Batch { commands }], placement)
7683 .await
7684 .map_err(|err| RuntimeError::group_engine(placement, err));
7685 metrics.record_group_engine_exec(
7686 placement.core_id,
7687 placement.raft_group_id,
7688 elapsed_ns(exec_started_at),
7689 );
7690 let mut outer = response?;
7691 let Some(batch_response) = outer.pop() else {
7692 return Err(RuntimeError::group_engine(
7693 placement,
7694 GroupEngineError::new("cold flush batch returned no response"),
7695 ));
7696 };
7697 let items =
7698 match batch_response.map_err(|err| RuntimeError::group_engine(placement, err))? {
7699 GroupWriteResponse::Batch(items) => items,
7700 other => {
7701 return Err(RuntimeError::group_engine(
7702 placement,
7703 GroupEngineError::new(format!(
7704 "unexpected cold flush batch response: {other:?}"
7705 )),
7706 ));
7707 }
7708 };
7709 let mut responses = Vec::with_capacity(items.len());
7710 let mutation_ns = elapsed_ns(started_at);
7711 for (index, item) in items.into_iter().enumerate() {
7712 match item.map_err(|err| RuntimeError::group_engine(placement, err))? {
7713 GroupWriteResponse::FlushCold(response) => {
7714 metrics.record_applied_mutation(
7715 placement.core_id,
7716 placement.raft_group_id,
7717 mutation_ns,
7718 );
7719 if let Some(stream_id) = stream_ids.get(index) {
7720 record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement)
7721 .await;
7722 Self::notify_read_watchers(
7723 group,
7724 metrics.clone(),
7725 read_materialization.clone(),
7726 read_watchers,
7727 stream_id,
7728 placement,
7729 )
7730 .await;
7731 }
7732 responses.push(response);
7733 }
7734 other => {
7735 return Err(RuntimeError::group_engine(
7736 placement,
7737 GroupEngineError::new(format!(
7738 "unexpected cold flush batch item response: {other:?}"
7739 )),
7740 ));
7741 }
7742 }
7743 }
7744 Ok(responses)
7745 }
7746
7747 async fn plan_cold_flush(
7748 group: &mut Box<dyn GroupEngine>,
7749 metrics: Arc<RuntimeMetricsInner>,
7750 request: PlanColdFlushRequest,
7751 placement: ShardPlacement,
7752 ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
7753 let exec_started_at = Instant::now();
7754 let response = group
7755 .plan_cold_flush(request, placement)
7756 .await
7757 .map_err(|err| RuntimeError::group_engine(placement, err));
7758 metrics.record_group_engine_exec(
7759 placement.core_id,
7760 placement.raft_group_id,
7761 elapsed_ns(exec_started_at),
7762 );
7763 response
7764 }
7765
7766 async fn plan_next_cold_flush(
7767 group: &mut Box<dyn GroupEngine>,
7768 metrics: Arc<RuntimeMetricsInner>,
7769 request: PlanGroupColdFlushRequest,
7770 placement: ShardPlacement,
7771 ) -> Result<Option<ColdFlushCandidate>, RuntimeError> {
7772 if !group.accepts_local_writes() {
7773 return Ok(None);
7774 }
7775 let exec_started_at = Instant::now();
7776 let response = group
7777 .plan_next_cold_flush(request, placement)
7778 .await
7779 .map_err(|err| RuntimeError::group_engine(placement, err));
7780 metrics.record_group_engine_exec(
7781 placement.core_id,
7782 placement.raft_group_id,
7783 elapsed_ns(exec_started_at),
7784 );
7785 response
7786 }
7787
7788 async fn plan_next_cold_flush_batch(
7789 group: &mut Box<dyn GroupEngine>,
7790 metrics: Arc<RuntimeMetricsInner>,
7791 request: PlanGroupColdFlushRequest,
7792 placement: ShardPlacement,
7793 max_candidates: usize,
7794 ) -> Result<Vec<ColdFlushCandidate>, RuntimeError> {
7795 if !group.accepts_local_writes() {
7796 return Ok(Vec::new());
7797 }
7798 let exec_started_at = Instant::now();
7799 let response = group
7800 .plan_next_cold_flush_batch(request, placement, max_candidates)
7801 .await
7802 .map_err(|err| RuntimeError::group_engine(placement, err));
7803 metrics.record_group_engine_exec(
7804 placement.core_id,
7805 placement.raft_group_id,
7806 elapsed_ns(exec_started_at),
7807 );
7808 response
7809 }
7810
7811 async fn head_stream(
7812 group: &mut Box<dyn GroupEngine>,
7813 metrics: Arc<RuntimeMetricsInner>,
7814 request: HeadStreamRequest,
7815 placement: ShardPlacement,
7816 ) -> Result<HeadStreamResponse, RuntimeError> {
7817 let exec_started_at = Instant::now();
7818 let response = group
7819 .head_stream(request, placement)
7820 .await
7821 .map_err(|err| RuntimeError::group_engine(placement, err));
7822 metrics.record_group_engine_exec(
7823 placement.core_id,
7824 placement.raft_group_id,
7825 elapsed_ns(exec_started_at),
7826 );
7827 response
7828 }
7829
7830 async fn snapshot_group(
7831 group: &mut Box<dyn GroupEngine>,
7832 metrics: Arc<RuntimeMetricsInner>,
7833 placement: ShardPlacement,
7834 ) -> Result<GroupSnapshot, RuntimeError> {
7835 let exec_started_at = Instant::now();
7836 let response = group
7837 .snapshot(placement)
7838 .await
7839 .map_err(|err| RuntimeError::group_engine(placement, err));
7840 metrics.record_group_engine_exec(
7841 placement.core_id,
7842 placement.raft_group_id,
7843 elapsed_ns(exec_started_at),
7844 );
7845 response
7846 }
7847
7848 async fn install_group_snapshot(
7849 group: &mut Box<dyn GroupEngine>,
7850 metrics: Arc<RuntimeMetricsInner>,
7851 snapshot: GroupSnapshot,
7852 ) -> Result<(), RuntimeError> {
7853 let placement = snapshot.placement;
7854 let exec_started_at = Instant::now();
7855 let response = group
7856 .install_snapshot(snapshot)
7857 .await
7858 .map_err(|err| RuntimeError::group_engine(placement, err));
7859 metrics.record_group_engine_exec(
7860 placement.core_id,
7861 placement.raft_group_id,
7862 elapsed_ns(exec_started_at),
7863 );
7864 response
7865 }
7866
7867 async fn create_stream(
7868 group: &mut Box<dyn GroupEngine>,
7869 metrics: Arc<RuntimeMetricsInner>,
7870 request: CreateStreamRequest,
7871 placement: ShardPlacement,
7872 admission: ColdWriteAdmission,
7873 ) -> Result<CreateStreamResponse, RuntimeError> {
7874 let stream_id = request.stream_id.clone();
7875 let incoming_bytes =
7876 u64::try_from(request.initial_payload.len()).expect("payload len fits u64");
7877 let started_at = Instant::now();
7878 let exec_started_at = Instant::now();
7879 let response = group
7880 .create_stream_with_cold_admission(request, placement, admission)
7881 .await
7882 .map_err(|err| {
7883 record_cold_backpressure_error(
7884 &metrics,
7885 placement,
7886 incoming_bytes,
7887 admission,
7888 &err,
7889 );
7890 RuntimeError::group_engine(placement, err)
7891 })?;
7892 metrics.record_group_engine_exec(
7893 placement.core_id,
7894 placement.raft_group_id,
7895 elapsed_ns(exec_started_at),
7896 );
7897 if !response.already_exists {
7898 metrics.record_applied_mutation(
7899 placement.core_id,
7900 placement.raft_group_id,
7901 elapsed_ns(started_at),
7902 );
7903 record_cold_hot_backlog(group, &metrics, stream_id, placement).await;
7904 }
7905 Ok(response)
7906 }
7907
7908 async fn create_stream_external(
7909 group: &mut Box<dyn GroupEngine>,
7910 metrics: Arc<RuntimeMetricsInner>,
7911 request: CreateStreamExternalRequest,
7912 placement: ShardPlacement,
7913 ) -> Result<CreateStreamResponse, RuntimeError> {
7914 let stream_id = request.stream_id.clone();
7915 let started_at = Instant::now();
7916 let exec_started_at = Instant::now();
7917 let response = group
7918 .create_stream_external(request, placement)
7919 .await
7920 .map_err(|err| RuntimeError::group_engine(placement, err))?;
7921 metrics.record_group_engine_exec(
7922 placement.core_id,
7923 placement.raft_group_id,
7924 elapsed_ns(exec_started_at),
7925 );
7926 if !response.already_exists {
7927 metrics.record_applied_mutation(
7928 placement.core_id,
7929 placement.raft_group_id,
7930 elapsed_ns(started_at),
7931 );
7932 record_cold_hot_backlog(group, &metrics, stream_id, placement).await;
7933 }
7934 Ok(response)
7935 }
7936
7937 async fn apply_append(
7938 group: &mut Box<dyn GroupEngine>,
7939 metrics: Arc<RuntimeMetricsInner>,
7940 read_materialization: Arc<Semaphore>,
7941 read_watchers: &mut ReadWatchers,
7942 request: AppendRequest,
7943 placement: ShardPlacement,
7944 admission: ColdWriteAdmission,
7945 ) -> Result<AppendResponse, RuntimeError> {
7946 let stream_id = request.stream_id.clone();
7947 let incoming_bytes = request.payload_len();
7948 let started_at = Instant::now();
7949 let exec_started_at = Instant::now();
7950 let response = group
7951 .append_with_cold_admission(request, placement, admission)
7952 .await
7953 .map_err(|err| {
7954 record_cold_backpressure_error(
7955 &metrics,
7956 placement,
7957 incoming_bytes,
7958 admission,
7959 &err,
7960 );
7961 RuntimeError::group_engine(placement, err)
7962 })?;
7963 metrics.record_group_engine_exec(
7964 placement.core_id,
7965 placement.raft_group_id,
7966 elapsed_ns(exec_started_at),
7967 );
7968
7969 if !response.deduplicated {
7970 metrics.record_append(placement.core_id, placement.raft_group_id);
7971 metrics.record_applied_mutation(
7972 placement.core_id,
7973 placement.raft_group_id,
7974 elapsed_ns(started_at),
7975 );
7976 record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
7977 Self::notify_read_watchers(
7978 group,
7979 metrics,
7980 read_materialization,
7981 read_watchers,
7982 &stream_id,
7983 placement,
7984 )
7985 .await;
7986 }
7987 Ok(response)
7988 }
7989
7990 async fn apply_append_external(
7991 group: &mut Box<dyn GroupEngine>,
7992 metrics: Arc<RuntimeMetricsInner>,
7993 read_materialization: Arc<Semaphore>,
7994 read_watchers: &mut ReadWatchers,
7995 request: AppendExternalRequest,
7996 placement: ShardPlacement,
7997 ) -> Result<AppendResponse, RuntimeError> {
7998 let stream_id = request.stream_id.clone();
7999 let started_at = Instant::now();
8000 let exec_started_at = Instant::now();
8001 let response = group
8002 .append_external(request, placement)
8003 .await
8004 .map_err(|err| RuntimeError::group_engine(placement, err))?;
8005 metrics.record_group_engine_exec(
8006 placement.core_id,
8007 placement.raft_group_id,
8008 elapsed_ns(exec_started_at),
8009 );
8010
8011 if !response.deduplicated {
8012 metrics.record_append(placement.core_id, placement.raft_group_id);
8013 metrics.record_applied_mutation(
8014 placement.core_id,
8015 placement.raft_group_id,
8016 elapsed_ns(started_at),
8017 );
8018 record_cold_hot_backlog(group, &metrics, stream_id.clone(), placement).await;
8019 Self::notify_read_watchers(
8020 group,
8021 metrics,
8022 read_materialization,
8023 read_watchers,
8024 &stream_id,
8025 placement,
8026 )
8027 .await;
8028 }
8029 Ok(response)
8030 }
8031
8032 fn prepare_append_batch_commands(
8033 batch: Vec<(
8034 AppendBatchRequest,
8035 oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
8036 )>,
8037 ) -> (Vec<GroupWriteCommand>, Vec<PendingAppendBatch>) {
8038 let mut commands = Vec::with_capacity(batch.len());
8039 let mut pending = Vec::with_capacity(batch.len());
8040 for (request, response_tx) in batch {
8041 pending.push(PendingAppendBatch {
8042 stream_id: request.stream_id.clone(),
8043 incoming_bytes: append_batch_payload_bytes(&request),
8044 response_tx,
8045 started_at: Instant::now(),
8046 });
8047 commands.push(GroupWriteCommand::from(request));
8048 }
8049 (commands, pending)
8050 }
8051
8052 fn prepare_append_batch_requests(
8053 batch: Vec<(
8054 AppendBatchRequest,
8055 oneshot::Sender<Result<AppendBatchResponse, RuntimeError>>,
8056 )>,
8057 ) -> (Vec<AppendBatchRequest>, Vec<PendingAppendBatch>) {
8058 let mut requests = Vec::with_capacity(batch.len());
8059 let mut pending = Vec::with_capacity(batch.len());
8060 for (request, response_tx) in batch {
8061 pending.push(PendingAppendBatch {
8062 stream_id: request.stream_id.clone(),
8063 incoming_bytes: append_batch_payload_bytes(&request),
8064 response_tx,
8065 started_at: Instant::now(),
8066 });
8067 requests.push(request);
8068 }
8069 (requests, pending)
8070 }
8071
8072 async fn apply_prepared_append_batch_requests_with_cold_admission(
8073 group: &mut Box<dyn GroupEngine>,
8074 runtime: AppendBatchRuntime,
8075 read_watchers: &mut ReadWatchers,
8076 pending: Vec<PendingAppendBatch>,
8077 requests: Vec<AppendBatchRequest>,
8078 admission: ColdWriteAdmission,
8079 ) {
8080 let exec_started_at = Instant::now();
8081 let responses = group
8082 .append_batch_many_with_cold_admission(requests, runtime.placement, admission)
8083 .await
8084 .map_err(|err| RuntimeError::group_engine(runtime.placement, err));
8085 runtime.metrics.record_group_engine_exec(
8086 runtime.placement.core_id,
8087 runtime.placement.raft_group_id,
8088 elapsed_ns(exec_started_at),
8089 );
8090 Self::finish_append_batch_commands(
8091 group,
8092 runtime,
8093 read_watchers,
8094 pending,
8095 responses,
8096 Some(admission),
8097 )
8098 .await;
8099 }
8100
8101 async fn apply_prepared_append_batch_commands(
8102 group: &mut Box<dyn GroupEngine>,
8103 runtime: AppendBatchRuntime,
8104 read_watchers: &mut ReadWatchers,
8105 pending: Vec<PendingAppendBatch>,
8106 commands: Vec<GroupWriteCommand>,
8107 ) {
8108 let exec_started_at = Instant::now();
8109 let responses = group
8110 .write_batch(commands, runtime.placement)
8111 .await
8112 .map_err(|err| RuntimeError::group_engine(runtime.placement, err));
8113 runtime.metrics.record_group_engine_exec(
8114 runtime.placement.core_id,
8115 runtime.placement.raft_group_id,
8116 elapsed_ns(exec_started_at),
8117 );
8118 Self::finish_append_batch_commands(group, runtime, read_watchers, pending, responses, None)
8119 .await;
8120 }
8121
8122 async fn finish_append_batch_commands(
8123 group: &mut Box<dyn GroupEngine>,
8124 runtime: AppendBatchRuntime,
8125 read_watchers: &mut ReadWatchers,
8126 pending: Vec<PendingAppendBatch>,
8127 responses: Result<Vec<Result<GroupWriteResponse, GroupEngineError>>, RuntimeError>,
8128 admission: Option<ColdWriteAdmission>,
8129 ) {
8130 let placement = runtime.placement;
8131 let responses = match responses {
8132 Ok(responses) => responses,
8133 Err(err) => {
8134 for pending in pending {
8135 if let Some(admission) = admission
8136 && let RuntimeError::GroupEngine { message, .. } = &err
8137 && message.contains("ColdBackpressure")
8138 {
8139 runtime.metrics.record_cold_backpressure(
8140 placement.core_id,
8141 placement.raft_group_id,
8142 pending.incoming_bytes,
8143 admission.max_hot_bytes_per_group.unwrap_or(0),
8144 );
8145 }
8146 let _ = pending.response_tx.send(Err(err.clone()));
8147 }
8148 return;
8149 }
8150 };
8151
8152 if responses.len() != pending.len() {
8153 let err = RuntimeError::GroupEngine {
8154 core_id: placement.core_id,
8155 raft_group_id: placement.raft_group_id,
8156 message: format!(
8157 "batched append response count {} does not match request count {}",
8158 responses.len(),
8159 pending.len()
8160 ),
8161 next_offset: None,
8162 leader_hint: None,
8163 };
8164 for pending in pending {
8165 let _ = pending.response_tx.send(Err(err.clone()));
8166 }
8167 return;
8168 }
8169
8170 for (pending, response) in pending.into_iter().zip(responses) {
8171 let response = match response {
8172 Ok(GroupWriteResponse::AppendBatch(response)) => Ok(response),
8173 Ok(other) => Err(RuntimeError::GroupEngine {
8174 core_id: placement.core_id,
8175 raft_group_id: placement.raft_group_id,
8176 message: format!("unexpected batched append response: {other:?}"),
8177 next_offset: None,
8178 leader_hint: None,
8179 }),
8180 Err(err) => Err(RuntimeError::group_engine(placement, err)),
8181 };
8182
8183 match response {
8184 Ok(response) => {
8185 let success_count = response
8186 .items
8187 .iter()
8188 .filter(|item| matches!(item, Ok(response) if !response.deduplicated))
8189 .count();
8190 if success_count > 0 {
8191 let success_count = u64::try_from(success_count).expect("count fits u64");
8192 runtime.metrics.record_append_batch(
8193 placement.core_id,
8194 placement.raft_group_id,
8195 success_count,
8196 );
8197 runtime.metrics.record_applied_mutation_batch(
8198 placement.core_id,
8199 placement.raft_group_id,
8200 success_count,
8201 elapsed_ns(pending.started_at),
8202 );
8203 Self::notify_read_watchers(
8204 group,
8205 runtime.metrics.clone(),
8206 runtime.read_materialization.clone(),
8207 read_watchers,
8208 &pending.stream_id,
8209 placement,
8210 )
8211 .await;
8212 }
8213
8214 let items = response
8215 .items
8216 .into_iter()
8217 .map(|item| item.map_err(|err| RuntimeError::group_engine(placement, err)))
8218 .collect();
8219 let _ = pending
8220 .response_tx
8221 .send(Ok(AppendBatchResponse { placement, items }));
8222 }
8223 Err(err) => {
8224 if let Some(admission) = admission
8225 && let RuntimeError::GroupEngine { message, .. } = &err
8226 && message.contains("ColdBackpressure")
8227 {
8228 runtime.metrics.record_cold_backpressure(
8229 placement.core_id,
8230 placement.raft_group_id,
8231 pending.incoming_bytes,
8232 admission.max_hot_bytes_per_group.unwrap_or(0),
8233 );
8234 }
8235 let _ = pending.response_tx.send(Err(err));
8236 }
8237 }
8238 }
8239 }
8240
8241 async fn notify_read_watchers(
8242 group: &mut Box<dyn GroupEngine>,
8243 metrics: Arc<RuntimeMetricsInner>,
8244 read_materialization: Arc<Semaphore>,
8245 read_watchers: &mut ReadWatchers,
8246 stream_id: &BucketStreamId,
8247 placement: ShardPlacement,
8248 ) {
8249 let Some(watchers) = read_watchers.remove(stream_id) else {
8250 return;
8251 };
8252 metrics.record_read_watchers_removed(placement.core_id, watchers.len());
8253
8254 let mut request_groups: Vec<(ReadStreamRequest, Vec<ReadWatcher>)> = Vec::new();
8255 for watcher in watchers {
8256 if let Some((_, grouped)) = request_groups
8257 .iter_mut()
8258 .find(|(request, _)| *request == watcher.request)
8259 {
8260 grouped.push(watcher);
8261 } else {
8262 request_groups.push((watcher.request.clone(), vec![watcher]));
8263 }
8264 }
8265
8266 let mut pending = Vec::new();
8267 for (request, watchers) in request_groups {
8268 let parts = group
8269 .read_stream_parts(request, placement)
8270 .await
8271 .map_err(|err| RuntimeError::group_engine(placement, err));
8272 match parts {
8273 Ok(parts) if parts.payload_is_empty() && parts.up_to_date && !parts.closed => {
8274 pending.extend(watchers);
8275 }
8276 Ok(parts) => {
8277 Self::send_read_parts_to_watchers(
8278 placement,
8279 read_materialization.clone(),
8280 parts,
8281 watchers,
8282 );
8283 }
8284 Err(err) => {
8285 for watcher in watchers {
8286 let _ = watcher.response_tx.send(Err(err.clone()));
8287 }
8288 }
8289 }
8290 }
8291
8292 if !pending.is_empty() {
8293 metrics.record_read_watchers_added(placement.core_id, pending.len());
8294 read_watchers
8295 .entry(stream_id.clone())
8296 .or_default()
8297 .extend(pending);
8298 }
8299 }
8300}
8301
/// Handle to the runtime's metric counters.
///
/// The counters live in a `RuntimeMetricsInner` held through an `Arc`, so
/// cloning this handle shares the same counters rather than copying them.
#[derive(Debug, Clone)]
pub struct RuntimeMetrics {
    inner: Arc<RuntimeMetricsInner>,
}
8306
8307impl RuntimeMetrics {
8308 pub fn snapshot(&self) -> RuntimeMetricsSnapshot {
8309 let per_core_appends = self
8310 .inner
8311 .per_core_appends
8312 .iter()
8313 .map(PaddedAtomicU64::load_relaxed)
8314 .collect::<Vec<_>>();
8315 let accepted_appends = per_core_appends.iter().sum();
8316 let per_group_appends = self
8317 .inner
8318 .per_group_appends
8319 .iter()
8320 .map(PaddedAtomicU64::load_relaxed)
8321 .collect();
8322 let per_core_applied_mutations = self
8323 .inner
8324 .per_core_applied_mutations
8325 .iter()
8326 .map(PaddedAtomicU64::load_relaxed)
8327 .collect::<Vec<_>>();
8328 let applied_mutations = per_core_applied_mutations.iter().sum();
8329 let per_group_applied_mutations = self
8330 .inner
8331 .per_group_applied_mutations
8332 .iter()
8333 .map(PaddedAtomicU64::load_relaxed)
8334 .collect();
8335 let per_core_mutation_apply_ns = self
8336 .inner
8337 .per_core_mutation_apply_ns
8338 .iter()
8339 .map(PaddedAtomicU64::load_relaxed)
8340 .collect::<Vec<_>>();
8341 let mutation_apply_ns = per_core_mutation_apply_ns.iter().sum();
8342 let per_group_mutation_apply_ns = self
8343 .inner
8344 .per_group_mutation_apply_ns
8345 .iter()
8346 .map(PaddedAtomicU64::load_relaxed)
8347 .collect();
8348 let per_core_group_lock_wait_ns = self
8349 .inner
8350 .per_core_group_lock_wait_ns
8351 .iter()
8352 .map(PaddedAtomicU64::load_relaxed)
8353 .collect::<Vec<_>>();
8354 let group_lock_wait_ns = per_core_group_lock_wait_ns.iter().sum();
8355 let per_group_group_lock_wait_ns = self
8356 .inner
8357 .per_group_group_lock_wait_ns
8358 .iter()
8359 .map(PaddedAtomicU64::load_relaxed)
8360 .collect();
8361 let per_core_group_engine_exec_ns = self
8362 .inner
8363 .per_core_group_engine_exec_ns
8364 .iter()
8365 .map(PaddedAtomicU64::load_relaxed)
8366 .collect::<Vec<_>>();
8367 let group_engine_exec_ns = per_core_group_engine_exec_ns.iter().sum();
8368 let per_group_group_engine_exec_ns = self
8369 .inner
8370 .per_group_group_engine_exec_ns
8371 .iter()
8372 .map(PaddedAtomicU64::load_relaxed)
8373 .collect();
8374 let per_group_group_mailbox_depth = self
8375 .inner
8376 .per_group_group_mailbox_depth
8377 .iter()
8378 .map(PaddedAtomicU64::load_relaxed)
8379 .collect::<Vec<_>>();
8380 let group_mailbox_depth = per_group_group_mailbox_depth.iter().sum();
8381 let per_group_group_mailbox_max_depth = self
8382 .inner
8383 .per_group_group_mailbox_max_depth
8384 .iter()
8385 .map(PaddedAtomicU64::load_relaxed)
8386 .collect::<Vec<_>>();
8387 let group_mailbox_max_depth = per_group_group_mailbox_max_depth
8388 .iter()
8389 .copied()
8390 .max()
8391 .unwrap_or(0);
8392 let per_group_group_mailbox_full_events = self
8393 .inner
8394 .per_group_group_mailbox_full_events
8395 .iter()
8396 .map(PaddedAtomicU64::load_relaxed)
8397 .collect::<Vec<_>>();
8398 let group_mailbox_full_events = per_group_group_mailbox_full_events.iter().sum();
8399 let per_core_raft_write_many_batches = self
8400 .inner
8401 .per_core_raft_write_many_batches
8402 .iter()
8403 .map(PaddedAtomicU64::load_relaxed)
8404 .collect::<Vec<_>>();
8405 let raft_write_many_batches = per_core_raft_write_many_batches.iter().sum();
8406 let per_group_raft_write_many_batches = self
8407 .inner
8408 .per_group_raft_write_many_batches
8409 .iter()
8410 .map(PaddedAtomicU64::load_relaxed)
8411 .collect();
8412 let per_core_raft_write_many_commands = self
8413 .inner
8414 .per_core_raft_write_many_commands
8415 .iter()
8416 .map(PaddedAtomicU64::load_relaxed)
8417 .collect::<Vec<_>>();
8418 let raft_write_many_commands = per_core_raft_write_many_commands.iter().sum();
8419 let per_group_raft_write_many_commands = self
8420 .inner
8421 .per_group_raft_write_many_commands
8422 .iter()
8423 .map(PaddedAtomicU64::load_relaxed)
8424 .collect();
8425 let per_core_raft_write_many_logical_commands = self
8426 .inner
8427 .per_core_raft_write_many_logical_commands
8428 .iter()
8429 .map(PaddedAtomicU64::load_relaxed)
8430 .collect::<Vec<_>>();
8431 let raft_write_many_logical_commands =
8432 per_core_raft_write_many_logical_commands.iter().sum();
8433 let per_group_raft_write_many_logical_commands = self
8434 .inner
8435 .per_group_raft_write_many_logical_commands
8436 .iter()
8437 .map(PaddedAtomicU64::load_relaxed)
8438 .collect();
8439 let per_core_raft_write_many_responses = self
8440 .inner
8441 .per_core_raft_write_many_responses
8442 .iter()
8443 .map(PaddedAtomicU64::load_relaxed)
8444 .collect::<Vec<_>>();
8445 let raft_write_many_responses = per_core_raft_write_many_responses.iter().sum();
8446 let per_group_raft_write_many_responses = self
8447 .inner
8448 .per_group_raft_write_many_responses
8449 .iter()
8450 .map(PaddedAtomicU64::load_relaxed)
8451 .collect();
8452 let per_core_raft_write_many_submit_ns = self
8453 .inner
8454 .per_core_raft_write_many_submit_ns
8455 .iter()
8456 .map(PaddedAtomicU64::load_relaxed)
8457 .collect::<Vec<_>>();
8458 let raft_write_many_submit_ns = per_core_raft_write_many_submit_ns.iter().sum();
8459 let per_group_raft_write_many_submit_ns = self
8460 .inner
8461 .per_group_raft_write_many_submit_ns
8462 .iter()
8463 .map(PaddedAtomicU64::load_relaxed)
8464 .collect();
8465 let per_core_raft_write_many_response_ns = self
8466 .inner
8467 .per_core_raft_write_many_response_ns
8468 .iter()
8469 .map(PaddedAtomicU64::load_relaxed)
8470 .collect::<Vec<_>>();
8471 let raft_write_many_response_ns = per_core_raft_write_many_response_ns.iter().sum();
8472 let per_group_raft_write_many_response_ns = self
8473 .inner
8474 .per_group_raft_write_many_response_ns
8475 .iter()
8476 .map(PaddedAtomicU64::load_relaxed)
8477 .collect();
8478 let per_core_raft_apply_entries = self
8479 .inner
8480 .per_core_raft_apply_entries
8481 .iter()
8482 .map(PaddedAtomicU64::load_relaxed)
8483 .collect::<Vec<_>>();
8484 let raft_apply_entries = per_core_raft_apply_entries.iter().sum();
8485 let per_group_raft_apply_entries = self
8486 .inner
8487 .per_group_raft_apply_entries
8488 .iter()
8489 .map(PaddedAtomicU64::load_relaxed)
8490 .collect();
8491 let per_core_raft_apply_ns = self
8492 .inner
8493 .per_core_raft_apply_ns
8494 .iter()
8495 .map(PaddedAtomicU64::load_relaxed)
8496 .collect::<Vec<_>>();
8497 let raft_apply_ns = per_core_raft_apply_ns.iter().sum();
8498 let per_group_raft_apply_ns = self
8499 .inner
8500 .per_group_raft_apply_ns
8501 .iter()
8502 .map(PaddedAtomicU64::load_relaxed)
8503 .collect();
8504 let per_core_live_read_waiters = self
8505 .inner
8506 .per_core_live_read_waiters
8507 .iter()
8508 .map(PaddedAtomicU64::load_relaxed)
8509 .collect::<Vec<_>>();
8510 let live_read_waiters = per_core_live_read_waiters.iter().sum();
8511 let per_core_live_read_backpressure_events = self
8512 .inner
8513 .per_core_live_read_backpressure_events
8514 .iter()
8515 .map(PaddedAtomicU64::load_relaxed)
8516 .collect::<Vec<_>>();
8517 let live_read_backpressure_events = per_core_live_read_backpressure_events.iter().sum();
8518 let per_core_routed_requests = self
8519 .inner
8520 .per_core_routed_requests
8521 .iter()
8522 .map(PaddedAtomicU64::load_relaxed)
8523 .collect::<Vec<_>>();
8524 let routed_requests = per_core_routed_requests.iter().sum();
8525 let per_core_mailbox_send_wait_ns = self
8526 .inner
8527 .per_core_mailbox_send_wait_ns
8528 .iter()
8529 .map(PaddedAtomicU64::load_relaxed)
8530 .collect::<Vec<_>>();
8531 let mailbox_send_wait_ns = per_core_mailbox_send_wait_ns.iter().sum();
8532 let per_core_mailbox_full_events = self
8533 .inner
8534 .per_core_mailbox_full_events
8535 .iter()
8536 .map(PaddedAtomicU64::load_relaxed)
8537 .collect::<Vec<_>>();
8538 let mailbox_full_events = per_core_mailbox_full_events.iter().sum();
8539 let per_core_wal_batches = self
8540 .inner
8541 .per_core_wal_batches
8542 .iter()
8543 .map(PaddedAtomicU64::load_relaxed)
8544 .collect::<Vec<_>>();
8545 let wal_batches = per_core_wal_batches.iter().sum();
8546 let per_group_wal_batches = self
8547 .inner
8548 .per_group_wal_batches
8549 .iter()
8550 .map(PaddedAtomicU64::load_relaxed)
8551 .collect();
8552 let per_core_wal_records = self
8553 .inner
8554 .per_core_wal_records
8555 .iter()
8556 .map(PaddedAtomicU64::load_relaxed)
8557 .collect::<Vec<_>>();
8558 let wal_records = per_core_wal_records.iter().sum();
8559 let per_group_wal_records = self
8560 .inner
8561 .per_group_wal_records
8562 .iter()
8563 .map(PaddedAtomicU64::load_relaxed)
8564 .collect();
8565 let per_core_wal_write_ns = self
8566 .inner
8567 .per_core_wal_write_ns
8568 .iter()
8569 .map(PaddedAtomicU64::load_relaxed)
8570 .collect::<Vec<_>>();
8571 let wal_write_ns = per_core_wal_write_ns.iter().sum();
8572 let per_group_wal_write_ns = self
8573 .inner
8574 .per_group_wal_write_ns
8575 .iter()
8576 .map(PaddedAtomicU64::load_relaxed)
8577 .collect();
8578 let per_core_wal_sync_ns = self
8579 .inner
8580 .per_core_wal_sync_ns
8581 .iter()
8582 .map(PaddedAtomicU64::load_relaxed)
8583 .collect::<Vec<_>>();
8584 let wal_sync_ns = per_core_wal_sync_ns.iter().sum();
8585 let per_group_wal_sync_ns = self
8586 .inner
8587 .per_group_wal_sync_ns
8588 .iter()
8589 .map(PaddedAtomicU64::load_relaxed)
8590 .collect();
8591 let cold_flush_uploads = self.inner.cold_flush_uploads.load_relaxed();
8592 let cold_flush_upload_bytes = self.inner.cold_flush_upload_bytes.load_relaxed();
8593 let cold_flush_upload_ns = self.inner.cold_flush_upload_ns.load_relaxed();
8594 let cold_flush_publishes = self.inner.cold_flush_publishes.load_relaxed();
8595 let cold_flush_publish_bytes = self.inner.cold_flush_publish_bytes.load_relaxed();
8596 let cold_flush_publish_ns = self.inner.cold_flush_publish_ns.load_relaxed();
8597 let cold_orphan_cleanup_attempts = self.inner.cold_orphan_cleanup_attempts.load_relaxed();
8598 let cold_orphan_cleanup_errors = self.inner.cold_orphan_cleanup_errors.load_relaxed();
8599 let cold_orphan_bytes = self.inner.cold_orphan_bytes.load_relaxed();
8600 let per_group_cold_hot_bytes = self
8601 .inner
8602 .per_group_cold_hot_bytes
8603 .iter()
8604 .map(PaddedAtomicU64::load_relaxed)
8605 .collect::<Vec<_>>();
8606 let cold_hot_bytes = per_group_cold_hot_bytes.iter().sum();
8607 let per_group_cold_hot_bytes_max = self
8608 .inner
8609 .per_group_cold_hot_bytes_max
8610 .iter()
8611 .map(PaddedAtomicU64::load_relaxed)
8612 .collect::<Vec<_>>();
8613 let cold_hot_group_bytes_max = per_group_cold_hot_bytes_max
8614 .iter()
8615 .copied()
8616 .max()
8617 .unwrap_or(0);
8618 let cold_hot_stream_bytes_max = self.inner.cold_hot_stream_bytes_max.load_relaxed();
8619 let per_core_cold_backpressure_events = self
8620 .inner
8621 .per_core_cold_backpressure_events
8622 .iter()
8623 .map(PaddedAtomicU64::load_relaxed)
8624 .collect::<Vec<_>>();
8625 let cold_backpressure_events = per_core_cold_backpressure_events.iter().sum();
8626 let per_group_cold_backpressure_events = self
8627 .inner
8628 .per_group_cold_backpressure_events
8629 .iter()
8630 .map(PaddedAtomicU64::load_relaxed)
8631 .collect();
8632 let cold_backpressure_bytes = self.inner.cold_backpressure_bytes.load_relaxed();
8633
8634 RuntimeMetricsSnapshot {
8635 accepted_appends,
8636 per_core_appends,
8637 per_group_appends,
8638 applied_mutations,
8639 per_core_applied_mutations,
8640 per_group_applied_mutations,
8641 mutation_apply_ns,
8642 per_core_mutation_apply_ns,
8643 per_group_mutation_apply_ns,
8644 group_lock_wait_ns,
8645 per_core_group_lock_wait_ns,
8646 per_group_group_lock_wait_ns,
8647 group_engine_exec_ns,
8648 per_core_group_engine_exec_ns,
8649 per_group_group_engine_exec_ns,
8650 group_mailbox_depth,
8651 per_group_group_mailbox_depth,
8652 group_mailbox_max_depth,
8653 per_group_group_mailbox_max_depth,
8654 group_mailbox_full_events,
8655 per_group_group_mailbox_full_events,
8656 raft_write_many_batches,
8657 per_core_raft_write_many_batches,
8658 per_group_raft_write_many_batches,
8659 raft_write_many_commands,
8660 per_core_raft_write_many_commands,
8661 per_group_raft_write_many_commands,
8662 raft_write_many_logical_commands,
8663 per_core_raft_write_many_logical_commands,
8664 per_group_raft_write_many_logical_commands,
8665 raft_write_many_responses,
8666 per_core_raft_write_many_responses,
8667 per_group_raft_write_many_responses,
8668 raft_write_many_submit_ns,
8669 per_core_raft_write_many_submit_ns,
8670 per_group_raft_write_many_submit_ns,
8671 raft_write_many_response_ns,
8672 per_core_raft_write_many_response_ns,
8673 per_group_raft_write_many_response_ns,
8674 raft_apply_entries,
8675 per_core_raft_apply_entries,
8676 per_group_raft_apply_entries,
8677 raft_apply_ns,
8678 per_core_raft_apply_ns,
8679 per_group_raft_apply_ns,
8680 live_read_waiters,
8681 per_core_live_read_waiters,
8682 live_read_backpressure_events,
8683 per_core_live_read_backpressure_events,
8684 routed_requests,
8685 per_core_routed_requests,
8686 mailbox_send_wait_ns,
8687 per_core_mailbox_send_wait_ns,
8688 mailbox_full_events,
8689 per_core_mailbox_full_events,
8690 wal_batches,
8691 per_core_wal_batches,
8692 per_group_wal_batches,
8693 wal_records,
8694 per_core_wal_records,
8695 per_group_wal_records,
8696 wal_write_ns,
8697 per_core_wal_write_ns,
8698 per_group_wal_write_ns,
8699 wal_sync_ns,
8700 per_core_wal_sync_ns,
8701 per_group_wal_sync_ns,
8702 cold_flush_uploads,
8703 cold_flush_upload_bytes,
8704 cold_flush_upload_ns,
8705 cold_flush_publishes,
8706 cold_flush_publish_bytes,
8707 cold_flush_publish_ns,
8708 cold_orphan_cleanup_attempts,
8709 cold_orphan_cleanup_errors,
8710 cold_orphan_bytes,
8711 cold_hot_bytes,
8712 per_group_cold_hot_bytes,
8713 cold_hot_group_bytes_max,
8714 per_group_cold_hot_bytes_max,
8715 cold_hot_stream_bytes_max,
8716 cold_backpressure_events,
8717 per_core_cold_backpressure_events,
8718 per_group_cold_backpressure_events,
8719 cold_backpressure_bytes,
8720 }
8721 }
8722}
8723
/// Plain-value copy of the runtime counters, as built by
/// `RuntimeMetrics::snapshot`.
///
/// `per_core_*` and `per_group_*` vectors hold one value per counter slot of
/// the matching vector in `RuntimeMetricsInner`. Unless noted otherwise, a
/// scalar that has a breakdown is the sum of that breakdown as computed by
/// `snapshot`.
// NOTE(review): fields ending in `_ns` presumably hold accumulated
// nanoseconds — confirm against the recording methods.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuntimeMetricsSnapshot {
    // --- Appends and applied mutations (totals are sums of the per-core values) ---
    pub accepted_appends: u64,
    pub per_core_appends: Vec<u64>,
    pub per_group_appends: Vec<u64>,
    pub applied_mutations: u64,
    pub per_core_applied_mutations: Vec<u64>,
    pub per_group_applied_mutations: Vec<u64>,
    pub mutation_apply_ns: u64,
    pub per_core_mutation_apply_ns: Vec<u64>,
    pub per_group_mutation_apply_ns: Vec<u64>,
    // --- Group engine (totals are sums of the per-core values) ---
    pub group_lock_wait_ns: u64,
    pub per_core_group_lock_wait_ns: Vec<u64>,
    pub per_group_group_lock_wait_ns: Vec<u64>,
    pub group_engine_exec_ns: u64,
    pub per_core_group_engine_exec_ns: Vec<u64>,
    pub per_group_group_engine_exec_ns: Vec<u64>,
    // --- Group mailboxes (tracked per group only) ---
    /// Sum of `per_group_group_mailbox_depth`.
    pub group_mailbox_depth: u64,
    pub per_group_group_mailbox_depth: Vec<u64>,
    /// Largest value in `per_group_group_mailbox_max_depth`; 0 when empty.
    pub group_mailbox_max_depth: u64,
    pub per_group_group_mailbox_max_depth: Vec<u64>,
    /// Sum of `per_group_group_mailbox_full_events`.
    pub group_mailbox_full_events: u64,
    pub per_group_group_mailbox_full_events: Vec<u64>,
    // --- Raft write-many and apply (totals are sums of the per-core values) ---
    pub raft_write_many_batches: u64,
    pub per_core_raft_write_many_batches: Vec<u64>,
    pub per_group_raft_write_many_batches: Vec<u64>,
    pub raft_write_many_commands: u64,
    pub per_core_raft_write_many_commands: Vec<u64>,
    pub per_group_raft_write_many_commands: Vec<u64>,
    pub raft_write_many_logical_commands: u64,
    pub per_core_raft_write_many_logical_commands: Vec<u64>,
    pub per_group_raft_write_many_logical_commands: Vec<u64>,
    pub raft_write_many_responses: u64,
    pub per_core_raft_write_many_responses: Vec<u64>,
    pub per_group_raft_write_many_responses: Vec<u64>,
    pub raft_write_many_submit_ns: u64,
    pub per_core_raft_write_many_submit_ns: Vec<u64>,
    pub per_group_raft_write_many_submit_ns: Vec<u64>,
    pub raft_write_many_response_ns: u64,
    pub per_core_raft_write_many_response_ns: Vec<u64>,
    pub per_group_raft_write_many_response_ns: Vec<u64>,
    pub raft_apply_entries: u64,
    pub per_core_raft_apply_entries: Vec<u64>,
    pub per_group_raft_apply_entries: Vec<u64>,
    pub raft_apply_ns: u64,
    pub per_core_raft_apply_ns: Vec<u64>,
    pub per_group_raft_apply_ns: Vec<u64>,
    // --- Live reads, routing and core mailboxes (tracked per core only) ---
    pub live_read_waiters: u64,
    pub per_core_live_read_waiters: Vec<u64>,
    pub live_read_backpressure_events: u64,
    pub per_core_live_read_backpressure_events: Vec<u64>,
    pub routed_requests: u64,
    pub per_core_routed_requests: Vec<u64>,
    pub mailbox_send_wait_ns: u64,
    pub per_core_mailbox_send_wait_ns: Vec<u64>,
    pub mailbox_full_events: u64,
    pub per_core_mailbox_full_events: Vec<u64>,
    // --- WAL (totals are sums of the per-core values) ---
    pub wal_batches: u64,
    pub per_core_wal_batches: Vec<u64>,
    pub per_group_wal_batches: Vec<u64>,
    pub wal_records: u64,
    pub per_core_wal_records: Vec<u64>,
    pub per_group_wal_records: Vec<u64>,
    pub wal_write_ns: u64,
    pub per_core_wal_write_ns: Vec<u64>,
    pub per_group_wal_write_ns: Vec<u64>,
    pub wal_sync_ns: u64,
    pub per_core_wal_sync_ns: Vec<u64>,
    pub per_group_wal_sync_ns: Vec<u64>,
    // --- Cold storage: each scalar below is read from a single counter ---
    pub cold_flush_uploads: u64,
    pub cold_flush_upload_bytes: u64,
    pub cold_flush_upload_ns: u64,
    pub cold_flush_publishes: u64,
    pub cold_flush_publish_bytes: u64,
    pub cold_flush_publish_ns: u64,
    pub cold_orphan_cleanup_attempts: u64,
    pub cold_orphan_cleanup_errors: u64,
    pub cold_orphan_bytes: u64,
    /// Sum of `per_group_cold_hot_bytes`.
    pub cold_hot_bytes: u64,
    pub per_group_cold_hot_bytes: Vec<u64>,
    /// Largest value in `per_group_cold_hot_bytes_max`; 0 when empty.
    pub cold_hot_group_bytes_max: u64,
    pub per_group_cold_hot_bytes_max: Vec<u64>,
    /// Read from a single counter; it has no per-group breakdown.
    pub cold_hot_stream_bytes_max: u64,
    /// Sum of `per_core_cold_backpressure_events`.
    pub cold_backpressure_events: u64,
    pub per_core_cold_backpressure_events: Vec<u64>,
    pub per_group_cold_backpressure_events: Vec<u64>,
    /// Read from a single counter; it has no breakdown.
    pub cold_backpressure_bytes: u64,
}
8812
/// Snapshot of mailbox occupancy: current depths and their capacities.
// NOTE(review): presumably one entry per mailbox, with `depths[i]` and
// `capacities[i]` describing the same mailbox — confirm against the code
// that builds this value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuntimeMailboxSnapshot {
    pub depths: Vec<usize>,
    pub capacities: Vec<usize>,
}
8818
/// Counter storage shared behind `RuntimeMetrics`.
///
/// `per_core_*` vectors are created with one counter per core and
/// `per_group_*` vectors with one counter per raft group (see `new`). The
/// recording methods index them with `CoreId.0` and `RaftGroupId.0`
/// respectively. Fields that are a bare `PaddedAtomicU64` are single
/// counters with no breakdown.
#[derive(Debug)]
struct RuntimeMetricsInner {
    // --- Appends and applied mutations ---
    per_core_appends: Vec<PaddedAtomicU64>,
    per_group_appends: Vec<PaddedAtomicU64>,
    per_core_applied_mutations: Vec<PaddedAtomicU64>,
    per_group_applied_mutations: Vec<PaddedAtomicU64>,
    per_core_mutation_apply_ns: Vec<PaddedAtomicU64>,
    per_group_mutation_apply_ns: Vec<PaddedAtomicU64>,
    // --- Group engine ---
    per_core_group_lock_wait_ns: Vec<PaddedAtomicU64>,
    per_group_group_lock_wait_ns: Vec<PaddedAtomicU64>,
    per_core_group_engine_exec_ns: Vec<PaddedAtomicU64>,
    per_group_group_engine_exec_ns: Vec<PaddedAtomicU64>,
    // --- Group mailboxes (per group only) ---
    per_group_group_mailbox_depth: Vec<PaddedAtomicU64>,
    per_group_group_mailbox_max_depth: Vec<PaddedAtomicU64>,
    per_group_group_mailbox_full_events: Vec<PaddedAtomicU64>,
    // --- Raft write-many and apply ---
    per_core_raft_write_many_batches: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_batches: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_commands: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_commands: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_logical_commands: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_logical_commands: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_responses: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_responses: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_submit_ns: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_submit_ns: Vec<PaddedAtomicU64>,
    per_core_raft_write_many_response_ns: Vec<PaddedAtomicU64>,
    per_group_raft_write_many_response_ns: Vec<PaddedAtomicU64>,
    per_core_raft_apply_entries: Vec<PaddedAtomicU64>,
    per_group_raft_apply_entries: Vec<PaddedAtomicU64>,
    per_core_raft_apply_ns: Vec<PaddedAtomicU64>,
    per_group_raft_apply_ns: Vec<PaddedAtomicU64>,
    // --- Live reads, routing and core mailboxes (per core only) ---
    per_core_live_read_waiters: Vec<PaddedAtomicU64>,
    per_core_live_read_backpressure_events: Vec<PaddedAtomicU64>,
    per_core_routed_requests: Vec<PaddedAtomicU64>,
    per_core_mailbox_send_wait_ns: Vec<PaddedAtomicU64>,
    per_core_mailbox_full_events: Vec<PaddedAtomicU64>,
    // --- WAL ---
    per_core_wal_batches: Vec<PaddedAtomicU64>,
    per_group_wal_batches: Vec<PaddedAtomicU64>,
    per_core_wal_records: Vec<PaddedAtomicU64>,
    per_group_wal_records: Vec<PaddedAtomicU64>,
    per_core_wal_write_ns: Vec<PaddedAtomicU64>,
    per_group_wal_write_ns: Vec<PaddedAtomicU64>,
    per_core_wal_sync_ns: Vec<PaddedAtomicU64>,
    per_group_wal_sync_ns: Vec<PaddedAtomicU64>,
    // --- Cold storage ---
    cold_flush_uploads: PaddedAtomicU64,
    cold_flush_upload_bytes: PaddedAtomicU64,
    cold_flush_upload_ns: PaddedAtomicU64,
    cold_flush_publishes: PaddedAtomicU64,
    cold_flush_publish_bytes: PaddedAtomicU64,
    cold_flush_publish_ns: PaddedAtomicU64,
    cold_orphan_cleanup_attempts: PaddedAtomicU64,
    cold_orphan_cleanup_errors: PaddedAtomicU64,
    cold_orphan_bytes: PaddedAtomicU64,
    per_group_cold_hot_bytes: Vec<PaddedAtomicU64>,
    per_group_cold_hot_bytes_max: Vec<PaddedAtomicU64>,
    cold_hot_stream_bytes_max: PaddedAtomicU64,
    per_core_cold_backpressure_events: Vec<PaddedAtomicU64>,
    per_group_cold_backpressure_events: Vec<PaddedAtomicU64>,
    cold_backpressure_bytes: PaddedAtomicU64,
}
8879
/// One set of measurements for a raft write-many call: three counts and two
/// durations.
// NOTE(review): presumably consumed by a recorder that feeds the
// `raft_write_many_*` counters, with the `_ns` fields in nanoseconds —
// confirm against the recording method.
#[derive(Debug, Clone, Copy)]
struct RaftWriteManySample {
    command_count: u64,
    logical_command_count: u64,
    response_count: u64,
    submit_ns: u64,
    response_ns: u64,
}
8888
8889impl RuntimeMetricsInner {
8890 fn new(core_count: usize, raft_group_count: usize) -> Self {
8891 Self {
8892 per_core_appends: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8893 per_group_appends: (0..raft_group_count)
8894 .map(|_| PaddedAtomicU64::new(0))
8895 .collect(),
8896 per_core_applied_mutations: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8897 per_group_applied_mutations: (0..raft_group_count)
8898 .map(|_| PaddedAtomicU64::new(0))
8899 .collect(),
8900 per_core_mutation_apply_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8901 per_group_mutation_apply_ns: (0..raft_group_count)
8902 .map(|_| PaddedAtomicU64::new(0))
8903 .collect(),
8904 per_core_group_lock_wait_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8905 per_group_group_lock_wait_ns: (0..raft_group_count)
8906 .map(|_| PaddedAtomicU64::new(0))
8907 .collect(),
8908 per_core_group_engine_exec_ns: (0..core_count)
8909 .map(|_| PaddedAtomicU64::new(0))
8910 .collect(),
8911 per_group_group_engine_exec_ns: (0..raft_group_count)
8912 .map(|_| PaddedAtomicU64::new(0))
8913 .collect(),
8914 per_group_group_mailbox_depth: (0..raft_group_count)
8915 .map(|_| PaddedAtomicU64::new(0))
8916 .collect(),
8917 per_group_group_mailbox_max_depth: (0..raft_group_count)
8918 .map(|_| PaddedAtomicU64::new(0))
8919 .collect(),
8920 per_group_group_mailbox_full_events: (0..raft_group_count)
8921 .map(|_| PaddedAtomicU64::new(0))
8922 .collect(),
8923 per_core_raft_write_many_batches: (0..core_count)
8924 .map(|_| PaddedAtomicU64::new(0))
8925 .collect(),
8926 per_group_raft_write_many_batches: (0..raft_group_count)
8927 .map(|_| PaddedAtomicU64::new(0))
8928 .collect(),
8929 per_core_raft_write_many_commands: (0..core_count)
8930 .map(|_| PaddedAtomicU64::new(0))
8931 .collect(),
8932 per_group_raft_write_many_commands: (0..raft_group_count)
8933 .map(|_| PaddedAtomicU64::new(0))
8934 .collect(),
8935 per_core_raft_write_many_logical_commands: (0..core_count)
8936 .map(|_| PaddedAtomicU64::new(0))
8937 .collect(),
8938 per_group_raft_write_many_logical_commands: (0..raft_group_count)
8939 .map(|_| PaddedAtomicU64::new(0))
8940 .collect(),
8941 per_core_raft_write_many_responses: (0..core_count)
8942 .map(|_| PaddedAtomicU64::new(0))
8943 .collect(),
8944 per_group_raft_write_many_responses: (0..raft_group_count)
8945 .map(|_| PaddedAtomicU64::new(0))
8946 .collect(),
8947 per_core_raft_write_many_submit_ns: (0..core_count)
8948 .map(|_| PaddedAtomicU64::new(0))
8949 .collect(),
8950 per_group_raft_write_many_submit_ns: (0..raft_group_count)
8951 .map(|_| PaddedAtomicU64::new(0))
8952 .collect(),
8953 per_core_raft_write_many_response_ns: (0..core_count)
8954 .map(|_| PaddedAtomicU64::new(0))
8955 .collect(),
8956 per_group_raft_write_many_response_ns: (0..raft_group_count)
8957 .map(|_| PaddedAtomicU64::new(0))
8958 .collect(),
8959 per_core_raft_apply_entries: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8960 per_group_raft_apply_entries: (0..raft_group_count)
8961 .map(|_| PaddedAtomicU64::new(0))
8962 .collect(),
8963 per_core_raft_apply_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8964 per_group_raft_apply_ns: (0..raft_group_count)
8965 .map(|_| PaddedAtomicU64::new(0))
8966 .collect(),
8967 per_core_live_read_waiters: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8968 per_core_live_read_backpressure_events: (0..core_count)
8969 .map(|_| PaddedAtomicU64::new(0))
8970 .collect(),
8971 per_core_routed_requests: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8972 per_core_mailbox_send_wait_ns: (0..core_count)
8973 .map(|_| PaddedAtomicU64::new(0))
8974 .collect(),
8975 per_core_mailbox_full_events: (0..core_count)
8976 .map(|_| PaddedAtomicU64::new(0))
8977 .collect(),
8978 per_core_wal_batches: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8979 per_group_wal_batches: (0..raft_group_count)
8980 .map(|_| PaddedAtomicU64::new(0))
8981 .collect(),
8982 per_core_wal_records: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8983 per_group_wal_records: (0..raft_group_count)
8984 .map(|_| PaddedAtomicU64::new(0))
8985 .collect(),
8986 per_core_wal_write_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8987 per_group_wal_write_ns: (0..raft_group_count)
8988 .map(|_| PaddedAtomicU64::new(0))
8989 .collect(),
8990 per_core_wal_sync_ns: (0..core_count).map(|_| PaddedAtomicU64::new(0)).collect(),
8991 per_group_wal_sync_ns: (0..raft_group_count)
8992 .map(|_| PaddedAtomicU64::new(0))
8993 .collect(),
8994 cold_flush_uploads: PaddedAtomicU64::new(0),
8995 cold_flush_upload_bytes: PaddedAtomicU64::new(0),
8996 cold_flush_upload_ns: PaddedAtomicU64::new(0),
8997 cold_flush_publishes: PaddedAtomicU64::new(0),
8998 cold_flush_publish_bytes: PaddedAtomicU64::new(0),
8999 cold_flush_publish_ns: PaddedAtomicU64::new(0),
9000 cold_orphan_cleanup_attempts: PaddedAtomicU64::new(0),
9001 cold_orphan_cleanup_errors: PaddedAtomicU64::new(0),
9002 cold_orphan_bytes: PaddedAtomicU64::new(0),
9003 per_group_cold_hot_bytes: (0..raft_group_count)
9004 .map(|_| PaddedAtomicU64::new(0))
9005 .collect(),
9006 per_group_cold_hot_bytes_max: (0..raft_group_count)
9007 .map(|_| PaddedAtomicU64::new(0))
9008 .collect(),
9009 cold_hot_stream_bytes_max: PaddedAtomicU64::new(0),
9010 per_core_cold_backpressure_events: (0..core_count)
9011 .map(|_| PaddedAtomicU64::new(0))
9012 .collect(),
9013 per_group_cold_backpressure_events: (0..raft_group_count)
9014 .map(|_| PaddedAtomicU64::new(0))
9015 .collect(),
9016 cold_backpressure_bytes: PaddedAtomicU64::new(0),
9017 }
9018 }
9019
9020 fn record_routed_request(&self, core_id: CoreId, mailbox_send_wait_ns: u64) {
9021 let index = usize::from(core_id.0);
9022 self.per_core_routed_requests[index].fetch_add_relaxed(1);
9023 self.per_core_mailbox_send_wait_ns[index].fetch_add_relaxed(mailbox_send_wait_ns);
9024 }
9025
9026 fn record_mailbox_full(&self, core_id: CoreId) {
9027 self.per_core_mailbox_full_events[usize::from(core_id.0)].fetch_add_relaxed(1);
9028 }
9029
9030 fn record_append(&self, core_id: CoreId, group_id: RaftGroupId) {
9031 self.record_append_batch(core_id, group_id, 1);
9032 }
9033
9034 fn record_append_batch(&self, core_id: CoreId, group_id: RaftGroupId, count: u64) {
9035 self.per_core_appends[usize::from(core_id.0)].fetch_add_relaxed(count);
9036 self.per_group_appends[usize::try_from(group_id.0).expect("u32 fits usize")]
9037 .fetch_add_relaxed(count);
9038 }
9039
9040 fn record_applied_mutation(&self, core_id: CoreId, group_id: RaftGroupId, apply_ns: u64) {
9041 self.record_applied_mutation_batch(core_id, group_id, 1, apply_ns);
9042 }
9043
9044 fn record_applied_mutation_batch(
9045 &self,
9046 core_id: CoreId,
9047 group_id: RaftGroupId,
9048 count: u64,
9049 apply_ns: u64,
9050 ) {
9051 let core_index = usize::from(core_id.0);
9052 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9053 self.per_core_applied_mutations[core_index].fetch_add_relaxed(count);
9054 self.per_group_applied_mutations[group_index].fetch_add_relaxed(count);
9055 self.per_core_mutation_apply_ns[core_index].fetch_add_relaxed(apply_ns);
9056 self.per_group_mutation_apply_ns[group_index].fetch_add_relaxed(apply_ns);
9057 }
9058
9059 fn record_group_engine_exec(&self, core_id: CoreId, group_id: RaftGroupId, exec_ns: u64) {
9060 let core_index = usize::from(core_id.0);
9061 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9062 self.per_core_group_engine_exec_ns[core_index].fetch_add_relaxed(exec_ns);
9063 self.per_group_group_engine_exec_ns[group_index].fetch_add_relaxed(exec_ns);
9064 }
9065
9066 fn record_group_mailbox_enqueued(&self, group_id: RaftGroupId) {
9067 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9068 let depth = self.per_group_group_mailbox_depth[group_index].fetch_add_relaxed(1) + 1;
9069 self.per_group_group_mailbox_max_depth[group_index].fetch_max_relaxed(depth);
9070 }
9071
9072 fn record_group_mailbox_dequeued(&self, group_id: RaftGroupId) {
9073 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9074 self.per_group_group_mailbox_depth[group_index].fetch_sub_relaxed(1);
9075 }
9076
9077 fn record_group_mailbox_full(&self, group_id: RaftGroupId) {
9078 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9079 self.per_group_group_mailbox_full_events[group_index].fetch_add_relaxed(1);
9080 }
9081
9082 fn record_raft_write_many(
9083 &self,
9084 core_id: CoreId,
9085 group_id: RaftGroupId,
9086 sample: RaftWriteManySample,
9087 ) {
9088 let core_index = usize::from(core_id.0);
9089 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9090 self.per_core_raft_write_many_batches[core_index].fetch_add_relaxed(1);
9091 self.per_group_raft_write_many_batches[group_index].fetch_add_relaxed(1);
9092 self.per_core_raft_write_many_commands[core_index].fetch_add_relaxed(sample.command_count);
9093 self.per_group_raft_write_many_commands[group_index]
9094 .fetch_add_relaxed(sample.command_count);
9095 self.per_core_raft_write_many_logical_commands[core_index]
9096 .fetch_add_relaxed(sample.logical_command_count);
9097 self.per_group_raft_write_many_logical_commands[group_index]
9098 .fetch_add_relaxed(sample.logical_command_count);
9099 self.per_core_raft_write_many_responses[core_index]
9100 .fetch_add_relaxed(sample.response_count);
9101 self.per_group_raft_write_many_responses[group_index]
9102 .fetch_add_relaxed(sample.response_count);
9103 self.per_core_raft_write_many_submit_ns[core_index].fetch_add_relaxed(sample.submit_ns);
9104 self.per_group_raft_write_many_submit_ns[group_index].fetch_add_relaxed(sample.submit_ns);
9105 self.per_core_raft_write_many_response_ns[core_index].fetch_add_relaxed(sample.response_ns);
9106 self.per_group_raft_write_many_response_ns[group_index]
9107 .fetch_add_relaxed(sample.response_ns);
9108 }
9109
9110 fn record_raft_apply_batch(
9111 &self,
9112 core_id: CoreId,
9113 group_id: RaftGroupId,
9114 entry_count: u64,
9115 apply_ns: u64,
9116 ) {
9117 let core_index = usize::from(core_id.0);
9118 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9119 self.per_core_raft_apply_entries[core_index].fetch_add_relaxed(entry_count);
9120 self.per_group_raft_apply_entries[group_index].fetch_add_relaxed(entry_count);
9121 self.per_core_raft_apply_ns[core_index].fetch_add_relaxed(apply_ns);
9122 self.per_group_raft_apply_ns[group_index].fetch_add_relaxed(apply_ns);
9123 }
9124
9125 fn record_wal_batch(
9126 &self,
9127 core_id: CoreId,
9128 group_id: RaftGroupId,
9129 record_count: u64,
9130 write_ns: u64,
9131 sync_ns: u64,
9132 ) {
9133 let core_index = usize::from(core_id.0);
9134 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9135 self.per_core_wal_batches[core_index].fetch_add_relaxed(1);
9136 self.per_group_wal_batches[group_index].fetch_add_relaxed(1);
9137 self.per_core_wal_records[core_index].fetch_add_relaxed(record_count);
9138 self.per_group_wal_records[group_index].fetch_add_relaxed(record_count);
9139 self.per_core_wal_write_ns[core_index].fetch_add_relaxed(write_ns);
9140 self.per_group_wal_write_ns[group_index].fetch_add_relaxed(write_ns);
9141 self.per_core_wal_sync_ns[core_index].fetch_add_relaxed(sync_ns);
9142 self.per_group_wal_sync_ns[group_index].fetch_add_relaxed(sync_ns);
9143 }
9144
9145 fn record_cold_upload(&self, bytes: u64, upload_ns: u64) {
9146 self.cold_flush_uploads.fetch_add_relaxed(1);
9147 self.cold_flush_upload_bytes.fetch_add_relaxed(bytes);
9148 self.cold_flush_upload_ns.fetch_add_relaxed(upload_ns);
9149 }
9150
9151 fn record_cold_publish(&self, bytes: u64, publish_ns: u64) {
9152 self.cold_flush_publishes.fetch_add_relaxed(1);
9153 self.cold_flush_publish_bytes.fetch_add_relaxed(bytes);
9154 self.cold_flush_publish_ns.fetch_add_relaxed(publish_ns);
9155 }
9156
9157 fn record_cold_orphan_cleanup(&self, bytes: u64, cleanup_failed: bool) {
9158 self.cold_orphan_cleanup_attempts.fetch_add_relaxed(1);
9159 if cleanup_failed {
9160 self.cold_orphan_cleanup_errors.fetch_add_relaxed(1);
9161 self.cold_orphan_bytes.fetch_add_relaxed(bytes);
9162 }
9163 }
9164
9165 fn record_cold_hot_backlog(
9166 &self,
9167 group_id: RaftGroupId,
9168 stream_hot_bytes: u64,
9169 group_hot_bytes: u64,
9170 ) {
9171 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9172 self.per_group_cold_hot_bytes[group_index].store_relaxed(group_hot_bytes);
9173 self.per_group_cold_hot_bytes_max[group_index].fetch_max_relaxed(group_hot_bytes);
9174 self.cold_hot_stream_bytes_max
9175 .fetch_max_relaxed(stream_hot_bytes);
9176 }
9177
9178 fn record_cold_backpressure(
9179 &self,
9180 core_id: CoreId,
9181 group_id: RaftGroupId,
9182 incoming_bytes: u64,
9183 _limit: u64,
9184 ) {
9185 let core_index = usize::from(core_id.0);
9186 let group_index = usize::try_from(group_id.0).expect("u32 fits usize");
9187 self.per_core_cold_backpressure_events[core_index].fetch_add_relaxed(1);
9188 self.per_group_cold_backpressure_events[group_index].fetch_add_relaxed(1);
9189 self.cold_backpressure_bytes
9190 .fetch_add_relaxed(incoming_bytes);
9191 }
9192
9193 fn record_read_watcher_added(&self, core_id: CoreId) {
9194 self.record_read_watchers_added(core_id, 1);
9195 }
9196
9197 fn record_read_watchers_added(&self, core_id: CoreId, count: usize) {
9198 self.per_core_live_read_waiters[usize::from(core_id.0)]
9199 .fetch_add_relaxed(u64::try_from(count).expect("watcher count fits u64"));
9200 }
9201
9202 fn record_read_watchers_removed(&self, core_id: CoreId, count: usize) {
9203 self.per_core_live_read_waiters[usize::from(core_id.0)]
9204 .fetch_sub_relaxed(u64::try_from(count).expect("watcher count fits u64"));
9205 }
9206
9207 fn record_live_read_backpressure(&self, core_id: CoreId) {
9208 self.per_core_live_read_backpressure_events[usize::from(core_id.0)].fetch_add_relaxed(1);
9209 }
9210}
9211
/// Returns the nanoseconds elapsed since `started_at`, saturating at
/// `u64::MAX` when the value does not fit in a `u64`.
fn elapsed_ns(started_at: Instant) -> u64 {
    let nanos = started_at.elapsed().as_nanos();
    match u64::try_from(nanos) {
        Ok(fits) => fits,
        Err(_) => u64::MAX,
    }
}
9215
9216fn append_batch_payload_bytes(request: &AppendBatchRequest) -> u64 {
9217 request
9218 .payloads
9219 .iter()
9220 .map(|payload| u64::try_from(payload.len()).expect("payload len fits u64"))
9221 .sum()
9222}
9223
9224fn record_cold_backpressure_error(
9225 metrics: &RuntimeMetricsInner,
9226 placement: ShardPlacement,
9227 incoming_bytes: u64,
9228 admission: ColdWriteAdmission,
9229 err: &GroupEngineError,
9230) {
9231 if !err.message().contains("ColdBackpressure") {
9232 return;
9233 }
9234 metrics.record_cold_backpressure(
9235 placement.core_id,
9236 placement.raft_group_id,
9237 incoming_bytes,
9238 admission.max_hot_bytes_per_group.unwrap_or(0),
9239 );
9240}
9241
9242fn is_stale_cold_flush_candidate_error(err: &RuntimeError) -> bool {
9243 let RuntimeError::GroupEngine { message, .. } = err else {
9244 return false;
9245 };
9246 message.contains("StreamGone")
9247 || message.contains("StreamNotFound")
9248 || (message.contains("InvalidColdFlush")
9249 && (message.contains("beyond stream")
9250 || message.contains("does not match the start of a hot payload segment")
9251 || message.contains("does not cover contiguous hot payload segments")
9252 || message.contains("exceeds stream")
9253 || message.contains("non-contiguous hot payload metadata")))
9254}
9255
9256async fn record_cold_hot_backlog(
9257 group: &mut Box<dyn GroupEngine>,
9258 metrics: &RuntimeMetricsInner,
9259 stream_id: BucketStreamId,
9260 placement: ShardPlacement,
9261) {
9262 if let Ok(backlog) = group.cold_hot_backlog(stream_id, placement).await {
9263 metrics.record_cold_hot_backlog(
9264 placement.raft_group_id,
9265 backlog.stream_hot_bytes,
9266 backlog.group_hot_bytes,
9267 );
9268 }
9269}
9270
/// An `AtomicU64` wrapped in a struct aligned to 128 bytes.
///
/// All accessors use `Ordering::Relaxed`.
// NOTE(review): the 128-byte alignment presumably keeps adjacent counters on
// separate cache lines to avoid false sharing — confirm intent.
#[derive(Debug)]
#[repr(align(128))]
struct PaddedAtomicU64 {
    value: AtomicU64,
}

impl PaddedAtomicU64 {
    /// Creates a counter holding `value`.
    fn new(value: u64) -> Self {
        let value = AtomicU64::new(value);
        Self { value }
    }

    /// Returns the current value.
    fn load_relaxed(&self) -> u64 {
        let current = self.value.load(Ordering::Relaxed);
        current
    }

    /// Adds `value` (wrapping on overflow) and returns the value held before
    /// the addition.
    fn fetch_add_relaxed(&self, value: u64) -> u64 {
        let previous = self.value.fetch_add(value, Ordering::Relaxed);
        previous
    }

    /// Subtracts `value` (wrapping on underflow); the previous value is
    /// discarded.
    fn fetch_sub_relaxed(&self, value: u64) {
        let _previous = self.value.fetch_sub(value, Ordering::Relaxed);
    }

    /// Raises the stored value to `value` if `value` is larger; the previous
    /// value is discarded.
    fn fetch_max_relaxed(&self, value: u64) {
        let _previous = self.value.fetch_max(value, Ordering::Relaxed);
    }

    /// Overwrites the stored value with `value`.
    fn store_relaxed(&self, value: u64) {
        self.value.store(value, Ordering::Relaxed);
    }
}
9304
9305#[cfg(test)]
9306mod tests {
9307 use super::*;
9308 use std::collections::HashSet;
9309 use std::sync::Mutex;
9310 use std::sync::atomic::AtomicBool;
9311 use tokio::sync::Notify;
9312
9313 fn runtime(core_count: usize, group_count: usize) -> ShardRuntime {
9314 ShardRuntime::spawn(RuntimeConfig {
9315 core_count,
9316 raft_group_count: group_count,
9317 mailbox_capacity: 128,
9318 threading: RuntimeThreading::HostedTokio,
9319 cold_max_hot_bytes_per_group: None,
9320 live_read_max_waiters_per_core: Some(65_536),
9321 })
9322 .expect("spawn runtime")
9323 }
9324
9325 fn stream_on_group(
9326 runtime: &ShardRuntime,
9327 group_id: RaftGroupId,
9328 prefix: &str,
9329 ) -> BucketStreamId {
9330 for index in 0..10_000 {
9331 let stream = BucketStreamId::new("benchcmp", format!("{prefix}-{index}"));
9332 if runtime.locate(&stream).raft_group_id == group_id {
9333 return stream;
9334 }
9335 }
9336 panic!("could not find stream for group {}", group_id.0);
9337 }
9338
9339 async fn create_stream(
9340 runtime: &ShardRuntime,
9341 stream: &BucketStreamId,
9342 ) -> CreateStreamResponse {
9343 runtime
9344 .create_stream(CreateStreamRequest::new(
9345 stream.clone(),
9346 DEFAULT_CONTENT_TYPE,
9347 ))
9348 .await
9349 .expect("create stream")
9350 }
9351
9352 fn producer(id: &str, epoch: u64, seq: u64) -> ProducerRequest {
9353 ProducerRequest {
9354 producer_id: id.to_owned(),
9355 producer_epoch: epoch,
9356 producer_seq: seq,
9357 }
9358 }
9359
9360 fn placement() -> ShardPlacement {
9361 ShardPlacement {
9362 core_id: CoreId(0),
9363 shard_id: ShardId(0),
9364 raft_group_id: RaftGroupId(0),
9365 }
9366 }
9367
9368 #[test]
9369 fn group_write_command_round_trips_as_log_payload() {
9370 let command = GroupWriteCommand::AppendBatch {
9371 stream_id: BucketStreamId::new("benchcmp", "raft-log"),
9372 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9373 payloads: vec![Bytes::from_static(b"ab"), Bytes::from_static(b"cd")],
9374 producer: Some(producer("writer-1", 7, 42)),
9375 now_ms: 0,
9376 };
9377
9378 let encoded = serde_json::to_vec(&command).expect("encode command");
9379 let decoded =
9380 serde_json::from_slice::<GroupWriteCommand>(&encoded).expect("decode command");
9381
9382 assert_eq!(decoded, command);
9383 }
9384
9385 #[test]
9386 fn committed_write_command_is_state_machine_apply_boundary() {
9387 let placement = ShardPlacement {
9388 core_id: CoreId(0),
9389 shard_id: ShardId(0),
9390 raft_group_id: RaftGroupId(0),
9391 };
9392 let stream = BucketStreamId::new("benchcmp", "apply-command");
9393 let mut engine = InMemoryGroupEngine::default();
9394
9395 let created = engine
9396 .apply_committed_write(
9397 GroupWriteCommand::CreateStream {
9398 stream_id: stream.clone(),
9399 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9400 initial_payload: Bytes::new(),
9401 close_after: false,
9402 stream_seq: None,
9403 producer: None,
9404 stream_ttl_seconds: None,
9405 stream_expires_at_ms: None,
9406 forked_from: None,
9407 fork_offset: None,
9408 now_ms: 0,
9409 },
9410 placement,
9411 )
9412 .expect("create stream");
9413 assert_eq!(
9414 created,
9415 GroupWriteResponse::CreateStream(CreateStreamResponse {
9416 placement,
9417 next_offset: 0,
9418 closed: false,
9419 already_exists: false,
9420 group_commit_index: 1,
9421 })
9422 );
9423
9424 let appended = engine
9425 .apply_committed_write(
9426 GroupWriteCommand::Append {
9427 stream_id: stream.clone(),
9428 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9429 payload: Bytes::from_static(b"abc"),
9430 close_after: false,
9431 stream_seq: None,
9432 producer: None,
9433 now_ms: 0,
9434 },
9435 placement,
9436 )
9437 .expect("append");
9438 assert_eq!(
9439 appended,
9440 GroupWriteResponse::Append(AppendResponse {
9441 placement,
9442 start_offset: 0,
9443 next_offset: 3,
9444 stream_append_count: 1,
9445 group_commit_index: 2,
9446 closed: false,
9447 deduplicated: false,
9448 producer: None,
9449 })
9450 );
9451
9452 let flushed = engine
9453 .apply_committed_write(
9454 GroupWriteCommand::FlushCold {
9455 stream_id: stream.clone(),
9456 chunk: ColdChunkRef {
9457 start_offset: 0,
9458 end_offset: 2,
9459 s3_path: "s3://bucket/apply-command/000000".to_owned(),
9460 object_size: 2,
9461 },
9462 },
9463 placement,
9464 )
9465 .expect("flush cold");
9466 assert_eq!(
9467 flushed,
9468 GroupWriteResponse::FlushCold(FlushColdResponse {
9469 placement,
9470 hot_start_offset: 2,
9471 group_commit_index: 3,
9472 })
9473 );
9474
9475 let read = engine
9476 .state_machine
9477 .read(&stream, 2, 16)
9478 .expect("read applied command");
9479 assert_eq!(read.payload, b"c");
9480 let plan = engine
9481 .state_machine
9482 .read_plan(&stream, 0, 16)
9483 .expect("read plan");
9484 assert_eq!(plan.segments.len(), 2);
9485 assert!(matches!(plan.segments[0], StreamReadSegment::Object(_)));
9486 assert_eq!(plan.segments[1], StreamReadSegment::Hot(b"c".to_vec()));
9487 }
9488
9489 #[tokio::test]
9490 async fn cold_store_read_reassembles_cold_and_hot_segments() {
9491 let placement = placement();
9492 let stream = BucketStreamId::new("benchcmp", "cold-read");
9493 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
9494 cold_store
9495 .write_chunk("benchcmp/cold-read/chunks/000000.bin", b"abcd")
9496 .await
9497 .expect("write cold object");
9498 let mut engine = InMemoryGroupEngine::with_cold_store(cold_store);
9499
9500 engine
9501 .apply_committed_write(
9502 GroupWriteCommand::CreateStream {
9503 stream_id: stream.clone(),
9504 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9505 initial_payload: Bytes::new(),
9506 close_after: false,
9507 stream_seq: None,
9508 producer: None,
9509 stream_ttl_seconds: None,
9510 stream_expires_at_ms: None,
9511 forked_from: None,
9512 fork_offset: None,
9513 now_ms: 0,
9514 },
9515 placement,
9516 )
9517 .expect("create stream");
9518 engine
9519 .apply_committed_write(
9520 GroupWriteCommand::Append {
9521 stream_id: stream.clone(),
9522 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9523 payload: Bytes::from_static(b"abcdef"),
9524 close_after: false,
9525 stream_seq: None,
9526 producer: None,
9527 now_ms: 0,
9528 },
9529 placement,
9530 )
9531 .expect("append");
9532 engine
9533 .apply_committed_write(
9534 GroupWriteCommand::FlushCold {
9535 stream_id: stream.clone(),
9536 chunk: ColdChunkRef {
9537 start_offset: 0,
9538 end_offset: 4,
9539 s3_path: "benchcmp/cold-read/chunks/000000.bin".to_owned(),
9540 object_size: 4,
9541 },
9542 },
9543 placement,
9544 )
9545 .expect("flush cold");
9546
9547 let read = engine
9548 .read_stream(
9549 ReadStreamRequest {
9550 stream_id: stream,
9551 offset: 2,
9552 max_len: 4,
9553 now_ms: 0,
9554 },
9555 placement,
9556 )
9557 .await
9558 .expect("read cold and hot segments");
9559 assert_eq!(read.payload, b"cdef");
9560 assert_eq!(read.next_offset, 6);
9561 assert!(read.up_to_date);
9562 }
9563
9564 #[tokio::test]
9565 async fn bootstrap_reads_retained_updates_from_cold_chunk_after_snapshot() {
9566 let placement = placement();
9567 let stream = BucketStreamId::new("benchcmp", "cold-bootstrap");
9568 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
9569 cold_store
9570 .write_chunk("benchcmp/cold-bootstrap/chunks/000000.bin", b"abcde")
9571 .await
9572 .expect("write cold object");
9573 let mut engine = InMemoryGroupEngine::with_cold_store(cold_store);
9574
9575 engine
9576 .create_stream(
9577 CreateStreamRequest::new(stream.clone(), DEFAULT_CONTENT_TYPE),
9578 placement,
9579 )
9580 .await
9581 .expect("create stream");
9582 engine
9583 .append(
9584 AppendRequest::from_bytes(stream.clone(), b"abc".to_vec()),
9585 placement,
9586 )
9587 .await
9588 .expect("append first message");
9589 engine
9590 .append(
9591 AppendRequest::from_bytes(stream.clone(), b"de".to_vec()),
9592 placement,
9593 )
9594 .await
9595 .expect("append second message");
9596 engine
9597 .flush_cold(
9598 FlushColdRequest {
9599 stream_id: stream.clone(),
9600 chunk: ColdChunkRef {
9601 start_offset: 0,
9602 end_offset: 5,
9603 s3_path: "benchcmp/cold-bootstrap/chunks/000000.bin".to_owned(),
9604 object_size: 5,
9605 },
9606 },
9607 placement,
9608 )
9609 .await
9610 .expect("flush all hot bytes");
9611 engine
9612 .publish_snapshot(
9613 PublishSnapshotRequest {
9614 stream_id: stream.clone(),
9615 snapshot_offset: 3,
9616 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
9617 payload: Bytes::from_static(b"abc-state"),
9618 now_ms: 0,
9619 },
9620 placement,
9621 )
9622 .await
9623 .expect("publish snapshot");
9624
9625 let read = engine
9626 .read_stream(
9627 ReadStreamRequest {
9628 stream_id: stream.clone(),
9629 offset: 3,
9630 max_len: 2,
9631 now_ms: 0,
9632 },
9633 placement,
9634 )
9635 .await
9636 .expect("read retained update from cold chunk");
9637 assert_eq!(read.payload, b"de");
9638
9639 let bootstrap = engine
9640 .bootstrap_stream(
9641 BootstrapStreamRequest {
9642 stream_id: stream,
9643 now_ms: 0,
9644 },
9645 placement,
9646 )
9647 .await
9648 .expect("bootstrap");
9649 assert_eq!(bootstrap.snapshot_offset, Some(3));
9650 assert_eq!(bootstrap.snapshot_payload, b"abc-state");
9651 assert_eq!(bootstrap.next_offset, 5);
9652 assert_eq!(bootstrap.updates.len(), 1);
9653 assert_eq!(bootstrap.updates[0].start_offset, 3);
9654 assert_eq!(bootstrap.updates[0].next_offset, 5);
9655 assert_eq!(bootstrap.updates[0].payload, b"de");
9656 }
9657
9658 #[tokio::test]
9659 async fn cold_store_reads_only_requested_range() {
9660 let cold_store = ColdStore::memory().expect("memory cold store");
9661 cold_store
9662 .write_chunk("benchcmp/cold-range/chunks/000000.bin", b"abcdefgh")
9663 .await
9664 .expect("write cold object");
9665 let bytes = cold_store
9666 .read_chunk_range(
9667 &ColdChunkRef {
9668 start_offset: 10,
9669 end_offset: 18,
9670 s3_path: "benchcmp/cold-range/chunks/000000.bin".to_owned(),
9671 object_size: 8,
9672 },
9673 12,
9674 3,
9675 )
9676 .await
9677 .expect("read range");
9678 assert_eq!(bytes, b"cde");
9679 }
9680
9681 #[tokio::test]
9682 async fn ttl_read_access_is_committed_and_expiry_removes_stream() {
9683 let placement = placement();
9684 let stream = BucketStreamId::new("benchcmp", "runtime-ttl");
9685 let mut engine = InMemoryGroupEngine::default();
9686
9687 let mut create = CreateStreamRequest::new(stream.clone(), DEFAULT_CONTENT_TYPE);
9688 create.initial_payload = Bytes::from_static(b"abc");
9689 create.stream_ttl_seconds = Some(1);
9690 create.now_ms = 1_000;
9691 engine
9692 .create_stream(create, placement)
9693 .await
9694 .expect("create ttl stream");
9695
9696 let read = engine
9697 .read_stream(
9698 ReadStreamRequest {
9699 stream_id: stream.clone(),
9700 offset: 0,
9701 max_len: 16,
9702 now_ms: 1_500,
9703 },
9704 placement,
9705 )
9706 .await
9707 .expect("read renews ttl");
9708 assert_eq!(read.payload, b"abc");
9709 assert_eq!(
9710 engine
9711 .snapshot(placement)
9712 .await
9713 .expect("snapshot")
9714 .group_commit_index,
9715 2
9716 );
9717
9718 engine
9719 .head_stream(
9720 HeadStreamRequest {
9721 stream_id: stream.clone(),
9722 now_ms: 2_499,
9723 },
9724 placement,
9725 )
9726 .await
9727 .expect("head does not renew but stream is still live");
9728 assert_eq!(
9729 engine
9730 .snapshot(placement)
9731 .await
9732 .expect("snapshot")
9733 .group_commit_index,
9734 2
9735 );
9736
9737 let err = engine
9738 .read_stream(
9739 ReadStreamRequest {
9740 stream_id: stream.clone(),
9741 offset: 0,
9742 max_len: 16,
9743 now_ms: 2_500,
9744 },
9745 placement,
9746 )
9747 .await
9748 .expect_err("expired stream read is not found");
9749 assert_eq!(err.code(), Some(StreamErrorCode::StreamNotFound));
9750 assert_eq!(
9751 engine
9752 .snapshot(placement)
9753 .await
9754 .expect("snapshot")
9755 .group_commit_index,
9756 3
9757 );
9758
9759 let mut recreate = CreateStreamRequest::new(stream, "text/plain");
9760 recreate.now_ms = 2_501;
9761 let recreated = engine
9762 .create_stream(recreate, placement)
9763 .await
9764 .expect("recreate expired stream");
9765 assert!(!recreated.already_exists);
9766 }
9767
9768 #[test]
9769 fn committed_write_batch_preserves_logical_command_responses() {
9770 let placement = placement();
9771 let stream = BucketStreamId::new("benchcmp", "apply-command-batch");
9772 let mut engine = InMemoryGroupEngine::default();
9773
9774 let response = engine
9775 .apply_committed_write(
9776 GroupWriteCommand::Batch {
9777 commands: vec![
9778 GroupWriteCommand::from(CreateStreamRequest::new(
9779 stream.clone(),
9780 DEFAULT_CONTENT_TYPE,
9781 )),
9782 GroupWriteCommand::from(AppendBatchRequest::new(
9783 stream.clone(),
9784 vec![Bytes::from_static(b"ab"), Bytes::from_static(b"cd")],
9785 )),
9786 ],
9787 },
9788 placement,
9789 )
9790 .expect("apply command batch");
9791
9792 let GroupWriteResponse::Batch(items) = response else {
9793 panic!("unexpected batch response: {response:?}");
9794 };
9795 assert_eq!(items.len(), 2);
9796 assert!(matches!(
9797 &items[0],
9798 Ok(GroupWriteResponse::CreateStream(CreateStreamResponse {
9799 group_commit_index: 1,
9800 ..
9801 }))
9802 ));
9803 match &items[1] {
9804 Ok(GroupWriteResponse::AppendBatch(response)) => {
9805 assert_eq!(response.items.len(), 2);
9806 assert_eq!(
9807 response.items[0].as_ref().expect("first item").start_offset,
9808 0
9809 );
9810 assert_eq!(
9811 response.items[1]
9812 .as_ref()
9813 .expect("second item")
9814 .start_offset,
9815 2
9816 );
9817 assert_eq!(
9818 response.items[1]
9819 .as_ref()
9820 .expect("second item")
9821 .group_commit_index,
9822 3
9823 );
9824 }
9825 other => panic!("unexpected append batch response: {other:?}"),
9826 }
9827
9828 let read = engine
9829 .state_machine
9830 .read(&stream, 0, 16)
9831 .expect("read applied command batch");
9832 assert_eq!(read.payload, b"abcd");
9833 }
9834
9835 async fn wait_for_live_waiters(runtime: &ShardRuntime, expected: u64) {
9836 for _ in 0..100 {
9837 if runtime.metrics().snapshot().live_read_waiters == expected {
9838 return;
9839 }
9840 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9841 }
9842 panic!(
9843 "expected {expected} live waiters, got {}",
9844 runtime.metrics().snapshot().live_read_waiters
9845 );
9846 }
9847
9848 async fn wait_for_mailbox_depth(runtime: &ShardRuntime, core_index: usize, expected: usize) {
9849 for _ in 0..100 {
9850 if runtime.mailbox_snapshot().depths[core_index] == expected {
9851 return;
9852 }
9853 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9854 }
9855 panic!(
9856 "expected core {core_index} mailbox depth {expected}, got {}",
9857 runtime.mailbox_snapshot().depths[core_index]
9858 );
9859 }
9860
9861 async fn wait_for_mailbox_full_events(runtime: &ShardRuntime, expected: u64) {
9862 for _ in 0..100 {
9863 if runtime.metrics().snapshot().mailbox_full_events == expected {
9864 return;
9865 }
9866 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9867 }
9868 panic!(
9869 "expected {expected} mailbox full events, got {}",
9870 runtime.metrics().snapshot().mailbox_full_events
9871 );
9872 }
9873
9874 async fn wait_for_group_mailbox_full_events(runtime: &ShardRuntime, expected: u64) {
9875 for _ in 0..100 {
9876 if runtime.metrics().snapshot().group_mailbox_full_events == expected {
9877 return;
9878 }
9879 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
9880 }
9881 panic!(
9882 "expected {expected} group mailbox full events, got {}",
9883 runtime.metrics().snapshot().group_mailbox_full_events
9884 );
9885 }
9886
9887 #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
9888 async fn repeated_appends_to_one_stream_are_ordered() {
9889 let runtime = runtime(4, 32);
9890 let stream = BucketStreamId::new("benchcmp", "one-stream");
9891 create_stream(&runtime, &stream).await;
9892 for index in 0..100 {
9893 let response = runtime
9894 .append(AppendRequest::new(stream.clone(), 7))
9895 .await
9896 .expect("append");
9897 assert_eq!(response.start_offset, index * 7);
9898 assert_eq!(response.next_offset, (index + 1) * 7);
9899 assert_eq!(response.stream_append_count, index + 1);
9900 }
9901 }
9902
9903 #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
9904 async fn independent_streams_reach_all_cores_and_many_groups() {
9905 let runtime = runtime(4, 64);
9906 let mut tasks = Vec::new();
9907 for index in 0..4096 {
9908 let runtime = runtime.clone();
9909 tasks.push(tokio::spawn(async move {
9910 let stream = BucketStreamId::new("benchcmp", format!("stream-{index}"));
9911 create_stream(&runtime, &stream).await;
9912 runtime
9913 .append(AppendRequest::new(stream, 1))
9914 .await
9915 .expect("append")
9916 }));
9917 }
9918
9919 for task in tasks {
9920 let response = task.await.expect("task");
9921 assert_eq!(response.start_offset, 0);
9922 assert_eq!(response.next_offset, 1);
9923 }
9924
9925 let snapshot = runtime.metrics().snapshot();
9926 assert_eq!(snapshot.accepted_appends, 4096);
9927 assert!(snapshot.per_core_appends.iter().all(|value| *value > 0));
9928 let active_groups = snapshot
9929 .per_group_appends
9930 .iter()
9931 .filter(|value| **value > 0)
9932 .count();
9933 assert!(active_groups > 48, "active_groups={active_groups}");
9934 }
9935
9936 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
9937 async fn empty_append_is_rejected_before_routing() {
9938 let runtime = runtime(2, 8);
9939 let err = runtime
9940 .append(AppendRequest::new(BucketStreamId::new("b", "s"), 0))
9941 .await
9942 .expect_err("empty append rejected");
9943 assert_eq!(err, RuntimeError::EmptyAppend);
9944 assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
9945 }
9946
9947 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
9948 async fn append_batch_routes_once_and_applies_each_payload_on_owner_core() {
9949 let runtime = runtime(2, 8);
9950 let stream = BucketStreamId::new("benchcmp", "batch-runtime");
9951 let owner_core = usize::from(runtime.locate(&stream).core_id.0);
9952 let owner_group =
9953 usize::try_from(runtime.locate(&stream).raft_group_id.0).expect("u32 fits usize");
9954
9955 create_stream(&runtime, &stream).await;
9956 let response = runtime
9957 .append_batch(AppendBatchRequest::new(
9958 stream.clone(),
9959 vec![b"ab".to_vec(), b"c".to_vec(), b"def".to_vec()],
9960 ))
9961 .await
9962 .expect("append batch");
9963 assert_eq!(response.items.len(), 3);
9964 assert_eq!(response.items[0].as_ref().expect("first").start_offset, 0);
9965 assert_eq!(response.items[1].as_ref().expect("second").start_offset, 2);
9966 assert_eq!(response.items[2].as_ref().expect("third").start_offset, 3);
9967
9968 let read = runtime
9969 .read_stream(ReadStreamRequest {
9970 stream_id: stream.clone(),
9971 offset: 0,
9972 max_len: 16,
9973 now_ms: 0,
9974 })
9975 .await
9976 .expect("read");
9977 assert_eq!(read.payload, b"abcdef");
9978
9979 let snapshot = runtime.metrics().snapshot();
9980 assert_eq!(snapshot.accepted_appends, 3);
9981 assert_eq!(snapshot.applied_mutations, 4);
9982 assert_eq!(snapshot.routed_requests, 3);
9983 assert_eq!(snapshot.per_core_appends[owner_core], 3);
9984 assert_eq!(snapshot.per_group_appends[owner_group], 3);
9985 assert_eq!(snapshot.per_core_applied_mutations[owner_core], 4);
9986 assert_eq!(snapshot.per_group_applied_mutations[owner_group], 4);
9987 }
9988
9989 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
9990 async fn append_batch_reports_item_errors_without_stopping_later_payloads() {
9991 let runtime = runtime(2, 8);
9992 let stream = BucketStreamId::new("benchcmp", "batch-partial");
9993 create_stream(&runtime, &stream).await;
9994
9995 let response = runtime
9996 .append_batch(AppendBatchRequest::new(
9997 stream.clone(),
9998 vec![b"a".to_vec(), Vec::new(), b"b".to_vec()],
9999 ))
10000 .await
10001 .expect("append batch");
10002 assert!(response.items[0].is_ok());
10003 assert!(response.items[1].is_err());
10004 assert!(response.items[2].is_ok());
10005 assert_eq!(response.items[0].as_ref().expect("first").start_offset, 0);
10006 assert_eq!(response.items[2].as_ref().expect("third").start_offset, 1);
10007
10008 let read = runtime
10009 .read_stream(ReadStreamRequest {
10010 stream_id: stream,
10011 offset: 0,
10012 max_len: 16,
10013 now_ms: 0,
10014 })
10015 .await
10016 .expect("read");
10017 assert_eq!(read.payload, b"ab");
10018
10019 let snapshot = runtime.metrics().snapshot();
10020 assert_eq!(snapshot.accepted_appends, 2);
10021 assert_eq!(snapshot.applied_mutations, 3);
10022 assert_eq!(snapshot.routed_requests, 3);
10023 }
10024
10025 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10026 async fn producer_duplicate_append_returns_prior_offsets_without_mutating_metrics() {
10027 let runtime = runtime(2, 8);
10028 let stream = BucketStreamId::new("benchcmp", "producer-runtime");
10029 create_stream(&runtime, &stream).await;
10030
10031 let mut first = AppendRequest::from_bytes(stream.clone(), b"a".to_vec());
10032 first.producer = Some(producer("writer-1", 0, 0));
10033 let first = runtime.append(first).await.expect("first append");
10034 assert_eq!(first.start_offset, 0);
10035 assert_eq!(first.next_offset, 1);
10036 assert_eq!(first.stream_append_count, 1);
10037 assert!(!first.deduplicated);
10038
10039 let mut duplicate = AppendRequest::from_bytes(stream.clone(), b"ignored".to_vec());
10040 duplicate.producer = Some(producer("writer-1", 0, 0));
10041 let duplicate = runtime.append(duplicate).await.expect("duplicate append");
10042 assert_eq!(duplicate.start_offset, 0);
10043 assert_eq!(duplicate.next_offset, 1);
10044 assert_eq!(duplicate.stream_append_count, 1);
10045 assert!(duplicate.deduplicated);
10046
10047 let mut next = AppendRequest::from_bytes(stream.clone(), b"b".to_vec());
10048 next.producer = Some(producer("writer-1", 0, 1));
10049 let next = runtime.append(next).await.expect("next append");
10050 assert_eq!(next.start_offset, 1);
10051 assert_eq!(next.next_offset, 2);
10052 assert_eq!(next.stream_append_count, 2);
10053 assert!(!next.deduplicated);
10054
10055 let read = runtime
10056 .read_stream(ReadStreamRequest {
10057 stream_id: stream,
10058 offset: 0,
10059 max_len: 16,
10060 now_ms: 0,
10061 })
10062 .await
10063 .expect("read");
10064 assert_eq!(read.payload, b"ab");
10065
10066 let metrics = runtime.metrics().snapshot();
10067 assert_eq!(metrics.accepted_appends, 2);
10068 assert_eq!(metrics.applied_mutations, 3);
10069 assert_eq!(metrics.routed_requests, 5);
10070 }
10071
10072 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10073 async fn producer_duplicate_append_batch_returns_prior_offsets_without_mutating_metrics() {
10074 let runtime = runtime(2, 8);
10075 let stream = BucketStreamId::new("benchcmp", "producer-batch-runtime");
10076 create_stream(&runtime, &stream).await;
10077
10078 let mut first =
10079 AppendBatchRequest::new(stream.clone(), vec![b"ab".to_vec(), b"c".to_vec()]);
10080 first.producer = Some(producer("writer-1", 0, 0));
10081 let first = runtime.append_batch(first).await.expect("first batch");
10082 assert_eq!(first.items.len(), 2);
10083 let first_item = first.items[0].as_ref().expect("first item");
10084 let second_item = first.items[1].as_ref().expect("second item");
10085 assert_eq!(first_item.start_offset, 0);
10086 assert_eq!(first_item.next_offset, 2);
10087 assert_eq!(first_item.stream_append_count, 1);
10088 assert!(!first_item.deduplicated);
10089 assert_eq!(second_item.start_offset, 2);
10090 assert_eq!(second_item.next_offset, 3);
10091 assert_eq!(second_item.stream_append_count, 2);
10092 assert!(!second_item.deduplicated);
10093
10094 let mut duplicate =
10095 AppendBatchRequest::new(stream.clone(), vec![b"ignored".to_vec(), b"body".to_vec()]);
10096 duplicate.producer = Some(producer("writer-1", 0, 0));
10097 let duplicate = runtime
10098 .append_batch(duplicate)
10099 .await
10100 .expect("duplicate batch");
10101 assert_eq!(duplicate.items.len(), 2);
10102 assert!(
10103 duplicate
10104 .items
10105 .iter()
10106 .all(|item| { item.as_ref().expect("deduplicated item").deduplicated })
10107 );
10108 assert_eq!(
10109 duplicate.items[0]
10110 .as_ref()
10111 .expect("first duplicate")
10112 .start_offset,
10113 0
10114 );
10115 assert_eq!(
10116 duplicate.items[1]
10117 .as_ref()
10118 .expect("second duplicate")
10119 .next_offset,
10120 3
10121 );
10122
10123 let mut next = AppendBatchRequest::new(stream.clone(), vec![b"d".to_vec()]);
10124 next.producer = Some(producer("writer-1", 0, 1));
10125 let next = runtime.append_batch(next).await.expect("next batch");
10126 let next_item = next.items[0].as_ref().expect("next item");
10127 assert_eq!(next_item.start_offset, 3);
10128 assert_eq!(next_item.next_offset, 4);
10129 assert_eq!(next_item.stream_append_count, 3);
10130 assert!(!next_item.deduplicated);
10131
10132 let read = runtime
10133 .read_stream(ReadStreamRequest {
10134 stream_id: stream,
10135 offset: 0,
10136 max_len: 16,
10137 now_ms: 0,
10138 })
10139 .await
10140 .expect("read");
10141 assert_eq!(read.payload, b"abcd");
10142
10143 let metrics = runtime.metrics().snapshot();
10144 assert_eq!(metrics.accepted_appends, 3);
10145 assert_eq!(metrics.applied_mutations, 4);
10146 assert_eq!(metrics.routed_requests, 5);
10147 }
10148
10149 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10150 async fn snapshot_group_routes_to_owner_core_and_captures_only_group_state() {
10151 let runtime = runtime(2, 8);
10152 let first_stream = BucketStreamId::new("benchcmp", "snapshot-first");
10153 let first_placement = runtime.locate(&first_stream);
10154 let second_stream = (0..512)
10155 .map(|index| BucketStreamId::new("benchcmp", format!("snapshot-other-{index}")))
10156 .find(|stream| runtime.locate(stream).core_id != first_placement.core_id)
10157 .expect("stream on another core");
10158
10159 create_stream(&runtime, &first_stream).await;
10160 runtime
10161 .append(AppendRequest::from_bytes(
10162 first_stream.clone(),
10163 b"first".to_vec(),
10164 ))
10165 .await
10166 .expect("append first stream");
10167 create_stream(&runtime, &second_stream).await;
10168 runtime
10169 .append(AppendRequest::from_bytes(
10170 second_stream.clone(),
10171 b"second".to_vec(),
10172 ))
10173 .await
10174 .expect("append second stream");
10175
10176 let snapshot = runtime
10177 .snapshot_group(first_placement.raft_group_id)
10178 .await
10179 .expect("snapshot group");
10180 assert_eq!(snapshot.placement, first_placement);
10181 assert_eq!(snapshot.group_commit_index, 2);
10182 assert_eq!(snapshot.stream_snapshot.buckets, vec!["benchcmp"]);
10183 assert_eq!(
10184 snapshot
10185 .stream_snapshot
10186 .streams
10187 .iter()
10188 .map(|entry| entry.metadata.stream_id.clone())
10189 .collect::<Vec<_>>(),
10190 vec![first_stream.clone()]
10191 );
10192
10193 let restored =
10194 StreamStateMachine::restore(snapshot.stream_snapshot).expect("restore group snapshot");
10195 let read = restored
10196 .read(&first_stream, 0, 16)
10197 .expect("read restored snapshot");
10198 assert_eq!(read.payload, b"first");
10199 assert_eq!(read.next_offset, 5);
10200 assert!(restored.read(&second_stream, 0, 16).is_err());
10201
10202 let metrics = runtime.metrics().snapshot();
10203 assert_eq!(metrics.routed_requests, 5);
10204 assert_eq!(
10205 metrics.per_core_routed_requests[usize::from(first_placement.core_id.0)],
10206 3
10207 );
10208 }
10209
10210 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10211 async fn snapshot_group_rejects_out_of_range_group_before_routing() {
10212 let runtime = runtime(2, 8);
10213 let err = runtime
10214 .snapshot_group(RaftGroupId(8))
10215 .await
10216 .expect_err("invalid group");
10217 assert_eq!(
10218 err,
10219 RuntimeError::InvalidRaftGroup {
10220 raft_group_id: RaftGroupId(8),
10221 raft_group_count: 8,
10222 }
10223 );
10224 assert_eq!(runtime.metrics().snapshot().routed_requests, 0);
10225 }
10226
10227 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10228 async fn install_group_snapshot_restores_group_state_and_append_counts() {
10229 let source = runtime(2, 8);
10230 let stream = BucketStreamId::new("benchcmp", "install-snapshot");
10231 let placement = source.locate(&stream);
10232 create_stream(&source, &stream).await;
10233 source
10234 .append(AppendRequest::from_bytes(stream.clone(), b"ab".to_vec()))
10235 .await
10236 .expect("append first");
10237 source
10238 .append(AppendRequest::from_bytes(stream.clone(), b"cd".to_vec()))
10239 .await
10240 .expect("append second");
10241
10242 let snapshot = source
10243 .snapshot_group(placement.raft_group_id)
10244 .await
10245 .expect("snapshot group");
10246 assert_eq!(snapshot.group_commit_index, 3);
10247 assert_eq!(
10248 snapshot.stream_append_counts,
10249 vec![StreamAppendCount {
10250 stream_id: stream.clone(),
10251 append_count: 2,
10252 }]
10253 );
10254
10255 let target = runtime(2, 8);
10256 target
10257 .install_group_snapshot(snapshot)
10258 .await
10259 .expect("install snapshot");
10260
10261 let read = target
10262 .read_stream(ReadStreamRequest {
10263 stream_id: stream.clone(),
10264 offset: 0,
10265 max_len: 16,
10266 now_ms: 0,
10267 })
10268 .await
10269 .expect("read restored stream");
10270 assert_eq!(read.placement, placement);
10271 assert_eq!(read.payload, b"abcd");
10272 assert_eq!(read.next_offset, 4);
10273
10274 let appended = target
10275 .append(AppendRequest::from_bytes(stream, b"ef".to_vec()))
10276 .await
10277 .expect("append after restore");
10278 assert_eq!(appended.start_offset, 4);
10279 assert_eq!(appended.next_offset, 6);
10280 assert_eq!(appended.stream_append_count, 3);
10281 assert_eq!(appended.group_commit_index, 4);
10282 }
10283
10284 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10285 async fn install_group_snapshot_rejects_mismatched_placement_before_routing() {
10286 let runtime = runtime(2, 8);
10287 let snapshot = GroupSnapshot {
10288 placement: ShardPlacement {
10289 core_id: CoreId(1),
10290 shard_id: ShardId(0),
10291 raft_group_id: RaftGroupId(0),
10292 },
10293 group_commit_index: 0,
10294 stream_snapshot: StreamSnapshot {
10295 buckets: Vec::new(),
10296 streams: Vec::new(),
10297 },
10298 stream_append_counts: Vec::new(),
10299 };
10300
10301 let err = runtime
10302 .install_group_snapshot(snapshot)
10303 .await
10304 .expect_err("mismatched placement rejected");
10305 assert_eq!(
10306 err,
10307 RuntimeError::SnapshotPlacementMismatch {
10308 expected: ShardPlacement {
10309 core_id: CoreId(0),
10310 shard_id: ShardId(0),
10311 raft_group_id: RaftGroupId(0),
10312 },
10313 actual: ShardPlacement {
10314 core_id: CoreId(1),
10315 shard_id: ShardId(0),
10316 raft_group_id: RaftGroupId(0),
10317 },
10318 }
10319 );
10320 assert_eq!(runtime.metrics().snapshot().routed_requests, 0);
10321 }
10322
10323 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10324 async fn mailbox_snapshot_reports_per_core_depths_and_capacities() {
10325 let runtime = ShardRuntime::spawn(RuntimeConfig {
10326 core_count: 3,
10327 raft_group_count: 9,
10328 mailbox_capacity: 7,
10329 threading: RuntimeThreading::HostedTokio,
10330 cold_max_hot_bytes_per_group: None,
10331 live_read_max_waiters_per_core: Some(65_536),
10332 })
10333 .expect("spawn runtime");
10334
10335 let snapshot = runtime.mailbox_snapshot();
10336 assert_eq!(snapshot.depths, vec![0, 0, 0]);
10337 assert_eq!(snapshot.capacities, vec![7, 7, 7]);
10338 }
10339
10340 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10341 async fn runtime_metrics_track_owner_core_routing_and_mailbox_wait() {
10342 let runtime = runtime(2, 8);
10343 let stream = BucketStreamId::new("benchcmp", "routing-metrics");
10344 let owner_core = usize::from(runtime.locate(&stream).core_id.0);
10345
10346 create_stream(&runtime, &stream).await;
10347 runtime
10348 .append(AppendRequest::from_bytes(stream.clone(), b"hello".to_vec()))
10349 .await
10350 .expect("append");
10351 runtime
10352 .read_stream(ReadStreamRequest {
10353 stream_id: stream.clone(),
10354 offset: 0,
10355 max_len: 16,
10356 now_ms: 0,
10357 })
10358 .await
10359 .expect("read");
10360
10361 let snapshot = runtime.metrics().snapshot();
10362 assert_eq!(snapshot.accepted_appends, 1);
10363 assert_eq!(snapshot.applied_mutations, 2);
10364 assert_eq!(snapshot.routed_requests, 3);
10365 assert_eq!(snapshot.per_core_routed_requests.len(), 2);
10366 assert_eq!(snapshot.per_core_routed_requests[owner_core], 3);
10367 assert_eq!(snapshot.per_core_applied_mutations[owner_core], 2);
10368 assert_eq!(
10369 snapshot.per_group_applied_mutations
10370 [usize::try_from(runtime.locate(&stream).raft_group_id.0).expect("u32 fits usize")],
10371 2
10372 );
10373 assert_eq!(
10374 snapshot.mutation_apply_ns,
10375 snapshot.per_core_mutation_apply_ns.iter().sum::<u64>()
10376 );
10377 assert_eq!(
10378 snapshot.mutation_apply_ns,
10379 snapshot.per_group_mutation_apply_ns.iter().sum::<u64>()
10380 );
10381 assert_eq!(
10382 snapshot.group_lock_wait_ns,
10383 snapshot.per_core_group_lock_wait_ns.iter().sum::<u64>()
10384 );
10385 assert_eq!(
10386 snapshot.group_lock_wait_ns,
10387 snapshot.per_group_group_lock_wait_ns.iter().sum::<u64>()
10388 );
10389 assert_eq!(
10390 snapshot.group_engine_exec_ns,
10391 snapshot.per_core_group_engine_exec_ns.iter().sum::<u64>()
10392 );
10393 assert_eq!(
10394 snapshot.group_engine_exec_ns,
10395 snapshot.per_group_group_engine_exec_ns.iter().sum::<u64>()
10396 );
10397 assert_eq!(
10398 snapshot.raft_write_many_batches,
10399 snapshot
10400 .per_core_raft_write_many_batches
10401 .iter()
10402 .sum::<u64>()
10403 );
10404 assert_eq!(
10405 snapshot.raft_write_many_batches,
10406 snapshot
10407 .per_group_raft_write_many_batches
10408 .iter()
10409 .sum::<u64>()
10410 );
10411 assert_eq!(
10412 snapshot.raft_write_many_commands,
10413 snapshot
10414 .per_core_raft_write_many_commands
10415 .iter()
10416 .sum::<u64>()
10417 );
10418 assert_eq!(
10419 snapshot.raft_write_many_commands,
10420 snapshot
10421 .per_group_raft_write_many_commands
10422 .iter()
10423 .sum::<u64>()
10424 );
10425 assert_eq!(
10426 snapshot.raft_write_many_logical_commands,
10427 snapshot
10428 .per_core_raft_write_many_logical_commands
10429 .iter()
10430 .sum::<u64>()
10431 );
10432 assert_eq!(
10433 snapshot.raft_write_many_logical_commands,
10434 snapshot
10435 .per_group_raft_write_many_logical_commands
10436 .iter()
10437 .sum::<u64>()
10438 );
10439 assert_eq!(
10440 snapshot.raft_write_many_responses,
10441 snapshot
10442 .per_core_raft_write_many_responses
10443 .iter()
10444 .sum::<u64>()
10445 );
10446 assert_eq!(
10447 snapshot.raft_write_many_responses,
10448 snapshot
10449 .per_group_raft_write_many_responses
10450 .iter()
10451 .sum::<u64>()
10452 );
10453 assert_eq!(
10454 snapshot.raft_write_many_submit_ns,
10455 snapshot
10456 .per_core_raft_write_many_submit_ns
10457 .iter()
10458 .sum::<u64>()
10459 );
10460 assert_eq!(
10461 snapshot.raft_write_many_submit_ns,
10462 snapshot
10463 .per_group_raft_write_many_submit_ns
10464 .iter()
10465 .sum::<u64>()
10466 );
10467 assert_eq!(
10468 snapshot.raft_write_many_response_ns,
10469 snapshot
10470 .per_core_raft_write_many_response_ns
10471 .iter()
10472 .sum::<u64>()
10473 );
10474 assert_eq!(
10475 snapshot.raft_write_many_response_ns,
10476 snapshot
10477 .per_group_raft_write_many_response_ns
10478 .iter()
10479 .sum::<u64>()
10480 );
10481 assert_eq!(
10482 snapshot.raft_apply_entries,
10483 snapshot.per_core_raft_apply_entries.iter().sum::<u64>()
10484 );
10485 assert_eq!(
10486 snapshot.raft_apply_entries,
10487 snapshot.per_group_raft_apply_entries.iter().sum::<u64>()
10488 );
10489 assert_eq!(
10490 snapshot.raft_apply_ns,
10491 snapshot.per_core_raft_apply_ns.iter().sum::<u64>()
10492 );
10493 assert_eq!(
10494 snapshot.raft_apply_ns,
10495 snapshot.per_group_raft_apply_ns.iter().sum::<u64>()
10496 );
10497 assert_eq!(
10498 snapshot.mailbox_send_wait_ns,
10499 snapshot.per_core_mailbox_send_wait_ns.iter().sum::<u64>()
10500 );
10501 }
10502
10503 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10504 async fn append_before_stream_setup_uses_stream_state_machine_error() {
10505 let runtime = runtime(2, 8);
10506 let stream = BucketStreamId::new("benchcmp", "missing-stream");
10507 let placement = runtime.locate(&stream);
10508 let err = runtime
10509 .append(AppendRequest::new(stream, 1))
10510 .await
10511 .expect_err("missing stream rejected");
10512
10513 match err {
10514 RuntimeError::GroupEngine {
10515 core_id,
10516 raft_group_id,
10517 message,
10518 ..
10519 } => {
10520 assert_eq!(core_id, placement.core_id);
10521 assert_eq!(raft_group_id, placement.raft_group_id);
10522 assert!(message.contains("BucketNotFound"), "message={message}");
10523 }
10524 other => panic!("expected group engine error, got {other:?}"),
10525 }
10526 assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
10527 }
10528
10529 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10530 async fn create_stream_is_routed_and_idempotent_for_matching_metadata() {
10531 let runtime = runtime(2, 8);
10532 let stream = BucketStreamId::new("benchcmp", "create-stream");
10533 let placement = runtime.locate(&stream);
10534
10535 let created = create_stream(&runtime, &stream).await;
10536 assert_eq!(created.placement, placement);
10537 assert_eq!(created.next_offset, 0);
10538 assert!(!created.closed);
10539 assert!(!created.already_exists);
10540
10541 let existing = create_stream(&runtime, &stream).await;
10542 assert_eq!(existing.placement, placement);
10543 assert_eq!(existing.next_offset, 0);
10544 assert!(!existing.closed);
10545 assert!(existing.already_exists);
10546 }
10547
10548 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10549 async fn head_stream_reflects_append_and_closed_state_on_owner_group() {
10550 let runtime = runtime(2, 8);
10551 let stream = BucketStreamId::new("benchcmp", "head-stream");
10552 let placement = runtime.locate(&stream);
10553 runtime
10554 .create_stream(CreateStreamRequest::new(stream.clone(), "text/plain"))
10555 .await
10556 .expect("create stream");
10557
10558 let mut append = AppendRequest::new(stream.clone(), 3);
10559 append.content_type = "text/plain".to_owned();
10560 append.close_after = true;
10561 let response = runtime.append(append).await.expect("append");
10562 assert_eq!(response.start_offset, 0);
10563 assert_eq!(response.next_offset, 3);
10564
10565 let head = runtime
10566 .head_stream(HeadStreamRequest {
10567 stream_id: stream,
10568 now_ms: 0,
10569 })
10570 .await
10571 .expect("head stream");
10572 assert_eq!(head.placement, placement);
10573 assert_eq!(head.content_type, "text/plain");
10574 assert_eq!(head.tail_offset, 3);
10575 assert!(head.closed);
10576 }
10577
10578 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10579 async fn read_stream_returns_payload_slice_from_owner_group() {
10580 let runtime = runtime(2, 8);
10581 let stream = BucketStreamId::new("benchcmp", "read-stream");
10582 let placement = runtime.locate(&stream);
10583 create_stream(&runtime, &stream).await;
10584 runtime
10585 .append(AppendRequest::from_bytes(
10586 stream.clone(),
10587 b"abcdefg".to_vec(),
10588 ))
10589 .await
10590 .expect("append");
10591
10592 let read = runtime
10593 .read_stream(ReadStreamRequest {
10594 stream_id: stream.clone(),
10595 offset: 2,
10596 max_len: 3,
10597 now_ms: 0,
10598 })
10599 .await
10600 .expect("read stream");
10601 assert_eq!(read.placement, placement);
10602 assert_eq!(read.offset, 2);
10603 assert_eq!(read.next_offset, 5);
10604 assert_eq!(read.payload, b"cde");
10605 assert!(!read.up_to_date);
10606 assert!(!read.closed);
10607
10608 let tail = runtime
10609 .read_stream(ReadStreamRequest {
10610 stream_id: stream,
10611 offset: 7,
10612 max_len: 16,
10613 now_ms: 0,
10614 })
10615 .await
10616 .expect("tail read");
10617 assert_eq!(tail.next_offset, 7);
10618 assert!(tail.payload.is_empty());
10619 assert!(tail.up_to_date);
10620 }
10621
10622 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10623 async fn flush_cold_publishes_chunk_metadata_on_owner_group() {
10624 let runtime = runtime(2, 8);
10625 let stream = BucketStreamId::new("benchcmp", "cold-runtime");
10626 let placement = runtime.locate(&stream);
10627 create_stream(&runtime, &stream).await;
10628 runtime
10629 .append(AppendRequest::from_bytes(
10630 stream.clone(),
10631 b"abcdef".to_vec(),
10632 ))
10633 .await
10634 .expect("append");
10635
10636 let flushed = runtime
10637 .flush_cold(FlushColdRequest {
10638 stream_id: stream.clone(),
10639 chunk: ColdChunkRef {
10640 start_offset: 0,
10641 end_offset: 4,
10642 s3_path: "s3://bucket/cold-runtime/000000".to_owned(),
10643 object_size: 4,
10644 },
10645 })
10646 .await
10647 .expect("flush cold");
10648 assert_eq!(flushed.placement, placement);
10649 assert_eq!(flushed.hot_start_offset, 4);
10650
10651 let hot = runtime
10652 .read_stream(ReadStreamRequest {
10653 stream_id: stream.clone(),
10654 offset: 4,
10655 max_len: 16,
10656 now_ms: 0,
10657 })
10658 .await
10659 .expect("hot read");
10660 assert_eq!(hot.payload, b"ef");
10661
10662 let err = runtime
10663 .read_stream(ReadStreamRequest {
10664 stream_id: stream,
10665 offset: 0,
10666 max_len: 16,
10667 now_ms: 0,
10668 })
10669 .await
10670 .expect_err("cold read needs store");
10671 match err {
10672 RuntimeError::GroupEngine {
10673 message,
10674 next_offset: Some(6),
10675 ..
10676 } if message.contains("InvalidColdFlush") => {}
10677 other => panic!("expected cold read error, got {other:?}"),
10678 }
10679 }
10680
10681 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10682 async fn flush_cold_once_uploads_outside_group_and_reads_back() {
10683 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10684 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10685 RuntimeConfig::new(2, 8),
10686 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10687 Some(cold_store),
10688 )
10689 .expect("spawn runtime");
10690 let stream = BucketStreamId::new("benchcmp", "cold-once");
10691 create_stream(&runtime, &stream).await;
10692 runtime
10693 .append(AppendRequest::from_bytes(
10694 stream.clone(),
10695 b"abcdef".to_vec(),
10696 ))
10697 .await
10698 .expect("append");
10699
10700 let flushed = runtime
10701 .flush_cold_once(PlanColdFlushRequest {
10702 stream_id: stream.clone(),
10703 min_hot_bytes: 4,
10704 max_flush_bytes: 4,
10705 })
10706 .await
10707 .expect("flush once")
10708 .expect("candidate flushed");
10709 assert_eq!(flushed.hot_start_offset, 4);
10710 let metrics = runtime.metrics().snapshot();
10711 assert_eq!(metrics.cold_flush_uploads, 1);
10712 assert_eq!(metrics.cold_flush_upload_bytes, 4);
10713 assert_eq!(metrics.cold_flush_publishes, 1);
10714 assert_eq!(metrics.cold_flush_publish_bytes, 4);
10715 assert_eq!(metrics.cold_orphan_cleanup_attempts, 0);
10716
10717 let read = runtime
10718 .read_stream(ReadStreamRequest {
10719 stream_id: stream,
10720 offset: 0,
10721 max_len: 6,
10722 now_ms: 0,
10723 })
10724 .await
10725 .expect("read cold and hot");
10726 assert_eq!(read.payload, b"abcdef");
10727 assert_eq!(read.next_offset, 6);
10728 }
10729
10730 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10731 async fn flush_cold_group_batch_once_publishes_multiple_chunks() {
10732 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10733 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10734 RuntimeConfig::new(2, 8),
10735 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10736 Some(cold_store),
10737 )
10738 .expect("spawn runtime");
10739 let stream = BucketStreamId::new("benchcmp", "cold-batch");
10740 let placement = runtime.locate(&stream);
10741 create_stream(&runtime, &stream).await;
10742 runtime
10743 .append(AppendRequest::from_bytes(stream.clone(), b"abcd".to_vec()))
10744 .await
10745 .expect("append");
10746
10747 let flushed = runtime
10748 .flush_cold_group_batch_once(
10749 placement.raft_group_id,
10750 PlanGroupColdFlushRequest {
10751 min_hot_bytes: 1,
10752 max_flush_bytes: 1,
10753 },
10754 4,
10755 )
10756 .await
10757 .expect("flush batch");
10758 assert_eq!(flushed.len(), 4);
10759 assert!(
10760 flushed
10761 .iter()
10762 .all(|response| response.placement == placement)
10763 );
10764 assert_eq!(
10765 flushed
10766 .iter()
10767 .map(|response| response.hot_start_offset)
10768 .collect::<Vec<_>>(),
10769 vec![1, 2, 3, 0]
10770 );
10771
10772 let metrics = runtime.metrics().snapshot();
10773 assert_eq!(metrics.cold_flush_uploads, 4);
10774 assert_eq!(metrics.cold_flush_upload_bytes, 4);
10775 assert_eq!(metrics.cold_flush_publishes, 4);
10776 assert_eq!(metrics.cold_flush_publish_bytes, 4);
10777 assert_eq!(metrics.cold_hot_bytes, 0);
10778
10779 let snapshot = runtime
10780 .snapshot_group(placement.raft_group_id)
10781 .await
10782 .expect("snapshot group");
10783 let entry = snapshot
10784 .stream_snapshot
10785 .streams
10786 .iter()
10787 .find(|entry| entry.metadata.stream_id == stream)
10788 .expect("stream snapshot");
10789 assert_eq!(entry.cold_chunks.len(), 4);
10790 assert!(entry.payload.is_empty());
10791
10792 let read = runtime
10793 .read_stream(ReadStreamRequest {
10794 stream_id: stream,
10795 offset: 0,
10796 max_len: 4,
10797 now_ms: 0,
10798 })
10799 .await
10800 .expect("read cold chunks");
10801 assert_eq!(read.payload, b"abcd");
10802 assert_eq!(read.next_offset, 4);
10803 }
10804
10805 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10806 async fn stale_cold_flush_batch_after_delete_recreate_is_classified_for_cleanup() {
10807 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10808 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10809 RuntimeConfig::new(2, 8),
10810 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10811 Some(cold_store),
10812 )
10813 .expect("spawn runtime");
10814 let stream = BucketStreamId::new("benchcmp", "stale-cold-runtime");
10815 let placement = runtime.locate(&stream);
10816 create_stream(&runtime, &stream).await;
10817 runtime
10818 .append(AppendRequest::from_bytes(
10819 stream.clone(),
10820 b"abcdefghijklmnopqr".to_vec(),
10821 ))
10822 .await
10823 .expect("append old stream");
10824 let candidates = runtime
10825 .plan_next_cold_flush_batch(
10826 placement.raft_group_id,
10827 PlanGroupColdFlushRequest {
10828 min_hot_bytes: 18,
10829 max_flush_bytes: 18,
10830 },
10831 1,
10832 )
10833 .await
10834 .expect("plan candidate");
10835 assert_eq!(candidates.len(), 1);
10836
10837 runtime
10838 .delete_stream(DeleteStreamRequest {
10839 stream_id: stream.clone(),
10840 })
10841 .await
10842 .expect("delete old stream");
10843 create_stream(&runtime, &stream).await;
10844 runtime
10845 .append(AppendRequest::from_bytes(
10846 stream.clone(),
10847 b"abcdefghijklmnopq".to_vec(),
10848 ))
10849 .await
10850 .expect("append recreated stream");
10851
10852 let err = runtime
10853 .flush_cold_candidates_batch(candidates)
10854 .await
10855 .expect_err("stale candidate should fail publish");
10856 assert!(is_stale_cold_flush_candidate_error(&err));
10857 let metrics = runtime.metrics().snapshot();
10858 assert_eq!(metrics.cold_flush_uploads, 1);
10859 assert_eq!(metrics.cold_flush_publishes, 0);
10860 assert_eq!(metrics.cold_orphan_cleanup_attempts, 1);
10861 assert_eq!(metrics.cold_orphan_cleanup_errors, 0);
10862
10863 let read = runtime
10864 .read_stream(ReadStreamRequest {
10865 stream_id: stream,
10866 offset: 0,
10867 max_len: 32,
10868 now_ms: 0,
10869 })
10870 .await
10871 .expect("read recreated stream");
10872 assert_eq!(read.payload, b"abcdefghijklmnopq");
10873 assert_eq!(read.next_offset, 17);
10874 }
10875
10876 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10877 async fn cold_write_admission_rejects_new_bytes_until_flush_catches_up() {
10878 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10879 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10880 RuntimeConfig::new(2, 8).with_cold_max_hot_bytes_per_group(Some(4)),
10881 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10882 Some(cold_store),
10883 )
10884 .expect("spawn runtime");
10885 let stream = BucketStreamId::new("benchcmp", "cold-admission");
10886 create_stream(&runtime, &stream).await;
10887 runtime
10888 .append(AppendRequest::from_bytes(stream.clone(), b"abcd".to_vec()))
10889 .await
10890 .expect("append below limit");
10891
10892 let err = runtime
10893 .append(AppendRequest::from_bytes(stream.clone(), b"e".to_vec()))
10894 .await
10895 .expect_err("append should be backpressured");
10896 match err {
10897 RuntimeError::GroupEngine { message, .. } if message.contains("ColdBackpressure") => {}
10898 other => panic!("expected cold backpressure, got {other:?}"),
10899 }
10900 let metrics = runtime.metrics().snapshot();
10901 let group_index = usize::try_from(runtime.locate(&stream).raft_group_id.0).unwrap();
10902 assert_eq!(metrics.accepted_appends, 1);
10903 assert_eq!(metrics.cold_hot_bytes, 4);
10904 assert_eq!(metrics.per_group_cold_hot_bytes[group_index], 4);
10905 assert_eq!(metrics.cold_hot_group_bytes_max, 4);
10906 assert_eq!(metrics.cold_hot_stream_bytes_max, 4);
10907 assert_eq!(metrics.cold_backpressure_events, 1);
10908 assert_eq!(metrics.per_group_cold_backpressure_events[group_index], 1);
10909 assert_eq!(metrics.cold_backpressure_bytes, 1);
10910
10911 runtime
10912 .flush_cold_once(PlanColdFlushRequest {
10913 stream_id: stream.clone(),
10914 min_hot_bytes: 4,
10915 max_flush_bytes: 4,
10916 })
10917 .await
10918 .expect("flush once")
10919 .expect("candidate flushed");
10920 assert_eq!(runtime.metrics().snapshot().cold_hot_bytes, 0);
10921
10922 runtime
10923 .append(AppendRequest::from_bytes(stream.clone(), b"e".to_vec()))
10924 .await
10925 .expect("append after flush");
10926 let read = runtime
10927 .read_stream(ReadStreamRequest {
10928 stream_id: stream,
10929 offset: 0,
10930 max_len: 5,
10931 now_ms: 0,
10932 })
10933 .await
10934 .expect("read cold and hot");
10935 assert_eq!(read.payload, b"abcde");
10936 }
10937
10938 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10939 async fn cold_write_admission_rejects_append_batch_without_partial_mutation() {
10940 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10941 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10942 RuntimeConfig::new(2, 8).with_cold_max_hot_bytes_per_group(Some(4)),
10943 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10944 Some(cold_store),
10945 )
10946 .expect("spawn runtime");
10947 let stream = BucketStreamId::new("benchcmp", "cold-admission-batch");
10948 create_stream(&runtime, &stream).await;
10949 runtime
10950 .append(AppendRequest::from_bytes(stream.clone(), b"abc".to_vec()))
10951 .await
10952 .expect("append below limit");
10953
10954 let err = runtime
10955 .append_batch(AppendBatchRequest::new(
10956 stream.clone(),
10957 vec![b"d".to_vec(), b"e".to_vec()],
10958 ))
10959 .await
10960 .expect_err("batch should be backpressured");
10961 match err {
10962 RuntimeError::GroupEngine { message, .. } if message.contains("ColdBackpressure") => {}
10963 other => panic!("expected cold backpressure, got {other:?}"),
10964 }
10965 let read = runtime
10966 .read_stream(ReadStreamRequest {
10967 stream_id: stream.clone(),
10968 offset: 0,
10969 max_len: 8,
10970 now_ms: 0,
10971 })
10972 .await
10973 .expect("read");
10974 assert_eq!(read.payload, b"abc");
10975 let metrics = runtime.metrics().snapshot();
10976 assert_eq!(metrics.accepted_appends, 1);
10977 assert_eq!(metrics.cold_backpressure_events, 1);
10978 assert_eq!(metrics.cold_backpressure_bytes, 2);
10979 }
10980
10981 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
10982 async fn flush_cold_group_once_selects_stream_inside_owner_group() {
10983 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
10984 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
10985 RuntimeConfig::new(2, 8),
10986 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
10987 Some(cold_store),
10988 )
10989 .expect("spawn runtime");
10990 let group_id = RaftGroupId(3);
10991 let stream = stream_on_group(&runtime, group_id, "cold-group");
10992 create_stream(&runtime, &stream).await;
10993 runtime
10994 .append(AppendRequest::from_bytes(
10995 stream.clone(),
10996 b"abcdef".to_vec(),
10997 ))
10998 .await
10999 .expect("append");
11000
11001 let flushed = runtime
11002 .flush_cold_group_once(
11003 group_id,
11004 PlanGroupColdFlushRequest {
11005 min_hot_bytes: 4,
11006 max_flush_bytes: 4,
11007 },
11008 )
11009 .await
11010 .expect("flush group")
11011 .expect("candidate flushed");
11012 assert_eq!(flushed.hot_start_offset, 4);
11013
11014 let read = runtime
11015 .read_stream(ReadStreamRequest {
11016 stream_id: stream,
11017 offset: 0,
11018 max_len: 6,
11019 now_ms: 0,
11020 })
11021 .await
11022 .expect("read cold and hot");
11023 assert_eq!(read.payload, b"abcdef");
11024 }
11025
11026 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11027 async fn flush_cold_all_groups_once_bounded_flushes_multiple_groups() {
11028 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
11029 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
11030 RuntimeConfig::new(2, 8),
11031 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
11032 Some(cold_store),
11033 )
11034 .expect("spawn runtime");
11035 let first = stream_on_group(&runtime, RaftGroupId(1), "cold-bounded-a");
11036 let second = stream_on_group(&runtime, RaftGroupId(6), "cold-bounded-b");
11037 for stream in [&first, &second] {
11038 create_stream(&runtime, stream).await;
11039 runtime
11040 .append(AppendRequest::from_bytes(
11041 stream.clone(),
11042 b"abcdef".to_vec(),
11043 ))
11044 .await
11045 .expect("append");
11046 }
11047
11048 let flushed = runtime
11049 .flush_cold_all_groups_once_bounded(
11050 PlanGroupColdFlushRequest {
11051 min_hot_bytes: 4,
11052 max_flush_bytes: 4,
11053 },
11054 2,
11055 )
11056 .await
11057 .expect("flush all bounded");
11058 assert_eq!(flushed, 2);
11059 let metrics = runtime.metrics().snapshot();
11060 assert_eq!(metrics.cold_flush_uploads, 2);
11061 assert_eq!(metrics.cold_flush_upload_bytes, 8);
11062 assert_eq!(metrics.cold_flush_publishes, 2);
11063 assert_eq!(metrics.cold_flush_publish_bytes, 8);
11064 }
11065
11066 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11067 async fn repeated_cold_flush_keeps_hot_bytes_bounded_while_writes_continue() {
11068 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
11069 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
11070 RuntimeConfig::new(2, 8).with_cold_max_hot_bytes_per_group(Some(16)),
11071 InMemoryGroupEngineFactory::with_cold_store(Some(cold_store.clone())),
11072 Some(cold_store),
11073 )
11074 .expect("spawn runtime");
11075 let streams = [
11076 stream_on_group(&runtime, RaftGroupId(0), "cold-steady-a"),
11077 stream_on_group(&runtime, RaftGroupId(3), "cold-steady-b"),
11078 stream_on_group(&runtime, RaftGroupId(5), "cold-steady-c"),
11079 stream_on_group(&runtime, RaftGroupId(7), "cold-steady-d"),
11080 ];
11081 for stream in &streams {
11082 create_stream(&runtime, stream).await;
11083 }
11084
11085 let mut expected = Vec::new();
11086 for round in 0..8u8 {
11087 let payload = vec![b'a' + round; 4];
11088 expected.extend_from_slice(&payload);
11089 for stream in &streams {
11090 runtime
11091 .append(AppendRequest::from_bytes(stream.clone(), payload.clone()))
11092 .await
11093 .expect("append while cold worker keeps up");
11094 }
11095
11096 let metrics_before_flush = runtime.metrics().snapshot();
11097 assert!(
11098 metrics_before_flush.cold_hot_bytes <= 64,
11099 "hot bytes should stay within one unflushed batch per group before flush: {}",
11100 metrics_before_flush.cold_hot_bytes
11101 );
11102
11103 let flushed = runtime
11104 .flush_cold_all_groups_once_bounded(
11105 PlanGroupColdFlushRequest {
11106 min_hot_bytes: 4,
11107 max_flush_bytes: 4,
11108 },
11109 streams.len(),
11110 )
11111 .await
11112 .expect("flush all bounded");
11113 assert_eq!(flushed, streams.len());
11114 let metrics_after_flush = runtime.metrics().snapshot();
11115 assert_eq!(
11116 metrics_after_flush.cold_hot_bytes, 0,
11117 "all newly appended bytes should be offloaded after round {round}"
11118 );
11119 assert_eq!(
11120 metrics_after_flush.cold_flush_uploads,
11121 u64::try_from((usize::from(round) + 1) * streams.len()).expect("count fits u64")
11122 );
11123 assert_eq!(metrics_after_flush.cold_orphan_cleanup_attempts, 0);
11124 assert_eq!(metrics_after_flush.cold_backpressure_events, 0);
11125 }
11126
11127 for stream in streams {
11128 let read = runtime
11129 .read_stream(ReadStreamRequest {
11130 stream_id: stream,
11131 offset: 0,
11132 max_len: expected.len(),
11133 now_ms: 0,
11134 })
11135 .await
11136 .expect("read cold-backed stream");
11137 assert_eq!(read.payload, expected);
11138 assert_eq!(read.next_offset, u64::try_from(expected.len()).unwrap());
11139 }
11140 }
11141
11142 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11143 async fn wait_read_stream_completes_after_owner_append() {
11144 let runtime = runtime(2, 8);
11145 let stream = BucketStreamId::new("benchcmp", "wait-read");
11146 create_stream(&runtime, &stream).await;
11147
11148 let wait = {
11149 let runtime = runtime.clone();
11150 let stream = stream.clone();
11151 tokio::spawn(async move {
11152 runtime
11153 .wait_read_stream(ReadStreamRequest {
11154 stream_id: stream,
11155 offset: 0,
11156 max_len: 16,
11157 now_ms: 0,
11158 })
11159 .await
11160 .expect("wait read")
11161 })
11162 };
11163 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
11164 runtime
11165 .append(AppendRequest::from_bytes(stream.clone(), b"hello".to_vec()))
11166 .await
11167 .expect("append");
11168
11169 let read = tokio::time::timeout(std::time::Duration::from_secs(1), wait)
11170 .await
11171 .expect("wait read timeout")
11172 .expect("wait task");
11173 assert_eq!(read.payload, b"hello");
11174 assert_eq!(read.next_offset, 5);
11175 assert!(read.up_to_date);
11176 assert!(!read.closed);
11177 assert_eq!(runtime.metrics().snapshot().live_read_waiters, 0);
11178 }
11179
11180 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11181 async fn wait_read_stream_completes_on_close_at_tail() {
11182 let runtime = runtime(2, 8);
11183 let stream = BucketStreamId::new("benchcmp", "wait-close");
11184 create_stream(&runtime, &stream).await;
11185
11186 let wait = {
11187 let runtime = runtime.clone();
11188 let stream = stream.clone();
11189 tokio::spawn(async move {
11190 runtime
11191 .wait_read_stream(ReadStreamRequest {
11192 stream_id: stream,
11193 offset: 0,
11194 max_len: 16,
11195 now_ms: 0,
11196 })
11197 .await
11198 .expect("wait read")
11199 })
11200 };
11201 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
11202 runtime
11203 .close_stream(CloseStreamRequest {
11204 stream_id: stream,
11205 stream_seq: None,
11206 producer: None,
11207 now_ms: 0,
11208 })
11209 .await
11210 .expect("close stream");
11211
11212 let read = tokio::time::timeout(std::time::Duration::from_secs(1), wait)
11213 .await
11214 .expect("wait read timeout")
11215 .expect("wait task");
11216 assert!(read.payload.is_empty());
11217 assert_eq!(read.next_offset, 0);
11218 assert!(read.up_to_date);
11219 assert!(read.closed);
11220 assert_eq!(runtime.metrics().snapshot().live_read_waiters, 0);
11221 }
11222
11223 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11224 async fn canceled_wait_read_stream_removes_owner_waiter() {
11225 let runtime = runtime(2, 8);
11226 let stream = BucketStreamId::new("benchcmp", "wait-cancel");
11227 create_stream(&runtime, &stream).await;
11228
11229 let wait = {
11230 let runtime = runtime.clone();
11231 let stream = stream.clone();
11232 tokio::spawn(async move {
11233 runtime
11234 .wait_read_stream(ReadStreamRequest {
11235 stream_id: stream,
11236 offset: 0,
11237 max_len: 16,
11238 now_ms: 0,
11239 })
11240 .await
11241 })
11242 };
11243 wait_for_live_waiters(&runtime, 1).await;
11244 wait.abort();
11245 let _ = wait.await;
11246 wait_for_live_waiters(&runtime, 0).await;
11247 }
11248
11249 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11250 async fn live_read_waiter_limit_rejects_excess_waiters_on_owner_core() {
11251 let runtime = ShardRuntime::spawn(
11252 RuntimeConfig::new(1, 1).with_live_read_max_waiters_per_core(Some(1)),
11253 )
11254 .expect("spawn runtime");
11255 let stream = BucketStreamId::new("benchcmp", "wait-limit");
11256 create_stream(&runtime, &stream).await;
11257
11258 let first = {
11259 let runtime = runtime.clone();
11260 let stream = stream.clone();
11261 tokio::spawn(async move {
11262 runtime
11263 .wait_read_stream(ReadStreamRequest {
11264 stream_id: stream,
11265 offset: 0,
11266 max_len: 16,
11267 now_ms: 0,
11268 })
11269 .await
11270 })
11271 };
11272 wait_for_live_waiters(&runtime, 1).await;
11273
11274 let err = runtime
11275 .wait_read_stream(ReadStreamRequest {
11276 stream_id: stream.clone(),
11277 offset: 0,
11278 max_len: 16,
11279 now_ms: 0,
11280 })
11281 .await
11282 .expect_err("second waiter should hit owner-core limit");
11283 assert_eq!(
11284 err,
11285 RuntimeError::LiveReadBackpressure {
11286 core_id: CoreId(0),
11287 current_waiters: 1,
11288 limit: 1,
11289 }
11290 );
11291 let snapshot = runtime.metrics().snapshot();
11292 assert_eq!(snapshot.live_read_waiters, 1);
11293 assert_eq!(snapshot.live_read_backpressure_events, 1);
11294 assert_eq!(snapshot.per_core_live_read_backpressure_events, vec![1]);
11295
11296 first.abort();
11297 let _ = first.await;
11298 wait_for_live_waiters(&runtime, 0).await;
11299 }
11300
11301 #[test]
11302 fn cancel_read_watcher_removes_group_local_waiter() {
11303 let stream = BucketStreamId::new("benchcmp", "watcher-cancel-local");
11304 let mut read_watchers = ReadWatchers::new();
11305 let (first_tx, _first_rx) = oneshot::channel();
11306 let (second_tx, _second_rx) = oneshot::channel();
11307 read_watchers.insert(
11308 stream.clone(),
11309 vec![
11310 ReadWatcher {
11311 waiter_id: 1,
11312 request: ReadStreamRequest {
11313 stream_id: stream.clone(),
11314 offset: 0,
11315 max_len: 16,
11316 now_ms: 0,
11317 },
11318 response_tx: first_tx,
11319 },
11320 ReadWatcher {
11321 waiter_id: 2,
11322 request: ReadStreamRequest {
11323 stream_id: stream.clone(),
11324 offset: 0,
11325 max_len: 16,
11326 now_ms: 0,
11327 },
11328 response_tx: second_tx,
11329 },
11330 ],
11331 );
11332
11333 let metrics = Arc::new(RuntimeMetricsInner::new(1, 1));
11334 metrics.record_read_watchers_added(CoreId(0), 2);
11335 CoreWorker::cancel_read_watcher(
11336 &mut read_watchers,
11337 metrics.clone(),
11338 CoreId(0),
11339 stream.clone(),
11340 1,
11341 );
11342
11343 let watcher_ids = read_watchers
11344 .get(&stream)
11345 .expect("one watcher remains")
11346 .iter()
11347 .map(|watcher| watcher.waiter_id)
11348 .collect::<Vec<_>>();
11349 assert_eq!(watcher_ids, vec![2]);
11350 assert_eq!(metrics.per_core_live_read_waiters[0].load_relaxed(), 1);
11351 }
11352
11353 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11354 async fn notify_read_watchers_shares_identical_reads_across_watchers() {
11355 let factory = BlockingReadFactory::default();
11356 let runtime = ShardRuntime::spawn_with_engine_factory(
11357 RuntimeConfig {
11358 core_count: 1,
11359 raft_group_count: 1,
11360 mailbox_capacity: 8,
11361 threading: RuntimeThreading::HostedTokio,
11362 cold_max_hot_bytes_per_group: None,
11363 live_read_max_waiters_per_core: Some(65_536),
11364 },
11365 factory.clone(),
11366 )
11367 .expect("spawn runtime");
11368 let stream = BucketStreamId::new("benchcmp", "watcher-shared-read");
11369 let placement = runtime.locate(&stream);
11370 let request = ReadStreamRequest {
11371 stream_id: stream.clone(),
11372 offset: 0,
11373 max_len: 16,
11374 now_ms: 0,
11375 };
11376 let mut read_watchers = ReadWatchers::new();
11377 let (first_tx, _first_rx) = oneshot::channel();
11378 let (second_tx, _second_rx) = oneshot::channel();
11379 read_watchers.insert(
11380 stream.clone(),
11381 vec![
11382 ReadWatcher {
11383 waiter_id: 1,
11384 request: request.clone(),
11385 response_tx: first_tx,
11386 },
11387 ReadWatcher {
11388 waiter_id: 2,
11389 request,
11390 response_tx: second_tx,
11391 },
11392 ],
11393 );
11394
11395 let metrics = Arc::new(RuntimeMetricsInner::new(1, 1));
11396 let mut engine = factory
11397 .create(
11398 placement,
11399 GroupEngineMetrics {
11400 inner: metrics.clone(),
11401 },
11402 )
11403 .await
11404 .expect("create engine");
11405 let notify = {
11406 let stream = stream.clone();
11407 tokio::spawn(async move {
11408 CoreWorker::notify_read_watchers(
11409 &mut engine,
11410 metrics,
11411 Arc::new(Semaphore::new(8)),
11412 &mut read_watchers,
11413 &stream,
11414 placement,
11415 )
11416 .await;
11417 read_watchers
11418 })
11419 };
11420 tokio::time::timeout(
11421 std::time::Duration::from_secs(1),
11422 factory.entered.notified(),
11423 )
11424 .await
11425 .expect("notify issued one grouped read");
11426 factory.release.notify_one();
11427 let read_watchers = tokio::time::timeout(std::time::Duration::from_secs(1), notify)
11428 .await
11429 .expect("notify should finish after one read")
11430 .expect("notify task");
11431
11432 let watcher_ids = read_watchers
11433 .get(&stream)
11434 .expect("pending watchers reinserted")
11435 .iter()
11436 .map(|watcher| watcher.waiter_id)
11437 .collect::<Vec<_>>();
11438 assert_eq!(watcher_ids, vec![1, 2]);
11439 assert_eq!(factory.read_count.load(Ordering::Relaxed), 1);
11440 }
11441
11442 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11443 async fn close_stream_allows_close_only_and_rejects_later_appends() {
11444 let runtime = runtime(2, 8);
11445 let stream = BucketStreamId::new("benchcmp", "close-only");
11446 let placement = runtime.locate(&stream);
11447 create_stream(&runtime, &stream).await;
11448
11449 let closed = runtime
11450 .close_stream(CloseStreamRequest {
11451 stream_id: stream.clone(),
11452 stream_seq: None,
11453 producer: None,
11454 now_ms: 0,
11455 })
11456 .await
11457 .expect("close stream");
11458 assert_eq!(closed.placement, placement);
11459 assert_eq!(closed.next_offset, 0);
11460
11461 let err = runtime
11462 .append(AppendRequest::new(stream.clone(), 1))
11463 .await
11464 .expect_err("append after close rejected");
11465 match err {
11466 RuntimeError::GroupEngine { message, .. } => {
11467 assert!(message.contains("StreamClosed"), "message={message}");
11468 }
11469 other => panic!("expected group engine error, got {other:?}"),
11470 }
11471
11472 let head = runtime
11473 .head_stream(HeadStreamRequest {
11474 stream_id: stream,
11475 now_ms: 0,
11476 })
11477 .await
11478 .expect("head stream");
11479 assert_eq!(head.tail_offset, 0);
11480 assert!(head.closed);
11481 assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
11482 }
11483
11484 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11485 async fn delete_stream_removes_state_on_owner_group() {
11486 let runtime = runtime(2, 8);
11487 let stream = BucketStreamId::new("benchcmp", "delete-stream");
11488 let placement = runtime.locate(&stream);
11489 create_stream(&runtime, &stream).await;
11490 runtime
11491 .append(AppendRequest::from_bytes(
11492 stream.clone(),
11493 b"payload".to_vec(),
11494 ))
11495 .await
11496 .expect("append");
11497
11498 let deleted = runtime
11499 .delete_stream(DeleteStreamRequest {
11500 stream_id: stream.clone(),
11501 })
11502 .await
11503 .expect("delete stream");
11504 assert_eq!(deleted.placement, placement);
11505
11506 let err = runtime
11507 .head_stream(HeadStreamRequest {
11508 stream_id: stream.clone(),
11509 now_ms: 0,
11510 })
11511 .await
11512 .expect_err("head after delete rejected");
11513 match err {
11514 RuntimeError::GroupEngine { message, .. } => {
11515 assert!(message.contains("StreamNotFound"), "message={message}");
11516 }
11517 other => panic!("expected group engine error, got {other:?}"),
11518 }
11519
11520 let err = runtime
11521 .append(AppendRequest::new(stream, 1))
11522 .await
11523 .expect_err("append after delete rejected");
11524 match err {
11525 RuntimeError::GroupEngine { message, .. } => {
11526 assert!(message.contains("StreamNotFound"), "message={message}");
11527 }
11528 other => panic!("expected group engine error, got {other:?}"),
11529 }
11530 }
11531
11532 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11533 async fn fork_ref_keeps_deleted_source_gone_until_last_fork_delete() {
11534 let runtime = runtime(2, 8);
11535 let source = BucketStreamId::new("benchcmp", "fork-ref-source");
11536 let fork = BucketStreamId::new("benchcmp", "fork-ref-child");
11537 let mut source_create = CreateStreamRequest::new(source.clone(), DEFAULT_CONTENT_TYPE);
11538 source_create.initial_payload = Bytes::from_static(b"abc");
11539 runtime
11540 .create_stream(source_create)
11541 .await
11542 .expect("create source");
11543
11544 let mut fork_create = CreateStreamRequest::new(fork.clone(), DEFAULT_CONTENT_TYPE);
11545 fork_create.forked_from = Some(source.clone());
11546 runtime
11547 .create_stream(fork_create)
11548 .await
11549 .expect("create fork");
11550
11551 runtime
11552 .delete_stream(DeleteStreamRequest {
11553 stream_id: source.clone(),
11554 })
11555 .await
11556 .expect("delete source");
11557 let err = runtime
11558 .head_stream(HeadStreamRequest {
11559 stream_id: source.clone(),
11560 now_ms: 0,
11561 })
11562 .await
11563 .expect_err("soft-deleted source is gone");
11564 match err {
11565 RuntimeError::GroupEngine { message, .. } => {
11566 assert!(message.contains("StreamGone"), "message={message}");
11567 }
11568 other => panic!("expected group engine error, got {other:?}"),
11569 }
11570
11571 let fork_read = runtime
11572 .read_stream(ReadStreamRequest {
11573 stream_id: fork.clone(),
11574 offset: 0,
11575 max_len: 16,
11576 now_ms: 0,
11577 })
11578 .await
11579 .expect("fork remains readable");
11580 assert_eq!(fork_read.payload, b"abc");
11581
11582 runtime
11583 .delete_stream(DeleteStreamRequest { stream_id: fork })
11584 .await
11585 .expect("delete fork");
11586 let err = runtime
11587 .head_stream(HeadStreamRequest {
11588 stream_id: source,
11589 now_ms: 0,
11590 })
11591 .await
11592 .expect_err("source is hard-deleted after last fork");
11593 match err {
11594 RuntimeError::GroupEngine { message, .. } => {
11595 assert!(message.contains("StreamNotFound"), "message={message}");
11596 }
11597 other => panic!("expected group engine error, got {other:?}"),
11598 }
11599 }
11600
11601 #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
11602 async fn thread_per_core_runtime_reaches_all_configured_cores() {
11603 let mut config = RuntimeConfig::new(4, 32);
11604 config.mailbox_capacity = 128;
11605 assert_eq!(config.threading, RuntimeThreading::ThreadPerCore);
11606 let runtime = ShardRuntime::spawn(config).expect("spawn runtime");
11607
11608 let mut tasks = Vec::new();
11609 for index in 0..1024 {
11610 let runtime = runtime.clone();
11611 tasks.push(tokio::spawn(async move {
11612 let stream = BucketStreamId::new("benchcmp", format!("thread-core-{index}"));
11613 create_stream(&runtime, &stream).await;
11614 runtime
11615 .append(AppendRequest::new(stream, 1))
11616 .await
11617 .expect("append");
11618 }));
11619 }
11620
11621 for task in tasks {
11622 task.await.expect("task");
11623 }
11624
11625 let snapshot = runtime.metrics().snapshot();
11626 assert_eq!(snapshot.accepted_appends, 1024);
11627 assert!(snapshot.per_core_appends.iter().all(|value| *value > 0));
11628 }
11629
11630 #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
11631 async fn custom_group_engine_is_created_once_per_touched_group_on_owner_core() {
11632 let factory = RecordingFactory::default();
11633 let runtime = ShardRuntime::spawn_with_engine_factory(
11634 RuntimeConfig {
11635 core_count: 4,
11636 raft_group_count: 32,
11637 mailbox_capacity: 128,
11638 threading: RuntimeThreading::HostedTokio,
11639 cold_max_hot_bytes_per_group: None,
11640 live_read_max_waiters_per_core: Some(65_536),
11641 },
11642 factory.clone(),
11643 )
11644 .expect("spawn runtime");
11645
11646 let mut touched_groups = HashSet::new();
11647 for index in 0..4096 {
11648 let stream = BucketStreamId::new("benchcmp", format!("engine-{index}"));
11649 let placement = runtime.locate(&stream);
11650 runtime
11651 .create_stream(CreateStreamRequest::new(stream, DEFAULT_CONTENT_TYPE))
11652 .await
11653 .expect("create stream");
11654 touched_groups.insert(placement.raft_group_id);
11655 if touched_groups.len() == 16 {
11656 break;
11657 }
11658 }
11659
11660 let created = factory.created();
11661 let created_groups = created
11662 .iter()
11663 .map(|placement| placement.raft_group_id)
11664 .collect::<HashSet<_>>();
11665 assert_eq!(created_groups, touched_groups);
11666 for placement in created {
11667 assert_eq!(
11668 u32::from(placement.core_id.0),
11669 placement.raft_group_id.0 % 4
11670 );
11671 }
11672 }
11673
11674 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11675 async fn background_cold_flush_skips_groups_that_cannot_accept_local_writes() {
11676 let factory = RecordingFactory::without_local_writes();
11677 let cold_store = Arc::new(ColdStore::memory().expect("memory cold store"));
11678 let runtime = ShardRuntime::spawn_with_engine_factory_and_cold_store(
11679 RuntimeConfig {
11680 core_count: 2,
11681 raft_group_count: 4,
11682 mailbox_capacity: 128,
11683 threading: RuntimeThreading::HostedTokio,
11684 cold_max_hot_bytes_per_group: None,
11685 live_read_max_waiters_per_core: Some(65_536),
11686 },
11687 factory.clone(),
11688 Some(cold_store),
11689 )
11690 .expect("spawn runtime");
11691
11692 let flushed = runtime
11693 .flush_cold_all_groups_once_bounded(
11694 PlanGroupColdFlushRequest {
11695 min_hot_bytes: 1,
11696 max_flush_bytes: 1,
11697 },
11698 4,
11699 )
11700 .await
11701 .expect("flush all groups");
11702
11703 assert_eq!(flushed, 0);
11704 assert_eq!(factory.created().len(), 4);
11705 let metrics = runtime.metrics().snapshot();
11706 assert_eq!(metrics.cold_flush_uploads, 0);
11707 assert_eq!(metrics.cold_flush_publishes, 0);
11708 }
11709
11710 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11711 async fn warm_group_instantiates_engine_on_owner_core_without_stream_mutation() {
11712 let factory = RecordingFactory::default();
11713 let runtime = ShardRuntime::spawn_with_engine_factory(
11714 RuntimeConfig {
11715 core_count: 2,
11716 raft_group_count: 4,
11717 mailbox_capacity: 128,
11718 threading: RuntimeThreading::HostedTokio,
11719 cold_max_hot_bytes_per_group: None,
11720 live_read_max_waiters_per_core: Some(65_536),
11721 },
11722 factory.clone(),
11723 )
11724 .expect("spawn runtime");
11725
11726 let warmed = runtime
11727 .warm_group(RaftGroupId(3))
11728 .await
11729 .expect("warm group");
11730 assert_eq!(warmed.core_id, CoreId(1));
11731 assert_eq!(warmed.raft_group_id, RaftGroupId(3));
11732
11733 runtime
11734 .warm_group(RaftGroupId(3))
11735 .await
11736 .expect("second warm is idempotent");
11737
11738 let created = factory.created();
11739 assert_eq!(created, vec![warmed]);
11740
11741 runtime.warm_all_groups().await.expect("warm all groups");
11742 let created_groups = factory
11743 .created()
11744 .into_iter()
11745 .map(|placement| placement.raft_group_id)
11746 .collect::<HashSet<_>>();
11747 assert_eq!(
11748 created_groups,
11749 [
11750 RaftGroupId(0),
11751 RaftGroupId(1),
11752 RaftGroupId(2),
11753 RaftGroupId(3)
11754 ]
11755 .into_iter()
11756 .collect()
11757 );
11758 }
11759
11760 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11761 async fn core_worker_dispatches_other_groups_while_one_group_waits() {
11762 let factory = BlockingFirstCreateEngineFactory::default();
11763 let runtime = ShardRuntime::spawn_with_engine_factory(
11764 RuntimeConfig {
11765 core_count: 1,
11766 raft_group_count: 2,
11767 mailbox_capacity: 128,
11768 threading: RuntimeThreading::HostedTokio,
11769 cold_max_hot_bytes_per_group: None,
11770 live_read_max_waiters_per_core: Some(65_536),
11771 },
11772 factory.clone(),
11773 )
11774 .expect("spawn runtime");
11775
11776 let blocked_stream = stream_on_group(&runtime, RaftGroupId(0), "blocked-group");
11777 let free_stream = stream_on_group(&runtime, RaftGroupId(1), "free-group");
11778 let entered_wait = factory.entered.notified();
11779 let blocked_runtime = runtime.clone();
11780 let blocked =
11781 tokio::spawn(async move { create_stream(&blocked_runtime, &blocked_stream).await });
11782
11783 tokio::time::timeout(std::time::Duration::from_secs(1), entered_wait)
11784 .await
11785 .expect("first group entered blocking create");
11786
11787 let completed = tokio::time::timeout(
11788 std::time::Duration::from_secs(1),
11789 create_stream(&runtime, &free_stream),
11790 )
11791 .await
11792 .expect("other group should complete while first group is blocked");
11793 assert_eq!(completed.placement.raft_group_id, RaftGroupId(1));
11794
11795 factory.release.notify_one();
11796 blocked.await.expect("blocked task");
11797 }
11798
11799 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11800 async fn runtime_read_uses_group_read_parts_fast_path() {
11801 let factory = BlockingReadFactory::default();
11802 let runtime = ShardRuntime::spawn_with_engine_factory(
11803 RuntimeConfig {
11804 core_count: 1,
11805 raft_group_count: 1,
11806 mailbox_capacity: 128,
11807 threading: RuntimeThreading::HostedTokio,
11808 cold_max_hot_bytes_per_group: None,
11809 live_read_max_waiters_per_core: Some(65_536),
11810 },
11811 factory.clone(),
11812 )
11813 .expect("spawn runtime");
11814 let stream = BucketStreamId::new("benchcmp", "read-offload");
11815 create_stream(&runtime, &stream).await;
11816
11817 let read = tokio::time::timeout(
11818 std::time::Duration::from_secs(1),
11819 runtime.read_stream(ReadStreamRequest {
11820 stream_id: stream.clone(),
11821 offset: 0,
11822 max_len: 16,
11823 now_ms: 0,
11824 }),
11825 )
11826 .await
11827 .expect("runtime read should not use blocking legacy read_stream")
11828 .expect("read stream");
11829 assert_eq!(read.placement.raft_group_id, RaftGroupId(0));
11830 assert_eq!(factory.read_count.load(Ordering::Relaxed), 1);
11831
11832 let head = runtime
11833 .head_stream(HeadStreamRequest {
11834 stream_id: stream,
11835 now_ms: 0,
11836 })
11837 .await
11838 .expect("head stream");
11839 assert_eq!(head.placement.raft_group_id, RaftGroupId(0));
11840 }
11841
11842 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11843 async fn read_materialization_is_bounded_without_blocking_group_actor() {
11844 let factory = BlockingReadFactory::block_materialization();
11845 let mut config = RuntimeConfig::new(1, 1);
11846 config.mailbox_capacity = 1;
11847 config.threading = RuntimeThreading::HostedTokio;
11848 let runtime = ShardRuntime::spawn_with_engine_factory(config, factory.clone())
11849 .expect("spawn runtime");
11850 let first_stream = BucketStreamId::new("benchcmp", "materialize-bound-1");
11851 let second_stream = BucketStreamId::new("benchcmp", "materialize-bound-2");
11852 create_stream(&runtime, &first_stream).await;
11853 create_stream(&runtime, &second_stream).await;
11854
11855 let first_runtime = runtime.clone();
11856 let first_stream_for_read = first_stream.clone();
11857 let first_read = tokio::spawn(async move {
11858 first_runtime
11859 .read_stream(ReadStreamRequest {
11860 stream_id: first_stream_for_read,
11861 offset: 0,
11862 max_len: 16,
11863 now_ms: 0,
11864 })
11865 .await
11866 });
11867 tokio::time::timeout(
11868 std::time::Duration::from_secs(1),
11869 factory.entered.notified(),
11870 )
11871 .await
11872 .expect("first materialization acquired the only permit");
11873
11874 let second_runtime = runtime.clone();
11875 let second_stream_for_read = second_stream.clone();
11876 let second_read = tokio::spawn(async move {
11877 second_runtime
11878 .read_stream(ReadStreamRequest {
11879 stream_id: second_stream_for_read,
11880 offset: 0,
11881 max_len: 16,
11882 now_ms: 0,
11883 })
11884 .await
11885 });
11886
11887 let head = tokio::time::timeout(
11888 std::time::Duration::from_secs(1),
11889 runtime.head_stream(HeadStreamRequest {
11890 stream_id: first_stream,
11891 now_ms: 0,
11892 }),
11893 )
11894 .await
11895 .expect("group actor should keep serving metadata while materialization waits")
11896 .expect("head stream");
11897 assert_eq!(head.placement.raft_group_id, RaftGroupId(0));
11898 assert!(!second_read.is_finished());
11899
11900 factory.release.notify_one();
11901 let first = first_read
11902 .await
11903 .expect("first read task")
11904 .expect("first read");
11905 assert_eq!(first.payload, b"ready");
11906 tokio::time::timeout(
11907 std::time::Duration::from_secs(1),
11908 factory.entered.notified(),
11909 )
11910 .await
11911 .expect("second materialization acquired permit after first released it");
11912 factory.release.notify_one();
11913 let second = second_read
11914 .await
11915 .expect("second read task")
11916 .expect("second read");
11917 assert_eq!(second.payload, b"ready");
11918 }
11919
11920 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11921 async fn group_engine_errors_include_group_context_and_do_not_record_success_metrics() {
11922 let runtime = ShardRuntime::spawn_with_engine_factory(
11923 RuntimeConfig {
11924 core_count: 2,
11925 raft_group_count: 8,
11926 mailbox_capacity: 128,
11927 threading: RuntimeThreading::HostedTokio,
11928 cold_max_hot_bytes_per_group: None,
11929 live_read_max_waiters_per_core: Some(65_536),
11930 },
11931 FailingFactory,
11932 )
11933 .expect("spawn runtime");
11934
11935 let stream = BucketStreamId::new("benchcmp", "failing-stream");
11936 let placement = runtime.locate(&stream);
11937 let err = runtime
11938 .append(AppendRequest::new(stream, 1))
11939 .await
11940 .expect_err("engine failure");
11941
11942 assert_eq!(
11943 err,
11944 RuntimeError::GroupEngine {
11945 core_id: placement.core_id,
11946 raft_group_id: placement.raft_group_id,
11947 message: "proposal rejected".to_owned(),
11948 next_offset: None,
11949 leader_hint: None,
11950 }
11951 );
11952 assert_eq!(runtime.metrics().snapshot().accepted_appends, 0);
11953 }
11954
11955 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
11956 async fn mailbox_full_events_record_owner_core_backpressure() {
11957 let factory = BlockingOnceFactory::default();
11958 let runtime = ShardRuntime::spawn_with_engine_factory(
11959 RuntimeConfig {
11960 core_count: 1,
11961 raft_group_count: 1,
11962 mailbox_capacity: 1,
11963 threading: RuntimeThreading::HostedTokio,
11964 cold_max_hot_bytes_per_group: None,
11965 live_read_max_waiters_per_core: Some(65_536),
11966 },
11967 factory.clone(),
11968 )
11969 .expect("spawn runtime");
11970
11971 let entered = factory.entered.clone();
11972 let entered_wait = entered.notified();
11973 let first_runtime = runtime.clone();
11974 let first = tokio::spawn(async move {
11975 create_stream(
11976 &first_runtime,
11977 &BucketStreamId::new("benchcmp", "backpressure-1"),
11978 )
11979 .await
11980 });
11981 tokio::time::timeout(std::time::Duration::from_secs(1), entered_wait)
11982 .await
11983 .expect("first create entered blocking engine factory");
11984
11985 let second_runtime = runtime.clone();
11986 let second = tokio::spawn(async move {
11987 create_stream(
11988 &second_runtime,
11989 &BucketStreamId::new("benchcmp", "backpressure-2"),
11990 )
11991 .await
11992 });
11993 wait_for_mailbox_depth(&runtime, 0, 1).await;
11994
11995 let third_runtime = runtime.clone();
11996 let third = tokio::spawn(async move {
11997 create_stream(
11998 &third_runtime,
11999 &BucketStreamId::new("benchcmp", "backpressure-3"),
12000 )
12001 .await
12002 });
12003 wait_for_mailbox_full_events(&runtime, 1).await;
12004 assert_eq!(
12005 runtime.metrics().snapshot().per_core_mailbox_full_events[0],
12006 1
12007 );
12008
12009 factory.release.notify_one();
12010 first.await.expect("first task");
12011 second.await.expect("second task");
12012 third.await.expect("third task");
12013 }
12014
12015 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12016 async fn group_mailbox_full_events_record_inner_actor_backpressure() {
12017 let factory = BlockingFirstCreateEngineFactory::default();
12018 let runtime = ShardRuntime::spawn_with_engine_factory(
12019 RuntimeConfig {
12020 core_count: 1,
12021 raft_group_count: 1,
12022 mailbox_capacity: 1,
12023 threading: RuntimeThreading::HostedTokio,
12024 cold_max_hot_bytes_per_group: None,
12025 live_read_max_waiters_per_core: Some(65_536),
12026 },
12027 factory.clone(),
12028 )
12029 .expect("spawn runtime");
12030
12031 let first_runtime = runtime.clone();
12032 let first = tokio::spawn(async move {
12033 create_stream(
12034 &first_runtime,
12035 &BucketStreamId::new("benchcmp", "group-backpressure-1"),
12036 )
12037 .await
12038 });
12039 tokio::time::timeout(
12040 std::time::Duration::from_secs(1),
12041 factory.entered.notified(),
12042 )
12043 .await
12044 .expect("first append entered blocking group engine");
12045
12046 let second_runtime = runtime.clone();
12047 let second = tokio::spawn(async move {
12048 create_stream(
12049 &second_runtime,
12050 &BucketStreamId::new("benchcmp", "group-backpressure-2"),
12051 )
12052 .await
12053 });
12054 for _ in 0..100 {
12055 if runtime.metrics().snapshot().group_mailbox_depth == 1 {
12056 break;
12057 }
12058 tokio::time::sleep(std::time::Duration::from_millis(10)).await;
12059 }
12060
12061 let third_runtime = runtime.clone();
12062 let third = tokio::spawn(async move {
12063 create_stream(
12064 &third_runtime,
12065 &BucketStreamId::new("benchcmp", "group-backpressure-3"),
12066 )
12067 .await
12068 });
12069 wait_for_group_mailbox_full_events(&runtime, 1).await;
12070 assert_eq!(
12071 runtime
12072 .metrics()
12073 .snapshot()
12074 .per_group_group_mailbox_full_events[0],
12075 1
12076 );
12077
12078 factory.release.notify_one();
12079 first.await.expect("first task");
12080 second.await.expect("second task");
12081 third.await.expect("third task");
12082 }
12083
12084 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12085 async fn wal_group_engine_recovers_multiple_groups_from_per_group_logs() {
12086 let wal_root = std::env::temp_dir().join(format!(
12087 "ursula-wal-test-{}-{}",
12088 std::process::id(),
12089 std::time::SystemTime::now()
12090 .duration_since(std::time::UNIX_EPOCH)
12091 .expect("system time after unix epoch")
12092 .as_nanos()
12093 ));
12094 let _ = std::fs::remove_dir_all(&wal_root);
12095 let config = RuntimeConfig {
12096 core_count: 2,
12097 raft_group_count: 8,
12098 mailbox_capacity: 128,
12099 threading: RuntimeThreading::HostedTokio,
12100 cold_max_hot_bytes_per_group: None,
12101 live_read_max_waiters_per_core: Some(65_536),
12102 };
12103
12104 let (first_stream, second_stream) = {
12105 let runtime = ShardRuntime::spawn_with_engine_factory(
12106 config.clone(),
12107 WalGroupEngineFactory::new(&wal_root),
12108 )
12109 .expect("spawn runtime");
12110
12111 let mut seen_groups = HashSet::new();
12112 let mut streams = Vec::new();
12113 for index in 0..256 {
12114 let stream = BucketStreamId::new("benchcmp", format!("wal-{index}"));
12115 if seen_groups.insert(runtime.locate(&stream).raft_group_id) {
12116 streams.push(stream);
12117 }
12118 if streams.len() == 2 {
12119 break;
12120 }
12121 }
12122 assert_eq!(streams.len(), 2, "expected streams on two groups");
12123 let first_stream = streams[0].clone();
12124 let second_stream = streams[1].clone();
12125
12126 create_stream(&runtime, &first_stream).await;
12127 runtime
12128 .append(AppendRequest::from_bytes(
12129 first_stream.clone(),
12130 b"first-payload".to_vec(),
12131 ))
12132 .await
12133 .expect("append first stream");
12134
12135 create_stream(&runtime, &second_stream).await;
12136 let mut append_second =
12137 AppendRequest::from_bytes(second_stream.clone(), b"second-payload".to_vec());
12138 append_second.close_after = true;
12139 runtime
12140 .append(append_second)
12141 .await
12142 .expect("append second stream");
12143
12144 (first_stream, second_stream)
12145 };
12146
12147 let recovered =
12148 ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12149 .expect("spawn recovered runtime");
12150
12151 let first_read = recovered
12152 .read_stream(ReadStreamRequest {
12153 stream_id: first_stream.clone(),
12154 offset: 0,
12155 max_len: 128,
12156 now_ms: 0,
12157 })
12158 .await
12159 .expect("read recovered first stream");
12160 assert_eq!(first_read.payload, b"first-payload");
12161 assert!(!first_read.closed);
12162
12163 let second_read = recovered
12164 .read_stream(ReadStreamRequest {
12165 stream_id: second_stream.clone(),
12166 offset: 0,
12167 max_len: 128,
12168 now_ms: 0,
12169 })
12170 .await
12171 .expect("read recovered second stream");
12172 assert_eq!(second_read.payload, b"second-payload");
12173 assert!(second_read.closed);
12174
12175 let mut wal_file_count = 0;
12176 for core_entry in std::fs::read_dir(&wal_root).expect("read WAL root") {
12177 let core_entry = core_entry.expect("read core WAL dir");
12178 for group_entry in std::fs::read_dir(core_entry.path()).expect("read group WAL dir") {
12179 let group_entry = group_entry.expect("read group WAL file");
12180 if group_entry
12181 .path()
12182 .extension()
12183 .is_some_and(|ext| ext == "jsonl")
12184 {
12185 wal_file_count += 1;
12186 }
12187 }
12188 }
12189 assert_eq!(wal_file_count, 2);
12190
12191 drop(recovered);
12192 std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12193 }
12194
12195 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12196 async fn wal_group_engine_batches_append_records_and_recovers() {
12197 let wal_root = std::env::temp_dir().join(format!(
12198 "ursula-wal-batch-test-{}-{}",
12199 std::process::id(),
12200 std::time::SystemTime::now()
12201 .duration_since(std::time::UNIX_EPOCH)
12202 .expect("system time after unix epoch")
12203 .as_nanos()
12204 ));
12205 let _ = std::fs::remove_dir_all(&wal_root);
12206 let config = RuntimeConfig {
12207 core_count: 2,
12208 raft_group_count: 8,
12209 mailbox_capacity: 128,
12210 threading: RuntimeThreading::HostedTokio,
12211 cold_max_hot_bytes_per_group: None,
12212 live_read_max_waiters_per_core: Some(65_536),
12213 };
12214 let stream = BucketStreamId::new("benchcmp", "wal-batch");
12215 let placement;
12216
12217 {
12218 let runtime = ShardRuntime::spawn_with_engine_factory(
12219 config.clone(),
12220 WalGroupEngineFactory::new(&wal_root),
12221 )
12222 .expect("spawn runtime");
12223 placement = runtime.locate(&stream);
12224 create_stream(&runtime, &stream).await;
12225 let response = runtime
12226 .append_batch(AppendBatchRequest::new(
12227 stream.clone(),
12228 vec![b"ab".to_vec(), b"cd".to_vec(), b"ef".to_vec()],
12229 ))
12230 .await
12231 .expect("append batch");
12232 assert_eq!(response.items.len(), 3);
12233 assert!(response.items.iter().all(Result::is_ok));
12234
12235 let read = runtime
12236 .read_stream(ReadStreamRequest {
12237 stream_id: stream.clone(),
12238 offset: 0,
12239 max_len: 16,
12240 now_ms: 0,
12241 })
12242 .await
12243 .expect("read");
12244 assert_eq!(read.payload, b"abcdef");
12245
12246 let snapshot = runtime.metrics().snapshot();
12247 let core_index = usize::from(placement.core_id.0);
12248 let group_index = usize::try_from(placement.raft_group_id.0).expect("u32 fits usize");
12249 assert_eq!(snapshot.wal_batches, 2);
12250 assert_eq!(snapshot.wal_records, 2);
12251 assert_eq!(snapshot.per_core_wal_batches[core_index], 2);
12252 assert_eq!(snapshot.per_group_wal_batches[group_index], 2);
12253 assert_eq!(snapshot.per_core_wal_records[core_index], 2);
12254 assert_eq!(snapshot.per_group_wal_records[group_index], 2);
12255 assert!(snapshot.wal_write_ns > 0);
12256 assert!(snapshot.wal_sync_ns > 0);
12257 assert_eq!(
12258 snapshot.wal_write_ns,
12259 snapshot.per_core_wal_write_ns.iter().sum::<u64>()
12260 );
12261 assert_eq!(
12262 snapshot.wal_sync_ns,
12263 snapshot.per_group_wal_sync_ns.iter().sum::<u64>()
12264 );
12265 }
12266
12267 let log_path = group_log_path(&wal_root, placement);
12268 let line_count = std::fs::read_to_string(&log_path)
12269 .expect("read WAL log")
12270 .lines()
12271 .count();
12272 assert_eq!(line_count, 2);
12273
12274 let recovered =
12275 ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12276 .expect("spawn recovered runtime");
12277 let read = recovered
12278 .read_stream(ReadStreamRequest {
12279 stream_id: stream,
12280 offset: 0,
12281 max_len: 16,
12282 now_ms: 0,
12283 })
12284 .await
12285 .expect("read recovered batch");
12286 assert_eq!(read.payload, b"abcdef");
12287
12288 drop(recovered);
12289 std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12290 }
12291
12292 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12293 async fn wal_group_engine_persists_installed_snapshot() {
12294 let wal_root = std::env::temp_dir().join(format!(
12295 "ursula-wal-install-snapshot-test-{}-{}",
12296 std::process::id(),
12297 std::time::SystemTime::now()
12298 .duration_since(std::time::UNIX_EPOCH)
12299 .expect("system time after unix epoch")
12300 .as_nanos()
12301 ));
12302 let _ = std::fs::remove_dir_all(&wal_root);
12303 let config = RuntimeConfig {
12304 core_count: 2,
12305 raft_group_count: 8,
12306 mailbox_capacity: 128,
12307 threading: RuntimeThreading::HostedTokio,
12308 cold_max_hot_bytes_per_group: None,
12309 live_read_max_waiters_per_core: Some(65_536),
12310 };
12311 let stream = BucketStreamId::new("benchcmp", "wal-installed-snapshot");
12312 let source = runtime(2, 8);
12313 let placement = source.locate(&stream);
12314 create_stream(&source, &stream).await;
12315 source
12316 .append(AppendRequest::from_bytes(
12317 stream.clone(),
12318 b"snapshot-payload".to_vec(),
12319 ))
12320 .await
12321 .expect("append source");
12322 let snapshot = source
12323 .snapshot_group(placement.raft_group_id)
12324 .await
12325 .expect("snapshot source");
12326
12327 {
12328 let target = ShardRuntime::spawn_with_engine_factory(
12329 config.clone(),
12330 WalGroupEngineFactory::new(&wal_root),
12331 )
12332 .expect("spawn WAL runtime");
12333 target
12334 .install_group_snapshot(snapshot)
12335 .await
12336 .expect("install snapshot");
12337 }
12338
12339 let recovered =
12340 ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12341 .expect("spawn recovered WAL runtime");
12342 let read = recovered
12343 .read_stream(ReadStreamRequest {
12344 stream_id: stream.clone(),
12345 offset: 0,
12346 max_len: 32,
12347 now_ms: 0,
12348 })
12349 .await
12350 .expect("read recovered snapshot");
12351 assert_eq!(read.payload, b"snapshot-payload");
12352
12353 let appended = recovered
12354 .append(AppendRequest::from_bytes(stream, b"-next".to_vec()))
12355 .await
12356 .expect("append after recovered snapshot");
12357 assert_eq!(appended.start_offset, 16);
12358 assert_eq!(appended.stream_append_count, 2);
12359
12360 drop(recovered);
12361 std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12362 }
12363
12364 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12365 async fn wal_group_engine_recovers_producer_dedup_state() {
12366 let wal_root = std::env::temp_dir().join(format!(
12367 "ursula-wal-producer-test-{}-{}",
12368 std::process::id(),
12369 std::time::SystemTime::now()
12370 .duration_since(std::time::UNIX_EPOCH)
12371 .expect("system time after unix epoch")
12372 .as_nanos()
12373 ));
12374 let _ = std::fs::remove_dir_all(&wal_root);
12375 let config = RuntimeConfig {
12376 core_count: 2,
12377 raft_group_count: 8,
12378 mailbox_capacity: 128,
12379 threading: RuntimeThreading::HostedTokio,
12380 cold_max_hot_bytes_per_group: None,
12381 live_read_max_waiters_per_core: Some(65_536),
12382 };
12383 let stream = BucketStreamId::new("benchcmp", "wal-producer");
12384
12385 {
12386 let runtime = ShardRuntime::spawn_with_engine_factory(
12387 config.clone(),
12388 WalGroupEngineFactory::new(&wal_root),
12389 )
12390 .expect("spawn WAL runtime");
12391 create_stream(&runtime, &stream).await;
12392 let mut append = AppendRequest::from_bytes(stream.clone(), b"a".to_vec());
12393 append.producer = Some(producer("writer-1", 0, 0));
12394 runtime.append(append).await.expect("append");
12395 }
12396
12397 let recovered =
12398 ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12399 .expect("spawn recovered runtime");
12400 let mut duplicate = AppendRequest::from_bytes(stream.clone(), b"ignored".to_vec());
12401 duplicate.producer = Some(producer("writer-1", 0, 0));
12402 let duplicate = recovered
12403 .append(duplicate)
12404 .await
12405 .expect("deduplicated retry");
12406 assert!(duplicate.deduplicated);
12407 assert_eq!(duplicate.start_offset, 0);
12408 assert_eq!(duplicate.next_offset, 1);
12409 assert_eq!(duplicate.stream_append_count, 1);
12410
12411 let mut next = AppendRequest::from_bytes(stream.clone(), b"b".to_vec());
12412 next.producer = Some(producer("writer-1", 0, 1));
12413 let next = recovered.append(next).await.expect("next append");
12414 assert_eq!(next.start_offset, 1);
12415 assert_eq!(next.next_offset, 2);
12416 assert_eq!(next.stream_append_count, 2);
12417
12418 let read = recovered
12419 .read_stream(ReadStreamRequest {
12420 stream_id: stream,
12421 offset: 0,
12422 max_len: 16,
12423 now_ms: 0,
12424 })
12425 .await
12426 .expect("read");
12427 assert_eq!(read.payload, b"ab");
12428
12429 drop(recovered);
12430 std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12431 }
12432
12433 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
12434 async fn wal_group_engine_recovers_producer_append_batch_dedup_state() {
12435 let wal_root = std::env::temp_dir().join(format!(
12436 "ursula-wal-producer-batch-test-{}-{}",
12437 std::process::id(),
12438 std::time::SystemTime::now()
12439 .duration_since(std::time::UNIX_EPOCH)
12440 .expect("system time after unix epoch")
12441 .as_nanos()
12442 ));
12443 let _ = std::fs::remove_dir_all(&wal_root);
12444 let config = RuntimeConfig {
12445 core_count: 2,
12446 raft_group_count: 8,
12447 mailbox_capacity: 128,
12448 threading: RuntimeThreading::HostedTokio,
12449 cold_max_hot_bytes_per_group: None,
12450 live_read_max_waiters_per_core: Some(65_536),
12451 };
12452 let stream = BucketStreamId::new("benchcmp", "wal-producer-batch");
12453 let placement;
12454
12455 {
12456 let runtime = ShardRuntime::spawn_with_engine_factory(
12457 config.clone(),
12458 WalGroupEngineFactory::new(&wal_root),
12459 )
12460 .expect("spawn WAL runtime");
12461 placement = runtime.locate(&stream);
12462 create_stream(&runtime, &stream).await;
12463
12464 let mut first =
12465 AppendBatchRequest::new(stream.clone(), vec![b"a".to_vec(), b"b".to_vec()]);
12466 first.producer = Some(producer("writer-1", 0, 0));
12467 let first = runtime.append_batch(first).await.expect("first batch");
12468 assert!(first.items.iter().all(Result::is_ok));
12469
12470 let mut duplicate = AppendBatchRequest::new(stream.clone(), vec![b"ignored".to_vec()]);
12471 duplicate.producer = Some(producer("writer-1", 0, 0));
12472 let duplicate = runtime
12473 .append_batch(duplicate)
12474 .await
12475 .expect("duplicate batch");
12476 assert!(
12477 duplicate
12478 .items
12479 .iter()
12480 .all(|item| { item.as_ref().expect("deduplicated item").deduplicated })
12481 );
12482 }
12483
12484 let log_path = group_log_path(&wal_root, placement);
12485 let line_count = std::fs::read_to_string(&log_path)
12486 .expect("read WAL log")
12487 .lines()
12488 .count();
12489 assert_eq!(line_count, 2);
12490
12491 let recovered =
12492 ShardRuntime::spawn_with_engine_factory(config, WalGroupEngineFactory::new(&wal_root))
12493 .expect("spawn recovered runtime");
12494 let mut duplicate = AppendBatchRequest::new(stream.clone(), vec![b"retry".to_vec()]);
12495 duplicate.producer = Some(producer("writer-1", 0, 0));
12496 let duplicate = recovered
12497 .append_batch(duplicate)
12498 .await
12499 .expect("deduplicated retry");
12500 assert_eq!(duplicate.items.len(), 2);
12501 assert!(
12502 duplicate
12503 .items
12504 .iter()
12505 .all(|item| { item.as_ref().expect("deduplicated item").deduplicated })
12506 );
12507
12508 let mut next = AppendBatchRequest::new(stream.clone(), vec![b"c".to_vec()]);
12509 next.producer = Some(producer("writer-1", 0, 1));
12510 let next = recovered.append_batch(next).await.expect("next batch");
12511 assert_eq!(next.items[0].as_ref().expect("next item").start_offset, 2);
12512
12513 let read = recovered
12514 .read_stream(ReadStreamRequest {
12515 stream_id: stream,
12516 offset: 0,
12517 max_len: 16,
12518 now_ms: 0,
12519 })
12520 .await
12521 .expect("read");
12522 assert_eq!(read.payload, b"abc");
12523
12524 drop(recovered);
12525 std::fs::remove_dir_all(&wal_root).expect("remove WAL root");
12526 }
12527
12528 #[derive(Debug, Clone)]
12529 struct RecordingFactory {
12530 created: Arc<Mutex<Vec<ShardPlacement>>>,
12531 accepts_local_writes: bool,
12532 }
12533
12534 impl Default for RecordingFactory {
12535 fn default() -> Self {
12536 Self {
12537 created: Arc::default(),
12538 accepts_local_writes: true,
12539 }
12540 }
12541 }
12542
12543 impl RecordingFactory {
12544 fn without_local_writes() -> Self {
12545 Self {
12546 accepts_local_writes: false,
12547 ..Self::default()
12548 }
12549 }
12550
12551 fn created(&self) -> Vec<ShardPlacement> {
12552 self.created.lock().expect("lock created groups").clone()
12553 }
12554 }
12555
12556 impl GroupEngineFactory for RecordingFactory {
12557 fn create<'a>(
12558 &'a self,
12559 placement: ShardPlacement,
12560 _metrics: GroupEngineMetrics,
12561 ) -> GroupEngineCreateFuture<'a> {
12562 Box::pin(async move {
12563 self.created
12564 .lock()
12565 .expect("lock created groups")
12566 .push(placement);
12567 let engine: Box<dyn GroupEngine> = Box::new(RecordingEngine {
12568 placement,
12569 commit_index: 0,
12570 accepts_local_writes: self.accepts_local_writes,
12571 });
12572 Ok(engine)
12573 })
12574 }
12575 }
12576
    /// Test engine state: the placement it was created for, a counter used as
    /// its commit index, and the value it reports for local-write acceptance.
    struct RecordingEngine {
        // Placement this engine was created for.
        placement: ShardPlacement,
        // Counter reported as the group commit index.
        commit_index: u64,
        // Flag describing whether the engine accepts local writes.
        accepts_local_writes: bool,
    }
12582
    /// Test engine factory whose engines share a pair of `Notify` handles and a
    /// read counter with the factory (and therefore with the test body).
    #[derive(Clone)]
    struct BlockingReadFactory {
        // Handle the engines signal when a read is entered.
        entered: Arc<Notify>,
        // Handle the test signals to let a blocked read proceed.
        release: Arc<Notify>,
        // Number of read calls observed across the created engines.
        read_count: Arc<AtomicU64>,
        // When true, engines are created in the "block parts" mode.
        block_parts: bool,
    }
12590
12591 impl Default for BlockingReadFactory {
12592 fn default() -> Self {
12593 Self {
12594 entered: Arc::new(Notify::new()),
12595 release: Arc::new(Notify::new()),
12596 read_count: Arc::new(AtomicU64::new(0)),
12597 block_parts: false,
12598 }
12599 }
12600 }
12601
12602 impl BlockingReadFactory {
12603 fn block_materialization() -> Self {
12604 Self {
12605 block_parts: true,
12606 ..Self::default()
12607 }
12608 }
12609 }
12610
12611 impl GroupEngineFactory for BlockingReadFactory {
12612 fn create<'a>(
12613 &'a self,
12614 placement: ShardPlacement,
12615 _metrics: GroupEngineMetrics,
12616 ) -> GroupEngineCreateFuture<'a> {
12617 Box::pin(async move {
12618 let engine: Box<dyn GroupEngine> = Box::new(BlockingReadEngine {
12619 inner: InMemoryGroupEngine::default(),
12620 placement,
12621 entered: self.entered.clone(),
12622 release: self.release.clone(),
12623 read_count: self.read_count.clone(),
12624 block_parts: self.block_parts,
12625 });
12626 Ok(engine)
12627 })
12628 }
12629 }
12630
    /// Test engine that wraps an in-memory engine and carries the notification
    /// handles and read counter shared with its factory.
    struct BlockingReadEngine {
        // Wrapped in-memory engine.
        inner: InMemoryGroupEngine,
        // Placement this engine was created for.
        placement: ShardPlacement,
        // Signalled when a read is entered.
        entered: Arc<Notify>,
        // Awaited by a blocked read until the test releases it.
        release: Arc<Notify>,
        // Shared count of read calls.
        read_count: Arc<AtomicU64>,
        // Selects the "block parts" mode for part-based reads.
        block_parts: bool,
    }
12639
12640 impl GroupEngine for BlockingReadEngine {
12641 fn create_stream<'a>(
12642 &'a mut self,
12643 request: CreateStreamRequest,
12644 placement: ShardPlacement,
12645 ) -> GroupCreateStreamFuture<'a> {
12646 self.inner.create_stream(request, placement)
12647 }
12648
12649 fn head_stream<'a>(
12650 &'a mut self,
12651 request: HeadStreamRequest,
12652 placement: ShardPlacement,
12653 ) -> GroupHeadStreamFuture<'a> {
12654 self.inner.head_stream(request, placement)
12655 }
12656
12657 fn read_stream<'a>(
12658 &'a mut self,
12659 request: ReadStreamRequest,
12660 placement: ShardPlacement,
12661 ) -> GroupReadStreamFuture<'a> {
12662 let entered = self.entered.clone();
12663 let release = self.release.clone();
12664 let read_count = self.read_count.clone();
12665 Box::pin(async move {
12666 assert_eq!(placement, self.placement);
12667 read_count.fetch_add(1, Ordering::Relaxed);
12668 entered.notify_one();
12669 release.notified().await;
12670 Ok(ReadStreamResponse {
12671 placement,
12672 offset: request.offset,
12673 next_offset: request.offset,
12674 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12675 payload: Vec::new(),
12676 up_to_date: true,
12677 closed: false,
12678 })
12679 })
12680 }
12681
12682 fn read_stream_parts<'a>(
12683 &'a mut self,
12684 request: ReadStreamRequest,
12685 placement: ShardPlacement,
12686 ) -> GroupReadStreamPartsFuture<'a> {
12687 let entered = self.entered.clone();
12688 let read_count = self.read_count.clone();
12689 Box::pin(async move {
12690 assert_eq!(placement, self.placement);
12691 read_count.fetch_add(1, Ordering::Relaxed);
12692 entered.notify_one();
12693 if self.block_parts {
12694 return Ok(GroupReadStreamParts {
12695 placement,
12696 offset: request.offset,
12697 next_offset: request.offset
12698 + u64::try_from(b"ready".len()).expect("payload len fits u64"),
12699 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12700 up_to_date: true,
12701 closed: false,
12702 body: GroupReadStreamBody::Blocking {
12703 entered: self.entered.clone(),
12704 release: self.release.clone(),
12705 payload: b"ready".to_vec(),
12706 },
12707 });
12708 }
12709 let response = ReadStreamResponse {
12710 placement,
12711 offset: request.offset,
12712 next_offset: request.offset,
12713 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12714 payload: Vec::new(),
12715 up_to_date: true,
12716 closed: false,
12717 };
12718 Ok(GroupReadStreamParts::from_response(response))
12719 })
12720 }
12721
12722 fn touch_stream_access<'a>(
12723 &'a mut self,
12724 stream_id: BucketStreamId,
12725 now_ms: u64,
12726 renew_ttl: bool,
12727 placement: ShardPlacement,
12728 ) -> GroupTouchStreamAccessFuture<'a> {
12729 self.inner
12730 .touch_stream_access(stream_id, now_ms, renew_ttl, placement)
12731 }
12732
12733 fn add_fork_ref<'a>(
12734 &'a mut self,
12735 stream_id: BucketStreamId,
12736 now_ms: u64,
12737 placement: ShardPlacement,
12738 ) -> GroupForkRefFuture<'a> {
12739 self.inner.add_fork_ref(stream_id, now_ms, placement)
12740 }
12741
12742 fn release_fork_ref<'a>(
12743 &'a mut self,
12744 stream_id: BucketStreamId,
12745 placement: ShardPlacement,
12746 ) -> GroupForkRefFuture<'a> {
12747 self.inner.release_fork_ref(stream_id, placement)
12748 }
12749
12750 fn close_stream<'a>(
12751 &'a mut self,
12752 request: CloseStreamRequest,
12753 placement: ShardPlacement,
12754 ) -> GroupCloseStreamFuture<'a> {
12755 self.inner.close_stream(request, placement)
12756 }
12757
12758 fn delete_stream<'a>(
12759 &'a mut self,
12760 request: DeleteStreamRequest,
12761 placement: ShardPlacement,
12762 ) -> GroupDeleteStreamFuture<'a> {
12763 self.inner.delete_stream(request, placement)
12764 }
12765
12766 fn append<'a>(
12767 &'a mut self,
12768 request: AppendRequest,
12769 placement: ShardPlacement,
12770 ) -> GroupAppendFuture<'a> {
12771 self.inner.append(request, placement)
12772 }
12773
12774 fn append_batch<'a>(
12775 &'a mut self,
12776 request: AppendBatchRequest,
12777 placement: ShardPlacement,
12778 ) -> GroupAppendBatchFuture<'a> {
12779 self.inner.append_batch(request, placement)
12780 }
12781
12782 fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
12783 Box::pin(async move {
12784 Ok(GroupSnapshot {
12785 placement,
12786 group_commit_index: 0,
12787 stream_snapshot: StreamSnapshot {
12788 buckets: Vec::new(),
12789 streams: Vec::new(),
12790 },
12791 stream_append_counts: Vec::new(),
12792 })
12793 })
12794 }
12795
12796 fn install_snapshot<'a>(
12797 &'a mut self,
12798 _snapshot: GroupSnapshot,
12799 ) -> GroupInstallSnapshotFuture<'a> {
12800 Box::pin(async { Ok(()) })
12801 }
12802 }
12803
12804 impl GroupEngine for RecordingEngine {
12805 fn accepts_local_writes(&self) -> bool {
12806 self.accepts_local_writes
12807 }
12808
12809 fn create_stream<'a>(
12810 &'a mut self,
12811 request: CreateStreamRequest,
12812 placement: ShardPlacement,
12813 ) -> GroupCreateStreamFuture<'a> {
12814 Box::pin(async move {
12815 assert_eq!(placement, self.placement);
12816 self.commit_index += 1;
12817 Ok(CreateStreamResponse {
12818 placement,
12819 next_offset: u64::try_from(request.initial_payload.len())
12820 .expect("payload len fits u64"),
12821 closed: request.close_after,
12822 already_exists: false,
12823 group_commit_index: self.commit_index,
12824 })
12825 })
12826 }
12827
12828 fn head_stream<'a>(
12829 &'a mut self,
12830 request: HeadStreamRequest,
12831 placement: ShardPlacement,
12832 ) -> GroupHeadStreamFuture<'a> {
12833 Box::pin(async move {
12834 assert_eq!(placement, self.placement);
12835 Ok(HeadStreamResponse {
12836 placement,
12837 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12838 tail_offset: request.stream_id.stream_id.len() as u64,
12839 closed: false,
12840 stream_ttl_seconds: None,
12841 stream_expires_at_ms: None,
12842 snapshot_offset: None,
12843 })
12844 })
12845 }
12846
12847 fn read_stream<'a>(
12848 &'a mut self,
12849 request: ReadStreamRequest,
12850 placement: ShardPlacement,
12851 ) -> GroupReadStreamFuture<'a> {
12852 Box::pin(async move {
12853 assert_eq!(placement, self.placement);
12854 Ok(ReadStreamResponse {
12855 placement,
12856 offset: request.offset,
12857 next_offset: request.offset,
12858 content_type: DEFAULT_CONTENT_TYPE.to_owned(),
12859 payload: Vec::new(),
12860 up_to_date: true,
12861 closed: false,
12862 })
12863 })
12864 }
12865
12866 fn touch_stream_access<'a>(
12867 &'a mut self,
12868 _stream_id: BucketStreamId,
12869 _now_ms: u64,
12870 _renew_ttl: bool,
12871 placement: ShardPlacement,
12872 ) -> GroupTouchStreamAccessFuture<'a> {
12873 Box::pin(async move {
12874 assert_eq!(placement, self.placement);
12875 Ok(TouchStreamAccessResponse {
12876 placement,
12877 changed: false,
12878 expired: false,
12879 group_commit_index: self.commit_index,
12880 })
12881 })
12882 }
12883
12884 fn add_fork_ref<'a>(
12885 &'a mut self,
12886 _stream_id: BucketStreamId,
12887 _now_ms: u64,
12888 placement: ShardPlacement,
12889 ) -> GroupForkRefFuture<'a> {
12890 Box::pin(async move {
12891 assert_eq!(placement, self.placement);
12892 self.commit_index += 1;
12893 Ok(ForkRefResponse {
12894 placement,
12895 fork_ref_count: 1,
12896 hard_deleted: false,
12897 parent_to_release: None,
12898 group_commit_index: self.commit_index,
12899 })
12900 })
12901 }
12902
12903 fn release_fork_ref<'a>(
12904 &'a mut self,
12905 _stream_id: BucketStreamId,
12906 placement: ShardPlacement,
12907 ) -> GroupForkRefFuture<'a> {
12908 Box::pin(async move {
12909 assert_eq!(placement, self.placement);
12910 self.commit_index += 1;
12911 Ok(ForkRefResponse {
12912 placement,
12913 fork_ref_count: 0,
12914 hard_deleted: false,
12915 parent_to_release: None,
12916 group_commit_index: self.commit_index,
12917 })
12918 })
12919 }
12920
12921 fn close_stream<'a>(
12922 &'a mut self,
12923 _request: CloseStreamRequest,
12924 placement: ShardPlacement,
12925 ) -> GroupCloseStreamFuture<'a> {
12926 Box::pin(async move {
12927 assert_eq!(placement, self.placement);
12928 self.commit_index += 1;
12929 Ok(CloseStreamResponse {
12930 placement,
12931 next_offset: self.commit_index,
12932 group_commit_index: self.commit_index,
12933 deduplicated: false,
12934 })
12935 })
12936 }
12937
12938 fn delete_stream<'a>(
12939 &'a mut self,
12940 _request: DeleteStreamRequest,
12941 placement: ShardPlacement,
12942 ) -> GroupDeleteStreamFuture<'a> {
12943 Box::pin(async move {
12944 assert_eq!(placement, self.placement);
12945 self.commit_index += 1;
12946 Ok(DeleteStreamResponse {
12947 placement,
12948 group_commit_index: self.commit_index,
12949 hard_deleted: true,
12950 parent_to_release: None,
12951 })
12952 })
12953 }
12954
12955 fn append<'a>(
12956 &'a mut self,
12957 request: AppendRequest,
12958 placement: ShardPlacement,
12959 ) -> GroupAppendFuture<'a> {
12960 Box::pin(async move {
12961 assert_eq!(placement, self.placement);
12962 let start_offset = self.commit_index;
12963 let next_offset = start_offset + request.payload_len();
12964 self.commit_index += 1;
12965 Ok(AppendResponse {
12966 placement,
12967 start_offset,
12968 next_offset,
12969 stream_append_count: self.commit_index,
12970 group_commit_index: self.commit_index,
12971 closed: request.close_after,
12972 deduplicated: false,
12973 producer: request.producer,
12974 })
12975 })
12976 }
12977
12978 fn append_batch<'a>(
12979 &'a mut self,
12980 request: AppendBatchRequest,
12981 placement: ShardPlacement,
12982 ) -> GroupAppendBatchFuture<'a> {
12983 Box::pin(async move {
12984 assert_eq!(placement, self.placement);
12985 let AppendBatchRequest {
12986 stream_id: _,
12987 content_type: _,
12988 payloads,
12989 producer: _,
12990 ..
12991 } = request;
12992 let mut items = Vec::with_capacity(payloads.len());
12993 for payload in payloads {
12994 let start_offset = self.commit_index;
12995 let next_offset =
12996 start_offset + u64::try_from(payload.len()).expect("payload len fits u64");
12997 self.commit_index += 1;
12998 items.push(Ok(AppendResponse {
12999 placement,
13000 start_offset,
13001 next_offset,
13002 stream_append_count: self.commit_index,
13003 group_commit_index: self.commit_index,
13004 closed: false,
13005 deduplicated: false,
13006 producer: None,
13007 }));
13008 }
13009 Ok(GroupAppendBatchResponse { placement, items })
13010 })
13011 }
13012
13013 fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
13014 Box::pin(async move {
13015 assert_eq!(placement, self.placement);
13016 Ok(GroupSnapshot {
13017 placement,
13018 group_commit_index: self.commit_index,
13019 stream_snapshot: StreamSnapshot {
13020 buckets: Vec::new(),
13021 streams: Vec::new(),
13022 },
13023 stream_append_counts: Vec::new(),
13024 })
13025 })
13026 }
13027
13028 fn install_snapshot<'a>(
13029 &'a mut self,
13030 snapshot: GroupSnapshot,
13031 ) -> GroupInstallSnapshotFuture<'a> {
13032 Box::pin(async move {
13033 assert_eq!(snapshot.placement, self.placement);
13034 self.commit_index = snapshot.group_commit_index;
13035 Ok(())
13036 })
13037 }
13038 }
13039
13040 #[derive(Debug, Clone)]
13041 struct BlockingFirstCreateEngineFactory {
13042 first_create_blocks: Arc<AtomicBool>,
13043 entered: Arc<Notify>,
13044 release: Arc<Notify>,
13045 }
13046
13047 impl Default for BlockingFirstCreateEngineFactory {
13048 fn default() -> Self {
13049 Self {
13050 first_create_blocks: Arc::new(AtomicBool::new(true)),
13051 entered: Arc::new(Notify::new()),
13052 release: Arc::new(Notify::new()),
13053 }
13054 }
13055 }
13056
13057 impl GroupEngineFactory for BlockingFirstCreateEngineFactory {
13058 fn create<'a>(
13059 &'a self,
13060 _placement: ShardPlacement,
13061 _metrics: GroupEngineMetrics,
13062 ) -> GroupEngineCreateFuture<'a> {
13063 Box::pin(async move {
13064 let engine: Box<dyn GroupEngine> = Box::new(BlockingFirstCreateEngine {
13065 inner: InMemoryGroupEngine::default(),
13066 first_create_blocks: self.first_create_blocks.clone(),
13067 entered: self.entered.clone(),
13068 release: self.release.clone(),
13069 });
13070 Ok(engine)
13071 })
13072 }
13073 }
13074
13075 struct BlockingFirstCreateEngine {
13076 inner: InMemoryGroupEngine,
13077 first_create_blocks: Arc<AtomicBool>,
13078 entered: Arc<Notify>,
13079 release: Arc<Notify>,
13080 }
13081
13082 impl GroupEngine for BlockingFirstCreateEngine {
13083 fn create_stream<'a>(
13084 &'a mut self,
13085 request: CreateStreamRequest,
13086 placement: ShardPlacement,
13087 ) -> GroupCreateStreamFuture<'a> {
13088 let should_block = self.first_create_blocks.swap(false, Ordering::SeqCst);
13089 let entered = self.entered.clone();
13090 let release = self.release.clone();
13091 Box::pin(async move {
13092 if should_block {
13093 entered.notify_one();
13094 release.notified().await;
13095 }
13096 self.inner.create_stream(request, placement).await
13097 })
13098 }
13099
13100 fn head_stream<'a>(
13101 &'a mut self,
13102 request: HeadStreamRequest,
13103 placement: ShardPlacement,
13104 ) -> GroupHeadStreamFuture<'a> {
13105 self.inner.head_stream(request, placement)
13106 }
13107
13108 fn read_stream<'a>(
13109 &'a mut self,
13110 request: ReadStreamRequest,
13111 placement: ShardPlacement,
13112 ) -> GroupReadStreamFuture<'a> {
13113 self.inner.read_stream(request, placement)
13114 }
13115
13116 fn touch_stream_access<'a>(
13117 &'a mut self,
13118 stream_id: BucketStreamId,
13119 now_ms: u64,
13120 renew_ttl: bool,
13121 placement: ShardPlacement,
13122 ) -> GroupTouchStreamAccessFuture<'a> {
13123 self.inner
13124 .touch_stream_access(stream_id, now_ms, renew_ttl, placement)
13125 }
13126
13127 fn add_fork_ref<'a>(
13128 &'a mut self,
13129 stream_id: BucketStreamId,
13130 now_ms: u64,
13131 placement: ShardPlacement,
13132 ) -> GroupForkRefFuture<'a> {
13133 self.inner.add_fork_ref(stream_id, now_ms, placement)
13134 }
13135
13136 fn release_fork_ref<'a>(
13137 &'a mut self,
13138 stream_id: BucketStreamId,
13139 placement: ShardPlacement,
13140 ) -> GroupForkRefFuture<'a> {
13141 self.inner.release_fork_ref(stream_id, placement)
13142 }
13143
13144 fn close_stream<'a>(
13145 &'a mut self,
13146 request: CloseStreamRequest,
13147 placement: ShardPlacement,
13148 ) -> GroupCloseStreamFuture<'a> {
13149 self.inner.close_stream(request, placement)
13150 }
13151
13152 fn delete_stream<'a>(
13153 &'a mut self,
13154 request: DeleteStreamRequest,
13155 placement: ShardPlacement,
13156 ) -> GroupDeleteStreamFuture<'a> {
13157 self.inner.delete_stream(request, placement)
13158 }
13159
13160 fn append<'a>(
13161 &'a mut self,
13162 request: AppendRequest,
13163 placement: ShardPlacement,
13164 ) -> GroupAppendFuture<'a> {
13165 self.inner.append(request, placement)
13166 }
13167
13168 fn append_batch<'a>(
13169 &'a mut self,
13170 request: AppendBatchRequest,
13171 placement: ShardPlacement,
13172 ) -> GroupAppendBatchFuture<'a> {
13173 self.inner.append_batch(request, placement)
13174 }
13175
13176 fn snapshot<'a>(&'a mut self, placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
13177 self.inner.snapshot(placement)
13178 }
13179
13180 fn install_snapshot<'a>(
13181 &'a mut self,
13182 snapshot: GroupSnapshot,
13183 ) -> GroupInstallSnapshotFuture<'a> {
13184 self.inner.install_snapshot(snapshot)
13185 }
13186 }
13187
13188 #[derive(Debug, Clone)]
13189 struct BlockingOnceFactory {
13190 first_create_blocks: Arc<AtomicBool>,
13191 entered: Arc<Notify>,
13192 release: Arc<Notify>,
13193 }
13194
13195 impl Default for BlockingOnceFactory {
13196 fn default() -> Self {
13197 Self {
13198 first_create_blocks: Arc::new(AtomicBool::new(true)),
13199 entered: Arc::new(Notify::new()),
13200 release: Arc::new(Notify::new()),
13201 }
13202 }
13203 }
13204
13205 impl GroupEngineFactory for BlockingOnceFactory {
13206 fn create<'a>(
13207 &'a self,
13208 _placement: ShardPlacement,
13209 _metrics: GroupEngineMetrics,
13210 ) -> GroupEngineCreateFuture<'a> {
13211 Box::pin(async move {
13212 if self.first_create_blocks.swap(false, Ordering::SeqCst) {
13213 self.entered.notify_one();
13214 self.release.notified().await;
13215 }
13216 let engine: Box<dyn GroupEngine> = Box::new(InMemoryGroupEngine::default());
13217 Ok(engine)
13218 })
13219 }
13220 }
13221
/// Test factory that successfully hands out `FailingEngine` instances, i.e. engines
/// whose every operation returns an error.
#[derive(Clone, Copy, Debug)]
struct FailingFactory;
13224
13225 impl GroupEngineFactory for FailingFactory {
13226 fn create<'a>(
13227 &'a self,
13228 _placement: ShardPlacement,
13229 _metrics: GroupEngineMetrics,
13230 ) -> GroupEngineCreateFuture<'a> {
13231 Box::pin(async {
13232 let engine: Box<dyn GroupEngine> = Box::new(FailingEngine);
13233 Ok(engine)
13234 })
13235 }
13236 }
13237
/// Stateless test engine: its `GroupEngine` implementation fails every operation with
/// a `GroupEngineError` carrying the message "proposal rejected".
struct FailingEngine;
13239
13240 impl GroupEngine for FailingEngine {
13241 fn create_stream<'a>(
13242 &'a mut self,
13243 _request: CreateStreamRequest,
13244 _placement: ShardPlacement,
13245 ) -> GroupCreateStreamFuture<'a> {
13246 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13247 }
13248
13249 fn head_stream<'a>(
13250 &'a mut self,
13251 _request: HeadStreamRequest,
13252 _placement: ShardPlacement,
13253 ) -> GroupHeadStreamFuture<'a> {
13254 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13255 }
13256
13257 fn read_stream<'a>(
13258 &'a mut self,
13259 _request: ReadStreamRequest,
13260 _placement: ShardPlacement,
13261 ) -> GroupReadStreamFuture<'a> {
13262 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13263 }
13264
13265 fn touch_stream_access<'a>(
13266 &'a mut self,
13267 _stream_id: BucketStreamId,
13268 _now_ms: u64,
13269 _renew_ttl: bool,
13270 _placement: ShardPlacement,
13271 ) -> GroupTouchStreamAccessFuture<'a> {
13272 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13273 }
13274
13275 fn add_fork_ref<'a>(
13276 &'a mut self,
13277 _stream_id: BucketStreamId,
13278 _now_ms: u64,
13279 _placement: ShardPlacement,
13280 ) -> GroupForkRefFuture<'a> {
13281 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13282 }
13283
13284 fn release_fork_ref<'a>(
13285 &'a mut self,
13286 _stream_id: BucketStreamId,
13287 _placement: ShardPlacement,
13288 ) -> GroupForkRefFuture<'a> {
13289 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13290 }
13291
13292 fn close_stream<'a>(
13293 &'a mut self,
13294 _request: CloseStreamRequest,
13295 _placement: ShardPlacement,
13296 ) -> GroupCloseStreamFuture<'a> {
13297 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13298 }
13299
13300 fn delete_stream<'a>(
13301 &'a mut self,
13302 _request: DeleteStreamRequest,
13303 _placement: ShardPlacement,
13304 ) -> GroupDeleteStreamFuture<'a> {
13305 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13306 }
13307
13308 fn append<'a>(
13309 &'a mut self,
13310 _request: AppendRequest,
13311 _placement: ShardPlacement,
13312 ) -> GroupAppendFuture<'a> {
13313 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13314 }
13315
13316 fn append_batch<'a>(
13317 &'a mut self,
13318 _request: AppendBatchRequest,
13319 _placement: ShardPlacement,
13320 ) -> GroupAppendBatchFuture<'a> {
13321 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13322 }
13323
13324 fn snapshot<'a>(&'a mut self, _placement: ShardPlacement) -> GroupSnapshotFuture<'a> {
13325 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13326 }
13327
13328 fn install_snapshot<'a>(
13329 &'a mut self,
13330 _snapshot: GroupSnapshot,
13331 ) -> GroupInstallSnapshotFuture<'a> {
13332 Box::pin(async { Err(GroupEngineError::new("proposal rejected")) })
13333 }
13334 }
13335}