rapace-cell 0.5.0

High-level cell runtime for rapace that eliminates boilerplate
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
#![doc = include_str!("../README.md")]

use std::error::Error as StdError;
use std::future::Future;
use std::path::PathBuf;
use std::pin::Pin;
use std::sync::Arc;

use rapace::transport::shm::{ShmSession, ShmSessionConfig, ShmTransport};
use rapace::{Transport, TransportError};

// Re-export common rapace types so macro-expanded code can refer to `$crate::...`
// without requiring every cell crate to depend on `rapace` directly.
pub use rapace::{Frame, RpcError, RpcSession};

pub mod lifecycle;
pub use lifecycle::{CellLifecycle, CellLifecycleClient, CellLifecycleServer, ReadyAck, ReadyMsg};

pub mod tracing_setup;
pub use tracing_setup::TracingConfigService;

#[cfg(unix)]
use rapace::transport::shm::{Doorbell, HubPeer};
#[cfg(unix)]
use std::os::unix::io::RawFd;

/// Report whether the cell's own stderr status messages should be suppressed.
///
/// Two environment toggles are honoured, the generic `RAPACE_QUIET` and the
/// dodeca-specific `DODECA_QUIET`. A toggle is "on" when the variable is set to
/// anything other than the empty string, `0`, or `false` (ASCII case-insensitive).
fn quiet_mode_enabled() -> bool {
    let is_on = |name: &str| -> bool {
        std::env::var_os(name).map_or(false, |raw| {
            // Non-UTF-8 values are converted lossily; they can never equal one of
            // the "off" spellings, so they count as enabled.
            let text = raw.to_string_lossy();
            let switched_off =
                text.is_empty() || text == "0" || text.eq_ignore_ascii_case("false");
            !switched_off
        })
    };

    // The second variable is only consulted when the first one is not enabled.
    is_on("RAPACE_QUIET") || is_on("DODECA_QUIET")
}

/// Default SHM configuration for two-peer sessions.
///
/// Designed for typical host-cell communication with moderate payloads.
/// Total memory per session: ~8.5MB (2 × 17KB rings + 8MB data segment).
///
/// This is the configuration the non-`_with_config` entry points ([`run`],
/// [`run_with_session`], [`run_multi`]) pass along; use the `*_with_config`
/// variants to supply a different [`ShmSessionConfig`].
///
/// See module documentation for customization guidelines.
// NOTE(review): 2 × 17KB + 8MB adds up to roughly 8.03MB rather than ~8.5MB;
// confirm the stated total (it may include headers not visible from here).
pub const DEFAULT_SHM_CONFIG: ShmSessionConfig = ShmSessionConfig {
    ring_capacity: 256, // 256 in-flight descriptors per direction
    slot_size: 65536,   // 64KB max payload per slot
    slot_count: 128,    // 128 slots = 8MB total data segment
};

/// Channel ID start for cells (even IDs: 2, 4, 6, ...)
/// Hosts use odd IDs (1, 3, 5, ...)
///
/// Passed to `RpcSession::with_channel_start` for every session created by
/// `setup_cell`, in both pair mode and hub mode.
const CELL_CHANNEL_START: u32 = 2;

/// Error type for cell runtime operations
///
/// Returned by every `run*` entry point in this crate. `RpcError` and
/// `TransportError` values convert into it via `From`, so `?` can be used on
/// results carrying either of them.
#[derive(Debug)]
pub enum CellError {
    /// Failed to parse command line arguments
    /// (for example, no SHM path was supplied at all).
    Args(String),
    /// SHM file was not created by host within timeout; carries the awaited path.
    ShmTimeout(PathBuf),
    /// Hub file was not created by host within timeout; carries the awaited path.
    HubTimeout(PathBuf),
    /// Failed to open SHM session; carries the `Debug` rendering of the cause.
    ShmOpen(String),
    /// Failed to open hub session; carries the `Debug` rendering of the cause.
    HubOpen(String),
    /// Missing or invalid hub arguments
    /// (also used when hub mode is requested on a non-unix platform).
    HubArgs(String),
    /// Doorbell fd invalid
    DoorbellFd(String),
    /// RPC session error
    Rpc(RpcError),
    /// Transport error
    Transport(TransportError),
}

impl std::fmt::Display for CellError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Args(msg) => write!(f, "Argument error: {}", msg),
            Self::ShmTimeout(path) => write!(f, "SHM file not created by host: {}", path.display()),
            Self::HubTimeout(path) => write!(f, "Hub file not created by host: {}", path.display()),
            Self::ShmOpen(msg) => write!(f, "Failed to open SHM: {}", msg),
            Self::HubOpen(msg) => write!(f, "Failed to open hub: {}", msg),
            Self::HubArgs(msg) => write!(f, "Hub argument error: {}", msg),
            Self::DoorbellFd(msg) => write!(f, "Doorbell fd error: {}", msg),
            Self::Rpc(e) => write!(f, "RPC error: {:?}", e),
            Self::Transport(e) => write!(f, "Transport error: {:?}", e),
        }
    }
}

// Marker impl: relies on the `Debug` and `Display` impls above and keeps the
// default `source()`, so wrapped RPC/transport errors are not exposed as a cause.
impl StdError for CellError {}

impl From<RpcError> for CellError {
    fn from(e: RpcError) -> Self {
        Self::Rpc(e)
    }
}

impl From<TransportError> for CellError {
    fn from(e: TransportError) -> Self {
        Self::Transport(e)
    }
}

/// Trait for service servers that can be dispatched
///
/// Implementors are stored as boxed trait objects inside a [`DispatcherBuilder`]
/// and must therefore be `Send + Sync + 'static`.
pub trait ServiceDispatch: Send + Sync + 'static {
    /// Dispatch a method call to this service
    ///
    /// `method_id` identifies the method being called and `payload` is the raw
    /// request payload. The returned future is `'static`, so it cannot borrow
    /// from `self` or `payload`; implementations must copy or clone whatever
    /// they need before building the future.
    ///
    /// The dispatcher produced by [`DispatcherBuilder::build`] treats an
    /// `RpcError::Status` with code `Unimplemented` as "not my method" and moves
    /// on to the next registered service.
    fn dispatch(
        &self,
        method_id: u32,
        payload: &[u8],
    ) -> Pin<Box<dyn Future<Output = Result<Frame, RpcError>> + Send + 'static>>;
}

/// Builder for creating multi-service dispatchers
///
/// Services are collected in insertion order; the dispatcher returned by
/// `build` tries them in that same order.
pub struct DispatcherBuilder {
    /// Registered services, in the order they were added.
    services: Vec<Box<dyn ServiceDispatch>>,
}

impl DispatcherBuilder {
    /// Create a new dispatcher builder
    pub fn new() -> Self {
        Self {
            services: Vec::new(),
        }
    }

    /// Add a service to the dispatcher
    pub fn add_service<S>(mut self, service: S) -> Self
    where
        S: ServiceDispatch,
    {
        self.services.push(Box::new(service));
        self
    }

    /// Add service introspection to this cell.
    ///
    /// This exposes the `ServiceIntrospection` service, allowing callers to
    /// query what services and methods this cell provides at runtime.
    ///
    /// # Example
    ///
    /// ```ignore
    /// use rapace_cell::run_multi;
    ///
    /// run_multi(|builder| {
    ///     builder
    ///         .add_service(MyServiceServer::new(my_impl))
    ///         .with_introspection() // ← Add introspection!
    /// }).await?;
    /// ```
    #[cfg(feature = "introspection")]
    pub fn with_introspection(self) -> Self {
        use rapace_introspection::{DefaultServiceIntrospection, ServiceIntrospectionServer};

        let introspection = DefaultServiceIntrospection::new();
        let server = Arc::new(ServiceIntrospectionServer::new(introspection));

        // Wrap the generated server to implement ServiceDispatch
        struct IntrospectionDispatcher(
            Arc<ServiceIntrospectionServer<DefaultServiceIntrospection>>,
        );

        impl ServiceDispatch for IntrospectionDispatcher {
            fn dispatch(
                &self,
                method_id: u32,
                payload: &[u8],
            ) -> Pin<Box<dyn Future<Output = Result<Frame, RpcError>> + Send + 'static>>
            {
                // Clone payload and capture server Arc for the future
                let payload_owned = payload.to_vec();
                let server = self.0.clone();
                Box::pin(async move { server.dispatch(method_id, &payload_owned).await })
            }
        }

        self.add_service(IntrospectionDispatcher(server))
    }

    /// Build the dispatcher function
    #[allow(clippy::type_complexity)]
    pub fn build(
        self,
    ) -> impl Fn(Frame) -> Pin<Box<dyn Future<Output = Result<Frame, RpcError>> + Send>>
    + Send
    + Sync
    + 'static {
        let services = Arc::new(self.services);
        move |request: Frame| {
            let services = services.clone();
            Box::pin(async move {
                let method_id = request.desc.method_id;
                let payload = request.payload_bytes();

                // Try each service in order until one doesn't return Unimplemented
                for service in services.iter() {
                    let result = service.dispatch(method_id, payload).await;

                    // If not "unknown method_id", return the result
                    if !matches!(
                        &result,
                        Err(RpcError::Status {
                            code: rapace::ErrorCode::Unimplemented,
                            ..
                        })
                    ) {
                        let mut response = result?;
                        response.desc.channel_id = request.desc.channel_id;
                        response.desc.msg_id = request.desc.msg_id;
                        return Ok(response);
                    }
                }

                // No service handled this method - use registry for better error message
                let error_msg = rapace_registry::ServiceRegistry::with_global(|reg| {
                    if let Some(method) = reg.method_by_id(rapace_registry::MethodId(method_id)) {
                        format!(
                            "Method '{}' (id={}) exists in registry but is not implemented by any service in this cell",
                            method.full_name, method_id
                        )
                    } else {
                        format!(
                            "Unknown method_id: {} (not registered in global registry)",
                            method_id
                        )
                    }
                });

                Err(RpcError::Status {
                    code: rapace::ErrorCode::Unimplemented,
                    message: error_msg,
                })
            })
        }
    }
}

impl Default for DispatcherBuilder {
    fn default() -> Self {
        Self::new()
    }
}

/// Outcome of command-line parsing: which transport mode the cell should use.
enum ParsedArgs {
    /// Two-peer mode backed by a single SHM file.
    Pair {
        /// Path of the SHM file (from `--shm-path=` or the first positional argument).
        shm_path: PathBuf,
    },
    /// Hub mode (unix only), selected when `--hub-path=` is given.
    #[cfg(unix)]
    Hub {
        /// Path of the hub file (from `--hub-path=`).
        hub_path: PathBuf,
        /// This cell's peer id (from `--peer-id=`).
        peer_id: u16,
        /// Raw doorbell file descriptor (from `--doorbell-fd=`).
        doorbell_fd: RawFd,
    },
}

/// Parse CLI arguments to extract either SHM pair args or hub args.
///
/// Recognised arguments (the program name is skipped):
///
/// - `--shm-path=PATH` selects pair mode.
/// - `--hub-path=PATH` selects hub mode and takes precedence over any SHM path.
/// - `--peer-id=N` and `--doorbell-fd=FD` are required in hub mode.
/// - The first argument not starting with `--` is used as the SHM path when
///   no SHM or hub path has been seen yet (backwards compatibility).
///
/// Anything else is ignored. When an option is repeated, the last one wins.
///
/// # Errors
///
/// - [`CellError::HubArgs`] when hub mode is requested on a non-unix platform,
///   or when `--peer-id` / `--doorbell-fd` is missing or not a valid number.
///   Malformed values are reported as invalid (with the offending text) rather
///   than as missing.
/// - [`CellError::Args`] when neither a hub path nor an SHM path was given.
fn parse_args() -> Result<ParsedArgs, CellError> {
    let mut shm_path: Option<PathBuf> = None;
    let mut hub_path: Option<PathBuf> = None;
    // Hub-only values are kept as raw text. They are validated only when hub mode
    // is selected, so a stray bad value never affects pair mode and a malformed
    // value can be reported precisely.
    #[cfg(unix)]
    let mut peer_id_raw: Option<String> = None;
    #[cfg(unix)]
    let mut doorbell_fd_raw: Option<String> = None;

    for arg in std::env::args().skip(1) {
        if let Some(value) = arg.strip_prefix("--shm-path=") {
            shm_path = Some(PathBuf::from(value));
        } else if let Some(value) = arg.strip_prefix("--hub-path=") {
            hub_path = Some(PathBuf::from(value));
        } else if let Some(value) = arg.strip_prefix("--peer-id=") {
            #[cfg(unix)]
            {
                peer_id_raw = Some(value.to_owned());
            }
            #[cfg(not(unix))]
            let _ = value;
        } else if let Some(value) = arg.strip_prefix("--doorbell-fd=") {
            #[cfg(unix)]
            {
                doorbell_fd_raw = Some(value.to_owned());
            }
            #[cfg(not(unix))]
            let _ = value;
        } else if !arg.starts_with("--") && shm_path.is_none() && hub_path.is_none() {
            // First positional argument defaults to shm-path for backwards compat.
            shm_path = Some(PathBuf::from(arg));
        }
    }

    if let Some(hub_path) = hub_path {
        #[cfg(not(unix))]
        {
            let _ = hub_path;
            return Err(CellError::HubArgs(
                "hub mode is only supported on unix platforms".to_string(),
            ));
        }

        #[cfg(unix)]
        {
            let peer_id_raw = peer_id_raw
                .ok_or_else(|| CellError::HubArgs("Missing --peer-id for hub mode".to_string()))?;
            let peer_id = peer_id_raw.parse::<u16>().map_err(|e| {
                CellError::HubArgs(format!("Invalid --peer-id value {peer_id_raw:?}: {e}"))
            })?;

            let doorbell_fd_raw = doorbell_fd_raw.ok_or_else(|| {
                CellError::HubArgs("Missing --doorbell-fd for hub mode".to_string())
            })?;
            let doorbell_fd = doorbell_fd_raw.parse::<RawFd>().map_err(|e| {
                CellError::HubArgs(format!(
                    "Invalid --doorbell-fd value {doorbell_fd_raw:?}: {e}"
                ))
            })?;

            return Ok(ParsedArgs::Hub {
                hub_path,
                peer_id,
                doorbell_fd,
            });
        }
    }

    if let Some(shm_path) = shm_path {
        return Ok(ParsedArgs::Pair { shm_path });
    }

    Err(CellError::Args(
        "Missing SHM path (use --shm-path=PATH or provide as first argument)".to_string(),
    ))
}

/// Poll until `path` exists or about `timeout_ms` milliseconds have elapsed.
///
/// Returns `true` as soon as the path is seen and `false` on timeout. The path
/// is always checked at least once, even when `timeout_ms` is smaller than the
/// poll interval (or zero), and once more after the final sleep.
async fn wait_for_host_file(path: &std::path::Path, timeout_ms: u64) -> bool {
    const POLL_INTERVAL_MS: u64 = 100;

    let mut remaining_ms = timeout_ms;
    loop {
        if path.exists() {
            return true;
        }
        if remaining_ms == 0 {
            return false;
        }
        // Never sleep past the deadline.
        let nap_ms = remaining_ms.min(POLL_INTERVAL_MS);
        tokio::time::sleep(std::time::Duration::from_millis(nap_ms)).await;
        remaining_ms -= nap_ms;
    }
}

/// Wait for the host to create the SHM file
///
/// Polls `path` for up to `timeout_ms` milliseconds.
///
/// # Errors
///
/// Returns [`CellError::ShmTimeout`] carrying `path` if the file did not appear.
async fn wait_for_shm(path: &std::path::Path, timeout_ms: u64) -> Result<(), CellError> {
    if wait_for_host_file(path, timeout_ms).await {
        Ok(())
    } else {
        Err(CellError::ShmTimeout(path.to_path_buf()))
    }
}

/// Wait for the host to create the hub file
///
/// Polls `path` for up to `timeout_ms` milliseconds.
///
/// # Errors
///
/// Returns [`CellError::HubTimeout`] carrying `path` if the file did not appear.
async fn wait_for_hub(path: &std::path::Path, timeout_ms: u64) -> Result<(), CellError> {
    if wait_for_host_file(path, timeout_ms).await {
        Ok(())
    } else {
        Err(CellError::HubTimeout(path.to_path_buf()))
    }
}

/// Check that `fd` refers to an open file descriptor in this process.
///
/// # Errors
///
/// Returns [`CellError::DoorbellFd`] with the OS error text when `fcntl`
/// reports a failure for `fd`.
#[cfg(unix)]
fn validate_doorbell_fd(fd: RawFd) -> Result<(), CellError> {
    // SAFETY: `fcntl(F_GETFL)` only queries the descriptor's status flags. It takes
    // no pointers and does not modify the descriptor, so it is sound for any
    // integer value; a bad fd merely yields a negative return.
    match unsafe { libc::fcntl(fd, libc::F_GETFL) } {
        status if status < 0 => Err(CellError::DoorbellFd(format!(
            "doorbell fd {fd} is invalid: {}",
            std::io::Error::last_os_error()
        ))),
        _ => Ok(()),
    }
}

/// Best-effort name for this cell, derived from the running executable.
///
/// Yields the executable's file stem (file name without extension), converted
/// lossily to UTF-8. Falls back to `"cell"` when the executable path cannot be
/// determined or has no file stem.
fn cell_name_guess() -> String {
    if let Ok(exe_path) = std::env::current_exe() {
        if let Some(stem) = exe_path.file_stem() {
            return stem.to_string_lossy().into_owned();
        }
    }
    "cell".to_string()
}

/// Cell setup result with optional peer_id for hub mode
///
/// Produced by `setup_cell` and consumed by the `run*_with_config` entry points.
struct CellSetup {
    /// RPC session built on the SHM transport, shared with the demux task.
    session: Arc<RpcSession>,
    /// Path of the SHM file (pair mode) or hub file (hub mode) that was opened.
    #[allow(dead_code)]
    path: PathBuf,
    /// peer_id is Some when in hub mode, indicating ready signal should be sent
    peer_id: Option<u16>,
}

/// Setup common cell infrastructure
///
/// Steps, in order:
/// 1. install the parent death-watch,
/// 2. parse the command line to pick pair mode or hub mode,
/// 3. wait up to 5000 ms for the host-created file to appear,
/// 4. open the transport and wrap it in an `RpcSession` whose channel IDs start
///    at `CELL_CHANNEL_START`.
///
/// `config` is only used in pair mode; the hub branch does not read it.
///
/// # Errors
///
/// Propagates argument errors from `parse_args`, `ShmTimeout` / `HubTimeout`
/// from the wait step, `DoorbellFd` when the doorbell descriptor is invalid or
/// cannot be wrapped, and `ShmOpen` / `HubOpen` when opening fails.
async fn setup_cell(config: ShmSessionConfig) -> Result<CellSetup, CellError> {
    // Install death-watch: cell will exit if parent dies
    // Required on macOS for ur-taking-me-with-you to work
    ur_taking_me_with_you::die_with_parent();

    match parse_args()? {
        ParsedArgs::Pair { shm_path } => {
            // The host creates the SHM file; give it up to 5 seconds.
            wait_for_shm(&shm_path, 5000).await?;

            let shm_session = ShmSession::open_file(&shm_path, config)
                .map_err(|e| CellError::ShmOpen(format!("{:?}", e)))?;

            let transport = Transport::Shm(ShmTransport::new(shm_session));
            let session = Arc::new(RpcSession::with_channel_start(
                transport,
                CELL_CHANNEL_START,
            ));
            Ok(CellSetup {
                session,
                path: shm_path,
                // No peer id in pair mode, so callers skip the ready handshake.
                peer_id: None,
            })
        }
        #[cfg(unix)]
        ParsedArgs::Hub {
            hub_path,
            peer_id,
            doorbell_fd,
        } => {
            // The host creates the hub file; give it up to 5 seconds.
            wait_for_hub(&hub_path, 5000).await?;
            // Fail early with a clear error if the inherited fd is not open.
            validate_doorbell_fd(doorbell_fd)?;

            let peer = HubPeer::open(&hub_path, peer_id)
                .map_err(|e| CellError::HubOpen(format!("{:?}", e)))?;
            peer.register();

            let doorbell = Doorbell::from_raw_fd(doorbell_fd)
                .map_err(|e| CellError::DoorbellFd(format!("{:?}", e)))?;

            let transport = Transport::Shm(ShmTransport::hub_peer(
                Arc::new(peer),
                doorbell,
                cell_name_guess(),
            ));

            let session = Arc::new(RpcSession::with_channel_start(
                transport,
                CELL_CHANNEL_START,
            ));
            Ok(CellSetup {
                session,
                path: hub_path,
                // A peer id tells callers to send the ready handshake.
                peer_id: Some(peer_id),
            })
        }
    }
}

/// Run a cell that exposes a single service, using [`DEFAULT_SHM_CONFIG`].
///
/// This is the zero-configuration entry point. It delegates to
/// [`run_with_config`], which takes care of:
/// - parsing the CLI arguments,
/// - waiting for the host to create the SHM (or hub) file,
/// - setting up the SHM transport and RPC session,
/// - installing the service dispatcher,
/// - running the session loop until it ends.
///
/// # Errors
///
/// Returns whatever [`run_with_config`] returns.
///
/// # Example
///
/// ```rust,ignore
/// use rapace_cell::{run, ServiceDispatch};
/// use rapace::{Frame, RpcError};
/// use std::future::Future;
/// use std::pin::Pin;
///
/// # struct MyServiceServer;
/// # impl MyServiceServer {
/// #     fn new(impl_: ()) -> Self { Self }
/// #     async fn dispatch_impl(&self, method_id: u32, payload: &[u8]) -> Result<Frame, RpcError> {
/// #         unimplemented!()
/// #     }
/// # }
/// # impl ServiceDispatch for MyServiceServer {
/// #     fn dispatch(&self, method_id: u32, payload: &[u8]) -> Pin<Box<dyn Future<Output = Result<Frame, RpcError>> + Send + 'static>> {
/// #         Box::pin(Self::dispatch_impl(self, method_id, payload))
/// #     }
/// # }
///
/// #[tokio::main]
/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
///     let server = MyServiceServer::new(());
///     run(server).await?;
///     Ok(())
/// }
/// ```
pub async fn run<S>(service: S) -> Result<(), CellError>
where
    S: ServiceDispatch,
{
    let default_config = DEFAULT_SHM_CONFIG;
    run_with_config(service, default_config).await
}

/// Run a single-service cell with custom SHM configuration
///
/// This automatically sets up rapace-tracing to forward logs to the host.
/// When running in hub mode (with `--peer-id`), this automatically sends a
/// `CellLifecycle.ready()` signal to the host after the session is established.
///
/// The dispatcher tries the tracing-config service first, then `service`.
///
/// # Errors
///
/// Returns any error from cell setup, the error the session loop ends with,
/// or a `CellError::Transport` wrapping an I/O error if the background demux
/// task cannot be joined. A failed ready handshake is *not* an error: it is
/// only reported on stderr (unless quiet mode is enabled) and startup continues.
pub async fn run_with_config<S>(service: S, config: ShmSessionConfig) -> Result<(), CellError>
where
    S: ServiceDispatch,
{
    let setup = setup_cell(config).await?;

    let session = setup.session;
    let peer_id = setup.peer_id;
    let cell_name = cell_name_guess();

    // Expose TracingConfig early (so the host can push filters), but don't install
    // the forwarding tracing layer until after the ready handshake.
    let (tracing_filter, tracing_service) = tracing_setup::create_tracing_config_service();

    // IMPORTANT: Set up dispatcher before starting demux.
    // We intentionally delay installing the tracing layer until after ready, to avoid
    // startup floods on contended transports.
    session.set_dispatcher(
        DispatcherBuilder::new()
            .add_service(tracing_service)
            .add_service(service)
            .build(),
    );

    // Start demux loop in background so we can send ready signal
    let run_task = {
        let session = session.clone();
        tokio::spawn(async move { session.run().await })
    };

    // Yield to ensure demux task gets scheduled
    for _ in 0..10 {
        tokio::task::yield_now().await;
    }

    // Send ready signal FIRST, before tracing is initialized.
    // `peer_id` is only set in hub mode, so pair mode skips the handshake.
    if let Some(peer_id) = peer_id {
        let client = CellLifecycleClient::new(session.clone());
        let msg = ReadyMsg {
            peer_id,
            cell_name: cell_name.clone(),
            pid: Some(std::process::id()),
            version: None,
            features: vec![],
        };
        if !quiet_mode_enabled() {
            eprintln!(
                "[rapace-cell] {} (peer_id={}) sending ready signal...",
                cell_name, peer_id
            );
        }
        // Retry the handshake; hub slot allocation can be temporarily contended during parallel startup.
        match ready_handshake_with_backoff(&client, msg).await {
            Ok(ack) => {
                if !quiet_mode_enabled() {
                    eprintln!(
                        "[rapace-cell] {} (peer_id={}) ready acknowledged: ok={}",
                        cell_name, peer_id, ack.ok
                    );
                }
            }
            // Best effort: a failed handshake is reported but does not abort the cell.
            Err(e) => {
                if !quiet_mode_enabled() {
                    eprintln!(
                        "[rapace-cell] {} (peer_id={}) ready FAILED: {:?}",
                        cell_name, peer_id, e
                    );
                }
            }
        }
    }

    // NOW initialize tracing (after the ready handshake has been attempted,
    // whether or not it succeeded).
    tracing_setup::install_tracing_layer(session.clone(), tracing_filter);
    tracing::debug!(target: "cell", cell = %cell_name, "Connected to host via SHM: {}", setup.path.display());

    // Wait for session to complete
    match run_task.await {
        Ok(result) => result?,
        // The demux task panicked or was cancelled; surface it as a transport I/O error.
        Err(join_err) => {
            return Err(CellError::Transport(TransportError::Io(
                std::io::Error::other(format!("demux task join error: {join_err}")),
            )));
        }
    }

    Ok(())
}

/// Run a single-service cell whose service is built from the `RpcSession`.
///
/// Use this when the service needs a handle to the session, for example to make
/// outgoing RPC calls of its own. `factory` is called once with the session;
/// [`run_with_session_and_config`] invokes it before the dispatcher is installed
/// and before the demux loop is spawned as a background task. The session
/// handle is therefore meant to be stored in the service and used once the cell
/// is running.
///
/// Uses [`DEFAULT_SHM_CONFIG`].
///
/// # Errors
///
/// Returns whatever [`run_with_session_and_config`] returns.
pub async fn run_with_session<F, S>(factory: F) -> Result<(), CellError>
where
    F: FnOnce(Arc<RpcSession>) -> S,
    S: ServiceDispatch,
{
    let default_config = DEFAULT_SHM_CONFIG;
    run_with_session_and_config(factory, default_config).await
}

/// Run a single-service cell with session access and custom SHM configuration.
///
/// This automatically sets up rapace-tracing to forward logs to the host.
/// When running in hub mode (with `--peer-id`), this automatically sends a
/// `CellLifecycle.ready()` signal to the host after the session is established.
///
/// `factory` is invoked once with a clone of the session handle, before the
/// dispatcher is installed and before the demux loop is spawned.
///
/// # Errors
///
/// Returns any error from cell setup, the error the session loop ends with,
/// or a `CellError::Transport` wrapping an I/O error if the background demux
/// task cannot be joined. A failed ready handshake is only reported on stderr
/// (unless quiet mode is enabled); it does not abort the cell.
pub async fn run_with_session_and_config<F, S>(
    factory: F,
    config: ShmSessionConfig,
) -> Result<(), CellError>
where
    F: FnOnce(Arc<RpcSession>) -> S,
    S: ServiceDispatch,
{
    let setup = setup_cell(config).await?;

    let session = setup.session;
    let peer_id = setup.peer_id;
    let cell_name = cell_name_guess();

    // Create service from factory (needs session)
    let service = factory(session.clone());

    // Expose TracingConfig early (so the host can push filters), but don't install
    // the forwarding tracing layer until after the ready handshake.
    let (tracing_filter, tracing_service) = tracing_setup::create_tracing_config_service();

    // IMPORTANT: Set up dispatcher before starting demux.
    session.set_dispatcher(
        DispatcherBuilder::new()
            .add_service(tracing_service)
            .add_service(service)
            .build(),
    );

    // Start demux loop in background, so outgoing RPC calls won't deadlock on
    // current_thread runtimes.
    let run_task = {
        let session = session.clone();
        tokio::spawn(async move { session.run().await })
    };

    // Yield a few times to ensure the demux task gets scheduled.
    for _ in 0..10 {
        tokio::task::yield_now().await;
    }

    // Send ready signal FIRST, before tracing is initialized.
    // `peer_id` is only set in hub mode, so pair mode skips the handshake.
    if let Some(peer_id) = peer_id {
        let client = CellLifecycleClient::new(session.clone());
        let msg = ReadyMsg {
            peer_id,
            cell_name: cell_name.clone(),
            pid: Some(std::process::id()),
            version: None,
            features: vec![],
        };
        if !quiet_mode_enabled() {
            eprintln!(
                "[rapace-cell] {} (peer_id={}) sending ready signal...",
                cell_name, peer_id
            );
        }
        // Retry the handshake; hub slot allocation can be temporarily contended during parallel startup.
        match ready_handshake_with_backoff(&client, msg).await {
            Ok(ack) => {
                if !quiet_mode_enabled() {
                    eprintln!(
                        "[rapace-cell] {} (peer_id={}) ready acknowledged: ok={}",
                        cell_name, peer_id, ack.ok
                    );
                }
            }
            // Best effort: a failed handshake is reported but does not abort the cell.
            Err(e) => {
                if !quiet_mode_enabled() {
                    eprintln!(
                        "[rapace-cell] {} (peer_id={}) ready FAILED: {:?}",
                        cell_name, peer_id, e
                    );
                }
            }
        }
    }

    // NOW initialize tracing (after the ready handshake has been attempted,
    // whether or not it succeeded).
    tracing_setup::install_tracing_layer(session.clone(), tracing_filter);
    tracing::debug!(target: "cell", cell = %cell_name, "Connected to host via SHM: {}", setup.path.display());

    // Wait for the session loop to finish.
    match run_task.await {
        Ok(result) => result?,
        // The demux task panicked or was cancelled; surface it as a transport I/O error.
        Err(join_err) => {
            return Err(CellError::Transport(TransportError::Io(
                std::io::Error::other(format!("demux task join error: {join_err}")),
            )));
        }
    }

    Ok(())
}

/// Overall time budget for the ready handshake.
///
/// Read from `RAPACE_CELL_READY_TIMEOUT_MS`. If that is unset or not a valid
/// `u64`, dodeca's historical `DODECA_CELL_READY_TIMEOUT_MS` is consulted
/// instead. If neither yields a number, the budget is 10 000 ms.
fn ready_total_timeout() -> std::time::Duration {
    const DEFAULT_TIMEOUT_MS: u64 = 10_000;

    // Milliseconds from one environment variable, if present and well-formed.
    let millis_from = |key: &str| -> Option<u64> { std::env::var(key).ok()?.parse::<u64>().ok() };

    let millis = match millis_from("RAPACE_CELL_READY_TIMEOUT_MS") {
        Some(ms) => ms,
        // The legacy knob is only read when the generic one gave nothing usable.
        None => millis_from("DODECA_CELL_READY_TIMEOUT_MS").unwrap_or(DEFAULT_TIMEOUT_MS),
    };

    std::time::Duration::from_millis(millis)
}

/// Send the ready message to the host, retrying with exponential backoff.
///
/// Each attempt sends a clone of `msg`. After a failed attempt the function
/// gives up (returning that attempt's error) once the total time since the
/// first attempt has reached `ready_total_timeout()`. Otherwise it sleeps and
/// tries again. The delay starts at 10 ms and doubles up to a cap of 200 ms.
///
/// The budget is only checked after a failure; an individual attempt is not
/// itself time-limited here.
async fn ready_handshake_with_backoff(
    client: &CellLifecycleClient,
    msg: ReadyMsg,
) -> Result<ReadyAck, RpcError> {
    const INITIAL_DELAY_MS: u64 = 10;
    const MAX_DELAY_MS: u64 = 200;

    let budget = ready_total_timeout();
    let started = std::time::Instant::now();
    let mut backoff_ms = INITIAL_DELAY_MS;

    loop {
        match client.ready(msg.clone()).await {
            Ok(ack) => break Ok(ack),
            // Out of time: report the most recent failure.
            Err(failure) if started.elapsed() >= budget => break Err(failure),
            Err(failure) => {
                tracing::debug!(
                    cell = %msg.cell_name,
                    peer_id = msg.peer_id,
                    error = ?failure,
                    delay_ms = backoff_ms,
                    "Ready handshake failed; retrying"
                );
            }
        }

        tokio::time::sleep(std::time::Duration::from_millis(backoff_ms)).await;
        backoff_ms = (backoff_ms * 2).min(MAX_DELAY_MS);
    }
}

/// Run a cell that exposes several services, using [`DEFAULT_SHM_CONFIG`].
///
/// `builder_fn` receives an empty [`DispatcherBuilder`] and returns it with
/// every service the cell should expose added to it. All remaining boilerplate
/// is handled by [`run_multi_with_config`], to which this function delegates.
///
/// # Errors
///
/// Returns whatever [`run_multi_with_config`] returns.
///
/// # Example
///
/// ```rust,ignore
/// use rapace_cell::{run_multi, DispatcherBuilder, ServiceDispatch};
/// use rapace::{Frame, RpcError};
/// use std::future::Future;
/// use std::pin::Pin;
///
/// # struct MyServiceServer;
/// # struct AnotherServiceServer;
/// # impl MyServiceServer {
/// #     fn new(impl_: ()) -> Self { Self }
/// #     async fn dispatch_impl(&self, method_id: u32, payload: &[u8]) -> Result<Frame, RpcError> {
/// #         unimplemented!()
/// #     }
/// # }
/// # impl AnotherServiceServer {
/// #     fn new(impl_: ()) -> Self { Self }
/// #     async fn dispatch_impl(&self, method_id: u32, payload: &[u8]) -> Result<Frame, RpcError> {
/// #         unimplemented!()
/// #     }
/// # }
/// # impl ServiceDispatch for MyServiceServer {
/// #     fn dispatch(&self, method_id: u32, payload: &[u8]) -> Pin<Box<dyn Future<Output = Result<Frame, RpcError>> + Send + 'static>> {
/// #         Box::pin(Self::dispatch_impl(self, method_id, payload))
/// #     }
/// # }
/// # impl ServiceDispatch for AnotherServiceServer {
/// #     fn dispatch(&self, method_id: u32, payload: &[u8]) -> Pin<Box<dyn Future<Output = Result<Frame, RpcError>> + Send + 'static>> {
/// #         Box::pin(Self::dispatch_impl(self, method_id, payload))
/// #     }
/// # }
///
/// #[tokio::main]
/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
///     run_multi(|builder| {
///         builder
///             .add_service(MyServiceServer::new(()))
///             .add_service(AnotherServiceServer::new(()))
///     }).await?;
///     Ok(())
/// }
/// ```
pub async fn run_multi<F>(builder_fn: F) -> Result<(), CellError>
where
    F: FnOnce(DispatcherBuilder) -> DispatcherBuilder,
{
    let default_config = DEFAULT_SHM_CONFIG;
    run_multi_with_config(builder_fn, default_config).await
}

/// Run a multi-service cell with custom SHM configuration
///
/// This automatically sets up rapace-tracing to forward logs to the host.
/// When the cell was started in hub mode (setup yields a peer id), a
/// `CellLifecycle.ready()` signal is sent to the host once the demux loop is
/// running, before the tracing layer is installed.
///
/// `builder_fn` receives an empty `DispatcherBuilder`. The tracing-config
/// service is appended *after* the services it adds, so user services are
/// tried first.
///
/// # Errors
///
/// Returns any error from cell setup, the error the session loop ends with,
/// or a `CellError::Transport` wrapping an I/O error if the background demux
/// task cannot be joined. A failed ready handshake is only reported on stderr
/// (unless quiet mode is enabled); it does not abort the cell.
pub async fn run_multi_with_config<F>(
    builder_fn: F,
    config: ShmSessionConfig,
) -> Result<(), CellError>
where
    F: FnOnce(DispatcherBuilder) -> DispatcherBuilder,
{
    let setup = setup_cell(config).await?;

    let session = setup.session;
    let peer_id = setup.peer_id;
    let cell_name = cell_name_guess();

    // Expose TracingConfig early (so the host can push filters), but don't install
    // the forwarding tracing layer until after the ready handshake.
    let (tracing_filter, tracing_service) = tracing_setup::create_tracing_config_service();

    // Build the dispatcher with user services + tracing config
    let builder = DispatcherBuilder::new();
    let builder = builder_fn(builder);
    let builder = builder.add_service(tracing_service);
    let dispatcher = builder.build();

    // The dispatcher must be in place before the demux loop starts.
    session.set_dispatcher(dispatcher);

    // Start demux loop in background so we can send ready signal
    let run_task = {
        let session = session.clone();
        tokio::spawn(async move { session.run().await })
    };

    // Yield a few times to ensure the demux task gets scheduled.
    for _ in 0..10 {
        tokio::task::yield_now().await;
    }

    // Send ready signal (hub mode only)
    if let Some(peer_id) = peer_id {
        let client = CellLifecycleClient::new(session.clone());
        let msg = ReadyMsg {
            peer_id,
            cell_name: cell_name.clone(),
            pid: Some(std::process::id()),
            version: None,
            features: vec![],
        };
        if !quiet_mode_enabled() {
            eprintln!(
                "[rapace-cell] {} (peer_id={}) sending ready signal...",
                cell_name, peer_id
            );
        }
        // The handshake is retried with backoff until its time budget runs out.
        match ready_handshake_with_backoff(&client, msg).await {
            Ok(ack) => {
                if !quiet_mode_enabled() {
                    eprintln!(
                        "[rapace-cell] {} (peer_id={}) ready acknowledged: ok={}",
                        cell_name, peer_id, ack.ok
                    );
                }
            }
            // Best effort: a failed handshake is reported but does not abort the cell.
            Err(e) => {
                if !quiet_mode_enabled() {
                    eprintln!(
                        "[rapace-cell] {} (peer_id={}) ready FAILED: {:?}",
                        cell_name, peer_id, e
                    );
                }
            }
        }
    }

    // Install tracing forwarding now that the ready handshake has been attempted.
    tracing_setup::install_tracing_layer(session.clone(), tracing_filter);
    tracing::debug!(target: "cell", cell = %cell_name, "Connected to host via SHM: {}", setup.path.display());

    // Wait for session to complete
    match run_task.await {
        Ok(result) => result?,
        // The demux task panicked or was cancelled; surface it as a transport I/O error.
        Err(join_err) => {
            return Err(CellError::Transport(TransportError::Io(
                std::io::Error::other(format!("demux task join error: {join_err}")),
            )));
        }
    }

    Ok(())
}

/// Convenience extension for `RpcSession` aimed at cells that serve one service.
pub trait RpcSessionExt {
    /// Install `service` as the dispatcher for this session.
    ///
    /// Meant for cells that expose exactly one service. A cell exposing several
    /// services should call `set_dispatcher` with a `DispatcherBuilder` instead.
    fn set_service<S: ServiceDispatch>(&self, service: S);
}

impl RpcSessionExt for RpcSession {
    /// Wrap `service` in an `Arc` and register a dispatcher closure that routes
    /// every incoming request frame to it.
    ///
    /// For each request the closure calls `service.dispatch` with the request's
    /// method id and payload bytes, then copies the request's `channel_id` and
    /// `msg_id` onto the response descriptor before returning it. An error
    /// returned by `dispatch` is passed through unchanged.
    fn set_service<S>(&self, service: S)
    where
        S: ServiceDispatch,
    {
        let shared = Arc::new(service);
        self.set_dispatcher(move |request: Frame| {
            // Each in-flight request gets its own handle to the shared service.
            let handler = Arc::clone(&shared);
            Box::pin(async move {
                let pending = handler.dispatch(request.desc.method_id, request.payload_bytes());
                let mut reply = pending.await?;
                // Stamp the reply with the request's channel and message ids.
                reply.desc.channel_id = request.desc.channel_id;
                reply.desc.msg_id = request.desc.msg_id;
                Ok(reply)
            })
        });
    }
}

/// Macro to run a cell with minimal boilerplate.
///
/// Expands to an `async fn main` annotated with
/// `#[tokio::main(flavor = "current_thread")]` that awaits
/// `rapace_cell::run($service)` and propagates any error out of `main`.
///
/// The expansion names `tokio` directly, so that path is resolved in the crate
/// that invokes the macro. Standard-library items are written with absolute
/// `::std` paths so the generated `main` does not depend on what `Result`,
/// `Box`, `Ok` or `std` mean at the call site (for example a local
/// `type Result<T> = ...` alias).
#[macro_export]
macro_rules! run_cell {
    ($service:expr) => {
        #[tokio::main(flavor = "current_thread")]
        async fn main() -> ::std::result::Result<(), ::std::boxed::Box<dyn ::std::error::Error>> {
            $crate::run($service).await?;
            ::std::result::Result::Ok(())
        }
    };
}

/// Macro to run a cell whose setup needs access to the RPC session.
///
/// Expands to an `async fn main` annotated with
/// `#[tokio::main(flavor = "current_thread")]` that awaits
/// `rapace_cell::run_with_session($factory)` and propagates any error out of
/// `main`.
///
/// The expansion names `tokio` directly, so that path is resolved in the crate
/// that invokes the macro. Standard-library items are written with absolute
/// `::std` paths so the generated `main` does not depend on what `Result`,
/// `Box`, `Ok` or `std` mean at the call site (for example a local
/// `type Result<T> = ...` alias).
#[macro_export]
macro_rules! run_cell_with_session {
    ($factory:expr) => {
        #[tokio::main(flavor = "current_thread")]
        async fn main() -> ::std::result::Result<(), ::std::boxed::Box<dyn ::std::error::Error>> {
            $crate::run_with_session($factory).await?;
            ::std::result::Result::Ok(())
        }
    };
}

/// Macro that adapts a generated server type into a `ServiceDispatch` cell service.
///
/// Given `cell_service!(FooServer<T>, T)`, the expansion declares a private
/// `CellService` newtype holding the server in an `Arc`, plus:
///
/// - `From<T> for CellService`, which builds the server with `FooServer::new(T)`;
/// - `ServiceDispatch for CellService`, which forwards each call to
///   `FooServer::dispatch(method_id, bytes)`.
///
/// The payload is copied into an owned buffer so the returned future borrows
/// nothing from the caller.
#[macro_export]
macro_rules! cell_service {
    ($server_type:ty, $impl_type:ty) => {
        struct CellService(std::sync::Arc<$server_type>);

        impl From<$impl_type> for CellService {
            fn from(implementation: $impl_type) -> Self {
                let server = <$server_type>::new(implementation);
                CellService(std::sync::Arc::new(server))
            }
        }

        impl $crate::ServiceDispatch for CellService {
            fn dispatch(
                &self,
                method_id: u32,
                payload: &[u8],
            ) -> std::pin::Pin<
                Box<
                    dyn std::future::Future<
                            Output = std::result::Result<$crate::Frame, $crate::RpcError>,
                        > + Send
                        + 'static,
                >,
            > {
                // Own the payload and a server handle; the future must be `'static`.
                let owned_payload = payload.to_owned();
                let shared_server = std::sync::Arc::clone(&self.0);
                Box::pin(async move { shared_server.dispatch(method_id, &owned_payload).await })
            }
        }
    };
}