strike48-connector 0.3.6

//! Multi-registration connector runner.
//!
//! Lets one host process register `N` independently-approvable connectors
//! against a single Matrix server while sharing the underlying transport:
//!
//! - **gRPC**: one TCP+TLS connection, N HTTP/2 streams (one per registration).
//!   Lazily opens additional channels when `max_streams_per_channel` is hit.
//! - **WebSocket**: HTTP/1.1 — no native multiplexing — falls back to N
//!   independent WS connections in one process. Same public API.
//!
//! From the Matrix server's point of view, each registration is a normal
//! `Connect` RPC. There are zero server-side changes.
//!
//! ## Reconnect behaviour
//!
//! The runner has two layers of reconnect, owned by different actors:
//!
//! - **Stream-level (per registration)**: implemented in the internal
//!   `registration_runner` module. When a stream ends (server closes, network
//!   blip, heartbeat timeout) the runner sleeps with exponential backoff +
//!   jitter (capped at `MultiTransportOptions::max_backoff_delay_ms`) and
//!   opens a fresh stream over the existing channel. Fully shutdown-aware
//!   — never blocks shutdown for more than one backoff slice. Each
//!   reconnect bumps the registration's `successful_reconnects` /
//!   `total_disconnects` metrics.
//!
//! - **Channel-level (per HTTP/2 connection, gRPC only)**: delegated to
//!   `tonic::transport::Channel`. The channel is configured with HTTP/2
//!   keepalive and dialled **eagerly** via `endpoint.connect().await` the
//!   first time a registration needs it. tonic auto-recovers on transient
//!   TCP / TLS failures by re-dialing internally on the next request; the
//!   SDK does **not** explicitly recreate channels today.
//!
//!   In practice this means: if the underlying TCP connection breaks, all
//!   N registrations using that channel will see their streams close. The
//!   per-registration loop opens a fresh stream, which in turn forces the
//!   channel to redial. End state: connections recover transparently
//!   without involvement from this module.
//!
//!   If a channel ever goes **permanently dead** (e.g. DNS now points to
//!   an unreachable host and the lazy redial keeps failing), every
//!   registration on that channel will spend its time backing off. The
//!   [`MultiConnectorRunner::shutdown_handle`] still works, but recovery
//!   for the current process is best-effort. A future improvement would
//!   be to track per-channel consecutive-failure counts and rebuild the
//!   channel after a threshold; tracked under
//!   `connector-sdk-rust-channel-reconnect-policy`.
//!
//! ## Backward compatibility
//!
//! This module is **purely additive**. The existing single-registration
//! [`crate::ConnectorRunner`] API and behaviour are unchanged.
//!
//! ## Example
//!
//! ```no_run
//! # async fn run() -> strike48_connector::Result<()> {
//! use std::sync::Arc;
//! use strike48_connector::{
//!     BaseConnector, ConnectorConfig, ConnectorRegistration, MultiConnectorRunner,
//!     MultiTransportOptions, Result, TransportType,
//! };
//!
//! struct Echo;
//! impl BaseConnector for Echo {
//!     fn connector_type(&self) -> &str { "echo" }
//!     fn version(&self) -> &str { "1.0.0" }
//!     fn execute(
//!         &self,
//!         req: serde_json::Value,
//!         _: Option<&str>,
//!     ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<serde_json::Value>> + Send + '_>> {
//!         Box::pin(async move { Ok(req) })
//!     }
//! }
//!
//! let opts = MultiTransportOptions::builder()
//!     .host("localhost:50061")
//!     .transport_type(TransportType::Grpc)
//!     .build();
//!
//! let registrations = (0..3).map(|i| {
//!     ConnectorRegistration::new(
//!         ConnectorConfig {
//!             tenant_id: "demo-org".into(),
//!             connector_type: "echo".into(),
//!             instance_id: format!("echo-{i}"),
//!             ..ConnectorConfig::default()
//!         },
//!         Echo,
//!     )
//! }).collect::<Vec<_>>();
//!
//! let runner = MultiConnectorRunner::new(opts, registrations);
//! let _shutdown = runner.shutdown_handle();
//! runner.run().await?;
//! # Ok(()) }
//! ```

use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

use tokio::sync::{Mutex, RwLock, Semaphore, watch};

use crate::connector::{BaseConnector, ConnectorConfig, ShutdownHandle};
use crate::error::{ConnectorError, Result};
use crate::logger::Logger;
use crate::transport::TransportType;
use crate::types::ConnectorMetrics;

mod registration_runner;
mod shared_channel;

use registration_runner::RegistrationRunner;
use shared_channel::SharedChannel;

// =============================================================================
// Public types
// =============================================================================

/// Transport-level configuration shared by every registration in a
/// [`MultiConnectorRunner`]. Per-registration identity (tenant, type, instance,
/// auth token, ...) lives on each [`ConnectorRegistration`]'s [`ConnectorConfig`].
///
/// Marked `#[non_exhaustive]` so future fields can be added without a
/// breaking change. As a consequence, code outside this crate cannot build
/// the struct with a struct expression — not even with functional-update
/// syntax (`MultiTransportOptions { ..MultiTransportOptions::default() }`).
/// Use [`MultiTransportOptions::builder`] (preferred), or start from
/// [`MultiTransportOptions::default`] and assign the public fields you want
/// to change.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct MultiTransportOptions {
    /// Server host:port (e.g. `localhost:50061` for gRPC, `localhost:4000` for WS).
    pub host: String,
    /// Whether to use TLS for the transport.
    pub use_tls: bool,
    /// Transport scheme.
    pub transport_type: TransportType,

    /// Soft cap on concurrent gRPC streams per channel before the runner
    /// opens an additional channel. Defaults to `80` to leave headroom under
    /// the typical Cowboy/RFC 7540 default of 100. Ignored for WebSocket.
    pub max_streams_per_channel: usize,

    /// Initial connect timeout (ms). Default 10_000.
    pub connect_timeout_ms: u64,

    /// Enable automatic reconnection when a registration's stream is lost.
    /// This is the stream-level (per-registration) reconnect; channel-level
    /// recovery is described in the module docs. Default `true`.
    pub reconnect_enabled: bool,
    /// Base reconnect backoff (ms) — the starting point for exponential
    /// backoff. Default 500.
    pub reconnect_delay_ms: u64,
    /// Max reconnect backoff (ms). Default 60_000.
    pub max_backoff_delay_ms: u64,
    /// Reconnect jitter (ms). Default 500.
    pub reconnect_jitter_ms: u64,

    /// Per-registration maximum number of in-flight `ExecuteRequest`s being
    /// processed by the user `BaseConnector::execute()` callback. When the
    /// limit is reached, additional `ExecuteRequest`s queue on a semaphore
    /// until a permit is released. Mirrors
    /// [`crate::ConnectorConfig::max_concurrent_requests`] for the
    /// single-runner path. Default `100`.
    pub max_concurrent_requests: usize,
}

impl MultiTransportOptions {
    /// Start a builder with sensible defaults (gRPC, plaintext, localhost:50061).
    pub fn builder() -> MultiTransportOptionsBuilder {
        MultiTransportOptionsBuilder::default()
    }
}

impl Default for MultiTransportOptions {
    /// Plaintext gRPC to `localhost:50061` with reconnect enabled.
    fn default() -> Self {
        Self {
            // Endpoint.
            transport_type: TransportType::Grpc,
            host: String::from("localhost:50061"),
            use_tls: false,
            connect_timeout_ms: 10_000,

            // Multiplexing / concurrency caps.
            max_streams_per_channel: 80,
            max_concurrent_requests: 100,

            // Reconnect policy.
            reconnect_enabled: true,
            reconnect_delay_ms: 500,
            reconnect_jitter_ms: 500,
            max_backoff_delay_ms: 60_000,
        }
    }
}

/// Fluent builder for [`MultiTransportOptions`].
///
/// Every setter consumes the builder and returns it, so the chain must end in
/// [`MultiTransportOptionsBuilder::build`]; a builder value that is dropped
/// without being used configures nothing, hence `#[must_use]`.
#[derive(Debug, Clone, Default)]
#[must_use = "builders do nothing unless `.build()` is called"]
pub struct MultiTransportOptionsBuilder {
    /// Options being assembled. `None` until the first setter runs, at which
    /// point it is seeded from `MultiTransportOptions::default()`.
    inner: Option<MultiTransportOptions>,
}

impl MultiTransportOptionsBuilder {
    /// Returns the options under construction, seeding them from
    /// `MultiTransportOptions::default()` on first access.
    fn opts(&mut self) -> &mut MultiTransportOptions {
        self.inner.get_or_insert_with(Default::default)
    }

    /// Applies `edit` to the options under construction and hands the
    /// builder back so calls can be chained.
    fn with(mut self, edit: impl FnOnce(&mut MultiTransportOptions)) -> Self {
        edit(self.opts());
        self
    }

    /// Set the server host:port.
    pub fn host(self, host: impl Into<String>) -> Self {
        self.with(|o| o.host = host.into())
    }

    /// Set whether to use TLS.
    pub fn use_tls(self, use_tls: bool) -> Self {
        self.with(|o| o.use_tls = use_tls)
    }

    /// Set the transport scheme.
    pub fn transport_type(self, t: TransportType) -> Self {
        self.with(|o| o.transport_type = t)
    }

    /// Override the soft cap on concurrent gRPC streams per channel (gRPC only).
    pub fn max_streams_per_channel(self, n: usize) -> Self {
        self.with(|o| o.max_streams_per_channel = n)
    }

    /// Override the initial connect timeout (ms).
    pub fn connect_timeout_ms(self, ms: u64) -> Self {
        self.with(|o| o.connect_timeout_ms = ms)
    }

    /// Override the per-registration `max_concurrent_requests` cap.
    ///
    /// A value of `0` is raised to `1`.
    pub fn max_concurrent_requests(self, n: usize) -> Self {
        let at_least_one = n.max(1);
        self.with(|o| o.max_concurrent_requests = at_least_one)
    }

    /// Enable or disable automatic reconnection on stream loss.
    pub fn reconnect_enabled(self, enabled: bool) -> Self {
        self.with(|o| o.reconnect_enabled = enabled)
    }

    /// Initial reconnect delay (ms) — the base for exponential backoff.
    pub fn reconnect_delay_ms(self, ms: u64) -> Self {
        self.with(|o| o.reconnect_delay_ms = ms)
    }

    /// Hard cap on reconnect backoff (ms). Jitter is applied first, then
    /// the result is clamped to this value, so the cap is a strict upper
    /// bound on the wait between attempts.
    pub fn max_backoff_delay_ms(self, ms: u64) -> Self {
        self.with(|o| o.max_backoff_delay_ms = ms)
    }

    /// Per-attempt jitter range (ms). A uniformly-random value in
    /// `0..=ms` is added to the scaled backoff before capping.
    pub fn reconnect_jitter_ms(self, ms: u64) -> Self {
        self.with(|o| o.reconnect_jitter_ms = ms)
    }

    /// Finish the chain. Returns the configured options, or the plain
    /// defaults when no setter was ever called.
    pub fn build(self) -> MultiTransportOptions {
        match self.inner {
            Some(configured) => configured,
            None => MultiTransportOptions::default(),
        }
    }
}

/// One logical connector to run inside a [`MultiConnectorRunner`].
///
/// `config.host`, `config.use_tls`, and `config.transport_type` are ignored —
/// the transport is governed by [`MultiTransportOptions`]. In WebSocket mode
/// the runner additionally overwrites `config.reconnect_enabled`,
/// `config.reconnect_delay_ms`, `config.max_backoff_delay_ms` and
/// `config.reconnect_jitter_ms` with the values from its options. All other
/// fields (`tenant_id`, `connector_type`, `instance_id`, `auth_token`,
/// `display_name`, `tags`, `metadata`, `max_concurrent_requests`,
/// `metrics_*`) apply to this registration only.
///
/// NOTE(review): in gRPC mode the per-registration request semaphore is
/// sized from `MultiTransportOptions::max_concurrent_requests`; whether
/// `config.max_concurrent_requests` is also honoured on that path is not
/// visible from this module — confirm.
///
/// Marked `#[non_exhaustive]` so future fields (e.g. per-registration
/// behaviour overrides) can be added without a breaking change. Code outside
/// this crate therefore cannot use a struct literal and must construct
/// values through [`ConnectorRegistration::new`] (preferred) or
/// [`ConnectorRegistration::from_arc`].
#[non_exhaustive]
pub struct ConnectorRegistration {
    /// Identity and per-registration settings for this connector.
    pub config: ConnectorConfig,
    /// The user implementation that handles this registration's requests.
    pub connector: Arc<dyn BaseConnector>,
}

impl ConnectorRegistration {
    /// Pairs a `ConnectorConfig` with a concrete [`BaseConnector`]
    /// implementation.
    ///
    /// The connector is wrapped in an `Arc` and type-erased here, so call
    /// sites never need to spell out `Arc<dyn BaseConnector>`.
    ///
    /// ```
    /// use strike48_connector::{
    ///     BaseConnector, ConnectorConfig, ConnectorRegistration, Result,
    /// };
    ///
    /// struct Echo;
    /// impl BaseConnector for Echo {
    ///     fn connector_type(&self) -> &str { "echo" }
    ///     fn version(&self) -> &str { "1.0.0" }
    ///     fn execute(
    ///         &self,
    ///         req: serde_json::Value,
    ///         _: Option<&str>,
    ///     ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<serde_json::Value>> + Send + '_>> {
    ///         Box::pin(async move { Ok(req) })
    ///     }
    /// }
    ///
    /// let cfg = ConnectorConfig {
    ///     tenant_id: "demo".into(),
    ///     connector_type: "echo".into(),
    ///     instance_id: "echo-1".into(),
    ///     ..ConnectorConfig::default()
    /// };
    /// let reg = ConnectorRegistration::new(cfg, Echo);
    /// assert_eq!(reg.config.connector_type, "echo");
    /// # let _ = reg;
    /// ```
    pub fn new<T>(config: ConnectorConfig, connector: T) -> Self
    where
        T: BaseConnector + 'static,
    {
        let erased: Arc<dyn BaseConnector> = Arc::new(connector);
        Self::from_arc(config, erased)
    }

    /// Pairs a `ConnectorConfig` with a connector that is already an
    /// `Arc<dyn BaseConnector>` — handy when the caller keeps a mixed list
    /// of connector types that were erased up front.
    pub fn from_arc(config: ConnectorConfig, connector: Arc<dyn BaseConnector>) -> Self {
        Self { connector, config }
    }
}

impl std::fmt::Debug for ConnectorRegistration {
    /// Hand-written because `dyn BaseConnector` is not `Debug`: the
    /// connector is rendered as a placeholder carrying its
    /// `connector_type()`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("ConnectorRegistration");
        out.field("config", &self.config);
        let kind = self.connector.connector_type();
        out.field(
            "connector",
            &format_args!("Arc<dyn BaseConnector>(\"{kind}\")"),
        );
        out.finish()
    }
}

/// Stable identity for a registration. Matches the `tenant.type.instance`
/// triple the Matrix server uses to key a `ConnectorSession`.
///
/// Two registrations with equal keys are treated as duplicates by
/// [`MultiConnectorRunner`]. The `Display` form is the three fields joined
/// with `.`.
///
/// Marked `#[non_exhaustive]` so additional identity dimensions (e.g. an
/// optional region tag) can be added without breaking downstream
/// pattern-match exhaustiveness.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
#[non_exhaustive]
pub struct RegistrationKey {
    /// Copied from `ConnectorConfig::tenant_id`.
    pub tenant_id: String,
    /// Copied from `ConnectorConfig::connector_type`.
    pub connector_type: String,
    /// Copied from `ConnectorConfig::instance_id`.
    pub instance_id: String,
}

impl RegistrationKey {
    pub fn from_config(config: &ConnectorConfig) -> Self {
        Self {
            tenant_id: config.tenant_id.clone(),
            connector_type: config.connector_type.clone(),
            instance_id: config.instance_id.clone(),
        }
    }
}

impl std::fmt::Display for RegistrationKey {
    /// Renders the key as `tenant_id.connector_type.instance_id`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            tenant_id,
            connector_type,
            instance_id,
        } = self;
        write!(f, "{tenant_id}.{connector_type}.{instance_id}")
    }
}

// =============================================================================
// MultiConnectorRunner
// =============================================================================

/// Internal bookkeeping for one registration held by
/// [`MultiConnectorRunner`].
struct RegistrationEntry {
    /// Identity derived from `config`; used for duplicate detection and as
    /// the key of `metrics_snapshot()`.
    key: RegistrationKey,
    /// The caller-supplied configuration for this registration.
    config: ConnectorConfig,
    /// The caller-supplied connector implementation.
    connector: Arc<dyn BaseConnector>,
    /// Metrics for this registration only. The `Arc` is shared between the
    /// stored entry and the snapshot `run()` hands to the transport driver,
    /// so `metrics_snapshot()` reads the same instance the driver received.
    metrics: Arc<Mutex<ConnectorMetrics>>,
}

/// Runs `N` independently-approvable connectors over a shared transport.
///
/// See module-level docs for transport semantics. From the Matrix server's
/// point of view, each registration is a normal `Connect` RPC.
pub struct MultiConnectorRunner {
    /// Transport options shared by every registration.
    opts: MultiTransportOptions,
    /// Registrations in insertion order. The lock also serialises `add()`
    /// against the snapshot taken at the start of `run()`.
    registrations: RwLock<Vec<RegistrationEntry>>,
    /// Shutdown flag shared with every `ShutdownHandle` handed out by
    /// `shutdown_handle()` and with the per-registration tasks.
    shutdown_requested: Arc<AtomicBool>,
    /// `true` while a `run()` call is in progress; makes a second `run()`
    /// and any later `add()` fail with `ConnectorError::AlreadyRunning`.
    running: Arc<AtomicBool>,
}

impl MultiConnectorRunner {
    /// Builds a runner from transport options and an initial list of
    /// registrations.
    ///
    /// Nothing is connected here; the transport is only established once
    /// [`MultiConnectorRunner::run`] is called.
    ///
    /// Registrations that share a `tenant.type.instance` key are
    /// de-duplicated: the first occurrence wins and each later one is dropped
    /// with a warning. (The same collision is a hard error in
    /// [`MultiConnectorRunner::add`].)
    pub fn new(opts: MultiTransportOptions, registrations: Vec<ConnectorRegistration>) -> Self {
        let mut unique: Vec<RegistrationEntry> = Vec::with_capacity(registrations.len());

        for registration in registrations {
            let key = RegistrationKey::from_config(&registration.config);
            let already_present = unique.iter().any(|existing| existing.key == key);

            if already_present {
                tracing::warn!(
                    target: "strike48_connector::multi",
                    registration = %key,
                    "duplicate registration ignored"
                );
            } else {
                unique.push(RegistrationEntry {
                    key,
                    config: registration.config,
                    connector: registration.connector,
                    // Every registration starts with its own zeroed metrics.
                    metrics: Arc::new(Mutex::new(ConnectorMetrics::default())),
                });
            }
        }

        let shutdown_requested = Arc::new(AtomicBool::new(false));
        let running = Arc::new(AtomicBool::new(false));
        Self {
            opts,
            registrations: RwLock::new(unique),
            shutdown_requested,
            running,
        }
    }

    /// Append a registration. Only valid before [`MultiConnectorRunner::run`]
    /// is called; returns an error if `run()` has already started or if the
    /// registration's `tenant.type.instance` collides with an existing one.
    ///
    /// `running` is checked **inside** the same write-lock critical section
    /// that `run()` uses to snapshot the registration list, so a concurrent
    /// `add` either lands before the snapshot (and is driven) or returns
    /// [`ConnectorError::AlreadyRunning`]. There is no window where the
    /// registration is silently dropped.
    pub async fn add(&self, registration: ConnectorRegistration) -> Result<()> {
        let key = RegistrationKey::from_config(&registration.config);
        let mut regs = self.registrations.write().await;
        if self.running.load(Ordering::SeqCst) {
            return Err(ConnectorError::AlreadyRunning);
        }
        if regs.iter().any(|e| e.key == key) {
            return Err(ConnectorError::InvalidConfig(format!(
                "duplicate registration: {key}"
            )));
        }
        regs.push(RegistrationEntry {
            key,
            config: registration.config,
            connector: registration.connector,
            metrics: Arc::new(Mutex::new(ConnectorMetrics::default())),
        });
        Ok(())
    }

    /// Get a [`ShutdownHandle`] that signals every registration to exit.
    pub fn shutdown_handle(&self) -> ShutdownHandle {
        ShutdownHandle::from_flag(self.shutdown_requested.clone())
    }

    /// Returns a copy of the currently registered keys, in insertion order.
    pub async fn registrations(&self) -> Vec<RegistrationKey> {
        let entries = self.registrations.read().await;
        let mut keys = Vec::with_capacity(entries.len());
        keys.extend(entries.iter().map(|entry| entry.key.clone()));
        keys
    }

    /// Per-registration metrics snapshot.
    ///
    /// Each registration owns its own [`ConnectorMetrics`] (no global
    /// singleton), so values are independent across registrations sharing the
    /// same transport.
    pub async fn metrics_snapshot(&self) -> HashMap<RegistrationKey, ConnectorMetrics> {
        let regs = self.registrations.read().await;
        let mut out = HashMap::with_capacity(regs.len());
        for entry in regs.iter() {
            let snapshot = entry.metrics.lock().await.clone();
            out.insert(entry.key.clone(), snapshot);
        }
        out
    }

    /// Run all registrations to completion or until shutdown.
    ///
    /// Returns once every registration has exited. Individual registration
    /// failures are logged but do not abort the runner unless
    /// [`MultiTransportOptions::reconnect_enabled`] is `false`.
    pub async fn run(&self) -> Result<()> {
        let logger = Logger::new("multi");

        // Take the write lock so concurrent `add()` calls observe `running`
        // atomically with the snapshot — see `add` for the partner half of
        // this protocol. We hold the lock only long enough to flip `running`
        // and clone the entries.
        let entries: Vec<RegistrationEntry> = {
            let regs = self.registrations.write().await;
            if self
                .running
                .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst)
                .is_err()
            {
                return Err(ConnectorError::AlreadyRunning);
            }
            regs.iter()
                .map(|e| RegistrationEntry {
                    key: e.key.clone(),
                    config: e.config.clone(),
                    connector: e.connector.clone(),
                    metrics: e.metrics.clone(),
                })
                .collect()
        };

        // Pre-signalled shutdown is a clean no-op exit.
        if self.shutdown_requested.load(Ordering::SeqCst) {
            logger.debug("shutdown signalled before run; exiting");
            self.running.store(false, Ordering::SeqCst);
            return Ok(());
        }

        if entries.is_empty() {
            logger.warn("no registrations configured; run() exiting immediately");
            self.running.store(false, Ordering::SeqCst);
            return Ok(());
        }

        let result = match self.opts.transport_type {
            TransportType::Grpc => self.run_grpc(entries, logger).await,
            TransportType::WebSocket => self.run_websocket(entries, logger).await,
        };
        self.running.store(false, Ordering::SeqCst);
        result
    }

    /// gRPC driver: spawns one task per registration, all sharing a single
    /// [`SharedChannel`], then waits for every task to finish.
    ///
    /// Each task gets its own request semaphore sized from
    /// `MultiTransportOptions::max_concurrent_requests` (at least 1).
    ///
    /// A registration that ends with an error or a panic is logged together
    /// with its `tenant.type.instance` key, so a failure can be attributed
    /// when many registrations share the process. Such failures do not stop
    /// the remaining registrations, and this function always returns
    /// `Ok(())`.
    async fn run_grpc(&self, entries: Vec<RegistrationEntry>, logger: Logger) -> Result<()> {
        let shared = Arc::new(SharedChannel::new(self.opts.clone()));
        let mut tasks = Vec::with_capacity(entries.len());

        for entry in entries {
            // Keep a copy of the key next to the join handle so it is still
            // available for logging after the runner has been moved into
            // its task (including when that task panics).
            let key = entry.key.clone();
            let runner = RegistrationRunner {
                key: entry.key,
                config: Arc::new(RwLock::new(entry.config)),
                connector: entry.connector,
                shared_channel: shared.clone(),
                shutdown: self.shutdown_requested.clone(),
                metrics: entry.metrics,
                opts: self.opts.clone(),
                request_semaphore: Arc::new(Semaphore::new(
                    self.opts.max_concurrent_requests.max(1),
                )),
                session_token: Arc::new(RwLock::new(None)),
            };
            tasks.push((key, tokio::spawn(async move { runner.run().await })));
        }

        for (key, task) in tasks {
            match task.await {
                Ok(Ok(())) => {}
                Ok(Err(e)) => {
                    logger.warn(&format!(
                        "registration runner {key} exited with error: {e}"
                    ));
                }
                Err(join_err) => {
                    logger.error(
                        &format!("registration task {key} panicked"),
                        &join_err.to_string(),
                    );
                }
            }
        }

        Ok(())
    }

    /// WebSocket has no native multiplexing (HTTP/1.1), so each logical
    /// registration uses its own `WebSocketTransport` underneath. We fan out
    /// to N independent [`crate::ConnectorRunner`]s — same public ergonomics
    /// as gRPC mode, but the transport count == registration count. This keeps
    /// the API symmetric and lets the existing single-runner reconnect /
    /// auth / metrics paths apply unchanged.
    ///
    /// Each child config is a clone of the registration's config with
    /// `transport_type`, `host`, `use_tls`, `reconnect_enabled`,
    /// `reconnect_delay_ms`, `max_backoff_delay_ms` and `reconnect_jitter_ms`
    /// overwritten from `self.opts`. The other options
    /// (`connect_timeout_ms`, `max_streams_per_channel`,
    /// `max_concurrent_requests`) are not copied onto the child config here.
    ///
    /// Child errors and panics are logged; the function itself always
    /// returns `Ok(())` once every child task has finished.
    ///
    /// NOTE(review): `entry.metrics` is never handed to the child
    /// `ConnectorRunner`, so nothing in this function updates the values
    /// returned by `metrics_snapshot()` — confirm whether that is intended.
    async fn run_websocket(&self, entries: Vec<RegistrationEntry>, logger: Logger) -> Result<()> {
        use crate::ConnectorRunner;

        // One watch::channel for the whole runner: when the multi-runner's
        // shutdown flag is flipped we bump this channel; every child task
        // observes the change immediately via `changed().await` and signals
        // its own ConnectorRunner's shutdown handle. No per-registration
        // polling task — those leaked indefinitely on the previous
        // `tokio::spawn(loop { sleep(100ms); load(...) })` design.
        let (shutdown_tx, shutdown_rx_template) = watch::channel(false);

        // Bridge the AtomicBool flag (kept for backward compatibility with
        // ShutdownHandle) to the watch sender. Owned by `run_websocket` and
        // dropped when this function returns; that drop closes the watch
        // channel and lets every child observe completion if it hadn't
        // already.
        let multi_shutdown = self.shutdown_requested.clone();
        let bridge_tx = shutdown_tx.clone();
        let bridge = tokio::spawn(async move {
            // Coarse poll on the AtomicBool — only ONE such task per runner,
            // not one per registration. Exits cleanly when shutdown fires.
            // The flag is re-read every 100 ms, which bounds how late the
            // children learn about a shutdown request.
            while !multi_shutdown.load(Ordering::SeqCst) {
                tokio::time::sleep(std::time::Duration::from_millis(100)).await;
                if bridge_tx.is_closed() {
                    // All receivers gone: every child task has finished (and
                    // the template receiver was dropped) before shutdown was
                    // ever signalled, so there is nobody left to notify.
                    return;
                }
            }
            // Best effort: the send only fails when no receiver is left.
            let _ = bridge_tx.send(true);
        });

        let mut tasks = Vec::with_capacity(entries.len());

        for entry in entries {
            // The multi-runner's transport and reconnect options override
            // whatever the registration's own config carried.
            let mut config = entry.config.clone();
            config.transport_type = TransportType::WebSocket;
            config.host = self.opts.host.clone();
            config.use_tls = self.opts.use_tls;
            config.reconnect_enabled = self.opts.reconnect_enabled;
            config.reconnect_delay_ms = self.opts.reconnect_delay_ms;
            config.max_backoff_delay_ms = self.opts.max_backoff_delay_ms;
            config.reconnect_jitter_ms = self.opts.reconnect_jitter_ms;

            let runner = ConnectorRunner::new(config, entry.connector);
            let child_shutdown = runner.shutdown_handle();
            let mut shutdown_rx = shutdown_rx_template.clone();
            let key = entry.key.clone();

            tasks.push(tokio::spawn(async move {
                // Pinned so the same future can be polled across loop
                // iterations via `&mut runner_fut`.
                let mut runner_fut = Box::pin(runner.run());
                let res = loop {
                    tokio::select! {
                        // `biased` polls the branches in source order, so a
                        // pending shutdown notification is handled before
                        // the runner future is polled again.
                        biased;
                        // Propagate shutdown to the child runner the
                        // instant the multi-runner flag flips. After
                        // signalling we keep awaiting the runner's own
                        // exit so its drain logic runs.
                        changed = shutdown_rx.changed() => {
                            match changed {
                                Ok(()) if *shutdown_rx.borrow() => {
                                    child_shutdown.shutdown();
                                }
                                Err(_) => {
                                    // Sender dropped; treat as "no further
                                    // shutdown will arrive". Wait for the
                                    // runner to finish on its own.
                                    break runner_fut.await;
                                }
                                // Value changed but is still `false`:
                                // nothing to do, keep waiting.
                                _ => {}
                            }
                        }
                        result = &mut runner_fut => break result,
                    }
                };
                // Return the key with the result so the caller can say
                // which registration failed.
                (key, res)
            }));
        }

        // Drop the template receiver so the only live receivers are the
        // ones inside the spawned tasks. This lets the bridge task observe
        // `is_closed()` once every task has exited.
        drop(shutdown_rx_template);

        // Wait for every child in spawn order; failures are logged only.
        for task in tasks {
            match task.await {
                Ok((_key, Ok(()))) => {}
                Ok((key, Err(e))) => {
                    logger.warn(&format!("ws registration {key} exited with error: {e}"));
                }
                Err(join_err) => {
                    logger.error("ws registration task panicked", &join_err.to_string());
                }
            }
        }

        // Tear down the bridge task. If it exited on its own we just await
        // a finished JoinHandle (cheap). Otherwise abort + await yields a
        // JoinError we ignore — the task is short-lived and trivial.
        bridge.abort();
        let _ = bridge.await;

        // Keep the sender alive for the whole function; dropping here.
        drop(shutdown_tx);

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::ConnectorBehavior;

    /// Minimal [`BaseConnector`] used by the tests in this module: it
    /// reports type `"dummy"`, version `"0.0.0"`, and answers every request
    /// with an empty JSON object.
    struct DummyConnector;
    impl BaseConnector for DummyConnector {
        fn connector_type(&self) -> &str {
            "dummy"
        }
        fn version(&self) -> &str {
            "0.0.0"
        }
        // Ignores both arguments and always succeeds with `{}`.
        fn execute(
            &self,
            _: serde_json::Value,
            _: Option<&str>,
        ) -> std::pin::Pin<
            Box<dyn std::future::Future<Output = Result<serde_json::Value>> + Send + '_>,
        > {
            Box::pin(async { Ok(serde_json::json!({})) })
        }
        fn behavior(&self) -> ConnectorBehavior {
            ConnectorBehavior::Tool
        }
    }

    /// Builds a [`ConnectorRegistration`] around a [`DummyConnector`] with
    /// the given identity triple; every other config field keeps its default.
    fn reg(tenant: &str, ty: &str, inst: &str) -> ConnectorRegistration {
        let config = ConnectorConfig {
            tenant_id: tenant.to_owned(),
            connector_type: ty.to_owned(),
            instance_id: inst.to_owned(),
            ..ConnectorConfig::default()
        };
        ConnectorRegistration::new(config, DummyConnector)
    }

    /// An untouched builder must produce exactly the `Default` options.
    /// Every field is compared so a future divergence in any of them is
    /// caught, not just in the transport basics.
    #[test]
    fn options_builder_defaults_match_default_impl() {
        let built = MultiTransportOptions::builder().build();
        let defaulted = MultiTransportOptions::default();
        assert_eq!(built.host, defaulted.host);
        assert_eq!(built.use_tls, defaulted.use_tls);
        assert_eq!(built.transport_type, defaulted.transport_type);
        assert_eq!(
            built.max_streams_per_channel,
            defaulted.max_streams_per_channel
        );
        assert_eq!(built.connect_timeout_ms, defaulted.connect_timeout_ms);
        assert_eq!(built.reconnect_enabled, defaulted.reconnect_enabled);
        assert_eq!(built.reconnect_delay_ms, defaulted.reconnect_delay_ms);
        assert_eq!(built.max_backoff_delay_ms, defaulted.max_backoff_delay_ms);
        assert_eq!(built.reconnect_jitter_ms, defaulted.reconnect_jitter_ms);
        assert_eq!(
            built.max_concurrent_requests,
            defaulted.max_concurrent_requests
        );
    }

    /// Every builder setter must land in the built options, and
    /// `max_concurrent_requests(0)` must be raised to `1`.
    #[test]
    fn options_builder_overrides_apply() {
        let opts = MultiTransportOptions::builder()
            .host("h:1")
            .use_tls(true)
            .max_streams_per_channel(42)
            .transport_type(TransportType::WebSocket)
            .connect_timeout_ms(1_234)
            .reconnect_enabled(false)
            .reconnect_delay_ms(10)
            .max_backoff_delay_ms(20)
            .reconnect_jitter_ms(30)
            .max_concurrent_requests(0)
            .build();
        assert_eq!(opts.host, "h:1");
        assert!(opts.use_tls);
        assert_eq!(opts.max_streams_per_channel, 42);
        assert_eq!(opts.transport_type, TransportType::WebSocket);
        assert_eq!(opts.connect_timeout_ms, 1_234);
        assert!(!opts.reconnect_enabled);
        assert_eq!(opts.reconnect_delay_ms, 10);
        assert_eq!(opts.max_backoff_delay_ms, 20);
        assert_eq!(opts.reconnect_jitter_ms, 30);
        // The builder clamps a zero cap to one permit.
        assert_eq!(opts.max_concurrent_requests, 1);
    }

    /// The key derived from a config renders as `tenant.type.instance`.
    /// Nothing here awaits, so a plain synchronous test is enough.
    #[test]
    fn registration_key_from_config_matches_display_form() {
        let r = reg("t", "c", "i");
        let k = RegistrationKey::from_config(&r.config);
        assert_eq!(k.to_string(), "t.c.i");
    }

    /// `new()` keeps the first registration for each key and preserves the
    /// insertion order of the survivors.
    #[tokio::test]
    async fn duplicate_registrations_in_new_are_collapsed() {
        let initial = vec![reg("t", "c", "i"), reg("t", "c", "i"), reg("t", "c", "j")];
        let runner = MultiConnectorRunner::new(MultiTransportOptions::default(), initial);

        let instance_ids: Vec<String> = runner
            .registrations()
            .await
            .into_iter()
            .map(|key| key.instance_id)
            .collect();

        assert_eq!(instance_ids.len(), 2, "second duplicate should be dropped");
        assert_eq!(instance_ids[0], "i");
        assert_eq!(instance_ids[1], "j");
    }

    /// Adding a registration whose key already exists is an
    /// `InvalidConfig` error.
    #[tokio::test]
    async fn add_rejects_duplicates() {
        let options = MultiTransportOptions::default();
        let runner = MultiConnectorRunner::new(options, vec![reg("t", "c", "i")]);

        let outcome = runner.add(reg("t", "c", "i")).await;

        assert!(matches!(outcome, Err(ConnectorError::InvalidConfig(_))));
    }

    /// Once the `running` flag is set, `add()` refuses new registrations.
    #[tokio::test]
    async fn add_after_run_starts_is_rejected() {
        let options = MultiTransportOptions::default();
        let runner = MultiConnectorRunner::new(options, vec![reg("t", "c", "i")]);
        // Simulate a `run()` in progress without opening any transport.
        runner.running.store(true, Ordering::SeqCst);

        let outcome = runner.add(reg("t", "c", "j")).await;

        assert!(matches!(outcome, Err(ConnectorError::AlreadyRunning)));
    }

    /// The duplicate-key error is `InvalidConfig` and its message mentions
    /// "duplicate".
    #[tokio::test]
    async fn add_rejects_duplicate_with_invalid_config() {
        let options = MultiTransportOptions::default();
        let runner = MultiConnectorRunner::new(options, vec![reg("t", "c", "i")]);

        let outcome = runner.add(reg("t", "c", "i")).await;

        let mentions_duplicate = match &outcome {
            Err(ConnectorError::InvalidConfig(message)) => message.contains("duplicate"),
            _ => false,
        };
        assert!(mentions_duplicate);
    }

    /// A handle from `shutdown_handle()` flips the runner's own shutdown
    /// flag.
    #[tokio::test]
    async fn shutdown_handle_signals_internal_flag() {
        let options = MultiTransportOptions::default();
        let runner = MultiConnectorRunner::new(options, vec![reg("t", "c", "i")]);
        let handle = runner.shutdown_handle();
        let flag = &runner.shutdown_requested;

        assert!(!flag.load(Ordering::SeqCst));
        handle.shutdown();
        assert!(flag.load(Ordering::SeqCst));
    }

    /// With nothing registered, `run()` returns `Ok` straight away.
    #[tokio::test]
    async fn run_with_empty_registrations_is_ok() {
        let runner = MultiConnectorRunner::new(MultiTransportOptions::default(), Vec::new());
        let outcome = runner.run().await;
        assert!(outcome.is_ok(), "empty run should succeed");
    }

    #[tokio::test]
    async fn run_with_pre_signalled_shutdown_is_ok() {
        let opts = MultiTransportOptions::default();
        let multi = MultiConnectorRunner::new(opts, vec![reg("t", "c", "i")]);

        // Request shutdown before `run()` is ever called; `run()` must then
        // return `Ok` rather than an error.
        multi.shutdown_handle().shutdown();

        let outcome = multi.run().await;
        outcome.expect("pre-signalled shutdown should be a clean Ok exit");
    }

    #[tokio::test]
    async fn run_websocket_accepts_config_and_shuts_down_cleanly() {
        // On the WebSocket transport the multi-runner fans out to N
        // independent ConnectorRunners. No real WS server is available in
        // this test, so it only checks that a WS configuration is accepted
        // and that `run()` returns cleanly when shutdown was requested before
        // it started.
        let ws_opts = MultiTransportOptions::builder()
            .transport_type(TransportType::WebSocket)
            // Unreachable host: without the early shutdown, reconnect would
            // loop forever.
            .host("localhost:65535")
            .build();
        let multi = MultiConnectorRunner::new(ws_opts, vec![reg("t", "c", "i")]);

        multi.shutdown_handle().shutdown();

        let outcome = multi.run().await;
        outcome.expect("pre-signalled WS shutdown should be a clean Ok exit");
    }

    #[tokio::test]
    async fn run_websocket_shutdown_does_not_leak_watcher_tasks() {
        // Guards against the per-registration watcher-task leak. Reconnect is
        // switched off so the child ConnectorRunner gives up by itself once
        // the connection attempt fails. When that happens, the multi-runner's
        // run_websocket has to tear down its bridge task; otherwise shutdown
        // would stall on a watcher that keeps spinning.
        let mut ws_opts = MultiTransportOptions::builder()
            .transport_type(TransportType::WebSocket)
            // Loopback port 1: expected to have no listener, so the
            // connection attempt should be refused quickly.
            .host("127.0.0.1:1")
            .build();
        ws_opts.reconnect_enabled = false;

        let multi = MultiConnectorRunner::new(ws_opts, vec![reg("t", "c", "i")]);
        let trigger = multi.shutdown_handle();

        // Fire the shutdown signal a moment after `run()` has started. The
        // runner has to pick it up and return promptly, even when the child
        // ConnectorRunner is still in the middle of its handshake.
        let delay = std::time::Duration::from_millis(50);
        tokio::spawn(async move {
            tokio::time::sleep(delay).await;
            trigger.shutdown();
        });

        // The outer timeout bounds the whole run; hitting it means the runner
        // failed to react to the shutdown signal.
        let deadline = std::time::Duration::from_secs(5);
        let run_result = match tokio::time::timeout(deadline, multi.run()).await {
            Ok(inner) => inner,
            Err(_) => panic!("run_websocket must exit within 5s of shutdown signal"),
        };
        run_result.expect("run_websocket should return Ok after clean shutdown");
    }

    #[tokio::test]
    async fn add_races_with_run_either_lands_or_rejects_never_silently_dropped() {
        // Regression test for the TOCTOU between `add()`'s `running.load()`
        // and `run()`'s snapshot. Either the registration must be visible
        // to the runner (driven), or `add()` must return AlreadyRunning —
        // we must never observe "added but not driven".
        //
        // We can't drive a real run() without a server, so we exploit the
        // public surface: spawn run() against an empty pre-shutdown runner
        // (which exits cleanly without touching any registrations) and
        // race add() against it. The invariant is that `add()`'s outcome
        // must be consistent with `registrations()` *as seen after `run()`
        // returns*.
        for _ in 0..200 {
            let opts = MultiTransportOptions::default();
            let runner = std::sync::Arc::new(MultiConnectorRunner::new(opts, vec![]));
            // Pre-signal shutdown so run() exits without needing a server.
            runner.shutdown_handle().shutdown();

            let r1 = runner.clone();
            let run_task = tokio::spawn(async move { r1.run().await });

            // Yield once to let run() take the write lock first sometimes.
            tokio::task::yield_now().await;

            let add_res = runner.add(reg("t", "c", "i")).await;
            run_task.await.expect("run join").expect("run ok");

            match add_res {
                Ok(()) => {
                    // Must be visible in the registration list.
                    let keys = runner.registrations().await;
                    assert!(
                        keys.iter().any(|k| k.instance_id == "i"),
                        "add() succeeded but registration is not visible"
                    );
                }
                Err(ConnectorError::AlreadyRunning) => {
                    // Must NOT be in the list — rejection means not added.
                    let keys = runner.registrations().await;
                    assert!(
                        !keys.iter().any(|k| k.instance_id == "i"),
                        "add() returned AlreadyRunning but registration was still inserted"
                    );
                }
                Err(other) => panic!("unexpected add() error: {other:?}"),
            }
        }
    }

    #[tokio::test]
    async fn run_called_twice_returns_already_running() {
        let opts = MultiTransportOptions::default();
        let multi = MultiConnectorRunner::new(opts, vec![]);

        // Two genuinely concurrent run() calls cannot be observed without a
        // live server, so the `running` flag is set by hand. That sends this
        // run() call straight down the AlreadyRunning path; only that
        // precondition check is covered here.
        multi.running.store(true, Ordering::SeqCst);

        let rejection = multi.run().await.unwrap_err();
        assert!(matches!(rejection, ConnectorError::AlreadyRunning));
    }
}