ff-engine 0.15.0

//! Unblock scanner for budget/quota/capability-blocked executions.
//!
//! Scans `ff:idx:{p:N}:lane:<lane>:blocked:{budget,quota,route}` per
//! execution partition. For each blocked execution, re-evaluates the
//! blocking condition. If cleared, calls `FCALL ff_unblock_execution`.
//!
//! Cross-partition budget check is cached per scan cycle (MANDATORY —
//! without it, 50K blocked executions = 50K budget reads).
//!
//! Capability sweep reads the union of non-authoritative worker cap
//! HASHes (`ff:worker:{ns}:{instance_id}:caps` — written by
//! `ff-sdk::FlowFabricWorker::connect` and by the `ff_register_worker`
//! FCALL) ONCE per scan cycle PER NAMESPACE, and uses it to decide
//! whether a `waiting_for_capable_worker` execution has a matching
//! worker. This is best-effort: caps HASHes may be slightly stale
//! (TTL'd, refreshed on connect), but the promotion path is
//! self-correcting — a promoted execution that still can't be claimed
//! gets re-blocked on the next scheduler tick. RFC-009 §7.5 documents
//! the v1 sweep approach and defers connect-triggered sweeps to V2.
//!
//! RFC-025 Phase 5 cutover: caps reads go through the namespace-scoped
//! `ff:idx:{ns}:workers` + `ff:worker:{ns}:{id}:caps` helpers; the
//! cache keys off `Namespace` so multi-tenant deployments don't mix
//! capability sets across tenants.
//!
//! MUST skip `paused_by_flow_cancel` — only cancel_flow clears that.
//!
//! Reference: RFC-008 §2.4, RFC-009 §7.5, RFC-010 §6

use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;
use std::time::{Duration, Instant};

use futures::stream::{FuturesUnordered, StreamExt};
use tokio::sync::Mutex as AsyncMutex;

use ff_core::backend::ScannerFilter;
use ff_core::engine_backend::EngineBackend;
use ff_core::keys::IndexKeys;
use ff_core::partition::{Partition, PartitionConfig, PartitionFamily, budget_partition};
use ff_core::types::{BudgetId, ExecutionId, LaneId, Namespace, TimestampMs};

use super::{should_skip_candidate, ScanResult, Scanner};

const BATCH_SIZE: u32 = 100;

/// SSCAN page size for the workers-index SET. Same COUNT the other
/// index-SET scanners use (budget_reconciler, flow_projector). Bounds
/// per-cursor round-trip response size; total iteration is still the
/// full SET size across cursor=0 wrap.
const WORKERS_SSCAN_COUNT: usize = 100;

/// Per-worker caps GET fan-out concurrency cap. Mirrors the bounded
/// parallelism W1 used in initialize_waitpoint_hmac_secret. Too low and
/// large fleets (1000+ workers) pay serial round-trip latency; too high
/// and we head-of-line the scanner client with a pipeline burst. 16 is
/// a pragmatic middle for the current fleet scales.
const CAPS_GET_CONCURRENCY: usize = 16;

pub struct UnblockScanner {
    interval: Duration,
    lanes: Vec<LaneId>,
    partition_config: PartitionConfig,
    filter: ScannerFilter,
    /// Shared worker-caps union cache across ALL partitions in one scan
    /// pass. Previously this cache was declared inside `scan_partition`
    /// which runs once PER PARTITION — at 256 partitions that meant up
    /// to 256 redundant SSCAN + fan-out GET sequences per scan interval.
    /// Now: one load per TTL window (= `interval`), shared across every
    /// partition visited in that window. `TTL == interval` is natural:
    /// a worker connecting "now" propagates into the caps union on the
    /// next scan cycle, not faster or slower than the cycle itself.
    ///
    /// `Arc<AsyncMutex<_>>` because the Scanner trait's `scan_partition`
    /// takes `&self`. Contention is bounded by the partition iteration
    /// cadence (one partition at a time per scanner task), so the mutex
    /// is effectively uncontended in steady state.
    caps_cache: Arc<AsyncMutex<CapsUnionCache>>,
    backend: Option<Arc<dyn EngineBackend>>,
}

/// Per-namespace worker-caps union cache. Every blocked execution
/// carries a `namespace` field on `exec_core`; the scanner reads that
/// alongside `blocking_reason` and routes the subset check to the
/// union of caps advertised by workers in THAT namespace only. Absent
/// this partitioning, a tenant-A worker advertising
/// `required_caps=[gpu]` would unblock a tenant-B execution waiting on
/// the same token.
struct CapsUnionCache {
    /// TTL applied uniformly to every namespace entry — same cadence
    /// as `UnblockScanner::interval`, so a freshly-connected worker
    /// propagates into its namespace's union on the next scan cycle.
    ttl: Duration,
    /// Entry per observed namespace. Grows as new namespaces surface
    /// on blocked executions; entries do NOT expire (the bounded fleet
    /// of namespaces is small and refreshing an unused entry on next
    /// visit costs one SSCAN). Cleared on scanner restart.
    entries: HashMap<Namespace, CapsUnionEntry>,
}

struct CapsUnionEntry {
    snapshot: Option<BTreeSet<String>>,
    fetched_at: Option<Instant>,
}

impl UnblockScanner {
    pub fn new(interval: Duration, lanes: Vec<LaneId>, partition_config: PartitionConfig) -> Self {
        Self::with_filter(interval, lanes, partition_config, ScannerFilter::default())
    }

    /// Construct with a [`ScannerFilter`] applied per candidate
    /// (issue #122).
    pub fn with_filter(
        interval: Duration,
        lanes: Vec<LaneId>,
        partition_config: PartitionConfig,
        filter: ScannerFilter,
    ) -> Self {
        Self {
            interval,
            lanes,
            partition_config,
            filter,
            caps_cache: Arc::new(AsyncMutex::new(CapsUnionCache {
                ttl: interval,
                entries: HashMap::new(),
            })),
            backend: None,
        }
    }

    /// PR-7b Cluster 1: wire an `EngineBackend` for filter-resolution
    /// reads. FCALL routing is cluster 2 scope.
    pub fn with_filter_and_backend(
        interval: Duration,
        lanes: Vec<LaneId>,
        partition_config: PartitionConfig,
        filter: ScannerFilter,
        backend: Arc<dyn EngineBackend>,
    ) -> Self {
        Self {
            interval,
            lanes,
            partition_config,
            filter,
            caps_cache: Arc::new(AsyncMutex::new(CapsUnionCache {
                ttl: interval,
                entries: HashMap::new(),
            })),
            backend: Some(backend),
        }
    }
}

impl Scanner for UnblockScanner {
    fn name(&self) -> &'static str {
        "unblock"
    }

    fn interval(&self) -> Duration {
        self.interval
    }

    fn filter(&self) -> &ScannerFilter {
        &self.filter
    }

    async fn scan_partition(
        &self,
        client: &ferriskey::Client,
        partition: u16,
    ) -> ScanResult {
        let p = Partition {
            family: PartitionFamily::Execution,
            index: partition,
        };
        let idx = IndexKeys::new(&p);

        let mut total_processed: u32 = 0;
        let mut total_errors: u32 = 0;

        // Cross-partition budget cache: budget_id → is_breached.
        // Reset per partition scan (each partition scan is one "cycle").
        let mut budget_cache: HashMap<String, bool> = HashMap::new();

        // Worker-caps union cache is shared across ALL partitions via
        // the scanner struct (Arc<AsyncMutex<CapsUnionCache>>). `get_or_load`
        // returns the cached snapshot if its fetched_at is within
        // `interval` (TTL == scan interval), otherwise loads fresh via
        // SSCAN + concurrent GET fan-out. Without this, at 256 partitions
        // the old per-partition-local cache re-ran load_worker_caps_union
        // up to 256× per cycle.
        let caps_cache = self.caps_cache.clone();

        for lane in &self.lanes {
            // Scan blocked:budget
            let budget_key = idx.lane_blocked_budget(lane);
            let r = scan_blocked_set(
                client, self.backend.as_ref(), &p, &idx, lane, &budget_key,
                "waiting_for_budget", &mut budget_cache,
                &caps_cache,
                &self.partition_config,
                &self.filter,
            ).await;
            total_processed += r.processed;
            total_errors += r.errors;

            // Scan blocked:quota
            let quota_key = idx.lane_blocked_quota(lane);
            let r = scan_blocked_set(
                client, self.backend.as_ref(), &p, &idx, lane, &quota_key,
                "waiting_for_quota", &mut budget_cache,
                &caps_cache,
                &self.partition_config,
                &self.filter,
            ).await;
            total_processed += r.processed;
            total_errors += r.errors;

            // Scan blocked:route (capability-mismatch blocks). Promotion
            // decision reads the union of connected workers' caps and
            // checks subset coverage. See check_route_cleared below.
            let route_key = idx.lane_blocked_route(lane);
            let r = scan_blocked_set(
                client, self.backend.as_ref(), &p, &idx, lane, &route_key,
                "waiting_for_capable_worker", &mut budget_cache,
                &caps_cache,
                &self.partition_config,
                &self.filter,
            ).await;
            total_processed += r.processed;
            total_errors += r.errors;
        }

        ScanResult {
            processed: total_processed,
            errors: total_errors,
        }
    }
}

/// Scan one blocked set and unblock executions whose condition has cleared.
#[allow(clippy::too_many_arguments)]
async fn scan_blocked_set(
    client: &ferriskey::Client,
    backend: Option<&Arc<dyn EngineBackend>>,
    partition: &Partition,
    idx: &IndexKeys,
    lane: &LaneId,
    blocked_key: &str,
    expected_reason: &str,
    budget_cache: &mut HashMap<String, bool>,
    caps_cache: &Arc<AsyncMutex<CapsUnionCache>>,
    partition_config: &PartitionConfig,
    filter: &ScannerFilter,
) -> ScanResult {
    // Read all members from the blocked set (they're scored by block time)
    let blocked: Vec<String> = match client
        .cmd("ZRANGEBYSCORE")
        .arg(blocked_key)
        .arg("-inf")
        .arg("+inf")
        .arg("LIMIT")
        .arg("0")
        .arg(BATCH_SIZE.to_string().as_str())
        .execute()
        .await
    {
        Ok(ids) => ids,
        Err(e) => {
            tracing::warn!(
                error = %e,
                blocked_key,
                "unblock_scanner: ZRANGEBYSCORE failed"
            );
            return ScanResult { processed: 0, errors: 1 };
        }
    };

    if blocked.is_empty() {
        return ScanResult { processed: 0, errors: 0 };
    }

    let mut processed: u32 = 0;
    let mut errors: u32 = 0;
    let tag = partition.hash_tag();

    for eid_str in &blocked {
        if should_skip_candidate(backend, filter, partition.index, eid_str).await {
            continue;
        }
        // Read blocking_reason + namespace from exec_core. Namespace
        // gates the caps-union subset check so tenant-A worker caps
        // never unblock a tenant-B `blocked_by_route` execution —
        // RFC-025 Phase 5 multi-tenant safety property.
        let core_key = format!("ff:exec:{}:{}:core", tag, eid_str);
        let fields: Vec<Option<String>> = match client
            .cmd("HMGET")
            .arg(&core_key)
            .arg("blocking_reason")
            .arg("namespace")
            .execute()
            .await
        {
            Ok(r) => r,
            Err(e) => {
                tracing::warn!(
                    execution_id = eid_str.as_str(),
                    error = %e,
                    "unblock_scanner: HMGET blocking_reason/namespace failed, skipping"
                );
                errors += 1;
                continue;
            }
        };
        if fields.len() < 2 {
            tracing::warn!(
                execution_id = eid_str.as_str(),
                returned_fields = fields.len(),
                "unblock_scanner: HMGET returned < 2 fields, skipping"
            );
            errors += 1;
            continue;
        }
        let reason = fields[0].as_deref().unwrap_or("");
        let namespace = match fields[1].as_deref().filter(|s| !s.is_empty()) {
            Some(ns_str) => Namespace::new(ns_str),
            None => {
                // Every execution gets a namespace at create-time
                // (`ff_create_execution` HSETs it). Missing here = data
                // integrity defect; skip rather than risk a
                // cross-tenant promotion.
                tracing::warn!(
                    execution_id = eid_str.as_str(),
                    core_key = %core_key,
                    "unblock_scanner: exec_core missing namespace field, skipping"
                );
                errors += 1;
                continue;
            }
        };

        // Skip if not blocked by the expected reason (e.g. paused_by_flow_cancel)
        if reason != expected_reason {
            continue;
        }

        // Re-evaluate the blocking condition
        let should_unblock = match expected_reason {
            "waiting_for_budget" => {
                check_budget_cleared(client, &core_key, budget_cache, partition_config).await
            }
            "waiting_for_quota" => {
                check_quota_cleared(client, backend, &core_key, eid_str, partition_config).await
            }
            "waiting_for_capable_worker" => {
                check_route_cleared(client, &core_key, caps_cache, &namespace).await
            }
            _ => false,
        };

        if !should_unblock {
            continue;
        }

        // Unblock: trait-route → `EngineBackend::unblock_execution`
        // (Valkey: wraps `ff_unblock_execution` FCALL on {p:N}).
        let now_ms_res: Result<u64, String> = if let Some(b) = backend {
            b.server_time_ms().await.map_err(|e| e.to_string())
        } else {
            crate::scanner::lease_expiry::server_time_ms_legacy(client).await.map_err(|e| e.to_string())
        };
        let now_ms = match now_ms_res {
            Ok(t) => t,
            Err(e) => {
                tracing::warn!(
                    execution_id = eid_str.as_str(),
                    error = %e,
                    "unblock_scanner: server TIME failed, skipping unblock"
                );
                errors += 1;
                continue;
            }
        };

        let res = if let Some(backend_arc) = backend {
            let Ok(eid) = ExecutionId::parse(eid_str) else {
                tracing::warn!(execution_id = eid_str.as_str(), "malformed eid; skipping");
                continue;
            };
            backend_arc
                .unblock_execution(
                    *partition,
                    lane,
                    &eid,
                    expected_reason,
                    TimestampMs::from_millis(now_ms as i64),
                )
                .await
                .map_err(|e| e.to_string())
        } else {
            // Test-only fallback: direct FCALL on the scanner client.
            // Mirrors cluster-1 lease_expiry. KEYS(3)/ARGV(3) identical
            // to the Valkey impl.
            let eligible_key = idx.lane_eligible(lane);
            let keys: [&str; 3] = [&core_key, blocked_key, &eligible_key];
            let now_s = now_ms.to_string();
            let argv: [&str; 3] = [eid_str, &now_s, expected_reason];
            client
                .fcall::<ferriskey::Value>("ff_unblock_execution", &keys, &argv)
                .await
                .map(|_: ferriskey::Value| ())
                .map_err(|e| e.to_string())
        };

        match res {
            Ok(()) => {
                tracing::info!(
                    execution_id = eid_str.as_str(),
                    reason = expected_reason,
                    "unblock_scanner: execution unblocked"
                );
                processed += 1;
            }
            Err(e) => {
                tracing::warn!(
                    execution_id = eid_str.as_str(),
                    error = %e,
                    "unblock_scanner: unblock_execution failed"
                );
                errors += 1;
            }
        }
    }

    ScanResult { processed, errors }
}

/// Check if budget blocking condition has cleared.
/// Uses cross-partition cache to avoid redundant reads.
async fn check_budget_cleared(
    client: &ferriskey::Client,
    core_key: &str,
    cache: &mut HashMap<String, bool>,
    config: &PartitionConfig,
) -> bool {
    // Read budget_ids from exec_core
    let budget_ids_str: Option<String> = client
        .cmd("HGET")
        .arg(core_key)
        .arg("budget_ids")
        .execute()
        .await
        .unwrap_or(None);

    let budget_ids_str = match budget_ids_str {
        Some(s) if !s.is_empty() => s,
        _ => return true, // no budgets → unblock
    };

    for budget_id in budget_ids_str.split(',') {
        let budget_id = budget_id.trim();
        if budget_id.is_empty() {
            continue;
        }

        // Check cache first
        if let Some(&breached) = cache.get(budget_id) {
            if breached {
                return false; // still breached
            }
            continue;
        }

        // Read from Valkey and cache
        let breached = is_budget_breached(client, budget_id, config).await;
        cache.insert(budget_id.to_owned(), breached);
        if breached {
            return false;
        }
    }

    true // all budgets within limits
}

/// Read budget usage + limits and check if any hard limit is breached.
/// Computes real {b:M} partition tag from budget_id.
async fn is_budget_breached(
    client: &ferriskey::Client,
    budget_id: &str,
    config: &PartitionConfig,
) -> bool {
    // Compute the real budget partition tag
    let bid = match BudgetId::parse(budget_id) {
        Ok(id) => id,
        Err(_) => return false, // invalid budget_id → treat as not breached
    };
    let partition = budget_partition(&bid, config);
    let tag = partition.hash_tag();
    let usage_key = format!("ff:budget:{}:{}:usage", tag, budget_id);
    let limits_key = format!("ff:budget:{}:{}:limits", tag, budget_id);

    // Read hard limits — fail-closed: if Valkey read fails, treat as breached
    // (keep execution blocked) rather than silently unblocking
    let limits: Vec<String> = match client
        .cmd("HGETALL")
        .arg(&limits_key)
        .execute()
        .await
    {
        Ok(v) => v,
        Err(e) => {
            tracing::error!(
                budget_id,
                error = %e,
                "unblock_scanner: budget limits read failed, keeping blocked (fail-closed)"
            );
            return true; // treat as breached
        }
    };

    let mut i = 0;
    while i + 1 < limits.len() {
        let field = &limits[i];
        let limit_str = &limits[i + 1];
        i += 2;

        if !field.starts_with("hard:") {
            continue;
        }
        let dimension = &field[5..];
        let limit: u64 = match limit_str.parse() {
            Ok(v) if v > 0 => v,
            _ => continue,
        };

        let usage_str: Option<String> = match client
            .cmd("HGET")
            .arg(&usage_key)
            .arg(dimension)
            .execute()
            .await
        {
            Ok(v) => v,
            Err(e) => {
                tracing::error!(
                    budget_id,
                    dimension,
                    error = %e,
                    "unblock_scanner: budget usage read failed, keeping blocked (fail-closed)"
                );
                return true; // treat as breached
            }
        };
        let usage: u64 = usage_str.as_deref().and_then(|s| s.parse().ok()).unwrap_or(0);

        if usage >= limit {
            return true; // breached
        }
    }

    false
}

/// Check if quota blocking condition has cleared.
/// Re-checks the sliding window after cleanup.
/// Computes real {q:K} partition tag from quota_policy_id.
async fn check_quota_cleared(
    client: &ferriskey::Client,
    backend: Option<&Arc<dyn EngineBackend>>,
    core_key: &str,
    _eid_str: &str,
    config: &PartitionConfig,
) -> bool {
    // Read quota_policy_id from exec_core — fail-closed on Valkey error
    let quota_id: Option<String> = match client
        .cmd("HGET")
        .arg(core_key)
        .arg("quota_policy_id")
        .execute()
        .await
    {
        Ok(v) => v,
        Err(e) => {
            tracing::error!(
                core_key,
                error = %e,
                "unblock_scanner: quota_policy_id read failed, keeping blocked (fail-closed)"
            );
            return false;
        }
    };

    let quota_id = match quota_id {
        Some(s) if !s.is_empty() => s,
        _ => return true, // no quota → unblock
    };

    // Compute real quota partition tag
    let qid = match ff_core::types::QuotaPolicyId::parse(&quota_id) {
        Ok(id) => id,
        Err(_) => return true, // invalid → unblock (advisory)
    };
    let partition = ff_core::partition::quota_partition(&qid, config);
    let tag = partition.hash_tag();

    let quota_def_key = format!("ff:quota:{}:{}", tag, quota_id);
    let window_key = format!("ff:quota:{}:{}:window:requests_per_window", tag, quota_id);
    let concurrency_key = format!("ff:quota:{}:{}:concurrency", tag, quota_id);

    // Read quota definition fields — fail-closed on Valkey error
    let def_fields: Vec<Option<String>> = match client
        .cmd("HMGET")
        .arg(&quota_def_key)
        .arg("max_requests_per_window")
        .arg("requests_per_window_seconds")
        .arg("active_concurrency_cap")
        .execute()
        .await
    {
        Ok(v) => v,
        Err(e) => {
            tracing::error!(
                quota_id = %quota_id,
                error = %e,
                "unblock_scanner: quota definition read failed, keeping blocked (fail-closed)"
            );
            return false;
        }
    };
    let rate_limit: u64 = def_fields.first()
        .and_then(|v| v.as_ref())
        .and_then(|s| s.parse().ok())
        .unwrap_or(0);
    let window_secs: u64 = def_fields.get(1)
        .and_then(|v| v.as_ref())
        .and_then(|s| s.parse().ok())
        .unwrap_or(60);
    let concurrency_cap: u64 = def_fields.get(2)
        .and_then(|v| v.as_ref())
        .and_then(|s| s.parse().ok())
        .unwrap_or(0);

    // Check rate: clean window, count
    if rate_limit > 0 {
        let now_ms_res: Result<u64, String> = if let Some(b) = backend {
            b.server_time_ms().await.map_err(|e| e.to_string())
        } else {
            crate::scanner::lease_expiry::server_time_ms_legacy(client).await.map_err(|e| e.to_string())
        };
        let now_ms = match now_ms_res {
            Ok(t) => t,
            Err(_) => return false,
        };
        let window_ms = window_secs * 1000;
        let cutoff = (now_ms.saturating_sub(window_ms)).to_string();

        let _: Result<i64, _> = client
            .cmd("ZREMRANGEBYSCORE")
            .arg(&window_key)
            .arg("-inf")
            .arg(&cutoff)
            .execute()
            .await;

        let count: i64 = client
            .cmd("ZCARD")
            .arg(&window_key)
            .execute()
            .await
            .unwrap_or(0);

        if count as u64 >= rate_limit {
            return false; // still at limit
        }
    }

    // Check concurrency
    if concurrency_cap > 0 {
        let active: i64 = client
            .cmd("GET")
            .arg(&concurrency_key)
            .execute()
            .await
            .unwrap_or(0);

        if active as u64 >= concurrency_cap {
            return false; // still at cap
        }
    }

    true // quota cleared
}

/// Check if the capability-block has cleared: some connected worker's
/// caps now cover the execution's `required_capabilities`.
///
/// The UNION of every connected worker's caps is cached on the scanner
/// struct with a TTL equal to `interval`; so within one scan cycle every
/// partition reuses the same snapshot, and between cycles a stale
/// snapshot is automatically refreshed.
///
/// Fail-OPEN on union-load failure: if the SSCAN or fan-out GET errors
/// out, assume a worker might match and let the scheduler re-decide on
/// the next tick. The caps set is non-authoritative (Lua never reads it);
/// treating it as "unknown → maybe" preserves liveness. Fail-closed
/// would leave executions stuck whenever the caps read hits a transient
/// Valkey error.
async fn check_route_cleared(
    client: &ferriskey::Client,
    core_key: &str,
    caps_cache: &Arc<AsyncMutex<CapsUnionCache>>,
    namespace: &Namespace,
) -> bool {
    let required_csv: Option<String> = client
        .cmd("HGET")
        .arg(core_key)
        .arg("required_capabilities")
        .execute()
        .await
        .unwrap_or(None);
    let required_csv = match required_csv {
        Some(s) if !s.is_empty() => s,
        _ => return true, // no required caps → anyone can claim
    };

    // Two-phase cache access: NEVER hold the mutex across the
    // network refresh. Holding `AsyncMutex` across
    // `load_worker_caps_union`'s SSCAN + HGET fan-out would serialize
    // every tenant's cap-block decision behind one slow namespace's
    // Valkey round trip — see PR #500 review.
    //
    // Phase 1: under the lock, decide if a refresh is needed and
    // grab a current-or-empty snapshot for the fallback path.
    //
    // Phase 2: if stale, refresh off-lock, then re-acquire briefly
    // to write the result. Racing refreshes for the same namespace
    // resolve to "last writer wins" — acceptable because all fetches
    // observe the same authoritative index at roughly the same time.
    let (stale, cached_snapshot): (bool, BTreeSet<String>) = {
        let mut guard = caps_cache.lock().await;
        let ttl = guard.ttl;
        let entry = guard
            .entries
            .entry(namespace.clone())
            .or_insert(CapsUnionEntry {
                snapshot: None,
                fetched_at: None,
            });
        let stale = entry
            .fetched_at
            .map(|t| t.elapsed() >= ttl)
            .unwrap_or(true);
        (stale, entry.snapshot.clone().unwrap_or_default())
    };

    let snapshot: BTreeSet<String> = if stale {
        match load_worker_caps_union(client, namespace).await {
            Ok(union) => {
                let refreshed = union.clone();
                let mut guard = caps_cache.lock().await;
                if let Some(entry) = guard.entries.get_mut(namespace) {
                    entry.snapshot = Some(refreshed);
                    entry.fetched_at = Some(Instant::now());
                }
                union
            }
            Err(e) => {
                tracing::warn!(
                    error = %e,
                    namespace = %namespace,
                    "unblock_scanner: failed to read worker caps union — \
                     assuming match possible (fail-open to preserve liveness)"
                );
                return true;
            }
        }
    } else {
        cached_snapshot
    };

    // Subset check: every non-empty token in required_csv present in union.
    required_csv
        .split(',')
        .filter(|t| !t.is_empty())
        .all(|t| snapshot.contains(t))
}

/// Union of every connected worker's advertised capabilities within
/// the given namespace.
///
/// #502 note — NOT folded into a single Lua FCALL by design: the
/// `workers_index_key_ns` (`ff:idx:<ns>:workers`) and the per-worker
/// `worker_caps_key_ns` (`ff:worker:<ns>:<inst>:caps`) do NOT share a
/// Valkey hash-tag `{…}` in the current key shape. In cluster mode an
/// FCALL only sees keys on a single slot; packaging SSCAN + per-worker
/// HGET into one Lua body would silently drop workers whose caps key
/// hashes to a different slot than the index SET. A restructure to
/// share `{ns}` as a hash tag (e.g. `ff:idx:{<ns>}:workers` +
/// `ff:worker:{<ns>}:<inst>:caps`) would require migrating the SDK
/// preamble + `ff_register_worker` FCALL + every reader in one hop —
/// cross-cutting change, deferred past the #502 closing-PR scope. The
/// SSCAN + bounded-concurrent fan-out below is already the shape we'd
/// emit from Lua, minus the single-round-trip framing.
///
/// Cluster-safe enumeration pattern (matches Batch A's index SETs for
/// budget/flow/deps): SSCAN the namespace-scoped
/// `workers_index_key_ns(namespace)` SET (single-slot, no hash tag
/// needed — the key name literally hashes to one slot), then fan-out
/// concurrent `HGET ff:worker:{ns}:{id}:caps caps_csv` with a bounded
/// concurrency cap. A keyspace `SCAN MATCH ff:worker:*:caps` in
/// cluster mode visits only one shard per call and silently drops
/// workers on other shards — exactly the class of bug Batch A Issue
/// #11 fixed.
///
/// Caps storage is a HASH (RFC-025 Phase 5 post-cutover — shape
/// matches `ff_register_worker` FCALL + the SDK preamble), and only
/// the `caps_csv` field drives the union. Per-worker HGET is a
/// single-field read, not HGETALL, so the scanner pays only the bytes
/// it uses.
///
/// SSCAN is used instead of SMEMBERS so a fleet of thousands of workers
/// doesn't round-trip the entire member list in one reply. `COUNT = 100`
/// matches the convention in budget_reconciler / flow_projector.
///
/// Empty `caps_csv` or missing key = "no caps for that worker";
/// scanner keeps accumulating. Per-worker HGET error, in contrast,
/// PROPAGATES — a previous version used `.unwrap_or(None)` which
/// silently merged error and empty into the same branch, making a
/// transient error look like "this worker has no caps". In a
/// single-capable-worker fleet that produced false-negative unions and
/// left executions blocked even though a matching worker existed,
/// contradicting the scanner's documented fail-open behavior. Now an
/// error bubbles up; the only caller (`check_route_cleared`) treats
/// `Err` by returning `true` (unblock — "we don't know, let the
/// scheduler re-decide next tick"), which preserves liveness uniformly
/// whether the fault is SSCAN, HGET, or deeper transport.
async fn load_worker_caps_union(
    client: &ferriskey::Client,
    namespace: &Namespace,
) -> Result<BTreeSet<String>, ferriskey::Error> {
    let mut union = BTreeSet::new();
    let index_key = ff_core::keys::workers_index_key_ns(namespace);

    // Helper: drain one completed future and fold its caps into the
    // union, or propagate its error. Centralizing keeps the in-loop +
    // drain paths symmetric (both must behave the same — a missed error
    // at either point re-introduces the false-negative-union bug).
    fn absorb(
        union: &mut BTreeSet<String>,
        res: Result<Option<String>, ferriskey::Error>,
    ) -> Result<(), ferriskey::Error> {
        let csv = res?;
        if let Some(csv) = csv {
            for token in csv.split(',') {
                if !token.is_empty() {
                    union.insert(token.to_owned());
                }
            }
        }
        Ok(())
    }

    // SSCAN loop — bounded per-page response size. Cursor starts at "0"
    // and wraps back to "0" when iteration completes.
    let mut cursor: String = "0".to_owned();
    loop {
        let reply: (String, Vec<String>) = client
            .cmd("SSCAN")
            .arg(&index_key)
            .arg(&cursor)
            .arg("COUNT")
            .arg(WORKERS_SSCAN_COUNT.to_string().as_str())
            .execute()
            .await?;
        cursor = reply.0;
        let worker_ids = reply.1;

        // Bounded concurrent HGETs per SSCAN page. FuturesUnordered with
        // a buffered stream caps in-flight work at CAPS_GET_CONCURRENCY —
        // enough parallelism to amortize round-trip latency, bounded so
        // one scanner tick can't flood the shared Valkey client. Each
        // spawned future returns `Result<Option<String>, Error>` so
        // transient Valkey errors propagate up (no .unwrap_or(None)
        // swallowing) — see the fn-level doc for why.
        let mut pending: FuturesUnordered<_> = FuturesUnordered::new();
        for id in worker_ids {
            let client = client.clone();
            let namespace = namespace.clone();
            pending.push(async move {
                let instance = ff_core::types::WorkerInstanceId::new(id);
                let caps_key =
                    ff_core::keys::worker_caps_key_ns(&namespace, &instance);
                let csv: Option<String> = client
                    .cmd("HGET")
                    .arg(&caps_key)
                    .arg("caps_csv")
                    .execute()
                    .await?;
                Ok::<Option<String>, ferriskey::Error>(csv)
            });
            if pending.len() >= CAPS_GET_CONCURRENCY
                && let Some(res) = pending.next().await
            {
                absorb(&mut union, res)?;
            }
        }
        // Drain remaining pending GETs from this page before advancing
        // the SSCAN cursor. Keeps the pipeline window bounded and the
        // union observation consistent with "all workers visible so far".
        while let Some(res) = pending.next().await {
            absorb(&mut union, res)?;
        }

        if cursor == "0" {
            break;
        }
    }
    Ok(union)
}