laburnum 1.17.3

// Copyright Two Neutron Stars Incorporated and contributors
// SPDX-License-Identifier: BlueOak-1.0.0

use {
  crate::{
    connect::lsp::{ClientId, ClientRegistry},
    daemon::{
      DaemonConfig, daemon_task::DaemonTask, idle_monitor::idle_monitor_task,
    },
    server::IpcServer,
  },
  concurrent_queue::ConcurrentQueue,
  std::{
    sync::{
      Arc,
      atomic::{AtomicBool, AtomicUsize, Ordering},
    },
    time::Duration,
  },
};

mod gc;
pub mod key_watcher;
pub mod lanes;
pub mod task;
mod worker;

use {
  crate::{
    Ident, Partitions, TRACER,
    connect::ipc::Connection,
    database::{
      Database, GenerationEpoch, chunk::RecordWriter, gc::GarbageCollector,
      query::SortKeyCondition, reaper::Reaper,
    },
    progress::ProgressTracker,
    protocol::{lsp::LanguageServer, task::RpcTask},
    scheduler::{
      key_watcher::{WatcherResult, dispatch_builtin_watcher},
      lanes::{Lane, lane_priority},
      task::{LaburnumTask, TaskContext},
    },
  },
  std::{
    collections::{BTreeMap, HashMap},
    future::Future,
  },
};

/// How the scheduler's lane queues are driven.
///
/// `Threaded` runs a pool of worker threads that block-park between work.
/// `Inline` runs no threads of its own: an external async executor polls
/// [`Scheduler::drive`] on its own thread, and queueing wakes that pump
/// through an [`event_listener::Event`] instead of unparking threads.
pub(crate) enum RunMode {
  /// Worker threads drive the lanes. This is the mode selected by
  /// `new_with_config` and `new_daemon`.
  Threaded,
  /// An external executor drives the lanes; `wake` is the event used to wake
  /// it. Only compiled when the `test` feature is enabled.
  #[cfg(feature = "test")]
  Inline { wake: event_listener::Event },
}

/// A one-shot re-dispatch registered against a `pending_deps` condition; fired
/// when a matching key lands (see [`Scheduler::fire_pending_redispatch`]).
///
/// Boxed as `FnOnce` so each thunk can be invoked at most once, and `Send` so
/// it can be handed across threads.
type ReDispatch = Box<dyn FnOnce() + Send>;

/// `pending_deps` re-dispatch registry, keyed by the partition a watcher read.
/// Each entry carries the owning watcher instance (so a re-dispatch can clear
/// its prior registrations — ADR0008), the sort-key condition it needs, and the
/// thunk to re-run.
///
/// Tuple layout of each entry: `(owner, condition, thunk)`.
type PendingRedispatch<P> = HashMap<
  Ident,
  Vec<(Ident, SortKeyCondition<<P as Partitions>::SortKey>, ReDispatch)>,
>;

/// The watcher handler function pointer shape shared by the `watchers!` macro
/// and the built-in watchers.
///
/// A handler borrows the task context and the partition write context for the
/// same lifetime `'a` and returns a boxed, pinned, `Send` future that resolves
/// to a [`WatcherResult`]. The future may hold both borrows until it
/// completes.
type WatcherHandlerFn<P, T> = for<'a> fn(
  &'a mut TaskContext<P, T>,
  &'a mut crate::database::PartitionWriteContextRef<'a, P>,
) -> std::pin::Pin<
  Box<dyn Future<Output = WatcherResult<P, T>> + Send + 'a>,
>;

/// Configuration options for the task scheduler.
///
/// Controls garbage collection, RPC buffering, and other scheduler behavior.
/// The `Default` implementation supplies the defaults quoted on each field.
#[derive(Debug, Clone)]
pub struct SchedulerConfiguration {
  /// Maximum number of RPC responses to buffer before blocking.
  ///
  /// Higher values reduce blocking but use more memory. Default: 100.
  pub rpc_response_capacity: usize,

  /// Whether to run periodic garbage collection on IDLE_LANE.
  ///
  /// Default: `true`, or `false` when the crate is built with the `test`
  /// feature.
  pub enable_periodic_gc: bool,

  /// How long the idle debounce waits after the last user task completes
  /// before declaring quiescence and ending open progress. Absorbs
  /// transient dips to zero between pipeline stages and coalesces rapid
  /// successive edits into a single progress run. Default: 10ms.
  pub idle_debounce: Duration,
}

impl Default for SchedulerConfiguration {
  /// Builds the stock configuration: a 100-entry RPC response buffer, a 10ms
  /// idle debounce, and periodic GC switched on unless the crate is built
  /// with the `test` feature.
  fn default() -> Self {
    // Resolved at compile time, just like a `#[cfg]`-gated field would be.
    let enable_periodic_gc = !cfg!(feature = "test");
    let idle_debounce = Duration::from_millis(10);

    Self {
      rpc_response_capacity: 100,
      enable_periodic_gc,
      idle_debounce,
    }
  }
}

/// Priority-based async task scheduler with 31 priority lanes.
///
/// The scheduler manages execution of compilation tasks across multiple worker
/// threads, using a lane-based priority system inspired by React Fiber.
///
/// # Architecture
///
/// - **31 priority lanes**: Concurrent queues ordered from highest (SYNC_LANE)
///   to lowest (SPECULATIVE_LANE)
/// - **Worker threads**: `num_cpus - 1` threads that poll lanes for work
/// - **Priority selection**: O(1) via array iteration from high to low lanes
/// - **No work stealing**: Simpler implementation, predictable behavior
///
/// # Worker Thread Count
///
/// Default: `num_cpus::get() - 1` (leaves one CPU for main thread handling RPC
/// messages)
///
/// # Lane System
///
/// Each lane is a separate concurrent queue. Workers check lanes from high to
/// low priority, executing the first task found. This provides strict priority
/// ordering without locking overhead.
///
/// See [`lanes`] module for complete lane hierarchy.
pub struct Scheduler<P: Partitions, T: LanguageServer<P>> {
  /// The database instance, created fresh in `new_inner`.
  db: Database<P>,
  /// The scheduler's own client connection. `new_daemon` installs an
  /// unconnected placeholder channel here; `add_initial_tasks` hands a clone
  /// to the initial RPC task.
  pub(crate) connection: Connection,
  /// Shared list of filesystems supplied by the caller at construction.
  filesystems: Arc<parking_lot::RwLock<Vec<crate::fs::FS>>>,
  /// Shared source cache. `new_inner` gives it a weak back-reference to this
  /// scheduler via `set_scheduler`.
  source_cache: Arc<parking_lot::RwLock<crate::SourceCache<P, T>>>,

  /// 31 concurrent queues, one per priority lane. Workers iterate from lane 0
  /// (highest) to lane 30 (lowest) to find work.
  lane_queues: [ConcurrentQueue<Arc<LaburnumTask<P, T>>>; 31],

  /// Lock for RPC lane rotation. Ensures atomic bubble-up of tasks across
  /// the 4 RPC priority lanes during `queue_rpc_task()`.
  rpc_rotation_lock: parking_lot::Mutex<()>,

  /// Join handles of the spawned workers. Filled by `spawn_workers` and
  /// drained (then joined) by `run` / `run_daemon` once shutdown is flagged.
  worker_threads: parking_lot::RwLock<Vec<std::thread::JoinHandle<()>>>,

  /// Number of worker threads to spawn. Default: num_cpus - 1
  worker_count: usize,

  /// How the lane queues are driven (worker threads vs. an external executor
  /// polling [`Scheduler::drive`]). Determines how queueing wakes the runtime.
  run_mode: RunMode,

  /// The language server implementation; cloned into every RPC task.
  pub server: Arc<T>,
  /// Stop signal polled by `run` and `run_daemon`: once it reads `true` the
  /// main loop exits and the worker threads are joined.
  pub shutdown_flag: Arc<AtomicBool>,
  /// Configuration captured at construction.
  config: SchedulerConfiguration,

  /// Flag to request graceful shutdown from command handlers.
  /// Checked by DaemonServer to trigger graceful_shutdown().
  shutdown_requested: Arc<AtomicBool>,

  pub(crate) progress_tracker: Arc<ProgressTracker>,

  /// Count of in-flight `User` (compilation) tasks. Incremented when such a
  /// task is constructed, decremented when it reaches `Poll::Ready`. When it
  /// falls to zero the pipeline is quiescent. `System` tasks (RPC pump,
  /// daemon loop, GC, the debounce itself) are excluded so they don't pin
  /// this above zero forever.
  pub(crate) work_in_flight: AtomicUsize,

  /// Singleton guard for the idle debounce: ensures only one debounce task
  /// is in flight at a time even as the counter repeatedly touches zero.
  pub(crate) idle_debounce_armed: AtomicBool,

  /// `pending_deps` re-dispatch registry (GLD-0035). Keyed by the partition a
  /// watcher read while a record was absent; each entry holds the sort-key
  /// condition it needed and a one-shot thunk that re-runs that watcher. On
  /// each commit, [`fire_pending_redispatch`](Self::fire_pending_redispatch)
  /// fires the thunks whose condition matches a newly written/deleted key.
  pub(crate) pending_redispatch: parking_lot::Mutex<PendingRedispatch<P>>,

  /// Lock-free count of registrations in `pending_redispatch`, so the
  /// commit hot path can skip taking the mutex when nothing is waiting (the
  /// common case). Kept in sync with the registry's total entry count.
  pub(crate) pending_redispatch_count: AtomicUsize,

  /// Client registry for tracking connected clients.
  pub(crate) registry: Arc<ClientRegistry>,

  /// Reference-counting reaper for deferred decrements from index overwrites.
  pub(crate) reaper: Reaper<P>,

  /// Tri-color mark-sweep garbage collector.
  pub(crate) gc: GarbageCollector,

  /// Tracks the epoch at which each active task started, for snapshot isolation.
  /// Maps epoch -> count of active tasks at that epoch.
  /// The minimum key is the oldest running epoch, below which the reaper
  /// can safely process deferred decrements.
  pub(crate) active_epochs:
    parking_lot::Mutex<BTreeMap<GenerationEpoch, usize>>,
}

impl<P: Partitions, T: LanguageServer<P>> Scheduler<P, T> {
  /// Creates a threaded scheduler using [`SchedulerConfiguration::default`].
  ///
  /// The worker pool is sized to one less than the CPU count reported by
  /// `num_cpus`, and never smaller than a single worker.
  pub fn new(
    connection: Connection,
    server: Arc<T>,
    filesystems: Arc<parking_lot::RwLock<Vec<crate::fs::FS>>>,
    source_cache: Arc<parking_lot::RwLock<crate::SourceCache<P, T>>>,
  ) -> Arc<Self> {
    // Leave one core free, but always keep at least one worker.
    let available = num_cpus::get();
    let worker_count = if available > 1 { available - 1 } else { 1 };
    let config = SchedulerConfiguration::default();

    Self::new_with_config(
      connection,
      server,
      filesystems,
      source_cache,
      worker_count,
      config,
    )
  }

  /// Creates a threaded scheduler with an explicit worker count and the
  /// default configuration.
  #[cfg_attr(not(test), allow(dead_code))]
  pub(crate) fn new_with_worker_count(
    connection: Connection,
    server: Arc<T>,
    filesystems: Arc<parking_lot::RwLock<Vec<crate::fs::FS>>>,
    source_cache: Arc<parking_lot::RwLock<crate::SourceCache<P, T>>>,
    worker_count: usize,
  ) -> Arc<Self> {
    let default_config = SchedulerConfiguration::default();

    Self::new_with_config(
      connection,
      server,
      filesystems,
      source_cache,
      worker_count,
      default_config,
    )
  }

  /// Creates a threaded scheduler with an explicit worker count and
  /// configuration, backed by a freshly created client registry.
  #[cfg_attr(not(test), allow(dead_code))]
  pub(crate) fn new_with_config(
    connection: Connection,
    server: Arc<T>,
    filesystems: Arc<parking_lot::RwLock<Vec<crate::fs::FS>>>,
    source_cache: Arc<parking_lot::RwLock<crate::SourceCache<P, T>>>,
    worker_count: usize,
    config: SchedulerConfiguration,
  ) -> Arc<Self>
  where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    // This constructor gets its own private registry.
    let fresh_registry = Arc::new(ClientRegistry::new());

    Self::new_inner(
      connection,
      server,
      filesystems,
      source_cache,
      worker_count,
      config,
      fresh_registry,
      RunMode::Threaded,
    )
  }

  /// Creates a threaded scheduler for daemon use, sharing the caller's client
  /// `registry`.
  ///
  /// No client connection is supplied here, so the scheduler's own
  /// `connection` is a placeholder built from both ends of a fresh unbounded
  /// channel.
  pub fn new_daemon(
    server: Arc<T>,
    filesystems: Arc<parking_lot::RwLock<Vec<crate::fs::FS>>>,
    source_cache: Arc<parking_lot::RwLock<crate::SourceCache<P, T>>>,
    worker_count: usize,
    config: SchedulerConfiguration,
    registry: Arc<ClientRegistry>,
  ) -> Arc<Self>
  where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    let (sender, receiver) = async_channel::unbounded();
    let placeholder = Connection { sender, receiver };

    Self::new_inner(
      placeholder,
      server,
      filesystems,
      source_cache,
      worker_count,
      config,
      registry,
      RunMode::Threaded,
    )
  }

  /// Creates a single-connection scheduler that runs in [`RunMode::Inline`]:
  /// it is driven by an external async executor rather than by its own worker
  /// threads (see [`Scheduler::run_inline`]). Used by the test harness so an
  /// entire test runs on one thread.
  ///
  /// The worker count is fixed at 1 and a fresh client registry is created.
  #[cfg(feature = "test")]
  pub(crate) fn new_inline(
    connection: Connection,
    server: Arc<T>,
    filesystems: Arc<parking_lot::RwLock<Vec<crate::fs::FS>>>,
    source_cache: Arc<parking_lot::RwLock<crate::SourceCache<P, T>>>,
    config: SchedulerConfiguration,
  ) -> Arc<Self>
  where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    let fresh_registry = Arc::new(ClientRegistry::new());
    let inline_mode = RunMode::Inline {
      wake: event_listener::Event::new(),
    };

    Self::new_inner(
      connection,
      server,
      filesystems,
      source_cache,
      1,
      config,
      fresh_registry,
      inline_mode,
    )
  }

  /// Shared constructor behind every public `new*` entry point.
  ///
  /// Builds the progress tracker (registering `connection`'s sender for
  /// [`ClientId::INTERNAL`]), the database, the reaper and the garbage
  /// collector, assembles the scheduler with empty lane queues and zeroed
  /// counters, and finally hands the source cache a weak back-reference to
  /// the new scheduler.
  ///
  /// No worker threads are started here; see `spawn_workers`.
  fn new_inner(
    connection: Connection,
    server: Arc<T>,
    filesystems: Arc<parking_lot::RwLock<Vec<crate::fs::FS>>>,
    source_cache: Arc<parking_lot::RwLock<crate::SourceCache<P, T>>>,
    worker_count: usize,
    config: SchedulerConfiguration,
    registry: Arc<ClientRegistry>,
    run_mode: RunMode,
  ) -> Arc<Self>
  where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    otel::span!("laburnum.scheduler.new");

    let shutdown_flag = Arc::new(AtomicBool::new(false));

    let progress_tracker = Arc::new(ProgressTracker::new_disconnected());

    // Only the sender is needed by the tracker; the connection itself is
    // moved into the scheduler below.
    progress_tracker
      .register_client(ClientId::INTERNAL, connection.sender.clone());

    let db = Database::new();
    let reaper = Reaper::new(db.cas.stores_arc());
    let gc = GarbageCollector::new();

    // Every owned argument is moved into the struct: none of them is used
    // again afterwards, so cloning them first would be wasted work.
    let s = Arc::new(Self {
      db,
      connection,
      filesystems,
      source_cache,
      lane_queues: std::array::from_fn(|_| ConcurrentQueue::unbounded()),
      rpc_rotation_lock: parking_lot::Mutex::new(()),
      worker_threads: parking_lot::RwLock::new(Vec::new()),
      worker_count,
      run_mode,
      server,
      shutdown_flag,
      config,
      shutdown_requested: Arc::new(AtomicBool::new(false)),
      progress_tracker,
      work_in_flight: AtomicUsize::new(0),
      idle_debounce_armed: AtomicBool::new(false),
      pending_redispatch: parking_lot::Mutex::new(HashMap::new()),
      pending_redispatch_count: AtomicUsize::new(0),
      registry,
      reaper,
      gc,
      active_epochs: parking_lot::Mutex::new(BTreeMap::new()),
    });

    // Weak, so the cache does not keep the scheduler alive.
    s.source_cache.write().set_scheduler(Arc::downgrade(&s));

    s
  }

  /// Returns the client registry.
  ///
  /// The registry is returned by reference; clone the `Arc` to keep a handle
  /// beyond the borrow of `self`.
  pub fn registry(&self) -> &Arc<ClientRegistry> {
    &self.registry
  }

  /// Returns a [`QueryClient`] for reading the database directly in tests.
  ///
  /// The client is built from a clone of the scheduler's database handle.
  /// Only available with the `test` feature.
  #[cfg(feature = "test")]
  pub fn query_client(&self) -> crate::database::query::QueryClient<P> {
    crate::database::query::QueryClient::new(self.db.clone())
  }

  /// Returns a [`SourceCacheReader`] for resolving spans/idents in tests.
  ///
  /// Takes the source cache's read lock for the duration of the call.
  /// Only available with the `test` feature.
  ///
  /// [`SourceCacheReader`]: crate::source::cache::reporter::SourceCacheReader
  #[cfg(feature = "test")]
  pub fn source_cache_reader(
    &self,
  ) -> crate::source::cache::reporter::SourceCacheReader {
    self.source_cache.read().reader()
  }

  /// Returns a reference to the source cache for cleanup operations.
  ///
  /// No lock is taken here; callers lock the returned `RwLock` themselves.
  pub(crate) fn source_cache(
    &self,
  ) -> &Arc<parking_lot::RwLock<crate::SourceCache<P, T>>> {
    &self.source_cache
  }

  /// Request graceful daemon shutdown.
  ///
  /// This sets a flag that the daemon server checks, triggering graceful
  /// shutdown. The shutdown will happen after the current request completes.
  ///
  /// Only `shutdown_requested` is written here; `shutdown_flag`, which the
  /// `run` / `run_daemon` loops poll, is not touched by this method.
  pub fn request_shutdown(&self) {
    self.shutdown_requested.store(true, Ordering::Release);
  }

  /// Check if shutdown has been requested.
  ///
  /// Returns `true` once [`request_shutdown`](Self::request_shutdown) has been
  /// called. The `Acquire` load pairs with the `Release` store made there.
  pub fn is_shutdown_requested(&self) -> bool {
    self.shutdown_requested.load(Ordering::Acquire)
  }

  /// Builds (but does not queue) the RPC task serving `client_id` over
  /// `connection`.
  ///
  /// The task receives a handle to this scheduler, the shared language
  /// server, the caller's `shutdown_flag`, and the configured RPC response
  /// capacity.
  pub(crate) fn create_rpc_task_for_client(
    self: &Arc<Self>,
    connection: Connection,
    client_id: ClientId,
    shutdown_flag: Arc<AtomicBool>,
  ) -> Arc<LaburnumTask<P, T>> {
    let scheduler = Arc::clone(self);
    let server = Arc::clone(&self.server);
    let response_capacity = self.config.rpc_response_capacity;

    RpcTask::create(
      scheduler,
      connection,
      client_id,
      server,
      shutdown_flag,
      response_capacity,
    )
  }

  /// Creates the RPC task for `client_id` and queues it on the RPC lanes via
  /// `queue_rpc_task`, so it takes part in RPC lane rotation.
  pub fn queue_client_rpc_task(
    self: &Arc<Self>,
    connection: Connection,
    client_id: ClientId,
    shutdown_flag: Arc<AtomicBool>,
  ) {
    self.queue_rpc_task(self.create_rpc_task_for_client(
      connection,
      client_id,
      shutdown_flag,
    ));
  }

  /// Returns the progress tracker shared by this scheduler's tasks.
  pub fn progress_tracker(&self) -> &Arc<ProgressTracker> {
    &self.progress_tracker
  }

  /// Record that a `User` (compilation) task has been created. Called once
  /// per task at construction (see [`task::TaskClass`]).
  ///
  /// Increments `work_in_flight`; each call must eventually be balanced by
  /// one call to `user_task_finished`.
  pub(crate) fn user_task_started(&self) {
    self.work_in_flight.fetch_add(1, Ordering::AcqRel);
  }

  /// Record that a `User` task has completed.
  ///
  /// Decrements the in-flight counter; the call that takes it from one to
  /// zero marks the pipeline as quiescent and arms the idle debounce.
  pub(crate) fn user_task_finished(self: &Arc<Self>) {
    // `fetch_sub` hands back the value from *before* the decrement.
    let in_flight_before = self.work_in_flight.fetch_sub(1, Ordering::AcqRel);
    let reached_zero = in_flight_before == 1;

    if reached_zero {
      self.arm_idle_debounce();
    }
  }

  /// Arm a one-shot `System` task that, after a short debounce, ends all
  /// open progress iff the scheduler is still quiescent.
  ///
  /// The debounce absorbs transient dips to zero between pipeline stages
  /// (a stage that spawns the next stage normally keeps the count above
  /// zero, but this is robust even if it briefly doesn't) and coalesces
  /// rapid successive edits into a single progress run — matching how
  /// rust-analyzer ends progress on queue drain rather than per-stage.
  ///
  /// The task is queued on `IDLE_LANE` for `ClientId::INTERNAL` and returns
  /// `None`, i.e. it produces no record writer.
  fn arm_idle_debounce(self: &Arc<Self>) {
    // Singleton: only one debounce in flight at a time. If one is already
    // armed, the count touching zero again is harmless — that debounce
    // will observe the latest state when it fires.
    if self
      .idle_debounce_armed
      .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
      .is_err()
    {
      return;
    }

    // Copy the delay out now so the task body does not need to reach back
    // into `self.config`.
    let delay = self.config.idle_debounce;
    let scheduler = self.clone();
    let task = LaburnumTask::new_system_with_parent(
      self.clone(),
      move |_ctx| {
        Box::pin(async move {
          smol::Timer::after(delay).await;
          // Clear the guard BEFORE re-reading the counter: a
          // 0 -> n -> 0 cycle during the sleep must be free to re-arm,
          // otherwise that quiescence edge would be lost.
          scheduler.idle_debounce_armed.store(false, Ordering::Release);
          // Only report idle if no user task started during the delay.
          if scheduler.work_in_flight.load(Ordering::Acquire) == 0 {
            scheduler.progress_tracker.on_idle();
          }
          None
        })
      },
      lanes::IDLE_LANE,
      None,
      ClientId::INTERNAL,
    );
    self.queue_task(task);
  }

  /// Register a `pending_deps` re-dispatch (GLD-0035): when a key matching
  /// `condition` is later written to (or deleted from) `partition`, run
  /// `thunk` (which re-dispatches the watcher that recorded the dependency).
  ///
  /// `owner` identifies the registering watcher instance so that
  /// `clear_redispatch_for_owner` can later drop this registration.
  ///
  /// The registration counter is incremented while the registry lock is still
  /// held. Otherwise a concurrent `fire_pending_redispatch` could remove the
  /// new entry and decrement the counter before this increment lands,
  /// transiently wrapping the unsigned count.
  pub(crate) fn register_pending_redispatch(
    &self,
    owner: Ident,
    partition: crate::SpannedIdent,
    condition: SortKeyCondition<P::SortKey>,
    thunk: ReDispatch,
  ) {
    let mut registry = self.pending_redispatch.lock();
    registry
      .entry(partition.ident())
      .or_default()
      .push((owner, condition, thunk));
    // Still under the lock: the entry and its count become visible together.
    self.pending_redispatch_count.fetch_add(1, Ordering::AcqRel);
  }

  /// Drops every re-dispatch registration that belongs to `owner`.
  ///
  /// A watcher calls this before recording the subscriptions of its current
  /// run, so that its dependency set mirrors exactly what this run read
  /// instead of piling up stale entries across re-dispatches (ADR0008).
  /// Partitions left without any registration are removed from the registry,
  /// and the lock-free counter is lowered by the number of entries dropped.
  pub(crate) fn clear_redispatch_for_owner(&self, owner: Ident) {
    // Nothing registered at all: skip the mutex entirely.
    if self.pending_redispatch_count.load(Ordering::Acquire) == 0 {
      return;
    }

    let mut dropped = 0usize;
    let mut registry = self.pending_redispatch.lock();

    registry.retain(|_partition, registrations| {
      let len_before = registrations.len();
      registrations.retain(|(entry_owner, ..)| *entry_owner != owner);
      dropped += len_before - registrations.len();
      !registrations.is_empty()
    });

    if dropped == 0 {
      return;
    }

    // The registry lock is still held while the counter is adjusted.
    self
      .pending_redispatch_count
      .fetch_sub(dropped, Ordering::AcqRel);
  }

  /// Runs the `pending_deps` re-dispatch thunks whose condition matches a key
  /// this commit inserted or deleted; registrations that do not match stay in
  /// the registry.
  ///
  /// Mirrors the database's `wake_partition_waiters`, except that it
  /// re-dispatches a completed watcher rather than waking a parked future.
  /// Thunks are collected under the registry lock but invoked only after it
  /// has been released.
  fn fire_pending_redispatch(&self, result: &crate::database::CommitResult<P>) {
    // Hot path: with no registrations (the common case) the mutex is never
    // touched.
    if self.pending_redispatch_count.load(Ordering::Acquire) == 0 {
      return;
    }

    let mut ready: Vec<ReDispatch> = Vec::new();
    {
      let mut registry = self.pending_redispatch.lock();
      for pk in result.affected_partition_keys() {
        let partition = pk.ident();
        let Some(entries) = registry.remove(&partition) else {
          continue;
        };

        // The commit's keys for this partition are the same for every entry.
        let inserted = result.inserted_keys.get(&pk);
        let deleted = result.deleted_keys.get(&pk);

        let (matched, kept): (Vec<_>, Vec<_>) =
          entries.into_iter().partition(|(_, condition, _)| {
            inserted
              .into_iter()
              .flat_map(|keys| keys.iter())
              .chain(deleted.into_iter().flat_map(|keys| keys.iter()))
              .any(|rk| condition.matches(rk.sort_key()))
          });

        ready.extend(matched.into_iter().map(|(_, _, thunk)| thunk));

        if !kept.is_empty() {
          registry.insert(partition, kept);
        }
      }
    }

    if !ready.is_empty() {
      self
        .pending_redispatch_count
        .fetch_sub(ready.len(), Ordering::AcqRel);
    }

    // Fire outside the lock — each thunk queues a task and notifies workers.
    for thunk in ready {
      thunk();
    }
  }

  /// Runs the scheduler in daemon mode until `shutdown_flag` is set.
  ///
  /// Queues, in order: the idle monitor (only when `config.idle_timeout` is
  /// set), the daemon task that receives `ipc_server`, and the periodic GC
  /// task (only when enabled in the scheduler configuration). It then spawns
  /// the workers and parks the calling thread, re-checking the shutdown flag
  /// every 100ms. After shutdown the workers are notified and joined; a
  /// worker that fails to join is logged, not propagated.
  pub fn run_daemon(
    self: &Arc<Self>,
    ipc_server: IpcServer,
    config: DaemonConfig,
  ) where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    otel::span!("laburnum.scheduler.run_daemon");

    // Shared between the idle monitor and the daemon task.
    let idle_triggered = Arc::new(AtomicBool::new(false));

    if let Some(idle_timeout) = config.idle_timeout {
      let monitor = LaburnumTask::new_system_with_parent(
        Arc::clone(self),
        idle_monitor_task(
          Arc::clone(&self.shutdown_flag),
          Arc::clone(&idle_triggered),
          idle_timeout,
        ),
        lanes::IDLE_LANE,
        None,
        ClientId::INTERNAL,
      );
      self.queue_task(monitor);
    }

    let daemon =
      DaemonTask::create(Arc::clone(self), ipc_server, config, idle_triggered);
    self.queue_task(daemon);

    if self.config.enable_periodic_gc {
      let periodic_gc = LaburnumTask::new_system_with_parent(
        Arc::clone(self),
        gc::periodic_gc_task(Arc::clone(&self.shutdown_flag)),
        lanes::IDLE_LANE,
        None,
        ClientId::INTERNAL,
      );
      self.queue_task(periodic_gc);
    }

    self.spawn_workers();

    // Park this thread; the timeout bounds how late shutdown is noticed.
    loop {
      if self.shutdown_flag.load(Ordering::Acquire) {
        break;
      }
      std::thread::park_timeout(Duration::from_millis(100));
    }

    // Wake any parked workers so they can observe the shutdown flag.
    self.notify_workers();

    let handles = std::mem::take(&mut *self.worker_threads.write());

    for handle in handles {
      if let Err(panic_payload) = handle.join() {
        otel::error!(
          "worker_thread_join_failed",
          format!("Failed to join worker thread: {:?}", panic_payload)
        );
      }
    }
  }

  /// Spawns `worker_count` worker threads and stores their join handles.
  ///
  /// Each worker runs a loop that scans the lanes from high to low priority,
  /// pops the first available task, executes it, and repeats. Workers park
  /// when no work is available and are woken when new tasks arrive.
  ///
  /// The trace context of the current span is captured once and a copy is
  /// passed to every worker. The stored handle list is replaced wholesale.
  pub fn spawn_workers(self: &Arc<Self>) {
    let trace_context =
      crate::protocol::otel::TraceContext::from_current_span();

    // Spawn every worker before taking the write lock on the handle list.
    let handles: Vec<_> = (0..self.worker_count)
      .map(|id| {
        worker::Worker::spawn(id, Arc::clone(self), trace_context.clone())
      })
      .collect();

    *self.worker_threads.write() = handles;
  }

  /// Queues a new async task on the given priority lane.
  ///
  /// # Parameters
  ///
  /// - `task_fn`: called once with a [`TaskContext`]; the future it returns
  ///   performs the work
  /// - `lane`: priority lane (see the [`lanes`] module for the options)
  ///
  /// # Task Function
  ///
  /// The future returned by `task_fn` resolves to:
  /// - `Some(RecordWriter)`: the task completed and its chunk should be
  ///   written to the database
  /// - `None`: the task was cancelled or produced no output (no chunk
  ///   written)
  ///
  /// The task is created on behalf of [`ClientId::INTERNAL`].
  ///
  /// # Example
  ///
  /// ```ignore
  /// scheduler.queue(move |ctx| async move {
  ///   let result = parse_file(file_content).await;
  ///
  ///   let mut writer = ctx.create_writer(task_id);
  ///   writer.insert(partition_key, sort_key, result);
  ///
  ///   Some(writer)
  /// }, DEFAULT_LANE);
  /// ```
  pub fn queue<F, Fut>(self: &Arc<Self>, task_fn: F, lane: Lane)
  where
    F: FnOnce(TaskContext<P, T>) -> Fut + Send + 'static,
    Fut: Future<Output = Option<RecordWriter<P>>> + Send + 'static,
  {
    let task =
      LaburnumTask::new(Arc::clone(self), task_fn, lane, ClientId::INTERNAL);
    self.queue_task(task);
  }

  /// Pushes `task` onto the queue for its lane and wakes the workers.
  ///
  /// The lane index comes from `lane_priority(task.lane)`. An index outside
  /// `lane_queues` is rejected: a message is printed to stderr and the task
  /// is dropped without being queued.
  ///
  /// If the push onto the task's own lane fails, the task is retried on each
  /// lower-indexed lane in turn, down to lane 0. Every failed push is logged.
  /// Workers are notified afterwards whether or not a push succeeded.
  pub(crate) fn queue_task(&self, task: Arc<LaburnumTask<P, T>>) {
    let mut lane_idx = lane_priority(task.lane) as usize;
    // Valid indices are `0..lane_queues.len()`. Comparing against the array
    // length (rather than a literal) also rejects the one-past-the-end
    // index, which would otherwise be silently demoted to the last lane.
    if lane_idx >= self.lane_queues.len() {
      eprintln!("unable to push task onto queue: lane out of bounds");

      return;
    }

    // TODO: we use unbounded channels, so we shouldn't ever have an issue
    // adding a task but it could fail, and we need to handle it better.

    // Try the task's own lane first, then fall back one index at a time.
    // Lane 0 is handled after the loop so the final attempt can move `task`
    // instead of cloning it.
    while lane_idx > 0 {
      if let Some(lane) = self.lane_queues.get(lane_idx) {
        match lane.push(task.clone()) {
          | Ok(_) => {
            break;
          },
          | Err(_) => {
            otel::error!(
              "scheduler.lane_push_failed",
              "lane queue is full",
              "lane_idx" = lane_idx as i64
            );
          },
        }
      }
      lane_idx -= 1;
    }

    // Reached only when the task targeted lane 0 or every higher-indexed
    // attempt failed.
    if lane_idx == 0
      && let Some(lane) = self.lane_queues.get(lane_idx)
      && let Err(_) = lane.push(task)
    {
      otel::error!("scheduler.lowest_lane_push_failed", "lane queue is full");
    }

    self.notify_workers();
  }

  /// Queues an RPC task using priority-aging semantics.
  ///
  /// RPC tasks live in the lanes from `RPC_LANE_HIGH_IDX` to
  /// `RPC_LANE_LOW_IDX`, where older tasks have higher priority. Under the
  /// rotation lock this method:
  ///
  /// 1. **Ages existing tasks**: walking from the highest-priority RPC lane
  ///    downwards, everything queued in a lane is moved into the lane just
  ///    above it, so tasks drift toward the highest RPC lane
  /// 2. **Adds the new task last**: `task` enters at the lowest-priority RPC
  ///    lane, behind everything already queued
  ///
  /// Workers are notified afterwards. Push results are ignored.
  pub(crate) fn queue_rpc_task(&self, task: Arc<LaburnumTask<P, T>>) {
    use lanes::{RPC_LANE_HIGH_IDX, RPC_LANE_LOW_IDX};

    // Held for the whole rotation so concurrent callers cannot interleave.
    let _rotation = self.rpc_rotation_lock.lock();

    for dest_idx in RPC_LANE_HIGH_IDX..RPC_LANE_LOW_IDX {
      let src = &self.lane_queues[dest_idx + 1];
      let dest = &self.lane_queues[dest_idx];

      // Drain the lower-priority lane into its higher-priority neighbour.
      while let Ok(aged) = src.pop() {
        let _ = dest.push(aged);
      }
    }

    let newest_lane = &self.lane_queues[RPC_LANE_LOW_IDX];
    let _ = newest_lane.push(task);

    self.notify_workers();
  }

  /// Queues the tasks a single-connection scheduler starts with: the RPC task
  /// for the scheduler's own connection (as [`ClientId::INTERNAL`]) and, when
  /// enabled in the configuration, the periodic GC task on `IDLE_LANE`.
  pub(crate) fn add_initial_tasks(self: &Arc<Self>)
  where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    let rpc_pump = RpcTask::create(
      Arc::clone(self),
      self.connection.clone(),
      ClientId::INTERNAL,
      Arc::clone(&self.server),
      Arc::clone(&self.shutdown_flag),
      self.config.rpc_response_capacity,
    );
    self.queue_task(rpc_pump);

    if !self.config.enable_periodic_gc {
      return;
    }

    let periodic_gc = LaburnumTask::new_system_with_parent(
      Arc::clone(self),
      gc::periodic_gc_task(Arc::clone(&self.shutdown_flag)),
      lanes::IDLE_LANE,
      None,
      ClientId::INTERNAL,
    );
    self.queue_task(periodic_gc);
  }

  /// Runs the scheduler until shutdown is flagged.
  ///
  /// This is the main event loop for the server. It:
  /// 1. Queues the initial tasks
  /// 2. Spawns the worker threads
  /// 3. Parks the calling thread, waking every 100ms to check the shutdown
  ///    flag (the workers handle all work)
  /// 4. On shutdown, notifies and joins all worker threads
  ///
  /// # Blocking
  ///
  /// This method blocks until the server shuts down (typically when the LSP
  /// client disconnects).
  ///
  /// # Worker Thread Behavior
  ///
  /// Worker threads continuously poll lane queues for tasks. When all lanes
  /// are empty, workers park until notified by `queue()` or `queue_task()`.
  /// A worker that fails to join is logged rather than propagated.
  pub fn run(self: &Arc<Self>)
  where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    otel::span!("laburnum.scheduler.run");

    self.add_initial_tasks();

    self.spawn_workers();

    // Park this thread; the timeout bounds how late shutdown is noticed.
    loop {
      if self.shutdown_flag.load(Ordering::Acquire) {
        break;
      }
      std::thread::park_timeout(Duration::from_millis(100));
    }

    // Wake any parked workers so they can observe the shutdown flag.
    self.notify_workers();

    let handles = std::mem::take(&mut *self.worker_threads.write());

    for handle in handles {
      if let Err(panic_payload) = handle.join() {
        otel::error!(
          "worker_thread_join_failed",
          format!("Failed to join worker thread: {:?}", panic_payload)
        );
      }
    }
  }
}

/// Stable id for a watcher instance: its handler partition plus the matched
/// keys it replays. A re-dispatch replays identical keys, so this is stable
/// across re-runs and scopes that instance's subscriptions for clear-and-rebuild.
///
/// The id is a content hash over, in order: the handler partition ident, the
/// number of updated keys, each updated key's ident, then each deleted key's
/// ident. The count marks where the updated keys end, so the same key
/// sequence split differently between `updated` and `deleted` (for example a
/// key that was updated in one dispatch and deleted in another) hashes to a
/// different id.
///
/// NOTE(review): the ids are only compared against other ids produced by this
/// function; confirm nothing persists them before relying on that.
pub(crate) fn watcher_owner_id<P: Partitions>(
  task_pk: crate::SpannedIdent,
  updated: &[crate::database::RecordKey<P>],
  deleted: &[crate::database::RecordKey<P>],
) -> crate::Ident {
  let mut hasher = crate::hash::ContentHasher::new();
  hasher.update(&task_pk.ident().to_bytes());
  // Delimit the two key lists; without this they run together in the hash.
  hasher.update(&(updated.len() as u64).to_le_bytes());
  for k in updated {
    hasher.update(&k.key_ident().to_bytes());
  }
  for k in deleted {
    hasher.update(&k.key_ident().to_bytes());
  }
  hasher.finish_ident()
}

/// Spawn a watcher task for `handler_fn` over the given matched keys.
///
/// Factored out so the same task body backs both the initial dispatch (from
/// [`Scheduler::on_new_chunk`]) and `pending_deps` re-dispatch: after the
/// handler returns, each dependency it recorded via
/// [`TaskContext::defer_until`](task::TaskContext::defer_until) registers a
/// re-dispatch thunk that calls this function again with the same matched keys
/// (GLD-0035).
///
/// The task is queued on [`lanes::DEFAULT_LANE`] as an internal-client task
/// with `parent_task_id` as its parent. When polled, its body, in order:
///
/// 1. hands `updated` / `deleted` to the task context and awaits `handler_fn`
///    against a fresh [`RecordWriter`] keyed by `task_pk`;
/// 2. queues every follow-up the handler returned as its own task on that
///    follow-up's lane, each with its own writer;
/// 3. clears this instance's previous re-dispatch registrations and registers
///    one thunk per pending dependency taken from the context;
/// 4. yields the handler's writer as the task result.
fn spawn_watcher_task<P: Partitions, T: LanguageServer<P>>(
  scheduler: Arc<Scheduler<P, T>>,
  task_pk: crate::SpannedIdent,
  updated: Vec<crate::database::RecordKey<P>>,
  deleted: Vec<crate::database::RecordKey<P>>,
  handler_fn: WatcherHandlerFn<P, T>,
  parent_task_id: Option<crate::Ident>,
) {
  let body_scheduler = scheduler.clone();
  let task = LaburnumTask::new_with_parent(
    scheduler.clone(),
    move |mut ctx: TaskContext<P, T>| {
      let scheduler = body_scheduler;
      async move {
        // The key lists are cloned here because they are needed again below
        // for the owner id and for each re-dispatch thunk.
        ctx.set_matched_keys(updated.clone(), deleted.clone());

        let mut writer = RecordWriter::new(task_pk.ident());
        let mut writer_ctx =
          crate::database::PartitionWriteContextRef::new_for_watcher(
            &mut writer,
            task_pk,
          );

        let result = handler_fn(&mut ctx, &mut writer_ctx).await;

        // Each follow-up runs as a separate task with its own writer. The
        // scheduler handle is cloned once, straight into the new task; the
        // follow-up closure itself does not capture it.
        for follow_up in result.follow_ups {
          scheduler.queue_task(LaburnumTask::new(
            scheduler.clone(),
            move |mut ctx| async move {
              let mut writer = RecordWriter::new(task_pk.ident());
              let mut writer_ctx =
                crate::database::PartitionWriteContextRef::new_for_watcher(
                  &mut writer,
                  task_pk,
                );
              (follow_up.task_fn)(&mut ctx, &mut writer_ctx).await;
              Some(writer)
            },
            follow_up.lane,
            ClientId::INTERNAL,
          ));
        }

        // Subscriptions + explicit defers -> re-dispatch. Clear this instance's
        // prior registrations first so the set reflects exactly this run's
        // reads, not an accumulation across re-dispatches (ADR0008).
        let owner = watcher_owner_id::<P>(task_pk, &updated, &deleted);
        scheduler.clear_redispatch_for_owner(owner);
        for (dep_pk, condition) in ctx.take_pending_deps() {
          // Every thunk owns its own scheduler handle and key lists so it can
          // replay this exact dispatch on its own.
          let sched = scheduler.clone();
          let updated = updated.clone();
          let deleted = deleted.clone();
          scheduler.register_pending_redispatch(
            owner,
            dep_pk,
            condition,
            Box::new(move || {
              spawn_watcher_task(
                sched,
                task_pk,
                updated,
                deleted,
                handler_fn,
                parent_task_id,
              );
            }),
          );
        }

        Some(writer)
      }
    },
    lanes::DEFAULT_LANE,
    parent_task_id,
    ClientId::INTERNAL,
  );
  scheduler.queue_task(task);
}

impl<P: Partitions, T: LanguageServer<P>> Scheduler<P, T> {
  /// Returns a new shared handle to the language server owned by this
  /// scheduler.
  pub fn server(&self) -> Arc<T> {
    Arc::clone(&self.server)
  }

  /// Reacts to a chunk that has just been written to the database.
  ///
  /// Takes the commit's `CommitResult`, whose inserted and deleted keys are
  /// already grouped per partition key. Pending re-dispatches satisfied by the
  /// commit are fired first; then, for every affected partition key, the
  /// built-in watchers and the server's watchers are dispatched with that
  /// partition's inserted and deleted keys.
  ///
  /// `task_id` is the task that produced the commit; it becomes the parent of
  /// any built-in watcher task spawned here.
  pub(crate) fn on_new_chunk(
    self: &Arc<Self>,
    task_id: crate::Ident,
    result: crate::database::CommitResult<P>,
  ) {
    let inserted_total: usize =
      result.inserted_keys.values().map(Vec::len).sum();
    let deleted_total: usize =
      result.deleted_keys.values().map(Vec::len).sum();

    otel::span!(
      "laburnum.scheduler.on_new_chunk",
      "inserted_keys.count" = inserted_total as i64,
      "deleted_keys.count" = deleted_total as i64
    );

    // Re-dispatch any watcher whose recorded `pending_deps` are satisfied by a
    // key written or deleted in this commit (GLD-0035).
    self.fire_pending_redispatch(&result);

    for pk in result.affected_partition_keys() {
      // A partition may appear on only one side of the commit; the missing
      // side is an empty list.
      let updated: Vec<crate::database::RecordKey<P>> =
        result.inserted_keys.get(&pk).cloned().unwrap_or_default();
      let deleted: Vec<crate::database::RecordKey<P>> =
        result.deleted_keys.get(&pk).cloned().unwrap_or_default();

      // Built-in watchers are children of the originating task; server
      // watchers are roots (parent `None`), preserving the prior behaviour.
      let builtin_scheduler = Arc::clone(self);
      dispatch_builtin_watcher(
        pk,
        updated.clone(),
        deleted.clone(),
        move |task_pk, filtered_updated, filtered_deleted, handler_fn| {
          spawn_watcher_task(
            Arc::clone(&builtin_scheduler),
            task_pk,
            filtered_updated,
            filtered_deleted,
            handler_fn,
            Some(task_id),
          );
        },
      );

      let server_scheduler = Arc::clone(self);
      T::dispatch_watcher(
        pk,
        updated,
        deleted,
        move |task_pk, filtered_updated, filtered_deleted, handler_fn| {
          spawn_watcher_task(
            Arc::clone(&server_scheduler),
            task_pk,
            filtered_updated,
            filtered_deleted,
            handler_fn,
            None,
          );
        },
      );
    }
  }

  /// Records one more running task at `epoch` in the active epoch registry.
  ///
  /// The registry keeps a per-epoch count: an epoch seen for the first time
  /// starts at one, an epoch already present is incremented. Each call is
  /// meant to be balanced by one
  /// [`deregister_active_epoch`](Self::deregister_active_epoch) for the same
  /// epoch when the task completes.
  pub(crate) fn register_active_epoch(&self, epoch: GenerationEpoch) {
    self
      .active_epochs
      .lock()
      .entry(epoch)
      .and_modify(|count| *count += 1)
      .or_insert(1);
  }

  /// Drops one running task at `epoch` from the active epoch registry.
  ///
  /// Called when a task completes. The epoch's count is decremented, and the
  /// entry is removed once it reaches zero. An epoch that is not in the
  /// registry is ignored.
  pub(crate) fn deregister_active_epoch(&self, epoch: GenerationEpoch) {
    let mut epochs = self.active_epochs.lock();

    let Some(remaining) = epochs.get_mut(&epoch) else {
      return;
    };

    *remaining -= 1;
    let exhausted = *remaining == 0;
    if exhausted {
      epochs.remove(&epoch);
    }
  }

  /// Returns the oldest epoch at which any task is still running.
  ///
  /// This is the first key the active epoch registry yields — the minimum,
  /// given that the registry iterates its keys in ascending order. With no
  /// task registered, the database's current epoch is returned instead (all
  /// epochs are safe to reap).
  pub(crate) fn oldest_running_epoch(&self) -> GenerationEpoch {
    let epochs = self.active_epochs.lock();
    let oldest_active = epochs.keys().next().copied();

    // The registry lock is still held while falling back to the database.
    match oldest_active {
      | Some(epoch) => epoch,
      | None => self.db.get_current_epoch(),
    }
  }

  /// Wakes whatever is draining the lane queues, according to the run mode.
  ///
  /// In `Threaded` mode every registered worker thread is unparked; in
  /// `Inline` mode the pump's wake event is notified.
  fn notify_workers(&self) {
    match &self.run_mode {
      | RunMode::Threaded => {
        // The read guard is held for the duration of the loop.
        for worker in self.worker_threads.read().iter() {
          worker.thread().unpark();
        }
      },
      #[cfg(feature = "test")]
      | RunMode::Inline { wake } => {
        wake.notify(usize::MAX);
      },
    }
  }

  /// Pops the highest-priority ready task, or `None` when every lane is empty.
  ///
  /// Lanes are tried in `lane_queues` order, from lane 0 (highest) to lane 30
  /// (lowest) — the same priority order as a worker's `steal_work`. The first
  /// lane whose pop succeeds supplies the task. Used by the inline driver.
  #[cfg(feature = "test")]
  fn next_ready_task(&self) -> Option<Arc<LaburnumTask<P, T>>> {
    for lane in self.lane_queues.iter() {
      if let Ok(task) = lane.pop() {
        return Some(task);
      }
    }
    None
  }

  /// Reports whether at least one lane queue is non-empty.
  #[cfg(feature = "test")]
  fn has_ready_work(&self) -> bool {
    let every_lane_empty = self.lane_queues.iter().all(|lane| lane.is_empty());
    !every_lane_empty
  }

  /// Queues the initial tasks and then drives the lane queues inline until
  /// shutdown — the [`RunMode::Inline`] counterpart to [`run`](Self::run).
  /// Spawn this on the external executor that owns the scheduler's thread.
  ///
  /// This is [`drive`](Self::drive) preceded by `add_initial_tasks`.
  #[cfg(feature = "test")]
  pub(crate) async fn run_inline(self: Arc<Self>)
  where
    T: crate::hooks::LaburnumHooks<P, T>,
  {
    self.add_initial_tasks();
    self.drive().await;
  }

  /// Drives the lane queues on the calling async executor until shutdown.
  ///
  /// This is the [`RunMode::Inline`] counterpart to the worker-thread pool: an
  /// external executor (e.g. one per test) polls this future on its own
  /// thread, so the scheduler runs no threads of its own. Ready tasks are
  /// drained in strict lane priority; when no lane has work the pump parks on
  /// the run mode's wake `Event` until [`queue_task`](Self::queue_task) or a
  /// task waker notifies it.
  ///
  /// Callers are responsible for queueing the initial tasks (RPC pump, accept
  /// loop, …) before — or concurrently with — driving; use
  /// [`run_inline`](Self::run_inline) to have them queued first.
  ///
  /// On a scheduler that is not in [`RunMode::Inline`] this logs an error and
  /// returns immediately without pumping any work.
  #[cfg(feature = "test")]
  pub(crate) async fn drive(self: Arc<Self>) {
    // Only the inline run mode carries a wake event to park on.
    let RunMode::Inline { wake } = &self.run_mode else {
      otel::error!(
        "scheduler.drive_threaded",
        "drive() called on a Threaded scheduler; no work will be pumped"
      );
      return;
    };

    loop {
      // Shutdown is checked before every task so the pump stops promptly.
      if self.shutdown_flag.load(Ordering::Acquire) {
        return;
      }

      if let Some(task) = self.next_ready_task() {
        // A `Pending` task re-queues itself through its waker; a `Ready` task
        // is complete. Either way the pump just moves to the next task.
        let _ = task.poll_once();
        continue;
      }

      // Arm the listener BEFORE re-checking for work: a task queued (and the
      // resulting notify) between the empty check and the listen would
      // otherwise be lost, hanging the single-threaded pump forever.
      let listener = wake.listen();
      if self.has_ready_work() || self.shutdown_flag.load(Ordering::Acquire) {
        // Work or shutdown arrived in the gap: loop again instead of parking.
        continue;
      }
      listener.await;
    }
  }
}

#[cfg(test)]
pub mod tests;