rust-job-queue-api-worker-system 0.1.0

A production-shaped Rust job queue: Axum API + async workers + Postgres SKIP LOCKED dequeue, retries with decorrelated jitter, idempotency, cooperative cancellation, OpenAPI, Prometheus metrics.
//! Queue operations: the mutating SQL that drives jobs through their
//! lifecycle.
//!
//! Every state transition lives here, and each one is written as either a
//! single SQL statement or a short transaction. This is deliberate:
//! concurrency invariants are enforced by Postgres rather than by the
//! application, so we cannot create incorrect states through buggy
//! application code paths. Two examples:
//!
//! 1. The dequeue (`fetch_next`) is one atomic `UPDATE ... WHERE id =
//!    (SELECT ... FOR UPDATE SKIP LOCKED LIMIT 1) RETURNING ...`. The row
//!    is locked, the status flipped, and the row returned in a single
//!    server-side step. N workers polling the same table will each see a
//!    disjoint set of rows without blocking on one another.
//!
//! 2. `mark_failed_or_retry` opens a transaction, takes a `FOR UPDATE`
//!    lock on the row to read `attempts` + `max_attempts`, computes the
//!    next state in Rust, then commits the UPDATE — all inside the same
//!    transaction. The lock guarantees no other worker can mutate the row
//!    between the read and the write.
//!
//! The functions in this module are the *only* place where the `jobs`
//! table is mutated. Anything else (API handlers, workers, recovery
//! sweep) must go through this module to inherit its invariants.
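//!
//! For orientation, a sketch of the `jobs` DDL these statements assume.
//! The authoritative definition lives in the migrations; the column
//! types shown here are inferred from the queries in this module:
//!
//! ```sql
//! CREATE TABLE jobs (
//!     id               uuid        PRIMARY KEY,
//!     kind             job_kind    NOT NULL,
//!     payload          jsonb       NOT NULL,
//!     status           job_status  NOT NULL DEFAULT 'queued',
//!     attempts         int         NOT NULL DEFAULT 0,
//!     max_attempts     int         NOT NULL,
//!     last_error       text,
//!     run_at           timestamptz NOT NULL DEFAULT now(),
//!     locked_at        timestamptz,
//!     locked_by        text,
//!     cancel_requested boolean     NOT NULL DEFAULT FALSE,
//!     idempotency_key  text,
//!     created_at       timestamptz NOT NULL DEFAULT now(),
//!     updated_at       timestamptz NOT NULL DEFAULT now()
//! );
//! -- The partial indexes relied on by `enqueue` and `fetch_next`:
//! CREATE UNIQUE INDEX jobs_idempotency_key_uidx
//!     ON jobs (idempotency_key) WHERE idempotency_key IS NOT NULL;
//! CREATE INDEX jobs_ready_idx
//!     ON jobs (run_at) WHERE status IN ('queued', 'retrying');
//! ```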

use chrono::{DateTime, Utc};
use sqlx::postgres::{PgPool, Postgres};
use sqlx::{query, query_as, query_scalar, Transaction};

use crate::domain::{Job, JobKind, JobStatus, NewJob};
use crate::error::JobError;
use crate::ids::JobId;
use crate::retry::backoff_for_attempt;

/// The full column list, in a fixed order, used in every `SELECT` /
/// `RETURNING` projection.
///
/// Keeping this in one place (rather than inlining the columns in each
/// query) means a schema change that adds or removes a column touches
/// exactly two places: the migration and this constant. The alternative,
/// column names inlined across a dozen call sites, leads to drift bugs
/// that take 20 minutes to find.
const JOB_COLUMNS: &str = "id, kind::text AS kind, payload, status::text AS status, \
                           attempts, max_attempts, last_error, run_at, locked_at, locked_by, \
                           cancel_requested, idempotency_key, created_at, updated_at";

/// Pagination and filter input for [`list`].
///
/// Optional filters are `None` to mean "match all". The numeric fields are
/// clamped inside [`list`] so callers cannot accidentally request a
/// 1-million-row page or a negative offset; the input is left permissive
/// here to keep the HTTP DTO unconstrained at the type level.
#[derive(Debug, Clone)]
pub struct ListFilter {
    pub status: Option<JobStatus>,
    pub kind: Option<JobKind>,
    pub limit: i64,
    pub offset: i64,
}

impl Default for ListFilter {
    fn default() -> Self {
        // A 50-row page is the typical "list view" size — big enough to be
        // useful interactively, small enough that a single page does not
        // ship a noticeable amount of JSON on the wire. Production
        // deployments expecting larger pages should set `limit`
        // explicitly; the upper bound (200) is enforced in `list`.
        Self {
            status: None,
            kind: None,
            limit: 50,
            offset: 0,
        }
    }
}

/// Result of [`request_cancel`].
///
/// The three variants correspond to the three reachable cancellation
/// situations: an atomic transition (job was waiting), a deferred
/// transition (job was running and must observe the flag cooperatively),
/// and a no-op (job already finished).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CancelOutcome {
    /// The job was `queued` or `retrying`. Status was atomically set to
    /// `cancelled` inside the same transaction. The API layer maps this
    /// to HTTP 200.
    CancelledNow,
    /// The job was `running`. The `cancel_requested` flag has been set on
    /// the row; the worker will observe it at the next safe point and
    /// transition the job to `cancelled` then. The API layer maps this to
    /// HTTP 202 (Accepted).
    PendingOnWorker,
    /// The job is already in a terminal state (`succeeded`,
    /// `failed_permanent`, or `cancelled`). No state change occurred.
    /// The API layer maps this to HTTP 409 (Conflict).
    AlreadyTerminal(JobStatus),
}

/// Result of [`enqueue`].
///
/// Distinguishes a freshly-created job from one that was found via the
/// idempotency key. The wrapped [`Job`] is the same shape either way;
/// callers that care about the difference (e.g., to return 201 vs 200)
/// can branch on the variant.
#[derive(Debug, Clone)]
pub enum EnqueueOutcome {
    /// A new row was inserted by this call.
    Created(Job),
    /// An existing row matched the idempotency key; returned unchanged.
    Existing(Job),
}

impl EnqueueOutcome {
    /// Borrow the wrapped job regardless of which variant was returned.
    /// Convenient for callers that only need the job representation and
    /// not the disposition.
    pub fn job(&self) -> &Job {
        match self {
            EnqueueOutcome::Created(j) | EnqueueOutcome::Existing(j) => j,
        }
    }

    /// `true` if this call created a new row; `false` if it returned an
    /// existing row by idempotency key.
    pub fn is_new(&self) -> bool {
        matches!(self, EnqueueOutcome::Created(_))
    }
}

/// Insert a new job, or return the existing one if the idempotency key
/// matches.
///
/// The payload is validated against [`crate::payload::validate`] before
/// the insert is attempted; on validation failure the insert is skipped
/// entirely and `Err(JobError::PayloadInvalid)` is returned.
///
/// The insert uses `ON CONFLICT (idempotency_key) WHERE idempotency_key
/// IS NOT NULL DO NOTHING`. If the partial unique index already has a row
/// for this key, the insert is a no-op and `RETURNING` yields no rows;
/// the function then re-fetches the existing row by key. This two-step
/// sequence is safe under concurrency because the unique index
/// serialises the inserts at the DB level.
///
/// # Errors
/// - [`JobError::PayloadInvalid`] if the payload shape doesn't match the kind.
/// - [`JobError::IdempotencyConflict`] in the rare case the partial unique
///   index matched but the follow-up `SELECT` came back empty (the row
///   was deleted between the two queries; not expected in normal use).
/// - [`JobError::Db`] for any underlying Postgres error.
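///
/// # Example
///
/// A minimal sketch of an API-side caller mapping the outcome to the
/// 201-vs-200 distinction described above (`pool` and `new_job` are
/// assumed to be in scope):
///
/// ```rust,ignore
/// let outcome = enqueue(&pool, new_job).await?;
/// // 201 for a fresh insert, 200 for an idempotent replay.
/// let code = if outcome.is_new() { 201 } else { 200 };
/// let job = outcome.job();
/// ```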
pub async fn enqueue(pool: &PgPool, new: NewJob) -> Result<EnqueueOutcome, JobError> {
    crate::payload::validate(new.kind, &new.payload)?;

    let id = JobId::new();
    let max_attempts = new.max_attempts.unwrap_or(3);

    // `ON CONFLICT (col) WHERE pred DO NOTHING` is the Postgres syntax
    // for inferring a partial unique index. The matching index is
    // `jobs_idempotency_key_uidx ON jobs(idempotency_key)
    // WHERE idempotency_key IS NOT NULL`. Rows with a NULL key always
    // succeed at insert; rows with a duplicate non-NULL key are skipped.
    let insert_sql = format!(
        "INSERT INTO jobs (id, kind, payload, max_attempts, idempotency_key) \
         VALUES ($1, $2::job_kind, $3, $4, $5) \
         ON CONFLICT (idempotency_key) WHERE idempotency_key IS NOT NULL DO NOTHING \
         RETURNING {JOB_COLUMNS}"
    );

    let inserted: Option<Job> = query_as(&insert_sql)
        .bind(id.as_uuid())
        .bind(new.kind.as_str())
        .bind(&new.payload)
        .bind(max_attempts)
        .bind(new.idempotency_key.as_deref())
        .fetch_optional(pool)
        .await?;

    if let Some(job) = inserted {
        return Ok(EnqueueOutcome::Created(job));
    }

    // Conflict path: the key already exists. The contract is "same key →
    // same job" (Stripe convention), so we look up the existing row by
    // key and return it. If the caller passed no key at all then there is
    // no key to fetch by, and the no-row return is unexpected — surface
    // as a DB error.
    let key = new
        .idempotency_key
        .as_deref()
        .ok_or_else(|| JobError::Db(sqlx::Error::RowNotFound))?;

    let existing: Job = query_as(&format!(
        "SELECT {JOB_COLUMNS} FROM jobs WHERE idempotency_key = $1"
    ))
    .bind(key)
    .fetch_optional(pool)
    .await?
    .ok_or_else(|| JobError::IdempotencyConflict(key.to_string()))?;

    Ok(EnqueueOutcome::Existing(existing))
}

/// Atomically claim the next ready job for `worker_id`.
///
/// Returns `Ok(None)` when no job is currently eligible (queue empty, or
/// every ready row is already locked by another worker, or every ready
/// row's `run_at` is still in the future).
///
/// The implementation is a single SQL statement:
///
/// ```sql
/// UPDATE jobs
///    SET status = 'running', attempts = attempts + 1, ...
///  WHERE id = (
///        SELECT id FROM jobs
///         WHERE status IN ('queued','retrying') AND run_at <= now()
///           AND cancel_requested = FALSE
///         ORDER BY run_at
///         FOR UPDATE SKIP LOCKED
///         LIMIT 1
///  )
///  RETURNING <columns>
/// ```
///
/// `FOR UPDATE SKIP LOCKED` is what makes this race-free: Postgres takes
/// a row-level lock during the inner SELECT, and any other concurrent
/// `fetch_next` call walking the same index will skip locked rows
/// instead of waiting. Each worker therefore observes a disjoint subset
/// of eligible rows.
///
/// The partial index `jobs_ready_idx ON jobs(run_at) WHERE status IN
/// ('queued','retrying')` is what makes this fast: the index covers
/// exactly the rows the dequeue can claim, so scanning it is O(ready
/// rows) not O(total table size).
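///
/// # Example
///
/// A sketch of a polling worker loop; the poll interval and the
/// `execute` function are illustrative stand-ins, not the worker
/// runtime's actual policy:
///
/// ```rust,ignore
/// loop {
///     match fetch_next(&pool, "worker-1").await? {
///         Some(job) => execute(&pool, job).await,
///         // Nothing eligible: back off briefly before polling again.
///         None => tokio::time::sleep(std::time::Duration::from_millis(500)).await,
///     }
/// }
/// ```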
pub async fn fetch_next(pool: &PgPool, worker_id: &str) -> Result<Option<Job>, JobError> {
    let sql = format!(
        "UPDATE jobs \
            SET status     = 'running'::job_status, \
                attempts   = attempts + 1, \
                locked_at  = now(), \
                locked_by  = $1, \
                updated_at = now() \
          WHERE id = ( \
                SELECT id FROM jobs \
                 WHERE status IN ('queued','retrying') \
                   AND run_at <= now() \
                   AND cancel_requested = FALSE \
                 ORDER BY run_at \
                 FOR UPDATE SKIP LOCKED \
                 LIMIT 1 \
          ) \
          RETURNING {JOB_COLUMNS}"
    );

    let row: Option<Job> = query_as(&sql).bind(worker_id).fetch_optional(pool).await?;

    Ok(row)
}

/// Transition a running job to `succeeded`.
///
/// The UPDATE is guarded by `WHERE id = $1 AND status = 'running'`, so a
/// concurrent cancellation that took the row from `running` to
/// `cancelled` between the executor finishing and this call would yield
/// `rows_affected() == 0` and surface as
/// [`JobError::InvalidTransition`]. Callers may want to retry or log;
/// the worker runtime logs a warning and continues.
///
/// # Errors
/// - [`JobError::InvalidTransition`] if the row was not in `running` state.
/// - [`JobError::Db`] for any underlying Postgres error.
pub async fn mark_succeeded(pool: &PgPool, id: JobId) -> Result<(), JobError> {
    let result = query(
        "UPDATE jobs \
            SET status = 'succeeded'::job_status, \
                locked_at = NULL, \
                locked_by = NULL, \
                updated_at = now() \
          WHERE id = $1 AND status = 'running'",
    )
    .bind(id.as_uuid())
    .execute(pool)
    .await?;

    if result.rows_affected() == 0 {
        // The status guard didn't match. The row is either gone or in an
        // unexpected state (e.g., concurrently cancelled mid-execution).
        // Surface as InvalidTransition so the worker logs informatively
        // rather than swallowing silently.
        return Err(JobError::InvalidTransition {
            id,
            status: "non-running".into(),
            action: "succeed",
        });
    }
    Ok(())
}

/// Transition a running job to either `retrying` (with a future `run_at`)
/// or `failed_permanent`, depending on how many attempts it has consumed.
///
/// The decision is made inside a transaction: the row's current
/// `attempts` and `max_attempts` are read under a `FOR UPDATE` lock, the
/// retry-vs-permanent disposition is computed in Rust (with a
/// decorrelated-jitter backoff for the retry case), and a single UPDATE
/// commits the new state. The lock prevents another worker from
/// concurrently mutating the row between the read and the write.
///
/// Returns the post-update row so callers can branch on
/// `job.status == JobStatus::FailedPermanent` for metrics or alerting.
///
/// # Errors
/// - [`JobError::InvalidTransition`] if the row was not in `running` state.
/// - [`JobError::Db`] for any underlying Postgres error.
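///
/// # Example
///
/// For readers unfamiliar with the shape, a sketch of the classic
/// decorrelated-jitter recurrence with illustrative constants.
/// [`crate::retry::backoff_for_attempt`] is the authoritative
/// implementation and derives its delay from the attempt counter
/// instead of the previous delay:
///
/// ```rust,ignore
/// use rand::Rng;
/// use std::time::Duration;
///
/// fn next_backoff(prev: Duration) -> Duration {
///     let base = Duration::from_secs(1);  // illustrative floor
///     let cap = Duration::from_secs(300); // illustrative ceiling
///     // Sample uniformly between the floor and 3x the previous delay,
///     // clamped to the ceiling: successive delays grow on average but
///     // never synchronise across workers.
///     let upper = (prev * 3).min(cap).max(base);
///     let secs = rand::thread_rng()
///         .gen_range(base.as_secs_f64()..=upper.as_secs_f64());
///     Duration::from_secs_f64(secs)
/// }
/// ```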
pub async fn mark_failed_or_retry(
    pool: &PgPool,
    id: JobId,
    error_msg: &str,
) -> Result<Job, JobError> {
    let mut tx: Transaction<'_, Postgres> = pool.begin().await?;

    // Read the counter columns under a row lock. The guard `AND status =
    // 'running'` makes this also a state check; if the row was cancelled
    // mid-execution the SELECT returns no rows and we surface
    // InvalidTransition.
    let (attempts, max_attempts): (i32, i32) = query_as(
        "SELECT attempts, max_attempts FROM jobs WHERE id = $1 AND status = 'running' FOR UPDATE",
    )
    .bind(id.as_uuid())
    .fetch_optional(&mut *tx)
    .await?
    .ok_or(JobError::InvalidTransition {
        id,
        status: "non-running".into(),
        action: "fail",
    })?;

    // Disposition policy: `attempts` was already incremented for this
    // attempt at dequeue time, so `attempts == max_attempts` means we
    // just used the last try.
    let exhausted = attempts >= max_attempts;

    // For the retry case compute the next run time in Rust so the SQL
    // doesn't have to call now() + interval, and so the backoff
    // distribution lives entirely in Rust (testable in isolation,
    // independent of the DB clock and timezone).
    let next_run_at: DateTime<Utc> = if exhausted {
        // Not used in the UPDATE below when exhausted (the CASE
        // expression preserves the existing run_at), but bound as a
        // placeholder to keep the SQL parameter list stable.
        Utc::now()
    } else {
        let backoff = backoff_for_attempt(attempts);
        Utc::now() + chrono::Duration::from_std(backoff).unwrap_or(chrono::Duration::seconds(60))
    };

    // One UPDATE that handles both dispositions. The `$4` boolean drives
    // the CASE expression for both the status column and the run_at
    // column. A future-proofing note: `last_error` is intentionally
    // overwritten, not appended. See `tradeoffs.md` (item 7).
    let sql = format!(
        "UPDATE jobs \
            SET status = CASE WHEN $4 THEN 'failed_permanent'::job_status \
                              ELSE 'retrying'::job_status END, \
                last_error = $2, \
                run_at = CASE WHEN $4 THEN run_at ELSE $3 END, \
                locked_at = NULL, \
                locked_by = NULL, \
                updated_at = now() \
          WHERE id = $1 \
          RETURNING {JOB_COLUMNS}"
    );

    let job: Job = query_as(&sql)
        .bind(id.as_uuid())
        .bind(error_msg)
        .bind(next_run_at)
        .bind(exhausted)
        .fetch_one(&mut *tx)
        .await?;

    tx.commit().await?;
    Ok(job)
}

/// Request cancellation of a job. The semantics depend on the job's
/// current state:
///
/// | Current status | Effect | Returned |
/// |---|---|---|
/// | `queued`, `retrying` | Status atomically set to `cancelled` | [`CancelOutcome::CancelledNow`] |
/// | `running` | `cancel_requested = TRUE` set; worker will observe and finalise | [`CancelOutcome::PendingOnWorker`] |
/// | terminal (`succeeded`, `failed_permanent`, `cancelled`) | No-op | [`CancelOutcome::AlreadyTerminal`] |
///
/// The read-then-branch-then-write sequence runs inside a transaction
/// with a `FOR UPDATE` lock on the row, so the decision can't race with
/// a concurrent `mark_*` call.
///
/// # Errors
/// - [`JobError::NotFound`] if no row exists with the given id.
/// - [`JobError::Db`] for any underlying Postgres error.
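///
/// # Example
///
/// A sketch of the status-code mapping described in the table above
/// (the surrounding handler is assumed):
///
/// ```rust,ignore
/// let code = match request_cancel(&pool, id).await? {
///     CancelOutcome::CancelledNow => 200,
///     CancelOutcome::PendingOnWorker => 202,
///     CancelOutcome::AlreadyTerminal(_) => 409,
/// };
/// ```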
pub async fn request_cancel(pool: &PgPool, id: JobId) -> Result<CancelOutcome, JobError> {
    let mut tx: Transaction<'_, Postgres> = pool.begin().await?;

    // Read the current status under a row lock. Any concurrent worker
    // that wanted to advance this row (e.g., to `succeeded`) will block
    // until our transaction commits, so the snapshot we observe here is
    // the snapshot we act on.
    let current: Option<String> =
        query_scalar("SELECT status::text FROM jobs WHERE id = $1 FOR UPDATE")
            .bind(id.as_uuid())
            .fetch_optional(&mut *tx)
            .await?;

    let status = current
        .as_deref()
        .map(str::parse)
        .transpose()
        .map_err(|e| {
            JobError::Db(sqlx::Error::ColumnDecode {
                index: "status".into(),
                source: Box::new(e),
            })
        })?
        .ok_or(JobError::NotFound(id))?;

    let outcome = match status {
        JobStatus::Queued | JobStatus::Retrying => {
            // The job hasn't been claimed by a worker yet. A single UPDATE
            // flips status + sets the flag (the flag isn't necessary for
            // correctness here but keeps the row self-describing — "this
            // was cancelled at the user's request, not by a system path").
            query(
                "UPDATE jobs \
                    SET status = 'cancelled'::job_status, \
                        cancel_requested = TRUE, \
                        updated_at = now() \
                  WHERE id = $1",
            )
            .bind(id.as_uuid())
            .execute(&mut *tx)
            .await?;
            CancelOutcome::CancelledNow
        }
        JobStatus::Running => {
            // The job is already executing. Aborting a running task
            // could leave external side effects half-applied, so we set
            // a flag and trust the executor to observe it at the next
            // safe sub-step boundary.
            // `finalize_cancelled` is the worker-side counterpart that
            // performs the actual `cancelled` transition.
            query(
                "UPDATE jobs \
                    SET cancel_requested = TRUE, \
                        updated_at = now() \
                  WHERE id = $1",
            )
            .bind(id.as_uuid())
            .execute(&mut *tx)
            .await?;
            CancelOutcome::PendingOnWorker
        }
        terminal @ (JobStatus::Succeeded | JobStatus::FailedPermanent | JobStatus::Cancelled) => {
            // Terminal states are immutable. Report back so the caller
            // can return HTTP 409 (or whatever fits their layer).
            CancelOutcome::AlreadyTerminal(terminal)
        }
    };

    tx.commit().await?;
    Ok(outcome)
}

/// Worker-side counterpart to [`request_cancel`]: when an executor
/// observes `cancel_requested = TRUE` between sub-steps, it calls this
/// to transition the row from `running` to `cancelled`.
///
/// Guarded by `AND status = 'running'`: if the row has already moved on
/// (because another path called `mark_succeeded` or `mark_failed_or_retry`
/// between the executor's flag-check and this call), the UPDATE is a
/// no-op and the worker proceeds. The transition is best-effort; failing
/// to transition is not an error because the alternative outcome
/// (`succeeded` or `retrying`) is acceptable from the user's
/// perspective — their cancel arrived too late but the system state is
/// consistent.
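///
/// # Example
///
/// A sketch of the executor-side safe-point check; the step loop and
/// the `cancel_requested_for` lookup are hypothetical stand-ins for the
/// real executor's structure:
///
/// ```rust,ignore
/// for step in steps {
///     // Safe point: re-read the flag before each sub-step.
///     if cancel_requested_for(&pool, job.id).await? {
///         finalize_cancelled(&pool, job.id).await?;
///         return Ok(());
///     }
///     step.run().await?;
/// }
/// ```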
pub async fn finalize_cancelled(pool: &PgPool, id: JobId) -> Result<(), JobError> {
    query(
        "UPDATE jobs \
            SET status = 'cancelled'::job_status, \
                locked_at = NULL, \
                locked_by = NULL, \
                updated_at = now() \
          WHERE id = $1 AND status = 'running'",
    )
    .bind(id.as_uuid())
    .execute(pool)
    .await?;
    Ok(())
}

/// Fetch a single job by id. Returns `Ok(None)` when no row exists.
pub async fn get(pool: &PgPool, id: JobId) -> Result<Option<Job>, JobError> {
    let row = query_as(&format!("SELECT {JOB_COLUMNS} FROM jobs WHERE id = $1"))
        .bind(id.as_uuid())
        .fetch_optional(pool)
        .await?;
    Ok(row)
}

/// Paginated listing of jobs in reverse-chronological order
/// (`created_at DESC`), with optional filters on `status` and `kind`.
///
/// `limit` is clamped to `[1, 200]` and `offset` is clamped to `[0, ∞)`
/// so an unconstrained caller (e.g., a misbehaving client supplying
/// `limit=1000000`) cannot exhaust server memory or return absurd page
/// sizes.
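///
/// # Example
///
/// A sketch of a filtered first-page query:
///
/// ```rust,ignore
/// let retrying = list(&pool, ListFilter {
///     status: Some(JobStatus::Retrying),
///     ..ListFilter::default()
/// }).await?;
/// ```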
pub async fn list(pool: &PgPool, filter: ListFilter) -> Result<Vec<Job>, JobError> {
    let limit = filter.limit.clamp(1, 200);
    let offset = filter.offset.max(0);

    // The `$1::text IS NULL OR status::text = $1` idiom lets one query
    // serve both the unfiltered and filtered cases; when planning with
    // the concrete parameter values Postgres can constant-fold the
    // always-true branch away.
    let rows = query_as(&format!(
        "SELECT {JOB_COLUMNS} \
           FROM jobs \
          WHERE ($1::text IS NULL OR status::text = $1) \
            AND ($2::text IS NULL OR kind::text = $2) \
          ORDER BY created_at DESC \
          LIMIT $3 OFFSET $4"
    ))
    .bind(filter.status.map(JobStatus::as_str))
    .bind(filter.kind.map(JobKind::as_str))
    .bind(limit)
    .bind(offset)
    .fetch_all(pool)
    .await?;
    Ok(rows)
}

/// Recovery sweep — re-queue any row that has been stuck in `running`
/// for longer than `stale_seconds`.
///
/// This is the safety net for crashed workers. A worker that exits
/// (SIGKILL, OOM, hardware failure) mid-job leaves its row with
/// `status = 'running'` and `locked_at` pointing at the time the job was
/// claimed. Each worker calls this function during startup; rows
/// whose `locked_at` is older than the threshold are reset to
/// `retrying` and become eligible for dequeue again.
///
/// The threshold should be larger than the slowest legitimate job — the
/// cost of a too-small threshold is double-processing of a live job; the
/// cost of a too-large threshold is a longer wait before crashed work is
/// recovered. The recommended default is 5 minutes (300 s); see the
/// runbook for tuning guidance.
///
/// Returns the number of rows reset.
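///
/// # Example
///
/// A sketch of the startup call using the recommended 300 s threshold
/// (the logging call is illustrative):
///
/// ```rust,ignore
/// let recovered = recover_stale(&pool, 300).await?;
/// if recovered > 0 {
///     tracing::warn!(recovered, "re-queued jobs from crashed workers");
/// }
/// ```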
pub async fn recover_stale(pool: &PgPool, stale_seconds: i64) -> Result<u64, JobError> {
    // The interval is built from a bound parameter rather than
    // concatenated into the SQL text, to avoid the (theoretical) risk
    // of an injection at the cost of one extra `||` per execution.
    let result = query(
        "UPDATE jobs \
            SET status = 'retrying'::job_status, \
                last_error = COALESCE(last_error, 'recovered: worker crashed mid-execution'), \
                locked_at = NULL, \
                locked_by = NULL, \
                updated_at = now() \
          WHERE status = 'running' \
            AND locked_at < now() - ($1 || ' seconds')::interval",
    )
    .bind(stale_seconds.to_string())
    .execute(pool)
    .await?;
    Ok(result.rows_affected())
}