runledger_runtime/
worker.rs

1use std::any::Any;
2use std::cmp::min;
3use std::panic::AssertUnwindSafe;
4use std::sync::Arc;
5
6use futures_util::FutureExt;
7use runledger_core::jobs::{JobCompletion, JobContext, JobFailure};
8use runledger_postgres::jobs::{self, JobProgressUpdate};
9use tokio::sync::{Semaphore, watch};
10use tokio::task::JoinSet;
11use tokio::time::{Duration, Instant, MissedTickBehavior, sleep_until};
12use tracing::{Instrument, error, info, info_span, warn};
13
14mod completion;
15mod dead_letter;
16mod observers;
17
18use self::completion::{
19    CompletionObservation, complete_job_failure_after_handler, complete_job_success_after_handler,
20};
21use self::observers::{JobRunningNotification, TerminalJobObserverEvent, TerminalObserverTasks};
22use crate::RuntimeLoopExit;
23use crate::WorkerError;
24use crate::config::JobsConfig;
25use crate::observer::{JobLeaseLostEvent, JobLifecycleObservers, ObservedJob};
26use crate::registry::JobRegistry;
27use crate::shutdown;
28
/// Worker id used for the tracing span, job context, and observer payloads
/// when a claimed job record carries no `worker_id`.
const UNKNOWN_WORKER_ID: &str = "unknown-worker";
// Kept stable for clients that already match this code; it also covers leases
// that expired before the worker's lifecycle update reached storage.
const LEASE_OWNER_MISMATCH_CODE: &str = "job.lease_owner_mismatch";
/// Failure code reported when a heartbeat fails for any reason other than a
/// lease-owner mismatch.
const LEASE_MAINTENANCE_FAILED_CODE: &str = "job.lease_maintenance_failed";
// NOTE(review): not referenced in this file's top-level code; presumably used
// by the `completion` submodule — confirm.
const WORKFLOW_RELEASE_CONFLICT_CODE: &str = "workflow.release_conflict";
/// Failure code attached to the `JobFailure` built from a caught handler panic.
const HANDLER_PANIC_CODE: &str = "job.handler_panic";
// NOTE(review): not referenced in this file's top-level code; presumably used
// by the `completion` submodule — confirm.
const INVALID_COMPLETION_PROGRESS_CODE: &str = "job.invalid_completion_progress";
/// Reason passed to `release_unstarted_job_claim` when the `Running` stage
/// could not be persisted before the handler started.
const RUNNING_PROGRESS_PERSIST_FAILED_REASON: &str = "RUNNING_PROGRESS_PERSIST_FAILED";
/// Query error code that marks a failed unstarted-claim release as "no longer
/// applicable" rather than as a storage failure.
const UNSTARTED_CLAIM_RELEASE_NOT_APPLICABLE_CODE: &str =
    "job.unstarted_claim_release_not_applicable";
/// Retry delay (milliseconds, per the name) passed to
/// `release_unstarted_job_claim` when handing an unstarted claim back.
const UNSTARTED_CLAIM_RETRY_DELAY_MS: i32 = 1_000;
41
/// Why running a claimed job did not yield a `JobCompletion`.
///
/// The two variants are handled differently by the caller: `Handler` failures
/// go through the normal failure-completion path, while `LeaseMaintenance`
/// failures only log and notify lease-lost observers.
enum JobExecutionFailure {
    /// The handler returned a failure, panicked, or exceeded the job timeout.
    Handler(JobFailure),
    /// A lease heartbeat failed, either because ownership was lost or because
    /// the heartbeat could not be persisted.
    LeaseMaintenance(JobFailure),
}
46
47pub async fn run_worker_loop(
48    pool: runledger_postgres::DbPool,
49    registry: JobRegistry,
50    config: JobsConfig,
51    shutdown: watch::Receiver<bool>,
52) -> RuntimeLoopExit {
53    run_worker_loop_with_observer(
54        pool,
55        registry,
56        config,
57        shutdown,
58        JobLifecycleObservers::empty(),
59    )
60    .await
61}
62
63pub async fn run_worker_loop_with_observer(
64    pool: runledger_postgres::DbPool,
65    registry: JobRegistry,
66    config: JobsConfig,
67    mut shutdown: watch::Receiver<bool>,
68    observers: JobLifecycleObservers,
69) -> RuntimeLoopExit {
70    if let Err(error) = config.validate_worker_loop() {
71        warn!(%error, "invalid jobs config; stopping worker loop");
72        return RuntimeLoopExit::InvalidConfig(error);
73    }
74
75    let registry = Arc::new(registry);
76    let claimable_job_types = registry.registered_types();
77    let semaphore = Arc::new(Semaphore::new(config.max_global_concurrency));
78    let mut join_set: JoinSet<()> = JoinSet::new();
79    let terminal_observer_tasks = TerminalObserverTasks::owned();
80
81    loop {
82        drain_finished_tasks(&mut join_set).await;
83        terminal_observer_tasks.drain_finished().await;
84
85        if shutdown::is_requested_or_closed(&shutdown) {
86            return drain_worker_tasks(
87                join_set,
88                terminal_observer_tasks,
89                RuntimeLoopExit::Shutdown,
90            )
91            .await;
92        }
93
94        if claimable_job_types.is_empty() {
95            if shutdown::wait_for_request_or_timeout(&mut shutdown, config.poll_interval).await {
96                return drain_worker_tasks(
97                    join_set,
98                    terminal_observer_tasks,
99                    RuntimeLoopExit::Shutdown,
100                )
101                .await;
102            }
103            continue;
104        }
105
106        let available = semaphore.available_permits();
107        if available == 0 {
108            if shutdown::wait_for_request_or_timeout(&mut shutdown, config.poll_interval).await {
109                return drain_worker_tasks(
110                    join_set,
111                    terminal_observer_tasks,
112                    RuntimeLoopExit::Shutdown,
113                )
114                .await;
115            }
116            continue;
117        }
118
119        let claim_limit = min(available, config.claim_batch_size as usize);
120        let claimed = match jobs::claim_prestart_jobs_for_types(
121            &pool,
122            &config.worker_id,
123            config.lease_ttl_seconds,
124            claim_limit as i64,
125            &claimable_job_types,
126        )
127        .await
128        {
129            Ok(claimed) => claimed,
130            Err(error) => {
131                let error = WorkerError::ClaimJobs {
132                    worker_id: config.worker_id.clone(),
133                    source: error,
134                };
135                warn!(%error, "worker claim failed");
136                Vec::new()
137            }
138        };
139
140        if claimed.is_empty() {
141            shutdown::wait_for_request_or_timeout(&mut shutdown, config.poll_interval).await;
142            continue;
143        }
144
145        let claimed_len = claimed.len();
146        for job in claimed {
147            let permit = match Arc::clone(&semaphore).acquire_owned().await {
148                Ok(permit) => permit,
149                Err(_) => {
150                    // The worker owns this semaphore and never closes it. If
151                    // this defensive branch fires, surface it as an unexpected
152                    // loop completion rather than graceful shutdown.
153                    warn!("worker semaphore closed; stopping worker loop");
154                    return drain_worker_tasks(
155                        join_set,
156                        terminal_observer_tasks,
157                        RuntimeLoopExit::Completed,
158                    )
159                    .await;
160                }
161            };
162            let pool_clone = pool.clone();
163            let registry_clone = Arc::clone(&registry);
164            let lease_ttl_seconds = config.lease_ttl_seconds;
165            let observers = observers.clone();
166            let terminal_observer_tasks = terminal_observer_tasks.clone();
167            join_set.spawn(async move {
168                let _permit = permit;
169                process_claimed_job_with_terminal_observers(
170                    pool_clone,
171                    registry_clone,
172                    job,
173                    lease_ttl_seconds,
174                    observers,
175                    terminal_observer_tasks,
176                )
177                .await;
178            });
179        }
180
181        if claimed_len == claim_limit {
182            continue;
183        }
184
185        if shutdown::wait_for_request_or_timeout(&mut shutdown, config.poll_interval).await {
186            return drain_worker_tasks(
187                join_set,
188                terminal_observer_tasks,
189                RuntimeLoopExit::Shutdown,
190            )
191            .await;
192        }
193    }
194}
195
196async fn drain_worker_tasks(
197    mut join_set: JoinSet<()>,
198    terminal_observer_tasks: TerminalObserverTasks,
199    exit: RuntimeLoopExit,
200) -> RuntimeLoopExit {
201    if !join_set.is_empty() {
202        match exit {
203            RuntimeLoopExit::Shutdown => {
204                info!("worker shutdown requested; draining in-flight jobs")
205            }
206            RuntimeLoopExit::InvalidConfig(_) => {
207                warn!("worker loop rejected invalid config; draining in-flight jobs");
208            }
209            RuntimeLoopExit::Completed => {
210                warn!("worker loop completed before shutdown; draining in-flight jobs");
211            }
212        }
213    }
214    while let Some(result) = join_set.join_next().await {
215        if let Err(error) = result {
216            error!(%error, "job task crashed while draining in-flight jobs");
217        }
218    }
219    terminal_observer_tasks.drain_for_shutdown().await;
220    exit
221}
222
223async fn drain_finished_tasks(join_set: &mut JoinSet<()>) {
224    while let Some(result) = join_set.try_join_next() {
225        if let Err(error) = result {
226            error!(%error, "job task crashed");
227        }
228    }
229}
230
/// Test-only entry point: processes one claimed job with no lifecycle
/// observers attached.
#[cfg(test)]
async fn process_claimed_job(
    pool: runledger_postgres::DbPool,
    registry: Arc<JobRegistry>,
    job: jobs::JobQueueRecord,
    lease_ttl_seconds: i32,
) {
    let no_observers = JobLifecycleObservers::empty();
    process_claimed_job_with_observer(pool, registry, job, lease_ttl_seconds, no_observers).await;
}
247
/// Test-only entry point: processes one claimed job with the given observers,
/// using a detached terminal-observer task set.
#[cfg(test)]
async fn process_claimed_job_with_observer(
    pool: runledger_postgres::DbPool,
    registry: Arc<JobRegistry>,
    job: jobs::JobQueueRecord,
    lease_ttl_seconds: i32,
    observers: JobLifecycleObservers,
) {
    let detached_tasks = TerminalObserverTasks::detached();
    process_claimed_job_with_terminal_observers(
        pool,
        registry,
        job,
        lease_ttl_seconds,
        observers,
        detached_tasks,
    )
    .await;
}
266
267async fn process_claimed_job_with_terminal_observers(
268    pool: runledger_postgres::DbPool,
269    registry: Arc<JobRegistry>,
270    job: jobs::JobQueueRecord,
271    lease_ttl_seconds: i32,
272    observers: JobLifecycleObservers,
273    terminal_observer_tasks: TerminalObserverTasks,
274) {
275    let worker_id = job
276        .worker_id
277        .clone()
278        .unwrap_or_else(|| UNKNOWN_WORKER_ID.to_owned());
279
280    let job_span = info_span!(
281        "job",
282        sentry.name = %job.job_type,
283        sentry.op = "runledger.job",
284        job_id = %job.id,
285        job_type = %job.job_type,
286        run_number = job.run_number,
287        attempt = job.attempt,
288        organization_id = ?job.organization_id,
289        worker_id = %worker_id,
290    );
291    async {
292        let start = Instant::now();
293        let context = JobContext {
294            job_id: job.id,
295            run_number: job.run_number,
296            attempt: job.attempt,
297            organization_id: job.organization_id,
298            worker_id: worker_id.clone(),
299        };
300        let observed_job = observed_job(&job, &worker_id);
301
302        if !mark_job_running_or_abort(&pool, &context, &job).await {
303            return;
304        }
305        let mut running_notification =
306            JobRunningNotification::spawn(observers.clone(), observed_job.clone());
307
308        match execute_job_handler_with_heartbeats(
309            pool.clone(),
310            Arc::clone(&registry),
311            &context,
312            &job,
313            lease_ttl_seconds,
314        )
315        .await
316        {
317            Ok(completion) => {
318                complete_job_success_after_handler(
319                    &pool,
320                    registry.as_ref(),
321                    &context,
322                    &job,
323                    completion,
324                    CompletionObservation::new(
325                        &observers,
326                        observed_job.clone(),
327                        start.elapsed(),
328                        &mut running_notification,
329                        &terminal_observer_tasks,
330                    ),
331                )
332                .await;
333            }
334            Err(JobExecutionFailure::Handler(failure)) => {
335                complete_job_failure_after_handler(
336                    &pool,
337                    registry.as_ref(),
338                    &context,
339                    &job,
340                    failure,
341                    CompletionObservation::new(
342                        &observers,
343                        observed_job.clone(),
344                        start.elapsed(),
345                        &mut running_notification,
346                        &terminal_observer_tasks,
347                    ),
348                )
349                .await;
350            }
351            Err(JobExecutionFailure::LeaseMaintenance(failure)) => {
352                log_lease_maintenance_abort(&job, &failure);
353                running_notification
354                    .spawn_terminal_observer(
355                        &terminal_observer_tasks,
356                        &job,
357                        observers.clone(),
358                        TerminalJobObserverEvent::LeaseLost(JobLeaseLostEvent {
359                            job: observed_job.clone(),
360                            duration: start.elapsed(),
361                            failure,
362                        }),
363                    )
364                    .await;
365            }
366        }
367
368        info!(
369            job_id = %job.id,
370            attempt = job.attempt,
371            run_number = job.run_number,
372            elapsed_ms = start.elapsed().as_millis(),
373            "job processed"
374        );
375    }
376    .instrument(job_span)
377    .await;
378}
379
380fn observed_job(job: &jobs::JobQueueRecord, worker_id: &str) -> ObservedJob {
381    ObservedJob {
382        job_id: job.id,
383        job_type: job.job_type.clone(),
384        organization_id: job.organization_id,
385        run_number: job.run_number,
386        attempt: job.attempt,
387        max_attempts: job.max_attempts,
388        worker_id: worker_id.to_owned(),
389    }
390}
391
392fn log_lease_maintenance_abort(job: &jobs::JobQueueRecord, failure: &JobFailure) {
393    warn!(
394        job_id = %job.id,
395        attempt = job.attempt,
396        failure_code = failure.code,
397        "job processing aborted because durable lease maintenance was lost"
398    );
399}
400
401async fn mark_job_running_or_abort(
402    pool: &runledger_postgres::DbPool,
403    context: &JobContext,
404    job: &jobs::JobQueueRecord,
405) -> bool {
406    let running_progress = JobProgressUpdate {
407        stage: Some(runledger_core::jobs::JobStage::Running),
408        progress_done: None,
409        progress_total: None,
410        checkpoint: None,
411    };
412
413    let Err(source) = jobs::update_job_progress(
414        pool,
415        job.id,
416        job.run_number,
417        job.attempt,
418        &context.worker_id,
419        &running_progress,
420    )
421    .await
422    else {
423        return true;
424    };
425
426    handle_running_progress_persist_failure(pool, context, job, source).await;
427    false
428}
429
430async fn handle_running_progress_persist_failure(
431    pool: &runledger_postgres::DbPool,
432    context: &JobContext,
433    job: &jobs::JobQueueRecord,
434    source: runledger_postgres::Error,
435) {
436    let lease_owner_mismatch = is_lease_owner_mismatch_error(&source);
437    let error = WorkerError::SetRunningProgress {
438        job_id: job.id,
439        attempt: job.attempt,
440        source,
441    };
442
443    if lease_owner_mismatch {
444        warn!(
445            %error,
446            job_id = %job.id,
447            attempt = job.attempt,
448            "aborting job before execution because lease ownership was already lost"
449        );
450        return;
451    }
452
453    match jobs::release_unstarted_job_claim(
454        pool,
455        job.id,
456        job.run_number,
457        job.attempt,
458        &context.worker_id,
459        RUNNING_PROGRESS_PERSIST_FAILED_REASON,
460        UNSTARTED_CLAIM_RETRY_DELAY_MS,
461    )
462    .await
463    {
464        Ok(()) => {
465            warn!(
466                %error,
467                job_id = %job.id,
468                attempt = job.attempt,
469                "running progress could not be persisted; released unstarted claim back to pending"
470            );
471        }
472        Err(release_error) => {
473            let no_longer_releasable =
474                is_unstarted_claim_release_not_applicable_error(&release_error);
475            let release_error = WorkerError::ReleaseUnstartedClaim {
476                job_id: job.id,
477                attempt: job.attempt,
478                source: release_error,
479            };
480            if no_longer_releasable {
481                warn!(
482                    %error,
483                    %release_error,
484                    job_id = %job.id,
485                    attempt = job.attempt,
486                    "running progress could not be persisted; unstarted release no longer applies and the job will continue under the current lease owner"
487                );
488                return;
489            }
490
491            warn!(
492                %error,
493                %release_error,
494                job_id = %job.id,
495                attempt = job.attempt,
496                "running progress could not be persisted; leaving claim for reaper recovery"
497            );
498        }
499    }
500}
501
502async fn execute_job_handler(
503    registry: Arc<JobRegistry>,
504    context: &JobContext,
505    job: &jobs::JobQueueRecord,
506) -> Result<JobCompletion, JobFailure> {
507    let Some(handler) = registry.get(job.job_type.as_borrowed()) else {
508        return Err(JobFailure::terminal(
509            "job.handler_not_registered",
510            "No handler is registered for this job type.",
511        ));
512    };
513
514    handler.execute(context.clone(), job.payload.clone()).await
515}
516
517async fn execute_job_handler_with_heartbeats(
518    pool: runledger_postgres::DbPool,
519    registry: Arc<JobRegistry>,
520    context: &JobContext,
521    job: &jobs::JobQueueRecord,
522    lease_ttl_seconds: i32,
523) -> Result<JobCompletion, JobExecutionFailure> {
524    let mut execution =
525        Box::pin(AssertUnwindSafe(execute_job_handler(registry, context, job)).catch_unwind());
526    let timeout_deadline = Instant::now() + Duration::from_secs(job.timeout_seconds.max(1) as u64);
527    let mut timeout = Box::pin(sleep_until(timeout_deadline));
528
529    let mut ticker = tokio::time::interval(heartbeat_interval(lease_ttl_seconds));
530    ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
531    ticker.tick().await;
532
533    loop {
534        tokio::select! {
535            result = &mut execution => {
536                return match result {
537                    Ok(result) => result.map_err(JobExecutionFailure::Handler),
538                    Err(panic_payload) => {
539                        Err(JobExecutionFailure::Handler(handler_panic_failure(panic_payload)))
540                    }
541                };
542            }
543            _ = &mut timeout => {
544                return Err(JobExecutionFailure::Handler(JobFailure::timeout(
545                    "job.timeout_exceeded",
546                    "Job exceeded the configured timeout.",
547                )));
548            }
549            _ = ticker.tick() => {
550                if let Err(error) = jobs::heartbeat_job(
551                    &pool,
552                    job.id,
553                    job.run_number,
554                    job.attempt,
555                    &context.worker_id,
556                    lease_ttl_seconds,
557                )
558                .await
559                {
560                    let lease_owner_mismatch = is_lease_owner_mismatch_error(&error);
561                    let error = WorkerError::Heartbeat {
562                        job_id: job.id,
563                        attempt: job.attempt,
564                        source: error,
565                    };
566
567                    if lease_owner_mismatch {
568                        warn!(%error, job_id = %job.id, "job heartbeat lost lease ownership");
569                        return Err(JobExecutionFailure::LeaseMaintenance(
570                            lease_owner_mismatch_failure(),
571                        ));
572                    }
573
574                    warn!(
575                        %error,
576                        job_id = %job.id,
577                        "aborting job because lease heartbeat could not be persisted"
578                    );
579                    return Err(JobExecutionFailure::LeaseMaintenance(
580                        lease_maintenance_failure(),
581                    ));
582                }
583            }
584        }
585    }
586}
587
588fn lease_owner_mismatch_failure() -> JobFailure {
589    JobFailure::lease_expired(
590        LEASE_OWNER_MISMATCH_CODE,
591        "Job lease ownership was lost during processing.",
592    )
593}
594
595fn lease_maintenance_failure() -> JobFailure {
596    JobFailure::lease_expired(
597        LEASE_MAINTENANCE_FAILED_CODE,
598        "Job lease could not be durably maintained during processing.",
599    )
600}
601
602fn handler_panic_failure(panic_payload: Box<dyn Any + Send>) -> JobFailure {
603    JobFailure::panicked(
604        HANDLER_PANIC_CODE,
605        format!(
606            "Job handler panicked: {}",
607            panic_payload_message(&*panic_payload)
608        ),
609    )
610}
611
/// Extracts a human-readable message from a panic payload.
///
/// `String` and `&'static str` payloads are returned as text; any other
/// payload type yields the fixed text `"non-string panic payload"`.
fn panic_payload_message(panic_payload: &(dyn Any + Send)) -> String {
    panic_payload
        .downcast_ref::<String>()
        .cloned()
        .or_else(|| {
            panic_payload
                .downcast_ref::<&'static str>()
                .map(|message| (*message).to_string())
        })
        .unwrap_or_else(|| "non-string panic payload".to_string())
}
623
624fn has_query_error_code(error: &runledger_postgres::Error, expected_code: &str) -> bool {
625    matches!(
626        error,
627        runledger_postgres::Error::QueryError(query_error)
628            if query_error.code() == expected_code
629    )
630}
631
632fn is_lease_owner_mismatch_error(error: &runledger_postgres::Error) -> bool {
633    has_query_error_code(error, LEASE_OWNER_MISMATCH_CODE)
634}
635
636fn is_unstarted_claim_release_not_applicable_error(error: &runledger_postgres::Error) -> bool {
637    has_query_error_code(error, UNSTARTED_CLAIM_RELEASE_NOT_APPLICABLE_CODE)
638}
639
/// Returns how often a running job's lease should be renewed.
///
/// The interval is one third of the lease TTL, so a delayed heartbeat still
/// leaves time for subsequent renewals before the lease expires.
/// Non-positive TTLs are treated as one second.
///
/// For TTLs of three seconds or more the result is floored to whole seconds.
/// For shorter TTLs the third is computed in milliseconds rather than being
/// rounded up to a full second: a one-second floor would equal the whole TTL
/// (TTL = 1) or half of it (TTL = 2), which defeats the one-third rule.
fn heartbeat_interval(lease_ttl_seconds: i32) -> Duration {
    let ttl_seconds = u64::from(lease_ttl_seconds.max(1).unsigned_abs());

    match ttl_seconds / 3 {
        // Sub-3s TTL: keep the one-third ratio at millisecond resolution.
        0 => Duration::from_millis(ttl_seconds * 1_000 / 3),
        whole_seconds => Duration::from_secs(whole_seconds),
    }
}
646
647#[cfg(test)]
648mod tests;
runledger_runtime/worker.rs

runledger_runtime/
worker.rs