---
# Failure Handler Demo
# A simple, runnable example demonstrating automatic job retry with failure handlers.
#
# This workflow runs jobs that randomly fail with different exit codes. The failure
# handler automatically retries jobs based on the error type.
#
# To run this example:
# 1. Start the torc server
# 2. Run: torc run examples/yaml/failure_handler_demo.yaml
# 3. Watch the jobs retry automatically when they fail with recoverable errors
#
# Exit codes used:
# 0: Success
# 1: Unrecoverable error (not retried)
# 10: Convergence failure (retried with recovery script)
# 11: Resource issue (retried with recovery script)
# 12: Transient error (retried without recovery script)
# Workflow metadata: identifier plus a one-line, human-readable summary.
name: failure_handler_demo
description: 'Demo of automatic job retry with failure handlers'
failure_handlers:
  # Retry policy that jobs attach to via `failure_handler: demo_recovery`.
  # Each rule pairs one of the exit codes listed in the file header with a
  # retry budget and, optionally, a recovery script.
  - name: demo_recovery
    rules:
      # Exit code 10 - convergence failures: run recovery script, retry up to 3 times
      - exit_codes: [10]
        recovery_script: examples/scripts/recovery_demo.sh
        max_retries: 3
      # Exit code 11 - resource/timeout issues: run recovery script, retry up to 2 times
      - exit_codes: [11]
        recovery_script: examples/scripts/recovery_demo.sh
        max_retries: 2
      # Exit code 12 - transient errors: simple retry without recovery script
      - exit_codes: [12]
        max_retries: 3
      # Note: exit code 1 is intentionally NOT included.
      # Jobs failing with exit code 1 will not be retried.
jobs:
  # Job graph: flaky_job_1 -> {flaky_job_2, reliable_job} -> summary.
  # Only the two flaky jobs attach the demo_recovery failure handler.

  # A job that randomly fails - demonstrates failure handling
  - name: flaky_job_1
    command: bash examples/scripts/failure_demo_job.sh --fail-rate 0.8 --work-time 0.5
    failure_handler: demo_recovery

  # Another flaky job that depends on the first
  - name: flaky_job_2
    command: bash examples/scripts/failure_demo_job.sh --fail-rate 0.6 --work-time 0.5
    failure_handler: demo_recovery
    depends_on:
      - flaky_job_1

  # A reliable job that always succeeds (no failure handler attached)
  - name: reliable_job
    command: echo "This job always succeeds"
    depends_on:
      - flaky_job_1

  # Final job that runs after all others complete
  - name: summary
    command: echo "All jobs completed successfully!"
    depends_on:
      - flaky_job_2
      - reliable_job