1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
use std::time::Duration;
use crate::error::KumoError;
/// Upper limit of the random jitter added to a retry delay, expressed as a
/// fraction of the computed backoff delay (0.25 = up to 25% extra).
const JITTER_FRACTION: f64 = 0.25;
/// Retry configuration describing how the engine retries failed fetch attempts.
///
/// Start from [`RetryPolicy::new`] and refine the policy with its chainable
/// setters.
///
/// # Example
/// ```rust,ignore
/// CrawlEngine::builder()
///     .retry_policy(
///         RetryPolicy::new(3)
///             .base_delay(Duration::from_millis(200))
///             .max_delay(Duration::from_secs(30))
///             .jitter(true)
///             .on_status(429)
///             .on_status(503),
///     )
/// ```
#[derive(Clone, Debug)]
pub struct RetryPolicy {
    /// Number of retries allowed; the initial fetch is not counted.
    pub(crate) max_attempts: u32,
    /// Delay before the first retry; doubled for every later attempt.
    pub(crate) base_delay: Duration,
    /// Cap applied to every computed delay.
    pub(crate) max_delay: Duration,
    /// Whether a random extra delay is added on top of the backoff delay.
    pub(crate) jitter: bool,
    /// HTTP status filter.
    ///
    /// Empty = retry any `HttpStatus` or `Fetch` error.
    /// Non-empty = only retry `HttpStatus` where the code is in this list.
    pub(crate) retriable_statuses: Vec<u16>,
}
impl RetryPolicy {
    /// Create a policy with `max_attempts` retries, 500ms base delay, 60s cap,
    /// no jitter, and no status filter.
    ///
    /// `max_attempts` is the number of *retries* — total fetch calls = `max_attempts + 1`.
    pub fn new(max_attempts: u32) -> Self {
        Self {
            max_attempts,
            base_delay: Duration::from_millis(500),
            max_delay: Duration::from_secs(60),
            jitter: false,
            retriable_statuses: Vec::new(),
        }
    }

    /// Set the delay used before the first retry (attempt 0).
    ///
    /// The delay doubles with every subsequent attempt and is always capped
    /// by the configured maximum delay.
    pub fn base_delay(mut self, d: Duration) -> Self {
        self.base_delay = d;
        self
    }

    /// Set the cap applied to every retry delay, including server-provided
    /// delay hints.
    pub fn max_delay(mut self, d: Duration) -> Self {
        self.max_delay = d;
        self
    }

    /// Add ≤25% random jitter to each delay so concurrent retries don't thundering-herd.
    pub fn jitter(mut self, enabled: bool) -> Self {
        self.jitter = enabled;
        self
    }

    /// Only retry when the HTTP response status code matches.
    /// Call multiple times to allow several codes.
    ///
    /// If never called, retries on any `KumoError::HttpStatus` or `KumoError::Fetch`.
    pub fn on_status(mut self, status: u16) -> Self {
        self.retriable_statuses.push(status);
        self
    }

    /// Number of retries allowed by this policy.
    ///
    /// Total fetch calls can be up to `max_attempts + 1`, because the first
    /// fetch is not counted as a retry.
    pub fn max_attempts(&self) -> u32 {
        self.max_attempts
    }

    /// Return the configured base delay before exponential backoff.
    pub fn base_delay_value(&self) -> Duration {
        self.base_delay
    }

    /// Return the maximum retry delay cap.
    pub fn max_delay_value(&self) -> Duration {
        self.max_delay
    }

    /// Return whether this policy adds jitter to retry delays.
    pub fn jitter_enabled(&self) -> bool {
        self.jitter
    }

    /// Return the configured HTTP status filter.
    ///
    /// An empty slice means all `KumoError::HttpStatus` and `KumoError::Fetch`
    /// errors are retryable.
    pub fn retriable_statuses(&self) -> &[u16] {
        &self.retriable_statuses
    }

    /// Compute the deterministic exponential-backoff delay before retry
    /// `attempt` (0-indexed), excluding random jitter.
    ///
    /// The result is `base_delay * 2^attempt`, computed with saturating
    /// arithmetic and capped at `max_delay`.
    pub fn delay_for_attempt(&self, attempt: u32) -> Duration {
        let factor = 2_u32.saturating_pow(attempt);
        self.base_delay.saturating_mul(factor).min(self.max_delay)
    }

    /// Compute the deterministic retry delay for `attempt`, preferring a
    /// server-provided delay hint when one is available.
    ///
    /// The hint is capped by `max_delay`. If no hint is provided, this returns
    /// the normal exponential-backoff delay without jitter.
    pub fn delay_for_attempt_with_hint(&self, attempt: u32, hint: Option<Duration>) -> Duration {
        hint.map_or_else(
            || self.delay_for_attempt(attempt),
            |delay| delay.min(self.max_delay),
        )
    }

    /// Return bounds `(lower, upper)` that enclose every delay that can be
    /// produced for `attempt`.
    ///
    /// When jitter is disabled, both bounds are the deterministic delay.
    /// When jitter is enabled, the upper bound includes up to 25% extra delay
    /// and is still capped by `max_delay`.
    pub fn delay_bounds_for_attempt(&self, attempt: u32) -> (Duration, Duration) {
        let base = self.delay_for_attempt(attempt);
        if !self.jitter {
            return (base, base);
        }
        (base, self.add_jitter_fraction(base, JITTER_FRACTION))
    }

    /// Return `true` if `err` should trigger a retry under this policy.
    ///
    /// * `KumoError::HttpStatus` is retried when the status filter is empty or
    ///   contains the status code.
    /// * `KumoError::Fetch` is retried only when the status filter is empty.
    /// * Every other error is never retried.
    pub fn is_retriable_error(&self, err: &KumoError) -> bool {
        match err {
            KumoError::HttpStatus { status, .. } => {
                self.retriable_statuses.is_empty() || self.retriable_statuses.contains(status)
            }
            KumoError::Fetch(_) => self.retriable_statuses.is_empty(),
            // Never retry parse, store, domain, depth, llm, or browser errors.
            _ => false,
        }
    }

    /// Compute the sleep duration before retry `attempt` (0-indexed).
    /// Result is capped at `max_delay`. If jitter is on, adds up to 25% extra.
    pub(crate) fn delay_for(&self, attempt: u32) -> Duration {
        let raw = self.delay_for_attempt(attempt);
        if self.jitter {
            use rand::Rng;
            let extra_frac = rand::rng().random_range(0.0_f64..JITTER_FRACTION);
            self.add_jitter_fraction(raw, extra_frac)
        } else {
            raw
        }
    }

    /// Like `delay_for`, but a server-provided `hint` (capped at `max_delay`)
    /// takes precedence over the computed backoff delay. No jitter is applied
    /// to a hint.
    pub(crate) fn delay_for_with_hint(&self, attempt: u32, hint: Option<Duration>) -> Duration {
        hint.map_or_else(
            || self.delay_for(attempt),
            |delay| delay.min(self.max_delay),
        )
    }

    /// Return `true` if `err` should trigger a retry under this policy.
    pub(crate) fn is_retriable(&self, err: &KumoError) -> bool {
        self.is_retriable_error(err)
    }

    /// Add `fraction` of `delay` on top of `delay`, capped at `max_delay`.
    ///
    /// The addition saturates: `Duration + Duration` panics on overflow, which
    /// a delay close to `Duration::MAX` (reachable when `max_delay` is equally
    /// large) would otherwise trigger. Callers pass a `fraction` between 0 and
    /// `JITTER_FRACTION`.
    fn add_jitter_fraction(&self, delay: Duration, fraction: f64) -> Duration {
        let extra = Duration::from_secs_f64(delay.as_secs_f64() * fraction);
        delay.saturating_add(extra).min(self.max_delay)
    }
}