edgefirst_client/retry.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright © 2025 Au-Zone Technologies. All Rights Reserved.
3
4//! Retry policies with URL-based classification for EdgeFirst Studio Client.
5//!
6//! # Overview
7//!
8//! This module implements intelligent retry logic that classifies requests into
9//! two categories:
10//!
11//! - **StudioApi**: EdgeFirst Studio JSON-RPC API calls
12//! (`*.edgefirst.studio/api`)
13//! - **FileIO**: File upload/download operations (AWS S3 pre-signed URLs,
14//! CloudFront, etc.)
15//!
16//! # Motivation
17//!
18//! Different types of operations have different failure characteristics and
19//! retry requirements:
20//!
21//! ## Studio API Requests
22//!
23//! - **Low concurrency**: Sequential JSON-RPC method calls
24//! - **Fast-fail desired**: Authentication failures should not retry
25//! - **Predictable errors**: HTTP 401/403 indicate auth issues, not transient
26//! failures
27//! - **User experience**: Users expect quick feedback on invalid credentials
28//!
29//! ## File I/O Operations (S3, CloudFront)
30//!
31//! - **High concurrency**: Parallel uploads/downloads of dataset files (100+
32//! files)
33//! - **Transient failures common**: S3 rate limiting, network congestion,
34//! timeouts
35//! - **Retry-safe**: Idempotent operations (pre-signed URLs, multipart uploads)
36//! - **Robustness critical**: Dataset operations must complete reliably despite
37//! temporary issues
38//!
39//! # Classification Strategy
40//!
41//! URLs are classified by inspecting the host and path:
42//!
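//! - **StudioApi**: `http(s)://edgefirst.studio/api...` or
//!   `http(s)://*.edgefirst.studio/api...` (the host is the domain itself or a
//!   subdomain; the path is exactly `/api` or starts with `/api/`)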
45//! - **FileIO**: Everything else (S3, CloudFront, or any non-API Studio path)
46//!
47//! # Retry Behavior
48//!
//! Both scopes use the same configurable retry count (`EDGEFIRST_MAX_RETRIES`,
//! default: 5), but differ in error classification (see the per-scope
//! "Error Classification" sections below).
51//!
52//! # Environment Variables
53//!
54//! - `EDGEFIRST_MAX_RETRIES`: Maximum number of retries for failed requests
55//! (default: 5)
56//! - `MAX_TASKS`: Maximum concurrent upload/download tasks (default: half of
57//! CPU cores, min 2, max 8). Lower values (2-8) work better for large files
58//! to avoid timeouts. Higher values (16-32) are better for many small files.
59//!
60//! ## StudioApi Error Classification
61//!
62//! - **Never retry**: 401 Unauthorized, 403 Forbidden (auth failures)
63//! - **Always retry**: 408 Timeout, 429 Too Many Requests, 5xx Server Errors
//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
65//!
66//! ## FileIO Error Classification
67//!
68//! - **Always retry**: 408 Timeout, 409 Conflict, 423 Locked, 429 Too Many
69//! Requests, 5xx Server Errors
70//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
//! - **No auth special-casing**: 401/403 get no dedicated handling for file
//!   URLs; like every other status not listed above, they are not retried
73//!
74//! # Configuration
75//!
76//! - `EDGEFIRST_MAX_RETRIES`: Maximum retry attempts per request (default: 5)
77//! - `EDGEFIRST_TIMEOUT`: Total-request deadline for API calls in seconds
78//! (default: 30). Applies to the `http` client only. **Do not** increase this
79//! for large file transfers — use `EDGEFIRST_READ_TIMEOUT` instead.
80//! - `EDGEFIRST_READ_TIMEOUT`: Per-chunk idle timeout for bulk downloads in
81//! seconds (default: 120). Applies to the `bulk_http` client. Resets after
82//! every received chunk, so healthy large downloads are never interrupted.
83//! Only fires when no bytes arrive for the configured duration.
84//! - `EDGEFIRST_UPLOAD_TIMEOUT`: Per-operation total timeout for bulk uploads
85//! in seconds (default: 600). Applied per-request via `RequestBuilder::timeout`
86//! on each upload attempt (per part for multipart, per file for single uploads).
87//! Covers the send phase where `EDGEFIRST_READ_TIMEOUT` does not apply.
88//! Sized for PART_SIZE (100 MB) at ~170 KB/s minimum; increase for very slow
89//! uplinks or larger single-file uploads.
90//!
91//! **For bulk file operations**, increase retry count for better resilience:
92//! ```bash
93//! export EDGEFIRST_MAX_RETRIES=10 # More retries for S3 operations
94//! export EDGEFIRST_READ_TIMEOUT=300 # 5-minute idle timeout for very slow downlinks
95//! export EDGEFIRST_UPLOAD_TIMEOUT=900 # 15-minute per-part timeout for very slow uplinks
96//! ```
97//!
98//! # Examples
99//!
100//! ```rust
101//! use edgefirst_client::{RetryScope, classify_url};
102//!
103//! // Studio API calls
104//! assert_eq!(
105//! classify_url("https://edgefirst.studio/api"),
106//! RetryScope::StudioApi
107//! );
108//! assert_eq!(
109//! classify_url("https://test.edgefirst.studio/api/datasets.list"),
110//! RetryScope::StudioApi
111//! );
112//!
113//! // File I/O operations
114//! assert_eq!(
115//! classify_url("https://s3.amazonaws.com/bucket/file.bin"),
116//! RetryScope::FileIO
117//! );
118//! assert_eq!(
119//! classify_url("https://d123abc.cloudfront.net/dataset.zip"),
120//! RetryScope::FileIO
121//! );
122//! ```
123
124use url::Url;
125
/// Retry scope classification for URL-based retry policies.
///
/// Every request URL is sorted into exactly one of these scopes (see
/// [`classify_url`]); the scope selects which HTTP statuses are treated as
/// retryable.
///
/// The type is a field-less enum, so besides being comparable it is `Copy`
/// and `Hash`: it can be passed around by value and used as a map or set key.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum RetryScope {
    /// EdgeFirst Studio JSON-RPC API calls to `*.edgefirst.studio/api`.
    ///
    /// These calls should fail fast on authentication errors but retry
    /// server errors and transient failures.
    StudioApi,

    /// File upload/download operations to S3, CloudFront, or other endpoints.
    ///
    /// These operations experience high concurrency and should retry
    /// aggressively on all transient failures.
    FileIO,
}
144
145/// Classifies a URL to determine which retry policy to apply.
146///
147/// This function performs URL-based classification to differentiate between
148/// EdgeFirst Studio API calls and File I/O operations (S3, CloudFront, etc.).
149///
150/// # Classification Algorithm
151///
152/// 1. Parse URL using proper URL parser (handles ports, query params,
153/// fragments)
154/// 2. Check protocol: Only HTTP/HTTPS are classified as StudioApi (all others →
155/// FileIO)
156/// 3. Check host: Must be `edgefirst.studio` or `*.edgefirst.studio`
157/// 4. Check path: Must start with `/api` (exact match or `/api/...`)
158/// 5. If all conditions met → `StudioApi`, otherwise → `FileIO`
159///
160/// # Edge Cases Handled
161///
162/// - **Port numbers**: `https://test.edgefirst.studio:8080/api` → StudioApi
163/// - **Trailing slashes**: `https://edgefirst.studio/api/` → StudioApi
164/// - **Query parameters**: `https://edgefirst.studio/api?foo=bar` → StudioApi
165/// - **Subdomains**: `https://ocean.edgefirst.studio/api` → StudioApi
166/// - **Similar domains**: `https://edgefirst.studio.com/api` → FileIO (not
167/// exact match)
168/// - **Path injection**: `https://evil.com/edgefirst.studio/api` → FileIO (host
169/// mismatch)
170/// - **Non-API paths**: `https://edgefirst.studio/download` → FileIO
171///
172/// # Security
173///
174/// The function uses proper URL parsing to prevent domain spoofing attacks.
175/// Only the URL host is checked, not the path, preventing injection via
176/// `https://attacker.com/edgefirst.studio/api`.
177///
178/// # Examples
179///
180/// ```rust
181/// use edgefirst_client::{RetryScope, classify_url};
182///
183/// // Studio API URLs
184/// assert_eq!(
185/// classify_url("https://edgefirst.studio/api"),
186/// RetryScope::StudioApi
187/// );
188/// assert_eq!(
189/// classify_url("https://test.edgefirst.studio/api/datasets"),
190/// RetryScope::StudioApi
191/// );
192/// assert_eq!(
193/// classify_url("https://test.edgefirst.studio:443/api?token=abc"),
194/// RetryScope::StudioApi
195/// );
196///
197/// // File I/O URLs (S3, CloudFront, etc.)
198/// assert_eq!(
199/// classify_url("https://s3.amazonaws.com/bucket/file.bin"),
200/// RetryScope::FileIO
201/// );
202/// assert_eq!(
203/// classify_url("https://d123abc.cloudfront.net/dataset.zip"),
204/// RetryScope::FileIO
205/// );
206/// assert_eq!(
207/// classify_url("https://edgefirst.studio/download_model"),
208/// RetryScope::FileIO // Non-API path
209/// );
210/// ```
211pub fn classify_url(url: &str) -> RetryScope {
212 // Try to parse as proper URL
213 if let Ok(parsed) = Url::parse(url) {
214 // Only match HTTP/HTTPS protocols
215 if parsed.scheme() != "http" && parsed.scheme() != "https" {
216 return RetryScope::FileIO;
217 }
218
219 if let Some(host) = parsed.host_str() {
220 let host_matches = host == "edgefirst.studio" || host.ends_with(".edgefirst.studio");
221
222 // Path must be exactly "/api" or start with "/api/" (not "/apis" etc.)
223 let path = parsed.path();
224 let path_is_api = path == "/api" || path.starts_with("/api/");
225
226 if host_matches && path_is_api {
227 return RetryScope::StudioApi;
228 }
229 }
230 }
231
232 RetryScope::FileIO
233}
234
235/// Creates a retry policy with URL-based classification.
236///
237/// This function builds a reqwest retry policy that inspects each request URL
238/// and applies different error classification rules based on whether it's a
239/// Studio API call or a File I/O operation.
240///
241/// # Retry Configuration
242///
243/// - **Max retries**: Configurable via `EDGEFIRST_MAX_RETRIES` (default: 5)
244/// - **Timeout**: Configurable via `EDGEFIRST_TIMEOUT` (default: 30 seconds)
245///
246/// # Error Classification by Scope
247///
248/// ## StudioApi (*.edgefirst.studio/api)
249///
250/// Optimized for fast-fail on authentication errors:
251///
252/// | HTTP Status | Action | Rationale |
253/// |-------------|--------|-----------|
254/// | 401, 403 | Never retry | Authentication failure - user action required |
255/// | 408, 429 | Retry | Timeout, rate limiting - transient |
256/// | 5xx | Retry | Server error - may recover |
257/// | Connection errors | Retry | Network issues - transient |
258///
259/// ## FileIO (S3, CloudFront, etc.)
260///
261/// Optimized for robustness under high concurrency:
262///
263/// | HTTP Status | Action | Rationale |
264/// |-------------|--------|-----------|
265/// | 408, 429 | Retry | Timeout, rate limiting - common with S3 |
266/// | 409, 423 | Retry | Conflict, locked - S3 eventual consistency |
267/// | 5xx | Retry | Server error - S3 transient issues |
268/// | Connection errors | Retry | Network issues - common in parallel uploads |
269///
270/// # Usage Recommendations
271///
272/// **For dataset downloads/uploads** (many concurrent S3 operations):
273/// ```bash
274/// export EDGEFIRST_MAX_RETRIES=10 # More retries for robustness
275/// export EDGEFIRST_READ_TIMEOUT=300 # Longer idle timeout for slow links
276/// export EDGEFIRST_UPLOAD_TIMEOUT=900 # Longer per-part timeout for slow uplinks
277/// ```
278///
279/// **For testing** (fast failure detection):
280/// ```bash
281/// export EDGEFIRST_MAX_RETRIES=1 # Minimal retries
282/// export EDGEFIRST_TIMEOUT=10 # Quick API call timeout
283/// ```
284///
285/// # Implementation Notes
286///
287/// Due to reqwest retry API limitations, both StudioApi and FileIO use the
288/// same `max_retries_per_request` value. The differentiation is in error
289/// classification only (which errors trigger retries), not retry count.
290///
291/// For operations requiring different retry counts, use separate Client
292/// instances with different `EDGEFIRST_MAX_RETRIES` configuration.
293pub fn create_retry_policy() -> reqwest::retry::Builder {
294 let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES")
295 .ok()
296 .and_then(|s| s.parse().ok())
297 .unwrap_or(5);
298
299 // Use wildcard host scope since we do URL inspection in classify_fn
300 reqwest::retry::for_host("*")
301 .max_retries_per_request(max_retries)
302 .classify_fn(|req_rep| {
303 let url = req_rep.uri().to_string();
304
305 match classify_url(&url) {
306 RetryScope::StudioApi => {
307 // Studio API: Never retry auth failures, retry server errors
308 match req_rep.status() {
309 Some(status) => match status.as_u16() {
310 401 | 403 => req_rep.success(), // Auth failures - don't retry
311 429 | 408 | 500..=599 => req_rep.retryable(),
312 _ => req_rep.success(),
313 },
314 // No status code means connection error, timeout, or other transport
315 // failure These are safe to retry for API calls
316 None if req_rep.error().is_some() => req_rep.retryable(),
317 None => req_rep.success(),
318 }
319 }
320 RetryScope::FileIO => {
321 // File I/O: Retry all transient errors
322 match req_rep.status() {
323 Some(status) => match status.as_u16() {
324 429 | 408 | 500..=599 | 409 | 423 => req_rep.retryable(),
325 _ => req_rep.success(),
326 },
327 None if req_rep.error().is_some() => req_rep.retryable(),
328 None => req_rep.success(),
329 }
330 }
331 }
332 })
333}
334
335pub fn log_retry_configuration() {
336 let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES").unwrap_or_else(|_| "5".to_string());
337 let timeout = std::env::var("EDGEFIRST_TIMEOUT").unwrap_or_else(|_| "30".to_string());
338 let read_timeout =
339 std::env::var("EDGEFIRST_READ_TIMEOUT").unwrap_or_else(|_| "120".to_string());
340 let upload_timeout =
341 std::env::var("EDGEFIRST_UPLOAD_TIMEOUT").unwrap_or_else(|_| "600".to_string());
342 log::debug!(
343 "Retry configuration - max_retries={}, api_timeout={}s, bulk_read_timeout={}s, upload_timeout={}s",
344 max_retries,
345 timeout,
346 read_timeout,
347 upload_timeout
348 );
349}
350
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every URL in `urls` is classified as `expected`, naming
    /// the offending URL when an assertion fails.
    fn assert_scope(expected: RetryScope, urls: &[&str]) {
        for url in urls {
            assert_eq!(
                classify_url(url),
                expected,
                "unexpected retry scope for {}",
                url
            );
        }
    }

    #[test]
    fn test_classify_url_studio_api() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                // Base production URL
                "https://edgefirst.studio/api",
                // Server-specific instances
                "https://test.edgefirst.studio/api",
                "https://stage.edgefirst.studio/api",
                "https://saas.edgefirst.studio/api",
                "https://ocean.edgefirst.studio/api",
                // API endpoints with paths
                "https://test.edgefirst.studio/api/datasets",
                "https://stage.edgefirst.studio/api/auth.login",
            ],
        );
    }

    #[test]
    fn test_classify_url_file_io() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // S3 URLs for file operations
                "https://s3.amazonaws.com/bucket/file.bin",
                // CloudFront URLs for file distribution
                "https://d123abc.cloudfront.net/file.bin",
                // Non-API paths on edgefirst.studio domain
                "https://edgefirst.studio/docs",
                "https://test.edgefirst.studio/download_model",
                "https://stage.edgefirst.studio/download_checkpoint",
                // Generic download URLs
                "https://example.com/download",
            ],
        );
    }

    #[test]
    fn test_classify_url_ignores_port_query_fragment_and_case() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                // Plain HTTP is accepted as well as HTTPS
                "http://edgefirst.studio/api",
                // Explicit ports do not affect the host check
                "https://test.edgefirst.studio:8080/api",
                "https://test.edgefirst.studio:443/api?token=abc",
                // Trailing slash, query string and fragment
                "https://edgefirst.studio/api/",
                "https://edgefirst.studio/api?foo=bar",
                "https://edgefirst.studio/api#section",
                // Host names are normalised to lower case by the URL parser
                "https://EdgeFirst.Studio/api",
            ],
        );
    }

    #[test]
    fn test_classify_url_rejects_lookalike_hosts_and_paths() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // Similar-looking domains
                "https://edgefirst.studio.com/api",
                "https://notedgefirst.studio/api",
                // Studio domain smuggled into the path of another host
                "https://evil.com/edgefirst.studio/api",
                // Paths that merely share the `/api` prefix
                "https://edgefirst.studio/apis",
                "https://edgefirst.studio/api-v2",
                // Root path
                "https://edgefirst.studio/",
            ],
        );
    }

    #[test]
    fn test_classify_url_non_http_and_invalid_input() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // Non-HTTP scheme on the Studio domain
                "ftp://edgefirst.studio/api",
                // Input that does not parse as a URL
                "not a url",
                "",
            ],
        );
    }
}