edgefirst_client/retry.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright © 2025 Au-Zone Technologies. All Rights Reserved.
3
4//! Retry policies with URL-based classification for EdgeFirst Studio Client.
5//!
6//! # Overview
7//!
8//! This module implements intelligent retry logic that classifies requests into
9//! two categories:
10//!
11//! - **StudioApi**: EdgeFirst Studio JSON-RPC API calls
12//! (`*.edgefirst.studio/api`)
13//! - **FileIO**: File upload/download operations (AWS S3 pre-signed URLs,
14//! CloudFront, etc.)
15//!
16//! # Motivation
17//!
18//! Different types of operations have different failure characteristics and
19//! retry requirements:
20//!
21//! ## Studio API Requests
22//!
23//! - **Low concurrency**: Sequential JSON-RPC method calls
24//! - **Fast-fail desired**: Authentication failures should not retry
25//! - **Predictable errors**: HTTP 401/403 indicate auth issues, not transient
26//! failures
27//! - **User experience**: Users expect quick feedback on invalid credentials
28//!
29//! ## File I/O Operations (S3, CloudFront)
30//!
31//! - **High concurrency**: Parallel uploads/downloads of dataset files (100+
32//! files)
33//! - **Transient failures common**: S3 rate limiting, network congestion,
34//! timeouts
35//! - **Retry-safe**: Idempotent operations (pre-signed URLs, multipart uploads)
36//! - **Robustness critical**: Dataset operations must complete reliably despite
37//! temporary issues
38//!
39//! # Classification Strategy
40//!
41//! URLs are classified by inspecting the host and path:
42//!
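//! - **StudioApi**: `http(s)://edgefirst.studio/api` or
//!   `http(s)://*.edgefirst.studio/api`, plus any sub-path `/api/...` (host must
//!   be the domain or one of its subdomains; `/apis` and similar do not match)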
45//! - **FileIO**: Everything else (S3, CloudFront, or any non-API Studio path)
46//!
47//! # Retry Behavior
48//!
49//! Both scopes use the same configurable retry count (`EDGEFIRST_MAX_RETRIES`,
50//! default: 3), but differ in error classification:
51//!
52//! ## StudioApi Error Classification
53//!
54//! - **Never retry**: 401 Unauthorized, 403 Forbidden (auth failures)
55//! - **Always retry**: 408 Timeout, 429 Too Many Requests, 5xx Server Errors
//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
57//!
58//! ## FileIO Error Classification
59//!
60//! - **Always retry**: 408 Timeout, 409 Conflict, 423 Locked, 429 Too Many
61//! Requests, 5xx Server Errors
62//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
//! - **No auth special-casing**: 401/403 are not singled out for FileIO; like
//!   every other status outside the list above, they are treated as final and
//!   are not retried
65//!
66//! # Configuration
67//!
68//! - `EDGEFIRST_MAX_RETRIES`: Maximum retry attempts per request (default: 3)
69//! - `EDGEFIRST_TIMEOUT`: Request timeout in seconds (default: 30)
70//!
71//! **For bulk file operations**, increase retry count for better resilience:
72//! ```bash
73//! export EDGEFIRST_MAX_RETRIES=10 # More retries for S3 operations
74//! export EDGEFIRST_TIMEOUT=60 # Longer timeout for large files
75//! ```
76//!
77//! # Examples
78//!
79//! ```rust
80//! use edgefirst_client::{RetryScope, classify_url};
81//!
82//! // Studio API calls
83//! assert_eq!(
84//! classify_url("https://edgefirst.studio/api"),
85//! RetryScope::StudioApi
86//! );
87//! assert_eq!(
88//! classify_url("https://test.edgefirst.studio/api/datasets.list"),
89//! RetryScope::StudioApi
90//! );
91//!
92//! // File I/O operations
93//! assert_eq!(
94//! classify_url("https://s3.amazonaws.com/bucket/file.bin"),
95//! RetryScope::FileIO
96//! );
97//! assert_eq!(
98//! classify_url("https://d123abc.cloudfront.net/dataset.zip"),
99//! RetryScope::FileIO
100//! );
101//! ```
102
103use url::Url;
104
/// Retry scope classification for URL-based retry policies.
///
/// Tells the retry classifier whether a request targets the Studio JSON-RPC
/// API or is a file transfer, so that each category can apply its own rules
/// about which HTTP statuses are worth retrying.
///
/// The enum carries no data, so it is `Copy` and can be passed by value; it is
/// also `Hash` so it can be used as a map or set key (for example to keep
/// per-scope counters).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum RetryScope {
    /// EdgeFirst Studio JSON-RPC API calls to `*.edgefirst.studio/api`.
    ///
    /// These calls should fail fast on authentication errors but retry
    /// server errors and transient failures.
    StudioApi,

    /// File upload/download operations to S3, CloudFront, or other endpoints.
    ///
    /// These operations experience high concurrency and should retry
    /// aggressively on all transient failures.
    FileIO,
}
123
124/// Classifies a URL to determine which retry policy to apply.
125///
126/// This function performs URL-based classification to differentiate between
127/// EdgeFirst Studio API calls and File I/O operations (S3, CloudFront, etc.).
128///
129/// # Classification Algorithm
130///
131/// 1. Parse URL using proper URL parser (handles ports, query params,
132/// fragments)
133/// 2. Check protocol: Only HTTP/HTTPS are classified as StudioApi (all others →
134/// FileIO)
135/// 3. Check host: Must be `edgefirst.studio` or `*.edgefirst.studio`
136/// 4. Check path: Must start with `/api` (exact match or `/api/...`)
137/// 5. If all conditions met → `StudioApi`, otherwise → `FileIO`
138///
139/// # Edge Cases Handled
140///
141/// - **Port numbers**: `https://test.edgefirst.studio:8080/api` → StudioApi
142/// - **Trailing slashes**: `https://edgefirst.studio/api/` → StudioApi
143/// - **Query parameters**: `https://edgefirst.studio/api?foo=bar` → StudioApi
144/// - **Subdomains**: `https://ocean.edgefirst.studio/api` → StudioApi
145/// - **Similar domains**: `https://edgefirst.studio.com/api` → FileIO (not
146/// exact match)
147/// - **Path injection**: `https://evil.com/edgefirst.studio/api` → FileIO (host
148/// mismatch)
149/// - **Non-API paths**: `https://edgefirst.studio/download` → FileIO
150///
151/// # Security
152///
153/// The function uses proper URL parsing to prevent domain spoofing attacks.
154/// Only the URL host is checked, not the path, preventing injection via
155/// `https://attacker.com/edgefirst.studio/api`.
156///
157/// # Examples
158///
159/// ```rust
160/// use edgefirst_client::{RetryScope, classify_url};
161///
162/// // Studio API URLs
163/// assert_eq!(
164/// classify_url("https://edgefirst.studio/api"),
165/// RetryScope::StudioApi
166/// );
167/// assert_eq!(
168/// classify_url("https://test.edgefirst.studio/api/datasets"),
169/// RetryScope::StudioApi
170/// );
171/// assert_eq!(
172/// classify_url("https://test.edgefirst.studio:443/api?token=abc"),
173/// RetryScope::StudioApi
174/// );
175///
176/// // File I/O URLs (S3, CloudFront, etc.)
177/// assert_eq!(
178/// classify_url("https://s3.amazonaws.com/bucket/file.bin"),
179/// RetryScope::FileIO
180/// );
181/// assert_eq!(
182/// classify_url("https://d123abc.cloudfront.net/dataset.zip"),
183/// RetryScope::FileIO
184/// );
185/// assert_eq!(
186/// classify_url("https://edgefirst.studio/download_model"),
187/// RetryScope::FileIO // Non-API path
188/// );
189/// ```
190pub fn classify_url(url: &str) -> RetryScope {
191 // Try to parse as proper URL
192 if let Ok(parsed) = Url::parse(url) {
193 // Only match HTTP/HTTPS protocols
194 if parsed.scheme() != "http" && parsed.scheme() != "https" {
195 return RetryScope::FileIO;
196 }
197
198 if let Some(host) = parsed.host_str() {
199 let host_matches = host == "edgefirst.studio" || host.ends_with(".edgefirst.studio");
200
201 // Path must be exactly "/api" or start with "/api/" (not "/apis" etc.)
202 let path = parsed.path();
203 let path_is_api = path == "/api" || path.starts_with("/api/");
204
205 if host_matches && path_is_api {
206 return RetryScope::StudioApi;
207 }
208 }
209 }
210
211 RetryScope::FileIO
212}
213
214/// Creates a retry policy with URL-based classification.
215///
216/// This function builds a reqwest retry policy that inspects each request URL
217/// and applies different error classification rules based on whether it's a
218/// Studio API call or a File I/O operation.
219///
220/// # Retry Configuration
221///
222/// - **Max retries**: Configurable via `EDGEFIRST_MAX_RETRIES` (default: 3)
223/// - **Timeout**: Configurable via `EDGEFIRST_TIMEOUT` (default: 30 seconds)
224///
225/// # Error Classification by Scope
226///
227/// ## StudioApi (*.edgefirst.studio/api)
228///
229/// Optimized for fast-fail on authentication errors:
230///
231/// | HTTP Status | Action | Rationale |
232/// |-------------|--------|-----------|
233/// | 401, 403 | Never retry | Authentication failure - user action required |
234/// | 408, 429 | Retry | Timeout, rate limiting - transient |
235/// | 5xx | Retry | Server error - may recover |
236/// | Connection errors | Retry | Network issues - transient |
237///
238/// ## FileIO (S3, CloudFront, etc.)
239///
240/// Optimized for robustness under high concurrency:
241///
242/// | HTTP Status | Action | Rationale |
243/// |-------------|--------|-----------|
244/// | 408, 429 | Retry | Timeout, rate limiting - common with S3 |
245/// | 409, 423 | Retry | Conflict, locked - S3 eventual consistency |
246/// | 5xx | Retry | Server error - S3 transient issues |
247/// | Connection errors | Retry | Network issues - common in parallel uploads |
248///
249/// # Usage Recommendations
250///
251/// **For dataset downloads/uploads** (many concurrent S3 operations):
252/// ```bash
253/// export EDGEFIRST_MAX_RETRIES=10 # More retries for robustness
254/// export EDGEFIRST_TIMEOUT=60 # Longer timeout for large files
255/// ```
256///
257/// **For testing** (fast failure detection):
258/// ```bash
259/// export EDGEFIRST_MAX_RETRIES=1 # Minimal retries
260/// export EDGEFIRST_TIMEOUT=10 # Quick timeout
261/// ```
262///
263/// # Implementation Notes
264///
265/// Due to reqwest retry API limitations, both StudioApi and FileIO use the
266/// same `max_retries_per_request` value. The differentiation is in error
267/// classification only (which errors trigger retries), not retry count.
268///
269/// For operations requiring different retry counts, use separate Client
270/// instances with different `EDGEFIRST_MAX_RETRIES` configuration.
271pub fn create_retry_policy() -> reqwest::retry::Builder {
272 let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES")
273 .ok()
274 .and_then(|s| s.parse().ok())
275 .unwrap_or(3); // Reduced from 5 to 3 for faster failures
276
277 // Use wildcard host scope since we do URL inspection in classify_fn
278 reqwest::retry::for_host("*")
279 .max_retries_per_request(max_retries)
280 .classify_fn(|req_rep| {
281 let url = req_rep.uri().to_string();
282
283 match classify_url(&url) {
284 RetryScope::StudioApi => {
285 // Studio API: Never retry auth failures, retry server errors
286 match req_rep.status() {
287 Some(status) => match status.as_u16() {
288 401 | 403 => req_rep.success(), // Auth failures - don't retry
289 429 | 408 | 500..=599 => req_rep.retryable(),
290 _ => req_rep.success(),
291 },
292 // No status code means connection error, timeout, or other transport
293 // failure These are safe to retry for API calls
294 None if req_rep.error().is_some() => req_rep.retryable(),
295 None => req_rep.success(),
296 }
297 }
298 RetryScope::FileIO => {
299 // File I/O: Retry all transient errors
300 match req_rep.status() {
301 Some(status) => match status.as_u16() {
302 429 | 408 | 500..=599 | 409 | 423 => req_rep.retryable(),
303 _ => req_rep.success(),
304 },
305 None if req_rep.error().is_some() => req_rep.retryable(),
306 None => req_rep.success(),
307 }
308 }
309 }
310 })
311}
312
313pub fn log_retry_configuration() {
314 let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES").unwrap_or_else(|_| "3".to_string());
315 let timeout = std::env::var("EDGEFIRST_TIMEOUT").unwrap_or_else(|_| "30".to_string());
316 log::debug!(
317 "Retry configuration - max_retries={}, timeout={}s",
318 max_retries,
319 timeout
320 );
321}
322
#[cfg(test)]
mod tests {
    use super::*;

    /// URLs on the Studio domain (or a subdomain) under `/api` are API calls.
    #[test]
    fn test_classify_url_studio_api() {
        let api_urls = [
            // Base production URL
            "https://edgefirst.studio/api",
            // Server-specific instances
            "https://test.edgefirst.studio/api",
            "https://stage.edgefirst.studio/api",
            "https://saas.edgefirst.studio/api",
            "https://ocean.edgefirst.studio/api",
            // API endpoints with paths
            "https://test.edgefirst.studio/api/datasets",
            "https://stage.edgefirst.studio/api/auth.login",
        ];

        for url in api_urls {
            assert_eq!(
                classify_url(url),
                RetryScope::StudioApi,
                "expected StudioApi for {url}"
            );
        }
    }

    /// Everything that is not a Studio API call is treated as file I/O.
    #[test]
    fn test_classify_url_file_io() {
        let file_urls = [
            // S3 URLs for file operations
            "https://s3.amazonaws.com/bucket/file.bin",
            // CloudFront URLs for file distribution
            "https://d123abc.cloudfront.net/file.bin",
            // Non-API paths on edgefirst.studio domain
            "https://edgefirst.studio/docs",
            "https://test.edgefirst.studio/download_model",
            "https://stage.edgefirst.studio/download_checkpoint",
            // Generic download URLs
            "https://example.com/download",
        ];

        for url in file_urls {
            assert_eq!(
                classify_url(url),
                RetryScope::FileIO,
                "expected FileIO for {url}"
            );
        }
    }

    /// Boundary cases around host, path, and scheme matching.
    #[test]
    fn test_classify_url_edge_cases() {
        let cases = [
            // Ports, trailing slashes, and query strings do not affect matching.
            ("https://test.edgefirst.studio:8080/api", RetryScope::StudioApi),
            ("https://edgefirst.studio/api/", RetryScope::StudioApi),
            ("https://edgefirst.studio/api?foo=bar", RetryScope::StudioApi),
            // Plain HTTP is accepted alongside HTTPS.
            ("http://edgefirst.studio/api", RetryScope::StudioApi),
            // Look-alike hosts must not match.
            ("https://edgefirst.studio.com/api", RetryScope::FileIO),
            ("https://notedgefirst.studio/api", RetryScope::FileIO),
            // The Studio domain appearing in the path of another host.
            ("https://evil.com/edgefirst.studio/api", RetryScope::FileIO),
            // "/api" must be a whole path segment.
            ("https://edgefirst.studio/apis", RetryScope::FileIO),
            ("https://edgefirst.studio/api-v2/datasets", RetryScope::FileIO),
            // Non-HTTP schemes are never Studio API calls.
            ("ftp://edgefirst.studio/api", RetryScope::FileIO),
            // Input that is not an absolute URL falls back to FileIO.
            ("not a url", RetryScope::FileIO),
            ("", RetryScope::FileIO),
        ];

        for (url, expected) in cases {
            assert_eq!(classify_url(url), expected, "unexpected scope for {url:?}");
        }
    }
}
398}