edgefirst_client/
retry.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright © 2025 Au-Zone Technologies. All Rights Reserved.
3
4//! Retry policies with URL-based classification for EdgeFirst Studio Client.
5//!
6//! # Overview
7//!
8//! This module implements intelligent retry logic that classifies requests into
9//! two categories:
10//!
11//! - **StudioApi**: EdgeFirst Studio JSON-RPC API calls
12//!   (`*.edgefirst.studio/api`)
13//! - **FileIO**: File upload/download operations (AWS S3 pre-signed URLs,
14//!   CloudFront, etc.)
15//!
16//! # Motivation
17//!
18//! Different types of operations have different failure characteristics and
19//! retry requirements:
20//!
21//! ## Studio API Requests
22//!
23//! - **Low concurrency**: Sequential JSON-RPC method calls
24//! - **Fast-fail desired**: Authentication failures should not retry
25//! - **Predictable errors**: HTTP 401/403 indicate auth issues, not transient
26//!   failures
27//! - **User experience**: Users expect quick feedback on invalid credentials
28//!
29//! ## File I/O Operations (S3, CloudFront)
30//!
31//! - **High concurrency**: Parallel uploads/downloads of dataset files (100+
32//!   files)
33//! - **Transient failures common**: S3 rate limiting, network congestion,
34//!   timeouts
35//! - **Retry-safe**: Idempotent operations (pre-signed URLs, multipart uploads)
36//! - **Robustness critical**: Dataset operations must complete reliably despite
37//!   temporary issues
38//!
39//! # Classification Strategy
40//!
41//! URLs are classified by inspecting the host and path:
42//!
//! - **StudioApi**: `http(s)://edgefirst.studio` or any `*.edgefirst.studio`
//!   subdomain, with a path that is exactly `/api` or starts with `/api/`
45//! - **FileIO**: Everything else (S3, CloudFront, or any non-API Studio path)
46//!
47//! # Retry Behavior
48//!
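//! Both scopes use the same configurable retry count (`EDGEFIRST_MAX_RETRIES`,
//! default: 3), but differ in which errors they retry. The per-scope rules are
//! listed under "StudioApi Error Classification" and "FileIO Error
//! Classification" below.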
51//!
52//! # Environment Variables
53//!
54//! - `EDGEFIRST_MAX_RETRIES`: Maximum number of retries for failed requests
55//!   (default: 3)
56//! - `MAX_TASKS`: Maximum concurrent upload/download tasks (default: half of
57//!   CPU cores, min 2, max 8). Lower values (2-8) work better for large files
58//!   to avoid timeouts. Higher values (16-32) are better for many small files.
59//!
60//! ## StudioApi Error Classification
61//!
62//! - **Never retry**: 401 Unauthorized, 403 Forbidden (auth failures)
63//! - **Always retry**: 408 Timeout, 429 Too Many Requests, 5xx Server Errors
//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
65//!
66//! ## FileIO Error Classification
67//!
68//! - **Always retry**: 408 Timeout, 409 Conflict, 423 Locked, 429 Too Many
69//!   Requests, 5xx Server Errors
70//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
//! - **No auth special case**: 401/403 are handled like every other unlisted
//!   status and are not retried
73//!
74//! # Configuration
75//!
76//! - `EDGEFIRST_MAX_RETRIES`: Maximum retry attempts per request (default: 3)
77//! - `EDGEFIRST_TIMEOUT`: Request timeout in seconds (default: 30)
78//!
79//! **For bulk file operations**, increase retry count for better resilience:
80//! ```bash
81//! export EDGEFIRST_MAX_RETRIES=10  # More retries for S3 operations
82//! export EDGEFIRST_TIMEOUT=60      # Longer timeout for large files
83//! ```
84//!
85//! # Examples
86//!
87//! ```rust
88//! use edgefirst_client::{RetryScope, classify_url};
89//!
90//! // Studio API calls
91//! assert_eq!(
92//!     classify_url("https://edgefirst.studio/api"),
93//!     RetryScope::StudioApi
94//! );
95//! assert_eq!(
96//!     classify_url("https://test.edgefirst.studio/api/datasets.list"),
97//!     RetryScope::StudioApi
98//! );
99//!
100//! // File I/O operations
101//! assert_eq!(
102//!     classify_url("https://s3.amazonaws.com/bucket/file.bin"),
103//!     RetryScope::FileIO
104//! );
105//! assert_eq!(
106//!     classify_url("https://d123abc.cloudfront.net/dataset.zip"),
107//!     RetryScope::FileIO
108//! );
109//! ```
110
111use url::Url;
112
/// Which retry policy applies to a request, decided from the request URL.
///
/// [`classify_url`] produces this value and [`create_retry_policy`] uses it to
/// pick the set of HTTP statuses that are worth retrying.
///
/// The enum is a payload-free tag, so it is `Copy` and `Hash`: it can be
/// passed by value and used as a key in maps and sets.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum RetryScope {
    /// EdgeFirst Studio JSON-RPC API call (`edgefirst.studio` or one of its
    /// subdomains, under the `/api` path).
    ///
    /// Authentication failures are reported immediately instead of being
    /// retried; timeouts, rate limiting and server errors are retried.
    StudioApi,

    /// File upload/download traffic: S3, CloudFront, or any other URL that is
    /// not a Studio API endpoint.
    ///
    /// These transfers run with high concurrency, so a wider set of transient
    /// failures is retried.
    FileIO,
}
131
132/// Classifies a URL to determine which retry policy to apply.
133///
134/// This function performs URL-based classification to differentiate between
135/// EdgeFirst Studio API calls and File I/O operations (S3, CloudFront, etc.).
136///
137/// # Classification Algorithm
138///
139/// 1. Parse URL using proper URL parser (handles ports, query params,
140///    fragments)
141/// 2. Check protocol: Only HTTP/HTTPS are classified as StudioApi (all others →
142///    FileIO)
143/// 3. Check host: Must be `edgefirst.studio` or `*.edgefirst.studio`
144/// 4. Check path: Must start with `/api` (exact match or `/api/...`)
145/// 5. If all conditions met → `StudioApi`, otherwise → `FileIO`
146///
147/// # Edge Cases Handled
148///
149/// - **Port numbers**: `https://test.edgefirst.studio:8080/api` → StudioApi
150/// - **Trailing slashes**: `https://edgefirst.studio/api/` → StudioApi
151/// - **Query parameters**: `https://edgefirst.studio/api?foo=bar` → StudioApi
152/// - **Subdomains**: `https://ocean.edgefirst.studio/api` → StudioApi
153/// - **Similar domains**: `https://edgefirst.studio.com/api` → FileIO (not
154///   exact match)
155/// - **Path injection**: `https://evil.com/edgefirst.studio/api` → FileIO (host
156///   mismatch)
157/// - **Non-API paths**: `https://edgefirst.studio/download` → FileIO
158///
159/// # Security
160///
161/// The function uses proper URL parsing to prevent domain spoofing attacks.
162/// Only the URL host is checked, not the path, preventing injection via
163/// `https://attacker.com/edgefirst.studio/api`.
164///
165/// # Examples
166///
167/// ```rust
168/// use edgefirst_client::{RetryScope, classify_url};
169///
170/// // Studio API URLs
171/// assert_eq!(
172///     classify_url("https://edgefirst.studio/api"),
173///     RetryScope::StudioApi
174/// );
175/// assert_eq!(
176///     classify_url("https://test.edgefirst.studio/api/datasets"),
177///     RetryScope::StudioApi
178/// );
179/// assert_eq!(
180///     classify_url("https://test.edgefirst.studio:443/api?token=abc"),
181///     RetryScope::StudioApi
182/// );
183///
184/// // File I/O URLs (S3, CloudFront, etc.)
185/// assert_eq!(
186///     classify_url("https://s3.amazonaws.com/bucket/file.bin"),
187///     RetryScope::FileIO
188/// );
189/// assert_eq!(
190///     classify_url("https://d123abc.cloudfront.net/dataset.zip"),
191///     RetryScope::FileIO
192/// );
193/// assert_eq!(
194///     classify_url("https://edgefirst.studio/download_model"),
195///     RetryScope::FileIO // Non-API path
196/// );
197/// ```
198pub fn classify_url(url: &str) -> RetryScope {
199    // Try to parse as proper URL
200    if let Ok(parsed) = Url::parse(url) {
201        // Only match HTTP/HTTPS protocols
202        if parsed.scheme() != "http" && parsed.scheme() != "https" {
203            return RetryScope::FileIO;
204        }
205
206        if let Some(host) = parsed.host_str() {
207            let host_matches = host == "edgefirst.studio" || host.ends_with(".edgefirst.studio");
208
209            // Path must be exactly "/api" or start with "/api/" (not "/apis" etc.)
210            let path = parsed.path();
211            let path_is_api = path == "/api" || path.starts_with("/api/");
212
213            if host_matches && path_is_api {
214                return RetryScope::StudioApi;
215            }
216        }
217    }
218
219    RetryScope::FileIO
220}
221
222/// Creates a retry policy with URL-based classification.
223///
224/// This function builds a reqwest retry policy that inspects each request URL
225/// and applies different error classification rules based on whether it's a
226/// Studio API call or a File I/O operation.
227///
228/// # Retry Configuration
229///
230/// - **Max retries**: Configurable via `EDGEFIRST_MAX_RETRIES` (default: 3)
231/// - **Timeout**: Configurable via `EDGEFIRST_TIMEOUT` (default: 30 seconds)
232///
233/// # Error Classification by Scope
234///
235/// ## StudioApi (*.edgefirst.studio/api)
236///
237/// Optimized for fast-fail on authentication errors:
238///
239/// | HTTP Status | Action | Rationale |
240/// |-------------|--------|-----------|
241/// | 401, 403 | Never retry | Authentication failure - user action required |
242/// | 408, 429 | Retry | Timeout, rate limiting - transient |
243/// | 5xx | Retry | Server error - may recover |
244/// | Connection errors | Retry | Network issues - transient |
245///
246/// ## FileIO (S3, CloudFront, etc.)
247///
248/// Optimized for robustness under high concurrency:
249///
250/// | HTTP Status | Action | Rationale |
251/// |-------------|--------|-----------|
252/// | 408, 429 | Retry | Timeout, rate limiting - common with S3 |
253/// | 409, 423 | Retry | Conflict, locked - S3 eventual consistency |
254/// | 5xx | Retry | Server error - S3 transient issues |
255/// | Connection errors | Retry | Network issues - common in parallel uploads |
256///
257/// # Usage Recommendations
258///
259/// **For dataset downloads/uploads** (many concurrent S3 operations):
260/// ```bash
261/// export EDGEFIRST_MAX_RETRIES=10  # More retries for robustness
262/// export EDGEFIRST_TIMEOUT=60      # Longer timeout for large files
263/// ```
264///
265/// **For testing** (fast failure detection):
266/// ```bash
267/// export EDGEFIRST_MAX_RETRIES=1   # Minimal retries
268/// export EDGEFIRST_TIMEOUT=10      # Quick timeout
269/// ```
270///
271/// # Implementation Notes
272///
273/// Due to reqwest retry API limitations, both StudioApi and FileIO use the
274/// same `max_retries_per_request` value. The differentiation is in error
275/// classification only (which errors trigger retries), not retry count.
276///
277/// For operations requiring different retry counts, use separate Client
278/// instances with different `EDGEFIRST_MAX_RETRIES` configuration.
279pub fn create_retry_policy() -> reqwest::retry::Builder {
280    let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES")
281        .ok()
282        .and_then(|s| s.parse().ok())
283        .unwrap_or(3); // Reduced from 5 to 3 for faster failures
284
285    // Use wildcard host scope since we do URL inspection in classify_fn
286    reqwest::retry::for_host("*")
287        .max_retries_per_request(max_retries)
288        .classify_fn(|req_rep| {
289            let url = req_rep.uri().to_string();
290
291            match classify_url(&url) {
292                RetryScope::StudioApi => {
293                    // Studio API: Never retry auth failures, retry server errors
294                    match req_rep.status() {
295                        Some(status) => match status.as_u16() {
296                            401 | 403 => req_rep.success(), // Auth failures - don't retry
297                            429 | 408 | 500..=599 => req_rep.retryable(),
298                            _ => req_rep.success(),
299                        },
300                        // No status code means connection error, timeout, or other transport
301                        // failure These are safe to retry for API calls
302                        None if req_rep.error().is_some() => req_rep.retryable(),
303                        None => req_rep.success(),
304                    }
305                }
306                RetryScope::FileIO => {
307                    // File I/O: Retry all transient errors
308                    match req_rep.status() {
309                        Some(status) => match status.as_u16() {
310                            429 | 408 | 500..=599 | 409 | 423 => req_rep.retryable(),
311                            _ => req_rep.success(),
312                        },
313                        None if req_rep.error().is_some() => req_rep.retryable(),
314                        None => req_rep.success(),
315                    }
316                }
317            }
318        })
319}
320
321pub fn log_retry_configuration() {
322    let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES").unwrap_or_else(|_| "3".to_string());
323    let timeout = std::env::var("EDGEFIRST_TIMEOUT").unwrap_or_else(|_| "30".to_string());
324    log::debug!(
325        "Retry configuration - max_retries={}, timeout={}s",
326        max_retries,
327        timeout
328    );
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every URL in `urls` classifies as `expected`, naming the
    /// offending URL when an assertion fails.
    fn assert_scope(expected: RetryScope, urls: &[&str]) {
        for url in urls {
            assert_eq!(
                classify_url(url),
                expected,
                "unexpected retry scope for {:?}",
                url
            );
        }
    }

    #[test]
    fn test_classify_url_studio_api() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                // Base production URL
                "https://edgefirst.studio/api",
                // Server-specific instances
                "https://test.edgefirst.studio/api",
                "https://stage.edgefirst.studio/api",
                "https://saas.edgefirst.studio/api",
                "https://ocean.edgefirst.studio/api",
                // API endpoints with paths
                "https://test.edgefirst.studio/api/datasets",
                "https://stage.edgefirst.studio/api/auth.login",
            ],
        );
    }

    #[test]
    fn test_classify_url_file_io() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // S3 URLs for file operations
                "https://s3.amazonaws.com/bucket/file.bin",
                // CloudFront URLs for file distribution
                "https://d123abc.cloudfront.net/file.bin",
                // Non-API paths on edgefirst.studio domain
                "https://edgefirst.studio/docs",
                "https://test.edgefirst.studio/download_model",
                "https://stage.edgefirst.studio/download_checkpoint",
                // Generic download URLs
                "https://example.com/download",
            ],
        );
    }

    #[test]
    fn test_classify_url_ignores_port_query_and_trailing_slash() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                "https://test.edgefirst.studio:8080/api",
                "https://test.edgefirst.studio:443/api?token=abc",
                "https://edgefirst.studio/api/",
                "https://edgefirst.studio/api?foo=bar",
                "http://edgefirst.studio/api",
            ],
        );
    }

    #[test]
    fn test_classify_url_rejects_lookalikes() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // Studio domain used as a prefix of a different domain
                "https://edgefirst.studio.com/api",
                // Studio domain as a suffix without a label boundary
                "https://notedgefirst.studio/api",
                // Studio domain smuggled into the path
                "https://evil.com/edgefirst.studio/api",
                // Path that merely starts with the letters "api"
                "https://edgefirst.studio/apis",
                "https://edgefirst.studio/api-docs",
            ],
        );
    }

    #[test]
    fn test_classify_url_non_http_and_unparseable() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // Non-HTTP scheme on the Studio host
                "ftp://edgefirst.studio/api",
                // Not an absolute URL
                "edgefirst.studio/api",
                "not a url",
                "",
            ],
        );
    }
}