edgefirst_client/
retry.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright © 2025 Au-Zone Technologies. All Rights Reserved.
3
4//! Retry policies with URL-based classification for EdgeFirst Studio Client.
5//!
6//! # Overview
7//!
8//! This module implements intelligent retry logic that classifies requests into
9//! two categories:
10//!
11//! - **StudioApi**: EdgeFirst Studio JSON-RPC API calls
12//!   (`*.edgefirst.studio/api`)
13//! - **FileIO**: File upload/download operations (AWS S3 pre-signed URLs,
14//!   CloudFront, etc.)
15//!
16//! # Motivation
17//!
18//! Different types of operations have different failure characteristics and
19//! retry requirements:
20//!
21//! ## Studio API Requests
22//!
23//! - **Low concurrency**: Sequential JSON-RPC method calls
24//! - **Fast-fail desired**: Authentication failures should not retry
25//! - **Predictable errors**: HTTP 401/403 indicate auth issues, not transient
26//!   failures
27//! - **User experience**: Users expect quick feedback on invalid credentials
28//!
29//! ## File I/O Operations (S3, CloudFront)
30//!
31//! - **High concurrency**: Parallel uploads/downloads of dataset files (100+
32//!   files)
33//! - **Transient failures common**: S3 rate limiting, network congestion,
34//!   timeouts
35//! - **Retry-safe**: Idempotent operations (pre-signed URLs, multipart uploads)
36//! - **Robustness critical**: Dataset operations must complete reliably despite
37//!   temporary issues
38//!
39//! # Classification Strategy
40//!
41//! URLs are classified by inspecting the host and path:
42//!
//! - **StudioApi**: `http(s)://edgefirst.studio/api` and
//!   `http(s)://*.edgefirst.studio/api`, plus anything below `/api/` (host
//!   match + whole path segment, so `/apis` does not qualify)
45//! - **FileIO**: Everything else (S3, CloudFront, or any non-API Studio path)
46//!
47//! # Retry Behavior
48//!
49//! Both scopes use the same configurable retry count (`EDGEFIRST_MAX_RETRIES`,
50//! default: 3), but differ in error classification:
51//!
52//! ## StudioApi Error Classification
53//!
54//! - **Never retry**: 401 Unauthorized, 403 Forbidden (auth failures)
55//! - **Always retry**: 408 Timeout, 429 Too Many Requests, 5xx Server Errors
//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
57//!
58//! ## FileIO Error Classification
59//!
60//! - **Always retry**: 408 Timeout, 409 Conflict, 423 Locked, 429 Too Many
61//!   Requests, 5xx Server Errors
62//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
//! - **No auth special-casing**: 401/403 have no dedicated rule; like every
//!   other status outside the list above, they are not retried
65//!
66//! # Configuration
67//!
68//! - `EDGEFIRST_MAX_RETRIES`: Maximum retry attempts per request (default: 3)
69//! - `EDGEFIRST_TIMEOUT`: Request timeout in seconds (default: 30)
70//!
71//! **For bulk file operations**, increase retry count for better resilience:
72//! ```bash
73//! export EDGEFIRST_MAX_RETRIES=10  # More retries for S3 operations
74//! export EDGEFIRST_TIMEOUT=60      # Longer timeout for large files
75//! ```
76//!
77//! # Examples
78//!
79//! ```rust
80//! use edgefirst_client::{RetryScope, classify_url};
81//!
82//! // Studio API calls
83//! assert_eq!(
84//!     classify_url("https://edgefirst.studio/api"),
85//!     RetryScope::StudioApi
86//! );
87//! assert_eq!(
88//!     classify_url("https://test.edgefirst.studio/api/datasets.list"),
89//!     RetryScope::StudioApi
90//! );
91//!
92//! // File I/O operations
93//! assert_eq!(
94//!     classify_url("https://s3.amazonaws.com/bucket/file.bin"),
95//!     RetryScope::FileIO
96//! );
97//! assert_eq!(
98//!     classify_url("https://d123abc.cloudfront.net/dataset.zip"),
99//!     RetryScope::FileIO
100//! );
101//! ```
102
103use url::Url;
104
/// Category of an outgoing request, used to select the retry rules applied to
/// it.
///
/// A request is either a call to the EdgeFirst Studio API or a file transfer;
/// [`classify_url`] maps a URL to one of these variants.
///
/// The enum carries no data, so it is `Copy` and can be passed by value
/// freely. It also implements `Hash`, which allows it to be used as a key in
/// hash maps and sets (for example, to keep per-scope counters).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum RetryScope {
    /// EdgeFirst Studio JSON-RPC API calls to `*.edgefirst.studio/api`.
    ///
    /// These calls should fail fast on authentication errors but retry
    /// server errors and transient failures.
    StudioApi,

    /// File upload/download operations to S3, CloudFront, or other endpoints.
    ///
    /// These operations experience high concurrency and should retry
    /// aggressively on all transient failures.
    FileIO,
}
123
124/// Classifies a URL to determine which retry policy to apply.
125///
126/// This function performs URL-based classification to differentiate between
127/// EdgeFirst Studio API calls and File I/O operations (S3, CloudFront, etc.).
128///
129/// # Classification Algorithm
130///
131/// 1. Parse URL using proper URL parser (handles ports, query params,
132///    fragments)
133/// 2. Check protocol: Only HTTP/HTTPS are classified as StudioApi (all others →
134///    FileIO)
135/// 3. Check host: Must be `edgefirst.studio` or `*.edgefirst.studio`
136/// 4. Check path: Must start with `/api` (exact match or `/api/...`)
137/// 5. If all conditions met → `StudioApi`, otherwise → `FileIO`
138///
139/// # Edge Cases Handled
140///
141/// - **Port numbers**: `https://test.edgefirst.studio:8080/api` → StudioApi
142/// - **Trailing slashes**: `https://edgefirst.studio/api/` → StudioApi
143/// - **Query parameters**: `https://edgefirst.studio/api?foo=bar` → StudioApi
144/// - **Subdomains**: `https://ocean.edgefirst.studio/api` → StudioApi
145/// - **Similar domains**: `https://edgefirst.studio.com/api` → FileIO (not
146///   exact match)
147/// - **Path injection**: `https://evil.com/edgefirst.studio/api` → FileIO (host
148///   mismatch)
149/// - **Non-API paths**: `https://edgefirst.studio/download` → FileIO
150///
151/// # Security
152///
153/// The function uses proper URL parsing to prevent domain spoofing attacks.
154/// Only the URL host is checked, not the path, preventing injection via
155/// `https://attacker.com/edgefirst.studio/api`.
156///
157/// # Examples
158///
159/// ```rust
160/// use edgefirst_client::{RetryScope, classify_url};
161///
162/// // Studio API URLs
163/// assert_eq!(
164///     classify_url("https://edgefirst.studio/api"),
165///     RetryScope::StudioApi
166/// );
167/// assert_eq!(
168///     classify_url("https://test.edgefirst.studio/api/datasets"),
169///     RetryScope::StudioApi
170/// );
171/// assert_eq!(
172///     classify_url("https://test.edgefirst.studio:443/api?token=abc"),
173///     RetryScope::StudioApi
174/// );
175///
176/// // File I/O URLs (S3, CloudFront, etc.)
177/// assert_eq!(
178///     classify_url("https://s3.amazonaws.com/bucket/file.bin"),
179///     RetryScope::FileIO
180/// );
181/// assert_eq!(
182///     classify_url("https://d123abc.cloudfront.net/dataset.zip"),
183///     RetryScope::FileIO
184/// );
185/// assert_eq!(
186///     classify_url("https://edgefirst.studio/download_model"),
187///     RetryScope::FileIO // Non-API path
188/// );
189/// ```
190pub fn classify_url(url: &str) -> RetryScope {
191    // Try to parse as proper URL
192    if let Ok(parsed) = Url::parse(url) {
193        // Only match HTTP/HTTPS protocols
194        if parsed.scheme() != "http" && parsed.scheme() != "https" {
195            return RetryScope::FileIO;
196        }
197
198        if let Some(host) = parsed.host_str() {
199            let host_matches = host == "edgefirst.studio" || host.ends_with(".edgefirst.studio");
200
201            // Path must be exactly "/api" or start with "/api/" (not "/apis" etc.)
202            let path = parsed.path();
203            let path_is_api = path == "/api" || path.starts_with("/api/");
204
205            if host_matches && path_is_api {
206                return RetryScope::StudioApi;
207            }
208        }
209    }
210
211    RetryScope::FileIO
212}
213
214/// Creates a retry policy with URL-based classification.
215///
216/// This function builds a reqwest retry policy that inspects each request URL
217/// and applies different error classification rules based on whether it's a
218/// Studio API call or a File I/O operation.
219///
220/// # Retry Configuration
221///
222/// - **Max retries**: Configurable via `EDGEFIRST_MAX_RETRIES` (default: 3)
223/// - **Timeout**: Configurable via `EDGEFIRST_TIMEOUT` (default: 30 seconds)
224///
225/// # Error Classification by Scope
226///
227/// ## StudioApi (*.edgefirst.studio/api)
228///
229/// Optimized for fast-fail on authentication errors:
230///
231/// | HTTP Status | Action | Rationale |
232/// |-------------|--------|-----------|
233/// | 401, 403 | Never retry | Authentication failure - user action required |
234/// | 408, 429 | Retry | Timeout, rate limiting - transient |
235/// | 5xx | Retry | Server error - may recover |
236/// | Connection errors | Retry | Network issues - transient |
237///
238/// ## FileIO (S3, CloudFront, etc.)
239///
240/// Optimized for robustness under high concurrency:
241///
242/// | HTTP Status | Action | Rationale |
243/// |-------------|--------|-----------|
244/// | 408, 429 | Retry | Timeout, rate limiting - common with S3 |
245/// | 409, 423 | Retry | Conflict, locked - S3 eventual consistency |
246/// | 5xx | Retry | Server error - S3 transient issues |
247/// | Connection errors | Retry | Network issues - common in parallel uploads |
248///
249/// # Usage Recommendations
250///
251/// **For dataset downloads/uploads** (many concurrent S3 operations):
252/// ```bash
253/// export EDGEFIRST_MAX_RETRIES=10  # More retries for robustness
254/// export EDGEFIRST_TIMEOUT=60      # Longer timeout for large files
255/// ```
256///
257/// **For testing** (fast failure detection):
258/// ```bash
259/// export EDGEFIRST_MAX_RETRIES=1   # Minimal retries
260/// export EDGEFIRST_TIMEOUT=10      # Quick timeout
261/// ```
262///
263/// # Implementation Notes
264///
265/// Due to reqwest retry API limitations, both StudioApi and FileIO use the
266/// same `max_retries_per_request` value. The differentiation is in error
267/// classification only (which errors trigger retries), not retry count.
268///
269/// For operations requiring different retry counts, use separate Client
270/// instances with different `EDGEFIRST_MAX_RETRIES` configuration.
271pub fn create_retry_policy() -> reqwest::retry::Builder {
272    let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES")
273        .ok()
274        .and_then(|s| s.parse().ok())
275        .unwrap_or(3); // Reduced from 5 to 3 for faster failures
276
277    // Use wildcard host scope since we do URL inspection in classify_fn
278    reqwest::retry::for_host("*")
279        .max_retries_per_request(max_retries)
280        .classify_fn(|req_rep| {
281            let url = req_rep.uri().to_string();
282
283            match classify_url(&url) {
284                RetryScope::StudioApi => {
285                    // Studio API: Never retry auth failures, retry server errors
286                    match req_rep.status() {
287                        Some(status) => match status.as_u16() {
288                            401 | 403 => req_rep.success(), // Auth failures - don't retry
289                            429 | 408 | 500..=599 => req_rep.retryable(),
290                            _ => req_rep.success(),
291                        },
292                        // No status code means connection error, timeout, or other transport
293                        // failure These are safe to retry for API calls
294                        None if req_rep.error().is_some() => req_rep.retryable(),
295                        None => req_rep.success(),
296                    }
297                }
298                RetryScope::FileIO => {
299                    // File I/O: Retry all transient errors
300                    match req_rep.status() {
301                        Some(status) => match status.as_u16() {
302                            429 | 408 | 500..=599 | 409 | 423 => req_rep.retryable(),
303                            _ => req_rep.success(),
304                        },
305                        None if req_rep.error().is_some() => req_rep.retryable(),
306                        None => req_rep.success(),
307                    }
308                }
309            }
310        })
311}
312
313pub fn log_retry_configuration() {
314    let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES").unwrap_or_else(|_| "3".to_string());
315    let timeout = std::env::var("EDGEFIRST_TIMEOUT").unwrap_or_else(|_| "30".to_string());
316    log::info!(
317        "Retry configuration - max_retries={}, timeout={}s",
318        max_retries,
319        timeout
320    );
321}
322
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every URL in `urls` is classified as `expected`, naming
    /// the offending URL on failure.
    fn assert_scope(expected: RetryScope, urls: &[&str]) {
        for &url in urls {
            assert_eq!(classify_url(url), expected, "wrong scope for {:?}", url);
        }
    }

    #[test]
    fn test_classify_url_studio_api() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                // Base production URL
                "https://edgefirst.studio/api",
                // Server-specific instances
                "https://test.edgefirst.studio/api",
                "https://stage.edgefirst.studio/api",
                "https://saas.edgefirst.studio/api",
                "https://ocean.edgefirst.studio/api",
                // API endpoints with paths
                "https://test.edgefirst.studio/api/datasets",
                "https://stage.edgefirst.studio/api/auth.login",
            ],
        );
    }

    #[test]
    fn test_classify_url_file_io() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // S3 URLs for file operations
                "https://s3.amazonaws.com/bucket/file.bin",
                // CloudFront URLs for file distribution
                "https://d123abc.cloudfront.net/file.bin",
                // Non-API paths on edgefirst.studio domain
                "https://edgefirst.studio/docs",
                "https://test.edgefirst.studio/download_model",
                "https://stage.edgefirst.studio/download_checkpoint",
                // Generic download URLs
                "https://example.com/download",
            ],
        );
    }

    /// URL components other than host and path must not affect the result.
    #[test]
    fn test_classify_url_studio_api_edge_cases() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                // Explicit ports
                "https://test.edgefirst.studio:8080/api",
                "https://test.edgefirst.studio:443/api?token=abc",
                // Trailing slash, query string, fragment
                "https://edgefirst.studio/api/",
                "https://edgefirst.studio/api?foo=bar",
                "https://edgefirst.studio/api#section",
                // Plain HTTP is accepted as well
                "http://edgefirst.studio/api",
            ],
        );
    }

    /// Look-alike hosts, look-alike paths, foreign schemes and malformed input
    /// must all fall back to FileIO.
    #[test]
    fn test_classify_url_rejects_lookalikes() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // Similar but different domains
                "https://edgefirst.studio.com/api",
                "https://notedgefirst.studio/api",
                // Studio host name smuggled into path or query
                "https://evil.com/edgefirst.studio/api",
                "https://evil.com/?next=https://edgefirst.studio/api",
                // Paths that merely share the `/api` prefix
                "https://edgefirst.studio/apis",
                "https://edgefirst.studio/api-docs",
                "https://edgefirst.studio/",
                // Non-HTTP schemes
                "ftp://edgefirst.studio/api",
                "ws://edgefirst.studio/api",
                // Input that is not an absolute URL
                "edgefirst.studio/api",
                "not a url",
                "",
            ],
        );
    }
}