Skip to main content

edgefirst_client/
retry.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright © 2025 Au-Zone Technologies. All Rights Reserved.
3
4//! Retry policies with URL-based classification for EdgeFirst Studio Client.
5//!
6//! # Overview
7//!
8//! This module implements intelligent retry logic that classifies requests into
9//! two categories:
10//!
11//! - **StudioApi**: EdgeFirst Studio JSON-RPC API calls
12//!   (`*.edgefirst.studio/api`)
13//! - **FileIO**: File upload/download operations (AWS S3 pre-signed URLs,
14//!   CloudFront, etc.)
15//!
16//! # Motivation
17//!
18//! Different types of operations have different failure characteristics and
19//! retry requirements:
20//!
21//! ## Studio API Requests
22//!
23//! - **Low concurrency**: Sequential JSON-RPC method calls
24//! - **Fast-fail desired**: Authentication failures should not retry
25//! - **Predictable errors**: HTTP 401/403 indicate auth issues, not transient
26//!   failures
27//! - **User experience**: Users expect quick feedback on invalid credentials
28//!
29//! ## File I/O Operations (S3, CloudFront)
30//!
31//! - **High concurrency**: Parallel uploads/downloads of dataset files (100+
32//!   files)
33//! - **Transient failures common**: S3 rate limiting, network congestion,
34//!   timeouts
35//! - **Retry-safe**: Idempotent operations (pre-signed URLs, multipart uploads)
36//! - **Robustness critical**: Dataset operations must complete reliably despite
37//!   temporary issues
38//!
39//! # Classification Strategy
40//!
41//! URLs are classified by inspecting the host and path:
42//!
43//! - **StudioApi**: `https://*.edgefirst.studio/api*` (exact host match + path
44//!   prefix)
45//! - **FileIO**: Everything else (S3, CloudFront, or any non-API Studio path)
46//!
47//! # Retry Behavior
48//!
//! Both scopes use the same configurable retry count (`EDGEFIRST_MAX_RETRIES`,
//! default: 5), but differ in error classification (see the per-scope error
//! classification sections below).
51//!
52//! # Environment Variables
53//!
54//! - `EDGEFIRST_MAX_RETRIES`: Maximum number of retries for failed requests
55//!   (default: 5)
56//! - `MAX_TASKS`: Maximum concurrent upload/download tasks (default: half of
57//!   CPU cores, min 2, max 8). Lower values (2-8) work better for large files
58//!   to avoid timeouts. Higher values (16-32) are better for many small files.
59//!
60//! ## StudioApi Error Classification
61//!
62//! - **Never retry**: 401 Unauthorized, 403 Forbidden (auth failures)
63//! - **Always retry**: 408 Timeout, 429 Too Many Requests, 5xx Server Errors
//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
65//!
66//! ## FileIO Error Classification
67//!
68//! - **Always retry**: 408 Timeout, 409 Conflict, 423 Locked, 429 Too Many
69//!   Requests, 5xx Server Errors
70//! - **Retry transport errors**: Connection failures, DNS errors, timeouts
//! - **No auth special-casing**: 401/403 are not singled out for S3 URLs; like
//!   any other status not listed above, they are not retried
73//!
74//! # Configuration
75//!
76//! - `EDGEFIRST_MAX_RETRIES`: Maximum retry attempts per request (default: 5)
77//! - `EDGEFIRST_TIMEOUT`: Total-request deadline for API calls in seconds
78//!   (default: 30). Applies to the `http` client only. **Do not** increase this
79//!   for large file transfers — use `EDGEFIRST_READ_TIMEOUT` instead.
80//! - `EDGEFIRST_READ_TIMEOUT`: Per-chunk idle timeout for bulk downloads in
81//!   seconds (default: 120). Applies to the `bulk_http` client. Resets after
82//!   every received chunk, so healthy large downloads are never interrupted.
83//!   Only fires when no bytes arrive for the configured duration.
84//! - `EDGEFIRST_UPLOAD_TIMEOUT`: Per-operation total timeout for bulk uploads
85//!   in seconds (default: 600). Applied per-request via `RequestBuilder::timeout`
86//!   on each upload attempt (per part for multipart, per file for single uploads).
87//!   Covers the send phase where `EDGEFIRST_READ_TIMEOUT` does not apply.
88//!   Sized for PART_SIZE (100 MB) at ~170 KB/s minimum; increase for very slow
89//!   uplinks or larger single-file uploads.
90//!
91//! **For bulk file operations**, increase retry count for better resilience:
92//! ```bash
93//! export EDGEFIRST_MAX_RETRIES=10     # More retries for S3 operations
94//! export EDGEFIRST_READ_TIMEOUT=300   # 5-minute idle timeout for very slow downlinks
95//! export EDGEFIRST_UPLOAD_TIMEOUT=900 # 15-minute per-part timeout for very slow uplinks
96//! ```
97//!
98//! # Examples
99//!
100//! ```rust
101//! use edgefirst_client::{RetryScope, classify_url};
102//!
103//! // Studio API calls
104//! assert_eq!(
105//!     classify_url("https://edgefirst.studio/api"),
106//!     RetryScope::StudioApi
107//! );
108//! assert_eq!(
109//!     classify_url("https://test.edgefirst.studio/api/datasets.list"),
110//!     RetryScope::StudioApi
111//! );
112//!
113//! // File I/O operations
114//! assert_eq!(
115//!     classify_url("https://s3.amazonaws.com/bucket/file.bin"),
116//!     RetryScope::FileIO
117//! );
118//! assert_eq!(
119//!     classify_url("https://d123abc.cloudfront.net/dataset.zip"),
120//!     RetryScope::FileIO
121//! );
122//! ```
123
124use url::Url;
125
/// Retry scope classification for URL-based retry policies.
///
/// Determines whether a request is a Studio API call or a File I/O operation,
/// enabling different error handling strategies for each category.
///
/// The enum is a fieldless tag, so it is `Copy` and `Hash`: it can be passed
/// by value and used as a map or set key.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum RetryScope {
    /// EdgeFirst Studio JSON-RPC API calls to `*.edgefirst.studio/api`.
    ///
    /// These calls should fail fast on authentication errors but retry
    /// server errors and transient failures.
    StudioApi,

    /// File upload/download operations to S3, CloudFront, or other endpoints.
    ///
    /// These operations experience high concurrency and should retry
    /// aggressively on all transient failures.
    FileIO,
}
144
145/// Classifies a URL to determine which retry policy to apply.
146///
147/// This function performs URL-based classification to differentiate between
148/// EdgeFirst Studio API calls and File I/O operations (S3, CloudFront, etc.).
149///
150/// # Classification Algorithm
151///
152/// 1. Parse URL using proper URL parser (handles ports, query params,
153///    fragments)
154/// 2. Check protocol: Only HTTP/HTTPS are classified as StudioApi (all others →
155///    FileIO)
156/// 3. Check host: Must be `edgefirst.studio` or `*.edgefirst.studio`
157/// 4. Check path: Must start with `/api` (exact match or `/api/...`)
158/// 5. If all conditions met → `StudioApi`, otherwise → `FileIO`
159///
160/// # Edge Cases Handled
161///
162/// - **Port numbers**: `https://test.edgefirst.studio:8080/api` → StudioApi
163/// - **Trailing slashes**: `https://edgefirst.studio/api/` → StudioApi
164/// - **Query parameters**: `https://edgefirst.studio/api?foo=bar` → StudioApi
165/// - **Subdomains**: `https://ocean.edgefirst.studio/api` → StudioApi
166/// - **Similar domains**: `https://edgefirst.studio.com/api` → FileIO (not
167///   exact match)
168/// - **Path injection**: `https://evil.com/edgefirst.studio/api` → FileIO (host
169///   mismatch)
170/// - **Non-API paths**: `https://edgefirst.studio/download` → FileIO
171///
172/// # Security
173///
/// The function uses proper URL parsing to prevent domain spoofing attacks.
/// The domain comparison is performed on the parsed URL host only, never on
/// the path, preventing injection via
/// `https://attacker.com/edgefirst.studio/api`.
177///
178/// # Examples
179///
180/// ```rust
181/// use edgefirst_client::{RetryScope, classify_url};
182///
183/// // Studio API URLs
184/// assert_eq!(
185///     classify_url("https://edgefirst.studio/api"),
186///     RetryScope::StudioApi
187/// );
188/// assert_eq!(
189///     classify_url("https://test.edgefirst.studio/api/datasets"),
190///     RetryScope::StudioApi
191/// );
192/// assert_eq!(
193///     classify_url("https://test.edgefirst.studio:443/api?token=abc"),
194///     RetryScope::StudioApi
195/// );
196///
197/// // File I/O URLs (S3, CloudFront, etc.)
198/// assert_eq!(
199///     classify_url("https://s3.amazonaws.com/bucket/file.bin"),
200///     RetryScope::FileIO
201/// );
202/// assert_eq!(
203///     classify_url("https://d123abc.cloudfront.net/dataset.zip"),
204///     RetryScope::FileIO
205/// );
206/// assert_eq!(
207///     classify_url("https://edgefirst.studio/download_model"),
208///     RetryScope::FileIO // Non-API path
209/// );
210/// ```
211pub fn classify_url(url: &str) -> RetryScope {
212    // Try to parse as proper URL
213    if let Ok(parsed) = Url::parse(url) {
214        // Only match HTTP/HTTPS protocols
215        if parsed.scheme() != "http" && parsed.scheme() != "https" {
216            return RetryScope::FileIO;
217        }
218
219        if let Some(host) = parsed.host_str() {
220            let host_matches = host == "edgefirst.studio" || host.ends_with(".edgefirst.studio");
221
222            // Path must be exactly "/api" or start with "/api/" (not "/apis" etc.)
223            let path = parsed.path();
224            let path_is_api = path == "/api" || path.starts_with("/api/");
225
226            if host_matches && path_is_api {
227                return RetryScope::StudioApi;
228            }
229        }
230    }
231
232    RetryScope::FileIO
233}
234
235/// Creates a retry policy with URL-based classification.
236///
237/// This function builds a reqwest retry policy that inspects each request URL
238/// and applies different error classification rules based on whether it's a
239/// Studio API call or a File I/O operation.
240///
241/// # Retry Configuration
242///
243/// - **Max retries**: Configurable via `EDGEFIRST_MAX_RETRIES` (default: 5)
244/// - **Timeout**: Configurable via `EDGEFIRST_TIMEOUT` (default: 30 seconds)
245///
246/// # Error Classification by Scope
247///
248/// ## StudioApi (*.edgefirst.studio/api)
249///
250/// Optimized for fast-fail on authentication errors:
251///
252/// | HTTP Status | Action | Rationale |
253/// |-------------|--------|-----------|
254/// | 401, 403 | Never retry | Authentication failure - user action required |
255/// | 408, 429 | Retry | Timeout, rate limiting - transient |
256/// | 5xx | Retry | Server error - may recover |
257/// | Connection errors | Retry | Network issues - transient |
258///
259/// ## FileIO (S3, CloudFront, etc.)
260///
261/// Optimized for robustness under high concurrency:
262///
263/// | HTTP Status | Action | Rationale |
264/// |-------------|--------|-----------|
265/// | 408, 429 | Retry | Timeout, rate limiting - common with S3 |
266/// | 409, 423 | Retry | Conflict, locked - S3 eventual consistency |
267/// | 5xx | Retry | Server error - S3 transient issues |
268/// | Connection errors | Retry | Network issues - common in parallel uploads |
269///
270/// # Usage Recommendations
271///
272/// **For dataset downloads/uploads** (many concurrent S3 operations):
273/// ```bash
274/// export EDGEFIRST_MAX_RETRIES=10      # More retries for robustness
275/// export EDGEFIRST_READ_TIMEOUT=300    # Longer idle timeout for slow links
276/// export EDGEFIRST_UPLOAD_TIMEOUT=900  # Longer per-part timeout for slow uplinks
277/// ```
278///
279/// **For testing** (fast failure detection):
280/// ```bash
281/// export EDGEFIRST_MAX_RETRIES=1   # Minimal retries
282/// export EDGEFIRST_TIMEOUT=10      # Quick API call timeout
283/// ```
284///
285/// # Implementation Notes
286///
287/// Due to reqwest retry API limitations, both StudioApi and FileIO use the
288/// same `max_retries_per_request` value. The differentiation is in error
289/// classification only (which errors trigger retries), not retry count.
290///
291/// For operations requiring different retry counts, use separate Client
292/// instances with different `EDGEFIRST_MAX_RETRIES` configuration.
293pub fn create_retry_policy() -> reqwest::retry::Builder {
294    let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES")
295        .ok()
296        .and_then(|s| s.parse().ok())
297        .unwrap_or(5);
298
299    // Use wildcard host scope since we do URL inspection in classify_fn
300    reqwest::retry::for_host("*")
301        .max_retries_per_request(max_retries)
302        .classify_fn(|req_rep| {
303            let url = req_rep.uri().to_string();
304
305            match classify_url(&url) {
306                RetryScope::StudioApi => {
307                    // Studio API: Never retry auth failures, retry server errors
308                    match req_rep.status() {
309                        Some(status) => match status.as_u16() {
310                            401 | 403 => req_rep.success(), // Auth failures - don't retry
311                            429 | 408 | 500..=599 => req_rep.retryable(),
312                            _ => req_rep.success(),
313                        },
314                        // No status code means connection error, timeout, or other transport
315                        // failure These are safe to retry for API calls
316                        None if req_rep.error().is_some() => req_rep.retryable(),
317                        None => req_rep.success(),
318                    }
319                }
320                RetryScope::FileIO => {
321                    // File I/O: Retry all transient errors
322                    match req_rep.status() {
323                        Some(status) => match status.as_u16() {
324                            429 | 408 | 500..=599 | 409 | 423 => req_rep.retryable(),
325                            _ => req_rep.success(),
326                        },
327                        None if req_rep.error().is_some() => req_rep.retryable(),
328                        None => req_rep.success(),
329                    }
330                }
331            }
332        })
333}
334
335pub fn log_retry_configuration() {
336    let max_retries = std::env::var("EDGEFIRST_MAX_RETRIES").unwrap_or_else(|_| "5".to_string());
337    let timeout = std::env::var("EDGEFIRST_TIMEOUT").unwrap_or_else(|_| "30".to_string());
338    let read_timeout =
339        std::env::var("EDGEFIRST_READ_TIMEOUT").unwrap_or_else(|_| "120".to_string());
340    let upload_timeout =
341        std::env::var("EDGEFIRST_UPLOAD_TIMEOUT").unwrap_or_else(|_| "600".to_string());
342    log::debug!(
343        "Retry configuration - max_retries={}, api_timeout={}s, bulk_read_timeout={}s, upload_timeout={}s",
344        max_retries,
345        timeout,
346        read_timeout,
347        upload_timeout
348    );
349}
350
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every URL in `urls` is classified as `expected`, naming
    /// the offending URL on failure.
    fn assert_scope(expected: RetryScope, urls: &[&str]) {
        for url in urls {
            assert_eq!(
                classify_url(url),
                expected,
                "unexpected retry scope for {}",
                url
            );
        }
    }

    #[test]
    fn test_classify_url_studio_api() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                // Base production URL
                "https://edgefirst.studio/api",
                // Server-specific instances
                "https://test.edgefirst.studio/api",
                "https://stage.edgefirst.studio/api",
                "https://saas.edgefirst.studio/api",
                "https://ocean.edgefirst.studio/api",
                // API endpoints with paths
                "https://test.edgefirst.studio/api/datasets",
                "https://stage.edgefirst.studio/api/auth.login",
            ],
        );
    }

    #[test]
    fn test_classify_url_studio_api_edge_cases() {
        assert_scope(
            RetryScope::StudioApi,
            &[
                // Plain HTTP is accepted as well as HTTPS
                "http://edgefirst.studio/api",
                // Explicit port numbers do not affect the host match
                "https://test.edgefirst.studio:8080/api",
                "https://test.edgefirst.studio:443/api?token=abc",
                // Trailing slash
                "https://edgefirst.studio/api/",
                // Query parameters and fragments are not part of the path
                "https://edgefirst.studio/api?foo=bar",
                "https://edgefirst.studio/api#section",
            ],
        );
    }

    #[test]
    fn test_classify_url_file_io() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // S3 URLs for file operations
                "https://s3.amazonaws.com/bucket/file.bin",
                // CloudFront URLs for file distribution
                "https://d123abc.cloudfront.net/file.bin",
                // Non-API paths on edgefirst.studio domain
                "https://edgefirst.studio/docs",
                "https://test.edgefirst.studio/download_model",
                "https://stage.edgefirst.studio/download_checkpoint",
                // Generic download URLs
                "https://example.com/download",
            ],
        );
    }

    #[test]
    fn test_classify_url_rejects_lookalikes() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // Similar domains that merely contain the Studio domain
                "https://edgefirst.studio.com/api",
                "https://notedgefirst.studio/api",
                // Studio domain smuggled into the path of another host
                "https://evil.com/edgefirst.studio/api",
                // Paths that only share the `/api` prefix
                "https://edgefirst.studio/apis",
                "https://edgefirst.studio/api-v2",
                "https://edgefirst.studio/",
            ],
        );
    }

    #[test]
    fn test_classify_url_non_http_and_invalid_input() {
        assert_scope(
            RetryScope::FileIO,
            &[
                // Only HTTP/HTTPS can be Studio API calls
                "ftp://edgefirst.studio/api",
                "file:///api",
                // Unparsable input falls back to FileIO
                "",
                "not a url",
                "edgefirst.studio/api",
            ],
        );
    }
}