//! Error types — 1:1 port of `sdk/go/errors.go` + `crawler.go::parseAPIError`.

use serde::Deserialize;
use thiserror::Error;
/// Structured error payload returned by the Scrapfly API.
///
/// Covers both failure shapes the service emits: the `/scrape`
/// `{result: {error: ...}}` envelope and the generic
/// `{message, code, error_id, http_code}` body.
#[derive(Debug, Clone, Default)]
pub struct ApiError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Machine-readable error code (e.g. `ERR::SCRAPE::NETWORK_ERROR`).
    pub code: String,
    /// HTTP status code carried by the failing response.
    pub http_status: u16,
    /// Link to the relevant documentation page.
    pub documentation_url: String,
    /// Context-sensitive hint supplied by the SDK.
    pub hint: String,
    /// `Retry-After` delay in milliseconds, parsed from the HTTP header.
    pub retry_after_ms: u64,
}
25impl std::fmt::Display for ApiError {
26    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
27        write!(
28            f,
29            "API Error: {} (code: {}, status: {}, docs: {})",
30            self.message, self.code, self.http_status, self.documentation_url
31        )?;
32        if self.retry_after_ms > 0 {
33            write!(f, ", retry_after_ms: {}", self.retry_after_ms)?;
34        }
35        Ok(())
36    }
37}
/// All errors raised by `scrapfly-sdk`.
///
/// Variants that carry an [`ApiError`] are produced by [`from_response`],
/// which buckets a failed HTTP response by status code and by the
/// `ERR::RESOURCE::*` code found in the JSON body.
#[derive(Debug, Error)]
pub enum ScrapflyError {
    /// Transport-level failure (connect, TLS, timeout …).
    #[error("transport: {0}")]
    Transport(#[from] reqwest::Error),
    /// JSON (de)serialization failure.
    #[error("json: {0}")]
    Json(#[from] serde_json::Error),
    /// Invalid configuration (builder validation).
    #[error("config: {0}")]
    Config(String),
    /// Invalid or empty API key.
    #[error("invalid key, must be a non-empty string")]
    BadApiKey,
    /// Structured API error envelope.
    #[error("api error [{}] {}", .0.code, .0.message)]
    Api(ApiError),
    /// 4xx from Scrapfly itself (status-based fallback when no
    /// `ERR::RESOURCE::*` code matched).
    #[error("API http client error: {0}")]
    ApiClient(ApiError),
    /// 5xx from Scrapfly itself (status-based fallback).
    #[error("API http server error: {0}")]
    ApiServer(ApiError),
    /// 4xx from the upstream target.
    #[error("upstream http client error: {0}")]
    UpstreamClient(ApiError),
    /// 5xx from the upstream target.
    #[error("upstream http server error: {0}")]
    UpstreamServer(ApiError),
    /// Rate limited (HTTP 429 or `ERR::THROTTLE::*`).
    #[error("too many requests: {0}")]
    TooManyRequests(ApiError),
    /// Quota exhausted (`ERR::QUOTA::*`).
    #[error("quota limit reached: {0}")]
    QuotaLimitReached(ApiError),
    /// Scrape failed with an `ERR::SCRAPE::*` status.
    #[error("scrape failed: {0}")]
    ScrapeFailed(ApiError),
    /// Proxy failure (`ERR::PROXY::*`).
    #[error("proxy error: {0}")]
    ProxyFailed(ApiError),
    /// Anti-bot bypass failure (`ERR::ASP::*`).
    #[error("ASP bypass error: {0}")]
    AspBypassFailed(ApiError),
    /// Schedule error (`ERR::SCHEDULE::*`).
    #[error("schedule error: {0}")]
    ScheduleFailed(ApiError),
    /// Webhook delivery error (`ERR::WEBHOOK::*`).
    #[error("webhook error: {0}")]
    WebhookFailed(ApiError),
    /// Session error (`ERR::SESSION::*`).
    #[error("session error: {0}")]
    SessionFailed(ApiError),
    /// Screenshot API error (422 responses mentioning SCREENSHOT).
    #[error("screenshot API error: {0}")]
    ScreenshotApiFailed(ApiError),
    /// Extraction API error (422 responses mentioning EXTRACTION).
    #[error("extraction API error: {0}")]
    ExtractionApiFailed(ApiError),
    /// Crawler API error (`ERR::CRAWLER::*`).
    #[error("crawler error: {0}")]
    CrawlerFailed(ApiError),
    /// Unhandled API error response (status outside 4xx/5xx with no
    /// recognized code).
    #[error("unhandled API error response: {0}")]
    UnhandledApiResponse(ApiError),
    /// `Crawl` helper called before `start()`.
    #[error("crawler not started, call start() first")]
    CrawlerNotStarted,
    /// `Crawl::start()` called twice.
    #[error("crawler already started")]
    CrawlerAlreadyStarted,
    /// `Crawl::wait()` observed CANCELLED terminal state.
    #[error("crawler was cancelled")]
    CrawlerCancelled,
    /// `Crawl::wait()` exceeded the caller's deadline.
    #[error("crawler wait timed out")]
    CrawlerTimeout,
    /// Server returned a content-type the SDK didn't expect.
    #[error("unexpected response format: {0}")]
    UnexpectedResponseFormat(String),
    /// Invalid content type for this operation.
    #[error("invalid content type for this operation: {0}")]
    ContentType(String),
    /// I/O failure (example: save screenshot to disk).
    #[error("io: {0}")]
    Io(#[from] std::io::Error),
}
128#[derive(Debug, Deserialize, Default)]
129struct ErrorEnvelope {
130    #[serde(default)]
131    message: String,
132    // The legacy /scrape envelope uses `code`; the public /schedules
133    // envelope uses `error`. Accept either spelling so a single
134    // ErrorEnvelope handles both shapes.
135    #[serde(default, alias = "error")]
136    code: String,
137    #[serde(default)]
138    #[allow(dead_code)]
139    error_id: String,
140    #[serde(default)]
141    #[allow(dead_code)]
142    http_code: u16,
143}
/// Build a [`ScrapflyError`] from a non-2xx HTTP response.
///
/// Ports the switch logic from `sdk/go/client.go::handleAPIErrorResponse` +
/// `sdk/go/crawler.go::handleCrawlerErrorResponse`: categorizes the error based
/// on HTTP status + `code` field in the JSON envelope, surfaces the right
/// sentinel variant, attaches a contextual hint, and parses `Retry-After`.
///
/// * `status` — HTTP status code of the failing response.
/// * `body` — raw response body, decoded as a JSON [`ErrorEnvelope`] on a
///   best-effort basis.
/// * `retry_after_ms` — pre-parsed `Retry-After` delay (see
///   [`parse_retry_after`]); stored verbatim on the returned [`ApiError`].
/// * `is_crawler` — whether the request hit a crawler endpoint, which routes
///   `::CRAWLER::`-coded errors to `CrawlerFailed` ahead of code dispatch.
pub fn from_response(
    status: u16,
    body: &[u8],
    retry_after_ms: u64,
    is_crawler: bool,
) -> ScrapflyError {
    // Best-effort decode: a malformed or non-JSON body yields an all-default
    // envelope, and the status-based fallback below still classifies it.
    let envelope: ErrorEnvelope = serde_json::from_slice(body).unwrap_or_default();
    // Never surface an empty message — synthesize one from the status.
    let msg = if envelope.message.is_empty() {
        format!("API returned status {}", status)
    } else {
        envelope.message.clone()
    };
    let mut err = ApiError {
        message: msg,
        code: envelope.code.clone(),
        http_status: status,
        documentation_url: String::new(),
        hint: String::new(),
        retry_after_ms,
    };

    // ERR::SCHEDULE::* takes precedence over the generic 429/422 dispatch
    // below so a 429 from a quota-exhausted schedule create surfaces as
    // ScheduleFailed (resource-typed) rather than the generic
    // TooManyRequests variant. Same rule for any future SCHEDULE error
    // that overlaps with a status-typed bucket.
    if envelope.code.contains("::SCHEDULE::") {
        return ScrapflyError::ScheduleFailed(err);
    }

    // HTTP-status-based hint + early dispatch: 401 only attaches a hint and
    // falls through; 429 and the screenshot/extraction 422s return here.
    match status {
        401 => err.hint = "Provide a valid API key via ?key=... or Bearer token.".into(),
        429 => {
            err.hint =
                "Back off and retry after the indicated delay, or reduce concurrency/scope.".into();
            return ScrapflyError::TooManyRequests(err);
        }
        422 => {
            // The 422 sub-dispatch keys off the raw body text rather than the
            // parsed code, so codes nested deeper in the payload still match.
            let body_str = String::from_utf8_lossy(body);
            if body_str.contains("SCREENSHOT") {
                err.hint =
                    "Check screenshot parameters (format/capture/resolution) and upstream site readiness."
                        .into();
                return ScrapflyError::ScreenshotApiFailed(err);
            }
            if body_str.contains("EXTRACTION") {
                err.hint =
                    "Check content_type, body encoding, and template/prompt validity.".into();
                return ScrapflyError::ExtractionApiFailed(err);
            }
        }
        _ => {}
    }

    // Crawler-resource errors get their own bucket.
    if is_crawler && envelope.code.contains("::CRAWLER::") {
        return ScrapflyError::CrawlerFailed(err);
    }

    // Code-based dispatch (`ERR::RESOURCE::*`): the second `::`-separated
    // segment names the resource.
    if let Some(resource) = envelope.code.split("::").nth(1) {
        match resource {
            "SCRAPE" => return ScrapflyError::ScrapeFailed(err),
            "PROXY" => return ScrapflyError::ProxyFailed(err),
            "ASP" => return ScrapflyError::AspBypassFailed(err),
            "SCHEDULE" => return ScrapflyError::ScheduleFailed(err),
            "WEBHOOK" => return ScrapflyError::WebhookFailed(err),
            "SESSION" => return ScrapflyError::SessionFailed(err),
            "THROTTLE" => return ScrapflyError::TooManyRequests(err),
            "QUOTA" => return ScrapflyError::QuotaLimitReached(err),
            "CRAWLER" => return ScrapflyError::CrawlerFailed(err),
            _ => {}
        }
    }

    // HTTP-status-based fallback when no resource code matched.
    match status {
        400..=499 => ScrapflyError::ApiClient(err),
        500..=599 => ScrapflyError::ApiServer(err),
        _ => ScrapflyError::UnhandledApiResponse(err),
    }
}
/// Parse the `Retry-After` header value into milliseconds.
/// Supports integer seconds; HTTP-date is best-effort not parsed (returns 0).
pub(crate) fn parse_retry_after(value: Option<&str>) -> u64 {
    value
        // Any absent, blank, non-numeric, or HTTP-date value maps to 0.
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        // Seconds → milliseconds, saturating instead of overflowing.
        .map_or(0, |secs| secs.saturating_mul(1000))
}