//! Error types for the crawlberg crate.
use thiserror::Error;
/// Language-agnostic category assigned to a network-level failure.
///
/// Every variant maps to a lowercase ASCII keyword through
/// [`tag`](NetworkErrorKind::tag). That keyword is kept stable across all
/// language bindings, because cross-language e2e fixtures assert that the
/// error message contains it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum NetworkErrorKind {
    /// The TCP connection attempt was refused or the peer was unreachable.
    Connection,
    /// The host name could not be resolved through DNS.
    Dns,
    /// The TLS/SSL handshake failed or a certificate was rejected.
    Ssl,
    /// The request did not finish before the configured deadline.
    Timeout,
    /// Communicating with, or configuring, a proxy failed.
    Proxy,
    /// A network error that fits none of the other categories.
    Other,
}

impl NetworkErrorKind {
    /// Returns the fixed lowercase keyword that is embedded in error messages.
    ///
    /// The keyword is one of `"connection"`, `"dns"`, `"ssl"`, `"timeout"`,
    /// `"proxy"`, or `"network"` (the latter for [`NetworkErrorKind::Other`]).
    /// Cross-language e2e fixtures assert that `error.to_string()` contains
    /// this substring, so the values must not change.
    #[must_use]
    pub fn tag(self) -> &'static str {
        match self {
            // The catch-all kind is the only one whose keyword differs from its name.
            NetworkErrorKind::Other => "network",
            NetworkErrorKind::Proxy => "proxy",
            NetworkErrorKind::Timeout => "timeout",
            NetworkErrorKind::Ssl => "ssl",
            NetworkErrorKind::Dns => "dns",
            NetworkErrorKind::Connection => "connection",
        }
    }
}
/// Errors that can occur during crawling, scraping, or mapping operations.
///
/// Every variant renders (via `Display`) as a fixed lowercase prefix followed by
/// the variant's message; the prefix is the literal at the start of the
/// variant's `#[error("...")]` attribute. The enum is `#[non_exhaustive]`, so
/// code outside this crate must include a wildcard arm when matching on it.
#[derive(Debug, Clone, Error)]
#[non_exhaustive]
pub enum CrawlError {
/// The requested page was not found (HTTP 404).
#[error("not_found: {0}")]
NotFound(String),
/// The request was unauthorized (HTTP 401).
#[error("unauthorized: {0}")]
Unauthorized(String),
/// The request was forbidden (HTTP 403).
#[error("forbidden: {0}")]
Forbidden(String),
/// The request was blocked by a WAF or bot protection (HTTP 403 with WAF indicators).
///
/// `vendor` is the lowercase identifier of the detected WAF (e.g. "cloudflare",
/// "datadome"). When the engine cannot identify the vendor, it uses "unknown".
/// `message` is the freeform description for logs and human readers.
///
/// The stable error tag remains `forbidden: waf/blocked: MESSAGE` so existing
/// log-grep patterns and cross-language bindings continue to work; vendor is
/// surfaced separately for structured consumers and is not part of the
/// rendered message.
#[error("forbidden: waf/blocked: {message}")]
WafBlocked {
/// Lowercase WAF vendor identifier (e.g. "cloudflare").
vendor: String,
/// Freeform description / context for logs.
message: String,
},
/// The request timed out.
#[error("timeout: {0}")]
Timeout(String),
/// The request was rate-limited (HTTP 429).
#[error("rate_limited: {0}")]
RateLimited(String),
/// A server error occurred (HTTP 5xx).
#[error("server_error: {0}")]
ServerError(String),
/// A bad gateway error occurred (HTTP 502).
#[error("bad_gateway: {0}")]
BadGateway(String),
/// The resource is permanently gone (HTTP 410).
#[error("gone: {0}")]
Gone(String),
/// A connection error occurred.
#[error("connection: {0}")]
Connection(String),
/// A DNS resolution error occurred.
#[error("dns: {0}")]
Dns(String),
/// An SSL/TLS error occurred.
#[error("ssl: {0}")]
Ssl(String),
/// Data was lost or truncated during transfer.
#[error("data_loss: {0}")]
DataLoss(String),
/// The browser failed to launch, connect, or navigate.
#[error("browser: {0}")]
BrowserError(String),
/// The browser page load or rendering timed out.
#[error("browser_timeout: {0}")]
BrowserTimeout(String),
/// The provided configuration is invalid.
#[error("invalid_config: {0}")]
InvalidConfig(String),
/// The requested capability is not supported by the active backend or build.
#[error("unsupported: {0}")]
Unsupported(String),
/// A URL was rejected by SSRF policy (private IP, metadata, disallowed scheme, etc).
///
/// Renders as `ssrf_policy_violation: URL - REASON`.
#[error("ssrf_policy_violation: {url} - {reason}")]
SsrfPolicyViolation {
/// The URL that was refused by the policy.
url: String,
/// Reason for rejection (e.g., "loopback", "private_network", "disallowed_scheme: ftp").
reason: String,
},
/// An unclassified error occurred.
#[error("other: {0}")]
Other(String),
}
impl From<crate::net::ssrf::SsrfError> for CrawlError {
fn from(err: crate::net::ssrf::SsrfError) -> Self {
// `SsrfError` is already formatted by its Display impl,
// so we extract the reason from it. For simplicity,
// we use a generic "unknown" URL since the conversion doesn't
// have the original URL. Call sites should catch and re-wrap
// with the actual URL.
CrawlError::SsrfPolicyViolation {
url: "unknown".to_string(),
reason: err.to_string(),
}
}
}
/// Collect the full error source chain into a single lowercase string for keyword matching.
///
/// The `Display` text of `e` and of every error reachable through
/// [`std::error::Error::source`] is joined with `" | "`, outermost error first,
/// and the result is lowercased.
///
/// reqwest's `Display` output can include the request URL (`... for url (...)`).
/// When `e` carries a URL, its text is removed from the chain before matching,
/// so that keywords which merely occur in a host name, path, or query string
/// (for example `https://dns.example.com/lookup?proxy=1`) are not mistaken for
/// a description of the failure itself.
pub(crate) fn error_chain_string(e: &reqwest::Error) -> String {
    let mut chain = e.to_string();
    let mut current: &dyn std::error::Error = e;
    while let Some(src) = current.source() {
        chain.push_str(" | ");
        chain.push_str(&src.to_string());
        current = src;
    }
    // Strip the URL before lowercasing: `Url::as_str` is the same text that
    // `Display` prints, so the match is exact at this point.
    if let Some(url) = e.url() {
        chain = chain.replace(url.as_str(), "");
    }
    chain.to_lowercase()
}
/// Maps a reqwest error to its [`NetworkErrorKind`] (non-wasm targets).
///
/// The lowercase source-chain text from `error_chain_string` is searched for
/// keywords, because reqwest may wrap DNS, SSL/TLS, timeout, and connection
/// failures inside generic connect errors. Categories are tried in a fixed
/// order and the first hit wins: timeout, DNS, SSL/TLS, proxy, connection.
/// Anything else is [`NetworkErrorKind::Other`].
#[cfg(not(target_arch = "wasm32"))]
pub(crate) fn network_error_kind(e: &reqwest::Error) -> NetworkErrorKind {
    const TIMEOUT_HINTS: [&str; 2] = ["timed out", "timeout"];
    const DNS_HINTS: [&str; 3] = ["dns", "resolve", "lookup"];
    const SSL_HINTS: [&str; 8] = [
        "ssl",
        "tls",
        "certificate",
        "record overflow",
        "handshake",
        "corrupt message",
        "alertdescription",
        "invalidcontenttype",
    ];
    const CONNECT_HINTS: [&str; 2] = ["connection", "connect"];

    let chain = error_chain_string(e);
    let mentions = |hints: &[&str]| hints.iter().any(|&hint| chain.contains(hint));

    // reqwest's own flag is consulted in addition to the text heuristics.
    if e.is_timeout() || mentions(&TIMEOUT_HINTS) {
        return NetworkErrorKind::Timeout;
    }
    if mentions(&DNS_HINTS) {
        return NetworkErrorKind::Dns;
    }
    if mentions(&SSL_HINTS) {
        return NetworkErrorKind::Ssl;
    }
    if chain.contains("proxy") {
        return NetworkErrorKind::Proxy;
    }
    if e.is_connect() || mentions(&CONNECT_HINTS) {
        return NetworkErrorKind::Connection;
    }
    NetworkErrorKind::Other
}
/// Maps a reqwest error to its [`NetworkErrorKind`] (wasm fallback).
///
/// On wasm32, reqwest does not expose `.is_timeout()`, `.is_connect()`, or
/// `.is_body()`, so classification relies solely on keywords found in the
/// lowercase source-chain text. Categories are tried in a fixed order and the
/// first hit wins: timeout, DNS, SSL/TLS, proxy, connection. Anything else is
/// [`NetworkErrorKind::Other`].
#[cfg(target_arch = "wasm32")]
pub(crate) fn network_error_kind(e: &reqwest::Error) -> NetworkErrorKind {
    const TIMEOUT_HINTS: [&str; 2] = ["timed out", "timeout"];
    const DNS_HINTS: [&str; 3] = ["dns", "resolve", "lookup"];
    const SSL_HINTS: [&str; 4] = ["ssl", "tls", "certificate", "handshake"];
    const CONNECT_HINTS: [&str; 2] = ["connection", "connect"];

    let chain = error_chain_string(e);
    let mentions = |hints: &[&str]| hints.iter().any(|&hint| chain.contains(hint));

    if mentions(&TIMEOUT_HINTS) {
        return NetworkErrorKind::Timeout;
    }
    if mentions(&DNS_HINTS) {
        return NetworkErrorKind::Dns;
    }
    if mentions(&SSL_HINTS) {
        return NetworkErrorKind::Ssl;
    }
    if chain.contains("proxy") {
        return NetworkErrorKind::Proxy;
    }
    if mentions(&CONNECT_HINTS) {
        return NetworkErrorKind::Connection;
    }
    NetworkErrorKind::Other
}
/// Converts a `reqwest::Error` into the matching `CrawlError` variant (non-wasm).
///
/// For timeout, DNS, SSL, proxy, and connection failures the payload is
/// `[network:<kind>] <error>`, so that cross-language e2e fixtures can assert
/// on a stable substring regardless of the native error message format each
/// binding produces. Proxy failures are reported as `CrawlError::Connection`
/// but keep the `proxy` tag.
///
/// Errors of kind [`NetworkErrorKind::Other`] become `CrawlError::DataLoss`
/// when reqwest flags a body error or the source chain mentions a truncated
/// or undecodable body, and `CrawlError::Other` otherwise. These two payloads
/// carry a literal `data_loss: ` / `other: ` prefix instead of the
/// `[network:<kind>]` tag.
#[cfg(not(target_arch = "wasm32"))]
pub(crate) fn classify_reqwest_error(e: &reqwest::Error) -> CrawlError {
    let kind = network_error_kind(e);
    let tag = kind.tag();
    let tagged = || format!("[network:{tag}] {e}");

    match kind {
        NetworkErrorKind::Timeout => CrawlError::Timeout(tagged()),
        NetworkErrorKind::Dns => CrawlError::Dns(tagged()),
        NetworkErrorKind::Ssl => CrawlError::Ssl(tagged()),
        NetworkErrorKind::Connection | NetworkErrorKind::Proxy => CrawlError::Connection(tagged()),
        NetworkErrorKind::Other => {
            // The chain text is only needed to tell data loss from the catch-all.
            let chain = error_chain_string(e);
            let body_markers = [
                "content-length",
                "truncat",
                "incomplete",
                "decoding response body",
                "error decoding",
            ];
            let body_damaged =
                e.is_body() || body_markers.iter().any(|&marker| chain.contains(marker));
            if body_damaged {
                CrawlError::DataLoss(format!("data_loss: {e}"))
            } else {
                CrawlError::Other(format!("other: {e}"))
            }
        }
    }
}
/// Converts a `reqwest::Error` into the matching `CrawlError` variant (wasm fallback).
///
/// For timeout, DNS, SSL, proxy, and connection failures the payload is
/// `[network:<kind>] <error>`, so that cross-language e2e fixtures can assert
/// on a stable substring regardless of the native error message format each
/// binding produces. Proxy failures are reported as `CrawlError::Connection`
/// but keep the `proxy` tag.
///
/// Errors of kind [`NetworkErrorKind::Other`] become `CrawlError::DataLoss`
/// when the source chain mentions `content-length`, `truncat`, or
/// `incomplete`, and `CrawlError::Other` otherwise. These two payloads carry a
/// literal `data_loss: ` / `other: ` prefix instead of the `[network:<kind>]`
/// tag.
#[cfg(target_arch = "wasm32")]
pub(crate) fn classify_reqwest_error(e: &reqwest::Error) -> CrawlError {
    let kind = network_error_kind(e);
    let tag = kind.tag();
    let tagged = || format!("[network:{tag}] {e}");

    match kind {
        NetworkErrorKind::Timeout => CrawlError::Timeout(tagged()),
        NetworkErrorKind::Dns => CrawlError::Dns(tagged()),
        NetworkErrorKind::Ssl => CrawlError::Ssl(tagged()),
        NetworkErrorKind::Connection | NetworkErrorKind::Proxy => CrawlError::Connection(tagged()),
        NetworkErrorKind::Other => {
            // The chain text is only needed to tell data loss from the catch-all.
            let chain = error_chain_string(e);
            let body_markers = ["content-length", "truncat", "incomplete"];
            if body_markers.iter().any(|&marker| chain.contains(marker)) {
                CrawlError::DataLoss(format!("data_loss: {e}"))
            } else {
                CrawlError::Other(format!("other: {e}"))
            }
        }
    }
}
// Tests for the tag mapping and for classification of real reqwest failures.
#[cfg(test)]
mod tests {
use super::*;
// -------------------------------------------------------------------------
// NetworkErrorKind::tag() unit tests (pure, no network)
// One test per variant pins the keyword returned by `tag()`.
// -------------------------------------------------------------------------
#[test]
fn network_error_kind_tag_connection() {
assert_eq!(NetworkErrorKind::Connection.tag(), "connection");
}
#[test]
fn network_error_kind_tag_dns() {
assert_eq!(NetworkErrorKind::Dns.tag(), "dns");
}
#[test]
fn network_error_kind_tag_ssl() {
assert_eq!(NetworkErrorKind::Ssl.tag(), "ssl");
}
#[test]
fn network_error_kind_tag_timeout() {
assert_eq!(NetworkErrorKind::Timeout.tag(), "timeout");
}
#[test]
fn network_error_kind_tag_proxy() {
assert_eq!(NetworkErrorKind::Proxy.tag(), "proxy");
}
#[test]
fn network_error_kind_tag_other() {
// The catch-all variant is the only one whose tag differs from its name.
assert_eq!(NetworkErrorKind::Other.tag(), "network");
}
// -------------------------------------------------------------------------
// Network integration tests — each triggers a real reqwest error
// (compiled only on non-wasm targets)
// -------------------------------------------------------------------------
#[cfg(not(target_arch = "wasm32"))]
mod network_integration {
use super::*;
use std::time::Duration;
use tokio::net::TcpListener;
/// Sends a GET request to `url` and returns the classified failure.
///
/// The client uses a 500 ms timeout and accepts invalid certificates.
/// Panics if the client cannot be built or if the request unexpectedly
/// succeeds, since every caller expects a network error.
async fn scrape_url(url: &str) -> CrawlError {
let client = reqwest::Client::builder()
.timeout(Duration::from_millis(500))
.danger_accept_invalid_certs(true)
.build()
.expect("client build must not fail");
classify_reqwest_error(&client.get(url).send().await.expect_err("expected network error"))
}
/// A refused TCP connection must be tagged `[network:connection]`.
#[tokio::test]
async fn connection_refused_produces_connection_tag() {
// Port 1 is almost universally not listening.
let err = scrape_url("http://127.0.0.1:1/").await;
let msg = err.to_string();
assert!(
msg.contains("[network:connection]"),
"expected [network:connection] in '{msg}'"
);
assert!(msg.contains("connection"), "expected 'connection' in '{msg}'");
}
/// A host name that cannot be resolved must be tagged `[network:dns]`.
#[tokio::test]
async fn dns_failure_produces_dns_tag() {
let err = scrape_url("http://this-hostname-does-not-exist-crawlberg-test.invalid/").await;
let msg = err.to_string();
assert!(msg.contains("[network:dns]"), "expected [network:dns] in '{msg}'");
assert!(msg.contains("dns"), "expected 'dns' in '{msg}'");
}
/// A server that accepts but never responds must be tagged `[network:timeout]`.
#[tokio::test]
async fn timeout_produces_timeout_tag() {
// Start a TCP listener that accepts but never writes — causes a
// read/response timeout for the HTTP client.
let listener = TcpListener::bind("127.0.0.1:0").await.expect("bind failed");
let addr = listener.local_addr().expect("addr");
// Accept in background and keep the socket open so the connect succeeds
// but the response never arrives.
tokio::spawn(async move {
if let Ok((_socket, _)) = listener.accept().await {
// Hold socket open until dropped at task end; the 5 s sleep
// outlasts the client's 500 ms timeout.
tokio::time::sleep(Duration::from_secs(5)).await;
}
});
let err = scrape_url(&format!("http://{addr}/")).await;
let msg = err.to_string();
assert!(
msg.contains("[network:timeout]"),
"expected [network:timeout] in '{msg}'"
);
assert!(msg.contains("timeout"), "expected 'timeout' in '{msg}'");
}
/// An unreachable proxy must be tagged as a connection or proxy failure.
#[tokio::test]
async fn invalid_proxy_produces_connection_tag() {
// Configure a proxy pointing at a port that refuses connections.
let client = reqwest::Client::builder()
.proxy(reqwest::Proxy::all("http://127.0.0.1:1").expect("proxy parse"))
.timeout(Duration::from_millis(500))
.build()
.expect("client build");
let raw_err = client
.get("http://example.com/")
.send()
.await
.expect_err("expected proxy error");
let err = classify_reqwest_error(&raw_err);
let msg = err.to_string();
// A proxy error surfaces as connection-refused to the proxy address.
// The [network:connection] or [network:proxy] tag must be present.
assert!(
msg.contains("[network:connection]") || msg.contains("[network:proxy]"),
"expected [network:connection] or [network:proxy] in '{msg}'"
);
assert!(
msg.contains("connection") || msg.contains("proxy"),
"expected 'connection' or 'proxy' in '{msg}'"
);
}
}
}