1use axum::{
2 http::StatusCode,
3 response::{IntoResponse, Response},
4 Json,
5};
6use serde_json::json;
7use thiserror::Error;
8
9#[derive(Debug, Error)]
10pub enum ScrapeError {
11 #[error("HTTP request failed: {0}")]
12 RequestFailed(#[from] reqwest::Error),
13
14 #[error("Invalid URL: {0}")]
15 InvalidUrl(String),
16
17 #[error("Timeout occurred")]
18 Timeout,
19
20 #[error("Failed to parse HTML: {0}")]
21 ParseError(String),
22
23 #[error("Robots.txt disallows scraping")]
24 RobotsDisallowed,
25
26 #[error("Unsupported format: {0}")]
27 UnsupportedFormat(String),
28
29 #[error("Internal error: {0}")]
30 Internal(String),
31
32 #[error("Browser error: {0}")]
33 BrowserError(String),
34
35 #[error("Browser launch failed: {0}")]
36 BrowserLaunchFailed(String),
37
38 #[error("Navigation failed: {0}")]
39 NavigationFailed(String),
40
41 #[error("Element not found: {0}")]
42 ElementNotFound(String),
43
44 #[error("Validation failed")]
45 ValidationFailed(Vec<String>),
46
47 #[error("Browser not found: {0}")]
48 BrowserNotFound(String),
49
50 #[error("Invalid request: {0}")]
51 InvalidRequest(String),
52
53 #[error("Resource limit exceeded: {0}")]
54 ResourceLimit(String),
55
56 #[error("Unauthorized")]
57 Unauthorized,
58
59 #[error("SSRF attempt detected: {0}")]
60 SsrfAttempt(String),
61
62 #[error("Empty content: {0}")]
63 EmptyContent(String),
64
65 #[error("Low quality content: {0}")]
66 LowQuality(String),
67
68 #[error("Error page detected: {0}")]
69 ErrorPage(String),
70
71 #[error("Configuration error: {0}")]
72 Configuration(String),
73}
74
75impl IntoResponse for ScrapeError {
76 fn into_response(self) -> Response {
77 let (status, error_message) = match self {
78 ScrapeError::RequestFailed(ref e) => {
79 if e.is_timeout() {
80 (StatusCode::REQUEST_TIMEOUT, "Request timeout".to_string())
81 } else if e.is_connect() {
82 (
83 StatusCode::BAD_GATEWAY,
84 "Failed to connect to target".to_string(),
85 )
86 } else {
87 (StatusCode::BAD_GATEWAY, format!("Request failed: {}", e))
88 }
89 }
90 ScrapeError::InvalidUrl(_) => (StatusCode::BAD_REQUEST, self.to_string()),
91 ScrapeError::Timeout => (StatusCode::REQUEST_TIMEOUT, self.to_string()),
92 ScrapeError::ParseError(_) => (StatusCode::UNPROCESSABLE_ENTITY, self.to_string()),
93 ScrapeError::RobotsDisallowed => (StatusCode::FORBIDDEN, self.to_string()),
94 ScrapeError::UnsupportedFormat(_) => (StatusCode::BAD_REQUEST, self.to_string()),
95 ScrapeError::Internal(_) => (StatusCode::INTERNAL_SERVER_ERROR, self.to_string()),
96 ScrapeError::BrowserError(_) => (StatusCode::INTERNAL_SERVER_ERROR, self.to_string()),
97 ScrapeError::BrowserLaunchFailed(_) => {
98 (StatusCode::SERVICE_UNAVAILABLE, self.to_string())
99 }
100 ScrapeError::NavigationFailed(_) => (StatusCode::BAD_GATEWAY, self.to_string()),
101 ScrapeError::ElementNotFound(_) => (StatusCode::UNPROCESSABLE_ENTITY, self.to_string()),
102 ScrapeError::ValidationFailed(ref errors) => (
103 StatusCode::BAD_REQUEST,
104 format!("Validation failed: {}", errors.join(", ")),
105 ),
106 ScrapeError::BrowserNotFound(_) => {
107 (StatusCode::SERVICE_UNAVAILABLE, self.to_string())
108 }
109 ScrapeError::InvalidRequest(_) => (StatusCode::BAD_REQUEST, self.to_string()),
110 ScrapeError::ResourceLimit(_) => (StatusCode::PAYLOAD_TOO_LARGE, self.to_string()),
111 ScrapeError::Unauthorized => (StatusCode::UNAUTHORIZED, "Unauthorized".to_string()),
112 ScrapeError::SsrfAttempt(_) => (StatusCode::FORBIDDEN, self.to_string()),
113 ScrapeError::EmptyContent(_) => (StatusCode::UNPROCESSABLE_ENTITY, self.to_string()),
114 ScrapeError::LowQuality(_) => (StatusCode::UNPROCESSABLE_ENTITY, self.to_string()),
115 ScrapeError::ErrorPage(_) => (StatusCode::BAD_GATEWAY, self.to_string()),
116 ScrapeError::Configuration(_) => (StatusCode::INTERNAL_SERVER_ERROR, self.to_string()),
117 };
118
119 let body = Json(json!({
120 "success": false,
121 "error": error_message,
122 }));
123
124 (status, body).into_response()
125 }
126}
127
/// Shorthand for `std::result::Result` with the error type fixed to
/// `ScrapeError`.
pub type Result<T> = std::result::Result<T, ScrapeError>;