spider_util/error.rs
//! Shared error types for the workspace.
//!
//! The runtime keeps transport, parsing, configuration, and pipeline failures in
//! a small set of error enums so applications can match on them consistently.
//!
//! ## Example
//!
//! ```rust,ignore
//! use spider_util::error::{SpiderError, PipelineError};
//! use url::Url;
//!
//! // URL parsing error
//! let result: Result<Url, SpiderError> = Url::parse("not-a-url").map_err(SpiderError::from);
//! if let Err(e) = result {
//!     println!("Error: {}", e);
//! }
//!
//! // Pipeline error
//! let pipeline_err = PipelineError::IoError("File not found".to_string());
//! ```
21
22use http;
23use serde_json::Error as SerdeJsonError;
24use std::str::Utf8Error;
25use thiserror::Error;
26
27/// Simplified wrapper around `reqwest::Error`.
28#[derive(Debug, Clone, Error)]
29#[error("Reqwest error: {message}")]
30pub struct ReqwestError {
31 /// A human-readable error message.
32 pub message: String,
33 /// Whether the error was a connection failure.
34 pub is_connect: bool,
35 /// Whether the error was a timeout.
36 pub is_timeout: bool,
37}
38
39impl From<reqwest::Error> for ReqwestError {
40 fn from(err: reqwest::Error) -> Self {
41 ReqwestError {
42 is_connect: err.is_connect(),
43 is_timeout: err.is_timeout(),
44 message: err.to_string(),
45 }
46 }
47}
48
49/// Main runtime error type used across the crawler stack.
50///
51/// ## Variants
52///
53/// - **Network Errors**: [`ReqwestError`](SpiderError::ReqwestError) for HTTP client errors
54/// - **URL Errors**: [`UrlParseError`](SpiderError::UrlParseError) for invalid URLs
55/// - **Serialization Errors**: [`JsonError`](SpiderError::JsonError) for JSON parsing/serialization
56/// - **I/O Errors**: [`IoError`](SpiderError::IoError) for file system operations
57/// - **Configuration Errors**: [`ConfigurationError`](SpiderError::ConfigurationError) for invalid settings
58/// - **Pipeline Errors**: [`PipelineError`](SpiderError::PipelineError) for item processing failures
59/// - **HTML/UTF-8 Errors**: Parse errors for HTML and UTF-8 content
60/// - **Robots.txt**: [`BlockedByRobotsTxt`](SpiderError::BlockedByRobotsTxt) for blocked requests
61///
62/// ## Example
63///
64/// ```rust,ignore
65/// use spider_util::error::SpiderError;
66/// use url::Url;
67///
68/// // Handle different error types
69/// match Url::parse("not-a-url").map_err(SpiderError::from) {
70/// Ok(url) => println!("Valid URL: {}", url),
71/// Err(SpiderError::UrlParseError(e)) => println!("Invalid URL: {}", e),
72/// Err(e) => println!("Other error: {}", e),
73/// }
74/// ```
75#[derive(Debug, Clone, Error)]
76pub enum SpiderError {
77 /// HTTP client error.
78 #[error("Reqwest error: {0}")]
79 ReqwestError(#[from] ReqwestError),
80 /// URL parsing error.
81 #[error("Url parsing error: {0}")]
82 UrlParseError(#[from] url::ParseError),
83 /// JSON parsing or serialization error.
84 #[error("Json parsing error: {0}")]
85 JsonError(String),
86 /// I/O operation error.
87 #[error("Io error: {0}")]
88 IoError(String),
89 /// Invalid configuration error.
90 #[error("Configuration error: {0}")]
91 ConfigurationError(String),
92 /// General unspecified error.
93 #[error("General error: {0}")]
94 GeneralError(String),
95 /// Failed to convert item to string.
96 #[error("Failed to convert item to string: {0}")]
97 ItemToStringError(String),
98 /// Item serialization error.
99 #[error("Error during item serialization: {0}")]
100 ItemSerializationError(String),
101 /// Unknown error.
102 #[error("Unknown error")]
103 Unknown,
104 /// Invalid HTTP header value.
105 #[error("Invalid HTTP header value: {0}")]
106 InvalidHeaderValue(String),
107 /// HTTP header value error.
108 #[error("Header value error: {0}")]
109 HeaderValueError(String),
110 /// HTML parsing error.
111 #[error("HTML parsing error: {0}")]
112 HtmlParseError(String),
113 /// UTF-8 decoding error.
114 #[error("UTF-8 parsing error: {0}")]
115 Utf8Error(#[from] Utf8Error),
116 /// Pipeline processing error.
117 #[error("Pipeline error: {0}")]
118 PipelineError(#[from] PipelineError),
119 /// Request blocked by robots.txt.
120 #[error("Request blocked by robots.txt")]
121 BlockedByRobotsTxt,
122}
123
124impl From<http::header::InvalidHeaderValue> for SpiderError {
125 fn from(err: http::header::InvalidHeaderValue) -> Self {
126 SpiderError::InvalidHeaderValue(err.to_string())
127 }
128}
129
130impl From<bincode::Error> for SpiderError {
131 fn from(err: bincode::Error) -> Self {
132 SpiderError::GeneralError(format!("Bincode error: {}", err))
133 }
134}
135
136impl From<reqwest::Error> for SpiderError {
137 fn from(err: reqwest::Error) -> Self {
138 SpiderError::ReqwestError(err.into())
139 }
140}
141
142impl From<std::io::Error> for SpiderError {
143 fn from(err: std::io::Error) -> Self {
144 SpiderError::IoError(err.to_string())
145 }
146}
147
148impl From<SerdeJsonError> for SpiderError {
149 fn from(err: SerdeJsonError) -> Self {
150 SpiderError::JsonError(err.to_string())
151 }
152}
153
154/// Error type used by item pipelines.
155///
156/// ## Variants
157///
158/// - **[`IoError`](PipelineError::IoError)**: File system or I/O operation failures
159/// - **[`ItemError`](PipelineError::ItemError)**: General item processing failures
160/// - **[`DatabaseError`](PipelineError::DatabaseError)**: Database operation errors (e.g., SQLite)
161/// - **[`SerializationError`](PipelineError::SerializationError)**: JSON/serialization failures
162/// - **[`CsvError`](PipelineError::CsvError)**: CSV reading/writing errors
163/// - **[`Other`](PipelineError::Other)**: Other unspecified pipeline errors
164///
165/// ## Example
166///
167/// ```rust,ignore
168/// use spider_util::error::PipelineError;
169///
170/// let err = PipelineError::IoError("File not found".to_string());
171/// println!("Pipeline error: {}", err);
172/// ```
173#[derive(Error, Debug, Clone)]
174pub enum PipelineError {
175 /// I/O operation error.
176 #[error("I/O error: {0}")]
177 IoError(String),
178 /// Item processing error.
179 #[error("Item processing error: {0}")]
180 ItemError(String),
181 /// Database operation error.
182 #[error("Database error: {0}")]
183 DatabaseError(String),
184 /// Serialization error.
185 #[error("Serialization error: {0}")]
186 SerializationError(String),
187 /// CSV operation error.
188 #[error("CSV error: {0}")]
189 CsvError(String),
190 /// Other unspecified pipeline error.
191 #[error("Other pipeline error: {0}")]
192 Other(String),
193}
194
195impl From<csv::Error> for PipelineError {
196 fn from(err: csv::Error) -> Self {
197 PipelineError::CsvError(err.to_string())
198 }
199}
200
201impl From<std::io::Error> for PipelineError {
202 fn from(err: std::io::Error) -> Self {
203 PipelineError::IoError(err.to_string())
204 }
205}
206
207impl From<SerdeJsonError> for PipelineError {
208 fn from(err: SerdeJsonError) -> Self {
209 PipelineError::SerializationError(err.to_string())
210 }
211}
212
213impl From<rusqlite::Error> for PipelineError {
214 fn from(err: rusqlite::Error) -> Self {
215 PipelineError::DatabaseError(err.to_string())
216 }
217}
218
219impl From<rusqlite::Error> for SpiderError {
220 fn from(err: rusqlite::Error) -> Self {
221 SpiderError::PipelineError(PipelineError::DatabaseError(err.to_string()))
222 }
223}