Skip to main content

spider_util/
error.rs

1//! Shared error types for the workspace.
2//!
3//! The runtime keeps transport, parsing, configuration, and pipeline failures in
4//! a small set of error enums so applications can match on them consistently.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
9//! use spider_util::error::{SpiderError, PipelineError};
10//! use url::Url;
11//!
12//! // URL parsing error
13//! let result: Result<Url, SpiderError> = Url::parse("not-a-url").map_err(SpiderError::from);
14//! if let Err(e) = result {
15//!     println!("Error: {}", e);
16//! }
17//!
18//! // Pipeline error
19//! let pipeline_err = PipelineError::IoError("File not found".to_string());
20//! ```
21
22use http;
23use serde_json::Error as SerdeJsonError;
24use std::str::Utf8Error;
25use thiserror::Error;
26
27/// Simplified wrapper around `reqwest::Error`.
28#[derive(Debug, Clone, Error)]
29#[error("Reqwest error: {message}")]
30pub struct ReqwestError {
31    /// A human-readable error message.
32    pub message: String,
33    /// Whether the error was a connection failure.
34    pub is_connect: bool,
35    /// Whether the error was a timeout.
36    pub is_timeout: bool,
37}
38
39impl From<reqwest::Error> for ReqwestError {
40    fn from(err: reqwest::Error) -> Self {
41        ReqwestError {
42            is_connect: err.is_connect(),
43            is_timeout: err.is_timeout(),
44            message: err.to_string(),
45        }
46    }
47}
48
49/// Main runtime error type used across the crawler stack.
50///
51/// ## Variants
52///
53/// - **Network Errors**: [`ReqwestError`](SpiderError::ReqwestError) for HTTP client errors
54/// - **URL Errors**: [`UrlParseError`](SpiderError::UrlParseError) for invalid URLs
55/// - **Serialization Errors**: [`JsonError`](SpiderError::JsonError) for JSON parsing/serialization
56/// - **I/O Errors**: [`IoError`](SpiderError::IoError) for file system operations
57/// - **Configuration Errors**: [`ConfigurationError`](SpiderError::ConfigurationError) for invalid settings
58/// - **Pipeline Errors**: [`PipelineError`](SpiderError::PipelineError) for item processing failures
59/// - **HTML/UTF-8 Errors**: Parse errors for HTML and UTF-8 content
60/// - **Robots.txt**: [`BlockedByRobotsTxt`](SpiderError::BlockedByRobotsTxt) for blocked requests
61///
62/// ## Example
63///
64/// ```rust,ignore
65/// use spider_util::error::SpiderError;
66/// use url::Url;
67///
68/// // Handle different error types
69/// match Url::parse("not-a-url").map_err(SpiderError::from) {
70///     Ok(url) => println!("Valid URL: {}", url),
71///     Err(SpiderError::UrlParseError(e)) => println!("Invalid URL: {}", e),
72///     Err(e) => println!("Other error: {}", e),
73/// }
74/// ```
75#[derive(Debug, Clone, Error)]
76pub enum SpiderError {
77    /// HTTP client error.
78    #[error("Reqwest error: {0}")]
79    ReqwestError(#[from] ReqwestError),
80    /// URL parsing error.
81    #[error("Url parsing error: {0}")]
82    UrlParseError(#[from] url::ParseError),
83    /// JSON parsing or serialization error.
84    #[error("Json parsing error: {0}")]
85    JsonError(String),
86    /// I/O operation error.
87    #[error("Io error: {0}")]
88    IoError(String),
89    /// Invalid configuration error.
90    #[error("Configuration error: {0}")]
91    ConfigurationError(String),
92    /// General unspecified error.
93    #[error("General error: {0}")]
94    GeneralError(String),
95    /// Failed to convert item to string.
96    #[error("Failed to convert item to string: {0}")]
97    ItemToStringError(String),
98    /// Item serialization error.
99    #[error("Error during item serialization: {0}")]
100    ItemSerializationError(String),
101    /// Unknown error.
102    #[error("Unknown error")]
103    Unknown,
104    /// Invalid HTTP header value.
105    #[error("Invalid HTTP header value: {0}")]
106    InvalidHeaderValue(String),
107    /// HTTP header value error.
108    #[error("Header value error: {0}")]
109    HeaderValueError(String),
110    /// HTML parsing error.
111    #[error("HTML parsing error: {0}")]
112    HtmlParseError(String),
113    /// UTF-8 decoding error.
114    #[error("UTF-8 parsing error: {0}")]
115    Utf8Error(#[from] Utf8Error),
116    /// Pipeline processing error.
117    #[error("Pipeline error: {0}")]
118    PipelineError(#[from] PipelineError),
119    /// Request blocked by robots.txt.
120    #[error("Request blocked by robots.txt")]
121    BlockedByRobotsTxt,
122}
123
124impl From<http::header::InvalidHeaderValue> for SpiderError {
125    fn from(err: http::header::InvalidHeaderValue) -> Self {
126        SpiderError::InvalidHeaderValue(err.to_string())
127    }
128}
129
130impl From<bincode::Error> for SpiderError {
131    fn from(err: bincode::Error) -> Self {
132        SpiderError::GeneralError(format!("Bincode error: {}", err))
133    }
134}
135
136impl From<reqwest::Error> for SpiderError {
137    fn from(err: reqwest::Error) -> Self {
138        SpiderError::ReqwestError(err.into())
139    }
140}
141
142impl From<std::io::Error> for SpiderError {
143    fn from(err: std::io::Error) -> Self {
144        SpiderError::IoError(err.to_string())
145    }
146}
147
148impl From<SerdeJsonError> for SpiderError {
149    fn from(err: SerdeJsonError) -> Self {
150        SpiderError::JsonError(err.to_string())
151    }
152}
153
154/// Error type used by item pipelines.
155///
156/// ## Variants
157///
158/// - **[`IoError`](PipelineError::IoError)**: File system or I/O operation failures
159/// - **[`ItemError`](PipelineError::ItemError)**: General item processing failures
160/// - **[`DatabaseError`](PipelineError::DatabaseError)**: Database operation errors (e.g., SQLite)
161/// - **[`SerializationError`](PipelineError::SerializationError)**: JSON/serialization failures
162/// - **[`CsvError`](PipelineError::CsvError)**: CSV reading/writing errors
163/// - **[`Other`](PipelineError::Other)**: Other unspecified pipeline errors
164///
165/// ## Example
166///
167/// ```rust,ignore
168/// use spider_util::error::PipelineError;
169///
170/// let err = PipelineError::IoError("File not found".to_string());
171/// println!("Pipeline error: {}", err);
172/// ```
173#[derive(Error, Debug, Clone)]
174pub enum PipelineError {
175    /// I/O operation error.
176    #[error("I/O error: {0}")]
177    IoError(String),
178    /// Item processing error.
179    #[error("Item processing error: {0}")]
180    ItemError(String),
181    /// Database operation error.
182    #[error("Database error: {0}")]
183    DatabaseError(String),
184    /// Serialization error.
185    #[error("Serialization error: {0}")]
186    SerializationError(String),
187    /// CSV operation error.
188    #[error("CSV error: {0}")]
189    CsvError(String),
190    /// Other unspecified pipeline error.
191    #[error("Other pipeline error: {0}")]
192    Other(String),
193}
194
195impl From<csv::Error> for PipelineError {
196    fn from(err: csv::Error) -> Self {
197        PipelineError::CsvError(err.to_string())
198    }
199}
200
201impl From<std::io::Error> for PipelineError {
202    fn from(err: std::io::Error) -> Self {
203        PipelineError::IoError(err.to_string())
204    }
205}
206
207impl From<SerdeJsonError> for PipelineError {
208    fn from(err: SerdeJsonError) -> Self {
209        PipelineError::SerializationError(err.to_string())
210    }
211}
212
213impl From<rusqlite::Error> for PipelineError {
214    fn from(err: rusqlite::Error) -> Self {
215        PipelineError::DatabaseError(err.to_string())
216    }
217}
218
219impl From<rusqlite::Error> for SpiderError {
220    fn from(err: rusqlite::Error) -> Self {
221        SpiderError::PipelineError(PipelineError::DatabaseError(err.to_string()))
222    }
223}