Skip to main content

spider_util/
error.rs

//! Custom error types for the spider framework.
//!
//! This module defines comprehensive error types used throughout the spider ecosystem:
//!
//! - **[`SpiderError`]**: The main error type for the spider framework, covering
//!   network failures, URL parsing problems, I/O errors, configuration issues,
//!   and pipeline errors.
//! - **[`PipelineError`]**: Specialized errors for item processing pipelines.
//! - **[`ReqwestError`]**: A simplified wrapper around reqwest errors with
//!   additional context about connection and timeout issues.
//!
//! By centralizing error definitions, the module provides a consistent and
//! semantic way to report and handle errors, improving the robustness and
//! maintainability of web scraping applications.
//!
//! ## Example
//!
//! ```rust
//! use spider_util::error::{SpiderError, PipelineError};
//! use url::Url;
//!
//! // URL parsing error: `Url::parse` returns `url::ParseError`, which is
//! // converted into a `SpiderError` through its `From` implementation.
//! let result: Result<Url, SpiderError> =
//!     Url::parse("not-a-url").map_err(SpiderError::from);
//! if let Err(e) = result {
//!     println!("Error: {}", e);
//! }
//!
//! // Pipeline error
//! let pipeline_err = PipelineError::IoError("File not found".to_string());
//! ```
31
32use http;
33use serde_json::Error as SerdeJsonError;
34use std::str::Utf8Error;
35use thiserror::Error;
36
37/// A wrapper around reqwest errors with additional context.
38///
39/// [`ReqwestError`] provides simplified error information by extracting
40/// key details from reqwest errors, such as whether the error was related
41/// to connection failures or timeouts.
42#[derive(Debug, Clone, Error)]
43#[error("Reqwest error: {message}")]
44pub struct ReqwestError {
45    /// A human-readable error message.
46    pub message: String,
47    /// Whether the error was a connection failure.
48    pub is_connect: bool,
49    /// Whether the error was a timeout.
50    pub is_timeout: bool,
51}
52
53impl From<reqwest::Error> for ReqwestError {
54    fn from(err: reqwest::Error) -> Self {
55        ReqwestError {
56            is_connect: err.is_connect(),
57            is_timeout: err.is_timeout(),
58            message: err.to_string(),
59        }
60    }
61}
62
63/// The main error type for the spider framework.
64///
65/// [`SpiderError`] encompasses all possible errors that can occur during
66/// web scraping operations, from network failures to data processing issues.
67///
68/// ## Variants
69///
70/// - **Network Errors**: [`ReqwestError`](SpiderError::ReqwestError) for HTTP client errors
71/// - **URL Errors**: [`UrlParseError`](SpiderError::UrlParseError) for invalid URLs
72/// - **Serialization Errors**: [`JsonError`](SpiderError::JsonError) for JSON parsing/serialization
73/// - **I/O Errors**: [`IoError`](SpiderError::IoError) for file system operations
74/// - **Configuration Errors**: [`ConfigurationError`](SpiderError::ConfigurationError) for invalid settings
75/// - **Pipeline Errors**: [`PipelineError`](SpiderError::PipelineError) for item processing failures
76/// - **HTML/UTF-8 Errors**: Parse errors for HTML and UTF-8 content
77/// - **Robots.txt**: [`BlockedByRobotsTxt`](SpiderError::BlockedByRobotsTxt) for blocked requests
78///
79/// ## Example
80///
81/// ```rust
82/// use spider_util::error::SpiderError;
83/// use url::Url;
84///
85/// // Handle different error types
86/// match Url::parse("not-a-url") {
87///     Ok(url) => println!("Valid URL: {}", url),
88///     Err(SpiderError::UrlParseError(e)) => println!("Invalid URL: {}", e),
89///     Err(e) => println!("Other error: {}", e),
90/// }
91/// ```
92#[derive(Debug, Clone, Error)]
93pub enum SpiderError {
94    /// HTTP client error.
95    #[error("Reqwest error: {0}")]
96    ReqwestError(#[from] ReqwestError),
97    /// URL parsing error.
98    #[error("Url parsing error: {0}")]
99    UrlParseError(#[from] url::ParseError),
100    /// JSON parsing or serialization error.
101    #[error("Json parsing error: {0}")]
102    JsonError(String),
103    /// I/O operation error.
104    #[error("Io error: {0}")]
105    IoError(String),
106    /// Invalid configuration error.
107    #[error("Configuration error: {0}")]
108    ConfigurationError(String),
109    /// General unspecified error.
110    #[error("General error: {0}")]
111    GeneralError(String),
112    /// Failed to convert item to string.
113    #[error("Failed to convert item to string: {0}")]
114    ItemToStringError(String),
115    /// Item serialization error.
116    #[error("Error during item serialization: {0}")]
117    ItemSerializationError(String),
118    /// Unknown error.
119    #[error("Unknown error")]
120    Unknown,
121    /// Invalid HTTP header value.
122    #[error("Invalid HTTP header value: {0}")]
123    InvalidHeaderValue(String),
124    /// HTTP header value error.
125    #[error("Header value error: {0}")]
126    HeaderValueError(String),
127    /// HTML parsing error.
128    #[error("HTML parsing error: {0}")]
129    HtmlParseError(String),
130    /// UTF-8 decoding error.
131    #[error("UTF-8 parsing error: {0}")]
132    Utf8Error(#[from] Utf8Error),
133    /// Pipeline processing error.
134    #[error("Pipeline error: {0}")]
135    PipelineError(#[from] PipelineError),
136    /// Request blocked by robots.txt.
137    #[error("Request blocked by robots.txt")]
138    BlockedByRobotsTxt,
139}
140
141impl From<http::header::InvalidHeaderValue> for SpiderError {
142    fn from(err: http::header::InvalidHeaderValue) -> Self {
143        SpiderError::InvalidHeaderValue(err.to_string())
144    }
145}
146
147impl From<bincode::Error> for SpiderError {
148    fn from(err: bincode::Error) -> Self {
149        SpiderError::GeneralError(format!("Bincode error: {}", err))
150    }
151}
152
153impl From<reqwest::Error> for SpiderError {
154    fn from(err: reqwest::Error) -> Self {
155        SpiderError::ReqwestError(err.into())
156    }
157}
158
159impl From<std::io::Error> for SpiderError {
160    fn from(err: std::io::Error) -> Self {
161        SpiderError::IoError(err.to_string())
162    }
163}
164
165impl From<SerdeJsonError> for SpiderError {
166    fn from(err: SerdeJsonError) -> Self {
167        SpiderError::JsonError(err.to_string())
168    }
169}
170
171/// Error type for item processing pipelines.
172///
173/// [`PipelineError`] represents errors that can occur during the processing
174/// of scraped items in pipelines, including I/O errors, database errors,
175/// serialization failures, and CSV operations.
176///
177/// ## Variants
178///
179/// - **[`IoError`](PipelineError::IoError)**: File system or I/O operation failures
180/// - **[`ItemError`](PipelineError::ItemError)**: General item processing failures
181/// - **[`DatabaseError`](PipelineError::DatabaseError)**: Database operation errors (e.g., SQLite)
182/// - **[`SerializationError`](PipelineError::SerializationError)**: JSON/serialization failures
183/// - **[`CsvError`](PipelineError::CsvError)**: CSV reading/writing errors
184/// - **[`Other`](PipelineError::Other)**: Other unspecified pipeline errors
185///
186/// ## Example
187///
188/// ```rust
189/// use spider_util::error::PipelineError;
190///
191/// let err = PipelineError::IoError("File not found".to_string());
192/// println!("Pipeline error: {}", err);
193/// ```
194#[derive(Error, Debug, Clone)]
195pub enum PipelineError {
196    /// I/O operation error.
197    #[error("I/O error: {0}")]
198    IoError(String),
199    /// Item processing error.
200    #[error("Item processing error: {0}")]
201    ItemError(String),
202    /// Database operation error.
203    #[error("Database error: {0}")]
204    DatabaseError(String),
205    /// Serialization error.
206    #[error("Serialization error: {0}")]
207    SerializationError(String),
208    /// CSV operation error.
209    #[error("CSV error: {0}")]
210    CsvError(String),
211    /// Other unspecified pipeline error.
212    #[error("Other pipeline error: {0}")]
213    Other(String),
214}
215
216impl From<csv::Error> for PipelineError {
217    fn from(err: csv::Error) -> Self {
218        PipelineError::CsvError(err.to_string())
219    }
220}
221
222impl From<std::io::Error> for PipelineError {
223    fn from(err: std::io::Error) -> Self {
224        PipelineError::IoError(err.to_string())
225    }
226}
227
228impl From<SerdeJsonError> for PipelineError {
229    fn from(err: SerdeJsonError) -> Self {
230        PipelineError::SerializationError(err.to_string())
231    }
232}
233
234impl From<rusqlite::Error> for PipelineError {
235    fn from(err: rusqlite::Error) -> Self {
236        PipelineError::DatabaseError(err.to_string())
237    }
238}
239
240impl From<rusqlite::Error> for SpiderError {
241    fn from(err: rusqlite::Error) -> Self {
242        SpiderError::PipelineError(PipelineError::DatabaseError(err.to_string()))
243    }
244}