spider_util/error.rs
1//! Custom error types for the spider framework.
2//!
3//! This module defines comprehensive error types used throughout the spider ecosystem:
4//!
5//! - **[`SpiderError`]**: The main error type for the spider framework, covering
6//! network failures, URL parsing problems, I/O errors, configuration issues,
7//! and pipeline errors.
8//! - **[`PipelineError`]**: Specialized errors for item processing pipelines.
9//! - **[`ReqwestError`]**: A simplified wrapper around reqwest errors with
10//! additional context about connection and timeout issues.
11//!
12//! By centralizing error definitions, the module provides a consistent and
13//! semantic way to report and handle errors, improving the robustness and
14//! maintainability of web scraping applications.
15//!
16//! ## Example
17//!
18//! ```rust
19//! use spider_util::error::{SpiderError, PipelineError};
20//! use url::Url;
21//!
22//! // URL parsing error
//! let result: Result<Url, SpiderError> =
//!     Url::parse("not-a-url").map_err(SpiderError::from);
//! if let Err(e) = result {
//!     println!("Error: {}", e);
//! }
27//!
28//! // Pipeline error
29//! let pipeline_err = PipelineError::IoError("File not found".to_string());
30//! ```
31
32use http;
33use serde_json::Error as SerdeJsonError;
34use std::str::Utf8Error;
35use thiserror::Error;
36
37/// A wrapper around reqwest errors with additional context.
38///
39/// [`ReqwestError`] provides simplified error information by extracting
40/// key details from reqwest errors, such as whether the error was related
41/// to connection failures or timeouts.
42#[derive(Debug, Clone, Error)]
43#[error("Reqwest error: {message}")]
44pub struct ReqwestError {
45 /// A human-readable error message.
46 pub message: String,
47 /// Whether the error was a connection failure.
48 pub is_connect: bool,
49 /// Whether the error was a timeout.
50 pub is_timeout: bool,
51}
52
53impl From<reqwest::Error> for ReqwestError {
54 fn from(err: reqwest::Error) -> Self {
55 ReqwestError {
56 is_connect: err.is_connect(),
57 is_timeout: err.is_timeout(),
58 message: err.to_string(),
59 }
60 }
61}
62
63/// The main error type for the spider framework.
64///
65/// [`SpiderError`] encompasses all possible errors that can occur during
66/// web scraping operations, from network failures to data processing issues.
67///
68/// ## Variants
69///
70/// - **Network Errors**: [`ReqwestError`](SpiderError::ReqwestError) for HTTP client errors
71/// - **URL Errors**: [`UrlParseError`](SpiderError::UrlParseError) for invalid URLs
72/// - **Serialization Errors**: [`JsonError`](SpiderError::JsonError) for JSON parsing/serialization
73/// - **I/O Errors**: [`IoError`](SpiderError::IoError) for file system operations
74/// - **Configuration Errors**: [`ConfigurationError`](SpiderError::ConfigurationError) for invalid settings
75/// - **Pipeline Errors**: [`PipelineError`](SpiderError::PipelineError) for item processing failures
76/// - **HTML/UTF-8 Errors**: Parse errors for HTML and UTF-8 content
77/// - **Robots.txt**: [`BlockedByRobotsTxt`](SpiderError::BlockedByRobotsTxt) for blocked requests
78///
79/// ## Example
80///
81/// ```rust
82/// use spider_util::error::SpiderError;
83/// use url::Url;
84///
85/// // Handle different error types
86/// match Url::parse("not-a-url") {
87/// Ok(url) => println!("Valid URL: {}", url),
88/// Err(SpiderError::UrlParseError(e)) => println!("Invalid URL: {}", e),
89/// Err(e) => println!("Other error: {}", e),
90/// }
91/// ```
92#[derive(Debug, Clone, Error)]
93pub enum SpiderError {
94 /// HTTP client error.
95 #[error("Reqwest error: {0}")]
96 ReqwestError(#[from] ReqwestError),
97 /// URL parsing error.
98 #[error("Url parsing error: {0}")]
99 UrlParseError(#[from] url::ParseError),
100 /// JSON parsing or serialization error.
101 #[error("Json parsing error: {0}")]
102 JsonError(String),
103 /// I/O operation error.
104 #[error("Io error: {0}")]
105 IoError(String),
106 /// Invalid configuration error.
107 #[error("Configuration error: {0}")]
108 ConfigurationError(String),
109 /// General unspecified error.
110 #[error("General error: {0}")]
111 GeneralError(String),
112 /// Failed to convert item to string.
113 #[error("Failed to convert item to string: {0}")]
114 ItemToStringError(String),
115 /// Item serialization error.
116 #[error("Error during item serialization: {0}")]
117 ItemSerializationError(String),
118 /// Unknown error.
119 #[error("Unknown error")]
120 Unknown,
121 /// Invalid HTTP header value.
122 #[error("Invalid HTTP header value: {0}")]
123 InvalidHeaderValue(String),
124 /// HTTP header value error.
125 #[error("Header value error: {0}")]
126 HeaderValueError(String),
127 /// HTML parsing error.
128 #[error("HTML parsing error: {0}")]
129 HtmlParseError(String),
130 /// UTF-8 decoding error.
131 #[error("UTF-8 parsing error: {0}")]
132 Utf8Error(#[from] Utf8Error),
133 /// Pipeline processing error.
134 #[error("Pipeline error: {0}")]
135 PipelineError(#[from] PipelineError),
136 /// Request blocked by robots.txt.
137 #[error("Request blocked by robots.txt")]
138 BlockedByRobotsTxt,
139}
140
141impl From<http::header::InvalidHeaderValue> for SpiderError {
142 fn from(err: http::header::InvalidHeaderValue) -> Self {
143 SpiderError::InvalidHeaderValue(err.to_string())
144 }
145}
146
147impl From<bincode::Error> for SpiderError {
148 fn from(err: bincode::Error) -> Self {
149 SpiderError::GeneralError(format!("Bincode error: {}", err))
150 }
151}
152
153impl From<reqwest::Error> for SpiderError {
154 fn from(err: reqwest::Error) -> Self {
155 SpiderError::ReqwestError(err.into())
156 }
157}
158
159impl From<std::io::Error> for SpiderError {
160 fn from(err: std::io::Error) -> Self {
161 SpiderError::IoError(err.to_string())
162 }
163}
164
165impl From<SerdeJsonError> for SpiderError {
166 fn from(err: SerdeJsonError) -> Self {
167 SpiderError::JsonError(err.to_string())
168 }
169}
170
171/// Error type for item processing pipelines.
172///
173/// [`PipelineError`] represents errors that can occur during the processing
174/// of scraped items in pipelines, including I/O errors, database errors,
175/// serialization failures, and CSV operations.
176///
177/// ## Variants
178///
179/// - **[`IoError`](PipelineError::IoError)**: File system or I/O operation failures
180/// - **[`ItemError`](PipelineError::ItemError)**: General item processing failures
181/// - **[`DatabaseError`](PipelineError::DatabaseError)**: Database operation errors (e.g., SQLite)
182/// - **[`SerializationError`](PipelineError::SerializationError)**: JSON/serialization failures
183/// - **[`CsvError`](PipelineError::CsvError)**: CSV reading/writing errors
184/// - **[`Other`](PipelineError::Other)**: Other unspecified pipeline errors
185///
186/// ## Example
187///
188/// ```rust
189/// use spider_util::error::PipelineError;
190///
191/// let err = PipelineError::IoError("File not found".to_string());
192/// println!("Pipeline error: {}", err);
193/// ```
194#[derive(Error, Debug, Clone)]
195pub enum PipelineError {
196 /// I/O operation error.
197 #[error("I/O error: {0}")]
198 IoError(String),
199 /// Item processing error.
200 #[error("Item processing error: {0}")]
201 ItemError(String),
202 /// Database operation error.
203 #[error("Database error: {0}")]
204 DatabaseError(String),
205 /// Serialization error.
206 #[error("Serialization error: {0}")]
207 SerializationError(String),
208 /// CSV operation error.
209 #[error("CSV error: {0}")]
210 CsvError(String),
211 /// Other unspecified pipeline error.
212 #[error("Other pipeline error: {0}")]
213 Other(String),
214}
215
216impl From<csv::Error> for PipelineError {
217 fn from(err: csv::Error) -> Self {
218 PipelineError::CsvError(err.to_string())
219 }
220}
221
222impl From<std::io::Error> for PipelineError {
223 fn from(err: std::io::Error) -> Self {
224 PipelineError::IoError(err.to_string())
225 }
226}
227
228impl From<SerdeJsonError> for PipelineError {
229 fn from(err: SerdeJsonError) -> Self {
230 PipelineError::SerializationError(err.to_string())
231 }
232}
233
234impl From<rusqlite::Error> for PipelineError {
235 fn from(err: rusqlite::Error) -> Self {
236 PipelineError::DatabaseError(err.to_string())
237 }
238}
239
240impl From<rusqlite::Error> for SpiderError {
241 fn from(err: rusqlite::Error) -> Self {
242 SpiderError::PipelineError(PipelineError::DatabaseError(err.to_string()))
243 }
244}