Skip to main content

spider_lib/
error.rs

1//! Custom error types for the `spider-lib` framework.
2//!
3//! This module defines a comprehensive set of custom error types, `SpiderError`
4//! and `PipelineError`, used throughout the `spider-lib` crate. These errors
5//! encapsulate various issues that can occur during crawling, such as network
6//! failures, URL parsing problems, I/O errors, configuration issues, and
7//! problems within item processing pipelines.
8//!
9//! By centralizing error definitions, the module provides a consistent and
10//! semantic way to report and handle errors, improving the robustness and
11//! maintainability of the web scraping application.
12
13use http;
14use thiserror::Error;
15
16#[derive(Debug, Clone, Error)]
17#[error("Reqwest error: {message}")]
18pub struct ReqwestErrorDetails {
19    pub message: String,
20    pub is_connect: bool,
21    pub is_timeout: bool,
22    // Add other relevant flags if necessary
23}
24
25impl From<reqwest::Error> for ReqwestErrorDetails {
26    fn from(err: reqwest::Error) -> Self {
27        ReqwestErrorDetails {
28            is_connect: err.is_connect(),
29            is_timeout: err.is_timeout(),
30            message: err.to_string(),
31        }
32    }
33}
34
35/// The main error type for the spider framework.
36#[derive(Debug, Clone, Error)] // Added Clone
37pub enum SpiderError {
38    #[error("Reqwest error: {0}")]
39    ReqwestError(#[from] ReqwestErrorDetails), // Changed to wrap ReqwestErrorDetails
40    #[error("Url parsing error: {0}")]
41    UrlParseError(#[from] url::ParseError),
42    #[error("Json parsing error: {0}")]
43    JsonError(String), // Changed to String for cloning
44    #[error("Io error: {0}")]
45    IoError(String), // Changed to String for cloning
46    #[error("Configuration error: {0}")]
47    ConfigurationError(String),
48    #[error("General error: {0}")]
49    GeneralError(String),
50    #[error("Failed to convert item to string: {0}")]
51    ItemToStringError(String),
52    #[error("Error during item serialization: {0}")]
53    ItemSerializationError(String),
54    #[error("Unknown error")]
55    Unknown,
56    #[error("Invalid HTTP header value: {0}")]
57    InvalidHeaderValue(String), // Changed to String
58    #[error("Header value error: {0}")]
59    HeaderValueError(String),
60    #[error("HTML parsing error: {0}")]
61    HtmlParseError(String),
62    #[error("UTF-8 parsing error: {0}")]
63    Utf8Error(#[from] std::str::Utf8Error),
64    #[error("Pipeline error: {0}")]
65    PipelineError(#[from] PipelineError),
66    #[error("Request blocked by robots.txt")]
67    BlockedByRobotsTxt,
68}
69
70// Manual From implementation for http::header::InvalidHeaderValue
71impl From<http::header::InvalidHeaderValue> for SpiderError {
72    fn from(err: http::header::InvalidHeaderValue) -> Self {
73        SpiderError::InvalidHeaderValue(err.to_string())
74    }
75}
76
/// Maps a bincode error to [`SpiderError::GeneralError`], prefixing its
/// message with `"Bincode error: "`.
///
/// Compiled only when the `middleware-http-cache` or `checkpoint` feature is
/// enabled.
#[cfg(any(feature = "middleware-http-cache", feature = "checkpoint"))]
impl From<bincode::Error> for SpiderError {
    fn from(err: bincode::Error) -> Self {
        let message = format!("Bincode error: {}", err);
        Self::GeneralError(message)
    }
}
83
84impl From<reqwest::Error> for SpiderError {
85    fn from(err: reqwest::Error) -> Self {
86        SpiderError::ReqwestError(err.into())
87    }
88}
89
90impl From<std::io::Error> for SpiderError {
91    fn from(err: std::io::Error) -> Self {
92        SpiderError::IoError(err.to_string())
93    }
94}
95
96impl From<serde_json::Error> for SpiderError {
97    fn from(err: serde_json::Error) -> Self {
98        SpiderError::JsonError(err.to_string())
99    }
100}
101
102/// The error type for item processing pipelines.
103#[derive(Error, Debug, Clone)]
104pub enum PipelineError {
105    #[error("I/O error: {0}")]
106    IoError(String),
107    #[error("Item processing error: {0}")]
108    ItemError(String),
109    #[cfg(feature = "pipeline-sqlite")]
110    #[error("Database error: {0}")]
111    DatabaseError(String),
112    #[error("Serialization error: {0}")]
113    SerializationError(String),
114    #[error("CSV error: {0}")]
115    CsvError(String),
116    #[error("Other pipeline error: {0}")]
117    Other(String),
118}
119
/// Maps a `csv` error to [`PipelineError::CsvError`], keeping only its display
/// text. Compiled only when the `pipeline-csv` feature is enabled.
#[cfg(feature = "pipeline-csv")]
impl From<csv::Error> for PipelineError {
    fn from(err: csv::Error) -> Self {
        let message = err.to_string();
        Self::CsvError(message)
    }
}
126
127impl From<std::io::Error> for PipelineError {
128    fn from(err: std::io::Error) -> Self {
129        PipelineError::IoError(err.to_string())
130    }
131}
132
133impl From<serde_json::Error> for PipelineError {
134    fn from(err: serde_json::Error) -> Self {
135        PipelineError::SerializationError(err.to_string())
136    }
137}
138
/// Maps a `rusqlite` error to [`PipelineError::DatabaseError`], keeping only
/// its display text. Compiled only when the `pipeline-sqlite` feature is enabled.
#[cfg(feature = "pipeline-sqlite")]
impl From<rusqlite::Error> for PipelineError {
    fn from(err: rusqlite::Error) -> Self {
        let message = err.to_string();
        Self::DatabaseError(message)
    }
}
/// Maps a `rusqlite` error to [`SpiderError::PipelineError`] holding a
/// [`PipelineError::DatabaseError`] with the error's display text.
/// Compiled only when the `pipeline-sqlite` feature is enabled.
#[cfg(feature = "pipeline-sqlite")]
impl From<rusqlite::Error> for SpiderError {
    fn from(err: rusqlite::Error) -> Self {
        // `PipelineError: From<rusqlite::Error>` yields `DatabaseError(err.to_string())`.
        let pipeline_err = PipelineError::from(err);
        Self::PipelineError(pipeline_err)
    }
}