Skip to main content

dataset_core/
error.rs

1use ureq::Error as UreqError;
2use zip::result::ZipError;
3
4/// Specific kinds of data format errors that can occur during dataset parsing.
5///
6/// # Variants
7///
8/// - `CsvReadError` - Failed to read a CSV record.
9/// - `InvalidColumnCount` - The row has an unexpected number of columns.
10/// - `ParseFailed` - Failed to parse a field value into the target type.
11/// - `InvalidValue` - The field value is syntactically valid but semantically incorrect.
12/// - `LengthMismatch` - The total parsed data length doesn't match expected dimensions.
13/// - `EmptyDataset` - The dataset is empty.
14/// - `ArrayShapeError` - Failed to construct ndarray with the given shape and data.
15#[derive(Debug, thiserror::Error)]
16pub enum DataFormatErrorKind {
17    /// Failed to read a CSV record
18    #[error("[{dataset_name}] failed to read CSV record: {error}")]
19    CsvReadError {
20        /// Dataset identifier
21        dataset_name: String,
22        /// The underlying CSV error message
23        error: String,
24    },
25    /// The row has an unexpected number of columns
26    #[error(
27        "[{dataset_name}] invalid column count at line {line_num}: expected {expected}, got {actual}"
28    )]
29    InvalidColumnCount {
30        /// Dataset identifier
31        dataset_name: String,
32        /// Expected number of columns
33        expected: usize,
34        /// Actual number of columns found
35        actual: usize,
36        /// Line number (1-based)
37        line_num: usize,
38    },
39    /// Failed to parse a field value into the target type
40    #[error(
41        "[{dataset_name}] failed to parse `{field_name}` at line {line_num}: {error}"
42    )]
43    ParseFailed {
44        /// Dataset identifier
45        dataset_name: String,
46        /// Field name that failed to parse
47        field_name: String,
48        /// Line number (1-based)
49        line_num: usize,
50        /// The underlying parse error message
51        error: String,
52    },
53    /// The field value is syntactically valid but semantically incorrect
54    #[error(
55        "[{dataset_name}] invalid value for `{field_name}` at line {line_num}: `{value}`"
56    )]
57    InvalidValue {
58        /// Dataset identifier
59        dataset_name: String,
60        /// Field name with invalid value
61        field_name: String,
62        /// The invalid value
63        value: String,
64        /// Line number (1-based)
65        line_num: usize,
66    },
67    /// The total parsed data length doesn't match expected dimensions
68    #[error("[{dataset_name}] invalid `{field_name}` length: expected {expected}, got {actual}")]
69    LengthMismatch {
70        /// Dataset identifier
71        dataset_name: String,
72        /// Field name whose length is being validated
73        field_name: String,
74        /// Expected length
75        expected: usize,
76        /// Actual length
77        actual: usize,
78    },
79    /// The dataset is empty
80    #[error("[{dataset_name}] is empty")]
81    EmptyDataset {
82        /// Dataset identifier
83        dataset_name: String,
84    },
85    /// Failed to construct ndarray with the given shape and data
86    #[error("[{dataset_name}] failed to build `{array_name}` array: {error}")]
87    ArrayShapeError {
88        /// Dataset identifier
89        dataset_name: String,
90        /// Array name that failed to build
91        array_name: String,
92        /// The underlying shape error message
93        error: String,
94    },
95}
96
97/// Error type used by dataset loading utilities.
98///
99/// # Variants
100///
101/// - `DownloadError` - The download step failed (network, invalid URL, or downloader configuration).
102/// - `ValidationError` - Downloaded file content failed integrity validation (SHA256 mismatch).
103/// - `UnzipError` - Extracting a zip archive failed.
104/// - `IoError` - A standard I/O operation failed (reading directories, opening/removing files, etc.).
105/// - `DataFormatError` - The dataset content was not in the expected format.
106#[derive(Debug, thiserror::Error)]
107pub enum DatasetError {
108    #[error("Download error: {0}")]
109    DownloadError(#[from] UreqError),
110
111    #[error("Validation error: {0}")]
112    ValidationError(String),
113
114    #[error("Unzip error: {0}")]
115    UnzipError(#[from] ZipError),
116
117    #[error("I/O error: {0}")]
118    IoError(#[from] std::io::Error),
119
120    #[error("Data format error: {0}")]
121    DataFormatError(#[from] DataFormatErrorKind),
122}
123
124impl DatasetError {
125    /// Creates a standard SHA256 validation failure error message for a file.
126    ///
127    /// # Parameters
128    ///
129    /// - `dataset_name` - The dataset identifier used in the error prefix.
130    /// - `file_name` - The dataset file name that failed checksum validation.
131    ///
132    /// # Returns
133    ///
134    /// - `DatasetError::ValidationError` - A variant of `DatasetError` that contains the unified SHA256 failure message.
135    pub fn sha256_validation_failed(dataset_name: &str, file_name: &str) -> Self {
136        Self::ValidationError(format!(
137            "[{}] SHA256 validation failed for file `{}`",
138            dataset_name, file_name
139        ))
140    }
141
142    /// Creates a CSV read error.
143    ///
144    /// # Parameters
145    ///
146    /// - `dataset_name` - The dataset identifier.
147    /// - `error` - The underlying CSV error.
148    ///
149    /// # Returns
150    ///
151    /// - `DatasetError::DataFormatError(DataFormatErrorKind::CsvReadError)` - A variant of `DatasetError` describing the CSV read error.
152    pub fn csv_read_error(dataset_name: &str, error: impl std::fmt::Display) -> Self {
153        Self::DataFormatError(DataFormatErrorKind::CsvReadError {
154            dataset_name: dataset_name.to_string(),
155            error: error.to_string(),
156        })
157    }
158
159    /// Creates a unified invalid-column-count data format error.
160    ///
161    /// # Parameters
162    ///
163    /// - `dataset_name` - The dataset identifier used in the error prefix.
164    /// - `expected` - The expected number of columns.
165    /// - `actual` - The actual number of columns found.
166    /// - `line_num` - The line number (1-based) where the error occurred.
167    ///
168    /// # Returns
169    ///
170    /// - `DatasetError::DataFormatError(DataFormatErrorKind::InvalidColumnCount)` - A variant of `DatasetError` describing the column count mismatch.
171    pub fn invalid_column_count(
172        dataset_name: &str,
173        expected: usize,
174        actual: usize,
175        line_num: usize,
176    ) -> Self {
177        Self::DataFormatError(DataFormatErrorKind::InvalidColumnCount {
178            dataset_name: dataset_name.to_string(),
179            expected,
180            actual,
181            line_num,
182        })
183    }
184
185    /// Creates a unified parse failure data format error.
186    ///
187    /// # Parameters
188    ///
189    /// - `dataset_name` - The dataset identifier.
190    /// - `field_name` - The logical field name that failed to parse.
191    /// - `line_num` - The line number (1-based) where the error occurred.
192    /// - `line` - The original input line where parsing failed.
193    /// - `err` - The underlying parser error detail.
194    ///
195    /// # Returns
196    ///
197    /// - `DatasetError::DataFormatError(DataFormatErrorKind::ParseFailed)` - A variant of `DatasetError` describing the parse failure.
198    pub fn parse_failed(
199        dataset_name: &str,
200        field_name: &str,
201        line_num: usize,
202        err: impl std::fmt::Display,
203    ) -> Self {
204        Self::DataFormatError(DataFormatErrorKind::ParseFailed {
205            dataset_name: dataset_name.to_string(),
206            field_name: field_name.to_string(),
207            line_num,
208            error: err.to_string(),
209        })
210    }
211
212    /// Creates a unified invalid-field-value data format error.
213    ///
214    /// # Parameters
215    ///
216    /// - `dataset_name` - The dataset identifier.
217    /// - `field_name` - The logical field name with an invalid value.
218    /// - `value` - The invalid raw value.
219    /// - `line_num` - The line number (1-based) where the error occurred.
220    ///
221    /// # Returns
222    ///
223    /// - `DatasetError::DataFormatError(DataFormatErrorKind::InvalidValue)` - A variant of `DatasetError` describing the invalid value.
224    pub fn invalid_value(
225        dataset_name: &str,
226        field_name: &str,
227        value: &str,
228        line_num: usize,
229    ) -> Self {
230        Self::DataFormatError(DataFormatErrorKind::InvalidValue {
231            dataset_name: dataset_name.to_string(),
232            field_name: field_name.to_string(),
233            value: value.to_string(),
234            line_num,
235        })
236    }
237
238    /// Creates a unified vector/row length mismatch data format error.
239    ///
240    /// # Parameters
241    ///
242    /// - `dataset_name` - The dataset identifier.
243    /// - `field_name` - The logical field name whose length is being validated.
244    /// - `expected` - The expected length.
245    /// - `actual` - The actual length.
246    ///
247    /// # Returns
248    ///
249    /// - `DatasetError::DataFormatError(DataFormatErrorKind::LengthMismatch)` - A variant of `DatasetError` describing the length mismatch.
250    pub fn length_mismatch(
251        dataset_name: &str,
252        field_name: &str,
253        expected: usize,
254        actual: usize,
255    ) -> Self {
256        Self::DataFormatError(DataFormatErrorKind::LengthMismatch {
257            dataset_name: dataset_name.to_string(),
258            field_name: field_name.to_string(),
259            expected,
260            actual,
261        })
262    }
263
264    /// Creates a unified ndarray shape construction data format error.
265    ///
266    /// # Parameters
267    ///
268    /// - `dataset_name` - The dataset identifier.
269    /// - `array_name` - The logical array name that failed to build.
270    /// - `err` - The underlying ndarray shape construction error detail.
271    ///
272    /// # Returns
273    ///
274    /// - `DatasetError::DataFormatError(DataFormatErrorKind::ArrayShapeError)` - A variant of `DatasetError` describing the array shape failure.
275    pub fn array_shape_error(
276        dataset_name: &str,
277        array_name: &str,
278        err: impl std::fmt::Display,
279    ) -> Self {
280        Self::DataFormatError(DataFormatErrorKind::ArrayShapeError {
281            dataset_name: dataset_name.to_string(),
282            array_name: array_name.to_string(),
283            error: err.to_string(),
284        })
285    }
286
287    /// Creates an empty dataset error.
288    ///
289    /// # Parameters
290    ///
291    /// - `dataset_name` - The dataset identifier.
292    ///
293    /// # Returns
294    ///
295    /// - `DatasetError::DataFormatError(DataFormatErrorKind::EmptyDataset)` - A variant of `DatasetError` indicating the dataset is empty.
296    pub fn empty_dataset(dataset_name: &str) -> Self {
297        Self::DataFormatError(DataFormatErrorKind::EmptyDataset {
298            dataset_name: dataset_name.to_string(),
299        })
300    }
301}