// dataset_core/error.rs

use std::fmt;

use ureq::Error as UreqError;
use zip::result::ZipError;

4/// Specific kinds of data format errors that can occur during dataset parsing.
5///
6/// # Variants
7///
8/// - `CsvReadError` - Failed to read a CSV record.
9/// - `InvalidColumnCount` - The row has an unexpected number of columns.
10/// - `ParseFailed` - Failed to parse a field value into the target type.
11/// - `InvalidValue` - The field value is syntactically valid but semantically incorrect.
12/// - `LengthMismatch` - The total parsed data length doesn't match expected dimensions.
13/// - `EmptyDataset` - The dataset is empty.
14/// - `ArrayShapeError` - Failed to construct ndarray with the given shape and data.
15#[derive(Debug, thiserror::Error)]
16pub enum DataFormatErrorKind {
17    /// Failed to read a CSV record
18    #[error("[{dataset_name}] failed to read CSV record: {error}")]
19    CsvReadError {
20        /// Dataset identifier
21        dataset_name: String,
22        /// The underlying CSV error message
23        error: String,
24    },
25    /// The row has an unexpected number of columns
26    #[error(
27        "[{dataset_name}] invalid column count at line {line_num}: expected {expected}, got {actual} (line: `{line}`)"
28    )]
29    InvalidColumnCount {
30        /// Dataset identifier
31        dataset_name: String,
32        /// Expected number of columns
33        expected: usize,
34        /// Actual number of columns found
35        actual: usize,
36        /// Line number (1-based)
37        line_num: usize,
38        /// The original input line
39        line: String,
40    },
41    /// Failed to parse a field value into the target type
42    #[error(
43        "[{dataset_name}] failed to parse `{field_name}` at line {line_num}: {error} (line: `{line}`)"
44    )]
45    ParseFailed {
46        /// Dataset identifier
47        dataset_name: String,
48        /// Field name that failed to parse
49        field_name: String,
50        /// Line number (1-based)
51        line_num: usize,
52        /// The original input line
53        line: String,
54        /// The underlying parse error message
55        error: String,
56    },
57    /// The field value is syntactically valid but semantically incorrect
58    #[error(
59        "[{dataset_name}] invalid value for `{field_name}` at line {line_num}: `{value}` (line: `{line}`)"
60    )]
61    InvalidValue {
62        /// Dataset identifier
63        dataset_name: String,
64        /// Field name with invalid value
65        field_name: String,
66        /// The invalid value
67        value: String,
68        /// Line number (1-based)
69        line_num: usize,
70        /// The original input line
71        line: String,
72    },
73    /// The total parsed data length doesn't match expected dimensions
74    #[error("[{dataset_name}] invalid `{field_name}` length: expected {expected}, got {actual}")]
75    LengthMismatch {
76        /// Dataset identifier
77        dataset_name: String,
78        /// Field name whose length is being validated
79        field_name: String,
80        /// Expected length
81        expected: usize,
82        /// Actual length
83        actual: usize,
84    },
85    /// The dataset is empty
86    #[error("[{dataset_name}] is empty")]
87    EmptyDataset {
88        /// Dataset identifier
89        dataset_name: String,
90    },
91    /// Failed to construct ndarray with the given shape and data
92    #[error("[{dataset_name}] failed to build `{array_name}` array: {error}")]
93    ArrayShapeError {
94        /// Dataset identifier
95        dataset_name: String,
96        /// Array name that failed to build
97        array_name: String,
98        /// The underlying shape error message
99        error: String,
100    },
101}
102
103/// Error type used by dataset loading utilities.
104///
105/// # Variants
106///
107/// - `DownloadError` - The download step failed (network, invalid URL, or downloader configuration).
108/// - `ValidationError` - Downloaded file content failed integrity validation (SHA256 mismatch).
109/// - `UnzipError` - Extracting a zip archive failed.
110/// - `IoError` - A standard I/O operation failed (reading directories, opening/removing files, etc.).
111/// - `DataFormatError` - The dataset content was not in the expected format.
112#[derive(Debug, thiserror::Error)]
113pub enum DatasetError {
114    #[error("Download error: {0}")]
115    DownloadError(#[from] UreqError),
116
117    #[error("Validation error: {0}")]
118    ValidationError(String),
119
120    #[error("Unzip error: {0}")]
121    UnzipError(#[from] ZipError),
122
123    #[error("I/O error: {0}")]
124    IoError(#[from] std::io::Error),
125
126    #[error("Data format error: {0}")]
127    DataFormatError(#[from] DataFormatErrorKind),
128}
129
130impl DatasetError {
131    /// Creates a standard SHA256 validation failure error message for a file.
132    ///
133    /// # Parameters
134    ///
135    /// - `dataset_name` - The dataset identifier used in the error prefix.
136    /// - `file_name` - The dataset file name that failed checksum validation.
137    ///
138    /// # Returns
139    ///
140    /// - `DatasetError::ValidationError` - A variant of `DatasetError` that contains the unified SHA256 failure message.
141    pub fn sha256_validation_failed(dataset_name: &str, file_name: &str) -> Self {
142        Self::ValidationError(format!(
143            "[{}] SHA256 validation failed for file `{}`",
144            dataset_name, file_name
145        ))
146    }
147
148    /// Creates a CSV read error.
149    ///
150    /// # Parameters
151    ///
152    /// - `dataset_name` - The dataset identifier.
153    /// - `error` - The underlying CSV error.
154    ///
155    /// # Returns
156    ///
157    /// - `DatasetError::DataFormatError(DataFormatErrorKind::CsvReadError)` - A variant of `DatasetError` describing the CSV read error.
158    pub fn csv_read_error(dataset_name: &str, error: impl std::fmt::Display) -> Self {
159        Self::DataFormatError(DataFormatErrorKind::CsvReadError {
160            dataset_name: dataset_name.to_string(),
161            error: error.to_string(),
162        })
163    }
164
165    /// Creates a unified invalid-column-count data format error.
166    ///
167    /// # Parameters
168    ///
169    /// - `dataset_name` - The dataset identifier used in the error prefix.
170    /// - `expected` - The expected number of columns.
171    /// - `actual` - The actual number of columns found.
172    /// - `line_num` - The line number (1-based) where the error occurred.
173    /// - `line` - The original input line that failed validation.
174    ///
175    /// # Returns
176    ///
177    /// - `DatasetError::DataFormatError(DataFormatErrorKind::InvalidColumnCount)` - A variant of `DatasetError` describing the column count mismatch.
178    pub fn invalid_column_count(
179        dataset_name: &str,
180        expected: usize,
181        actual: usize,
182        line_num: usize,
183        line: &str,
184    ) -> Self {
185        Self::DataFormatError(DataFormatErrorKind::InvalidColumnCount {
186            dataset_name: dataset_name.to_string(),
187            expected,
188            actual,
189            line_num,
190            line: line.to_string(),
191        })
192    }
193
194    /// Creates a unified parse failure data format error.
195    ///
196    /// # Parameters
197    ///
198    /// - `dataset_name` - The dataset identifier.
199    /// - `field_name` - The logical field name that failed to parse.
200    /// - `line_num` - The line number (1-based) where the error occurred.
201    /// - `line` - The original input line where parsing failed.
202    /// - `err` - The underlying parser error detail.
203    ///
204    /// # Returns
205    ///
206    /// - `DatasetError::DataFormatError(DataFormatErrorKind::ParseFailed)` - A variant of `DatasetError` describing the parse failure.
207    pub fn parse_failed(
208        dataset_name: &str,
209        field_name: &str,
210        line_num: usize,
211        line: &str,
212        err: impl std::fmt::Display,
213    ) -> Self {
214        Self::DataFormatError(DataFormatErrorKind::ParseFailed {
215            dataset_name: dataset_name.to_string(),
216            field_name: field_name.to_string(),
217            line_num,
218            line: line.to_string(),
219            error: err.to_string(),
220        })
221    }
222
223    /// Creates a unified invalid-field-value data format error.
224    ///
225    /// # Parameters
226    ///
227    /// - `dataset_name` - The dataset identifier.
228    /// - `field_name` - The logical field name with an invalid value.
229    /// - `value` - The invalid raw value.
230    /// - `line_num` - The line number (1-based) where the error occurred.
231    /// - `line` - The original input line where the invalid value was found.
232    ///
233    /// # Returns
234    ///
235    /// - `DatasetError::DataFormatError(DataFormatErrorKind::InvalidValue)` - A variant of `DatasetError` describing the invalid value.
236    pub fn invalid_value(
237        dataset_name: &str,
238        field_name: &str,
239        value: &str,
240        line_num: usize,
241        line: &str,
242    ) -> Self {
243        Self::DataFormatError(DataFormatErrorKind::InvalidValue {
244            dataset_name: dataset_name.to_string(),
245            field_name: field_name.to_string(),
246            value: value.to_string(),
247            line_num,
248            line: line.to_string(),
249        })
250    }
251
252    /// Creates a unified vector/row length mismatch data format error.
253    ///
254    /// # Parameters
255    ///
256    /// - `dataset_name` - The dataset identifier.
257    /// - `field_name` - The logical field name whose length is being validated.
258    /// - `expected` - The expected length.
259    /// - `actual` - The actual length.
260    ///
261    /// # Returns
262    ///
263    /// - `DatasetError::DataFormatError(DataFormatErrorKind::LengthMismatch)` - A variant of `DatasetError` describing the length mismatch.
264    pub fn length_mismatch(
265        dataset_name: &str,
266        field_name: &str,
267        expected: usize,
268        actual: usize,
269    ) -> Self {
270        Self::DataFormatError(DataFormatErrorKind::LengthMismatch {
271            dataset_name: dataset_name.to_string(),
272            field_name: field_name.to_string(),
273            expected,
274            actual,
275        })
276    }
277
278    /// Creates a unified ndarray shape construction data format error.
279    ///
280    /// # Parameters
281    ///
282    /// - `dataset_name` - The dataset identifier.
283    /// - `array_name` - The logical array name that failed to build.
284    /// - `err` - The underlying ndarray shape construction error detail.
285    ///
286    /// # Returns
287    ///
288    /// - `DatasetError::DataFormatError(DataFormatErrorKind::ArrayShapeError)` - A variant of `DatasetError` describing the array shape failure.
289    pub fn array_shape_error(
290        dataset_name: &str,
291        array_name: &str,
292        err: impl std::fmt::Display,
293    ) -> Self {
294        Self::DataFormatError(DataFormatErrorKind::ArrayShapeError {
295            dataset_name: dataset_name.to_string(),
296            array_name: array_name.to_string(),
297            error: err.to_string(),
298        })
299    }
300
301    /// Creates an empty dataset error.
302    ///
303    /// # Parameters
304    ///
305    /// - `dataset_name` - The dataset identifier.
306    ///
307    /// # Returns
308    ///
309    /// - `DatasetError::DataFormatError(DataFormatErrorKind::EmptyDataset)` - A variant of `DatasetError` indicating the dataset is empty.
310    pub fn empty_dataset(dataset_name: &str) -> Self {
311        Self::DataFormatError(DataFormatErrorKind::EmptyDataset {
312            dataset_name: dataset_name.to_string(),
313        })
314    }
315}