// dataset_core/error.rs
1use ureq::Error as UreqError;
2use zip::result::ZipError;
3
4/// Specific kinds of data format errors that can occur during dataset parsing.
5///
6/// # Variants
7///
8/// - `CsvReadError` - Failed to read a CSV record.
9/// - `InvalidColumnCount` - The row has an unexpected number of columns.
10/// - `ParseFailed` - Failed to parse a field value into the target type.
11/// - `InvalidValue` - The field value is syntactically valid but semantically incorrect.
12/// - `LengthMismatch` - The total parsed data length doesn't match expected dimensions.
13/// - `EmptyDataset` - The dataset is empty.
14/// - `ArrayShapeError` - Failed to construct ndarray with the given shape and data.
15#[derive(Debug, thiserror::Error)]
16pub enum DataFormatErrorKind {
17 /// Failed to read a CSV record
18 #[error("[{dataset_name}] failed to read CSV record: {error}")]
19 CsvReadError {
20 /// Dataset identifier
21 dataset_name: String,
22 /// The underlying CSV error message
23 error: String,
24 },
25 /// The row has an unexpected number of columns
26 #[error(
27 "[{dataset_name}] invalid column count at line {line_num}: expected {expected}, got {actual} (line: `{line}`)"
28 )]
29 InvalidColumnCount {
30 /// Dataset identifier
31 dataset_name: String,
32 /// Expected number of columns
33 expected: usize,
34 /// Actual number of columns found
35 actual: usize,
36 /// Line number (1-based)
37 line_num: usize,
38 /// The original input line
39 line: String,
40 },
41 /// Failed to parse a field value into the target type
42 #[error(
43 "[{dataset_name}] failed to parse `{field_name}` at line {line_num}: {error} (line: `{line}`)"
44 )]
45 ParseFailed {
46 /// Dataset identifier
47 dataset_name: String,
48 /// Field name that failed to parse
49 field_name: String,
50 /// Line number (1-based)
51 line_num: usize,
52 /// The original input line
53 line: String,
54 /// The underlying parse error message
55 error: String,
56 },
57 /// The field value is syntactically valid but semantically incorrect
58 #[error(
59 "[{dataset_name}] invalid value for `{field_name}` at line {line_num}: `{value}` (line: `{line}`)"
60 )]
61 InvalidValue {
62 /// Dataset identifier
63 dataset_name: String,
64 /// Field name with invalid value
65 field_name: String,
66 /// The invalid value
67 value: String,
68 /// Line number (1-based)
69 line_num: usize,
70 /// The original input line
71 line: String,
72 },
73 /// The total parsed data length doesn't match expected dimensions
74 #[error("[{dataset_name}] invalid `{field_name}` length: expected {expected}, got {actual}")]
75 LengthMismatch {
76 /// Dataset identifier
77 dataset_name: String,
78 /// Field name whose length is being validated
79 field_name: String,
80 /// Expected length
81 expected: usize,
82 /// Actual length
83 actual: usize,
84 },
85 /// The dataset is empty
86 #[error("[{dataset_name}] is empty")]
87 EmptyDataset {
88 /// Dataset identifier
89 dataset_name: String,
90 },
91 /// Failed to construct ndarray with the given shape and data
92 #[error("[{dataset_name}] failed to build `{array_name}` array: {error}")]
93 ArrayShapeError {
94 /// Dataset identifier
95 dataset_name: String,
96 /// Array name that failed to build
97 array_name: String,
98 /// The underlying shape error message
99 error: String,
100 },
101}
102
103/// Error type used by dataset loading utilities.
104///
105/// # Variants
106///
107/// - `DownloadError` - The download step failed (network, invalid URL, or downloader configuration).
108/// - `ValidationError` - Downloaded file content failed integrity validation (SHA256 mismatch).
109/// - `UnzipError` - Extracting a zip archive failed.
110/// - `IoError` - A standard I/O operation failed (reading directories, opening/removing files, etc.).
111/// - `DataFormatError` - The dataset content was not in the expected format.
112#[derive(Debug, thiserror::Error)]
113pub enum DatasetError {
114 #[error("Download error: {0}")]
115 DownloadError(#[from] UreqError),
116
117 #[error("Validation error: {0}")]
118 ValidationError(String),
119
120 #[error("Unzip error: {0}")]
121 UnzipError(#[from] ZipError),
122
123 #[error("I/O error: {0}")]
124 IoError(#[from] std::io::Error),
125
126 #[error("Data format error: {0}")]
127 DataFormatError(#[from] DataFormatErrorKind),
128}
129
130impl DatasetError {
131 /// Creates a standard SHA256 validation failure error message for a file.
132 ///
133 /// # Parameters
134 ///
135 /// - `dataset_name` - The dataset identifier used in the error prefix.
136 /// - `file_name` - The dataset file name that failed checksum validation.
137 ///
138 /// # Returns
139 ///
140 /// - `DatasetError::ValidationError` - A variant of `DatasetError` that contains the unified SHA256 failure message.
141 pub fn sha256_validation_failed(dataset_name: &str, file_name: &str) -> Self {
142 Self::ValidationError(format!(
143 "[{}] SHA256 validation failed for file `{}`",
144 dataset_name, file_name
145 ))
146 }
147
148 /// Creates a CSV read error.
149 ///
150 /// # Parameters
151 ///
152 /// - `dataset_name` - The dataset identifier.
153 /// - `error` - The underlying CSV error.
154 ///
155 /// # Returns
156 ///
157 /// - `DatasetError::DataFormatError(DataFormatErrorKind::CsvReadError)` - A variant of `DatasetError` describing the CSV read error.
158 pub fn csv_read_error(dataset_name: &str, error: impl std::fmt::Display) -> Self {
159 Self::DataFormatError(DataFormatErrorKind::CsvReadError {
160 dataset_name: dataset_name.to_string(),
161 error: error.to_string(),
162 })
163 }
164
165 /// Creates a unified invalid-column-count data format error.
166 ///
167 /// # Parameters
168 ///
169 /// - `dataset_name` - The dataset identifier used in the error prefix.
170 /// - `expected` - The expected number of columns.
171 /// - `actual` - The actual number of columns found.
172 /// - `line_num` - The line number (1-based) where the error occurred.
173 /// - `line` - The original input line that failed validation.
174 ///
175 /// # Returns
176 ///
177 /// - `DatasetError::DataFormatError(DataFormatErrorKind::InvalidColumnCount)` - A variant of `DatasetError` describing the column count mismatch.
178 pub fn invalid_column_count(
179 dataset_name: &str,
180 expected: usize,
181 actual: usize,
182 line_num: usize,
183 line: &str,
184 ) -> Self {
185 Self::DataFormatError(DataFormatErrorKind::InvalidColumnCount {
186 dataset_name: dataset_name.to_string(),
187 expected,
188 actual,
189 line_num,
190 line: line.to_string(),
191 })
192 }
193
194 /// Creates a unified parse failure data format error.
195 ///
196 /// # Parameters
197 ///
198 /// - `dataset_name` - The dataset identifier.
199 /// - `field_name` - The logical field name that failed to parse.
200 /// - `line_num` - The line number (1-based) where the error occurred.
201 /// - `line` - The original input line where parsing failed.
202 /// - `err` - The underlying parser error detail.
203 ///
204 /// # Returns
205 ///
206 /// - `DatasetError::DataFormatError(DataFormatErrorKind::ParseFailed)` - A variant of `DatasetError` describing the parse failure.
207 pub fn parse_failed(
208 dataset_name: &str,
209 field_name: &str,
210 line_num: usize,
211 line: &str,
212 err: impl std::fmt::Display,
213 ) -> Self {
214 Self::DataFormatError(DataFormatErrorKind::ParseFailed {
215 dataset_name: dataset_name.to_string(),
216 field_name: field_name.to_string(),
217 line_num,
218 line: line.to_string(),
219 error: err.to_string(),
220 })
221 }
222
223 /// Creates a unified invalid-field-value data format error.
224 ///
225 /// # Parameters
226 ///
227 /// - `dataset_name` - The dataset identifier.
228 /// - `field_name` - The logical field name with an invalid value.
229 /// - `value` - The invalid raw value.
230 /// - `line_num` - The line number (1-based) where the error occurred.
231 /// - `line` - The original input line where the invalid value was found.
232 ///
233 /// # Returns
234 ///
235 /// - `DatasetError::DataFormatError(DataFormatErrorKind::InvalidValue)` - A variant of `DatasetError` describing the invalid value.
236 pub fn invalid_value(
237 dataset_name: &str,
238 field_name: &str,
239 value: &str,
240 line_num: usize,
241 line: &str,
242 ) -> Self {
243 Self::DataFormatError(DataFormatErrorKind::InvalidValue {
244 dataset_name: dataset_name.to_string(),
245 field_name: field_name.to_string(),
246 value: value.to_string(),
247 line_num,
248 line: line.to_string(),
249 })
250 }
251
252 /// Creates a unified vector/row length mismatch data format error.
253 ///
254 /// # Parameters
255 ///
256 /// - `dataset_name` - The dataset identifier.
257 /// - `field_name` - The logical field name whose length is being validated.
258 /// - `expected` - The expected length.
259 /// - `actual` - The actual length.
260 ///
261 /// # Returns
262 ///
263 /// - `DatasetError::DataFormatError(DataFormatErrorKind::LengthMismatch)` - A variant of `DatasetError` describing the length mismatch.
264 pub fn length_mismatch(
265 dataset_name: &str,
266 field_name: &str,
267 expected: usize,
268 actual: usize,
269 ) -> Self {
270 Self::DataFormatError(DataFormatErrorKind::LengthMismatch {
271 dataset_name: dataset_name.to_string(),
272 field_name: field_name.to_string(),
273 expected,
274 actual,
275 })
276 }
277
278 /// Creates a unified ndarray shape construction data format error.
279 ///
280 /// # Parameters
281 ///
282 /// - `dataset_name` - The dataset identifier.
283 /// - `array_name` - The logical array name that failed to build.
284 /// - `err` - The underlying ndarray shape construction error detail.
285 ///
286 /// # Returns
287 ///
288 /// - `DatasetError::DataFormatError(DataFormatErrorKind::ArrayShapeError)` - A variant of `DatasetError` describing the array shape failure.
289 pub fn array_shape_error(
290 dataset_name: &str,
291 array_name: &str,
292 err: impl std::fmt::Display,
293 ) -> Self {
294 Self::DataFormatError(DataFormatErrorKind::ArrayShapeError {
295 dataset_name: dataset_name.to_string(),
296 array_name: array_name.to_string(),
297 error: err.to_string(),
298 })
299 }
300
301 /// Creates an empty dataset error.
302 ///
303 /// # Parameters
304 ///
305 /// - `dataset_name` - The dataset identifier.
306 ///
307 /// # Returns
308 ///
309 /// - `DatasetError::DataFormatError(DataFormatErrorKind::EmptyDataset)` - A variant of `DatasetError` indicating the dataset is empty.
310 pub fn empty_dataset(dataset_name: &str) -> Self {
311 Self::DataFormatError(DataFormatErrorKind::EmptyDataset {
312 dataset_name: dataset_name.to_string(),
313 })
314 }
315}