dsfb_semiconductor/dataset/
secom.rs1use crate::error::{DsfbSemiconductorError, Result};
2use chrono::NaiveDateTime;
3use serde::Serialize;
4use std::fs::{self, File};
5use std::io::{BufRead, BufReader, Read, Write};
6use std::path::{Path, PathBuf};
7use zip::ZipArchive;
8
/// URL from which the SECOM zip archive is downloaded.
pub const SECOM_UCI_URL: &str = "https://archive.ics.uci.edu/static/public/179/secom.zip";

/// File name used for the locally stored copy of the archive.
pub const SECOM_ARCHIVE_NAME: &str = "secom.zip";

/// File name of the whitespace-delimited feature matrix.
pub const SECOM_DATA_FILE: &str = "secom.data";

/// File name of the per-row label and timestamp listing.
pub const SECOM_LABELS_FILE: &str = "secom_labels.data";

/// File name of the free-text dataset description.
pub const SECOM_NAMES_FILE: &str = "secom.names";
14
15#[derive(Debug, Clone, Serialize)]
16pub struct SecomArchiveLayout {
17 pub data_file_numeric_column_count: usize,
18 pub metadata_attribute_count_claim: Option<usize>,
19 pub label_row_count: usize,
20 pub label_file_includes_timestamp: bool,
21 pub note: String,
22}
23
24#[derive(Debug, Clone, Serialize)]
25pub struct SecomRun {
26 pub index: usize,
27 pub label: i8,
28 pub timestamp: NaiveDateTime,
29 pub features: Vec<Option<f64>>,
30}
31
32#[derive(Debug, Clone, Serialize)]
33pub struct SecomDataset {
34 pub feature_names: Vec<String>,
35 pub runs: Vec<SecomRun>,
36}
37
38#[derive(Debug, Clone, Serialize)]
39pub struct SecomDataPaths {
40 pub root: PathBuf,
41 pub archive: PathBuf,
42 pub data_file: PathBuf,
43 pub labels_file: PathBuf,
44 pub names_file: PathBuf,
45}
46
47pub fn dataset_paths(data_root: &Path) -> SecomDataPaths {
48 let root = data_root.join("secom");
49 SecomDataPaths {
50 archive: root.join(SECOM_ARCHIVE_NAME),
51 data_file: root.join(SECOM_DATA_FILE),
52 labels_file: root.join(SECOM_LABELS_FILE),
53 names_file: root.join(SECOM_NAMES_FILE),
54 root,
55 }
56}
57
58pub fn fetch_if_missing(data_root: &Path) -> Result<SecomDataPaths> {
59 let paths = dataset_paths(data_root);
60 if paths.data_file.exists() && paths.labels_file.exists() && paths.names_file.exists() {
61 return Ok(paths);
62 }
63
64 fs::create_dir_all(&paths.root)?;
65
66 if !paths.archive.exists() {
67 let response = ureq::get(SECOM_UCI_URL)
68 .call()
69 .map_err(|err| DsfbSemiconductorError::Network(err.to_string()))?;
70 let mut reader = response.into_reader();
71 let mut file = File::create(&paths.archive)?;
72 std::io::copy(&mut reader, &mut file)?;
73 }
74
75 unpack_archive(&paths.archive, &paths.root)?;
76 Ok(paths)
77}
78
79pub fn ensure_present(data_root: &Path) -> Result<SecomDataPaths> {
80 let paths = dataset_paths(data_root);
81 if paths.data_file.exists() && paths.labels_file.exists() && paths.names_file.exists() {
82 Ok(paths)
83 } else {
84 Err(DsfbSemiconductorError::DatasetMissing {
85 dataset: "SECOM",
86 path: paths.root,
87 })
88 }
89}
90
91pub fn load_from_root(data_root: &Path) -> Result<SecomDataset> {
92 let paths = ensure_present(data_root)?;
93 load_from_paths(&paths)
94}
95
96pub fn inspect_archive_layout(paths: &SecomDataPaths) -> Result<SecomArchiveLayout> {
97 let mut data_file_numeric_column_count = 0usize;
98 let reader = BufReader::new(File::open(&paths.data_file)?);
99 for line in reader.lines() {
100 let line = line?;
101 let trimmed = line.trim();
102 if !trimmed.is_empty() {
103 data_file_numeric_column_count = trimmed.split_whitespace().count();
104 break;
105 }
106 }
107
108 let label_rows = read_labels(&paths.labels_file)?;
109 let names_text = fs::read_to_string(&paths.names_file)?;
110 let metadata_attribute_count_claim = names_text
111 .lines()
112 .find_map(|line| {
113 let normalized = line.trim().to_ascii_lowercase();
114 normalized
115 .strip_prefix("number of attributes:")
116 .and_then(|value| value.trim().parse::<usize>().ok())
117 })
118 .or_else(|| {
119 names_text.lines().find_map(|line| {
120 let normalized = line.to_ascii_lowercase();
121 let prefix = "consisting of 1567 examples each with ";
122 normalized.find(prefix).and_then(|start| {
123 normalized[start + prefix.len()..]
124 .split_whitespace()
125 .next()
126 .and_then(|token| token.parse::<usize>().ok())
127 })
128 })
129 });
130
131 let note = match metadata_attribute_count_claim {
132 Some(claim) if claim != data_file_numeric_column_count => format!(
133 "The distributed UCI archive currently parses as {data_file_numeric_column_count} whitespace-delimited numeric columns in {SECOM_DATA_FILE}, while {SECOM_NAMES_FILE} states {claim} attributes. This crate uses the {data_file_numeric_column_count} numeric columns actually present in {SECOM_DATA_FILE} and reads labels plus timestamps separately from {SECOM_LABELS_FILE}."
134 ),
135 Some(claim) => format!(
136 "The distributed UCI archive parses as {data_file_numeric_column_count} numeric columns in {SECOM_DATA_FILE}, matching the {claim}-attribute claim in {SECOM_NAMES_FILE}. Labels and timestamps are read separately from {SECOM_LABELS_FILE}."
137 ),
138 None => format!(
139 "The distributed UCI archive parses as {data_file_numeric_column_count} numeric columns in {SECOM_DATA_FILE}. Labels and timestamps are read separately from {SECOM_LABELS_FILE}."
140 ),
141 };
142
143 Ok(SecomArchiveLayout {
144 data_file_numeric_column_count,
145 metadata_attribute_count_claim,
146 label_row_count: label_rows.len(),
147 label_file_includes_timestamp: !label_rows.is_empty(),
148 note,
149 })
150}
151
152pub fn load_from_paths(paths: &SecomDataPaths) -> Result<SecomDataset> {
153 let labels = read_labels(&paths.labels_file)?;
154 let data = read_data(&paths.data_file)?;
155
156 if labels.len() != data.len() {
157 return Err(DsfbSemiconductorError::DatasetFormat(format!(
158 "SECOM rows do not match labels: {} data rows vs {} labels",
159 data.len(),
160 labels.len()
161 )));
162 }
163
164 let feature_count = data.first().map(Vec::len).unwrap_or_default();
165 let feature_names = (1..=feature_count)
166 .map(|idx| format!("S{idx:03}"))
167 .collect::<Vec<_>>();
168
169 let runs = data
170 .into_iter()
171 .zip(labels.into_iter())
172 .enumerate()
173 .map(|(index, (features, (label, timestamp)))| SecomRun {
174 index,
175 label,
176 timestamp,
177 features,
178 })
179 .collect::<Vec<_>>();
180
181 Ok(SecomDataset {
182 feature_names,
183 runs,
184 })
185}
186
187fn unpack_archive(archive_path: &Path, output_dir: &Path) -> Result<()> {
188 let file = File::open(archive_path)?;
189 let mut archive = ZipArchive::new(file)?;
190 for index in 0..archive.len() {
191 let mut entry = archive.by_index(index)?;
192 let out_path = output_dir.join(entry.name());
193 let mut out_file = File::create(out_path)?;
194 let mut buffer = Vec::new();
195 entry.read_to_end(&mut buffer)?;
196 out_file.write_all(&buffer)?;
197 }
198 Ok(())
199}
200
201fn read_labels(path: &Path) -> Result<Vec<(i8, NaiveDateTime)>> {
202 let reader = BufReader::new(File::open(path)?);
203 let mut labels = Vec::new();
204
205 for line in reader.lines() {
206 let line = line?;
207 let trimmed = line.trim();
208 if trimmed.is_empty() {
209 continue;
210 }
211 let mut parts = trimmed.splitn(2, ' ');
212 let label = parts
213 .next()
214 .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing SECOM label".into()))?
215 .parse::<i8>()
216 .map_err(|err| DsfbSemiconductorError::DatasetFormat(err.to_string()))?;
217 let timestamp_raw = parts
218 .next()
219 .ok_or_else(|| {
220 DsfbSemiconductorError::DatasetFormat("missing SECOM label timestamp".into())
221 })?
222 .trim_matches('"');
223 let timestamp = NaiveDateTime::parse_from_str(timestamp_raw, "%d/%m/%Y %H:%M:%S")
224 .map_err(|err| DsfbSemiconductorError::DatasetFormat(err.to_string()))?;
225 labels.push((label, timestamp));
226 }
227
228 Ok(labels)
229}
230
231fn read_data(path: &Path) -> Result<Vec<Vec<Option<f64>>>> {
232 let reader = BufReader::new(File::open(path)?);
233 let mut rows = Vec::new();
234
235 for line in reader.lines() {
236 let line = line?;
237 let trimmed = line.trim();
238 if trimmed.is_empty() {
239 continue;
240 }
241 let row = trimmed
242 .split_whitespace()
243 .map(|token| {
244 if token.eq_ignore_ascii_case("nan") {
245 Ok(None)
246 } else {
247 token.parse::<f64>().map(Some).map_err(|err| {
248 DsfbSemiconductorError::DatasetFormat(format!(
249 "invalid SECOM value `{token}`: {err}"
250 ))
251 })
252 }
253 })
254 .collect::<Result<Vec<_>>>()?;
255 rows.push(row);
256 }
257
258 Ok(rows)
259}
260
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a scratch directory for a single test.
    fn scratch_dir() -> tempfile::TempDir {
        tempfile::tempdir().unwrap()
    }

    #[test]
    fn labels_parse_from_uci_format() {
        let dir = scratch_dir();
        let labels_path = dir.path().join("labels.data");
        let contents = "-1 \"19/07/2008 11:55:00\"\n1 \"19/07/2008 13:17:00\"\n";
        fs::write(&labels_path, contents).unwrap();

        let parsed = read_labels(&labels_path).unwrap();
        let label_values: Vec<i8> = parsed.iter().map(|(label, _)| *label).collect();

        assert_eq!(label_values, vec![-1, 1]);
    }

    #[test]
    fn data_parser_keeps_nan_as_missing() {
        let dir = scratch_dir();
        let data_path = dir.path().join("secom.data");
        fs::write(&data_path, "1.0 NaN 2.5\n").unwrap();

        let parsed = read_data(&data_path).unwrap();

        assert_eq!(parsed.first(), Some(&vec![Some(1.0), None, Some(2.5)]));
    }

    #[test]
    fn archive_layout_reports_mismatch_when_names_claim_exceeds_numeric_columns() {
        let dir = scratch_dir();
        // `dataset_paths` places every file in `<temp>/secom`.
        let paths = dataset_paths(dir.path());
        fs::create_dir_all(&paths.root).unwrap();

        let fixtures = [
            (&paths.data_file, "1.0 2.0 3.0\n4.0 5.0 6.0\n"),
            (
                &paths.labels_file,
                "-1 \"01/01/2008 00:00:00\"\n1 \"01/01/2008 01:00:00\"\n",
            ),
            (
                &paths.names_file,
                "Number of Attributes: 4\nData Structure: 2 examples each with 4 features\n",
            ),
        ];
        for (location, contents) in fixtures {
            fs::write(location, contents).unwrap();
        }

        let layout = inspect_archive_layout(&paths).unwrap();

        assert_eq!(layout.data_file_numeric_column_count, 3);
        assert_eq!(layout.metadata_attribute_count_claim, Some(4));
        assert_eq!(layout.label_row_count, 2);
        assert!(layout.label_file_includes_timestamp);
        assert!(layout
            .note
            .contains("3 whitespace-delimited numeric columns"));
    }
}
323}