// dsfb_semiconductor/dataset/secom.rs

1use crate::error::{DsfbSemiconductorError, Result};
2use chrono::NaiveDateTime;
3use serde::Serialize;
4use std::fs::{self, File};
5use std::io::{BufRead, BufReader, Read, Write};
6use std::path::{Path, PathBuf};
7use zip::ZipArchive;
8
/// URL the SECOM archive is downloaded from when it is not available locally.
pub const SECOM_UCI_URL: &str = "https://archive.ics.uci.edu/static/public/179/secom.zip";
/// File name under which the downloaded archive is stored.
pub const SECOM_ARCHIVE_NAME: &str = "secom.zip";
/// File name of the whitespace-delimited feature matrix.
pub const SECOM_DATA_FILE: &str = "secom.data";
/// File name of the label-plus-timestamp listing.
pub const SECOM_LABELS_FILE: &str = "secom_labels.data";
/// File name of the dataset description text.
pub const SECOM_NAMES_FILE: &str = "secom.names";
14
15#[derive(Debug, Clone, Serialize)]
16pub struct SecomArchiveLayout {
17    pub data_file_numeric_column_count: usize,
18    pub metadata_attribute_count_claim: Option<usize>,
19    pub label_row_count: usize,
20    pub label_file_includes_timestamp: bool,
21    pub note: String,
22}
23
24#[derive(Debug, Clone, Serialize)]
25pub struct SecomRun {
26    pub index: usize,
27    pub label: i8,
28    pub timestamp: NaiveDateTime,
29    pub features: Vec<Option<f64>>,
30}
31
32#[derive(Debug, Clone, Serialize)]
33pub struct SecomDataset {
34    pub feature_names: Vec<String>,
35    pub runs: Vec<SecomRun>,
36}
37
38#[derive(Debug, Clone, Serialize)]
39pub struct SecomDataPaths {
40    pub root: PathBuf,
41    pub archive: PathBuf,
42    pub data_file: PathBuf,
43    pub labels_file: PathBuf,
44    pub names_file: PathBuf,
45}
46
47pub fn dataset_paths(data_root: &Path) -> SecomDataPaths {
48    let root = data_root.join("secom");
49    SecomDataPaths {
50        archive: root.join(SECOM_ARCHIVE_NAME),
51        data_file: root.join(SECOM_DATA_FILE),
52        labels_file: root.join(SECOM_LABELS_FILE),
53        names_file: root.join(SECOM_NAMES_FILE),
54        root,
55    }
56}
57
58pub fn fetch_if_missing(data_root: &Path) -> Result<SecomDataPaths> {
59    let paths = dataset_paths(data_root);
60    if paths.data_file.exists() && paths.labels_file.exists() && paths.names_file.exists() {
61        return Ok(paths);
62    }
63
64    fs::create_dir_all(&paths.root)?;
65
66    if !paths.archive.exists() {
67        let response = ureq::get(SECOM_UCI_URL)
68            .call()
69            .map_err(|err| DsfbSemiconductorError::Network(err.to_string()))?;
70        let mut reader = response.into_reader();
71        let mut file = File::create(&paths.archive)?;
72        std::io::copy(&mut reader, &mut file)?;
73    }
74
75    unpack_archive(&paths.archive, &paths.root)?;
76    Ok(paths)
77}
78
79pub fn ensure_present(data_root: &Path) -> Result<SecomDataPaths> {
80    let paths = dataset_paths(data_root);
81    if paths.data_file.exists() && paths.labels_file.exists() && paths.names_file.exists() {
82        Ok(paths)
83    } else {
84        Err(DsfbSemiconductorError::DatasetMissing {
85            dataset: "SECOM",
86            path: paths.root,
87        })
88    }
89}
90
91pub fn load_from_root(data_root: &Path) -> Result<SecomDataset> {
92    let paths = ensure_present(data_root)?;
93    load_from_paths(&paths)
94}
95
96pub fn inspect_archive_layout(paths: &SecomDataPaths) -> Result<SecomArchiveLayout> {
97    let mut data_file_numeric_column_count = 0usize;
98    let reader = BufReader::new(File::open(&paths.data_file)?);
99    for line in reader.lines() {
100        let line = line?;
101        let trimmed = line.trim();
102        if !trimmed.is_empty() {
103            data_file_numeric_column_count = trimmed.split_whitespace().count();
104            break;
105        }
106    }
107
108    let label_rows = read_labels(&paths.labels_file)?;
109    let names_text = fs::read_to_string(&paths.names_file)?;
110    let metadata_attribute_count_claim = names_text
111        .lines()
112        .find_map(|line| {
113            let normalized = line.trim().to_ascii_lowercase();
114            normalized
115                .strip_prefix("number of attributes:")
116                .and_then(|value| value.trim().parse::<usize>().ok())
117        })
118        .or_else(|| {
119            names_text.lines().find_map(|line| {
120                let normalized = line.to_ascii_lowercase();
121                let prefix = "consisting of 1567 examples each with ";
122                normalized.find(prefix).and_then(|start| {
123                    normalized[start + prefix.len()..]
124                        .split_whitespace()
125                        .next()
126                        .and_then(|token| token.parse::<usize>().ok())
127                })
128            })
129        });
130
131    let note = match metadata_attribute_count_claim {
132        Some(claim) if claim != data_file_numeric_column_count => format!(
133            "The distributed UCI archive currently parses as {data_file_numeric_column_count} whitespace-delimited numeric columns in {SECOM_DATA_FILE}, while {SECOM_NAMES_FILE} states {claim} attributes. This crate uses the {data_file_numeric_column_count} numeric columns actually present in {SECOM_DATA_FILE} and reads labels plus timestamps separately from {SECOM_LABELS_FILE}."
134        ),
135        Some(claim) => format!(
136            "The distributed UCI archive parses as {data_file_numeric_column_count} numeric columns in {SECOM_DATA_FILE}, matching the {claim}-attribute claim in {SECOM_NAMES_FILE}. Labels and timestamps are read separately from {SECOM_LABELS_FILE}."
137        ),
138        None => format!(
139            "The distributed UCI archive parses as {data_file_numeric_column_count} numeric columns in {SECOM_DATA_FILE}. Labels and timestamps are read separately from {SECOM_LABELS_FILE}."
140        ),
141    };
142
143    Ok(SecomArchiveLayout {
144        data_file_numeric_column_count,
145        metadata_attribute_count_claim,
146        label_row_count: label_rows.len(),
147        label_file_includes_timestamp: !label_rows.is_empty(),
148        note,
149    })
150}
151
152pub fn load_from_paths(paths: &SecomDataPaths) -> Result<SecomDataset> {
153    let labels = read_labels(&paths.labels_file)?;
154    let data = read_data(&paths.data_file)?;
155
156    if labels.len() != data.len() {
157        return Err(DsfbSemiconductorError::DatasetFormat(format!(
158            "SECOM rows do not match labels: {} data rows vs {} labels",
159            data.len(),
160            labels.len()
161        )));
162    }
163
164    let feature_count = data.first().map(Vec::len).unwrap_or_default();
165    let feature_names = (1..=feature_count)
166        .map(|idx| format!("S{idx:03}"))
167        .collect::<Vec<_>>();
168
169    let runs = data
170        .into_iter()
171        .zip(labels.into_iter())
172        .enumerate()
173        .map(|(index, (features, (label, timestamp)))| SecomRun {
174            index,
175            label,
176            timestamp,
177            features,
178        })
179        .collect::<Vec<_>>();
180
181    Ok(SecomDataset {
182        feature_names,
183        runs,
184    })
185}
186
187fn unpack_archive(archive_path: &Path, output_dir: &Path) -> Result<()> {
188    let file = File::open(archive_path)?;
189    let mut archive = ZipArchive::new(file)?;
190    for index in 0..archive.len() {
191        let mut entry = archive.by_index(index)?;
192        let out_path = output_dir.join(entry.name());
193        let mut out_file = File::create(out_path)?;
194        let mut buffer = Vec::new();
195        entry.read_to_end(&mut buffer)?;
196        out_file.write_all(&buffer)?;
197    }
198    Ok(())
199}
200
201fn read_labels(path: &Path) -> Result<Vec<(i8, NaiveDateTime)>> {
202    let reader = BufReader::new(File::open(path)?);
203    let mut labels = Vec::new();
204
205    for line in reader.lines() {
206        let line = line?;
207        let trimmed = line.trim();
208        if trimmed.is_empty() {
209            continue;
210        }
211        let mut parts = trimmed.splitn(2, ' ');
212        let label = parts
213            .next()
214            .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing SECOM label".into()))?
215            .parse::<i8>()
216            .map_err(|err| DsfbSemiconductorError::DatasetFormat(err.to_string()))?;
217        let timestamp_raw = parts
218            .next()
219            .ok_or_else(|| {
220                DsfbSemiconductorError::DatasetFormat("missing SECOM label timestamp".into())
221            })?
222            .trim_matches('"');
223        let timestamp = NaiveDateTime::parse_from_str(timestamp_raw, "%d/%m/%Y %H:%M:%S")
224            .map_err(|err| DsfbSemiconductorError::DatasetFormat(err.to_string()))?;
225        labels.push((label, timestamp));
226    }
227
228    Ok(labels)
229}
230
231fn read_data(path: &Path) -> Result<Vec<Vec<Option<f64>>>> {
232    let reader = BufReader::new(File::open(path)?);
233    let mut rows = Vec::new();
234
235    for line in reader.lines() {
236        let line = line?;
237        let trimmed = line.trim();
238        if trimmed.is_empty() {
239            continue;
240        }
241        let row = trimmed
242            .split_whitespace()
243            .map(|token| {
244                if token.eq_ignore_ascii_case("nan") {
245                    Ok(None)
246                } else {
247                    token.parse::<f64>().map(Some).map_err(|err| {
248                        DsfbSemiconductorError::DatasetFormat(format!(
249                            "invalid SECOM value `{token}`: {err}"
250                        ))
251                    })
252                }
253            })
254            .collect::<Result<Vec<_>>>()?;
255        rows.push(row);
256    }
257
258    Ok(rows)
259}
260
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a fresh temporary directory holding one file `name` with
    /// `contents`. The directory guard is returned so the file stays on disk
    /// for the duration of the test.
    fn scratch_file(name: &str, contents: &str) -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(name);
        fs::write(&path, contents).unwrap();
        (dir, path)
    }

    #[test]
    fn labels_parse_from_uci_format() {
        let (_guard, path) = scratch_file(
            "labels.data",
            "-1 \"19/07/2008 11:55:00\"\n1 \"19/07/2008 13:17:00\"\n",
        );

        let parsed: Vec<i8> = read_labels(&path)
            .unwrap()
            .into_iter()
            .map(|(label, _timestamp)| label)
            .collect();

        assert_eq!(parsed, vec![-1, 1]);
    }

    #[test]
    fn data_parser_keeps_nan_as_missing() {
        let (_guard, path) = scratch_file("secom.data", "1.0 NaN 2.5\n");

        let rows = read_data(&path).unwrap();

        assert_eq!(rows[0], vec![Some(1.0), None, Some(2.5)]);
    }

    #[test]
    fn archive_layout_reports_mismatch_when_names_claim_exceeds_numeric_columns() {
        // Three numeric columns on disk while the names file claims four.
        let temp = tempfile::tempdir().unwrap();
        let paths = dataset_paths(temp.path());
        fs::create_dir_all(&paths.root).unwrap();
        fs::write(&paths.data_file, "1.0 2.0 3.0\n4.0 5.0 6.0\n").unwrap();
        fs::write(
            &paths.labels_file,
            "-1 \"01/01/2008 00:00:00\"\n1 \"01/01/2008 01:00:00\"\n",
        )
        .unwrap();
        fs::write(
            &paths.names_file,
            "Number of Attributes: 4\nData Structure: 2 examples each with 4 features\n",
        )
        .unwrap();

        let layout = inspect_archive_layout(&paths).unwrap();

        assert_eq!(layout.data_file_numeric_column_count, 3);
        assert_eq!(layout.metadata_attribute_count_claim, Some(4));
        assert_eq!(layout.label_row_count, 2);
        assert!(layout.label_file_includes_timestamp);
        assert!(layout
            .note
            .contains("3 whitespace-delimited numeric columns"));
    }
}
323}