dsfb-semiconductor 0.1.1

Deterministic DSFB semiconductor benchmark companion for SECOM and PHM-style dataset adapters
Documentation
use crate::error::{DsfbSemiconductorError, Result};
use chrono::NaiveDateTime;
use serde::Serialize;
use std::fs::{self, File};
use std::io::{BufRead, BufReader, Read, Write};
use std::path::{Path, PathBuf};
use zip::ZipArchive;

/// URL that [`fetch_if_missing`] downloads the SECOM archive from.
pub const SECOM_UCI_URL: &str = "https://archive.ics.uci.edu/static/public/179/secom.zip";
/// File name under which the downloaded archive is stored in the dataset directory.
pub const SECOM_ARCHIVE_NAME: &str = "secom.zip";
/// File name of the whitespace-delimited feature matrix.
pub const SECOM_DATA_FILE: &str = "secom.data";
/// File name of the per-run label and timestamp file.
pub const SECOM_LABELS_FILE: &str = "secom_labels.data";
/// File name of the free-text dataset description.
pub const SECOM_NAMES_FILE: &str = "secom.names";

/// Summary of how the on-disk SECOM files are laid out, as produced by
/// [`inspect_archive_layout`].
#[derive(Debug, Clone, Serialize)]
pub struct SecomArchiveLayout {
    /// Number of whitespace-delimited tokens on the first non-blank line of the data file
    /// (0 when the file has no non-blank line).
    pub data_file_numeric_column_count: usize,
    /// Attribute count parsed from the names file, or `None` when no recognised
    /// statement was found there.
    pub metadata_attribute_count_claim: Option<usize>,
    /// Number of rows successfully parsed from the labels file.
    pub label_row_count: usize,
    /// `true` when at least one label row was parsed; every parsed row carries a timestamp.
    pub label_file_includes_timestamp: bool,
    /// Human-readable description comparing the observed column count with the claimed one.
    pub note: String,
}

/// One row of the SECOM dataset: the feature vector joined with its label and timestamp.
#[derive(Debug, Clone, Serialize)]
pub struct SecomRun {
    /// Zero-based position of the row in file order.
    pub index: usize,
    /// Integer label read from the labels file.
    pub label: i8,
    /// Timestamp read from the labels file (`%d/%m/%Y %H:%M:%S`).
    pub timestamp: NaiveDateTime,
    /// Feature values in column order; `None` marks a `NaN` token in the data file.
    pub features: Vec<Option<f64>>,
}

/// The fully loaded SECOM dataset.
#[derive(Debug, Clone, Serialize)]
pub struct SecomDataset {
    /// Generated column names (`S001`, `S002`, ...), one per feature column.
    pub feature_names: Vec<String>,
    /// All runs in file order.
    pub runs: Vec<SecomRun>,
}

/// Filesystem locations of the SECOM files, as computed by [`dataset_paths`].
#[derive(Debug, Clone, Serialize)]
pub struct SecomDataPaths {
    /// Directory that holds every SECOM file (`<data_root>/secom`).
    pub root: PathBuf,
    /// Location of the downloaded zip archive.
    pub archive: PathBuf,
    /// Location of the feature matrix file.
    pub data_file: PathBuf,
    /// Location of the labels/timestamps file.
    pub labels_file: PathBuf,
    /// Location of the dataset description file.
    pub names_file: PathBuf,
}

/// Computes where each SECOM file lives beneath `data_root`.
///
/// Every path is placed inside the `secom` subdirectory of `data_root`. No filesystem
/// access is performed; the paths may or may not exist.
pub fn dataset_paths(data_root: &Path) -> SecomDataPaths {
    let root = data_root.join("secom");
    let inside_root = |file_name: &str| root.join(file_name);

    let archive = inside_root(SECOM_ARCHIVE_NAME);
    let data_file = inside_root(SECOM_DATA_FILE);
    let labels_file = inside_root(SECOM_LABELS_FILE);
    let names_file = inside_root(SECOM_NAMES_FILE);

    SecomDataPaths {
        root,
        archive,
        data_file,
        labels_file,
        names_file,
    }
}

pub fn fetch_if_missing(data_root: &Path) -> Result<SecomDataPaths> {
    let paths = dataset_paths(data_root);
    if paths.data_file.exists() && paths.labels_file.exists() && paths.names_file.exists() {
        return Ok(paths);
    }

    fs::create_dir_all(&paths.root)?;

    if !paths.archive.exists() {
        let response = ureq::get(SECOM_UCI_URL)
            .call()
            .map_err(|err| DsfbSemiconductorError::Network(err.to_string()))?;
        let mut reader = response.into_reader();
        let mut file = File::create(&paths.archive)?;
        std::io::copy(&mut reader, &mut file)?;
    }

    unpack_archive(&paths.archive, &paths.root)?;
    Ok(paths)
}

/// Verifies that the three SECOM files exist under `data_root` without touching the network.
///
/// # Errors
///
/// Returns `DatasetMissing` (naming the SECOM directory) when any of the data, labels,
/// or names files is absent.
pub fn ensure_present(data_root: &Path) -> Result<SecomDataPaths> {
    let paths = dataset_paths(data_root);

    let required = [&paths.data_file, &paths.labels_file, &paths.names_file];
    let all_present = required.iter().all(|file| file.exists());

    if !all_present {
        return Err(DsfbSemiconductorError::DatasetMissing {
            dataset: "SECOM",
            path: paths.root,
        });
    }
    Ok(paths)
}

/// Loads the SECOM dataset stored under `data_root`.
///
/// Fails with the error from [`ensure_present`] when the files are missing, otherwise
/// delegates to [`load_from_paths`].
pub fn load_from_root(data_root: &Path) -> Result<SecomDataset> {
    ensure_present(data_root).and_then(|paths| load_from_paths(&paths))
}

/// Inspects the SECOM files on disk and reports how their layout compares with the
/// attribute count stated in the names file.
///
/// The column count is taken from the first non-blank line of the data file. The claimed
/// attribute count is looked up in the names file, first as a `Number of Attributes:` line
/// and otherwise from the "consisting of 1567 examples each with N" sentence.
///
/// # Errors
///
/// Propagates I/O errors from reading any of the three files and format errors from
/// parsing the labels file.
pub fn inspect_archive_layout(paths: &SecomDataPaths) -> Result<SecomArchiveLayout> {
    // Width of the first non-blank data line; 0 when the file has none.
    let mut data_lines = BufReader::new(File::open(&paths.data_file)?).lines();
    let data_file_numeric_column_count = loop {
        match data_lines.next().transpose()? {
            Some(text) => {
                let width = text.split_whitespace().count();
                if width > 0 {
                    break width;
                }
            }
            None => break 0,
        }
    };

    let label_rows = read_labels(&paths.labels_file)?;
    let names_text = fs::read_to_string(&paths.names_file)?;

    // Preferred source: an explicit "Number of Attributes: N" line.
    let explicit_claim = |line: &str| -> Option<usize> {
        let lowered = line.trim().to_ascii_lowercase();
        let value = lowered.strip_prefix("number of attributes:")?;
        value.trim().parse::<usize>().ok()
    };
    // Fallback source: the descriptive sentence found in the distributed names file.
    let sentence_claim = |line: &str| -> Option<usize> {
        let lowered = line.to_ascii_lowercase();
        let (_, rest) = lowered.split_once("consisting of 1567 examples each with ")?;
        rest.split_whitespace().next()?.parse::<usize>().ok()
    };
    let metadata_attribute_count_claim = names_text
        .lines()
        .find_map(explicit_claim)
        .or_else(|| names_text.lines().find_map(sentence_claim));

    let note = match metadata_attribute_count_claim {
        None => format!(
            "The distributed UCI archive parses as {data_file_numeric_column_count} numeric columns in {SECOM_DATA_FILE}. Labels and timestamps are read separately from {SECOM_LABELS_FILE}."
        ),
        Some(claim) if claim == data_file_numeric_column_count => format!(
            "The distributed UCI archive parses as {data_file_numeric_column_count} numeric columns in {SECOM_DATA_FILE}, matching the {claim}-attribute claim in {SECOM_NAMES_FILE}. Labels and timestamps are read separately from {SECOM_LABELS_FILE}."
        ),
        Some(claim) => format!(
            "The distributed UCI archive currently parses as {data_file_numeric_column_count} whitespace-delimited numeric columns in {SECOM_DATA_FILE}, while {SECOM_NAMES_FILE} states {claim} attributes. This crate uses the {data_file_numeric_column_count} numeric columns actually present in {SECOM_DATA_FILE} and reads labels plus timestamps separately from {SECOM_LABELS_FILE}."
        ),
    };

    let label_row_count = label_rows.len();
    Ok(SecomArchiveLayout {
        data_file_numeric_column_count,
        metadata_attribute_count_claim,
        label_row_count,
        label_file_includes_timestamp: label_row_count != 0,
        note,
    })
}

/// Loads the SECOM dataset from explicit file locations.
///
/// Rows of the data file are paired, in order, with rows of the labels file. Feature
/// names are generated as `S001`, `S002`, ... from the width of the data rows.
///
/// # Errors
///
/// Returns `DatasetFormat` when the number of data rows differs from the number of
/// labels, or when a data row does not have the same number of values as the first row.
/// I/O and parse errors from the underlying readers are propagated.
pub fn load_from_paths(paths: &SecomDataPaths) -> Result<SecomDataset> {
    let labels = read_labels(&paths.labels_file)?;
    let data = read_data(&paths.data_file)?;

    if labels.len() != data.len() {
        return Err(DsfbSemiconductorError::DatasetFormat(format!(
            "SECOM rows do not match labels: {} data rows vs {} labels",
            data.len(),
            labels.len()
        )));
    }

    let feature_count = data.first().map(Vec::len).unwrap_or_default();

    // `feature_names` is derived from the first row, so every row must share its width;
    // otherwise names and feature vectors would silently disagree downstream.
    if let Some((row_index, row)) = data
        .iter()
        .enumerate()
        .find(|(_, row)| row.len() != feature_count)
    {
        return Err(DsfbSemiconductorError::DatasetFormat(format!(
            "SECOM data row {} has {} values, expected {}",
            row_index + 1,
            row.len(),
            feature_count
        )));
    }

    let feature_names = (1..=feature_count)
        .map(|idx| format!("S{idx:03}"))
        .collect::<Vec<_>>();

    let runs = data
        .into_iter()
        .zip(labels)
        .enumerate()
        .map(|(index, (features, (label, timestamp)))| SecomRun {
            index,
            label,
            timestamp,
            features,
        })
        .collect::<Vec<_>>();

    Ok(SecomDataset {
        feature_names,
        runs,
    })
}

fn unpack_archive(archive_path: &Path, output_dir: &Path) -> Result<()> {
    let file = File::open(archive_path)?;
    let mut archive = ZipArchive::new(file)?;
    for index in 0..archive.len() {
        let mut entry = archive.by_index(index)?;
        let out_path = output_dir.join(entry.name());
        let mut out_file = File::create(out_path)?;
        let mut buffer = Vec::new();
        entry.read_to_end(&mut buffer)?;
        out_file.write_all(&buffer)?;
    }
    Ok(())
}

/// Parses the SECOM labels file into `(label, timestamp)` pairs, one per non-blank line.
///
/// Each line holds an integer label followed by a timestamp in `%d/%m/%Y %H:%M:%S`
/// form, optionally wrapped in double quotes. The two fields may be separated by any
/// run of whitespace (a single space, several spaces, or a tab).
///
/// # Errors
///
/// Returns `DatasetFormat` when a line lacks a timestamp or when the label or timestamp
/// cannot be parsed. I/O errors are propagated.
fn read_labels(path: &Path) -> Result<Vec<(i8, NaiveDateTime)>> {
    let reader = BufReader::new(File::open(path)?);
    let mut labels = Vec::new();

    for line in reader.lines() {
        let line = line?;
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        // Split at the first whitespace character of any kind; the timestamp itself
        // contains a space, so only the first boundary separates the two fields.
        let mut parts = trimmed.splitn(2, char::is_whitespace);
        let label = parts
            .next()
            .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing SECOM label".into()))?
            .parse::<i8>()
            .map_err(|err| DsfbSemiconductorError::DatasetFormat(err.to_string()))?;
        let timestamp_raw = parts
            .next()
            .ok_or_else(|| {
                DsfbSemiconductorError::DatasetFormat("missing SECOM label timestamp".into())
            })?
            // Drop any extra separator whitespace before stripping the quotes.
            .trim()
            .trim_matches('"');
        let timestamp = NaiveDateTime::parse_from_str(timestamp_raw, "%d/%m/%Y %H:%M:%S")
            .map_err(|err| DsfbSemiconductorError::DatasetFormat(err.to_string()))?;
        labels.push((label, timestamp));
    }

    Ok(labels)
}

/// Parses the SECOM data file into rows of optional floating-point values.
///
/// Blank lines are skipped. Within a line, values are whitespace-delimited; a token equal
/// to `nan` (compared case-insensitively) becomes `None`, and every other token must
/// parse as `f64`.
///
/// # Errors
///
/// Returns `DatasetFormat` naming the offending token when a value cannot be parsed.
/// I/O errors are propagated.
fn read_data(path: &Path) -> Result<Vec<Vec<Option<f64>>>> {
    let source = File::open(path)?;
    let mut rows = Vec::new();

    for line in BufReader::new(source).lines() {
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }

        let mut row = Vec::new();
        for token in line.split_whitespace() {
            let value = if token.eq_ignore_ascii_case("nan") {
                None
            } else {
                match token.parse::<f64>() {
                    Ok(number) => Some(number),
                    Err(err) => {
                        return Err(DsfbSemiconductorError::DatasetFormat(format!(
                            "invalid SECOM value `{token}`: {err}"
                        )));
                    }
                }
            };
            row.push(value);
        }
        rows.push(row);
    }

    Ok(rows)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Two quoted-timestamp rows in the UCI layout yield two labels in file order.
    #[test]
    fn labels_parse_from_uci_format() {
        let scratch = tempfile::tempdir().unwrap();
        let labels_path = scratch.path().join("labels.data");
        let contents = "-1 \"19/07/2008 11:55:00\"\n1 \"19/07/2008 13:17:00\"\n";
        fs::write(&labels_path, contents).unwrap();

        let parsed = read_labels(&labels_path).unwrap();
        let values: Vec<i8> = parsed.iter().map(|(label, _)| *label).collect();

        assert_eq!(values, vec![-1, 1]);
    }

    /// A `NaN` token is surfaced as a missing value rather than a float.
    #[test]
    fn data_parser_keeps_nan_as_missing() {
        let scratch = tempfile::tempdir().unwrap();
        let data_path = scratch.path().join("secom.data");
        fs::write(&data_path, "1.0 NaN 2.5\n").unwrap();

        let rows = read_data(&data_path).unwrap();

        assert_eq!(rows.first(), Some(&vec![Some(1.0), None, Some(2.5)]));
    }

    /// When the names file claims more attributes than the data file holds, the layout
    /// report records both numbers and the note describes the mismatch.
    #[test]
    fn archive_layout_reports_mismatch_when_names_claim_exceeds_numeric_columns() {
        let scratch = tempfile::tempdir().unwrap();
        let paths = dataset_paths(scratch.path());
        fs::create_dir_all(&paths.root).unwrap();

        fs::write(&paths.data_file, "1.0 2.0 3.0\n4.0 5.0 6.0\n").unwrap();
        fs::write(
            &paths.labels_file,
            "-1 \"01/01/2008 00:00:00\"\n1 \"01/01/2008 01:00:00\"\n",
        )
        .unwrap();
        fs::write(
            &paths.names_file,
            "Number of Attributes: 4\nData Structure: 2 examples each with 4 features\n",
        )
        .unwrap();

        let layout = inspect_archive_layout(&paths).unwrap();

        assert_eq!(layout.data_file_numeric_column_count, 3);
        assert_eq!(layout.metadata_attribute_count_claim, Some(4));
        assert_eq!(layout.label_row_count, 2);
        assert!(layout.label_file_includes_timestamp);
        let expected_fragment = "3 whitespace-delimited numeric columns";
        assert!(layout.note.contains(expected_fragment));
    }
}