dsfb-semiconductor 0.1.1

use crate::baselines::compute_baselines;
use crate::config::PipelineConfig;
use crate::dataset::phm2018::support_status;
use crate::error::{DsfbSemiconductorError, Result};
use crate::grammar::evaluate_grammar;
use crate::heuristics::{
    expanded_semantic_policy_definitions, heuristic_policy_definition, HeuristicAlertClass,
};
use crate::nominal::build_nominal_model;
use crate::output_paths::{compile_pdf, create_timestamped_run_dir, zip_directory};
use crate::plots::generate_phm2018_figures;
use crate::precursor::evaluate_dsa;
use crate::preprocessing::{DatasetSummary, PreparedDataset};
use crate::residual::compute_residuals;
use crate::semiotics::{build_semantic_layer, classify_motifs};
use crate::signs::compute_signs;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use std::fs::{self, File};
use std::io::{BufRead, BufReader};
use std::path::{Path, PathBuf};

const PHM_SENSOR_COLUMN_START: usize = 7;
const PHM_MAX_AGGREGATED_POINTS: usize = 4096;
const PHM_SELECTED_DSA_WINDOW: usize = 8;
const PHM_SELECTED_DSA_PERSISTENCE: usize = 2;
const PHM_SELECTED_DSA_TAU: f64 = 1.5;
const PHM_SELECTED_DSA_M: usize = 1;
const PHM_THRESHOLD_BASELINE: &str = "run_energy_scalar_threshold";

#[derive(Debug, Clone, Serialize)]
pub struct Phm2018RunArtifacts {
    pub run_dir: PathBuf,
    pub lead_time_metrics_path: PathBuf,
    pub early_warning_stats_path: PathBuf,
    pub structural_metrics_path: PathBuf,
    pub claim_alignment_report_path: PathBuf,
    pub manifest_path: PathBuf,
    pub tex_report_path: PathBuf,
    pub pdf_path: Option<PathBuf>,
    pub zip_path: PathBuf,
}

#[derive(Debug, Clone, Serialize)]
pub struct Phm2018LeadTimeRow {
    pub run_id: String,
    pub dsfb_detection_time: Option<i64>,
    pub threshold_detection_time: Option<i64>,
    pub lead_time_delta: Option<i64>,
}

#[derive(Debug, Clone, Serialize)]
pub struct Phm2018EarlyWarningStats {
    pub threshold_baseline: String,
    pub total_runs: usize,
    pub comparable_runs: usize,
    pub mean_lead_delta: Option<f64>,
    pub median_lead_delta: Option<f64>,
    pub percent_runs_dsfb_earlier: f64,
    pub percent_runs_equal: f64,
    pub percent_runs_later: f64,
}

#[derive(Debug, Clone, Serialize)]
pub struct Phm2018StructuralMetrics {
    pub threshold_baseline: String,
    pub total_runs: usize,
    pub runs_with_structured_emergence: usize,
    pub comparable_structure_runs: usize,
    pub runs_with_structure_before_threshold: usize,
    pub percent_structure_before_threshold: f64,
    pub mean_structure_minus_threshold_delta: Option<f64>,
    pub median_structure_minus_threshold_delta: Option<f64>,
}

#[derive(Debug, Clone, Serialize)]
pub struct ClaimAlignmentReport {
    pub secom_supports: Vec<String>,
    pub secom_does_not_support: Vec<String>,
    pub phm2018_supports: Vec<String>,
    pub claims_not_made: Vec<String>,
}

#[derive(Debug, Clone, Serialize)]
struct Phm2018ArtifactManifest {
    dataset: String,
    run_dir: String,
    lead_time_metrics_path: String,
    early_warning_stats_path: String,
    structural_metrics_path: String,
    support_status_path: String,
    claim_alignment_report_path: String,
    zip_path: String,
}

#[derive(Debug, Clone, Serialize)]
pub struct Phm2018RunDetail {
    pub run_id: String,
    pub fault_time: i64,
    pub fault_index: usize,
    pub healthy_prefix_count: usize,
    pub evaluation_start_run_index: usize,
    pub dsfb_detection_run_index: Option<usize>,
    pub threshold_detection_run_index: Option<usize>,
    pub earliest_semantic_run_index: Option<usize>,
    pub earliest_structured_run_index: Option<usize>,
    pub dsfb_detection_time: Option<i64>,
    pub threshold_detection_time: Option<i64>,
    pub earliest_semantic_time: Option<i64>,
    pub earliest_structured_time: Option<i64>,
    pub lead_time_delta: Option<i64>,
    pub structure_minus_threshold_delta: Option<i64>,
}

#[derive(Debug, Clone)]
struct Phm2018RunSpec {
    run_id: String,
    sensor_path: PathBuf,
    fault_time: i64,
}

#[derive(Debug, Clone)]
struct Phm2018RunSeries {
    run_id: String,
    timestamps_raw: Vec<i64>,
    feature_names: Vec<String>,
    raw_values: Vec<Vec<Option<f64>>>,
    fault_time: i64,
    fault_index: usize,
    healthy_prefix_count: usize,
}

pub fn run_phm2018_benchmark(
    data_root: &Path,
    output_root: &Path,
    secom_run_dir: Option<&Path>,
) -> Result<Phm2018RunArtifacts> {
    let status = support_status(data_root);
    if !status.extracted_dataset_detected {
        return Err(DsfbSemiconductorError::DatasetMissing {
            dataset: "PHM 2018 ion mill etch",
            path: status.extracted_dataset_path,
        });
    }

    let run_specs = load_phm2018_train_run_specs(&status.extracted_dataset_path)?;
    if run_specs.is_empty() {
        return Err(DsfbSemiconductorError::DatasetFormat(
            "PHM 2018 extracted tree contains no train runs".into(),
        ));
    }

    fs::create_dir_all(output_root)?;
    let run_dir = create_timestamped_run_dir(output_root, "phm2018")?;
    let mut lead_time_rows = Vec::new();
    let mut run_details = Vec::new();

    for run_spec in &run_specs {
        let run = load_phm2018_train_run_series(run_spec)?;
        let config = phm_pipeline_config(run.healthy_prefix_count, run.fault_index);
        let prepared = run.as_prepared_dataset();
        let nominal = build_nominal_model(&prepared, &config);
        let residuals = compute_residuals(&prepared, &nominal);
        let signs = compute_signs(&prepared, &nominal, &residuals, &config);
        let baselines = compute_baselines(&prepared, &nominal, &residuals, &config);
        let grammar = evaluate_grammar(&residuals, &signs, &nominal, &config);
        let motifs = classify_motifs(
            &prepared,
            &nominal,
            &residuals,
            &signs,
            &grammar,
            config.pre_failure_lookback_runs,
        );
        let semantic_layer = build_semantic_layer(
            &prepared,
            &residuals,
            &signs,
            &grammar,
            &motifs,
            &nominal,
            config.pre_failure_lookback_runs,
        );
        let dsa = evaluate_dsa(
            &prepared,
            &nominal,
            &residuals,
            &signs,
            &baselines,
            &grammar,
            &config.dsa,
            config.pre_failure_lookback_runs,
        )?;

        let evaluation_start_run_index = run.healthy_prefix_count.min(run.fault_index);
        let dsfb_detection_run_index = (evaluation_start_run_index..run.fault_index)
            .find(|&run_index| dsa.run_signals.primary_run_alert[run_index]);
        let threshold_detection_run_index = (evaluation_start_run_index..run.fault_index)
            .find(|&run_index| baselines.run_energy.alarm[run_index]);
        let dsfb_detection_time =
            dsfb_detection_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let threshold_detection_time =
            threshold_detection_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let earliest_semantic_run_index = semantic_layer
            .ranked_candidates
            .iter()
            .filter(|row| {
                row.run_index >= evaluation_start_run_index && row.run_index < run.fault_index
            })
            .filter(|row| {
                !matches!(
                    heuristic_alert_default(row.heuristic_name.as_str()),
                    HeuristicAlertClass::Silent
                )
            })
            .map(|row| row.run_index)
            .min();
        let earliest_structured_run_index = earliest_semantic_run_index.or_else(|| {
            motifs
                .traces
                .iter()
                .flat_map(|trace| trace.labels.iter().enumerate())
                .filter(|(run_index, label)| {
                    *run_index >= evaluation_start_run_index
                        && *run_index < run.fault_index
                        && !matches!(label, crate::semiotics::DsfbMotifClass::StableAdmissible)
                })
                .map(|(run_index, _)| run_index)
                .min()
        });
        let earliest_semantic_time =
            earliest_semantic_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let earliest_structured_time =
            earliest_structured_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let lead_time_delta = match (dsfb_detection_time, threshold_detection_time) {
            (Some(dsfb), Some(threshold)) => Some(threshold - dsfb),
            _ => None,
        };
        let structure_minus_threshold_delta =
            match (earliest_structured_time, threshold_detection_time) {
                (Some(structure), Some(threshold)) => Some(threshold - structure),
                _ => None,
            };

        lead_time_rows.push(Phm2018LeadTimeRow {
            run_id: run.run_id.clone(),
            dsfb_detection_time,
            threshold_detection_time,
            lead_time_delta,
        });
        run_details.push(Phm2018RunDetail {
            run_id: run.run_id.clone(),
            fault_time: run.fault_time,
            fault_index: run.fault_index,
            healthy_prefix_count: run.healthy_prefix_count,
            evaluation_start_run_index,
            dsfb_detection_run_index,
            threshold_detection_run_index,
            earliest_semantic_run_index,
            earliest_structured_run_index,
            dsfb_detection_time,
            threshold_detection_time,
            earliest_semantic_time,
            earliest_structured_time,
            lead_time_delta,
            structure_minus_threshold_delta,
        });
    }

    let early_warning_stats = summarize_phm_lead_times(&lead_time_rows);
    let structural_metrics = summarize_phm_structural_metrics(&run_details);
    let secom_run_dir = resolve_secom_run_dir(secom_run_dir, output_root)?;
    let claim_alignment_report =
        build_claim_alignment_report(&secom_run_dir, &early_warning_stats, &structural_metrics)?;

    let lead_time_metrics_path = run_dir.join("phm2018_lead_time_metrics.csv");
    let early_warning_stats_path = run_dir.join("phm2018_early_warning_stats.json");
    let structural_metrics_path = run_dir.join("phm2018_structural_metrics.json");
    let claim_alignment_report_path = run_dir.join("claim_alignment_report.json");
    let manifest_path = run_dir.join("artifact_manifest.json");
    let zip_path = run_dir.join("run_bundle.zip");

    write_serialized_csv(&lead_time_metrics_path, &lead_time_rows)?;
    write_json_pretty(&early_warning_stats_path, &early_warning_stats)?;
    write_json_pretty(&structural_metrics_path, &structural_metrics)?;
    write_json_pretty(&run_dir.join("phm2018_support_status.json"), &status)?;
    write_json_pretty(&run_dir.join("phm2018_run_details.json"), &run_details)?;
    write_json_pretty(&claim_alignment_report_path, &claim_alignment_report)?;

    // Engineering report (tex + pdf)
    let figure_files = match generate_phm2018_figures(
        &run_dir,
        &run_details,
        &early_warning_stats,
        &structural_metrics,
    ) {
        Ok(files) => files,
        Err(e) => {
            eprintln!("[phm2018] Figure generation warning: {e}");
            vec![]
        }
    };
    let tex_report_path = run_dir.join("engineering_report.tex");
    fs::write(
        &tex_report_path,
        phm2018_tex_report(
            &early_warning_stats,
            &structural_metrics,
            &claim_alignment_report,
            run_details.len(),
            &figure_files,
        ),
    )?;
    let (pdf_path, pdf_error) = compile_pdf(&tex_report_path, &run_dir);
    if let Some(ref err) = pdf_error {
        eprintln!("[phm2018] PDF compile warning: {}", err.lines().next().unwrap_or("unknown"));
    }

    write_json_pretty(
        &manifest_path,
        &Phm2018ArtifactManifest {
            dataset: "PHM2018".into(),
            run_dir: run_dir.display().to_string(),
            lead_time_metrics_path: lead_time_metrics_path.display().to_string(),
            early_warning_stats_path: early_warning_stats_path.display().to_string(),
            structural_metrics_path: structural_metrics_path.display().to_string(),
            support_status_path: run_dir
                .join("phm2018_support_status.json")
                .display()
                .to_string(),
            claim_alignment_report_path: claim_alignment_report_path.display().to_string(),
            zip_path: zip_path.display().to_string(),
        },
    )?;
    // Zip after all files (including tex/pdf) are written.
    zip_directory(&run_dir, &zip_path)?;

    Ok(Phm2018RunArtifacts {
        run_dir,
        lead_time_metrics_path,
        early_warning_stats_path,
        structural_metrics_path,
        claim_alignment_report_path,
        manifest_path,
        tex_report_path,
        pdf_path,
        zip_path,
    })
}

fn load_phm2018_train_run_specs(extracted_root: &Path) -> Result<Vec<Phm2018RunSpec>> {
    let train_dir = extracted_root.join("train");
    let fault_dir = train_dir.join("train_faults");
    let ttf_dir = train_dir.join("train_ttf");

    let sensor_files = fs::read_dir(&train_dir)?
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("csv"))
        .collect::<Vec<_>>();

    let fault_times = load_fault_times(&fault_dir)?;
    let ttf_fallbacks = load_ttf_zero_times(&ttf_dir)?;
    let mut run_specs = Vec::new();

    for sensor_path in sensor_files {
        let run_id = run_id_from_sensor_path(&sensor_path)?;
        let fault_time = fault_times
            .get(&run_id)
            .copied()
            .or_else(|| ttf_fallbacks.get(&run_id).copied())
            .ok_or_else(|| {
                DsfbSemiconductorError::DatasetFormat(format!(
                    "missing fault target for PHM train run {run_id}"
                ))
            })?;
        run_specs.push(Phm2018RunSpec {
            run_id,
            fault_time,
            sensor_path,
        });
    }

    run_specs.sort_by(|left, right| left.run_id.cmp(&right.run_id));
    Ok(run_specs)
}

fn load_phm2018_train_run_series(run_spec: &Phm2018RunSpec) -> Result<Phm2018RunSeries> {
    let (timestamps_raw, feature_names, raw_values) =
        load_sensor_csv_aggregated(&run_spec.sensor_path)?;
    if timestamps_raw.is_empty() || feature_names.is_empty() || raw_values.is_empty() {
        return Err(DsfbSemiconductorError::DatasetFormat(format!(
            "empty PHM train run {} at {}",
            run_spec.run_id,
            run_spec.sensor_path.display()
        )));
    }
    let fault_index = timestamps_raw
        .iter()
        .position(|time| *time >= run_spec.fault_time)
        .unwrap_or_else(|| timestamps_raw.len().saturating_sub(1));
    let healthy_prefix_count = healthy_prefix_count(fault_index, timestamps_raw.len());

    Ok(Phm2018RunSeries {
        run_id: run_spec.run_id.clone(),
        timestamps_raw,
        feature_names,
        raw_values,
        fault_time: run_spec.fault_time,
        fault_index,
        healthy_prefix_count,
    })
}

impl Phm2018RunSeries {
    fn as_prepared_dataset(&self) -> PreparedDataset {
        let run_count = self.raw_values.len();
        let feature_count = self.feature_names.len();
        let total_cells = run_count * feature_count;
        let missing_cells = self
            .raw_values
            .iter()
            .flat_map(|row| row.iter())
            .filter(|value| value.is_none())
            .count();
        let mut per_feature_missing_fraction = vec![0.0; feature_count];
        for feature_index in 0..feature_count {
            let missing = self
                .raw_values
                .iter()
                .filter(|row| row[feature_index].is_none())
                .count();
            per_feature_missing_fraction[feature_index] = if run_count == 0 {
                0.0
            } else {
                missing as f64 / run_count as f64
            };
        }

        let mut labels = vec![-1; run_count];
        if self.fault_index < labels.len() {
            labels[self.fault_index] = 1;
        }
        let timestamps = self
            .timestamps_raw
            .iter()
            .enumerate()
            .map(|(index, value)| {
                DateTime::<Utc>::from_timestamp(*value, 0)
                    .map(|value| value.naive_utc())
                    .unwrap_or_else(|| {
                        DateTime::<Utc>::from_timestamp(index as i64, 0)
                            .expect("valid synthetic timestamp")
                            .naive_utc()
                    })
            })
            .collect::<Vec<_>>();
        let healthy_pass_indices = (0..self.healthy_prefix_count).collect::<Vec<_>>();

        PreparedDataset {
            feature_names: self.feature_names.clone(),
            labels,
            timestamps,
            raw_values: self.raw_values.clone(),
            healthy_pass_indices,
            per_feature_missing_fraction,
            summary: DatasetSummary {
                run_count,
                feature_count,
                pass_count: run_count.saturating_sub(1),
                fail_count: 1,
                dataset_missing_fraction: if total_cells == 0 {
                    0.0
                } else {
                    missing_cells as f64 / total_cells as f64
                },
                healthy_pass_runs_requested: self.healthy_prefix_count,
                healthy_pass_runs_found: self.healthy_prefix_count,
            },
        }
    }
}

fn phm_pipeline_config(healthy_prefix_count: usize, fault_index: usize) -> PipelineConfig {
    PipelineConfig {
        healthy_pass_runs: healthy_prefix_count.max(2),
        pre_failure_lookback_runs: fault_index.max(1),
        dsa: crate::precursor::DsaConfig {
            window: PHM_SELECTED_DSA_WINDOW,
            persistence_runs: PHM_SELECTED_DSA_PERSISTENCE,
            alert_tau: PHM_SELECTED_DSA_TAU,
            corroborating_feature_count_min: PHM_SELECTED_DSA_M,
        },
        ..PipelineConfig::default()
    }
}

fn healthy_prefix_count(fault_index: usize, run_len: usize) -> usize {
    let proportional = (fault_index as f64 * 0.10).round() as usize;
    proportional
        .clamp(25, 200)
        .min(fault_index.max(2))
        .min(run_len)
}

fn run_id_from_sensor_path(path: &Path) -> Result<String> {
    let stem = path
        .file_stem()
        .and_then(|stem| stem.to_str())
        .ok_or_else(|| {
            DsfbSemiconductorError::DatasetFormat("invalid PHM sensor filename".into())
        })?;
    let mut parts = stem.split('_');
    let lot = parts
        .next()
        .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM lot id".into()))?;
    let tool = parts
        .next()
        .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM tool id".into()))?;
    Ok(format!("{lot}_{tool}"))
}

fn load_sensor_csv_aggregated(
    path: &Path,
) -> Result<(Vec<i64>, Vec<String>, Vec<Vec<Option<f64>>>)> {
    let total_rows = estimate_csv_data_rows(path)?;
    let bucket_size = (total_rows / PHM_MAX_AGGREGATED_POINTS).max(1);

    let mut reader = csv::ReaderBuilder::new().from_path(path)?;
    let header = reader
        .headers()?
        .iter()
        .skip(PHM_SENSOR_COLUMN_START)
        .map(|name| name.to_string())
        .collect::<Vec<_>>();
    let feature_count = header.len();

    let mut timestamps = Vec::new();
    let mut raw_values = Vec::new();
    let mut bucket_time_sum = 0f64;
    let mut bucket_time_count = 0usize;
    let mut bucket_row_count = 0usize;
    let mut bucket_sums = vec![0.0; feature_count];
    let mut bucket_counts = vec![0usize; feature_count];

    for record in reader.records() {
        let record = record?;
        let timestamp = record
            .get(0)
            .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM time field".into()))?
            .parse::<i64>()
            .map_err(|err| {
                DsfbSemiconductorError::DatasetFormat(format!(
                    "invalid PHM time value in {}: {err}",
                    path.display()
                ))
            })?;
        bucket_time_sum += timestamp as f64;
        bucket_time_count += 1;
        bucket_row_count += 1;

        for (feature_index, value) in record.iter().skip(PHM_SENSOR_COLUMN_START).enumerate() {
            if let Ok(parsed) = value.parse::<f64>() {
                bucket_sums[feature_index] += parsed;
                bucket_counts[feature_index] += 1;
            }
        }

        if bucket_row_count >= bucket_size {
            finalize_aggregate_bucket(
                &mut timestamps,
                &mut raw_values,
                &mut bucket_time_sum,
                &mut bucket_time_count,
                &mut bucket_row_count,
                &mut bucket_sums,
                &mut bucket_counts,
            );
        }
    }

    finalize_aggregate_bucket(
        &mut timestamps,
        &mut raw_values,
        &mut bucket_time_sum,
        &mut bucket_time_count,
        &mut bucket_row_count,
        &mut bucket_sums,
        &mut bucket_counts,
    );

    Ok((timestamps, header, raw_values))
}

fn estimate_csv_data_rows(path: &Path) -> Result<usize> {
    let file = File::open(path)?;
    let total_bytes = file.metadata()?.len() as f64;
    let mut reader = BufReader::new(file);
    let mut sampled_lines = 0usize;
    let mut sampled_bytes = 0usize;
    let mut buffer = String::new();

    while sampled_lines < 4096 {
        buffer.clear();
        let bytes = reader.read_line(&mut buffer)?;
        if bytes == 0 {
            break;
        }
        sampled_lines += 1;
        sampled_bytes += bytes;
    }

    if sampled_lines == 0 || sampled_bytes == 0 {
        return Ok(0);
    }

    let average_bytes_per_line = sampled_bytes as f64 / sampled_lines as f64;
    let estimated_total_lines = (total_bytes / average_bytes_per_line).round() as usize;
    Ok(estimated_total_lines.saturating_sub(1))
}

fn finalize_aggregate_bucket(
    timestamps: &mut Vec<i64>,
    raw_values: &mut Vec<Vec<Option<f64>>>,
    bucket_time_sum: &mut f64,
    bucket_time_count: &mut usize,
    bucket_row_count: &mut usize,
    bucket_sums: &mut [f64],
    bucket_counts: &mut [usize],
) {
    if *bucket_row_count == 0 {
        return;
    }

    timestamps.push((*bucket_time_sum / *bucket_time_count as f64).round() as i64);
    raw_values.push(
        bucket_sums
            .iter()
            .zip(bucket_counts.iter())
            .map(|(sum, count)| (*count > 0).then_some(*sum / *count as f64))
            .collect(),
    );

    *bucket_time_sum = 0.0;
    *bucket_time_count = 0;
    *bucket_row_count = 0;
    bucket_sums.fill(0.0);
    bucket_counts.fill(0);
}

fn load_fault_times(fault_dir: &Path) -> Result<BTreeMap<String, i64>> {
    let mut map = BTreeMap::new();
    for entry in fs::read_dir(fault_dir)?.flatten() {
        let path = entry.path();
        if path.extension().and_then(|ext| ext.to_str()) != Some("csv") {
            continue;
        }
        let file_name = path
            .file_name()
            .and_then(|name| name.to_str())
            .unwrap_or_default();
        let run_id = file_name
            .split("_train_fault_data")
            .next()
            .unwrap_or(file_name)
            .to_string();
        let mut reader = csv::ReaderBuilder::new().from_path(&path)?;
        let mut earliest: Option<i64> = None;
        for record in reader.records() {
            let record = record?;
            let time = record
                .get(0)
                .ok_or_else(|| {
                    DsfbSemiconductorError::DatasetFormat(format!(
                        "fault file {} missing time column",
                        path.display()
                    ))
                })?
                .parse::<i64>()
                .map_err(|err| {
                    DsfbSemiconductorError::DatasetFormat(format!(
                        "invalid fault time in {}: {err}",
                        path.display()
                    ))
                })?;
            earliest = Some(match earliest {
                Some(current) => current.min(time),
                None => time,
            });
        }
        if let Some(time) = earliest {
            map.insert(run_id, time);
        }
    }
    Ok(map)
}

fn load_ttf_zero_times(ttf_dir: &Path) -> Result<BTreeMap<String, i64>> {
    let mut map = BTreeMap::new();
    for entry in fs::read_dir(ttf_dir)?.flatten() {
        let path = entry.path();
        if path.extension().and_then(|ext| ext.to_str()) != Some("csv") {
            continue;
        }
        let run_id = run_id_from_sensor_path(&path)?;
        let mut reader = csv::ReaderBuilder::new().from_path(&path)?;
        let mut earliest = None;
        for record in reader.records() {
            let record = record?;
            let time = record
                .get(0)
                .ok_or_else(|| {
                    DsfbSemiconductorError::DatasetFormat(format!(
                        "ttf file {} missing time column",
                        path.display()
                    ))
                })?
                .parse::<i64>()
                .map_err(|err| {
                    DsfbSemiconductorError::DatasetFormat(format!(
                        "invalid ttf time in {}: {err}",
                        path.display()
                    ))
                })?;
            let has_zero = record
                .iter()
                .skip(1)
                .filter_map(|value| value.parse::<f64>().ok())
                .any(|value| value <= 0.0);
            if has_zero {
                earliest = Some(time);
                break;
            }
        }
        if let Some(time) = earliest {
            map.insert(run_id, time);
        }
    }
    Ok(map)
}

fn summarize_phm_lead_times(rows: &[Phm2018LeadTimeRow]) -> Phm2018EarlyWarningStats {
    let comparable = rows
        .iter()
        .filter_map(|row| row.lead_time_delta.map(|value| value as f64))
        .collect::<Vec<_>>();
    let earlier = rows
        .iter()
        .filter(
            |row| match (row.dsfb_detection_time, row.threshold_detection_time) {
                (Some(dsfb), Some(threshold)) => dsfb < threshold,
                (Some(_), None) => true,
                _ => false,
            },
        )
        .count();
    let equal = rows
        .iter()
        .filter(
            |row| match (row.dsfb_detection_time, row.threshold_detection_time) {
                (Some(dsfb), Some(threshold)) => dsfb == threshold,
                _ => false,
            },
        )
        .count();
    let later = rows.len().saturating_sub(earlier + equal);
    let mut sorted = comparable.clone();
    sorted.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let median = if sorted.is_empty() {
        None
    } else if sorted.len() % 2 == 1 {
        Some(sorted[sorted.len() / 2])
    } else {
        Some((sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0)
    };

    Phm2018EarlyWarningStats {
        threshold_baseline: PHM_THRESHOLD_BASELINE.into(),
        total_runs: rows.len(),
        comparable_runs: comparable.len(),
        mean_lead_delta: (!comparable.is_empty())
            .then_some(comparable.iter().sum::<f64>() / comparable.len() as f64),
        median_lead_delta: median,
        percent_runs_dsfb_earlier: percent(earlier, rows.len()),
        percent_runs_equal: percent(equal, rows.len()),
        percent_runs_later: percent(later, rows.len()),
    }
}

fn summarize_phm_structural_metrics(run_details: &[Phm2018RunDetail]) -> Phm2018StructuralMetrics {
    let comparable = run_details
        .iter()
        .filter_map(|detail| {
            detail
                .structure_minus_threshold_delta
                .map(|value| value as f64)
        })
        .collect::<Vec<_>>();
    let runs_with_structured_emergence = run_details
        .iter()
        .filter(|detail| detail.earliest_structured_run_index.is_some())
        .count();
    let runs_with_structure_before_threshold = run_details
        .iter()
        .filter(|detail| {
            detail
                .structure_minus_threshold_delta
                .is_some_and(|value| value > 0)
        })
        .count();
    let mut sorted = comparable.clone();
    sorted.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let median = if sorted.is_empty() {
        None
    } else if sorted.len() % 2 == 1 {
        Some(sorted[sorted.len() / 2])
    } else {
        Some((sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0)
    };

    Phm2018StructuralMetrics {
        threshold_baseline: PHM_THRESHOLD_BASELINE.into(),
        total_runs: run_details.len(),
        runs_with_structured_emergence,
        comparable_structure_runs: comparable.len(),
        runs_with_structure_before_threshold,
        percent_structure_before_threshold: percent(
            runs_with_structure_before_threshold,
            run_details.len(),
        ),
        mean_structure_minus_threshold_delta: (!comparable.is_empty())
            .then_some(comparable.iter().sum::<f64>() / comparable.len() as f64),
        median_structure_minus_threshold_delta: median,
    }
}

fn build_claim_alignment_report(
    secom_run_dir: &Path,
    phm_stats: &Phm2018EarlyWarningStats,
    phm_structural: &Phm2018StructuralMetrics,
) -> Result<ClaimAlignmentReport> {
    let secom_targets =
        load_json::<serde_json::Value>(&secom_run_dir.join("dsa_operator_delta_targets.json"))?;
    let episode_precision =
        load_json::<serde_json::Value>(&secom_run_dir.join("episode_precision_metrics.json")).ok();
    let episode_precision_text = episode_precision
        .as_ref()
        .and_then(|json| {
            Some((
                json.get("dsfb_precision")?.as_f64()?,
                json.get("raw_alarm_precision")?.as_f64()?,
                json.get("precision_gain_factor")?.as_f64()?,
            ))
        })
        .map(|(dsfb_precision, raw_precision, gain)| {
            format!(
                "episode precision, with DSFB at {:.1}% versus a raw-boundary proxy of {:.2}% ({:.1}x)",
                dsfb_precision * 100.0,
                raw_precision * 100.0,
                gain,
            )
        })
        .unwrap_or_else(|| "episode precision surfaced as the primary operator metric".into());
    let delta_investigation = secom_targets
        .get("delta_investigation_load")
        .and_then(|value| value.as_f64())
        .unwrap_or_default()
        * 100.0;
    let delta_episode = secom_targets
        .get("delta_episode_count")
        .and_then(|value| value.as_f64())
        .unwrap_or_default()
        * 100.0;
    let mut phm_supports = Vec::new();
    phm_supports.push(format!(
        "structural co-occurrence: DSFB grammar-state emergence observed alongside the {} baseline on {} of {} PHM2018 runs",
        PHM_THRESHOLD_BASELINE,
        phm_stats.comparable_runs,
        phm_stats.total_runs,
    ));
    if let Some(mean_delta) = phm_stats.mean_lead_delta {
        phm_supports.push(format!(
            "structural co-occurrence timing: mean {}-minus-DSFB emergence gap {:.2} (positive = structure emerged before threshold; negative = after)",
            PHM_THRESHOLD_BASELINE, mean_delta
        ));
    }
    if let Some(mean_structure_delta) = phm_structural.mean_structure_minus_threshold_delta {
        phm_supports.push(format!(
            "structure-emergence comparison: mean {}-minus-structure-emergence gap {:.2}, with structure preceding threshold on {:.1}% of runs",
            PHM_THRESHOLD_BASELINE,
            mean_structure_delta,
            phm_structural.percent_structure_before_threshold * 100.0,
        ));
    }

    Ok(ClaimAlignmentReport {
        secom_supports: vec![
            format!(
                "episode compression: {:.1}% reduction versus the raw-boundary episode baseline",
                delta_episode
            ),
            episode_precision_text,
            format!(
                "investigation load reduction: {:.1}% versus Numeric-only DSA",
                delta_investigation
            ),
        ],
        secom_does_not_support: vec![
            "DSFB is an observer-only, read-only, non-intrusive monitoring layer; it does not replace EWMA, thresholds, or DSA".into(),
            "DSFB augments and reinforces existing detection methods, making them more effective rather than competing with them".into(),
        ],
        phm2018_supports: phm_supports,
        claims_not_made: vec![
            "any unsupported delta without naming its baseline".into(),
            "universal dominance over scalar baselines".into(),
            "replacement of existing detection methods; DSFB is additive and non-intrusive only".into(),
            "PHM burden reduction without direct PHM burden metrics".into(),
        ],
    })
}

fn heuristic_alert_default(heuristic_name: &str) -> HeuristicAlertClass {
    heuristic_policy_definition(heuristic_name)
        .map(|definition| definition.alert_class_default)
        .or_else(|| {
            expanded_semantic_policy_definitions()
                .into_iter()
                .find(|definition| definition.motif_name == heuristic_name)
                .map(|definition| definition.alert_class_default)
        })
        .unwrap_or(HeuristicAlertClass::Silent)
}

fn resolve_secom_run_dir(secom_run_dir: Option<&Path>, output_root: &Path) -> Result<PathBuf> {
    if let Some(path) = secom_run_dir {
        return Ok(path.to_path_buf());
    }
    let candidates = [
        output_root.to_path_buf(),
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("output-dsfb-semiconductor"),
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .and_then(Path::parent)
            .map(Path::to_path_buf)
            .unwrap_or_else(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")))
            .join("output-dsfb-semiconductor"),
    ];
    for root in candidates {
        if let Some(path) = latest_run_dir(&root, "_secom")? {
            return Ok(path);
        }
    }
    Err(DsfbSemiconductorError::DatasetFormat(
        "could not resolve a SECOM run directory for claim alignment".into(),
    ))
}

fn latest_run_dir(root: &Path, suffix: &str) -> Result<Option<PathBuf>> {
    let mut dirs = fs::read_dir(root)
        .ok()
        .into_iter()
        .flat_map(|entries| entries.flatten())
        .map(|entry| entry.path())
        .filter(|path| path.is_dir())
        .filter(|path| {
            path.file_name()
                .and_then(|name| name.to_str())
                .is_some_and(|name| name.contains("dsfb-semiconductor") && name.ends_with(suffix))
        })
        .collect::<Vec<_>>();
    dirs.sort();
    Ok(dirs.pop())
}

fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<()> {
    let file = File::create(path)?;
    serde_json::to_writer_pretty(file, value)?;
    Ok(())
}

fn load_json<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
    let file = File::open(path)?;
    Ok(serde_json::from_reader(file)?)
}

fn write_serialized_csv<T: Serialize>(path: &Path, rows: &[T]) -> Result<()> {
    let mut writer = csv::Writer::from_path(path)?;
    for row in rows {
        writer.serialize(row)?;
    }
    writer.flush()?;
    Ok(())
}

fn phm2018_tex_report(
    early_warning: &Phm2018EarlyWarningStats,
    structural: &Phm2018StructuralMetrics,
    claim: &ClaimAlignmentReport,
    training_run_count: usize,
    figure_files: &[String],
) -> String {
    let fmt_f64 = |v: Option<f64>| {
        v.map(|x| format!("{x:.2}"))
            .unwrap_or_else(|| "n/a".into())
    };
    let esc = |s: &str| {
        s.replace('_', "\\_")
            .replace('&', "\\&")
            .replace('%', "\\%")
            .replace('#', "\\#")
            .replace('$', "\\$")
            // math symbols — added after $-escaping so these new $ aren't re-escaped
            .replace('≥', "$\\geq$")
            .replace('≤', "$\\leq$")
            .replace('→', "$\\to$")
            .replace('←', "$\\leftarrow$")
            .replace('×', "$\\times$")
            .replace('±', "$\\pm$")
            .replace('∞', "$\\infty$")
    };
    let list_items = |items: &[String]| -> String {
        items
            .iter()
            .map(|s| format!("  \\item {}\n", esc(s)))
            .collect::<String>()
    };

    let fig_caption = |file: &str| match file {
        "phm_lead_before_fault.png" =>
            "Lead time before fault: DSFB (blue) vs run-energy threshold (red) per run. \
             Bars show timestamp units before fault; taller bar = earlier detection. \
             Runs where detection did not occur are omitted.",
        "phm_lead_delta_per_run.png" =>
            "Lead time delta per run: positive (green) means DSFB detected earlier than \
             the threshold baseline; negative (red) means the threshold was earlier. \
             Grey bars indicate no comparable detection pair for that run.",
        "phm_structural_emergence.png" =>
            "Detection outcome summary across all PHM2018 training runs. \
             `DSFB earlier' counts runs where the DSFB DSA alert preceded the threshold; \
             `Threshold earlier' counts the converse; \
             `Tied' means simultaneous detection; \
             `DSFB only' and `Threshold only' count single-side detections; \
             `Neither detected' counts runs with no alert before fault.",
        _ => "Crate-generated figure.",
    };
    let figures_section = if figure_files.is_empty() {
        String::new()
    } else {
        let mut s = "\\section{Figures}\n\n".to_string();
        for file in figure_files {
            s.push_str(&format!(
                "\\begin{{figure}}[htbp]\n\
                 \\centering\n\
                 \\includegraphics[width=0.95\\linewidth]{{figures/{}}}\n\
                 \\caption{{{}}}\n\
                 \\end{{figure}}\n\n",
                file,
                fig_caption(file),
            ));
        }
        s
    };

    format!(
        r#"\documentclass{{article}}
\usepackage[utf8]{{inputenc}}
\usepackage[margin=1in]{{geometry}}
\usepackage{{graphicx}}
\usepackage{{booktabs}}
\usepackage{{hyperref}}
\usepackage{{parskip}}
\title{{DSFB PHM 2018 Engineering Report}}
\author{{DSFB Semiconductor Companion Crate}}
\date{{\today}}
\begin{{document}}
\maketitle

\section{{Dataset}}
\begin{{itemize}}
  \item Dataset: PHM 2018 Ion Mill Etch (train split)
  \item Training runs processed: {training_run_count}
  \item Evidence class: trajectory-level structural emergence comparison against a univariate run-energy threshold baseline
  \item Non-claim: this run does not establish SEMI compliance, production readiness, or universal early-warning superiority
\end{{itemize}}

\section{{Structural Co-occurrence Statistics}}
\begin{{itemize}}
  \item Baseline: \texttt{{{baseline}}}
  \item Total runs: {total_runs}
  \item Runs with co-occurring DSFB and baseline detection: {comparable_runs}
  \item Mean baseline-minus-DSFB emergence gap (seconds): {mean_lead}
  \item Median baseline-minus-DSFB emergence gap (seconds): {median_lead}
\end{{itemize}}

\section{{Structural Emergence}}
\begin{{itemize}}
  \item Runs with structured emergence before threshold: {struct_before}/{comparable_struct} ({pct_struct:.1}\%)
  \item Mean structure-emergence minus threshold delta (seconds): {mean_struct_delta}
  \item Median structure-emergence minus threshold delta (seconds): {median_struct_delta}
\end{{itemize}}

\section{{Claim Alignment}}

\subsection{{SECOM supports}}
\begin{{itemize}}
{secom_supports}\end{{itemize}}

\subsection{{Design boundaries (observer layer)}}
\begin{{itemize}}
{secom_does_not}
\end{{itemize}}

\subsection{{PHM 2018 supports}}
\begin{{itemize}}
{phm_supports}\end{{itemize}}

\subsection{{Claims not made}}
\begin{{itemize}}
{claims_not_made}\end{{itemize}}

\section{{Interpretation}}
This report covers bounded public-data evidence only.
The PHM 2018 dataset contains labeled fault times for ion mill etch training runs.
DSFB operates as an observer-only, read-only, non-intrusive monitoring layer.
It does not replace, override, or compete with existing detection methods such as
EWMA, run-energy thresholds, or DSA---it augments them, making each existing
method more effective by providing an additional structural evidence layer.
The structural co-occurrence comparison against a univariate run-energy threshold
baseline is presented as observational context, not as a competitive benchmark.
No claims of superiority, replacement, or deployment-ready dominance are made.

{figures_section}
\end{{document}}
"#,
        training_run_count = training_run_count,
        baseline = esc(&early_warning.threshold_baseline),
        total_runs = early_warning.total_runs,
        comparable_runs = early_warning.comparable_runs,
        mean_lead = fmt_f64(early_warning.mean_lead_delta),
        median_lead = fmt_f64(early_warning.median_lead_delta),
        struct_before = structural.runs_with_structure_before_threshold,
        comparable_struct = structural.comparable_structure_runs,
        pct_struct = structural.percent_structure_before_threshold * 100.0,
        mean_struct_delta = fmt_f64(structural.mean_structure_minus_threshold_delta),
        median_struct_delta = fmt_f64(structural.median_structure_minus_threshold_delta),
        secom_supports = list_items(&claim.secom_supports),
        secom_does_not = list_items(&claim.secom_does_not_support),
        phm_supports = list_items(&claim.phm2018_supports),
        claims_not_made = list_items(&claim.claims_not_made),
        figures_section = figures_section,
    )
}

fn percent(numerator: usize, denominator: usize) -> f64 {
    if denominator == 0 {
        0.0
    } else {
        numerator as f64 / denominator as f64
    }
}