Skip to main content

dsfb_semiconductor/
phm2018_loader.rs

1use crate::baselines::compute_baselines;
2use crate::config::PipelineConfig;
3use crate::dataset::phm2018::support_status;
4use crate::error::{DsfbSemiconductorError, Result};
5use crate::grammar::evaluate_grammar;
6use crate::heuristics::{
7    expanded_semantic_policy_definitions, heuristic_policy_definition, HeuristicAlertClass,
8};
9use crate::nominal::build_nominal_model;
10use crate::output_paths::{compile_pdf, create_timestamped_run_dir, zip_directory};
11use crate::plots::generate_phm2018_figures;
12use crate::precursor::evaluate_dsa;
13use crate::preprocessing::{DatasetSummary, PreparedDataset};
14use crate::residual::compute_residuals;
15use crate::semiotics::{build_semantic_layer, classify_motifs};
16use crate::signs::compute_signs;
17use chrono::{DateTime, Utc};
18use serde::{Deserialize, Serialize};
19use std::collections::BTreeMap;
20use std::fs::{self, File};
21use std::io::{BufRead, BufReader};
22use std::path::{Path, PathBuf};
23
/// Index of the first sensor column in a PHM sensor CSV; columns before it are skipped when
/// feature names and feature values are read (column 0 is read separately as the time field).
const PHM_SENSOR_COLUMN_START: usize = 7;
/// Divisor used to derive the aggregation bucket size from the estimated row count
/// (`estimated_rows / PHM_MAX_AGGREGATED_POINTS`, with a minimum bucket size of 1).
const PHM_MAX_AGGREGATED_POINTS: usize = 4096;
/// Value assigned to `DsaConfig::window` for every PHM 2018 run.
const PHM_SELECTED_DSA_WINDOW: usize = 8;
/// Value assigned to `DsaConfig::persistence_runs` for every PHM 2018 run.
const PHM_SELECTED_DSA_PERSISTENCE: usize = 2;
/// Value assigned to `DsaConfig::alert_tau` for every PHM 2018 run.
const PHM_SELECTED_DSA_TAU: f64 = 1.5;
/// Value assigned to `DsaConfig::corroborating_feature_count_min` for every PHM 2018 run.
const PHM_SELECTED_DSA_M: usize = 1;
/// Label stored in the `threshold_baseline` field of the PHM 2018 statistics structs.
const PHM_THRESHOLD_BASELINE: &str = "run_energy_scalar_threshold";
31
/// Locations of the artifacts written by `run_phm2018_benchmark`.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018RunArtifacts {
    /// Timestamped directory that holds every artifact of this run.
    pub run_dir: PathBuf,
    /// `phm2018_lead_time_metrics.csv` inside `run_dir`.
    pub lead_time_metrics_path: PathBuf,
    /// `phm2018_early_warning_stats.json` inside `run_dir`.
    pub early_warning_stats_path: PathBuf,
    /// `phm2018_structural_metrics.json` inside `run_dir`.
    pub structural_metrics_path: PathBuf,
    /// `claim_alignment_report.json` inside `run_dir`.
    pub claim_alignment_report_path: PathBuf,
    /// `artifact_manifest.json` inside `run_dir`.
    pub manifest_path: PathBuf,
    /// `engineering_report.tex` inside `run_dir`.
    pub tex_report_path: PathBuf,
    /// PDF path as returned by `compile_pdf`; `None` when no PDF was produced.
    pub pdf_path: Option<PathBuf>,
    /// `run_bundle.zip` inside `run_dir`, created after all other files are written.
    pub zip_path: PathBuf,
}
44
/// One row of the lead-time CSV: first detection times of both methods for a single run.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018LeadTimeRow {
    /// Run identifier (`{lot}_{tool}` derived from the sensor file name).
    pub run_id: String,
    /// Aggregated raw timestamp of the first DSFB primary alert before the fault, if any.
    pub dsfb_detection_time: Option<i64>,
    /// Aggregated raw timestamp of the first run-energy threshold alarm before the fault, if any.
    pub threshold_detection_time: Option<i64>,
    /// `threshold_detection_time - dsfb_detection_time` when both exist; positive means the
    /// DSFB alert came first.
    pub lead_time_delta: Option<i64>,
}
52
/// Aggregate lead-time statistics over all PHM 2018 runs (see `summarize_phm_lead_times`).
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018EarlyWarningStats {
    /// Name of the threshold baseline the DSFB alerts are compared against.
    pub threshold_baseline: String,
    /// Number of runs summarized.
    pub total_runs: usize,
    /// Number of runs where both methods produced a detection (a lead-time delta exists).
    pub comparable_runs: usize,
    /// Mean lead-time delta over comparable runs; `None` when there are none.
    pub mean_lead_delta: Option<f64>,
    /// Median lead-time delta over comparable runs; `None` when there are none.
    pub median_lead_delta: Option<f64>,
    /// Share of all runs where DSFB detected strictly earlier, or where only DSFB detected.
    pub percent_runs_dsfb_earlier: f64,
    /// Share of all runs where both methods detected at the same timestamp.
    pub percent_runs_equal: f64,
    /// Share of all remaining runs (neither earlier nor equal), including runs where DSFB
    /// produced no detection at all.
    pub percent_runs_later: f64,
}
64
/// Aggregate statistics about structured emergence relative to the threshold baseline
/// (see `summarize_phm_structural_metrics`).
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018StructuralMetrics {
    /// Name of the threshold baseline used for the comparison.
    pub threshold_baseline: String,
    /// Number of runs summarized.
    pub total_runs: usize,
    /// Runs that have an earliest structured run index.
    pub runs_with_structured_emergence: usize,
    /// Runs where both a structured time and a threshold time exist (a delta exists).
    pub comparable_structure_runs: usize,
    /// Runs whose structure-minus-threshold delta is strictly positive.
    pub runs_with_structure_before_threshold: usize,
    /// `runs_with_structure_before_threshold` as a share of all runs.
    pub percent_structure_before_threshold: f64,
    /// Mean of the deltas over comparable runs; `None` when there are none.
    pub mean_structure_minus_threshold_delta: Option<f64>,
    /// Median of the deltas over comparable runs; `None` when there are none.
    pub median_structure_minus_threshold_delta: Option<f64>,
}
76
/// Claim statements serialized to `claim_alignment_report.json`; built by
/// `build_claim_alignment_report`.
///
/// NOTE(review): the per-field descriptions below are inferred from the field names; confirm
/// against the builder before relying on them.
#[derive(Debug, Clone, Serialize)]
pub struct ClaimAlignmentReport {
    /// Presumably statements the SECOM results support.
    pub secom_supports: Vec<String>,
    /// Presumably statements the SECOM results do not support.
    pub secom_does_not_support: Vec<String>,
    /// Presumably statements the PHM 2018 results support.
    pub phm2018_supports: Vec<String>,
    /// Presumably statements that are explicitly not claimed.
    pub claims_not_made: Vec<String>,
}
84
/// Contents of `artifact_manifest.json`: artifact locations stored as display strings.
#[derive(Debug, Clone, Serialize)]
struct Phm2018ArtifactManifest {
    /// Dataset label; `run_phm2018_benchmark` writes `"PHM2018"`.
    dataset: String,
    /// Run directory path.
    run_dir: String,
    /// Path of the lead-time metrics CSV.
    lead_time_metrics_path: String,
    /// Path of the early-warning statistics JSON.
    early_warning_stats_path: String,
    /// Path of the structural metrics JSON.
    structural_metrics_path: String,
    /// Path of the support status JSON (`phm2018_support_status.json`).
    support_status_path: String,
    /// Path of the claim alignment report JSON.
    claim_alignment_report_path: String,
    /// Path of the zip bundle of the run directory.
    zip_path: String,
}
96
/// Per-run detail record written to `phm2018_run_details.json`.
///
/// All `*_run_index` fields index into the aggregated series of the run; all `*_time` fields
/// are the aggregated raw timestamps at those indices.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018RunDetail {
    /// Run identifier (`{lot}_{tool}`).
    pub run_id: String,
    /// Fault target time of the run.
    pub fault_time: i64,
    /// Index of the first aggregated point at or after `fault_time` (last index if none).
    pub fault_index: usize,
    /// Number of leading aggregated points treated as the healthy prefix.
    pub healthy_prefix_count: usize,
    /// First index of the evaluation window: `min(healthy_prefix_count, fault_index)`.
    pub evaluation_start_run_index: usize,
    /// First index in the evaluation window with a DSFB primary alert.
    pub dsfb_detection_run_index: Option<usize>,
    /// First index in the evaluation window with a run-energy threshold alarm.
    pub threshold_detection_run_index: Option<usize>,
    /// Smallest in-window index among semantic candidates whose heuristic is not silent.
    pub earliest_semantic_run_index: Option<usize>,
    /// `earliest_semantic_run_index`, or else the smallest in-window index carrying a motif
    /// label other than `StableAdmissible`.
    pub earliest_structured_run_index: Option<usize>,
    /// Timestamp at `dsfb_detection_run_index`.
    pub dsfb_detection_time: Option<i64>,
    /// Timestamp at `threshold_detection_run_index`.
    pub threshold_detection_time: Option<i64>,
    /// Timestamp at `earliest_semantic_run_index`.
    pub earliest_semantic_time: Option<i64>,
    /// Timestamp at `earliest_structured_run_index`.
    pub earliest_structured_time: Option<i64>,
    /// `threshold_detection_time - dsfb_detection_time` when both exist.
    pub lead_time_delta: Option<i64>,
    /// `threshold_detection_time - earliest_structured_time` when both exist.
    pub structure_minus_threshold_delta: Option<i64>,
}
115
/// Description of one PHM 2018 train run before its sensor data is loaded.
#[derive(Debug, Clone)]
struct Phm2018RunSpec {
    /// Run identifier (`{lot}_{tool}`) derived from the sensor file name.
    run_id: String,
    /// Path of the sensor CSV for this run.
    sensor_path: PathBuf,
    /// Fault target time: taken from the fault files, else from the TTF fallback.
    fault_time: i64,
}
122
/// Aggregated sensor series of one PHM 2018 train run.
#[derive(Debug, Clone)]
struct Phm2018RunSeries {
    /// Run identifier copied from the run spec.
    run_id: String,
    /// One rounded mean timestamp per aggregated bucket.
    timestamps_raw: Vec<i64>,
    /// Sensor column names (CSV header from `PHM_SENSOR_COLUMN_START` onward).
    feature_names: Vec<String>,
    /// Per bucket, per feature: mean of the parsed values, or `None` if none parsed.
    raw_values: Vec<Vec<Option<f64>>>,
    /// Fault target time copied from the run spec.
    fault_time: i64,
    /// Index of the first bucket whose timestamp is `>= fault_time`; last index if none.
    fault_index: usize,
    /// Number of leading buckets treated as healthy (see `healthy_prefix_count`).
    healthy_prefix_count: usize,
}
133
/// Runs the PHM 2018 benchmark end to end and writes all artifacts into a new run directory.
///
/// For every train run, the pipeline stages (nominal model, residuals, signs, baselines,
/// grammar, motifs, semantic layer, DSA) are evaluated. The first DSFB primary alert and the
/// first run-energy threshold alarm are then searched inside the window
/// `[evaluation_start_run_index, fault_index)` and compared.
///
/// # Arguments
/// * `data_root` - passed to `support_status` to locate the extracted PHM 2018 dataset.
/// * `output_root` - directory (created if needed) under which the timestamped run directory
///   is created.
/// * `secom_run_dir` - optional SECOM run directory, forwarded to `resolve_secom_run_dir`.
///
/// # Errors
/// Returns `DatasetMissing` when no extracted dataset is detected and `DatasetFormat` when no
/// train runs are found. Errors from run loading, DSA evaluation, SECOM directory resolution,
/// claim report construction, artifact writing, and zipping are propagated. Figure generation
/// and PDF compilation problems are only reported on stderr.
///
/// NOTE(review): the alert/alarm vectors and `timestamps_raw` are indexed directly with run
/// indices below `fault_index`; this assumes they cover every aggregated point — confirm.
pub fn run_phm2018_benchmark(
    data_root: &Path,
    output_root: &Path,
    secom_run_dir: Option<&Path>,
) -> Result<Phm2018RunArtifacts> {
    // Refuse to run unless the extracted dataset tree was detected.
    let status = support_status(data_root);
    if !status.extracted_dataset_detected {
        return Err(DsfbSemiconductorError::DatasetMissing {
            dataset: "PHM 2018 ion mill etch",
            path: status.extracted_dataset_path,
        });
    }

    let run_specs = load_phm2018_train_run_specs(&status.extracted_dataset_path)?;
    if run_specs.is_empty() {
        return Err(DsfbSemiconductorError::DatasetFormat(
            "PHM 2018 extracted tree contains no train runs".into(),
        ));
    }

    fs::create_dir_all(output_root)?;
    let run_dir = create_timestamped_run_dir(output_root, "phm2018")?;
    let mut lead_time_rows = Vec::new();
    let mut run_details = Vec::new();

    for run_spec in &run_specs {
        // Each run gets its own config, sized from its healthy prefix and fault index.
        let run = load_phm2018_train_run_series(run_spec)?;
        let config = phm_pipeline_config(run.healthy_prefix_count, run.fault_index);
        let prepared = run.as_prepared_dataset();
        let nominal = build_nominal_model(&prepared, &config);
        let residuals = compute_residuals(&prepared, &nominal);
        let signs = compute_signs(&prepared, &nominal, &residuals, &config);
        let baselines = compute_baselines(&prepared, &nominal, &residuals, &config);
        let grammar = evaluate_grammar(&residuals, &signs, &nominal, &config);
        let motifs = classify_motifs(
            &prepared,
            &nominal,
            &residuals,
            &signs,
            &grammar,
            config.pre_failure_lookback_runs,
        );
        let semantic_layer = build_semantic_layer(
            &prepared,
            &residuals,
            &signs,
            &grammar,
            &motifs,
            &nominal,
            config.pre_failure_lookback_runs,
        );
        let dsa = evaluate_dsa(
            &prepared,
            &nominal,
            &residuals,
            &signs,
            &baselines,
            &grammar,
            &config.dsa,
            config.pre_failure_lookback_runs,
        )?;

        // Detections only count inside [evaluation_start_run_index, fault_index): after the
        // healthy prefix and strictly before the fault.
        let evaluation_start_run_index = run.healthy_prefix_count.min(run.fault_index);
        let dsfb_detection_run_index = (evaluation_start_run_index..run.fault_index)
            .find(|&run_index| dsa.run_signals.primary_run_alert[run_index]);
        let threshold_detection_run_index = (evaluation_start_run_index..run.fault_index)
            .find(|&run_index| baselines.run_energy.alarm[run_index]);
        let dsfb_detection_time =
            dsfb_detection_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let threshold_detection_time =
            threshold_detection_run_index.map(|run_index| run.timestamps_raw[run_index]);
        // Earliest in-window semantic candidate whose heuristic alert class is not `Silent`.
        let earliest_semantic_run_index = semantic_layer
            .ranked_candidates
            .iter()
            .filter(|row| {
                row.run_index >= evaluation_start_run_index && row.run_index < run.fault_index
            })
            .filter(|row| {
                !matches!(
                    heuristic_alert_default(row.heuristic_name.as_str()),
                    HeuristicAlertClass::Silent
                )
            })
            .map(|row| row.run_index)
            .min();
        // Without a semantic candidate, fall back to the earliest in-window motif label (over
        // all traces) that is not `StableAdmissible`.
        let earliest_structured_run_index = earliest_semantic_run_index.or_else(|| {
            motifs
                .traces
                .iter()
                .flat_map(|trace| trace.labels.iter().enumerate())
                .filter(|(run_index, label)| {
                    *run_index >= evaluation_start_run_index
                        && *run_index < run.fault_index
                        && !matches!(label, crate::semiotics::DsfbMotifClass::StableAdmissible)
                })
                .map(|(run_index, _)| run_index)
                .min()
        });
        let earliest_semantic_time =
            earliest_semantic_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let earliest_structured_time =
            earliest_structured_run_index.map(|run_index| run.timestamps_raw[run_index]);
        // Both deltas are "threshold minus DSFB": positive means DSFB was earlier.
        let lead_time_delta = match (dsfb_detection_time, threshold_detection_time) {
            (Some(dsfb), Some(threshold)) => Some(threshold - dsfb),
            _ => None,
        };
        let structure_minus_threshold_delta =
            match (earliest_structured_time, threshold_detection_time) {
                (Some(structure), Some(threshold)) => Some(threshold - structure),
                _ => None,
            };

        lead_time_rows.push(Phm2018LeadTimeRow {
            run_id: run.run_id.clone(),
            dsfb_detection_time,
            threshold_detection_time,
            lead_time_delta,
        });
        run_details.push(Phm2018RunDetail {
            run_id: run.run_id.clone(),
            fault_time: run.fault_time,
            fault_index: run.fault_index,
            healthy_prefix_count: run.healthy_prefix_count,
            evaluation_start_run_index,
            dsfb_detection_run_index,
            threshold_detection_run_index,
            earliest_semantic_run_index,
            earliest_structured_run_index,
            dsfb_detection_time,
            threshold_detection_time,
            earliest_semantic_time,
            earliest_structured_time,
            lead_time_delta,
            structure_minus_threshold_delta,
        });
    }

    // Cross-run summaries and the claim alignment report (which also reads SECOM artifacts).
    let early_warning_stats = summarize_phm_lead_times(&lead_time_rows);
    let structural_metrics = summarize_phm_structural_metrics(&run_details);
    let secom_run_dir = resolve_secom_run_dir(secom_run_dir, output_root)?;
    let claim_alignment_report =
        build_claim_alignment_report(&secom_run_dir, &early_warning_stats, &structural_metrics)?;

    let lead_time_metrics_path = run_dir.join("phm2018_lead_time_metrics.csv");
    let early_warning_stats_path = run_dir.join("phm2018_early_warning_stats.json");
    let structural_metrics_path = run_dir.join("phm2018_structural_metrics.json");
    let claim_alignment_report_path = run_dir.join("claim_alignment_report.json");
    let manifest_path = run_dir.join("artifact_manifest.json");
    let zip_path = run_dir.join("run_bundle.zip");

    write_serialized_csv(&lead_time_metrics_path, &lead_time_rows)?;
    write_json_pretty(&early_warning_stats_path, &early_warning_stats)?;
    write_json_pretty(&structural_metrics_path, &structural_metrics)?;
    write_json_pretty(&run_dir.join("phm2018_support_status.json"), &status)?;
    write_json_pretty(&run_dir.join("phm2018_run_details.json"), &run_details)?;
    write_json_pretty(&claim_alignment_report_path, &claim_alignment_report)?;

    // Engineering report (tex + pdf)
    // Figure generation is best-effort: on failure the report is written without figures.
    let figure_files = match generate_phm2018_figures(
        &run_dir,
        &run_details,
        &early_warning_stats,
        &structural_metrics,
    ) {
        Ok(files) => files,
        Err(e) => {
            eprintln!("[phm2018] Figure generation warning: {e}");
            vec![]
        }
    };
    let tex_report_path = run_dir.join("engineering_report.tex");
    fs::write(
        &tex_report_path,
        phm2018_tex_report(
            &early_warning_stats,
            &structural_metrics,
            &claim_alignment_report,
            run_details.len(),
            &figure_files,
        ),
    )?;
    // PDF compilation is best-effort too; only the first line of the error is logged.
    let (pdf_path, pdf_error) = compile_pdf(&tex_report_path, &run_dir);
    if let Some(ref err) = pdf_error {
        eprintln!("[phm2018] PDF compile warning: {}", err.lines().next().unwrap_or("unknown"));
    }

    write_json_pretty(
        &manifest_path,
        &Phm2018ArtifactManifest {
            dataset: "PHM2018".into(),
            run_dir: run_dir.display().to_string(),
            lead_time_metrics_path: lead_time_metrics_path.display().to_string(),
            early_warning_stats_path: early_warning_stats_path.display().to_string(),
            structural_metrics_path: structural_metrics_path.display().to_string(),
            support_status_path: run_dir
                .join("phm2018_support_status.json")
                .display()
                .to_string(),
            claim_alignment_report_path: claim_alignment_report_path.display().to_string(),
            zip_path: zip_path.display().to_string(),
        },
    )?;
    // Zip after all files (including tex/pdf) are written.
    zip_directory(&run_dir, &zip_path)?;

    Ok(Phm2018RunArtifacts {
        run_dir,
        lead_time_metrics_path,
        early_warning_stats_path,
        structural_metrics_path,
        claim_alignment_report_path,
        manifest_path,
        tex_report_path,
        pdf_path,
        zip_path,
    })
}
351
352fn load_phm2018_train_run_specs(extracted_root: &Path) -> Result<Vec<Phm2018RunSpec>> {
353    let train_dir = extracted_root.join("train");
354    let fault_dir = train_dir.join("train_faults");
355    let ttf_dir = train_dir.join("train_ttf");
356
357    let sensor_files = fs::read_dir(&train_dir)?
358        .flatten()
359        .map(|entry| entry.path())
360        .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("csv"))
361        .collect::<Vec<_>>();
362
363    let fault_times = load_fault_times(&fault_dir)?;
364    let ttf_fallbacks = load_ttf_zero_times(&ttf_dir)?;
365    let mut run_specs = Vec::new();
366
367    for sensor_path in sensor_files {
368        let run_id = run_id_from_sensor_path(&sensor_path)?;
369        let fault_time = fault_times
370            .get(&run_id)
371            .copied()
372            .or_else(|| ttf_fallbacks.get(&run_id).copied())
373            .ok_or_else(|| {
374                DsfbSemiconductorError::DatasetFormat(format!(
375                    "missing fault target for PHM train run {run_id}"
376                ))
377            })?;
378        run_specs.push(Phm2018RunSpec {
379            run_id,
380            fault_time,
381            sensor_path,
382        });
383    }
384
385    run_specs.sort_by(|left, right| left.run_id.cmp(&right.run_id));
386    Ok(run_specs)
387}
388
389fn load_phm2018_train_run_series(run_spec: &Phm2018RunSpec) -> Result<Phm2018RunSeries> {
390    let (timestamps_raw, feature_names, raw_values) =
391        load_sensor_csv_aggregated(&run_spec.sensor_path)?;
392    if timestamps_raw.is_empty() || feature_names.is_empty() || raw_values.is_empty() {
393        return Err(DsfbSemiconductorError::DatasetFormat(format!(
394            "empty PHM train run {} at {}",
395            run_spec.run_id,
396            run_spec.sensor_path.display()
397        )));
398    }
399    let fault_index = timestamps_raw
400        .iter()
401        .position(|time| *time >= run_spec.fault_time)
402        .unwrap_or_else(|| timestamps_raw.len().saturating_sub(1));
403    let healthy_prefix_count = healthy_prefix_count(fault_index, timestamps_raw.len());
404
405    Ok(Phm2018RunSeries {
406        run_id: run_spec.run_id.clone(),
407        timestamps_raw,
408        feature_names,
409        raw_values,
410        fault_time: run_spec.fault_time,
411        fault_index,
412        healthy_prefix_count,
413    })
414}
415
416impl Phm2018RunSeries {
417    fn as_prepared_dataset(&self) -> PreparedDataset {
418        let run_count = self.raw_values.len();
419        let feature_count = self.feature_names.len();
420        let total_cells = run_count * feature_count;
421        let missing_cells = self
422            .raw_values
423            .iter()
424            .flat_map(|row| row.iter())
425            .filter(|value| value.is_none())
426            .count();
427        let mut per_feature_missing_fraction = vec![0.0; feature_count];
428        for feature_index in 0..feature_count {
429            let missing = self
430                .raw_values
431                .iter()
432                .filter(|row| row[feature_index].is_none())
433                .count();
434            per_feature_missing_fraction[feature_index] = if run_count == 0 {
435                0.0
436            } else {
437                missing as f64 / run_count as f64
438            };
439        }
440
441        let mut labels = vec![-1; run_count];
442        if self.fault_index < labels.len() {
443            labels[self.fault_index] = 1;
444        }
445        let timestamps = self
446            .timestamps_raw
447            .iter()
448            .enumerate()
449            .map(|(index, value)| {
450                DateTime::<Utc>::from_timestamp(*value, 0)
451                    .map(|value| value.naive_utc())
452                    .unwrap_or_else(|| {
453                        DateTime::<Utc>::from_timestamp(index as i64, 0)
454                            .expect("valid synthetic timestamp")
455                            .naive_utc()
456                    })
457            })
458            .collect::<Vec<_>>();
459        let healthy_pass_indices = (0..self.healthy_prefix_count).collect::<Vec<_>>();
460
461        PreparedDataset {
462            feature_names: self.feature_names.clone(),
463            labels,
464            timestamps,
465            raw_values: self.raw_values.clone(),
466            healthy_pass_indices,
467            per_feature_missing_fraction,
468            summary: DatasetSummary {
469                run_count,
470                feature_count,
471                pass_count: run_count.saturating_sub(1),
472                fail_count: 1,
473                dataset_missing_fraction: if total_cells == 0 {
474                    0.0
475                } else {
476                    missing_cells as f64 / total_cells as f64
477                },
478                healthy_pass_runs_requested: self.healthy_prefix_count,
479                healthy_pass_runs_found: self.healthy_prefix_count,
480            },
481        }
482    }
483}
484
485fn phm_pipeline_config(healthy_prefix_count: usize, fault_index: usize) -> PipelineConfig {
486    PipelineConfig {
487        healthy_pass_runs: healthy_prefix_count.max(2),
488        pre_failure_lookback_runs: fault_index.max(1),
489        dsa: crate::precursor::DsaConfig {
490            window: PHM_SELECTED_DSA_WINDOW,
491            persistence_runs: PHM_SELECTED_DSA_PERSISTENCE,
492            alert_tau: PHM_SELECTED_DSA_TAU,
493            corroborating_feature_count_min: PHM_SELECTED_DSA_M,
494        },
495        ..PipelineConfig::default()
496    }
497}
498
/// Chooses how many leading aggregated points of a run are treated as healthy.
///
/// The target is 10% of `fault_index` (rounded), kept within 25..=200, and then capped by
/// both `max(fault_index, 2)` and `run_len`.
fn healthy_prefix_count(fault_index: usize, run_len: usize) -> usize {
    const MIN_PREFIX: usize = 25;
    const MAX_PREFIX: usize = 200;

    let ten_percent = (fault_index as f64 * 0.10).round() as usize;
    let bounded = ten_percent.max(MIN_PREFIX).min(MAX_PREFIX);
    // The prefix may not reach past the fault (but is allowed 2 points) nor past the run.
    let ceiling = fault_index.max(2).min(run_len);
    bounded.min(ceiling)
}
506
507fn run_id_from_sensor_path(path: &Path) -> Result<String> {
508    let stem = path
509        .file_stem()
510        .and_then(|stem| stem.to_str())
511        .ok_or_else(|| {
512            DsfbSemiconductorError::DatasetFormat("invalid PHM sensor filename".into())
513        })?;
514    let mut parts = stem.split('_');
515    let lot = parts
516        .next()
517        .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM lot id".into()))?;
518    let tool = parts
519        .next()
520        .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM tool id".into()))?;
521    Ok(format!("{lot}_{tool}"))
522}
523
524fn load_sensor_csv_aggregated(
525    path: &Path,
526) -> Result<(Vec<i64>, Vec<String>, Vec<Vec<Option<f64>>>)> {
527    let total_rows = estimate_csv_data_rows(path)?;
528    let bucket_size = (total_rows / PHM_MAX_AGGREGATED_POINTS).max(1);
529
530    let mut reader = csv::ReaderBuilder::new().from_path(path)?;
531    let header = reader
532        .headers()?
533        .iter()
534        .skip(PHM_SENSOR_COLUMN_START)
535        .map(|name| name.to_string())
536        .collect::<Vec<_>>();
537    let feature_count = header.len();
538
539    let mut timestamps = Vec::new();
540    let mut raw_values = Vec::new();
541    let mut bucket_time_sum = 0f64;
542    let mut bucket_time_count = 0usize;
543    let mut bucket_row_count = 0usize;
544    let mut bucket_sums = vec![0.0; feature_count];
545    let mut bucket_counts = vec![0usize; feature_count];
546
547    for record in reader.records() {
548        let record = record?;
549        let timestamp = record
550            .get(0)
551            .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM time field".into()))?
552            .parse::<i64>()
553            .map_err(|err| {
554                DsfbSemiconductorError::DatasetFormat(format!(
555                    "invalid PHM time value in {}: {err}",
556                    path.display()
557                ))
558            })?;
559        bucket_time_sum += timestamp as f64;
560        bucket_time_count += 1;
561        bucket_row_count += 1;
562
563        for (feature_index, value) in record.iter().skip(PHM_SENSOR_COLUMN_START).enumerate() {
564            if let Ok(parsed) = value.parse::<f64>() {
565                bucket_sums[feature_index] += parsed;
566                bucket_counts[feature_index] += 1;
567            }
568        }
569
570        if bucket_row_count >= bucket_size {
571            finalize_aggregate_bucket(
572                &mut timestamps,
573                &mut raw_values,
574                &mut bucket_time_sum,
575                &mut bucket_time_count,
576                &mut bucket_row_count,
577                &mut bucket_sums,
578                &mut bucket_counts,
579            );
580        }
581    }
582
583    finalize_aggregate_bucket(
584        &mut timestamps,
585        &mut raw_values,
586        &mut bucket_time_sum,
587        &mut bucket_time_count,
588        &mut bucket_row_count,
589        &mut bucket_sums,
590        &mut bucket_counts,
591    );
592
593    Ok((timestamps, header, raw_values))
594}
595
/// Estimates the number of data rows (lines minus one header line) in a CSV file.
///
/// Only the first 4096 lines are read; their average byte length is extrapolated to the
/// file size. Files with at most 4096 lines are therefore counted from a full read.
///
/// # Errors
/// Propagates I/O errors from opening, inspecting, or reading the file.
fn estimate_csv_data_rows(path: &Path) -> Result<usize> {
    const SAMPLE_LINE_LIMIT: usize = 4096;

    let file = File::open(path)?;
    let file_size = file.metadata()?.len() as f64;
    let mut reader = BufReader::new(file);

    let mut line = String::new();
    let mut lines_seen = 0usize;
    let mut bytes_seen = 0usize;
    for _ in 0..SAMPLE_LINE_LIMIT {
        line.clear();
        match reader.read_line(&mut line)? {
            0 => break, // end of file
            read => {
                lines_seen += 1;
                bytes_seen += read;
            }
        }
    }

    if lines_seen == 0 || bytes_seen == 0 {
        return Ok(0);
    }

    let mean_line_bytes = bytes_seen as f64 / lines_seen as f64;
    let estimated_lines = (file_size / mean_line_bytes).round() as usize;
    // Discount the header line.
    Ok(estimated_lines.saturating_sub(1))
}
622
/// Flushes the current aggregation bucket into the output vectors and resets it.
///
/// Does nothing when the bucket holds no rows. Otherwise one timestamp (the rounded mean of
/// the bucket's time values) and one value row are appended; a feature's value is its mean,
/// or `None` when the bucket has no counted reading for it. All accumulators are then zeroed.
fn finalize_aggregate_bucket(
    timestamps: &mut Vec<i64>,
    raw_values: &mut Vec<Vec<Option<f64>>>,
    bucket_time_sum: &mut f64,
    bucket_time_count: &mut usize,
    bucket_row_count: &mut usize,
    bucket_sums: &mut [f64],
    bucket_counts: &mut [usize],
) {
    if *bucket_row_count == 0 {
        return;
    }

    let mean_time = *bucket_time_sum / *bucket_time_count as f64;
    timestamps.push(mean_time.round() as i64);

    let mut feature_means = Vec::new();
    for (sum, count) in bucket_sums.iter().zip(bucket_counts.iter()) {
        let mean = if *count > 0 {
            Some(*sum / *count as f64)
        } else {
            None
        };
        feature_means.push(mean);
    }
    raw_values.push(feature_means);

    // Start the next bucket from a clean slate.
    bucket_counts.fill(0);
    bucket_sums.fill(0.0);
    *bucket_row_count = 0;
    *bucket_time_count = 0;
    *bucket_time_sum = 0.0;
}
651
652fn load_fault_times(fault_dir: &Path) -> Result<BTreeMap<String, i64>> {
653    let mut map = BTreeMap::new();
654    for entry in fs::read_dir(fault_dir)?.flatten() {
655        let path = entry.path();
656        if path.extension().and_then(|ext| ext.to_str()) != Some("csv") {
657            continue;
658        }
659        let file_name = path
660            .file_name()
661            .and_then(|name| name.to_str())
662            .unwrap_or_default();
663        let run_id = file_name
664            .split("_train_fault_data")
665            .next()
666            .unwrap_or(file_name)
667            .to_string();
668        let mut reader = csv::ReaderBuilder::new().from_path(&path)?;
669        let mut earliest: Option<i64> = None;
670        for record in reader.records() {
671            let record = record?;
672            let time = record
673                .get(0)
674                .ok_or_else(|| {
675                    DsfbSemiconductorError::DatasetFormat(format!(
676                        "fault file {} missing time column",
677                        path.display()
678                    ))
679                })?
680                .parse::<i64>()
681                .map_err(|err| {
682                    DsfbSemiconductorError::DatasetFormat(format!(
683                        "invalid fault time in {}: {err}",
684                        path.display()
685                    ))
686                })?;
687            earliest = Some(match earliest {
688                Some(current) => current.min(time),
689                None => time,
690            });
691        }
692        if let Some(time) = earliest {
693            map.insert(run_id, time);
694        }
695    }
696    Ok(map)
697}
698
699fn load_ttf_zero_times(ttf_dir: &Path) -> Result<BTreeMap<String, i64>> {
700    let mut map = BTreeMap::new();
701    for entry in fs::read_dir(ttf_dir)?.flatten() {
702        let path = entry.path();
703        if path.extension().and_then(|ext| ext.to_str()) != Some("csv") {
704            continue;
705        }
706        let run_id = run_id_from_sensor_path(&path)?;
707        let mut reader = csv::ReaderBuilder::new().from_path(&path)?;
708        let mut earliest = None;
709        for record in reader.records() {
710            let record = record?;
711            let time = record
712                .get(0)
713                .ok_or_else(|| {
714                    DsfbSemiconductorError::DatasetFormat(format!(
715                        "ttf file {} missing time column",
716                        path.display()
717                    ))
718                })?
719                .parse::<i64>()
720                .map_err(|err| {
721                    DsfbSemiconductorError::DatasetFormat(format!(
722                        "invalid ttf time in {}: {err}",
723                        path.display()
724                    ))
725                })?;
726            let has_zero = record
727                .iter()
728                .skip(1)
729                .filter_map(|value| value.parse::<f64>().ok())
730                .any(|value| value <= 0.0);
731            if has_zero {
732                earliest = Some(time);
733                break;
734            }
735        }
736        if let Some(time) = earliest {
737            map.insert(run_id, time);
738        }
739    }
740    Ok(map)
741}
742
743fn summarize_phm_lead_times(rows: &[Phm2018LeadTimeRow]) -> Phm2018EarlyWarningStats {
744    let comparable = rows
745        .iter()
746        .filter_map(|row| row.lead_time_delta.map(|value| value as f64))
747        .collect::<Vec<_>>();
748    let earlier = rows
749        .iter()
750        .filter(
751            |row| match (row.dsfb_detection_time, row.threshold_detection_time) {
752                (Some(dsfb), Some(threshold)) => dsfb < threshold,
753                (Some(_), None) => true,
754                _ => false,
755            },
756        )
757        .count();
758    let equal = rows
759        .iter()
760        .filter(
761            |row| match (row.dsfb_detection_time, row.threshold_detection_time) {
762                (Some(dsfb), Some(threshold)) => dsfb == threshold,
763                _ => false,
764            },
765        )
766        .count();
767    let later = rows.len().saturating_sub(earlier + equal);
768    let mut sorted = comparable.clone();
769    sorted.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
770    let median = if sorted.is_empty() {
771        None
772    } else if sorted.len() % 2 == 1 {
773        Some(sorted[sorted.len() / 2])
774    } else {
775        Some((sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0)
776    };
777
778    Phm2018EarlyWarningStats {
779        threshold_baseline: PHM_THRESHOLD_BASELINE.into(),
780        total_runs: rows.len(),
781        comparable_runs: comparable.len(),
782        mean_lead_delta: (!comparable.is_empty())
783            .then_some(comparable.iter().sum::<f64>() / comparable.len() as f64),
784        median_lead_delta: median,
785        percent_runs_dsfb_earlier: percent(earlier, rows.len()),
786        percent_runs_equal: percent(equal, rows.len()),
787        percent_runs_later: percent(later, rows.len()),
788    }
789}
790
791fn summarize_phm_structural_metrics(run_details: &[Phm2018RunDetail]) -> Phm2018StructuralMetrics {
792    let comparable = run_details
793        .iter()
794        .filter_map(|detail| {
795            detail
796                .structure_minus_threshold_delta
797                .map(|value| value as f64)
798        })
799        .collect::<Vec<_>>();
800    let runs_with_structured_emergence = run_details
801        .iter()
802        .filter(|detail| detail.earliest_structured_run_index.is_some())
803        .count();
804    let runs_with_structure_before_threshold = run_details
805        .iter()
806        .filter(|detail| {
807            detail
808                .structure_minus_threshold_delta
809                .is_some_and(|value| value > 0)
810        })
811        .count();
812    let mut sorted = comparable.clone();
813    sorted.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
814    let median = if sorted.is_empty() {
815        None
816    } else if sorted.len() % 2 == 1 {
817        Some(sorted[sorted.len() / 2])
818    } else {
819        Some((sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0)
820    };
821
822    Phm2018StructuralMetrics {
823        threshold_baseline: PHM_THRESHOLD_BASELINE.into(),
824        total_runs: run_details.len(),
825        runs_with_structured_emergence,
826        comparable_structure_runs: comparable.len(),
827        runs_with_structure_before_threshold,
828        percent_structure_before_threshold: percent(
829            runs_with_structure_before_threshold,
830            run_details.len(),
831        ),
832        mean_structure_minus_threshold_delta: (!comparable.is_empty())
833            .then_some(comparable.iter().sum::<f64>() / comparable.len() as f64),
834        median_structure_minus_threshold_delta: median,
835    }
836}
837
838fn build_claim_alignment_report(
839    secom_run_dir: &Path,
840    phm_stats: &Phm2018EarlyWarningStats,
841    phm_structural: &Phm2018StructuralMetrics,
842) -> Result<ClaimAlignmentReport> {
843    let secom_targets =
844        load_json::<serde_json::Value>(&secom_run_dir.join("dsa_operator_delta_targets.json"))?;
845    let episode_precision =
846        load_json::<serde_json::Value>(&secom_run_dir.join("episode_precision_metrics.json")).ok();
847    let episode_precision_text = episode_precision
848        .as_ref()
849        .and_then(|json| {
850            Some((
851                json.get("dsfb_precision")?.as_f64()?,
852                json.get("raw_alarm_precision")?.as_f64()?,
853                json.get("precision_gain_factor")?.as_f64()?,
854            ))
855        })
856        .map(|(dsfb_precision, raw_precision, gain)| {
857            format!(
858                "episode precision, with DSFB at {:.1}% versus a raw-boundary proxy of {:.2}% ({:.1}x)",
859                dsfb_precision * 100.0,
860                raw_precision * 100.0,
861                gain,
862            )
863        })
864        .unwrap_or_else(|| "episode precision surfaced as the primary operator metric".into());
865    let delta_investigation = secom_targets
866        .get("delta_investigation_load")
867        .and_then(|value| value.as_f64())
868        .unwrap_or_default()
869        * 100.0;
870    let delta_episode = secom_targets
871        .get("delta_episode_count")
872        .and_then(|value| value.as_f64())
873        .unwrap_or_default()
874        * 100.0;
875    let mut phm_supports = Vec::new();
876    phm_supports.push(format!(
877        "structural co-occurrence: DSFB grammar-state emergence observed alongside the {} baseline on {} of {} PHM2018 runs",
878        PHM_THRESHOLD_BASELINE,
879        phm_stats.comparable_runs,
880        phm_stats.total_runs,
881    ));
882    if let Some(mean_delta) = phm_stats.mean_lead_delta {
883        phm_supports.push(format!(
884            "structural co-occurrence timing: mean {}-minus-DSFB emergence gap {:.2} (positive = structure emerged before threshold; negative = after)",
885            PHM_THRESHOLD_BASELINE, mean_delta
886        ));
887    }
888    if let Some(mean_structure_delta) = phm_structural.mean_structure_minus_threshold_delta {
889        phm_supports.push(format!(
890            "structure-emergence comparison: mean {}-minus-structure-emergence gap {:.2}, with structure preceding threshold on {:.1}% of runs",
891            PHM_THRESHOLD_BASELINE,
892            mean_structure_delta,
893            phm_structural.percent_structure_before_threshold * 100.0,
894        ));
895    }
896
897    Ok(ClaimAlignmentReport {
898        secom_supports: vec![
899            format!(
900                "episode compression: {:.1}% reduction versus the raw-boundary episode baseline",
901                delta_episode
902            ),
903            episode_precision_text,
904            format!(
905                "investigation load reduction: {:.1}% versus Numeric-only DSA",
906                delta_investigation
907            ),
908        ],
909        secom_does_not_support: vec![
910            "DSFB is an observer-only, read-only, non-intrusive monitoring layer; it does not replace EWMA, thresholds, or DSA".into(),
911            "DSFB augments and reinforces existing detection methods, making them more effective rather than competing with them".into(),
912        ],
913        phm2018_supports: phm_supports,
914        claims_not_made: vec![
915            "any unsupported delta without naming its baseline".into(),
916            "universal dominance over scalar baselines".into(),
917            "replacement of existing detection methods; DSFB is additive and non-intrusive only".into(),
918            "PHM burden reduction without direct PHM burden metrics".into(),
919        ],
920    })
921}
922
923fn heuristic_alert_default(heuristic_name: &str) -> HeuristicAlertClass {
924    heuristic_policy_definition(heuristic_name)
925        .map(|definition| definition.alert_class_default)
926        .or_else(|| {
927            expanded_semantic_policy_definitions()
928                .into_iter()
929                .find(|definition| definition.motif_name == heuristic_name)
930                .map(|definition| definition.alert_class_default)
931        })
932        .unwrap_or(HeuristicAlertClass::Silent)
933}
934
935fn resolve_secom_run_dir(secom_run_dir: Option<&Path>, output_root: &Path) -> Result<PathBuf> {
936    if let Some(path) = secom_run_dir {
937        return Ok(path.to_path_buf());
938    }
939    let candidates = [
940        output_root.to_path_buf(),
941        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("output-dsfb-semiconductor"),
942        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
943            .parent()
944            .and_then(Path::parent)
945            .map(Path::to_path_buf)
946            .unwrap_or_else(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")))
947            .join("output-dsfb-semiconductor"),
948    ];
949    for root in candidates {
950        if let Some(path) = latest_run_dir(&root, "_secom")? {
951            return Ok(path);
952        }
953    }
954    Err(DsfbSemiconductorError::DatasetFormat(
955        "could not resolve a SECOM run directory for claim alignment".into(),
956    ))
957}
958
/// Returns the matching run directory directly under `root` whose path sorts
/// last, or `None` when there is no match.
///
/// A directory matches when its name contains `dsfb-semiconductor` and ends
/// with `suffix`. Names that are not valid UTF-8 never match. A `root` that
/// cannot be read, and individual entries that fail to read, are silently
/// ignored, so this function currently never returns `Err`.
fn latest_run_dir(root: &Path, suffix: &str) -> Result<Option<PathBuf>> {
    let Ok(entries) = fs::read_dir(root) else {
        return Ok(None);
    };
    let newest = entries
        .flatten()
        .map(|entry| entry.path())
        .filter(|candidate| {
            candidate.is_dir()
                && candidate
                    .file_name()
                    .and_then(|raw| raw.to_str())
                    .is_some_and(|dir_name| {
                        dir_name.contains("dsfb-semiconductor") && dir_name.ends_with(suffix)
                    })
        })
        // The greatest path in `PathBuf` ordering is what sort-then-pop would yield.
        .max();
    Ok(newest)
}
975
976fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<()> {
977    let file = File::create(path)?;
978    serde_json::to_writer_pretty(file, value)?;
979    Ok(())
980}
981
982fn load_json<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
983    let file = File::open(path)?;
984    Ok(serde_json::from_reader(file)?)
985}
986
987fn write_serialized_csv<T: Serialize>(path: &Path, rows: &[T]) -> Result<()> {
988    let mut writer = csv::Writer::from_path(path)?;
989    for row in rows {
990        writer.serialize(row)?;
991    }
992    writer.flush()?;
993    Ok(())
994}
995
996fn phm2018_tex_report(
997    early_warning: &Phm2018EarlyWarningStats,
998    structural: &Phm2018StructuralMetrics,
999    claim: &ClaimAlignmentReport,
1000    training_run_count: usize,
1001    figure_files: &[String],
1002) -> String {
1003    let fmt_f64 = |v: Option<f64>| {
1004        v.map(|x| format!("{x:.2}"))
1005            .unwrap_or_else(|| "n/a".into())
1006    };
1007    let esc = |s: &str| {
1008        s.replace('_', "\\_")
1009            .replace('&', "\\&")
1010            .replace('%', "\\%")
1011            .replace('#', "\\#")
1012            .replace('$', "\\$")
1013            // math symbols — added after $-escaping so these new $ aren't re-escaped
1014            .replace('≥', "$\\geq$")
1015            .replace('≤', "$\\leq$")
1016            .replace('→', "$\\to$")
1017            .replace('←', "$\\leftarrow$")
1018            .replace('×', "$\\times$")
1019            .replace('±', "$\\pm$")
1020            .replace('∞', "$\\infty$")
1021    };
1022    let list_items = |items: &[String]| -> String {
1023        items
1024            .iter()
1025            .map(|s| format!("  \\item {}\n", esc(s)))
1026            .collect::<String>()
1027    };
1028
1029    let fig_caption = |file: &str| match file {
1030        "phm_lead_before_fault.png" =>
1031            "Lead time before fault: DSFB (blue) vs run-energy threshold (red) per run. \
1032             Bars show timestamp units before fault; taller bar = earlier detection. \
1033             Runs where detection did not occur are omitted.",
1034        "phm_lead_delta_per_run.png" =>
1035            "Lead time delta per run: positive (green) means DSFB detected earlier than \
1036             the threshold baseline; negative (red) means the threshold was earlier. \
1037             Grey bars indicate no comparable detection pair for that run.",
1038        "phm_structural_emergence.png" =>
1039            "Detection outcome summary across all PHM2018 training runs. \
1040             `DSFB earlier' counts runs where the DSFB DSA alert preceded the threshold; \
1041             `Threshold earlier' counts the converse; \
1042             `Tied' means simultaneous detection; \
1043             `DSFB only' and `Threshold only' count single-side detections; \
1044             `Neither detected' counts runs with no alert before fault.",
1045        _ => "Crate-generated figure.",
1046    };
1047    let figures_section = if figure_files.is_empty() {
1048        String::new()
1049    } else {
1050        let mut s = "\\section{Figures}\n\n".to_string();
1051        for file in figure_files {
1052            s.push_str(&format!(
1053                "\\begin{{figure}}[htbp]\n\
1054                 \\centering\n\
1055                 \\includegraphics[width=0.95\\linewidth]{{figures/{}}}\n\
1056                 \\caption{{{}}}\n\
1057                 \\end{{figure}}\n\n",
1058                file,
1059                fig_caption(file),
1060            ));
1061        }
1062        s
1063    };
1064
1065    format!(
1066        r#"\documentclass{{article}}
1067\usepackage[utf8]{{inputenc}}
1068\usepackage[margin=1in]{{geometry}}
1069\usepackage{{graphicx}}
1070\usepackage{{booktabs}}
1071\usepackage{{hyperref}}
1072\usepackage{{parskip}}
1073\title{{DSFB PHM 2018 Engineering Report}}
1074\author{{DSFB Semiconductor Companion Crate}}
1075\date{{\today}}
1076\begin{{document}}
1077\maketitle
1078
1079\section{{Dataset}}
1080\begin{{itemize}}
1081  \item Dataset: PHM 2018 Ion Mill Etch (train split)
1082  \item Training runs processed: {training_run_count}
1083  \item Evidence class: trajectory-level structural emergence comparison against a univariate run-energy threshold baseline
1084  \item Non-claim: this run does not establish SEMI compliance, production readiness, or universal early-warning superiority
1085\end{{itemize}}
1086
1087\section{{Structural Co-occurrence Statistics}}
1088\begin{{itemize}}
1089  \item Baseline: \texttt{{{baseline}}}
1090  \item Total runs: {total_runs}
1091  \item Runs with co-occurring DSFB and baseline detection: {comparable_runs}
1092  \item Mean baseline-minus-DSFB emergence gap (seconds): {mean_lead}
1093  \item Median baseline-minus-DSFB emergence gap (seconds): {median_lead}
1094\end{{itemize}}
1095
1096\section{{Structural Emergence}}
1097\begin{{itemize}}
1098  \item Runs with structured emergence before threshold: {struct_before}/{comparable_struct} ({pct_struct:.1}\%)
1099  \item Mean structure-emergence minus threshold delta (seconds): {mean_struct_delta}
1100  \item Median structure-emergence minus threshold delta (seconds): {median_struct_delta}
1101\end{{itemize}}
1102
1103\section{{Claim Alignment}}
1104
1105\subsection{{SECOM supports}}
1106\begin{{itemize}}
1107{secom_supports}\end{{itemize}}
1108
1109\subsection{{Design boundaries (observer layer)}}
1110\begin{{itemize}}
1111{secom_does_not}
1112\end{{itemize}}
1113
1114\subsection{{PHM 2018 supports}}
1115\begin{{itemize}}
1116{phm_supports}\end{{itemize}}
1117
1118\subsection{{Claims not made}}
1119\begin{{itemize}}
1120{claims_not_made}\end{{itemize}}
1121
1122\section{{Interpretation}}
1123This report covers bounded public-data evidence only.
1124The PHM 2018 dataset contains labeled fault times for ion mill etch training runs.
1125DSFB operates as an observer-only, read-only, non-intrusive monitoring layer.
1126It does not replace, override, or compete with existing detection methods such as
1127EWMA, run-energy thresholds, or DSA---it augments them, making each existing
1128method more effective by providing an additional structural evidence layer.
1129The structural co-occurrence comparison against a univariate run-energy threshold
1130baseline is presented as observational context, not as a competitive benchmark.
1131No claims of superiority, replacement, or deployment-ready dominance are made.
1132
1133{figures_section}
1134\end{{document}}
1135"#,
1136        training_run_count = training_run_count,
1137        baseline = esc(&early_warning.threshold_baseline),
1138        total_runs = early_warning.total_runs,
1139        comparable_runs = early_warning.comparable_runs,
1140        mean_lead = fmt_f64(early_warning.mean_lead_delta),
1141        median_lead = fmt_f64(early_warning.median_lead_delta),
1142        struct_before = structural.runs_with_structure_before_threshold,
1143        comparable_struct = structural.comparable_structure_runs,
1144        pct_struct = structural.percent_structure_before_threshold * 100.0,
1145        mean_struct_delta = fmt_f64(structural.mean_structure_minus_threshold_delta),
1146        median_struct_delta = fmt_f64(structural.median_structure_minus_threshold_delta),
1147        secom_supports = list_items(&claim.secom_supports),
1148        secom_does_not = list_items(&claim.secom_does_not_support),
1149        phm_supports = list_items(&claim.phm2018_supports),
1150        claims_not_made = list_items(&claim.claims_not_made),
1151        figures_section = figures_section,
1152    )
1153}
1154
/// Returns `numerator / denominator` as a floating-point ratio, or `0.0` when
/// `denominator` is zero.
///
/// Despite the name, the result is a fraction (e.g. `0.25`), not a 0-100
/// percentage.
fn percent(numerator: usize, denominator: usize) -> f64 {
    match denominator {
        0 => 0.0,
        total => numerator as f64 / total as f64,
    }
}