1use crate::baselines::compute_baselines;
2use crate::config::PipelineConfig;
3use crate::dataset::phm2018::support_status;
4use crate::error::{DsfbSemiconductorError, Result};
5use crate::grammar::evaluate_grammar;
6use crate::heuristics::{
7 expanded_semantic_policy_definitions, heuristic_policy_definition, HeuristicAlertClass,
8};
9use crate::nominal::build_nominal_model;
10use crate::output_paths::{compile_pdf, create_timestamped_run_dir, zip_directory};
11use crate::plots::generate_phm2018_figures;
12use crate::precursor::evaluate_dsa;
13use crate::preprocessing::{DatasetSummary, PreparedDataset};
14use crate::residual::compute_residuals;
15use crate::semiotics::{build_semantic_layer, classify_motifs};
16use crate::signs::compute_signs;
17use chrono::{DateTime, Utc};
18use serde::{Deserialize, Serialize};
19use std::collections::BTreeMap;
20use std::fs::{self, File};
21use std::io::{BufRead, BufReader};
22use std::path::{Path, PathBuf};
23
/// Number of leading CSV columns skipped before the sensor feature columns begin.
const PHM_SENSOR_COLUMN_START: usize = 7;
/// Approximate cap on aggregated points per run; the aggregation bucket size is the
/// estimated row count divided by this value (at least 1).
const PHM_MAX_AGGREGATED_POINTS: usize = 4096;
/// Value used for `DsaConfig::window` on PHM 2018 runs.
const PHM_SELECTED_DSA_WINDOW: usize = 8;
/// Value used for `DsaConfig::persistence_runs` on PHM 2018 runs.
const PHM_SELECTED_DSA_PERSISTENCE: usize = 2;
/// Value used for `DsaConfig::alert_tau` on PHM 2018 runs.
const PHM_SELECTED_DSA_TAU: f64 = 1.5;
/// Value used for `DsaConfig::corroborating_feature_count_min` on PHM 2018 runs.
const PHM_SELECTED_DSA_M: usize = 1;
/// Name of the scalar baseline the DSFB results are compared against; echoed into
/// the emitted statistics and claim text.
const PHM_THRESHOLD_BASELINE: &str = "run_energy_scalar_threshold";
31
/// Filesystem locations of the artifacts produced by one `run_phm2018_benchmark` call.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018RunArtifacts {
    /// Timestamped run directory that holds every artifact below.
    pub run_dir: PathBuf,
    /// `phm2018_lead_time_metrics.csv` (one `Phm2018LeadTimeRow` per run).
    pub lead_time_metrics_path: PathBuf,
    /// `phm2018_early_warning_stats.json`.
    pub early_warning_stats_path: PathBuf,
    /// `phm2018_structural_metrics.json`.
    pub structural_metrics_path: PathBuf,
    /// `claim_alignment_report.json`.
    pub claim_alignment_report_path: PathBuf,
    /// `artifact_manifest.json`.
    pub manifest_path: PathBuf,
    /// `engineering_report.tex`.
    pub tex_report_path: PathBuf,
    /// PDF path as returned by `compile_pdf`; presumably `None` when compilation
    /// did not produce a PDF (verify against `compile_pdf`).
    pub pdf_path: Option<PathBuf>,
    /// `run_bundle.zip`, the zipped run directory.
    pub zip_path: PathBuf,
}
44
/// One CSV row of per-run detection times for the DSFB alert and the threshold baseline.
///
/// Field order is the serialized column order; do not reorder casually.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018LeadTimeRow {
    /// Run identifier (`{lot}_{tool}` derived from the sensor file name).
    pub run_id: String,
    /// Raw timestamp of the first DSFB primary alert inside the evaluation window, if any.
    pub dsfb_detection_time: Option<i64>,
    /// Raw timestamp of the first run-energy baseline alarm inside the evaluation window, if any.
    pub threshold_detection_time: Option<i64>,
    /// `threshold_detection_time - dsfb_detection_time`; positive when DSFB fired first.
    /// `None` unless both detections exist.
    pub lead_time_delta: Option<i64>,
}
52
/// Aggregate lead-time statistics over all runs, produced by `summarize_phm_lead_times`.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018EarlyWarningStats {
    /// Name of the baseline the DSFB alert is compared against.
    pub threshold_baseline: String,
    /// Number of lead-time rows summarized.
    pub total_runs: usize,
    /// Number of runs that have a `lead_time_delta` (both detections present).
    pub comparable_runs: usize,
    /// Mean `lead_time_delta` over comparable runs; `None` when there are none.
    pub mean_lead_delta: Option<f64>,
    /// Median `lead_time_delta` over comparable runs; `None` when there are none.
    pub median_lead_delta: Option<f64>,
    /// Share of runs classified as DSFB-earlier, as computed by `percent(..)`.
    pub percent_runs_dsfb_earlier: f64,
    /// Share of runs where both detections carry the same timestamp.
    pub percent_runs_equal: f64,
    /// Share of runs classified as DSFB-later by `summarize_phm_lead_times`.
    pub percent_runs_later: f64,
}
64
/// Aggregate structure-emergence statistics, produced by `summarize_phm_structural_metrics`.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018StructuralMetrics {
    /// Name of the baseline the structure emergence is compared against.
    pub threshold_baseline: String,
    /// Number of run details summarized.
    pub total_runs: usize,
    /// Runs that have an `earliest_structured_run_index`.
    pub runs_with_structured_emergence: usize,
    /// Runs that have a `structure_minus_threshold_delta`.
    pub comparable_structure_runs: usize,
    /// Runs whose `structure_minus_threshold_delta` is strictly positive.
    pub runs_with_structure_before_threshold: usize,
    /// `runs_with_structure_before_threshold` relative to `total_runs`, via `percent(..)`.
    pub percent_structure_before_threshold: f64,
    /// Mean `structure_minus_threshold_delta` over comparable runs; `None` when there are none.
    pub mean_structure_minus_threshold_delta: Option<f64>,
    /// Median `structure_minus_threshold_delta` over comparable runs; `None` when there are none.
    pub median_structure_minus_threshold_delta: Option<f64>,
}
76
/// Human-readable claim statements assembled by `build_claim_alignment_report`.
#[derive(Debug, Clone, Serialize)]
pub struct ClaimAlignmentReport {
    /// Statements backed by metrics read from the SECOM run directory.
    pub secom_supports: Vec<String>,
    /// Fixed scoping statements emitted alongside the SECOM claims.
    pub secom_does_not_support: Vec<String>,
    /// Statements derived from the PHM 2018 lead-time and structural statistics.
    pub phm2018_supports: Vec<String>,
    /// Fixed list of claims the report explicitly does not make.
    pub claims_not_made: Vec<String>,
}
84
/// Serialized manifest (`artifact_manifest.json`) listing artifact paths as display strings.
#[derive(Debug, Clone, Serialize)]
struct Phm2018ArtifactManifest {
    /// Dataset tag; written as `"PHM2018"`.
    dataset: String,
    /// Run directory path.
    run_dir: String,
    /// Path of the lead-time CSV.
    lead_time_metrics_path: String,
    /// Path of the early-warning statistics JSON.
    early_warning_stats_path: String,
    /// Path of the structural metrics JSON.
    structural_metrics_path: String,
    /// Path of `phm2018_support_status.json`.
    support_status_path: String,
    /// Path of the claim alignment report JSON.
    claim_alignment_report_path: String,
    /// Path of the zipped run bundle.
    zip_path: String,
}
96
/// Per-run detection details written to `phm2018_run_details.json`.
///
/// `*_run_index` fields index into the aggregated series of the run; the matching
/// `*_time` fields are the raw timestamps at those indices.
#[derive(Debug, Clone, Serialize)]
pub struct Phm2018RunDetail {
    /// Run identifier (`{lot}_{tool}`).
    pub run_id: String,
    /// Fault time taken from the run spec.
    pub fault_time: i64,
    /// Index of the first aggregated timestamp at or after `fault_time`
    /// (last index when no such timestamp exists).
    pub fault_index: usize,
    /// Number of leading points treated as the healthy prefix.
    pub healthy_prefix_count: usize,
    /// Start of the evaluation window: `min(healthy_prefix_count, fault_index)`.
    /// The window is `[evaluation_start_run_index, fault_index)`.
    pub evaluation_start_run_index: usize,
    /// First index in the window with a DSFB primary alert.
    pub dsfb_detection_run_index: Option<usize>,
    /// First index in the window with a run-energy baseline alarm.
    pub threshold_detection_run_index: Option<usize>,
    /// Smallest in-window index among ranked semantic candidates whose heuristic
    /// is not `HeuristicAlertClass::Silent`.
    pub earliest_semantic_run_index: Option<usize>,
    /// `earliest_semantic_run_index`, or else the first in-window motif label that
    /// is not `StableAdmissible`.
    pub earliest_structured_run_index: Option<usize>,
    /// Raw timestamp at `dsfb_detection_run_index`.
    pub dsfb_detection_time: Option<i64>,
    /// Raw timestamp at `threshold_detection_run_index`.
    pub threshold_detection_time: Option<i64>,
    /// Raw timestamp at `earliest_semantic_run_index`.
    pub earliest_semantic_time: Option<i64>,
    /// Raw timestamp at `earliest_structured_run_index`.
    pub earliest_structured_time: Option<i64>,
    /// `threshold_detection_time - dsfb_detection_time`; `None` unless both exist.
    pub lead_time_delta: Option<i64>,
    /// `threshold_detection_time - earliest_structured_time`; `None` unless both exist.
    pub structure_minus_threshold_delta: Option<i64>,
}
115
/// Minimal description of one training run before its sensor data is loaded.
#[derive(Debug, Clone)]
struct Phm2018RunSpec {
    /// Run identifier (`{lot}_{tool}`) derived from the sensor file name.
    run_id: String,
    /// Path of the run's sensor CSV.
    sensor_path: PathBuf,
    /// Fault time from the fault file, or from the TTF file when no fault entry exists.
    fault_time: i64,
}
122
/// Aggregated sensor series of one training run together with its fault location.
#[derive(Debug, Clone)]
struct Phm2018RunSeries {
    /// Run identifier (`{lot}_{tool}`).
    run_id: String,
    /// One averaged, rounded timestamp per aggregation bucket.
    timestamps_raw: Vec<i64>,
    /// Sensor column names (header columns after the skipped leading columns).
    feature_names: Vec<String>,
    /// Per-bucket feature means; `None` where no value parsed in that bucket.
    raw_values: Vec<Vec<Option<f64>>>,
    /// Fault time copied from the run spec.
    fault_time: i64,
    /// Index of the first timestamp at or after `fault_time` (last index if none).
    fault_index: usize,
    /// Number of leading points treated as healthy, from `healthy_prefix_count(..)`.
    healthy_prefix_count: usize,
}
133
/// Runs the PHM 2018 benchmark end to end and writes all artifacts into a fresh
/// timestamped run directory under `output_root`.
///
/// Steps: verify the extracted dataset is present, discover the train runs, run the
/// DSFB pipeline per run, locate the first DSFB / threshold / structural detections
/// inside each run's evaluation window, summarize them, build the claim alignment
/// report from a SECOM run directory, then write CSV/JSON/TeX/manifest artifacts and
/// zip the run directory.
///
/// `secom_run_dir` is used as-is when given; otherwise `resolve_secom_run_dir`
/// searches for one.
///
/// # Errors
/// * `DatasetMissing` when `support_status` reports no extracted dataset.
/// * `DatasetFormat` when no train runs are found.
/// * Any error propagated from loading runs, `evaluate_dsa`, resolving the SECOM
///   directory, writing artifacts, or zipping.
///
/// Figure generation and PDF compilation failures are only logged to stderr.
pub fn run_phm2018_benchmark(
    data_root: &Path,
    output_root: &Path,
    secom_run_dir: Option<&Path>,
) -> Result<Phm2018RunArtifacts> {
    let status = support_status(data_root);
    if !status.extracted_dataset_detected {
        return Err(DsfbSemiconductorError::DatasetMissing {
            dataset: "PHM 2018 ion mill etch",
            path: status.extracted_dataset_path,
        });
    }

    let run_specs = load_phm2018_train_run_specs(&status.extracted_dataset_path)?;
    if run_specs.is_empty() {
        return Err(DsfbSemiconductorError::DatasetFormat(
            "PHM 2018 extracted tree contains no train runs".into(),
        ));
    }

    fs::create_dir_all(output_root)?;
    let run_dir = create_timestamped_run_dir(output_root, "phm2018")?;
    let mut lead_time_rows = Vec::new();
    let mut run_details = Vec::new();

    for run_spec in &run_specs {
        // Per-run pipeline: each stage consumes the outputs of the earlier ones.
        let run = load_phm2018_train_run_series(run_spec)?;
        let config = phm_pipeline_config(run.healthy_prefix_count, run.fault_index);
        let prepared = run.as_prepared_dataset();
        let nominal = build_nominal_model(&prepared, &config);
        let residuals = compute_residuals(&prepared, &nominal);
        let signs = compute_signs(&prepared, &nominal, &residuals, &config);
        let baselines = compute_baselines(&prepared, &nominal, &residuals, &config);
        let grammar = evaluate_grammar(&residuals, &signs, &nominal, &config);
        let motifs = classify_motifs(
            &prepared,
            &nominal,
            &residuals,
            &signs,
            &grammar,
            config.pre_failure_lookback_runs,
        );
        let semantic_layer = build_semantic_layer(
            &prepared,
            &residuals,
            &signs,
            &grammar,
            &motifs,
            &nominal,
            config.pre_failure_lookback_runs,
        );
        let dsa = evaluate_dsa(
            &prepared,
            &nominal,
            &residuals,
            &signs,
            &baselines,
            &grammar,
            &config.dsa,
            config.pre_failure_lookback_runs,
        )?;

        // Detections are only searched in [evaluation_start_run_index, fault_index).
        let evaluation_start_run_index = run.healthy_prefix_count.min(run.fault_index);
        let dsfb_detection_run_index = (evaluation_start_run_index..run.fault_index)
            .find(|&run_index| dsa.run_signals.primary_run_alert[run_index]);
        let threshold_detection_run_index = (evaluation_start_run_index..run.fault_index)
            .find(|&run_index| baselines.run_energy.alarm[run_index]);
        let dsfb_detection_time =
            dsfb_detection_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let threshold_detection_time =
            threshold_detection_run_index.map(|run_index| run.timestamps_raw[run_index]);
        // Earliest in-window semantic candidate whose heuristic is not silent by default.
        let earliest_semantic_run_index = semantic_layer
            .ranked_candidates
            .iter()
            .filter(|row| {
                row.run_index >= evaluation_start_run_index && row.run_index < run.fault_index
            })
            .filter(|row| {
                !matches!(
                    heuristic_alert_default(row.heuristic_name.as_str()),
                    HeuristicAlertClass::Silent
                )
            })
            .map(|row| row.run_index)
            .min();
        // Fall back to the first in-window motif label that is not StableAdmissible.
        let earliest_structured_run_index = earliest_semantic_run_index.or_else(|| {
            motifs
                .traces
                .iter()
                .flat_map(|trace| trace.labels.iter().enumerate())
                .filter(|(run_index, label)| {
                    *run_index >= evaluation_start_run_index
                        && *run_index < run.fault_index
                        && !matches!(label, crate::semiotics::DsfbMotifClass::StableAdmissible)
                })
                .map(|(run_index, _)| run_index)
                .min()
        });
        let earliest_semantic_time =
            earliest_semantic_run_index.map(|run_index| run.timestamps_raw[run_index]);
        let earliest_structured_time =
            earliest_structured_run_index.map(|run_index| run.timestamps_raw[run_index]);
        // Positive delta: DSFB fired before the threshold baseline.
        let lead_time_delta = match (dsfb_detection_time, threshold_detection_time) {
            (Some(dsfb), Some(threshold)) => Some(threshold - dsfb),
            _ => None,
        };
        // Positive delta: structure emerged before the threshold baseline.
        let structure_minus_threshold_delta =
            match (earliest_structured_time, threshold_detection_time) {
                (Some(structure), Some(threshold)) => Some(threshold - structure),
                _ => None,
            };

        lead_time_rows.push(Phm2018LeadTimeRow {
            run_id: run.run_id.clone(),
            dsfb_detection_time,
            threshold_detection_time,
            lead_time_delta,
        });
        run_details.push(Phm2018RunDetail {
            run_id: run.run_id.clone(),
            fault_time: run.fault_time,
            fault_index: run.fault_index,
            healthy_prefix_count: run.healthy_prefix_count,
            evaluation_start_run_index,
            dsfb_detection_run_index,
            threshold_detection_run_index,
            earliest_semantic_run_index,
            earliest_structured_run_index,
            dsfb_detection_time,
            threshold_detection_time,
            earliest_semantic_time,
            earliest_structured_time,
            lead_time_delta,
            structure_minus_threshold_delta,
        });
    }

    let early_warning_stats = summarize_phm_lead_times(&lead_time_rows);
    let structural_metrics = summarize_phm_structural_metrics(&run_details);
    let secom_run_dir = resolve_secom_run_dir(secom_run_dir, output_root)?;
    let claim_alignment_report =
        build_claim_alignment_report(&secom_run_dir, &early_warning_stats, &structural_metrics)?;

    let lead_time_metrics_path = run_dir.join("phm2018_lead_time_metrics.csv");
    let early_warning_stats_path = run_dir.join("phm2018_early_warning_stats.json");
    let structural_metrics_path = run_dir.join("phm2018_structural_metrics.json");
    let claim_alignment_report_path = run_dir.join("claim_alignment_report.json");
    let manifest_path = run_dir.join("artifact_manifest.json");
    let zip_path = run_dir.join("run_bundle.zip");

    write_serialized_csv(&lead_time_metrics_path, &lead_time_rows)?;
    write_json_pretty(&early_warning_stats_path, &early_warning_stats)?;
    write_json_pretty(&structural_metrics_path, &structural_metrics)?;
    write_json_pretty(&run_dir.join("phm2018_support_status.json"), &status)?;
    write_json_pretty(&run_dir.join("phm2018_run_details.json"), &run_details)?;
    write_json_pretty(&claim_alignment_report_path, &claim_alignment_report)?;

    // Figures are best-effort: a failure is logged and the report is built without them.
    let figure_files = match generate_phm2018_figures(
        &run_dir,
        &run_details,
        &early_warning_stats,
        &structural_metrics,
    ) {
        Ok(files) => files,
        Err(e) => {
            eprintln!("[phm2018] Figure generation warning: {e}");
            vec![]
        }
    };
    let tex_report_path = run_dir.join("engineering_report.tex");
    fs::write(
        &tex_report_path,
        phm2018_tex_report(
            &early_warning_stats,
            &structural_metrics,
            &claim_alignment_report,
            run_details.len(),
            &figure_files,
        ),
    )?;
    // PDF compilation is best-effort too; only the first line of the error is logged.
    let (pdf_path, pdf_error) = compile_pdf(&tex_report_path, &run_dir);
    if let Some(ref err) = pdf_error {
        eprintln!("[phm2018] PDF compile warning: {}", err.lines().next().unwrap_or("unknown"));
    }

    write_json_pretty(
        &manifest_path,
        &Phm2018ArtifactManifest {
            dataset: "PHM2018".into(),
            run_dir: run_dir.display().to_string(),
            lead_time_metrics_path: lead_time_metrics_path.display().to_string(),
            early_warning_stats_path: early_warning_stats_path.display().to_string(),
            structural_metrics_path: structural_metrics_path.display().to_string(),
            support_status_path: run_dir
                .join("phm2018_support_status.json")
                .display()
                .to_string(),
            claim_alignment_report_path: claim_alignment_report_path.display().to_string(),
            zip_path: zip_path.display().to_string(),
        },
    )?;
    // NOTE(review): zip_path lives inside run_dir; confirm zip_directory does not
    // try to include the archive it is writing.
    zip_directory(&run_dir, &zip_path)?;

    Ok(Phm2018RunArtifacts {
        run_dir,
        lead_time_metrics_path,
        early_warning_stats_path,
        structural_metrics_path,
        claim_alignment_report_path,
        manifest_path,
        tex_report_path,
        pdf_path,
        zip_path,
    })
}
351
352fn load_phm2018_train_run_specs(extracted_root: &Path) -> Result<Vec<Phm2018RunSpec>> {
353 let train_dir = extracted_root.join("train");
354 let fault_dir = train_dir.join("train_faults");
355 let ttf_dir = train_dir.join("train_ttf");
356
357 let sensor_files = fs::read_dir(&train_dir)?
358 .flatten()
359 .map(|entry| entry.path())
360 .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("csv"))
361 .collect::<Vec<_>>();
362
363 let fault_times = load_fault_times(&fault_dir)?;
364 let ttf_fallbacks = load_ttf_zero_times(&ttf_dir)?;
365 let mut run_specs = Vec::new();
366
367 for sensor_path in sensor_files {
368 let run_id = run_id_from_sensor_path(&sensor_path)?;
369 let fault_time = fault_times
370 .get(&run_id)
371 .copied()
372 .or_else(|| ttf_fallbacks.get(&run_id).copied())
373 .ok_or_else(|| {
374 DsfbSemiconductorError::DatasetFormat(format!(
375 "missing fault target for PHM train run {run_id}"
376 ))
377 })?;
378 run_specs.push(Phm2018RunSpec {
379 run_id,
380 fault_time,
381 sensor_path,
382 });
383 }
384
385 run_specs.sort_by(|left, right| left.run_id.cmp(&right.run_id));
386 Ok(run_specs)
387}
388
389fn load_phm2018_train_run_series(run_spec: &Phm2018RunSpec) -> Result<Phm2018RunSeries> {
390 let (timestamps_raw, feature_names, raw_values) =
391 load_sensor_csv_aggregated(&run_spec.sensor_path)?;
392 if timestamps_raw.is_empty() || feature_names.is_empty() || raw_values.is_empty() {
393 return Err(DsfbSemiconductorError::DatasetFormat(format!(
394 "empty PHM train run {} at {}",
395 run_spec.run_id,
396 run_spec.sensor_path.display()
397 )));
398 }
399 let fault_index = timestamps_raw
400 .iter()
401 .position(|time| *time >= run_spec.fault_time)
402 .unwrap_or_else(|| timestamps_raw.len().saturating_sub(1));
403 let healthy_prefix_count = healthy_prefix_count(fault_index, timestamps_raw.len());
404
405 Ok(Phm2018RunSeries {
406 run_id: run_spec.run_id.clone(),
407 timestamps_raw,
408 feature_names,
409 raw_values,
410 fault_time: run_spec.fault_time,
411 fault_index,
412 healthy_prefix_count,
413 })
414}
415
416impl Phm2018RunSeries {
417 fn as_prepared_dataset(&self) -> PreparedDataset {
418 let run_count = self.raw_values.len();
419 let feature_count = self.feature_names.len();
420 let total_cells = run_count * feature_count;
421 let missing_cells = self
422 .raw_values
423 .iter()
424 .flat_map(|row| row.iter())
425 .filter(|value| value.is_none())
426 .count();
427 let mut per_feature_missing_fraction = vec![0.0; feature_count];
428 for feature_index in 0..feature_count {
429 let missing = self
430 .raw_values
431 .iter()
432 .filter(|row| row[feature_index].is_none())
433 .count();
434 per_feature_missing_fraction[feature_index] = if run_count == 0 {
435 0.0
436 } else {
437 missing as f64 / run_count as f64
438 };
439 }
440
441 let mut labels = vec![-1; run_count];
442 if self.fault_index < labels.len() {
443 labels[self.fault_index] = 1;
444 }
445 let timestamps = self
446 .timestamps_raw
447 .iter()
448 .enumerate()
449 .map(|(index, value)| {
450 DateTime::<Utc>::from_timestamp(*value, 0)
451 .map(|value| value.naive_utc())
452 .unwrap_or_else(|| {
453 DateTime::<Utc>::from_timestamp(index as i64, 0)
454 .expect("valid synthetic timestamp")
455 .naive_utc()
456 })
457 })
458 .collect::<Vec<_>>();
459 let healthy_pass_indices = (0..self.healthy_prefix_count).collect::<Vec<_>>();
460
461 PreparedDataset {
462 feature_names: self.feature_names.clone(),
463 labels,
464 timestamps,
465 raw_values: self.raw_values.clone(),
466 healthy_pass_indices,
467 per_feature_missing_fraction,
468 summary: DatasetSummary {
469 run_count,
470 feature_count,
471 pass_count: run_count.saturating_sub(1),
472 fail_count: 1,
473 dataset_missing_fraction: if total_cells == 0 {
474 0.0
475 } else {
476 missing_cells as f64 / total_cells as f64
477 },
478 healthy_pass_runs_requested: self.healthy_prefix_count,
479 healthy_pass_runs_found: self.healthy_prefix_count,
480 },
481 }
482 }
483}
484
485fn phm_pipeline_config(healthy_prefix_count: usize, fault_index: usize) -> PipelineConfig {
486 PipelineConfig {
487 healthy_pass_runs: healthy_prefix_count.max(2),
488 pre_failure_lookback_runs: fault_index.max(1),
489 dsa: crate::precursor::DsaConfig {
490 window: PHM_SELECTED_DSA_WINDOW,
491 persistence_runs: PHM_SELECTED_DSA_PERSISTENCE,
492 alert_tau: PHM_SELECTED_DSA_TAU,
493 corroborating_feature_count_min: PHM_SELECTED_DSA_M,
494 },
495 ..PipelineConfig::default()
496 }
497}
498
/// Chooses how many leading points of a run are treated as the healthy prefix.
///
/// Starts from 10% of `fault_index` (rounded), bounds that to `25..=200`, then caps
/// it by `max(fault_index, 2)` and finally by `run_len`.
fn healthy_prefix_count(fault_index: usize, run_len: usize) -> usize {
    let tenth_of_prefix = (fault_index as f64 * 0.10).round() as usize;
    let bounded = tenth_of_prefix.max(25).min(200);
    // A floor of 2 is kept even when the fault sits at index 0 or 1.
    let fault_cap = if fault_index < 2 { 2 } else { fault_index };
    bounded.min(fault_cap).min(run_len)
}
506
507fn run_id_from_sensor_path(path: &Path) -> Result<String> {
508 let stem = path
509 .file_stem()
510 .and_then(|stem| stem.to_str())
511 .ok_or_else(|| {
512 DsfbSemiconductorError::DatasetFormat("invalid PHM sensor filename".into())
513 })?;
514 let mut parts = stem.split('_');
515 let lot = parts
516 .next()
517 .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM lot id".into()))?;
518 let tool = parts
519 .next()
520 .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM tool id".into()))?;
521 Ok(format!("{lot}_{tool}"))
522}
523
524fn load_sensor_csv_aggregated(
525 path: &Path,
526) -> Result<(Vec<i64>, Vec<String>, Vec<Vec<Option<f64>>>)> {
527 let total_rows = estimate_csv_data_rows(path)?;
528 let bucket_size = (total_rows / PHM_MAX_AGGREGATED_POINTS).max(1);
529
530 let mut reader = csv::ReaderBuilder::new().from_path(path)?;
531 let header = reader
532 .headers()?
533 .iter()
534 .skip(PHM_SENSOR_COLUMN_START)
535 .map(|name| name.to_string())
536 .collect::<Vec<_>>();
537 let feature_count = header.len();
538
539 let mut timestamps = Vec::new();
540 let mut raw_values = Vec::new();
541 let mut bucket_time_sum = 0f64;
542 let mut bucket_time_count = 0usize;
543 let mut bucket_row_count = 0usize;
544 let mut bucket_sums = vec![0.0; feature_count];
545 let mut bucket_counts = vec![0usize; feature_count];
546
547 for record in reader.records() {
548 let record = record?;
549 let timestamp = record
550 .get(0)
551 .ok_or_else(|| DsfbSemiconductorError::DatasetFormat("missing PHM time field".into()))?
552 .parse::<i64>()
553 .map_err(|err| {
554 DsfbSemiconductorError::DatasetFormat(format!(
555 "invalid PHM time value in {}: {err}",
556 path.display()
557 ))
558 })?;
559 bucket_time_sum += timestamp as f64;
560 bucket_time_count += 1;
561 bucket_row_count += 1;
562
563 for (feature_index, value) in record.iter().skip(PHM_SENSOR_COLUMN_START).enumerate() {
564 if let Ok(parsed) = value.parse::<f64>() {
565 bucket_sums[feature_index] += parsed;
566 bucket_counts[feature_index] += 1;
567 }
568 }
569
570 if bucket_row_count >= bucket_size {
571 finalize_aggregate_bucket(
572 &mut timestamps,
573 &mut raw_values,
574 &mut bucket_time_sum,
575 &mut bucket_time_count,
576 &mut bucket_row_count,
577 &mut bucket_sums,
578 &mut bucket_counts,
579 );
580 }
581 }
582
583 finalize_aggregate_bucket(
584 &mut timestamps,
585 &mut raw_values,
586 &mut bucket_time_sum,
587 &mut bucket_time_count,
588 &mut bucket_row_count,
589 &mut bucket_sums,
590 &mut bucket_counts,
591 );
592
593 Ok((timestamps, header, raw_values))
594}
595
596fn estimate_csv_data_rows(path: &Path) -> Result<usize> {
597 let file = File::open(path)?;
598 let total_bytes = file.metadata()?.len() as f64;
599 let mut reader = BufReader::new(file);
600 let mut sampled_lines = 0usize;
601 let mut sampled_bytes = 0usize;
602 let mut buffer = String::new();
603
604 while sampled_lines < 4096 {
605 buffer.clear();
606 let bytes = reader.read_line(&mut buffer)?;
607 if bytes == 0 {
608 break;
609 }
610 sampled_lines += 1;
611 sampled_bytes += bytes;
612 }
613
614 if sampled_lines == 0 || sampled_bytes == 0 {
615 return Ok(0);
616 }
617
618 let average_bytes_per_line = sampled_bytes as f64 / sampled_lines as f64;
619 let estimated_total_lines = (total_bytes / average_bytes_per_line).round() as usize;
620 Ok(estimated_total_lines.saturating_sub(1))
621}
622
/// Closes the current aggregation bucket and resets the accumulators.
///
/// When the bucket holds at least one row, pushes the rounded mean timestamp onto
/// `timestamps` and one row of per-feature means onto `raw_values` (`None` for a
/// feature with a zero count), then zeroes every accumulator. Does nothing when the
/// bucket is empty.
fn finalize_aggregate_bucket(
    timestamps: &mut Vec<i64>,
    raw_values: &mut Vec<Vec<Option<f64>>>,
    bucket_time_sum: &mut f64,
    bucket_time_count: &mut usize,
    bucket_row_count: &mut usize,
    bucket_sums: &mut [f64],
    bucket_counts: &mut [usize],
) {
    if *bucket_row_count == 0 {
        return;
    }

    let mean_time = *bucket_time_sum / *bucket_time_count as f64;
    timestamps.push(mean_time.round() as i64);

    let mut feature_means = Vec::with_capacity(bucket_sums.len());
    for (sum, count) in bucket_sums.iter().zip(bucket_counts.iter()) {
        feature_means.push(if *count > 0 {
            Some(*sum / *count as f64)
        } else {
            None
        });
    }
    raw_values.push(feature_means);

    *bucket_time_sum = 0.0;
    *bucket_time_count = 0;
    *bucket_row_count = 0;
    for sum in bucket_sums.iter_mut() {
        *sum = 0.0;
    }
    for count in bucket_counts.iter_mut() {
        *count = 0;
    }
}
651
652fn load_fault_times(fault_dir: &Path) -> Result<BTreeMap<String, i64>> {
653 let mut map = BTreeMap::new();
654 for entry in fs::read_dir(fault_dir)?.flatten() {
655 let path = entry.path();
656 if path.extension().and_then(|ext| ext.to_str()) != Some("csv") {
657 continue;
658 }
659 let file_name = path
660 .file_name()
661 .and_then(|name| name.to_str())
662 .unwrap_or_default();
663 let run_id = file_name
664 .split("_train_fault_data")
665 .next()
666 .unwrap_or(file_name)
667 .to_string();
668 let mut reader = csv::ReaderBuilder::new().from_path(&path)?;
669 let mut earliest: Option<i64> = None;
670 for record in reader.records() {
671 let record = record?;
672 let time = record
673 .get(0)
674 .ok_or_else(|| {
675 DsfbSemiconductorError::DatasetFormat(format!(
676 "fault file {} missing time column",
677 path.display()
678 ))
679 })?
680 .parse::<i64>()
681 .map_err(|err| {
682 DsfbSemiconductorError::DatasetFormat(format!(
683 "invalid fault time in {}: {err}",
684 path.display()
685 ))
686 })?;
687 earliest = Some(match earliest {
688 Some(current) => current.min(time),
689 None => time,
690 });
691 }
692 if let Some(time) = earliest {
693 map.insert(run_id, time);
694 }
695 }
696 Ok(map)
697}
698
699fn load_ttf_zero_times(ttf_dir: &Path) -> Result<BTreeMap<String, i64>> {
700 let mut map = BTreeMap::new();
701 for entry in fs::read_dir(ttf_dir)?.flatten() {
702 let path = entry.path();
703 if path.extension().and_then(|ext| ext.to_str()) != Some("csv") {
704 continue;
705 }
706 let run_id = run_id_from_sensor_path(&path)?;
707 let mut reader = csv::ReaderBuilder::new().from_path(&path)?;
708 let mut earliest = None;
709 for record in reader.records() {
710 let record = record?;
711 let time = record
712 .get(0)
713 .ok_or_else(|| {
714 DsfbSemiconductorError::DatasetFormat(format!(
715 "ttf file {} missing time column",
716 path.display()
717 ))
718 })?
719 .parse::<i64>()
720 .map_err(|err| {
721 DsfbSemiconductorError::DatasetFormat(format!(
722 "invalid ttf time in {}: {err}",
723 path.display()
724 ))
725 })?;
726 let has_zero = record
727 .iter()
728 .skip(1)
729 .filter_map(|value| value.parse::<f64>().ok())
730 .any(|value| value <= 0.0);
731 if has_zero {
732 earliest = Some(time);
733 break;
734 }
735 }
736 if let Some(time) = earliest {
737 map.insert(run_id, time);
738 }
739 }
740 Ok(map)
741}
742
743fn summarize_phm_lead_times(rows: &[Phm2018LeadTimeRow]) -> Phm2018EarlyWarningStats {
744 let comparable = rows
745 .iter()
746 .filter_map(|row| row.lead_time_delta.map(|value| value as f64))
747 .collect::<Vec<_>>();
748 let earlier = rows
749 .iter()
750 .filter(
751 |row| match (row.dsfb_detection_time, row.threshold_detection_time) {
752 (Some(dsfb), Some(threshold)) => dsfb < threshold,
753 (Some(_), None) => true,
754 _ => false,
755 },
756 )
757 .count();
758 let equal = rows
759 .iter()
760 .filter(
761 |row| match (row.dsfb_detection_time, row.threshold_detection_time) {
762 (Some(dsfb), Some(threshold)) => dsfb == threshold,
763 _ => false,
764 },
765 )
766 .count();
767 let later = rows.len().saturating_sub(earlier + equal);
768 let mut sorted = comparable.clone();
769 sorted.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
770 let median = if sorted.is_empty() {
771 None
772 } else if sorted.len() % 2 == 1 {
773 Some(sorted[sorted.len() / 2])
774 } else {
775 Some((sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0)
776 };
777
778 Phm2018EarlyWarningStats {
779 threshold_baseline: PHM_THRESHOLD_BASELINE.into(),
780 total_runs: rows.len(),
781 comparable_runs: comparable.len(),
782 mean_lead_delta: (!comparable.is_empty())
783 .then_some(comparable.iter().sum::<f64>() / comparable.len() as f64),
784 median_lead_delta: median,
785 percent_runs_dsfb_earlier: percent(earlier, rows.len()),
786 percent_runs_equal: percent(equal, rows.len()),
787 percent_runs_later: percent(later, rows.len()),
788 }
789}
790
791fn summarize_phm_structural_metrics(run_details: &[Phm2018RunDetail]) -> Phm2018StructuralMetrics {
792 let comparable = run_details
793 .iter()
794 .filter_map(|detail| {
795 detail
796 .structure_minus_threshold_delta
797 .map(|value| value as f64)
798 })
799 .collect::<Vec<_>>();
800 let runs_with_structured_emergence = run_details
801 .iter()
802 .filter(|detail| detail.earliest_structured_run_index.is_some())
803 .count();
804 let runs_with_structure_before_threshold = run_details
805 .iter()
806 .filter(|detail| {
807 detail
808 .structure_minus_threshold_delta
809 .is_some_and(|value| value > 0)
810 })
811 .count();
812 let mut sorted = comparable.clone();
813 sorted.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
814 let median = if sorted.is_empty() {
815 None
816 } else if sorted.len() % 2 == 1 {
817 Some(sorted[sorted.len() / 2])
818 } else {
819 Some((sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0)
820 };
821
822 Phm2018StructuralMetrics {
823 threshold_baseline: PHM_THRESHOLD_BASELINE.into(),
824 total_runs: run_details.len(),
825 runs_with_structured_emergence,
826 comparable_structure_runs: comparable.len(),
827 runs_with_structure_before_threshold,
828 percent_structure_before_threshold: percent(
829 runs_with_structure_before_threshold,
830 run_details.len(),
831 ),
832 mean_structure_minus_threshold_delta: (!comparable.is_empty())
833 .then_some(comparable.iter().sum::<f64>() / comparable.len() as f64),
834 median_structure_minus_threshold_delta: median,
835 }
836}
837
838fn build_claim_alignment_report(
839 secom_run_dir: &Path,
840 phm_stats: &Phm2018EarlyWarningStats,
841 phm_structural: &Phm2018StructuralMetrics,
842) -> Result<ClaimAlignmentReport> {
843 let secom_targets =
844 load_json::<serde_json::Value>(&secom_run_dir.join("dsa_operator_delta_targets.json"))?;
845 let episode_precision =
846 load_json::<serde_json::Value>(&secom_run_dir.join("episode_precision_metrics.json")).ok();
847 let episode_precision_text = episode_precision
848 .as_ref()
849 .and_then(|json| {
850 Some((
851 json.get("dsfb_precision")?.as_f64()?,
852 json.get("raw_alarm_precision")?.as_f64()?,
853 json.get("precision_gain_factor")?.as_f64()?,
854 ))
855 })
856 .map(|(dsfb_precision, raw_precision, gain)| {
857 format!(
858 "episode precision, with DSFB at {:.1}% versus a raw-boundary proxy of {:.2}% ({:.1}x)",
859 dsfb_precision * 100.0,
860 raw_precision * 100.0,
861 gain,
862 )
863 })
864 .unwrap_or_else(|| "episode precision surfaced as the primary operator metric".into());
865 let delta_investigation = secom_targets
866 .get("delta_investigation_load")
867 .and_then(|value| value.as_f64())
868 .unwrap_or_default()
869 * 100.0;
870 let delta_episode = secom_targets
871 .get("delta_episode_count")
872 .and_then(|value| value.as_f64())
873 .unwrap_or_default()
874 * 100.0;
875 let mut phm_supports = Vec::new();
876 phm_supports.push(format!(
877 "structural co-occurrence: DSFB grammar-state emergence observed alongside the {} baseline on {} of {} PHM2018 runs",
878 PHM_THRESHOLD_BASELINE,
879 phm_stats.comparable_runs,
880 phm_stats.total_runs,
881 ));
882 if let Some(mean_delta) = phm_stats.mean_lead_delta {
883 phm_supports.push(format!(
884 "structural co-occurrence timing: mean {}-minus-DSFB emergence gap {:.2} (positive = structure emerged before threshold; negative = after)",
885 PHM_THRESHOLD_BASELINE, mean_delta
886 ));
887 }
888 if let Some(mean_structure_delta) = phm_structural.mean_structure_minus_threshold_delta {
889 phm_supports.push(format!(
890 "structure-emergence comparison: mean {}-minus-structure-emergence gap {:.2}, with structure preceding threshold on {:.1}% of runs",
891 PHM_THRESHOLD_BASELINE,
892 mean_structure_delta,
893 phm_structural.percent_structure_before_threshold * 100.0,
894 ));
895 }
896
897 Ok(ClaimAlignmentReport {
898 secom_supports: vec![
899 format!(
900 "episode compression: {:.1}% reduction versus the raw-boundary episode baseline",
901 delta_episode
902 ),
903 episode_precision_text,
904 format!(
905 "investigation load reduction: {:.1}% versus Numeric-only DSA",
906 delta_investigation
907 ),
908 ],
909 secom_does_not_support: vec![
910 "DSFB is an observer-only, read-only, non-intrusive monitoring layer; it does not replace EWMA, thresholds, or DSA".into(),
911 "DSFB augments and reinforces existing detection methods, making them more effective rather than competing with them".into(),
912 ],
913 phm2018_supports: phm_supports,
914 claims_not_made: vec![
915 "any unsupported delta without naming its baseline".into(),
916 "universal dominance over scalar baselines".into(),
917 "replacement of existing detection methods; DSFB is additive and non-intrusive only".into(),
918 "PHM burden reduction without direct PHM burden metrics".into(),
919 ],
920 })
921}
922
923fn heuristic_alert_default(heuristic_name: &str) -> HeuristicAlertClass {
924 heuristic_policy_definition(heuristic_name)
925 .map(|definition| definition.alert_class_default)
926 .or_else(|| {
927 expanded_semantic_policy_definitions()
928 .into_iter()
929 .find(|definition| definition.motif_name == heuristic_name)
930 .map(|definition| definition.alert_class_default)
931 })
932 .unwrap_or(HeuristicAlertClass::Silent)
933}
934
935fn resolve_secom_run_dir(secom_run_dir: Option<&Path>, output_root: &Path) -> Result<PathBuf> {
936 if let Some(path) = secom_run_dir {
937 return Ok(path.to_path_buf());
938 }
939 let candidates = [
940 output_root.to_path_buf(),
941 PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("output-dsfb-semiconductor"),
942 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
943 .parent()
944 .and_then(Path::parent)
945 .map(Path::to_path_buf)
946 .unwrap_or_else(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")))
947 .join("output-dsfb-semiconductor"),
948 ];
949 for root in candidates {
950 if let Some(path) = latest_run_dir(&root, "_secom")? {
951 return Ok(path);
952 }
953 }
954 Err(DsfbSemiconductorError::DatasetFormat(
955 "could not resolve a SECOM run directory for claim alignment".into(),
956 ))
957}
958
959fn latest_run_dir(root: &Path, suffix: &str) -> Result<Option<PathBuf>> {
960 let mut dirs = fs::read_dir(root)
961 .ok()
962 .into_iter()
963 .flat_map(|entries| entries.flatten())
964 .map(|entry| entry.path())
965 .filter(|path| path.is_dir())
966 .filter(|path| {
967 path.file_name()
968 .and_then(|name| name.to_str())
969 .is_some_and(|name| name.contains("dsfb-semiconductor") && name.ends_with(suffix))
970 })
971 .collect::<Vec<_>>();
972 dirs.sort();
973 Ok(dirs.pop())
974}
975
976fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<()> {
977 let file = File::create(path)?;
978 serde_json::to_writer_pretty(file, value)?;
979 Ok(())
980}
981
982fn load_json<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
983 let file = File::open(path)?;
984 Ok(serde_json::from_reader(file)?)
985}
986
987fn write_serialized_csv<T: Serialize>(path: &Path, rows: &[T]) -> Result<()> {
988 let mut writer = csv::Writer::from_path(path)?;
989 for row in rows {
990 writer.serialize(row)?;
991 }
992 writer.flush()?;
993 Ok(())
994}
995
996fn phm2018_tex_report(
997 early_warning: &Phm2018EarlyWarningStats,
998 structural: &Phm2018StructuralMetrics,
999 claim: &ClaimAlignmentReport,
1000 training_run_count: usize,
1001 figure_files: &[String],
1002) -> String {
1003 let fmt_f64 = |v: Option<f64>| {
1004 v.map(|x| format!("{x:.2}"))
1005 .unwrap_or_else(|| "n/a".into())
1006 };
1007 let esc = |s: &str| {
1008 s.replace('_', "\\_")
1009 .replace('&', "\\&")
1010 .replace('%', "\\%")
1011 .replace('#', "\\#")
1012 .replace('$', "\\$")
1013 .replace('≥', "$\\geq$")
1015 .replace('≤', "$\\leq$")
1016 .replace('→', "$\\to$")
1017 .replace('←', "$\\leftarrow$")
1018 .replace('×', "$\\times$")
1019 .replace('±', "$\\pm$")
1020 .replace('∞', "$\\infty$")
1021 };
1022 let list_items = |items: &[String]| -> String {
1023 items
1024 .iter()
1025 .map(|s| format!(" \\item {}\n", esc(s)))
1026 .collect::<String>()
1027 };
1028
1029 let fig_caption = |file: &str| match file {
1030 "phm_lead_before_fault.png" =>
1031 "Lead time before fault: DSFB (blue) vs run-energy threshold (red) per run. \
1032 Bars show timestamp units before fault; taller bar = earlier detection. \
1033 Runs where detection did not occur are omitted.",
1034 "phm_lead_delta_per_run.png" =>
1035 "Lead time delta per run: positive (green) means DSFB detected earlier than \
1036 the threshold baseline; negative (red) means the threshold was earlier. \
1037 Grey bars indicate no comparable detection pair for that run.",
1038 "phm_structural_emergence.png" =>
1039 "Detection outcome summary across all PHM2018 training runs. \
1040 `DSFB earlier' counts runs where the DSFB DSA alert preceded the threshold; \
1041 `Threshold earlier' counts the converse; \
1042 `Tied' means simultaneous detection; \
1043 `DSFB only' and `Threshold only' count single-side detections; \
1044 `Neither detected' counts runs with no alert before fault.",
1045 _ => "Crate-generated figure.",
1046 };
1047 let figures_section = if figure_files.is_empty() {
1048 String::new()
1049 } else {
1050 let mut s = "\\section{Figures}\n\n".to_string();
1051 for file in figure_files {
1052 s.push_str(&format!(
1053 "\\begin{{figure}}[htbp]\n\
1054 \\centering\n\
1055 \\includegraphics[width=0.95\\linewidth]{{figures/{}}}\n\
1056 \\caption{{{}}}\n\
1057 \\end{{figure}}\n\n",
1058 file,
1059 fig_caption(file),
1060 ));
1061 }
1062 s
1063 };
1064
1065 format!(
1066 r#"\documentclass{{article}}
1067\usepackage[utf8]{{inputenc}}
1068\usepackage[margin=1in]{{geometry}}
1069\usepackage{{graphicx}}
1070\usepackage{{booktabs}}
1071\usepackage{{hyperref}}
1072\usepackage{{parskip}}
1073\title{{DSFB PHM 2018 Engineering Report}}
1074\author{{DSFB Semiconductor Companion Crate}}
1075\date{{\today}}
1076\begin{{document}}
1077\maketitle
1078
1079\section{{Dataset}}
1080\begin{{itemize}}
1081 \item Dataset: PHM 2018 Ion Mill Etch (train split)
1082 \item Training runs processed: {training_run_count}
1083 \item Evidence class: trajectory-level structural emergence comparison against a univariate run-energy threshold baseline
1084 \item Non-claim: this run does not establish SEMI compliance, production readiness, or universal early-warning superiority
1085\end{{itemize}}
1086
1087\section{{Structural Co-occurrence Statistics}}
1088\begin{{itemize}}
1089 \item Baseline: \texttt{{{baseline}}}
1090 \item Total runs: {total_runs}
1091 \item Runs with co-occurring DSFB and baseline detection: {comparable_runs}
1092 \item Mean baseline-minus-DSFB emergence gap (seconds): {mean_lead}
1093 \item Median baseline-minus-DSFB emergence gap (seconds): {median_lead}
1094\end{{itemize}}
1095
1096\section{{Structural Emergence}}
1097\begin{{itemize}}
1098 \item Runs with structured emergence before threshold: {struct_before}/{comparable_struct} ({pct_struct:.1}\%)
1099 \item Mean structure-emergence minus threshold delta (seconds): {mean_struct_delta}
1100 \item Median structure-emergence minus threshold delta (seconds): {median_struct_delta}
1101\end{{itemize}}
1102
1103\section{{Claim Alignment}}
1104
1105\subsection{{SECOM supports}}
1106\begin{{itemize}}
1107{secom_supports}\end{{itemize}}
1108
1109\subsection{{Design boundaries (observer layer)}}
1110\begin{{itemize}}
1111{secom_does_not}
1112\end{{itemize}}
1113
1114\subsection{{PHM 2018 supports}}
1115\begin{{itemize}}
1116{phm_supports}\end{{itemize}}
1117
1118\subsection{{Claims not made}}
1119\begin{{itemize}}
1120{claims_not_made}\end{{itemize}}
1121
1122\section{{Interpretation}}
1123This report covers bounded public-data evidence only.
1124The PHM 2018 dataset contains labeled fault times for ion mill etch training runs.
1125DSFB operates as an observer-only, read-only, non-intrusive monitoring layer.
1126It does not replace, override, or compete with existing detection methods such as
1127EWMA, run-energy thresholds, or DSA---it augments them, making each existing
1128method more effective by providing an additional structural evidence layer.
1129The structural co-occurrence comparison against a univariate run-energy threshold
1130baseline is presented as observational context, not as a competitive benchmark.
1131No claims of superiority, replacement, or deployment-ready dominance are made.
1132
1133{figures_section}
1134\end{{document}}
1135"#,
1136 training_run_count = training_run_count,
1137 baseline = esc(&early_warning.threshold_baseline),
1138 total_runs = early_warning.total_runs,
1139 comparable_runs = early_warning.comparable_runs,
1140 mean_lead = fmt_f64(early_warning.mean_lead_delta),
1141 median_lead = fmt_f64(early_warning.median_lead_delta),
1142 struct_before = structural.runs_with_structure_before_threshold,
1143 comparable_struct = structural.comparable_structure_runs,
1144 pct_struct = structural.percent_structure_before_threshold * 100.0,
1145 mean_struct_delta = fmt_f64(structural.mean_structure_minus_threshold_delta),
1146 median_struct_delta = fmt_f64(structural.median_structure_minus_threshold_delta),
1147 secom_supports = list_items(&claim.secom_supports),
1148 secom_does_not = list_items(&claim.secom_does_not_support),
1149 phm_supports = list_items(&claim.phm2018_supports),
1150 claims_not_made = list_items(&claim.claims_not_made),
1151 figures_section = figures_section,
1152 )
1153}
1154
/// Returns `numerator / denominator` as an `f64`, or `0.0` when `denominator` is zero.
///
/// Despite the name, the result is a plain ratio (e.g. `0.5` for 1 of 2), not a value scaled
/// to 0-100; callers that want a percentage must multiply by 100 themselves.
fn percent(numerator: usize, denominator: usize) -> f64 {
    match denominator {
        0 => 0.0,
        total => numerator as f64 / total as f64,
    }
}