use crate::error::{PeacoQCError, Result};
use crate::qc::peaks::{ChannelPeakFrame, PeakInfo};
use crate::stats::median_mad::{MAD_SCALE_FACTOR, median_mad_scaled};
use crate::stats::spline::smooth_spline;
use std::collections::HashMap;
/// Configuration for the MAD-based outlier-bin detection step.
#[derive(Debug, Clone, PartialEq)]
pub struct MADConfig {
/// Number of scaled MADs a smoothed trajectory value may deviate from
/// the trajectory median before its bin is flagged as an outlier.
pub mad_threshold: f64,
/// Smoothing parameter forwarded to `smooth_spline`; values <= 0.0
/// skip smoothing entirely (see `smooth_peak_trajectory`).
pub smooth_param: f64,
}
impl Default for MADConfig {
fn default() -> Self {
Self {
mad_threshold: 6.0,
smooth_param: 0.5, }
}
}
/// Result of `mad_outlier_method`.
#[derive(Debug, Clone)]
pub struct MADResult {
/// One flag per bin; `true` means at least one cluster trajectory
/// marked the bin as an outlier.
pub outlier_bins: Vec<bool>,
/// Outlier percentage per channel name (share of all bins flagged,
/// summed across that channel's clusters).
pub contribution: HashMap<String, f64>,
}
/// Smooth a per-bin peak trajectory with a spline so single-bin noise
/// does not dominate the downstream MAD comparison.
///
/// Returns the input unchanged when there are fewer than 3 points or
/// smoothing is disabled (`smooth_param <= 0.0`), and falls back to the
/// raw values if the spline fit fails.
///
/// Setting the `PEACOQC_DEBUG_SPLINE` environment variable prints
/// before/after diagnostics to stderr.
fn smooth_peak_trajectory(peak_values: &[f64], smooth_param: f64) -> Vec<f64> {
    let n = peak_values.len();
    if n < 3 || smooth_param <= 0.0 {
        return peak_values.to_vec();
    }
    // Evenly spaced abscissa 1..=n: bins are equidistant in acquisition time.
    let x: Vec<f64> = (1..=n).map(|i| i as f64).collect();
    // Query the environment once instead of at every debug site.
    let debug = std::env::var("PEACOQC_DEBUG_SPLINE").is_ok();
    if debug {
        let y_min = peak_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let y_max = peak_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let y_range = y_max - y_min;
        eprintln!(
            "Smoothing trajectory: n={}, y_range={:.4}, spar={:.3}, first={:.4}, last={:.4}",
            n, y_range, smooth_param,
            peak_values.first().copied().unwrap_or(0.0),
            peak_values.last().copied().unwrap_or(0.0)
        );
    }
    match smooth_spline(&x, peak_values, smooth_param) {
        Ok(smoothed) => {
            if debug {
                let max_diff = peak_values.iter().zip(smoothed.iter())
                    .map(|(a, b)| (a - b).abs())
                    .fold(0.0f64, f64::max);
                let mean_diff = peak_values.iter().zip(smoothed.iter())
                    .map(|(a, b)| (a - b).abs())
                    .sum::<f64>() / n as f64;
                eprintln!(
                    "Smoothing result: max_diff={:.4}, mean_diff={:.4}, smoothed_range={:.4}",
                    max_diff, mean_diff,
                    smoothed.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b)) -
                    smoothed.iter().fold(f64::INFINITY, |a, &b| a.min(b))
                );
            }
            smoothed
        }
        Err(e) => {
            // Fail soft: smoothing is a refinement, not a requirement.
            eprintln!("Spline smoothing failed: {:?}, returning original", e);
            peak_values.to_vec()
        }
    }
}
/// Flag bins of one peak trajectory whose *smoothed* value deviates from
/// the trajectory median by more than `mad_threshold` scaled MADs.
///
/// Returns one `bool` per input value (`true` = outlier). Trajectories
/// with fewer than 3 points, or with zero MAD (near-constant data),
/// yield no outliers.
///
/// # Errors
/// Propagates any error from `median_mad_scaled`.
fn mad_outliers_single_channel(
    peak_values: &[f64],
    mad_threshold: f64,
    smooth_param: f64,
) -> Result<Vec<bool>> {
    // Too few points for a meaningful median/MAD estimate.
    if peak_values.len() < 3 {
        return Ok(vec![false; peak_values.len()]);
    }
    let smoothed = smooth_peak_trajectory(peak_values, smooth_param);
    let (median, mad) = median_mad_scaled(&smoothed)?;
    // Zero MAD: the threshold band collapses to a single point, so
    // declaring outliers would be meaningless.
    if mad == 0.0 {
        return Ok(vec![false; peak_values.len()]);
    }
    let upper_interval = median + mad_threshold * mad;
    let lower_interval = median - mad_threshold * mad;
    if std::env::var("PEACOQC_DEBUG_MAD").is_ok() {
        let n_outliers = smoothed.iter()
            .filter(|&&y| y > upper_interval || y < lower_interval)
            .count();
        eprintln!(
            "MAD thresholds: median={:.4}, mad={:.4}, upper={:.4}, lower={:.4}, outliers={}/{}",
            median, mad, upper_interval, lower_interval, n_outliers, smoothed.len()
        );
        for (i, &y) in smoothed.iter().take(10).enumerate() {
            let is_outlier = y > upper_interval || y < lower_interval;
            // Absolute deviation in units of the scaled MAD; equivalent
            // to the branchy (y>median ? y-median : median-y) form.
            let deviation = (y - median).abs() / mad;
            eprintln!(
                " [{}] smoothed={:.4}, deviation={:.2} MADs, outlier={}",
                i, y, deviation, is_outlier
            );
        }
    }
    // Note: the decision is made on the smoothed values, and indices
    // line up one-to-one with `peak_values`.
    let outliers: Vec<bool> = smoothed
        .iter()
        .map(|&y| y > upper_interval || y < lower_interval)
        .collect();
    Ok(outliers)
}
/// Detect outlier bins across all channels with the MAD method.
///
/// For every channel and every peak cluster within it, a per-bin
/// trajectory of peak positions is built (bins without a detected peak
/// fall back to the cluster median), restricted to the bins selected by
/// `existing_outliers`, then screened with
/// `mad_outliers_single_channel`. A bin is an outlier in the final
/// result if any cluster flags it.
///
/// # Arguments
/// * `peak_results` - detected peaks per channel.
/// * `existing_outliers` - per-bin mask; only bins marked `true` take
///   part in the analysis. NOTE(review): despite the name, `true` here
///   appears to mean "analyze this bin" — confirm against the caller.
/// * `n_bins` - total number of bins per trajectory.
/// * `config` - MAD threshold and smoothing settings.
///
/// # Errors
/// `PeacoQCError::NoPeaksDetected` when `peak_results` is empty or
/// yields no trajectories; otherwise propagates errors from the
/// per-channel MAD screening.
pub fn mad_outlier_method(
    peak_results: &HashMap<String, ChannelPeakFrame>,
    existing_outliers: &[bool],
    n_bins: usize,
    config: &MADConfig,
) -> Result<MADResult> {
    if peak_results.is_empty() {
        return Err(PeacoQCError::NoPeaksDetected);
    }
    // Read the debug flag once for the whole run.
    let debug_trajectory = std::env::var("PEACOQC_DEBUG_TRAJECTORY").is_ok();
    // (channel, cluster id, per-bin trajectory) triples.
    let mut cluster_trajectories: Vec<(String, usize, Vec<f64>)> = Vec::new();
    // Sort channel names so processing order (and debug output) is stable.
    let mut channel_names: Vec<&String> = peak_results.keys().collect();
    channel_names.sort();
    for channel in channel_names {
        let peak_frame = &peak_results[channel];
        // Group this channel's peaks by cluster id.
        let mut clusters: HashMap<usize, Vec<&PeakInfo>> = HashMap::new();
        for peak in &peak_frame.peaks {
            clusters.entry(peak.cluster).or_default().push(peak);
        }
        // Visit clusters in sorted order: HashMap iteration order is
        // unspecified, which previously made trajectory ordering and
        // debug output nondeterministic between runs.
        let mut cluster_ids: Vec<usize> = clusters.keys().copied().collect();
        cluster_ids.sort_unstable();
        for cluster_id in cluster_ids {
            let cluster_peaks = &clusters[&cluster_id];
            let n_peaks = cluster_peaks.len();
            let cluster_values: Vec<f64> = cluster_peaks.iter().map(|p| p.peak_value).collect();
            // Mean fallback if the median computation fails.
            let cluster_median = crate::stats::median(&cluster_values)
                .unwrap_or_else(|_| cluster_values.iter().sum::<f64>() / cluster_values.len() as f64);
            // Bins without a detected peak default to the cluster median,
            // so they can never look like outliers on their own.
            let mut trajectory = vec![cluster_median; n_bins];
            for peak in cluster_peaks {
                if peak.bin < n_bins {
                    trajectory[peak.bin] = peak.peak_value;
                }
            }
            if debug_trajectory {
                let n_bins_with_peaks = trajectory.iter()
                    .take(n_bins)
                    .filter(|&&v| (v - cluster_median).abs() > 1e-10)
                    .count();
                eprintln!(
                    "Trajectory: channel={}, cluster={}, n_peaks={}, cluster_median={:.4}, bins_with_peaks={}/{}",
                    channel, cluster_id, n_peaks, cluster_median, n_bins_with_peaks, n_bins
                );
                if n_bins > 10 {
                    eprintln!(" First 5: {:?}", &trajectory[0..5]);
                    eprintln!(" Last 5: {:?}", &trajectory[n_bins-5..n_bins]);
                }
            }
            cluster_trajectories.push((channel.clone(), cluster_id, trajectory));
        }
    }
    if cluster_trajectories.is_empty() {
        return Err(PeacoQCError::NoPeaksDetected);
    }
    let mut outlier_bins_per_cluster: Vec<Vec<bool>> = Vec::new();
    // Accumulated outlier percentage per channel (summed over clusters).
    let mut contribution = HashMap::new();
    for (channel, _cluster_id, trajectory) in &cluster_trajectories {
        // Keep only the bins selected for analysis.
        let filtered_trajectory: Vec<f64> = trajectory
            .iter()
            .enumerate()
            .filter_map(|(bin_idx, &value)| {
                if bin_idx < existing_outliers.len() && existing_outliers[bin_idx] {
                    Some(value)
                } else {
                    None
                }
            })
            .collect();
        if debug_trajectory {
            eprintln!(
                "Filtered trajectory: channel={}, cluster={}, original_len={}, filtered_len={}",
                channel, _cluster_id, trajectory.len(), filtered_trajectory.len()
            );
            if filtered_trajectory.len() > 10 {
                eprintln!(" First 5 filtered: {:?}", &filtered_trajectory[0..5]);
                eprintln!(" Last 5 filtered: {:?}", &filtered_trajectory[filtered_trajectory.len()-5..]);
            }
        }
        // Too few selected bins for a meaningful MAD estimate.
        if filtered_trajectory.len() < 3 {
            continue;
        }
        let cluster_outliers = mad_outliers_single_channel(
            &filtered_trajectory,
            config.mad_threshold,
            config.smooth_param,
        )?;
        // Scatter the filtered-space flags back onto the full bin range,
        // walking the mask in the same order the filter used.
        let mut full_outliers = vec![false; n_bins];
        let mut filtered_idx = 0;
        for bin_idx in 0..n_bins {
            if bin_idx < existing_outliers.len() && existing_outliers[bin_idx] {
                if filtered_idx < cluster_outliers.len() && cluster_outliers[filtered_idx] {
                    full_outliers[bin_idx] = true;
                }
                filtered_idx += 1;
            }
        }
        // Count before pushing by move — avoids the previous redundant clone.
        let n_outliers: usize = full_outliers.iter().filter(|&&x| x).count();
        outlier_bins_per_cluster.push(full_outliers);
        // Contribution is measured against ALL bins, not only analyzed ones.
        let contrib_pct = (n_outliers as f64 / n_bins as f64) * 100.0;
        contribution
            .entry(channel.clone())
            .and_modify(|e| *e += contrib_pct)
            .or_insert(contrib_pct);
    }
    // Union across clusters: a bin is an outlier if any cluster says so.
    let mut outlier_bins = vec![false; n_bins];
    for cluster_outliers in &outlier_bins_per_cluster {
        for (bin_idx, &is_outlier) in cluster_outliers.iter().enumerate() {
            if is_outlier {
                outlier_bins[bin_idx] = true;
            }
        }
    }
    let total_outliers = outlier_bins.iter().filter(|&&x| x).count();
    eprintln!(
        "MAD detected {} outlier bins ({:.1}%) using scale factor {}",
        total_outliers,
        (total_outliers as f64 / n_bins as f64) * 100.0,
        MAD_SCALE_FACTOR
    );
    Ok(MADResult {
        outlier_bins,
        contribution,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::qc::peaks::PeakInfo;

    /// A linear ramp should keep its trend and stay in range after smoothing.
    #[test]
    fn test_smooth_peak_trajectory() {
        let values: Vec<f64> = (0..20).map(|i| 100.0 + 2.0 * i as f64).collect();
        let smoothed = smooth_peak_trajectory(&values, 0.5);
        assert_eq!(smoothed.len(), values.len());
        assert!(smoothed[0] < smoothed[19], "Trend should be preserved");
        let mid = smoothed[10];
        assert!(
            mid > 100.0 && mid < 150.0,
            "Mid value {} should be reasonable",
            mid
        );
    }

    /// One extreme spike in an otherwise gentle ramp must be flagged.
    #[test]
    fn test_mad_outliers_single_channel() {
        let values: Vec<f64> = (0..50)
            .map(|i| if i == 25 { 10000.0 } else { 100.0 + (i as f64) * 0.1 })
            .collect();
        let outliers = mad_outliers_single_channel(&values, 3.0, 0.2).unwrap();
        assert_eq!(outliers.len(), values.len());
        assert!(
            outliers.iter().any(|&flag| flag),
            "Should detect at least one outlier near the extreme spike"
        );
    }

    /// End-to-end: a spike at bin 25 should yield flagged bins and a
    /// non-zero contribution for the channel.
    #[test]
    fn test_mad_outliers() {
        let peaks: Vec<PeakInfo> = (0..50)
            .map(|bin| PeakInfo {
                bin,
                peak_value: if bin == 25 {
                    10000.0
                } else {
                    100.0 + (bin as f64) * 0.5
                },
                cluster: 1,
            })
            .collect();
        let mut peak_results = HashMap::new();
        peak_results.insert("FL1-A".to_string(), ChannelPeakFrame { peaks });
        let existing_outliers = vec![true; 50];
        let config = MADConfig {
            mad_threshold: 3.0,
            smooth_param: 0.2,
        };
        let result = mad_outlier_method(&peak_results, &existing_outliers, 50, &config).unwrap();
        assert!(
            result.outlier_bins.iter().any(|&flag| flag),
            "Should detect outlier bins near the extreme spike at bin 25"
        );
        assert!(result.contribution.get("FL1-A").unwrap() > &0.0);
    }

    /// Perfectly flat data has zero MAD and must produce no outliers.
    #[test]
    fn test_mad_no_outliers_stable_data() {
        let mut peaks = Vec::new();
        for bin in 0..50 {
            peaks.push(PeakInfo {
                bin,
                peak_value: 100.0,
                cluster: 1,
            });
        }
        let mut peak_results = HashMap::new();
        peak_results.insert("FL1-A".to_string(), ChannelPeakFrame { peaks });
        let existing_outliers = vec![true; 50];
        let config = MADConfig::default();
        let result = mad_outlier_method(&peak_results, &existing_outliers, 50, &config).unwrap();
        let flagged = result.outlier_bins.iter().filter(|&&b| b).count();
        assert_eq!(flagged, 0, "Stable data should have no outliers");
    }
}