use cyanea_core::{CyaneaError, Result};
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CnvSegment {
pub chrom: String,
pub start: u64,
pub end: u64,
pub log2_ratio: f64,
pub n_probes: usize,
pub copy_number: u32,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct BafSegment {
pub chrom: String,
pub start: u64,
pub end: u64,
pub mean_baf: f64,
pub n_snps: usize,
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SvType {
Deletion,
Duplication,
Inversion,
Translocation,
Insertion,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SvBreakpoint {
pub sv_type: SvType,
pub chrom1: String,
pub pos1: u64,
pub chrom2: String,
pub pos2: u64,
pub split_reads: u32,
pub discordant_pairs: u32,
pub quality: f64,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CbsConfig {
pub alpha: f64,
pub min_probes: usize,
pub n_permutations: usize,
pub seed: u64,
}
impl Default for CbsConfig {
fn default() -> Self {
Self {
alpha: 0.01,
min_probes: 3,
n_permutations: 1000,
seed: 42,
}
}
}
struct Xorshift64 {
state: u64,
}
impl Xorshift64 {
fn new(seed: u64) -> Self {
Self {
state: if seed == 0 { 1 } else { seed },
}
}
fn next_u64(&mut self) -> u64 {
let mut x = self.state;
x ^= x << 13;
x ^= x >> 7;
x ^= x << 17;
self.state = x;
x
}
}
fn cn_from_log2(log2_ratio: f64) -> u32 {
let cn = 2.0_f64.powf(log2_ratio + 1.0);
let rounded = cn.round() as i64;
rounded.clamp(0, 10) as u32
}
pub fn circular_binary_segmentation(
positions: &[u64],
log2_ratios: &[f64],
chrom: &str,
config: &CbsConfig,
) -> Result<Vec<CnvSegment>> {
if positions.len() != log2_ratios.len() {
return Err(CyaneaError::InvalidInput(format!(
"positions length ({}) must match log2_ratios length ({})",
positions.len(),
log2_ratios.len()
)));
}
if positions.is_empty() {
return Err(CyaneaError::InvalidInput(
"positions and log2_ratios must not be empty".into(),
));
}
let mut breakpoints = Vec::new();
let mut rng = Xorshift64::new(config.seed);
cbs_recurse(
log2_ratios,
0,
log2_ratios.len(),
config,
&mut rng,
&mut breakpoints,
);
breakpoints.sort_unstable();
breakpoints.dedup();
let mut boundaries = Vec::with_capacity(breakpoints.len() + 2);
boundaries.push(0usize);
for &bp in &breakpoints {
boundaries.push(bp);
}
boundaries.push(log2_ratios.len());
boundaries.sort_unstable();
boundaries.dedup();
let mut segments = Vec::new();
for w in boundaries.windows(2) {
let s = w[0];
let e = w[1];
if e <= s {
continue;
}
let n_probes = e - s;
let mean_lr: f64 = log2_ratios[s..e].iter().sum::<f64>() / n_probes as f64;
segments.push(CnvSegment {
chrom: chrom.to_string(),
start: positions[s],
end: positions[e - 1] + 1, log2_ratio: mean_lr,
n_probes,
copy_number: cn_from_log2(mean_lr),
});
}
Ok(segments)
}
fn cbs_recurse(
data: &[f64],
start: usize,
end: usize,
config: &CbsConfig,
rng: &mut Xorshift64,
breakpoints: &mut Vec<usize>,
) {
let n = end - start;
if n < 2 * config.min_probes {
return;
}
let slice = &data[start..end];
let (max_t, best_pos) = max_t_stat(slice, config.min_probes);
if max_t <= 0.0 {
return;
}
let mut count_ge = 0usize;
let mut perm_data: Vec<f64> = slice.to_vec();
for _ in 0..config.n_permutations {
for i in (1..perm_data.len()).rev() {
let j = (rng.next_u64() as usize) % (i + 1);
perm_data.swap(i, j);
}
let (perm_t, _) = max_t_stat(&perm_data, config.min_probes);
if perm_t >= max_t {
count_ge += 1;
}
}
let p_value = count_ge as f64 / config.n_permutations as f64;
if p_value < config.alpha {
let split = start + best_pos;
breakpoints.push(split);
cbs_recurse(data, start, split, config, rng, breakpoints);
cbs_recurse(data, split, end, config, rng, breakpoints);
}
}
fn max_t_stat(data: &[f64], min_probes: usize) -> (f64, usize) {
let n = data.len();
if n < 2 * min_probes {
return (0.0, 0);
}
let total_sum: f64 = data.iter().sum();
let total_mean = total_sum / n as f64;
let mut best_t = 0.0_f64;
let mut best_pos = min_probes;
let mut cum_sum = vec![0.0_f64; n + 1];
for i in 0..n {
cum_sum[i + 1] = cum_sum[i] + data[i];
}
for i in min_probes..=(n - min_probes) {
let left_sum = cum_sum[i];
let right_sum = total_sum - left_sum;
let left_n = i as f64;
let right_n = (n - i) as f64;
let left_mean = left_sum / left_n;
let right_mean = right_sum / right_n;
let mut ss = 0.0_f64;
for j in 0..n {
let d = data[j] - total_mean;
ss += d * d;
}
let variance = ss / n as f64;
if variance <= 1e-15 {
continue;
}
let diff = (left_mean - right_mean).abs();
let se = (variance * (1.0 / left_n + 1.0 / right_n)).sqrt();
let t = diff / se;
if t > best_t {
best_t = t;
best_pos = i;
}
}
(best_t, best_pos)
}
pub fn baf_segmentation(
positions: &[u64],
bafs: &[f64],
chrom: &str,
config: &CbsConfig,
) -> Result<Vec<BafSegment>> {
if positions.len() != bafs.len() {
return Err(CyaneaError::InvalidInput(format!(
"positions length ({}) must match bafs length ({})",
positions.len(),
bafs.len()
)));
}
if positions.is_empty() {
return Err(CyaneaError::InvalidInput(
"positions and bafs must not be empty".into(),
));
}
let mirrored: Vec<f64> = bafs.iter().map(|b| (b - 0.5).abs()).collect();
let mut breakpoints = Vec::new();
let mut rng = Xorshift64::new(config.seed);
cbs_recurse(
&mirrored,
0,
mirrored.len(),
config,
&mut rng,
&mut breakpoints,
);
breakpoints.sort_unstable();
breakpoints.dedup();
let mut boundaries = Vec::with_capacity(breakpoints.len() + 2);
boundaries.push(0usize);
for &bp in &breakpoints {
boundaries.push(bp);
}
boundaries.push(bafs.len());
boundaries.sort_unstable();
boundaries.dedup();
let mut segments = Vec::new();
for w in boundaries.windows(2) {
let s = w[0];
let e = w[1];
if e <= s {
continue;
}
let n_snps = e - s;
let mean_baf: f64 = bafs[s..e].iter().sum::<f64>() / n_snps as f64;
segments.push(BafSegment {
chrom: chrom.to_string(),
start: positions[s],
end: positions[e - 1] + 1,
mean_baf,
n_snps,
});
}
Ok(segments)
}
struct SvEvidence {
chrom1: String,
pos1: u64,
chrom2: String,
pos2: u64,
is_split_read: bool,
}
pub fn detect_sv_breakpoints(
discordant_pairs: &[(String, u64, String, u64)],
split_reads: &[(String, u64, String, u64)],
cluster_distance: u64,
) -> Vec<SvBreakpoint> {
let mut evidence: Vec<SvEvidence> = Vec::with_capacity(discordant_pairs.len() + split_reads.len());
for (c1, p1, c2, p2) in discordant_pairs {
evidence.push(SvEvidence {
chrom1: c1.clone(),
pos1: *p1,
chrom2: c2.clone(),
pos2: *p2,
is_split_read: false,
});
}
for (c1, p1, c2, p2) in split_reads {
evidence.push(SvEvidence {
chrom1: c1.clone(),
pos1: *p1,
chrom2: c2.clone(),
pos2: *p2,
is_split_read: true,
});
}
if evidence.is_empty() {
return Vec::new();
}
evidence.sort_by(|a, b| {
a.chrom1
.cmp(&b.chrom1)
.then(a.chrom2.cmp(&b.chrom2))
.then(a.pos1.cmp(&b.pos1))
.then(a.pos2.cmp(&b.pos2))
});
let mut clusters: Vec<Vec<&SvEvidence>> = Vec::new();
let mut current_cluster: Vec<&SvEvidence> = vec![&evidence[0]];
for ev in evidence.iter().skip(1) {
let last = current_cluster.last().unwrap();
let same_chroms = ev.chrom1 == last.chrom1 && ev.chrom2 == last.chrom2;
let close_pos1 = ev.pos1.abs_diff(last.pos1) <= cluster_distance;
let close_pos2 = ev.pos2.abs_diff(last.pos2) <= cluster_distance;
if same_chroms && close_pos1 && close_pos2 {
current_cluster.push(ev);
} else {
clusters.push(current_cluster);
current_cluster = vec![ev];
}
}
clusters.push(current_cluster);
let mut breakpoints = Vec::new();
for cluster in &clusters {
let mut sr_count = 0u32;
let mut dp_count = 0u32;
let mut sum_pos1 = 0u64;
let mut sum_pos2 = 0u64;
for ev in cluster {
if ev.is_split_read {
sr_count += 1;
} else {
dp_count += 1;
}
sum_pos1 += ev.pos1;
sum_pos2 += ev.pos2;
}
let n = cluster.len() as u64;
let mean_pos1 = sum_pos1 / n;
let mean_pos2 = sum_pos2 / n;
let chrom1 = &cluster[0].chrom1;
let chrom2 = &cluster[0].chrom2;
let sv_type = classify_sv(chrom1, mean_pos1, chrom2, mean_pos2);
let quality = sr_count as f64 * 10.0 + dp_count as f64 * 5.0;
breakpoints.push(SvBreakpoint {
sv_type,
chrom1: chrom1.clone(),
pos1: mean_pos1,
chrom2: chrom2.clone(),
pos2: mean_pos2,
split_reads: sr_count,
discordant_pairs: dp_count,
quality,
});
}
breakpoints
}
fn classify_sv(chrom1: &str, pos1: u64, chrom2: &str, pos2: u64) -> SvType {
if chrom1 != chrom2 {
SvType::Translocation
} else if pos2 > pos1 {
SvType::Deletion
} else {
SvType::Inversion
}
}
pub fn merge_cnv_segments(
segments: &[CnvSegment],
max_gap: u64,
cn_tolerance: u32,
) -> Vec<CnvSegment> {
if segments.is_empty() {
return Vec::new();
}
let mut sorted: Vec<CnvSegment> = segments.to_vec();
sorted.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.start.cmp(&b.start)));
let mut merged: Vec<CnvSegment> = Vec::new();
let mut current = sorted[0].clone();
for seg in sorted.iter().skip(1) {
let same_chrom = seg.chrom == current.chrom;
let gap = if seg.start >= current.end {
seg.start - current.end
} else {
0
};
let cn_diff = (seg.copy_number as i64 - current.copy_number as i64).unsigned_abs() as u32;
if same_chrom && gap <= max_gap && cn_diff <= cn_tolerance {
let total_probes = current.n_probes + seg.n_probes;
let weighted_lr = (current.log2_ratio * current.n_probes as f64
+ seg.log2_ratio * seg.n_probes as f64)
/ total_probes as f64;
current.end = seg.end;
current.log2_ratio = weighted_lr;
current.n_probes = total_probes;
current.copy_number = cn_from_log2(weighted_lr);
} else {
merged.push(current);
current = seg.clone();
}
}
merged.push(current);
merged
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn cbs_constant_data() {
let positions: Vec<u64> = (0..20).map(|i| i * 1000).collect();
let log2_ratios = vec![0.0; 20];
let config = CbsConfig::default();
let segments =
circular_binary_segmentation(&positions, &log2_ratios, "chr1", &config).unwrap();
assert_eq!(segments.len(), 1);
assert_eq!(segments[0].n_probes, 20);
assert!((segments[0].log2_ratio - 0.0).abs() < 1e-10);
assert_eq!(segments[0].copy_number, 2); }
#[test]
fn cbs_two_level() {
let n = 40;
let positions: Vec<u64> = (0..n).map(|i| i as u64 * 1000).collect();
let mut log2_ratios = vec![0.0; n];
for i in 20..n {
log2_ratios[i] = 1.0;
}
let config = CbsConfig {
alpha: 0.05,
min_probes: 3,
n_permutations: 500,
seed: 123,
};
let segments =
circular_binary_segmentation(&positions, &log2_ratios, "chr1", &config).unwrap();
assert_eq!(segments.len(), 2);
assert!(segments[0].log2_ratio < 0.5);
assert!(segments[1].log2_ratio > 0.5);
}
#[test]
fn cbs_three_level() {
let n = 60;
let positions: Vec<u64> = (0..n).map(|i| i as u64 * 1000).collect();
let mut log2_ratios = vec![0.0; n];
for i in 20..40 {
log2_ratios[i] = 1.0; }
for i in 40..60 {
log2_ratios[i] = -1.0; }
let config = CbsConfig {
alpha: 0.05,
min_probes: 3,
n_permutations: 500,
seed: 456,
};
let segments =
circular_binary_segmentation(&positions, &log2_ratios, "chr1", &config).unwrap();
assert_eq!(segments.len(), 3);
assert!(segments[0].log2_ratio.abs() < 0.5);
assert!(segments[1].log2_ratio > 0.5);
assert!(segments[2].log2_ratio < -0.5);
}
#[test]
fn cbs_min_probes_respected() {
let positions: Vec<u64> = (0..10).map(|i| i * 1000).collect();
let mut log2_ratios = vec![0.0; 10];
log2_ratios[0] = 2.0;
log2_ratios[1] = 2.0;
let config = CbsConfig {
alpha: 0.05,
min_probes: 5,
n_permutations: 200,
seed: 789,
};
let segments =
circular_binary_segmentation(&positions, &log2_ratios, "chr1", &config).unwrap();
assert_eq!(segments.len(), 1);
}
#[test]
fn cbs_alpha_threshold() {
let n = 30;
let positions: Vec<u64> = (0..n).map(|i| i as u64 * 1000).collect();
let mut log2_ratios = vec![0.0; n];
for i in 15..n {
log2_ratios[i] = 0.3;
}
let strict_config = CbsConfig {
alpha: 0.001,
min_probes: 3,
n_permutations: 200,
seed: 111,
};
let segments_strict =
circular_binary_segmentation(&positions, &log2_ratios, "chr1", &strict_config).unwrap();
let lenient_config = CbsConfig {
alpha: 0.5,
min_probes: 3,
n_permutations: 200,
seed: 111,
};
let segments_lenient =
circular_binary_segmentation(&positions, &log2_ratios, "chr1", &lenient_config)
.unwrap();
assert!(segments_strict.len() <= segments_lenient.len());
}
#[test]
fn baf_het_and_loh() {
let n = 40;
let positions: Vec<u64> = (0..n).map(|i| i as u64 * 1000).collect();
let mut bafs = vec![0.5; n]; for i in 20..n {
bafs[i] = 0.95;
}
let config = CbsConfig {
alpha: 0.05,
min_probes: 3,
n_permutations: 500,
seed: 222,
};
let segments = baf_segmentation(&positions, &bafs, "chr1", &config).unwrap();
assert!(segments.len() >= 2);
assert!((segments[0].mean_baf - 0.5).abs() < 0.1);
assert!((segments.last().unwrap().mean_baf - 0.95).abs() < 0.1);
}
#[test]
fn baf_normal_diploid() {
let positions: Vec<u64> = (0..20).map(|i| i * 1000).collect();
let bafs = vec![0.5; 20];
let config = CbsConfig::default();
let segments = baf_segmentation(&positions, &bafs, "chr1", &config).unwrap();
assert_eq!(segments.len(), 1);
assert!((segments[0].mean_baf - 0.5).abs() < 1e-10);
}
#[test]
fn sv_deletion_cluster() {
let discordant: Vec<(String, u64, String, u64)> = vec![
("chr1".into(), 1000, "chr1".into(), 5000),
("chr1".into(), 1010, "chr1".into(), 5010),
("chr1".into(), 1020, "chr1".into(), 5020),
];
let split: Vec<(String, u64, String, u64)> = vec![
("chr1".into(), 1005, "chr1".into(), 5005),
];
let bps = detect_sv_breakpoints(&discordant, &split, 100);
assert_eq!(bps.len(), 1);
assert_eq!(bps[0].sv_type, SvType::Deletion);
assert_eq!(bps[0].discordant_pairs, 3);
assert_eq!(bps[0].split_reads, 1);
assert!((bps[0].quality - (1.0 * 10.0 + 3.0 * 5.0)).abs() < 1e-10);
}
#[test]
fn sv_translocation() {
let discordant: Vec<(String, u64, String, u64)> = vec![
("chr1".into(), 1000, "chr5".into(), 2000),
("chr1".into(), 1010, "chr5".into(), 2010),
];
let split: Vec<(String, u64, String, u64)> = vec![
("chr1".into(), 1005, "chr5".into(), 2005),
];
let bps = detect_sv_breakpoints(&discordant, &split, 100);
assert_eq!(bps.len(), 1);
assert_eq!(bps[0].sv_type, SvType::Translocation);
assert_eq!(bps[0].chrom1, "chr1");
assert_eq!(bps[0].chrom2, "chr5");
}
#[test]
fn sv_inversion() {
let discordant: Vec<(String, u64, String, u64)> = vec![
("chr2".into(), 5000, "chr2".into(), 3000),
("chr2".into(), 5010, "chr2".into(), 3010),
];
let split: Vec<(String, u64, String, u64)> = Vec::new();
let bps = detect_sv_breakpoints(&discordant, &split, 100);
assert_eq!(bps.len(), 1);
assert_eq!(bps[0].sv_type, SvType::Inversion);
assert_eq!(bps[0].chrom1, "chr2");
assert_eq!(bps[0].chrom2, "chr2");
assert_eq!(bps[0].discordant_pairs, 2);
assert_eq!(bps[0].split_reads, 0);
}
#[test]
fn cnv_merge_adjacent() {
let segments = vec![
CnvSegment {
chrom: "chr1".into(),
start: 0,
end: 1000,
log2_ratio: 0.0,
n_probes: 10,
copy_number: 2,
},
CnvSegment {
chrom: "chr1".into(),
start: 1000,
end: 2000,
log2_ratio: 0.0,
n_probes: 10,
copy_number: 2,
},
];
let merged = merge_cnv_segments(&segments, 100, 0);
assert_eq!(merged.len(), 1);
assert_eq!(merged[0].start, 0);
assert_eq!(merged[0].end, 2000);
assert_eq!(merged[0].n_probes, 20);
assert_eq!(merged[0].copy_number, 2);
}
#[test]
fn cnv_merge_max_gap() {
let segments = vec![
CnvSegment {
chrom: "chr1".into(),
start: 0,
end: 1000,
log2_ratio: 0.0,
n_probes: 10,
copy_number: 2,
},
CnvSegment {
chrom: "chr1".into(),
start: 5000, end: 6000,
log2_ratio: 0.0,
n_probes: 10,
copy_number: 2,
},
];
let merged = merge_cnv_segments(&segments, 100, 0);
assert_eq!(merged.len(), 2);
let merged_large = merge_cnv_segments(&segments, 5000, 0);
assert_eq!(merged_large.len(), 1);
}
#[test]
fn cnv_merge_different_cn_not_merged() {
let segments = vec![
CnvSegment {
chrom: "chr1".into(),
start: 0,
end: 1000,
log2_ratio: 0.0,
n_probes: 10,
copy_number: 2,
},
CnvSegment {
chrom: "chr1".into(),
start: 1000,
end: 2000,
log2_ratio: 1.0,
n_probes: 10,
copy_number: 4,
},
];
let merged = merge_cnv_segments(&segments, 100, 0);
assert_eq!(merged.len(), 2);
}
#[test]
fn cn_estimation_from_log2_ratio() {
assert_eq!(cn_from_log2(0.0), 2);
assert_eq!(cn_from_log2(1.0), 4);
assert_eq!(cn_from_log2(-1.0), 1);
assert_eq!(cn_from_log2(-10.0), 0);
assert_eq!(cn_from_log2(5.0), 10);
assert_eq!(cn_from_log2(-0.415), 2);
assert_eq!(cn_from_log2(0.585), 3);
}
}