use std::collections::HashMap;
use cyanea_core::{CyaneaError, Result};
use crate::genomic::Strand;
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CpgSite {
pub chrom: String,
pub position: u64,
pub strand: Strand,
pub methylated_reads: u32,
pub total_reads: u32,
}
impl CpgSite {
pub fn beta(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
self.methylated_reads as f64 / self.total_reads as f64
}
}
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct DmRegion {
pub chrom: String,
pub start: u64,
pub end: u64,
pub mean_delta_beta: f64,
pub n_cpgs: usize,
pub p_value: f64,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CpgIsland {
pub chrom: String,
pub start: u64,
pub end: u64,
pub cpg_count: usize,
pub obs_exp_ratio: f64,
pub gc_content: f64,
}
pub struct DmrConfig {
pub min_delta_beta: f64,
pub max_gap: u64,
pub min_cpgs: usize,
pub min_coverage: u32,
}
impl Default for DmrConfig {
fn default() -> Self {
Self {
min_delta_beta: 0.2,
max_gap: 500,
min_cpgs: 3,
min_coverage: 5,
}
}
}
pub fn call_methylation(
positions: &[u64],
c_counts: &[u32],
t_counts: &[u32],
chrom: &str,
reference: &[u8],
) -> Vec<CpgSite> {
let n = positions.len().min(c_counts.len()).min(t_counts.len());
let ref_len = reference.len();
let mut sites = Vec::new();
for i in 0..n {
let pos = positions[i] as usize;
if pos + 1 >= ref_len {
continue;
}
let base = reference[pos].to_ascii_uppercase();
let next = reference[pos + 1].to_ascii_uppercase();
if base == b'C' && next == b'G' {
sites.push(CpgSite {
chrom: chrom.to_string(),
position: positions[i],
strand: Strand::Forward,
methylated_reads: c_counts[i],
total_reads: c_counts[i] + t_counts[i],
});
} else if base == b'G' && pos > 0 && reference[pos - 1].to_ascii_uppercase() == b'C' {
continue;
}
}
sites
}
pub fn find_dmrs(
group1: &[&[CpgSite]],
group2: &[&[CpgSite]],
config: &DmrConfig,
) -> Result<Vec<DmRegion>> {
if group1.is_empty() || group2.is_empty() {
return Err(CyaneaError::InvalidInput(
"Both groups must have at least one sample".to_string(),
));
}
let collect_group = |group: &[&[CpgSite]]| -> HashMap<(String, u64), Vec<(f64, u32)>> {
let mut map: HashMap<(String, u64), Vec<(f64, u32)>> = HashMap::new();
for sample in group {
for site in *sample {
map.entry((site.chrom.clone(), site.position))
.or_default()
.push((site.beta(), site.total_reads));
}
}
map
};
let g1_map = collect_group(group1);
let g2_map = collect_group(group2);
struct SigSite {
chrom: String,
position: u64,
g1_betas: Vec<f64>,
g2_betas: Vec<f64>,
delta_beta: f64,
}
let mut sig_sites: Vec<SigSite> = Vec::new();
for (key, g1_entries) in &g1_map {
if let Some(g2_entries) = g2_map.get(key) {
let g1_pass = g1_entries.iter().all(|(_, cov)| *cov >= config.min_coverage);
let g2_pass = g2_entries.iter().all(|(_, cov)| *cov >= config.min_coverage);
if !g1_pass || !g2_pass {
continue;
}
let g1_betas: Vec<f64> = g1_entries.iter().map(|(b, _)| *b).collect();
let g2_betas: Vec<f64> = g2_entries.iter().map(|(b, _)| *b).collect();
let mean1 = g1_betas.iter().sum::<f64>() / g1_betas.len() as f64;
let mean2 = g2_betas.iter().sum::<f64>() / g2_betas.len() as f64;
let delta = mean1 - mean2;
if delta.abs() >= config.min_delta_beta {
sig_sites.push(SigSite {
chrom: key.0.clone(),
position: key.1,
g1_betas,
g2_betas,
delta_beta: delta,
});
}
}
}
sig_sites.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.position.cmp(&b.position)));
let mut regions: Vec<DmRegion> = Vec::new();
let mut i = 0;
while i < sig_sites.len() {
let chrom = sig_sites[i].chrom.clone();
let start = sig_sites[i].position;
let mut end = sig_sites[i].position + 1;
let mut all_g1_betas: Vec<f64> = sig_sites[i].g1_betas.clone();
let mut all_g2_betas: Vec<f64> = sig_sites[i].g2_betas.clone();
let mut delta_sum = sig_sites[i].delta_beta;
let mut n_cpgs = 1;
let mut j = i + 1;
while j < sig_sites.len()
&& sig_sites[j].chrom == chrom
&& sig_sites[j].position <= end + config.max_gap - 1
{
end = sig_sites[j].position + 1;
all_g1_betas.extend_from_slice(&sig_sites[j].g1_betas);
all_g2_betas.extend_from_slice(&sig_sites[j].g2_betas);
delta_sum += sig_sites[j].delta_beta;
n_cpgs += 1;
j += 1;
}
if n_cpgs >= config.min_cpgs {
let mean_delta = delta_sum / n_cpgs as f64;
let p_value = welch_t_test(&all_g1_betas, &all_g2_betas);
regions.push(DmRegion {
chrom,
start,
end,
mean_delta_beta: mean_delta,
n_cpgs,
p_value,
});
}
i = j;
}
Ok(regions)
}
fn erf_approx(x: f64) -> f64 {
let sign = if x >= 0.0 { 1.0 } else { -1.0 };
let x = x.abs();
let p = 0.3275911;
let a1 = 0.254829592;
let a2 = -0.284496736;
let a3 = 1.421413741;
let a4 = -1.453152027;
let a5 = 1.061405429;
let t = 1.0 / (1.0 + p * x);
let poly = t * (a1 + t * (a2 + t * (a3 + t * (a4 + t * a5))));
sign * (1.0 - poly * (-x * x).exp())
}
fn erfc_approx(x: f64) -> f64 {
1.0 - erf_approx(x)
}
fn welch_t_test(group1: &[f64], group2: &[f64]) -> f64 {
let n1 = group1.len() as f64;
let n2 = group2.len() as f64;
if n1 < 2.0 || n2 < 2.0 {
return 1.0;
}
let mean1 = group1.iter().sum::<f64>() / n1;
let mean2 = group2.iter().sum::<f64>() / n2;
let var1 = group1.iter().map(|x| (x - mean1).powi(2)).sum::<f64>() / (n1 - 1.0);
let var2 = group2.iter().map(|x| (x - mean2).powi(2)).sum::<f64>() / (n2 - 1.0);
let se = (var1 / n1 + var2 / n2).sqrt();
if se == 0.0 {
return 1.0;
}
let t = (mean1 - mean2) / se;
erfc_approx(t.abs() / std::f64::consts::SQRT_2)
}
pub fn find_cpg_islands(
sequence: &[u8],
chrom: &str,
min_length: u64,
min_gc: f64,
min_obs_exp: f64,
) -> Vec<CpgIsland> {
let seq_len = sequence.len();
let window_size: usize = 200;
if seq_len < window_size {
return Vec::new();
}
let mut qualifying: Vec<(usize, usize)> = Vec::new();
for start in 0..=(seq_len - window_size) {
let window = &sequence[start..start + window_size];
let wlen = window_size as f64;
let mut n_c: usize = 0;
let mut n_g: usize = 0;
let mut n_cpg: usize = 0;
for j in 0..window.len() {
let b = window[j].to_ascii_uppercase();
if b == b'C' {
n_c += 1;
if j + 1 < window.len() && window[j + 1].to_ascii_uppercase() == b'G' {
n_cpg += 1;
}
} else if b == b'G' {
n_g += 1;
}
}
let gc_content = (n_c + n_g) as f64 / wlen;
if gc_content < min_gc {
continue;
}
let obs_exp = if n_c > 0 && n_g > 0 {
(n_cpg as f64 * wlen) / (n_c as f64 * n_g as f64)
} else {
0.0
};
if obs_exp < min_obs_exp {
continue;
}
qualifying.push((start, start + window_size));
}
if qualifying.is_empty() {
return Vec::new();
}
let mut merged: Vec<(usize, usize)> = Vec::new();
let mut cur_start = qualifying[0].0;
let mut cur_end = qualifying[0].1;
for &(s, e) in &qualifying[1..] {
if s <= cur_end {
cur_end = cur_end.max(e);
} else {
merged.push((cur_start, cur_end));
cur_start = s;
cur_end = e;
}
}
merged.push((cur_start, cur_end));
let mut islands = Vec::new();
for (start, end) in merged {
let length = (end - start) as u64;
if length < min_length {
continue;
}
let region = &sequence[start..end];
let mut n_c: usize = 0;
let mut n_g: usize = 0;
let mut n_cpg: usize = 0;
for j in 0..region.len() {
let b = region[j].to_ascii_uppercase();
if b == b'C' {
n_c += 1;
if j + 1 < region.len() && region[j + 1].to_ascii_uppercase() == b'G' {
n_cpg += 1;
}
} else if b == b'G' {
n_g += 1;
}
}
let gc_content = (n_c + n_g) as f64 / region.len() as f64;
let obs_exp = if n_c > 0 && n_g > 0 {
(n_cpg as f64 * region.len() as f64) / (n_c as f64 * n_g as f64)
} else {
0.0
};
islands.push(CpgIsland {
chrom: chrom.to_string(),
start: start as u64,
end: end as u64,
cpg_count: n_cpg,
obs_exp_ratio: obs_exp,
gc_content,
});
}
islands
}
pub fn bisulfite_convert(sequence: &[u8], methylated_positions: &[u64]) -> Vec<u8> {
let meth_set: std::collections::HashSet<u64> =
methylated_positions.iter().copied().collect();
sequence
.iter()
.enumerate()
.map(|(i, &base)| {
let upper = base.to_ascii_uppercase();
if upper == b'C' && !meth_set.contains(&(i as u64)) {
if base.is_ascii_uppercase() {
b'T'
} else {
b't'
}
} else {
base
}
})
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_beta_value_calculation() {
let site = CpgSite {
chrom: "chr1".to_string(),
position: 100,
strand: Strand::Forward,
methylated_reads: 8,
total_reads: 10,
};
assert!((site.beta() - 0.8).abs() < 1e-10);
}
#[test]
fn test_beta_zero_coverage() {
let site = CpgSite {
chrom: "chr1".to_string(),
position: 100,
strand: Strand::Forward,
methylated_reads: 0,
total_reads: 0,
};
assert_eq!(site.beta(), 0.0);
}
#[test]
fn test_cpg_identification_from_reference() {
let reference = b"AACGTTCGAA";
let positions = vec![2, 6];
let c_counts = vec![5, 3];
let t_counts = vec![5, 7];
let sites = call_methylation(&positions, &c_counts, &t_counts, "chr1", reference);
assert_eq!(sites.len(), 2);
assert_eq!(sites[0].position, 2);
assert_eq!(sites[0].strand, Strand::Forward);
assert_eq!(sites[0].methylated_reads, 5);
assert_eq!(sites[0].total_reads, 10);
assert_eq!(sites[1].position, 6);
assert_eq!(sites[1].methylated_reads, 3);
assert_eq!(sites[1].total_reads, 10);
}
#[test]
fn test_non_cpg_position_skipped() {
let reference = b"AACGTT";
let positions = vec![1]; let c_counts = vec![5];
let t_counts = vec![5];
let sites = call_methylation(&positions, &c_counts, &t_counts, "chr1", reference);
assert!(sites.is_empty());
}
#[test]
fn test_bisulfite_unmethylated_c_to_t() {
let seq = b"ACGT";
let methylated: &[u64] = &[];
let converted = bisulfite_convert(seq, methylated);
assert_eq!(converted, b"ATGT");
}
#[test]
fn test_bisulfite_methylated_c_stays_c() {
let seq = b"ACGT";
let methylated = &[1]; let converted = bisulfite_convert(seq, methylated);
assert_eq!(converted, b"ACGT");
}
#[test]
fn test_bisulfite_g_a_t_unaffected() {
let seq = b"GATT";
let methylated: &[u64] = &[];
let converted = bisulfite_convert(seq, methylated);
assert_eq!(converted, b"GATT");
}
#[test]
fn test_cpg_island_found_on_cg_rich_sequence() {
let mut seq = Vec::new();
for _ in 0..125 {
seq.push(b'C');
seq.push(b'G');
}
assert_eq!(seq.len(), 250);
let islands = find_cpg_islands(&seq, "chr1", 200, 0.5, 0.6);
assert!(!islands.is_empty());
assert!(islands[0].gc_content >= 0.5);
assert!(islands[0].obs_exp_ratio >= 0.6);
assert!(islands[0].cpg_count > 0);
}
#[test]
fn test_no_cpg_island_on_at_rich_sequence() {
let mut seq = Vec::new();
for _ in 0..150 {
seq.push(b'A');
seq.push(b'T');
}
assert_eq!(seq.len(), 300);
let islands = find_cpg_islands(&seq, "chr1", 200, 0.5, 0.6);
assert!(islands.is_empty());
}
#[test]
fn test_dmr_detected_between_differentially_methylated_groups() {
let g1_sample: Vec<CpgSite> = (0..5)
.map(|i| CpgSite {
chrom: "chr1".to_string(),
position: i * 100,
strand: Strand::Forward,
methylated_reads: 9,
total_reads: 10,
})
.collect();
let g2_sample: Vec<CpgSite> = (0..5)
.map(|i| CpgSite {
chrom: "chr1".to_string(),
position: i * 100,
strand: Strand::Forward,
methylated_reads: 1,
total_reads: 10,
})
.collect();
let g1_refs: Vec<&[CpgSite]> = vec![&g1_sample, &g1_sample];
let g2_refs: Vec<&[CpgSite]> = vec![&g2_sample, &g2_sample];
let config = DmrConfig {
min_delta_beta: 0.2,
max_gap: 500,
min_cpgs: 3,
min_coverage: 5,
};
let dmrs = find_dmrs(&g1_refs, &g2_refs, &config).unwrap();
assert!(!dmrs.is_empty());
assert!(dmrs[0].mean_delta_beta > 0.0);
assert!(dmrs[0].n_cpgs >= 3);
assert!(dmrs[0].p_value < 0.05);
}
#[test]
fn test_dmr_none_when_groups_identical() {
let sample: Vec<CpgSite> = (0..5)
.map(|i| CpgSite {
chrom: "chr1".to_string(),
position: i * 100,
strand: Strand::Forward,
methylated_reads: 5,
total_reads: 10,
})
.collect();
let g1_refs: Vec<&[CpgSite]> = vec![&sample];
let g2_refs: Vec<&[CpgSite]> = vec![&sample];
let config = DmrConfig::default();
let dmrs = find_dmrs(&g1_refs, &g2_refs, &config).unwrap();
assert!(dmrs.is_empty());
}
#[test]
fn test_dmr_min_cpgs_filter() {
let g1_sample: Vec<CpgSite> = (0..2)
.map(|i| CpgSite {
chrom: "chr1".to_string(),
position: i * 100,
strand: Strand::Forward,
methylated_reads: 9,
total_reads: 10,
})
.collect();
let g2_sample: Vec<CpgSite> = (0..2)
.map(|i| CpgSite {
chrom: "chr1".to_string(),
position: i * 100,
strand: Strand::Forward,
methylated_reads: 1,
total_reads: 10,
})
.collect();
let g1_refs: Vec<&[CpgSite]> = vec![&g1_sample, &g1_sample];
let g2_refs: Vec<&[CpgSite]> = vec![&g2_sample, &g2_sample];
let config = DmrConfig {
min_delta_beta: 0.2,
max_gap: 500,
min_cpgs: 3,
min_coverage: 5,
};
let dmrs = find_dmrs(&g1_refs, &g2_refs, &config).unwrap();
assert!(dmrs.is_empty());
}
}