use anyhow::{Context, Result};
use rand::Rng;
use rand_distr::{Distribution, LogNormal, Normal};
use crate::io::config::LongReadFragmentConfig;
pub trait FragmentSampler: Send + Sync {
fn sample<R: Rng>(&self, rng: &mut R) -> usize;
}
pub struct NormalFragmentSampler {
dist: Normal<f64>,
min_size: usize,
}
impl NormalFragmentSampler {
pub fn new(mean: f64, sd: f64) -> Result<Self> {
Ok(Self {
dist: Normal::new(mean, sd)
.context("invalid normal distribution parameters (mean or sd is NaN/Inf)")?,
min_size: 50,
})
}
}
impl FragmentSampler for NormalFragmentSampler {
fn sample<R: Rng>(&self, rng: &mut R) -> usize {
let size = self.dist.sample(rng).round() as i64;
size.max(self.min_size as i64) as usize
}
}
pub struct CfdnaFragmentSampler {
mono_dist: Normal<f64>,
di_dist: Normal<f64>,
mono_weight: f64,
ctdna_dist: Normal<f64>,
ctdna_fraction: f64,
min_size: usize,
}
impl CfdnaFragmentSampler {
pub fn new(
mono_peak: f64,
di_peak: f64,
mono_weight: f64,
ctdna_fraction: f64,
mono_sd: f64,
di_sd: f64,
) -> Result<Self> {
Ok(Self {
mono_dist: Normal::new(mono_peak, mono_sd)
.context("invalid mononucleosomal distribution parameters")?,
di_dist: Normal::new(di_peak, di_sd)
.context("invalid dinucleosomal distribution parameters")?,
mono_weight,
ctdna_dist: Normal::new(143.0, 15.0)
.context("invalid ctDNA distribution parameters")?,
ctdna_fraction,
min_size: 50,
})
}
fn apply_periodicity<R: Rng>(&self, size: f64, rng: &mut R) -> f64 {
let period_phase = (size / 10.0).fract();
let adjustment = (period_phase * 2.0 * std::f64::consts::PI).sin() * 2.0;
let noise: f64 = rng.random_range(-1.0..1.0);
size + adjustment + noise
}
}
impl FragmentSampler for CfdnaFragmentSampler {
fn sample<R: Rng>(&self, rng: &mut R) -> usize {
let is_ctdna = rng.random::<f64>() < self.ctdna_fraction;
let raw_size = if is_ctdna {
self.ctdna_dist.sample(rng)
} else if rng.random::<f64>() < self.mono_weight {
self.mono_dist.sample(rng)
} else {
self.di_dist.sample(rng)
};
let size = self.apply_periodicity(raw_size, rng);
(size.round() as i64).max(self.min_size as i64) as usize
}
}
pub fn sample_long_read_length<R: Rng>(cfg: &LongReadFragmentConfig, rng: &mut R) -> Result<usize> {
let mean = cfg.mean as f64;
let sd = cfg.sd as f64;
let variance = sd * sd;
let mu = (mean * mean / (mean * mean + variance).sqrt()).ln();
let sigma = (1.0_f64 + variance / (mean * mean)).ln().sqrt();
let dist = LogNormal::new(mu, sigma)
.context("invalid log-normal parameters for long-read fragment size")?;
let sample = dist.sample(rng) as usize;
Ok(sample.clamp(cfg.min_len, cfg.max_len))
}
pub struct PcrFamilySizeSampler {
dist: LogNormal<f64>,
}
impl PcrFamilySizeSampler {
pub fn new(mean: f64, sd: f64) -> Result<Self> {
let variance = sd * sd;
let mu = (mean * mean / (mean * mean + variance).sqrt()).ln();
let sigma = (1.0 + variance / (mean * mean)).ln().sqrt();
Ok(Self {
dist: LogNormal::new(mu, sigma)
.context("invalid log-normal parameters for PCR family size")?,
})
}
pub fn sample<R: Rng>(&self, rng: &mut R) -> usize {
let size = self.dist.sample(rng).round() as usize;
size.max(1) }
}
#[cfg(test)]
mod tests {
use super::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
#[test]
fn test_normal_fragment_sampler() {
let sampler = NormalFragmentSampler::new(300.0, 50.0).unwrap();
let mut rng = StdRng::seed_from_u64(42);
let sizes: Vec<usize> = (0..1000).map(|_| sampler.sample(&mut rng)).collect();
let mean = sizes.iter().sum::<usize>() as f64 / sizes.len() as f64;
assert!(
(mean - 300.0).abs() < 10.0,
"mean {} too far from 300",
mean
);
assert!(sizes.iter().all(|&s| s >= 50), "no fragment below minimum");
}
#[test]
fn test_cfdna_fragment_sampler() {
let sampler = CfdnaFragmentSampler::new(167.0, 334.0, 0.85, 0.0, 20.0, 30.0).unwrap();
let mut rng = StdRng::seed_from_u64(42);
let sizes: Vec<usize> = (0..10000).map(|_| sampler.sample(&mut rng)).collect();
let mean = sizes.iter().sum::<usize>() as f64 / sizes.len() as f64;
assert!(
mean > 150.0 && mean < 230.0,
"cfDNA mean {} unexpected",
mean
);
}
#[test]
fn test_cfdna_ctdna_shorter() {
let sampler_normal =
CfdnaFragmentSampler::new(167.0, 334.0, 0.85, 0.0, 20.0, 30.0).unwrap();
let sampler_ctdna = CfdnaFragmentSampler::new(167.0, 334.0, 0.85, 1.0, 20.0, 30.0).unwrap();
let mut rng = StdRng::seed_from_u64(42);
let normal_mean: f64 = (0..5000)
.map(|_| sampler_normal.sample(&mut rng) as f64)
.sum::<f64>()
/ 5000.0;
let ctdna_mean: f64 = (0..5000)
.map(|_| sampler_ctdna.sample(&mut rng) as f64)
.sum::<f64>()
/ 5000.0;
assert!(
ctdna_mean < normal_mean,
"ctDNA mean ({}) should be shorter than normal cfDNA ({})",
ctdna_mean,
normal_mean
);
}
#[test]
fn test_pcr_family_size_sampler() {
let sampler = PcrFamilySizeSampler::new(3.0, 1.5).unwrap();
let mut rng = StdRng::seed_from_u64(42);
let sizes: Vec<usize> = (0..10000).map(|_| sampler.sample(&mut rng)).collect();
assert!(sizes.iter().all(|&s| s >= 1), "min family size is 1");
let mean = sizes.iter().sum::<usize>() as f64 / sizes.len() as f64;
assert!((mean - 3.0).abs() < 1.0, "mean {} too far from 3.0", mean);
}
#[test]
fn test_long_read_length_range() {
use crate::io::config::LongReadFragmentConfig;
let cfg = LongReadFragmentConfig {
mean: 15000,
sd: 5000,
min_len: 1000,
max_len: 100000,
};
let mut rng = StdRng::seed_from_u64(42);
let sizes: Vec<usize> = (0..1000)
.map(|_| sample_long_read_length(&cfg, &mut rng).unwrap())
.collect();
assert!(
sizes.iter().all(|&s| (1000..=100000).contains(&s)),
"all samples must be within [min_len, max_len]"
);
let mean = sizes.iter().sum::<usize>() as f64 / sizes.len() as f64;
assert!(
mean > 5000.0 && mean < 30000.0,
"mean {} outside expected range",
mean
);
}
#[test]
fn test_cfdna_low_ctdna_fraction_mostly_normal_lengths() {
let sampler = CfdnaFragmentSampler::new(167.0, 334.0, 0.85, 0.01, 20.0, 30.0).unwrap();
let mut rng = StdRng::seed_from_u64(99);
let n = 10_000usize;
let short_count = (0..n).filter(|_| sampler.sample(&mut rng) < 130).count();
let short_fraction = short_count as f64 / n as f64;
assert!(
short_fraction < 0.10,
"expected <10% fragments below 130 bp at ctdna_fraction=0.01, got {:.1}%",
short_fraction * 100.0
);
}
#[test]
fn test_cfdna_full_ctdna_fraction_all_short() {
let sampler = CfdnaFragmentSampler::new(167.0, 334.0, 0.85, 1.0, 20.0, 30.0).unwrap();
let mut rng = StdRng::seed_from_u64(77);
let mean: f64 = (0..5_000)
.map(|_| sampler.sample(&mut rng) as f64)
.sum::<f64>()
/ 5_000.0;
assert!(
mean < 160.0,
"expected mean <160 bp at ctdna_fraction=1.0, got {:.1}",
mean
);
}
#[test]
fn test_deterministic_with_seed() {
let sampler = NormalFragmentSampler::new(300.0, 50.0).unwrap();
let mut rng1 = StdRng::seed_from_u64(123);
let mut rng2 = StdRng::seed_from_u64(123);
let sizes1: Vec<usize> = (0..100).map(|_| sampler.sample(&mut rng1)).collect();
let sizes2: Vec<usize> = (0..100).map(|_| sampler.sample(&mut rng2)).collect();
assert_eq!(sizes1, sizes2, "same seed must produce same results");
}
}