1use crate::DNACopy;
2use anyhow::{bail, Context};
3use rand::{seq::SliceRandom, Rng};
4use rand_distr::Distribution;
5use std::cmp::{min, Ord};
6use std::fmt;
7use std::{collections::HashMap, fs, num::NonZeroU16, path::Path};
8
9#[derive(Debug, Clone, Copy, PartialEq)]
11pub enum SamplingStrategy {
12 Uniform,
14 Gaussian(Sigma),
22 Poisson(Lambda),
26 Exponential(Lambda),
33}
34
35#[derive(Debug, Clone, Copy, PartialEq)]
36pub struct Sigma(Lambda);
41
42impl Sigma {
43 pub fn new(sigma: f32) -> Sigma {
44 Sigma(Lambda::new(sigma))
47 }
48
49 pub fn get(&self) -> f32 {
50 self.0 .0
51 }
52}
53
54#[derive(Debug, Clone, Copy, PartialEq)]
55pub struct Lambda(f32);
60
61impl Lambda {
62 pub fn new(lambda: f32) -> Lambda {
63 assert!(
66 lambda.is_normal() && lambda.is_sign_positive(),
67 "lambda {} not valid!, must be finite and > 0",
68 lambda
69 );
70 Lambda(lambda)
71 }
72
73 pub fn get(&self) -> f32 {
74 self.0
75 }
76}
77
78#[derive(Clone, PartialEq, Eq, Debug)]
79pub struct EcDNADistribution {
80 nminus: u64,
81 nplus: Vec<DNACopy>,
82}
83
84impl fmt::Display for EcDNADistribution {
85 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
86 let mut hist = self
87 .create_histogram()
88 .into_iter()
89 .collect::<Vec<(u16, u64)>>();
90 hist.sort_by_key(|(ecdna, _)| *ecdna);
91 hist.into_iter()
92 .try_for_each(|(ecdna, cells)| writeln!(f, "{}:{}", ecdna, cells))
93 }
94}
95
96impl From<Vec<u16>> for EcDNADistribution {
97 fn from(vec: Vec<u16>) -> Self {
98 let mut nplus = vec;
99 nplus.shrink_to_fit();
100 nplus.sort();
101 let (nminus, nplus) = nplus.split_at(nplus.partition_point(|ele| ele == &0u16));
102 Self {
103 nminus: nminus.len() as u64,
104 nplus: nplus
105 .iter()
106 .map(|&k| unsafe { NonZeroU16::new_unchecked(k) })
107 .collect(),
108 }
109 }
110}
111
112impl EcDNADistribution {
113 pub fn new(distribution: HashMap<u16, u64>, iterations: usize) -> Self {
114 let nminus = *distribution.get(&0).unwrap_or(&0);
115
116 let mut nplus = Vec::with_capacity(iterations);
117 for (copies, cells) in distribution.into_iter() {
118 if copies > 0 {
119 for _ in 0..cells {
120 nplus.push(unsafe { NonZeroU16::new_unchecked(copies) });
121 }
122 }
123 }
124
125 EcDNADistribution { nminus, nplus }
126 }
127
128 pub fn get_nminus(&self) -> &u64 {
129 &self.nminus
130 }
131
132 pub fn is_nplus_empty(&self) -> bool {
133 self.nplus.is_empty()
135 }
136
137 pub fn pick_remove_random_nplus(&mut self, rng: &mut impl Rng) -> anyhow::Result<DNACopy> {
138 if self.is_nplus_empty() {
147 bail!("Empty distr")
148 } else {
149 let idx = self.pick_random_nplus(rng);
150 Ok(self.nplus.swap_remove(idx))
151 }
152 }
153
154 fn pick_random_nplus(&self, rng: &mut impl Rng) -> usize {
155 rng.gen_range(0..self.nplus.len())
156 }
157
158 pub fn increase_nplus(&mut self, copies: Vec<DNACopy>, verbosity: u8) {
159 for k in copies {
162 if verbosity > 1 {
163 println!("Increasing count for clone {}", k);
164 }
165 self.nplus.push(k);
166 }
167 }
168
169 pub fn increase_nminus(&mut self) {
170 self.nminus += 1;
171 }
172
173 pub fn decrease_nplus(&mut self, rng: &mut impl Rng, verbosity: u8) {
174 let idx = self.pick_random_nplus(rng);
175
176 if verbosity > 1 {
177 println!("Clone {} will loose one cell", self.nplus[idx]);
178 }
179 self.nplus.swap_remove(idx);
180 }
181
182 pub fn decrease_nminus(&mut self) {
183 self.nminus -= 1;
184 }
185
186 pub fn compute_nplus(&self) -> u64 {
187 self.nplus.len() as u64
188 }
189
190 pub fn is_empty(&self) -> bool {
191 self.is_nplus_empty() && self.nminus == 0
192 }
193
194 pub fn load(path2file: &Path, capacity: usize) -> anyhow::Result<Self> {
195 let extension = path2file
196 .extension()
197 .with_context(|| format!("Do not recognize extension of file {:#?}", path2file))
198 .unwrap();
199 if let Some("json") = extension.to_str() {
200 let path2read = fs::read_to_string(path2file)
201 .with_context(|| format!("Cannot read {:#?}", path2file))?;
202
203 let map: HashMap<u16, u64> = serde_json::from_str(&path2read)
204 .map_err(|e| anyhow::anyhow!(e))
205 .with_context(|| "Cannot load ecDNA distribution")?;
206
207 return Ok(EcDNADistribution::new(map, capacity));
208 }
209 bail!("Extension not recognized, must be JSON `.json`")
210 }
211
212 pub fn save(&self, path2file: &Path, verbosity: u8) -> anyhow::Result<()> {
213 let map: HashMap<u16, u64> = self.create_histogram();
214 let distribution =
215 serde_json::to_string(&map).expect("Cannot serialize the ecDNA distribution");
216 if verbosity > 1 {
217 println!(
218 "Saving ecDNA distribution to {:#?}\n{:#?}",
219 path2file, distribution
220 );
221 }
222 fs::create_dir_all(path2file.parent().unwrap()).expect("Cannot create dir");
223 fs::write(path2file, distribution)
224 .with_context(|| "Cannot save the ecDNA distribution".to_string())?;
225 Ok(())
226 }
227
228 pub fn into_subsampled<R: Rng>(&self, nb_cells: u64, rng: &mut R) -> EcDNADistribution {
229 self.undersample(nb_cells, rng)
230 }
231
232 pub fn sample<R: Rng>(&mut self, nb_cells: u64, strategy: &SamplingStrategy, mut rng: &mut R) {
233 assert!(nb_cells <= (*self.get_nminus() + self.compute_nplus()));
240 assert!(nb_cells > 0);
241 let nminus = self.nminus;
244 if self.compute_nplus() > 0 {
245 match strategy {
246 SamplingStrategy::Uniform => {}
247 SamplingStrategy::Gaussian(_)
248 | SamplingStrategy::Poisson(_)
249 | SamplingStrategy::Exponential(_) => {
250 let mut ecdna_copies = Vec::with_capacity(
251 *self.get_nminus() as usize + self.compute_nplus() as usize,
252 );
253 self.nminus = 0;
257 match strategy {
258 SamplingStrategy::Gaussian(sigma) => {
259 for (k, cells) in self.create_histogram() {
260 ecdna_copies.append(
261 &mut rand_distr::Normal::new(k as f32, sigma.get())
262 .unwrap()
263 .sample_iter(&mut rng)
264 .take(cells as usize)
265 .map(|copy| f32::round(copy) as u16)
266 .collect(),
267 );
268 }
269 }
270 SamplingStrategy::Poisson(rate) => {
271 for (k, cells) in self.create_histogram() {
272 ecdna_copies.append(
273 &mut rand_distr::Poisson::new(k as f32 * rate.get())
274 .unwrap()
275 .sample_iter(&mut rng)
276 .take(cells as usize)
277 .map(|copy| f32::round(copy) as u16)
278 .collect(),
279 );
280 }
281 }
282 SamplingStrategy::Exponential(rate) => {
283 let exp = rand_distr::Exp::new(rate.get()).unwrap();
284 for (k, cells) in self.create_histogram() {
285 ecdna_copies.append(
286 &mut exp
287 .sample_iter(&mut rng)
288 .take(cells as usize)
289 .map(|copy| f32::round(k as f32 * copy) as u16)
290 .collect(),
291 );
292 }
293 }
294 _ => unreachable!(),
295 }
296
297 let distribution = EcDNADistribution::from(ecdna_copies);
298 self.nminus = nminus + distribution.nminus;
300 self.nplus = distribution.nplus;
301 }
302 }
303 let distribution = self.undersample(nb_cells, rng);
304 self.nplus = distribution.nplus;
305 self.nminus = distribution.nminus;
306 } else {
307 self.nminus = nb_cells;
308 }
309 }
310
311 fn undersample(&self, nb_cells: u64, rng: &mut impl Rng) -> EcDNADistribution {
312 let mut distribution: Vec<u16> = self
318 .nplus
319 .iter()
320 .map(|&k| k.get())
321 .chain(std::iter::repeat(0u16).take(*self.get_nminus() as usize))
322 .collect();
323 let sample = if nb_cells as usize > self.nplus.len() / 2 {
324 let tot = distribution.len();
325 distribution.partial_shuffle(rng, tot - nb_cells as usize).1
326 } else {
327 distribution.partial_shuffle(rng, nb_cells as usize).0
328 };
329 let mut new_distribution = EcDNADistribution::from(sample.to_vec());
330 let amount = self.nplus.len() / 3;
332 new_distribution.nplus.partial_shuffle(rng, amount / 3);
333 new_distribution
334 }
335
336 pub fn ks_distance(&self, ecdna: &EcDNADistribution) -> f32 {
337 EcDNADistribution::calculate_statistic(&self.nplus, &ecdna.nplus)
342 }
343
344 fn calculate_statistic<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f32 {
345 assert!(!xs.is_empty());
347 assert!(!ys.is_empty());
348 let n = xs.len();
349 let m = ys.len();
350
351 assert!(n > 7 && m > 7);
352 let mut xs = xs.to_vec();
353 let mut ys = ys.to_vec();
354
355 xs.sort();
357 ys.sort();
358
359 let mut current: &T;
362
363 let mut i = 0;
365 let mut j = 0;
366
367 let mut ecdf_xs = 0.0;
369 let mut ecdf_ys = 0.0;
370
371 let mut statistic = 0.0;
373
374 while i < n && j < m {
375 let x_i = &xs[i];
377
378 while i + 1 < n && *x_i == xs[i + 1] {
379 i += 1;
380 }
381
382 let y_j = &ys[j];
384
385 while j + 1 < m && *y_j == ys[j + 1] {
386 j += 1;
387 }
388
389 current = min(x_i, y_j);
391
392 if current == x_i {
394 ecdf_xs = (i + 1) as f32 / n as f32;
395 i += 1;
396 }
397
398 if current == y_j {
399 ecdf_ys = (j + 1) as f32 / m as f32;
400 j += 1;
401 }
402
403 let diff = (ecdf_xs - ecdf_ys).abs();
405
406 if diff > statistic {
407 statistic = diff;
408 }
409 }
410
411 statistic
416 }
417
418 pub fn create_histogram(&self) -> HashMap<u16, u64> {
419 let mut lookup = self.nplus.iter().fold(HashMap::new(), |mut acc, c| {
420 let c_u16 = c.get();
421 *acc.entry(c_u16).or_insert(0) += 1;
422 acc
423 });
424
425 if self.nminus > 0 {
426 lookup.insert(0, *self.get_nminus());
427 }
428
429 lookup
430 }
431
432 pub fn drop_cells_with_k_copies(self, k: DNACopy) -> EcDNADistribution {
433 let nminus = self.nminus;
438 let nplus = self.nplus.into_iter().filter(|&ecdna| ecdna != k).collect();
439 EcDNADistribution { nplus, nminus }
440 }
441
442 pub fn drop_nminus(&mut self) {
443 self.nminus = 0;
445 }
446
447 pub fn scale_by(self, c: f32) -> Self {
448 if c.is_sign_positive() {
457 assert!(!c.is_nan(), "`c` cannot be NaN");
458 if c.is_finite() {
459 if (c - 0f32).abs() < f32::EPSILON {
460 panic!("`c` cannot be zero!");
461 } else {
462 let nplus = self
464 .nplus
465 .into_iter()
466 .map(|copy| unsafe {
467 NonZeroU16::new_unchecked((copy.get() as f32 / c).ceil() as u16)
468 })
469 .collect();
470 return EcDNADistribution {
471 nminus: self.nminus,
472 nplus,
473 };
474 }
475 }
476 panic!("`c` cannot be infinite");
477 }
478 panic!("`c` must be greather than 0!")
479 }
480
481 pub fn compute_mean(&self) -> f32 {
482 let mean = self.nplus.iter().map(|&ele| ele.get() as u32).sum::<u32>() as f32;
484 let n = self.nminus as f32 + self.nplus.len() as f32;
485 mean / n
486 }
487
488 pub fn compute_variance(&self) -> f32 {
489 let mean = self.compute_mean();
491 if mean.is_nan() {
492 f32::NAN
493 } else {
494 let mut variance = self
495 .nplus
496 .iter()
497 .map(|value| {
498 let diff = mean - (value.get() as f32);
499
500 diff * diff
501 })
502 .sum::<f32>();
503 variance += *self.get_nminus() as f32 * mean * mean;
504 variance /= (*self.get_nminus() + self.compute_nplus()) as f32;
505
506 variance
507 }
508 }
509
510 pub fn compute_entropy(&self) -> f32 {
511 let ntot = self.compute_nplus() as f32 + self.nminus as f32;
512 let lookup: HashMap<u16, f32> = self
513 .create_histogram()
514 .into_iter()
515 .map(|(k, copy)| (k, copy as f32))
516 .collect();
517
518 -lookup
519 .values()
520 .map(|&count| {
521 let prob = count / ntot;
522 prob * prob.log2()
523 })
524 .sum::<f32>()
525 }
526
527 pub fn compute_frequency(&self) -> f32 {
528 let nplus = self.compute_nplus() as f32;
529 let nminus = self.nminus as f32;
530 let freq = nplus / (nplus + nminus);
531 if freq.is_nan() {
532 0f32
533 } else {
534 freq
535 }
536 }
537}
538
539#[cfg(test)]
540mod tests {
541 use crate::test_util::NonEmptyDistribtionWithNPlusCells;
542
543 use super::*;
544 use quickcheck::{Arbitrary, Gen};
545 use quickcheck_macros::quickcheck;
546 use rand::SeedableRng;
547 use rand_chacha::ChaCha8Rng;
548 use std::{
549 collections::{hash_map::RandomState, HashSet},
550 num::{NonZeroU64, NonZeroU8},
551 };
552
553 #[derive(Clone, Debug)]
554 pub struct NPlusVec(pub Vec<DNACopy>);
555
556 impl Arbitrary for NPlusVec {
557 fn arbitrary(g: &mut Gen) -> Self {
558 NPlusVec(
559 (0..4u16 * (u8::MAX as u16))
560 .map(|_| NonZeroU16::arbitrary(g))
561 .collect(),
562 )
563 }
564 }
565
566 #[derive(Clone, Debug)]
567 pub struct SigmaGrZero(pub Sigma);
568
569 impl Arbitrary for SigmaGrZero {
570 fn arbitrary(g: &mut Gen) -> Self {
571 SigmaGrZero(Sigma(LambdaGrZero::arbitrary(g).0))
572 }
573 }
574 #[derive(Clone, Debug)]
575 pub struct LambdaGrZero(pub Lambda);
576
577 impl Arbitrary for LambdaGrZero {
578 fn arbitrary(g: &mut Gen) -> Self {
579 let mut lambda = u8::arbitrary(g) as f32 / 10.;
580 while !lambda.is_normal() || lambda.is_sign_negative() {
581 lambda = u8::arbitrary(g) as f32 / 10.;
582 }
583 LambdaGrZero(Lambda::new(lambda))
584 }
585 }
586
587 #[test]
588 fn ecdna_compute_mean_empty_test() {
589 assert!(EcDNADistribution {
590 nminus: 0,
591 nplus: vec![]
592 }
593 .compute_mean()
594 .is_nan());
595 }
596
597 #[test]
598 fn ecdna_compute_mean_10() {
599 assert_eq!(
600 EcDNADistribution {
601 nminus: 0,
602 nplus: vec![
603 NonZeroU16::new(15).unwrap(),
604 NonZeroU16::new(5).unwrap(),
605 NonZeroU16::new(10).unwrap()
606 ],
607 }
608 .compute_mean(),
609 10f32
610 )
611 }
612
613 #[test]
614 fn ecdna_compute_mean_with_nminus_10() {
615 assert_eq!(
616 EcDNADistribution {
617 nminus: 1,
618 nplus: vec![
619 NonZeroU16::new(15).unwrap(),
620 NonZeroU16::new(15).unwrap(),
621 NonZeroU16::new(10).unwrap()
622 ],
623 }
624 .compute_mean(),
625 10f32
626 )
627 }
628
629 #[test]
630 fn ecdna_compute_mean_with_nminus_5() {
631 assert_eq!(
632 EcDNADistribution {
633 nminus: 5,
634 nplus: vec![
635 NonZeroU16::new(15).unwrap(),
636 NonZeroU16::new(15).unwrap(),
637 NonZeroU16::new(10).unwrap()
638 ],
639 }
640 .compute_mean(),
641 5f32
642 )
643 }
644
645 #[test]
646 fn ecdna_compute_mean_with_nminus_05() {
647 assert_eq!(
648 EcDNADistribution {
649 nminus: 19,
650 nplus: vec![NonZeroU16::new(10).unwrap()],
651 }
652 .compute_mean(),
653 0.5f32
654 )
655 }
656
657 #[quickcheck]
658 fn ecdna_compute_mean_not_empty_test(copies: DNACopy, nminus: NonZeroU64) -> bool {
659 let nminus = nminus.get();
660 EcDNADistribution {
661 nminus,
662 nplus: vec![copies],
663 }
664 .compute_mean()
665 == (copies.get() as f32 / nminus as f32)
666 }
667
668 #[quickcheck]
669 fn ecdna_compute_mean_not_empty_no_nminus_test(copies: DNACopy) -> bool {
670 EcDNADistribution {
671 nminus: 0,
672 nplus: vec![copies],
673 }
674 .compute_mean()
675 == copies.get() as f32
676 }
677
678 #[test]
679 fn compute_variance_no_cells() {
680 let distribution = EcDNADistribution {
681 nminus: 0,
682 nplus: vec![],
683 };
684 assert!(distribution.compute_variance().is_nan());
685 }
686
687 #[quickcheck]
688 fn compute_variance_no_nminus(nplus: NonZeroU8) -> bool {
689 let distribution = EcDNADistribution {
690 nminus: 0,
691 nplus: vec![DNACopy::new(1).unwrap(); nplus.get() as usize],
692 };
693 (distribution.compute_variance() - 0.).abs() < f32::EPSILON
694 }
695
696 #[quickcheck]
697 fn compute_variance_no_nplus(nminus: NonZeroU8) -> bool {
698 let distribution = EcDNADistribution {
699 nminus: nminus.get() as u64,
700 nplus: vec![],
701 };
702 (distribution.compute_variance() - 0.).abs() < f32::EPSILON
703 }
704
705 #[test]
706 fn compute_variance_equal_1() {
707 let distribution = EcDNADistribution {
708 nminus: 1,
709 nplus: vec![DNACopy::new(2).unwrap()],
710 };
711 assert!((distribution.compute_variance() - 1.).abs() < f32::EPSILON);
712 }
713
714 #[quickcheck]
715 fn ecdna_compute_frequency_no_nplus(nminus: u64) -> bool {
716 (EcDNADistribution {
717 nminus,
718 nplus: vec![],
719 }
720 .compute_frequency()
721 - 0f32)
722 .abs()
723 < f32::EPSILON
724 }
725
726 #[quickcheck]
727 fn ecdna_compute_frequency_nonminus(copies: DNACopy) -> bool {
728 (EcDNADistribution {
729 nminus: 0,
730 nplus: vec![copies],
731 }
732 .compute_frequency()
733 - 1f32)
734 .abs()
735 < f32::EPSILON
736 }
737
738 #[quickcheck]
739 fn ecdna_compute_frequency_01(copies: DNACopy) -> bool {
740 (EcDNADistribution {
741 nminus: 9,
742 nplus: vec![copies],
743 }
744 .compute_frequency()
745 - 0.1f32)
746 .abs()
747 < f32::EPSILON
748 }
749
750 #[quickcheck]
751 fn ecdna_compute_frequency_02(copies: DNACopy) -> bool {
752 (EcDNADistribution {
753 nminus: 4,
754 nplus: vec![copies],
755 }
756 .compute_frequency()
757 - 0.2f32)
758 .abs()
759 < f32::EPSILON
760 }
761
762 #[quickcheck]
763 fn ecdna_compute_frequency_05(copies: DNACopy) -> bool {
764 (EcDNADistribution {
765 nminus: 1,
766 nplus: vec![copies],
767 }
768 .compute_frequency()
769 - 0.5f32)
770 .abs()
771 < f32::EPSILON
772 }
773
774 #[test]
775 fn compute_nplus_0_cells_test() {
776 let ecdna = EcDNADistribution {
777 nminus: 0,
778 nplus: vec![],
779 };
780 assert_eq!(ecdna.compute_nplus(), 0);
781 assert!(ecdna.is_empty());
782 }
783
784 #[quickcheck]
785 fn compute_nplus_0_cells_with_nminus_test(nminus: NonZeroU8) -> bool {
786 let ecdna = EcDNADistribution {
787 nminus: nminus.get() as u64,
788 nplus: vec![],
789 };
790 ecdna.compute_nplus() == 0 && !ecdna.is_empty()
791 }
792
793 #[quickcheck]
794 fn compute_nplus_0_nminus_test(copies: DNACopy) -> bool {
795 let ecdna = EcDNADistribution {
796 nminus: 0,
797 nplus: vec![copies],
798 };
799 ecdna.compute_nplus() > 0 && !ecdna.is_empty()
800 }
801
802 #[quickcheck]
803 fn get_uniform_random_nplus_test(seed: u64, distr: NonEmptyDistribtionWithNPlusCells) -> bool {
804 let mut rng = ChaCha8Rng::seed_from_u64(seed);
805 let idx = distr.0.pick_random_nplus(&mut rng);
806 distr.0.nplus[idx].get() > 0
807 }
808
809 #[quickcheck]
810 fn get_uniform_random_nplus_reproducible(
811 seed: u64,
812 distr: NonEmptyDistribtionWithNPlusCells,
813 ) -> bool {
814 let mut rng = ChaCha8Rng::seed_from_u64(seed);
815 let idx_first = distr.0.pick_random_nplus(&mut rng);
816 let mut rng = ChaCha8Rng::seed_from_u64(seed);
817 let idx_second = distr.0.pick_random_nplus(&mut rng);
818 idx_first == idx_second && distr.0.nplus[idx_first].get() > 0
819 }
820
821 #[test]
822 #[should_panic]
823 fn ecdna_pick_random_nplus_panics_nminus_test() {
824 let mut rng = ChaCha8Rng::from_entropy();
825 let distr = EcDNADistribution {
826 nminus: 12,
827 nplus: vec![],
828 };
829 distr.pick_random_nplus(&mut rng);
830 }
831
832 #[quickcheck]
833 fn pick_random_nplus_test(distribution: NonEmptyDistribtionWithNPlusCells, seed: u64) {
834 let mut rng = ChaCha8Rng::seed_from_u64(seed);
835 let random = distribution.0.pick_random_nplus(&mut rng);
836 assert!(distribution.0.nplus.get(random).is_some());
837 }
838
839 #[quickcheck]
840 fn pick_random_nplus_reproducible_test(
841 distribution: NonEmptyDistribtionWithNPlusCells,
842 seed: u64,
843 ) -> bool {
844 let mut rng = ChaCha8Rng::seed_from_u64(seed);
845 let idx_first = distribution.0.pick_random_nplus(&mut rng);
846 let mut rng = ChaCha8Rng::seed_from_u64(seed);
847 let idx_second = distribution.0.pick_random_nplus(&mut rng);
848
849 idx_first == idx_second && distribution.0.nplus[idx_first].get() > 0
850 }
851
852 #[quickcheck]
853 fn pick_random_nplus_no_choice_test(seed: u64, copies: DNACopy) -> bool {
854 let mut rng = ChaCha8Rng::seed_from_u64(seed);
855 let distr = EcDNADistribution {
856 nminus: 12,
857 nplus: vec![copies],
858 };
859 distr.nplus[distr.pick_random_nplus(&mut rng)] == copies
860 }
861
862 #[test]
863 fn test_entropy_rosetta() {
864 let mut distribution = EcDNADistribution {
866 nplus: vec![1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
867 .into_iter()
868 .map(|ele| NonZeroU16::new(ele).unwrap())
869 .collect(),
870 nminus: 0,
871 };
872
873 let entropy = distribution.compute_entropy();
874 let expected = 1.8464392f32;
875 assert!(
876 (entropy - expected).abs() < 0.0001,
877 "should be {}, got {} instead",
878 expected,
879 entropy
880 );
881
882 distribution.nminus += 1;
883 assert!(
884 (distribution.compute_entropy() - 2.1180782).abs() < f32::EPSILON,
885 "{}",
886 entropy
887 );
888 }
889
890 #[test]
891 #[should_panic]
892 fn ecdna_ks_distance_x_empty() {
893 let x = EcDNADistribution {
894 nplus: vec![],
895 nminus: 0,
896 };
897 let y = EcDNADistribution {
898 nplus: vec![NonZeroU16::new(3).unwrap(); 7],
899 nminus: 0,
900 };
901 x.ks_distance(&y);
902 }
903
904 #[test]
905 #[should_panic]
906 fn ecdna_ks_distance_y_empty() {
907 let x = EcDNADistribution {
908 nplus: vec![NonZeroU16::new(3).unwrap(); 7],
909 nminus: 0,
910 };
911 let y = EcDNADistribution {
912 nplus: vec![],
913 nminus: 0,
914 };
915 x.ks_distance(&y);
916 }
917
918 #[test]
919 #[should_panic]
920 fn ecdna_ks_distance_small_sample_y() {
921 let x = EcDNADistribution {
922 nplus: vec![
923 NonZeroU16::new(1).unwrap(),
924 NonZeroU16::new(1).unwrap(),
925 NonZeroU16::new(1).unwrap(),
926 NonZeroU16::new(1).unwrap(),
927 NonZeroU16::new(1).unwrap(),
928 NonZeroU16::new(1).unwrap(),
929 NonZeroU16::new(1).unwrap(),
930 NonZeroU16::new(1).unwrap(),
931 NonZeroU16::new(2).unwrap(),
932 ],
933 nminus: 1,
934 };
935 let y = EcDNADistribution {
936 nplus: vec![NonZeroU16::new(1).unwrap(), NonZeroU16::new(2).unwrap()],
937 nminus: 100,
938 };
939 x.ks_distance(&y);
940 }
941
942 #[test]
943 #[should_panic]
944 fn ecdna_ks_distance_small_sample_x() {
945 let x = EcDNADistribution {
946 nplus: vec![NonZeroU16::new(1).unwrap(), NonZeroU16::new(2).unwrap()],
947 nminus: 100,
948 };
949 let y = EcDNADistribution {
950 nplus: vec![
951 NonZeroU16::new(1).unwrap(),
952 NonZeroU16::new(1).unwrap(),
953 NonZeroU16::new(1).unwrap(),
954 NonZeroU16::new(1).unwrap(),
955 NonZeroU16::new(1).unwrap(),
956 NonZeroU16::new(1).unwrap(),
957 NonZeroU16::new(1).unwrap(),
958 NonZeroU16::new(1).unwrap(),
959 NonZeroU16::new(2).unwrap(),
960 ],
961 nminus: 100,
962 };
963 x.ks_distance(&y);
964 }
965
966 #[quickcheck]
967 fn ecdna_ks_distance_same_data(x: NonEmptyDistribtionWithNPlusCells) -> bool {
968 (x.0.ks_distance(&x.0) - 0f32).abs() <= f32::EPSILON
969 }
970
971 #[quickcheck]
972 fn ecdna_ks_distance_is_one_when_no_overlap_in_support(
973 x: NonEmptyDistribtionWithNPlusCells,
974 ) -> bool {
975 let y_min = x.0.nplus.iter().max().unwrap().get() + 1;
977 let y: EcDNADistribution = EcDNADistribution {
978 nminus: 0,
979 nplus: x
980 .0
981 .nplus
982 .iter()
983 .map(|ele| NonZeroU16::new(ele.get() + y_min).unwrap())
984 .collect(),
985 };
986 (x.0.ks_distance(&y) - 1f32).abs() <= f32::EPSILON
987 }
988
989 #[quickcheck]
990 fn ecdna_ks_distance_is_point_five_when_semi_overlap_in_support(
991 x: NonEmptyDistribtionWithNPlusCells,
992 ) -> bool {
993 let y_min = x.0.nplus.iter().max().unwrap().get() + 1;
995 let mut y = EcDNADistribution {
997 nminus: x.0.nminus,
998 nplus: x.0.nplus.clone(),
999 };
1000
1001 for copy in x.0.nplus.iter() {
1002 y.nplus.push(NonZeroU16::new(copy.get() + y_min).unwrap());
1003 }
1004 y.nminus *= 2;
1005 let y_ntot = y.compute_nplus() + y.nminus;
1006 let ntot = x.0.compute_nplus() + x.0.nminus;
1007 assert_eq!(y_ntot, ntot * 2, "{} vs {}", y_ntot, ntot * 2);
1008
1009 (x.0.ks_distance(&y) - 0.5f32).abs() <= f32::EPSILON
1010 }
1011
1012 #[test]
1013 fn create_histogram_without_nminus() {
1014 let distribution = EcDNADistribution {
1015 nminus: 0,
1016 nplus: [1u16, 1u16, 2u16, 200u16]
1017 .iter()
1018 .map(|&ele| NonZeroU16::new(ele).unwrap())
1019 .collect(),
1020 };
1021 let map1: HashMap<u16, f32> = distribution
1022 .create_histogram()
1023 .into_iter()
1024 .map(|(k, copy)| (k, copy as f32))
1025 .collect();
1026 let map2 = HashMap::from([(1, 2f32), (2, 1f32), (200, 1f32)]);
1027 assert_eq!(map1.len(), map2.len());
1028 assert!(map1.keys().all(|k| map2.contains_key(k)));
1029 assert!(map1.keys().all(|k| map1.get(k) == map2.get(k)));
1030 }
1031
1032 #[test]
1033 fn create_histogram() {
1034 let distribution = EcDNADistribution {
1035 nminus: 10,
1036 nplus: [1u16, 1u16, 2u16, 200u16]
1037 .iter()
1038 .map(|&ele| NonZeroU16::new(ele).unwrap())
1039 .collect(),
1040 };
1041 let map1: HashMap<u16, f32> = distribution
1042 .create_histogram()
1043 .into_iter()
1044 .map(|(k, copy)| (k, copy as f32))
1045 .collect();
1046 let map2 = HashMap::from([(0, 10f32), (1, 2f32), (2, 1f32), (200, 1f32)]);
1047 assert_eq!(map1.len(), map2.len());
1048 assert!(map1.keys().all(|k| map2.contains_key(k)));
1049 assert!(map1.keys().all(|k| map1.get(k) == map2.get(k)));
1050 }
1051
1052 #[quickcheck]
1053 fn from_vec_empty_to_distribution(capacity: u8) -> bool {
1054 let distribution: EcDNADistribution = Vec::with_capacity(capacity as usize).into();
1055 distribution.nplus.capacity() == 0usize
1056 && distribution.nminus == 0u64
1057 && distribution.nplus.is_empty()
1058 && distribution.is_empty()
1059 }
1060
1061 #[quickcheck]
1062 fn from_vec_to_distribution(mut nplus: NPlusVec, nminus: u8) -> bool {
1063 let mut my_vec: Vec<u16> = nplus.0.iter().map(|&k| k.get()).collect();
1064 my_vec.extend(std::iter::repeat(0u16).take(nminus as usize));
1065 my_vec.shrink_to_fit();
1066 let capacity = my_vec.capacity();
1067 let distribution: EcDNADistribution = my_vec.into();
1068 nplus.0.sort();
1069 distribution.nplus.capacity() <= capacity
1070 && distribution.nminus == nminus as u64
1071 && distribution.nplus == nplus.0
1072 }
1073
1074 #[test]
1075 #[should_panic]
1076 fn sample_wrong_sample_size() {
1077 let my_vec = vec![0u16, 1u16];
1078 let mut distribution: EcDNADistribution = my_vec.into();
1079 distribution.sample(
1080 4,
1081 &SamplingStrategy::Uniform,
1082 &mut ChaCha8Rng::seed_from_u64(26),
1083 );
1084 }
1085
1086 #[test]
1087 #[should_panic]
1088 fn sample_wrong_sample_size_0() {
1089 let my_vec = vec![0u16, 1u16];
1090 let mut distribution: EcDNADistribution = my_vec.into();
1091 distribution.sample(
1092 0,
1093 &SamplingStrategy::Uniform,
1094 &mut ChaCha8Rng::seed_from_u64(26),
1095 );
1096 }
1097
1098 #[quickcheck]
1099 fn sample_gaussian_2_zeros_1_one_test(sigma: SigmaGrZero, seed: u64) -> bool {
1100 let distr_vec = vec![0, 0, 1];
1101 let size = distr_vec.len();
1102 let mut distribution = EcDNADistribution::from(distr_vec);
1103 distribution.sample(
1104 size as u64,
1105 &SamplingStrategy::Gaussian(sigma.0),
1106 &mut ChaCha8Rng::seed_from_u64(seed),
1107 );
1108 distribution.compute_nplus() < 2
1109 }
1110
1111 #[quickcheck]
1112 fn sample_gaussian_only_zeros_test(sigma: SigmaGrZero, seed: u64) -> bool {
1113 let distr_vec = vec![0; 100];
1114 let size = distr_vec.len();
1115 let mut distribution = EcDNADistribution::from(distr_vec);
1116 distribution.sample(
1117 size as u64,
1118 &SamplingStrategy::Gaussian(sigma.0),
1119 &mut ChaCha8Rng::seed_from_u64(seed),
1120 );
1121 distribution.nplus.is_empty() && *distribution.get_nminus() as usize == size
1122 }
1123
1124 #[quickcheck]
1125 fn sample_gaussian_range_copy_test(copy: DNACopy, seed: u64) -> bool {
1126 let range = 4;
1127 let mut min = 1;
1128 let max = copy.get() + range;
1129 if copy.get() > range {
1130 min = copy.get() - range;
1131 }
1132 let distr_vec = vec![copy.get(); 100];
1133 let size = distr_vec.len();
1134 let mut distribution = EcDNADistribution::from(distr_vec);
1135 distribution.sample(
1136 size as u64,
1137 &SamplingStrategy::Gaussian(Sigma::new(1.0)),
1138 &mut ChaCha8Rng::seed_from_u64(seed),
1139 );
1140 distribution
1141 .nplus
1142 .into_iter()
1143 .all(|copy| min <= copy.get() && copy.get() <= max)
1144 }
1145
1146 #[quickcheck]
1147 fn sample_gaussian_test(seed: u64, sigma: SigmaGrZero) -> bool {
1148 let distr_vec = vec![1; 100];
1149 let size = distr_vec.len();
1150 let mut distribution = EcDNADistribution::from(distr_vec);
1151 distribution.sample(
1152 size as u64,
1153 &SamplingStrategy::Gaussian(sigma.0),
1154 &mut ChaCha8Rng::seed_from_u64(seed),
1155 );
1156 let mut found = false;
1157 for copy in distribution.nplus.iter() {
1158 if copy.get() >= 1 {
1159 found = true;
1160 }
1161 }
1162 found
1163 }
1164
1165 #[should_panic]
1166 #[test]
1167 fn lambda_0_small_poisson_test() {
1168 Lambda::new(0.);
1169 }
1170
1171 #[should_panic]
1172 #[test]
1173 fn lambda_too_small_poisson_test() {
1174 Lambda::new(-1.0);
1175 }
1176
1177 #[quickcheck]
1178 fn sample_poisson_test(lambda: LambdaGrZero, seed: u64) -> bool {
1179 let distr_vec = vec![1; 100];
1180 let size = distr_vec.len();
1181 let mut distribution = EcDNADistribution::from(distr_vec);
1182 distribution.sample(
1183 size as u64,
1184 &SamplingStrategy::Poisson(lambda.0),
1185 &mut ChaCha8Rng::seed_from_u64(seed),
1186 );
1187 distribution.get_nminus() + distribution.compute_nplus() == size as u64
1188 }
1189
1190 #[should_panic]
1215 #[test]
1216 fn lambda_0_small_exp_test() {
1217 Lambda::new(0.);
1218 }
1219
1220 #[should_panic]
1221 #[test]
1222 fn lambda_too_small_exp_test() {
1223 Lambda::new(-1.0);
1224 }
1225
1226 #[quickcheck]
1227 fn sample_exp_test(lambda: LambdaGrZero, seed: u64) -> bool {
1228 let distr_vec = vec![1; 100];
1229 let size = distr_vec.len();
1230 let mut distribution = EcDNADistribution::from(distr_vec);
1231 distribution.sample(
1232 size as u64,
1233 &SamplingStrategy::Exponential(lambda.0),
1234 &mut ChaCha8Rng::seed_from_u64(seed),
1235 );
1236 distribution.get_nminus() + distribution.compute_nplus() == size as u64
1237 }
1238
1239 #[derive(Debug, Clone)]
1240 enum SamplingStrategyEnum {
1241 Gaussian,
1242 Poisson,
1243 Exp,
1244 }
1245
1246 impl Arbitrary for SamplingStrategyEnum {
1247 fn arbitrary(g: &mut Gen) -> Self {
1248 match u8::arbitrary(g) % 3 {
1249 0 => Self::Gaussian,
1250 1 => Self::Poisson,
1251 2 => Self::Exp,
1252 _ => panic!(),
1253 }
1254 }
1255 }
1256
1257 #[quickcheck]
1284 fn undersample_full_distribution(
1285 mut distribution: NonEmptyDistribtionWithNPlusCells,
1286 seed: u64,
1287 ) -> bool {
1288 let size = (distribution.0.compute_nplus() + *distribution.0.get_nminus()) as usize;
1289 let mut distribution_sampled = distribution.0.clone();
1290 distribution_sampled.undersample(size as u64, &mut ChaCha8Rng::seed_from_u64(seed));
1291 distribution.0.nplus.sort();
1292 distribution_sampled.nplus.sort();
1293
1294 distribution.0 == distribution_sampled
1295 }
1296
1297 #[quickcheck]
1298 fn undersample_distribution(
1299 size: NonZeroU8,
1300 distribution: NonEmptyDistribtionWithNPlusCells,
1301 seed: u64,
1302 ) -> bool {
1303 let copies_present = HashSet::<u16, RandomState>::from_iter(
1304 distribution
1305 .0
1306 .nplus
1307 .iter()
1308 .map(|&ele| ele.get())
1309 .chain(std::iter::repeat(0u16).take(size.get() as usize)),
1310 );
1311
1312 let distribution = distribution
1313 .0
1314 .undersample(size.get() as u64, &mut ChaCha8Rng::seed_from_u64(seed));
1315 let expected_size = size.get() as u64;
1316 let distr_size = *distribution.get_nminus() + distribution.compute_nplus();
1317
1318 expected_size == distr_size
1319 && distribution
1320 .nplus
1321 .iter()
1322 .all(|&ele| copies_present.contains(&ele.get()))
1323 }
1324
1325 #[test]
1326 #[should_panic]
1327 fn scale_by_zero_test() {
1328 let distribution = EcDNADistribution::from(vec![0, 1, 1, 2, 3, 4]);
1329 distribution.scale_by(0f32);
1330 }
1331
1332 #[test]
1333 #[should_panic]
1334 fn scale_by_nan_test() {
1335 let distribution = EcDNADistribution::from(vec![0, 1, 1, 2, 3, 4]);
1336 distribution.scale_by(f32::NAN);
1337 }
1338
1339 #[test]
1340 #[should_panic]
1341 fn scale_by_inf_test() {
1342 let distribution = EcDNADistribution::from(vec![0, 1, 1, 2, 3, 4]);
1343 distribution.scale_by(f32::INFINITY);
1344 }
1345
1346 #[test]
1347 #[should_panic]
1348 fn scale_by_minus_inf_test() {
1349 let distribution = EcDNADistribution::from(vec![0, 1, 1, 2, 3, 4]);
1350 distribution.scale_by(f32::NEG_INFINITY);
1351 }
1352
1353 #[test]
1354 #[should_panic]
1355 fn scale_by_neg_test() {
1356 let distribution = EcDNADistribution::from(vec![0, 1, 1, 2, 3, 4]);
1357 distribution.scale_by(-0f32);
1358 }
1359
1360 #[test]
1361 fn scale_by_c_greather_than_1_test() {
1362 let mut distribution = EcDNADistribution::from(vec![0, 1, 1, 2, 3, 4]);
1363 distribution = distribution.scale_by(2f32);
1364 let expected = EcDNADistribution::from(vec![0, 1, 1, 1, 2, 2]);
1365 assert_eq!(distribution, expected);
1366 }
1367
1368 #[test]
1369 fn scale_by_c_smaller_than_1_test() {
1370 let mut distribution = EcDNADistribution::from(vec![0, 1, 1, 2, 3, 4]);
1371 distribution = distribution.scale_by(0.5f32);
1372 let expected = EcDNADistribution::from(vec![0, 2, 2, 4, 6, 8]);
1373 assert_eq!(distribution, expected);
1374 }
1375
1376 #[quickcheck]
1377 fn test_drop_cells_with_k_copies(distribution: NonEmptyDistribtionWithNPlusCells) -> bool {
1378 let k = distribution
1379 .0
1380 .nplus
1381 .choose(&mut rand::thread_rng())
1382 .unwrap()
1383 .to_owned();
1384 let filtered_distr = distribution.0.drop_cells_with_k_copies(k);
1385
1386 for cell in filtered_distr.nplus {
1387 if cell == k {
1388 return false;
1389 }
1390 }
1391 true
1392 }
1393
1394 #[quickcheck]
1395 fn test_display(distribution: NonEmptyDistribtionWithNPlusCells) {
1396 println!("{}", distribution.0);
1397 }
1398
1399 #[test]
1400 fn test_empty_distribution() {
1401 let distribution = EcDNADistribution::from(vec![]);
1402 assert!(distribution.is_empty());
1403 assert_eq!(distribution.compute_nplus(), 0);
1404 assert_eq!(distribution.get_nminus(), &0);
1405 }
1406}