finch/
statistics.rs

1use std::cmp;
2
3use crate::sketch_schemes::KmerCount;
4
5pub fn cardinality(sketch: &[KmerCount]) -> Result<u64, &'static str> {
6    // Other (possibly more accurate) possibilities:
7    // "hyper log-log" estimate from lowest value?
8    // multiset distribution applied to total count number?
9    // "AKMV" approach: http://people.mpi-inf.mpg.de/~rgemulla/publications/beyer07distinct.pdf
10
11    // fast and simple k-minimum value estimate
12    // https://research.neustar.biz/2012/07/09/sketch-of-the-day-k-minimum-values/
13    if sketch.is_empty() {
14        return Ok(0u64);
15    }
16    Ok(((sketch.len() - 1) as f32
17        / (sketch.last().unwrap().hash as f32 / usize::max_value() as f32)) as u64)
18}
19
20/// Generates a Vec of numbers of kmers for each coverage level
21///
22/// For example, a size 1000 sketch of the same genome repeated 5 times (e.g. 5x coverage) should
23/// produce a "histogram" like [0, 0, 0, 0, 1000] (assuming no repetative kmers in the genome)
24///
25pub fn hist(sketch: &[KmerCount]) -> Vec<u64> {
26    let mut counts = vec![0u64; 65536];
27    let mut max_count: u64 = 0;
28    for kmer in sketch {
29        max_count = cmp::max(max_count, u64::from(kmer.count));
30        counts[kmer.count as usize - 1] += 1;
31    }
32    counts.truncate(max_count as usize);
33    counts
34}
35
36#[test]
37fn test_hist() {
38    let sketch = vec![
39        KmerCount {
40            hash: 1,
41            kmer: vec![],
42            count: 1,
43            extra_count: 0,
44            label: None,
45        },
46        KmerCount {
47            hash: 2,
48            kmer: vec![],
49            count: 1,
50            extra_count: 0,
51            label: None,
52        },
53        KmerCount {
54            hash: 3,
55            kmer: vec![],
56            count: 1,
57            extra_count: 0,
58            label: None,
59        },
60    ];
61
62    let hist_data = hist(&sketch);
63    assert_eq!(hist_data.len(), 1);
64    assert_eq!(hist_data[0], 3);
65
66    let sketch = vec![
67        KmerCount {
68            hash: 1,
69            kmer: vec![],
70            count: 4,
71            extra_count: 0,
72            label: None,
73        },
74        KmerCount {
75            hash: 2,
76            kmer: vec![],
77            count: 2,
78            extra_count: 0,
79            label: None,
80        },
81        KmerCount {
82            hash: 3,
83            kmer: vec![],
84            count: 4,
85            extra_count: 0,
86            label: None,
87        },
88        KmerCount {
89            hash: 4,
90            kmer: vec![],
91            count: 3,
92            extra_count: 0,
93            label: None,
94        },
95    ];
96
97    let hist_data = hist(&sketch);
98    assert_eq!(hist_data.len(), 4);
99    assert_eq!(hist_data[0], 0);
100    assert_eq!(hist_data[1], 1);
101    assert_eq!(hist_data[2], 1);
102    assert_eq!(hist_data[3], 2);
103}