1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
use std::cmp;

use crate::sketch_schemes::KmerCount;

pub fn cardinality(sketch: &[KmerCount]) -> Result<u64, &'static str> {
    // Other (possibly more accurate) possibilities:
    // "hyper log-log" estimate from lowest value?
    // multiset distribution applied to total count number?
    // "AKMV" approach: http://people.mpi-inf.mpg.de/~rgemulla/publications/beyer07distinct.pdf

    // fast and simple k-minimum value estimate
    // https://research.neustar.biz/2012/07/09/sketch-of-the-day-k-minimum-values/
    if sketch.is_empty() {
        return Ok(0u64);
    }
    Ok(((sketch.len() - 1) as f32
        / (sketch.last().unwrap().hash as f32 / usize::max_value() as f32)) as u64)
}

/// Generates a Vec of numbers of kmers for each coverage level
///
/// For example, a size 1000 sketch of the same genome repeated 5 times (e.g. 5x coverage) should
/// produce a "histogram" like [0, 0, 0, 0, 1000] (assuming no repetative kmers in the genome)
///
pub fn hist(sketch: &[KmerCount]) -> Vec<u64> {
    let mut counts = vec![0u64; 65536];
    let mut max_count: u64 = 0;
    for kmer in sketch {
        max_count = cmp::max(max_count, u64::from(kmer.count));
        counts[kmer.count as usize - 1] += 1;
    }
    counts.truncate(max_count as usize);
    counts
}

#[test]
fn test_hist() {
    let sketch = vec![
        KmerCount {
            hash: 1,
            kmer: vec![],
            count: 1,
            extra_count: 0,
            label: None,
        },
        KmerCount {
            hash: 2,
            kmer: vec![],
            count: 1,
            extra_count: 0,
            label: None,
        },
        KmerCount {
            hash: 3,
            kmer: vec![],
            count: 1,
            extra_count: 0,
            label: None,
        },
    ];

    let hist_data = hist(&sketch);
    assert_eq!(hist_data.len(), 1);
    assert_eq!(hist_data[0], 3);

    let sketch = vec![
        KmerCount {
            hash: 1,
            kmer: vec![],
            count: 4,
            extra_count: 0,
            label: None,
        },
        KmerCount {
            hash: 2,
            kmer: vec![],
            count: 2,
            extra_count: 0,
            label: None,
        },
        KmerCount {
            hash: 3,
            kmer: vec![],
            count: 4,
            extra_count: 0,
            label: None,
        },
        KmerCount {
            hash: 4,
            kmer: vec![],
            count: 3,
            extra_count: 0,
            label: None,
        },
    ];

    let hist_data = hist(&sketch);
    assert_eq!(hist_data.len(), 4);
    assert_eq!(hist_data[0], 0);
    assert_eq!(hist_data[1], 1);
    assert_eq!(hist_data[2], 1);
    assert_eq!(hist_data[3], 2);
}