1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Copyright 2021, 2022 Jeff Knaggs
// Licensed under the MIT license (http://opensource.org/licenses/MIT)
// This file may not be copied, modified, or distributed
// except according to those terms.

use crate::codec::{Codec, ParseBioErr};
use crate::{Seq, SeqSlice};
use bitvec::prelude::*;
use core::fmt;
use core::hash::{Hash, Hasher};
use core::marker::PhantomData;

/// ## Kmers
///
/// Encoded sequences of fixed length `K`, known at compile time.
///
/// The whole k-mer is packed into a single `usize`, so `K * Codec::WIDTH` must not exceed the
/// number of bits in a `usize` (64 on 64-bit targets). For larger k-mers use `SeqSlice` or
/// `simd::Kmer`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Kmer<C: Codec, const K: usize> {
    /// Zero-sized marker tying the k-mer to its codec `C`; no codec value is stored.
    pub _p: PhantomData<C>,
    /// Packed symbol bits. The first symbol occupies the least significant `Codec::WIDTH` bits,
    /// the next symbol the bits above it, and so on.
    pub bs: usize,
}

/// Wraps a raw `usize` as a k-mer. The value is stored in `bs` unchanged; no validation is
/// performed here.
impl<A: Codec, const K: usize> From<usize> for Kmer<A, K> {
    fn from(i: usize) -> Self {
        Self {
            bs: i,
            _p: PhantomData,
        }
    }
}

impl<A: Codec, const K: usize> From<&Kmer<A, K>> for usize {
    fn from(kmer: &Kmer<A, K>) -> usize {
        kmer.bs
    }
}

impl<A: Codec, const K: usize> From<Kmer<A, K>> for usize {
    fn from(kmer: Kmer<A, K>) -> usize {
        kmer.bs
    }
}

/// Renders the k-mer as text, one item per encoded symbol.
///
/// Symbols are read from the least significant bits of `bs` upward, `A::WIDTH` bits at a
/// time. Each group of bits is decoded with `A::unsafe_from_bits` and converted with
/// `to_char` before being written out.
impl<A: Codec, const K: usize> fmt::Display for Kmer<A, K> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let width = A::WIDTH as usize;
        let bits = BitArray::<usize, Lsb0>::from(self.bs);
        // Only the low `K * width` bits carry symbols; the rest of the word is ignored.
        for chunk in bits[..K * width].chunks(width) {
            // Write each symbol straight into the formatter: this avoids building a temporary
            // `String` plus one extra `String` allocation per symbol.
            write!(f, "{}", A::unsafe_from_bits(chunk.load::<u8>()).to_char())?;
        }
        Ok(())
    }
}

/// The packed bits are hashed first, followed by `K`, so that the length takes part in the
/// hasher state and `hash(kmer!("AAA")) != hash(kmer!("AAAA"))`.
impl<A: Codec, const K: usize> Hash for Kmer<A, K> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        let Self { bs, .. } = self;
        Hash::hash(bs, state);
        Hash::hash(&K, state);
    }
}

/// Converts an owned sequence into a k-mer.
///
/// # Errors
///
/// Returns `ParseBioErr` unless the sequence holds exactly `K` symbols.
impl<A: Codec, const K: usize> TryFrom<Seq<A>> for Kmer<A, K> {
    type Error = ParseBioErr;

    fn try_from(seq: Seq<A>) -> Result<Self, Self::Error> {
        // Reject any length mismatch up front so the slice conversion below cannot panic.
        if seq.len() != K {
            return Err(ParseBioErr);
        }
        let window = &seq[0..K];
        Ok(Self::from(window))
    }
}

/// Packs a sequence slice into a k-mer via the slice's own conversion to `usize`.
///
/// # Panics
///
/// Panics if the slice does not contain exactly `K` symbols.
impl<A: Codec, const K: usize> From<&SeqSlice<A>> for Kmer<A, K> {
    fn from(slice: &SeqSlice<A>) -> Self {
        assert_eq!(K, slice.len());
        let bs: usize = slice.into();
        Self {
            bs,
            _p: PhantomData,
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::codec::amino::Amino;
    use crate::codec::dna::Dna;
    use crate::Seq;
    use core::str::FromStr;

    /// Each DNA 2-mer of "AACTT" converts to the integer formed by its packed symbol bits.
    #[test]
    fn kmer_to_usize() {
        let expected: [usize; 4] = [0, 4, 13, 15];
        for (kmer, want) in dna!("AACTT").kmers::<2>().zip(expected) {
            let got: usize = (&kmer).into();
            assert_eq!(want, got);
        }
    }

    /// Amino-acid 2-mers of "SRY" convert to the expected packed bit patterns.
    #[test]
    fn amino_kmer_to_usize() {
        let expected: [usize; 2] = [0b001000_011000, 0b010011_001000];
        for (kmer, want) in amino!("SRY").kmers::<2>().zip(expected) {
            let got: usize = (&kmer).into();
            assert_eq!(want, got);
        }
    }

    /// Iterating 3-mers over an amino-acid sequence yields every overlapping window, in order,
    /// and each window displays as the matching substring.
    #[test]
    fn amino_kmer_iter() {
        let expected = ["SSL", "SLM", "LMN", "MNH", "NHK", "HKK", "KKL"];
        for (kmer, target) in amino!("SSLMNHKKL").kmers::<3>().zip(expected) {
            assert_eq!(kmer.to_string(), target);
        }
    }
}