seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
//! k-mer处理模块
//!
//! 提供k-mer相关功能:
//! - k-mer编码和解码
//! - k-mer频率统计
//! - k-mer迭代器

use rustc_hash::FxHashMap;
use std::collections::HashMap;

/// k-mer计数器
///
/// 统计序列中所有k-mer的出现频率
///
/// # 示例
///
/// ```
/// use seqtkrs::algorithms::kmer::KmerCounter;
///
/// let seq = b"ACGTACGT";
/// let mut counter = KmerCounter::new(3);  // 3-mer
///
/// counter.count_sequence(seq);
///
/// // 获取所有k-mer的频率
/// for (kmer, count) in counter.iter() {
///     println!("{}: {}", String::from_utf8_lossy(kmer), count);
/// }
/// ```
pub struct KmerCounter {
    k: usize,
    counts: FxHashMap<Vec<u8>, usize>,
}

impl KmerCounter {
    /// 创建新的k-mer计数器
    ///
    /// # 参数
    ///
    /// * `k` - k-mer长度
    pub fn new(k: usize) -> Self {
        Self {
            k,
            counts: FxHashMap::default(),
        }
    }

    /// 统计序列中的k-mer
    ///
    /// # 参数
    ///
    /// * `seq` - 输入序列
    pub fn count_sequence(&mut self, seq: &[u8]) {
        if seq.len() < self.k {
            return;
        }

        for i in 0..=(seq.len() - self.k) {
            let kmer = &seq[i..i + self.k];
            *self.counts.entry(kmer.to_vec()).or_insert(0) += 1;
        }
    }

    /// 获取特定k-mer的计数
    pub fn get_count(&self, kmer: &[u8]) -> usize {
        self.counts.get(kmer).copied().unwrap_or(0)
    }

    /// 获取所有k-mer及其计数的迭代器
    pub fn iter(&self) -> impl Iterator<Item = (&Vec<u8>, &usize)> {
        self.counts.iter()
    }

    /// 获取不同k-mer的总数
    pub fn num_distinct_kmers(&self) -> usize {
        self.counts.len()
    }

    /// 获取k-mer总出现次数
    pub fn total_count(&self) -> usize {
        self.counts.values().sum()
    }

    /// 清空计数器
    pub fn clear(&mut self) {
        self.counts.clear();
    }

    /// 获取最常见的k个k-mer
    ///
    /// # 参数
    ///
    /// * `top_k` - 返回的k-mer数量
    ///
    /// # 返回
    ///
    /// 返回按频率降序排列的k-mer列表,每个元素为(k-mer, count)
    pub fn top_kmers(&self, top_k: usize) -> Vec<(Vec<u8>, usize)> {
        let mut items: Vec<_> = self
            .counts
            .iter()
            .map(|(kmer, &count)| (kmer.clone(), count))
            .collect();

        items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
        items.truncate(top_k);
        items
    }
}

/// k-mer迭代器
///
/// 遍历序列中的所有k-mer
///
/// # 示例
///
/// ```
/// use seqtkrs::algorithms::kmer::KmerIterator;
///
/// let seq = b"ACGTACGT";
/// let kmers: Vec<_> = KmerIterator::new(seq, 3).collect();
///
/// assert_eq!(kmers.len(), 6);  // 8-3+1=6个3-mer
/// ```
pub struct KmerIterator<'a> {
    seq: &'a [u8],
    k: usize,
    pos: usize,
}

impl<'a> KmerIterator<'a> {
    /// 创建新的k-mer迭代器
    ///
    /// # 参数
    ///
    /// * `seq` - 输入序列
    /// * `k` - k-mer长度
    pub fn new(seq: &'a [u8], k: usize) -> Self {
        Self { seq, k, pos: 0 }
    }
}

impl<'a> Iterator for KmerIterator<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        if self.pos + self.k <= self.seq.len() {
            let kmer = &self.seq[self.pos..self.pos + self.k];
            self.pos += 1;
            Some(kmer)
        } else {
            None
        }
    }
}

/// 反向互补k-mer
///
/// # 参数
///
/// * `kmer` - 输入k-mer
/// * `comp_table` - 互补碱基查找表
///
/// # 返回
///
/// 返回反向互补后的k-mer
pub fn reverse_complement_kmer(kmer: &[u8], comp_table: &[u8; 256]) -> Vec<u8> {
    kmer.iter()
        .rev()
        .map(|&base| comp_table[base as usize])
        .collect()
}

/// 规范化k-mer(返回字典序较小的那个)
///
/// 对于每个k-mer,返回它和它的反向互补中字典序较小的那个,
/// 这样可以将正链和负链的相同k-mer视为同一个。
///
/// # 参数
///
/// * `kmer` - 输入k-mer
/// * `comp_table` - 互补碱基查找表
///
/// # 返回
///
/// 返回规范化的k-mer
pub fn canonical_kmer(kmer: &[u8], comp_table: &[u8; 256]) -> Vec<u8> {
    let rc = reverse_complement_kmer(kmer, comp_table);
    if kmer < rc.as_slice() {
        kmer.to_vec()
    } else {
        rc
    }
}

/// k-mer频率分布统计
///
/// 统计k-mer出现频率的分布,即有多少个k-mer出现1次、2次、3次等
///
/// # 示例
///
/// ```
/// use seqtkrs::algorithms::kmer::kmer_frequency_distribution;
/// use rustc_hash::FxHashMap;
///
/// let mut counts = FxHashMap::default();
/// counts.insert(b"ACG".to_vec(), 1);
/// counts.insert(b"CGT".to_vec(), 2);
/// counts.insert(b"GTA".to_vec(), 2);
///
/// let dist = kmer_frequency_distribution(&counts);
///
/// assert_eq!(dist.get(&1), Some(&1));  // 1个k-mer出现1次
/// assert_eq!(dist.get(&2), Some(&2));  // 2个k-mer出现2次
/// ```
pub fn kmer_frequency_distribution(counts: &FxHashMap<Vec<u8>, usize>) -> HashMap<usize, usize> {
    let mut distribution = HashMap::new();

    for &count in counts.values() {
        *distribution.entry(count).or_insert(0) += 1;
    }

    distribution
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_kmer_counter_basic() {
        let seq = b"ACGTACGT";
        let mut counter = KmerCounter::new(3);

        counter.count_sequence(seq);

        // "ACGTACGT"的3-mer: ACG, CGT, GTA, TAC, ACG, CGT
        // ACG出现2次,CGT出现2次,GTA出现1次,TAC出现1次
        assert_eq!(counter.get_count(b"ACG"), 2);
        assert_eq!(counter.get_count(b"CGT"), 2);
        assert_eq!(counter.get_count(b"GTA"), 1);
        assert_eq!(counter.get_count(b"TAC"), 1);
        assert_eq!(counter.num_distinct_kmers(), 4);
        assert_eq!(counter.total_count(), 6);
    }

    #[test]
    fn test_kmer_counter_short_seq() {
        let seq = b"AC"; // 太短,无法生成3-mer
        let mut counter = KmerCounter::new(3);

        counter.count_sequence(seq);

        assert_eq!(counter.num_distinct_kmers(), 0);
        assert_eq!(counter.total_count(), 0);
    }

    #[test]
    fn test_kmer_counter_top_kmers() {
        let seq = b"ACGTACGTACG";
        let mut counter = KmerCounter::new(3);

        counter.count_sequence(seq);

        let top = counter.top_kmers(2);

        // ACG和CGT应该是最常见的
        assert_eq!(top.len(), 2);
        assert!(top[0].1 >= top[1].1); // 按频率降序
    }

    #[test]
    fn test_kmer_iterator() {
        let seq = b"ACGT";
        let kmers: Vec<_> = KmerIterator::new(seq, 2).collect();

        assert_eq!(kmers.len(), 3); // AC, CG, GT
        assert_eq!(kmers[0], b"AC");
        assert_eq!(kmers[1], b"CG");
        assert_eq!(kmers[2], b"GT");
    }

    #[test]
    fn test_kmer_iterator_empty() {
        let seq = b"A";
        let kmers: Vec<_> = KmerIterator::new(seq, 3).collect();

        assert_eq!(kmers.len(), 0); // 序列太短
    }

    #[test]
    fn test_reverse_complement_kmer() {
        use crate::core::tables::LOOKUP_TABLES;

        let kmer = b"ACGT";
        let rc = reverse_complement_kmer(kmer, &LOOKUP_TABLES.comp);

        assert_eq!(rc, b"ACGT"); // ACGT的反向互补是ACGT

        let kmer2 = b"AAAA";
        let rc2 = reverse_complement_kmer(kmer2, &LOOKUP_TABLES.comp);

        assert_eq!(rc2, b"TTTT");
    }

    #[test]
    fn test_canonical_kmer() {
        use crate::core::tables::LOOKUP_TABLES;

        // ACG的反向互补是CGT,ACG字典序小,所以返回ACG
        let kmer1 = b"ACG";
        let canonical1 = canonical_kmer(kmer1, &LOOKUP_TABLES.comp);
        assert_eq!(canonical1, b"ACG");

        // CGT的反向互补是ACG,ACG字典序小,所以返回ACG
        let kmer2 = b"CGT";
        let canonical2 = canonical_kmer(kmer2, &LOOKUP_TABLES.comp);
        assert_eq!(canonical2, b"ACG");

        // 两个互补的k-mer应该返回相同的规范形式
        assert_eq!(canonical1, canonical2);
    }

    #[test]
    fn test_kmer_frequency_distribution() {
        let mut counts = FxHashMap::default();
        counts.insert(b"ACG".to_vec(), 1);
        counts.insert(b"CGT".to_vec(), 2);
        counts.insert(b"GTA".to_vec(), 2);
        counts.insert(b"TAC".to_vec(), 3);

        let dist = kmer_frequency_distribution(&counts);

        assert_eq!(dist.get(&1), Some(&1)); // 1个k-mer出现1次
        assert_eq!(dist.get(&2), Some(&2)); // 2个k-mer出现2次
        assert_eq!(dist.get(&3), Some(&1)); // 1个k-mer出现3次
    }

    #[test]
    fn test_kmer_counter_clear() {
        let seq = b"ACGT";
        let mut counter = KmerCounter::new(2);

        counter.count_sequence(seq);
        assert!(counter.num_distinct_kmers() > 0);

        counter.clear();
        assert_eq!(counter.num_distinct_kmers(), 0);
        assert_eq!(counter.total_count(), 0);
    }
}