seqtkrs 0.1.1

A Rust reimplementation of seqtk, a fast and lightweight tool for processing biological sequences in FASTA/FASTQ format
Documentation
use std::sync::LazyLock;

/// 查找表集合,用于高效的碱基编码转换
///
/// # 说明
/// 所有查找表都是256元素的数组,提供O(1)的查找性能。
/// 通过全局单例 `LOOKUP_TABLES` 访问。
pub struct LookupTables {
    /// seq_nt16_table: ASCII -> 4位IUPAC编码
    /// A=1, C=2, G=4, T=8, 其他模糊碱基是这些的组合,N=15
    pub nt16: [u8; 256],

    /// seq_nt6_table: ASCII -> 简化编码
    /// A=1, C=2, G=3, T=4, N=5, other=0
    pub nt6: [u8; 256],

    /// comp_tab: 碱基互补表
    /// A<->T, C<->G, 支持IUPAC模糊碱基
    pub comp: [u8; 256],

    /// bitcnt_table: 4位编码中1的个数(用于计算碱基歧义度)
    pub bitcnt: [u8; 16],

    /// nt16_str: 4位编码 -> ASCII字符
    pub nt16_str: [u8; 16],
}

impl LookupTables {
    /// 创建查找表实例
    pub fn new() -> Self {
        Self {
            nt16: init_nt16_table(),
            nt6: init_nt6_table(),
            comp: init_comp_table(),
            bitcnt: init_bitcnt_table(),
            nt16_str: init_nt16_str_table(),
        }
    }
}

impl Default for LookupTables {
    fn default() -> Self {
        Self::new()
    }
}

/// 初始化 seq_nt16_table
///
/// # 编码规则
/// - A=1 (0001), C=2 (0010), G=4 (0100), T=8 (1000)
/// - U视为T (RNA序列)
/// - IUPAC模糊碱基:R=A|G=5, Y=C|T=10, M=A|C=3, K=G|T=12, 等
/// - N=15 (1111,表示任意碱基)
/// - 未知字符=15
fn init_nt16_table() -> [u8; 256] {
    let mut table = [15u8; 256];

    // 大写碱基
    table[b'A' as usize] = 1;
    table[b'C' as usize] = 2;
    table[b'G' as usize] = 4;
    table[b'T' as usize] = 8;
    table[b'U' as usize] = 8; // RNA中U等同于T

    // IUPAC模糊碱基编码
    table[b'R' as usize] = 5; // A or G (puRine)
    table[b'Y' as usize] = 10; // C or T (pYrimidine)
    table[b'M' as usize] = 3; // A or C (aMino)
    table[b'K' as usize] = 12; // G or T (Keto)
    table[b'S' as usize] = 6; // C or G (Strong)
    table[b'W' as usize] = 9; // A or T (Weak)
    table[b'H' as usize] = 11; // A or C or T (not G)
    table[b'B' as usize] = 14; // C or G or T (not A)
    table[b'V' as usize] = 7; // A or C or G (not T)
    table[b'D' as usize] = 13; // A or G or T (not C)
    table[b'N' as usize] = 15; // Any

    // 小写碱基(与大写相同)
    table[b'a' as usize] = 1;
    table[b'c' as usize] = 2;
    table[b'g' as usize] = 4;
    table[b't' as usize] = 8;
    table[b'u' as usize] = 8;
    table[b'r' as usize] = 5;
    table[b'y' as usize] = 10;
    table[b'm' as usize] = 3;
    table[b'k' as usize] = 12;
    table[b's' as usize] = 6;
    table[b'w' as usize] = 9;
    table[b'h' as usize] = 11;
    table[b'b' as usize] = 14;
    table[b'v' as usize] = 7;
    table[b'd' as usize] = 13;
    table[b'n' as usize] = 15;

    table
}

/// 初始化 seq_nt6_table
///
/// # 编码规则
/// - A=1, C=2, G=3, T=4, N=5
/// - 其他字符=0
fn init_nt6_table() -> [u8; 256] {
    let mut table = [0u8; 256];

    table[b'A' as usize] = 1;
    table[b'C' as usize] = 2;
    table[b'G' as usize] = 3;
    table[b'T' as usize] = 4;
    table[b'N' as usize] = 5;
    table[b'a' as usize] = 1;
    table[b'c' as usize] = 2;
    table[b'g' as usize] = 3;
    table[b't' as usize] = 4;
    table[b'n' as usize] = 5;

    table
}

/// 初始化碱基互补表
///
/// # 互补规则
/// - A <-> T, C <-> G
/// - IUPAC模糊碱基也有对应的互补关系
/// - S和W自互补(CG和AT)
/// - 未定义的字符保持不变
fn init_comp_table() -> [u8; 256] {
    let mut table = [0u8; 256];

    // 默认:字符映射到自身
    for (i, item) in table.iter_mut().enumerate() {
        *item = i as u8;
    }

    // 基本碱基互补
    table[b'A' as usize] = b'T';
    table[b'C' as usize] = b'G';
    table[b'G' as usize] = b'C';
    table[b'T' as usize] = b'A';
    table[b'a' as usize] = b't';
    table[b'c' as usize] = b'g';
    table[b'g' as usize] = b'c';
    table[b't' as usize] = b'a';

    // IUPAC模糊碱基的互补
    table[b'R' as usize] = b'Y'; // AG -> CT
    table[b'Y' as usize] = b'R'; // CT -> AG
    table[b'M' as usize] = b'K'; // AC -> GT
    table[b'K' as usize] = b'M'; // GT -> AC
    table[b'S' as usize] = b'S'; // CG -> CG (自互补)
    table[b'W' as usize] = b'W'; // AT -> AT (自互补)
    table[b'H' as usize] = b'D'; // ACT -> AGT
    table[b'B' as usize] = b'V'; // CGT -> ACG
    table[b'V' as usize] = b'B'; // ACG -> CGT
    table[b'D' as usize] = b'H'; // AGT -> ACT
    table[b'N' as usize] = b'N'; // N -> N

    table[b'r' as usize] = b'y';
    table[b'y' as usize] = b'r';
    table[b'm' as usize] = b'k';
    table[b'k' as usize] = b'm';
    table[b's' as usize] = b's';
    table[b'w' as usize] = b'w';
    table[b'h' as usize] = b'd';
    table[b'b' as usize] = b'v';
    table[b'v' as usize] = b'b';
    table[b'd' as usize] = b'h';
    table[b'n' as usize] = b'n';

    table
}

/// 初始化 bitcnt 表
///
/// # 说明
/// 计算4位二进制数中1的个数,用于判断碱基歧义度
fn init_bitcnt_table() -> [u8; 16] {
    let mut table = [0u8; 16];
    for (i, item) in table.iter_mut().enumerate() {
        *item = i.count_ones() as u8;
    }
    table
}

/// 初始化 nt16_str 表
///
/// # 说明
/// 将4位编码转换回ASCII字符
fn init_nt16_str_table() -> [u8; 16] {
    [
        b'=', b'A', b'C', b'M', b'G', b'R', b'S', b'V', b'T', b'W', b'Y', b'H', b'K', b'D', b'B',
        b'N',
    ]
}

/// 全局查找表单例
///
/// # 使用示例
/// ```
/// use seqtkrs::core::tables::LOOKUP_TABLES;
/// let comp_base = LOOKUP_TABLES.comp[b'A' as usize]; // 返回 b'T'
/// ```
pub static LOOKUP_TABLES: LazyLock<LookupTables> = LazyLock::new(LookupTables::new);

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_nt16_table() {
        let tables = &LOOKUP_TABLES;

        // 测试基本碱基
        assert_eq!(tables.nt16[b'A' as usize], 1);
        assert_eq!(tables.nt16[b'C' as usize], 2);
        assert_eq!(tables.nt16[b'G' as usize], 4);
        assert_eq!(tables.nt16[b'T' as usize], 8);

        // 测试小写
        assert_eq!(tables.nt16[b'a' as usize], 1);
        assert_eq!(tables.nt16[b'c' as usize], 2);

        // 测试模糊碱基
        assert_eq!(tables.nt16[b'R' as usize], 5); // A|G
        assert_eq!(tables.nt16[b'Y' as usize], 10); // C|T
        assert_eq!(tables.nt16[b'N' as usize], 15);
    }

    #[test]
    fn test_nt6_table() {
        let tables = &LOOKUP_TABLES;

        assert_eq!(tables.nt6[b'A' as usize], 1);
        assert_eq!(tables.nt6[b'C' as usize], 2);
        assert_eq!(tables.nt6[b'G' as usize], 3);
        assert_eq!(tables.nt6[b'T' as usize], 4);
        assert_eq!(tables.nt6[b'N' as usize], 5);
        assert_eq!(tables.nt6[b'X' as usize], 0); // 未知字符
    }

    #[test]
    fn test_comp_table() {
        let tables = &LOOKUP_TABLES;

        // 基本互补
        assert_eq!(tables.comp[b'A' as usize], b'T');
        assert_eq!(tables.comp[b'T' as usize], b'A');
        assert_eq!(tables.comp[b'G' as usize], b'C');
        assert_eq!(tables.comp[b'C' as usize], b'G');

        // 小写
        assert_eq!(tables.comp[b'a' as usize], b't');
        assert_eq!(tables.comp[b'g' as usize], b'c');

        // 模糊碱基
        assert_eq!(tables.comp[b'R' as usize], b'Y');
        assert_eq!(tables.comp[b'S' as usize], b'S'); // 自互补
    }

    #[test]
    fn test_bitcnt_table() {
        let tables = &LOOKUP_TABLES;

        assert_eq!(tables.bitcnt[0b0000], 0);
        assert_eq!(tables.bitcnt[0b0001], 1);
        assert_eq!(tables.bitcnt[0b0011], 2);
        assert_eq!(tables.bitcnt[0b1111], 4);
    }

    #[test]
    fn test_nt16_str_table() {
        let tables = &LOOKUP_TABLES;

        assert_eq!(tables.nt16_str[1], b'A');
        assert_eq!(tables.nt16_str[2], b'C');
        assert_eq!(tables.nt16_str[4], b'G');
        assert_eq!(tables.nt16_str[8], b'T');
        assert_eq!(tables.nt16_str[15], b'N');
    }

    #[test]
    fn test_reverse_complement_logic() {
        let tables = &LOOKUP_TABLES;

        // 测试反向互补的逻辑
        let seq = b"ACGT";
        let mut rev_comp = Vec::new();

        for &base in seq.iter().rev() {
            rev_comp.push(tables.comp[base as usize]);
        }

        assert_eq!(rev_comp, b"ACGT"); // ACGT的反向互补是自身
    }
}