use crate::encoding;
use crate::kmer::{Kmer, KmerBits};
use crate::offsets::OffsetsVector;
use crate::spectrum_preserving_string_set::SpectrumPreservingStringSet;
use anyhow::Result;
/// Incrementally packs DNA sequences into a 2-bit-per-base byte buffer and
/// tracks per-sequence boundaries, for k-mer length `K`.
///
/// Sequences are added with `add_sequence`; the accumulated buffer and
/// offsets are handed to `SpectrumPreservingStringSet::from_parts` by `build`.
pub struct Encoder<const K: usize>
where
Kmer<K>: KmerBits,
{
/// Packed base codes: each base occupies a 2-bit slot, four bases per byte,
/// with the first base of a byte in the lowest bits.
strings: Vec<u8>,
/// Receives the running `total_bases` value after each accepted sequence.
offsets: OffsetsVector,
/// Sum of `len - K + 1` over all accepted sequences.
num_kmers: u64,
/// Number of accepted sequences (those with at least `K` bases).
num_strings: u64,
/// Number of bases written into `strings` so far.
total_bases: u64,
}
impl<const K: usize> Encoder<K>
where
    Kmer<K>: KmerBits,
{
    /// Creates an empty encoder: no packed bases, no recorded sequences and
    /// zeroed counters.
    pub fn new() -> Self {
        Self {
            strings: Vec::new(),
            offsets: OffsetsVector::new(),
            num_kmers: 0,
            num_strings: 0,
            total_bases: 0,
        }
    }

    /// Appends `sequence` to the packed buffer and records its end offset.
    ///
    /// Sequences shorter than `K` contain no k-mer and are silently skipped:
    /// the call returns `Ok(())` without touching any state.
    ///
    /// The whole sequence is validated before anything is written, so a
    /// failed call leaves the encoder exactly as it was.
    ///
    /// # Errors
    ///
    /// Returns an error naming the position and character of the first base
    /// that `encoding::encode_base` rejects.
    pub fn add_sequence(&mut self, sequence: &[u8]) -> Result<()> {
        let seq_len = sequence.len();
        if seq_len < K {
            return Ok(());
        }

        // Encode everything up front. Packing while validating would leave
        // stray bases in `strings`/`total_bases` (with no matching offset)
        // whenever an invalid base shows up part-way through the sequence.
        let codes = sequence
            .iter()
            .enumerate()
            .map(|(i, &base)| {
                encoding::encode_base(base).map_err(|_| {
                    anyhow::anyhow!("Invalid base at position {}: {:?}", i, base as char)
                })
            })
            .collect::<Result<Vec<_>>>()?;

        // Commit phase: nothing below can fail.
        for code in codes {
            let base_idx = self.total_bases as usize;
            let byte_idx = base_idx / 4;
            // Four 2-bit slots per byte, first base in the lowest bits.
            let bit_offset = (base_idx % 4) * 2;
            if byte_idx >= self.strings.len() {
                self.strings.push(0);
            }
            self.strings[byte_idx] |= code << bit_offset;
            self.total_bases += 1;
        }

        // The offset stored for a sequence is the base count after it.
        self.offsets.push(self.total_bases);

        // `seq_len >= K` is guaranteed by the early return above.
        self.num_kmers += (seq_len - K + 1) as u64;
        self.num_strings += 1;
        Ok(())
    }

    /// Total number of k-mers across all accepted sequences.
    pub fn num_kmers(&self) -> u64 {
        self.num_kmers
    }

    /// Number of sequences accepted so far (skipped short ones not counted).
    pub fn num_strings(&self) -> u64 {
        self.num_strings
    }

    /// Consumes the encoder and builds the string set from the packed bases
    /// and offsets.
    ///
    /// `K` and `m` are forwarded unchanged to
    /// `SpectrumPreservingStringSet::from_parts`.
    // NOTE(review): `m` is presumably the minimizer length — confirm against
    // `SpectrumPreservingStringSet::from_parts`.
    pub fn build(self, m: usize) -> SpectrumPreservingStringSet {
        SpectrumPreservingStringSet::from_parts(self.strings, self.offsets, K, m)
    }
}
impl<const K: usize> Default for Encoder<K>
where
Kmer<K>: KmerBits,
{
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh encoder reports zero k-mers and zero strings.
    #[test]
    fn test_encoder_creation() {
        let enc = Encoder::<31>::new();
        assert_eq!(enc.num_kmers(), 0);
        assert_eq!(enc.num_strings(), 0);
    }

    /// One 8-base sequence with K = 7 yields 8 - 7 + 1 = 2 k-mers.
    #[test]
    fn test_encoder_add_sequence() {
        let mut enc = Encoder::<7>::new();
        enc.add_sequence(b"ACGTACGT").unwrap();
        assert_eq!(enc.num_strings(), 1);
        assert_eq!(enc.num_kmers(), 2);
    }

    /// A sequence shorter than K is accepted without error but not recorded.
    #[test]
    fn test_encoder_skip_short_sequence() {
        let mut enc = Encoder::<31>::new();
        enc.add_sequence(b"ACGT").unwrap();
        assert_eq!(enc.num_strings(), 0);
        assert_eq!(enc.num_kmers(), 0);
    }

    /// With K = 5: 8 bases -> 4 k-mers, 4 bases -> skipped, 7 bases -> 3 k-mers.
    #[test]
    fn test_encoder_multiple_sequences() {
        let mut enc = Encoder::<5>::new();
        let inputs: [&[u8]; 3] = [b"ACGTACGT", b"TGCA", b"AAAAAAA"];
        for seq in inputs.iter() {
            enc.add_sequence(seq).unwrap();
        }
        assert_eq!(enc.num_strings(), 2);
        assert_eq!(enc.num_kmers(), 7);
    }

    /// Building after two 8-base sequences exposes 2 strings and 16 bases.
    #[test]
    fn test_encoder_build_spss() {
        let mut enc = Encoder::<7>::new();
        enc.add_sequence(b"ACGTACGT").unwrap();
        enc.add_sequence(b"TGCATGCA").unwrap();
        let built = enc.build(5);
        assert_eq!(built.num_strings(), 2);
        assert_eq!(built.total_bases(), 16);
    }
}