sshash_lib/builder/
encode.rs1use crate::encoding;
7use crate::kmer::{Kmer, KmerBits};
8use crate::offsets::OffsetsVector;
9use crate::spectrum_preserving_string_set::SpectrumPreservingStringSet;
10use anyhow::Result;
11
12pub struct Encoder<const K: usize>
21where
22 Kmer<K>: KmerBits,
23{
24 strings: Vec<u8>,
26
27 offsets: OffsetsVector,
29
30 num_kmers: u64,
32
33 num_strings: u64,
35
36 total_bases: u64,
38}
39
40impl<const K: usize> Encoder<K>
41where
42 Kmer<K>: KmerBits,
43{
44 pub fn new() -> Self {
46 Self {
47 strings: Vec::new(),
48 offsets: OffsetsVector::new(), num_kmers: 0,
50 num_strings: 0,
51 total_bases: 0,
52 }
53 }
54
55 pub fn add_sequence(&mut self, sequence: &[u8]) -> Result<()> {
66 let seq_len = sequence.len();
67
68 if seq_len < K {
70 return Ok(());
71 }
72
73 for (i, &base) in sequence.iter().enumerate() {
75 let encoded = encoding::encode_base(base).map_err(|_| {
76 anyhow::anyhow!("Invalid base at position {}: {:?}", i, base as char)
77 })?;
78
79 let base_idx = self.total_bases as usize;
80 let byte_idx = base_idx / 4;
81 let bit_offset = (base_idx % 4) * 2;
82
83 if byte_idx >= self.strings.len() {
85 self.strings.push(0);
86 }
87
88 self.strings[byte_idx] |= encoded << bit_offset;
89 self.total_bases += 1;
90 }
91
92 self.offsets.push(self.total_bases);
94
95 let kmers_in_string = if seq_len >= K {
97 (seq_len - K + 1) as u64
98 } else {
99 0
100 };
101
102 self.num_kmers += kmers_in_string;
103 self.num_strings += 1;
104
105 Ok(())
106 }
107
108 pub fn num_kmers(&self) -> u64 {
110 self.num_kmers
111 }
112
113 pub fn num_strings(&self) -> u64 {
115 self.num_strings
116 }
117
118 pub fn build(self, m: usize) -> SpectrumPreservingStringSet {
122 SpectrumPreservingStringSet::from_parts(
123 self.strings,
124 self.offsets,
125 K,
126 m,
127 )
128 }
129}
130
131impl<const K: usize> Default for Encoder<K>
132where
133 Kmer<K>: KmerBits,
134{
135 fn default() -> Self {
136 Self::new()
137 }
138}
139
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed encoder reports zero k-mers and zero strings.
    #[test]
    fn test_encoder_creation() {
        let fresh = Encoder::<31>::new();
        assert_eq!(fresh.num_kmers(), 0);
        assert_eq!(fresh.num_strings(), 0);
    }

    /// One 8-base sequence with K = 7 yields one string and 8 - 7 + 1 = 2 k-mers.
    #[test]
    fn test_encoder_add_sequence() {
        let mut enc = Encoder::<7>::new();

        enc.add_sequence(b"ACGTACGT").unwrap();

        assert_eq!(enc.num_strings(), 1);
        assert_eq!(enc.num_kmers(), 2);
    }

    /// A sequence shorter than K is accepted without error but not recorded.
    #[test]
    fn test_encoder_skip_short_sequence() {
        let mut enc = Encoder::<31>::new();

        enc.add_sequence(b"ACGT").unwrap();

        assert_eq!(enc.num_strings(), 0);
        assert_eq!(enc.num_kmers(), 0);
    }

    /// With K = 5: 8 bases give 4 k-mers, 4 bases are skipped, 7 bases give 3 k-mers.
    #[test]
    fn test_encoder_multiple_sequences() {
        let mut enc = Encoder::<5>::new();

        let inputs: [&[u8]; 3] = [b"ACGTACGT", b"TGCA", b"AAAAAAA"];
        for seq in inputs {
            enc.add_sequence(seq).unwrap();
        }

        assert_eq!(enc.num_strings(), 2);
        assert_eq!(enc.num_kmers(), 7);
    }

    /// Building from two 8-base sequences produces a set with 2 strings and 16 bases.
    #[test]
    fn test_encoder_build_spss() {
        let mut enc = Encoder::<7>::new();
        enc.add_sequence(b"ACGTACGT").unwrap();
        enc.add_sequence(b"TGCATGCA").unwrap();

        let built = enc.build(5);

        assert_eq!(built.num_strings(), 2);
        assert_eq!(built.total_bases(), 16);
    }
}