fqtk_lib/bitenc.rs
1// Copyright 2014-2016 Johannes Köster.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5// Adapted by Nils Homer (2025)
6
7//! A fixed-width bit encoding implementation. This allows to store a sequence of values over
8//! a reduced alphabet by packing them bit-encoded into a sequence of bytes.
9//!
10//! Similar behaviour can be achieved using a `PackedVec` from the [packedvec](https://docs.rs/packedvec) crate.
11//!
12//! # Example
13//!
14//! ```
15//! use fqtk_lib::bitenc::BitEnc;
16//! let mut bitenc = BitEnc::new(2);
17//! bitenc.push(0);
18//! bitenc.push(2);
19//! bitenc.push(1);
20//! let values: Vec<u8> = bitenc.iter().collect();
21//! assert_eq!(values, [0, 2, 1]);
22//! ```
23//!
24
25use serde::Deserialize;
26use serde::Serialize;
27
28/// A sequence of bitencoded values.
29///
30/// Space complexity: O(⌈(n * width) / k⌉) * 32 bit, where n is the length of the input
31/// sequence and `k = 32 - (32 % width)` is the number of bits in each
32/// 32-bit block that can be used to store values.
33/// For values that are not a divider of 32, some bits will remain unused.
34/// For example for `width = 7` only `4 * 7 = 28` bits are used.
35/// Five 7-bit values are stored in 2 blocks.
36#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
37pub struct BitEnc {
38 storage: Vec<u32>,
39 width: usize,
40 mask: u32,
41 len: usize,
42 usable_bits_per_block: usize,
43}
44
45/// Create a mask with `width` 1-bits.
46fn mask(width: usize) -> u32 {
47 (1 << width) - 1
48}
49
50impl BitEnc {
51 /// Create a new instance with a given encoding width (e.g. width=2 for using two bits per value).
52 /// Supports widths up to 8 bits per character, i.e. `1 <= width <= 8`.
53 ///
54 /// Complexity: O(1)
55 ///
56 /// # Example
57 ///
58 /// ```
59 /// use fqtk_lib::bitenc::BitEnc;
60 /// let bitenc = BitEnc::new(3);
61 /// ```
62 pub fn new(width: usize) -> Self {
63 assert!(width <= 8, "Only encoding widths up to 8 supported");
64 BitEnc {
65 storage: Vec::new(),
66 width,
67 mask: mask(width),
68 len: 0,
69 usable_bits_per_block: 32 - 32 % width,
70 }
71 }
72
73 /// Create a new instance with a given capacity and encoding width
74 /// (e.g. width=2 for using two bits per value).
75 /// Supports widths up to 8 bits per character, i.e. `1 <= width <= 8`.
76 ///
77 /// Complexity: O(1)
78 ///
79 /// # Example
80 ///
81 /// ```
82 /// use fqtk_lib::bitenc::BitEnc;
83 ///
84 /// let bitenc = BitEnc::with_capacity(3, 42);
85 /// ```
86 pub fn with_capacity(width: usize, n: usize) -> Self {
87 assert!(width <= 8, "Only encoding widths up to 8 supported");
88 BitEnc {
89 storage: Vec::with_capacity(n * width / 32),
90 width,
91 mask: mask(width),
92 len: 0,
93 usable_bits_per_block: 32 - 32 % width,
94 }
95 }
96
97 /// Append a character to the current bit-encoding.
98 ///
99 /// Complexity: O(1)
100 ///
101 /// # Example
102 ///
103 /// ```
104 /// use fqtk_lib::bitenc::BitEnc;
105 ///
106 /// let mut bitenc = BitEnc::new(4);
107 /// bitenc.push(0b0000);
108 /// bitenc.push(0b1000);
109 /// bitenc.push(0b1010);
110 /// // The three characters added above are encoded into one u32 entry.
111 /// let values: Vec<u8> = bitenc.iter().collect();
112 /// assert_eq!(values, [0b0000, 0b1000, 0b1010]);
113 /// ```
114 pub fn push(&mut self, value: u8) {
115 let (block, bit) = self.addr(self.len);
116 if bit == 0 {
117 self.storage.push(0);
118 }
119 self.set_by_addr(block, bit, value);
120 self.len += 1;
121 }
122
123 /// Append the given `value` to the encoding `n` times.
124 ///
125 /// The added values comprise 0 to 1 blocks that need to be filled up
126 /// from previous steps, 0 to m blocks that are
127 /// completely filled with the value and 0 to 1 blocks
128 /// that are only partially filled.
129 ///
130 /// Complexity: O(n)
131 ///
132 /// # Example
133 ///
134 /// ```
135 /// use fqtk_lib::bitenc::BitEnc;
136 ///
137 /// let mut bitenc = BitEnc::new(8);
138 /// // Width: 8 → 4 values per block
139 /// // | __ __ __ __ | Denotes one block with 4 empty slots
140 ///
141 /// bitenc.push_values(5, 0b101010);
142 /// // This adds one full and one partial block.
143 /// // | 42 42 42 42 | __ __ __ 42 |
144 ///
145 /// let values: Vec<u8> = bitenc.iter().collect();
146 /// assert_eq!(values, [42, 42, 42, 42, 42]);
147 ///
148 /// bitenc.push_values(1, 23);
149 /// // This only fills up an existing block;
150 /// // | 42 42 42 42 | __ __ 23 42 |
151 ///
152 /// let values: Vec<u8> = bitenc.iter().collect();
153 /// assert_eq!(values, [42, 42, 42, 42, 42, 23]);
154 ///
155 /// bitenc.push_values(6, 17);
156 /// // Fills up the current block, adds a whole new one but does not create a partial block.
157 /// // | 42 42 42 42 | 17 17 23 42 | 17 17 17 17 |
158 ///
159 /// let values: Vec<u8> = bitenc.iter().collect();
160 /// assert_eq!(values, [42, 42, 42, 42, 42, 23, 17, 17, 17, 17, 17, 17]);
161 /// ```
162 pub fn push_values(&mut self, mut n: usize, value: u8) {
163 // Fill up the previous block.
164 // Example: After adding 3 values with a width
165 // of 8, 8 out out 32 bits in the first block
166 // can still be used.
167 {
168 // Check if there are remaining free slots
169 // in the current block, i.e. if the bit offset
170 // within the block is non-zero
171 let (block, bit) = self.addr(self.len);
172 if bit > 0 {
173 // insert as many values as required to fill
174 // up the previous block and decrease the number
175 // left to insert accordingly
176 // The take(n) assures this iterator stops before
177 // an overflow of n can occur, if less symbols are
178 // added than open slots in the block.
179 for bit in (bit..32).step_by(self.width).take(n) {
180 self.set_by_addr(block, bit, value);
181 n -= 1;
182 self.len += 1;
183 }
184 }
185 }
186
187 // If symbols remain to be inserted after
188 // filling up the current block divide them into
189 // full and partial blocks to add them.
190 if n > 0 {
191 // Create a full value block containing
192 // as many copies of value as possible.
193 let mut value_block = 0;
194 {
195 let mut v = u32::from(value);
196 for _ in 0..32 / self.width {
197 value_block |= v;
198 v <<= self.width;
199 }
200 }
201
202 // Append as many full value blocks as needed
203 // to reach the last block
204 let i = self.len + n;
205 let (block, bit) = self.addr(i);
206 self.storage.resize(block, value_block);
207
208 if bit > 0 {
209 // add the remaining values to a final block
210 // let shifted_block = value_block >> (32 - bit);
211 let shifted_block = value_block >> (self.usable_bits_per_block - bit);
212 self.storage.push(shifted_block);
213 }
214
215 self.len = i;
216 }
217 }
218
219 /// Replace the current value as position `i` with the given value.
220 ///
221 /// Complexity: O(1)
222 ///
223 /// ```
224 /// use fqtk_lib::bitenc::BitEnc;
225 ///
226 /// let mut bitenc = BitEnc::new(4);
227 /// bitenc.push_values(4, 0b1111);
228 /// bitenc.set(2, 0b0000);
229 ///
230 /// let values: Vec<u8> = bitenc.iter().collect();
231 /// assert_eq!(values, [0b1111, 0b1111, 0b0000, 0b1111]);
232 /// ```
233 pub fn set(&mut self, i: usize, value: u8) {
234 let (block, bit) = self.addr(i);
235 self.set_by_addr(block, bit, value);
236 }
237
238 /// Get the value at position `i`.
239 ///
240 /// Complexity: O(1)
241 ///
242 /// ```
243 /// use fqtk_lib::bitenc::BitEnc;
244 ///
245 /// let mut bitenc = BitEnc::new(4);
246 /// for value in 1..=4 {
247 /// bitenc.push(value);
248 /// }
249 ///
250 /// let values: Vec<u8> = bitenc.iter().collect();
251 /// assert_eq!(values, [0b0001, 0b0010, 0b0011, 0b0100]);
252 /// ```
253 pub fn get(&self, i: usize) -> Option<u8> {
254 if i >= self.len {
255 None
256 } else {
257 let (block, bit) = self.addr(i);
258 Some(self.get_by_addr(block, bit))
259 }
260 }
261
262 /// Iterate over stored values (values will be unpacked into bytes).
263 ///
264 /// Complexity: O(n), where n is the number of encoded values
265 ///
266 /// # Example
267 ///
268 /// ```
269 /// use fqtk_lib::bitenc::BitEnc;
270 ///
271 /// // Fill bitenc with 1, 2, 3, and 4.
272 /// let mut bitenc = BitEnc::new(4);
273 /// for value in 1..=4 {
274 /// bitenc.push(value);
275 /// }
276 ///
277 /// // Collect iterator for comparison
278 /// let values: Vec<u8> = bitenc.iter().collect();
279 /// assert_eq!(values, [0b0001, 0b0010, 0b0011, 0b0100]);
280 /// ```
281 pub fn iter(&self) -> BitEncIter<'_> {
282 BitEncIter { bitenc: self, i: 0 }
283 }
284
285 /// Clear the sequence.
286 ///
287 /// Complexity: O(1)
288 ///
289 /// # Example
290 ///
291 /// ```
292 /// use fqtk_lib::bitenc::BitEnc;
293 ///
294 /// let mut bitenc = BitEnc::new(2);
295 /// bitenc.push(2);
296 /// assert_eq!(bitenc.len(), 1);
297 /// bitenc.clear();
298 /// assert_eq!(bitenc.len(), 0);
299 /// ```
300 pub fn clear(&mut self) {
301 self.storage.clear();
302 self.len = 0;
303 }
304
305 /// Get the value stored in the given `block` at `bit`.
306 fn get_by_addr(&self, block: usize, bit: usize) -> u8 {
307 ((self.storage[block] >> bit) & self.mask) as u8
308 }
309
310 /// Replace the value in the given `block` at `bit` with the given `value`.
311 fn set_by_addr(&mut self, block: usize, bit: usize, value: u8) {
312 let mask = self.mask << bit;
313 self.storage[block] |= mask;
314 self.storage[block] ^= mask;
315 self.storage[block] |= (u32::from(value) & self.mask) << bit;
316 }
317
318 /// Get the block and start bit for the `i`th encoded value.
319 fn addr(&self, i: usize) -> (usize, usize) {
320 let k = i * self.width;
321 (k / self.usable_bits_per_block, k % self.usable_bits_per_block)
322 }
323
324 /// Get the number of symbols encoded.
325 ///
326 /// Complexity: O(1)
327 ///
328 /// # Example
329 ///
330 /// ```
331 /// use fqtk_lib::bitenc::BitEnc;
332 ///
333 /// let mut bitenc = BitEnc::new(8);
334 /// bitenc.push(2);
335 /// assert_eq!(bitenc.len(), 1);
336 /// bitenc.push(2);
337 /// bitenc.push(2);
338 /// bitenc.push(2);
339 /// assert_eq!(bitenc.len(), 4);
340 /// // Add another 2 to create a second block
341 /// bitenc.push(2);
342 /// assert_eq!(bitenc.len(), 5);
343 /// ```
344 #[deprecated(
345 since = "0.33.0",
346 note = "Please use the more specific `nr_blocks` and `nr_symbols` functions instead."
347 )]
348 pub fn len(&self) -> usize {
349 self.len
350 }
351
352 /// Get the number of blocks used by the encoding.
353 ///
354 /// Complexity: O(1)
355 ///
356 /// # Example
357 ///
358 /// ```
359 /// use fqtk_lib::bitenc::BitEnc;
360 ///
361 /// let mut bitenc = BitEnc::new(8);
362 /// bitenc.push(2);
363 /// assert_eq!(bitenc.nr_blocks(), 1);
364 /// // Add enough 2s to completely fill the first block
365 /// bitenc.push(2);
366 /// bitenc.push(2);
367 /// bitenc.push(2);
368 /// assert_eq!(bitenc.nr_blocks(), 1);
369 /// // Add another 2 to create a second block
370 /// bitenc.push(2);
371 /// assert_eq!(bitenc.nr_blocks(), 2);
372 /// ```
373 pub fn nr_blocks(&self) -> usize {
374 self.storage.len()
375 }
376
377 /// Get the number of symbols encoded.
378 ///
379 /// Complexity: O(1)
380 ///
381 /// # Example
382 ///
383 /// ```
384 /// use fqtk_lib::bitenc::BitEnc;
385 ///
386 /// let mut bitenc = BitEnc::new(8);
387 /// bitenc.push(2);
388 /// assert_eq!(bitenc.nr_symbols(), 1);
389 /// bitenc.push(2);
390 /// bitenc.push(2);
391 /// bitenc.push(2);
392 /// assert_eq!(bitenc.nr_symbols(), 4);
393 /// bitenc.push(2);
394 /// assert_eq!(bitenc.nr_symbols(), 5);
395 /// ```
396 pub fn nr_symbols(&self) -> usize {
397 self.len
398 }
399
400 /// Is the encoded sequence empty?
401 ///
402 /// Complexity: O(1)
403 ///
404 /// # Example
405 ///
406 /// ```
407 /// use fqtk_lib::bitenc::BitEnc;
408 ///
409 /// let mut bitenc = BitEnc::new(2);
410 /// assert!(bitenc.is_empty());
411 /// bitenc.push(2);
412 /// assert!(!bitenc.is_empty());
413 /// bitenc.clear();
414 /// assert!(bitenc.is_empty());
415 /// ```
416 pub fn is_empty(&self) -> bool {
417 self.len == 0
418 }
419
420 /// Calculate the Hamming distance between this and another bitencoded sequence.
421 ///
422 /// Note: this allows IUPAC fuzzy matching with IUPAC bases in this sequence.
423 /// An IUPAC base in the other sequence matches if it is at least as specific as the
424 /// corresponding (IUPAC) base in this sequence. E.g. If the other sequence is an
425 /// N, it will not match anything but an N, and if the other base is an R, it
426 /// will match R, V, D, and N, since the latter IUPAC codes allow both A and G.
427 ///
428 /// # Panics
429 ///
430 /// Panics if the length and widths of the two sequences are not the same.
431 #[must_use]
432 pub fn hamming(&self, other: &BitEnc, max_mismatches: u32) -> u32 {
433 assert!(self.len == other.len, "Both bitenc sequences must have the same length");
434 assert!(self.width == other.width, "Both bitenc sequences must have the same width");
435 let mut count: u32 = 0;
436 let values_per_block = self.usable_bits_per_block / self.width;
437 for block_index in 0..self.nr_blocks() {
438 // Get the bits that are different across the two blocks. These represent differences
439 // in values (bases), where multiple values (bases) are stored in each block. We
440 // save the block differences so we efficiently count the differences below.
441 let block_diff = self.storage[block_index] & !other.storage[block_index];
442 if block_diff != 0 {
443 // Scan through the values (bases) in the block, counting those values (bases)
444 // that are different.
445 let mut shift_i = 0;
446 for _ in 0..values_per_block {
447 let block_diff_sub = (block_diff >> shift_i) & self.mask;
448 if block_diff_sub != 0 {
449 count += 1;
450 }
451 shift_i += self.width;
452 }
453 if count >= max_mismatches {
454 return max_mismatches;
455 }
456 }
457 }
458 count
459 }
460}
461
462/// Iterator over values of a bitencoded sequence (values will be unpacked into bytes).
463/// Used to implement the `iter` method of `BitEnc`.
464#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize)]
465pub struct BitEncIter<'a> {
466 bitenc: &'a BitEnc,
467 i: usize,
468}
469
470impl Iterator for BitEncIter<'_> {
471 type Item = u8;
472
473 fn next(&mut self) -> Option<u8> {
474 let value = self.bitenc.get(self.i);
475 self.i += 1;
476 value
477 }
478}
479
480#[cfg(test)]
481mod tests {
482 use super::BitEnc;
483
484 #[test]
485 fn test_bitenc() {
486 let mut bitenc = BitEnc::new(2);
487 bitenc.push(0);
488 bitenc.push(2);
489 bitenc.push(1);
490 let mut values: Vec<u8> = bitenc.iter().collect();
491 assert_eq!(values, [0, 2, 1]);
492 bitenc.set(1, 3);
493 values = bitenc.iter().collect();
494 assert_eq!(values, [0, 3, 1]);
495 }
496
497 #[test]
498 fn test_push_values() {
499 let mut bitenc = BitEnc::new(2);
500 bitenc.push_values(32, 0);
501 assert_eq!(bitenc.storage, [0, 0]);
502 }
503
504 #[test]
505 fn test_push_values_edge_cases() {
506 //! This is a slight variation of the methods doc test,
507 //! which also creates a new partial block in the last step
508 //! and an additonal full block of 17s.
509 //! As this bloats the result vectors, this was not included
510 //! in the doc test.
511 //! Additionally, this test uses a width of 7 bits, which leave
512 //! some bits unused.
513
514 let mut bitenc = BitEnc::new(7);
515 // Width: 7 → 4 values per block
516 // | __ __ __ __ | Denotes one block with 4 empty slots and 4 leftover bits
517
518 bitenc.push_values(5, 0b101010);
519 // This adds one full and one partial block.
520 // | 42 42 42 42 | __ __ __ 42 |
521
522 let values: Vec<u8> = bitenc.iter().collect();
523 assert_eq!(values, [42, 42, 42, 42, 42]);
524 assert_eq!(bitenc.nr_blocks(), 2);
525 assert_eq!(bitenc.nr_symbols(), 5);
526
527 bitenc.push_values(1, 23);
528 // This only fills up an existing block;
529 // | 42 42 42 42 | __ __ 23 42 |
530 let values: Vec<u8> = bitenc.iter().collect();
531 assert_eq!(values, [42, 42, 42, 42, 42, 23]);
532 assert_eq!(bitenc.nr_blocks(), 2);
533 assert_eq!(bitenc.nr_symbols(), 6);
534
535 bitenc.push_values(12, 17);
536 // Fills up the current block, adds a whole new one AND create a partial block.
537 // | 42 42 42 42 | 17 17 23 42 | 17 17 17 17 | 17 17 17 17 | __ __ 17 17 |
538
539 let values: Vec<u8> = bitenc.iter().collect();
540 assert_eq!(
541 values,
542 [42, 42, 42, 42, 42, 23, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]
543 );
544 assert_eq!(bitenc.nr_blocks(), 5);
545 assert_eq!(bitenc.nr_symbols(), 18);
546 }
547
548 #[test]
549 fn test_issue29() {
550 for w in 2..9 {
551 let mut vec = BitEnc::with_capacity(w, 1000);
552 for _ in 0..1000 {
553 vec.push(1);
554 }
555 }
556 }
557
558 #[test]
559 fn test_hamming() {
560 let mut left = BitEnc::new(4);
561 left.push_values(10, 0b1010); // Y (C or T)
562 let mut right = BitEnc::new(4);
563 right.push_values(10, 0b0010); // C
564 let mut mismatches = BitEnc::new(4);
565 mismatches.push_values(2, 0b1010); // Y
566 mismatches.push_values(1, 0b0100); // G
567 mismatches.push_values(3, 0b1010); // Y
568 mismatches.push_values(2, 0b0001); // A
569 mismatches.push_values(2, 0b1010); // Y
570
571 assert_eq!(left.hamming(&left, 100), 0);
572 assert_eq!(right.hamming(&right, 100), 0);
573 assert_eq!(right.hamming(&left, 100), 0); // since left is less restrictive than right
574 assert_eq!(left.hamming(&right, 100), 10); // since right is more restrictive than left
575 assert_eq!(left.hamming(&mismatches, 100), 3);
576 assert_eq!(mismatches.hamming(&left, 100), 3);
577 assert_eq!(mismatches.hamming(&left, 1), 1);
578 assert_eq!(mismatches.hamming(&left, 2), 2);
579 assert_eq!(mismatches.hamming(&left, 3), 3);
580 }
581}