Skip to main content

fqtk_lib/
bitenc.rs

1// Copyright 2014-2016 Johannes Köster.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5// Adapted by Nils Homer (2025)
6
7//! A fixed-width bit encoding implementation. This allows to store a sequence of values over
8//! a reduced alphabet by packing them bit-encoded into a sequence of bytes.
9//!
10//! Similar behaviour can be achieved using a `PackedVec` from the [packedvec](https://docs.rs/packedvec) crate.
11//!
12//! # Example
13//!
14//! ```
15//! use fqtk_lib::bitenc::BitEnc;
16//! let mut bitenc = BitEnc::new(2);
17//! bitenc.push(0);
18//! bitenc.push(2);
19//! bitenc.push(1);
20//! let values: Vec<u8> = bitenc.iter().collect();
21//! assert_eq!(values, [0, 2, 1]);
22//! ```
23//!
24
25use serde::Deserialize;
26use serde::Serialize;
27
28/// A sequence of bitencoded values.
29///
30/// Space complexity: O(⌈(n * width) / k⌉) * 32 bit, where n is the length of the input
31/// sequence and `k = 32 - (32 % width)`  is the number of bits in each
32/// 32-bit block that can be used to store values.
33/// For values that are not a divider of 32, some bits will remain unused.
34/// For example for `width = 7` only `4 * 7 = 28` bits are used.
35/// Five 7-bit values are stored in 2 blocks.
36#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
37pub struct BitEnc {
38    storage: Vec<u32>,
39    width: usize,
40    mask: u32,
41    len: usize,
42    usable_bits_per_block: usize,
43}
44
45/// Create a mask with `width` 1-bits.
46fn mask(width: usize) -> u32 {
47    (1 << width) - 1
48}
49
50impl BitEnc {
51    /// Create a new instance with a given encoding width (e.g. width=2 for using two bits per value).
52    /// Supports widths up to 8 bits per character, i.e. `1 <= width <= 8`.
53    ///
54    /// Complexity: O(1)
55    ///
56    /// # Example
57    ///
58    /// ```
59    /// use fqtk_lib::bitenc::BitEnc;
60    /// let bitenc = BitEnc::new(3);
61    /// ```
62    pub fn new(width: usize) -> Self {
63        assert!(width <= 8, "Only encoding widths up to 8 supported");
64        BitEnc {
65            storage: Vec::new(),
66            width,
67            mask: mask(width),
68            len: 0,
69            usable_bits_per_block: 32 - 32 % width,
70        }
71    }
72
73    /// Create a new instance with a given capacity and encoding width
74    /// (e.g. width=2 for using two bits per value).
75    /// Supports widths up to 8 bits per character, i.e. `1 <= width <= 8`.
76    ///
77    /// Complexity: O(1)
78    ///
79    /// # Example
80    ///
81    /// ```
82    /// use fqtk_lib::bitenc::BitEnc;
83    ///
84    /// let bitenc = BitEnc::with_capacity(3, 42);
85    /// ```
86    pub fn with_capacity(width: usize, n: usize) -> Self {
87        assert!(width <= 8, "Only encoding widths up to 8 supported");
88        BitEnc {
89            storage: Vec::with_capacity(n * width / 32),
90            width,
91            mask: mask(width),
92            len: 0,
93            usable_bits_per_block: 32 - 32 % width,
94        }
95    }
96
97    /// Append a character to the current bit-encoding.
98    ///
99    /// Complexity: O(1)
100    ///
101    /// # Example
102    ///
103    /// ```
104    /// use fqtk_lib::bitenc::BitEnc;
105    ///
106    /// let mut bitenc = BitEnc::new(4);
107    /// bitenc.push(0b0000);
108    /// bitenc.push(0b1000);
109    /// bitenc.push(0b1010);
110    /// // The three characters added above are encoded into one u32 entry.
111    /// let values: Vec<u8> = bitenc.iter().collect();
112    /// assert_eq!(values, [0b0000, 0b1000, 0b1010]);
113    /// ```
114    pub fn push(&mut self, value: u8) {
115        let (block, bit) = self.addr(self.len);
116        if bit == 0 {
117            self.storage.push(0);
118        }
119        self.set_by_addr(block, bit, value);
120        self.len += 1;
121    }
122
123    /// Append the given `value` to the encoding `n` times.
124    ///
125    /// The added values comprise 0 to 1 blocks that need to be filled up
126    /// from previous steps, 0 to m blocks that are
127    /// completely filled with the value and 0 to 1 blocks
128    /// that are only partially filled.
129    ///
130    /// Complexity: O(n)
131    ///
132    /// # Example
133    ///
134    /// ```
135    /// use fqtk_lib::bitenc::BitEnc;
136    ///
137    /// let mut bitenc = BitEnc::new(8);
138    /// // Width: 8 → 4 values per block
139    /// // | __ __ __ __ | Denotes one block with 4 empty slots
140    ///
141    /// bitenc.push_values(5, 0b101010);
142    /// // This adds one full and one partial block.
143    /// // | 42 42 42 42 | __ __ __ 42 |
144    ///
145    /// let values: Vec<u8> = bitenc.iter().collect();
146    /// assert_eq!(values, [42, 42, 42, 42, 42]);
147    ///
148    /// bitenc.push_values(1, 23);
149    /// // This only fills up an existing block;
150    /// // | 42 42 42 42 | __ __ 23 42 |
151    ///
152    /// let values: Vec<u8> = bitenc.iter().collect();
153    /// assert_eq!(values, [42, 42, 42, 42, 42, 23]);
154    ///
155    /// bitenc.push_values(6, 17);
156    /// // Fills up the current block, adds a whole new one but does not create a partial block.
157    /// // | 42 42 42 42 | 17 17 23 42 | 17 17 17 17 |
158    ///
159    /// let values: Vec<u8> = bitenc.iter().collect();
160    /// assert_eq!(values, [42, 42, 42, 42, 42, 23, 17, 17, 17, 17, 17, 17]);
161    /// ```
162    pub fn push_values(&mut self, mut n: usize, value: u8) {
163        // Fill up the previous block.
164        // Example: After adding 3 values with a width
165        // of 8, 8 out out 32 bits in the first block
166        // can still be used.
167        {
168            // Check if there are remaining free slots
169            // in the current block, i.e. if the bit offset
170            // within the block is non-zero
171            let (block, bit) = self.addr(self.len);
172            if bit > 0 {
173                // insert as many values as required to fill
174                // up the previous block and decrease the number
175                // left to insert accordingly
176                // The take(n) assures this iterator stops before
177                // an overflow of n can occur, if less symbols are
178                // added than open slots in the block.
179                for bit in (bit..32).step_by(self.width).take(n) {
180                    self.set_by_addr(block, bit, value);
181                    n -= 1;
182                    self.len += 1;
183                }
184            }
185        }
186
187        // If symbols remain to be inserted after
188        // filling up the current block divide them into
189        // full and partial blocks to add them.
190        if n > 0 {
191            // Create a full value block containing
192            // as many copies of value as possible.
193            let mut value_block = 0;
194            {
195                let mut v = u32::from(value);
196                for _ in 0..32 / self.width {
197                    value_block |= v;
198                    v <<= self.width;
199                }
200            }
201
202            // Append as many full value blocks as needed
203            // to reach the last block
204            let i = self.len + n;
205            let (block, bit) = self.addr(i);
206            self.storage.resize(block, value_block);
207
208            if bit > 0 {
209                // add the remaining values to a final block
210                // let shifted_block = value_block >> (32 - bit);
211                let shifted_block = value_block >> (self.usable_bits_per_block - bit);
212                self.storage.push(shifted_block);
213            }
214
215            self.len = i;
216        }
217    }
218
219    /// Replace the current value as position `i` with the given value.
220    ///
221    /// Complexity: O(1)
222    ///
223    /// ```
224    /// use fqtk_lib::bitenc::BitEnc;
225    ///
226    /// let mut bitenc = BitEnc::new(4);
227    /// bitenc.push_values(4, 0b1111);
228    /// bitenc.set(2, 0b0000);
229    ///
230    /// let values: Vec<u8> = bitenc.iter().collect();
231    /// assert_eq!(values, [0b1111, 0b1111, 0b0000, 0b1111]);
232    /// ```
233    pub fn set(&mut self, i: usize, value: u8) {
234        let (block, bit) = self.addr(i);
235        self.set_by_addr(block, bit, value);
236    }
237
238    /// Get the value at position `i`.
239    ///
240    /// Complexity: O(1)
241    ///
242    /// ```
243    /// use fqtk_lib::bitenc::BitEnc;
244    ///
245    /// let mut bitenc = BitEnc::new(4);
246    /// for value in 1..=4 {
247    ///     bitenc.push(value);
248    /// }
249    ///
250    /// let values: Vec<u8> = bitenc.iter().collect();
251    /// assert_eq!(values, [0b0001, 0b0010, 0b0011, 0b0100]);
252    /// ```
253    pub fn get(&self, i: usize) -> Option<u8> {
254        if i >= self.len {
255            None
256        } else {
257            let (block, bit) = self.addr(i);
258            Some(self.get_by_addr(block, bit))
259        }
260    }
261
262    /// Iterate over stored values (values will be unpacked into bytes).
263    ///
264    /// Complexity: O(n), where n is the number of encoded values
265    ///
266    /// # Example
267    ///
268    /// ```
269    /// use fqtk_lib::bitenc::BitEnc;
270    ///
271    /// // Fill bitenc with 1, 2, 3, and 4.
272    /// let mut bitenc = BitEnc::new(4);
273    /// for value in 1..=4 {
274    ///     bitenc.push(value);
275    /// }
276    ///
277    /// // Collect iterator for comparison
278    /// let values: Vec<u8> = bitenc.iter().collect();
279    /// assert_eq!(values, [0b0001, 0b0010, 0b0011, 0b0100]);
280    /// ```
281    pub fn iter(&self) -> BitEncIter<'_> {
282        BitEncIter { bitenc: self, i: 0 }
283    }
284
285    /// Clear the sequence.
286    ///
287    /// Complexity: O(1)
288    ///
289    /// # Example
290    ///
291    /// ```
292    /// use fqtk_lib::bitenc::BitEnc;
293    ///
294    /// let mut bitenc = BitEnc::new(2);
295    /// bitenc.push(2);
296    /// assert_eq!(bitenc.len(), 1);
297    /// bitenc.clear();
298    /// assert_eq!(bitenc.len(), 0);
299    /// ```
300    pub fn clear(&mut self) {
301        self.storage.clear();
302        self.len = 0;
303    }
304
305    /// Get the value stored in the given `block` at `bit`.
306    fn get_by_addr(&self, block: usize, bit: usize) -> u8 {
307        ((self.storage[block] >> bit) & self.mask) as u8
308    }
309
310    /// Replace the value in the given `block` at `bit` with the given `value`.
311    fn set_by_addr(&mut self, block: usize, bit: usize, value: u8) {
312        let mask = self.mask << bit;
313        self.storage[block] |= mask;
314        self.storage[block] ^= mask;
315        self.storage[block] |= (u32::from(value) & self.mask) << bit;
316    }
317
318    /// Get the block and start bit for the `i`th encoded value.
319    fn addr(&self, i: usize) -> (usize, usize) {
320        let k = i * self.width;
321        (k / self.usable_bits_per_block, k % self.usable_bits_per_block)
322    }
323
324    /// Get the number of symbols encoded.
325    ///
326    /// Complexity: O(1)
327    ///
328    /// # Example
329    ///
330    /// ```
331    /// use fqtk_lib::bitenc::BitEnc;
332    ///
333    /// let mut bitenc = BitEnc::new(8);
334    /// bitenc.push(2);
335    /// assert_eq!(bitenc.len(), 1);
336    /// bitenc.push(2);
337    /// bitenc.push(2);
338    /// bitenc.push(2);
339    /// assert_eq!(bitenc.len(), 4);
340    /// // Add another 2 to create a second block
341    /// bitenc.push(2);
342    /// assert_eq!(bitenc.len(), 5);
343    /// ```
344    #[deprecated(
345        since = "0.33.0",
346        note = "Please use the more specific `nr_blocks` and `nr_symbols` functions instead."
347    )]
348    pub fn len(&self) -> usize {
349        self.len
350    }
351
352    /// Get the number of blocks used by the encoding.
353    ///
354    /// Complexity: O(1)
355    ///
356    /// # Example
357    ///
358    /// ```
359    /// use fqtk_lib::bitenc::BitEnc;
360    ///
361    /// let mut bitenc = BitEnc::new(8);
362    /// bitenc.push(2);
363    /// assert_eq!(bitenc.nr_blocks(), 1);
364    /// // Add enough 2s to completely fill the first block
365    /// bitenc.push(2);
366    /// bitenc.push(2);
367    /// bitenc.push(2);
368    /// assert_eq!(bitenc.nr_blocks(), 1);
369    /// // Add another 2 to create a second block
370    /// bitenc.push(2);
371    /// assert_eq!(bitenc.nr_blocks(), 2);
372    /// ```
373    pub fn nr_blocks(&self) -> usize {
374        self.storage.len()
375    }
376
377    /// Get the number of symbols encoded.
378    ///
379    /// Complexity: O(1)
380    ///
381    /// # Example
382    ///
383    /// ```
384    /// use fqtk_lib::bitenc::BitEnc;
385    ///
386    /// let mut bitenc = BitEnc::new(8);
387    /// bitenc.push(2);
388    /// assert_eq!(bitenc.nr_symbols(), 1);
389    /// bitenc.push(2);
390    /// bitenc.push(2);
391    /// bitenc.push(2);
392    /// assert_eq!(bitenc.nr_symbols(), 4);
393    /// bitenc.push(2);
394    /// assert_eq!(bitenc.nr_symbols(), 5);
395    /// ```
396    pub fn nr_symbols(&self) -> usize {
397        self.len
398    }
399
400    /// Is the encoded sequence empty?
401    ///
402    /// Complexity: O(1)
403    ///
404    /// # Example
405    ///
406    /// ```
407    /// use fqtk_lib::bitenc::BitEnc;
408    ///
409    /// let mut bitenc = BitEnc::new(2);
410    /// assert!(bitenc.is_empty());
411    /// bitenc.push(2);
412    /// assert!(!bitenc.is_empty());
413    /// bitenc.clear();
414    /// assert!(bitenc.is_empty());
415    /// ```
416    pub fn is_empty(&self) -> bool {
417        self.len == 0
418    }
419
420    /// Calculate the Hamming distance between this and another bitencoded sequence.
421    ///
422    /// Note: this allows IUPAC fuzzy matching with IUPAC bases in this sequence.
423    /// An IUPAC base in the other sequence matches if it is at least as specific as the
424    /// corresponding (IUPAC) base in this sequence. E.g. If the other sequence is an
425    /// N, it will not match anything but an N, and if the other base is an R, it
426    /// will match R, V, D, and N, since the latter IUPAC codes allow both A and G.
427    ///
428    /// # Panics
429    ///
430    /// Panics if the length and widths of the two sequences are not the same.
431    #[must_use]
432    pub fn hamming(&self, other: &BitEnc, max_mismatches: u32) -> u32 {
433        assert!(self.len == other.len, "Both bitenc sequences must have the same length");
434        assert!(self.width == other.width, "Both bitenc sequences must have the same width");
435        let mut count: u32 = 0;
436        let values_per_block = self.usable_bits_per_block / self.width;
437        for block_index in 0..self.nr_blocks() {
438            // Get the bits that are different across the two blocks.  These represent differences
439            // in values (bases), where multiple values (bases) are stored in each block.  We
440            // save the block differences so we efficiently count the differences below.
441            let block_diff = self.storage[block_index] & !other.storage[block_index];
442            if block_diff != 0 {
443                // Scan through the values (bases) in the block, counting those values (bases)
444                // that are different.
445                let mut shift_i = 0;
446                for _ in 0..values_per_block {
447                    let block_diff_sub = (block_diff >> shift_i) & self.mask;
448                    if block_diff_sub != 0 {
449                        count += 1;
450                    }
451                    shift_i += self.width;
452                }
453                if count >= max_mismatches {
454                    return max_mismatches;
455                }
456            }
457        }
458        count
459    }
460}
461
462/// Iterator over values of a bitencoded sequence (values will be unpacked into bytes).
463/// Used to implement the `iter` method of `BitEnc`.
464#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize)]
465pub struct BitEncIter<'a> {
466    bitenc: &'a BitEnc,
467    i: usize,
468}
469
470impl Iterator for BitEncIter<'_> {
471    type Item = u8;
472
473    fn next(&mut self) -> Option<u8> {
474        let value = self.bitenc.get(self.i);
475        self.i += 1;
476        value
477    }
478}
479
480#[cfg(test)]
481mod tests {
482    use super::BitEnc;
483
484    #[test]
485    fn test_bitenc() {
486        let mut bitenc = BitEnc::new(2);
487        bitenc.push(0);
488        bitenc.push(2);
489        bitenc.push(1);
490        let mut values: Vec<u8> = bitenc.iter().collect();
491        assert_eq!(values, [0, 2, 1]);
492        bitenc.set(1, 3);
493        values = bitenc.iter().collect();
494        assert_eq!(values, [0, 3, 1]);
495    }
496
497    #[test]
498    fn test_push_values() {
499        let mut bitenc = BitEnc::new(2);
500        bitenc.push_values(32, 0);
501        assert_eq!(bitenc.storage, [0, 0]);
502    }
503
504    #[test]
505    fn test_push_values_edge_cases() {
506        //! This is a slight variation of the methods doc test,
507        //! which also creates a new partial block in the last step
508        //! and an additonal full block of 17s.
509        //! As this bloats the result vectors, this was not included
510        //! in the doc test.
511        //! Additionally, this test uses a width of 7 bits, which leave
512        //! some bits unused.
513
514        let mut bitenc = BitEnc::new(7);
515        // Width: 7 → 4 values per block
516        // | __ __ __ __ | Denotes one block with 4 empty slots and 4 leftover bits
517
518        bitenc.push_values(5, 0b101010);
519        // This adds one full and one partial block.
520        // | 42 42 42 42 | __ __ __ 42 |
521
522        let values: Vec<u8> = bitenc.iter().collect();
523        assert_eq!(values, [42, 42, 42, 42, 42]);
524        assert_eq!(bitenc.nr_blocks(), 2);
525        assert_eq!(bitenc.nr_symbols(), 5);
526
527        bitenc.push_values(1, 23);
528        // This only fills up an existing block;
529        // | 42 42 42 42 | __ __ 23 42 |
530        let values: Vec<u8> = bitenc.iter().collect();
531        assert_eq!(values, [42, 42, 42, 42, 42, 23]);
532        assert_eq!(bitenc.nr_blocks(), 2);
533        assert_eq!(bitenc.nr_symbols(), 6);
534
535        bitenc.push_values(12, 17);
536        // Fills up the current block, adds a whole new one AND create a partial block.
537        // | 42 42 42 42 | 17 17 23 42 | 17 17 17 17 | 17 17 17 17 | __ __ 17 17 |
538
539        let values: Vec<u8> = bitenc.iter().collect();
540        assert_eq!(
541            values,
542            [42, 42, 42, 42, 42, 23, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]
543        );
544        assert_eq!(bitenc.nr_blocks(), 5);
545        assert_eq!(bitenc.nr_symbols(), 18);
546    }
547
548    #[test]
549    fn test_issue29() {
550        for w in 2..9 {
551            let mut vec = BitEnc::with_capacity(w, 1000);
552            for _ in 0..1000 {
553                vec.push(1);
554            }
555        }
556    }
557
558    #[test]
559    fn test_hamming() {
560        let mut left = BitEnc::new(4);
561        left.push_values(10, 0b1010); // Y (C or T)
562        let mut right = BitEnc::new(4);
563        right.push_values(10, 0b0010); // C
564        let mut mismatches = BitEnc::new(4);
565        mismatches.push_values(2, 0b1010); // Y
566        mismatches.push_values(1, 0b0100); // G
567        mismatches.push_values(3, 0b1010); // Y
568        mismatches.push_values(2, 0b0001); // A
569        mismatches.push_values(2, 0b1010); // Y
570
571        assert_eq!(left.hamming(&left, 100), 0);
572        assert_eq!(right.hamming(&right, 100), 0);
573        assert_eq!(right.hamming(&left, 100), 0); // since left is less restrictive than right
574        assert_eq!(left.hamming(&right, 100), 10); // since right is more restrictive than left
575        assert_eq!(left.hamming(&mismatches, 100), 3);
576        assert_eq!(mismatches.hamming(&left, 100), 3);
577        assert_eq!(mismatches.hamming(&left, 1), 1);
578        assert_eq!(mismatches.hamming(&left, 2), 2);
579        assert_eq!(mismatches.hamming(&left, 3), 3);
580    }
581}