seq_hash/
anti_lex.rs

1//! 'Anti lexicographic' hashing:
2//!
3//! A kmer's hash found by reading it's characters right to left, and by inverting the last (most significant) character.
4//! When k > 16, only the last 16 characters are used.
5
6use std::cmp::min;
7
8use crate::{KmerHasher, S};
9use packed_seq::{Delay, Seq};
10
11/// A hash function that compares strings reverse-lexicographically,
12/// with the last (most significant) character inverted.
13///
14/// Only supports 2-bit DNA sequences ([`packed_seq::AsciiSeq`] and [`packed_seq::PackedSeq`]).
15///
16/// The canonical version (with `CANONICAL=true`) returns the minimum of the forward and reverse-complement hashes.
17/// TODO: Test minimum vs maximum.
18pub struct AntiLexHasher<const CANONICAL: bool> {
19    k: usize,
20    /// Number of bits of each character.
21    b: usize,
22    /// Number of bits to shift each new character up to make it the most significant one.
23    shift: u32,
24    /// Mask to flip the bits of the most significant character.
25    anti: u32,
26    /// Mask to keep only the lowest k*b bits.
27    mask: u32,
28}
29
30impl<const CANONICAL: bool> AntiLexHasher<CANONICAL> {
31    /// Create a new [`AntiLexHasher`] for kmers of length `k`.
32    #[inline(always)]
33    pub const fn new(k: usize) -> Self {
34        let b = 2;
35        let shift = if b * k <= 32 { b * (k - 1) } else { 32 - b } as u32;
36        let anti = ((1 << b) - 1) << shift;
37        let mask = if b * k < 32 {
38            (1 << (b * k)) - 1
39        } else {
40            u32::MAX
41        };
42        Self {
43            k,
44            b,
45            shift,
46            anti,
47            mask,
48        }
49    }
50}
51
52impl KmerHasher for AntiLexHasher<false> {
53    const CANONICAL: bool = false;
54
55    fn new(k: usize) -> Self {
56        Self::new(k)
57    }
58
59    #[inline(always)]
60    fn k(&self) -> usize {
61        self.k
62    }
63
64    #[inline(always)]
65    fn in_out_mapper_scalar<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((u8, u8)) -> u32 {
66        assert!(seq.bits_per_char() <= self.b);
67
68        let mut fw: u32 = 0;
69        move |(a, _r)| {
70            fw = (fw >> self.b) ^ ((a as u32) << self.shift);
71            fw ^ self.anti
72        }
73    }
74
75    #[inline(always)]
76    fn in_out_mapper_simd<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((S, S)) -> S {
77        assert!(seq.bits_per_char() <= self.b);
78
79        let mut fw: S = S::splat(0);
80        move |(a, _r)| {
81            fw = (fw >> self.b as u32) ^ (a << self.shift);
82            fw ^ S::splat(self.anti)
83        }
84    }
85
86    #[inline(always)]
87    fn mapper<'s>(&self, seq: impl Seq<'s>) -> impl FnMut(u8) -> u32 {
88        assert!(seq.bits_per_char() <= self.b);
89        let k = seq.len();
90        let shift = if self.b * k <= 32 {
91            self.b * (k - 1)
92        } else {
93            32 - self.b
94        } as u32;
95        let anti = ((1 << self.b) - 1) << shift;
96
97        let mut fw: u32 = 0;
98        move |a| {
99            fw = (fw >> self.b) ^ ((a as u32) << shift);
100            fw ^ anti
101        }
102    }
103}
104
105impl KmerHasher for AntiLexHasher<true> {
106    const CANONICAL: bool = true;
107
108    fn new(k: usize) -> Self {
109        Self::new(k)
110    }
111
112    #[inline(always)]
113    fn k(&self) -> usize {
114        self.k
115    }
116
117    #[inline(always)]
118    fn delay(&self) -> Delay {
119        Delay(self.k.saturating_sub(32 / self.b))
120    }
121
122    #[inline(always)]
123    fn mapper<'s>(&self, seq: impl Seq<'s>) -> impl FnMut(u8) -> u32 {
124        assert!(seq.bits_per_char() <= self.b);
125        let mut shift = 0;
126        let mut anti = (1 << self.b) - 1;
127        let mut mask = anti;
128
129        let mut fw: u32 = 0;
130        let mut rc: u32 = 0;
131        let mut i = 0;
132        move |a| {
133            if i * self.b >= 32 {
134                fw >>= self.b;
135            }
136            fw ^= (a as u32) << shift;
137            if i * self.b < 32 {
138                // ^2 for complement.
139                rc = ((rc << self.b) & mask) ^ (a as u32 ^ 2);
140            }
141            let out = min(fw ^ anti, rc ^ anti);
142
143            if (i + 1) * self.b < 32 {
144                shift += self.b as u32;
145                anti <<= self.b;
146                mask = (mask << self.b) | ((1 << self.b) - 1);
147            }
148            i += 1;
149
150            out
151        }
152    }
153
154    #[inline(always)]
155    fn in_out_mapper_scalar<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((u8, u8)) -> u32 {
156        assert!(seq.bits_per_char() <= self.b);
157
158        let mut fw: u32 = 0;
159        let mut rc: u32 = 0;
160        move |(a, r)| {
161            fw = (fw >> self.b) ^ ((a as u32) << self.shift);
162            // ^2 for complement.
163            rc = ((rc << self.b) & self.mask) ^ (r as u32 ^ 2);
164            min(fw ^ self.anti, rc ^ self.anti)
165        }
166    }
167
168    #[inline(always)]
169    fn in_out_mapper_simd<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((S, S)) -> S {
170        assert!(seq.bits_per_char() <= self.b);
171
172        let mut fw: S = S::splat(0);
173        let mut rc: S = S::splat(0);
174        move |(a, r)| {
175            fw = (fw >> self.b as u32) ^ (a << self.shift);
176            rc = ((rc << self.b as u32) & S::splat(self.mask)) ^ (r ^ S::splat(2));
177            (fw ^ S::splat(self.anti)).min(rc ^ S::splat(self.anti))
178        }
179    }
180}