Skip to main content

rustalign_fmindex/
locus.rs

1//! SideLocus - Position within FM-index
2//!
3//! A SideLocus represents a position within the FM-index, specifically
4//! within a "side" (a fixed-size chunk of the BWT).
5//!
6//! This matches the C++ SideLocus struct in bt2_idx.h for binary compatibility.
7
8use crate::EbwtParams;
9use crate::params::OFF_SIZE;
10use std::fmt;
11
12/// Position within the FM-index
13///
14/// This corresponds to the `SideLocus` struct in the C++ code.
15/// The FM-index is organized into "sides" of fixed size for
16/// efficient cache access.
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
18pub struct SideLocus {
19    /// Offset of side within ebwt[] (in bytes)
20    pub side_byte_off: u64,
21
22    /// Index of side
23    pub side_num: u64,
24
25    /// Character offset within side (0 to sideBwtLen-1)
26    pub char_off: u32,
27
28    /// Byte within side (0 to sideBwtSz-1)
29    pub by: i32,
30
31    /// Bit-pair within byte (0-3)
32    pub bp: i32,
33}
34
35impl SideLocus {
36    /// Size constant: 48 * OFF_SIZE (hard-coded in C++)
37    const SIDE_BWT_LEN: u64 = 48 * OFF_SIZE as u64;
38
39    /// Create a new invalid SideLocus (default state)
40    pub fn new() -> Self {
41        Self {
42            side_byte_off: 0,
43            side_num: 0,
44            char_off: 0,
45            by: -1,
46            bp: -1,
47        }
48    }
49
50    /// Initialize from a BWT row position
51    ///
52    /// # Arguments
53    /// * `row` - BWT row position
54    /// * `ep` - Ebwt parameters
55    /// * `ebwt` - Pointer to ebwt data (for prefetching)
56    ///
57    /// # Safety
58    /// The ebwt pointer must be valid for the lifetime of this SideLocus.
59    pub fn init_from_row(&mut self, row: u64, ep: &EbwtParams, ebwt: &[u8]) {
60        let side_sz = ep.side_sz as i32;
61        // Side length is hard-coded for now; this allows the compiler
62        // to do clever things to accelerate / and %.
63        let side_num = row / Self::SIDE_BWT_LEN;
64        assert!(side_num < ep.num_sides, "side_num out of range");
65        let char_off = (row % Self::SIDE_BWT_LEN) as u32;
66
67        self.side_num = side_num;
68        self.char_off = char_off;
69        let s_byte_off = side_num * side_sz as u64;
70
71        // Prefetch cache lines
72        Self::prefetch_from_ebwt(ebwt, s_byte_off as usize);
73
74        self.side_byte_off = s_byte_off;
75        assert!(row <= ep.len, "row exceeds reference length");
76        assert!(
77            self.side_byte_off + side_sz as u64 <= ep.ebwt_tot_sz,
78            "side_byte_off exceeds ebwt size"
79        );
80
81        // Tons of cache misses on the next line
82        self.by = (char_off >> 2) as i32; // byte within side
83        assert!(self.by < ep.side_bwt_sz as i32, "by exceeds side_bwt_sz");
84        self.bp = (char_off & 3) as i32; // bit-pair within byte
85    }
86
87    /// Initialize two SideLocus objects from a top/bot pair
88    ///
89    /// This is an optimized version that uses the result from one call
90    /// to initFromRow to possibly avoid a second call.
91    pub fn init_from_top_bot(top: u64, bot: u64, ep: &EbwtParams, ebwt: &[u8]) -> (Self, Self) {
92        assert!(bot > top, "bot must be > top");
93
94        let mut ltop = Self::new();
95        ltop.init_from_row(top, ep, ebwt);
96
97        let spread = bot - top;
98        let char_off_sum = ltop.char_off as u64 + spread;
99
100        let lbot = if char_off_sum < ep.side_bwt_len as u64 {
101            // Same side as ltop
102            let bchar_off = char_off_sum as u32;
103            Self {
104                side_byte_off: ltop.side_byte_off,
105                side_num: ltop.side_num,
106                char_off: bchar_off,
107                by: (bchar_off >> 2) as i32,
108                bp: (bchar_off & 3) as i32,
109            }
110        } else {
111            // Different side - need to init from row
112            let mut locus = Self::new();
113            locus.init_from_row(bot, ep, ebwt);
114            locus
115        };
116
117        (ltop, lbot)
118    }
119
120    /// Prefetch cache lines for a given row
121    pub fn prefetch_from_row(row: u64, ep: &EbwtParams, ebwt: &[u8]) {
122        let side_sz = ep.side_sz as i32;
123        let side_num = row / Self::SIDE_BWT_LEN;
124        let side_byte_off = (side_num * side_sz as u64) as usize;
125        Self::prefetch_from_ebwt(ebwt, side_byte_off);
126    }
127
128    /// Prefetch cache lines for this SideLocus
129    pub fn prefetch(&self, ebwt: &[u8]) {
130        Self::prefetch_from_ebwt(ebwt, self.side_byte_off as usize);
131    }
132
133    /// Prefetch from ebwt at given offset
134    fn prefetch_from_ebwt(ebwt: &[u8], offset: usize) {
135        if offset + 64 <= ebwt.len() {
136            // Prefetch first 64-byte cache line
137            #[cfg(target_arch = "x86_64")]
138            unsafe {
139                std::arch::x86_64::_mm_prefetch(
140                    ebwt.as_ptr().add(offset) as *const i8,
141                    std::arch::x86_64::_MM_HINT_T0,
142                );
143            }
144            // Prefetch second 64-byte cache line
145            if offset + 128 <= ebwt.len() {
146                #[cfg(target_arch = "x86_64")]
147                unsafe {
148                    std::arch::x86_64::_mm_prefetch(
149                        ebwt.as_ptr().add(offset + 64) as *const i8,
150                        std::arch::x86_64::_MM_HINT_T0,
151                    );
152                }
153            }
154        }
155    }
156
157    /// Transform this SideLocus to refer to the next side
158    pub fn next_side(&mut self, ep: &EbwtParams) {
159        assert!(self.valid(), "SideLocus must be valid");
160        self.side_byte_off += ep.side_sz as u64;
161        self.side_num += 1;
162        self.by = 0;
163        self.bp = 0;
164        self.char_off = 0;
165        assert!(self.valid(), "SideLocus must be valid after next_side");
166    }
167
168    /// Check if this is a valid (initialized) SideLocus
169    pub fn valid(&self) -> bool {
170        self.bp != -1
171    }
172
173    /// Check if this SideLocus is valid with respect to given parameters
174    pub fn is_valid(&self, _ep: &EbwtParams) -> bool {
175        self.valid()
176    }
177
178    /// Convert locus to BW row it corresponds to
179    pub fn to_bw_row(&self) -> u64 {
180        self.side_num * Self::SIDE_BWT_LEN + self.char_off as u64
181    }
182
183    /// Invalidate this SideLocus (make it look like an invalid one)
184    pub fn invalidate(&mut self) {
185        self.bp = -1;
186    }
187
188    /// Get a read-only pointer to the beginning of the top side
189    pub fn side<'a>(&self, ebwt: &'a [u8]) -> &'a [u8] {
190        &ebwt[self.side_byte_off as usize..]
191    }
192
193    /// Get the actual byte offset in the mmapped data
194    pub fn byte_offset(&self) -> u64 {
195        self.side_byte_off
196    }
197
198    /// Get the mask for extracting the bit-pair at this position
199    pub fn mask(&self) -> u8 {
200        0x03 << (self.bp * 2)
201    }
202
203    /// Extract the nucleotide pair at this locus from data
204    pub fn extract(&self, data: &[u8]) -> u8 {
205        let byte = data[self.byte_offset() as usize];
206        (byte >> (self.bp * 2)) & 0x03
207    }
208}
209
210impl Default for SideLocus {
211    fn default() -> Self {
212        Self::new()
213    }
214}
215
216impl fmt::Display for SideLocus {
217    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
218        write!(
219            f,
220            "SideLocus(side={}, char_off={}, by={}, bp={})",
221            self.side_num, self.char_off, self.by, self.bp
222        )
223    }
224}
225
226#[cfg(test)]
227mod tests {
228    use super::*;
229
230    #[test]
231    fn test_locus_new() {
232        let locus = SideLocus::new();
233        assert!(!locus.valid());
234    }
235
236    #[test]
237    fn test_locus_invalid() {
238        let mut locus = SideLocus::new();
239        assert!(!locus.valid());
240        locus.invalidate();
241        assert!(!locus.valid());
242    }
243
244    #[test]
245    fn test_locus_to_bw_row() {
246        let locus = SideLocus {
247            side_byte_off: 0,
248            side_num: 0,
249            char_off: 100,
250            by: 25,
251            bp: 0,
252        };
253        assert_eq!(locus.to_bw_row(), 100);
254    }
255
256    #[test]
257    fn test_locus_next_side() {
258        let ep = EbwtParams::new(10000);
259        let mut locus = SideLocus {
260            side_byte_off: ep.side_sz as u64,
261            side_num: 1,
262            char_off: 50,
263            by: 12,
264            bp: 2,
265        };
266        assert!(locus.valid());
267        locus.next_side(&ep);
268        assert_eq!(locus.side_num, 2);
269        assert_eq!(locus.char_off, 0);
270        assert_eq!(locus.by, 0);
271        assert_eq!(locus.bp, 0);
272        assert!(locus.valid());
273    }
274
275    #[test]
276    fn test_mask() {
277        // Test all 4 bit-pair positions
278        for bp in 0..4 {
279            let locus = SideLocus {
280                side_byte_off: 0,
281                side_num: 0,
282                char_off: 0,
283                by: 0,
284                bp,
285            };
286            assert_eq!(locus.mask(), 0x03 << (bp * 2));
287        }
288    }
289}