rustalign_fmindex/
ebwt.rs

1//! Extended Burrows-Wheeler Transform (Ebwt) - Main FM-index structure
2//!
3//! This is the core data structure for RustAlign's sequence alignment.
4//! It implements a memory-mapped FM-index for efficient pattern matching.
5
6use crate::{EbwtParams, FmIndex};
7use memmap2::Mmap;
8use rustalign_common::{AlignError, AlignResult, Nuc, NucPair};
9use std::fs::File;
10use std::path::{Path, PathBuf};
11
12/// Extended Burrows-Wheeler Transform (FM-index)
13///
14/// This is the main data structure used by RustAlign for ultrafast
15/// sequence alignment. It is memory-mapped from .rai index files.
16pub struct Ebwt {
17    /// Index parameters
18    params: EbwtParams,
19
20    /// Path to the index base (without extension)
21    path: PathBuf,
22
23    /// Mmapped forward index (.1.rai file)
24    ebwt_fw: Option<Mmap>,
25
26    /// Mmapped suffix array samples (.2.rai file)
27    offs_mmap: Option<Mmap>,
28
29    /// Mmapped reverse index (not currently used)
30    #[allow(dead_code)]
31    ebwt_rc: Option<Mmap>,
32
33    /// Whether this is a "large" index
34    large: bool,
35
36    /// Number of reference sequences (chromosomes)
37    n_pat: u32,
38
39    /// Number of fragments
40    n_frag: u32,
41
42    /// Length of each reference sequence
43    plen: Vec<u32>,
44
45    /// Reference sequence names (chromosome names)
46    refnames: Vec<String>,
47
48    /// rstarts array: [frag_start, text_id, frag_offset] for each fragment
49    /// Stored as flat array of n_frag * 3 elements
50    rstarts: Vec<u32>,
51
52    /// Offset to ftab in the mmapped data
53    ftab_off: usize,
54
55    /// Offset to eftab in the mmapped data
56    #[allow(dead_code)]
57    eftab_off: usize,
58
59    /// Offset to ebwt data in the mmapped data
60    ebwt_off: usize,
61
62    /// Offset to rstarts in the mmapped data
63    #[allow(dead_code)]
64    rstarts_off: usize,
65
66    /// Position of '$' in BWT
67    z_off: u32,
68
69    /// First character array (C array) - start positions for each character
70    fchr: [u64; 5],
71}
72
73impl Ebwt {
74    /// Load an existing index from disk
75    ///
76    /// # Arguments
77    /// * `path_base` - Path to the index (without .1.rai, .2.rai, etc. extensions)
78    ///
79    /// # Errors
80    /// Returns an error if the index files don't exist or are corrupted.
81    pub fn load<P: AsRef<Path>>(path_base: P) -> AlignResult<Self> {
82        let path_base = path_base.as_ref();
83        Self::load_with_options(path_base, false)
84    }
85
86    /// Load a "large" index variant
87    pub fn load_large<P: AsRef<Path>>(path_base: P) -> AlignResult<Self> {
88        let path_base = path_base.as_ref();
89        Self::load_with_options(path_base, true)
90    }
91
92    /// Load index with size option
93    fn load_with_options(path_base: &Path, large: bool) -> AlignResult<Self> {
94        // Construct file extension based on size
95        let ext = if large { ".l" } else { "" };
96        let base_name = path_base
97            .file_name()
98            .ok_or_else(|| AlignError::InvalidFormat("Invalid index path".into()))?
99            .to_str()
100            .unwrap();
101
102        // Load the first .1.rai file
103        let file_1 = path_base.with_file_name(format!("{}.1.rai{}", base_name, ext));
104        let file_2 = path_base.with_file_name(format!("{}.2.rai{}", base_name, ext));
105
106        if !file_1.exists() || !file_2.exists() {
107            return Err(AlignError::IndexCorrupted {
108                path: path_base.to_path_buf(),
109            });
110        }
111
112        // Load and parse parameters from file header
113        let file = File::open(&file_1)?;
114        let mmap = unsafe { Mmap::map(&file)? };
115
116        // Load the suffix array samples from .2.rai file
117        let file2 = File::open(&file_2)?;
118        let offs_mmap = unsafe { Mmap::map(&file2)? };
119
120        // Parse header and get offsets
121        let (
122            params,
123            n_pat,
124            n_frag,
125            plen,
126            rstarts,
127            ftab_off,
128            eftab_off,
129            ebwt_off,
130            rstarts_off,
131            z_off,
132            fchr,
133            refnames,
134        ) = Self::parse_header_and_offsets(&mmap)?;
135
136        Ok(Self {
137            params,
138            path: path_base.to_path_buf(),
139            ebwt_fw: Some(mmap),
140            offs_mmap: Some(offs_mmap),
141            ebwt_rc: None,
142            large,
143            n_pat,
144            n_frag,
145            plen,
146            refnames,
147            rstarts,
148            ftab_off,
149            eftab_off,
150            ebwt_off,
151            rstarts_off,
152            z_off,
153            fchr,
154        })
155    }
156
157    /// Parse EbwtParams and calculate offsets from mmapped data
158    ///
159    /// The RustAlign index file format (.rai):
160    /// - endian hint: int32 (should be 1 for little-endian)
161    /// - len: uint32 (length of reference sequence)
162    /// - lineRate: int32
163    /// - linesPerSide: int32 (not used, always 2)
164    /// - offRate: int32
165    /// - ftabChars: int32
166    /// - flags: int32 (colorspace and entire-reverse flags)
167    /// - nPat: uint32 (number of reference sequences)
168    /// - plen: uint32[nPat] (lengths of each sequence)
169    /// - nFrag: uint32 (number of fragments)
170    /// - rstarts: uint32[nFrag * 3] (fragment starts, text ids, offsets)
171    /// - ebwt: uint8[ebwtTotLen] (BWT data with side structure)
172    /// - zOff: uint32 (position of '$' in BWT)
173    /// - fchr: uint32[5] (first character ranges: A, C, G, T, N)
174    /// - ftab: uint32[4^ftabChars + 1] (frequency table)
175    /// - eftab: uint32[ftabChars * 2] (extended frequency table)
176    /// - refnames: null-terminated, newline-separated chromosome names
177    #[allow(clippy::type_complexity)]
178    fn parse_header_and_offsets(
179        data: &[u8],
180    ) -> AlignResult<(
181        EbwtParams,
182        u32,
183        u32,
184        Vec<u32>,
185        Vec<u32>,
186        usize,
187        usize,
188        usize,
189        usize,
190        u32,
191        [u64; 5],
192        Vec<String>,
193    )> {
194        use byteorder::{LittleEndian, ReadBytesExt};
195        use std::io::Cursor;
196
197        if data.len() < 28 {
198            return Err(AlignError::InvalidFormat(
199                "Index file too small for header".into(),
200            ));
201        }
202
203        let mut cursor = Cursor::new(data);
204
205        // Read endian hint (should be 1 for little-endian)
206        let endian_hint = cursor
207            .read_i32::<LittleEndian>()
208            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read endian hint: {}", e)))?;
209
210        if endian_hint != 1 {
211            return Err(AlignError::InvalidFormat(format!(
212                "Invalid endian hint: {} (expected 1)",
213                endian_hint
214            )));
215        }
216
217        // Read header fields
218        let len = cursor
219            .read_u32::<LittleEndian>()
220            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read len: {}", e)))?
221            as u64;
222
223        let line_rate = cursor
224            .read_i32::<LittleEndian>()
225            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read lineRate: {}", e)))?;
226
227        let _lines_per_side = cursor.read_i32::<LittleEndian>().map_err(|e| {
228            AlignError::InvalidFormat(format!("Failed to read linesPerSide: {}", e))
229        })?;
230
231        let off_rate = cursor
232            .read_i32::<LittleEndian>()
233            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read offRate: {}", e)))?;
234
235        let ftab_chars = cursor
236            .read_i32::<LittleEndian>()
237            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read ftabChars: {}", e)))?;
238
239        let flags = cursor
240            .read_i32::<LittleEndian>()
241            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read flags: {}", e)))?;
242
243        // Extract flags (from EBWT_FLAGS enum)
244        // Flags are stored as negative values
245        let color = flags < 0 && ((-flags) & 2) != 0;
246        let entire_reverse = flags < 0 && ((-flags) & 4) != 0;
247
248        // Create params using the parsed values
249        let params =
250            EbwtParams::with_options(len, line_rate, off_rate, ftab_chars, color, entire_reverse);
251
252        if !params.is_valid() {
253            return Err(AlignError::InvalidFormat(format!(
254                "Invalid index parameters: len={}, lineRate={}, offRate={}, ftabChars={}",
255                len, line_rate, off_rate, ftab_chars
256            )));
257        }
258
259        // Read nPat (number of reference sequences)
260        let n_pat = cursor
261            .read_u32::<LittleEndian>()
262            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read nPat: {}", e)))?;
263
264        // Read plen array (n_pat * 4 bytes)
265        let plen_off = cursor.position() as usize;
266        let plen_size = n_pat as usize * 4;
267        if plen_off + plen_size > data.len() {
268            return Err(AlignError::InvalidFormat(
269                "Index file truncated at plen".into(),
270            ));
271        }
272
273        let mut plen = Vec::with_capacity(n_pat as usize);
274        for i in 0..n_pat as usize {
275            plen.push(Self::read_u32_at(data, plen_off + i * 4)?);
276        }
277        cursor.set_position(cursor.position() + plen_size as u64);
278
279        // Read nFrag (number of fragments)
280        let n_frag = cursor
281            .read_u32::<LittleEndian>()
282            .map_err(|e| AlignError::InvalidFormat(format!("Failed to read nFrag: {}", e)))?;
283
284        // Read rstarts array (n_frag * 3 * 4 bytes)
285        let rstarts_off = cursor.position() as usize;
286        let rstarts_size = n_frag as usize * 3 * 4;
287        if rstarts_off + rstarts_size > data.len() {
288            return Err(AlignError::InvalidFormat(
289                "Index file truncated at rstarts".into(),
290            ));
291        }
292
293        let mut rstarts = Vec::with_capacity(n_frag as usize * 3);
294        for i in 0..(n_frag as usize * 3) {
295            rstarts.push(Self::read_u32_at(data, rstarts_off + i * 4)?);
296        }
297        cursor.set_position(cursor.position() + rstarts_size as u64);
298
299        // Skip ebwt data (the BWT itself - this is the largest section)
300        let ebwt_tot_len = params.ebwt_tot_len as usize;
301        if cursor.position() as usize + ebwt_tot_len > data.len() {
302            return Err(AlignError::InvalidFormat(
303                "Index file truncated at ebwt".into(),
304            ));
305        }
306
307        // Record ebwt offset before skipping
308        let ebwt_off = cursor.position() as usize;
309        cursor.set_position(cursor.position() + ebwt_tot_len as u64);
310
311        // Read zOff (single uint32)
312        if cursor.position() as usize + 4 > data.len() {
313            return Err(AlignError::InvalidFormat(
314                "Index file truncated at zOff".into(),
315            ));
316        }
317        let z_off = Self::read_u32_at(data, cursor.position() as usize)?;
318        cursor.set_position(cursor.position() + 4);
319
320        // Now we're at fchr (first character array - 5 uint32 values)
321        let fchr_off = cursor.position() as usize;
322        if fchr_off + 20 > data.len() {
323            return Err(AlignError::InvalidFormat(
324                "Index file truncated at fchr".into(),
325            ));
326        }
327
328        // Read fchr (first character ranges)
329        // fchr[0] = start of A (always 0)
330        // fchr[1] = start of C (count of A)
331        // fchr[2] = start of G (count of A + C)
332        // fchr[3] = start of T (count of A + C + G)
333        // fchr[4] = end of T (bwt length)
334        let fchr = [
335            Self::read_u32_at(data, fchr_off)? as u64,      // A start
336            Self::read_u32_at(data, fchr_off + 4)? as u64,  // C start
337            Self::read_u32_at(data, fchr_off + 8)? as u64,  // G start
338            Self::read_u32_at(data, fchr_off + 12)? as u64, // T start
339            Self::read_u32_at(data, fchr_off + 16)? as u64, // N/end
340        ];
341
342        // Move past fchr to get to ftab
343        cursor.set_position(fchr_off as u64 + 20);
344
345        // ftab starts here
346        let ftab_off = cursor.position() as usize;
347        let ftab_len = params.ftab_len as usize;
348        let ftab_size = ftab_len * 4;
349
350        if ftab_off + ftab_size > data.len() {
351            return Err(AlignError::InvalidFormat(
352                "Index file truncated at ftab".into(),
353            ));
354        }
355
356        // eftab follows ftab
357        let eftab_off = ftab_off + ftab_size;
358        let eftab_size = params.eftab_len as usize * 4;
359
360        // Read chromosome names (refnames) - they come after eftab
361        // The names are stored as the full FASTA header, but we only need
362        // the first word (the actual chromosome name like "1", "Mt", "Pt")
363        let refnames_off = eftab_off + eftab_size;
364        let mut refnames = Vec::new();
365
366        if refnames_off < data.len() {
367            let mut current_name = String::new();
368            let mut pos = refnames_off;
369
370            while pos < data.len() {
371                let c = data[pos];
372                pos += 1;
373
374                if c == 0 {
375                    // Null terminator - end of names
376                    if !current_name.is_empty() {
377                        // Only take the first word (before any space)
378                        let short_name = current_name.split_whitespace().next().unwrap_or("");
379                        refnames.push(short_name.to_string());
380                    }
381                    break;
382                } else if c == b'\n' {
383                    // Newline - start new name
384                    if !current_name.is_empty() {
385                        // Only take the first word (before any space)
386                        let short_name = current_name.split_whitespace().next().unwrap_or("");
387                        refnames.push(short_name.to_string());
388                        current_name.clear();
389                    }
390                } else {
391                    current_name.push(c as char);
392                }
393            }
394
395            // Don't forget the last name if not empty
396            if !current_name.is_empty() {
397                let short_name = current_name.split_whitespace().next().unwrap_or("");
398                refnames.push(short_name.to_string());
399            }
400        }
401
402        Ok((
403            params,
404            n_pat,
405            n_frag,
406            plen,
407            rstarts,
408            ftab_off,
409            eftab_off,
410            ebwt_off,
411            rstarts_off,
412            z_off,
413            fchr,
414            refnames,
415        ))
416    }
417
418    /// Read a little-endian u32 at a given offset
419    fn read_u32_at(data: &[u8], offset: usize) -> AlignResult<u32> {
420        if offset + 4 > data.len() {
421            return Err(AlignError::InvalidFormat("Offset out of bounds".into()));
422        }
423        let bytes: [u8; 4] = data[offset..offset + 4].try_into().unwrap();
424        Ok(u32::from_le_bytes(bytes))
425    }
426
427    /// Read a u32 from ftab at given index
428    fn ftab(&self, idx: usize) -> u64 {
429        let offset = self.ftab_off + idx * 4;
430        u32::from_le_bytes(
431            self.ebwt_fw.as_ref().unwrap()[offset..offset + 4]
432                .try_into()
433                .unwrap(),
434        ) as u64
435    }
436
437    /// Get the fchr (first character array) entry for a nucleotide
438    fn fchr(&self, c: u8) -> u64 {
439        match c {
440            0 => self.fchr[0], // A
441            1 => self.fchr[1], // C
442            2 => self.fchr[2], // G
443            3 => self.fchr[3], // T
444            _ => self.fchr[4], // N/others
445        }
446    }
447
448    /// Internal rank operation - count occurrences of character c up to position pos
449    ///
450    /// Uses RustAlign's side-based counting structure for efficiency.
451    /// Each side has the format: [bwt_data (side_bwt_sz bytes)] [4 x u32 counts for A,C,G,T]
452    /// The counts are cumulative - they represent all occurrences up to that side boundary.
453    fn rank_internal(&self, c: usize, pos: u64) -> AlignResult<usize> {
454        if pos == 0 {
455            return Ok(0);
456        }
457
458        let data = self
459            .ebwt_fw
460            .as_ref()
461            .ok_or_else(|| AlignError::InvalidFormat("Index not loaded".into()))?;
462
463        // BWT is bit-packed: 2 bits per nucleotide, 4 nucleotides per byte
464        let side_bwt_len = self.params.side_bwt_len as u64;
465        let side_bwt_sz = self.params.side_bwt_sz as usize;
466        let side_sz = self.params.side_sz as usize;
467
468        // Determine which side the position falls in
469        let side_num = pos / side_bwt_len;
470        let char_off = (pos % side_bwt_len) as usize; // character offset within side
471
472        // Read the cumulative count from the side boundary
473        // Counts are stored at offset side_bwt_sz within each side
474        let mut count = 0usize;
475
476        if side_num > 0 {
477            // Read cumulative counts from the start of the current side
478            // These counts represent all occurrences in sides 0..side_num
479            let side_start = self.ebwt_off + side_num as usize * side_sz;
480            let count_offset = side_start + side_bwt_sz + c * 4; // 4 bytes per count
481
482            if count_offset + 4 <= data.len() {
483                count = u32::from_le_bytes(data[count_offset..count_offset + 4].try_into().unwrap())
484                    as usize;
485            }
486        }
487
488        // Count occurrences within the current side up to char_off
489        // This is the countUpTo part
490        if char_off > 0 {
491            let side_start = self.ebwt_off + side_num as usize * side_sz;
492
493            // Count nucleotides one by one from start of side to char_off
494            for i in 0..char_off {
495                let byte_off = side_start + (i >> 2); // i / 4
496                let bit_off = (i & 3) * 2; // (i % 4) * 2
497
498                if byte_off < data.len() {
499                    let byte = data[byte_off];
500                    let nuc = ((byte >> bit_off) & 0x03) as usize;
501                    if nuc == c {
502                        count += 1;
503                    }
504                }
505            }
506        }
507
508        Ok(count)
509    }
510
511    /// Get the index parameters
512    pub fn params(&self) -> &EbwtParams {
513        &self.params
514    }
515
516    /// Get the path to this index
517    pub fn path(&self) -> &Path {
518        &self.path
519    }
520
521    /// Check if this is a large index
522    pub fn is_large(&self) -> bool {
523        self.large
524    }
525
526    /// Get the chromosome names
527    pub fn refnames(&self) -> &[String] {
528        &self.refnames
529    }
530
531    /// Get the number of chromosomes
532    pub fn n_pat(&self) -> u32 {
533        self.n_pat
534    }
535
536    /// Get the number of fragments
537    pub fn n_frag(&self) -> u32 {
538        self.n_frag
539    }
540
541    /// Get the rstarts array
542    pub fn rstarts(&self) -> &[u32] {
543        &self.rstarts
544    }
545
546    /// Get chromosome lengths
547    pub fn plen(&self) -> &[u32] {
548        &self.plen
549    }
550
551    /// Get the zOff value (position of $ in BWT)
552    pub fn z_off(&self) -> u32 {
553        self.z_off
554    }
555
556    /// Read a suffix array sample from the .2.rai file
557    ///
558    /// The offs array stores sampled suffix array values at positions
559    /// that are multiples of (1 << offRate).
560    #[allow(clippy::collapsible_if)]
561    fn offs(&self, idx: usize) -> u32 {
562        // Skip the 4-byte endianness hint at the start of .2.rai file
563        let offset = 4 + idx * 4;
564        if let Some(ref mmap) = self.offs_mmap {
565            if offset + 4 <= mmap.len() {
566                return u32::from_le_bytes(mmap[offset..offset + 4].try_into().unwrap());
567            }
568        }
569        0
570    }
571
572    /// Convert a BWT row to a genome offset using suffix array samples and LF-mapping.
573    ///
574    /// This implements RustAlign's getOffset() algorithm:
575    /// 1. If row is zOff, return 0
576    /// 2. If row is at a sampled position, return the sampled offset
577    /// 3. Otherwise, walk back using LF-mapping until hitting zOff or a sampled row
578    pub fn get_offset(&self, row: u64) -> AlignResult<u64> {
579        // Check for zOff (the $ character position)
580        if row == self.z_off as u64 {
581            return Ok(0);
582        }
583
584        // Check if row is at a sampled position (multiple of 2^offRate)
585        // off_mask has all bits set except the lower offRate bits
586        // (row & off_mask) == row means lower offRate bits are all 0
587        if (row & self.params.off_mask) == row {
588            let sample_idx = (row >> self.params.off_rate) as usize;
589            let offset = self.offs(sample_idx);
590            return Ok(offset as u64);
591        }
592
593        // Walk back using LF-mapping until hitting zOff or a sampled row
594        let mut current_row = row;
595        let mut jumps = 0u64;
596
597        // No theoretical maximum - we just walk until we hit a known position
598        loop {
599            // Perform LF-mapping step
600            let c = self.bwt_char(current_row)?;
601            let fchr_c = self.fchr(c as u8);
602            let rank_c = self.rank_internal(c, current_row)?;
603            current_row = fchr_c + rank_c as u64;
604            jumps += 1;
605
606            // Check if we hit zOff
607            if current_row == self.z_off as u64 {
608                return Ok(jumps);
609            }
610
611            // Check if we hit a sampled row
612            if (current_row & self.params.off_mask) == current_row {
613                let sample_idx = (current_row >> self.params.off_rate) as usize;
614                let offset = self.offs(sample_idx);
615                return Ok(jumps + offset as u64);
616            }
617
618            // Safety limit to prevent infinite loops (should never hit this in practice)
619            if jumps > self.params.len {
620                return Err(AlignError::InvalidFormat(format!(
621                    "getOffset failed to resolve row {} after {} steps (exceeded genome length)",
622                    row, jumps
623                )));
624            }
625        }
626    }
627
628    /// Get the character at a given BWT position
629    fn bwt_char(&self, pos: u64) -> AlignResult<usize> {
630        let data = self
631            .ebwt_fw
632            .as_ref()
633            .ok_or_else(|| AlignError::InvalidFormat("Index not loaded".into()))?;
634
635        let side_bwt_len = self.params.side_bwt_len as u64;
636        let side_sz = self.params.side_sz as usize;
637
638        let side_num = pos / side_bwt_len;
639        let char_off = (pos % side_bwt_len) as usize;
640
641        let side_start = self.ebwt_off + side_num as usize * side_sz;
642        let byte_off = side_start + (char_off >> 2);
643        let bit_off = (char_off & 3) * 2;
644
645        if byte_off < data.len() {
646            let byte = data[byte_off];
647            Ok(((byte >> bit_off) & 0x03) as usize)
648        } else {
649            Err(AlignError::InvalidFormat(format!(
650                "BWT position {} out of bounds",
651                pos
652            )))
653        }
654    }
655
656    /// Convert a genome offset to chromosome index and position.
657    ///
658    /// This implements RustAlign's joinedToTextOff() algorithm using binary search
659    /// on the rstarts array.
660    ///
661    /// Returns (chromosome_index, position_in_chromosome, chromosome_length)
662    pub fn joined_to_text_off(
663        &self,
664        qlen: u64,
665        off: u64,
666        fw: bool,
667    ) -> AlignResult<(u32, u64, u64)> {
668        if self.rstarts.is_empty() {
669            return Err(AlignError::InvalidFormat("rstarts array not loaded".into()));
670        }
671
672        let n_frag = self.n_frag as usize;
673
674        // Binary search for the fragment containing this offset
675        let mut top = 0usize;
676        let mut bot = n_frag;
677
678        while top < bot {
679            let mid = top + ((bot - top) >> 1);
680
681            // rstarts format: [frag_start, text_id, frag_offset] for each fragment
682            let lower = self.rstarts[mid * 3] as u64;
683            let upper = if mid + 1 >= n_frag {
684                self.params.len
685            } else {
686                self.rstarts[(mid + 1) * 3] as u64
687            };
688
689            if lower <= off {
690                if upper > off {
691                    // Found the right fragment
692                    let tidx = self.rstarts[mid * 3 + 1]; // chromosome index
693                    let frag_off_base = self.rstarts[mid * 3 + 2]; // fragment offset within text
694
695                    // Calculate position within the fragment
696                    let frag_len = upper - lower;
697                    let mut frag_off = off - lower;
698
699                    // Handle reverse index orientation
700                    if !fw {
701                        if frag_off + qlen > frag_len {
702                            // Would straddle fragment boundary
703                            return Err(AlignError::InvalidFormat(
704                                "Alignment straddles fragment boundary".into(),
705                            ));
706                        }
707                        frag_off = frag_len - frag_off - 1;
708                        frag_off = frag_off.saturating_sub(qlen - 1);
709                    }
710
711                    // Add fragment offset within the chromosome
712                    let text_off = frag_off + frag_off_base as u64;
713
714                    // Get chromosome length from plen
715                    let tlen = if (tidx as usize) < self.plen.len() {
716                        self.plen[tidx as usize] as u64
717                    } else {
718                        0
719                    };
720
721                    return Ok((tidx, text_off, tlen));
722                } else {
723                    top = mid + 1;
724                }
725            } else {
726                bot = mid;
727            }
728        }
729
730        Err(AlignError::InvalidFormat(format!(
731            "Failed to find fragment for offset {}",
732            off
733        )))
734    }
735
736    /// Resolve a BWT row to chromosome name and position.
737    ///
738    /// This combines getOffset() and joinedToTextOff() to convert a BWT row
739    /// directly to a chromosome name and position.
740    ///
741    /// Returns (chromosome_name, position, chromosome_length)
742    pub fn resolve_position(
743        &self,
744        row: u64,
745        qlen: u64,
746        fw: bool,
747    ) -> AlignResult<(String, u64, u64)> {
748        // Step 1: Convert BWT row to genome offset
749        let genome_off = self.get_offset(row)?;
750
751        // Step 2: Convert genome offset to chromosome + position
752        let (chr_idx, pos, chr_len) = self.joined_to_text_off(qlen, genome_off, fw)?;
753
754        // Step 3: Get chromosome name
755        let chr_name = if (chr_idx as usize) < self.refnames.len() {
756            self.refnames[chr_idx as usize].clone()
757        } else {
758            format!("chr{}", chr_idx)
759        };
760
761        // SAM format uses 1-based positions
762        Ok((chr_name, pos + 1, chr_len))
763    }
764}
765
766impl FmIndex for Ebwt {
767    /// Exact match search using backward search on FM-index
768    ///
769    /// This implements the classic FM-index backward search algorithm:
770    /// 1. Initialize range using ftab for the last ftabChars nucleotides
771    /// 2. Extend left through remaining characters using LF-mapping
772    /// 3. Return all positions in the final range
773    fn exact_match(&self, pattern: &[Nuc]) -> AlignResult<Vec<usize>> {
774        if pattern.is_empty() {
775            return Ok(Vec::new());
776        }
777
778        // Filter out invalid nucleotides (N's)
779        let valid_pattern: Vec<Nuc> = pattern.iter().filter(|&&n| n.is_valid()).copied().collect();
780        if valid_pattern.is_empty() {
781            return Ok(Vec::new());
782        }
783
784        let pat_len = valid_pattern.len();
785        let ftab_len = self.params.ftab_chars as usize;
786
787        // Step 1: Initialize range using ftab
788        // Use the last min(ftab_len, pat_len) characters
789        let init_len = ftab_len.min(pat_len);
790        let ftab_start = pat_len - init_len;
791
792        // Compute ftab index from the last init_len characters
793        let mut ftab_idx: usize = 0;
794        for i in 0..init_len {
795            let c = valid_pattern[ftab_start + i] as u8;
796            ftab_idx = (ftab_idx << 2) | (c as usize);
797        }
798
799        // Get initial range from ftab
800        let top = self.ftab(ftab_idx);
801        let bot = self.ftab(ftab_idx + 1);
802
803        if bot <= top {
804            // No matches
805            return Ok(Vec::new());
806        }
807
808        // Step 2: Extend left through remaining characters using LF-mapping
809        let mut top = top;
810        let mut bot = bot;
811
812        // Process characters from ftab_start-1 down to 0
813        for i in (0..ftab_start).rev() {
814            let c = valid_pattern[i] as u8;
815
816            // LF-mapping: top' = fchr[c] + rank(c, top)
817            //                       bot' = fchr[c] + rank(c, bot)
818            let fchr_c = self.fchr(c);
819            let rank_top = self.rank_internal(c as usize, top)?;
820            let rank_bot = self.rank_internal(c as usize, bot)?;
821
822            top = fchr_c + rank_top as u64;
823            bot = fchr_c + rank_bot as u64;
824
825            if bot <= top {
826                // No matches
827                return Ok(Vec::new());
828            }
829        }
830
831        // Step 3: Convert range to positions
832        // For now, return all positions in the range
833        // In a full implementation, we'd resolve these via the suffix array
834        Ok((top..bot).map(|i| i as usize).collect())
835    }
836
837    fn len(&self) -> usize {
838        self.params.len as usize
839    }
840
841    fn is_empty(&self) -> bool {
842        self.params.len == 0
843    }
844
845    fn bwt(&self, pos: usize) -> AlignResult<NucPair> {
846        if pos >= self.params.bwt_len as usize {
847            return Err(AlignError::InvalidArgument(
848                "BWT position out of range".into(),
849            ));
850        }
851
852        // In the full implementation, this would:
853        // 1. Calculate SideLocus from position
854        // 2. Extract the nucleotide pair from mmapped data
855        Ok(NucPair::default())
856    }
857
858    fn rank(&self, c: Nuc, _pos: usize) -> AlignResult<usize> {
859        if !c.is_valid() {
860            return Err(AlignError::InvalidNucleotide(c as u8));
861        }
862
863        // In the full implementation, this would:
864        // 1. Use the side-based rank structure
865        // 2. Count occurrences of c up to position
866        Ok(0)
867    }
868
869    fn select(&self, c: Nuc, _k: usize) -> AlignResult<usize> {
870        if !c.is_valid() {
871            return Err(AlignError::InvalidNucleotide(c as u8));
872        }
873
874        // In the full implementation, this would:
875        // 1. Use the side-based select structure
876        // 2. Find position of k-th occurrence of c
877        Ok(0)
878    }
879}
880
881#[cfg(test)]
882mod tests {
883    use super::*;
884
885    #[test]
886    fn test_ebwt_default() {
887        let params = EbwtParams::default();
888        assert_eq!(params.len, 0);
889    }
890
891    #[test]
892    fn test_ebwt_load_nonexistent() {
893        let result = Ebwt::load("/nonexistent/path/lambda_virus");
894        assert!(result.is_err());
895    }
896
897    // More tests would require actual index files or mock data
898}
rustalign_fmindex/ebwt.rs

rustalign_fmindex/
ebwt.rs