Skip to main content

rustalign_aligner/
lib.rs

1//! RustAlign Alignment Algorithms
2//!
3//! This crate implements the seed-based alignment and Smith-Waterman
4//! dynamic programming algorithms used by RustAlign.
5
6#![warn(missing_docs)]
7#![warn(clippy::all)]
8
9mod constraint;
10mod paired_end;
11mod result;
12mod seed;
13mod smith_waterman;
14mod sw_sse;
15
16#[cfg(test)]
17mod proptest;
18
19pub use constraint::{Constraint, SeedPolicy, SeedType};
20pub use paired_end::{PairedAlignment, PeAlignmentType, PeConstraints, PePolicy};
21pub use result::{Alignment, AlignmentResult};
22pub use seed::{Seed, SeedAligner, SeedResults};
23pub use smith_waterman::{SwAligner, SwParams};
24pub use sw_sse::{QueryProfile, SwSseAligner};
25
26use rustalign_common::{Nuc, Score, Strand};
27use rustalign_fmindex::{EbwtWithRef, FmIndex};
28
29/// Main aligner that orchestrates seed search and DP extension
30pub struct Aligner<I: FmIndex> {
31    /// FM-index for forward strand
32    index_fw: I,
33
34    /// FM-index for reverse strand
35    #[allow(dead_code)]
36    index_rc: I,
37
38    /// Alignment parameters
39    params: AlignerParams,
40}
41
42/// Aligner with reference sequence for proper SW scoring
43///
44/// This aligner extracts reference sequences from the index and uses them
45/// for Smith-Waterman alignment, giving accurate scores for matches/mismatches.
46pub struct AlignerWithRef {
47    /// Combined FM-index and reference
48    index_with_ref: EbwtWithRef,
49
50    /// Alignment parameters
51    params: AlignerParams,
52}
53
54/// Parameters for the aligner
55#[derive(Debug, Clone)]
56pub struct AlignerParams {
57    /// Maximum number of alignments per read
58    pub max_alignments: usize,
59
60    /// Seed policy
61    pub seed_policy: SeedPolicy,
62
63    /// Smith-Waterman parameters
64    pub sw_params: SwParams,
65
66    /// Minimum score threshold
67    pub min_score: Score,
68}
69
70impl Default for AlignerParams {
71    fn default() -> Self {
72        Self {
73            max_alignments: 50, // Extend many seeds to find best alignment across chromosomes
74            seed_policy: SeedPolicy::default(),
75            sw_params: SwParams::default(),
76            min_score: 30,
77        }
78    }
79}
80
81impl<I: FmIndex> Aligner<I> {
82    /// Create a new aligner with the given FM-indexes
83    pub fn new(index_fw: I, index_rc: I, params: AlignerParams) -> Self {
84        Self {
85            index_fw,
86            index_rc,
87            params,
88        }
89    }
90
91    /// Align a single read
92    ///
93    /// Searches both forward and reverse complement strands and returns
94    /// the best alignment found.
95    #[allow(clippy::collapsible_if)]
96    pub fn align(&self, read: &[Nuc], read_rc: &[Nuc]) -> AlignmentResult {
97        use rustalign_common::Strand;
98
99        let mut all_alignments = Vec::new();
100
101        // 1. Search forward strand (using the original read)
102        if let Ok(seed_results) = self.search_seeds(read) {
103            if !seed_results.hits.is_empty() {
104                for seed_hit in seed_results.hits.iter().take(self.params.max_alignments) {
105                    if let Ok(mut aln) = self.extend_seed(read, seed_hit) {
106                        aln.strand = Strand::Forward;
107                        if aln.score >= self.params.min_score {
108                            all_alignments.push(aln);
109                        }
110                    }
111                }
112            }
113        }
114
115        // 2. Search reverse complement strand (using the RC of the read)
116        if let Ok(seed_results) = self.search_seeds(read_rc) {
117            if !seed_results.hits.is_empty() {
118                for seed_hit in seed_results.hits.iter().take(self.params.max_alignments) {
119                    if let Ok(mut aln) = self.extend_seed(read_rc, seed_hit) {
120                        aln.strand = Strand::Reverse;
121                        if aln.score >= self.params.min_score {
122                            all_alignments.push(aln);
123                        }
124                    }
125                }
126            }
127        }
128
129        // 3. Return results
130        if all_alignments.is_empty() {
131            return AlignmentResult {
132                success: false,
133                error: Some("No seed matches found on either strand".to_string()),
134                ..Default::default()
135            };
136        }
137
138        // Sort by score (best first)
139        all_alignments.sort_by(|a, b| b.score.cmp(&a.score));
140
141        AlignmentResult {
142            success: true,
143            alignments: all_alignments,
144            ..Default::default()
145        }
146    }
147
148    /// Search for seed matches
149    fn search_seeds(&self, read: &[Nuc]) -> rustalign_common::Result<SeedResults> {
150        let mut seed_aligner = SeedAligner::new(self.params.seed_policy.clone());
151        seed_aligner.search(&self.index_fw, read)
152    }
153
154    /// Extend a seed with Smith-Waterman
155    fn extend_seed(&self, read: &[Nuc], seed: &Seed) -> rustalign_common::Result<Alignment> {
156        let sw = SwAligner::new(self.params.sw_params);
157        // TODO: Extract reference sequence around seed position
158        // For now, just use the read itself as placeholder
159        let mut aln = sw.align(read, read)?;
160        // Store the BWT row for later position resolution
161        aln.bwt_row = seed.bwt_row;
162        // Store the seed's position in the read for position calculation
163        aln.seed_offset = seed.read_pos;
164        // Store how many positions this seed matched (for tiebreaking)
165        aln.seed_hit_count = seed.hit_count;
166        Ok(aln)
167    }
168}
169
170impl AlignerWithRef {
171    /// Create a new aligner with reference
172    pub fn new(index_with_ref: EbwtWithRef, params: AlignerParams) -> Self {
173        Self {
174            index_with_ref,
175            params,
176        }
177    }
178
179    /// Align a single read using actual reference sequences
180    ///
181    /// Searches both forward and reverse complement strands and returns
182    /// all valid alignments with scores computed against actual reference.
183    #[allow(clippy::collapsible_if)]
184    pub fn align(&self, read: &[Nuc], read_rc: &[Nuc]) -> AlignmentResult {
185        let mut all_alignments = Vec::new();
186
187        // 1. Search forward strand
188        if let Ok(seed_results) = self.search_seeds(read) {
189            if !seed_results.hits.is_empty() {
190                for seed_hit in seed_results.hits.iter().take(self.params.max_alignments) {
191                    if let Ok(aln) = self.extend_seed_with_ref(read, seed_hit, Strand::Forward) {
192                        if aln.score >= self.params.min_score {
193                            all_alignments.push(aln);
194                        }
195                    }
196                }
197            }
198        }
199
200        // 2. Search reverse complement strand
201        if let Ok(seed_results) = self.search_seeds(read_rc) {
202            if !seed_results.hits.is_empty() {
203                for seed_hit in seed_results.hits.iter().take(self.params.max_alignments) {
204                    if let Ok(aln) = self.extend_seed_with_ref(read_rc, seed_hit, Strand::Reverse) {
205                        if aln.score >= self.params.min_score {
206                            all_alignments.push(aln);
207                        }
208                    }
209                }
210            }
211        }
212
213        // 3. Return results
214        if all_alignments.is_empty() {
215            return AlignmentResult {
216                success: false,
217                error: Some("No seed matches found on either strand".to_string()),
218                ..Default::default()
219            };
220        }
221
222        // Sort by score (best first)
223        all_alignments.sort_by(|a, b| b.score.cmp(&a.score));
224
225        AlignmentResult {
226            success: true,
227            alignments: all_alignments,
228            ..Default::default()
229        }
230    }
231
232    /// Search for seed matches
233    fn search_seeds(&self, read: &[Nuc]) -> rustalign_common::Result<SeedResults> {
234        let mut seed_aligner = SeedAligner::new(self.params.seed_policy.clone());
235        seed_aligner.search(self.index_with_ref.ebwt(), read)
236    }
237
238    /// Extend a seed with Smith-Waterman using actual reference sequence
239    #[allow(clippy::implicit_saturating_sub)]
240    fn extend_seed_with_ref(
241        &self,
242        read: &[Nuc],
243        seed: &Seed,
244        strand: Strand,
245    ) -> rustalign_common::Result<Alignment> {
246        let read_len = read.len() as u64;
247
248        // Step 1: Resolve position from BWT row
249        let (chr_idx, pos_in_chr) = {
250            let genome_off = self.index_with_ref.ebwt().get_offset(seed.bwt_row)?;
251
252            // Get chromosome index and position
253            // We need to use joined_to_text_off, which requires the internal method
254            let (chr_idx, text_off, _chr_len) = self
255                .index_with_ref
256                .ebwt()
257                .joined_to_text_off(read_len, genome_off, true)?;
258
259            // Adjust for seed offset within the read
260            let adjusted_pos = if text_off > seed.read_pos as u64 {
261                text_off - seed.read_pos as u64
262            } else {
263                0
264            };
265
266            (chr_idx, adjusted_pos)
267        };
268
269        // Step 2: Extract reference sequence
270        // Extract enough reference for the entire read plus some padding
271        let ref_len = read.len() + 20; // Extra padding for potential indels
272        let reference = self
273            .index_with_ref
274            .get_reference(chr_idx, pos_in_chr, ref_len)?;
275
276        // Step 3: Perform Smith-Waterman alignment against actual reference
277        let sw = SwAligner::new(self.params.sw_params);
278        let mut aln = if reference.len() >= read.len() {
279            sw.align(read, &reference)?
280        } else {
281            // Reference too short (near end of chromosome), use read as fallback
282            sw.align(read, read)?
283        };
284
285        // Store metadata
286        aln.bwt_row = seed.bwt_row;
287        aln.seed_offset = seed.read_pos;
288        aln.seed_hit_count = seed.hit_count;
289        aln.strand = strand;
290
291        Ok(aln)
292    }
293
294    /// Get the underlying EbwtWithRef
295    pub fn index(&self) -> &EbwtWithRef {
296        &self.index_with_ref
297    }
298}
299
300#[cfg(test)]
301mod tests {
302    use super::*;
303
304    #[test]
305    fn test_params_default() {
306        let params = AlignerParams::default();
307        assert_eq!(params.max_alignments, 50);
308        assert_eq!(params.min_score, 30);
309    }
310}