Skip to main content

rustalign_aligner/
seed.rs

1//! Seed-based alignment implementation
2
3use crate::{Constraint, SeedPolicy, SeedType};
4use rustalign_common::Nuc;
5use rustalign_fmindex::FmIndex;
6
7/// A seed (substring match) for alignment
8#[derive(Debug, Clone)]
9pub struct Seed {
10    /// BWT row position (for position resolution)
11    pub bwt_row: u64,
12
13    /// Starting position in reference
14    pub pos: usize,
15
16    /// Starting position in read
17    pub read_pos: usize,
18
19    /// Length of the seed
20    pub len: usize,
21
22    /// Seed type
23    pub seed_type: SeedType,
24
25    /// Constraint used for this seed
26    pub constraint: Constraint,
27
28    /// Number of genome positions this seed sequence matches
29    /// Lower values indicate more unique/better seeds
30    pub hit_count: usize,
31}
32
33impl Seed {
34    /// Create a new seed
35    pub fn new(bwt_row: u64, pos: usize, read_pos: usize, len: usize, seed_type: SeedType) -> Self {
36        Self {
37            bwt_row,
38            pos,
39            read_pos,
40            len,
41            seed_type,
42            constraint: Constraint::new(),
43            hit_count: 1,
44        }
45    }
46
47    /// Create a new seed with a specific hit count
48    pub fn with_hit_count(
49        bwt_row: u64,
50        pos: usize,
51        read_pos: usize,
52        len: usize,
53        seed_type: SeedType,
54        hit_count: usize,
55    ) -> Self {
56        Self {
57            bwt_row,
58            pos,
59            read_pos,
60            len,
61            seed_type,
62            constraint: Constraint::new(),
63            hit_count,
64        }
65    }
66}
67
68/// Results from seed search
69#[derive(Debug, Clone, Default)]
70pub struct SeedResults {
71    /// Seed hits found
72    pub hits: Vec<Seed>,
73
74    /// Number of seeds searched
75    pub seeds_searched: usize,
76}
77
78/// Seed aligner that searches for seed matches
79pub struct SeedAligner {
80    /// Seed policy
81    policy: SeedPolicy,
82
83    /// Number of seed sets to try (similar to -R option)
84    seed_sets: usize,
85}
86
87impl SeedAligner {
88    /// Create a new seed aligner
89    pub fn new(policy: SeedPolicy) -> Self {
90        Self {
91            policy,
92            seed_sets: 3, // Try 3 different seed offsets
93        }
94    }
95
96    /// Create a seed aligner with custom number of seed sets
97    pub fn with_seed_sets(policy: SeedPolicy, seed_sets: usize) -> Self {
98        Self { policy, seed_sets }
99    }
100
101    /// Search for seeds in a read using multiple seed sets
102    ///
103    /// This implements a multi-set seed search.
104    /// If the first seed set finds no matches, it tries additional sets with
105    /// different offsets to handle reads with SNPs that break some seeds.
106    pub fn search<I: FmIndex>(
107        &mut self,
108        index: &I,
109        read: &[Nuc],
110    ) -> rustalign_common::Result<SeedResults> {
111        let mut best_results = SeedResults::default();
112        let _read_len = read.len();
113
114        // Try multiple seed sets with different offsets
115        for set_idx in 0..self.seed_sets {
116            // Calculate offset for this seed set
117            // Set 0: offset 0 (default)
118            // Set 1: offset seed_interval/2
119            // Set 2: offset seed_interval/4
120            let offset = if set_idx == 0 {
121                0
122            } else {
123                (self.policy.seed_interval * set_idx) / (self.seed_sets)
124            };
125
126            let results = self.search_with_offset(index, read, offset)?;
127
128            // If this set found matches, use them
129            if !results.hits.is_empty() {
130                // Merge with best results (prefer earlier sets for same uniqueness)
131                for hit in results.hits {
132                    // Avoid duplicates from different seed sets
133                    let is_duplicate = best_results.hits.iter().any(|existing| {
134                        existing.bwt_row == hit.bwt_row && existing.read_pos == hit.read_pos
135                    });
136                    if !is_duplicate {
137                        best_results.hits.push(hit);
138                    }
139                }
140                best_results.seeds_searched += results.seeds_searched;
141
142                // If we have good results from the first set, stop early
143                if set_idx == 0 {
144                    break;
145                }
146            }
147        }
148
149        // Sort hits by uniqueness (fewer hits = more unique) as primary criterion,
150        // then by read position to prefer seeds earlier in the read
151        best_results.hits.sort_by(|a, b| {
152            // Primary: prefer seeds with fewer genome hits (more unique)
153            match a.hit_count.cmp(&b.hit_count) {
154                std::cmp::Ordering::Equal => {
155                    // Secondary: prefer seeds earlier in the read
156                    a.read_pos.cmp(&b.read_pos)
157                }
158                other => other,
159            }
160        });
161
162        Ok(best_results)
163    }
164
165    /// Search for seeds with a specific starting offset
166    fn search_with_offset<I: FmIndex>(
167        &mut self,
168        index: &I,
169        read: &[Nuc],
170        offset: usize,
171    ) -> rustalign_common::Result<SeedResults> {
172        let mut results = SeedResults::default();
173        let read_len = read.len();
174
175        // Generate seed positions starting from offset
176        let mut seed_pos = offset;
177        while seed_pos + self.policy.seed_len <= read_len {
178            let seed_seq = &read[seed_pos..seed_pos + self.policy.seed_len];
179
180            // Search for exact matches
181            match index.exact_match(seed_seq) {
182                Ok(positions) => {
183                    // Track how many genome positions this seed matches
184                    let hit_count = positions.len();
185
186                    // Limit to max_hits to avoid explosion with repetitive seeds
187                    for bwt_row in positions.into_iter().take(self.policy.max_hits) {
188                        let seed = Seed::with_hit_count(
189                            bwt_row as u64,
190                            bwt_row,
191                            seed_pos,
192                            self.policy.seed_len,
193                            self.policy.seed_type,
194                            hit_count,
195                        );
196                        results.hits.push(seed);
197                    }
198                }
199                Err(_) => {
200                    // Continue trying other seeds even if this one fails
201                }
202            }
203
204            results.seeds_searched += 1;
205            seed_pos += self.policy.seed_interval;
206        }
207
208        Ok(results)
209    }
210}
211
212impl Default for SeedAligner {
213    fn default() -> Self {
214        Self::new(SeedPolicy::default())
215    }
216}
217
218#[cfg(test)]
219mod tests {
220    use super::*;
221
222    #[test]
223    fn test_seed_new() {
224        let seed = Seed::new(100, 100, 0, 22, SeedType::Exact);
225        assert_eq!(seed.bwt_row, 100);
226        assert_eq!(seed.pos, 100);
227        assert_eq!(seed.read_pos, 0);
228        assert_eq!(seed.len, 22);
229        assert_eq!(seed.hit_count, 1);
230    }
231
232    #[test]
233    fn test_seed_with_hit_count() {
234        let seed = Seed::with_hit_count(100, 100, 0, 22, SeedType::Exact, 5);
235        assert_eq!(seed.bwt_row, 100);
236        assert_eq!(seed.hit_count, 5);
237    }
238
239    #[test]
240    fn test_seed_aligner_new() {
241        let aligner = SeedAligner::new(SeedPolicy::new());
242        assert_eq!(aligner.policy.seed_len, 22);
243    }
244}