1#[inline]
14fn base_to_idx(b: u8) -> Option<usize> {
15 match b {
16 b'A' => Some(0),
17 b'C' => Some(1),
18 b'G' => Some(2),
19 b'T' => Some(3),
20 _ => None,
21 }
22}
23
24#[inline]
26fn complement(b: u8) -> u8 {
27 match b {
28 b'A' => b'T',
29 b'T' => b'A',
30 b'C' => b'G',
31 b'G' => b'C',
32 other => other,
33 }
34}
35
36fn reverse_complement(seq: &[u8]) -> Vec<u8> {
38 seq.iter().rev().map(|&b| complement(b)).collect()
39}
40
41fn build_sa(text: &[u8]) -> Vec<usize> {
43 let mut sa: Vec<usize> = (0..text.len()).collect();
44 sa.sort_by(|&a, &b| text[a..].cmp(&text[b..]));
45 sa
46}
47
48fn build_fm(text: &[u8]) -> (Vec<u8>, Vec<usize>, Vec<[usize; 4]>, [usize; 4]) {
52 let n = text.len();
53 let sa = build_sa(text);
54
55 let mut bwt = Vec::with_capacity(n);
57 for &pos in &sa {
58 if pos == 0 {
59 bwt.push(text[n - 1]);
60 } else {
61 bwt.push(text[pos - 1]);
62 }
63 }
64
65 let mut occ = Vec::with_capacity(n);
67 let mut counts = [0usize; 4];
68 for &b in &bwt {
69 if let Some(idx) = base_to_idx(b) {
70 counts[idx] += 1;
71 }
72 occ.push(counts);
73 }
74
75 let num_sentinels = n - counts[0] - counts[1] - counts[2] - counts[3];
79 let c_table = [
80 num_sentinels,
81 num_sentinels + counts[0],
82 num_sentinels + counts[0] + counts[1],
83 num_sentinels + counts[0] + counts[1] + counts[2],
84 ];
85
86 (bwt, sa, occ, c_table)
87}
88
89#[inline]
91fn occ_count(occ: &[[usize; 4]], pos: usize, c: usize) -> usize {
92 if pos == 0 { 0 } else { occ[pos - 1][c] }
93}
94
95#[inline]
100fn lf_step(
101 c_table: &[usize; 4],
102 occ: &[[usize; 4]],
103 lo: usize,
104 hi: usize,
105 c: usize,
106) -> (usize, usize) {
107 if lo >= hi {
108 return (0, 0);
109 }
110 let occ_lo = occ_count(occ, lo, c);
111 let occ_hi = occ_count(occ, hi, c);
112 (c_table[c] + occ_lo, c_table[c] + occ_hi)
113}
114
115#[derive(Debug, Clone, Copy, PartialEq, Eq)]
119pub struct BiInterval {
120 pub lower: usize,
122 pub size: usize,
124 pub lower_rev: usize,
126}
127
128impl BiInterval {
129 #[inline]
131 pub fn is_empty(&self) -> bool {
132 self.size == 0
133 }
134}
135
136#[derive(Debug, Clone)]
141pub struct FmdIndex {
142 bwt: Vec<u8>,
144 sa: Vec<usize>,
145 occ: Vec<[usize; 4]>,
146 c_table: [usize; 4],
147 _bwt_rev: Vec<u8>,
149 _sa_rev: Vec<usize>,
150 occ_rev: Vec<[usize; 4]>,
151 c_table_rev: [usize; 4],
152 text_len: usize,
154}
155
156impl FmdIndex {
157 pub fn new(seq: &[u8]) -> Self {
173 let text_len = seq.len();
174 let rc = reverse_complement(seq);
175
176 let mut forward = Vec::with_capacity(seq.len() + rc.len() + 2);
178 forward.extend_from_slice(seq);
179 forward.push(b'#');
180 forward.extend_from_slice(&rc);
181 forward.push(b'$');
182
183 let reversed: Vec<u8> = forward.iter().rev().copied().collect();
185
186 let (bwt, sa, occ, c_table) = build_fm(&forward);
187 let (bwt_rev, _sa_rev, occ_rev, c_table_rev) = build_fm(&reversed);
188
189 Self {
190 bwt,
191 sa,
192 occ,
193 c_table,
194 _bwt_rev: bwt_rev,
195 _sa_rev,
196 occ_rev,
197 c_table_rev,
198 text_len,
199 }
200 }
201
202 pub fn init_interval(&self, c: u8) -> BiInterval {
207 let ci = match base_to_idx(c) {
208 Some(idx) => idx,
209 None => return BiInterval { lower: 0, size: 0, lower_rev: 0 },
210 };
211
212 let n = self.bwt.len();
213
214 let lo = self.c_table[ci];
216 let hi = if ci < 3 { self.c_table[ci + 1] } else { n };
217 let size = hi - lo;
218
219 if size == 0 {
220 return BiInterval { lower: 0, size: 0, lower_rev: 0 };
221 }
222
223 let lo_rev = self.c_table_rev[ci];
225
226 BiInterval {
227 lower: lo,
228 size,
229 lower_rev: lo_rev,
230 }
231 }
232
233 pub fn extend_backward(&self, interval: &BiInterval, c: u8) -> BiInterval {
238 if interval.is_empty() {
239 return *interval;
240 }
241 let ci = match base_to_idx(c) {
242 Some(idx) => idx,
243 None => return BiInterval { lower: 0, size: 0, lower_rev: 0 },
244 };
245
246 let lo = interval.lower;
247 let hi = lo + interval.size;
248
249 let (new_lo, new_hi) = lf_step(
251 &self.c_table, &self.occ, lo, hi, ci,
252 );
253 let new_size = if new_hi > new_lo { new_hi - new_lo } else { 0 };
254
255 if new_size == 0 {
256 return BiInterval { lower: 0, size: 0, lower_rev: 0 };
257 }
258
259 let lo_rev = interval.lower_rev;
262 let mut offset = 0;
263 for a in 0..ci {
264 offset += occ_count(&self.occ, hi, a) - occ_count(&self.occ, lo, a);
265 }
266 let new_lo_rev = lo_rev + offset;
267
268 BiInterval {
269 lower: new_lo,
270 size: new_size,
271 lower_rev: new_lo_rev,
272 }
273 }
274
275 pub fn extend_forward(&self, interval: &BiInterval, c: u8) -> BiInterval {
280 if interval.is_empty() {
281 return *interval;
282 }
283 let ci = match base_to_idx(c) {
284 Some(idx) => idx,
285 None => return BiInterval { lower: 0, size: 0, lower_rev: 0 },
286 };
287
288 let lo_rev = interval.lower_rev;
289 let hi_rev = lo_rev + interval.size;
290
291 let (new_lo_rev, new_hi_rev) = lf_step(
293 &self.c_table_rev, &self.occ_rev, lo_rev, hi_rev, ci,
294 );
295 let new_size = if new_hi_rev > new_lo_rev {
296 new_hi_rev - new_lo_rev
297 } else {
298 0
299 };
300
301 if new_size == 0 {
302 return BiInterval { lower: 0, size: 0, lower_rev: 0 };
303 }
304
305 let lo = interval.lower;
308 let mut offset = 0;
309 for a in 0..ci {
310 offset += occ_count(&self.occ_rev, hi_rev, a) - occ_count(&self.occ_rev, lo_rev, a);
311 }
312 let new_lo = lo + offset;
313
314 BiInterval {
315 lower: new_lo,
316 size: new_size,
317 lower_rev: new_lo_rev,
318 }
319 }
320
321 pub fn locate(&self, interval: &BiInterval) -> Vec<usize> {
326 if interval.is_empty() {
327 return vec![];
328 }
329 let lo = interval.lower;
330 let hi = lo + interval.size;
331 let mut positions: Vec<usize> = (lo..hi)
332 .map(|i| self.sa[i])
333 .filter(|&pos| pos < self.text_len)
334 .collect();
335 positions.sort_unstable();
336 positions
337 }
338
339 pub fn backward_search(&self, pattern: &[u8]) -> BiInterval {
344 if pattern.is_empty() {
345 return BiInterval { lower: 0, size: 0, lower_rev: 0 };
346 }
347
348 let mut interval = self.init_interval(pattern[pattern.len() - 1]);
349 if interval.is_empty() {
350 return interval;
351 }
352
353 for i in (0..pattern.len() - 1).rev() {
354 interval = self.extend_backward(&interval, pattern[i]);
355 if interval.is_empty() {
356 return interval;
357 }
358 }
359
360 interval
361 }
362
363 pub fn smems(
378 &self,
379 query: &[u8],
380 min_len: usize,
381 ) -> Vec<(usize, usize, BiInterval)> {
382 if query.is_empty() || min_len == 0 {
383 return vec![];
384 }
385
386 let qlen = query.len();
387 let mut mems: Vec<(usize, usize, BiInterval)> = Vec::new();
388
389 let mut pos = 0;
392 while pos < qlen {
393 let mut interval = self.init_interval(query[pos]);
395 if interval.is_empty() {
396 pos += 1;
397 continue;
398 }
399
400 let mut end = pos + 1;
402 while end < qlen {
403 let next = self.extend_forward(&interval, query[end]);
404 if next.is_empty() {
405 break;
406 }
407 interval = next;
408 end += 1;
409 }
410
411 let start = pos;
413 let mut best_interval = interval;
414 let mut best_start = start;
415
416 if start > 0 {
417 let mut back_interval = interval;
418 let mut s = start;
419 while s > 0 {
420 let prev = self.extend_backward(&back_interval, query[s - 1]);
421 if prev.is_empty() {
422 break;
423 }
424 back_interval = prev;
425 s -= 1;
426 }
427 if s < best_start {
428 best_start = s;
429 best_interval = back_interval;
430 }
431 }
432
433 let match_len = end - best_start;
434 if match_len >= min_len {
435 mems.push((best_start, end, best_interval));
436 }
437
438 pos = end;
440 }
441
442 Self::filter_supermaximal(&mut mems);
444 mems
445 }
446
447 fn filter_supermaximal(mems: &mut Vec<(usize, usize, BiInterval)>) {
449 if mems.len() <= 1 {
450 return;
451 }
452 mems.sort_by(|a, b| a.0.cmp(&b.0).then(b.1.cmp(&a.1)));
454
455 let mut keep = vec![true; mems.len()];
456 let mut max_end = 0;
457 for i in 0..mems.len() {
458 if mems[i].1 <= max_end {
459 keep[i] = false;
461 }
462 if mems[i].1 > max_end {
463 max_end = mems[i].1;
464 }
465 }
466
467 let mut write = 0;
468 for read in 0..mems.len() {
469 if keep[read] {
470 mems[write] = mems[read];
471 write += 1;
472 }
473 }
474 mems.truncate(write);
475 }
476}
477
478#[cfg(test)]
479mod tests {
480 use super::*;
481
482 #[test]
483 fn complement_bases() {
484 assert_eq!(complement(b'A'), b'T');
485 assert_eq!(complement(b'T'), b'A');
486 assert_eq!(complement(b'C'), b'G');
487 assert_eq!(complement(b'G'), b'C');
488 }
489
490 #[test]
491 fn revcomp() {
492 assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
493 assert_eq!(reverse_complement(b"AAAC"), b"GTTT");
494 assert_eq!(reverse_complement(b""), b"");
495 }
496
497 #[test]
498 fn build_simple() {
499 let fmd = FmdIndex::new(b"ACGT");
500 assert_eq!(fmd.text_len, 4);
501 assert_eq!(fmd.bwt.len(), 10);
503 assert_eq!(fmd._bwt_rev.len(), 10);
504 }
505
506 #[test]
507 fn init_interval_valid() {
508 let fmd = FmdIndex::new(b"ACGTACGT");
509 for &base in &[b'A', b'C', b'G', b'T'] {
510 let iv = fmd.init_interval(base);
511 assert!(!iv.is_empty(), "init_interval for {} should be non-empty", base as char);
512 assert!(iv.size >= 4);
514 }
515 }
516
517 #[test]
518 fn init_interval_invalid() {
519 let fmd = FmdIndex::new(b"ACGT");
520 let iv = fmd.init_interval(b'N');
521 assert!(iv.is_empty());
522 }
523
524 #[test]
525 fn backward_search_exact_match() {
526 let fmd = FmdIndex::new(b"ACGTACGT");
527 let iv = fmd.backward_search(b"ACGT");
528 assert!(!iv.is_empty());
529 let positions = fmd.locate(&iv);
530 assert_eq!(positions, vec![0, 4]);
532 }
533
534 #[test]
535 fn backward_search_no_match() {
536 let fmd = FmdIndex::new(b"ACGTACGT");
537 let iv = fmd.backward_search(b"AAA");
538 assert!(iv.is_empty());
539 assert!(fmd.locate(&iv).is_empty());
540 }
541
542 #[test]
543 fn backward_search_single_base() {
544 let fmd = FmdIndex::new(b"ACGTACGT");
545 let iv = fmd.backward_search(b"A");
546 assert!(!iv.is_empty());
547 let positions = fmd.locate(&iv);
548 assert_eq!(positions, vec![0, 4]);
549 }
550
551 #[test]
552 fn backward_search_full_text() {
553 let fmd = FmdIndex::new(b"ACGTACGT");
554 let iv = fmd.backward_search(b"ACGTACGT");
555 assert!(!iv.is_empty());
556 let positions = fmd.locate(&iv);
557 assert_eq!(positions, vec![0]);
558 }
559
560 #[test]
561 fn backward_search_empty_pattern() {
562 let fmd = FmdIndex::new(b"ACGT");
563 let iv = fmd.backward_search(b"");
564 assert!(iv.is_empty());
565 }
566
567 #[test]
568 fn forward_extension() {
569 let fmd = FmdIndex::new(b"ACGTACGT");
570 let iv = fmd.init_interval(b'A');
572 assert!(!iv.is_empty());
573
574 let iv2 = fmd.extend_forward(&iv, b'C');
575 assert!(!iv2.is_empty());
576
577 let iv3 = fmd.extend_forward(&iv2, b'G');
578 assert!(!iv3.is_empty());
579
580 let iv4 = fmd.extend_forward(&iv3, b'T');
581 assert!(!iv4.is_empty());
582
583 let positions = fmd.locate(&iv4);
585 assert_eq!(positions, vec![0, 4]);
586 }
587
588 #[test]
589 fn forward_and_backward_agree() {
590 let fmd = FmdIndex::new(b"ACGTACGT");
591
592 let iv_back = fmd.backward_search(b"ACG");
594
595 let iv_fwd = fmd.init_interval(b'A');
597 let iv_fwd = fmd.extend_forward(&iv_fwd, b'C');
598 let iv_fwd = fmd.extend_forward(&iv_fwd, b'G');
599
600 let pos_back = fmd.locate(&iv_back);
602 let pos_fwd = fmd.locate(&iv_fwd);
603 assert_eq!(pos_back, pos_fwd);
604 }
605
606 #[test]
607 fn locate_filters_to_original_text() {
608 let fmd = FmdIndex::new(b"AACC");
610 let iv = fmd.backward_search(b"AA");
611 let positions = fmd.locate(&iv);
612 assert_eq!(positions, vec![0]);
615 }
616
617 #[test]
618 fn reverse_complement_symmetry() {
619 let seq = b"ACGTAAAA";
622 let fmd = FmdIndex::new(seq);
623
624 let iv = fmd.backward_search(b"ACGT");
626 let positions = fmd.locate(&iv);
627 assert!(positions.contains(&0));
628
629 assert!(iv.size >= 2);
635 }
636
637 #[test]
638 fn smems_simple() {
639 let fmd = FmdIndex::new(b"ACGTACGT");
641 let smems = fmd.smems(b"ACGT", 2);
642 assert!(!smems.is_empty());
643 let (start, end, _) = smems[0];
645 assert_eq!(start, 0);
646 assert_eq!(end, 4);
647 }
648
649 #[test]
650 fn smems_min_len_filter() {
651 let fmd = FmdIndex::new(b"ACGTACGT");
652 let smems = fmd.smems(b"ACGT", 10);
654 assert!(smems.is_empty());
655 }
656
657 #[test]
658 fn smems_multiple_matches() {
659 let fmd = FmdIndex::new(b"ACGTTTTTGGGG");
661 let smems = fmd.smems(b"ACGTTTTT", 3);
663 assert!(!smems.is_empty());
664 let max_len = smems.iter().map(|(s, e, _)| e - s).max().unwrap();
666 assert!(max_len >= 4);
667 }
668
669 #[test]
670 fn smems_empty_query() {
671 let fmd = FmdIndex::new(b"ACGT");
672 let smems = fmd.smems(b"", 1);
673 assert!(smems.is_empty());
674 }
675
676 #[test]
677 fn smems_no_match_in_query() {
678 let fmd = FmdIndex::new(b"ACGT");
681 let smems = fmd.smems(b"N", 1);
682 assert!(smems.is_empty());
683 }
684
685 #[test]
686 fn smems_supermaximal_property() {
687 let fmd = FmdIndex::new(b"ACGTACGTACGT");
689 let smems = fmd.smems(b"ACGTACGT", 2);
690 for i in 0..smems.len() {
691 for j in 0..smems.len() {
692 if i == j {
693 continue;
694 }
695 let (si, ei, _) = smems[i];
696 let (sj, ej, _) = smems[j];
697 assert!(!(si >= sj && ei <= ej && (si, ei) != (sj, ej)),
699 "SMEM ({}, {}) is contained in ({}, {})", si, ei, sj, ej);
700 }
701 }
702 }
703
704 #[test]
705 fn locate_repeated_sequence() {
706 let fmd = FmdIndex::new(b"AAAA");
707 let iv = fmd.backward_search(b"AA");
708 let positions = fmd.locate(&iv);
709 assert_eq!(positions, vec![0, 1, 2]);
711 }
712
713 #[test]
714 fn backward_extension() {
715 let fmd = FmdIndex::new(b"ACGTACGT");
716 let iv = fmd.init_interval(b'T');
718 let iv = fmd.extend_backward(&iv, b'G');
719 let iv = fmd.extend_backward(&iv, b'C');
720 let iv = fmd.extend_backward(&iv, b'A');
721 let positions = fmd.locate(&iv);
722 assert_eq!(positions, vec![0, 4]);
723 }
724
725 #[test]
726 fn bi_interval_empty() {
727 let iv = BiInterval { lower: 0, size: 0, lower_rev: 0 };
728 assert!(iv.is_empty());
729 let iv2 = BiInterval { lower: 5, size: 3, lower_rev: 2 };
730 assert!(!iv2.is_empty());
731 }
732}