1use crate::{
12 types::{
13 FromUsize, Int, SeedMask, SuffixSortType, SufrBuilderArgs, OUTFILE_VERSION,
14 SENTINEL_CHARACTER,
15 },
16 util::{find_lcp_full_offset, slice_u8_to_vec, usize_to_bytes, vec_to_slice_u8},
17};
18use anyhow::{anyhow, bail, Result};
19use log::info;
20use rand::{rngs::StdRng, Rng, RngCore, SeedableRng};
21use rayon::prelude::*;
22use std::{
23 cmp::{max, min, Ordering},
24 collections::HashSet,
25 fs::{self, File, OpenOptions},
26 io::{BufWriter, Seek, SeekFrom, Write},
27 mem,
28 ops::Range,
29 path::PathBuf,
30 sync::{Arc, Mutex},
31 time::Instant,
32};
33use tempfile::NamedTempFile;
34
35#[derive(Debug)]
38pub struct SufrBuilder<T>
39where
40 T: Int + FromUsize<T> + Sized + Send + Sync + serde::ser::Serialize,
41{
42 pub version: u8,
44
45 pub is_dna: bool,
47
48 pub allow_ambiguity: bool,
51
52 pub ignore_softmask: bool,
55
56 pub text_len: T,
58
59 pub num_suffixes: T,
63
64 pub num_sequences: T,
66
67 pub sequence_starts: Vec<T>,
69
70 pub sequence_names: Vec<String>,
73
74 pub text: Vec<u8>,
76
77 pub sort_type: SuffixSortType,
80
81 partitions: Vec<Partition>,
83
84 pub n_ranges: Vec<Range<usize>>,
86
87 pub path: String,
89}
90
91impl<T> SufrBuilder<T>
93where
94 T: Int + FromUsize<T> + Sized + Send + Sync,
95{
96 pub fn new(args: SufrBuilderArgs) -> Result<SufrBuilder<T>> {
144 let text: Vec<_> = args
145 .text
146 .iter()
147 .map(|b| {
148 if (97..=122).contains(b) {
150 if args.ignore_softmask {
151 b'N'
152 } else {
153 b & 0b1011111
155 }
156 } else {
157 *b
158 }
159 })
160 .collect();
161 let text_len = T::from_usize(text.len());
162
163 if args.seed_mask.is_some() && args.max_query_len.is_some() {
164 bail!("Cannot use max_query_len and seed_mask together");
165 }
166
167 let sort_type = if let Some(mask) = args.seed_mask {
168 let seed_mask = SeedMask::new(&mask)?;
169 SuffixSortType::Mask(seed_mask)
170 } else {
171 SuffixSortType::MaxQueryLen(args.max_query_len.unwrap_or(0))
172 };
173
174 let mut n_ranges: Vec<Range<usize>> = vec![];
176 if args.allow_ambiguity {
177 let mut n_start: Option<usize> = None;
178 let min_n = 1000;
179 let now = Instant::now();
180 for (i, &byte) in text.iter().enumerate() {
181 if byte == b'N' {
182 if n_start.is_none() {
183 n_start = Some(i);
184 }
185 } else {
186 if let Some(prev) = n_start {
187 if i - prev >= min_n {
188 n_ranges.push(prev..i);
189 }
190 }
191 n_start = None;
192 }
193 }
194 info!("Scanned for runs of Ns in {:?}", now.elapsed());
195 }
196
197 let mut sa = SufrBuilder {
198 version: OUTFILE_VERSION,
199 is_dna: args.is_dna,
200 allow_ambiguity: args.allow_ambiguity,
201 ignore_softmask: args.ignore_softmask,
202 sort_type,
203 text_len,
204 num_suffixes: T::default(),
205 text,
206 num_sequences: T::from_usize(args.sequence_starts.len()),
207 sequence_starts: args
208 .sequence_starts
209 .into_iter()
210 .map(T::from_usize)
211 .collect::<Vec<_>>(),
212 sequence_names: args.sequence_names,
213 partitions: vec![],
214 n_ranges,
215 path: args.path.unwrap_or("out.sufr".to_string()),
216 };
217 sa.sort(args.num_partitions, args.random_seed)?;
218 sa.write()?;
219 Ok(sa)
220 }
221
222 fn find_n_run(&self, suffix: usize) -> Option<usize> {
242 self.n_ranges
243 .binary_search_by(|range| {
244 if range.contains(&suffix) {
245 Ordering::Equal
246 } else if range.start < suffix {
247 Ordering::Less
248 } else {
249 Ordering::Greater
250 }
251 })
252 .ok()
253 .map(|i| self.n_ranges[i].end)
254 }
255
256 #[inline(always)]
268 fn find_lcp(&self, start1: usize, start2: usize, len: T, skip: usize) -> T {
269 match &self.sort_type {
272 SuffixSortType::Mask(mask) => {
273 let a_vals = mask
276 .positions
277 .iter()
278 .skip(skip)
279 .map(|&offset| start1 + offset)
280 .filter(|&v| v < self.text_len.to_usize());
281
282 let b_vals = mask
283 .positions
284 .iter()
285 .skip(skip)
286 .map(|&offset| start2 + offset)
287 .filter(|&v| v < self.text_len.to_usize());
288
289 unsafe {
290 T::from_usize(
291 skip + a_vals
292 .zip(b_vals)
293 .take_while(|(a, b)| {
294 self.text.get_unchecked(*a)
295 == self.text.get_unchecked(*b)
296 })
297 .count(),
298 )
299 }
300 }
301 SuffixSortType::MaxQueryLen(max_query_len) => {
302 match (&self.find_n_run(start1), &self.find_n_run(start2)) {
303 (Some(end1), Some(end2)) => {
306 T::from_usize(min(end1 - start1, end2 - start2))
307 }
308 _ => {
309 let text_len = self.text_len.to_usize();
310 let len = if max_query_len > &0 {
311 *max_query_len
312 } else {
313 len.to_usize()
314 };
315 let start1 = start1 + skip;
316 let start2 = start2 + skip;
317 let end1 = min(start1 + len, text_len);
318 let end2 = min(start2 + len, text_len);
319 unsafe {
320 T::from_usize(
321 skip + (start1..end1)
322 .zip(start2..end2)
323 .take_while(|(a, b)| {
324 self.text.get_unchecked(*a)
325 == self.text.get_unchecked(*b)
326 })
327 .count(),
328 )
329 }
330 }
331 }
332 }
333 }
334 }
335
336 #[inline(always)]
346 fn is_less(&self, start1: T, start2: T) -> bool {
347 if start1 == start2 {
348 false
349 } else {
350 let max_query_len = match &self.sort_type {
351 SuffixSortType::Mask(seed_mask) => T::from_usize(seed_mask.weight),
352 SuffixSortType::MaxQueryLen(max_query_len) => {
353 if max_query_len > &0 {
354 T::from_usize(*max_query_len)
355 } else {
356 self.text_len
357 }
358 }
359 };
360
361 let len_lcp = find_lcp_full_offset(
362 self.find_lcp(start1.to_usize(), start2.to_usize(), max_query_len, 0)
363 .to_usize(),
364 &self.sort_type,
365 );
366
367 if len_lcp >= max_query_len.to_usize() {
368 false
370 } else {
371 match (
373 self.text.get(start1.to_usize() + len_lcp),
374 self.text.get(start2.to_usize() + len_lcp),
375 ) {
376 (Some(a), Some(b)) => a < b,
377 (None, Some(_)) => true,
378 _ => false,
379 }
380 }
381 }
382 }
383
384 #[inline(always)]
391 fn upper_bound(&self, suffix: T, pivots: &[T]) -> usize {
392 pivots.partition_point(|&p| self.is_less(p, suffix))
394 }
395
396 fn partition(
405 &mut self,
406 num_partitions: usize,
407 random_seed: u64,
408 ) -> Result<PartitionBuildResult<T>> {
409 let max_partitions = self.text_len.to_usize() / 4;
412 let num_partitions = if num_partitions * 10 < max_partitions {
413 num_partitions * 10
414 } else if num_partitions * 5 < max_partitions {
415 num_partitions * 5
416 } else if num_partitions * 2 < max_partitions {
417 num_partitions * 2
418 } else if num_partitions < max_partitions {
419 num_partitions
420 } else {
421 max_partitions
422 };
423
424 let now = Instant::now();
426 let pivot_sa = self.select_pivots(self.text.len(), num_partitions, random_seed);
427 let num_pivots = pivot_sa.len();
428 info!(
429 "Selected {num_pivots} pivot{} in {:?}",
430 if num_pivots == 1 { "" } else { "s" },
431 now.elapsed()
432 );
433
434 let capacity = 4096;
435 let mut builders: Vec<_> = vec![];
436 for _ in 0..num_partitions {
437 let builder: PartitionBuilder<T> = PartitionBuilder::new(capacity)?;
438 builders.push(Arc::new(Mutex::new(builder)));
439 }
440
441 let now = Instant::now();
442 self.text
443 .par_iter()
444 .enumerate()
445 .try_for_each(|(i, &val)| -> Result<()> {
446 if val == SENTINEL_CHARACTER
447 || !self.is_dna || (b"ACGT".contains(&val) || self.allow_ambiguity)
449 {
450 let suffix = T::from_usize(i);
451 let partition_num = self.upper_bound(suffix, &pivot_sa);
452 match builders[partition_num].lock() {
453 Ok(mut partition) => {
454 if partition.add(suffix).is_err() {
455 bail!("Unable to write data to disk")
456 }
457 }
458 Err(e) => bail!("{e}"),
459 }
460 }
461 Ok(())
462 })?;
463
464 let mut num_suffixes = 0;
466 for builder in &builders {
467 match builder.lock() {
468 Ok(mut val) => {
469 val.write()?;
470 num_suffixes += val.total_len;
471 }
472 Err(e) => panic!("Failed to lock: {e}"),
473 }
474 }
475
476 info!(
477 "Wrote {num_suffixes} unsorted suffixes to partition{} in {:?}",
478 if num_pivots == 1 { "" } else { "s" },
479 now.elapsed()
480 );
481
482 Ok(PartitionBuildResult {
484 builders,
485 num_suffixes,
486 })
487 }
488
489 fn sort(&mut self, num_partitions: usize, random_seed: u64) -> Result<()> {
496 let mut partition_build = self.partition(num_partitions, random_seed)?;
497
498 let num_per_partition = (partition_build.num_suffixes as f64
500 / num_partitions as f64)
501 .ceil() as usize;
502 let total_sort_time = Instant::now();
503 let mut num_taken = 0;
504 let mut partition_inputs = vec![vec![]; num_partitions];
505
506 #[allow(clippy::needless_range_loop)]
511 for partition_num in 0..num_partitions {
512 let boundary = num_per_partition * (partition_num + 1);
513 while !partition_build.builders.is_empty() {
514 let part = partition_build.builders.remove(0);
515 match part.lock() {
516 Ok(builder) => {
517 if builder.total_len > 0 {
518 partition_inputs[partition_num]
519 .push((builder.path.clone(), builder.total_len));
520 num_taken += builder.total_len;
521 }
522 }
523 Err(e) => panic!("Can't get partition: {e}"),
524 }
525
526 if partition_num < num_partitions - 1 && num_taken > boundary {
528 break;
529 }
530 }
531 }
532
533 if num_taken != partition_build.num_suffixes {
535 bail!(
536 "Took {num_taken} but needed to take {}",
537 partition_build.num_suffixes
538 );
539 }
540
541 let mut partitions: Vec<Option<Partition>> =
542 (0..num_partitions).map(|_| None).collect();
543
544 partitions.par_iter_mut().enumerate().try_for_each(
545 |(partition_num, partition)| -> Result<()> {
546 let mut part_sa = vec![];
548 for (path, len) in &partition_inputs[partition_num] {
549 let buffer = fs::read(path)?;
550 let mut part: Vec<T> = slice_u8_to_vec(&buffer, *len);
551 part_sa.append(&mut part);
552 fs::remove_file(path)?;
553 }
554
555 let len = part_sa.len();
556 if len > 0 {
557 let mut sa_w = part_sa.clone();
558 let mut lcp = vec![T::default(); len];
559 let mut lcp_w = vec![T::default(); len];
560 self.merge_sort(&mut sa_w, &mut part_sa, len, &mut lcp, &mut lcp_w);
561
562 let mut sa_file = NamedTempFile::new()?;
564 let _ = sa_file.write(vec_to_slice_u8(&part_sa))?;
565 let mut lcp_file = NamedTempFile::new()?;
566 let _ = lcp_file.write(vec_to_slice_u8(&lcp))?;
567 let (_, sa_path) = sa_file.keep()?;
568 let (_, lcp_path) = lcp_file.keep()?;
569
570 *partition = Some(Partition {
571 order: partition_num,
572 len,
573 first_suffix: part_sa.first().unwrap().to_usize(),
574 last_suffix: part_sa.last().unwrap().to_usize(),
575 sa_path,
576 lcp_path,
577 });
578 }
579 Ok(())
580 },
581 )?;
582
583 let mut partitions: Vec<_> = partitions.into_iter().flatten().collect();
585 partitions.sort_by_key(|p| p.order);
586
587 let sizes: Vec<_> = partitions.iter().map(|p| p.len).collect();
588 let total_size = sizes.iter().sum::<usize>();
589 info!(
590 "Sorted {total_size} suffixes in {num_partitions} partitions (avg {}) in {:?}",
591 total_size / num_partitions,
592 total_sort_time.elapsed()
593 );
594 self.num_suffixes = T::from_usize(sizes.iter().sum());
595 self.partitions = partitions;
596
597 Ok(())
598 }
599
600 fn merge_sort(
602 &self,
603 x: &mut [T],
604 y: &mut [T],
605 n: usize,
606 lcp: &mut [T],
607 lcp_w: &mut [T],
608 ) {
609 if n == 1 {
610 lcp[0] = T::default();
611 } else {
612 let mid = n / 2;
613 self.merge_sort(
614 &mut y[..mid],
615 &mut x[..mid],
616 mid,
617 &mut lcp_w[..mid],
618 &mut lcp[..mid],
619 );
620
621 self.merge_sort(
622 &mut y[mid..],
623 &mut x[mid..],
624 n - mid,
625 &mut lcp_w[mid..],
626 &mut lcp[mid..],
627 );
628
629 self.merge(x, mid, lcp_w, y, lcp);
630 }
631 }
632
633 fn merge(
635 &self,
636 suffix_array: &mut [T],
637 mid: usize,
638 lcp_w: &mut [T],
639 target_sa: &mut [T],
640 target_lcp: &mut [T],
641 ) {
642 let (mut x, mut y) = suffix_array.split_at_mut(mid);
643 let (mut lcp_x, mut lcp_y) = lcp_w.split_at_mut(mid);
644 let mut len_x = x.len();
645 let mut len_y = y.len();
646 let mut m = T::default(); let mut idx_x = 0; let mut idx_y = 0; let mut idx_target = 0; while idx_x < len_x && idx_y < len_y {
652 let l_x = lcp_x[idx_x];
653
654 match l_x.cmp(&m) {
655 Ordering::Greater => {
656 target_sa[idx_target] = x[idx_x];
657 target_lcp[idx_target] = l_x;
658 }
659 Ordering::Less => {
660 target_sa[idx_target] = y[idx_y];
661 target_lcp[idx_target] = m;
662 m = l_x;
663 }
664 Ordering::Equal => {
665 let shorter_suffix = max(x[idx_x], y[idx_y]);
666 let max_n = self.text_len - shorter_suffix;
667
668 let context = match &self.sort_type {
669 SuffixSortType::Mask(seed_mask) => T::from_usize(
670 seed_mask
671 .positions
672 .iter()
673 .filter(|&i| *i < max_n.to_usize())
674 .count(),
675 ),
676 SuffixSortType::MaxQueryLen(max_query_len) => {
677 if max_query_len > &0 {
678 min(T::from_usize(*max_query_len), max_n)
679 } else {
680 max_n
681 }
682 }
683 };
684
685 let (len_lcp, full_len_lcp) = if m < context {
687 let lcp = self.find_lcp(
688 x[idx_x].to_usize(),
689 y[idx_y].to_usize(),
690 context - m,
691 m.to_usize(), );
693 let full_lcp =
694 find_lcp_full_offset(lcp.to_usize(), &self.sort_type);
695 (lcp, T::from_usize(full_lcp))
696 } else {
697 (context, context)
698 };
699
700 if len_lcp >= context {
702 target_sa[idx_target] = shorter_suffix;
703 }
704 else {
706 let cmp = self.text[(x[idx_x] + full_len_lcp).to_usize()]
707 .cmp(&self.text[(y[idx_y] + full_len_lcp).to_usize()]);
708
709 match cmp {
710 Ordering::Equal => {
711 target_sa[idx_target] = shorter_suffix;
712 }
713 Ordering::Less => {
714 target_sa[idx_target] = x[idx_x];
715 }
716 Ordering::Greater => {
717 target_sa[idx_target] = y[idx_y];
718 }
719 }
720 }
721
722 if target_sa[idx_target] == x[idx_x] {
724 target_lcp[idx_target] = l_x;
725 } else {
726 target_lcp[idx_target] = m
727 }
728
729 m = len_lcp;
730 }
731 }
732
733 if target_sa[idx_target] == x[idx_x] {
734 idx_x += 1;
735 } else {
736 idx_y += 1;
737 mem::swap(&mut x, &mut y);
738 mem::swap(&mut len_x, &mut len_y);
739 mem::swap(&mut lcp_x, &mut lcp_y);
740 mem::swap(&mut idx_x, &mut idx_y);
741 }
742 idx_target += 1;
743 }
744
745 while idx_x < len_x {
747 target_sa[idx_target] = x[idx_x];
748 target_lcp[idx_target] = lcp_x[idx_x];
749 idx_x += 1;
750 idx_target += 1;
751 }
752
753 if idx_y < len_y {
755 target_sa[idx_target] = y[idx_y];
756 target_lcp[idx_target] = m;
757 idx_y += 1;
758 idx_target += 1;
759
760 while idx_y < len_y {
761 target_sa[idx_target] = y[idx_y];
762 target_lcp[idx_target] = lcp_y[idx_y];
763 idx_y += 1;
764 idx_target += 1;
765 }
766 }
767 }
768
769 #[inline(always)]
771 fn select_pivots(
772 &self,
773 text_len: usize,
774 num_partitions: usize,
775 random_seed: u64,
776 ) -> Vec<T> {
777 if num_partitions > 1 {
778 let num_pivots = num_partitions - 1;
781 let mut rng: Box<dyn RngCore> = if random_seed > 0 {
782 Box::new(StdRng::seed_from_u64(random_seed))
783 } else {
784 Box::new(rand::rng())
785 };
786 let mut pivot_sa = HashSet::<T>::new();
787 loop {
788 let pos = rng.random_range(0..text_len);
789 if self.is_dna && !b"ACGT$".contains(&self.text[pos]) {
790 continue;
791 }
792 let _ = pivot_sa.insert(T::from_usize(pos));
793 if pivot_sa.len() == num_pivots {
794 break;
795 }
796 }
797
798 let mut pivot_sa: Vec<T> = pivot_sa.iter().cloned().collect();
800 let mut sa_w = pivot_sa.clone();
801 let len = pivot_sa.len();
802 let mut lcp = vec![T::default(); len];
803 let mut lcp_w = vec![T::default(); len];
804 self.merge_sort(&mut sa_w, &mut pivot_sa, len, &mut lcp, &mut lcp_w);
805 pivot_sa
806 } else {
807 vec![]
808 }
809 }
810
811 fn write(&self) -> Result<()> {
818 let filename = &self.path;
819 let mut file = BufWriter::new(
820 File::create(filename).map_err(|e| anyhow!("{filename}: {e}"))?,
821 );
822
823 let mut bytes_out: usize = 0;
824
825 let is_dna: u8 = if self.is_dna { 1 } else { 0 };
827 let allow_ambiguity: u8 = if self.allow_ambiguity { 1 } else { 0 };
828 let ignore_softmask: u8 = if self.ignore_softmask { 1 } else { 0 };
829 bytes_out +=
830 file.write(&[OUTFILE_VERSION, is_dna, allow_ambiguity, ignore_softmask])?;
831
832 bytes_out += file.write(&usize_to_bytes(self.text_len.to_usize()))?;
834
835 let locs_pos = file.stream_position()?;
838 bytes_out += file.write(&usize_to_bytes(0usize))?;
839 bytes_out += file.write(&usize_to_bytes(0usize))?;
840 bytes_out += file.write(&usize_to_bytes(0usize))?;
841
842 bytes_out += file.write(&usize_to_bytes(self.num_suffixes.to_usize()))?;
844
845 let max_query_len = if let SuffixSortType::MaxQueryLen(val) = &self.sort_type {
847 *val
848 } else {
849 0
850 };
851 bytes_out += file.write(&usize_to_bytes(max_query_len))?;
852
853 bytes_out += file.write(&usize_to_bytes(self.sequence_starts.len()))?;
855
856 bytes_out += file.write(vec_to_slice_u8(&self.sequence_starts))?;
858
859 match &self.sort_type {
861 SuffixSortType::Mask(seed_mask) => {
862 bytes_out += file.write(&usize_to_bytes(seed_mask.bytes.len()))?;
863 file.write_all(&seed_mask.bytes)?;
864 bytes_out += seed_mask.bytes.len();
865 }
866 _ => bytes_out += file.write(&usize_to_bytes(0))?,
867 }
868
869 let text_pos = bytes_out;
871 file.write_all(&self.text)?;
872 bytes_out += self.text.len();
873
874 let sa_pos = bytes_out;
876 for partition in &self.partitions {
877 let buffer = fs::read(&partition.sa_path)?;
878 bytes_out += &buffer.len();
879 file.write_all(&buffer)?;
880 fs::remove_file(&partition.sa_path)?;
881 }
882
883 let lcp_pos = bytes_out;
884
885 for (i, partition) in self.partitions.iter().enumerate() {
887 let buffer = fs::read(&partition.lcp_path)?;
888 bytes_out += &buffer.len();
889
890 if i == 0 {
891 file.write_all(&buffer)?;
892 } else {
893 let mut lcp: Vec<T> = slice_u8_to_vec(&buffer, partition.len);
895 if let Some(val) = lcp.first_mut() {
896 *val = self.find_lcp(
897 self.partitions[i - 1].last_suffix,
898 partition.first_suffix,
899 self.text_len,
900 0, );
902 }
903 file.write_all(vec_to_slice_u8(&lcp))?;
904 }
905 fs::remove_file(&partition.lcp_path)?;
906 }
907
908 _ = file.write(&bincode::serialize(&self.sequence_names)?)?;
910
911 file.seek(SeekFrom::Start(locs_pos))?;
913 let _ = file.write(&usize_to_bytes(text_pos))?;
914 let _ = file.write(&usize_to_bytes(sa_pos))?;
915 let _ = file.write(&usize_to_bytes(lcp_pos))?;
916
917 Ok(())
918 }
919}
920
921#[derive(Debug)]
924struct Partition {
925 order: usize,
927
928 len: usize,
930
931 first_suffix: usize,
933
934 last_suffix: usize,
936
937 sa_path: PathBuf,
939
940 lcp_path: PathBuf,
942}
943
944#[derive(Debug)]
947struct PartitionBuildResult<T>
948where
949 T: Int + FromUsize<T> + Sized + Send + Sync + serde::ser::Serialize,
950{
951 builders: Vec<Arc<Mutex<PartitionBuilder<T>>>>,
953
954 num_suffixes: usize,
956}
957
958#[derive(Debug)]
961struct PartitionBuilder<T>
962where
963 T: Int + FromUsize<T> + Sized + Send + Sync + serde::ser::Serialize,
964{
965 vals: Vec<T>,
966 capacity: usize,
967 len: usize,
968 total_len: usize,
969 path: PathBuf,
970}
971
972impl<T> PartitionBuilder<T>
974where
975 T: Int + FromUsize<T> + Sized + Send + Sync + serde::ser::Serialize,
976{
977 fn new(capacity: usize) -> Result<Self> {
988 let tmp = NamedTempFile::new()?;
989 let (_, path) = tmp.keep()?;
990
991 Ok(PartitionBuilder {
992 vals: vec![T::default(); capacity],
994 len: 0,
995 total_len: 0,
996 capacity,
997 path,
998 })
999 }
1000
1001 pub fn add(&mut self, val: T) -> Result<()> {
1007 self.vals[self.len] = val;
1008 self.len += 1;
1009 if self.len == self.capacity {
1010 self.write()?;
1011 self.len = 0;
1012 }
1013
1014 Ok(())
1015 }
1016
1017 pub fn write(&mut self) -> Result<()> {
1020 if self.len > 0 {
1021 let mut file = OpenOptions::new()
1022 .create(true)
1023 .append(true)
1024 .open(&self.path)?;
1025 file.write_all(vec_to_slice_u8(&self.vals[0..self.len]))?;
1026 self.total_len += self.len;
1027 }
1028 Ok(())
1029 }
1030}
1031
1032#[cfg(test)]
1034mod test {
1035 use super::{SufrBuilder, SufrBuilderArgs};
1036 use anyhow::Result;
1037 use std::fs;
1038 use tempfile::NamedTempFile;
1039 use pretty_assertions::assert_eq;
1040
1041 #[test]
1042 fn test_is_less() -> Result<()> {
1043 let text = b"TTTAGC".to_vec();
1045 let outfile = NamedTempFile::new()?;
1046 let args = SufrBuilderArgs {
1047 text,
1048 low_memory: true,
1049 path: Some(outfile.path().to_string_lossy().to_string()),
1050 max_query_len: None,
1051 is_dna: false,
1052 allow_ambiguity: false,
1053 ignore_softmask: false,
1054 sequence_starts: vec![0],
1055 sequence_names: vec!["1".to_string()],
1056 num_partitions: 2,
1057 seed_mask: None,
1058 random_seed: 0,
1059 };
1060 let sufr = SufrBuilder::<u32>::new(args)?;
1061
1062 assert!(sufr.is_less(1, 0));
1065
1066 assert!(!sufr.is_less(0, 1));
1069
1070 assert!(!sufr.is_less(2, 3));
1073
1074 assert!(sufr.is_less(3, 0));
1077
1078 fs::remove_file(outfile)?;
1079
1080 Ok(())
1081 }
1082
1083 #[test]
1084 fn test_is_less_max_query_len() -> Result<()> {
1085 let text = b"TTTAGC".to_vec();
1087 let outfile = NamedTempFile::new()?;
1088 let args = SufrBuilderArgs {
1089 text,
1090 low_memory: true,
1091 path: Some(outfile.path().to_string_lossy().to_string()),
1092 max_query_len: Some(2),
1093 is_dna: false,
1094 allow_ambiguity: false,
1095 ignore_softmask: false,
1096 sequence_starts: vec![0],
1097 sequence_names: vec!["1".to_string()],
1098 num_partitions: 2,
1099 seed_mask: None,
1100 random_seed: 0,
1101 };
1102 let sufr = SufrBuilder::<u32>::new(args)?;
1103
1104 assert!(!sufr.is_less(1, 0));
1109
1110 assert!(!sufr.is_less(0, 1));
1114
1115 assert!(!sufr.is_less(2, 3));
1118
1119 assert!(sufr.is_less(3, 0));
1122
1123 fs::remove_file(outfile)?;
1124
1125 Ok(())
1126 }
1127
1128 #[test]
1129 fn test_is_less_seed_mask() -> Result<()> {
1130 let text = b"TTTTAT".to_vec();
1132 let outfile = NamedTempFile::new()?;
1133 let args = SufrBuilderArgs {
1134 text,
1135 low_memory: true,
1136 path: Some(outfile.path().to_string_lossy().to_string()),
1137 max_query_len: None,
1138 is_dna: false,
1139 allow_ambiguity: false,
1140 ignore_softmask: false,
1141 sequence_starts: vec![0],
1142 sequence_names: vec!["1".to_string()],
1143 num_partitions: 2,
1144 seed_mask: Some("101".to_string()),
1145 random_seed: 0,
1146 };
1147 let sufr: SufrBuilder<u32> = SufrBuilder::new(args)?;
1148
1149 assert!(!sufr.is_less(0, 1));
1153
1154 assert!(!sufr.is_less(1, 0));
1158
1159 assert!(!sufr.is_less(0, 3));
1163
1164 assert!(!sufr.is_less(3, 0));
1168
1169 fs::remove_file(outfile)?;
1170
1171 Ok(())
1172 }
1173
1174 #[test]
1175 fn test_find_lcp_no_seed_mask() -> Result<()> {
1176 let text = b"TTTAGC".to_vec();
1178 let outfile = NamedTempFile::new()?;
1179 let args = SufrBuilderArgs {
1180 text,
1181 low_memory: true,
1182 path: Some(outfile.path().to_string_lossy().to_string()),
1183 max_query_len: None,
1184 is_dna: false,
1185 allow_ambiguity: false,
1186 ignore_softmask: false,
1187 sequence_starts: vec![0],
1188 sequence_names: vec!["1".to_string()],
1189 num_partitions: 2,
1190 seed_mask: None,
1191 random_seed: 0,
1192 };
1193 let sufr: SufrBuilder<u32> = SufrBuilder::new(args)?;
1194
1195 assert_eq!(sufr.find_lcp(0, 1, 6, 0), 2);
1199
1200 assert_eq!(sufr.find_lcp(0, 2, 6, 0), 1);
1204
1205 assert_eq!(sufr.find_lcp(0, 1, 1, 0), 1);
1209
1210 assert_eq!(sufr.find_lcp(0, 3, 6, 0), 0);
1214
1215 fs::remove_file(outfile)?;
1218
1219 Ok(())
1220 }
1221
1222 #[test]
1223 fn test_find_lcp_with_seed_mask() -> Result<()> {
1224 let text = b"TTTTTA".to_vec();
1226 let outfile = NamedTempFile::new()?;
1227 let args = SufrBuilderArgs {
1228 text,
1229 low_memory: true,
1230 path: Some(outfile.path().to_string_lossy().to_string()),
1231 max_query_len: None,
1232 is_dna: false,
1233 allow_ambiguity: false,
1234 ignore_softmask: false,
1235 sequence_starts: vec![0],
1236 sequence_names: vec!["1".to_string()],
1237 num_partitions: 2,
1238 seed_mask: Some("1101".to_string()),
1239 random_seed: 42,
1240 };
1241 let sufr: SufrBuilder<u32> = SufrBuilder::new(args)?;
1242
1243 assert_eq!(sufr.find_lcp(0, 1, 3, 0), 3);
1246
1247 assert_eq!(sufr.find_lcp(0, 2, 3, 0), 2);
1250
1251 assert_eq!(sufr.find_lcp(0, 5, 3, 0), 0);
1254
1255 fs::remove_file(outfile)?;
1256
1257 Ok(())
1258 }
1259
1260 #[test]
1261 fn test_upper_bound_1() -> Result<()> {
1262 let text = b"TTTAGC".to_vec();
1264 let outfile = NamedTempFile::new()?;
1265 let args = SufrBuilderArgs {
1266 text,
1267 low_memory: true,
1268 path: Some(outfile.path().to_string_lossy().to_string()),
1269 max_query_len: None,
1270 is_dna: false,
1271 allow_ambiguity: false,
1272 ignore_softmask: false,
1273 sequence_starts: vec![0],
1274 sequence_names: vec!["1".to_string()],
1275 num_partitions: 2,
1276 seed_mask: None,
1277 random_seed: 42,
1278 };
1279 let sufr: SufrBuilder<u32> = SufrBuilder::new(args)?;
1280
1281 assert_eq!(sufr.upper_bound(3, &[5, 4]), 0);
1283
1284 assert_eq!(sufr.upper_bound(2, &[3, 4, 5]), 3);
1286
1287 assert_eq!(sufr.upper_bound(5, &[3, 4, 5]), 1);
1289
1290 fs::remove_file(outfile)?;
1291
1292 Ok(())
1293 }
1294
1295 #[test]
1296 fn test_upper_bound_2() -> Result<()> {
1297 let text = b"ACGTNNACGT".to_vec();
1299 let outfile = NamedTempFile::new()?;
1300 let args = SufrBuilderArgs {
1301 text,
1302 low_memory: true,
1303 path: Some(outfile.path().to_string_lossy().to_string()),
1304 max_query_len: None,
1305 is_dna: false,
1306 allow_ambiguity: false,
1307 ignore_softmask: false,
1308 sequence_starts: vec![0],
1309 sequence_names: vec!["1".to_string()],
1310 num_partitions: 2,
1311 seed_mask: None,
1312 random_seed: 42,
1313 };
1314
1315 let sufr: SufrBuilder<u64> = SufrBuilder::new(args)?;
1316
1317 assert_eq!(sufr.upper_bound(0, &[0]), 0);
1319
1320 assert_eq!(sufr.upper_bound(0, &[6]), 1);
1322
1323 assert_eq!(sufr.upper_bound(6, &[0]), 0);
1325
1326 assert_eq!(sufr.upper_bound(6, &[6]), 0);
1328
1329 assert_eq!(sufr.upper_bound(0, &[7, 8]), 0);
1332
1333 assert_eq!(sufr.upper_bound(1, &[7, 8]), 1);
1335
1336 assert_eq!(sufr.upper_bound(1, &[7, 8]), 1);
1338
1339 assert_eq!(sufr.upper_bound(9, &[7, 8]), 2);
1341
1342 assert_eq!(sufr.upper_bound(9, &[3]), 0);
1344
1345 fs::remove_file(outfile)?;
1346
1347 Ok(())
1348 }
1349
1350 #[test]
1351 fn test_upper_bound_seed_mask() -> Result<()> {
1352 let text = b"ACGTNNACGT".to_vec();
1354 let outfile = NamedTempFile::new()?;
1355 let args = SufrBuilderArgs {
1356 text,
1357 low_memory: true,
1358 path: Some(outfile.path().to_string_lossy().to_string()),
1359 max_query_len: None,
1360 is_dna: false,
1361 allow_ambiguity: false,
1362 ignore_softmask: false,
1363 sequence_starts: vec![0],
1364 sequence_names: vec!["1".to_string()],
1365 num_partitions: 2,
1366 seed_mask: Some("101".to_string()),
1367 random_seed: 42,
1368 };
1369 let sufr: SufrBuilder<u32> = SufrBuilder::new(args)?;
1370
1371 assert_eq!(sufr.upper_bound(0, &[0]), 0);
1373
1374 assert_eq!(sufr.upper_bound(0, &[6]), 0);
1376
1377 assert_eq!(sufr.upper_bound(6, &[0]), 0);
1379
1380 assert_eq!(sufr.upper_bound(6, &[6]), 0);
1382
1383 assert_eq!(sufr.upper_bound(0, &[7, 8]), 0);
1386
1387 assert_eq!(sufr.upper_bound(1, &[7, 8]), 0);
1390
1391 assert_eq!(sufr.upper_bound(8, &[7, 8]), 1);
1394
1395 assert_eq!(sufr.upper_bound(9, &[7, 8]), 2);
1398
1399 assert_eq!(sufr.upper_bound(9, &[3]), 0);
1401
1402 fs::remove_file(outfile)?;
1403
1404 Ok(())
1405 }
1406}