libsufr/types.rs
1//! Common types
2
3use anyhow::{bail, Result};
4use chrono::{DateTime, Local};
5use regex::Regex;
6use std::{
7 cmp::Ordering,
8 fmt::{self, Debug, Display},
9 hash::Hash,
10 ops::Range,
11 ops::{Add, AddAssign, Div, Sub},
12};
13
14// --------------------------------------------------
15/// Serialization version
16pub const OUTFILE_VERSION: u8 = 6;
17
18/// The sentinel character placed at the end of the text
19/// (and so must not occur in the given text)
20pub const SENTINEL_CHARACTER: u8 = b'$';
21
22// --------------------------------------------------
23/// Describes the suffixes in a _.sufr_ file were sorted
24#[derive(Debug, Clone, PartialEq)]
25pub enum SuffixSortType {
26 /// A maximum query length, which can be zero
27 MaxQueryLen(usize),
28
29 /// A seed mask of 1/0 for care/don't-care positions
30 Mask(SeedMask),
31}
32
33// --------------------------------------------------
34/// A struct describing a seed mask
35#[derive(Debug, PartialEq, Clone)]
36pub struct SeedMask {
37 /// The given mask string of 1/0s
38 pub mask: String,
39
40 /// A vector of 1/0 `u8` valus representing the mask
41 pub bytes: Vec<u8>,
42
43 /// The offset positions of the "care" values (from the 1s)
44 pub positions: Vec<usize>,
45
46 /// A vector of "difference" values to add to the
47 /// range 0..`weight` that will return the "care" positions
48 pub differences: Vec<usize>,
49
50 /// The number of "care" positions (popcount of 1)
51 pub weight: usize,
52}
53
54// --------------------------------------------------
55impl SeedMask {
56 /// Create a new `SeedMask` from an input string.
57 /// The string must:
58 /// * be comprised entirely of 1 or 0
59 /// * start and end with 1
60 /// * contain at least one 0
61 ///
62 /// ```
63 /// use anyhow::Result;
64 /// use libsufr::types::SeedMask;
65 ///
66 /// fn main() -> Result<()> {
67 /// let mask = SeedMask::new("110110101")?;
68 /// let expected = SeedMask {
69 /// mask: "110110101".to_string(),
70 /// bytes: vec![1u8, 1u8, 0u8, 1u8, 1u8, 0u8, 1u8, 0u8, 1u8],
71 /// positions: vec![0, 1, 3, 4, 6, 8],
72 /// differences: vec![0, 0, 1, 1, 2, 3],
73 /// weight: 6,
74 /// };
75 /// assert_eq!(mask, expected);
76 /// Ok(())
77 /// }
78 /// ```
79 ///
80 pub fn new(mask: &str) -> Result<Self> {
81 if !Self::is_valid(mask) {
82 bail!("Invalid seed mask '{mask}'")
83 }
84
85 let bytes = Self::parse(mask);
86 let positions = Self::get_positions(&bytes);
87 let differences = Self::get_differences(&positions);
88 let weight = positions.len();
89
90 Ok(Self {
91 mask: mask.to_string(),
92 bytes,
93 positions,
94 differences,
95 weight,
96 })
97 }
98
99 /// Instantiate a `SeedMask` from the byte representation
100 /// from a `SufrFile`
101 ///
102 /// ```
103 /// use anyhow::Result;
104 /// use libsufr::types::SeedMask;
105 ///
106 /// fn main() -> Result<()> {
107 /// let bytes: Vec<u8> = vec![1, 1, 0, 1, 1, 0, 1, 0, 1];
108 /// let mask = SeedMask::from_bytes(&bytes)?;
109 /// let expected = SeedMask {
110 /// mask: "110110101".to_string(),
111 /// bytes,
112 /// positions: vec![0, 1, 3, 4, 6, 8],
113 /// differences: vec![0, 0, 1, 1, 2, 3],
114 /// weight: 6,
115 /// };
116 /// assert_eq!(mask, expected);
117 /// Ok(())
118 /// }
119 /// ```
120 ///
121 pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
122 let mask: String = bytes
123 .iter()
124 .filter_map(|b| match b {
125 0 => Some('0'),
126 1 => Some('1'),
127 _ => None,
128 })
129 .collect();
130 if mask.is_empty() {
131 bail!("Bytes must be 1 or 0");
132 }
133 let positions = Self::get_positions(bytes);
134 let differences = Self::get_differences(&positions);
135 let weight = positions.len();
136
137 Ok(Self {
138 mask,
139 bytes: bytes.to_vec(),
140 positions,
141 differences,
142 weight,
143 })
144 }
145
146 /// Determine if a seed mask is valid
147 ///
148 /// ```
149 /// use anyhow::Result;
150 /// use libsufr::types::SeedMask;
151 ///
152 /// fn main() -> Result<()> {
153 /// assert!(SeedMask::new("").is_err());
154 /// assert!(SeedMask::new("0").is_err());
155 /// assert!(SeedMask::new("01").is_err());
156 /// assert!(SeedMask::new("10").is_err());
157 /// assert!(SeedMask::new("11").is_err());
158 /// assert!(SeedMask::new("101").is_ok());
159 /// Ok(())
160 /// }
161 /// ```
162 ///
163 fn is_valid(mask: &str) -> bool {
164 let seed_re = Regex::new("^1+0[01]*1$").unwrap();
165 seed_re.is_match(mask)
166 }
167
168 /// Turn a valid string mask into a vector of `u8` values
169 /// suitable for serialization.
170 fn parse(mask: &str) -> Vec<u8> {
171 mask.as_bytes()
172 .iter()
173 .flat_map(|b| match b {
174 b'1' => Some(1),
175 b'0' => Some(0),
176 _ => None,
177 })
178 .collect()
179 }
180
181 /// Find the differences to add to each index to get the offsets
182 /// of the "care" positions.
183 fn get_differences(positions: &[usize]) -> Vec<usize> {
184 // Mask: "1001101"
185 // M: [0, 3, 4, 6]
186 // U: [0, 1, 2, 3]
187 // D: [0, 2, 2, 3]
188 positions.iter().enumerate().map(|(i, &m)| m - i).collect()
189 }
190
191 /// Return the offsets of the "care" positions from the bytes
192 fn get_positions(bytes: &[u8]) -> Vec<usize> {
193 // [1, 0, 1] -> [0, 2]
194 bytes
195 .iter()
196 .enumerate()
197 .filter_map(|(i, &b)| (b == 1).then_some(i))
198 .collect()
199 }
200}
201
202impl Display for SeedMask {
203 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204 write!(f, "{}", self.mask)
205 }
206}
207
208// --------------------------------------------------
209/// A struct for use in searching the suffix array
210#[derive(Debug, Clone)]
211pub struct SearchOptions {
212 /// A vector of query strings
213 pub queries: Vec<String>,
214
215 /// A maximum query length to use.
216 /// If the suffix array was sorted with a shorter MQL, that
217 /// value will be used instead.
218 pub max_query_len: Option<usize>,
219
220 /// When `true`, the suffix array will be placed into memory.
221 /// When `false`, the suffix array will be read from disk.
222 pub low_memory: bool,
223
224 /// Whether or not to return location information or to simply count
225 pub find_suffixes: bool,
226}
227
228// --------------------------------------------------
229/// A struct representing the result of a suffix array search
230#[derive(Debug, PartialEq)]
231pub struct SearchResult<T>
232where
233 T: Int + FromUsize<T> + Sized + Send + Sync + serde::ser::Serialize,
234{
235 /// The ordinal position of the query
236 pub query_num: usize,
237
238 /// The query string
239 pub query: String,
240
241 /// A optional summary of the places where the query was found
242 pub locations: Option<SearchResultLocations<T>>,
243}
244
245// --------------------------------------------------
246/// The suffix locations matching a given query.
247#[derive(Debug, PartialEq)]
248pub struct SearchResultLocations<T> {
249 /// The range of suffix ranks
250 pub ranks: Range<usize>,
251
252 /// A vector of the suffix locations
253 pub suffixes: Vec<T>,
254}
255
256// --------------------------------------------------
257/// A struct describing how a query compares to a suffix
258#[derive(Debug)]
259pub struct Comparison {
260 /// Whether the suffix is greater, less, or equal
261 pub cmp: Ordering,
262
263 /// The length of the longest common prefix (LCP)
264 /// between the query and the suffix
265 pub lcp: usize,
266}
267
268// --------------------------------------------------
269/// This struct is returned by `libsufr::utils::read_sequence_file`
270/// for reading sequences from a FASTA/Q file.
271#[derive(Debug)]
272pub struct SequenceFileData {
273 /// The sequence as a vector of bytes. Multiple sequences are
274 /// separated by a user-supplied sequence delimiter.
275 pub seq: Vec<u8>,
276
277 /// The offsets where each sequence starts
278 pub start_positions: Vec<usize>,
279
280 /// The names of the sequences, will be the same length as `start_positions`
281 pub sequence_names: Vec<String>,
282}
283
284// --------------------------------------------------
285/// Trait to generically describe an "integer" of size `u8` (for text/bytes),
286/// `u32` for suffix/LCP arrays over a text with a length < 2^32,
287/// or `u64` for longer texts.
288pub trait Int:
289 Debug
290 + AddAssign
291 + Add<Output = Self>
292 + Sub<Output = Self>
293 + Div<Output = Self>
294 + Copy
295 + Default
296 + Display
297 + Ord
298 + Hash
299 + serde::ser::Serialize
300{
301 /// Convert an `Int` to a `usize`
302 fn to_usize(&self) -> usize;
303}
304
305impl Int for u8 {
306 fn to_usize(&self) -> usize {
307 *self as usize
308 }
309}
310
311impl Int for u32 {
312 fn to_usize(&self) -> usize {
313 *self as usize
314 }
315}
316
317impl Int for u64 {
318 fn to_usize(&self) -> usize {
319 *self as usize
320 }
321}
322
323/// Convert a `usize` to an `Int`
324pub trait FromUsize<T> {
325 /// Convert a `usize` to an `Int`
326 fn from_usize(val: usize) -> T;
327}
328
329impl FromUsize<u8> for u8 {
330 fn from_usize(val: usize) -> u8 {
331 val as u8
332 }
333}
334
335impl FromUsize<u32> for u32 {
336 fn from_usize(val: usize) -> u32 {
337 val as u32
338 }
339}
340
341impl FromUsize<u64> for u64 {
342 fn from_usize(val: usize) -> u64 {
343 val as u64
344 }
345}
346
347// --------------------------------------------------
348/// Options for counting the occurrences of suffixes
349#[derive(Debug, Clone)]
350pub struct CountOptions {
351 /// Vector of query strings
352 pub queries: Vec<String>,
353
354 /// Maximum query length for search
355 pub max_query_len: Option<usize>,
356
357 /// When `true`, the suffix array will be placed into memory.
358 /// When `false`, the suffix array will be read from disk.
359 pub low_memory: bool,
360}
361
362// --------------------------------------------------
363/// A struct representing the results of counting the occurrences of suffixes
364#[derive(Debug, PartialEq)]
365pub struct CountResult {
366 /// The ordinal position of the original query
367 pub query_num: usize,
368
369 /// The query string
370 pub query: String,
371
372 /// Number of times a query was found
373 pub count: usize,
374}
375
376// --------------------------------------------------
377/// Options for extracting suffixes from a search
378#[derive(Debug, Clone)]
379pub struct ExtractOptions {
380 /// Vector of query strings
381 pub queries: Vec<String>,
382
383 /// Maximum query length for search
384 pub max_query_len: Option<usize>,
385
386 /// When `true`, the suffix array will be placed into memory.
387 /// When `false`, the suffix array will be read from disk.
388 pub low_memory: bool,
389
390 /// Optional length of prefix to append before found suffixes (context)
391 pub prefix_len: Option<usize>,
392
393 /// Optional limit to the length of the suffix returned
394 pub suffix_len: Option<usize>,
395}
396
397// --------------------------------------------------
398/// The result of querying/extracting suffixes
399#[derive(Debug, PartialEq)]
400pub struct ExtractResult {
401 /// The ordinal position of the original query
402 pub query_num: usize,
403
404 /// The query string
405 pub query: String,
406
407 /// A vector of the sequences containing the queries and locations
408 pub sequences: Vec<ExtractSequence>,
409}
410
411// --------------------------------------------------
412/// A struct describing a found query in the context of a sequence
413/// This struct is used by the `sufr extract` command to print the
414/// results of a search in FASTA format.
415/// Normally, the query will be at the beginning of the result, so the
416/// `sequence_start` will be 0.
417/// When looking for alignment seeds, the user may request some prior context
418/// via a `prefix_len`, in which case the sequence start will be whatever
419/// length of prefix appended before the actual found hit. Note that the user
420/// may request something like 100 characters of prefix but less than that is
421/// appended because the hit was closer than 100 to the start of the sequence.
422#[derive(Debug, PartialEq)]
423pub struct ExtractSequence {
424 /// The position of the suffix in the suffix array
425 pub suffix: usize,
426
427 /// The rank of the suffix in the suffix array
428 pub rank: usize,
429
430 /// The name of the sequence containing a query hit
431 pub sequence_name: String,
432
433 /// The start/offset of the containing sequence in the full `text`
434 pub sequence_start: usize,
435
436 /// The hit's relative start/stop range inside the sequence
437 /// including the prefix/suffix lengths shown
438 pub sequence_range: Range<usize>,
439
440 /// The query hit's start position from the beginning of the shown context
441 /// E.g., if the user requested a prefix of 10, then this value will be
442 /// between 0-10, depending on the location of the hit inside the sequence.
443 pub suffix_offset: usize,
444}
445
446// --------------------------------------------------
447/// Arguments to sufr_file.list
448#[derive(Debug, Clone)]
449pub struct ListOptions {
450 /// Ranks of suffixes to show
451 pub ranks: Vec<usize>,
452
453 /// Show rank column
454 pub show_rank: bool,
455
456 /// Show suffix position column
457 pub show_suffix: bool,
458
459 /// Show LCP column
460 pub show_lcp: bool,
461
462 /// Length of suffixes to show
463 pub len: Option<usize>,
464
465 /// Number of suffixes to show
466 pub number: Option<usize>,
467
468 /// Output
469 //pub output: &'a mut Box<dyn Write>,
470 pub output: Option<String>,
471}
472
473// --------------------------------------------------
474/// A struct for use in locating suffixes
475#[derive(Debug, Clone)]
476pub struct LocateOptions {
477 /// A vector of query strings
478 pub queries: Vec<String>,
479
480 /// A maximum query length to use.
481 /// If the suffix array was sorted with a shorter MQL, that
482 /// value will be used instead.
483 pub max_query_len: Option<usize>,
484
485 /// When `true`, the suffix array will be placed into memory.
486 /// When `false`, the suffix array will be read from disk.
487 pub low_memory: bool,
488}
489
490// --------------------------------------------------
491/// A struct representing the results of a search that includes the
492/// locations of the suffixes in their sequence context.
493#[derive(Debug, PartialEq)]
494pub struct LocateResult {
495 /// The ordinal position of the original query
496 pub query_num: usize,
497
498 /// The query string
499 pub query: String,
500
501 /// A vector of positions where the query was found.
502 /// This will be empty when the query was not present.
503 pub positions: Vec<LocatePosition>,
504}
505
506// --------------------------------------------------
507/// A struct representing the relative location of a query hit
508/// in the context of a sequence.
509#[derive(Debug, PartialEq)]
510pub struct LocatePosition {
511 /// The position of the suffix in the suffix array
512 pub suffix: usize,
513
514 /// The rank of the suffix in the suffix array
515 pub rank: usize,
516
517 /// The name of the sequence containing a query hit
518 pub sequence_name: String,
519
520 /// The start position of the hit in the sequence
521 pub sequence_position: usize,
522}
523
524// --------------------------------------------------
525/// The arguments for creating a `SufrBuilder` struct
526#[derive(Clone, Debug)]
527pub struct SufrBuilderArgs {
528 /// Text as raw U8 bytes. cf. `libsufr::utils::read_sequence_file`
529 /// Note: this value will be kept in memory during the build process.
530 pub text: Vec<u8>,
531
532 /// The path to the .sufr file that will be written.
533 pub path: Option<String>,
534
535 /// Use low memory when reading in resulting suffix array
536 pub low_memory: bool,
537
538 /// Maximum query length determines a prefix length of the suffixes.
539 /// Without this value, suffixes will be fully sorted.
540 pub max_query_len: Option<usize>,
541
542 /// Indicates that the input is nucleotides, which has implications
543 /// for ignoring ambiguity characters (not A, C, G, or T) and
544 /// soft-masked/lowercase characters (usually indicating low-complexity
545 /// regions).
546 pub is_dna: bool,
547
548 /// Whether or not to allow ambiguity characters (not A, C, G, or T)
549 /// when handling nucleotides.
550 pub allow_ambiguity: bool,
551
552 /// Whether or not to ignore lowercased/softmasked nucleotide
553 /// values (when `is_dna` is true).
554 pub ignore_softmask: bool,
555
556 /// When the `text` holds multiple sequences, this value contains
557 /// the start positions of the sequences for locate queries.
558 pub sequence_starts: Vec<usize>,
559
560 /// When the `text` holds multiple sequences, this value contains
561 /// the names of the sequences for locate queries.
562 pub sequence_names: Vec<String>,
563
564 /// The number of on-disk partitions to use when building the suffix array.
565 /// Recommended to be at least the number of available CPUs, but
566 /// a good number would place a 1-3 million suffixes into each partition,
567 /// depending on the amount of available memory.
568 /// The partitions are sorted independently and in parallel.
569 /// Max memory usage will be determined by the average size of the
570 /// partitions (which includes the number of suffixes in a partition
571 /// and the integer size [`u32`, `u64`] to represent the suffixes)
572 /// times the number of threads used to process concurrently.
573 pub num_partitions: usize,
574
575 /// An optional seed mask of 1/0 for care/don't-care positions,
576 /// cf. `SeedMask`.
577 pub seed_mask: Option<String>,
578
579 /// A seed value for reproducibility when randomly choosing the
580 /// suffixes for partitioning.
581 pub random_seed: u64,
582}
583
584// --------------------------------------------------
585/// A struct with metadata about the Sufr file
586#[derive(Debug, PartialEq)]
587pub struct SufrMetadata {
588 /// Filename
589 pub filename: String,
590
591 /// Modified
592 pub modified: DateTime<Local>,
593
594 /// File size
595 pub file_size: usize,
596
597 /// File version
598 pub file_version: usize,
599
600 /// Nucleotides
601 pub is_dna: bool,
602
603 /// Allow ambiguity
604 pub allow_ambiguity: bool,
605
606 /// Ignore softmask
607 pub ignore_softmask: bool,
608
609 /// Text length
610 pub text_len: usize,
611
612 /// Number of suffixes
613 pub len_suffixes: usize,
614
615 /// Number of sequences
616 pub num_sequences: usize,
617
618 /// Start positions of sequences
619 pub sequence_starts: Vec<usize>,
620
621 /// Names of sequences
622 pub sequence_names: Vec<String>,
623
624 /// Sort type
625 pub sort_type: SuffixSortType,
626}
627
628#[cfg(test)]
629mod tests {
630 use super::SeedMask;
631 use anyhow::Result;
632 use pretty_assertions::assert_eq;
633
634 #[test]
635 fn test_create_seed_mask() -> Result<()> {
636 // No 0 at beginning
637 let res = SeedMask::new("0101");
638 assert!(res.is_err());
639
640 // No 0 at end
641 let res = SeedMask::new("1010");
642 assert!(res.is_err());
643
644 // Nothing other than 0/1
645 let res = SeedMask::new("1021");
646 assert!(res.is_err());
647
648 // Must have a 0
649 let res = SeedMask::new("1111");
650 assert!(res.is_err());
651
652 // Valid
653 let res = SeedMask::new("101");
654 assert!(res.is_ok());
655
656 let expected = SeedMask {
657 mask: "101".to_string(),
658 bytes: vec![1, 0, 1],
659 positions: vec![0, 2],
660 differences: vec![0, 1],
661 weight: 2,
662 };
663 assert_eq!(res.unwrap(), expected);
664
665 let res = SeedMask::new("11101101101000011");
666 assert!(res.is_ok());
667
668 let expected = SeedMask {
669 mask: "11101101101000011".to_string(),
670 bytes: vec![1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1],
671 positions: vec![0, 1, 2, 4, 5, 7, 8, 10, 15, 16],
672 differences: vec![0, 0, 0, 1, 1, 2, 2, 3, 7, 7],
673 weight: 10,
674 };
675 assert_eq!(res.unwrap(), expected);
676
677 Ok(())
678 }
679
680 #[test]
681 fn test_display_seed_mask() -> Result<()> {
682 for val in &["101", "11101101101000011"] {
683 let mask = SeedMask::new(val).unwrap();
684 assert_eq!(format!("{mask}"), val.to_string());
685 }
686 Ok(())
687 }
688
689 #[test]
690 fn test_valid_seed_mask() -> Result<()> {
691 let valid = ["101", "1001", "1101", "10101", "1110110110100001"];
692 for pattern in valid {
693 assert!(SeedMask::is_valid(pattern));
694 }
695
696 let invalid = [
697 "", "abc", "1", "11", "111", "0", "00", "0111", "11100", "1a01",
698 ];
699 for pattern in invalid {
700 assert!(!SeedMask::is_valid(pattern));
701 }
702
703 Ok(())
704 }
705
706 #[test]
707 fn test_seed_mask_positions() -> Result<()> {
708 assert_eq!(SeedMask::get_positions(&[1, 0, 1]), [0, 2]);
709 assert_eq!(SeedMask::get_positions(&[1, 1, 0, 1, 1]), [0, 1, 3, 4]);
710 assert_eq!(SeedMask::get_positions(&[1, 0, 1, 1, 0, 1]), [0, 2, 3, 5]);
711 Ok(())
712 }
713
714 #[test]
715 fn test_seed_mask_difference() -> Result<()> {
716 // Empty is not a failure
717 assert_eq!(SeedMask::get_differences(&[]), []);
718
719 // "11011" -> [0, 1, 3, 4]
720 // - 0 1 2 3
721 // ------------
722 // [0, 0, 1, 1]
723 assert_eq!(SeedMask::get_differences(&[0, 1, 3, 4]), [0, 0, 1, 1]);
724
725 // "100001" -> [0, 5]
726 // - 0 1
727 // --------
728 // [0, 4]
729 assert_eq!(SeedMask::get_differences(&[0, 5]), [0, 4]);
730
731 // "1001101" -> [0, 3, 4, 6]
732 // - 0 1 2 3
733 // -------------
734 // [0, 2, 2, 3]
735 assert_eq!(SeedMask::get_differences(&[0, 3, 4, 6]), [0, 2, 2, 3]);
736 Ok(())
737 }
738}