libsufr/
types.rs

1//! Common types
2
3use anyhow::{bail, Result};
4use chrono::{DateTime, Local};
5use regex::Regex;
6use std::{
7    cmp::Ordering,
8    fmt::{self, Debug, Display},
9    hash::Hash,
10    ops::Range,
11    ops::{Add, AddAssign, Div, Sub},
12};
13
14// --------------------------------------------------
15/// Serialization version
16pub const OUTFILE_VERSION: u8 = 6;
17
18/// The sentinel character placed at the end of the text
19/// (and so must not occur in the given text)
20pub const SENTINEL_CHARACTER: u8 = b'$';
21
22// --------------------------------------------------
23/// Describes the suffixes in a _.sufr_ file were sorted
24#[derive(Debug, Clone, PartialEq)]
25pub enum SuffixSortType {
26    /// A maximum query length, which can be zero
27    MaxQueryLen(usize),
28
29    /// A seed mask of 1/0 for care/don't-care positions
30    Mask(SeedMask),
31}
32
33// --------------------------------------------------
34/// A struct describing a seed mask
35#[derive(Debug, PartialEq, Clone)]
36pub struct SeedMask {
37    /// The given mask string of 1/0s
38    pub mask: String,
39
40    /// A vector of 1/0 `u8` valus representing the mask
41    pub bytes: Vec<u8>,
42
43    /// The offset positions of the "care" values (from the 1s)
44    pub positions: Vec<usize>,
45
46    /// A vector of "difference" values to add to the
47    /// range 0..`weight` that will return the "care" positions
48    pub differences: Vec<usize>,
49
50    /// The number of "care" positions (popcount of 1)
51    pub weight: usize,
52}
53
54// --------------------------------------------------
55impl SeedMask {
56    /// Create a new `SeedMask` from an input string.
57    /// The string must:
58    /// * be comprised entirely of 1 or 0
59    /// * start and end with 1
60    /// * contain at least one 0
61    ///
62    /// ```
63    /// use anyhow::Result;
64    /// use libsufr::types::SeedMask;
65    ///
66    /// fn main() -> Result<()> {
67    ///     let mask = SeedMask::new("110110101")?;
68    ///     let expected = SeedMask {
69    ///         mask: "110110101".to_string(),
70    ///         bytes: vec![1u8, 1u8, 0u8, 1u8, 1u8, 0u8, 1u8, 0u8, 1u8],
71    ///         positions: vec![0, 1, 3, 4, 6, 8],
72    ///         differences: vec![0, 0, 1, 1, 2, 3],
73    ///         weight: 6,
74    ///     };
75    ///     assert_eq!(mask, expected);
76    ///     Ok(())
77    /// }
78    /// ```
79    ///
80    pub fn new(mask: &str) -> Result<Self> {
81        if !Self::is_valid(mask) {
82            bail!("Invalid seed mask '{mask}'")
83        }
84
85        let bytes = Self::parse(mask);
86        let positions = Self::get_positions(&bytes);
87        let differences = Self::get_differences(&positions);
88        let weight = positions.len();
89
90        Ok(Self {
91            mask: mask.to_string(),
92            bytes,
93            positions,
94            differences,
95            weight,
96        })
97    }
98
99    /// Instantiate a `SeedMask` from the byte representation
100    /// from a `SufrFile`
101    ///
102    /// ```
103    /// use anyhow::Result;
104    /// use libsufr::types::SeedMask;
105    ///
106    /// fn main() -> Result<()> {
107    ///     let bytes: Vec<u8> = vec![1, 1, 0, 1, 1, 0, 1, 0, 1];
108    ///     let mask = SeedMask::from_bytes(&bytes)?;
109    ///     let expected = SeedMask {
110    ///         mask: "110110101".to_string(),
111    ///         bytes,
112    ///         positions: vec![0, 1, 3, 4, 6, 8],
113    ///         differences: vec![0, 0, 1, 1, 2, 3],
114    ///         weight: 6,
115    ///     };
116    ///     assert_eq!(mask, expected);
117    ///     Ok(())
118    /// }
119    /// ```
120    ///
121    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
122        let mask: String = bytes
123            .iter()
124            .filter_map(|b| match b {
125                0 => Some('0'),
126                1 => Some('1'),
127                _ => None,
128            })
129            .collect();
130        if mask.is_empty() {
131            bail!("Bytes must be 1 or 0");
132        }
133        let positions = Self::get_positions(bytes);
134        let differences = Self::get_differences(&positions);
135        let weight = positions.len();
136
137        Ok(Self {
138            mask,
139            bytes: bytes.to_vec(),
140            positions,
141            differences,
142            weight,
143        })
144    }
145
146    /// Determine if a seed mask is valid
147    ///
148    /// ```
149    /// use anyhow::Result;
150    /// use libsufr::types::SeedMask;
151    ///
152    /// fn main() -> Result<()> {
153    ///     assert!(SeedMask::new("").is_err());
154    ///     assert!(SeedMask::new("0").is_err());
155    ///     assert!(SeedMask::new("01").is_err());
156    ///     assert!(SeedMask::new("10").is_err());
157    ///     assert!(SeedMask::new("11").is_err());
158    ///     assert!(SeedMask::new("101").is_ok());
159    ///     Ok(())
160    /// }
161    /// ```
162    ///
163    fn is_valid(mask: &str) -> bool {
164        let seed_re = Regex::new("^1+0[01]*1$").unwrap();
165        seed_re.is_match(mask)
166    }
167
168    /// Turn a valid string mask into a vector of `u8` values
169    /// suitable for serialization.
170    fn parse(mask: &str) -> Vec<u8> {
171        mask.as_bytes()
172            .iter()
173            .flat_map(|b| match b {
174                b'1' => Some(1),
175                b'0' => Some(0),
176                _ => None,
177            })
178            .collect()
179    }
180
181    /// Find the differences to add to each index to get the offsets
182    /// of the "care" positions.
183    fn get_differences(positions: &[usize]) -> Vec<usize> {
184        // Mask: "1001101"
185        // M: [0, 3, 4, 6]
186        // U: [0, 1, 2, 3]
187        // D: [0, 2, 2, 3]
188        positions.iter().enumerate().map(|(i, &m)| m - i).collect()
189    }
190
191    /// Return the offsets of the "care" positions from the bytes
192    fn get_positions(bytes: &[u8]) -> Vec<usize> {
193        // [1, 0, 1] -> [0, 2]
194        bytes
195            .iter()
196            .enumerate()
197            .filter_map(|(i, &b)| (b == 1).then_some(i))
198            .collect()
199    }
200}
201
202impl Display for SeedMask {
203    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204        write!(f, "{}", self.mask)
205    }
206}
207
208// --------------------------------------------------
209/// A struct for use in searching the suffix array
210#[derive(Debug, Clone)]
211pub struct SearchOptions {
212    /// A vector of query strings
213    pub queries: Vec<String>,
214
215    /// A maximum query length to use.
216    /// If the suffix array was sorted with a shorter MQL, that
217    /// value will be used instead.
218    pub max_query_len: Option<usize>,
219
220    /// When `true`, the suffix array will be placed into memory. 
221    /// When `false`, the suffix array will be read from disk.
222    pub low_memory: bool,
223
224    /// Whether or not to return location information or to simply count
225    pub find_suffixes: bool,
226}
227
228// --------------------------------------------------
229/// A struct representing the result of a suffix array search
230#[derive(Debug, PartialEq)]
231pub struct SearchResult<T>
232where
233    T: Int + FromUsize<T> + Sized + Send + Sync + serde::ser::Serialize,
234{
235    /// The ordinal position of the query
236    pub query_num: usize,
237
238    /// The query string
239    pub query: String,
240
241    /// A optional summary of the places where the query was found
242    pub locations: Option<SearchResultLocations<T>>,
243}
244
245// --------------------------------------------------
246/// The suffix locations matching a given query.
247#[derive(Debug, PartialEq)]
248pub struct SearchResultLocations<T> {
249    /// The range of suffix ranks
250    pub ranks: Range<usize>,
251
252    /// A vector of the suffix locations
253    pub suffixes: Vec<T>,
254}
255
256// --------------------------------------------------
257/// A struct describing how a query compares to a suffix
258#[derive(Debug)]
259pub struct Comparison {
260    /// Whether the suffix is greater, less, or equal
261    pub cmp: Ordering,
262
263    /// The length of the longest common prefix (LCP)
264    /// between the query and the suffix
265    pub lcp: usize,
266}
267
268// --------------------------------------------------
269/// This struct is returned by `libsufr::utils::read_sequence_file`
270/// for reading sequences from a FASTA/Q file.
271#[derive(Debug)]
272pub struct SequenceFileData {
273    /// The sequence as a vector of bytes. Multiple sequences are
274    /// separated by a user-supplied sequence delimiter.
275    pub seq: Vec<u8>,
276
277    /// The offsets where each sequence starts
278    pub start_positions: Vec<usize>,
279
280    /// The names of the sequences, will be the same length as `start_positions`
281    pub sequence_names: Vec<String>,
282}
283
284// --------------------------------------------------
285/// Trait to generically describe an "integer" of size `u8` (for text/bytes),
286/// `u32` for suffix/LCP arrays over a text with a length < 2^32,
287/// or `u64` for longer texts.
288pub trait Int:
289    Debug
290    + AddAssign
291    + Add<Output = Self>
292    + Sub<Output = Self>
293    + Div<Output = Self>
294    + Copy
295    + Default
296    + Display
297    + Ord
298    + Hash
299    + serde::ser::Serialize
300{
301    /// Convert an `Int` to a `usize`
302    fn to_usize(&self) -> usize;
303}
304
305impl Int for u8 {
306    fn to_usize(&self) -> usize {
307        *self as usize
308    }
309}
310
311impl Int for u32 {
312    fn to_usize(&self) -> usize {
313        *self as usize
314    }
315}
316
317impl Int for u64 {
318    fn to_usize(&self) -> usize {
319        *self as usize
320    }
321}
322
323/// Convert a `usize` to an `Int`
324pub trait FromUsize<T> {
325    /// Convert a `usize` to an `Int`
326    fn from_usize(val: usize) -> T;
327}
328
329impl FromUsize<u8> for u8 {
330    fn from_usize(val: usize) -> u8 {
331        val as u8
332    }
333}
334
335impl FromUsize<u32> for u32 {
336    fn from_usize(val: usize) -> u32 {
337        val as u32
338    }
339}
340
341impl FromUsize<u64> for u64 {
342    fn from_usize(val: usize) -> u64 {
343        val as u64
344    }
345}
346
347// --------------------------------------------------
348/// Options for counting the occurrences of suffixes
349#[derive(Debug, Clone)]
350pub struct CountOptions {
351    /// Vector of query strings
352    pub queries: Vec<String>,
353
354    /// Maximum query length for search
355    pub max_query_len: Option<usize>,
356
357    /// When `true`, the suffix array will be placed into memory. 
358    /// When `false`, the suffix array will be read from disk.
359    pub low_memory: bool,
360}
361
362// --------------------------------------------------
363/// A struct representing the results of counting the occurrences of suffixes
364#[derive(Debug, PartialEq)]
365pub struct CountResult {
366    /// The ordinal position of the original query
367    pub query_num: usize,
368
369    /// The query string
370    pub query: String,
371
372    /// Number of times a query was found
373    pub count: usize,
374}
375
376// --------------------------------------------------
377/// Options for extracting suffixes from a search
378#[derive(Debug, Clone)]
379pub struct ExtractOptions {
380    /// Vector of query strings
381    pub queries: Vec<String>,
382
383    /// Maximum query length for search
384    pub max_query_len: Option<usize>,
385
386    /// When `true`, the suffix array will be placed into memory. 
387    /// When `false`, the suffix array will be read from disk.
388    pub low_memory: bool,
389
390    /// Optional length of prefix to append before found suffixes (context)
391    pub prefix_len: Option<usize>,
392
393    /// Optional limit to the length of the suffix returned
394    pub suffix_len: Option<usize>,
395}
396
397// --------------------------------------------------
398/// The result of querying/extracting suffixes
399#[derive(Debug, PartialEq)]
400pub struct ExtractResult {
401    /// The ordinal position of the original query
402    pub query_num: usize,
403
404    /// The query string
405    pub query: String,
406
407    /// A vector of the sequences containing the queries and locations
408    pub sequences: Vec<ExtractSequence>,
409}
410
411// --------------------------------------------------
412/// A struct describing a found query in the context of a sequence
413/// This struct is used by the `sufr extract` command to print the
414/// results of a search in FASTA format.
415/// Normally, the query will be at the beginning of the result, so the
416/// `sequence_start` will be 0.
417/// When looking for alignment seeds, the user may request some prior context
418/// via a `prefix_len`, in which case the sequence start will be whatever
419/// length of prefix appended before the actual found hit. Note that the user
420/// may request something like 100 characters of prefix but less than that is
421/// appended because the hit was closer than 100 to the start of the sequence.
422#[derive(Debug, PartialEq)]
423pub struct ExtractSequence {
424    /// The position of the suffix in the suffix array
425    pub suffix: usize,
426
427    /// The rank of the suffix in the suffix array
428    pub rank: usize,
429
430    /// The name of the sequence containing a query hit
431    pub sequence_name: String,
432
433    /// The start/offset of the containing sequence in the full `text`
434    pub sequence_start: usize,
435
436    /// The hit's relative start/stop range inside the sequence
437    /// including the prefix/suffix lengths shown
438    pub sequence_range: Range<usize>,
439
440    /// The query hit's start position from the beginning of the shown context
441    /// E.g., if the user requested a prefix of 10, then this value will be
442    /// between 0-10, depending on the location of the hit inside the sequence.
443    pub suffix_offset: usize,
444}
445
446// --------------------------------------------------
447/// Arguments to sufr_file.list
448#[derive(Debug, Clone)]
449pub struct ListOptions {
450    /// Ranks of suffixes to show
451    pub ranks: Vec<usize>,
452
453    /// Show rank column
454    pub show_rank: bool,
455
456    /// Show suffix position column
457    pub show_suffix: bool,
458
459    /// Show LCP column
460    pub show_lcp: bool,
461
462    /// Length of suffixes to show
463    pub len: Option<usize>,
464
465    /// Number of suffixes to show
466    pub number: Option<usize>,
467
468    /// Output
469    //pub output: &'a mut Box<dyn Write>,
470    pub output: Option<String>,
471}
472
473// --------------------------------------------------
474/// A struct for use in locating suffixes
475#[derive(Debug, Clone)]
476pub struct LocateOptions {
477    /// A vector of query strings
478    pub queries: Vec<String>,
479
480    /// A maximum query length to use.
481    /// If the suffix array was sorted with a shorter MQL, that
482    /// value will be used instead.
483    pub max_query_len: Option<usize>,
484
485    /// When `true`, the suffix array will be placed into memory. 
486    /// When `false`, the suffix array will be read from disk.
487    pub low_memory: bool,
488}
489
490// --------------------------------------------------
491/// A struct representing the results of a search that includes the
492/// locations of the suffixes in their sequence context.
493#[derive(Debug, PartialEq)]
494pub struct LocateResult {
495    /// The ordinal position of the original query
496    pub query_num: usize,
497
498    /// The query string
499    pub query: String,
500
501    /// A vector of positions where the query was found.
502    /// This will be empty when the query was not present.
503    pub positions: Vec<LocatePosition>,
504}
505
506// --------------------------------------------------
507/// A struct representing the relative location of a query hit
508/// in the context of a sequence.
509#[derive(Debug, PartialEq)]
510pub struct LocatePosition {
511    /// The position of the suffix in the suffix array
512    pub suffix: usize,
513
514    /// The rank of the suffix in the suffix array
515    pub rank: usize,
516
517    /// The name of the sequence containing a query hit
518    pub sequence_name: String,
519
520    /// The start position of the hit in the sequence
521    pub sequence_position: usize,
522}
523
524// --------------------------------------------------
525/// The arguments for creating a `SufrBuilder` struct
526#[derive(Clone, Debug)]
527pub struct SufrBuilderArgs {
528    /// Text as raw U8 bytes. cf. `libsufr::utils::read_sequence_file`
529    /// Note: this value will be kept in memory during the build process.
530    pub text: Vec<u8>,
531
532    /// The path to the .sufr file that will be written.
533    pub path: Option<String>,
534
535    /// Use low memory when reading in resulting suffix array
536    pub low_memory: bool,
537
538    /// Maximum query length determines a prefix length of the suffixes.
539    /// Without this value, suffixes will be fully sorted.
540    pub max_query_len: Option<usize>,
541
542    /// Indicates that the input is nucleotides, which has implications
543    /// for ignoring ambiguity characters (not A, C, G, or T) and
544    /// soft-masked/lowercase characters (usually indicating low-complexity
545    /// regions).
546    pub is_dna: bool,
547
548    /// Whether or not to allow ambiguity characters (not A, C, G, or T)
549    /// when handling nucleotides.
550    pub allow_ambiguity: bool,
551
552    /// Whether or not to ignore lowercased/softmasked nucleotide
553    /// values (when `is_dna` is true).
554    pub ignore_softmask: bool,
555
556    /// When the `text` holds multiple sequences, this value contains
557    /// the start positions of the sequences for locate queries.
558    pub sequence_starts: Vec<usize>,
559
560    /// When the `text` holds multiple sequences, this value contains
561    /// the names of the sequences for locate queries.
562    pub sequence_names: Vec<String>,
563
564    /// The number of on-disk partitions to use when building the suffix array.
565    /// Recommended to be at least the number of available CPUs, but
566    /// a good number would place a 1-3 million suffixes into each partition,
567    /// depending on the amount of available memory.
568    /// The partitions are sorted independently and in parallel.
569    /// Max memory usage will be determined by the average size of the
570    /// partitions (which includes the number of suffixes in a partition
571    /// and the integer size [`u32`, `u64`] to represent the suffixes)
572    /// times the number of threads used to process concurrently.
573    pub num_partitions: usize,
574
575    /// An optional seed mask of 1/0 for care/don't-care positions,
576    /// cf. `SeedMask`.
577    pub seed_mask: Option<String>,
578
579    /// A seed value for reproducibility when randomly choosing the
580    /// suffixes for partitioning.
581    pub random_seed: u64,
582}
583
584// --------------------------------------------------
585/// A struct with metadata about the Sufr file
586#[derive(Debug, PartialEq)]
587pub struct SufrMetadata {
588    /// Filename
589    pub filename: String,
590
591    /// Modified
592    pub modified: DateTime<Local>,
593
594    /// File size
595    pub file_size: usize,
596
597    /// File version
598    pub file_version: usize,
599
600    /// Nucleotides
601    pub is_dna: bool,
602
603    /// Allow ambiguity
604    pub allow_ambiguity: bool,
605
606    /// Ignore softmask
607    pub ignore_softmask: bool,
608
609    /// Text length
610    pub text_len: usize,
611
612    /// Number of suffixes
613    pub len_suffixes: usize,
614
615    /// Number of sequences
616    pub num_sequences: usize,
617
618    /// Start positions of sequences
619    pub sequence_starts: Vec<usize>,
620
621    /// Names of sequences
622    pub sequence_names: Vec<String>,
623
624    /// Sort type
625    pub sort_type: SuffixSortType,
626}
627
628#[cfg(test)]
629mod tests {
630    use super::SeedMask;
631    use anyhow::Result;
632    use pretty_assertions::assert_eq;
633
634    #[test]
635    fn test_create_seed_mask() -> Result<()> {
636        // No 0 at beginning
637        let res = SeedMask::new("0101");
638        assert!(res.is_err());
639
640        // No 0 at end
641        let res = SeedMask::new("1010");
642        assert!(res.is_err());
643
644        // Nothing other than 0/1
645        let res = SeedMask::new("1021");
646        assert!(res.is_err());
647
648        // Must have a 0
649        let res = SeedMask::new("1111");
650        assert!(res.is_err());
651
652        // Valid
653        let res = SeedMask::new("101");
654        assert!(res.is_ok());
655
656        let expected = SeedMask {
657            mask: "101".to_string(),
658            bytes: vec![1, 0, 1],
659            positions: vec![0, 2],
660            differences: vec![0, 1],
661            weight: 2,
662        };
663        assert_eq!(res.unwrap(), expected);
664
665        let res = SeedMask::new("11101101101000011");
666        assert!(res.is_ok());
667
668        let expected = SeedMask {
669            mask: "11101101101000011".to_string(),
670            bytes: vec![1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1],
671            positions: vec![0, 1, 2, 4, 5, 7, 8, 10, 15, 16],
672            differences: vec![0, 0, 0, 1, 1, 2, 2, 3, 7, 7],
673            weight: 10,
674        };
675        assert_eq!(res.unwrap(), expected);
676
677        Ok(())
678    }
679
680    #[test]
681    fn test_display_seed_mask() -> Result<()> {
682        for val in &["101", "11101101101000011"] {
683            let mask = SeedMask::new(val).unwrap();
684            assert_eq!(format!("{mask}"), val.to_string());
685        }
686        Ok(())
687    }
688
689    #[test]
690    fn test_valid_seed_mask() -> Result<()> {
691        let valid = ["101", "1001", "1101", "10101", "1110110110100001"];
692        for pattern in valid {
693            assert!(SeedMask::is_valid(pattern));
694        }
695
696        let invalid = [
697            "", "abc", "1", "11", "111", "0", "00", "0111", "11100", "1a01",
698        ];
699        for pattern in invalid {
700            assert!(!SeedMask::is_valid(pattern));
701        }
702
703        Ok(())
704    }
705
706    #[test]
707    fn test_seed_mask_positions() -> Result<()> {
708        assert_eq!(SeedMask::get_positions(&[1, 0, 1]), [0, 2]);
709        assert_eq!(SeedMask::get_positions(&[1, 1, 0, 1, 1]), [0, 1, 3, 4]);
710        assert_eq!(SeedMask::get_positions(&[1, 0, 1, 1, 0, 1]), [0, 2, 3, 5]);
711        Ok(())
712    }
713
714    #[test]
715    fn test_seed_mask_difference() -> Result<()> {
716        // Empty is not a failure
717        assert_eq!(SeedMask::get_differences(&[]), []);
718
719        // "11011" -> [0, 1, 3, 4]
720        //           - 0  1  2  3
721        //            ------------
722        //            [0, 0, 1, 1]
723        assert_eq!(SeedMask::get_differences(&[0, 1, 3, 4]), [0, 0, 1, 1]);
724
725        // "100001" -> [0, 5]
726        //            - 0  1
727        //            --------
728        //             [0, 4]
729        assert_eq!(SeedMask::get_differences(&[0, 5]), [0, 4]);
730
731        // "1001101" -> [0, 3, 4, 6]
732        //             - 0  1  2  3
733        //             -------------
734        //              [0, 2, 2, 3]
735        assert_eq!(SeedMask::get_differences(&[0, 3, 4, 6]), [0, 2, 2, 3]);
736        Ok(())
737    }
738}