sa_ord/
mod.rs

1/// This module is a pure (attempted idiomatic) Rust implementation of the SA-IS suffix tree
2/// algorithm for slices of Ord + Clone + Hash + Debug types
3pub mod errors;
4
5use bitvec::vec::BitVec;
6use core::hash::Hash;
7use errors::{Error, Result};
8use itertools::Itertools;
9use std::cmp::Ordering;
10use std::collections::HashMap;
11use std::fmt::Debug;
12
13/// Position types (L or S). Implementation of Suffix Types `BitVec` partially borrowed from
14/// [Rust Bio](https://github.com/rust-bio/rust-bio/blob/43d198e87c69867fad39f07374d3fdcd9d879729/src/data_structures/suffix_array.rs)
15#[derive(Debug)]
16struct SuffixTypes {
17    /// `BitVec`, with 1-bits indicating S type and 0 bits L-type
18    types: BitVec,
19}
20
21impl SuffixTypes {
22    /// Instantiates a new `SuffixTypes` object. Characterizes the provided text into L or S type
23    /// suffixes and stores these types in a `BitVec`
24    fn new<S: Ord + Clone + Hash + Debug>(text: &[S]) -> Self {
25        let text_len = text.len();
26        let mut types: BitVec = BitVec::with_capacity(text_len);
27        for _ in 0..text_len - 1 {
28            types.push(false);
29        }
30
31        types.push(true);
32
33        for text_index in (0..text_len - 1).rev() {
34            if text[text_index] == text[text_index + 1] {
35                // if the characters are equal, the next position determines
36                // the lexicographical order
37                let v = types[text_index + 1];
38                types.set(text_index, v);
39            } else {
40                types.set(text_index, text[text_index] < text[text_index + 1]);
41            }
42        }
43        SuffixTypes { types }
44    }
45
46    /// Check if p is S-position.
47    fn is_stype(&self, p: usize) -> bool {
48        self.types[p]
49    }
50
51    /// Check if p is L-position.
52    fn is_ltype(&self, p: usize) -> bool {
53        !(self.types[p])
54    }
55
56    /// Check if p is LMS-position.
57    fn is_leftmost_stype(&self, p: usize) -> bool {
58        p != 0 && self.is_stype(p) && self.is_ltype(p - 1)
59    }
60}
61
62/// An object that allows for sorting of suffix types based on L or S substring properties for the
63/// purposes of ordering 'bins' of the suffix array as described in the SA-IS implementation paper.
64#[derive(Debug, Clone, Hash, PartialEq, Eq)]
65enum SortableSuffixType<S: Ord + Clone + Hash + Debug> {
66    /// 'Smaller' type, indicating that the suffix is smaller than the suffix to its right
67    SType(S),
68    /// 'Larger' type, indicating that the suffix is larger than the suffix to its right
69    LType(S),
70}
71
72impl<S: Ord + Clone + Hash + Debug> PartialOrd for SortableSuffixType<S> {
73    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
74        Some(self.cmp(other))
75    }
76}
77
78impl<S: Ord + Clone + Hash + Debug> Ord for SortableSuffixType<S> {
79    fn cmp(&self, other: &Self) -> Ordering {
80        match (self, other) {
81            (SortableSuffixType::SType(x), SortableSuffixType::LType(y)) => match x.cmp(y) {
82                Ordering::Equal => Ordering::Greater,
83                cmp => cmp,
84            },
85            (SortableSuffixType::LType(x), SortableSuffixType::SType(y)) => match x.cmp(y) {
86                Ordering::Equal => Ordering::Less,
87                cmp => cmp,
88            },
89            (
90                SortableSuffixType::SType(x) | SortableSuffixType::LType(x),
91                SortableSuffixType::SType(y) | SortableSuffixType::LType(y),
92            ) => x.cmp(y),
93        }
94    }
95}
96
97/// Finds the start and end indices of the LMS substrings in the provided arra of suffix classifications
98fn find_lms_substring_indices(suffix_classifications: &SuffixTypes) -> Vec<(usize, usize)> {
99    let lms_num = (0..suffix_classifications.types.len())
100        .filter(|idx| suffix_classifications.is_leftmost_stype(*idx))
101        .count();
102    let mut wstrings: Vec<(usize, usize)> = Vec::with_capacity(lms_num);
103    let mut start = None;
104    for current_index in 0..suffix_classifications.types.len() {
105        if suffix_classifications.is_leftmost_stype(current_index) {
106            if let Some(current_start) = start {
107                wstrings.push((current_start, current_index + 1));
108            }
109            start = Some(current_index);
110        }
111    }
112    wstrings.push((suffix_classifications.types.len() - 1, suffix_classifications.types.len()));
113    wstrings
114}
115
116/// Counts the number of characters of each value and type (L or S).
117fn count_text_bin_sizes<S: Ord + Clone + Hash + Debug>(
118    text: &[S],
119    suffix_classifications: &SuffixTypes,
120) -> HashMap<SortableSuffixType<S>, usize> {
121    let mut bin_sizes: HashMap<SortableSuffixType<S>, usize> = HashMap::new();
122    for (index, suffix_type) in suffix_classifications.types.iter().enumerate() {
123        let char_var = if *suffix_type {
124            SortableSuffixType::SType(text[index].clone())
125        } else {
126            SortableSuffixType::LType(text[index].clone())
127        };
128        if let Some(x) = bin_sizes.get_mut(&char_var) {
129            *x += 1;
130        } else {
131            bin_sizes.insert(char_var, 1);
132        }
133    }
134    bin_sizes
135}
136
137/// Derives the bin offsets for the suffix array given the list of bin sizes.
138fn get_bin_offsets<S: Ord + Clone + Hash + Debug>(
139    bin_sizes: &HashMap<SortableSuffixType<S>, usize>,
140    forward_edge: bool,
141) -> HashMap<SortableSuffixType<S>, usize> {
142    let ordered_chars = bin_sizes.keys().cloned().sorted().collect::<Vec<SortableSuffixType<S>>>();
143    let mut running_counter = 0;
144    let mut bin_offsets: HashMap<SortableSuffixType<S>, usize> =
145        HashMap::with_capacity(ordered_chars.len());
146
147    for x_type in &ordered_chars {
148        if let Some(x) = bin_sizes.get(x_type) {
149            if forward_edge {
150                bin_offsets.insert(x_type.clone(), running_counter);
151                running_counter += *x;
152            } else {
153                running_counter += *x;
154                bin_offsets.insert(x_type.clone(), running_counter - 1);
155            }
156        }
157    }
158    bin_offsets
159}
160
161/// Reduces the provided text into a reduced substring in which LMS substrings are represented by
162/// their indexing in the ordering of the LMS substrings.
163fn get_reduced_substring<S: Ord + Clone + Hash + Debug>(
164    text: &[S],
165    w_slice_indices: &[(usize, usize)],
166    stypes: &SuffixTypes,
167    bin_sizes: &HashMap<SortableSuffixType<S>, usize>,
168) -> Result<(bool, Vec<usize>)> {
169    let mut sort_sa = Vec::with_capacity(text.len());
170    for _ in 0..text.len() {
171        sort_sa.push(-1);
172    }
173    let mut bin_edges = get_bin_offsets(bin_sizes, false);
174    for (start_idx, _) in w_slice_indices {
175        let x = bin_edges.get_mut(&SortableSuffixType::SType(text[*start_idx].clone())).ok_or(
176            Error::ValueNotInBinEdges {
177                val: format!("{:?}", SortableSuffixType::SType(text[*start_idx].clone())),
178            },
179        )?;
180
181        sort_sa[*x] = isize::try_from(*start_idx).map_err(|e| Error::TryFromIntError { e })?;
182        if *x > 0 {
183            *x -= 1;
184        }
185    }
186
187    suffix_array_modification_propagation(&mut sort_sa, text, stypes, bin_sizes, true)?;
188    suffix_array_modification_propagation(&mut sort_sa, text, stypes, bin_sizes, false)?;
189
190    let validated_sa = sort_sa
191        .iter()
192        .map(|i| usize::try_from(*i).map_err(|e| Error::TryFromIntError { e }))
193        .collect::<Result<Vec<usize>>>()?;
194
195    let substring_start_to_ends: HashMap<usize, usize> = w_slice_indices.iter().copied().collect();
196    let mut unique_substring_counter = 0;
197    let mut reduced_substring_mapping: HashMap<usize, usize> =
198        HashMap::with_capacity(w_slice_indices.len());
199
200    let mut previous_substring: Option<&[S]> = None;
201    for text_index in &validated_sa {
202        if stypes.is_leftmost_stype(*text_index) {
203            let substring_end = substring_start_to_ends
204                .get(text_index)
205                .ok_or(Error::LmsIndexError { beginning: *text_index })?;
206            let current_substring = &text[*text_index..*substring_end];
207            if previous_substring.is_some() && previous_substring != Some(current_substring) {
208                unique_substring_counter += 1;
209            }
210            previous_substring = Some(current_substring);
211            reduced_substring_mapping.insert(*text_index, unique_substring_counter);
212        }
213    }
214    let reduced_substring = w_slice_indices
215        .iter()
216        .map(|(lms_substring_start, _)| {
217            Ok(*reduced_substring_mapping
218                .get(lms_substring_start)
219                .ok_or(Error::LmsReductionError { text_index: *lms_substring_start })?)
220        })
221        .collect::<Result<Vec<usize>>>()?;
222
223    Ok((reduced_substring.len() > unique_substring_counter + 1, reduced_substring))
224}
225
226/// Modifies the provided suffix array by doing either a forward or reverse pass, corresponding to
227/// steps 2 or 3 in the induction of SA from SA of the reduced substring described in the SA-IS
228/// paper.
229///
230/// # Errors
231/// Can theoretically return an Err if something goes wrong with the internals of this crate, but
232/// never should
233fn suffix_array_modification_propagation<S: Ord + Clone + Hash + Debug>(
234    sa: &mut Vec<isize>,
235    text: &[S],
236    stypes: &SuffixTypes,
237    bin_sizes: &HashMap<SortableSuffixType<S>, usize>,
238    forward_pass: bool,
239) -> Result<()> {
240    let mut bin_edges = get_bin_offsets(bin_sizes, forward_pass);
241
242    let iter_order: Box<dyn Iterator<Item = usize>> =
243        if forward_pass { Box::new(0..sa.len()) } else { Box::new((0..sa.len()).rev()) };
244    for index in iter_order {
245        let mut val = sa[index];
246        if val == -1 {
247            continue;
248        };
249        if val == 0 {
250            val = isize::try_from(sa.len()).map_err(|e| Error::TryFromIntError { e })? - 1;
251        } else {
252            val -= 1;
253        };
254        let sortable_suffix_type = if forward_pass {
255            SortableSuffixType::LType(
256                text[usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?].clone(),
257            )
258        } else {
259            SortableSuffixType::SType(
260                text[usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?].clone(),
261            )
262        };
263        let check_function_result = if forward_pass {
264            stypes.is_ltype(usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?)
265        } else {
266            stypes.is_stype(usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?)
267        };
268        if check_function_result {
269            let edge = bin_edges.get_mut(&sortable_suffix_type).ok_or_else(|| {
270                Error::ValueNotInBinEdges { val: format!("{:?}", sortable_suffix_type) }
271            })?;
272            sa[*edge] = val;
273            if forward_pass {
274                *edge += 1;
275            } else if *edge > 0 {
276                *edge -= 1;
277            }
278        }
279    }
280    Ok(())
281}
282
283/// Performs SA-IS (Suffix Array - Induced Sorting) of a provided text.
284///
285/// # Errors
286///
287/// Will return `Error::TryFromIntError` if there are more LMS substrings than there are positive
288/// numbers in isize (if so you likely have bigger problems).
289/// May also Error if the last character of the text is not a sentinel character (i.e. the
290/// unambiguous smallest character in the text).
291pub fn sais<S: Ord + Clone + Hash + Debug>(text: &[S]) -> Result<Vec<usize>> {
292    let mut sa: Vec<isize> = vec![-1; text.len()];
293    // Step 1 - Classify all suffixes in the text with SType + LeftmostSType / LType label
294    let stypes = SuffixTypes::new(text);
295    // Step 2 - Divide the buckets into S- and L-Buckets on. TheL-Buckets in front of the S-Buckets.
296    let bin_sizes = count_text_bin_sizes(text, &stypes);
297
298    // Sort the LeftmostSType strings lexicographically and write them in their respective S-Buckets.
299    let w_slice_indices = find_lms_substring_indices(&stypes);
300    let (duplicates, reduced_substring) =
301        get_reduced_substring(text, &w_slice_indices, &stypes, &bin_sizes)?;
302
303    let mut reduced_suffix_array: Vec<usize>;
304    if duplicates {
305        reduced_suffix_array = sais(&reduced_substring)?;
306    } else {
307        reduced_suffix_array = vec![0; reduced_substring.len()];
308        for (original_index, &sort_index) in reduced_substring.iter().enumerate() {
309            reduced_suffix_array[sort_index] = original_index;
310        }
311    }
312    let mut bin_edges = get_bin_offsets(&bin_sizes, false);
313    for reduced_string_index in reduced_suffix_array.iter().rev() {
314        let w_slice_start = w_slice_indices[*reduced_string_index].0;
315        let s_char = text[w_slice_start].clone();
316        let x = bin_edges.get_mut(&SortableSuffixType::SType(s_char)).ok_or(
317            Error::ValueNotInBinEdges {
318                val: format!("{:?}", SortableSuffixType::SType(text[w_slice_start].clone())),
319            },
320        )?;
321
322        sa[*x] = isize::try_from(w_slice_start).map_err(|e| Error::TryFromIntError { e })?;
323        if *x > 0 {
324            *x -= 1;
325        }
326    }
327
328    suffix_array_modification_propagation(&mut sa, text, &stypes, &bin_sizes, true)?;
329    suffix_array_modification_propagation(&mut sa, text, &stypes, &bin_sizes, false)?;
330
331    return sa
332        .iter()
333        .map(|i| usize::try_from(*i).map_err(|e| Error::TryFromIntError { e }))
334        .collect::<Result<Vec<usize>>>();
335}
336
337#[cfg(test)]
338mod tests {
339    use super::*;
340    use bitvec::bitvec;
341    use bitvec::prelude::*;
342
343    #[test]
344    fn test_suffix_types() {
345        let suffix_types = SuffixTypes::new("mmiissiissiippii$".as_bytes());
346
347        let expected_largest_types = bitvec![1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0];
348        let expected_smallest_types = bitvec![0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1];
349        let expected_lms_type = bitvec![0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1];
350        assert_eq!(
351            suffix_types.types.len(),
352            expected_largest_types.len(),
353            "Suffix Types length ({}) didn't match expectation ({})",
354            suffix_types.types.len(),
355            expected_largest_types.len()
356        );
357        for idx in 0..suffix_types.types.len() {
358            assert_eq!(suffix_types.is_ltype(idx), expected_largest_types[idx], "Suffix Type at position {} is L type was expected to resolve to {}, resolved to {}", idx, expected_largest_types[idx], suffix_types.is_ltype(idx));
359            assert_eq!(suffix_types.is_stype(idx), expected_smallest_types[idx], "Suffix Type at position {} is S type was expected to resolve to {}, resolved to {}", idx, expected_smallest_types[idx], suffix_types.is_ltype(idx));
360            assert_eq!(suffix_types.is_leftmost_stype(idx), expected_lms_type[idx], "Suffix Type at position {} is LMS type was expected to resolve to {}, resolved to {}", idx, expected_lms_type[idx], suffix_types.is_leftmost_stype(idx));
361        }
362    }
363
364    #[test]
365    fn test_w_slice_indices() {
366        let text = "mmiissiissiippii$".as_bytes();
367        let suffix_types = SuffixTypes::new(text);
368        let w_slice_indices = find_lms_substring_indices(&suffix_types);
369
370        assert_eq!(w_slice_indices, vec![(2, 7), (6, 11), (10, 17), (16, 17)]);
371        let w_slices: Vec<&[u8]> =
372            w_slice_indices.iter().map(|&(start, end)| &text[start..end]).collect();
373
374        assert_eq!(
375            w_slices,
376            vec!["iissi".as_bytes(), "iissi".as_bytes(), "iippii$".as_bytes(), "$".as_bytes()]
377        );
378    }
379
380    #[test]
381    fn test_count_text_bin_sizes() {
382        let m = "m".as_bytes()[0];
383        let i = "i".as_bytes()[0];
384        let s = "s".as_bytes()[0];
385        let p = "p".as_bytes()[0];
386        let terminator = "$".as_bytes()[0];
387
388        let text = "mmiissiissiippii$".as_bytes();
389        let suffix_types = SuffixTypes::new(text);
390        let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
391
392        assert_eq!(
393            text_bin_sizes,
394            HashMap::from([
395                (SortableSuffixType::LType(m), 2),
396                (SortableSuffixType::LType(i), 2),
397                (SortableSuffixType::SType(i), 6),
398                (SortableSuffixType::LType(s), 4),
399                (SortableSuffixType::LType(p), 2),
400                (SortableSuffixType::SType(terminator), 1),
401            ])
402        );
403    }
404
405    #[test]
406    fn test_get_bin_offsets() {
407        let m = "m".as_bytes()[0];
408        let i = "i".as_bytes()[0];
409        let s = "s".as_bytes()[0];
410        let p = "p".as_bytes()[0];
411        let terminator = "$".as_bytes()[0];
412
413        let text = "mmiissiissiippii$".as_bytes();
414        let suffix_types = SuffixTypes::new(text);
415        let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
416        let forward_bin_offsets = get_bin_offsets(&text_bin_sizes, true);
417        let reverse_bin_offsets = get_bin_offsets(&text_bin_sizes, false);
418
419        assert_eq!(
420            forward_bin_offsets,
421            HashMap::from([
422                (SortableSuffixType::SType(terminator), 0),
423                (SortableSuffixType::LType(i), 1),
424                (SortableSuffixType::SType(i), 3),
425                (SortableSuffixType::LType(m), 9),
426                (SortableSuffixType::LType(p), 11),
427                (SortableSuffixType::LType(s), 13),
428            ])
429        );
430        assert_eq!(
431            reverse_bin_offsets,
432            HashMap::from([
433                (SortableSuffixType::SType(terminator), 0),
434                (SortableSuffixType::LType(i), 2),
435                (SortableSuffixType::SType(i), 8),
436                (SortableSuffixType::LType(m), 10),
437                (SortableSuffixType::LType(p), 12),
438                (SortableSuffixType::LType(s), 16),
439            ])
440        );
441    }
442
443    #[test]
444    fn test_get_reduced_substring() {
445        let text = "mmiissiissiippii$".as_bytes();
446        let suffix_types = SuffixTypes::new(text);
447        let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
448        let w_slice_indices = find_lms_substring_indices(&suffix_types);
449        let (duplicates, reduced_substring) =
450            get_reduced_substring(text, &w_slice_indices, &suffix_types, &text_bin_sizes).unwrap();
451        assert!(duplicates, "Reduced substring of 'mmiissiissiippii$' has duplicates not identified by our approach");
452        assert_eq!(reduced_substring, vec![2, 2, 1, 0]);
453    }
454
455    #[test]
456    fn test_suffix_array_modification_propagation() {
457        let text = "mmiissiissiippii$".as_bytes();
458        let suffix_types = SuffixTypes::new(text);
459        let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
460        let mut initial_suffix_array =
461            vec![16, -1, -1, -1, -1, -1, 10, 6, 2, -1, -1, -1, -1, -1, -1, -1, -1];
462        suffix_array_modification_propagation(
463            &mut initial_suffix_array,
464            text,
465            &suffix_types,
466            &text_bin_sizes,
467            true,
468        )
469        .unwrap();
470        assert_eq!(
471            initial_suffix_array,
472            vec![16, 15, 14, -1, -1, -1, 10, 6, 2, 1, 0, 13, 12, 9, 5, 8, 4]
473        );
474        suffix_array_modification_propagation(
475            &mut initial_suffix_array,
476            text,
477            &suffix_types,
478            &text_bin_sizes,
479            false,
480        )
481        .unwrap();
482        assert_eq!(
483            initial_suffix_array,
484            vec![16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4]
485        );
486    }
487
488    #[test]
489    fn full_test() {
490        let paper_example = "mmiissiissiippii$".as_bytes();
491        assert_eq!(
492            sais(paper_example).unwrap(),
493            [16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4].to_vec()
494        );
495    }
496
497    #[test]
498    fn wholly_repetitive_completes() {
499        let homopolymer = [1_u32, 1, 1, 1, 1, 1, 1, 0];
500        assert_eq!(sais(&homopolymer).unwrap(), [7_usize, 6, 5, 4, 3, 2, 1, 0].to_vec());
501    }
502}