1pub mod errors;
4
5use bitvec::vec::BitVec;
6use core::hash::Hash;
7use errors::{Error, Result};
8use itertools::Itertools;
9use std::cmp::Ordering;
10use std::collections::HashMap;
11use std::fmt::Debug;
12
13#[derive(Debug)]
16struct SuffixTypes {
17 types: BitVec,
19}
20
21impl SuffixTypes {
22 fn new<S: Ord + Clone + Hash + Debug>(text: &[S]) -> Self {
25 let text_len = text.len();
26 let mut types: BitVec = BitVec::with_capacity(text_len);
27 for _ in 0..text_len - 1 {
28 types.push(false);
29 }
30
31 types.push(true);
32
33 for text_index in (0..text_len - 1).rev() {
34 if text[text_index] == text[text_index + 1] {
35 let v = types[text_index + 1];
38 types.set(text_index, v);
39 } else {
40 types.set(text_index, text[text_index] < text[text_index + 1]);
41 }
42 }
43 SuffixTypes { types }
44 }
45
46 fn is_stype(&self, p: usize) -> bool {
48 self.types[p]
49 }
50
51 fn is_ltype(&self, p: usize) -> bool {
53 !(self.types[p])
54 }
55
56 fn is_leftmost_stype(&self, p: usize) -> bool {
58 p != 0 && self.is_stype(p) && self.is_ltype(p - 1)
59 }
60}
61
62#[derive(Debug, Clone, Hash, PartialEq, Eq)]
65enum SortableSuffixType<S: Ord + Clone + Hash + Debug> {
66 SType(S),
68 LType(S),
70}
71
72impl<S: Ord + Clone + Hash + Debug> PartialOrd for SortableSuffixType<S> {
73 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
74 Some(self.cmp(other))
75 }
76}
77
78impl<S: Ord + Clone + Hash + Debug> Ord for SortableSuffixType<S> {
79 fn cmp(&self, other: &Self) -> Ordering {
80 match (self, other) {
81 (SortableSuffixType::SType(x), SortableSuffixType::LType(y)) => match x.cmp(y) {
82 Ordering::Equal => Ordering::Greater,
83 cmp => cmp,
84 },
85 (SortableSuffixType::LType(x), SortableSuffixType::SType(y)) => match x.cmp(y) {
86 Ordering::Equal => Ordering::Less,
87 cmp => cmp,
88 },
89 (
90 SortableSuffixType::SType(x) | SortableSuffixType::LType(x),
91 SortableSuffixType::SType(y) | SortableSuffixType::LType(y),
92 ) => x.cmp(y),
93 }
94 }
95}
96
97fn find_lms_substring_indices(suffix_classifications: &SuffixTypes) -> Vec<(usize, usize)> {
99 let lms_num = (0..suffix_classifications.types.len())
100 .filter(|idx| suffix_classifications.is_leftmost_stype(*idx))
101 .count();
102 let mut wstrings: Vec<(usize, usize)> = Vec::with_capacity(lms_num);
103 let mut start = None;
104 for current_index in 0..suffix_classifications.types.len() {
105 if suffix_classifications.is_leftmost_stype(current_index) {
106 if let Some(current_start) = start {
107 wstrings.push((current_start, current_index + 1));
108 }
109 start = Some(current_index);
110 }
111 }
112 wstrings.push((suffix_classifications.types.len() - 1, suffix_classifications.types.len()));
113 wstrings
114}
115
116fn count_text_bin_sizes<S: Ord + Clone + Hash + Debug>(
118 text: &[S],
119 suffix_classifications: &SuffixTypes,
120) -> HashMap<SortableSuffixType<S>, usize> {
121 let mut bin_sizes: HashMap<SortableSuffixType<S>, usize> = HashMap::new();
122 for (index, suffix_type) in suffix_classifications.types.iter().enumerate() {
123 let char_var = if *suffix_type {
124 SortableSuffixType::SType(text[index].clone())
125 } else {
126 SortableSuffixType::LType(text[index].clone())
127 };
128 if let Some(x) = bin_sizes.get_mut(&char_var) {
129 *x += 1;
130 } else {
131 bin_sizes.insert(char_var, 1);
132 }
133 }
134 bin_sizes
135}
136
137fn get_bin_offsets<S: Ord + Clone + Hash + Debug>(
139 bin_sizes: &HashMap<SortableSuffixType<S>, usize>,
140 forward_edge: bool,
141) -> HashMap<SortableSuffixType<S>, usize> {
142 let ordered_chars = bin_sizes.keys().cloned().sorted().collect::<Vec<SortableSuffixType<S>>>();
143 let mut running_counter = 0;
144 let mut bin_offsets: HashMap<SortableSuffixType<S>, usize> =
145 HashMap::with_capacity(ordered_chars.len());
146
147 for x_type in &ordered_chars {
148 if let Some(x) = bin_sizes.get(x_type) {
149 if forward_edge {
150 bin_offsets.insert(x_type.clone(), running_counter);
151 running_counter += *x;
152 } else {
153 running_counter += *x;
154 bin_offsets.insert(x_type.clone(), running_counter - 1);
155 }
156 }
157 }
158 bin_offsets
159}
160
161fn get_reduced_substring<S: Ord + Clone + Hash + Debug>(
164 text: &[S],
165 w_slice_indices: &[(usize, usize)],
166 stypes: &SuffixTypes,
167 bin_sizes: &HashMap<SortableSuffixType<S>, usize>,
168) -> Result<(bool, Vec<usize>)> {
169 let mut sort_sa = Vec::with_capacity(text.len());
170 for _ in 0..text.len() {
171 sort_sa.push(-1);
172 }
173 let mut bin_edges = get_bin_offsets(bin_sizes, false);
174 for (start_idx, _) in w_slice_indices {
175 let x = bin_edges.get_mut(&SortableSuffixType::SType(text[*start_idx].clone())).ok_or(
176 Error::ValueNotInBinEdges {
177 val: format!("{:?}", SortableSuffixType::SType(text[*start_idx].clone())),
178 },
179 )?;
180
181 sort_sa[*x] = isize::try_from(*start_idx).map_err(|e| Error::TryFromIntError { e })?;
182 if *x > 0 {
183 *x -= 1;
184 }
185 }
186
187 suffix_array_modification_propagation(&mut sort_sa, text, stypes, bin_sizes, true)?;
188 suffix_array_modification_propagation(&mut sort_sa, text, stypes, bin_sizes, false)?;
189
190 let validated_sa = sort_sa
191 .iter()
192 .map(|i| usize::try_from(*i).map_err(|e| Error::TryFromIntError { e }))
193 .collect::<Result<Vec<usize>>>()?;
194
195 let substring_start_to_ends: HashMap<usize, usize> = w_slice_indices.iter().copied().collect();
196 let mut unique_substring_counter = 0;
197 let mut reduced_substring_mapping: HashMap<usize, usize> =
198 HashMap::with_capacity(w_slice_indices.len());
199
200 let mut previous_substring: Option<&[S]> = None;
201 for text_index in &validated_sa {
202 if stypes.is_leftmost_stype(*text_index) {
203 let substring_end = substring_start_to_ends
204 .get(text_index)
205 .ok_or(Error::LmsIndexError { beginning: *text_index })?;
206 let current_substring = &text[*text_index..*substring_end];
207 if previous_substring.is_some() && previous_substring != Some(current_substring) {
208 unique_substring_counter += 1;
209 }
210 previous_substring = Some(current_substring);
211 reduced_substring_mapping.insert(*text_index, unique_substring_counter);
212 }
213 }
214 let reduced_substring = w_slice_indices
215 .iter()
216 .map(|(lms_substring_start, _)| {
217 Ok(*reduced_substring_mapping
218 .get(lms_substring_start)
219 .ok_or(Error::LmsReductionError { text_index: *lms_substring_start })?)
220 })
221 .collect::<Result<Vec<usize>>>()?;
222
223 Ok((reduced_substring.len() > unique_substring_counter + 1, reduced_substring))
224}
225
226fn suffix_array_modification_propagation<S: Ord + Clone + Hash + Debug>(
234 sa: &mut Vec<isize>,
235 text: &[S],
236 stypes: &SuffixTypes,
237 bin_sizes: &HashMap<SortableSuffixType<S>, usize>,
238 forward_pass: bool,
239) -> Result<()> {
240 let mut bin_edges = get_bin_offsets(bin_sizes, forward_pass);
241
242 let iter_order: Box<dyn Iterator<Item = usize>> =
243 if forward_pass { Box::new(0..sa.len()) } else { Box::new((0..sa.len()).rev()) };
244 for index in iter_order {
245 let mut val = sa[index];
246 if val == -1 {
247 continue;
248 };
249 if val == 0 {
250 val = isize::try_from(sa.len()).map_err(|e| Error::TryFromIntError { e })? - 1;
251 } else {
252 val -= 1;
253 };
254 let sortable_suffix_type = if forward_pass {
255 SortableSuffixType::LType(
256 text[usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?].clone(),
257 )
258 } else {
259 SortableSuffixType::SType(
260 text[usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?].clone(),
261 )
262 };
263 let check_function_result = if forward_pass {
264 stypes.is_ltype(usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?)
265 } else {
266 stypes.is_stype(usize::try_from(val).map_err(|e| Error::TryFromIntError { e })?)
267 };
268 if check_function_result {
269 let edge = bin_edges.get_mut(&sortable_suffix_type).ok_or_else(|| {
270 Error::ValueNotInBinEdges { val: format!("{:?}", sortable_suffix_type) }
271 })?;
272 sa[*edge] = val;
273 if forward_pass {
274 *edge += 1;
275 } else if *edge > 0 {
276 *edge -= 1;
277 }
278 }
279 }
280 Ok(())
281}
282
283pub fn sais<S: Ord + Clone + Hash + Debug>(text: &[S]) -> Result<Vec<usize>> {
292 let mut sa: Vec<isize> = vec![-1; text.len()];
293 let stypes = SuffixTypes::new(text);
295 let bin_sizes = count_text_bin_sizes(text, &stypes);
297
298 let w_slice_indices = find_lms_substring_indices(&stypes);
300 let (duplicates, reduced_substring) =
301 get_reduced_substring(text, &w_slice_indices, &stypes, &bin_sizes)?;
302
303 let mut reduced_suffix_array: Vec<usize>;
304 if duplicates {
305 reduced_suffix_array = sais(&reduced_substring)?;
306 } else {
307 reduced_suffix_array = vec![0; reduced_substring.len()];
308 for (original_index, &sort_index) in reduced_substring.iter().enumerate() {
309 reduced_suffix_array[sort_index] = original_index;
310 }
311 }
312 let mut bin_edges = get_bin_offsets(&bin_sizes, false);
313 for reduced_string_index in reduced_suffix_array.iter().rev() {
314 let w_slice_start = w_slice_indices[*reduced_string_index].0;
315 let s_char = text[w_slice_start].clone();
316 let x = bin_edges.get_mut(&SortableSuffixType::SType(s_char)).ok_or(
317 Error::ValueNotInBinEdges {
318 val: format!("{:?}", SortableSuffixType::SType(text[w_slice_start].clone())),
319 },
320 )?;
321
322 sa[*x] = isize::try_from(w_slice_start).map_err(|e| Error::TryFromIntError { e })?;
323 if *x > 0 {
324 *x -= 1;
325 }
326 }
327
328 suffix_array_modification_propagation(&mut sa, text, &stypes, &bin_sizes, true)?;
329 suffix_array_modification_propagation(&mut sa, text, &stypes, &bin_sizes, false)?;
330
331 return sa
332 .iter()
333 .map(|i| usize::try_from(*i).map_err(|e| Error::TryFromIntError { e }))
334 .collect::<Result<Vec<usize>>>();
335}
336
337#[cfg(test)]
338mod tests {
339 use super::*;
340 use bitvec::bitvec;
341 use bitvec::prelude::*;
342
343 #[test]
344 fn test_suffix_types() {
345 let suffix_types = SuffixTypes::new("mmiissiissiippii$".as_bytes());
346
347 let expected_largest_types = bitvec![1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0];
348 let expected_smallest_types = bitvec![0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1];
349 let expected_lms_type = bitvec![0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1];
350 assert_eq!(
351 suffix_types.types.len(),
352 expected_largest_types.len(),
353 "Suffix Types length ({}) didn't match expectation ({})",
354 suffix_types.types.len(),
355 expected_largest_types.len()
356 );
357 for idx in 0..suffix_types.types.len() {
358 assert_eq!(suffix_types.is_ltype(idx), expected_largest_types[idx], "Suffix Type at position {} is L type was expected to resolve to {}, resolved to {}", idx, expected_largest_types[idx], suffix_types.is_ltype(idx));
359 assert_eq!(suffix_types.is_stype(idx), expected_smallest_types[idx], "Suffix Type at position {} is S type was expected to resolve to {}, resolved to {}", idx, expected_smallest_types[idx], suffix_types.is_ltype(idx));
360 assert_eq!(suffix_types.is_leftmost_stype(idx), expected_lms_type[idx], "Suffix Type at position {} is LMS type was expected to resolve to {}, resolved to {}", idx, expected_lms_type[idx], suffix_types.is_leftmost_stype(idx));
361 }
362 }
363
364 #[test]
365 fn test_w_slice_indices() {
366 let text = "mmiissiissiippii$".as_bytes();
367 let suffix_types = SuffixTypes::new(text);
368 let w_slice_indices = find_lms_substring_indices(&suffix_types);
369
370 assert_eq!(w_slice_indices, vec![(2, 7), (6, 11), (10, 17), (16, 17)]);
371 let w_slices: Vec<&[u8]> =
372 w_slice_indices.iter().map(|&(start, end)| &text[start..end]).collect();
373
374 assert_eq!(
375 w_slices,
376 vec!["iissi".as_bytes(), "iissi".as_bytes(), "iippii$".as_bytes(), "$".as_bytes()]
377 );
378 }
379
380 #[test]
381 fn test_count_text_bin_sizes() {
382 let m = "m".as_bytes()[0];
383 let i = "i".as_bytes()[0];
384 let s = "s".as_bytes()[0];
385 let p = "p".as_bytes()[0];
386 let terminator = "$".as_bytes()[0];
387
388 let text = "mmiissiissiippii$".as_bytes();
389 let suffix_types = SuffixTypes::new(text);
390 let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
391
392 assert_eq!(
393 text_bin_sizes,
394 HashMap::from([
395 (SortableSuffixType::LType(m), 2),
396 (SortableSuffixType::LType(i), 2),
397 (SortableSuffixType::SType(i), 6),
398 (SortableSuffixType::LType(s), 4),
399 (SortableSuffixType::LType(p), 2),
400 (SortableSuffixType::SType(terminator), 1),
401 ])
402 );
403 }
404
405 #[test]
406 fn test_get_bin_offsets() {
407 let m = "m".as_bytes()[0];
408 let i = "i".as_bytes()[0];
409 let s = "s".as_bytes()[0];
410 let p = "p".as_bytes()[0];
411 let terminator = "$".as_bytes()[0];
412
413 let text = "mmiissiissiippii$".as_bytes();
414 let suffix_types = SuffixTypes::new(text);
415 let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
416 let forward_bin_offsets = get_bin_offsets(&text_bin_sizes, true);
417 let reverse_bin_offsets = get_bin_offsets(&text_bin_sizes, false);
418
419 assert_eq!(
420 forward_bin_offsets,
421 HashMap::from([
422 (SortableSuffixType::SType(terminator), 0),
423 (SortableSuffixType::LType(i), 1),
424 (SortableSuffixType::SType(i), 3),
425 (SortableSuffixType::LType(m), 9),
426 (SortableSuffixType::LType(p), 11),
427 (SortableSuffixType::LType(s), 13),
428 ])
429 );
430 assert_eq!(
431 reverse_bin_offsets,
432 HashMap::from([
433 (SortableSuffixType::SType(terminator), 0),
434 (SortableSuffixType::LType(i), 2),
435 (SortableSuffixType::SType(i), 8),
436 (SortableSuffixType::LType(m), 10),
437 (SortableSuffixType::LType(p), 12),
438 (SortableSuffixType::LType(s), 16),
439 ])
440 );
441 }
442
443 #[test]
444 fn test_get_reduced_substring() {
445 let text = "mmiissiissiippii$".as_bytes();
446 let suffix_types = SuffixTypes::new(text);
447 let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
448 let w_slice_indices = find_lms_substring_indices(&suffix_types);
449 let (duplicates, reduced_substring) =
450 get_reduced_substring(text, &w_slice_indices, &suffix_types, &text_bin_sizes).unwrap();
451 assert!(duplicates, "Reduced substring of 'mmiissiissiippii$' has duplicates not identified by our approach");
452 assert_eq!(reduced_substring, vec![2, 2, 1, 0]);
453 }
454
455 #[test]
456 fn test_suffix_array_modification_propagation() {
457 let text = "mmiissiissiippii$".as_bytes();
458 let suffix_types = SuffixTypes::new(text);
459 let text_bin_sizes = count_text_bin_sizes(text, &suffix_types);
460 let mut initial_suffix_array =
461 vec![16, -1, -1, -1, -1, -1, 10, 6, 2, -1, -1, -1, -1, -1, -1, -1, -1];
462 suffix_array_modification_propagation(
463 &mut initial_suffix_array,
464 text,
465 &suffix_types,
466 &text_bin_sizes,
467 true,
468 )
469 .unwrap();
470 assert_eq!(
471 initial_suffix_array,
472 vec![16, 15, 14, -1, -1, -1, 10, 6, 2, 1, 0, 13, 12, 9, 5, 8, 4]
473 );
474 suffix_array_modification_propagation(
475 &mut initial_suffix_array,
476 text,
477 &suffix_types,
478 &text_bin_sizes,
479 false,
480 )
481 .unwrap();
482 assert_eq!(
483 initial_suffix_array,
484 vec![16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4]
485 );
486 }
487
488 #[test]
489 fn full_test() {
490 let paper_example = "mmiissiissiippii$".as_bytes();
491 assert_eq!(
492 sais(paper_example).unwrap(),
493 [16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4].to_vec()
494 );
495 }
496
497 #[test]
498 fn wholly_repetitive_completes() {
499 let homopolymer = [1_u32, 1, 1, 1, 1, 1, 1, 0];
500 assert_eq!(sais(&homopolymer).unwrap(), [7_usize, 6, 5, 4, 3, 2, 1, 0].to_vec());
501 }
502}