libsufr/
lib.rs

1#![warn(missing_docs)]
2
3//! # Parallel Construction of Suffix Arrays in Rust
4//!
5//! Inspired by the merge sort approach described in "Cache-friendly,
6//! Parallel, and Samplesort-based Constructor for Suffix Arrays and LCP
7//! Arrays." [^patro]
8//!
9//! [^patro]: <https://doi.org/10.4230/LIPIcs.WABI.2023.16>
10//!
11//! * Most people should use [suffix_array] to create and interact with suffix arrays
12//!
13//! If you want lower-level access to Sufr's internals:
14//!
15//! * Use [sufr_builder] to create a new suffix array/_.sufr_ file
16//! * Use [sufr_file] to interact and query an existing suffix array/_.sufr_ file
17//!
18//!
19//! ## Authors
20//! * Ken Youens-Clark <kyclark@gmail.com>
21//! * Jack Roddy <jroddy@arizona.edu>
22//! * Travis Wheeler <twheeler@arizona.edu>
23
24mod file_access;
25pub mod suffix_array;
26pub mod sufr_builder;
27pub mod sufr_file;
28mod sufr_search;
29pub mod types;
30pub mod util;
31
32// --------------------------------------------------
33#[cfg(test)]
34mod tests {
35    use super::{
36        sufr_builder::SufrBuilder,
37        sufr_file::SufrFile,
38        types::{SufrBuilderArgs, OUTFILE_VERSION},
39        util::read_sequence_file,
40    };
41    use anyhow::Result;
42    use std::path::Path;
43    use tempfile::NamedTempFile;
44
45    #[test]
46    fn test_write_read_suffix_file_32() -> Result<()> {
47        let seq_file = Path::new("../data/inputs/2.fa");
48        let sequence_delimiter = b'N';
49        let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
50        let outfile = NamedTempFile::new()?;
51        let outpath = outfile.path().to_string_lossy().to_string();
52        let args = SufrBuilderArgs {
53            text: seq_data.seq,
54            low_memory: true,
55            path: Some(outpath.clone()),
56            max_query_len: None,
57            is_dna: true,
58            allow_ambiguity: false,
59            ignore_softmask: false,
60            sequence_starts: seq_data.start_positions.into_iter().collect(),
61            sequence_names: seq_data.sequence_names,
62            num_partitions: 2,
63            seed_mask: None,
64            random_seed: 0,
65        };
66        let res = SufrBuilder::<u32>::new(args);
67        assert!(res.is_ok());
68
69        let builder = res.unwrap();
70        assert!(Path::new(&builder.path).exists());
71
72        let res: Result<SufrFile<u32>> = SufrFile::read(&outpath, false);
73        assert!(res.is_ok());
74
75        let mut sufr_file = res.unwrap();
76        assert_eq!(sufr_file.version, OUTFILE_VERSION);
77        assert!(sufr_file.is_dna);
78        assert_eq!(sufr_file.text_len, 18);
79        assert_eq!(sufr_file.num_sequences, 2);
80        assert_eq!(sufr_file.sequence_starts, [0, 9]);
81        assert_eq!(sufr_file.sequence_names, ["ABC", "DEF"]);
82        assert_eq!(sufr_file.text, b"ACGTACGTNACGTACGT$");
83
84        let file_sa: Vec<_> = sufr_file.suffix_array_file.iter().collect();
85        let sorted_sa = [17, 13, 9, 0, 4, 14, 10, 1, 5, 15, 11, 2, 6, 16, 12, 3, 7];
86        assert_eq!(file_sa, sorted_sa);
87
88        let file_lcp: Vec<_> = sufr_file.lcp_file.iter().collect();
89        let lcp = [0, 0, 4, 8, 4, 0, 3, 7, 3, 0, 2, 6, 2, 0, 1, 5, 1];
90        assert_eq!(file_lcp, lcp);
91        Ok(())
92    }
93
94    #[test]
95    fn test_write_read_suffix_file_64() -> Result<()> {
96        let seq_file = Path::new("../data/inputs/1.fa");
97        let sequence_delimiter = b'N';
98        let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
99        let outfile = NamedTempFile::new()?;
100        let outpath = outfile.path().to_string_lossy().to_string();
101        let args = SufrBuilderArgs {
102            text: seq_data.seq,
103            low_memory: true,
104            path: Some(outpath.clone()),
105            max_query_len: None,
106            is_dna: true,
107            allow_ambiguity: true,
108            ignore_softmask: false,
109            sequence_starts: seq_data.start_positions.into_iter().collect(),
110            sequence_names: seq_data.sequence_names,
111            num_partitions: 2,
112            seed_mask: None,
113            random_seed: 0,
114        };
115
116        let res = SufrBuilder::<u64>::new(args);
117        assert!(res.is_ok());
118
119        let builder = res.unwrap();
120        assert!(Path::new(&builder.path).exists());
121
122        let res: Result<SufrFile<u64>> = SufrFile::read(&outpath, false);
123        assert!(res.is_ok());
124
125        let mut sufr_file = res.unwrap();
126        assert_eq!(sufr_file.version, OUTFILE_VERSION);
127        assert!(sufr_file.is_dna);
128        assert_eq!(sufr_file.text_len, 11);
129        assert_eq!(sufr_file.num_sequences, 1);
130        assert_eq!(sufr_file.sequence_starts, [0]);
131        assert_eq!(sufr_file.sequence_names, ["1"]);
132        assert_eq!(sufr_file.text, b"ACGTNNACGT$");
133        assert_eq!(sufr_file.len_suffixes, 11);
134
135        let file_sa: Vec<_> = sufr_file.suffix_array_file.iter().collect();
136        assert_eq!(file_sa, &[10, 6, 0, 7, 1, 8, 2, 5, 4, 9, 3]);
137        let file_lcp: Vec<_> = sufr_file.lcp_file.iter().collect();
138        assert_eq!(file_lcp, &[0, 0, 4, 0, 3, 0, 2, 0, 1, 0, 1]);
139        Ok(())
140    }
141
142    #[test]
143    fn test_subsample_suffix_array() -> Result<()> {
144        let seq_file = Path::new("../data/inputs/smol.fa");
145        let sequence_delimiter = b'N';
146        let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
147        let outfile = NamedTempFile::new()?;
148        let outpath = outfile.path().to_string_lossy().to_string();
149        let builder_args = SufrBuilderArgs {
150            text: seq_data.seq,
151            low_memory: true,
152            path: Some(outpath.clone()),
153            max_query_len: None,
154            is_dna: true,
155            allow_ambiguity: false,
156            ignore_softmask: false,
157            sequence_starts: seq_data.start_positions,
158            sequence_names: seq_data.sequence_names,
159            num_partitions: 2,
160            seed_mask: None,
161            random_seed: 0,
162        };
163        let res = SufrBuilder::<u32>::new(builder_args);
164        assert!(res.is_ok());
165
166        let builder = res.unwrap();
167        assert_eq!(builder.num_suffixes, 364);
168
169        let mut sufr_file: SufrFile<u32> = SufrFile::read(&outpath, false)?;
170        let full_sa: Vec<_> = sufr_file.suffix_array_file.iter().collect();
171        let full_lcp: Vec<_> = sufr_file.lcp_file.iter().collect();
172        assert_eq!(full_sa.len(), 364);
173        assert_eq!(full_lcp.len(), 364);
174
175        let max_query_len = 1;
176        let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
177        assert_eq!(sub_sa.len(), 5);
178        assert_eq!(sub_rank.len(), 5);
179        // $, A, C, G, T
180        assert_eq!(sub_sa, vec![365, 364, 92, 224, 363]);
181        assert_eq!(sub_rank, vec![0, 1, 94, 191, 284]);
182
183        let max_query_len = 2;
184        let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
185        assert_eq!(sub_sa.len(), 20);
186        assert_eq!(sub_rank.len(), 20);
187        // $, A$, AA, AC, AG, AN, AT, CA, CC, CG, CT, GA, GC, GG, GN, GT, TA, TC, TG, TT
188        assert_eq!(
189            sub_sa,
190            vec![
191                365, 364, 358, 91, 341, 255, 362, 92, 339, 233, 296, 224, 88, 129, 110,
192                96, 363, 217, 223, 356
193            ]
194        );
195        assert_eq!(
196            sub_rank,
197            vec![
198                0, 1, 2, 38, 49, 70, 71, 94, 112, 143, 170, 191, 216, 252, 269, 270,
199                284, 298, 315, 343
200            ]
201        );
202
203        let max_query_len = 3;
204        let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
205        assert_eq!(sub_sa.len(), 71);
206        assert_eq!(sub_rank.len(), 71);
207
208        let max_query_len = 5;
209        let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
210        assert_eq!(sub_sa.len(), 293);
211        assert_eq!(sub_rank.len(), 293);
212
213        let max_query_len = 3;
214        let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
215        assert_eq!(sub_sa.len(), 71);
216        assert_eq!(sub_rank.len(), 71);
217
218        Ok(())
219    }
220
221    #[test]
222    fn test_spaced_seeds_1() -> Result<()> {
223        let seq_file = Path::new("../data/inputs/mostlya1.fa");
224        let sequence_delimiter = b'N';
225        let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
226        let outfile = NamedTempFile::new()?;
227        let outpath = outfile.path().to_string_lossy().to_string();
228        let builder_args = SufrBuilderArgs {
229            text: seq_data.seq,
230            low_memory: true,
231            path: Some(outpath.clone()),
232            max_query_len: None,
233            is_dna: true,
234            allow_ambiguity: false,
235            ignore_softmask: false,
236            sequence_starts: seq_data.start_positions,
237            sequence_names: seq_data.sequence_names,
238            num_partitions: 1,
239            seed_mask: Some("101".to_string()),
240            random_seed: 0,
241        };
242
243        // 7 $
244        // 6 A$
245        // 5 AA$
246        // 4 AAA$
247        // 2 ACAAA$
248        // 0 AAACAAA$
249        // 1 AACAAA$
250        // 3 CAAA$
251
252        let res = SufrBuilder::<u32>::new(builder_args);
253        assert!(res.is_ok());
254
255        let builder = res.unwrap();
256        assert_eq!(builder.num_suffixes, 8);
257
258        let mut sufr_file: SufrFile<u32> = SufrFile::read(&outpath, false)?;
259        let suffix_array: Vec<_> = sufr_file.suffix_array_file.iter().collect();
260        assert_eq!(suffix_array.len(), 8);
261        assert_eq!(suffix_array, vec![7, 6, 5, 4, 2, 0, 1, 3]);
262
263        Ok(())
264    }
265
266    #[test]
267    fn test_spaced_seeds_2() -> Result<()> {
268        let seq_file = Path::new("../data/inputs/mostlya2.fa");
269        let sequence_delimiter = b'N';
270        let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
271        let outfile = NamedTempFile::new()?;
272        let outpath = outfile.path().to_string_lossy().to_string();
273        let builder_args = SufrBuilderArgs {
274            text: seq_data.seq,
275            low_memory: true,
276            path: Some(outpath.clone()),
277            max_query_len: None,
278            is_dna: true,
279            allow_ambiguity: false,
280            ignore_softmask: false,
281            sequence_starts: seq_data.start_positions,
282            sequence_names: seq_data.sequence_names,
283            num_partitions: 1,
284            seed_mask: Some("11011".to_string()),
285            random_seed: 0,
286        };
287
288        //  0 16 $
289        //  1 13 AAC$
290        //  2  5 AACAAACAAAC$
291        //  3  1 AACAAACAAACAAAC$
292        //  4  9 AACAAAC$
293        //  5 12 AAAC$
294        //  6  8 AAACAAAC$
295        //  7  4 AAACAAACAAAC$
296        //  8  0 AAACAAACAAACAAAC$
297        //  9 14 AC$
298        // 10 10 ACAAAC$
299        // 11  6 ACAAACAAAC$
300        // 12  2 ACAAACAAACAAAC$
301        // 13 15 C$
302        // 14 11 CAAAC$
303        // 15  7 CAAACAAAC$
304        // 16  3 CAAACAAACAAAC$
305
306        let res = SufrBuilder::<u32>::new(builder_args);
307        assert!(res.is_ok());
308
309        let builder = res.unwrap();
310        assert_eq!(builder.num_suffixes, 17);
311
312        let mut sufr_file = SufrFile::<u32>::read(&outpath, false)?;
313        let suffix_array: Vec<_> = sufr_file.suffix_array_file.iter().collect();
314        assert_eq!(suffix_array.len(), 17);
315        assert_eq!(
316            suffix_array,
317            vec![16, 13, 9, 5, 1, 12, 8, 4, 0, 14, 10, 6, 2, 15, 11, 7, 3],
318        );
319
320        Ok(())
321    }
322
323    // --------------------------------------------------
324    #[test]
325    fn test_spaced_seeds_3() -> Result<()> {
326        let seq_file = Path::new("../data/inputs/spaced_input.fa");
327        let sequence_delimiter = b'N';
328        let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
329        let outfile = NamedTempFile::new()?;
330        let outpath = outfile.path().to_string_lossy().to_string();
331        let builder_args = SufrBuilderArgs {
332            text: seq_data.seq,
333            low_memory: false,
334            path: Some(outpath.clone()),
335            max_query_len: None,
336            is_dna: true,
337            allow_ambiguity: false,
338            ignore_softmask: false,
339            sequence_starts: seq_data.start_positions,
340            sequence_names: seq_data.sequence_names,
341            num_partitions: 1,
342            seed_mask: Some("11000111".to_string()),
343            random_seed: 0,
344        };
345
346        let res = SufrBuilder::<u32>::new(builder_args);
347        assert!(res.is_ok());
348
349        let builder = res.unwrap();
350        assert_eq!(builder.num_suffixes, 43);
351
352        let mut sufr_file: SufrFile<u32> = SufrFile::read(&outpath, false)?;
353        let suffix_array: Vec<_> = sufr_file.suffix_array_file.iter().collect();
354        assert_eq!(suffix_array.len(), 43);
355        assert_eq!(
356            suffix_array,
357            vec![
358                42, 18, 12, 0, 32, 29, 13, 23, 21, 6, 40, 1, 33, 19, 30, 10, 28, 9, 17,
359                14, 4, 26, 39, 22, 25, 38, 24, 35, 7, 36, 15, 41, 5, 20, 31, 11, 27, 8,
360                16, 3, 37, 34, 2
361            ],
362        );
363
364        Ok(())
365    }
366}