1#![warn(missing_docs)]
2
3mod file_access;
25pub mod suffix_array;
26pub mod sufr_builder;
27pub mod sufr_file;
28mod sufr_search;
29pub mod types;
30pub mod util;
31
32#[cfg(test)]
34mod tests {
35 use super::{
36 sufr_builder::SufrBuilder,
37 sufr_file::SufrFile,
38 types::{SufrBuilderArgs, OUTFILE_VERSION},
39 util::read_sequence_file,
40 };
41 use anyhow::Result;
42 use std::path::Path;
43 use tempfile::NamedTempFile;
44
45 #[test]
46 fn test_write_read_suffix_file_32() -> Result<()> {
47 let seq_file = Path::new("../data/inputs/2.fa");
48 let sequence_delimiter = b'N';
49 let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
50 let outfile = NamedTempFile::new()?;
51 let outpath = outfile.path().to_string_lossy().to_string();
52 let args = SufrBuilderArgs {
53 text: seq_data.seq,
54 low_memory: true,
55 path: Some(outpath.clone()),
56 max_query_len: None,
57 is_dna: true,
58 allow_ambiguity: false,
59 ignore_softmask: false,
60 sequence_starts: seq_data.start_positions.into_iter().collect(),
61 sequence_names: seq_data.sequence_names,
62 num_partitions: 2,
63 seed_mask: None,
64 random_seed: 0,
65 };
66 let res = SufrBuilder::<u32>::new(args);
67 assert!(res.is_ok());
68
69 let builder = res.unwrap();
70 assert!(Path::new(&builder.path).exists());
71
72 let res: Result<SufrFile<u32>> = SufrFile::read(&outpath, false);
73 assert!(res.is_ok());
74
75 let mut sufr_file = res.unwrap();
76 assert_eq!(sufr_file.version, OUTFILE_VERSION);
77 assert!(sufr_file.is_dna);
78 assert_eq!(sufr_file.text_len, 18);
79 assert_eq!(sufr_file.num_sequences, 2);
80 assert_eq!(sufr_file.sequence_starts, [0, 9]);
81 assert_eq!(sufr_file.sequence_names, ["ABC", "DEF"]);
82 assert_eq!(sufr_file.text, b"ACGTACGTNACGTACGT$");
83
84 let file_sa: Vec<_> = sufr_file.suffix_array_file.iter().collect();
85 let sorted_sa = [17, 13, 9, 0, 4, 14, 10, 1, 5, 15, 11, 2, 6, 16, 12, 3, 7];
86 assert_eq!(file_sa, sorted_sa);
87
88 let file_lcp: Vec<_> = sufr_file.lcp_file.iter().collect();
89 let lcp = [0, 0, 4, 8, 4, 0, 3, 7, 3, 0, 2, 6, 2, 0, 1, 5, 1];
90 assert_eq!(file_lcp, lcp);
91 Ok(())
92 }
93
94 #[test]
95 fn test_write_read_suffix_file_64() -> Result<()> {
96 let seq_file = Path::new("../data/inputs/1.fa");
97 let sequence_delimiter = b'N';
98 let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
99 let outfile = NamedTempFile::new()?;
100 let outpath = outfile.path().to_string_lossy().to_string();
101 let args = SufrBuilderArgs {
102 text: seq_data.seq,
103 low_memory: true,
104 path: Some(outpath.clone()),
105 max_query_len: None,
106 is_dna: true,
107 allow_ambiguity: true,
108 ignore_softmask: false,
109 sequence_starts: seq_data.start_positions.into_iter().collect(),
110 sequence_names: seq_data.sequence_names,
111 num_partitions: 2,
112 seed_mask: None,
113 random_seed: 0,
114 };
115
116 let res = SufrBuilder::<u64>::new(args);
117 assert!(res.is_ok());
118
119 let builder = res.unwrap();
120 assert!(Path::new(&builder.path).exists());
121
122 let res: Result<SufrFile<u64>> = SufrFile::read(&outpath, false);
123 assert!(res.is_ok());
124
125 let mut sufr_file = res.unwrap();
126 assert_eq!(sufr_file.version, OUTFILE_VERSION);
127 assert!(sufr_file.is_dna);
128 assert_eq!(sufr_file.text_len, 11);
129 assert_eq!(sufr_file.num_sequences, 1);
130 assert_eq!(sufr_file.sequence_starts, [0]);
131 assert_eq!(sufr_file.sequence_names, ["1"]);
132 assert_eq!(sufr_file.text, b"ACGTNNACGT$");
133 assert_eq!(sufr_file.len_suffixes, 11);
134
135 let file_sa: Vec<_> = sufr_file.suffix_array_file.iter().collect();
136 assert_eq!(file_sa, &[10, 6, 0, 7, 1, 8, 2, 5, 4, 9, 3]);
137 let file_lcp: Vec<_> = sufr_file.lcp_file.iter().collect();
138 assert_eq!(file_lcp, &[0, 0, 4, 0, 3, 0, 2, 0, 1, 0, 1]);
139 Ok(())
140 }
141
142 #[test]
143 fn test_subsample_suffix_array() -> Result<()> {
144 let seq_file = Path::new("../data/inputs/smol.fa");
145 let sequence_delimiter = b'N';
146 let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
147 let outfile = NamedTempFile::new()?;
148 let outpath = outfile.path().to_string_lossy().to_string();
149 let builder_args = SufrBuilderArgs {
150 text: seq_data.seq,
151 low_memory: true,
152 path: Some(outpath.clone()),
153 max_query_len: None,
154 is_dna: true,
155 allow_ambiguity: false,
156 ignore_softmask: false,
157 sequence_starts: seq_data.start_positions,
158 sequence_names: seq_data.sequence_names,
159 num_partitions: 2,
160 seed_mask: None,
161 random_seed: 0,
162 };
163 let res = SufrBuilder::<u32>::new(builder_args);
164 assert!(res.is_ok());
165
166 let builder = res.unwrap();
167 assert_eq!(builder.num_suffixes, 364);
168
169 let mut sufr_file: SufrFile<u32> = SufrFile::read(&outpath, false)?;
170 let full_sa: Vec<_> = sufr_file.suffix_array_file.iter().collect();
171 let full_lcp: Vec<_> = sufr_file.lcp_file.iter().collect();
172 assert_eq!(full_sa.len(), 364);
173 assert_eq!(full_lcp.len(), 364);
174
175 let max_query_len = 1;
176 let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
177 assert_eq!(sub_sa.len(), 5);
178 assert_eq!(sub_rank.len(), 5);
179 assert_eq!(sub_sa, vec![365, 364, 92, 224, 363]);
181 assert_eq!(sub_rank, vec![0, 1, 94, 191, 284]);
182
183 let max_query_len = 2;
184 let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
185 assert_eq!(sub_sa.len(), 20);
186 assert_eq!(sub_rank.len(), 20);
187 assert_eq!(
189 sub_sa,
190 vec![
191 365, 364, 358, 91, 341, 255, 362, 92, 339, 233, 296, 224, 88, 129, 110,
192 96, 363, 217, 223, 356
193 ]
194 );
195 assert_eq!(
196 sub_rank,
197 vec![
198 0, 1, 2, 38, 49, 70, 71, 94, 112, 143, 170, 191, 216, 252, 269, 270,
199 284, 298, 315, 343
200 ]
201 );
202
203 let max_query_len = 3;
204 let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
205 assert_eq!(sub_sa.len(), 71);
206 assert_eq!(sub_rank.len(), 71);
207
208 let max_query_len = 5;
209 let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
210 assert_eq!(sub_sa.len(), 293);
211 assert_eq!(sub_rank.len(), 293);
212
213 let max_query_len = 3;
214 let (sub_sa, sub_rank) = sufr_file.subsample_suffix_array(max_query_len);
215 assert_eq!(sub_sa.len(), 71);
216 assert_eq!(sub_rank.len(), 71);
217
218 Ok(())
219 }
220
221 #[test]
222 fn test_spaced_seeds_1() -> Result<()> {
223 let seq_file = Path::new("../data/inputs/mostlya1.fa");
224 let sequence_delimiter = b'N';
225 let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
226 let outfile = NamedTempFile::new()?;
227 let outpath = outfile.path().to_string_lossy().to_string();
228 let builder_args = SufrBuilderArgs {
229 text: seq_data.seq,
230 low_memory: true,
231 path: Some(outpath.clone()),
232 max_query_len: None,
233 is_dna: true,
234 allow_ambiguity: false,
235 ignore_softmask: false,
236 sequence_starts: seq_data.start_positions,
237 sequence_names: seq_data.sequence_names,
238 num_partitions: 1,
239 seed_mask: Some("101".to_string()),
240 random_seed: 0,
241 };
242
243 let res = SufrBuilder::<u32>::new(builder_args);
253 assert!(res.is_ok());
254
255 let builder = res.unwrap();
256 assert_eq!(builder.num_suffixes, 8);
257
258 let mut sufr_file: SufrFile<u32> = SufrFile::read(&outpath, false)?;
259 let suffix_array: Vec<_> = sufr_file.suffix_array_file.iter().collect();
260 assert_eq!(suffix_array.len(), 8);
261 assert_eq!(suffix_array, vec![7, 6, 5, 4, 2, 0, 1, 3]);
262
263 Ok(())
264 }
265
266 #[test]
267 fn test_spaced_seeds_2() -> Result<()> {
268 let seq_file = Path::new("../data/inputs/mostlya2.fa");
269 let sequence_delimiter = b'N';
270 let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
271 let outfile = NamedTempFile::new()?;
272 let outpath = outfile.path().to_string_lossy().to_string();
273 let builder_args = SufrBuilderArgs {
274 text: seq_data.seq,
275 low_memory: true,
276 path: Some(outpath.clone()),
277 max_query_len: None,
278 is_dna: true,
279 allow_ambiguity: false,
280 ignore_softmask: false,
281 sequence_starts: seq_data.start_positions,
282 sequence_names: seq_data.sequence_names,
283 num_partitions: 1,
284 seed_mask: Some("11011".to_string()),
285 random_seed: 0,
286 };
287
288 let res = SufrBuilder::<u32>::new(builder_args);
307 assert!(res.is_ok());
308
309 let builder = res.unwrap();
310 assert_eq!(builder.num_suffixes, 17);
311
312 let mut sufr_file = SufrFile::<u32>::read(&outpath, false)?;
313 let suffix_array: Vec<_> = sufr_file.suffix_array_file.iter().collect();
314 assert_eq!(suffix_array.len(), 17);
315 assert_eq!(
316 suffix_array,
317 vec![16, 13, 9, 5, 1, 12, 8, 4, 0, 14, 10, 6, 2, 15, 11, 7, 3],
318 );
319
320 Ok(())
321 }
322
323 #[test]
325 fn test_spaced_seeds_3() -> Result<()> {
326 let seq_file = Path::new("../data/inputs/spaced_input.fa");
327 let sequence_delimiter = b'N';
328 let seq_data = read_sequence_file(seq_file, sequence_delimiter)?;
329 let outfile = NamedTempFile::new()?;
330 let outpath = outfile.path().to_string_lossy().to_string();
331 let builder_args = SufrBuilderArgs {
332 text: seq_data.seq,
333 low_memory: false,
334 path: Some(outpath.clone()),
335 max_query_len: None,
336 is_dna: true,
337 allow_ambiguity: false,
338 ignore_softmask: false,
339 sequence_starts: seq_data.start_positions,
340 sequence_names: seq_data.sequence_names,
341 num_partitions: 1,
342 seed_mask: Some("11000111".to_string()),
343 random_seed: 0,
344 };
345
346 let res = SufrBuilder::<u32>::new(builder_args);
347 assert!(res.is_ok());
348
349 let builder = res.unwrap();
350 assert_eq!(builder.num_suffixes, 43);
351
352 let mut sufr_file: SufrFile<u32> = SufrFile::read(&outpath, false)?;
353 let suffix_array: Vec<_> = sufr_file.suffix_array_file.iter().collect();
354 assert_eq!(suffix_array.len(), 43);
355 assert_eq!(
356 suffix_array,
357 vec![
358 42, 18, 12, 0, 32, 29, 13, 23, 21, 6, 40, 1, 33, 19, 30, 10, 28, 9, 17,
359 14, 4, 26, 39, 22, 25, 38, 24, 35, 7, 36, 15, 41, 5, 20, 31, 11, 27, 8,
360 16, 3, 37, 34, 2
361 ],
362 );
363
364 Ok(())
365 }
366}