1use bio::alphabets::dna::revcomp;
2use bio::data_structures::suffix_array::{lcp, shortest_unique_substrings, suffix_array};
3use bio::io::fasta;
4use std::{fs::File, io::BufReader};
5
6static END_CHAR: u8 = b'$';
7static END_CHAR_STR: &str = "$";
8
9pub struct Genome {
10    pub names: Vec<String>,
11    pub starts: Vec<usize>,
12    pub ends: Vec<usize>,
13    pub seq: Vec<u8>,
14    pub length: usize,
15}
16
17impl Genome {
18    pub fn new(records: fasta::Records<BufReader<File>>) -> Genome {
19        let mut genome = Genome {
20            names: Vec::new(),
21            starts: Vec::new(),
22            ends: Vec::new(),
23            seq: Vec::new(),
24            length: 0,
25        };
26        let mut seq = Vec::new();
27        for rec in records {
28            let rec = rec.unwrap();
29            genome.starts.push(seq.len());
31            genome.names.push(rec.id().to_string());
32            seq.append(&mut rec.seq().to_ascii_uppercase());
34            genome.ends.push(seq.len());
36            seq.push(END_CHAR);
38        }
39        genome.length = seq.len();
40        seq.append(&mut revcomp(&seq[..seq.len() - 1]));
41        seq.push(END_CHAR);
42        genome.seq = seq;
43        eprintln!("Done reading in the genome.");
44        eprintln!("Genome length: {}", genome.length - genome.starts.len());
45        eprintln!("Genome structure size: {}", genome.seq.len());
46        genome
47    }
48
49    pub fn from_file(fastafile: &str) -> Genome {
55        let records = fasta::Reader::from_file(fastafile)
56            .expect("Unable to read")
57            .records();
58        Genome::new(records)
59    }
60    pub fn get_shortest_subseq_size(text: &[u8]) -> Vec<Option<usize>> {
73        eprintln!("Making a suffix array (SA) from {} elements.", text.len());
74        let pos = suffix_array(text);
75        eprintln!("Done reading making the SA.");
76        let lcp = lcp(text, &pos);
78        eprintln!("Done reading making the longest common prefix (LCP) structure.");
79        shortest_unique_substrings(&pos, &lcp)
81    }
82
83    pub fn get_longest_perfect_repeats(&self, min_length: usize) -> Vec<(&String, usize, usize)> {
91        let mut vec = Vec::new();
92        for (idx, x) in Genome::get_shortest_subseq_size(&self.seq)
93            .into_iter()
94            .enumerate()
95        {
96            if idx >= self.length {
98                break;
99            }
100            if let Some(val) = x {
101                if val >= min_length {
102                    if let Some((name, pos)) = self.convert_from_idx(idx) {
103                        vec.push((name, pos, val))
104                    }
105                }
106            }
107        }
108        eprintln!("{:?}", vec);
109        vec
110    }
111
112    pub fn find_intervals(&self, sus: Vec<Option<usize>>, kmer_size: usize) -> Vec<(usize, usize)> {
116        let mut vec = Vec::new();
117        let mut i: usize = 0;
118        while i < self.length {
119            let start = i; let mut cur: usize = sus[i].unwrap_or(kmer_size + 1);
121            while cur <= kmer_size && (i + 1) < self.length && self.seq[i] != END_CHAR && self.seq[i + 1] != END_CHAR
126            {
128                i += 1;
129                cur = sus[i].unwrap_or(kmer_size + 1);
130            }
131            let end = i + 1;
133            if start < i && end - start >= kmer_size {
134                vec.push((start, end));
135            }
136            i += 1;
138        }
139        vec
140    }
141
142    pub fn convert_from_idx(&self, idx: usize) -> Option<(&String, usize)> {
152        let mut i = 0;
154        while idx >= self.ends[i] {
155            if idx == self.ends[i] {
156                return None;
157            }
158            i += 1;
159        }
160        Some((&self.names[i], idx - self.starts[i]))
162    }
163    fn convert_from_raw(
165        &self,
166        raw_intervals: Vec<(usize, usize)>,
167    ) -> Vec<(&String, usize, usize, &[u8])> {
168        let mut i = 0;
169        let mut intervals = Vec::new();
170        for (raw_start, raw_end) in raw_intervals {
171            while raw_start > self.ends[i] && raw_end > self.ends[i] {
173                i += 1;
174            }
175
176            intervals.push((
177                &self.names[i],
178                raw_start - self.starts[i],
179                raw_end - self.starts[i],
180                &self.seq[raw_start..raw_end],
181            ));
182        }
183        intervals
184    }
185
186    pub fn find_sun_intervals(&self, kmer_size: usize) -> Vec<(&String, usize, usize, &[u8])> {
187        assert!(kmer_size > 1);
188        let sus = Genome::get_shortest_subseq_size(&self.seq);
189        eprintln!("Done calculating the shortest unique substrings.");
190        let raw_intervals = self.find_intervals(sus, kmer_size);
191        eprintln!("Done calculating the raw SUN intervals from the LCP.");
192        pretty_interval_print(&raw_intervals, &self.seq);
193        self.convert_from_raw(raw_intervals)
194    }
195}
196
197pub fn pretty_interval_print(intervals: &[(usize, usize)], text: &[u8]) {
199    eprintln!("\n{}", "-".repeat(50));
200    for (start, end) in intervals {
201        eprintln!("start:{}, end:{}, {}", start, end, text.len());
202        eprintln!("{}", std::str::from_utf8(text).unwrap());
203        for i in 0..(text.len()) {
204            if i >= *start && i < *end {
205                eprint!(".");
206            } else {
207                eprint!(" ");
208            }
209        }
210        eprintln!();
211    }
212    eprintln!("{}\n", "-".repeat(50));
213}
214
215pub fn validate_suns(
216    genome: &Genome,
217    intervals: &[(&String, usize, usize, &[u8])],
218    kmer_size: usize,
219) {
220    let genome_str = std::str::from_utf8(&genome.seq).unwrap();
221    let mut all_suns = Vec::new();
223    for (chr, start, _end, seq) in intervals {
225        let sun_r = std::str::from_utf8(seq).unwrap();
226        eprint!("\r{}:{}", chr, start);
227        for i in 0..(sun_r.len() - kmer_size + 1) {
229            let sun = &sun_r[i..i + kmer_size];
230            all_suns.push(sun);
231            assert_eq!(1, genome_str.matches(sun).count());
233            assert_eq!(0, sun.matches(END_CHAR_STR).count());
235        }
236    }
237    eprintln!();
238    for i in 0..(genome_str.len() - kmer_size) {
240        let sun = &genome_str[i..i + kmer_size];
242        if sun.matches(END_CHAR_STR).count() > 0 {
244            continue;
245        }
246        if i >= genome.length {
248            break;
249        }
250        let count = genome_str.matches(sun).count();
251        eprintln!("{}\t{}\t{}", i, sun, count);
253        assert!(count > 1 || all_suns.contains(&sun));
254    }
255    eprintln!();
256}
257
258#[cfg(test)]
259mod tests {
260    use super::*;
261    #[test]
262    fn test_sun_finding() {
263        let genome = Genome::from_file(".test/test.fa");
264
265        let kmer_size = 2;
266        let intervals = genome.find_sun_intervals(kmer_size);
267        validate_suns(&genome, &intervals, kmer_size);
268
269        let kmer_size = 3;
270        let intervals = genome.find_sun_intervals(kmer_size);
271        validate_suns(&genome, &intervals, kmer_size);
272
273        let kmer_size = 4;
274        let intervals = genome.find_sun_intervals(kmer_size);
275        validate_suns(&genome, &intervals, kmer_size);
276
277        let kmer_size = 5;
278        let intervals = genome.find_sun_intervals(kmer_size);
279        validate_suns(&genome, &intervals, kmer_size);
280    }
281
282    #[test]
283    fn test_convert() {
284        let genome = Genome::from_file(".test/test.fa");
285
286        let idx_to_test = 21;
287        let rtn = genome.convert_from_idx(idx_to_test).unwrap();
288        assert_eq!((&"chr2".to_string(), 1), rtn);
290
291        let rtn2 = genome.convert_from_idx(10).unwrap();
292        println!("{}", genome.seq[10] as char);
293        assert_eq!((&"chr1".to_string(), 10), rtn2);
294
295        genome.get_longest_perfect_repeats(4);
296    }
297}