epimetheus_methylome/
lib.rs

1// use regex::Regex;
2
3pub mod iupac;
4pub mod modtype;
5pub mod motif;
6pub mod read;
7pub mod sequence;
8pub mod strand;
9
10pub use iupac::IupacBase;
11pub use modtype::ModType;
12pub use motif::Motif;
13pub use strand::Strand;
14
15use crate::sequence::Sequence;
16
17pub fn find_motif_indices_in_sequence(sequence: &Sequence, motif: &Motif) -> Vec<usize> {
18    // let regex_str = motif.to_regex();
19    // let re = Regex::new(&regex_str).expect("Expected regex pattern");
20
21    // let indices = re
22    //     .find_iter(sequence)
23    //     .map(|m| m.start() as usize + motif.mod_position as usize)
24    //     .collect();
25
26    let motif_bases = motif.sequence.clone();
27    let motif_len = motif_bases.len();
28    let mut indices = Vec::new();
29
30    if sequence.len() < motif_len {
31        return indices;
32    }
33
34    for i in 0..=(sequence.len() - motif_len) {
35        let mut matches = true;
36
37        for (j, &motif_base) in motif_bases.iter().enumerate() {
38            let seq_base = sequence[i + j];
39            if (seq_base.mask() & motif_base.mask()) == 0 {
40                matches = false;
41                break;
42            }
43        }
44
45        if matches {
46            indices.push(i + motif.mod_position as usize);
47        }
48    }
49
50    indices
51}
52
53#[cfg(test)]
54mod tests {
55    use super::*;
56
57    use crate::read::{MethBase, MethQual, Read};
58    use noodles_fastq::{self as fastq, record::Definition};
59
60    #[test]
61    fn test_find_motif_indices_in_contig() {
62        let contig = Sequence::from_str("GGATCTCCATGATC").unwrap();
63        let contig2 = Sequence::from_str("TGGACGATCCCGATC").unwrap();
64        let motif1 = Motif::new("GATC", "m", 3).unwrap();
65        let motif2 = Motif::new("RGATCY", "m", 4).unwrap();
66        let motif3 = Motif::new("GATC", "a", 1).unwrap();
67        let motif4 = Motif::new("GGANNNTCC", "a", 2).unwrap();
68
69        println!("{}", &motif4.to_regex());
70        assert_eq!(
71            find_motif_indices_in_sequence(&contig, &motif1),
72            vec![4, 13]
73        );
74        assert_eq!(find_motif_indices_in_sequence(&contig, &motif2), vec![4]);
75
76        assert_eq!(
77            find_motif_indices_in_sequence(&contig2, &motif3),
78            vec![6, 12]
79        );
80        assert_eq!(
81            find_motif_indices_in_sequence(&contig2, &motif3.reverse_complement()),
82            vec![7, 13]
83        );
84
85        assert_eq!(find_motif_indices_in_sequence(&contig2, &motif4), vec![3])
86    }
87
88    #[test]
89    fn test_find_motif_indices_in_read() {
90        let description = "MM:Z:A+a.,0,0,0,0,0,2,0,9,0,0,0,0,1,0,0,0,0,2,0,0,0,0,16,0,0,0,4,0,0,0,1,11,1,0,1,0,0,0,0,0,4,0,0,0,2,0,10,6,5,11,0,11,1,6,0,0,0,0,0,2,3,12,0,4,16,0,0,1,0,1,4,0,0,0,0,0;C+21839.,6,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,2,11,0,0,0,0,6,0,5,4,2,9,0,1,3,0,0,0,5,2,1,11,1,0,3,0,0;C+m.,6,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,2,11,0,0,0,0,6,0,5,4,2,9,0,1,3,0,0,0,5,2,1,11,1,0,3,0,0; ML:B:C,204,119,22,36,26,40,16,20,15,25,97,104,150,20,112,20,16,34,81,66,52,12,30,67,20,155,15,21,28,20,85,22,13,14,13,19,13,17,24,12,12,14,30,13,20,20,147,16,17,22,36,41,37,163,29,14,71,28,58,12,12,14,14,12,12,15,64,25,137,42,19,34,29,23,231,46,6,16,17,30,9,41,40,25,27,26,14,179,86,24,8,23,42,15,48,12,16,13,15,14,10,16,162,21,9,3,16,14,8,31,3,2,7,4,6,21,3,15,12,19,20,12,83,45,12,18,10,26,17,33,68,70,49,53,23,13,23,21,48,40,83,5,5,5,5,2,11,13,29,60,7,24,12,16,2,3,14,14,44,12,20,13,13,9,14,6,10,6,7,4,2,34";
91        let sequence = "ACTATAAATCATTTATTTTATATTTAATGTAAACATTTCTTCACCTTCTAAGGTGCCACAAAGATAATCATTAGCATCTACCCGTCCTACACCTGCTGGTGTACCTGTAAATATGACATCTCCTTTTTTTAACATAAAGTATTGAGAAACATACGATATAAGCTCGTCTATTTTCCACAACATTAAATTTGTATTTCCCTTTTGTACTATTTCTTCATTTTTTAACAATGAAAAATTTATATTATCTACAGAAGAAAATTTACTTTTTGGCAACCATTTACCTATTACTGCCGCACCATCAAAACCTTTTGCTTTTTCCCAAGGTAATCCTTTTTCTTTTAACTTAGATTGAAGATCACGTGCTGTAAAGTCTATTCCTAGACCAATTTCATCATAATAATTGGCAGCAAATTTTTGTTCTATATGTTTTCCTACTTTTTTAATTTTAACTAAAACCTCTACTTCATAATGTATGTTATTAGAGAATTCTGGTATGTAAAAATCTTGTTCTTTTGGTAGCACAGCAGAATCTGGCTTAATAAAAACAACAGGATCTGTAGGCTTTTCATTGGCCAATTCTTTAATATGATCTGTGTAATTACGACCTATACAAATTATCTTCATAATAAACTTTATTAGGTTATTGCTTTTCTTTAGACAATTTTAAACTTACCGCTGGTACTAAATGCTTGGTATTATCTTTAGACTTTACCTTAAATCTATCAT";
92        let quals = "JIGSSGIH=GLLMLJFB;=>@?HSSNKJSSMKIKDCD?>??<>?JMSJJGGFSSLLJSSSOIJLSONMJSSRSSJSSSSLSLIIHFGSQSLSSSLSSRSSSSKSQMIHSSISSSSLIQSLLK88:1111EDMQIIOROPMS::22336C@CICCBCDSSNJJEG?;;:<;<>@;8B44HSSSSSPPISHHILGKJQAMLRPJJ:SLMSMQS99999KHKISSSSMJIKSIMSSSSSSSSSIKGJJMSHM50))))GSMSLSS>;;<PSSSSJMPMSSSSNLSSSKIKNSSNKKKONMJSSQLSSB@PKHGJFBSSSJSSQDGSSSQKKSSSSSSSSMMNJSSSSSSSSSSORISSSSSSSSSSSOSSSSSSMSKSSNSRQSSSSSSSSNISSSSSNSSSSMJKNSSQMMPSSSSSSSSSPSNSSSOJJMSSSSJJMKSSSSSSSSSSSSSSSSSSSSSQSSSSSNSSSSSOSNMSHSGFIDMIIFFIG@@ILSMINSLKSSSSLGSBSNSRPSSSSSPPSFCA@@CLSSSSSKNSSHEEEHHSSSSRNLLOSSSSSSSPSSSNSSSSSSRLOLILSSSSSSSQPSSSHJMKJSSSSRSSSIINSLNSRSSSMLOCCHILKJKJKLOSSSSSSSSSSSPSSSSPSSMIAGSA>EEE;AAABBI@>>==@SLLKIKEIHLLLOJHHKMSHISSHQSKQNSPSJFD>@>>>950///.-./(,*((&&&";
93        let fastq = fastq::Record::new(
94            Definition::new("@8c32d39d-6b88-480f-ac6d-e8061e4d674b", description),
95            sequence,
96            quals,
97        );
98
99        let read = Read::from_fastq_record(fastq).unwrap();
100        let motif = Motif::new("ACTATA", "a", 0).unwrap();
101
102        let indices = find_motif_indices_in_sequence(read.get_sequence(), &motif);
103        assert_eq!(indices, vec![0]);
104    }
105    #[test]
106    fn test_read_construction_simple() {
107        let description = "MM:Z:A+a.,0,1; ML:B:C,204,255";
108        let sequence = "GGGCGGATCAGATC";
109        let quals = "JIGSSGIH=";
110        let fastq = fastq::Record::new(
111            Definition::new("8c32d39d-6b88-480f-ac6d-e8061e4d674b", description),
112            sequence,
113            quals,
114        );
115
116        let read = Read::from_fastq_record(fastq).unwrap();
117        let modifcations = read.get_modifications();
118        let motif = Motif::new("GATC", "a", 1).unwrap();
119
120        assert_eq!(
121            *modifcations.0.get(&6).unwrap(),
122            MethBase {
123                base: ModType::SixMA,
124                quality: MethQual(204)
125            }
126        );
127        assert_eq!(modifcations.0.get(&9), None);
128        assert_eq!(
129            *modifcations.0.get(&11).unwrap(),
130            MethBase {
131                base: ModType::SixMA,
132                quality: MethQual(255)
133            }
134        );
135        let indices = find_motif_indices_in_sequence(read.get_sequence(), &motif);
136        assert_eq!(indices, vec![6, 11]);
137    }
138}