lightmotif_io/jaspar16/
mod.rs

//! Parser implementation for matrices in JASPAR (2016) format.
//!
//! The [JASPAR database](https://jaspar.elixir.no/docs/) stores manually
//! curated DNA-binding sites as count matrices.
//!
//! JASPAR files contain a FASTA-like header line for each record,
//! followed by one line per symbol storing tab-separated counts at each
//! position. The 2016 version introduces bracketed matrix columns for
//! each symbol, allowing for non-standard alphabets to be used:
//! ```text
//! >MA0001.3	AGL3
//! A  [     0      0     82     40     56     35     65     25     64      0 ]
//! C  [    92     79      1      4      0      0      1      4      0      0 ]
//! G  [     0      0      2      3      1      0      4      3     28     92 ]
//! T  [     3     16     10     48     38     60     25     63      3      3 ]
//! ```
17
18use std::io::BufRead;
19
20use lightmotif::abc::Alphabet;
21use lightmotif::pwm::CountMatrix;
22
23use crate::error::Error;
24
25mod parse;
26
27// ---
28
29/// A JASPAR (2016) record.
30#[derive(Debug, Clone)]
31pub struct Record<A: Alphabet> {
32    id: String,
33    description: Option<String>,
34    matrix: CountMatrix<A>,
35}
36
37impl<A: Alphabet> Record<A> {
38    /// Get the identifier of the record.
39    pub fn id(&self) -> &str {
40        &self.id
41    }
42
43    /// Get the description of the record, if any.
44    pub fn description(&self) -> Option<&str> {
45        self.description.as_deref()
46    }
47
48    /// Get the count matrix of the record.
49    pub fn matrix(&self) -> &CountMatrix<A> {
50        &self.matrix
51    }
52
53    /// Take the count matrix of the record.
54    pub fn into_matrix(self) -> CountMatrix<A> {
55        self.matrix
56    }
57}
58
59impl<A: Alphabet> AsRef<CountMatrix<A>> for Record<A> {
60    fn as_ref(&self) -> &CountMatrix<A> {
61        &self.matrix
62    }
63}
64
65// ---
66
67/// An iterative reader for the JASPAR (2016) format.
68pub struct Reader<B: BufRead, A: Alphabet> {
69    buffer: Vec<u8>,
70    bufread: B,
71    start: usize,
72    _alphabet: std::marker::PhantomData<A>,
73}
74
75impl<B: BufRead, A: Alphabet> Reader<B, A> {
76    pub fn new(mut reader: B) -> Self {
77        let mut buffer = Vec::new();
78        let start = reader.read_until(b'>', &mut buffer).unwrap_or(1) - 1;
79
80        Self {
81            bufread: reader,
82            buffer,
83            start,
84            _alphabet: std::marker::PhantomData,
85        }
86    }
87}
88
89impl<B: BufRead, A: Alphabet> Iterator for Reader<B, A> {
90    type Item = Result<Record<A>, Error>;
91    fn next(&mut self) -> Option<Self::Item> {
92        match self.bufread.read_until(b'>', &mut self.buffer) {
93            Ok(n) => {
94                let bytes = if n == 0 {
95                    &self.buffer[self.start..]
96                } else {
97                    &self.buffer[self.start..=self.start + n]
98                };
99                let text = match std::str::from_utf8(bytes) {
100                    Ok(text) => text,
101                    Err(_) => {
102                        return Some(Err(Error::from(std::io::Error::new(
103                            std::io::ErrorKind::InvalidData,
104                            "decoding error",
105                        ))));
106                    }
107                };
108                if n == 0 && text.trim().is_empty() {
109                    return None;
110                }
111                let (rest, record) = match self::parse::record::<A>(text) {
112                    Err(e) => return Some(Err(Error::from(e))),
113                    Ok((rest, record)) => (rest, record),
114                };
115                self.start += n + 1 - rest.len();
116                if self.start > self.buffer.capacity() / 2 {
117                    let n = self.buffer.len();
118                    self.buffer.copy_within(self.start.., 0);
119                    self.buffer.truncate(n - self.start);
120                    self.start = 0;
121                }
122                Some(Ok(record))
123            }
124            Err(e) => Some(Err(Error::from(e))),
125        }
126    }
127}
128
129/// Read the records from a file in JASPAR (2016) format.
130pub fn read<B: BufRead, A: Alphabet>(reader: B) -> self::Reader<B, A> {
131    self::Reader::new(reader)
132}
133
#[cfg(test)]
mod test {

    use lightmotif::Dna;

    /// Matrix rows shared by every record used in these tests.
    const MATRIX: &str = concat!(
        "A [10 12  4  1  2  2  0  0  0  8 13 ]\n",
        "C [ 2  2  7  1  0  8  0  0  1  2  2 ]\n",
        "G [ 3  1  1  0 23  0 26 26  0  0  4 ]\n",
        "T [11 11 14 24  1 16  0  0 25 16  7 ]\n",
    );

    /// Parse `text` as DNA records and collect their identifiers, panicking
    /// on the first record that fails to parse.
    fn parse_ids(text: String) -> Vec<String> {
        super::Reader::<_, Dna>::new(std::io::Cursor::new(text))
            .map(|result| result.unwrap().id().to_owned())
            .collect()
    }

    #[test]
    fn test_single() {
        let text = format!(">MA0001.1 RUNX1\n{}", MATRIX);
        assert_eq!(parse_ids(text), ["MA0001.1"]);
    }

    #[test]
    fn test_multi() {
        let text = format!(
            ">MA0001.1 RUNX1\n{matrix}>MA0002.1 RUNX1\n{matrix}",
            matrix = MATRIX
        );
        assert_eq!(parse_ids(text), ["MA0001.1", "MA0002.1"]);
    }
}
175}