lightmotif_io/jaspar/
mod.rs

1//! Parser implementation for matrices in JASPAR (raw) format.
2//!
3//! The [JASPAR database](https://jaspar.elixir.no/docs/) stores manually
4//! curated DNA-binding sites as count matrices.
5//!
6//! JASPAR files contain a FASTA-like header line for each record,
7//! followed by one line per symbol storing tab-separated counts at each
8//! position. The "raw" format simply stores 4 lines corresponding to the
9//! scores for the A, C, G and T letters:
10//! ```text
11//! >MA1104.2 GATA6
12//! 22320 20858 35360  5912 4535  2560  5044 76686  1507  1096 13149 18911 22172
13//! 16229 14161 13347 11831 62936 1439  1393   815   852 75930  3228 19054 17969
14//! 13432 11894 10394  7066 6459   580   615   819   456   712  1810 18153 11605
15//! 27463 32531 20343 54635 5514 74865 72392  1124 76629  1706 61257 23326 27698
16//! ```
17
18use std::io::BufRead;
19
20use lightmotif::abc::Dna;
21use lightmotif::pwm::CountMatrix;
22
23use crate::error::Error;
24
25mod parse;
26
27// ---
28
29/// A JASPAR (raw) record.
30///
31/// The JASPAR (raw) format only supports count matrices in the DNA
32/// alphabet.
33#[derive(Debug, Clone)]
34pub struct Record {
35    id: String,
36    description: Option<String>,
37    matrix: CountMatrix<Dna>,
38}
39
40impl Record {
41    /// Get the identifier of the record.
42    pub fn id(&self) -> &str {
43        &self.id
44    }
45
46    /// Get the description of the record, if any.
47    pub fn description(&self) -> Option<&str> {
48        self.description.as_deref()
49    }
50
51    /// Get the count matrix of the record.
52    pub fn matrix(&self) -> &CountMatrix<Dna> {
53        &self.matrix
54    }
55}
56
57impl AsRef<CountMatrix<Dna>> for Record {
58    fn as_ref(&self) -> &CountMatrix<Dna> {
59        &self.matrix
60    }
61}
62
63impl From<Record> for CountMatrix<Dna> {
64    fn from(value: Record) -> Self {
65        value.matrix
66    }
67}
68
69// ---
70
/// An iterative reader for the JASPAR format.
///
/// The reader accumulates raw bytes from the underlying [`BufRead`] and
/// yields one record per call to `Iterator::next`.
pub struct Reader<B: BufRead> {
    // Bytes read so far that have not been discarded yet.
    buffer: Vec<u8>,
    // Underlying source of bytes.
    bufread: B,
    // Offset in `buffer` where the next unparsed record begins.
    start: usize,
}

impl<B: BufRead> Reader<B> {
    /// Create a new `Reader` from a buffered reader.
    ///
    /// The input is consumed up to and including the first `>` so that
    /// `start` points at the marker opening the first record. When the
    /// input is empty, `start` is simply `0` and the reader yields nothing.
    ///
    /// NOTE: this constructor is infallible, so an I/O error raised during
    /// this initial scan is not reported here; `start` falls back to `0`.
    pub fn new(mut reader: B) -> Self {
        let mut buffer = Vec::new();
        // `read_until` returns the number of bytes read, delimiter included,
        // so the `>` (if any) sits at index `n - 1`. An empty input reports
        // zero bytes: saturate instead of underflowing.
        let start = reader
            .read_until(b'>', &mut buffer)
            .map_or(0, |n| n.saturating_sub(1));

        Self {
            bufread: reader,
            buffer,
            start,
        }
    }
}
91
92impl<B: BufRead> Iterator for Reader<B> {
93    type Item = Result<Record, Error>;
94    fn next(&mut self) -> Option<Self::Item> {
95        match self.bufread.read_until(b'>', &mut self.buffer) {
96            Ok(n) => {
97                let bytes = if n == 0 {
98                    &self.buffer[self.start..]
99                } else {
100                    &self.buffer[self.start..=self.start + n]
101                };
102                let text = match std::str::from_utf8(bytes) {
103                    Ok(text) => text,
104                    Err(_) => {
105                        return Some(Err(Error::from(std::io::Error::new(
106                            std::io::ErrorKind::InvalidData,
107                            "decoding error",
108                        ))));
109                    }
110                };
111                if n == 0 && text.trim().is_empty() {
112                    return None;
113                }
114                let (rest, record) = match self::parse::record(text) {
115                    Err(e) => return Some(Err(Error::from(e))),
116                    Ok((rest, record)) => (rest, record),
117                };
118                self.start += n + 1 - rest.len();
119                if self.start > self.buffer.capacity() / 2 {
120                    let n = self.buffer.len();
121                    self.buffer.copy_within(self.start.., 0);
122                    self.buffer.truncate(n - self.start);
123                    self.start = 0;
124                }
125                Some(Ok(record))
126            }
127            Err(e) => Some(Err(Error::from(e))),
128        }
129    }
130}
131
132/// Read the records from a file in JASPAR format.
133pub fn read<B: BufRead>(reader: B) -> self::Reader<B> {
134    self::Reader::new(reader)
135}
136
#[cfg(test)]
mod test {
    use std::io::Cursor;

    use super::Reader;

    /// A file holding a single record yields that record, then ends.
    #[test]
    fn test_single() {
        let text = concat!(
            ">MA1104.2 GATA6\n",
            "22320 20858 35360  5912 4535  2560  5044 76686  1507  1096 13149 18911 22172\n",
            "16229 14161 13347 11831 62936 1439  1393   815   852 75930  3228 19054 17969\n",
            "13432 11894 10394  7066 6459   580   615   819   456   712  1810 18153 11605\n",
            "27463 32531 20343 54635 5514 74865 72392  1124 76629  1706 61257 23326 27698\n",
        );
        let mut reader = Reader::new(Cursor::new(text));

        let first = reader.next().unwrap();
        let record = first.unwrap();
        assert_eq!(record.id.as_str(), "MA1104.2");

        let second = reader.next();
        assert!(second.is_none());
    }
}
154}