lightmotif_io/meme/
mod.rs

1//! Parser implementation for matrices in STREME format.
2
3use std::io::BufRead;
4use std::sync::Arc;
5
6use lightmotif::abc::Alphabet;
7use lightmotif::abc::Background;
8use lightmotif::abc::Symbol;
9use lightmotif::dense::DenseMatrix;
10use lightmotif::num::Unsigned;
11use lightmotif::pwm::FrequencyMatrix;
12
13use crate::error::Error;
14
15mod parse;
16
17// ---
18
19/// A MEME file record.
20pub struct Record<A: Alphabet> {
21    alength: Option<usize>,
22    w: Option<usize>,
23    nsites: Option<usize>,
24    evalue: Option<f32>,
25    id: String,
26    name: Option<String>,
27    matrix: FrequencyMatrix<A>,
28    url: Option<String>,
29    background: Option<Background<A>>,
30}
31
32impl<A: Alphabet> Record<A> {
33    /// Get the background of the record, if any declared in the file.
34    pub fn background(&self) -> Option<&Background<A>> {
35        self.background.as_ref()
36    }
37
38    /// Get the identifier of the record.
39    pub fn id(&self) -> &str {
40        &self.id
41    }
42
43    /// Get the name of the record, if any.
44    pub fn name(&self) -> Option<&str> {
45        self.name.as_ref().map(String::as_ref)
46    }
47
48    /// Get the URL of the record, if any.
49    pub fn url(&self) -> Option<&str> {
50        self.url.as_ref().map(String::as_ref)
51    }
52
53    /// Get the frequency matrix of the record.
54    pub fn matrix(&self) -> &FrequencyMatrix<A> {
55        &self.matrix
56    }
57
58    /// Take the frequency matrix of the record.
59    pub fn into_matrix(self) -> FrequencyMatrix<A> {
60        self.matrix
61    }
62}
63
64impl<A: Alphabet> AsRef<FrequencyMatrix<A>> for Record<A> {
65    fn as_ref(&self) -> &FrequencyMatrix<A> {
66        &self.matrix
67    }
68}
69
70// ---
71
72/// An iterative reader for the MEME format.
73pub struct Reader<B: BufRead, A: Alphabet> {
74    buffer: String,
75    bufread: B,
76    meme_version: String,
77    symbols: Vec<A::Symbol>,
78    background: Option<Background<A>>,
79    error: Option<crate::error::Error>,
80    _alphabet: std::marker::PhantomData<A>,
81}
82
83impl<B: BufRead, A: Alphabet> Reader<B, A> {
84    pub fn new(mut reader: B) -> Self {
85        let mut buffer = String::new();
86        let mut meme_version = None;
87        let mut error = None;
88        let mut background = None;
89        let mut symbols = None;
90
91        macro_rules! read_line {
92            () => {
93                match reader.read_line(&mut buffer) {
94                    Err(e) => {
95                        error = Some(crate::error::Error::Io(Arc::new(e)));
96                        break;
97                    }
98                    Ok(0) => {
99                        error = Some(crate::error::Error::InvalidData(None)); // FIXME
100                        break;
101                    }
102                    _ => (),
103                }
104            };
105        }
106
107        // Detect MEME version but don't read past the first motif
108        while !buffer.starts_with("MOTIF") {
109            // attempt parsing current line
110            match self::parse::meme_version(&buffer) {
111                Err(e) => buffer.clear(),
112                Ok((_, v)) if meme_version.is_none() => {
113                    meme_version = Some(v.to_string());
114                    buffer.clear();
115                    break;
116                }
117                Ok(_) => {
118                    error = Some(crate::error::Error::InvalidData(Some(String::from(
119                        "multiple MEME versions found",
120                    ))));
121                    break;
122                }
123            }
124            // read next line
125            read_line!();
126        }
127
128        // Error if no MEME version was found (only mandatory field)
129        if meme_version.is_none() {
130            error = Some(crate::error::Error::InvalidData(Some(String::from(
131                "no MEME version found",
132            ))));
133            meme_version = Some(String::new());
134        }
135
136        // Get remaining global metadata
137        if error.is_none() {
138            while !buffer.starts_with("MOTIF") && error.is_none() {
139                // get background, which can span on multiple lines
140                if buffer.starts_with("Background letter frequencies") {
141                    loop {
142                        read_line!();
143                        match self::parse::background::<A>(&buffer) {
144                            Err(nom::Err::Incomplete(_)) => {
145                                read_line!();
146                            }
147                            Err(e) => {
148                                error = Some(e.into());
149                                break;
150                            }
151                            Ok((_, bg)) => {
152                                background = Some(bg);
153                                buffer.clear();
154                                break;
155                            }
156                        }
157                    }
158                }
159                // get alphabet symbols to ensure columns are parsed in order
160                if buffer.starts_with("ALPHABET") {
161                    match self::parse::alphabet::<A>(&buffer) {
162                        Err(e) => {
163                            error = Some(e.into());
164                            break;
165                        }
166                        Ok((_, s)) => {
167                            symbols = Some(s);
168                            buffer.clear();
169                        }
170                    }
171                }
172                // get strands (TODO)
173                buffer.clear();
174                read_line!();
175            }
176        }
177
178        // If no alphabet found, use lexicographic order for columns
179        if symbols.is_none() {
180            let mut s = A::symbols().to_vec();
181            s.as_mut_slice()[..A::K::USIZE - 1].sort_by(|x, y| x.as_char().cmp(&y.as_char()));
182            symbols = Some(s);
183        }
184
185        Self {
186            bufread: reader,
187            buffer,
188            meme_version: meme_version.unwrap(),
189            error,
190            background,
191            symbols: symbols.unwrap(),
192            _alphabet: std::marker::PhantomData,
193        }
194    }
195
196    /// Get the MEME format version of the file being read.
197    pub fn meme_version(&self) -> Result<&str, Error> {
198        if let Some(e) = self.error.as_ref() {
199            Err(e.clone())
200        } else {
201            Ok(&self.meme_version)
202        }
203    }
204
205    /// Get the background alphabet probabilities, if any.
206    pub fn background(&self) -> Result<Option<&Background<A>>, Error> {
207        if let Some(e) = self.error.as_ref() {
208            Err(e.clone())
209        } else {
210            Ok(self.background.as_ref())
211        }
212    }
213}
214
215impl<B: BufRead, A: Alphabet> Iterator for Reader<B, A> {
216    type Item = Result<Record<A>, Error>;
217    fn next(&mut self) -> Option<Self::Item> {
218        let mut motif: Option<Record<A>> = None;
219
220        if let Some(err) = self.error.as_ref() {
221            return Some(Err(err.clone()));
222        }
223
224        loop {
225            if self.buffer.starts_with("MOTIF") {
226                if let Some(m) = motif {
227                    return Some(Ok(m));
228                }
229                match self::parse::motif(&self.buffer) {
230                    Err(e) => return Some(Err(e.into())),
231                    Ok((_, (id, name))) => {
232                        motif = Some(Record {
233                            alength: None,
234                            w: None,
235                            nsites: None,
236                            evalue: None,
237                            id: id.to_string(),
238                            name: name.map(String::from),
239                            url: None,
240                            matrix: FrequencyMatrix::new(DenseMatrix::new(0)).unwrap(),
241                            background: self.background.clone(),
242                        });
243                    }
244                }
245                self.buffer.clear();
246            } else if self.buffer.starts_with("letter-probability matrix") {
247                self.buffer.clear(); // FIXME: parse line & metadata
248                let mut rows = Vec::new();
249                loop {
250                    match self.bufread.read_line(&mut self.buffer) {
251                        Err(e) => return Some(Err(e.into())),
252                        Ok(0) => break,
253                        Ok(n) => (),
254                    };
255                    match self::parse::motif_row::<A>(&self.buffer, &self.symbols) {
256                        Ok((_, row)) => rows.push(row),
257                        Err(e) => break, // FIXME?
258                    }
259                    self.buffer.clear();
260                }
261                if let Some(m) = motif.as_mut() {
262                    m.matrix = FrequencyMatrix::new(DenseMatrix::from_rows(rows)).unwrap();
263                } else {
264                    return Some(Err(crate::error::Error::InvalidData(Some(String::from(
265                        "motif data before declared motif",
266                    )))));
267                }
268            } else if self.buffer.starts_with("URL") {
269                let url = match self::parse::url(&self.buffer) {
270                    Ok((_, url)) => url,
271                    Err(e) => return Some(Err(e.into())),
272                };
273                if let Some(m) = motif.as_mut() {
274                    m.url = Some(url.into());
275                } else {
276                    return Some(Err(crate::error::Error::InvalidData(Some(String::from(
277                        "motif data before declared motif",
278                    )))));
279                }
280                self.buffer.clear();
281            } else {
282                self.buffer.clear();
283            }
284
285            if self.buffer.is_empty() {
286                match self.bufread.read_line(&mut self.buffer) {
287                    Err(e) => return Some(Err(e.into())),
288                    Ok(0) => return motif.map(Ok),
289                    Ok(_) => (),
290                }
291            }
292        }
293    }
294}
295
296/// Read the records from a file in MEME format.
297pub fn read<B: BufRead, A: Alphabet>(reader: B) -> self::Reader<B, A> {
298    self::Reader::new(reader)
299}
300
301#[cfg(test)]
302mod tests {
303    use lightmotif::abc::{Dna, Nucleotide, Symbol};
304
305    #[test]
306    fn record() {
307        const TEXT: &'static str = concat!(
308            "MEME version 4\n",
309            "MOTIF MA0004.1 Arnt\n",
310            "letter-probability matrix: alength= 4 w= 6 nsites= 20 E= 0\n",
311            " 0.200000  0.800000  0.000000  0.000000\n",
312            " 0.950000  0.000000  0.050000  0.000000\n",
313            " 0.000000  1.000000  0.000000  0.000000\n",
314            " 0.000000  0.000000  1.000000  0.000000\n",
315            " 0.000000  0.000000  0.000000  1.000000\n",
316            " 0.000000  0.000000  1.000000  0.000000\n",
317            "URL http://jaspar.genereg.net/matrix/MA0004.1\n",
318        );
319        let mut reader = super::Reader::<_, Dna>::new(std::io::Cursor::new(TEXT));
320        let record = reader.next().unwrap().unwrap();
321        assert_eq!(&record.id, "MA0004.1");
322        assert_eq!(
323            record.url().unwrap(),
324            "http://jaspar.genereg.net/matrix/MA0004.1"
325        );
326        assert!(reader.next().is_none());
327
328        assert_eq!(
329            record.matrix().matrix()[0][Nucleotide::A.as_index()],
330            0.200000
331        );
332        assert_eq!(
333            record.matrix().matrix()[0][Nucleotide::C.as_index()],
334            0.800000
335        );
336        assert_eq!(
337            record.matrix().matrix()[0][Nucleotide::G.as_index()],
338            0.000000
339        );
340        assert_eq!(
341            record.matrix().matrix()[0][Nucleotide::T.as_index()],
342            0.000000
343        );
344
345        assert_eq!(
346            record.matrix().matrix()[1][Nucleotide::A.as_index()],
347            0.950000
348        );
349        assert_eq!(
350            record.matrix().matrix()[1][Nucleotide::C.as_index()],
351            0.000000
352        );
353        assert_eq!(
354            record.matrix().matrix()[1][Nucleotide::G.as_index()],
355            0.050000
356        );
357        assert_eq!(
358            record.matrix().matrix()[1][Nucleotide::T.as_index()],
359            0.000000
360        );
361
362        assert_eq!(
363            record.matrix().matrix()[3][Nucleotide::A.as_index()],
364            0.000000
365        );
366        assert_eq!(
367            record.matrix().matrix()[3][Nucleotide::C.as_index()],
368            0.000000
369        );
370        assert_eq!(
371            record.matrix().matrix()[3][Nucleotide::G.as_index()],
372            1.000000
373        );
374        assert_eq!(
375            record.matrix().matrix()[3][Nucleotide::T.as_index()],
376            0.000000
377        );
378    }
379}