lightmotif_io/uniprobe/
mod.rs

1//! Parser implementation for matrices in UniPROBE format.
2//!
3//! The [UniPROBE database](http://the_brain.bwh.harvard.edu/uniprobe/index.php)
4//! stores DNA-binding sites as Position-Weight Matrices.
5//!
//! UniPROBE files contain a metadata line for each matrix, followed by
//! one line per symbol storing the tab-separated scores for that symbol:
8//! ```text
9//! Motif XYZ
10//! A:	0.179	0.210	0.182	0.25
11//! C:	0.268	0.218	0.213	0.25
12//! G:	0.383	0.352	0.340	0.25
//! T:	0.170	0.220	0.265	0.25
14//! ```
15
16use std::io::BufRead;
17
18use lightmotif::abc::Alphabet;
19use lightmotif::pwm::FrequencyMatrix;
20
21use crate::error::Error;
22
23mod parse;
24
25// ---
26
27#[derive(Debug, Clone)]
28pub struct Record<A: Alphabet> {
29    id: String,
30    matrix: FrequencyMatrix<A>,
31}
32
33impl<A: Alphabet> Record<A> {
34    pub fn id(&self) -> &str {
35        &self.id
36    }
37
38    pub fn matrix(&self) -> &FrequencyMatrix<A> {
39        &self.matrix
40    }
41
42    /// Take the frequency matrix of the record.
43    pub fn into_matrix(self) -> FrequencyMatrix<A> {
44        self.matrix
45    }
46}
47
48impl<A: Alphabet> AsRef<FrequencyMatrix<A>> for Record<A> {
49    fn as_ref(&self) -> &FrequencyMatrix<A> {
50        &self.matrix
51    }
52}
53
54// ---
55
56pub struct Reader<B: BufRead, A: Alphabet> {
57    buffer: String,
58    bufread: B,
59    line: bool,
60    _alphabet: std::marker::PhantomData<A>,
61}
62
63impl<B: BufRead, A: Alphabet> Reader<B, A> {
64    pub fn new(reader: B) -> Self {
65        Self {
66            bufread: reader,
67            buffer: String::new(),
68            line: false,
69            _alphabet: std::marker::PhantomData,
70        }
71    }
72}
73
74impl<B: BufRead, A: Alphabet> Iterator for Reader<B, A> {
75    type Item = Result<Record<A>, Error>;
76    fn next(&mut self) -> Option<Self::Item> {
77        // advance to first line with content
78        while !self.line {
79            match self.bufread.read_line(&mut self.buffer) {
80                Err(e) => return Some(Err(Error::from(e))),
81                Ok(0) => return None,
82                Ok(_) => {
83                    if !self.buffer.trim().is_empty() {
84                        self.line = true;
85                    } else {
86                        self.buffer.clear();
87                    }
88                }
89            }
90        }
91        // parse id
92        let id = match self::parse::id(&self.buffer) {
93            Err(e) => return Some(Err(Error::from(e))),
94            Ok((_, x)) => x.to_string(),
95        };
96        self.line = false;
97        self.buffer.clear();
98
99        // parse columns
100        let mut columns = Vec::new();
101        loop {
102            while !self.line {
103                match self.bufread.read_line(&mut self.buffer) {
104                    Err(e) => return Some(Err(Error::from(e))),
105                    Ok(0) => break,
106                    Ok(_) => {
107                        if !self.buffer.trim().is_empty() {
108                            self.line = true;
109                        } else {
110                            self.buffer.clear();
111                        }
112                    }
113                }
114            }
115            match self::parse::matrix_column::<A>(&self.buffer) {
116                Err(_e) => break,
117                Ok((_, column)) => {
118                    columns.push(column);
119                    self.buffer.clear();
120                    self.line = false;
121                }
122            }
123        }
124
125        let matrix = match self::parse::build_matrix::<A>(columns) {
126            Err(e) => return Some(Err(Error::from(e))),
127            Ok(matrix) => matrix,
128        };
129        match FrequencyMatrix::<A>::new(matrix) {
130            Err(e) => Some(Err(Error::from(e))),
131            Ok(matrix) => Some(Ok(Record { id, matrix })),
132        }
133    }
134}
135
136pub fn read<B: BufRead, A: Alphabet>(reader: B) -> self::Reader<B, A> {
137    self::Reader::new(reader)
138}
139
#[cfg(test)]
mod test {

    use lightmotif::Dna;

    /// Render one record: the identifier line followed by the four
    /// tab-separated symbol lines shared by every test in this module.
    fn record(id: &str) -> String {
        format!(
            concat!(
                "{}\n",
                "A:\t0.179\t0.210\t0.182\n",
                "C:\t0.268\t0.218\t0.213\n",
                "G:\t0.383\t0.352\t0.340\n",
                "T:\t0.170\t0.220\t0.265\n",
            ),
            id
        )
    }

    /// Parse `text` as DNA records and collect their identifiers in order,
    /// panicking on the first record that fails to parse.
    fn parsed_ids(text: &str) -> Vec<String> {
        super::Reader::<_, Dna>::new(std::io::Cursor::new(text))
            .map(|result| result.unwrap().id)
            .collect()
    }

    /// A lone record is read, after which the reader is exhausted.
    #[test]
    fn test_single() {
        let text = record("TEST001");
        assert_eq!(parsed_ids(&text), ["TEST001"]);
    }

    /// Records separated by blank lines are all read.
    #[test]
    fn test_multi() {
        let text = format!("{}\n{}\n", record("TEST001"), record("TEST002"));
        assert_eq!(parsed_ids(&text), ["TEST001", "TEST002"]);
    }

    /// Records following each other without a blank line are all read.
    #[test]
    fn test_multi_concatenated() {
        let text = [record("TEST001"), record("TEST002")].concat();
        assert_eq!(parsed_ids(&text), ["TEST001", "TEST002"]);
    }
}