census_proteomics/
parser.rs

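//! Parser for Census proteomics files.
//!
//! The input is read line by line. Each line is a tab-separated record whose
//! first character selects its type: `H` for header lines, `P` for a protein
//! entry and `S` for a peptide entry belonging to the preceding protein.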
3
4use super::*;
5
6use std::fmt;
7use std::iter::Peekable;
8use std::str::Lines;
9
/// Category of failure encountered while parsing a Census file
///
/// The kind is a small plain value, so it is `Copy` and can be freely
/// compared or stored by callers.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Debug)]
pub enum ErrorKind {
    /// Invalid beginning of line; carries the offending character
    Invalid(char),
    /// Error converting to number
    Conversion,
    /// Unexpected end-of-file
    EOF,
}
19
20/// Error that may occur during parsing of a Census file
21#[derive(PartialEq, PartialOrd, Debug)]
22pub struct Error {
23    kind: ErrorKind,
24    line: usize,
25}
26
27impl fmt::Display for Error {
28    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
29        write!(
30            f,
31            "Error parsing file at line {}: {:?}",
32            self.line, self.kind
33        )
34    }
35}
36
37impl std::error::Error for Error {}
38
/// Line-oriented parser over the text of a Census file
///
/// Borrows the input for `'s` and walks it one line at a time with a single
/// line of lookahead.
pub struct Parser<'s> {
    /// Remaining input lines, peekable so the record type can be inspected
    /// before a line is consumed
    iter: Peekable<Lines<'s>>,
    /// Number of TMT channels to parse
    channels: u8,
    /// Line counter used when reporting errors
    line: usize,
}
45
46impl<'s> Parser<'s> {
47    /// Create a new parser operating on input data
48    pub fn new(input: &'s str) -> Parser<'s> {
49        Parser {
50            iter: input.lines().peekable(),
51            channels: 0,
52            line: 1,
53        }
54    }
55
56    /// Convenience function for creating Error struct
57    fn err(&self, kind: ErrorKind) -> Error {
58        Error {
59            kind,
60            line: self.line,
61        }
62    }
63
64    fn peek(&mut self) -> Option<&&'s str> {
65        self.iter.peek()
66    }
67
68    fn next(&mut self) -> Option<&'s str> {
69        let n = self.iter.next();
70        if n.is_some() {
71            self.line += 1;
72        }
73        n
74    }
75
76    fn parse_peptide(&mut self) -> Result<Peptide, Error> {
77        let line = self.iter.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
78        // Using split_whitespace obfuscates missing 'U' values, and messes up
79        // parsing
80        let mut data = line.split('\t');
81        assert_eq!(data.next(), Some("S"));
82
83        let n = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
84        assert!(n.len() <= 1);
85        let unique: bool = n == "U";
86        let sequence = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?.into();
87
88        let mut values = Vec::with_capacity(self.channels as usize);
89
90        for _ in 0..self.channels {
91            let mz = data
92                .next()
93                .ok_or_else(|| self.err(ErrorKind::EOF))?
94                .parse::<u32>()
95                .map_err(|_| self.err(ErrorKind::Conversion))?;
96            // discard normalized data
97            let _ = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
98            values.push(mz);
99        }
100
101        let _ = data.next();
102        let purity = data.next().unwrap_or("1.0").parse::<f32>().unwrap_or(1.0);
103
104        let scan = data
105            .skip(3)
106            .next()
107            .unwrap_or("")
108            .parse::<usize>()
109            .unwrap_or(0);
110
111        Ok(Peptide {
112            sequence,
113            unique,
114            values,
115            scan,
116            purity,
117        })
118    }
119
120    fn parse_protein(&mut self) -> Result<Protein, Error> {
121        let line = self.iter.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
122        let mut data = line.split('\t');
123        assert_eq!(data.next(), Some("P"));
124        let accession = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?.into();
125        let spectral_count = data
126            .next()
127            .ok_or_else(|| self.err(ErrorKind::EOF))?
128            .parse::<u16>()
129            .map_err(|_| self.err(ErrorKind::Conversion))?;
130        let sequence_count = data
131            .next()
132            .ok_or_else(|| self.err(ErrorKind::EOF))?
133            .parse::<u16>()
134            .map_err(|_| self.err(ErrorKind::Conversion))?;
135        let sequence_coverage = data
136            .next()
137            .ok_or_else(|| self.err(ErrorKind::EOF))?
138            .trim_end_matches('%')
139            .parse::<f32>()
140            .map_err(|_| self.err(ErrorKind::Conversion))?;
141        let molecular_weight = data
142            .next()
143            .ok_or_else(|| self.err(ErrorKind::EOF))?
144            .parse::<u32>()
145            .map_err(|_| self.err(ErrorKind::Conversion))?;
146
147        // let mut description = String::new();
148        // for n in data {
149        //     description = n.into();
150        // }
151        let description = data.last().ok_or_else(|| self.err(ErrorKind::EOF))?.into();
152
153        let mut peptides = Vec::new();
154        while let Some(next) = self.iter.peek() {
155            if next.starts_with('S') {
156                peptides.push(self.parse_peptide()?);
157            } else {
158                // Next line should be a protein entry
159                break;
160            }
161        }
162
163        Ok(Protein {
164            accession,
165            spectral_count,
166            sequence_count,
167            sequence_coverage,
168            molecular_weight,
169            description,
170            peptides,
171            channels: self.channels,
172        })
173    }
174
175    fn parse_headers(&mut self) -> Option<()> {
176        while let Some(line) = self.peek() {
177            if line.starts_with('H') {
178                let line = self.next()?;
179                if line.contains("m/z") {
180                    self.channels = (line.matches("m/z_").count() / 2) as u8;
181                }
182            } else {
183                return Some(());
184            }
185        }
186        None
187    }
188
189    pub fn parse(mut self) -> Result<Dataset, Error> {
190        let mut data = Vec::new();
191
192        while let Some(line) = self.peek() {
193            let init = line
194                .chars()
195                .next()
196                .ok_or_else(|| self.err(ErrorKind::EOF))?;
197            match init {
198                'H' => self
199                    .parse_headers()
200                    .ok_or_else(|| self.err(ErrorKind::EOF))?,
201                'P' => data.push(self.parse_protein()?),
202                _ => return Err(self.err(ErrorKind::Invalid(init))),
203            }
204        }
205
206        Ok(Dataset {
207            proteins: data,
208            channels: self.channels,
209        })
210    }
211}