census_proteomics/
parser.rs1use super::*;
5
6use std::fmt;
7use std::iter::Peekable;
8use std::str::Lines;
9
/// The category of failure encountered while parsing.
///
/// Variant order is significant: the derived `PartialOrd` ranks
/// `Invalid < Conversion < EOF`.
#[derive(Debug, PartialEq, PartialOrd)]
pub enum ErrorKind {
    /// A record could not be recognised; carries the first character of the
    /// offending line.
    Invalid(char),
    /// A field's text could not be converted to the expected type.
    Conversion,
    /// The input, or the fields of a record, ended before a required value
    /// was found.
    EOF,
}
19
20#[derive(PartialEq, PartialOrd, Debug)]
22pub struct Error {
23 kind: ErrorKind,
24 line: usize,
25}
26
27impl fmt::Display for Error {
28 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
29 write!(
30 f,
31 "Error parsing file at line {}: {:?}",
32 self.line, self.kind
33 )
34 }
35}
36
37impl std::error::Error for Error {}
38
/// Line-oriented parser state over a borrowed input string.
pub struct Parser<'s> {
    /// Remaining input lines, with one line of lookahead.
    iter: Peekable<Lines<'s>>,
    /// Channel count derived from header lines; `0` until a header sets it.
    channels: u8,
    /// Line number attached to errors; starts at 1.
    line: usize,
}
45
46impl<'s> Parser<'s> {
47 pub fn new(input: &'s str) -> Parser<'s> {
49 Parser {
50 iter: input.lines().peekable(),
51 channels: 0,
52 line: 1,
53 }
54 }
55
56 fn err(&self, kind: ErrorKind) -> Error {
58 Error {
59 kind,
60 line: self.line,
61 }
62 }
63
64 fn peek(&mut self) -> Option<&&'s str> {
65 self.iter.peek()
66 }
67
68 fn next(&mut self) -> Option<&'s str> {
69 let n = self.iter.next();
70 if n.is_some() {
71 self.line += 1;
72 }
73 n
74 }
75
76 fn parse_peptide(&mut self) -> Result<Peptide, Error> {
77 let line = self.iter.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
78 let mut data = line.split('\t');
81 assert_eq!(data.next(), Some("S"));
82
83 let n = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
84 assert!(n.len() <= 1);
85 let unique: bool = n == "U";
86 let sequence = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?.into();
87
88 let mut values = Vec::with_capacity(self.channels as usize);
89
90 for _ in 0..self.channels {
91 let mz = data
92 .next()
93 .ok_or_else(|| self.err(ErrorKind::EOF))?
94 .parse::<u32>()
95 .map_err(|_| self.err(ErrorKind::Conversion))?;
96 let _ = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
98 values.push(mz);
99 }
100
101 let _ = data.next();
102 let purity = data.next().unwrap_or("1.0").parse::<f32>().unwrap_or(1.0);
103
104 let scan = data
105 .skip(3)
106 .next()
107 .unwrap_or("")
108 .parse::<usize>()
109 .unwrap_or(0);
110
111 Ok(Peptide {
112 sequence,
113 unique,
114 values,
115 scan,
116 purity,
117 })
118 }
119
120 fn parse_protein(&mut self) -> Result<Protein, Error> {
121 let line = self.iter.next().ok_or_else(|| self.err(ErrorKind::EOF))?;
122 let mut data = line.split('\t');
123 assert_eq!(data.next(), Some("P"));
124 let accession = data.next().ok_or_else(|| self.err(ErrorKind::EOF))?.into();
125 let spectral_count = data
126 .next()
127 .ok_or_else(|| self.err(ErrorKind::EOF))?
128 .parse::<u16>()
129 .map_err(|_| self.err(ErrorKind::Conversion))?;
130 let sequence_count = data
131 .next()
132 .ok_or_else(|| self.err(ErrorKind::EOF))?
133 .parse::<u16>()
134 .map_err(|_| self.err(ErrorKind::Conversion))?;
135 let sequence_coverage = data
136 .next()
137 .ok_or_else(|| self.err(ErrorKind::EOF))?
138 .trim_end_matches('%')
139 .parse::<f32>()
140 .map_err(|_| self.err(ErrorKind::Conversion))?;
141 let molecular_weight = data
142 .next()
143 .ok_or_else(|| self.err(ErrorKind::EOF))?
144 .parse::<u32>()
145 .map_err(|_| self.err(ErrorKind::Conversion))?;
146
147 let description = data.last().ok_or_else(|| self.err(ErrorKind::EOF))?.into();
152
153 let mut peptides = Vec::new();
154 while let Some(next) = self.iter.peek() {
155 if next.starts_with('S') {
156 peptides.push(self.parse_peptide()?);
157 } else {
158 break;
160 }
161 }
162
163 Ok(Protein {
164 accession,
165 spectral_count,
166 sequence_count,
167 sequence_coverage,
168 molecular_weight,
169 description,
170 peptides,
171 channels: self.channels,
172 })
173 }
174
175 fn parse_headers(&mut self) -> Option<()> {
176 while let Some(line) = self.peek() {
177 if line.starts_with('H') {
178 let line = self.next()?;
179 if line.contains("m/z") {
180 self.channels = (line.matches("m/z_").count() / 2) as u8;
181 }
182 } else {
183 return Some(());
184 }
185 }
186 None
187 }
188
189 pub fn parse(mut self) -> Result<Dataset, Error> {
190 let mut data = Vec::new();
191
192 while let Some(line) = self.peek() {
193 let init = line
194 .chars()
195 .next()
196 .ok_or_else(|| self.err(ErrorKind::EOF))?;
197 match init {
198 'H' => self
199 .parse_headers()
200 .ok_or_else(|| self.err(ErrorKind::EOF))?,
201 'P' => data.push(self.parse_protein()?),
202 _ => return Err(self.err(ErrorKind::Invalid(init))),
203 }
204 }
205
206 Ok(Dataset {
207 proteins: data,
208 channels: self.channels,
209 })
210 }
211}