1use super::*;
4#[cfg(feature = "serialization")]
5use serde::{Deserialize, Serialize};
6use std::collections::HashSet;
7
8#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
10#[derive(Debug, Copy, Clone, PartialEq, PartialOrd)]
11pub enum ProteinFilter {
12 SpectralCounts(u16),
14 SequenceCounts(u16),
16 ExcludeReverse,
19}
20
21#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
30#[derive(Debug, Clone, PartialEq, PartialOrd)]
31pub enum PeptideFilter<'a> {
32 SequenceMatch(&'a str),
34 SequenceExclude(&'a str),
36 TotalIntensity(u32),
38
39 TotalIntensityChannels(Vec<usize>, u32),
42
43 ChannelCV(Vec<usize>, f64),
48
49 ChannelIntensity(usize, u32),
54
55 Purity(f32),
57
58 Tryptic,
60 Unique,
62}
63
64#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
66#[derive(Debug, Clone, PartialEq, PartialOrd)]
67pub struct Filter<'a> {
68 #[cfg_attr(feature = "serde", serde(borrow))]
69 peptide_filters: Vec<PeptideFilter<'a>>,
70 protein_filters: Vec<ProteinFilter>,
71}
72
73impl<'a> Default for Filter<'a> {
74 fn default() -> Self {
76 Filter {
77 peptide_filters: Vec::new(),
78 protein_filters: Vec::new(),
79 }
80 }
81}
82
83impl<'a> Filter<'a> {
84 pub fn add_protein_filter(mut self, filter: ProteinFilter) -> Self {
88 self.protein_filters.push(filter);
89 self
90 }
91
92 pub fn add_peptide_filter(mut self, filter: PeptideFilter<'a>) -> Self {
96 self.peptide_filters.push(filter);
97 self
98 }
99
100 pub fn tryptic_regex() -> regex::Regex {
101 regex::RegexBuilder::new(r#"(R|K|-)\..*((R|K)\..|.-)"#)
102 .build()
103 .unwrap()
104 }
105
106 pub fn filter_dataset(&self, dataset: Dataset) -> Dataset {
108 let reg = Self::tryptic_regex();
109 Dataset {
110 channels: dataset.channels,
111 proteins: dataset
112 .proteins
113 .into_iter()
114 .filter_map(|prot| self.filter_protein(prot, ®))
115 .collect(),
116 }
117 }
118
119 pub fn filter_protein(
126 &self,
127 mut protein: Protein,
128 tryptic_regex: ®ex::Regex,
129 ) -> Option<Protein> {
130 for filter in &self.protein_filters {
132 match filter {
133 ProteinFilter::SequenceCounts(n) => {
134 if protein.sequence_count < *n {
135 return None;
136 }
137 }
138 ProteinFilter::SpectralCounts(n) => {
139 if protein.spectral_count < *n {
140 return None;
141 }
142 }
143 ProteinFilter::ExcludeReverse => {
144 if protein.accession.contains("Reverse") {
145 return None;
146 }
147 }
148 }
149 }
150
151 let mut filtered = Vec::new();
152
153 for peptide in protein.peptides {
156 let mut pass = true;
157 for filter in &self.peptide_filters {
158 match filter {
159 PeptideFilter::SequenceExclude(pat) => {
160 if peptide.sequence.contains(pat) {
161 pass = false;
162 break;
163 }
164 }
165 PeptideFilter::SequenceMatch(pat) => {
166 if !peptide.sequence.contains(pat) {
167 pass = false;
168 break;
169 }
170 }
171 PeptideFilter::TotalIntensity(n) => {
172 if peptide.values.iter().sum::<u32>() < *n {
173 pass = false;
174 break;
175 }
176 }
177 PeptideFilter::Tryptic => {
178 if !tryptic_regex.is_match(&peptide.sequence) {
179 pass = false;
180 break;
181 }
182 }
183 PeptideFilter::Unique => {
184 if !peptide.unique {
185 pass = false;
186 break;
187 }
188 }
189 PeptideFilter::Purity(cutoff) => {
190 if peptide.purity < *cutoff {
191 pass = false;
192 break;
193 }
194 }
195 PeptideFilter::ChannelCV(channels, cutoff) => {
196 let mut v = Vec::new();
197 for chan in channels.iter() {
198 if chan - 1 < peptide.values.len() {
199 v.push(peptide.values[chan - 1]);
200 }
201 }
202 if util::cv(&v) >= *cutoff {
203 pass = false;
204 break;
205 }
206 }
207 PeptideFilter::ChannelIntensity(channel, cutoff) => {
208 if channel - 1 < peptide.values.len()
210 && peptide.values[channel - 1] < *cutoff
211 {
212 pass = false;
213 break;
214 }
215 }
216 PeptideFilter::TotalIntensityChannels(chan, cutoff) => {
217 let mut sum = 0;
218 for c in chan {
219 if c - 1 < peptide.values.len() {
220 sum += peptide.values[*c - 1];
221 }
222 }
223 if sum < *cutoff {
224 pass = false;
225 break;
226 }
227 }
228 }
229 }
230
231 if pass {
232 filtered.push(peptide)
233 }
234 }
235 if filtered.len() == 0 {
237 return None;
238 }
239
240 protein.peptides = filtered;
241
242 let spec = protein.peptides.len() as u16;
243 let seq = protein
244 .peptides
245 .iter()
246 .map(|pep| &pep.sequence)
247 .collect::<HashSet<_>>()
248 .len() as u16;
249
250 protein.spectral_count = spec;
251 protein.sequence_count = seq;
252
253 for filter in &self.protein_filters {
256 match filter {
257 ProteinFilter::SequenceCounts(n) => {
258 if seq < *n {
259 return None;
260 }
261 }
262 ProteinFilter::SpectralCounts(n) => {
263 if spec < *n {
264 return None;
265 }
266 }
267 _ => {}
268 }
269 }
270
271 Some(protein)
272 }
273}
274
275#[cfg(test)]
276mod test {
277 use super::*;
278
279 #[test]
280 fn total_intensity_channels() {
281 let p1 = Peptide {
282 sequence: "aa".into(),
283 values: vec![1, 2998, 5000, 84, 4738, 9384],
284 unique: true,
285 scan: 0,
286 purity: 1.0,
287 };
288 let p2 = Peptide {
289 sequence: "aaa".into(),
290 values: vec![10000, 0, 433, 61346, 41, 5555],
291 unique: true,
292 scan: 0,
293 purity: 1.0,
294 };
295
296 let p3 = Peptide {
297 sequence: "aaaa".into(),
298 values: vec![1, 2999, 0, 0, 0, 0],
299 unique: true,
300 scan: 0,
301 purity: 1.0,
302 };
303
304 let prot = Protein {
305 accession: "".into(),
306 description: "".into(),
307 spectral_count: 10,
308 sequence_count: 3,
309 sequence_coverage: 0.3,
310 molecular_weight: 10,
311 peptides: vec![p1.clone(), p2.clone(), p3.clone()],
312 channels: 6,
313 };
314
315 let mut fil = Filter {
316 peptide_filters: Vec::new(),
317 protein_filters: Vec::new(),
318 };
319
320 fil = fil.add_peptide_filter(PeptideFilter::TotalIntensityChannels(vec![1, 2], 3000));
321 let p = fil.filter_protein(prot, &Filter::tryptic_regex()).unwrap();
322 assert_eq!(p.peptides.len(), 2);
323 assert_eq!(p.sequence_count, 2);
324 assert_eq!(p.peptides, vec![p2.clone(), p3.clone()]);
325 }
326}