census_proteomics/
filter.rs

1//! Utilities for filtering datasets and proteins based on a set of composable
2//! rules
3use super::*;
4#[cfg(feature = "serialization")]
5use serde::{Deserialize, Serialize};
6use std::collections::HashSet;
7
8/// Protein-level filter
9#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
10#[derive(Debug, Copy, Clone, PartialEq, PartialOrd)]
11pub enum ProteinFilter {
12    /// Include only proteins that have spectral counts >= N
13    SpectralCounts(u16),
14    /// Include only proteins that have sequence counts >= N
15    SequenceCounts(u16),
16    /// Include only proteins that do not have "Reverse" in their
17    /// UniProt accession
18    ExcludeReverse,
19}
20
21/// Peptide-level filter
22///
23/// Filter individual peptides within a protein based on sequence mactches,
24/// total intensities, coefficient of variance between channels, or intensity
25/// values on specified channels.
26///
27/// Peptide can also be filtered based on whether they have 2 tryptic ends,
28/// or if they are unique.
29#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
30#[derive(Debug, Clone, PartialEq, PartialOrd)]
31pub enum PeptideFilter<'a> {
32    /// Include only peptides that have a sequence matching the pattern
33    SequenceMatch(&'a str),
34    /// Include only peptides that do NOT have a sequence matching the pattern
35    SequenceExclude(&'a str),
36    /// Include only peptides that have a total ion itensity >= N
37    TotalIntensity(u32),
38
39    /// Include only peptides where the total intensity in a set of channels
40    /// >= N
41    TotalIntensityChannels(Vec<usize>, u32),
42
43    /// ChannelCV(channels, N)
44    ///
45    /// Include only peptides where the coeff. of variance is < N between
46    /// the specified channels
47    ChannelCV(Vec<usize>, f64),
48
49    /// ChannelIntensity(channel, cutoff)
50    ///
51    /// Include only peptides that have an ion intensity >= N
52    /// in the specified channel
53    ChannelIntensity(usize, u32),
54
55    /// TMT purity
56    Purity(f32),
57
58    /// Include only tryptic peptides
59    Tryptic,
60    /// Include only unique peptides
61    Unique,
62}
63
64/// Provides filtering functionality on datasets and proteins
65#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
66#[derive(Debug, Clone, PartialEq, PartialOrd)]
67pub struct Filter<'a> {
68    #[cfg_attr(feature = "serde", serde(borrow))]
69    peptide_filters: Vec<PeptideFilter<'a>>,
70    protein_filters: Vec<ProteinFilter>,
71}
72
73impl<'a> Default for Filter<'a> {
74    /// Construct a filter with no rules
75    fn default() -> Self {
76        Filter {
77            peptide_filters: Vec::new(),
78            protein_filters: Vec::new(),
79        }
80    }
81}
82
83impl<'a> Filter<'a> {
84    /// Add a new `ProteinFilter` to the `Filter` object.
85    ///
86    /// This follows the Builder pattern
87    pub fn add_protein_filter(mut self, filter: ProteinFilter) -> Self {
88        self.protein_filters.push(filter);
89        self
90    }
91
92    /// Add a new `PeptideFilter` to the `Filter` object.
93    ///
94    /// This follows the Builder pattern
95    pub fn add_peptide_filter(mut self, filter: PeptideFilter<'a>) -> Self {
96        self.peptide_filters.push(filter);
97        self
98    }
99
100    pub fn tryptic_regex() -> regex::Regex {
101        regex::RegexBuilder::new(r#"(R|K|-)\..*((R|K)\..|.-)"#)
102            .build()
103            .unwrap()
104    }
105
106    /// Return a new `Dataset` that only contains filtered `Protein`'s
107    pub fn filter_dataset(&self, dataset: Dataset) -> Dataset {
108        let reg = Self::tryptic_regex();
109        Dataset {
110            channels: dataset.channels,
111            proteins: dataset
112                .proteins
113                .into_iter()
114                .filter_map(|prot| self.filter_protein(prot, &reg))
115                .collect(),
116        }
117    }
118
119    /// Filter a `Protein`, returning `Some` if it passes any
120    /// `ProteinFilter`s that need to be applied or `None` if the protein
121    /// fails a given `ProteinFilter`.
122    ///
123    /// The peptides associated with the returned `Protein` object aree those
124    /// that passed any given `PeptideFilter`s.
125    pub fn filter_protein(
126        &self,
127        mut protein: Protein,
128        tryptic_regex: &regex::Regex,
129    ) -> Option<Protein> {
130        // First run through any protein level filters
131        for filter in &self.protein_filters {
132            match filter {
133                ProteinFilter::SequenceCounts(n) => {
134                    if protein.sequence_count < *n {
135                        return None;
136                    }
137                }
138                ProteinFilter::SpectralCounts(n) => {
139                    if protein.spectral_count < *n {
140                        return None;
141                    }
142                }
143                ProteinFilter::ExcludeReverse => {
144                    if protein.accession.contains("Reverse") {
145                        return None;
146                    }
147                }
148            }
149        }
150
151        let mut filtered = Vec::new();
152
153        // Iterate through all of the peptides in the protein container,
154        // applying relevant filters as we go.
155        for peptide in protein.peptides {
156            let mut pass = true;
157            for filter in &self.peptide_filters {
158                match filter {
159                    PeptideFilter::SequenceExclude(pat) => {
160                        if peptide.sequence.contains(pat) {
161                            pass = false;
162                            break;
163                        }
164                    }
165                    PeptideFilter::SequenceMatch(pat) => {
166                        if !peptide.sequence.contains(pat) {
167                            pass = false;
168                            break;
169                        }
170                    }
171                    PeptideFilter::TotalIntensity(n) => {
172                        if peptide.values.iter().sum::<u32>() < *n {
173                            pass = false;
174                            break;
175                        }
176                    }
177                    PeptideFilter::Tryptic => {
178                        if !tryptic_regex.is_match(&peptide.sequence) {
179                            pass = false;
180                            break;
181                        }
182                    }
183                    PeptideFilter::Unique => {
184                        if !peptide.unique {
185                            pass = false;
186                            break;
187                        }
188                    }
189                    PeptideFilter::Purity(cutoff) => {
190                        if peptide.purity < *cutoff {
191                            pass = false;
192                            break;
193                        }
194                    }
195                    PeptideFilter::ChannelCV(channels, cutoff) => {
196                        let mut v = Vec::new();
197                        for chan in channels.iter() {
198                            if chan - 1 < peptide.values.len() {
199                                v.push(peptide.values[chan - 1]);
200                            }
201                        }
202                        if util::cv(&v) >= *cutoff {
203                            pass = false;
204                            break;
205                        }
206                    }
207                    PeptideFilter::ChannelIntensity(channel, cutoff) => {
208                        // Ignore incorrect channel values
209                        if channel - 1 < peptide.values.len()
210                            && peptide.values[channel - 1] < *cutoff
211                        {
212                            pass = false;
213                            break;
214                        }
215                    }
216                    PeptideFilter::TotalIntensityChannels(chan, cutoff) => {
217                        let mut sum = 0;
218                        for c in chan {
219                            if c - 1 < peptide.values.len() {
220                                sum += peptide.values[*c - 1];
221                            }
222                        }
223                        if sum < *cutoff {
224                            pass = false;
225                            break;
226                        }
227                    }
228                }
229            }
230
231            if pass {
232                filtered.push(peptide)
233            }
234        }
235        // We must have at least a single peptide...
236        if filtered.len() == 0 {
237            return None;
238        }
239
240        protein.peptides = filtered;
241
242        let spec = protein.peptides.len() as u16;
243        let seq = protein
244            .peptides
245            .iter()
246            .map(|pep| &pep.sequence)
247            .collect::<HashSet<_>>()
248            .len() as u16;
249
250        protein.spectral_count = spec;
251        protein.sequence_count = seq;
252
253        // Second pass through protein filters, in case we no longer have
254        // enough filtered peptides
255        for filter in &self.protein_filters {
256            match filter {
257                ProteinFilter::SequenceCounts(n) => {
258                    if seq < *n {
259                        return None;
260                    }
261                }
262                ProteinFilter::SpectralCounts(n) => {
263                    if spec < *n {
264                        return None;
265                    }
266                }
267                _ => {}
268            }
269        }
270
271        Some(protein)
272    }
273}
274
275#[cfg(test)]
276mod test {
277    use super::*;
278
279    #[test]
280    fn total_intensity_channels() {
281        let p1 = Peptide {
282            sequence: "aa".into(),
283            values: vec![1, 2998, 5000, 84, 4738, 9384],
284            unique: true,
285            scan: 0,
286            purity: 1.0,
287        };
288        let p2 = Peptide {
289            sequence: "aaa".into(),
290            values: vec![10000, 0, 433, 61346, 41, 5555],
291            unique: true,
292            scan: 0,
293            purity: 1.0,
294        };
295
296        let p3 = Peptide {
297            sequence: "aaaa".into(),
298            values: vec![1, 2999, 0, 0, 0, 0],
299            unique: true,
300            scan: 0,
301            purity: 1.0,
302        };
303
304        let prot = Protein {
305            accession: "".into(),
306            description: "".into(),
307            spectral_count: 10,
308            sequence_count: 3,
309            sequence_coverage: 0.3,
310            molecular_weight: 10,
311            peptides: vec![p1.clone(), p2.clone(), p3.clone()],
312            channels: 6,
313        };
314
315        let mut fil = Filter {
316            peptide_filters: Vec::new(),
317            protein_filters: Vec::new(),
318        };
319
320        fil = fil.add_peptide_filter(PeptideFilter::TotalIntensityChannels(vec![1, 2], 3000));
321        let p = fil.filter_protein(prot, &Filter::tryptic_regex()).unwrap();
322        assert_eq!(p.peptides.len(), 2);
323        assert_eq!(p.sequence_count, 2);
324        assert_eq!(p.peptides, vec![p2.clone(), p3.clone()]);
325    }
326}