rustyms/
isobaric_sets.rs

1use std::cmp::Ordering;
2
3use itertools::Itertools;
4
5use crate::{
6    annotation::model::GlycanModel,
7    chemistry::Chemical,
8    quantities::Tolerance,
9    sequence::{
10        AminoAcid, CheckedAminoAcid, Modification, Peptidoform, PlacementRule, Position,
11        SemiAmbiguous, SequenceElement, SequencePosition, SimpleLinear, SimpleModification,
12        SimpleModificationInner,
13    },
14    system::{Mass, Ratio, fraction},
15};
16
17/// A list of building blocks for a sequence defined by its sequence elements and its mass.
18pub type BuildingBlocks = Vec<(SequenceElement<SemiAmbiguous>, Mass)>;
19/// A list of all combinations of terminal modifications and their accompanying amino acid
20pub type TerminalBuildingBlocks = Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>;
21/// Get the possible building blocks for sequences based on the given modifications.
22/// Useful for any automated sequence generation, like isobaric set generation or de novo sequencing.
23/// The result is for each location (N term, center, C term) the list of all possible building blocks with its mass, sorted on mass.
24/// # Panics
25/// Panics if any of the modifications does not have a defined mass.
26pub fn building_blocks(
27    amino_acids: &[AminoAcid],
28    fixed: &[(SimpleModification, Option<PlacementRule>)],
29    variable: &[(SimpleModification, Option<PlacementRule>)],
30) -> (
31    TerminalBuildingBlocks,
32    BuildingBlocks,
33    TerminalBuildingBlocks,
34) {
35    /// Enforce the placement rules of predefined modifications.
36    fn can_be_placed(
37        modification: &SimpleModification,
38        seq: &SequenceElement<SemiAmbiguous>,
39        position: SequencePosition,
40    ) -> bool {
41        if let SimpleModificationInner::Database { specificities, .. } = &**modification {
42            specificities.is_empty()
43                || specificities
44                    .iter()
45                    .any(|(rules, _, _)| PlacementRule::any_possible(rules, seq, position))
46        } else {
47            true
48        }
49    }
50    fn n_term_options(amino_acids: &[AminoAcid], rule: &PlacementRule) -> Vec<AminoAcid> {
51        match rule {
52            PlacementRule::AminoAcid(aa, Position::AnyNTerm | Position::ProteinNTerm) => {
53                amino_acids
54                    .iter()
55                    .filter(|a| aa.contains(a))
56                    .copied()
57                    .collect_vec()
58            }
59            PlacementRule::Terminal(Position::AnyNTerm | Position::ProteinNTerm) => {
60                amino_acids.iter().copied().collect_vec()
61            }
62            _ => Vec::new(),
63        }
64    }
65    fn c_term_options(amino_acids: &[AminoAcid], rule: &PlacementRule) -> Vec<AminoAcid> {
66        match rule {
67            PlacementRule::AminoAcid(aa, Position::AnyCTerm | Position::ProteinCTerm) => {
68                amino_acids
69                    .iter()
70                    .filter(|a| aa.contains(a))
71                    .copied()
72                    .collect_vec()
73            }
74            PlacementRule::Terminal(Position::AnyCTerm | Position::ProteinCTerm) => {
75                amino_acids.iter().copied().collect_vec()
76            }
77            _ => Vec::new(),
78        }
79    }
80    fn generate_terminal(
81        position: &impl Fn(&PlacementRule) -> Vec<AminoAcid>,
82        fixed: &[(SimpleModification, Option<PlacementRule>)],
83        variable: &[(SimpleModification, Option<PlacementRule>)],
84    ) -> TerminalBuildingBlocks {
85        let mut options = fixed
86            .iter()
87            .chain(variable.iter())
88            .flat_map(|(modification, rule)| {
89                rule.as_ref().map_or_else(
90                    || {
91                        if let SimpleModificationInner::Database { specificities, .. } =
92                            &**modification
93                        {
94                            specificities
95                                .iter()
96                                .flat_map(|(rules, _, _)| {
97                                    rules.iter().flat_map(position).collect_vec()
98                                })
99                                .unique()
100                                .map(|a| {
101                                    (
102                                        SequenceElement::new(CheckedAminoAcid::new(a), None),
103                                        modification.clone(),
104                                    )
105                                })
106                                .collect_vec()
107                        } else {
108                            Vec::new()
109                        }
110                    },
111                    |rule| {
112                        position(rule)
113                            .into_iter()
114                            .map(|a| {
115                                (
116                                    SequenceElement::new(CheckedAminoAcid::new(a), None),
117                                    modification.clone(),
118                                )
119                            })
120                            .collect_vec()
121                    },
122                )
123            })
124            .flat_map(|(a, m)| {
125                let mc = m.clone();
126                a.formulas_all(
127                    &[],
128                    &[],
129                    &mut Vec::new(),
130                    false,
131                    SequencePosition::default(),
132                    0,
133                    0,
134                    &GlycanModel::DISALLOW,
135                )
136                .0
137                .iter()
138                .map(|f| f.monoisotopic_mass() + m.formula().monoisotopic_mass())
139                .map(|mass| (a.clone(), mc.clone(), mass))
140                .collect_vec()
141            })
142            .collect_vec();
143        options.sort_unstable_by(|a, b| a.2.value.total_cmp(&b.2.value));
144        options
145    }
146
147    let generate = |position| {
148        let mut options: Vec<(SequenceElement<SemiAmbiguous>, Mass)> = amino_acids
149            .iter()
150            .flat_map(|aa| {
151                let mut options = Vec::new();
152                options.extend(
153                    fixed
154                        .iter()
155                        .filter(|&m| {
156                            m.1.as_ref().map_or_else(
157                                || {
158                                    can_be_placed(
159                                        &m.0,
160                                        &SequenceElement::new(aa.into(), None),
161                                        position,
162                                    )
163                                },
164                                |rule| {
165                                    rule.is_possible(
166                                        &SequenceElement::new(aa.into(), None),
167                                        position,
168                                    )
169                                },
170                            )
171                        })
172                        .map(|m| {
173                            SequenceElement::new(aa.into(), None)
174                                .with_simple_modification(m.0.clone())
175                        }),
176                );
177                if options.is_empty() {
178                    vec![SequenceElement::new(aa.into(), None)]
179                } else {
180                    options
181                }
182            })
183            .flat_map(|seq| {
184                let mut options = vec![seq.clone()];
185                options.extend(
186                    variable
187                        .iter()
188                        .filter(|&m| {
189                            m.1.as_ref().map_or_else(
190                                || can_be_placed(&m.0, &seq, position),
191                                |rule| rule.is_possible(&seq, position),
192                            )
193                        })
194                        .map(|m| {
195                            let mut modifications = seq.modifications.clone();
196                            modifications.push(Modification::Simple(m.0.clone()));
197                            let mut result = SequenceElement::new(seq.aminoacid, None);
198                            result.modifications = modifications;
199                            result
200                        }),
201                );
202                options
203            })
204            .flat_map(|s| {
205                s.formulas_all(
206                    &[],
207                    &[],
208                    &mut Vec::new(),
209                    false,
210                    position,
211                    0,
212                    0,
213                    &GlycanModel::DISALLOW,
214                )
215                .0
216                .iter()
217                .map(|f| (s.clone(), f.monoisotopic_mass()))
218                .collect_vec()
219            })
220            .collect();
221        options.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
222        options
223    };
224
225    // Create the building blocks
226    (
227        generate_terminal(&|rule| n_term_options(amino_acids, rule), fixed, variable),
228        generate(SequencePosition::Index(0)),
229        generate_terminal(&|rule| c_term_options(amino_acids, rule), fixed, variable),
230    )
231}
232
233/// Find the isobaric sets for the given mass with the given modifications and ppm error.
234/// The modifications are placed on any location they are allowed based on the given placement
235/// rules, so using any modifications which provide those is advised. If the provided [`Peptidoform`]
236/// has multiple formulas, it uses the formula with the lowest monoisotopic mass.
237/// # Panics
238/// Panics if any of the modifications does not have a defined mass. Or if the weight of the
239/// base selection is already in the tolerance of the given mass.
240pub fn find_isobaric_sets(
241    mass: Mass,
242    tolerance: Tolerance<Mass>,
243    amino_acids: &[AminoAcid],
244    fixed: &[(SimpleModification, Option<PlacementRule>)],
245    variable: &[(SimpleModification, Option<PlacementRule>)],
246    base: Option<&Peptidoform<SimpleLinear>>,
247) -> IsobaricSetIterator {
248    let bounds = tolerance.bounds(mass);
249    let base_mass = base
250        .and_then(|b| {
251            b.formulas()
252                .mass_bounds()
253                .into_option()
254                .map(|(f, _)| f.monoisotopic_mass())
255        })
256        .unwrap_or_default();
257    let bounds = (bounds.0 - base_mass, bounds.1 - base_mass);
258    assert!(
259        bounds.0.value > 0.0,
260        "Cannot have a base selection that has a weight within the tolerance of the intended final mass for isobaric search."
261    );
262    let (n_term, center, c_term) = building_blocks(amino_acids, fixed, variable);
263
264    IsobaricSetIterator::new(n_term, c_term, center, bounds, base)
265}
266
267/// Iteratively generate isobaric sets based on the given settings.
268#[derive(Debug)]
269pub struct IsobaricSetIterator {
270    n_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
271    c_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
272    center: Vec<(SequenceElement<SemiAmbiguous>, Mass)>,
273    sizes: (Mass, Mass),
274    bounds: (Mass, Mass),
275    state: (Option<usize>, Option<usize>, Vec<usize>),
276    base: Option<Peptidoform<SimpleLinear>>,
277}
278
279impl IsobaricSetIterator {
280    /// `n_term` & `c_term` are the possible combinations of terminal modifications with their valid placements and the full mass of this combo
281    /// # Panics
282    /// If there is not at least one element in the `center` list.
283    fn new(
284        n_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
285        c_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
286        center: Vec<(SequenceElement<SemiAmbiguous>, Mass)>,
287        bounds: (Mass, Mass),
288        base: Option<&Peptidoform<SimpleLinear>>,
289    ) -> Self {
290        let sizes = (center.first().unwrap().1, center.last().unwrap().1);
291        let mut iter = Self {
292            n_term,
293            c_term,
294            center,
295            sizes,
296            bounds,
297            state: (None, None, Vec::new()),
298            base: base.cloned(),
299        };
300        while iter.current_mass() < iter.bounds.0 - iter.sizes.0 {
301            iter.state.2.push(0);
302        }
303        iter
304    }
305
306    fn current_mass(&self) -> Mass {
307        self.state.0.map(|i| self.n_term[i].2).unwrap_or_default()
308            + self.state.1.map(|i| self.c_term[i].2).unwrap_or_default()
309            + self
310                .state
311                .2
312                .iter()
313                .copied()
314                .map(|i| self.center[i].1)
315                .sum::<Mass>()
316    }
317
318    fn mass_fits(&self) -> Ordering {
319        let mass = self.current_mass();
320        if mass < self.bounds.0 {
321            Ordering::Less
322        } else if mass > self.bounds.1 {
323            Ordering::Greater
324        } else {
325            Ordering::Equal
326        }
327    }
328
329    /// # Panics
330    /// If the base sequence is empty.
331    fn peptide(&self) -> Peptidoform<SimpleLinear> {
332        let mut sequence = Vec::with_capacity(
333            self.base.as_ref().map(Peptidoform::len).unwrap_or_default()
334                + self.state.2.len()
335                + usize::from(self.state.0.is_some())
336                + usize::from(self.state.1.is_some()),
337        );
338        if self
339            .base
340            .as_ref()
341            .is_some_and(|b| !b.get_n_term().is_empty())
342        {
343            sequence.push(self.base.as_ref().unwrap().sequence()[0].clone());
344        } else if let Some(n) = self.state.0.map(|i| self.n_term[i].clone()) {
345            sequence.push(n.0.into());
346        }
347        if let Some(base) = &self.base {
348            let n = usize::from(base.get_n_term().is_empty());
349            let c = usize::from(base.get_c_term().is_empty());
350            sequence.extend(base.sequence()[n..base.len() - n - c].iter().cloned());
351        }
352        sequence.extend(
353            self.state
354                .2
355                .iter()
356                .copied()
357                .map(|i| self.center[i].0.clone().into()),
358        );
359        if self
360            .base
361            .as_ref()
362            .is_some_and(|b| !b.get_c_term().is_empty())
363        {
364            sequence.push(
365                self.base
366                    .as_ref()
367                    .unwrap()
368                    .sequence()
369                    .last()
370                    .unwrap()
371                    .clone(),
372            );
373        } else if let Some(c) = self.state.1.map(|i| self.c_term[i].clone()) {
374            sequence.push(c.0.into());
375        }
376        Peptidoform::new(sequence)
377            .n_term(self.base.as_ref().map_or_else(
378                || {
379                    self.state.0.map_or(Vec::new(), |i| {
380                        vec![Modification::Simple(self.n_term[i].1.clone())]
381                    })
382                },
383                |b| b.get_n_term().to_vec(),
384            ))
385            .c_term(self.base.as_ref().map_or_else(
386                || {
387                    self.state.0.map_or(Vec::new(), |i| {
388                        vec![Modification::Simple(self.c_term[i].1.clone())]
389                    })
390                },
391                |b| b.get_c_term().to_vec(),
392            ))
393    }
394
395    /// Reset the state for the center selection
396    fn reset_center_state(&mut self) {
397        self.state.2.clear();
398        while self.current_mass() < self.bounds.0 - self.sizes.0 {
399            self.state.2.push(0);
400        }
401    }
402}
403
404impl Iterator for IsobaricSetIterator {
405    type Item = Peptidoform<SimpleLinear>;
406    fn next(&mut self) -> Option<Self::Item> {
407        loop {
408            // N terminal loop
409            loop {
410                // C terminal loop
411
412                // Main loop
413                while !self.state.2.is_empty() {
414                    // Do state + 1 at the highest level where this is still possible and check if that one fits the bounds
415                    // Until every level is full then pop and try with one fewer number of amino acids
416                    while !self.state.2.iter().all(|s| *s == self.center.len() - 1) {
417                        let mut level = self.state.2.len() - 1;
418                        loop {
419                            if self.state.2[level] == self.center.len() - 1 {
420                                if level == 0 {
421                                    break;
422                                }
423                                level -= 1;
424                            } else {
425                                // Update this level
426                                self.state.2[level] += 1;
427                                // Reset the levels above, has to start at minimal at the index of this level to prevent 'rotations' of the set to show up
428                                for l in level + 1..self.state.2.len() {
429                                    self.state.2[l] = self.state.2[level];
430                                }
431                                match self.mass_fits() {
432                                    Ordering::Greater => {
433                                        // If the mass is too great the level below will have the be changed, otherwise it could only be getting heavier with the next iteration(s)
434                                        if level == 0 {
435                                            break;
436                                        }
437                                        level -= 1;
438                                    }
439                                    Ordering::Equal => {
440                                        return Some(self.peptide());
441                                    }
442                                    Ordering::Less => {
443                                        // If there a way to reach at least the lower limit by having all the heaviest options selected try and reach them.
444                                        // Otherwise this will increase this level again next iteration.
445                                        if self.state.2[0..level]
446                                            .iter()
447                                            .map(|i| self.center[*i].1)
448                                            .sum::<Mass>()
449                                            + Ratio::new::<fraction>(
450                                                (self.state.2.len() - level) as f64,
451                                            ) * self.sizes.1
452                                            > self.bounds.0
453                                        {
454                                            level = self.state.2.len() - 1;
455                                        }
456                                    }
457                                }
458                            }
459                        }
460                    }
461                    self.state.2.pop();
462                    // Stop the search when there is no possibility for a fitting answer
463                    if self.sizes.1 * Ratio::new::<fraction>(self.state.2.len() as f64)
464                        < self.bounds.0
465                    {
466                        break;
467                    }
468                    // Reset the levels to be all 0s again
469                    for level in 0..self.state.2.len() {
470                        self.state.2[level] = 0;
471                    }
472                }
473
474                // Try the next C terminal option
475                if let Some(c) = self.state.1 {
476                    if c + 1 >= self.c_term.len() {
477                        break;
478                    }
479                    self.state.1 = Some(c + 1);
480                } else if self.c_term.is_empty()
481                    || self
482                        .base
483                        .as_ref()
484                        .is_some_and(|b| !b.get_c_term().is_empty())
485                {
486                    // If the base piece has a defined C term mod do not try possible C term mods in the isobaric generation
487                    break;
488                } else {
489                    self.state.1 = Some(0);
490                }
491                self.reset_center_state();
492            }
493            // Try the next N terminal option
494            if let Some(n) = self.state.0 {
495                if n + 1 >= self.n_term.len() {
496                    break;
497                }
498                self.state.0 = Some(n + 1);
499            } else if self.n_term.is_empty()
500                || self
501                    .base
502                    .as_ref()
503                    .is_some_and(|b| !b.get_n_term().is_empty())
504            {
505                // If the base piece has a defined N term mod do not try possible N term mods in the isobaric generation
506                break;
507            } else {
508                self.state.1 = Some(0);
509            }
510            self.reset_center_state();
511        }
512        None
513    }
514}
515
516#[cfg(test)]
517#[expect(clippy::missing_panics_doc)]
518mod tests {
519
520    use super::*;
521    #[test]
522    fn simple_isobaric_sets() {
523        let pep = Peptidoform::pro_forma("AG", None)
524            .unwrap()
525            .into_unambiguous()
526            .unwrap();
527        let sets: Vec<Peptidoform<SimpleLinear>> = find_isobaric_sets(
528            pep.bare_formula().monoisotopic_mass(),
529            Tolerance::new_ppm(10.0),
530            AminoAcid::UNIQUE_MASS_AMINO_ACIDS,
531            &[],
532            &[],
533            None,
534        )
535        .collect();
536        assert_eq!(
537            &sets,
538            &[
539                Peptidoform::pro_forma("GA", None)
540                    .unwrap()
541                    .into_simple_linear()
542                    .unwrap(),
543                Peptidoform::pro_forma("Q", None)
544                    .unwrap()
545                    .into_simple_linear()
546                    .unwrap(),
547            ]
548        );
549    }
550}