1use std::cmp::Ordering;
2
3use itertools::Itertools;
4
5use crate::{
6 annotation::model::GlycanModel,
7 chemistry::Chemical,
8 quantities::Tolerance,
9 sequence::{
10 AminoAcid, CheckedAminoAcid, Modification, Peptidoform, PlacementRule, Position,
11 SemiAmbiguous, SequenceElement, SequencePosition, SimpleLinear, SimpleModification,
12 SimpleModificationInner,
13 },
14 system::{Mass, Ratio, fraction},
15};
16
17pub type BuildingBlocks = Vec<(SequenceElement<SemiAmbiguous>, Mass)>;
19pub type TerminalBuildingBlocks = Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>;
21pub fn building_blocks(
27 amino_acids: &[AminoAcid],
28 fixed: &[(SimpleModification, Option<PlacementRule>)],
29 variable: &[(SimpleModification, Option<PlacementRule>)],
30) -> (
31 TerminalBuildingBlocks,
32 BuildingBlocks,
33 TerminalBuildingBlocks,
34) {
35 fn can_be_placed(
37 modification: &SimpleModification,
38 seq: &SequenceElement<SemiAmbiguous>,
39 position: SequencePosition,
40 ) -> bool {
41 if let SimpleModificationInner::Database { specificities, .. } = &**modification {
42 specificities.is_empty()
43 || specificities
44 .iter()
45 .any(|(rules, _, _)| PlacementRule::any_possible(rules, seq, position))
46 } else {
47 true
48 }
49 }
50 fn n_term_options(amino_acids: &[AminoAcid], rule: &PlacementRule) -> Vec<AminoAcid> {
51 match rule {
52 PlacementRule::AminoAcid(aa, Position::AnyNTerm | Position::ProteinNTerm) => {
53 amino_acids
54 .iter()
55 .filter(|a| aa.contains(a))
56 .copied()
57 .collect_vec()
58 }
59 PlacementRule::Terminal(Position::AnyNTerm | Position::ProteinNTerm) => {
60 amino_acids.iter().copied().collect_vec()
61 }
62 _ => Vec::new(),
63 }
64 }
65 fn c_term_options(amino_acids: &[AminoAcid], rule: &PlacementRule) -> Vec<AminoAcid> {
66 match rule {
67 PlacementRule::AminoAcid(aa, Position::AnyCTerm | Position::ProteinCTerm) => {
68 amino_acids
69 .iter()
70 .filter(|a| aa.contains(a))
71 .copied()
72 .collect_vec()
73 }
74 PlacementRule::Terminal(Position::AnyCTerm | Position::ProteinCTerm) => {
75 amino_acids.iter().copied().collect_vec()
76 }
77 _ => Vec::new(),
78 }
79 }
80 fn generate_terminal(
81 position: &impl Fn(&PlacementRule) -> Vec<AminoAcid>,
82 fixed: &[(SimpleModification, Option<PlacementRule>)],
83 variable: &[(SimpleModification, Option<PlacementRule>)],
84 ) -> TerminalBuildingBlocks {
85 let mut options = fixed
86 .iter()
87 .chain(variable.iter())
88 .flat_map(|(modification, rule)| {
89 rule.as_ref().map_or_else(
90 || {
91 if let SimpleModificationInner::Database { specificities, .. } =
92 &**modification
93 {
94 specificities
95 .iter()
96 .flat_map(|(rules, _, _)| {
97 rules.iter().flat_map(position).collect_vec()
98 })
99 .unique()
100 .map(|a| {
101 (
102 SequenceElement::new(CheckedAminoAcid::new(a), None),
103 modification.clone(),
104 )
105 })
106 .collect_vec()
107 } else {
108 Vec::new()
109 }
110 },
111 |rule| {
112 position(rule)
113 .into_iter()
114 .map(|a| {
115 (
116 SequenceElement::new(CheckedAminoAcid::new(a), None),
117 modification.clone(),
118 )
119 })
120 .collect_vec()
121 },
122 )
123 })
124 .flat_map(|(a, m)| {
125 let mc = m.clone();
126 a.formulas_all(
127 &[],
128 &[],
129 &mut Vec::new(),
130 false,
131 SequencePosition::default(),
132 0,
133 0,
134 &GlycanModel::DISALLOW,
135 )
136 .0
137 .iter()
138 .map(|f| f.monoisotopic_mass() + m.formula().monoisotopic_mass())
139 .map(|mass| (a.clone(), mc.clone(), mass))
140 .collect_vec()
141 })
142 .collect_vec();
143 options.sort_unstable_by(|a, b| a.2.value.total_cmp(&b.2.value));
144 options
145 }
146
147 let generate = |position| {
148 let mut options: Vec<(SequenceElement<SemiAmbiguous>, Mass)> = amino_acids
149 .iter()
150 .flat_map(|aa| {
151 let mut options = Vec::new();
152 options.extend(
153 fixed
154 .iter()
155 .filter(|&m| {
156 m.1.as_ref().map_or_else(
157 || {
158 can_be_placed(
159 &m.0,
160 &SequenceElement::new(aa.into(), None),
161 position,
162 )
163 },
164 |rule| {
165 rule.is_possible(
166 &SequenceElement::new(aa.into(), None),
167 position,
168 )
169 },
170 )
171 })
172 .map(|m| {
173 SequenceElement::new(aa.into(), None)
174 .with_simple_modification(m.0.clone())
175 }),
176 );
177 if options.is_empty() {
178 vec![SequenceElement::new(aa.into(), None)]
179 } else {
180 options
181 }
182 })
183 .flat_map(|seq| {
184 let mut options = vec![seq.clone()];
185 options.extend(
186 variable
187 .iter()
188 .filter(|&m| {
189 m.1.as_ref().map_or_else(
190 || can_be_placed(&m.0, &seq, position),
191 |rule| rule.is_possible(&seq, position),
192 )
193 })
194 .map(|m| {
195 let mut modifications = seq.modifications.clone();
196 modifications.push(Modification::Simple(m.0.clone()));
197 let mut result = SequenceElement::new(seq.aminoacid, None);
198 result.modifications = modifications;
199 result
200 }),
201 );
202 options
203 })
204 .flat_map(|s| {
205 s.formulas_all(
206 &[],
207 &[],
208 &mut Vec::new(),
209 false,
210 position,
211 0,
212 0,
213 &GlycanModel::DISALLOW,
214 )
215 .0
216 .iter()
217 .map(|f| (s.clone(), f.monoisotopic_mass()))
218 .collect_vec()
219 })
220 .collect();
221 options.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
222 options
223 };
224
225 (
227 generate_terminal(&|rule| n_term_options(amino_acids, rule), fixed, variable),
228 generate(SequencePosition::Index(0)),
229 generate_terminal(&|rule| c_term_options(amino_acids, rule), fixed, variable),
230 )
231}
232
233pub fn find_isobaric_sets(
241 mass: Mass,
242 tolerance: Tolerance<Mass>,
243 amino_acids: &[AminoAcid],
244 fixed: &[(SimpleModification, Option<PlacementRule>)],
245 variable: &[(SimpleModification, Option<PlacementRule>)],
246 base: Option<&Peptidoform<SimpleLinear>>,
247) -> IsobaricSetIterator {
248 let bounds = tolerance.bounds(mass);
249 let base_mass = base
250 .and_then(|b| {
251 b.formulas()
252 .mass_bounds()
253 .into_option()
254 .map(|(f, _)| f.monoisotopic_mass())
255 })
256 .unwrap_or_default();
257 let bounds = (bounds.0 - base_mass, bounds.1 - base_mass);
258 assert!(
259 bounds.0.value > 0.0,
260 "Cannot have a base selection that has a weight within the tolerance of the intended final mass for isobaric search."
261 );
262 let (n_term, center, c_term) = building_blocks(amino_acids, fixed, variable);
263
264 IsobaricSetIterator::new(n_term, c_term, center, bounds, base)
265}
266
267#[derive(Debug)]
269pub struct IsobaricSetIterator {
270 n_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
271 c_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
272 center: Vec<(SequenceElement<SemiAmbiguous>, Mass)>,
273 sizes: (Mass, Mass),
274 bounds: (Mass, Mass),
275 state: (Option<usize>, Option<usize>, Vec<usize>),
276 base: Option<Peptidoform<SimpleLinear>>,
277}
278
279impl IsobaricSetIterator {
280 fn new(
284 n_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
285 c_term: Vec<(SequenceElement<SemiAmbiguous>, SimpleModification, Mass)>,
286 center: Vec<(SequenceElement<SemiAmbiguous>, Mass)>,
287 bounds: (Mass, Mass),
288 base: Option<&Peptidoform<SimpleLinear>>,
289 ) -> Self {
290 let sizes = (center.first().unwrap().1, center.last().unwrap().1);
291 let mut iter = Self {
292 n_term,
293 c_term,
294 center,
295 sizes,
296 bounds,
297 state: (None, None, Vec::new()),
298 base: base.cloned(),
299 };
300 while iter.current_mass() < iter.bounds.0 - iter.sizes.0 {
301 iter.state.2.push(0);
302 }
303 iter
304 }
305
306 fn current_mass(&self) -> Mass {
307 self.state.0.map(|i| self.n_term[i].2).unwrap_or_default()
308 + self.state.1.map(|i| self.c_term[i].2).unwrap_or_default()
309 + self
310 .state
311 .2
312 .iter()
313 .copied()
314 .map(|i| self.center[i].1)
315 .sum::<Mass>()
316 }
317
318 fn mass_fits(&self) -> Ordering {
319 let mass = self.current_mass();
320 if mass < self.bounds.0 {
321 Ordering::Less
322 } else if mass > self.bounds.1 {
323 Ordering::Greater
324 } else {
325 Ordering::Equal
326 }
327 }
328
329 fn peptide(&self) -> Peptidoform<SimpleLinear> {
332 let mut sequence = Vec::with_capacity(
333 self.base.as_ref().map(Peptidoform::len).unwrap_or_default()
334 + self.state.2.len()
335 + usize::from(self.state.0.is_some())
336 + usize::from(self.state.1.is_some()),
337 );
338 if self
339 .base
340 .as_ref()
341 .is_some_and(|b| !b.get_n_term().is_empty())
342 {
343 sequence.push(self.base.as_ref().unwrap().sequence()[0].clone());
344 } else if let Some(n) = self.state.0.map(|i| self.n_term[i].clone()) {
345 sequence.push(n.0.into());
346 }
347 if let Some(base) = &self.base {
348 let n = usize::from(base.get_n_term().is_empty());
349 let c = usize::from(base.get_c_term().is_empty());
350 sequence.extend(base.sequence()[n..base.len() - n - c].iter().cloned());
351 }
352 sequence.extend(
353 self.state
354 .2
355 .iter()
356 .copied()
357 .map(|i| self.center[i].0.clone().into()),
358 );
359 if self
360 .base
361 .as_ref()
362 .is_some_and(|b| !b.get_c_term().is_empty())
363 {
364 sequence.push(
365 self.base
366 .as_ref()
367 .unwrap()
368 .sequence()
369 .last()
370 .unwrap()
371 .clone(),
372 );
373 } else if let Some(c) = self.state.1.map(|i| self.c_term[i].clone()) {
374 sequence.push(c.0.into());
375 }
376 Peptidoform::new(sequence)
377 .n_term(self.base.as_ref().map_or_else(
378 || {
379 self.state.0.map_or(Vec::new(), |i| {
380 vec![Modification::Simple(self.n_term[i].1.clone())]
381 })
382 },
383 |b| b.get_n_term().to_vec(),
384 ))
385 .c_term(self.base.as_ref().map_or_else(
386 || {
387 self.state.0.map_or(Vec::new(), |i| {
388 vec![Modification::Simple(self.c_term[i].1.clone())]
389 })
390 },
391 |b| b.get_c_term().to_vec(),
392 ))
393 }
394
395 fn reset_center_state(&mut self) {
397 self.state.2.clear();
398 while self.current_mass() < self.bounds.0 - self.sizes.0 {
399 self.state.2.push(0);
400 }
401 }
402}
403
404impl Iterator for IsobaricSetIterator {
405 type Item = Peptidoform<SimpleLinear>;
406 fn next(&mut self) -> Option<Self::Item> {
407 loop {
408 loop {
410 while !self.state.2.is_empty() {
414 while !self.state.2.iter().all(|s| *s == self.center.len() - 1) {
417 let mut level = self.state.2.len() - 1;
418 loop {
419 if self.state.2[level] == self.center.len() - 1 {
420 if level == 0 {
421 break;
422 }
423 level -= 1;
424 } else {
425 self.state.2[level] += 1;
427 for l in level + 1..self.state.2.len() {
429 self.state.2[l] = self.state.2[level];
430 }
431 match self.mass_fits() {
432 Ordering::Greater => {
433 if level == 0 {
435 break;
436 }
437 level -= 1;
438 }
439 Ordering::Equal => {
440 return Some(self.peptide());
441 }
442 Ordering::Less => {
443 if self.state.2[0..level]
446 .iter()
447 .map(|i| self.center[*i].1)
448 .sum::<Mass>()
449 + Ratio::new::<fraction>(
450 (self.state.2.len() - level) as f64,
451 ) * self.sizes.1
452 > self.bounds.0
453 {
454 level = self.state.2.len() - 1;
455 }
456 }
457 }
458 }
459 }
460 }
461 self.state.2.pop();
462 if self.sizes.1 * Ratio::new::<fraction>(self.state.2.len() as f64)
464 < self.bounds.0
465 {
466 break;
467 }
468 for level in 0..self.state.2.len() {
470 self.state.2[level] = 0;
471 }
472 }
473
474 if let Some(c) = self.state.1 {
476 if c + 1 >= self.c_term.len() {
477 break;
478 }
479 self.state.1 = Some(c + 1);
480 } else if self.c_term.is_empty()
481 || self
482 .base
483 .as_ref()
484 .is_some_and(|b| !b.get_c_term().is_empty())
485 {
486 break;
488 } else {
489 self.state.1 = Some(0);
490 }
491 self.reset_center_state();
492 }
493 if let Some(n) = self.state.0 {
495 if n + 1 >= self.n_term.len() {
496 break;
497 }
498 self.state.0 = Some(n + 1);
499 } else if self.n_term.is_empty()
500 || self
501 .base
502 .as_ref()
503 .is_some_and(|b| !b.get_n_term().is_empty())
504 {
505 break;
507 } else {
508 self.state.1 = Some(0);
509 }
510 self.reset_center_state();
511 }
512 None
513 }
514}
515
516#[cfg(test)]
517#[expect(clippy::missing_panics_doc)]
518mod tests {
519
520 use super::*;
521 #[test]
522 fn simple_isobaric_sets() {
523 let pep = Peptidoform::pro_forma("AG", None)
524 .unwrap()
525 .into_unambiguous()
526 .unwrap();
527 let sets: Vec<Peptidoform<SimpleLinear>> = find_isobaric_sets(
528 pep.bare_formula().monoisotopic_mass(),
529 Tolerance::new_ppm(10.0),
530 AminoAcid::UNIQUE_MASS_AMINO_ACIDS,
531 &[],
532 &[],
533 None,
534 )
535 .collect();
536 assert_eq!(
537 &sets,
538 &[
539 Peptidoform::pro_forma("GA", None)
540 .unwrap()
541 .into_simple_linear()
542 .unwrap(),
543 Peptidoform::pro_forma("Q", None)
544 .unwrap()
545 .into_simple_linear()
546 .unwrap(),
547 ]
548 );
549 }
550}