Skip to main content

openproteo_core/
conformance.rs

1//! Conformance checks for any [`crate::SpectrumSource`].
2//!
3//! This module hosts a small invariant suite that every vendor crate runs
4//! in its integration tests against a real fixture. It is *not* a
5//! validator for arbitrary mzML files; it operates on the in-memory
6//! `SpectrumRecord` stream so failures pinpoint the parser, not the
7//! writer.
8//!
9//! Invariants checked (per spectrum unless noted):
10//!
11//! 1. `mz.len() == intensity.len()`. Empty spectra are allowed because
12//!    sparse PASEF / SRM frames can legitimately yield zero peaks within
13//!    a scan range.
14//! 2. `inv_mobility_per_peak`, when present, has the same length as `mz`.
15//! 3. `total_ion_current`, when provided by the parser, equals
16//!    `sum(intensity)` within a relative tolerance.
17//! 4. `base_peak_intensity`, when provided, equals the maximum of
18//!    `intensity` within tolerance.
19//! 5. MS2+ spectra carry a [`crate::PrecursorInfo`] with at least one of
20//!    `target_mz`, `selected_mz`, or `precursor_native_id` populated.
21//! 6. Retention time is non-decreasing within the spectrum stream of one
22//!    function / acquisition. The check is per native-ID prefix so
23//!    interleaved Waters functions or Bruker MS1/MS2 frames are not
24//!    flagged.
25//! 7. The first spectrum's `index` is 0 and indices are strictly
26//!    increasing.
27//!
28//! On failure each check emits a [`ConformanceError`] with the offending
29//! `native_id` so the operator can jump straight to the problem.
30
31use std::collections::HashMap;
32
33use crate::source::SpectrumSource;
34use crate::types::SpectrumRecord;
35
36/// Relative tolerance applied to TIC / base-peak floating-point checks.
37const FLOAT_REL_TOL: f64 = 1e-4;
38
39/// Failure modes detected by [`assert_source_invariants`].
40#[derive(Debug)]
41pub enum ConformanceError {
42    /// `mz` / `intensity` arrays have mismatched lengths.
43    PeakArrayLengthMismatch {
44        native_id: String,
45        mz_len: usize,
46        intensity_len: usize,
47    },
48    /// `inv_mobility_per_peak.len()` did not match `mz.len()`.
49    MobilityArrayLengthMismatch {
50        native_id: String,
51        mz_len: usize,
52        mobility_len: usize,
53    },
54    /// `total_ion_current` did not match `sum(intensity)` within tolerance.
55    TicMismatch {
56        native_id: String,
57        declared: f64,
58        computed: f64,
59    },
60    /// `base_peak_intensity` did not match `max(intensity)` within tolerance.
61    BasePeakIntensityMismatch {
62        native_id: String,
63        declared: f64,
64        computed: f64,
65    },
66    /// MS2+ spectrum was missing precursor info.
67    MissingPrecursor { native_id: String, ms_level: u32 },
68    /// Retention time went backwards within one acquisition stream.
69    RetentionTimeNonMonotonic {
70        prefix: String,
71        previous: f64,
72        current: f64,
73        native_id: String,
74    },
75    /// Spectrum index sequence was not strictly increasing or did not start at 0.
76    IndexSequence {
77        native_id: String,
78        previous: Option<usize>,
79        current: usize,
80    },
81    /// Spectrum had no peaks at all.
82    EmptySpectrum { native_id: String },
83}
84
85impl std::fmt::Display for ConformanceError {
86    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
87        match self {
88            Self::PeakArrayLengthMismatch {
89                native_id,
90                mz_len,
91                intensity_len,
92            } => write!(
93                f,
94                "{native_id}: mz.len()={mz_len} != intensity.len()={intensity_len}"
95            ),
96            Self::MobilityArrayLengthMismatch {
97                native_id,
98                mz_len,
99                mobility_len,
100            } => write!(
101                f,
102                "{native_id}: inv_mobility_per_peak.len()={mobility_len} != mz.len()={mz_len}"
103            ),
104            Self::TicMismatch {
105                native_id,
106                declared,
107                computed,
108            } => {
109                write!(
110                    f,
111                    "{native_id}: declared TIC {declared} != computed {computed}"
112                )
113            }
114            Self::BasePeakIntensityMismatch {
115                native_id,
116                declared,
117                computed,
118            } => write!(
119                f,
120                "{native_id}: declared base-peak intensity {declared} != max(intensity) {computed}"
121            ),
122            Self::MissingPrecursor {
123                native_id,
124                ms_level,
125            } => write!(f, "{native_id}: ms_level={ms_level} but no precursor info"),
126            Self::RetentionTimeNonMonotonic {
127                prefix,
128                previous,
129                current,
130                native_id,
131            } => write!(
132                f,
133                "{native_id}: RT regressed in stream '{prefix}' ({previous} -> {current})"
134            ),
135            Self::IndexSequence {
136                native_id,
137                previous,
138                current,
139            } => write!(
140                f,
141                "{native_id}: index sequence broken (prev={previous:?}, current={current})"
142            ),
143            Self::EmptySpectrum { native_id } => write!(f, "{native_id}: empty spectrum"),
144        }
145    }
146}
147
148impl std::error::Error for ConformanceError {}
149
150fn rel_close(a: f64, b: f64, tol: f64) -> bool {
151    let scale = a.abs().max(b.abs()).max(1.0);
152    (a - b).abs() <= tol * scale
153}
154
155/// Per-native-ID-prefix key used to scope retention-time monotonicity.
156///
157/// We split on the first whitespace and strip any trailing `scan=...`
158/// token so that mzML native IDs like `function=2 process=0 scan=17`,
159/// `frame=12 scan=8`, and `controllerType=0 controllerNumber=1 scan=42`
160/// each group correctly.
161fn rt_stream_key(native_id: &str) -> String {
162    native_id
163        .split_whitespace()
164        .filter(|tok| !tok.starts_with("scan="))
165        .collect::<Vec<_>>()
166        .join(" ")
167}
168
169/// Run the invariant suite against a borrowed source. Returns the first
170/// failure encountered, or `Ok(spectrum_count)` if every spectrum passed.
171///
172/// The source is iterated to completion on success; on failure iteration
173/// stops at the offending spectrum.
174pub fn assert_source_invariants<S: SpectrumSource>(src: &mut S) -> Result<usize, ConformanceError> {
175    assert_iter_invariants(src.iter_spectra())
176}
177
178/// Run the invariant suite against any iterator yielding spectra. Useful
179/// for vendors whose top-level mzML writer does not yet implement
180/// [`SpectrumSource`].
181pub fn assert_iter_invariants<I: IntoIterator<Item = SpectrumRecord>>(
182    iter: I,
183) -> Result<usize, ConformanceError> {
184    let mut last_index: Option<usize> = None;
185    let mut last_rt: HashMap<String, f64> = HashMap::new();
186    let mut count = 0usize;
187    for spectrum in iter {
188        check_one(&spectrum, last_index, &mut last_rt)?;
189        last_index = Some(spectrum.index);
190        count += 1;
191    }
192    Ok(count)
193}
194
195fn check_one(
196    s: &SpectrumRecord,
197    last_index: Option<usize>,
198    last_rt: &mut HashMap<String, f64>,
199) -> Result<(), ConformanceError> {
200    if s.mz.len() != s.intensity.len() {
201        return Err(ConformanceError::PeakArrayLengthMismatch {
202            native_id: s.native_id.clone(),
203            mz_len: s.mz.len(),
204            intensity_len: s.intensity.len(),
205        });
206    }
207    if let Some(mob) = &s.inv_mobility_per_peak {
208        if mob.len() != s.mz.len() {
209            return Err(ConformanceError::MobilityArrayLengthMismatch {
210                native_id: s.native_id.clone(),
211                mz_len: s.mz.len(),
212                mobility_len: mob.len(),
213            });
214        }
215    }
216    if let Some(tic) = s.total_ion_current {
217        let computed: f64 = s.intensity.iter().map(|&v| v as f64).sum();
218        if !rel_close(tic, computed, FLOAT_REL_TOL) {
219            return Err(ConformanceError::TicMismatch {
220                native_id: s.native_id.clone(),
221                declared: tic,
222                computed,
223            });
224        }
225    }
226    if let Some(bp) = s.base_peak_intensity {
227        if s.intensity.is_empty() {
228            // Nothing to compare against; trust the parser.
229            let _ = bp;
230        } else {
231            let computed = s
232                .intensity
233                .iter()
234                .copied()
235                .fold(f32::NEG_INFINITY, f32::max) as f64;
236            if !rel_close(bp, computed, FLOAT_REL_TOL) {
237                return Err(ConformanceError::BasePeakIntensityMismatch {
238                    native_id: s.native_id.clone(),
239                    declared: bp,
240                    computed,
241                });
242            }
243        }
244    }
245    if s.ms_level >= 2 {
246        let has_precursor = match &s.precursor {
247            Some(p) => {
248                p.target_mz.is_some() || p.selected_mz.is_some() || p.precursor_native_id.is_some()
249            }
250            None => false,
251        };
252        if !has_precursor {
253            return Err(ConformanceError::MissingPrecursor {
254                native_id: s.native_id.clone(),
255                ms_level: s.ms_level,
256            });
257        }
258    }
259    match last_index {
260        None => {
261            if s.index != 0 {
262                return Err(ConformanceError::IndexSequence {
263                    native_id: s.native_id.clone(),
264                    previous: None,
265                    current: s.index,
266                });
267            }
268        }
269        Some(prev) => {
270            if s.index <= prev {
271                return Err(ConformanceError::IndexSequence {
272                    native_id: s.native_id.clone(),
273                    previous: Some(prev),
274                    current: s.index,
275                });
276            }
277        }
278    }
279    let key = rt_stream_key(&s.native_id);
280    if let Some(prev) = last_rt.get(&key).copied() {
281        // Allow tiny floating-point regressions (<1 us).
282        if s.retention_time_sec + 1e-6 < prev {
283            return Err(ConformanceError::RetentionTimeNonMonotonic {
284                prefix: key,
285                previous: prev,
286                current: s.retention_time_sec,
287                native_id: s.native_id.clone(),
288            });
289        }
290    }
291    last_rt.insert(key, s.retention_time_sec);
292    Ok(())
293}
294
295#[cfg(test)]
296mod tests {
297    use super::*;
298    use crate::types::RunMetadata;
299    use crate::{Polarity, PrecursorInfo, SpectrumRecord};
300
301    struct ToySource(Vec<SpectrumRecord>);
302
303    impl SpectrumSource for ToySource {
304        fn run_metadata(&self) -> RunMetadata {
305            RunMetadata {
306                source_file_name: "toy".into(),
307                source_file_format: crate::CvTerm {
308                    accession: "MS:1000563",
309                    name: "Thermo RAW format".into(),
310                },
311                native_id_format: crate::CvTerm {
312                    accession: "MS:1000768",
313                    name: "Thermo nativeID format".into(),
314                },
315                instrument: crate::CvTerm {
316                    accession: "MS:1000483",
317                    name: "Thermo Fisher Scientific instrument model".into(),
318                },
319                software_name: "test".into(),
320                software_version: "0.0".into(),
321                start_timestamp: None,
322                mobility_array_kind: None,
323            }
324        }
325        fn iter_spectra<'a>(&'a mut self) -> Box<dyn Iterator<Item = SpectrumRecord> + 'a> {
326            Box::new(self.0.clone().into_iter())
327        }
328    }
329
330    fn ok_spec(index: usize, ms_level: u32, rt: f64) -> SpectrumRecord {
331        let mz = vec![100.0, 200.0];
332        let intensity = vec![10.0f32, 20.0];
333        SpectrumRecord {
334            index,
335            scan_number: (index + 1) as u32,
336            native_id: format!("controllerType=0 controllerNumber=1 scan={}", index + 1),
337            ms_level,
338            polarity: Some(Polarity::Positive),
339            scan_mode: None,
340            analyzer: None,
341            filter: None,
342            retention_time_sec: rt,
343            total_ion_current: Some(30.0),
344            base_peak_mz: Some(200.0),
345            base_peak_intensity: Some(20.0),
346            low_mz: Some(100.0),
347            high_mz: Some(200.0),
348            ion_injection_time_ms: None,
349            inv_mobility: None,
350            precursor: if ms_level >= 2 {
351                Some(PrecursorInfo {
352                    selected_mz: Some(500.0),
353                    ..Default::default()
354                })
355            } else {
356                None
357            },
358            mz,
359            intensity,
360            inv_mobility_per_peak: None,
361        }
362    }
363
364    #[test]
365    fn happy_path_passes() {
366        let mut src = ToySource(vec![
367            ok_spec(0, 1, 1.0),
368            ok_spec(1, 2, 2.0),
369            ok_spec(2, 1, 3.0),
370        ]);
371        assert_eq!(assert_source_invariants(&mut src).unwrap(), 3);
372    }
373
374    #[test]
375    fn detects_tic_mismatch() {
376        let mut s = ok_spec(0, 1, 1.0);
377        s.total_ion_current = Some(999.0);
378        let mut src = ToySource(vec![s]);
379        let err = assert_source_invariants(&mut src).unwrap_err();
380        assert!(matches!(err, ConformanceError::TicMismatch { .. }));
381    }
382
383    #[test]
384    fn detects_missing_precursor() {
385        let mut s = ok_spec(0, 2, 1.0);
386        s.precursor = None;
387        let mut src = ToySource(vec![s]);
388        let err = assert_source_invariants(&mut src).unwrap_err();
389        assert!(matches!(err, ConformanceError::MissingPrecursor { .. }));
390    }
391
392    #[test]
393    fn detects_rt_regression_within_same_stream() {
394        let s0 = ok_spec(0, 1, 5.0);
395        let s1 = ok_spec(1, 1, 2.0);
396        let mut src = ToySource(vec![s0, s1]);
397        let err = assert_source_invariants(&mut src).unwrap_err();
398        assert!(matches!(
399            err,
400            ConformanceError::RetentionTimeNonMonotonic { .. }
401        ));
402    }
403
404    #[test]
405    fn detects_mobility_length_mismatch() {
406        let mut s = ok_spec(0, 1, 1.0);
407        s.inv_mobility_per_peak = Some(vec![0.5]);
408        let mut src = ToySource(vec![s]);
409        let err = assert_source_invariants(&mut src).unwrap_err();
410        assert!(matches!(
411            err,
412            ConformanceError::MobilityArrayLengthMismatch { .. }
413        ));
414    }
415}