mzident 0.1.0

Handle all kinds of identified peptidoform files.
Documentation
use std::{
    borrow::Cow,
    marker::PhantomData,
    ops::{Not, Range},
    path::Path,
};

use crate::{
    FastaIdentifier, GeneralIdentifiedPeptidoforms, IdentifiedPeptidoform,
    IdentifiedPeptidoformData, KnownFileFormat, MaybePeptidoform, MetaData, SpectrumId,
    SpectrumIds,
};
use mzannotate::prelude::AnnotatedSpectrum;
use mzcore::{
    ontology::Ontologies,
    prelude::*,
    sequence::{FlankingSequence, Linked},
    system::{Mass, MassOverCharge, Time, isize::Charge},
};

use context_error::{BasicKind, BoxedError, FullErrorContent};

use itertools::Itertools;

impl MetaData for AnnotatedSpectrum {
    fn compound_peptidoform_ion(&self) -> Option<Cow<'_, CompoundPeptidoformIon>> {
        let cpi: CompoundPeptidoformIon = self
            .analytes
            .iter()
            .filter_map(|a| match &a.target {
                mzannotate::mzspeclib::AnalyteTarget::PeptidoformIon(pep) => Some(pep.clone()),
                _ => None,
            })
            .collect();
        cpi.peptidoform_ions()
            .is_empty()
            .not()
            .then_some(Cow::Owned(cpi))
    }

    fn format(&self) -> KnownFileFormat {
        KnownFileFormat::AnnotatedSpectrum
    }

    fn id(&self) -> String {
        self.description.index.to_string()
    }

    fn confidence(&self) -> Option<f64> {
        self.interpretations
            .iter()
            .filter_map(|i| i.probability)
            .exactly_one()
            .ok()
    }

    fn local_confidence(&self) -> Option<Cow<'_, [f64]>> {
        None
    }

    fn original_confidence(&self) -> Option<f64> {
        self.interpretations
            .iter()
            .filter_map(|i| i.probability)
            .exactly_one()
            .ok()
    }

    fn original_local_confidence(&self) -> Option<&[f64]> {
        None
    }

    fn charge(&self) -> Option<Charge> {
        self.analytes
            .iter()
            .filter_map(|a| match &a.target {
                mzannotate::mzspeclib::AnalyteTarget::PeptidoformIon(pep) => Some(pep),
                _ => None,
            })
            .exactly_one()
            .ok()
            .and_then(|p| p.get_charge_carriers().map(MolecularCharge::charge))
    }

    #[allow(clippy::redundant_closure_for_method_calls)] // Do not want to have to reference mzdata
    fn mode(&self) -> Option<Cow<'_, str>> {
        self.description
            .precursor
            .first()
            .map(|p| Cow::Owned(p.activation.methods().iter().map(|d| d.name()).join("+")))
    }

    fn fragmentation_model(
        &self,
    ) -> Option<mzannotate::annotation::model::BuiltInFragmentationModel> {
        self.description
            .precursor
            .first()
            .map(|p| p.activation.methods().into())
    }

    fn retention_time(&self) -> Option<Time> {
        self.description
            .acquisition
            .scans
            .first()
            .map(|s| Time::new::<mzcore::system::time::s>(s.start_time))
    }

    fn scans(&self) -> SpectrumIds {
        self.description
            .params
            .iter()
            .find(|p| {
                p.controlled_vocabulary
                    .is_some_and(|cv| cv.prefix() == "MS")
                    && p.accession == Some(1003203)
            })
            .map_or_else(
                || SpectrumIds::FileNotKnown(vec![SpectrumId::Index(self.description.index)]),
                |rawfile| {
                    SpectrumIds::FileKnown(vec![(
                        rawfile.value.to_string().into(),
                        vec![SpectrumId::Index(self.description.index)],
                    )])
                },
            )
    }

    fn experimental_mz(&self) -> Option<MassOverCharge> {
        self.description
            .precursor
            .first()
            .and_then(|p| p.ions.first())
            .map(|i| MassOverCharge::new::<mzcore::system::mass_over_charge::thomson>(i.mz))
    }

    fn experimental_mass(&self) -> Option<Mass> {
        self.description
            .precursor
            .first()
            .and_then(|p| p.ions.first())
            .and_then(|i| {
                i.charge
                    .map(|c| Mass::new::<mzcore::system::mass::dalton>(i.mz * f64::from(c)))
            })
    }

    fn protein_names(&self) -> Option<Cow<'_, [FastaIdentifier<String>]>> {
        Some(Cow::Owned(
            self.analytes
                .iter()
                .flat_map(|a| &a.proteins)
                .filter_map(|p| p.accession.clone())
                .map(|a| {
                    a.parse()
                        .unwrap_or_else(|_| FastaIdentifier::Undefined(false, a.to_string()))
                })
                .collect(),
        ))
    }

    fn protein_id(&self) -> Option<usize> {
        None
    }

    fn protein_location(&self) -> Option<Range<u16>> {
        None
    }

    fn flanking_sequences(&self) -> (&FlankingSequence, &FlankingSequence) {
        (
            self.analytes
                .iter()
                .flat_map(|a| &a.proteins)
                .map(|p| &p.flanking_sequences.0)
                .exactly_one()
                .unwrap_or(FlankingSequence::UNKNOWN),
            self.analytes
                .iter()
                .flat_map(|a| &a.proteins)
                .map(|p| &p.flanking_sequences.1)
                .exactly_one()
                .unwrap_or(FlankingSequence::UNKNOWN),
        )
    }

    fn database(&self) -> Option<(&str, Option<&str>)> {
        self.analytes
            .iter()
            .flat_map(|a| &a.proteins)
            .map(|p| p.database_name.as_deref())
            .exactly_one()
            .ok()
            .flatten()
            .map(|name| {
                (
                    name,
                    self.analytes
                        .iter()
                        .flat_map(|a| &a.proteins)
                        .map(|p| p.database_version.as_deref())
                        .exactly_one()
                        .ok()
                        .flatten(),
                )
            })
    }

    fn annotated_spectrum(&self) -> Option<Cow<'_, AnnotatedSpectrum>> {
        Some(Cow::Borrowed(self))
    }

    fn has_annotated_spectrum(&self) -> bool {
        true
    }
}

impl From<AnnotatedSpectrum> for IdentifiedPeptidoform<Linked, MaybePeptidoform> {
    /// Wrap an annotated spectrum as an identified peptidoform. The score and local
    /// confidence are taken from the spectrum and the spectrum itself is stored as the data.
    fn from(value: AnnotatedSpectrum) -> Self {
        // Read everything that borrows `value` before it is moved into `data`.
        let score = value.confidence();
        let local_confidence = value
            .local_confidence()
            .map(|confidences| confidences.to_vec());
        Self {
            score,
            local_confidence,
            data: IdentifiedPeptidoformData::AnnotatedSpectrum(value),
            complexity_marker: PhantomData,
            peptidoform_availability_marker: PhantomData,
        }
    }
}

/// Parse an mzSpecLib file into a general identified peptidoforms iterator.
/// # Errors
/// If the file is not a valid mzSpecLib file.
pub(crate) fn parse_mzspeclib<'a>(
    path: &Path,
    ontologies: &'a Ontologies,
) -> Result<GeneralIdentifiedPeptidoforms<'a>, BoxedError<'static, BasicKind>> {
    mzannotate::mzspeclib::MzSpecLibTextParser::open_file(path, ontologies)
        .map(move |parser| {
            let b: Box<
                dyn Iterator<
                    Item = Result<
                        IdentifiedPeptidoform<Linked, MaybePeptidoform>,
                        BoxedError<'static, BasicKind>,
                    >,
                >,
            > = Box::new(parser.map(move |s| {
                s.map(Into::into)
                    .map_err(|e| e.convert(|_| BasicKind::Error))
            }));
            b
        })
        .map_err(|e| e.convert(|_| BasicKind::Error))
}