use std::{borrow::Cow, marker::PhantomData, ops::Range, sync::OnceLock};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use crate::{
BoxedIdentifiedPeptideIter, FastaIdentifier, IdentifiedPeptidoform, IdentifiedPeptidoformData,
IdentifiedPeptidoformSource, IdentifiedPeptidoformVersion, KnownFileFormat, MaybePeptidoform,
MetaData, PeaksFamilyId, SpectrumId, SpectrumIds,
common_parser::{Location, OptionalColumn, OptionalLocation},
};
use mzcore::{
csv::{CsvLine, parse_csv},
ontology::Ontologies,
sequence::{
AminoAcid, CompoundPeptidoformIon, FlankingSequence, Peptidoform, SemiAmbiguous,
SloppyParsingParameters,
},
system::{Mass, MassOverCharge, Time, isize::Charge},
};
/// Pair of messages handed to the column parsers of this format when a column that has to be
/// numeric cannot be parsed as a number. The first string reads as a short title and the second
/// as the longer explanation.
static NUMBER_ERROR: (&str, &str) = (
"Invalid DeepNovoFamily line",
"This column is not a number but it is required to be a number in this format",
);
/// Pair of messages handed to the ID parser of the `scan` column when an entry is not a valid
/// ID. The first string reads as a short title and the second as the longer explanation, which
/// includes examples of accepted IDs.
static ID_ERROR: (&str, &str) = (
"Invalid DeepNovoFamily line",
"This column is not a valid ID but it is required to be in this peaks format\nExamples of valid IDs: '1234' & 'F2:1234'",
);
/// Sloppy parsing parameters used for the peptide column. They are built lazily on the first
/// peptide that is parsed and reused for every later line, so only the ontologies passed to that
/// first call are used to look up the modifications.
static PARAMETERS: OnceLock<SloppyParsingParameters> = OnceLock::new();
// Definition of the DeepNovo family line format (tab separated), supporting the
// `DEEPNOVO_V0_0_1` and `POINTNOVOFAMILY` column layouts.
//
// Only plain `//` comments are used inside the macro invocation: doc comments would be turned
// into tokens and handed to the macro.
format_family!(
    DeepNovoFamily,
    SemiAmbiguous, MaybePeptidoform, [&DEEPNOVO_V0_0_1, &POINTNOVOFAMILY], b'\t', None;
    required {
        // A `;` separated list of IDs; an empty column results in an empty list.
        scan: Vec<PeaksFamilyId>, |location: Location, _| location.or_empty()
            .map_or(Ok(Vec::new()), |l| l.array(';').map(|v| v.parse(ID_ERROR)).collect::<Result<Vec<_>,_>>());
        // The predicted sequence; an empty column results in `None`. Residues can be followed by
        // the literal `mod` to indicate the modification listed for that amino acid below.
        peptide: Option<Peptidoform<SemiAmbiguous>>, |location: Location, ontologies: &Ontologies|
            location.or_empty().map(|location| Peptidoform::sloppy_pro_forma(
                location.full_line(),
                location.location.clone(),
                ontologies,
                // Built once on first use; the `unwrap`s panic if the Unimod ontology that is
                // passed in on that first call does not contain these entries.
                PARAMETERS.get_or_init(|| SloppyParsingParameters {
                    mod_indications: (
                        Some("mod"),
                        vec![
                            // UNIMOD:7 is Deamidated
                            (
                                AminoAcid::Asparagine,
                                ontologies.unimod().get_by_index(&7).unwrap(),
                            ),
                            (
                                AminoAcid::Glutamine,
                                ontologies.unimod().get_by_index(&7).unwrap(),
                            ),
                            // NOTE(review): UNIMOD:6 is Carboxymethyl, while DeepNovo's `Cmod`
                            // may be intended as Carbamidomethyl (UNIMOD:4) - confirm.
                            (
                                AminoAcid::Cysteine,
                                ontologies.unimod().get_by_index(&6).unwrap(),
                            ),
                            // UNIMOD:35 is Oxidation, the methionine modification written as
                            // `Mmod` (index 3 would be Biotin).
                            (
                                AminoAcid::Methionine,
                                ontologies.unimod().get_by_index(&35).unwrap(),
                            ),
                        ],
                    ),
                    ..Default::default()
                })
            ).map_err(BoxedError::to_owned)).transpose();
        // The score for the whole prediction; an empty column results in `None`.
        score: Option<f64>, |location: Location, _| location.or_empty().parse::<f64>(NUMBER_ERROR);
        // A `,` separated list of scores along the predicted sequence.
        local_confidence: Option<Vec<f64>>, |location: Location, _| location.or_empty()
            .optional_array(',').map(|array| array.map(|l| l.parse::<f64>(NUMBER_ERROR)).collect::<Result<Vec<_>, _>>())
            .transpose();
    }
    optional {
        // The charge can be written as a float (e.g. `2.0`), so any trailing `.0` is stripped
        // before parsing it as an integer.
        z: Charge, |location: Location, _| location
            .trim_end_matches(".0")
            .parse::<isize>(NUMBER_ERROR)
            .map(Charge::new::<mzcore::system::e>);
        mz: MassOverCharge, |location: Location, _| location.parse::<f64>(NUMBER_ERROR).map(MassOverCharge::new::<mzcore::system::thomson>);
    }
    fn post_process(_source: &CsvLine, mut parsed: Self, _ontologies: &Ontologies) -> Result<Self, BoxedError<'static, BasicKind>> {
        // When the number of local confidence values does not match the peptide length the
        // values are re-interpolated, which yields one more value than was present.
        if parsed.local_confidence.as_ref().map(Vec::len)
            != parsed.peptide.as_ref().map(Peptidoform::len)
        {
            parsed.local_confidence = parsed.local_confidence.map(interpolate_lc);
        }
        Ok(parsed)
    }
);
/// Re-interpolate a list of local confidence values into a list that is one element longer.
///
/// The first and last input values are kept as the first and last output values, and every
/// element in between is the midpoint of two neighbouring input values. So `[a, b, c]` becomes
/// `[a, (a+b)/2, (b+c)/2, c]` and `[a]` becomes `[a, a]`.
///
/// An empty input results in an empty output instead of a panic.
// Taken by value so that it can be passed directly to `Option::map` on the owned field.
#[expect(clippy::needless_pass_by_value)]
fn interpolate_lc(local_confidence: Vec<f64>) -> Vec<f64> {
    let (Some(&first), Some(&last)) = (local_confidence.first(), local_confidence.last()) else {
        // Nothing to interpolate, and indexing `len() - 1` would panic
        return Vec::new();
    };
    let mut reinterpolated = Vec::with_capacity(local_confidence.len() + 1);
    reinterpolated.push(first);
    reinterpolated.extend(
        local_confidence
            .windows(2)
            .map(|pair| f64::midpoint(pair[0], pair[1])),
    );
    reinterpolated.push(last);
    reinterpolated
}
/// Column layout for the [`DeepNovoFamilyVersion::DeepNovoV0_0_1`] version.
///
/// Reads the IDs from the `scan` column and has no precursor m/z or charge columns.
pub const DEEPNOVO_V0_0_1: DeepNovoFamilyFormat = DeepNovoFamilyFormat {
version: DeepNovoFamilyVersion::DeepNovoV0_0_1,
scan: "scan",
peptide: "predicted_sequence",
score: "predicted_score",
local_confidence: "predicted_position_score",
mz: OptionalColumn::NotAvailable,
z: OptionalColumn::NotAvailable,
};
/// Column layout for the [`DeepNovoFamilyVersion::PointNovoFamily`] version.
///
/// Reads the IDs from the `scan_list_original` column and requires the `precursor_mz` and
/// `precursor_charge` columns to be present.
pub const POINTNOVOFAMILY: DeepNovoFamilyFormat = DeepNovoFamilyFormat {
version: DeepNovoFamilyVersion::PointNovoFamily,
scan: "scan_list_original",
peptide: "predicted_sequence",
score: "predicted_score",
local_confidence: "predicted_position_score",
mz: OptionalColumn::Required("precursor_mz"),
z: OptionalColumn::Required("precursor_charge"),
};
/// All supported versions of the DeepNovo family of file formats.
#[derive(
Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default, Serialize, Deserialize,
)]
pub enum DeepNovoFamilyVersion {
/// DeepNovo v0.0.1, this is the default version
#[default]
DeepNovoV0_0_1,
/// The layout shared by PointNovo v0.0.1, PGPointNovo v1.0.6, and BiatNovo v0.1
PointNovoFamily,
}
impl std::fmt::Display for DeepNovoFamilyVersion {
    /// Write the human readable name of this version, as returned by `name`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.name())
    }
}
impl IdentifiedPeptidoformVersion<DeepNovoFamilyFormat> for DeepNovoFamilyVersion {
/// Get the column layout that belongs to this version.
fn format(self) -> DeepNovoFamilyFormat {
match self {
Self::DeepNovoV0_0_1 => DEEPNOVO_V0_0_1,
Self::PointNovoFamily => POINTNOVOFAMILY,
}
}
/// Get the human readable name of this version, this is also what `Display` writes.
fn name(self) -> &'static str {
match self {
Self::DeepNovoV0_0_1 => "DeepNovo v0.0.1",
Self::PointNovoFamily => "PointNovo v0.0.1 / PGPointNovo v1.0.6 / BiatNovo v0.1",
}
}
}
impl MetaData for DeepNovoFamilyData {
    /// The predicted peptide (if any) converted into a compound peptidoform ion.
    fn compound_peptidoform_ion(&self) -> Option<Cow<'_, CompoundPeptidoformIon>> {
        let peptide = self.peptide.as_ref()?;
        Some(Cow::Owned(peptide.clone().into()))
    }

    /// The file format, tagged with the version this line was parsed as.
    fn format(&self) -> KnownFileFormat {
        KnownFileFormat::DeepNovoFamily(self.version)
    }

    /// All IDs from the scan column joined with `;`.
    fn id(&self) -> String {
        self.scan.iter().join(";")
    }

    /// The score rescaled as `2 / (1 + e^-score)`.
    fn confidence(&self) -> Option<f64> {
        self.score.map(rescale_deepnovo_score)
    }

    /// Every local confidence value rescaled in the same way as the overall score.
    fn local_confidence(&self) -> Option<Cow<'_, [f64]>> {
        let raw = self.local_confidence.as_deref()?;
        Some(raw.iter().copied().map(rescale_deepnovo_score).collect())
    }

    /// The score exactly as stored for this line.
    fn original_confidence(&self) -> Option<f64> {
        self.score
    }

    /// The local confidence exactly as stored for this line.
    fn original_local_confidence(&self) -> Option<&[f64]> {
        self.local_confidence.as_deref()
    }

    /// The precursor charge, if that column was present.
    fn charge(&self) -> Option<Charge> {
        self.z
    }

    /// Not stored in this format.
    fn mode(&self) -> Option<Cow<'_, str>> {
        None
    }

    /// Not stored in this format.
    fn retention_time(&self) -> Option<Time> {
        None
    }

    /// All scan numbers of all IDs, without a known raw file.
    fn scans(&self) -> SpectrumIds {
        let numbers = self
            .scan
            .iter()
            .flat_map(|id| id.scans.clone())
            .map(SpectrumId::Number)
            .collect();
        SpectrumIds::FileNotKnown(numbers)
    }

    /// The precursor m/z, if that column was present.
    fn experimental_mz(&self) -> Option<MassOverCharge> {
        self.mz
    }

    /// The precursor m/z multiplied by the charge, only if both are present.
    fn experimental_mass(&self) -> Option<Mass> {
        let (mz, z) = self.mz.zip(self.z)?;
        Some(mz * z.to_float())
    }

    /// Not stored in this format.
    fn protein_names(&self) -> Option<Cow<'_, [FastaIdentifier<String>]>> {
        None
    }

    /// Not stored in this format.
    fn protein_id(&self) -> Option<usize> {
        None
    }

    /// Not stored in this format.
    fn protein_location(&self) -> Option<Range<u16>> {
        None
    }

    /// Always unknown on both sides.
    fn flanking_sequences(&self) -> (&FlankingSequence, &FlankingSequence) {
        (&FlankingSequence::Unknown, &FlankingSequence::Unknown)
    }

    /// Not stored in this format.
    fn database(&self) -> Option<(&str, Option<&str>)> {
        None
    }
}

/// Rescale a raw score as `2 / (1 + e^-score)`, shared by the overall and local confidence.
fn rescale_deepnovo_score(score: f64) -> f64 {
    2.0 / (1.0 + (-score).exp())
}