use std::path::Path;
use context_error::*;
use mzcore::{
ontology::Ontologies,
sequence::{Linked, SemiAmbiguous, SimpleLinear},
};
use crate::*;
pub fn open_identified_peptidoforms_file<'a>(
path: impl AsRef<Path>,
ontologies: &'a Ontologies,
keep_all_columns: bool,
) -> Result<GeneralIdentifiedPeptidoforms<'a>, BoxedError<'static, BasicKind>> {
let path = path.as_ref();
let actual_extension = path
.extension()
.map(|ex| {
ex.eq_ignore_ascii_case("gz")
.then_some(path)
.and_then(|p| p.file_stem())
.and_then(|p| Path::new(p).extension())
.unwrap_or(ex)
})
.map(|ex| ex.to_string_lossy().to_lowercase());
match actual_extension.as_deref() {
Some("csv") => PeaksData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.or_else(|pe| {
NovorData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|ne| (pe, ne))
})
.or_else(|(pe, ne)| {
InstaNovoData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|ie| (pe, ne, ie))
})
.or_else(|(pe, ne, ie)| {
PLinkData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|le| (pe, ne, ie, le))
}).or_else(|(pe, ne, ie, le)| {
PowerNovoData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|pne| (pe, ne, ie, le, pne))
}).or_else(|(pe, ne, ie, le, pne)| {
PLGSData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|ple| (pe, ne, ie, le, pne, ple))
}).or_else(|(pe, ne, ie, le, pne, ple)| {
BasicCSVData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|be| (pe, ne, ie, le, pne, ple, be))
}).or_else(|(pe, ne, ie, le, pne, ple, be)| {
PUniFindData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|pue| (pe, ne, ie, le, pne, ple, be, pue))
}).map_err(|(pe, ne, ie, le, pne, ple, be, pue)| {
BoxedError::new(BasicKind::Error,
"Unknown file format",
"Could not be recognised as either a Peaks, Novor, InstaNovo, pLink, PowerNovo, PLGS, pUniFind, or basic file",
Context::default().source(path.to_string_lossy()).to_owned(),
)
.add_underlying_errors(vec![pe, ne, ie, le, pne, ple, be, pue])
}),
Some("tsv") => SageData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.or_else(|se| {
PepNetData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|pe| (se, pe))
})
.or_else(|(se, pe)| {
MSFraggerData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|mfe| (se, pe, mfe))
})
.or_else(|(se, pe, mfe)| {
ProteoscapeData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|pse| (se, pe, mfe, pse))
})
.or_else(|(se, pe, mfe, pse)| {
NovoBData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|ne| (se, pe, mfe, pse, ne))
})
.or_else(|(se, pe, mfe, pse, ne)| {
PiPrimeNovoData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|pne| (se, pe, mfe, pse, ne, pne))
})
.map_err(|(se, pe, mfe, pse, ne, pne)| {
BoxedError::new(BasicKind::Error,
"Unknown file format",
"Could not be recognised a Sage, PepNet, MSFragger, NovoB, π-PrimeNovo or Proteoscape file",
Context::default().source(path.to_string_lossy()).to_owned(),
)
.add_underlying_errors(vec![se, pe, mfe, pse, ne, pne])
}),
Some("psmtsv") => {
MetaMorpheusData::parse_file(path, ontologies, keep_all_columns, None).map(IdentifiedPeptidoformIter::into_box).or_else(|me| {
OpairData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|oe| (me, oe))
}).map_err(|(me, oe)| {
BoxedError::new(BasicKind::Error,
"Unknown file format",
"Could not be recognised a MetaMorpheus or OPair file",
Context::default().source(path.to_string_lossy()).to_owned(),
)
.add_underlying_errors(vec![me, oe])
})
}
Some("fasta" | "fas" | "fa" | "faa" | "mpfa") => FastaData::parse_file(path).map(|peptides| {
let a: Box<dyn Iterator<Item = Result<IdentifiedPeptidoform<Linked, MaybePeptidoform>, BoxedError<'static, BasicKind>>> + 'a>
= Box::new(peptides.into_iter().map(|p| Ok(IdentifiedPeptidoform::<SemiAmbiguous, PeptidoformPresent>::from(p).cast())));
a
}),
#[cfg(feature = "mzannotate")]
Some("txt") if path.file_stem().is_some_and(|p| p.to_string_lossy().to_ascii_lowercase().contains(".mzspeclib")) => {
annotated_spectrum::parse_mzspeclib(path, ontologies)
}
Some("txt") => {
MaxQuantData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.or_else(|me| {
PiHelixNovoData::parse_file(path, ontologies, keep_all_columns, None)
.map(IdentifiedPeptidoformIter::into_box)
.map_err(|hne| (me, hne))
})
.map_err(|(me, he)| {
BoxedError::new(BasicKind::Error,
"Unknown file format",
"Could not be recognised as either a MaxQuant or π-HelixNovo file",
Context::default().source(path.to_string_lossy()).to_owned(),
)
.add_underlying_errors(vec![me, he])
})
}
Some("mztab") => MZTabData::parse_file(path, ontologies).map(|peptides| {
let a: Box<dyn Iterator<Item = Result<IdentifiedPeptidoform<Linked, MaybePeptidoform>, BoxedError<'static, BasicKind>>> + 'a>
= Box::new(peptides.into_iter().map(|p| p.map(|p| {
IdentifiedPeptidoform::<SimpleLinear, MaybePeptidoform>::from(p).cast()
})));
a
}),
Some("deepnovo_denovo") => {
DeepNovoFamilyData::parse_file(path, ontologies, keep_all_columns, None).map(IdentifiedPeptidoformIter::into_box)
},
Some("ssl") => {
SpectrumSequenceListData::parse_file(path, ontologies, keep_all_columns, None).map(IdentifiedPeptidoformIter::into_box)
}
_ => Err(BoxedError::new(BasicKind::Error,
"Unknown extension",
"Use CSV, SSL, TSV, TXT, PSMTSV, deepnovo_denovo, or Fasta, or any of these as a gzipped file (eg csv.gz).",
Context::default().source(path.to_string_lossy()).to_owned(),
)),
}
}
#[expect(clippy::missing_panics_doc)]
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::BufReader;
#[test]
fn open_sage() {
match test_format::<SageData>(
BufReader::new(File::open("src/test_files/sage_v0_14.tsv").unwrap()),
&mzcore::ontology::STATIC_ONTOLOGIES,
true,
false,
Some(SageVersion::V0_14),
) {
Ok(n) => assert_eq!(n, 19),
Err(e) => {
println!("{e}");
panic!("Failed identified peptides test");
}
}
}
#[test]
fn open_msfragger() {
match test_format::<MSFraggerData>(
BufReader::new(File::open("src/test_files/msfragger_v21.tsv").unwrap()),
&mzcore::ontology::STATIC_ONTOLOGIES,
true,
false,
Some(MSFraggerVersion::FragPipeV20Or21),
) {
Ok(n) => assert_eq!(n, 19),
Err(e) => {
println!("{e}");
panic!("Failed identified peptides test");
}
}
}
}