use mzcv::CVIndex;
#[cfg(feature = "rayon")]
use rayon::prelude::*;
use std::collections::HashSet;
use mzcore::sequence::{
AnnotatedPeptide, Annotation, HasPeptidoformImpl, Peptidoform, Region, UnAmbiguous,
};
pub(super) use super::*;
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Selection<S1: std::hash::BuildHasher, S2: std::hash::BuildHasher> {
pub species: Option<HashSet<Species, S1>>,
pub chains: Option<HashSet<ChainType, S2>>,
pub genes: Option<HashSet<GeneType>>,
pub allele: AlleleSelection,
}
impl<S1: std::hash::BuildHasher, S2: std::hash::BuildHasher> Selection<S1, S2> {
#[must_use]
pub fn species(self, species: impl Into<HashSet<Species, S1>>) -> Self {
Self {
species: Some(species.into()),
..self
}
}
#[must_use]
pub fn chain(self, chains: impl Into<HashSet<ChainType, S2>>) -> Self {
Self {
chains: Some(chains.into()),
..self
}
}
#[must_use]
pub fn gene(self, genes: impl Into<HashSet<GeneType>>) -> Self {
Self {
genes: Some(genes.into()),
..self
}
}
#[must_use]
pub fn allele(self, allele: AlleleSelection) -> Self {
Self { allele, ..self }
}
}
impl<
'a,
S1: std::hash::BuildHasher + Clone + Send + Sync + 'a,
S2: std::hash::BuildHasher + Clone + Send + Sync + 'a,
> Selection<S1, S2>
{
pub fn germlines(self, cv: &'a CVIndex<IMGT>) -> impl Iterator<Item = Allele<'a>> {
cv.data()
.iter()
.filter(move |(s, _)| {
self.species
.as_ref()
.is_none_or(|species| species.contains(s))
})
.flat_map(|(s, g)| g.into_iter().map(|c| (*s, c.0, c.1)))
.filter(move |(_, kind, _)| self.chains.as_ref().is_none_or(|k| k.contains(kind)))
.flat_map(|(species, _, c)| c.into_iter().map(move |g| (species, g.0, g.1)))
.filter(move |(_, gene, _)| self.genes.as_ref().is_none_or(|s| contains_gene(s, *gene)))
.flat_map(|(species, _, germlines)| germlines.iter().map(move |a| (species, a)))
.flat_map(move |(species, germline)| {
germline
.into_iter()
.take(self.allele.take_num())
.map(move |(a, seq)| (species, &germline.name, *a, seq))
})
.map(Into::into)
}
#[cfg(feature = "rayon")]
pub fn par_germlines(self, cv: &'a CVIndex<IMGT>) -> impl ParallelIterator<Item = Allele<'a>> {
cv.data()
.par_iter()
.filter(move |(s, _)| {
self.species
.as_ref()
.is_none_or(|species| species.contains(s))
})
.flat_map(|(s, g)| g.into_par_iter().map(|c| (*s, c.0, c.1)))
.filter(move |(_, kind, _)| self.chains.as_ref().is_none_or(|k| k.contains(kind)))
.flat_map(|(species, _, c)| c.into_par_iter().map(move |g| (species, g.0, g.1)))
.filter(move |(_, gene, _)| self.genes.as_ref().is_none_or(|s| contains_gene(s, *gene)))
.flat_map(|(species, _, germlines)| {
germlines.into_par_iter().map(move |a| (species, a))
})
.flat_map(move |(species, germline)| {
germline
.into_par_iter()
.take(self.allele.take_num())
.map(move |(a, seq)| (species, &germline.name, *a, seq))
})
.map(Into::into)
}
}
fn contains_gene(s: &HashSet<GeneType>, gene: GeneType) -> bool {
s.contains(&gene) || matches!(gene, GeneType::C(_)) && s.contains(&GeneType::C(None))
}
impl<S1: std::hash::BuildHasher, S2: std::hash::BuildHasher> Default for Selection<S1, S2> {
fn default() -> Self {
Self {
species: None,
chains: None,
genes: None,
allele: AlleleSelection::First,
}
}
}
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum AlleleSelection {
All,
First,
}
impl AlleleSelection {
#[allow(dead_code)] const fn take_num(self) -> usize {
match self {
Self::First => 1,
Self::All => usize::MAX,
}
}
}
#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)]
pub struct Allele<'a> {
pub species: Species,
pub gene: std::borrow::Cow<'a, Gene>,
pub number: usize,
pub sequence: &'a Peptidoform<UnAmbiguous>,
pub regions: &'a [(Region, usize)],
pub annotations: &'a [(Annotation, usize)],
}
impl Allele<'_> {
pub fn name(&self) -> String {
format!("{}*{:02}", self.gene, self.number)
}
pub fn fancy_name(&self) -> String {
format!("{}*{:02}", self.gene.to_fancy_string(), self.number)
}
}
impl HasPeptidoformImpl for Allele<'_> {
type Complexity = UnAmbiguous;
fn peptidoform(&self) -> &Peptidoform<Self::Complexity> {
self.sequence
}
}
impl AnnotatedPeptide for Allele<'_> {
fn annotations(&self) -> &[(Annotation, usize)] {
self.annotations
}
fn regions(&self) -> &[(Region, usize)] {
self.regions
}
}
impl<'a> From<(Species, &'a Gene, usize, &'a AnnotatedSequence)> for Allele<'a> {
fn from(value: (Species, &'a Gene, usize, &'a AnnotatedSequence)) -> Self {
Self {
species: value.0,
gene: std::borrow::Cow::Borrowed(value.1),
number: value.2,
sequence: &value.3.sequence,
regions: &value.3.regions,
annotations: &value.3.annotations,
}
}
}
impl Germlines {
pub fn find_allele(&self, gene: Gene, allele: Option<usize>) -> Option<Allele<'_>> {
let chain = match gene.chain {
ChainType::Heavy => &self.h,
ChainType::LightKappa => &self.k,
ChainType::LightLambda => &self.l,
ChainType::Iota => &self.i,
};
let genes = match gene.kind {
GeneType::V => &chain.variable,
GeneType::J => &chain.joining,
GeneType::C(None) => &chain.c,
GeneType::C(Some(Constant::A)) => &chain.a,
GeneType::C(Some(Constant::D)) => &chain.d,
GeneType::C(Some(Constant::E)) => &chain.e,
GeneType::C(Some(Constant::G)) => &chain.g,
GeneType::C(Some(Constant::M)) => &chain.m,
GeneType::C(Some(Constant::O)) => &chain.o,
GeneType::C(Some(Constant::T)) => &chain.t,
};
genes
.binary_search_by(|g| g.name.cmp(&gene))
.ok()
.and_then(|g| {
let g = &genes[g];
allele
.map_or_else(
|| g.alleles.first(),
|a| g.alleles.iter().find(|(ga, _)| a == *ga),
)
.map(|res| (g, res))
})
.map(move |(g, (a, seq))| Allele {
species: g.species,
gene: std::borrow::Cow::Owned(gene),
number: *a,
sequence: &seq.sequence,
regions: &seq.regions,
annotations: &seq.annotations,
})
}
pub fn find_germline(&self, gene: Gene) -> Option<std::sync::Arc<Germline>> {
let chain = match gene.chain {
ChainType::Heavy => &self.h,
ChainType::LightKappa => &self.k,
ChainType::LightLambda => &self.l,
ChainType::Iota => &self.i,
};
let genes = match gene.kind {
GeneType::V => &chain.variable,
GeneType::J => &chain.joining,
GeneType::C(None) => &chain.c,
GeneType::C(Some(Constant::A)) => &chain.a,
GeneType::C(Some(Constant::D)) => &chain.d,
GeneType::C(Some(Constant::E)) => &chain.e,
GeneType::C(Some(Constant::G)) => &chain.g,
GeneType::C(Some(Constant::M)) => &chain.m,
GeneType::C(Some(Constant::O)) => &chain.o,
GeneType::C(Some(Constant::T)) => &chain.t,
};
genes
.binary_search_by(|g| g.name.cmp(&gene))
.ok()
.map(|g| genes[g].clone())
}
pub fn remove_germline(&mut self, gene: Gene) {
let chain = match gene.chain {
ChainType::Heavy => &mut self.h,
ChainType::LightKappa => &mut self.k,
ChainType::LightLambda => &mut self.l,
ChainType::Iota => &mut self.i,
};
let genes = match gene.kind {
GeneType::V => &mut chain.variable,
GeneType::J => &mut chain.joining,
GeneType::C(None) => &mut chain.c,
GeneType::C(Some(Constant::A)) => &mut chain.a,
GeneType::C(Some(Constant::D)) => &mut chain.d,
GeneType::C(Some(Constant::E)) => &mut chain.e,
GeneType::C(Some(Constant::G)) => &mut chain.g,
GeneType::C(Some(Constant::M)) => &mut chain.m,
GeneType::C(Some(Constant::O)) => &mut chain.o,
GeneType::C(Some(Constant::T)) => &mut chain.t,
};
genes
.binary_search_by(|g| g.name.cmp(&gene))
.ok()
.map(|g| genes.remove(g));
}
}
#[cfg(all(test, not(feature = "internal-no-data")))]
#[expect(clippy::missing_panics_doc)]
mod tests {
use std::collections::HashSet;
use crate::{ChainType, GeneType, STATIC_IMGT, Selection, Species, select::contains_gene};
#[test]
fn try_first_human() {
let selection = Selection::default()
.species([Species::HomoSapiens])
.chain([ChainType::Heavy])
.gene([GeneType::V]);
let first = selection.germlines(&STATIC_IMGT).next().unwrap();
assert_eq!(first.name(), "IGHV1-2*01");
}
#[test]
fn try_first_g_human() {
let selection = Selection::default()
.species([Species::HomoSapiens])
.chain([ChainType::Heavy])
.gene([GeneType::C(Some(crate::Constant::G))]);
let first = selection.germlines(&STATIC_IMGT).next().unwrap();
assert_eq!(first.name(), "IGHGP*01");
}
#[test]
fn gene_selections() {
let constant = HashSet::from([GeneType::C(None)]);
assert!(contains_gene(&constant, GeneType::C(None)));
assert!(contains_gene(
&constant,
GeneType::C(Some(crate::Constant::G))
));
assert!(contains_gene(
&constant,
GeneType::C(Some(crate::Constant::A))
));
let constant_g = HashSet::from([GeneType::C(Some(crate::Constant::G))]);
assert!(!contains_gene(&constant_g, GeneType::C(None)));
assert!(contains_gene(
&constant_g,
GeneType::C(Some(crate::Constant::G))
));
assert!(!contains_gene(
&constant_g,
GeneType::C(Some(crate::Constant::A))
));
}
}