rustyms/imgt/
select.rs

1#[cfg(all(feature = "rayon", not(feature = "internal-no-data")))]
2use rayon::prelude::*;
3use std::collections::HashSet;
4
5use crate::sequence::{
6    AnnotatedPeptide, Annotation, HasPeptidoformImpl, Peptidoform, Region, UnAmbiguous,
7};
8
9pub(super) use super::*;
10
11/// Get a specific germline
12#[cfg(not(feature = "internal-no-data"))]
13pub fn get_germline(
14    species: Species,
15    gene: Gene,
16    allele: Option<usize>,
17) -> Option<Allele<'static>> {
18    germlines(species).and_then(|g| g.find(species, gene, allele))
19}
20
21/// The selection rules for iterating over a selection of germlines.
22#[derive(Clone, Debug, Eq, PartialEq)]
23pub struct Selection<S1: std::hash::BuildHasher, S2: std::hash::BuildHasher> {
24    /// The species you want, None allows all, otherwise only the species specified will be returned
25    pub species: Option<HashSet<Species, S1>>,
26    /// The chain of genes you want, None allows all, otherwise only the chains specified will be returned
27    pub chains: Option<HashSet<ChainType, S2>>,
28    /// The kind of genes you want, None allows all, otherwise only the genes specified will be returned
29    pub genes: Option<HashSet<GeneType>>,
30    /// The way of handling alleles you want
31    pub allele: AlleleSelection,
32}
33
34impl<S1: std::hash::BuildHasher, S2: std::hash::BuildHasher> Selection<S1, S2> {
35    /// Builder pattern method to add a species selection, will replace any previously set species selection
36    #[must_use]
37    pub fn species(self, species: impl Into<HashSet<Species, S1>>) -> Self {
38        Self {
39            species: Some(species.into()),
40            ..self
41        }
42    }
43
44    /// Builder pattern method to add a chain selection, will replace any previously set chain selection
45    #[must_use]
46    pub fn chain(self, chains: impl Into<HashSet<ChainType, S2>>) -> Self {
47        Self {
48            chains: Some(chains.into()),
49            ..self
50        }
51    }
52
53    /// Builder pattern method to add a gene selection, will replace any previously set gene selection
54    #[must_use]
55    pub fn gene(self, genes: impl Into<HashSet<GeneType>>) -> Self {
56        Self {
57            genes: Some(genes.into()),
58            ..self
59        }
60    }
61
62    /// Builder pattern method to add an allele selection, will replace any previously set allele selection
63    #[must_use]
64    pub fn allele(self, allele: AlleleSelection) -> Self {
65        Self { allele, ..self }
66    }
67}
68
69impl<
70    S1: std::hash::BuildHasher + Clone + Send + Sync,
71    S2: std::hash::BuildHasher + Clone + Send + Sync,
72> Selection<S1, S2>
73{
74    /// Get the selected alleles
75    #[cfg(not(feature = "internal-no-data"))]
76    pub fn germlines(self) -> impl Iterator<Item = Allele<'static>> {
77        all_germlines()
78            .filter(move |g| self.species.as_ref().is_none_or(|s| s.contains(&g.species)))
79            .flat_map(|g| g.into_iter().map(|c| (g.species, c.0, c.1)))
80            .filter(move |(_, kind, _)| self.chains.as_ref().is_none_or(|k| k.contains(kind)))
81            .flat_map(|(species, _, c)| c.into_iter().map(move |g| (species, g.0, g.1)))
82            .filter(move |(_, gene, _)| self.genes.as_ref().is_none_or(|s| contains_gene(s, *gene)))
83            .flat_map(|(species, _, germlines)| germlines.iter().map(move |a| (species, a)))
84            .flat_map(move |(species, germline)| {
85                germline
86                    .into_iter()
87                    .take(self.allele.take_num())
88                    .map(move |(a, seq)| (species, &germline.name, *a, seq))
89            })
90            .map(Into::into)
91    }
92
93    #[cfg(all(feature = "rayon", not(feature = "internal-no-data")))]
94    /// Get the selected alleles in parallel fashion, only available if you enable the feature "rayon" (on by default)
95    pub fn par_germlines(self) -> impl ParallelIterator<Item = Allele<'static>> {
96        par_germlines()
97            .filter(move |g| self.species.as_ref().is_none_or(|s| s.contains(&g.species)))
98            .flat_map(|g| g.into_par_iter().map(|c| (g.species, c.0, c.1)))
99            .filter(move |(_, kind, _)| self.chains.as_ref().is_none_or(|k| k.contains(kind)))
100            .flat_map(|(species, _, c)| c.into_par_iter().map(move |g| (species, g.0, g.1)))
101            .filter(move |(_, gene, _)| self.genes.as_ref().is_none_or(|s| contains_gene(s, *gene)))
102            .flat_map(|(species, _, germlines)| {
103                germlines.into_par_iter().map(move |a| (species, a))
104            })
105            .flat_map(move |(species, germline)| {
106                germline
107                    .into_par_iter()
108                    .take(self.allele.take_num())
109                    .map(move |(a, seq)| (species, &germline.name, *a, seq))
110            })
111            .map(Into::into)
112    }
113}
114
115#[cfg(not(feature = "internal-no-data"))]
116fn contains_gene(s: &HashSet<GeneType>, gene: GeneType) -> bool {
117    s.contains(&gene) || matches!(gene, GeneType::C(_)) && s.contains(&GeneType::C(None))
118}
119
120impl<S1: std::hash::BuildHasher, S2: std::hash::BuildHasher> Default for Selection<S1, S2> {
121    /// Get a default selection, which gives all kinds and genes but only returns the first allele
122    fn default() -> Self {
123        Self {
124            species: None,
125            chains: None,
126            genes: None,
127            allele: AlleleSelection::First,
128        }
129    }
130}
131
/// How the alleles of a single germline are handled when iterating
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum AlleleSelection {
    /// Yield every allele
    All,
    /// Yield only the first allele. Its number can be higher than 1 if the previous alleles are not functional.
    First,
}

impl AlleleSelection {
    /// The maximal number of alleles to take from one germline: one for `First` and effectively
    /// unbounded (`usize::MAX`) for `All`
    #[cfg(not(feature = "internal-no-data"))]
    const fn take_num(self) -> usize {
        match self {
            Self::All => usize::MAX,
            Self::First => 1,
        }
    }
}
150
151/// A returned allele
152#[non_exhaustive] // Do not let anyone build it themselves
153#[derive(Clone, Debug, Eq, PartialEq)]
154pub struct Allele<'a> {
155    /// The species where this gene originates from
156    pub species: Species,
157    /// The gene where this is the sequence for, eg `IGHV3-23`
158    pub gene: std::borrow::Cow<'a, Gene>,
159    /// The allele number, in IMGT this follows the name, eg `*01` is the allele in `IGHV3-23*01`
160    pub number: usize,
161    /// The actual sequence, the sequences present in the database are pure amino acids, no modifications are to be expected
162    pub sequence: &'a Peptidoform<UnAmbiguous>,
163    /// The regions in the sequence, every region has an annotation and a length, all lengths together are the same length as the full sequence
164    pub regions: &'a [(Region, usize)],
165    /// Any additional annotations, every annotation has beside the kind it is also its location, as index in the sequence
166    pub annotations: &'a [(Annotation, usize)],
167}
168
169impl Allele<'_> {
170    /// Get the IMGT name for this allele
171    pub fn name(&self) -> String {
172        format!("{}*{:02}", self.gene, self.number)
173    }
174
175    /// Get the biologists name for this allele with fancy non ASCII characters
176    pub fn fancy_name(&self) -> String {
177        format!("{}*{:02}", self.gene.to_fancy_string(), self.number)
178    }
179}
180
181impl HasPeptidoformImpl for Allele<'_> {
182    type Complexity = UnAmbiguous;
183    fn peptidoform(&self) -> &Peptidoform<Self::Complexity> {
184        self.sequence
185    }
186}
187
188impl AnnotatedPeptide for Allele<'_> {
189    fn annotations(&self) -> &[(Annotation, usize)] {
190        self.annotations
191    }
192    fn regions(&self) -> &[(Region, usize)] {
193        self.regions
194    }
195}
196
197impl<'a> From<(Species, &'a Gene, usize, &'a AnnotatedSequence)> for Allele<'a> {
198    fn from(value: (Species, &'a Gene, usize, &'a AnnotatedSequence)) -> Self {
199        Self {
200            species: value.0,
201            gene: std::borrow::Cow::Borrowed(value.1),
202            number: value.2,
203            sequence: &value.3.sequence,
204            regions: &value.3.regions,
205            annotations: &value.3.annotations,
206        }
207    }
208}
209
210impl Germlines {
211    /// Find a specific germline.
212    pub fn find(&self, species: Species, gene: Gene, allele: Option<usize>) -> Option<Allele<'_>> {
213        let chain = match gene.chain {
214            ChainType::Heavy => &self.h,
215            ChainType::LightKappa => &self.k,
216            ChainType::LightLambda => &self.l,
217            ChainType::Iota => &self.i,
218        };
219        let genes = match gene.kind {
220            GeneType::V => &chain.variable,
221            GeneType::J => &chain.joining,
222            GeneType::C(None) => &chain.c,
223            GeneType::C(Some(Constant::A)) => &chain.a,
224            GeneType::C(Some(Constant::D)) => &chain.d,
225            GeneType::C(Some(Constant::E)) => &chain.e,
226            GeneType::C(Some(Constant::G)) => &chain.g,
227            GeneType::C(Some(Constant::M)) => &chain.m,
228            GeneType::C(Some(Constant::O)) => &chain.o,
229            GeneType::C(Some(Constant::T)) => &chain.t,
230        };
231        genes
232            .binary_search_by(|g| g.name.cmp(&gene))
233            .ok()
234            .and_then(|g| {
235                let g = &genes[g];
236                allele.map_or_else(
237                    || g.alleles.first(),
238                    |a| g.alleles.iter().find(|(ga, _)| a == *ga),
239                )
240            })
241            .map(move |(a, seq)| Allele {
242                species,
243                gene: std::borrow::Cow::Owned(gene),
244                number: *a,
245                sequence: &seq.sequence,
246                regions: &seq.regions,
247                annotations: &seq.annotations,
248            })
249    }
250}
251
#[cfg(all(test, not(feature = "internal-no-data")))]
#[expect(clippy::missing_panics_doc)]
mod tests {
    use std::collections::HashSet;

    use super::contains_gene;
    use super::{ChainType, Constant, GeneType, Selection, Species};

    /// The first human heavy chain V germline is the expected one
    #[test]
    fn try_first_human() {
        let mut alleles = Selection::default()
            .species([Species::HomoSapiens])
            .chain([ChainType::Heavy])
            .gene([GeneType::V])
            .germlines();
        let name = alleles.next().unwrap().name();
        assert_eq!(name, "IGHV1-2*01");
    }

    /// Selecting the G constant genes gives the expected first human heavy chain germline
    #[test]
    fn try_first_g_human() {
        let mut alleles = Selection::default()
            .species([Species::HomoSapiens])
            .chain([ChainType::Heavy])
            .gene([GeneType::C(Some(Constant::G))])
            .germlines();
        let name = alleles.next().unwrap().name();
        assert_eq!(name, "IGHGP*01");
    }

    /// `C(None)` selects all constant genes while a specific constant only selects itself
    #[test]
    fn gene_selections() {
        let any_constant = HashSet::from([GeneType::C(None)]);
        for accepted in [
            GeneType::C(None),
            GeneType::C(Some(Constant::G)),
            GeneType::C(Some(Constant::A)),
        ] {
            assert!(contains_gene(&any_constant, accepted));
        }

        let only_g = HashSet::from([GeneType::C(Some(Constant::G))]);
        assert!(contains_gene(&only_g, GeneType::C(Some(Constant::G))));
        for rejected in [GeneType::C(None), GeneType::C(Some(Constant::A))] {
            assert!(!contains_gene(&only_g, rejected));
        }
    }
}