use std::collections::HashMap;
use std::fmt::Display;
use crate::{AnnotatedSequence, Gene};
use itertools::Itertools;
use mzcore::sequence::{AminoAcid, Annotation, CheckedAminoAcid, Region};
use crate::structs::{Location, SequenceRegion, SingleSeq};
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct IMGTGene {
pub acc: String,
pub key: String,
pub location: Location,
pub allele: String,
pub regions: HashMap<String, crate::structs::Region>,
}
impl IMGTGene {
pub(crate) fn finish(self) -> Result<SingleSeq, String> {
let (regions, additional_annotations) = self.get_regions()?;
let sequence: Vec<AminoAcid> = regions.iter().flat_map(|reg| reg.1.0.clone()).collect();
let dna: String = regions.iter().map(|reg| reg.1.2.clone()).collect();
let region_lengths = regions
.iter()
.map(|reg| (reg.0.clone(), reg.1.0.len()))
.collect();
let conserved_map = HashMap::from([
("1st-CYS", Annotation::Conserved),
("2nd-CYS", Annotation::Conserved),
("CONSERVED-TRP", Annotation::Conserved),
("J-PHE", Annotation::Conserved),
("J-TRP", Annotation::Conserved),
]);
let mut conserved = self
.regions
.iter()
.filter(|(key, _)| {
["1st-CYS", "2nd-CYS", "CONSERVED-TRP", "J-PHE", "J-TRP"].contains(&key.as_str())
})
.map(|(key, region)| {
region
.location
.find_aa_location(®ions)
.map(|index| (conserved_map[key.as_str()].clone(), index))
.ok_or_else(|| format!("Cannot find location of '{key}' '{region}'"))
})
.collect::<Result<Vec<_>, _>>()?;
conserved.extend(
find_possible_n_glycan_locations(&sequence)
.iter()
.map(|i| (Annotation::NGlycan, *i)),
);
conserved.extend(additional_annotations);
let (name, allele) = Gene::from_imgt_name_with_allele(self.allele.as_str())?;
Ok(SingleSeq {
name,
allele,
acc: self.acc,
sequence: AnnotatedSequence::new(
sequence
.iter()
.copied()
.map(CheckedAminoAcid::new)
.map(|p| p.into_unambiguous().unwrap())
.collect(),
region_lengths,
conserved,
),
dna,
})
}
fn get_regions(&self) -> Result<(Vec<SequenceRegion>, Vec<(Annotation, usize)>), String> {
let mut additional_annotations = Vec::new();
let regions = match self.key.as_str() {
"V-GENE" => {
vec![
self.get_region(&Region::Framework(1), "FR1-IMGT")?,
self.get_region(&Region::ComplementarityDetermining(1), "CDR1-IMGT")?,
self.get_region(&Region::Framework(2), "FR2-IMGT")?,
self.get_region(&Region::ComplementarityDetermining(2), "CDR2-IMGT")?,
self.get_region(&Region::Framework(3), "FR3-IMGT")?,
self.get_region(&Region::ComplementarityDetermining(3), "CDR3-IMGT")?,
]
}
"J-GENE" => {
let j = self.get_region(&Region::Framework(4), "J-REGION")?;
let motif = j.1.0.iter().tuple_windows().position(|(a, b, _, d)| {
(*a == AminoAcid::Tryptophan || *a == AminoAcid::Phenylalanine)
&& *b == AminoAcid::Glycine
&& *d == AminoAcid::Glycine
});
if let Some(motif_start) = motif {
let j = fix_j(j.1, motif_start);
additional_annotations.extend(j.1);
j.0
} else {
vec![j]
}
}
"D-GENE" => {
vec![self.get_region(&Region::ComplementarityDetermining(3), "D-REGION")?]
}
"C-GENE" => {
let mut seq = Vec::new();
seq.extend(self.get_region(&Region::ConstantHeavy(1), "CH1").ok());
if self.regions.contains_key("H") && self.regions.contains_key("CH2") {
seq.extend(self.get_region(&Region::Hinge(None), "H").ok());
seq.extend(self.get_region(&Region::ConstantHeavy(2), "CH2").ok());
} else if self.regions.contains_key("H-CH2") {
seq.extend(
self.get_region(
&Region::Joined(vec![Region::Hinge(None), Region::ConstantHeavy(2)]),
"H-CH2",
)
.ok(),
);
} else {
seq.extend(self.get_region(&Region::Hinge(Some(1)), "H1").ok());
seq.extend(self.get_region(&Region::Hinge(Some(2)), "H2").ok());
seq.extend(self.get_region(&Region::Hinge(Some(3)), "H3").ok());
seq.extend(self.get_region(&Region::Hinge(Some(4)), "H4").ok());
seq.extend(self.get_region(&Region::ConstantHeavy(2), "CH2").ok());
}
let mut secretory_added = false;
for (region, with_chs) in [
(
Region::ConstantHeavy(3),
Region::Joined(vec![Region::ConstantHeavy(3), Region::SecratoryTail]),
),
(
Region::ConstantHeavy(4),
Region::Joined(vec![Region::ConstantHeavy(4), Region::SecratoryTail]),
),
(
Region::ConstantHeavy(5),
Region::Joined(vec![Region::ConstantHeavy(5), Region::SecratoryTail]),
),
(
Region::ConstantHeavy(6),
Region::Joined(vec![Region::ConstantHeavy(6), Region::SecratoryTail]),
),
(
Region::ConstantHeavy(7),
Region::Joined(vec![Region::ConstantHeavy(7), Region::SecratoryTail]),
),
(
Region::ConstantHeavy(8),
Region::Joined(vec![Region::ConstantHeavy(8), Region::SecratoryTail]),
),
(
Region::ConstantHeavy(9),
Region::Joined(vec![Region::ConstantHeavy(9), Region::SecratoryTail]),
),
] {
if self.regions.contains_key(®ion.to_string())
&& self.regions.contains_key("CHS")
{
seq.extend(self.get_region(®ion, ®ion.to_string()).ok());
} else if self.regions.contains_key(&with_chs.to_string()) {
seq.extend(self.get_region(&with_chs, &with_chs.to_string()).ok());
secretory_added = true;
} else {
seq.extend(self.get_region(®ion, ®ion.to_string()).ok());
}
}
if !secretory_added {
seq.extend(self.get_region(&Region::SecratoryTail, "CHS").ok());
}
if seq.is_empty() {
seq.extend(self.get_region(&Region::ConstantLight, "CL").ok());
}
if seq.is_empty() {
seq.extend(self.get_region(&Region::ConstantLight, "C-REGION").ok());
}
if seq.is_empty() {
return Err("Empty C sequence".to_string());
}
seq
}
_ => Vec::new(),
};
Ok((regions, additional_annotations))
}
fn get_region(&self, region: &Region, key: &str) -> Result<SequenceRegion, String> {
self.regions
.get(key)
.ok_or_else(|| format!("Could not find {key}"))
.and_then(|region| {
region
.found_seq
.as_ref()
.map(|seq| {
let mut final_seq = region
.splice_aa
.map(|aa| vec![aa])
.filter(|_| region.shift != 2)
.unwrap_or_default();
final_seq.extend(seq.1.0.clone());
(final_seq, region.location.clone(), seq.0.clone())
})
.map_err(ToOwned::to_owned)
})
.map(|res| (region.clone(), res))
}
}
impl Display for IMGTGene {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "{}\t{}\t{}", self.key, self.location, self.allele)?;
for region in self.regions.values().sorted_by_key(|reg| ®.key) {
writeln!(f, " R {region}")?;
}
Ok(())
}
}
fn find_possible_n_glycan_locations(sequence: &[AminoAcid]) -> Vec<usize> {
let mut result = Vec::new();
for (index, aa) in sequence.windows(3).enumerate() {
if let (AminoAcid::Asparagine, AminoAcid::Serine | AminoAcid::Threonine) = (aa[0], aa[2])
&& aa[1] != AminoAcid::Proline
{
result.push(index);
}
}
result
}
fn fix_j(
j: (Vec<AminoAcid>, Location, String),
cdr3_length: usize,
) -> (Vec<SequenceRegion>, Vec<(Annotation, usize)>) {
let (cdr3_loc, fr4_loc) =
j.1.splice(cdr3_length)
.expect("CDR3 should fit in full FR4 of J gene");
let cdr3 = (
j.0[..cdr3_length].to_vec(),
cdr3_loc,
j.2[..cdr3_length].to_owned(),
);
let fr4 = (
j.0[cdr3_length..].to_vec(),
fr4_loc,
j.2[cdr3_length..].to_owned(),
);
let mut annotations = Vec::new();
if fr4.0[0] == AminoAcid::Tryptophan || fr4.0[0] == AminoAcid::Phenylalanine {
annotations.push((Annotation::Conserved, cdr3_length));
}
if fr4.0[1] == AminoAcid::Glycine {
annotations.push((Annotation::Conserved, cdr3_length + 1));
}
if fr4.0[3] == AminoAcid::Glycine {
annotations.push((Annotation::Conserved, cdr3_length + 3));
}
(
vec![
(Region::ComplementarityDetermining(3), cdr3),
(Region::Framework(4), fr4),
],
annotations,
)
}