use crate::imgt_gene::IMGTGene;
use crate::structs::{AASequence, DataItem, Location, Region};
use crate::{NotASpecies, Species};
use itertools::Itertools;
use mzcore::sequence::AminoAcid;
use std::collections::HashMap;
use std::io::BufRead;
pub(crate) fn parse_dat<T: BufRead>(reader: T) -> impl Iterator<Item = Result<DataItem, String>> {
reader
.lines()
.batching(|f| {
let mut data = PreDataItem::default();
for line in f.filter_map(Result::ok) {
if parse_dat_line(&mut data, &line) {
return Some(data);
}
}
None
})
.filter(|pre| {
pre.kw.contains(&"immunoglobulin (IG)".to_string())
&& (pre.kw.contains(&"functional".to_string())
|| pre.kw.contains(&"germline".to_string())
|| pre.kw.contains(&"productive".to_string()))
&& pre.os.is_some()
})
.map(DataItem::new)
}
fn parse_dat_line(data: &mut PreDataItem, line: &str) -> bool {
if line.len() < 2 {
return false;
}
match &line[..2] {
"//" => return true,
"ID" => data.id = line.to_string(),
"KW" => data.kw.extend(
line[5..]
.split(';')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty()),
),
"FH" if line.starts_with("FH Key") => {
data.ft_key_width = line.find("Location").expect("Incorrect FH line") - 5;
}
"FT" => data.ft.push(line[5..].to_string()),
"OS" if data.os.is_none() => {
data.os = Species::from_imgt(line[5..].trim()).unwrap_or_else(|NotASpecies| {
println!("Not a species name: `{line}`");
None
});
}
" " => data.sq.extend(
line.chars()
.filter(|c| *c == 'c' || *c == 'a' || *c == 't' || *c == 'g' || *c == 'n'),
),
_ => (),
}
false
}
impl DataItem {
fn new(data: PreDataItem) -> Result<Self, String> {
let mut result = Self {
id: data.id[5..].split(';').next().unwrap().to_string(),
species: data.os.ok_or("No species found")?,
sequence: data.sq,
genes: Vec::new(),
regions: Vec::new(),
};
let mut current: Option<Region> = None;
let mut is_sequence = false;
for line in data.ft {
if !line.starts_with(' ') || current.is_none() {
if let Some(region) = current.take() {
result.add_region(region);
}
if let Ok(location) = line[data.ft_key_width..].parse() {
let (key, location) = (line[..data.ft_key_width].trim().to_string(), location);
current = Some(Region {
acc: result.id.clone(),
key,
location,
reported_seq: String::new(),
found_seq: Err("Not loaded".to_string()),
allele: String::new(),
functional: true,
partial: false,
shift: 0,
splice_aa: None,
});
}
continue;
}
if let Some(current) = &mut current {
Self::parse_ft_line(&line, current, &mut is_sequence)?;
}
}
if let Some(region) = current.take() {
result.add_region(region);
}
Ok(result)
}
fn parse_ft_line(
line: &str,
current: &mut Region,
is_sequence: &mut bool,
) -> Result<(), String> {
let trimmed = line.trim();
let split = trimmed
.split_once('=')
.map(|(key, tail)| (key.to_ascii_lowercase(), tail));
match split.as_ref().map(|(key, tail)| (key.as_str(), tail)) {
Some(("/translation", tail)) => {
current.reported_seq += tail.trim_matches('\"');
*is_sequence = !tail.ends_with('\"');
}
Some(("/imgt_allele", tail)) => {
current.allele = tail.trim_matches('\"').to_string();
}
Some(("/codon_start", tail)) => {
current.shift = tail
.parse::<usize>()
.map_err(|_| format!("Not a valid codon_start: '{tail}'"))?
- 1;
}
Some(("/splice-expectedcodon", tail)) => {
if let Some(i) = tail.find(']') {
current.splice_aa = AminoAcid::try_from(tail.as_bytes()[i - 1]).ok();
}
}
Some(("/functional", _)) => {
current.functional = true;
}
Some(("/note" | "/imgt_note", s)) if s.to_ascii_lowercase().contains("functional") => {
current.functional = true;
}
Some(("/pseudo", _)) => {
current.functional = false;
}
Some(("/partial", _)) => current.partial = true,
None if *is_sequence => {
current.reported_seq += trimmed.trim_end_matches('\"');
*is_sequence = !trimmed.ends_with('\"');
}
_ => (),
}
Ok(())
}
fn add_region(&mut self, mut region: Region) {
region.found_seq = self.get_sequence(®ion.location, region.shift);
if ["V-GENE", "C-GENE", "J-GENE"].contains(®ion.key.as_str()) && region.functional
&& !region.partial
&& region.allele.starts_with("IG")
{
self.genes.push(IMGTGene {
acc: region.acc,
key: region.key,
location: region.location,
allele: region.allele,
regions: HashMap::new(),
});
} else if ["V-REGION", "C-REGION", "J-REGION"].contains(®ion.key.as_str()) && region.functional
&& !region.partial
&& region.allele.starts_with("IG")
{
if let Some(existing) = self.genes.iter_mut().find(|g| g.allele == region.allele) {
existing.regions.insert(region.key.clone(), region);
} else {
self.genes.push(IMGTGene {
acc: region.acc,
key: region.key,
location: region.location,
allele: region.allele,
regions: HashMap::new(),
});
}
} else if [
"FR1-IMGT",
"FR2-IMGT",
"FR3-IMGT",
"FR4-IMGT",
"CDR1-IMGT",
"CDR2-IMGT",
"CDR3-IMGT",
"1st-CYS",
"2nd-CYS",
"CONSERVED-TRP",
"J-REGION",
"CH1",
"CH2",
"H-CH2",
"CH3",
"CH3-CHS",
"CH4",
"CH4-CHS",
"CH5",
"CH5-CHS",
"CH6",
"CH6-CHS",
"CH7",
"CH7-CHS",
"CH8",
"CH8-CHS",
"CH9",
"CH9-CHS",
"CHS",
"CL",
"C-REGION",
"H", "H1",
"H2",
"H3",
"H4",
"M",
"M1",
"M2",
]
.contains(®ion.key.as_str())
{
if region.key == "CDR3-IMGT" {
if let Some(gene) = self
.genes
.iter_mut()
.find(|g| g.location.overlaps(®ion.location))
{
gene.regions.insert(region.key.clone(), region);
} else {
self.regions.push(region);
}
} else if let Some(gene) = self
.genes
.iter_mut()
.find(|g| g.location.contains(®ion.location))
{
gene.regions.insert(region.key.clone(), region);
} else {
self.regions.push(region);
}
}
}
fn get_sequence(&self, slice: &Location, shift: usize) -> Result<(String, AASequence), String> {
let (inner_shift, shift) = if shift == 2 { (1, 0) } else { (0, shift) };
translate(
&match slice {
Location::Normal(range) => {
if *range.start() < inner_shift {
return Err("Shift outside of range".to_string());
}
self.sequence
.get(range.start() - inner_shift..=*range.end())
.ok_or("Normal outside of range")?
.to_string()
}
Location::SingleNormal(index) => char::from(
*self
.sequence
.as_bytes()
.get(*index)
.ok_or("Single normal outside of range")?,
)
.to_string(),
Location::Complement(range) => complement(
self.sequence
.get(*range.start()..=*range.end() + inner_shift)
.ok_or("Complement outside of range")?
.to_string(),
),
Location::SingleComplement(index) => complement(
char::from(
*self
.sequence
.as_bytes()
.get(*index)
.ok_or("Single complement outside of range")?,
)
.to_string(),
),
}[shift..],
)
.map(|(s, v)| (s.to_owned(), AASequence(v)))
}
}
#[derive(Debug, Default)]
struct PreDataItem {
id: String,
kw: Vec<String>,
ft_key_width: usize,
ft: Vec<String>,
os: Option<Species>,
sq: String,
}
pub(crate) fn complement(s: String) -> String {
let map = HashMap::from([
(b'a', b't'),
(b't', b'a'),
(b'c', b'g'),
(b'g', b'c'),
(b'n', b'n'),
]);
String::from_utf8(
s.as_bytes()
.iter()
.map(|c| {
*map.get(c)
.unwrap_or_else(|| panic!("Invalid sequence: {} in `{s}`", char::from(*c)))
})
.rev()
.collect(),
)
.unwrap()
}
pub(crate) fn translate(s: &str) -> Result<(&str, Vec<AminoAcid>), String> {
if s.len() < 3 {
Ok((s, Vec::new()))
} else {
Ok((
s,
(0..=s.len() - 3)
.step_by(3)
.filter_map(|chunk| {
AminoAcid::from_dna(&s[chunk..chunk + 3])
.map_err(|_| format!("Not a codon {}", &s[chunk..chunk + 3]))
.transpose()
})
.collect::<Result<Vec<AminoAcid>, String>>()?,
))
}
}