use crate::grangers_utils::{self, equal_length};
use anyhow::bail;
use noodles::fasta;
use std::collections::HashMap;
use std::io::BufRead;
use std::path::Path;
/// Per-sequence metadata for a set of reference sequences.
///
/// Every optional per-sequence vector is expected to hold one entry per
/// element of `seqname`, in the same order.
#[derive(Clone, Debug)]
pub struct SeqInfo {
    /// Names of the sequences.
    seqname: Vec<String>,
    /// Length of each sequence, if known.
    seqlengths: Option<Vec<usize>>,
    /// Circularity flag of each sequence, if known.
    is_circular: Option<Vec<bool>>,
    /// Free-form genome label, if any.
    genome: Option<String>,
    /// Additional named per-sequence string columns, if any.
    extra: Option<HashMap<String, Vec<String>>>,
}
impl SeqInfo {
    /// Returns the sequence names.
    pub fn seqname(&self) -> &Vec<String> {
        &self.seqname
    }

    /// Returns the per-sequence lengths, if they were provided.
    pub fn seqlengths(&self) -> &Option<Vec<usize>> {
        &self.seqlengths
    }

    /// Returns the per-sequence circularity flags, if they were provided.
    pub fn is_circular(&self) -> &Option<Vec<bool>> {
        &self.is_circular
    }

    /// Returns the genome label, if one was provided.
    pub fn genome(&self) -> &Option<String> {
        &self.genome
    }

    /// Returns the extra per-sequence columns, if any were provided.
    pub fn extra(&self) -> &Option<HashMap<String, Vec<String>>> {
        &self.extra
    }

    /// Replaces the sequence names.
    ///
    /// No length check is performed against the other per-sequence fields.
    pub fn set_seqnames(&mut self, seqname: Vec<String>) {
        self.seqname = seqname;
    }

    /// Builds a `SeqInfo` after checking that every provided per-sequence
    /// vector has the same length as `seqname`.
    ///
    /// # Errors
    ///
    /// Returns an error when `seqlengths`, `is_circular`, or any column in
    /// `extra` is present but its length differs from that of `seqname`.
    pub fn new(
        seqname: Vec<String>,
        seqlengths: Option<Vec<usize>>,
        is_circular: Option<Vec<bool>>,
        genome: Option<String>,
        extra: Option<HashMap<String, Vec<String>>>,
    ) -> anyhow::Result<SeqInfo> {
        // NOTE(review): relies on `equal_length` returning `true` when both
        // slices hold the same number of elements; a mismatch is the error.
        if let Some(v) = &seqlengths {
            if !equal_length(&seqname[..], &v[..]) {
                bail!("seqname and seqlengths have different length; Could not create SeqInfo")
            }
        }
        if let Some(v) = &is_circular {
            if !equal_length(&seqname[..], &v[..]) {
                bail!("seqname and is_circular have different length; Could not create SeqInfo")
            }
        }
        if let Some(hm) = &extra {
            for (k, v) in hm.iter() {
                if !equal_length(&seqname[..], &v[..]) {
                    bail!(
                        "seqname and {} have different length; Could not create SeqInfo",
                        k
                    )
                }
            }
        }
        // Store the validated `extra` columns rather than discarding them.
        Ok(SeqInfo {
            seqname,
            seqlengths,
            is_circular,
            genome,
            extra,
        })
    }

    /// Builds a `SeqInfo` from a FASTA file.
    ///
    /// Sequence names and lengths come from `get_chromsize`; the (lossily
    /// converted) file path is stored as the genome label, and no circularity
    /// flags or extra columns are set.
    ///
    /// # Errors
    ///
    /// Propagates any error from `get_chromsize` or from [`SeqInfo::new`].
    pub fn from_fasta<T: AsRef<Path>>(file_path: T) -> anyhow::Result<SeqInfo> {
        let (seqname, seqlengths) = get_chromsize(&file_path)?;
        SeqInfo::new(
            seqname,
            Some(seqlengths),
            None,
            Some(file_path.as_ref().to_string_lossy().into_owned()),
            None,
        )
    }
}
/// Reads the FASTA file at `file_path` and returns its record names together
/// with the matching sequence lengths, in file order.
///
/// # Errors
///
/// Fails when the reader cannot be created for `file_path`, or when scanning
/// the records fails.
pub fn get_chromsize<T: AsRef<Path>>(file_path: T) -> anyhow::Result<(Vec<String>, Vec<usize>)> {
    let mut fasta_reader = grangers_utils::get_noodles_reader_from_path(file_path)?;
    let names_and_lengths = _get_chromsize(&mut fasta_reader)?;
    Ok(names_and_lengths)
}
/// Walks every record of `rdr` and collects, in order, the record names and
/// the sequence lengths as two parallel vectors.
///
/// Each name is cut at its first space, so only the leading token is kept.
///
/// # Errors
///
/// Fails when a record cannot be read or when a record name is not valid
/// UTF-8.
fn _get_chromsize<T: BufRead>(
    rdr: &mut fasta::Reader<T>,
) -> anyhow::Result<(Vec<String>, Vec<usize>)> {
    let mut names: Vec<String> = Vec::new();
    let mut lengths: Vec<usize> = Vec::new();

    for entry in rdr.records() {
        let rec = entry?;
        let full_name = std::str::from_utf8(rec.name())?;
        // Keep only the part of the name that precedes the first space.
        let short_name = match full_name.split_once(' ') {
            Some((head, _)) => head,
            None => full_name,
        };
        names.push(short_name.to_string());
        lengths.push(rec.sequence().len());
    }

    Ok((names, lengths))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Parses a two-record in-memory FASTA and checks names and lengths.
    ///
    /// The first header carries a description after a space and the second
    /// record spans several lines, so both the name handling and the
    /// multi-line length are exercised.
    #[test]
    fn test_get_chromsize() {
        let fasta_data = b">sq0 test\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
        let reader = Cursor::new(fasta_data);
        let mut rdr =
            grangers_utils::get_noodles_reader_from_reader(reader).expect("can't construct reader");
        let (names, lengths) = _get_chromsize(&mut rdr).unwrap();
        assert_eq!(names, ["sq0", "sq1"]);
        assert_eq!(lengths, [4, 10]);
    }
}