use flate2::bufread::MultiGzDecoder;
use std::fs::File;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
use tracing::trace;
/// FASTA reader over a boxed, dynamically dispatched [`BufRead`] source.
///
/// Boxing the source lets plain and gzip-decompressed inputs share a single
/// reader type; both `get_noodles_reader_from_*` constructors in this file
/// return it.
pub type FastaReader = noodles::fasta::Reader<Box<dyn BufRead>>;
/// The two strand symbols considered valid: `"+"` and `"-"`.
pub(crate) const VALIDSTRANDS: [&str; 2] = ["+", "-"];
/// File formats known to this module.
///
/// A value can be parsed from a case-insensitive name through the `FromStr`
/// implementation and is printed as its upper-case name through `Display`.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum FileFormat {
/// Parsed from `"gtf"` or `"gff2"`.
GTF,
/// Parsed from `"gff"` or `"gff3"`.
GFF,
/// Parsed from `"bed"`.
BED,
/// Parsed from `"sam"`.
SAM,
/// Parsed from `"bam"`.
BAM,
/// Parsed from `"fasta"`.
FASTA,
/// Parsed from `"fastq"`.
FASTQ,
}
impl FileFormat {
    /// Returns the attribute names that are essential for this format.
    ///
    /// GTF and GFF both yield `GXFESSENTIALATTRIBUTES`; every other format
    /// yields an empty slice.
    pub fn get_essential(&self) -> &[&str] {
        match self {
            FileFormat::GTF | FileFormat::GFF => GXFESSENTIALATTRIBUTES.as_ref(),
            FileFormat::BED
            | FileFormat::SAM
            | FileFormat::BAM
            | FileFormat::FASTA
            | FileFormat::FASTQ => &[],
        }
    }

    /// Returns `true` only when the format is [`FileFormat::GTF`].
    pub fn is_gtf(&self) -> bool {
        *self == FileFormat::GTF
    }
}
impl std::str::FromStr for FileFormat {
    type Err = anyhow::Error;

    /// Parses a format name, ignoring letter case.
    ///
    /// `"gff2"` is an alias of `"gtf"` and `"gff3"` is an alias of `"gff"`.
    ///
    /// # Errors
    /// Returns an error when the name matches none of the known formats.
    fn from_str(s: &str) -> anyhow::Result<FileFormat> {
        let name = s.to_lowercase();
        match name.as_str() {
            "gtf" | "gff2" => Ok(FileFormat::GTF),
            "gff" | "gff3" => Ok(FileFormat::GFF),
            "bed" => Ok(FileFormat::BED),
            "sam" => Ok(FileFormat::SAM),
            "bam" => Ok(FileFormat::BAM),
            "fasta" => Ok(FileFormat::FASTA),
            "fastq" => Ok(FileFormat::FASTQ),
            _ => anyhow::bail!("Cannot parse the file type."),
        }
    }
}
impl std::fmt::Display for FileFormat {
    /// Writes the upper-case name of the format.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            FileFormat::GTF => "GTF",
            FileFormat::GFF => "GFF",
            FileFormat::BED => "BED",
            FileFormat::SAM => "SAM",
            FileFormat::BAM => "BAM",
            FileFormat::FASTA => "FASTA",
            FileFormat::FASTQ => "FASTQ",
        };
        f.write_str(label)
    }
}
/// Attribute names that `FileFormat::get_essential` reports for GTF and GFF.
pub(crate) const GXFESSENTIALATTRIBUTES: [&str; 4] =
["gene_id", "gene_name", "transcript_id", "exon_number"];
/// Names of eight record fields, in order.
///
/// NOTE(review): presumably these name the eight fixed (non-attribute)
/// columns of a GTF/GFF record — confirm against the parser that uses them.
pub(crate) const GXFFIELDS: [&str; 8] = [
"seqname",
"source",
"feature_type",
"start",
"end",
"score",
"strand",
"phase",
];
/// Counts the lines of a text file that carry content.
///
/// A line is skipped when, after trimming surrounding whitespace, it is empty
/// or begins with `#`.
///
/// # Errors
/// Returns an error if the file cannot be opened or a line cannot be read.
pub fn _file_line_count<T: AsRef<Path>>(file_path: T) -> anyhow::Result<usize> {
    let handle = File::open(file_path)?;
    let mut count = 0usize;
    for entry in BufReader::new(handle).lines() {
        let text = entry?;
        // Trim once and reuse the result for both checks.
        let content = text.trim();
        if content.is_empty() || content.starts_with('#') {
            continue;
        }
        count += 1;
    }
    Ok(count)
}
/// Returns `true` when both slices hold the same number of elements.
///
/// Only the lengths are compared; the element types may differ.
pub fn equal_length<T, R>(vec1: &[T], vec2: &[R]) -> bool {
    let (left, right) = (vec1.len(), vec2.len());
    left == right
}
/// Returns clones of the elements of `vec1` that do not occur in `vec2`.
///
/// The order of `vec1` is kept, and repeated elements of `vec1` are kept as
/// often as they appear.
pub fn _setdiff<T: Eq + Clone>(vec1: &[T], vec2: &[T]) -> Vec<T> {
    vec1.iter()
        .filter(|item| !vec2.contains(item))
        .cloned()
        .collect()
}
/// Reports whether the data buffered in `reader` starts with the gzip magic number.
///
/// The check peeks at the buffer through `fill_buf` and consumes nothing, so
/// the reader can afterwards be read from its current position. Only the
/// bytes returned by that single `fill_buf` call are examined; fewer than two
/// bytes yield `false`.
///
/// # Errors
/// Propagates any I/O error raised while filling the buffer.
pub fn is_gzipped<T: BufRead>(reader: &mut T) -> std::io::Result<bool> {
    const GZIP_MAGIC_NUMBER: [u8; 2] = [0x1f, 0x8b];
    reader
        .fill_buf()
        .map(|buffered| buffered.starts_with(&GZIP_MAGIC_NUMBER))
}
pub fn get_noodles_reader_from_path<T: AsRef<Path>>(p: T) -> anyhow::Result<FastaReader> {
let file = std::fs::File::open(p.as_ref())?;
let mut inner_rdr = std::io::BufReader::new(file);
if is_gzipped(&mut inner_rdr)? {
trace!("auto-detected gzipped FASTA file - reading via decompression");
Ok(noodles::fasta::Reader::new(Box::new(BufReader::new(
MultiGzDecoder::new(inner_rdr),
))))
} else {
Ok(noodles::fasta::Reader::new(Box::new(inner_rdr)))
}
}
pub fn get_noodles_reader_from_reader(r: impl Read + 'static) -> anyhow::Result<FastaReader> {
let mut inner_rdr = std::io::BufReader::new(r);
if is_gzipped(&mut inner_rdr)? {
trace!("auto-detected gzipped FASTA file - reading via decompression");
Ok(noodles::fasta::Reader::new(Box::new(BufReader::new(
MultiGzDecoder::new(inner_rdr),
))))
} else {
Ok(noodles::fasta::Reader::new(Box::new(inner_rdr)))
}
}
/// Coordinate convention of an interval, used to convert it to a 1-based
/// closed interval `[start, end]`.
///
/// The variant says which endpoints belong to the interval; its `i64` payload
/// is the coordinate base, i.e. the number given to the first position
/// (`0` for 0-based, `1` for 1-based coordinates).
#[derive(Clone, Copy)]
pub enum IntervalType {
    /// Closed interval `[start, end]`.
    Inclusive(i64),
    /// Open interval `(start, end)`.
    Exclusive(i64),
    /// Half-open interval `[start, end)`.
    LeftInclusive(i64),
    /// Half-open interval `(start, end]`.
    RightInclusive(i64),
}

impl Default for IntervalType {
    /// The default is the 1-based closed convention, whose offsets are zero.
    fn default() -> Self {
        IntervalType::Inclusive(1)
    }
}

impl IntervalType {
    /// Returns the coordinate convention of a file format given its name.
    ///
    /// The name is matched case-insensitively:
    /// - `gtf`, `gff`, `gff2`, `gff3`, `sam`: 1-based closed intervals;
    /// - `bam`, `bed`: 0-based half-open intervals `[start, end)`.
    ///
    /// # Panics
    /// Panics if the name is none of the above.
    pub fn from<T: ToString>(file_type: T) -> Self {
        match file_type.to_string().to_lowercase().as_str() {
            "gtf" | "gff" | "gff2" | "gff3" | "sam" => IntervalType::Inclusive(1),
            // BED coordinates are 0-based with an exclusive end, so `[0, 100)`
            // has to become `[1, 100]`: the start moves by one and the end
            // stays. `RightInclusive(0)` would shift both ends one too far.
            "bam" | "bed" => IntervalType::LeftInclusive(0),
            _ => panic!("The file type is not supported"),
        }
    }

    /// Value to add to a start coordinate to obtain the 1-based inclusive start.
    pub fn start_offset(&self) -> i64 {
        match *self {
            // An inclusive start only needs rebasing to 1.
            IntervalType::Inclusive(base) | IntervalType::LeftInclusive(base) => 1 - base,
            // An exclusive start also moves one position to the right.
            IntervalType::RightInclusive(base) | IntervalType::Exclusive(base) => 2 - base,
        }
    }

    /// Value to add to an end coordinate to obtain the 1-based inclusive end.
    pub fn end_offset(&self) -> i64 {
        match *self {
            // An inclusive end only needs rebasing to 1.
            IntervalType::Inclusive(base) | IntervalType::RightInclusive(base) => 1 - base,
            // An exclusive end also moves one position to the left.
            IntervalType::LeftInclusive(base) | IntervalType::Exclusive(base) => -base,
        }
    }
}
/// The eight entries of `GXFFIELDS` followed by the four entries of
/// `GXFESSENTIALATTRIBUTES`, in that order.
pub static FIELDCOLUMNS: [&str; 12] = [
"seqname",
"source",
"feature_type",
"start",
"end",
"score",
"strand",
"phase",
"gene_id",
"gene_name",
"transcript_id",
"exon_number",
];