use crate::grangers_utils::FIELDCOLUMNS;
use anyhow::bail;
use polars::prelude::DataFrame;
use std::collections::HashSet;
use tracing::warn;
/// An interval described by a `start` and an `end` coordinate.
///
/// NOTE(review): the type name indicates that both bounds are inclusive;
/// nothing in this definition enforces `start <= end` — confirm at the use sites.
pub struct InclusiveInterval {
/// Start coordinate of the interval.
pub start: u64,
/// End coordinate of the interval.
pub end: u64,
}
/// Strand of a feature: either the positive or the negative strand.
///
/// The [`std::fmt::Display`] implementation renders the variants as `+` and `-`.
pub enum Strand {
    /// The positive strand, displayed as `+`.
    Positive,
    /// The negative strand, displayed as `-`.
    Negative,
}

impl std::fmt::Display for Strand {
    /// Writes the one-character symbol of the strand (`+` or `-`).
    ///
    /// Width, fill and alignment flags of the formatter are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let symbol = match self {
            Strand::Positive => "+",
            Strand::Negative => "-",
        };
        f.write_str(symbol)
    }
}
/// Switches used when computing flanking regions.
///
/// NOTE(review): the exact effect of each flag is decided by the code that
/// consumes these options, which is not part of this file section — the field
/// descriptions below are inferred from the names and should be confirmed.
#[derive(Copy, Clone)]
pub struct FlankOptions {
    /// Presumably: flank the start side of a feature (`true`) or its end side (`false`).
    pub start: bool,
    /// Presumably: flank both sides of the selected position.
    pub both: bool,
    /// Presumably: treat every feature as if it were on the positive strand.
    pub ignore_strand: bool,
}

impl Default for FlankOptions {
    /// Default options: `start = true`, `both = false`, `ignore_strand = false`.
    fn default() -> Self {
        Self::new(true, false, false)
    }
}

impl FlankOptions {
    /// Builds a `FlankOptions` from the three flags, stored as given.
    pub fn new(start: bool, both: bool, ignore_strand: bool) -> Self {
        Self {
            start,
            both,
            ignore_strand,
        }
    }
}
/// Options controlling how features are merged.
pub struct MergeOptions {
    /// Names of the columns used to group features before merging.
    pub by: Vec<String>,
    /// Merge slack; the default is `1`.
    ///
    /// NOTE(review): presumably the maximum gap between two features that
    /// still get merged — confirm against the merge implementation.
    pub slack: i64,
    /// Whether the strand is left out of the grouping.
    pub ignore_strand: bool,
}

impl Default for MergeOptions {
    /// Default options: group by `seqname` and `strand`, slack of `1`, strand-aware.
    fn default() -> Self {
        let by = ["seqname", "strand"]
            .iter()
            .copied()
            .map(String::from)
            .collect();
        Self {
            by,
            slack: 1,
            ignore_strand: false,
        }
    }
}
impl MergeOptions {
    /// Builds a validated `MergeOptions`.
    ///
    /// The resulting `by` vector is normalized:
    /// - duplicates are removed, keeping the first occurrence;
    /// - `strand` is removed (with a warning) when `ignore_strand` is set, and
    ///   appended when it is not set and the caller did not list it;
    /// - `seqname` is appended (with a warning) when the caller did not list it.
    ///
    /// Columns keep the order in which the caller listed them; columns added
    /// here are appended at the end, so the result is the same on every run.
    ///
    /// A warning is logged when `slack` is smaller than `1`.
    ///
    /// # Errors
    ///
    /// Fails when `by` contains `start` or `end`.
    pub fn new<T: AsRef<str>>(
        by: &[T],
        ignore_strand: bool,
        slack: i64,
    ) -> anyhow::Result<MergeOptions> {
        if slack < 1 {
            warn!("It usually doesn't make sense to set a non-positive slack.")
        }

        let touches_coordinates = by.iter().any(|item| {
            let name: &str = item.as_ref();
            name == "start" || name == "end"
        });
        if touches_coordinates {
            bail!("The provided `by` vector cannot contain the start or end column")
        }

        // `seen` is only used for de-duplication; `columns` carries the order.
        let mut seen: HashSet<&str> = HashSet::with_capacity(by.len() + 2);
        let mut columns: Vec<String> = Vec::with_capacity(by.len() + 2);
        let mut dropped_strand = false;
        for item in by {
            let name: &str = item.as_ref();
            if ignore_strand && name == "strand" {
                dropped_strand = true;
                continue;
            }
            if seen.insert(name) {
                columns.push(name.to_string());
            }
        }

        if dropped_strand {
            warn!("Remove `strand` from the provided `by` vector as the ignored_strand flag is set.")
        }
        if !ignore_strand && seen.insert("strand") {
            columns.push(String::from("strand"));
        }
        if seen.insert("seqname") {
            warn!("Added `seqname` to the `by` vector as it is required.");
            columns.push(String::from("seqname"));
        }

        Ok(MergeOptions {
            by: columns,
            slack,
            ignore_strand,
        })
    }
}
/// Selects which side(s) of a feature an extension applies to.
///
/// NOTE(review): the consumer of this enum is not visible in this file
/// section; the variant descriptions are inferred from the names — confirm.
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum ExtendOption {
/// Presumably: extend at the start only.
Start,
/// Presumably: extend at the end only.
End,
/// Presumably: extend at both the start and the end.
Both,
}
/// Options for sequence extraction. The struct currently has no fields.
pub struct GetSequenceOptions {}
/// Policy selector with two alternatives, `Truncate` and `Skip`.
///
/// NOTE(review): "OOB" presumably stands for "out of bounds", i.e. how to
/// treat features that exceed the boundary of their reference sequence; the
/// consumer is not visible in this file section — confirm.
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum OOBOption {
/// Presumably: clip the offending feature to the valid range.
Truncate,
/// Presumably: drop the offending feature.
Skip,
}
/// Maps each logical field to the name of the dataframe column holding it.
///
/// `seqname`, `start`, `end` and `strand` always carry a column name; the
/// remaining fields are optional and may be `None`.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct FieldColumns {
    /// Column name for the `seqname` field.
    pub seqname: String,
    /// Column name for the `source` field, if any.
    pub source: Option<String>,
    /// Column name for the `feature_type` field, if any.
    pub feature_type: Option<String>,
    /// Column name for the `start` field.
    pub start: String,
    /// Column name for the `end` field.
    pub end: String,
    /// Column name for the `score` field, if any.
    pub score: Option<String>,
    /// Column name for the `strand` field.
    pub strand: String,
    /// Column name for the `phase` field, if any.
    pub phase: Option<String>,
    /// Column name for the `gene_id` field, if any.
    pub gene_id: Option<String>,
    /// Column name for the `gene_name` field, if any.
    pub gene_name: Option<String>,
    /// Column name for the `transcript_id` field, if any.
    pub transcript_id: Option<String>,
    /// Column name for the `exon_number` field, if any.
    pub exon_number: Option<String>,
}

/// Borrowing accessors, one per field.
impl FieldColumns {
    /// Column name of the `seqname` field.
    pub fn seqname(&self) -> &str {
        &self.seqname
    }

    /// Column name of the `source` field, or `None` when unset.
    pub fn source(&self) -> Option<&str> {
        self.source.as_deref()
    }

    /// Column name of the `feature_type` field, or `None` when unset.
    pub fn feature_type(&self) -> Option<&str> {
        self.feature_type.as_deref()
    }

    /// Column name of the `start` field.
    pub fn start(&self) -> &str {
        &self.start
    }

    /// Column name of the `end` field.
    pub fn end(&self) -> &str {
        &self.end
    }

    /// Column name of the `score` field, or `None` when unset.
    pub fn score(&self) -> Option<&str> {
        self.score.as_deref()
    }

    /// Column name of the `strand` field.
    pub fn strand(&self) -> &str {
        &self.strand
    }

    /// Column name of the `phase` field, or `None` when unset.
    pub fn phase(&self) -> Option<&str> {
        self.phase.as_deref()
    }

    /// Column name of the `gene_id` field, or `None` when unset.
    pub fn gene_id(&self) -> Option<&str> {
        self.gene_id.as_deref()
    }

    /// Column name of the `gene_name` field, or `None` when unset.
    pub fn gene_name(&self) -> Option<&str> {
        self.gene_name.as_deref()
    }

    /// Column name of the `transcript_id` field, or `None` when unset.
    pub fn transcript_id(&self) -> Option<&str> {
        self.transcript_id.as_deref()
    }

    /// Column name of the `exon_number` field, or `None` when unset.
    pub fn exon_number(&self) -> Option<&str> {
        self.exon_number.as_deref()
    }
}

impl Default for FieldColumns {
    /// Every field, optional ones included, points at the column that carries
    /// the field's own name.
    fn default() -> Self {
        let required = |name: &str| name.to_owned();
        let optional = |name: &str| Some(name.to_owned());
        Self {
            seqname: required("seqname"),
            source: optional("source"),
            feature_type: optional("feature_type"),
            start: required("start"),
            end: required("end"),
            score: optional("score"),
            strand: required("strand"),
            phase: optional("phase"),
            gene_id: optional("gene_id"),
            gene_name: optional("gene_name"),
            transcript_id: optional("transcript_id"),
            exon_number: optional("exon_number"),
        }
    }
}
impl FieldColumns {
pub fn optional_fields(&self) -> [Option<&str>; 8] {
[
self.source(),
self.feature_type(),
self.score(),
self.phase(),
self.gene_id(),
self.gene_name(),
self.transcript_id(),
self.exon_number(),
]
}
pub fn gtf_fields(&self) -> [&str; 8] {
[
self.seqname(),
self.source().unwrap_or(""),
self.feature_type().unwrap_or(""),
self.start(),
self.end(),
self.score().unwrap_or(""),
self.strand(),
self.phase().unwrap_or(""),
]
}
pub fn gtf_attributes(&self) -> [Option<&str>; 4] {
[
self.gene_id(),
self.gene_name(),
self.transcript_id(),
self.exon_number(),
]
}
pub fn essential_fields(&self) -> [&str; 4] {
[self.seqname(), self.start(), self.end(), self.strand()]
}
pub fn is_valid(&self, df: &DataFrame, is_warn: bool, is_bail: bool) -> anyhow::Result<bool> {
let mut is_valid = true;
if df.column(self.seqname()).is_err() {
is_valid = false;
if is_warn {
warn!(
"The dataframe does not contain the specified seqname column {}; Cannot proceed. You can add one by calling `df.update_column(\"seqname\", vec!['.'; df.height()])`",
self.seqname()
)
}
}
if df.column(self.start()).is_err() {
is_valid = false;
if is_warn {
warn!(
"The dataframe does not contain the specified start column {}; Cannot proceed. You can add one by calling `df.update_column(\"start\", vec!['.'; df.height()])`",
self.start()
)
}
}
if df.column(self.end()).is_err() {
is_valid = false;
if is_warn {
warn!(
"The dataframe does not contain the specified end column {}; Cannot proceed. You can add one by calling `df.update_column(\"end\", vec!['.'; df.height()])`",
self.end()
)
}
}
if df.column(self.strand()).is_err() {
is_valid = false;
if is_warn {
warn!(
"The dataframe does not contain the specified strand column {}; Cannot proceed. You can add one by calling `df.update_column(\"strand\", vec!['.'; df.height()])`",
self.strand()
)
}
}
if let Some(s) = self.source() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided source column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if let Some(s) = self.feature_type() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided feature_type column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if let Some(s) = self.score() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided score column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if let Some(s) = self.phase() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided phase column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if let Some(s) = self.gene_id() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided gene_id column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if let Some(s) = self.gene_name() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided gene_name column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if let Some(s) = self.transcript_id() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided transcript_id column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if let Some(s) = self.exon_number() {
if df.column(s).is_err() {
is_valid = false;
if is_warn {
warn!("The provided exon_number column {} is not found in the dataframe; It will be ignored", s)
}
}
}
if !is_valid & is_bail {
bail!(
"The FieldColumns is not valid; Please try fix it by calling FieldColumns::fix()."
)
}
if !is_valid & is_warn {
warn!(
"The FieldColumns is not valid; Please try fix it by calling FieldColumns::fix()."
)
}
Ok(is_valid)
}
pub fn fix(&mut self, df: &DataFrame, is_warn: bool) -> anyhow::Result<()> {
if df.column(self.seqname()).is_err() {
if is_warn {
warn!(
"cannot find the specified seqname column {} in the dataframe; try to fix",
self.seqname()
);
}
if df.column("seqname").is_ok() {
self.seqname = "seqname".to_string();
} else {
bail!("The dataframe does not contain the specified seqname column {} or a column named \"seqname\"; Cannot fix.", self.seqname());
}
}
if df.column(self.start()).is_err() {
if is_warn {
warn!(
"cannot find the specified start column {} in the dataframe; try to fix",
self.start()
);
}
if df.column("start").is_ok() {
self.start = "start".to_string();
} else {
bail!("The dataframe does not contain the specified start column {} or a column named \"start\"; Cannot fix.", self.start());
}
}
if df.column(self.end()).is_err() {
if is_warn {
warn!(
"cannot find the specified end column {} in the dataframe; try to fix",
self.end()
);
}
if df.column("end").is_ok() {
self.end = "end".to_string();
} else {
bail!("The dataframe does not contain the specified end column {} or a column named \"end\"; Cannot fix.", self.end());
}
}
if df.column(self.strand()).is_err() {
if is_warn {
warn!(
"cannot find the specified strand column {} in the dataframe; try to fix",
self.strand()
);
}
if df.column("strand").is_ok() {
self.strand = "strand".to_string();
} else {
bail!("The dataframe does not contain the specified strand column {} or a column named \"strand\"; Cannot fix. If this is desired, you can add a dummy strand column by calling `df.update_column(\"strand\", vec!['.'; df.height()])`", self.strand());
}
}
if let Some(s) = self.source() {
if df.column(s).is_err() {
if is_warn {
warn!(
"cannot find the specified source column {} in the dataframe; try to fix",
s
);
}
self.source = if df.column("source").is_ok() {
Some("source".to_string())
} else {
None
}
}
}
if let Some(s) = self.feature_type() {
if df.column(s).is_err() {
if is_warn {
warn!("cannot find the specified feature_type column {} in the dataframe; try to fix", s);
}
self.feature_type = if df.column("feature_type").is_ok() {
Some("feature_type".to_string())
} else {
None
}
}
}
if let Some(s) = self.score() {
if df.column(s).is_err() {
if is_warn {
warn!(
"cannot find the specified score column {} in the dataframe; try to fix",
s
);
}
self.score = if df.column("score").is_ok() {
Some("score".to_string())
} else {
None
}
}
}
if let Some(s) = self.phase() {
if df.column(s).is_err() {
if is_warn {
warn!(
"cannot find the specified phase column {} in the dataframe; try to fix",
s
);
}
self.phase = if df.column("phase").is_ok() {
Some("phase".to_string())
} else {
None
}
}
}
if let Some(s) = self.gene_id() {
if df.column(s).is_err() {
if is_warn {
warn!(
"cannot find the specified gene_id column {} in the dataframe; try to fix",
s
);
}
self.gene_id = if df.column("gene_id").is_ok() {
Some("gene_id".to_string())
} else {
None
}
}
}
if let Some(s) = self.gene_name() {
if df.column(s).is_err() {
if is_warn {
warn!(
"cannot find the specified gene_name column {} in the dataframe; try to fix",
s
);
}
self.gene_name = if df.column("gene_name").is_ok() {
Some("gene_name".to_string())
} else {
None
}
}
}
if let Some(s) = self.transcript_id() {
if df.column(s).is_err() {
if is_warn {
warn!("cannot find the specified transcript_id column {} in the dataframe; try to fix", s);
}
self.transcript_id = if df.column("transcript_id").is_ok() {
Some("transcript_id".to_string())
} else {
None
}
}
}
if let Some(s) = self.exon_number() {
if df.column(s).is_err() {
if is_warn {
warn!("cannot find the specified exon_number column {} in the dataframe; try to fix", s);
}
self.exon_number = if df.column("exon_number").is_ok() {
Some("exon_number".to_string())
} else {
None
}
}
}
Ok(())
}
pub fn update<T: AsRef<str>>(&mut self, field: T, value: T) -> anyhow::Result<()> {
let value = value.as_ref().to_string();
match field.as_ref() {
"seqname" => self.seqname = value,
"source" => self.source = Some(value),
"feature_type" => self.feature_type = Some(value),
"start" => self.start = value,
"end" => self.end = value,
"score" => self.score = Some(value),
"strand" => self.strand = value,
"phase" => self.phase = Some(value),
"gene_id" => self.gene_id = Some(value),
"gene_name" => self.gene_name = Some(value),
"transcript_id" => self.transcript_id = Some(value),
"exon_number" => self.exon_number = Some(value),
_ => bail!("invalid field name: {}", field.as_ref()),
}
Ok(())
}
pub fn field<T: AsRef<str>>(&self, field: T) -> Option<&str> {
match field.as_ref() {
"seqname" => Some(self.seqname.as_str()),
"source" => self.source(),
"feature_type" => self.feature_type(),
"start" => Some(self.start.as_str()),
"end" => Some(self.end.as_str()),
"score" => self.score(),
"strand" => Some(self.strand.as_str()),
"phase" => self.phase(),
"gene_id" => self.gene_id(),
"gene_name" => self.gene_name(),
"transcript_id" => self.transcript_id(),
"exon_number" => self.exon_number(),
_ => None,
}
}
pub fn field_checked<T: AsRef<str>>(
&self,
field: T,
is_bail: bool,
) -> anyhow::Result<Option<&str>> {
match field.as_ref() {
"seqname" => Ok(Some(self.seqname.as_str())),
"source" => Ok(self.source()),
"feature_type" => Ok(self.feature_type()),
"start" => Ok(Some(self.start.as_str())),
"end" => Ok(Some(self.end.as_str())),
"score" => Ok(self.score()),
"strand" => Ok(Some(self.strand.as_str())),
"phase" => Ok(self.phase()),
"gene_id" => Ok(self.gene_id()),
"gene_name" => Ok(self.gene_name()),
"transcript_id" => Ok(self.transcript_id()),
"exon_number" => Ok(self.exon_number()),
_ => {
if is_bail {
bail!(
"The provided field {} is not a valid field name; Cannot proceed",
field.as_ref()
);
}
warn!(
"The provided field {} is not a valid field name; It will be ignored",
field.as_ref()
);
Ok(None)
}
}
}
pub fn all_fields() -> [&'static str; 12] {
FIELDCOLUMNS
}
}