use crate::grangers_utils;
use crate::grangers_utils::*;
use crate::options::*;
use crate::reader;
use crate::reader::fasta::SeqInfo;
use anyhow::{bail, Context};
use lazy_static::lazy_static;
pub(crate) use noodles::fasta::record::{Definition, Sequence};
use nutype::nutype;
use polars::{frame::DataFrame, lazy::prelude::*, prelude::*, series::Series};
use rust_lapper::{Interval, Lapper};
use std::collections::{HashMap, HashSet};
use std::convert::AsRef;
use std::fs;
use std::io::{BufWriter, Read, Write};
use std::iter::IntoIterator;
use std::ops::FnMut;
use std::ops::{Add, Mul, Sub};
use std::path::Path;
use std::result::Result::Ok;
use std::sync::atomic::{AtomicU32, Ordering};
use tracing::debug;
use tracing::{info, warn};
// Monotonically increasing counter used to hand each `Grangers` instance a
// unique id (see `Grangers::new`, which shifts this id into the high 32 bits
// of the instance signature). `AtomicU32::new` is a `const fn`, so a plain
// `static` suffices — the previous `lazy_static!` wrapper added a needless
// layer of lazy initialization and deref indirection.
static GRANGERS_COUNTER: AtomicU32 = AtomicU32::new(0);
/// Interval-tree type used for overlap queries; each interval carries a
/// `(row index, grouping values)` payload. Backed by the `rust_lapper` crate.
type LapperType = Lapper<u64, (usize, Vec<String>)>;

/// Opaque identifier for a record inside a [`GrangersSequenceCollection`].
/// The `nutype` macro generates the constructor/accessor and the listed derives.
#[nutype(derive(Debug, Clone, AsRef))]
pub struct GrangersRecordID(u32);

/// A collection of extracted FASTA records together with the signature of the
/// `Grangers` object they were derived from (presumably so consumers can
/// detect when the originating object has since changed — TODO confirm).
pub struct GrangersSequenceCollection {
    // (record id, FASTA record) pairs, in insertion order.
    pub records: Vec<(GrangersRecordID, noodles::fasta::Record)>,
    // Signature copied from the originating `Grangers` object.
    pub signature: u64,
}
impl GrangersSequenceCollection {
    /// Build an empty collection tagged with `signature`, with room
    /// pre-allocated for `capacity` records.
    pub fn new_with_signature_and_capacity(signature: u64, capacity: usize) -> Self {
        Self {
            records: Vec::with_capacity(capacity),
            signature,
        }
    }

    /// Append one (id, record) pair to the collection.
    pub fn add_record(&mut self, rec_id: GrangersRecordID, rec: noodles::fasta::Record) {
        self.records.push((rec_id, rec));
    }

    /// Iterate immutably over the stored (id, record) pairs.
    pub fn records_iter(&self) -> std::slice::Iter<'_, (GrangersRecordID, noodles::fasta::Record)> {
        self.records.iter()
    }

    /// Iterate mutably over the stored (id, record) pairs.
    pub fn records_iter_mut(
        &mut self,
    ) -> std::slice::IterMut<'_, (GrangersRecordID, noodles::fasta::Record)> {
        self.records.iter_mut()
    }

    /// The signature this collection was created with.
    pub fn get_signature(&self) -> u64 {
        self.signature
    }
}
/// A dataframe-backed container of genomic features, plus the metadata needed
/// to interpret its columns (in the spirit of R's GenomicRanges — TODO confirm
/// intended parity).
#[derive(Clone)]
pub struct Grangers {
    // The underlying polars dataframe; one row per genomic feature.
    pub df: DataFrame,
    // Free-form auxiliary metadata keyed by name (carried from the parsed file).
    pub misc: Option<HashMap<String, Vec<String>>>,
    // Optional per-sequence information, parsed from a FASTA file
    // (see `add_seqinfo`).
    pub seqinfo: Option<SeqInfo>,
    // Coordinate convention of the intervals handed to the constructor.
    pub interval_type: IntervalType,
    // Maps logical field names (seqname, start, ...) to actual column names.
    pub field_columns: FieldColumns,
    // Instance signature: high 32 bits are a per-instance id, low bits are
    // incremented on mutation (see `inc_signature`).
    pub signature: u64,
}
impl Grangers {
    /// Bump the mutation component of this instance's signature.
    ///
    /// `Grangers::new` seeds the signature with a per-instance id in the high
    /// 32 bits; mutating methods call this so the signature changes whenever
    /// the dataframe does (appears intended for staleness detection by
    /// consumers such as `GrangersSequenceCollection` — TODO confirm).
    /// `wrapping_add` avoids a debug-build overflow panic; release-mode
    /// behavior (two's-complement wrap) is unchanged.
    #[inline(always)]
    fn inc_signature(&mut self) {
        self.signature = self.signature.wrapping_add(1);
    }
}
impl Grangers {
    /// Check the given `fields` for null values.
    ///
    /// Each entry of `fields` may be a logical field name or an actual column
    /// name; it is resolved via `get_column_name`. Returns `Ok(true)` if any
    /// resolved column contains nulls. With `is_warn`, a warning is emitted
    /// per offending column; with `is_bail`, an error is returned instead of
    /// `Ok(true)`.
    pub fn any_nulls<T: AsRef<str>>(
        &self,
        fields: &[T],
        is_warn: bool,
        is_bail: bool,
    ) -> anyhow::Result<bool> {
        // Validate the field mapping first (warns and bails on invalid mapping).
        self.field_columns().is_valid(self.df(), true, true)?;
        let mut any_nulls = false;
        let df = self.df();
        for col in fields {
            if df
                .column(&self.get_column_name(col.as_ref(), false)?)?
                .null_count()
                > 0
            {
                any_nulls = true;
                if is_warn {
                    warn!("The dataframe contains null values in the given fields -- {:?}. This will cause problems for most Grangers functions.", col.as_ref());
                }
            }
        }
        // NOTE: `&` on bools is a non-short-circuiting `&&`; equivalent here.
        if (any_nulls) & is_bail {
            let fields_str = fields.iter().map(|s| s.as_ref()).collect::<Vec<_>>();
            bail!("The dataframe contains null values in the given fields -- {:?}. You can drop null values by calling `df.drop_nulls(Some(&{:?}))`", fields_str,fields_str);
        }
        if (any_nulls) & is_warn {
            let fields_str = fields.iter().map(|s| s.as_ref()).collect::<Vec<_>>();
            warn!(
                "You can drop null values by calling `df.drop_nulls(Some(&{:?}))`",
                fields_str
            )
        }
        Ok(any_nulls)
    }

    /// Construct a `Grangers` from a dataframe plus metadata.
    ///
    /// The start/end columns are shifted by the interval type's offsets
    /// (presumably normalizing to the internal coordinate convention — see
    /// `from_gtf`/`from_gff`, which pass `IntervalType::Inclusive(1)`).
    /// Bails when the essential fields contain nulls.
    pub fn new(
        mut df: DataFrame,
        seqinfo: Option<SeqInfo>,
        misc: Option<HashMap<String, Vec<String>>>,
        interval_type: IntervalType,
        mut field_columns: FieldColumns,
        verbose: bool,
    ) -> anyhow::Result<Grangers> {
        // Repair the field mapping if it does not match the dataframe.
        if !field_columns.is_valid(&df, verbose, false)? {
            field_columns.fix(&df, false)?;
        }
        // Remove the interval type's coordinate offsets, if any.
        if interval_type.start_offset() != 0 {
            df.with_column(
                df.column(field_columns.start()).unwrap() - interval_type.start_offset(),
            )?;
        }
        if interval_type.end_offset() != 0 {
            df.with_column(df.column(field_columns.end()).unwrap() - interval_type.end_offset())?;
        }
        // Each instance gets a unique id placed in the signature's high 32 bits.
        let gid = GRANGERS_COUNTER.fetch_add(1, Ordering::SeqCst) as u64;
        let gr = Grangers {
            df,
            misc,
            seqinfo,
            interval_type,
            field_columns,
            signature: (gid << 32),
        };
        gr.any_nulls(&gr.field_columns().essential_fields(), verbose, true)?;
        Ok(gr)
    }

    /// Build a `Grangers` from a parsed GTF/GFF `GStruct`.
    ///
    /// The eight fixed GXF columns come first; attribute vectors become extra
    /// columns (empty extra attributes become all-null String columns).
    pub fn from_gstruct(
        gstruct: reader::GStruct,
        interval_type: IntervalType,
    ) -> anyhow::Result<Grangers> {
        let mut df_vec = vec![
            Column::new("seqname".into(), gstruct.seqid),
            Column::new("source".into(), gstruct.source),
            Column::new("feature_type".into(), gstruct.feature_type),
            Column::new("start".into(), gstruct.start),
            Column::new("end".into(), gstruct.end),
            Column::new("score".into(), gstruct.score),
            Column::new("strand".into(), gstruct.strand),
            Column::new("phase".into(), gstruct.phase),
        ];
        // Pre-size for the attribute columns appended below.
        let el = if let Some(ref extra) = gstruct.attributes.extra {
            extra.len()
        } else {
            0_usize
        };
        df_vec.reserve(df_vec.len() + gstruct.attributes.essential.len() + el);
        // Essential attributes become columns only when non-empty.
        for (k, v) in gstruct.attributes.essential {
            if !v.is_empty() {
                df_vec.push(Column::new(k.into(), v));
            };
        }
        // Extra attributes: empty vectors become all-null columns of length
        // `tally` so the dataframe stays rectangular.
        if let Some(attributes) = gstruct.attributes.extra {
            for (k, v) in attributes {
                let s = if v.is_empty() {
                    Column::full_null(k.into(), gstruct.attributes.tally, &DataType::String)
                } else {
                    Column::new(k.into(), v)
                };
                df_vec.push(s);
            }
        }
        let df = DataFrame::new(df_vec)?;
        Grangers::new(
            df,
            None,
            gstruct.misc,
            interval_type,
            FieldColumns::default(),
            true,
        )
    }

    /// Parse a GTF file into a `Grangers` (1-based inclusive coordinates).
    /// With `only_essential`, only the essential attributes are retained.
    pub fn from_gtf<P: AsRef<std::path::Path>>(
        file_path: P,
        only_essential: bool,
    ) -> anyhow::Result<Grangers> {
        let am = reader::AttributeMode::from(!only_essential);
        let gstruct = reader::GStruct::from_gtf(file_path.as_ref(), am)?;
        Grangers::from_gstruct(gstruct, IntervalType::Inclusive(1))
    }

    /// Parse a GFF file into a `Grangers` (1-based inclusive coordinates).
    /// With `only_essential`, only the essential attributes are retained.
    pub fn from_gff<P: AsRef<std::path::Path>>(
        file_path: P,
        only_essential: bool,
    ) -> anyhow::Result<Grangers> {
        let am = reader::AttributeMode::from(!only_essential);
        let gstruct = reader::GStruct::from_gff(file_path, am)?;
        Grangers::from_gstruct(gstruct, IntervalType::Inclusive(1))
    }

    /// Attach per-sequence information parsed from the given FASTA file.
    pub fn add_seqinfo<T: AsRef<Path>>(&mut self, genome_file: T) -> anyhow::Result<()> {
        self.seqinfo = Some(SeqInfo::from_fasta(genome_file)?);
        Ok(())
    }

    /// Render the dataframe in GTF layout: the eight fixed GXF columns plus a
    /// single `attributes` column assembled as `key "value";` pairs from every
    /// non-field column. Missing field columns are filled with ".".
    pub fn get_gtf_df(&self) -> anyhow::Result<DataFrame> {
        let df = self.df();
        let mut fc = self.field_columns.clone();
        // Start from all column names; field columns are removed below so that
        // only attribute columns remain in this set.
        // NOTE(review): HashSet iteration order is unspecified, so attribute
        // order in the output is nondeterministic — confirm this is acceptable.
        let mut attr_cols: HashSet<&str> = df
            .get_column_names()
            .into_iter()
            .map(|s| s.as_str())
            .collect();
        let mut existing_field_cols = Vec::new();
        let mut missing_field_cols = Vec::new();
        for field in GXFFIELDS {
            if let Ok(col) = self.get_column_name(field, false) {
                fc.update(field, col.as_str())?;
                attr_cols.remove(col.as_str());
                existing_field_cols.push(col);
            } else {
                fc.update(field, field)?;
                missing_field_cols.push(field);
            }
        }
        // Present field columns pass through; absent ones become ".".
        let mut expr_vec = existing_field_cols
            .iter()
            .map(|col_name| col(col_name.as_str()))
            .collect::<Vec<_>>();
        expr_vec.extend(
            missing_field_cols
                .iter()
                .map(|&col_name| lit(".").alias(col_name)),
        );
        // Each attribute column is rewritten as `name "value";` (empty string
        // where the value is null) so they can later be concatenated.
        expr_vec.extend(attr_cols.iter().map(|&col_name| {
            (when(col(col_name).is_not_null())
                .then(
                    lit(col_name) + lit(" \"") + col(col_name).cast(DataType::String) + lit("\";"),
                )
                .otherwise(lit("")))
            .alias(col_name)
        }));
        let out_df = self
            .df()
            .clone()
            .lazy()
            .select(expr_vec)
            .select([
                col(fc.field("seqname").expect(
                    "Could not get the seqname field. Please report this issue via GitHub.",
                )),
                col(fc.field("source").expect(
                    "Could not get the source field. Please report this issue via GitHub.",
                )),
                col(fc.field("feature_type").expect(
                    "Could not get the feature_type field. Please report this issue via GitHub.",
                )),
                col(fc
                    .field("start")
                    .expect("Could not get the start field. Please report this issue via GitHub.")),
                col(fc
                    .field("end")
                    .expect("Could not get the end field. Please report this issue via GitHub.")),
                col(fc
                    .field("score")
                    .expect("Could not get the score field. Please report this issue via GitHub.")),
                col(fc.field("strand").expect(
                    "Could not get the strand field. Please report this issue via GitHub.",
                )),
                col(fc
                    .field("phase")
                    .expect("Could not get the phase field. Please report this issue via GitHub.")),
                // Fuse all per-attribute `key "value";` fragments into one column.
                concat_str(
                    attr_cols.iter().map(|&c| col(c)).collect::<Vec<_>>(),
                    "",
                    false,
                )
                .alias("attributes"),
            ])
            .fill_nan(lit("."))
            .fill_null(lit("."))
            .collect()?;
        Ok(out_df)
    }

    /// Write the dataframe to `file_path` in GTF format (tab-separated, no
    /// header, nulls rendered as ".").
    pub fn write_gtf<T: AsRef<Path>>(&self, file_path: T) -> anyhow::Result<()> {
        let file_path = file_path.as_ref();
        // NOTE(review): for a bare file name, `parent()` yields `Some("")` and
        // `create_dir_all("")` errors — confirm callers always pass a path
        // with a directory component.
        fs::create_dir_all(file_path.parent().with_context(|| {
            format!(
                "Could not get the parent directory of the given output file path {:?}",
                file_path.as_os_str()
            )
        })?)?;
        let mut out_df = self.get_gtf_df()?;
        let file = std::fs::File::create(file_path)?;
        // 4 MiB output buffer to keep syscall overhead low on large files.
        let mut file = BufWriter::with_capacity(4194304, file);
        CsvWriter::new(&mut file)
            .include_header(false)
            .with_separator(b'\t')
            .with_null_value(".".to_string())
            .finish(&mut out_df)?;
        Ok(())
    }
}
impl Grangers {
    /// The logical-field → column-name mapping.
    pub fn field_columns(&self) -> &FieldColumns {
        &self.field_columns
    }

    /// Mutable access to the logical-field → column-name mapping.
    ///
    /// Bug fix: this was declared as returning `&FieldColumns`; the
    /// `&mut self.field_columns` body was silently coerced to a shared
    /// reference, so the `_mut` accessor could never actually mutate.
    /// Returning `&mut` is backward-compatible (the `&mut self` borrow is
    /// unchanged and `&mut T` coerces to `&T` at call sites).
    pub fn field_columns_mut(&mut self) -> &mut FieldColumns {
        &mut self.field_columns
    }

    /// Like [`Grangers::field_columns`], but first validates the mapping
    /// against the dataframe, warning/bailing per the flags.
    pub fn field_columns_checked(
        &self,
        is_warn: bool,
        is_bail: bool,
    ) -> anyhow::Result<&FieldColumns> {
        self.field_columns().is_valid(self.df(), is_warn, is_bail)?;
        Ok(self.field_columns())
    }

    /// The underlying dataframe.
    pub fn df(&self) -> &DataFrame {
        &self.df
    }

    /// Mutable access to the underlying dataframe. Note: direct mutation
    /// bypasses `inc_signature`.
    pub fn df_mut(&mut self) -> &mut DataFrame {
        &mut self.df
    }

    /// The coordinate convention this instance was constructed with.
    pub fn interval_type(&self) -> &IntervalType {
        &self.interval_type
    }

    /// Per-sequence metadata, if any was attached.
    pub fn seqinfo(&self) -> Option<&SeqInfo> {
        self.seqinfo.as_ref()
    }

    /// Mutable access to the per-sequence metadata, if any.
    pub fn seqinfo_mut(&mut self) -> Option<&mut SeqInfo> {
        self.seqinfo.as_mut()
    }

    /// Sort the dataframe in place by the given columns.
    ///
    /// `descending` supplies one flag per column in `by`. The unused type
    /// parameter `T` is retained for backward compatibility with callers
    /// that spell it out explicitly.
    pub fn sort_by<T>(
        &mut self,
        by: &[&str],
        descending: impl IntoIterator<Item = bool>,
        maintain_order: bool,
        multithreaded: bool,
    ) -> anyhow::Result<()> {
        self.df = self.df.sort(
            by.iter()
                .map(|s| s.to_owned().into())
                .collect::<Vec<PlSmallStr>>(),
            SortMultipleOptions::default()
                .with_order_descending_multi(descending)
                .with_maintain_order(maintain_order)
                .with_multithreaded(multithreaded),
        )?;
        // The dataframe changed, so the signature must change too.
        self.inc_signature();
        Ok(())
    }

    /// Return a new `Grangers` keeping only rows whose `by` column value is
    /// one of `values`. Optionally warns when the result is empty.
    pub fn filter<T: AsRef<str>>(
        &self,
        by: T,
        values: &[T],
        warn_empty: bool,
    ) -> anyhow::Result<Grangers> {
        let column = self.get_column_name(by.as_ref(), false)?;
        let s = self.df().column(&column)?.as_materialized_series();
        let df = self.df().filter(&is_in(
            s,
            &Series::new(
                PlSmallStr::from_str("values"),
                values.iter().map(|s| s.as_ref()).collect::<Vec<&str>>(),
            ),
        )?)?;
        if df.is_empty() && warn_empty {
            warn!("The filtered dataframe is empty.")
        }
        Grangers::new(
            df,
            self.seqinfo().cloned(),
            self.misc.clone(),
            IntervalType::default(),
            self.field_columns().clone(),
            true,
        )
    }

    /// The current signature of this instance.
    pub fn get_signature(&self) -> u64 {
        self.signature
    }

    /// Overwrite the signature wholesale (internal use).
    fn set_signature(&mut self, other_sig: u64) {
        self.signature = other_sig
    }

    /// Insert or replace a column in the dataframe, optionally registering it
    /// as the backing column of the logical field `field_column`.
    ///
    /// Warns when the provided column contains nulls; bumps the signature on
    /// success.
    pub fn update_column(
        &mut self,
        column: Column,
        field_column: Option<&str>,
    ) -> anyhow::Result<()> {
        if column.null_count() > 0 {
            warn!("The provided Series object contains {} null values. This might cause problems when calling Grangers methods.", column.null_count());
        }
        if let Some(field_column) = field_column {
            self.field_columns
                .update(PlSmallStr::from(field_column), column.name().clone())?;
        }
        let name = column.name().to_owned();
        self.df.with_column(column).with_context(|| {
            format!(
                "Could not update Grangers with the provided Series object: {:?}",
                name
            )
        })?;
        self.inc_signature();
        Ok(())
    }

    /// Replace the dataframe with `df`, which must have the same shape and
    /// the same set of column names as the current one; then re-validate and
    /// bump the signature.
    pub fn update_df(&mut self, df: DataFrame, is_warn: bool, is_bail: bool) -> anyhow::Result<()> {
        if df.shape() != self.df.shape() {
            bail!("The provided dataframe has a different layout as the current one. Please use Grangers::new() to instantiate a new Grangers struct.")
        }
        let self_columns = self.df.get_column_names();
        let new_columns = df.get_column_names();
        if !self_columns.iter().all(|item| new_columns.contains(item)) {
            bail!("The provided dataframe have different column names as the current one. Please use Grangers::new() to instantiate a new Grangers struct.")
        }
        self.df = df;
        self.validate(is_warn, is_bail)?;
        self.inc_signature();
        Ok(())
    }
}
impl Grangers {
pub fn get_column_name_str<T: AsRef<str>>(
&self,
name: T,
bail_null: bool,
) -> anyhow::Result<&str> {
let name = name.as_ref();
let fc = self.field_columns_checked(false, true)?;
let name = if let Some(col) = fc.field(name) {
col
} else if self
.df()
.get_column_names()
.contains(&&PlSmallStr::from(name))
{
self.df().column(name)?.name()
} else {
bail!("{} is neither a column in the dataframe nor a field of FieldColumns. Cannot proceed", name)
};
if bail_null && self.df().column(name)?.null_count() > 0 {
bail!("The column {} contains null values. Cannot proceed.", name)
}
Ok(name)
}
pub fn get_column_name<T: AsRef<str>>(
&self,
name: T,
bail_null: bool,
) -> anyhow::Result<String> {
let name = name.as_ref();
let fc = self.field_columns_checked(false, true)?;
let name = if let Some(col) = fc.field(name) {
col
} else if self
.df()
.get_column_names()
.contains(&&PlSmallStr::from(name))
{
self.df().column(name)?.name()
} else {
bail!("{} is neither a column in the dataframe nor a field of FieldColumns. Cannot proceed", name)
};
if bail_null && self.df().column(name)?.null_count() > 0 {
bail!("The column {} contains null values. Cannot proceed.", name)
}
Ok(name.to_string())
}
pub fn column<T: AsRef<str>>(&self, name: T) -> anyhow::Result<&Column> {
let direct = self.df().column(name.as_ref());
let col = if direct.is_ok() {
direct?
} else {
let col = self.get_column_name_str(name.as_ref(), false)?;
self.df().column(col)?
};
Ok(col)
}
pub fn columns<T: AsRef<str>>(&self, names: &[T]) -> anyhow::Result<Vec<&Column>> {
let mut cols = Vec::new();
for name in names {
cols.push(self.column(name)?);
}
Ok(cols)
}
pub fn seqname(&self) -> anyhow::Result<&Column> {
self.column(self.field_columns.seqname())
}
pub fn start(&self) -> anyhow::Result<&Column> {
self.column(self.field_columns.start())
}
pub fn end(&self) -> anyhow::Result<&Column> {
self.column(self.field_columns.end())
}
pub fn strand(&self) -> anyhow::Result<&Column> {
self.column(self.field_columns.strand())
}
pub fn score(&self) -> anyhow::Result<&Column> {
self.column(
self.field_columns
.score()
.with_context(|| "Could not get the score column from the dataframe.")?,
)
}
pub fn phase(&self) -> anyhow::Result<&Column> {
self.column(
self.field_columns
.phase()
.with_context(|| "Could not get the score column from the dataframe.")?,
)
}
pub fn feature_type(&self) -> anyhow::Result<&Column> {
self.column(
self.field_columns
.feature_type()
.with_context(|| "Could not get the score column from the dataframe.")?,
)
}
pub fn range(&self) -> anyhow::Result<DataFrame> {
let range = self.df.select([
self.field_columns().start(),
self.field_columns().end(),
self.field_columns().strand(),
])?;
Ok(range)
}
pub fn is_column<T: AsRef<str>>(&self, name: T) -> bool {
self.column(name).is_ok()
}
}
impl Grangers {
    /// Validate the struct: non-empty dataframe, consistent `field_columns`,
    /// and no nulls in the essential fields. Returns `Ok(false)` (or bails,
    /// per `is_bail`) when a check fails; warning verbosity follows `is_warn`.
    pub fn validate(&self, is_warn: bool, is_bail: bool) -> anyhow::Result<bool> {
        if self.df().height() == 0 {
            if is_bail {
                bail!("The dataframe is empty. Cannot proceed.")
            } else {
                if is_warn {
                    warn!("The dataframe is empty.")
                }
                return Ok(false);
            }
        }
        // NOTE(review): `is_valid` is invoked with its bail flag set, so an
        // invalid mapping should error out here, making the `!valid_fc`
        // branch below look unreachable — confirm `is_valid`'s semantics.
        let valid_fc = self.field_columns.is_valid(self.df(), false, true)?;
        if is_warn & !valid_fc {
            warn!("The field_columns is not valid. You can use Grangers::fix_field_columns() to fix it.");
            return Ok(false);
        }
        let essential_nulls =
            self.any_nulls(&self.field_columns().essential_fields(), false, is_bail)?;
        if is_warn & essential_nulls {
            warn!("The dataframe contains null values in the essential fields - seqname, start, end and strand. You can use Grangers::drop_nulls() to drop them.");
            return Ok(false);
        }
        // Optionally surface nulls anywhere in the dataframe (never fatal).
        if is_warn {
            self.any_nulls(&self.df().get_column_names(), is_warn, false)?;
        }
        Ok(true)
    }

    /// Rebuild `field_columns` so that it matches the current dataframe.
    pub fn fix_field_columns(&mut self, is_warn: bool) -> anyhow::Result<()> {
        let mut field_columns = self.field_columns().clone();
        field_columns.fix(self.df(), is_warn)?;
        self.field_columns = field_columns;
        Ok(())
    }
}
impl Grangers {
    /// Compute introns: the gaps between the merged exons of each `by` group
    /// (default: per transcript_id). gene_id/gene_name (when present) and
    /// `keep_columns` are carried through to the result.
    pub fn introns(
        &self,
        by: Option<&str>,
        exon_feature: Option<&str>,
        keep_columns: Option<&[&str]>,
        multithreaded: bool,
    ) -> anyhow::Result<Grangers> {
        let exon_gr = self.exons(exon_feature, multithreaded)?;
        let gene_id = self.field_columns().gene_id();
        let gene_name = self.field_columns().gene_name();
        let mut kc = Vec::new();
        if let Some(gene_id) = gene_id {
            kc.push(gene_id);
        }
        if let Some(gene_name) = gene_name {
            kc.push(gene_name);
        }
        if let Some(keep_columns) = keep_columns {
            kc.extend_from_slice(keep_columns);
        }
        // Default grouping is by transcript_id.
        let by_str = if let Some(by) = by {
            exon_gr.get_column_name_str(by, true)?
        } else {
            exon_gr.get_column_name_str("transcript_id", true)?
        };
        exon_gr.gaps(&[by_str], false, None, Some(&kc), multithreaded)
    }

    /// Per-gene boundary ranges: min start to max end over each gene's exons.
    pub fn genes(
        &self,
        exon_feature: Option<&str>,
        multithreaded: bool,
    ) -> anyhow::Result<Grangers> {
        self.validate(false, true)?;
        let gene_id = self.get_column_name_str("gene_id", true)?;
        self.boundary(gene_id, exon_feature, multithreaded)
    }

    /// Per-transcript boundary ranges: min start to max end over each
    /// transcript's exons.
    pub fn transcripts(
        &self,
        exon_feature: Option<&str>,
        multithreaded: bool,
    ) -> anyhow::Result<Grangers> {
        let transcript_id = self.get_column_name_str("transcript_id", true)?;
        self.boundary(transcript_id, exon_feature, multithreaded)
    }

    /// For each `by` group, collapse its exons to a single
    /// [min start, max end] range. Bails if any group spans more than one
    /// seqname or strand.
    pub fn boundary(
        &self,
        by: &str,
        exon_feature: Option<&str>,
        multithreaded: bool,
    ) -> anyhow::Result<Grangers> {
        self.validate(false, true)?;
        let mut exon_gr = self.exons(exon_feature, multithreaded)?;
        let fc = self.field_columns();
        let seqname = fc.seqname();
        let start = fc.start();
        let end = fc.end();
        let strand = fc.strand();
        let by = self.get_column_name_str(by, true)?;
        // A group is invalid when it touches more than one seqname or strand.
        let any_invalid = exon_gr
            .df()
            .select([seqname, strand, by])?
            .lazy()
            .group_by([by])
            .agg([
                col(seqname)
                    .unique()
                    .count()
                    .neq(lit(1))
                    .alias("seqname_any"),
                col(strand).unique().count().neq(lit(1)).alias("strand_any"),
            ])
            .select([col("seqname_any").any(true), col("strand_any").any(true)])
            .collect()?
            .get_row(0)?
            .0
            .into_iter()
            .any(|c| c != AnyValue::Boolean(false));
        if any_invalid {
            bail!("The genes are not well defined. All features of a gene should be defined in the same seqname and strand. Cannot proceed.")
        };
        // Collapse each group to its extremes.
        exon_gr.df = exon_gr
            .df
            .lazy()
            .group_by([seqname, by, strand])
            .agg([col(start).min(), col(end).max()])
            .collect()?;
        exon_gr.fix_field_columns(false)?;
        Ok(exon_gr)
    }

    /// Select exon records (`feature_type == exon_feature`, default "exon"),
    /// validate their positions and strands, ensure an exon_number column
    /// exists (computing it when missing or null), and return them sorted by
    /// seqname/strand/transcript_id/exon_number.
    pub fn exons(
        &self,
        exon_feature: Option<&str>,
        multithreaded: bool,
    ) -> anyhow::Result<Grangers> {
        self.validate(false, true)?;
        let exon_feature = exon_feature.unwrap_or("exon");
        let feature_type = self.get_column_name_str("feature_type", false)?;
        if self.column(feature_type)?.null_count() > 0 {
            warn!("Found rows with a null `{}` value. These rows will be ignored when selecting exon records.", feature_type)
        }
        let mut exon_gr = self.filter(feature_type, &[exon_feature], true)?;
        let mut fc = self.field_columns().clone();
        // Resolve each field to owned names first so the &str bindings below
        // can coexist with later mutable uses of `exon_gr`.
        let seqname_s = self.get_column_name("seqname", true)?;
        let seqname = seqname_s.as_str();
        let start_s = self.get_column_name("start", true)?;
        let start = start_s.as_str();
        let end_s = self.get_column_name("end", true)?;
        let end = end_s.as_str();
        let strand_s = self.get_column_name("strand", true)?;
        let strand = strand_s.as_str();
        let transcript_id_s = self.get_column_name("transcript_id", false)?;
        let transcript_id = transcript_id_s.as_str();
        if exon_gr.column(transcript_id)?.null_count() > 0 {
            bail!("Found exon features with a null transcript_id; Cannot proceed")
        }
        // Every exon must carry an explicit '+' or '-' strand.
        if !is_in(
            &exon_gr.column(strand)?.as_materialized_series().unique()?,
            &Series::new("valid stands".into(), VALIDSTRANDS),
        )?
        .all()
        {
            bail!("Found exons that do not have a valid strand (+ or -). Cannot proceed.")
        }
        // Flag (seqname, transcript) groups containing more than one strand.
        // NOTE(review): grouping includes seqname, so a transcript split
        // across chromosomes may not be caught despite the message — confirm.
        let tx_strand = exon_gr
            .df()
            .select([seqname, transcript_id, strand])?
            .lazy()
            .group_by([seqname, transcript_id])
            .agg([col(strand).unique().count().gt(lit(1)).alias("is_solo")])
            .collect()?;
        if tx_strand.column("is_solo")?.bool()?.any() {
            bail!(
                "Found transcripts with exons from multiple chromosomes or strands; Cannot proceed"
            )
        }
        if let Some(start_min) = exon_gr.column(start)?.i64()?.min() {
            if start_min < 1 {
                bail!("Found exons with non-positive start position. Cannot proceed.")
            }
        } else {
            bail!(
                "Cannot get min value in the {} column. Cannot proceed.",
                start
            )
        }
        if let Some(end_min) = exon_gr.column(end)?.i64()?.min() {
            if end_min < 1 {
                // NOTE(review): message says "start" but this checks `end`.
                bail!("Found exons with non-positive start position. Cannot proceed.")
            }
        } else {
            bail!(
                "Cannot get min value in the {} column. Cannot proceed.",
                end
            )
        }
        // If the recorded exon_number column has nulls, recompute it instead.
        if fc.exon_number.is_some() && exon_gr.column(fc.exon_number().unwrap())?.null_count() > 0 {
            warn!("The {} column contains null values. Will compute the exon number from exon start position .", fc.exon_number().unwrap());
            fc.exon_number = None;
        }
        let exon_number = if let Some(exon_number) = fc.exon_number() {
            exon_number.to_string()
        } else {
            fc.exon_number = Some("exon_number".to_string());
            // Rank exons per transcript by start (strand-aware; see add_order).
            exon_gr.add_order(
                Some(&[transcript_id]),
                "exon_number",
                Some(1),
                multithreaded,
            )?;
            "exon_number".to_string()
        };
        // Sort lexically by seqname/strand/transcript, then by exon_number.
        exon_gr.df = exon_gr
            .df
            .lazy()
            .with_column(col(exon_number.as_str()).cast(DataType::UInt32))
            .select([all().sort_by(
                [
                    col(seqname).cast(DataType::Categorical(None, CategoricalOrdering::Lexical)),
                    col(strand).cast(DataType::Categorical(None, CategoricalOrdering::Lexical)),
                    col(transcript_id)
                        .cast(DataType::Categorical(None, CategoricalOrdering::Lexical)),
                    col(exon_number.as_str()),
                ],
                SortMultipleOptions::default().with_multithreaded(multithreaded),
            )])
            .collect()?;
        fc.fix(exon_gr.df(), false)?;
        exon_gr.field_columns = fc;
        Ok(exon_gr)
    }

    /// Extend each interval by `length` at the chosen end(s). Which physical
    /// coordinate moves depends on the strand unless `ignore_strand` is set.
    pub fn extend(
        &mut self,
        length: i64,
        extend_option: &ExtendOption,
        ignore_strand: bool,
    ) -> anyhow::Result<()> {
        self.validate(false, true)?;
        let start_s = self.get_column_name("start", true)?;
        let start = start_s.as_str();
        let end_s = self.get_column_name("end", true)?;
        let end = end_s.as_str();
        let strand_s = self.get_column_name("strand", true)?;
        let strand = strand_s.as_str();
        // Strand-aware extension requires every strand to be '+' or '-'.
        // (Precedence: `(A & B) && (C | D)` — `&`/`|` bind tighter than `&&`.)
        if (!ignore_strand) & (extend_option != &ExtendOption::Both)
            && self.column(strand)?.is_null().any()
                | !is_in(
                    &self.column(strand)?.as_materialized_series().unique()?,
                    &Series::new("valid stands".into(), VALIDSTRANDS),
                )?
                .all()
        {
            bail!("The strand column contains values other than {:?}. Please remove them first or set ignore_strand to true.", VALIDSTRANDS)
        }
        // Extending both ends is strand-independent.
        if let ExtendOption::Both = extend_option {
            self.df
                .with_column(self.df.column(start)?.clone() - length)?;
            self.df.with_column(self.df.column(end)?.clone() + length)?;
            return Ok(());
        }
        if ignore_strand {
            match extend_option {
                ExtendOption::Start => {
                    self.df
                        .with_column(self.df.column(start)?.clone() - length)?;
                    return Ok(());
                }
                ExtendOption::End => {
                    self.df.with_column(self.df.column(end)?.clone() + length)?;
                    return Ok(());
                }
                _ => {}
            }
        } else {
            let mut df = self.df().select([start, end, strand])?;
            df = df
                .lazy()
                .with_columns([
                    // Move `start` when extending the Start of '+' features or
                    // the End of '-' features; `end` in the mirrored cases.
                    when(
                        col(strand)
                            .eq(lit("+"))
                            .eq(lit(extend_option == &ExtendOption::Start))
                            .or(col(strand)
                                .eq(lit("-"))
                                .eq(lit(extend_option == &ExtendOption::End))),
                    )
                    .then(col(start).sub(lit(length)))
                    .otherwise(col(start))
                    .alias(start),
                    when(
                        col(strand)
                            .eq(lit("-"))
                            .eq(lit(extend_option == &ExtendOption::Start))
                            .or(col(strand)
                                .eq(lit("+"))
                                .eq(lit(extend_option == &ExtendOption::End))),
                    )
                    .then(col(end).add(lit(length)))
                    .otherwise(col(end))
                    .alias(end),
                ])
                .collect()?;
            self.df.with_column(df.column(start)?.clone())?;
            self.df.with_column(df.column(end)?.clone())?;
        }
        Ok(())
    }

    /// Return flanking regions of `width` for each interval, controlled by
    /// `FlankOptions` (side, both-sides, strand handling). Column order of
    /// the result matches the input dataframe.
    pub fn flank(&self, width: i64, options: FlankOptions) -> anyhow::Result<Grangers> {
        self.validate(false, true)?;
        let start_s = self.get_column_name("start", true)?;
        let start = start_s.as_str();
        let end_s = self.get_column_name("end", true)?;
        let end = end_s.as_str();
        let strand_s = self.get_column_name("strand", true)?;
        let strand = strand_s.as_str();
        let df = self
            .df()
            .clone()
            .lazy()
            // start_flags_temp: true when the flank is taken at the feature's
            // "start" side (strand-aware unless ignore_strand).
            .with_column(
                when(options.ignore_strand)
                    .then(lit(true))
                    .otherwise(col(strand).eq(lit("-")).neq(lit(options.start)))
                    .alias("start_flags_temp"),
            )
            // Compute the flank's start; its end is derived from it below.
            .with_column(
                when(options.both)
                    .then(
                        when(col("start_flags_temp").eq(lit(true)))
                            .then(col(start) - lit(width).abs())
                            .otherwise(col(end) - lit(width).abs() + lit(1)),
                    )
                    .otherwise(
                        when(width >= 0)
                            .then(
                                when(col("start_flags_temp").eq(lit(true)))
                                    .then(col(start) - lit(width))
                                    .otherwise(col(end) + lit(1)),
                            )
                            .otherwise(
                                when(col("start_flags_temp").eq(lit(true)))
                                    .then(col(start))
                                    .otherwise(col(end) + lit(width) + lit(1)),
                            ),
                    )
                    .alias(start),
            )
            // end = start + |width| * (2 if both else 1) - 1
            .select([
                all().exclude([end, "start_flags_temp"]),
                col(start)
                    .add(
                        (lit(width)
                            .abs()
                            .mul(when(lit(options.both)).then(lit(2)).otherwise(lit(1))))
                        .sub(lit(1)),
                    )
                    .alias(end),
            ])
            // Restore the original column order.
            .select(
                self.df()
                    .get_column_names()
                    .iter()
                    .map(|&x| col(x.to_owned()))
                    .collect::<Vec<Expr>>(),
            )
            .collect()?;
        Grangers::new(
            df,
            self.seqinfo.clone(),
            self.misc.clone(),
            self.interval_type,
            self.field_columns.clone(),
            false,
        )
    }

    /// Set difference against `boundary` ranges. Not yet implemented.
    pub fn setdiff(
        &self,
        _boundary: Grangers,
        _on: &str,
        _boundary_on: &str,
    ) -> anyhow::Result<Grangers> {
        todo!("not yet implemented");
    }

    /// Convert `seqinfo` into whole-sequence boundary ranges. Not yet
    /// implemented. NOTE(review): "bounary" is a typo in the public name,
    /// kept as-is for API stability.
    pub fn seqinfo_to_bounary(&self) -> anyhow::Result<Grangers> {
        todo!("not yet implemented");
    }

    /// Compute the gaps (e.g. introns) between the merged intervals of each
    /// `by` group: first merge overlapping intervals, then take the per-group
    /// complement via `apply_gaps`.
    pub fn gaps<T: AsRef<str>>(
        &self,
        by: &[T],
        ignore_strand: bool,
        slack: Option<usize>,
        keep_columns: Option<&[&str]>,
        multithreaded: bool,
    ) -> anyhow::Result<Grangers> {
        let mut gr = self.merge(by, ignore_strand, slack, keep_columns, multithreaded)?;
        gr.df = gr.apply(
            by,
            None,
            ignore_strand,
            apply_gaps,
            keep_columns,
            multithreaded,
        )?;
        Ok(gr)
    }

    /// Add an order column named `name`, starting at `offset` (default 1).
    ///
    /// With `by`, rows are ranked within each group by `start` — ascending
    /// when the group's first strand is '+', descending otherwise — and the
    /// rank is stored as a String. Without `by`, a plain row index is
    /// intended.
    pub fn add_order(
        &mut self,
        by: Option<&[&str]>,
        name: &str,
        offset: Option<u32>,
        multithreaded: bool,
    ) -> anyhow::Result<()> {
        self.validate(false, true)?;
        let offset = offset.unwrap_or(1);
        if let Some(by) = by {
            // `by_col` is only built to validate the names via
            // `get_column_name_str` (which bails on nulls); `.over(by)` below
            // uses the raw `by` slice.
            let mut by_col = Vec::new();
            for b in by.iter() {
                by_col.push(col(self.get_column_name_str(b, true)?))
            }
            let strand_s = self.get_column_name("strand", false)?;
            let strand = strand_s.as_str();
            let start_s = self.get_column_name("start", true)?;
            let start = start_s.as_str();
            self.df = self
                .df()
                .clone()
                .lazy()
                .with_column(
                    when(col(strand).first().eq(lit("+")))
                        .then(
                            col(start)
                                .arg_sort(SortOptions::default().with_multithreaded(multithreaded)),
                        )
                        .otherwise(
                            col(start).arg_sort(
                                SortOptions::default()
                                    .with_order_descending(true)
                                    .with_multithreaded(multithreaded),
                            ),
                        )
                        .add(Expr::Literal(LiteralValue::UInt32(offset)))
                        .over(by)
                        .cast(DataType::String)
                        .alias(name),
                )
                .collect()?;
        } else {
            // NOTE(review): `with_row_index` returns a new DataFrame, and the
            // result is discarded here — so this branch appears to add no
            // column. Confirm, and assign the result back to `self.df` if so.
            self.df.with_row_index(name.into(), Some(offset))?;
        }
        Ok(())
    }

    /// Drop rows containing nulls in the given `fields` (all columns when
    /// `None`). Field names are resolved to actual column names first.
    pub fn drop_nulls(&mut self, fields: Option<&[&str]>) -> anyhow::Result<()> {
        self.validate(false, true)?;
        let cols: Vec<String> = match fields {
            Some(names) => self
                .columns(names)?
                .iter()
                .map(|s| s.name().to_owned().into_string())
                .collect(),
            None => self
                .df
                .get_column_names()
                .into_iter()
                .map(|s| s.to_owned().into_string())
                .collect(),
        };
        *self.df_mut() = self.df().drop_nulls(Some(&cols))?;
        Ok(())
    }

    /// Merge overlapping (or within-`slack`) intervals within each `by` group
    /// via `apply_merge`, returning a new `Grangers`.
    pub fn merge<T: AsRef<str>>(
        &self,
        by: &[T],
        ignore_strand: bool,
        slack: Option<usize>,
        keep_columns: Option<&[&str]>,
        multithreaded: bool,
    ) -> anyhow::Result<Grangers> {
        self.validate(false, true)?;
        let df = self.apply(
            by,
            slack,
            ignore_strand,
            apply_merge,
            keep_columns,
            multithreaded,
        )?;
        Grangers::new(
            df,
            self.seqinfo.clone(),
            self.misc.clone(),
            IntervalType::default(),
            self.field_columns.clone(),
            false,
        )
    }

    /// Shared driver for `merge` and `gaps`: group the dataframe by `by`
    /// (plus seqname and, unless `ignore_strand`, strand), run `apply_fn`
    /// over each group's packed (start, end) struct column, then explode the
    /// returned [start, end] pairs back into rows and restore column order.
    fn apply<F, T: AsRef<str>>(
        &self,
        by: &[T],
        slack: Option<usize>,
        ignore_strand: bool,
        apply_fn: F,
        keep_columns: Option<&[&str]>,
        multithreaded: bool,
    ) -> anyhow::Result<DataFrame>
    where
        F: Fn(Column, i64) -> Result<Option<polars::prelude::Column>, PolarsError>
            + Copy
            + std::marker::Send
            + std::marker::Sync
            + 'static,
    {
        self.validate(false, true)?;
        let df = self.df();
        let fc = self.field_columns();
        let seqname = fc.seqname();
        let start = fc.start();
        let end = fc.end();
        let strand = fc.strand();
        // Deduplicate the grouping columns after resolving their names.
        let mut by_hash: HashSet<&str> = HashSet::with_capacity(by.len());
        for name in by.iter() {
            let name = self.get_column_name_str(name.as_ref(), true)?;
            by_hash.insert(name);
        }
        // Non-short-circuiting `|` so BOTH `take` calls run, removing start
        // and end from the grouping set before the check.
        if by_hash.take(start).is_some() | by_hash.take(end).is_some() {
            bail!("The provided `by` vector cannot contain the start or end column")
        };
        // Default slack is 1; `s < 1` means the caller passed zero (usize).
        let slack = if let Some(s) = slack {
            if s < 1 {
                warn!("It usually does not make sense to set slack as zero.")
            }
            s as i64
        } else {
            1
        };
        if ignore_strand {
            if by_hash.take(strand).is_some() {
                warn!("Remove `strand` from the provided `by` vector as the ignored_strand flag is set.")
            }
        } else {
            by_hash.insert(strand);
        }
        if by_hash.insert(seqname) {
            debug!("Added `seqname` to the `by` vector as it is required.")
        };
        let by: Vec<&str> = by_hash.into_iter().collect();
        // Columns that must be materialized for grouping and the apply step.
        let mut selected = by.to_vec();
        if !selected.contains(&seqname) {
            selected.push(seqname);
        }
        if !selected.contains(&start) {
            selected.push(start);
        }
        if !selected.contains(&end) {
            selected.push(end);
        }
        if !ignore_strand && !selected.contains(&strand) {
            selected.push(strand);
        }
        if self.any_nulls(&selected, true, false)? {
            warn!("Found null value(s) in the selected columns -- {:?}. As null will be used for grouping, we recommend dropping all null values by calling gr.drops_nulls() beforehand.", selected)
        }
        // Output ordering: extra `by` columns first, then
        // seqname asc / start asc / end desc and, unless ignored, strand asc.
        let mut sorted_by_exprs_essential = vec![col(seqname), col(start), col(end)];
        let mut sorted_by_desc_essential = vec![false, false, true];
        if !ignore_strand {
            sorted_by_exprs_essential.push(col(strand));
            sorted_by_desc_essential.push(false);
        }
        let mut sorted_by_exprs: Vec<Expr> = by
            .iter()
            .filter(|&&n| !sorted_by_exprs_essential.contains(&col(n)))
            .map(|&n| col(n))
            .collect();
        let mut sorted_by_desc = vec![false; sorted_by_exprs.len()];
        sorted_by_exprs.extend(sorted_by_exprs_essential);
        sorted_by_desc.extend(sorted_by_desc_essential);
        if let Some(keep_columns) = keep_columns {
            for &c in keep_columns {
                if !selected.contains(&self.get_column_name_str(c, false)?) {
                    selected.push(c);
                }
            }
        }
        let mut df = df.select(selected)?;
        df = df
            .lazy()
            .sort_by_exprs(
                &sorted_by_exprs,
                SortMultipleOptions::default()
                    .with_order_descending_multi(sorted_by_desc.clone())
                    .with_multithreaded(multithreaded),
            )
            .group_by(by.iter().map(|&s| col(s)).collect::<Vec<Expr>>())
            .agg([
                // Keep the first value of every carried column per group.
                all().exclude([start, end]).first(),
                // Pack (start, end) into a struct; `apply_fn` emits a list of
                // [start, end] pairs per group (merge or gaps semantics).
                as_struct([col(start), col(end)].to_vec())
                    .apply(
                        move |s| apply_fn(s, slack),
                        GetOutput::from_type(DataType::List((DataType::Int64).into())),
                    )
                    .alias("start_end_list-temp-nobody-will-use-this-name-right"),
            ])
            .explode(["start_end_list-temp-nobody-will-use-this-name-right"])
            .with_columns([
                col("start_end_list-temp-nobody-will-use-this-name-right")
                    .list()
                    .get(lit(0), false)
                    .alias(start),
                col("start_end_list-temp-nobody-will-use-this-name-right")
                    .list()
                    .get(lit(1), false)
                    .alias(end),
                // When strand is ignored, overwrite it with "."; otherwise
                // write the literal into a throwaway column dropped below.
                lit(".").alias(if ignore_strand {
                    strand
                } else {
                    "ignore_strand-temp-nobody-will-use-this-name-right"
                }),
            ])
            .drop_nulls(Some(vec![cols([start, end])]))
            .with_column(
                lit(".")
                    .cast(DataType::String)
                    .alias("ignore_strand-temp-nobody-will-use-this-name-right"),
            )
            .select([all().exclude([
                "start_end_list-temp-nobody-will-use-this-name-right",
                "ignore_strand-temp-nobody-will-use-this-name-right",
            ])])
            .select([
                col(seqname),
                col(start),
                col(end),
                col(strand),
                all().exclude([seqname, start, end, strand]),
            ])
            .sort_by_exprs(
                sorted_by_exprs,
                SortMultipleOptions::default()
                    .with_order_descending_multi(sorted_by_desc.clone())
                    // NOTE(review): `multithreaded` is passed to
                    // `with_maintain_order` here — looks like a copy-paste of
                    // `with_multithreaded`; confirm intent.
                    .with_maintain_order(multithreaded),
            )
            .collect()?;
        Ok(df)
    }
}
impl Grangers {
/// Builds one interval tree ([`Lapper`]) per `[seqname, strand]` pair from
/// the features of this `Grangers` object.
///
/// Each interval stores the 0-based row index of its feature together with
/// the row's values for the `group_by` columns. Interval stops are `end + 1`
/// because Grangers coordinates are inclusive while `rust_lapper` intervals
/// are half-open on the right.
///
/// # Arguments
/// * `ignore_invalid` - when false, bail if any feature has a non-positive
///   start/end or (unless `ignore_strand` is set) a strand other than
///   `+`/`-`; when true, such rows are skipped with a warning.
/// * `ignore_strand` - skip strand validation and key every interval under
///   the placeholder strand `"."`.
/// * `group_by` - columns whose values are attached to each interval
///   (usually attribute columns such as `gene_id`).
///
/// # Errors
/// Fails on validation errors, missing columns, or null/unexpected values in
/// the essential or group-by columns.
pub fn build_lappers<T: AsRef<str>>(
    &mut self,
    ignore_invalid: bool,
    ignore_strand: bool,
    group_by: &[T],
) -> anyhow::Result<HashMap<[String; 2], LapperType>> {
    let start_time = std::time::Instant::now();
    self.validate(false, true)?;
    // Resolve the user-facing group-by names to actual dataframe columns.
    let mut by = Vec::new();
    for b in group_by.iter() {
        let name = b.as_ref();
        if !self.field_columns().gtf_fields().contains(&name) {
            warn!("The provided `by` vector contains a non-attribute column. There should be a strong reason of doing so")
        }
        by.push(self.get_column_name_str(name, true)?);
    }
    let start_s = self.get_column_name("start", true)?;
    let start = start_s.as_str();
    let end_s = self.get_column_name("end", true)?;
    let end = end_s.as_str();
    let seqname_s = self.get_column_name("seqname", true)?;
    let seqname = seqname_s.as_str();
    let strand_s = self.get_column_name("strand", true)?;
    let strand = strand_s.as_str();
    let selected = [start, end, seqname, strand];
    let df = self.df();
    // Per-row validity flags: positive coordinates ("pos_valid"), and
    // additionally a concrete +/- strand ("pos_strand_valid").
    let valid_rows_df = df
        .select([start, end, strand])?
        .lazy()
        .select([
            col(start).gt(lit(0)),
            col(end).gt(lit(0)),
            col(strand).eq(lit("+")).or(col(strand).eq(lit("-"))),
        ])
        .select([
            col(start).and(col(end)).alias("pos_valid"),
            col(start)
                .and(col(end))
                .and(col(strand))
                .alias("pos_strand_valid"),
        ])
        .collect()?;
    let valid_pos = valid_rows_df.column("pos_valid")?.as_materialized_series();
    let valid_pos_strand = valid_rows_df
        .column("pos_strand_valid")?
        .as_materialized_series();
    if !ignore_invalid {
        if ignore_strand {
            if valid_pos.iter().any(|v| v != AnyValue::Boolean(true)) {
                bail!("Found features with non-positive start/end position. Please remove them first or set ignore_invalid to true.")
            }
        } else {
            if valid_pos_strand
                .iter()
                .any(|v| v != AnyValue::Boolean(true))
            {
                bail!("Found features with non-positive start/end position or an invalid strand. Please remove them first or set ignore_invalid to true.")
            }
        }
    }
    type Iv = Interval<u64, (usize, Vec<String>)>;
    // Row-wise AnyValue iterators over the group-by and essential columns,
    // advanced in lock-step with the validity flags below.
    let mut by_iters = df
        .columns(by)?
        .iter()
        .map(|s| s.as_materialized_series().iter())
        .collect::<Vec<_>>();
    let mut ess_iters = self
        .df()
        .columns(selected)?
        .iter()
        .map(|s| s.as_materialized_series().iter())
        .collect::<Vec<_>>();
    let valid_rows = if ignore_strand {
        valid_pos
    } else {
        valid_pos_strand
    };
    let mut lapper_tree_vec_hm = HashMap::new();
    for (rid, is_valid) in valid_rows.iter().enumerate() {
        if is_valid != AnyValue::Boolean(true) {
            // Report through the tracing logger instead of printing to
            // stdout (this is library code).
            warn!("Found invalid row at row {}: {}", rid, is_valid);
            ess_iters[0].next();
            ess_iters[1].next();
            ess_iters[2].next();
            ess_iters[3].next();
            // Bug fix: the group-by iterators must be advanced as well when
            // a row is skipped; otherwise every later interval receives the
            // group-by values of an earlier row.
            for it in by_iters.iter_mut() {
                it.next();
            }
            continue;
        }
        let s = ess_iters[0]
            .next()
            .expect("should have as many iterations as rows")
            .cast(&DataType::Int64)
            .try_extract::<i64>()? as u64;
        // +1 converts the inclusive end into rust-lapper's exclusive stop.
        let e = ess_iters[1]
            .next()
            .expect("should have as many iterations as rows")
            .cast(&DataType::Int64)
            .try_extract::<i64>()? as u64
            + 1;
        let seqn = if let AnyValue::String(t) = ess_iters[2]
            .next()
            .expect("should have as many iterations as rows")
        {
            t.to_string()
        } else {
            bail!("Could not get the seqname of the feature")
        };
        let strd = if ignore_strand {
            String::from(".")
        } else if let AnyValue::String(t) = ess_iters[3]
            .next()
            .expect("should have as many iterations as rows")
        {
            t.to_string()
        } else {
            bail!("Could not get the strand of the feature")
        };
        let mut by_vec = Vec::new();
        for it in by_iters.iter_mut() {
            let v = if let AnyValue::String(t) =
                it.next().expect("should have as many iterations as rows")
            {
                t.to_string()
            } else {
                bail!("Could not get the value of a group-by column for the feature")
            };
            by_vec.push(v);
        }
        // Bucket intervals by [seqname, strand].
        let lapper_tree_vec = lapper_tree_vec_hm
            .entry([seqn.clone(), strd.clone()])
            .or_default();
        lapper_tree_vec.push(Iv {
            start: s,
            stop: e,
            val: (rid, by_vec),
        });
    }
    // Materialize one Lapper per (seqname, strand) bucket.
    let mut lappers = HashMap::new();
    for (key, lapper_tree_vec) in lapper_tree_vec_hm.into_iter() {
        let lapper = Lapper::new(lapper_tree_vec);
        lappers.insert(key, lapper);
    }
    let duration: std::time::Duration = start_time.elapsed();
    debug!("build rust-lappers in {:?}", duration);
    Ok(lappers)
}
/// Queries the lappers built by [`Grangers::build_lappers`] for features
/// overlapping `_interval` on the given seqname/strand.
///
/// # Panics
/// Not yet implemented — always panics via `todo!`.
pub fn _lapper_find<T: AsRef<str>>(
    &self,
    _interval: InclusiveInterval,
    _seqname: T,
    _strand: Option<Strand>,
) -> anyhow::Result<()> {
    todo!("not yet implemented");
}
}
impl Grangers {
/// Writes the spliced sequence of every transcript (its exon sequences
/// concatenated in order) to `out_file` as FASTA, without record filtering.
///
/// Thin wrapper around
/// [`Grangers::write_transcript_sequences_with_filter`] with a `None`
/// filter callback.
pub fn write_transcript_sequences<T: AsRef<Path>, W: Write>(
    &mut self,
    ref_path: T,
    out_file: W,
    exon_name: Option<&str>,
    multithreaded: bool,
) -> anyhow::Result<()> {
    self.write_transcript_sequences_with_filter(
        ref_path,
        out_file,
        exon_name,
        multithreaded,
        &mut None::<fn(&noodles::fasta::Record) -> bool>,
    )
}
/// Writes the spliced sequence of every transcript to `out_file` as FASTA.
///
/// Exon features come from `self.exons(exon_name, multithreaded)`; each
/// transcript's exon sequences are concatenated in dataframe order.
/// `record_filter`, when present, is called with every assembled record and
/// may return `false` to skip writing it.
///
/// # Errors
/// Fails when exons exceed the reference sequence bounds, when a transcript
/// id or exon sequence is null, or on I/O problems.
pub fn write_transcript_sequences_with_filter<T: AsRef<Path>, W: Write, F>(
    &mut self,
    ref_path: T,
    out_file: W,
    exon_name: Option<&str>,
    multithreaded: bool,
    record_filter: &mut Option<F>,
) -> anyhow::Result<()>
where
    F: FnMut(&noodles::fasta::Record) -> bool,
{
    self.validate(false, true)?;
    // Reduce the exon dataframe to the columns needed for assembly.
    let mut exon_gr = self.exons(exon_name, multithreaded)?;
    exon_gr.df = exon_gr.df().select([
        exon_gr.get_column_name_str("seqname", true)?,
        exon_gr.get_column_name_str("start", true)?,
        exon_gr.get_column_name_str("end", true)?,
        exon_gr.get_column_name_str("strand", true)?,
        exon_gr.get_column_name_str("transcript_id", true)?,
        exon_gr.get_column_name_str("exon_number", true)?,
    ])?;
    let mut fc = exon_gr.field_columns().clone();
    fc.fix(exon_gr.df(), false)?;
    exon_gr.field_columns = fc;
    let fc = exon_gr.field_columns();
    let seqname = fc.seqname();
    let end = fc.end();
    let transcript_id = fc.transcript_id().unwrap();
    // Seqnames present in the exon table; reference records outside this set
    // are skipped without even filtering the dataframe.
    let all_seqnames = exon_gr
        .seqname()?
        .unique()?
        .str()?
        .into_iter()
        .map(|s| s.unwrap().to_string())
        .collect::<HashSet<_>>();
    let mut reader = grangers_utils::get_noodles_reader_from_path(ref_path)?;
    // 4 MiB output buffer; usize::MAX disables line wrapping so each record
    // is written as a single sequence line.
    let out_writer = BufWriter::with_capacity(4194304, out_file);
    let mut writer = noodles::fasta::writer::Builder::default()
        .set_line_base_count(usize::MAX)
        .build_from_writer(out_writer);
    for result in reader.records() {
        let record = result?;
        let record_name = std::str::from_utf8(record.name())?;
        // NOTE(review): this only removes a single trailing space; if record
        // names can carry a description after a space (`>chr1 desc`),
        // splitting on the first space may be what was intended — confirm.
        let chr_name = record_name.strip_suffix(' ').unwrap_or(record_name);
        if !all_seqnames.contains(chr_name) {
            continue;
        }
        let chr_gr = exon_gr.filter(seqname, &[chr_name], false)?;
        if chr_gr.df().height() == 0 {
            continue;
        }
        // Every exon must fit on this reference sequence.
        if let Some(end_max) = chr_gr.df().column(end)?.i64()?.max() {
            if end_max > record.sequence().len() as i64 {
                bail!("Found exons that exceed the length of the reference sequence. Cannot proceed")
            }
        } else {
            bail!("Could not get the maximum end value of the exons. Cannot proceed")
        }
        let chr_seq_vec = chr_gr.get_sequences_fasta_record(&record, &OOBOption::Skip)?;
        if chr_seq_vec
            .iter()
            .map(|f| f.is_none())
            .fold(0, |acc, x| acc + x as usize)
            > 0
        {
            bail!("Found invalid exons that exceed the chromosome length; Cannot proceed")
        }
        let mut tx_id_iter = chr_gr
            .df()
            .column(transcript_id)?
            .str()?
            .into_iter()
            .peekable();
        let mut curr_tx = if let Some(id) = tx_id_iter
            .peek()
            .with_context(|| "Could not get the first transcript id")?
        {
            id.to_string()
        } else {
            bail!("Could not get the first transcript id")
        };
        // Exon rows of the same transcript are assumed adjacent (the
        // `exons()` accessor sorts by transcript and exon number — TODO
        // confirm): accumulate bases until the id changes, then emit the
        // assembled transcript.
        let mut exon_u8_vec: Vec<u8> = Vec::new();
        for (tx_id, seq) in tx_id_iter.zip(chr_seq_vec.into_iter()) {
            if let (Some(tx_id), Some(seq)) = (tx_id, seq) {
                if tx_id == curr_tx {
                    exon_u8_vec.extend(seq.as_ref().iter());
                } else {
                    let definition = Definition::new(curr_tx.clone(), None);
                    let sequence = Sequence::from_iter(exon_u8_vec.clone());
                    let rec = &noodles::fasta::Record::new(definition, sequence);
                    // Let the optional user filter veto this record.
                    let write_record: bool;
                    if let Some(ref mut cb) = record_filter {
                        write_record = cb(rec);
                    } else {
                        write_record = true;
                    }
                    if write_record {
                        writer
                            .write_record(rec)
                            .with_context(|| {
                                format!(
                                    "Could not write the sequence of transcript {} to the output file",
                                    curr_tx
                                )
                            })?;
                    }
                    exon_u8_vec.clear();
                    exon_u8_vec.extend(seq.as_ref().iter());
                    curr_tx = tx_id.to_string();
                }
            } else {
                bail!("Found null transcript id or empty exon sequence. This should not happen, please report this bug.")
            }
        }
        // Flush the last transcript of this chromosome (the loop above only
        // emits on an id change).
        let definition = Definition::new(curr_tx.clone(), None);
        let sequence = Sequence::from_iter(exon_u8_vec.clone());
        let rec = &noodles::fasta::Record::new(definition, sequence);
        let write_record: bool;
        if let Some(ref mut cb) = record_filter {
            write_record = cb(rec);
        } else {
            write_record = true;
        }
        if write_record {
            writer.write_record(rec).with_context(|| {
                format!(
                    "Could not write the sequence of transcript {} to the output file",
                    curr_tx
                )
            })?;
        }
        exon_u8_vec.clear();
    }
    Ok(())
}
pub fn _write_sequences<T: AsRef<Path>>(
&mut self,
ref_path: T,
out_path: T,
ignore_strand: bool,
name_column: Option<&str>,
oob_option: &OOBOption,
) -> anyhow::Result<()> {
let out_path = out_path.as_ref();
fs::create_dir_all(out_path.parent().with_context(|| {
format!(
"Could not get the parent directory of the given output file path {:?}",
out_path.as_os_str()
)
})?)?;
let out_file = std::fs::File::create(out_path).with_context(|| {
format!(
"Could not create the output file {:?}",
out_path.as_os_str()
)
})?;
let out_file = BufWriter::with_capacity(4194304, out_file);
self.validate(false, true)?;
let name_column = if let Some(name_column) = name_column {
if self.get_column_name_str(name_column, true).is_ok() {
self.get_column_name(name_column, false)?
} else {
warn!("The provided name column {:?} for naming the extracted sequences is not in the dataframe. Row order will will be used instead.", name_column);
"row_order".to_owned()
}
} else {
info!("No name column is provided. The extracted sequences will be named by the row order.");
"row_order".to_owned()
};
let mut fc = self.field_columns().clone();
let selection = [fc.seqname(), fc.start(), fc.end(), fc.strand()];
let mut df = self.df.select(selection)?;
df.with_column(Series::new(
"row_order".into(),
(0..df.height() as u32).collect::<Vec<u32>>(),
))?;
if ignore_strand {
df.with_column(Series::new(fc.strand().into(), vec!["+"; df.height()]))?;
}
fc.fix(&df, false)?;
let mut essential_gr = Grangers::new(df, None, None, IntervalType::default(), fc, false)?;
essential_gr.set_signature(self.get_signature());
let seqname = essential_gr.get_column_name_str("seqname", true)?;
let mut reader = grangers_utils::get_noodles_reader_from_path(ref_path)?;
let mut writer = noodles::fasta::writer::Builder::default()
.set_line_base_count(usize::MAX)
.build_from_writer(out_file);
let mut empty_counter = 0;
for result in reader.records() {
let record = result?;
let record_name = std::str::from_utf8(record.name())?;
let chr_name = record_name.strip_suffix(' ').unwrap_or(record_name);
let chr_gr = essential_gr.filter(seqname, &[chr_name], false)?;
if chr_gr.df().height() == 0 {
continue;
}
let name_vec = chr_gr
.df()
.column(name_column.as_str())?
.str()?
.into_iter()
.map(|s| s.unwrap())
.collect::<Vec<_>>();
let chr_seq_vec = chr_gr.get_sequences_fasta_record(&record, oob_option)?;
for (name, sequence) in name_vec.into_iter().zip(chr_seq_vec.into_iter()) {
let definition = Definition::new(name, None);
if let Some(sequence) = sequence {
writer
.write_record(&noodles::fasta::Record::new(definition, sequence))
.with_context(|| {
format!(
"Could not write sequence {} to the output file; Cannot proceed.",
name
)
})?;
} else {
empty_counter += 1;
}
}
}
if empty_counter > 0 {
warn!("Extracted empty sequence from {} records. They are usually caused by out of boundary features.", empty_counter)
}
Ok(())
}
/// Extracts and writes the sequence of every feature to `out_file` as
/// FASTA, without record filtering.
///
/// Thin wrapper around [`Grangers::write_sequences_with_filter`] with a
/// `None` filter callback.
///
/// NOTE(review): the generic parameter `F` is never used in this signature,
/// so callers cannot infer it and must supply it with a turbofish. It looks
/// like a leftover from the `_with_filter` variant — confirm whether it can
/// be removed without breaking existing call sites.
pub fn write_sequences<T: AsRef<Path>, W: Write, F>(
    &mut self,
    ref_path: T,
    out_file: W,
    ignore_strand: bool,
    name_column: Option<&str>,
    oob_option: OOBOption,
) -> anyhow::Result<()> {
    self.write_sequences_with_filter(
        ref_path,
        out_file,
        ignore_strand,
        name_column,
        oob_option,
        &mut None::<fn(&noodles::fasta::Record) -> bool>,
    )
}
pub fn write_sequences_with_filter<T: AsRef<Path>, W: Write, F>(
&mut self,
ref_path: T,
out_file: W,
ignore_strand: bool,
name_column: Option<&str>,
oob_option: OOBOption,
record_filter: &mut Option<F>,
) -> anyhow::Result<()>
where
F: FnMut(&noodles::fasta::Record) -> bool,
{
self.validate(false, true)?;
let name_column = if let Some(name_column) = name_column {
if self.get_column_name_str(name_column, true).is_ok() {
self.get_column_name(name_column, false)?
} else {
warn!("The provided name column {:?} for naming the extracted sequences is not in the dataframe. Row order will will be used instead.", name_column);
"row_order".to_owned()
}
} else {
info!("No name column is provided. The extracted sequences will be named by the row order.");
"row_order".to_owned()
};
let mut fc = self.field_columns().clone();
let selection = [
fc.seqname(),
fc.start(),
fc.end(),
fc.strand(),
name_column.as_str(),
];
let mut df = if name_column.as_str() == "row_order" {
self.df
.with_row_index("row_order".into(), None)?
.select(selection)?
} else {
self.df.select(selection)?
};
if ignore_strand {
df.with_column(Series::new(fc.strand().into(), vec!["+"; df.height()]))?;
}
fc.fix(&df, false)?;
let mut essential_gr = Grangers::new(df, None, None, IntervalType::default(), fc, false)?;
essential_gr.set_signature(self.get_signature());
let seqname_s = essential_gr.get_column_name("seqname", true)?;
let seqname = seqname_s.as_str();
let mut reader = grangers_utils::get_noodles_reader_from_path(ref_path)?;
let out_writer = BufWriter::with_capacity(4194304, out_file);
let mut writer = noodles::fasta::writer::Builder::default()
.set_line_base_count(usize::MAX)
.build_from_writer(out_writer);
let mut empty_counter = 0;
for result in reader.records() {
let record = result?;
let record_name = std::str::from_utf8(record.name())?;
let chr_name = record_name.strip_suffix(' ').unwrap_or(record_name);
let chr_gr = essential_gr.filter(seqname, &[chr_name], false)?;
if chr_gr.df().height() == 0 {
continue;
}
let name_vec = chr_gr
.df()
.column(name_column.as_str())?
.str()?
.into_iter()
.map(|s| s.unwrap())
.collect::<Vec<_>>();
let chrsi = ChrRowSeqIter::new(&chr_gr, &record, oob_option)?;
for (feat_name, chrsi_rec) in name_vec.into_iter().zip(chrsi) {
if let Ok(sequence) = chrsi_rec {
let definition = Definition::new(feat_name, None);
let rec = &noodles::fasta::Record::new(definition, sequence);
let write_record: bool;
if let Some(ref mut cb) = record_filter {
write_record = cb(rec);
} else {
write_record = true;
}
if write_record {
writer.write_record(rec).with_context(|| {
format!(
"Could not write sequence {} to the output file; Cannot proceed.",
feat_name
)
})?;
}
} else {
empty_counter += 1;
}
}
}
if empty_counter > 0 {
warn!("Unable to extract sequence for {} records. They are usually caused by out of boundary features or an invalid alphabet.", empty_counter)
}
Ok(())
}
}
impl Grangers {
/// Returns the spliced sequence of every transcript as FASTA records,
/// concatenating each transcript's exon sequences in dataframe order.
///
/// # Errors
/// Fails when exons exceed the reference sequence bounds or when a
/// transcript id or exon sequence is null.
pub fn get_transcript_sequences<T: AsRef<Path>>(
    &mut self,
    fasta_path: T,
    exon_name: Option<&str>,
    multithreaded: bool,
) -> anyhow::Result<Vec<noodles::fasta::Record>> {
    self.validate(false, true)?;
    // Reduce the exon dataframe to the columns needed for assembly.
    let mut exon_gr = self.exons(exon_name, multithreaded)?;
    exon_gr.df = exon_gr.df().select([
        exon_gr.get_column_name_str("seqname", true)?,
        exon_gr.get_column_name_str("start", true)?,
        exon_gr.get_column_name_str("end", true)?,
        exon_gr.get_column_name_str("strand", true)?,
        exon_gr.get_column_name_str("transcript_id", true)?,
        exon_gr.get_column_name_str("exon_number", true)?,
    ])?;
    let mut fc = exon_gr.field_columns().clone();
    fc.fix(exon_gr.df(), false)?;
    exon_gr.field_columns = fc;
    let fc = exon_gr.field_columns();
    let seqname = fc.seqname();
    let end = fc.end();
    let transcript_id = fc.transcript_id().unwrap();
    let mut reader = grangers_utils::get_noodles_reader_from_path(fasta_path)?;
    let mut transcript_seq_vec: Vec<noodles::fasta::Record> =
        Vec::with_capacity(self.df().column(transcript_id)?.unique()?.len());
    for result in reader.records() {
        let record = result?;
        let record_name = std::str::from_utf8(record.name())?;
        let chr_name = record_name.strip_suffix(' ').unwrap_or(record_name);
        let chr_gr = exon_gr.filter(seqname, &[chr_name], false)?;
        if chr_gr.df().height() == 0 {
            continue;
        }
        // Every exon must fit on this reference sequence.
        if let Some(end_max) = chr_gr.df().column(end)?.i64()?.max() {
            if end_max > record.sequence().len() as i64 {
                bail!("Found exons that exceed the length of the reference sequence. Cannot proceed")
            }
        } else {
            bail!("Could not get the maximum end value of the exons. Cannot proceed")
        }
        let chr_seq_vec = chr_gr.get_sequences_fasta_record(&record, &OOBOption::Skip)?;
        if chr_seq_vec
            .iter()
            .map(|f| f.is_none())
            .fold(0, |acc, x| acc + x as usize)
            > 0
        {
            bail!("Found invalid exons that exceed the chromosome length; Cannot proceed")
        }
        let mut tx_id_iter = chr_gr
            .df()
            .column(transcript_id)?
            .str()?
            .into_iter()
            .peekable();
        let mut curr_tx = if let Some(id) = tx_id_iter
            .peek()
            .with_context(|| "Could not get the first transcript id")?
        {
            id.to_string()
        } else {
            bail!("Could not get the first transcript id")
        };
        // Exon rows of a transcript are adjacent; accumulate bases until the
        // transcript id changes, then emit the assembled record.
        let mut exon_u8_vec: Vec<u8> = Vec::new();
        for (tx_id, seq) in tx_id_iter.zip(chr_seq_vec.into_iter()) {
            if let (Some(tx_id), Some(seq)) = (tx_id, seq) {
                if tx_id == curr_tx {
                    exon_u8_vec.extend(seq.as_ref().iter());
                } else {
                    let definition = Definition::new(curr_tx.clone(), None);
                    let sequence = Sequence::from_iter(exon_u8_vec.clone());
                    transcript_seq_vec.push(noodles::fasta::Record::new(definition, sequence));
                    exon_u8_vec.clear();
                    exon_u8_vec.extend(seq.as_ref().iter());
                    curr_tx = tx_id.to_string();
                }
            } else {
                bail!("Found null transcript id or empty exon sequence. This should not happen, please report this bug.")
            }
        }
        // Bug fix: flush the final transcript of this chromosome. The loop
        // above only emits a record when the transcript id changes, so
        // without this flush the last transcript's sequence was silently
        // dropped (the sibling `write_transcript_sequences_with_filter`
        // already performs this final flush).
        let definition = Definition::new(curr_tx.clone(), None);
        let sequence = Sequence::from_iter(exon_u8_vec);
        transcript_seq_vec.push(noodles::fasta::Record::new(definition, sequence));
    }
    Ok(transcript_seq_vec)
}
/// Extracts the sequence of every feature from `fasta_path`, returning a
/// vector aligned with the dataframe rows; entry `i` is `None` when row
/// `i`'s sequence could not be extracted.
///
/// Records are named by the `name` column when it exists, otherwise by the
/// global row order. When `ignore_strand` is set, every feature is treated
/// as plus-strand.
pub fn _get_sequences<T: AsRef<Path>>(
    &mut self,
    fasta_path: T,
    ignore_strand: bool,
    name: Option<&str>,
    oob_option: &OOBOption,
) -> anyhow::Result<Vec<Option<noodles::fasta::Record>>> {
    self.validate(false, true)?;
    // Bug fix: the original condition was inverted — it warned that the
    // column was "not in the dataframe" exactly when the column WAS valid,
    // and then used it anyway. Keep the column when it resolves; warn and
    // fall back to row order otherwise.
    let name = match name {
        Some(n) if self.get_column_name(n, true).is_ok() => {
            Some(self.get_column_name(n, false)?)
        }
        Some(n) => {
            warn!(
                "The provided name column {:?} is not in the dataframe. Ignored.",
                n
            );
            None
        }
        None => None,
    };
    let mut fc = self.field_columns().clone();
    // Bug fix: the name column (when present) must be selected, otherwise
    // the `column(name)` lookup on the filtered sub-dataframe below fails.
    let mut selection = vec![fc.seqname(), fc.start(), fc.end(), fc.strand()];
    if let Some(name) = &name {
        selection.push(name.as_str());
    }
    let df = self
        .df
        .select(selection)?
        .lazy()
        .with_row_index("row_order", None)
        .with_column(if ignore_strand {
            lit("+").alias("strand")
        } else {
            col("strand")
        })
        .collect()?;
    fc.fix(&df, false)?;
    let mut essential_gr = Grangers::new(df, None, None, IntervalType::default(), fc, false)?;
    essential_gr.set_signature(self.get_signature());
    let seqname = essential_gr.get_column_name_str("seqname", true)?;
    let mut reader = grangers_utils::get_noodles_reader_from_path(fasta_path)?;
    // Pre-size the output so results can be placed by global row order.
    let mut seq_vec: Vec<Option<noodles::fasta::Record>> =
        vec![None; essential_gr.df().height()];
    for result in reader.records() {
        let record = result?;
        let record_name = std::str::from_utf8(record.name())?;
        let chr_name = record_name.strip_suffix(' ').unwrap_or(record_name);
        let chr_gr = essential_gr.filter(seqname, &[chr_name], false)?;
        if chr_gr.df().height() == 0 {
            continue;
        }
        let name_vec = if let Some(name) = &name {
            chr_gr
                .df()
                .column(name)?
                .str()?
                .into_iter()
                .map(|s| s.unwrap())
                .collect::<Vec<_>>()
        } else {
            Vec::new()
        };
        let chr_seq_vec = chr_gr.get_sequences_fasta_record(&record, oob_option)?;
        // `pos` indexes rows within this chromosome's sub-dataframe (and
        // thus `name_vec`); `idx` is the global row order used both for
        // output placement and for row-order naming.
        for (pos, (idx, seq)) in chr_gr
            .df()
            .column("row_order")?
            .u32()?
            .into_iter()
            .zip(chr_seq_vec.into_iter())
            .enumerate()
        {
            let idx: usize = idx.unwrap() as usize;
            let seq_name = if name.is_some() {
                // Bug fix: index the per-chromosome name vector by the local
                // position, not by the global row index (which could be out
                // of bounds or point at the wrong feature).
                name_vec[pos].to_string()
            } else {
                idx.to_string()
            };
            let definition = Definition::new(seq_name, None);
            seq_vec[idx] = seq.map(|seq| noodles::fasta::Record::new(definition, seq));
        }
    }
    Ok(seq_vec)
}
/// Extracts the sequence of every feature from the reference FASTA file at
/// `ref_path`, returning them as a [`GrangersSequenceCollection`].
///
/// Thin wrapper around [`Grangers::get_sequences_from_read`] that opens the
/// file for the caller.
pub fn get_sequences<T: AsRef<Path>>(
    &mut self,
    ref_path: T,
    ignore_strand: bool,
    name_column: Option<&str>,
    oob_option: OOBOption,
) -> anyhow::Result<GrangersSequenceCollection> {
    self.get_sequences_from_read(
        std::fs::File::open(ref_path)?,
        ignore_strand,
        name_column,
        oob_option,
    )
}
/// Extracts the sequence of every feature from a FASTA `reader` and returns
/// them as a [`GrangersSequenceCollection`] keyed by global row order.
///
/// Records are named by `name_column` when that column exists, otherwise by
/// row order. When `ignore_strand` is set, every feature is treated as
/// plus-strand. Rows whose sequence cannot be extracted are skipped with a
/// warning and counted.
pub fn get_sequences_from_read<R: Read + 'static>(
    &mut self,
    reader: R,
    ignore_strand: bool,
    name_column: Option<&str>,
    oob_option: OOBOption,
) -> anyhow::Result<GrangersSequenceCollection> {
    self.validate(false, true)?;
    // Resolve the record-name column, falling back to row order.
    let name_column = if let Some(name_column) = name_column {
        if self.get_column_name_str(name_column, true).is_ok() {
            self.get_column_name(name_column, false)?
        } else {
            warn!("The provided name column {:?} for naming the extracted sequences is not in the dataframe. Row order will will be used instead.", name_column);
            "row_order".to_owned()
        }
    } else {
        info!("No name column is provided. The extracted sequences will be named by the row order.");
        "row_order".to_owned()
    };
    let mut fc = self.field_columns().clone();
    let selection = [
        fc.seqname(),
        fc.start(),
        fc.end(),
        fc.strand(),
        name_column.as_str(),
    ];
    // Essential columns plus a global row index; optionally force '+'
    // strand when strandedness is ignored.
    let df = self
        .df
        .select(selection)?
        .lazy()
        .with_row_index("row_order", None)
        .with_column(if ignore_strand {
            lit("+").alias("strand")
        } else {
            col("strand")
        })
        .collect()?;
    fc.fix(&df, false)?;
    let mut essential_gr = Grangers::new(df, None, None, IntervalType::default(), fc, false)?;
    essential_gr.set_signature(self.get_signature());
    let seqname = essential_gr.get_column_name_str("seqname", true)?;
    let mut reader = grangers_utils::get_noodles_reader_from_reader(reader)?;
    let sig = essential_gr.get_signature();
    let num_rec = essential_gr.df().height();
    let mut seq_coll =
        GrangersSequenceCollection::new_with_signature_and_capacity(sig, num_rec);
    let mut empty_counter = 0_usize;
    for result in reader.records() {
        let record = result?;
        let record_name = std::str::from_utf8(record.name())?;
        let chr_name = record_name.strip_suffix(' ').unwrap_or(record_name);
        let chr_gr = essential_gr.filter(seqname, &[chr_name], false)?;
        if chr_gr.df().height() == 0 {
            continue;
        }
        // Despite its name this is a fully collected Vec. The String cast
        // lets the u32 "row_order" fallback column be used for naming too.
        let name_vec_iter = chr_gr
            .df()
            .column(name_column.as_str())?
            .cast(&DataType::String)?
            .str()?
            .into_iter()
            .map(|s| {
                s.expect(
                    "The name column contains null values. Please report this bug on GitHub.",
                )
                .to_string()
            })
            .collect::<Vec<String>>();
        let row_order_iter = chr_gr
            .df()
            .column("row_order")?
            .u32()?
            .into_iter()
            .map(|s| s.expect("Could not get row order. Please report this bug on GitHub."));
        let chrsi = ChrRowSeqIter::new(&chr_gr, &record, oob_option)?;
        // Row index, name, and extracted sequence advance in lock-step.
        for ((idx, feat_name), sequence) in row_order_iter.zip(name_vec_iter).zip(chrsi) {
            let sequence = match sequence {
                Ok(sequence) => sequence,
                Err(e) => {
                    warn!("Failed to get sequence for feature {} at row {}. The error message was {:?}", feat_name, idx, e);
                    empty_counter += 1;
                    continue;
                }
            };
            let definition = Definition::new(feat_name, None);
            let record = noodles::fasta::Record::new(definition, sequence);
            seq_coll.add_record(GrangersRecordID::new(idx), record);
        }
    }
    if empty_counter > 0 {
        warn!("Unable to extract sequence for {} records. They are usually caused by out of boundary features or an invalid alphabet.", empty_counter)
    }
    Ok(seq_coll)
}
/// Returns a lazy, pinned iterator over per-feature sequences read from
/// `reader` (see [`GrangersSeqIter`]). Prefer this over
/// [`Grangers::get_sequences_from_read`] when the whole collection should
/// not be materialized at once.
pub fn iter_sequences_from_reader<R: Read + 'static>(
    &mut self,
    reader: R,
    ignore_strand: bool,
    name_column: Option<&str>,
    oob_option: OOBOption,
) -> anyhow::Result<Pin<Box<GrangersSeqIter>>> {
    self.validate(false, true)?;
    // Resolve the record-name column, falling back to row order.
    let name_column = if let Some(name_column) = name_column {
        if self.get_column_name_str(name_column, true).is_ok() {
            self.get_column_name(name_column, false)?
        } else {
            warn!("The provided name column {:?} for naming the extracted sequences is not in the dataframe. Row order will will be used instead.", name_column);
            "row_order".to_owned()
        }
    } else {
        info!("No name column is provided. The extracted sequences will be named by the row order.");
        "row_order".to_owned()
    };
    let mut fc = self.field_columns().clone();
    let selection = [
        fc.seqname(),
        fc.start(),
        fc.end(),
        fc.strand(),
        name_column.as_str(),
    ];
    // Essential columns plus a global row index; optionally force '+'
    // strand when strandedness is ignored.
    let df = self
        .df
        .select(selection)?
        .lazy()
        .with_row_index("row_order", None)
        .with_column(if ignore_strand {
            lit("+").alias("strand")
        } else {
            col("strand")
        })
        .collect()?;
    fc.fix(&df, false)?;
    let mut essential_gr = Grangers::new(df, None, None, IntervalType::default(), fc, false)?;
    essential_gr.set_signature(self.get_signature());
    let seqname = essential_gr.get_column_name_str("seqname", true)?;
    let filt_opt = GrangersFilterOpts {
        seqname: seqname.to_owned(),
        name_column,
        oob_option,
    };
    Ok(GrangersSeqIter::new(reader, filt_opt, essential_gr))
}
/// Returns a lazy, pinned iterator over per-feature sequences from the
/// reference FASTA file at `ref_path`.
///
/// Thin wrapper around [`Grangers::iter_sequences_from_reader`] that opens
/// the file for the caller.
pub fn iter_sequences<T: AsRef<Path>>(
    &mut self,
    ref_path: T,
    ignore_strand: bool,
    name_column: Option<&str>,
    oob_option: OOBOption,
) -> anyhow::Result<Pin<Box<GrangersSeqIter>>> {
    self.iter_sequences_from_reader(
        std::fs::File::open(ref_path)?,
        ignore_strand,
        name_column,
        oob_option,
    )
}
/// Extracts, for every row of this (single-seqname) `Grangers`, the
/// corresponding slice of `record`'s sequence.
///
/// The returned vector is aligned with the dataframe rows: entry `i` is
/// `None` when row `i`'s sequence could not be taken (null bounds, or
/// out-of-bound coordinates that `oob_option` does not truncate).
/// Minus-strand features are reverse-complemented.
///
/// # Errors
/// Fails if the dataframe spans more than one reference sequence, if a
/// coordinate cannot be converted to a `Position`, or if the reverse
/// complement of a minus-strand feature cannot be computed.
pub(crate) fn get_sequences_fasta_record(
    &self,
    record: &noodles::fasta::Record,
    oob_option: &OOBOption,
) -> anyhow::Result<Vec<Option<Sequence>>> {
    self.validate(true, true)?;
    let df = self.df();
    let seqname = self.get_column_name("seqname", true)?;
    let start = self.get_column_name("start", true)?;
    let end = self.get_column_name("end", true)?;
    let strand = self.get_column_name("strand", true)?;
    if df.column(&seqname)?.unique()?.len() > 1 {
        bail!("The dataframe contains more than one reference name. Please filter the dataframe by the reference name first.")
    }
    let mut seq_vec = Vec::with_capacity(df.height());
    let ses = df.columns([start, end, strand])?;
    for ((start, end), strand) in ses[0]
        .i64()?
        .into_iter()
        .zip(ses[1].i64()?.into_iter())
        .zip(ses[2].str()?.into_iter())
    {
        if let (Some(start), Some(end)) = (start, end) {
            // Truncate clamps coordinates into [1, seqlen]; otherwise they
            // are used verbatim and slicing fails for out-of-bound rows.
            let (start, end) = if oob_option == &OOBOption::Truncate {
                (
                    noodles::core::Position::try_from(std::cmp::max(1, start as usize))?,
                    noodles::core::Position::try_from(std::cmp::min(
                        record.sequence().len(),
                        end as usize,
                    ))?,
                )
            } else {
                (
                    noodles::core::Position::try_from(start as usize)?,
                    noodles::core::Position::try_from(end as usize)?,
                )
            };
            let seq = record.sequence().slice(start..=end);
            if strand == Some("-") {
                // Bug fix: always push an entry so `seq_vec` stays aligned
                // with the dataframe rows. Previously a minus-strand feature
                // whose slice failed pushed nothing, silently shortening the
                // vector and desynchronizing callers that zip it against
                // per-row iterators (their `is_none()` OOB checks could
                // never trigger for minus-strand rows).
                match seq {
                    Some(seq) => {
                        seq_vec.push(Some(seq.complement().rev().collect::<Result<_, _>>()?))
                    }
                    None => seq_vec.push(None),
                }
            } else {
                seq_vec.push(seq);
            };
        } else {
            // Null start/end: record a missing value to keep row alignment.
            seq_vec.push(None);
        }
    }
    Ok(seq_vec)
}
}
/// Options threaded through [`GrangersSeqIter`]: which column to filter
/// chromosomes on, which column names the emitted records, and how
/// out-of-bound features are handled.
pub struct GrangersFilterOpts {
    // Resolved name of the seqname column used for per-chromosome filtering.
    seqname: String,
    // Column used to name emitted records ("row_order" for the fallback).
    name_column: String,
    // Out-of-bound policy forwarded to `ChrRowSeqIter`.
    oob_option: OOBOption,
}
/// Streaming counterpart of [`Grangers::get_sequences_from_read`]: lazily
/// walks a FASTA reader and yields one record per extractable feature.
///
/// NOTE(review): `chr_seq_iter` holds (transmuted) `'static` references
/// into `chr_gr` and `seq_record`, which is why instances are only handed
/// out as `Pin<Box<Self>>` — treat field replacement order in `next()` with
/// care.
pub struct GrangersSeqIter {
    // Feature table restricted to the essential columns.
    essential_gr: Grangers,
    // Sub-table for the reference record currently being processed.
    chr_gr: Option<Grangers>,
    // FASTA reader supplying the reference sequences.
    seq_reader: grangers_utils::FastaReader,
    // Reference record currently being processed.
    seq_record: noodles::fasta::Record,
    // Filtering/naming/out-of-bound options.
    filt_opt: GrangersFilterOpts,
    // Per-chromosome record names, consumed in lock-step with the rows.
    name_vec_iter: <Vec<String> as IntoIterator>::IntoIter,
    // Per-chromosome global row indices, consumed in lock-step as well.
    row_order_iter: <Vec<u32> as IntoIterator>::IntoIter,
    // Row-wise sequence extractor for the current chromosome.
    chr_seq_iter: Option<ChrRowSeqIter<'static>>,
    // Scratch buffer for FASTA definition lines.
    def_buffer: String,
}
use core::pin::Pin;
impl GrangersSeqIter {
    /// Builds a pinned sequence iterator over `essential_gr`, reading
    /// reference records from `r`.
    ///
    /// The result is pinned because `next()` later stores a `ChrRowSeqIter`
    /// that (via transmuted `'static` references) borrows sibling fields.
    /// The initial `seq_record` ("empty"/"A") is only a placeholder and is
    /// replaced before it is ever read.
    ///
    /// # Panics
    /// Panics if a FASTA reader cannot be constructed from `r`.
    pub fn new<R: Read + 'static>(
        r: R,
        filt_opt: GrangersFilterOpts,
        essential_gr: Grangers,
    ) -> Pin<Box<Self>> {
        let reader = grangers_utils::get_noodles_reader_from_reader(r)
            .expect("couldn't create reader from input reader");
        let definition = Definition::new("empty", None);
        let sequence = Sequence::from(b"A".to_vec());
        let curr_record = noodles::fasta::Record::new(definition, sequence);
        // Start with empty name/row iterators; they are refilled per
        // chromosome inside `next()`.
        let v: Vec<String> = vec![];
        let o: Vec<u32> = vec![];
        Box::pin(GrangersSeqIter {
            essential_gr,
            chr_gr: None,
            seq_reader: reader,
            seq_record: curr_record,
            filt_opt,
            name_vec_iter: v.into_iter(),
            row_order_iter: o.into_iter(),
            chr_seq_iter: None,
            def_buffer: String::new(),
        })
    }
}
impl Iterator for GrangersSeqIter {
    type Item = (GrangersRecordID, noodles::fasta::Record);
    #[allow(clippy::question_mark)]
    /// Yields `(row id, record)` pairs for every extractable feature,
    /// pulling reference records lazily from the underlying FASTA reader.
    ///
    /// Features whose sequence cannot be extracted are silently skipped;
    /// iteration ends when the reader is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if let Some(ref mut chr_seq_iter) = self.chr_seq_iter {
                // Drain the current chromosome: sequence, name, and row
                // index advance in lock-step, one entry per feature.
                if let (Some(chr_seq_rec), Some(feat_name), Some(row_idx)) = (
                    chr_seq_iter.next(),
                    self.name_vec_iter.next(),
                    self.row_order_iter.next(),
                ) {
                    if let Ok(sequence) = chr_seq_rec {
                        return Some((
                            GrangersRecordID::new(row_idx),
                            noodles::fasta::Record::new(Definition::new(feat_name, None), sequence),
                        ));
                    }
                    // Extraction failed for this feature: skip it.
                    continue;
                }
                // Chromosome exhausted; fetch the next reference record.
                self.chr_seq_iter = None;
            } else {
                // Pull reference records until one overlaps the dataframe.
                loop {
                    self.def_buffer.clear();
                    let def_bytes = self
                        .seq_reader
                        .read_definition(&mut self.def_buffer)
                        .expect("GrangersSeqIter: could not read definition from reference file");
                    if def_bytes == 0 {
                        // EOF: no more reference records.
                        return None;
                    }
                    let definition = match self.def_buffer.parse() {
                        Ok(d) => d,
                        Err(e) => panic!("could not parse sequence definition: error {}", e),
                    };
                    let mut seq_buffer = Vec::<u8>::new();
                    let seq_bytes = self
                        .seq_reader
                        .read_sequence(&mut seq_buffer)
                        .expect("GrangersSeqIter: could not read sequence from reference file");
                    if seq_bytes == 0 {
                        warn!("GrangersSeqIter: was able to read record definition, but no sequence. This seems like a problem!");
                        return None;
                    }
                    let sequence = Sequence::from(seq_buffer);
                    self.seq_record = noodles::fasta::Record::new(definition, sequence);
                    let record_name = std::str::from_utf8(self.seq_record.name())
                        .expect("GrangersSeqIter: could not convert record name to utf8");
                    let chr_name = record_name.strip_suffix(' ').unwrap_or(record_name);
                    // Restrict the feature table to this reference record.
                    self.chr_gr = Some(
                        self.essential_gr
                            .filter(
                                self.filt_opt.seqname.clone(),
                                &[chr_name.to_string()],
                                false,
                            )
                            .expect("GrangersSeqIter: cannot filter essential_gr"),
                    );
                    if self.chr_gr.as_ref().unwrap().df().height() == 0 {
                        self.chr_gr = None;
                        continue;
                    }
                    self.name_vec_iter = self
                        .chr_gr
                        .as_ref()
                        .unwrap()
                        .df()
                        .column(self.filt_opt.name_column.as_str())
                        .expect("GrangersSeqIter: cannot get name_column")
                        .str()
                        .expect("GrangersSeqIter: cannot convert name_vec to str")
                        .into_iter()
                        .map(|s| s.unwrap().to_owned())
                        .collect::<Vec<_>>()
                        .into_iter();
                    self.row_order_iter = self
                        .chr_gr
                        .as_ref()
                        .unwrap()
                        .df()
                        .column("row_order")
                        .expect("GrangersSeqIter: cannot get row_order column")
                        .u32()
                        .expect("GrangerSeqIter: cannot convert row_order entries to u32")
                        .into_iter()
                        .map(|s| {
                            s.expect("Could not get row order. Please report this bug on GitHub.")
                        })
                        .collect::<Vec<_>>()
                        .into_iter();
                    // SAFETY(review): these transmutes erase the borrows of
                    // `self.chr_gr` / `self.seq_record` to 'static so the
                    // ChrRowSeqIter can be stored in the same (pinned)
                    // struct. Soundness relies on `chr_seq_iter` being
                    // cleared/replaced before the data it borrows is
                    // replaced — confirm this update order is preserved when
                    // modifying this method.
                    let ref_grangers = unsafe {
                        core::mem::transmute::<&Grangers, &'static Grangers>(
                            self.chr_gr.as_ref().unwrap(),
                        )
                    };
                    let ref_record = unsafe {
                        core::mem::transmute::<
                            &noodles::fasta::Record,
                            &'static noodles::fasta::Record,
                        >(&self.seq_record)
                    };
                    self.chr_seq_iter = Some(
                        ChrRowSeqIter::new(ref_grangers, ref_record, self.filt_opt.oob_option)
                            .expect("cannot create ChrRowSeqIter"),
                    );
                    break;
                }
                if self.chr_seq_iter.is_none() {
                    return None;
                }
            }
        }
    }
}
/// Row-wise sequence extractor: iterates a (single-seqname) `Grangers`
/// dataframe and yields one slice of `record`'s sequence per row.
struct ChrRowSeqIter<'a> {
    // AnyValue iterators over the start, end, and strand columns, in that
    // order, consumed in lock-step by `next()`.
    iters: Vec<polars::series::SeriesIter<'a>>,
    // Reference record the rows are sliced from.
    record: &'a noodles::fasta::Record,
    // Out-of-bound policy (truncate vs. fail).
    oob_option: OOBOption,
    // Cached `record.sequence().len()`, used when truncating.
    seqlen: usize,
}
impl<'a> ChrRowSeqIter<'a> {
    /// Builds a per-row iterator over `grangers`' start/end/strand columns,
    /// yielding one slice of `record`'s sequence per dataframe row.
    ///
    /// # Errors
    /// Fails when any of the essential columns is missing.
    pub fn new(
        grangers: &'a Grangers,
        record: &'a noodles::fasta::Record,
        oob_option: OOBOption,
    ) -> anyhow::Result<Self> {
        let fc = grangers.field_columns();
        // One AnyValue iterator per essential column, consumed in lock-step
        // by `next()`: [0] start, [1] end, [2] strand.
        let iters: Vec<polars::series::SeriesIter> = vec![
            grangers
                .df()
                .column(fc.start())?
                .as_materialized_series()
                .iter(),
            grangers
                .df()
                .column(fc.end())?
                .as_materialized_series()
                .iter(),
            // Consistency fix: use the `df()` accessor like the entries
            // above instead of reaching into the `df` field directly.
            grangers
                .df()
                .column(fc.strand())?
                .as_materialized_series()
                .iter(),
        ];
        let seqlen = record.sequence().len();
        Ok(Self {
            iters,
            record,
            oob_option,
            seqlen,
        })
    }
}
#[allow(clippy::needless_lifetimes)]
impl<'a> Iterator for ChrRowSeqIter<'a> {
    type Item = anyhow::Result<Sequence>;

    /// Yields the sequence of the next dataframe row, or `None` once the
    /// start/end/strand iterators are exhausted. Per-row failures (null
    /// values, out-of-bound coordinates, invalid alphabet) are reported as
    /// `Some(Err(..))` so callers can count or skip them.
    fn next(&mut self) -> Option<Self::Item> {
        if let (Some(start), Some(end), Some(strand)) = (
            self.iters[0].next(),
            self.iters[1].next(),
            self.iters[2].next(),
        ) {
            let sequence = if let (
                AnyValue::Int64(start),
                AnyValue::Int64(end),
                AnyValue::String(strand),
            ) = (start, end, strand)
            {
                // Truncate clamps coordinates into [1, seqlen]; otherwise
                // out-of-bound positions yield an error below.
                let (start, end) = if self.oob_option == OOBOption::Truncate {
                    (
                        noodles::core::Position::new(std::cmp::max(1, start as usize)),
                        noodles::core::Position::new(std::cmp::min(self.seqlen, end as usize)),
                    )
                } else {
                    (
                        noodles::core::Position::new(start as usize),
                        noodles::core::Position::new(end as usize),
                    )
                };
                if let (Some(start), Some(end)) = (start, end) {
                    if let Some(seq) = self.record.sequence().get(start..=end) {
                        // Build the forward sequence once, then reverse-
                        // complement for minus-strand rows (replaces the
                        // construct-Ok-then-unwrap pattern).
                        let forward = Sequence::from_iter(seq.iter().copied());
                        if strand == "-" {
                            forward
                                .complement()
                                .rev()
                                .collect::<Result<_, _>>()
                                .with_context(|| "Could not get the reverse complement of a sequence. Please check if the alphabet is valid.")
                        } else {
                            Ok(forward)
                        }
                    } else {
                        Err(anyhow::anyhow!("Could not get the sequence from the start to the end. Please check if the start and end are valid."))
                    }
                } else {
                    Err(anyhow::anyhow!("Found invalid start or end. Please check if the start and end are within boundary."))
                }
            } else {
                Err(anyhow::anyhow!("Found null start, end or strand. Please check if the start, end and strand are valid."))
            };
            Some(sequence)
        } else {
            None
        }
    }
}
/// Returns the 1-based permutation that sorts `data`.
///
/// Element `k` of the result is the 1-based position (in `data`) of the
/// `k`-th smallest value. Ties keep their original relative order for the
/// ascending sort; when `descending` is true, the ascending permutation is
/// simply reversed.
#[allow(dead_code)]
pub fn argsort1based<T: Ord>(data: &[T], descending: bool) -> Vec<usize> {
    let mut order: Vec<usize> = (1..=data.len()).collect();
    // Stable sort: equal elements retain their input order.
    order.sort_by(|&a, &b| data[a - 1].cmp(&data[b - 1]));
    if descending {
        order.reverse();
    }
    order
}
/// Merges the (start, end) interval pairs packed in struct column `s` into
/// non-overlapping windows, treating intervals that begin within `slack` of
/// the current window's end as overlapping.
///
/// Assumes the struct's first field is the start series and the second the
/// end series (both Int64), pre-sorted by start — the merge is a single
/// forward sweep. Returns one `[start, end]` series per merged window under
/// the name "pos"; errors on null values.
fn apply_merge(s: Column, slack: i64) -> Result<Option<polars::prelude::Column>, PolarsError> {
    let ca: StructChunked = s.struct_()?.clone();
    let start_series = &ca.fields_as_series()[0];
    let end_series = &ca.fields_as_series()[1];
    let mut start_iter = start_series.i64()?.into_iter();
    let mut end_iter = end_series.i64()?.into_iter();
    // Seed the sweep window with the first interval.
    let (mut window_start, mut window_end) =
        if let (Some(Some(start)), Some(Some(end))) = (start_iter.next(), end_iter.next()) {
            (start, end)
        } else {
            return Result::<Option<polars::prelude::Column>, PolarsError>::Err(
                PolarsError::ComputeError(
                    "Found missing value in the start or end column. Cannot proceed.".into(),
                ),
            );
        };
    let mut out_list: Vec<Series> = Vec::with_capacity(start_series.len());
    for (id, (start, end)) in start_iter.zip(end_iter).enumerate() {
        let (curr_start, curr_end) = if let (Some(start), Some(end)) = (start, end) {
            (start, end)
        } else {
            return Result::<Option<polars::prelude::Column>, polars::prelude::PolarsError>::Err(
                polars::prelude::PolarsError::ComputeError(
                    "Found missing value in the start or end column. This should not happen."
                        .into(),
                ),
            );
        };
        // Extend the window when the interval starts within `slack` of its
        // end, or when the interval fully encloses the window; otherwise
        // emit the finished window and start a new one.
        if ((curr_start - slack) <= window_end) | ((curr_start <= window_start) & (curr_end >= window_end))
        {
            if curr_end > window_end {
                window_end = curr_end;
            }
        } else {
            out_list.push(Series::new(
                PlSmallStr::from_string(id.to_string()),
                [window_start, window_end],
            ));
            window_start = curr_start;
            window_end = curr_end;
        }
    }
    // Emit the trailing window.
    out_list.push(Series::new("one more".into(), [window_start, window_end]));
    // NOTE(review): unlike `apply_gaps`, the result is not cast to a List
    // dtype here — confirm downstream code relies on the implicit dtype.
    let ls = Column::new("pos".into(), out_list);
    Result::<Option<polars::prelude::Column>, PolarsError>::Ok(Some(ls))
}
/// Computes the gaps between consecutive `[start, end]` intervals of a group.
///
/// `s` is a struct column whose first field is the start and second field is
/// the end (both i64). For each adjacent pair, the gap is
/// `[prev_end + 1, next_start - 1]`. A single-interval group has no gaps and
/// yields `Ok(None)`; a null start/end yields an error.
fn apply_gaps(s: Column, _slack: i64) -> Result<Option<polars::prelude::Column>, PolarsError> {
    let ca: StructChunked = s.struct_()?.clone();
    let starts = &ca.fields_as_series()[0];
    let ends = &ca.fields_as_series()[1];
    // A lone feature has no interior gaps; signal that with None.
    if starts.len() == 1 {
        return Ok(None);
    }
    let mut next_starts = starts.i64()?.into_iter();
    let prev_ends = ends.i64()?.into_iter();
    // Dropping the first start pairs feature i's end with feature i+1's start.
    next_starts.next();
    let mut gaps: Vec<Series> = Vec::with_capacity(starts.len());
    for (id, pair) in next_starts.zip(prev_ends).enumerate() {
        match pair {
            (Some(next_feat_start), Some(prev_feat_end)) => {
                gaps.push(Series::new(
                    PlSmallStr::from(id.to_string()),
                    [prev_feat_end + 1, next_feat_start - 1],
                ));
            }
            _ => {
                return Err(polars::prelude::PolarsError::ComputeError(
                    "Found missing value in the start or end column. This should not happen."
                        .into(),
                ));
            }
        }
    }
    let ls = Column::new("pos".into(), gaps).cast(&DataType::List((DataType::Int64).into()))?;
    Ok(Some(ls))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::reader::gtf::{AttributeMode, Attributes, GStruct};
use noodles::core::Position;
use crate::grangers_utils::FileFormat;
const SAY: bool = true;
/// Smoke test: a hand-built `GStruct` (two genes, two transcripts, five
/// exons) converts into a `Grangers` without error.
///
/// Fixes: renamed from the typo `test_graners`, and removed the redundant
/// `Some(String::from(String::from("t1")))` double conversion.
#[test]
fn test_grangers() {
    // Shorthand for an optional owned attribute value.
    fn some_str(s: &str) -> Option<String> {
        Some(String::from(s))
    }
    // Five rows of `a` (gene g1) followed by four rows of `b` (gene g2).
    fn five_four(a: &str, b: &str) -> Vec<Option<String>> {
        let mut v = vec![some_str(a); 5];
        v.extend(vec![some_str(b); 4]);
        v
    }
    let mut gs = GStruct {
        seqid: vec![String::from("chr1"); 9],
        source: vec![String::from("HAVANA"); 9],
        feature_type: [
            "gene", "transcript", "exon", "exon", "exon", "gene", "transcript", "exon", "exon",
        ]
        .iter()
        .map(|s| String::from(*s))
        .collect(),
        start: vec![1, 1, 1, 21, 41, 101, 101, 101, 121],
        end: vec![50, 50, 10, 30, 50, 150, 150, 110, 150],
        score: vec![Some(10.0); 9],
        strand: five_four("+", "-"),
        phase: vec![some_str("0"); 9],
        attributes: Attributes::new(AttributeMode::Full, FileFormat::GTF).unwrap(),
        misc: Some(HashMap::new()),
    };
    gs.attributes.file_type = FileFormat::GTF;
    gs.attributes
        .essential
        .insert(String::from("gene_id"), five_four("g1", "g2"));
    gs.attributes
        .essential
        .insert(String::from("gene_name"), five_four("g1", "g2"));
    // The gene rows (and the "-" gene's trailing exon row in the original
    // fixture) carry no transcript id.
    gs.attributes.essential.insert(
        String::from("transcript_id"),
        vec![
            None,
            some_str("t1"),
            some_str("t1"),
            some_str("t1"),
            None,
            some_str("t2"),
            some_str("t2"),
            some_str("t2"),
            some_str("t2"),
        ],
    );
    if let Some(extra) = &mut gs.attributes.extra {
        extra.insert(String::from("gene_version"), vec![some_str("1"); 9]);
    }
    // The conversion itself is the assertion: unwrap panics on failure.
    let _gr = Grangers::from_gstruct(gs, IntervalType::Inclusive(1)).unwrap();
}
/// Exercises `flank` across all combinations of `start`/`both`/
/// `ignore_strand` and both positive and negative widths.
#[test]
fn test_flank() {
    // Extracts the start/end columns of `gr` as plain vectors and asserts
    // they match the expected coordinates.
    fn check(gr: &Grangers, want_start: &[i64], want_end: &[i64]) {
        let got_start: Vec<i64> = gr
            .start()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_start, want_start);
        let got_end: Vec<i64> = gr
            .end()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_end, want_end);
    }
    let gr = get_toy_gr().unwrap();
    if SAY {
        println!("gr: {:?}", gr.df());
    }
    // Flank the start side, strand-aware.
    let fo = FlankOptions {
        start: true,
        both: false,
        ignore_strand: false,
    };
    check(
        &gr.flank(10, fo).unwrap(),
        &[91, 91, 91, 111, 131, 251, 251, 211, 251],
        &[100, 100, 100, 120, 140, 260, 260, 220, 260],
    );
    check(
        &gr.flank(-10, fo).unwrap(),
        &[101, 101, 101, 121, 141, 241, 241, 201, 241],
        &[110, 110, 110, 130, 150, 250, 250, 210, 250],
    );
    // Flank both sides around the start, strand-aware; sign is irrelevant
    // when both == true, so +10 and -10 agree.
    let fo = FlankOptions {
        start: true,
        both: true,
        ignore_strand: false,
    };
    check(
        &gr.flank(10, fo).unwrap(),
        &[91, 91, 91, 111, 131, 241, 241, 201, 241],
        &[110, 110, 110, 130, 150, 260, 260, 220, 260],
    );
    check(
        &gr.flank(-10, fo).unwrap(),
        &[91, 91, 91, 111, 131, 241, 241, 201, 241],
        &[110, 110, 110, 130, 150, 260, 260, 220, 260],
    );
    // Flank the end side, strand-aware.
    let fo = FlankOptions {
        start: false,
        both: false,
        ignore_strand: false,
    };
    check(
        &gr.flank(10, fo).unwrap(),
        &[151, 151, 111, 131, 151, 191, 191, 191, 211],
        &[160, 160, 120, 140, 160, 200, 200, 200, 220],
    );
    check(
        &gr.flank(-10, fo).unwrap(),
        &[141, 141, 101, 121, 141, 201, 201, 201, 221],
        &[150, 150, 110, 130, 150, 210, 210, 210, 230],
    );
    // Flank both sides around the end, strand-aware.
    let fo = FlankOptions {
        start: false,
        both: true,
        ignore_strand: false,
    };
    check(
        &gr.flank(10, fo).unwrap(),
        &[141, 141, 101, 121, 141, 191, 191, 191, 211],
        &[160, 160, 120, 140, 160, 210, 210, 210, 230],
    );
    check(
        &gr.flank(-10, fo).unwrap(),
        &[141, 141, 101, 121, 141, 191, 191, 191, 211],
        &[160, 160, 120, 140, 160, 210, 210, 210, 230],
    );
    // Flank the start side, ignoring strand.
    let fo = FlankOptions {
        start: true,
        both: false,
        ignore_strand: true,
    };
    check(
        &gr.flank(10, fo).unwrap(),
        &[91, 91, 91, 111, 131, 191, 191, 191, 211],
        &[100, 100, 100, 120, 140, 200, 200, 200, 220],
    );
    check(
        &gr.flank(-10, fo).unwrap(),
        &[101, 101, 101, 121, 141, 201, 201, 201, 221],
        &[110, 110, 110, 130, 150, 210, 210, 210, 230],
    );
    // Flank both sides around the start, ignoring strand.
    let fo = FlankOptions {
        start: true,
        both: true,
        ignore_strand: true,
    };
    check(
        &gr.flank(10, fo).unwrap(),
        &[91, 91, 91, 111, 131, 191, 191, 191, 211],
        &[110, 110, 110, 130, 150, 210, 210, 210, 230],
    );
    check(
        &gr.flank(-10, fo).unwrap(),
        &[91, 91, 91, 111, 131, 191, 191, 191, 211],
        &[110, 110, 110, 130, 150, 210, 210, 210, 230],
    );
}
// Builds the toy annotation shared by the flank tests: gene g1 on "+"
// (transcript t1, three exons) and gene g2 on "-" (transcript t2, two
// exons), converted into a Grangers with 1-based inclusive intervals.
fn get_toy_gr() -> anyhow::Result<Grangers> {
    // Shorthand for an optional owned attribute value.
    fn some_str(s: &str) -> Option<String> {
        Some(String::from(s))
    }
    // Five rows of `a` (gene g1) followed by four rows of `b` (gene g2).
    fn five_four(a: &str, b: &str) -> Vec<Option<String>> {
        let mut v = vec![some_str(a); 5];
        v.extend(vec![some_str(b); 4]);
        v
    }
    let mut gs = GStruct {
        seqid: vec![String::from("chr1"); 9],
        source: vec![String::from("HAVANA"); 9],
        feature_type: [
            "gene", "transcript", "exon", "exon", "exon", "gene", "transcript", "exon", "exon",
        ]
        .iter()
        .map(|s| String::from(*s))
        .collect(),
        start: vec![101, 101, 101, 121, 141, 201, 201, 201, 221],
        end: vec![150, 150, 110, 130, 150, 250, 250, 210, 250],
        score: vec![Some(10.0); 9],
        strand: five_four("+", "-"),
        phase: vec![some_str("0"); 9],
        attributes: Attributes::new(AttributeMode::Full, FileFormat::GTF)?,
        misc: Some(HashMap::new()),
    };
    gs.attributes.file_type = FileFormat::GTF;
    gs.attributes
        .essential
        .insert(String::from("gene_id"), five_four("g1", "g2"));
    gs.attributes
        .essential
        .insert(String::from("gene_name"), five_four("g1", "g2"));
    // Gene rows carry no transcript id.
    gs.attributes.essential.insert(
        String::from("transcript_id"),
        vec![
            None,
            some_str("t1"),
            some_str("t1"),
            some_str("t1"),
            None,
            some_str("t2"),
            some_str("t2"),
            some_str("t2"),
            some_str("t2"),
        ],
    );
    if let Some(extra) = &mut gs.attributes.extra {
        extra.insert(String::from("gene_version"), vec![some_str("1"); 9]);
    }
    let gr = Grangers::from_gstruct(gs, IntervalType::Inclusive(1))?;
    Ok(gr)
}
/// Exercises `merge` with default slack, strand-ignoring grouping, and
/// explicit slack values 0 and 2.
#[test]
fn test_merge() {
    // Extracts the start/end columns of `gr` as plain vectors and asserts
    // they match the expected coordinates.
    fn check(gr: &Grangers, want_start: &[i64], want_end: &[i64]) {
        let got_start: Vec<i64> = gr
            .start()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_start, want_start);
        let got_end: Vec<i64> = gr
            .end()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_end, want_end);
    }
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr1", "chr1", "chr2", "chr2"],
        "start" => [1i64, 5, 1, 11, 22, 1, 5],
        "end" => [10i64, 10, 10, 20, 30, 10, 30],
        "strand" => ["+", "+", "-", "-", "-", "+", "-"],
        "gene_id" => ["g1", "g1", "g2", "g2", "g2", "g3", "g4"],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr: {:?}", gr.df());
    }
    // Default slack: overlapping and bookended ([1,10]+[11,20]) intervals
    // merge within each (seqname, gene_id) group.
    let merged: Grangers = gr
        .merge(&["seqname", "gene_id"], false, None, None, true)
        .unwrap();
    if SAY {
        println!("gr1: {:?}", merged.df());
    }
    check(&merged, &[1, 1, 22, 1, 5], &[10, 20, 30, 10, 30]);
    // Grouping by seqname only (ignore_strand = true) collapses across
    // gene ids and strands.
    let merged = gr.merge(&["seqname"], true, None, None, true).unwrap();
    if SAY {
        println!("gr1: {:?}", merged.df());
    }
    check(&merged, &[1, 22, 1], &[20, 30, 30]);
    // slack = 0: bookended intervals ([1,10] and [11,20]) stay separate.
    let merged: Grangers = gr
        .merge(&["seqname", "gene_id"], false, Some(0), None, true)
        .unwrap();
    if SAY {
        println!("gr1: {:?}", merged.df());
    }
    check(&merged, &[1, 1, 11, 22, 30], &[10, 10, 20, 30, 30]);
    check(&merged, &[1, 1, 11, 22, 1, 5], &[10, 10, 20, 30, 10, 30]);
    // slack = 2: intervals separated by at most the slack ([11,20] and
    // [22,30]) are merged.
    let merged: Grangers = gr
        .merge(&["seqname", "gene_id"], false, Some(2), None, true)
        .unwrap();
    if SAY {
        println!("gr1: {:?}", merged.df());
    }
    check(&merged, &[1, 1, 1, 5], &[10, 30, 10, 30]);
}
/// Exercises `gaps`: the complement of the per-group intervals.
#[test]
fn test_gaps() {
    // Extracts the start/end columns of `gr` as plain vectors and asserts
    // they match the expected coordinates.
    fn check(gr: &Grangers, want_start: &[i64], want_end: &[i64]) {
        let got_start: Vec<i64> = gr
            .start()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_start, want_start);
        let got_end: Vec<i64> = gr
            .end()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_end, want_end);
    }
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr1", "chr1", "chr2", "chr2"],
        "start" => [1i64, 12, 1, 5, 22, 1, 5],
        "end" => [10i64, 20, 10, 20, 30, 10, 30],
        "strand" => ["+", "+", "+", "+", "+", "+", "-"],
        "gene_id" => ["g1", "g1", "g2", "g2", "g2", "g3", "g4"],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr: {:?}", gr.df());
    }
    let gapped: Grangers = gr
        .gaps(&["seqname", "gene_id"], false, None, None, true)
        .unwrap();
    if SAY {
        println!("gr1: {:?}", gapped.df());
    }
    // g1: [1,10] then [12,20] leaves the single-base gap [11,11];
    // g2: [1,10]/[5,20] then [22,30] leaves [21,21]; g3/g4 have no gaps.
    check(&gapped, &[11, 21], &[11, 21]);
}
/// Exercises `extend` on both strands for Start/End/Both, with and without
/// strand awareness.
#[test]
fn test_extend() {
    // Extracts the start/end columns of `gr` as plain vectors and asserts
    // they match the expected coordinates.
    fn check(gr: &Grangers, want_start: &[i64], want_end: &[i64]) {
        let got_start: Vec<i64> = gr
            .start()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_start, want_start);
        let got_end: Vec<i64> = gr
            .end()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_end, want_end);
    }
    let df = df!(
        "seqname" => ["chr1", "chr1"],
        "start" => [1i64, 50],
        "end" => [10i64, 60],
        "strand" => ["+", "-"],
        "gene_id" => ["g1", "g2"],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr: {:?}", gr.df());
    }
    // Extend the start, strand-aware (flag false): the "-" feature's
    // biological start is its end coordinate, so 60 -> 65.
    let mut extended = gr.clone();
    extended.extend(5, &ExtendOption::Start, false).unwrap();
    if SAY {
        println!("gr1: {:?}", extended.df());
    }
    check(&extended, &[-4, 50], &[10, 65]);
    // Extend the start ignoring strand: both literal starts move left.
    let mut extended = gr.clone();
    extended.extend(5, &ExtendOption::Start, true).unwrap();
    if SAY {
        println!("gr1: {:?}", extended.df());
    }
    check(&extended, &[-4, 45], &[10, 60]);
    // Extend the end, strand-aware: the "-" feature's biological end is
    // its start coordinate, so 50 -> 45.
    let mut extended = gr.clone();
    extended.extend(5, &ExtendOption::End, false).unwrap();
    if SAY {
        println!("gr1: {:?}", extended.df());
    }
    check(&extended, &[1, 45], &[15, 60]);
    // Extend the end ignoring strand: both literal ends move right.
    let mut extended = gr.clone();
    extended.extend(5, &ExtendOption::End, true).unwrap();
    if SAY {
        println!("gr1: {:?}", extended.df());
    }
    check(&extended, &[1, 50], &[15, 65]);
    // Extend both sides: strand is irrelevant.
    let mut extended = gr.clone();
    extended.extend(5, &ExtendOption::Both, true).unwrap();
    if SAY {
        println!("gr1: {:?}", extended.df());
    }
    check(&extended, &[-4, 45], &[15, 65]);
}
/// Exercises `exons`: deriving exon numbers when absent and respecting an
/// existing exon_number column when present.
#[test]
fn test_exons() {
    // Extracts the start/end columns of `gr` as plain vectors and asserts
    // they match the expected coordinates.
    fn check(gr: &Grangers, want_start: &[i64], want_end: &[i64]) {
        let got_start: Vec<i64> = gr
            .start()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_start, want_start);
        let got_end: Vec<i64> = gr
            .end()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_end, want_end);
    }
    // Extracts the exon_number column (cast to i64) and asserts it.
    fn check_exon_number(gr: &Grangers, want: &[i64]) {
        let got: Vec<i64> = gr
            .column("exon_number")
            .unwrap()
            .cast(&DataType::Int64)
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got, want);
    }
    // No exon_number column: exons() must derive it. t2 is on "-", and its
    // exons come back in transcription order ([101,150] first, number 1).
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr1", "chr1", "chr1", "chr1"],
        "feature_type" => ["gene", "transcript", "exon", "exon", "transcript", "exon", "exon"],
        "start" => [1i64, 1, 1, 71, 71, 71, 101],
        "end" => [200i64, 80, 20, 80, 150, 80, 150],
        "strand" => ["+", "+", "+", "+", "-", "-", "-"],
        "gene_id" => ["g1", "g1", "g1", "g1", "g1", "g1", "g1"],
        "transcript_id" => [None, Some("t1"), Some("t1"), Some("t1"), Some("t2"), Some("t2"), Some("t2")],
        "exon_id" => [None, None, Some("e1"), Some("e2"), None, Some("e3"), Some("e4")],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr: {:?}", gr.df());
    }
    let exons = gr.exons(None, true).unwrap();
    if SAY {
        println!("gr1: {:?}", exons.df());
    }
    check(&exons, &[1, 71, 101, 71], &[20, 80, 150, 80]);
    check_exon_number(&exons, &[1, 2, 1, 2]);
    // With an explicit exon_number column, exons() must respect it and keep
    // the rows in the given numbering.
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr1", "chr1", "chr1", "chr1"],
        "feature_type" => ["gene", "transcript", "exon", "exon", "transcript", "exon", "exon"],
        "start" => [1i64, 1, 1, 71, 71, 71, 101],
        "end" => [200i64, 80, 20, 80, 150, 80, 150],
        "strand" => ["+", "+", "+", "+", "-", "-", "-"],
        "gene_id" => ["g1", "g1", "g1", "g1", "g1", "g1", "g1"],
        "transcript_id" => [None, Some("t1"), Some("t1"), Some("t1"), Some("t2"), Some("t2"), Some("t2")],
        "exon_id" => [None, None, Some("e1"), Some("e2"), None, Some("e3"), Some("e4")],
        "exon_number" => [None, None, Some("1"), Some("2"), None, Some("1"), Some("2")],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr: {:?}", gr.df());
    }
    let exons = gr.exons(None, true).unwrap();
    if SAY {
        println!("gr1: {:?}", exons.df());
    }
    check(&exons, &[1, 71, 71, 101], &[20, 80, 80, 150]);
    check_exon_number(&exons, &[1, 2, 1, 2]);
}
/// Exercises `introns`: the regions between consecutive exons of each
/// transcript.
#[test]
fn test_introns() {
    // Extracts the start/end columns of `gr` as plain vectors and asserts
    // they match the expected coordinates.
    fn check(gr: &Grangers, want_start: &[i64], want_end: &[i64]) {
        let got_start: Vec<i64> = gr
            .start()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_start, want_start);
        let got_end: Vec<i64> = gr
            .end()
            .unwrap()
            .i64()
            .unwrap()
            .to_vec()
            .into_iter()
            .map(|x| x.unwrap())
            .collect();
        assert_eq!(got_end, want_end);
    }
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr1", "chr1", "chr1", "chr1"],
        "feature_type" => ["gene", "transcript", "exon", "exon", "transcript", "exon", "exon"],
        "start" => [1i64, 1, 1, 71, 71, 71, 101],
        "end" => [200i64, 80, 20, 80, 150, 80, 150],
        "strand" => ["+", "+", "+", "+", "+", "+", "+"],
        "gene_id" => ["g1", "g1", "g1", "g1", "g1", "g1", "g1"],
        "transcript_id" => [None, Some("t1"), Some("t1"), Some("t1"), Some("t2"), Some("t2"), Some("t2")],
        "exon_id" => [None, None, Some("e1"), Some("e2"), None, Some("e3"), Some("e4")],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr: {:?}", gr.df());
    }
    // t1: exons [1,20] and [71,80] -> intron [21,70];
    // t2: exons [71,80] and [101,150] -> intron [81,100].
    let introns = gr.introns(None, None, None, true).unwrap();
    if SAY {
        println!("gr1: {:?}", introns.df());
    }
    check(&introns, &[21, 81], &[70, 100]);
    // NOTE(review): this second call is identical to the first; it was
    // probably meant to exercise a different argument combination.
    let introns = gr.introns(None, None, None, true).unwrap();
    if SAY {
        println!("gr1: {:?}", introns.df());
    }
    check(&introns, &[21, 81], &[70, 100]);
    // Each intron row must carry its transcript's id.
    let tids: Vec<String> = introns
        .column("transcript_id")
        .unwrap()
        .str()
        .unwrap()
        .into_iter()
        .map(|x| x.unwrap().to_string())
        .collect();
    assert_eq!(tids, vec![String::from("t1"), String::from("t2")]);
}
/// Extracting row sequences from a FASTA record: "+" rows are sliced
/// verbatim, "-" rows are reverse-complemented.
#[test]
fn test_get_sequences_fasta_record() {
    let df = df!(
        "seqname" => ["chr1", "chr1"],
        "start" => [10i64, 15],
        "end" => [20i64, 30],
        "strand" => ["+", "-"],
        "gene_id" => ["g1", "g2"],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    // A 60-base reference sequence standing in for chr1.
    let reference = Sequence::from(
        b"GTAGTTCTCTGGGACCTGCAAGATTAGGCAGGGACATGTGAGAGGTGACAGGGACCTGCA".to_vec(),
    );
    let record = noodles::fasta::Record::new(Definition::new("chr1", None), reference.clone());
    let seq_vec = gr
        .get_sequences_fasta_record(&record, &OOBOption::Skip)
        .unwrap();
    // "+" row: plain 1-based inclusive slice [10, 20].
    let want_plus = reference
        .slice(Position::new(10).unwrap()..=Position::new(20).unwrap())
        .unwrap();
    assert_eq!(seq_vec[0], Some(want_plus));
    // "-" row: slice [15, 30], then reverse-complement.
    let want_minus = reference
        .slice(Position::new(15).unwrap()..=Position::new(30).unwrap())
        .unwrap()
        .complement()
        .rev()
        .collect::<Result<Sequence, _>>()
        .unwrap();
    assert_eq!(seq_vec[1], Some(want_minus));
}
/// `ChrRowSeqIter` must yield one sequence per Grangers row — strand-aware —
/// and then be exhausted.
#[test]
fn test_chrrowseqiter() {
    let df = df!(
        "seqname" => ["chr1", "chr1"],
        "start" => [10i64, 15],
        "end" => [20i64, 30],
        "strand" => ["+", "-"],
        "gene_id" => ["g1", "g2"],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    let reference = Sequence::from(
        b"GTAGTTCTCTGGGACCTGCAAGATTAGGCAGGGACATGTGAGAGGTGACAGGGACCTGCA".to_vec(),
    );
    let record = noodles::fasta::Record::new(Definition::new("chr1", None), reference.clone());
    // Exactly two rows -> exactly two yielded sequences, then None.
    let mut rows = ChrRowSeqIter::new(&gr, &record, OOBOption::Skip).unwrap();
    let first = rows.next().unwrap().unwrap();
    let second = rows.next().unwrap().unwrap();
    assert!(rows.next().is_none());
    // Row 1 ("+"): bases [10, 20] verbatim.
    let want_first = Sequence::from_iter(
        reference
            .get(Position::new(10).unwrap()..=Position::new(20).unwrap())
            .unwrap()
            .iter()
            .cloned(),
    );
    assert_eq!(first, want_first);
    // Row 2 ("-"): bases [15, 30], reverse-complemented.
    let want_second = Sequence::from_iter(
        reference
            .get(Position::new(15).unwrap()..=Position::new(30).unwrap())
            .unwrap()
            .iter()
            .cloned(),
    )
    .complement()
    .rev()
    .collect::<Result<Sequence, _>>()
    .unwrap();
    assert_eq!(second, want_second);
}
/// GTF serialization: attributes are rendered as `key "value";` pairs and a
/// null source becomes the "." placeholder.
#[test]
fn test_write_gtf() {
    let df = df!(
        "seqname" => ["chr1", "chr1"],
        "start" => [10i64, 15],
        "end" => [20i64, 30],
        "strand" => ["+", "-"],
        "source" => [Some("HAVANA"), None],
        "gene_id" => ["g1", "g2"],
        "gene_name" => [Some("gene_1"), None],
    )
    .unwrap();
    let gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr1: {:?}", gr.df());
    }
    let gtf_df = gr.get_gtf_df().unwrap();
    if SAY {
        println!("gtf_df: {:?}", gtf_df);
    }
    // Non-null attributes must appear in the attributes string; the null
    // gene_name of row 2 must simply be absent.
    let attrs: Vec<&str> = gtf_df
        .column("attributes")
        .unwrap()
        .str()
        .unwrap()
        .into_iter()
        .map(|x| x.unwrap())
        .collect();
    assert!(attrs[0].contains("gene_name \"gene_1\";"));
    assert!(attrs[0].contains("gene_id \"g1\";"));
    assert!(attrs[1].contains("gene_id \"g2\";"));
    // Null source is serialized as the GTF placeholder ".".
    let sources: Vec<&str> = gtf_df
        .column("source")
        .unwrap()
        .str()
        .unwrap()
        .into_iter()
        .map(|x| x.unwrap())
        .collect();
    assert_eq!(sources, vec!["HAVANA", "."]);
}
/// Builds per-(seqname, strand) lappers and checks an overlap query.
///
/// Fix: the `chr2n` lookup previously used the "+" key a second time, so
/// the negative-strand map entry was never exercised.
#[test]
fn test_build_lappers() {
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2"],
        "feature_type" => ["exon", "exon", "exon", "exon", "exon", "exon", "exon"],
        "start" => [1i64, 21, -5, 1, 51, 1, 51],
        "end" => [10i64, 30, 5, 100, 150, 100, 150],
        "strand" => ["+", "+", "+", "+", "+", "-", "-"],
        "gene_id" => ["g1", "g1", "g1", "g2", "g2", "g2", "g2"],
    )
    .unwrap();
    let mut gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr1: {:?}", gr.df());
    }
    let lappers = gr.build_lappers(true, false, &["gene_id"]).unwrap();
    // One lapper per (seqname, strand) pair present in the data.
    let chr1p = lappers.get(&["chr1".to_string(), "+".to_string()]).unwrap();
    if SAY {
        println!("chr1 positive lapper: {:?}", chr1p);
    }
    let chr2p = lappers.get(&["chr2".to_string(), "+".to_string()]).unwrap();
    if SAY {
        println!("chr2 positive lapper: {:?}", chr2p);
    }
    // Fixed: look up the "-" entry (was a copy-paste of the "+" lookup).
    let chr2n = lappers.get(&["chr2".to_string(), "-".to_string()]).unwrap();
    if SAY {
        println!("chr2 negative lapper: {:?}", chr2n);
    }
    // The exon ending at (inclusive) 10 is found by a query starting at 10
    // but not by one starting at 11.
    let chr1p_o = chr1p.find(11, 15);
    assert!(chr1p_o.count() == 0);
    let chr1p_o = chr1p.find(10, 15);
    assert!(chr1p_o.count() == 1);
}
/// Exercises `update_column`: adding a new column, replacing an existing
/// one, and re-pointing a tracked field column to a renamed column.
#[test]
fn test_update_column() {
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2"],
        "feature_type" => ["exon", "exon", "exon", "exon", "exon", "exon", "exon"],
        "start" => [1i64, 21, -5, 1, 51, 1, 51],
        "end" => [10i64, 30, 5, 100, 150, 100, 150],
        "strand" => ["+", "+", "+", "+", "+", "-", "-"],
        "gene_id" => ["g1", "g1", "g1", "g2", "g2", "g2", "g2"],
    )
    .unwrap();
    let mut gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr1: {:?}", gr.df());
    }
    // A brand-new column is stored verbatim.
    let added = Column::new("new_col".into(), &[1i64, 2, 3, 4, 5, 6, 7]);
    gr.update_column(added.clone(), None).unwrap();
    assert_eq!(gr.column("new_col").unwrap(), &added);
    // Updating an existing column replaces its contents in place.
    let replaced = Column::new("gene_id".into(), &["g", "g", "g", "g", "g", "g", "g"]);
    gr.update_column(replaced.clone(), None).unwrap();
    assert_eq!(gr.column("gene_id").unwrap(), &replaced);
    // Passing the field name re-points the tracked gene_id field column to
    // the newly named column.
    let renamed = Column::new("gene_id_new".into(), &["g", "g", "g", "g", "g", "g", "g"]);
    gr.update_column(renamed.clone(), Some("gene_id")).unwrap();
    assert_eq!(gr.field_columns().gene_id(), Some("gene_id_new"));
    assert_eq!(
        gr.column(gr.field_columns().gene_id().unwrap()).unwrap(),
        &renamed
    );
}
/// Exercises `update_df`: same-schema replacement succeeds; an empty frame
/// and a frame whose gene_id column was renamed are both rejected.
#[test]
fn test_update_df() {
    let df = df!(
        "seqname" => ["chr1", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2"],
        "feature_type" => ["exon", "exon", "exon", "exon", "exon", "exon", "exon"],
        "start" => [1i64, 21, -5, 1, 51, 1, 51],
        "end" => [10i64, 30, 5, 100, 150, 100, 150],
        "strand" => ["+", "+", "+", "+", "+", "-", "-"],
        "gene_id" => ["g1", "g1", "g1", "g2", "g2", "g2", "g2"],
    )
    .unwrap();
    let mut gr = Grangers::new(
        df,
        None,
        None,
        IntervalType::Inclusive(1),
        FieldColumns::default(),
        false,
    )
    .unwrap();
    if SAY {
        println!("gr1: {:?}", gr.df());
    }
    // Same schema, different data: replaces the frame wholesale.
    let same_schema = df!(
        "seqname" => ["chr1111", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2"],
        "feature_type" => ["exon", "exon", "exon", "exon", "exon", "exon", "exon"],
        "start" => [1i64, 21, -5, 1, 51, 1, 51],
        "end" => [10i64, 30, 5, 100, 150, 100, 150],
        "strand" => ["+", "+", "+", "+", "+", "-", "-"],
        "gene_id" => ["g1", "g1", "g1", "g2", "g2", "g2", "g2"],
    )
    .unwrap();
    gr.update_df(same_schema.clone(), false, false).unwrap();
    assert_eq!(gr.df(), &same_schema);
    // An empty frame is rejected.
    assert!(gr
        .clone()
        .update_df(DataFrame::default(), false, false)
        .is_err());
    // A frame whose gene_id column was renamed is rejected — presumably
    // because the tracked field column can no longer be resolved.
    let bad_schema = df!(
        "seqname" => ["chr1111", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2"],
        "feature_type" => ["exon", "exon", "exon", "exon", "exon", "exon", "exon"],
        "start" => [1i64, 21, -5, 1, 51, 1, 51],
        "end" => [10i64, 30, 5, 100, 150, 100, 150],
        "strand" => ["+", "+", "+", "+", "+", "-", "-"],
        "gene_iddddddd" => ["g1", "g1", "g1", "g2", "g2", "g2", "g2"],
    )
    .unwrap();
    assert!(gr.update_df(bad_schema, false, false).is_err());
}
}