use std::{
env,
fs::{self, copy},
io::BufRead,
path::PathBuf,
};
use tempfile::{Builder, NamedTempFile};
use crate::{
commands::granges_random_bed,
create_granges_with_seqlens,
error::GRangesError,
granges::GRangesEmpty,
io::{parsers::bed::Bed5Addition, InputStream},
prelude::{GRanges, VecRangesIndexed},
ranges::{
coitrees::COITrees,
vec::{VecRanges, VecRangesEmpty},
RangeEmpty,
},
Position,
};
#[cfg(feature = "ndarray")]
use genomap::GenomeMap;
use indexmap::IndexMap;
#[cfg(feature = "ndarray")]
use ndarray::{Array1, Array2};
use rand::{distributions::Uniform, rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
fn get_rng() -> StdRng {
let seed: u64 = env::var("TEST_SEED")
.unwrap_or_else(|_| "13".to_string())
.parse()
.expect("TEST_SEED must be a valid u64");
StdRng::seed_from_u64(seed)
}
pub fn granges_binary_path() -> PathBuf {
let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
path.push("target");
path.push(if cfg!(debug_assertions) {
"debug"
} else {
"release"
});
path.push("granges");
path
}
pub const NRANDOM_RANGES: usize = 10000;
pub const MIN_LEN: Position = 1;
pub const MAX_LEN: Position = 1000;
pub const NCHROM: usize = 22;
pub const MIN_CHROM_LEN: Position = 50_000_000;
pub const MAX_CHROM_LEN: Position = 250_000_000;
pub fn random_range(chrom_len: Position, rng: &mut StdRng) -> (Position, Position) {
let len = rng.gen_range(MIN_LEN..MAX_LEN);
let start = rng.gen_range(0..chrom_len - len + 1);
(start, start + len)
}
pub fn random_seqlen(rng: &mut StdRng) -> Position {
rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN)
}
pub fn random_chrom(rng: &mut StdRng) -> String {
format!("chr{}", rng.gen_range(1..NCHROM + 1))
}
pub fn random_vecranges(n: usize) -> VecRanges<RangeEmpty> {
let mut rng = get_rng();
let seqlen = random_seqlen(&mut rng);
let mut vr = VecRanges::new(seqlen);
for _i in 0..n {
let (start, end) = random_range(seqlen, &mut rng);
let range = RangeEmpty::new(start, end);
vr.push_range(range);
}
vr
}
pub fn random_granges(
seqlens: &IndexMap<String, Position>,
num: usize,
) -> Result<GRangesEmpty<VecRangesEmpty>, GRangesError> {
let mut rng = get_rng();
let mut gr = GRangesEmpty::new_vec(seqlens);
let seqnames: Vec<String> = seqlens.keys().cloned().collect();
for _ in 0..num {
let seqname = seqnames.choose(&mut rng).unwrap();
let chrom_len = *seqlens
.get(seqname)
.ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?;
let (start, end) = random_range(chrom_len, &mut rng);
gr.push_range(seqname, start, end)?;
}
Ok(gr)
}
fn generate_random_string(n: usize, rng: &mut StdRng) -> String {
let letters: Vec<char> = ('a'..='z').collect();
let letters_dist = Uniform::from(0..letters.len());
(0..n).map(|_| letters[rng.sample(letters_dist)]).collect()
}
fn generate_random_uniform(start: f64, end: f64, rng: &mut StdRng) -> f64 {
let uniform = Uniform::new(start, end); rng.sample(uniform)
}
pub fn random_granges_mock_bed5(
seqlens: &IndexMap<String, Position>,
num: usize,
) -> Result<GRanges<VecRangesIndexed, Vec<Bed5Addition>>, GRangesError> {
let mut rng = get_rng();
let mut gr = GRanges::new_vec(seqlens);
let seqnames: Vec<String> = seqlens.keys().cloned().collect();
for _ in 0..num {
let seqname = seqnames.choose(&mut rng).unwrap();
let chrom_len = *seqlens
.get(seqname)
.ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?;
let (start, end) = random_range(chrom_len, &mut rng);
let bed5_cols = Bed5Addition {
name: generate_random_string(8, &mut rng),
score: Some(generate_random_uniform(0.0, 1.0, &mut rng)),
};
gr.push_range(seqname, start, end, bed5_cols)?;
}
Ok(gr)
}
pub fn random_coitrees() -> COITrees<()> {
let vr = random_vecranges(100);
let cr: COITrees<()> = vr.into();
cr
}
pub fn temp_bedfile() -> NamedTempFile {
let tempfile = Builder::new()
.suffix(".bed")
.tempfile()
.expect("Failed to create temp file");
tempfile
}
pub fn copy_tempfile_for_inspection(tempfile: impl Into<PathBuf>, new_file_name: &str) -> PathBuf {
let tempfile = tempfile.into();
if env::var("TEST_LOCAL").is_ok() {
let dir_path = PathBuf::from("test_temp");
fs::create_dir_all(&dir_path).expect("could not create test_temp/");
let dest_path = dir_path.join(new_file_name);
copy(tempfile, &dest_path).expect("could not copy temp file");
dest_path
} else {
tempfile
}
}
pub fn random_bed3file(length: usize) -> NamedTempFile {
let temp_bedfile = temp_bedfile();
granges_random_bed(
"tests_data/hg38_seqlens.tsv",
length,
Some(temp_bedfile.path()),
true,
false,
)
.expect("could not generate random BED file");
temp_bedfile
}
pub fn random_bed5file(length: usize) -> NamedTempFile {
let temp_bedfile = temp_bedfile();
granges_random_bed(
"tests_data/hg38_seqlens.tsv",
length,
Some(temp_bedfile.path()),
true,
true,
)
.expect("could not generate random BED file");
temp_bedfile
}
pub fn head_file(filepath: impl Into<PathBuf>) {
let filepath = filepath.into();
let file = InputStream::new(filepath);
let mut reader = file.reader().unwrap();
for _ in 0..10 {
let mut line = String::new();
reader.read_line(&mut line).unwrap();
dbg!(&line);
}
}
pub fn granges_test_case_01() -> GRanges<VecRangesIndexed, Vec<f64>> {
create_granges_with_seqlens!(VecRangesIndexed, Vec<f64>, {
"chr1" => [(0, 5, 1.1), (4, 7, 8.1), (10, 17, 10.1)],
"chr2" => [(10, 20, 3.7), (18, 32, 1.1)]
}, seqlens: { "chr1" => 30, "chr2" => 100 })
}
pub fn granges_test_case_02() -> GRanges<VecRangesIndexed, Vec<f64>> {
create_granges_with_seqlens!(VecRangesIndexed, Vec<f64>, {
"chr1" => [(30, 50, 1.1)],
"chr2" => [(100, 200, 3.7), (250, 300, 1.1)]
}, seqlens: { "chr1" => 50, "chr2" => 300 })
}
pub fn granges_test_case_03() -> GRanges<VecRangesIndexed, Vec<Bed5Addition>> {
create_granges_with_seqlens!(VecRangesIndexed, Vec<Bed5Addition>, {
"chr1" => [(0, 5, Bed5Addition { name: "a".to_string(), score: Some(1.1) }),
(4, 7, Bed5Addition { name: "b".to_string(), score: Some(8.1) }),
(10, 17, Bed5Addition { name: "c".to_string(), score: Some(10.1) })],
"chr2" => [(10, 20, Bed5Addition { name: "d".to_string(), score: Some(3.7)}),
(18, 32, Bed5Addition { name: "d".to_string(), score: Some(1.1) })]
}, seqlens: { "chr1" => 30, "chr2" => 100 })
}
#[cfg(feature = "ndarray")]
pub fn random_array1_sequences(n: usize) -> GenomeMap<Array1<f64>> {
let mut seqs = GenomeMap::new();
let max_len = 100_000;
for seqnum in 1..=n {
let chrom = format!("chr{}", seqnum);
let array: Array1<f64> = Array1::from_iter(1..=max_len).map(|&x| x as f64);
seqs.insert(&chrom, array).unwrap();
}
seqs
}
#[cfg(feature = "ndarray")]
pub fn random_array2_sequences(n: usize) -> GenomeMap<Array2<f64>> {
let mut seqs = GenomeMap::new();
let max_len = 100_000;
for seqnum in 1..=n {
let chrom = format!("chr{}", seqnum);
let rows = max_len as usize;
let mut array = Array2::<f64>::zeros((rows, 2));
let mut value = 0.0;
for row in array.rows_mut() {
for elem in row {
*elem = value;
value += 1.0;
}
}
seqs.insert(&chrom, array).unwrap();
}
seqs
}
#[cfg(test)]
mod tests {
use crate::{
granges::GRanges, io::Bed5Iterator, seqlens, test_utilities::granges_test_case_01,
};
#[test]
fn test_test_case_01() {
let genome = seqlens!("chr1" => 30, "chr2" => 100);
let iter = Bed5Iterator::new("tests_data/test_case_01.bed").unwrap();
let iter_simplified = iter.map(|result| {
match result {
Ok(record) => Ok(record.into_map_data(|bed5_cols| bed5_cols.score.unwrap())),
Err(e) => Err(e),
}
});
let gr = GRanges::from_iter(iter_simplified, &genome).unwrap();
assert_eq!(gr, granges_test_case_01());
}
}