use crate::geneset::GeneSets;
use crate::matrix::ExprMatrix;
use std::collections::HashSet;
fn subset_rows(expr: &ExprMatrix, rows: &[usize]) -> ExprMatrix {
let col_names = expr.col_names().to_vec();
let mut row_names = Vec::with_capacity(rows.len());
let mut data = Vec::with_capacity(rows.len() * expr.ncol());
for &r in rows {
row_names.push(expr.row_names()[r].clone());
data.extend_from_slice(expr.row(r));
}
ExprMatrix::new(row_names, col_names, data)
}
pub(crate) fn filter_constant_rows(expr: &ExprMatrix) -> ExprMatrix {
let mut keep = Vec::with_capacity(expr.nrow());
for i in 0..expr.nrow() {
let mut mn = f64::INFINITY;
let mut mx = f64::NEG_INFINITY;
for &v in expr.row(i) {
if v < mn {
mn = v;
}
if v > mx {
mx = v;
}
}
if mn != mx && mn.is_finite() && mx.is_finite() {
keep.push(i);
}
}
assert!(
keep.len() >= 2,
"Less than two rows left in the input matrix after dropping constant rows"
);
subset_rows(expr, &keep)
}
pub(crate) fn map_and_filter_sets(
features: &ExprMatrix,
gene_sets: &GeneSets,
min_size: usize,
max_size: usize,
) -> (Vec<String>, Vec<Vec<usize>>) {
let mut total_matched = 0usize;
let mut mapped: Vec<(String, Vec<usize>)> = Vec::with_capacity(gene_sets.len());
for set in gene_sets.iter() {
let mut idx = Vec::with_capacity(set.genes.len());
for gene in &set.genes {
if let Some(i) = features.row_of(gene) {
idx.push(i);
}
}
total_matched += idx.len();
mapped.push((set.name.clone(), idx));
}
assert!(
total_matched > 0,
"No identifiers in the gene sets could be matched to the identifiers in the expression data"
);
let mut names = Vec::new();
let mut indices = Vec::new();
for (name, idx) in mapped {
if idx.len() >= min_size && idx.len() <= max_size {
names.push(name);
indices.push(idx);
}
}
assert!(
!names.is_empty(),
"No gene set left after mapping and filtering"
);
let mut seen = HashSet::with_capacity(names.len());
for name in &names {
assert!(
seen.insert(name.as_str()),
"The gene set list contains duplicated gene set names: '{name}'"
);
}
(names, indices)
}
pub(crate) fn scale_rows(m: &ExprMatrix) -> Vec<f64> {
let n = m.ncol();
let nf = n as f64;
let mut z = vec![0.0f64; m.nrow() * n];
for i in 0..m.nrow() {
let row = m.row(i);
let mut mean = row.iter().sum::<f64>() / nf;
let mut corr = 0.0f64;
for &v in row {
corr += v - mean;
}
mean += corr / nf;
let mut ss = 0.0f64;
for &v in row {
let d = v - mean;
ss += d * d;
}
let sd = (ss / (nf - 1.0)).sqrt();
let out = &mut z[i * n..(i + 1) * n];
for (o, &v) in out.iter_mut().zip(row) {
*o = (v - mean) / sd;
}
}
z
}