use std::io::{BufRead, Write};
use rayon::prelude::*;
use rsomics_common::{Result, RsomicsError};
mod dm;
mod env;
pub use dm::DistanceMatrix;
pub use env::{EnvTable, Standardized};
pub struct SizeBest {
pub size: usize,
pub correlation: f64,
pub vars: Vec<String>,
}
pub fn bioenv(dm_flat: &[f64], env: &Standardized) -> Vec<SizeBest> {
let p = env.vars.len();
let n = env.n;
let m = dm_flat.len();
let dm_ranks = rankdata(dm_flat);
let dm_centered = centered_unit(&dm_ranks);
(1..=p)
.map(|size| {
let best = (0..count_combinations(p, size))
.into_par_iter()
.map(|combo_idx| {
let subset = nth_combination(p, size, combo_idx);
let dists = euclidean_condensed(&env.cols, n, m, &subset);
let var_ranks = rankdata(&dists);
let rho = spearman_against(&var_ranks, &dm_centered);
Candidate { rho, combo_idx }
})
.reduce(
|| Candidate {
rho: f64::NEG_INFINITY,
combo_idx: usize::MAX,
},
Candidate::better,
);
let subset = nth_combination(p, size, best.combo_idx);
SizeBest {
size,
correlation: best.rho,
vars: subset.iter().map(|&i| env.vars[i].clone()).collect(),
}
})
.collect()
}
#[derive(Clone, Copy)]
struct Candidate {
rho: f64,
combo_idx: usize,
}
impl Candidate {
fn better(a: Candidate, b: Candidate) -> Candidate {
if b.rho > a.rho || (b.rho == a.rho && b.combo_idx < a.combo_idx) {
b
} else {
a
}
}
}
fn euclidean_condensed(cols: &[f64], n: usize, m: usize, subset: &[usize]) -> Vec<f64> {
let mut out = Vec::with_capacity(m);
for i in 0..n {
for j in (i + 1)..n {
let mut s = 0.0;
for &c in subset {
let base = c * n;
let d = cols[base + i] - cols[base + j];
s += d * d;
}
out.push(s.sqrt());
}
}
out
}
fn spearman_against(var_ranks: &[f64], dm_centered: &[f64]) -> f64 {
let vmean = var_ranks.iter().sum::<f64>() / var_ranks.len() as f64;
let mut vnorm_sq = 0.0;
let mut dot = 0.0;
for (&vr, &dc) in var_ranks.iter().zip(dm_centered) {
let cv = vr - vmean;
vnorm_sq += cv * cv;
dot += cv * dc;
}
if vnorm_sq == 0.0 {
return f64::NAN;
}
dot / vnorm_sq.sqrt()
}
fn centered_unit(v: &[f64]) -> Vec<f64> {
let mean = v.iter().sum::<f64>() / v.len() as f64;
let mut out: Vec<f64> = v.iter().map(|&x| x - mean).collect();
let norm = out.iter().map(|&x| x * x).sum::<f64>().sqrt();
for x in &mut out {
*x /= norm;
}
out
}
fn rankdata(v: &[f64]) -> Vec<f64> {
let mut order: Vec<usize> = (0..v.len()).collect();
order.sort_by(|&a, &b| v[a].partial_cmp(&v[b]).unwrap());
let mut ranks = vec![0.0f64; v.len()];
let mut i = 0;
while i < order.len() {
let mut j = i + 1;
while j < order.len() && v[order[j]] == v[order[i]] {
j += 1;
}
let avg = ((i + 1 + j) as f64) / 2.0;
for &idx in &order[i..j] {
ranks[idx] = avg;
}
i = j;
}
ranks
}
fn count_combinations(n: usize, k: usize) -> usize {
if k > n {
return 0;
}
let k = k.min(n - k);
let mut c = 1usize;
for i in 0..k {
c = c * (n - i) / (i + 1);
}
c
}
fn nth_combination(n: usize, k: usize, mut idx: usize) -> Vec<usize> {
let mut out = Vec::with_capacity(k);
let mut c = 0usize;
let mut remaining = k;
while remaining > 0 {
let block = count_combinations(n - c - 1, remaining - 1);
if idx < block {
out.push(c);
remaining -= 1;
} else {
idx -= block;
}
c += 1;
}
out
}
pub fn write_result<W: Write>(out: &mut W, best: &[SizeBest]) -> Result<()> {
writeln!(out, "size\tcorrelation\tvars").map_err(RsomicsError::Io)?;
for b in best {
writeln!(
out,
"{}\t{:.12}\t{}",
b.size,
b.correlation,
b.vars.join(", ")
)
.map_err(RsomicsError::Io)?;
}
Ok(())
}
pub fn read_matrix<R: BufRead>(reader: R, source: &str) -> Result<DistanceMatrix> {
DistanceMatrix::read(reader, source)
}
pub fn read_env<R: BufRead>(reader: R, source: &str) -> Result<EnvTable> {
EnvTable::read(reader, source)
}
pub fn run_bioenv(
dm: &DistanceMatrix,
env: &EnvTable,
columns: Option<&[String]>,
env_source: &str,
) -> Result<Vec<SizeBest>> {
let selected = env.select(columns, env_source)?;
let std = selected.standardized_for(&dm.ids, env_source)?;
Ok(bioenv(&dm.condensed(), &std))
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
const DM: &str = "\tA\tB\tC\tD\n\
A\t0.0\t0.5\t0.25\t0.75\n\
B\t0.5\t0.0\t0.1\t0.42\n\
C\t0.25\t0.1\t0.0\t0.33\n\
D\t0.75\t0.42\t0.33\t0.0\n";
const ENV: &str = "id\tpH\tElevation\n\
A\t7.0\t400\n\
B\t8.0\t530\n\
C\t7.5\t450\n\
D\t8.5\t810\n";
#[test]
fn skbio_doc_example() {
let dm = read_matrix(Cursor::new(DM), "dm").unwrap();
let env = read_env(Cursor::new(ENV), "env").unwrap();
let best = run_bioenv(&dm, &env, None, "env").unwrap();
assert_eq!(best.len(), 2);
assert_eq!(best[0].vars, vec!["pH"]);
assert!(
(best[0].correlation - 0.771517).abs() < 1e-6,
"{}",
best[0].correlation
);
assert_eq!(best[1].vars, vec!["pH", "Elevation"]);
assert!(
(best[1].correlation - 0.714286).abs() < 1e-6,
"{}",
best[1].correlation
);
}
#[test]
fn rankdata_ties_averaged() {
assert_eq!(rankdata(&[1.0, 2.0, 2.0, 3.0]), vec![1.0, 2.5, 2.5, 4.0]);
}
#[test]
fn nth_combination_is_lexicographic() {
let all: Vec<_> = (0..count_combinations(4, 2))
.map(|i| nth_combination(4, 2, i))
.collect();
assert_eq!(
all,
vec![
vec![0, 1],
vec![0, 2],
vec![0, 3],
vec![1, 2],
vec![1, 3],
vec![2, 3],
]
);
}
#[test]
fn condensed_upper_triangle() {
let dm = read_matrix(Cursor::new(DM), "dm").unwrap();
assert_eq!(dm.condensed(), vec![0.5, 0.25, 0.75, 0.1, 0.42, 0.33]);
}
}