quant-indicators 0.7.0

//! Hierarchical Risk Parity (HRP) — pure math functions.
//!
//! Three-phase algorithm:
//! 1. Compute correlation → distance matrix → agglomerative clustering (average linkage)
//! 2. Build dendrogram (binary tree of clusters)
//! 3. Recursive bisection following dendrogram with inverse-variance weighting
//!
//! All functions are pure math: `&[f64]` in, `f64` out. No domain types,
//! no I/O. The adapter wraps domain types (e.g. `LegReturns`) into raw slices
//! before calling these functions.

use rust_decimal::Decimal;

/// Minimum number of return observations required to compute
/// a meaningful correlation matrix.
pub const MIN_OBSERVATIONS: usize = 30;

/// Binary tree node representing hierarchical clustering.
#[derive(Debug)]
pub enum ClusterNode {
    /// Leaf node: a single asset (original index).
    Leaf(usize),
    /// Branch node: two child clusters.
    Branch {
        left: Box<ClusterNode>,
        right: Box<ClusterNode>,
    },
}

// =============================================================================
// Linear algebra helpers
// =============================================================================

pub fn mean(data: &[f64]) -> f64 {
    data.iter().sum::<f64>() / data.len() as f64
}

pub fn variance(data: &[f64], mean: f64) -> f64 {
    data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (data.len() - 1) as f64
}

/// Compute the Pearson correlation matrix from return series.
///
/// `series[i]` is the return series for asset `i`. All series must have the
/// same length. `means[i]` and `variances[i]` are the pre-computed mean and
/// sample variance for series `i`.
pub fn correlation_matrix(series: &[&[f64]], means: &[f64], variances: &[f64]) -> Vec<Vec<f64>> {
    let n = series.len();
    let n_obs = series[0].len();
    let mut corr = vec![vec![1.0; n]; n];

    for i in 0..n {
        for j in (i + 1)..n {
            let cov: f64 = (0..n_obs)
                .map(|t| (series[i][t] - means[i]) * (series[j][t] - means[j]))
                .sum::<f64>()
                / (n_obs - 1) as f64;
            let r = cov / (variances[i].sqrt() * variances[j].sqrt());
            let r = r.clamp(-1.0, 1.0);
            corr[i][j] = r;
            corr[j][i] = r;
        }
    }

    corr
}

/// Convert correlation to distance: d(i,j) = sqrt((1 - corr(i,j)) / 2)
pub fn distance_matrix(corr: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let n = corr.len();
    let mut dist = vec![vec![0.0; n]; n];
    for i in 0..n {
        for j in (i + 1)..n {
            let d = ((1.0 - corr[i][j]) / 2.0).sqrt();
            dist[i][j] = d;
            dist[j][i] = d;
        }
    }
    dist
}

// =============================================================================
// Agglomerative clustering → dendrogram
// =============================================================================

/// Build a dendrogram using agglomerative clustering with weighted average linkage.
pub fn build_dendrogram(dist: &[Vec<f64>], n: usize) -> ClusterNode {
    if n == 1 {
        return ClusterNode::Leaf(0);
    }

    let mut nodes: Vec<Option<ClusterNode>> = (0..n).map(|i| Some(ClusterNode::Leaf(i))).collect();

    let mut work_dist: Vec<Vec<f64>> = dist.to_vec();
    let mut sizes: Vec<usize> = vec![1; n];

    for _ in 0..(n - 1) {
        let (ci, cj) = find_closest_pair(&work_dist, &nodes);

        // Safety: find_closest_pair only returns indices of active (Some) nodes
        let Some(left) = nodes[ci].take() else {
            continue;
        };
        let Some(right) = nodes[cj].take() else {
            nodes[ci] = Some(left); // restore if cj was unexpectedly None
            continue;
        };

        let merged = ClusterNode::Branch {
            left: Box::new(left),
            right: Box::new(right),
        };

        let size_merged = sizes[ci] + sizes[cj];

        // Update distances using UPGMA (size-weighted average linkage)
        for k in 0..nodes.len() {
            if k == ci || k == cj || nodes[k].is_none() {
                continue;
            }
            let d_new = (work_dist[ci][k] * sizes[ci] as f64 + work_dist[cj][k] * sizes[cj] as f64)
                / size_merged as f64;
            work_dist[ci][k] = d_new;
            work_dist[k][ci] = d_new;
        }

        sizes[ci] = size_merged;

        // Invalidate merged slot's row
        for val in &mut work_dist[cj] {
            *val = f64::INFINITY;
        }
        // Invalidate merged slot's column
        for row in &mut work_dist {
            row[cj] = f64::INFINITY;
        }

        nodes[ci] = Some(merged);
    }

    // After n-1 merges, exactly one node remains
    nodes
        .into_iter()
        .find_map(|n| n)
        .unwrap_or(ClusterNode::Leaf(0))
}

pub fn find_closest_pair(dist: &[Vec<f64>], nodes: &[Option<ClusterNode>]) -> (usize, usize) {
    let mut min_d = f64::INFINITY;
    let mut best = (0, 1);

    for i in 0..nodes.len() {
        if nodes[i].is_none() {
            continue;
        }
        for j in (i + 1)..nodes.len() {
            if nodes[j].is_none() {
                continue;
            }
            if dist[i][j] < min_d {
                min_d = dist[i][j];
                best = (i, j);
            }
        }
    }

    best
}

// =============================================================================
// Recursive bisection (follows dendrogram structure)
// =============================================================================

/// Walk the dendrogram tree, splitting weight between left and right
/// subtrees using inverse-variance allocation at each branch.
pub fn recursive_bisect(node: &ClusterNode, variances: &[f64], weight: f64, out: &mut [f64]) {
    match node {
        ClusterNode::Leaf(i) => {
            out[*i] = weight;
        }
        ClusterNode::Branch { left, right } => {
            let var_left = cluster_variance(left, variances);
            let var_right = cluster_variance(right, variances);
            let total = var_left + var_right;

            if total < 1e-30 {
                recursive_bisect(left, variances, weight * 0.5, out);
                recursive_bisect(right, variances, weight * 0.5, out);
            } else {
                // Inverse-variance: lower variance gets more weight
                let alpha = 1.0 - var_left / total;
                recursive_bisect(left, variances, weight * alpha, out);
                recursive_bisect(right, variances, weight * (1.0 - alpha), out);
            }
        }
    }
}

/// Compute average variance for a cluster (sum of leaf variances / leaf count).
///
/// Bug fix (#1238): previously returned the raw SUM, which scaled linearly
/// with cluster size. A 17-leaf cluster had ~17x the "variance" of a 1-leaf
/// cluster, causing inverse-variance bisection to push 89-99% weight into
/// the smaller cluster. Dividing by leaf count normalizes for cluster size.
pub fn cluster_variance(node: &ClusterNode, variances: &[f64]) -> f64 {
    let (sum, count) = cluster_variance_sum(node, variances);
    sum / count as f64
}

pub fn cluster_variance_sum(node: &ClusterNode, variances: &[f64]) -> (f64, usize) {
    match node {
        ClusterNode::Leaf(i) => (variances[*i], 1),
        ClusterNode::Branch { left, right } => {
            let (sl, cl) = cluster_variance_sum(left, variances);
            let (sr, cr) = cluster_variance_sum(right, variances);
            (sl + sr, cl + cr)
        }
    }
}

/// Convert f64 to Decimal with 6 decimal places precision.
///
/// Returns an error message if the value is NaN, infinite, or
/// cannot be represented as a Decimal.
pub fn decimal_from_f64(v: f64) -> Result<Decimal, String> {
    if !v.is_finite() {
        return Err(format!("non-finite weight: {v}"));
    }
    let scaled = (v * 1_000_000.0).round() / 1_000_000.0;
    Decimal::try_from(scaled).map_err(|e| format!("decimal conversion failed for {v}: {e}"))
}

#[cfg(test)]
#[path = "hrp_tests.rs"]
mod tests;