zilla-muf 0.1.1

use num_traits::Float;

/// Segment-sum — the lower-triangular cumulative-sum matrix used in
/// Mamba-2's SSD chunked algorithm.
///
/// Builds an n x n matrix where
///   `out[i][j] = sum_{t=j+1}^{i} x[t]`   for `j <= i`  (the segment sum)
///   `out[i][j] = -inf`                    for `j  > i`  (masked / future)
/// with the diagonal equal to 0 (an empty sum). Returned flattened
/// row-major, so logical entry `(i, j)` lives at index `i * n + j`.
///
/// Why it matters: in SSD this is the decay *in log space*. Once you
/// exponentiate `out`, entry `(i, j)` becomes the product of decays
/// carrying state from step `j` to step `i`, and every `-inf` turns into
/// `0` — i.e. the causal mask falls out for free. Accumulating sums (not
/// products) is the trick that keeps long decays numerically stable.
/// Cost: O(n^2) time and space.
pub fn segsum<T: Float>(x: &[T]) -> Vec<T> {
	let n = x.len();
	// Initialize fully masked: any upper-triangular entry we don't touch
	// below stays -inf (a hard zero after exponentiation).
	let mut out = vec![T::neg_infinity(); n * n];
	for i in 0..n {
		out[i * n + i] = T::zero(); // diagonal: empty sum = 0
		// Walk left from the diagonal, accumulating x[j+1..=i]. Sweeping
		// right-to-left lets each step reuse the running total, so the
		// whole matrix is O(n^2) rather than O(n^3) recomputation.
		let mut running = T::zero();
		for j in (0..i).rev() {
			running = running + x[j + 1];
			out[i * n + j] = running;
		}
	}
	out
}

#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn empty_returns_empty() {
		assert!(segsum::<f64>(&[]).is_empty());
	}

	#[test]
	fn small_case() {
		let x = [1.0, 2.0, 3.0];
		let m = segsum(&x);
		// row 2, col 0 = x[1] + x[2] = 5.0
		assert_eq!(m[2 * 3 + 0], 5.0);
		// diagonal is always 0
		assert_eq!(m[0], 0.0);
		assert_eq!(m[1 * 3 + 1], 0.0);
	}
}