atelier_quant 0.0.12

Quantitative Finance Tools & Models for the atelier-rs engine
//! Maximum Likelihood Estimation for the univariate Hawkes process.
//!
//! # Model
//!
//! Univariate Hawkes process with exponential kernel:
//!
//! $$
//!   \lambda(t) = \mu + \alpha \sum_{t_j < t} e^{-\beta(t - t_j)}
//! $$
//!
//! # Log-Likelihood
//!
//! $$
//!   \ell(\mu, \alpha, \beta) = \sum_i \ln \lambda(t_i)
//!     - \mu T
//!     - \frac{\alpha}{\beta} \sum_i \bigl(1 - e^{-\beta(T - t_i)}\bigr)
//! $$
//!
//! Evaluated in $O(n)$ via the auxiliary recursion:
//!
//! $$
//!   A_i = e^{-\beta(t_i - t_{i-1})} \cdot (1 + A_{i-1}), \quad A_0 = 0
//! $$
//!
//! # Optimization
//!
//! Gradient ascent with analytical gradients, Armijo backtracking line
//! search, and parameter projection to maintain the feasibility set
//! $\mu > 0,\; \alpha > 0,\; \beta > 0,\; \alpha / \beta < 1$.
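//!
//! # Example
//!
//! A minimal usage sketch with the default optimizer settings; the import
//! path is illustrative and depends on how this module is re-exported, and
//! the event times are made up.
//!
//! ```ignore
//! use atelier_quant::estimate_hawkes_mle;
//! use atelier_quant::HawkesEstimationConfig;
//!
//! // Sorted event times in a consistent unit (here: seconds).
//! let events = vec![0.1, 0.5, 0.6, 1.3, 1.4, 1.45, 2.2, 3.0];
//!
//! let config = HawkesEstimationConfig::default();
//! let fit = estimate_hawkes_mle(&events, &config).expect("estimation failed");
//!
//! // α/β < 1 indicates a stationary (sub-critical) process.
//! println!(
//!     "mu = {}, alpha = {}, beta = {}, branching ratio = {}",
//!     fit.mu, fit.alpha, fit.beta, fit.branching_ratio
//! );
//! ```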

use serde::{Deserialize, Serialize};

use super::errors::HawkesError;

// ── Configuration ────────────────────────────────────────────────────

/// Configuration for the Hawkes MLE optimizer.
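///
/// # Example
///
/// A sketch of overriding individual fields while keeping the remaining
/// defaults via struct-update syntax:
///
/// ```ignore
/// let config = HawkesEstimationConfig {
///     tol: 1e-4,
///     max_iter: 5_000,
///     ..Default::default()
/// };
/// ```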
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HawkesEstimationConfig {
    /// Maximum number of gradient ascent iterations.
    pub max_iter: usize,

    /// Convergence tolerance on the gradient infinity-norm.
    pub tol: f64,

    /// Initial step size for line search.
    pub learning_rate: f64,

    /// Optional initial parameter guess `(μ₀, α₀, β₀)`.
    /// If `None`, a heuristic based on the data is used.
    pub initial_params: Option<(f64, f64, f64)>,
}

impl Default for HawkesEstimationConfig {
    fn default() -> Self {
        Self {
            max_iter: 1_000,
            tol: 1e-3,
            learning_rate: 1e-3,
            initial_params: None,
        }
    }
}

// ── Result ───────────────────────────────────────────────────────────

/// Output of the Hawkes MLE estimation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HawkesEstimationResult {
    /// Estimated baseline intensity.
    pub mu: f64,
    /// Estimated excitation factor.
    pub alpha: f64,
    /// Estimated decay rate.
    pub beta: f64,
    /// Log-likelihood at the optimum.
    pub log_likelihood: f64,
    /// Number of iterations executed.
    pub iterations: usize,
    /// Whether the optimizer converged within tolerance.
    pub converged: bool,
    /// Branching ratio α/β (must be < 1 for stationarity).
    pub branching_ratio: f64,
    /// Akaike Information Criterion: 2k − 2ℓ (k=3).
    pub aic: f64,
    /// Bayesian Information Criterion: k·ln(n) − 2ℓ.
    pub bic: f64,
}

// ── Core functions ───────────────────────────────────────────────────

/// Minimum parameter value to avoid log(0) and division by zero.
const PARAM_EPS: f64 = 1e-12;

/// Maximum branching ratio allowed during optimization.
const MAX_BRANCHING: f64 = 0.9999;

/// Compute the log-likelihood of a univariate Hawkes process with
/// exponential kernel, given event times and parameters.
///
/// Uses the O(n) recursive formulation.
///
/// # Arguments
///
/// * `mu` - Baseline intensity (> 0).
/// * `alpha` - Excitation factor (> 0).
/// * `beta` - Decay rate (> 0).
/// * `events` - Sorted event times (in any consistent time unit).
///
/// # Returns
///
/// The log-likelihood value, or `f64::NEG_INFINITY` if any intensity is ≤ 0.
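///
/// # Example
///
/// A small sketch (parameter values and event times are illustrative;
/// assumes `log_likelihood` is in scope):
///
/// ```ignore
/// let events = [0.2, 0.9, 1.1, 2.5];
/// // With μ, α, β all positive, every intensity λ(tᵢ) ≥ μ > 0,
/// // so the log-likelihood is finite.
/// let ll = log_likelihood(0.5, 0.3, 1.5, &events);
/// assert!(ll.is_finite());
/// ```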
pub fn log_likelihood(mu: f64, alpha: f64, beta: f64, events: &[f64]) -> f64 {
    let n = events.len();
    if n == 0 {
        return 0.0;
    }

    let t_max = events[n - 1];
    let mut ll = -(mu * t_max);

    // Recursive auxiliary A_i
    let mut a = 0.0_f64;

    for i in 0..n {
        if i > 0 {
            let dt = events[i] - events[i - 1];
            a = (-beta * dt).exp() * (1.0 + a);
        }

        let intensity = mu + alpha * a;
        if intensity <= 0.0 {
            return f64::NEG_INFINITY;
        }
        ll += intensity.ln();

        // Compensator contribution for this event
        let remaining = t_max - events[i];
        ll -= (alpha / beta) * (1.0 - (-beta * remaining).exp());
    }

    ll
}

/// Compute the analytical gradient of the log-likelihood.
///
/// Returns `(∂ℓ/∂μ, ∂ℓ/∂α, ∂ℓ/∂β)`.
///
/// Uses the recursive auxiliaries:
///   A(i) = exp(-β·Δt) · (1 + A(i-1))
///   B(i) = -Δt · exp(-β·Δt) · (1 + A(i-1)) + exp(-β·Δt) · B(i-1)
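///
/// # Example
///
/// A sketch of a central finite-difference check on `∂ℓ/∂μ`; the step
/// size and parameter values are illustrative, and `gradient` /
/// `log_likelihood` are assumed to be in scope.
///
/// ```ignore
/// let events = [0.2, 0.9, 1.1, 2.5];
/// let (mu, alpha, beta) = (0.5, 0.3, 1.5);
///
/// let (d_mu, _d_alpha, _d_beta) = gradient(mu, alpha, beta, &events);
///
/// // Central difference approximation of ∂ℓ/∂μ.
/// let h = 1e-6;
/// let fd = (log_likelihood(mu + h, alpha, beta, &events)
///     - log_likelihood(mu - h, alpha, beta, &events)) / (2.0 * h);
///
/// assert!((d_mu - fd).abs() < 1e-4);
/// ```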
pub fn gradient(mu: f64, alpha: f64, beta: f64, events: &[f64]) -> (f64, f64, f64) {
    let n = events.len();
    if n == 0 {
        return (0.0, 0.0, 0.0);
    }

    let t_max = events[n - 1];

    let mut dl_dmu = -t_max;
    let mut dl_dalpha = 0.0_f64;
    let mut dl_dbeta = 0.0_f64;

    let mut a = 0.0_f64; // A(i) recursion
    let mut b = 0.0_f64; // B(i) recursion (∂A/∂β)

    for i in 0..n {
        if i > 0 {
            let dt = events[i] - events[i - 1];
            let exp_neg = (-beta * dt).exp();
            let one_plus_a = 1.0 + a;

            // B must be computed before A is updated
            //   B_i = −Δt·exp(−β·Δt)·(1+A_{i−1}) + exp(−β·Δt)·B_{i−1}
            b = -dt * exp_neg * one_plus_a + exp_neg * b;
            a = exp_neg * one_plus_a;
        }

        let intensity = mu + alpha * a;
        if intensity <= PARAM_EPS {
            continue; // skip degenerate points
        }
        let inv_lambda = 1.0 / intensity;

        // ∂ℓ/∂μ += 1/λ(tᵢ)
        dl_dmu += inv_lambda;

        // ∂ℓ/∂α += A(i)/λ(tᵢ)
        dl_dalpha += a * inv_lambda;

        // ∂ℓ/∂β += α·B(i)/λ(tᵢ)
        dl_dbeta += alpha * b * inv_lambda;

        // Compensator gradient contributions
        let remaining = t_max - events[i];
        let exp_rem = (-beta * remaining).exp();

        // ∂/∂α of compensator term: -(1/β)(1 - exp(-β·r))
        dl_dalpha -= (1.0 / beta) * (1.0 - exp_rem);

        // ∂/∂β of compensator term:
        //   (α/β²)(1 - exp(-β·r)) - (α/β)·r·exp(-β·r)
        dl_dbeta += (alpha / (beta * beta)) * (1.0 - exp_rem);
        dl_dbeta -= (alpha / beta) * remaining * exp_rem;
    }

    (dl_dmu, dl_dalpha, dl_dbeta)
}

/// Project parameters onto the feasible set.
///
/// Ensures μ > 0, α > 0, β > 0, and α/β < MAX_BRANCHING.
#[inline]
pub fn project(mu: &mut f64, alpha: &mut f64, beta: &mut f64) {
    *mu = mu.max(PARAM_EPS);
    *alpha = alpha.max(PARAM_EPS);
    *beta = beta.max(PARAM_EPS);

    // Enforce stationarity: α/β < MAX_BRANCHING
    if *alpha / *beta >= MAX_BRANCHING {
        *alpha = MAX_BRANCHING * *beta;
    }
}

/// Infinity-norm of a 3-vector.
#[inline]
pub fn grad_norm(g: &(f64, f64, f64)) -> f64 {
    g.0.abs().max(g.1.abs()).max(g.2.abs())
}

/// Estimate Hawkes process parameters via Maximum Likelihood.
///
/// Uses projected gradient ascent with Armijo backtracking line search.
///
/// # Arguments
///
/// * `events` - Sorted event times in any consistent unit (e.g. seconds).
///   Must contain at least 2 events.
/// * `config` - Optimizer configuration (iterations, tolerance, etc.).
///
/// # Returns
///
/// `HawkesEstimationResult` with fitted parameters and diagnostics.
///
/// # Errors
///
/// - `HawkesError::InvalidEventTimes` if there are fewer than 2 events,
///   they are not strictly increasing, or any value is non-finite.
/// - `HawkesError::NotConverged` if the gradient norm exceeds `tol` after `max_iter`.
/// - `HawkesError::EstimationFailed` on numerical failure (NaN log-likelihood).
///
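/// # Example
///
/// A sketch with an explicit initial guess; the event times and starting
/// point are illustrative, and printing the error assumes `HawkesError`
/// implements `Debug`.
///
/// ```ignore
/// let events = vec![0.1, 0.5, 0.6, 1.3, 1.4, 1.45, 2.2, 3.0];
/// let config = HawkesEstimationConfig {
///     initial_params: Some((1.0, 0.5, 2.0)),
///     tol: 1e-4,
///     ..Default::default()
/// };
///
/// match estimate_hawkes_mle(&events, &config) {
///     Ok(fit) => println!("branching ratio = {}", fit.branching_ratio),
///     Err(e) => eprintln!("estimation failed: {:?}", e),
/// }
/// ```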
pub fn estimate_hawkes_mle(
    events: &[f64],
    config: &HawkesEstimationConfig,
) -> Result<HawkesEstimationResult, HawkesError> {
    // ── Validate input ───────────────────────────────────────────────
    validate_events(events)?;

    let n = events.len();
    let t_max = events[n - 1];
    let t_min = events[0];
    let duration = t_max - t_min;

    if duration <= 0.0 {
        return Err(HawkesError::InvalidEventTimes(
            "All events have the same timestamp".into(),
        ));
    }

    // ── Initial parameters ───────────────────────────────────────────
    let (mut mu, mut alpha, mut beta) = match config.initial_params {
        Some((m, a, b)) => (m, a, b),
        None => {
            // Heuristic that adapts to the time-scale of the data.
            //
            // The empirical rate is λ̂ = n/T.  For a stationary Hawkes
            // process the long-run event rate is μ/(1 − α/β), so we pick
            // a baseline and branching ratio that reproduce λ̂:
            //
            //   μ₀  = 0.8 · λ̂       (80 % attributed to baseline)
            //   β₀  = 1 / mean_ia    (decay ≈ one interarrival)
            //   α₀  = 0.2 · β₀       (branching ratio = 0.2)
            //
            // Then μ₀ / (1 − α₀/β₀) = λ̂, and α₀/β₀ = 0.2 sits well inside
            // the stationarity region regardless of the time unit
            // (ms, s, μs, …).
            let mean_ia = duration / (n as f64 - 1.0);
            let lambda_hat = n as f64 / duration;
            let mu0 = 0.8 * lambda_hat;
            let beta0 = 1.0 / mean_ia;
            let alpha0 = 0.2 * beta0;
            (mu0, alpha0, beta0)
        }
    };

    project(&mut mu, &mut alpha, &mut beta);

    let mut ll = log_likelihood(mu, alpha, beta, events);
    let mut lr = config.learning_rate;
    let mut converged = false;
    let mut iterations = 0;

    // ── Gradient ascent with Armijo line search ──────────────────────
    //
    // We use a *normalised* search direction: d = g / ||g||_inf.
    // This decouples the step size from the gradient magnitude, so the
    // same learning_rate works whether n=10 or n=100_000.

    for iter in 0..config.max_iter {
        iterations = iter + 1;

        let g = gradient(mu, alpha, beta, events);
        let gn = grad_norm(&g);

        if gn < config.tol {
            converged = true;
            break;
        }

        // Normalised direction: each component in [-1, 1]
        let inv_gn = 1.0 / gn;
        let d = (g.0 * inv_gn, g.1 * inv_gn, g.2 * inv_gn);

        // Armijo backtracking line search
        let c1 = 1e-4; // sufficient decrease constant
        let mut step = lr;
        let mut found_step = false;

        for _ in 0..30 {
            let mut mu_new = mu + step * d.0;
            let mut alpha_new = alpha + step * d.1;
            let mut beta_new = beta + step * d.2;
            project(&mut mu_new, &mut alpha_new, &mut beta_new);

            let ll_new = log_likelihood(mu_new, alpha_new, beta_new, events);

            // Armijo sufficient-increase condition on the projected step:
            //   f(x_new) ≥ f(x) + c1 · gᵀ(x_new − x)
            let directional =
                g.0 * (mu_new - mu) + g.1 * (alpha_new - alpha) + g.2 * (beta_new - beta);

            if ll_new.is_finite() && ll_new >= ll + c1 * directional {
                mu = mu_new;
                alpha = alpha_new;
                beta = beta_new;
                ll = ll_new;
                found_step = true;

                // Adaptive step size: grow cautiously on success
                lr = (step * 1.2).min(1.0);
                break;
            }

            step *= 0.5;
        }

        if !found_step {
            // Line search failed — shrink global learning rate and retry
            lr *= 0.1;
            if lr < 1e-15 {
                // Effectively stuck
                break;
            }
        }

        if !ll.is_finite() {
            return Err(HawkesError::EstimationFailed(
                "Log-likelihood became NaN/Inf during optimization".into(),
            ));
        }
    }

    // ── Package results ──────────────────────────────────────────────
    let branching_ratio = alpha / beta;
    let k = 3.0_f64; // number of parameters
    let n_f = n as f64;
    let aic = 2.0 * k - 2.0 * ll;
    let bic = k * n_f.ln() - 2.0 * ll;

    if !converged {
        let final_grad = gradient(mu, alpha, beta, events);
        let gn = grad_norm(&final_grad);

        // If gradient is reasonably small, consider it "close enough"
        if gn < config.tol {
            converged = true;
        } else {
            return Err(HawkesError::NotConverged {
                iterations,
                gradient_norm: gn,
            });
        }
    }

    Ok(HawkesEstimationResult {
        mu,
        alpha,
        beta,
        log_likelihood: ll,
        iterations,
        converged,
        branching_ratio,
        aic,
        bic,
    })
}

/// Compute the A_i and B_i recursion arrays for testing/diagnostics.
///
/// Returns `(a_values, b_values)` where:
/// - `a_values[i]` = $\sum_{j < i} e^{-\beta(t_i - t_j)}$ (the Ozaki recursion)
/// - `b_values[i]` = $\partial A_i / \partial \beta$
///
/// This is exposed for testing the recursion against brute-force computation.
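///
/// # Example
///
/// A sketch comparing the recursion against a direct double sum; the
/// event times and tolerance are illustrative, and the function is
/// assumed to be in scope.
///
/// ```ignore
/// let events = [0.2, 0.9, 1.1, 2.5];
/// let beta = 1.5;
/// let (a_vals, _b_vals) = compute_recursion_arrays(beta, &events);
///
/// // Direct O(n²) evaluation of A_i = Σ_{j<i} exp(-β (t_i - t_j)).
/// for i in 0..events.len() {
///     let direct: f64 = (0..i)
///         .map(|j| (-beta * (events[i] - events[j])).exp())
///         .sum();
///     assert!((a_vals[i] - direct).abs() < 1e-12);
/// }
/// ```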
pub fn compute_recursion_arrays(beta: f64, events: &[f64]) -> (Vec<f64>, Vec<f64>) {
    let n = events.len();
    let mut a_vals = Vec::with_capacity(n);
    let mut b_vals = Vec::with_capacity(n);

    let mut a = 0.0_f64;
    let mut b = 0.0_f64;

    for i in 0..n {
        if i > 0 {
            let dt = events[i] - events[i - 1];
            let exp_neg = (-beta * dt).exp();
            let one_plus_a = 1.0 + a;
            b = -dt * exp_neg * one_plus_a + exp_neg * b;
            a = exp_neg * one_plus_a;
        }
        a_vals.push(a);
        b_vals.push(b);
    }

    (a_vals, b_vals)
}

/// Validate event times for MLE input.
///
/// Ensures at least 2 events, all finite, and strictly increasing.
pub fn validate_events(events: &[f64]) -> Result<(), HawkesError> {
    if events.len() < 2 {
        return Err(HawkesError::InvalidEventTimes(
            "Need at least 2 events for estimation".into(),
        ));
    }

    for (i, &t) in events.iter().enumerate() {
        if !t.is_finite() {
            return Err(HawkesError::InvalidEventTimes(format!(
                "Event at index {} is not finite: {}",
                i, t
            )));
        }
    }

    for i in 1..events.len() {
        if events[i] <= events[i - 1] {
            return Err(HawkesError::InvalidEventTimes(format!(
                "Events not strictly increasing at index {}: {} <= {}",
                i,
                events[i],
                events[i - 1]
            )));
        }
    }

    Ok(())
}

/// Compute the compensator (integrated intensity) over `[0, T]`.
///
/// $$
///   \Lambda(T) = \mu T + \frac{\alpha}{\beta} \sum_{t_i < T}
///     \bigl(1 - e^{-\beta(T - t_i)}\bigr)
/// $$
///
/// Useful for time-rescaling residual analysis: if the model is correctly
/// specified, the transformed inter-event intervals
/// $\Lambda(t_{i+1}) - \Lambda(t_i)$ should be i.i.d. Exp(1).
pub fn compensator(mu: f64, alpha: f64, beta: f64, events: &[f64], t: f64) -> f64 {
    let mut comp = mu * t;

    for &ti in events {
        if ti >= t {
            break;
        }
        comp += (alpha / beta) * (1.0 - (-beta * (t - ti)).exp());
    }

    comp
}

/// Compute time-rescaling residuals for goodness-of-fit assessment.
///
/// Returns the transformed inter-event intervals
/// `Λ(t_{i+1}) − Λ(t_i)` which should be i.i.d. Exp(1) if the
/// model is correctly specified (time-rescaling theorem).
///
/// # Arguments
///
/// * `mu`, `alpha`, `beta` - Fitted Hawkes parameters.
/// * `events` - The event times used for fitting.
///
/// # Returns
///
/// A vector of rescaled intervals (length `n-1`).
pub fn time_rescaling_residuals(
    mu: f64,
    alpha: f64,
    beta: f64,
    events: &[f64],
) -> Vec<f64> {
    let n = events.len();
    if n < 2 {
        return vec![];
    }

    let mut residuals = Vec::with_capacity(n - 1);

    // Compute compensator at each event time
    let mut comp_prev = compensator(mu, alpha, beta, events, events[0]);

    for i in 1..n {
        let comp_curr = compensator(mu, alpha, beta, events, events[i]);
        residuals.push(comp_curr - comp_prev);
        comp_prev = comp_curr;
    }

    residuals
}
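
// ── Tests ────────────────────────────────────────────────────────────
//
// A few sanity checks sketched against the functions defined above; the
// event times and tolerances are illustrative, not calibrated fixtures.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn validate_rejects_unsorted_events() {
        // Strictly increasing times are required.
        assert!(validate_events(&[0.0, 2.0, 1.0]).is_err());
        assert!(validate_events(&[0.0, 1.0, 2.0]).is_ok());
    }

    #[test]
    fn log_likelihood_matches_brute_force() {
        // Brute-force O(n²) evaluation of the same log-likelihood.
        let (mu, alpha, beta) = (0.5_f64, 0.3_f64, 1.5_f64);
        let events = [0.2, 0.9, 1.1, 2.5];
        let t_max = *events.last().unwrap();

        let mut ll_brute = -(mu * t_max);
        for i in 0..events.len() {
            let excitation: f64 = (0..i)
                .map(|j| (-beta * (events[i] - events[j])).exp())
                .sum();
            ll_brute += (mu + alpha * excitation).ln();
            ll_brute -= (alpha / beta) * (1.0 - (-beta * (t_max - events[i])).exp());
        }

        let ll = log_likelihood(mu, alpha, beta, &events);
        assert!((ll - ll_brute).abs() < 1e-10);
    }

    #[test]
    fn residuals_have_expected_length() {
        let events = [0.2, 0.9, 1.1, 2.5];
        let res = time_rescaling_residuals(0.5, 0.3, 1.5, &events);
        assert_eq!(res.len(), events.len() - 1);
    }
}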