pandrs 0.3.1 - Docs.rs

//! Anomaly detection algorithms
//!
//! This module provides implementations of anomaly detection algorithms
//! for identifying outliers and unusual patterns in data.

use crate::core::error::{Error, Result};
use crate::dataframe::DataFrame;
use crate::ml::models::ModelEvaluator;
use crate::ml::models::ModelMetrics;
use crate::ml::models::UnsupervisedModel;
use std::collections::{HashMap, HashSet};

/// Isolation Forest for anomaly detection
///
/// Isolation Forest is an ensemble method that detects anomalies by
/// recursively partitioning the data and identifying points that
/// require fewer partitions to isolate.
#[derive(Debug, Clone)]
pub struct IsolationForest {
    /// Number of trees in the forest
    pub n_estimators: usize,
    /// Maximum number of samples to draw for each tree
    pub max_samples: Option<usize>,
    /// Maximum depth of the trees
    pub max_depth: Option<usize>,
    /// Contamination: expected proportion of outliers in the data
    pub contamination: f64,
    /// Random seed for reproducibility
    pub random_seed: Option<u64>,
    /// Anomaly scores (-1 for anomalies, 1 for normal points)
    pub scores: Option<Vec<f64>>,
    /// Feature columns used for anomaly detection
    pub feature_columns: Option<Vec<String>>,
}

impl IsolationForest {
    /// Create a new IsolationForest instance
    pub fn new() -> Self {
        IsolationForest {
            n_estimators: 100,
            max_samples: None,
            max_depth: None,
            contamination: 0.1,
            random_seed: None,
            scores: None,
            feature_columns: None,
        }
    }

    /// Get anomaly scores
    pub fn anomaly_scores(&self) -> &[f64] {
        match &self.scores {
            Some(scores) => scores,
            None => &[], // Return empty slice if no scores available
        }
    }

    /// Get anomaly labels (1 for normal, -1 for anomalies)
    pub fn labels(&self) -> &[i64] {
        // This is a stub implementation for backward compatibility
        // In a real implementation, we would return actual labels
        &[]
    }

    /// Set number of trees in the forest
    pub fn n_estimators(mut self, n_estimators: usize) -> Self {
        self.n_estimators = n_estimators;
        self
    }

    /// Set maximum number of samples to draw for each tree
    pub fn max_samples(mut self, max_samples: usize) -> Self {
        self.max_samples = Some(max_samples);
        self
    }

    /// Set maximum depth of the trees
    pub fn max_depth(mut self, max_depth: usize) -> Self {
        self.max_depth = Some(max_depth);
        self
    }

    /// Set contamination (expected proportion of outliers)
    pub fn contamination(mut self, contamination: f64) -> Self {
        self.contamination = contamination;
        self
    }

    /// Set random seed for reproducibility
    pub fn random_seed(mut self, seed: u64) -> Self {
        self.random_seed = Some(seed);
        self
    }

    /// Specify feature columns to use
    pub fn with_columns(mut self, columns: Vec<String>) -> Self {
        self.feature_columns = Some(columns);
        self
    }

    /// Predict anomaly scores for new data
    pub fn predict(&self, data: &DataFrame) -> Result<Vec<f64>> {
        // Placeholder implementation
        // In a real implementation, this would use the trained model to predict anomaly scores
        let n_samples = data.row_count();
        let mut scores = vec![1.0; n_samples]; // Default: all normal points

        // Mark random 10% as anomalies for demonstration
        use rand::{Rng, RngExt};
        let mut rng = rand::rng();

        for _ in 0..((n_samples as f64 * 0.1) as usize) {
            let idx = rng.random_range(0..n_samples);
            scores[idx] = -1.0; // Anomaly
        }

        Ok(scores)
    }

    /// Get decision function values (anomaly scores)
    pub fn decision_function(&self, data: &DataFrame) -> Result<Vec<f64>> {
        // Placeholder implementation
        // In a real implementation, this would return the raw anomaly scores
        let n_samples = data.row_count();
        let mut scores = Vec::with_capacity(n_samples);

        use rand::{Rng, RngExt};
        let mut rng = rand::rng();

        for _ in 0..n_samples {
            // Random scores between -1 and 1, where lower = more anomalous
            scores.push(rng.random_range(-1.0..1.0));
        }

        Ok(scores)
    }
}

impl UnsupervisedModel for IsolationForest {
    fn fit(&mut self, data: &DataFrame) -> Result<()> {
        // Placeholder implementation
        // In a real implementation, this would build the isolation forest model
        let n_samples = data.row_count();
        let mut scores = Vec::with_capacity(n_samples);

        use rand::{Rng, RngExt};
        let mut rng = rand::rng();

        for _ in 0..n_samples {
            // Random scores between -1 and 1, where lower = more anomalous
            scores.push(rng.random_range(-1.0..1.0));
        }

        self.scores = Some(scores);

        // Store feature columns if not already specified
        if self.feature_columns.is_none() {
            self.feature_columns = Some(data.column_names().into());
        }

        Ok(())
    }

    fn transform(&self, data: &DataFrame) -> Result<DataFrame> {
        // Transform adds an anomaly score column to the data
        let scores = self.decision_function(data)?;
        let mut result = data.clone();

        result.add_column(
            "anomaly_score".to_string(),
            crate::series::Series::new(scores, Some("anomaly_score".to_string()))?,
        )?;

        Ok(result)
    }
}

impl ModelEvaluator for IsolationForest {
    fn evaluate(&self, test_data: &DataFrame, _test_target: &str) -> Result<ModelMetrics> {
        // Placeholder implementation
        let mut metrics = ModelMetrics::new();
        metrics.add_metric("anomaly_ratio", self.contamination);
        Ok(metrics)
    }

    fn cross_validate(
        &self,
        _data: &DataFrame,
        _target: &str,
        _folds: usize,
    ) -> Result<Vec<ModelMetrics>> {
        Err(Error::InvalidOperation(
            "Cross-validation is not applicable for anomaly detection".into(),
        ))
    }
}

/// Local Outlier Factor for anomaly detection
///
/// Local Outlier Factor computes the local density deviation of a point
/// with respect to its neighbors, identifying points that have significantly
/// lower density than their neighbors.
#[derive(Debug, Clone)]
pub struct LocalOutlierFactor {
    /// Number of neighbors to consider
    pub n_neighbors: usize,
    /// Contamination: expected proportion of outliers in the data
    pub contamination: f64,
    /// Algorithm to use for nearest neighbors search
    pub algorithm: String,
    /// Anomaly scores (-1 for anomalies, 1 for normal points)
    pub scores: Option<Vec<f64>>,
    /// Feature columns used for anomaly detection
    pub feature_columns: Option<Vec<String>>,
}

impl LocalOutlierFactor {
    /// Create a new LocalOutlierFactor instance
    pub fn new(n_neighbors: usize) -> Self {
        LocalOutlierFactor {
            n_neighbors,
            contamination: 0.1,
            algorithm: "auto".to_string(),
            scores: None,
            feature_columns: None,
        }
    }

    /// Get anomaly scores
    pub fn anomaly_scores(&self) -> &[f64] {
        match &self.scores {
            Some(scores) => scores,
            None => &[], // Return empty slice if no scores available
        }
    }

    /// Get anomaly labels (1 for normal, -1 for anomalies)
    pub fn labels(&self) -> &[i64] {
        // This is a stub implementation for backward compatibility
        &[]
    }

    /// Set contamination (expected proportion of outliers)
    pub fn contamination(mut self, contamination: f64) -> Self {
        self.contamination = contamination;
        self
    }

    /// Set algorithm for nearest neighbors search
    pub fn algorithm(mut self, algorithm: &str) -> Self {
        self.algorithm = algorithm.to_string();
        self
    }

    /// Specify feature columns to use
    pub fn with_columns(mut self, columns: Vec<String>) -> Self {
        self.feature_columns = Some(columns);
        self
    }
}

impl UnsupervisedModel for LocalOutlierFactor {
    fn fit(&mut self, data: &DataFrame) -> Result<()> {
        // Placeholder implementation
        self.feature_columns = Some(data.column_names().into());
        Ok(())
    }

    fn transform(&self, _data: &DataFrame) -> Result<DataFrame> {
        // LOF is typically used in "novelty" mode for transform
        Err(Error::InvalidOperation(
            "LocalOutlierFactor does not support transform in current implementation".into(),
        ))
    }
}

impl ModelEvaluator for LocalOutlierFactor {
    fn evaluate(&self, _test_data: &DataFrame, _test_target: &str) -> Result<ModelMetrics> {
        let mut metrics = ModelMetrics::new();
        metrics.add_metric("anomaly_ratio", self.contamination);
        Ok(metrics)
    }

    fn cross_validate(
        &self,
        _data: &DataFrame,
        _target: &str,
        _folds: usize,
    ) -> Result<Vec<ModelMetrics>> {
        Err(Error::InvalidOperation(
            "Cross-validation is not applicable for anomaly detection".into(),
        ))
    }
}

/// One-Class SVM for anomaly detection
///
/// One-Class SVM learns a boundary that separates the majority of data
/// from the origin, treating points outside this boundary as anomalies.
#[derive(Debug, Clone)]
pub struct OneClassSVM {
    /// Kernel type
    pub kernel: String,
    /// Regularization parameter nu (upper bound on fraction of outliers)
    pub nu: f64,
    /// Kernel coefficient for RBF, polynomial, and sigmoid kernels
    pub gamma: Option<f64>,
    /// Anomaly scores (-1 for anomalies, 1 for normal points)
    pub scores: Option<Vec<f64>>,
    /// Feature columns used for anomaly detection
    pub feature_columns: Option<Vec<String>>,
}

impl OneClassSVM {
    /// Create a new OneClassSVM instance
    pub fn new() -> Self {
        OneClassSVM {
            kernel: "rbf".to_string(),
            nu: 0.1,
            gamma: None,
            scores: None,
            feature_columns: None,
        }
    }

    /// Get anomaly scores
    pub fn anomaly_scores(&self) -> &[f64] {
        match &self.scores {
            Some(scores) => scores,
            None => &[], // Return empty slice if no scores available
        }
    }

    /// Get anomaly labels (1 for normal, -1 for anomalies)
    pub fn labels(&self) -> &[i64] {
        // This is a stub implementation for backward compatibility
        &[]
    }

    /// Set kernel type
    pub fn kernel(mut self, kernel: &str) -> Self {
        self.kernel = kernel.to_string();
        self
    }

    /// Set regularization parameter nu
    pub fn nu(mut self, nu: f64) -> Self {
        self.nu = nu;
        self
    }

    /// Set kernel coefficient gamma
    pub fn gamma(mut self, gamma: f64) -> Self {
        self.gamma = Some(gamma);
        self
    }

    /// Specify feature columns to use
    pub fn with_columns(mut self, columns: Vec<String>) -> Self {
        self.feature_columns = Some(columns);
        self
    }
}

impl UnsupervisedModel for OneClassSVM {
    fn fit(&mut self, data: &DataFrame) -> Result<()> {
        // Placeholder implementation
        self.feature_columns = Some(data.column_names().into());
        Ok(())
    }

    fn transform(&self, data: &DataFrame) -> Result<DataFrame> {
        // Transform adds an anomaly score column to the data
        let n_samples = data.row_count();

        // Placeholder implementation for scoring
        use rand::{Rng, RngExt};
        let mut rng = rand::rng();

        let scores: Vec<f64> = (0..n_samples)
            .map(|_| rng.random_range(-1.0..1.0))
            .collect();

        let mut result = data.clone();

        result.add_column(
            "anomaly_score".to_string(),
            crate::series::Series::new(scores, Some("anomaly_score".to_string()))?,
        )?;

        Ok(result)
    }
}

impl ModelEvaluator for OneClassSVM {
    fn evaluate(&self, _test_data: &DataFrame, _test_target: &str) -> Result<ModelMetrics> {
        let mut metrics = ModelMetrics::new();
        metrics.add_metric("anomaly_ratio", self.nu);
        Ok(metrics)
    }

    fn cross_validate(
        &self,
        _data: &DataFrame,
        _target: &str,
        _folds: usize,
    ) -> Result<Vec<ModelMetrics>> {
        Err(Error::InvalidOperation(
            "Cross-validation is not applicable for anomaly detection".into(),
        ))
    }
}

// Re-exports - remove self references to avoid duplicate definitions
// These types are already defined in this module, so no need to re-export