use std::collections::HashMap;
use chrono::{Datelike, NaiveDate};
use rust_decimal::Decimal;
use crate::models::{Graph, NodeId};
/// Strategy used by [`FeatureNormalizer`] to rescale a single feature value.
#[derive(Debug, Clone)]
pub enum NormalizationMethod {
/// Pass values through unchanged.
None,
/// Rescale to `(x - min) / (max - min)` using the fitted minimum and maximum;
/// yields `0.0` when the fitted range is smaller than `1e-10`.
MinMax,
/// Standardize to `(x - mean) / std` using the fitted mean and standard deviation;
/// yields `0.0` when the fitted standard deviation is smaller than `1e-10`.
ZScore,
/// Signed logarithm `ln(|x| + 1) * signum(x)`; does not use fitted statistics.
Log,
/// Scale to `(x - median) / (q3 - q1)` using the fitted median and quartiles;
/// yields `0.0` when the fitted inter-quartile range is smaller than `1e-10`.
Robust,
}
/// Normalizes feature vectors dimension by dimension.
///
/// Statistics are gathered by `fit_nodes` / `fit_edges` and applied by `transform`.
pub struct FeatureNormalizer {
// Normalization strategy applied to every value.
method: NormalizationMethod,
// One entry per feature dimension; empty until a fit has run on non-empty data.
stats: Vec<FeatureStats>,
}
/// Summary statistics of one feature dimension, as produced by
/// `FeatureNormalizer::compute_stats`. The `Default` value is all zeros.
#[derive(Debug, Clone, Default)]
struct FeatureStats {
// Smallest observed value.
min: f64,
// Largest observed value.
max: f64,
// Arithmetic mean.
mean: f64,
// Population standard deviation (variance is divided by `n`, not `n - 1`).
std: f64,
// Middle value of the sorted data (average of the two middle values for even counts).
median: f64,
// Sorted value at index `len / 4`.
q1: f64,
// Sorted value at index `3 * len / 4`.
q3: f64,
}
impl FeatureNormalizer {
    /// Creates a normalizer that applies `method`; no statistics are fitted yet.
    pub fn new(method: NormalizationMethod) -> Self {
        Self {
            method,
            stats: Vec::new(),
        }
    }

    /// Fits per-dimension statistics on the rows returned by `graph.node_features()`.
    pub fn fit_nodes(&mut self, graph: &Graph) {
        let rows = graph.node_features();
        self.fit(&rows);
    }

    /// Fits per-dimension statistics on the rows returned by `graph.edge_features()`.
    pub fn fit_edges(&mut self, graph: &Graph) {
        let rows = graph.edge_features();
        self.fit(&rows);
    }

    /// Replaces the stored statistics with ones computed column-wise from `features`.
    ///
    /// The number of dimensions is taken from the first row; shorter rows contribute
    /// `0.0` for the dimensions they lack. An empty input leaves the current
    /// statistics untouched.
    fn fit(&mut self, features: &[Vec<f64>]) {
        let dim = match features.first() {
            Some(first_row) => first_row.len(),
            None => return,
        };

        let mut fitted = Vec::with_capacity(dim);
        for d in 0..dim {
            let column: Vec<f64> = features
                .iter()
                .map(|row| row.get(d).copied().unwrap_or(0.0))
                .collect();
            fitted.push(Self::compute_stats(&column));
        }
        self.stats = fitted;
    }

    /// Computes min, max, mean, population standard deviation, median and the
    /// index-based quartiles of `values`. Returns all-zero statistics for empty input.
    fn compute_stats(values: &[f64]) -> FeatureStats {
        if values.is_empty() {
            return FeatureStats::default();
        }

        let count = values.len();
        let n = count as f64;

        let min = values.iter().copied().fold(f64::INFINITY, f64::min);
        let max = values.iter().copied().fold(f64::NEG_INFINITY, f64::max);

        let mean = values.iter().sum::<f64>() / n;
        // Population variance: divide by n rather than n - 1.
        let variance = values.iter().map(|&v| (v - mean).powi(2)).sum::<f64>() / n;

        // Incomparable pairs (NaN) are treated as equal so sorting never panics.
        let mut ordered = values.to_vec();
        ordered.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        let mid = count / 2;
        let median = if count.is_multiple_of(2) {
            (ordered[mid - 1] + ordered[mid]) / 2.0
        } else {
            ordered[mid]
        };

        FeatureStats {
            min,
            max,
            mean,
            std: variance.sqrt(),
            median,
            q1: ordered.get(count / 4).copied().unwrap_or(min),
            q3: ordered.get(3 * count / 4).copied().unwrap_or(max),
        }
    }

    /// Normalizes every row of `features` with the fitted statistics.
    pub fn transform(&self, features: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let mut rows = Vec::with_capacity(features.len());
        for row in features {
            rows.push(self.transform_single(row));
        }
        rows
    }

    /// Normalizes one row. Dimensions without fitted statistics fall back to
    /// all-zero statistics.
    fn transform_single(&self, features: &[f64]) -> Vec<f64> {
        let mut out = Vec::with_capacity(features.len());
        for (i, &x) in features.iter().enumerate() {
            let stats = match self.stats.get(i) {
                Some(fitted) => fitted.clone(),
                None => FeatureStats::default(),
            };
            out.push(self.normalize_value(x, &stats));
        }
        out
    }

    /// Applies the configured method to a single value using `stats`.
    fn normalize_value(&self, x: f64, stats: &FeatureStats) -> f64 {
        // Guarded division: a denominator with magnitude below 1e-10 maps to 0.0.
        let ratio = |numerator: f64, denominator: f64| -> f64 {
            if denominator.abs() < 1e-10 {
                0.0
            } else {
                numerator / denominator
            }
        };

        match self.method {
            NormalizationMethod::None => x,
            NormalizationMethod::MinMax => ratio(x - stats.min, stats.max - stats.min),
            NormalizationMethod::ZScore => ratio(x - stats.mean, stats.std),
            NormalizationMethod::Log => (x.abs() + 1.0).ln() * x.signum(),
            NormalizationMethod::Robust => ratio(x - stats.median, stats.q3 - stats.q1),
        }
    }
}
/// Computes twelve structural features for every node in `graph`.
///
/// Per node, in order:
/// 0. in-degree
/// 1. out-degree
/// 2. total degree
/// 3. `ln(in_degree + 1)`
/// 4. `ln(out_degree + 1)`
/// 5. in-degree share of total degree (`0.5` for isolated nodes)
/// 6. out-degree share of total degree (`0.5` for isolated nodes)
/// 7. `ln(incoming weight sum + 1)`
/// 8. `ln(outgoing weight sum + 1)`
/// 9. mean incoming edge weight (`0.0` without incoming edges)
/// 10. mean outgoing edge weight (`0.0` without outgoing edges)
/// 11. fraction of neighbor pairs `(i, j)`, `i < j`, where `neighbors[j]` appears in
///     the neighbor list of `neighbors[i]` (`0.0` with fewer than two neighbors)
pub fn compute_structural_features(graph: &Graph) -> HashMap<NodeId, Vec<f64>> {
    let mut features = HashMap::new();

    for &node_id in graph.nodes.keys() {
        let in_degree = graph.in_degree(node_id) as f64;
        let out_degree = graph.out_degree(node_id) as f64;
        let total_degree = in_degree + out_degree;

        let (in_ratio, out_ratio) = if total_degree > 0.0 {
            (in_degree / total_degree, out_degree / total_degree)
        } else {
            (0.5, 0.5)
        };

        let in_weight: f64 = graph.incoming_edges(node_id).iter().map(|e| e.weight).sum();
        let out_weight: f64 = graph.outgoing_edges(node_id).iter().map(|e| e.weight).sum();

        let avg_in_weight = if in_degree > 0.0 {
            in_weight / in_degree
        } else {
            0.0
        };
        let avg_out_weight = if out_degree > 0.0 {
            out_weight / out_degree
        } else {
            0.0
        };

        let neighbors = graph.neighbors(node_id);
        let k = neighbors.len();
        let clustering = if k > 1 {
            let mut linked_pairs = 0usize;
            // The last neighbor has no later partner, so it needs no lookup.
            for i in 0..k - 1 {
                // Fetch the adjacency of neighbor `i` once instead of once per `j`.
                let adjacent = graph.neighbors(neighbors[i]);
                for j in i + 1..k {
                    if adjacent.contains(&neighbors[j]) {
                        linked_pairs += 1;
                    }
                }
            }
            let max_pairs = k * (k - 1) / 2;
            linked_pairs as f64 / max_pairs as f64
        } else {
            0.0
        };

        features.insert(
            node_id,
            vec![
                in_degree,
                out_degree,
                total_degree,
                (in_degree + 1.0).ln(),
                (out_degree + 1.0).ln(),
                in_ratio,
                out_ratio,
                (in_weight + 1.0).ln(),
                (out_weight + 1.0).ln(),
                avg_in_weight,
                avg_out_weight,
                clustering,
            ],
        );
    }

    features
}
/// Encodes a calendar date as twelve numeric features.
///
/// In order: weekday index from Monday divided by 6, day of month / 31, month / 12,
/// quarter / 4, flag for weekday index >= 5, flag for day >= 28, flag for day >= 28
/// in March/June/September/December, flag for December, then sine and cosine of the
/// day of year over a 365-day period, and sine and cosine of the weekday index over
/// a 7-day period.
pub fn compute_temporal_features(date: NaiveDate) -> Vec<f64> {
    let as_flag = |condition: bool| -> f64 {
        if condition {
            1.0
        } else {
            0.0
        }
    };

    let weekday_idx = date.weekday().num_days_from_monday();
    let day_of_month = date.day();
    let month_num = date.month();
    // Integer division reproduces floor((month - 1) / 3) + 1 for months 1..=12.
    let quarter_num = (month_num - 1) / 3 + 1;

    let weekday = weekday_idx as f64;
    let late_in_month = day_of_month >= 28;
    let quarter_closing_month = matches!(month_num, 3 | 6 | 9 | 12);

    let two_pi = 2.0 * std::f64::consts::PI;
    let year_angle = two_pi * date.ordinal() as f64 / 365.0;
    let week_angle = two_pi * weekday / 7.0;

    vec![
        weekday / 6.0,
        day_of_month as f64 / 31.0,
        month_num as f64 / 12.0,
        quarter_num as f64 / 4.0,
        as_flag(weekday_idx >= 5),
        as_flag(late_in_month),
        as_flag(quarter_closing_month && late_in_month),
        as_flag(month_num == 12),
        year_angle.sin(),
        year_angle.cos(),
        week_angle.sin(),
        week_angle.cos(),
    ]
}
/// Builds twelve Benford's-law features for `amount`.
///
/// In order: the Benford probability of the leading digit, the absolute difference
/// between that probability and the rounded reference table entry (`0.0` when the
/// leading digit is outside `1..=9`), a one-hot encoding of the leading digit over
/// `1..=9`, and the second digit divided by 9.
pub fn compute_benford_features(amount: f64) -> Vec<f64> {
    // Benford reference frequencies for digits 1..=9, rounded to three decimals.
    const EXPECTED_BENFORD: [f64; 9] = [
        0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046,
    ];

    let leading = extract_first_digit(amount);
    let probability = benford_probability(leading);

    let deviation = match leading {
        1..=9 => (EXPECTED_BENFORD[leading as usize - 1] - probability).abs(),
        _ => 0.0,
    };

    let mut features = Vec::with_capacity(12);
    features.push(probability);
    features.push(deviation);
    features.extend((1..=9u32).map(|digit| if digit == leading { 1.0 } else { 0.0 }));
    features.push(extract_second_digit(amount) as f64 / 9.0);
    features
}
/// Returns the leading significant decimal digit of `value` (sign ignored).
///
/// Returns `0` for zero, NaN and infinities; otherwise the result is in `1..=9`.
///
/// The digit is read from the shortest round-trip scientific representation of the
/// number rather than derived via `log10`/`powf`. The floating-point route can land
/// outside `[1, 10)` for values adjacent to a power of ten (yielding `0` or `10`)
/// and divides by zero for subnormal inputs.
fn extract_first_digit(value: f64) -> u32 {
    if value == 0.0 || !value.is_finite() {
        return 0;
    }

    // `{:e}` renders e.g. 1234.56 as "1.23456e3": the first character is the digit.
    let repr = format!("{:e}", value.abs());
    repr.chars()
        .next()
        .and_then(|c| c.to_digit(10))
        .unwrap_or(0)
}
/// Returns the second significant decimal digit of `value` (sign ignored).
///
/// Returns `0` for zero, NaN, infinities, and numbers with a single significant
/// digit (e.g. `1000.0`).
///
/// The digit is read from the shortest round-trip scientific representation of the
/// number. Deriving it arithmetically is unreliable: for `1.2` the expression
/// `(1.2 - 1.0) * 10.0` evaluates to `1.9999999999999996`, which floors to `1`
/// instead of the expected `2`.
fn extract_second_digit(value: f64) -> u32 {
    if value == 0.0 || !value.is_finite() {
        return 0;
    }

    // `{:e}` renders the mantissa as "d" or "d.ddd…" followed by the exponent.
    let repr = format!("{:e}", value.abs());
    let mut mantissa = repr.chars();
    mantissa.next(); // skip the leading digit

    match (mantissa.next(), mantissa.next()) {
        (Some('.'), Some(c)) => c.to_digit(10).unwrap_or(0),
        // No fractional part: every digit after the first is zero.
        _ => 0,
    }
}
/// Returns the Benford's-law probability `log10(1 + 1/d)` of leading digit `digit`.
///
/// Any digit outside `1..=9` yields `0.0`.
fn benford_probability(digit: u32) -> f64 {
    match digit {
        1..=9 => (1.0 + 1.0 / digit as f64).log10(),
        _ => 0.0,
    }
}
/// Builds numeric features describing a monetary `amount`.
///
/// In order: `ln(|amount| + 1)`, a non-negativity flag, a flag set when the
/// remainder of the amount modulo 100 has magnitude below 0.01, a ten-slot one-hot
/// encoding of the order of magnitude (`floor(log10(|amount|))` clamped to `0..=9`,
/// slot 0 for magnitudes below 1), followed by the output of
/// `compute_benford_features`.
///
/// An amount that cannot be converted to `f64` is treated as `0.0`.
pub fn compute_amount_features(amount: Decimal) -> Vec<f64> {
    let value: f64 = amount.try_into().unwrap_or(0.0);
    let magnitude = value.abs();

    let sign_flag = if value >= 0.0 { 1.0 } else { 0.0 };
    let round_flag = if (value % 100.0).abs() < 0.01 {
        1.0
    } else {
        0.0
    };

    let bucket: i32 = if magnitude < 1.0 {
        0
    } else {
        (magnitude.log10().floor() as i32).clamp(0, 9)
    };

    let mut features = vec![(magnitude + 1.0).ln(), sign_flag, round_flag];
    features.extend((0..10).map(|slot| if slot == bucket { 1.0 } else { 0.0 }));
    features.extend(compute_benford_features(value));
    features
}
/// One-hot encodes `value` against `categories`.
///
/// The result has one slot per category; the slot of the first category equal to
/// `value` is `1.0` and every other slot is `0.0`. An unknown value produces an
/// all-zero vector.
pub fn one_hot_encode(value: &str, categories: &[&str]) -> Vec<f64> {
    let hit = categories.iter().position(|&category| category == value);
    (0..categories.len())
        .map(|slot| if Some(slot) == hit { 1.0 } else { 0.0 })
        .collect()
}
/// Returns the index of the first category equal to `value` as an `f64`,
/// or `-1.0` when `value` is not among `categories`.
pub fn label_encode(value: &str, categories: &[&str]) -> f64 {
    for (index, &category) in categories.iter().enumerate() {
        if category == value {
            return index as f64;
        }
    }
    -1.0
}
/// Computes a sinusoidal positional encoding of length `d_model` for `position`.
///
/// Slot `i` uses the angle `position / 10000^(2 * (i / 2) / d_model)` (integer
/// division for `i / 2`); even slots hold its sine, odd slots its cosine.
pub fn positional_encoding(position: usize, d_model: usize) -> Vec<f64> {
    (0..d_model)
        .map(|i| {
            let exponent = 2.0 * (i / 2) as f64 / d_model as f64;
            let angle = position as f64 / 10000_f64.powf(exponent);
            match i % 2 {
                0 => angle.sin(),
                _ => angle.cos(),
            }
        })
        .collect()
}
/// Derives direction-aware features from the feature vectors of an edge's endpoints.
///
/// For the element pairs both slices have in common, the output holds all signed
/// differences `target - source`, then all absolute differences, then all products.
/// A final flag is `1.0` when the sum of all source features exceeds the sum of all
/// target features, otherwise `0.0`.
pub fn compute_edge_direction_features(
    source_features: &[f64],
    target_features: &[f64],
) -> Vec<f64> {
    let paired = source_features.len().min(target_features.len());
    let mut signed = Vec::with_capacity(paired);
    let mut absolute = Vec::with_capacity(paired);
    let mut products = Vec::with_capacity(paired);

    for (s, t) in source_features.iter().zip(target_features.iter()) {
        let delta = t - s;
        signed.push(delta);
        absolute.push(delta.abs());
        products.push(s * t);
    }

    // The sums cover the full slices, including any unpaired tail.
    let source_total = source_features.iter().sum::<f64>();
    let target_total = target_features.iter().sum::<f64>();
    let source_dominates = if source_total > target_total { 1.0 } else { 0.0 };

    let mut features = signed;
    features.extend(absolute);
    features.extend(products);
    features.push(source_dominates);
    features
}
/// Counts a node's recent edges for several look-back windows.
///
/// Each edge is `(source, target, timestamp)`. For every entry of `window_days` the
/// output receives two values: the number of edges that touch `node_id` and whose
/// timestamp is at least `reference_day - window`, and an accumulated weight that
/// adds `1.0` per such edge.
pub fn compute_velocity_features(
    node_id: usize,
    edges: &[(usize, usize, f64)],
    window_days: &[u32],
    reference_day: f64,
) -> Vec<f64> {
    window_days
        .iter()
        .flat_map(|&window| {
            let cutoff = reference_day - f64::from(window);
            let (count, weight_sum) = edges
                .iter()
                .filter(|&&(src, tgt, ts)| (src == node_id || tgt == node_id) && ts >= cutoff)
                .fold((0u64, 0.0_f64), |(seen, weight), _| (seen + 1, weight + 1.0));
            [count as f64, weight_sum]
        })
        .collect()
}
/// Computes PageRank scores by power iteration.
///
/// `adjacency[src]` lists the targets of node `src`. Every node starts at `1 / n`;
/// each of the `iterations` rounds gives every node a base score of
/// `(1 - damping) / n` and distributes `damping * rank[src]` evenly over the listed
/// targets of `src`. Targets with an index `>= n` are ignored (their share is
/// dropped). Nodes without targets spread their rank evenly over all nodes.
/// The final scores are rescaled to sum to 1 when their total is positive.
///
/// Returns an empty vector for an empty graph.
pub fn pagerank(adjacency: &[Vec<usize>], damping: f64, iterations: usize) -> Vec<f64> {
    let n = adjacency.len();
    if n == 0 {
        return Vec::new();
    }

    let node_count = n as f64;
    let base = (1.0 - damping) / node_count;
    let mut rank = vec![1.0 / node_count; n];

    for _ in 0..iterations {
        let mut next = vec![base; n];
        // Rank held by dangling nodes is collected and redistributed in one pass,
        // keeping each iteration linear in nodes + edges.
        let mut dangling_mass = 0.0_f64;

        for (src, targets) in adjacency.iter().enumerate() {
            if targets.is_empty() {
                dangling_mass += rank[src];
                continue;
            }
            let share = damping * rank[src] / targets.len() as f64;
            for &tgt in targets {
                if tgt < n {
                    next[tgt] += share;
                }
            }
        }

        let dangling_share = damping * dangling_mass / node_count;
        for score in &mut next {
            *score += dangling_share;
        }

        rank = next;
    }

    let total: f64 = rank.iter().sum();
    if total > 0.0 {
        for score in &mut rank {
            *score /= total;
        }
    }
    rank
}
/// Computes degree centrality from an adjacency list.
///
/// A node's degree is the number of targets it lists plus the number of times it is
/// listed as a target by any node; targets with an index `>= n` add to the listing
/// node's degree only. Degrees are divided by `n - 1` (by `1` for a single node).
///
/// Returns an empty vector for an empty graph.
pub fn degree_centrality(adjacency: &[Vec<usize>]) -> Vec<f64> {
    let n = adjacency.len();
    if n == 0 {
        return Vec::new();
    }

    let mut degree = vec![0usize; n];
    for (src, targets) in adjacency.iter().enumerate() {
        degree[src] += targets.len();
        for &tgt in targets.iter().filter(|&&t| t < n) {
            degree[tgt] += 1;
        }
    }

    let denominator = match n {
        1 => 1.0,
        _ => (n - 1) as f64,
    };
    degree
        .into_iter()
        .map(|d| d as f64 / denominator)
        .collect()
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Benford probabilities match the rounded reference values for 1 and 9.
    #[test]
    fn test_benford_probability() {
        for (digit, expected) in [(1, 0.301), (9, 0.046)] {
            let prob = benford_probability(digit);
            assert!((prob - expected).abs() < 0.001);
        }
    }

    /// Leading digits are extracted from large and fractional values.
    #[test]
    fn test_extract_first_digit() {
        for (input, expected) in [(1234.56, 1), (9876.54, 9), (0.0123, 1)] {
            assert_eq!(extract_first_digit(input), expected);
        }
    }

    /// Slot 7 (the December flag) is set for a date in December.
    #[test]
    fn test_temporal_features() {
        let new_years_eve = NaiveDate::from_ymd_opt(2024, 12, 31).unwrap();
        let encoded = compute_temporal_features(new_years_eve);
        assert!(!encoded.is_empty());
        assert!(encoded[7] > 0.5);
    }

    /// Min-max scaling maps the column minimum to 0 and the maximum to 1.
    #[test]
    fn test_normalization() {
        let rows = vec![vec![1.0, 100.0], vec![2.0, 200.0], vec![3.0, 300.0]];
        let mut scaler = FeatureNormalizer::new(NormalizationMethod::MinMax);
        scaler.fit(&rows);
        let scaled = scaler.transform(&rows);
        assert_eq!(scaled[0][0], 0.0);
        assert_eq!(scaled[2][0], 1.0);
    }

    /// Only the slot of the matching category is set.
    #[test]
    fn test_one_hot_encode() {
        let categories = ["A", "B", "C"];
        assert_eq!(one_hot_encode("B", &categories), vec![0.0, 1.0, 0.0]);
    }

    /// The encoding has `d_model` slots and starts with sin(0) at position 0.
    #[test]
    fn test_positional_encoding() {
        let encoded = positional_encoding(0, 8);
        assert_eq!(encoded.len(), 8);
        assert_eq!(encoded[0], 0.0);
    }

    /// Two values are produced per look-back window.
    #[test]
    fn test_velocity_features_length() {
        let edges = vec![(1, 2, 995.0), (1, 3, 998.0), (1, 4, 1000.0)];
        let velocity = compute_velocity_features(1, &edges, &[7, 30, 90], 1002.0);
        assert_eq!(velocity.len(), 6, "2 metrics per window × 3 windows");
    }

    /// Edges older than the window cutoff are not counted.
    #[test]
    fn test_velocity_features_windowing() {
        let edges = vec![(1, 2, 994.0), (1, 3, 960.0), (1, 4, 900.0)];
        let velocity = compute_velocity_features(1, &edges, &[7, 30, 90], 1000.0);
        assert_eq!(velocity[0], 1.0, "7-day count");
        assert_eq!(velocity[2], 1.0, "30-day count");
        assert_eq!(velocity[4], 2.0, "90-day count");
    }

    /// A directed 3-cycle gives every node the same score, and scores sum to 1.
    #[test]
    fn test_pagerank_basic() {
        let cycle = vec![vec![1], vec![2], vec![0]];
        let scores = pagerank(&cycle, 0.85, 50);
        assert_eq!(scores.len(), 3);
        for &r in &scores {
            assert!((r - 1.0 / 3.0).abs() < 0.01, "Expected ~0.333 but got {r}");
        }
        let total = scores.iter().sum::<f64>();
        assert!((total - 1.0).abs() < 1e-9, "PageRank must sum to 1.0");
    }

    /// An empty graph yields no scores.
    #[test]
    fn test_pagerank_empty() {
        assert!(pagerank(&[], 0.85, 10).is_empty());
    }

    /// In a 4-node star the hub scores 1.0 and each leaf 1/3.
    #[test]
    fn test_degree_centrality_basic() {
        let star = vec![vec![1, 2, 3], vec![], vec![], vec![]];
        let centrality = degree_centrality(&star);
        assert_eq!(centrality.len(), 4);

        let (hub, leaves) = (centrality[0], &centrality[1..]);
        assert!((hub - 1.0).abs() < 1e-9, "Hub should have centrality 1.0");
        for &c in leaves {
            assert!(
                (c - 1.0 / 3.0).abs() < 1e-9,
                "Leaf centrality should be ~0.333"
            );
        }
    }

    /// An empty graph yields no centralities.
    #[test]
    fn test_degree_centrality_empty() {
        assert!(degree_centrality(&[]).is_empty());
    }
}