use std::fmt;
#[derive(Debug, Clone)]
pub struct LinearModel {
slope: f64,
intercept: f64,
}
impl LinearModel {
#[must_use]
pub const fn new() -> Self {
Self {
slope: 1.0,
intercept: 0.0,
}
}
pub fn train(&mut self, data: &[(i64, usize)]) {
const SAMPLING_THRESHOLD: usize = 10_000;
if data.len() >= SAMPLING_THRESHOLD {
let sample_size = (data.len() as f64).sqrt() as usize;
self.train_sampled(data, sample_size);
} else {
self.train_full(data);
}
}
pub fn train_full(&mut self, data: &[(i64, usize)]) {
if data.is_empty() {
return;
}
if data.len() == 1 {
let (_key, pos) = data[0];
self.slope = 0.0;
self.intercept = pos as f64;
return;
}
let n = data.len() as f64;
let sum_x = data.iter().map(|(k, _)| *k as f64).sum::<f64>();
let sum_y = data.iter().map(|(_, p)| *p as f64).sum::<f64>();
let sum_xy = data.iter().map(|(k, p)| *k as f64 * *p as f64).sum::<f64>();
let sum_x2 = data.iter().map(|(k, _)| (*k as f64).powi(2)).sum::<f64>();
let denominator = n * sum_x2 - sum_x * sum_x;
if denominator.abs() < 1e-10 {
self.slope = 0.0;
self.intercept = sum_y / n;
} else {
self.slope = (n * sum_xy - sum_x * sum_y) / denominator;
self.intercept = (sum_y - self.slope * sum_x) / n;
}
}
pub fn train_sampled(&mut self, data: &[(i64, usize)], sample_size: usize) {
if data.is_empty() {
return;
}
if sample_size >= data.len() {
self.train_full(data);
return;
}
let stride = data.len() / sample_size;
let samples: Vec<(i64, usize)> = (0..sample_size)
.map(|i| {
let idx = (i * stride).min(data.len() - 1);
data[idx]
})
.collect();
self.train_full(&samples);
}
#[must_use]
pub fn predict(&self, key: i64) -> usize {
let pos = self.slope * key as f64 + self.intercept;
pos.max(0.0) as usize
}
#[must_use]
pub const fn slope(&self) -> f64 {
self.slope
}
#[must_use]
pub const fn intercept(&self) -> f64 {
self.intercept
}
#[must_use]
pub fn max_error(&self, data: &[(i64, usize)]) -> usize {
data.iter()
.map(|(key, pos)| {
let predicted = self.predict(*key);
(predicted as i64 - *pos as i64).unsigned_abs() as usize
})
.max()
.unwrap_or(0)
}
#[must_use]
pub fn avg_error(&self, data: &[(i64, usize)]) -> f64 {
if data.is_empty() {
return 0.0;
}
let total_error: usize = data
.iter()
.map(|(key, pos)| {
let predicted = self.predict(*key);
(predicted as i64 - *pos as i64).unsigned_abs() as usize
})
.sum();
total_error as f64 / data.len() as f64
}
}
impl Default for LinearModel {
fn default() -> Self {
Self::new()
}
}
impl fmt::Display for LinearModel {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"LinearModel(y = {:.6}x + {:.6})",
self.slope, self.intercept
)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_identity_function() {
let model = LinearModel::new();
assert_eq!(model.predict(0), 0);
assert_eq!(model.predict(100), 100);
assert_eq!(model.predict(1000), 1000);
}
#[test]
fn test_perfect_linear_data() {
let data: Vec<(i64, usize)> = (0..100).map(|i| (i, i as usize)).collect();
let mut model = LinearModel::new();
model.train(&data);
assert!((model.slope() - 1.0).abs() < 0.01);
assert!(model.intercept().abs() < 0.01);
for i in 0..100 {
let predicted = model.predict(i);
assert_eq!(predicted, i as usize);
}
assert_eq!(model.max_error(&data), 0);
}
#[test]
fn test_scaled_data() {
let data: Vec<(i64, usize)> = (0..100).map(|i| (i * 10, i as usize)).collect();
let mut model = LinearModel::new();
model.train(&data);
assert!((model.slope() - 0.1).abs() < 0.01);
assert!(model.intercept().abs() < 0.01);
for i in 0..100 {
let predicted = model.predict(i * 10);
assert!((predicted as i64 - i).abs() <= 1);
}
}
#[test]
fn test_offset_data() {
let data: Vec<(i64, usize)> = (0..100).map(|i| (i + 1000, i as usize)).collect();
let mut model = LinearModel::new();
model.train(&data);
assert!((model.slope() - 1.0).abs() < 0.01);
assert!((model.intercept() + 1000.0).abs() < 0.01);
for i in 0..100 {
let predicted = model.predict(i + 1000);
assert_eq!(predicted, i as usize);
}
}
#[test]
fn test_single_point() {
let data = vec![(42, 10)];
let mut model = LinearModel::new();
model.train(&data);
assert_eq!(model.predict(0), 10);
assert_eq!(model.predict(42), 10);
assert_eq!(model.predict(100), 10);
}
#[test]
fn test_duplicate_keys() {
let data = vec![(5, 0), (5, 1), (5, 2), (5, 3)];
let mut model = LinearModel::new();
model.train(&data);
let predicted = model.predict(5);
assert!((predicted as i64 - 1).abs() <= 1);
}
#[test]
fn test_sparse_data() {
let data = vec![(0, 0), (1000, 1), (2000, 2), (3000, 3)];
let mut model = LinearModel::new();
model.train(&data);
let mid = model.predict(1500);
assert!((mid as i64 - 1).abs() <= 1);
}
#[test]
fn test_error_metrics() {
let data = vec![(0, 0), (10, 2), (20, 3), (30, 5)];
let mut model = LinearModel::new();
model.train(&data);
let max_err = model.max_error(&data);
let avg_err = model.avg_error(&data);
assert!(max_err > 0);
assert!(avg_err > 0.0);
assert!(max_err < 5); }
#[test]
fn test_empty_data() {
let data: Vec<(i64, usize)> = vec![];
let mut model = LinearModel::new();
model.train(&data);
assert_eq!(model.slope(), 1.0);
assert_eq!(model.intercept(), 0.0);
}
#[test]
fn test_negative_keys() {
let data: Vec<(i64, usize)> = (-50..50).map(|i| (i, (i + 50) as usize)).collect();
let mut model = LinearModel::new();
model.train(&data);
assert_eq!(model.predict(-50), 0);
assert_eq!(model.predict(0), 50);
assert_eq!(model.predict(49), 99);
}
#[test]
fn test_large_scale() {
let data: Vec<(i64, usize)> = (0..1_000_000).map(|i| (i as i64, i as usize)).collect();
let mut model = LinearModel::new();
model.train(&data);
assert!((model.slope() - 1.0).abs() < 0.001);
assert!(model.intercept().abs() < 0.001);
assert_eq!(model.predict(0), 0);
assert_eq!(model.predict(500_000), 500_000);
assert_eq!(model.predict(999_999), 999_999);
}
#[test]
fn test_cdfshop_sampling() {
let data: Vec<(i64, usize)> = (0..100_000).map(|i| (i as i64, i as usize)).collect();
let mut sampled_model = LinearModel::new();
let sample_size = (data.len() as f64).sqrt() as usize;
sampled_model.train_sampled(&data, sample_size);
let mut full_model = LinearModel::new();
full_model.train_full(&data);
assert!((sampled_model.slope() - full_model.slope()).abs() < 0.01);
assert!((sampled_model.intercept() - full_model.intercept()).abs() < 1.0);
for i in (0..100_000).step_by(10_000) {
let sampled_pred = sampled_model.predict(i);
let actual = i as usize;
let error = (sampled_pred as i64 - actual as i64).abs();
assert!(error < 100, "Error too high: {} for key {}", error, i);
}
}
#[test]
fn test_sampling_threshold() {
let small_data: Vec<(i64, usize)> = (0..5_000).map(|i| (i as i64, i as usize)).collect();
let mut model = LinearModel::new();
model.train(&small_data);
assert!((model.slope() - 1.0).abs() < 0.01);
assert!(model.intercept().abs() < 0.01);
let large_data: Vec<(i64, usize)> = (0..50_000).map(|i| (i as i64, i as usize)).collect();
let mut model2 = LinearModel::new();
model2.train(&large_data);
assert!((model2.slope() - 1.0).abs() < 0.01);
assert!(model2.intercept().abs() < 1.0);
}
}