//! Training-pipeline submodules together with online (incremental)
//! learners that update from one sample or mini-batch at a time via
//! `partial_fit`.
pub mod corpus;
pub mod cpt;
pub mod curriculum;
pub mod dam;
pub mod distillation;
pub mod distillation_advanced;
pub mod dpo;
pub mod drift;
pub mod eval_harness;
pub mod moe_construction;
pub mod orchestrator;
pub mod per_layer_merge;
pub mod rlvr;
pub mod tokenizer_surgery;
use crate::error::{AprenderError, Result};
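/// An estimator that learns incrementally from a stream of samples.
///
/// `x` in [`OnlineLearner::partial_fit`] is a flattened row-major batch:
/// `n_samples * n_features` values, with one target per sample in `y`.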
pub trait OnlineLearner {
/// Updates the model on a flattened row-major batch, optionally
/// overriding the decayed learning rate, and returns the mean loss.
fn partial_fit(&mut self, x: &[f64], y: &[f64], learning_rate: Option<f64>) -> Result<f64>;
/// Whether training can resume from previously learned weights.
fn supports_warm_start(&self) -> bool {
true
}
/// The decayed learning rate the next update would use by default.
fn current_learning_rate(&self) -> f64;
/// Total number of samples consumed so far.
fn n_samples_seen(&self) -> u64;
/// Restores the model to its freshly initialized state.
fn reset(&mut self);
}
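/// Extension trait for passive-aggressive learners, exposing the
/// aggressiveness parameter `C` that bounds the size of each update.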
pub trait PassiveAggressive: OnlineLearner {
fn aggressiveness(&self) -> f64;
fn set_aggressiveness(&mut self, c: f64);
}
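/// Hyperparameters shared by the online learners in this module.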
#[derive(Debug, Clone)]
pub struct OnlineLearnerConfig {
/// Base step size before any decay is applied.
pub learning_rate: f64,
/// Schedule that shrinks the step size as samples accumulate.
pub decay: LearningRateDecay,
/// L2 (ridge) penalty added to each weight gradient.
pub l2_reg: f64,
/// Momentum coefficient; not yet applied by the learners in this module.
pub momentum: f64,
/// If set, each per-weight gradient is clamped to `[-clip, clip]`.
pub gradient_clip: Option<f64>,
}
impl Default for OnlineLearnerConfig {
fn default() -> Self {
Self {
learning_rate: 0.01,
decay: LearningRateDecay::InverseSqrt,
l2_reg: 0.0,
momentum: 0.0,
gradient_clip: None,
}
}
}
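/// Schedule for shrinking the learning rate as samples accumulate. With
/// base rate `r` and sample count `t`: `Constant` keeps `r`,
/// `InverseSqrt` uses `r / sqrt(t)`, `Inverse` uses `r / t`,
/// `Step { decay_rate }` uses `r / (1 + decay_rate * t)`, and
/// `AdaGrad { epsilon }` keeps the base rate but rescales each weight's
/// step by its accumulated squared gradient.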
#[derive(Debug, Clone, Copy, PartialEq, Default)]
pub enum LearningRateDecay {
Constant,
#[default]
InverseSqrt,
Inverse,
Step { decay_rate: f64 },
AdaGrad { epsilon: f64 },
}
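/// Linear regression trained online by stochastic gradient descent on
/// the squared error; `partial_fit` returns the batch mean squared error.
///
/// # Examples
///
/// A minimal sketch; the `aprender::online` import path is an assumption
/// based on this file's layout:
///
/// ```ignore
/// use aprender::online::{OnlineLearner, OnlineLinearRegression};
///
/// // Stream the exact mapping y = 2x as repeated two-sample batches,
/// // overriding the decay schedule with a fixed step size.
/// let mut model = OnlineLinearRegression::new(1);
/// let mut loss = f64::INFINITY;
/// for _ in 0..500 {
///     loss = model
///         .partial_fit(&[1.0, 2.0], &[2.0, 4.0], Some(0.05))
///         .expect("shapes match");
/// }
/// assert!(loss < 1e-2);
/// ```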
#[derive(Debug, Clone)]
pub struct OnlineLinearRegression {
weights: Vec<f64>,
bias: f64,
// Per-weight squared-gradient accumulator for AdaGrad, seeded at 1e-8 so
// the scaled step never divides by zero.
accum_grad: Vec<f64>,
n_samples: u64,
config: OnlineLearnerConfig,
}
impl OnlineLinearRegression {
/// Creates a zero-initialized model for `n_features` inputs with the
/// default configuration.
#[must_use]
pub fn new(n_features: usize) -> Self {
Self {
weights: vec![0.0; n_features],
bias: 0.0,
accum_grad: vec![1e-8; n_features],
n_samples: 0,
config: OnlineLearnerConfig::default(),
}
}
/// Creates a zero-initialized model with a custom configuration.
#[must_use]
pub fn with_config(n_features: usize, config: OnlineLearnerConfig) -> Self {
Self {
weights: vec![0.0; n_features],
bias: 0.0,
accum_grad: vec![1e-8; n_features],
n_samples: 0,
config,
}
}
/// The learned weight vector.
#[must_use]
pub fn weights(&self) -> &[f64] {
&self.weights
}
/// The learned intercept term.
#[must_use]
pub fn bias(&self) -> f64 {
self.bias
}
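/// Predicts the target for one feature vector; errors if `x` does not
/// match the model's feature count.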
pub fn predict_one(&self, x: &[f64]) -> Result<f64> {
if x.len() != self.weights.len() {
return Err(AprenderError::dimension_mismatch(
"input features",
self.weights.len(),
x.len(),
));
}
let pred: f64 = x.iter().zip(&self.weights).map(|(xi, wi)| xi * wi).sum();
Ok(pred + self.bias)
}
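/// Decayed step size at the current sample count. `AdaGrad` returns the
/// base rate here; its per-coordinate scaling is applied in `partial_fit`.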
fn compute_lr(&self) -> f64 {
let base = self.config.learning_rate;
let t = self.n_samples.max(1) as f64;
match self.config.decay {
LearningRateDecay::InverseSqrt => base / t.sqrt(),
LearningRateDecay::Inverse => base / t,
LearningRateDecay::Step { decay_rate } => base / (1.0 + decay_rate * t),
LearningRateDecay::Constant | LearningRateDecay::AdaGrad { .. } => base,
}
}
}
impl OnlineLearner for OnlineLinearRegression {
fn partial_fit(&mut self, x: &[f64], y: &[f64], learning_rate: Option<f64>) -> Result<f64> {
if x.is_empty() || y.is_empty() {
return Err(AprenderError::empty_input("partial_fit input"));
}
let n_features = self.weights.len();
// `x` is a flattened row-major batch, so its length must be a whole
// number of feature vectors.
if !x.len().is_multiple_of(n_features) {
return Err(AprenderError::dimension_mismatch(
"input features",
n_features,
x.len(),
));
}
let n_samples = x.len() / n_features;
if n_samples != y.len() {
return Err(AprenderError::dimension_mismatch(
"samples in y",
n_samples,
y.len(),
));
}
let lr = learning_rate.unwrap_or_else(|| self.compute_lr());
let mut total_loss = 0.0;
for i in 0..n_samples {
let xi = &x[i * n_features..(i + 1) * n_features];
let yi = y[i];
let pred = self.predict_one(xi)?;
let error = pred - yi;
total_loss += error * error;
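// Per-weight gradient of the ½·error² objective: error * x_j, plus the
// L2 penalty; optionally clipped before the (possibly AdaGrad-scaled)
// step is applied. The bias update below uses the decayed rate directly.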
for (j, &xij) in xi.iter().enumerate() {
let grad = error * xij + self.config.l2_reg * self.weights[j];
let grad = if let Some(clip) = self.config.gradient_clip {
grad.clamp(-clip, clip)
} else {
grad
};
let effective_lr = match self.config.decay {
LearningRateDecay::AdaGrad { epsilon } => {
self.accum_grad[j] += grad * grad;
lr / (self.accum_grad[j].sqrt() + epsilon)
}
_ => lr,
};
self.weights[j] -= effective_lr * grad;
}
self.bias -= lr * error;
self.n_samples += 1;
}
Ok(total_loss / n_samples as f64)
}
fn current_learning_rate(&self) -> f64 {
self.compute_lr()
}
fn n_samples_seen(&self) -> u64 {
self.n_samples
}
fn reset(&mut self) {
self.weights.fill(0.0);
self.bias = 0.0;
self.accum_grad.fill(1e-8);
self.n_samples = 0;
}
}
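/// Binary logistic regression trained online by stochastic gradient
/// descent; `partial_fit` returns the batch mean log loss.
///
/// # Examples
///
/// A minimal sketch; the `aprender::online` import path is an assumption
/// based on this file's layout:
///
/// ```ignore
/// use aprender::online::{OnlineLearner, OnlineLogisticRegression};
///
/// // Separate x < 0 (label 0) from x > 0 (label 1) with a fixed step size.
/// let mut clf = OnlineLogisticRegression::new(1);
/// for _ in 0..500 {
///     clf.partial_fit(&[-1.0, 1.0], &[0.0, 1.0], Some(0.1))
///         .expect("shapes match");
/// }
/// assert!(clf.predict_proba_one(&[2.0]).expect("shapes match") > 0.9);
/// ```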
#[derive(Debug, Clone)]
pub struct OnlineLogisticRegression {
weights: Vec<f64>,
bias: f64,
// AdaGrad accumulator, seeded at 1e-8 to avoid division by zero.
accum_grad: Vec<f64>,
n_samples: u64,
config: OnlineLearnerConfig,
}
impl OnlineLogisticRegression {
/// Creates a zero-initialized classifier for `n_features` inputs with
/// the default configuration.
#[must_use]
pub fn new(n_features: usize) -> Self {
Self {
weights: vec![0.0; n_features],
bias: 0.0,
accum_grad: vec![1e-8; n_features],
n_samples: 0,
config: OnlineLearnerConfig::default(),
}
}
/// Creates a zero-initialized classifier with a custom configuration.
#[must_use]
pub fn with_config(n_features: usize, config: OnlineLearnerConfig) -> Self {
Self {
weights: vec![0.0; n_features],
bias: 0.0,
accum_grad: vec![1e-8; n_features],
n_samples: 0,
config,
}
}
/// The learned weight vector.
#[must_use]
pub fn weights(&self) -> &[f64] {
&self.weights
}
/// The learned intercept term.
#[must_use]
pub fn bias(&self) -> f64 {
self.bias
}
fn sigmoid(z: f64) -> f64 {
crate::nn::functional::sigmoid_scalar_f64(z)
}
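/// Predicts `P(y = 1)` for one feature vector; errors if `x` does not
/// match the model's feature count.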
pub fn predict_proba_one(&self, x: &[f64]) -> Result<f64> {
if x.len() != self.weights.len() {
return Err(AprenderError::dimension_mismatch(
"input features",
self.weights.len(),
x.len(),
));
}
let logit: f64 = x.iter().zip(&self.weights).map(|(xi, wi)| xi * wi).sum();
Ok(Self::sigmoid(logit + self.bias))
}
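/// Decayed step size at the current sample count. `AdaGrad` returns the
/// base rate here; its per-coordinate scaling is applied in `partial_fit`.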
fn compute_lr(&self) -> f64 {
let base = self.config.learning_rate;
let t = self.n_samples.max(1) as f64;
match self.config.decay {
LearningRateDecay::InverseSqrt => base / t.sqrt(),
LearningRateDecay::Inverse => base / t,
LearningRateDecay::Step { decay_rate } => base / (1.0 + decay_rate * t),
LearningRateDecay::Constant | LearningRateDecay::AdaGrad { .. } => base,
}
}
}
impl OnlineLearner for OnlineLogisticRegression {
fn partial_fit(&mut self, x: &[f64], y: &[f64], learning_rate: Option<f64>) -> Result<f64> {
if x.is_empty() || y.is_empty() {
return Err(AprenderError::empty_input("partial_fit input"));
}
let n_features = self.weights.len();
// `x` is a flattened row-major batch, so its length must be a whole
// number of feature vectors.
if !x.len().is_multiple_of(n_features) {
return Err(AprenderError::dimension_mismatch(
"input features",
n_features,
x.len(),
));
}
let n_samples = x.len() / n_features;
if n_samples != y.len() {
return Err(AprenderError::dimension_mismatch(
"samples in y",
n_samples,
y.len(),
));
}
let lr = learning_rate.unwrap_or_else(|| self.compute_lr());
let mut total_loss = 0.0;
for i in 0..n_samples {
let xi = &x[i * n_features..(i + 1) * n_features];
let yi = y[i];
let pred = self.predict_proba_one(xi)?;
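// Clip probabilities away from 0 and 1 so the log terms stay finite.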
let eps = 1e-15;
let pred_clipped = pred.clamp(eps, 1.0 - eps);
total_loss += -yi * pred_clipped.ln() - (1.0 - yi) * (1.0 - pred_clipped).ln();
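// For the cross-entropy loss through a sigmoid, d(loss)/d(logit) = p - y,
// so the weight updates mirror the linear-regression case.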
let error = pred - yi;
for (j, &xij) in xi.iter().enumerate() {
let grad = error * xij + self.config.l2_reg * self.weights[j];
let grad = if let Some(clip) = self.config.gradient_clip {
grad.clamp(-clip, clip)
} else {
grad
};
let effective_lr = match self.config.decay {
LearningRateDecay::AdaGrad { epsilon } => {
self.accum_grad[j] += grad * grad;
lr / (self.accum_grad[j].sqrt() + epsilon)
}
_ => lr,
};
self.weights[j] -= effective_lr * grad;
}
self.bias -= lr * error;
self.n_samples += 1;
}
Ok(total_loss / n_samples as f64)
}
fn current_learning_rate(&self) -> f64 {
self.compute_lr()
}
fn n_samples_seen(&self) -> u64 {
self.n_samples
}
fn reset(&mut self) {
self.weights.fill(0.0);
self.bias = 0.0;
self.accum_grad.fill(1e-8);
self.n_samples = 0;
}
}
#[cfg(test)]
#[path = "online_tests.rs"]
mod tests;