rs_ml/lib.rs

//! rs-ml is a simple ML framework for the Rust language. It includes train/test splitting,
//! scalers, and a Gaussian naive Bayes model. It also provides traits for adding more
//! transformers and models to the framework.
//!
//! # Usage
//!
//! This library requires a compute backend to perform matrix operations. Compute backends are
//! exposed through the provided feature flags. Refer to the
//! [ndarray_linalg](https://github.com/rust-ndarray/ndarray-linalg?tab=readme-ov-file#backend-features)
//! docs for more information.
#![deny(
    missing_docs,
    unsafe_code,
    missing_debug_implementations,
    missing_copy_implementations,
    clippy::missing_panics_doc
)]

use std::ops::{Add, Div, Mul};

use classification::ClassificationDataSet;

pub mod classification;
pub mod metrics;
pub mod regression;
pub mod transformer;
/// Trait for fitting classification and regression models, and transformers.
///
/// The struct on which this trait is implemented holds and validates the hyperparameters necessary
/// to fit the estimator to the desired output. For example, a classification model may take as
/// input a tuple with features and labels:
/// ```
/// use ndarray::{Array1, Array2};
/// use rs_ml::Estimator;
///
/// struct ModelParameters {
///     // Hyperparameters required to fit the model
///     learning_rate: f64
/// }
///
/// struct Model {
///     // Internal state of model required to predict features
///     means: Array2<f64>
/// }
///
/// impl Estimator<(Array2<f64>, Array1<String>)> for ModelParameters {
///     type Estimator = Model;
///
///     fn fit(&self, input: &(Array2<f64>, Array1<String>)) -> Option<Self::Estimator> {
///         let (features, labels) = input;
///
///         // Logic to fit the model
///         Some(Model {
///             means: Array2::zeros((1, 1))
///         })
///     }
/// }
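///
/// // A minimal sketch of how the trait is called once implemented; the values below are
/// // placeholders for illustration only, not part of the library API.
/// let params = ModelParameters { learning_rate: 0.1 };
/// let features = Array2::<f64>::zeros((4, 2));
/// let labels = Array1::from(vec!["a".to_string(); 4]);
/// let fitted: Option<Model> = params.fit(&(features, labels));
/// assert!(fitted.is_some());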
/// ```
pub trait Estimator<Input> {
    /// Output model or transformer fitted to input data.
    type Estimator;

    /// Fit a model or transformer to the given input, returning `None` if the estimator could
    /// not be fitted to the input data as expected.
    fn fit(&self, input: &Input) -> Option<Self::Estimator>;
}

/// Train/test split result. Fields are, in order: training features, testing features,
/// training labels, and testing labels.
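///
/// For example, a split can be constructed and destructured positionally (the values here are
/// toy data purely to illustrate the field order):
/// ```
/// use rs_ml::SplitDataset;
///
/// let split = SplitDataset(vec![[1.0, 2.0]], vec![[3.0, 4.0]], vec![1], vec![0]);
/// let SplitDataset(train_features, test_features, train_labels, test_labels) = split;
///
/// assert_eq!(train_features.len(), 1);
/// assert_eq!(test_features.len(), 1);
/// assert_eq!(train_labels, vec![1]);
/// assert_eq!(test_labels, vec![0]);
/// ```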
#[derive(Debug, Clone)]
pub struct SplitDataset<Feature, Label>(
    pub Vec<Feature>,
    pub Vec<Feature>,
    pub Vec<Label>,
    pub Vec<Label>,
);

/// Split a dataset into training and testing sets. `test_size` is the expected fraction of
/// records assigned to the test set and must be between 0 and 1.
///
/// # Panics
///
/// Panics if `test_size` is outside the range 0..=1.
///
/// Example:
/// ```
/// use rs_ml::train_test_split;
/// use rs_ml::classification::ClassificationDataSet;
/// use ndarray::arr2;
///
/// let features = arr2(&[
///   [1., 0.],
///   [0., 1.],
///   [0., 0.],
///   [1., 1.]]);
///
/// let labels = vec![1, 1, 0, 0];
///
/// let dataset = ClassificationDataSet::from(
///   features.rows().into_iter().zip(labels));
///
/// let (train, test) = train_test_split(dataset, 0.25);
/// ```
pub fn train_test_split<Feature, Label>(
    dataset: ClassificationDataSet<Feature, Label>,
    test_size: f64,
) -> (
    ClassificationDataSet<Feature, Label>,
    ClassificationDataSet<Feature, Label>,
) {
    // `rand::random_bool(test_size)` is true with probability `test_size`, so records for which
    // the predicate holds go into the test partition.
    let (test, train): (Vec<_>, Vec<_>) = dataset
        .consume_records()
        .into_iter()
        .partition(|_| rand::random_bool(test_size));

    (
        ClassificationDataSet::from(train),
        ClassificationDataSet::from(test),
    )
}

/// Computes the mean of an iterator in a single pass using the running-mean recurrence
/// `mean_n = mean_{n-1} * (n - 1) / n + x_n / n`, avoiding a large intermediate sum.
/// Returns `None` if the iterator is empty.
fn iterative_mean<I, F, R>(it: I) -> Option<R>
where
    I: IntoIterator<Item = F>,
    F: Into<R>,
    R: Div<f64, Output = R> + Mul<f64, Output = R> + Add<Output = R> + Default,
{
    it.into_iter().enumerate().fold(None, |acc, (i, curr)| {
        let idx = i as f64;
        let idx_inc_1 = (i + 1) as f64;

        let current: R = curr.into();
        let scaled_current = current / idx_inc_1;

        match acc {
            // Rescale the mean of the first `i` elements and add the new element's contribution.
            Some(acc) => Some(acc * (idx / idx_inc_1) + scaled_current),
            // First element: the mean is the element itself (divided by one).
            None => Some(scaled_current),
        }
    })
}

#[cfg(test)]
mod tests {
    use ndarray::{arr1, arr2, Array1};

    use crate::iterative_mean;

    #[test]
    fn test_iterative_mean_2darray() {
        let arr = arr2(&[[0., 1., 2.], [1., 2., 3.], [2., 3., 4.]]);

        let mean: Option<Array1<f64>> =
            iterative_mean(arr.rows().into_iter().map(|row| row.to_owned()));

        assert!(mean.is_some_and(|m| m.relative_eq(&arr1(&[1.0, 2.0, 3.0]), 1e-4, 1e-2)));
    }

    #[test]
    fn test_iterative_mean_vec() {
        let arr: Vec<f64> = vec![0., 1., 2., 3., 4.];

        let mean = iterative_mean(arr);

        assert_eq!(mean, Some(2.0));
    }
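
    // Sketch of a sanity check for `train_test_split`: the split is random, but the total
    // number of records should be preserved. Assumes `ClassificationDataSet::from` accepts a
    // `(feature, label)` iterator and that `consume_records` yields the records back, as in the
    // `train_test_split` doc example above.
    #[test]
    fn test_train_test_split_preserves_record_count() {
        use crate::classification::ClassificationDataSet;
        use crate::train_test_split;

        let features = arr2(&[[1., 0.], [0., 1.], [0., 0.], [1., 1.]]);
        let labels = vec![1, 1, 0, 0];

        let dataset = ClassificationDataSet::from(features.rows().into_iter().zip(labels));
        let (train, test) = train_test_split(dataset, 0.25);

        let n_train = train.consume_records().into_iter().count();
        let n_test = test.consume_records().into_iter().count();

        assert_eq!(n_train + n_test, 4);
    }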
}