rs_ml/lib.rs
//! rs-ml is a simple ML framework for the Rust language. It includes train/test splitting,
//! scalers, and a Gaussian naive Bayes model. It also includes traits to add more transformers and
//! models to the framework.
4//!
5//! # Usage
6//!
7//! This library requires a compute backend to perform matrix operations. Compute backends are
8//! exposed with provided feature flags. Refer to the
9//! [ndarray_linalg](https://github.com/rust-ndarray/ndarray-linalg?tab=readme-ov-file#backend-features)
10//! docs for more information.
11#![deny(
12 missing_docs,
13 unsafe_code,
14 missing_debug_implementations,
15 missing_copy_implementations,
16 clippy::missing_panics_doc
17)]
18
19use std::ops::Add;
20use std::ops::Div;
21use std::ops::Mul;
22
23use classification::ClassificationDataSet;
24use ndarray::Axis;
25
26pub mod classification;
27pub mod dimensionality_reduction;
28pub mod metrics;
29pub mod regression;
30pub mod transformer;
31
/// Trait for fitting classification and regression models, and transformers.
///
/// The type implementing this trait holds and validates the hyperparameters needed for fitting.
/// Calling `fit` does not mutate those parameters; it produces a separate fitted value (the
/// associated type `Estimator`) that carries whatever state was learned from the input.
///
/// `Input` is whatever the estimator learns from. For example, a classification model may take
/// a tuple of features and labels:
///
/// # Examples
///
/// ```
/// use ndarray::{Array1, Array2};
/// use rs_ml::Estimator;
///
/// struct ModelParameters {
///     // Hyperparameters required to fit the model
///     learning_rate: f64,
/// }
///
/// struct Model {
///     // Internal state of model required to predict features
///     means: Array2<f64>,
/// }
///
/// impl Estimator<(Array2<f64>, Array1<String>)> for ModelParameters {
///     type Estimator = Model;
///
///     fn fit(&self, input: &(Array2<f64>, Array1<String>)) -> Option<Self::Estimator> {
///         let (_features, _labels) = input;
///
///         // logic to fit the model
///         Some(Model {
///             means: Array2::zeros((1, 1)),
///         })
///     }
/// }
/// ```
pub trait Estimator<Input> {
    /// Fitted model or transformer produced by a successful call to `fit`.
    type Estimator;

    /// Fits a model or transformer to `input`.
    ///
    /// Returns `Some` with the fitted value, or `None` if the estimator was not able to fit to
    /// the input data as expected.
    fn fit(&self, input: &Input) -> Option<Self::Estimator>;
}
72
/// Result of a train/test split, stored as four positional vectors.
///
/// Field order: training features, testing features, training labels, testing labels.
#[derive(Debug, Clone)]
pub struct SplitDataset<Feature, Label>(
    /// Training features.
    pub Vec<Feature>,
    /// Testing features.
    pub Vec<Feature>,
    /// Training labels.
    pub Vec<Label>,
    /// Testing labels.
    pub Vec<Label>,
);
82
83/// Split data and features into training and testing set. `test_size` must be between 0 and 1.
84///
85/// # Panics
86///
87/// Panics if `test_size` is outside range 0..=1.
88///
89/// Example:
90/// ```
91/// use rs_ml::{train_test_split};
92/// use rs_ml::classification::ClassificationDataSet;
93/// use ndarray::{arr1, arr2};
94///
95/// let features = arr2(&[
96/// [1., 0.],
97/// [0., 1.],
98/// [0., 0.],
99/// [1., 1.]]);
100///
101/// let labels = vec![1, 1, 0, 0];
102///
103/// let dataset = ClassificationDataSet::from(
104/// features.rows().into_iter().zip(labels));
105///
106/// let (train, test) = train_test_split(dataset, 0.25);
107/// ```
108pub fn train_test_split<Feature, Label>(
109 dataset: ClassificationDataSet<Feature, Label>,
110 test_size: f64,
111) -> (
112 ClassificationDataSet<Feature, Label>,
113 ClassificationDataSet<Feature, Label>,
114) {
115 let (train, test): (Vec<_>, Vec<_>) = dataset
116 .consume_records()
117 .into_iter()
118 .partition(|_| rand::random_bool(test_size));
119
120 (
121 ClassificationDataSet::from(train),
122 ClassificationDataSet::from(test),
123 )
124}
125
/// Computes the arithmetic mean of an iterator in a single pass, without summing first.
///
/// After `k` items the running mean is rescaled by `(k - 1) / k` and the new item, divided by
/// `k`, is added to it. Each item is converted into the accumulator type `R` via `Into`.
///
/// Returns `None` when the iterator yields no items.
fn iterative_mean<I, F, R>(it: I) -> Option<R>
where
    I: IntoIterator<Item = F>,
    F: Into<R>,
    R: Div<f64, Output = R> + Mul<f64, Output = R> + Add<Output = R> + Default,
{
    let mut running: Option<R> = None;

    for (seen, item) in it.into_iter().enumerate() {
        // `seen` items are already folded into the mean; this one makes `seen + 1`.
        let previous_count = seen as f64;
        let new_count = (seen + 1) as f64;

        let value: R = item.into();
        let contribution = value / new_count;

        running = Some(match running {
            // First item: the mean is just the item itself (divided by 1).
            None => contribution,
            // Shrink the old mean to its share of the new count, then add the newcomer's share.
            Some(mean) => mean * (previous_count / new_count) + contribution,
        });
    }

    running
}
145
#[cfg(test)]
mod tests {
    use ndarray::{arr1, arr2, Array1};

    use crate::iterative_mean;

    /// Averaging the rows of a 3x3 matrix yields the per-column mean.
    #[test]
    fn test_iterative_mean_2darray() {
        let matrix = arr2(&[[0., 1., 2.], [1., 2., 3.], [2., 3., 4.]]);
        let expected = arr1(&[1.0, 2.0, 3.0]);

        // Rows are borrowed views; copy each into an owned array so it can be accumulated.
        let owned_rows = matrix.rows().into_iter().map(|row| row.to_owned());
        let mean: Option<Array1<f64>> = iterative_mean(owned_rows);

        assert!(matches!(mean, Some(ref m) if m.relative_eq(&expected, 1e-4, 1e-2)));
    }

    /// Averaging plain `f64` values from a `Vec` yields their scalar mean.
    #[test]
    fn test_iterative_mean_vec() {
        let samples: Vec<f64> = vec![0., 1., 2., 3., 4.];

        let mean: Option<f64> = iterative_mean(samples);

        assert_eq!(mean, Some(2.0));
    }
}