local_outlier_probabilities/
lib.rs

1// Include readme in doc
2#![doc = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/Readme.md"))]
3
4pub mod error;
5
6use std::fmt::Debug;
7use std::iter::Sum;
8use std::ops::AddAssign;
9
10use kdtree::KdTree;
11use ndarray::{Array1, Array2, ScalarOperand};
12use num_traits::{Float, FloatConst};
13
14use crate::error::Error;
15
/// Type alias for a distance function operating on two points given as
/// coordinate slices.
///
/// Due to the kdtree implementation we cannot return an `Error` in case the
/// slice dimensions do not match; implementations are expected to check the
/// lengths themselves (e.g. via `debug_assert_eq!`).
///
#[allow(type_alias_bounds)]
pub type DistanceFn<T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug> =
    fn(&[T], &[T]) -> T;
22
23/// Manhattan distance
24///
25/// # Arguments
26/// * `a` - First point
27/// * `b` - Second point
28///
29pub fn manhattan<T>(a: &[T], b: &[T]) -> T
30where
31    T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
32{
33    debug_assert_eq!(a.len(), b.len());
34    a.iter()
35        .zip(b.iter())
36        .map(|(x, y)| (*x - *y).abs())
37        .fold(T::zero(), ::std::ops::Add::add)
38}
39
40/// Euclidean distance
41///
42/// # Arguments
43/// * `a` - First point
44/// * `b` - Second point
45///
46pub fn euclidean<T>(a: &[T], b: &[T]) -> T
47where
48    T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
49{
50    debug_assert_eq!(a.len(), b.len());
51    a.iter()
52        .zip(b.iter())
53        .map(|(x, y)| (*x - *y).powi(2))
54        .fold(T::zero(), ::std::ops::Add::add)
55        .sqrt()
56}
57
58/// Local Outlier Probability (LoOP) according to
59/// > Kriegel, H.-P.; Kröger, P.; Schubert, E. & Zimek, A. (2009),
60/// > LoOP: local outlier probabilities.,
61/// > in David Wai-Lok Cheung; Il-Yeol Song; Wesley W. Chu; Xiaohua Hu & Jimmy Lin, ed.,
62/// > 'CIKM' , ACM, , pp. 1649-1652 .
63///
64/// # Arguments
65/// * `data` - 2D array of data points N x M (N rows & M columns).
66///   **It is important to note that the data is expected to be in contiguous memory.**
67/// * `k` - Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug of nearest neighbors to consider
68/// * `lambda` - Scaling factor for the probabilistic set distance & Probabilistic Local Outlier (PLod)
69/// * `distance_fn` - Optional distance function to use (default is Manhattan distance)
70///
71pub fn local_outlier_probabilities<T>(
72    data: Array2<T>,
73    k: usize,
74    lambda: u8,
75    distance_fn: Option<DistanceFn<T>>,
76) -> Result<Array1<T>, Error>
77where
78    T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
79{
80    // Unwrap the distance function or use the default
81    let distance_fn = distance_fn.unwrap_or(manhattan);
82
83    // Build the KDTree for finding nearest neighbors
84    let mut tree = KdTree::new(data.shape()[1]);
85
86    // Get and array of references to the underlying data in the array
87    // and add them to the KDTree. This should safe some memory on larger features arrays
88    let point_refs = data
89        .outer_iter()
90        .map(|point| {
91            point
92                .to_slice()
93                .ok_or(Error::SliceNotContiguous("collecting points references"))
94        })
95        .collect::<Result<Vec<_>, Error>>()?;
96
97    for (idx, point) in point_refs.iter().enumerate() {
98        tree.add(*point, idx)?;
99    }
100
101    // Build the neighbor list. We need to manually remove the queried point from the list of neighbors
102    // therefore the map()-part is a bit more complicated and we increase the `k by 1
103    let neighbors_list = point_refs
104        .iter()
105        .enumerate()
106        .map(|(point_idx, point)| {
107            match tree.nearest(point, k + 1, &distance_fn) {
108                Ok(neighbors) => {
109                    // kdtree::nearest includes the point itself, so we need to add 1 to k and remove the point
110                    let filtered = neighbors
111                        .into_iter()
112                        .filter(|neighbor| {
113                            // filter out the point itself
114                            *neighbor.1 != point_idx
115                        })
116                        .collect::<Vec<_>>();
117                    Ok(filtered)
118                }
119                Err(err) => Err(Error::KdTreeError(err)),
120            }
121        })
122        .collect::<Result<Vec<Vec<(T, &usize)>>, Error>>()?;
123
124    // Let's prepare some to Floats to work with
125    let k_float = T::from(k).ok_or(Error::KCastingError("k".to_string()))?;
126    let lambda_float = T::from(lambda).ok_or(Error::KCastingError("lambda".to_string()))?;
127    let two_squared = T::from(2.0).ok_or(Error::KCastingError("2.0".to_string()))?;
128
129    // Calculate the probabilistic distance for each point
130    let pdists = neighbors_list
131        .iter()
132        .map(|neighbors| calc_sigma(neighbors, k_float))
133        .collect::<Array1<_>>()
134        * lambda_float;
135
136    // Calculate the Probabilistic Outlier Factor for each point
137    let plofs = neighbors_list
138        .iter()
139        .zip(pdists.iter())
140        .map(|(nearest_neighbors, pdist)| calc_plof(nearest_neighbors, *pdist, &pdists))
141        .collect::<Array1<_>>();
142
143    // Aggregate the Probabilistic Outlier Factor (nPLOF)
144    let nplof = calc_nplof(&plofs, lambda_float);
145
146    // Calculate the local outlier probability
147    let local_outlier_prob =
148        (plofs / (nplof * two_squared.sqrt())).map(|x| erf_approx(*x).max(T::zero()));
149
150    Ok(local_outlier_prob)
151}
152
153/// Calculate the sigma used in the probabilistic distance function
154///
155/// # Arguments
156/// * `neighbors` - A slice of tuples containing the distance and index of the neighbors
157/// * `k` - Number of neighbors to consider
158///
159fn calc_sigma<T>(neighbors: &[(T, &usize)], k: T) -> T
160where
161    T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
162{
163    (neighbors.iter().map(|(dist, _)| dist.powi(2)).sum::<T>() / k).sqrt()
164}
165
166/// Probabilistic Outlier Factor (PLOF) for a point
167///
168/// # Arguments
169/// * `neighbors` - A slice of tuples containing the distance and index of the neighbors
170/// * `pdist` - Probabilistic distance of the point
171/// * `pdists` - Probabilistic distance of the points
172///
173fn calc_plof<T>(nearest_neighbors: &[(T, &usize)], pdist: T, pdists: &Array1<T>) -> T
174where
175    T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
176{
177    let nn_mean = nearest_neighbors
178        .iter()
179        .map(|(_, idx)| pdists[**idx])
180        .sum::<T>()
181        / T::from(nearest_neighbors.len()).unwrap();
182
183    pdist / nn_mean - T::one()
184}
185
186/// Aggregate Probabilistic Outlier Factor (nPLOF)
187///
188/// # Arguments
189/// * `plofs` - Array of PLOF values
190///
191fn calc_nplof<T>(plofs: &Array1<T>, lambda: T) -> T
192where
193    T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
194{
195    let plofs_squared_mean =
196        plofs.iter().map(|x| x.powi(2)).sum::<T>() / T::from(plofs.len()).unwrap();
197
198    lambda * plofs_squared_mean.sqrt()
199}
200
201/// Approximate the error function (erf) according to
202/// > Abramowitz, Milton, and Irene A. Stegun, eds.
203/// > Handbook of mathematical functions with formulas, graphs, and mathematical tables. Vol. 55. US Government printing office, 1948.
204/// > Equation 7.1.25
205///
206/// # Arguments
207/// * `x` - Input value
208///
209fn erf_approx<T>(x: T) -> T
210where
211    T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
212{
213    T::one()
214        - T::one()
215            / (T::one()
216                + T::from(0.278393).unwrap() * x
217                + T::from(0.230389).unwrap() * x.powi(2)
218                + T::from(0.000972).unwrap() * x.powi(3)
219                + T::from(0.078108).unwrap() * x.powi(4))
220            .powi(4)
221}
222
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;
    use ndarray_stats::DeviationExt;
    use polars::prelude::*;

    #[test]
    fn test_loop() {
        // The fixture holds peptide spectrum matches from a proteomics-MS experiment
        // together with a reference LoOP score precomputed by PyNomaly, based on the
        // features xcorr, ions_matched_ratio and mass_diff, using lambda=3 (default)
        // and k=1000
        let psm_path = PathBuf::from("test_files/scored_psms.tsv");

        // Load the tab-separated fixture into a DataFrame
        let psm_df = CsvReadOptions::default()
            .with_has_header(true)
            .with_parse_options(CsvParseOptions::default().with_separator(b'\t'))
            .try_into_reader_with_file_path(Some(psm_path))
            .unwrap()
            .finish()
            .unwrap();

        // Project onto the feature columns and convert to a 2D float array in
        // C-order, since the data must be in contiguous memory
        let features = psm_df
            .select(["xcorr", "ions_matched_ratio", "mass_diff"])
            .unwrap()
            .to_ndarray::<Float64Type>(IndexOrder::C)
            .unwrap();

        // Calculate the local outlier probabilities with the default distance
        let loop_score = local_outlier_probabilities(features, 1000, 3, None).unwrap();

        assert_eq!(loop_score.len(), psm_df.height());

        // Reference scores produced by PyNomaly
        let reference_scores = psm_df
            .column("loop_score")
            .unwrap()
            .f64()
            .unwrap()
            .into_no_null_iter()
            .collect::<Array1<_>>();

        // RMSE under 0.02 should be good enough
        let rmse = loop_score.root_mean_sq_err(&reference_scores).unwrap();
        assert!(rmse < 0.02, "RMSE > 0.02");
    }
}
273}