local_outlier_probabilities/
lib.rs1#![doc = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/Readme.md"))]
3
4pub mod error;
5
6use std::fmt::Debug;
7use std::iter::Sum;
8use std::ops::AddAssign;
9
10use kdtree::KdTree;
11use ndarray::{Array1, Array2, ScalarOperand};
12use num_traits::{Float, FloatConst};
13
14use crate::error::Error;
15
16#[allow(type_alias_bounds)]
20pub type DistanceFn<T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug> =
21 fn(&[T], &[T]) -> T;
22
23pub fn manhattan<T>(a: &[T], b: &[T]) -> T
30where
31 T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
32{
33 debug_assert_eq!(a.len(), b.len());
34 a.iter()
35 .zip(b.iter())
36 .map(|(x, y)| (*x - *y).abs())
37 .fold(T::zero(), ::std::ops::Add::add)
38}
39
40pub fn euclidean<T>(a: &[T], b: &[T]) -> T
47where
48 T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
49{
50 debug_assert_eq!(a.len(), b.len());
51 a.iter()
52 .zip(b.iter())
53 .map(|(x, y)| (*x - *y).powi(2))
54 .fold(T::zero(), ::std::ops::Add::add)
55 .sqrt()
56}
57
58pub fn local_outlier_probabilities<T>(
72 data: Array2<T>,
73 k: usize,
74 lambda: u8,
75 distance_fn: Option<DistanceFn<T>>,
76) -> Result<Array1<T>, Error>
77where
78 T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
79{
80 let distance_fn = distance_fn.unwrap_or(manhattan);
82
83 let mut tree = KdTree::new(data.shape()[1]);
85
86 let point_refs = data
89 .outer_iter()
90 .map(|point| {
91 point
92 .to_slice()
93 .ok_or(Error::SliceNotContiguous("collecting points references"))
94 })
95 .collect::<Result<Vec<_>, Error>>()?;
96
97 for (idx, point) in point_refs.iter().enumerate() {
98 tree.add(*point, idx)?;
99 }
100
101 let neighbors_list = point_refs
104 .iter()
105 .enumerate()
106 .map(|(point_idx, point)| {
107 match tree.nearest(point, k + 1, &distance_fn) {
108 Ok(neighbors) => {
109 let filtered = neighbors
111 .into_iter()
112 .filter(|neighbor| {
113 *neighbor.1 != point_idx
115 })
116 .collect::<Vec<_>>();
117 Ok(filtered)
118 }
119 Err(err) => Err(Error::KdTreeError(err)),
120 }
121 })
122 .collect::<Result<Vec<Vec<(T, &usize)>>, Error>>()?;
123
124 let k_float = T::from(k).ok_or(Error::KCastingError("k".to_string()))?;
126 let lambda_float = T::from(lambda).ok_or(Error::KCastingError("lambda".to_string()))?;
127 let two_squared = T::from(2.0).ok_or(Error::KCastingError("2.0".to_string()))?;
128
129 let pdists = neighbors_list
131 .iter()
132 .map(|neighbors| calc_sigma(neighbors, k_float))
133 .collect::<Array1<_>>()
134 * lambda_float;
135
136 let plofs = neighbors_list
138 .iter()
139 .zip(pdists.iter())
140 .map(|(nearest_neighbors, pdist)| calc_plof(nearest_neighbors, *pdist, &pdists))
141 .collect::<Array1<_>>();
142
143 let nplof = calc_nplof(&plofs, lambda_float);
145
146 let local_outlier_prob =
148 (plofs / (nplof * two_squared.sqrt())).map(|x| erf_approx(*x).max(T::zero()));
149
150 Ok(local_outlier_prob)
151}
152
153fn calc_sigma<T>(neighbors: &[(T, &usize)], k: T) -> T
160where
161 T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
162{
163 (neighbors.iter().map(|(dist, _)| dist.powi(2)).sum::<T>() / k).sqrt()
164}
165
166fn calc_plof<T>(nearest_neighbors: &[(T, &usize)], pdist: T, pdists: &Array1<T>) -> T
174where
175 T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
176{
177 let nn_mean = nearest_neighbors
178 .iter()
179 .map(|(_, idx)| pdists[**idx])
180 .sum::<T>()
181 / T::from(nearest_neighbors.len()).unwrap();
182
183 pdist / nn_mean - T::one()
184}
185
186fn calc_nplof<T>(plofs: &Array1<T>, lambda: T) -> T
192where
193 T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
194{
195 let plofs_squared_mean =
196 plofs.iter().map(|x| x.powi(2)).sum::<T>() / T::from(plofs.len()).unwrap();
197
198 lambda * plofs_squared_mean.sqrt()
199}
200
201fn erf_approx<T>(x: T) -> T
210where
211 T: Float + FloatConst + ScalarOperand + AddAssign + Sum + Debug,
212{
213 T::one()
214 - T::one()
215 / (T::one()
216 + T::from(0.278393).unwrap() * x
217 + T::from(0.230389).unwrap() * x.powi(2)
218 + T::from(0.000972).unwrap() * x.powi(3)
219 + T::from(0.078108).unwrap() * x.powi(4))
220 .powi(4)
221}
222
223#[cfg(test)]
224mod tests {
225 use std::path::PathBuf;
226
227 use super::*;
228 use ndarray_stats::DeviationExt;
229 use polars::prelude::*;
230
231 #[test]
232 fn test_loop() {
233 let df_file_path = PathBuf::from("test_files/scored_psms.tsv");
236
237 let df = CsvReadOptions::default()
239 .with_has_header(true)
240 .with_parse_options(CsvParseOptions::default().with_separator(b'\t'))
241 .try_into_reader_with_file_path(Some(df_file_path))
242 .unwrap()
243 .finish()
244 .unwrap();
245
246 let feature_df = df
248 .select(["xcorr", "ions_matched_ratio", "mass_diff"])
249 .unwrap();
250
251 let array = feature_df.to_ndarray::<Float64Type>(IndexOrder::C).unwrap();
254
255 let loop_score = local_outlier_probabilities(array, 1000, 3, None).unwrap();
257
258 assert_eq!(loop_score.len(), df.height());
259
260 let loop_score_py = df
261 .column("loop_score")
262 .unwrap()
263 .f64()
264 .unwrap()
265 .into_no_null_iter()
266 .collect::<Array1<_>>();
267
268 let rmse = loop_score.root_mean_sq_err(&loop_score_py).unwrap();
269
270 assert!(rmse < 0.02, "RMSE > 0.02");
272 }
273}