geo 0.23.1

Geospatial primitives and algorithms
Documentation
use std::iter::Sum;
use std::ops::RangeInclusive;

use crate::{GeoFloat, MultiPoint, Point};

use rstar::primitives::GeomWithData;
use rstar::RTree;

/// Calculate the [Local Outlier Factor](https://en.wikipedia.org/wiki/Local_outlier_factor) of a set of points
///
/// Based on: Breunig, M., Kriegel, H., Ng, R., and Sander, J. (2000). *LOF: identifying density-based local
/// outliers.* In ACM Int. Conf. on Management of Data, pages 93-104. doi: [10.1145/335191.335388](https://doi.org/10.1145/335191.335388)
///
/// LOF is an unsupervised algorithm that uses local data for anomaly detection.
///
/// Outlier vs inlier classification is **highly dependent** on the shape of the data. LOF values <= 1
/// can generally be considered inliers, but e.g. a highly concentrated, uniform dataset could result in
/// points with a LOF of 1.05 being outliers.
/// LOF scores should thus be evaluated in the context of the dataset as a whole in order to classify outliers.
///
/// If you wish to run multiple outlier detection processes with differing neighbour counts in order
/// to build up data for more robust detection (see p. 100-1 above), you can use the [`OutlierDetection::prepared_detector`] method, which retains
/// the spatial index and point set between runs for greater efficiency. The [`OutlierDetection::generate_ensemble`] method
/// will efficiently run the LOF algorithm over a contiguous range of neighbour inputs,
/// allowing aggregations to be carried out over the resulting data.
pub trait OutlierDetection<T>
where
    T: GeoFloat,
{
    /// The LOF algorithm. `k_neighbours` specifies the number of neighbours to use for local outlier
    /// classification. The paper linked above (see p. 100) suggests a `k_neighbours` value of 10 - 20
    /// as a lower bound for "real-world"
    /// data.
    ///
    /// # Note on Erroneous Input
    /// If `k_neighbours` >= points in the set, or `k_neighbours` < 1, all input points will be returned with an LOF score of 1.
    /// If there are at least `k_neighbours` duplicate points of an input point, LOF scores can be `∞` or `NaN`.
    /// It is thus advisable to **deduplicate** or otherwise ensure the uniqueness of the input points.
    ///
    /// # Note on Returned Points
    /// Outlier scores are always returned corresponding to input point order
    ///
    /// # Examples
    ///
    /// ## MultiPoint
    ///
    /// ```
    /// use approx::assert_relative_eq;
    /// use geo::OutlierDetection;
    /// use geo::{point, MultiPoint};
    ///
    /// let v = vec![
    ///     point!(x: 0.0, y: 0.0),
    ///     point!(x: 0.0, y: 1.0),
    ///     point!(x: 3.0, y: 0.0),
    ///     point!(x: 1.0, y: 1.0),
    /// ];
    ///
    /// let lofscores = v.outliers(2);
    /// // The third point is an outlier, resulting in a large LOF score
    /// assert_relative_eq!(lofscores[2], 3.0);
    /// // The last point is an inlier, resulting in a small LOF score
    /// assert_relative_eq!(lofscores[3], 1.0);
    /// ```
    ///
    /// ## Computing indices, sorting by LOF score
    ///```
    /// use geo::OutlierDetection;
    /// use geo::{point, MultiPoint};
    ///
    /// // these points contain 4 strong outliers
    /// let v = vec![
    ///     point!(x: 0.16, y: 0.14),
    ///     point!(x: 0.15, y: 0.33),
    ///     point!(x: 0.37, y: 0.25),
    ///     point!(x: 0.3 , y: 0.4),
    ///     point!(x: 0.3 , y: 0.1),
    ///     point!(x: 0.3 , y: 0.2),
    ///     point!(x: 1.3 , y: 2.3),
    ///     point!(x: 1.7 , y: 0.2),
    ///     point!(x: 0.7 , y: -0.9),
    ///     point!(x: 0.21, y: 2.45),
    ///     point!(x: 0.8 , y: 0.7),
    ///     point!(x: 0.9 , y: 0.7),
    ///     point!(x: 0.8 , y: 0.6),
    ///     point!(x: 0.73, y: 0.65),
    ///     point!(x: 0.9 , y: 0.6),
    ///     point!(x: 1.0, y: 0.6),
    ///     point!(x: 1.0, y: 0.7),
    ///     point!(x: 0.25, y: 0.29),
    ///     point!(x: 0.2 , y: 0.2),
    /// ];
    /// let lofs = &mut v.outliers(3);
    /// let mut idx_lofs: Vec<(usize, f64)> = lofs
    ///     .iter()
    ///     .enumerate()
    ///     .map(|(idx, score)| (idx, *score))
    ///     .collect();
    /// // sort by LOF score
    /// idx_lofs.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    /// // most likely outliers first
    /// idx_lofs.reverse();
    /// // four outliers, LOF scores way above 10
    /// idx_lofs
    ///     .iter()
    ///     .take(4)
    ///     .for_each(|score| assert!(score.1 > 10.0));
    ///```
    fn outliers(&self, k_neighbours: usize) -> Vec<T>;

    /// Create a prepared outlier detector allowing multiple runs to retain the spatial index in use.
    /// A [`PreparedDetector`] can efficiently recompute outliers with different `k_neigbhours` values.
    fn prepared_detector(&self) -> PreparedDetector<T>;

    /// Perform successive runs with `k_neighbours` values between `bounds`,
    /// generating an ensemble of LOF scores, which may be aggregated using e.g. min, max, or mean
    ///
    /// # Examples
    ///```
    /// use geo::OutlierDetection;
    /// use geo::{point, Point, MultiPoint};
    /// let v: Vec<Point<f64>> = vec![
    ///     point!(x: 0.16, y: 0.14),
    ///     point!(x: 0.15, y: 0.33),
    ///     point!(x: 0.37, y: 0.25),
    ///     point!(x: 0.3 , y: 0.4),
    ///     point!(x: 0.3 , y: 0.1),
    ///     point!(x: 0.3 , y: 0.2),
    ///     point!(x: 1.3 , y: 2.3),
    ///     point!(x: 1.7 , y: 0.2),
    ///     point!(x: 0.7 , y: -0.9),
    ///     point!(x: 0.21, y: 2.45),
    ///     point!(x: 0.8 , y: 0.7),
    ///     point!(x: 0.9 , y: 0.7),
    ///     point!(x: 0.8 , y: 0.6),
    ///     point!(x: 0.73, y: 0.65),
    ///     point!(x: 0.9 , y: 0.6),
    ///     point!(x: 1.0, y: 0.6),
    ///     point!(x: 1.0, y: 0.7),
    ///     point!(x: 0.25, y: 0.29),
    ///     point!(x: 0.2 , y: 0.2),
    /// ];
    /// let ensemble = v.generate_ensemble((2..=5));
    /// // retain the maximum LOF value for each point for all runs
    /// // this will result in a single Vec
    /// let aggregated = ensemble[1..].iter().fold(ensemble[0].clone(), |acc, xs| {
    ///     acc.iter()
    ///         .zip(xs)
    ///         .map(|(elem1, elem2)| elem1.min(*elem2))
    ///         .collect()
    /// });
    /// assert_eq!(v.len(), aggregated.len());
    ///```
    fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>>;

    /// Convenience method to efficiently calculate the minimum values of an LOF ensemble
    fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T>;

    /// Convenience method to efficiently calculate the maximum values of an LOF ensemble
    fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T>;
}

/// This struct allows multiple detection operations to be run on a point set using varying `k_neighbours` sizes
/// without having to rebuild the underlying spatial index. Its [`PreparedDetector::outliers`] method
/// has the same signature as [`OutlierDetection::outliers`], but retains the underlying spatial index and point set
/// for greater efficiency.
#[derive(Clone, Debug)]
pub struct PreparedDetector<'a, T>
where
    T: GeoFloat,
{
    tree: RTree<GeomWithData<Point<T>, usize>>,
    points: &'a [Point<T>],
}

impl<'a, T> PreparedDetector<'a, T>
where
    T: GeoFloat + Sum,
{
    /// Create a new "prepared" detector which allows repeated LOF algorithm calls with varying neighbour sizes
    fn new(points: &'a [Point<T>]) -> Self {
        let geoms: Vec<GeomWithData<_, usize>> = points
            .iter()
            .enumerate()
            .map(|(idx, point)| GeomWithData::new(*point, idx))
            .collect();
        let tree = RTree::bulk_load(geoms);
        Self { tree, points }
    }

    /// See [`OutlierDetection::outliers`] for usage
    pub fn outliers(&self, kneighbours: usize) -> Vec<T> {
        lof(self.points, &self.tree, kneighbours)
    }
}

fn lof<T>(
    points: &[Point<T>],
    tree: &RTree<GeomWithData<Point<T>, usize>>,
    kneighbours: usize,
) -> Vec<T>
where
    T: GeoFloat + Sum,
{
    debug_assert!(kneighbours > 0);
    if points.len() <= kneighbours || kneighbours < 1 {
        // no point in trying to run the algorithm in this case
        return points.iter().map(|_| T::one()).collect();
    }
    let knn_dists = points
        .iter()
        .map(|point| {
            tree.nearest_neighbor_iter_with_distance_2(point)
                .take(kneighbours)
                .collect()
        })
        .collect::<Vec<Vec<_>>>();
    // calculate LRD (local reachability density) of each point
    // LRD is the estimated distance at which a point can be found by its neighbours:
    // count(neighbour_set) / sum(max(point.kTh_dist, point.dist2(other point)) for all points in neighbour_set)
    // we call this sum-of–max-distances reachDistance
    let local_reachability_densities: Vec<T> = knn_dists
        .iter()
        .map(|neighbours| {
            // for each point's neighbour set, calculate kth distance
            let kth_dist = neighbours
                .iter()
                .map(|(_, distance)| distance)
                .last()
                .unwrap();
            T::from(neighbours.len()).unwrap()
                / neighbours
                    .iter()
                    // sum the max between neighbour distance and kth distance for the neighbour set
                    .map(|(_, distance)| distance.max(*kth_dist))
                    .sum()
        })
        .collect();
    // LOF of a point p is the sum of the LRD of all the points
    // in the set kNearestSet(p) * the sum of the reachDistance of all the points of the same set,
    // to the point p, all divided by the number of items in p's kNN set, squared.
    knn_dists
        .iter()
        .enumerate()
        .map(|(_, neighbours)| {
            // for each point's neighbour set, calculate kth distance
            let kth_dist = neighbours
                .iter()
                .map(|(_, distance)| distance)
                .last()
                .unwrap();
            // sum neighbour set LRD scores
            let lrd_scores: T = neighbours
                .iter()
                .map(|(neighbour, _)| local_reachability_densities[neighbour.data])
                .sum();
            // sum neighbour set reachDistance
            let sum_rd: T = neighbours
                .iter()
                .map(|(_, distance)| distance.max(*kth_dist))
                .sum();
            (lrd_scores * sum_rd) / T::from(neighbours.len().pow(2)).unwrap()
        })
        .collect()
}

impl<T> OutlierDetection<T> for MultiPoint<T>
where
    T: GeoFloat + Sum,
{
    fn outliers(&self, k_neighbours: usize) -> Vec<T> {
        let pd = self.prepared_detector();
        pd.outliers(k_neighbours)
    }

    fn prepared_detector(&self) -> PreparedDetector<T> {
        PreparedDetector::new(&self.0)
    }

    fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>> {
        let pd = self.prepared_detector();
        bounds.map(|kneighbours| pd.outliers(kneighbours)).collect()
    }
    fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
        let pd = self.prepared_detector();
        bounds
            .map(|kneighbours| pd.outliers(kneighbours))
            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.min(b)).collect())
            .unwrap()
    }

    fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
        let pd = self.prepared_detector();
        bounds
            .map(|kneighbours| pd.outliers(kneighbours))
            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.max(b)).collect())
            .unwrap()
    }
}

impl<T> OutlierDetection<T> for [Point<T>]
where
    T: GeoFloat + Sum,
{
    fn outliers(&self, k_neighbours: usize) -> Vec<T> {
        let pd = self.prepared_detector();
        pd.outliers(k_neighbours)
    }

    fn prepared_detector(&self) -> PreparedDetector<T> {
        PreparedDetector::new(self)
    }

    fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>> {
        let pd = self.prepared_detector();
        bounds.map(|kneighbours| pd.outliers(kneighbours)).collect()
    }

    fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
        let pd = self.prepared_detector();
        bounds
            .map(|kneighbours| pd.outliers(kneighbours))
            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.min(b)).collect())
            .unwrap()
    }

    fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
        let pd = self.prepared_detector();
        bounds
            .map(|kneighbours| pd.outliers(kneighbours))
            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.max(b)).collect())
            .unwrap()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::point;

    #[test]
    fn test_lof() {
        // third point is an outlier
        let v = vec![
            Point::new(0.0, 0.0),
            Point::new(0.0, 1.0),
            Point::new(3.0, 0.0),
            Point::new(1.0, 1.0),
        ];

        let lofs = &v.outliers(3);
        assert_eq!(lofs[2], 3.3333333333333335);
    }
    #[test]
    fn test_lof2() {
        // fourth point is an outlier
        let v = vec![
            Point::new(0.0, 0.0),
            Point::new(1.0, 0.0),
            Point::new(1.0, 1.0),
            Point::new(0.0, 3.0),
        ];
        let lofs = &v.outliers(3);
        assert_eq!(lofs[3], 3.3333333333333335);
    }
    #[test]
    fn test_lof3() {
        // second point is an outlier, sort and reverse so scores are in descending order
        let v = vec![
            Point::new(0.0, 0.0),
            Point::new(0.0, 3.0),
            Point::new(1.0, 0.0),
            Point::new(1.0, 1.0),
        ];
        let lofs = &mut v.outliers(3);
        lofs.sort_by(|a, b| a.partial_cmp(b).unwrap());
        lofs.reverse();
        assert_eq!(lofs[0], 3.3333333333333335);
    }
    #[test]
    fn test_lof4() {
        // this dataset contains 4 outliers
        // indices 6, 7, 8, 9 should be detected
        // order: 9, 6, 8, 7
        let v = vec![
            point!(x: 0.16, y: 0.14),
            point!(x: 0.15, y: 0.33),
            point!(x: 0.37, y: 0.25),
            point!(x: 0.3 , y: 0.4),
            point!(x: 0.3 , y: 0.1),
            point!(x: 0.3 , y: 0.2),
            point!(x: 1.3 , y: 2.3),
            point!(x: 1.7 , y: 0.2),
            point!(x: 0.7 , y: -0.9),
            point!(x: 0.21, y: 2.45),
            point!(x: 0.8 , y: 0.7),
            point!(x: 0.9 , y: 0.7),
            point!(x: 0.8 , y: 0.6),
            point!(x: 0.73, y: 0.65),
            point!(x: 0.9 , y: 0.6),
            point!(x: 1.0, y: 0.6),
            point!(x: 1.0, y: 0.7),
            point!(x: 0.25, y: 0.29),
            point!(x: 0.2 , y: 0.2),
        ];
        let lofs = &mut v.outliers(3);
        let mut idx_lofs: Vec<(usize, f64)> = lofs
            .iter()
            .enumerate()
            .map(|(idx, score)| (idx, *score))
            .collect();
        idx_lofs.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        idx_lofs.reverse();
        // four outliers, scores way above 10
        idx_lofs
            .iter()
            .take(4)
            .for_each(|score| assert!(score.1 > 10.0));
        // the rest below 2
        idx_lofs
            .iter()
            .skip(4)
            .for_each(|score| assert!(score.1 < 2.0));
        // ensure that scores are being computed correctly
        assert_eq!(idx_lofs[0].0, 9);
        assert_eq!(idx_lofs[1].0, 6);
        assert_eq!(idx_lofs[2].0, 8);
        assert_eq!(idx_lofs[3].0, 7);
    }
    #[test]
    fn test_lof5() {
        // third point is an outlier
        let v = vec![
            Point::new(0.0, 0.0),
            Point::new(0.0, 1.0),
            Point::new(3.0, 0.0),
            Point::new(1.0, 1.0),
        ];

        let prepared = &v.prepared_detector();
        let s1 = prepared.outliers(2);
        let s2 = prepared.outliers(3);
        // different neighbour sizes give different scores
        assert_ne!(s1[2], s2[2]);
    }
}