geo/algorithm/
outlier_detection.rs

1use std::iter::Sum;
2use std::ops::RangeInclusive;
3
4use crate::{GeoFloat, MultiPoint, Point};
5
6use rstar::primitives::GeomWithData;
7use rstar::RTree;
8
9/// Calculate the [Local Outlier Factor](https://en.wikipedia.org/wiki/Local_outlier_factor) of a set of points
10///
11/// Based on: Breunig, M., Kriegel, H., Ng, R., and Sander, J. (2000). *LOF: identifying density-based local
12/// outliers.* In ACM Int. Conf. on Management of Data, pages 93-104. doi: [10.1145/335191.335388](https://doi.org/10.1145/335191.335388)
13///
14/// LOF is an unsupervised algorithm that uses local data for anomaly detection.
15///
16/// Outlier vs inlier classification is **highly dependent** on the shape of the data. LOF values <= 1
17/// can generally be considered inliers, but e.g. a highly concentrated, uniform dataset could result in
18/// points with a LOF of 1.05 being outliers.
19/// LOF scores should thus be evaluated in the context of the dataset as a whole in order to classify outliers.
20///
21/// If you wish to run multiple outlier detection processes with differing neighbour counts in order
22/// to build up data for more robust detection (see p. 100-1 above), you can use the [`OutlierDetection::prepared_detector`] method, which retains
23/// the spatial index and point set between runs for greater efficiency. The [`OutlierDetection::generate_ensemble`] method
24/// will efficiently run the LOF algorithm over a contiguous range of neighbour inputs,
25/// allowing aggregations to be carried out over the resulting data.
26pub trait OutlierDetection<T>
27where
28    T: GeoFloat,
29{
30    /// The LOF algorithm. `k_neighbours` specifies the number of neighbours to use for local outlier
31    /// classification. The paper linked above (see p. 100) suggests a `k_neighbours` value of 10 - 20
32    /// as a lower bound for "real-world"
33    /// data.
34    ///
35    /// # Note on Erroneous Input
36    /// If `k_neighbours` >= points in the set, or `k_neighbours` < 1, all input points will be returned with an LOF score of 1.
37    /// If there are at least `k_neighbours` duplicate points of an input point, LOF scores can be `∞` or `NaN`.
38    /// It is thus advisable to **deduplicate** or otherwise ensure the uniqueness of the input points.
39    ///
40    /// # Note on Returned Points
41    /// Outlier scores are always returned corresponding to input point order
42    ///
43    /// # Examples
44    ///
45    /// ## MultiPoint
46    ///
47    /// ```
48    /// use approx::assert_relative_eq;
49    /// use geo::OutlierDetection;
50    /// use geo::{point, MultiPoint};
51    ///
52    /// let v = vec![
53    ///     point!(x: 0.0, y: 0.0),
54    ///     point!(x: 0.0, y: 1.0),
55    ///     point!(x: 3.0, y: 0.0),
56    ///     point!(x: 1.0, y: 1.0),
57    /// ];
58    ///
59    /// let lofscores = v.outliers(2);
60    /// // The third point is an outlier, resulting in a large LOF score
61    /// assert_relative_eq!(lofscores[2], 3.0);
62    /// // The last point is an inlier, resulting in a small LOF score
63    /// assert_relative_eq!(lofscores[3], 1.0);
64    /// ```
65    ///
66    /// ## Computing indices, sorting by LOF score
67    ///```
68    /// use geo::OutlierDetection;
69    /// use geo::{point, MultiPoint};
70    ///
71    /// // these points contain 4 strong outliers
72    /// let v = vec![
73    ///     point!(x: 0.16, y: 0.14),
74    ///     point!(x: 0.15, y: 0.33),
75    ///     point!(x: 0.37, y: 0.25),
76    ///     point!(x: 0.3 , y: 0.4),
77    ///     point!(x: 0.3 , y: 0.1),
78    ///     point!(x: 0.3 , y: 0.2),
79    ///     point!(x: 1.3 , y: 2.3),
80    ///     point!(x: 1.7 , y: 0.2),
81    ///     point!(x: 0.7 , y: -0.9),
82    ///     point!(x: 0.21, y: 2.45),
83    ///     point!(x: 0.8 , y: 0.7),
84    ///     point!(x: 0.9 , y: 0.7),
85    ///     point!(x: 0.8 , y: 0.6),
86    ///     point!(x: 0.73, y: 0.65),
87    ///     point!(x: 0.9 , y: 0.6),
88    ///     point!(x: 1.0, y: 0.6),
89    ///     point!(x: 1.0, y: 0.7),
90    ///     point!(x: 0.25, y: 0.29),
91    ///     point!(x: 0.2 , y: 0.2),
92    /// ];
93    /// let lofs = &mut v.outliers(3);
94    /// let mut idx_lofs: Vec<(usize, f64)> = lofs
95    ///     .iter()
96    ///     .enumerate()
97    ///     .map(|(idx, score)| (idx, *score))
98    ///     .collect();
99    /// // sort by LOF score
100    /// idx_lofs.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
101    /// // most likely outliers first
102    /// idx_lofs.reverse();
103    /// // four outliers, LOF scores way above 10
104    /// idx_lofs
105    ///     .iter()
106    ///     .take(4)
107    ///     .for_each(|score| assert!(score.1 > 10.0));
108    ///```
109    fn outliers(&self, k_neighbours: usize) -> Vec<T>;
110
111    /// Create a prepared outlier detector allowing multiple runs to retain the spatial index in use.
112    /// A [`PreparedDetector`] can efficiently recompute outliers with different `k_neigbhours` values.
113    fn prepared_detector(&self) -> PreparedDetector<T>;
114
115    /// Perform successive runs with `k_neighbours` values between `bounds`,
116    /// generating an ensemble of LOF scores, which may be aggregated using e.g. min, max, or mean
117    ///
118    /// # Examples
119    ///```
120    /// use geo::OutlierDetection;
121    /// use geo::{point, Point, MultiPoint};
122    /// let v: Vec<Point<f64>> = vec![
123    ///     point!(x: 0.16, y: 0.14),
124    ///     point!(x: 0.15, y: 0.33),
125    ///     point!(x: 0.37, y: 0.25),
126    ///     point!(x: 0.3 , y: 0.4),
127    ///     point!(x: 0.3 , y: 0.1),
128    ///     point!(x: 0.3 , y: 0.2),
129    ///     point!(x: 1.3 , y: 2.3),
130    ///     point!(x: 1.7 , y: 0.2),
131    ///     point!(x: 0.7 , y: -0.9),
132    ///     point!(x: 0.21, y: 2.45),
133    ///     point!(x: 0.8 , y: 0.7),
134    ///     point!(x: 0.9 , y: 0.7),
135    ///     point!(x: 0.8 , y: 0.6),
136    ///     point!(x: 0.73, y: 0.65),
137    ///     point!(x: 0.9 , y: 0.6),
138    ///     point!(x: 1.0, y: 0.6),
139    ///     point!(x: 1.0, y: 0.7),
140    ///     point!(x: 0.25, y: 0.29),
141    ///     point!(x: 0.2 , y: 0.2),
142    /// ];
143    /// let ensemble = v.generate_ensemble((2..=5));
144    /// // retain the maximum LOF value for each point for all runs
145    /// // this will result in a single Vec
146    /// let aggregated = ensemble[1..].iter().fold(ensemble[0].clone(), |acc, xs| {
147    ///     acc.iter()
148    ///         .zip(xs)
149    ///         .map(|(elem1, elem2)| elem1.min(*elem2))
150    ///         .collect()
151    /// });
152    /// assert_eq!(v.len(), aggregated.len());
153    ///```
154    fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>>;
155
156    /// Convenience method to efficiently calculate the minimum values of an LOF ensemble
157    fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T>;
158
159    /// Convenience method to efficiently calculate the maximum values of an LOF ensemble
160    fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T>;
161}
162
163/// This struct allows multiple detection operations to be run on a point set using varying `k_neighbours` sizes
164/// without having to rebuild the underlying spatial index. Its [`PreparedDetector::outliers`] method
165/// has the same signature as [`OutlierDetection::outliers`], but retains the underlying spatial index and point set
166/// for greater efficiency.
167#[derive(Clone, Debug)]
168pub struct PreparedDetector<'a, T>
169where
170    T: GeoFloat,
171{
172    tree: RTree<GeomWithData<Point<T>, usize>>,
173    points: &'a [Point<T>],
174}
175
176impl<'a, T> PreparedDetector<'a, T>
177where
178    T: GeoFloat + Sum,
179{
180    /// Create a new "prepared" detector which allows repeated LOF algorithm calls with varying neighbour sizes
181    fn new(points: &'a [Point<T>]) -> Self {
182        let geoms: Vec<GeomWithData<_, usize>> = points
183            .iter()
184            .enumerate()
185            .map(|(idx, point)| GeomWithData::new(*point, idx))
186            .collect();
187        let tree = RTree::bulk_load(geoms);
188        Self { tree, points }
189    }
190
191    /// See [`OutlierDetection::outliers`] for usage
192    pub fn outliers(&self, kneighbours: usize) -> Vec<T> {
193        lof(self.points, &self.tree, kneighbours)
194    }
195}
196
197fn lof<T>(
198    points: &[Point<T>],
199    tree: &RTree<GeomWithData<Point<T>, usize>>,
200    kneighbours: usize,
201) -> Vec<T>
202where
203    T: GeoFloat + Sum,
204{
205    debug_assert!(kneighbours > 0);
206    if points.len() <= kneighbours || kneighbours < 1 {
207        // no point in trying to run the algorithm in this case
208        return points.iter().map(|_| T::one()).collect();
209    }
210    let knn_dists = points
211        .iter()
212        .map(|point| {
213            tree.nearest_neighbor_iter_with_distance_2(point)
214                .take(kneighbours)
215                .collect()
216        })
217        .collect::<Vec<Vec<_>>>();
218    // calculate LRD (local reachability density) of each point
219    // LRD is the estimated distance at which a point can be found by its neighbours:
220    // count(neighbour_set) / sum(max(point.kTh_dist, point.dist2(other point)) for all points in neighbour_set)
221    // we call this sum-of–max-distances reachDistance
222    let local_reachability_densities: Vec<T> = knn_dists
223        .iter()
224        .map(|neighbours| {
225            // for each point's neighbour set, calculate kth distance
226            let kth_dist = neighbours
227                .iter()
228                .map(|(_, distance)| distance)
229                .last()
230                .unwrap();
231            T::from(neighbours.len()).unwrap()
232                / neighbours
233                    .iter()
234                    // sum the max between neighbour distance and kth distance for the neighbour set
235                    .map(|(_, distance)| distance.max(*kth_dist))
236                    .sum()
237        })
238        .collect();
239    // LOF of a point p is the sum of the LRD of all the points
240    // in the set kNearestSet(p) * the sum of the reachDistance of all the points of the same set,
241    // to the point p, all divided by the number of items in p's kNN set, squared.
242    knn_dists
243        .iter()
244        .enumerate()
245        .map(|(_, neighbours)| {
246            // for each point's neighbour set, calculate kth distance
247            let kth_dist = neighbours
248                .iter()
249                .map(|(_, distance)| distance)
250                .last()
251                .unwrap();
252            // sum neighbour set LRD scores
253            let lrd_scores: T = neighbours
254                .iter()
255                .map(|(neighbour, _)| local_reachability_densities[neighbour.data])
256                .sum();
257            // sum neighbour set reachDistance
258            let sum_rd: T = neighbours
259                .iter()
260                .map(|(_, distance)| distance.max(*kth_dist))
261                .sum();
262            (lrd_scores * sum_rd) / T::from(neighbours.len().pow(2)).unwrap()
263        })
264        .collect()
265}
266
267impl<T> OutlierDetection<T> for MultiPoint<T>
268where
269    T: GeoFloat + Sum,
270{
271    fn outliers(&self, k_neighbours: usize) -> Vec<T> {
272        let pd = self.prepared_detector();
273        pd.outliers(k_neighbours)
274    }
275
276    fn prepared_detector(&self) -> PreparedDetector<T> {
277        PreparedDetector::new(&self.0)
278    }
279
280    fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>> {
281        let pd = self.prepared_detector();
282        bounds.map(|kneighbours| pd.outliers(kneighbours)).collect()
283    }
284    fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
285        let pd = self.prepared_detector();
286        bounds
287            .map(|kneighbours| pd.outliers(kneighbours))
288            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.min(b)).collect())
289            .unwrap()
290    }
291
292    fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
293        let pd = self.prepared_detector();
294        bounds
295            .map(|kneighbours| pd.outliers(kneighbours))
296            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.max(b)).collect())
297            .unwrap()
298    }
299}
300
301impl<T> OutlierDetection<T> for [Point<T>]
302where
303    T: GeoFloat + Sum,
304{
305    fn outliers(&self, k_neighbours: usize) -> Vec<T> {
306        let pd = self.prepared_detector();
307        pd.outliers(k_neighbours)
308    }
309
310    fn prepared_detector(&self) -> PreparedDetector<T> {
311        PreparedDetector::new(self)
312    }
313
314    fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>> {
315        let pd = self.prepared_detector();
316        bounds.map(|kneighbours| pd.outliers(kneighbours)).collect()
317    }
318
319    fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
320        let pd = self.prepared_detector();
321        bounds
322            .map(|kneighbours| pd.outliers(kneighbours))
323            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.min(b)).collect())
324            .unwrap()
325    }
326
327    fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
328        let pd = self.prepared_detector();
329        bounds
330            .map(|kneighbours| pd.outliers(kneighbours))
331            .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.max(b)).collect())
332            .unwrap()
333    }
334}
335
336#[cfg(test)]
337mod tests {
338    use super::*;
339    use crate::point;
340
341    #[test]
342    fn test_lof() {
343        // third point is an outlier
344        let v = vec![
345            Point::new(0.0, 0.0),
346            Point::new(0.0, 1.0),
347            Point::new(3.0, 0.0),
348            Point::new(1.0, 1.0),
349        ];
350
351        let lofs = &v.outliers(3);
352        assert_eq!(lofs[2], 3.3333333333333335);
353    }
354    #[test]
355    fn test_lof2() {
356        // fourth point is an outlier
357        let v = vec![
358            Point::new(0.0, 0.0),
359            Point::new(1.0, 0.0),
360            Point::new(1.0, 1.0),
361            Point::new(0.0, 3.0),
362        ];
363        let lofs = &v.outliers(3);
364        assert_eq!(lofs[3], 3.3333333333333335);
365    }
366    #[test]
367    fn test_lof3() {
368        // second point is an outlier, sort and reverse so scores are in descending order
369        let v = vec![
370            Point::new(0.0, 0.0),
371            Point::new(0.0, 3.0),
372            Point::new(1.0, 0.0),
373            Point::new(1.0, 1.0),
374        ];
375        let lofs = &mut v.outliers(3);
376        lofs.sort_by(|a, b| a.partial_cmp(b).unwrap());
377        lofs.reverse();
378        assert_eq!(lofs[0], 3.3333333333333335);
379    }
380    #[test]
381    fn test_lof4() {
382        // this dataset contains 4 outliers
383        // indices 6, 7, 8, 9 should be detected
384        // order: 9, 6, 8, 7
385        let v = vec![
386            point!(x: 0.16, y: 0.14),
387            point!(x: 0.15, y: 0.33),
388            point!(x: 0.37, y: 0.25),
389            point!(x: 0.3 , y: 0.4),
390            point!(x: 0.3 , y: 0.1),
391            point!(x: 0.3 , y: 0.2),
392            point!(x: 1.3 , y: 2.3),
393            point!(x: 1.7 , y: 0.2),
394            point!(x: 0.7 , y: -0.9),
395            point!(x: 0.21, y: 2.45),
396            point!(x: 0.8 , y: 0.7),
397            point!(x: 0.9 , y: 0.7),
398            point!(x: 0.8 , y: 0.6),
399            point!(x: 0.73, y: 0.65),
400            point!(x: 0.9 , y: 0.6),
401            point!(x: 1.0, y: 0.6),
402            point!(x: 1.0, y: 0.7),
403            point!(x: 0.25, y: 0.29),
404            point!(x: 0.2 , y: 0.2),
405        ];
406        let lofs = &mut v.outliers(3);
407        let mut idx_lofs: Vec<(usize, f64)> = lofs
408            .iter()
409            .enumerate()
410            .map(|(idx, score)| (idx, *score))
411            .collect();
412        idx_lofs.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
413        idx_lofs.reverse();
414        // four outliers, scores way above 10
415        idx_lofs
416            .iter()
417            .take(4)
418            .for_each(|score| assert!(score.1 > 10.0));
419        // the rest below 2
420        idx_lofs
421            .iter()
422            .skip(4)
423            .for_each(|score| assert!(score.1 < 2.0));
424        // ensure that scores are being computed correctly
425        assert_eq!(idx_lofs[0].0, 9);
426        assert_eq!(idx_lofs[1].0, 6);
427        assert_eq!(idx_lofs[2].0, 8);
428        assert_eq!(idx_lofs[3].0, 7);
429    }
430    #[test]
431    fn test_lof5() {
432        // third point is an outlier
433        let v = vec![
434            Point::new(0.0, 0.0),
435            Point::new(0.0, 1.0),
436            Point::new(3.0, 0.0),
437            Point::new(1.0, 1.0),
438        ];
439
440        let prepared = &v.prepared_detector();
441        let s1 = prepared.outliers(2);
442        let s2 = prepared.outliers(3);
443        // different neighbour sizes give different scores
444        assert_ne!(s1[2], s2[2]);
445    }
446}