geo/algorithm/outlier_detection.rs
1use std::iter::Sum;
2use std::ops::RangeInclusive;
3
4use crate::{GeoFloat, MultiPoint, Point};
5
6use rstar::primitives::GeomWithData;
7use rstar::RTree;
8
9/// Calculate the [Local Outlier Factor](https://en.wikipedia.org/wiki/Local_outlier_factor) of a set of points
10///
11/// Based on: Breunig, M., Kriegel, H., Ng, R., and Sander, J. (2000). *LOF: identifying density-based local
12/// outliers.* In ACM Int. Conf. on Management of Data, pages 93-104. doi: [10.1145/335191.335388](https://doi.org/10.1145/335191.335388)
13///
14/// LOF is an unsupervised algorithm that uses local data for anomaly detection.
15///
16/// Outlier vs inlier classification is **highly dependent** on the shape of the data. LOF values <= 1
17/// can generally be considered inliers, but e.g. a highly concentrated, uniform dataset could result in
18/// points with a LOF of 1.05 being outliers.
19/// LOF scores should thus be evaluated in the context of the dataset as a whole in order to classify outliers.
20///
21/// If you wish to run multiple outlier detection processes with differing neighbour counts in order
22/// to build up data for more robust detection (see p. 100-1 above), you can use the [`OutlierDetection::prepared_detector`] method, which retains
23/// the spatial index and point set between runs for greater efficiency. The [`OutlierDetection::generate_ensemble`] method
24/// will efficiently run the LOF algorithm over a contiguous range of neighbour inputs,
25/// allowing aggregations to be carried out over the resulting data.
26pub trait OutlierDetection<T>
27where
28 T: GeoFloat,
29{
30 /// The LOF algorithm. `k_neighbours` specifies the number of neighbours to use for local outlier
31 /// classification. The paper linked above (see p. 100) suggests a `k_neighbours` value of 10 - 20
32 /// as a lower bound for "real-world"
33 /// data.
34 ///
35 /// # Note on Erroneous Input
36 /// If `k_neighbours` >= points in the set, or `k_neighbours` < 1, all input points will be returned with an LOF score of 1.
37 /// If there are at least `k_neighbours` duplicate points of an input point, LOF scores can be `∞` or `NaN`.
38 /// It is thus advisable to **deduplicate** or otherwise ensure the uniqueness of the input points.
39 ///
40 /// # Note on Returned Points
41 /// Outlier scores are always returned corresponding to input point order
42 ///
43 /// # Examples
44 ///
45 /// ## MultiPoint
46 ///
47 /// ```
48 /// use approx::assert_relative_eq;
49 /// use geo::OutlierDetection;
50 /// use geo::{point, MultiPoint};
51 ///
52 /// let v = vec![
53 /// point!(x: 0.0, y: 0.0),
54 /// point!(x: 0.0, y: 1.0),
55 /// point!(x: 3.0, y: 0.0),
56 /// point!(x: 1.0, y: 1.0),
57 /// ];
58 ///
59 /// let lofscores = v.outliers(2);
60 /// // The third point is an outlier, resulting in a large LOF score
61 /// assert_relative_eq!(lofscores[2], 3.0);
62 /// // The last point is an inlier, resulting in a small LOF score
63 /// assert_relative_eq!(lofscores[3], 1.0);
64 /// ```
65 ///
66 /// ## Computing indices, sorting by LOF score
67 ///```
68 /// use geo::OutlierDetection;
69 /// use geo::{point, MultiPoint};
70 ///
71 /// // these points contain 4 strong outliers
72 /// let v = vec![
73 /// point!(x: 0.16, y: 0.14),
74 /// point!(x: 0.15, y: 0.33),
75 /// point!(x: 0.37, y: 0.25),
76 /// point!(x: 0.3 , y: 0.4),
77 /// point!(x: 0.3 , y: 0.1),
78 /// point!(x: 0.3 , y: 0.2),
79 /// point!(x: 1.3 , y: 2.3),
80 /// point!(x: 1.7 , y: 0.2),
81 /// point!(x: 0.7 , y: -0.9),
82 /// point!(x: 0.21, y: 2.45),
83 /// point!(x: 0.8 , y: 0.7),
84 /// point!(x: 0.9 , y: 0.7),
85 /// point!(x: 0.8 , y: 0.6),
86 /// point!(x: 0.73, y: 0.65),
87 /// point!(x: 0.9 , y: 0.6),
88 /// point!(x: 1.0, y: 0.6),
89 /// point!(x: 1.0, y: 0.7),
90 /// point!(x: 0.25, y: 0.29),
91 /// point!(x: 0.2 , y: 0.2),
92 /// ];
93 /// let lofs = &mut v.outliers(3);
94 /// let mut idx_lofs: Vec<(usize, f64)> = lofs
95 /// .iter()
96 /// .enumerate()
97 /// .map(|(idx, score)| (idx, *score))
98 /// .collect();
99 /// // sort by LOF score
100 /// idx_lofs.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
101 /// // most likely outliers first
102 /// idx_lofs.reverse();
103 /// // four outliers, LOF scores way above 10
104 /// idx_lofs
105 /// .iter()
106 /// .take(4)
107 /// .for_each(|score| assert!(score.1 > 10.0));
108 ///```
109 fn outliers(&self, k_neighbours: usize) -> Vec<T>;
110
111 /// Create a prepared outlier detector allowing multiple runs to retain the spatial index in use.
112 /// A [`PreparedDetector`] can efficiently recompute outliers with different `k_neigbhours` values.
113 fn prepared_detector(&self) -> PreparedDetector<T>;
114
115 /// Perform successive runs with `k_neighbours` values between `bounds`,
116 /// generating an ensemble of LOF scores, which may be aggregated using e.g. min, max, or mean
117 ///
118 /// # Examples
119 ///```
120 /// use geo::OutlierDetection;
121 /// use geo::{point, Point, MultiPoint};
122 /// let v: Vec<Point<f64>> = vec![
123 /// point!(x: 0.16, y: 0.14),
124 /// point!(x: 0.15, y: 0.33),
125 /// point!(x: 0.37, y: 0.25),
126 /// point!(x: 0.3 , y: 0.4),
127 /// point!(x: 0.3 , y: 0.1),
128 /// point!(x: 0.3 , y: 0.2),
129 /// point!(x: 1.3 , y: 2.3),
130 /// point!(x: 1.7 , y: 0.2),
131 /// point!(x: 0.7 , y: -0.9),
132 /// point!(x: 0.21, y: 2.45),
133 /// point!(x: 0.8 , y: 0.7),
134 /// point!(x: 0.9 , y: 0.7),
135 /// point!(x: 0.8 , y: 0.6),
136 /// point!(x: 0.73, y: 0.65),
137 /// point!(x: 0.9 , y: 0.6),
138 /// point!(x: 1.0, y: 0.6),
139 /// point!(x: 1.0, y: 0.7),
140 /// point!(x: 0.25, y: 0.29),
141 /// point!(x: 0.2 , y: 0.2),
142 /// ];
143 /// let ensemble = v.generate_ensemble((2..=5));
144 /// // retain the maximum LOF value for each point for all runs
145 /// // this will result in a single Vec
146 /// let aggregated = ensemble[1..].iter().fold(ensemble[0].clone(), |acc, xs| {
147 /// acc.iter()
148 /// .zip(xs)
149 /// .map(|(elem1, elem2)| elem1.min(*elem2))
150 /// .collect()
151 /// });
152 /// assert_eq!(v.len(), aggregated.len());
153 ///```
154 fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>>;
155
156 /// Convenience method to efficiently calculate the minimum values of an LOF ensemble
157 fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T>;
158
159 /// Convenience method to efficiently calculate the maximum values of an LOF ensemble
160 fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T>;
161}
162
163/// This struct allows multiple detection operations to be run on a point set using varying `k_neighbours` sizes
164/// without having to rebuild the underlying spatial index. Its [`PreparedDetector::outliers`] method
165/// has the same signature as [`OutlierDetection::outliers`], but retains the underlying spatial index and point set
166/// for greater efficiency.
167#[derive(Clone, Debug)]
168pub struct PreparedDetector<'a, T>
169where
170 T: GeoFloat,
171{
172 tree: RTree<GeomWithData<Point<T>, usize>>,
173 points: &'a [Point<T>],
174}
175
176impl<'a, T> PreparedDetector<'a, T>
177where
178 T: GeoFloat + Sum,
179{
180 /// Create a new "prepared" detector which allows repeated LOF algorithm calls with varying neighbour sizes
181 fn new(points: &'a [Point<T>]) -> Self {
182 let geoms: Vec<GeomWithData<_, usize>> = points
183 .iter()
184 .enumerate()
185 .map(|(idx, point)| GeomWithData::new(*point, idx))
186 .collect();
187 let tree = RTree::bulk_load(geoms);
188 Self { tree, points }
189 }
190
191 /// See [`OutlierDetection::outliers`] for usage
192 pub fn outliers(&self, kneighbours: usize) -> Vec<T> {
193 lof(self.points, &self.tree, kneighbours)
194 }
195}
196
197fn lof<T>(
198 points: &[Point<T>],
199 tree: &RTree<GeomWithData<Point<T>, usize>>,
200 kneighbours: usize,
201) -> Vec<T>
202where
203 T: GeoFloat + Sum,
204{
205 debug_assert!(kneighbours > 0);
206 if points.len() <= kneighbours || kneighbours < 1 {
207 // no point in trying to run the algorithm in this case
208 return points.iter().map(|_| T::one()).collect();
209 }
210 let knn_dists = points
211 .iter()
212 .map(|point| {
213 tree.nearest_neighbor_iter_with_distance_2(point)
214 .take(kneighbours)
215 .collect()
216 })
217 .collect::<Vec<Vec<_>>>();
218 // calculate LRD (local reachability density) of each point
219 // LRD is the estimated distance at which a point can be found by its neighbours:
220 // count(neighbour_set) / sum(max(point.kTh_dist, point.dist2(other point)) for all points in neighbour_set)
221 // we call this sum-of–max-distances reachDistance
222 let local_reachability_densities: Vec<T> = knn_dists
223 .iter()
224 .map(|neighbours| {
225 // for each point's neighbour set, calculate kth distance
226 let kth_dist = neighbours
227 .iter()
228 .map(|(_, distance)| distance)
229 .last()
230 .unwrap();
231 T::from(neighbours.len()).unwrap()
232 / neighbours
233 .iter()
234 // sum the max between neighbour distance and kth distance for the neighbour set
235 .map(|(_, distance)| distance.max(*kth_dist))
236 .sum()
237 })
238 .collect();
239 // LOF of a point p is the sum of the LRD of all the points
240 // in the set kNearestSet(p) * the sum of the reachDistance of all the points of the same set,
241 // to the point p, all divided by the number of items in p's kNN set, squared.
242 knn_dists
243 .iter()
244 .enumerate()
245 .map(|(_, neighbours)| {
246 // for each point's neighbour set, calculate kth distance
247 let kth_dist = neighbours
248 .iter()
249 .map(|(_, distance)| distance)
250 .last()
251 .unwrap();
252 // sum neighbour set LRD scores
253 let lrd_scores: T = neighbours
254 .iter()
255 .map(|(neighbour, _)| local_reachability_densities[neighbour.data])
256 .sum();
257 // sum neighbour set reachDistance
258 let sum_rd: T = neighbours
259 .iter()
260 .map(|(_, distance)| distance.max(*kth_dist))
261 .sum();
262 (lrd_scores * sum_rd) / T::from(neighbours.len().pow(2)).unwrap()
263 })
264 .collect()
265}
266
267impl<T> OutlierDetection<T> for MultiPoint<T>
268where
269 T: GeoFloat + Sum,
270{
271 fn outliers(&self, k_neighbours: usize) -> Vec<T> {
272 let pd = self.prepared_detector();
273 pd.outliers(k_neighbours)
274 }
275
276 fn prepared_detector(&self) -> PreparedDetector<T> {
277 PreparedDetector::new(&self.0)
278 }
279
280 fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>> {
281 let pd = self.prepared_detector();
282 bounds.map(|kneighbours| pd.outliers(kneighbours)).collect()
283 }
284 fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
285 let pd = self.prepared_detector();
286 bounds
287 .map(|kneighbours| pd.outliers(kneighbours))
288 .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.min(b)).collect())
289 .unwrap()
290 }
291
292 fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
293 let pd = self.prepared_detector();
294 bounds
295 .map(|kneighbours| pd.outliers(kneighbours))
296 .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.max(b)).collect())
297 .unwrap()
298 }
299}
300
301impl<T> OutlierDetection<T> for [Point<T>]
302where
303 T: GeoFloat + Sum,
304{
305 fn outliers(&self, k_neighbours: usize) -> Vec<T> {
306 let pd = self.prepared_detector();
307 pd.outliers(k_neighbours)
308 }
309
310 fn prepared_detector(&self) -> PreparedDetector<T> {
311 PreparedDetector::new(self)
312 }
313
314 fn generate_ensemble(&self, bounds: RangeInclusive<usize>) -> Vec<Vec<T>> {
315 let pd = self.prepared_detector();
316 bounds.map(|kneighbours| pd.outliers(kneighbours)).collect()
317 }
318
319 fn ensemble_min(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
320 let pd = self.prepared_detector();
321 bounds
322 .map(|kneighbours| pd.outliers(kneighbours))
323 .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.min(b)).collect())
324 .unwrap()
325 }
326
327 fn ensemble_max(&self, bounds: RangeInclusive<usize>) -> Vec<T> {
328 let pd = self.prepared_detector();
329 bounds
330 .map(|kneighbours| pd.outliers(kneighbours))
331 .reduce(|acc, vec| acc.iter().zip(vec).map(|(a, b)| a.max(b)).collect())
332 .unwrap()
333 }
334}
335
336#[cfg(test)]
337mod tests {
338 use super::*;
339 use crate::point;
340
341 #[test]
342 fn test_lof() {
343 // third point is an outlier
344 let v = vec![
345 Point::new(0.0, 0.0),
346 Point::new(0.0, 1.0),
347 Point::new(3.0, 0.0),
348 Point::new(1.0, 1.0),
349 ];
350
351 let lofs = &v.outliers(3);
352 assert_eq!(lofs[2], 3.3333333333333335);
353 }
354 #[test]
355 fn test_lof2() {
356 // fourth point is an outlier
357 let v = vec![
358 Point::new(0.0, 0.0),
359 Point::new(1.0, 0.0),
360 Point::new(1.0, 1.0),
361 Point::new(0.0, 3.0),
362 ];
363 let lofs = &v.outliers(3);
364 assert_eq!(lofs[3], 3.3333333333333335);
365 }
366 #[test]
367 fn test_lof3() {
368 // second point is an outlier, sort and reverse so scores are in descending order
369 let v = vec![
370 Point::new(0.0, 0.0),
371 Point::new(0.0, 3.0),
372 Point::new(1.0, 0.0),
373 Point::new(1.0, 1.0),
374 ];
375 let lofs = &mut v.outliers(3);
376 lofs.sort_by(|a, b| a.partial_cmp(b).unwrap());
377 lofs.reverse();
378 assert_eq!(lofs[0], 3.3333333333333335);
379 }
380 #[test]
381 fn test_lof4() {
382 // this dataset contains 4 outliers
383 // indices 6, 7, 8, 9 should be detected
384 // order: 9, 6, 8, 7
385 let v = vec![
386 point!(x: 0.16, y: 0.14),
387 point!(x: 0.15, y: 0.33),
388 point!(x: 0.37, y: 0.25),
389 point!(x: 0.3 , y: 0.4),
390 point!(x: 0.3 , y: 0.1),
391 point!(x: 0.3 , y: 0.2),
392 point!(x: 1.3 , y: 2.3),
393 point!(x: 1.7 , y: 0.2),
394 point!(x: 0.7 , y: -0.9),
395 point!(x: 0.21, y: 2.45),
396 point!(x: 0.8 , y: 0.7),
397 point!(x: 0.9 , y: 0.7),
398 point!(x: 0.8 , y: 0.6),
399 point!(x: 0.73, y: 0.65),
400 point!(x: 0.9 , y: 0.6),
401 point!(x: 1.0, y: 0.6),
402 point!(x: 1.0, y: 0.7),
403 point!(x: 0.25, y: 0.29),
404 point!(x: 0.2 , y: 0.2),
405 ];
406 let lofs = &mut v.outliers(3);
407 let mut idx_lofs: Vec<(usize, f64)> = lofs
408 .iter()
409 .enumerate()
410 .map(|(idx, score)| (idx, *score))
411 .collect();
412 idx_lofs.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
413 idx_lofs.reverse();
414 // four outliers, scores way above 10
415 idx_lofs
416 .iter()
417 .take(4)
418 .for_each(|score| assert!(score.1 > 10.0));
419 // the rest below 2
420 idx_lofs
421 .iter()
422 .skip(4)
423 .for_each(|score| assert!(score.1 < 2.0));
424 // ensure that scores are being computed correctly
425 assert_eq!(idx_lofs[0].0, 9);
426 assert_eq!(idx_lofs[1].0, 6);
427 assert_eq!(idx_lofs[2].0, 8);
428 assert_eq!(idx_lofs[3].0, 7);
429 }
430 #[test]
431 fn test_lof5() {
432 // third point is an outlier
433 let v = vec![
434 Point::new(0.0, 0.0),
435 Point::new(0.0, 1.0),
436 Point::new(3.0, 0.0),
437 Point::new(1.0, 1.0),
438 ];
439
440 let prepared = &v.prepared_detector();
441 let s1 = prepared.outliers(2);
442 let s2 = prepared.outliers(3);
443 // different neighbour sizes give different scores
444 assert_ne!(s1[2], s2[2]);
445 }
446}