rand_distr/
zipf.rs

1// Copyright 2021 Developers of the Rand project.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! The Zipf distribution.
10
11use crate::{Distribution, StandardUniform};
12use core::fmt;
13use num_traits::Float;
14use rand::{Rng, RngExt};
15
16/// The Zipf (Zipfian) distribution `Zipf(n, s)`.
17///
18/// The samples follow [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law):
19/// The frequency of each sample from a finite set of size `n` is inversely
20/// proportional to a power of its frequency rank (with exponent `s`).
21///
22/// For large `n`, this converges to the [`Zeta`](crate::Zeta) distribution.
23///
24/// For `s = 0`, this becomes a [`uniform`](crate::Uniform) distribution.
25///
26/// # Plot
27///
28/// The following plot illustrates the Zipf distribution for `n = 10` and
29/// various values of `s`.
30///
31/// ![Zipf distribution](https://raw.githubusercontent.com/rust-random/charts/main/charts/zipf.svg)
32///
33/// # Example
34/// ```
35/// use rand::prelude::*;
36/// use rand_distr::Zipf;
37///
38/// let val: f64 = rand::rng().sample(Zipf::new(10.0, 1.5).unwrap());
39/// println!("{}", val);
40/// ```
41///
42/// # Integer vs FP return type
43///
44/// This implementation uses floating-point (FP) logic internally. It may be
45/// expected that the samples are no greater than `n`, thus it is reasonable to
46/// cast generated samples to any integer type which can also represent `n`
47/// (e.g. `distr.sample(&mut rng) as u64`).
48///
49/// # Implementation details
50///
51/// Implemented via [rejection sampling](https://en.wikipedia.org/wiki/Rejection_sampling),
52/// due to Jason Crease[1].
53///
54/// [1]: https://jasoncrease.medium.com/rejection-sampling-the-zipf-distribution-6b359792cffa
55#[derive(Clone, Copy, Debug, PartialEq)]
56pub struct Zipf<F>
57where
58    F: Float,
59    StandardUniform: Distribution<F>,
60{
61    s: F,
62    t: F,
63    q: F,
64}
65
66/// Error type returned from [`Zipf::new`].
67#[derive(Clone, Copy, Debug, PartialEq, Eq)]
68pub enum Error {
69    /// `s < 0` or `s` is `nan`
70    STooSmall,
71    /// `n < 1` or `n` is `nan`
72    NTooSmall,
73    /// `n = inf` and `s <= 1`
74    IllDefined,
75}
76
77impl fmt::Display for Error {
78    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
79        f.write_str(match self {
80            Error::STooSmall => "s < 0 or is NaN in Zipf distribution",
81            Error::NTooSmall => "n < 1 or is NaN in Zipf distribution",
82            Error::IllDefined => "n = inf and s <= 1 in Zipf distribution",
83        })
84    }
85}
86
87#[cfg(feature = "std")]
88impl std::error::Error for Error {}
89
90impl<F> Zipf<F>
91where
92    F: Float,
93    StandardUniform: Distribution<F>,
94{
95    /// Construct a new `Zipf` distribution for a set with `n` elements and a
96    /// frequency rank exponent `s`.
97    ///
98    /// The parameter `n` is typically integral, however we use type
99    /// <pre><code>F: [Float]</code></pre> in order to permit very large values
100    /// and since our implementation requires a floating-point type.
101    #[inline]
102    pub fn new(n: F, s: F) -> Result<Zipf<F>, Error> {
103        if !(s >= F::zero()) {
104            return Err(Error::STooSmall);
105        }
106        if !(n >= F::one()) {
107            return Err(Error::NTooSmall);
108        }
109        if n.is_infinite() && s <= F::one() {
110            return Err(Error::IllDefined);
111        }
112        let q = if s != F::one() {
113            // Make sure to calculate the division only once.
114            F::one() / (F::one() - s)
115        } else {
116            // This value is never used.
117            F::zero()
118        };
119        let t = if s == F::infinity() {
120            F::one()
121        } else if s != F::one() {
122            (n.powf(F::one() - s) - s) * q
123        } else {
124            F::one() + n.ln()
125        };
126        debug_assert!(t > F::zero());
127        Ok(Zipf { s, t, q })
128    }
129
130    /// Inverse cumulative density function
131    #[inline]
132    fn inv_cdf(&self, p: F) -> F {
133        let one = F::one();
134        let pt = p * self.t;
135        if pt <= one {
136            pt
137        } else if self.s != one {
138            (pt * (one - self.s) + self.s).powf(self.q)
139        } else {
140            (pt - one).exp()
141        }
142    }
143}
144
145impl<F> Distribution<F> for Zipf<F>
146where
147    F: Float,
148    StandardUniform: Distribution<F>,
149{
150    #[inline]
151    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> F {
152        let one = F::one();
153        loop {
154            let inv_b = self.inv_cdf(rng.sample(StandardUniform));
155            let x = (inv_b + one).floor();
156            let mut ratio = x.powf(-self.s);
157            if x > one {
158                ratio = ratio * inv_b.powf(self.s)
159            };
160
161            let y = rng.sample(StandardUniform);
162            if y < ratio {
163                return x;
164            }
165        }
166    }
167}
168
169#[cfg(test)]
170mod tests {
171    use super::*;
172
173    fn test_samples<F: Float + fmt::Debug, D: Distribution<F>>(distr: D, zero: F, expected: &[F]) {
174        let mut rng = crate::test::rng(213);
175        let mut buf = [zero; 4];
176        for x in &mut buf {
177            *x = rng.sample(&distr);
178        }
179        assert_eq!(buf, expected);
180    }
181
182    #[test]
183    #[should_panic]
184    fn zipf_s_too_small() {
185        Zipf::new(10., -1.).unwrap();
186    }
187
188    #[test]
189    #[should_panic]
190    fn zipf_n_too_small() {
191        Zipf::new(0., 1.).unwrap();
192    }
193
194    #[test]
195    #[should_panic]
196    fn zipf_nan() {
197        Zipf::new(10., f64::NAN).unwrap();
198    }
199
200    #[test]
201    fn zipf_sample() {
202        let d = Zipf::new(10., 0.5).unwrap();
203        let mut rng = crate::test::rng(2);
204        for _ in 0..1000 {
205            let r = d.sample(&mut rng);
206            assert!(r >= 1.);
207        }
208    }
209
210    #[test]
211    fn zipf_sample_s_1() {
212        let d = Zipf::new(10., 1.).unwrap();
213        let mut rng = crate::test::rng(2);
214        for _ in 0..1000 {
215            let r = d.sample(&mut rng);
216            assert!(r >= 1.);
217        }
218    }
219
220    #[test]
221    fn zipf_sample_s_0() {
222        let d = Zipf::new(10., 0.).unwrap();
223        let mut rng = crate::test::rng(2);
224        for _ in 0..1000 {
225            let r = d.sample(&mut rng);
226            assert!(r >= 1.);
227        }
228        // TODO: verify that this is a uniform distribution
229    }
230
231    #[test]
232    fn zipf_sample_s_inf() {
233        let d = Zipf::new(10., f64::infinity()).unwrap();
234        let mut rng = crate::test::rng(2);
235        for _ in 0..1000 {
236            let r = d.sample(&mut rng);
237            assert!(r == 1.);
238        }
239    }
240
241    #[test]
242    fn zipf_sample_large_n() {
243        let d = Zipf::new(f64::MAX, 1.5).unwrap();
244        let mut rng = crate::test::rng(2);
245        for _ in 0..1000 {
246            let r = d.sample(&mut rng);
247            assert!(r >= 1.);
248        }
249        // TODO: verify that this is a zeta distribution
250    }
251
252    #[test]
253    fn zipf_value_stability() {
254        test_samples(Zipf::new(10., 0.5).unwrap(), 0f32, &[10.0, 2.0, 6.0, 7.0]);
255        test_samples(Zipf::new(10., 2.0).unwrap(), 0f64, &[1.0, 2.0, 3.0, 2.0]);
256    }
257
258    #[test]
259    fn zipf_distributions_can_be_compared() {
260        assert_eq!(Zipf::new(1.0, 2.0), Zipf::new(1.0, 2.0));
261    }
262}