rand_distr/
zipf.rs

1// Copyright 2021 Developers of the Rand project.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! The Zipf distribution.
10
11use crate::{Distribution, StandardUniform};
12use core::fmt;
13use num_traits::Float;
14use rand::Rng;
15
16/// The Zipf (Zipfian) distribution `Zipf(n, s)`.
17///
18/// The samples follow [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law):
19/// The frequency of each sample from a finite set of size `n` is inversely
20/// proportional to a power of its frequency rank (with exponent `s`).
21///
22/// For large `n`, this converges to the [`Zeta`](crate::Zeta) distribution.
23///
24/// For `s = 0`, this becomes a [`uniform`](crate::Uniform) distribution.
25///
26/// # Plot
27///
28/// The following plot illustrates the Zipf distribution for `n = 10` and
29/// various values of `s`.
30///
31/// ![Zipf distribution](https://raw.githubusercontent.com/rust-random/charts/main/charts/zipf.svg)
32///
33/// # Example
34/// ```
35/// use rand::prelude::*;
36/// use rand_distr::Zipf;
37///
38/// let val: f64 = rand::rng().sample(Zipf::new(10.0, 1.5).unwrap());
39/// println!("{}", val);
40/// ```
41///
42/// # Integer vs FP return type
43///
44/// This implementation uses floating-point (FP) logic internally. It may be
45/// expected that the samples are no greater than `n`, thus it is reasonable to
46/// cast generated samples to any integer type which can also represent `n`
47/// (e.g. `distr.sample(&mut rng) as u64`).
48///
49/// # Implementation details
50///
51/// Implemented via [rejection sampling](https://en.wikipedia.org/wiki/Rejection_sampling),
52/// due to Jason Crease[1].
53///
54/// [1]: https://jasoncrease.medium.com/rejection-sampling-the-zipf-distribution-6b359792cffa
55#[derive(Clone, Copy, Debug, PartialEq)]
56pub struct Zipf<F>
57where
58    F: Float,
59    StandardUniform: Distribution<F>,
60{
61    s: F,
62    t: F,
63    q: F,
64}
65
66/// Error type returned from [`Zipf::new`].
67#[derive(Clone, Copy, Debug, PartialEq, Eq)]
68pub enum Error {
69    /// `s < 0` or `nan`.
70    STooSmall,
71    /// `n < 1`.
72    NTooSmall,
73}
74
75impl fmt::Display for Error {
76    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
77        f.write_str(match self {
78            Error::STooSmall => "s < 0 or is NaN in Zipf distribution",
79            Error::NTooSmall => "n < 1 in Zipf distribution",
80        })
81    }
82}
83
84#[cfg(feature = "std")]
85impl std::error::Error for Error {}
86
87impl<F> Zipf<F>
88where
89    F: Float,
90    StandardUniform: Distribution<F>,
91{
92    /// Construct a new `Zipf` distribution for a set with `n` elements and a
93    /// frequency rank exponent `s`.
94    ///
95    /// The parameter `n` is typically integral, however we use type
96    /// <pre><code>F: [Float]</code></pre> in order to permit very large values
97    /// and since our implementation requires a floating-point type.
98    #[inline]
99    pub fn new(n: F, s: F) -> Result<Zipf<F>, Error> {
100        if !(s >= F::zero()) {
101            return Err(Error::STooSmall);
102        }
103        if n < F::one() {
104            return Err(Error::NTooSmall);
105        }
106        let q = if s != F::one() {
107            // Make sure to calculate the division only once.
108            F::one() / (F::one() - s)
109        } else {
110            // This value is never used.
111            F::zero()
112        };
113        let t = if s != F::one() {
114            (n.powf(F::one() - s) - s) * q
115        } else {
116            F::one() + n.ln()
117        };
118        debug_assert!(t > F::zero());
119        Ok(Zipf { s, t, q })
120    }
121
122    /// Inverse cumulative density function
123    #[inline]
124    fn inv_cdf(&self, p: F) -> F {
125        let one = F::one();
126        let pt = p * self.t;
127        if pt <= one {
128            pt
129        } else if self.s != one {
130            (pt * (one - self.s) + self.s).powf(self.q)
131        } else {
132            (pt - one).exp()
133        }
134    }
135}
136
137impl<F> Distribution<F> for Zipf<F>
138where
139    F: Float,
140    StandardUniform: Distribution<F>,
141{
142    #[inline]
143    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> F {
144        let one = F::one();
145        loop {
146            let inv_b = self.inv_cdf(rng.sample(StandardUniform));
147            let x = (inv_b + one).floor();
148            let mut ratio = x.powf(-self.s);
149            if x > one {
150                ratio = ratio * inv_b.powf(self.s)
151            };
152
153            let y = rng.sample(StandardUniform);
154            if y < ratio {
155                return x;
156            }
157        }
158    }
159}
160
161#[cfg(test)]
162mod tests {
163    use super::*;
164
165    fn test_samples<F: Float + fmt::Debug, D: Distribution<F>>(distr: D, zero: F, expected: &[F]) {
166        let mut rng = crate::test::rng(213);
167        let mut buf = [zero; 4];
168        for x in &mut buf {
169            *x = rng.sample(&distr);
170        }
171        assert_eq!(buf, expected);
172    }
173
174    #[test]
175    #[should_panic]
176    fn zipf_s_too_small() {
177        Zipf::new(10., -1.).unwrap();
178    }
179
180    #[test]
181    #[should_panic]
182    fn zipf_n_too_small() {
183        Zipf::new(0., 1.).unwrap();
184    }
185
186    #[test]
187    #[should_panic]
188    fn zipf_nan() {
189        Zipf::new(10., f64::NAN).unwrap();
190    }
191
192    #[test]
193    fn zipf_sample() {
194        let d = Zipf::new(10., 0.5).unwrap();
195        let mut rng = crate::test::rng(2);
196        for _ in 0..1000 {
197            let r = d.sample(&mut rng);
198            assert!(r >= 1.);
199        }
200    }
201
202    #[test]
203    fn zipf_sample_s_1() {
204        let d = Zipf::new(10., 1.).unwrap();
205        let mut rng = crate::test::rng(2);
206        for _ in 0..1000 {
207            let r = d.sample(&mut rng);
208            assert!(r >= 1.);
209        }
210    }
211
212    #[test]
213    fn zipf_sample_s_0() {
214        let d = Zipf::new(10., 0.).unwrap();
215        let mut rng = crate::test::rng(2);
216        for _ in 0..1000 {
217            let r = d.sample(&mut rng);
218            assert!(r >= 1.);
219        }
220        // TODO: verify that this is a uniform distribution
221    }
222
223    #[test]
224    fn zipf_sample_large_n() {
225        let d = Zipf::new(f64::MAX, 1.5).unwrap();
226        let mut rng = crate::test::rng(2);
227        for _ in 0..1000 {
228            let r = d.sample(&mut rng);
229            assert!(r >= 1.);
230        }
231        // TODO: verify that this is a zeta distribution
232    }
233
234    #[test]
235    fn zipf_value_stability() {
236        test_samples(Zipf::new(10., 0.5).unwrap(), 0f32, &[10.0, 2.0, 6.0, 7.0]);
237        test_samples(Zipf::new(10., 2.0).unwrap(), 0f64, &[1.0, 2.0, 3.0, 2.0]);
238    }
239
240    #[test]
241    fn zipf_distributions_can_be_compared() {
242        assert_eq!(Zipf::new(1.0, 2.0), Zipf::new(1.0, 2.0));
243    }
244}