nostr_types/types/
hll8.rs

1use crate::Error;
2use std::ops::AddAssign;
3
4/// HyperLogLog approximate counting mechanism
5///
6/// This uses a fixed set of 256 buckets (k=8, M=256), each holding a count up to 255.
7/// https://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
8///
9/// This is used for NIP-45 PR #1561
10#[derive(Debug, Clone, Copy)]
11pub struct Hll8([u8; 256]);
12
13impl Default for Hll8 {
14    fn default() -> Self {
15        Self::new()
16    }
17}
18
19impl Hll8 {
20    /// Create a new Hll8
21    pub fn new() -> Hll8 {
22        Hll8([0; 256])
23    }
24
25    /// Import from a (hex) string
26    pub fn from_hex_string(s: &str) -> Result<Hll8, Error> {
27        let vec: Vec<u8> = hex::decode(s)?;
28        let arr: [u8; 256] = vec.try_into().map_err(|_| Error::InvalidHll)?;
29        Ok(Hll8(arr))
30    }
31
32    /// Export to a (hex) string
33    pub fn to_hex_string(&self) -> String {
34        hex::encode(self.0)
35    }
36
37    /// Clear to zero counts
38    pub fn clear(&mut self) {
39        for i in 0..=255 {
40            self.0[i] = 0
41        }
42    }
43
44    /// Add an element to the count by value
45    /// Returns false on error (e.g. offset is out of range)
46    pub fn add_element(&mut self, input: &[u8; 32], offset: usize) -> Result<(), Error> {
47        if offset >= 24 {
48            return Err(Error::OutOfRange(offset));
49        }
50
51        // Use the byte at that offset as the bucket
52        let index = input[offset];
53
54        // Count zeros after that offset
55        let zeros = {
56            let mut zeros: u8 = 0;
57            #[allow(clippy::needless_range_loop)]
58            for i in offset + 1..=31 {
59                let leading = input[i].leading_zeros();
60                zeros += leading as u8;
61                if leading < 8 {
62                    break;
63                }
64            }
65            zeros
66        };
67
68        let rho = zeros + 1;
69        self.add_element_inner(index, rho);
70
71        Ok(())
72    }
73
74    /// Add an element to the count by index and rho (position of the first 1)
75    pub fn add_element_inner(&mut self, index: u8, rho: u8) {
76        if rho > self.0[index as usize] {
77            self.0[index as usize] = rho;
78        }
79    }
80
81    /// Compute the approximate count
82    pub fn estimate_count(&self) -> usize {
83        // 2007 paper calls this 'V';  2016 paper calls this 'z'
84        let zero_count = self.0.iter().filter(|&c| *c == 0).count();
85
86        // Sum over the reciprocals (SUM of 2^-m)
87        let mut sum: f64 = 0.0;
88        for i in 0..=255 {
89            let power: usize = 1 << self.0[i];
90            sum += 1.0 / (power as f64);
91        }
92
93        let estimate = estimate_hyperloglog(sum, zero_count);
94        // estimate_fiatjaf(sum, zero_count);
95
96        estimate.round() as usize
97    }
98}
99
100impl AddAssign for Hll8 {
101    fn add_assign(&mut self, other: Self) {
102        for i in 0..=255 {
103            if other.0[i] > self.0[i] {
104                self.0[i] = other.0[i];
105            }
106        }
107    }
108}
109
110// The number of buckets we use = 2**k  ('m' from the papers)
111const M: f64 = 256.0;
112
113// The correction factor, 'α' from the paper for k=8,M=256
114const ALPHA: f64 = 0.7213 / (1.0 + 1.079 / M);
115
116// 2^32 as a floating point number
117const TWO32: f64 = 4_294_967_296.0;
118
119// HyperLogLog++ Threshold, when to switch from linear counting for M=256 (k=8)
120#[allow(dead_code)]
121const THRESHOLD: f64 = 220.0;
122
123fn estimate_hyperloglog(sum: f64, zero_count: usize) -> f64 {
124    let mut estimate = ALPHA * (M * M) / sum;
125    if estimate <= (5.0 / 2.0) * M {
126        // 640
127        if zero_count != 0 {
128            estimate = M * (M / (zero_count as f64)).ln(); // linear
129        }
130    } else if estimate > (1.0 / 30.0) * TWO32 {
131        // 143165576
132        estimate = -TWO32 * (1.0 - estimate / TWO32).log2();
133    };
134    estimate
135}
136
137#[allow(dead_code)]
138fn estimate_fiatjaf(sum: f64, zero_count: usize) -> f64 {
139    let estimate = ALPHA * M * M / sum;
140    if zero_count == 0 {
141        return estimate;
142    }
143    let linear = M * (M / zero_count as f64).ln();
144    if linear <= THRESHOLD {
145        linear
146    } else if estimate < 256.0 * 3.0 {
147        // 768
148        linear
149    } else {
150        estimate
151    }
152}
153
154#[cfg(test)]
155mod test {
156    use super::*;
157
158    #[test]
159    fn test_hll8() {
160        use rand::seq::SliceRandom;
161        use rand_core::{OsRng, RngCore};
162        let mut rng = rand::thread_rng();
163
164        // Create 2500 well known different keys
165        let mut input: Vec<[u8; 32]> = Vec::new();
166        for _ in 0..2500 {
167            let mut key = [0u8; 32];
168            OsRng.fill_bytes(&mut key);
169            input.push(key);
170        }
171
172        for numkeys in [
173            1, 2, 3, 5, 9, 15, 33, 50, 89, 115, 150, 195, 260, 420, 1000, 2500,
174        ] {
175            // Test Hll8 using these keys
176            let mut h = Hll8::new();
177            for _ in 0..=100_000 {
178                let random_key = input[0..numkeys].choose(&mut rng).unwrap();
179                h.add_element(random_key, 16).unwrap();
180            }
181
182            // Even though we added 100,000 elements, there are only numkey distinct ones.
183
184            println!("Actual: {}  Estimate: {}", numkeys, h.estimate_count());
185        }
186    }
187}