nostr_types/types/
hll8.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
use crate::Error;
use std::ops::AddAssign;

/// HyperLogLog approximate counting mechanism
///
/// This uses a fixed set of 256 buckets (k=8, M=256), each holding a count up to 255.
/// https://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
///
/// This is used for NIP-45 PR #1561
#[derive(Debug, Clone, Copy)]
pub struct Hll8([u8; 256]);

impl Default for Hll8 {
    fn default() -> Self {
        Self::new()
    }
}

impl Hll8 {
    /// Create a new Hll8
    pub fn new() -> Hll8 {
        Hll8([0; 256])
    }

    /// Import from a (hex) string
    pub fn from_hex_string(s: &str) -> Result<Hll8, Error> {
        let vec: Vec<u8> = hex::decode(s)?;
        let arr: [u8; 256] = vec.try_into().map_err(|_| Error::InvalidHll)?;
        Ok(Hll8(arr))
    }

    /// Export to a (hex) string
    pub fn to_hex_string(&self) -> String {
        hex::encode(self.0)
    }

    /// Clear to zero counts
    pub fn clear(&mut self) {
        for i in 0..=255 {
            self.0[i] = 0
        }
    }

    /// Add an element to the count by value
    /// Returns false on error (e.g. offset is out of range)
    pub fn add_element(&mut self, input: &[u8; 32], offset: usize) -> Result<(), Error> {
        if offset >= 24 {
            return Err(Error::OutOfRange(offset));
        }

        // Use the byte at that offset as the bucket
        let index = input[offset];

        // Count zeros after that offset
        let zeros = {
            let mut zeros: u8 = 0;
            #[allow(clippy::needless_range_loop)]
            for i in offset + 1..=31 {
                let leading = input[i].leading_zeros();
                zeros += leading as u8;
                if leading < 8 {
                    break;
                }
            }
            zeros
        };

        let rho = zeros + 1;
        self.add_element_inner(index, rho);

        Ok(())
    }

    /// Add an element to the count by index and rho (position of the first 1)
    pub fn add_element_inner(&mut self, index: u8, rho: u8) {
        if rho > self.0[index as usize] {
            self.0[index as usize] = rho;
        }
    }

    /// Compute the approximate count
    pub fn estimate_count(&self) -> usize {
        // 2007 paper calls this 'V';  2016 paper calls this 'z'
        let zero_count = self.0.iter().filter(|&c| *c == 0).count();

        // Sum over the reciprocals (SUM of 2^-m)
        let mut sum: f64 = 0.0;
        for i in 0..=255 {
            let power: usize = 1 << self.0[i];
            sum += 1.0 / (power as f64);
        }

        let estimate = estimate_hyperloglog(sum, zero_count);
        // estimate_fiatjaf(sum, zero_count);

        estimate.round() as usize
    }
}

impl AddAssign for Hll8 {
    fn add_assign(&mut self, other: Self) {
        for i in 0..=255 {
            if other.0[i] > self.0[i] {
                self.0[i] = other.0[i];
            }
        }
    }
}

// The number of buckets we use = 2**k  ('m' from the papers)
const M: f64 = 256.0;

// The correction factor, 'α' from the paper for k=8,M=256
const ALPHA: f64 = 0.7213 / (1.0 + 1.079 / M);

// 2^32 as a floating point number
const TWO32: f64 = 4_294_967_296.0;

// HyperLogLog++ Threshold, when to switch from linear counting for M=256 (k=8)
#[allow(dead_code)]
const THRESHOLD: f64 = 220.0;

fn estimate_hyperloglog(sum: f64, zero_count: usize) -> f64 {
    let mut estimate = ALPHA * (M * M) / sum;
    if estimate <= (5.0 / 2.0) * M {
        // 640
        if zero_count != 0 {
            estimate = M * (M / (zero_count as f64)).ln(); // linear
        }
    } else if estimate > (1.0 / 30.0) * TWO32 {
        // 143165576
        estimate = -TWO32 * (1.0 - estimate / TWO32).log2();
    };
    estimate
}

#[allow(dead_code)]
fn estimate_fiatjaf(sum: f64, zero_count: usize) -> f64 {
    let estimate = ALPHA * M * M / sum;
    if zero_count == 0 {
        return estimate;
    }
    let linear = M * (M / zero_count as f64).ln();
    if linear <= THRESHOLD {
        linear
    } else if estimate < 256.0 * 3.0 {
        // 768
        linear
    } else {
        estimate
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_hll8() {
        use rand::seq::SliceRandom;
        use rand_core::{OsRng, RngCore};
        let mut rng = rand::thread_rng();

        // Create 2500 well known different keys
        let mut input: Vec<[u8; 32]> = Vec::new();
        for _ in 0..2500 {
            let mut key = [0u8; 32];
            OsRng.fill_bytes(&mut key);
            input.push(key);
        }

        for numkeys in [
            1, 2, 3, 5, 9, 15, 33, 50, 89, 115, 150, 195, 260, 420, 1000, 2500,
        ] {
            // Test Hll8 using these keys
            let mut h = Hll8::new();
            for _ in 0..=100_000 {
                let random_key = input[0..numkeys].choose(&mut rng).unwrap();
                h.add_element(random_key, 16).unwrap();
            }

            // Even though we added 100,000 elements, there are only numkey distinct ones.

            println!("Actual: {}  Estimate: {}", numkeys, h.estimate_count());
        }
    }
}