Skip to content

Commit 173ab0e

Browse files
committed
Bloom filter with tests
1 parent b2b8585 commit 173ab0e

File tree

1 file changed

+103
-0
lines changed

1 file changed

+103
-0
lines changed
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
2+
See https://en.wikipedia.org/wiki/Bloom_filter
3+
"""
4+
from hashlib import sha256, md5
5+
from random import randint, choices
6+
import string
7+
8+
9+
class Bloom:
    """Minimal Bloom filter whose bit array is stored in a single Python int.

    Every value is hashed with SHA-256 and MD5; each digest selects one of
    ``size`` bit positions.  Bits are only ever set, never cleared, so a
    value that was added is always reported as present, while a value that
    was never added may still be reported as present (false positive).
    """

    def __init__(self, size=8):
        """Create an empty filter with ``size`` bit positions.

        Args:
            size: number of bit positions; must be greater than zero.

        Raises:
            ValueError: if ``size`` is zero or negative.
        """
        # A zero size makes ``% self.size`` divide by zero in ``hash`` and a
        # negative size yields non-positive bit positions, so reject both
        # here instead of failing later on the first ``add``/``exists``.
        if size <= 0:
            raise ValueError(f"size must be a positive integer, got {size!r}")
        self.bitstring = 0b0
        self.size = size

    def add(self, value):
        """Set the bits selected by ``value`` and print the updated state.

        Args:
            value: string to insert into the filter.
        """
        h = self.hash(value)
        self.bitstring |= h
        print(
            f"[add] value = {value}\n"
            f"hash = {self.format_bin(h)}\n"
            f"filter = {self.format_bin(self.bitstring)}\n"
        )

    def exists(self, value):
        """Report whether every bit selected by ``value`` is set.

        Prints the lookup details as a side effect.

        Args:
            value: string to look up.

        Returns:
            ``True`` if all of the value's bits are set in the filter (the
            value may have been added), ``False`` otherwise.
        """
        h = self.hash(value)
        # The value can only be present if all of its bits are already set.
        res = (h & self.bitstring) == h

        print(
            f"[exists] value = {value}\n"
            f"hash = {self.format_bin(h)}\n"
            f"filter = {self.format_bin(self.bitstring)}\n"
            f"res = {res}\n"
        )
        return res

    def format_bin(self, value):
        """Return ``value`` in binary, left-padded with zeros to ``size`` digits."""
        res = bin(value)[2:]  # drop the "0b" prefix
        return res.zfill(self.size)

    def hash(self, value):
        """Return the bit mask that ``value`` maps to.

        Args:
            value: string to hash; it is encoded with ``str.encode()``.

        Returns:
            An int with one bit set per hash function.  Only one bit is set
            when both digests select the same position.
        """
        res = 0b0
        for func in (sha256, md5):
            b = func(value.encode()).digest()
            # Reduce the digest, read as a little-endian integer, to a
            # position in the range [0, size).
            position = int.from_bytes(b, "little") % self.size
            res |= 1 << position
        return res
50+
51+
52+
def test_movies():
    """Check that inserted titles are found; other lookups may go either way."""
    bloom = Bloom()
    inserted = ("titanic", "avatar")
    for title in inserted:
        bloom.add(title)

    # Every title that was added must be reported as present.
    for title in inserted:
        assert bloom.exists(title)

    # Titles that were never added may be false positives, so the only
    # requirement is that the lookup answers True or False.
    for title in ("the goodfather", "interstellar", "Parasite", "Pulp fiction"):
        assert bloom.exists(title) in (True, False)
64+
65+
66+
def random_string(size):
    """Return ``size`` random characters drawn from lowercase ASCII letters and the space."""
    alphabet = string.ascii_lowercase + " "
    picked = choices(alphabet, k=size)
    return "".join(picked)
68+
69+
70+
def test_probability(m=64, n=20):
    """Compare the measured false-positive rate with the rate predicted from the filter's fill.

    Args:
        m: number of bit positions in the filter.
        n: number of random strings generated for insertion (duplicates
            collapse in the set, so fewer may actually be added).

    Raises:
        AssertionError: if the measured and predicted rates differ by more
            than 0.05.
    """
    b = Bloom(size=m)

    added = {random_string(10) for _ in range(n)}
    for a in added:
        b.add(a)

    # number of hash functions is fixed
    k = 2

    # Prediction from the observed fill: a lookup is a false positive when
    # each of its k positions lands on a bit that is already set.
    n_ones = bin(b.bitstring).count("1")
    expected_probability = (n_ones / m) ** k

    expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k

    # Drop any probe that was actually inserted: a hit on it is a true
    # positive and must not be counted as a failure.
    not_added = {random_string(10) for _ in range(1000)} - added
    # The loop variable is not called ``string`` so that it does not shadow
    # the imported ``string`` module.
    fails = sum(1 for candidate in not_added if b.exists(candidate))
    fail_rate = fails / len(not_added)

    print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}")
    print(f"{expected_probability=}")
    print(f"{expected_probability_wikipedia=}")

    assert (
        abs(expected_probability - fail_rate) <= 0.05
    )  # 5% margin calculated experimentally
99+
100+
101+
if __name__ == "__main__":
    # Run both self-checks, in order, when the file is executed as a script.
    for check in (test_movies, test_probability):
        check()

0 commit comments

Comments
 (0)