|
| 1 | +""" |
| 2 | +See https://en.wikipedia.org/wiki/Bloom_filter |
| 3 | +""" |
| 4 | +from hashlib import sha256, md5 |
| 5 | +from random import randint, choices |
| 6 | +import string |
| 7 | + |
| 8 | + |
class Bloom:
    """Minimal Bloom filter whose bit array is stored in a single integer.

    Every value is hashed with SHA-256 and MD5; each digest selects one bit
    position in ``range(size)``. A value is reported as present when all of
    its bits are set in the filter, so values that were added are always
    found, while values that were never added may still be reported as
    present (false positives).

    ``add`` and ``exists`` print a short trace of the hash and of the filter
    state, which makes the class suitable for demonstrations.
    """

    def __init__(self, size=8):
        """Create an empty filter with ``size`` bits.

        Raises:
            ValueError: if ``size`` is not positive. A zero size would make
                the modulo in ``hash`` divide by zero, and a negative size
                would produce negative bit positions.
        """
        if size <= 0:
            raise ValueError(f"size must be a positive integer, got {size!r}")
        self.bitstring = 0b0
        self.size = size

    def add(self, value):
        """Set the bits of ``value`` in the filter and print a trace."""
        h = self.hash(value)
        self.bitstring |= h
        print(
            f"""\
[add] value = {value}
    hash = {self.format_bin(h)}
    filter = {self.format_bin(self.bitstring)}
"""
        )

    def exists(self, value):
        """Return True when every bit of ``value`` is set in the filter.

        A True result may be a false positive; a False result means the
        value was never added. A trace is printed on every call.
        """
        h = self.hash(value)
        # All bits of the hash must already be present in the filter.
        res = (h & self.bitstring) == h

        print(
            f"""\
[exists] value = {value}
    hash = {self.format_bin(h)}
    filter = {self.format_bin(self.bitstring)}
    res = {res}
"""
        )
        return res

    def format_bin(self, value):
        """Return ``value`` in binary, left-padded with zeros to ``size`` digits."""
        res = bin(value)[2:]
        return res.zfill(self.size)

    def hash(self, value):
        """Return the bit mask of ``value``: one bit per hash function.

        ``value`` must be a ``str`` (it is encoded before hashing). The two
        hash functions may select the same position, in which case the mask
        has a single bit set.
        """
        res = 0b0
        for func in (sha256, md5):
            digest = func(value.encode()).digest()
            position = int.from_bytes(digest, "little") % self.size
            res |= 1 << position
        return res
| 50 | + |
| 51 | + |
def test_movies():
    """Exercise a default-sized filter with a handful of movie titles.

    Titles that were added must be found. Titles that were never added may
    or may not be reported as present, so only the result type is checked.
    """
    bloom = Bloom()
    inserted = ("titanic", "avatar")
    for title in inserted:
        bloom.add(title)

    for title in inserted:
        assert bloom.exists(title)

    # False positives are allowed here, so either answer is accepted.
    for title in ("the goodfather", "interstellar", "Parasite", "Pulp fiction"):
        assert bloom.exists(title) in (True, False)
| 64 | + |
| 65 | + |
def random_string(size):
    """Return ``size`` random characters drawn from lowercase letters and space."""
    alphabet = string.ascii_lowercase + " "
    picked = choices(alphabet, k=size)
    return "".join(picked)
| 68 | + |
| 69 | + |
def test_probability(m=64, n=20):
    """Compare the measured false-positive rate with the expected one.

    Builds a filter of ``m`` bits, adds up to ``n`` random strings, then
    queries random strings that were not added and counts how many are
    reported as present. The measured rate must be within 5% of the rate
    derived from the number of bits actually set in the filter.
    """
    b = Bloom(size=m)

    added = {random_string(10) for _ in range(n)}
    for a in added:
        b.add(a)

    # number of hash functions is fixed
    k = 2

    # Probability that k independently chosen bits are all set, given the
    # fraction of bits currently set in the filter.
    n_ones = bin(b.bitstring).count("1")
    expected_probability = (n_ones / m) ** k

    # Theoretical estimate; printed for comparison only.
    expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k

    # Remove anything that was actually added so that every hit counted
    # below is a genuine false positive.
    not_added = {random_string(10) for _ in range(1000)} - added
    fails = 0
    for candidate in not_added:
        if b.exists(candidate):
            fails += 1
    fail_rate = fails / len(not_added)

    print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}")
    print(f"{expected_probability=}")
    print(f"{expected_probability_wikipedia=}")

    assert (
        abs(expected_probability - fail_rate) <= 0.05
    )  # 5% margin calculated experimentally
| 99 | + |
| 100 | + |
if __name__ == "__main__":
    # Run the demo checks in order when the file is executed as a script.
    for run_check in (test_movies, test_probability):
        run_check()
0 commit comments