Skip to content

Commit eae5a72

Browse files
feat: add Bloom Filter to data_structures (TheAlgorithms#1953)
* feat: add bloom_filter.c(bloom filter and bitset for it) * updating DIRECTORY.md * docs: add comments to bloom_filter * test: add tests and minor fixes * docs: more common docs * fix: clang-tidy warnings fix * fix: clang-forma - minor fixes * fix: line-space and better naming in hashStr * updating DIRECTORY.md * fix: apply suggestions from code review Co-authored-by: David Leal <halfpacho@gmail.com> * fix: test_bitset was moved before main. Changed description for template T parameter * Apply suggestions from code review Co-authored-by: David Leal <halfpacho@gmail.com> * fix: namespace fix * fix: minor style fix * Apply suggestions from code review Co-authored-by: David Leal <halfpacho@gmail.com> * fix: params names was removed from prototypes * Update data_structures/bloom_filter.cpp Co-authored-by: David Leal <halfpacho@gmail.com> * docs: minor improvements/fixes Co-authored-by: David <Panquesito7@users.noreply.github.com> Co-authored-by: David Leal <halfpacho@gmail.com>
1 parent 0594923 commit eae5a72

File tree

2 files changed

+292
-0
lines changed

2 files changed

+292
-0
lines changed

DIRECTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
* [Binary Search Tree](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/data_structures/binary_search_tree.cpp)
4242
* [Binary Search Tree2](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/data_structures/binary_search_tree2.cpp)
4343
* [Binaryheap](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/data_structures/binaryheap.cpp)
44+
* [Bloom Filter](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/data_structures/bloom_filter.cpp)
4445
* [Circular Queue Using Linked List](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/data_structures/circular_queue_using_linked_list.cpp)
4546
* Cll
4647
* [Cll](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/data_structures/cll/cll.cpp)

data_structures/bloom_filter.cpp

Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
/**
2+
* @file
3+
* @brief [Bloom Filter](https://en.wikipedia.org/wiki/Bloom_filter)
4+
* generic implementation in C++
5+
* @details A Bloom filter is a space-efficient probabilistic data structure,
6+
* a query returns either "possibly in set" or "definitely not in set".
7+
*
8+
* More generally, fewer than 10 bits per element are required for a 1% false
9+
* positive probability, independent of the size or number of elements in the
10+
* set.
11+
*
12+
* It helps us avoid "expensive operations" such as disk IO: we can use a
13+
* bloom filter to check an incoming request and, with good probability,
14+
* learn from the filter's answer that the "expensive operation" is not
15+
* needed at all.
16+
*
17+
*
18+
* [Very good use case example](https://stackoverflow.com/a/30247022)
19+
*
20+
* Basic bloom filter doesn't support deleting of elements, so
21+
* we don't need to implement deletion in bloom filter and bitset in our case.
22+
* @author [DanArmor](https://github.com/DanArmor)
23+
*/
24+
25+
#include <cassert>           /// for assert
#include <climits>           /// for CHAR_BIT
#include <functional>        /// for list of hash functions for bloom filter constructor
#include <initializer_list>  /// for initializer_list for bloom filter constructor
#include <iostream>          /// for IO operations
#include <string>            /// for testing on strings
#include <vector>            /// for std::vector
31+
32+
/**
33+
* @namespace data_structures
34+
* @brief Data Structures algorithms
35+
*/
36+
namespace data_structures {
37+
/**
 * @brief Simple growable bitset used as bit storage for the bloom filter
 * @details Bits are packed into blocks of type `std::size_t` and every bit of
 * a block is used. Bits can only be turned on; there is no way to clear one.
 */
class Bitset {
 private:
    std::vector<std::size_t> data;  ///< blocks holding the packed bits
    static constexpr std::size_t blockSize =
        sizeof(std::size_t) * CHAR_BIT;  ///< number of bits stored per block

 public:
    explicit Bitset(std::size_t);
    std::size_t size() const;
    void add(std::size_t);
    bool contains(std::size_t) const;
};

/**
 * @brief Utility function to return the size of the inner array
 * @returns the number of blocks (not bits) currently allocated
 */
std::size_t Bitset::size() const { return data.size(); }

/**
 * @brief Bitset class constructor
 * @param initSize initial amount of blocks, all bits of which start at 0
 */
Bitset::Bitset(std::size_t initSize) : data(initSize) {}

/**
 * @brief Turn the bit on position x to 1
 * @details The inner array grows automatically when x lies beyond the
 * currently allocated blocks.
 *
 * @param x position to turn bit on
 * @returns void
 */
void Bitset::add(std::size_t x) {
    const std::size_t blockIndex = x / blockSize;
    if (blockIndex >= data.size()) {
        data.resize(blockIndex + 1);
    }
    // The mask is built from a std::size_t one: shifting a plain `int` by up
    // to blockSize - 1 positions would overflow the int.
    data[blockIndex] |= std::size_t{1} << (x % blockSize);
}

/**
 * @brief Does the bitset contain element x
 *
 * @param x position in bitset to check
 * @returns true if bit position x is 1
 * @returns false if bit position x is 0 or lies beyond the allocated blocks
 */
bool Bitset::contains(std::size_t x) const {
    const std::size_t blockIndex = x / blockSize;
    if (blockIndex >= data.size()) {
        return false;
    }
    return (data[blockIndex] & (std::size_t{1} << (x % blockSize))) != 0;
}
93+
94+
/**
95+
* @brief Bloom filter template class
96+
* @tparam T type of elements that we need to filter
97+
*/
98+
template <typename T>
99+
class BloomFilter {
100+
private:
101+
Bitset set; ///< inner bitset for elements
102+
std::vector<std::function<std::size_t(T)>>
103+
hashFunks; ///< hash functions for T type
104+
105+
public:
106+
BloomFilter(std::size_t,
107+
std::initializer_list<std::function<std::size_t(T)>>);
108+
void add(T);
109+
bool contains(T);
110+
};
111+
112+
/**
113+
* @brief Constructor for Bloom filter
114+
*
115+
* @tparam T type of elements that we need to filter
116+
* @param size initial size of Bloom filter
117+
* @param funks hash functions for T type
118+
* @returns none
119+
*/
120+
template <typename T>
121+
BloomFilter<T>::BloomFilter(
122+
std::size_t size,
123+
std::initializer_list<std::function<std::size_t(T)>> funks)
124+
: set(size), hashFunks(funks) {}
125+
126+
/**
127+
* @brief Add function for Bloom filter
128+
*
129+
* @tparam T type of elements that we need to filter
130+
* @param x element to add to filter
131+
* @returns void
132+
*/
133+
template <typename T>
134+
void BloomFilter<T>::add(T x) {
135+
for (std::size_t i = 0; i < hashFunks.size(); i++) {
136+
set.add(hashFunks[i](x) % (sizeof(std::size_t) * set.size()));
137+
}
138+
}
139+
140+
/**
141+
* @brief Check element function for Bloom filter
142+
*
143+
* @tparam T type of elements that we need to filter
144+
* @param x element to check in filter
145+
* @return true if the element probably appears in the filter
146+
* @return false if the element certainly does not appear in the filter
147+
*/
148+
template <typename T>
149+
bool BloomFilter<T>::contains(T x) {
150+
for (std::size_t i = 0; i < hashFunks.size(); i++) {
151+
if (set.contains(hashFunks[i](x) %
152+
(sizeof(std::size_t) * set.size())) == false) {
153+
return false;
154+
}
155+
}
156+
return true;
157+
}
158+
159+
/**
 * @brief [Function djb2](http://www.cse.yorku.ca/~oz/hash.html)
 * to get hash for the given string.
 * @details Starts from 5381 and, for every byte, computes
 * `hash * 33 + byte`. Each byte is read as `unsigned char` so the result
 * does not depend on whether plain `char` is signed on the target platform.
 *
 * @param s string to get hash from
 * @returns hash for a string (5381 for the empty string)
 */
static std::size_t hashDJB2(std::string const& s) {
    std::size_t hash = 5381;
    for (const char c : s) {
        // Unsigned arithmetic: overflow wraps around and is well defined.
        hash = hash * 33 + static_cast<unsigned char>(c);
    }
    return hash;
}
173+
174+
/**
 * @brief [Hash
 * function](https://stackoverflow.com/questions/8317508/hash-function-for-a-string),
 * to get hash for the given string.
 * @details Starts from 37 and, for every byte, computes
 * `(hash * 54059) ^ (byte * 76963)`. Each byte is read as `unsigned char` so
 * the result does not depend on whether plain `char` is signed on the target
 * platform.
 *
 * @param s string to get hash from
 * @returns hash for the given string (37 for the empty string)
 */
static std::size_t hashStr(std::string const& s) {
    constexpr std::size_t primeNum1 = 54059;
    constexpr std::size_t primeNum2 = 76963;
    std::size_t hash = 37;
    for (const char c : s) {
        const std::size_t byte = static_cast<unsigned char>(c);
        hash = (hash * primeNum1) ^ (byte * primeNum2);
    }
    return hash;
}
191+
192+
/**
 * @brief [Hash function for
 * test](https://stackoverflow.com/questions/664014/what-integer-hash-function-are-good-that-accepts-an-integer-hash-key)
 * @details Two rounds of "xor with the value shifted right by 16, then
 * multiply by 0x45d9f3b", followed by a final xor-shift. The multiplication
 * is carried out in `unsigned int`, where overflow wraps around; multiplying
 * the `int` directly would be signed overflow, i.e. undefined behaviour, for
 * almost every input. The product is stored back into an `int` so that the
 * following shift and the conversion to `std::size_t` behave as they do for
 * a signed value (before C++20 that narrowing is implementation-defined, and
 * wraps modulo 2^N on common compilers).
 *
 * @param x to get hash from
 * @returns hash for the `x` parameter
 */
std::size_t hashInt_1(int x) {
    constexpr unsigned int multiplier = 0x45d9f3bU;
    for (int round = 0; round < 2; ++round) {
        const auto mixed = static_cast<unsigned int>((x >> 16) ^ x);
        x = static_cast<int>(mixed * multiplier);
    }
    x = (x >> 16) ^ x;
    return static_cast<std::size_t>(x);
}
205+
206+
/**
 * @brief [Hash function for
 * test](https://stackoverflow.com/questions/664014/what-integer-hash-function-are-good-that-accepts-an-integer-hash-key)
 * @details Works on the input converted to `std::size_t`: xor-shift by 30,
 * multiply, xor-shift by 27, multiply, xor-shift by 31. All arithmetic is
 * unsigned, so overflow wraps around.
 *
 * @param x to get hash from
 * @returns hash for the `x` parameter
 */
std::size_t hashInt_2(int x) {
    const auto firstMultiplier = static_cast<std::size_t>(0xbf58476d1ce4e5b9);
    const auto secondMultiplier = static_cast<std::size_t>(0x94d049bb133111eb);

    auto state = static_cast<std::size_t>(x);
    state ^= state >> 30;
    state *= firstMultiplier;
    state ^= state >> 27;
    state *= secondMultiplier;
    state ^= state >> 31;
    return state;
}
220+
} // namespace data_structures
221+
222+
/**
223+
* @brief Test for bloom filter with string as generic type
224+
* @returns void
225+
*/
226+
static void test_bloom_filter_string() {
227+
data_structures::BloomFilter<std::string> filter(
228+
10, {data_structures::hashDJB2, data_structures::hashStr});
229+
std::vector<std::string> toCheck{"hello", "world", "!"};
230+
std::vector<std::string> toFalse{"false", "world2", "!!!"};
231+
for (auto& x : toCheck) {
232+
filter.add(x);
233+
}
234+
for (auto& x : toFalse) {
235+
assert(filter.contains(x) == false);
236+
}
237+
for (auto& x : toCheck) {
238+
assert(filter.contains(x));
239+
}
240+
}
241+
242+
/**
243+
* @brief Test for bloom filter with int as generic type
244+
* @returns void
245+
*/
246+
static void test_bloom_filter_int() {
247+
data_structures::BloomFilter<int> filter(
248+
20, {data_structures::hashInt_1, data_structures::hashInt_2});
249+
std::vector<int> toCheck{100, 200, 300, 50};
250+
std::vector<int> toFalse{1, 2, 3, 4, 5, 6, 7, 8};
251+
for (int x : toCheck) {
252+
filter.add(x);
253+
}
254+
for (int x : toFalse) {
255+
assert(filter.contains(x) == false);
256+
}
257+
for (int x : toCheck) {
258+
assert(filter.contains(x));
259+
}
260+
}
261+
262+
/**
263+
* @brief Test for bitset
264+
*
265+
* @returns void
266+
*/
267+
static void test_bitset() {
268+
data_structures::Bitset set(2);
269+
std::vector<std::size_t> toCheck{0, 1, 5, 8, 63, 64, 67, 127};
270+
for (auto x : toCheck) {
271+
set.add(x);
272+
assert(set.contains(x));
273+
}
274+
assert(set.contains(128) == false);
275+
assert(set.contains(256) == false);
276+
}
277+
278+
/**
279+
* @brief Main function
280+
* @returns 0 on exit
281+
*/
282+
int main() {
283+
// run self-test implementations
284+
285+
test_bitset(); // run test for bitset, because bloom filter is depending on it
286+
test_bloom_filter_string();
287+
test_bloom_filter_int();
288+
289+
std::cout << "All tests have successfully passed!\n";
290+
return 0;
291+
}

0 commit comments

Comments
 (0)