From 63d8160ccba1acfaa06bba4264eb0607f5cd22b2 Mon Sep 17 00:00:00 2001 From: mathieui Date: Wed, 26 Jun 2013 18:08:41 +0200 Subject: [PATCH 1/3] Add several ensemblist methods to BloomFilter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on https://en.wikipedia.org/wiki/Bloom_filter#The_union_and_intersection_of_sets I didn’t find any references on the precision of those operations in Swamidass & Baldi (2007), so I assumed a 1% error margin during the tests. - approxCount() - approxInterSize() - approxUnionSize() This one is not an approximation: - union() --- src/com/skjegstad/utils/BloomFilter.java | 50 ++++++++++++- test/com/skjegstad/utils/BloomFilterTest.java | 70 ++++++++++++++++++- 2 files changed, 117 insertions(+), 3 deletions(-) diff --git a/src/com/skjegstad/utils/BloomFilter.java b/src/com/skjegstad/utils/BloomFilter.java index 03cbd76..8149221 100644 --- a/src/com/skjegstad/utils/BloomFilter.java +++ b/src/com/skjegstad/utils/BloomFilter.java @@ -318,7 +318,7 @@ public void addAll(Collection c) { for (E element : c) add(element); } - + /** * Returns true if the element could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this @@ -438,4 +438,50 @@ public double getExpectedBitsPerElement() { public double getBitsPerElement() { return this.bitSetSize / (double)numberOfAddedElements; } -} \ No newline at end of file + + /** + * Approximate the size of the Bloom filter + * + * @return the approximate number of elements in that Bloom filter + */ + public int approxCount() { + double N = this.expectedNumberOfFilterElements * this.bitsPerElement; + System.out.println("toto: "+(-N * Math.log(1 - ((double)this.bitset.cardinality()/N))/k)); + return (int) (-N * Math.log(1 - ((double)this.bitset.cardinality()/N))/k); + } + + /** + * Performs the Union of two bloom filters + * + * @param bf A compatible Bloom filter. 
+ */ + public void union(BloomFilter bf) { + this.bitset.or(bf.bitset); + this.numberOfAddedElements += bf.numberOfAddedElements; + } + + /** + * Approximate the size of the intersection of two bloom filters + * + * @return The approximate cardinality of the intersection of the two bloom filters + */ + public int approxInterSize(BloomFilter bf) { + return (int) (this.approxCount() + bf.approxCount() - approxUnionSize(bf)); + } + + /** + * Approximate the size of the union between several bloom filters + * + * @return The approximate cardinality of the union of those bloom filters + */ + public int approxUnionSize(BloomFilter ... blooms) { + BitSet newBitset = (BitSet) this.bitset.clone(); + double N = this.expectedNumberOfFilterElements * this.bitsPerElement; + + for (BloomFilter bf: blooms) { + newBitset.or(bf.bitset); + } + + return (int) (- N * Math.log(1 - ((double)newBitset.cardinality()/N))/k); + } +} diff --git a/test/com/skjegstad/utils/BloomFilterTest.java b/test/com/skjegstad/utils/BloomFilterTest.java index 6359ac8..5d3af04 100644 --- a/test/com/skjegstad/utils/BloomFilterTest.java +++ b/test/com/skjegstad/utils/BloomFilterTest.java @@ -483,5 +483,73 @@ public void testCount() { assertEquals(expResult, result); } + /** + * Test of approxCount method, of class BloomFilter. 
+ */ + @Test + public void testApproxCount() { + System.out.println("approxCount"); + int expResult = 100; + + BloomFilter instance = new BloomFilter(0.01, expResult); + for (int i = 0; i < expResult; i++) { + instance.add(i); + } + int result = instance.approxCount(); + assertEquals(expResult, result, expResult/100); + + expResult = 1000; + + instance = new BloomFilter(0.001, expResult); + for (int i = 0; i < expResult; i++) { + instance.add(i); + } + result = instance.approxCount(); + assertEquals(expResult, result, expResult/100); + + expResult = 10000; + + instance = new BloomFilter(0.0001, expResult); + for (int i = 0; i < expResult; i++) { + instance.add(i); + } + result = instance.approxCount(); + assertEquals(expResult, result, expResult/100); + } + + /** + * Test of approxCount method, of class BloomFilter. + */ + @Test + public void testApproxCount() { + System.out.println("approxCount"); + int expResult = 100; + + BloomFilter instance = new BloomFilter(0.01, expResult); + for (int i = 0; i < expResult; i++) { + instance.add(i); + } + int result = instance.approxCount(); + assertEquals(expResult, result, expResult/100); + + expResult = 1000; + + instance = new BloomFilter(0.001, expResult); + for (int i = 0; i < expResult; i++) { + instance.add(i); + } + result = instance.approxCount(); + assertEquals(expResult, result, expResult/100); + + expResult = 10000; + + instance = new BloomFilter(0.0001, expResult); + for (int i = 0; i < expResult; i++) { + instance.add(i); + } + result = instance.approxCount(); + assertEquals(expResult, result, expResult/100); + } + -} \ No newline at end of file +} From 5be1b3cff4550ca88592b560924c12d55d1ee60f Mon Sep 17 00:00:00 2001 From: mathieui Date: Thu, 27 Jun 2013 12:22:54 +0200 Subject: [PATCH 2/3] Add a isCompatible() function to check the compatibility of filters (for ensemblist operations) --- src/com/skjegstad/utils/BloomFilter.java | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff 
--git a/src/com/skjegstad/utils/BloomFilter.java b/src/com/skjegstad/utils/BloomFilter.java index 8149221..05a6772 100644 --- a/src/com/skjegstad/utils/BloomFilter.java +++ b/src/com/skjegstad/utils/BloomFilter.java @@ -183,6 +183,36 @@ public static int[] createHashes(byte[] data, int hashes) { return result; } + /** + * Compares the properties of two instances to see if they are compatible. + * + * If they are compatible, the ensemblist operations work on them. + * + * @param obj is the object to compare to. + * @return True if the contents of the objects are compatible. + */ + @Override + public boolean isCompatible(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final BloomFilter other = (BloomFilter) obj; + if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) { + return false; + } + if (this.k != other.k) { + return false; + } + if (this.bitSetSize != other.bitSetSize) { + return false; + } + return true; + } + + /** * Compares the contents of two instances to see if they are equal. 
* @@ -446,7 +476,6 @@ public double getBitsPerElement() { */ public int approxCount() { double N = this.expectedNumberOfFilterElements * this.bitsPerElement; - System.out.println("toto: "+(-N * Math.log(1 - ((double)this.bitset.cardinality()/N))/k)); return (int) (-N * Math.log(1 - ((double)this.bitset.cardinality()/N))/k); } From 5b040d1d055abff29666e36ec395c4cf4e7fc415 Mon Sep 17 00:00:00 2001 From: mathieui Date: Thu, 27 Jun 2013 12:30:47 +0200 Subject: [PATCH 3/3] Added a small unit test for isCompatible() (also fix a compile error) --- src/com/skjegstad/utils/BloomFilter.java | 1 - test/com/skjegstad/utils/BloomFilterTest.java | 45 ++++++++++--------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/com/skjegstad/utils/BloomFilter.java b/src/com/skjegstad/utils/BloomFilter.java index 05a6772..389e8fe 100644 --- a/src/com/skjegstad/utils/BloomFilter.java +++ b/src/com/skjegstad/utils/BloomFilter.java @@ -191,7 +191,6 @@ public static int[] createHashes(byte[] data, int hashes) { * @param obj is the object to compare to. * @return True if the contents of the objects are compatible. */ - @Override public boolean isCompatible(Object obj) { if (obj == null) { return false; diff --git a/test/com/skjegstad/utils/BloomFilterTest.java b/test/com/skjegstad/utils/BloomFilterTest.java index 5d3af04..fdfdb0b 100644 --- a/test/com/skjegstad/utils/BloomFilterTest.java +++ b/test/com/skjegstad/utils/BloomFilterTest.java @@ -518,38 +518,39 @@ public void testApproxCount() { } /** - * Test of approxCount method, of class BloomFilter. + * Test of isCompatible method, of class BloomFilter. 
*/ @Test - public void testApproxCount() { - System.out.println("approxCount"); + public void testIsCompatible() { + System.out.println("isCompatible"); + int expResult = 100; - BloomFilter instance = new BloomFilter(0.01, expResult); - for (int i = 0; i < expResult; i++) { - instance.add(i); - } - int result = instance.approxCount(); - assertEquals(expResult, result, expResult/100); + BloomFilter instance1 = new BloomFilter(0.01, expResult); + BloomFilter instance2 = new BloomFilter(0.01, expResult); + + assertEquals(instance1.isCompatible(instance2), true); + + expResult = 10000; + + instance1 = new BloomFilter(0.0001, expResult); + instance2 = new BloomFilter(0.0001, expResult); + + assertEquals(instance1.isCompatible(instance2), true); expResult = 1000; - instance = new BloomFilter(0.001, expResult); - for (int i = 0; i < expResult; i++) { - instance.add(i); - } - result = instance.approxCount(); - assertEquals(expResult, result, expResult/100); + instance1 = new BloomFilter(0.0001, expResult); + instance2 = new BloomFilter(0.0003, expResult); + + assertEquals(instance1.isCompatible(instance2), false); expResult = 10000; - instance = new BloomFilter(0.0001, expResult); - for (int i = 0; i < expResult; i++) { - instance.add(i); - } - result = instance.approxCount(); - assertEquals(expResult, result, expResult/100); - } + instance1 = new BloomFilter(0.001, expResult); + instance2 = new BloomFilter(0.001, 10); + assertEquals(instance1.isCompatible(instance2), false); + } }