Skip to content

Commit 78673b6

Browse files
committed
Change deduplication to avoid consolidating identical objects and instead remove higher-order bins that are duplicates of lower-order bins
1 parent fc6f7ab commit 78673b6

File tree

5 files changed: +49 −49 lines changed

python/interpret-core/interpret/glassbox/_ebm/_bin.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
def eval_terms(X, n_samples, feature_names_in, feature_types_in, bins, term_features):
3434
# called under: predict
3535

36-
# prior to calling this function, call deduplicate_bins which will eliminate extra work in this function
36+
# prior to calling this function, call remove_extra_bins which will eliminate extra work in this function
3737

3838
# this generator function returns data in whatever order it thinks is most efficient. Normally for
3939
# mains it returns them in order, but pairs will be returned as their data completes and they can

python/interpret-core/interpret/glassbox/_ebm/_ebm.py

+3-6
Original file line number | Diff line number | Diff line change
@@ -63,13 +63,12 @@
6363
from ._json import UNTESTED_from_jsonable, to_jsonable
6464
from ._tensor import remove_last, trim_tensor
6565
from ._utils import (
66-
deduplicate_bins,
6766
generate_term_names,
6867
generate_term_types,
6968
make_bag,
7069
order_terms,
7170
process_terms,
72-
remove_unused_higher_bins,
71+
remove_extra_bins,
7372
)
7473

7574
_log = logging.getLogger(__name__)
@@ -1429,8 +1428,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
14291428

14301429
best_iteration = np.array(best_iteration, np.int64)
14311430

1432-
remove_unused_higher_bins(term_features, bins)
1433-
deduplicate_bins(bins)
1431+
remove_extra_bins(term_features, bins)
14341432

14351433
bagged_scores = (
14361434
np.array([model[idx] for model in models], np.float64)
@@ -2445,8 +2443,7 @@ def sweep(self, terms=True, bins=True, features=False):
24452443
raise ValueError(msg)
24462444

24472445
if bins is True:
2448-
remove_unused_higher_bins(self.term_features_, self.bins_)
2449-
deduplicate_bins(self.bins_)
2446+
remove_extra_bins(self.term_features_, self.bins_)
24502447
elif bins is not False:
24512448
msg = "bins must be True or False"
24522449
_log.error(msg)

python/interpret-core/interpret/glassbox/_ebm/_merge_ebms.py

+2-6
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,10 @@
1111
from ...utils._native import Native
1212
from ._utils import (
1313
convert_categorical_to_continuous,
14-
deduplicate_bins,
1514
generate_term_names,
1615
order_terms,
1716
process_terms,
18-
remove_unused_higher_bins,
17+
remove_extra_bins,
1918
)
2019

2120
_log = logging.getLogger(__name__)
@@ -512,7 +511,6 @@ def merge_ebms(models):
512511
new_leveled_bins.append(merged_bins)
513512
new_bins.append(new_leveled_bins)
514513
ebm.feature_types_in_ = new_feature_types
515-
deduplicate_bins(new_bins)
516514
ebm.bins_ = new_bins
517515

518516
feature_names_merged = [None] * n_features
@@ -768,9 +766,7 @@ def merge_ebms(models):
768766
]
769767

770768
# TODO: we might be able to do these operations earlier
771-
remove_unused_higher_bins(ebm.term_features_, ebm.bins_)
772-
# removing the higher order terms might allow us to eliminate some extra bins now that couldn't before
773-
deduplicate_bins(ebm.bins_)
769+
remove_extra_bins(ebm.term_features_, ebm.bins_)
774770

775771
# dependent attributes (can be re-derrived after serialization)
776772
ebm.n_features_in_ = len(ebm.bins_) # scikit-learn specified name

python/interpret-core/interpret/glassbox/_ebm/_utils.py

+24-24
Original file line number | Diff line number | Diff line change
@@ -256,7 +256,7 @@ def order_terms(term_features, *args):
256256
return ret if len(ret) >= 2 else ret[0]
257257

258258

259-
def remove_unused_higher_bins(term_features, bins):
259+
def remove_extra_bins(term_features, bins):
260260
# many features are not used in pairs, so we can simplify the model
261261
# by removing the extra higher interaction level bins
262262

@@ -267,33 +267,33 @@ def remove_unused_higher_bins(term_features, bins):
267267
highest_levels[feature_idx], len(feature_idxs)
268268
)
269269

270-
for bin_levels, max_level in zip(bins, highest_levels):
271-
del bin_levels[max_level:]
270+
for bin_levels, i in zip(bins, highest_levels):
271+
if i != 0:
272+
if len(bin_levels) == 0:
273+
raise Exception("Empty bin cannot be used in a term.")
272274

275+
i = min(i, len(bin_levels)) - 1
276+
types = set(map(type, bin_levels))
273277

274-
def deduplicate_bins(bins):
275-
# calling this function before calling score_terms allows score_terms to operate more efficiently since it'll
276-
# be able to avoid re-binning data for pairs that have already been processed in mains or other pairs since we
277-
# use the id of the bins to identify feature data that was previously binned
278+
if len(types) != 1:
279+
raise Exception("Inconsistent bin types.")
278280

279-
uniques = {}
280-
for bin_levels in bins:
281-
highest_key = None
282-
highest_idx = -1
283-
for level_idx, feature_bins in enumerate(bin_levels):
284-
if isinstance(feature_bins, dict):
285-
key = frozenset(feature_bins.items())
281+
if next(iter(types)) == dict:
282+
key = frozenset(bin_levels[i].items())
283+
i -= 1
284+
while 0 <= i:
285+
if key != frozenset(bin_levels[i].items()):
286+
break
287+
i -= 1
286288
else:
287-
key = tuple(feature_bins)
288-
if key in uniques:
289-
bin_levels[level_idx] = uniques[key]
290-
else:
291-
uniques[key] = feature_bins
292-
293-
if highest_key != key:
294-
highest_key = key
295-
highest_idx = level_idx
296-
del bin_levels[highest_idx + 1 :]
289+
key = tuple(bin_levels[i])
290+
i -= 1
291+
while 0 <= i:
292+
if key != tuple(bin_levels[i]):
293+
break
294+
i -= 1
295+
i += 2
296+
del bin_levels[i:]
297297

298298

299299
def convert_to_intervals(cuts): # pragma: no cover

python/interpret-core/tests/glassbox/ebm/test_ebm_utils.py

+19-12
Original file line number | Diff line number | Diff line change
@@ -5,32 +5,39 @@
55
convert_categorical_to_continuous,
66
convert_to_cuts,
77
convert_to_intervals,
8-
deduplicate_bins,
8+
remove_extra_bins,
99
make_bag,
1010
)
1111

1212

13-
def test_deduplicate_bins():
13+
def test_remove_extra_bins():
1414
bins = [
1515
[{"a": 1, "b": 2}, {"a": 2, "b": 1}, {"b": 2, "a": 1}, {"b": 2, "a": 1}],
1616
[
17-
np.array([1, 2, 3], dtype=np.float64),
1817
np.array([1, 3, 2], dtype=np.float64),
1918
np.array([1, 2, 3], dtype=np.float64),
19+
np.array([1, 2, 3], dtype=np.float64),
20+
],
21+
[
22+
np.array([9, 8, 7], dtype=np.float64),
2023
],
24+
[{"m": 1, "q": 2}],
25+
[{"r": 7, "t": 8}, {"r": 7, "t": 8}],
26+
[{"one": 1, "two": 2}],
27+
[{"never_used": 1, "never_ever": 2}],
28+
[],
2129
]
2230

23-
deduplicate_bins(bins)
31+
remove_extra_bins([(0, 1, 2, 3, 4), (5,)], bins)
2432

2533
assert len(bins[0]) == 3
26-
assert id(bins[0][0]) != id(bins[0][1])
27-
assert id(bins[0][0]) == id(bins[0][2])
28-
assert id(bins[0][1]) != id(bins[0][2])
29-
30-
assert len(bins[1]) == 3
31-
assert id(bins[1][0]) != id(bins[1][1])
32-
assert id(bins[1][0]) == id(bins[1][2])
33-
assert id(bins[1][1]) != id(bins[1][2])
34+
assert len(bins[1]) == 2
35+
assert len(bins[2]) == 1
36+
assert len(bins[3]) == 1
37+
assert len(bins[4]) == 1
38+
assert len(bins[5]) == 1
39+
assert len(bins[6]) == 0
40+
assert len(bins[7]) == 0
3441

3542

3643
def test_conversion_cut_intervals():

0 commit comments

Comments (0)