Commit 3fed134

DerWeh (Weh Andreas) and Weh Andreas authored

ENH: speed up _harmonize_tensor (#603)

* DOC: fix typos

  The common case is just a single element, so the overhead of NumPy is very large.

* ENH: speed up _harmonize_tensor

Co-authored-by: Weh Andreas <andreas.weh@uni-a.de>

1 parent 9b7dfd6 commit 3fed134
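The commit message's rationale lends itself to a quick illustration: when cell_map effectively holds a single element, np.prod pays for list construction, array conversion, and dispatch just to multiply one number, while the standard-library math.prod does not. A minimal micro-benchmark sketch, assuming a hypothetical single-element cell_map (the value and repetition count are illustrative, not taken from the repository):

from math import prod
from timeit import timeit

import numpy as np

cell_map = [[0]]  # common case per the commit message: effectively a single element

# Old approach: builds a temporary list and a NumPy array just to multiply one length.
t_np = timeit(lambda: np.prod([len(x) for x in cell_map]), number=100_000)

# New approach: pure-Python product over the lengths, no array allocation.
t_py = timeit(lambda: prod(map(len, cell_map)), number=100_000)

print(f"np.prod:   {t_np:.3f}s")
print(f"math.prod: {t_py:.3f}s")

For inputs this small the pure-Python call typically wins by a wide margin, which is the overhead the change removes; for large arrays np.prod would still be the faster choice.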

File tree

1 file changed: +17 -17 lines changed


python/interpret-core/interpret/glassbox/_ebm/_merge_ebms.py

+17 -17
@@ -4,7 +4,7 @@
 import logging
 import warnings
 from itertools import chain, count
-from math import isnan
+from math import isnan, prod

 import numpy as np

@@ -47,15 +47,15 @@ def _harmonize_tensor(
 # greater than the old model's lowest cut.
 # eg: new: | | | | |
 # old: | |
-# other1: | | proprotion |
+# other1: | | proportion |
 # other2: | proportion |
 # One wrinkle is that for pairs, we'll be using the pair cuts and we need to
-# one-dimensionalize any existing pair weights onto their respective 1D axies
-# before proportionating them. Annother issue is that we might not even have
+# one-dimensionalize any existing pair weights onto their respective 1D axis
+# before proportioning them. Another issue is that we might not even have
 # another term_feature that uses some particular feature that we use in our model
 # so we don't have any weights. We can solve that issue by dropping any feature's
 # bins for terms that we have no information for. After we do this we'll have
-# guaranteed that we only have new bin cuts for feature axies that we have inside
+# guaranteed that we only have new bin cuts for feature axes that we have inside
 # the bin level that we're handling!

 old_feature_idxs = list(old_feature_idxs)
@@ -241,7 +241,7 @@ def _harmonize_tensor(
 map_bins[bin_idx]
 for map_bins, bin_idx in zip(mapping, old_reversed_bin_idxs)
 ]
-n_cells2 = np.prod([len(x) for x in cell_map])
+n_cells2 = prod(map(len, cell_map))
 val = 0 if n_multiclasses == 1 else np.zeros(n_multiclasses, np.float64)
 total_weight = 0.0
 for cell2_idx in range(n_cells2):
@@ -416,7 +416,7 @@ def merge_ebms(models):

 # TODO: every time we merge models we fragment the bins more and more and this is undesirable
 # especially for pairs. When we build models, we store the feature bin cuts for pairs even
-# if we have no pairs that use that paritcular feature as a pair. We can eliminate these useless
+# if we have no pairs that use that particular feature as a pair. We can eliminate these useless
 # pair feature cuts before merging the bins and that'll give us less resulting cuts. Having less
 # cuts reduces the number of estimates that we need to make and reduces the complexity of the
 # tensors, so it's good to have this reduction.
@@ -470,7 +470,7 @@ def merge_ebms(models):
 # order and also handling merged categories (where two categories map to a single score)
 # We should first try to progress in order along each set of keys and see if we can
 # establish the perfect order which might work if there are isolated missing categories
-# and if we can't get a unique guaranteed sorted order that way then examime all the
+# and if we can't get a unique guaranteed sorted order that way then examine all the
 # different known sort order and figure out if any of the possible orderings match
 merged_bins = dict(zip(merged_keys, count(1)))
 else:
@@ -550,7 +550,7 @@ def merge_ebms(models):
 ):
 if hasattr(ebm, "feature_bounds_"):
 # TODO: estimate the histogram bin counts by taking the min of the mins and the max of the maxes
-# and re-apportioning the counts based on the distributions of the previous histograms. Proprotion
+# and re-apportioning the counts based on the distributions of the previous histograms. Proportion
 # them to the floor of their counts and then assign any remaining integers based on how much
 # they reduce the RMSE of the integer counts from the ideal floating point counts.
 pass
@@ -623,7 +623,7 @@ def merge_ebms(models):

 # TODO: in the future we might at this point try and figure out the most
 # common feature ordering within the terms. Take the mode first
-# and amonst the orderings that tie, choose the one that's best sorted by
+# and amongst the orderings that tie, choose the one that's best sorted by
 # feature indexes
 ebm.term_features_ = sorted_fgs

@@ -634,26 +634,26 @@ def merge_ebms(models):
 # interaction mismatches where an interaction will be in one model, but not the other.
 # We need to estimate the bin_weight_ tensors that would have existed in this case.
 # We'll use the interaction terms that we do have in other models to estimate the
-# distribution in the essense of the data, which should be roughly consistent or you
+# distribution in the essence of the data, which should be roughly consistent or you
 # shouldn't be attempting to merge the models in the first place. We'll then scale
-# the percentage distribution by the total weight of the model that we're fillin in the
+# the percentage distribution by the total weight of the model that we're filling in the
 # details for.

 # TODO: this algorithm has some problems. The estimated tensor that we get by taking the
 # model weight and distributing it by a per-cell percentage measure means that we get
-# inconsistent weight distibutions along the axis. We can take our resulting weight tensor
+# inconsistent weight distributions along the axis. We can take our resulting weight tensor
 # and sum the columns/rows to get the weights on each individual feature axis. Our model
 # however comes with a known set of weights on each feature, and the result of our operation
 # will not match the existing distribution in almost all cases. I think there might be
 # some algorithm where we start with the per-feature weights and use the distribution hints
 # from the other models to inform where we place our exact weights that we know about in our
-# model from each axis. The problem is that the sums in both axies need to agree, and each
+# model from each axis. The problem is that the sums in both axes need to agree, and each
 # change we make influences both. I'm not sure we can even guarantee that there is an answer
 # and if there was one I'm not sure how we'd go about generating it. I'm going to leave
 # this problem for YOU: a future person who is smarter than me and has more time to solve this.
 # One hint: I think a possible place to start would be an iterative algorithm that's similar
 # to purification where you randomly select a row/column and try to get closer at each step
-# to the rigth answer. Good luck!
+# to the right answer. Good luck!
 #
 # Oh, there's also another deeper problem.. let's say you had a crazy 5 way interaction in the
 # model eg: (0,1,2,3,4) and you had 2 and 3 way interactions that either overlap or not.
@@ -731,7 +731,7 @@ def merge_ebms(models):
 model.bagged_scores_[term_idx][bag_idx],
 model.bin_weights_[
 term_idx
-], # we use these to weigh distribution of scores for mulple bins
+], # we use these to weigh distribution of scores for multiple bins
 )
 new_bagged_scores.append(harmonized_bagged_scores)
 ebm.bin_weights_.append(np.sum(new_bin_weights, axis=0))
@@ -768,7 +768,7 @@ def merge_ebms(models):
 # TODO: we might be able to do these operations earlier
 remove_extra_bins(ebm.term_features_, ebm.bins_)

-# dependent attributes (can be re-derrived after serialization)
+# dependent attributes (can be re-derived after serialization)
 ebm.n_features_in_ = len(ebm.bins_) # scikit-learn specified name
 ebm.term_names_ = generate_term_names(ebm.feature_names_in_, ebm.term_features_)

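For context on the comments touched above: the doc comments in _harmonize_tensor describe collapsing ("one-dimensionalizing") a pair's bin-weight tensor onto each of its 1D feature axes, then using those per-bin weights as proportions when an old, coarser bin has to be split across new, finer cuts. A minimal sketch of that idea, with illustrative arrays and names rather than the actual code in _merge_ebms.py:

import numpy as np

# Hypothetical pair bin-weight tensor from another model:
# rows index bins of feature A, columns index bins of feature B.
pair_weights = np.array([[4.0, 1.0],
                         [2.0, 3.0]])

# "One-dimensionalize" the pair weights onto each 1D feature axis
# by summing out the other axis.
weights_a = pair_weights.sum(axis=1)  # per-bin weight along feature A -> [5., 5.]
weights_b = pair_weights.sum(axis=0)  # per-bin weight along feature B -> [6., 4.]

# Use the 1D weights as proportions when one coarse bin is split into two finer bins.
proportions_b = weights_b / weights_b.sum()   # -> [0.6, 0.4]
old_bin_weight = 20.0                         # illustrative weight of the coarse bin being split
split_weights = old_bin_weight * proportions_b
print(split_weights)  # [12.  8.]

As the TODO comments note, summing an estimated tensor back along its rows and columns generally will not reproduce the per-feature weights the model already knows, which is the open consistency problem described in the diff.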