Commit 3fed134

DerWeh (Weh Andreas) and Weh Andreas authored

ENH: speed up _harmonize_tensor (#603)

* DOC: fix typos

  The common case is just a single element, so the overhead of NumPy is very large.

* ENH: speed up _harmonize_tensor

Co-authored-by: Weh Andreas <andreas.weh@uni-a.de>

1 parent 9b7dfd6 commit 3fed134
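The commit message's rationale lends itself to a quick illustration: when cell_map effectively holds a single element, np.prod pays for list construction, array conversion, and dispatch just to multiply one number, while the standard-library math.prod does not. A minimal micro-benchmark sketch, assuming a hypothetical single-element cell_map (the value and repetition count are illustrative, not taken from the repository):

from math import prod
from timeit import timeit

import numpy as np

cell_map = [[0]]  # common case per the commit message: effectively a single element

# Old approach: builds a temporary list and a NumPy array just to multiply one length.
t_np = timeit(lambda: np.prod([len(x) for x in cell_map]), number=100_000)

# New approach: pure-Python product over the lengths, no array allocation.
t_py = timeit(lambda: prod(map(len, cell_map)), number=100_000)

print(f"np.prod:   {t_np:.3f}s")
print(f"math.prod: {t_py:.3f}s")

For inputs this small the pure-Python call typically wins by a wide margin, which is the overhead the change removes; for large arrays np.prod would still be the faster choice.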

File tree

1 file changed: +17 -17 lines changed


python/interpret-core/interpret/glassbox/_ebm/_merge_ebms.py

+17 -17
@@ -4,7 +4,7 @@
 import logging
 import warnings
 from itertools import chain, count
-from math import isnan
+from math import isnan, prod

 import numpy as np

@@ -47,15 +47,15 @@ def _harmonize_tensor(
 # greater than the old model's lowest cut.
 # eg: new: | | | | |
 # old: | |
-# other1: | | proprotion |
+# other1: | | proportion |
 # other2: | proportion |
 # One wrinkle is that for pairs, we'll be using the pair cuts and we need to
-# one-dimensionalize any existing pair weights onto their respective 1D axies
-# before proportionating them. Annother issue is that we might not even have
+# one-dimensionalize any existing pair weights onto their respective 1D axis
+# before proportioning them. Another issue is that we might not even have
 # another term_feature that uses some particular feature that we use in our model
 # so we don't have any weights. We can solve that issue by dropping any feature's
 # bins for terms that we have no information for. After we do this we'll have
-# guaranteed that we only have new bin cuts for feature axies that we have inside
+# guaranteed that we only have new bin cuts for feature axes that we have inside
 # the bin level that we're handling!

 old_feature_idxs = list(old_feature_idxs)
@@ -241,7 +241,7 @@ def _harmonize_tensor(
 map_bins[bin_idx]
 for map_bins, bin_idx in zip(mapping, old_reversed_bin_idxs)
 ]
-n_cells2 = np.prod([len(x) for x in cell_map])
+n_cells2 = prod(map(len, cell_map))
 val = 0 if n_multiclasses == 1 else np.zeros(n_multiclasses, np.float64)
 total_weight = 0.0
 for cell2_idx in range(n_cells2):
@@ -416,7 +416,7 @@ def merge_ebms(models):

 # TODO: every time we merge models we fragment the bins more and more and this is undesirable
 # especially for pairs. When we build models, we store the feature bin cuts for pairs even
-# if we have no pairs that use that paritcular feature as a pair. We can eliminate these useless
+# if we have no pairs that use that particular feature as a pair. We can eliminate these useless
 # pair feature cuts before merging the bins and that'll give us less resulting cuts. Having less
 # cuts reduces the number of estimates that we need to make and reduces the complexity of the
 # tensors, so it's good to have this reduction.
@@ -470,7 +470,7 @@ def merge_ebms(models):
 # order and also handling merged categories (where two categories map to a single score)
 # We should first try to progress in order along each set of keys and see if we can
 # establish the perfect order which might work if there are isolated missing categories
-# and if we can't get a unique guaranteed sorted order that way then examime all the
+# and if we can't get a unique guaranteed sorted order that way then examine all the
 # different known sort order and figure out if any of the possible orderings match
 merged_bins = dict(zip(merged_keys, count(1)))
 else:
@@ -550,7 +550,7 @@ def merge_ebms(models):
 ):
 if hasattr(ebm, "feature_bounds_"):
 # TODO: estimate the histogram bin counts by taking the min of the mins and the max of the maxes
-# and re-apportioning the counts based on the distributions of the previous histograms. Proprotion
+# and re-apportioning the counts based on the distributions of the previous histograms. Proportion
 # them to the floor of their counts and then assign any remaining integers based on how much
 # they reduce the RMSE of the integer counts from the ideal floating point counts.
 pass
@@ -623,7 +623,7 @@ def merge_ebms(models):

 # TODO: in the future we might at this point try and figure out the most
 # common feature ordering within the terms. Take the mode first
-# and amonst the orderings that tie, choose the one that's best sorted by
+# and amongst the orderings that tie, choose the one that's best sorted by
 # feature indexes
 ebm.term_features_ = sorted_fgs

@@ -634,26 +634,26 @@ def merge_ebms(models):
 # interaction mismatches where an interaction will be in one model, but not the other.
 # We need to estimate the bin_weight_ tensors that would have existed in this case.
 # We'll use the interaction terms that we do have in other models to estimate the
-# distribution in the essense of the data, which should be roughly consistent or you
+# distribution in the essence of the data, which should be roughly consistent or you
 # shouldn't be attempting to merge the models in the first place. We'll then scale
-# the percentage distribution by the total weight of the model that we're fillin in the
+# the percentage distribution by the total weight of the model that we're filling in the
 # details for.

 # TODO: this algorithm has some problems. The estimated tensor that we get by taking the
 # model weight and distributing it by a per-cell percentage measure means that we get
-# inconsistent weight distibutions along the axis. We can take our resulting weight tensor
+# inconsistent weight distributions along the axis. We can take our resulting weight tensor
 # and sum the columns/rows to get the weights on each individual feature axis. Our model
 # however comes with a known set of weights on each feature, and the result of our operation
 # will not match the existing distribution in almost all cases. I think there might be
 # some algorithm where we start with the per-feature weights and use the distribution hints
 # from the other models to inform where we place our exact weights that we know about in our
-# model from each axis. The problem is that the sums in both axies need to agree, and each
+# model from each axis. The problem is that the sums in both axes need to agree, and each
 # change we make influences both. I'm not sure we can even guarantee that there is an answer
 # and if there was one I'm not sure how we'd go about generating it. I'm going to leave
 # this problem for YOU: a future person who is smarter than me and has more time to solve this.
 # One hint: I think a possible place to start would be an iterative algorithm that's similar
 # to purification where you randomly select a row/column and try to get closer at each step
-# to the rigth answer. Good luck!
+# to the right answer. Good luck!
 #
 # Oh, there's also another deeper problem.. let's say you had a crazy 5 way interaction in the
 # model eg: (0,1,2,3,4) and you had 2 and 3 way interactions that either overlap or not.
@@ -731,7 +731,7 @@ def merge_ebms(models):
 model.bagged_scores_[term_idx][bag_idx],
 model.bin_weights_[
 term_idx
-], # we use these to weigh distribution of scores for mulple bins
+], # we use these to weigh distribution of scores for multiple bins
 )
 new_bagged_scores.append(harmonized_bagged_scores)
 ebm.bin_weights_.append(np.sum(new_bin_weights, axis=0))
@@ -768,7 +768,7 @@ def merge_ebms(models):
 # TODO: we might be able to do these operations earlier
 remove_extra_bins(ebm.term_features_, ebm.bins_)

-# dependent attributes (can be re-derrived after serialization)
+# dependent attributes (can be re-derived after serialization)
 ebm.n_features_in_ = len(ebm.bins_) # scikit-learn specified name
 ebm.term_names_ = generate_term_names(ebm.feature_names_in_, ebm.term_features_)

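For context on the comments touched above: the doc comments in _harmonize_tensor describe collapsing ("one-dimensionalizing") a pair's bin-weight tensor onto each of its 1D feature axes, then using those per-bin weights as proportions when an old, coarser bin has to be split across new, finer cuts. A minimal sketch of that idea, with illustrative arrays and names rather than the actual code in _merge_ebms.py:

import numpy as np

# Hypothetical pair bin-weight tensor from another model:
# rows index bins of feature A, columns index bins of feature B.
pair_weights = np.array([[4.0, 1.0],
                         [2.0, 3.0]])

# "One-dimensionalize" the pair weights onto each 1D feature axis
# by summing out the other axis.
weights_a = pair_weights.sum(axis=1)  # per-bin weight along feature A -> [5., 5.]
weights_b = pair_weights.sum(axis=0)  # per-bin weight along feature B -> [6., 4.]

# Use the 1D weights as proportions when one coarse bin is split into two finer bins.
proportions_b = weights_b / weights_b.sum()   # -> [0.6, 0.4]
old_bin_weight = 20.0                         # illustrative weight of the coarse bin being split
split_weights = old_bin_weight * proportions_b
print(split_weights)  # [12.  8.]

As the TODO comments note, summing an estimated tensor back along its rows and columns generally will not reproduce the per-feature weights the model already knows, which is the open consistency problem described in the diff.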