import logging
import warnings
from itertools import chain, count
- from math import isnan
+ from math import isnan, prod

import numpy as np
@@ -47,15 +47,15 @@ def _harmonize_tensor(
# greater than the old model's lowest cut.
# eg: new: | | | | |
# old: | |
- # other1: | | proprotion |
+ # other1: | | proportion |
# other2: | proportion |
# One wrinkle is that for pairs, we'll be using the pair cuts and we need to
- # one-dimensionalize any existing pair weights onto their respective 1D axies
- # before proportionating them. Annother issue is that we might not even have
+ # one-dimensionalize any existing pair weights onto their respective 1D axis
+ # before proportioning them. Another issue is that we might not even have
# another term_feature that uses some particular feature that we use in our model
# so we don't have any weights. We can solve that issue by dropping any feature's
# bins for terms that we have no information for. After we do this we'll have
- # guaranteed that we only have new bin cuts for feature axies that we have inside
+ # guaranteed that we only have new bin cuts for feature axes that we have inside
# the bin level that we're handling!

old_feature_idxs = list(old_feature_idxs)
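As background for the comments above, collapsing a pair's 2D weight tensor onto its two 1D feature axes and then proportioning an old bin's weight across new cuts can be pictured with a small sketch like this (illustrative only; the array shapes, names, and split fractions are assumptions, not the code in this function):

    import numpy as np

    # hypothetical pair weights: rows = bins of feature A, columns = bins of feature B
    pair_weights = np.array([[4.0, 1.0],
                             [2.0, 3.0]])

    # "one-dimensionalize" the pair weights onto each feature's own axis
    weights_a = pair_weights.sum(axis=1)   # -> [5.0, 5.0]
    weights_b = pair_weights.sum(axis=0)   # -> [6.0, 4.0]

    # proportion one old bin's weight across two new bins that split it; the split
    # fractions would come from where the other models' cuts fall inside the old bin
    split_fractions = np.array([0.25, 0.75])
    new_bins_for_a0 = weights_a[0] * split_fractions   # -> [1.25, 3.75]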
@@ -241,7 +241,7 @@ def _harmonize_tensor(
map_bins[bin_idx]
for map_bins, bin_idx in zip(mapping, old_reversed_bin_idxs)
]
- n_cells2 = np.prod([len(x) for x in cell_map])
+ n_cells2 = prod(map(len, cell_map))
val = 0 if n_multiclasses == 1 else np.zeros(n_multiclasses, np.float64)
total_weight = 0.0
for cell2_idx in range(n_cells2):
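A quick note on the np.prod -> math.prod swap above: both compute the number of cells, but math.prod avoids building a temporary list and returns a plain Python int. A minimal check (cell_map here is a made-up stand-in):

    from math import prod
    import numpy as np

    cell_map = [[0, 1, 2], [3, 4]]   # e.g. per-dimension lists of mapped old bins

    n_old = np.prod([len(x) for x in cell_map])   # NumPy integer scalar
    n_new = prod(map(len, cell_map))              # plain Python int

    assert n_old == n_new == 6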
@@ -416,7 +416,7 @@ def merge_ebms(models):
# TODO: every time we merge models we fragment the bins more and more and this is undesirable
# especially for pairs. When we build models, we store the feature bin cuts for pairs even
- # if we have no pairs that use that paritcular feature as a pair. We can eliminate these useless
+ # if we have no pairs that use that particular feature as a pair. We can eliminate these useless
# pair feature cuts before merging the bins and that'll give us less resulting cuts. Having less
# cuts reduces the number of estimates that we need to make and reduces the complexity of the
# tensors, so it's good to have this reduction.
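A rough sketch of the cleanup this TODO describes, using toy stand-ins for the model's term list and per-feature bin levels (the names and structures here are simplified assumptions, not the module's real attributes):

    # toy stand-ins: term list and per-feature bin levels (main-effect cuts first, pair cuts after)
    term_features = [(0,), (1,), (0, 1)]             # feature 2 never appears in a pair
    bins = [
        [[1.5, 3.0], [2.0]],     # feature 0
        [[0.5], [0.5]],          # feature 1
        [[7.0], [6.0, 8.0]],     # feature 2: its pair-level cuts are never needed
    ]

    used_in_pairs = {f for term in term_features if len(term) >= 2 for f in term}
    for feature_idx, bin_levels in enumerate(bins):
        if feature_idx not in used_in_pairs:
            del bin_levels[1:]   # keep only the main-effect binning for unused features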
@@ -470,7 +470,7 @@ def merge_ebms(models):
# order and also handling merged categories (where two categories map to a single score)
# We should first try to progress in order along each set of keys and see if we can
# establish the perfect order which might work if there are isolated missing categories
- # and if we can't get a unique guaranteed sorted order that way then examime all the
+ # and if we can't get a unique guaranteed sorted order that way then examine all the
# different known sort order and figure out if any of the possible orderings match
merged_bins = dict(zip(merged_keys, count(1)))
else:
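For reference, the merged_bins line above simply assigns consecutive bin indices, starting at 1, to the merged category keys once an order has been settled; a tiny standalone illustration with invented keys:

    from itertools import count

    merged_keys = ["blue", "green", "red"]          # hypothetical merged category keys
    merged_bins = dict(zip(merged_keys, count(1)))  # {'blue': 1, 'green': 2, 'red': 3}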
@@ -550,7 +550,7 @@ def merge_ebms(models):
):
if hasattr(ebm, "feature_bounds_"):
# TODO: estimate the histogram bin counts by taking the min of the mins and the max of the maxes
- # and re-apportioning the counts based on the distributions of the previous histograms. Proprotion
+ # and re-apportioning the counts based on the distributions of the previous histograms. Proportion
# them to the floor of their counts and then assign any remaining integers based on how much
# they reduce the RMSE of the integer counts from the ideal floating point counts.
pass
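The TODO above amounts to a largest-remainder style apportionment: floor the ideal floating point counts, then hand the leftover units to the bins where they reduce the squared error most, i.e. the bins with the largest fractional parts. One possible sketch of that idea, not code from this library:

    import numpy as np

    def apportion_counts(ideal, total):
        """Round ideal (float) histogram counts to integers summing to `total`,
        giving the leftover units to the bins with the largest fractional parts."""
        ideal = np.asarray(ideal, dtype=np.float64)
        floors = np.floor(ideal).astype(np.int64)
        remaining = int(total) - int(floors.sum())   # assumes 0 <= remaining <= len(ideal)
        order = np.argsort(ideal - floors)[::-1]     # largest fractional parts first
        floors[order[:remaining]] += 1
        return floors

    print(apportion_counts([2.6, 3.7, 1.7], 8))      # -> [2 4 2]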
@@ -623,7 +623,7 @@ def merge_ebms(models):

# TODO: in the future we might at this point try and figure out the most
# common feature ordering within the terms. Take the mode first
- # and amonst the orderings that tie, choose the one that's best sorted by
+ # and amongst the orderings that tie, choose the one that's best sorted by
# feature indexes
ebm.term_features_ = sorted_fgs
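One way the tie-break sketched in this TODO could look: take the most common per-term feature ordering and, among ties, prefer the ordering whose feature indexes are already sorted (purely a speculative illustration):

    from collections import Counter

    orderings = [(1, 0), (0, 1), (1, 0), (0, 1)]   # hypothetical per-term feature orderings
    counts = Counter(orderings)
    # mode first; break ties by preferring the lexicographically smallest (best sorted) tuple
    best = min(counts, key=lambda order: (-counts[order], order))
    print(best)   # -> (0, 1)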
@@ -634,26 +634,26 @@ def merge_ebms(models):
# interaction mismatches where an interaction will be in one model, but not the other.
# We need to estimate the bin_weight_ tensors that would have existed in this case.
# We'll use the interaction terms that we do have in other models to estimate the
- # distribution in the essense of the data, which should be roughly consistent or you
+ # distribution in the essence of the data, which should be roughly consistent or you
# shouldn't be attempting to merge the models in the first place. We'll then scale
- # the percentage distribution by the total weight of the model that we're fillin in the
+ # the percentage distribution by the total weight of the model that we're filling in the
# details for.

# TODO: this algorithm has some problems. The estimated tensor that we get by taking the
# model weight and distributing it by a per-cell percentage measure means that we get
- # inconsistent weight distibutions along the axis. We can take our resulting weight tensor
+ # inconsistent weight distributions along the axis. We can take our resulting weight tensor
# and sum the columns/rows to get the weights on each individual feature axis. Our model
# however comes with a known set of weights on each feature, and the result of our operation
# will not match the existing distribution in almost all cases. I think there might be
# some algorithm where we start with the per-feature weights and use the distribution hints
# from the other models to inform where we place our exact weights that we know about in our
- # model from each axis. The problem is that the sums in both axies need to agree, and each
+ # model from each axis. The problem is that the sums in both axes need to agree, and each
# change we make influences both. I'm not sure we can even guarantee that there is an answer
# and if there was one I'm not sure how we'd go about generating it. I'm going to leave
# this problem for YOU: a future person who is smarter than me and has more time to solve this.
# One hint: I think a possible place to start would be an iterative algorithm that's similar
# to purification where you randomly select a row/column and try to get closer at each step
- # to the rigth answer. Good luck!
+ # to the right answer. Good luck!
#
# Oh, there's also another deeper problem.. let's say you had a crazy 5 way interaction in the
# model eg: (0,1,2,3,4) and you had 2 and 3 way interactions that either overlap or not.
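The hint at the end of this TODO points toward something like iterative proportional fitting: start from the per-cell distribution hints, then alternately rescale rows and columns so the tensor's marginals move toward the per-feature weights the model actually knows. A rough sketch of that idea (illustrative only; names, data, and convergence handling are all assumptions):

    import numpy as np

    def fit_to_marginals(seed, row_totals, col_totals, iters=100):
        """IPF-style: rescale rows then columns of `seed` so its row/column sums
        approach the known per-feature weight totals (which must sum consistently)."""
        w = np.asarray(seed, dtype=np.float64).copy()
        row_totals = np.asarray(row_totals, dtype=np.float64)
        col_totals = np.asarray(col_totals, dtype=np.float64)
        for _ in range(iters):
            w *= (row_totals / w.sum(axis=1))[:, np.newaxis]
            w *= (col_totals / w.sum(axis=0))[np.newaxis, :]
        return w

    # hypothetical: a percentage-distribution hint from other models, scaled to this
    # model's total weight, then nudged to match its known per-feature weights
    seed = np.array([[0.4, 0.1], [0.2, 0.3]]) * 10.0
    estimated = fit_to_marginals(seed, row_totals=[6.0, 4.0], col_totals=[7.0, 3.0])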
@@ -731,7 +731,7 @@ def merge_ebms(models):
model.bagged_scores_[term_idx][bag_idx],
model.bin_weights_[
term_idx
- ],  # we use these to weigh distribution of scores for mulple bins
+ ],  # we use these to weigh distribution of scores for multiple bins
)
new_bagged_scores.append(harmonized_bagged_scores)
ebm.bin_weights_.append(np.sum(new_bin_weights, axis=0))
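As context for the comment being fixed above: when several old bins land in a single new bin, their scores get blended using the bin weights, which boils down to a weighted average. A toy illustration, not the harmonization code itself:

    import numpy as np

    scores = np.array([0.50, 0.20, -0.10])   # scores of three old bins mapping to one new bin
    weights = np.array([10.0, 30.0, 60.0])   # bin_weights_ entries for those same bins

    merged_score = np.average(scores, weights=weights)   # -> 0.05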
@@ -768,7 +768,7 @@ def merge_ebms(models):
# TODO: we might be able to do these operations earlier
remove_extra_bins(ebm.term_features_, ebm.bins_)

- # dependent attributes (can be re-derrived after serialization)
+ # dependent attributes (can be re-derived after serialization)
ebm.n_features_in_ = len(ebm.bins_)  # scikit-learn specified name
ebm.term_names_ = generate_term_names(ebm.feature_names_in_, ebm.term_features_)