Skip to content

Commit 5d77def

Browse files
committed
Release
1 parent bb7bf57 commit 5d77def

File tree

8 files changed

+83
-137
lines changed

8 files changed

+83
-137
lines changed

imgs/linear_boost_importances.png

5.51 KB
Loading

lineartree/_classes.py

Lines changed: 36 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from joblib import Parallel, effective_n_jobs # , delayed
77

88
from sklearn.dummy import DummyClassifier
9-
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
9+
from sklearn.tree import DecisionTreeRegressor
1010
from sklearn.ensemble import RandomForestRegressor
1111

1212
from sklearn.base import is_regressor
@@ -19,6 +19,9 @@
1919
from ._criterion import mse, rmse, mae, poisson
2020
from ._criterion import hamming, crossentropy
2121

22+
import sklearn
23+
_sklearn_v1 = int(sklearn.__version__.split('.')[0]) > 0
24+
2225

2326
CRITERIA = {"mse": mse,
2427
"rmse": rmse,
@@ -853,8 +856,7 @@ def __init__(self, base_estimator, *, loss, n_estimators,
853856
max_depth, min_samples_split, min_samples_leaf,
854857
min_weight_fraction_leaf, max_features,
855858
random_state, max_leaf_nodes,
856-
min_impurity_decrease, min_impurity_split,
857-
ccp_alpha):
859+
min_impurity_decrease, ccp_alpha):
858860

859861
self.base_estimator = base_estimator
860862
self.loss = loss
@@ -867,7 +869,6 @@ def __init__(self, base_estimator, *, loss, n_estimators,
867869
self.random_state = random_state
868870
self.max_leaf_nodes = max_leaf_nodes
869871
self.min_impurity_decrease = min_impurity_decrease
870-
self.min_impurity_split = min_impurity_split
871872
self.ccp_alpha = ccp_alpha
872873

873874
def _fit(self, X, y, sample_weight=None):
@@ -918,47 +919,33 @@ def _fit(self, X, y, sample_weight=None):
918919
else:
919920
resid = SCORING[self.loss](y, pred)
920921

921-
if self.loss == 'hamming':
922-
tree = DecisionTreeClassifier(
923-
criterion='gini', max_depth=self.max_depth,
924-
min_samples_split=self.min_samples_split,
925-
min_samples_leaf=self.min_samples_leaf,
926-
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
927-
max_features=self.max_features,
928-
random_state=self.random_state,
929-
max_leaf_nodes=self.max_leaf_nodes,
930-
min_impurity_decrease=self.min_impurity_decrease,
931-
min_impurity_split=self.min_impurity_split,
932-
ccp_alpha=self.ccp_alpha
933-
)
934-
else:
935-
tree = DecisionTreeRegressor(
936-
criterion='mse', max_depth=self.max_depth,
937-
min_samples_split=self.min_samples_split,
938-
min_samples_leaf=self.min_samples_leaf,
939-
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
940-
max_features=self.max_features,
941-
random_state=self.random_state,
942-
max_leaf_nodes=self.max_leaf_nodes,
943-
min_impurity_decrease=self.min_impurity_decrease,
944-
min_impurity_split=self.min_impurity_split,
945-
ccp_alpha=self.ccp_alpha
946-
)
922+
if resid.ndim > 1:
923+
resid = resid.mean(1)
924+
925+
criterion = 'squared_error' if _sklearn_v1 else 'mse'
926+
927+
tree = DecisionTreeRegressor(
928+
criterion=criterion, max_depth=self.max_depth,
929+
min_samples_split=self.min_samples_split,
930+
min_samples_leaf=self.min_samples_leaf,
931+
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
932+
max_features=self.max_features,
933+
random_state=self.random_state,
934+
max_leaf_nodes=self.max_leaf_nodes,
935+
min_impurity_decrease=self.min_impurity_decrease,
936+
ccp_alpha=self.ccp_alpha
937+
)
947938

948939
tree.fit(X, resid, sample_weight=sample_weight, check_input=False)
949940
self._trees.append(tree)
950941

951-
impurity = tree.tree_.impurity
952-
pred_leaves = tree.apply(X, check_input=False)
953-
leaves = np.unique(pred_leaves)
954-
955-
worst_leaf = np.argmax([impurity[l] for l in leaves])
956-
worst_leaf = leaves[worst_leaf]
957-
self._leaves.append(worst_leaf)
942+
pred_tree = np.abs(tree.predict(X, check_input=False))
943+
worst_pred = np.max(pred_tree)
944+
self._leaves.append(worst_pred)
958945

959-
pred_leaves = (pred_leaves == worst_leaf).astype(np.float32)
960-
pred_leaves = pred_leaves.reshape(-1, 1)
961-
X = np.concatenate([X, pred_leaves], axis=1)
946+
pred_tree = (pred_tree == worst_pred).astype(np.float32)
947+
pred_tree = pred_tree.reshape(-1, 1)
948+
X = np.concatenate([X, pred_tree], axis=1)
962949

963950
self.base_estimator_ = deepcopy(self.base_estimator)
964951
self.base_estimator_.fit(X, y, sample_weight=sample_weight)
@@ -993,10 +980,10 @@ def transform(self, X):
993980
self._check_n_features(X, reset=False)
994981

995982
for tree, leaf in zip(self._trees, self._leaves):
996-
pred_leaves = tree.apply(X, check_input=False)
997-
pred_leaves = (pred_leaves == leaf).astype(np.float32)
998-
pred_leaves = pred_leaves.reshape(-1, 1)
999-
X = np.concatenate([X, pred_leaves], axis=1)
983+
pred_tree = np.abs(tree.predict(X, check_input=False))
984+
pred_tree = (pred_tree == leaf).astype(np.float32)
985+
pred_tree = pred_tree.reshape(-1, 1)
986+
X = np.concatenate([X, pred_tree], axis=1)
1000987

1001988
return X
1002989

@@ -1010,8 +997,8 @@ class _LinearForest(BaseEstimator):
1010997
def __init__(self, base_estimator, *, n_estimators, max_depth,
1011998
min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
1012999
max_features, max_leaf_nodes, min_impurity_decrease,
1013-
min_impurity_split, bootstrap, oob_score, n_jobs,
1014-
random_state, ccp_alpha, max_samples):
1000+
bootstrap, oob_score, n_jobs, random_state,
1001+
ccp_alpha, max_samples):
10151002

10161003
self.base_estimator = base_estimator
10171004
self.n_estimators = n_estimators
@@ -1022,7 +1009,6 @@ def __init__(self, base_estimator, *, n_estimators, max_depth,
10221009
self.max_features = max_features
10231010
self.max_leaf_nodes = max_leaf_nodes
10241011
self.min_impurity_decrease = min_impurity_decrease
1025-
self.min_impurity_split = min_impurity_split
10261012
self.bootstrap = bootstrap
10271013
self.oob_score = oob_score
10281014
self.n_jobs = n_jobs
@@ -1100,17 +1086,18 @@ def _fit(self, X, y, sample_weight=None):
11001086
self.base_estimator_.fit(X, y, sample_weight)
11011087
resid = y - self.base_estimator_.predict(X)
11021088

1089+
criterion = 'squared_error' if _sklearn_v1 else 'mse'
1090+
11031091
self.forest_estimator_ = RandomForestRegressor(
11041092
n_estimators=self.n_estimators,
1105-
criterion='mse',
1093+
criterion=criterion,
11061094
max_depth=self.max_depth,
11071095
min_samples_split=self.min_samples_split,
11081096
min_samples_leaf=self.min_samples_leaf,
11091097
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
11101098
max_features=self.max_features,
11111099
max_leaf_nodes=self.max_leaf_nodes,
11121100
min_impurity_decrease=self.min_impurity_decrease,
1113-
min_impurity_split=self.min_impurity_split,
11141101
bootstrap=self.bootstrap,
11151102
oob_score=self.oob_score,
11161103
n_jobs=self.n_jobs,

lineartree/lineartree.py

Lines changed: 8 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -566,10 +566,6 @@ class LinearBoostRegressor(_LinearBoosting, RegressorMixin):
566566
A node will be split if this split induces a decrease of the impurity
567567
greater than or equal to this value.
568568
569-
min_impurity_split : float, default=0
570-
Threshold for early stopping in tree growth. A node will split
571-
if its impurity is above the threshold, otherwise it is a leaf.
572-
573569
ccp_alpha : non-negative float, default=0.0
574570
Complexity parameter used for Minimal Cost-Complexity Pruning. The
575571
subtree with the largest cost complexity that is smaller than
@@ -619,8 +615,7 @@ def __init__(self, base_estimator, *, loss='linear', n_estimators=10,
619615
max_depth=3, min_samples_split=2, min_samples_leaf=1,
620616
min_weight_fraction_leaf=0.0, max_features=None,
621617
random_state=None, max_leaf_nodes=None,
622-
min_impurity_decrease=0.0, min_impurity_split=None,
623-
ccp_alpha=0.0):
618+
min_impurity_decrease=0.0, ccp_alpha=0.0):
624619

625620
self.base_estimator = base_estimator
626621
self.loss = loss
@@ -633,7 +628,6 @@ def __init__(self, base_estimator, *, loss='linear', n_estimators=10,
633628
self.random_state = random_state
634629
self.max_leaf_nodes = max_leaf_nodes
635630
self.min_impurity_decrease = min_impurity_decrease
636-
self.min_impurity_split = min_impurity_split
637631
self.ccp_alpha = ccp_alpha
638632

639633
def fit(self, X, y, sample_weight=None):
@@ -777,10 +771,6 @@ class LinearBoostClassifier(_LinearBoosting, ClassifierMixin):
777771
A node will be split if this split induces a decrease of the impurity
778772
greater than or equal to this value.
779773
780-
min_impurity_split : float, default=0
781-
Threshold for early stopping in tree growth. A node will split
782-
if its impurity is above the threshold, otherwise it is a leaf.
783-
784774
ccp_alpha : non-negative float, default=0.0
785775
Complexity parameter used for Minimal Cost-Complexity Pruning. The
786776
subtree with the largest cost complexity that is smaller than
@@ -830,8 +820,7 @@ def __init__(self, base_estimator, *, loss='hamming', n_estimators=10,
830820
max_depth=3, min_samples_split=2, min_samples_leaf=1,
831821
min_weight_fraction_leaf=0.0, max_features=None,
832822
random_state=None, max_leaf_nodes=None,
833-
min_impurity_decrease=0.0, min_impurity_split=None,
834-
ccp_alpha=0.0):
823+
min_impurity_decrease=0.0, ccp_alpha=0.0):
835824

836825
self.base_estimator = base_estimator
837826
self.loss = loss
@@ -844,7 +833,6 @@ def __init__(self, base_estimator, *, loss='hamming', n_estimators=10,
844833
self.random_state = random_state
845834
self.max_leaf_nodes = max_leaf_nodes
846835
self.min_impurity_decrease = min_impurity_decrease
847-
self.min_impurity_split = min_impurity_split
848836
self.ccp_alpha = ccp_alpha
849837

850838
def fit(self, X, y, sample_weight=None):
@@ -1039,10 +1027,6 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin):
10391027
A node will be split if this split induces a decrease of the impurity
10401028
greater than or equal to this value.
10411029
1042-
min_impurity_split : float, default=None
1043-
Threshold for early stopping in tree growth. A node will split
1044-
if its impurity is above the threshold, otherwise it is a leaf.
1045-
10461030
bootstrap : bool, default=True
10471031
Whether bootstrap samples are used when building trees. If False, the
10481032
whole dataset is used to build each tree.
@@ -1076,7 +1060,7 @@ class LinearForestClassifier(_LinearForest, ClassifierMixin):
10761060
- If None (default), then draw `X.shape[0]` samples.
10771061
- If int, then draw `max_samples` samples.
10781062
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
1079-
`max_samples` should be in the interval `(0, 1)`.
1063+
`max_samples` should be in the interval `(0, 1]`.
10801064
10811065
Attributes
10821066
----------
@@ -1129,9 +1113,8 @@ def __init__(self, base_estimator, *, n_estimators=100,
11291113
max_depth=None, min_samples_split=2, min_samples_leaf=1,
11301114
min_weight_fraction_leaf=0., max_features="auto",
11311115
max_leaf_nodes=None, min_impurity_decrease=0.,
1132-
min_impurity_split=None, bootstrap=True,
1133-
oob_score=False, n_jobs=None, random_state=None,
1134-
ccp_alpha=0.0, max_samples=None):
1116+
bootstrap=True, oob_score=False, n_jobs=None,
1117+
random_state=None, ccp_alpha=0.0, max_samples=None):
11351118

11361119
self.base_estimator = base_estimator
11371120
self.n_estimators = n_estimators
@@ -1142,7 +1125,6 @@ def __init__(self, base_estimator, *, n_estimators=100,
11421125
self.max_features = max_features
11431126
self.max_leaf_nodes = max_leaf_nodes
11441127
self.min_impurity_decrease = min_impurity_decrease
1145-
self.min_impurity_split = min_impurity_split
11461128
self.bootstrap = bootstrap
11471129
self.oob_score = oob_score
11481130
self.n_jobs = n_jobs
@@ -1351,10 +1333,6 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
13511333
A node will be split if this split induces a decrease of the impurity
13521334
greater than or equal to this value.
13531335
1354-
min_impurity_split : float, default=None
1355-
Threshold for early stopping in tree growth. A node will split
1356-
if its impurity is above the threshold, otherwise it is a leaf.
1357-
13581336
bootstrap : bool, default=True
13591337
Whether bootstrap samples are used when building trees. If False, the
13601338
whole dataset is used to build each tree.
@@ -1388,7 +1366,7 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
13881366
- If None (default), then draw `X.shape[0]` samples.
13891367
- If int, then draw `max_samples` samples.
13901368
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
1391-
`max_samples` should be in the interval `(0, 1)`.
1369+
`max_samples` should be in the interval `(0, 1]`.
13921370
13931371
Attributes
13941372
----------
@@ -1437,14 +1415,12 @@ class LinearForestRegressor(_LinearForest, RegressorMixin):
14371415
Authors: Haozhe Zhang, Dan Nettleton, Zhengyuan Zhu.
14381416
(https://arxiv.org/abs/1904.10416)
14391417
"""
1440-
14411418
def __init__(self, base_estimator, *, n_estimators=100,
14421419
max_depth=None, min_samples_split=2, min_samples_leaf=1,
14431420
min_weight_fraction_leaf=0., max_features="auto",
14441421
max_leaf_nodes=None, min_impurity_decrease=0.,
1445-
min_impurity_split=None, bootstrap=True,
1446-
oob_score=False, n_jobs=None, random_state=None,
1447-
ccp_alpha=0.0, max_samples=None):
1422+
bootstrap=True, oob_score=False, n_jobs=None,
1423+
random_state=None, ccp_alpha=0.0, max_samples=None):
14481424

14491425
self.base_estimator = base_estimator
14501426
self.n_estimators = n_estimators
@@ -1455,7 +1431,6 @@ def __init__(self, base_estimator, *, n_estimators=100,
14551431
self.max_features = max_features
14561432
self.max_leaf_nodes = max_leaf_nodes
14571433
self.min_impurity_decrease = min_impurity_decrease
1458-
self.min_impurity_split = min_impurity_split
14591434
self.bootstrap = bootstrap
14601435
self.oob_score = oob_score
14611436
self.n_jobs = n_jobs

0 commit comments

Comments
 (0)