Skip to content

Commit b170f6b

Browse files
intermediate changes
ref:1b720c067b08cec4c591141fb1b3f423d84abef1
1 parent df7b526 commit b170f6b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+16742
-6428
lines changed

catboost/app/mode_calc.cpp

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -125,20 +125,21 @@ int mode_calc(int argc, const char* argv[]) {
125125
auto visibleLabelsHelper = BuildLabelsHelper<TExternalLabelsHelper>(model);
126126

127127
SetSilentLogingMode();
128-
approx.OutputToFile(
129-
&executor,
130-
params.OutputColumnsIds,
131-
visibleLabelsHelper,
132-
poolPart,
133-
true,
134-
&outputStream,
135-
// TODO: src file columns output is incompatible with block processing
136-
/*testSetPath*/NCB::TPathWithScheme(),
137-
/*testFileWhichOf*/ {0, 0},
138-
params.DsvPoolFormatParams.Format,
139-
IsFirstBlock,
140-
docIdOffset,
141-
std::make_pair(evalPeriod, iterationsLimit)
128+
OutputEvalResultToFile(
129+
approx,
130+
&executor,
131+
params.OutputColumnsIds,
132+
visibleLabelsHelper,
133+
poolPart,
134+
true,
135+
&outputStream,
136+
// TODO: src file columns output is incompatible with block processing
137+
/*testSetPath*/NCB::TPathWithScheme(),
138+
/*testFileWhichOf*/ {0, 0},
139+
params.DsvPoolFormatParams.Format,
140+
IsFirstBlock,
141+
docIdOffset,
142+
std::make_pair(evalPeriod, iterationsLimit)
142143
);
143144
docIdOffset += blockSize;
144145
IsFirstBlock = false;

catboost/libs/algo/calc_score_cache.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ void TCalcScoreFold::SetPermutationBlockSizeAndCalcStatsRanges(int permutationBl
522522
int blockSize = DefaultCalcStatsObjBlockSize;
523523
if (docCount && HasQueryInfo()) {
524524
if (HasPairs(LearnQueriesInfo)) {
525-
rangeEnd = CB_THREAD_LIMIT;
525+
rangeEnd = CeilDiv(docCount, DefaultCalcStatsObjBlockSize);
526526
blockSize = 1;
527527
} else {
528528
rangeEnd = LearnQueriesInfo.ysize();

catboost/libs/algo/quantization.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,7 @@ void QuantizeTrainPools(
450450
catFeatures,
451451
floatFeatures,
452452
learnData->AllFeatures,
453-
/*allowNansOnlyInTest=*/false,
453+
/*allowNansOnlyInTest=*/true,
454454
/*clearPoolAfterBinarization=*/pools.AllowClearTest,
455455
localExecutor,
456456
/*select=*/{},

catboost/libs/algo/score_calcer.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,10 +332,11 @@ static void CalcStatsImpl(
332332
fold.BodyTailArr[0].WeightedDerivatives[0].data(),
333333
docCount
334334
);
335-
const auto docPart = CeilDiv(docCount, CB_THREAD_LIMIT);
335+
const auto blockCount = fold.GetCalcStatsIndexRanges().RangesCount();
336+
const auto docPart = CeilDiv(docCount, blockCount);
336337

337338
const auto pairCount = pairs.ysize();
338-
const auto pairPart = CeilDiv(pairCount, CB_THREAD_LIMIT);
339+
const auto pairPart = CeilDiv(pairCount, blockCount);
339340

340341
NCB::MapMerge(
341342
localExecutor,

catboost/libs/eval_result/eval_helpers.cpp

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,8 @@ static int GetColumnIndex(const TPoolColumnsMetaInfo& poolColumnsMetaInfo, EColu
575575

576576

577577

578-
void TEvalResult::OutputToFile(
578+
void OutputEvalResultToFile(
579+
const TEvalResult& evalResult,
579580
NPar::TLocalExecutor* executor,
580581
const TVector<TString>& outputColumns,
581582
const TExternalLabelsHelper& visibleLabelsHelper,
@@ -611,7 +612,7 @@ void TEvalResult::OutputToFile(
611612
for (const auto& columnName : outputColumns) {
612613
EPredictionType type;
613614
if (TryFromString<EPredictionType>(columnName, type)) {
614-
columnPrinter.push_back(MakeHolder<TEvalPrinter>(executor, RawValues, type, visibleLabelsHelper, evalParameters));
615+
columnPrinter.push_back(MakeHolder<TEvalPrinter>(executor, evalResult.GetRawValuesConstRef(), type, visibleLabelsHelper, evalParameters));
615616
continue;
616617
}
617618
EColumn outputType;
@@ -745,7 +746,8 @@ void TEvalResult::OutputToFile(
745746
}
746747
}
747748

748-
void TEvalResult::OutputToFile(
749+
void OutputEvalResultToFile(
750+
const TEvalResult& evalResult,
749751
int threadCount,
750752
const TVector<TString>& outputColumns,
751753
const TExternalLabelsHelper& visibleLabelsHelper,
@@ -757,25 +759,32 @@ void TEvalResult::OutputToFile(
757759
const TDsvFormatOptions& testSetFormat,
758760
bool writeHeader,
759761
ui64 docIdOffset) {
762+
760763
NPar::TLocalExecutor executor;
761764
executor.RunAdditionalThreads(threadCount - 1);
762-
OutputToFile(&executor,
763-
outputColumns,
764-
visibleLabelsHelper,
765-
pool,
766-
isPartOfFullTestSet,
767-
outputStream,
768-
testSetPath,
769-
testFileWhichOf,
770-
testSetFormat,
771-
writeHeader,
772-
docIdOffset);
765+
OutputEvalResultToFile(
766+
evalResult,
767+
&executor,
768+
outputColumns,
769+
visibleLabelsHelper,
770+
pool,
771+
isPartOfFullTestSet,
772+
outputStream,
773+
testSetPath,
774+
testFileWhichOf,
775+
testSetFormat,
776+
writeHeader,
777+
docIdOffset);
773778
}
774779

775780
TVector<TVector<TVector<double>>>& TEvalResult::GetRawValuesRef() {
776781
return RawValues;
777782
}
778783

784+
const TVector<TVector<TVector<double>>>& TEvalResult::GetRawValuesConstRef() const {
785+
return RawValues;
786+
}
787+
779788
void TEvalResult::ClearRawValues() {
780789
RawValues.clear();
781790
RawValues.resize(1);

catboost/libs/eval_result/eval_helpers.h

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -62,37 +62,41 @@ class TEvalResult {
6262
}
6363

6464
TVector<TVector<TVector<double>>>& GetRawValuesRef();
65+
const TVector<TVector<TVector<double>>>& GetRawValuesConstRef() const;
6566
void ClearRawValues();
6667

6768
/// *Move* data from `rawValues` to `RawValues[0]`
6869
void SetRawValuesByMove(TVector<TVector<double>>& rawValues);
6970

70-
void OutputToFile(
71-
NPar::TLocalExecutor* executor,
72-
const TVector<TString>& outputColumns,
73-
const TExternalLabelsHelper& visibleLabelsHelper,
74-
const TPool& pool,
75-
bool isPartOfTestSet, // pool is a part of test set, can't output testSetPath columns
76-
IOutputStream* outputStream,
77-
const NCB::TPathWithScheme& testSetPath,
78-
std::pair<int, int> testFileWhichOf,
79-
const NCB::TDsvFormatOptions& testSetFormat,
80-
bool writeHeader = true,
81-
ui64 docIdOffset = 0,
82-
TMaybe<std::pair<size_t, size_t>> evalParameters = TMaybe<std::pair<size_t, size_t>>());
83-
void OutputToFile(
84-
int threadCount,
85-
const TVector<TString>& outputColumns,
86-
const TExternalLabelsHelper& visibleLabelsHelper,
87-
const TPool& pool,
88-
bool isPartOfTestSet, // pool is a part of test set, can't output testSetPath columns
89-
IOutputStream* outputStream,
90-
const NCB::TPathWithScheme& testSetPath,
91-
std::pair<int, int> testFileWhichOf,
92-
const NCB::TDsvFormatOptions& testSetFormat,
93-
bool writeHeader = true,
94-
ui64 docIdOffset = 0);
95-
9671
private:
9772
TVector<TVector<TVector<double>>> RawValues; // [evalIter][dim][docIdx]
9873
};
74+
75+
void OutputEvalResultToFile(
76+
const TEvalResult& evalResult,
77+
NPar::TLocalExecutor* executor,
78+
const TVector<TString>& outputColumns,
79+
const TExternalLabelsHelper& visibleLabelsHelper,
80+
const TPool& pool,
81+
bool isPartOfTestSet, // pool is a part of test set, can't output testSetPath columns
82+
IOutputStream* outputStream,
83+
const NCB::TPathWithScheme& testSetPath,
84+
std::pair<int, int> testFileWhichOf,
85+
const NCB::TDsvFormatOptions& testSetFormat,
86+
bool writeHeader = true,
87+
ui64 docIdOffset = 0,
88+
TMaybe<std::pair<size_t, size_t>> evalParameters = TMaybe<std::pair<size_t, size_t>>());
89+
90+
void OutputEvalResultToFile(
91+
const TEvalResult& evalResult,
92+
int threadCount,
93+
const TVector<TString>& outputColumns,
94+
const TExternalLabelsHelper& visibleLabelsHelper,
95+
const TPool& pool,
96+
bool isPartOfTestSet, // pool is a part of test set, can't output testSetPath columns
97+
IOutputStream* outputStream,
98+
const NCB::TPathWithScheme& testSetPath,
99+
std::pair<int, int> testFileWhichOf,
100+
const NCB::TDsvFormatOptions& testSetFormat,
101+
bool writeHeader = true,
102+
ui64 docIdOffset = 0);

catboost/libs/train_lib/train_model.cpp

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -729,29 +729,33 @@ class TCPUModelTrainer : public IModelTrainer {
729729
for (int testIdx = 0; testIdx < pools.Test.ysize(); ++testIdx) {
730730
const TPool& testPool = pools.Test[testIdx];
731731
const NCB::TPathWithScheme& testSetPath = testIdx < loadOptions.TestSetPaths.ysize() ? loadOptions.TestSetPaths[testIdx] : NCB::TPathWithScheme();
732-
evalResults[testIdx].OutputToFile(threadCount,
733-
outputOptions.GetOutputColumns(),
734-
visibleLabelsHelper,
735-
testPool,
736-
false,
737-
&fileStream,
738-
testSetPath,
739-
{testIdx, pools.Test.ysize()},
740-
loadOptions.DsvPoolFormatParams.Format,
741-
/*writeHeader*/ testIdx < 1);
732+
OutputEvalResultToFile(
733+
evalResults[testIdx],
734+
threadCount,
735+
outputOptions.GetOutputColumns(),
736+
visibleLabelsHelper,
737+
testPool,
738+
false,
739+
&fileStream,
740+
testSetPath,
741+
{testIdx, pools.Test.ysize()},
742+
loadOptions.DsvPoolFormatParams.Format,
743+
/*writeHeader*/ testIdx < 1);
742744
}
743745
if (pools.Test.empty()) {
744746
// Make sure to emit header to fileStream
745-
evalResults[0].OutputToFile(threadCount,
746-
outputOptions.GetOutputColumns(),
747-
visibleLabelsHelper,
748-
TPool(),
749-
false,
750-
&fileStream,
751-
NCB::TPathWithScheme(),
752-
{0, 1},
753-
loadOptions.DsvPoolFormatParams.Format,
754-
/*writeHeader*/ true);
747+
OutputEvalResultToFile(
748+
evalResults[0],
749+
threadCount,
750+
outputOptions.GetOutputColumns(),
751+
visibleLabelsHelper,
752+
TPool(),
753+
false,
754+
&fileStream,
755+
NCB::TPathWithScheme(),
756+
{0, 1},
757+
loadOptions.DsvPoolFormatParams.Format,
758+
/*writeHeader*/ true);
755759
}
756760
} else {
757761
MATRIXNET_INFO_LOG << "Skipping test eval output" << Endl;

catboost/libs/train_lib/ut/train_model_ut.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99

1010
Y_UNIT_TEST_SUITE(TrainModelTests) {
1111
Y_UNIT_TEST(TrainWithoutNansTestWithNans) {
12-
// Train doesn't have NaNs, so TrainModel implicitly forbids them (during quantization), and
13-
// test data have NaN feature, so the entire training process fails.
12+
// Train doesn't have NaNs, so TrainModel implicitly forbids them (during quantization), but
13+
// test data has NaNs and we just allow that
14+
//
15+
// See MLTOOLS-1602 and MLTOOLS-2235 for details (though there aren't many details).
1416
//
1517
TTempDir trainDir;
1618

@@ -45,13 +47,15 @@ Y_UNIT_TEST_SUITE(TrainModelTests) {
4547
);
4648
};
4749

48-
UNIT_ASSERT_EXCEPTION_CONTAINS(f(), TCatboostException, "There are NaNs in test dataset");
50+
UNIT_ASSERT_NO_EXCEPTION(f());
4951
}
5052

5153
Y_UNIT_TEST(TrainWithoutNansApplyWithNans) {
5254
// Train doesn't have NaNs, so TrainModel implicitly forbids them (during quantization), but
5355
// during model application we allow NaNs (because it's too expensive to check for their
54-
// presence)
56+
// presence).
57+
//
58+
// See MLTOOLS-1602 and MLTOOLS-2235 for details (though there aren't many details).
5559
//
5660
TTempDir trainDir;
5761

catboost/python-package/ut/medium/test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2184,8 +2184,7 @@ def test_eval_set_with_nans(task_type):
21842184
model = CatBoost({'iterations': 2, 'random_seed': 0, 'loss_function': 'RMSE', 'task_type': task_type, 'devices': '0'})
21852185
train_pool = Pool(features, label=labels)
21862186
test_pool = Pool(features_with_nans, label=labels)
2187-
with pytest.raises(CatboostError, match='NaNs in test.* no NaNs in learn'):
2188-
model.fit(train_pool, eval_set=test_pool)
2187+
model.fit(train_pool, eval_set=test_pool)
21892188

21902189

21912190
def test_learning_rate_auto_set(task_type):

0 commit comments

Comments
 (0)