2019-12-11 11:20:40 +08:00
import json
2020-02-17 11:31:13 +08:00
import locale
2022-10-26 16:56:11 +08:00
import os
2020-08-22 13:18:48 +08:00
import tempfile
2015-09-08 09:47:48 -04:00
2022-10-26 16:56:11 +08:00
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
# Directory holding the bundled demo datasets used throughout these tests.
dpath = tm.data_dir(__file__)

# Fixed-seed RNG so randomly generated test data is reproducible across runs.
rng = np.random.RandomState(1994)
2016-04-24 16:34:46 +09:00
2022-01-16 02:11:53 +08:00
def json_model(model_path: str, parameters: dict) -> dict:
    """Train a small classifier, save it to ``model_path``, and return the
    parsed model document (UBJSON or JSON depending on the file extension).

    ``parameters`` is mutated in place when a multi-class objective requires
    ``num_class`` to be set.
    """
    datasets = pytest.importorskip("sklearn.datasets")
    X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6)
    if parameters.get("objective", None) == "multi:softmax":
        parameters["num_class"] = 3

    booster = xgb.train(parameters, xgb.DMatrix(X, y))
    booster.save_model(model_path)

    # Decode the on-disk document with the codec matching the extension.
    if model_path.endswith("ubj"):
        import ubjson

        with open(model_path, "rb") as ubjfd:
            model = ubjson.load(ubjfd)
    else:
        with open(model_path, "r") as fd:
            model = json.load(fd)

    return model
2020-11-03 02:27:39 -05:00
class TestModels :
2016-04-24 16:34:46 +09:00
def test_glm ( self ) :
2019-03-17 17:55:04 +08:00
param = { ' verbosity ' : 0 , ' objective ' : ' binary:logistic ' ,
2020-01-30 16:00:18 +08:00
' booster ' : ' gblinear ' , ' alpha ' : 0.0001 , ' lambda ' : 1 ,
' nthread ' : 1 }
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
dtest = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.test " ) )
2016-04-24 16:34:46 +09:00
watchlist = [ ( dtest , ' eval ' ) , ( dtrain , ' train ' ) ]
num_round = 4
bst = xgb . train ( param , dtrain , num_round , watchlist )
assert isinstance ( bst , xgb . core . Booster )
preds = bst . predict ( dtest )
labels = dtest . get_label ( )
err = sum ( 1 for i in range ( len ( preds ) )
if int ( preds [ i ] > 0.5 ) != labels [ i ] ) / float ( len ( preds ) )
2016-11-20 18:23:19 -06:00
assert err < 0.2
2016-04-24 16:34:46 +09:00
2016-06-09 06:04:01 +09:00
def test_dart ( self ) :
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
dtest = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.test " ) )
2020-01-13 08:48:30 -05:00
param = { ' max_depth ' : 5 , ' objective ' : ' binary:logistic ' ,
' eval_metric ' : ' logloss ' , ' booster ' : ' dart ' , ' verbosity ' : 1 }
2016-06-09 06:04:01 +09:00
# specify validations set to watch performance
watchlist = [ ( dtest , ' eval ' ) , ( dtrain , ' train ' ) ]
num_round = 2
bst = xgb . train ( param , dtrain , num_round , watchlist )
# this is prediction
preds = bst . predict ( dtest , ntree_limit = num_round )
labels = dtest . get_label ( )
2020-01-13 08:48:30 -05:00
err = sum ( 1 for i in range ( len ( preds ) )
if int ( preds [ i ] > 0.5 ) != labels [ i ] ) / float ( len ( preds ) )
2016-06-09 06:04:01 +09:00
# error must be smaller than 10%
assert err < 0.1
2020-08-22 13:18:48 +08:00
with tempfile . TemporaryDirectory ( ) as tmpdir :
dtest_path = os . path . join ( tmpdir , ' dtest.dmatrix ' )
model_path = os . path . join ( tmpdir , ' xgboost.model.dart ' )
# save dmatrix into binary buffer
dtest . save_binary ( dtest_path )
model_path = model_path
# save model
bst . save_model ( model_path )
# load model and data in
bst2 = xgb . Booster ( params = param , model_file = model_path )
dtest2 = xgb . DMatrix ( dtest_path )
2016-06-09 06:04:01 +09:00
preds2 = bst2 . predict ( dtest2 , ntree_limit = num_round )
2020-08-22 13:18:48 +08:00
2016-06-09 06:04:01 +09:00
# assert they are the same
assert np . sum ( np . abs ( preds2 - preds ) ) == 0
2020-01-13 08:48:30 -05:00
def my_logloss ( preds , dtrain ) :
labels = dtrain . get_label ( )
return ' logloss ' , np . sum (
np . log ( np . where ( labels , preds , 1 - preds ) ) )
# check whether custom evaluation metrics work
bst = xgb . train ( param , dtrain , num_round , watchlist ,
feval = my_logloss )
preds3 = bst . predict ( dtest , ntree_limit = num_round )
assert all ( preds3 == preds )
2016-06-09 06:04:01 +09:00
# check whether sample_type and normalize_type work
num_round = 50
2019-03-17 17:55:04 +08:00
param [ ' verbosity ' ] = 0
2016-06-09 06:04:01 +09:00
param [ ' learning_rate ' ] = 0.1
param [ ' rate_drop ' ] = 0.1
preds_list = [ ]
2020-01-13 08:48:30 -05:00
for p in [ [ p0 , p1 ] for p0 in [ ' uniform ' , ' weighted ' ]
for p1 in [ ' tree ' , ' forest ' ] ] :
2016-06-09 06:04:01 +09:00
param [ ' sample_type ' ] = p [ 0 ]
param [ ' normalize_type ' ] = p [ 1 ]
bst = xgb . train ( param , dtrain , num_round , watchlist )
preds = bst . predict ( dtest , ntree_limit = num_round )
2020-01-13 08:48:30 -05:00
err = sum ( 1 for i in range ( len ( preds ) )
if int ( preds [ i ] > 0.5 ) != labels [ i ] ) / float ( len ( preds ) )
2016-06-09 06:04:01 +09:00
assert err < 0.1
preds_list . append ( preds )
for ii in range ( len ( preds_list ) ) :
for jj in range ( ii + 1 , len ( preds_list ) ) :
assert np . sum ( np . abs ( preds_list [ ii ] - preds_list [ jj ] ) ) > 0
2019-12-24 09:43:41 +08:00
def test_boost_from_prediction ( self ) :
# Re-construct dtrain here to avoid modification
2022-10-26 16:56:11 +08:00
margined = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
2019-12-24 09:43:41 +08:00
bst = xgb . train ( { ' tree_method ' : ' hist ' } , margined , 1 )
predt_0 = bst . predict ( margined , output_margin = True )
margined . set_base_margin ( predt_0 )
bst = xgb . train ( { ' tree_method ' : ' hist ' } , margined , 1 )
predt_1 = bst . predict ( margined )
assert np . any ( np . abs ( predt_1 - predt_0 ) > 1e-6 )
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
2019-12-24 09:43:41 +08:00
bst = xgb . train ( { ' tree_method ' : ' hist ' } , dtrain , 2 )
predt_2 = bst . predict ( dtrain )
assert np . all ( np . abs ( predt_2 - predt_1 ) < 1e-6 )
2020-12-17 19:59:19 +08:00
def test_boost_from_existing_model ( self ) :
2022-10-26 16:56:11 +08:00
X = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
2020-12-17 19:59:19 +08:00
booster = xgb . train ( { ' tree_method ' : ' hist ' } , X , num_boost_round = 4 )
assert booster . num_boosted_rounds ( ) == 4
booster = xgb . train ( { ' tree_method ' : ' hist ' } , X , num_boost_round = 4 ,
xgb_model = booster )
assert booster . num_boosted_rounds ( ) == 8
booster = xgb . train ( { ' updater ' : ' prune ' , ' process_type ' : ' update ' } , X ,
num_boost_round = 4 , xgb_model = booster )
# Trees are moved for update, the rounds is reduced. This test is
# written for being compatible with current code (1.0.0). If the
# behaviour is considered sub-optimal, feel free to change.
assert booster . num_boosted_rounds ( ) == 4
2021-06-09 14:51:17 +08:00
def run_custom_objective ( self , tree_method = None ) :
param = {
' max_depth ' : 2 ,
' eta ' : 1 ,
' objective ' : ' reg:logistic ' ,
" tree_method " : tree_method
}
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
dtest = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.test " ) )
2016-04-24 16:34:46 +09:00
watchlist = [ ( dtest , ' eval ' ) , ( dtrain , ' train ' ) ]
2020-08-05 12:27:19 +08:00
num_round = 10
2016-04-24 16:34:46 +09:00
def logregobj ( preds , dtrain ) :
labels = dtrain . get_label ( )
preds = 1.0 / ( 1.0 + np . exp ( - preds ) )
grad = preds - labels
hess = preds * ( 1.0 - preds )
return grad , hess
def evalerror ( preds , dtrain ) :
labels = dtrain . get_label ( )
2020-08-05 12:27:19 +08:00
preds = 1.0 / ( 1.0 + np . exp ( - preds ) )
2020-07-23 03:28:17 +08:00
return ' error ' , float ( sum ( labels != ( preds > 0.5 ) ) ) / len ( labels )
2016-04-24 16:34:46 +09:00
# test custom_objective in training
2020-08-05 12:27:19 +08:00
bst = xgb . train ( param , dtrain , num_round , watchlist , obj = logregobj ,
feval = evalerror )
2016-04-24 16:34:46 +09:00
assert isinstance ( bst , xgb . core . Booster )
preds = bst . predict ( dtest )
labels = dtest . get_label ( )
err = sum ( 1 for i in range ( len ( preds ) )
if int ( preds [ i ] > 0.5 ) != labels [ i ] ) / float ( len ( preds ) )
assert err < 0.1
# test custom_objective in cross-validation
xgb . cv ( param , dtrain , num_round , nfold = 5 , seed = 0 ,
obj = logregobj , feval = evalerror )
# test maximize parameter
def neg_evalerror ( preds , dtrain ) :
labels = dtrain . get_label ( )
return ' error ' , float ( sum ( labels == ( preds > 0.0 ) ) ) / len ( labels )
2020-08-05 12:27:19 +08:00
bst2 = xgb . train ( param , dtrain , num_round , watchlist , logregobj ,
neg_evalerror , maximize = True )
2016-04-24 16:34:46 +09:00
preds2 = bst2 . predict ( dtest )
err2 = sum ( 1 for i in range ( len ( preds2 ) )
if int ( preds2 [ i ] > 0.5 ) != labels [ i ] ) / float ( len ( preds2 ) )
assert err == err2
2021-06-09 14:51:17 +08:00
def test_custom_objective ( self ) :
self . run_custom_objective ( )
2016-06-05 00:17:35 -05:00
def test_multi_eval_metric ( self ) :
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
dtest = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.test " ) )
2016-06-05 00:17:35 -05:00
watchlist = [ ( dtest , ' eval ' ) , ( dtrain , ' train ' ) ]
2019-07-20 08:34:56 -04:00
param = { ' max_depth ' : 2 , ' eta ' : 0.2 , ' verbosity ' : 1 ,
2019-03-17 17:55:04 +08:00
' objective ' : ' binary:logistic ' }
2016-06-05 00:17:35 -05:00
param [ ' eval_metric ' ] = [ " auc " , " logloss " , ' error ' ]
evals_result = { }
bst = xgb . train ( param , dtrain , 4 , watchlist , evals_result = evals_result )
assert isinstance ( bst , xgb . core . Booster )
assert len ( evals_result [ ' eval ' ] ) == 3
assert set ( evals_result [ ' eval ' ] . keys ( ) ) == { ' auc ' , ' error ' , ' logloss ' }
2016-04-24 16:34:46 +09:00
def test_fpreproc ( self ) :
2019-03-17 17:55:04 +08:00
param = { ' max_depth ' : 2 , ' eta ' : 1 , ' verbosity ' : 0 ,
2016-04-24 16:34:46 +09:00
' objective ' : ' binary:logistic ' }
num_round = 2
def fpreproc ( dtrain , dtest , param ) :
label = dtrain . get_label ( )
ratio = float ( np . sum ( label == 0 ) ) / np . sum ( label == 1 )
param [ ' scale_pos_weight ' ] = ratio
return ( dtrain , dtest , param )
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
2016-04-24 16:34:46 +09:00
xgb . cv ( param , dtrain , num_round , nfold = 5 ,
metrics = { ' auc ' } , seed = 0 , fpreproc = fpreproc )
def test_show_stdv ( self ) :
2019-03-17 17:55:04 +08:00
param = { ' max_depth ' : 2 , ' eta ' : 1 , ' verbosity ' : 0 ,
2016-04-24 16:34:46 +09:00
' objective ' : ' binary:logistic ' }
num_round = 2
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
2016-04-24 16:34:46 +09:00
xgb . cv ( param , dtrain , num_round , nfold = 5 ,
metrics = { ' error ' } , seed = 0 , show_stdv = False )
2016-04-29 13:51:34 +09:00
2023-03-14 22:09:36 +08:00
def test_prediction_cache ( self ) - > None :
X , y = tm . make_sparse_regression ( 512 , 4 , 0.5 , as_dense = False )
Xy = xgb . DMatrix ( X , y )
param = { " max_depth " : 8 }
booster = xgb . train ( param , Xy , num_boost_round = 1 )
with tempfile . TemporaryDirectory ( ) as tmpdir :
path = os . path . join ( tmpdir , " model.json " )
booster . save_model ( path )
predt_0 = booster . predict ( Xy )
param [ " max_depth " ] = 2
booster = xgb . train ( param , Xy , num_boost_round = 1 )
predt_1 = booster . predict ( Xy )
assert not np . isclose ( predt_0 , predt_1 ) . all ( )
booster . load_model ( path )
predt_2 = booster . predict ( Xy )
np . testing . assert_allclose ( predt_0 , predt_2 )
2016-04-29 13:51:34 +09:00
def test_feature_names_validation ( self ) :
X = np . random . random ( ( 10 , 3 ) )
y = np . random . randint ( 2 , size = ( 10 , ) )
2021-02-25 18:54:16 +08:00
dm1 = xgb . DMatrix ( X , y , feature_names = ( " a " , " b " , " c " ) )
dm2 = xgb . DMatrix ( X , y )
2016-04-29 13:51:34 +09:00
bst = xgb . train ( [ ] , dm1 )
bst . predict ( dm1 ) # success
2020-11-03 02:27:39 -05:00
with pytest . raises ( ValueError ) :
bst . predict ( dm2 )
2016-04-29 13:51:34 +09:00
bst . predict ( dm1 ) # success
bst = xgb . train ( [ ] , dm2 )
bst . predict ( dm2 ) # success
2019-12-11 11:20:40 +08:00
2020-02-13 20:41:59 +08:00
def test_model_binary_io ( self ) :
model_path = ' test_model_binary_io.bin '
parameters = { ' tree_method ' : ' hist ' , ' booster ' : ' gbtree ' ,
' scale_pos_weight ' : ' 0.5 ' }
X = np . random . random ( ( 10 , 3 ) )
y = np . random . random ( ( 10 , ) )
dtrain = xgb . DMatrix ( X , y )
bst = xgb . train ( parameters , dtrain , num_boost_round = 2 )
bst . save_model ( model_path )
bst = xgb . Booster ( model_file = model_path )
os . remove ( model_path )
config = json . loads ( bst . save_config ( ) )
assert float ( config [ ' learner ' ] [ ' objective ' ] [
' reg_loss_param ' ] [ ' scale_pos_weight ' ] ) == 0.5
2020-02-26 11:30:13 +08:00
buf = bst . save_raw ( )
from_raw = xgb . Booster ( )
from_raw . load_model ( buf )
buf_from_raw = from_raw . save_raw ( )
assert buf == buf_from_raw
2022-01-16 02:11:53 +08:00
def run_model_json_io ( self , parameters : dict , ext : str ) - > None :
if ext == " ubj " and tm . no_ubjson ( ) [ " condition " ] :
pytest . skip ( tm . no_ubjson ( ) [ " reason " ] )
2020-02-17 11:31:13 +08:00
loc = locale . getpreferredencoding ( False )
2022-01-16 02:11:53 +08:00
model_path = ' test_model_json_io. ' + ext
2020-01-28 13:29:09 +08:00
j_model = json_model ( model_path , parameters )
2019-12-11 19:49:01 +08:00
assert isinstance ( j_model [ ' learner ' ] , dict )
2019-12-11 11:20:40 +08:00
2020-02-13 20:41:59 +08:00
bst = xgb . Booster ( model_file = model_path )
2019-12-11 11:20:40 +08:00
2019-12-23 19:47:35 +08:00
bst . save_model ( fname = model_path )
2022-01-16 02:11:53 +08:00
if ext == " ubj " :
import ubjson
with open ( model_path , " rb " ) as ubjfd :
j_model = ubjson . load ( ubjfd )
else :
with open ( model_path , ' r ' ) as fd :
j_model = json . load ( fd )
2019-12-11 19:49:01 +08:00
assert isinstance ( j_model [ ' learner ' ] , dict )
2019-12-11 11:20:40 +08:00
2019-12-23 19:47:35 +08:00
os . remove ( model_path )
2020-02-17 11:31:13 +08:00
assert locale . getpreferredencoding ( False ) == loc
2019-12-23 19:47:35 +08:00
2022-01-19 02:27:51 +08:00
json_raw = bst . save_raw ( raw_format = " json " )
from_jraw = xgb . Booster ( )
from_jraw . load_model ( json_raw )
ubj_raw = bst . save_raw ( raw_format = " ubj " )
from_ubjraw = xgb . Booster ( )
from_ubjraw . load_model ( ubj_raw )
2023-03-22 23:49:56 +08:00
if parameters . get ( " multi_strategy " , None ) != " multi_output_tree " :
# old binary model is not supported.
old_from_json = from_jraw . save_raw ( raw_format = " deprecated " )
old_from_ubj = from_ubjraw . save_raw ( raw_format = " deprecated " )
2022-01-19 02:27:51 +08:00
2023-03-22 23:49:56 +08:00
assert old_from_json == old_from_ubj
2022-01-19 02:27:51 +08:00
2022-06-01 16:20:58 +08:00
raw_json = bst . save_raw ( raw_format = " json " )
pretty = json . dumps ( json . loads ( raw_json ) , indent = 2 ) + " \n \n "
bst . load_model ( bytearray ( pretty , encoding = " ascii " ) )
2023-03-22 23:49:56 +08:00
if parameters . get ( " multi_strategy " , None ) != " multi_output_tree " :
# old binary model is not supported.
old_from_json = from_jraw . save_raw ( raw_format = " deprecated " )
old_from_ubj = from_ubjraw . save_raw ( raw_format = " deprecated " )
assert old_from_json == old_from_ubj
2022-06-01 16:20:58 +08:00
2023-03-22 23:49:56 +08:00
rng = np . random . default_rng ( )
X = rng . random ( size = from_jraw . num_features ( ) * 10 ) . reshape (
( 10 , from_jraw . num_features ( ) )
)
predt_from_jraw = from_jraw . predict ( xgb . DMatrix ( X ) )
predt_from_bst = bst . predict ( xgb . DMatrix ( X ) )
np . testing . assert_allclose ( predt_from_jraw , predt_from_bst )
2022-06-01 16:20:58 +08:00
2022-01-16 02:11:53 +08:00
@pytest.mark.parametrize ( " ext " , [ " json " , " ubj " ] )
def test_model_json_io ( self , ext : str ) - > None :
parameters = { " booster " : " gbtree " , " tree_method " : " hist " }
self . run_model_json_io ( parameters , ext )
2023-03-22 23:49:56 +08:00
parameters = {
" booster " : " gbtree " ,
" tree_method " : " hist " ,
" multi_strategy " : " multi_output_tree " ,
" objective " : " multi:softmax " ,
}
self . run_model_json_io ( parameters , ext )
2022-01-16 02:11:53 +08:00
parameters = { " booster " : " gblinear " }
self . run_model_json_io ( parameters , ext )
parameters = { " booster " : " dart " , " tree_method " : " hist " }
self . run_model_json_io ( parameters , ext )
2019-12-23 19:47:35 +08:00
@pytest.mark.skipif ( * * tm . no_json_schema ( ) )
2020-05-15 10:18:43 +08:00
def test_json_io_schema ( self ) :
2019-12-23 19:47:35 +08:00
import jsonschema
2020-02-13 20:41:59 +08:00
model_path = ' test_json_schema.json '
2019-12-23 19:47:35 +08:00
path = os . path . dirname (
os . path . dirname ( os . path . dirname ( os . path . abspath ( __file__ ) ) ) )
doc = os . path . join ( path , ' doc ' , ' model.schema ' )
with open ( doc , ' r ' ) as fd :
schema = json . load ( fd )
2020-01-28 13:29:09 +08:00
parameters = { ' tree_method ' : ' hist ' , ' booster ' : ' gbtree ' }
jsonschema . validate ( instance = json_model ( model_path , parameters ) ,
schema = schema )
os . remove ( model_path )
parameters = { ' tree_method ' : ' hist ' , ' booster ' : ' dart ' }
jsonschema . validate ( instance = json_model ( model_path , parameters ) ,
schema = schema )
2019-12-23 19:47:35 +08:00
os . remove ( model_path )
2020-05-15 10:18:43 +08:00
2020-08-05 15:21:11 +08:00
try :
2022-10-26 16:56:11 +08:00
dtrain = xgb . DMatrix ( os . path . join ( dpath , " agaricus.txt.train " ) )
2020-08-05 15:21:11 +08:00
xgb . train ( { ' objective ' : ' foo ' } , dtrain , num_boost_round = 1 )
except ValueError as e :
e_str = str ( e )
beg = e_str . find ( ' Objective candidate ' )
end = e_str . find ( ' Stack trace ' )
e_str = e_str [ beg : end ]
e_str = e_str . strip ( )
splited = e_str . splitlines ( )
objectives = [ s . split ( ' : ' ) [ 1 ] for s in splited ]
j_objectives = schema [ ' properties ' ] [ ' learner ' ] [ ' properties ' ] [
' objective ' ] [ ' oneOf ' ]
objectives_from_schema = set ( )
for j_obj in j_objectives :
objectives_from_schema . add (
j_obj [ ' properties ' ] [ ' name ' ] [ ' const ' ] )
objectives = set ( objectives )
assert objectives == objectives_from_schema
2020-05-15 10:18:43 +08:00
@pytest.mark.skipif ( * * tm . no_json_schema ( ) )
def test_json_dump_schema ( self ) :
import jsonschema
def validate_model ( parameters ) :
X = np . random . random ( ( 100 , 30 ) )
y = np . random . randint ( 0 , 4 , size = ( 100 , ) )
parameters [ ' num_class ' ] = 4
m = xgb . DMatrix ( X , y )
booster = xgb . train ( parameters , m )
dump = booster . get_dump ( dump_format = ' json ' )
for i in range ( len ( dump ) ) :
jsonschema . validate ( instance = json . loads ( dump [ i ] ) ,
schema = schema )
path = os . path . dirname (
os . path . dirname ( os . path . dirname ( os . path . abspath ( __file__ ) ) ) )
doc = os . path . join ( path , ' doc ' , ' dump.schema ' )
with open ( doc , ' r ' ) as fd :
schema = json . load ( fd )
parameters = { ' tree_method ' : ' hist ' , ' booster ' : ' gbtree ' ,
' objective ' : ' multi:softmax ' }
validate_model ( parameters )
parameters = { ' tree_method ' : ' hist ' , ' booster ' : ' dart ' ,
' objective ' : ' multi:softmax ' }
validate_model ( parameters )
2020-11-03 02:27:39 -05:00
2022-02-19 08:05:28 +08:00
def test_categorical_model_io ( self ) :
X , y = tm . make_categorical ( 256 , 16 , 71 , False )
Xy = xgb . DMatrix ( X , y , enable_categorical = True )
booster = xgb . train ( { " tree_method " : " approx " } , Xy , num_boost_round = 16 )
predt_0 = booster . predict ( Xy )
with tempfile . TemporaryDirectory ( ) as tempdir :
path = os . path . join ( tempdir , " model.binary " )
with pytest . raises ( ValueError , match = r " .*JSON/UBJSON.* " ) :
booster . save_model ( path )
path = os . path . join ( tempdir , " model.json " )
booster . save_model ( path )
booster = xgb . Booster ( model_file = path )
predt_1 = booster . predict ( Xy )
np . testing . assert_allclose ( predt_0 , predt_1 )
path = os . path . join ( tempdir , " model.ubj " )
booster . save_model ( path )
booster = xgb . Booster ( model_file = path )
predt_1 = booster . predict ( Xy )
np . testing . assert_allclose ( predt_0 , predt_1 )
2021-01-13 16:56:49 +08:00
@pytest.mark.skipif ( * * tm . no_sklearn ( ) )
def test_attributes ( self ) :
from sklearn . datasets import load_iris
X , y = load_iris ( return_X_y = True )
cls = xgb . XGBClassifier ( n_estimators = 2 )
cls . fit ( X , y , early_stopping_rounds = 1 , eval_set = [ ( X , y ) ] )
2021-01-19 23:51:16 +08:00
assert cls . get_booster ( ) . best_ntree_limit == 2
2021-01-13 16:56:49 +08:00
assert cls . best_ntree_limit == cls . get_booster ( ) . best_ntree_limit
with tempfile . TemporaryDirectory ( ) as tmpdir :
path = os . path . join ( tmpdir , " cls.json " )
cls . save_model ( path )
cls = xgb . XGBClassifier ( n_estimators = 2 )
cls . load_model ( path )
2021-01-19 23:51:16 +08:00
assert cls . get_booster ( ) . best_ntree_limit == 2
2021-01-13 16:56:49 +08:00
assert cls . best_ntree_limit == cls . get_booster ( ) . best_ntree_limit
2022-03-29 02:32:42 +08:00
def run_slice (
self ,
booster : xgb . Booster ,
dtrain : xgb . DMatrix ,
num_parallel_tree : int ,
num_classes : int ,
num_boost_round : int
) :
2020-11-03 02:27:39 -05:00
beg = 3
end = 7
2022-03-29 02:32:42 +08:00
sliced : xgb . Booster = booster [ beg : end ]
2021-07-06 11:47:49 +08:00
assert sliced . feature_types == booster . feature_types
2020-11-03 02:27:39 -05:00
sliced_trees = ( end - beg ) * num_parallel_tree * num_classes
assert sliced_trees == len ( sliced . get_dump ( ) )
sliced_trees = sliced_trees / / 2
2022-03-29 02:32:42 +08:00
sliced = booster [ beg : end : 2 ]
2020-11-03 02:27:39 -05:00
assert sliced_trees == len ( sliced . get_dump ( ) )
2022-03-29 02:32:42 +08:00
sliced = booster [ beg : . . . ]
2020-11-03 02:27:39 -05:00
sliced_trees = ( num_boost_round - beg ) * num_parallel_tree * num_classes
assert sliced_trees == len ( sliced . get_dump ( ) )
2022-03-29 02:32:42 +08:00
sliced = booster [ beg : ]
2020-11-03 02:27:39 -05:00
sliced_trees = ( num_boost_round - beg ) * num_parallel_tree * num_classes
assert sliced_trees == len ( sliced . get_dump ( ) )
2022-03-29 02:32:42 +08:00
sliced = booster [ : end ]
2020-11-03 02:27:39 -05:00
sliced_trees = end * num_parallel_tree * num_classes
assert sliced_trees == len ( sliced . get_dump ( ) )
2022-03-29 02:32:42 +08:00
sliced = booster [ . . . : end ]
2020-11-03 02:27:39 -05:00
sliced_trees = end * num_parallel_tree * num_classes
assert sliced_trees == len ( sliced . get_dump ( ) )
2022-03-29 02:32:42 +08:00
with pytest . raises ( ValueError , match = r " >= 0 " ) :
booster [ - 1 : 0 ]
2020-11-03 02:27:39 -05:00
# we do not accept empty slice.
2023-03-27 23:10:54 +08:00
with pytest . raises ( ValueError , match = " Empty slice " ) :
2020-11-03 02:27:39 -05:00
booster [ 1 : 1 ]
# stop can not be smaller than begin
2022-03-29 02:32:42 +08:00
with pytest . raises ( ValueError , match = r " Invalid.* " ) :
2020-11-03 02:27:39 -05:00
booster [ 3 : 0 ]
2022-03-29 02:32:42 +08:00
with pytest . raises ( ValueError , match = r " Invalid.* " ) :
2020-11-03 02:27:39 -05:00
booster [ 3 : - 1 ]
# negative step is not supported.
2022-03-29 02:32:42 +08:00
with pytest . raises ( ValueError , match = r " .*>= 1.* " ) :
2020-11-03 02:27:39 -05:00
booster [ 0 : 2 : - 1 ]
# step can not be 0.
2022-03-29 02:32:42 +08:00
with pytest . raises ( ValueError , match = r " .*>= 1.* " ) :
2020-11-03 02:27:39 -05:00
booster [ 0 : 2 : 0 ]
trees = [ _ for _ in booster ]
assert len ( trees ) == num_boost_round
with pytest . raises ( TypeError ) :
booster [ " wrong type " ]
with pytest . raises ( IndexError ) :
2022-03-29 02:32:42 +08:00
booster [ : num_boost_round + 1 ]
2020-11-03 02:27:39 -05:00
with pytest . raises ( ValueError ) :
2022-03-29 02:32:42 +08:00
booster [ 1 , 2 ] # too many dims
2020-11-03 02:27:39 -05:00
# setitem is not implemented as model is immutable during slicing.
with pytest . raises ( TypeError ) :
2022-03-29 02:32:42 +08:00
booster [ . . . : end ] = booster
2020-11-03 02:27:39 -05:00
sliced_0 = booster [ 1 : 3 ]
2021-02-08 18:26:32 +08:00
np . testing . assert_allclose (
booster . predict ( dtrain , iteration_range = ( 1 , 3 ) ) , sliced_0 . predict ( dtrain )
)
2020-11-03 02:27:39 -05:00
sliced_1 = booster [ 3 : 7 ]
2021-02-08 18:26:32 +08:00
np . testing . assert_allclose (
booster . predict ( dtrain , iteration_range = ( 3 , 7 ) ) , sliced_1 . predict ( dtrain )
)
2020-11-03 02:27:39 -05:00
predt_0 = sliced_0 . predict ( dtrain , output_margin = True )
predt_1 = sliced_1 . predict ( dtrain , output_margin = True )
merged = predt_0 + predt_1 - 0.5 # base score.
single = booster [ 1 : 7 ] . predict ( dtrain , output_margin = True )
np . testing . assert_allclose ( merged , single , atol = 1e-6 )
sliced_0 = booster [ 1 : 7 : 2 ] # 1,3,5
sliced_1 = booster [ 2 : 8 : 2 ] # 2,4,6
predt_0 = sliced_0 . predict ( dtrain , output_margin = True )
predt_1 = sliced_1 . predict ( dtrain , output_margin = True )
merged = predt_0 + predt_1 - 0.5
single = booster [ 1 : 7 ] . predict ( dtrain , output_margin = True )
np . testing . assert_allclose ( merged , single , atol = 1e-6 )
2021-02-25 18:54:16 +08:00
2022-03-29 02:32:42 +08:00
@pytest.mark.skipif ( * * tm . no_sklearn ( ) )
@pytest.mark.parametrize ( " booster " , [ " gbtree " , " dart " ] )
def test_slice ( self , booster ) :
from sklearn . datasets import make_classification
num_classes = 3
X , y = make_classification (
n_samples = 1000 , n_informative = 5 , n_classes = num_classes
)
dtrain = xgb . DMatrix ( data = X , label = y )
num_parallel_tree = 4
num_boost_round = 16
total_trees = num_parallel_tree * num_classes * num_boost_round
booster = xgb . train (
{
" num_parallel_tree " : num_parallel_tree ,
" subsample " : 0.5 ,
" num_class " : num_classes ,
" booster " : booster ,
" objective " : " multi:softprob " ,
} ,
num_boost_round = num_boost_round ,
dtrain = dtrain ,
)
booster . feature_types = [ " q " ] * X . shape [ 1 ]
assert len ( booster . get_dump ( ) ) == total_trees
self . run_slice ( booster , dtrain , num_parallel_tree , num_classes , num_boost_round )
bytesarray = booster . save_raw ( raw_format = " ubj " )
booster = xgb . Booster ( model_file = bytesarray )
self . run_slice ( booster , dtrain , num_parallel_tree , num_classes , num_boost_round )
bytesarray = booster . save_raw ( raw_format = " deprecated " )
booster = xgb . Booster ( model_file = bytesarray )
self . run_slice ( booster , dtrain , num_parallel_tree , num_classes , num_boost_round )
2023-03-27 23:10:54 +08:00
def test_slice_multi ( self ) - > None :
from sklearn . datasets import make_classification
num_classes = 3
X , y = make_classification (
n_samples = 1000 , n_informative = 5 , n_classes = num_classes
)
Xy = xgb . DMatrix ( data = X , label = y )
num_parallel_tree = 4
num_boost_round = 16
class ResetStrategy ( xgb . callback . TrainingCallback ) :
def after_iteration ( self , model , epoch : int , evals_log ) - > bool :
model . set_param ( { " multi_strategy " : " multi_output_tree " } )
return False
booster = xgb . train (
{
" num_parallel_tree " : num_parallel_tree ,
" num_class " : num_classes ,
" booster " : " gbtree " ,
" objective " : " multi:softprob " ,
" multi_strategy " : " multi_output_tree " ,
" tree_method " : " hist " ,
" base_score " : 0 ,
} ,
num_boost_round = num_boost_round ,
dtrain = Xy ,
callbacks = [ ResetStrategy ( ) ]
)
sliced = [ t for t in booster ]
assert len ( sliced ) == 16
predt0 = booster . predict ( Xy , output_margin = True )
predt1 = np . zeros ( predt0 . shape )
for t in booster :
predt1 + = t . predict ( Xy , output_margin = True )
np . testing . assert_allclose ( predt0 , predt1 , atol = 1e-5 )
2021-02-25 18:54:16 +08:00
@pytest.mark.skipif ( * * tm . no_pandas ( ) )
def test_feature_info ( self ) :
import pandas as pd
rows = 100
cols = 10
X = rng . randn ( rows , cols )
y = rng . randn ( rows )
feature_names = [ " test_feature_ " + str ( i ) for i in range ( cols ) ]
X_pd = pd . DataFrame ( X , columns = feature_names )
2022-04-23 02:07:01 +08:00
X_pd . iloc [ : , 3 ] = X_pd . iloc [ : , 3 ] . astype ( np . int32 )
2021-02-25 18:54:16 +08:00
Xy = xgb . DMatrix ( X_pd , y )
assert Xy . feature_types [ 3 ] == " int "
booster = xgb . train ( { } , dtrain = Xy , num_boost_round = 1 )
assert booster . feature_names == Xy . feature_names
assert booster . feature_names == feature_names
assert booster . feature_types == Xy . feature_types
with tempfile . TemporaryDirectory ( ) as tmpdir :
path = tmpdir + " model.json "
booster . save_model ( path )
booster = xgb . Booster ( )
booster . load_model ( path )
assert booster . feature_names == Xy . feature_names
assert booster . feature_types == Xy . feature_types