2016-11-04 22:25:25 +05:30
import json
2022-10-26 16:56:11 +08:00
from pathlib import Path
import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
2026-02-18 15:25:59 +01:00
from xgboost . _c_api import _parse_version
2015-09-16 21:53:51 +09:00
2024-01-05 17:53:36 +08:00
# Relative path to the demo datasets bundled with the XGBoost repository.
dpath = "demo/data/"

# Module-wide RNG with a fixed seed so test data is reproducible across runs.
rng = np.random.RandomState(1994)
2015-07-03 21:27:29 -07:00
2015-09-16 21:53:51 +09:00
2020-11-19 17:00:15 -08:00
class TestBasic:
    """Basic training / prediction / serialization tests for the Python API."""

    def test_compat(self):
        """`lazy_isinstance` matches on module and class name without importing."""
        from xgboost.compat import lazy_isinstance

        a = np.array([1, 2, 3])
        assert lazy_isinstance(a, "numpy", "ndarray")
        assert not lazy_isinstance(a, "numpy", "dataframe")

    def test_basic(self, tmp_path: Path) -> None:
        """Train a small model, check error, then round-trip model and DMatrix."""
        dtrain, dtest = tm.load_agaricus(__file__)
        param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
        # specify validations set to watch performance
        watchlist = [(dtrain, "train")]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, evals=watchlist, verbose_eval=True)

        preds = bst.predict(dtrain)
        labels = dtrain.get_label()
        err = sum(
            1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
        ) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(
            1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
        ) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        dtest_path = tmp_path / "dtest.dmatrix"
        # save dmatrix into binary buffer
        dtest.save_binary(dtest_path)
        # save model
        model_path = tmp_path / "model.ubj"
        bst.save_model(model_path)
        # load model and data in
        bst2 = xgb.Booster(model_file=model_path)
        dtest2 = xgb.DMatrix(dtest_path)
        preds2 = bst2.predict(dtest2)
        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0

    def test_metric_config(self, tmp_path: Path) -> None:
        # Make sure that the metric configuration happens in booster so the string
        # `['error', 'auc']` doesn't get passed down to core.
        dtrain, dtest = tm.load_agaricus(__file__)
        param = {
            "max_depth": 2,
            "eta": 1,
            "objective": "binary:logistic",
            "eval_metric": ["error", "auc"],
        }
        watchlist = [(dtest, "eval"), (dtrain, "train")]
        num_round = 2
        booster = xgb.train(param, dtrain, num_round, evals=watchlist)
        predt_0 = booster.predict(dtrain)

        path = tmp_path / "model.json"
        booster.save_model(path)

        # Reload with the same params; predictions must be unchanged.
        booster = xgb.Booster(params=param, model_file=path)
        predt_1 = booster.predict(dtrain)
        np.testing.assert_allclose(predt_0, predt_1)

    def test_multiclass(self, tmp_path: Path) -> None:
        """Multi-class training followed by model/DMatrix round-trip."""
        dtrain, dtest = tm.load_agaricus(__file__)
        param = {"max_depth": 2, "eta": 1, "num_class": 2}
        # specify validations set to watch performance
        watchlist = [(dtest, "eval"), (dtrain, "train")]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, evals=watchlist)

        # this is prediction
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds)) if preds[i] != labels[i]) / float(
            len(preds)
        )
        # error must be smaller than 10%
        assert err < 0.1

        dtest_path = tmp_path / "dtest.buffer"
        model_path = tmp_path / "model.ubj"
        # save dmatrix into binary buffer
        dtest.save_binary(dtest_path)
        # save model
        bst.save_model(model_path)
        # load model and data in
        bst2 = xgb.Booster(model_file=model_path)
        dtest2 = xgb.DMatrix(dtest_path)
        preds2 = bst2.predict(dtest2)
        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0

    def test_dump(self):
        """Text and JSON dumps of a depth-1 tree have the expected shape."""
        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ["Feature1", "Feature2"]

        dm = xgb.DMatrix(data, label=target, feature_names=features)
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "eta": 0.3,
            "max_depth": 1,
        }

        bst = xgb.train(params, dm, num_boost_round=1)

        # number of feature importances should == number of features
        dump1 = bst.get_dump()
        assert len(dump1) == 1, "Expected only 1 tree to be dumped."
        assert len(dump1[0].splitlines()) == 3, (
            "Expected 1 root and 2 leaves - 3 lines in dump."
        )

        dump2 = bst.get_dump(with_stats=True)
        assert dump2[0].count("\n") == 3, (
            "Expected 1 root and 2 leaves - 3 lines in dump."
        )
        msg = "Expected more info when with_stats=True is given."
        assert dump2[0].find("\n") > dump1[0].find("\n"), msg

        dump3 = bst.get_dump(dump_format="json")
        dump3j = json.loads(dump3[0])
        assert dump3j["nodeid"] == 0, "Expected the root node on top."

        dump4 = bst.get_dump(dump_format="json", with_stats=True)
        dump4j = json.loads(dump4[0])
        assert "gain" in dump4j, "Expected 'gain' to be dumped in JSON."

        # Non-existent fmap file must be rejected.
        with pytest.raises(ValueError):
            bst.get_dump(fmap="foo")

    def test_feature_score(self):
        """Feature name validation and fscore output types."""
        rng = np.random.RandomState(0)
        data = rng.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ["F0"]
        # Mismatched number of feature names is rejected at DMatrix creation.
        with pytest.raises(ValueError):
            xgb.DMatrix(data, label=target, feature_names=features)

        params = {"objective": "binary:logistic"}
        dm = xgb.DMatrix(data, label=target, feature_names=["F0", "F1"])
        booster = xgb.train(params, dm, num_boost_round=1)
        # no error since feature names might be assigned before the booster seeing data
        # and booster doesn't known about the actual number of features.
        booster.feature_names = ["F0"]
        with pytest.raises(ValueError):
            booster.get_fscore()

        booster.feature_names = None
        # Use JSON to make sure the output has native Python type
        scores = json.loads(json.dumps(booster.get_fscore()))
        np.testing.assert_allclose(scores["f0"], 6.0)

    def test_load_file_invalid(self):
        """Loading from a non-existent path raises XGBoostError."""
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file="incorrect_path")

        # Non-ASCII path that also does not exist.
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file="不正なパス")

    @pytest.mark.parametrize(
        "path", ["모델.ubj", "がうる・ぐら.json"], ids=["path-0", "path-1"]
    )
    def test_unicode_path(self, tmp_path: Path, path: str) -> None:
        """Models save/load correctly through non-ASCII file paths."""
        model_path = tmp_path / path
        dtrain, _ = tm.load_agaricus(__file__)
        param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
        bst = xgb.train(param, dtrain, num_boost_round=2)
        bst.save_model(model_path)
        bst2 = xgb.Booster(model_file=model_path)
        assert bst.get_dump(dump_format="text") == bst2.get_dump(dump_format="text")

    def test_dmatrix_numpy_init_omp(self):
        """DMatrix construction from numpy is correct regardless of nthread."""
        rows = [1000, 11326, 15000]
        cols = 50
        for row in rows:
            X = np.random.randn(row, cols)
            y = np.random.randn(row).astype("f")
            dm = xgb.DMatrix(X, y, nthread=0)
            np.testing.assert_array_equal(dm.get_label(), y)
            assert dm.num_row() == row
            assert dm.num_col() == cols

            dm = xgb.DMatrix(X, y, nthread=10)
            np.testing.assert_array_equal(dm.get_label(), y)
            assert dm.num_row() == row
            assert dm.num_col() == cols

    def test_cv(self):
        """cv with as_pandas=False returns a dict with 4 metric series."""
        dm, _ = tm.load_agaricus(__file__)
        params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}

        # return np.ndarray
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_cv_no_shuffle(self):
        """cv works with shuffle disabled."""
        dm, _ = tm.load_agaricus(__file__)
        params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}

        # return np.ndarray
        cv = xgb.cv(
            params, dm, num_boost_round=10, shuffle=False, nfold=10, as_pandas=False
        )
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_cv_explicit_fold_indices(self):
        """cv accepts user-supplied (train, test) index pairs."""
        dm, _ = tm.load_agaricus(__file__)
        params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
        folds = [
            # Train    Test
            ([1, 3], [5, 8]),
            ([7, 9], [23, 43]),
        ]

        # return np.ndarray
        cv = xgb.cv(params, dm, num_boost_round=10, folds=folds, as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4

    def test_cv_explicit_fold_indices_labels(self):
        """Explicit folds expose exactly the requested test labels per fold."""
        params = {"max_depth": 2, "eta": 1, "objective": "reg:squarederror"}
        N = 100
        F = 3
        dm = xgb.DMatrix(data=np.random.randn(N, F), label=np.arange(N))
        folds = [
            # Train    Test
            ([1, 3], [5, 8]),
            ([7, 9], [23, 43, 11]),
        ]

        # Use callback to log the test labels in each fold
        class Callback(xgb.callback.TrainingCallback):
            def __init__(self) -> None:
                super().__init__()

            def after_iteration(
                self,
                model,
                epoch: int,
                evals_log: xgb.callback.TrainingCallback.EvalsLog,
            ):
                print([fold.dtest.get_label() for fold in model.cvfolds])

        cb = Callback()

        # Run cross validation and capture standard out to test callback result
        with tm.captured_output() as (out, err):
            xgb.cv(
                params,
                dm,
                num_boost_round=1,
                folds=folds,
                callbacks=[cb],
                as_pandas=False,
            )
            output = out.getvalue().strip()

        solution = (
            "[array([5., 8.], dtype=float32), array([23., 43., 11.], "
            + "dtype=float32)]"
        )
        assert output == solution
2018-07-05 20:06:59 -07:00
2019-08-15 04:46:25 -04:00
2020-11-19 17:00:15 -08:00
class TestBasicPathLike:
    """Unit tests using pathlib.Path for file interaction."""

    def test_DMatrix_init_from_path(self):
        """Initialization from the data path."""
        dtrain, _ = tm.load_agaricus(__file__)
        assert dtrain.num_row() == 6513
        assert dtrain.num_col() == 127

    def test_DMatrix_save_to_path(self, tmp_path: Path) -> None:
        """Saving to a binary file using pathlib from a DMatrix."""
        data = np.random.randn(100, 2)
        target = np.array([0, 1] * 50)
        features = ["Feature1", "Feature2"]

        dm = xgb.DMatrix(data, label=target, feature_names=features)

        binary_path = tmp_path / "dtrain.bin"
        dm.save_binary(binary_path)
        assert binary_path.exists()

    def test_Booster_init_invalid_path(self):
        """An invalid model_file path should raise XGBoostError."""
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file=Path("invalidpath"))
2024-08-06 02:35:32 +08:00
def test_parse_ver() -> None:
    """`_parse_version` splits a version string into numbers and post-release tag."""
    (major, minor, patch), post = _parse_version("2.1.0")
    assert post == ""
    (major, minor, patch), post = _parse_version("2.1.0-dev")
    assert post == "dev"
    (major, minor, patch), post = _parse_version("2.1.0rc1")
    assert post == "rc1"
    (major, minor, patch), post = _parse_version("2.1.0.post1")
    assert post == "post1"