@@ -22,6 +22,10 @@ prescreen.parameters <- function(params) {
22
22
23
23
prescreen.objective <- function (objective ) {
24
24
if (! is.null(objective )) {
25
+ if (! is.character(objective ) || length(objective ) != 1L || is.na(objective )) {
26
+ stop(" 'objective' must be a single character/string variable." )
27
+ }
28
+
25
29
if (objective %in% .OBJECTIVES_NON_DEFAULT_MODE()) {
26
30
stop(
27
31
" Objectives with non-default prediction mode (" ,
@@ -30,8 +34,8 @@ prescreen.objective <- function(objective) {
30
34
)
31
35
}
32
36
33
- if (! is.character( objective ) || length( objective ) != 1L || is.na( objective )) {
34
- stop(" 'objective' must be a single character/string variable ." )
37
+ if (objective %in% .RANKING_OBJECTIVES( )) {
38
+ stop(" Ranking objectives are not supported in 'xgboost()'. Try 'xgb.train()' ." )
35
39
}
36
40
}
37
41
}
@@ -501,7 +505,7 @@ check.nthreads <- function(nthreads) {
501
505
return (as.integer(nthreads ))
502
506
}
503
507
504
- check.can.use.qdm <- function (x , params ) {
508
+ check.can.use.qdm <- function (x , params , eval_set ) {
505
509
if (" booster" %in% names(params )) {
506
510
if (params $ booster == " gblinear" ) {
507
511
return (FALSE )
@@ -512,6 +516,9 @@ check.can.use.qdm <- function(x, params) {
512
516
return (FALSE )
513
517
}
514
518
}
519
+ if (NROW(eval_set )) {
520
+ return (FALSE )
521
+ }
515
522
return (TRUE )
516
523
}
517
524
@@ -717,6 +724,129 @@ process.x.and.col.args <- function(
717
724
return (lst_args )
718
725
}
719
726
727
# Splits the input data into a training part and an evaluation part.
#
# 'eval_set' can be either:
#   - A single number in (0, 1): that fraction of rows is sampled at random
#     for evaluation (seeded from 'params$seed' when supplied).
#   - A vector of 1-based row indices designating the evaluation rows.
#
# Reads from 'lst_args': dmatrix_args$data (row count), dmatrix_args$label
# (processed 'y'), metadata$y_levels (present only for classification
# objectives), and params$seed.
#
# Returns NULL when no evaluation set should be created, otherwise a list
# with disjoint integer index vectors 'idx_train' and 'idx_eval'.
process.eval.set <- function(eval_set, lst_args) {
  if (!NROW(eval_set)) {
    return(NULL)
  }
  nrows <- nrow(lst_args$dmatrix_args$data)
  is_classif <- hasName(lst_args$metadata, "y_levels")
  processed_y <- lst_args$dmatrix_args$label
  eval_set <- as.vector(eval_set)
  if (length(eval_set) == 1L) {

    # A single number is interpreted as a random fraction of the rows.
    eval_set <- as.numeric(eval_set)
    if (is.na(eval_set) || eval_set < 0 || eval_set >= 1) {
      stop("'eval_set' as a fraction must be a number between zero and one (non-inclusive).")
    }
    if (eval_set == 0) {
      return(NULL)
    }
    nrow_eval <- as.integer(round(nrows * eval_set, 0))
    if (nrow_eval < 1) {
      warning(
        "Desired 'eval_set' fraction amounts to zero observations.",
        " Will not create evaluation set."
      )
      return(NULL)
    }
    nrow_train <- nrows - nrow_eval
    if (nrow_train < 2L) {
      stop("Desired 'eval_set' fraction would leave less than 2 observations for training data.")
    }
    if (is_classif && nrow_train < length(lst_args$metadata$y_levels)) {
      stop("Desired 'eval_set' fraction would not leave enough samples for each class of 'y'.")
    }

    seed <- lst_args$params$seed
    if (!is.null(seed)) {
      set.seed(seed)  # NOTE: modifies the global RNG state
    }

    idx_shuffled <- sample(nrows, nrows, replace = FALSE)
    idx_eval <- idx_shuffled[seq(1L, nrow_eval)]
    idx_train <- idx_shuffled[seq(nrow_eval + 1L, nrows)]
    # Here we want the training set to include all of the classes of 'y' for classification
    # objectives. If that condition doesn't hold with the random sample, then it forcibly
    # makes a new random selection in such a way that the condition would always hold, by
    # first sampling one random example of 'y' for training and then choosing the evaluation
    # set from the remaining rows. The procedure here is quite inefficient, but there aren't
    # enough random-related functions in base R to be able to construct an efficient version.
    if (is_classif && length(unique(processed_y[idx_train])) < length(lst_args$metadata$y_levels)) {
      # These are defined in order to avoid NOTEs from CRAN checks
      # when using non-standard data.table evaluation with column names.
      idx <- NULL
      y <- NULL
      ranked_idx <- NULL
      chosen <- NULL

      # Per class: rank its rows and pick one at random ('chosen') that is
      # guaranteed a spot in the training data.
      dt <- data.table::data.table(y = processed_y, idx = seq(1L, nrows))[
        , .(
          ranked_idx = seq(1L, .N),
          chosen = rep(sample(.N, 1L), .N),
          idx
        )
        , by = y
      ]
      min_idx_train <- dt[ranked_idx == chosen, idx]
      rem_idx <- dt[ranked_idx != chosen, idx]
      if (length(rem_idx) == nrow_eval) {
        idx_train <- min_idx_train
        idx_eval <- rem_idx
      } else {
        rem_idx <- rem_idx[sample(length(rem_idx), length(rem_idx), replace = FALSE)]
        idx_eval <- rem_idx[seq(1L, nrow_eval)]
        idx_train <- c(min_idx_train, rem_idx[seq(nrow_eval + 1L, length(rem_idx))])
      }
    }

  } else {

    # A vector is interpreted as explicit 1-based row indices.
    # Reject NAs up-front: otherwise the floor() comparison below would
    # propagate NA into 'if ()' and fail with an unhelpful error message
    # instead of the intended one.
    if (anyNA(eval_set)) {
      stop("'eval_set' contains invalid indices.")
    }
    if (any(eval_set != floor(eval_set))) {
      stop("'eval_set' as indices must contain only integers.")
    }
    eval_set <- as.integer(eval_set)
    idx_min <- min(eval_set)
    if (is.na(idx_min) || idx_min < 1L) {
      stop("'eval_set' contains invalid indices.")
    }
    idx_max <- max(eval_set)
    if (is.na(idx_max) || idx_max > nrows) {
      stop("'eval_set' contains row indices beyond the size of the input data.")
    }
    idx_train <- seq(1L, nrows)[-eval_set]
    if (is_classif && length(unique(processed_y[idx_train])) < length(lst_args$metadata$y_levels)) {
      warning("'eval_set' indices will leave some classes of 'y' outside of the training data.")
    }
    idx_eval <- eval_set

  }

  # Note: slicing is done in the constructed DMatrix object instead of in the
  # original input, because objects from 'Matrix' might change class after
  # being sliced (e.g. 'dgRMatrix' turns into 'dgCMatrix').
  return(list(idx_train = idx_train, idx_eval = idx_eval))
}
829
+
830
# Validates the 'early_stopping_rounds' argument and coerces it to a scalar
# integer. Early stopping is measured on the 'eval_set' metrics, so it is an
# error to request it without also passing 'eval_set'.
#
# Returns NULL when early stopping is not requested, otherwise the number of
# rounds as a positive integer. Throws an error for any other input.
check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) {
  # Not requested: nothing to validate.
  if (is.null(early_stopping_rounds)) {
    return(NULL)
  }
  if (is.null(eval_set)) {
    stop("'early_stopping_rounds' requires passing 'eval_set'.")
  }
  if (NROW(early_stopping_rounds) != 1L) {
    stop("'early_stopping_rounds' must be NULL or an integer greater than zero.")
  }
  rounds <- as.integer(early_stopping_rounds)
  # NA here covers non-numeric input that failed integer coercion.
  if (is.na(rounds) || rounds <= 0L) {
    stop(
      "'early_stopping_rounds' must be NULL or an integer greater than zero. Got: ",
      rounds
    )
  }
  return(rounds)
}
849
+
720
850
# ' Fit XGBoost Model
721
851
# '
722
852
# ' @export
@@ -808,6 +938,35 @@ process.x.and.col.args <- function(
808
938
# ' 2 (info), and 3 (debug).
809
939
# ' @param monitor_training Whether to monitor objective optimization progress on the input data.
810
940
# ' Note that same 'x' and 'y' data are used for both model fitting and evaluation.
941
+ # ' @param eval_set Subset of the data to use as evaluation set. Can be passed as:
942
+ # ' - A vector of row indices (base-1 numeration) indicating the observations that are to be designed
943
+ # ' as evaluation data.
944
+ # ' - A number between zero and one indicating a random fraction of the input data to use as
945
+ # ' evaluation data. Note that the selection will be done uniformly at random, regardless of
946
+ # ' argument `weights`.
947
+ # '
948
+ # ' If passed, this subset of the data will be excluded from the training procedure, and the
949
+ # ' evaluation metric(s) supplied under `eval_metric` will be calculated on this dataset after each
950
+ # ' boosting iteration (pass `verbosity>0` to have these metrics printed during training). If
951
+ # ' `eval_metric` is not passed, a default metric will be selected according to `objective`.
952
+ # '
953
+ # ' If passing a fraction, in classification problems, the evaluation set will be chosen in such a
954
+ # ' way that at least one observation of each class will be kept in the training data.
955
+ # '
956
+ # ' For more elaborate evaluation variants (e.g. custom metrics, multiple evaluation sets, etc.),
957
+ # ' one might want to use [xgb.train()] instead.
958
+ # ' @param early_stopping_rounds Number of boosting rounds after which training will be stopped
959
+ # ' if there is no improvement in performance (as measured by the last metric passed under
960
+ # ' `eval_metric`, or by the default metric for the objective if `eval_metric` is not passed) on the
961
+ # ' evaluation data from `eval_set`. Must pass `eval_set` in order to use this functionality.
962
+ # '
963
+ # ' If `NULL`, early stopping will not be used.
964
+ # ' @param print_every_n When passing `verbosity>0` and either `monitor_training=TRUE` or `eval_set`,
965
+ # ' evaluation logs (metrics calculated on the training and/or evaluation data) will be printed every
966
+ # ' nth iteration according to the value passed here. The first and last iteration are always
967
+ # ' included regardless of this 'n'.
968
+ # '
969
+ # ' Only has an effect when passing `verbosity>0`.
811
970
# ' @param nthreads Number of parallel threads to use. If passing zero, will use all CPU threads.
812
971
# ' @param seed Seed to use for random number generation. If passing `NULL`, will draw a random
813
972
# ' number using R's PRNG system to use as seed.
@@ -893,8 +1052,11 @@ xgboost <- function(
893
1052
objective = NULL ,
894
1053
nrounds = 100L ,
895
1054
weights = NULL ,
896
- verbosity = 0L ,
1055
+ verbosity = if (is.null( eval_set )) 0L else 1L ,
897
1056
monitor_training = verbosity > 0 ,
1057
+ eval_set = NULL ,
1058
+ early_stopping_rounds = NULL ,
1059
+ print_every_n = 1L ,
898
1060
nthreads = parallel :: detectCores(),
899
1061
seed = 0L ,
900
1062
monotone_constraints = NULL ,
@@ -907,7 +1069,7 @@ xgboost <- function(
907
1069
params <- list (... )
908
1070
params <- prescreen.parameters(params )
909
1071
prescreen.objective(objective )
910
- use_qdm <- check.can.use.qdm(x , params )
1072
+ use_qdm <- check.can.use.qdm(x , params , eval_set )
911
1073
lst_args <- process.y.margin.and.objective(y , base_margin , objective , params )
912
1074
lst_args <- process.row.weights(weights , lst_args )
913
1075
lst_args <- process.x.and.col.args(
@@ -918,8 +1080,9 @@ xgboost <- function(
918
1080
lst_args ,
919
1081
use_qdm
920
1082
)
1083
+ eval_set <- process.eval.set(eval_set , lst_args )
921
1084
922
- if (use_qdm && " max_bin" %in% names( params )) {
1085
+ if (use_qdm && hasName( params , " max_bin" )) {
923
1086
lst_args $ dmatrix_args $ max_bin <- params $ max_bin
924
1087
}
925
1088
@@ -929,18 +1092,27 @@ xgboost <- function(
929
1092
lst_args $ params $ seed <- seed
930
1093
931
1094
params <- c(lst_args $ params , params )
1095
+ params $ verbosity <- verbosity
932
1096
933
1097
fn_dm <- if (use_qdm ) xgb.QuantileDMatrix else xgb.DMatrix
934
1098
dm <- do.call(fn_dm , lst_args $ dmatrix_args )
1099
+ if (! is.null(eval_set )) {
1100
+ dm_eval <- xgb.slice.DMatrix(dm , eval_set $ idx_eval )
1101
+ dm <- xgb.slice.DMatrix(dm , eval_set $ idx_train )
1102
+ }
935
1103
evals <- list ()
936
1104
if (monitor_training ) {
937
1105
evals <- list (train = dm )
938
1106
}
1107
+ if (! is.null(eval_set )) {
1108
+ evals <- c(evals , list (eval = dm_eval ))
1109
+ }
939
1110
model <- xgb.train(
940
1111
params = params ,
941
1112
data = dm ,
942
1113
nrounds = nrounds ,
943
1114
verbose = verbosity ,
1115
+ print_every_n = print_every_n ,
944
1116
evals = evals
945
1117
)
946
1118
attributes(model )$ metadata <- lst_args $ metadata
0 commit comments