diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index f0a0d29cd309..4cfcd7239dd0 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -367,6 +367,7 @@ def _is_1d_collection(data: Any) -> bool:
 
 
 def _list_to_1d_numpy(
+    *,
     data: Any,
     dtype: "np.typing.DTypeLike",
     name: str,
@@ -1840,7 +1841,7 @@ def __del__(self) -> None:
         except AttributeError:
             pass
 
-    def _create_sample_indices(self, total_nrow: int) -> np.ndarray:
+    def _create_sample_indices(self, *, total_nrow: int) -> np.ndarray:
         """Get an array of randomly chosen indices from this ``Dataset``.
 
         Indices are sampled without replacement.
@@ -2167,26 +2168,26 @@ def _lazy_init(
                 )
             )
         elif isinstance(data, scipy.sparse.csr_matrix):
-            self.__init_from_csr(data, params_str, ref_dataset)
+            self.__init_from_csr(csr=data, params_str=params_str, ref_dataset=ref_dataset)
         elif isinstance(data, scipy.sparse.csc_matrix):
-            self.__init_from_csc(data, params_str, ref_dataset)
+            self.__init_from_csc(csc=data, params_str=params_str, ref_dataset=ref_dataset)
         elif isinstance(data, np.ndarray):
-            self.__init_from_np2d(data, params_str, ref_dataset)
+            self.__init_from_np2d(mat=data, params_str=params_str, ref_dataset=ref_dataset)
         elif _is_pyarrow_table(data):
-            self.__init_from_pyarrow_table(data, params_str, ref_dataset)
+            self.__init_from_pyarrow_table(table=data, params_str=params_str, ref_dataset=ref_dataset)
         elif isinstance(data, list) and len(data) > 0:
             if _is_list_of_numpy_arrays(data):
-                self.__init_from_list_np2d(data, params_str, ref_dataset)
+                self.__init_from_list_np2d(mats=data, params_str=params_str, ref_dataset=ref_dataset)
             elif _is_list_of_sequences(data):
-                self.__init_from_seqs(data, ref_dataset)
+                self.__init_from_seqs(seqs=data, ref_dataset=ref_dataset)
             else:
                 raise TypeError("Data list can only be of ndarray or Sequence")
         elif isinstance(data, Sequence):
-            self.__init_from_seqs([data], ref_dataset)
+            self.__init_from_seqs(seqs=[data], ref_dataset=ref_dataset)
         else:
             try:
                 csr = scipy.sparse.csr_matrix(data)
-                self.__init_from_csr(csr, params_str, ref_dataset)
+                self.__init_from_csr(csr=csr, params_str=params_str, ref_dataset=ref_dataset)
             except BaseException as err:
                 raise TypeError(f"Cannot initialize Dataset from {type(data).__name__}") from err
         if label is not None:
@@ -2225,7 +2226,7 @@ def _yield_row_from_seqlist(seqs: List[Sequence], indices: Iterable[int]) -> Ite
             row = seq[id_in_seq]
             yield row if row.flags["OWNDATA"] else row.copy()
 
-    def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+    def __sample(self, *, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarray], List[np.ndarray]]:
         """Sample data from seqs.
 
         Mimics behavior in c_api.cpp:LGBM_DatasetCreateFromMats()
@@ -2234,7 +2235,7 @@ def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr
         -------
         sampled_rows, sampled_row_indices
         """
-        indices = self._create_sample_indices(total_nrow)
+        indices = self._create_sample_indices(total_nrow=total_nrow)
 
         # Select sampled rows, transpose to column order.
         sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices)))
@@ -2255,6 +2256,7 @@ def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr
 
     def __init_from_seqs(
         self,
+        *,
         seqs: List[Sequence],
         ref_dataset: Optional[_DatasetHandle],
     ) -> "Dataset":
@@ -2275,7 +2277,7 @@ def __init_from_seqs(
         param_str = _param_dict_to_str(self.get_params())
         sample_cnt = _get_sample_count(total_nrow, param_str)
 
-        sample_data, col_indices = self.__sample(seqs, total_nrow)
+        sample_data, col_indices = self.__sample(seqs=seqs, total_nrow=total_nrow)
         self._init_from_sample(sample_data, col_indices, sample_cnt, total_nrow)
 
         for seq in seqs:
@@ -2288,6 +2290,7 @@ def __init_from_seqs(
 
     def __init_from_np2d(
         self,
+        *,
         mat: np.ndarray,
         params_str: str,
         ref_dataset: Optional[_DatasetHandle],
@@ -2315,6 +2318,7 @@ def __init_from_np2d(
 
     def __init_from_list_np2d(
         self,
+        *,
         mats: List[np.ndarray],
         params_str: str,
         ref_dataset: Optional[_DatasetHandle],
@@ -2369,6 +2373,7 @@ def __init_from_list_np2d(
 
     def __init_from_csr(
         self,
+        *,
         csr: scipy.sparse.csr_matrix,
         params_str: str,
         ref_dataset: Optional[_DatasetHandle],
@@ -2403,6 +2408,7 @@ def __init_from_csr(
 
     def __init_from_csc(
         self,
+        *,
         csc: scipy.sparse.csc_matrix,
         params_str: str,
         ref_dataset: Optional[_DatasetHandle],
@@ -2437,6 +2443,7 @@ def __init_from_csc(
 
     def __init_from_pyarrow_table(
         self,
+        *,
         table: pa_Table,
         params_str: str,
         ref_dataset: Optional[_DatasetHandle],
@@ -2466,6 +2473,7 @@ def __init_from_pyarrow_table(
 
     @staticmethod
     def _compare_params_for_warning(
+        *,
         params: Dict[str, Any],
         other_params: Dict[str, Any],
         ignore_keys: Set[str],
@@ -2535,7 +2543,11 @@ def construct(self) -> "Dataset":
                     )
                 else:
                     # construct subset
-                    used_indices = _list_to_1d_numpy(self.used_indices, dtype=np.int32, name="used_indices")
+                    used_indices = _list_to_1d_numpy(
+                        data=self.used_indices,
+                        dtype=np.int32,
+                        name="used_indices",
+                    )
                     assert used_indices.flags.c_contiguous
                     if self.reference.group is not None:
                         group_info = np.array(self.reference.group).astype(np.int32, copy=False)
@@ -2803,9 +2815,9 @@ def set_field(
         if field_name == "init_score":
             dtype = np.float64
             if _is_1d_collection(data):
-                data = _list_to_1d_numpy(data, dtype=dtype, name=field_name)
+                data = _list_to_1d_numpy(data=data, dtype=dtype, name=field_name)
             elif _is_2d_collection(data):
-                data = _data_to_2d_numpy(data, dtype=dtype, name=field_name)
+                data = _data_to_2d_numpy(data=data, dtype=dtype, name=field_name)
                 data = data.ravel(order="F")
             else:
                 raise TypeError(
@@ -2817,7 +2829,7 @@ def set_field(
                 dtype = np.int32
             else:
                 dtype = np.float32
-            data = _list_to_1d_numpy(data, dtype=dtype, name=field_name)
+            data = _list_to_1d_numpy(data=data, dtype=dtype, name=field_name)
 
         ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr]
         if data.dtype == np.float32 or data.dtype == np.float64:
@@ -3058,7 +3070,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset":
             elif _is_pyarrow_array(label):
                 label_array = label
             else:
-                label_array = _list_to_1d_numpy(label, dtype=np.float32, name="label")
+                label_array = _list_to_1d_numpy(data=label, dtype=np.float32, name="label")
             self.set_field("label", label_array)
             self.label = self.get_field("label")  # original values can be modified at cpp side
         return self
@@ -3091,7 +3103,7 @@ def set_weight(
         # Set field
         if self._handle is not None and weight is not None:
             if not _is_pyarrow_array(weight):
-                weight = _list_to_1d_numpy(weight, dtype=np.float32, name="weight")
+                weight = _list_to_1d_numpy(data=weight, dtype=np.float32, name="weight")
             self.set_field("weight", weight)
             self.weight = self.get_field("weight")  # original values can be modified at cpp side
         return self
@@ -3141,7 +3153,7 @@ def set_group(
         self.group = group
         if self._handle is not None and group is not None:
             if not _is_pyarrow_array(group):
-                group = _list_to_1d_numpy(group, dtype=np.int32, name="group")
+                group = _list_to_1d_numpy(data=group, dtype=np.int32, name="group")
             self.set_field("group", group)
             # original values can be modified at cpp side
             constructed_group = self.get_field("group")
@@ -3167,7 +3179,7 @@ def set_position(
         """
         self.position = position
         if self._handle is not None and position is not None:
-            position = _list_to_1d_numpy(position, dtype=np.int32, name="position")
+            position = _list_to_1d_numpy(data=position, dtype=np.int32, name="position")
             self.set_field("position", position)
         return self
 
@@ -3884,6 +3896,7 @@ def _get_node_index(
             return f"{tree_num}{node_type}{node_num}"
 
         def _get_split_feature(
+            *,
             tree: Dict[str, Any],
             feature_names: Optional[List[str]],
         ) -> Optional[str]:
@@ -3907,7 +3920,7 @@ def _is_single_node_tree(tree: Dict[str, Any]) -> bool:
             node["left_child"] = None
             node["right_child"] = None
             node["parent_index"] = parent_node
-            node["split_feature"] = _get_split_feature(tree, feature_names)
+            node["split_feature"] = _get_split_feature(tree=tree, feature_names=feature_names)
             node["split_gain"] = None
             node["threshold"] = None
             node["decision_type"] = None
@@ -4132,11 +4145,12 @@ def update(
         else:
            if not self.__set_objective_to_none:
                 self.reset_parameter({"objective": "none"}).__set_objective_to_none = True
-            grad, hess = fobj(self.__inner_predict(0), self.train_set)
-            return self.__boost(grad, hess)
+            grad, hess = fobj(self.__inner_predict(data_idx=0), self.train_set)
+            return self.__boost(grad=grad, hess=hess)
 
     def __boost(
         self,
+        *,
         grad: np.ndarray,
         hess: np.ndarray,
     ) -> bool:
@@ -4171,8 +4185,8 @@ def __boost(
         if self.__num_class > 1:
             grad = grad.ravel(order="F")
             hess = hess.ravel(order="F")
-        grad = _list_to_1d_numpy(grad, dtype=np.float32, name="gradient")
-        hess = _list_to_1d_numpy(hess, dtype=np.float32, name="hessian")
+        grad = _list_to_1d_numpy(data=grad, dtype=np.float32, name="gradient")
+        hess = _list_to_1d_numpy(data=hess, dtype=np.float32, name="hessian")
         assert grad.flags.c_contiguous
         assert hess.flags.c_contiguous
         if len(grad) != len(hess):
@@ -5178,7 +5192,7 @@ def __inner_eval(
            for eval_function in feval:
                if eval_function is None:
                    continue
-                feval_ret = eval_function(self.__inner_predict(data_idx), cur_data)
+                feval_ret = eval_function(self.__inner_predict(data_idx=data_idx), cur_data)
                if isinstance(feval_ret, list):
                    for eval_name, val, is_higher_better in feval_ret:
                        ret.append((data_name, eval_name, val, is_higher_better))
@@ -5187,7 +5201,7 @@ def __inner_eval(
                        ret.append((data_name, eval_name, val, is_higher_better))
        return ret
 
-    def __inner_predict(self, data_idx: int) -> np.ndarray:
+    def __inner_predict(self, *, data_idx: int) -> np.ndarray:
        """Predict for training and validation dataset."""
        if data_idx >= self.__num_dataset:
            raise ValueError("Data_idx should be smaller than number of dataset")
diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py
index 8018dd92efd2..d52f1680d44f 100644
--- a/python-package/lightgbm/callback.py
+++ b/python-package/lightgbm/callback.py
@@ -301,16 +301,16 @@ def _reset_storages(self) -> None:
         self.best_score: List[float] = []
         self.best_iter: List[int] = []
         self.best_score_list: List[_ListOfEvalResultTuples] = []
-        self.cmp_op: List[Callable[[float, float], bool]] = []
+        self.cmp_op: List[Callable[[float, float, float], bool]] = []
         self.first_metric = ""
 
-    def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool:
+    def _gt_delta(self, *, curr_score: float, best_score: float, delta: float) -> bool:
         return curr_score > best_score + delta
 
-    def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool:
+    def _lt_delta(self, *, curr_score: float, best_score: float, delta: float) -> bool:
         return curr_score < best_score - delta
 
-    def _is_train_set(self, dataset_name: str, env: CallbackEnv) -> bool:
+    def _is_train_set(self, *, dataset_name: str, env: CallbackEnv) -> bool:
         """Check, by name, if a given Dataset is the training data."""
         # for lgb.cv() with eval_train_metric=True, evaluation is also done on the training set
         # and those metrics are considered for early stopping
@@ -413,7 +413,9 @@ def __call__(self, env: CallbackEnv) -> None:
         first_time_updating_best_score_list = self.best_score_list == []
         for i in range(len(env.evaluation_result_list)):
             dataset_name, metric_name, metric_value, *_ = env.evaluation_result_list[i]
-            if first_time_updating_best_score_list or self.cmp_op[i](metric_value, self.best_score[i]):
+            if first_time_updating_best_score_list or self.cmp_op[i](  # type: ignore[call-arg]
+                curr_score=metric_value, best_score=self.best_score[i]
+            ):
                 self.best_score[i] = metric_value
                 self.best_iter[i] = env.iteration
                 if first_time_updating_best_score_list:
diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py
index bc533d39219a..eb8a7cbe823e 100644
--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -113,6 +113,7 @@ def _get_dask_client(client: Optional[Client]) -> Client:
 
 
 def _assign_open_ports_to_workers(
+    *,
     client: Client,
     workers: List[str],
 ) -> Tuple[Dict[str, Future], Dict[str, int]]:
@@ -165,7 +166,11 @@ def _remove_list_padding(*args: Any) -> List[List[Any]]:
     return [[z for z in arg if z is not None] for arg in args]
 
 
-def _pad_eval_names(lgbm_model: LGBMModel, required_names: List[str]) -> LGBMModel:
+def _pad_eval_names(
+    *,
+    lgbm_model: LGBMModel,
+    required_names: List[str],
+) -> LGBMModel:
     """Append missing (key, value) pairs to a LightGBM model's evals_result_ and best_score_ OrderedDict attrs based on a set of required eval_set names.
 
     Allows users to rely on expected eval_set names being present when fitting DaskLGBM estimators with ``eval_set``.
@@ -356,12 +361,12 @@ def _train_part(
 
     if n_evals:
         # ensure that expected keys for evals_result_ and best_score_ exist regardless of padding.
-        model = _pad_eval_names(model, required_names=evals_result_names)
+        model = _pad_eval_names(lgbm_model=model, required_names=evals_result_names)
 
     return model if return_model else None
 
 
-def _split_to_parts(data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
+def _split_to_parts(*, data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
     parts = data.to_delayed()
     if isinstance(parts, np.ndarray):
         if is_matrix:
@@ -372,7 +377,11 @@ def _split_to_parts(data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
     return parts
 
 
-def _machines_to_worker_map(machines: str, worker_addresses: Iterable[str]) -> Dict[str, int]:
+def _machines_to_worker_map(
+    *,
+    machines: str,
+    worker_addresses: Iterable[str],
+) -> Dict[str, int]:
     """Create a worker_map from machines list.
 
     Given ``machines`` and a list of Dask worker addresses, return a mapping where the keys are
@@ -773,7 +782,8 @@ def _train(
     else:
         _log_info("Finding random open ports for workers")
         worker_to_socket_future, worker_address_to_port = _assign_open_ports_to_workers(
-            client, list(worker_map.keys())
+            client=client,
+            workers=list(worker_map.keys()),
         )
 
         machines = ",".join(
@@ -1091,7 +1101,7 @@ def _lgb_dask_fit(
         )
 
         self.set_params(**model.get_params())  # type: ignore[attr-defined]
-        self._lgb_dask_copy_extra_params(model, self)  # type: ignore[attr-defined]
+        self._lgb_dask_copy_extra_params(source=model, dest=self)  # type: ignore[attr-defined]
 
         return self
 
@@ -1099,12 +1109,13 @@ def _lgb_dask_to_local(self, model_factory: Type[LGBMModel]) -> LGBMModel:
         params = self.get_params()  # type: ignore[attr-defined]
         params.pop("client", None)
         model = model_factory(**params)
-        self._lgb_dask_copy_extra_params(self, model)
+        self._lgb_dask_copy_extra_params(source=self, dest=model)
         model._other_params.pop("client", None)
         return model
 
     @staticmethod
     def _lgb_dask_copy_extra_params(
+        *,
         source: Union["_DaskLGBMModel", LGBMModel],
         dest: Union["_DaskLGBMModel", LGBMModel],
     ) -> None:
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 3bcb2829aad5..d617623b44e2 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -685,18 +685,18 @@ def test_list_to_1d_numpy(collection, dtype, rng):
             ValueError,
             match=r"pandas dtypes must be int, float or bool\.\nFields with bad pandas dtypes: 0: object",
         ):
-            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
+            lgb.basic._list_to_1d_numpy(data=y, dtype=np.float32, name=custom_name)
         return
     elif pd.api.types.is_string_dtype(y):
         with pytest.raises(
             ValueError, match=r"pandas dtypes must be int, float or bool\.\nFields with bad pandas dtypes: 0: str"
         ):
-            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
+            lgb.basic._list_to_1d_numpy(data=y, dtype=np.float32, name=custom_name)
         return
 
     if isinstance(y, np.ndarray) and len(y.shape) == 2:
         with pytest.warns(UserWarning, match="column-vector"):
-            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
+            lgb.basic._list_to_1d_numpy(data=y, dtype=np.float32, name=custom_name)
         return
     elif isinstance(y, list) and isinstance(y[0], list):
         err_msg = (
@@ -704,10 +704,10 @@ def test_list_to_1d_numpy(collection, dtype, rng):
             r"It should be list, numpy 1-D array or pandas Series"
         )
         with pytest.raises(TypeError, match=err_msg):
-            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
+            lgb.basic._list_to_1d_numpy(data=y, dtype=np.float32, name=custom_name)
         return
 
-    result = lgb.basic._list_to_1d_numpy(y, dtype=dtype, name=custom_name)
+    result = lgb.basic._list_to_1d_numpy(data=y, dtype=dtype, name=custom_name)
     assert result.size == 10
     assert result.dtype == dtype
 
diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py
index dc52496ec8a5..8b8666ae397b 100644
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -1324,7 +1324,10 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c
     # model 2 - machines given
     workers = list(client.scheduler_info()["workers"])
     workers_hostname = _get_workers_hostname(cluster)
-    remote_sockets, open_ports = lgb.dask._assign_open_ports_to_workers(client, workers)
+    remote_sockets, open_ports = lgb.dask._assign_open_ports_to_workers(
+        client=client,
+        workers=workers,
+    )
     for s in remote_sockets.values():
         s.release()
     dask_model2 = dask_model_factory(
diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py
index 28cd142f9069..96442d95c570 100644
--- a/tests/python_package_test/utils.py
+++ b/tests/python_package_test/utils.py
@@ -35,7 +35,7 @@ def load_linnerud(**kwargs):
 
 
 def make_ranking(
-    n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
+    *, n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
 ):
     """Generate a learning-to-rank dataset - feature vectors grouped together with
     integer-valued graded relevance scores. Replace this with a sklearn.datasets function
@@ -116,7 +116,7 @@ def make_ranking(
 
 
 @lru_cache(maxsize=None)
-def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
+def make_synthetic_regression(*, n_samples=100, n_features=4, n_informative=2, random_state=42):
     return sklearn.datasets.make_regression(
         n_samples=n_samples, n_features=n_features, n_informative=n_informative, random_state=random_state
     )
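The change above is mechanical but repeated throughout: a bare `*` is inserted into each signature so that the parameters after it become keyword-only, and every internal call site is rewritten to pass those arguments by name. As a minimal sketch of that Python behavior (the function below is illustrative only, not part of the patch):

```python
# Illustration only (not from the patch): a bare "*" in a def makes every
# parameter after it keyword-only, so positional call sites stop working.
def _sample_rows(*, data, total_nrow):
    return data[:total_nrow]

_sample_rows(data=[1, 2, 3], total_nrow=2)  # OK -> [1, 2]
# _sample_rows([1, 2, 3], 2)  # raises TypeError: positional arguments not accepted
```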