import numpy as np
import numpy.typing as npt
- from scipy.linalg import eigh, lu_factor, lu_solve
+ from scipy.linalg import cho_factor, cho_solve, eigh, lu_factor, lu_solve
from sklearn.base import BaseEstimator, clone
from sklearn.isotonic import IsotonicRegression
+ from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics.pairwise import euclidean_distances, rbf_kernel
from sklearn.utils.validation import check_consistent_length, check_X_y
    ComplexMatrix,
    ComplexVector,
    FloatMatrix,
+     FloatTensor,
    FloatVector,
    GenericVector,
)
@@ -34,34 +36,30 @@ class NeoLSSVM(BaseEstimator):

    A neo Least-Squares Support Vector Machine with:

-     - [x] A next-generation regularisation term that penalises the complexity of the prediction
-       surface, decision function, and maximises the margin.
-     - [x] Large-scale support through state-of-the-art random feature maps.
-     - [x] Optional automatic selection of primal or dual problem.
-     - [x] Automatic optimal tuning of the regularisation hyperparameter γ that minimises the
-       leave-one-out error, without having to refit the model.
-     - [x] Automatic tuning of the kernel parameters σ, without having to refit the model.
-     - [x] Automatic robust shift and scaling of the feature matrix and labels.
-     - [x] Leave-one-out residuals and error as a free output after fitting, optimally clipped in
-       classification.
-     - [x] Isotonically calibrated class probabilities based on leave-one-out predictions.
-     - [ ] Automatic robust fit by removing outliers.
+     1. ⚡ Linear complexity in the number of training examples with Orthogonal Random Features.
+     2. 🚀 Hyperparameter free: zero-cost optimization of the regularisation parameter γ and
+        kernel parameter σ.
+     3. 🏔️ Adds a new tertiary objective that minimizes the complexity of the prediction surface.
+     4. 🎁 Returns the leave-one-out residuals and error for free after fitting.
+     5. 🌀 Learns an affine transformation of the feature matrix to optimally separate the
+        target's bins.
+     6. 🪞 Can solve the LS-SVM both in the primal and dual space.
+     7. 🌡️ Isotonically calibrated `predict_proba` based on the leave-one-out predictions.
+     8. 🎲 Asymmetric conformal Bayesian confidence intervals for classification and regression.
    """

    def __init__(  # noqa: PLR0913
        self,
        *,
-         primal_feature_map: KernelApproximatingFeatureMap | None = None,
-         dual_feature_map: AffineSeparator | None = None,
-         dual: bool | None = None,
-         refit: bool = False,
+         primal_feature_map: KernelApproximatingFeatureMap | Literal["auto"] = "auto",
+         dual_feature_map: AffineSeparator | Literal["auto"] = "auto",
+         dual: bool | Literal["auto"] = "auto",
+         estimator_type: Literal["auto", "classifier", "regressor"] = "auto",
        random_state: int | np.random.RandomState | None = 42,
-         estimator_type: Literal["classifier", "regressor"] | None = None,
    ) -> None:
        self.primal_feature_map = primal_feature_map
        self.dual_feature_map = dual_feature_map
        self.dual = dual
-         self.refit = refit
        self.random_state = random_state
        self.estimator_type = estimator_type

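A minimal usage sketch of the constructor above; the `neo_ls_svm` import path and the toy dataset are illustrative assumptions, not part of this diff:

from sklearn.datasets import make_regression
from neo_ls_svm import NeoLSSVM  # assumed import path

X, y = make_regression(n_samples=500, n_features=8, noise=10.0, random_state=42)
model = NeoLSSVM()  # feature maps, primal/dual choice, and estimator type all default to "auto"
model.fit(X, y)
ŷ = model.predict(X)
print(model.loo_score_)  # leave-one-out R², available right after fit without refitting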
@@ -156,6 +154,7 @@ def _optimize_β̂_γ(
        )
        # Store the leave-one-out residuals, leverage, error, and score.
        self.loo_residuals_ = loo_residuals[:, optimum]
+         self.loo_ŷ_ = y + self.loo_residuals_
        self.loo_leverage_ = h @ rγ[:, optimum]
        self.loo_error_ = self.loo_errors_γs_[optimum]
        if self._estimator_type == "classifier":
@@ -164,12 +163,17 @@ def _optimize_β̂_γ(
            self.loo_score_ = r2_score(y, ŷ_loo[:, optimum], sample_weight=s)
        β̂, γ = β̂ @ rγ[:, optimum], self.γs_[optimum]
        # Re-solve the linear system for better accuracy.
-         if self.refit:
-             β̂ = np.linalg.solve(γ * C + A, φSTSy)
+         self.L_ = cho_factor(γ * C + A)
+         β̂ = cho_solve(self.L_, φSTSy)
        self.residuals_ = np.real(φ @ β̂) - y
        if self._estimator_type == "classifier":
            self.residuals_[(y > 0) & (self.residuals_ > 0)] = 0
            self.residuals_[(y < 0) & (self.residuals_ < 0)] = 0
+         # Compute the leave-one-out nonconformity with the Sherman-Morrison formula.
+         σ2 = np.real(np.sum(φ * cho_solve(self.L_, φ.conj().T).T, axis=1))
+         σ2 = np.ascontiguousarray(σ2)
+         loo_σ2 = σ2 + (s * σ2) ** 2 / (1 - self.loo_leverage_)
+         self.loo_nonconformity_ = np.sqrt(loo_σ2)
        # TODO: Print warning if optimal γ is found at the edge.
        return β̂, γ
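A toy sketch of the Cholesky-based variance term above (not the package's code): the diagonal of φ (γC + A)⁻¹ φ' can be read off row by row from the factorisation instead of forming the inverse:

import numpy as np
from scipy.linalg import cho_factor, cho_solve

rng = np.random.default_rng(0)
φ = rng.standard_normal((5, 3))               # toy feature matrix
M = φ.T @ φ + 0.1 * np.eye(3)                 # stand-in for γ * C + A (symmetric positive definite)
L = cho_factor(M)
σ2 = np.sum(φ * cho_solve(L, φ.T).T, axis=1)  # diag(φ @ inv(M) @ φ.T) without forming inv(M)
assert np.allclose(σ2, np.diag(φ @ np.linalg.inv(M) @ φ.T))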
@@ -287,19 +291,24 @@ def _optimize_α̂_γ(
        )
        # Store the leave-one-out residuals, leverage, error, and score.
        self.loo_residuals_ = loo_residuals[:, optimum]
+         self.loo_ŷ_ = y + self.loo_residuals_
        self.loo_error_ = self.loo_errors_γs_[optimum]
        if self._estimator_type == "classifier":
            self.loo_score_ = accuracy_score(y, np.sign(ŷ_loo[:, optimum]), sample_weight=s)
        elif self._estimator_type == "regressor":
            self.loo_score_ = r2_score(y, ŷ_loo[:, optimum], sample_weight=s)
        α̂, γ = α̂_loo[:, optimum], self.γs_[optimum]
        # Re-solve the linear system for better accuracy.
-         if self.refit:
-             α̂ = np.linalg.solve(γ * ρ * np.diag(sn**-2) + K, y)
+         self.L_ = cho_factor(γ * ρ * np.diag(sn**-2) + K)
+         α̂ = cho_solve(self.L_, y)
        self.residuals_ = F @ α̂ - y
        if self._estimator_type == "classifier":
            self.residuals_[(y > 0) & (self.residuals_ > 0)] = 0
            self.residuals_[(y < 0) & (self.residuals_ < 0)] = 0
+         # Compute the nonconformity. TODO: Apply a leave-one-out correction.
+         K = rbf_kernel(X, X, gamma=0.5)
+         σ2 = 1.0 - np.sum(K * cho_solve(self.L_, K.T).T, axis=1)
+         self.loo_nonconformity_ = np.sqrt(σ2)
        # TODO: Print warning if optimal γ is found at the edge.
        return α̂, γ
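A corresponding toy sketch for the dual branch above: with the RBF kernel, k(x, x) = 1, so the predictive variance is 1 − K(x, X)(γρ·diag(sn⁻²) + K)⁻¹K(X, x); the jitter term standing in for the regularised kernel matrix is an assumption:

import numpy as np
from scipy.linalg import cho_factor, cho_solve
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.default_rng(0)
X = rng.standard_normal((6, 2))
K = rbf_kernel(X, X, gamma=0.5)
L = cho_factor(K + 1e-3 * np.eye(6))                # stand-in for γ * ρ * np.diag(sn**-2) + K
σ2 = 1.0 - np.sum(K * cho_solve(L, K.T).T, axis=1)  # in-sample predictive variance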
@@ -334,7 +343,9 @@ def fit(
            or np.issubdtype(y.dtype, np.timedelta64)
        ):
            inferred_estimator_type = "regressor"
-         self._estimator_type: str | None = self.estimator_type or inferred_estimator_type
+         self._estimator_type: str | None = (
+             inferred_estimator_type if self.estimator_type == "auto" else self.estimator_type
+         )
        if self._estimator_type == "classifier":
            self.classes_: GenericVector = unique_y
            negatives = y == self.classes_[0]
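Continuing the usage sketch from above, the automatic inference and solver selection can be overridden explicitly (hypothetical calls, argument names from this diff):

model = NeoLSSVM(estimator_type="regressor")  # skip the dtype-based inference
model = NeoLSSVM(dual=False)  # force the primal solver regardless of the 1024-example rule below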
@@ -346,18 +357,24 @@ def fit(
            message = "Target type not supported"
            raise ValueError(message)
        # Determine whether we want to solve this in the primal or dual space.
-         self.dual_ = X.shape[0] <= 1024 if self.dual is None else self.dual  # noqa: PLR2004
+         self.dual_ = X.shape[0] <= 1024 if self.dual == "auto" else self.dual  # noqa: PLR2004
        self.primal_ = not self.dual_
        # Learn an optimal distance metric for the primal or dual space and apply it to the feature
        # matrix X.
        if self.primal_:
            self.primal_feature_map_ = clone(
-                 self.primal_feature_map or OrthogonalRandomFourierFeatures()
+                 OrthogonalRandomFourierFeatures()
+                 if self.primal_feature_map == "auto"
+                 else self.primal_feature_map
            )
            self.primal_feature_map_.fit(X, y_, sample_weight_)
            φ = self.primal_feature_map_.transform(X)
        else:
-             self.dual_feature_map_ = clone(self.dual_feature_map or AffineSeparator())
+             nz_weight = sample_weight_ > 0
+             X, y_, sample_weight_ = X[nz_weight], y_[nz_weight], sample_weight_[nz_weight]
+             self.dual_feature_map_ = clone(
+                 AffineSeparator() if self.dual_feature_map == "auto" else self.dual_feature_map
+             )
            self.dual_feature_map_.fit(X, y_, sample_weight_)
            self.X_ = self.dual_feature_map_.transform(X)
        # Solve the primal or dual system. We optimise the following sub-objectives for the weights
@@ -375,21 +392,94 @@ def fit(
            self.predict_proba_calibrator_ = IsotonicRegression(
                out_of_bounds="clip", y_min=0, y_max=1, increasing=True
            )
-             ŷ_loo = y_ + self.loo_residuals_
            target = np.zeros_like(y_)
            target[y_ == np.max(y_)] = 1.0
-             self.predict_proba_calibrator_.fit(ŷ_loo, target, sample_weight_)
+             self.predict_proba_calibrator_.fit(self.loo_ŷ_, target, sample_weight_)
+         # Lazily fit conformal predictors as quantile regression models that predict the lower and
+         # upper bounds of the (relative) leave-one-out residuals.
+         self.conformal_regressors_: dict[str, dict[float, QuantileRegressor]] = {
+             "Δ⁺": {},
+             "Δ⁻": {},
+             "Δ⁺/ŷ": {},
+             "Δ⁻/ŷ": {},
+         }
        return self

+     def nonconformity_measure(self, X: FloatMatrix[F]) -> FloatVector[F]:
+         """Compute the nonconformity of a set of examples."""
+         # Estimate the nonconformity as the variance of this model's Gaussian Process.
+         σ2: FloatVector[F]
+         if self.primal_:
+             # If β̂ := (LL')⁻¹ y* and cov(y*) := LL', then cov(β̂) = cov((LL')⁻¹ y*) = (LL')⁻¹
+             # assuming 𝔼(β̂) = 0. It follows that cov(ŷ(x)) = cov(φ(x)'β̂) = φ(x)'(LL')⁻¹φ(x).
+             φH = cast(KernelApproximatingFeatureMap, self.primal_feature_map_).transform(X)
+             σ2 = np.real(np.sum(φH * cho_solve(self.L_, φH.conj().T).T, axis=1))
+             σ2 = np.ascontiguousarray(σ2)
+         else:
+             # Compute cov(ŷ(x)) as K(x, x) − K(x, X) (LL')⁻¹ K(X, x). TODO: Document derivation.
+             X = cast(AffineFeatureMap, self.dual_feature_map_).transform(X)
+             K = rbf_kernel(X, self.X_, gamma=0.5)
+             σ2 = 1.0 - np.sum(K * cho_solve(self.L_, K.T).T, axis=1)
+         # Convert the variance to a standard deviation.
+         σ = np.sqrt(σ2)
+         return σ
+
+     def predict_confidence_interval(
+         self, X: FloatMatrix[F], *, confidence_level: float = 0.95
+     ) -> FloatMatrix[F] | FloatTensor[F]:
+         """Predict a confidence interval for the output on a given dataset."""
+         # Compute the nonconformity measure for the given examples.
+         X_nonconformity = self.nonconformity_measure(X)[:, np.newaxis]
+         # Determine the quantiles at the edge of the confidence interval.
+         quantile = 1 - (1 - confidence_level) / 2
+         # Lazily fit any missing conformal regressors.
+         # TODO: Perhaps exclude samples that were used in the feature map.
+         for target_type in ("Δ⁺", "Δ⁻", "Δ⁺/ŷ", "Δ⁻/ŷ"):
+             quantile_regressors = self.conformal_regressors_[target_type]
+             if quantile not in quantile_regressors:
+                 sgn = (self.loo_residuals_ > 0) if "⁺" in target_type else (self.loo_residuals_ < 0)
+                 eps = np.finfo(self.loo_ŷ_.dtype).eps
+                 quantile_regressors[quantile] = QuantileRegressor(
+                     quantile=quantile, alpha=np.sqrt(eps), solver="highs"
+                 ).fit(
+                     self.loo_nonconformity_[sgn, np.newaxis],
+                     np.abs(self.loo_residuals_[sgn]) / np.maximum(np.abs(self.loo_ŷ_)[sgn], eps)
+                     if "/ŷ" in target_type
+                     else np.abs(self.loo_residuals_[sgn]),
+                 )
+         # Predict the confidence interval for the nonconformity measure.
+         ŷ = self.decision_function(X)
+         Δ_lower = np.minimum(
+             self.conformal_regressors_["Δ⁻"][quantile].predict(X_nonconformity),
+             np.abs(ŷ) * self.conformal_regressors_["Δ⁻/ŷ"][quantile].predict(X_nonconformity),
+         )
+         Δ_upper = np.minimum(
+             self.conformal_regressors_["Δ⁺"][quantile].predict(X_nonconformity),
+             np.abs(ŷ) * self.conformal_regressors_["Δ⁺/ŷ"][quantile].predict(X_nonconformity),
+         )
+         # Assemble the confidence interval.
+         C = np.hstack(((ŷ - Δ_lower)[:, np.newaxis], (ŷ + Δ_upper)[:, np.newaxis]))
+         # In case of classification, convert the decision function values to probabilities.
+         if self._estimator_type == "classifier":
+             C = np.hstack(
+                 [
+                     self.predict_proba_calibrator_.transform(C[:, 0])[:, np.newaxis],
+                     self.predict_proba_calibrator_.transform(C[:, 1])[:, np.newaxis],
+                 ]
+             )
+             C = np.dstack([1 - C[:, ::-1], C])
+         return C
+
    def decision_function(self, X: FloatMatrix[F]) -> FloatVector[F]:
-         """Evaluate this predictor's decision function."""
+         """Evaluate this predictor's prediction function."""
+         # Compute the point predictions ŷ(X).
        ŷ: FloatVector[F]
        if self.primal_:
            # Apply the feature map φ and predict as ŷ(x) := φ(x)'β̂.
            φ = cast(KernelApproximatingFeatureMap, self.primal_feature_map_).transform(X)
            ŷ = np.real(φ @ self.β̂_)
+             ŷ = np.ascontiguousarray(ŷ)
        else:
-             # Apply an affine transformation to X, then predict as ŷ(x) := k(x, X) â + 1'â.
+             # Apply an affine transformation to X, then predict as ŷ(x) := k(x, X) α̂ + 1'α̂.
            X = cast(AffineFeatureMap, self.dual_feature_map_).transform(X)
            K = rbf_kernel(X, self.X_, gamma=0.5)
            b = np.sum(self.α̂_)
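A standalone sketch of the conformal step above: regress the absolute leave-one-out residuals on the nonconformity measure with a quantile regressor and use the predicted quantile as an interval half-width (synthetic data; the 0.975 quantile and the tiny α are illustrative):

import numpy as np
from sklearn.linear_model import QuantileRegressor

rng = np.random.default_rng(0)
nonconformity = rng.uniform(0.1, 1.0, size=200)  # stand-in for loo_nonconformity_
residuals = rng.normal(scale=nonconformity)      # residuals that grow with the nonconformity
pos = residuals > 0                              # the "Δ⁺" model is fit on the positive residuals
qr = QuantileRegressor(quantile=0.975, alpha=1e-8, solver="highs")
qr.fit(nonconformity[pos, np.newaxis], residuals[pos])
Δ_upper = qr.predict(np.array([[0.5]]))          # predicted upper half-width at nonconformity 0.5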
@@ -398,7 +488,7 @@ def decision_function(self, X: FloatMatrix[F]) -> FloatVector[F]:

    def predict(self, X: FloatMatrix[F]) -> GenericVector:
        """Predict the output on a given dataset."""
-         # Evaluate ŷ given the feature matrix X.
+         # Compute the point predictions ŷ(X).
        ŷ_df = self.decision_function(X)
        if self._estimator_type == "classifier":
            # For binary classification, round to the nearest class label. When the decision
@@ -415,23 +505,29 @@ def predict(self, X: FloatMatrix[F]) -> GenericVector:
        ŷ = ŷ.astype(self.y_dtype_)
        return ŷ

-     def predict_proba(self, X: FloatMatrix[F]) -> FloatMatrix[F]:
-         """Predict the output probability (classification) or confidence interval (regression)."""
+     def predict_proba(
+         self,
+         X: FloatMatrix[F],
+         *,
+         confidence_interval: bool = False,
+         confidence_level: float = 0.95,
+     ) -> FloatVector[F] | FloatMatrix[F] | FloatTensor[F]:
+         """Predict the class probability or confidence interval."""
+         if confidence_interval:
+             # Return the confidence interval for classification or regression.
+             C = self.predict_confidence_interval(X, confidence_level=confidence_level)
+             return C
        if self._estimator_type == "classifier":
+             # Return the class probabilities for classification.
            ŷ_classification = self.decision_function(X)
            p = self.predict_proba_calibrator_.transform(ŷ_classification)
            P = np.hstack([1 - p[:, np.newaxis], p[:, np.newaxis]])
        else:
-             # TODO: Replace point predictions with confidence interval.
+             # Return the point predictions for regression.
            ŷ_regression = self.predict(X)
-             P = np.hstack((ŷ_regression[:, np.newaxis], ŷ_regression[:, np.newaxis]))
+             P = ŷ_regression
        return P

-     @property
-     def loo_score(self) -> float:
-         """Compute the leave-one-out score of this classifier or regressor."""
-         return cast(float, self.loo_score_)
-
    def score(
        self, X: FloatMatrix[F], y: GenericVector, sample_weight: FloatVector[F] | None = None
    ) -> float:
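To close the usage sketch, the extended predict_proba above returns either calibrated class probabilities or the new conformal intervals (hypothetical call on the fitted model from the earlier sketch):

interval = model.predict_proba(X, confidence_interval=True, confidence_level=0.9)
# Regression: an (n, 2) array of [lower, upper] bounds around ŷ.
# Classification: an (n, 2, 2) array of per-class probability intervals.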