Commit 8fd0bbe

feat: add conformal Bayesian prediction
1 parent a2d6028 commit 8fd0bbe

File tree: 4 files changed, +170 -49 lines


README.md

+1 -0
@@ -11,6 +11,7 @@ Neo LS-SVM is a modern [Least-Squares Support Vector Machine](https://en.wikiped
 5. 🌀 Learns an affine transformation of the feature matrix to optimally separate the target's bins.
 6. 🪞 Can solve the LS-SVM both in the primal and dual space.
 7. 🌡️ Isotonically calibrated `predict_proba` based on the leave-one-out predictions.
+8. 🎲 Asymmetric conformal Bayesian confidence intervals for classification and regression.
 
 ## Using

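In practice, the feature announced in the README can be exercised as follows. This is a hypothetical usage sketch: the synthetic dataset and the assumption that `NeoLSSVM` is importable from the package root are mine, while the `confidence_interval` and `confidence_level` keyword arguments and the lower/upper column layout follow the `predict_proba` changes in `src/neo_ls_svm/_neo_ls_svm.py` below.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from neo_ls_svm import NeoLSSVM  # assumed import path; the estimator is defined in _neo_ls_svm.py

# Synthetic regression data (illustrative only).
X, y = make_regression(n_samples=500, n_features=8, noise=10.0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model = NeoLSSVM().fit(X_train, y_train)
ŷ = model.predict(X_test)  # point predictions

# Asymmetric conformal confidence interval: column 0 holds the lower bound and column 1
# the upper bound of the target at the requested confidence level (regression case).
C = model.predict_proba(X_test, confidence_interval=True, confidence_level=0.8)
coverage = np.mean((C[:, 0] <= y_test) & (y_test <= C[:, 1]))
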
src/neo_ls_svm/_feature_maps.py

+6 -6
@@ -153,13 +153,13 @@ def fit(
     def transform(self, X: FloatMatrix[F]) -> ComplexMatrix[C]:
         """Transform a feature matrix X ∈ Rⁿˣᵈ into φ(X) ∈ Cⁿˣᴰ⁺¹ so that φ(X)ᵢ := [φ(xᵢ)' 1].
 
-        Notice that we can choose to solve an LS-SVM in the primal or dual space using the matrix
-        identity (γI + AB)⁻¹ A = A (γI + BA)⁻¹:
+        Notice that we can choose to solve an LS-SVM in the primal or dual space using the
+        push-through identity (γ𝕀 + AB)⁻¹ A = A (γ𝕀 + BA)⁻¹:
 
-        argmin ||y - φ(X)β̂||² + γ||β̂||²
-        = (γI + φ(X)'φ(X))⁻¹ φ(X)'y
-        = φ(X)' (γI + φ(X)φ(X)')⁻¹y with identity (γI + AB)⁻¹ A = A (γI + BA)⁻¹
-        = φ(X)'a where a = (γI + φ(X)φ(X)')⁻¹y = (γI + k(xᵢ, xⱼ))⁻¹y
+        argmin ||φ(X)β̂ - y||² + γ||β̂||²
+        = (γ𝕀 + φ(X)'φ(X))⁻¹ φ(X)'y
+        = φ(X)' (γ𝕀 + φ(X)φ(X)')⁻¹y with the identity (γ𝕀 + AB)⁻¹ A = A (γ𝕀 + BA)⁻¹
+        = φ(X)'α̂ where α̂ := (γ𝕀 + φ(X)φ(X)')⁻¹y = (γ𝕀 + k(xᵢ, xⱼ))⁻¹y
 
         This means that k(x, y) = φ(x)'φ(y) by definition. Now we look for a φ(x) so that k(x, y) =
         φ(x)'φ(y) for the Gaussian kernel k(x, y) = exp(- ||y - x||² / 2). If we take h(x) :=

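The docstring above hinges on the push-through identity to move between the primal and dual solutions of the ridge problem. A standalone NumPy check of that equivalence (illustrative only, not part of this commit):

import numpy as np

rng = np.random.default_rng(42)
n, d, γ = 8, 3, 0.5
φ = rng.standard_normal((n, d))  # feature matrix φ(X) ∈ Rⁿˣᵈ
y = rng.standard_normal(n)

# Primal solution: (γ𝕀 + φ'φ)⁻¹ φ'y.
β̂_primal = np.linalg.solve(γ * np.eye(d) + φ.T @ φ, φ.T @ y)
# Dual solution: φ'α̂ with α̂ := (γ𝕀 + φφ')⁻¹ y.
α̂ = np.linalg.solve(γ * np.eye(n) + φ @ φ.T, y)
β̂_dual = φ.T @ α̂

# The push-through identity (γ𝕀 + AB)⁻¹ A = A (γ𝕀 + BA)⁻¹ makes both solutions coincide.
assert np.allclose(β̂_primal, β̂_dual)
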
src/neo_ls_svm/_neo_ls_svm.py

+137 -41
@@ -4,9 +4,10 @@
 
 import numpy as np
 import numpy.typing as npt
-from scipy.linalg import eigh, lu_factor, lu_solve
+from scipy.linalg import cho_factor, cho_solve, eigh, lu_factor, lu_solve
 from sklearn.base import BaseEstimator, clone
 from sklearn.isotonic import IsotonicRegression
+from sklearn.linear_model import QuantileRegressor
 from sklearn.metrics import accuracy_score, r2_score
 from sklearn.metrics.pairwise import euclidean_distances, rbf_kernel
 from sklearn.utils.validation import check_consistent_length, check_X_y
@@ -21,6 +22,7 @@
     ComplexMatrix,
     ComplexVector,
     FloatMatrix,
+    FloatTensor,
     FloatVector,
     GenericVector,
 )
@@ -34,34 +36,30 @@ class NeoLSSVM(BaseEstimator):
 
     A neo Least-Squares Support Vector Machine with:
 
-    - [x] A next-generation regularisation term that penalises the complexity of the prediction
-      surface, decision function, and maximises the margin.
-    - [x] Large-scale support through state-of-the-art random feature maps.
-    - [x] Optional automatic selection of primal or dual problem.
-    - [x] Automatic optimal tuning of the regularisation hyperparameter γ that minimises the
-      leave-one-out error, without having to refit the model.
-    - [x] Automatic tuning of the kernel parameters σ, without having to refit the model.
-    - [x] Automatic robust shift and scaling of the feature matrix and labels.
-    - [x] Leave-one-out residuals and error as a free output after fitting, optimally clipped in
-      classification.
-    - [x] Isotonically calibrated class probabilities based on leave-one-out predictions.
-    - [ ] Automatic robust fit by removing outliers.
+    1. ⚡ Linear complexity in the number of training examples with Orthogonal Random Features.
+    2. 🚀 Hyperparameter free: zero-cost optimization of the regularisation parameter γ and
+       kernel parameter σ.
+    3. 🏔️ Adds a new tertiary objective that minimizes the complexity of the prediction surface.
+    4. 🎁 Returns the leave-one-out residuals and error for free after fitting.
+    5. 🌀 Learns an affine transformation of the feature matrix to optimally separate the
+       target's bins.
+    6. 🪞 Can solve the LS-SVM both in the primal and dual space.
+    7. 🌡️ Isotonically calibrated `predict_proba` based on the leave-one-out predictions.
+    8. 🎲 Asymmetric conformal Bayesian confidence intervals for classification and regression.
     """
 
     def __init__(  # noqa: PLR0913
         self,
         *,
-        primal_feature_map: KernelApproximatingFeatureMap | None = None,
-        dual_feature_map: AffineSeparator | None = None,
-        dual: bool | None = None,
-        refit: bool = False,
+        primal_feature_map: KernelApproximatingFeatureMap | Literal["auto"] = "auto",
+        dual_feature_map: AffineSeparator | Literal["auto"] = "auto",
+        dual: bool | Literal["auto"] = "auto",
+        estimator_type: Literal["auto", "classifier", "regressor"] = "auto",
         random_state: int | np.random.RandomState | None = 42,
-        estimator_type: Literal["classifier", "regressor"] | None = None,
     ) -> None:
         self.primal_feature_map = primal_feature_map
         self.dual_feature_map = dual_feature_map
         self.dual = dual
-        self.refit = refit
         self.random_state = random_state
         self.estimator_type = estimator_type

@@ -156,6 +154,7 @@ def _optimize_β̂_γ(
         )
         # Store the leave-one-out residuals, leverage, error, and score.
         self.loo_residuals_ = loo_residuals[:, optimum]
+        self.loo_ŷ_ = y + self.loo_residuals_
         self.loo_leverage_ = h[:, optimum]
         self.loo_error_ = self.loo_errors_γs_[optimum]
         if self._estimator_type == "classifier":
@@ -164,12 +163,17 @@ def _optimize_β̂_γ(
             self.loo_score_ = r2_score(y, ŷ_loo[:, optimum], sample_weight=s)
         β̂, γ = β̂[:, optimum], self.γs_[optimum]
         # Resolve the linear system for better accuracy.
-        if self.refit:
-            β̂ = np.linalg.solve(γ * C + A, φSTSy)
+        self.L_ = cho_factor(γ * C + A)
+        β̂ = cho_solve(self.L_, φSTSy)
         self.residuals_ = np.real(φ @ β̂) - y
         if self._estimator_type == "classifier":
             self.residuals_[(y > 0) & (self.residuals_ > 0)] = 0
             self.residuals_[(y < 0) & (self.residuals_ < 0)] = 0
+        # Compute the leave-one-out nonconformity with the Sherman-Morrison formula.
+        σ2 = np.real(np.sum(φ * cho_solve(self.L_, φ.conj().T).T, axis=1))
+        σ2 = np.ascontiguousarray(σ2)
+        loo_σ2 = σ2 + (s * σ2) ** 2 / (1 - self.loo_leverage_)
+        self.loo_nonconformity_ = np.sqrt(loo_σ2)
         # TODO: Print warning if optimal γ is found at the edge.
         return β̂, γ

@@ -287,19 +291,24 @@ def _optimize_α̂_γ(
         )
         # Store the leave-one-out residuals, leverage, error, and score.
         self.loo_residuals_ = loo_residuals[:, optimum]
+        self.loo_ŷ_ = y + self.loo_residuals_
         self.loo_error_ = self.loo_errors_γs_[optimum]
         if self._estimator_type == "classifier":
             self.loo_score_ = accuracy_score(y, np.sign(ŷ_loo[:, optimum]), sample_weight=s)
         elif self._estimator_type == "regressor":
             self.loo_score_ = r2_score(y, ŷ_loo[:, optimum], sample_weight=s)
         α̂, γ = α̂_loo[:, optimum], self.γs_[optimum]
         # Resolve the linear system for better accuracy.
-        if self.refit:
-            α̂ = np.linalg.solve(γ * ρ * np.diag(sn**-2) + K, y)
+        self.L_ = cho_factor(γ * ρ * np.diag(sn**-2) + K)
+        α̂ = cho_solve(self.L_, y)
         self.residuals_ = F @ α̂ - y
         if self._estimator_type == "classifier":
             self.residuals_[(y > 0) & (self.residuals_ > 0)] = 0
             self.residuals_[(y < 0) & (self.residuals_ < 0)] = 0
+        # Compute the nonconformity. TODO: Apply a leave-one-out correction.
+        K = rbf_kernel(X, X, gamma=0.5)
+        σ2 = 1.0 - np.sum(K * cho_solve(self.L_, K.T).T, axis=1)
+        self.loo_nonconformity_ = np.sqrt(σ2)
         # TODO: Print warning if optimal γ is found at the edge.
         return α̂, γ

@@ -334,7 +343,9 @@ def fit(
             or np.issubdtype(y.dtype, np.timedelta64)
         ):
             inferred_estimator_type = "regressor"
-        self._estimator_type: str | None = self.estimator_type or inferred_estimator_type
+        self._estimator_type: str | None = (
+            inferred_estimator_type if self.estimator_type == "auto" else self.estimator_type
+        )
         if self._estimator_type == "classifier":
             self.classes_: GenericVector = unique_y
             negatives = y == self.classes_[0]
@@ -346,18 +357,24 @@ def fit(
             message = "Target type not supported"
             raise ValueError(message)
         # Determine whether we want to solve this in the primal or dual space.
-        self.dual_ = X.shape[0] <= 1024 if self.dual is None else self.dual  # noqa: PLR2004
+        self.dual_ = X.shape[0] <= 1024 if self.dual == "auto" else self.dual  # noqa: PLR2004
         self.primal_ = not self.dual_
         # Learn an optimal distance metric for the primal or dual space and apply it to the feature
         # matrix X.
         if self.primal_:
             self.primal_feature_map_ = clone(
-                self.primal_feature_map or OrthogonalRandomFourierFeatures()
+                OrthogonalRandomFourierFeatures()
+                if self.primal_feature_map == "auto"
+                else self.primal_feature_map
             )
             self.primal_feature_map_.fit(X, y_, sample_weight_)
             φ = self.primal_feature_map_.transform(X)
         else:
-            self.dual_feature_map_ = clone(self.dual_feature_map or AffineSeparator())
+            nz_weight = sample_weight_ > 0
+            X, y_, sample_weight_ = X[nz_weight], y_[nz_weight], sample_weight_[nz_weight]
+            self.dual_feature_map_ = clone(
+                AffineSeparator() if self.dual_feature_map == "auto" else self.dual_feature_map
+            )
             self.dual_feature_map_.fit(X, y_, sample_weight_)
             self.X_ = self.dual_feature_map_.transform(X)
         # Solve the primal or dual system. We optimise the following sub-objectives for the weights
@@ -375,21 +392,94 @@ def fit(
             self.predict_proba_calibrator_ = IsotonicRegression(
                 out_of_bounds="clip", y_min=0, y_max=1, increasing=True
             )
-            ŷ_loo = y_ + self.loo_residuals_
             target = np.zeros_like(y_)
             target[y_ == np.max(y_)] = 1.0
-            self.predict_proba_calibrator_.fit(ŷ_loo, target, sample_weight_)
+            self.predict_proba_calibrator_.fit(self.loo_ŷ_, target, sample_weight_)
+        # Lazily fit conformal predictors as quantile regression models that predict the lower and
+        # upper bounds of the (relative) leave-one-out residuals.
+        self.conformal_regressors_: dict[str, dict[float, QuantileRegressor]] = {
+            "Δ⁺": {},
+            "Δ⁻": {},
+            "Δ⁺/ŷ": {},
+            "Δ⁻/ŷ": {},
+        }
         return self
 
+    def nonconformity_measure(self, X: FloatMatrix[F]) -> FloatVector[F]:
+        """Compute the nonconformity of a set of examples."""
+        # Estimate the nonconformity as the variance of this model's Gaussian Process.
+        σ2: FloatVector[F]
+        if self.primal_:
+            # If β̂ := (LL')⁻¹ y* and cov(y*) := LL', then cov(β̂) = cov((LL')⁻¹ y*) = (LL')⁻¹
+            # assuming 𝔼(β̂) = 0. It follows that cov(ŷ(x)) = cov(φ(x)'β̂) = φ(x)'(LL')⁻¹φ(x).
+            φH = cast(KernelApproximatingFeatureMap, self.primal_feature_map_).transform(X)
+            σ2 = np.real(np.sum(φH * cho_solve(self.L_, φH.conj().T).T, axis=1))
+            σ2 = np.ascontiguousarray(σ2)
+        else:
+            # Compute the cov(ŷ(x)) as K(x, x) − K(x, X) (LL')⁻¹ K(X, x). TODO: Document derivation.
+            X = cast(AffineFeatureMap, self.dual_feature_map_).transform(X)
+            K = rbf_kernel(X, self.X_, gamma=0.5)
+            σ2 = 1.0 - np.sum(K * cho_solve(self.L_, K.T).T, axis=1)
+        # Convert the variance to a standard deviation.
+        σ = np.sqrt(σ2)
+        return σ
+
+    def predict_confidence_interval(
+        self, X: FloatMatrix[F], *, confidence_level: float = 0.95
+    ) -> FloatMatrix[F] | FloatTensor[F]:
+        # Compute the nonconformity measure for the given examples.
+        X_nonconformity = self.nonconformity_measure(X)[:, np.newaxis]
+        # Determine the quantiles at the edge of the confidence interval.
+        quantile = 1 - (1 - confidence_level) / 2
+        # Lazily fit any missing conformal regressors.
+        # TODO: Perhaps exclude samples that were used in the feature map.
+        for target_type in ("Δ⁺", "Δ⁻", "Δ⁺/ŷ", "Δ⁻/ŷ"):
+            quantile_regressors = self.conformal_regressors_[target_type]
+            if quantile not in quantile_regressors:
+                sgn = (self.loo_residuals_ > 0) if "⁺" in target_type else (self.loo_residuals_ < 0)
+                eps = np.finfo(self.loo_ŷ_.dtype).eps
+                quantile_regressors[quantile] = QuantileRegressor(
+                    quantile=quantile, alpha=np.sqrt(eps), solver="highs"
+                ).fit(
+                    self.loo_nonconformity_[sgn, np.newaxis],
+                    np.abs(self.loo_residuals_[sgn]) / np.maximum(np.abs(self.loo_ŷ_)[sgn], eps)
+                    if "/ŷ" in target_type
+                    else np.abs(self.loo_residuals_[sgn]),
+                )
+        # Predict the confidence interval for the nonconformity measure.
+        ŷ = self.decision_function(X)
+        Δ_lower = np.minimum(
+            self.conformal_regressors_["Δ⁻"][quantile].predict(X_nonconformity),
+            np.abs(ŷ) * self.conformal_regressors_["Δ⁻/ŷ"][quantile].predict(X_nonconformity),
+        )
+        Δ_upper = np.minimum(
+            self.conformal_regressors_["Δ⁺"][quantile].predict(X_nonconformity),
+            np.abs(ŷ) * self.conformal_regressors_["Δ⁺/ŷ"][quantile].predict(X_nonconformity),
+        )
+        # Assemble the confidence interval.
+        C = np.hstack(((ŷ - Δ_lower)[:, np.newaxis], (ŷ + Δ_upper)[:, np.newaxis]))
+        # In case of classification, convert the decision function values to probabilities.
+        if self._estimator_type == "classifier":
+            C = np.hstack(
+                [
+                    self.predict_proba_calibrator_.transform(C[:, 0])[:, np.newaxis],
+                    self.predict_proba_calibrator_.transform(C[:, 1])[:, np.newaxis],
+                ]
+            )
+            C = np.dstack([1 - C[:, ::-1], C])
+        return C
+
     def decision_function(self, X: FloatMatrix[F]) -> FloatVector[F]:
-        """Evaluate this predictor's decision function."""
+        """Evaluate this predictor's prediction function."""
+        # Compute the point predictions ŷ(X).
         ŷ: FloatVector[F]
         if self.primal_:
             # Apply the feature map φ and predict as ŷ(x) := φ(x)'β̂.
             φ = cast(KernelApproximatingFeatureMap, self.primal_feature_map_).transform(X)
             ŷ = np.real(φ @ self.β̂_)
+            ŷ = np.ascontiguousarray(ŷ)
         else:
-            # Apply an affine transformation to X, then predict as ŷ(x) := k(x, X) + 1'.
+            # Apply an affine transformation to X, then predict as ŷ(x) := k(x, X) α̂ + 1'α̂.
             X = cast(AffineFeatureMap, self.dual_feature_map_).transform(X)
             K = rbf_kernel(X, self.X_, gamma=0.5)
             b = np.sum(self.α̂_)
@@ -398,7 +488,7 @@ def decision_function(self, X: FloatMatrix[F]) -> FloatVector[F]:
 
     def predict(self, X: FloatMatrix[F]) -> GenericVector:
         """Predict the output on a given dataset."""
-        # Evaluate ŷ given the feature matrix X.
+        # Compute the point predictions ŷ(X).
         ŷ_df = self.decision_function(X)
         if self._estimator_type == "classifier":
             # For binary classification, round to the nearest class label. When the decision
@@ -415,23 +505,29 @@ def predict(self, X: FloatMatrix[F]) -> GenericVector:
         ŷ = ŷ.astype(self.y_dtype_)
         return ŷ
 
-    def predict_proba(self, X: FloatMatrix[F]) -> FloatMatrix[F]:
-        """Predict the output probability (classification) or confidence interval (regression)."""
+    def predict_proba(
+        self,
+        X: FloatMatrix[F],
+        *,
+        confidence_interval: bool = False,
+        confidence_level: float = 0.95,
+    ) -> FloatVector[F] | FloatMatrix[F] | FloatTensor[F]:
+        """Predict the class probability or confidence interval."""
+        if confidence_interval:
+            # Return the confidence interval for classification or regression.
+            C = self.predict_confidence_interval(X, confidence_level=confidence_level)
+            return C
         if self._estimator_type == "classifier":
+            # Return the class probabilities for classification.
            ŷ_classification = self.decision_function(X)
             p = self.predict_proba_calibrator_.transform(ŷ_classification)
             P = np.hstack([1 - p[:, np.newaxis], p[:, np.newaxis]])
         else:
-            # TODO: Replace point predictions with confidence interval.
+            # Return the point predictions for regression.
             ŷ_regression = self.predict(X)
-            P = np.hstack((ŷ_regression[:, np.newaxis], ŷ_regression[:, np.newaxis]))
+            P = ŷ_regression
         return P
 
-    @property
-    def loo_score(self) -> float:
-        """Compute the leave-one-out score of this classifier or regressor."""
-        return cast(float, self.loo_score_)
-
     def score(
         self, X: FloatMatrix[F], y: GenericVector, sample_weight: FloatVector[F] | None = None
     ) -> float:

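Taken together, the additions above build the confidence interval in two stages: the Gaussian-process predictive standard deviation of the fitted model acts as a nonconformity measure, and per-tail quantile regressors map that measure to bounds on the (relative) leave-one-out residuals, which are then placed asymmetrically around the point prediction. The sketch below re-creates that idea in isolation, with ridge regression on random Fourier features standing in for the LS-SVM and plain in-sample residuals standing in for the leave-one-out residuals; every name and number here is illustrative, not the library's internals.

import numpy as np
from sklearn.linear_model import QuantileRegressor

rng = np.random.default_rng(0)

# Toy heteroscedastic regression data (purely illustrative).
X = rng.uniform(-3, 3, size=(500, 1))
y = np.sin(X[:, 0]) + rng.normal(scale=0.1 + 0.2 * np.abs(X[:, 0]))

# Stand-in base model: ridge regression on random Fourier features.
W = rng.normal(size=(1, 64))
b = rng.uniform(0, 2 * np.pi, size=64)
γ = 1e-2


def φ(A: np.ndarray) -> np.ndarray:
    """Map inputs to a random Fourier feature matrix."""
    return np.cos(A @ W + b)


Φ = φ(X)
G = γ * np.eye(Φ.shape[1]) + Φ.T @ Φ
β̂ = np.linalg.solve(G, Φ.T @ y)


def nonconformity(A: np.ndarray) -> np.ndarray:
    """Bayesian predictive standard deviation sqrt(φ(x)' (γ𝕀 + Φ'Φ)⁻¹ φ(x))."""
    φA = φ(A)
    return np.sqrt(np.sum(φA * np.linalg.solve(G, φA.T).T, axis=1))


# In-sample residuals stand in for the leave-one-out residuals used in the commit.
residuals = y - Φ @ β̂
confidence_level = 0.8
quantile = 1 - (1 - confidence_level) / 2

# One quantile regressor per tail, regressing the residual magnitude on the nonconformity.
pos, neg = residuals > 0, residuals < 0
upper = QuantileRegressor(quantile=quantile, alpha=1e-8, solver="highs").fit(
    nonconformity(X)[pos, np.newaxis], residuals[pos]
)
lower = QuantileRegressor(quantile=quantile, alpha=1e-8, solver="highs").fit(
    nonconformity(X)[neg, np.newaxis], -residuals[neg]
)

# Asymmetric confidence interval around the point prediction ŷ(x).
X_new = np.linspace(-3, 3, 7)[:, np.newaxis]
ŷ = φ(X_new) @ β̂
σ_new = nonconformity(X_new)[:, np.newaxis]
C = np.column_stack((ŷ - lower.predict(σ_new), ŷ + upper.predict(σ_new)))

The commit itself goes further: it also fits regressors on the residuals relative to ŷ, takes the tighter of the absolute and relative bounds per side, fits the quantile regressors lazily per requested confidence level, and converts the interval to calibrated probabilities for classifiers.
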
tests/test_neo_ls_svm.py

+26 -2
@@ -17,10 +17,12 @@ def test_compare_neo_ls_svm_with_svm(dataset: Dataset, table_vectorizer: TableVe
     X_train, X_test, y_train, y_test = dataset
     # Create the pipelines.
     num_unique = len(y_train.unique())
-    if num_unique == 2:  # noqa: PLR2004
+    binary = num_unique == 2  # noqa: PLR2004
+    multiclass = 2 < num_unique <= np.ceil(np.sqrt(len(y_train)))  # noqa: PLR2004
+    if binary:
         neo_ls_svm_pipeline = make_pipeline(table_vectorizer, NeoLSSVM())
         svm_pipeline = make_pipeline(table_vectorizer, SVC())
-    elif num_unique <= np.ceil(np.sqrt(len(y_train))):
+    elif multiclass:
         neo_ls_svm_pipeline = make_pipeline(table_vectorizer, OneVsRestClassifier(NeoLSSVM()))
         svm_pipeline = make_pipeline(table_vectorizer, OneVsRestClassifier(SVC()))
     else:
@@ -33,6 +35,28 @@ def test_compare_neo_ls_svm_with_svm(dataset: Dataset, table_vectorizer: TableVe
     neo_ls_svm_score = neo_ls_svm_pipeline.score(X_test, y_test)
     svm_score = svm_pipeline.score(X_test, y_test)
     assert neo_ls_svm_score > svm_score
+    # Verify the coverage of the confidence interval.
+    if multiclass:
+        return
+    confidence_level = 0.8
+    X_conf = neo_ls_svm_pipeline.predict_proba(
+        X_test, confidence_interval=True, confidence_level=confidence_level
+    )
+    if binary:
+        assert np.all(X_conf >= 0)
+        assert np.all(X_conf <= 1)
+        assert np.all(X_conf[:, 0, 0] <= X_conf[:, 1, 0])
+        assert np.all(X_conf[:, 0, 1] <= X_conf[:, 1, 1])
+        is_neg = y_test == neo_ls_svm_pipeline.steps[-1][1].classes_[0]
+        is_pos = ~is_neg
+        neg_covered = np.any(X_conf[:, :, 0] > 0.5, axis=1) & is_neg  # noqa: PLR2004
+        pos_covered = np.any(X_conf[:, :, 1] > 0.5, axis=1) & is_pos  # noqa: PLR2004
+        covered = neg_covered | pos_covered
+    elif not multiclass:
+        assert np.all(X_conf[:, 0] <= X_conf[:, 1])
+        covered = (X_conf[:, 0] <= y_test) & (y_test <= X_conf[:, 1])
+    coverage = np.mean(covered)
+    assert coverage >= confidence_level
 
 
 def test_sklearn_check_estimator() -> None:
