Skip to content

Commit

Permalink
feat(judge): Implement critical effect size check (#369)
Browse files Browse the repository at this point in the history
  • Loading branch information
csanden authored and skandragon committed Sep 6, 2018
1 parent cabd26a commit 1a6aa79
Show file tree
Hide file tree
Showing 10 changed files with 211 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,17 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {
val nanStrategyString = MapUtils.getAsStringWithDefault("none", metricConfig.getAnalysisConfigurations, "canary", "nanStrategy")
val nanStrategy = NaNStrategy.parse(nanStrategyString)

val critical = MapUtils.getAsBooleanWithDefault(false, metricConfig.getAnalysisConfigurations, "canary", "critical")
val isCriticalMetric = MapUtils.getAsBooleanWithDefault(false, metricConfig.getAnalysisConfigurations, "canary", "critical")

//Effect Size Parameters
val allowedIncrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "allowedIncrease")
val allowedDecrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "allowedDecrease")
val effectSizeThresholds = (allowedDecrease, allowedIncrease)

//Critical Effect Size Parameters
val criticalIncrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "criticalIncrease")
val criticalDecrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "criticalDecrease")
val criticalThresholds = (criticalDecrease, criticalIncrease)

//=============================================
// Metric Transformation (Remove NaN values, etc.)
Expand All @@ -167,8 +173,7 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {
//=============================================
// Metric Classification
// ============================================
val thresholds = (allowedDecrease, allowedIncrease)
val mannWhitney = new MannWhitneyClassifier(tolerance = 0.25, confLevel = 0.98, effectSizeThresholds = thresholds)
val mannWhitney = new MannWhitneyClassifier(tolerance = 0.25, confLevel = 0.98, effectSizeThresholds, criticalThresholds)

val resultBuilder = CanaryAnalysisResult.builder()
.name(metric.getName)
Expand All @@ -177,13 +182,13 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {
.groups(metricConfig.getGroups)
.experimentMetadata(Map("stats" -> experimentStats.toMap.asJava.asInstanceOf[Object]).asJava)
.controlMetadata(Map("stats" -> controlStats.toMap.asJava.asInstanceOf[Object]).asJava)
.critical(critical)

try {
val metricClassification = mannWhitney.classify(transformedControl, transformedExperiment, directionality, nanStrategy)
val metricClassification = mannWhitney.classify(transformedControl, transformedExperiment, directionality, nanStrategy, isCriticalMetric)
resultBuilder
.classification(metricClassification.classification.toString)
.classificationReason(metricClassification.reason.orNull)
.critical(metricClassification.critical)
.resultMetadata(Map("ratio" -> metricClassification.deviation.asInstanceOf[Object]).asJava)
.build()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,14 @@ object NaNStrategy {
}
}

case class MetricClassification(classification: MetricClassificationLabel, reason: Option[String], deviation: Double)
case class MetricClassification(classification: MetricClassificationLabel,
reason: Option[String],
deviation: Double,
critical: Boolean)

abstract class BaseMetricClassifier {
def classify(control: Metric, experiment: Metric,
direction: MetricDirection = MetricDirection.Either,
nanStrategy: NaNStrategy = NaNStrategy.Remove): MetricClassification
nanStrategy: NaNStrategy = NaNStrategy.Remove,
isCriticalMetric: Boolean = false): MetricClassification
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ import com.netflix.kayenta.mannwhitney.{MannWhitney, MannWhitneyParams}
import org.apache.commons.math3.stat.StatUtils

case class MannWhitneyResult(lowerConfidence: Double, upperConfidence: Double, estimate: Double, deviation: Double)
case class ComparisonResult(classification: MetricClassificationLabel, reason: Option[String], deviation: Double)

class MannWhitneyClassifier(tolerance: Double=0.25,
confLevel: Double=0.95,
effectSizeThresholds: (Double, Double) = (1.0, 1.0)) extends BaseMetricClassifier {
effectSizeThresholds: (Double, Double) = (1.0, 1.0),
criticalThresholds: (Double, Double) = (1.0, 1.0)) extends BaseMetricClassifier {

/**
* Mann-Whitney U Test
Expand Down Expand Up @@ -84,63 +86,88 @@ class MannWhitneyClassifier(tolerance: Double=0.25,
}

/**
* Compare the experiment to the control using the Mann-Whitney U Test
* Compare the experiment to the control using the Mann-Whitney U Test and check the magnitude of the effect
*/
private def compare(control: Metric, experiment: Metric, direction: MetricDirection): MetricClassification = {
private def compare(control: Metric,
experiment: Metric,
direction: MetricDirection,
effectSizeThresholds: (Double, Double)): ComparisonResult = {

//Perform the Mann-Whitney U Test
val mwResult = MannWhitneyUTest(experiment.values, control.values)
val (lowerBound, upperBound) = calculateBounds(mwResult)

if((direction == MetricDirection.Increase || direction == MetricDirection.Either) && mwResult.lowerConfidence > upperBound){
val reason = s"The metric was classified as $High"
return MetricClassification(High, Some(reason), mwResult.deviation)
//Check if the experiment is high in comparison to the control
val isHigh = {
(direction == MetricDirection.Increase || direction == MetricDirection.Either) &&
mwResult.lowerConfidence > upperBound &&
mwResult.deviation >= effectSizeThresholds._2
}

}else if((direction == MetricDirection.Decrease || direction == MetricDirection.Either) && mwResult.upperConfidence < lowerBound){
val reason = s"The metric was classified as $Low"
return MetricClassification(Low, Some(reason), mwResult.deviation)
//Check if the experiment is low in comparison to the control
val isLow = {
(direction == MetricDirection.Decrease || direction == MetricDirection.Either) &&
mwResult.upperConfidence < lowerBound &&
mwResult.deviation <= effectSizeThresholds._1
}

MetricClassification(Pass, None, mwResult.deviation)
if(isHigh){
val reason = s"${experiment.name} was classified as $High"
ComparisonResult(High, Some(reason), mwResult.deviation)

}else if(isLow){
val reason = s"${experiment.name} was classified as $Low"
ComparisonResult(Low, Some(reason), mwResult.deviation)

} else {
ComparisonResult(Pass, None, mwResult.deviation)
}
}

override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {
nanStrategy: NaNStrategy,
isCriticalMetric: Boolean): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
if (nanStrategy == NaNStrategy.Remove) {
return MetricClassification(Nodata, None, 1.0)
return MetricClassification(Nodata, None, 1.0, isCriticalMetric)
} else {
return MetricClassification(Pass, None, 1.0)
return MetricClassification(Pass, None, 1.0, critical = false)
}
}

//Check if the experiment and control data are equal
if (experiment.values.sorted.sameElements(control.values.sorted)) {
val reason = s"The ${experiment.label} and ${control.label} data are identical"
return MetricClassification(Pass, Some(reason), 1.0)
return MetricClassification(Pass, Some(reason), 1.0, critical = false)
}

//Check the number of unique observations
if (experiment.values.union(control.values).distinct.length == 1) {
return MetricClassification(Pass, None, 1.0)
return MetricClassification(Pass, None, 1.0, critical = false)
}

//Compare the experiment to the control using the Mann-Whitney U Test
val comparisonResult = compare(control, experiment, direction)
//Compare the experiment to the control using the Mann-Whitney U Test, checking the magnitude of the effect
val comparison = compare(control, experiment, direction, effectSizeThresholds)

//Check the Effect Size between the experiment and control
if(comparisonResult.classification == High && comparisonResult.deviation < effectSizeThresholds._2){
return MetricClassification(Pass, None, comparisonResult.deviation)
//Check if the metric was marked as critical, and if the metric was classified as a failure (High, Low)
if(isCriticalMetric && comparison.classification == High && comparison.deviation >= criticalThresholds._2){
val reason = s"The metric ${experiment.name} was classified as $High (Critical)"
MetricClassification(High, Some(reason), comparison.deviation, critical = true)

}else if(comparisonResult.classification == Low && comparisonResult.deviation > effectSizeThresholds._1) {
return MetricClassification(Pass, None, comparisonResult.deviation)
}else if(isCriticalMetric && comparison.classification == Low && comparison.deviation <= criticalThresholds._1){
val reason = s"The metric ${experiment.name} was classified as $Low (Critical)"
MetricClassification(Low, Some(reason), comparison.deviation, critical = true)

}else if(isCriticalMetric && (comparison.classification == Nodata || comparison.classification == Error)){
MetricClassification(comparison.classification, comparison.reason, comparison.deviation, critical = true)

}else{
MetricClassification(comparison.classification, comparison.reason, comparison.deviation, critical = false)
}

comparisonResult
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@ class MeanInequalityClassifier extends BaseMetricClassifier {
override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {
nanStrategy: NaNStrategy,
isCriticalMetric: Boolean): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
if (nanStrategy == NaNStrategy.Remove) {
return MetricClassification(Nodata, None, 0.0)
return MetricClassification(Nodata, None, 0.0, isCriticalMetric)
} else {
return MetricClassification(Pass, None, 1.0)
return MetricClassification(Pass, None, 1.0, critical = false)
}
}

Expand All @@ -47,14 +48,14 @@ class MeanInequalityClassifier extends BaseMetricClassifier {

if ((direction == MetricDirection.Increase || direction == MetricDirection.Either) && experimentMean > controlMean) {
val reason = s"The ${experiment.label} mean was greater than the ${control.label} mean"
return MetricClassification(High, Some(reason), ratio)
return MetricClassification(High, Some(reason), ratio, isCriticalMetric)

} else if ((direction == MetricDirection.Decrease || direction == MetricDirection.Either) && experimentMean < controlMean) {
val reason = s"The ${experiment.label} mean was less than the ${control.label} mean"
return MetricClassification(Low, Some(reason), ratio)
return MetricClassification(Low, Some(reason), ratio, isCriticalMetric)
}

MetricClassification(Pass, None, 1.0)
MetricClassification(Pass, None, 1.0, critical = false)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -37,26 +37,27 @@ class RandomClassifier(labels: List[MetricClassificationLabel] = List(Pass, High
override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {
nanStrategy: NaNStrategy,
isCriticalMetric: Boolean): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
if (nanStrategy == NaNStrategy.Remove) {
return MetricClassification(Nodata, None, 0.0)
return MetricClassification(Nodata, None, 0.0, isCriticalMetric)
} else {
return MetricClassification(Pass, None, 1.0)
return MetricClassification(Pass, None, 1.0, critical = false)
}
}

//Check if the experiment and control data are equal
if (experiment.values.sameElements(control.values)){
return MetricClassification(Pass, None, 1.0)
return MetricClassification(Pass, None, 1.0, critical = false)
}

val ratio = StatUtils.mean(experiment.values)/StatUtils.mean(control.values)
val randomClassificationLabel = getRandomLabel(labels)

MetricClassification(randomClassificationLabel, None, ratio)
MetricClassification(randomClassificationLabel, None, ratio, isCriticalMetric)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ class ThresholdScoreClassifier(passThreshold: Double, warningThreshold: Double)
override def classify(scoreResults: ScoreResult): ScoreClassification = {
val score = scoreResults.summaryScore
if(score >= passThreshold){
ScoreClassification(Pass, None, score)
ScoreClassification(Pass, scoreResults.reason, score)
}else if(score >= warningThreshold){
ScoreClassification(Marginal, None, score)
ScoreClassification(Marginal, scoreResults.reason, score)
}else{
ScoreClassification(Fail, None, score)
ScoreClassification(Fail, scoreResults.reason, score)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package com.netflix.kayenta.judge.scorers

import com.netflix.kayenta.canary.results.CanaryAnalysisResult

case class ScoreResult(groupScores: Option[List[GroupScore]], summaryScore: Double, numMetrics: Double)
case class ScoreResult(groupScores: Option[List[GroupScore]], summaryScore: Double, numMetrics: Double, reason: Option[String])
case class GroupScore(name: String, score: Double, noData: Boolean, labelCounts: Map[String, Int], numMetrics: Double)

abstract class BaseScorer {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._

class WeightedSumScorer(groupWeights: Map[String, Double]) extends BaseScorer {

val NODATA_THRESHOLD = 50.0
val NODATA_THRESHOLD = 50

private def calculateGroupScore(groupName: String, classificationLabels: List[String]): GroupScore = {
val labelCounts = classificationLabels.groupBy(identity).mapValues(_.size)
Expand Down Expand Up @@ -76,9 +76,8 @@ class WeightedSumScorer(groupWeights: Map[String, Double]) extends BaseScorer {
MathUtils.round(summaryScore, 2)
}

def criticalFailures(results: List[CanaryAnalysisResult]): Boolean = {
val criticalFailures = results.filter { result => result.isCritical && !result.getClassification.equals(Pass.toString) }
criticalFailures.nonEmpty
def criticalFailures(results: List[CanaryAnalysisResult]): List[CanaryAnalysisResult] = {
results.filter { result => result.isCritical && !result.getClassification.equals(Pass.toString) }
}

def tooManyNodata(results: List[CanaryAnalysisResult]): Boolean = {
Expand All @@ -90,13 +89,18 @@ class WeightedSumScorer(groupWeights: Map[String, Double]) extends BaseScorer {
override def score(results: List[CanaryAnalysisResult]): ScoreResult = {
val groupScores = calculateGroupScores(results)

if (criticalFailures(results)) {
ScoreResult(Some(groupScores), 0.0, results.size)
val failures = criticalFailures(results)
if (failures.nonEmpty) {
val reason = s"Canary Failed: ${failures.head.getClassificationReason}"
ScoreResult(Some(groupScores), 0.0, results.size, Some(reason))

} else if (tooManyNodata(results)) {
ScoreResult(Some(groupScores), 0.0, results.size)
val reason = s"Canary Failed: $NODATA_THRESHOLD% or more metrics returned ${Nodata.toString.toUpperCase}"
ScoreResult(Some(groupScores), 0.0, results.size, Some(reason))

} else {
val summaryScore = calculateSummaryScore(groupScores)
ScoreResult(Some(groupScores), summaryScore, results.size)
ScoreResult(Some(groupScores), summaryScore, results.size , reason = None)
}
}
}
Expand Down
Loading

0 comments on commit 1a6aa79

Please sign in to comment.