Skip to content

Commit

Permalink
feat(judge): Implement effect size post-MannWhitney check (#356)
Browse files Browse the repository at this point in the history
  • Loading branch information
csanden authored and skandragon committed Jul 23, 2018
1 parent f8f6a3d commit 3f7ed70
Show file tree
Hide file tree
Showing 10 changed files with 353 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {

val critical = MapUtils.getAsBooleanWithDefault(false, metricConfig.getAnalysisConfigurations, "canary", "critical")

//Effect Size Parameters
val allowedIncrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "allowedIncrease")
val allowedDecrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "allowedDecrease")

//=============================================
// Metric Transformation (Remove NaN values, etc.)
// ============================================
Expand All @@ -163,7 +167,8 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {
//=============================================
// Metric Classification
// ============================================
val mannWhitney = new MannWhitneyClassifier(tolerance = netflixJudgeConfigurationProperties.getTolerance, netflixJudgeConfigurationProperties.getConfLevel)
val thresholds = (allowedDecrease, allowedIncrease)
val mannWhitney = new MannWhitneyClassifier(tolerance = 0.25, confLevel = 0.98, effectSizeThresholds = thresholds)

val resultBuilder = CanaryAnalysisResult.builder()
.name(metric.getName)
Expand All @@ -179,7 +184,7 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {
resultBuilder
.classification(metricClassification.classification.toString)
.classificationReason(metricClassification.reason.orNull)
.resultMetadata(Map("ratio" -> metricClassification.ratio.asInstanceOf[Object]).asJava)
.resultMetadata(Map("ratio" -> metricClassification.deviation.asInstanceOf[Object]).asJava)
.build()

} catch {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ object NaNStrategy {
}
}

case class MetricClassification(classification: MetricClassificationLabel, reason: Option[String], ratio: Double)
case class MetricClassification(classification: MetricClassificationLabel, reason: Option[String], deviation: Double)

abstract class BaseMetricClassifier {
def classify(control: Metric, experiment: Metric,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ package com.netflix.kayenta.judge.classifiers.metric

import com.netflix.kayenta.judge.Metric
import com.netflix.kayenta.judge.preprocessing.Transforms
import com.netflix.kayenta.judge.stats.EffectSizes
import com.netflix.kayenta.mannwhitney.{MannWhitney, MannWhitneyParams}
import org.apache.commons.math3.stat.StatUtils

case class MannWhitneyResult(lowerConfidence: Double, upperConfidence: Double, estimate: Double)
case class MannWhitneyResult(lowerConfidence: Double, upperConfidence: Double, estimate: Double, deviation: Double)

class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) extends BaseMetricClassifier {
class MannWhitneyClassifier(tolerance: Double=0.25,
confLevel: Double=0.95,
effectSizeThresholds: (Double, Double) = (1.0, 1.0)) extends BaseMetricClassifier {

/**
* Mann-Whitney U Test
Expand All @@ -43,7 +46,10 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
val confInterval = testResult.confidenceInterval
val estimate = testResult.estimate

MannWhitneyResult(confInterval(0), confInterval(1), estimate)
//Calculate the deviation (Effect Size) between the experiment and control
val effectSize = calculateDeviation(experiment, control)

MannWhitneyResult(confInterval(0), confInterval(1), estimate, effectSize)
}

/**
Expand All @@ -61,7 +67,7 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
* Calculate the upper and lower bounds for classifying the metric.
* The bounds are calculated as a fraction of the Hodges–Lehmann estimator
*/
def calculateBounds(testResult: MannWhitneyResult): (Double, Double) = {
private def calculateBounds(testResult: MannWhitneyResult): (Double, Double) = {
val estimate = math.abs(testResult.estimate)
val criticalValue = tolerance * estimate

Expand All @@ -70,11 +76,43 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
(lowerBound, upperBound)
}

override def classify(control: Metric, experiment: Metric, direction: MetricDirection, nanStrategy: NaNStrategy): MetricClassification = {
/**
* Calculate the deviation (Effect Size) between the experiment and control
*/
/**
 * Calculate the deviation (Effect Size) between the experiment and control
 * as the mean ratio (experiment/control). Falls back to 1.0 (no deviation)
 * when the control mean is zero, since the ratio is undefined there.
 */
private def calculateDeviation(experiment: Array[Double], control: Array[Double]): Double = {
  val controlMean = StatUtils.mean(control)
  if (controlMean != 0.0) EffectSizes.meanRatio(control, experiment) else 1.0
}

/**
* Compare the experiment to the control using the Mann-Whitney U Test
*/
/**
 * Compare the experiment to the control using the Mann-Whitney U Test.
 * Classifies the metric as High or Low when the confidence interval of the
 * test falls outside the tolerance bounds (in the permitted direction),
 * otherwise Pass; the effect-size deviation is carried in every result.
 */
private def compare(control: Metric, experiment: Metric, direction: MetricDirection): MetricClassification = {

  // Perform the Mann-Whitney U Test and derive the tolerance bounds
  val testResult = MannWhitneyUTest(experiment.values, control.values)
  val (lowerBound, upperBound) = calculateBounds(testResult)

  val checkIncrease = direction == MetricDirection.Increase || direction == MetricDirection.Either
  val checkDecrease = direction == MetricDirection.Decrease || direction == MetricDirection.Either

  if (checkIncrease && testResult.lowerConfidence > upperBound) {
    MetricClassification(High, Some(s"The metric was classified as $High"), testResult.deviation)
  } else if (checkDecrease && testResult.upperConfidence < lowerBound) {
    MetricClassification(Low, Some(s"The metric was classified as $Low"), testResult.deviation)
  } else {
    MetricClassification(Pass, None, testResult.deviation)
  }
}

override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
if (nanStrategy == NaNStrategy.Remove) {
return MetricClassification(Nodata, None, 0.0)
return MetricClassification(Nodata, None, 1.0)
} else {
return MetricClassification(Pass, None, 1.0)
}
Expand All @@ -91,21 +129,18 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
return MetricClassification(Pass, None, 1.0)
}

//Perform the Mann-Whitney U Test
val mwResult = MannWhitneyUTest(experiment.values, control.values)
val meanRatio = StatUtils.mean(experiment.values)/StatUtils.mean(control.values)
val (lowerBound, upperBound) = calculateBounds(mwResult)
//Compare the experiment to the control using the Mann-Whitney U Test
val comparisonResult = compare(control, experiment, direction)

if((direction == MetricDirection.Increase || direction == MetricDirection.Either) && mwResult.lowerConfidence > upperBound){
val reason = s"The metric was classified as $High"
return MetricClassification(High, Some(reason), meanRatio)
//Check the Effect Size between the experiment and control
if(comparisonResult.classification == High && comparisonResult.deviation < effectSizeThresholds._2){
return MetricClassification(Pass, None, comparisonResult.deviation)

}else if((direction == MetricDirection.Decrease || direction == MetricDirection.Either) && mwResult.upperConfidence < lowerBound){
val reason = s"The metric was classified as $Low"
return MetricClassification(Low, Some(reason), meanRatio)
}else if(comparisonResult.classification == Low && comparisonResult.deviation > effectSizeThresholds._1) {
return MetricClassification(Pass, None, comparisonResult.deviation)
}

MetricClassification(Pass, None, meanRatio)
comparisonResult
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ import org.apache.commons.math3.stat.StatUtils
*/
class MeanInequalityClassifier extends BaseMetricClassifier {

override def classify(control: Metric, experiment: Metric, direction: MetricDirection, nanStrategy: NaNStrategy): MetricClassification = {
override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ class RandomClassifier(labels: List[MetricClassificationLabel] = List(Pass, High
*/
def getRandomLabel(list: List[MetricClassificationLabel]): MetricClassificationLabel = Random.shuffle(list).head

override def classify(control: Metric, experiment: Metric, direction: MetricDirection, nanStrategy: NaNStrategy): MetricClassification = {
override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
package com.netflix.kayenta.judge.stats

import com.netflix.kayenta.judge.Metric
import org.apache.commons.math.util.FastMath
import org.apache.commons.math3.stat.StatUtils
import org.apache.commons.math3.stat.descriptive.rank.Percentile
import org.apache.commons.math3.stat.descriptive.rank.Percentile.EstimationType

case class MetricStatistics(min: Double, max: Double, mean: Double, median: Double, count: Int){

case class MetricStatistics(min: Double, max: Double, mean: Double, std: Double, count: Int){
def toMap: Map[String, Any] = {
Map("min" -> min, "max" -> max, "mean" -> mean, "median" -> median, "count" -> count)
Map("min" -> min, "max" -> max, "mean" -> mean, "std" -> std, "count" -> count)
}
}

Expand All @@ -45,6 +47,10 @@ object DescriptiveStatistics {
if (metric.values.isEmpty) 0.0 else StatUtils.max(metric.values)
}

/**
 * Returns the standard deviation (square root of the variance) of the
 * values in the metric object; an empty metric yields 0.0.
 */
def std(metric: Metric): Double = {
  val values = metric.values
  if (values.nonEmpty) FastMath.sqrt(StatUtils.variance(values)) else 0.0
}

/**
* Returns an estimate of the pth percentile of the values in the metric object.
* Uses the R-7 estimation strategy when the desired percentile lies between two data points.
Expand All @@ -68,12 +74,15 @@ object DescriptiveStatistics {
percentile.evaluate(values, p)
}

/**
* Calculate a set of descriptive statistics for the input metric
*/
def summary(metric: Metric): MetricStatistics = {
val mean = this.mean(metric)
val median = this.median(metric)
val min = this.min(metric)
val max = this.max(metric)
val std = this.std(metric)
val count = metric.values.length
MetricStatistics(min, max, mean, median, count)
MetricStatistics(min, max, mean, std, count)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package com.netflix.kayenta.judge.stats

import com.netflix.kayenta.judge.Metric
import com.netflix.kayenta.judge.stats.DescriptiveStatistics._
import org.apache.commons.math.util.FastMath
import org.apache.commons.math3.stat.StatUtils


/**
 * Effect-size measures used by the judge to quantify how different the
 * experiment population is from the control population.
 */
object EffectSizes {

  /**
   * Mean Ratio
   * Measures the difference between the mean values as a ratio (experiment/control)
   * Note: This is included for backwards compatibility
   *
   * @throws IllegalArgumentException if the mean of the control is zero
   */
  def meanRatio(control: Array[Double], experiment: Array[Double]): Double = {
    // Compute the control mean once instead of twice (require + ratio)
    val controlMean = StatUtils.mean(control)
    require(controlMean != 0.0, "the mean of the control must be non-zero")
    StatUtils.mean(experiment) / controlMean
  }

  /**
   * Mean Ratio
   * Measures the difference between the mean values as a ratio (experiment/control)
   * Note: This is included for backwards compatibility
   *
   * @throws IllegalArgumentException if the mean of the control is zero
   */
  def meanRatio(control: Metric, experiment: Metric): Double = {
    val controlMean = mean(control)
    require(controlMean != 0.0, "the mean of the control must be non-zero")
    mean(experiment) / controlMean
  }

  /**
   * Mean Ratio
   * Measures the difference between the mean values as a ratio (experiment/control)
   * Note: This is included for backwards compatibility
   *
   * @throws IllegalArgumentException if the mean of the control is zero
   */
  def meanRatio(control: MetricStatistics, experiment: MetricStatistics): Double = {
    require(control.mean != 0.0, "the mean of the control must be non-zero")
    experiment.mean / control.mean
  }

  /**
   * Cohen's d (Pooled Standard Deviation)
   * Cohen's d is an effect size used to indicate the standardized difference between two means
   * https://en.wikipedia.org/wiki/Effect_size#Cohen's_d
   */
  def cohenD(control: Metric, experiment: Metric): Double = {
    cohenD(summary(control), summary(experiment))
  }

  /**
   * Cohen's d (Pooled Standard Deviation)
   * Cohen's d is an effect size used to indicate the standardized difference between two means
   * https://en.wikipedia.org/wiki/Effect_size#Cohen's_d
   *
   * NOTE(review): the result is undefined (NaN/Infinity) when
   * control.count + experiment.count <= 2 or both std values are zero —
   * callers should guard against degenerate samples.
   */
  def cohenD(control: MetricStatistics, experiment: MetricStatistics): Double = {
    // scala.math is used rather than the legacy commons-math (2.x) FastMath so
    // this module does not mix commons-math 2.x with the math3 artifacts used
    // elsewhere in the judge.
    val pooledVariance =
      ((experiment.count - 1) * math.pow(experiment.std, 2) +
        (control.count - 1) * math.pow(control.std, 2)) /
        (control.count + experiment.count - 2)
    math.abs(experiment.mean - control.mean) / math.sqrt(pooledVariance)
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ object MapUtils {
get(data, path: _*).getOrElse(default).asInstanceOf[Boolean]
}

/**
 * Looks up the value at `path` inside `data` and returns it as a Double.
 *
 * Parsed analysis configs may deserialize numeric literals such as `1` as
 * boxed Integer/Long rather than Double, in which case the previous blind
 * `asInstanceOf[Double]` cast threw a ClassCastException; any Number is
 * therefore widened via `doubleValue()`. Missing or non-numeric values
 * yield `default`.
 */
def getAsDoubleWithDefault(default: Double, data: Any, path: String*): Double = {
  get(data, path: _*) match {
    case Some(n: Number) => n.doubleValue()
    case _ => default
  }
}
}
Loading

0 comments on commit 3f7ed70

Please sign in to comment.