Skip to content

Commit

Permalink
feat(judge): Implement effect size post-MannWhitney check (#356)
Browse files Browse the repository at this point in the history
  • Loading branch information
csanden authored and skandragon committed Jul 23, 2018
1 parent f8f6a3d commit 3f7ed70
Show file tree
Hide file tree
Showing 10 changed files with 353 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {

val critical = MapUtils.getAsBooleanWithDefault(false, metricConfig.getAnalysisConfigurations, "canary", "critical")

//Effect Size Parameters
val allowedIncrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "allowedIncrease")
val allowedDecrease = MapUtils.getAsDoubleWithDefault(1.0, metricConfig.getAnalysisConfigurations, "canary", "effectSize", "allowedDecrease")

//=============================================
// Metric Transformation (Remove NaN values, etc.)
// ============================================
Expand All @@ -163,7 +167,8 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {
//=============================================
// Metric Classification
// ============================================
val mannWhitney = new MannWhitneyClassifier(tolerance = netflixJudgeConfigurationProperties.getTolerance, netflixJudgeConfigurationProperties.getConfLevel)
val thresholds = (allowedDecrease, allowedIncrease)
val mannWhitney = new MannWhitneyClassifier(tolerance = 0.25, confLevel = 0.98, effectSizeThresholds = thresholds)

val resultBuilder = CanaryAnalysisResult.builder()
.name(metric.getName)
Expand All @@ -179,7 +184,7 @@ class NetflixACAJudge extends CanaryJudge with StrictLogging {
resultBuilder
.classification(metricClassification.classification.toString)
.classificationReason(metricClassification.reason.orNull)
.resultMetadata(Map("ratio" -> metricClassification.ratio.asInstanceOf[Object]).asJava)
.resultMetadata(Map("ratio" -> metricClassification.deviation.asInstanceOf[Object]).asJava)
.build()

} catch {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ object NaNStrategy {
}
}

case class MetricClassification(classification: MetricClassificationLabel, reason: Option[String], ratio: Double)
case class MetricClassification(classification: MetricClassificationLabel, reason: Option[String], deviation: Double)

abstract class BaseMetricClassifier {
def classify(control: Metric, experiment: Metric,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ package com.netflix.kayenta.judge.classifiers.metric

import com.netflix.kayenta.judge.Metric
import com.netflix.kayenta.judge.preprocessing.Transforms
import com.netflix.kayenta.judge.stats.EffectSizes
import com.netflix.kayenta.mannwhitney.{MannWhitney, MannWhitneyParams}
import org.apache.commons.math3.stat.StatUtils

case class MannWhitneyResult(lowerConfidence: Double, upperConfidence: Double, estimate: Double)
case class MannWhitneyResult(lowerConfidence: Double, upperConfidence: Double, estimate: Double, deviation: Double)

class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) extends BaseMetricClassifier {
class MannWhitneyClassifier(tolerance: Double=0.25,
confLevel: Double=0.95,
effectSizeThresholds: (Double, Double) = (1.0, 1.0)) extends BaseMetricClassifier {

/**
* Mann-Whitney U Test
Expand All @@ -43,7 +46,10 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
val confInterval = testResult.confidenceInterval
val estimate = testResult.estimate

MannWhitneyResult(confInterval(0), confInterval(1), estimate)
//Calculate the deviation (Effect Size) between the experiment and control
val effectSize = calculateDeviation(experiment, control)

MannWhitneyResult(confInterval(0), confInterval(1), estimate, effectSize)
}

/**
Expand All @@ -61,7 +67,7 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
* Calculate the upper and lower bounds for classifying the metric.
* The bounds are calculated as a fraction of the Hodges–Lehmann estimator
*/
def calculateBounds(testResult: MannWhitneyResult): (Double, Double) = {
private def calculateBounds(testResult: MannWhitneyResult): (Double, Double) = {
val estimate = math.abs(testResult.estimate)
val criticalValue = tolerance * estimate

Expand All @@ -70,11 +76,43 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
(lowerBound, upperBound)
}

override def classify(control: Metric, experiment: Metric, direction: MetricDirection, nanStrategy: NaNStrategy): MetricClassification = {
/**
* Calculate the deviation (Effect Size) between the experiment and control
*/
/**
 * Calculate the deviation (Effect Size) between the experiment and control
 * as the mean ratio (experiment/control). Falls back to 1.0 (no deviation)
 * when the control mean is zero, since the ratio is undefined there.
 */
private def calculateDeviation(experiment: Array[Double], control: Array[Double]): Double = {
  val controlMean = StatUtils.mean(control)
  if (controlMean != 0.0) EffectSizes.meanRatio(control, experiment) else 1.0
}

/**
* Compare the experiment to the control using the Mann-Whitney U Test
*/
/**
 * Compare the experiment to the control using the Mann-Whitney U Test.
 * Classifies the metric as High or Low when the confidence interval of the
 * test falls outside the tolerance bounds (in the permitted direction),
 * otherwise Pass; the effect-size deviation is carried in every result.
 */
private def compare(control: Metric, experiment: Metric, direction: MetricDirection): MetricClassification = {

  // Perform the Mann-Whitney U Test and derive the tolerance bounds
  val testResult = MannWhitneyUTest(experiment.values, control.values)
  val (lowerBound, upperBound) = calculateBounds(testResult)

  val checkIncrease = direction == MetricDirection.Increase || direction == MetricDirection.Either
  val checkDecrease = direction == MetricDirection.Decrease || direction == MetricDirection.Either

  if (checkIncrease && testResult.lowerConfidence > upperBound) {
    MetricClassification(High, Some(s"The metric was classified as $High"), testResult.deviation)
  } else if (checkDecrease && testResult.upperConfidence < lowerBound) {
    MetricClassification(Low, Some(s"The metric was classified as $Low"), testResult.deviation)
  } else {
    MetricClassification(Pass, None, testResult.deviation)
  }
}

override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
if (nanStrategy == NaNStrategy.Remove) {
return MetricClassification(Nodata, None, 0.0)
return MetricClassification(Nodata, None, 1.0)
} else {
return MetricClassification(Pass, None, 1.0)
}
Expand All @@ -91,21 +129,18 @@ class MannWhitneyClassifier(tolerance: Double=0.25, confLevel: Double=0.95) exte
return MetricClassification(Pass, None, 1.0)
}

//Perform the Mann-Whitney U Test
val mwResult = MannWhitneyUTest(experiment.values, control.values)
val meanRatio = StatUtils.mean(experiment.values)/StatUtils.mean(control.values)
val (lowerBound, upperBound) = calculateBounds(mwResult)
//Compare the experiment to the control using the Mann-Whitney U Test
val comparisonResult = compare(control, experiment, direction)

if((direction == MetricDirection.Increase || direction == MetricDirection.Either) && mwResult.lowerConfidence > upperBound){
val reason = s"The metric was classified as $High"
return MetricClassification(High, Some(reason), meanRatio)
//Check the Effect Size between the experiment and control
if(comparisonResult.classification == High && comparisonResult.deviation < effectSizeThresholds._2){
return MetricClassification(Pass, None, comparisonResult.deviation)

}else if((direction == MetricDirection.Decrease || direction == MetricDirection.Either) && mwResult.upperConfidence < lowerBound){
val reason = s"The metric was classified as $Low"
return MetricClassification(Low, Some(reason), meanRatio)
}else if(comparisonResult.classification == Low && comparisonResult.deviation > effectSizeThresholds._1) {
return MetricClassification(Pass, None, comparisonResult.deviation)
}

MetricClassification(Pass, None, meanRatio)
comparisonResult
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ import org.apache.commons.math3.stat.StatUtils
*/
class MeanInequalityClassifier extends BaseMetricClassifier {

override def classify(control: Metric, experiment: Metric, direction: MetricDirection, nanStrategy: NaNStrategy): MetricClassification = {
override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ class RandomClassifier(labels: List[MetricClassificationLabel] = List(Pass, High
*/
def getRandomLabel(list: List[MetricClassificationLabel]): MetricClassificationLabel = Random.shuffle(list).head

override def classify(control: Metric, experiment: Metric, direction: MetricDirection, nanStrategy: NaNStrategy): MetricClassification = {
override def classify(control: Metric,
experiment: Metric,
direction: MetricDirection,
nanStrategy: NaNStrategy): MetricClassification = {

//Check if there is no-data for the experiment or control
if (experiment.values.isEmpty || control.values.isEmpty) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
package com.netflix.kayenta.judge.stats

import com.netflix.kayenta.judge.Metric
import org.apache.commons.math.util.FastMath
import org.apache.commons.math3.stat.StatUtils
import org.apache.commons.math3.stat.descriptive.rank.Percentile
import org.apache.commons.math3.stat.descriptive.rank.Percentile.EstimationType

case class MetricStatistics(min: Double, max: Double, mean: Double, median: Double, count: Int){

case class MetricStatistics(min: Double, max: Double, mean: Double, std: Double, count: Int){
def toMap: Map[String, Any] = {
Map("min" -> min, "max" -> max, "mean" -> mean, "median" -> median, "count" -> count)
Map("min" -> min, "max" -> max, "mean" -> mean, "std" -> std, "count" -> count)
}
}

Expand All @@ -45,6 +47,10 @@ object DescriptiveStatistics {
if (metric.values.isEmpty) 0.0 else StatUtils.max(metric.values)
}

/**
 * Returns the standard deviation (square root of the variance) of the
 * values in the metric object; an empty metric yields 0.0.
 */
def std(metric: Metric): Double = {
  val values = metric.values
  if (values.nonEmpty) FastMath.sqrt(StatUtils.variance(values)) else 0.0
}

/**
* Returns an estimate of the pth percentile of the values in the metric object.
* Uses the R-7 estimation strategy when the desired percentile lies between two data points.
Expand All @@ -68,12 +74,15 @@ object DescriptiveStatistics {
percentile.evaluate(values, p)
}

/**
* Calculate a set of descriptive statistics for the input metric
*/
def summary(metric: Metric): MetricStatistics = {
val mean = this.mean(metric)
val median = this.median(metric)
val min = this.min(metric)
val max = this.max(metric)
val std = this.std(metric)
val count = metric.values.length
MetricStatistics(min, max, mean, median, count)
MetricStatistics(min, max, mean, std, count)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package com.netflix.kayenta.judge.stats

import com.netflix.kayenta.judge.Metric
import com.netflix.kayenta.judge.stats.DescriptiveStatistics._
import org.apache.commons.math.util.FastMath
import org.apache.commons.math3.stat.StatUtils


/**
 * Effect-size measures used by the judge to quantify how different the
 * experiment population is from the control population.
 */
object EffectSizes {

  /**
   * Mean Ratio
   * Measures the difference between the mean values as a ratio (experiment/control)
   * Note: This is included for backwards compatibility
   *
   * @throws IllegalArgumentException if the mean of the control is zero
   */
  def meanRatio(control: Array[Double], experiment: Array[Double]): Double = {
    // Compute the control mean once instead of twice (require + ratio)
    val controlMean = StatUtils.mean(control)
    require(controlMean != 0.0, "the mean of the control must be non-zero")
    StatUtils.mean(experiment) / controlMean
  }

  /**
   * Mean Ratio
   * Measures the difference between the mean values as a ratio (experiment/control)
   * Note: This is included for backwards compatibility
   *
   * @throws IllegalArgumentException if the mean of the control is zero
   */
  def meanRatio(control: Metric, experiment: Metric): Double = {
    val controlMean = mean(control)
    require(controlMean != 0.0, "the mean of the control must be non-zero")
    mean(experiment) / controlMean
  }

  /**
   * Mean Ratio
   * Measures the difference between the mean values as a ratio (experiment/control)
   * Note: This is included for backwards compatibility
   *
   * @throws IllegalArgumentException if the mean of the control is zero
   */
  def meanRatio(control: MetricStatistics, experiment: MetricStatistics): Double = {
    require(control.mean != 0.0, "the mean of the control must be non-zero")
    experiment.mean / control.mean
  }

  /**
   * Cohen's d (Pooled Standard Deviation)
   * Cohen's d is an effect size used to indicate the standardized difference between two means
   * https://en.wikipedia.org/wiki/Effect_size#Cohen's_d
   */
  def cohenD(control: Metric, experiment: Metric): Double = {
    cohenD(summary(control), summary(experiment))
  }

  /**
   * Cohen's d (Pooled Standard Deviation)
   * Cohen's d is an effect size used to indicate the standardized difference between two means
   * https://en.wikipedia.org/wiki/Effect_size#Cohen's_d
   *
   * NOTE(review): the result is undefined (NaN/Infinity) when
   * control.count + experiment.count <= 2 or both std values are zero —
   * callers should guard against degenerate samples.
   */
  def cohenD(control: MetricStatistics, experiment: MetricStatistics): Double = {
    // scala.math is used rather than the legacy commons-math (2.x) FastMath so
    // this module does not mix commons-math 2.x with the math3 artifacts used
    // elsewhere in the judge.
    val pooledVariance =
      ((experiment.count - 1) * math.pow(experiment.std, 2) +
        (control.count - 1) * math.pow(control.std, 2)) /
        (control.count + experiment.count - 2)
    math.abs(experiment.mean - control.mean) / math.sqrt(pooledVariance)
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ object MapUtils {
get(data, path: _*).getOrElse(default).asInstanceOf[Boolean]
}

/**
 * Looks up the value at `path` inside `data` and returns it as a Double.
 *
 * Parsed analysis configs may deserialize numeric literals such as `1` as
 * boxed Integer/Long rather than Double, in which case the previous blind
 * `asInstanceOf[Double]` cast threw a ClassCastException; any Number is
 * therefore widened via `doubleValue()`. Missing or non-numeric values
 * yield `default`.
 */
def getAsDoubleWithDefault(default: Double, data: Any, path: String*): Double = {
  get(data, path: _*) match {
    case Some(n: Number) => n.doubleValue()
    case _ => default
  }
}
}
Loading

0 comments on commit 3f7ed70

Please sign in to comment.