Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script and workflow to detect flaky tests in testgrid. #17662

Merged
merged 2 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/measure-testgrid-flakiness.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Weekly job that runs scripts/measure-testgrid-flakiness.sh, which analyses
# TestGrid results and opens GitHub issues for flaky tests.
name: Measure TestGrid Flakiness

on:
  schedule:
    - cron: "0 0 * * 0" # run every Sunday at midnight (UTC)

# Workflow-wide default: read-only token.
permissions: read-all

jobs:
  measure-testgrid-flakiness:
    name: Measure TestGrid Flakiness
    runs-on: ubuntu-latest
    # The script creates issues with GITHUB_TOKEN, which a read-only token
    # cannot do. Job-level permissions replace the workflow default, so the
    # read access needed by the checkout step is listed explicitly as well.
    permissions:
      contents: read
      issues: write
    steps:
      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      # Expose the Go version pinned in .go-version to the setup-go step.
      - id: goversion
        run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT"
      - uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0
        with:
          go-version: ${{ steps.goversion.outputs.goversion }}
      - env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail

          ./scripts/measure-testgrid-flakiness.sh
15 changes: 15 additions & 0 deletions scripts/measure-testgrid-flakiness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Measures test flakiness and creates issues for flaky tests.
#
# Requires $GITHUB_TOKEN to be set. Runs the testgrid-analysis "flaky"
# command with --create-issue once per tab listed below.

set -euo pipefail

if [[ -z "${GITHUB_TOKEN:-}" ]]; then
  echo "Please set the \$GITHUB_TOKEN environment variable for the script to work"
  exit 1
fi

pushd ./tools/testgrid-analysis
for tab in ci-etcd-e2e-amd64 ci-etcd-unit-test-amd64; do
  go run main.go flaky --create-issue --dashboard=sig-etcd-periodics --tab="${tab}"
done
popd
146 changes: 146 additions & 0 deletions tools/testgrid-analysis/cmd/data.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Copyright 2024 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"fmt"
"io"
"net/http"
"os"
"strings"

apipb "github.com/GoogleCloudPlatform/testgrid/pb/api/v1"
statuspb "github.com/GoogleCloudPlatform/testgrid/pb/test_status"
"google.golang.org/protobuf/encoding/protojson"
)

var (
	// validTestStatuses lists the cell results that processRow counts as a run.
	validTestStatuses = []statuspb.TestStatus{
		statuspb.TestStatus_PASS,
		statuspb.TestStatus_FAIL,
		statuspb.TestStatus_FLAKY,
	}
	// failureTestStatuses lists the cell results that processRow counts as a
	// failed run.
	failureTestStatuses = []statuspb.TestStatus{
		statuspb.TestStatus_FAIL,
		statuspb.TestStatus_FLAKY,
	}

	// Set forms of the two lists above, keyed by the numeric status code so
	// they can be matched directly against a cell's Result.
	validTestStatusesInt   = intStatusSet(validTestStatuses)
	failureTestStatusesInt = intStatusSet(failureTestStatuses)

	// skippedTestStatuses collects the non-zero result codes that processRow
	// encountered but ignored because they are not in validTestStatuses.
	skippedTestStatuses = map[int32]struct{}{}
)

// TestResultSummary holds the aggregated results of a single testgrid row.
type TestResultSummary struct {
	// Name is the shortened test name (see shortenTestName).
	Name string
	// FullName is the row name exactly as reported by testgrid.
	FullName string
	// TotalRuns is the number of cells with a counted status.
	TotalRuns int
	// FailedRuns is the number of counted cells with a failure status.
	FailedRuns int
	// FailureRate is FailedRuns divided by TotalRuns.
	FailureRate float32
	// FailureLogs holds one markdown table row per failed run.
	FailureLogs []string
	// IssueBody is the markdown body used when filing an issue; it is only
	// populated when FailedRuns is greater than zero.
	IssueBody string
}

// fetchTestResultSummaries downloads the rows and headers of the given
// testgrid dashboard tab and returns one summary per row.
//
// The process exits with status 1 if either response cannot be decoded, so
// that malformed data is never mistaken for "no flaky tests".
func fetchTestResultSummaries(dashboard, tab string) []*TestResultSummary {
	// Fetch test data
	rowsURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/rows", dashboard, tab)
	headersURL := fmt.Sprintf("http://testgrid-data.k8s.io/api/v1/dashboards/%s/tabs/%s/headers", dashboard, tab)

	// Ignore fields the vendored proto definitions do not know about, so a
	// newer server schema does not abort decoding part-way through.
	decoder := protojson.UnmarshalOptions{DiscardUnknown: true}

	var testData apipb.ListRowsResponse
	var headerData apipb.ListHeadersResponse
	if err := decoder.Unmarshal(fetchJSON(rowsURL), &testData); err != nil {
		fmt.Println("Error decoding test rows:", err)
		os.Exit(1)
	}
	if err := decoder.Unmarshal(fetchJSON(headersURL), &headerData); err != nil {
		fmt.Println("Error decoding test headers:", err)
		os.Exit(1)
	}

	// Collect every row name up front; processRow needs the full list to
	// recognise parent (rollup) tests.
	allTests := make([]string, 0, len(testData.Rows))
	for _, row := range testData.Rows {
		allTests = append(allTests, row.Name)
	}

	// Process rows
	summaries := make([]*TestResultSummary, 0, len(testData.Rows))
	for _, row := range testData.Rows {
		summaries = append(summaries, processRow(dashboard, tab, row, allTests, headerData.Headers))
	}
	return summaries
}

// processRow builds the summary for a single testgrid row.
//
// Parent (rollup) tests and rows whose name does not start with "go.etcd.io"
// are returned with only the name fields set. For every other row, each cell
// with a counted status is tallied as a run; failed runs additionally get a
// markdown log row built from the header at the same index as the cell. When
// at least one run failed, IssueBody is filled with a ready-to-post report.
func processRow(dashboard, tab string, row *apipb.ListRowsResponse_Row, allTests []string, headers []*apipb.ListHeadersResponse_Header) *TestResultSummary {
	t := TestResultSummary{Name: shortenTestName(row.Name), FullName: row.Name}
	// we do not want to create issues for a parent test.
	if isParentTest(row.Name, allTests) {
		return &t
	}
	if !strings.HasPrefix(row.Name, "go.etcd.io") {
		return &t
	}
	total := 0
	failed := 0
	logs := []string{}
	for i, cell := range row.Cells {
		// ignore tests with status not in the validTestStatuses
		// cell result codes are listed in https://github.com/GoogleCloudPlatform/testgrid/blob/main/pb/test_status/test_status.proto
		if _, ok := validTestStatusesInt[cell.Result]; !ok {
			if cell.Result != 0 {
				skippedTestStatuses[cell.Result] = struct{}{}
			}
			continue
		}
		total++
		if _, ok := failureTestStatusesInt[cell.Result]; !ok {
			continue
		}
		failed++
		// The headers come from a separate request; if there is no header
		// for this column, still count the failure but skip the log row
		// instead of panicking on an out-of-range index.
		if i >= len(headers) || headers[i] == nil {
			continue
		}
		header := headers[i]
		// markdown table format of | commit | started | log |
		logs = append(logs, fmt.Sprintf("| %s | %s | https://prow.k8s.io/view/gs/kubernetes-jenkins/logs/%s/%s |", strings.Join(header.Extra, ","), header.Started.AsTime().String(), tab, header.Build))
	}
	t.FailedRuns = failed
	t.TotalRuns = total
	t.FailureLogs = logs
	// Leave the rate at zero when nothing was counted; 0/0 would yield NaN.
	if total > 0 {
		t.FailureRate = float32(failed) / float32(total)
	}
	if t.FailedRuns > 0 {
		dashboardURL := fmt.Sprintf("[%s](https://testgrid.k8s.io/%s#%s)", tab, dashboard, tab)
		t.IssueBody = fmt.Sprintf("## %s Test: %s \nTest failed %.1f%% (%d/%d) of the time\n\nfailure logs are:\n| commit | started | log |\n| --- | --- | --- |\n%s\n",
			dashboardURL, t.FullName, t.FailureRate*100, t.FailedRuns, t.TotalRuns, strings.Join(t.FailureLogs, "\n"))
		t.IssueBody += "\nPlease follow the [instructions in the contributing guide](https://github.com/etcd-io/etcd/blob/main/CONTRIBUTING.md#check-for-flaky-tests) to reproduce the issue.\n"
	}
	return &t
}

// isParentTest checks if a test is a rollup of some child tests.
func isParentTest(test string, allTests []string) bool {
for _, t := range allTests {
if t != test && strings.HasPrefix(t, test+"/") {
return true
}
}
return false
}

// fetchJSON performs an HTTP GET on url and returns the response body.
//
// Any failure (request error, non-200 status, or an error while reading the
// body) is printed and terminates the process with exit status 1, so callers
// never receive a truncated or error-page payload.
func fetchJSON(url string) []byte {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("Error fetching test data:", err)
		os.Exit(1)
	}
	defer resp.Body.Close()
	// An error page is not test data; refuse to hand it to the decoder.
	if resp.StatusCode != http.StatusOK {
		fmt.Printf("Error fetching test data: GET %s returned %s\n", url, resp.Status)
		os.Exit(1)
	}
	testBody, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Error reading test data:", err)
		os.Exit(1)
	}
	return testBody
}

// intStatusSet builds a set, keyed by the int32 value of each status, from
// the given list of statuspb.TestStatus values.
func intStatusSet(statuses []statuspb.TestStatus) map[int32]struct{} {
	set := make(map[int32]struct{}, len(statuses))
	for i := range statuses {
		code := int32(statuses[i])
		set[code] = struct{}{}
	}
	return set
}

// shortenTestName returns the part of fullname that follows its last ".",
// or fullname unchanged when it contains no ".".
func shortenTestName(fullname string) string {
	// LastIndex yields -1 when there is no ".", so the slice starts at 0.
	lastDot := strings.LastIndex(fullname, ".")
	return fullname[lastDot+1:]
}
75 changes: 75 additions & 0 deletions tools/testgrid-analysis/cmd/flaky.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright 2024 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"fmt"

"github.com/spf13/cobra"
)

// flakyCmd is the "flaky" subcommand: it detects flaky tests in a testgrid
// dashboard tab and, when requested, files GitHub issues for them. The work
// is done by flakyFunc.
var flakyCmd = &cobra.Command{
	Use:   "flaky",
	Short: "detect flaky tests",
	Long:  `detect flaky tests within the dashboard#tab, and create GitHub issues if desired.`,
	Run:   flakyFunc,
}

// Values bound to the flags of the flaky command (registered in init).
var (
	// createGithubIssue enables filing an issue for each flaky test.
	createGithubIssue bool
	// flakyThreshold is the failure fraction at or above which a test is
	// considered flaky.
	flakyThreshold float32
	// minRuns is the minimum number of runs a test needs to be analysed.
	minRuns int
	// githubOwner and githubRepo identify the repository issues are filed in.
	githubOwner, githubRepo string
)

// lineSep is the separator line printed between sections of the report.
var lineSep = "-------------------------------------------------------------"

// init attaches the flaky command to the root command and registers its
// flags together with their default values.
func init() {
	rootCmd.AddCommand(flakyCmd)

	flags := flakyCmd.Flags()
	flags.BoolVar(&createGithubIssue, "create-issue", false, "create Github issue for each flaky test")
	flags.Float32Var(&flakyThreshold, "flaky-threshold", 0.1, "fraction threshold of test failures for a test to be considered flaky")
	flags.IntVar(&minRuns, "min-runs", 20, "minimum test runs for a test to be included in flaky analysis")
	flags.StringVar(&githubOwner, "github-owner", "etcd-io", "the github organization to create the issue for")
	flags.StringVar(&githubRepo, "github-repo", "etcd", "the github repo to create the issue for")
}

// flakyFunc is the Run handler of the flaky command. It fetches the summaries
// for the selected dashboard tab, keeps the tests that have at least minRuns
// runs and a failure rate of at least flakyThreshold, prints a report for
// each of them and, when createGithubIssue is set, files issues for them.
func flakyFunc(cmd *cobra.Command, args []string) {
	fmt.Printf("flaky called, for %s#%s, createGithubIssue=%v, githubRepo=%s/%s, flakyThreshold=%f, minRuns=%d\n", dashboard, tab, createGithubIssue, githubOwner, githubRepo, flakyThreshold, minRuns)

	flakyTests := []*TestResultSummary{}
	for _, summary := range fetchTestResultSummaries(dashboard, tab) {
		// Keep the comparison in this exact form: a NaN failure rate must
		// compare false and therefore be excluded.
		isFlaky := summary.TotalRuns >= minRuns && summary.FailureRate >= flakyThreshold
		if !isFlaky {
			continue
		}
		flakyTests = append(flakyTests, summary)
	}

	fmt.Println(lineSep)
	fmt.Printf("Detected total %d flaky tests for %s#%s\n", len(flakyTests), dashboard, tab)
	fmt.Println(lineSep)

	// Per-test report; prints nothing when no flaky test was found.
	for _, flaky := range flakyTests {
		fmt.Printf("%s\n%s\n%s\n", lineSep, flaky.IssueBody, lineSep)
	}

	if len(flakyTests) > 0 && createGithubIssue {
		createIssues(flakyTests, []string{"type/flake"})
	}
}
78 changes: 78 additions & 0 deletions tools/testgrid-analysis/cmd/github.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright 2024 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"context"
"fmt"
"os"
"strings"

"github.com/google/go-github/v60/github"
)

// createIssues files a GitHub issue for every test in tests that does not
// already have a matching open issue. Open issues are looked up by labels;
// new issues are created with labels plus "help wanted".
func createIssues(tests []*TestResultSummary, labels []string) {
	openIssues := getOpenIssues(labels)

	// Build the label list once, in fresh storage: appending directly to the
	// caller's slice on every iteration could write into its backing array.
	issueLabels := make([]string, 0, len(labels)+1)
	issueLabels = append(issueLabels, labels...)
	issueLabels = append(issueLabels, "help wanted")

	for _, t := range tests {
		createIssueIfNonExist(tab, t, openIssues, issueLabels)
	}
}

// getOpenIssues returns every issue of githubOwner/githubRepo that carries
// the given labels, following pagination until the last page. The request
// leaves the state filter unset (presumably the API default of open issues;
// confirm against the GitHub API docs). It authenticates with the
// GITHUB_TOKEN environment variable and panics if a request fails.
func getOpenIssues(labels []string) []*github.Issue {
	ctx := context.Background()
	client := github.NewClient(nil).WithAuthToken(os.Getenv("GITHUB_TOKEN"))

	opts := &github.IssueListByRepoOptions{
		Labels:      labels,
		ListOptions: github.ListOptions{PerPage: 100},
	}

	collected := make([]*github.Issue, 0)
	for more := true; more; {
		page, resp, err := client.Issues.ListByRepo(ctx, githubOwner, githubRepo, opts)
		if err != nil {
			panic(err)
		}
		collected = append(collected, page...)
		// NextPage is zero once the last page has been returned.
		opts.Page = resp.NextPage
		more = resp.NextPage != 0
	}

	fmt.Printf("There are %d issues open with label %v\n", len(collected), labels)
	return collected
}

// createIssueIfNonExist opens a "Flaky test <name>" issue for t unless one of
// the given issues already mentions the test name in its title.
//
// The new issue uses t.IssueBody as its body and the given labels. It
// authenticates with the GITHUB_TOKEN environment variable and panics if the
// issue cannot be created. The tab parameter is currently unused.
func createIssueIfNonExist(tab string, t *TestResultSummary, issues []*github.Issue, labels []string) {
	// check if there is already an open issue regarding this test
	for _, issue := range issues {
		// GetTitle is nil-safe, unlike dereferencing issue.Title directly.
		if strings.Contains(issue.GetTitle(), t.Name) {
			fmt.Printf("%s is already open for test %s\n\n", issue.GetHTMLURL(), t.Name)
			return
		}
	}
	fmt.Printf("Opening new issue for %s\n", t.Name)
	client := github.NewClient(nil).WithAuthToken(os.Getenv("GITHUB_TOKEN"))
	ctx := context.Background()
	req := &github.IssueRequest{
		Title:  github.String(fmt.Sprintf("Flaky test %s", t.Name)),
		Body:   &t.IssueBody,
		Labels: &labels,
	}
	issue, _, err := client.Issues.Create(ctx, githubOwner, githubRepo, req)
	if err != nil {
		panic(err)
	}
	fmt.Printf("New issue %s created for %s\n\n", issue.GetHTMLURL(), t.Name)
}
44 changes: 44 additions & 0 deletions tools/testgrid-analysis/cmd/root.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright 2024 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"os"

"github.com/spf13/cobra"
)

// dashboard and tab hold the values of the persistent --dashboard and --tab
// flags, which select the testgrid data to retrieve.
var dashboard, tab string

// rootCmd is the top-level testgrid-analysis command; it defines no Run
// handler of its own.
var rootCmd = &cobra.Command{
	Long:  "testgrid-analysis analyzes the testgrid test results of sig-etcd.",
	Short: "testgrid-analysis",
	Use:   "testgrid-analysis",
}

// Execute runs the root command and terminates the process with exit status
// 1 if it returns an error.
func Execute() {
	if err := rootCmd.Execute(); err != nil {
		os.Exit(1)
	}
}

// init registers the persistent --dashboard and --tab flags on the root
// command, together with their default values.
func init() {
	persistent := rootCmd.PersistentFlags()
	persistent.StringVar(&dashboard, "dashboard", "sig-etcd-periodics", "testgrid dashboard to retrieve data from")
	persistent.StringVar(&tab, "tab", "ci-etcd-e2e-amd64", "testgrid tab within the dashboard to retrieve data from")
}
Loading
Loading