diff --git a/devops/tooling/statuspage/metrics-shipper/index.mjs b/devops/tooling/statuspage/metrics-shipper/index.mjs
new file mode 100644
index 000000000..1f384022c
--- /dev/null
+++ b/devops/tooling/statuspage/metrics-shipper/index.mjs
@@ -0,0 +1,292 @@
+import * as https from 'https';
+import {
+  CloudWatchClient,
+  GetMetricDataCommand,
+} from '@aws-sdk/client-cloudwatch';
+import { STSClient, AssumeRoleCommand } from '@aws-sdk/client-sts';
+
+const PAGE_ID = 'zlzvfpy9x7fy';
+
+// AWS account whose CloudWatch metrics are queried below.
+const METRICS_ACCOUNT_ID = '561178107736';
+
+// Each run queries the most recent 10 minutes of every metric.
+const LOOKBACK_MS = 10 * 60000;
+
+const sts = new STSClient({ region: 'us-east-2' });
+
+const apiKey = process.env.STATUSPAGE_API_KEY;
+
+// Must match the role name that the function's IAM policy allows it to assume
+// (the CrossAccountSharingRoleName parameter in template.yml).
+const roleName =
+  process.env.CROSS_ACCOUNT_SHARING_ROLE_NAME ||
+  'CloudWatch-CrossAccountSharingRole';
+
+/**
+ * Builds the ARN of the cross-account sharing role in the given account.
+ *
+ * @param {string} accountId - AWS account ID that owns the role
+ * @returns {string} The IAM role ARN
+ */
+function roleArn(accountId) {
+  return `arn:aws:iam::${accountId}:role/${roleName}`;
+}
+
+/**
+ * POSTs a batch of metric data points to the Statuspage metrics API.
+ *
+ * @param {string} pageId - Statuspage page ID
+ * @param {Object<string, Array<{timestamp: number, value: number}>>} data -
+ *   Data points keyed by Statuspage metric ID
+ * @returns {Promise<void>} Resolves when Statuspage responds with a 2xx status
+ * @throws {Error} If the request fails or the response status is not 2xx
+ */
+function sendDataPoints(pageId, data) {
+  return new Promise((resolve, reject) => {
+    const apiBase = 'https://api.statuspage.io/v1';
+    const url = `${apiBase}/pages/${pageId}/metrics/data.json`;
+
+    const authHeader = { Authorization: `OAuth ${apiKey}` };
+    const options = { method: 'POST', headers: authHeader };
+
+    const request = https.request(url, options, (res) => {
+      res.on('error', reject);
+
+      res.on('end', () => {
+        // Every branch settles the promise. Logging and returning without
+        // settling would leave the handler waiting until the Lambda timeout.
+        if (res.statusCode === 401) {
+          reject(
+            new Error(
+              'Error encountered. Please ensure that your page code and authorization key are correct.',
+            ),
+          );
+        } else if (res.statusCode < 200 || res.statusCode >= 300) {
+          reject(
+            new Error(
+              `Statuspage responded with HTTP ${res.statusCode} ${res.statusMessage}`,
+            ),
+          );
+        } else {
+          resolve();
+        }
+      });
+
+      // The response body isn't used, but it must be consumed for 'end' to
+      // be emitted.
+      res.resume();
+    });
+
+    // Connection-level failures (DNS, TLS, resets) are emitted on the request,
+    // not the response.
+    request.on('error', reject);
+
+    request.end(JSON.stringify({ data }));
+  });
+}
+
+/**
+ * Creates a CloudWatch client for a region, using temporary credentials from
+ * the cross-account sharing role in the given account.
+ *
+ * @param {string} accountId - AWS account ID to read metrics from
+ * @param {string} region - AWS region to query
+ * @returns {Promise<CloudWatchClient>} Client bound to the assumed role
+ */
+async function cloudWatchClient(accountId, region) {
+  const role = await sts.send(
+    new AssumeRoleCommand({
+      RoleArn: roleArn(accountId),
+      RoleSessionName: 'statuspage_data_sender',
+    }),
+  );
+
+  return new CloudWatchClient({
+    region,
+    credentials: {
+      accessKeyId: role.Credentials.AccessKeyId,
+      secretAccessKey: role.Credentials.SecretAccessKey,
+      sessionToken: role.Credentials.SessionToken,
+    },
+  });
+}
+
+/**
+ * Queries a single CloudWatch metric over the lookback window.
+ *
+ * @param {CloudWatchClient} cloudwatch - Client for the metric's region
+ * @param {string} id - Query ID used in the GetMetricData request
+ * @param {object} metricStat - MetricStat definition for the query
+ * @returns {Promise<Array<{timestamp: number, value: number}>>} One entry per
+ *   returned value, with the timestamp in epoch seconds
+ */
+async function recentDatapoints(cloudwatch, id, metricStat) {
+  const now = Date.now();
+
+  const results = await cloudwatch.send(
+    new GetMetricDataCommand({
+      MetricDataQueries: [{ Id: id, MetricStat: metricStat }],
+      StartTime: new Date(now - LOOKBACK_MS),
+      EndTime: new Date(now),
+    }),
+  );
+
+  const [result] = results.MetricDataResults ?? [];
+
+  if (!result) {
+    return [];
+  }
+
+  return result.Values.map((value, index) => ({
+    timestamp: Math.floor(+result.Timestamps[index] / 1000),
+    value,
+  }));
+}
+
+/**
+ * Fetches the recent maximum IteratorAge of a Lambda function.
+ *
+ * @param {string} accountId - AWS account ID that owns the function
+ * @param {string} region - AWS region the function runs in
+ * @param {string} functionName - Name of the Lambda function
+ * @returns {Promise<Object<string, number>>} Iterator age in milliseconds,
+ *   keyed by epoch-second timestamp
+ */
+async function iteratorAgeData(accountId, region, functionName) {
+  const cloudwatch = await cloudWatchClient(accountId, region);
+
+  const datapoints = await recentDatapoints(cloudwatch, 'iteratorAge', {
+    Metric: {
+      Namespace: 'AWS/Lambda',
+      MetricName: 'IteratorAge',
+      Dimensions: [
+        {
+          Name: 'FunctionName',
+          Value: functionName,
+        },
+      ],
+    },
+    Period: 60,
+    Stat: 'Maximum',
+    Unit: 'Milliseconds',
+  });
+
+  return Object.fromEntries(
+    datapoints.map(({ timestamp, value }) => [timestamp, value]),
+  );
+}
+
+/**
+ * Builds the Statuspage data for the analytics processing latency metric: the
+ * larger of the two regional iterator ages at each timestamp, in seconds.
+ *
+ * @returns {Promise<Object<string, Array<{timestamp: number, value: number}>>>}
+ *   Data points keyed by Statuspage metric ID
+ */
+async function analyticsProcessingLatency() {
+  const [usEast1Data, usWest2Data] = await Promise.all([
+    iteratorAgeData(
+      METRICS_ACCOUNT_ID,
+      'us-east-1',
+      'infrastructure-cd-root-production-A-CountsFunction-h0qlwOs7VMJZ',
+    ),
+    iteratorAgeData(
+      METRICS_ACCOUNT_ID,
+      'us-west-2',
+      'infrastructure-cd-root-production-A-CountsFunction-HZN3g5yVxDSq',
+    ),
+  ]);
+
+  const uniqueTimestamps = [
+    ...new Set([...Object.keys(usEast1Data), ...Object.keys(usWest2Data)]),
+  ];
+
+  const datapoints = uniqueTimestamps.map((timestamp) => {
+    // Only compare regions that reported a value for this timestamp. Checking
+    // for undefined (rather than truthiness) keeps a legitimate age of 0 from
+    // being treated as missing, which would produce NaN below.
+    const values = [usEast1Data[timestamp], usWest2Data[timestamp]].filter(
+      (value) => value !== undefined,
+    );
+
+    return {
+      timestamp: +timestamp,
+      // CloudWatch values are in milliseconds; Statuspage gets seconds.
+      value: Math.max(...values) / 1000,
+    };
+  });
+
+  return {
+    '4n05sgqq2jqg': datapoints,
+  };
+}
+
+/**
+ * Builds the Statuspage data for the media processing volume metric: the
+ * per-minute sum of Porter TasksRequested.
+ *
+ * @returns {Promise<Object<string, Array<{timestamp: number, value: number}>>>}
+ *   Data points keyed by Statuspage metric ID
+ */
+async function mediaProcessingVolume() {
+  const cloudwatch = await cloudWatchClient(METRICS_ACCOUNT_ID, 'us-east-2');
+
+  const datapoints = await recentDatapoints(cloudwatch, 'tasks', {
+    Metric: {
+      Namespace: 'PRX/Porter',
+      MetricName: 'TasksRequested',
+      Dimensions: [
+        {
+          Name: 'StateMachineArn',
+          Value:
+            'arn:aws:states:us-east-2:561178107736:stateMachine:StateMachine-KcY9t10Lo2vV',
+        },
+      ],
+    },
+    Period: 60,
+    Stat: 'Sum',
+    Unit: 'Count',
+  });
+
+  return {
+    hlrxcqsmmhys: datapoints,
+  };
+}
+
+// Statuspage expects to get at least one value for every 5 minute period; if
+// there's a 5 minute period with no value, it shows up as a gap in the chart.
+//
+// The chart has a maximum resolution of 30 seconds. For data sent with a
+// timestamp that doesn't land on a 30 second boundary, Statuspage will
+// decide how to shift the data. If multiple data points are sent that would
+// be shifted to the same boundary, Statuspage picks one (i.e., does not
+// aggregate).
+//
+// You can send values multiple times for a timestamp, and the value will be
+// updated over time with the latest value.
+//
+// The way this Lambda generally works is: query a metric with a 60-second
+// period for a 10 minute block (i.e., get back 10 values, each representing a
+// minute). Each value may be a sum, average, etc. of all the values that exist
+// for that minute in CloudWatch. Send all 10 values to Statuspage. Do this
+// every few minutes. That means every time this runs, roughly 7 of the 10
+// data points being sent will be updates or duplicates. That's fine. Some
+// values don't appear in CloudWatch Metrics for several minutes, so this
+// approach ensures that eventually the data in Statuspage is fairly complete.
+//
+// https://developer.statuspage.io/#tag/metrics
+export const handler = async (event) => {
+  // The two metrics are independent, so fetch them concurrently.
+  const [analyticsProcessingLatencyData, mediaProcessingVolumeData] =
+    await Promise.all([analyticsProcessingLatency(), mediaProcessingVolume()]);
+
+  const metricsData = {
+    ...analyticsProcessingLatencyData,
+    ...mediaProcessingVolumeData,
+  };
+
+  console.log(JSON.stringify(metricsData));
+
+  await sendDataPoints(PAGE_ID, metricsData);
+};
diff --git a/devops/tooling/statuspage/samconfig.toml b/devops/tooling/statuspage/samconfig.toml
new file mode 100644
index 000000000..239e21606
--- /dev/null
+++ b/devops/tooling/statuspage/samconfig.toml
@@ -0,0 +1,16 @@
+# sam build && sam deploy
+
+version = 0.1
+
+[default.deploy.parameters]
+profile = "prx-devops"
+stack_name = "prx-devops-statuspage-shipper"
+s3_prefix = "prx-devops-statuspage-shipper"
+confirm_changeset = true
+capabilities = "CAPABILITY_IAM"
+region = "us-east-2"
+s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-1x0dmn1dmp2ru"
+# Parameter overrides only need to be included when a parameter is changing
+# parameter_overrides = [
+  # "StatuspageApiKey=GET_ME_FROM_STATUSPAGE_BACKEND"
+# ]
diff --git a/devops/tooling/statuspage/template.yml b/devops/tooling/statuspage/template.yml
new file mode 100644
index 000000000..37d3f8405
--- /dev/null
+++ b/devops/tooling/statuspage/template.yml
@@ -0,0 +1,80 @@
+# devops/tooling/statuspage/template.yml
+#
+AWSTemplateFormatVersion: "2010-09-09"
+Transform: AWS::Serverless-2016-10-31
+
+Description: >-
+  Creates resources for integrating with Statuspage.io
+
+Parameters:
+  CrossAccountSharingRoleName: { Type: String, Default: CloudWatch-CrossAccountSharingRole }
+  StatuspageApiKey: { Type: String, NoEcho: true }
+
+Resources:
+  MetricsShipperFunction:
+    Type: AWS::Serverless::Function
+    Properties:
+      CodeUri: metrics-shipper/
+      Description: >-
+        Fetches various metrics from CloudWatch and sends them to Statuspage
+        as system metrics
+      Environment:
+        Variables:
+          CROSS_ACCOUNT_SHARING_ROLE_NAME: !Ref CrossAccountSharingRoleName
+          STATUSPAGE_API_KEY: !Ref StatuspageApiKey
+      Events:
+        Cron:
+          Properties:
+            Description: Ships metrics to Statuspage
+            Schedule: rate(3 minutes)
+            State: ENABLED
+          Type: Schedule
+      Handler: index.handler
+      MemorySize: 128
+      Policies:
+        - Statement:
+            - Action: sts:AssumeRole
+              Effect: Allow
+              Resource: !Sub arn:aws:iam::*:role/${CrossAccountSharingRoleName}
+          Version: "2012-10-17"
+      Runtime: nodejs18.x
+      Tags:
+        prx:meta:tagging-version: "2021-04-07"
+        prx:cloudformation:stack-name: !Ref AWS::StackName
+        prx:cloudformation:stack-id: !Ref AWS::StackId
+        prx:ops:environment: Production
+        prx:dev:application: DevOps
+      Timeout: 30
+  MetricsShipperLogGroup:
+    Type: AWS::Logs::LogGroup
+    DeletionPolicy: Delete
+    UpdateReplacePolicy: Delete
+    Properties:
+      LogGroupName: !Sub /aws/lambda/${MetricsShipperFunction}
+      RetentionInDays: 5
+      Tags:
+        - { Key: prx:meta:tagging-version, Value: "2021-04-07" }
+        - { Key: prx:cloudformation:stack-name, Value: !Ref AWS::StackName }
+        - { Key: prx:cloudformation:stack-id, Value: !Ref AWS::StackId }
+        - { Key: prx:ops:environment, Value: Production }
+        - { Key: prx:dev:application, Value: DevOps }
+  MetricsShipperErrorAlarm:
+    Type: AWS::CloudWatch::Alarm
+    Properties:
+      AlarmName: !Sub WARN [DevOps] Statuspage metrics shipper FUNCTION ERRORS (${AWS::StackName})
+      AlarmDescription: >-
+        The Lambda function that ships metrics from CloudWatch to Statuspage
+        is failing, so system metrics on the PRX status page may be incorrect.
+      # The function runs once every 3 minutes, so a 60-second period normally
+      # holds at most one invocation; alarm on a single error.
+      ComparisonOperator: GreaterThanOrEqualToThreshold
+      Dimensions:
+        - Name: FunctionName
+          Value: !Ref MetricsShipperFunction
+      EvaluationPeriods: 1
+      MetricName: Errors
+      Namespace: AWS/Lambda
+      Period: 60
+      Statistic: Sum
+      Threshold: 1
+      TreatMissingData: notBreaching