Skip to content

Commit

Permalink
Add Statuspage metrics shipper function and template
Browse files Browse the repository at this point in the history
  • Loading branch information
farski committed Sep 25, 2023
1 parent 0e89825 commit 6cbba6d
Show file tree
Hide file tree
Showing 3 changed files with 346 additions and 0 deletions.
253 changes: 253 additions & 0 deletions devops/tooling/statuspage/metrics-shipper/index.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import * as http from 'https';
import {
CloudWatchClient,
GetMetricDataCommand,
} from '@aws-sdk/client-cloudwatch';
import { STSClient, AssumeRoleCommand } from '@aws-sdk/client-sts';

const PAGE_ID = 'zlzvfpy9x7fy';

const sts = new STSClient({ region: 'us-east-2' });

const apiKey = process.env.STATUSPAGE_API_KEY;

function roleArn(accountId) {
return `arn:aws:iam::${accountId}:role/CloudWatch-CrossAccountSharingRole`;
}

async function sendDataPoints(pageId, data) {
return new Promise((resolve, reject) => {
const apiBase = 'https://api.statuspage.io/v1';
const url = `${apiBase}/pages/${pageId}/metrics/data.json`;

const authHeader = { Authorization: `OAuth ${apiKey}` };
const options = { method: 'POST', headers: authHeader };

const request = http.request(url, options, (res) => {
if (res.statusMessage === 'Unauthorized') {
const genericError =
'Error encountered. Please ensure that your page code and authorization key are correct.';
return console.error(genericError);
}
res.on('data', function () {});

res.on('end', function () {
resolve();
});

res.on('error', (error) => {
reject(error);
});
});

request.end(JSON.stringify({ data: data }));
});
}

async function iteratorAgeData(accountId, region, functionName) {
const role = await sts.send(
new AssumeRoleCommand({
RoleArn: roleArn(accountId),
RoleSessionName: 'statuspage_data_sender',
}),
);

const cloudwatch = new CloudWatchClient({
region: region,
credentials: {
accessKeyId: role.Credentials.AccessKeyId,
secretAccessKey: role.Credentials.SecretAccessKey,
sessionToken: role.Credentials.SessionToken,
},
});

const results = await cloudwatch.send(
new GetMetricDataCommand({
MetricDataQueries: [
{
Id: 'iteratorAge',
MetricStat: {
Metric: {
Namespace: 'AWS/Lambda',
MetricName: 'IteratorAge',
Dimensions: [
{
Name: 'FunctionName',
Value: functionName,
},
],
},
Period: 60,
Stat: 'Maximum',
Unit: 'Milliseconds',
},
},
],
StartTime: new Date(+new Date() - 10 * 60000),
EndTime: new Date(),
}),
);

const datapoints = {};

if (results.MetricDataResults.length) {
for (
let index = 0;
index < results.MetricDataResults[0].Values.length;
index++
) {
const value = results.MetricDataResults[0].Values[index];
const timestamp = results.MetricDataResults[0].Timestamps[index];

datapoints[`${Math.floor(+timestamp / 1000)}`] = value;
}
}

return datapoints;
}

async function analyticsProcessingLatency() {
const usEast1Data = await iteratorAgeData(
'561178107736',
'us-east-1',
'infrastructure-cd-root-production-A-CountsFunction-h0qlwOs7VMJZ',
);
const usWest2Data = await iteratorAgeData(
'561178107736',
'us-west-2',
'infrastructure-cd-root-production-A-CountsFunction-HZN3g5yVxDSq',
);

const uniqueTimestamps = [
...new Set([...Object.keys(usEast1Data), ...Object.keys(usWest2Data)]),
];

const datapoints = [];

for (const timestamp of uniqueTimestamps) {
let value;

if (usEast1Data[timestamp] && usWest2Data[timestamp]) {
value = Math.max(usEast1Data[timestamp], usWest2Data[timestamp]);
} else {
value = usEast1Data[timestamp] || usWest2Data[timestamp];
}

datapoints.push({
timestamp: +timestamp,
value: value / 1000,
});
}

const data = {
'4n05sgqq2jqg': datapoints,
};

return data;
}

async function mediaProcessingVolume() {
const role = await sts.send(
new AssumeRoleCommand({
RoleArn: roleArn('561178107736'),
RoleSessionName: 'statuspage_data_sender',
}),
);

const cloudwatch = new CloudWatchClient({
region: 'us-east-2',
credentials: {
accessKeyId: role.Credentials.AccessKeyId,
secretAccessKey: role.Credentials.SecretAccessKey,
sessionToken: role.Credentials.SessionToken,
},
});

const results = await cloudwatch.send(
new GetMetricDataCommand({
MetricDataQueries: [
{
Id: 'tasks',
MetricStat: {
Metric: {
Namespace: 'PRX/Porter',
MetricName: 'TasksRequested',
Dimensions: [
{
Name: 'StateMachineArn',
Value:
'arn:aws:states:us-east-2:561178107736:stateMachine:StateMachine-KcY9t10Lo2vV',
},
],
},
Period: 60,
Stat: 'Sum',
Unit: 'Count',
},
},
],
StartTime: new Date(+new Date() - 10 * 60000),
EndTime: new Date(),
}),
);

const datapoints = [];

if (results.MetricDataResults.length) {
for (
let index = 0;
index < results.MetricDataResults[0].Values.length;
index++
) {
const value = results.MetricDataResults[0].Values[index];
const timestamp = results.MetricDataResults[0].Timestamps[index];

datapoints.push({
timestamp: Math.floor(+timestamp / 1000),
value: value,
});
}
}

const data = {
hlrxcqsmmhys: datapoints,
};

return data;
}

// Statuspage expects to get at least one value for every 5 minute period; if
// there's a 5 minute period with no value, it show up as a gap in the chart.
//
// The chart has a maximum resolution of 30 seconds. For data sent with a
// timestamp that doesn't land on a a 30 second boundary, Statuspage will
// decide how to shift the data. If multiple data points are sent that would
// be shifted to the same boundary, Statuspage picks one (i.e., does not
// aggregate.
//
// You can send values multiple times for a timestamp, and the value will be
// updated over time with the latest value.
//
// The way this Lambda generally works is: query a metric with a 60-second
// period for a 10 minute block (i.e., get back 10 values, each representing a
// minute). Each value may be a sum, average, etc of all the values that exist
// for that minute in CloudWatch. Send all 10 values to Statuspage. Do this
// every few minutes. That means every time this runs, roughly 7 of the 10
// data points being sent will be updates or duplicates. That's fine. Some
// values don't appear in CloudWatch Metrics for several minutes, so this
// approach ensures that eventually the data in Statuspage is fairly complete.
//
// https://developer.statuspage.io/#tag/metrics
export const handler = async (event) => {
const analyticsProcessingLatencyData = await analyticsProcessingLatency();
const mediaProcessingVolumeData = await mediaProcessingVolume();

const metricsData = {
...analyticsProcessingLatencyData,
...mediaProcessingVolumeData,
};

console.log(JSON.stringify(metricsData));

await sendDataPoints(PAGE_ID, metricsData);
};
16 changes: 16 additions & 0 deletions devops/tooling/statuspage/samconfig.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# sam build && sam deploy

version = 0.1

[default.deploy.parameters]
profile = "prx-devops"
stack_name = "prx-devops-statuspage-shipper"
s3_prefix = "prx-devops-statuspage-shipper"
confirm_changeset = true
capabilities = "CAPABILITY_IAM"
region = "us-east-2"
s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-1x0dmn1dmp2ru"
# Parameter overrides only need to be included when a parameter is changing
# parameter_overrides = [
# "StatuspageApiKey=GET_ME_FROM_STATUSPAGE_BACKEND"
# ]
77 changes: 77 additions & 0 deletions devops/tooling/statuspage/template.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# devops/tooling/statuspage/template.yml
#
AWSTemplateFormatVersion: "2010-09-09"
Transform: AWS::Serverless-2016-10-31

Description: >-
Creates resources for integrating with Statuspage.io
Parameters:
CrossAccountSharingRoleName: { Type: String, Default: CloudWatch-CrossAccountSharingRole }
StatuspageApiKey: { Type: String }

Resources:
MetricsShipperFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: metrics-shipper/
Description: >-
Fetches various metrics from CloudWatch and sends them to Statuspage
as system metrics
Environment:
Variables:
STATUSPAGE_API_KEY: !Ref StatuspageApiKey
Events:
Cron:
Properties:
Description: Ships metrics to Statuspage
Schedule: rate(3 minutes)
State: ENABLED
Type: Schedule
Handler: index.handler
MemorySize: 128
Policies:
- Statement:
- Action: sts:AssumeRole
Effect: Allow
Resource: !Sub arn:aws:iam::*:role/${CrossAccountSharingRoleName}
Version: "2012-10-17"
Runtime: nodejs18.x
Tags:
prx:meta:tagging-version: "2021-04-07"
prx:cloudformation:stack-name: !Ref AWS::StackName
prx:cloudformation:stack-id: !Ref AWS::StackId
prx:ops:environment: Production
prx:dev:application: DevOps
Timeout: 30
MetricsShipperLogGroup:
Type: AWS::Logs::LogGroup
DeletionPolicy: Delete
UpdateReplacePolicy: Delete
Properties:
LogGroupName: !Sub /aws/lambda/${MetricsShipperFunction}
RetentionInDays: 5
Tags:
- { Key: prx:meta:tagging-version, Value: "2021-04-07" }
- { Key: prx:cloudformation:stack-name, Value: !Ref AWS::StackName }
- { Key: prx:cloudformation:stack-id, Value: !Ref AWS::StackId }
- { Key: prx:ops:environment, Value: Production }
- { Key: prx:dev:application, Value: DevOps }
MetricsShipperErrorAlarm:
Type: AWS::CloudWatch::Alarm
Properties:
AlarmName: !Sub WARN [DevOps] Statuspage metrics shipper <prod> FUNCTION ERRORS (${AWS::StackName})
AlarmDescription: >-
The Lambda function that ships metrics from CloudWatch to Statuspage
failing, so system metrics on the PRX status page may be incorrect.
ComparisonOperator: GreaterThanThreshold
Dimensions:
- Name: FunctionName
Value: !Ref MetricsShipperFunction
EvaluationPeriods: 1
MetricName: Errors
Namespace: AWS/Lambda
Period: 60
Statistic: Sum
Threshold: 1
TreatMissingData: notBreaching

0 comments on commit 6cbba6d

Please sign in to comment.