Skip to content

Commit

Permalink
fix(orchestration): tweak config for faster failures and less wasted …
Browse files Browse the repository at this point in the history
…performance (#1419)

- reduce heartbeat time for faster failures (expected every 5min, send
every 90s)
- reduces transliteration task retries on timeout (missed heartbeat) to
3 and adds wait and backoff
- reduce max time for transliteration task to 1hr (down from 2hrs)
- remove ulimit config that is not required
- reduce size of additional lambda functions by replacing typed
CacheStrategy with strings, with removes `aws-cdk-lib` from bundling

----

*By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache-2.0 license*
  • Loading branch information
mrgrain authored Aug 30, 2024
1 parent c457b63 commit b4aec14
Show file tree
Hide file tree
Showing 10 changed files with 103 additions and 159 deletions.
3 changes: 2 additions & 1 deletion projenrc/magic-ecs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ function newEcsTask(project: TypeScriptProject, entrypoint: string) {
main.line();

main.open('async function main(): Promise<void> {');
main.line('const heartbeat = setInterval(sendHeartbeat, 180_000);'); // Heartbeat is only expected every 10min
main.line('// Heartbeat is expected every 5min');
main.line('const heartbeat = setInterval(sendHeartbeat, 90_000);');
main.line('try {');
// Deserialize the input, which ECS provides as a sequence of JSON objects. We skip the first 2 values (argv[0] is the
// node binary, and argv[1] is this JS file).
Expand Down
115 changes: 40 additions & 75 deletions src/__tests__/__snapshots__/construct-hub.test.ts.snap

Large diffs are not rendered by default.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 8 additions & 15 deletions src/__tests__/devapp/__snapshots__/snapshot.test.ts.snap

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions src/backend/orchestration/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,8 @@ export class Orchestration extends Construct {
.otherwise(new Succeed(this, 'Done'))
);

const transliteratorTimeout = Duration.hours(2);
// aws-cdk-lib is one of the biggest libs and runs in ~8min
const transliteratorTimeout = Duration.hours(1);

this.ecsCluster = new Cluster(this, 'Cluster', {
containerInsights: true,
Expand Down Expand Up @@ -380,7 +381,7 @@ export class Orchestration extends Construct {
// cases the first heartbeat may take a while to come back due to
// the time it takes to provision the task in the cluster, so we
// give a more generous buffer here.
heartbeat: Duration.minutes(10),
heartbeat: Duration.minutes(5),
})
// Do not retry NoSpaceLeftOnDevice errors, these are typically not transient.
.addRetry({
Expand All @@ -407,7 +408,9 @@ export class Orchestration extends Construct {
errors: ['States.Timeout'], // The task has stopped responding, or is just taking a long time to provision
// To compensate we'll give more retries and pause between them in
// case it's just a transient issue.
maxAttempts: 5,
backoffRate: 2,
interval: Duration.seconds(30),
maxAttempts: 3,
})
.addRetry({ maxAttempts: 3 })
.addCatch(ignore, { errors: [UNPROCESSABLE_PACKAGE_ERROR_NAME] })
Expand Down
31 changes: 20 additions & 11 deletions src/backend/transliterator/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import {
ICluster,
LogDrivers,
OperatingSystemFamily,
UlimitName,
} from 'aws-cdk-lib/aws-ecs';
import { Effect, PolicyStatement } from 'aws-cdk-lib/aws-iam';
import { ILogGroup, LogGroup, RetentionDays } from 'aws-cdk-lib/aws-logs';
Expand Down Expand Up @@ -174,16 +173,26 @@ export class Transliterator extends Construct {
},
}),
});
// Encountered an error of "EMFILE: too many open files" in ECS.
// Default nofile ulimit is 1024/4096.
//
// For ECS ulimit documentation see: https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Ulimit.html
// For construct hub tracking issue see: https://github.com/cdklabs/construct-hub/issues/982
this.containerDefinition.addUlimits({
name: UlimitName.NOFILE, // file descriptors
softLimit: 16_384,
hardLimit: 65_535,
});

/**
* Construct hub tracking issue: https://github.com/cdklabs/construct-hub/issues/982
*
* Encountered one of these errors in ECS:
* - EMFILE: too many open files
* - Error: getaddrinfo EBUSY states.us-east-1.amazonaws.com
*
* This issue occurs because the container is running out of file descriptors, to read files or make network requests.
* Normally this limit is configured via the `ulimit` setting. However on AWS Fargate a much lower limit applies.
* For ECS ulimit documentation see: https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Ulimit.html
*
* We currently don't believe a specific ulimit configuration is needed, the defaults (which are the max) should work.
* For references, here is the previously used config.
*/
// this.containerDefinition.addUlimits({
// name: UlimitName.NOFILE, // file descriptors
// softLimit: 16_384,
// hardLimit: 65_535,
// });

repository?.grantReadFromRepository(this.taskDefinition.taskRole);

Expand Down
3 changes: 2 additions & 1 deletion src/backend/transliterator/transliterator.ecs-entrypoint.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 14 additions & 11 deletions src/caching.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { CacheControl } from 'aws-cdk-lib/aws-s3-deployment';
import { Duration } from 'aws-cdk-lib/core';
// This file MUST NOT import anything from aws-cdk-lib
// If you do, it will cause aws-cdk-lib to be bundled into the lambda handlers.
// Bundling aws-cdk-lib, will make them 30mb+ of size,
// and could potentially break the handler due to importing dodgy transitive dependencies.
// Yes this has happened before.

/**
* Caching policies for serving data for the Construct Hub web app.
Expand All @@ -10,21 +13,21 @@ export class CacheStrategy {
*/
public static default() {
return new CacheStrategy([
CacheControl.setPublic(),
CacheControl.maxAge(Duration.minutes(5)),
CacheControl.mustRevalidate(),
CacheControl.sMaxAge(Duration.minutes(1)),
CacheControl.proxyRevalidate(),
'public',
'max-age=300',
'must-revalidate',
's-maxage=60',
'proxy-revalidate',
]);
}

private constructor(private readonly cacheControl: CacheControl[]) {}
private constructor(private readonly cacheControl: string[]) {}

public toString() {
return this.cacheControl.map((c) => c.value).join(', ');
public toString(): string {
return this.cacheControl.join(', ');
}

public toArray() {
public toArray(): string[] {
return this.cacheControl;
}
}
8 changes: 6 additions & 2 deletions src/webapp/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,9 @@ export class WebApp extends Construct {
distribution: this.distribution,
prune: false,
sources: [s3deploy.Source.asset(webappDir)],
cacheControl: CacheStrategy.default().toArray(),
cacheControl: CacheStrategy.default()
.toArray()
.map(s3deploy.CacheControl.fromString),
});

// Generate config.json to customize frontend behavior
Expand Down Expand Up @@ -363,7 +365,9 @@ export class WebApp extends Construct {
destinationBucket: this.bucket,
distribution: this.distribution,
prune: false,
cacheControl: CacheStrategy.default().toArray(),
cacheControl: CacheStrategy.default()
.toArray()
.map(s3deploy.CacheControl.fromString),
});

new CfnOutput(this, 'DomainName', {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1614,14 +1614,7 @@
}
}
},
"Name": "Resource",
"Ulimits": [
{
"HardLimit": 65535,
"Name": "nofile",
"SoftLimit": 16384
}
]
"Name": "Resource"
}
],
"Cpu": "4096",
Expand Down

0 comments on commit b4aec14

Please sign in to comment.