diff --git a/packer/linux/conf/bin/bk-check-disk-space.sh b/packer/linux/conf/bin/bk-check-disk-space.sh index 3c25f11ef..43d072324 100755 --- a/packer/linux/conf/bin/bk-check-disk-space.sh +++ b/packer/linux/conf/bin/bk-check-disk-space.sh @@ -1,25 +1,50 @@ #!/bin/bash set -euo pipefail -DISK_MIN_AVAILABLE=${DISK_MIN_AVAILABLE:-5242880} # 5GB -DISK_MIN_INODES=${DISK_MIN_INODES:-250000} # docker needs lots - -DOCKER_DIR="/var/lib/docker/" - -disk_avail=$(df -k --output=avail "$DOCKER_DIR" | tail -n1) - -echo "Disk space free: $(df -k -h --output=avail "$DOCKER_DIR" | tail -n1 | sed -e 's/^[[:space:]]//')" - -if [[ $disk_avail -lt $DISK_MIN_AVAILABLE ]]; then - echo "Not enough disk space free, cutoff is ${DISK_MIN_AVAILABLE} 🚨" >&2 - exit 1 +# Usage: +# bk-check-disk-space.sh (min disk required) (min inodes required) +# min disk required can be either an amount of bytes, a pattern like 10G +# or 500M, or a percentage like 5%, default to 5G +# min inodes must be a number, default to 250,000 + +. "$(dirname "$0")"/dehumanize.sh + +min_available=${1:-5G} +docker_dir="/var/lib/docker/" + +# First check the disk available, measured in bytes because dehumanize returns bytes + +disk_avail=$(df --block-size=1 --output=avail "$docker_dir" | tail -n1 | tr -d '[:space:]') +disk_avail_human=$(df -k -h --output=avail "$docker_dir" | tail -n1 | tr -d '[:space:]') +disk_used_pct=$(df -k --output=pcent "$docker_dir" | tail -n1 | tr -d '[:space:]' | tr -d '%') +disk_free_pct=$((100-disk_used_pct)) + +printf "Disk space free: %s (%s%%)\\n" "$disk_avail_human" "$disk_free_pct" + +# Check if the min_available is a percentage +if [[ $min_available =~ \%$ ]] ; then + if [[ $(echo "${disk_free_pct}<${min_available}" | sed 's/%//g' | bc) -gt 0 ]] ; then + echo "Not enough disk space free, cutoff is ${min_available} 🚨" >&2 + exit 1 + fi +else + if [[ $disk_avail -lt $(dehumanize "$min_available") ]]; then + echo "Not enough disk space free, cutoff is ${min_available} 🚨" >&2 + exit 1 + fi fi -inodes_avail=$(df -k --output=iavail "$DOCKER_DIR" | tail -n1) +# Next check inodes, these can
be exhausted by docker build operations + +inodes_min_available=${2:-250000} +inodes_avail=$(df -k --output=iavail "$docker_dir" | tail -n1 | tr -d '[:space:]') +inodes_avail_human=$(df -k -h --output=iavail "$docker_dir" | tail -n1 | tr -d '[:space:]') +inodes_used_pct=$(df -k --output=ipcent "$docker_dir" | tail -n1 | tr -d '[:space:]' | tr -d '%') +inodes_free_pct=$((100-inodes_used_pct)) -echo "Inodes free: $(df -k -h --output=iavail "$DOCKER_DIR" | tail -n1 | sed -e 's/^[[:space:]]//')" +printf "Inodes free: %s (%s%%)\\n" "$inodes_avail_human" "$inodes_free_pct" -if [[ $inodes_avail -lt $DISK_MIN_INODES ]]; then - echo "Not enough inodes free, cutoff is ${DISK_MIN_INODES} 🚨" >&2 +if [[ $inodes_avail -lt $inodes_min_available ]]; then + echo "Not enough inodes free, cutoff is ${inodes_min_available} 🚨" >&2 exit 1 fi diff --git a/packer/linux/conf/bin/bk-install-elastic-stack.sh b/packer/linux/conf/bin/bk-install-elastic-stack.sh index 4c55c0cbf..982f11151 100755 --- a/packer/linux/conf/bin/bk-install-elastic-stack.sh +++ b/packer/linux/conf/bin/bk-install-elastic-stack.sh @@ -62,6 +62,12 @@ export PLUGINS_ENABLED="${PLUGINS_ENABLED[*]-}" export BUILDKITE_ECR_POLICY=${BUILDKITE_ECR_POLICY:-none} EOF +# cron-env is sourced by crontab entries and low disk scripts, so values are %q-escaped +cat << EOF > /var/lib/buildkite-agent/cron-env +export DISK_MIN_AVAILABLE=$(printf '%q' "${DISK_MIN_AVAILABLE:-}") +export DOCKER_PRUNE_UNTIL=$(printf '%q' "${DOCKER_PRUNE_UNTIL:-}") +EOF + if [[ "${BUILDKITE_AGENT_RELEASE}" == "edge" ]] ; then echo "Downloading buildkite-agent edge..." curl -Lsf -o /usr/bin/buildkite-agent-edge \ diff --git a/packer/linux/conf/bin/dehumanize-test.sh b/packer/linux/conf/bin/dehumanize-test.sh new file mode 100755 index 000000000..1e3d4c393 --- /dev/null +++ b/packer/linux/conf/bin/dehumanize-test.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -o pipefail + +. 
"$(dirname "$0")"/dehumanize.sh + +test_without_unit(){ + assertEquals 45 "$(dehumanize 45)" +} + +test_bytes(){ + assertEquals 45 "$(dehumanize 45b)" + assertEquals 45 "$(dehumanize 45B)" +} + +test_kilobytes(){ + assertEquals 46080 "$(dehumanize 45kb)" + assertEquals 46080 "$(dehumanize 45KB)" +} + +test_megabytes(){ + assertEquals 47185920 "$(dehumanize 45mb)" + assertEquals 47185920 "$(dehumanize 45MB)" +} + +test_gigabytes(){ + assertEquals 48318382080 "$(dehumanize 45gb)" + assertEquals 48318382080 "$(dehumanize 45GB)" +} + +test_terabytes(){ + assertEquals 49478023249920 "$(dehumanize 45tb)" + assertEquals 49478023249920 "$(dehumanize 45TB)" +} + +test_using_decimals(){ + assertEquals 1610612736 "$(dehumanize 1.5gb)" +} + +test_single_letter_units(){ + assertEquals 46080 "$(dehumanize 45k)" + assertEquals 48318382080 "$(dehumanize 45G)" +} + +. shunit2 diff --git a/packer/linux/conf/bin/dehumanize.sh b/packer/linux/conf/bin/dehumanize.sh new file mode 100644 index 000000000..ca90e8897 --- /dev/null +++ b/packer/linux/conf/bin/dehumanize.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# Converts a human-readable size such as 45, 45B, 1.43K, 120.3MB or 5G to bytes. +# Suffixes are case-insensitive, the trailing B is optional, and each unit step +# is a factor of 1024. Prints the result to stdout as a whole number; input +# that matches none of the patterns prints nothing. +dehumanize() { + awk '/[0-9][bB]?$/ {printf "%u\n", $1*1} + /[tT][bB]?$/ {printf "%u\n", $1*(1024*1024*1024*1024)} + /[gG][bB]?$/ {printf "%u\n", $1*(1024*1024*1024)} + /[mM][bB]?$/ {printf "%u\n", $1*(1024*1024)} + /[kK][bB]?$/ {printf "%u\n", $1*1024}' <<< "$1" +} diff --git a/packer/linux/conf/buildkite-agent/hooks/environment b/packer/linux/conf/buildkite-agent/hooks/environment index d5995e13f..b853dba19 100755 --- a/packer/linux/conf/buildkite-agent/hooks/environment +++ b/packer/linux/conf/buildkite-agent/hooks/environment @@ -7,6 +7,11 @@ source ~/cfn-env echo "~~~ :llama: Setting up elastic stack environment ($BUILDKITE_STACK_VERSION)" cat ~/cfn-env +if [[ -f ~/cron-env ]] ; then + # shellcheck source=/dev/null + source ~/cron-env +fi + echo "Checking docker" if ! docker ps ; then echo "^^^ +++" @@ -17,13 +22,13 @@ if ! docker ps ; then fi echo "Checking disk space" -if ! /usr/local/bin/bk-check-disk-space.sh ; then +if !
/usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}" ; then echo "Cleaning up docker resources older than ${DOCKER_PRUNE_UNTIL:-4h}" docker image prune --all --force --filter "until=${DOCKER_PRUNE_UNTIL:-4h}" echo "Checking disk space again" - if ! /usr/local/bin/bk-check-disk-space.sh ; then + if ! /usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}"; then echo "Disk health checks failed" >&2 exit 1 fi diff --git a/packer/linux/conf/docker/cron.hourly/docker-gc b/packer/linux/conf/docker/cron.hourly/docker-gc index 1ab07e68f..71f85ae0a 100755 --- a/packer/linux/conf/docker/cron.hourly/docker-gc +++ b/packer/linux/conf/docker/cron.hourly/docker-gc @@ -5,10 +5,16 @@ if [[ $EUID -eq 0 ]]; then exec >> /var/log/elastic-stack.log 2>&1 # Logs to elastic-stack.log fi +# Load config from file if it exists; the 4h default below covers a missing file or unset value +if [[ -f /var/lib/buildkite-agent/cron-env ]] ; then + # shellcheck source=/dev/null + source /var/lib/buildkite-agent/cron-env +fi + DOCKER_PRUNE_UNTIL=${DOCKER_PRUNE_UNTIL:-4h} ## ------------------------------------------ ## Prune stuff that doesn't affect cache hits docker network prune --force --filter "until=${DOCKER_PRUNE_UNTIL}" docker container prune --force --filter "until=${DOCKER_PRUNE_UNTIL}" diff --git a/packer/linux/conf/docker/cron.hourly/docker-low-disk-gc b/packer/linux/conf/docker/cron.hourly/docker-low-disk-gc index ff68b64dc..c0697c9c3 100644 --- a/packer/linux/conf/docker/cron.hourly/docker-low-disk-gc +++ b/packer/linux/conf/docker/cron.hourly/docker-low-disk-gc @@ -5,8 +5,6 @@ if [[ $EUID -eq 0 ]]; then exec >> /var/log/elastic-stack.log 2>&1 # Logs to elastic-stack.log fi -DOCKER_PRUNE_UNTIL=${DOCKER_PRUNE_UNTIL:-1h} - mark_instance_unhealthy() { # cancel any running buildkite builds killall -QUIT buildkite-agent || true @@ -19,14 +17,20 @@
mark_instance_unhealthy() { trap mark_instance_unhealthy ERR +# Load config from file if it exists +if [[ -f /var/lib/buildkite-agent/cron-env ]] ; then + # shellcheck source=/dev/null + source /var/lib/buildkite-agent/cron-env +fi + ## ----------------------------------------------------------------- ## Check disk, we only want to prune images/containers if we have to -if ! /usr/local/bin/bk-check-disk-space.sh ; then - echo "Cleaning up docker resources older than ${DOCKER_PRUNE_UNTIL}" - docker image prune --all --force --filter "until=${DOCKER_PRUNE_UNTIL}" +if ! /usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}" ; then + echo "Cleaning up docker resources older than 1h" + docker image prune --all --force --filter "until=1h" - if ! /usr/local/bin/bk-check-disk-space.sh ; then + if ! /usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}" ; then echo "Disk health checks failed" >&2 exit 1 fi diff --git a/templates/aws-stack.yml b/templates/aws-stack.yml index f6855d9fb..12de96435 100644 --- a/templates/aws-stack.yml +++ b/templates/aws-stack.yml @@ -375,6 +375,16 @@ Parameters: - "false" Default: "false" + MinimumDiskAvailableBeforeCleanup: + Type: String + Description: Minimum free disk, as either a percentage (e.g. 5%) or an absolute size (e.g. 500MB, 2GB), below which disk cleanup is run + Default: "2GB" + + DockerPruneUntil: + Type: String + Description: Age beyond which docker networks, images and containers are pruned on hourly cleanup (e.g. 4h) + Default: "4h" + Outputs: VpcId: Value: @@ -857,6 +867,8 @@ Resources: BUILDKITE_ECR_POLICY=${ECRAccessPolicy} \ BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=${BuildkiteTerminateInstanceAfterJob} \ BUILDKITE_ADDITIONAL_SUDO_PERMISSIONS=${BuildkiteAdditionalSudoPermissions} \ + DISK_MIN_AVAILABLE="${MinimumDiskAvailableBeforeCleanup}" \ + DOCKER_PRUNE_UNTIL="${DockerPruneUntil}" \ AWS_DEFAULT_REGION=${AWS::Region} \ SECRETS_PLUGIN_ENABLED=${EnableSecretsPlugin} \ ECR_PLUGIN_ENABLED=${EnableECRPlugin} \