diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..216a619c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,10 @@ +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "cargo" + directory: "/" + target-branch: "dev" + schedule: + interval: "weekly" \ No newline at end of file diff --git a/.github/workflows/codesee-arch-diagram.yml b/.github/workflows/codesee-arch-diagram.yml new file mode 100644 index 00000000..1ec93fd3 --- /dev/null +++ b/.github/workflows/codesee-arch-diagram.yml @@ -0,0 +1,87 @@ +on: + push: + branches: + - main + pull_request_target: + types: [opened, synchronize, reopened] + +name: CodeSee Map + +jobs: + test_map_action: + runs-on: ubuntu-latest + continue-on-error: true + name: Run CodeSee Map Analysis + steps: + - name: checkout + id: checkout + uses: actions/checkout@v2 + with: + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} + fetch-depth: 0 + + # codesee-detect-languages has an output with id languages. + - name: Detect Languages + id: detect-languages + uses: Codesee-io/codesee-detect-languages-action@latest + + - name: Configure JDK 16 + uses: actions/setup-java@v2 + if: ${{ fromJSON(steps.detect-languages.outputs.languages).java }} + with: + java-version: '16' + distribution: 'zulu' + + # CodeSee Maps Go support uses a static binary so there's no setup step required. 
+ + - name: Configure Node.js 14 + uses: actions/setup-node@v2 + if: ${{ fromJSON(steps.detect-languages.outputs.languages).javascript }} + with: + node-version: '14' + + - name: Configure Python 3.x + uses: actions/setup-python@v2 + if: ${{ fromJSON(steps.detect-languages.outputs.languages).python }} + with: + python-version: '3.10' + architecture: 'x64' + + - name: Configure Ruby '3.x' + uses: ruby/setup-ruby@v1 + if: ${{ fromJSON(steps.detect-languages.outputs.languages).ruby }} + with: + ruby-version: '3.0' + + # We need the rust toolchain because it uses rustc and cargo to inspect the package + - name: Configure Rust 1.x stable + uses: actions-rs/toolchain@v1 + if: ${{ fromJSON(steps.detect-languages.outputs.languages).rust }} + with: + toolchain: stable + + - name: Generate Map + id: generate-map + uses: Codesee-io/codesee-map-action@latest + with: + step: map + api_token: ${{ secrets.CODESEE_ARCH_DIAG_API_TOKEN }} + github_ref: ${{ github.ref }} + languages: ${{ steps.detect-languages.outputs.languages }} + + - name: Upload Map + id: upload-map + uses: Codesee-io/codesee-map-action@latest + with: + step: mapUpload + api_token: ${{ secrets.CODESEE_ARCH_DIAG_API_TOKEN }} + github_ref: ${{ github.ref }} + + - name: Insights + id: insights + uses: Codesee-io/codesee-map-action@latest + with: + step: insights + api_token: ${{ secrets.CODESEE_ARCH_DIAG_API_TOKEN }} + github_ref: ${{ github.ref }} diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml new file mode 100644 index 00000000..cd2e829f --- /dev/null +++ b/.github/workflows/docker-release.yml @@ -0,0 +1,47 @@ +name: docker_release + +on: + push: + branches: [ main, dev ] + paths-ignore: + - 'docs_src/**' + - 'README.md' + - 'CHANGELOG.md' + - 'CITATION' + - 'book.toml' + - 'CONTRIBUTING.md' + tags: [ 'v*.*.*' ] + +jobs: + build_and_push: + name: build_and_push_docker_image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Docker 
metadata + id: meta + uses: docker/metadata-action@v3 + with: + images: | + hubblo/scaphandre + tags: | + type=ref,event=branch + type=semver,pattern={{version}} + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build and push + id: docker_build + uses: docker/build-push-action@v2 + with: + context: . + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/Cargo.lock b/Cargo.lock index 272c1bd0..f1a0fbf5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,10 +3,10 @@ version = 3 [[package]] -name = "adler32" -version = "1.2.0" +name = "adler" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "aho-corasick" @@ -304,6 +304,18 @@ dependencies = [ "instant", ] +[[package]] +name = "flate2" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f" +dependencies = [ + "cfg-if", + "crc32fast", + "libc", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -636,24 +648,6 @@ version = "0.2.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2a5ac8f984bfcf3a823267e5fde638acc3325f6496633a5da6bb6eb2171e103" -[[package]] -name = "libflate" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "158ae2ca09a761eaf6050894f5a6f013f2773dafe24f67bfa73a7504580e2916" -dependencies = [ - "adler32", - "crc32fast", - "libflate_lz77", - "rle-decode-fast", -] - -[[package]] -name = "libflate_lz77" -version 
= "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3286f09f7d4926fc486334f28d8d2e6ebe4f7f9994494b6dab27ddfad2c9b11b" - [[package]] name = "libnghttp2-sys" version = "0.1.6+1.43.0" @@ -735,6 +729,16 @@ version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +[[package]] +name = "miniz_oxide" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +dependencies = [ + "adler", + "autocfg", +] + [[package]] name = "mio" version = "0.7.13" @@ -954,17 +958,17 @@ dependencies = [ [[package]] name = "procfs" -version = "0.8.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a336c8310f4955f343935b9c11a30254d1ad8fad98ec257a4407a061a6fd49" +checksum = "0941606b9934e2d98a3677759a971756eb821f75764d0e0d26946d08e74d9104" dependencies = [ "bitflags", "byteorder", "chrono", + "flate2", "hex", "lazy_static", "libc", - "libflate", ] [[package]] @@ -1098,12 +1102,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "rle-decode-fast" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac" - [[package]] name = "rustc_version" version = "0.2.3" diff --git a/Cargo.toml b/Cargo.toml index 1cd6b0fa..1cc7f3fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ loggerv = "0.7.2" log = "0.4" clap = "2.33.3" regex = "1" -procfs = "0.8.1" +procfs = "0.12.0" riemann_client = { version = "0.9.0", optional = true } hostname = "0.3.1" protobuf = "2.20.0" diff --git a/Dockerfile b/Dockerfile index dd0ac35f..e622898e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.48 as planner +FROM rust:1.59 as planner WORKDIR app RUN cargo install cargo-chef @@ -7,7 +7,7 @@ COPY . 
. # Analyze dependencies RUN cargo chef prepare --recipe-path recipe.json -FROM rust:1.48 as cacher +FROM rust:1.59 as cacher WORKDIR app RUN cargo install cargo-chef COPY --from=planner /app/recipe.json recipe.json @@ -15,7 +15,7 @@ COPY --from=planner /app/recipe.json recipe.json # Cache dependencies RUN cargo chef cook --release --recipe-path recipe.json -FROM rust:1.48 as builder +FROM rust:1.59 as builder WORKDIR app COPY . . @@ -24,7 +24,7 @@ COPY --from=cacher /app/target target COPY --from=cacher $CARGO_HOME $CARGO_HOME RUN cargo build --release -FROM ubuntu:20.10 as runtime +FROM ubuntu:20.04 as runtime WORKDIR app RUN apt-get update \ diff --git a/docs_src/compatibility.md b/docs_src/compatibility.md index f4727a15..0496e13d 100644 --- a/docs_src/compatibility.md +++ b/docs_src/compatibility.md @@ -1,6 +1,6 @@ # Compatibility -Scaphandre intends to provide multiple ways to gather power consumption metrics and make understanding tech services footprint possible in many situations. Depending on how you use scaph, you may have some restrictions. +Scaphandre intends to provide multiple ways to gather power consumption metrics and make understanding tech services footprint possible in many situations. Depending on how you use scaphandre, you may have some restrictions. To summarize, scaphandre should provide two ways to estimate the power consumption of a service, process or machine. Either by **measuring it**, using software interfaces that give access to hardware metrics, or by **estimating it** if measuring is not an option (this is a [planned feature](https://github.com/hubblo-org/scaphandre/issues/25), not yet implemented as those lines are written, in december 2020). 
diff --git a/docs_src/explanations/how-scaph-computes-per-process-power-consumption.md b/docs_src/explanations/how-scaph-computes-per-process-power-consumption.md index 9e27a357..6b940ed4 100644 --- a/docs_src/explanations/how-scaph-computes-per-process-power-consumption.md +++ b/docs_src/explanations/how-scaph-computes-per-process-power-consumption.md @@ -2,9 +2,10 @@ Scaphandre is a tool that makes it possible to see the power being used by a single process on a computer. -This sounds like a simple thing thing to be able to do, but in practice a number of details can make this more complex. +This sounds like a simple thing to be able to do, but in practice a number of details can make this more complex. + +So having a good mental model of how it works will make it easier to understand when and how to use Scaphandre. Let's start with a simplified mental model below, before moving on to multiple processors or virtual machines - but once you understand the key ideas outlined below, it's easier to see how they can be applied to thinking about tracking power on virtual machines, or when multiple processors are available. -So having a good mental for how it works will make it understand when and how to use Scaphandre. For simplicity we start with a simplified mental model below, before thinking about multiple processors or virtual machines - but once you understand the key ideas outlined below, it's easier to see how they can be applied to thinking about tracking power on in virtual machines, or when we have multiple processors available. ### How a computer works on multiple jobs at the same time When we first think about how much energy a single process running in a computer might use, we might start with a mental model that looks like the figure below, with large, uninterrupted chunks of compute time allocated to each process. 
@@ -15,38 +16,36 @@ This is easy to understand, and it matches how we might be billed for a share of #### Timesharing of work -However, if the reality was _exactly_ like this diagram, our computers would only ever be able to do something at a time. It's more accurate and helpful to think of computers working on lots of different jobs at the same time - they work on one job for short interval of time, then another, and another and so one. You'll often see these [small intervals of time referred to as _[jiffies][]_. - -[jiffies]: https://www.anshulpatel.in/post/linux_cpu_percentage/ +However, if the reality was _exactly_ like this diagram, our computers would only ever be able to do one thing at a time. It's more accurate and helpful to think of computers working on lots of different jobs at the same time - they work on one job for a short interval of time, then another, and another and so on. You'll often see these small intervals of time referred to as _[jiffies](https://www.anshulpatel.in/post/linux_cpu_percentage/)_. ![work on jobs is split into jiffies](../img/jiffies.png) -In a given amount of time, certain jobs that are more important, or resource intensive will use more jiffies than others. Fortunately, each job keeps a running total of the total jiffies allocated to it, so if we know how many jiffies have been used in total, it can give us an idea how much of a machine's resources are being used by a given process. +In a given amount of time, some jobs that are prioritized or more resource intensive will use more jiffies than others. Fortunately, each job keeps a running total of the total jiffies allocated to it, so if we know how many jiffies have been used in total, it can give us an idea how much of a machine's resources are being used by a given process. 
![work on jobs is split into jiffies](../img/total-time-share.png) ### Going from share of resources to actual power figures It's possible without Scaphandre to understand how large a share of a machines' resources are being used by a given process. -This is useful, by itself, but if we want to understand how much _power_ used per process, not just the share of the machine's resources, we need to know how much power is being used by the machine in absolute terms. +This is useful, by itself, but if we want to understand how much _power_ is used per process, not just the share of the machine's resources, we need to know how much power is being used by the machine in absolute terms. -To do this, we need a sensor of some kind to track power usage by the machine itself. Some servers have these, like with Intel's RAPL sensors, which we cover this in more detail later on. This makes it possible to understand how much power is being used by CPUs, GPUs and so on, in terms of watts, or if we are looking at just a single process, various fractions of a watt. +To do this, we need a sensor of some kind to track power usage by the machine itself. Some servers have these, like with Intel's RAPL sensors, which we cover in more detail later on. This makes it possible to understand how much power is being used by CPUs, GPUs and so on, in terms of watts, or, if we are looking at just a single process, various fractions of a watt. ![Sensors provide power over time](../img/power-over-time.png) -To understand the power used by a single process we combine both of these ideas. We count the jiffies used by _our_ job when it's being worked on, and for each jiffie, we check how much power is being drawn at those moments in time in absolute terms. +To understand the power used by a single process we combine both of these ideas. We count the jiffies used by _our_ job when it's being worked on, and for each jiffy, we check how much power is being drawn at those moments in time. 
![Combined we can see how much the power during 'our' jiffies](../img/power-and-share-of-usage.png) -Finally, when we group together all the power readings for all our jiffies over a given time period, we can arrive at a useable figure for how much power has been used, in terms of watt hours. +Finally, when we group together all the power readings for all our jiffies over a given time period, we can arrive at a usable figure for how much power has been used, in terms of watt hours. -Once you have a figure in terms of watt hours, there are various ways you can convert this to environmental impact. A common way is to use an _emission factor_ for the electricity used, to turn it into a quantity of carbon emissions. +Once we have a figure in terms of watt hours, there are various ways we can convert this to environmental impact. A common way is to use an _emission factor_ for the electricity used, to turn the power consumption data into an estimate of associated carbon emissions. ![Combined we can see how much the power during 'our' jiffies](../img/power-by-process.png) ### Working with virtualisation and multiple processors -While the reality is again more complicated than the diagram below, you ideas broadly apply when you introduce multiple processors too. +While the reality is again more complicated than the diagram below, the same ideas broadly apply when you introduce multiple processors too. If you are able to read from sensors that can share how much power is being used by the various processors at work, and know how much of the time is being allocated to our processes during those moments, you can get a good idea of what these figures are, at a per-process level. 
@@ -61,31 +60,31 @@ However, if a guest virtual machine or guest container _does_ have access to rea ## More details about how Scaphandre works - As you can see with the [prometheus exporter reference](../references/exporter-prometheus.md), scaphandre exporters can provide process level power consumption metrics. This section will explain how it is done and how it may be improved in the future. + ## Some details about RAPL We'll talk here about the case where scaphandre is able to effectively measure the power consumption of the host (see [compatibility](../compatibility.md) section for more on sensors and their prerequisites) and specifically about the [PowercapRAPL](../references/sensor-powercap_rapl.md) sensor. -Let's clarify what's happening when you collect metrics with scaphandre and this sensor. -RAPL stands for [Running Average Power Limit](https://01.org/blogs/2014/running-average-power-limit-%E2%80%93-rapl). It's a technnology embedded in most Intel and AMD x86 CPUs produced after 2012. +Let's clarify what's happening when you collect metrics with scaphandre and the RAPL sensor. +RAPL stands for [Running Average Power Limit](https://01.org/blogs/2014/running-average-power-limit-%E2%80%93-rapl). It's a technology embedded in most Intel and AMD x86 CPUs produced after 2012. Thanks to this technology it is possible to get the total energy consumption of the CPU, of the consumption per CPU socket, plus in some cases, the consumption of the DRAM controller. In most cases it represents the vast majority of the energy consumption of the machine (except when running GPU intensive workloads, for example). Further improvements shall be made in scaphandre to fully measure the consumption when GPU are involved (or a lot of hard drives on the same host...). -Between scaphandre and those data is the powercap kernel module that writes the energy consumption in files. 
Scaphandre, reads those files, stores the data in buffer and then allows for more processing through the exporters. +Between scaphandre and this data is the powercap kernel module that writes the energy consumption to files. Scaphandre reads those files, stores the data in buffers and then allows for more processing through the exporters. ### How to get the consumption of one process ? -The PowercapRAPL sensor does actually some more than just collecting those energy consumption metrics (and casting it in power consumption metrics). +The PowercapRAPL sensor actually does more than just collect those energy consumption metrics (and converting them to power consumption metrics). -Every time the exporter asks for a measurement (either periodically like in the [Stdout](../references/exporter-stdout.md) exporter, or every time a request comes like for the Prometheus exporter) the sensor reads the values of the energy counters from powercap. It then stores those values, and does the same for the CPU usage statistics of the CPU (the one you can see in `/proc/stats`) and for each running process on the machine at that time (see `/proc/PID/stats`). +Every time the exporter asks for a measurement (either periodically like in the [Stdout](../references/exporter-stdout.md) exporter, or every time a request comes as would be the case for the Prometheus exporter) the sensor reads the values of the energy counters from powercap. It then stores those values, and does the same for the CPU usage statistics of the CPU (the one you can see in `/proc/stat`) and for each running process on the machine at that time (see `/proc/PID/stat`). -With those data it is possible to compute the ratio of CPU time actively spent for a given PID on the CPU time actively spent doing something. With this ratio we can then get the subset of power consumption that is related to that PID on a given timeframe (between two measurement requests). 
+With this data it is possible to compute the ratio of CPU time actively spent for a given PID on the CPU time actively spent doing something. With this ratio we can then get the subset of power consumption that is related to that PID on a given timeframe (between two measurement requests). ### How to get the consumption of an application/a service ? -Services and programs are often not running only one PID. It's needed to aggregate the consumption of all related PIDs to know what this service is actually consuming. +Services and programs are often not running on only one PID. It's needed to aggregate the consumption of all related PIDs to know what this service is actually consuming. -To do that, in the current state of scaphandre development, you can use the Prometheus exporter, and then use Prometheus TSDB and query language capabilities. You'll find examples looking at the graphs and queries [here](https://metrics.hubblo.org). In a near future, more advanced features may be implemented in scaphandre to allow such classification even if you don't have access to a proper TSDB. +To do that, in the current state of scaphandre development, you can use the Prometheus exporter, and then use Prometheus and its query language capabilities. You'll find examples looking at the graphs and queries [here](https://metrics.hubblo.org). In a near future, more advanced features may be implemented in scaphandre to allow such classification even if you don't have access to a proper [Time Series database (TSDB)](https://en.wikipedia.org/wiki/Time_series_database). diff --git a/docs_src/explanations/internal-structure.md b/docs_src/explanations/internal-structure.md index 9a189ff2..8d667575 100644 --- a/docs_src/explanations/internal-structure.md +++ b/docs_src/explanations/internal-structure.md @@ -1,6 +1,6 @@ # Internal structure -Scaphandre is designed to be extensible. 
As it performs basically two tasks: **collecting**/pre-computing the power consumption metrics and **shipping** it, it is composed of two main components: a **sensor** and an **exporter**. Each can be implemented in different wats, to match a certain use case. When you run scaphandre from the command line, `-s` allows you to choose the sensor you want to use, and the next subcommand is the name of the exporter. +Scaphandre is designed to be extensible. As it performs basically two tasks: **collecting**/pre-computing the power consumption metrics and **publishing** it, it is composed of two main components: a **sensor** and an **exporter**. Each can be implemented in different ways to match a certain use case. When you run scaphandre from the command line, `-s` allows you to choose the sensor you want to use, and the next subcommand is the name of the exporter. ## Sensors @@ -9,7 +9,7 @@ Sensors are meant to: 1. get the power consumptions metrics of the host 2. make it available for the exporter -The [PowercapRAPL](../references/sensors-powercap_rapl.md) for instance, gets and transforms metrics coming from the powercap Linux kernel module, that serves as an interface to get the data from the [RAPL](https://01.org/blogs/2014/running-average-power-limit-%E2%80%93-rapl) feature of x86 CPUs. Because this feature is only accessible when you are running on a bare metal machine, this sensor will not work in a virtual machine, except if you first run scaphandre on the hypervisor and make the VM metrics available, with the [qemu exporter](../references/exporter-qemu.md), to scaphandre running inside the virtual machine. +The [PowercapRAPL](../references/sensor-powercap_rapl.md) for instance, gets and transforms metrics coming from the powercap Linux kernel module, that serves as an interface to get the data from the [RAPL](https://01.org/blogs/2014/running-average-power-limit-%E2%80%93-rapl) feature of x86 CPUs. 
Because this feature is only accessible when you are running on a bare metal machine, this sensor will not work in a virtual machine, except if you first run scaphandre on the hypervisor and make the VM metrics available, with the [qemu exporter](../references/exporter-qemu.md), to scaphandre running inside the virtual machine. When you don't have access to the hypervisor/bare-metal machine (ie. when you run on public cloud instances and your provider doesn't run scaphandre) you still have the option to estimate the power consumption, based on both the ressources (cpu/gpu/ram/io...) consumed by the virtual machine at a given time, and the characteristics of the underlying hardware. This is the way we are designing the future [estimation-based sensor](https://github.com/hubblo-org/scaphandre/issues/25), to match that use case. @@ -22,6 +22,6 @@ An exporter is expected to: 1. ask the sensors to get new metrics and store them for later, potential usage 2. export the current metrics -The [Stdout](../references/exporter-stdout.md) exporter exposes the metrics on the standard output (in your terminal). The [prometheus](../references/exporter-prometheus.md) exporter exposes the metrics on an http endpoint, to be scraped by a [prometheus](https://prometheus.io) instance. An exporter should be created for each monitoring scenario (do you want to feed your favorite monitoring/data analysis tool with scaphandre metrics ? feel free to open a [PR](https://github.com/hubblo-org/scaphandre/pulls) to create a new exporter !). +The [Stdout](../references/exporter-stdout.md) exporter exposes the metrics on the standard output (in your terminal). The [prometheus](../references/exporter-prometheus.md) exporter exposes the metrics on an HTTP endpoint, to be scraped by a [prometheus](https://prometheus.io) instance. An exporter should be created for each monitoring scenario (do you want to feed your favorite monitoring/data analysis tool with scaphandre metrics ? 
feel free to open a [PR](https://github.com/hubblo-org/scaphandre/pulls) to create a new exporter !). -As introduced in the [sensors](#sensors) section, the [Qemu](../references/exporter-qemu.md) exporter, is very specific. It is only intended to collect metrics related to running virtual machines on a Qemu/KVM hypervisor. Those metrics can then be made available to each virtual machine and it's own scaphandre instance, running the [PowercapRAPL](../references/sensor-powercap_rapl.md) sensor (with the `--vm` flag on). The qemu exporter puts VM's metrics in files the same way the powercap kernel module does it. It mimics this behavior, so the sensor can act the same way it would on a bare metal machine. \ No newline at end of file +As introduced in the [sensors](#sensors) section, the [Qemu](../references/exporter-qemu.md) exporter, is very specific. It is only intended to collect metrics related to running virtual machines on a Qemu/KVM hypervisor. Those metrics can then be made available to each virtual machine and their own scaphandre instance, running the [PowercapRAPL](../references/sensor-powercap_rapl.md) sensor (with the `--vm` flag on). The qemu exporter puts VM's metrics in files the same way the powercap kernel module does it. It mimics this behavior, so the sensor can act the same way it would on a bare metal machine. 
\ No newline at end of file diff --git a/docs_src/how-to_guides/propagate-metrics-hypervisor-to-vm_qemu-kvm.md b/docs_src/how-to_guides/propagate-metrics-hypervisor-to-vm_qemu-kvm.md index e14dd8e6..c3660a08 100644 --- a/docs_src/how-to_guides/propagate-metrics-hypervisor-to-vm_qemu-kvm.md +++ b/docs_src/how-to_guides/propagate-metrics-hypervisor-to-vm_qemu-kvm.md @@ -25,7 +25,7 @@ For each virtual machine you want to give access to its metrics, create a [tmpfs mount -t tmpfs tmpfs_DOMAIN_NAME /var/lib/libvirt/scaphandre/DOMAIN_NAME -o size=5m -In the definition of the virtual machine (ehre we are using libvirt), ensure you have a filesystem configuration to give access to the mountpoint: +In the definition of the virtual machine (here we are using libvirt), ensure you have a filesystem configuration to give access to the mountpoint: virsh edit DOMAIN_NAME @@ -50,4 +50,4 @@ You can now run scaphandre to export the metrics with the exporter of your choic Please refer to the [qemu exporter](docs/exporters/qemu.md) reference for more details. -**Note:** This how to is only suitable for a "manual" use case. For all automated systems like openstack or proxmox, some more work needs to be done to make the integration of those steps easier. \ No newline at end of file +**Note:** This how to is only suitable for a "manual" use case. For all automated systems like openstack or proxmox, some more work needs to be done to make the integration of those steps easier. 
diff --git a/helm/scaphandre/templates/psp.yaml b/helm/scaphandre/templates/psp.yaml index a3ba2f8a..f7d702d5 100644 --- a/helm/scaphandre/templates/psp.yaml +++ b/helm/scaphandre/templates/psp.yaml @@ -22,6 +22,7 @@ spec: - min: 1 max: 65535 volumes: - - 'hostPath' + - hostPath + - projected hostPID: true hostIPC: true diff --git a/src/exporters/json.rs b/src/exporters/json.rs index e901c450..c687bfa3 100644 --- a/src/exporters/json.rs +++ b/src/exporters/json.rs @@ -26,7 +26,6 @@ impl Exporter for JSONExporter { fn get_options() -> Vec> { let mut options = Vec::new(); let arg = Arg::with_name("timeout") - .default_value("10") .help("Maximum time spent measuring, in seconds.") .long("timeout") .short("t") @@ -127,7 +126,6 @@ impl JSONExporter { /// Runs iteration() every 'step', until 'timeout' pub fn runner(&mut self, parameters: ArgMatches) { - let timeout = parameters.value_of("timeout").unwrap(); let topology = self.sensor.get_topology().unwrap(); let mut metric_generator = MetricGenerator::new( topology, @@ -135,32 +133,34 @@ impl JSONExporter { parameters.is_present("qemu"), parameters.is_present("containers"), ); - if timeout.is_empty() { - self.iterate(¶meters, &mut metric_generator); - } else { - let now = Instant::now(); - let timeout_secs: u64 = timeout.parse().unwrap(); - - // We have a default value of 2s so it is safe to unwrap the option - // Panic if a non numerical value is passed - let step_duration: u64 = parameters - .value_of("step_duration") - .unwrap() - .parse() - .expect("Wrong step_duration value, should be a number of seconds"); - let step_duration_nano: u32 = parameters - .value_of("step_duration_nano") - .unwrap() - .parse() - .expect("Wrong step_duration_nano value, should be a number of nano seconds"); + // We have a default value of 2s so it is safe to unwrap the option + // Panic if a non numerical value is passed + let step_duration: u64 = parameters + .value_of("step_duration") + .unwrap() + .parse() + .expect("Wrong 
step_duration value, should be a number of seconds"); + let step_duration_nano: u32 = parameters + .value_of("step_duration_nano") + .unwrap() + .parse() + .expect("Wrong step_duration_nano value, should be a number of nano seconds"); - info!("Measurement step is: {}s", step_duration); + info!("Measurement step is: {}s", step_duration); + if let Some(timeout) = parameters.value_of("timeout") { + let now = Instant::now(); + let timeout_secs: u64 = timeout.parse().unwrap(); while now.elapsed().as_secs() <= timeout_secs { self.iterate(¶meters, &mut metric_generator); thread::sleep(Duration::new(step_duration, step_duration_nano)); } + } else { + loop { + self.iterate(¶meters, &mut metric_generator); + thread::sleep(Duration::new(step_duration, step_duration_nano)); + } } } diff --git a/src/exporters/mod.rs b/src/exporters/mod.rs index 04befba1..6cc26d9e 100644 --- a/src/exporters/mod.rs +++ b/src/exporters/mod.rs @@ -598,13 +598,13 @@ impl MetricGenerator { if let Ok(pods_result) = kubernetes.list_pods("".to_string()) { self.pods = pods_result; debug!("Found {} pods", &self.pods.len()); - self.pods_last_check = current_system_time_since_epoch().as_secs().to_string(); } else { info!("Failed getting pods list, despite client seems ok."); } } else { debug!("Kubernetes socket is not some."); } + self.pods_last_check = current_system_time_since_epoch().as_secs().to_string(); } } @@ -692,7 +692,7 @@ impl MetricGenerator { attributes.insert("exe".to_string(), exe.clone()); if let Some(cmdline_str) = cmdline { - attributes.insert("cmdline".to_string(), cmdline_str.replace("\"", "\\\"")); + attributes.insert("cmdline".to_string(), cmdline_str.replace('\"', "\\\"")); if self.qemu { if let Some(vmname) = utils::filter_qemu_cmdline(&cmdline_str) { diff --git a/src/exporters/prometheus.rs b/src/exporters/prometheus.rs index 3911c6e3..5651e70f 100644 --- a/src/exporters/prometheus.rs +++ b/src/exporters/prometheus.rs @@ -194,7 +194,7 @@ fn format_metric(key: &str, value: &str, 
labels: Option<&HashMap if let Some(labels) = labels { result.push('{'); for (k, v) in labels.iter() { - result.push_str(&format!("{}=\"{}\",", k, v.replace("\"", "_"))); + result.push_str(&format!("{}=\"{}\",", k, v.replace('\"', "_"))); } result.remove(result.len() - 1); result.push('}'); diff --git a/src/exporters/qemu.rs b/src/exporters/qemu.rs index eda2bf68..5e65967b 100644 --- a/src/exporters/qemu.rs +++ b/src/exporters/qemu.rs @@ -79,12 +79,11 @@ impl QemuExporter { Err(error) => panic!("Couldn't create {}. Got: {}", &path, error), } } - let tdiff = time_tdiff.total_time_jiffies() - * procfs::ticks_per_second().unwrap() as f32; + let tdiff = time_tdiff.total_time_jiffies(); trace!("Time_pdiff={} time_tdiff={}", time_pdiff.to_string(), tdiff); - let ratio = time_pdiff as f32 / tdiff; + let ratio = time_pdiff / tdiff; trace!("Ratio is {}", ratio.to_string()); - let uj_to_add = ratio * topo_rec_uj.value.parse::().unwrap(); + let uj_to_add = ratio * topo_rec_uj.value.parse::().unwrap(); trace!("Adding {} uJ", uj_to_add); let complete_path = format!("{}/{}/intel-rapl:0", path, vm_name); if let Ok(result) = diff --git a/src/exporters/riemann.rs b/src/exporters/riemann.rs index 45378e37..d8cdb76b 100644 --- a/src/exporters/riemann.rs +++ b/src/exporters/riemann.rs @@ -38,17 +38,16 @@ impl RiemannClient { .unwrap() .parse::() .expect("Fail parsing port number"); - let client: Client; - if parameters.is_present("mtls") { + let client: Client = if parameters.is_present("mtls") { let cafile = parameters.value_of("cafile").unwrap(); let certfile = parameters.value_of("certfile").unwrap(); let keyfile = parameters.value_of("keyfile").unwrap(); - client = Client::connect_tls(&address, port, cafile, certfile, keyfile) - .expect("Fail to connect to Riemann server using mTLS"); + Client::connect_tls(&address, port, cafile, certfile, keyfile) + .expect("Fail to connect to Riemann server using mTLS") } else { - client = Client::connect(&(address, port)) - .expect("Fail to 
connect to Riemann server using raw TCP"); - } + Client::connect(&(address, port)) + .expect("Fail to connect to Riemann server using raw TCP") + }; RiemannClient { client } } @@ -88,7 +87,7 @@ impl RiemannClient { i64::try_from(value).expect("Metric cannot be converted to signed integer."), ), MetricValueType::Text(ref value) => { - let value = value.replace(",", ".").replace("\n", ""); + let value = value.replace(',', ".").replace('\n', ""); if value.contains('.') { event.set_metric_d(value.parse::().expect("Cannot parse metric value.")); } else { @@ -184,7 +183,7 @@ impl Exporter for RiemannExporter { attributes.insert("exe".to_string(), exe.clone()); if let Some(cmdline_str) = cmdline { - attributes.insert("cmdline".to_string(), cmdline_str.replace("\"", "\\\"")); + attributes.insert("cmdline".to_string(), cmdline_str.replace('\"', "\\\"")); if parameters.is_present("qemu") { if let Some(vmname) = utils::filter_qemu_cmdline(&cmdline_str) { @@ -197,9 +196,7 @@ impl Exporter for RiemannExporter { // to differentiate services/metrics let metric_name = format!( "{}_{}_{}", - "scaph_process_power_consumption_microwatts", - pid.to_string(), - exe + "scaph_process_power_consumption_microwatts", pid, exe ); if let Some(power) = metric_generator .topology diff --git a/src/exporters/stdout.rs b/src/exporters/stdout.rs index 6f55e638..85be82a3 100644 --- a/src/exporters/stdout.rs +++ b/src/exporters/stdout.rs @@ -99,17 +99,16 @@ impl StdoutExporter { .parse() .expect("Wrong process_number value, should be a number"); - let regex_filter: Option; - if !parameters.is_present("regex_filter") + let regex_filter: Option = if !parameters.is_present("regex_filter") || parameters.value_of("regex_filter").unwrap().is_empty() { - regex_filter = None; + None } else { - regex_filter = Some( + Some( Regex::new(parameters.value_of("regex_filter").unwrap()) .expect("Wrong regex_filter, regexp is invalid"), - ); - } + ) + }; if parameters.occurrences_of("regex_filter") == 1 && 
parameters.occurrences_of("process_number") == 1 @@ -229,20 +228,20 @@ impl StdoutExporter { println!("{}\n", to_print); } - let consumers: Vec<(procfs::process::Process, u64)>; - if let Some(regex_filter) = regex_filter { - println!("Processes filtered by '{}':", regex_filter.as_str()); - consumers = metric_generator - .topology - .proc_tracker - .get_filtered_processes(regex_filter); - } else { - println!("Top {} consumers:", process_number); - consumers = metric_generator - .topology - .proc_tracker - .get_top_consumers(process_number); - } + let consumers: Vec<(procfs::process::Process, u64)> = + if let Some(regex_filter) = regex_filter { + println!("Processes filtered by '{}':", regex_filter.as_str()); + metric_generator + .topology + .proc_tracker + .get_filtered_processes(regex_filter) + } else { + println!("Top {} consumers:", process_number); + metric_generator + .topology + .proc_tracker + .get_top_consumers(process_number) + }; info!("consumers : {:?}", consumers); println!("Power\t\tPID\tExe"); diff --git a/src/exporters/warpten.rs b/src/exporters/warpten.rs index adfba807..70343ce0 100644 --- a/src/exporters/warpten.rs +++ b/src/exporters/warpten.rs @@ -19,17 +19,16 @@ impl Exporter for Warp10Exporter { let host = parameters.value_of("host").unwrap(); let scheme = parameters.value_of("scheme").unwrap(); let port = parameters.value_of("port").unwrap(); - let write_token; - if let Some(token) = parameters.value_of("write-token") { - write_token = token.to_owned(); + let write_token = if let Some(token) = parameters.value_of("write-token") { + token.to_owned() } else { - write_token = match env::var("SCAPH_WARP10_WRITE_TOKEN") { + match env::var("SCAPH_WARP10_WRITE_TOKEN") { Ok(val) => val, Err(_e) => panic!( "SCAPH_WARP10_WRITE_TOKEN not found in env, nor write-token flag was used." 
), - }; - } + } + }; //let read_token = parameters.value_of("read-token"); let step = parameters.value_of("step").unwrap(); let qemu = parameters.is_present("qemu"); @@ -343,14 +342,12 @@ impl Warp10Exporter { } plabels.push(warp10::Label::new( "cmdline", - &cmdline_str.replace("\"", "\\\""), + &cmdline_str.replace('\"', "\\\""), )); } let metric_name = format!( "{}_{}_{}", - "scaph_process_power_consumption_microwats", - pid.to_string(), - exe + "scaph_process_power_consumption_microwats", pid, exe ); if let Some(power) = self.topology.get_process_power_consumption_microwatts(pid) { process_data.push(warp10::Data::new( diff --git a/src/sensors/mod.rs b/src/sensors/mod.rs index c15e4525..b3d8a0f6 100644 --- a/src/sensors/mod.rs +++ b/src/sensors/mod.rs @@ -389,8 +389,8 @@ impl Topology { /// and previous stats measurement (from stat_buffer), attribute by attribute. pub fn get_stats_diff(&self) -> Option { if self.stat_buffer.len() > 1 { - let last = &self.stat_buffer[0].cputime; - let previous = &self.stat_buffer[1].cputime; + let last = &self.stat_buffer[0]; + let previous = &self.stat_buffer[1]; let mut iowait = None; let mut irq = None; let mut softirq = None; @@ -416,18 +416,16 @@ impl Topology { guest_nice = Some(last.guest_nice.unwrap() - previous.guest_nice.unwrap()); } return Some(CPUStat { - cputime: CpuTime { - user: last.user - previous.user, - nice: last.nice - previous.nice, - system: last.system - previous.system, - idle: last.idle - previous.idle, - iowait, - irq, - softirq, - steal, - guest, - guest_nice, - }, + user: last.user - previous.user, + nice: last.nice - previous.nice, + system: last.system - previous.system, + idle: last.idle - previous.idle, + iowait, + irq, + softirq, + steal, + guest, + guest_nice, }); } None @@ -438,7 +436,16 @@ impl Topology { let kernelstats_or_not = KernelStats::new(); if let Ok(res_cputime) = kernelstats_or_not { return Some(CPUStat { - cputime: res_cputime.total, + user: res_cputime.total.user, + guest: 
res_cputime.total.guest, + guest_nice: res_cputime.total.guest_nice, + idle: res_cputime.total.idle, + iowait: res_cputime.total.iowait, + irq: res_cputime.total.irq, + nice: res_cputime.total.nice, + softirq: res_cputime.total.softirq, + steal: res_cputime.total.steal, + system: res_cputime.total.system, }); } None @@ -489,8 +496,7 @@ impl Topology { //trace!("Topology stats measured diff: {:?}", topo_stats_diff); let process_total_time = last.total_time_jiffies() - previous.total_time_jiffies(); - let topo_total_time = topo_stats_diff.total_time_jiffies() - * procfs::ticks_per_second().unwrap() as f32; + let topo_total_time = topo_stats_diff.total_time_jiffies(); let usage_percent = process_total_time as f64 / topo_total_time as f64; let topo_conso = self.get_records_diff_power_microwatts(); if let Some(val) = &topo_conso { @@ -523,14 +529,13 @@ impl Topology { let process_total_time = last.total_time_jiffies() - previous.total_time_jiffies(); - let topo_total_time = topo_stats_diff.total_time_jiffies() - * procfs::ticks_per_second().unwrap() as f32; + let topo_total_time = topo_stats_diff.total_time_jiffies(); let usage = process_total_time as f64 / topo_total_time as f64; return Some(Record::new( current_system_time_since_epoch(), - (usage * 100.0).to_string(), + usage.to_string(), units::Unit::Percentage, )); } @@ -750,32 +755,28 @@ impl CPUSocket { /// a CpuTime struct containing stats for the whole socket. 
pub fn read_stats(&self) -> Option { let mut stats = CPUStat { - cputime: CpuTime { - user: 0.0, - nice: 0.0, - system: 0.0, - idle: 0.0, - iowait: Some(0.0), - irq: Some(0.0), - softirq: Some(0.0), - guest: Some(0.0), - guest_nice: Some(0.0), - steal: Some(0.0), - }, + user: 0, + nice: 0, + system: 0, + idle: 0, + iowait: Some(0), + irq: Some(0), + softirq: Some(0), + guest: Some(0), + guest_nice: Some(0), + steal: Some(0), }; for c in &self.cpu_cores { let c_stats = c.read_stats().unwrap(); - stats.cputime.user += c_stats.user; - stats.cputime.nice += c_stats.nice; - stats.cputime.system += c_stats.system; - stats.cputime.idle += c_stats.idle; - stats.cputime.iowait = - Some(stats.cputime.iowait.unwrap_or_default() + c_stats.iowait.unwrap_or_default()); - stats.cputime.irq = - Some(stats.cputime.irq.unwrap_or_default() + c_stats.irq.unwrap_or_default()); - stats.cputime.softirq = Some( - stats.cputime.softirq.unwrap_or_default() + c_stats.softirq.unwrap_or_default(), - ); + stats.user += c_stats.user; + stats.nice += c_stats.nice; + stats.system += c_stats.system; + stats.idle += c_stats.idle; + stats.iowait = + Some(stats.iowait.unwrap_or_default() + c_stats.iowait.unwrap_or_default()); + stats.irq = Some(stats.irq.unwrap_or_default() + c_stats.irq.unwrap_or_default()); + stats.softirq = + Some(stats.softirq.unwrap_or_default() + c_stats.softirq.unwrap_or_default()); } Some(stats) } @@ -785,8 +786,8 @@ impl CPUSocket { /// by field. 
pub fn get_stats_diff(&mut self) -> Option { if self.stat_buffer.len() > 1 { - let last = &self.stat_buffer[0].cputime; - let previous = &self.stat_buffer[1].cputime; + let last = &self.stat_buffer[0]; + let previous = &self.stat_buffer[1]; let mut iowait = None; let mut irq = None; let mut softirq = None; @@ -812,18 +813,16 @@ impl CPUSocket { guest_nice = Some(last.guest_nice.unwrap() - previous.guest_nice.unwrap()); } return Some(CPUStat { - cputime: CpuTime { - user: last.user - previous.user, - nice: last.nice - previous.nice, - system: last.system - previous.system, - idle: last.idle - previous.idle, - iowait, - irq, - softirq, - steal, - guest, - guest_nice, - }, + user: last.user - previous.user, + nice: last.nice - previous.nice, + system: last.system - previous.system, + idle: last.idle - previous.idle, + iowait, + irq, + softirq, + steal, + guest, + guest_nice, }); } None @@ -842,15 +841,26 @@ impl CPUSocket { "last_record value: {} previous_record value: {}", &last_record.value, &previous_record.value ); - if let (Ok(last_microjoules), Ok(previous_microjoules)) = ( - last_record.value.trim().parse::(), - previous_record.value.trim().parse::(), - ) { - let microjoules = last_microjoules - previous_microjoules; + let last_rec_val = last_record.value.trim(); + debug!("l851 : trying to parse {} as u64", last_rec_val); + let prev_rec_val = previous_record.value.trim(); + debug!("l853 : trying to parse {} as u64", prev_rec_val); + if let (Ok(last_microjoules), Ok(previous_microjoules)) = + (last_rec_val.parse::(), prev_rec_val.parse::()) + { + let mut microjoules = 0; + if last_microjoules >= previous_microjoules { + microjoules = last_microjoules - previous_microjoules; + } else { + debug!( + "previous_microjoules ({}) > last_microjoules ({})", + previous_microjoules, last_microjoules + ); + } let time_diff = last_record.timestamp.as_secs_f64() - previous_record.timestamp.as_secs_f64(); let microwatts = microjoules as f64 / time_diff; - debug!("microwatts: 
{}", microwatts); + debug!("l866: microwatts: {}", microwatts); return Some(Record::new( last_record.timestamp, (microwatts as u64).to_string(), @@ -1052,23 +1062,32 @@ impl fmt::Display for Record { #[derive(Debug)] pub struct CPUStat { - pub cputime: CpuTime, + user: u64, + nice: u64, + system: u64, + idle: u64, + irq: Option, + iowait: Option, + softirq: Option, + steal: Option, + guest: Option, + guest_nice: Option, } impl CPUStat { /// Returns the total of active CPU time spent, for this stat measurement /// (not iowait, idle, irq or softirq) - pub fn total_time_jiffies(&self) -> f32 { - let user = self.cputime.user; - let nice = self.cputime.nice; - let system = self.cputime.system; - let idle = self.cputime.idle; - let irq = self.cputime.irq.unwrap_or_default(); - let iowait = self.cputime.iowait.unwrap_or_default(); - let softirq = self.cputime.softirq.unwrap_or_default(); - let steal = self.cputime.steal.unwrap_or_default(); - let guest_nice = self.cputime.guest_nice.unwrap_or_default(); - let guest = self.cputime.guest.unwrap_or_default(); + pub fn total_time_jiffies(&self) -> u64 { + let user = self.user; + let nice = self.nice; + let system = self.system; + let idle = self.idle; + let irq = self.irq.unwrap_or_default(); + let iowait = self.iowait.unwrap_or_default(); + let softirq = self.softirq.unwrap_or_default(); + let steal = self.steal.unwrap_or_default(); + let guest_nice = self.guest_nice.unwrap_or_default(); + let guest = self.guest.unwrap_or_default(); trace!( "CPUStat contains user {} nice {} system {} idle: {} irq {} softirq {} iowait {} steal {} guest_nice {} guest {}", @@ -1082,18 +1101,16 @@ impl Clone for CPUStat { /// Returns a copy of CPUStat instance fn clone(&self) -> CPUStat { CPUStat { - cputime: CpuTime { - user: self.cputime.user, - nice: self.cputime.nice, - system: self.cputime.system, - softirq: self.cputime.softirq, - irq: self.cputime.irq, - idle: self.cputime.idle, - iowait: self.cputime.iowait, - steal: self.cputime.steal, 
- guest: self.cputime.guest, - guest_nice: self.cputime.guest_nice, - }, + user: self.user, + guest: self.guest, + guest_nice: self.guest_nice, + idle: self.idle, + iowait: self.iowait, + irq: self.irq, + nice: self.nice, + softirq: self.softirq, + steal: self.steal, + system: self.system, } } } @@ -1112,7 +1129,7 @@ mod tests { for c in &cores { println!("{:?}", c.attributes.get("processor")); } - assert_eq!(cores.len() > 0, true); + assert_eq!(!cores.is_empty(), true); for c in &cores { assert_eq!(c.attributes.len() > 5, true); } diff --git a/src/sensors/utils.rs b/src/sensors/utils.rs index 4b1c5a64..95f1b3b8 100644 --- a/src/sensors/utils.rs +++ b/src/sensors/utils.rs @@ -205,7 +205,7 @@ impl ProcessTracker { if let Some(container) = containers.iter().find(|x| x.Id == container_id) { let mut names = String::from(""); for n in &container.Names { - names.push_str(&n.trim().replace("/", "")); + names.push_str(&n.trim().replace('/', "")); } description.insert(String::from("container_names"), names); description.insert(