From b99430fe6658506bbb893bd57181475640882124 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Sat, 9 Nov 2024 09:50:25 +0800 Subject: [PATCH 1/6] Add RDMA collector Signed-off-by: Xiaodong Ye --- collector/helper.go | 8 ++ collector/rdma_linux.go | 240 ++++++++++++++++++++++++++++++++++++++++ go.mod | 3 + go.sum | 7 ++ 4 files changed, 258 insertions(+) create mode 100644 collector/rdma_linux.go diff --git a/collector/helper.go b/collector/helper.go index 21dddf887c..937d277331 100644 --- a/collector/helper.go +++ b/collector/helper.go @@ -32,6 +32,14 @@ func readUintFromFile(path string) (uint64, error) { return value, nil } +func readStringFromFile(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + return strings.TrimSpace(string(data)) +} + var metricNameRegex = regexp.MustCompile(`_*[^0-9A-Za-z_]+_*`) // SanitizeMetricName sanitize the given metric name by replacing invalid characters by underscores. diff --git a/collector/rdma_linux.go b/collector/rdma_linux.go new file mode 100644 index 0000000000..ac7bb05214 --- /dev/null +++ b/collector/rdma_linux.go @@ -0,0 +1,240 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !nordma +// +build !nordma + +// The hard work of collecting data from the kernel via the MLNX_OFED interfaces is done by +// https://github.com/Mellanox/rdmamap +// by Mellanox. Used under the Apache 2.0 license. + +package collector + +import ( + "fmt" + "log/slog" + "os" + "path/filepath" + "regexp" + "strings" + "sync" + + "github.com/Mellanox/rdmamap" + "github.com/alecthomas/kingpin/v2" + "github.com/prometheus/client_golang/prometheus" +) + +var ( + rdmaDeviceInclude = kingpin.Flag("collector.rdma.device-include", "Regexp of rdma devices to include (mutually exclusive to device-exclude).").String() + rdmaDeviceExclude = kingpin.Flag("collector.rdma.device-exclude", "Regexp of rdma devices to exclude (mutually exclusive to device-include).").String() + rdmaIncludedMetrics = kingpin.Flag("collector.rdma.metrics-include", "Regexp of rdma stats to include.").Default(".*").String() + + rdmaHwCounters = map[string]string{ + "roce_slow_restart_cnps": "RDMA RoCE slow restart CNPS", + "rp_cnp_ignored": "RDMA RP CNP ignored", + "roce_adp_retrans_to": "RDMA RoCE adaptive retransmission timeout", + "rx_icrc_encapsulated": "RDMA RX ICRC encapsulated", + "resp_local_length_error": "RDMA response local length error", + "np_ecn_marked_roce_packets": "RDMA NP ECN marked RoCE packets", + "roce_slow_restart_trans": "RDMA RoCE slow restart transactions", + "req_remote_invalid_request": "RDMA request remote invalid request", + "local_ack_timeout_err": "RDMA local ACK timeout error", + "lifespan": "RDMA lifespan", + "req_cqe_error": "RDMA request CQE error", + "rnr_nak_retry_err": "RDMA RNR NAK retry error", + "np_cnp_sent": "RDMA NP CNP sent", + "rx_dct_connect": "RDMA RX DCT connect", + "rp_cnp_handled": "RDMA RP CNP handled", + "implied_nak_seq_err": "RDMA implied NAK sequence error", + "roce_slow_restart": "RDMA RoCE slow restart", + "req_cqe_flush_error": "RDMA request CQE flush error", + "packet_seq_err": "RDMA packet sequence error", + "duplicate_request": "RDMA duplicate request", + "roce_adp_retrans": "RDMA RoCE adaptive retransmission", + "out_of_buffer": "RDMA out of buffer", + "resp_cqe_error": "RDMA response CQE error", + "resp_cqe_flush_error": "RDMA response CQE flush error", + "out_of_sequence": "RDMA out of sequence", + "rx_read_requests": "RDMA RX read requests", + "rx_atomic_requests": "RDMA RX atomic requests", + "req_remote_access_errors": "RDMA request remote access errors", + "rx_write_requests": "RDMA RX write requests", + "resp_remote_access_errors": "RDMA response remote access errors", + } + rdmaCounters = map[string]string{ + "unicast_rcv_packets": "RDMA unicast received packets", + "port_xmit_data": "RDMA port transmit data", + "port_xmit_constraint_errors": "RDMA port transmit constraint errors", + "VL15_dropped": "RDMA VL15 dropped", + "port_rcv_errors": "RDMA port receive errors", + "port_xmit_wait": "RDMA port transmit wait", + "link_error_recovery": "RDMA link error recovery", + "multicast_rcv_packets": "RDMA multicast received packets", + "multicast_xmit_packets": "RDMA multicast transmitted packets", + "port_rcv_remote_physical_errors": "RDMA port receive remote physical errors", + "port_rcv_packets": "RDMA port receive packets", + "unicast_xmit_packets": "RDMA unicast transmitted packets", + "excessive_buffer_overrun_errors": "RDMA excessive buffer overrun errors", + "port_rcv_data": "RDMA port receive data", + "port_rcv_constraint_errors": "RDMA port receive constraint errors", + "link_downed": "RDMA link downed", + "local_link_integrity_errors": "RDMA local link integrity errors", + "port_xmit_discards": "RDMA port transmit discards", + "port_rcv_switch_relay_errors": "RDMA port receive switch relay errors", + "port_xmit_packets": "RDMA port transmit packets", + "symbol_error": "RDMA symbol error", + } +) + +type rdmaCollector struct { + entries map[string]*prometheus.Desc + entriesMutex sync.Mutex + deviceFilter deviceFilter + infoDesc *prometheus.Desc + metricsPattern *regexp.Regexp + logger *slog.Logger +} + +// makeRdmaCollector is the internal constructor for rdmaCollector. +func makeRdmaCollector(logger *slog.Logger) (*rdmaCollector, error) { + if *rdmaDeviceInclude != "" { + logger.Info("Parsed flag --collector.rdma.device-include", "flag", *rdmaDeviceInclude) + } + if *rdmaDeviceExclude != "" { + logger.Info("Parsed flag --collector.rdma.device-exclude", "flag", *rdmaDeviceExclude) + } + if *rdmaIncludedMetrics != "" { + logger.Info("Parsed flag --collector.rdma.metrics-include", "flag", *rdmaIncludedMetrics) + } + + entries := make(map[string]*prometheus.Desc, len(rdmaHwCounters)+len(rdmaCounters)) + for metric, help := range rdmaHwCounters { + entries[metric] = prometheus.NewDesc( + buildRdmaFQName(fmt.Sprintf("hw_%s", metric)), + help, + []string{"device", "port", "interfaces"}, nil, + ) + } + for metric, help := range rdmaCounters { + entries[metric] = prometheus.NewDesc( + buildRdmaFQName(metric), + help, + []string{"device", "port", "interfaces"}, nil, + ) + } + + // Pre-populate some common rdma metrics. + return &rdmaCollector{ + deviceFilter: newDeviceFilter(*rdmaDeviceExclude, *rdmaDeviceInclude), + metricsPattern: regexp.MustCompile(*rdmaIncludedMetrics), + logger: logger, + entries: entries, + infoDesc: prometheus.NewDesc( + buildRdmaFQName("info"), + "A metric with a constant '1' value labeled by device, vendor_id, device_id, firmware_version, driver_version.", + []string{"device", "vendor_id", "device_id", "firmware_version", "driver_version"}, nil, + ), + }, nil +} + +func init() { + registerCollector("rdma", defaultDisabled, NewRdmaCollector) +} + +// Generate the fully-qualified metric name for the rdma metric. +func buildRdmaFQName(metric string) string { + metricName := strings.TrimLeft(strings.ToLower(SanitizeMetricName(metric)), "_") + return prometheus.BuildFQName(namespace, "rdma", metricName) +} + +// NewRdmaCollector returns a new Collector exposing rdma stats. +func NewRdmaCollector(logger *slog.Logger) (Collector, error) { + return makeRdmaCollector(logger) +} + +func getNetworkInterfaces(rdmaDeviceName string) string { + var ifs []string + + dir := filepath.Join(rdmamap.RdmaClassDir, rdmaDeviceName, "device", "net") + fd, err := os.Open(dir) + if err != nil { + return "" + } + defer fd.Close() + + fileInfos, err := fd.Readdir(-1) + if err != nil { + return "" + } + + for i := range fileInfos { + if fileInfos[i].Name() == "." || fileInfos[i].Name() == ".." { + continue + } + ifs = append(ifs, fileInfos[i].Name()) + } + return strings.Join(ifs, ",") +} + +func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error { + rdmaDevices := rdmamap.GetRdmaDeviceList() + if len(rdmaDevices) == 0 { + return fmt.Errorf("no rdma devices found") + } + + for _, device := range rdmaDevices { + if c.deviceFilter.ignored(device) { + continue + } + + interfaces := getNetworkInterfaces(device) + + stats, err := rdmamap.GetRdmaSysfsAllPortsStats(device) + if err != nil { + c.logger.Error("rdma stats error", "err", err, "device", device) + continue + } + + updateFunc := func(name string, value float64, labelValues ...string) { + if !c.metricsPattern.MatchString(name) { + return + } + ch <- prometheus.MustNewConstMetric(c.entry(name), prometheus.GaugeValue, + value, labelValues...) + } + + for _, portstats := range stats.PortStats { + for _, stat := range portstats.HwStats { + updateFunc(stat.Name, float64(stat.Value), device, fmt.Sprintf("%d", portstats.Port), interfaces) + } + for _, stat := range portstats.Stats { + updateFunc(stat.Name, float64(stat.Value), device, fmt.Sprintf("%d", portstats.Port), interfaces) + } + } + + vendorID := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, device, "device", "vendor")) + deviceID := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, device, "device", "device")) + firmwareVersion := readStringFromFile("/sys/class/infiniband/mlx5_0/fw_ver") + driverVersion := readStringFromFile("/sys/module/mlx5_core/version") + ch <- prometheus.MustNewConstMetric(c.infoDesc, prometheus.GaugeValue, 1.0, + device, vendorID, deviceID, firmwareVersion, driverVersion) + } + + return nil +} + +func (c *rdmaCollector) entry(key string) *prometheus.Desc { + c.entriesMutex.Lock() + defer c.entriesMutex.Unlock() + return c.entries[key] +} diff --git a/go.mod b/go.mod index 7d4f847139..82d7a0cba2 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/prometheus/node_exporter go 1.22.0 require ( + github.com/Mellanox/rdmamap v1.1.0 github.com/alecthomas/kingpin/v2 v2.4.0 github.com/beevik/ntp v1.4.3 github.com/coreos/go-systemd/v22 v22.5.0 @@ -48,6 +49,8 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 // indirect + github.com/vishvananda/netlink v1.1.0 // indirect + github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect github.com/xhit/go-str2duration/v2 v2.1.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect diff --git a/go.sum b/go.sum index 5787f76dcc..5b4b0f1009 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/Mellanox/rdmamap v1.1.0 h1:A/W1wAXw+6vm58f3VklrIylgV+eDJlPVIMaIKuxgUT4= +github.com/Mellanox/rdmamap v1.1.0/go.mod h1:fN+/V9lf10ABnDCwTaXRjeeWijLt2iVLETnK+sx/LY8= github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY= github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= @@ -96,6 +98,10 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0= +github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= @@ -112,6 +118,7 @@ golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20211031064116-611d5d643895/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= From 09f6565f466b6b5d6e59b6040fa6c1af5eb511f0 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Sun, 10 Nov 2024 14:00:14 +0800 Subject: [PATCH 2/6] Address review comments Signed-off-by: Xiaodong Ye --- collector/rdma_linux.go | 23 +++++++++++++++++++++-- go.mod | 4 +++- go.sum | 7 ++++--- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/collector/rdma_linux.go b/collector/rdma_linux.go index ac7bb05214..ee16c89662 100644 --- a/collector/rdma_linux.go +++ b/collector/rdma_linux.go @@ -117,6 +117,25 @@ func makeRdmaCollector(logger *slog.Logger) (*rdmaCollector, error) { logger.Info("Parsed flag --collector.rdma.metrics-include", "flag", *rdmaIncludedMetrics) } + // Update paths to respect the mount points setup. + for _, dir := range []*string{ + &rdmamap.RdmaClassDir, + &rdmamap.RdmaIbUcmDir, + &rdmamap.RdmaUmadDir, + &rdmamap.RdmaUverbsDir, + &rdmamap.PciDevDir, + &rdmamap.AuxDevDir, + } { + *dir = strings.TrimPrefix(*dir, "/sys") + *dir = sysFilePath(*dir) + } + for _, dir := range []*string{ + &rdmamap.RdmaUcmDevice, + &rdmamap.RdmaDeviceDir, + } { + *dir = rootfsFilePath(*dir) + } + entries := make(map[string]*prometheus.Desc, len(rdmaHwCounters)+len(rdmaCounters)) for metric, help := range rdmaHwCounters { entries[metric] = prometheus.NewDesc( @@ -224,8 +243,8 @@ func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error { vendorID := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, device, "device", "vendor")) deviceID := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, device, "device", "device")) - firmwareVersion := readStringFromFile("/sys/class/infiniband/mlx5_0/fw_ver") - driverVersion := readStringFromFile("/sys/module/mlx5_core/version") + firmwareVersion := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, "mlx5_0", "fw_ver")) + driverVersion := readStringFromFile(sysFilePath("module/mlx5_core/version")) ch <- prometheus.MustNewConstMetric(c.infoDesc, prometheus.GaugeValue, 1.0, device, vendorID, deviceID, firmwareVersion, driverVersion) } diff --git a/go.mod b/go.mod index 82d7a0cba2..db59a5d293 100644 --- a/go.mod +++ b/go.mod @@ -50,7 +50,7 @@ require ( github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 // indirect github.com/vishvananda/netlink v1.1.0 // indirect - github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect + github.com/vishvananda/netns v0.0.4 // indirect github.com/xhit/go-str2duration/v2 v2.1.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect @@ -62,3 +62,5 @@ require ( google.golang.org/protobuf v1.35.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) + +replace github.com/Mellanox/rdmamap => github.com/yeahdongcn/rdmamap v0.0.0-20241110052645-2f11ac5dce50 diff --git a/go.sum b/go.sum index 5b4b0f1009..9cd8ce9f7f 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/Mellanox/rdmamap v1.1.0 h1:A/W1wAXw+6vm58f3VklrIylgV+eDJlPVIMaIKuxgUT4= -github.com/Mellanox/rdmamap v1.1.0/go.mod h1:fN+/V9lf10ABnDCwTaXRjeeWijLt2iVLETnK+sx/LY8= github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY= github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= @@ -100,10 +98,13 @@ github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOf github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0= github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= -github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= +github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= +github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= +github.com/yeahdongcn/rdmamap v0.0.0-20241110052645-2f11ac5dce50 h1:S5Kx7HYonzAhND9Pb7ZekrdQ/E1uc7m6ZdmVjy8pD3o= +github.com/yeahdongcn/rdmamap v0.0.0-20241110052645-2f11ac5dce50/go.mod h1:D3ffy5KqtmeWfuW0cX/GQW0J6S3k8aORk4bf9CBOhng= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= From 5346a18a048ef37643b66df446e73d75cbaed8af Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Tue, 19 Nov 2024 22:22:13 +0800 Subject: [PATCH 3/6] Add two missing metrics Signed-off-by: Xiaodong Ye --- collector/rdma_linux.go | 62 +++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/collector/rdma_linux.go b/collector/rdma_linux.go index ee16c89662..f7563ae389 100644 --- a/collector/rdma_linux.go +++ b/collector/rdma_linux.go @@ -40,36 +40,38 @@ var ( rdmaIncludedMetrics = kingpin.Flag("collector.rdma.metrics-include", "Regexp of rdma stats to include.").Default(".*").String() rdmaHwCounters = map[string]string{ - "roce_slow_restart_cnps": "RDMA RoCE slow restart CNPS", - "rp_cnp_ignored": "RDMA RP CNP ignored", - "roce_adp_retrans_to": "RDMA RoCE adaptive retransmission timeout", - "rx_icrc_encapsulated": "RDMA RX ICRC encapsulated", - "resp_local_length_error": "RDMA response local length error", - "np_ecn_marked_roce_packets": "RDMA NP ECN marked RoCE packets", - "roce_slow_restart_trans": "RDMA RoCE slow restart transactions", - "req_remote_invalid_request": "RDMA request remote invalid request", - "local_ack_timeout_err": "RDMA local ACK timeout error", - "lifespan": "RDMA lifespan", - "req_cqe_error": "RDMA request CQE error", - "rnr_nak_retry_err": "RDMA RNR NAK retry error", - "np_cnp_sent": "RDMA NP CNP sent", - "rx_dct_connect": "RDMA RX DCT connect", - "rp_cnp_handled": "RDMA RP CNP handled", - "implied_nak_seq_err": "RDMA implied NAK sequence error", - "roce_slow_restart": "RDMA RoCE slow restart", - "req_cqe_flush_error": "RDMA request CQE flush error", - "packet_seq_err": "RDMA packet sequence error", - "duplicate_request": "RDMA duplicate request", - "roce_adp_retrans": "RDMA RoCE adaptive retransmission", - "out_of_buffer": "RDMA out of buffer", - "resp_cqe_error": "RDMA response CQE error", - "resp_cqe_flush_error": "RDMA response CQE flush error", - "out_of_sequence": "RDMA out of sequence", - "rx_read_requests": "RDMA RX read requests", - "rx_atomic_requests": "RDMA RX atomic requests", - "req_remote_access_errors": "RDMA request remote access errors", - "rx_write_requests": "RDMA RX write requests", - "resp_remote_access_errors": "RDMA response remote access errors", + "roce_slow_restart_cnps": "RDMA RoCE slow restart CNPS", + "rp_cnp_ignored": "RDMA RP CNP ignored", + "roce_adp_retrans_to": "RDMA RoCE adaptive retransmission timeout", + "rx_icrc_encapsulated": "RDMA RX ICRC encapsulated", + "resp_local_length_error": "RDMA response local length error", + "np_ecn_marked_roce_packets": "RDMA NP ECN marked RoCE packets", + "roce_slow_restart_trans": "RDMA RoCE slow restart transactions", + "req_remote_invalid_request": "RDMA request remote invalid request", + "local_ack_timeout_err": "RDMA local ACK timeout error", + "lifespan": "RDMA lifespan", + "req_cqe_error": "RDMA request CQE error", + "rnr_nak_retry_err": "RDMA RNR NAK retry error", + "np_cnp_sent": "RDMA NP CNP sent", + "rx_dct_connect": "RDMA RX DCT connect", + "rp_cnp_handled": "RDMA RP CNP handled", + "implied_nak_seq_err": "RDMA implied NAK sequence error", + "roce_slow_restart": "RDMA RoCE slow restart", + "req_cqe_flush_error": "RDMA request CQE flush error", + "packet_seq_err": "RDMA packet sequence error", + "duplicate_request": "RDMA duplicate request", + "roce_adp_retrans": "RDMA RoCE adaptive retransmission", + "out_of_buffer": "RDMA out of buffer", + "resp_cqe_error": "RDMA response CQE error", + "resp_cqe_flush_error": "RDMA response CQE flush error", + "out_of_sequence": "RDMA out of sequence", + "rx_read_requests": "RDMA RX read requests", + "rx_atomic_requests": "RDMA RX atomic requests", + "req_remote_access_errors": "RDMA request remote access errors", + "rx_write_requests": "RDMA RX write requests", + "resp_remote_access_errors": "RDMA response remote access errors", + "req_transport_retries_exceeded": "RDMA request transport retries exceeded", + "req_rnr_retries_exceeded": "RDMA request RNR retries exceeded", } rdmaCounters = map[string]string{ "unicast_rcv_packets": "RDMA unicast received packets", From 699af30017930df9d6d7f215ab5f239de668294d Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Wed, 20 Nov 2024 08:48:47 +0800 Subject: [PATCH 4/6] Add a warning message if metric not found Signed-off-by: Xiaodong Ye --- collector/rdma_linux.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/collector/rdma_linux.go b/collector/rdma_linux.go index f7563ae389..76eee2d1ac 100644 --- a/collector/rdma_linux.go +++ b/collector/rdma_linux.go @@ -230,6 +230,11 @@ func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error { if !c.metricsPattern.MatchString(name) { return } + entry := c.entry(name) + if entry == nil { + c.logger.Warn("rdma metric not found", "name", name) + return + } ch <- prometheus.MustNewConstMetric(c.entry(name), prometheus.GaugeValue, value, labelValues...) } From 3be42e0bf442aa0c53d1ca9f507c8cc55bcdbfd3 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Fri, 13 Dec 2024 09:16:33 +0800 Subject: [PATCH 5/6] Update go module: rdmamap Signed-off-by: Xiaodong Ye --- go.mod | 4 +--- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index db59a5d293..b4a2494365 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/prometheus/node_exporter go 1.22.0 require ( - github.com/Mellanox/rdmamap v1.1.0 + github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57 github.com/alecthomas/kingpin/v2 v2.4.0 github.com/beevik/ntp v1.4.3 github.com/coreos/go-systemd/v22 v22.5.0 @@ -62,5 +62,3 @@ require ( google.golang.org/protobuf v1.35.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) - -replace github.com/Mellanox/rdmamap => github.com/yeahdongcn/rdmamap v0.0.0-20241110052645-2f11ac5dce50 diff --git a/go.sum b/go.sum index 9cd8ce9f7f..0bfbc6a9de 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57 h1:ffMnYJFt7Bgp/2s2fOsQ0LpKfuU4xCk4afAtQG1wuBM= +github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57/go.mod h1:D3ffy5KqtmeWfuW0cX/GQW0J6S3k8aORk4bf9CBOhng= github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY= github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= @@ -103,8 +105,6 @@ github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1Y github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= -github.com/yeahdongcn/rdmamap v0.0.0-20241110052645-2f11ac5dce50 h1:S5Kx7HYonzAhND9Pb7ZekrdQ/E1uc7m6ZdmVjy8pD3o= -github.com/yeahdongcn/rdmamap v0.0.0-20241110052645-2f11ac5dce50/go.mod h1:D3ffy5KqtmeWfuW0cX/GQW0J6S3k8aORk4bf9CBOhng= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= From deb325bf5e014c71484e01290d8b231592067e44 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Tue, 24 Dec 2024 10:18:25 +0800 Subject: [PATCH 6/6] Address review comments Signed-off-by: Xiaodong Ye --- collector/rdma_linux.go | 206 +++++++++++++++++++++++++--------------- 1 file changed, 128 insertions(+), 78 deletions(-) diff --git a/collector/rdma_linux.go b/collector/rdma_linux.go index 76eee2d1ac..d75ae455eb 100644 --- a/collector/rdma_linux.go +++ b/collector/rdma_linux.go @@ -39,62 +39,114 @@ var ( rdmaDeviceExclude = kingpin.Flag("collector.rdma.device-exclude", "Regexp of rdma devices to exclude (mutually exclusive to device-include).").String() rdmaIncludedMetrics = kingpin.Flag("collector.rdma.metrics-include", "Regexp of rdma stats to include.").Default(".*").String() - rdmaHwCounters = map[string]string{ - "roce_slow_restart_cnps": "RDMA RoCE slow restart CNPS", - "rp_cnp_ignored": "RDMA RP CNP ignored", - "roce_adp_retrans_to": "RDMA RoCE adaptive retransmission timeout", - "rx_icrc_encapsulated": "RDMA RX ICRC encapsulated", - "resp_local_length_error": "RDMA response local length error", - "np_ecn_marked_roce_packets": "RDMA NP ECN marked RoCE packets", - "roce_slow_restart_trans": "RDMA RoCE slow restart transactions", - "req_remote_invalid_request": "RDMA request remote invalid request", - "local_ack_timeout_err": "RDMA local ACK timeout error", - "lifespan": "RDMA lifespan", - "req_cqe_error": "RDMA request CQE error", - "rnr_nak_retry_err": "RDMA RNR NAK retry error", - "np_cnp_sent": "RDMA NP CNP sent", - "rx_dct_connect": "RDMA RX DCT connect", - "rp_cnp_handled": "RDMA RP CNP handled", - "implied_nak_seq_err": "RDMA implied NAK sequence error", - "roce_slow_restart": "RDMA RoCE slow restart", - "req_cqe_flush_error": "RDMA request CQE flush error", - "packet_seq_err": "RDMA packet sequence error", - "duplicate_request": "RDMA duplicate request", - "roce_adp_retrans": "RDMA RoCE adaptive retransmission", - "out_of_buffer": "RDMA out of buffer", - "resp_cqe_error": "RDMA response CQE error", - "resp_cqe_flush_error": "RDMA response CQE flush error", - "out_of_sequence": "RDMA out of sequence", - "rx_read_requests": "RDMA RX read requests", - "rx_atomic_requests": "RDMA RX atomic requests", - "req_remote_access_errors": "RDMA request remote access errors", - "rx_write_requests": "RDMA RX write requests", - "resp_remote_access_errors": "RDMA response remote access errors", - "req_transport_retries_exceeded": "RDMA request transport retries exceeded", - "req_rnr_retries_exceeded": "RDMA request RNR retries exceeded", + lookupTable = map[string]string{ + "port_rcv_data": "mlx5_port_rcv_data_total", + "port_rcv_packets": "mlx5_port_rcv_packets_total", + "port_multicast_rcv_packets": "mlx5_port_multicast_rcv_packets_total", + "port_unicast_rcv_packets": "mlx5_port_unicast_rcv_packets_total", + "port_xmit_data": "mlx5_port_xmit_data_total", + "port_xmit_packets": "mlx5_port_xmit_packets_total", + "port_rcv_switch_relay_errors": "mlx5_port_rcv_switch_relay_errors_total", + "port_rcv_errors": "mlx5_port_rcv_errors_total", + "port_rcv_constraint_errors": "mlx5_port_rcv_constraint_errors_total", + "local_link_integrity_errors": "mlx5_local_link_integrity_errors_total", + "port_xmit_wait": "mlx5_port_xmit_wait_total", + "port_multicast_xmit_packets": "mlx5_port_multicast_xmit_packets_total", + "port_unicast_xmit_packets": "mlx5_port_unicast_xmit_packets_total", + "port_xmit_discards": "mlx5_port_xmit_discards_total", + "port_xmit_constraint_errors": "mlx5_port_xmit_constraint_errors_total", + "port_rcv_remote_physical_errors": "mlx5_port_rcv_remote_physical_errors_total", + "symbol_error": "mlx5_symbol_error_total", + "VL15_dropped": "mlx5_vl15_dropped_total", + "link_error_recovery": "mlx5_link_error_recovery_total", + "link_downed": "mlx5_link_downed_total", + "duplicate_request": "mlx5_duplicate_request_total", + "implied_nak_seq_err": "mlx5_implied_nak_seq_err_total", + "lifespan": "mlx5_lifespan_ms", + "local_ack_timeout_err": "mlx5_local_ack_timeout_err_total", + "np_cnp_sent": "mlx5_np_cnp_sent_total", + "np_ecn_marked_roce_packets": "mlx5_np_ecn_marked_roce_packets_total", + "out_of_buffer": "mlx5_out_of_buffer_total", + "out_of_sequence": "mlx5_out_of_sequence_total", + "packet_seq_err": "mlx5_packet_seq_err_total", + "req_cqe_error": "mlx5_req_cqe_error_total", + "req_cqe_flush_error": "mlx5_req_cqe_flush_error_total", + "req_remote_access_errors": "mlx5_req_remote_access_errors_total", + "req_remote_invalid_request": "mlx5_req_remote_invalid_request_total", + "resp_cqe_error": "mlx5_resp_cqe_error_total", + "resp_cqe_flush_error": "mlx5_resp_cqe_flush_error_total", + "resp_local_length_error": "mlx5_resp_local_length_error_total", + "resp_remote_access_errors": "mlx5_resp_remote_access_errors_total", + "rnr_nak_retry_err": "mlx5_rnr_nak_retry_err_total", + "rp_cnp_handled": "mlx5_rp_cnp_handled_total", + "rp_cnp_ignored": "mlx5_rp_cnp_ignored_total", + "rx_atomic_requests": "mlx5_rx_atomic_requests_total", + "rx_dct_connect": "mlx5_rx_dct_connect_total", + "rx_read_requests": "mlx5_rx_read_requests_total", + "rx_write_requests": "mlx5_rx_write_requests_total", + "rx_icrc_encapsulated": "mlx5_rx_icrc_encapsulated_total", + "roce_adp_retrans": "mlx5_roce_adp_retrans_total", + "roce_adp_retrans_to": "mlx5_roce_adp_retrans_to_total", + "roce_slow_restart": "mlx5_roce_slow_restart_total", + "roce_slow_restart_cnps": "mlx5_roce_slow_restart_cnps_total", + "roce_slow_restart_trans": "mlx5_roce_slow_restart_trans_total", } - rdmaCounters = map[string]string{ - "unicast_rcv_packets": "RDMA unicast received packets", - "port_xmit_data": "RDMA port transmit data", - "port_xmit_constraint_errors": "RDMA port transmit constraint errors", - "VL15_dropped": "RDMA VL15 dropped", - "port_rcv_errors": "RDMA port receive errors", - "port_xmit_wait": "RDMA port transmit wait", - "link_error_recovery": "RDMA link error recovery", - "multicast_rcv_packets": "RDMA multicast received packets", - "multicast_xmit_packets": "RDMA multicast transmitted packets", - "port_rcv_remote_physical_errors": "RDMA port receive remote physical errors", - "port_rcv_packets": "RDMA port receive packets", - "unicast_xmit_packets": "RDMA unicast transmitted packets", - "excessive_buffer_overrun_errors": "RDMA excessive buffer overrun errors", - "port_rcv_data": "RDMA port receive data", - "port_rcv_constraint_errors": "RDMA port receive constraint errors", - "link_downed": "RDMA link downed", - "local_link_integrity_errors": "RDMA local link integrity errors", - "port_xmit_discards": "RDMA port transmit discards", - "port_rcv_switch_relay_errors": "RDMA port receive switch relay errors", - "port_xmit_packets": "RDMA port transmit packets", - "symbol_error": "RDMA symbol error", + + // https://enterprise-support.nvidia.com/s/article/understanding-mlx5-linux-counters-and-status-parameters + portCounters = map[string]string{ + "mlx5_port_rcv_data_total": "Total number of data octets received on all VLs from the port (divided by 4, counting in double words)", + "mlx5_port_rcv_packets_total": "Total number of received packets (may include packets with errors)", + "mlx5_port_multicast_rcv_packets_total": "Total number of multicast packets received (including those with errors)", + "mlx5_port_unicast_rcv_packets_total": "Total number of unicast packets received (including those with errors)", + "mlx5_port_xmit_data_total": "Total number of data octets transmitted on all VLs from the port (divided by 4, counting in double words)", + "mlx5_port_xmit_packets_total": "Total number of transmitted packets (may include packets with errors)", + "mlx5_port_rcv_switch_relay_errors_total": "Total number of packets discarded because they could not be forwarded by the switch relay", + "mlx5_port_rcv_errors_total": "Total number of received packets with errors", + "mlx5_port_rcv_constraint_errors_total": "Total number of packets discarded due to constraints on the switch physical port", + "mlx5_local_link_integrity_errors_total": "Total number of times local physical errors exceeded the threshold and caused a local link integrity failure", + "mlx5_port_xmit_wait_total": "Total number of ticks during which the port had data to transmit but no data was sent due to insufficient credits or lack of arbitration", + "mlx5_port_multicast_xmit_packets_total": "Total number of multicast packets transmitted (including those with errors)", + "mlx5_port_unicast_xmit_packets_total": "Total number of unicast packets transmitted (including those with errors)", + "mlx5_port_xmit_discards_total": "Total number of outbound packets discarded because the port is down or congested", + "mlx5_port_xmit_constraint_errors_total": "Total number of packets not transmitted due to constraints on the switch physical port", + "mlx5_port_rcv_remote_physical_errors_total": "Total number of packets marked with the EBP delimiter received on the port", + "mlx5_symbol_error_total": "Total number of minor link errors detected on one or more physical lanes", + "mlx5_vl15_dropped_total": "Total number of incoming VL15 packets dropped due to resource limitations (e.g., lack of buffers)", + "mlx5_link_error_recovery_total": "Total number of successful link error recovery processes completed by the Port Training state machine", + "mlx5_link_downed_total": "Total number of failed link error recovery processes that caused the link to be downed", + } + + hwCounters = map[string]string{ + "mlx5_duplicate_request_total": "Total number of received packets that were duplicates of previous requests", + "mlx5_implied_nak_seq_err_total": "Total number of times the requested ACK had a PSN larger than the expected PSN for an RDMA read or response", + "mlx5_lifespan_ms": "Maximum period in milliseconds which defines the aging of counter reads", + "mlx5_local_ack_timeout_err_total": "Total number of times the QP's ACK timer expired for RC, XRC, or DCT QPs at the sender side (retry limit not exceeded)", + "mlx5_np_cnp_sent_total": "Total number of CNP packets sent by the Notification Point due to congestion in the RoCEv2 IP header (ECN bits)", + "mlx5_np_ecn_marked_roce_packets_total": "Total number of RoCEv2 packets received marked with ECN (congestion experienced)", + "mlx5_out_of_buffer_total": "Total number of drops due to lack of WQE for the associated QPs", + "mlx5_out_of_sequence_total": "Total number of out-of-sequence packets received", + "mlx5_packet_seq_err_total": "Total number of received NAK sequence error packets (QP retry limit not exceeded)", + "mlx5_req_cqe_error_total": "Total number of times the requester detected CQEs completed with errors", + "mlx5_req_cqe_flush_error_total": "Total number of times the requester detected CQEs completed with flushed errors", + "mlx5_req_remote_access_errors_total": "Total number of times the requester detected remote access errors", + "mlx5_req_remote_invalid_request_total": "Total number of times the requester detected remote invalid request errors", + "mlx5_resp_cqe_error_total": "Total number of times the responder detected CQEs completed with errors", + "mlx5_resp_cqe_flush_error_total": "Total number of times the responder detected CQEs completed with flushed errors", + "mlx5_resp_local_length_error_total": "Total number of times the responder detected local length errors", + "mlx5_resp_remote_access_errors_total": "Total number of times the responder detected remote access errors", + "mlx5_rnr_nak_retry_err_total": "Total number of received RNR NAK packets (QP retry limit not exceeded)", + "mlx5_rp_cnp_handled_total": "Total number of CNP packets handled by the Reaction Point HCA to throttle transmission rate", + "mlx5_rp_cnp_ignored_total": "Total number of CNP packets ignored by the Reaction Point HCA", + "mlx5_rx_atomic_requests_total": "Total number of received ATOMIC requests for associated QPs", + "mlx5_rx_dct_connect_total": "Total number of received connection requests for associated DCTs", + "mlx5_rx_read_requests_total": "Total number of received READ requests for associated QPs", + "mlx5_rx_write_requests_total": "Total number of received WRITE requests for associated QPs", + "mlx5_rx_icrc_encapsulated_total": "Total number of RoCE packets with ICRC errors", + "mlx5_roce_adp_retrans_total": "Total number of adaptive retransmissions for RoCE traffic", + "mlx5_roce_adp_retrans_to_total": "Total number of times RoCE traffic reached timeout due to adaptive retransmission", + "mlx5_roce_slow_restart_total": "Total number of times RoCE slow restart was used", + "mlx5_roce_slow_restart_cnps_total": "Total number of times RoCE slow restart generated CNP packets", + "mlx5_roce_slow_restart_trans_total": "Total number of times RoCE slow restart changed state to slow restart", } ) @@ -138,20 +190,15 @@ func makeRdmaCollector(logger *slog.Logger) (*rdmaCollector, error) { *dir = rootfsFilePath(*dir) } - entries := make(map[string]*prometheus.Desc, len(rdmaHwCounters)+len(rdmaCounters)) - for metric, help := range rdmaHwCounters { - entries[metric] = prometheus.NewDesc( - buildRdmaFQName(fmt.Sprintf("hw_%s", metric)), - help, - []string{"device", "port", "interfaces"}, nil, - ) - } - for metric, help := range rdmaCounters { - entries[metric] = prometheus.NewDesc( - buildRdmaFQName(metric), - help, - []string{"device", "port", "interfaces"}, nil, - ) + entries := make(map[string]*prometheus.Desc, len(portCounters)+len(hwCounters)) + for _, counters := range []map[string]string{portCounters, hwCounters} { + for metric, help := range counters { + entries[metric] = prometheus.NewDesc( + buildRdmaFQName(metric), + help, + []string{"device", "port", "interfaces"}, nil, + ) + } } // Pre-populate some common rdma metrics. @@ -226,24 +273,27 @@ func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error { continue } - updateFunc := func(name string, value float64, labelValues ...string) { - if !c.metricsPattern.MatchString(name) { + updateFunc := func(key string, value float64, labelValues ...string) { + metric, ok := lookupTable[key] + if !ok { + c.logger.Warn("rdma metric not found in lookup table", "key", key) return } - entry := c.entry(name) + if !c.metricsPattern.MatchString(metric) { + c.logger.Debug("rdma metric excluded", "metric", metric) + return + } + entry := c.entry(metric) if entry == nil { - c.logger.Warn("rdma metric not found", "name", name) + c.logger.Warn("rdma metric not found", "metric", metric) return } - ch <- prometheus.MustNewConstMetric(c.entry(name), prometheus.GaugeValue, + ch <- prometheus.MustNewConstMetric(entry, prometheus.GaugeValue, value, labelValues...) } for _, portstats := range stats.PortStats { - for _, stat := range portstats.HwStats { - updateFunc(stat.Name, float64(stat.Value), device, fmt.Sprintf("%d", portstats.Port), interfaces) - } - for _, stat := range portstats.Stats { + for _, stat := range append(portstats.HwStats, portstats.Stats...) { updateFunc(stat.Name, float64(stat.Value), device, fmt.Sprintf("%d", portstats.Port), interfaces) } }