diff --git a/pkg/engines/k8s/labeler.go b/pkg/engines/k8s/labeler.go
index 056a41f..b8224b8 100644
--- a/pkg/engines/k8s/labeler.go
+++ b/pkg/engines/k8s/labeler.go
@@ -24,6 +24,22 @@ import (
 	"github.com/NVIDIA/topograph/pkg/topology"
 )
 
+// Node label keys, one per layer of the network hierarchy.
+const (
+	hierarchyLayerAccelerator = "network.topology.kubernetes.io/accelerator"
+	hierarchyLayerBlock       = "network.topology.kubernetes.io/block"
+	hierarchyLayerSpine       = "network.topology.kubernetes.io/spine"
+	hierarchyLayerDatacenter  = "network.topology.kubernetes.io/datacenter"
+)
+
+// switchNetworkHierarchy lists the switch-layer label keys ordered from the
+// switch closest to a compute node up to the topmost layer.
+var switchNetworkHierarchy = []string{hierarchyLayerBlock, hierarchyLayerSpine, hierarchyLayerDatacenter}
+
+// nodeLabelMap maps a node name to the labels (label name -> label value)
+// collected for that node.
+type nodeLabelMap map[string]map[string]string
+
 type Labeler interface {
 	AddNodeLabels(context.Context, string, map[string]string) error
 }
@@ -39,41 +55,70 @@ func NewTopologyLabeler() *topologyLabeler {
 }
 
+// ApplyNodeLabels gathers the labels of every node found under the block and
+// tree sub-topologies of v, then applies them with one AddNodeLabels call per
+// node. It is a no-op when v is nil or has no child vertices, and it stops at
+// the first error.
 func (l *topologyLabeler) ApplyNodeLabels(ctx context.Context, v *topology.Vertex, labeler Labeler) error {
-	if v == nil {
+	if v == nil || len(v.Vertices) == 0 {
 		return nil
 	}
-	levels := []string{}
-	if len(v.ID) != 0 {
-		levels = append(levels, v.ID)
+
+	nodeMap := make(nodeLabelMap)
+	// A nil entry is treated like a missing one so it is never dereferenced.
+	if blockRoot := v.Vertices[topology.TopologyBlock]; blockRoot != nil {
+		if err := l.getBlockNodeLabels(blockRoot, nodeMap); err != nil {
+			return err
+		}
 	}
 
-	return l.applyNodeLabels(ctx, v, labeler, levels)
+	if treeRoot := v.Vertices[topology.TopologyTree]; treeRoot != nil {
+		var layers []string
+		if len(treeRoot.ID) != 0 {
+			layers = append(layers, treeRoot.ID)
+		}
+		if err := l.getTreeNodeLabels(treeRoot, nodeMap, layers); err != nil {
+			return err
+		}
+	}
+
+	for nodeName, labels := range nodeMap {
+		if err := labeler.AddNodeLabels(ctx, nodeName, labels); err != nil {
+			return fmt.Errorf("adding labels to node %q: %w", nodeName, err)
+		}
+	}
+
+	return nil
 }
 
-func (l *topologyLabeler) applyNodeLabels(ctx context.Context, v *topology.Vertex, labeler Labeler, levels []string) error {
+// getTreeNodeLabels walks the tree below v and stores the switch-layer labels
+// of each compute node (a vertex without children) in nodeMap. layers holds
+// the vertex IDs on the path so far, v first, so for a compute node
+// layers[1:] lists its switches from the nearest one upwards.
+func (l *topologyLabeler) getTreeNodeLabels(v *topology.Vertex, nodeMap nodeLabelMap, layers []string) error {
 	if len(v.Vertices) == 0 { // compute node
-		if len(levels) != 0 {
-			if v.ID != levels[0] {
-				return fmt.Errorf("instance ID mismatch: expected %s, got %s", v.ID, levels[0])
+		if len(layers) != 0 {
+			if v.ID != layers[0] {
+				return fmt.Errorf("instance ID mismatch: expected %s, got %s", v.ID, layers[0])
 			}
-
-			labels := make(map[string]string)
-			for i, sw := range levels[1:] {
-				if len(sw) == 0 {
+			labels, ok := nodeMap[v.Name]
+			if !ok {
+				labels = make(map[string]string)
+				nodeMap[v.Name] = labels
+			}
+			for i, sw := range layers[1:] {
+				// Stop at the first unnamed switch or once every known
+				// hierarchy layer has been assigned.
+				if i >= len(switchNetworkHierarchy) || len(sw) == 0 {
 					break
 				}
-				labels[fmt.Sprintf("topology.kubernetes.io/network-level-%d", i+1)] = l.checkLabel(sw)
-			}
-
-			if err := labeler.AddNodeLabels(ctx, v.Name, labels); err != nil {
-				return err
+				labels[switchNetworkHierarchy[i]] = l.checkLabel(sw)
 			}
 		}
 		return nil
 	}
 
 	for _, w := range v.Vertices {
-		if err := l.applyNodeLabels(ctx, w, labeler, append([]string{w.ID}, levels...)); err != nil {
+		if err := l.getTreeNodeLabels(w, nodeMap, append([]string{w.ID}, layers...)); err != nil {
 			return err
 		}
 	}
@@ -81,6 +126,28 @@ func (l *topologyLabeler) applyNodeLabels(ctx context.Context, v *topology.Verte
 	return nil
 }
 
+// getBlockNodeLabels records the accelerator label of every node listed under
+// the blocks of v in nodeMap, using the block ID as the label value. It returns
+// an error if a node already carries an accelerator label.
+func (l *topologyLabeler) getBlockNodeLabels(v *topology.Vertex, nodeMap nodeLabelMap) error {
+	for _, block := range v.Vertices {
+		// The label value depends only on the block, so compute it once.
+		accelerator := l.checkLabel(block.ID)
+		for _, node := range block.Vertices {
+			labels, ok := nodeMap[node.Name]
+			if !ok {
+				labels = make(map[string]string)
+				nodeMap[node.Name] = labels
+			}
+			if val, ok := labels[hierarchyLayerAccelerator]; ok {
+				return fmt.Errorf("multiple accelerator labels %s, %s for node %s", val, block.ID, node.Name)
+			}
+			labels[hierarchyLayerAccelerator] = accelerator
+		}
+	}
+	return nil
+}
+
 // checkLabel checks the length of the label value.
 // If more than 63 characters (Kubernetes limit), it will replace it with hash
 func (l *topologyLabeler) checkLabel(val string) string {
diff --git a/pkg/engines/k8s/labeler_test.go b/pkg/engines/k8s/labeler_test.go
index ceb563f..9a9685c 100644
--- a/pkg/engines/k8s/labeler_test.go
+++ b/pkg/engines/k8s/labeler_test.go
@@ -38,16 +38,99 @@ func (l *testLabeler) AddNodeLabels(_ context.Context, nodeName string, labels m
 	return nil
 }
 
-func TestApplyNodeLabels(t *testing.T) {
+func TestApplyNodeLabelsWithTree(t *testing.T) {
 	root, _ := translate.GetTreeTestSet(true)
 	labeler := &testLabeler{data: make(map[string]map[string]string)}
 	data := map[string]map[string]string{
-		"Node201": {"topology.kubernetes.io/network-level-1": "S2", "topology.kubernetes.io/network-level-2": "S1"},
-		"Node202": {"topology.kubernetes.io/network-level-1": "S2", "topology.kubernetes.io/network-level-2": "S1"},
-		"Node205": {"topology.kubernetes.io/network-level-1": "S2", "topology.kubernetes.io/network-level-2": "S1"},
-		"Node304": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
-		"Node305": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
-		"Node306": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
+		"Node201": {"network.topology.kubernetes.io/block": "S2", "network.topology.kubernetes.io/spine": "S1"},
+		"Node202": {"network.topology.kubernetes.io/block": "S2", "network.topology.kubernetes.io/spine": "S1"},
+		"Node205": {"network.topology.kubernetes.io/block": "S2", "network.topology.kubernetes.io/spine": "S1"},
+		"Node304": {"network.topology.kubernetes.io/block": "xf946c4acef2d5939", "network.topology.kubernetes.io/spine": "S1"},
+		"Node305": {"network.topology.kubernetes.io/block": "xf946c4acef2d5939", "network.topology.kubernetes.io/spine": "S1"},
+		"Node306": {"network.topology.kubernetes.io/block": "xf946c4acef2d5939", "network.topology.kubernetes.io/spine": "S1"},
+	}
+
+	err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root, labeler)
+	require.NoError(t, err)
+	require.Equal(t, data, labeler.data)
+}
+
+func TestApplyNodeLabelsWithBlock(t *testing.T) {
+	root, _ := translate.GetBlockWithMultiIBTestSet()
+	labeler := &testLabeler{data: make(map[string]map[string]string)}
+	data := map[string]map[string]string{
+		"Node104": {
+			"network.topology.kubernetes.io/accelerator": "B1",
+			"network.topology.kubernetes.io/block":       "S2",
+			"network.topology.kubernetes.io/spine":       "S1",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot2",
+		},
+		"Node105": {
+			"network.topology.kubernetes.io/accelerator": "B1",
+			"network.topology.kubernetes.io/block":       "S2",
+			"network.topology.kubernetes.io/spine":       "S1",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot2",
+		},
+		"Node106": {
+			"network.topology.kubernetes.io/accelerator": "B1",
+			"network.topology.kubernetes.io/block":       "S2",
+			"network.topology.kubernetes.io/spine":       "S1",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot2",
+		},
+		"Node201": {
+			"network.topology.kubernetes.io/accelerator": "B2",
+			"network.topology.kubernetes.io/block":       "S3",
+			"network.topology.kubernetes.io/spine":       "S1",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot2",
+		},
+		"Node202": {
+			"network.topology.kubernetes.io/accelerator": "B2",
+			"network.topology.kubernetes.io/block":       "S3",
+			"network.topology.kubernetes.io/spine":       "S1",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot2",
+		},
+		"Node205": {
+			"network.topology.kubernetes.io/accelerator": "B2",
+			"network.topology.kubernetes.io/block":       "S3",
+			"network.topology.kubernetes.io/spine":       "S1",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot2",
+		},
+		"Node301": {
+			"network.topology.kubernetes.io/accelerator": "B3",
+			"network.topology.kubernetes.io/block":       "S5",
+			"network.topology.kubernetes.io/spine":       "S4",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot1",
+		},
+		"Node302": {
+			"network.topology.kubernetes.io/accelerator": "B3",
+			"network.topology.kubernetes.io/block":       "S5",
+			"network.topology.kubernetes.io/spine":       "S4",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot1",
+		},
+		"Node303": {
+			"network.topology.kubernetes.io/accelerator": "B3",
+			"network.topology.kubernetes.io/block":       "S5",
+			"network.topology.kubernetes.io/spine":       "S4",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot1",
+		},
+		"Node401": {
+			"network.topology.kubernetes.io/accelerator": "B4",
+			"network.topology.kubernetes.io/block":       "S6",
+			"network.topology.kubernetes.io/spine":       "S4",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot1",
+		},
+		"Node402": {
+			"network.topology.kubernetes.io/accelerator": "B4",
+			"network.topology.kubernetes.io/block":       "S6",
+			"network.topology.kubernetes.io/spine":       "S4",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot1",
+		},
+		"Node403": {
+			"network.topology.kubernetes.io/accelerator": "B4",
+			"network.topology.kubernetes.io/block":       "S6",
+			"network.topology.kubernetes.io/spine":       "S4",
+			"network.topology.kubernetes.io/datacenter":  "ibRoot1",
+		},
 	}
 
 	err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root, labeler)
diff --git a/pkg/translate/output.go b/pkg/translate/output.go
index 4714744..84b7fed 100644
--- a/pkg/translate/output.go
+++ b/pkg/translate/output.go
@@ -351,6 +351,17 @@ func split(input string) (string, string) {
 }
 
 func GetTreeTestSet(testForLongLabelName bool) (*topology.Vertex, map[string]string) {
+	//
+	//          S1
+	//        /    \
+	//      S2      S3
+	//      |        |
+	//   -------  -------
+	//   Node201  Node304
+	//   Node202  Node305
+	//   Node205  Node306
+	//   -------  -------
+	//
 	var s3name string
 	if testForLongLabelName {
 		s3name = "S3very-very-long-id-to-check-label-value-limits-of-63-characters"
@@ -394,6 +405,19 @@ func GetTreeTestSet(testForLongLabelName bool) (*topology.Vertex, map[string]str
 }
 
 func GetBlockWithMultiIBTestSet() (*topology.Vertex, map[string]string) {
+	//
+	//        ibRoot2                ibRoot1
+	//           |                      |
+	//          S1                     S4
+	//        /    \                 /    \
+	//      S2      S3             S5      S6
+	//      |        |             |        |
+	//     ---      ---           ---      ---
+	//     I14\     I21\          I31\     I41\
+	//     I15-B1   I22-B2        I32-B3   I42-B4
+	//     I16/     I25/          I33/     I43/
+	//     ---      ---           ---      ---
+	//
 	instance2node := map[string]string{
 		"I14": "Node104", "I15": "Node105", "I16": "Node106",
 		"I21": "Node201", "I22": "Node202", "I25": "Node205",