Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement block topology for K8s #43

Merged
merged 1 commit into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 69 additions & 18 deletions pkg/engines/k8s/labeler.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,18 @@ import (
"github.com/NVIDIA/topograph/pkg/topology"
)

// Node label keys, one per layer of the network hierarchy.
const (
// hierarchyLayerAccelerator is set by getBlockNodeLabels to the (length-checked)
// ID of the block that contains the node.
hierarchyLayerAccelerator = "network.topology.kubernetes.io/accelerator"
// The remaining keys are assigned to a node's ancestor switches in the tree
// topology, in the order given by switchNetworkHierarchy.
hierarchyLayerBlock = "network.topology.kubernetes.io/block"
hierarchyLayerSpine = "network.topology.kubernetes.io/spine"
hierarchyLayerDatacenter = "network.topology.kubernetes.io/datacenter"
)

// switchNetworkHierarchy lists the label keys used for a compute node's ancestor
// switches, ordered from the switch closest to the node outward. getTreeNodeLabels
// indexes it by ancestor depth; ancestors beyond the last entry get no label.
var switchNetworkHierarchy = []string{hierarchyLayerBlock, hierarchyLayerSpine, hierarchyLayerDatacenter}

// nodeLabelMap collects the labels to apply to each node before they are handed
// to a Labeler: node name -> (label key -> label value).
type nodeLabelMap map[string]map[string]string

// Labeler receives the labels computed for each node.
type Labeler interface {
// AddNodeLabels is invoked by ApplyNodeLabels once per node, with the node
// name and the map of label keys to label values collected for that node.
// A non-nil error aborts ApplyNodeLabels and is returned to its caller.
AddNodeLabels(context.Context, string, map[string]string) error
}
Expand All @@ -39,48 +51,87 @@ func NewTopologyLabeler() *topologyLabeler {
}

func (l *topologyLabeler) ApplyNodeLabels(ctx context.Context, v *topology.Vertex, labeler Labeler) error {
if v == nil {
if v == nil || len(v.Vertices) == 0 {
return nil
}
levels := []string{}
if len(v.ID) != 0 {
levels = append(levels, v.ID)

nodeMap := make(nodeLabelMap)
if blockRoot, ok := v.Vertices[topology.TopologyBlock]; ok {
if err := l.getBlockNodeLabels(blockRoot, nodeMap); err != nil {
return err
}
}

return l.applyNodeLabels(ctx, v, labeler, levels)
if treeRoot, ok := v.Vertices[topology.TopologyTree]; ok {
layers := []string{}
if len(treeRoot.ID) != 0 {
layers = append(layers, treeRoot.ID)
}
if err := l.getTreeNodeLabels(treeRoot, nodeMap, layers); err != nil {
return err
}
}

for nodeName, labels := range nodeMap {
if err := labeler.AddNodeLabels(ctx, nodeName, labels); err != nil {
return err
}
}

return nil
}

func (l *topologyLabeler) applyNodeLabels(ctx context.Context, v *topology.Vertex, labeler Labeler, levels []string) error {
func (l *topologyLabeler) getTreeNodeLabels(v *topology.Vertex, nodeMap nodeLabelMap, layers []string) error {
if len(v.Vertices) == 0 { // compute node
if len(levels) != 0 {
if v.ID != levels[0] {
return fmt.Errorf("instance ID mismatch: expected %s, got %s", v.ID, levels[0])
if len(layers) != 0 {
if v.ID != layers[0] {
return fmt.Errorf("instance ID mismatch: expected %s, got %s", v.ID, layers[0])
}

labels := make(map[string]string)
for i, sw := range levels[1:] {
nodeName := v.Name
labels, ok := nodeMap[nodeName]
if !ok {
labels = make(map[string]string)
nodeMap[nodeName] = labels
}
for i, sw := range layers[1:] {
if len(sw) == 0 {
break
}
labels[fmt.Sprintf("topology.kubernetes.io/network-level-%d", i+1)] = l.checkLabel(sw)
}

if err := labeler.AddNodeLabels(ctx, v.Name, labels); err != nil {
return err
if i < len(switchNetworkHierarchy) {
labels[(switchNetworkHierarchy[i])] = l.checkLabel(sw)
}
}
}
return nil
}

for _, w := range v.Vertices {
if err := l.applyNodeLabels(ctx, w, labeler, append([]string{w.ID}, levels...)); err != nil {
if err := l.getTreeNodeLabels(w, nodeMap, append([]string{w.ID}, layers...)); err != nil {
return err
}
}

return nil
}

// getBlockNodeLabels fills nodeMap with the accelerator label of every node
// that appears under the block root v.
//
// The children of v are treated as blocks and the children of each block as
// nodes. For each node, the hierarchyLayerAccelerator label is set to the
// block ID after it has been passed through checkLabel. An entry for the node
// is created in nodeMap on first sight; labels already present are kept.
//
// An error is returned as soon as a node that already carries an accelerator
// label is encountered, i.e. when the same node name shows up in more than
// one block (or nodeMap was pre-populated with that label).
func (l *topologyLabeler) getBlockNodeLabels(v *topology.Vertex, nodeMap nodeLabelMap) error {
	for _, blk := range v.Vertices {
		for _, member := range blk.Vertices {
			name := member.Name

			// Lazily create the per-node label set so that labels from other
			// topologies can be merged into the same entry.
			if _, exists := nodeMap[name]; !exists {
				nodeMap[name] = make(map[string]string)
			}
			nodeLabels := nodeMap[name]

			// A node may belong to one block only.
			if prev, dup := nodeLabels[hierarchyLayerAccelerator]; dup {
				return fmt.Errorf("multiple accelerator labels %s, %s for node %s", prev, blk.ID, name)
			}

			nodeLabels[hierarchyLayerAccelerator] = l.checkLabel(blk.ID)
		}
	}

	return nil
}

// checkLabel checks the length of the label value.
// If more than 63 characters (Kubernetes limit), it will replace it with hash
func (l *topologyLabeler) checkLabel(val string) string {
Expand Down
97 changes: 90 additions & 7 deletions pkg/engines/k8s/labeler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,99 @@ func (l *testLabeler) AddNodeLabels(_ context.Context, nodeName string, labels m
return nil
}

func TestApplyNodeLabels(t *testing.T) {
func TestApplyNodeLabelsWithTree(t *testing.T) {
root, _ := translate.GetTreeTestSet(true)
labeler := &testLabeler{data: make(map[string]map[string]string)}
data := map[string]map[string]string{
"Node201": {"topology.kubernetes.io/network-level-1": "S2", "topology.kubernetes.io/network-level-2": "S1"},
"Node202": {"topology.kubernetes.io/network-level-1": "S2", "topology.kubernetes.io/network-level-2": "S1"},
"Node205": {"topology.kubernetes.io/network-level-1": "S2", "topology.kubernetes.io/network-level-2": "S1"},
"Node304": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
"Node305": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
"Node306": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
"Node201": {"network.topology.kubernetes.io/block": "S2", "network.topology.kubernetes.io/spine": "S1"},
"Node202": {"network.topology.kubernetes.io/block": "S2", "network.topology.kubernetes.io/spine": "S1"},
"Node205": {"network.topology.kubernetes.io/block": "S2", "network.topology.kubernetes.io/spine": "S1"},
"Node304": {"network.topology.kubernetes.io/block": "xf946c4acef2d5939", "network.topology.kubernetes.io/spine": "S1"},
"Node305": {"network.topology.kubernetes.io/block": "xf946c4acef2d5939", "network.topology.kubernetes.io/spine": "S1"},
"Node306": {"network.topology.kubernetes.io/block": "xf946c4acef2d5939", "network.topology.kubernetes.io/spine": "S1"},
}

err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root, labeler)
require.NoError(t, err)
require.Equal(t, data, labeler.data)
}

func TestApplyNodeLabelsWithBlock(t *testing.T) {
root, _ := translate.GetBlockWithMultiIBTestSet()
labeler := &testLabeler{data: make(map[string]map[string]string)}
data := map[string]map[string]string{
"Node104": {
"network.topology.kubernetes.io/accelerator": "B1",
"network.topology.kubernetes.io/block": "S2",
"network.topology.kubernetes.io/spine": "S1",
"network.topology.kubernetes.io/datacenter": "ibRoot2",
},
"Node105": {
"network.topology.kubernetes.io/accelerator": "B1",
"network.topology.kubernetes.io/block": "S2",
"network.topology.kubernetes.io/spine": "S1",
"network.topology.kubernetes.io/datacenter": "ibRoot2",
},
"Node106": {
"network.topology.kubernetes.io/accelerator": "B1",
"network.topology.kubernetes.io/block": "S2",
"network.topology.kubernetes.io/spine": "S1",
"network.topology.kubernetes.io/datacenter": "ibRoot2",
},
"Node201": {
"network.topology.kubernetes.io/accelerator": "B2",
"network.topology.kubernetes.io/block": "S3",
"network.topology.kubernetes.io/spine": "S1",
"network.topology.kubernetes.io/datacenter": "ibRoot2",
},
"Node202": {
"network.topology.kubernetes.io/accelerator": "B2",
"network.topology.kubernetes.io/block": "S3",
"network.topology.kubernetes.io/spine": "S1",
"network.topology.kubernetes.io/datacenter": "ibRoot2",
},
"Node205": {
"network.topology.kubernetes.io/accelerator": "B2",
"network.topology.kubernetes.io/block": "S3",
"network.topology.kubernetes.io/spine": "S1",
"network.topology.kubernetes.io/datacenter": "ibRoot2",
},
"Node301": {
"network.topology.kubernetes.io/accelerator": "B3",
"network.topology.kubernetes.io/block": "S5",
"network.topology.kubernetes.io/spine": "S4",
"network.topology.kubernetes.io/datacenter": "ibRoot1",
},
"Node302": {
"network.topology.kubernetes.io/accelerator": "B3",
"network.topology.kubernetes.io/block": "S5",
"network.topology.kubernetes.io/spine": "S4",
"network.topology.kubernetes.io/datacenter": "ibRoot1",
},
"Node303": {
"network.topology.kubernetes.io/accelerator": "B3",
"network.topology.kubernetes.io/block": "S5",
"network.topology.kubernetes.io/spine": "S4",
"network.topology.kubernetes.io/datacenter": "ibRoot1",
},
"Node401": {
"network.topology.kubernetes.io/accelerator": "B4",
"network.topology.kubernetes.io/block": "S6",
"network.topology.kubernetes.io/spine": "S4",
"network.topology.kubernetes.io/datacenter": "ibRoot1",
},
"Node402": {
"network.topology.kubernetes.io/accelerator": "B4",
"network.topology.kubernetes.io/block": "S6",
"network.topology.kubernetes.io/spine": "S4",
"network.topology.kubernetes.io/datacenter": "ibRoot1",
},
"Node403": {
"network.topology.kubernetes.io/accelerator": "B4",
"network.topology.kubernetes.io/block": "S6",
"network.topology.kubernetes.io/spine": "S4",
"network.topology.kubernetes.io/datacenter": "ibRoot1",
},
}

err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root, labeler)
Expand Down
24 changes: 24 additions & 0 deletions pkg/translate/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,17 @@ func split(input string) (string, string) {
}

func GetTreeTestSet(testForLongLabelName bool) (*topology.Vertex, map[string]string) {
//
// S1
// / \
// S2 S3
// | |
// --- ---
// I14 I21
// I15 I22
// I16 I25
// --- ---
//
var s3name string
if testForLongLabelName {
s3name = "S3very-very-long-id-to-check-label-value-limits-of-63-characters"
Expand Down Expand Up @@ -394,6 +405,19 @@ func GetTreeTestSet(testForLongLabelName bool) (*topology.Vertex, map[string]str
}

func GetBlockWithMultiIBTestSet() (*topology.Vertex, map[string]string) {
//
// ibRoot2 ibRoot1
// | |
// S1 S4
// / \ / \
// S2 S3 S5 S6
// | | | |
// --- --- --- ---
// I14\ I21\ I31\ I41\
// I15-B1 I22-B2 I32-B3 I42-B4
// I16/ I25/ I33/ I43/
// --- --- --- ---
//
instance2node := map[string]string{
"I14": "Node104", "I15": "Node105", "I16": "Node106",
"I21": "Node201", "I22": "Node202", "I25": "Node205",
Expand Down
Loading