Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix tree topology output #47

Merged
merged 2 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions pkg/ib/ib.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ func GenerateTopologyConfig(data []byte) (*topology.Vertex, error) {
if err != nil {
return nil, fmt.Errorf("unable to parse ibnetdiscover file: %v", err)
}

root, err := buildTree(switches, hca)
if err != nil {
return nil, fmt.Errorf("unable to build tree: %v", err)
Expand All @@ -68,11 +67,7 @@ func GenerateTopologyConfig(data []byte) (*topology.Vertex, error) {
if err != nil {
return nil, err
}
rootNode := &topology.Vertex{
Vertices: make(map[string]*topology.Vertex),
}
rootNode.Vertices[topology.TopologyTree] = treeNode
return rootNode, nil
return treeNode, nil
}

func (sw *Switch) toGraph() (*topology.Vertex, error) {
Expand Down
64 changes: 47 additions & 17 deletions pkg/providers/baremetal/mnnvl.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ func getIbTree(ctx context.Context, _ []string) (*topology.Vertex, error) {
return nil, fmt.Errorf("exec error in sinfo: %v", err)
}

// scan each line containing slurm partition and the nodes in it
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
nodeLine := scanner.Text()
Expand All @@ -63,16 +64,16 @@ func getIbTree(ctx context.Context, _ []string) (*topology.Vertex, error) {
continue
}
partitionName := strings.TrimSpace(arr[0])
state := strings.TrimSpace(arr[4])
nodeList := strings.TrimSpace(arr[5])
if strings.HasPrefix(state, "down") || strings.HasSuffix(state, "*") {
continue
nodesArr, err := deCompressNodeNames(nodeList)
if err != nil {
return nil, fmt.Errorf("deCompressNodeNames failed : %v", err)
}
nodesArr := deCompressNodeNames(nodeList)
// map of slurm partition name -> node names
partitionNodeMap[partitionName] = append(partitionNodeMap[partitionName], nodesArr...)
}

for pName, nodes := range partitionNodeMap {
// for each partition in slurm, find the IB tree it belongs to
if _, exists := partitionVisitedMap[pName]; !exists {
for _, node := range nodes {
if _, exists := nodeVisited[node]; !exists {
Expand All @@ -95,68 +96,97 @@ func getIbTree(ctx context.Context, _ []string) (*topology.Vertex, error) {
ibKey := ibPrefix + strconv.Itoa(ibCount)
treeRoot.Vertices[ibKey] = ibRoot
break
} else {
fmt.Printf("Missing ibnetdiscover output\n")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we use a logger here instead of fmt.Printf?
Do we need to return an error?
Do we need a metric for this error?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure about this. It is not really an error scenario if IB is not present. This text is supposed to be informational. Should we remove it instead?

}
} else {
partitionVisitedMap[pName] = true
}
}
}
}

return treeRoot, nil
}

// deCompressNodeNames expands a compressed, comma-separated node list into
// the individual node names it denotes.
//
// Supported entry forms (they may be mixed in one list):
//
//	nodename-2-89                        -> a plain node name, returned as is
//	nodename-1-[007]                     -> prefix + single suffix
//	nodename-1-[001-004]                 -> prefix + zero-padded numeric range
//	nodename-1-[001-004,007,91-99,100]   -> prefix + several suffixes/ranges
//
// Numeric ranges are inclusive and are padded with leading zeros to the width
// of the range's start value, so "[001-003]" yields 001, 002, 003.
//
// Empty entries (for example from an empty nodeList or a trailing comma) are
// skipped. An error is returned when a range bound is not a valid integer or
// when a range's start is greater than its end.
func deCompressNodeNames(nodeList string) ([]string, error) {
	var nodeArr []string

	// prefix is the text preceding "[" and is only meaningful while inBracket
	// is true. A separate flag is used (rather than testing prefix != "") so
	// that a bracket group with an empty prefix is still tracked correctly.
	prefix := ""
	inBracket := false

	// The list is split on every comma, so a bracket group such as
	// "n[1-2,5]" arrives as the pieces "n[1-2" and "5]".
	for _, entry := range strings.Split(nodeList, ",") {
		if entry == "" {
			continue
		}

		item := entry
		if !inBracket {
			idx := strings.Index(entry, "[")
			if idx < 0 {
				// example: nodename-2-89 (a whole node name; any "-" in it
				// is part of the name, not a range)
				nodeArr = append(nodeArr, entry)
				continue
			}
			// example: nodename-1-[001-004
			prefix = entry[:idx]
			item = entry[idx+1:]
			inBracket = true
		}

		// example: 100] or 100-102] or, for a single-entry group, 001-004]
		closing := strings.HasSuffix(item, "]")
		if closing {
			item = strings.TrimSuffix(item, "]")
		}

		names, err := expandNodeRange(prefix, item)
		if err != nil {
			return nil, err
		}
		nodeArr = append(nodeArr, names...)

		if closing {
			// The bracket group is finished; following entries are whole
			// node names again until another "[" is seen.
			prefix = ""
			inBracket = false
		}
	}

	return nodeArr, nil
}

// expandNodeRange turns one bracket item into node names. An item without "-"
// yields the single name prefix+item; an item of the form "start-end" yields
// one name per integer in the inclusive range, zero-padded to len(start).
func expandNodeRange(prefix, item string) ([]string, error) {
	// SplitN with a limit of 2 keeps anything after the first "-" in the end
	// bound, so malformed items such as "1-2-3" fail in Atoi below instead of
	// being silently truncated.
	bounds := strings.SplitN(item, "-", 2)
	if len(bounds) == 1 {
		return []string{prefix + item}, nil
	}

	start, err := strconv.Atoi(bounds[0])
	if err != nil {
		return nil, fmt.Errorf("Atoi err for range start: %w", err)
	}
	end, err := strconv.Atoi(bounds[1])
	if err != nil {
		return nil, fmt.Errorf("Atoi err for range end: %w", err)
	}
	if start > end {
		return nil, fmt.Errorf("invalid node range %q: start is greater than end", item)
	}

	// Preserve the zero padding of the start bound: "001-004" -> 001..004.
	width := len(bounds[0])
	var names []string
	for i := start; i <= end; i++ {
		names = append(names, prefix+fmt.Sprintf("%0*d", width, i))
	}
	return names, nil
}

// getClusterOutput reads output from nodeInfo and populates the structs
Expand Down
Loading