Skip to content

Commit b4bfa6c

Browse files
authored
Merge pull request #112 from openebs/disbale-rdma-device
fix: disable rdma device and restart csi node pod
2 parents d7e9ce9 + 7a4e8d1 commit b4bfa6c

File tree

4 files changed

+134
-13
lines changed

4 files changed

+134
-13
lines changed

common/e2e_config/e2e_config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ type ProductSpec struct {
4242
CpuCount string `yaml:"cpuCount" env-default:"2"`
4343
CrdGroupName string `yaml:"crdGroupName" env-default:"openebs.io"`
4444
CrdPoolsResourceName string `yaml:"crdPoolsResourceName" env-default:"mayastorpools"`
45-
CsiDaemonsetName string `yaml:"csiDaemonsetName" env-default:"mayastor-csi"`
45+
CsiDaemonsetName string `yaml:"csiDaemonsetName" env-default:"mayastor-csi-node"`
4646
CsiNodeServiceAppLabel string `yaml:"csiNodeServiceAppLabel" env-default:"csi-node"`
4747
CsiNodeServiceDaemonset string `yaml:"csiNodeServiceDaemonset" env-default:"mayastor-csi-node"`
4848
CsiNodeContainerName string `yaml:"csiNodeContainerName" env-default:"csi-node"`
@@ -241,7 +241,8 @@ type E2EConfig struct {
241241

242242
IOEngineNvmeTimeout int `yaml:"ioEngineNvmeTimeout" env-default:"0"`
243243

244-
// Individual Test parameters
244+
// DisabledRdmaDevices lists the network interfaces whose rdma devices must be disabled
245+
DisabledRdmaDevices []string `yaml:"disabledRdmaDevices"`
245246
}
246247

247248
var once sync.Once

common/k8stest/util_rdma.go

+62-10
Original file line numberDiff line numberDiff line change
@@ -181,12 +181,11 @@ func RemoveRdmaDeviceOnNode(node string) error {
181181
return nil
182182
}
183183

184-
func DisableRdmaOnNode(node string) error {
184+
func DisableRdmaOnNode(node string, networkInterface string) error {
185185
logf.Log.Info("Disable rdma from IO engine node", "name", node)
186186

187-
iface := e2e_config.GetConfig().NetworkInterface
188187
// get the dev link name for the given interface
189-
rdmaDevName, err := GetDevLinkName(node, iface)
188+
rdmaDevName, err := GetDevLinkName(node, networkInterface)
190189
if err != nil {
191190
return err
192191
}
@@ -212,7 +211,9 @@ func DisableRdmaOnNode(node string) error {
212211
}
213212
}
214213

215-
return nil
214+
// Restart csi node pod on the node
215+
return RestartCsiNodePodOnNode(node, 240, 120)
216+
216217
}
217218

218219
func RemoveRdmaDeviceOnAllWorkerNodes() error {
@@ -223,7 +224,7 @@ func RemoveRdmaDeviceOnAllWorkerNodes() error {
223224
logf.Log.Info("Remove rdma from IO engine node")
224225
for _, node := range workerNodes.Items {
225226
logf.Log.Info("IO engine", "Node", node.Name)
226-
err := DisableRdmaOnNode(node.Name)
227+
err := DisableRdmaOnNode(node.Name, e2e_config.GetConfig().NetworkInterface)
227228
if err != nil {
228229
return err
229230
}
@@ -239,20 +240,19 @@ func EnableRdmaDeviceOnAllWorkerNodes() error {
239240
logf.Log.Info("Enable rdma from IO engine node")
240241
for _, node := range workerNodes.Items {
241242
logf.Log.Info("IO engine", "Node", node.Name)
242-
err := EnableRdmaOnNode(node.Name)
243+
err := EnableRdmaOnNode(node.Name, e2e_config.GetConfig().NetworkInterface)
243244
if err != nil {
244245
return err
245246
}
246247
}
247248
return nil
248249
}
249250

250-
func EnableRdmaOnNode(node string) error {
251+
func EnableRdmaOnNode(node string, networkInterface string) error {
251252
logf.Log.Info("Enable rdma on IO engine node", "name", node)
252-
// get interface name
253-
iface := e2e_config.GetConfig().NetworkInterface
253+
254254
// get the dev link name for the given interface
255-
rdmaDevPortName, err := GetDevLinkName(node, iface)
255+
rdmaDevPortName, err := GetDevLinkName(node, networkInterface)
256256
if err != nil {
257257
return err
258258
}
@@ -317,3 +317,55 @@ func GetDevLinkName(node, iface string) (string, error) {
317317
}
318318
return "", nil
319319
}
320+
321+
// DisableConfiguredDisabledRdmaDevicesOnAllMayastorNodes disable rdma devices which are configured in platform config
322+
// on all mayastor nodes. If no rdma devices are configured then it will return without doing anything.
323+
// In some cases, multiple rdma devices are configured in cluster, so it should be disabled all the devices on nodes
324+
func DisableConfiguredDisabledRdmaDevicesOnAllMayastorNodes() error {
325+
ifaceList := e2e_config.GetConfig().DisabledRdmaDevices
326+
if len(ifaceList) == 0 {
327+
logf.Log.Info("No rdma devices configured which needs to be disabled")
328+
return nil
329+
}
330+
workerNodes, err := ListIOEngineNodes()
331+
if err != nil {
332+
return err
333+
}
334+
for _, node := range workerNodes.Items {
335+
logf.Log.Info("Disable rdma device", "Node", node.Name)
336+
for _, iface := range ifaceList {
337+
err := DisableRdmaOnNode(node.Name, iface)
338+
if err != nil {
339+
return err
340+
}
341+
}
342+
343+
}
344+
return nil
345+
}
346+
347+
// RestoreConfiguredDisabledRdmaDevicesOnAllMayastorNodes enable rdma devices which are configured in platform config
348+
// on all mayastor nodes. If no rdma devices are configured then it will return without doing anything.
349+
// In some cases, multiple rdma devices are configured in cluster were disabled, so it should be enabled all the devices on nodes
350+
func RestoreConfiguredDisabledRdmaDevicesOnAllMayastorNodes() error {
351+
ifaceList := e2e_config.GetConfig().DisabledRdmaDevices
352+
if len(ifaceList) == 0 {
353+
logf.Log.Info("No rdma devices configured which needs to be restored")
354+
return nil
355+
}
356+
workerNodes, err := ListIOEngineNodes()
357+
if err != nil {
358+
return err
359+
}
360+
for _, node := range workerNodes.Items {
361+
logf.Log.Info("Enable rdma device", "Node", node.Name)
362+
for _, iface := range ifaceList {
363+
err := EnableRdmaOnNode(node.Name, iface)
364+
if err != nil {
365+
return err
366+
}
367+
}
368+
369+
}
370+
return nil
371+
}

common/k8stest/util_testpods.go

+66
Original file line numberDiff line numberDiff line change
@@ -1218,3 +1218,69 @@ func GetMayastorPodNameonNodeByPrefix(prefix string, nodeName string) (string, e
12181218
}
12191219
return "", fmt.Errorf("failed to get mayastor pod with prefix %s on node %s", prefix, nodeName)
12201220
}
1221+
1222+
// GetCsiNodePodNameOnNodeByPrefix return csi node pod on a node with a given prefix
1223+
func GetPodNameOnNodeByPrefix(podPrefix, nodeName, namespace string) (string, error) {
1224+
podApi := gTestEnv.KubeInt.CoreV1().Pods
1225+
pods, err := podApi(namespace).List(context.TODO(), metaV1.ListOptions{})
1226+
if err != nil {
1227+
return "", fmt.Errorf("failed to list pod in %s namespace, error: %v", common.NSMayastor(), err)
1228+
}
1229+
for _, pod := range pods.Items {
1230+
if strings.HasPrefix(pod.Name, podPrefix) {
1231+
if pod.Spec.NodeName == nodeName {
1232+
return pod.Name, nil
1233+
}
1234+
}
1235+
}
1236+
return "", fmt.Errorf("failed to get mayastor pod with prefix %s on node %s", podPrefix, nodeName)
1237+
}
1238+
1239+
func restartCsiNodePodsOnNode(nodeName string) error {
1240+
podApi := gTestEnv.KubeInt.CoreV1().Pods
1241+
1242+
podName, err := GetPodNameOnNodeByPrefix(e2e_config.GetConfig().Product.CsiDaemonsetName, nodeName, common.NSMayastor())
1243+
1244+
if err == nil {
1245+
logf.Log.Info("Restarting", "pod", podName)
1246+
time.Sleep(1 * time.Second)
1247+
1248+
delErr := podApi(common.NSMayastor()).Delete(context.TODO(), podName, metaV1.DeleteOptions{})
1249+
if delErr != nil {
1250+
logf.Log.Info("Failed to delete", "pod", podName, "error", delErr)
1251+
err = delErr
1252+
} else {
1253+
logf.Log.Info("Deleted", "pod", podName)
1254+
}
1255+
}
1256+
return err
1257+
}
1258+
1259+
// RestartCsiNodePodOnNode restart csi node pod scheduled on a given node and wait for mayastor pods and pool readiness
1260+
func RestartCsiNodePodOnNode(nodeName string, readyTOSecs int, poolsTOSecs int) error {
1261+
ready := false
1262+
1263+
err := restartCsiNodePodsOnNode(nodeName)
1264+
if err != nil {
1265+
logf.Log.Info("Warning: RestartMayastorPodsOnNode failed", "error", err)
1266+
}
1267+
1268+
ready, err = MayastorReady(10, readyTOSecs)
1269+
if err != nil {
1270+
return fmt.Errorf("failure waiting for mayastor to be ready %v", err)
1271+
}
1272+
if !ready {
1273+
return fmt.Errorf("mayastor is not ready after deleting all pods")
1274+
}
1275+
1276+
const sleepTime = 10
1277+
for ix := 0; ix < (poolsTOSecs+sleepTime-1)/sleepTime; ix++ {
1278+
time.Sleep(sleepTime * time.Second)
1279+
err = custom_resources.CheckAllMsPoolsAreOnline()
1280+
if err == nil {
1281+
break
1282+
}
1283+
}
1284+
1285+
return err
1286+
}

configurations/maasci_config.yaml

+3-1
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ configName: maasci
66
grpcMandated: true
77
deferredAssert: true
88
beforeEachCheckAndRestart: true
9-
networkInterface: ens1f0
9+
networkInterface: ens1f0
10+
disabledRdmaDevices:
11+
- "ens1f1"

0 commit comments

Comments
 (0)