diff --git a/Makefile b/Makefile index ed738906..b9903d6a 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ FILE_LIST = LICENSE \ README.md \ + alert \ crm_cfg \ icons \ man \ diff --git a/SAPHanaSR-angi.spec b/SAPHanaSR-angi.spec index 26892b73..053f3b3b 100644 --- a/SAPHanaSR-angi.spec +++ b/SAPHanaSR-angi.spec @@ -92,6 +92,9 @@ install -m 0644 srHook/susCostOpt.py %{buildroot}/usr/share/%{name}/ install -m 0644 srHook/susChkSrv.py %{buildroot}/usr/share/%{name}/ install -m 0444 srHook/global.ini_* %{buildroot}/usr/share/%{name}/samples +# alert manager +install -m 0755 alert/SAPHanaSR-alert-fencing %{buildroot}/usr/bin + # crm config templates install -m 0644 crm_cfg/angi-ScaleUp/[0-9]*_* %{buildroot}/usr/share/%{name}/samples/crm_cfg/angi-ScaleUp @@ -134,6 +137,7 @@ install -m 0444 tools/saphana_sr_tools.py %{buildroot}/usr/lib/%{name} /usr/bin/SAPHanaSR-filter-legacy /usr/bin/SAPHanaSR-hookHelper /usr/bin/SAPHanaSR-manageProvider +/usr/bin/SAPHanaSR-alert-fencing %license LICENSE %dir %{_docdir}/%{name} diff --git a/SAPHanaSR-tester.spec b/SAPHanaSR-tester.spec index 12b2e430..eb042cd1 100644 --- a/SAPHanaSR-tester.spec +++ b/SAPHanaSR-tester.spec @@ -20,7 +20,7 @@ License: GPL-2.0 Group: Productivity/Clustering/HA AutoReqProv: on Summary: Test suite for SAPHanaSR clusters -Version: 1.2.13 +Version: 1.2.14 Release: 0 Url: https://www.suse.com/c/fail-safe-operation-of-sap-hana-suse-extends-its-high-availability-solution/ @@ -80,7 +80,7 @@ install -m 0644 test/saphana_sr_test.py %{buildroot}/usr/lib/%{name} install -m 0755 test/cs_* %{buildroot}/usr/bin install -m 0755 test/callTest* %{buildroot}/usr/bin install -m 0755 test/loopTests* %{buildroot}/usr/bin -install -m 0755 test/sct_* %{buildroot}/usr/bin +install -m 0755 test/bin/sct_* %{buildroot}/usr/bin # client files install -m 0755 tools/SAPHanaSR-showAttr %{buildroot}/usr/bin diff --git a/alert/SAPHanaSR-alert-fencing b/alert/SAPHanaSR-alert-fencing new file mode 100755 index 00000000..6005e254 --- /dev/null +++ b/alert/SAPHanaSR-alert-fencing @@ -0,0 +1,102 @@ +#!/bin/bash +# +# SAPHanaSR-alert +# Author: Lars Pinne Fabian Herschel, June 2024 +# Support: linux@sap.com +# License: GNU General Public License (GPL) +# Copyright: (c) 2024 SUSE LLC +# Version: 2024-06-18-15:33 +# +# crm configure alert nodes-1 "/usr/bin/SAPHanaSR-alert" select nodes +# crm configure alert fencing-1 "/usr/bin/SAPHanaSR-alert" select fencing attributes alert_uptime_threshold=300 +# + +logger_tag="SAPHanaSR-alert-fencing" +logger="/usr/bin/logger" + +# ON_FAIL_ACTION="${OCF_RESKEY_ON_FAIL_ACTION:-proceed}" +CRM_alert_recipient="${CRM_alert_recipient:-/dev/null}" +crm_alert_kind="${CRM_alert_kind:-manual call}" +crm_alert_node="${CRM_alert_node:-$HOSTNAME}" +crm_alert_desc="${CRM_alert_desc:-no description provided}" + +$logger -t "$logger_tag" "AH: begin event '$crm_alert_kind'" +cache_file="/run/crm/SAPHanaSR_site_cache" + +alert_uptime_threshold="${alert_uptime_threshold:-300}" + +IFS=. read -r sys_uptime REST .RE .PP @@ -404,7 +364,6 @@ In case systemd-style init is used for the HANA database, the services can be checked. Example SID is HA1, instance number is 10. .PP .RS 2 -.br # systemctl list-unit-files | grep -i sap .br # systemctl status SAPHA1_10.service @@ -461,8 +420,7 @@ Please report any other feedback and suggestions to feedback@suse.com. 
.\" .SH SEE ALSO \fBocf_suse_SAPHanaTopology\fP(7) , \fBocf_suse_SAPHanaController\fP(7) , -\fBocf_suse_SAPHanaFilesystem\fP(7) , -\fBocf_heartbeat_IPAddr2\fP(7) , \fBocf_heartbeat_Filesystem\fP(7) , +\fBocf_suse_SAPHanaFilesystem\fP(7) , \fBocf_heartbeat_IPaddr2\fP(7) , \fBsbd\fP(8) , \fBstonith_sbd\fP(7) , \fBstonith_admin\fP(8) , \fBcrm_no_quorum_policy\fP(7) , \fBcrm\fP(8) , \fBcrm_simulate\fP(8) , \fBSAPHanaSR-ScaleOut\fP(7) , \fBSAPHanaSR-showAttr\fP(7) , diff --git a/man/SAPHanaSR-alert-fencing.8 b/man/SAPHanaSR-alert-fencing.8 new file mode 100644 index 00000000..6a6f91d8 --- /dev/null +++ b/man/SAPHanaSR-alert-fencing.8 @@ -0,0 +1,178 @@ +.\" Version: 1.001 +.\" +.TH SAPHanaSR-alert-fencing 7 "12 Jun 2024" "" "SAPHanaSR" +.\" +.SH NAME +SAPHanaSR-alert-fencing \- Alert agent for cluster fencing alerts. +.PP +.\" +.SH DESCRIPTION +SAPHanaSR-alert-fencing can be used to react on Linux cluster fencing alerts. +.PP +The Linux cluster provides an interface to initiate external action when a cluster +event occurs (alert). Than the cluster calls an external program (an alert agent) +to handle that alert. +.PP +When the Linux cluster has performed an node fencing, it can call +SAPHanaSR-alert-fencing on each active cluster node. The agent checks whether +the local node belongs to the same HANA site as the fenced node. If so, it asks +the cluster to fence the local node as well. +.PP +This improves three use cases for HANA scale-out: +.br +- HA/DR provider hook script susChkSrv.py action_on_lost=fence +.br +- resource agent SAPHanaController ON_FAIL_ACTION=fence +.br +- resource agent SAPHanaFilesystem ON_FAIL_ACTION=fence +.br +See also manual pages ocf_sus_SAPHanaController(7), ocf_suse_SAPHanaFilesystem(7) +and susChkSrv.py(7). +.PP +.\" +.SH SUPPORTED PARAMETERS +.TP +\fBtimeout\fR +If the alert agent does not complete within this amount of time, it will be terminated. Optional, default "30s". Example "meta timeout=30s". +.\" .TP +.\" \fBenabled\fR +.\" If false for an alert, the alert will not be used. If true for an alert and false for a particular recipient of that alert, that recipient will not be used. Optional, default "true". +.TP +\fBalert_uptime_threshold\fR +How long a node must be up and running (uptime) before fencing alerts will be processed. This avoids fencing loops. Optional, default "300". Example "attributes alert_uptime_threshold=300". +.\" +.PP +.\" +.SH RETURN CODES +.B 0 +Successful program execution. +.br +.B >0 +Usage, syntax or execution errors. +.br +In addition log entries are written, which can be scanned by using a pattern +like "SAPHanaSR-alert-fencing". +.PP +.\" +.SH EXAMPLES +.PP +\fB*\fR Example configuration for the fencing alert handler. +.PP +The following lines needs to be added to the cluster´s CIB: +.PP +.RS 2 +alert fencing-1 "/usr/bin/SAPHanaSR-alert-fencing" \\ +.br + select fencing \\ +.br + attributes alert_uptime_threshold=300 +.RE +.PP +\fB*\fR Example for configuring the alert agent by using crm. +.PP +Alternate way for configuring the alert agent. +.PP +.RS 2 +# crm configure alert fencing-1 "/usr/bin/SAPHanaSR-alert-fencing" select fencing +.RE +.PP +\fB*\fR Showing all configured alert agents. +.PP +.RS 2 +# crm configure show type:alert +.RE +.PP +\fB*\fR Showing agent messages. +.PP +.RS 2 +# grep SAPHanaSR-alert-fencing /var/log/messages +.RE +.PP +\fB*\fR Showing history of fence actions and cleaning it up. +.PP +Example node with failed fencing action is node22. 
+.PP +.RS 2 +# crm_mon -1 --include=none,fencing +.br +# stonith_admin --cleanup --history node22 +.RE +.PP +\fB*\fR Example for manually fencing a node. +.PP +This could be done for testing the SAPHanaSR-alert-fencing agent integration. +This test should not be done on production systems. +See manual page crm(8) for details. +Fenced node is node1. +.br +Note: Understand the impact before trying. +.PP +.RS 2 +# crm node fence node1 +.RE +.PP +.\" +.SH FILES +.TP +/usr/bin/SAPHanaSR-alert-fencing +the alert agent +.TP +/run/crm/SAPHanaSR_site_cache +the internal cache for the host-to-site relation - do not touch this file +.TP +/etc/sysconfig/sbd +config file for SBD daemon +.PP +.\" +.SH REQUIREMENTS +1. Pacemaker 2.1.2 or newer. +.PP +2. SAP HANA scale-out performance-optimized scenario. No HANA host auto-failover, +thus no standby nodes. +.PP +3. Only one SID is controlled by the Linux cluster. +.PP +4. Site names and host names should not be changed. +.PP +5. No other alert agent should be configured for the fencing alert. +.PP +6. SAPHanaFilesystem RA with monitor operations is active. +.PP +7. Automatic restart of just fenced nodes should be disabled by adapting +SBD_START_MODE. In case of automatic restart of just fenced nodes, it might be +necessary to adapt SBD_START_DELAY in order to avoid fencing loops. See manual +page sbd(8). +.PP +8. Fencing is executed unconditionally. The alert agent relies on the preceding +fencing decision. Neither site role nor SR state is checked. +.PP +9. The alert agent runtime almost completely depends on call-outs to OS and +Linux cluster. +.\" +.SH BUGS +In case of any problem, please use your favourite SAP support process to open +a request for the component BC-OP-LNX-SUSE. +Please report any other feedback and suggestions to feedback@suse.com. +.PP +.\" +.SH SEE ALSO +\fBSAPHanaSR-angi\fP(7) , \fBSAPHanaSR-ScaleOut\fP(7) , +\fBocf_suse_SAPHanaController\fP(7) , \fBocf_suse_SAPHanaFilesystem\fP(7) , +\fBsusChkSrv.py\fP(7) , \fBcrm\fP(8) , \fBsbd\fP(8) , +.br +https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Administration/singlehtml/#alert-agents +.PP +.\" +.SH AUTHORS +F.Herschel, L.Pinne. +.PP +.\" +.SH COPYRIGHT +.br +(c) 2024 SUSE LLC +.br +SAPHanaSR-alert-fencing comes with ABSOLUTELY NO WARRANTY. +.br +For details see the GNU General Public License at +http://www.gnu.org/licenses/gpl.html +.\" diff --git a/man/SAPHanaSR-showAttr.8 b/man/SAPHanaSR-showAttr.8 index 6a5411ca..438db263 100644 --- a/man/SAPHanaSR-showAttr.8 +++ b/man/SAPHanaSR-showAttr.8 @@ -532,7 +532,7 @@ show version. .TP 4 \fB --select\fR \fISELECT\fR .\" TODO explain meaning of values -show selected information only. Allowed values: [ all | default | minimal | sr | cluster | cluster2 | cluster3 ]. Default is default. +show selected information only. Allowed values: [ all | default | minimal | sr | cluster | cluster2 | cluster3 | sitelist ]. Default is default. .TP 4 \fB --sid\fR \fISID\fR use SAP system ID \fISID\fR. Should be autodetected, if there is only one SAP HANA instance installed on the local cluster node. The SAP system ID is a 3 alphanum string with a valid SAP system name like SLE, HAE, FH1, C11, or P42. @@ -542,7 +542,7 @@ use SAP system ID \fISID\fR. Should be autodetected, if there is only one SAP HA sort Hosts section table by field. Allowed values: [ roles | site ]. Default is sort by hostnames. .TP 4 \fB --format\fR \fIFORMAT\fR -output format. Allowed values: [ script | tables | json | tester ]. Default is tables. +output format.
Allowed values: [ script | tables | json | tester | csv | cache ]. Default is tables. .TP 4 \fB --cib\fR \fIOFFLINE_CIB_FILE\fR read data from given offline CIB file. diff --git a/man/SAPHanaSR-upgrade-to-angi-demo.8 b/man/SAPHanaSR-upgrade-to-angi-demo.8 index 64fac6ce..ca9c72ca 100644 --- a/man/SAPHanaSR-upgrade-to-angi-demo.8 +++ b/man/SAPHanaSR-upgrade-to-angi-demo.8 @@ -147,7 +147,7 @@ performing an upgrade: The script needs to be copied to all cluster nodes upfront. It should be called on the HANA primary node. Before doing this, you should check and prepare pre-requisites, see example above. The proposed commands need to be checked. -Sometimes adaptions are neccessary. +Sometimes adaptions are necessary. See also SAPHanaSR_upgrade_to_angi(7). .PP .RS 2 @@ -161,7 +161,7 @@ Before doing this, you should check and prepare pre-requisites, see example above. The runbook draft is stored as file "SAPHanaSR-upgrade-draft.txt". This draft can be used for preparing details for the upgrade procedure. The proposed commands need to be checked. Sometimes adaptions are -neccessary. Of course the result needs to be checked finally as well. +necessary. Of course the result needs to be checked finally as well. See also SAPHanaSR_upgrade_to_angi(7) and tee(1). .PP .RS 2 @@ -200,7 +200,7 @@ performing the removal of SAPHanaSR: The script needs to be copied to all cluster nodes beforehand. It should be called on the HANA primary node. Before doing this, you should check and prepare pre-requisites, see example above. The proposed commands need to be checked. -Sometimes adaptions are neccessary. Of course the result needs to be checked +Sometimes adaptions are necessary. Of course the result needs to be checked finally as well. See also SAPHanaSR_upgrade_to_angi(7). .PP .RS 2 diff --git a/man/SAPHanaSR.7 b/man/SAPHanaSR.7 index 26e581fb..68b6103e 100644 --- a/man/SAPHanaSR.7 +++ b/man/SAPHanaSR.7 @@ -193,7 +193,7 @@ best practices. .PP 2. Technical users and groups such as sidadm are defined locally in the Linux system. If users are resolved by remote service, local caching is -neccessary. Substitute user (su) to sidadm needs to work reliable and without +necessary. Substitute user (su) to sidadm needs to work reliable and without customized actions or messages. Supported shell is bash. .PP 3. Strict time synchronization between the cluster nodes, e.g. NTP. All nodes of diff --git a/man/SAPHanaSR_basic_cluster.7 b/man/SAPHanaSR_basic_cluster.7 index 85ec5347..5ff08b17 100644 --- a/man/SAPHanaSR_basic_cluster.7 +++ b/man/SAPHanaSR_basic_cluster.7 @@ -225,7 +225,7 @@ with the HANA primary master nameserver. .PP .RS 2 .br -primitive rsc_ip_SLE_HDB00 IPAddr2 \\ +primitive rsc_ip_SLE_HDB00 IPaddr2 \\ .br op monitor interval=10 timeout=20 \\ .br @@ -246,7 +246,7 @@ along with the HANA secondary master nameserver. .PP .RS 2 .br -primitive rsc_ip_ro_SLE_HDB00 IPAddr2 \\ +primitive rsc_ip_ro_SLE_HDB00 IPaddr2 \\ .br op monitor interval=10 timeout=20 \\ .br @@ -278,7 +278,7 @@ HANA resource, you need to reduce that additional resource´s stickiness to 1. .PP .RS 2 .br -primitive rsc_ip_SLE_HDB00 IPAddr2 \\ +primitive rsc_ip_SLE_HDB00 IPaddr2 \\ .br op monitor interval=10s timeout=20s \\ .br @@ -423,7 +423,7 @@ Please report any other feedback and suggestions to feedback@suse.com. 
.\" .SH SEE ALSO \fBocf_suse_SAPHanaTopology\fP(7) , \fBocf_suse_SAPHanaController\fP(7) , -\fBocf_heartbeat_IPAddr2\fP(7) , \fBocf_heartbeat_Filesystem\fP(7) , +\fBocf_suse_SAPHanaFilesystem\fP(7) , \fBocf_heartbeat_IPaddr2\fP(7) , \fBocf_heartbeat_MailTo\fP(7) , \fBsbd\fP(8) , \fBstonith_sbd\fP(7) , \fBstonith_admin\fP(8) , \fBcrm_no_quorum_policy\fP(7) , \fBcrm\fP(8) , \fBcrm_simulate\fP(8) , diff --git a/man/SAPHanaSR_maintenance_examples.7 b/man/SAPHanaSR_maintenance_examples.7 index 5db66fc2..f653efaa 100644 --- a/man/SAPHanaSR_maintenance_examples.7 +++ b/man/SAPHanaSR_maintenance_examples.7 @@ -1,6 +1,6 @@ .\" Version: 1.001 .\" -.TH SAPHanaSR_maintenance_examples 7 "25 Jan 2024" "" "SAPHanaSR" +.TH SAPHanaSR_maintenance_examples 7 "08 May 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR_maintenance_examples \- maintenance examples for SAPHanaController. @@ -47,7 +47,7 @@ This might be convenient when performing administrative actions or cluster tests \fB*\fR Overview on stopping the HANA database at one site. This procedure does work for scale-up and scale-out. No takeover will be done. This procedure -should be used, when it is neccessary to stop the HANA database. Stopping the HANA database +should be used, when it is necessary to stop the HANA database. Stopping the HANA database should not be done by just stopping the Linux cluster or shutting down the OS. This particularly applies to scale-out systems. It might be good to define upfront which HANA site needs to be stopped. In case both sites need to be stopped, it might be good to define the order. First @@ -297,12 +297,32 @@ This is an advanced task. 10. Please bring back the other node and register that HANA as soon as possible. If the HANA primary stays alone for too long, the log area will fill up. .RE .PP +\fB*\fR Start Linux cluster after node has been fenced. + +It is recommended to not configure the Linux cluster for always starting +autmatically on boot. Better is to start automatically only, if cluster and/or +node have been stopped cleanly. If the node has been rebooted by STONITH, the +cluster should not start automatically. If the cluster is configure that way, +some steps are needed to start the cluster after a node has been rebooted by +STONITH. STONITH via SBD is used in this example. +.PP +.RS 2 +# cs_clear_sbd_devices --all +.br +# cs_show_sbd_devices +.br +# crm cluster start +.br +# crm_mon -r +.RE +.PP .\" \fB*\fR Overview on maintenance procedure for Linux, HANA remains running, on pacemaker-2.0. It is necessary to wait for each step to complete and to check the result. It -also is necessary to test and document the whole procedure before applying in production. -See also section REQUIREMENTS below and example on checking status of HANA and cluster above. +also is necessary to test and document the whole procedure before applying in +production. See also section REQUIREMENTS below and example on checking status +of HANA and cluster above. .\" TODO details .PP .RS 2 diff --git a/man/SAPHanaSR_upgrade_to_angi.7 b/man/SAPHanaSR_upgrade_to_angi.7 index 007d763d..025f6227 100644 --- a/man/SAPHanaSR_upgrade_to_angi.7 +++ b/man/SAPHanaSR_upgrade_to_angi.7 @@ -1,6 +1,6 @@ .\" Version: 1.001 .\" -.TH SAPHanaSR_upgrade_to_angi 7 "02 Apr 2024" "" "SAPHanaSR" +.TH SAPHanaSR_upgrade_to_angi 7 "04 Jul 2024" "" "SAPHanaSR" .\" .SH NAME SAPHanaSR_upgrade_to_angi \- How to upgrade from SAPHanaSR or SAPHanaSR-ScaleOut to SAPHanaSR-angi. @@ -16,9 +16,9 @@ fully backward compatible. 
Upgrading existing clusters is possible by following a defined procedure. The upgrade should lead to the same configuration as an installation from scratch. .PP -The upgrade procedure depends on an initial setup as decribed in setup guides +The upgrade procedure depends on an initial setup as described in setup guides and manual pages. See REQUIREMENTS below and in manual pages SAPHanaSR(7) or -SAPHanaSR-ScaleOut(7). The procedure does not neccesarily need downtime for +SAPHanaSR-ScaleOut(7). The procedure does not necessarily need downtime for HANA, if planned and excuted carefully. Nevertheless, it should be done under friendly conditions. .PP @@ -77,8 +77,8 @@ hana__site_opMode_ hana__site_srMode_ .br hana__site_srPoll_ -.br -TODO vhost remoteHost +.\" .br +.\" TODO vhost remoteHost .RE .PP \fB*\fR What will be changed for SAP HANA scale-out scenarios? @@ -96,18 +96,19 @@ c. Tools are placed in /usr/bin/ instead of /usr/sbin/. .br d. Node attributes will be removed. .RS 4 -gra +hana__gra .br -gsh -TODO +hana__gsh .RE e. Site and global attributes will be removed from property SAPHanaSR. .RS 4 -mts -upd +hana__glob_mts +.br +hana__glob_upd +.br hana__glob_sync_state +.br hana__glob_srHook (in case of obsolete scale-out SAPHanaSR.py) -TODO .RE f. Site and global attributes will be added to property SAPHanaSR. .RS 4 @@ -124,15 +125,13 @@ hana__site_srr_ hana__site_srMode_ .br hana__site_srPoll_ -.br -TODO .RE .PP \fB*\fR How does the upgrade procedure look like at a glance? .PP The upgrade procedure consists of four phases: preparing, removing, adding, finalising. Linux cluster and HANA are kept running. However, resource -management is disabled and the system goes thru fragiles states during the +management is disabled and the system goes thru fragile states during the upgrade. .PP .RS 2 diff --git a/man/ocf_suse_SAPHana.7 b/man/ocf_suse_SAPHana.7 index bb003b63..f1af26b1 100644 --- a/man/ocf_suse_SAPHana.7 +++ b/man/ocf_suse_SAPHana.7 @@ -1,6 +1,6 @@ .\" Version: 0.160.1 .\" -.TH ocf_suse_SAPHana 7 "13 Dec 2023" "" "OCF resource agents" +.TH ocf_suse_SAPHana 7 "21 Jun 2024" "" "OCF resource agents" .\" .SH NAME SAPHana \- Manages takeover between two SAP HANA databases with system replication. @@ -145,10 +145,10 @@ Optional. Default value: false\&. .RS 4 Time difference needed between two primary time stamps (LPTs), in case a dual-primary situation occurs. If the difference between both node's -last primary time stamps is less than DUPLICATE_PRIMARY_TIMESTAMP, +last primary time stamps is less than DUPLICATE_PRIMARY_TIMEOUT, then the cluster holds one or both instances in a "WAITING" status. This is to give an admin the chance to react on a takeover. -Note: How the cluster proceeds after the DUPLICATE_PRIMARY_TIMESTAMP +Note: How the cluster proceeds after the DUPLICATE_PRIMARY_TIMEOUT has passed, depends on the parameter AUTOMATED_REGISTER. See also the examples section below. .br @@ -596,7 +596,7 @@ F.Herschel, L.Pinne. .br (c) 2015-2017 SUSE Linux GmbH, Germany. .br -(c) 2018-2023 SUSE LLC +(c) 2018-2024 SUSE LLC .br The resource agent SAPHana comes with ABSOLUTELY NO WARRANTY. 
.br diff --git a/man/ocf_suse_SAPHanaController.7 b/man/ocf_suse_SAPHanaController.7 index 4d4d6e87..e1f73007 100644 --- a/man/ocf_suse_SAPHanaController.7 +++ b/man/ocf_suse_SAPHanaController.7 @@ -1,6 +1,6 @@ .\" Version: 1.001 .\" -.TH ocf_suse_SAPHanaController 7 "14 Mar 2024" "" "OCF resource agents" +.TH ocf_suse_SAPHanaController 7 "21 Jun 2024" "" "OCF resource agents" .\" .SH NAME SAPHanaController \- Manages takeover between two SAP HANA databases with system replication. @@ -31,7 +31,7 @@ SAPHanaController and SAPHanaTopology RAs, the SAPHanaSR solution uses an "HA/DR provider" API provided by HANA to get informed about the current state of the system replication. .PP -On initial cluster start, the cluster needs to detect a valid HANA system replication setup, including system replication status (SOK) and last primary timestamp (LPT). This is neccessary to ensure data integrity. +On initial cluster start, the cluster needs to detect a valid HANA system replication setup, including system replication status (SOK) and last primary timestamp (LPT). This is necessary to ensure data integrity. .PP The SAPHanaController RA performs the actual check of the SAP HANA database instances and is configured as promoatble clone resource. @@ -141,6 +141,7 @@ Defines how the RA escalates monitor failures on an HANA primary node. If srHook=SOK, in case of monitor failure an node fencing could be triggered. For srHook=SFAIL, the restart will be proceeded as usual. This option may speed up takeover on scale-up systems, depending on how long HANA needs for stopping. +For scale-out see also SAPHanaSR-alert-fencing(8). Values: [ proceed | fence ]. .br - proceed: proceed the failure as usual, i.e. initiate demote-stop sequence. @@ -159,8 +160,8 @@ Optional. Default value: false\&. .PP \fBDUPLICATE_PRIMARY_TIMEOUT\fR .RS 4 -Time difference needed between two primary time stamps (LPTs), in case a dual-primary situation occurs. If the difference between both node's last primary time stamps is less than DUPLICATE_PRIMARY_TIMESTAMP, then the cluster holds one or both instances in a "WAITING" status. This is to give an admin the chance to react on a failover. -Note: How the cluster proceeds after the DUPLICATE_PRIMARY_TIMESTAMP has passed, depends on the parameter AUTOMATED_REGISTER. See also the examples section below. +Time difference needed between two primary time stamps (LPTs), in case a dual-primary situation occurs. If the difference between both node's last primary time stamps is less than DUPLICATE_PRIMARY_TIMEOUT, then the cluster holds one or both instances in a "WAITING" status. This is to give an admin the chance to react on a failover. +Note: How the cluster proceeds after the DUPLICATE_PRIMARY_TIMEOUT has passed, depends on the parameter AUTOMATED_REGISTER. See also the examples section below. .br Optional. Default value: 7200\&. .RE @@ -518,7 +519,13 @@ Please report any other feedback and suggestions to feedback@suse.com. 
.PP .\" .SH SEE ALSO -\fBocf_suse_SAPHanaTopology\fP(7) , \fBocf_suse_SAPHanaFilesystem\fP(7) , \fBocf_heartbeat_IPaddr2\fP(8) , \fBSAPHanaSR-monitor\fP(8) , \fBSAPHanaSR-showAttr\fP(8) , \fBSAPHanaSR\fP(7) , \fBSAPHanaSR-ScaleOut\fP(7) , \fBSAPHanaSR_maintenance_examples\fP(7) , \fBSAPHanaSR_basic_cluster\fP(7) , \fBSAPHanaSR-ScaleOut_basic_cluster\fP(7) , \fBSAPHanaSR-manageAttr\fP(8) , \fBchrony.conf\fP(5) , \fBstonith\fP(8) , \fBcrm\fP(8) +\fBocf_suse_SAPHanaTopology\fP(7) , \fBocf_suse_SAPHanaFilesystem\fP(7) , +\fBocf_heartbeat_IPaddr2\fP(8) , \fBSAPHanaSR-monitor\fP(8) , \fBSAPHanaSR-showAttr\fP(8) , +\fBSAPHanaSR\fP(7) , \fBSAPHanaSR-ScaleOut\fP(7) , +\fBSAPHanaSR_maintenance_examples\fP(7) , \fBSAPHanaSR_basic_cluster\fP(7) , +\fBSAPHanaSR-ScaleOut_basic_cluster\fP(7) , \fBSAPHanaSR-manageAttr\fP(8) , +\fBSAPHanaSR-alert-fencing\fP(8) , +\fBchrony.conf\fP(5) , \fBstonith\fP(8) , \fBcrm\fP(8) .br https://documentation.suse.com/sbp/sap/ , .br diff --git a/man/ocf_suse_SAPHanaFilesystem.7 b/man/ocf_suse_SAPHanaFilesystem.7 index 8363bb3d..10decf5a 100644 --- a/man/ocf_suse_SAPHanaFilesystem.7 +++ b/man/ocf_suse_SAPHanaFilesystem.7 @@ -1,6 +1,6 @@ .\" Version: 1.001 .\" -.TH ocf_suse_SAPHanaFilesystem 7 "24 Apr 2024" "" "OCF resource agents" +.TH ocf_suse_SAPHanaFilesystem 7 "24 Jun 2024" "" "OCF resource agents" .\" .SH NAME SAPHanaFilesystem \- Monitors mounted SAP HANA filesystems. @@ -426,8 +426,10 @@ SAPHanaSR-angi(7) and its references. .br 6. SAP HANA host auto-failover is currently not supported. .br -7. If an HANA worker node of a scale-out site got fenced but not the master -nameserver, the time needed for stopping the whole site depends on HANA timeouts. +7. For HANA scale-out, the SAPHanaSR-alert-fencing should be configured. See manual +page SAPHanaSR-alert-fencing(8) for details. +.\" 7. If an HANA worker node of a scale-out site got fenced but not the master +.\" nameserver, the time needed for stopping the whole site depends on HANA timeouts. .PP .\" .SH BUGS @@ -439,7 +441,8 @@ Please report any other feedback and suggestions to feedback@suse.com. .SH SEE ALSO \fBocf_suse_SAPHanaController\fP(7) , \fBocf_suse_SAPHanaTopology\fP(7) , \fBsusHanaSR.py\fP(7) , \fBSAPHanaSR-showAttr\fP(8) , -\fBSAPHanaSR-angi\fP(7) , \fBSAPHanaSR\fP(7) , \fBSAPHanaSR-ScaleOut\fP(7) , +\fBSAPHanaSR-alert-fencing\fP(8) , \fBSAPHanaSR-angi\fP(7) , \fBSAPHanaSR\fP(7) , +\fBSAPHanaSR-ScaleOut\fP(7) , \fBfstab\fP(5) , \fBmount\fP(8) , \fBnfs\fP(5) , .br https://documentation.suse.com/sbp/sap/ , diff --git a/man/ocf_suse_SAPHanaTopology.7 b/man/ocf_suse_SAPHanaTopology.7 index 15da329f..ced3d55d 100644 --- a/man/ocf_suse_SAPHanaTopology.7 +++ b/man/ocf_suse_SAPHanaTopology.7 @@ -1,6 +1,6 @@ .\" Version: 1.001 .\" -.TH ocf_suse_SAPHanaTopology 7 "13 Dec 2023 "" "OCF resource agents" +.TH ocf_suse_SAPHanaTopology 7 "02 Jul 2024" "" "OCF resource agents" .\" .SH NAME SAPHanaTopology \- Helps to manage two SAP HANA databases with system replication. @@ -12,10 +12,10 @@ SAPHanaTopology \- Helps to manage two SAP HANA databases with system replicatio .\" .SH DESCRIPTION SAPHanaTopology is a resource agent (RA) that analyzes the SAP HANA topology -and "sends" all findings via the node status attributes to all nodes in the -cluster. These attributes are taken by the SAPHanaController RA to control the -SAP HANA databases. In addition SAPHanaTopology starts and monitors the local -saphostagent. +and "sends" all findings via cluster information base (CIB) attributes to all +nodes in the cluster. 
These attributes are taken by the SAPHanaController RA to +control the SAP HANA databases. In addition SAPHanaTopology starts and monitors +the local saphostagent. .PP The resource agent uses the following interfaces provided by SAP: .PP @@ -315,7 +315,7 @@ F.Herschel, L.Pinne. .br (c) 2015-2017 SUSE Linux GmbH, Germany. .br -(c) 2018-2023 SUSE LLC +(c) 2018-2024 SUSE LLC .br SAPHanaTopology comes with ABSOLUTELY NO WARRANTY. .br diff --git a/man/susChkSrv.py.7 b/man/susChkSrv.py.7 index beb8bd3e..b11f98f6 100644 --- a/man/susChkSrv.py.7 +++ b/man/susChkSrv.py.7 @@ -1,6 +1,6 @@ .\" Version: 1.001 .\" -.TH susChkSrv.py 7 "18 Mar 2024" "" "SAPHanaSR" +.TH susChkSrv.py 7 "24 Jun 2024" "" "SAPHanaSR" .\" .SH NAME susChkSrv.py \- Provider for SAP HANA srHook method srServiceStateChanged(). @@ -75,8 +75,9 @@ If this is combined with SAPHanaController RA parameter 'AUTOMATED_REGISTER=true HANA needs to release all OS resources prior to the automated registering. .br - \fBfence\fP: do 'crm node fence <\fIhost\fR>'. This needs a Linux cluster -STONITH method and sudo permission. This action is primarily meant for scale-up.If it happens on a scale-out worker node, the remaining master needs to time -out before the Linux cluster will react. +STONITH method and sudo permission. This action is primarily meant for scale-up. +For scale-out, SAPHanaSR-alert-fencing should be configured additionally, see +manual page SAPHanaSR-alert-fencing(8) for details. .br .\" TODO - suicide: do 'systemctl reboot'. Do NOT use this! .\" .br @@ -112,7 +113,7 @@ See also SAPHanaSR_basic_cluster(7). .br Optional. Default is 20 seconds. .TP -* The "HA/DR providers" API accepts the following parameter for the trace section in globnal.ini: +* The "HA/DR providers" API accepts the following parameter for the trace section in global.ini: .TP \fB[trace]\fP .TP @@ -427,8 +428,8 @@ susChkSrv.py parameter 'action_on_lost=fence' is set. SAPHanaController parameter 'AUTOMATED_REGISTER=true' is set, it depends on HANA to release all OS resources prior to the registering attempt. .PP -10. If an HANA worker node of a scale-out site got fenced but not the master -nameserver, the time needed for stopping the whole site depends on HANA timeouts. +10. For HANA scale-out, the susChkSrv.py parameter 'action_on_lost=fence' should +only be used if the SAPHanaSR-alert-fencing agent is configured. .PP 11. If the hook provider should be pre-compiled, the particular Python version that comes with SAP HANA has to be used. @@ -442,8 +443,9 @@ Please report any other feedback and suggestions to feedback@suse.com. .SH SEE ALSO \fBSAPHanaSR\fP(7) , \fBSAPHanaSR-ScaleOut\fP(7) , \fBSAPHanaSR.py\fP(7) , \fBocf_suse_SAPHanaTopology\fP(7) , \fBocf_suse_SAPHanaController\fP(7) , -\fBSAPHanaSR-hookHelper\fP(8) , -\fBSAPHanaSR-manageProvider\fP(8) , \fBcrm\fP(8) , \fBcrm_attribute\fP(8) , +\fBSAPHanaSR-hookHelper\fP(8) , \fBSAPHanaSR-manageProvider\fP(8) , +\fBSAPHanaSR-alert-fencing\fP(8) , +\fBcrm\fP(8) , \fBcrm_attribute\fP(8) , \fBpython3\fP(8) , \fBkillall\fP(1) , .br https://help.sap.com/docs/SAP_HANA_PLATFORM?locale=en-US diff --git a/man/susCostOpt.py.7 b/man/susCostOpt.py.7 index 439c5e0f..4949bdd4 100644 --- a/man/susCostOpt.py.7 +++ b/man/susCostOpt.py.7 @@ -150,10 +150,10 @@ Please refer to SAP documentation for details on HANA commands. * Overview on recovery procedure for reverting to normal operation after takeover. .PP On postTakeover() the hook script changes configuration in memory and in -persistence.
It is neccessary to recover the initial settings on secondary site +persistence. It is necessary to recover the initial settings on secondary site (step 7) before the fully operational state can be re-established (steps 8-11). Futher the HANA cluster resource default is AUTOMATED_REGISTER=false. This also -makes administrative interaction neccessary (steps 1-4). +makes administrative interaction necessary (steps 1-4). If AUTOMATED_REGISTER=true is set, the Linux cluster will do that automatically. See manual page ocf_suse_SAPHanaController(7) for details on cluster resource settings. Use exact same site names as known to the Linux cluster. See manual page diff --git a/ra/SAPHanaFilesystem b/ra/SAPHanaFilesystem index 9ea34e0e..caa2f6a6 100755 --- a/ra/SAPHanaFilesystem +++ b/ra/SAPHanaFilesystem @@ -21,7 +21,7 @@ # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # ####################################################################### -SAPHanaFilesystemVersion="1.2.7" +SAPHanaFilesystemVersion="1.2.8" # # Initialization: timeB=$(date '+%s') diff --git a/ra/saphana-common-lib b/ra/saphana-common-lib index 2b533de3..78571166 100755 --- a/ra/saphana-common-lib +++ b/ra/saphana-common-lib @@ -131,7 +131,7 @@ function core_init() { SAPSTARTPROFILE="" # Resource Agent Generation # shellcheck disable=SC2034 - RAG="2.0" + RAG="3.0" SAPHanaFilter='ra-act-dec-lpa' super_ocf_log info "RA saphana_common_lib_version=$saphana_common_lib_version" set +o posix # disable possix mode of the bash diff --git a/ra/saphana-controller-common-lib b/ra/saphana-controller-common-lib index 11055fdd..5482e04e 100755 --- a/ra/saphana-controller-common-lib +++ b/ra/saphana-controller-common-lib @@ -559,15 +559,15 @@ function get_hana_landscape_status() { else super_ocf_log info "RUNTIME do NOT use cached value for lss return code (cache_mode=$cache_mode, g_cache_lss=$g_cache_lss)" hana_LSS_Out=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python landscapeHostConfiguration.py --sapcontrol=1" 2>/dev/null); rc=$? - if [[ "$rc" == 124 ]]; then + if [[ "$rc" -ge 124 ]]; then # TODO: PRIO 1: Check, if we should loop here like 'for i in 1 2 3 ...' ? # landscape timeout - super_ocf_log warn "RA: landscapeHostConfiguration.py TIMEOUT after $HANA_CALL_TIMEOUT seconds" + super_ocf_log warn "RA: landscapeHostConfiguration.py TIMEOUT after $HANA_CALL_TIMEOUT seconds (rc=$rc)" sleep 20 # shellcheck disable=SC2034 hana_LSS_Out=$(HANA_CALL --timeout "$HANA_CALL_TIMEOUT" --cmd "python landscapeHostConfiguration.py --sapcontrol=1" 2>/dev/null); rc=$? - if [ "$rc" == 124 ]; then - super_ocf_log warn "RA: landscapeHostConfiguration.py second TIMEOUT after $HANA_CALL_TIMEOUT seconds" + if [ "$rc" -ge 124 ]; then + super_ocf_log warn "RA: landscapeHostConfiguration.py second TIMEOUT after $HANA_CALL_TIMEOUT seconds (rc=$rc)" # TODO PRIO2: How to handle still hanging lss - current solution is to say "FATAL" - Maybe we should return the stored attribute value? rc=0 fi diff --git a/ra/saphana-controller-lib b/ra/saphana-controller-lib index 235e7afe..dac3ce5c 100755 --- a/ra/saphana-controller-lib +++ b/ra/saphana-controller-lib @@ -161,7 +161,7 @@ function saphana_print_parameters() { Define timeout how long a call to HANA to receive information can take. Define timeout how long a call to HANA to receive information can take. This could be eg landscapeHostConfiguration.py. There are some specific calls to HANA which have their own timeout values. 
For example the takeover command does not timeout (inf). - If the timeout is reached, the return code will be 124. If you increase the timeouts for HANA calls you should also adjust the operation timeouts + If the timeout is reached, the return code will be 124 or 137 (for kill -9). If you increase the timeouts for HANA calls you should also adjust the operation timeouts of your cluster resources. @@ -239,105 +239,6 @@ function saphana_methods() { return "$rc" } # end function saphana_methods -function saphana_init_handle_update() { - # called by: ?? (not found) - # TODO PRIO2: check, if we still need this - # handle RA update state - node_updated=$(get_hana_attribute "${NODENAME}" "${ATTR_NAME_HANA_GRA[@]}") - onode_site=$(get_hana_attribute "${NODENAME}" "${ATTR_NAME_HANA_SITE[@]}") - if [ -n "$onode_site" ] && [ "$node_updated" != "$RAG" ]; then - # only set on HANA nodes, skip the decision maker / tiebreaker - # attribute is empty or set by an older RA generation - # update attribute to the new RA generation - node_updated="$RAG" - set_hana_attribute "${NODENAME}" "${node_updated}" "${ATTR_NAME_HANA_GRA[@]}" - fi - cluster_update_state=$(get_hana_attribute "X" "${ATTR_NAME_HANA_UPD_STATE[@]}") - # evaluate and update/set the update state of the cluster only on a master - # nameserver to prevent race conditions - if is_master_nameserver; then - nrOfNodes=1 - updNodes=1 - for onode in "${otherNodes[@]}"; do - onode_site=$(get_hana_attribute "$onode" "${ATTR_NAME_HANA_SITE[@]}") - if [ -z "$onode_site" ]; then - # not a HANA node, maybe the decision maker / tiebreaker - continue - fi - (( nrOfNodes++ )) - onode_updated=$(get_hana_attribute "$onode" "${ATTR_NAME_HANA_GRA[@]}") - if [ -n "$onode_updated" ]; then - if [ "$onode_updated" == "$RAG" ]; then - # only nodes with an equal RA generation as the local - # node are considered as 'updated nodes' - (( updNodes++ )) - fi - fi - done - if [ "$nrOfNodes" == "$updNodes" ]; then - updState="ok" - else - # oldNodes=$(("$nrOfNodes" - "$updNodes")) # TODO PRIO2: NG - check usage of this variable (oldNodes) - updState="nok" - fi - # if cluster_update_state is empty or the stored value does not match the - # current running state 'updState' update the attribute and the variable - if [ "$cluster_update_state" != "$updState" ]; then - set_hana_attribute "X" "$updState" "${ATTR_NAME_HANA_UPD_STATE[@]}" - cluster_update_state="$updState" - fi - fi - super_ocf_log info "UPD: cluster update state is '$cluster_update_state'" - # if NOT all cluster nodes are updated to the new RA, we still working - # with the old, global Hook attribute - if [ "$cluster_update_state" == "ok" ]; then - # all cluster nodes are now updated to the new RA, now check, if - # we can/need/should migrate the SRHook attribute - # check which Hook generation is used on all cluster nodes - srHook_gen=$(get_hana_attribute "${NODENAME}" "${ATTR_NAME_HANA_SRHOOK_GEN[@]}") - if [ -z "$srHook_gen" ]; then - # we are on the new RA package, but the new srHook code is currently - # not active, no reload or restart of srHook on HANA side - super_ocf_log info "HOOK: RA saphana_init - on the local cluster node '${NODENAME}' the srHook generation attribute is empty. 
May be the new srHook is currently not active, no reloaded or no restart of srHook on HANA side was done" - fi - for onode in "${otherNodes[@]}"; do - onode_site=$(get_hana_attribute "$onode" "${ATTR_NAME_HANA_SITE[@]}") - if [ -z "$onode_site" ]; then - # not a HANA node, maybe the decision maker / tiebreaker - continue - fi - onode_srHook_gen=$(get_hana_attribute "$onode" "${ATTR_NAME_HANA_SRHOOK_GEN[@]}") - if [ -z "$onode_srHook_gen" ]; then - # we are on the new RA package, but the new srHook code is - # currently not active, no reload or restart of srHook on HANA - # side - super_ocf_log info "HOOK: RA saphana_init - on cluster node '$onode' the srHook generation attribute is empty. May be the new srHook is currently not active, no reloaded or no restart of srHook on HANA side was done" - fi - if [ "$srHook_gen" != "$onode_srHook_gen" ]; then - # the cluster nodes are running different Hook generations - super_ocf_log info "HOOK: RA saphana_init - the cluster nodes '$NODENAME' and '$onode' are running different Hook generations ('$srHook_gen' - '$onode_srHook_gen')." - fi - done - # be in mind: it may be that not all nodes running the same Hook generation - multiTargetSupport=$(get_hana_attribute "X" "${ATTR_NAME_HANA_multiTargetSupport[@]}") - if [ -z "$multiTargetSupport" ]; then - # cluster attribute 'hana_${sid}_glob_mts' not set - super_ocf_log info "RA: multiTargetSupport attribute not set. May be no Hook is configured or the old-style Hook is used." - fi - if ocf_is_true "$multiTargetSupport"; then - super_ocf_log info "RA: multiTargetSupport attribute is set to 'true'" - if [ "$srHook_gen" == "2.0" ]; then - # TODO PRIO1: NG - ATTR_NAME_HANA_GLOB_SRHOOK needs to be ATTR_NAME_HANA_SITE_SRHOOK - old_sync=$(get_hana_attribute "X" "${ATTR_NAME_HANA_GLOB_SRHOOK[@]}") - if [ -n "$old_sync" ]; then - # old, global attribute still exists - super_ocf_log info "RA: The global Hook attribute is still available. Use cmd 'SAPHanaSR-manageAttr' to remove this attribute" - fi - fi - fi - fi -} # end function saphana_init_handle_update - function saphana_init_sap_paths() { # function: saphana_init_sap_paths - set variables used for SAP paths (directories, config files and executables) # globals: TODO OCF_RESKEY_DIR_EXECUTABLE, SID, InstanceName, DIR_EXECUTABLE, SAPSTARTSRV, SAPCONTROL, OCF_RESKEY_DIR_PROFILE, SAPVIRHOST @@ -1736,7 +1637,7 @@ function saphana_monitor_secondary() { lpa_set_lpt 10 "$gSite" rc="$OCF_ERR_GENERIC" ;; - 1 ) # ERROR + 1 ) # ERROR (rc is set to OCF_NOT_RUNNING by init (see local definition)) super_ocf_log debug "DBG: 012 * lpa_set_lpt 10 $gSite" lpa_set_lpt 10 "$gSite" ;; diff --git a/ra/saphana-filesystem-lib b/ra/saphana-filesystem-lib index 48c2fb8d..4facbb05 100755 --- a/ra/saphana-filesystem-lib +++ b/ra/saphana-filesystem-lib @@ -354,6 +354,13 @@ function shfs_monitor() { ;; esac fi + + # TODO PRIO2: Param to switch writing cache file on/off needed? + # TODO PRIO2: Writing cache file only for scale-out? 
+ # TODO PRIO3: Also caching info for primary-secondary role, if needed + # TODO PRIO3: Also caching info for system replication status (srHook) role, if needed + # DONE PRIO1: Remove grep and sed to adjust output + ( SAPHanaSR-showAttr --format=cache --select=sitelist > /run/crm/SAPHanaSR_site_cache; chmod 644 /run/crm/SAPHanaSR_site_cache )& super_ocf_log info "FLOW ${FUNCNAME[0]} rc=$rc" return "$rc" } # end function shfs_monitor diff --git a/ra/saphana-topology-lib b/ra/saphana-topology-lib index 9915c418..60916c7a 100755 --- a/ra/saphana-topology-lib +++ b/ra/saphana-topology-lib @@ -113,7 +113,7 @@ SAPHanaTopology scans the output table of landscapeHostConfiguration.py to ident Define timeout how long a call to HANA to receive information can take. Define timeout how long a call to HANA to receive information can take. This could be eg landscapeHostConfiguration.py. There are some specific calls to HANA which have their own timeout values. For example the takeover command does not timeout (inf). - If the timeout is reached, the return code will be 124. If you increase the timeouts for HANA calls you should also adjust the operation timeouts + If the timeout is reached, the return code will be 124 or 137 (for kill -9). If you increase the timeouts for HANA calls you should also adjust the operation timeouts of your cluster resources. @@ -283,7 +283,7 @@ function sht_start() { start_saphostagent fi gNodeRole="$( get_role_by_landscape "$gVirtName")"; hanalrc="$?" - if [[ "$hanalrc" != 124 ]]; then + if [[ "$hanalrc" -lt 124 ]]; then set_hana_attribute "${NODENAME}" "$gNodeRole" "${ATTR_NAME_HANA_ROLES[@]}" fi # TODO PRIO 1: scale-out used side-effect via RC_hdbnsutil to give back different return codes; scale-up rc was always OCF_SUCCESS @@ -451,7 +451,7 @@ function sht_stop_clone() { fi get_local_virtual_name gNodeRole="$( get_role_by_landscape "$gVirtName" --timeout="$timeout")"; hanalrc="$?" - if [[ "$hanalrc" != "124" ]]; then + if [[ "$hanalrc" -lt "124" ]]; then # normal exit, use gNodeRole tout=0 set_hana_attribute "${NODENAME}" "$gNodeRole" "${ATTR_NAME_HANA_ROLES[@]}" @@ -519,7 +519,7 @@ function sht_monitor_clone() { done g_cache_lss="$hanalrc" super_ocf_log info "DEC: gNodeRole=$gNodeRole gTopology=$gTopology hanalrc=$g_cache_lss" - if [[ "$hanalrc" != "124" ]]; then + if [[ "$hanalrc" -lt "124" ]]; then # normal exit, use gNodeRole super_ocf_log info "DEC: gNodeRole=$gNodeRole" set_hana_attribute "${NODENAME}" "$gNodeRole" "${ATTR_NAME_HANA_ROLES[@]}" @@ -536,7 +536,7 @@ function sht_monitor_clone() { super_ocf_log info "DEC: set_hana_attribute ${NODENAME} $gSite ${ATTR_NAME_HANA_SITE[0]}" fi # TODO PRIO2: NG - COULD/SHOULD WE LIMIT THE SET OF THE LSS/SRR ATTRIBUTE TO ONLY THE_MASTER nodes? 
- # ignore timeout (124) and "ignore" (5) as return code from the landscapeHostConfiguration call + # ignore timeout (124, 137) and "ignore" (5) as return code from the landscapeHostConfiguration call case "$hanaPrim" in P ) ;; S ) # only secondary may propagate its sync status diff --git a/test/callTest-multiNode b/test/bin/callTest-multiNode similarity index 100% rename from test/callTest-multiNode rename to test/bin/callTest-multiNode diff --git a/test/cs_ssh b/test/bin/cs_ssh similarity index 100% rename from test/cs_ssh rename to test/bin/cs_ssh diff --git a/test/disp_sql_counter b/test/bin/disp_sql_counter similarity index 100% rename from test/disp_sql_counter rename to test/bin/disp_sql_counter diff --git a/test/filter_sap_trc_by_time b/test/bin/filter_sap_trc_by_time old mode 100644 new mode 100755 similarity index 100% rename from test/filter_sap_trc_by_time rename to test/bin/filter_sap_trc_by_time diff --git a/test/loopTests01 b/test/bin/loopTests01 similarity index 100% rename from test/loopTests01 rename to test/bin/loopTests01 diff --git a/test/loopTests02 b/test/bin/loopTests02 similarity index 100% rename from test/loopTests02 rename to test/bin/loopTests02 diff --git a/test/loopTests03 b/test/bin/loopTests03 similarity index 100% rename from test/loopTests03 rename to test/bin/loopTests03 diff --git a/test/loopTests04 b/test/bin/loopTests04 similarity index 100% rename from test/loopTests04 rename to test/bin/loopTests04 diff --git a/test/loopTests04.json b/test/bin/loopTests04.json similarity index 100% rename from test/loopTests04.json rename to test/bin/loopTests04.json diff --git a/test/bin/runtest b/test/bin/runtest new file mode 100755 index 00000000..6613c033 --- /dev/null +++ b/test/bin/runtest @@ -0,0 +1,39 @@ +#!/bin/bash +# runtests +# 2024-07-04 + +PROPERTIES="./properties_angi-ScaleOut_hoeferspitze.json" +REMOTENODES="hoeferspitze11 hoeferspitze21" +ASTROOT=/usr/share/SAPHanaSR-tester/json/angi-ScaleOut +LOGFILE="runtests.log" + +exec 3>&1 +exec 1>>"$LOGFILE" + +SECNCAS="kill_secn_inst kill_secn_indexserver kill_secn_node standby_secn_node standby_secn_worker_node free_log_area" +PRIMCAS="kill_prim_inst kill_prim_worker_inst kill_prim_indexserver kill_prim_node kill_prim_worker_node freeze_prim_master_nfs standby_prim_node" +BOTHCAS="restart_cluster restart_cluster_hana_running restart_cluster_turn_hana maintenance_cluster_turn_hana maintenance_with_standby_nodes nop" + +#TCASES="flup kill_prim_node_fencing_alert kill_prim_worker_node_fencing_alert free_log_area" +TCASES="flup kill_prim_node free_log_area kill_prim_worker_node free_log_area" +#TCASES="flup free_log_area $SECNCAS $PRIMCAS free_log_area $BOTHCAS free_log_area" + +for TEST in $TCASES; do + echo "#### $(date +%Y-%m-%d" "%H:%M:%S) run test: $TEST ####" + for N in $REMOTENODES; do + hana_fs="/hana" + hana_fill=$(ssh $N "df --output=pcent $hana_fs | grep -v Use") + echo "$(date +%Y-%m-%d\ %H:%M:%S) fill grade: $N $hana_fs $hana_fill" + done + SAPHanaSR-testCluster --testFile "$ASTROOT/$TEST.json" \ + --remoteNodes $REMOTENODES --defaultsFile "$ASTROOT/defaults.json" \ + --logFile "$TEST.log" --properties "$PROPERTIES" --dumpFailures >&3; rc=$? 
+ echo "#### $(date +%Y-%m-%d" "%H:%M:%S) end test: $TEST rc: $rc ####" + if [[ "$rc" != "0" ]]; then + exit 1 + else + sleep 10 + fi +done +exit 0 +# diff --git a/test/sct_showlog b/test/bin/sct_showlog similarity index 100% rename from test/sct_showlog rename to test/bin/sct_showlog diff --git a/test/sct_test_block_sap_hana_sr b/test/bin/sct_test_block_sap_hana_sr similarity index 100% rename from test/sct_test_block_sap_hana_sr rename to test/bin/sct_test_block_sap_hana_sr diff --git a/test/sct_test_create_cluster_config b/test/bin/sct_test_create_cluster_config similarity index 100% rename from test/sct_test_create_cluster_config rename to test/bin/sct_test_create_cluster_config diff --git a/test/sct_test_delete_cluster_config b/test/bin/sct_test_delete_cluster_config similarity index 100% rename from test/sct_test_delete_cluster_config rename to test/bin/sct_test_delete_cluster_config diff --git a/test/sct_test_free_log_area b/test/bin/sct_test_free_log_area similarity index 100% rename from test/sct_test_free_log_area rename to test/bin/sct_test_free_log_area diff --git a/test/sct_test_freeze_prim_fs b/test/bin/sct_test_freeze_prim_fs similarity index 100% rename from test/sct_test_freeze_prim_fs rename to test/bin/sct_test_freeze_prim_fs diff --git a/test/sct_test_freeze_prim_master_nfs b/test/bin/sct_test_freeze_prim_master_nfs similarity index 100% rename from test/sct_test_freeze_prim_master_nfs rename to test/bin/sct_test_freeze_prim_master_nfs diff --git a/test/sct_test_freeze_prim_site_nfs b/test/bin/sct_test_freeze_prim_site_nfs similarity index 100% rename from test/sct_test_freeze_prim_site_nfs rename to test/bin/sct_test_freeze_prim_site_nfs diff --git a/test/sct_test_freeze_secn_site_nfs b/test/bin/sct_test_freeze_secn_site_nfs similarity index 100% rename from test/sct_test_freeze_secn_site_nfs rename to test/bin/sct_test_freeze_secn_site_nfs diff --git a/test/sct_test_maintenance_cluster_hana_running b/test/bin/sct_test_maintenance_cluster_hana_running similarity index 100% rename from test/sct_test_maintenance_cluster_hana_running rename to test/bin/sct_test_maintenance_cluster_hana_running diff --git a/test/sct_test_maintenance_cluster_turn_hana b/test/bin/sct_test_maintenance_cluster_turn_hana similarity index 100% rename from test/sct_test_maintenance_cluster_turn_hana rename to test/bin/sct_test_maintenance_cluster_turn_hana diff --git a/test/sct_test_properties b/test/bin/sct_test_properties similarity index 100% rename from test/sct_test_properties rename to test/bin/sct_test_properties diff --git a/test/sct_test_restart_cluster b/test/bin/sct_test_restart_cluster similarity index 100% rename from test/sct_test_restart_cluster rename to test/bin/sct_test_restart_cluster diff --git a/test/sct_test_restart_cluster_hana_running b/test/bin/sct_test_restart_cluster_hana_running similarity index 100% rename from test/sct_test_restart_cluster_hana_running rename to test/bin/sct_test_restart_cluster_hana_running diff --git a/test/sct_test_restart_cluster_turn_hana b/test/bin/sct_test_restart_cluster_turn_hana similarity index 100% rename from test/sct_test_restart_cluster_turn_hana rename to test/bin/sct_test_restart_cluster_turn_hana diff --git a/test/sct_test_unblock_sap_hana_sr b/test/bin/sct_test_unblock_sap_hana_sr similarity index 100% rename from test/sct_test_unblock_sap_hana_sr rename to test/bin/sct_test_unblock_sap_hana_sr diff --git a/test/sct_test_unfreeze_prim_fs b/test/bin/sct_test_unfreeze_prim_fs similarity index 100% rename from 
test/sct_test_unfreeze_prim_fs rename to test/bin/sct_test_unfreeze_prim_fs diff --git a/test/sct_test_unfreeze_prim_master_nfs b/test/bin/sct_test_unfreeze_prim_master_nfs similarity index 100% rename from test/sct_test_unfreeze_prim_master_nfs rename to test/bin/sct_test_unfreeze_prim_master_nfs diff --git a/test/sct_test_unfreeze_prim_site_nfs b/test/bin/sct_test_unfreeze_prim_site_nfs similarity index 100% rename from test/sct_test_unfreeze_prim_site_nfs rename to test/bin/sct_test_unfreeze_prim_site_nfs diff --git a/test/call_any b/test/call_any deleted file mode 100644 index cfafebdb..00000000 --- a/test/call_any +++ /dev/null @@ -1,12 +0,0 @@ -export TROOT=$PWD/json/angi-ScaleUp/ -TEST="$1"; shift -# remotes="localhorst1 localhorst2 192.168.178.1 localhost" -remotes="localhost" -#remotes="127.0.0.1" -./SAPHanaSR-testCluster \ - --testFile "$TROOT/$TEST".json \ - --remoteNodes $remotes \ - --defaultsFile "$TROOT"/defaults.json \ - --properties ./properties.json \ - --logFile test.log \ - $@ diff --git a/test/json/angi-ScaleOut/defaults.json b/test/json/angi-ScaleOut/defaults.json index 3491593e..0f9de39c 100644 --- a/test/json/angi-ScaleOut/defaults.json +++ b/test/json/angi-ScaleOut/defaults.json @@ -58,7 +58,7 @@ "pWorkerUp": [ "clone_state == DEMOTED", "roles == slave:slave:worker:slave", - "score == -12200" + "score == -10000" ], "sWorkerUp": [ "clone_state == DEMOTED", diff --git a/test/json/angi-ScaleOut/kill_prim_indexserver_fencing_alert.json b/test/json/angi-ScaleOut/kill_prim_indexserver_fencing_alert.json new file mode 100644 index 00000000..8c3fa794 --- /dev/null +++ b/test/json/angi-ScaleOut/kill_prim_indexserver_fencing_alert.json @@ -0,0 +1,122 @@ +{ + "test": "kill_prim_indexserver_fencing_alert", + "name": "Kill primary master indexserver with fencing alert agent implemented", + "start": "prereq10", + "steps": [ + { + "step": "prereq10", + "name": "test prerequitsites", + "next": "step20", + "loop": 1, + "wait": 1, + "post": "kill_prim_indexserver", + "pSite": "pSiteUp", + "sSite": "sSiteUp", + "pHost": "pHostUp", + "sHost": "sHostUp", + "pWorker": "pWorkerUp", + "sWorker": "sWorkerUp" + }, + { + "step": "step20", + "name": "failure detected", + "next": "step30", + "loop": 180, + "wait": 2, + "comment": "sSite: srPoll could get SFAIL on scale-out", + "pSite": [ + "lss ~ (1|2)", + "srr == P", + "lpt >~ 1000000000:20", + "srHook ~ (PRIM|SWAIT|SREG)", + "srPoll == PRIM" + ], + "sSite": [ + "lpt >~ 1000000000:30", + "lss == 4", + "srr == S", + "srHook ~ (PRIM|SOK)", + "srPoll ~ (SOK|SFAIL)" + ], + "pHost": [ + ], + "sHost": [ + "clone_state ~ (PROMOTED|DEMOTED)", + "roles == master1:master:worker:master", + "score ~ (100|145)" + ] + }, + { + "step": "step30", + "name": "pmaster fenced", + "next": "step40", + "loop": 120, + "wait": 2, + "pHost": [ + "clone_state is None", + "role is None", + "score is None" + ] + }, + { + "step": "step40", + "name": "pworker fenced", + "next": "step50", + "loop": 120, + "wait": 2, + "pWorker": [ + "clone_state is None", + "role is None", + "score is None" + ] + }, + { + "step": "step50", + "name": "begin recover", + "next": "final60", + "loop": 120, + "wait": 2, + "todo": "pHost+sHost to check site-name", + "pSite": [ + "lss == 1", + "srr == P", + "lpt >~ 1000000000:(30|20|10)", + "srHook ~ (PRIM|SWAIT|SREG)", + "srPoll == PRIM" + ], + "sSite": [ + "lpt >~ 1000000000:30", + "lss == 4", + "srr ~ (S|P)", + "srHook == PRIM", + "srPoll ~ (SOK|SFAIL)" + ], + "pHost": [ + "clone_state ~ (UNDEFINED|DEMOTED)", + "roles == master1::worker:", + 
"score ~ (90|70|5)" + ], + "sHost": [ + "clone_state ~ (DEMOTED|PROMOTED)", + "roles == master1:master:worker:master", + "score ~ (100|145)", + "srah == T" + ] + }, + { + "step": "final60", + "name": "end recover", + "next": "END", + "loop": 360, + "wait": 2, + "post": "cleanup", + "remark": "pXXX and sXXX are now exchanged", + "pSite": "sSiteUp", + "sSite": "pSiteUp", + "pHost": "sHostUp", + "sHost": "pHostUp", + "pWorker": "sWorkerUp", + "sWorker": "pWorkerUp" + } + ] +} diff --git a/test/json/angi-ScaleOut/kill_prim_node.json b/test/json/angi-ScaleOut/kill_prim_node.json index 3445fa23..d0ce73b2 100644 --- a/test/json/angi-ScaleOut/kill_prim_node.json +++ b/test/json/angi-ScaleOut/kill_prim_node.json @@ -14,8 +14,8 @@ "sSite": "sSiteUp", "pHost": "pHostUp", "sHost": "sHostUp", - "sWorker": "sWorkerUp", - "pWorker": "pWorkerUp" + "pWorker": "pWorkerUp", + "sWorker": "sWorkerUp" }, { "step": "step20", @@ -88,7 +88,7 @@ "sSite": "pSiteUp", "pHost": "sHostUp", "sHost": "pHostUp", - "pWorker": "sWorkerUp", + "pWorker": "sWorkerUp", "sWorker": "pWorkerUp" } ] diff --git a/test/json/angi-ScaleOut/kill_prim_node_fencing_alert.json b/test/json/angi-ScaleOut/kill_prim_node_fencing_alert.json new file mode 100644 index 00000000..d106bb76 --- /dev/null +++ b/test/json/angi-ScaleOut/kill_prim_node_fencing_alert.json @@ -0,0 +1,119 @@ +{ + "test": "kill_prim_node fencing alert", + "name": "Kill primary master node with fencing alert agent implemented", + "start": "prereq10", + "steps": [ + { + "step": "prereq10", + "name": "test prerequitsites", + "next": "step20", + "loop": 1, + "wait": 1, + "post": "kill_prim_node", + "pSite": "pSiteUp", + "sSite": "sSiteUp", + "pHost": "pHostUp", + "sHost": "sHostUp", + "pWorker": "pWorkerUp", + "sWorker": "sWorkerUp" + }, + { + "step": "step20", + "name": "failure detected", + "next": "step30", + "loop": 120, + "wait": 2, + "pSite": [ + "lss == 1", + "srr == P", + "lpt >~ 1000000000:(20|10)", + "srHook ~ (PRIM|SWAIT|SREG)", + "srPoll == PRIM" + ], + "sSite": [ + "lpt >~ 1000000000:30", + "lss == 4", + "srr ~ (S|P)", + "srHook ~ (PRIM|SOK)", + "srPoll ~ (SOK|SFAIL)" + ], + "pHost": [ + ], + "sHost": [ + "clone_state ~ (PROMOTED|DEMOTED)", + "roles == master1:master:worker:master", + "score ~ (100|145)" + ] + }, + { + "step": "step30", + "name": "pmaster fenced", + "next": "step40", + "loop": 120, + "wait": 2, + "pHost": [ + "clone_state is None", + "role is None", + "score is None" + ] + }, + { + "step": "step40", + "name": "pworker fenced", + "next": "step50", + "loop": 120, + "wait": 2, + "pWorker": [ + "clone_state is None", + "role is None", + "score is None" + ] + }, + { + "step": "step50", + "name": "begin recover", + "next": "final60", + "loop": 300, + "wait": 2, + "todo": "pHost+sHost to check site-name", + "pSite": [ + "lss ~ (1|2)", + "srr ~ (P|S)", + "lpt >~ 1000000000:(30|20|10)", + "srHook ~ (PRIM|SWAIT|SREG)", + "srPoll ~ (PRIM|SFAIL)" + ], + "sSite": [ + "lpt >~ 1000000000:30", + "lss == 4", + "srr ~ (S|P)", + "srHook == PRIM", + "srPoll ~ (SOK|PRIM)" + ], + "pHost": [ + "clone_state ~ (UNDEFINED|DEMOTED|WAITING4NODES)", + "roles == master1::worker:" + ], + "sHost": [ + "clone_state ~ (DEMOTED|PROMOTED)", + "roles == master1:master:worker:master", + "score ~ (100|145|150)" + ] + }, + { + "step": "final60", + "name": "end recover", + "next": "END", + "loop": 300, + "wait": 2, + "post": "cleanup", + "remark": "pXXX and sXXX are now exchanged", + "pSite": "sSiteUp", + "sSite": "pSiteUp", + "pHost": "sHostUp", + "sHost": "pHostUp", + "pWorker": 
"sWorkerUp", + "sWorker": "pWorkerUp" + } + ] +} diff --git a/test/json/angi-ScaleOut/kill_prim_worker_node_fencing_alert.json b/test/json/angi-ScaleOut/kill_prim_worker_node_fencing_alert.json new file mode 100644 index 00000000..4115211f --- /dev/null +++ b/test/json/angi-ScaleOut/kill_prim_worker_node_fencing_alert.json @@ -0,0 +1,119 @@ +{ + "test": "kill_prim_worker_node fencing alert", + "name": "Kill primary worker node with fencing agent implemented", + "start": "prereq10", + "steps": [ + { + "step": "prereq10", + "name": "test prerequitsites", + "next": "step20", + "loop": 1, + "wait": 1, + "post": "kill_prim_worker_node", + "pSite": "pSiteUp", + "sSite": "sSiteUp", + "pHost": "pHostUp", + "sHost": "sHostUp", + "sWorker": "sWorkerUp", + "pWorker": "pWorkerUp" + }, + { + "step": "step20", + "name": "failure detected", + "next": "step30", + "loop": 120, + "wait": 2, + "pSite": [ + "lss == 1", + "srr == P", + "lpt >~ 1000000000:(20|10)", + "srHook ~ (PRIM|SWAIT|SREG)", + "srPoll == PRIM" + ], + "sSite": [ + "lpt >~ 1000000000:30", + "lss == 4", + "srr ~ (S|P)", + "srHook ~ (PRIM|SOK)", + "srPoll ~ (SOK|SFAIL)" + ], + "pHost": [ + ], + "sHost": [ + "clone_state ~ (PROMOTED|DEMOTED)", + "roles == master1:master:worker:master", + "score ~ (100|145)" + ] + }, + { + "step": "step30", + "name": "pworker fenced", + "next": "step40", + "loop": 120, + "wait": 2, + "pWorker": [ + "clone_state is None", + "role is None", + "score is None" + ] + }, + { + "step": "step40", + "name": "pmaster fenced", + "next": "step50", + "loop": 120, + "wait": 2, + "pHost": [ + "clone_state is None", + "role is None", + "score is None" + ] + }, + { + "step": "step50", + "name": "begin recover", + "next": "final60", + "loop": 300, + "wait": 2, + "todo": "pHost+sHost to check site-name", + "pSite": [ + "lss ~ (1|2)", + "srr ~ (P|S)", + "lpt >~ 1000000000:(30|20|10)", + "srHook ~ (PRIM|SWAIT|SREG)", + "srPoll ~ (PRIM|SFAIL)" + ], + "sSite": [ + "lpt >~ 1000000000:30", + "lss == 4", + "srr ~ (S|P)", + "srHook == PRIM", + "srPoll ~ (SOK|PRIM)" + ], + "pHost": [ + "clone_state ~ (UNDEFINED|DEMOTED|WAITING4NODES)", + "roles == master1::worker:" + ], + "sHost": [ + "clone_state ~ (DEMOTED|PROMOTED)", + "roles == master1:master:worker:master", + "score ~ (100|145|150)" + ] + }, + { + "step": "final60", + "name": "end recover", + "next": "END", + "loop": 300, + "wait": 2, + "post": "cleanup", + "remark": "pXXX and sXXX are now exchanged", + "pSite": "sSiteUp", + "sSite": "pSiteUp", + "pHost": "sHostUp", + "sHost": "pHostUp", + "pWorker": "sWorkerUp", + "sWorker": "pWorkerUp" + } + ] +} diff --git a/test/README.saphanasrlib.txt b/test/misc/README.saphanasrlib.txt similarity index 100% rename from test/README.saphanasrlib.txt rename to test/misc/README.saphanasrlib.txt diff --git a/test/README.txt b/test/misc/README.txt similarity index 100% rename from test/README.txt rename to test/misc/README.txt diff --git a/test/fix_indent b/test/misc/fix_indent similarity index 100% rename from test/fix_indent rename to test/misc/fix_indent diff --git a/test/fix_indent.txt b/test/misc/fix_indent.txt similarity index 100% rename from test/fix_indent.txt rename to test/misc/fix_indent.txt diff --git a/test/querySteps.py b/test/misc/querySteps.py similarity index 100% rename from test/querySteps.py rename to test/misc/querySteps.py diff --git a/test/properties.json b/test/properties.json deleted file mode 120000 index 4184f996..00000000 --- a/test/properties.json +++ /dev/null @@ -1 +0,0 @@ -json/angi-ScaleUp/properties.json \ No newline 
at end of file diff --git a/test/SAPHanaSR-checkJson b/test/tester/SAPHanaSR-checkJson similarity index 100% rename from test/SAPHanaSR-checkJson rename to test/tester/SAPHanaSR-checkJson diff --git a/test/SAPHanaSR-testCluster b/test/tester/SAPHanaSR-testCluster similarity index 100% rename from test/SAPHanaSR-testCluster rename to test/tester/SAPHanaSR-testCluster diff --git a/test/SAPHanaSR-testCluster-html b/test/tester/SAPHanaSR-testCluster-html similarity index 100% rename from test/SAPHanaSR-testCluster-html rename to test/tester/SAPHanaSR-testCluster-html diff --git a/test/saphana_sr_test.py b/test/tester/saphana_sr_test.py similarity index 100% rename from test/saphana_sr_test.py rename to test/tester/saphana_sr_test.py diff --git a/tools/SAPHanaSR-showAttr b/tools/SAPHanaSR-showAttr index 326510ff..f8530281 100755 --- a/tools/SAPHanaSR-showAttr +++ b/tools/SAPHanaSR-showAttr @@ -5,7 +5,7 @@ saphana_sr_tools.py Author: Fabian Herschel, May 2023 License: GNU General Public License (GPL) - Copyright: (c) 2023 SUSE LLC + Copyright: (c) 2023,2024 SUSE LLC # TODO: STEP02: Think also about multi SID implementation - maybe by using multiple HanaCluster objects (one per SID) """ @@ -153,3 +153,10 @@ if __name__ == "__main__": myHana.print_dic_as_path(myHana.res_dict, "resource", "Resource", quote='"') myHana.print_dic_as_path(myHana.site_dict, "site", "Site", quote='"') myHana.print_dic_as_path(myHana.host_dict, "host", "Host", quote='"') + elif oformat in {"cache"}: + myHana.print_dic_as_csv(myHana.host_dict, "host", "Host", quote='', short=True) + elif oformat in {"csv"}: + myHana.print_dic_as_csv(myHana.glob_dict, "global", "Global", quote='') + myHana.print_dic_as_csv(myHana.res_dict, "resource", "Resource", quote='') + myHana.print_dic_as_csv(myHana.site_dict, "site", "Site", quote='') + myHana.print_dic_as_csv(myHana.host_dict, "host", "Host", quote='') diff --git a/tools/SAPHanaSR-showAttr.properties.demo.json b/tools/SAPHanaSR-showAttr.properties.demo.json new file mode 100644 index 00000000..313166fd --- /dev/null +++ b/tools/SAPHanaSR-showAttr.properties.demo.json @@ -0,0 +1,10 @@ +{ +"selections": { + "demo": { + "global": [], + "resource": [], + "site": [".*"], + "host": ["Host", "clone_state", "roles", "score", "site", "sra", "srah"] + } + } +} diff --git a/tools/SAPHanaSR-upgrade-to-angi-demo b/tools/SAPHanaSR-upgrade-to-angi-demo index e6c4aec0..55e0cd2b 100755 --- a/tools/SAPHanaSR-upgrade-to-angi-demo +++ b/tools/SAPHanaSR-upgrade-to-angi-demo @@ -12,7 +12,7 @@ # # define parameters and functions # -VERSION="2024-05-06 0.3" +VERSION="2024-06-05 0.3a" DRYRUN=yes # TODO DRYRUN=no EXE=$(basename $0) @@ -208,7 +208,7 @@ function f_maintenance-on-classic() { echo "echo \"property cib-bootstrap-options: stop-orphan-resources=false\" | crm configure load update -" [ $DRYRUN = no ] && echo "property cib-bootstrap-options: stop-orphan-resources=false" |\ crm configure load update - - echo-funa run "${FUNCNAME[0]}" + echo-funa end "${FUNCNAME[0]}" } function f_maintenance-off-angi() { @@ -235,7 +235,7 @@ function f_maintenance-off-angi() { echo "echo \"property cib-bootstrap-options: stop-orphan-resources=true\" | crm configure load update -" [ $DRYRUN = no ] && echo "property cib-bootstrap-options: stop-orphan-resources=true" |\ crm configure load update - - echo-funa run "${FUNCNAME[0]}" + echo-funa end "${FUNCNAME[0]}" } function del-srhook-local-classic() { @@ -568,7 +568,7 @@ function f_check-prereq() { echo "ERROR: Package SAPHanaSR-tester-client installed." 
         pre_rc=9
     fi
-    rmt=$(zypper se $RPMNEW 2>/dev/null | grep -c $RPMNEW)
+    rmt=$(zypper se -t package $RPMNEW 2>/dev/null | grep -c $RPMNEW)
     if [ $rmt != 1 ]; then
         echo "ERROR: Can not find $RPMNEW in software channels."
         pre_rc=9
     fi
diff --git a/tools/saphana_sr_tools.py b/tools/saphana_sr_tools.py
index e83eaf83..f733cadc 100644
--- a/tools/saphana_sr_tools.py
+++ b/tools/saphana_sr_tools.py
@@ -131,6 +131,12 @@ class HanaCluster():
             'site': ['Site', 'lpt', 'lss', 'mns', 'opMode', 'srHook', 'srMode', 'srPoll', 'srr'],
             'host': ['Host', 'clone_state', 'node_state', 'roles', 'score', 'site', 'sra', 'srah', 'standby', 'vhost', 'fail.*'],
         },
+        'sitelist': {
+            'global': [],
+            'resource': [],
+            'site': [],
+            'host': ['site'],
+        },
     }

     def __init__(self):
@@ -499,6 +505,29 @@ def print_dic_as_path(self, print_dic, area, table_name, **kargs):
                     value = print_dic[key][col]
                     print(f"{time_string}{table_name}/{key}/{col}={quote}{value}{quote}")

+    def print_dic_as_csv(self, print_dic, area, table_name, **kargs):
+        """
+        TODO: description
+        """
+        time_string = ""
+        quote = ''
+        short = False
+        if 'quote' in kargs:
+            quote = kargs['quote']
+        if 'ts' in kargs:
+            time_string = f"{kargs['ts']} "
+        if 'short' in kargs:
+            short = kargs['short']
+        for key in print_dic:
+            for col in print_dic[key]:
+                if self.filter(area, col) is True:
+                    value = print_dic[key][col]
+                    if short:
+                        print(f"{key}:{quote}{value}{quote}")
+                    else:
+                        #print(f"{time_string}{table_name}/{key}/{col}={quote}{value}{quote}")
+                        print(f"{table_name}:{key}:{col}:{quote}{value}{quote}")
+
     def filter(self, area, column_name):
         ''' filter column_names
             False, if column should be skipped
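
Taken together, the new 'sitelist' selection and the short 'cache' output format reduce the SAPHanaSR-showAttr output to one host:site pair per line, while the 'csv' format prints one Table:key:column:value record per attribute. The sketch below is illustrative only: it assumes the existing --sid, --select and --format options of SAPHanaSR-showAttr accept the new values, and the SID HA1 and the target path are example placeholders, not a documented interface.

    # assumed invocation: write a site cache, one "<host>:<site>" pair per line
    SAPHanaSR-showAttr --sid=HA1 --select=sitelist --format=cache > /run/crm/SAPHanaSR_site_cache
    # example content:
    #   node01:SITEA
    #   node02:SITEB
    # long form: one record per attribute, easy to post-process with grep or awk
    SAPHanaSR-showAttr --sid=HA1 --format=csv | grep '^Host:'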