diff --git a/bsp-files/filter_out_from_controller b/bsp-files/filter_out_from_controller index c07ec6d7..bd5c4604 100644 --- a/bsp-files/filter_out_from_controller +++ b/bsp-files/filter_out_from_controller @@ -25,6 +25,7 @@ mtce-guestServer nfscheck radvd config-gate-worker +isolcpus-device-plugin kernel-rt kernel-module-igb-uio kernel-module-igb-uio-rt @@ -33,6 +34,7 @@ kernel-rt-modules-extra kmod-e1000e-rt kmod-i40e-rt kmod-iavf-rt +kmod-ice-rt kmod-ixgbe-rt kmod-ixgbevf-rt kmod-igb_uio-rt @@ -53,3 +55,7 @@ openvswitch-config pci-irq-affinity-agent kvm-timer-advance sysinv-fpga-agent +kernel-rt-headers +kernel-rt-devel +kernel-headers +kernel-devel diff --git a/bsp-files/filter_out_from_smallsystem b/bsp-files/filter_out_from_smallsystem index 53d70456..5798bbac 100644 --- a/bsp-files/filter_out_from_smallsystem +++ b/bsp-files/filter_out_from_smallsystem @@ -13,6 +13,7 @@ kernel-rt-modules-extra kmod-e1000e-rt kmod-i40e-rt kmod-iavf-rt +kmod-ice-rt kmod-ixgbe-rt kmod-ixgbevf-rt kmod-igb_uio-rt @@ -26,3 +27,5 @@ qat17-rt kernel-rt-tools kernel-rt-tools-libs kmod-drbd-rt +kernel-rt-headers +kernel-rt-devel diff --git a/bsp-files/filter_out_from_smallsystem_lowlatency b/bsp-files/filter_out_from_smallsystem_lowlatency index 63952b03..a23573d0 100644 --- a/bsp-files/filter_out_from_smallsystem_lowlatency +++ b/bsp-files/filter_out_from_smallsystem_lowlatency @@ -11,6 +11,7 @@ kernel-module-igb-uio kmod-e1000e kmod-i40e kmod-iavf +kmod-ice kmod-ixgbe kmod-ixgbevf kmod-igb_uio @@ -23,3 +24,5 @@ kernel-tools kernel-tools-libs kmod-drbd kernel-modules-extra +kernel-headers +kernel-devel diff --git a/bsp-files/filter_out_from_storage b/bsp-files/filter_out_from_storage index 33122822..bb46fab1 100644 --- a/bsp-files/filter_out_from_storage +++ b/bsp-files/filter_out_from_storage @@ -69,6 +69,7 @@ influxdb influxdb-extensions io-monitor io-scheduler +isolcpus-device-plugin isomd5sum ipxe-roms-qemu kernel-module-openvswitch @@ -120,8 +121,6 @@ nova-tests nova-api-proxy 
nova-placement-api novnc -net-snmp -net-snmp-config openstack-aodh-api openstack-aodh-commmon openstack-aodh-compat @@ -256,7 +255,6 @@ qemu-kvm-ev qemu-kvm-tools-ev radvd rubygem-rdoc -snmp-ext task-cloud-compute task-cloud-controller tgt @@ -290,6 +288,7 @@ kernel-rt-modules-extra kmod-e1000e-rt kmod-i40e-rt kmod-iavf-rt +kmod-ice-rt kmod-ixgbe-rt kmod-ixgbevf-rt kmod-igb_uio-rt @@ -304,7 +303,6 @@ kernel-rt-tools kernel-rt-tools-libs NaviCLI-Linux-64-x86-en_US kmod-drbd-rt -snmp-audittrail wrs-ssl tpm2-tools tss2 @@ -340,6 +338,11 @@ stx-oidc-auth-helm stx-cert-manager-helm stx-nginx-ingress-controller-helm stx-portieris-helm +stx-snmp-helm stx-vault-helm sysinv-fpga-agent k8s-pod-recovery +kernel-rt-headers +kernel-rt-devel +kernel-headers +kernel-devel diff --git a/bsp-files/filter_out_from_worker b/bsp-files/filter_out_from_worker index 83b3a9af..8590a982 100644 --- a/bsp-files/filter_out_from_worker +++ b/bsp-files/filter_out_from_worker @@ -81,8 +81,6 @@ nova-tests nova-api-proxy nova-placement-api novnc -net-snmp -net-snmp-config openldap-backend-bdb openldap-backend-dnssrv openldap-backend-hdb @@ -138,7 +136,6 @@ python-swiftclient python-wsme fm-mgr fm-rest-api -snmp-ext sm sm-api sm-client @@ -258,6 +255,7 @@ kernel-rt-modules-extra kmod-e1000e-rt kmod-i40e-rt kmod-iavf-rt +kmod-ice-rt kmod-ixgbe-rt kmod-ixgbevf-rt kmod-igb_uio-rt @@ -272,7 +270,6 @@ kernel-rt-tools kernel-rt-tools-libs NaviCLI-Linux-64-x86-en_US kmod-drbd-rt -snmp-audittrail wrs-ssl tpm2-tools tss2 @@ -301,5 +298,8 @@ stx-oidc-auth-helm stx-cert-manager-helm stx-nginx-ingress-controller-helm stx-portieris-helm +stx-snmp-helm stx-vault-helm k8s-pod-recovery +kernel-rt-headers +kernel-rt-devel diff --git a/bsp-files/filter_out_from_worker_lowlatency b/bsp-files/filter_out_from_worker_lowlatency index ae4c8fe0..2b02e1df 100644 --- a/bsp-files/filter_out_from_worker_lowlatency +++ b/bsp-files/filter_out_from_worker_lowlatency @@ -81,8 +81,6 @@ nova-tests nova-api-proxy nova-placement-api 
novnc -net-snmp -net-snmp-config neutron-plugin-ml2 neutron-server neutron-tests @@ -141,7 +139,6 @@ python-swiftclient python-wsme fm-mgr fm-rest-api -snmp-ext sm sm-api sm-client @@ -261,6 +258,7 @@ kernel-module-igb-uio kmod-e1000e kmod-i40e kmod-iavf +kmod-ice kmod-ixgbe kmod-ixgbevf kmod-igb_uio @@ -274,7 +272,6 @@ kernel-tools-libs kernel-modules-extra NaviCLI-Linux-64-x86-en_US kmod-drbd-rt -snmp-audittrail wrs-ssl tpm2-tools tss2 @@ -302,5 +299,8 @@ stx-oidc-auth-helm stx-cert-manager-helm stx-nginx-ingress-controller-helm stx-portieris-helm +stx-snmp-helm stx-vault-helm k8s-pod-recovery +kernel-headers +kernel-devel diff --git a/bsp-files/kickstarts/pre_disk_aio.cfg b/bsp-files/kickstarts/pre_disk_aio.cfg index 68e1a3e9..09d831dc 100755 --- a/bsp-files/kickstarts/pre_disk_aio.cfg +++ b/bsp-files/kickstarts/pre_disk_aio.cfg @@ -29,11 +29,12 @@ ## ETCD_STOR_SIZE = 5GiB ## CEPH_MON_SIZE = 20GiB ## KUBELET_STOR_SIZE = 10GiB +## DC_VAULT_SIZE = 15GiB ## RESERVED_PE = 16MiB (based on pesize=32768) ## -## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 163.02GiB +## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 178.02GiB ## -##*************************************************************************************************** +##********************************************************************************************************** ## Small disk install - (for disks below 240GB) ## - DB size is doubled to allow for upgrades ## @@ -50,11 +51,12 @@ ## ETCD_STOR_SIZE = 5GiB ## CEPH_MON_SIZE = 20GiB ## KUBELET_STOR_SIZE = 10GiB +## DC_VAULT_SIZE = 15GiB ## RESERVED_PE = 16MiB (based on pesize=32768) ## -## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 148.02GiB +## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 163.02GiB ## 
-##*************************************************************************************************** +##********************************************************************************************************* ## Tiny disk install - (for disks below 154GB) ## ## NOTE: Tiny disk setup is mainly for StarlingX running in QEMU/KVM VM. @@ -89,15 +91,15 @@ EFI_SIZE=300 # which are DEFAULT_SMALL_DISK_SIZE # MINIMUM_SMALL_DISK_SIZE default_small_disk_size=240 -minimum_small_disk_size=181 +minimum_small_disk_size=196 sz=$(blockdev --getsize64 $rootfs_device) # Round CGCS_PV_SIZE to the closest upper value that can be divided by 1024. if [ $sz -gt $(($default_small_disk_size*$gb)) ] ; then - # Large disk: CGCS_PV_SIZE=164GiB*1024=167936 - CGCS_PV_SIZE=167936 + # Large disk: CGCS_PV_SIZE=179GiB*1024=183296 + CGCS_PV_SIZE=183296 elif [ $sz -ge $(($minimum_small_disk_size*$gb)) ] ; then - # Small disk: CGCS_PV_SIZE=149GiB*1024=152576 - CGCS_PV_SIZE=152576 + # Small disk: CGCS_PV_SIZE=164GiB*1024=167936 + CGCS_PV_SIZE=167936 else # Tiny disk: CGCS_PV_SIZE=43GiB*1024=44032 # Using a disk with a size under 60GiB will fail. diff --git a/bsp-files/kickstarts/pre_disk_setup_common.cfg b/bsp-files/kickstarts/pre_disk_setup_common.cfg index 0d885384..07b0304f 100644 --- a/bsp-files/kickstarts/pre_disk_setup_common.cfg +++ b/bsp-files/kickstarts/pre_disk_setup_common.cfg @@ -167,6 +167,13 @@ else # Avoid wiping ceph osds if sysinv tells us so if [ ${WIPE_CEPH_OSDS} == "false" ]; then wipe_dev="true" + + pvs | grep -q "$dev *ceph" + if [ $? -eq 0 ]; then + wlog "skip rook provisoned disk $dev" + continue + fi + part_numbers=( `parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}'` ) # Scanning the partitions looking for CEPH OSDs and # skipping any disk found with such partitions @@ -178,7 +185,15 @@ else wipe_dev="false" break fi + + pvs | grep -q -e "${dev}${part_number} *ceph" -e "${dev}p${part_number} *ceph" + if [ $? 
-eq 0 ]; then + wlog "Rook OSD found on $dev$part_number, skip wipe" + wipe_dev="false" + break + fi done + if [ "$wipe_dev" == "false" ]; then continue fi diff --git a/installer/pxe-network-installer/centos/build_srpm.data b/installer/pxe-network-installer/centos/build_srpm.data index 7be54e52..d59c4cec 100644 --- a/installer/pxe-network-installer/centos/build_srpm.data +++ b/installer/pxe-network-installer/centos/build_srpm.data @@ -6,6 +6,6 @@ COPY_LIST="pxe-network-installer/* \ /import/mirrors/CentOS/stx-installer/vmlinuz \ " -TIS_PATCH_VER=28 +TIS_PATCH_VER=PKG_GITREVCOUNT+13 BUILD_IS_BIG=4 BUILD_IS_SLOW=4 diff --git a/installer/pxe-network-installer/centos/pxe-network-installer.spec b/installer/pxe-network-installer/centos/pxe-network-installer.spec index bd767a0d..d6268e38 100644 --- a/installer/pxe-network-installer/centos/pxe-network-installer.spec +++ b/installer/pxe-network-installer/centos/pxe-network-installer.spec @@ -110,6 +110,7 @@ install -v -m 644 %{_sourcedir}/efi-centos-pxe-worker_lowlatency-install \ install -v -m 644 %{_sourcedir}/efi-centos-pxe-smallsystem_lowlatency-install \ %{buildroot}/pxeboot/pxelinux.cfg.files/efi-pxe-smallsystem_lowlatency-install-%{platform_release} +ln -sf /pxeboot/EFI/grubx64.efi %{buildroot}/pxeboot/grubx64.efi sed -i "s/xxxSW_VERSIONxxx/%{platform_release}/g" \ %{buildroot}/pxeboot/pxelinux.cfg.files/pxe-* \ diff --git a/mtce-common/src/common/bmcUtil.cpp b/mtce-common/src/common/bmcUtil.cpp index 43bed414..964e4eca 100644 --- a/mtce-common/src/common/bmcUtil.cpp +++ b/mtce-common/src/common/bmcUtil.cpp @@ -274,9 +274,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr, * *************************************************************************/ -string bmcUtil_create_data_fn ( string & hostname, - string file_suffix, - bmc_protocol_enum protocol ) +string bmcUtil_create_data_fn ( const string & hostname, + string file_suffix, + bmc_protocol_enum protocol ) { /* create the output filename */ string 
datafile ; diff --git a/mtce-common/src/common/bmcUtil.h b/mtce-common/src/common/bmcUtil.h index 0208b88c..8c2a351d 100644 --- a/mtce-common/src/common/bmcUtil.h +++ b/mtce-common/src/common/bmcUtil.h @@ -82,6 +82,14 @@ typedef struct } bmc_info_type ; +typedef struct +{ + string hostname; + string host_ip ; + string bm_ip ; + string bm_un ; + string bm_pw ; +} bmcUtil_accessInfo_type ; /* BMC commands */ typedef enum @@ -107,6 +115,7 @@ typedef enum #define BMC_QUERY_FILE_SUFFIX ((const char *)("_root_query")) #define BMC_INFO_FILE_SUFFIX ((const char *)("_bmc_info")) #define BMC_POWER_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result")) +#define BMC_RESET_CMD_FILE_SUFFIX ((const char *)("_reset")) #define BMC_BOOTDEV_CMD_FILE_SUFFIX ((const char *)("_bootdev")) #define BMC_RESTART_CAUSE_FILE_SUFFIX ((const char *)("_restart_cause")) #define BMC_POWER_STATUS_FILE_SUFFIX ((const char *)("_power_status")) @@ -137,9 +146,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr, bmc_protocol_enum protocol ); /* create the output filename */ -string bmcUtil_create_data_fn ( string & hostname, - string file_suffix, - bmc_protocol_enum protocol ); +string bmcUtil_create_data_fn ( const string & hostname, + string file_suffix, + bmc_protocol_enum protocol ); /* Get power state from query response data. 
*/ int bmcUtil_is_power_on ( string hostname, diff --git a/mtce-common/src/common/hostUtil.cpp b/mtce-common/src/common/hostUtil.cpp index 06b15c16..5e5c4a16 100644 --- a/mtce-common/src/common/hostUtil.cpp +++ b/mtce-common/src/common/hostUtil.cpp @@ -130,6 +130,14 @@ bool hostUtil_is_valid_username ( string un ) return (false); } +bool hostUtil_is_valid_pw ( string pw ) +{ + if ( !pw.empty() ) + if ( pw.compare(NONE) ) + return (true); + return (false); +} + bool hostUtil_is_valid_mac_addr ( string mac ) { if ( !mac.empty() ) diff --git a/mtce-common/src/common/hostUtil.h b/mtce-common/src/common/hostUtil.h index 09e19b53..d9a561a8 100644 --- a/mtce-common/src/common/hostUtil.h +++ b/mtce-common/src/common/hostUtil.h @@ -46,6 +46,7 @@ string hostUtil_getPrefixPath ( void ); bool hostUtil_is_valid_uuid ( string uuid ); bool hostUtil_is_valid_ip_addr ( string ip ); bool hostUtil_is_valid_username ( string un ); +bool hostUtil_is_valid_pw ( string pw ); bool hostUtil_is_valid_bm_type ( string bm_type ); int hostUtil_mktmpfile ( string hostname, string basename, string & filename, string data ); diff --git a/mtce-common/src/common/ipmiUtil.cpp b/mtce-common/src/common/ipmiUtil.cpp index c5e03193..0679df2e 100644 --- a/mtce-common/src/common/ipmiUtil.cpp +++ b/mtce-common/src/common/ipmiUtil.cpp @@ -202,3 +202,66 @@ int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_ty ipmiUtil_bmc_info_log ( hostname, bmc_info, rc ); return (rc); } + + +int ipmiUtil_reset_host_now ( string hostname, + bmcUtil_accessInfo_type accessInfo, + string output_filename) +{ + dlog("%s %s BMC [IP:%s UN:%s]", + accessInfo.hostname.c_str(), + accessInfo.host_ip.c_str(), + accessInfo.bm_ip.c_str(), + accessInfo.bm_un.c_str()); + + if (daemon_is_file_present ( BMC_OUTPUT_DIR ) == false ) + daemon_make_dir(BMC_OUTPUT_DIR) ; + if (daemon_is_file_present ( IPMITOOL_OUTPUT_DIR ) == false ) + daemon_make_dir(IPMITOOL_OUTPUT_DIR) ; + + /* create temp password file */ + 
thread_info_type info ; + info.hostname = accessInfo.hostname ; + info.password_file = "" ; + info.pw_file_fd = 0 ; + + /* Use common utility to create a temp pw file */ + bmcUtil_create_pw_file ( &info, accessInfo.bm_pw, BMC_PROTOCOL__IPMITOOL ); + + /* create request */ + string request = + ipmiUtil_create_request ( IPMITOOL_POWER_RESET_CMD, + accessInfo.bm_ip, + accessInfo.bm_un, + info.password_file, + output_filename ); + + /* issue request + * + * Note: Could launch a thread to avoid any stall. + * However, mtcClient can withstand up to a 25 second stall + * before pmon will fail it due to active monitoring. + * UT showed that there is no stall at all. */ + unsigned long long latency_threshold_secs = DEFAULT_SYSTEM_REQUEST_LATENCY_SECS ; + unsigned long long before_time = gettime_monotonic_nsec () ; + int rc = system ( request.data()) ; + unsigned long long after_time = gettime_monotonic_nsec () ; + unsigned long long delta_time = after_time-before_time ; + if ( rc ) + { + wlog("system call failed ; rc:%d [%d:%s]", rc, errno, strerror(errno) ); + rc = FAIL_SYSTEM_CALL ; + } + if ( delta_time > (latency_threshold_secs*1000000000)) + { + wlog ("%s bmc system call took %2llu.%-8llu sec", hostname.c_str(), + (delta_time > NSEC_TO_SEC) ? (delta_time/NSEC_TO_SEC) : 0, + (delta_time > NSEC_TO_SEC) ? 
(delta_time%NSEC_TO_SEC) : 0); + } + + /* Cleanup */ + if ( info.pw_file_fd > 0 ) + close(info.pw_file_fd); + daemon_remove_file ( info.password_file.data()); + return (rc); +} diff --git a/mtce-common/src/common/ipmiUtil.h b/mtce-common/src/common/ipmiUtil.h index 7cc9edbc..39e8b9fd 100644 --- a/mtce-common/src/common/ipmiUtil.h +++ b/mtce-common/src/common/ipmiUtil.h @@ -57,6 +57,8 @@ int ipmiUtil_init ( void ); int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_type & mc_info ); +int ipmiUtil_reset_host_now ( string hostname, bmcUtil_accessInfo_type accessInfo, string output_filename ); + /* Create the ipmi request */ string ipmiUtil_create_request ( string cmd, string & ip, string & un, string & pw, string & out ); diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index 3c7c482a..046db72a 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -149,6 +149,8 @@ const char * get_mtcNodeCommand_str ( int cmd ) case MTC_REQ_MTCALIVE: return ("mtcAlive req"); case MTC_MSG_LOCKED: return ("locked msg"); case MTC_CMD_LAZY_REBOOT: return ("lazy reboot"); + case MTC_MSG_INFO: return ("info msg"); + case MTC_CMD_SYNC: return ("sync"); /* goenabled commands and messages */ case MTC_MSG_MAIN_GOENABLED: return ("goEnabled main msg"); @@ -199,7 +201,8 @@ const char * get_mtcNodeCommand_str ( int cmd ) case MTC_EVENT_PMON_MAJOR: return("pmon major event"); case MTC_EVENT_PMON_MINOR: return("pmon minor event"); case MTC_EVENT_PMON_LOG: return("pmon log"); - case MTC_EVENT_PMOND_RAISE: return("pmon raise"); + case MTC_EVENT_PMOND_RAISE: return("pmond raise"); + case MTC_EVENT_PMOND_CLEAR: return("pmond clear"); /* data port events */ case MTC_EVENT_AVS_CLEAR: return("AVS clear"); @@ -394,10 +397,9 @@ void mtc_stages_init ( void ) recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START ] = "Heartbeat-Start"; recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK ] = "Heartbeat-Soak"; 
recoveryStages_str[MTC_RECOVERY__STATE_CHANGE ] = "State Change"; - recoveryStages_str[MTC_RECOVERY__ENABLE_START ] = "Enable-Start"; recoveryStages_str[MTC_RECOVERY__FAILURE ] = "Failure"; recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT ] = "WorkQ-Wait"; - recoveryStages_str[MTC_RECOVERY__ENABLE_WAIT ] = "Enable-Wait"; + recoveryStages_str[MTC_RECOVERY__ENABLE ] = "Enable"; recoveryStages_str[MTC_RECOVERY__STAGES ] = "unknown"; disableStages_str [MTC_DISABLE__START ] = "Disable-Start"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 0603c0ce..b98c34d4 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -185,7 +185,7 @@ typedef enum #define DEFAULT_MTCALIVE_TIMEOUT (1200) #define DEFAULT_GOENABLE_TIMEOUT (300) #define DEFAULT_DOR_MODE_TIMEOUT (20) -#define DEFAULT_DOR_MODE_CPE_TIMEOUT (600) +#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600) /** TODO: Convert names to omit JSON part */ #define MTC_JSON_INV_LABEL "ihosts" @@ -263,6 +263,7 @@ typedef enum #define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed" #define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout" #define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling" +#define MTC_TASK_RECOVERY_FAIL_HB "Graceful Recovery Heartbeat Failure, re-enabling" #define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling" #define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait" #define MTC_TASK_RECOVERED "Gracefully Recovered" @@ -311,7 +312,7 @@ typedef enum #define MTC_TASK_POWERCYCLE_FAIL "Critical Event Power-Cycle %d; failed" #define MTC_TASK_POWERCYCLE_DOWN "Critical Event Power-Down ; due to persistent critical sensor" #define MTC_TASK_RESETTING_HOST "Resetting Host, critical sensor" -#define MTC_TASK_CPE_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots" +#define MTC_TASK_AIO_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots" #define MTC_TASK_SELF_UNLOCK_MSG 
"Unlocking active controller, please stand-by while it reboots" #define MTC_TASK_FAILED_SWACT_REQ "Critical failure.Requesting SWACT to enabled standby controller" #define MTC_TASK_FAILED_NO_BACKUP "Critical failure.Please provision/enable standby controller" @@ -383,8 +384,8 @@ typedef enum /* 5 milliseconds */ #define MTCAGENT_SELECT_TIMEOUT (5000) -/* dedicate more idle time in CPE ; there is less maintenance to do */ -#define MTCAGENT_CPE_SELECT_TIMEOUT (10000) +/* dedicate more idle time in AIO ; there is less maintenance to do */ +#define MTCAGENT_AIO_SELECT_TIMEOUT (10000) /** Number of retries maintenance will do when it experiences * a REST API call failure ; any failure */ @@ -751,7 +752,9 @@ typedef struct #define MTC_CMD_START_STORAGE_SVCS 19 /* to host */ #define MTC_CMD_LAZY_REBOOT 20 /* to host */ #define MTC_CMD_HOST_SVCS_RESULT 21 /* to host */ -#define MTC_CMD_LAST 22 +#define MTC_MSG_INFO 22 /* to host */ +#define MTC_CMD_SYNC 23 /* to host */ +#define MTC_CMD_LAST 24 #define RESET_PROG_MAX_REBOOTS_B4_RESET (5) #define RESET_PROG_MAX_REBOOTS_B4_RETRY (RESET_PROG_MAX_REBOOTS_B4_RESET+2) @@ -946,7 +949,7 @@ typedef enum string get_delStages_str ( mtc_delStages_enum stage ); -#define MTC_MAX_FAST_ENABLES (3) +#define MTC_MAX_FAST_ENABLES (5) typedef enum { MTC_RECOVERY__START = 0, @@ -972,10 +975,9 @@ typedef enum MTC_RECOVERY__HEARTBEAT_START, MTC_RECOVERY__HEARTBEAT_SOAK, MTC_RECOVERY__STATE_CHANGE, - MTC_RECOVERY__ENABLE_START, MTC_RECOVERY__FAILURE, MTC_RECOVERY__WORKQUEUE_WAIT, - MTC_RECOVERY__ENABLE_WAIT, + MTC_RECOVERY__ENABLE, MTC_RECOVERY__STAGES, } mtc_recoveryStages_enum ; @@ -1263,6 +1265,14 @@ typedef enum MTC_AR_DISABLE_CAUSE__NONE, } autorecovery_disable_cause_enum ; +/* code that represents a specific group of maintenance information + * ... 
typically for a specific feature */ +typedef enum +{ + MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO, + MTC_INFO_CODE__LAST +} mtcInfo_enum ; + /* Service Based Auto Recovery Control Structure */ typedef struct { diff --git a/mtce-common/src/common/threadUtil.cpp b/mtce-common/src/common/threadUtil.cpp index 034647eb..46e650bb 100644 --- a/mtce-common/src/common/threadUtil.cpp +++ b/mtce-common/src/common/threadUtil.cpp @@ -309,6 +309,48 @@ bool thread_idle ( thread_ctrl_type & ctrl ) return (false); } +/**************************************************************************** + * + * Name : thread_done_consume + * + * Description: Return to IDLE stage. + * + ****************************************************************************/ + +int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info ) +{ + if ( ctrl.stage == THREAD_STAGE__IDLE ) + { + return PASS ; + } + else if ( ctrl.done == false ) + { + if ( info.runcount > ctrl.runcount ) + { + ilog("%s thread cleanup ; cmd:%d ; cnt:%d:%d", + info.hostname.c_str(), + info.command, + ctrl.runcount, + info.runcount); + ctrl.done = true ; + ctrl.stage = THREAD_STAGE__DONE ; + thread_handler (ctrl, info); + return PASS ; + } + else + { + thread_kill(ctrl, info); + return RETRY ; + } + } + else + { + ctrl.stage = THREAD_STAGE__DONE ; + thread_handler( ctrl, info ); + return PASS ; + } +} + /**************************************************************************** * * Name : thread_launch @@ -381,7 +423,7 @@ void thread_kill ( thread_ctrl_type & ctrl, thread_info_type & info ) ( ctrl.stage != THREAD_STAGE__WAIT ) && ( ctrl.stage != THREAD_STAGE__IDLE )) { - blog ("%s kill request\n", ctrl.hostname.c_str() ); + wlog ("%s kill request\n", ctrl.hostname.c_str() ); _stage_change ( ctrl, THREAD_STAGE__KILL ); } } diff --git a/mtce-common/src/common/threadUtil.h b/mtce-common/src/common/threadUtil.h index 552d47bb..2cbabe41 100644 --- a/mtce-common/src/common/threadUtil.h +++ 
b/mtce-common/src/common/threadUtil.h @@ -284,6 +284,7 @@ bool thread_done ( thread_ctrl_type & ctrl ); bool thread_idle ( thread_ctrl_type & ctrl ); void thread_kill ( thread_ctrl_type & ctrl , thread_info_type & info ); string thread_stage ( thread_ctrl_type & ctrl ); +int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info ); /* Cooperative service of cancel and exit requests from parent */ void pthread_signal_handler ( thread_info_type * info_ptr ); diff --git a/mtce-common/src/daemon/daemon_common.h b/mtce-common/src/daemon/daemon_common.h index 3f9ac031..0f9f5322 100755 --- a/mtce-common/src/daemon/daemon_common.h +++ b/mtce-common/src/daemon/daemon_common.h @@ -38,15 +38,15 @@ using namespace std ; /* List of different types */ typedef enum { - SYSTEM_TYPE__NORMAL =0, - SYSTEM_TYPE__CPE_MODE__DUPLEX =1, - SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT =2, - SYSTEM_TYPE__CPE_MODE__SIMPLEX =3, + SYSTEM_TYPE__NORMAL =0, + SYSTEM_TYPE__AIO__DUPLEX =1, + SYSTEM_TYPE__AIO__DUPLEX_DIRECT =2, + SYSTEM_TYPE__AIO__SIMPLEX =3, } system_type_enum ; /** Called by signal handler on daemon exit - * Performs cleanup by closing open files + * Performs cleanup by closing open files * and freeing used memory */ void daemon_exit ( void ); diff --git a/mtce-common/src/daemon/daemon_files.cpp b/mtce-common/src/daemon/daemon_files.cpp index 8272e7a8..0809b756 100755 --- a/mtce-common/src/daemon/daemon_files.cpp +++ b/mtce-common/src/daemon/daemon_files.cpp @@ -347,7 +347,7 @@ string daemon_mgmnt_iface ( void ) system_type_enum daemon_system_type ( void ) { char buffer [BUFFER]; - system_type_enum system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ; + system_type_enum system_type = SYSTEM_TYPE__AIO__SIMPLEX ; FILE * cfg_file_stream = fopen ( PLATFORM_CONF_FILE, "r" ); if ( cfg_file_stream != NULL ) { @@ -401,11 +401,11 @@ system_type_enum daemon_system_type ( void ) if ( !mode.empty() ) { if ( mode.compare("duplex") == 0 ) - system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX ; + 
system_type = SYSTEM_TYPE__AIO__DUPLEX ; else if ( mode.compare("duplex-direct") == 0 ) - system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT ; + system_type = SYSTEM_TYPE__AIO__DUPLEX_DIRECT ; else if ( mode.compare("simplex") == 0 ) - system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ; + system_type = SYSTEM_TYPE__AIO__SIMPLEX ; else { elog ("%s All-In-One system type ; mode unknown\n", SYSTEM_TYPE_PREFIX ); @@ -438,21 +438,21 @@ system_type_enum daemon_system_type ( void ) ilog("%s Standard System\n", SYSTEM_TYPE_PREFIX); break ; } - case SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT: + case SYSTEM_TYPE__AIO__DUPLEX_DIRECT: { ilog ("%s All-in-one Duplex Direct Connect\n", SYSTEM_TYPE_PREFIX ); break ; } - case SYSTEM_TYPE__CPE_MODE__DUPLEX: + case SYSTEM_TYPE__AIO__DUPLEX: { ilog ("%s All-in-one Duplex\n", SYSTEM_TYPE_PREFIX ); break ; } - case SYSTEM_TYPE__CPE_MODE__SIMPLEX: + case SYSTEM_TYPE__AIO__SIMPLEX: default: { ilog ("%s All-in-one Simplex \n", SYSTEM_TYPE_PREFIX ); - system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ; + system_type = SYSTEM_TYPE__AIO__SIMPLEX ; break ; } } diff --git a/mtce-control/src/scripts/hbsAgent.service b/mtce-control/src/scripts/hbsAgent.service index 7e111707..bd4bcd63 100644 --- a/mtce-control/src/scripts/hbsAgent.service +++ b/mtce-control/src/scripts/hbsAgent.service @@ -1,22 +1,13 @@ [Unit] Description=StarlingX Maintenance Heartbeat Agent -After=network.target syslog.service config.service +After=hbsClient.service Before=pmon.service [Service] Type=forking ExecStart=/etc/rc.d/init.d/hbsAgent start -ExecStop=/etc/rc.d/init.d/hbsAgent start +ExecStop=/etc/rc.d/init.d/hbsAgent stop PIDFile=/var/run/hbsAgent.pid -KillMode=process -SendSIGKILL=no - -# Process recovery is handled by pmond if its running. -# Delay 10 seconds to give pmond a chance to recover -# before systemd kicks in to do it as a backup plan. 
-Restart=always -RestartSec=10 [Install] WantedBy=multi-user.target - diff --git a/mtce/src/alarm/scripts/mtcalarm.logrotate b/mtce/src/alarm/scripts/mtcalarm.logrotate index c1b91aa2..8287c7e5 100644 --- a/mtce/src/alarm/scripts/mtcalarm.logrotate +++ b/mtce/src/alarm/scripts/mtcalarm.logrotate @@ -1,17 +1,19 @@ -#daily -nodateext -start 1 -compress -copytruncate -notifempty -missingok +# +# Copyright (c) 2018-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 /var/log/mtcalarmd.log { + create 0640 root root + start 1 size 10M rotate 20 - sharedscripts + compress + notifempty + missingok postrotate systemctl reload syslog-ng > /dev/null 2>&1 || true endscript + delaycompress } diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index ae43fe64..e2320430 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) { ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ; } - ptr->alarms_loaded = false ; + ptr->active_alarms = "" ; /* no active alarms */ ptr->cfgEvent.base = NULL ; ptr->sysinvEvent.base= NULL ; @@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) return ptr ; } + struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname ) { /* check for empty list condition */ @@ -2706,7 +2707,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); @@ -2818,7 +2819,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); 
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); @@ -2835,7 +2836,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); @@ -2853,7 +2854,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); @@ -2871,7 +2872,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); @@ -2889,7 +2890,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( 
CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); @@ -2940,7 +2941,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = MTC_OPER_STATE__DISABLED ; node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ; - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ; node_ptr->availStatus_subf = MTC_AVAIL_STATUS__OFFLINE ; @@ -2958,7 +2959,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); @@ -3295,6 +3296,102 @@ void nodeLinkClass::mtcInfo_log ( struct nodeLinkClass::node * node_ptr ) } } +/*************************************************************************** + * + * Name : build_mtcInfo_dict + * + * Purpose : Build a json dictionary for the specified info code enum + * + * Assumptions : Only MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO is supported + * + * Returns : Returns a json dictionary of mtcInfo. 
+ * + * { + * "controller-0":{ + * "ip":"192.168.204.2", + * "bm_ip":"xxx.xxx.xx.23", + * "bm_un":"root", + * "bm_pw":"root" + * }, + * "controller-1":{ + * "ip":"192.168.204.3", + * "bm_ip":"xxx.xxx.xx.24", + * "bm_un":"root", + * "bm_pw":"root" + * } + * } + * + **************************************************************************/ + +string nodeLinkClass::build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code ) +{ + string mtcInfo_dict = "" ; + + /* loop/exit control */ + int temp = 0 ; + + /* should never happen but better to be safe */ + if ( head == NULL ) + return mtcInfo_dict ; + + /* force the update to be a dictionary */ + mtcInfo_dict = "{" ; + + for ( struct node * ptr = head ; ; ptr = ptr->next ) + { + if (( ptr->nodetype & CONTROLLER_TYPE ) && + ( mtcInfo_code == MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO )) + { + if ( temp ) + mtcInfo_dict.append(","); + mtcInfo_dict.append("\"" + ptr->hostname + "\":{"); + mtcInfo_dict.append("\"mgmt_ip\":\"" + ptr->ip + "\","); + mtcInfo_dict.append("\"bm_ip\":\"" + ptr->bm_ip + "\","); + mtcInfo_dict.append("\"bm_un\":\"" + ptr->bm_un + "\","); + mtcInfo_dict.append("\"bm_pw\":\"" + ptr->bm_pw + "\"}"); + if ( ++temp >= 2 ) + break ; + } + if (( ptr->next == NULL ) || ( ptr == tail )) + break ; + } + mtcInfo_dict.append("}"); + return mtcInfo_dict ; +} + +/************************************************************************** + * + * Name : mtcInfo_handler + * + * Purpose : Send mtcInfo update to provisioned controllers when + * the push flag is set. + * + **************************************************************************/ + +void nodeLinkClass::mtcInfo_handler ( void ) +{ + /* This is set in the bm_handler once access to the BMC using + * provisioned credentials have been verified. 
*/ + if ( this->want_mtcInfo_push ) + { + /* handler will enhance when more codes are introduced */ + mtcInfo_enum mtcInfo_code = MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ; + + string mtcInfo_dict = build_mtcInfo_dict(mtcInfo_code); + if ( ! mtcInfo_dict.empty() ) + { + string temp = CONTROLLER_0 ; + send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict); + if ( this->controllers > 1 ) + { + temp = CONTROLLER_1; + send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict); + } + } + this->want_mtcInfo_push = false ; + } +} + /* Lock Rules * * 1. Cannot lock this controller @@ -4034,6 +4131,18 @@ int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname ) return (0); } + +int nodeLinkClass::get_mtce_flags ( string & hostname ) +{ + nodeLinkClass::node* node_ptr ; + node_ptr = nodeLinkClass::getNode ( hostname ); + if ( node_ptr != NULL ) + { + return ( node_ptr->mtce_flags ); + } + return (0); +} + void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface ) { nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname ); @@ -4114,7 +4223,7 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface ) /* Deal with sub-function if AIO controller host */ - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { if ( flags & MTC_FLAG__SUBF_GOENABLED ) { @@ -4422,6 +4531,18 @@ string nodeLinkClass::get_bm_ip ( string hostname ) return (""); } +string nodeLinkClass::get_bm_pw ( string hostname ) +{ + nodeLinkClass::node* node_ptr ; + node_ptr = nodeLinkClass::getNode ( hostname ); + if ( node_ptr != NULL ) + { + return (node_ptr->bm_pw); + } + elog ("%s bm pw lookup failed\n", hostname.c_str() ); + return (""); +} + string nodeLinkClass::get_bm_un ( string hostname ) { nodeLinkClass::node* node_ptr ; @@ -4774,7 +4895,10 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa /* Otherwise this is a single host that has 
recovered * possibly as part of a mnfa group or simply a lone wolf */ - else + else if (( node_ptr->hbs_minor[MGMNT_IFACE] == false ) && + (( clstr_network_provisioned == false ) || + (( clstr_network_provisioned == true ) && + ( node_ptr->hbs_minor[CLSTR_IFACE] == false )))) { if ( node_ptr->mnfa_graceful_recovery == true ) { @@ -4782,6 +4906,8 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa mnfa_awol_list.remove(node_ptr->hostname); } + /* Don't recover until heartbeat is working over all + * monitored interfaces */ mnfa_recover_host ( node_ptr ); if ( mnfa_active == true ) @@ -4819,17 +4945,17 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa } if ( temp_count != mnfa_host_count[iface] ) - { + { slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n", node_ptr->hostname.c_str(), get_iface_name_str(iface), mnfa_host_count[iface], temp_count ); mnfa_host_count[iface] = temp_count ; mnfa_host_count[iface] = temp_count ; - } + } else { - wlog ("%s MNFA host tally (%s:%d)\n", + dlog ("%s MNFA host tally (%s:%d)\n", node_ptr->hostname.c_str(), get_iface_name_str(iface), mnfa_host_count[iface] ); @@ -4935,11 +5061,28 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface } return ; } + else if ( node_ptr->recoveryStage == MTC_RECOVERY__HEARTBEAT_SOAK ) + { + elog ("%s %s *** Heartbeat Loss *** (during recovery soak)\n", + hostname.c_str(), + get_iface_name_str(iface)); + force_full_enable ( node_ptr ); + return ; + } mnfa_add_host ( node_ptr , iface ); if ( mnfa_active == false ) { + /* if node is already in graceful recovery just ignore the event */ + if ( node_ptr->graceful_recovery_counter != 0 ) + { + dlog ("%s %s loss event ; already in graceful recovery try %d", + hostname.c_str(), + get_iface_name_str(iface), + node_ptr->graceful_recovery_counter ); + return ; + } elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), 
get_iface_name_str(iface)); if ( iface == CLSTR_IFACE ) { @@ -4980,6 +5123,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface } } +/**************************************************************************** + * + * Name : manage_heartbeat_clear + * + * Description: Manage clearing heartbeat failure status + * + * Assumptions : Called by both hbsAgent and mtcAgent + * + ***************************************************************************/ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface ) { nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname ); @@ -4995,13 +5147,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface ) node_ptr->heartbeat_failed[i] = false ; if ( i == MGMNT_IFACE ) { - node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; } if ( i == CLSTR_IFACE ) { - node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; } } } @@ -5010,13 +5166,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface ) node_ptr->heartbeat_failed[iface] = false ; if ( iface == MGMNT_IFACE ) { - node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; } else if ( iface == CLSTR_IFACE ) { - 
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; } } } @@ -5795,9 +5955,6 @@ int nodeLinkClass::critical_process_failed( string & hostname, node_ptr->hostname.c_str()); /* dlog */ } - /* Start fresh the next time we enter graceful recovery handler */ - node_ptr->graceful_recovery_counter = 0 ; - /* Set node as unlocked-disabled-failed */ allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, MTC_OPER_STATE__DISABLED, @@ -6755,7 +6912,7 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr, } /** Validate and log Recovery stage changes */ -int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, +int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, mtc_recoveryStages_enum newHdlrStage ) { int rc = PASS ; @@ -6763,14 +6920,14 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, if (( newHdlrStage >= MTC_RECOVERY__STAGES ) || ( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES )) { - slog ("%s Invalid recovery stage (%d:%d)\n", + slog ("%s Invalid recovery stage (%d:%d)\n", node_ptr->hostname.c_str(), - node_ptr->recoveryStage, + node_ptr->recoveryStage, newHdlrStage ); if ( newHdlrStage < MTC_RECOVERY__STAGES ) { - clog ("%s ? -> %s\n", + clog ("%s ? 
-> %s\n", node_ptr->hostname.c_str(), get_recoveryStages_str(newHdlrStage).c_str()); @@ -6782,11 +6939,11 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, rc = FAIL ; } } - else + else { - clog ("%s %s -> %s\n", + clog ("%s %s -> %s\n", node_ptr->hostname.c_str(), - get_recoveryStages_str(node_ptr->recoveryStage).c_str(), + get_recoveryStages_str(node_ptr->recoveryStage).c_str(), get_recoveryStages_str(newHdlrStage).c_str()); node_ptr->recoveryStage = newHdlrStage ; @@ -7514,7 +7671,7 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr, mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" ); if (( NOT_THIS_HOST ) && - ( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )) + ( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX )) { if ( ++node_ptr->ar_count[node_ptr->ar_cause] >= this->ar_threshold [node_ptr->ar_cause] ) @@ -7746,7 +7903,11 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen if ( true_false == true ) { - ilog ("%s heartbeat start", hostname.c_str()); + ilog ("%s %s heartbeat %sstart", + hostname.c_str(), + get_iface_name_str(iface), + node_ptr->monitor[iface] ? 
"re" : ""); + node_ptr->no_work_log_throttle = 0 ; node_ptr->b2b_misses_count[iface] = 0 ; node_ptr->hbs_misses_count[iface] = 0 ; @@ -7758,7 +7919,12 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen } else { - ilog ("%s heartbeat stop", hostname.c_str()); + if ( node_ptr->monitor[iface] == true ) + { + ilog ("%s %s heartbeat stop", + hostname.c_str(), + get_iface_name_str(iface)); + } } node_ptr->monitor[iface] = true_false ; } @@ -7771,7 +7937,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen void nodeLinkClass::set_hwmond_monitor_state ( string & hostname, bool state ) { if ( hostname.length() ) - { + { struct nodeLinkClass::node* node_ptr ; node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) @@ -8511,7 +8677,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p -#define HBS_LOSS_REPORT_THROTTLE (100) +#define HBS_LOSS_REPORT_THROTTLE (100000) int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) { int lost = 0 ; @@ -8551,6 +8717,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) if ( pulse_ptr->b2b_misses_count[iface] > 1 ) { + if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold ) + { + hbs_cluster_change ( pulse_ptr->hostname + " " + + get_iface_name_str(iface) + + " heartbeat miss " + + itos(pulse_ptr->b2b_misses_count[iface])); + } if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) { if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) @@ -8657,57 +8830,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) } } - /* Turn the cluster-host heartbeat loss into a degrade only - * condition if the clstr_degrade_only flag is set */ - if (( iface == CLSTR_IFACE ) && - ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) && - ( clstr_degrade_only == true )) - { - /* Only print the log at the 
threshold boundary */ - if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) - { - if ( this->active_controller ) - { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - } - - wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n", - pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); - hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); - } - } - /* Turn the clstr heartbeat loss into a degrade only * condition for inactive controller on normal system. */ - else if (( iface == CLSTR_IFACE ) && - ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) && - ( this->system_type == SYSTEM_TYPE__NORMAL ) && - (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) + if (( iface == CLSTR_IFACE ) && + ((( this->system_type == SYSTEM_TYPE__NORMAL ) && + (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) || + ( clstr_degrade_only == true ))) { /* Only print the log at the threshold boundary */ - if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) + if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold ) { if ( this->active_controller ) { manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); } - wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n", + wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface)); - hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); + get_iface_name_str(iface), + clstr_degrade_only ? 
"config option" : "system type"); + hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" ); } } else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) + // else if ( pulse_ptr->hbs_failure[iface] == false ) { - elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); + elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n", + pulse_ptr->hostname.c_str(), + get_iface_name_str(iface), + pulse_ptr->b2b_misses_count[iface]); + hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" ); if ( this->active_controller ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - + if ( pulse_ptr->hbs_failure[iface] == false ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + } /* report this host as failed */ if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS ) { @@ -8715,10 +8874,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) } } else - { pulse_ptr->hbs_failure[iface] = true ; - } - hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); + pulse_ptr->hbs_failure_count[iface]++ ; } if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] ) @@ -8963,21 +9120,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n", + node_ptr->hostname.c_str(), node_ptr->mtcAlive_online ? 'Y' : 'N', node_ptr->mtcAlive_offline ? 'Y' : 'N', node_ptr->mtcAlive_count, node_ptr->mtcAlive_gate ? 
"closed" : "open", - node_ptr->mtcAlive_misses); + node_ptr->mtcAlive_misses); mem_log (str); } void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", + node_ptr->hostname.c_str(), node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .", node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .", node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .", @@ -8987,6 +9144,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ) mem_log (str); } +void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr ) +{ + if ( ! node_ptr->active_alarms.empty() ) + { + char str[MAX_MEM_LOG_DATA] ; + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n", + node_ptr->hostname.c_str(), + node_ptr->active_alarms.c_str()); + mem_log (str); + } +} + void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; @@ -9037,8 +9206,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n", - node_ptr->hostname.c_str(), - node_ptr->mac.c_str(), + node_ptr->hostname.c_str(), + node_ptr->mac.c_str(), node_ptr->ip.c_str(), node_ptr->clstr_ip.c_str(), node_ptr->uptime ); @@ -9050,11 +9219,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr ) char str[MAX_MEM_LOG_DATA] ; for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n", + node_ptr->hostname.c_str(), get_iface_name_str (iface), - 
node_ptr->hbs_minor[iface] ? "true " : "false", - node_ptr->hbs_degrade[iface] ? "true " : "false", + node_ptr->hbs_minor[iface] ? "true " : "false", + node_ptr->hbs_degrade[iface] ? "true " : "false", node_ptr->hbs_failure[iface] ? "true " : "false", node_ptr->monitor[iface] ? "YES" : "no" ); mem_log (str); @@ -9083,8 +9252,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr ) void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n", + node_ptr->hostname.c_str(), get_oosTestStages_str(node_ptr->oosTestStage).c_str(), node_ptr->oos_test_count, get_insvTestStages_str(node_ptr->insvTestStage).c_str(), @@ -9117,7 +9286,7 @@ void nodeLinkClass::mem_log_type_info ( struct nodeLinkClass::node * node_ptr ) node_ptr->function); mem_log (str); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Function: %s (%u) (SubFunc Enabled:%c)\n", node_ptr->hostname.c_str(), @@ -9156,6 +9325,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname ) // mem_log_reset_info ( node_ptr ); mem_log_power_info ( node_ptr ); mem_log_alarm1 ( node_ptr ); + mem_log_alarm2 ( node_ptr ); mem_log_mtcalive ( node_ptr ); mem_log_stage ( node_ptr ); mem_log_bm ( node_ptr ); diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 42ca79d6..5df2ce56 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -76,11 +76,11 @@ using namespace std; #define LARGE_SYSTEM \ ( this->system_type == SYSTEM_TYPE__NORMAL ) -#define CPE_SYSTEM \ +#define AIO_SYSTEM \ ( this->system_type != SYSTEM_TYPE__NORMAL ) -#define SIMPLEX_CPE_SYSTEM \ - ( this->system_type 
== SYSTEM_TYPE__CPE_MODE__SIMPLEX ) +#define SIMPLEX_AIO_SYSTEM \ + ( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) /** * @addtogroup nodeLinkClass @@ -652,12 +652,12 @@ private: /** @} private_monitoring_services_variables */ - /* List of alarms and current severity */ - #define MAX_ALARMS (10) + /* List of alarms current severity */ EFmAlarmSeverityT alarms[MAX_ALARMS]; - /* tracks whether the alarms for this host have been loaded already or not */ - bool alarms_loaded ; + /* string containing active alarms and their severity + * ... for logging purposes only */ + string active_alarms ; /** true if this host has recovered before the mnfa timeout period. * This bool flags the graceful recovery handler that this node @@ -665,8 +665,6 @@ private: * and uptime accordingly */ bool mnfa_graceful_recovery ; - int stress_iteration ; - /* BMC Protocol Learning Controls and State */ /* specifies what BMC protocol is selected for this host @@ -828,10 +826,13 @@ private: int oos_test_handler ( struct nodeLinkClass::node * node_ptr ); int insv_test_handler ( struct nodeLinkClass::node * node_ptr ); int stress_handler ( struct nodeLinkClass::node * node_ptr ); - int bmc_handler ( struct nodeLinkClass::node * node_ptr ); + int bmc_handler ( struct nodeLinkClass::node * node_ptr ); int degrade_handler ( struct nodeLinkClass::node * node_ptr ); + int uptime_handler ( void ); + void mtcInfo_handler ( void ); + int host_services_handler ( struct nodeLinkClass::node * node_ptr ); /* Starts the specified 'reset or powercycle' recovery monitor */ @@ -840,6 +841,9 @@ private: /* server specific power state query handler */ bool (*is_poweron_handler) (string hostname, string query_response ); + /* Audit that monitors and auto corrects alarm state mismatches */ + void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr ); + /* Calculate the overall reset progression timeout */ int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries ); @@ -851,13 +855,22 @@ 
private: void ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, bool gate_state ); void set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int interface ); + /********* mtcInfo in the database ************/ int mtcInfo_set ( struct nodeLinkClass::node * node_ptr, string key, string value ); string mtcInfo_get ( struct nodeLinkClass::node * node_ptr, string key ); void mtcInfo_clr ( struct nodeLinkClass::node * node_ptr, string key ); void mtcInfo_log ( struct nodeLinkClass::node * node_ptr ); - int set_mtcInfo ( struct nodeLinkClass::node * node_ptr, string & mtc_info ); + /********* mtcInfo that gets pushed out to daemons ***********/ + + + /* flag telling mtce when a mtcInfo push needs to be done */ + bool want_mtcInfo_push = false ; + + /* performs the mtcInfo push */ + void push_mtcInfo ( void ); + /***************************************************************************** * * Name : bmc_command_send @@ -1192,11 +1205,11 @@ private: * Set to true when the autorecovery threshold is reached * and we want to avoid taking further autorecovery action * even though it may be requested. */ - bool autorecovery_disabled ; + bool autorecovery_disabled = false ; /* Set to true by fault detection methods that are * autorecoverable when in simplex mode. */ - bool autorecovery_enabled ; + bool autorecovery_enabled = false ; /** Tracks the number of hosts that 'are currently' in service trouble * wrt heartbeat (above minor threshold). 
@@ -1292,6 +1305,7 @@ private: void mem_log_state1 ( struct nodeLinkClass::node * node_ptr ); void mem_log_state2 ( struct nodeLinkClass::node * node_ptr ); void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ); + void mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr ); void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr ); void mem_log_stage ( struct nodeLinkClass::node * node_ptr ); void mem_log_test_info ( struct nodeLinkClass::node * node_ptr ); @@ -1464,11 +1478,14 @@ public: /***********************************************************/ + /** Number of provisioned controllers */ + int controllers = 0 ; + /** Number of provisioned hosts (nodes) */ - int hosts ; + int hosts = 0 ; /* Set to True while waiting for UNLOCK_READY_FILE in simplex mode */ - bool unlock_ready_wait ; + bool unlock_ready_wait = false ; /** Host has been deleted */ bool host_deleted ; @@ -1517,6 +1534,9 @@ public: /** Return the number of inventoried hosts */ int num_hosts ( void ); + /** Return the number of inventoried controllers */ + int num_controllers ( void ); + /** ********************************************************************** * * Name : nodeLinkClass::workQueue_enqueue @@ -1664,6 +1684,9 @@ public: /* Clear heartbeat failed flag for all interfaces */ void manage_heartbeat_clear ( string hostname, iface_enum iface ); + /* Build a json dictionary containing code specified maintenance info */ + string build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code ); + /** Test and Debug Members and Variables */ /** Print node info banner */ @@ -1752,6 +1775,7 @@ public: #define MTC_FLAG__I_AM_LOCKED (0x00000008) */ void set_mtce_flags ( string hostname, int flags, int iface ); + int get_mtce_flags ( string & hostname ); /** Updates the node's health code * Codes are found in nodeBase.h @@ -1789,6 +1813,7 @@ public: string get_bm_ip ( string hostname ); string get_bm_un ( string hostname ); + string get_bm_pw ( string hostname ); string get_bm_type ( string hostname 
); string get_hostname_from_bm_ip ( string bm_ip ); diff --git a/mtce/src/fsmon/scripts/fsmon.logrotate b/mtce/src/fsmon/scripts/fsmon.logrotate index 0476a8b2..821391f8 100644 --- a/mtce/src/fsmon/scripts/fsmon.logrotate +++ b/mtce/src/fsmon/scripts/fsmon.logrotate @@ -1,15 +1,19 @@ -#daily -nodateext +# +# Copyright (c) 2015-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 /var/log/fsmond.log { - size 10M + create 0640 root root start 1 - missingok + size 10M rotate 20 compress - sharedscripts + notifempty + missingok postrotate systemctl reload syslog-ng > /dev/null 2>&1 || true endscript + delaycompress } diff --git a/mtce/src/heartbeat/Makefile b/mtce/src/heartbeat/Makefile index a625f20b..a0fdb8da 100755 --- a/mtce/src/heartbeat/Makefile +++ b/mtce/src/heartbeat/Makefile @@ -13,7 +13,7 @@ LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -l INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common INCLUDES += -I../common -I../alarm -I../maintenance -I../public -CCFLAGS = -g -O2 -Wall -Wextra -Werror +CCFLAGS = -g -O2 -Wall -Wextra -Werror -std=c++11 STATIC_ANALYSIS_TOOL = cppcheck STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0) diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index 4eec5a29..ecd6941a 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -1381,6 +1381,7 @@ int daemon_init ( string iface, string nodetype ) hbs_ctrl.locked = true ; } + daemon_init_fit(); return (rc); } @@ -1521,6 +1522,7 @@ void hbs_sm_handler ( void ) * False if time delta is greater * ***************************************************************************/ +#define HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES (10000) bool manage_sm_heartbeat ( void ) { struct timespec ts ; @@ -1532,8 +1534,9 @@ bool manage_sm_heartbeat ( void ) if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS ) { sm_heartbeat_count = 0; - 
if (( ++sm_heartbeat_count_b2b_misses < 20 )|| - (!( sm_heartbeat_count_b2b_misses % 100 ))) + if ((( ++sm_heartbeat_count_b2b_misses < 20 ) || + (!( sm_heartbeat_count_b2b_misses % 1000 ))) && + ( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES )) { wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d", delta.secs, delta.msecs, @@ -1817,6 +1820,10 @@ void daemon_service_run ( void ) inv.name = hbsInv.my_hostname ; inv.nodetype = CONTROLLER_TYPE ; hbsInv.add_heartbeat_host ( inv ); + + /* add this host to local inventory */ + hostname_inventory.push_front(hbsInv.my_hostname); + ilog ("%s added to inventory (self)", hbsInv.my_hostname.c_str()); } /* enable the base level signal handler latency monitor */ @@ -1841,7 +1848,7 @@ void daemon_service_run ( void ) clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last ); /* no need for the heartbeat audit in a simplex system */ - if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX ) { /* start the state audit */ /* run the first audit in 30 seconds */ @@ -2056,7 +2063,7 @@ void daemon_service_run ( void ) hbsInv.active_controller ? 
"" : "in" ); /* no need for the heartbeat audit in a simplex system */ - if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX ) { /* Due to activity state change we will dump * the heartbeat cluster state at now time @@ -2074,6 +2081,7 @@ void daemon_service_run ( void ) inv.nodetype = msg.parm[0]; hbsInv.add_heartbeat_host ( inv ) ; hostname_inventory.push_back ( inv.name ); + hostname_inventory.unique(); // avoid duplicates ilog ("%s added to heartbeat service (%d)\n", inv.name.c_str(), inv.nodetype); @@ -2119,7 +2127,7 @@ void daemon_service_run ( void ) { if ( hostname != hbsInv.my_hostname ) { - hbsInv.mon_host ( hostname, false, true ); + hbsInv.mon_host ( hostname, false, false ); hbs_cluster_del ( hostname ); ilog ("%s heartbeat service disabled by stop command", hostname.c_str()); @@ -2366,6 +2374,7 @@ void daemon_service_run ( void ) arrival_histogram[iface] = "" ; unexpected_pulse_list[iface] = "" ; + rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri ); if ( rc != 0 ) { @@ -2523,7 +2532,9 @@ void daemon_service_run ( void ) } } /* log cluster throttled */ - if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 ))) + if ((( heartbeat_ok == false ) && + ( !( sm_heartbeat_count_b2b_misses % 1000 ))) && + ( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES )) { hbs_state_audit ( ); } diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index bfa8f1d1..b9f067f7 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri void hbs_sm_handler ( void ); /* send the cluster vault to SM */ -void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ); +int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ); /* copy cluster data from src to dst */ void hbs_cluster_copy ( 
mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ); @@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault ); /* Heartbeat service state audit */ void hbs_state_audit ( void ); +/* Send state change message to SM if there has been a + * state change in the last period */ +void hbs_cluster_change_notifier ( void ); + /** * @} hbs_base */ diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index 780fa8e3..1f82a4e3 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -69,6 +69,8 @@ typedef struct msgClassSock * sm_socket_ptr ; + string cluster_change_reason ; + } hbs_cluster_ctrl_type ; /* Cluster control structire construct allocation. */ @@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr ) { ctrl.sm_socket_ptr = sm_socket_ptr ; } + ctrl.cluster_change_reason = ""; + ctrl.log_throttle = 0 ; } @@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller, void hbs_cluster_change ( string cluster_change_reason ) { - hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason ); + ilog ("reason: %s", cluster_change_reason.c_str()); + if ( ctrl.cluster_change_reason.empty() ) + ctrl.cluster_change_reason = cluster_change_reason ; + else + ctrl.cluster_change_reason.append("," + cluster_change_reason) ; +} + +/**************************************************************************** + * + * Name : hbs_cluster_change_notifier + * + * Description : Send SM the cluster info if there has been a state change. + * + ***************************************************************************/ +void hbs_cluster_change_notifier ( void ) +{ + if ( ! 
ctrl.cluster_change_reason.empty () ) + { + if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0, + ctrl.cluster_change_reason ) == PASS ) + { + ctrl.cluster_change_reason.clear(); + } + } } /**************************************************************************** @@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface, wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT, "Unable to store history beyond %d ", ctrl.cluster.histories ); + hbs_cluster_change_notifier (); return ; } else @@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface, else history_ptr->oldest_entry_index++ ; + hbs_cluster_change_notifier (); + /* clear the log throttle if we are updating history ok. */ ctrl.log_throttle = 0 ; } @@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void ) * ***************************************************************************/ -void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ) +int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ) { + int rc = FAIL_SOCKET_SENDTO ; ctrl.cluster.reqid = (unsigned short)reqid ; if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true )) { - ilog ("cluster state notification Reason: %s", reason.c_str()); int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes(); int bytes = sm_client_sock->write((char*)&ctrl.cluster, len); if ( bytes <= 0 ) @@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n", bytes , errno, strerror(errno)); } - hbs_cluster_dump ( ctrl.cluster ); + else + { + /* limit the string length */ + ilog ("reason: %s", reason.substr(0,80).c_str()); + hbs_cluster_dump ( ctrl.cluster ); + rc = PASS ; + } } else { wlog ("cannot send cluster info due to socket error"); } + return(rc); } /**************************************************************************** @@ -689,7 +726,7 @@ void hbs_history_save ( string 
hostname, { if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) ) { - hbs_cluster_change ("peer controller cluster event " + + hbs_cluster_change ("peer cluster delta " + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network)); } diff --git a/mtce/src/heartbeat/hbsStubs.cpp b/mtce/src/heartbeat/hbsStubs.cpp index 81326d17..474a7221 100644 --- a/mtce/src/heartbeat/hbsStubs.cpp +++ b/mtce/src/heartbeat/hbsStubs.cpp @@ -279,8 +279,14 @@ void nodeLinkClass::mnfa_enter ( void ) void nodeLinkClass::mnfa_exit ( bool force ) { force = force ; } -int send_mtc_cmd ( string & hostname, int cmd, int interface ) -{ UNUSED(hostname); UNUSED(cmd); UNUSED(interface); return PASS ; } +int send_mtc_cmd ( string & hostname, int cmd, int interface, string json_dict) +{ + UNUSED(hostname); + UNUSED(cmd); + UNUSED(interface); + UNUSED(json_dict); + return PASS ; +} int nodeLinkClass::mtcInvApi_subf_states ( string hostname, string oper_subf, diff --git a/mtce/src/hostw/scripts/hostw.logrotate b/mtce/src/hostw/scripts/hostw.logrotate index 065ccc1f..cb2f6aa4 100644 --- a/mtce/src/hostw/scripts/hostw.logrotate +++ b/mtce/src/hostw/scripts/hostw.logrotate @@ -1,16 +1,19 @@ -#daily -nodateext +# +# Copyright (c) 2020-2021 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 /var/log/hostwd.log { - nodateext - size 10M + create 0640 root root start 1 - missingok + size 10M rotate 20 compress - sharedscripts + notifempty + missingok postrotate systemctl reload syslog-ng > /dev/null 2>&1 || true endscript + delaycompress } diff --git a/mtce/src/hwmon/hwmonSensor.cpp b/mtce/src/hwmon/hwmonSensor.cpp index 475c7cd5..07f3c392 100644 --- a/mtce/src/hwmon/hwmonSensor.cpp +++ b/mtce/src/hwmon/hwmonSensor.cpp @@ -254,7 +254,7 @@ void hwmonGroup_init ( string & hostname , struct sensor_group_type * group_ptr group_ptr->actions_critical_choices.append(HWMON_ACTION_ALARM); /* Don't support reset and power cycle in AIO simplex mode */ - if ( obj_ptr->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + if ( obj_ptr->system_type != SYSTEM_TYPE__AIO__SIMPLEX ) { group_ptr->actions_critical_choices.append(","); group_ptr->actions_critical_choices.append(HWMON_ACTION_RESET); diff --git a/mtce/src/hwmon/hwmonThreads.cpp b/mtce/src/hwmon/hwmonThreads.cpp index f5305050..a572ce4c 100644 --- a/mtce/src/hwmon/hwmonThreads.cpp +++ b/mtce/src/hwmon/hwmonThreads.cpp @@ -964,6 +964,10 @@ static int _parse_redfish_sensor_data( char * json_str_ptr, thread_info_type * i { strcpy(_sample_list[samples].status, "cr"); } + else if (!strcmp (health.data(), REDFISH_SEVERITY__NONRECOVERABLE )) + { + strcpy(_sample_list[samples].status, "nr"); + } else { strcpy(_sample_list[samples].status, "na"); diff --git a/mtce/src/hwmon/hwmonThreads.h b/mtce/src/hwmon/hwmonThreads.h index f215cca8..2e5a0a3a 100644 --- a/mtce/src/hwmon/hwmonThreads.h +++ b/mtce/src/hwmon/hwmonThreads.h @@ -33,6 +33,7 @@ #define REDFISH_SEVERITY__GOOD "OK" #define REDFISH_SEVERITY__MAJOR "Warning" #define REDFISH_SEVERITY__CRITICAL "Critical" +#define REDFISH_SEVERITY__NONRECOVERABLE "NonRecoverable" #define BMC_SENSOR_DEFAULT_UNIT_TYPE_TEMP "degrees" #define BMC_SENSOR_DEFAULT_UNIT_TYPE_VOLT "Volts" diff --git a/mtce/src/hwmon/scripts/hwmon.logrotate 
b/mtce/src/hwmon/scripts/hwmon.logrotate index dd1eceee..e8ce8e66 100644 --- a/mtce/src/hwmon/scripts/hwmon.logrotate +++ b/mtce/src/hwmon/scripts/hwmon.logrotate @@ -1,28 +1,21 @@ -#daily -nodateext -start 1 -missingok -notifempty -compress -sharedscripts -postrotate - systemctl reload syslog-ng > /dev/null 2>&1 || true -endscript +# +# Copyright (c) 2020-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 /var/log/hwmond.log -{ - size 50M - rotate 5 -} - /var/log/hwmond_event.log -{ - size 50M - rotate 5 -} - /var/log/hwmond_api.log { + create 0640 root root + start 1 size 50M rotate 5 + compress + notifempty + missingok + postrotate + systemctl reload syslog-ng > /dev/null 2>&1 || true + endscript + delaycompress } diff --git a/mtce/src/lmon/scripts/lmon.logrotate b/mtce/src/lmon/scripts/lmon.logrotate index b59fa9ff..e6fe3191 100644 --- a/mtce/src/lmon/scripts/lmon.logrotate +++ b/mtce/src/lmon/scripts/lmon.logrotate @@ -1,16 +1,19 @@ -#daily -nodateext +# +# Copyright (c) 2020-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 /var/log/lmond.log { - nodateext - size 10M + create 0640 root root start 1 - missingok + size 10M rotate 20 compress - sharedscripts + notifempty + missingok postrotate systemctl reload syslog-ng > /dev/null 2>&1 || true endscript + delaycompress } diff --git a/mtce/src/maintenance/Makefile b/mtce/src/maintenance/Makefile index 7d11f6ac..767ffefe 100755 --- a/mtce/src/maintenance/Makefile +++ b/mtce/src/maintenance/Makefile @@ -54,7 +54,7 @@ BINS = mtcAgent mtcClient LDLIBS += -lstdc++ -ldaemon -lcommon -lthreadUtil -lbmcUtils -lfmcommon -lalarm -lpthread -lrt -levent -ljson-c -lamon -lcrypto -luuid INCLUDES = -I. 
-I/usr/include/mtce-daemon -I/usr/include/mtce-common INCLUDES += -I../common -I../alarm -I../heartbeat -I../hwmon -I../public -CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces +CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces -std=c++11 STATIC_ANALYSIS_TOOL = cppcheck STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0) diff --git a/mtce/src/maintenance/mtcAlarm.cpp b/mtce/src/maintenance/mtcAlarm.cpp index 8262da9f..28d1b6bc 100644 --- a/mtce/src/maintenance/mtcAlarm.cpp +++ b/mtce/src/maintenance/mtcAlarm.cpp @@ -26,6 +26,7 @@ using namespace std; #include "daemon_common.h" /* */ #include "nodeBase.h" /* */ +#include "nodeClass.h" /* */ #include "nodeTimers.h" /* */ #include "nodeUtil.h" /* */ #include "mtcAlarm.h" /* for ... this module header */ @@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname ) } } +/**************************************************************************** + * + * Name : mtcAlarm_audit + * + * Purpose : Monitor and Auto-Correct maintenance alarms + * + * Description: Query locked state alarm (raw) + * if successful + * - Query alarms + * - compare to running state + * - correct mismatches ; internal state takes precidence + * - log all alarm state changes + * + ****************************************************************************/ + +void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr ) +{ + /* + * Read locked state alarm directly to detect fm access failures. + * If successful further reads are done using a wrapper utility. 
+ */ + SFmAlarmDataT alarm_query ; + AlarmFilter alarm_filter ; + EFmErrorT rc ; + + memset(&alarm_query, 0, sizeof(alarm_query)); + memset(&alarm_filter, 0, sizeof(alarm_filter)); + snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", + LOCK_ALARM_ID); + snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s", + ENTITY_PREFIX, node_ptr->hostname.data()); + rc = fm_get_fault ( &alarm_filter, &alarm_query ); + if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND )) + { + wlog("%s alarm query failure ; code:%d", + node_ptr->hostname.c_str(), + rc ); + return ; + } + + /* With FM comms proven working lets check the other mtc alarms */ + string active_alarms = ""; + for ( int i = 0 ; i < MAX_ALARMS ; i++ ) + { + mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ; + if ( id == MTC_ALARM_ID__LOCK ) + { + /* Unexpected severity case */ + if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) + { + if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING ) + { + node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ; + + wlog("%s %s alarm mismatch ; %s -> %s", + node_ptr->hostname.c_str(), + _getIdentity(id).c_str(), + alarmUtil_getSev_str(alarm_query.severity).c_str(), + alarmUtil_getSev_str(node_ptr->alarms[id]).c_str()); + + mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK ); + + } + if (!active_alarms.empty()) + active_alarms.append(", "); + active_alarms.append(_getIdentity(id) + ":"); + active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id])); + } + /* Unexpected assertion case */ + else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( alarm_query.severity != FM_ALARM_SEVERITY_CLEAR )) + { + node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ; + + wlog("%s %s alarm mismatch ; %s -> %s", + node_ptr->hostname.c_str(), + _getIdentity(id).c_str(), + alarmUtil_getSev_str(alarm_query.severity).c_str(), + alarmUtil_getSev_str(node_ptr->alarms[id]).c_str()); + + mtcAlarm_clear ( node_ptr->hostname, id ); + } + } + else 
if (( id == MTC_ALARM_ID__CONFIG ) || + ( id == MTC_ALARM_ID__ENABLE ) || + ( id == MTC_ALARM_ID__BM ) || + ( id == MTC_ALARM_ID__CH_CONT) || + ( id == MTC_ALARM_ID__CH_COMP)) + { + EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id); + if ( severity != node_ptr->alarms[id] ) + { + ilog ("%s %s alarm mismatch ; %s -> %s", + node_ptr->hostname.c_str(), + _getIdentity(id).c_str(), + alarmUtil_getSev_str(severity).c_str(), + alarmUtil_getSev_str(node_ptr->alarms[id]).c_str()); + + if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR ) + { + mtcAlarm_clear ( node_ptr->hostname, id ); + } + else + { + mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] ); + } + } + if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR ) + { + if (!active_alarms.empty()) + active_alarms.append(", "); + active_alarms.append(_getIdentity(id) + ":"); + active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id])); + } + } + /* else don't care about other alarm ids ; logs events etc */ + } + + /* manage logging of active alarms */ + if ( !active_alarms.empty() ) + { + if ( node_ptr->active_alarms != active_alarms ) + { + ilog ("%s active alarms: %s", + node_ptr->hostname.c_str(), + active_alarms.c_str()); + + node_ptr->active_alarms = active_alarms ; + } + /* else + * do nothing because there are active alarms + * that have not changed since the last audit. + */ + } + else if ( ! 
node_ptr->active_alarms.empty() ) + { + /* clear active alarm list since there 'were' active alarms + * but there are no longer active alarms */ + node_ptr->active_alarms.clear(); + ilog ("%s no active alarms", node_ptr->hostname.c_str()); + } + /* else + * no active alarms ; don't log */ +} + /************************* A L A R M I N G **************************/ +/* Raise the specified maintenance alarm severity */ +int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity ) +{ + switch ( severity ) + { + case FM_ALARM_SEVERITY_MINOR: + return (mtcAlarm_minor(hostname,id)); + case FM_ALARM_SEVERITY_MAJOR: + return (mtcAlarm_major(hostname,id)); + case FM_ALARM_SEVERITY_CRITICAL: + return (mtcAlarm_critical(hostname,id)); + default: + return (FAIL_BAD_PARM); + } +} + /* Clear the specified hosts's maintenance alarm */ int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id ) { diff --git a/mtce/src/maintenance/mtcAlarm.h b/mtce/src/maintenance/mtcAlarm.h index 25565d4f..6e93f659 100644 --- a/mtce/src/maintenance/mtcAlarm.h +++ b/mtce/src/maintenance/mtcAlarm.h @@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id ); /** Clear the specified maintenance alarm for specific host */ int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id ); +/** Raise specified severity level alarm for the specified host */ +int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity ); + /** Assert a specified mtce alarm against the specified host with a WARNING severity level */ int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id ); diff --git a/mtce/src/maintenance/mtcBmcUtil.cpp b/mtce/src/maintenance/mtcBmcUtil.cpp index 2c76a654..817db71b 100644 --- a/mtce/src/maintenance/mtcBmcUtil.cpp +++ b/mtce/src/maintenance/mtcBmcUtil.cpp @@ -39,6 +39,26 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, { int rc = PASS ; + /* handle 'kill of in-progress' thread or 'done but not 
consumed' thread */ + if ( ! thread_idle ( node_ptr->bmc_thread_ctrl )) + { + if ( ! thread_done ( node_ptr->bmc_thread_ctrl )) + { + thread_kill ( node_ptr->bmc_thread_ctrl, + node_ptr->bmc_thread_info ); + return (RETRY); + } + else + { + mtcTimer_reset ( node_ptr->bmc_thread_ctrl.timer ); + if ( thread_done_consume ( node_ptr->bmc_thread_ctrl, + node_ptr->bmc_thread_info ) != PASS ) + { + return (RETRY); + } + } + } + node_ptr->bmc_thread_info.command = command ; /* Update / Setup the BMC access credentials */ @@ -437,6 +457,13 @@ bmc_command_recv_cleanup: if ( rc != RETRY ) { + ilog ("%s %s recv '%s' command (%s) (rc:%d)", + node_ptr->hostname.c_str(), + node_ptr->bmc_thread_ctrl.name.c_str(), + bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(), + bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str(), + rc); + node_ptr->bmc_thread_ctrl.done = true ; node_ptr->bmc_thread_ctrl.retries = 0 ; node_ptr->bmc_thread_ctrl.id = 0 ; diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index d3793553..41e37213 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -20,7 +20,7 @@ #include #include -#include /* for ... unix domain sockets */ +#include /* for ... unix domain sockets */ #include #include #include @@ -29,8 +29,8 @@ #include #include #include -#include /* for the list of conf file names */ - +#include /* for ... list of conf file names */ +#include /* for ... 
sync */ using namespace std; @@ -70,11 +70,15 @@ void stop_pmon( void ) { /* max pipe command response length */ #define PIPE_COMMAND_RESPON_LEN (100) + + ilog("Stopping collectd."); + int rc = system("/usr/local/sbin/pmon-stop collectd"); + sleep (2); ilog("Stopping pmon to prevent process recovery during shutdown"); for ( int retry = 0 ; retry < 5 ; retry++ ) { char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ; - int rc = system("/usr/bin/systemctl stop pmon"); + rc = system("/usr/bin/systemctl stop pmon"); sleep(2); /* confirm pmon is no longer active */ @@ -204,6 +208,24 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) mlog1 ("mtcAlive request received (%s network)\n", interface_name.c_str()); return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface )); } + else if ( msg.cmd == MTC_MSG_INFO ) + { + mlog1("mtc 'info' message received (%s network)\n", interface_name.c_str()); + load_mtcInfo_msg ( msg ); + return ( PASS ); /* no ack for this message */ + } + else if ( msg.cmd == MTC_CMD_SYNC ) + { + ilog ("mtc '%s' message received (%s network)\n", + get_mtcNodeCommand_str(msg.cmd), + interface_name.c_str()); + + ilog ("Sync Start"); + sync (); + ilog ("Sync Done"); + + return ( PASS ); /* no ack for this message */ + } else if ( msg.cmd == MTC_MSG_LOCKED ) { /* Only recreate the file if its not already present */ @@ -603,7 +625,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } /** Send an event to the mtcAgent **/ -int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr ) +int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr ) { mtc_message_type event ; @@ -619,6 +641,24 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na /* We don't use the buffer for mtce events to remove it from the size */ bytes = ((sizeof(mtc_message_type))-(BUF_SIZE)); } + else if ( cmd == MTC_EVENT_MONITOR_READY ) + { + string 
event_info = "{\"" ; + event_info.append(MTC_JSON_INV_NAME); + event_info.append("\":\""); + event_info.append(get_hostname()); + event_info.append("\",\""); + event_info.append(MTC_JSON_SERVICE); + event_info.append("\":\""); + event_info.append(MTC_SERVICE_MTCCLIENT_NAME ); + event_info.append("\"}"); + + size_t len = event_info.length()+1 ; + snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header()); + snprintf ( &event.buf[0], len, "%s", event_info.data()); + bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); + ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME); + } else if (( cmd == MTC_EVENT_AVS_CLEAR ) || ( cmd == MTC_EVENT_AVS_MAJOR ) || ( cmd == MTC_EVENT_AVS_CRITICAL )) @@ -666,7 +706,7 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na { if ( bytes == 0 ) { - slog ("message send failed ; message size=0 for cmd:%d is 0\n", event.cmd ); + slog ("message send failed ; message size=0 for cmd:0x%x is 0\n", event.cmd ); rc = FAIL_NO_DATA ; } else if ((rc = sock_ptr->mtc_client_tx_socket->write((char*)&event.hdr[0], bytes))!= bytes ) @@ -912,15 +952,18 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa } /* Send to controller-1 cluster address */ - if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) && - ( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true )) + if ( get_ctrl_ptr()->system_type != SYSTEM_TYPE__AIO__SIMPLEX ) { - print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false ); - sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ; - } - else - { - elog("mtc_client_tx_socket_c1_clstr not ok"); + if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) && + ( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true )) + { + print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false ); + sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes 
) ; + } + else + { + elog("mtc_client_tx_socket_c1_clstr not ok"); + } } } else @@ -933,32 +976,59 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa return (PASS) ; } -/* Accelerated Virtual Switch 'events' socket - * - for receiving data port state change event - * Event strings are - * - * {"type":"port-state", "severity":"critical|major|clear"} - * - * type:port-state - the provider network data port status has changed to the supplied fault severity - * - * severity: - * critical - port has failed and is not part of an aggregate or is the last port in an aggregate (degrade, disable services) - * major - port has failed and is part of an aggregate with other inservice-ports (degrade only) - * clear - port has recovered from a failed state and is operational (clear degrade, enable services) - * - * NOTE: The port status can transition from any of the above states to any other state. - * - * The neutron agent monitors the vswitch ports at a 2 second interval. - * If a port changes link state during the polling period, it will - * raise/clear the alarm, but now also calculates the impact of that port - * failure on the provider network data interface. - * - * The overall aggregated state across all provider network interfaces will - * be reported to maintenance when ports enter a link down or up state. - * The agent will also periodically send the current provider network port - * status to maintenance every 30 seconds. 
- * - */ +int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port) +{ + mtc_message_type msg ; + int bytes = 0 ; + MEMSET_ZERO (msg); + snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header()); + msg.cmd = cmd ; + + switch ( cmd ) + { + case MTC_CMD_SYNC: + { + ilog ("Sending '%s' command to %s:%s:%d", + get_mtcNodeCommand_str(cmd), + hostname.c_str(), + address.c_str(), port); + + msg.num = 0 ; + + /* buffer not used in this message */ + bytes = ((sizeof(mtc_message_type))-(BUF_SIZE)); + + break ; + } + default: + { + slog("Unsupported command ; %s:%d", get_mtcNodeCommand_str(cmd), cmd ); + return (FAIL_BAD_CASE); + } + } + int rc = FAIL ; + + /* Send to controller floating address */ + if (( sock_ptr->mtc_client_tx_socket ) && + ( sock_ptr->mtc_client_tx_socket->sock_ok() == true )) + { + print_mtc_message ( hostname, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); + rc = sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], bytes, address.data(), port ) ; + if ( 0 >= rc ) + { + elog("failed to send command to mtcClient (%d) (%d:%s)", rc, errno, strerror(errno)); + rc = FAIL_SOCKET_SENDTO ; + } + else + rc = PASS ; + } + else + { + elog("mtc_client_tx_socket not ok"); + rc = FAIL_BAD_STATE ; + } + return (rc) ; +} int mtcCompMsg_testhead ( void ) { diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 6a820ed1..5a7be7e9 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -443,6 +443,34 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT ); return (PASS); } + else if ( service == MTC_SERVICE_MTCCLIENT_NAME ) + { + ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME); + + /* if this ready event is from the mtcClient of a + * controller that has valid bmc access info then + * build the 'peer controller kill' mtcInfo and + * 
send it to that mtcClient */ + if ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE ) + { + string bm_pw = obj_ptr->get_bm_pw ( hostname ) ; + if ( !bm_pw.empty() && ( bm_pw != NONE )) + { + string bm_un = obj_ptr->get_bm_un ( hostname ) ; + string bm_ip = obj_ptr->get_bm_ip ( hostname ) ; + if (( hostUtil_is_valid_username ( bm_un )) && + ( hostUtil_is_valid_ip_addr ( bm_ip ))) + { + send_mtc_cmd ( hostname, + MTC_MSG_INFO, + MGMNT_INTERFACE, + obj_ptr->build_mtcInfo_dict ( + MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO)); + } + } + } + return (PASS); + } if ( service == MTC_SERVICE_HWMOND_NAME ) { std::list::iterator temp ; @@ -578,11 +606,12 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, return (rc); } -int send_mtc_cmd ( string & hostname, int cmd , int interface ) +int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict ) { int rc = FAIL ; bool force = false ; mtc_message_type mtc_cmd ; + string data = "" ; mtc_socket_type * sock_ptr = get_sockPtr (); memset (&mtc_cmd,0,sizeof(mtc_message_type)); @@ -592,6 +621,16 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface ) switch ( cmd ) { + case MTC_MSG_INFO: + { + snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() ); + mtc_cmd.cmd = cmd ; + mtc_cmd.num = 0 ; + data = "{\"mtcInfo\":" + json_dict + "}"; + ilog("%s mtc info update", hostname.c_str()); + rc = PASS ; + break ; + } case MTC_REQ_MTCALIVE: { snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() ); @@ -689,11 +728,20 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface ) * Note: the minus 1 is to overwrite the null */ snprintf ( &mtc_cmd.hdr[MSG_HEADER_SIZE-1], MSG_HEADER_SIZE, "%s", obj_ptr->get_hostIfaceMac(hostname, MGMNT_IFACE).data()); - string data = "{\"address\":\""; - data.append(obj_ptr->my_float_ip) ; - data.append("\",\"interface\":\""); - data.append(get_iface_name_str(interface)); - data.append("\"}"); + /* If data is empty then at least 
add where the message came from */ + if ( data.empty() ) + { + data = "{\"address\":\""; + data.append(obj_ptr->my_float_ip) ; + data.append("\",\"interface\":\""); + data.append(get_iface_name_str(interface)); + data.append("\"}"); + } + else + { + ; /* data is already pre loaded by the command case above */ + } + /* copy data into message buffer */ snprintf ( &mtc_cmd.buf[0], data.length()+1, "%s", data.data()); bytes = (sizeof(mtc_message_type)-(BUF_SIZE-(data.length()+1))); @@ -1176,7 +1224,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY ) { /* no heartbeating in simplex mode */ - if ( obj_ptr->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + if ( obj_ptr->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) { return (PASS); } @@ -1214,13 +1262,68 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) { elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str()); } - /* Send the start event to the heartbeat service for all enabled hosts */ + /* Consider sending the 'start' request to the heartbeat service + * for all enabled hosts. */ if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) && ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) || (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ))) { - send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); + /* However, bypass sending heartbeat 'start' for nodes that + * are not ready to heartbeat; enabling, configuring, testing. + * Such cases are if a host is: + * + * 1. running the add_handler or + * 2. running the enable_handler or + * 3. running the enable_subf_handler or + * 4. not configured or + * 5. 
not tested (goenabled not complete) + * + */ + mtc_nodeAdminAction_enum current_action = + obj_ptr->get_adminAction (hostname); + if (( current_action != MTC_ADMIN_ACTION__ADD ) && + ( current_action != MTC_ADMIN_ACTION__ENABLE ) && + ( current_action != MTC_ADMIN_ACTION__ENABLE_SUBF )) + { + int mtce_flags = obj_ptr->get_mtce_flags(hostname); + if (( mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) && + ( mtce_flags & MTC_FLAG__I_AM_HEALTHY ) && + ( mtce_flags & MTC_FLAG__MAIN_GOENABLED )) + { + if (( obj_ptr->system_type != SYSTEM_TYPE__NORMAL ) && + ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE )) + { + /* If its an AIO then its worker subfunction + * needs to have been be configured and tested. */ + if (( mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) && + ( mtce_flags & MTC_FLAG__SUBF_GOENABLED )) + { + ilog("%s heartbeat start (AIO controller)", + hostname.c_str()); + send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); + } + else + { + wlog ("%s not heartbeat ready (subf) (oob:%x)", + hostname.c_str(), + mtce_flags); + } + } + else + { + ilog("%s heartbeat start (from ready event)", + hostname.c_str()); + send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); + } + } + else + { + wlog ("%s not heartbeat ready (main) (oob:%x)", + hostname.c_str(), + mtce_flags); + } + } } } ilog ("%s %s inventory push ... done", diff --git a/mtce/src/maintenance/mtcInvApi.cpp b/mtce/src/maintenance/mtcInvApi.cpp index 0743455b..770c580f 100755 --- a/mtce/src/maintenance/mtcInvApi.cpp +++ b/mtce/src/maintenance/mtcInvApi.cpp @@ -974,7 +974,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no else avail = " " ; - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { if ( ! 
oper_subf.empty() ) { @@ -1016,7 +1016,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no this->sysinvEvent.payload.erase(len-1,1); this->sysinvEvent.payload.append ( "]"); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { ilog ("%s %s-%s-%s %s-%s\n", node_ptr->hostname.c_str(), diff --git a/mtce/src/maintenance/mtcNodeComp.cpp b/mtce/src/maintenance/mtcNodeComp.cpp index 6e58c7fe..9db192a8 100644 --- a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -43,9 +43,9 @@ #include #include #include -//#include /* for ... syslog */ #include #include +#include /* for ... json_tokener_parse */ using namespace std; @@ -56,6 +56,10 @@ using namespace std; #include "nodeBase.h" /* for ... Common Definitions */ #include "nodeTimers.h" /* fpr ... Timer Service */ #include "nodeUtil.h" /* for ... Common Utilities */ +#include "hostUtil.h" /* for ... hostUtil_is_valid_... */ +#include "jsonUtil.h" /* for ... jsonUtil_get_key_value_string */ +#include "bmcUtil.h" /* for ... bmcUtil_accessInfo_type */ +#include "ipmiUtil.h" /* for ... ipmiUtil_reset_host_now */ #include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */ #include "mtcNodeMsg.h" /* for ... common maintenance messaging */ #include "mtcNodeComp.h" /* for ... this module header */ @@ -96,7 +100,7 @@ string get_hostname ( void ) * Daemon Configuration Structure - The allocated struct * @see daemon_common.h for daemon_config_type struct format. 
*/ -static daemon_config_type mtc_config ; +static daemon_config_type mtc_config ; daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; } /** @@ -106,6 +110,8 @@ daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; } static mtc_socket_type mtc_sock ; static mtc_socket_type * sock_ptr ; +static bmcUtil_accessInfo_type peer_controller = {"none","none","none","none","none"}; +static bmcUtil_accessInfo_type this_controller = {"none","none","none","none","none"}; int run_goenabled_scripts ( string type ); @@ -138,6 +144,16 @@ void timer_handler ( int sig, siginfo_t *si, void *uc) mtcTimer_stop_int_safe ( ctrl.hostservices.timer ); ctrl.hostservices.timer.ring = true ; } + else if ( *tid_ptr == ctrl.peer_ctrlr_reset.sync_timer.tid ) + { + ctrl.peer_ctrlr_reset.sync_timer.ring = true ; + mtcTimer_stop_int_safe ( ctrl.peer_ctrlr_reset.sync_timer ); + } + else if ( *tid_ptr == ctrl.peer_ctrlr_reset.audit_timer.tid ) + { + /* use auto restart */ + ctrl.peer_ctrlr_reset.audit_timer.ring = true ; + } else { mtcTimer_stop_tid_int_safe ( tid_ptr ); @@ -207,9 +223,8 @@ void daemon_exit ( void ) exit (0) ; } - /* Startup config read */ -static int mtc_config_handler ( void * user, +static int mtc_config_handler ( void * user, const char * section, const char * name, const char * value) @@ -236,11 +251,14 @@ static int mtc_config_handler ( void * user, config_ptr->failsafe_shutdown_delay = atoi(value); ilog ("Shutdown TO : %d secs\n", config_ptr->failsafe_shutdown_delay ); } - else + if (( ctrl.nodetype & CONTROLLER_TYPE ) && + (MATCH("client", "sync_b4_peer_ctrlr_reset"))) { - return (PASS); + ctrl.peer_ctrlr_reset.sync = atoi(value); + ilog("SyncB4 Reset: %s", + ctrl.peer_ctrlr_reset.sync ? 
"Yes" : "No" ); } - return (FAIL); + return (PASS); } /* Read the mtc.ini file and load control */ @@ -431,7 +449,7 @@ void setup_clstr_tx_sockets ( void ) mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok(false); } } - if ( ctrl.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + if ( ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX ) { dlog ("setup of %s TX\n", CONTROLLER_1_CLUSTER_HOST); @@ -946,6 +964,65 @@ void _manage_goenabled_tests ( void ) _scripts_cleanup (ctrl.active_script_set) ; } +int issue_reset_and_cleanup ( void ) +{ + int rc = FAIL ; + const char peer_ctrlr [] = "Peer controller reset" ; + + ilog("SM %s request", peer_ctrlr ); + /* check creds */ + if (( hostUtil_is_valid_ip_addr ( peer_controller.bm_ip ) == false ) || + ( hostUtil_is_valid_username ( peer_controller.bm_un ) == false ) || + ( hostUtil_is_valid_pw ( peer_controller.bm_pw ) == false )) + { + elog("%s cannot reset peer BMC host at %s due to invalid credentials", + ctrl.hostname, peer_controller.bm_ip.c_str()); + return (rc); + } + + /* create output filename - no need to delete after operation */ + string output_filename = bmcUtil_create_data_fn ( ctrl.hostname, + BMC_RESET_CMD_FILE_SUFFIX, + BMC_PROTOCOL__IPMITOOL ); + if ( output_filename.empty() ) + { + elog("%s ; failed to create output filename", peer_ctrlr); + rc = FAIL_STRING_EMPTY ; + } + else if ( ipmiUtil_reset_host_now ( ctrl.hostname, + peer_controller, + output_filename ) == PASS ) + { + string result = daemon_get_file_str ( output_filename.data() ); + ilog("%s succeeded", peer_ctrlr); + + /* don't fail the operation if the result is unexpected ; but log it */ + if ( result.compare( IPMITOOL_POWER_RESET_RESP ) ) + { + dlog("... 
but reset command output was unexpected ; %s", + result.c_str()); + } + rc = PASS ; + } + else + { + elog("%s failed", peer_ctrlr); + rc = FAIL_OPERATION ; + } + + if ( rc == PASS ) + { + /* give the host a chance to reset before + * telling SM the reset is done */ + sleep (2) ; + + /* Don't want to remove the file if the reset was not successful */ + dlog("removing %s", RESET_PEER_NOW ); + daemon_remove_file ( RESET_PEER_NOW ); + } + return (rc); +} + /* The main service loop */ int daemon_init ( string iface, string nodetype_str ) @@ -963,6 +1040,7 @@ int daemon_init ( string iface, string nodetype_str ) ctrl.subfunction = 0 ; ctrl.system_type = daemon_system_type (); ctrl.clstr_iface_provisioned = false ; + ctrl.peer_ctrlr_reset.sync = false ; /* convert node type to integer */ ctrl.nodetype = get_host_function_mask ( nodetype_str ) ; @@ -1018,6 +1096,13 @@ int daemon_init ( string iface, string nodetype_str ) mtcTimer_init ( ctrl.goenabled.timer, &ctrl.hostname[0], "goenable timer" ); mtcTimer_init ( ctrl.hostservices.timer, &ctrl.hostname[0], "host services timer" ); + /* initialize peer controller reset feature */ + mtcTimer_init ( ctrl.peer_ctrlr_reset.audit_timer, &ctrl.hostname[0], "peer ctrlr reset audit timer" ), + mtcTimer_init ( ctrl.peer_ctrlr_reset.sync_timer, &ctrl.hostname[0], "peer ctrlr reset sync timer" ), + ctrl.peer_ctrlr_reset.sync_timer.ring = false ; + ctrl.peer_ctrlr_reset.audit_timer.ring = false ; + ctrl.peer_ctrlr_reset.audit_period = PEER_CTRLR_AUDIT_PERIOD ; + /* initialize the script group control structures */ script_ctrl_init ( &ctrl.goenabled ); script_ctrl_init ( &ctrl.hostservices ); @@ -1073,6 +1158,17 @@ void daemon_service_run ( void ) /* Send first mtcAlive ASAP */ mtcTimer_start ( ctrl.timer, timer_handler, 1 ); + /* Monitor for peer controller reset requests when this + * daemon runs on a controller */ + if ( ctrl.nodetype & CONTROLLER_TYPE ) + { + mtcTimer_start ( ctrl.peer_ctrlr_reset.audit_timer, + timer_handler, + 
ctrl.peer_ctrlr_reset.audit_period ); + } + + mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL ); + /* lets go select so that the sock does not go crazy */ dlog ("%s running main loop with %d msecs socket timeout\n", &ctrl.hostname[0], (SOCKET_WAIT/1000) ); @@ -1305,8 +1401,20 @@ void daemon_service_run ( void ) socket_reinit = true ; } - /* Clstr Tx */ - else if (( ctrl.clstr_iface_provisioned == true ) && + /* Clstr Tx ; AIO SX */ + else if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) && + ( ctrl.clstr_iface_provisioned == true ) && + (( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) || + ( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false ))) + { + wlog ("calling setup_clstr_tx_sockets (auto-recovery)\n"); + setup_clstr_tx_sockets(); + socket_reinit = true ; + } + + /* Clstr Tx ; not AIO SX */ + else if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) && + ( ctrl.clstr_iface_provisioned == true ) && (( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) || ( mtc_sock.mtc_client_tx_socket_c1_clstr == NULL ) || ( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false ) || @@ -1384,7 +1492,51 @@ void daemon_service_run ( void ) } } } - + /* service controller specific audits */ + if ( ctrl.nodetype & CONTROLLER_TYPE ) + { + /* peer controller reset service audit */ + if ( ctrl.peer_ctrlr_reset.audit_timer.ring ) + { + if ( daemon_is_file_present ( RESET_PEER_NOW ) ) + { + if ( ctrl.peer_ctrlr_reset.sync ) + { + if ( ctrl.peer_ctrlr_reset.sync_timer.ring ) + { + issue_reset_and_cleanup (); + ctrl.peer_ctrlr_reset.sync_timer.ring = false ; + } + else if ( ctrl.peer_ctrlr_reset.sync_timer.tid == NULL ) + { + if ( send_mtcClient_cmd ( &mtc_sock, + MTC_CMD_SYNC, + peer_controller.hostname, + peer_controller.host_ip, + mtc_config.mtc_rx_mgmnt_port) == PASS ) + { + mtcTimer_start ( ctrl.peer_ctrlr_reset.sync_timer, timer_handler, MTC_SECS_10 ); + ilog("... 
waiting for peer controller to sync - %d secs", MTC_SECS_10); + } + else + { + elog("failed to send 'sync' command to peer controller mtcClient"); + ctrl.peer_ctrlr_reset.sync_timer.ring = true ; + } + } + else + { + ; /* wait longer */ + } + } + else + { + issue_reset_and_cleanup (); + } + } + ctrl.peer_ctrlr_reset.audit_timer.ring = false ; + } + } daemon_signal_hdlr (); } daemon_exit(); @@ -1573,7 +1725,7 @@ int run_hostservices_scripts ( unsigned int cmd ) /* For the stop command we need the mtcClient to run both controller and - * worker stop services if we are on a CPE system. + * worker stop services if we are on a AIO system. * This saves the mtcAgent from having to issue and manage 2 commands, * one for controller and 1 for worker */ if ( ctrl.system_type != SYSTEM_TYPE__NORMAL ) @@ -1750,7 +1902,6 @@ void daemon_sigchld_hdlr ( void ) } default: { - wlog ("child handler running with no active script set (%d)\n", ctrl.active_script_set ); return ; } } @@ -1820,6 +1971,84 @@ void daemon_sigchld_hdlr ( void ) } } +/*************************************************************************** + * + * Name : load_mtcInfo_msg + * + * Description: Extract the mtc info from the MTC_MSG_INFO message. + * + * Assumptions: So far only the peer controller reset feature uses this. 
+ * + * Returns : Nothing + * + ***************************************************************************/ + +void load_mtcInfo_msg ( mtc_message_type & msg ) +{ + if ( ctrl.nodetype & CONTROLLER_TYPE ) + { + mlog1("%s", &msg.buf[0]); + struct json_object *_obj = json_tokener_parse( &msg.buf[0] ); + if ( _obj ) + { + if ( strcmp(&ctrl.hostname[0], CONTROLLER_0 )) + peer_controller.hostname = CONTROLLER_0 ; + else + peer_controller.hostname = CONTROLLER_1 ; + + struct json_object *info_obj = (struct json_object *)(NULL); + json_bool json_rc = json_object_object_get_ex( _obj, + "mtcInfo", + &info_obj ); + if ( ( json_rc == TRUE ) && ( info_obj )) + { + struct json_object *ctrl_obj = (struct json_object *)(NULL); + json_bool json_rc = + json_object_object_get_ex( info_obj, + peer_controller.hostname.data(), + &ctrl_obj ); + + if (( json_rc == TRUE ) && ( ctrl_obj )) + { + peer_controller.host_ip = jsonUtil_get_key_value_string(ctrl_obj, MTC_JSON_INV_HOSTIP) ; + peer_controller.bm_ip = jsonUtil_get_key_value_string(ctrl_obj, MTC_JSON_INV_BMIP) ; + peer_controller.bm_un = jsonUtil_get_key_value_string(ctrl_obj, "bm_un"); + peer_controller.bm_pw = jsonUtil_get_key_value_string(ctrl_obj, "bm_pw"); + + /* log the mc info but not the bmc password ; only + * indicate that it looks 'ok' or 'is 'none' */ + ilog ("%s is my peer [host:%s bmc:%s:%s:%s]", + peer_controller.hostname.c_str(), + peer_controller.host_ip.c_str(), + peer_controller.bm_ip.c_str(), + peer_controller.bm_un.c_str(), + hostUtil_is_valid_pw(peer_controller.bm_pw) ? 
"ok":"none"); + } + else + { + wlog("peer mtcInfo missing (rc:%d) ; %s", + json_rc, &msg.buf[0]); + } + } + else + { + wlog("mtcInfo label parse error (rc:%d) ; %s", + json_rc, &msg.buf[0]); + } + json_object_put(_obj); + } + else + { + wlog("message buffer tokenize error ; %s", &msg.buf[0]); + } + } + else + { + slog("%s got mtcInfo ; unexpected for this nodetype", ctrl.hostname); + } +} + + /* Push daemon state to log file */ void daemon_dump_info ( void ) { @@ -1853,13 +2082,13 @@ int daemon_run_testhead ( void ) * STAGE 1: some test ************************************************/ printf ( "| Test %d : Maintenance Service Test ............. ", stage ); - if ( rc != PASS ) + if ( rc != PASS ) { FAILED_STR ; rc = FAIL ; } else - PASSED ; + PASSED ; printf ("+---------------------------------------------------------+\n"); return PASS ; diff --git a/mtce/src/maintenance/mtcNodeComp.h b/mtce/src/maintenance/mtcNodeComp.h index 612144f8..190500c6 100644 --- a/mtce/src/maintenance/mtcNodeComp.h +++ b/mtce/src/maintenance/mtcNodeComp.h @@ -17,6 +17,10 @@ #include #include +using namespace std; + +#include "nodeTimers.h" /* for ... 
Timer Service */ + /** Compute Config mask */ #define CONFIG_CLIENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT |\ CONFIG_CLIENT_MTC_MGMNT_PORT |\ @@ -59,6 +63,22 @@ typedef struct } script_ctrl_type ; void script_ctrl_init ( script_ctrl_type * script_ctrl_ptr ); +/* peer controller reset control structure and associated definitions */ + +/* This is a flag file set by SM when SM wants maintanence to perform a + * BMC reset of the other (peer) controller */ +#define RESET_PEER_NOW "/var/run/.sm_reset_peer" + +#define PEER_CTRLR_AUDIT_PERIOD (2) +typedef struct +{ + struct + mtc_timer sync_timer ; + mtc_timer audit_timer ; + int audit_period ; + bool sync ; +} peer_ctrlr_reset_type ; + typedef struct { char hostname [MAX_HOST_NAME_SIZE+1]; @@ -76,7 +96,7 @@ typedef struct unsigned int function ; unsigned int subfunction ; - struct mtc_timer timer ; /* mtcAlive timer */ + struct mtc_timer timer ; /* mtcAlive timer */ bool clstr_iface_provisioned ; @@ -102,6 +122,7 @@ typedef struct /* Where to send events */ string mtcAgent_ip ; + peer_ctrlr_reset_type peer_ctrlr_reset; } ctrl_type ; ctrl_type * get_ctrl_ptr ( void ); @@ -109,5 +130,6 @@ ctrl_type * get_ctrl_ptr ( void ); bool is_subfunction_worker ( void ); int run_goenabled_scripts ( mtc_socket_type * sock_ptr , string requestor ); int run_hostservices_scripts ( unsigned int cmd ); +void load_mtcInfo_msg ( mtc_message_type & msg ); #endif diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index 6732ca88..a52b67e3 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1187,15 +1187,6 @@ int _self_provision ( void ) if ( my_identity.name == record_info.name ) { - /* If the active controller was 'locked' and is being auto-corrected - * to 'unlocked' then ensure that there is no locked alarm set for it */ - if ( record_info.admin != "locked" ) - { - mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK ); - /* this is not required because its already 
inited to clear */ - // node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR - } - if ( my_identity.mac != record_info.mac ) { wlog ("%s mac address mismatch (%s - %s)\n", @@ -1326,6 +1317,7 @@ void nodeLinkClass::fsm ( void ) daemon_signal_hdlr (); mtcHttpSvr_look ( mtce_event ); } + mtcInv.mtcInfo_handler(); } } @@ -1515,9 +1507,9 @@ void daemon_service_run ( void ) if ( ts.tv_sec < MTC_MINS_15 ) { - /* CPE DOR window is much greater in CPE since heartbeat - * cannot start until the inactive CPE has run both manifests */ - int timeout = DEFAULT_DOR_MODE_CPE_TIMEOUT ; + /* AIO DOR window is much greater in AIO since heartbeat + * cannot start until the inactive AIO has run both manifests */ + int timeout = DEFAULT_DOR_MODE_AIO_TIMEOUT ; /* override the timeout to a smaller value for normal system */ if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL ) @@ -1601,7 +1593,7 @@ void daemon_service_run ( void ) if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL ) mtc_sock.waitd.tv_usec = MTCAGENT_SELECT_TIMEOUT ; else - mtc_sock.waitd.tv_usec = MTCAGENT_CPE_SELECT_TIMEOUT ; + mtc_sock.waitd.tv_usec = MTCAGENT_AIO_SELECT_TIMEOUT ; /* This is used as a delay up to select_timeout */ rc = select( socks.back()+1, &mtc_sock.readfds, NULL, NULL, &mtc_sock.waitd); diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index af5e9a26..98c0e8a4 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -63,6 +63,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) /* Monitor and Manage active threads */ thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info ); + if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL ) + { + /* do nothing while thread is being killed */ + return RETRY ; + } /* manage the host connected state and board management alarms */ nodeLinkClass::bmc_handler ( node_ptr ); @@ -310,10 +315,10 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * 
node_ptr ) } /**************************************************************************** - * No Op: Do nothing for this Healthy Enabled Locked CPE Simplex Host + * No Op: Do nothing for this Healthy Enabled Locked AIO Simplex Host **************************************************************************** */ - else if (( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) && + else if (( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) && ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) && ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )) { diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index de5ae2a4..49ac2684 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -481,7 +481,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ) { bool aio = false ; - if ( SIMPLEX_CPE_SYSTEM ) + if ( SIMPLEX_AIO_SYSTEM ) aio = true ; else aio = false ; @@ -525,7 +525,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } } mtcInvApi_update_states_now ( node_ptr, "unlocked", "disabled" , "offline", "disabled", "offline" ); - mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_CPE_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG ); + mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_AIO_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG ); wlog ("%s unlocking %s with reboot\n", my_hostname.c_str(), @@ -546,7 +546,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) * Condition 1: While there is no in-service backup controller * to swact to. In this case the ctive controller * - is only degraded to avoid a system outage. 
- * - the CPE subfunction is failed + * - the AIO subfunction is failed * - worker SubFunction Alarm is raised * - Enable alarm is raised * - A process monitor alarm may also be raised if @@ -648,7 +648,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } else { - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { /* Raise Critical Compute Function Alarm */ alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ); @@ -661,7 +661,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->graceful_recovery_counter = 0 ; node_ptr->health_threshold_counter = 0 ; - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { node_ptr->inservice_failed_subf = true ; subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED, @@ -1358,7 +1358,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) * have a worker function and the heartbeat for those hosts * are started at the end of the subfunction handler. 
*/ if (( THIS_HOST ) || - (( CPE_SYSTEM ) && ( is_controller(node_ptr)) )) + (( AIO_SYSTEM ) && ( is_controller(node_ptr)) )) { enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE ); } @@ -1523,8 +1523,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) if ( is_controller(node_ptr) ) { /* Defer telling SM the controller state if - * this is a CPE and this is the only controller */ - if ( CPE_SYSTEM && ( num_controllers_enabled() > 0 )) + * this is a AIO and this is the only controller */ + if ( AIO_SYSTEM && ( num_controllers_enabled() > 0 )) { wlog ("%s deferring SM enable notification till subfunction-enable complete\n", node_ptr->hostname.c_str()); @@ -1555,7 +1555,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) enableStageChange ( node_ptr, MTC_ENABLE__START ); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr))) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr))) { ilog ("%s running worker sub-function enable handler\n", node_ptr->hostname.c_str()); mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF ); @@ -1637,9 +1637,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->http_retries_cur = 0 ; node_ptr->unknown_health_reported = false ; - plog ("%s %sGraceful Recovery (uptime was %d)\n", + plog ("%s %sGraceful Recovery (%d) (uptime was %d)\n", node_ptr->hostname.c_str(), node_ptr->mnfa_graceful_recovery ? "MNFA " : "", + node_ptr->graceful_recovery_counter, node_ptr->uptime ); /* Cancel any outstanding timers */ @@ -1660,7 +1661,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) * 2. Setting the node operational state to Disabled * 3. Setting the Enable action */ - if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES ) + node_ptr->graceful_recovery_counter++ ; + if ( node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES ) { /* gate off further mtcAlive messaging timme the offline * handler runs. 
This prevents stale messages from making it @@ -1772,10 +1774,11 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else if ( node_ptr->mnfa_graceful_recovery == true ) { - if ( node_ptr->uptime > MTC_MINS_10 ) + if ( node_ptr->uptime > MTC_MINS_15 ) { /* did not reboot case */ - wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str()); + wlog ("%s Connectivity Recovered ; host did not reset (uptime:%d)\n", + node_ptr->hostname.c_str(), node_ptr->uptime); wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str()); wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str()); @@ -1788,7 +1791,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else { /* did reboot case */ - wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str()); + wlog ("%s Connectivity Recovered ; host has reset (uptime:%d)\n", + node_ptr->hostname.c_str(), node_ptr->uptime); ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str()); ilog ("%s ... without additional reboot %s\n", node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" ); @@ -1806,12 +1810,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) break ; } } - else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save )) + else if ( node_ptr->uptime > MTC_MINS_15 ) { /* did not reboot case */ - wlog ("%s Connectivity Recovered ; host did not reset%s\n", + wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)", node_ptr->hostname.c_str(), - node_ptr->was_dor_recovery_mode ? " (DOR)" : "" ); + node_ptr->was_dor_recovery_mode ? " (DOR)" : "", + node_ptr->uptime); wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str()); wlog ("%s ... 
with no affect to host services\n", node_ptr->hostname.c_str()); @@ -1875,7 +1880,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED ); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED ); @@ -1905,7 +1910,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { int timeout = 0 ; - /* Set the FSM task state to booting */ + /* Set the FSM task state to 'Graceful Recovery Wait' */ node_ptr->uptime = 0 ; mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT ); @@ -2266,7 +2271,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { /* The active controller would never get/be here but * if it did then just fall through to change state. */ - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { /* Here we need to run the sub-fnction goenable and start * host services if this is the other controller in a AIO @@ -2442,10 +2447,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) } else /* success path */ { - /* allow the fsm to wait for up to 1 minute for the - * hbsClient's ready event before starting heartbeat + /* allow the fsm to wait for up to 'worker config timeout' + * for the hbsClient's ready event before starting heartbeat * test. 
*/ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT ); recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ); } break ; @@ -2502,6 +2507,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->mtcTimer.ring == true ) { + ilog ("%s heartbeating", node_ptr->hostname.c_str()); /* if heartbeat is not working then we will * never get here and enable the host */ recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE ); @@ -2510,7 +2516,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_RECOVERY__STATE_CHANGE: { - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { /* Set node as unlocked-enabled */ subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED, @@ -2555,7 +2561,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else if ( rc == PASS ) { /* Start Graceful Recovery */ - recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ; + recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE ) ; break ; } else if ( rc == FAIL_WORKQ_TIMEOUT ) @@ -2571,51 +2577,37 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) nodeLinkClass::force_full_enable ( node_ptr ); break ; } - case MTC_RECOVERY__ENABLE_START: + case MTC_RECOVERY__ENABLE: { - /* Create the recovery enable timer. This timer is short. 
- * A node need to stay enabled with the hartbeat service - * running for a period of time before declaring it enabled */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); - - recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ; - break; - } - case MTC_RECOVERY__ENABLE_WAIT: - { - /* When this timer fires the host has been up for enough time */ - if ( node_ptr->mtcTimer.ring == true ) + if ( is_controller(node_ptr) ) { - if ( is_controller(node_ptr) ) + if ( mtcSmgrApi_request ( node_ptr, + CONTROLLER_ENABLED, + SMGR_MAX_RETRIES ) != PASS ) { - if ( mtcSmgrApi_request ( node_ptr, - CONTROLLER_ENABLED, - SMGR_MAX_RETRIES ) != PASS ) - { - wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n", - node_ptr->hostname.c_str()); - } + wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager ; allowing enable\n", + node_ptr->hostname.c_str()); } - /* Node Has Recovered */ - node_ptr->graceful_recovery_counter = 0 ; - recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); - node_ptr->health_threshold_counter = 0 ; - node_ptr->enabled_count++ ; - node_ptr->http_retries_cur = 0 ; - - doneQueue_purge ( node_ptr ); - if ( node_ptr->was_dor_recovery_mode ) - { - report_dor_recovery ( node_ptr , "is ENABLED" ); - } - else - { - plog ("%s is ENABLED (Gracefully Recovered)\n", - node_ptr->hostname.c_str()); - } - alarm_enabled_clear ( node_ptr, false ); } + /* Node Has Recovered */ + node_ptr->graceful_recovery_counter = 0 ; + recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + node_ptr->health_threshold_counter = 0 ; + node_ptr->enabled_count++ ; + node_ptr->http_retries_cur = 0 ; + + doneQueue_purge ( node_ptr ); + if ( node_ptr->was_dor_recovery_mode ) + { + report_dor_recovery ( node_ptr , "is ENABLED" ); + } + else + { + plog ("%s is ENABLED (Gracefully 
Recovered)\n", + node_ptr->hostname.c_str()); + } + alarm_enabled_clear ( node_ptr, false ); break ; } default: @@ -2783,7 +2775,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) MTC_OPER_STATE__DISABLED, locked_status ); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED, locked_status ); @@ -3432,7 +3424,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr ) /* otherwise change state */ mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL,"offline" ); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL_SUBF,"offline" ); } @@ -3473,7 +3465,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str()); mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL, "online" ); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL_SUBF, "online" ); } @@ -6093,7 +6085,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) mtcInfo_log(node_ptr); - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) == false ) { @@ -6120,52 +6112,38 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) mtcInvApi_update_state ( node_ptr, "availability", "available" ); } - /* handle other cases */ - EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname, - MTC_ALARM_ID__ENABLE); + /* Query FM for existing Enable and Config alarm status */ + EFmAlarmSeverityT enable_alarm_severity = + mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE); + 
EFmAlarmSeverityT config_alarm_severity = + mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG); - if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) + /* Clear generic enable alarm over process restart. + * Will get reasserted if the cause condition still exists */ + if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR ) { - node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ; - - /* If the node is locked then the Enable alarm - * should not be present */ - if ( sev != FM_ALARM_SEVERITY_CLEAR ) - { - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - sev = FM_ALARM_SEVERITY_CLEAR ; - } + ilog ("%s found enable alarm ; clearing %s", + node_ptr->hostname.c_str(), + alarmUtil_getSev_str(enable_alarm_severity).c_str()); + mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); } - /* Manage enable alarm over process restart. - * - * - clear the alarm in the active controller case - * - maintain the alarm, set degrade state in MAJOR and CRIT cases - * - clear alarm for all other severities. - */ - if ( THIS_HOST ) + /* The config alarm is maintained if it exists. 
+ * The in-service test handler will clear the alarm + * if the config failure is gone */ + if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR ) { - if ( sev != FM_ALARM_SEVERITY_CLEAR ) - { - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - } - } - else - { - if (( sev == FM_ALARM_SEVERITY_CRITICAL ) || - ( sev == FM_ALARM_SEVERITY_MAJOR )) - { - node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ; - node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ; - } - else if ( sev != FM_ALARM_SEVERITY_CLEAR ) - { - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - } + node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ; + node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ; + ilog ("%s found config alarm ; loaded %s", + node_ptr->hostname.c_str(), + alarmUtil_getSev_str(config_alarm_severity).c_str()); } if ( is_controller(node_ptr) ) { + this->controllers++ ; + mtc_cmd_enum state = CONTROLLER_DISABLED ; if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && @@ -6199,7 +6177,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) { ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE ); - /* Work Around for issue: */ mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime ); mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE ); @@ -6233,7 +6210,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD ); } } - if ( daemon_get_cfg_ptr()->debug_level & 1 ) nodeLinkClass::host_print (node_ptr); @@ -6290,6 +6266,40 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), node_ptr->uptime ); break ; } + /* Handle catching and recovering/restoring hosts that might + * have been in the Graceful Recovery Wait state. + * + * Prevents an extra reboot for hosts that might be in + * Graceful Recovery over a maintenance process restart. 
*/ + else if (( NOT_THIS_HOST ) && + ( !node_ptr->task.compare(MTC_TASK_RECOVERY_WAIT))) + { + ilog ("%s is in %s ; restoring state", + node_ptr->hostname.c_str(), + MTC_TASK_RECOVERY_WAIT); + + /* Complete necessary add operations before switching + * to Recovery */ + LOAD_NODETYPE_TIMERS ; + workQueue_purge ( node_ptr ); + if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) && + ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) && + ( hostUtil_is_valid_username ( node_ptr->bm_un ))) + { + set_bm_prov ( node_ptr, true ) ; + } + mtcTimer_reset ( node_ptr->mtcTimer ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + node_ptr->addStage = MTC_ADD__START; + + /* Switch into recovery_handler's Graceful Recovery Wait + * state with the Graceful Recovery Wait timeout */ + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, + node_ptr->mtcalive_timeout ); + recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT ); + break ; + } else { if ( is_controller(node_ptr) ) @@ -6354,7 +6364,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) send_hbs_command ( node_ptr->hostname, MTC_CMD_ADD_HOST ); - if ( ( CPE_SYSTEM ) || ( is_worker (node_ptr) == true )) + if ( ( AIO_SYSTEM ) || ( is_worker (node_ptr) == true )) { send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST ); } @@ -6368,6 +6378,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_ADD__WORKQUEUE_WAIT: { + rc = workQueue_done ( node_ptr ); if ( rc == RETRY ) { @@ -6393,11 +6404,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) { /* start the heartbeat service in all cases except for - * THIS host and CPE controller hosts */ + * THIS host and AIO controller hosts */ if ( NOT_THIS_HOST ) { if (( LARGE_SYSTEM ) || - (( CPE_SYSTEM ) && ( this->dor_mode_active == false ))) + (( AIO_SYSTEM ) && ( 
this->dor_mode_active == false ))) { send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); } @@ -6430,7 +6441,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->configAction = MTC_CONFIG_ACTION__INSTALL_PASSWD ; } - if (( ! SIMPLEX_CPE_SYSTEM ) && + if (( ! SIMPLEX_AIO_SYSTEM ) && ( node_ptr->bmc_provisioned == true )) { mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM ); @@ -6438,7 +6449,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) } /* Special Add handling for the AIO system */ - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) @@ -6455,6 +6466,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) } node_ptr->addStage = MTC_ADD__START; + plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime ); node_ptr->add_completed = true ; break ; @@ -6635,6 +6647,8 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) mtcInfo_set ( node_ptr, MTCE_INFO_KEY__BMC_PROTOCOL, BMC_PROTOCOL__IPMI_STR ); node_ptr->bmc_protocol = BMC_PROTOCOL__IPMITOOL ; } + /* store mtcInfo, which specifies the selected BMC protocol, + * into the sysinv database */ mtcInvApi_update_mtcInfo ( node_ptr ); ilog ("%s bmc control using %s:%s", @@ -6751,8 +6765,15 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->bmc_thread_ctrl.done = true ; node_ptr->bmc_thread_info.command = 0 ; } + /* store mtcInfo, which specifies the selected BMC protocol, + * into the sysinv database */ mtcInvApi_update_mtcInfo ( node_ptr ); + /* push the BMC access info out to the mtcClient when + * a controller's BMC connection is established/verified */ + if ( node_ptr->nodetype & CONTROLLER_TYPE ) + this->want_mtcInfo_push = true ; + 
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST ); send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST ); } @@ -6942,6 +6963,11 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) } } /* end power off detection handling */ + /* push the BMC access info out to the mtcClient when + * a controller's BMC connection is established/verified */ + if ( node_ptr->nodetype & CONTROLLER_TYPE ) + this->want_mtcInfo_push = true ; + send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST ); send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST ); @@ -7199,6 +7225,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) } } + /* audit alarms */ + mtcAlarm_audit (node_ptr ); + break ; } case MTC_OOS_TEST__WAIT: @@ -7494,7 +7523,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) * In the restart case the subfunction fsm enable handler is not run so * we try to detect the missing goenabled_subf flag as an inservice test. * - * Only in CPE type + * Only in AIO type * - clear the alarm if the issue goes away - * i.e. the goenabled tests eventually pass. Today * hey are not re-run in the background but someday they may be @@ -7502,7 +7531,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) * and we have only a single enabled controller (which must be this one) * and the alarm is not already raised. 
**/ - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) + if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && @@ -7597,7 +7626,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } } - /* Monitor the health of the host - no pass file */ + /* Monitor the health of the host */ if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || @@ -7623,6 +7652,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str()); } + /* + * In-service Config Failure/Alarm handling + */ + + /* Detect new config failure condition */ if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) { /* not healthy .... */ @@ -7634,16 +7668,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) { wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str()); if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD ) - { - node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ; - - /* threshold is reached so raise the config alarm if it is not already raised */ - if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL ) - { - mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG ); - node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ; - } - } + alarm_config_failure ( node_ptr ); } } else @@ -7663,6 +7688,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } } } + /* or correct an alarmed config failure that has cleared */ + else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG ) + { + if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) + alarm_config_clear ( 
node_ptr ); + } else { node_ptr->health_threshold_counter = 0 ; diff --git a/mtce/src/maintenance/mtcNodeMnfa.cpp b/mtce/src/maintenance/mtcNodeMnfa.cpp index af2493b1..8ebbc15c 100644 --- a/mtce/src/maintenance/mtcNodeMnfa.cpp +++ b/mtce/src/maintenance/mtcNodeMnfa.cpp @@ -159,19 +159,20 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->mnfa_graceful_recovery == true ) { - /* Restart the heartbeat for this recovered host */ - // send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS ); - if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) { - ilog ("%s graceful recovery from MNFA\n", node_ptr->hostname.c_str()); - recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); + ilog ("%s graceful recovery (graceful recover count:%d)", + node_ptr->hostname.c_str(), + node_ptr->graceful_recovery_counter); } else { - wlog ("%s already gracefully recovering\n", node_ptr->hostname.c_str() ); + wlog ("%s graceful recovery restart (graceful recover count:%d)", + node_ptr->hostname.c_str(), + node_ptr->graceful_recovery_counter ); } + recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); } } @@ -298,43 +299,38 @@ void nodeLinkClass::mnfa_exit ( bool force ) * Clear heartbeat degrades */ for ( struct node * ptr = head ; ; ptr = ptr->next ) { - if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) || - ( ptr->hbs_minor[MGMNT_IFACE] == true )) && - ( ptr->operState == MTC_OPER_STATE__ENABLED )) + std::list::iterator mnfa_awol_ptr ; + for ( mnfa_awol_ptr = mnfa_awol_list.begin() ; + mnfa_awol_ptr != mnfa_awol_list.end() ; + mnfa_awol_ptr++ ) { - ptr->hbs_minor[MGMNT_IFACE] = false ; - ptr->hbs_minor[CLSTR_IFACE] = false ; + /* skip host if not in the mnfa pool */ + if ( ptr->hostname.compare(*(mnfa_awol_ptr)) ) + continue ; - if ( force == true ) + if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) || + ( 
ptr->hbs_minor[MGMNT_IFACE] == true )) && + ( ptr->operState == MTC_OPER_STATE__ENABLED )) { - elog ("... %s failed ; auto-recovering\n", - ptr->hostname.c_str()); + ptr->hbs_minor[MGMNT_IFACE] = false ; + ptr->hbs_minor[CLSTR_IFACE] = false ; - /* Set node as failed */ - availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED ); - enableStageChange ( ptr, MTC_ENABLE__START ); - adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE ); - } - else - { - if ( ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) + if ( force == true ) { - if ( ptr->degrade_mask == 0 ) - { - availStatusChange ( ptr, MTC_AVAIL_STATUS__AVAILABLE ); - } - } + elog ("... %s failed ; auto-recovering\n", + ptr->hostname.c_str()); - if ( ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) - { - recoveryStageChange ( ptr, MTC_RECOVERY__START ); - adminActionChange ( ptr, MTC_ADMIN_ACTION__RECOVER ); + /* Set node as failed */ + availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED ); + enableStageChange ( ptr, MTC_ENABLE__START ); + adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE ); } else { - wlog ("%s already gracefully recovering\n", ptr->hostname.c_str() ); + mnfa_recover_host ( ptr ); } } + break ; } if (( ptr->next == NULL ) || ( ptr == tail )) break ; diff --git a/mtce/src/maintenance/mtcNodeMsg.h b/mtce/src/maintenance/mtcNodeMsg.h index 6816354c..11319c0f 100755 --- a/mtce/src/maintenance/mtcNodeMsg.h +++ b/mtce/src/maintenance/mtcNodeMsg.h @@ -125,11 +125,13 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa int recv_mtc_reply_noblock ( void ); -int send_mtc_cmd ( string & hostname, int cmd, int interface ); +int send_mtc_cmd ( string & hostname, int cmd, int interface , string json_dict="" ); int mtc_service_command ( mtc_socket_type * sock_ptr , int interface ); int mtc_set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum status ); -int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr ); +int mtce_send_event ( mtc_socket_type * 
sock_ptr, unsigned int cmd , const char * mtce_name_ptr ); int mtc_clstr_init ( mtc_socket_type * sock_ptr , char * iface ); string get_who_i_am ( void ); +int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port); + #endif diff --git a/mtce/src/maintenance/mtcSmgrApi.cpp b/mtce/src/maintenance/mtcSmgrApi.cpp index e511228c..9fd4ddc7 100644 --- a/mtce/src/maintenance/mtcSmgrApi.cpp +++ b/mtce/src/maintenance/mtcSmgrApi.cpp @@ -96,7 +96,7 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m int rc = PASS ; string operation_string = "unknown" ; - if ( system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + if ( system_type == SYSTEM_TYPE__AIO__SIMPLEX ) { dlog ("%s simpex mode ; SM '%d' request not sent\n", node_ptr->hostname.c_str(), operation ); return ( PASS ); diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp index e22aaa2c..5c994f4a 100644 --- a/mtce/src/maintenance/mtcSubfHdlrs.cpp +++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp @@ -110,14 +110,16 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) { mtcTimer_reset (node_ptr->mtcTimer); - plog ("%s Subf Configured OK\n", name.c_str()); + plog ("%s Subf Configured OK (oob:%x)\n", + name.c_str(), node_ptr->mtce_flags); enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER ); alarm_config_clear ( node_ptr ); break ; } - if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) || - (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ))) + if (( node_ptr->mtce_flags ) && + (( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) || + ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ))) { mtcTimer_reset (node_ptr->mtcTimer); @@ -140,9 +142,10 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) /* timeout handling */ else if ( node_ptr->mtcTimer.ring == true ) { - elog ("%s 
configuration timeout (%d secs)\n", + elog ("%s configuration timeout (%d secs) (oob:%x)\n", name.c_str(), - MTC_WORKER_CONFIG_TIMEOUT ); + MTC_WORKER_CONFIG_TIMEOUT, + node_ptr->mtce_flags); alarm_config_failure ( node_ptr ); @@ -169,7 +172,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) * * issue: subfunction go-enable patching script fails and * maintenance reboots the active controller when no-reboot - * patching maintenance in CPE. + * patching maintenance in AIO. * * The fix is to avoid running the subfunction go-enabled tests * on self while patching. @@ -490,7 +493,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) fail = true ; } - else if ( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + else if ( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX ) { /* Loop over the heartbeat interfaces and fail the Enable if any of them are failing */ for ( int i = 0 ; i < MAX_IFACES ; i++ ) diff --git a/mtce/src/pmon/pmon.h b/mtce/src/pmon/pmon.h index 11ed9714..158553da 100755 --- a/mtce/src/pmon/pmon.h +++ b/mtce/src/pmon/pmon.h @@ -231,6 +231,7 @@ typedef struct recovery_method_type recovery_method ; /**< How processes are recovered */ bool reload_config ; bool patching_in_progress ; + bool last_alarm_query_pass; } pmon_ctrl_type ; void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr ); diff --git a/mtce/src/pmon/pmonAlarm.cpp b/mtce/src/pmon/pmonAlarm.cpp index 2a491642..86e0a319 100644 --- a/mtce/src/pmon/pmonAlarm.cpp +++ b/mtce/src/pmon/pmonAlarm.cpp @@ -38,14 +38,14 @@ void pmonAlarm_init ( void ) alarmUtil_type * ptr ; /** Process Failure Alarm ****************************************************/ - + ptr = &alarm_list[PMON_ALARM_ID__PMOND]; memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", PMOND_ALARM_ID); ptr->name = "process failure" ; ptr->instc_prefix = "process=" ; - + ptr->critl_reason = ""; ptr->minor_reason = ""; 
ptr->major_reason = ""; @@ -56,12 +56,12 @@ void pmonAlarm_init ( void ) ptr->alarm.inhibit_alarms = FM_FALSE; ptr->alarm.service_affecting = FM_TRUE ; ptr->alarm.suppression = FM_TRUE ; - + ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */ - snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, - "If problem consistently occurs after Host is locked and unlocked then " + snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, + "If problem consistently occurs after Host is locked and unlocked then " "contact next level of support for root cause analysis and recovery."); } @@ -97,38 +97,46 @@ EFmAlarmSeverityT pmonAlarm_state ( string hostname, pmon_alarm_id_enum id ) /****************************************************************************** * - * Name : manage_queried_alarms + * Name : query_alarms * * Description: query FM for all the existing process monitor alarms and build * up the callers 'saved_alarm_list' with those process names and * corresponding severity. * - * Assumptions: If the hostname is passed in as not empty then assume the clear - * is requested. - * * Updates : callers saved_alarm_list * + * Returns : PASS if FM returns no error + * FAIL_REQUEST ... alarmUtil_query_identity failed + * FAIL_OPERATION ... fm_get_fault failed + * FAIL_NULL_POINTER ... failed to get memory + * ******************************************************************************/ -void manage_queried_alarms ( list & saved_alarm_list, string hostname ) +int query_alarms ( list & saved_alarm_list, string hostname ) { + static const char HOSTNAME_LABEL [] = "host=" ; + static const char PROCNAME_LABEL [] = ".process=" ; + + int rc = FAIL ; saved_alarm_list.clear(); - /** - * Query all the pmon alarms and if there is an alarm for a - * process that is functioing properly then clear the alarm. 
- **/ SFmAlarmDataT * alarm_list_ptr = (SFmAlarmDataT*) malloc ((sizeof(SFmAlarmDataT)*PMON_MAX_ALARMS)); if ( alarm_list_ptr ) { - if ( alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ) == PASS ) + /* Query all the pmon alarms */ + rc = alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ); + if ( rc == RETRY ) + { + dlog ("no %s alarms found", pmonAlarm_getId_str(PMON_ALARM_ID__PMOND).c_str()); + rc = PASS ; + } + else if ( rc == PASS ) { for ( int i = 0 ; i < PMON_MAX_ALARMS ; ++i ) { /* loop over each active alarm and maintain its activity state */ if ( strnlen ((alarm_list_ptr+i)->entity_instance_id , MAX_FILENAME_LEN ) ) { - int rc ; AlarmFilter alarm_filter ; SFmAlarmDataT alarm_query ; memset(&alarm_query, 0, sizeof(alarm_query)); @@ -139,34 +147,49 @@ void manage_queried_alarms ( list & saved_alarm_lis if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK ) { - string entity = alarm_filter.entity_instance_id ; - size_t pos = entity.find("process="); - if ( pos != std::string::npos ) - { - string pn = entity.substr(pos+strlen("process=")); - ilog ("%s alarm is %s (process:%s)\n", alarm_filter.entity_instance_id, - alarmUtil_getSev_str(alarm_query.severity).c_str(), pn.c_str()); + rc = PASS ; - /* filter out 'process=pmond' as that alarm is handled by hbsAgent */ - if ( pn.compare("pmond") ) + string entity = alarm_filter.entity_instance_id ; + size_t pos_hn = entity.find(HOSTNAME_LABEL); + size_t pos_pn = entity.find(PROCNAME_LABEL); + + if (( pos_hn != std::string::npos ) && + ( pos_pn != std::string::npos )) + { + string hn = entity.substr(pos_hn+strlen(HOSTNAME_LABEL), pos_pn-strlen(HOSTNAME_LABEL)); + string pn = entity.substr(pos_pn+strlen(PROCNAME_LABEL)); + + /* verify hostname */ + if ( ( hn.length() == 0 ) || ( hn != hostname ) ) { - if ( !hostname.empty() ) - { - pmonAlarm_clear ( hostname, PMON_ALARM_ID__PMOND, pn ); - } - else 
- { - active_process_alarms_type this_alarm ; - this_alarm.process = pn ; - this_alarm.severity = alarm_query.severity ; - saved_alarm_list.push_front ( this_alarm ); - } + /* ignore alarms not for this host */ + dlog ("%s %s %s alarm not for this host", + entity.c_str(), + hn.c_str(), + pn.c_str()); + continue ; + } + dlog ("%s alarm is %s (process:%s)\n", + alarm_filter.entity_instance_id, + alarmUtil_getSev_str(alarm_query.severity).c_str(), + pn.c_str()); + + /* filter out 'process=pmond' + * ... that alarm is handled by hbsAgent */ + if ( pn != MTC_SERVICE_PMOND_NAME ) + { + active_process_alarms_type this_alarm ; + this_alarm.process = pn ; + this_alarm.severity = alarm_query.severity ; + saved_alarm_list.push_front ( this_alarm ); } } } else { - ilog ("fm_get_fault failed (rc:%d)\n", rc ); + wlog ("fm_get_fault failed (rc:%d)\n", rc ); + rc = FAIL_OPERATION ; + break ; } } else @@ -174,10 +197,21 @@ void manage_queried_alarms ( list & saved_alarm_lis dlog2 ("last entry %d\n", i); break ; } - } + } /* for loop */ + } + else + { + wlog("failed to query alarms from fm ; rc:%d", rc); + rc = FAIL_REQUEST ; } free(alarm_list_ptr); } + else + { + elog ("unable to allocate memory for alarm list"); + rc = FAIL_NULL_POINTER ; + } + return (rc); } /************************* A L A R M I N G **************************/ diff --git a/mtce/src/pmon/pmonAlarm.h b/mtce/src/pmon/pmonAlarm.h index 79414e1c..392fea82 100644 --- a/mtce/src/pmon/pmonAlarm.h +++ b/mtce/src/pmon/pmonAlarm.h @@ -37,8 +37,10 @@ typedef struct EFmAlarmSeverityT severity ; } active_process_alarms_type ; -/* Clear any pending alarms if the specified hostname is valid */ -void manage_queried_alarms ( list & alarm_list, string hostname="" ); +/* Query FM for a list of Process Monitor (200.006) alarms */ +int query_alarms ( list & alarm_list, string hostname="" ); + +void alarmed_process_audit ( void ); void pmonAlarm_init ( void ); diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp index 
7ab0a8ee..2abe1255 100644 --- a/mtce/src/pmon/pmonHdlr.cpp +++ b/mtce/src/pmon/pmonHdlr.cpp @@ -41,15 +41,6 @@ static struct mtc_timer ptimer[MAX_PROCESSES] ; std::list config_files ; std::list::iterator string_iter_ptr ; -/* If there is an alarm in the list that matches one in the process list - * then update that process with its severity and failed state. - * If there is a process in the saved list that is not in the process list - * then clear its alarm as it is no longer valid. - */ -void manage_process_alarms ( list & _list, - process_config_type * const ptr, - int const processes ); - static process_config_type process_config[MAX_PROCESSES] ; /* lookup process control by index and return its pointer if found. @@ -216,6 +207,7 @@ void pmon_timer_init ( void ) /* Init the timer for this process */ mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ; } + _pmon_ctrl_ptr->last_alarm_query_pass = false ; } void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr ); @@ -371,7 +363,7 @@ void init_process_config_memory ( void ) * all the process config files from /etc/pmon.d */ void load_processes ( void ) { - list saved_alarm_list ; + list queried_alarm_list ; int rc = PASS ; @@ -385,10 +377,6 @@ void load_processes ( void ) close_process_socket ( &process_config[i] ); } - /* Query fm for existing pmon process alarms and - * for each that is found store their 'name' and - * 'severity' in the passed in saved list */ - manage_queried_alarms ( saved_alarm_list ); /* init the process config memory */ init_process_config_memory (); @@ -454,13 +442,8 @@ void load_processes ( void ) } _pmon_ctrl_ptr->reload_config = false ; - /* If there were process alarms that existed over the reload - * then ensure that those processes are updated with that information. 
*/ - if ( saved_alarm_list.size () ) - { - ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size()); - manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes ); - } + /* use the audit to clear pre-existing alarms at process startup */ + alarmed_process_audit (); } @@ -1702,65 +1685,124 @@ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr } } -/************************************************************************ +/*************************************************************************** * - * Name : manage_process_alarms + * Name : alarmed_process_audit * - * Description: This interface manages process alarms over a process - * configuration reload + * Purpose : Verify the process state matches the queried alarm state * - * Steps: + * Description: To correct process alarm state mismatches. * - * 1. Loop over each item in the list and mark the process as failed - * with the specified severity level. - * - * 2. If the process is not found then clear its alarm as it is no - * longer a valid process in the new profile and we don't want a - * lingering stuck alarm. - * - *************************************************************************/ + ***************************************************************************/ -void manage_process_alarms ( list & _list, - process_config_type * const ptr, - int const processes ) +void alarmed_process_audit ( void ) { - /* get out if the list is empty ; should not have been called if - * empty but ... just in case */ - if ( ! _list.empty() ) + /* Don't audit FM in service after the last query was successful. + * There is a blocking issue that needs to be dealt with */ + if ( _pmon_ctrl_ptr->last_alarm_query_pass == true ) + return ; + + /* + * Query fm for existing pmon process alarms and + * for each that is found store their 'name' and + * 'severity' in the passed in queried_alarm_list. 
+ */ + list queried_alarm_list ; + int rc = query_alarms ( queried_alarm_list, get_ctrl_ptr()->my_hostname ); + _pmon_ctrl_ptr->last_alarm_query_pass = (rc == PASS); + + /* just return if query failed */ + if ( _pmon_ctrl_ptr->last_alarm_query_pass == false ) + return ; + + if ( queried_alarm_list.size () ) { list::iterator _iter_ptr ; + alog ("audit found %ld active alarms", queried_alarm_list.size()); + /* loop over the list ... */ - for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr ) + for ( _iter_ptr=queried_alarm_list.begin(); + _iter_ptr!=queried_alarm_list.end(); + ++_iter_ptr ) { - /* for each item assum it is not found */ bool found = false ; + alog ("%s audit", _iter_ptr->process.c_str()); - /* try and find this process in the new process profile */ - for ( int i = 0 ; i < processes ; i++ ) + /* find this process*/ + for ( int i = 0 ; (i < _pmon_ctrl_ptr->processes) && !found ; i++ ) { - if ( ! _iter_ptr->process.compare((ptr+i)->process) ) - { - /* If the process is found then mark it as failed and update its severity. - * At this point we then assume that there is an alarm raised for this process. */ - found = true ; + process_config_type * ptr = &process_config[i]; - (ptr+i)->failed = false ; - wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() ); - pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process ); + if ( ! 
_iter_ptr->process.compare(ptr->process) ) + { + found = true ; + if ( ptr->failed == false ) + { + ilog ("%s stale alarm ; clearing", + _iter_ptr->process.c_str() ); + + pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, + PMON_ALARM_ID__PMOND, + _iter_ptr->process ); + } + else if ( _iter_ptr->severity != ptr->alarm_severity ) + { + wlog ("%s alarm severity mismatch ; %s -> %s ; correcting", + ptr->process, + alarmUtil_getSev_str(_iter_ptr->severity).c_str(), + alarmUtil_getSev_str(ptr->alarm_severity).c_str()); + if ( ptr->alarm_severity == FM_ALARM_SEVERITY_MINOR ) + { + pmonAlarm_minor(get_ctrl_ptr()->my_hostname, + PMON_ALARM_ID__PMOND, + ptr->process, 0); + } + else if (ptr->alarm_severity == FM_ALARM_SEVERITY_MAJOR ) + { + pmonAlarm_major(get_ctrl_ptr()->my_hostname, + PMON_ALARM_ID__PMOND, + ptr->process); + } + else if (ptr->alarm_severity == FM_ALARM_SEVERITY_CRITICAL ) + { + pmonAlarm_critical(get_ctrl_ptr()->my_hostname, + PMON_ALARM_ID__PMOND, + ptr->process); + } + else + { + wlog ("%s unexpected severity '%s' ; clearing alarm", + ptr->process, + ptr->severity); + + pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, + PMON_ALARM_ID__PMOND, + ptr->process ); + } + } + else + { + alog ("%s is alarmed '%s' ; audit", + ptr->process, + ptr->severity); + } } } - /* if not found then just clear the alarm */ if ( found == false) { - wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() ); - pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process ); + wlog ("%s is not a monitored process ; clearing alarm", + _iter_ptr->process.c_str()); + pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, + PMON_ALARM_ID__PMOND, + _iter_ptr->process ); } } } } + void pmon_service ( pmon_ctrl_type * ctrl_ptr ) { std::list socks ; @@ -1931,6 +1973,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) { _get_events (); mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period ); + + alarmed_process_audit (); } 
/* Run the degrade set/clear by audit */ diff --git a/mtce/src/pmon/scripts/pmon.logrotate b/mtce/src/pmon/scripts/pmon.logrotate old mode 100755 new mode 100644 index ea151b26..08416fb9 --- a/mtce/src/pmon/scripts/pmon.logrotate +++ b/mtce/src/pmon/scripts/pmon.logrotate @@ -1,16 +1,19 @@ -#daily -nodateext +# +# Copyright (c) 2015-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 /var/log/pmond.log { - nodateext - size 10M + create 0640 root root start 1 - missingok + size 10M rotate 20 compress - sharedscripts + notifempty + missingok postrotate systemctl reload syslog-ng > /dev/null 2>&1 || true endscript + delaycompress } diff --git a/mtce/src/pmon/scripts/pmond.conf b/mtce/src/pmon/scripts/pmond.conf old mode 100755 new mode 100644 diff --git a/mtce/src/scripts/crashdump.logrotate b/mtce/src/scripts/crashdump.logrotate index a16bcb7c..ca8e84c6 100644 --- a/mtce/src/scripts/crashdump.logrotate +++ b/mtce/src/scripts/crashdump.logrotate @@ -1,7 +1,11 @@ +# +# Copyright (c) 2020-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 + /var/log/crash/vmcore.tar /var/log/crash/vmcore_first.tar { - nodateext size 1K start 1 rotate 1 diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index edfd6c5d..461766b0 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -87,6 +87,10 @@ sched_delay_threshold = 300 ; scheduler delay time in msecs that will trigger daemon_log_port = 2121 ; daemon logger port mtcalarm_req_port = 2122 ; +sync_b4_peer_ctrlr_reset = 0 ; issue a sync command to peer controller mtcClient + ; before issuing BMC reset. 
+ + [timeouts] ; configurable maintenance timeout values in seconds failsafe_shutdown_delay = 120; diff --git a/mtce/src/scripts/mtce.logrotate b/mtce/src/scripts/mtce.logrotate index 17842c8a..8095f311 100644 --- a/mtce/src/scripts/mtce.logrotate +++ b/mtce/src/scripts/mtce.logrotate @@ -1,59 +1,67 @@ -#daily - -# Apply all these options to all the logs -nodateext -start 1 -compress -notifempty -missingok -sharedscripts -postrotate - systemctl reload syslog-ng > /dev/null 2>&1 || true -endscript - +# +# Copyright (c) 2015-2021 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# /var/log/mtcAgent.log { - size 100M + create 0640 root root + start 1 rotate 10 + size 100M + compress + notifempty + missingok + postrotate + systemctl reload syslog-ng > /dev/null 2>&1 || true + endscript + delaycompress } /var/log/hbsAgent.log -{ - size 20M - rotate 5 -} - /var/log/mtcClient.log -{ - size 20M - rotate 5 -} - /var/log/hbsClient.log { - size 20M + create 0640 root root + start 1 rotate 5 + size 20M + compress + notifempty + missingok + postrotate + systemctl reload syslog-ng > /dev/null 2>&1 || true + endscript + delaycompress } /var/log/mtclogd.log { - size 10M + create 0640 root root + start 1 rotate 5 + size 10M + compress + notifempty + missingok + postrotate + systemctl reload syslog-ng > /dev/null 2>&1 || true + endscript + delaycompress } +# The mtclogd opens and closes these log files on every log addition. +# Therefore does not require a notification over log rotation. 
+/var/log/mtcAgent_event.log +/var/log/mtcAgent_alarm.log /var/log/mtcAgent_api.log { - size 20M + create 0640 root root + start 1 rotate 5 -} - -/var/log/mtcAgent_event.log -{ - size 20M - rotate 5 -} -/var/log/mtcAgent_alarm.log -{ size 10M - rotate 5 + compress + notifempty + missingok + delaycompress } diff --git a/mtce/src/scripts/wipedisk b/mtce/src/scripts/wipedisk index 636ced60..2dc9bc34 100755 --- a/mtce/src/scripts/wipedisk +++ b/mtce/src/scripts/wipedisk @@ -18,6 +18,28 @@ usage () exit 1 } +# Systemd automatically remounts all the mounted filesystems at shutdown +# When we are deleting a partition, we have to unmount its corresponding filesystem +# because remounting deleted filesystems at shutdown will throw errors +unmount_fs() +{ + local fs=$1 + local ret_code=0 + echo "Trying to unmount $fs" + if findmnt $fs > /dev/null 2>&1 ; then + if umount -f $fs ; then + echo "$fs has been successfully unmounted" + else + echo "Error! Failed to unmount $fs" + ret_code=1 + fi + else + echo "Warning! $fs is not mounted" + ret_code=2 + fi + return $ret_code +} + OPTS=`getopt -o h -l force -- "$@"` if [ $? != 0 ] then @@ -100,11 +122,14 @@ fi BACKUP_PART_GUID="BA5EBA11-0000-1111-2222-000000000002" part_type_guid_str="Partition GUID code" +# get the nodetype variable to check later if this node is a controller +. /etc/platform/platform.conf + for dev in $WIPE_HDD do if [[ -e $dev ]] then - if [ "$dev" == "$rootfs" ] + if [[ "$dev" == "$rootfs" && "${nodetype}" == "controller" ]] then part_numbers=( $(parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}') ) for part_number in "${part_numbers[@]}"; do @@ -128,6 +153,7 @@ do # Skip / or we will lose access to the tools on the system. if [[ $part != $rootfs_part ]] then + unmount_fs $part dd if=/dev/zero of=$part bs=512 count=34 dd if=/dev/zero of=$part bs=512 count=34 seek=$((`blockdev --getsz $part` - 34)) fi @@ -141,6 +167,7 @@ do else echo "Wiping $dev..." 
wipefs -f -a $dev + unmount_fs $dev # Clearing previous GPT tables or LVM data # Delete the first few bytes at the start and end of the partition. This is required with