Merge "Mtce heartbeat cluster state change notification improvement"

2021-01-18 16:15:27 +00:00 · 2021-01-18 16:15:27 +00:00 · 7a3adb2cdc
parent d73e6af5a3 5ab03b5222
commit 7a3adb2cdc
5 changed files with 78 additions and 54 deletions
--- a/mtce-control/src/scripts/hbsAgent.service
+++ b/mtce-control/src/scripts/hbsAgent.service
@@ -1,22 +1,13 @@
 [Unit]
 Description=StarlingX Maintenance Heartbeat Agent
-After=network.target syslog.service config.service
+After=hbsClient.service
 Before=pmon.service

 [Service]
 Type=forking
 ExecStart=/etc/rc.d/init.d/hbsAgent start
-ExecStop=/etc/rc.d/init.d/hbsAgent start
+ExecStop=/etc/rc.d/init.d/hbsAgent stop
 PIDFile=/var/run/hbsAgent.pid
-KillMode=process
-SendSIGKILL=no
-
-# Process recovery is handled by pmond if its running.
-# Delay 10 seconds to give pmond a chance to recover
-# before systemd kicks in to do it as a backup plan.
-Restart=always
-RestartSec=10

 [Install]
 WantedBy=multi-user.target
-
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -8628,7 +8628,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p



-#define HBS_LOSS_REPORT_THROTTLE (100)
+#define HBS_LOSS_REPORT_THROTTLE (100000)
 int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
 {
    int lost = 0  ;
@@ -8668,6 +8668,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )

            if ( pulse_ptr->b2b_misses_count[iface] > 1 )
            {
+                if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
+                {
+                    hbs_cluster_change ( pulse_ptr->hostname + " " +
+                            get_iface_name_str(iface) +
+                            " heartbeat miss " +
+                            itos(pulse_ptr->b2b_misses_count[iface]));
+                }
                if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
                {
                    if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
@@ -8774,57 +8781,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                }
            }

-            /* Turn the cluster-host heartbeat loss into a degrade only
-             * condition if the clstr_degrade_only flag is set */
-            if (( iface == CLSTR_IFACE ) &&
-                ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
-                ( clstr_degrade_only == true ))
-            {
-                /* Only print the log at the threshold boundary */
-                if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
-                {
-                    if ( this->active_controller )
-                    {
-                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
-                    }
-
-                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
-                               pulse_ptr->hostname.c_str(),
-                               get_iface_name_str(iface) );
-                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
-                }
-            }
-
            /* Turn the clstr heartbeat loss into a degrade only
             * condition for inactive controller on normal system. */
-            else if (( iface == CLSTR_IFACE ) &&
-                     ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
-                     ( this->system_type == SYSTEM_TYPE__NORMAL ) &&
-                     (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
+            if (( iface == CLSTR_IFACE ) &&
+                ((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
+                 (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
+                 ( clstr_degrade_only == true )))
            {
                /* Only print the log at the threshold boundary */
-                if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
+                if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
                {
                    if ( this->active_controller )
                    {
                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
                    }
-                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
+                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
                               pulse_ptr->hostname.c_str(),
-                               get_iface_name_str(iface));
-                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
+                               get_iface_name_str(iface),
+                               clstr_degrade_only ? "config option" : "system type");
+                    hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
                }
            }

            else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
+            // else if ( pulse_ptr->hbs_failure[iface] == false )
            {
-                elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
-                                                        get_iface_name_str(iface) );
+                elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
+                          pulse_ptr->hostname.c_str(),
+                          get_iface_name_str(iface),
+                          pulse_ptr->b2b_misses_count[iface]);
+                hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );

                if ( this->active_controller )
                {
-                    manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
-
+                    if ( pulse_ptr->hbs_failure[iface] == false )
+                    {
+                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
+                    }
                    /* report this host as failed */
                    if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
                    {
@@ -8832,10 +8825,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                    }
                }
                else
-                {
                    pulse_ptr->hbs_failure[iface] = true ;
-                }
-                hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
+
                pulse_ptr->hbs_failure_count[iface]++ ;
            }
            if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -2374,6 +2374,7 @@ void daemon_service_run ( void )
                    arrival_histogram[iface] = "" ;
                    unexpected_pulse_list[iface] = "" ;

+
                    rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
                    if ( rc != 0 )
                    {
--- a/mtce/src/heartbeat/hbsBase.h
+++ b/mtce/src/heartbeat/hbsBase.h
@@ -326,7 +326,7 @@ void hbs_cluster_log  ( string & hostname, mtce_hbs_cluster_type & cluster, stri
 void hbs_sm_handler ( void );

 /* send the cluster vault to SM */
-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
+int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );

 /* copy cluster data from src to dst */
 void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
@@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
 /* Heartbeat service state audit */
 void hbs_state_audit ( void );

+/* Send state change message to SM if there has been a
+ * state change in the last period */
+void hbs_cluster_change_notifier ( void );
+
 /**
 * @} hbs_base
 */
--- a/mtce/src/heartbeat/hbsCluster.cpp
+++ b/mtce/src/heartbeat/hbsCluster.cpp
@@ -69,6 +69,8 @@ typedef struct

    msgClassSock * sm_socket_ptr ;

+    string cluster_change_reason ;
+
 } hbs_cluster_ctrl_type ;

 /* Cluster control structure construct allocation. */
@@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
    {
        ctrl.sm_socket_ptr = sm_socket_ptr ;
    }
+    ctrl.cluster_change_reason = "";
+
    ctrl.log_throttle = 0 ;
 }

@@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,

 void hbs_cluster_change ( string cluster_change_reason )
 {
-    hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
+    ilog ("reason: %s", cluster_change_reason.c_str());
+    if ( ctrl.cluster_change_reason.empty() )
+        ctrl.cluster_change_reason = cluster_change_reason ;
+    else
+        ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
+}
+
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_change_notifier
+ *
+ * Description : Send SM the cluster info if there has been a state change.
+ *
+ ***************************************************************************/
+void hbs_cluster_change_notifier ( void )
+{
+    if ( ! ctrl.cluster_change_reason.empty () )
+    {
+        if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0,
+                               ctrl.cluster_change_reason ) == PASS )
+        {
+            ctrl.cluster_change_reason.clear();
+        }
+    }
 }

 /****************************************************************************
@@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
            wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
                             "Unable to store history beyond %d ",
                             ctrl.cluster.histories );
+            hbs_cluster_change_notifier ();
            return ;
        }
        else
@@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
    else
        history_ptr->oldest_entry_index++ ;

+    hbs_cluster_change_notifier ();
+
    /* clear the log throttle if we are updating history ok. */
    ctrl.log_throttle = 0 ;
 }
@@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
 *
 ***************************************************************************/

-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
+int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
 {
+    int rc = FAIL_SOCKET_SENDTO ;
    ctrl.cluster.reqid = (unsigned short)reqid ;
    if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
    {
-        ilog ("cluster state notification Reason: %s", reason.c_str());
        int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
        int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
        if ( bytes <= 0 )
@@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
             elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
                    bytes , errno, strerror(errno));
        }
-        hbs_cluster_dump ( ctrl.cluster );
+        else
+        {
+            /* limit the string length */
+            ilog ("reason: %s", reason.substr(0,80).c_str());
+            hbs_cluster_dump ( ctrl.cluster );
+            rc = PASS ;
+        }
    }
    else
    {
        wlog ("cannot send cluster info due to socket error");
    }
+    return(rc);
 }

 /****************************************************************************
@@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
        {
            if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
            {
-                 hbs_cluster_change ("peer controller cluster event " +
+                 hbs_cluster_change ("peer cluster delta " +
                 hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
            }