From 9d7a4bf92c5ad168b5338855872d6f71d99abb1f Mon Sep 17 00:00:00 2001
From: Eric MacDonald <eric.macdonald@windriver.com>
Date: Fri, 7 Dec 2018 13:24:27 -0500
Subject: [PATCH] Implement Active-Active Heartbeat as HA Improvement Fix

A few small issues were found during integration testing with SM.

This update delivers those integration tested fixes.

1. Send cluster event to SM only after the first 10 heartbeat
   pulses are received.
2. Only send inventory to hbsAgent on provisioned controllers.
3. Add new OOB SM_UNHEALTHY flag to detect and act on an SM
   declared unhealthy controller.
4. Network monitoring enable fix.
5. Fix oldest entry tracking when a network history is not full.
6. Prevent clearing local uptime for a host that is being enabled.
7. Refactor cluster state change notification logging and handling.

These fixes were both UT and IT tested in multiple labs.

Change-Id: I28485f241ac47bb3ed3ec1e2a8f4c09a1ca2070a
Story: 2003576
Task: 24907
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
---
 mtce-common/src/common/nodeBase.h     |   1 +
 mtce/centos/build_srpm.data           |   2 +-
 mtce/src/common/nodeClass.cpp         |  41 ++-
 mtce/src/common/nodeClass.h           |   3 +
 mtce/src/heartbeat/hbsAgent.cpp       |  30 +-
 mtce/src/heartbeat/hbsBase.h          |  37 ++-
 mtce/src/heartbeat/hbsClient.cpp      |  93 +++++-
 mtce/src/heartbeat/hbsCluster.cpp     | 442 +++++++++++++++-----------
 mtce/src/heartbeat/hbsUtil.cpp        | 127 ++++----
 mtce/src/maintenance/mtcCompMsg.cpp   |   4 +-
 mtce/src/maintenance/mtcCtrlMsg.cpp   |   6 +-
 mtce/src/maintenance/mtcNodeHdlrs.cpp |  12 +
 mtce/src/maintenance/mtcStubs.cpp     |   4 +
 13 files changed, 529 insertions(+), 273 deletions(-)
diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h
index 788954b6..9e8586cd 100755
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -86,6 +86,7 @@ void daemon_exit ( void );
 #define MTC_FLAG__SM_DEGRADED      (0x00000080)
 #define MTC_FLAG__PATCHING         (0x00000100) /* Patching in progress */
 #define MTC_FLAG__PATCHED          (0x00000200) /* Patched but not reset */
+#define MTC_FLAG__SM_UNHEALTHY     (0x00001000)
 
 #define MTC_UNHEALTHY_THRESHOLD    (3)
 
diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data
index 01e786b1..9e6662d0 100644
--- a/mtce/centos/build_srpm.data
+++ b/mtce/centos/build_srpm.data
@@ -1,3 +1,3 @@
 SRC_DIR="src"
-TIS_PATCH_VER=142
+TIS_PATCH_VER=143
 BUILD_IS_SLOW=5
diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index e33c70ce..04d96330 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -4214,6 +4214,25 @@ int nodeLinkClass::num_controllers_enabled ( void )
     return (cnt);
 }
 
+
+/** Returns true if the specified hostname is provisioned ; false when
+ *  no node in the list has that hostname or when the list is empty. */
+bool nodeLinkClass::hostname_provisioned ( string hostname )
+{
+    /* Test ptr before every dereference so that an empty list
+     * ( head == NULL ) returns false rather than faulting. */
+    for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
+    {
+        if ( ptr->hostname.compare(hostname) == 0 )
+            return (true);
+        /* never walk past the tail of the list */
+        if ( ptr == tail )
+            break ;
+    }
+    return (false);
+}
+
+
 int nodeLinkClass::service_netlink_events ( int nl_socket , int ioctl_socket )
 {
     std::list<string> links_gone_down ;
@@ -7336,9 +7355,6 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
         bool want_log = true ;
         for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
         {
-            if ( node_ptr->monitor[iface] == true_false )
-                continue ;
-
             if ( iface == INFRA_IFACE )
             {
                 if ( this->infra_network_provisioned == false )
@@ -7810,6 +7826,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
         {
             ptr->hbs_count[iface]++ ;
             ptr->b2b_pulses_count[iface]++ ;
+
+            if ( ptr->b2b_pulses_count[iface] == hbs_failure_threshold )
+            {
+                hbs_cluster_change( ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat pass" );
+            }
+            else if ( ptr->b2b_pulses_count[iface] == 1 )
+            {
+                hbs_cluster_change( ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat start" );
+            }
+
             if ( ptr->hbs_failure[iface] == true )
             {
                 /* threshold failure recovery */
@@ -8281,8 +8307,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                     wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
                                      "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
                                                                        get_iface_name_str(iface) );
-                    this->print_node_info ();
-                    hbs_cluster_log ( this->my_hostname, "event", true );
+                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
                 }
             }
 
@@ -8303,8 +8328,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                     wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
                                      "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
                                                                        get_iface_name_str(iface) );
-                    this->print_node_info ();
-                    hbs_cluster_log ( this->my_hostname, "event", true );
+                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
                 }
             }
 
@@ -8327,8 +8351,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                 else
                 {
                     pulse_ptr->hbs_failure[iface] = true ;
-                    this->print_node_info ();
-                    hbs_cluster_log ( this->my_hostname, "event", true );
+                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
                 }
                 pulse_ptr->hbs_failure_count[iface]++ ;
             }
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 0701a749..e5c39172 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -1394,6 +1394,9 @@ public:
     /** Sets a hosts's function and subfunction members */
     int update_host_functions ( string hostname , string functions );
 
+    /** Returns true if the specified hostname is provisioned */
+    bool hostname_provisioned ( string hostname );
+
     /***********************************************************/
 
     /** Number of provisioned hosts (nodes) */
diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp
index 1e2b2fc1..ffa0e0fa 100644
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -1277,6 +1277,9 @@ int daemon_init ( string iface, string nodetype )
     /* init the utility module */
     hbs_utils_init ();
 
+    /* init the cluster control structure */
+    hbs_cluster_ctrl_init ();
+
     /* initialize the timer */
     mtcTimer_init ( hbsTimer, "controller", "heartbeat" );
     mtcTimer_init ( hbsTimer_audit, "controller", "state audit" );
@@ -1398,7 +1401,7 @@ void hbs_sm_handler ( void )
                     ( request == SUPPORTED_REQUEST ))
                 {
                     /* success path ... */
-                    hbs_cluster_send( hbs_sock.sm_client_sock, reqid );
+                    hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
 
                     /* reset log throttle */
                    _hbs_sm_handler_log_throttle = 0 ;
@@ -1722,6 +1725,7 @@ void daemon_service_run ( void )
                 {
                     hbsInv.hbs_disabled = true ;
                     hbsInv.hbs_state_change = true ;
+                    hbs_cluster_lock();
                     ilog ("heartbeat service going disabled (locked)");
 
                     /* force the throttle 'still disabled' log to wait for
@@ -1900,8 +1904,18 @@ void daemon_service_run ( void )
                         }
                         else if ( msg.cmd == MTC_CMD_STOP_HOST )
                         {
-                            hbsInv.mon_host ( hostname, false, true );
-                            hbs_cluster_del ( hostname );
+                            if ( hostname == hbsInv.my_hostname )
+                            {
+                                ilog ("%s heartbeat service disabled by stop command",
+                                          hostname.c_str());
+
+                                hbs_manage_controller_state( hostname, false );
+                            }
+                            else
+                            {
+                                hbsInv.mon_host ( hostname, false, true );
+                                hbs_cluster_del ( hostname );
+                            }
                         }
                         else if ( msg.cmd == MTC_CMD_START_HOST )
                         {
@@ -1938,9 +1952,7 @@ void daemon_service_run ( void )
 
                             hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ;
                             ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period );
-
-                            /* Send SM cluster information at start of MNFA */
-                            hbs_cluster_send( hbs_sock.sm_client_sock, 0 );
+                            hbs_cluster_change ( "backoff" );
                             hbsInv.print_node_info();
                         }
                         else
@@ -2170,6 +2182,9 @@ void daemon_service_run ( void )
                  * algorithm into 'receive' mode */
                 heartbeat_request = false ;
 
+                /* tell cluster module that a new pulse period has started */
+                hbs_cluster_period_start();
+
                 /* Start the heartbeat timer.
                  * All nodes are expected to send a
                  *  pulse before this timer expires. */
@@ -2263,6 +2278,9 @@ void daemon_service_run ( void )
          */
         else
         {
+            /* manage vault wrt peer controller */
+            hbs_cluster_peer();
+
             for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
             {
                 /* Do not service the infrastructure interface if it is not provisioned */
diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h
index b679fadf..f68a902a 100755
--- a/mtce/src/heartbeat/hbsBase.h
+++ b/mtce/src/heartbeat/hbsBase.h
@@ -232,6 +232,9 @@ void   hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
 
 /******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
 
+/* Init the control structure */
+void hbs_cluster_ctrl_init ( void );
+
 /* Set the cluster vault to default state.
  * Called upon daemon init or heartbeat period change. */
 void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr );
@@ -240,16 +243,25 @@ void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr );
  * Primarily to know how many history elements are missing. */
 unsigned short hbs_cluster_unused_bytes ( void );
 
+/* Inform the cluster module that there was a change to the cluster */
+void hbs_cluster_change ( string cluster_change_reason );
+
 /* Add and delete hosts from the monitored list.
  * Automatically adjusts the numbers in the cluster vault. */
 void hbs_cluster_add  ( string & hostname );
 void hbs_cluster_del  ( string & hostname );
+void hbs_cluster_rem  ( unsigned short controller );
+void hbs_cluster_lock ( void );
+
+/* Prepare the cluster module for a new pulse period ; resets the peer controller history received flag */
+void hbs_cluster_period_start ( void );
 
 /* Report status of storgate-0 */
 void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
 
-/* Look for and clog changes in cluster state */
-int  hbs_cluster_cmp  ( hbs_message_type & msg );
+/* Compare 2 histories */
+int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
+                     mtce_hbs_cluster_history_type h2 );
 
 /* Manage the enabled state of the controllers */
 void hbs_manage_controller_state ( string & hostname, bool enabled );
@@ -266,6 +278,9 @@ int  hbs_cluster_save (               string & hostname,
                         mtce_hbs_network_enum  network,
                             hbs_message_type & msg );
 
+/* Manage peer controller vault history. */
+void hbs_cluster_peer ( void );
+
 /*
  * Called by the hbsAgent pulse receiver to create a network specific
  * history update entry consisting of
@@ -285,6 +300,19 @@ void hbs_cluster_update ( iface_enum iface,
  * the other controller back in its response. */
 void hbs_cluster_append ( hbs_message_type & msg );
 
+/* Inject a history entry at the next position for all networks of the
+ * specified controller.
+ *
+ * This is used to add a 0:0 entry into the vault history of the specified
+ * controller as an indication that no host provided history for this
+ * controller during this pulse period.
+ *
+ * Procedure was made generic so that it 'could' be used to add history
+ * of any values for fault insertion or other potential future purposes
+ */
+void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enabled, unsigned short hosts_responding );
+
+
 /* Produce formatted clog's that characterize current and changing cluster
  * history for a given network. Each log is controller/network specific. */
 void hbs_cluster_log  ( string & hostname,                                  string prefix, bool force=false );
@@ -295,13 +323,14 @@ void hbs_cluster_log  ( string & hostname, mtce_hbs_cluster_type & cluster, stri
 void hbs_sm_handler ( void );
 
 /* send the cluster vault to SM */
-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
+void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
 
 /* copy cluster data from src to dst */
 void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
 
 /* print the contents of the vault */
-void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force );
+void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_enabled );
+void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string reason );
 
 /* Heartbeat service state audit */
 void hbs_state_audit ( void );
diff --git a/mtce/src/heartbeat/hbsClient.cpp b/mtce/src/heartbeat/hbsClient.cpp
index c0dbad8d..21a9d62b 100644
--- a/mtce/src/heartbeat/hbsClient.cpp
+++ b/mtce/src/heartbeat/hbsClient.cpp
@@ -108,6 +108,9 @@ static stallMon_type stallMon ;
 /* Cached Cluster view from controllers */
 mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS];
 
+/* Indexed by controller ; counts the peer's pulse requests serviced since
+ * the indexed controller's last pulse request was received (then cleared). */
+int missed_controller_summary_tracker[MTCE_HBS_MAX_CONTROLLERS] ;
 
 void daemon_sigchld_hdlr ( void )
 {
@@ -887,8 +890,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
 
         if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS )
         {
-            slog ("controller-%d provided %d network histories ; max is %d per controller",
+            slog ("controller-%d %s provided %d network histories ; max is %d per controller",
                    controller,
+                   get_iface_name_str(iface),
                    hbs_sock.rx_mesg[iface].cluster.histories,
                    MTCE_HBS_MAX_NETWORKS );
         }
@@ -903,29 +907,81 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
         {
             hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster,
                                controller_cluster_cache[controller] );
+
             clog1 ("controller-%d cluster info from %s pulse request saved to cache",
                     controller, get_iface_name_str(iface));
 
-            hbs_sock.rx_mesg[iface].cluster.histories = 0 ;
+            /* Clear the expecting count for this controller.
+             * Each heartbeat cycle should result in this being cleared for
+             * both controllers.
+             *
+             * Clearing this is indication that we got a pulse request from
+             * this controller. The code below will increment this count
+             * for its peer controller on every request.
+             * An accumulation of count is indication that we are not
+             * receiving response from the indexed controller */
+            missed_controller_summary_tracker[controller] = 0 ;
 
             if ( have_other_controller_history ( controller ) == true )
             {
-                /* Now copy the other controller's cached cluster info into
-                 * this controlers response */
-                hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
-                                   hbs_sock.rx_mesg[iface].cluster );
+                /******************************************************************
+                 *
+                 * Increment the expecting count for the other controller.
+                 * If that other controller's expecting count reaches 2 or
+                 * more then do not include a summary for that controller
+                 * in this response.
+                 *
+                 * This avoids sending stale summary info.
+                 *
+                 *****************************************************************/
 
-                if ( daemon_get_cfg_ptr()->debug_state & 4 )
+                /* Since the controllers run asynchronously the absence of
+                 * one or 2 between pulse requests for the same controller
+                 * can happen. This is why we compare against greater than
+                 * the number of monitored networks (histories for this
+                 * controller) times 2 ; following Nyquist Theorem . */
+                if ( ++missed_controller_summary_tracker[controller?0:1] >
+                        controller_cluster_cache[controller?0:1].histories * 2 )
                 {
-                    string dump_banner = "" ;
-                    dump_banner.append("controller-") ;
-                    dump_banner.append(itos(controller?0:1));
-                    dump_banner.append(" cluster info from cache injected into controller-");
-                    dump_banner.append(itos(controller));
-                    dump_banner.append(":");
-                    dump_banner.append(get_iface_name_str(iface));
-                    dump_banner.append(" pulse response");
-                    hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true );
+                    wlog ("controller-%d %s cluster info cleared (%d)",
+                            controller?0:1,
+                            get_iface_name_str(iface),
+                            missed_controller_summary_tracker[controller?0:1]);
+
+                    /* Clear the cached history for that controller who's
+                     * heartbeat requests are no longer being seen.
+                     * No need to clear the history entries,
+                     * just the number of histories to 0 and update bytes. */
+                    controller_cluster_cache[controller?0:1].histories = 0 ;
+                    controller_cluster_cache[controller?0:1].bytes = BYTES_IN_CLUSTER_VAULT(0) ;
+
+                    /* now that the peer controller cluster info is cleared
+                     * we will not see another log from above until we get
+                     * another pulse request from the peer controller. */
+                }
+                else
+                {
+                    clog  ("controller-%d %s cluster info added to response (%d)",
+                            controller?0:1,
+                            get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] );
+
+                    /* Now copy the other controller's cached cluster info into
+                     * this controller's response */
+                    hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
+                                       hbs_sock.rx_mesg[iface].cluster );
+
+                    if ( daemon_get_cfg_ptr()->debug_state & 4 )
+                    {
+                        string dump_banner = "" ;
+                        dump_banner.append("controller-") ;
+                        dump_banner.append(itos(controller?0:1));
+                        dump_banner.append(" cluster info from cache injected into controller-");
+                        dump_banner.append(itos(controller));
+                        dump_banner.append(":");
+                        dump_banner.append(get_iface_name_str(iface));
+                        dump_banner.append(" pulse response");
+                        hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
+                    }
                 }
             }
         }
@@ -1079,7 +1135,10 @@ int daemon_init ( string iface, string nodeType_str )
 
     /* Initialize the controller cluster view data bounce structure */
     for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ )
+    {
         memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ;
+        missed_controller_summary_tracker[c] = 0 ;
+    }
 
     /* init the utility module */
     hbs_utils_init ();
@@ -1306,7 +1365,7 @@ void daemon_service_run ( void )
                 int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type));
                 if ( bytes )
                 {
-                    hbs_cluster_dump (msg, "Cluster info received", true );
+                    hbs_cluster_dump (msg, "cluster info received" );
                 }
             }
 #endif
diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp
index 85b8f363..b46f95b6 100644
--- a/mtce/src/heartbeat/hbsCluster.cpp
+++ b/mtce/src/heartbeat/hbsCluster.cpp
@@ -37,6 +37,7 @@ typedef struct
     bool controller_2_enabled ;
 #endif
 
+    bool peer_controller_enabled ;
     /* Used to prevent log flooding in presence of back to back errors. */
     unsigned int log_throttle ;
 
@@ -64,9 +65,9 @@ typedef struct
     /* The working heartbeat cluster data vault. */
     mtce_hbs_cluster_type cluster ;
 
-    bool cluster_change ;
-    int  cluster_change_threshold_count ;
-    int  cluster_change_difference_count ;
+    string cluster_change_reason ;
+
+    bool got_peer_controller_history ;
 
     msgClassSock * sm_socket_ptr ;
 
@@ -126,6 +127,24 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
     ctrl.log_throttle = 0 ;
 }
 
+void hbs_cluster_ctrl_init ( void ) /* set the listed 'ctrl' members to their initial values */
+{
+    ctrl.this_controller = 0xffff ; /* NOTE(review): presumably a 'not yet known' marker ; confirm */
+    ctrl.controller_0_enabled = false ;
+    ctrl.controller_1_enabled = false ;
+#ifdef THREE_CONTROLLER_SYSTEM
+    ctrl.controller_2_enabled = false ;
+#endif
+    ctrl.peer_controller_enabled = false ;
+    ctrl.log_throttle = 0 ;
+    ctrl.monitored_networks = 0 ;
+    ctrl.monitored_hosts = 0 ;
+    ctrl.monitored_hostname_list.clear();
+    ctrl.cluster_change_reason = "" ; /* empty means no cluster change is pending (see hbs_cluster_update) */
+    ctrl.got_peer_controller_history = false ;
+    ctrl.sm_socket_ptr = NULL ;
+    memset(&ctrl.storage_0_not_responding_count[0], 0, sizeof(ctrl.storage_0_not_responding_count)); /* zero the whole array */
+}
 
 /****************************************************************************
  *
@@ -149,6 +168,23 @@ void hbs_cluster_nums ( unsigned short this_controller,
    ctrl.monitored_networks = monitored_networks ;
 }
 
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_change
+ *
+ * Description : Accumulate the reason(s) for a cluster change in
+ *               ctrl.cluster_change_reason ; additional reasons are
+ *               appended with " ; " except 'peer controller cluster event'.
+ *               The string is sent and cleared by hbs_cluster_update.
+ ***************************************************************************/
+
+void hbs_cluster_change ( string cluster_change_reason )
+{
+    if ( ctrl.cluster_change_reason.empty() )
+        ctrl.cluster_change_reason = cluster_change_reason ;
+    else if ( cluster_change_reason.find ( "peer controller cluster event" ) == std::string::npos )
+        ctrl.cluster_change_reason.append(" ; " + cluster_change_reason);
+}
 
 /****************************************************************************
  *
@@ -196,7 +232,7 @@ void cluster_storage0_state ( bool enabled )
         ctrl.cluster.storage0_enabled = enabled ;
         ilog ("storage-0 heartbeat state changed to %s",
                 enabled ? "enabled" : "disabled" );
-        ctrl.cluster_change = true ;
+        hbs_cluster_change ( "storage-0 state change" );
     }
 }
 
@@ -211,21 +247,50 @@ void cluster_storage0_state ( bool enabled )
 
 void hbs_manage_controller_state ( string & hostname, bool enabled )
 {
+    int controller = -1 ; /* set to 0 or 1 below when hostname names a controller */
+
     /* track controller state */
     if ( hostname == CONTROLLER_0 )
     {
+        controller = 0 ;
         ctrl.controller_0_enabled = enabled ;
     }
     else if ( hostname == CONTROLLER_1 )
     {
+        controller = 1 ;
         ctrl.controller_1_enabled = enabled ;
     }
-#ifdef THREE_CONTROLLER_SYSTEM
-    else if ( hostname == CONTROLLER_2 )
+    else
     {
-        ctrl.controller_2_enabled = enabled ;
+        /* ignore all other host names ; nothing is recorded for them */
+        return ;
+    }
+
+    /* hostname is not this controller ; manage it as the peer controller */
+    if ( ctrl.this_controller != controller )
+    {
+        /* Clear peer controller cluster history ( hbs_cluster_rem ) only
+         * on a peer transition from enabled to disabled */
+        if (( ctrl.peer_controller_enabled == true ) &&
+            ( enabled == false ))
+        {
+            hbs_cluster_rem ( controller );
+        }
+        if ( enabled == false )
+        {
+            hbs_cluster_change ( "peer controller disabled" ) ;
+        }
+        else
+        {
+           hbs_cluster_change ( "peer controller enabled" ) ;
+        }
+        ctrl.peer_controller_enabled = enabled ;
+    }
+    else if ( enabled == false ) /* this controller is being disabled */
+    {
+        hbs_cluster_change ( "this controller locked" ) ;
+        hbs_cluster_lock();
     }
-#endif
 }
 
 
@@ -267,7 +332,6 @@ void hbs_cluster_add ( string & hostname )
         ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
         ilog ("%s added to cluster", hostname.c_str());
         cluster_list ();
-        ctrl.cluster_change = true ;
     }
 
     /* Manage storage-0 state */
@@ -284,13 +348,6 @@ void hbs_cluster_add ( string & hostname )
 
     /* Manage controller state ; true means enabled in this case. */
     hbs_manage_controller_state ( hostname, true );
-
-    if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
-    {
-        hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
-        ctrl.cluster_change = false ;
-    }
-
 }
 
 /****************************************************************************
@@ -341,17 +398,32 @@ void hbs_cluster_del ( string & hostname )
 
             cluster_list ();
 
-            ctrl.cluster_change = true ;
+            hbs_cluster_change ( hostname + " deleted" );
 
             break ;
         }
     }
+}
 
-    if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
-    {
-        hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
-        ctrl.cluster_change = false ;
-    }
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_period_start
+ *
+ * Description : The following things need to be done at the start of
+ *               every pulse period ...
+ *
+ *               - log (clog3) the 'got_peer_controller_history' value left
+ *                 by the previous period and then reset it to false ; it is
+ *                 expected to be set true again when at least one hbsClient
+ *                 response contains history from the other controller.
+ ***************************************************************************/
+
+void hbs_cluster_period_start ( void )
+{
+    clog3 ("Pulse Period Start ; waiting on responses (last:%d)",
+            ctrl.got_peer_controller_history );
+    if ( ctrl.got_peer_controller_history )
+        ctrl.got_peer_controller_history = false ;
 }
 
 /****************************************************************************
@@ -500,114 +572,36 @@ void hbs_cluster_update ( iface_enum iface,
             ctrl.storage_0_not_responding_count[n] = 0 ;
     }
 
-    /*
-     * Manage the history entry index.
-     *
-     * Get the previous entry index ...
-     * ... which is the one before the oldest index.
-     * ... which is the index for the next entry.
-     */
-    unsigned short last_entry_index ;
-    unsigned short oldest_entry_index = history_ptr->oldest_entry_index ;
-
-    if ( oldest_entry_index == 0 )
-    {
-        /* Go to the end of the array. */
-        last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
-    }
-    else
-    {
-        /* Otherwise, the previous index in the array */
-        last_entry_index = oldest_entry_index - 1 ;
-    }
-
-    bool   logit = false ;
-    string logit_reason = "" ;
-
-    /* Update the history with this data. */
-    history_ptr->entry[oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
-    history_ptr->entry[oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
-
-    if (( history_ptr->entry[oldest_entry_index].hosts_enabled !=
-          history_ptr->entry[  last_entry_index].hosts_enabled ) ||
-        ( history_ptr->entry[oldest_entry_index].hosts_responding !=
-          history_ptr->entry[  last_entry_index].hosts_responding))
-    {
-        /* Only log on change events. */
-        if ( history_ptr->entry[oldest_entry_index].hosts_enabled ==
-             history_ptr->entry[oldest_entry_index].hosts_responding )
-        {
-            ilog ("controller-%d %s cluster of %d is healthy",
-                   ctrl.this_controller,
-                   hbs_cluster_network_name(n).c_str(),
-                   history_ptr->entry[oldest_entry_index].hosts_enabled);
-            ctrl.cluster_change_threshold_count = 0 ;
-            ctrl.cluster_change_difference_count = 0 ;
-        }
-        else
-        {
-            ctrl.cluster_change_threshold_count++ ;
-            ctrl.cluster_change_difference_count =
-            history_ptr->entry[oldest_entry_index].hosts_enabled -
-            history_ptr->entry[oldest_entry_index].hosts_responding ;
-        }
-    }
-    if ( daemon_get_cfg_ptr()->debug_state&4 )
-    {
-        logit = true ;
-        logit_reason = "(debug)" ;
-    }
-//    else if (( ctrl.cluster_change_threshold_count == 1 ) &&
-//             ( cluster_change == false ))
-//    {
-//        logit = true ;
-//        logit_reason = "" ;
-//    }
-    else if ( ctrl.cluster_change_threshold_count >= CLUSTER_CHANGE_THRESHOLD )
-    {
-        logit = true ;
-        ctrl.cluster_change_threshold_count = 0 ;
-        logit_reason = "(threshold)" ;
-    }
-    else
-    {
-        int delta =
-        history_ptr->entry[oldest_entry_index].hosts_enabled -
-        history_ptr->entry[oldest_entry_index].hosts_responding ;
-        if ( delta != ctrl.cluster_change_difference_count )
-        {
-            logit = true ;
-            ctrl.cluster_change_difference_count = delta ;
-            logit_reason = "(delta)" ;
-        }
-    }
-
-    if ( logit )
-    {
-        ilog ("controller-%d %s cluster of %d with %d responding (%d:%d) %s",
-               ctrl.this_controller,
-               hbs_cluster_network_name(n).c_str(),
-               history_ptr->entry[oldest_entry_index].hosts_enabled,
-               history_ptr->entry[oldest_entry_index].hosts_responding,
-               ctrl.cluster_change_difference_count,
-               not_responding_hosts,
-               logit_reason.c_str());
-    }
-
     /* Increment the entries count till it reaches the max. */
     if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
         history_ptr->entries++ ;
 
-    /* Manage the next entry update index ; aka the oldest index. */
-    if ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
+    /* Update the history with this data. */
+    history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
+    history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
+
+    /* Manage the next entry update index ; aka the oldest index.
+     * - handle not full case ; oldest entry is the first entry
+     * - handle the full case ; wrap around */
+    if (( history_ptr->entries == 0 ) ||
+        ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1)))
         history_ptr->oldest_entry_index = 0 ;
     else
         history_ptr->oldest_entry_index++ ;
 
+    /* send SM an update if the cluster has changed which is indicated
+     * by string content in ctrl.cluster_change_reason. */
+    if ( ! ctrl.cluster_change_reason.empty() )
+    {
+        hbs_cluster_send( ctrl.sm_socket_ptr, 0, ctrl.cluster_change_reason );
+        ctrl.cluster_change_reason = "" ;
+    }
+
     /* clear the log throttle if we are updating history ok. */
     ctrl.log_throttle = 0 ;
 }
 
+
 /****************************************************************************
  *
  * Name        : hbs_cluster_append
@@ -646,6 +640,23 @@ void hbs_cluster_append ( hbs_message_type & msg )
             ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
 }
 
+/* Inject a 0:0 sample into the peer controller history when none was received. */
+void hbs_cluster_peer ( void )
+{
+    /* Manage updating the local peer controller history data with 0:0
+     * when the peer controller is enabled but no peer controller history
+     * has been flagged as received (got_peer_controller_history false). */
+    if (( ctrl.got_peer_controller_history == false ) &&
+        ( ctrl.peer_controller_enabled == true ))
+    {
+        ilog ("missing peer controller cluster view" ); /* NOTE(review): tagged DEBUG by the author ; logged on every call that takes this branch */
+
+        /* 'this_controller ? 0 : 1' yields the other controller id of the
+         * pair ; 0 hosts enabled and 0 hosts responding are injected. */
+        hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
+    }
+}
+
 /****************************************************************************
  *
  * Name        : hbs_cluster_unused_bytes
@@ -679,7 +690,7 @@ unsigned short hbs_cluster_unused_bytes ( void )
  *
  ***************************************************************************/
 
-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
+void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
 {
     ctrl.cluster.reqid = (unsigned short)reqid ;
     if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
@@ -691,16 +702,7 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
              elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
                     bytes , errno, strerror(errno));
         }
-        else
-        {
-            string reason = "" ;
-            // ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
-            if ( reqid )
-                reason = "cluster query" ;
-            else
-                reason = "cluster event" ;
-            hbs_cluster_dump ( ctrl.cluster, reason, true );
-        }
+        hbs_cluster_dump ( ctrl.cluster, reason );
     }
     else
     {
@@ -725,6 +727,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
         if (( ctrl.cluster.history[h].controller ==  sample.controller ) &&
             ( ctrl.cluster.history[h].network == sample.network ))
         {
+            if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
+            {
+                 hbs_cluster_change ("peer controller cluster event " +
+                 hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
+            }
+
             memcpy( &ctrl.cluster.history[h], &sample,
                     sizeof(mtce_hbs_cluster_history_type));
 
@@ -738,9 +746,13 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
         }
     }
 
+    hbs_cluster_change ( "peer controller cluster " +
+    hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
+
     /* not found ? Add a new one */
     memcpy( &ctrl.cluster.history[ctrl.cluster.histories], &sample,
             sizeof(mtce_hbs_cluster_history_type));
+
     ctrl.cluster.histories++ ;
     ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
 
@@ -753,7 +765,7 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
 
 void hbs_state_audit ( void )
 {
-   hbs_cluster_dump ( ctrl.cluster, "Audit", true );
+   hbs_cluster_dump ( ctrl.cluster, "Audit" ); /* dump the whole cluster vault with "Audit" as the reason */
 }
 
 
@@ -779,46 +791,39 @@ void hbs_cluster_log ( string & hostname,
  *
  * Name        : hbs_cluster_cmp
  *
- * Descrition  : Performs a sanity check over the cluster structure.
+ * Description : Compare 2 histories
  *
- * Assumptions : Debug tool, not called at runtime.
- *
- * Returns     : PASS or FAIL
+ * Returns     : 0 - when both histories contain the same number of
+ *                      entries whose enabled and responding counts differ.
+ *               # - the absolute difference between those two numbers.
  *
  ***************************************************************************/
 
-int hbs_cluster_cmp( hbs_message_type & msg )
+int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1, /* NOTE(review): both histories are passed by value */
+                     mtce_hbs_cluster_history_type h2 )
 {
-    if ( msg.cluster.version < ctrl.cluster.version )
+    int h1_delta = 0 ; /* h1 entries where hosts_enabled != hosts_responding */
+    int h2_delta = 0 ; /* h2 entries where hosts_enabled != hosts_responding */
+    int    delta = 0 ; /* absolute difference of the two counts above */
+
+    for ( int e = 0 ; e < h1.entries ; e++ )
+        if ( h1.entry[e].hosts_enabled != h1.entry[e].hosts_responding )
+            h1_delta++ ;
+
+    for ( int e = 0 ; e < h2.entries ; e++ )
+        if ( h2.entry[e].hosts_enabled != h2.entry[e].hosts_responding )
+            h2_delta++ ;
+
+    if ( h1_delta > h2_delta ) /* subtract smaller from larger ; delta stays 0 when the counts match */
+        delta = h1_delta-h2_delta ;
+    else if ( h2_delta > h1_delta )
+        delta = h2_delta-h1_delta ;
+
+    if ( delta ) /* only log when the two counts differ */
     {
-        wlog ("Unexpected version (%d:%d)",
-               msg.cluster.version, ctrl.cluster.version );
+        clog3 ("peer controller reporting %d deltas", delta );
     }
-    else if ( msg.cluster.revision != ctrl.cluster.revision )
-    {
-        wlog ("Unexpected revision (%d:%d)",
-               msg.cluster.revision, ctrl.cluster.revision );
-    }
-    else if ( msg.cluster.magic_number != ctrl.cluster.magic_number )
-    {
-        wlog ("Unexpected magic number (%d:%d)",
-               msg.cluster.magic_number, ctrl.cluster.magic_number );
-    }
-    else if ( msg.cluster.period_msec != ctrl.cluster.period_msec )
-    {
-        wlog ("Cluster Heartbeat period delta (%d:%d)",
-               msg.cluster.period_msec, ctrl.cluster.period_msec );
-    }
-    else if ( msg.cluster.storage0_enabled != ctrl.cluster.storage0_enabled )
-    {
-        wlog ("Cluster storage0 enabled state delta (%d:%d)",
-               msg.cluster.storage0_enabled, ctrl.cluster.storage0_enabled );
-    }
-    else
-    {
-        return (PASS);
-    }
-    return (FAIL);
+    return(delta); /* 0 means no difference in the counts ; callers treat non-zero as a change */
 }
 
 /****************************************************************************
@@ -843,23 +848,106 @@ int hbs_cluster_save ( string & hostname,
     if ( ! ctrl.monitored_hosts )
         return RETRY ;
 
-    if ( msg.cluster.histories == 0 )
-        return PASS ;
-
-    for ( int h = 0 ; h < msg.cluster.histories ; h++ )
+    if ( ! msg.cluster.histories )
     {
-        if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS )
+        wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
+                         "%s %s ; no peer controller history",
+                         hostname.c_str(),
+                         hbs_cluster_network_name(network).c_str());
+    }
+
+    if ( ctrl.peer_controller_enabled )
+    {
+        /* Should only contain the other controllers history */
+        for ( int h = 0 ; h < msg.cluster.histories ; h++ )
         {
-            elog ("Invalid network id (%d:%d:%d)",
-                   h,
-                   msg.cluster.history[h].controller,
-                   msg.cluster.history[h].network );
+            if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS )
+            {
+                elog ("Invalid network id (%d:%d:%d)",
+                       h,
+                       msg.cluster.history[h].controller,
+                       msg.cluster.history[h].network );
+            }
+            else if ( msg.cluster.history[h].controller != ctrl.this_controller )
+            {
+                /* set that we got some history and save it */
+                ctrl.got_peer_controller_history = true ;
+                hbs_history_save ( hostname, msg.cluster.history[h] );
+            }
+            hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
         }
-        else if ( msg.cluster.history[h].controller != ctrl.this_controller )
-        {
-            hbs_history_save ( hostname, msg.cluster.history[h] );
-        }
-        hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
     }
     return (PASS);
 }
+
+/* Write hosts_enabled:hosts_responding into the oldest entry of every history owned by 'controller'. */
+void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enabled, unsigned short hosts_responding )
+{
+    for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
+    {
+        if ( ctrl.cluster.history[h].controller == controller )
+        {
+            bool dumpit = false ; /* set when this history was actually modified */
+            if (( ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_enabled ) ||
+                ( ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_responding ))
+            {
+                /* Only an oldest entry still holding a non-zero count is overwritten and reported */
+                ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_enabled = hosts_enabled ;
+                ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_responding = hosts_responding ;
+
+                wlog ("controller-%d injected %d:%d into controller-%d %s history (entry %d)",
+                       controller?0:1, /* the other controller of the pair is logged as the injector */
+                       hosts_enabled,
+                       hosts_responding,
+                       controller,
+                       hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
+                       ctrl.cluster.history[h].oldest_entry_index  );
+                dumpit = true ;
+            }
+            /* advance the oldest index with wrap ; this happens even when nothing was written */
+            if ( ++ctrl.cluster.history[h].oldest_entry_index == MTCE_HBS_HISTORY_ENTRIES )
+                ctrl.cluster.history[h].oldest_entry_index = 0 ;
+
+            /* DEBUG: dump the history only when it was modified above */
+            if ( dumpit )
+                hbs_cluster_dump( ctrl.cluster.history[h], ctrl.cluster.storage0_enabled );
+        }
+    }
+}
+
+/* Remove every history owned by 'controller' ; surviving histories are packed to the front. */
+void hbs_cluster_rem ( unsigned short controller )
+{
+    int kept = 0 ; /* next slot for a surviving history ; also the survivor count */
+    for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
+    {
+        if ( ctrl.cluster.history[h].controller == controller )
+        {
+            wlog ("controller-%d %s network history removed from cluster (slot %d)", controller,
+                   hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(), h );
+            continue ;
+        }
+        if ( kept != h ) /* close the gap ; lookups only scan slots [0..histories-1] */
+            memcpy ( &ctrl.cluster.history[kept], &ctrl.cluster.history[h], sizeof(mtce_hbs_cluster_history_type));
+        kept++ ;
+    }
+    int removed = ctrl.cluster.histories - kept ;
+    if ( removed )
+    {
+        memset ( &ctrl.cluster.history[kept], 0, removed * sizeof(mtce_hbs_cluster_history_type)); /* zero the vacated tail */
+        hbs_cluster_change ( "removed controller history" ) ;
+    }
+    ctrl.cluster.histories -= removed ;
+    ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
+}
+
+/* remove all cluster history on a lock operation */
+void hbs_cluster_lock( void )
+{
+    ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller );
+    for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
+        memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
+    /* no histories remain ; keep the vault byte count in step with that */
+    ctrl.cluster.histories = 0 ;
+    ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
+}
diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp
index 3980014a..5006b089 100644
--- a/mtce/src/heartbeat/hbsUtil.cpp
+++ b/mtce/src/heartbeat/hbsUtil.cpp
@@ -113,11 +113,13 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network )
 
 /****************************************************************************
  *
- * Name        : hbs_cluster_copy
+ * Name       : hbs_cluster_copy
  *
- * Descrition  : Copies cluster from src to dst.
+ * Description: Copies cluster from src to dst.
  *
- * Returns     : Nothing.
+ * Parameters : cluster type.
+ *
+ * Returns    : Nothing.
  *
  ***************************************************************************/
 
@@ -206,7 +208,6 @@ void hbs_cluster_log ( string & hostname,
                     line.append (str);
                     str[0] = '\0' ;
                 }
-//#ifdef WANT_DOTS
                 else if (( history_ptr->entry[this_index].hosts_enabled ==
                            e.hosts_enabled ) &&
                          ( history_ptr->entry[this_index].hosts_responding ==
@@ -214,7 +215,6 @@ void hbs_cluster_log ( string & hostname,
                 {
                     line.append(". ");
                 }
-//#endif
                 else
                 {
                     snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
@@ -302,66 +302,83 @@ void hbs_cluster_log ( string & hostname,
 
 /****************************************************************************
  *
- * name       : hbs_cluster_dump
+ * Name        : hbs_cluster_dump
  *
- * Description: Formatted dump of the vault contents to the log file.
+ * Description : Formatted dump of the specified history to the log file.
+ *
+ * Parameters  :
+ *
+ *    history is a single history type whose contents will be logged.
+ *    storage0_enabled true suggests the storage state should also be logged.
  *
  ***************************************************************************/
-void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force )
+
+void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_enabled )
 {
-    if ( vault.version == 0 )
+    #define MAX_LINE_LEN (500)
+    char str[MAX_LINE_LEN] = "" ; /* start empty ; the loop below may not run at all */
+    int i = 0 ;                   /* write offset ; kept equal to strlen(str) */
+    for ( int e = 0 ; e < history.entries_max ; e++ )
+    {
+        snprintf ( &str[i], MAX_LINE_LEN-i, "%c[%d:%d]" , /* bound is the space left after offset i */
+                   history.oldest_entry_index==e ? '>' : ' ', /* '>' marks the oldest entry */
+                   history.entry[e].hosts_enabled,
+                   history.entry[e].hosts_responding);
+        i = strlen(str) ;
+    }
+    if ( storage0_enabled ) /* include the storage-0 responding state in the line */
+    {
+        syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
+                 history.controller,
+                 hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
+                 history.storage0_responding ? "y" : "n",
+                 str);
+    }
+    else
+    {
+        syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
+                 history.controller,
+                 hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
+                 str);
+    }
+}
+
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_dump
+ *
+ * Description : Formatted dump of the vault contents to the log file.
+ *
+ * Parameters  :
+ *
+ *    vault is a reference to a cluster type whose contents will be logged.
+ *    reason is a string indicating the reason for the dump.
+ *
+ ***************************************************************************/
+
+void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string reason )
+{
+    if (( vault.version == 0 ) || ( vault.histories == 0 )) /* nothing is logged when version or histories is 0 */
         return ;
 
-    int debug = daemon_get_cfg_ptr()->debug_state ;
-
-    if (( debug & 2 ) || ( force == true ))
+    /* A reason longer than 40 characters is logged on its own line first */
+    if ( reason.length() > 40 )
     {
-        ilog ("%s", log_prefix.c_str());
-        syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec heartbeat period %s;%d network heartbeat response histories (%d bytes)",
-                 vault.version,
-                 vault.revision,
-                 vault.period_msec,
-                 vault.storage0_enabled ? " with storage-0: enabled " : "",
-                 vault.histories,
-                 vault.bytes );
+        syslog ( LOG_INFO, "Cluster Dump  : %s", reason.c_str());
+        reason = "" ; /* local copy cleared so the summary line below does not repeat it */
     }
+    syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec period %s;%d network histories (%d bytes) %s",
+             vault.version,
+             vault.revision,
+             vault.period_msec,
+             vault.storage0_enabled ? " with storage-0: enabled " : "",
+             vault.histories,
+             vault.bytes,
+             reason.c_str()); /* empty here when the reason was already logged above */
 
-    if (( debug & 4 ) || ( force == true ))
+    for ( int h = 0 ; h < vault.histories ; h++ ) /* one log line per history in the vault */
     {
-        for ( int h = 0 ; h < vault.histories ; h++ )
-        {
-            #define MAX_LINE_LEN (500)
-            char str[MAX_LINE_LEN] ;
-            int i = 0 ;
-            for ( int e = 0 ; e < vault.history[h].entries_max ; e++ )
-            {
-                snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
-                           vault.history[h].oldest_entry_index==e ? '>' : ' ',
-                           vault.history[h].entry[e].hosts_enabled,
-                           vault.history[h].entry[e].hosts_responding);
-                i = strlen(str) ;
-            }
-            if ( vault.storage0_enabled )
-            {
-                syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
-                                    vault.history[h].controller,
-                                    hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
-                                    vault.history[h].storage0_responding ? "y" : "n",
-                                    str);
-            }
-            else
-            {
-                syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
-                                    vault.history[h].controller,
-                                    hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
-                                    str);
-            }
-        }
-    }
-
-    if ( debug & 8 )
-    {
-        dump_memory ( &vault, 16, vault.bytes );
+        hbs_cluster_dump ( vault.history[h], vault.storage0_enabled ); /* single-history overload */
     }
 }
 
diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp
index d0aa0284..d134ccc6 100755
--- a/mtce/src/maintenance/mtcCompMsg.cpp
+++ b/mtce/src/maintenance/mtcCompMsg.cpp
@@ -795,9 +795,9 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int
         }
 
         if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
-        {
             msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;
-        }
+        if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) )
+            msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_UNHEALTHY ;
 
     /* add the interface and sequence number to the mtcAlice message */
     identity.append ( ",\"interface\":\"");
diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp
index 06dfd228..13b0a9fd 100755
--- a/mtce/src/maintenance/mtcCtrlMsg.cpp
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp
@@ -794,8 +794,10 @@ int send_hbs_command ( string hostname, int cmd, string controller )
     controllers.clear();
     if ( controller == CONTROLLER )
     {
-        controllers.push_back(CONTROLLER_0);
-        controllers.push_back(CONTROLLER_1);
+        if ( obj_ptr->hostname_provisioned(CONTROLLER_0) )
+            controllers.push_back(CONTROLLER_0);
+        if ( obj_ptr->hostname_provisioned(CONTROLLER_1) )
+            controllers.push_back(CONTROLLER_1);
     }
     else
     {
diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp
index 2536a7b7..73f85b1d 100755
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -1065,6 +1065,18 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
                         enableStageChange(node_ptr, MTC_ENABLE__FAILURE);
                         break ;
                     }
+
+                    else if (( is_controller(node_ptr) == true ) &&
+                             ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY ))
+                    {
+                        elog ("%s is SM UNHEALTHY",
+                                  node_ptr->hostname.c_str() );
+                        elog ("%s ... enable failed ; controller needs to reboot\n",
+                                  node_ptr->hostname.c_str());
+                        enableStageChange(node_ptr, MTC_ENABLE__FAILURE);
+                        break ;
+                    }
+
                     /* Set the node mtcAlive timer to configured value.
                      * This will revert bact to normal timeout after any first
                      * unlock value that may be in effect. */
diff --git a/mtce/src/maintenance/mtcStubs.cpp b/mtce/src/maintenance/mtcStubs.cpp
index 4fc3ff80..4d579928 100644
--- a/mtce/src/maintenance/mtcStubs.cpp
+++ b/mtce/src/maintenance/mtcStubs.cpp
@@ -21,3 +21,7 @@ void hbs_cluster_log ( string & hostname, string prefix, bool force=false )
     UNUSED(force);
 }
 
+void hbs_cluster_change ( string reason )
+{
+    UNUSED(reason); /* stub body ; the reason is accepted and ignored */
+}