diff --git a/service-mgmt/sm-1.0.0/centos/build_srpm.data b/service-mgmt/sm-1.0.0/centos/build_srpm.data
index 9ee81464..918657b2 100644
--- a/service-mgmt/sm-1.0.0/centos/build_srpm.data
+++ b/service-mgmt/sm-1.0.0/centos/build_srpm.data
@@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE
 COPY_LIST="$PKG_BASE/LICENSE"
 TAR_NAME=sm
 VERSION=1.0.0
-TIS_PATCH_VER=27
+TIS_PATCH_VER=28
diff --git a/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.cpp b/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.cpp
index c2158788..ca91c18a 100644
--- a/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.cpp
+++ b/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.cpp
@@ -21,7 +21,15 @@
 #include "sm_debug.h"
 #include "sm_limits.h"
 #include "sm_selobj.h"
-#include "sm_timer.h"
+#include "sm_worker_thread.h"
+
+// uncomment when debugging this module to enable DPRINTFD output to the log file
+// #define __DEBUG__MSG__
+
+#ifdef __DEBUG__MSG__
+#undef DPRINTFD
+#define DPRINTFD DPRINTFI
+#endif
 
 #define LOOPBACK_IP "127.0.0.1"
 #define SM_CLIENT_PORT_KEY "sm_client_port"
@@ -64,6 +72,45 @@ bool operator!=(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs)
     return !(lhs == rhs);
 }
 
+void log_cluster_hbs_state(const SmClusterHbsStateT& state)
+{
+    if(0 == state.last_update)
+    {
+        DPRINTFI("Cluster hbs state not available");
+        return;
+    }
+
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+    int secs_since_update = ts.tv_sec - state.last_update;
+
+    if(state.storage0_enabled)
+    {
+        DPRINTFI("Cluster hbs last updated %d secs ago, storage-0 is provisioned, "
+                 "from controller-0: %d nodes enabled, %d nodes reachable, storage-0 %s responding "
+                 "from controller-1: %d nodes enabled, %d nodes reachable, storage-0 %s responding",
+                 secs_since_update,
+                 state.controllers[0].number_of_node_enabled,
+                 state.controllers[0].number_of_node_reachable,
+                 state.controllers[0].storage0_responding ? "is" : "is not",
+                 state.controllers[1].number_of_node_enabled,
+                 state.controllers[1].number_of_node_reachable,
+                 state.controllers[1].storage0_responding ? "is" : "is not"
+        );
+    }else
+    {
+        DPRINTFI("Cluster hbs last updated %d secs ago, storage-0 is not provisioned, "
+                 "from controller-0: %d nodes enabled, %d nodes reachable, "
+                 "from controller-1: %d nodes enabled, %d nodes reachable",
+                 secs_since_update,
+                 state.controllers[0].number_of_node_enabled,
+                 state.controllers[0].number_of_node_reachable,
+                 state.controllers[1].number_of_node_enabled,
+                 state.controllers[1].number_of_node_reachable
+        );
+    }
+}
+
 pthread_mutex_t SmClusterHbsInfoMsg::_mutex;
 const unsigned short Invalid_Req_Id = 0;
 int SmClusterHbsInfoMsg::_sock = -1;
@@ -103,8 +150,8 @@ bool SmClusterHbsInfoMsg::_process_cluster_hbs_history(mtce_hbs_cluster_history_
         DPRINTFE("Invalid oldest entry index %d", history.oldest_entry_index);
         return false;
     }
-    int newest_entry_index = (history.oldest_entry_index + history.entries) % MTCE_HBS_HISTORY_ENTRIES;
+    int newest_entry_index = (history.oldest_entry_index + history.entries - 1) % MTCE_HBS_HISTORY_ENTRIES;
 
     mtce_hbs_cluster_entry_type& entry = history.entry[newest_entry_index];
     SmClusterHbsInfoT& controller_state = state.controllers[history.controller];
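
The hunk above fixes an off-by-one when locating the newest entry in the heartbeat history ring buffer: with "entries" valid slots starting at oldest_entry_index, the newest occupied slot sits entries - 1 positions later, not entries. A standalone illustration of the arithmetic (the buffer size below is assumed for the example, not taken from the mtce headers):

    #include <assert.h>

    #define MTCE_HBS_HISTORY_ENTRIES 20  /* assumed size, for illustration only */

    int main(void)
    {
        /* Three valid entries starting at index 2 occupy slots 2, 3 and 4,
           so the newest entry lives at index 4. */
        int oldest_entry_index = 2;
        int entries = 3;

        /* old computation: lands one past the newest entry */
        int old_index = (oldest_entry_index + entries) % MTCE_HBS_HISTORY_ENTRIES;
        /* fixed computation from the patch */
        int new_index = (oldest_entry_index + entries - 1) % MTCE_HBS_HISTORY_ENTRIES;

        assert(old_index == 5);
        assert(new_index == 4);
        return 0;
    }
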
"is" : "is not" + ); + }else + { + DPRINTFI("Cluster hbs last updated %d secs ago, storage-0 is not provisioned, " + "from controller-0: %d nodes enabled, %d nodes reachable, " + "from controller-1: %d nodes enabled, %d nodes reachable", + secs_since_update, + state.controllers[0].number_of_node_enabled, + state.controllers[0].number_of_node_reachable, + state.controllers[1].number_of_node_enabled, + state.controllers[1].number_of_node_reachable + ); + } +} + pthread_mutex_t SmClusterHbsInfoMsg::_mutex; const unsigned short Invalid_Req_Id = 0; int SmClusterHbsInfoMsg::_sock = -1; @@ -103,8 +150,8 @@ bool SmClusterHbsInfoMsg::_process_cluster_hbs_history(mtce_hbs_cluster_history_ DPRINTFE("Invalid oldest entry index %d", history.oldest_entry_index); return false; } - int newest_entry_index = (history.oldest_entry_index + history.entries) % MTCE_HBS_HISTORY_ENTRIES; + int newest_entry_index = (history.oldest_entry_index + history.entries - 1) % MTCE_HBS_HISTORY_ENTRIES; mtce_hbs_cluster_entry_type& entry = history.entry[newest_entry_index]; SmClusterHbsInfoT& controller_state = state.controllers[history.controller]; @@ -112,7 +159,11 @@ bool SmClusterHbsInfoMsg::_process_cluster_hbs_history(mtce_hbs_cluster_history_ if(entry.hosts_responding > controller_state.number_of_node_reachable) { controller_state.number_of_node_reachable = entry.hosts_responding; + controller_state.number_of_node_enabled = entry.hosts_enabled; } + + DPRINTFD("Oldest index %d, entries %d, newest index %d, nodes %d", + history.oldest_entry_index, history.entries, newest_entry_index, entry.hosts_responding); return true; } @@ -123,7 +174,6 @@ void SmClusterHbsInfoMsg::_cluster_hbs_info_msg_received( int selobj, int64_t us while(true) { int bytes_read = recv( selobj, &msg, sizeof(msg), MSG_NOSIGNAL | MSG_DONTWAIT ); - DPRINTFD("msg received %d bytes. buffer size %d", bytes_read, sizeof(msg)); if(bytes_read < 0) { if(EAGAIN != errno) @@ -132,6 +182,7 @@ void SmClusterHbsInfoMsg::_cluster_hbs_info_msg_received( int selobj, int64_t us } return; } + DPRINTFD("msg received %d bytes. buffer size %d", bytes_read, sizeof(msg)); if(size_of_msg_header > (unsigned int)bytes_read) { DPRINTFE("size not right, msg size %d, expected not less than %d", @@ -173,6 +224,8 @@ void SmClusterHbsInfoMsg::_cluster_hbs_info_msg_received( int selobj, int64_t us { _cluster_hbs_state_previous = _cluster_hbs_state_current; _cluster_hbs_state_current = state; + DPRINTFD("cluster hbs state changed"); + log_cluster_hbs_state(_cluster_hbs_state_current); } else { @@ -220,6 +273,13 @@ SmErrorT SmClusterHbsInfoMsg::_get_address(const char* port_key, struct sockaddr return SM_OKAY; } +static void send_query(SmSimpleAction&) +{ + SmClusterHbsInfoMsg::cluster_hbs_info_query(); +} + +static SmSimpleAction _query_hbs_cluster_info_action("send hbs-cluster query", send_query); + // **************************************************************************** // SmClusterHbsInfoMsg::cluster_hbs_info_query - // trigger a query of cluster hbs info. 
@@ -386,6 +448,7 @@ SmErrorT SmClusterHbsInfoMsg::initialize()
         return SM_FAILED;
     }
 
+    SmWorkerThread::get_worker().add_action(&_query_hbs_cluster_info_action);
     return SM_OKAY;
 }
 
@@ -432,6 +495,7 @@ void SmClusterHbsInfoMsg::dump_hbs_record(FILE* fp)
 
     if(0 != _cluster_hbs_state_previous.last_update)
     {
+        t = ts.tv_sec - _cluster_hbs_state_previous.last_update;
         fprintf(fp, "\n Previous state, since %d seconds ago\n", (int)t);
         fprintf(fp, " storage-0 is %s configured\n",
                 _cluster_hbs_state_previous.storage0_enabled ? "" : "not");
"" : "not"); diff --git a/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.h b/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.h index 73a1b566..1cb304fd 100644 --- a/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.h +++ b/service-mgmt/sm-1.0.0/src/sm_cluster_hbs_info_msg.h @@ -21,7 +21,10 @@ struct _SmClusterHbsInfoT { bool storage0_responding; int number_of_node_reachable; - _SmClusterHbsInfoT() : storage0_responding(false), number_of_node_reachable(0) + int number_of_node_enabled; + _SmClusterHbsInfoT() : storage0_responding(false), + number_of_node_reachable(0), + number_of_node_enabled(0) { } }; @@ -47,6 +50,12 @@ typedef struct bool operator==(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs); bool operator!=(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs); +inline bool is_valid(const SmClusterHbsStateT& state) +{ + return state.last_update > 0; +} + +void log_cluster_hbs_state(const SmClusterHbsStateT& state); typedef void(*cluster_hbs_query_ready_callback)(); // **************************************************************************** diff --git a/service-mgmt/sm-1.0.0/src/sm_failover.c b/service-mgmt/sm-1.0.0/src/sm_failover.c index 3924e328..c55bc118 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover.c @@ -29,6 +29,7 @@ #include "sm_service_domain_neighbor_fsm.h" #include "sm_service_domain_member_table.h" #include "sm_service_domain_interface_fsm.h" +#include "sm_service_domain_fsm.h" #include "sm_heartbeat_msg.h" #include "sm_node_swact_monitor.h" #include "sm_util_types.h" @@ -602,36 +603,33 @@ SmFailoverActionResultT sm_failover_swact() // **************************************************************************** // Failover - fail self // ================== -SmFailoverActionResultT sm_failover_fail_self() +SmErrorT sm_failover_fail_self() { DPRINTFI("To disable %s", _host_name); SmErrorT error = sm_node_fsm_event_handler( - _host_name, SM_NODE_EVENT_DISABLED, NULL, "Host is isolated" ); + _host_name, SM_NODE_EVENT_DISABLED, NULL, "Host is failed" ); if( SM_OKAY != error ) { DPRINTFE("Failed to disable %s, error: %s", _host_name, sm_error_str(error)); - return SM_FAILOVER_ACTION_RESULT_FAILED; + return SM_FAILED; } sm_node_utils_set_unhealthy(); error = sm_node_api_fail_node( _host_name ); - if (SM_OKAY == error ) - { - return SM_FAILOVER_ACTION_RESULT_OK; - } - else + if (SM_OKAY != error ) { DPRINTFE("Failed to set %s failed, error %s.", _host_name, sm_error_str(error)); - return SM_FAILOVER_ACTION_RESULT_FAILED; + return SM_FAILED; } + return SM_OKAY; } // **************************************************************************** // **************************************************************************** // Failover - disable node // ================== -SmFailoverActionResultT sm_failover_disable_node(char* node_name) +SmErrorT sm_failover_disable_node(char* node_name) { DPRINTFI("To disable %s", node_name); @@ -645,9 +643,9 @@ SmFailoverActionResultT sm_failover_disable_node(char* node_name) { DPRINTFE( "Failed to disable node %s, error=%s.", node_name, sm_error_str( error ) ); - return SM_FAILOVER_ACTION_RESULT_FAILED; + return SM_FAILED; } - return SM_FAILOVER_ACTION_RESULT_OK; + return SM_OKAY; } // **************************************************************************** @@ -796,6 +794,21 @@ bool this_controller_unlocked() } // **************************************************************************** +static SmErrorT sm_ensure_leader_scheduler() +{ + char controller_domain[] 
= "controller"; + char reason_text[SM_LOG_REASON_TEXT_MAX_CHAR] = "Loss of heartbeat"; + + SmErrorT error = sm_service_domain_fsm_set_state( + controller_domain, + SM_SERVICE_DOMAIN_STATE_LEADER, + reason_text ); + if(SM_OKAY != error) + { + DPRINTFE("Failed to ensure leader scheduler. Error %s", sm_error_str(error)); + } + return error; +} // **************************************************************************** // Failover - set system to scheduled status // ================== @@ -808,6 +821,16 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status) SmNodeScheduleStateT host_target_state, peer_target_state; host_target_state = failover_status.get_host_schedule_state(); peer_target_state = failover_status.get_peer_schedule_state(); + SmHeartbeatStateT heartbeat_state = failover_status.get_heartbeat_state(); + if(SM_HEARTBEAT_OK != heartbeat_state) + { + if(SM_OKAY != sm_ensure_leader_scheduler()) + { + DPRINTFE("Failed to set new leader scheduler to local"); + return SM_FAILED; + } + } + if(SM_NODE_STATE_ACTIVE == host_target_state) { if(SM_NODE_STATE_STANDBY == _host_state && @@ -818,8 +841,7 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status) DPRINTFE("Failed to activate %s.", _host_name); return SM_FAILED; } - result = sm_failover_disable_node(_peer_name); - if(SM_FAILOVER_ACTION_RESULT_FAILED == result) + if(SM_OKAY != sm_failover_disable_node(_peer_name)) { DPRINTFE("Failed to disable node %s.", _peer_name); return SM_FAILED; @@ -839,8 +861,7 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status) } else if(SM_NODE_STATE_FAILED == host_target_state) { - result = sm_failover_disable_node(_host_name); - if(SM_FAILOVER_ACTION_RESULT_FAILED == result) + if(SM_OKAY != sm_failover_fail_self()) { DPRINTFE("Failed disable host %s.", _host_name); return SM_FAILED; diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp index 9e7f7711..b23a33d8 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp @@ -6,6 +6,7 @@ #include "sm_failover_fail_pending_state.h" #include #include +#include "sm_cluster_hbs_info_msg.h" #include "sm_types.h" #include "sm_limits.h" #include "sm_debug.h" @@ -17,7 +18,8 @@ #include "sm_node_api.h" #include "sm_worker_thread.h" -static const int FAIL_PENDING_TIMEOUT = 2000; //2000ms +static const int FAIL_PENDING_TIMEOUT = 2000; // 2seconds +static const int DELAY_QUERY_HBS_MS = FAIL_PENDING_TIMEOUT - 200; // give 200ms for hbs agent to respond static SmTimerIdT action_timer_id = SM_TIMER_ID_INVALID; static const int RESET_TIMEOUT = 10 * 1000; // 10 seconds for a reset command to reboot a node @@ -294,6 +296,20 @@ SmErrorT SmFailoverFailPendingState::enter_state() return error; } +void _cluster_hbs_response_callback() +{ + const SmClusterHbsStateT& cluster_hbs_state = SmClusterHbsInfoMsg::get_current_state(); + log_cluster_hbs_state(cluster_hbs_state); + SmSystemFailoverStatus::get_status().set_cluster_hbs_state(cluster_hbs_state); +} + +bool SmFailoverFailPendingState::_delay_query_hbs_timeout( + SmTimerIdT timer_id, int64_t user_data) +{ + SmClusterHbsInfoMsg::cluster_hbs_info_query(_cluster_hbs_response_callback); + return false; +} + SmErrorT SmFailoverFailPendingState::_register_timer() { SmErrorT error; @@ -303,31 +319,47 @@ SmErrorT SmFailoverFailPendingState::_register_timer() this->_deregister_timer(); } - error 
@@ -303,31 +319,47 @@ SmErrorT SmFailoverFailPendingState::_register_timer()
         this->_deregister_timer();
     }
 
-    error = sm_timer_register( timer_name, FAIL_PENDING_TIMEOUT,
-                               SmFailoverFailPendingState::_fail_pending_timeout,
-                               0, &this->_pending_timer_id);
+    error = sm_timer_register(timer_name, FAIL_PENDING_TIMEOUT,
+                              SmFailoverFailPendingState::_fail_pending_timeout,
+                              0, &this->_pending_timer_id);
+
+    const char* delay_query_hbs_timer_name = "DELAY QUERY HBS";
+
+    error = sm_timer_register(delay_query_hbs_timer_name, DELAY_QUERY_HBS_MS,
+                              SmFailoverFailPendingState::_delay_query_hbs_timeout,
+                              0, &this->_delay_query_hbs_timer_id);
     return error;
 }
 
 SmErrorT SmFailoverFailPendingState::_deregister_timer()
 {
-    SmErrorT error;
-    if(SM_TIMER_ID_INVALID == this->_pending_timer_id)
+    SmErrorT error = SM_OKAY;
+    if(SM_TIMER_ID_INVALID != this->_pending_timer_id)
     {
-        return SM_OKAY;
+        error = sm_timer_deregister(this->_pending_timer_id);
+        if( SM_OKAY != error )
+        {
+            DPRINTFE( "Failed to cancel fail pending timer, error=%s.",
+                      sm_error_str( error ) );
+        }else
+        {
+            this->_pending_timer_id = SM_TIMER_ID_INVALID;
+        }
     }
 
-    error = sm_timer_deregister(this->_pending_timer_id);
-    if( SM_OKAY != error )
+    if(SM_TIMER_ID_INVALID != this->_delay_query_hbs_timer_id)
     {
-        DPRINTFE( "Failed to cancel fail pending timer, error=%s.",
-                  sm_error_str( error ) );
-    }else
-    {
-        this->_pending_timer_id = SM_TIMER_ID_INVALID;
+        error = sm_timer_deregister(this->_delay_query_hbs_timer_id);
+        if( SM_OKAY != error )
+        {
+            DPRINTFE( "Failed to cancel query hbs info timer, error=%s.",
+                      sm_error_str( error ) );
+        }else
+        {
+            this->_delay_query_hbs_timer_id = SM_TIMER_ID_INVALID;
+        }
     }
-
     return error;
 }
diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h
index 7b2c7fe4..58732d5c 100644
--- a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h
+++ b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h
@@ -22,8 +22,11 @@ class SmFailoverFailPendingState : public SmFSMState
 
     private:
         SmTimerIdT _pending_timer_id;
+        SmTimerIdT _delay_query_hbs_timer_id;
 
         static bool _fail_pending_timeout(SmTimerIdT timer_id, int64_t user_data);
+        static bool _delay_query_hbs_timeout(SmTimerIdT timer_id, int64_t user_data);
+        static void cluster_hbs_response_callback();
         SmErrorT _register_timer();
         SmErrorT _deregister_timer();
 };
diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp
index 577a733a..d2ae6831 100644
--- a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp
+++ b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp
@@ -10,41 +10,15 @@
 #include "sm_failover_fsm.h"
 #include "sm_failover_ss.h"
 
-static void _audit_failover_state()
-{
-    SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
-    SmErrorT error = sm_failover_ss_get_survivor(failover_status);
-    SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
-    SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
-    if(SM_OKAY != error)
-    {
-        DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
-        return;
-    }
-
-    if(SM_NODE_STATE_FAILED == host_state)
-    {
-        // don't need to set to failed state, already here
-    }
-    else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state)
-    {
-        // standby is the only possible state to be scheduled to from failed state
-        SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
-    }else
-    {
-        DPRINTFE("Runtime error: unexpected scheduling state: %s",
-                 sm_node_schedule_state_str(host_state));
-    }
-}
-
 SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
 {
+    // Currently the only supported way to recover from a failure is a
+    // reboot triggered by mtce. So once the failed state is entered,
+    // wait for the reboot to re-enter the normal state.
     switch (event)
     {
         case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
-            DPRINTFI("sm_heartbeat_recover event received.");
-            _audit_failover_state();
-
+            // the event still fires, but it cannot bring the fsm state back to normal
             break;
 
         default:
Error %s", sm_error_str(error)); - return; - } - - if(SM_NODE_STATE_FAILED == host_state) - { - // don't need to set to failed state, already here - } - else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state) - { - // standby is the only possible state to be scheduled to from failed state - SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL); - }else - { - DPRINTFE("Runtime error: unexpected scheduling state: %s", - sm_node_schedule_state_str(host_state)); - } -} - SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { + // Currently the only supported scenario to recover from failure is + // reboot triggered by mtce. + // So once entering failed state, wait for reboot to reenter the normal state. switch (event) { case SM_FAILOVER_EVENT_IF_STATE_CHANGED: - DPRINTFI("sm_heartbeat_recover event received."); - _audit_failover_state(); - + // event will be fired, but couldn't bring fsm state back to normal break; default: diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp index d0f3cbaa..c719dda0 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp @@ -40,6 +40,7 @@ void SmIFStateChangedEventData::set_interface_state( DPRINTFE("Runtime error: invalid interface type %d", interface_type); } } + SmFailoverInterfaceStateT SmIFStateChangedEventData::get_interface_state(SmInterfaceTypeT interface_type) const { switch (interface_type) @@ -54,7 +55,6 @@ SmFailoverInterfaceStateT SmIFStateChangedEventData::get_interface_state(SmInter DPRINTFE("Runtime error: invalid interface type %d", interface_type); return SM_FAILOVER_INTERFACE_UNKNOWN; } - } SmErrorT SmFSMState::enter_state() diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp index cdef009d..32affc8f 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp @@ -13,6 +13,7 @@ #include "sm_failover_utils.h" #include "sm_failover_fsm.h" #include "sm_failover_ss.h" +#include "sm_cluster_hbs_info_msg.h" SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { @@ -79,6 +80,29 @@ SmErrorT SmFailoverNormalState::exit_state() failover_status.set_peer_pre_failure_schedule_state(peer_state); } + const SmClusterHbsStateT& cluster_hbs_state_cur = SmClusterHbsInfoMsg::get_current_state(); + const SmClusterHbsStateT& cluster_hbs_state_pre = SmClusterHbsInfoMsg::get_previous_state(); + SmClusterHbsStateT pre_failure_cluster_hsb_state; + if(!is_valid(cluster_hbs_state_cur)) + { + DPRINTFE("No cluster hbs state available"); + }else + { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + if(ts.tv_sec - cluster_hbs_state_cur.last_update <= 1 && cluster_hbs_state_pre.last_update != 0) + { + // cluster hbs state changed within past 1 second, take the pre state as pre-failure state. 
diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c
index 8ed1b41f..e1d50e85 100644
--- a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c
+++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c
@@ -5,6 +5,7 @@
 //
 #include "sm_failover_ss.h"
+#include
 #include
 #include "sm_debug.h"
 #include "sm_limits.h"
@@ -15,6 +16,14 @@
 #include "sm_node_api.h"
 #include "sm_failover.h"
 
+// uncomment when debugging this module to enable DPRINTFD output to the log file
+// #define __DEBUG__MSG__
+
+#ifdef __DEBUG__MSG__
+#undef DPRINTFD
+#define DPRINTFD DPRINTFI
+#endif
+
 //
 SmErrorT _get_survivor_dc(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
 
@@ -117,6 +126,26 @@ void SmSystemFailoverStatus::set_host_pre_failure_schedule_state(SmNodeScheduleS
     }
 }
 
+void SmSystemFailoverStatus::set_cluster_hbs_state(const SmClusterHbsStateT& state)
+{
+    if( !is_valid(state) )
+    {
+        DPRINTFE("Runtime error. Invalid cluster hbs state");
+        return;
+    }
+    _cluster_hbs_state = state;
+}
+
+void SmSystemFailoverStatus::set_pre_failure_cluster_hbs_state(const SmClusterHbsStateT& state)
+{
+    if( !is_valid(state) )
+    {
+        DPRINTFE("Runtime error. Invalid cluster hbs state");
+        return;
+    }
+    _pre_failure_cluster_hbs_state = state;
+}
+
 void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state)
 {
     if(_is_valid_schedule_state(state))
@@ -250,6 +279,8 @@ SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char
         sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
     }
 
+    SmSystemFailoverStatus::get_status().set_heartbeat_state(sys_status.heartbeat_state);
+
     sys_status.host_status.node_name = host_name;
     sys_status.host_status.interface_state = sm_failover_if_state_get();
     sys_status.host_status.current_schedule_state = host_state;
is %s", expect_storage_0 ? "enabled":"not enabled"); + int this_controller_index, peer_controller_index; + + char host_name[SM_NODE_NAME_MAX_CHAR]; + SmErrorT error = sm_node_utils_get_hostname(host_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get hostname, error=%s.", + sm_error_str( error ) ); + return SM_FAILED; + } + + if(0 == strncmp(SM_NODE_CONTROLLER_0_NAME, host_name, sizeof(SM_NODE_CONTROLLER_0_NAME))) + { + this_controller_index = 0; + peer_controller_index = 1; + }else + { + this_controller_index = 1; + peer_controller_index = 0; + } + + bool survivor_selected = false; + if(expect_storage_0) + { + if(current_cluster_hbs_state.controllers[this_controller_index].storage0_responding && + !current_cluster_hbs_state.controllers[peer_controller_index].storage0_responding) + { + DPRINTFI("peer cannot reach storage-0. host can. host will be survivor"); + selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); + selection.set_peer_schedule_state(SM_NODE_STATE_FAILED); + survivor_selected = true; + }else if(!current_cluster_hbs_state.controllers[this_controller_index].storage0_responding && + current_cluster_hbs_state.controllers[peer_controller_index].storage0_responding) + { + DPRINTFI("host cannot reach storage-0. peer can. peer will be survivor"); + selection.set_host_schedule_state(SM_NODE_STATE_FAILED); + selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE); + survivor_selected = true; + } + } + + if(!survivor_selected) + { + // so no storage-0 or storage-0 state same on both side + if(current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable > + current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable) + { + DPRINTFI("host reaches %d nodes, peer reaches %d nodes, host will be survivor", + current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable, + current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable + ); + selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); + selection.set_peer_schedule_state(SM_NODE_STATE_FAILED); + survivor_selected = true; + }else if (current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable < + current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable) + { + DPRINTFI("host reaches %d nodes, peer reaches %d nodes, peer will be survivor", + current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable, + current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable + ); + selection.set_host_schedule_state(SM_NODE_STATE_FAILED); + selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE); + survivor_selected = true; + }else + { + if(pre_failure_cluster_hbs_state != current_cluster_hbs_state) + { + if(0 == current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable) + { + // Cannot reach any nodes, I am dead + DPRINTFI("host cannot reach any nodes, peer will be survivor", + current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable, + current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable + ); + selection.set_host_schedule_state(SM_NODE_STATE_FAILED); + selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE); + }else + { + // equaly split, failed the standby + if(SM_NODE_STATE_ACTIVE == system_status.host_status.current_schedule_state) + { + DPRINTFI("host reaches %d nodes, peer reaches %d nodes, host will be survivor", + 
diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.h b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h
index 5a4609cb..9fc2a85c 100644
--- a/service-mgmt/sm-1.0.0/src/sm_failover_ss.h
+++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h
@@ -8,6 +8,7 @@
 #define __SM_FAILOVER_SS_H__
 #include
 #include "sm_types.h"
+#include "sm_cluster_hbs_info_msg.h"
 
 typedef struct
 {
@@ -30,13 +31,13 @@ typedef enum
     SM_HEARTBEAT_INDIRECT,
     //no heartbeat
     SM_HEARTBEAT_LOSS
-}SmHeartbeatStatusT;
+}SmHeartbeatStateT;
 
 typedef struct
 {
     SmNodeStatusT host_status;
     SmNodeStatusT peer_status;
-    SmHeartbeatStatusT heartbeat_state;
+    SmHeartbeatStateT heartbeat_state;
     SmSystemModeT system_mode;
 }SmSystemStatusT;
 
@@ -48,11 +49,30 @@ class SmSystemFailoverStatus
         inline SmNodeScheduleStateT get_host_schedule_state() const {
             return _host_schedule_state;
         }
+
         inline SmNodeScheduleStateT get_host_pre_failure_schedule_state() const {
             return _host_pre_failure_schedule_state;
         }
+
+        inline SmClusterHbsStateT get_cluster_hbs_state() const {
+            return _cluster_hbs_state;
+        }
+
+        inline SmClusterHbsStateT get_pre_failure_cluster_hbs_state() const {
+            return _pre_failure_cluster_hbs_state;
+        }
+
+        inline void set_heartbeat_state(SmHeartbeatStateT heartbeat_state)
+        {
+            _heartbeat_state = heartbeat_state;
+        }
+        inline SmHeartbeatStateT get_heartbeat_state() const {
+            return _heartbeat_state;
+        }
         void set_host_schedule_state(SmNodeScheduleStateT state);
         void set_host_pre_failure_schedule_state(SmNodeScheduleStateT state);
+        void set_cluster_hbs_state(const SmClusterHbsStateT& state);
+        void set_pre_failure_cluster_hbs_state(const SmClusterHbsStateT& state);
         inline SmNodeScheduleStateT get_peer_schedule_state() const {
             return _peer_schedule_state;
         }
@@ -68,8 +88,11 @@ class SmSystemFailoverStatus
         SmSystemFailoverStatus();
         SmNodeScheduleStateT _host_pre_failure_schedule_state;
         SmNodeScheduleStateT _peer_pre_failure_schedule_state;
+        SmClusterHbsStateT _pre_failure_cluster_hbs_state;
         SmNodeScheduleStateT _host_schedule_state;
         SmNodeScheduleStateT _peer_schedule_state;
+        SmClusterHbsStateT _cluster_hbs_state;
+        SmHeartbeatStateT _heartbeat_state;
 
         static const char filename[];
         static const char file_format[];