diff --git a/mtce-common/src/daemon/daemon_common.h b/mtce-common/src/daemon/daemon_common.h
index 9ea16339..3f9ac031 100755
--- a/mtce-common/src/daemon/daemon_common.h
+++ b/mtce-common/src/daemon/daemon_common.h
@@ -245,6 +245,7 @@ typedef struct
     long msecs ;
 } time_delta_type ;
 
+int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta );
 int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
 int gettime   ( time_debug_type & nowtime ) ;
 unsigned long long  gettime_monotonic_nsec ( void );
diff --git a/mtce-common/src/daemon/daemon_debug.cpp b/mtce-common/src/daemon/daemon_debug.cpp
index 4a39ccfa..68e97652 100755
--- a/mtce-common/src/daemon/daemon_debug.cpp
+++ b/mtce-common/src/daemon/daemon_debug.cpp
@@ -26,20 +26,20 @@ unsigned long long  gettime_monotonic_nsec ( void )
 {
     struct timespec ts;
     clock_gettime (CLOCK_MONOTONIC, &ts);
-    return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec; 
+    return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec;
 }
-    
+
 int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta )
-{ 
+{
     /* Subtract before from after */
 
     if ((after.ts.tv_sec < before.ts.tv_sec) ||
         ((after.ts.tv_sec == before.ts.tv_sec) &&
-         (after.ts.tv_nsec <= before.ts.tv_nsec))) 
+         (after.ts.tv_nsec <= before.ts.tv_nsec)))
     {
         delta.secs = delta.msecs = 1 ;
-    } 
-    else 
+    }
+    else
     {
         delta.secs = after.ts.tv_sec - before.ts.tv_sec ;
         if (after.ts.tv_nsec < before.ts.tv_nsec)
@@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t
         {
             delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ;
         }
-        delta.msecs = (delta.msecs/1000);
+        /* convert nsec time to msec time */
+        delta.msecs = (delta.msecs/1000000);
+    }
+    return (PASS) ;
+}
+
+int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta )
+{
+    /* Subtract before from after */
+
+    if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec)))
+    {
+        delta.secs = delta.msecs = 0 ;
+    }
+    else
+    {
+        delta.secs = after.tv_sec - before.tv_sec ;
+        if (after.tv_nsec < before.tv_nsec)
+        {
+            delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ;
+            delta.secs-- ;
+        }
+        else
+        {
+            delta.msecs = after.tv_nsec - before.tv_nsec ;
+        }
+        /* convert nsec time to msec time */
+        delta.msecs = (delta.msecs/1000000);
     }
     return (PASS) ;
 }
diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp
index b596baba..4eec5a29 100644
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -69,6 +69,19 @@ using namespace std;
 
 #define MAX_LEN 1000
 
+/* Stores the MONOTONIC time the last SM heartbeat was received
+ * Heartbeat metrics
+ * SM heartbeat period definition */
+static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ;
+static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ;
+static int sm_heartbeat_count_b2b_misses = 0 ;
+static int sm_heartbeat_count            = 0 ;
+const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100;
+const int SM_HEARTBEAT_PULSE_PERIOD_MSECS  = 800;
+const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2;
+const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER =
+    SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC;
+
 /* Historical String data for mem_logs */
 static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
 static string arrival_histogram[MAX_IFACES]     = { "" , "" } ;
@@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype )
  *
  ****************************************************************************/
 static int _hbs_sm_handler_log_throttle = 0 ;
+static int _hbs_sm_heartbeat_log_throttle = 0 ;
 void hbs_sm_handler ( void )
 {
     #define _MAX_MSG_LEN           (80)
@@ -1438,9 +1452,26 @@ void hbs_sm_handler ( void )
                 if (( service == SUPPERTED_SERVICE ) &&
                     ( request == SUPPORTED_REQUEST ))
                 {
-                    /* success path ... */
-                    hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
-
+                    /* SM heartbeat pulses have a reqid = 0 and do not require a response */
+                    if ( reqid == 0 )
+                    {
+                        time_delta_type delta ;
+                        struct timespec ts = sm_heartbeat_timestamp_last ;
+                        clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
+                        if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0)
+                        {
+                            sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last;
+                        }
+                        timedelta (ts, sm_heartbeat_timestamp_last, delta );
+                        sm_heartbeat_count++ ;
+                        ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)",
+                              sm_heartbeat_count, delta.secs, delta.msecs);
+                    }
+                    else
+                    {
+                        /* success path ... */
+                        hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
+                    }
                     /* reset log throttle */
                    _hbs_sm_handler_log_throttle = 0 ;
                 }
@@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void )
     dlog ("... %s", sm_mesg );
 }
 
+/****************************************************************************
+ *
+ * Name       ; manage_sm_heartbeat
+ *
+ * Purpose    : Determine if we received an SM heartbeat message within
+ *              the last SM_HEARTBEAT_PULSE_PERIOD_MSECS
+ *
+ * Description: Compare the monotonic now time to the monotonic time
+ *              of the last received  SM heartbeat pulse.
+ *
+ * Returns    : True if time dela is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS
+ *              False if time delta is greater
+ *
+ ***************************************************************************/
+bool manage_sm_heartbeat ( void )
+{
+    struct timespec ts ;
+    time_delta_type delta ;
+    clock_gettime (CLOCK_MONOTONIC, &ts );
+    timedelta (sm_heartbeat_timestamp_last, ts, delta );
+    int64_t delta_in_ms = delta.secs * 1000 + delta.msecs;
+    bool heartbeat_ok;
+    if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
+    {
+        sm_heartbeat_count = 0;
+        if (( ++sm_heartbeat_count_b2b_misses < 20 )||
+            (!( sm_heartbeat_count_b2b_misses % 100 )))
+        {
+            wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
+                  delta.secs, delta.msecs,
+                  sm_heartbeat_count_b2b_misses,
+                  sm_heartbeat_count);
+        }
+        heartbeat_ok = false;
+    }
+    else
+    {
+        if(sm_heartbeat_count_b2b_misses)
+        {
+            int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1;
+
+            if(sm_heartbeat_count >= expected_beeps)
+            {
+                if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER)
+                {
+                    ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing",
+                          sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
+                          delta_in_ms,
+                          sm_heartbeat_count_b2b_misses);
+
+                    sm_heartbeat_count_b2b_misses = 0;
+                    sm_heartbeat_count = 0;
+                    sm_heartbeat_timestamp_restart.tv_sec = 0;
+                    sm_heartbeat_timestamp_restart.tv_nsec = 0;
+                    heartbeat_ok = true;
+                }else
+                {
+                    ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing",
+                          sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
+                          delta_in_ms,
+                          sm_heartbeat_count_b2b_misses);
+                    heartbeat_ok = false; // not good enough to declare recovered yet
+                }
+            }else
+            {
+                ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. "
+                     "Counting will restart.",
+                    delta_in_ms, expected_beeps - sm_heartbeat_count);
+                sm_heartbeat_timestamp_restart = ts;
+                sm_heartbeat_count = 1;
+                heartbeat_ok = false; // recover is interrupted by further missing beep
+            }
+        }else
+        {
+            if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2)
+            {
+                ilog("SM Heartbeat missing for %ldms:%dms. Not yet declare stall.",
+                    delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS
+                );
+            }
+            heartbeat_ok = true; // not bad enough to declare heartbeat failed yet
+        }
+    }
+    return heartbeat_ok;
+}
+
 /****************************************************************************
  *
  * Name       : daemon_service_run
@@ -1552,7 +1669,8 @@ void daemon_service_run ( void )
                 hbs_state_audit ();
             }
 
-            /* run the first audit in 30 seconds */
+            /* The first audit was run after 30 seconds but then the
+             * continuous rate is every hour */
             mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 );
         }
 
@@ -1719,6 +1837,9 @@ void daemon_service_run ( void )
                     sockets_init = true ;
                     monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START );
 
+                    /* Update Sm heartbeat time to now time */
+                    clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
+
                     /* no need for the heartbeat audit in a simplex system */
                     if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
                     {
@@ -2363,6 +2484,8 @@ void daemon_service_run ( void )
          */
         else
         {
+            bool heartbeat_ok = manage_sm_heartbeat();
+
             /* manage vault wrt peer controller */
             hbs_cluster_peer();
 
@@ -2396,9 +2519,14 @@ void daemon_service_run ( void )
                 int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
                 if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
                 {
-                    hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
+                    hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok );
                 }
             }
+            /* log cluster throttled */
+            if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
+            {
+                hbs_state_audit ( );
+            }
             hbsTimer.ring = false ;
             heartbeat_request = true ;
             seq_num++ ;
diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h
index 932ed543..bfa8f1d1 100755
--- a/mtce/src/heartbeat/hbsBase.h
+++ b/mtce/src/heartbeat/hbsBase.h
@@ -290,7 +290,8 @@ void hbs_cluster_peer ( void );
  */
 void hbs_cluster_update ( iface_enum iface,
                           unsigned short not_responding_hosts,
-                          bool storage_0_responding );
+                          bool storage_0_responding,
+                          bool sm_heartbeat_ok);
 
 /* Called by the hbsAgent pulse transmitter to append this controllers
  * running cluster view in the next multicast pulse request.
diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp
index 10ac8991..780fa8e3 100644
--- a/mtce/src/heartbeat/hbsCluster.cpp
+++ b/mtce/src/heartbeat/hbsCluster.cpp
@@ -217,7 +217,7 @@ void cluster_list ( void )
 
 void cluster_storage0_state ( bool enabled )
 {
-    if ( ctrl.cluster.storage0_enabled != enabled )
+    if ( (bool)ctrl.cluster.storage0_enabled != enabled )
     {
         ctrl.cluster.storage0_enabled = enabled ;
         ilog ("storage-0 heartbeat state changed to %s",
@@ -305,36 +305,40 @@ void hbs_cluster_add ( string & hostname )
 void hbs_cluster_del ( string & hostname )
 {
     std::list<string>::iterator hostname_ptr ;
+    bool found = false ;
     for ( hostname_ptr  = ctrl.monitored_hostname_list.begin();
           hostname_ptr != ctrl.monitored_hostname_list.end() ;
           hostname_ptr++ )
     {
         if ( hostname_ptr->compare(hostname) == 0 )
         {
-            ctrl.monitored_hostname_list.remove(hostname) ;
-            ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
-
-            /* Manage storage-0 state. */
-            if ( hostname.compare(STORAGE_0) == 0 )
-            {
-                cluster_storage0_state ( false );
-            }
-
-            /* If we get down to 0 monitored hosts then just start fresh */
-            if (( ctrl.monitored_hosts ) == 0 )
-            {
-                hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
-            }
-
-            ilog ("%s deleted from cluster", hostname.c_str());
-
-            cluster_list ();
-
-            hbs_cluster_change ( hostname + " deleted" );
-
+            found = true ;
             break ;
         }
     }
+    if ( found == true )
+    {
+        ctrl.monitored_hostname_list.remove(hostname) ;
+        ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
+
+        /* Manage storage-0 state. */
+        if ( hostname.compare(STORAGE_0) == 0 )
+                cluster_storage0_state ( false );
+
+        /* If we get down to 0 monitored hosts then just start fresh */
+        if (( ctrl.monitored_hosts ) == 0 )
+                hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
+
+        ilog ("%s deleted from cluster", hostname.c_str());
+
+        cluster_list ();
+
+        hbs_cluster_change ( hostname + " deleted" );
+    }
+    else
+    {
+        slog("%s not found in cluster list", hostname.c_str());
+    }
 }
 
 /****************************************************************************
@@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void )
  *               4. Maintain a back to back non-responding count for storage-0.
  *                  Once the count reaches the minimum threshold of
  *                  STORAGE_0_NR_THRESHOLD then the specific network history
- *                  is updated to indicate storgae-0 is not responding. Once
+ *                  is updated to indicate storage-0 is not responding. Once
  *                  storage-0 starts responding again with a single response
  *                  then that network history is updated to indicate storage-0
  *                  is responding.
@@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void )
 
 void hbs_cluster_update ( iface_enum iface,
                       unsigned short not_responding_hosts,
-                                bool storage_0_responding )
+                                bool storage_0_responding,
+                                bool sm_heartbeat_ok)
 {
     if ( ctrl.monitored_hosts == 0 )
         return ;
@@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface,
     else
         return ;
 
+    if ( ctrl.monitored_hosts < not_responding_hosts )
+    {
+        slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
+    }
+
     if ( not_responding_hosts )
     {
         clog ("controller-%d %s enabled:%d not responding:%d",
@@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface,
         }
     }
 
+    /* update sm heartbeat status */
+    if ( sm_heartbeat_ok == true )
+    {
+        history_ptr->sm_heartbeat_fail = false ;
+    }
+    else
+    {
+        history_ptr->sm_heartbeat_fail = true ;
+    }
+
     /* Manage storage-0 status. */
     if ( ctrl.cluster.storage0_enabled )
     {
         /* Handle storage-0 status change from not responding to responding. */
         if ( storage_0_responding == true )
         {
-            if (history_ptr->storage0_responding == false)
+            if ((bool)history_ptr->storage0_responding == false)
             {
                 history_ptr->storage0_responding = true ;
                 ilog ("controller-%d %s heartbeat ; storage-0 is ok",
@@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface,
         }
 
         /* Handle storage-0 status change from responding to not responding. */
-        if (( history_ptr->storage0_responding == true ) &&
+        if (( (bool)history_ptr->storage0_responding == true ) &&
             ( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
         {
             history_ptr->storage0_responding = false ;
@@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface,
     else
     {
         /* Typical path for storage-0 disabled or normal non-storage system case */
-        if ( history_ptr->storage0_responding == true )
+        if ( (bool)history_ptr->storage0_responding == true )
             history_ptr->storage0_responding = false ;
 
         /* Handle clearing threshold count when storage-0 is not enabled. */
@@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface,
     if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
         history_ptr->entries++ ;
 
+    if ( ctrl.monitored_hosts < not_responding_hosts )
+    {
+        slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
+    }
     /* Update the history with this data. */
     history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
     history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp
index ae2a81d3..a9875ed5 100644
--- a/mtce/src/heartbeat/hbsUtil.cpp
+++ b/mtce/src/heartbeat/hbsUtil.cpp
@@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e
     }
     if ( storage0_enabled )
     {
-        syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
+        syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s",
                  history.controller,
                  hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
                  history.storage0_responding ? "y" : "n",
+                 history.sm_heartbeat_fail ? "miss":" ok",
                  str);
     }
     else
     {
-        syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
+        syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s",
                  history.controller,
                  hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
+                 history.sm_heartbeat_fail ? "miss":" ok ",
                  str);
     }
 }
diff --git a/mtce/src/heartbeat/mtceHbsCluster.h b/mtce/src/heartbeat/mtceHbsCluster.h
index 02255226..7690285c 100644
--- a/mtce/src/heartbeat/mtceHbsCluster.h
+++ b/mtce/src/heartbeat/mtceHbsCluster.h
@@ -71,7 +71,10 @@ typedef struct
 {
     unsigned short controller         :4 ; /* value 0 or 1 (and 2 in future) */
     unsigned short network            :4 ; /* see mtce_hbs_network_enum      */
-    unsigned short reserved_bits      :7 ; /* future - initted to 0          */
+    unsigned short reserved_bits      :6 ; /* future - initted to 0          */
+    unsigned short sm_heartbeat_fail  :1 ; /* SM Health
+                                              0 = SM Heartbeat OK,
+                                              1 = SM Heartbeat Failure       */
     unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy    */
     unsigned short entries               ; /* # of valid values in .entry    */
     unsigned short entries_max           ; /* max size of the enry array     */
diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf
index 1b431004..0a2b005c 100644
--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@@ -30,6 +30,11 @@ barbican_port = 9311          ; The Barbican Port Number
 token_refresh_rate = 1200     ; Authentication token refresh rate in seconds.
                               ; A value of zero means no refresh.
                               ; range is 0 or 600-22800
+
+sm_heartbeat_loss_thld = 800  ; number of msecs SM heartbeat fails before
+                              ; declaring an SM process heartbeat loss in
+                              ; the heartbeat cluster.
+
 autorecovery_threshold = 3    ; The number of times maintenance will try to
                               ;  auto recover a critically failed controller
                               ;  while there is no backup controllers to fail