Mtce: Avoid running subfunction FSM for AIO-DX compute only hosts

The AIO-DX system type was enhanced to allow the customer to
provision additional compute hosts.

However, the maintenance system made a global assumption about the
'system type' and would run the enable subfunction FSMs against
these compute only hosts. In doing so the unlock of these added
hosts would timeout and heartbeat would not enable.

This update ensures that the subfunction FSM is only run against
hosts that have both controller and compute functions.

Change-Id: If7711519d3435ef19faa13e7905afae2ce9084bc
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2018-05-08 15:32:28 -05:00
parent 08e66abb1b
commit 2351cd73a5
5 changed files with 121 additions and 101 deletions

View File

@ -2609,7 +2609,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2720,7 +2720,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2736,13 +2736,13 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__LOCK );
}
else if ( !inv.action.compare ("force-lock") && validStates )
@ -2754,8 +2754,8 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2766,58 +2766,58 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
else if ( !inv.action.compare ("reboot") && validStates )
{
ilog ("%s Added with 'reboot' in 'locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__REBOOT );
}
else if ( !inv.action.compare ("reset") && validStates )
{
ilog ("%s Added with 'reset' in 'locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RESET );
}
}
else if ( !inv.action.compare ("power-off") && validStates )
{
ilog ("%s Added in a 'locked' and 'power-off' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = MTC_ADMIN_STATE__LOCKED ;
node_ptr->operState = MTC_OPER_STATE__DISABLED;
node_ptr->availStatus = MTC_AVAIL_STATUS__POWERED_OFF ;
node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus_subf = MTC_AVAIL_STATUS__POWERED_OFF ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__POWEROFF );
}
else if ( !inv.action.compare ("power-on") && validStates )
{
ilog ("%s Added with 'power-on' in 'locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = MTC_ADMIN_STATE__LOCKED ;
@ -2832,9 +2832,9 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
}
else
{
wlog ("%s Need add Action support for '%s' action\n", node_ptr->hostname.c_str(),
wlog ("%s Need add Action support for '%s' action\n", node_ptr->hostname.c_str(),
inv.action.c_str());
print_inv ( inv );
/* Load in maintenance states */
@ -2842,7 +2842,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus_subf = MTC_AVAIL_STATUS__OFFLINE ;
@ -2859,8 +2859,8 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -3678,8 +3678,8 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags )
}
/* Deal with sub-function if combo host */
if ( CPE_SYSTEM )
/* Deal with sub-function if AIO controller host */
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if ( flags & MTC_FLAG__SUBF_GOENABLED )
{
@ -5426,7 +5426,7 @@ int nodeLinkClass::degrade_resource_raise ( string & hostname,
* This utility handles critical process failure event notifications.
* Typically this interface will force a host re-enable through reset.
*
* For CPE Simplex this failure sets the auto recovery bool
* For AIO Simplex this failure sets the auto recovery bool
* so that the main enable FSM can handle it through a thresholded
* self reboot.
*
@ -5768,32 +5768,48 @@ string nodeLinkClass::get_node_subfunction_str ( string hostname )
bool nodeLinkClass::is_controller ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
return is_controller(node_ptr);
if ( node_ptr )
{
return is_controller(node_ptr);
}
return false ;
}
/** Check if a node is a compute */
bool nodeLinkClass::is_compute ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
return is_compute(node_ptr);
if ( node_ptr )
{
return is_compute(node_ptr);
}
return false ;
}
/** Check if a node is a compute */
bool nodeLinkClass::is_compute_subfunction ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
return is_compute_subfunction(node_ptr);
if ( node_ptr )
{
return is_compute_subfunction(node_ptr);
}
return false ;
}
/** Check if a node is a storage */
bool nodeLinkClass::is_storage ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
return is_storage(node_ptr);
if ( node_ptr )
{
return is_storage(node_ptr);
}
return false ;
}
/** Maintenance FSM Test Case Setup procedure */
int nodeLinkClass::set_enableStage ( string & hostname,
int nodeLinkClass::set_enableStage ( string & hostname,
mtc_enableStages_enum stage )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
@ -8306,10 +8322,10 @@ void nodeLinkClass::mem_log_bm ( struct nodeLinkClass::node * node_ptr )
void nodeLinkClass::mem_log_identity ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s (%u)\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s (%u)\n",
node_ptr->hostname.c_str(),
node_ptr->uuid.c_str(),
node_ptr->type.c_str(),
node_ptr->type.c_str(),
node_ptr->nodetype);
mem_log (str);
}
@ -8321,32 +8337,23 @@ void nodeLinkClass::mem_log_state1 ( struct nodeLinkClass::node * node_ptr )
string op = operState_enum_to_str(node_ptr->operState) ;
string av = availStatus_enum_to_str(node_ptr->availStatus);
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s-%s-%s degrade_mask:%08x\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s-%s-%s degrade_mask:%08x\n",
node_ptr->hostname.c_str(),
ad.c_str(),
op.c_str(),
op.c_str(),
av.c_str(),
node_ptr->degrade_mask);
mem_log (str);
op = operState_enum_to_str(node_ptr->operState_subf) ;
av = availStatus_enum_to_str(node_ptr->availStatus_subf);
if ( node_ptr->subfunction_str.empty() )
if ( ! node_ptr->subfunction_str.empty() )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tFunction: %s %s-%s-%s\n",
node_ptr->hostname.c_str(),
node_ptr->function_str.c_str(),
ad.c_str(),
op.c_str(),
av.c_str());
}
else
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tFunctions %s-%s %s-%s-%s\n",
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Functions: %s-%s %s-%s-%s\n",
node_ptr->hostname.c_str(),
node_ptr->function_str.c_str(),
node_ptr->subfunction_str.c_str(),
ad.c_str(),
op.c_str(),
op.c_str(),
av.c_str());
}
mem_log (str);
@ -8357,10 +8364,10 @@ void nodeLinkClass::mem_log_state2 ( struct nodeLinkClass::node * node_ptr )
char str[MAX_MEM_LOG_DATA] ;
string aa = adminAction_enum_to_str(node_ptr->adminAction) ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAction:%s invAction:%s Task:%s\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAction:%s invAction:%s Task:%s\n",
node_ptr->hostname.c_str(),
aa.c_str(),
node_ptr->action.c_str(),
node_ptr->action.c_str(),
node_ptr->task.c_str());
mem_log (str);
}
@ -8518,7 +8525,7 @@ void nodeLinkClass::mem_log_type_info ( struct nodeLinkClass::node * node_ptr )
node_ptr->function);
mem_log (str);
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Function: %s (%u) (SubFunc Enabled:%c)\n",
node_ptr->hostname.c_str(),

View File

@ -1007,12 +1007,12 @@ int daemon_init ( string iface, string nodeType_str )
if ( is_combo_system ( my_nodetype ) == true )
{
daemon_wait_for_file ( GOENABLED_SUBF_PASS , 0);
ilog ("GOENABLE (small system)\n");
ilog ("GOENABLE (AIO Host)\n");
}
else
{
daemon_wait_for_file ( GOENABLED_MAIN_PASS , 0);
ilog ("GOENABLE (large system)\n");
ilog ("GOENABLE\n");
}
/* Configure the client */

View File

@ -940,7 +940,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
else
avail = " " ;
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if ( ! oper_subf.empty() )
{
@ -982,7 +982,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
this->sysinvEvent.payload.erase(len-1,1);
this->sysinvEvent.payload.append ( "]");
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
ilog ("%s %s-%s-%s %s-%s\n",
node_ptr->hostname.c_str(),

View File

@ -1629,29 +1629,43 @@ int run_hostservices_scripts ( unsigned int cmd )
* one for controller and 1 for compute */
if ( ctrl.system_type != SYSTEM_TYPE__NORMAL )
{
string dir = "" ;
if ( action == "stop" )
{
std::list<string> more_scripts ;
if ( cmd == MTC_CMD_STOP_COMPUTE_SVCS )
{
dir = SERVICES_DIR ;
dir.append("/controller");
/* only add the controller if we get a compute stop
* and this host has a controller nodetype function */
if (ctrl.nodetype & CONTROLLER_TYPE)
{
dir = SERVICES_DIR ;
dir.append("/controller");
}
}
else
else if ( cmd == MTC_CMD_STOP_CONTROL_SVCS )
{
dir = SERVICES_DIR ;
dir.append("/compute");
/* add the compute stop if we get a controller stop
* and this host has a compute nodetype function */
if (ctrl.nodetype & COMPUTE_TYPE)
{
dir = SERVICES_DIR ;
dir.append("/compute");
}
}
if ( load_filenames_in_dir ( dir.data(), more_scripts ) != PASS )
if ( ! dir.empty() )
{
ctrl.active_script_set = NO_SCRIPTS ;
return (FAIL_READ_FILES) ;
}
if ( load_filenames_in_dir ( dir.data(), more_scripts ) != PASS )
{
ctrl.active_script_set = NO_SCRIPTS ;
return (FAIL_READ_FILES) ;
}
if ( ! more_scripts.empty() )
{
scripts.merge(more_scripts);
if ( ! more_scripts.empty() )
{
scripts.merge(more_scripts);
}
}
}
}

View File

@ -600,7 +600,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
this->autorecovery_disabled = true ;
}
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Raise Critical Compute Function Alarm */
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL );
@ -613,7 +613,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->graceful_recovery_counter = 0 ;
node_ptr->health_threshold_counter = 0 ;
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->inservice_failed_subf = true ;
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
@ -1262,8 +1262,16 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else /* success path */
{
/* heartbeat is only started now in a normal system. */
if (( LARGE_SYSTEM ) && ( NOT_THIS_HOST ))
/* Don't start the self heartbeat for the active controller.
* Also, in AIO , hosts that have a controller function also
* have a compute function and the heartbeat for those hosts
* are started at the end of the subfunction handler. */
if (( THIS_HOST ) ||
(( CPE_SYSTEM ) && ( is_controller(node_ptr)) ))
{
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
else
{
/* allow the fsm to wait for up to 1 minute for the
* hbsClient's ready event before starting heartbeat
@ -1271,12 +1279,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT );
}
else
{
/* The heartbeat soak will be started in the enable
* subfunction handler in the combined system. */
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
}
break ;
}
@ -1446,7 +1448,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
enableStageChange ( node_ptr, MTC_ENABLE__START );
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr)))
{
ilog ("%s running compute sub-function enable handler\n", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF );
@ -1749,7 +1751,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
@ -2140,7 +2142,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
/* The active controller would never get/be here but
* if it did then just fall through to change state. */
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Here we need to run the sub-fnction goenable and start
* host services if this is the other controller in a AIO
@ -2368,7 +2370,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_RECOVERY__STATE_CHANGE:
{
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Set node as unlocked-enabled */
subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
@ -2646,7 +2648,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
MTC_OPER_STATE__DISABLED,
locked_status );
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
locked_status );
@ -3290,7 +3292,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
/* otherwise change state */
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL,"offline" );
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL_SUBF,"offline" );
}
@ -3331,7 +3333,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str());
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL, "online" );
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL_SUBF, "online" );
}
@ -5318,7 +5320,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->uuid.length() ? node_ptr->uuid.c_str() : "" );
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if ( daemon_is_file_present ( CONFIG_COMPLETE_COMPUTE ) == false )
{
@ -5578,12 +5580,17 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
if (( NOT_THIS_HOST ) && ( LARGE_SYSTEM ))
/* start the heartbeat service in all cases except for
* THIS host and CPE controller hosts */
if ( NOT_THIS_HOST )
{
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
if (( LARGE_SYSTEM ) ||
(( CPE_SYSTEM ) && ( is_controller(node_ptr) == false )))
{
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
}
}
/* Only run hardware monitor if the bm ip is provisioned */
if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )))
@ -5627,19 +5634,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
/* Special Add handling for the AIO system */
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
/* If AIO and unlocked-enabled and not this host then
* start the heartbeat service
* (inactive controller case) */
if ( NOT_THIS_HOST )
{
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
/* In AIO if in DOR mode and the host is unlocked enabled
* we need to run the subfunction handler and request
* to start host services. */
@ -6353,7 +6352,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
* and we have only a single enabled controller (which must be this one)
* and the alarm is not already raised.
**/
if ( CPE_SYSTEM )
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&