Add support for peer controller reset via mtcClient

This update adds the ability for SM to passively
request the mtcClient to BMC reset its peer controller
as a means to recover a severely loaded active controller.

To do this, the mtcAgent is modified to keep the controllers'
mtcClients updated with the BMC info of their peers.

The mtcClient is modified to audit for the SM signal and,
when it is asserted, to issue a BMC reset of its peer
controller using an ipmitool system call.

The ability to command the peer mtcClient to 'sync'
prior to the BMC reset is implemented but is configured
as disabled for now.

Change-Id: Ibe4c8aaa3a980cbe5f34c3e22f015698a6453c1a
Partial-Bug: #1895350
Co-Authored-By: Bin.Qian@windriver.com
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-09-14 16:42:54 -04:00
parent 484d662cb7
commit 9ab726b0eb
21 changed files with 674 additions and 69 deletions

View File

@ -274,9 +274,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
*
*************************************************************************/
string bmcUtil_create_data_fn ( string & hostname,
string file_suffix,
bmc_protocol_enum protocol )
string bmcUtil_create_data_fn ( const string & hostname,
string file_suffix,
bmc_protocol_enum protocol )
{
/* create the output filename */
string datafile ;

View File

@ -82,6 +82,14 @@ typedef struct
} bmc_info_type ;
/* Access information for one host and its BMC.
 *
 * Bundles the host identity with the address and credentials that are
 * handed to the ipmitool request builder when a BMC reset is issued.
 * NOTE: member order is significant to any positional (brace) initializer
 * of this type, so new members should be appended rather than inserted. */
typedef struct
{
string hostname; /* name of the host this record describes */
string host_ip ; /* address of the host itself (not its BMC) */
string bm_ip ; /* address of the host's BMC */
string bm_un ; /* BMC username */
string bm_pw ; /* BMC password */
} bmcUtil_accessInfo_type ;
/* BMC commands */
typedef enum
@ -107,6 +115,7 @@ typedef enum
#define BMC_QUERY_FILE_SUFFIX ((const char *)("_root_query"))
#define BMC_INFO_FILE_SUFFIX ((const char *)("_bmc_info"))
#define BMC_POWER_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result"))
#define BMC_RESET_CMD_FILE_SUFFIX ((const char *)("_reset"))
#define BMC_BOOTDEV_CMD_FILE_SUFFIX ((const char *)("_bootdev"))
#define BMC_RESTART_CAUSE_FILE_SUFFIX ((const char *)("_restart_cause"))
#define BMC_POWER_STATUS_FILE_SUFFIX ((const char *)("_power_status"))
@ -137,9 +146,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
bmc_protocol_enum protocol );
/* create the output filename */
string bmcUtil_create_data_fn ( string & hostname,
string file_suffix,
bmc_protocol_enum protocol );
string bmcUtil_create_data_fn ( const string & hostname,
string file_suffix,
bmc_protocol_enum protocol );
/* Get power state from query response data. */
int bmcUtil_is_power_on ( string hostname,

View File

@ -130,6 +130,14 @@ bool hostUtil_is_valid_username ( string un )
return (false);
}
/* Report whether a password looks provisioned.
 *
 * Returns true only when 'pw' is non-empty and differs from the
 * NONE string ; false otherwise. */
bool hostUtil_is_valid_pw ( string pw )
{
    const bool provisioned = ( pw.empty() == false ) &&
                             ( pw.compare(NONE) != 0 ) ;
    return ( provisioned );
}
bool hostUtil_is_valid_mac_addr ( string mac )
{
if ( !mac.empty() )

View File

@ -46,6 +46,7 @@ string hostUtil_getPrefixPath ( void );
bool hostUtil_is_valid_uuid ( string uuid );
bool hostUtil_is_valid_ip_addr ( string ip );
bool hostUtil_is_valid_username ( string un );
bool hostUtil_is_valid_pw ( string pw );
bool hostUtil_is_valid_bm_type ( string bm_type );
int hostUtil_mktmpfile ( string hostname, string basename, string & filename, string data );

View File

@ -202,3 +202,66 @@ int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_ty
ipmiUtil_bmc_info_log ( hostname, bmc_info, rc );
return (rc);
}
/*****************************************************************************
 *
 * Name       : ipmiUtil_reset_host_now
 *
 * Purpose    : Issue a blocking ipmitool power reset request to the BMC
 *              described by 'accessInfo'.
 *
 * Description: Makes sure the bmc/ipmitool output directories exist, writes
 *              the BMC password to a temporary password file, builds the
 *              ipmitool reset request and runs it with system(). The
 *              temporary password file is closed and removed before return.
 *
 * Parameters : hostname        - name used to tag the latency warning log
 *              accessInfo      - BMC ip, username and password to use
 *              output_filename - file the request output is directed to
 *
 * Returns    : 0 when system() returned 0, otherwise FAIL_SYSTEM_CALL.
 *
 *****************************************************************************/
int ipmiUtil_reset_host_now ( string hostname,
                              bmcUtil_accessInfo_type accessInfo,
                              string output_filename)
{
    dlog("%s %s BMC [IP:%s UN:%s]",
          accessInfo.hostname.c_str(),
          accessInfo.host_ip.c_str(),
          accessInfo.bm_ip.c_str(),
          accessInfo.bm_un.c_str());

    /* the request output and password files live under these directories */
    if (daemon_is_file_present ( BMC_OUTPUT_DIR ) == false )
        daemon_make_dir(BMC_OUTPUT_DIR) ;

    if (daemon_is_file_present ( IPMITOOL_OUTPUT_DIR ) == false )
        daemon_make_dir(IPMITOOL_OUTPUT_DIR) ;

    /* create temp password file */
    thread_info_type info ;
    info.hostname = accessInfo.hostname ;
    info.password_file = "" ;
    info.pw_file_fd = 0 ;

    /* Use common utility to create a temp pw file */
    bmcUtil_create_pw_file ( &info, accessInfo.bm_pw, BMC_PROTOCOL__IPMITOOL );

    /* create request */
    string request =
    ipmiUtil_create_request ( IPMITOOL_POWER_RESET_CMD,
                              accessInfo.bm_ip,
                              accessInfo.bm_un,
                              info.password_file,
                              output_filename );

    /* issue request
     *
     * Note: Could launch a thread to avoid any stall.
     *       However, mtcClient can withstand up to a 25 second stall
     *       before pmon will fail it due to active monitoring.
     *       UT showed that there is no stall at all. */
    unsigned long long latency_threshold_secs = DEFAULT_SYSTEM_REQUEST_LATENCY_SECS ;
    unsigned long long before_time = gettime_monotonic_nsec () ;
    int rc = system ( request.data()) ;
    unsigned long long after_time = gettime_monotonic_nsec () ;
    unsigned long long delta_time = after_time-before_time ;
    if ( rc )
    {
        /* NOTE(review): errno is only meaningful here when system()
         * itself failed ; a non-zero command exit status leaves it stale */
        wlog("system call failed ; rc:%d [%d:%s]", rc, errno, strerror(errno) );
        rc = FAIL_SYSTEM_CALL ;
    }
    if ( delta_time > (latency_threshold_secs*1000000000))
    {
        /* Print whole seconds and the zero padded nanosecond remainder so
         * that, for example, 5.05 secs is not reported as 5.5 secs. */
        wlog ("%s bmc system call took %llu.%09llu sec", hostname.c_str(),
                  (unsigned long long)(delta_time/NSEC_TO_SEC),
                  (unsigned long long)(delta_time%NSEC_TO_SEC));
    }

    /* Cleanup ; the password must not be left behind on disk */
    if ( info.pw_file_fd > 0 )
        close(info.pw_file_fd);
    daemon_remove_file ( info.password_file.data());

    return (rc);
}

View File

@ -57,6 +57,8 @@ int ipmiUtil_init ( void );
int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_type & mc_info );
int ipmiUtil_reset_host_now ( string hostname, bmcUtil_accessInfo_type accessInfo, string output_filename );
/* Create the ipmi request */
string ipmiUtil_create_request ( string cmd, string & ip, string & un, string & pw, string & out );

View File

@ -149,6 +149,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
case MTC_REQ_MTCALIVE: return ("mtcAlive req");
case MTC_MSG_LOCKED: return ("locked msg");
case MTC_CMD_LAZY_REBOOT: return ("lazy reboot");
case MTC_MSG_INFO: return ("info msg");
case MTC_CMD_SYNC: return ("sync");
/* goenabled commands and messages */
case MTC_MSG_MAIN_GOENABLED: return ("goEnabled main msg");
@ -199,7 +201,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
case MTC_EVENT_PMON_MAJOR: return("pmon major event");
case MTC_EVENT_PMON_MINOR: return("pmon minor event");
case MTC_EVENT_PMON_LOG: return("pmon log");
case MTC_EVENT_PMOND_RAISE: return("pmon raise");
case MTC_EVENT_PMOND_RAISE: return("pmond raise");
case MTC_EVENT_PMOND_CLEAR: return("pmond clear");
/* data port events */
case MTC_EVENT_AVS_CLEAR: return("AVS clear");

View File

@ -751,7 +751,9 @@ typedef struct
#define MTC_CMD_START_STORAGE_SVCS 19 /* to host */
#define MTC_CMD_LAZY_REBOOT 20 /* to host */
#define MTC_CMD_HOST_SVCS_RESULT 21 /* to host */
#define MTC_CMD_LAST 22
#define MTC_MSG_INFO 22 /* to host */
#define MTC_CMD_SYNC 23 /* to host */
#define MTC_CMD_LAST 24
#define RESET_PROG_MAX_REBOOTS_B4_RESET (5)
#define RESET_PROG_MAX_REBOOTS_B4_RETRY (RESET_PROG_MAX_REBOOTS_B4_RESET+2)
@ -1263,6 +1265,14 @@ typedef enum
MTC_AR_DISABLE_CAUSE__NONE,
} autorecovery_disable_cause_enum ;
/* Code that identifies a specific group of maintenance information,
 * typically the information needed by one specific feature.
 * The code selects which dictionary build_mtcInfo_dict produces. */
typedef enum
{
MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO, /* mgmt ip and BMC access info of the controllers */
MTC_INFO_CODE__LAST /* presumably an end-of-list marker ; not a real info code - TODO confirm */
} mtcInfo_enum ;
/* Service Based Auto Recovery Control Structure */
typedef struct
{

View File

@ -3295,6 +3295,102 @@ void nodeLinkClass::mtcInfo_log ( struct nodeLinkClass::node * node_ptr )
}
}
/* Escape a value so it can be embedded inside a double quoted json string.
 * '"' and '\' are backslash escaped and control characters (< 0x20) are
 * emitted as \u00XX ; everything else is copied through unchanged. */
static std::string mtcInfo_json_escape ( const std::string & value )
{
    static const char hex_digits[] = "0123456789abcdef" ;

    std::string escaped ;
    escaped.reserve ( value.length() );
    for ( std::string::size_type i = 0 ; i < value.length() ; ++i )
    {
        const unsigned char ch = (unsigned char)value[i] ;
        if (( ch == '"' ) || ( ch == '\\' ))
        {
            escaped.push_back('\\');
            escaped.push_back((char)ch);
        }
        else if ( ch < 0x20 )
        {
            escaped.append("\\u00");
            escaped.push_back(hex_digits[ch >> 4]);
            escaped.push_back(hex_digits[ch & 0x0f]);
        }
        else
        {
            escaped.push_back((char)ch);
        }
    }
    return escaped ;
}

/***************************************************************************
 *
 * Name       : build_mtcInfo_dict
 *
 * Purpose    : Build a json dictionary for the specified info code enum
 *
 * Assumptions: Only MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO is supported.
 *              At most two controller entries are added.
 *
 * Parameters : mtcInfo_code - selects the group of info to build
 *
 * Returns    : An empty string when there are no hosts in inventory ;
 *              otherwise a json dictionary of mtcInfo, which is "{}" when
 *              no host matched the requested code.
 *
 *              All values are json string escaped so that credentials
 *              containing '"' or '\' still produce a parsable dictionary.
 *
 *    {
 *       "controller-0":{
 *          "mgmt_ip":"192.168.204.2",
 *          "bm_ip":"xxx.xxx.xx.23",
 *          "bm_un":"root",
 *          "bm_pw":"root"
 *       },
 *       "controller-1":{
 *          "mgmt_ip":"192.168.204.3",
 *          "bm_ip":"xxx.xxx.xx.24",
 *          "bm_un":"root",
 *          "bm_pw":"root"
 *       }
 *    }
 *
 **************************************************************************/
string nodeLinkClass::build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code )
{
    string mtcInfo_dict = "" ;

    /* number of controller entries added so far ; loop/exit control */
    int entries = 0 ;

    /* should never happen but better to be safe */
    if ( head == NULL )
        return mtcInfo_dict ;

    /* force the update to be a dictionary */
    mtcInfo_dict = "{" ;

    for ( struct node * ptr = head ;  ; ptr = ptr->next )
    {
        if (( ptr->nodetype & CONTROLLER_TYPE ) &&
            ( mtcInfo_code == MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ))
        {
            /* comma separate the second entry from the first */
            if ( entries )
                mtcInfo_dict.append(",");

            mtcInfo_dict.append("\"" + mtcInfo_json_escape(ptr->hostname) + "\":{");
            mtcInfo_dict.append("\"mgmt_ip\":\"" + mtcInfo_json_escape(ptr->ip) + "\",");
            mtcInfo_dict.append("\"bm_ip\":\"" + mtcInfo_json_escape(ptr->bm_ip) + "\",");
            mtcInfo_dict.append("\"bm_un\":\"" + mtcInfo_json_escape(ptr->bm_un) + "\",");
            mtcInfo_dict.append("\"bm_pw\":\"" + mtcInfo_json_escape(ptr->bm_pw) + "\"}");

            /* no more than two controllers are reported */
            if ( ++entries >= 2 )
                break ;
        }
        /* stop at the end of the host list */
        if (( ptr->next == NULL ) || ( ptr == tail ))
            break ;
    }
    mtcInfo_dict.append("}");
    return mtcInfo_dict ;
}
/**************************************************************************
*
* Name : mtcInfo_handler
*
* Purpose : Send mtcInfo update to provisioned controllers when
* the push flag is set.
*
**************************************************************************/
void nodeLinkClass::mtcInfo_handler ( void )
{
/* This is set in the bm_handler once access to the BMC using
* provisioned credentials have been verified. */
if ( this->want_mtcInfo_push )
{
/* handler will enhance when more codes are introduced */
mtcInfo_enum mtcInfo_code = MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ;
string mtcInfo_dict = build_mtcInfo_dict(mtcInfo_code);
if ( ! mtcInfo_dict.empty() )
{
string temp = CONTROLLER_0 ;
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
if ( this->controllers > 1 )
{
temp = CONTROLLER_1;
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
}
}
this->want_mtcInfo_push = false ;
}
}
/* Lock Rules
*
* 1. Cannot lock this controller
@ -4422,6 +4518,18 @@ string nodeLinkClass::get_bm_ip ( string hostname )
return ("");
}
/* Return the BMC password stored for the named host.
 * An error is logged and an empty string is returned when
 * the host cannot be found. */
string nodeLinkClass::get_bm_pw ( string hostname )
{
    nodeLinkClass::node * host_ptr = nodeLinkClass::getNode ( hostname );
    if ( host_ptr == NULL )
    {
        elog ("%s bm pw lookup failed\n", hostname.c_str() );
        return ("");
    }
    return (host_ptr->bm_pw);
}
string nodeLinkClass::get_bm_un ( string hostname )
{
nodeLinkClass::node* node_ptr ;

View File

@ -828,10 +828,13 @@ private:
int oos_test_handler ( struct nodeLinkClass::node * node_ptr );
int insv_test_handler ( struct nodeLinkClass::node * node_ptr );
int stress_handler ( struct nodeLinkClass::node * node_ptr );
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
int uptime_handler ( void );
void mtcInfo_handler ( void );
int host_services_handler ( struct nodeLinkClass::node * node_ptr );
/* Starts the specified 'reset or powercycle' recovery monitor */
@ -851,13 +854,22 @@ private:
void ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, bool gate_state );
void set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int interface );
/********* mtcInfo in the database ************/
int mtcInfo_set ( struct nodeLinkClass::node * node_ptr, string key, string value );
string mtcInfo_get ( struct nodeLinkClass::node * node_ptr, string key );
void mtcInfo_clr ( struct nodeLinkClass::node * node_ptr, string key );
void mtcInfo_log ( struct nodeLinkClass::node * node_ptr );
int set_mtcInfo ( struct nodeLinkClass::node * node_ptr, string & mtc_info );
/********* mtcInfo that gets pushed out to daemons ***********/
/* flag telling mtce when a mtcInfo push needs to be done */
bool want_mtcInfo_push = false ;
/* performs the mtcInfo push */
void push_mtcInfo ( void );
/*****************************************************************************
*
* Name : bmc_command_send
@ -1192,11 +1204,11 @@ private:
* Set to true when the autorecovery threshold is reached
* and we want to avoid taking further autorecovery action
* even though it may be requested. */
bool autorecovery_disabled ;
bool autorecovery_disabled = false ;
/* Set to true by fault detection methods that are
* autorecoverable when in simplex mode. */
bool autorecovery_enabled ;
bool autorecovery_enabled = false ;
/** Tracks the number of hosts that 'are currently' in service trouble
* wrt heartbeat (above minor threshold).
@ -1464,11 +1476,14 @@ public:
/***********************************************************/
/** Number of provisioned controllers */
int controllers = 0 ;
/** Number of provisioned hosts (nodes) */
int hosts ;
int hosts = 0 ;
/* Set to True while waiting for UNLOCK_READY_FILE in simplex mode */
bool unlock_ready_wait ;
bool unlock_ready_wait = false ;
/** Host has been deleted */
bool host_deleted ;
@ -1517,6 +1532,9 @@ public:
/** Return the number of inventoried hosts */
int num_hosts ( void );
/** Return the number of inventoried controllers */
int num_controllers ( void );
/** **********************************************************************
*
* Name : nodeLinkClass::workQueue_enqueue
@ -1664,6 +1682,9 @@ public:
/* Clear heartbeat failed flag for all interfaces */
void manage_heartbeat_clear ( string hostname, iface_enum iface );
/* Build a json dictionary containing the maintenance info for the specified code */
string build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code );
/** Test and Debug Members and Variables */
/** Print node info banner */
@ -1789,6 +1810,7 @@ public:
string get_bm_ip ( string hostname );
string get_bm_un ( string hostname );
string get_bm_pw ( string hostname );
string get_bm_type ( string hostname );
string get_hostname_from_bm_ip ( string bm_ip );

View File

@ -13,7 +13,7 @@ LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -l
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../maintenance -I../public
CCFLAGS = -g -O2 -Wall -Wextra -Werror
CCFLAGS = -g -O2 -Wall -Wextra -Werror -std=c++11
STATIC_ANALYSIS_TOOL = cppcheck
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)

View File

@ -279,8 +279,14 @@ void nodeLinkClass::mnfa_enter ( void )
void nodeLinkClass::mnfa_exit ( bool force )
{ force = force ; }
int send_mtc_cmd ( string & hostname, int cmd, int interface )
{ UNUSED(hostname); UNUSED(cmd); UNUSED(interface); return PASS ; }
/* No-op stub of send_mtc_cmd.
 * Accepts the same arguments as the real sender, including the optional
 * json dictionary payload, ignores all of them and always returns PASS. */
int send_mtc_cmd ( string & hostname, int cmd, int interface, string json_dict)
{
UNUSED(hostname);
UNUSED(cmd);
UNUSED(interface);
UNUSED(json_dict);
return PASS ;
}
int nodeLinkClass::mtcInvApi_subf_states ( string hostname,
string oper_subf,

View File

@ -54,7 +54,7 @@ BINS = mtcAgent mtcClient
LDLIBS += -lstdc++ -ldaemon -lcommon -lthreadUtil -lbmcUtils -lfmcommon -lalarm -lpthread -lrt -levent -ljson-c -lamon -lcrypto -luuid
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../heartbeat -I../hwmon -I../public
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces -std=c++11
STATIC_ANALYSIS_TOOL = cppcheck
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)

View File

@ -20,7 +20,7 @@
#include <stdio.h>
#include <string.h>
#include <sys/un.h> /* for ... unix domain sockets */
#include <sys/un.h> /* for ... unix domain sockets */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <net/if.h>
@ -29,8 +29,8 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <list> /* for the list of conf file names */
#include <list> /* for ... list of conf file names */
#include <unistd.h> /* for ... sync */
using namespace std;
@ -204,6 +204,24 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
mlog1 ("mtcAlive request received (%s network)\n", interface_name.c_str());
return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface ));
}
else if ( msg.cmd == MTC_MSG_INFO )
{
mlog1("mtc 'info' message received (%s network)\n", interface_name.c_str());
load_mtcInfo_msg ( msg );
return ( PASS ); /* no ack for this message */
}
else if ( msg.cmd == MTC_CMD_SYNC )
{
ilog ("mtc '%s' message received (%s network)\n",
get_mtcNodeCommand_str(msg.cmd),
interface_name.c_str());
ilog ("Sync Start");
sync ();
ilog ("Sync Done");
return ( PASS ); /* no ack for this message */
}
else if ( msg.cmd == MTC_MSG_LOCKED )
{
/* Only recreate the file if its not already present */
@ -603,7 +621,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
}
/** Send an event to the mtcAgent **/
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr )
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr )
{
mtc_message_type event ;
@ -619,6 +637,24 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
/* We don't use the buffer for mtce events to remove it from the size */
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));
}
else if ( cmd == MTC_EVENT_MONITOR_READY )
{
string event_info = "{\"" ;
event_info.append(MTC_JSON_INV_NAME);
event_info.append("\":\"");
event_info.append(get_hostname());
event_info.append("\",\"");
event_info.append(MTC_JSON_SERVICE);
event_info.append("\":\"");
event_info.append(MTC_SERVICE_MTCCLIENT_NAME );
event_info.append("\"}");
size_t len = event_info.length()+1 ;
snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header());
snprintf ( &event.buf[0], len, "%s", event_info.data());
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME);
}
else if (( cmd == MTC_EVENT_AVS_CLEAR ) ||
( cmd == MTC_EVENT_AVS_MAJOR ) ||
( cmd == MTC_EVENT_AVS_CRITICAL ))
@ -666,7 +702,7 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
{
if ( bytes == 0 )
{
slog ("message send failed ; message size=0 for cmd:%d is 0\n", event.cmd );
slog ("message send failed ; message size=0 for cmd:0x%x is 0\n", event.cmd );
rc = FAIL_NO_DATA ;
}
else if ((rc = sock_ptr->mtc_client_tx_socket->write((char*)&event.hdr[0], bytes))!= bytes )
@ -933,32 +969,59 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
return (PASS) ;
}
/* Accelerated Virtual Switch 'events' socket
* - for receiving data port state change event
* Event strings are
*
* {"type":"port-state", "severity":"critical|major|clear"}
*
* type:port-state - the provider network data port status has changed to the supplied fault severity
*
* severity:
* critical - port has failed and is not part of an aggregate or is the last port in an aggregate (degrade, disable services)
* major - port has failed and is part of an aggregate with other inservice-ports (degrade only)
* clear - port has recovered from a failed state and is operational (clear degrade, enable services)
*
* NOTE: The port status can transition from any of the above states to any other state.
*
* The neutron agent monitors the vswitch ports at a 2 second interval.
* If a port changes link state during the polling period, it will
* raise/clear the alarm, but now also calculates the impact of that port
* failure on the provider network data interface.
*
* The overall aggregated state across all provider network interfaces will
* be reported to maintenance when ports enter a link down or up state.
* The agent will also periodically send the current provider network port
* status to maintenance every 30 seconds.
*
*/
/* Send a maintenance command from this mtcClient to another mtcClient.
 *
 * Only MTC_CMD_SYNC is supported ; it carries no buffer data so only the
 * fixed part of the message is transmitted.
 *
 * Parameters:
 *   sock_ptr - socket struct holding the client transmit socket
 *   cmd      - command to send
 *   hostname - name of the target host ; used for logging only
 *   address  - address the message is written to
 *   port     - port the message is written to
 *
 * Returns: PASS on success,
 *          FAIL_BAD_CASE for an unsupported command,
 *          FAIL_BAD_STATE when the transmit socket is missing or not ok,
 *          FAIL_SOCKET_SENDTO when the write did not send anything. */
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port)
{
    mtc_message_type msg ;

    MEMSET_ZERO (msg);
    snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header());
    msg.cmd = cmd ;

    /* reject anything other than the one supported command */
    if ( cmd != MTC_CMD_SYNC )
    {
        slog("Unsupported command ; %s:%d", get_mtcNodeCommand_str(cmd), cmd );
        return (FAIL_BAD_CASE);
    }

    ilog ("Sending '%s' command to %s:%s:%d",
           get_mtcNodeCommand_str(cmd),
           hostname.c_str(),
           address.c_str(), port);
    msg.num = 0 ;

    /* buffer not used in this message */
    const int msg_len = ((sizeof(mtc_message_type))-(BUF_SIZE)) ;

    /* the transmit socket must exist and be healthy */
    if (( !sock_ptr->mtc_client_tx_socket ) ||
        ( sock_ptr->mtc_client_tx_socket->sock_ok() != true ))
    {
        elog("mtc_client_tx_socket not ok");
        return (FAIL_BAD_STATE);
    }

    /* Send to the specified address and port */
    print_mtc_message ( hostname, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), false );
    int sent = sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], msg_len, address.data(), port ) ;
    if ( sent <= 0 )
    {
        elog("failed to send command to mtcClient (%d) (%d:%s)", sent, errno, strerror(errno));
        return (FAIL_SOCKET_SENDTO);
    }
    return (PASS);
}
int mtcCompMsg_testhead ( void )
{

View File

@ -443,6 +443,34 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT );
return (PASS);
}
else if ( service == MTC_SERVICE_MTCCLIENT_NAME )
{
ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
/* if this ready event is from the mtcClient of a
* controller that has valid bmc access info then
* build the 'peer controller kill' mtcInfo and
* send it to that mtcClient */
if ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE )
{
string bm_pw = obj_ptr->get_bm_pw ( hostname ) ;
if ( !bm_pw.empty() && ( bm_pw != NONE ))
{
string bm_un = obj_ptr->get_bm_un ( hostname ) ;
string bm_ip = obj_ptr->get_bm_ip ( hostname ) ;
if (( hostUtil_is_valid_username ( bm_un )) &&
( hostUtil_is_valid_ip_addr ( bm_ip )))
{
send_mtc_cmd ( hostname,
MTC_MSG_INFO,
MGMNT_INTERFACE,
obj_ptr->build_mtcInfo_dict (
MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO));
}
}
}
return (PASS);
}
if ( service == MTC_SERVICE_HWMOND_NAME )
{
std::list<string>::iterator temp ;
@ -578,11 +606,12 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
return (rc);
}
int send_mtc_cmd ( string & hostname, int cmd , int interface )
int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict )
{
int rc = FAIL ;
bool force = false ;
mtc_message_type mtc_cmd ;
string data = "" ;
mtc_socket_type * sock_ptr = get_sockPtr ();
memset (&mtc_cmd,0,sizeof(mtc_message_type));
@ -592,6 +621,16 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
switch ( cmd )
{
case MTC_MSG_INFO:
{
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
mtc_cmd.cmd = cmd ;
mtc_cmd.num = 0 ;
data = "{\"mtcInfo\":" + json_dict + "}";
ilog("%s mtc info update", hostname.c_str());
rc = PASS ;
break ;
}
case MTC_REQ_MTCALIVE:
{
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
@ -689,11 +728,20 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
* Note: the minus 1 is to overwrite the null */
snprintf ( &mtc_cmd.hdr[MSG_HEADER_SIZE-1], MSG_HEADER_SIZE, "%s", obj_ptr->get_hostIfaceMac(hostname, MGMNT_IFACE).data());
string data = "{\"address\":\"";
data.append(obj_ptr->my_float_ip) ;
data.append("\",\"interface\":\"");
data.append(get_iface_name_str(interface));
data.append("\"}");
/* If data is empty then at least add where the message came from */
if ( data.empty() )
{
data = "{\"address\":\"";
data.append(obj_ptr->my_float_ip) ;
data.append("\",\"interface\":\"");
data.append(get_iface_name_str(interface));
data.append("\"}");
}
else
{
; /* data is already pre loaded by the command case above */
}
/* copy data into message buffer */
snprintf ( &mtc_cmd.buf[0], data.length()+1, "%s", data.data());
bytes = (sizeof(mtc_message_type)-(BUF_SIZE-(data.length()+1)));

View File

@ -43,9 +43,9 @@
#include <signal.h>
#include <fcntl.h>
#include <errno.h>
//#include <syslog.h> /* for ... syslog */
#include <sys/stat.h>
#include <list>
#include <json-c/json.h> /* for ... json_tokener_parse */
using namespace std;
@ -56,6 +56,10 @@ using namespace std;
#include "nodeBase.h" /* for ... Common Definitions */
#include "nodeTimers.h" /* fpr ... Timer Service */
#include "nodeUtil.h" /* for ... Common Utilities */
#include "hostUtil.h" /* for ... hostUtil_is_valid_... */
#include "jsonUtil.h" /* for ... jsonUtil_get_key_value_string */
#include "bmcUtil.h" /* for ... bmcUtil_accessInfo_type */
#include "ipmiUtil.h" /* for ... ipmiUtil_reset_host_now */
#include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
#include "mtcNodeMsg.h" /* for ... common maintenance messaging */
#include "mtcNodeComp.h" /* for ... this module header */
@ -96,7 +100,7 @@ string get_hostname ( void )
* Daemon Configuration Structure - The allocated struct
* @see daemon_common.h for daemon_config_type struct format.
*/
static daemon_config_type mtc_config ;
static daemon_config_type mtc_config ;
daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
/**
@ -106,6 +110,8 @@ daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
static mtc_socket_type mtc_sock ;
static mtc_socket_type * sock_ptr ;
static bmcUtil_accessInfo_type peer_controller = {"none","none","none","none","none"};
static bmcUtil_accessInfo_type this_controller = {"none","none","none","none","none"};
int run_goenabled_scripts ( string type );
@ -138,6 +144,16 @@ void timer_handler ( int sig, siginfo_t *si, void *uc)
mtcTimer_stop_int_safe ( ctrl.hostservices.timer );
ctrl.hostservices.timer.ring = true ;
}
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.sync_timer.tid )
{
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
mtcTimer_stop_int_safe ( ctrl.peer_ctrlr_reset.sync_timer );
}
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.audit_timer.tid )
{
/* use auto restart */
ctrl.peer_ctrlr_reset.audit_timer.ring = true ;
}
else
{
mtcTimer_stop_tid_int_safe ( tid_ptr );
@ -207,9 +223,8 @@ void daemon_exit ( void )
exit (0) ;
}
/* Startup config read */
static int mtc_config_handler ( void * user,
static int mtc_config_handler ( void * user,
const char * section,
const char * name,
const char * value)
@ -236,11 +251,14 @@ static int mtc_config_handler ( void * user,
config_ptr->failsafe_shutdown_delay = atoi(value);
ilog ("Shutdown TO : %d secs\n", config_ptr->failsafe_shutdown_delay );
}
else
if (( ctrl.nodetype & CONTROLLER_TYPE ) &&
(MATCH("client", "sync_b4_peer_ctrlr_reset")))
{
return (PASS);
ctrl.peer_ctrlr_reset.sync = atoi(value);
ilog("SyncB4 Reset: %s",
ctrl.peer_ctrlr_reset.sync ? "Yes" : "No" );
}
return (FAIL);
return (PASS);
}
/* Read the mtc.ini file and load control */
@ -946,6 +964,65 @@ void _manage_goenabled_tests ( void )
_scripts_cleanup (ctrl.active_script_set) ;
}
/* Service an SM peer controller reset request.
 *
 * Validates the stored peer BMC access info, issues the ipmitool reset
 * and, only when the reset succeeded, waits briefly and then removes
 * the RESET_PEER_NOW file so the request is reported as handled.
 *
 * Returns: PASS              - reset issued and request file removed
 *          FAIL              - peer BMC access info is not valid
 *          FAIL_STRING_EMPTY - could not create the output filename
 *          FAIL_OPERATION    - the reset request failed */
int issue_reset_and_cleanup ( void )
{
    const char peer_ctrlr [] = "Peer controller reset" ;
    ilog("SM %s request", peer_ctrlr );

    /* check creds */
    const bool creds_ok =
        hostUtil_is_valid_ip_addr  ( peer_controller.bm_ip ) &&
        hostUtil_is_valid_username ( peer_controller.bm_un ) &&
        hostUtil_is_valid_pw       ( peer_controller.bm_pw ) ;
    if ( !creds_ok )
    {
        elog("%s cannot reset peer BMC host at %s due to invalid credentials",
                ctrl.hostname, peer_controller.bm_ip.c_str());
        return (FAIL);
    }

    /* create output filename - no need to delete after operation */
    string output_filename = bmcUtil_create_data_fn ( ctrl.hostname,
                                                      BMC_RESET_CMD_FILE_SUFFIX,
                                                      BMC_PROTOCOL__IPMITOOL );
    if ( output_filename.empty() )
    {
        elog("%s ; failed to create output filename", peer_ctrlr);
        return (FAIL_STRING_EMPTY);
    }

    if ( ipmiUtil_reset_host_now ( ctrl.hostname,
                                   peer_controller,
                                   output_filename ) != PASS )
    {
        /* leave the request file in place when the reset did not work */
        elog("%s failed", peer_ctrlr);
        return (FAIL_OPERATION);
    }

    string result = daemon_get_file_str ( output_filename.data() );
    ilog("%s succeeded", peer_ctrlr);

    /* don't fail the operation if the result is unexpected ; but log it */
    if ( result.compare( IPMITOOL_POWER_RESET_RESP ) != 0 )
    {
        dlog("... but reset command output was unexpected ; %s",
                  result.c_str());
    }

    /* give the host a chance to reset before
     * telling SM the reset is done */
    sleep (2) ;

    /* Removing the file is what reports the reset as done */
    dlog("removing %s", RESET_PEER_NOW );
    daemon_remove_file ( RESET_PEER_NOW );

    return (PASS);
}
/* The main service loop */
int daemon_init ( string iface, string nodetype_str )
@ -963,6 +1040,7 @@ int daemon_init ( string iface, string nodetype_str )
ctrl.subfunction = 0 ;
ctrl.system_type = daemon_system_type ();
ctrl.clstr_iface_provisioned = false ;
ctrl.peer_ctrlr_reset.sync = false ;
/* convert node type to integer */
ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
@ -1018,6 +1096,13 @@ int daemon_init ( string iface, string nodetype_str )
mtcTimer_init ( ctrl.goenabled.timer, &ctrl.hostname[0], "goenable timer" );
mtcTimer_init ( ctrl.hostservices.timer, &ctrl.hostname[0], "host services timer" );
/* initialize peer controller reset feature */
mtcTimer_init ( ctrl.peer_ctrlr_reset.audit_timer, &ctrl.hostname[0], "peer ctrlr reset audit timer" ),
mtcTimer_init ( ctrl.peer_ctrlr_reset.sync_timer, &ctrl.hostname[0], "peer ctrlr reset sync timer" ),
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
ctrl.peer_ctrlr_reset.audit_period = PEER_CTRLR_AUDIT_PERIOD ;
/* initialize the script group control structures */
script_ctrl_init ( &ctrl.goenabled );
script_ctrl_init ( &ctrl.hostservices );
@ -1073,6 +1158,17 @@ void daemon_service_run ( void )
/* Send first mtcAlive ASAP */
mtcTimer_start ( ctrl.timer, timer_handler, 1 );
/* Monitor for peer controller reset requests when this
* daemon runs on a controller */
if ( ctrl.nodetype & CONTROLLER_TYPE )
{
mtcTimer_start ( ctrl.peer_ctrlr_reset.audit_timer,
timer_handler,
ctrl.peer_ctrlr_reset.audit_period );
}
mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL );
/* lets go select so that the sock does not go crazy */
dlog ("%s running main loop with %d msecs socket timeout\n",
&ctrl.hostname[0], (SOCKET_WAIT/1000) );
@ -1384,7 +1480,51 @@ void daemon_service_run ( void )
}
}
}
/* service controller specific audits */
if ( ctrl.nodetype & CONTROLLER_TYPE )
{
/* peer controller reset service audit */
if ( ctrl.peer_ctrlr_reset.audit_timer.ring )
{
if ( daemon_is_file_present ( RESET_PEER_NOW ) )
{
if ( ctrl.peer_ctrlr_reset.sync )
{
if ( ctrl.peer_ctrlr_reset.sync_timer.ring )
{
issue_reset_and_cleanup ();
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
}
else if ( ctrl.peer_ctrlr_reset.sync_timer.tid == NULL )
{
if ( send_mtcClient_cmd ( &mtc_sock,
MTC_CMD_SYNC,
peer_controller.hostname,
peer_controller.host_ip,
mtc_config.mtc_rx_mgmnt_port) == PASS )
{
mtcTimer_start ( ctrl.peer_ctrlr_reset.sync_timer, timer_handler, MTC_SECS_10 );
ilog("... waiting for peer controller to sync - %d secs", MTC_SECS_10);
}
else
{
elog("failed to send 'sync' command to peer controller mtcClient");
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
}
}
else
{
; /* wait longer */
}
}
else
{
issue_reset_and_cleanup ();
}
}
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
}
}
daemon_signal_hdlr ();
}
daemon_exit();
@ -1750,7 +1890,6 @@ void daemon_sigchld_hdlr ( void )
}
default:
{
wlog ("child handler running with no active script set (%d)\n", ctrl.active_script_set );
return ;
}
}
@ -1820,6 +1959,84 @@ void daemon_sigchld_hdlr ( void )
}
}
/***************************************************************************
 *
 * Name       : load_mtcInfo_msg
 *
 * Description: Parse the json carried in the buffer of a MTC_MSG_INFO
 *              message and load the peer controller's host ip and BMC
 *              ip, username and password from its "mtcInfo" dictionary.
 *
 *              The peer is controller-0 unless this host is controller-0,
 *              in which case the peer is controller-1.
 *
 * Assumptions: So far only the peer controller reset feature uses this.
 *              Only serviced on controllers ; logged as unexpected on any
 *              other node type.
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
void load_mtcInfo_msg ( mtc_message_type & msg )
{
    if ( ( ctrl.nodetype & CONTROLLER_TYPE ) == 0 )
    {
        slog("%s got mtcInfo ; unexpected for this nodetype", ctrl.hostname);
        return ;
    }

    mlog1("%s", &msg.buf[0]);
    struct json_object * root_obj = json_tokener_parse( &msg.buf[0] );
    if ( root_obj == NULL )
    {
        wlog("message buffer tokenize error ; %s", &msg.buf[0]);
        return ;
    }

    /* my peer is whichever controller I am not */
    if ( strcmp(&ctrl.hostname[0], CONTROLLER_0 ) != 0 )
        peer_controller.hostname = CONTROLLER_0 ;
    else
        peer_controller.hostname = CONTROLLER_1 ;

    /* locate the mtcInfo dictionary */
    struct json_object * info_obj = NULL ;
    json_bool info_rc = json_object_object_get_ex( root_obj,
                                                   "mtcInfo",
                                                   &info_obj );
    if ( ( info_rc != TRUE ) || ( info_obj == NULL ))
    {
        wlog("mtcInfo label parse error (rc:%d) ; %s",
              info_rc, &msg.buf[0]);
    }
    else
    {
        /* locate the peer's entry within the mtcInfo dictionary */
        struct json_object * peer_obj = NULL ;
        json_bool peer_rc =
        json_object_object_get_ex( info_obj,
                                   peer_controller.hostname.data(),
                                   &peer_obj );
        if (( peer_rc != TRUE ) || ( peer_obj == NULL ))
        {
            wlog("peer mtcInfo missing (rc:%d) ; %s",
                  peer_rc, &msg.buf[0]);
        }
        else
        {
            peer_controller.host_ip = jsonUtil_get_key_value_string(peer_obj, MTC_JSON_INV_HOSTIP) ;
            peer_controller.bm_ip   = jsonUtil_get_key_value_string(peer_obj, MTC_JSON_INV_BMIP) ;
            peer_controller.bm_un   = jsonUtil_get_key_value_string(peer_obj, "bm_un");
            peer_controller.bm_pw   = jsonUtil_get_key_value_string(peer_obj, "bm_pw");

            /* log the bmc info but never the bmc password itself ; only
             * indicate whether it looks 'ok' or is 'none' */
            ilog ("%s is my peer [host:%s bmc:%s:%s:%s]",
                      peer_controller.hostname.c_str(),
                      peer_controller.host_ip.c_str(),
                      peer_controller.bm_ip.c_str(),
                      peer_controller.bm_un.c_str(),
                      hostUtil_is_valid_pw(peer_controller.bm_pw) ? "ok":"none");
        }
    }

    /* release the parsed message object */
    json_object_put(root_obj);
}
/* Push daemon state to log file */
void daemon_dump_info ( void )
{
@ -1853,13 +2070,13 @@ int daemon_run_testhead ( void )
* STAGE 1: some test
************************************************/
printf ( "| Test %d : Maintenance Service Test ............. ", stage );
if ( rc != PASS )
if ( rc != PASS )
{
FAILED_STR ;
rc = FAIL ;
}
else
PASSED ;
PASSED ;
printf ("+---------------------------------------------------------+\n");
return PASS ;

View File

@ -17,6 +17,10 @@
#include <string.h>
#include <unistd.h>
using namespace std;
#include "nodeTimers.h" /* for ... Timer Service */
/** Compute Config mask */
#define CONFIG_CLIENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT |\
CONFIG_CLIENT_MTC_MGMNT_PORT |\
@ -59,6 +63,22 @@ typedef struct
} script_ctrl_type ;
void script_ctrl_init ( script_ctrl_type * script_ctrl_ptr );
/* peer controller reset control structure and associated definitions */
/* This is a flag file set by SM when SM wants maintenance to perform a
* BMC reset of the other (peer) controller */
#define RESET_PEER_NOW "/var/run/.sm_reset_peer"
#define PEER_CTRLR_AUDIT_PERIOD (2)
/* Control data for the peer controller reset service */
typedef struct
{
    /* Started after the 'sync' command is sent to the peer controller's
     * mtcClient ; the BMC reset is issued once this timer rings. */
    struct mtc_timer sync_timer ;

    /* Paces the peer controller reset audit ; the flag file check only
     * runs when this timer has rung. */
    struct mtc_timer audit_timer ;

    /* presumably the audit_timer period in seconds - TODO confirm */
    int  audit_period ;

    /* true  : send a 'sync' command to the peer and wait before reset
     * false : issue the reset right away */
    bool sync ;

} peer_ctrlr_reset_type ;
typedef struct
{
char hostname [MAX_HOST_NAME_SIZE+1];
@ -76,7 +96,7 @@ typedef struct
unsigned int function ;
unsigned int subfunction ;
struct mtc_timer timer ; /* mtcAlive timer */
struct mtc_timer timer ; /* mtcAlive timer */
bool clstr_iface_provisioned ;
@ -102,6 +122,7 @@ typedef struct
/* Where to send events */
string mtcAgent_ip ;
peer_ctrlr_reset_type peer_ctrlr_reset;
} ctrl_type ;
ctrl_type * get_ctrl_ptr ( void );
@ -109,5 +130,6 @@ ctrl_type * get_ctrl_ptr ( void );
bool is_subfunction_worker ( void );
int run_goenabled_scripts ( mtc_socket_type * sock_ptr , string requestor );
int run_hostservices_scripts ( unsigned int cmd );
void load_mtcInfo_msg ( mtc_message_type & msg );
#endif

View File

@ -1326,6 +1326,7 @@ void nodeLinkClass::fsm ( void )
daemon_signal_hdlr ();
mtcHttpSvr_look ( mtce_event );
}
mtcInv.mtcInfo_handler();
}
}

View File

@ -6166,6 +6166,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
if ( is_controller(node_ptr) )
{
this->controllers++ ;
mtc_cmd_enum state = CONTROLLER_DISABLED ;
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
@ -6635,6 +6637,8 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
mtcInfo_set ( node_ptr, MTCE_INFO_KEY__BMC_PROTOCOL, BMC_PROTOCOL__IPMI_STR );
node_ptr->bmc_protocol = BMC_PROTOCOL__IPMITOOL ;
}
/* store mtcInfo, which specifies the selected BMC protocol,
* into the sysinv database */
mtcInvApi_update_mtcInfo ( node_ptr );
ilog ("%s bmc control using %s:%s",
@ -6751,8 +6755,15 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->bmc_thread_ctrl.done = true ;
node_ptr->bmc_thread_info.command = 0 ;
}
/* store mtcInfo, which specifies the selected BMC protocol,
* into the sysinv database */
mtcInvApi_update_mtcInfo ( node_ptr );
/* push the BMC access info out to the mtcClient when
* a controller's BMC connection is established/verified */
if ( node_ptr->nodetype & CONTROLLER_TYPE )
this->want_mtcInfo_push = true ;
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
@ -6942,6 +6953,11 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
}
} /* end power off detection handling */
/* push the BMC access info out to the mtcClient when
* a controller's BMC connection is established/verified */
if ( node_ptr->nodetype & CONTROLLER_TYPE )
this->want_mtcInfo_push = true ;
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );

View File

@ -125,11 +125,13 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
int recv_mtc_reply_noblock ( void );
int send_mtc_cmd ( string & hostname, int cmd, int interface );
int send_mtc_cmd ( string & hostname, int cmd, int interface , string json_dict="" );
int mtc_service_command ( mtc_socket_type * sock_ptr , int interface );
int mtc_set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum status );
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr );
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr );
int mtc_clstr_init ( mtc_socket_type * sock_ptr , char * iface );
string get_who_i_am ( void );
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port);
#endif

View File

@ -87,6 +87,10 @@ sched_delay_threshold = 300 ; scheduler delay time in msecs that will trigger
daemon_log_port = 2121 ; daemon logger port
mtcalarm_req_port = 2122 ;
sync_b4_peer_ctrlr_reset = 0 ; issue a sync command to peer controller mtcClient
; before issuing BMC reset.
[timeouts] ; configurable maintenance timeout values in seconds
failsafe_shutdown_delay = 120;