; CGTS Maintenance Service config file

; Agent Configuration
[agent]

; Range of 1 .. 99 is acceptable
scheduling_priority = 1

; Heartbeat minor threshold count.
; Specifies the number of consecutive heartbeat misses that result in a
; minor notification to maintenance.
hbs_minor_threshold = 4

; number of msecs to wait for each offline audit
offline_period = 100

; number of back to back mtcAlive requests missed
; offline_period:offline_threshold of 100:46 will yield a typical 5 sec
; holdoff from failed to offline
offline_threshold = 46

; The Inventory Port Number
inventory_port = 6385

; The Keystone Port Number
keystone_port = 5000

; The HA Service Port Number
ha_port = 7777

; OBS: Active Controller Maintenance Rx Port
mtc_agent_port = 2101

; Active Controller Maintenance Mgmnt Network Rx Port
mtc_rx_mgmnt_port = 2101

; Active Controller Maintenance Infra Network Rx Port
mtc_rx_infra_port = 2111

; Management Interface Heartbeat Pulse Response Rx Port
hbs_agent_mgmnt_port = 2103

; Infrastructure Interface Heartbeat Pulse Response Rx Port
hbs_agent_infra_port = 2113

; Agent Command Response Rx Port
infra_agent_port = 2110

; Mtc to Hbs Command Port Number
mtc_to_hbs_cmd_port = 2104

; Mtc to guestAgent Command Port Number
mtc_to_guest_cmd_port = 2108

; Hbs to Mtc Event Port Number
hbs_to_mtc_event_port = 2107

; The Inventory Event Port Number
inv_event_port = 2112

; Authentication token refresh rate in seconds.
; A value of zero means no refresh.
; Range is 0 or 600-22800
token_refresh_rate = 1200

; The number of times maintenance will try to auto recover a critically
; failed controller while there are no backup controllers to fail over to
; before giving up.
autorecovery_threshold = 3

; Service specific Auto Recovery failure thresholds.
;
;    ar_<service>_threshold = <threshold>
;
; If a host fails to enable due to a particular service failure, for example
; configuration, goenabled etc., then the mtcAgent will stop retrying after
; that service's threshold is reached. While at threshold, auto recovery for
; the specified host is disabled. The host sits there in the
; unlocked-disabled-failed state with the WEBGUI host status showing that
; auto recovery is disabled and horizon showing that a lock/unlock is
; required to trigger another enable attempt and reset of the auto recovery
; counter.
ar_config_threshold = 2
ar_goenable_threshold = 2
ar_hostservices_threshold = 2
ar_heartbeat_threshold = 2

; Service specific Auto Recovery retry interval.
;
;    ar_<service>_interval = <interval>
;
; When a host fails to enable due to a particular service reason then
; the mtcAgent will use the service specific interval value specified
; to wait before it retries the enable sequence again.
ar_config_interval = 30
ar_goenable_interval = 30
ar_hostservices_interval = 30
ar_heartbeat_interval = 600

; number of API retries before failure
api_retries = 10

; Client Configuration
[client]

; realtime scheduling; range of 1 .. 99 is acceptable
scheduling_priority = 99

; Client Maintenance Command Mgmnt Network Rx Port
mtc_rx_mgmnt_port = 2118

; Client Maintenance Command Infra Network Rx Port
mtc_rx_infra_port = 2115

; Management Interface Heartbeat Pulse Request Rx Port
hbs_client_mgmnt_port = 2106

; Infrastructure Interface Heartbeat Pulse Request Rx Port
hbs_client_infra_port = 2116

; hwmond Command Rx Port Number
hwmon_cmd_port = 2114

; Process Monitor I'm Alive pulse Rx Port
pmon_pulse_port = 2109

; Resource Monitor Event Rx Port
rmon_event_port = 2302

; scheduler delay time in msecs that will trigger a scheduler history dump
sched_delay_threshold = 300

; daemon logger port
daemon_log_port = 2121

mtcalarm_req_port = 2122

; configurable maintenance timeout values in seconds
[timeouts]

failsafe_shutdown_delay = 120

; Time (seconds) that Mtce waits for a host's goenabled message,
; after which it fails the enable.
; Value boosted from 300 to 600 to handle LO DOR
goenabled_timeout = 600

; Time (seconds) that the mtcClient waits for all the host services
; scripts to complete.
; Introduced in support of new monitored implementation.
host_services_timeout = 300

; Timeout in seconds for a node reinstall.
; There is no retry mechanism; the admin will be notified that the
; reinstall failed.
node_reinstall_timeout = 2400

; Loss Of Communication Recovery Timeout:
; the max number of seconds that a host can be in loss of communication
; state without failing the unit
loc_recovery_timeout = 5

; The default base time in seconds for how long maintenance DOR mode is
; active. This number is extended by the number of enabled hosts.
dor_mode_timeout = 20

; DOR timeout extension. An extra time in seconds that is added to the
; host specific recovery time, making the overall host's DOR recovery
; timeout.
dor_recovery_timeout_ext = 1800

; Seconds Mtce waits for HA Service SWACT before failing the swact operation
swact_timeout = 120

; timeout in seconds for sysinv REST requests
sysinv_timeout = 20

; timeout for non-critical sysinv REST requests
sysinv_noncrit_timeout = 10

; time to wait at the end of an action handler for all the posted work
; items to be completed
work_queue_timeout = 200

; Mtce refresh of uptime in first hour. Not implemented
uptime_period = 60

; Mtce refresh of locked availability status
online_period = 7

; Time in seconds between in-service tests
insv_test_period = 10

; Time in seconds between out-of-service tests
oos_test_period = 300

; Audit period
audit_period = 50

; SIGHUP to reload
[debug]

; enable(1) or disable(0) timer logs (tlog)
debug_timer = 0

; enable(1) or disable(0) Json logs (jlog)
debug_json = 0

; enable(1) or disable(0) fsm logs (flog)
debug_fsm = 0

; enable(1) or disable(0) http logs (hlog)
debug_http = 0

; enable(1) or disable(0) mtcAlive logs (alog)
debug_alive = 0

; enable(1) or disable(0) board management (blog)
debug_bm = 0

; enable(1) or disable(0) message logs (mlog)
debug_msg = 0

; enable(1) or disable(0) state change logs (clog)
debug_state = 0

; enable(1) or disable(0) work queue trace logs (qlog)
debug_work = 0

; decimal mask 0..15 (8,4,2,1) and 16 for mem logging
debug_level = 0

; enable full tracing
debug_all = 0

; enable(1) or disable(0) force log flush (main loop)
flush = 1

; if enabled - force flush after this number of loops
flush_thld = 5

; scheduling latency log threshold in msec
latency_thld = 500

; string name of HTTP API to trace
debug_event = none

; filter string (not used yet)
debug_filter = none

; Only degrade for infra failures
infra_degrade_only = 0

; set to 1 to enable test mode
testmode = 0

; bit field of out-of-service stress tests
testmask = 0

; codes are in nodeBase.h -> fit_code_enum
fit_code = 0

; the hostname to apply the fit to
fit_host = none

; number of missed pmond pulses before we enter monitor mode
; e.g. 1200 pulses * 50 msec select timeout is 60 seconds
; NOTE(review): the configured value is 1250 (1250 * 50 msec = 62.5 seconds);
; confirm which figure is intended.
stall_pmon_thld = 1250

; number of seconds between monitor periods
stall_poll_period = 20

; consecutive seconds that processes do not show forward progress before
; we perform recovery action
stall_mon_period = 120

; number of seconds after the hbsClient is started before the stall
; monitor will start monitoring
stall_mon_start_delay = 600

; How many processes have to be failed for recovery action to take effect.
;
; A poll period failure for a process is constituted by the inability to
; get pid, read scheduler stats or the stats not changing during that
; period. 10 of the 11 audits need to fail before we declare that the
; process has failed.
;
; Example: with 5 monitored processes and this threshold set to 4, recovery
; action is taken if 4 or more of the 5 processes fail all but 1 of the
; audits.
stall_rec_thld = 2

; A list of up to 7 processes to monitor.
; Use 'none' as a keyword for an unused process timeslot.
mon_process_1 = pmond
mon_process_2 = fsmond
mon_process_3 = none
mon_process_4 = none
mon_process_5 = none
mon_process_6 = none
mon_process_7 = none