; Source path: metal/mtce/src/scripts/mtc.conf

; CGTS Maintenance Service config file
[agent] ; Agent Configuration
scheduling_priority = 1 ; Range of 1 .. 99 is acceptable
;
hbs_minor_threshold = 4 ; Heartbeat minor threshold count.
; Specifies the number of consecutive
; heartbeat misses that result in a
; minor notification to maintenance.
offline_period = 100 ; number of msecs to wait for each offline audit
offline_threshold = 46 ; number of back to back mtcAlive requests missed
; 100:46 (100 msecs x 46 misses = 4.6 secs) will yield
; a typical 5 sec holdoff from failed to offline
inventory_port = 6385 ; The Inventory Port Number
keystone_port = 5000 ; The Keystone Port Number
ha_port = 7777 ; The HA Port Number
mtc_agent_port = 2101 ; OBS: ........ Active Controller Maintenance Rx Port
; NOTE(review): 'OBS' presumably marks mtc_agent_port as obsolete; it
; carries the same value as mtc_rx_mgmnt_port - confirm before removing.
mtc_rx_mgmnt_port = 2101 ; Active Controller Maintenance Mgmnt Network Rx Port
mtc_rx_infra_port = 2111 ; Active Controller Maintenance Infra Network Rx Port
hbs_agent_mgmnt_port = 2103 ; Management Interface Heartbeat Pulse Response Rx Port
hbs_agent_infra_port = 2113 ; Infrastructure Interface Heartbeat Pulse Response Rx Port
infra_agent_port = 2110 ; Agent Command Response RX Port
mtc_to_hbs_cmd_port = 2104 ; Mtc to Hbs Command Port Number
mtc_to_guest_cmd_port = 2108 ; Mtc to guestAgent Command port
hbs_to_mtc_event_port = 2107 ; Hbs to Mtc Event Port Number
inv_event_port = 2112 ; The Inventory Event Port Number
token_refresh_rate = 1200 ; Authentication token refresh rate in seconds.
; A value of zero means no refresh.
; range is 0 or 600-22800
autorecovery_threshold = 3 ; The number of times maintenance will try to
; auto recover a critically failed controller
; while there are no backup controllers to fail
; over to before giving up.
; Service specific Auto Recovery failure thresholds.
;
; ar_<service>_threshold = <max_retries>
;
; If a host fails to enable due to a particular service failure, for example
; configuration, goenabled etc., then the mtcAgent will stop retrying after
; that particular service's threshold is reached. While at threshold, auto
; recovery for the specified host is disabled. The host sits there in the
; unlocked-disabled-failed state with the WEBGUI host status showing that
; auto recovery is disabled and horizon showing that a lock/unlock is required
; to trigger another enable attempt and reset of the auto recovery counter.
ar_config_threshold = 2
ar_goenable_threshold = 2
ar_hostservices_threshold = 2
ar_heartbeat_threshold = 2
; Service specific Auto Recovery retry interval.
;
; ar_<service>_interval = <retry delay in seconds>
;
; When a host fails to enable due to a particular service reason then
; the mtcAgent will use the service specific interval value specified
; to wait before it retries the enable sequence again.
ar_config_interval = 30
ar_goenable_interval = 30
ar_hostservices_interval = 30
ar_heartbeat_interval = 600
api_retries = 10 ; number of API retries before failure
[client] ; Client Configuration
scheduling_priority = 99 ; realtime scheduling; range of 1 .. 99 is acceptable
mtc_rx_mgmnt_port = 2118 ; Client Maintenance Command Mgmnt Network Rx Port
mtc_rx_infra_port = 2115 ; Client Maintenance Command Infra Network Rx Port
hbs_client_mgmnt_port = 2106 ; Management Interface Heartbeat Pulse Request Rx Port
hbs_client_infra_port = 2116 ; Infrastructure Interface Heartbeat Pulse Request Rx Port
hwmon_cmd_port = 2114 ; hwmond Command Rx Port Number
pmon_pulse_port = 2109 ; Process Monitor I'm Alive pulse Rx Port
rmon_event_port = 2302 ; Resource Monitor Event Rx Port
sched_delay_threshold = 300 ; scheduler delay time in msecs that will trigger
; a scheduler history dump
daemon_log_port = 2121 ; daemon logger port
mtcalarm_req_port = 2122 ; presumably the mtcalarm request port - TODO confirm
[timeouts] ; configurable maintenance timeout values in seconds
failsafe_shutdown_delay = 120 ; failsafe shutdown delay (seconds, per section header)
goenabled_timeout = 600 ; Time (seconds) that Mtce waits for
; a host's goenabled message
; after which it fails the enable.
; Value boosted from 300 to 600 to handle LO DOR
host_services_timeout = 300 ; Time (seconds) that the mtcClient waits for
; all the host services scripts to complete.
; Introduced in support of new monitored
; implementation.
node_reinstall_timeout = 2400 ; Timeout in seconds for a node reinstall.
; There is no retry mechanism, the admin will be
; notified that the reinstall failed.
loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout
; the max number of seconds that a host can be in
; loss of communication state without failing the unit
dor_mode_timeout = 20 ; The default base time in seconds for how long
; maintenance DOR mode is active. This number
; is extended by the number of enabled hosts.
dor_recovery_timeout_ext = 1800 ; Dor timeout extension. An extra time in seconds
; that is added to the host specific recovery time
; making the overall host's dor recovery timeout.
swact_timeout = 120 ; Seconds Mtce waits for HA Service SWACT before failing
; the swact operation
sysinv_timeout = 20 ; timeout in seconds for sysinv REST requests
sysinv_noncrit_timeout = 10 ; timeout in seconds for non-critical sysinv REST requests
work_queue_timeout = 200 ; time to wait at the end of an action handler for all the
; posted work items to be completed
uptime_period = 60 ; Mtce refresh of uptime in first hour. Not implemented
online_period = 7 ; Mtce refresh of locked availability status
insv_test_period = 10 ; Time in seconds between in-service tests
oos_test_period = 300 ; Time in seconds between out-of-service tests
audit_period = 50 ; Audit period
[debug] ; SIGHUP to reload
debug_timer = 0 ; enable(1) or disable(0) timer logs (tlog)
debug_json = 0 ; enable(1) or disable(0) Json logs (jlog)
debug_fsm = 0 ; enable(1) or disable(0) fsm logs (flog)
debug_http = 0 ; enable(1) or disable(0) http logs (hlog)
debug_alive = 0 ; enable(1) or disable(0) mtcAlive logs (alog)
debug_bm = 0 ; enable(1) or disable(0) board management (blog)
debug_msg = 0 ; enable(1) or disable(0) message logs (mlog)
debug_state = 0 ; enable(1) or disable(0) state change logs (clog)
debug_work = 0 ; enable(1) or disable(0) work queue trace logs (qlog)
debug_level = 0 ; decimal mask 0..15 (8,4,2,1) and 16 for mem logging
debug_all = 0 ; enable full tracing
flush = 1 ; enable(1) or disable(0) force log flush (main loop)
flush_thld = 5 ; if enabled - force flush after this number of loops
latency_thld = 500 ; scheduling latency log threshold ; msec
debug_event = none ; string name of HTTP API to trace
debug_filter = none ; filter string (not used yet)
infra_degrade_only = 0 ; Only degrade for infra failures
testmode = 0 ; set to 1 to enable test mode
testmask = 0 ; bit field of out-of-service stress tests
fit_code = 0 ; codes are in nodeBase.h -> fit_code_enum
fit_host = none ; the hostname to apply the fit to
stall_pmon_thld = 1250 ; number of missed pmond pulses before we enter monitor mode
; 1200 pulses * 50 msec select timeout is 60 seconds
; NOTE(review): the configured value is 1250 (1250 * 50 msec = 62.5 seconds)
; while the arithmetic above uses 1200 - confirm which is intended.
stall_poll_period = 20 ; number of seconds between monitor periods
stall_mon_period = 120 ; consecutive seconds that processes do not show forward
; progress before we perform recovery action
stall_mon_start_delay = 600 ; number of seconds after the hbsClient is started before
; the stall monitor will start monitoring
stall_rec_thld = 2 ; How many processes have to be failed for
; recovery action to take effect
; A poll period failure for a process is constituted by the
; inability to get pid, read scheduler stats or the stats
; not changing during that period
; 10 of the 11 audits need to fail before we declare that
; the process has failed.
; In this case with 5 monitored processes and this threshold
; set to 4 then if 4 or more of the 5 processes fail all
; but 1 of the audits then we take recovery action
; NOTE(review): the worked example above (5 monitored processes,
; threshold 4) does not match the values in this file
; (stall_rec_thld = 2, two non-'none' mon_process_N entries) and
; appears to predate them - confirm and refresh the example.
; A list of up to 7 processes to monitor.
; Use 'none' as a keyword for an unused process slot.
mon_process_1 = pmond
mon_process_2 = fsmond
mon_process_3 = none
mon_process_4 = none
mon_process_5 = none
mon_process_6 = none
mon_process_7 = none