; CGTS Maintenance Service config file

[agent] ; Agent Configuration
scheduling_priority = 1 ; Range of 1 .. 99 is acceptable
;
hbs_minor_threshold = 4 ; Heartbeat minor threshold count.
; Specifies the number of consecutive
; heartbeat misses that result in a
; minor notification to maintenance.

offline_period = 100 ; number of msecs to wait for each offline audit
offline_threshold = 46 ; number of back to back mtcAlive requests missed
; 100:46 will yield a typical 5 sec holdoff from
; failed to offline

inventory_port = 6385 ; The Inventory Port Number
keystone_port = 5000 ; The Keystone Port Number
ha_port = 7777 ; The HA Port Number
mtc_agent_port = 2101 ; OBS: ........ Active Controller Maintenance Rx Port
mtc_rx_mgmnt_port = 2101 ; Active Controller Maintenance Mgmnt Network Rx Port
mtc_rx_clstr_port = 2111 ; Active Controller Maintenance Clstr Network Rx Port
hbs_agent_mgmnt_port = 2103 ; Management Interface Heartbeat Pulse Response Rx Port
hbs_agent_clstr_port = 2113 ; Cluster-host Interface Heartbeat Pulse Response Rx Port
clstr_agent_port = 2110 ; Agent Command Response RX Port
mtc_to_hbs_cmd_port = 2104 ; Mtc to Hbs Command Port Number
mtc_to_guest_cmd_port = 2108 ; Mtc to guestAgent Command port
hbs_to_mtc_event_port = 2107 ; Hbs to Mtc Event Port Number
inv_event_port = 2112 ; The Inventory Event Port Number
barbican_port = 9311 ; The Barbican Port Number

token_refresh_rate = 1200 ; Authentication token refresh rate in seconds.
; A value of zero means no refresh.
; range is 0 or 600-22800
autorecovery_threshold = 3 ; The number of times maintenance will try to
; auto recover a critically failed controller
; while there are no backup controllers to fail
; over to before giving up.

; Service specific Auto Recovery failure thresholds.
;
; ar_<service>_threshold = <max_retries>
;
; If a host fails to enable due to a particular service failure, for example
; configuration, goenabled etc., then the mtcAgent will stop retrying after
; the particular service's threshold is reached. While at threshold, auto
; recovery for the specified host is disabled. The host sits there in the
; unlocked-disabled-failed state with the WEBGUI host status showing that
; auto recovery is disabled and horizon showing that a lock/unlock is required
; to trigger another enable attempt and reset of the auto recovery counter.
ar_config_threshold = 2
ar_goenable_threshold = 2
ar_hostservices_threshold = 2
ar_heartbeat_threshold = 2

; Service specific Auto Recovery retry interval.
;
; ar_<service>_interval = <retry delay in seconds>
;
; When a host fails to enable due to a particular service reason then
; the mtcAgent will use the service specific interval value specified
; to wait before it retries the enable sequence again.
ar_config_interval = 30
ar_goenable_interval = 30
ar_hostservices_interval = 30
ar_heartbeat_interval = 600

api_retries = 10 ; number of API retries before failure

[client] ; Client Configuration

scheduling_priority = 45 ; realtime scheduling; range of 1 .. 99
; recommended setting: no higher than 45.

mtc_rx_mgmnt_port = 2118 ; Client Maintenance Mgmnt Network Command Rx Port
mtc_rx_clstr_port = 2115 ; Client Maintenance Clstr Network Command Rx Port
hbs_client_mgmnt_port = 2106 ; Management Interface Heartbeat Pulse Request Rx Port
hbs_client_clstr_port = 2116 ; Cluster-host Interface Heartbeat Pulse Request Rx Port
hwmon_cmd_port = 2114 ; hwmond Command Rx Port Number
pmon_pulse_port = 2109 ; Process Monitor I'm Alive Pulse Rx Port
sched_delay_threshold = 300 ; scheduler delay time in msecs that will trigger
; a scheduler history dump
daemon_log_port = 2121 ; daemon logger port
mtcalarm_req_port = 2122 ; NOTE(review): presumably the mtcalarm request port - confirm

[timeouts] ; configurable maintenance timeout values in seconds

failsafe_shutdown_delay = 120 ; NOTE(review): presumably seconds, per the section header - confirm

goenabled_timeout = 600 ; Time (seconds) that Mtce waits for
; a host's goenabled message
; after which it fails the enable.
; Value boosted from 300 to 600 to handle LO DOR

host_services_timeout = 300 ; Time (seconds) that the mtcClient waits for
; all the host services scripts to complete.
; Introduced in support of new monitored
; implementation.

node_reinstall_timeout = 2400 ; Timeout in seconds for a node reinstall.
; There is no retry mechanism, the admin will be
; notified that the reinstall failed.

loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout
; the max number of seconds that a host can be in
; loss of communication state without failing the unit

dor_mode_timeout = 20 ; The default base time in seconds for how long
; maintenance DOR mode is active. This number
; is extended by the number of enabled hosts.
dor_recovery_timeout_ext = 1800 ; Dor timeout extension. An extra time in seconds
; that is added to the host specific recovery time
; making the overall host's dor recovery timeout.

swact_timeout = 120 ; Seconds Mtce waits for HA Service SWACT before failing
; the swact operation

sysinv_timeout = 20 ; timeout in seconds for sysinv REST requests
sysinv_noncrit_timeout = 10 ; timeout for non-critical sysinv REST requests
work_queue_timeout = 200 ; time to wait at the end of an action handler for all the
; posted work items to be completed
uptime_period = 60 ; Mtce refresh of uptime in first hour. Not implemented
online_period = 7 ; Mtce refresh of locked availability status

insv_test_period = 10 ; Time in seconds between in-service tests
oos_test_period = 300 ; Time in seconds between out-of-service tests

audit_period = 50 ; Audit period

[debug] ; SIGHUP to reload
debug_timer = 0 ; enable(1) or disable(0) timer logs (tlog)
debug_json = 0 ; enable(1) or disable(0) Json logs (jlog)
debug_fsm = 0 ; enable(1) or disable(0) fsm logs (flog)
debug_http = 0 ; enable(1) or disable(0) http logs (hlog)
debug_alive = 0 ; enable(1) or disable(0) mtcAlive logs (alog)
debug_bm = 0 ; enable(1) or disable(0) board management (blog)
debug_msg = 0 ; enable(1) or disable(0) message logs (mlog)
debug_state = 0 ; enable(1) or disable(0) state change logs (clog)
debug_work = 0 ; enable(1) or disable(0) work queue trace logs (qlog)
debug_level = 0 ; decimal mask 0..15 (8,4,2,1) and 16 for mem logging
debug_all = 0 ; enable full tracing

flush = 1 ; enable(1) or disable(0) force log flush (main loop)
flush_thld = 5 ; if enabled - force flush after this number of loops
latency_thld = 500 ; scheduling latency log threshold (msec)
debug_event = none ; string name of HTTP API to trace
debug_filter = none ; filter string (not used yet)

clstr_degrade_only = 0 ; Only degrade for cluster-host failures
testmode = 0 ; set to 1 to enable test mode
testmask = 0 ; bit field of out-of-service stress tests
fit_code = 0 ; codes are in nodeBase.h -> fit_code_enum
fit_host = none ; the hostname to apply the fit to

stall_pmon_thld = 1250 ; number of missed pmond pulses before we enter monitor mode
; 1250 pulses * 50 msec select timeout is 62.5 seconds
; NOTE(review): this comment formerly cited 1200 pulses (60 seconds);
; confirm that 1250 is the intended value.
stall_poll_period = 20 ; number of seconds between monitor periods

stall_mon_period = 120 ; consecutive seconds that processes do not show forward
; progress before we perform recovery action
stall_mon_start_delay = 600 ; number of seconds after the hbsClient is started before
; the stall monitor will start monitoring
stall_rec_thld = 2 ; How many processes have to be failed for
; recovery action to take effect
; A poll period failure for a process is constituted by the
; inability to get pid, read scheduler stats or the stats
; not changing during that period
; 10 of the 11 audits need to fail before we declare that
; the process has failed.
; For example, with 5 monitored processes and this threshold
; set to 4, if 4 or more of the 5 processes fail all
; but 1 of the audits then we take recovery action

; A list of up to 7 processes to monitor.
; Use 'none' as a keyword for an unused process timeslot.
mon_process_1 = pmond
mon_process_2 = fsmond
mon_process_3 = none
mon_process_4 = none
mon_process_5 = none
mon_process_6 = none
mon_process_7 = none