; metal/mtce/src/scripts/mtc.conf

; CGTS Maintenance Service config file
[agent]                       ; Agent Configuration
scheduling_priority = 1       ; Range of 1 .. 99 is acceptable
                              ;
hbs_minor_threshold = 4       ; Heartbeat minor threshold count.
                              ; Specifies the number of consecutive
                              ;  heartbeat misses that result in a
                              ;  minor notification to maintenance.

offline_period = 100          ; number of msecs to wait for each offline audit
offline_threshold = 46        ; number of back to back mtcAlive requests missed
                              ; 100 msec x 46 misses yields a typical 5 sec
                              ; holdoff from failed to offline

inventory_port = 6385         ; The Inventory Port Number
keystone_port = 5000          ; The Keystone Port Number
ha_port = 7777                ; The HA Port Number
mtc_agent_port = 2101         ; OBS: ........ Active Controller Maintenance Rx Port
mtc_rx_mgmnt_port = 2101      ; Active Controller Maintenance Mgmnt Network Rx Port
mtc_rx_infra_port = 2111      ; Active Controller Maintenance Infra Network Rx Port
hbs_agent_mgmnt_port = 2103   ; Management Interface Heartbeat Pulse Response Rx Port
hbs_agent_infra_port = 2113   ; Infrastructure Interface Heartbeat Pulse Response Rx Port
infra_agent_port = 2110       ; Agent Command Response RX Port
mtc_to_hbs_cmd_port = 2104    ; Mtc to Hbs Command Port Number
mtc_to_guest_cmd_port = 2108  ; Mtc to guestAgent Command port
hbs_to_mtc_event_port = 2107  ; Hbs to Mtc Event Port Number
inv_event_port = 2112         ; The Inventory Event Port Number

token_refresh_rate = 1200     ; Authentication token refresh rate in seconds.
                              ; A value of zero means no refresh.
                              ; range is 0 or 600-22800
autorecovery_threshold = 3    ; The number of times maintenance will try to
                              ;  auto recover a critically failed controller
                              ;  while there are no backup controllers to fail
                              ;  over to before giving up.


; Service specific Auto Recovery failure thresholds.
;
; ar_<service>_threshold = <max_retries>
;
; If a host fails to enable due to a particular service failure, for example
; configuration, goenabled etc., then the mtcAgent will stop retrying after
; that particular service's threshold is reached. While at threshold, auto
; recovery for the specified host is disabled. The host sits there in the
; unlocked-disabled-failed state with the WEBGUI host status showing that
; auto recovery is disabled and horizon showing that a lock/unlock is required
; to trigger another enable attempt and reset of the auto recovery counter.
ar_config_threshold = 2
ar_goenable_threshold = 2
ar_hostservices_threshold = 2
ar_heartbeat_threshold = 2

; Service specific Auto Recovery retry interval.
;
; ar_<service>_interval = <retry delay in seconds>
;
; When a host fails to enable due to a particular service reason then
; the mtcAgent will use the service specific interval value specified
; to wait before it retries the enable sequence again.
ar_config_interval = 30
ar_goenable_interval = 30
ar_hostservices_interval = 30
ar_heartbeat_interval = 600

api_retries = 10              ; number of API retries before failure

[client]                     ; Client Configuration
scheduling_priority = 99     ; realtime scheduling; range of 1 .. 99 is acceptable
mtc_rx_mgmnt_port = 2118     ; Client Maintenance Mgmnt Network Command Rx Port
mtc_rx_infra_port = 2115     ; Client Maintenance Infra Network Command Rx Port
hbs_client_mgmnt_port = 2106 ; Management Interface Heartbeat Pulse Request Rx Port
hbs_client_infra_port = 2116 ; Infrastructure Interface Heartbeat Pulse Request Rx Port
hwmon_cmd_port = 2114        ; hwmond Command Rx Port Number
pmon_pulse_port = 2109       ; Process Monitor I'm Alive Pulse Rx Port
rmon_event_port = 2302       ; Resource Monitor Event Rx Port
sched_delay_threshold = 300  ; scheduler delay time in msecs that will trigger
                             ; a scheduler history dump
daemon_log_port = 2121       ; daemon logger port
mtcalarm_req_port = 2122     ; presumably the mtcalarm request port - TODO confirm

[timeouts]                   ; configurable maintenance timeout values in seconds

failsafe_shutdown_delay = 120 ; Failsafe shutdown delay; presumably in seconds
                              ;   like the rest of this section - TODO confirm.
                              ; Keep whitespace between the value and ';' so
                              ;   the ';' is never read as part of the value.

goenabled_timeout = 600      ; Time (seconds) that Mtce waits for
                             ;   a host's goenabled message
                             ;   after which it fails the enable.
                             ; Value boosted from 300 to 600 to handle LO DOR

host_services_timeout = 300  ; Time (seconds) that the mtcClient waits for
                             ;   all the host services scripts to complete.
                             ; Introduced in support of new monitored
                             ;   implementation.

node_reinstall_timeout = 2400      ; Timeout in seconds for a node reinstall.
                                   ; There is no retry mechanism, the admin will be
                                   ; notified that the reinstall failed.

loc_recovery_timeout = 5      ; Loss Of Communication Recovery Timeout
                              ;  the max number of seconds that a host can be in
                              ;  loss of communication state without failing the unit

dor_mode_timeout = 20           ; The default base time in seconds for how long
                                ; maintenance DOR mode is active. This number
                                ; is extended by the number of enabled hosts.
dor_recovery_timeout_ext = 1800 ; Dor timeout extension. An extra time in seconds
                                ; that is added to the host specific recovery time
                                ; making the overall host's dor recovery timeout.

swact_timeout = 120         ; Seconds Mtce waits for HA Service SWACT before failing
                            ;  the swact operation

sysinv_timeout = 20         ; timeout in seconds for sysinv REST requests
sysinv_noncrit_timeout = 10 ; timeout in seconds for non-critical sysinv REST requests
work_queue_timeout = 200    ; time to wait at the end of an action handler for all the
                            ;  posted work items to be completed
uptime_period = 60          ; Mtce refresh of uptime in first hour. Not implemented
online_period = 7           ; Mtce refresh of locked availability status

insv_test_period = 10       ; Time in seconds between in-service tests
oos_test_period = 300       ; Time in seconds between out-of-service tests

audit_period = 50           ; Audit period

[debug]                     ; SIGHUP to reload
debug_timer = 0             ; enable(1) or disable(0) timer logs (tlog)
debug_json = 0              ; enable(1) or disable(0) Json logs (jlog)
debug_fsm = 0               ; enable(1) or disable(0) fsm logs (flog)
debug_http = 0              ; enable(1) or disable(0) http logs (hlog)
debug_alive = 0             ; enable(1) or disable(0) mtcAlive logs (alog)
debug_bm = 0                ; enable(1) or disable(0) board management logs (blog)
debug_msg = 0               ; enable(1) or disable(0) message logs (mlog)
debug_state = 0             ; enable(1) or disable(0) state change logs (clog)
debug_work = 0              ; enable(1) or disable(0) work queue trace logs (qlog)
debug_level = 0             ; decimal mask 0..15 (8,4,2,1) and 16 for mem logging
debug_all = 0               ; enable full tracing

flush = 1                   ; enable(1) or disable(0) force log flush (main loop)
flush_thld = 5              ; if enabled - force flush after this number of loops
latency_thld = 500          ; scheduling latency log threshold in msec
debug_event = none          ; string name of HTTP API to trace
debug_filter = none         ; filter string (not used yet)

infra_degrade_only = 0      ; Only degrade for infra failures
testmode = 0                ; set to 1 to enable test mode
testmask = 0                ; bit field of out-of-service stress tests
fit_code = 0                ; codes are in nodeBase.h -> fit_code_enum
fit_host = none             ; the hostname to apply the fit to

stall_pmon_thld = 1250      ; number of missed pmond pulses before we enter monitor mode
                            ;   1250 pulses * 50 msec select timeout is 62.5 seconds
                            ; NOTE(review): this comment previously said 1200 pulses
                            ;   (60 seconds); confirm which value is intended.
stall_poll_period = 20      ; number of seconds between monitor periods

stall_mon_period = 120      ; consecutive seconds that processes do not show forward
                            ;   progress before we perform recovery action
stall_mon_start_delay = 600 ; number of seconds after the hbsClient is started before
                            ;   the stall monitor will start monitoring
stall_rec_thld = 2          ; How many processes have to be failed for
                            ;   recovery action to take effect
                            ; A poll period failure for a process is constituted by the
                            ;   inability to get pid, read scheduler stats or the stats
                            ;   not changing during that period
                            ; 10 of the 11 audits need to fail before we declare that
                            ;   the process has failed.
                            ; For example, with 5 monitored processes and this threshold
                            ;   set to 4, if 4 or more of the 5 processes fail all
                            ;   but 1 of the audits then we take recovery action

; A list of up to 7 processes to monitor.
; Use 'none' as the keyword for an unused process slot.
mon_process_1 = pmond
mon_process_2 = fsmond
mon_process_3 = none
mon_process_4 = none
mon_process_5 = none
mon_process_6 = none
mon_process_7 = none