Various engtools fixes

- Change engtools init order to ensure that stats streaming agents on
  the compute and storage nodes do not start prematurely after DOR
- Work around a systemd preun scriptlet issue that caused patch removal
  failure
- Stream database stats in batches (max 10 DBs/batch)
- Account for new processes

Story: 2002895
Task: 22858

Change-Id: Iaeeca7f51b442c27fc475777abc612d53dc97ce5
Signed-off-by: Jack Ding <jack.ding@windriver.com>
This commit is contained in:
Tee Ngo 2018-06-12 16:52:34 -04:00 committed by Dean Troyer
parent dde0f8a754
commit 363b9486e1
6 changed files with 68 additions and 44 deletions

View File

@ -10,7 +10,7 @@ BuildArch: noarch
Source: %{name}-%{version}.tar.gz
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
BuildRequires: systemd
Requires: iperf3
%description

View File

@ -15,7 +15,7 @@ DURATION=
[StaticCollection]
# Set this option to Y/N before patch creation to enable/disable static stats collection
ENABLE_STATIC_COLLECTION=Y
ENABLE_STATIC_COLLECTION=N
[CollectInternal]
# controller external OAM interface used to communicate with remote server. If unset, the first interface from ifconfig will be used
@ -46,7 +46,7 @@ filestats=30
netstats=10
postgres=30
rabbitmq=3600
vswitch=30
vswitch=120
[AdditionalOptions]
# Set this option to Y/N to enable/disable Openstack API GET/POST collection
@ -62,10 +62,10 @@ FAST_POSTGRES_CONNECTIONS=N
AUTO_DELETE_DB=N
[ControllerServices]
CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor keystone-all magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent
CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry gnocchi-api gnocchi-metricd heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent dcmanager-api dcmanager-manager dcorch-engine dcorch-neutron-api-proxy dcorch-nova-api-proxy dcorch-patch-api-proxy dcorch-snmp dcorch-sysinv-api-proxy memcached influxd
[ComputeServices]
COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent
COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent dmeventd virtlockd
[StorageServices]
STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api
@ -74,4 +74,4 @@ STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api
RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info
[CommonServices]
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd

View File

@ -1,6 +1,7 @@
[Unit]
Description=Engineering data collection tools to monitor host performance
After=network.service
Requires=network.service
After=network.service getty.target
[Service]
Type=forking

View File

@ -270,14 +270,10 @@ OPT_USE_INTERVALS=0
BINDIR=/usr/bin
LBINDIR=/usr/local/bin
while IFS='' read -r line || [[ -n "$line" ]]; do
if [[ $line =~ 'ENABLE_STATIC_COLLECTION'* ]]; then
static_collection=${line:25:1}
fi
done < /etc/engtools/engtools.conf
. /etc/engtools/engtools.conf
declare -a tlist
if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then
if [[ ${ENABLE_STATIC_COLLECTION} == "Y" ]] || [[ ${ENABLE_STATIC_COLLECTION} == "y" ]]; then
tlist+=( "tool=${LBINDIR}/top.sh name=top period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
tlist+=( "tool=${LBINDIR}/iostat.sh name=iostat period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
tlist+=( "tool=${LBINDIR}/netstats.sh name=netstats period=${PERIOD_MIN} interval=${netstats_interval}" )
@ -290,45 +286,55 @@ if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then
if [[ "${HOSTNAME}" =~ "controller-" ]]; then
tlist+=( "tool=${LBINDIR}/ceph.sh name=ceph period=${PERIOD_MIN} interval=${ceph_interval}" )
tlist+=( "tool=${LBINDIR}/postgres.sh name=postgres period=${PERIOD_MIN} interval=${postgres_interval}" )
# tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" )
tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" )
elif [[ "${HOSTNAME}" =~ "compute-" ]]; then
tlist+=( "tool=${LBINDIR}/vswitch.sh name=vswitch period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
fi
# ticker - shows progress on the screen
tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
fi
# ticker - shows progress on the screen
tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
if [[ ${ENABLE_LIVE_STREAM} == "Y" ]] || [[ ${ENABLE_LIVE_STREAM} == "y" ]]; then
${TOOLBIN}/live_stream.py &
fi
#-------------------------------------------------------------------------------
# Main loop
#-------------------------------------------------------------------------------
OPT_DEBUG=0
REP=0
while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] &&
[[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]]
do
# increment loop counter
((REP++))
# purge oldest files
purge_oldest_files
if [ ${#tlist[@]} -ne 0 ]; then
# Static stats collection is turned on
while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] &&
[[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]]
do
# increment loop counter
((REP++))
# define filename timestamp
timestamp=$( date +"%Y-%0m-%0e_%H%M" )
# purge oldest files
purge_oldest_files
# collect tools in parallel to separate output files
LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}"
do_parallel_commands
# define filename timestamp
timestamp=$( date +"%Y-%0m-%0e_%H%M" )
# collect tools in parallel to separate output files
LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}"
do_parallel_commands
wait
# Compress latest increment
LOG "compressing: ${parallel_outfiles[@]}"
${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null &
done
# Wait for the compression to complete
wait
tools_cleanup 0
fi
# Compress latest increment
LOG "compressing: ${parallel_outfiles[@]}"
${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null &
done
# wait for compression to complete
# Should wait here in case live stats streaming is turned on.
wait
tools_cleanup 0
exit 0

View File

@ -60,7 +60,6 @@ case $1 in
log_daemon_msg "Starting ${NAME}"
if start-stop-daemon --start --background --quiet --oknodo --pidfile ${PIDFILE} \
--exec ${DAEMON} -- ${DAEMON_ARGS} ; then
./usr/local/bin/live_stream.py &
log_end_msg 0
else
log_end_msg 1

View File

@ -167,6 +167,10 @@ def collectMemstats(influx_info, node, ci, services, syseng_services, openstack_
fields[gsvc]["vsz"] += vsz
elif svc == "postgres":
if (len(line) <= i+2):
# Command line could be "sudo su postgres", skip it
break
if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql":
psvc = ""
if line[i + 2] in openstack_services:
@ -284,6 +288,10 @@ def collectSchedtop(influx_info, node, ci, services, syseng_services, openstack_
fields[gsvc] += occ
elif svc == "postgres":
if (len(line) <= i+2):
# Command line could be "sudo su postgres", skip it
break
if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql":
psvc = ""
if line[i + 2] in openstack_services:
@ -589,20 +597,22 @@ def collectPostgres(influx_info, node, ci):
postgres_output = postgres_output1 = None
influx_string = influx_string1 = ""
good_string = False
dbcount = 0
BATCH_SIZE = 10
while True:
try:
# make sure this is the active controller, otherwise postgres queries won't work
if isActiveController():
while True:
# get list of databases and their sizes
postgres_output = Popen("sudo -u postgres psql --pset pager=off -q -t -c'SELECT datname, pg_database_size(datname) FROM pg_database WHERE datistemplate = false;'", shell=True, stdout=PIPE)
lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n")
if lines == "" or lines is None:
db_lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n")
if db_lines == "" or db_lines is None:
postgres_output.kill()
break
else:
# for each database from the previous output
for line in lines:
for line in db_lines:
if not line:
break
line = line.replace(" ", "").split("|")
@ -613,8 +623,8 @@ def collectPostgres(influx_info, node, ci):
# get tables for each database
sql = "SELECT table_schema,table_name,pg_size_pretty(table_size) AS table_size,pg_size_pretty(indexes_size) AS indexes_size,pg_size_pretty(total_size) AS total_size,live_tuples,dead_tuples FROM (SELECT table_schema,table_name,pg_table_size(table_name) AS table_size,pg_indexes_size(table_name) AS indexes_size,pg_total_relation_size(table_name) AS total_size,pg_stat_get_live_tuples(table_name::regclass) AS live_tuples,pg_stat_get_dead_tuples(table_name::regclass) AS dead_tuples FROM (SELECT table_schema,table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE') AS all_tables ORDER BY total_size DESC) AS pretty_sizes;"
postgres_output1 = Popen('sudo -u postgres psql --pset pager=off -q -t -d{} -c"{}"'.format(line[0], sql), shell=True, stdout=PIPE)
lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n")
for line in lines:
tbl_lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n")
for line in tbl_lines:
if line == "":
continue
else:
@ -648,6 +658,13 @@ def collectPostgres(influx_info, node, ci):
fields1["dead_tuples"] = int(elements[6])
influx_string1 += "{},'{}'='{}','{}'='{}','{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement1, "node", tags["node"], "service", tags["service"], "table_schema", tags["table_schema"], "table", tags["table"], "table_size", fields1["table_size"], "index_size", fields1["index_size"], "total_size", fields1["total_size"], "live_tuples", fields1["live_tuples"], "dead_tuples", fields1["dead_tuples"]) + "\n"
good_string = True
dbcount += 1
if dbcount == BATCH_SIZE and good_string:
# Curl will barf if the batch is too large
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True)
p.communicate()
influx_string1 = ""
dbcount = 0
if good_string:
# send table data to InfluxDB
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True)
@ -655,6 +672,7 @@ def collectPostgres(influx_info, node, ci):
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True)
p.communicate()
influx_string = influx_string1 = ""
dbcount = 0
time.sleep(ci["postgres"])
postgres_output1.kill()
postgres_output.kill()
@ -1331,7 +1349,7 @@ if __name__ == "__main__":
live_svc = ("live_stream.py",)
static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh")
collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None}
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres")
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres", "panko", "nova_cell0", "magnum", "ironic", "murano", "gnocchi")
# memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. No need to collect this stuff
exclude_list = ("python", "python2", "bash", "perl", "sudo", "init")
skip_list = ("ps", "top", "sh", "<defunct>", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su")