[PTP] Reduce cpu usage and correct holdover time

Fixes two issues:

1.
The CPU usage consumed by monitoring the kern.log file for changes in
the NIC cgu status was found to be excessive. This change reverts that
implementation and returns to a polling approach similar to what was
already used for monitoring ptp4l and os clock statuses. This has been
observed to bring main process in the notificationservice-base image
down from ~60% CPU to ~8-10% CPU utilization.

In the future, a preferrable implementation would be to work with device
driver owners to provide support for udev events, removing the need to
poll the status of devices.

2.
User supplied holdover times for each service type were not being
applied correctly. Updated daemon.py to set the holdover times in the
service context.

This also includes providing a user configurable "CONTROL_TIMEOUT"
parameter to control the frequency of polling. This is a global value
and affects the polling rate for all services.

Test-plan:
PASS: Build and install ptp-notification app and containers
PASS: Observe reduced CPU usage and confirm that GNSS monitoring still
works
PASS: User supplied holdover times work correctly, along with polling
rate

Story: 2010056
Task: 46512
Task: 46513

Signed-off-by: Cole Walker <cole.walker@windriver.com>
Change-Id: Ic0050cc09f5118e7f1c32aa13168084d6456437e
This commit is contained in:
Cole Walker 2022-10-03 15:37:33 -04:00
parent 30f691a06b
commit 63fefca36c
12 changed files with 50 additions and 183 deletions

View File

@ -12,7 +12,6 @@ RUN set -ex ;\
gcc python3-devel python3-pip \
&& pip3 install --upgrade pip \
&& pip3 install --user pecan \
&& pip3 install pygtail \
&& pip3 install oslo-config \
&& pip3 install oslo-messaging \
&& pip3 install WSME

View File

@ -48,9 +48,6 @@ PHC2SYS_TOLERANCE_HIGH = 37000001000
PTP_V1_KEY = "ptp_notification_v1"
# testing values
CGU_PATH_VALID = "/sys/kernel/debug/ice/0000:18:00.0/cgu"
SPEC_VERSION = "1.0"
DATA_VERSION = "1.0"
DATA_TYPE_NOTIFICATION = "notification"

View File

@ -1,69 +0,0 @@
#
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import logging
from pygtail import Pygtail
from typing import List
from abc import ABC, abstractmethod
from trackingfunctionsdk.common.helpers import log_helper
from trackingfunctionsdk.common.helpers.gnss_monitor import Observer
LOG = logging.getLogger(__name__)
log_helper.config_logger(LOG)
class DmesgSubject(ABC):
@abstractmethod
def attach(self, observer: Observer) -> None:
pass
@abstractmethod
def detach(self, observer: Observer) -> None:
pass
@abstractmethod
def notify(self) -> None:
pass
class DmesgWatcher(DmesgSubject, ABC):
_observers: List[Observer] = []
_checklist = []
_matched_line = ""
def __init__(self, dmesg_log_file="/logs/kern.log"):
self.dmesg_log_file = dmesg_log_file
def parse_dmesg_event(self, dmesg_entry) -> None:
for observer in self._observers:
if observer.dmesg_values_to_check['pin'] in dmesg_entry \
and observer.dmesg_values_to_check['pci_addr'] in dmesg_entry:
matched_line = dmesg_entry
self.notify(observer, matched_line)
def run_watcher(self) -> None:
"""
This is intended to be run as a separate thread to follow the log file for events.
There is currently no support in the NIC device drivers for udev events that
would avoid polling/monitoring.
"""
while True:
for line in Pygtail(self.dmesg_log_file, offset_file="./kern.offset"):
self.parse_dmesg_event(line)
def attach(self, observer: Observer) -> None:
LOG.info("DmesgWatcher: Attached an observer.")
self._observers.append(observer)
def notify(self, observer, matched_line) -> None:
LOG.info("DmesgWatcher: Notifying observers.")
observer.update(self, matched_line)
def detach(self, observer: Observer) -> None:
self._observers.remove(observer)
LOG.debug("Removed an observer.")

View File

@ -11,7 +11,6 @@ import re
from abc import ABC, abstractmethod
from trackingfunctionsdk.common.helpers import log_helper
from trackingfunctionsdk.common.helpers import constants
from trackingfunctionsdk.common.helpers.cgu_handler import CguHandler
from trackingfunctionsdk.model.dto.gnssstate import GnssState
@ -94,8 +93,6 @@ class GnssMonitor(Observer):
LOG.debug("Set state GNSS to %s" % self._state)
def get_gnss_status(self, holdover_time, freq, sync_state, event_time):
current_time = datetime.datetime.utcnow().timestamp()
time_in_holdover = round(current_time - event_time)
previous_sync_state = sync_state
max_holdover_time = (holdover_time - freq * 2)

View File

@ -117,12 +117,21 @@ class OsClockMonitor:
def get_os_clock_offset(self):
"""Get the os CLOCK_REALTIME offset"""
ptp_device_path = "/dev/" + self.ptp_device
offset = subprocess.check_output([constants.PHC_CTL_PATH, ptp_device_path, 'cmp']
try:
ptp_device_path = "/dev/" + self.ptp_device
offset = subprocess.check_output([constants.PHC_CTL_PATH, ptp_device_path, 'cmp']
).decode().split()[-1]
offset = offset.strip("-ns")
LOG.debug("PHC offset is %s" % offset)
self.offset = offset
offset = offset.strip("-ns")
LOG.debug("PHC offset is %s" % offset)
self.offset = offset
except Exception as ex:
# We have seen rare instances where the ptp device cannot be read but then works fine
# on the next attempt. Setting the offset to 0 here will allow the OS clock to move to
# holdover. If there is a real fault, it will stay in holdover and tranition to freerun
# but if it was just a single miss, it will return to locked on the next check.
LOG.warning("Unable to read device offset for %s due to %s" % (ptp_device_path, ex))
LOG.warning("Check operation of %s. Defaulting offset value to 0." % ptp_device_path)
self.offset = "0"
def set_os_clock_state(self):
offset_int = int(self.offset)
@ -139,8 +148,10 @@ class OsClockMonitor:
def os_clock_status(self, holdover_time, freq, sync_state, event_time):
current_time = datetime.datetime.utcnow().timestamp()
time_in_holdover = round(current_time - event_time)
time_in_holdover = None
previous_sync_state = sync_state
if previous_sync_state == constants.HOLDOVER_PHC_STATE:
time_in_holdover = round(current_time - event_time)
max_holdover_time = (holdover_time - freq * 2)
self.get_os_clock_offset()
@ -153,6 +164,8 @@ class OsClockMonitor:
self._state = constants.HOLDOVER_PHC_STATE
elif previous_sync_state == constants.HOLDOVER_PHC_STATE and \
time_in_holdover < max_holdover_time:
LOG.debug("OS Clock: Time in holdover is %s Max time in holdover is %s" % (
time_in_holdover, max_holdover_time))
self._state = constants.HOLDOVER_PHC_STATE
else:
self._state = constants.FREERUN_PHC_STATE

View File

@ -116,11 +116,14 @@ class PtpMonitor:
# Freerun —> Locked #
####################################
current_time = datetime.datetime.utcnow().timestamp()
time_in_holdover = round(current_time - self._ptp_event_time)
time_in_holdover = None
previous_sync_state = self._ptp_sync_state
if previous_sync_state == PtpState.Holdover:
time_in_holdover = round(current_time - self._ptp_event_time)
# max holdover time is calculated to be in a 'safety' zone
max_holdover_time = (self.holdover_time - self.freq * 2)
pmc, ptp4l, phc2sys, ptp4lconf = utils.check_critical_resources(self.ptp4l_service_name,
self.phc2sys_service_name)
# run pmc command if preconditions met
@ -140,6 +143,8 @@ class PtpMonitor:
sync_state = PtpState.Holdover
elif previous_sync_state == PtpState.Holdover and time_in_holdover < \
max_holdover_time:
LOG.debug("PTP Status: Time in holdover is %s Max time in holdover is %s" % (
time_in_holdover, max_holdover_time))
sync_state = PtpState.Holdover
else:
sync_state = PtpState.Freerun

View File

@ -16,7 +16,6 @@ from trackingfunctionsdk.client.ptpeventproducer import PtpEventProducer
from trackingfunctionsdk.common.helpers import constants
from trackingfunctionsdk.common.helpers import ptpsync as utils
from trackingfunctionsdk.common.helpers import log_helper
from trackingfunctionsdk.common.helpers.dmesg_watcher import DmesgWatcher
from trackingfunctionsdk.common.helpers.gnss_monitor import GnssMonitor
from trackingfunctionsdk.common.helpers.os_clock_monitor import OsClockMonitor
from trackingfunctionsdk.common.helpers.ptp_monitor import PtpMonitor
@ -262,6 +261,9 @@ class PtpWatcherDefault:
self.ptptracker_context[config] = PtpWatcherDefault.DEFAULT_PTPTRACKER_CONTEXT.copy()
self.ptptracker_context[config]['sync_state'] = PtpState.Freerun
self.ptptracker_context[config]['last_event_time'] = self.init_time
self.ptptracker_context[config]['holdover_seconds'] = os.environ.get("PTP_HOLDOVER_SECONDS", 30)
self.ptptracker_context[config]['poll_freq_seconds'] = os.environ.get(
"CONTROL_TIMEOUT", 2)
self.ptp_device_simulated = "true" == self.ptptracker_context[config].get(
'device_simulated',
"False")
@ -274,6 +276,9 @@ class PtpWatcherDefault:
self.gnsstracker_context[config] = PtpWatcherDefault.DEFAULT_GNSSTRACKER_CONTEXT.copy()
self.gnsstracker_context[config]['sync_state'] = GnssState.Failure_Nofix
self.gnsstracker_context[config]['last_event_time'] = self.init_time
self.gnsstracker_context[config]['holdover_seconds'] = os.environ.get("GNSS_HOLDOVER_SECONDS", 30)
self.gnsstracker_context[config]['poll_freq_seconds'] = os.environ.get(
"CONTROL_TIMEOUT", 2)
self.gnsstracker_context_lock = threading.Lock()
LOG.debug("gnsstracker_context: %s" % self.gnsstracker_context)
@ -282,6 +287,9 @@ class PtpWatcherDefault:
self.osclocktracker_context = PtpWatcherDefault.DEFAULT_OS_CLOCK_TRACKER_CONTEXT.copy()
self.osclocktracker_context['sync_state'] = OsClockState.Freerun
self.osclocktracker_context['last_event_time'] = self.init_time
self.osclocktracker_context['holdover_seconds'] = os.environ.get("OS_CLOCK_HOLDOVER_SECONDS", 30)
self.osclocktracker_context['poll_freq_seconds'] = os.environ.get(
"CONTROL_TIMEOUT", 2)
self.osclocktracker_context_lock = threading.Lock()
# Overall Sync Context
@ -289,6 +297,9 @@ class PtpWatcherDefault:
self.overalltracker_context = PtpWatcherDefault.DEFAULT_OVERALL_SYNC_TRACKER_CONTEXT.copy()
self.overalltracker_context['sync_state'] = OverallClockState.Freerun
self.overalltracker_context['last_event_time'] = self.init_time
self.overalltracker_context['holdover_seconds'] = os.environ.get("OVERALL_HOLDOVER_SECONDS", 30)
self.overalltracker_context['poll_freq_seconds'] = os.environ.get(
"CONTROL_TIMEOUT", 2)
self.overalltracker_context_lock = threading.Lock()
self.event_timeout = float(os.environ.get('CONTROL_TIMEOUT', 2))
@ -315,12 +326,7 @@ class PtpWatcherDefault:
# Main loop in run() sets it to false after the first iteration
self.forced_publishing = True
self.watcher = DmesgWatcher()
self.observer_list = [GnssMonitor(i) for i in self.daemon_context['GNSS_CONFIGS']]
for observer in self.observer_list:
self.watcher.attach(observer)
self.watcher_thread = threading.Thread(target=self.watcher.run_watcher)
# Setup OS Clock monitor
self.os_clock_monitor = OsClockMonitor(phc2sys_config=self.daemon_context['PHC2SYS_CONFIG'])
@ -343,9 +349,6 @@ class PtpWatcherDefault:
# start location listener
self.__start_listener()
# start GNSS monitoring
self.watcher_thread.start()
while True:
# announce the location
forced = self.forced_publishing
@ -398,7 +401,9 @@ class PtpWatcherDefault:
new_event_time = last_event_time
previous_sync_state = sync_state
current_time = datetime.datetime.utcnow().timestamp()
time_in_holdover = round(current_time - last_event_time)
time_in_holdover = None
if previous_sync_state == constants.HOLDOVER_PHC_STATE:
time_in_holdover = round(current_time - last_event_time)
max_holdover_time = (holdover_time - freq * 2)
gnss_state = None
os_clock_state = None
@ -434,6 +439,8 @@ class PtpWatcherDefault:
sync_state = OverallClockState.Holdover
elif previous_sync_state == constants.HOLDOVER_PHC_STATE and \
time_in_holdover < max_holdover_time:
LOG.debug("Overall sync: Time in holdover is %s Max time in holdover is %s" % (
time_in_holdover, max_holdover_time))
sync_state = OverallClockState.Holdover
else:
sync_state = OverallClockState.Freerun

View File

@ -1,56 +0,0 @@
#
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import unittest
import os
from unittest.mock import MagicMock
from trackingfunctionsdk.common.helpers.dmesg_watcher import DmesgWatcher
from trackingfunctionsdk.common.helpers.gnss_monitor import GnssMonitor
testpath = os.environ.get("TESTPATH", "")
class DmesgWatcherTests(unittest.TestCase):
testDmesgWatcher = DmesgWatcher()
observer_a = GnssMonitor(testpath + "./test_input_files/ts2phc_valid.conf",
"tty_GNSS_1800_0", "0000:18:00.0",
testpath + "./test_input_files/mock_cgu_output")
observer_b = GnssMonitor(testpath + "./test_input_files/ts2phc_valid.conf",
"tty_GNSS_1a00_0", "0000:1a:00.0",
testpath + "./test_input_files/mock_cgu_output")
def test_parse_dmesg_event(self):
self.testDmesgWatcher.attach(self.observer_a)
self.testDmesgWatcher.notify = MagicMock()
with open(testpath + "./test_input_files/mock_kern.log", 'r') as dmesg:
for line in dmesg:
self.testDmesgWatcher.parse_dmesg_event(line)
assert self.testDmesgWatcher.notify.called
# Test that notify is not called when there is no match
self.testDmesgWatcher.notify.reset_mock()
self.testDmesgWatcher.attach(self.observer_b)
with open(testpath + "./test_input_files/mock_kern.log", 'r') as dmesg:
for line in dmesg:
self.testDmesgWatcher.parse_dmesg_event(line)
assert self.testDmesgWatcher.notify.assert_not_called
def test_attach_detach(self):
self.testDmesgWatcher.attach(self.observer_a)
self.testDmesgWatcher.attach(self.observer_b)
self.assertEqual(len(self.testDmesgWatcher._observers), 2)
self.testDmesgWatcher.detach(self.observer_a)
self.testDmesgWatcher.detach(self.observer_b)
self.assertEqual(len(self.testDmesgWatcher._observers), 0)
def test_notify(self):
self.observer_a.update = MagicMock
self.testDmesgWatcher.notify(observer=self.observer_a,
matched_line="2022-06-03T19:50:05.959 controller-0 kernel: warning [ "
"4.635511] ice 0000:18:00.0: <DPLL1> state changed to: "
"locked_ho_ack, pin GNSS-1PPS")
assert self.observer_a.update.called

View File

@ -7,4 +7,3 @@ WebOb~=1.8.7
requests~=2.27.1
mock
PyYAML
pygtail~=0.12.0

View File

@ -44,16 +44,9 @@ NOTIFICATION_TRANSPORT_ENDPOINT = 'rabbit://{0}:{1}@[{2}]:{3}'.format(
PTP_DEVICE_SIMULATED = os.environ.get("PTP_DEVICE_SIMULATED", True)
PTP_HOLDOVER_SECONDS = os.environ.get("PTP_HOLDOVER_SECONDS", 30)
PTP_POLL_FREQ_SECONDS = os.environ.get("PTP_POLL_FREQ_SECONDS", 2)
GNSS_HOLDOVER_SECONDS = os.environ.get("GNSS_HOLDOVER_SECONDS", 30)
GNSS_POLL_FREQ_SECONDS = os.environ.get("GNSS_POLL_FREQ_SECONDS", 2)
OS_CLOCK_HOLDOVER_SECONDS = os.environ.get("OS_CLOCK_HOLDOVER_SECONDS", 30)
OS_CLOCK_POLL_FREQ_SECONDS = os.environ.get("OS_CLOCK_POLL_FREQ_SECONDS", 2)
OVERALL_HOLDOVER_SECONDS = os.environ.get("OVERALL_HOLDOVER_SECONDS", 30)
OVERALL_POLL_FREQ_SECONDS = os.environ.get("OVERALL_POLL_FREQ_SECONDS", 2)
PHC2SYS_CONFIG = None
PHC2SYS_SERVICE_NAME = None
@ -111,20 +104,16 @@ context = {
'ptptracker_context': {
'device_simulated': PTP_DEVICE_SIMULATED,
'holdover_seconds': PTP_HOLDOVER_SECONDS,
'poll_freq_seconds': PTP_POLL_FREQ_SECONDS
'holdover_seconds': PTP_HOLDOVER_SECONDS
},
'gnsstracker_context': {
'holdover_seconds': GNSS_HOLDOVER_SECONDS,
'poll_freq_seconds': GNSS_POLL_FREQ_SECONDS
'holdover_seconds': GNSS_HOLDOVER_SECONDS
},
'osclocktracker_context': {
'holdover_seconds': OS_CLOCK_HOLDOVER_SECONDS,
'poll_freq_seconds': OS_CLOCK_POLL_FREQ_SECONDS
'holdover_seconds': OS_CLOCK_HOLDOVER_SECONDS
},
'overalltracker_context': {
'holdover_seconds': OVERALL_HOLDOVER_SECONDS,
'poll_freq_seconds': OVERALL_POLL_FREQ_SECONDS
'holdover_seconds': OVERALL_HOLDOVER_SECONDS
}
}
@ -140,8 +129,6 @@ sqlalchemy_conf_json = json.dumps(sqlalchemy_conf)
default_daemoncontrol = DaemonControl(sqlalchemy_conf_json, json.dumps(context))
default_daemoncontrol.refresh()
while True:
pass

View File

@ -111,20 +111,10 @@ spec:
value: "{{ .Values.ptptracking.device.simulated }}"
- name: PTP_HOLDOVER_SECONDS
value: "{{ .Values.ptptracking.device.holdover_seconds }}"
- name: PTP_POLL_FREQ_SECONDS
value: "{{ .Values.ptptracking.device.poll_freq_seconds }}"
- name: GNSS_HOLDOVER_SECONDS
value: "{{ .Values.ptptracking.gnss.holdover_seconds }}"
- name: GNSS_POLL_FREQ_SECONDS
value: "{{ .Values.ptptracking.gnss.poll_freq_seconds }}"
- name: OS_CLOCK_HOLDOVER_SECONDS
value: "{{ .Values.ptptracking.osclock.holdover_seconds }}"
- name: OS_CLOCK_POLL_FREQ_SECONDS
value: "{{ .Values.ptptracking.osclock.poll_freq_seconds }}"
- name: OVERALL_HOLDOVER_SECONDS
value: "{{ .Values.ptptracking.overall.holdover_seconds }}"
- name: OVERALL_POLL_FREQ_SECONDS
value: "{{ .Values.ptptracking.overall.poll_freq_seconds }}"
- name: NOTIFICATIONSERVICE_USER
value: "admin"
- name: NOTIFICATIONSERVICE_PASS
@ -149,6 +139,8 @@ spec:
value: "{{ .Values.ptptracking.ts2phcServiceName }}"
- name: LOGGING_LEVEL
value: "{{ .Values.ptptracking.log_level }}"
- name: CONTROL_TIMEOUT
value: "{{ .Values.ptptracking.control_timeout }}"
command: ["python3", "/mnt/ptptracking_start.py"]
securityContext:
privileged: true

View File

@ -75,16 +75,12 @@ ptptracking:
repository: starlingx/notificationservice-base
tag: stx.7.0-v1.0.5
pullPolicy: IfNotPresent
control_timeout: 2
device:
simulated: false
holdover_seconds: 15
poll_freq_seconds: 2
gnss:
holdover_seconds: 15
poll_freq_seconds: 2
osclock:
holdover_seconds: 15
poll_freq_seconds: 2
overall:
holdover_seconds: 15
poll_freq_seconds: 2