#!/bin/bash
################################################################################
#
# Copyright (c) 2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
################################################################################
#
# Description: Displays memory usage information to check for memory leaks.
#
# Behaviour  : The script takes in a list of commands whose processes you want
#              monitored and then finds their process IDs, and uses that to find
#              their current Resident Set Size (RSS) using ps, and looks up their
#              Proportional Set Size (PSS) in /proc/<pid>/smaps. Only the initial
#              process run by the system is monitored (parent PID 1); child
#              processes of these processes are ignored.
#
# This script is to be run on a controller node, and requires that it be run
# with sudo privileges or else it may not have access to /proc/<pid>/smaps for
# each of the desired processes.
#
# The script should be run with the following options:
#     sudo ./memchk -t [#] --C [commands]
# Where following -t the time in seconds with which you want this script to
# repeat should be indicated. The default if no time is specified is 3600
# seconds (1 hour).
# Following --C all arguments are viewed as commands you wish to be monitored.
# Each command should be separated by a space.
#     e.g. sudo ./memchk -t 1800 --C command1 command2 command3
#
# Error logs can be found in /tmp/NEWmemchk_err.log
# Standard output can be found in /tmp/NEWmemchk_out.log
#
################################################################################

readonly DEFAULT_PERIOD=3600              # Sampling period (s) used when -t is absent or invalid
# NOTE(review): fixed, predictable file names in /tmp written as root are a
# symlink-attack risk; kept as-is so existing log locations do not move.
readonly OUT_LOG=/tmp/NEWmemchk_out.log
readonly ERR_LOG=/tmp/NEWmemchk_err.log

# All per-process arrays below are indexed by PID.
changeP=()      # Accumulated change in PSS since the PID was first sampled
changeR=()      # Accumulated change in the RSS high-water mark since first sampled
rss=()          # RSS high-water mark (only raised when a sample exceeds it)
firstP=()       # PSS at first sample
lastP=()        # PSS at previous sample ("NA" until a second sample exists)
firstR=()       # RSS at first sample
lastR=()        # Value of rss[] before the current sample ("NA" on first sample)
firstTick=()    # Value of 'ticks' when the PID was first sampled
leaking=()      # PID -> command, for processes flagged in the current pass
leak=()         # "X" while the PID is flagged in the current pass
leakFlag=0
ticks=0         # Number of completed sampling passes
period=0
periodGiven=0   # 1 when -t appeared on the command line
s1=()           # Holds behaviour of most recent sample (does not change if present behaviour continues)
s2=()           # Holds type of the previously observed behaviour that is different from the present behaviour
commands=()
trend=()
baseline=()     # Sum of all RSS values for a given PID
count=()        # Number of times RSS values have been sampled for a given PID
increasing=()
decreasing=()
stable=()
pattern=()      # Stores a string indicating the present pattern

# Signal handler: report the signal on stderr and leave the sampling loop.
function trapCalled {
    echo $'\nReceived trap signal' >&2
    exit
}

trap trapCalled SIGHUP SIGINT SIGTERM

# Prints usage information to stdout and exits with status 0.
function helpMessage {
    cat <<EOF
--------------------------------------------------------------------------------------
Memory Leak And Information Tracking Tool

Usage:

sudo ./memchk.sh [-t seconds] --C [commands]

    -t              ... time in seconds with which to run this script. No time
                        specified will result in the default of 3600s (1 hour).
    --C             ... space delimited list of commands to monitor. This option
                        must be the last one entered.
    --help | -h     ... this info

Note: This script must be run using sudo. If it is not, access to the memory information
      of a given process may not be allowed by the system. PSS info is obtained from
      /proc/<pid>/smaps

      Error logs can be found in ${ERR_LOG}
      Standard output can be found in ${OUT_LOG}


Examples:

sudo memchk -t 60 --C mtcClient mtcAgent        ... Check PSS and RSS values of the processes belonging to mtcClient
                                                    and mtcAgent every 60 seconds (1 minute)
sudo memchk -t 3600 --C pmond rmond hwmond      ... Check PSS and RSS values of pmond, rmond and hwmond every 3600s (1h)
sudo memchk --C pmond rmond hwmond              ... Check PSS and RSS values of commands using default period of 3600s (1h)
--------------------------------------------------------------------------------------
EOF
    exit 0
}

# Prints information on every suspected leaking process (all keys of the
# 'leaking' array) to stderr, framed by two rows of asterisks.
function memLeak {
    local proc
    printf "\n" >&2
    printf '%0.1s' "*"{1..150} >&2
    # Iterates over all keys (PIDs) in the array.
    for proc in "${!leaking[@]}"; do
        printf "\nPossible mem leak in: %s PID: %s Current RSS: %s Orig RSS: %s Current PSS: %s Orig PSS: %s\n" \
            "${leaking[proc]}" "$proc" "${rss[proc]}" "${firstR[proc]}" "${lastP[proc]}" "${firstP[proc]}" >&2
    done
    printf '%0.1s' "*"{1..150} >&2
    printf "\n" >&2
}

# Sums the PSS (kB) of every mapping in /proc/<pid>/smaps whose header line
# matches the command name.
#   $1 - command name (used as an awk regular expression, as before)
#   $2 - PID
# Prints the sum, or an empty line when no mapping matched or the file could
# not be read; the caller treats an empty result as "skip this process".
# The Pss field is located by name rather than by its offset from the mapping
# header, because the order of the fields in smaps is not the same on every
# kernel version.
function readPss {
    awk -v comm="$1" '
        $1 ~ /^[0-9a-fA-F]+-[0-9a-fA-F]+$/ { wanted = ($0 ~ comm); next }  # mapping header line
        wanted && $1 == "Pss:"             { sum += $2 }
        END                                { print sum }
    ' "/proc/$2/smaps"
}

# Converts an accumulated change (kB) into a rate in kB/h.
#   $1 - accumulated change in kB
#   $2 - number of sampling periods the change was accumulated over
# Prints the rate with three decimals; prints 0.000 when no period has elapsed
# (e.g. the same PID matched two of the monitored commands in one pass).
function ratePerHour {
    if (( $2 <= 0 )); then
        printf '0.000'
        return
    fi
    awk -v ch="$1" -v n="$2" -v t="$period" 'BEGIN {printf "%.3f", (ch/(n*t))*3600}'
}

# Clears every trend-tracking value of a PID so a new trend has to be
# established from scratch.
#   $1 - PID
function resetTrend {
    local pid=$1
    s1[pid]=""
    s2[pid]=""
    trend[pid]=0
    increasing[pid]=0
    decreasing[pid]=0
    stable[pid]=0
}

# Updates the trend state of a PID with a new RSS sample.
#   $1 - PID
#   $2 - current RSS sample (kB)
#
# The below logic seeks to determine gradual sustained patterns of RSS usage
# over time to determine if the memory usage is gradually increasing throughout
# the lifespan of the process (possible memory leak) or not. Non-gradual usage
# changes can be due to dynamic reallocation and such 'erratic' behaviour is not
# indicative of any overall trends.
#
# NOTE: If you would like to do this properly and determine whether or not such
# patterns exist by evaluating the RSS usage across the entire lifespan of the
# script, consider the following method:
#   Take the (RSS, time) value-pairs and make an augmented matrix and then use
#   Gaussian elimination to solve the matrix and use the remaining values as the
#   coefficients to create a least-squares parabola, which you can then find the
#   first derivative of to determine the rate of change -- which will indicate
#   increasing or decreasing behaviour at your current point relative to nearby
#   datapoints and the behaviour of the rest of your graph (function).
#   To do this consider using the python package numpy for matrix math and for
#   derivatives. The issue with this method is finding a way to pipe data to a
#   python script and have it return to the bash script. Because piping is
#   usually done asynchronously, you may find that there are issues with having
#   the values returned and printed properly in the bash script. This math can
#   also be done in bash, but will take considerable effort.
#
# The baseline average is the RSS values for a PID from each sample added
# together and divided by the number of samples that have taken place. Each
# sample is classed as increasing (above the average), stable (equal to it) or
# decreasing (below it).
#
# Established trend (trend[pid] >= 3):
#   The existence of a pattern or 'trend' is gauged by whether the same
#   behaviour has occurred 3 times in a row, and that it continues to occur
#   without the opposite behaviour happening. For example, if a trend of
#   increasing RSS has been observed, and on the next sample the RSS value is
#   found to be below the baseline average, this indicates that a decrease has
#   occurred and thus the trend has been broken, and a new trend must be
#   established.
#   If the current trend is either increasing or decreasing, the RSS value can
#   be equal to the baseline average for two consecutive samples without the
#   trend being broken. However, if the RSS value is equal to the baseline
#   average for 3 consecutive samples, this indicates a new trend of the RSS
#   value reaching a stable value, and the current trend of increasing or
#   decreasing is broken; that is to say, only a trend of 'stable' will be
#   permitted; if an increase or decrease is observed, all temporary and trend
#   values will be reset, and a new trend must be established from scratch.
#   A trend cannot change immediately from increasing to decreasing. This is
#   done to avoid representing erratic behaviour as a long-term pattern. An
#   increasing or decreasing trend must change to 'none' -- no trend observed --
#   before the opposite trend can be declared.
#
# No trend yet (trend[pid] < 3):
#   Waits for the RSS value of a PID to increase, decrease, or remain stable
#   relative to the baseline average three times before it will declare that a
#   trend exists. Seeing both an increase and a decrease restarts the count.
#   This is to avoid viewing erratic increases and decreases in RSS as gradual
#   increases or decreases in the process' RSS usage.
function updateTrend {
    local pid=$1 sample=$2 avg

    (( count[pid] += 1 ))
    (( baseline[pid] += sample ))
    avg=$(awk -v b="${baseline[pid]}" -v c="${count[pid]}" 'BEGIN {printf "%.0f", (b/c)}')

    # The current sample is compared against the average. rss[pid] cannot be
    # used here: it is a high-water mark and therefore never below the average,
    # which would make a decreasing trend impossible to observe.
    if (( trend[pid] >= 3 )); then
        if (( sample > avg )) \
            && [[ ${s1[pid]} == "increasing" || ${s1[pid]} != "decreasing" && ${s2[pid]} != "decreasing" ]] \
            && (( stable[pid] < 3 )); then
            if [[ ${s1[pid]} != "increasing" ]]; then
                s2[pid]="${s1[pid]}"
                s1[pid]="increasing"
            fi
        elif (( sample == avg )); then
            if [[ ${s1[pid]} != "stable" ]]; then
                stable[pid]=0
                s2[pid]="${s1[pid]}"
                s1[pid]="stable"
            fi
            # One increment per stable sample, so that 3 means exactly
            # "three consecutive stable samples".
            (( stable[pid] += 1 ))
        elif (( sample < avg )) \
            && [[ ${s1[pid]} == "decreasing" || ${s1[pid]} != "increasing" && ${s2[pid]} != "increasing" ]] \
            && (( stable[pid] < 3 )); then
            if [[ ${s1[pid]} != "decreasing" ]]; then
                s2[pid]="${s1[pid]}"
                s1[pid]="decreasing"
            fi
        else
            resetTrend "$pid"
        fi
    else
        (( trend[pid] += 1 ))
        if (( sample > avg )); then
            (( increasing[pid] += 1 ))
            s1[pid]="increasing"
        elif (( sample == avg )); then
            (( stable[pid] += 1 ))
            s1[pid]="stable"
        else
            (( decreasing[pid] += 1 ))
            s1[pid]="decreasing"
        fi
        # Opposite behaviours seen while establishing a trend: start counting again.
        if (( increasing[pid] > 0 && decreasing[pid] > 0 )); then
            increasing[pid]=0
            decreasing[pid]=0
            stable[pid]=0
            trend[pid]=0
        fi
    fi
}

# Help must be reachable without root, because the root warning below tells the
# user to run -h. Everything after --C is a command name, not an option.
for arg in "$@"; do
    case "$arg" in
        --C) break ;;
        -h|--help) helpMessage ;;
    esac
done

if [ "$UID" -ne 0 ]; then
    echo $'\nWarning: Memchk must be run as \'root\' user to access PSS memory information'
    echo $'Use the -h option for help\n'
    exit 1
fi

if [ $# -eq 0 ]; then
    echo $'\nNo commands specified\nPlease try again and enter a command whose memory you would like to monitor'
    echo $'Use the -h option for help\n'
    exit 1
fi

# From here on stdout and stderr are shown on the terminal and copied to the logs.
exec > >(tee "$OUT_LOG") 2> >(tee "$ERR_LOG" >&2)

# Cycles through commandline arguments to make sure valid input was received and
# to assign values correctly
while (( $# > 0 )); do
    key="$1"
    case "$key" in
        # To make this more user-friendly, instead of having the user enter the period in seconds, consider using
        # 'shopt -s extglob' and 'if [[ $2 = +([0-9])m ]];' to check if the user entered 15m as a period to indicate
        # 15 minutes etc. Modify the regex for seconds and hours as well, then multiply value as necessary to convert
        # into seconds for script
        -t)
            periodGiven=1
            period="${2:-}"
            # Only consume the value when there is one; it is validated below.
            if (( $# > 1 )); then shift; fi
            ;;
        --C)
            shift
            if (( $# == 0 )); then
                printf "Error: No commands specified.\n"
                exit 1
            fi
            # Every remaining argument is a command to monitor.
            commands+=("$@")
            set --
            break
            ;;
        -h|--help)
            helpMessage
            ;;
        *)
            printf "\nUnknown argument passed: %s\n" "$key"
            printf "Use the -h option for help\n"
            exit 1
            ;;
    esac
    shift
done

if (( ${#commands[@]} == 0 )); then
    printf "Error: No commands specified.\n"
    exit 1
fi

# Makes sure period is a positive whole number of seconds. Anything else
# (missing, non-numeric, zero) falls back to the default; the message is only
# shown when the user actually supplied a period.
if [[ $period =~ ^[0-9]+$ ]] && (( 10#$period > 0 )); then
    period=$(( 10#$period ))    # base 10 forced so that e.g. "0900" is not read as octal
else
    if (( periodGiven )); then
        printf "You have entered an invalid period. Period has been set to 3600 seconds.\n"
    fi
    period=$DEFAULT_PERIOD
fi

# The rate of kB/h has been hard-coded into the table. For periods shorter than
# 1 hour the displayed rate is the observed change scaled up to a full hour.
# There are various accuracy issues in modifying the code to display data to
# match your chosen period. Consider this and modify accordingly.
if (( period < 3600 )); then
    printf "\nWARNING: You have chosen a period that is less than 1 hour. The rate of change in the table is displayed in kB/h, keep this in mind when reviewing results.\n"
fi

while true; do
    # Prints header for columns
    printf "\n%15s | %8s | Leak | %10s | %13s | %8s | %8s | %8s | %13s | %8s | %8s | %8s | Period: %-${#period}ss\n" \
        "Cmd" "PID" "Trend" "Change in RSS" "RSS" "Orig RSS" "Prev RSS" "Change in PSS" "PSS" "Orig PSS" "Prev PSS" "$period"
    padding=$(printf '%0.1s' "-"{1..180})
    # Prints line of hyphens of variable size depending on the number of characters in period.
    printf '%*.*s' 0 "$((156 + ${#period}))" "$padding"

    # Cycles through each of the originally entered commands. This list does not change.
    for cmd in "${commands[@]}"; do
        # Finds all the PIDs associated with each command (commands may have more than one instance).
        # The number of processes may change on each loop. Keep this in mind if expanding or reusing this script.
        mapfile -t procs < <(pgrep -- "$cmd")

        for pid in "${procs[@]}"; do
            # Child processes may exist ephemerally, as a result they may be added to our list of PIDs, but no
            # longer exist when we try to read their associated files in /proc/. If the parent process ID is not 1
            # then the process in question is a child process and we do not care about its memory usage (for the
            # purposes of this specific script).
            if [[ -f /proc/$pid/status ]]; then
                ppid=$(awk '$1 == "PPid:" {print $2}' "/proc/$pid/status")
                if [[ -n $ppid && $ppid -ne 1 ]]; then continue; fi
            fi

            pss=$(readPss "$cmd" "$pid")
            # Obtains the RSS value of the indicated process; ps pads the value with blanks.
            rssCurrent=$(ps -p "$pid" --no-header -o rss)
            rssCurrent=${rssCurrent//[[:space:]]/}

            # This checks that neither rssCurrent nor pss have empty values due to a child process being generated
            # and then killed off before its values could be read. Root occasionally generates a child process of
            # one of the monitored commands so the above if-statement doesn't exclude it because the PPID is 1.
            if [[ -z $rssCurrent || -z $pss ]]; then continue; fi

            if [[ -z ${firstR[pid]:-} ]]; then
                # First time this PID is sampled (on any pass, so processes started or restarted after the
                # script began are handled too). NA is set instead of 0 because using numbers could lead to
                # false or inaccurate information.
                firstP[pid]="$pss"
                lastP[pid]="NA"
                rss[pid]="$rssCurrent"
                firstR[pid]="$rssCurrent"
                lastR[pid]="NA"
                changeP[pid]=0
                changeR[pid]=0
                firstTick[pid]=$ticks
                leak[pid]=""
                count[pid]=0
                baseline[pid]=0
                resetTrend "$pid"
            else
                lastR[pid]="${rss[pid]}"
            fi

            # In the event of a memory leak (the RSS value exceeding the highest value seen so far), an X is
            # placed in the 'Leak' column of the printed table. The PID of the process is also added to an array
            # to be reported by the memLeak function once all of the commands' processes have been checked.
            if (( rss[pid] < rssCurrent )); then
                lastR[pid]="${rss[pid]}"
                rss[pid]="$rssCurrent"
                leaking[pid]="$cmd"
                leak[pid]="X"
                leakFlag=1
            fi

            # Calculates the changes in PSS and RSS usage over time. If this is the first sample of the PID and
            # there is no previous value with which to compare against, delta is set to 0, where delta is the
            # change over time.
            elapsed=$(( ticks - firstTick[pid] ))
            if [[ ${lastP[pid]} == "NA" ]]; then
                changeP[pid]=0
                deltaP=0.000
            else
                changeP[pid]=$(( changeP[pid] + pss - lastP[pid] ))
                deltaP=$(ratePerHour "${changeP[pid]}" "$elapsed")
            fi
            if [[ ${lastR[pid]} == "NA" ]]; then
                changeR[pid]=0
                deltaR=0.000
            else
                changeR[pid]=$(( changeR[pid] + rss[pid] - lastR[pid] ))
                deltaR=$(ratePerHour "${changeR[pid]}" "$elapsed")
            fi

            updateTrend "$pid" "$rssCurrent"

            # Sets the trend variable for printing if a trend exists
            if (( trend[pid] >= 3 )); then
                pattern[pid]="${s1[pid]}"
            else
                pattern[pid]="none"
            fi

            # The Leak cell is padded to the width of its 4-character header.
            printf "\n%15s | %8s | %2s   | %10s | %8s kB/h | %8s | %8s | %8s | %8s kB/h | %8s | %8s | %8s |" \
                "$cmd" "$pid" "${leak[pid]}" "${pattern[pid]}" "$deltaR" "${rss[pid]}" "${firstR[pid]}" \
                "${lastR[pid]}" "$deltaP" "$pss" "${firstP[pid]}" "${lastP[pid]}"

            lastP[pid]="$pss"
            leak[pid]=""    # Resets the indicator in the 'Leak' column
        done
    done

    if (( leakFlag == 1 )); then memLeak; fi    # Reports suspected leaks if any were flagged
    leaking=()                                  # Clear the array holding PIDs of processes with potential leaks
    leakFlag=0
    ticks=$(( ticks + 1 ))                      # Completed-pass counter used in calculating delta
    echo $'\n'
    sleep "$period"
done