utils/middleware/util/recipes-common/monitor-tools/scripts/schedtop

1313 lines
40 KiB
Perl
Executable File

#!/usr/bin/perl
########################################################################
#
# Copyright (c) 2015-2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
########################################################################
#
# Description:
# This displays occupancy and scheduling information per sample period.
# Output includes total occupancy, per-core occupancy, loadavg, per-task cpu,
# per-task scheduling, per-task io-wait.
#
# Usage: schedtop OPTIONS
# [--delay=<seconds>] [--repeat=<num>] [--period=<seconds>]
# [--reset-hwm] [--idle] [--sort=<cpu|io>]
# [--help]
use strict;
use warnings;
use Data::Dumper;
use POSIX qw(uname strftime);
use Time::HiRes qw(clock_gettime usleep CLOCK_MONOTONIC CLOCK_REALTIME);
use Benchmark ':hireswallclock';
use Carp qw(croak carp);
use Math::BigInt;
# Define toolname
our $TOOLNAME = "schedtop";
our $VERSION = "0.1";
# Constants
use constant SI_k => 1.0E3;
use constant SI_M => 1.0E6;
use constant SI_G => 1.0E9;
use constant Ki => 1024.0;
use constant Mi => 1024.0*1024.0;
use constant Gi => 1024.0*1024.0*1024.0;
# Globals
our %opt_V = ();
our %opt_P = ();
our %percpu_0 = ();
our %percpu_1 = ();
our %task_0 = ();
our %task_1 = ();
our %tids_0 = ();
our %tids_1 = ();
our %D_task = ();
our %D_percpu = ();
our %loadavg = ();
our $tm_0 = ();
our $tm_1 = ();
our $tr_0 = ();
our $tr_1 = ();
our $tm_elapsed = ();
our $tr_elapsed = ();
our $tm_final = ();
our $uptime = ();
our $num_cpus = 1;
our $affinity_mask = Math::BigInt->new('0');
our $w_aff = 10;
our $num_tasks = 0;
our $num_blk = 0;
our $is_schedstat = 1;
our $USER_HZ = 100; # no easy way to get this
our $CLOCK_NS = SI_G / $USER_HZ;
our $print_host = 1;
# Print options
our ($P_none, $P_lite, $P_brief, $P_full) = (0, 1, 2, 3);
our ($P_ps, $P_cpu, $P_del, $P_io, $P_id, $P_cmd) = (0, 1, 2, 3, 4, 5);
our @P_list = ($::P_ps, $::P_cpu, $::P_del, $::P_io, $::P_id, $::P_cmd);
# Argument list parameters
our ($arg_debug,
$arg_delay,
$arg_repeat,
$arg_period,
$arg_reset_hwm,
$arg_idle,
$arg_sort,
$arg_print) = ();
#-------------------------------------------------------------------------------
# MAIN Program
#-------------------------------------------------------------------------------
my $ONE_BILLION = 1.0E9;
my $MIN_DELAY = 0.001;
my $MAX_DELAY = 0.001;
# benchmark variables
my ($bd, $b0, $b1);
my @policies = ('OT', 'FF', 'RR', 'BA', 'ID', 'UN', 'UN');
my @delta_list = (
'nr_switches',
'nr_migrations',
'exec_runtime',
'wait_sum',
'wait_count',
'iowait_sum',
'iowait_count',
'syscr',
'syscw',
'read_bytes',
'write_bytes',
'cancelled_write_bytes',
);
my @state_list = (
'exec_max', 'wait_max',
'pid', 'ppid', 'state', 'comm', 'cmdline', 'wchan', 'affinity',
'VmSize', 'VmRSS', 'start_time',
'nice', 'policy', 'priority', 'rt_priority', 'task_cpu'
);
# Autoflush output
select(STDERR);
$| = 1;
select(STDOUT); # default
$| = 1;
# Parse input arguments and print tool usage if necessary
&parse_schedtop_args(
\$::arg_debug,
\$::arg_delay,
\$::arg_repeat,
\$::arg_period,
\$::arg_reset_hwm,
\$::arg_idle,
\$::arg_sort,
\$::arg_print,
);
# Set default print options
if ($::arg_print eq 'full') {
for my $P (@::P_list) { $::opt_P{$P} = $::P_full; }
} elsif ($::arg_print eq 'brief') {
for my $P (@::P_list) { $::opt_P{$P} = $::P_brief; }
} else {
for my $P (@::P_list) { $::opt_P{$P} = $::P_none; }
}
# Disable some options if data not present
$::opt_V{'sched'} = &is_sched();
$::opt_V{'io'} = &is_io();
if ($::opt_V{'sched'} == 0) {
$::opt_P{$::P_cpu} = $::P_none;
$::opt_P{$::P_del} = $::P_none;
$::opt_P{$::P_io} = $::P_none;
undef $::arg_reset_hwm;
}
if ($::opt_V{'io'} == 0) {
if ($::opt_V{'sched'} == 0) {
$::opt_P{$::P_io} = $::P_none;
$::arg_sort = 'cpu';
} else {
if ($::opt_P{$::P_io} != $::P_none) {
$::opt_P{$::P_io} = $::P_lite;
}
}
}
# Check for root user
if ($>) {
warn "$::TOOLNAME: requires root/sudo.\n";
exit 1;
}
# Print out some debugging information
if (defined $::arg_debug) {
$Data::Dumper::Indent = 1;
}
# Check for schedstat support; fallback to stats
$is_schedstat = -e '/proc/schedstat' ? 1 : 0;
# Print out selected options
printf "selected options: ".
"delay = %.3fs, repeat = %d, idle=%s, hwm=%s, sort=%s, print=%s\n",
$::arg_delay, $::arg_repeat,
(defined $::arg_idle ? 'idle_tasks' : 'no_idle_tasks'),
(defined $::arg_reset_hwm ? 'reset-hwm' : 'unchanged'),
$::arg_sort, $::arg_print;
# Capture timestamp
$b0 = new Benchmark;
# Get number of logical cpus
&get_num_logical_cpus(\$::num_cpus);
$::affinity_mask = Math::BigInt->new('0');
for (my $i=0; $i < $::num_cpus; $i++) {
my $y = Math::BigInt->new('1');
$y->blsft($i);
$::affinity_mask->bior($y);
}
$w_aff = &max(length 'AFF', length $::affinity_mask->as_hex());
# Reset scheduling hi-water marks
if (defined $::arg_reset_hwm) {
&get_tids(\%::tids_1);
&reset_sched_hwm(\%::tids_1);
sleep(0.001);
}
# Get current hires epoc timestamp
$::tm_1 = clock_gettime(CLOCK_MONOTONIC);
$::tr_1 = clock_gettime(CLOCK_REALTIME);
$::tm_final = $::tm_1 + $::arg_delay*$::arg_repeat;
# Set initial delay
$::tm_elapsed = $::arg_delay;
$MAX_DELAY = $::arg_delay + $MIN_DELAY;
# Get overall per-cpu stats
if ($is_schedstat) {
&read_schedstat(\%::percpu_1);
} else {
&read_stat(\%::percpu_1);
}
# Get list of pids and tids
&get_tids(\%::tids_1);
# Get current scheduling and io info for all tids
&read_sched(\%::tids_1, \%::task_1);
# determine column sort order
my ($s_key1, $s_key2, $s_key3) = ();
if ($::arg_sort eq 'cpu') {
($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', 'pid');
} elsif ($::arg_sort eq 'io') {
($s_key1, $s_key2, $s_key3) = ('io', 'ios', 'exec_runtime');
} else {
($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', , 'pid');
}
# Main loop
REPEAT_LOOP: for (my $repeat=1; $repeat <= $::arg_repeat; $repeat++) {
# copy all state variables
$::tm_0 = (); $::tr_0 = (); %::percpu_0 = (); %::tids_0 = (); %::task_0 = ();
$::tm_0 = $::tm_1; $::tr_0 = $::tr_1;
foreach my $cpu (keys %::percpu_1) { $::percpu_0{$cpu} = $::percpu_1{$cpu}; }
foreach my $tid (keys %::tids_1) { $::tids_0{$tid} = $::tids_1{$tid}; }
foreach my $tid (keys %::task_1) {
foreach my $var (keys $::task_1{$tid}) {
$::task_0{$tid}{$var} = $::task_1{$tid}{$var};
}
}
# estimate sleep delay to achieve desired interarrival by subtracting out
# the measured cpu runtime of the tool.
my $delay = $::arg_delay;
if (defined $::D_task{$$}{'exec_runtime'}) {
$delay -= ($::D_task{$$}{'exec_runtime'}/SI_k);
}
$delay = $MIN_DELAY if ($delay < $MIN_DELAY);
$delay = $MAX_DELAY if ($delay > $MAX_DELAY);
usleep( SI_M*$delay );
# Collect current state
$::tm_1 = (); $::tr_1 = (); %::percpu_1 = (); %::tids_1 = (); %::task_1 = ();
# Get current hires epoc timestamp
$::tm_1 = clock_gettime(CLOCK_MONOTONIC);
$::tr_1 = clock_gettime(CLOCK_REALTIME);
# Get overall per-cpu stats
if ($is_schedstat) {
&read_schedstat(\%::percpu_1);
} else {
&read_stat(\%::percpu_1);
}
# Get list of pids and tids
&get_tids(\%::tids_1);
# Get current scheduling and io info for all tids
&read_sched(\%::tids_1, \%::task_1);
# Get current uptime
&get_uptime(\$::uptime);
# Get current loadavg
&get_loadavg(\%::loadavg, \$::runq, \$::num_tasks);
# Get current processes blocked
&get_blocked(\$::num_blk);
# Delta calculation
%::D_task = (); %::D_percpu = ();
$::tm_elapsed = $::tm_1 - $::tm_0;
$::tr_elapsed = $::tr_1 - $::tr_0;
foreach my $tid (keys %::task_1) {
next if ( !(exists $::task_0{$tid}) );
# simple delta
foreach my $var (@delta_list) {
$::D_task{$tid}{$var} = ($::task_1{$tid}{$var} - $::task_0{$tid}{$var});
}
# state information
foreach my $state (@state_list) {
$::D_task{$tid}{$state} = $::task_1{$tid}{$state};
}
# derived calculations
my $exec_runtime = $::D_task{$tid}{'exec_runtime'};
my $nr_switches = $::D_task{$tid}{'nr_switches'};
my $iowait_sum = $::D_task{$tid}{'iowait_sum'};
if ($nr_switches > 0.0) {
$::D_task{$tid}{'tlen'} = $exec_runtime / $nr_switches;
} else {
$::D_task{$tid}{'tlen'} = 0.0;
}
if ($::tm_elapsed > 0.0) {
$::D_task{$tid}{'occ'} = 100.0*$exec_runtime/1.0E3/$::tm_elapsed;
$::D_task{$tid}{'iowait'} = 100.0*$iowait_sum/1.0E3/$::tm_elapsed;
} else {
$::D_task{$tid}{'occ'} = 0.0;
$::D_task{$tid}{'iowait'} = 0.0;
}
$::D_task{$tid}{'io'} = $::D_task{$tid}{'read_bytes'}
+ $::D_task{$tid}{'write_bytes'}
+ $::D_task{$tid}{'cancelled_write_bytes'};
$::D_task{$tid}{'ios'} = $::D_task{$tid}{'syscw'}
+ $::D_task{$tid}{'iowait_count'};
}
foreach my $cpu (keys %::percpu_1) {
$::D_percpu{$cpu}{'runtime'} = ($::percpu_1{$cpu} - $::percpu_0{$cpu})/1.0E6;
if ($::tm_elapsed > 0.0) {
$::D_percpu{$cpu}{'occ'} = 100.0*$D_percpu{$cpu}{'runtime'}/1.0E3/$::tm_elapsed;
} else {
$::D_percpu{$cpu}{'occ'} = 0.0;
}
}
my $occ_total = 0.0;
for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
$occ_total += $::D_percpu{$cpu}{'occ'};
}
# Print summary
&schedtop_header(
\$::tr_1,
\$::tm_elapsed,
\$::tr_elapsed,
\$::uptime,
\$::loadavg,
\$::runq,
\$::num_blk,
\$::num_tasks,
\$::print_host
);
printf "%-5s %7s ", 'core:', 'total';
for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
printf "%5s ", $cpu;
}
print "\n";
printf "%-5s %7.1f ", 'occ:', $occ_total;
for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
printf "%5.1f ", $::D_percpu{$cpu}{'occ'};
}
print "\n";
print "\n";
# Build up output line by specific area
my $L = ();
$L = '';
$L .= sprintf "%6s %6s %6s ", "TID", "PID", "PPID";
if ($::opt_P{$::P_ps} != $::P_none) {
$L .= sprintf "%1s %2s %*s %2s %3s %4s ",
"S", "P", $w_aff, "AFF", "PO", "NI", "PR";
}
if ($::opt_P{$::P_cpu} == $::P_brief) {
$L .= sprintf "%6s %7s ", "ctxt", "occ";
} elsif ($::opt_P{$::P_cpu} == $::P_full) {
$L .= sprintf "%6s %6s %7s ", "ctxt", "migr", "occ";
}
if ($::opt_P{$::P_del} != $::P_none) {
$L .= sprintf "%7s %7s %7s %7s ", "tlen", "tmax", "delay", "dmax";
}
if ($::opt_P{$::P_io} == $::P_lite) {
$L .= sprintf "%7s %6s ", "iowt", "iocnt";
} elsif ($::opt_P{$::P_io} == $::P_brief) {
$L .= sprintf "%7s %8s %8s ", "iowt", "read", "write";
} elsif ($::opt_P{$::P_io} == $::P_full) {
$L .= sprintf "%7s %8s %8s %8s %8s %8s ",
"iowt", "read", "write", "wcncl", "rsysc", "wsysc";
}
if ($::opt_P{$::P_id} != $::P_none) {
$L .= sprintf "%-22s ", "wchan";
}
if ($::opt_P{$::P_cmd} == $::P_brief) {
$L .= sprintf "%s", "cmdline";
} elsif ($::opt_P{$::P_cmd} == $::P_full) {
$L .= sprintf "%-15s %s", "comm", "cmdline";
}
print $L, "\n";
foreach my $tid (sort {($D_task{$b}{$s_key1} <=> $D_task{$a}{$s_key1}) or
($D_task{$b}{$s_key2} <=> $D_task{$a}{$s_key2}) or
($D_task{$b}{$s_key3} <=> $D_task{$a}{$s_key3})} keys %D_task) {
my $exec_runtime = $::D_task{$tid}{'exec_runtime'};
my $nr_switches = $::D_task{$tid}{'nr_switches'};
my $aff = $::D_task{$tid}{'affinity'}->as_hex();
# skip printing if there is no actual delta
if ( !(defined $::arg_idle) ) {
next if (($exec_runtime == 0.0) && ($nr_switches == 0));
}
# Build up output line by specific area
$L = '';
$L .= sprintf "%6d %6d %6d ",
$tid, $::D_task{$tid}{'pid'}, $::D_task{$tid}{'ppid'};
if ($::opt_P{$::P_ps} != $::P_none) {
$L .= sprintf "%1s %2d %*s %2s %3d %4d ",
$::D_task{$tid}{'state'}, $::D_task{$tid}{'task_cpu'}, $w_aff, $aff,
$policies[$::D_task{$tid}{'policy'}], $::D_task{$tid}{'nice'},
$::D_task{$tid}{'priority'};
}
if ($::opt_P{$::P_cpu} == $::P_brief) {
$L .= sprintf "%6d %7.2f ",
$::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'occ'};
} elsif ($::opt_P{$::P_cpu} == $::P_full) {
$L .= sprintf "%6d %6d %7.2f ",
$::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'nr_migrations'},
$::D_task{$tid}{'occ'},
}
if ($::opt_P{$::P_del} != $::P_none) {
$L .= sprintf "%7.3f %7.1f %7.3f %7.1f ",
$::D_task{$tid}{'tlen'}, $::D_task{$tid}{'exec_max'},
$::D_task{$tid}{'wait_sum'}, $::D_task{$tid}{'wait_max'};
}
if ($::opt_P{$::P_io} == $::P_lite) {
$L .= sprintf "%7.2f %6d ",
$::D_task{$tid}{'iowait'}, $::D_task{$tid}{'iowait_count'};
} elsif ($::opt_P{$::P_io} == $::P_brief) {
$L .= sprintf "%7.2f %8s %8s ",
$::D_task{$tid}{'iowait'},
&format_SI($::D_task{$tid}{'read_bytes'}),
&format_SI($::D_task{$tid}{'write_bytes'});
} elsif ($::opt_P{$::P_io} == $::P_full) {
$L .= sprintf "%7.2f %8s %8s %8s %8s %8s ",
$::D_task{$tid}{'iowait'},
&format_SI($::D_task{$tid}{'read_bytes'}),
&format_SI($::D_task{$tid}{'write_bytes'}),
&format_SI($::D_task{$tid}{'cancelled_write_bytes'}),
&format_SI($::D_task{$tid}{'syscr'}),
&format_SI($::D_task{$tid}{'syscw'});
}
if ($::opt_P{$::P_id} != $::P_none) {
$L .= sprintf "%-22s ", substr($::D_task{$tid}{'wchan'}, 0, 22);
}
if ($::opt_P{$::P_cmd} == $::P_brief) {
$L .= sprintf "%s", $::D_task{$tid}{'cmdline'};
} elsif ($::opt_P{$::P_cmd} == $::P_full) {
$L .= sprintf "%-15s %s",
$::D_task{$tid}{'comm'}, $::D_task{$tid}{'cmdline'};
}
print $L, "\n";
}
print "\n";
# exit repeat loop if we have exceeded overall time
last if ($::tm_1 > $::tm_final);
} # REPEAT LOOP
# Print that tool has finished
print "done\n";
# Capture timestamp and report delta
$b1 = new Benchmark; $bd = Benchmark::timediff($b1, $b0);
printf "processing time: %s\n", timestr($bd);
exit 0;
#-------------------------------------------------------------------------------
# Convert a number to SI unit xxx.yyyG
sub format_SI
{
(my $value) = @_;
if ($value >= SI_G) {
return sprintf("%.3fG", $value/SI_G);
} elsif ($value >= SI_M) {
return sprintf("%.3fM", $value/SI_M);
} elsif ($value >= SI_k) {
return sprintf("%.3fk", $value/SI_k);
} else {
return sprintf("%.0f", $value);
}
}
# Convert to IEC binary unit xxx.yyyGi
# Since underlying memory units are in pages, don't need decimals for Ki
sub format_IEC
{
(my $value) = @_;
if ($value >= Gi) {
return sprintf("%.3fGi", $value/Gi);
} elsif ($value >= Mi) {
return sprintf("%.3fMi", $value/Mi);
} elsif ($value >= Ki) {
return sprintf("%.0fKi", $value/Ki);
} else {
return sprintf("%.0f", $value);
}
}
# Determine whether scheduler stats are available
sub is_sched
{
return (-e '/proc/1/task/1/sched') ? 1 : 0;
}
# Determine whether IO stats are available
sub is_io
{
return (-e '/proc/1/task/1/io') ? 1 : 0;
}
# Determine max of array
sub max {
my ($max, @vars) = @_;
for (@vars) {
$max = $_ if $_ > $max;
}
return $max;
}
# Determine tids and pid mapping by walking /proc/<pid>/task/<tid>
sub get_tids
{
(local *::tids) = @_;
my (@pids_, @tids_) = ();
my ($dh, $pid, $tid);
# get pid list
my $dir = '/proc';
opendir($dh, $dir) || croak "Cannot open directory: $dir ($!)";
@pids_ = grep { /^\d+$/ && -d "$dir/$_" } readdir($dh);
closedir $dh;
# get tid list
foreach $pid (@pids_) {
$dir = '/proc/' . $pid . '/task';
opendir(my $dh, $dir) || next;
@tids_ = grep { /^\d+$/ && -d "$dir/$_" } readdir($dh);
closedir $dh;
foreach $tid (@tids_) { $::tids{$tid} = $pid; }
}
}
# Reset scheduling hi-water-marks
sub reset_sched_hwm
{
(local *::tids) = @_;
# reset scheduling hi-water-marks by writing '0' to each task
foreach my $tid (keys %::tids) {
my $file = '/proc/' . $tid . '/sched';
open(my $fh, "> $file") || next;
print $fh "0\n";
close($fh);
}
}
# Parse cpu and scheduling info for each tid
# - ignore the specific tid if there is incomplete data,
# (i.e., cannot obtain info because task has died,
# eg. missing ./stat, ./status, ./cmdline, ./wchan)
#
sub read_sched
{
(local *::tids, local *::task) = @_;
%::task = ();
foreach my $tid (keys %::tids) {
my ($fh, $file, $pid, $comm, $cmdline, $wchan, $id) = ();
my ($tpid, $tcomm, $state, $ppid, $pgrp, $sid,
$tty_nr, $tty_pgrp, $flags,
$min_flt, $cmin_flt, $maj_flt, $cmaj_flt,
$utime, $stime, $cutime, $cstime,
$priority, $nice, $num_threads,
$it_real_value, $start_time,
$vsize, $rss, $rsslim,
$start_code, $end_code, $start_stack, $esp, $eip,
$pending, $blocked, $sigign, $sigcatch, $wchan_addr,
$dum1, $dum2, $exit_signal, $task_cpu,
$rt_priority, $policy, $blkio_ticks,
$gtime, $cgtime,
$start_data, $end_data, $start_brk, $arg_start, $arg_end,
$env_start, $env_end, $exit_code) = ();
my ($nr_switches, $nr_migrations) = (0,0);
my ($exec_runtime, $exec_max) = (0.0, 0.0);
my ($wait_max, $wait_sum, $wait_count) = (0.0, 0.0, 0);
my ($iowait_sum, $iowait_count) = (0.0, 0);
my ($VmSize, $VmRSS) = ();
my $Cpus_allowed = Math::BigInt->new('0');
my $affinity = Math::BigInt->new('0');
my ($rchar, $wchar, $syscr, $syscw, $read_bytes, $write_bytes,
$cancelled_write_bytes) = (0,0,0,0,0,0,0);
my ($sched_valid, $io_valid, $status_valid, $cmdline_valid,
$wchan_valid, $stat_valid) = ();
$pid = $::tids{$tid};
# NOTE: Format change over time: OLD: se.statistics.X, NEW: se.statistics->X
#cat /proc/1/sched
#systemd (1, #threads: 1)
#-------------------------------------------------------------------
#se.exec_start : 33792676.285222
#se.vruntime : 28019997.693224
#se.sum_exec_runtime : 21918.207287
#se.nr_migrations : 5413
#se.statistics->sum_sleep_runtime : 1166561.198533
#se.statistics->wait_start : 0.000000
#se.statistics->sleep_start : 33792676.285222
#se.statistics->block_start : 0.000000
#se.statistics->sleep_max : 18951.679990
#se.statistics->block_max : 0.000000
#se.statistics->exec_max : 0.909747
#se.statistics->slice_max : 1.790123
#se.statistics->wait_max : 4.026544
#se.statistics->wait_sum : 507.245963
#se.statistics->wait_count : 2540
#se.statistics->iowait_sum : 0.000000
#se.statistics->iowait_count : 0
#se.statistics->nr_migrations_cold : 0
#se.statistics->nr_failed_migrations_affine : 67
#se.statistics->nr_failed_migrations_running : 1
#se.statistics->nr_failed_migrations_hot : 1
#se.statistics->nr_forced_migrations : 0
#se.statistics->nr_wakeups : 2472
#se.statistics->nr_wakeups_sync : 34
#se.statistics->nr_wakeups_migrate : 176
#se.statistics->nr_wakeups_local : 1442
#se.statistics->nr_wakeups_remote : 1030
#se.statistics->nr_wakeups_affine : 155
#se.statistics->nr_wakeups_affine_attempts : 969
#se.statistics->nr_wakeups_passive : 0
#se.statistics->nr_wakeups_idle : 0
#avg_atom : 0.286970
#avg_per_cpu : 4.049179
#nr_switches : 76378
#nr_voluntary_switches : 72308
#nr_involuntary_switches : 4070
#se.load.weight : 1024
#policy : 0
#prio : 120
#clock-delta : 28
# parse /proc/<pid>/task/<tid>/sched
$file = '/proc/' . $pid . '/task/' . $tid . '/sched';
open($fh, $file) || goto SKIP_SCHED;
$_ = <$fh>;
if (/^(.*)\s+\((\d+),\s+#threads:/) {
$comm = $1; $id = $2;
}
my ($k, $v, $c0);
LOOP_SCHED: while (<$fh>) {
if (/^se\.statistics.{1,2}wait_max\s+:\s+(\S+)/) {
$wait_max = $1;
} elsif (/^se\.statistics.{1,2}wait_sum\s+:\s+(\S+)/) {
$wait_sum = $1;
} elsif (/^se\.statistics.{1,2}wait_count\s+:\s+(\S+)/) {
$wait_count = $1;
} elsif (/^se\.statistics.{1,2}exec_max\s+:\s+(\S+)/) {
$exec_max = $1;
} elsif (/^se\.statistics.{1,2}iowait_sum\s+:\s+(\S+)/) {
$iowait_sum = $1;
} elsif (/^se\.statistics.{1,2}iowait_count\s+:\s+(\S+)/) {
$iowait_count = $1;
} elsif (/^se\.sum_exec_runtime\s+:\s+(\S+)/) {
$exec_runtime = $1;
} elsif (/^se\.nr_migrations\s+:\s+(\S+)/) {
$nr_migrations = $1;
} elsif (/^nr_switches\s+:\s+(\S+)/) {
$nr_switches = $1;
$sched_valid = 1;
last LOOP_SCHED;
}
}
close($fh);
SKIP_SCHED:;
#cat /proc/1/io
#rchar: 3432590242
#wchar: 438665986
#syscr: 316595
#syscw: 104722
#read_bytes: 1586438144
#write_bytes: 246829056
#cancelled_write_bytes: 7798784
# parse /proc/<pid>/task/<tid>/io
$file = '/proc/' . $pid . '/task/' . $tid . '/io';
open($fh, $file) || goto SKIP_IO;
LOOP_IO: while (<$fh>) {
if (/^rchar:\s+(\S+)/) {
$rchar = $1;
} elsif (/^wchar:\s+(\S+)/) {
$wchar = $1;
} elsif (/^syscr:\s+(\S+)/) {
$syscr = $1;
} elsif (/^syscw:\s+(\S+)/) {
$syscw = $1;
} elsif (/^read_bytes:\s+(\S+)/) {
$read_bytes = $1;
} elsif (/^write_bytes:\s+(\S+)/) {
$write_bytes = $1;
} elsif (/^cancelled_write_bytes:\s+(\S+)/) {
$cancelled_write_bytes = $1;
$io_valid = 1;
last LOOP_IO;
}
}
close($fh);
SKIP_IO:;
# parse /proc/<pid>/task/<tid>/status
$file = '/proc/' . $pid . '/task/' . $tid . '/status';
open($fh, $file) || next;
LOOP_STATUS: while (<$fh>) {
if (/^Name:\s+(.*)/) {
$comm = $1;
} elsif (/^State:\s+(\S+)/) {
$state = $1;
} elsif (/^PPid:\s+(\S+)/) {
$ppid = $1;
} elsif (/^VmSize:\s+(\S+)/) {
$VmSize = $1;
} elsif (/^VmRSS:\s+(\S+)/) {
$VmRSS = $1;
} elsif (/^Cpus_allowed:\s+([0]+,)*(\S+)/) {
my $h = $2; $h =~ tr/,/_/;
$Cpus_allowed = Math::BigInt->from_hex($h);
$affinity = $Cpus_allowed->band($::affinity_mask);
$status_valid = 1;
last LOOP_STATUS;
}
}
close($fh);
# parse /proc/<pid>/task/<tid>/cmdline
$file = '/proc/' . $pid . '/task/' . $tid . '/cmdline';
open($fh, $file) || next;
LOOP_CMDLINE: while (<$fh>) {
if (/^(.*)$/) {
$cmdline = $1;
$cmdline =~ s/\000/ /g;
$cmdline_valid = 1;
last LOOP_CMDLINE;
}
}
if (!$cmdline_valid) {
$cmdline_valid = 1;
$cmdline = $comm;
}
close($fh);
# parse /proc/<pid>/task/<tid>/wchan
$file = '/proc/' . $pid . '/task/' . $tid . '/wchan';
open($fh, $file) || next;
LOOP_WCHAN: while (<$fh>) {
if (/^(.*)$/) {
$wchan = $1;
$wchan_valid = 1;
last LOOP_WCHAN;
}
}
close($fh);
#Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
#..............................................................................
# Field Content
# tpid process id (or tid, if /proc/<pid>/task/<tid>/stat)
# tcomm filename of the executable
# state state (R is running, S is sleeping, D is sleeping in an
# uninterruptible wait, Z is zombie, T is traced or stopped)
# ppid process id of the parent process
# pgrp pgrp of the process
# sid session id
# tty_nr tty the process uses
# tty_pgrp pgrp of the tty
# flags task flags
# min_flt number of minor faults
# cmin_flt number of minor faults with child's
# maj_flt number of major faults
# cmaj_flt number of major faults with child's
# utime user mode jiffies
# stime kernel mode jiffies
# cutime user mode jiffies with child's
# cstime kernel mode jiffies with child's
# priority priority level
# nice nice level
# num_threads number of threads
# it_real_value (obsolete, always 0)
# start_time time the process started after system boot
# vsize virtual memory size
# rss resident set memory size
# rsslim current limit in bytes on the rss
# start_code address above which program text can run
# end_code address below which program text can run
# start_stack address of the start of the main process stack
# esp current value of ESP
# eip current value of EIP
# pending bitmap of pending signals
# blocked bitmap of blocked signals
# sigign bitmap of ignored signals
# sigcatch bitmap of catched signals
# wchan address where process went to sleep
# 0 (place holder)
# 0 (place holder)
# exit_signal signal to send to parent thread on exit
# task_cpu which CPU the task is scheduled on
# rt_priority realtime priority
# policy scheduling policy (man sched_setscheduler)
# blkio_ticks time spent waiting for block IO
# gtime guest time of the task in jiffies
# cgtime guest time of the task children in jiffies
# start_data address above which program data+bss is placed
# end_data address below which program data+bss is placed
# start_brk address above which program heap can be expanded with brk()
# arg_start address above which program command line is placed
# arg_end address below which program command line is placed
# env_start address above which program environment is placed
# env_end address below which program environment is placed
# exit_code the thread's exit_code in the form reported by the waitpid system call
# parse /proc/<pid>/task/<tid>/stat
$file = '/proc/' . $pid . '/task/' . $tid . '/stat';
my $dummy;
open($fh, $file) || next;
$_ = <$fh>;
($tpid, $tcomm, $dummy) = /^(\d+)\s+\((.*)\)\s+(.*)/;
($state, $ppid, $pgrp, $sid,
$tty_nr, $tty_pgrp, $flags,
$min_flt, $cmin_flt, $maj_flt, $cmaj_flt,
$utime, $stime, $cutime, $cstime,
$priority, $nice, $num_threads,
$it_real_value, $start_time,
$vsize, $rss, $rsslim,
$start_code, $end_code, $start_stack, $esp, $eip,
$pending, $blocked, $sigign, $sigcatch, $wchan_addr,
$dum1, $dum2, $exit_signal, $task_cpu,
$rt_priority, $policy, $blkio_ticks, $gtime, $cgtime,
$start_data, $end_data, $start_brk, $arg_start, $arg_end,
$env_start, $env_end, $exit_code) = split(/\s+/, $dummy);
$stat_valid = 1;
close($fh);
# sched
if (defined $sched_valid) {
$::task{$tid}{'exec_runtime'} = $exec_runtime;
$::task{$tid}{'exec_max'} = $exec_max;
$::task{$tid}{'wait_max'} = $wait_max;
$::task{$tid}{'wait_sum'} = $wait_sum;
$::task{$tid}{'wait_count'} = $wait_count;
$::task{$tid}{'iowait_sum'} = $iowait_sum;
$::task{$tid}{'iowait_count'} = $iowait_count;
$::task{$tid}{'nr_migrations'} = $nr_migrations;
$::task{$tid}{'nr_switches'} = $nr_switches;
} else {
$::task{$tid}{'exec_runtime'} = 0;
$::task{$tid}{'exec_max'} = 0;
$::task{$tid}{'wait_max'} = 0;
$::task{$tid}{'wait_sum'} = 0;
$::task{$tid}{'wait_count'} = 0;
$::task{$tid}{'iowait_sum'} = 0;
$::task{$tid}{'iowait_count'} = 0;
$::task{$tid}{'nr_migrations'} = 0;
$::task{$tid}{'nr_switches'} = 0;
}
# io
if (defined $io_valid) {
$::task{$tid}{'rchar'} = $rchar;
$::task{$tid}{'wchar'} = $wchar;
$::task{$tid}{'syscr'} = $syscr;
$::task{$tid}{'syscw'} = $syscw;
$::task{$tid}{'read_bytes'} = $read_bytes;
$::task{$tid}{'write_bytes'} = $write_bytes;
$::task{$tid}{'cancelled_write_bytes'} = $cancelled_write_bytes;
} else {
$::task{$tid}{'rchar'} = 0;
$::task{$tid}{'wchar'} = 0;
$::task{$tid}{'syscr'} = 0;
$::task{$tid}{'syscw'} = 0;
$::task{$tid}{'read_bytes'} = 0;
$::task{$tid}{'write_bytes'} = 0;
$::task{$tid}{'cancelled_write_bytes'} = 0;
}
# status
if (defined $status_valid) {
$::task{$tid}{'pid'} = $pid;
$::task{$tid}{'comm'} = $comm;
$::task{$tid}{'state'} = $state;
$::task{$tid}{'ppid'} = $ppid;
$::task{$tid}{'VmSize'} = $VmSize;
$::task{$tid}{'VmRSS'} = $VmRSS;
$::task{$tid}{'affinity'} = $affinity;
} else {
$::task{$tid}{'pid'} = 0;
$::task{$tid}{'comm'} = '-';
$::task{$tid}{'state'} = '-';
$::task{$tid}{'ppid'} = 0;
$::task{$tid}{'VmSize'} = 0;
$::task{$tid}{'VmRSS'} = 0;
$::task{$tid}{'affinity'} = Math::BigInt->new('0');
}
# cmdline
if (defined $cmdline_valid) {
$::task{$tid}{'cmdline'} = $cmdline;
} else {
$::task{$tid}{'cmdline'} = $comm;
}
# wchan
if (defined $cmdline_valid) {
$::task{$tid}{'wchan'} = $wchan;
} else {
$::task{$tid}{'wchan'} = '-';
}
# stat
if (defined $stat_valid) {
$::task{$tid}{'nice'} = $nice;
$::task{$tid}{'policy'} = $policy;
$::task{$tid}{'priority'} = $priority;
$::task{$tid}{'rt_priority'} = $rt_priority;
$::task{$tid}{'start_time'} = $start_time;
$::task{$tid}{'task_cpu'} = $task_cpu;
} else {
$::task{$tid}{'nice'} = 0;
$::task{$tid}{'policy'} = '-';
$::task{$tid}{'priority'} = 0;
$::task{$tid}{'rt_priority'} = 0;
$::task{$tid}{'start_time'} = '';
$::task{$tid}{'task_cpu'} = 0;
}
}
}
# Parse per-cpu hi-resolution scheduling stats
sub read_schedstat
{
(local *::percpu) = @_;
my ($version, $timestamp);
my ($cpu, $cputime);
my ($fh, $file);
%::percpu = ();
# parse /proc/schedstat
$file = '/proc/schedstat';
open($fh, $file) || croak "Cannot open file: $file ($!)";
$_ = <$fh>; ($version) = /^version\s+(\d+)/;
$_ = <$fh>; ($timestamp) = /^timestamp\s+(\d+)/;
if ($version == 15) {
LOOP_SCHEDSTAT: while (<$fh>) {
# version 15: cputime is 7th field
if (/^cpu(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+/) {
$cpu = $1; $cputime = $2;
$::percpu{$cpu} = $cputime;
}
}
} else {
croak "schedstat version: $version method not implemented.";
}
close($fh);
SKIP_SCHED:;
}
# Parse per-cpu jiffie stats; cputime excludes iowait.
sub read_stat
{
(local *::percpu) = @_;
my ($cpu, $cputime);
my ($user, $sys, $nice, $idle, $iowt, $hirq, $sirq);
my ($fh, $file);
%::percpu = ();
# parse /proc/stat
$file = '/proc/stat';
open($fh, $file) || croak "Cannot open file: $file ($!)";
LOOP_STAT: while (<$fh>) {
if (/^cpu(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+/) {
$cpu =$1; $user = $2; $sys = $3; $nice = $4; $idle = $5; $iowt = $6; $hirq = $7; $sirq = $8;
$cputime = $CLOCK_NS * ($user + $sys + $nice + $iowt + $hirq + $sirq);
$::percpu{$cpu} = $cputime;
}
}
close($fh);
}
# Parse load-average from /proc/loadavg
sub get_loadavg
{
(local *::loadavg, local *::runq, local *::num_tasks) = @_;
$::loadavg{'1'} = 0.0;
$::loadavg{'5'} = 0.0;
$::loadavg{'15'} = 0.0;
$::runq = 0;
$::num_tasks = 0;
my $file = '/proc/loadavg';
open(my $fh, $file) || croak "Cannot open file: $file ($!)";
$_ = <$fh>;
if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\/(\d+)\s+\d+/) {
$::loadavg{'1'} = $1;
$::loadavg{'5'} = $2;
$::loadavg{'15'} = $3;
$::runq = $4;
$::num_tasks = $5;
}
close($fh);
}
# Parse blocked from /proc/stat
sub get_blocked
{
(local *::num_blk) = @_;
$::num_blk = 0;
my $file = '/proc/stat';
open(my $fh, $file) || croak "Cannot open file: $file ($!)";
while ($_ = <$fh>) {
if (/^procs_blocked\s+(\d+)/) {
$::num_blk = $1;
}
}
close($fh);
}
# Parse uptime from /proc/uptime
sub get_uptime
{
(local *::uptime) = @_;
$::uptime = 0.0;
my $file = '/proc/uptime';
open(my $fh, $file) || croak "Cannot open file: $file ($!)";
$_ = <$fh>;
if (/^(\S+)\s+\S+/) {
$::uptime = $1;
}
close($fh);
}
# Get number of online logical cpus
sub get_num_logical_cpus {
(local *::num_cpus) = @_;
$::num_cpus = 0;
my $file = "/proc/cpuinfo";
open(my $fh, $file) || croak "Cannot open file: $file ($!)";
LOOP_CPUINFO: while (<$fh>) {
if (/^[Pp]rocessor\s+:\s\d+/) {
$::num_cpus++;
}
}
close($fh);
}
# Print header
sub schedtop_header {
(local *::tr_1,
local *::tm_elapsed,
local *::tr_elapsed,
local *::uptime,
local *::loadavg,
local *::runq,
local *::num_blk,
local *::num_tasks,
local *::print_host,
) = @_;
# process epoch to get current timestamp
my $mm_in_s = 60;
my $hh_in_s = 60*60;
my $dd_in_s = 24*60*60;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($::tr_1);
my $msec = 1000.0*($::tr_1 - int($::tr_1));
# convert uptime to elapsed <d>:<hh>:<mm>:<ss>
my ($up, $up_dd, $up_hh, $up_mm, $up_ss);
$up = int($::uptime);
$up_dd = int($up/$dd_in_s);
$up -= $dd_in_s*$up_dd;
$up_hh = int($up/$hh_in_s);
$up -= $hh_in_s*$up_hh;
$up_mm = int($up/$mm_in_s);
$up -= $mm_in_s*$up_mm;
$up_ss = $up;
# Calculate skew of CLOCK_REALTIME vs CLOCK_MONOTONIC,
# and display skew if > 5% relative difference.
my $skew_ms = ($::tr_elapsed - $::tm_elapsed)*1000.0;
my $skew = "";
if (abs($skew_ms)/$::tm_elapsed > 50.0) {
$skew = sprintf " skew:%.3f ms", $skew_ms;
}
#schedtop -- 2014/03/03 02:00:21.357 dt:2050.003 ms ldavg:0.07, 0.09, 0.08 runq:1 blk:0 nproc:440 up:6:13:00:56 skew:0.001 ms
printf "%s %s -- ".
"%4d-%02d-%02d %02d:%02d:%02d.%03d ".
"dt:%.3f ms ".
"ldavg:%.2f, %.2f, %.2f runq:%d blk:%d nproc:%d ".
"up:%d:%02d:%02d:%02d %s\n",
$::TOOLNAME, $::VERSION,
1900+$year, 1+$mon, $mday, $hour, $min, $sec, $msec,
$::tm_elapsed*1000.0,
$::loadavg{'1'}, $::loadavg{'5'}, $::loadavg{'15'},
$::runq, $::num_blk, $::num_tasks,
$up_dd, $up_hh, $up_mm, $up_ss,
$skew;
return if (!($::print_host));
# After first print, disable print host information
$::print_host = 0;
# Get host specific information
my ($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE);
($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE) = POSIX::uname();
my ($NODETYPE, $SUBFUNCTION, $BUILDINFO) = ('-', '-', '-');
my ($SW_VERSION, $BUILD_ID) = ('-', '-');
# Get platform nodetype and subfunction
PLATFORM: {
my $file = "/etc/platform/platform.conf";
open(FILE, $file) || next;
while($_ = <FILE>) {
s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
if (/^nodetype=(\S+)/) {
$NODETYPE = $1;
}
if (/^subfunction=(\S+)/) {
$SUBFUNCTION = $1;
}
}
close(FILE);
}
# Get loadbuild info
BUILD: {
my $file = "/etc/build.info";
open(FILE, $file) || next;
while($_ = <FILE>) {
s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
if (/^SW_VERSION=\"([^"]+)\"/) {
$SW_VERSION = $1;
}
if (/^BUILD_ID=\"([^"]+)\"/) {
$BUILD_ID = $1;
}
}
close(FILE);
}
$BUILDINFO = join(' ', $SW_VERSION, $BUILD_ID);
# Parse /proc/cpuinfo to get specific processor info
my ($n_cpu, $model_name, $cpu_MHz) = (0, '-', 0);
CPUINFO: {
my $file = "/proc/cpuinfo";
open(FILE, $file) || croak "Cannot open file: $file ($!)";
while($_ = <FILE>) {
s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
if (/^[Pp]rocessor\s+:\s+\d+/) {
$n_cpu++;
} elsif (/^model name\s+:\s+(.*)$/) {
$_ = $1; s/\s+/ /g;
$model_name = $_;
} elsif (/^cpu MHz\s+:\s+(\S+)/) {
$cpu_MHz = $1;
} elsif (/^bogomips\s+:\s+(\S+)/) {
$cpu_MHz = $1 if ($cpu_MHz == 0);
}
}
close(FILE);
}
printf " host:%s nodetype:%s subfunction:%s\n",
$NODENAME, $NODETYPE, $SUBFUNCTION;
printf " arch:%s processor:%s speed:%.0f #CPUs:%d\n",
$MACHINE, $model_name, $cpu_MHz, $n_cpu;
printf " %s %s build:%s\n", $OSTYPE, $OSRELEASE, $BUILDINFO;
}
# Parse and validate command line arguments
sub parse_schedtop_args {
(local *::arg_debug,
local *::arg_delay,
local *::arg_repeat,
local *::arg_period,
local *::arg_reset_hwm,
local *::arg_idle,
local *::arg_sort,
local *::arg_print,
) = @_;
# Local variables
my ($fail, $arg_help);
# Use the Argument processing module
use Getopt::Long;
# Print usage if no arguments
if (!@::ARGV) {
&Usage();
exit 0;
}
# Process input arguments
$fail = 0;
GetOptions(
"debug:i", \$::arg_debug,
"delay=f", \$::arg_delay,
"period=i", \$::arg_period,
"repeat=i", \$::arg_repeat,
"reset-hwm", \$::arg_reset_hwm,
"idle", \$::arg_idle,
"sort=s", \$::arg_sort,
"print=s", \$::arg_print,
"help|h", \$arg_help
) || GetOptionsMessage();
# Print help documentation if user has selected --help
&ListHelp() if (defined $arg_help);
# Validate options
if ((defined $::arg_repeat) && (defined $::arg_period)) {
$fail = 1;
warn "$::TOOLNAME: Input error: cannot specify both --repeat and --period options.\n";
}
if ((defined $::arg_delay) && ($::arg_delay < 0.01)) {
$fail = 1;
warn "$::TOOLNAME: Input error: --delay %f is less than 0.01.\n",
$::arg_delay;
}
if ((defined $::arg_sort) && !(($::arg_sort eq 'cpu') || ($::arg_sort eq 'io'))) {
$fail = 1;
warn "$::TOOLNAME: Input error: --sort=$::arg_sort invalid; valid options are: cpu, io.\n";
}
if ((defined $::arg_print) && !(($::arg_print eq 'brief') || ($::arg_print eq 'full'))) {
$fail = 1;
warn "$::TOOLNAME: Input error: --print=$::arg_print invalid; valid options are: brief, full\n";
}
if (@::ARGV) {
$fail = 1;
warn "$::TOOLNAME: Input error: not expecting these options: '@::ARGV'.\n";
}
# Set reasonable defaults
$::arg_delay ||= 1.0;
$::arg_repeat ||= 1;
if ($::arg_period) {
$::arg_repeat = $::arg_period / $::arg_delay;
} else {
$::arg_period = $::arg_delay * $::arg_repeat;
}
$::arg_sort ||= 'cpu';
$::arg_print ||= 'full';
# Upon missing or invalid options, print usage
if ($fail == 1) {
&Usage();
exit 1;
}
}
# Print out a warning message and usage
sub GetOptionsMessage {
warn "$::TOOLNAME: Error processing input arguments.\n";
&Usage();
exit 1;
}
# Print out program usage
sub Usage {
printf "Usage: $::TOOLNAME OPTIONS\n";
printf " [--delay=<seconds>] [--repeat=<num>] [--period=<seconds>]\n";
printf " [--reset-hwm] [--idle] [--sort=<cpu|io>] [--print=<brief|full>]\n";
printf " [--help]\n";
printf "\n";
}
# Print tool help
sub ListHelp {
printf "$::TOOLNAME -- display per-task scheduling occupancy\n";
&Usage();
printf "Options: miscellaneous\n";
printf " --delay=<seconds> : output interval (seconds): default: 1.0\n";
printf " --repeat=<num> : number of repeat samples: default: 1\n";
printf " --period=<seconds> : overall tool duration (seconds): default: --\n";
printf " --reset-hwm : reset scheduling delay hi-water marks\n";
printf " --idle : specify printing of idle tasks\n";
printf " --sort=<cpu|io> : sort order, select from 'cpu' or 'io'\n";
printf " --print=<brief|full> : select 'brief' or 'full' fields to display\n";
printf " --help : this help\n";
exit 0;
}
1;