#!/usr/bin/perl ######################################################################## # # Copyright (c) 2015-2016 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # # ######################################################################## # # Description: # This displays occupancy and scheduling information per sample period. # Output includes total occupancy, per-core occupancy, loadavg, per-task cpu, # per-task scheduling, per-task io-wait. # # Usage: schedtop OPTIONS # [--delay=] [--repeat=] [--period=] # [--reset-hwm] [--idle] [--sort=] # [--help] use strict; use warnings; use Data::Dumper; use POSIX qw(uname strftime); use Time::HiRes qw(clock_gettime usleep CLOCK_MONOTONIC CLOCK_REALTIME); use Benchmark ':hireswallclock'; use Carp qw(croak carp); use Math::BigInt; # Define toolname our $TOOLNAME = "schedtop"; our $VERSION = "0.1"; # Constants use constant SI_k => 1.0E3; use constant SI_M => 1.0E6; use constant SI_G => 1.0E9; use constant Ki => 1024.0; use constant Mi => 1024.0*1024.0; use constant Gi => 1024.0*1024.0*1024.0; # Globals our %opt_V = (); our %opt_P = (); our %percpu_0 = (); our %percpu_1 = (); our %task_0 = (); our %task_1 = (); our %tids_0 = (); our %tids_1 = (); our %D_task = (); our %D_percpu = (); our %loadavg = (); our $tm_0 = (); our $tm_1 = (); our $tr_0 = (); our $tr_1 = (); our $tm_elapsed = (); our $tr_elapsed = (); our $tm_final = (); our $uptime = (); our $num_cpus = 1; our $affinity_mask = Math::BigInt->new('0'); our $w_aff = 10; our $num_tasks = 0; our $num_blk = 0; our $is_schedstat = 1; our $USER_HZ = 100; # no easy way to get this our $CLOCK_NS = SI_G / $USER_HZ; our $print_host = 1; # Print options our ($P_none, $P_lite, $P_brief, $P_full) = (0, 1, 2, 3); our ($P_ps, $P_cpu, $P_del, $P_io, $P_id, $P_cmd) = (0, 1, 2, 3, 4, 5); our @P_list = ($::P_ps, $::P_cpu, $::P_del, $::P_io, $::P_id, $::P_cmd); # Argument list parameters our ($arg_debug, $arg_delay, $arg_repeat, $arg_period, $arg_reset_hwm, $arg_idle, $arg_sort, $arg_print) = (); #------------------------------------------------------------------------------- # MAIN Program #------------------------------------------------------------------------------- my $ONE_BILLION = 1.0E9; my $MIN_DELAY = 0.001; my $MAX_DELAY = 0.001; # benchmark variables my ($bd, $b0, $b1); my @policies = ('OT', 'FF', 'RR', 'BA', 'ID', 'UN', 'UN'); my @delta_list = ( 'nr_switches', 'nr_migrations', 'exec_runtime', 'wait_sum', 'wait_count', 'iowait_sum', 'iowait_count', 'syscr', 'syscw', 'read_bytes', 'write_bytes', 'cancelled_write_bytes', ); my @state_list = ( 'exec_max', 'wait_max', 'pid', 'ppid', 'state', 'comm', 'cmdline', 'wchan', 'affinity', 'VmSize', 'VmRSS', 'start_time', 'nice', 'policy', 'priority', 'rt_priority', 'task_cpu' ); # Autoflush output select(STDERR); $| = 1; select(STDOUT); # default $| = 1; # Parse input arguments and print tool usage if necessary &parse_schedtop_args( \$::arg_debug, \$::arg_delay, \$::arg_repeat, \$::arg_period, \$::arg_reset_hwm, \$::arg_idle, \$::arg_sort, \$::arg_print, ); # Set default print options if ($::arg_print eq 'full') { for my $P (@::P_list) { $::opt_P{$P} = $::P_full; } } elsif ($::arg_print eq 'brief') { for my $P (@::P_list) { $::opt_P{$P} = $::P_brief; } } else { for my $P (@::P_list) { $::opt_P{$P} = $::P_none; } } # Disable some options if data not present $::opt_V{'sched'} = &is_sched(); $::opt_V{'io'} = &is_io(); if ($::opt_V{'sched'} == 0) { $::opt_P{$::P_cpu} = $::P_none; $::opt_P{$::P_del} = $::P_none; $::opt_P{$::P_io} = $::P_none; undef $::arg_reset_hwm; } if ($::opt_V{'io'} == 0) { if ($::opt_V{'sched'} == 0) { $::opt_P{$::P_io} = $::P_none; $::arg_sort = 'cpu'; } else { if ($::opt_P{$::P_io} != $::P_none) { $::opt_P{$::P_io} = $::P_lite; } } } # Check for root user if ($>) { warn "$::TOOLNAME: requires root/sudo.\n"; exit 1; } # Print out some debugging information if (defined $::arg_debug) { $Data::Dumper::Indent = 1; } # Check for schedstat support; fallback to stats $is_schedstat = -e '/proc/schedstat' ? 1 : 0; # Print out selected options printf "selected options: ". "delay = %.3fs, repeat = %d, idle=%s, hwm=%s, sort=%s, print=%s\n", $::arg_delay, $::arg_repeat, (defined $::arg_idle ? 'idle_tasks' : 'no_idle_tasks'), (defined $::arg_reset_hwm ? 'reset-hwm' : 'unchanged'), $::arg_sort, $::arg_print; # Capture timestamp $b0 = new Benchmark; # Get number of logical cpus &get_num_logical_cpus(\$::num_cpus); $::affinity_mask = Math::BigInt->new('0'); for (my $i=0; $i < $::num_cpus; $i++) { my $y = Math::BigInt->new('1'); $y->blsft($i); $::affinity_mask->bior($y); } $w_aff = &max(length 'AFF', length $::affinity_mask->as_hex()); # Reset scheduling hi-water marks if (defined $::arg_reset_hwm) { &get_tids(\%::tids_1); &reset_sched_hwm(\%::tids_1); sleep(0.001); } # Get current hires epoc timestamp $::tm_1 = clock_gettime(CLOCK_MONOTONIC); $::tr_1 = clock_gettime(CLOCK_REALTIME); $::tm_final = $::tm_1 + $::arg_delay*$::arg_repeat; # Set initial delay $::tm_elapsed = $::arg_delay; $MAX_DELAY = $::arg_delay + $MIN_DELAY; # Get overall per-cpu stats if ($is_schedstat) { &read_schedstat(\%::percpu_1); } else { &read_stat(\%::percpu_1); } # Get list of pids and tids &get_tids(\%::tids_1); # Get current scheduling and io info for all tids &read_sched(\%::tids_1, \%::task_1); # determine column sort order my ($s_key1, $s_key2, $s_key3) = (); if ($::arg_sort eq 'cpu') { ($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', 'pid'); } elsif ($::arg_sort eq 'io') { ($s_key1, $s_key2, $s_key3) = ('io', 'ios', 'exec_runtime'); } else { ($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', , 'pid'); } # Main loop REPEAT_LOOP: for (my $repeat=1; $repeat <= $::arg_repeat; $repeat++) { # copy all state variables $::tm_0 = (); $::tr_0 = (); %::percpu_0 = (); %::tids_0 = (); %::task_0 = (); $::tm_0 = $::tm_1; $::tr_0 = $::tr_1; foreach my $cpu (keys %::percpu_1) { $::percpu_0{$cpu} = $::percpu_1{$cpu}; } foreach my $tid (keys %::tids_1) { $::tids_0{$tid} = $::tids_1{$tid}; } foreach my $tid (keys %::task_1) { foreach my $var (keys $::task_1{$tid}) { $::task_0{$tid}{$var} = $::task_1{$tid}{$var}; } } # estimate sleep delay to achieve desired interarrival by subtracting out # the measured cpu runtime of the tool. my $delay = $::arg_delay; if (defined $::D_task{$$}{'exec_runtime'}) { $delay -= ($::D_task{$$}{'exec_runtime'}/SI_k); } $delay = $MIN_DELAY if ($delay < $MIN_DELAY); $delay = $MAX_DELAY if ($delay > $MAX_DELAY); usleep( SI_M*$delay ); # Collect current state $::tm_1 = (); $::tr_1 = (); %::percpu_1 = (); %::tids_1 = (); %::task_1 = (); # Get current hires epoc timestamp $::tm_1 = clock_gettime(CLOCK_MONOTONIC); $::tr_1 = clock_gettime(CLOCK_REALTIME); # Get overall per-cpu stats if ($is_schedstat) { &read_schedstat(\%::percpu_1); } else { &read_stat(\%::percpu_1); } # Get list of pids and tids &get_tids(\%::tids_1); # Get current scheduling and io info for all tids &read_sched(\%::tids_1, \%::task_1); # Get current uptime &get_uptime(\$::uptime); # Get current loadavg &get_loadavg(\%::loadavg, \$::runq, \$::num_tasks); # Get current processes blocked &get_blocked(\$::num_blk); # Delta calculation %::D_task = (); %::D_percpu = (); $::tm_elapsed = $::tm_1 - $::tm_0; $::tr_elapsed = $::tr_1 - $::tr_0; foreach my $tid (keys %::task_1) { next if ( !(exists $::task_0{$tid}) ); # simple delta foreach my $var (@delta_list) { $::D_task{$tid}{$var} = ($::task_1{$tid}{$var} - $::task_0{$tid}{$var}); } # state information foreach my $state (@state_list) { $::D_task{$tid}{$state} = $::task_1{$tid}{$state}; } # derived calculations my $exec_runtime = $::D_task{$tid}{'exec_runtime'}; my $nr_switches = $::D_task{$tid}{'nr_switches'}; my $iowait_sum = $::D_task{$tid}{'iowait_sum'}; if ($nr_switches > 0.0) { $::D_task{$tid}{'tlen'} = $exec_runtime / $nr_switches; } else { $::D_task{$tid}{'tlen'} = 0.0; } if ($::tm_elapsed > 0.0) { $::D_task{$tid}{'occ'} = 100.0*$exec_runtime/1.0E3/$::tm_elapsed; $::D_task{$tid}{'iowait'} = 100.0*$iowait_sum/1.0E3/$::tm_elapsed; } else { $::D_task{$tid}{'occ'} = 0.0; $::D_task{$tid}{'iowait'} = 0.0; } $::D_task{$tid}{'io'} = $::D_task{$tid}{'read_bytes'} + $::D_task{$tid}{'write_bytes'} + $::D_task{$tid}{'cancelled_write_bytes'}; $::D_task{$tid}{'ios'} = $::D_task{$tid}{'syscw'} + $::D_task{$tid}{'iowait_count'}; } foreach my $cpu (keys %::percpu_1) { $::D_percpu{$cpu}{'runtime'} = ($::percpu_1{$cpu} - $::percpu_0{$cpu})/1.0E6; if ($::tm_elapsed > 0.0) { $::D_percpu{$cpu}{'occ'} = 100.0*$D_percpu{$cpu}{'runtime'}/1.0E3/$::tm_elapsed; } else { $::D_percpu{$cpu}{'occ'} = 0.0; } } my $occ_total = 0.0; for (my $cpu=0; $cpu < $::num_cpus; $cpu++) { $occ_total += $::D_percpu{$cpu}{'occ'}; } # Print summary &schedtop_header( \$::tr_1, \$::tm_elapsed, \$::tr_elapsed, \$::uptime, \$::loadavg, \$::runq, \$::num_blk, \$::num_tasks, \$::print_host ); printf "%-5s %7s ", 'core:', 'total'; for (my $cpu=0; $cpu < $::num_cpus; $cpu++) { printf "%5s ", $cpu; } print "\n"; printf "%-5s %7.1f ", 'occ:', $occ_total; for (my $cpu=0; $cpu < $::num_cpus; $cpu++) { printf "%5.1f ", $::D_percpu{$cpu}{'occ'}; } print "\n"; print "\n"; # Build up output line by specific area my $L = (); $L = ''; $L .= sprintf "%6s %6s %6s ", "TID", "PID", "PPID"; if ($::opt_P{$::P_ps} != $::P_none) { $L .= sprintf "%1s %2s %*s %2s %3s %4s ", "S", "P", $w_aff, "AFF", "PO", "NI", "PR"; } if ($::opt_P{$::P_cpu} == $::P_brief) { $L .= sprintf "%6s %7s ", "ctxt", "occ"; } elsif ($::opt_P{$::P_cpu} == $::P_full) { $L .= sprintf "%6s %6s %7s ", "ctxt", "migr", "occ"; } if ($::opt_P{$::P_del} != $::P_none) { $L .= sprintf "%7s %7s %7s %7s ", "tlen", "tmax", "delay", "dmax"; } if ($::opt_P{$::P_io} == $::P_lite) { $L .= sprintf "%7s %6s ", "iowt", "iocnt"; } elsif ($::opt_P{$::P_io} == $::P_brief) { $L .= sprintf "%7s %8s %8s ", "iowt", "read", "write"; } elsif ($::opt_P{$::P_io} == $::P_full) { $L .= sprintf "%7s %8s %8s %8s %8s %8s ", "iowt", "read", "write", "wcncl", "rsysc", "wsysc"; } if ($::opt_P{$::P_id} != $::P_none) { $L .= sprintf "%-22s ", "wchan"; } if ($::opt_P{$::P_cmd} == $::P_brief) { $L .= sprintf "%s", "cmdline"; } elsif ($::opt_P{$::P_cmd} == $::P_full) { $L .= sprintf "%-15s %s", "comm", "cmdline"; } print $L, "\n"; foreach my $tid (sort {($D_task{$b}{$s_key1} <=> $D_task{$a}{$s_key1}) or ($D_task{$b}{$s_key2} <=> $D_task{$a}{$s_key2}) or ($D_task{$b}{$s_key3} <=> $D_task{$a}{$s_key3})} keys %D_task) { my $exec_runtime = $::D_task{$tid}{'exec_runtime'}; my $nr_switches = $::D_task{$tid}{'nr_switches'}; my $aff = $::D_task{$tid}{'affinity'}->as_hex(); # skip printing if there is no actual delta if ( !(defined $::arg_idle) ) { next if (($exec_runtime == 0.0) && ($nr_switches == 0)); } # Build up output line by specific area $L = ''; $L .= sprintf "%6d %6d %6d ", $tid, $::D_task{$tid}{'pid'}, $::D_task{$tid}{'ppid'}; if ($::opt_P{$::P_ps} != $::P_none) { $L .= sprintf "%1s %2d %*s %2s %3d %4d ", $::D_task{$tid}{'state'}, $::D_task{$tid}{'task_cpu'}, $w_aff, $aff, $policies[$::D_task{$tid}{'policy'}], $::D_task{$tid}{'nice'}, $::D_task{$tid}{'priority'}; } if ($::opt_P{$::P_cpu} == $::P_brief) { $L .= sprintf "%6d %7.2f ", $::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'occ'}; } elsif ($::opt_P{$::P_cpu} == $::P_full) { $L .= sprintf "%6d %6d %7.2f ", $::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'nr_migrations'}, $::D_task{$tid}{'occ'}, } if ($::opt_P{$::P_del} != $::P_none) { $L .= sprintf "%7.3f %7.1f %7.3f %7.1f ", $::D_task{$tid}{'tlen'}, $::D_task{$tid}{'exec_max'}, $::D_task{$tid}{'wait_sum'}, $::D_task{$tid}{'wait_max'}; } if ($::opt_P{$::P_io} == $::P_lite) { $L .= sprintf "%7.2f %6d ", $::D_task{$tid}{'iowait'}, $::D_task{$tid}{'iowait_count'}; } elsif ($::opt_P{$::P_io} == $::P_brief) { $L .= sprintf "%7.2f %8s %8s ", $::D_task{$tid}{'iowait'}, &format_SI($::D_task{$tid}{'read_bytes'}), &format_SI($::D_task{$tid}{'write_bytes'}); } elsif ($::opt_P{$::P_io} == $::P_full) { $L .= sprintf "%7.2f %8s %8s %8s %8s %8s ", $::D_task{$tid}{'iowait'}, &format_SI($::D_task{$tid}{'read_bytes'}), &format_SI($::D_task{$tid}{'write_bytes'}), &format_SI($::D_task{$tid}{'cancelled_write_bytes'}), &format_SI($::D_task{$tid}{'syscr'}), &format_SI($::D_task{$tid}{'syscw'}); } if ($::opt_P{$::P_id} != $::P_none) { $L .= sprintf "%-22s ", substr($::D_task{$tid}{'wchan'}, 0, 22); } if ($::opt_P{$::P_cmd} == $::P_brief) { $L .= sprintf "%s", $::D_task{$tid}{'cmdline'}; } elsif ($::opt_P{$::P_cmd} == $::P_full) { $L .= sprintf "%-15s %s", $::D_task{$tid}{'comm'}, $::D_task{$tid}{'cmdline'}; } print $L, "\n"; } print "\n"; # exit repeat loop if we have exceeded overall time last if ($::tm_1 > $::tm_final); } # REPEAT LOOP # Print that tool has finished print "done\n"; # Capture timestamp and report delta $b1 = new Benchmark; $bd = Benchmark::timediff($b1, $b0); printf "processing time: %s\n", timestr($bd); exit 0; #------------------------------------------------------------------------------- # Convert a number to SI unit xxx.yyyG sub format_SI { (my $value) = @_; if ($value >= SI_G) { return sprintf("%.3fG", $value/SI_G); } elsif ($value >= SI_M) { return sprintf("%.3fM", $value/SI_M); } elsif ($value >= SI_k) { return sprintf("%.3fk", $value/SI_k); } else { return sprintf("%.0f", $value); } } # Convert to IEC binary unit xxx.yyyGi # Since underlying memory units are in pages, don't need decimals for Ki sub format_IEC { (my $value) = @_; if ($value >= Gi) { return sprintf("%.3fGi", $value/Gi); } elsif ($value >= Mi) { return sprintf("%.3fMi", $value/Mi); } elsif ($value >= Ki) { return sprintf("%.0fKi", $value/Ki); } else { return sprintf("%.0f", $value); } } # Determine whether scheduler stats are available sub is_sched { return (-e '/proc/1/task/1/sched') ? 1 : 0; } # Determine whether IO stats are available sub is_io { return (-e '/proc/1/task/1/io') ? 1 : 0; } # Determine max of array sub max { my ($max, @vars) = @_; for (@vars) { $max = $_ if $_ > $max; } return $max; } # Determine tids and pid mapping by walking /proc//task/ sub get_tids { (local *::tids) = @_; my (@pids_, @tids_) = (); my ($dh, $pid, $tid); # get pid list my $dir = '/proc'; opendir($dh, $dir) || croak "Cannot open directory: $dir ($!)"; @pids_ = grep { /^\d+$/ && -d "$dir/$_" } readdir($dh); closedir $dh; # get tid list foreach $pid (@pids_) { $dir = '/proc/' . $pid . '/task'; opendir(my $dh, $dir) || next; @tids_ = grep { /^\d+$/ && -d "$dir/$_" } readdir($dh); closedir $dh; foreach $tid (@tids_) { $::tids{$tid} = $pid; } } } # Reset scheduling hi-water-marks sub reset_sched_hwm { (local *::tids) = @_; # reset scheduling hi-water-marks by writing '0' to each task foreach my $tid (keys %::tids) { my $file = '/proc/' . $tid . '/sched'; open(my $fh, "> $file") || next; print $fh "0\n"; close($fh); } } # Parse cpu and scheduling info for each tid # - ignore the specific tid if there is incomplete data, # (i.e., cannot obtain info because task has died, # eg. missing ./stat, ./status, ./cmdline, ./wchan) # sub read_sched { (local *::tids, local *::task) = @_; %::task = (); foreach my $tid (keys %::tids) { my ($fh, $file, $pid, $comm, $cmdline, $wchan, $id) = (); my ($tpid, $tcomm, $state, $ppid, $pgrp, $sid, $tty_nr, $tty_pgrp, $flags, $min_flt, $cmin_flt, $maj_flt, $cmaj_flt, $utime, $stime, $cutime, $cstime, $priority, $nice, $num_threads, $it_real_value, $start_time, $vsize, $rss, $rsslim, $start_code, $end_code, $start_stack, $esp, $eip, $pending, $blocked, $sigign, $sigcatch, $wchan_addr, $dum1, $dum2, $exit_signal, $task_cpu, $rt_priority, $policy, $blkio_ticks, $gtime, $cgtime, $start_data, $end_data, $start_brk, $arg_start, $arg_end, $env_start, $env_end, $exit_code) = (); my ($nr_switches, $nr_migrations) = (0,0); my ($exec_runtime, $exec_max) = (0.0, 0.0); my ($wait_max, $wait_sum, $wait_count) = (0.0, 0.0, 0); my ($iowait_sum, $iowait_count) = (0.0, 0); my ($VmSize, $VmRSS) = (); my $Cpus_allowed = Math::BigInt->new('0'); my $affinity = Math::BigInt->new('0'); my ($rchar, $wchar, $syscr, $syscw, $read_bytes, $write_bytes, $cancelled_write_bytes) = (0,0,0,0,0,0,0); my ($sched_valid, $io_valid, $status_valid, $cmdline_valid, $wchan_valid, $stat_valid) = (); $pid = $::tids{$tid}; # NOTE: Format change over time: OLD: se.statistics.X, NEW: se.statistics->X #cat /proc/1/sched #systemd (1, #threads: 1) #------------------------------------------------------------------- #se.exec_start : 33792676.285222 #se.vruntime : 28019997.693224 #se.sum_exec_runtime : 21918.207287 #se.nr_migrations : 5413 #se.statistics->sum_sleep_runtime : 1166561.198533 #se.statistics->wait_start : 0.000000 #se.statistics->sleep_start : 33792676.285222 #se.statistics->block_start : 0.000000 #se.statistics->sleep_max : 18951.679990 #se.statistics->block_max : 0.000000 #se.statistics->exec_max : 0.909747 #se.statistics->slice_max : 1.790123 #se.statistics->wait_max : 4.026544 #se.statistics->wait_sum : 507.245963 #se.statistics->wait_count : 2540 #se.statistics->iowait_sum : 0.000000 #se.statistics->iowait_count : 0 #se.statistics->nr_migrations_cold : 0 #se.statistics->nr_failed_migrations_affine : 67 #se.statistics->nr_failed_migrations_running : 1 #se.statistics->nr_failed_migrations_hot : 1 #se.statistics->nr_forced_migrations : 0 #se.statistics->nr_wakeups : 2472 #se.statistics->nr_wakeups_sync : 34 #se.statistics->nr_wakeups_migrate : 176 #se.statistics->nr_wakeups_local : 1442 #se.statistics->nr_wakeups_remote : 1030 #se.statistics->nr_wakeups_affine : 155 #se.statistics->nr_wakeups_affine_attempts : 969 #se.statistics->nr_wakeups_passive : 0 #se.statistics->nr_wakeups_idle : 0 #avg_atom : 0.286970 #avg_per_cpu : 4.049179 #nr_switches : 76378 #nr_voluntary_switches : 72308 #nr_involuntary_switches : 4070 #se.load.weight : 1024 #policy : 0 #prio : 120 #clock-delta : 28 # parse /proc//task//sched $file = '/proc/' . $pid . '/task/' . $tid . '/sched'; open($fh, $file) || goto SKIP_SCHED; $_ = <$fh>; if (/^(.*)\s+\((\d+),\s+#threads:/) { $comm = $1; $id = $2; } my ($k, $v, $c0); LOOP_SCHED: while (<$fh>) { if (/^se\.statistics.{1,2}wait_max\s+:\s+(\S+)/) { $wait_max = $1; } elsif (/^se\.statistics.{1,2}wait_sum\s+:\s+(\S+)/) { $wait_sum = $1; } elsif (/^se\.statistics.{1,2}wait_count\s+:\s+(\S+)/) { $wait_count = $1; } elsif (/^se\.statistics.{1,2}exec_max\s+:\s+(\S+)/) { $exec_max = $1; } elsif (/^se\.statistics.{1,2}iowait_sum\s+:\s+(\S+)/) { $iowait_sum = $1; } elsif (/^se\.statistics.{1,2}iowait_count\s+:\s+(\S+)/) { $iowait_count = $1; } elsif (/^se\.sum_exec_runtime\s+:\s+(\S+)/) { $exec_runtime = $1; } elsif (/^se\.nr_migrations\s+:\s+(\S+)/) { $nr_migrations = $1; } elsif (/^nr_switches\s+:\s+(\S+)/) { $nr_switches = $1; $sched_valid = 1; last LOOP_SCHED; } } close($fh); SKIP_SCHED:; #cat /proc/1/io #rchar: 3432590242 #wchar: 438665986 #syscr: 316595 #syscw: 104722 #read_bytes: 1586438144 #write_bytes: 246829056 #cancelled_write_bytes: 7798784 # parse /proc//task//io $file = '/proc/' . $pid . '/task/' . $tid . '/io'; open($fh, $file) || goto SKIP_IO; LOOP_IO: while (<$fh>) { if (/^rchar:\s+(\S+)/) { $rchar = $1; } elsif (/^wchar:\s+(\S+)/) { $wchar = $1; } elsif (/^syscr:\s+(\S+)/) { $syscr = $1; } elsif (/^syscw:\s+(\S+)/) { $syscw = $1; } elsif (/^read_bytes:\s+(\S+)/) { $read_bytes = $1; } elsif (/^write_bytes:\s+(\S+)/) { $write_bytes = $1; } elsif (/^cancelled_write_bytes:\s+(\S+)/) { $cancelled_write_bytes = $1; $io_valid = 1; last LOOP_IO; } } close($fh); SKIP_IO:; # parse /proc//task//status $file = '/proc/' . $pid . '/task/' . $tid . '/status'; open($fh, $file) || next; LOOP_STATUS: while (<$fh>) { if (/^Name:\s+(.*)/) { $comm = $1; } elsif (/^State:\s+(\S+)/) { $state = $1; } elsif (/^PPid:\s+(\S+)/) { $ppid = $1; } elsif (/^VmSize:\s+(\S+)/) { $VmSize = $1; } elsif (/^VmRSS:\s+(\S+)/) { $VmRSS = $1; } elsif (/^Cpus_allowed:\s+([0]+,)*(\S+)/) { my $h = $2; $h =~ tr/,/_/; $Cpus_allowed = Math::BigInt->from_hex($h); $affinity = $Cpus_allowed->band($::affinity_mask); $status_valid = 1; last LOOP_STATUS; } } close($fh); # parse /proc//task//cmdline $file = '/proc/' . $pid . '/task/' . $tid . '/cmdline'; open($fh, $file) || next; LOOP_CMDLINE: while (<$fh>) { if (/^(.*)$/) { $cmdline = $1; $cmdline =~ s/\000/ /g; $cmdline_valid = 1; last LOOP_CMDLINE; } } if (!$cmdline_valid) { $cmdline_valid = 1; $cmdline = $comm; } close($fh); # parse /proc//task//wchan $file = '/proc/' . $pid . '/task/' . $tid . '/wchan'; open($fh, $file) || next; LOOP_WCHAN: while (<$fh>) { if (/^(.*)$/) { $wchan = $1; $wchan_valid = 1; last LOOP_WCHAN; } } close($fh); #Table 1-4: Contents of the stat files (as of 2.6.30-rc7) #.............................................................................. # Field Content # tpid process id (or tid, if /proc//task//stat) # tcomm filename of the executable # state state (R is running, S is sleeping, D is sleeping in an # uninterruptible wait, Z is zombie, T is traced or stopped) # ppid process id of the parent process # pgrp pgrp of the process # sid session id # tty_nr tty the process uses # tty_pgrp pgrp of the tty # flags task flags # min_flt number of minor faults # cmin_flt number of minor faults with child's # maj_flt number of major faults # cmaj_flt number of major faults with child's # utime user mode jiffies # stime kernel mode jiffies # cutime user mode jiffies with child's # cstime kernel mode jiffies with child's # priority priority level # nice nice level # num_threads number of threads # it_real_value (obsolete, always 0) # start_time time the process started after system boot # vsize virtual memory size # rss resident set memory size # rsslim current limit in bytes on the rss # start_code address above which program text can run # end_code address below which program text can run # start_stack address of the start of the main process stack # esp current value of ESP # eip current value of EIP # pending bitmap of pending signals # blocked bitmap of blocked signals # sigign bitmap of ignored signals # sigcatch bitmap of catched signals # wchan address where process went to sleep # 0 (place holder) # 0 (place holder) # exit_signal signal to send to parent thread on exit # task_cpu which CPU the task is scheduled on # rt_priority realtime priority # policy scheduling policy (man sched_setscheduler) # blkio_ticks time spent waiting for block IO # gtime guest time of the task in jiffies # cgtime guest time of the task children in jiffies # start_data address above which program data+bss is placed # end_data address below which program data+bss is placed # start_brk address above which program heap can be expanded with brk() # arg_start address above which program command line is placed # arg_end address below which program command line is placed # env_start address above which program environment is placed # env_end address below which program environment is placed # exit_code the thread's exit_code in the form reported by the waitpid system call # parse /proc//task//stat $file = '/proc/' . $pid . '/task/' . $tid . '/stat'; my $dummy; open($fh, $file) || next; $_ = <$fh>; ($tpid, $tcomm, $dummy) = /^(\d+)\s+\((.*)\)\s+(.*)/; ($state, $ppid, $pgrp, $sid, $tty_nr, $tty_pgrp, $flags, $min_flt, $cmin_flt, $maj_flt, $cmaj_flt, $utime, $stime, $cutime, $cstime, $priority, $nice, $num_threads, $it_real_value, $start_time, $vsize, $rss, $rsslim, $start_code, $end_code, $start_stack, $esp, $eip, $pending, $blocked, $sigign, $sigcatch, $wchan_addr, $dum1, $dum2, $exit_signal, $task_cpu, $rt_priority, $policy, $blkio_ticks, $gtime, $cgtime, $start_data, $end_data, $start_brk, $arg_start, $arg_end, $env_start, $env_end, $exit_code) = split(/\s+/, $dummy); $stat_valid = 1; close($fh); # sched if (defined $sched_valid) { $::task{$tid}{'exec_runtime'} = $exec_runtime; $::task{$tid}{'exec_max'} = $exec_max; $::task{$tid}{'wait_max'} = $wait_max; $::task{$tid}{'wait_sum'} = $wait_sum; $::task{$tid}{'wait_count'} = $wait_count; $::task{$tid}{'iowait_sum'} = $iowait_sum; $::task{$tid}{'iowait_count'} = $iowait_count; $::task{$tid}{'nr_migrations'} = $nr_migrations; $::task{$tid}{'nr_switches'} = $nr_switches; } else { $::task{$tid}{'exec_runtime'} = 0; $::task{$tid}{'exec_max'} = 0; $::task{$tid}{'wait_max'} = 0; $::task{$tid}{'wait_sum'} = 0; $::task{$tid}{'wait_count'} = 0; $::task{$tid}{'iowait_sum'} = 0; $::task{$tid}{'iowait_count'} = 0; $::task{$tid}{'nr_migrations'} = 0; $::task{$tid}{'nr_switches'} = 0; } # io if (defined $io_valid) { $::task{$tid}{'rchar'} = $rchar; $::task{$tid}{'wchar'} = $wchar; $::task{$tid}{'syscr'} = $syscr; $::task{$tid}{'syscw'} = $syscw; $::task{$tid}{'read_bytes'} = $read_bytes; $::task{$tid}{'write_bytes'} = $write_bytes; $::task{$tid}{'cancelled_write_bytes'} = $cancelled_write_bytes; } else { $::task{$tid}{'rchar'} = 0; $::task{$tid}{'wchar'} = 0; $::task{$tid}{'syscr'} = 0; $::task{$tid}{'syscw'} = 0; $::task{$tid}{'read_bytes'} = 0; $::task{$tid}{'write_bytes'} = 0; $::task{$tid}{'cancelled_write_bytes'} = 0; } # status if (defined $status_valid) { $::task{$tid}{'pid'} = $pid; $::task{$tid}{'comm'} = $comm; $::task{$tid}{'state'} = $state; $::task{$tid}{'ppid'} = $ppid; $::task{$tid}{'VmSize'} = $VmSize; $::task{$tid}{'VmRSS'} = $VmRSS; $::task{$tid}{'affinity'} = $affinity; } else { $::task{$tid}{'pid'} = 0; $::task{$tid}{'comm'} = '-'; $::task{$tid}{'state'} = '-'; $::task{$tid}{'ppid'} = 0; $::task{$tid}{'VmSize'} = 0; $::task{$tid}{'VmRSS'} = 0; $::task{$tid}{'affinity'} = Math::BigInt->new('0'); } # cmdline if (defined $cmdline_valid) { $::task{$tid}{'cmdline'} = $cmdline; } else { $::task{$tid}{'cmdline'} = $comm; } # wchan if (defined $cmdline_valid) { $::task{$tid}{'wchan'} = $wchan; } else { $::task{$tid}{'wchan'} = '-'; } # stat if (defined $stat_valid) { $::task{$tid}{'nice'} = $nice; $::task{$tid}{'policy'} = $policy; $::task{$tid}{'priority'} = $priority; $::task{$tid}{'rt_priority'} = $rt_priority; $::task{$tid}{'start_time'} = $start_time; $::task{$tid}{'task_cpu'} = $task_cpu; } else { $::task{$tid}{'nice'} = 0; $::task{$tid}{'policy'} = '-'; $::task{$tid}{'priority'} = 0; $::task{$tid}{'rt_priority'} = 0; $::task{$tid}{'start_time'} = ''; $::task{$tid}{'task_cpu'} = 0; } } } # Parse per-cpu hi-resolution scheduling stats sub read_schedstat { (local *::percpu) = @_; my ($version, $timestamp); my ($cpu, $cputime); my ($fh, $file); %::percpu = (); # parse /proc/schedstat $file = '/proc/schedstat'; open($fh, $file) || croak "Cannot open file: $file ($!)"; $_ = <$fh>; ($version) = /^version\s+(\d+)/; $_ = <$fh>; ($timestamp) = /^timestamp\s+(\d+)/; if ($version == 15) { LOOP_SCHEDSTAT: while (<$fh>) { # version 15: cputime is 7th field if (/^cpu(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+/) { $cpu = $1; $cputime = $2; $::percpu{$cpu} = $cputime; } } } else { croak "schedstat version: $version method not implemented."; } close($fh); SKIP_SCHED:; } # Parse per-cpu jiffie stats; cputime excludes iowait. sub read_stat { (local *::percpu) = @_; my ($cpu, $cputime); my ($user, $sys, $nice, $idle, $iowt, $hirq, $sirq); my ($fh, $file); %::percpu = (); # parse /proc/stat $file = '/proc/stat'; open($fh, $file) || croak "Cannot open file: $file ($!)"; LOOP_STAT: while (<$fh>) { if (/^cpu(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+/) { $cpu =$1; $user = $2; $sys = $3; $nice = $4; $idle = $5; $iowt = $6; $hirq = $7; $sirq = $8; $cputime = $CLOCK_NS * ($user + $sys + $nice + $iowt + $hirq + $sirq); $::percpu{$cpu} = $cputime; } } close($fh); } # Parse load-average from /proc/loadavg sub get_loadavg { (local *::loadavg, local *::runq, local *::num_tasks) = @_; $::loadavg{'1'} = 0.0; $::loadavg{'5'} = 0.0; $::loadavg{'15'} = 0.0; $::runq = 0; $::num_tasks = 0; my $file = '/proc/loadavg'; open(my $fh, $file) || croak "Cannot open file: $file ($!)"; $_ = <$fh>; if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\/(\d+)\s+\d+/) { $::loadavg{'1'} = $1; $::loadavg{'5'} = $2; $::loadavg{'15'} = $3; $::runq = $4; $::num_tasks = $5; } close($fh); } # Parse blocked from /proc/stat sub get_blocked { (local *::num_blk) = @_; $::num_blk = 0; my $file = '/proc/stat'; open(my $fh, $file) || croak "Cannot open file: $file ($!)"; while ($_ = <$fh>) { if (/^procs_blocked\s+(\d+)/) { $::num_blk = $1; } } close($fh); } # Parse uptime from /proc/uptime sub get_uptime { (local *::uptime) = @_; $::uptime = 0.0; my $file = '/proc/uptime'; open(my $fh, $file) || croak "Cannot open file: $file ($!)"; $_ = <$fh>; if (/^(\S+)\s+\S+/) { $::uptime = $1; } close($fh); } # Get number of online logical cpus sub get_num_logical_cpus { (local *::num_cpus) = @_; $::num_cpus = 0; my $file = "/proc/cpuinfo"; open(my $fh, $file) || croak "Cannot open file: $file ($!)"; LOOP_CPUINFO: while (<$fh>) { if (/^[Pp]rocessor\s+:\s\d+/) { $::num_cpus++; } } close($fh); } # Print header sub schedtop_header { (local *::tr_1, local *::tm_elapsed, local *::tr_elapsed, local *::uptime, local *::loadavg, local *::runq, local *::num_blk, local *::num_tasks, local *::print_host, ) = @_; # process epoch to get current timestamp my $mm_in_s = 60; my $hh_in_s = 60*60; my $dd_in_s = 24*60*60; my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst); ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($::tr_1); my $msec = 1000.0*($::tr_1 - int($::tr_1)); # convert uptime to elapsed ::: my ($up, $up_dd, $up_hh, $up_mm, $up_ss); $up = int($::uptime); $up_dd = int($up/$dd_in_s); $up -= $dd_in_s*$up_dd; $up_hh = int($up/$hh_in_s); $up -= $hh_in_s*$up_hh; $up_mm = int($up/$mm_in_s); $up -= $mm_in_s*$up_mm; $up_ss = $up; # Calculate skew of CLOCK_REALTIME vs CLOCK_MONOTONIC, # and display skew if > 5% relative difference. my $skew_ms = ($::tr_elapsed - $::tm_elapsed)*1000.0; my $skew = ""; if (abs($skew_ms)/$::tm_elapsed > 50.0) { $skew = sprintf " skew:%.3f ms", $skew_ms; } #schedtop -- 2014/03/03 02:00:21.357 dt:2050.003 ms ldavg:0.07, 0.09, 0.08 runq:1 blk:0 nproc:440 up:6:13:00:56 skew:0.001 ms printf "%s %s -- ". "%4d-%02d-%02d %02d:%02d:%02d.%03d ". "dt:%.3f ms ". "ldavg:%.2f, %.2f, %.2f runq:%d blk:%d nproc:%d ". "up:%d:%02d:%02d:%02d %s\n", $::TOOLNAME, $::VERSION, 1900+$year, 1+$mon, $mday, $hour, $min, $sec, $msec, $::tm_elapsed*1000.0, $::loadavg{'1'}, $::loadavg{'5'}, $::loadavg{'15'}, $::runq, $::num_blk, $::num_tasks, $up_dd, $up_hh, $up_mm, $up_ss, $skew; return if (!($::print_host)); # After first print, disable print host information $::print_host = 0; # Get host specific information my ($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE); ($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE) = POSIX::uname(); my ($NODETYPE, $SUBFUNCTION, $BUILDINFO) = ('-', '-', '-'); my ($SW_VERSION, $BUILD_ID) = ('-', '-'); # Get platform nodetype and subfunction PLATFORM: { my $file = "/etc/platform/platform.conf"; open(FILE, $file) || next; while($_ = ) { s/[\0\e\f\r\a]//g; chomp; # strip control characters if any if (/^nodetype=(\S+)/) { $NODETYPE = $1; } if (/^subfunction=(\S+)/) { $SUBFUNCTION = $1; } } close(FILE); } # Get loadbuild info BUILD: { my $file = "/etc/build.info"; open(FILE, $file) || next; while($_ = ) { s/[\0\e\f\r\a]//g; chomp; # strip control characters if any if (/^SW_VERSION=\"([^"]+)\"/) { $SW_VERSION = $1; } if (/^BUILD_ID=\"([^"]+)\"/) { $BUILD_ID = $1; } } close(FILE); } $BUILDINFO = join(' ', $SW_VERSION, $BUILD_ID); # Parse /proc/cpuinfo to get specific processor info my ($n_cpu, $model_name, $cpu_MHz) = (0, '-', 0); CPUINFO: { my $file = "/proc/cpuinfo"; open(FILE, $file) || croak "Cannot open file: $file ($!)"; while($_ = ) { s/[\0\e\f\r\a]//g; chomp; # strip control characters if any if (/^[Pp]rocessor\s+:\s+\d+/) { $n_cpu++; } elsif (/^model name\s+:\s+(.*)$/) { $_ = $1; s/\s+/ /g; $model_name = $_; } elsif (/^cpu MHz\s+:\s+(\S+)/) { $cpu_MHz = $1; } elsif (/^bogomips\s+:\s+(\S+)/) { $cpu_MHz = $1 if ($cpu_MHz == 0); } } close(FILE); } printf " host:%s nodetype:%s subfunction:%s\n", $NODENAME, $NODETYPE, $SUBFUNCTION; printf " arch:%s processor:%s speed:%.0f #CPUs:%d\n", $MACHINE, $model_name, $cpu_MHz, $n_cpu; printf " %s %s build:%s\n", $OSTYPE, $OSRELEASE, $BUILDINFO; } # Parse and validate command line arguments sub parse_schedtop_args { (local *::arg_debug, local *::arg_delay, local *::arg_repeat, local *::arg_period, local *::arg_reset_hwm, local *::arg_idle, local *::arg_sort, local *::arg_print, ) = @_; # Local variables my ($fail, $arg_help); # Use the Argument processing module use Getopt::Long; # Print usage if no arguments if (!@::ARGV) { &Usage(); exit 0; } # Process input arguments $fail = 0; GetOptions( "debug:i", \$::arg_debug, "delay=f", \$::arg_delay, "period=i", \$::arg_period, "repeat=i", \$::arg_repeat, "reset-hwm", \$::arg_reset_hwm, "idle", \$::arg_idle, "sort=s", \$::arg_sort, "print=s", \$::arg_print, "help|h", \$arg_help ) || GetOptionsMessage(); # Print help documentation if user has selected --help &ListHelp() if (defined $arg_help); # Validate options if ((defined $::arg_repeat) && (defined $::arg_period)) { $fail = 1; warn "$::TOOLNAME: Input error: cannot specify both --repeat and --period options.\n"; } if ((defined $::arg_delay) && ($::arg_delay < 0.01)) { $fail = 1; warn "$::TOOLNAME: Input error: --delay %f is less than 0.01.\n", $::arg_delay; } if ((defined $::arg_sort) && !(($::arg_sort eq 'cpu') || ($::arg_sort eq 'io'))) { $fail = 1; warn "$::TOOLNAME: Input error: --sort=$::arg_sort invalid; valid options are: cpu, io.\n"; } if ((defined $::arg_print) && !(($::arg_print eq 'brief') || ($::arg_print eq 'full'))) { $fail = 1; warn "$::TOOLNAME: Input error: --print=$::arg_print invalid; valid options are: brief, full\n"; } if (@::ARGV) { $fail = 1; warn "$::TOOLNAME: Input error: not expecting these options: '@::ARGV'.\n"; } # Set reasonable defaults $::arg_delay ||= 1.0; $::arg_repeat ||= 1; if ($::arg_period) { $::arg_repeat = $::arg_period / $::arg_delay; } else { $::arg_period = $::arg_delay * $::arg_repeat; } $::arg_sort ||= 'cpu'; $::arg_print ||= 'full'; # Upon missing or invalid options, print usage if ($fail == 1) { &Usage(); exit 1; } } # Print out a warning message and usage sub GetOptionsMessage { warn "$::TOOLNAME: Error processing input arguments.\n"; &Usage(); exit 1; } # Print out program usage sub Usage { printf "Usage: $::TOOLNAME OPTIONS\n"; printf " [--delay=] [--repeat=] [--period=]\n"; printf " [--reset-hwm] [--idle] [--sort=] [--print=]\n"; printf " [--help]\n"; printf "\n"; } # Print tool help sub ListHelp { printf "$::TOOLNAME -- display per-task scheduling occupancy\n"; &Usage(); printf "Options: miscellaneous\n"; printf " --delay= : output interval (seconds): default: 1.0\n"; printf " --repeat= : number of repeat samples: default: 1\n"; printf " --period= : overall tool duration (seconds): default: --\n"; printf " --reset-hwm : reset scheduling delay hi-water marks\n"; printf " --idle : specify printing of idle tasks\n"; printf " --sort= : sort order, select from 'cpu' or 'io'\n"; printf " --print= : select 'brief' or 'full' fields to display\n"; printf " --help : this help\n"; exit 0; } 1;