diff --git a/README.md b/README.md index 750f147..fe6ff87 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Many of these tools employ workarounds so that functionality is possible on exis Using perf_events: - misc/[perf-stat-hist](misc/perf-stat-hist): power-of aggregations for tracepoint variables. [Examples](examples/perf-stat-hist_example.txt). +- [syscount](syscount): count syscalls by syscall or process. [Examples](examples/syscount_example.txt). - disk/[bitesize](disk/bitesize): histogram summary of disk I/O size. [Examples](examples/bitesize_example.txt). Using ftrace: diff --git a/examples/syscount_example.txt b/examples/syscount_example.txt new file mode 100644 index 0000000..15aa6e5 --- /dev/null +++ b/examples/syscount_example.txt @@ -0,0 +1,275 @@ +Demonstrations of syscount, the Linux perf_events version. + + +The first mode I use is "-c", where it behaves like "strace -c", but for the +entire system (all procesess) and with much lower overhead: + +# ./syscount -c +Tracing... Ctrl-C to end. +^Csleep: Interrupt +SYSCALL COUNT +accept 1 +getsockopt 1 +setsid 1 +chdir 2 +getcwd 2 +getpeername 2 +getsockname 2 +setgid 2 +setgroups 2 +setpgid 2 +setuid 2 +getpgrp 4 +getpid 4 +rename 4 +setitimer 4 +setrlimit 4 +setsockopt 4 +statfs 4 +set_tid_address 5 +readlink 6 +set_robust_list 6 +nanosleep 7 +newuname 7 +faccessat 8 +futex 10 +clock_gettime 16 +newlstat 20 +pipe 20 +epoll_wait 24 +getrlimit 25 +socket 27 +connect 29 +exit_group 30 +getppid 31 +dup2 34 +wait4 51 +fcntl 58 +getegid 72 +getgid 72 +getuid 72 +geteuid 75 +perf_event_open 100 +munmap 121 +gettimeofday 216 +access 266 +ioctl 340 +poll 348 +sendto 374 +mprotect 414 +brk 597 +rt_sigaction 632 +recvfrom 664 +lseek 749 +newfstatat 2922 +openat 2925 +newfstat 3229 +newstat 4334 +open 4534 +fchdir 5845 +getdents 5854 +read 7673 +close 7728 +select 9633 +rt_sigprocmask 19886 +write 34581 + +While tracing, the write() syscall was executed 34,581 times. + +This mode uses "perf stat" to count the syscalls:* tracepoints in-kernel. + + +You can add a duration (-d) and limit the number shown (-t): + +# ./syscount -cd 5 -t 10 +Tracing for 5 seconds. Top 10 only... +SYSCALL COUNT +gettimeofday 1009 +write 3583 +read 8174 +openat 21550 +newfstat 21558 +open 21824 +fchdir 43098 +getdents 43106 +close 43694 +newfstatat 110936 + +While tracing for 5 seconds, the newfstatat() syscall was executed 110,936 +times. + + +Without the -c, syscount shows syscalls by process name: + +# ./syscount -d 5 -t 10 +Tracing for 5 seconds. Top 10 only... +[ perf record: Woken up 66 times to write data ] +[ perf record: Captured and wrote 16.513 MB perf.data (~721455 samples) ] +COMM COUNT +stat 450 +perl 537 +catalina.sh 1700 +postgres 2094 +run 2362 +:6946 4764 +ps 5961 +sshd 45796 +find 61039 + +So processes named "find" called 61,039 syscalls during the 5 seconds of +tracing. + +Note that this mode writes a perf.data file. This is higher overhead for a +few reasons: + +- all data is passed from kernel to user space, which eats CPU for the memory + copy. Note that it is buffered in an efficient way by perf_events, which + wakes up and context switches only a small number of times: 66 in this case, + to hand 16 Mbytes of trace data to user space. +- data is post-processed in user space, eating more CPU. +- data is stored on the file system in the perf.data file, consuming available + storage. + +This will be improved in future kernels, but it is difficult to improve this +much further in existing kernels. For example, using a pipe to "perf script" +instead of writing perf.data can have issues with feedback loops, where +perf traces itself. This syscount version goes to lengths to avoid tracing +its own perf, but +right now with existing functionality in older kernels. The trip via perf.data +is necessary + + +Running without options shows syscalls by process name until Ctrl-C: + +# ./syscount +Tracing... Ctrl-C to end. +^C[ perf record: Woken up 39 times to write data ] +[ perf record: Captured and wrote 9.644 MB perf.data (~421335 samples) ] +COMM COUNT +apache2 8 +apacheLogParser 13 +platformservice 16 +snmpd 16 +ntpd 21 +multilog 66 +supervise 84 +dirname 102 +echo 102 +svstat 108 +cut 111 +bash 113 +grep 132 +xargs 132 +redis-server 190 +sed 192 +setuidgid 294 +stat 450 +perl 537 +catalina.sh 1275 +postgres 1736 +run 2352 +:7396 4527 +ps 5925 +sshd 20154 +find 28700 + +Note again it is writing a perf.data file to do this. + + +The -v option adds process IDs: + +# ./syscount -v +Tracing... Ctrl-C to end. +^C[ perf record: Woken up 48 times to write data ] +[ perf record: Captured and wrote 12.114 MB perf.data (~529276 samples) ] +PID COMM COUNT +3599 apacheLogParser 3 +7977 xargs 3 +7982 supervise 3 +7993 xargs 3 +3575 apache2 4 +1311 ntpd 6 +3135 postgres 6 +3600 apacheLogParser 6 +3210 platformservice 8 +6503 sshd 9 +7978 :7978 9 +7994 run 9 +7968 :7968 11 +7984 run 11 +1451 snmpd 16 +3040 svscan 17 +3066 postgres 17 +3133 postgres 24 +3134 postgres 24 +3136 postgres 24 +3061 multilog 29 +3055 supervise 30 +7979 bash 31 +7977 echo 34 +7981 dirname 34 +7993 echo 34 +7968 svstat 36 +7984 svstat 36 +7975 cut 37 +7991 cut 37 +9857 bash 37 +7967 :7967 40 +7983 run 40 +7972 :7972 41 +7976 xargs 41 +7988 run 41 +7992 xargs 41 +7969 :7969 42 +7976 :7976 42 +7985 run 42 +7992 run 42 +7973 :7973 43 +7974 :7974 43 +7989 run 43 +7990 run 43 +7973 grep 44 +7989 grep 44 +7975 :7975 45 +7991 run 45 +7970 :7970 51 +7986 run 51 +7981 catalina.sh 52 +7974 sed 64 +7990 sed 64 +3455 postgres 66 +7971 :7971 66 +7987 run 66 +7966 :7966 96 +7966 setuidgid 98 +3064 redis-server 110 +7970 stat 150 +7986 stat 150 +7969 perl 179 +7985 perl 179 +7982 run 341 +7966 catalina.sh 373 +7980 postgres 432 +7972 ps 1971 +7988 ps 1983 +9832 sshd 37511 +7979 find 51040 + +Once you've found a process ID of interest, you can use "-c" and "-p PID" to +show syscall names. This also switches to "perf stat" mode for in-kernel +counts, and lower overhead: + +# ./syscount -cp 7979 +Tracing PID 7979... Ctrl-C to end. +^CSYSCALL COUNT +brk 10 +newfstat 2171 +open 2171 +newfstatat 2175 +openat 2175 +close 4346 +fchdir 4346 +getdents 4351 +write 25482 + +So the most frequent syscall by PID 7979 was write(). diff --git a/man/man8/syscount.8 b/man/man8/syscount.8 new file mode 100644 index 0000000..69221f3 --- /dev/null +++ b/man/man8/syscount.8 @@ -0,0 +1,96 @@ +.TH syscount 8 "2014-07-07" "USER COMMANDS" +.SH NAME +syscount \- count system calls. Uses Linux perf_events. +.SH SYNOPSIS +.B syscount +[\-chv] [\-t top] {\-p PID|\-d seconds|command} +.SH DESCRIPTION +This is a proof-of-concept using perf_events capabilities for older kernel +versions, that lack custom in-kernel aggregations. Once they exist, this +script can be substantially rewritten and improved (lower overhead). +.SH REQUIREMENTS +Linux perf_events: add linux-tools-common, run "perf", then +add any additional packages it requests. Also needs awk. +.SH OPTIONS +.TP +\-c +Show counts by syscall name. This mode (without -v) uses in-kernel counts, which +have lower overhead than the default mode. +.TP +\-h +Usage message. +.TP +\-v +Verbose: include PID. +.TP +\-p PID +Trace this process ID only. +.TP +\-d seconds +Duration of trace in seconds. +.TP +command +Run and trace this command. +.SH EXAMPLES +.TP +Trace and summarize syscalls by process name: +# +.B syscount +.TP +Trace and summarize syscalls by syscall name (lower overhead): +# +.B syscount \-c +.TP +Trace for 5 seconds, showing by process name: +# +.B syscount \-d 5 +.TP +Trace PID 932 only, and show by syscall name (lower overhead): +# +.B syscount \-cp 923 +.TP +Execute the """ls""" command, and show by syscall name: +# +.B syscount -c ls +.SH FIELDS +.TP +PID +Process ID. +.TP +COMM +Process command name. +.TP +SYSCALL +Syscall name. +.TP +COUNT +Number of syscalls during tracing. +.SH OVERHEAD +Modes that report syscall names only (\-c, \-cp PID, \-cd secs) have +lower overhead, since they use in-kernel counts. Other modes which report +process IDs (\-cv) or process names (default) create a perf.data file for +post processing, and you will see messages about it doing this. Beware of +the file size (test for short durations, or use \-c to see counts based on +in-kernel counters), and gauge overheads based on the perf.data size. + +Note that this script delibrately does not pipe perf record into +perf script, which would avoid perf.data, because it can create a feedback +loop where the perf script syscalls are recorded. Hopefully there will be a +fix for this in a later perf version, so perf.data can be skipped, or other +kernel features to aggregate by process name in-kernel directly (eg, via +eBPF, ktap, or SystemTap). +.SH SOURCE +This is from the perf-tools collection. +.IP +https://github.com/brendangregg/perf-tools +.PP +Also look under the examples directory for a text file containing example +usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +iosnoop(8), iolatency(8), iostat(1) diff --git a/syscount b/syscount new file mode 100755 index 0000000..7ee2432 --- /dev/null +++ b/syscount @@ -0,0 +1,189 @@ +#!/bin/bash +# +# syscount - count system calls. +# Written using Linux perf_events (aka "perf"). +# +# This is a proof-of-concept using perf_events capabilities for older kernel +# versions, that lack custom in-kernel aggregations. Once they exist, this +# script can be substantially rewritten and improved (lower overhead). +# +# USAGE: syscount [-chv] [-t top] {-p PID|-d seconds|command} +# +# Run "syscount -h" for full usage. +# +# REQUIREMENTS: Linux perf_events: add linux-tools-common, run "perf", then +# add any additional packages it requests. Also needs awk. +# +# OVERHEADS: Modes that report syscall names only (-c, -cp PID, -cd secs) have +# lower overhead, since they use in-kernel counts. Other modes which report +# process IDs (-cv) or process names (default) create a perf.data file for +# post processing, and you will see messages about it doing this. Beware of +# the file size (test for short durations, or use -c to see counts based on +# in-kernel counters), and gauge overheads based on the perf.data size. +# +# Note that this script delibrately does not pipe perf record into +# perf script, which would avoid perf.data, because it can create a feedback +# loop where the perf script syscalls are recorded. Hopefully there will be a +# fix for this in a later perf version, so perf.data can be skipped, or other +# kernel features to aggregate by process name in-kernel directly (eg, via +# eBPF, ktap, or SystemTap). +# +# From perf-tools: https://github.com/brendangregg/perf-tools +# +# See the syscount(8) man page (in perf-tools) for more info. +# +# COPYRIGHT: Copyright (c) 2014 Brendan Gregg. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# (http://www.gnu.org/copyleft/gpl.html) +# +# 07-Jul-2014 Brendan Gregg Created this. + +# default variables +opt_count=0; opt_pid=0; opt_verbose=0; opt_cmd=0; opt_duration=0; opt_tail=0 +tnum=; pid=; duration=; cmd=; cpus=-a; opts=; tcmd=cat; ttext= +trap '' INT QUIT TERM PIPE HUP + +stdout_workaround=1 # needed for older perf versions +write_workaround=1 # needed for perf versions that trace their own writes +ulimit -n 32768 # often needed + +### parse options +while getopts cd:hp:t:v opt +do + case $opt in + c) opt_count=1 ;; + d) opt_duration=1; duration=$OPTARG ;; + p) opt_pid=1; pid=$OPTARG ;; + t) opt_tail=1; tnum=$OPTARG ;; + v) opt_verbose=1 ;; + h|?) cat <<-END >&2 + USAGE: syscount [-chv] [-t top] {-p PID|-d seconds|command} + syscount # count by process name + -c # show counts by syscall name + -h # this usage message + -v # verbose: shows PID + -p PID # trace this PID only + -d seconds # duration of trace + -t num # show top number only + command # run and trace this command + eg, + syscount # syscalls by process name + syscount -c # syscalls by syscall name + syscount -d 5 # trace for 5 seconds + syscount -cp 923 # syscall names for PID 923 + syscount -c ls # syscall names for "ls" + END + exit 1 + esac +done +shift $(( $OPTIND - 1 )) + +### option logic +if (( $# > 0 )); then + opt_cmd=1 + cmd="$@" + cpus= +fi +if (( opt_pid + opt_duration + opt_cmd > 1 )); then + echo >&2 "ERROR: Pick one of {-p PID|-n name|-d seconds|command}" + exit 1 +fi +if (( opt_tail )); then + tcmd="tail -$tnum" + ttext=" Top $tnum only." +fi +if (( opt_duration )); then + cmd="sleep $duration" + echo "Tracing for $duration seconds.$ttext.." +fi +if (( opt_pid )); then + cpus= + cmd="-p $pid" + echo "Tracing PID $pid.$ttext.. Ctrl-C to end." +fi +(( opt_cmd )) && echo "Tracing while running: \"$cmd\".$ttext.." +(( opt_pid + opt_duration + opt_cmd == 0 )) && \ + echo "Tracing.$ttext.. Ctrl-C to end." +(( stdout_workaround )) && opts="-o /dev/stdout" + +### execute syscall name mode +if (( opt_count && ! opt_verbose )); then + : ${cmd:=sleep 999999} + out=$(perf stat $opts -e 'syscalls:sys_enter_*' $cpus $cmd) + printf "%-17s %8s\n" "SYSCALL" "COUNT" + echo "$out" | awk ' + $1 && $2 ~ /syscalls:/ { + sub("syscalls:sys_enter_", ""); sub(":", "") + gsub(",", "") + printf "%-17s %8s\n", $2, $1 + }' | sort -n -k2 | $tcmd + exit +fi + +### execute syscall name with pid mode +if (( opt_count && opt_verbose )); then + if (( write_workaround )); then + # this list must end in write to associate the filter + tp=$(perf list syscalls:sys_enter_* | awk ' + $1 != "syscalls:sys_enter_write" { printf "-e %s ", $1 }') + tp="$tp -e syscalls:sys_enter_write" + sh -c "perf record $tp --filter 'common_pid != '\$\$ $cpus $cmd" + else + perf record 'syscalls:sys_enter_*' $cpus $cmd + # could also pipe direct to perf script + fi + + printf "%-6s %-16s %-17s %8s\n" "PID" "COMM" "SYSCALL" "COUNT" + perf script -f pid,comm,event | awk '$1 != "#" { + sub("sys_enter_", ""); sub(":", "") + a[$1 ";" $2 ";" $3]++ + } + END { + for (k in a) { + split(k, b, ";"); + printf "%-6s %-16s %-17s %8d\n", b[2], b[1], b[3], a[k] + } + }' | sort -n -k4 | $tcmd + exit +fi + +### execute process name mode +tp="-e raw_syscalls:sys_enter" +if (( write_workaround )); then + sh -c "perf record $tp --filter 'common_pid != '\$\$ $cpus $cmd" +else + perf record $tp $cpus $cmd +fi + +if (( opt_verbose )); then + printf "%-6s %-16s %8s\n" "PID" "COMM" "COUNT" + perf script -f pid,comm | awk '$1 != "#" { a[$1 ";" $2]++ } + END { + for (k in a) { + split(k, b, ";"); + printf "%-6s %-16s %8d\n", b[2], b[1], a[k] + } + }' | sort -n -k3 | $tcmd +else + printf "%-16s %8s\n" "COMM" "COUNT" + perf script -f comm | awk '$1 != "#" { a[$1]++ } + END { + for (k in a) { + printf "%-16s %8d\n", k, a[k] + } + }' | sort -n -k2 | $tcmd +fi