2014-07-21 19:29:00 -07:00
|
|
|
#!/bin/bash
|
|
|
|
|
#
|
|
|
|
|
# iolatency - summarize block device I/O latency as a histogram.
|
|
|
|
|
# Written using Linux ftrace.
|
|
|
|
|
#
|
|
|
|
|
# This shows the distribution of latency, allowing modes and latency outliers
|
|
|
|
|
# to be identified and studied.
|
|
|
|
|
#
|
2014-07-23 17:44:52 -07:00
|
|
|
# USAGE: ./iolatency [-hQT] [-d device] [-i iotype] [interval [count]]
|
2014-07-21 19:29:00 -07:00
|
|
|
#
|
|
|
|
|
# REQUIREMENTS: FTRACE CONFIG and block:block_rq_* tracepoints, which you may
|
|
|
|
|
# already have on recent kernels.
|
|
|
|
|
#
|
|
|
|
|
# OVERHEAD: block device I/O issue and completion events are traced and buffered
|
|
|
|
|
# in-kernel, then processed and summarized in user space. There may be
|
|
|
|
|
# measurable overhead with this approach, relative to the block device IOPS.
|
|
|
|
|
#
|
|
|
|
|
# This was written as a proof of concept for ftrace.
|
|
|
|
|
#
|
|
|
|
|
# From perf-tools: https://github.com/brendangregg/perf-tools
|
|
|
|
|
#
|
|
|
|
|
# COPYRIGHT: Copyright (c) 2014 Brendan Gregg.
|
|
|
|
|
#
|
|
|
|
|
# This program is free software; you can redistribute it and/or
|
|
|
|
|
# modify it under the terms of the GNU General Public License
|
|
|
|
|
# as published by the Free Software Foundation; either version 2
|
|
|
|
|
# of the License, or (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
|
# along with this program; if not, write to the Free Software Foundation,
|
|
|
|
|
# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
|
|
|
#
|
|
|
|
|
# (http://www.gnu.org/copyleft/gpl.html)
|
|
|
|
|
#
|
|
|
|
|
# 20-Jul-2014 Brendan Gregg Created this.
|
|
|
|
|
|
|
|
|
|
### default variables
|
|
|
|
|
tracing=/sys/kernel/debug/tracing
|
|
|
|
|
flock=/var/tmp/.ftrace-lock
|
|
|
|
|
bufsize_kb=4096
|
|
|
|
|
opt_device=0; device=; opt_iotype=0; iotype=; opt_timestamp=0
|
2014-07-23 17:44:52 -07:00
|
|
|
opt_interval=0; interval=1; opt_count=0; count=0; opt_queue=0
|
2014-07-24 16:00:14 -07:00
|
|
|
trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section
|
2014-07-21 19:29:00 -07:00
|
|
|
|
|
|
|
|
function usage {
|
|
|
|
|
cat <<-END >&2
|
2014-07-23 17:44:52 -07:00
|
|
|
USAGE: iolatency [-hQT] [-d device] [-i iotype] [interval [count]]
|
2014-07-21 19:29:00 -07:00
|
|
|
-d device # device string (eg, "202,1)
|
|
|
|
|
-i iotype # match type (eg, '*R*' for all reads)
|
2014-07-23 17:44:52 -07:00
|
|
|
-Q # use queue insert as start time
|
2014-07-21 19:29:00 -07:00
|
|
|
-T # timestamp on output
|
|
|
|
|
-h # this usage message
|
|
|
|
|
interval # summary interval, seconds (default 1)
|
|
|
|
|
count # number of summaries
|
|
|
|
|
eg,
|
2014-07-23 19:21:22 -07:00
|
|
|
iolatency # summarize latency every second
|
|
|
|
|
iolatency -Q # include block I/O queue time
|
|
|
|
|
iolatency 5 2 # 2 x 5 second summaries
|
|
|
|
|
iolatency -i '*R*' # trace reads
|
|
|
|
|
iolatency -d 202,1 # trace device 202,1 only
|
2014-07-23 17:44:52 -07:00
|
|
|
|
|
|
|
|
See the man page and example file for more info.
|
2014-07-21 19:29:00 -07:00
|
|
|
END
|
|
|
|
|
exit
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function warn {
|
|
|
|
|
if ! eval "$@"; then
|
|
|
|
|
echo >&2 "WARNING: command failed \"$@\""
|
|
|
|
|
fi
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function end {
|
|
|
|
|
# disable tracing
|
|
|
|
|
echo 2>/dev/null
|
|
|
|
|
echo "Ending tracing..." 2>/dev/null
|
|
|
|
|
cd $tracing
|
2014-07-23 17:44:52 -07:00
|
|
|
warn "echo 0 > events/block/$b_start/enable"
|
2014-07-21 19:29:00 -07:00
|
|
|
warn "echo 0 > events/block/block_rq_complete/enable"
|
|
|
|
|
if (( opt_device || opt_iotype )); then
|
2014-07-23 17:44:52 -07:00
|
|
|
warn "echo 0 > events/block/$b_start/filter"
|
2014-07-21 19:29:00 -07:00
|
|
|
warn "echo 0 > events/block/block_rq_complete/filter"
|
|
|
|
|
fi
|
|
|
|
|
warn "echo > trace"
|
|
|
|
|
(( wroteflock )) && warn "rm $flock"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function die {
|
|
|
|
|
echo >&2 "$@"
|
|
|
|
|
exit 1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function edie {
|
|
|
|
|
# die with a quiet end()
|
|
|
|
|
echo >&2 "$@"
|
|
|
|
|
exec >/dev/null 2>&1
|
|
|
|
|
end
|
|
|
|
|
exit 1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
### process options
|
2014-07-23 17:44:52 -07:00
|
|
|
while getopts d:hi:QT opt
|
2014-07-21 19:29:00 -07:00
|
|
|
do
|
|
|
|
|
case $opt in
|
|
|
|
|
d) opt_device=1; device=$OPTARG ;;
|
|
|
|
|
i) opt_iotype=1; iotype=$OPTARG ;;
|
2014-07-23 17:44:52 -07:00
|
|
|
Q) opt_queue=1 ;;
|
2014-07-21 19:29:00 -07:00
|
|
|
T) opt_timestamp=1 ;;
|
|
|
|
|
h|?) usage ;;
|
|
|
|
|
esac
|
|
|
|
|
done
|
|
|
|
|
shift $(( $OPTIND - 1 ))
|
|
|
|
|
if (( $# )); then
|
|
|
|
|
opt_interval=1
|
|
|
|
|
interval=$1
|
|
|
|
|
shift
|
|
|
|
|
fi
|
|
|
|
|
if (( $# )); then
|
|
|
|
|
opt_count=1
|
|
|
|
|
count=$1
|
|
|
|
|
fi
|
|
|
|
|
if (( opt_device )); then
|
|
|
|
|
major=${device%,*}
|
|
|
|
|
minor=${device#*,}
|
|
|
|
|
dev=$(( (major << 20) + minor ))
|
|
|
|
|
fi
|
2014-07-23 17:44:52 -07:00
|
|
|
if (( opt_queue )); then
|
|
|
|
|
b_start=block_rq_insert
|
|
|
|
|
else
|
|
|
|
|
b_start=block_rq_issue
|
|
|
|
|
fi
|
2014-07-21 19:29:00 -07:00
|
|
|
|
|
|
|
|
### select awk
|
|
|
|
|
[[ -x /usr/bin/mawk ]] && awk='mawk -W interactive' || awk=awk
|
|
|
|
|
|
|
|
|
|
### check permissions
|
|
|
|
|
cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE?
|
|
|
|
|
debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)"
|
|
|
|
|
|
|
|
|
|
### ftrace lock
|
|
|
|
|
[[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock"
|
|
|
|
|
echo $$ > $flock || die "ERROR: unable to write $flock."
|
|
|
|
|
wroteflock=1
|
|
|
|
|
|
|
|
|
|
### setup and begin tracing
|
|
|
|
|
warn "echo nop > current_tracer"
|
|
|
|
|
warn "echo $bufsize_kb > buffer_size_kb"
|
|
|
|
|
filter=
|
|
|
|
|
if (( opt_iotype )); then
|
|
|
|
|
filter="rwbs ~ \"$iotype\""
|
|
|
|
|
fi
|
|
|
|
|
if (( opt_device )); then
|
|
|
|
|
[[ "$filter" != "" ]] && filter="$filter && "
|
|
|
|
|
filter="${filter}dev == $dev"
|
|
|
|
|
fi
|
|
|
|
|
if (( opt_iotype || opt_device )); then
|
2014-07-23 17:44:52 -07:00
|
|
|
if ! echo "$filter" > events/block/$b_start/filter || \
|
2014-07-21 19:29:00 -07:00
|
|
|
! echo "$filter" > events/block/block_rq_complete/filter
|
|
|
|
|
then
|
|
|
|
|
edie "ERROR: setting -d or -t filter. Exiting."
|
|
|
|
|
fi
|
|
|
|
|
fi
|
2014-07-23 17:44:52 -07:00
|
|
|
if ! echo 1 > events/block/$b_start/enable || \
|
2014-07-21 19:29:00 -07:00
|
|
|
! echo 1 > events/block/block_rq_complete/enable; then
|
|
|
|
|
edie "ERROR: enabling block I/O tracepoints. Exiting."
|
|
|
|
|
fi
|
|
|
|
|
etext=
|
|
|
|
|
(( !opt_count )) && etext=" Ctrl-C to end."
|
|
|
|
|
echo "Tracing block I/O. Output every $interval seconds.$etext"
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
|
# Determine output format. It may be one of the following (newest first):
|
|
|
|
|
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
|
|
|
|
|
# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
|
|
# To differentiate between them, the number of header fields is counted,
|
|
|
|
|
# and an offset set, to skip the extra column when needed.
|
|
|
|
|
#
|
|
|
|
|
offset=$($awk 'BEGIN { o = 0; }
|
|
|
|
|
$1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; }
|
|
|
|
|
$2 ~ /TASK/ { print o; exit }' trace)
|
|
|
|
|
|
|
|
|
|
### print trace buffer
|
|
|
|
|
warn "echo > trace"
|
|
|
|
|
i=0
|
|
|
|
|
while (( !opt_count || (i < count) )); do
|
|
|
|
|
(( i++ ))
|
|
|
|
|
sleep $interval
|
|
|
|
|
|
|
|
|
|
# snapshots were added in 3.10
|
|
|
|
|
if [[ -x snapshot ]]; then
|
|
|
|
|
echo 1 > snapshot
|
|
|
|
|
echo > trace
|
|
|
|
|
cat snapshot
|
|
|
|
|
else
|
|
|
|
|
cat trace
|
|
|
|
|
echo > trace
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
(( opt_timestamp )) && printf "time %(%H:%M:%S)T:\n" -1
|
|
|
|
|
echo "tick"
|
|
|
|
|
done | \
|
2014-07-23 17:44:52 -07:00
|
|
|
$awk -v o=$offset -v opt_timestamp=$opt_timestamp -v b_start=$b_start '
|
2014-07-21 19:29:00 -07:00
|
|
|
function star(sval, smax, swidth) {
|
|
|
|
|
stars = ""
|
|
|
|
|
if (smax == 0) return ""
|
|
|
|
|
for (si = 0; si < (swidth * sval / smax); si++) {
|
|
|
|
|
stars = stars "#"
|
|
|
|
|
}
|
|
|
|
|
return stars
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
BEGIN { max_i = 0 }
|
|
|
|
|
|
|
|
|
|
# common fields
|
|
|
|
|
$1 != "#" {
|
|
|
|
|
time = $(3+o); sub(":", "", time)
|
|
|
|
|
dev = $(5+o)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# block I/O request
|
2014-07-23 17:44:52 -07:00
|
|
|
$1 != "#" && $0 ~ b_start {
|
2014-07-21 19:29:00 -07:00
|
|
|
#
|
|
|
|
|
# example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar]
|
|
|
|
|
# The cmd field "()" might contain multiple words (hex),
|
|
|
|
|
# hence stepping from the right (NF-3).
|
|
|
|
|
#
|
|
|
|
|
loc = $(NF-3)
|
|
|
|
|
starts[dev, loc] = time
|
|
|
|
|
next
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# block I/O completion
|
|
|
|
|
$1 != "#" && $0 ~ /rq_complete/ {
|
|
|
|
|
#
|
|
|
|
|
# example: (fields1..4+o) 202,1 W () 12862256 + 8 [0]
|
|
|
|
|
#
|
|
|
|
|
dir = $(6+o)
|
|
|
|
|
loc = $(NF-3)
|
|
|
|
|
|
|
|
|
|
if (starts[dev, loc] > 0) {
|
|
|
|
|
latency_ms = 1000 * (time - starts[dev, loc])
|
|
|
|
|
i = 0
|
|
|
|
|
for (ms = 1; latency_ms > ms; ms *= 2) { i++ }
|
|
|
|
|
hist[i]++
|
|
|
|
|
if (i > max_i)
|
|
|
|
|
max_i = i
|
|
|
|
|
delete starts[dev, loc]
|
|
|
|
|
}
|
|
|
|
|
next
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# timestamp
|
|
|
|
|
$1 == "time" {
|
|
|
|
|
lasttime = $2
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# print summary
|
|
|
|
|
$1 == "tick" {
|
|
|
|
|
print ""
|
|
|
|
|
if (opt_timestamp)
|
|
|
|
|
print lasttime
|
|
|
|
|
|
|
|
|
|
# find max value
|
|
|
|
|
max_v = 0
|
|
|
|
|
for (i = 0; i <= max_i; i++) {
|
|
|
|
|
if (hist[i] > max_v)
|
|
|
|
|
max_v = hist[i]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# print histogram
|
|
|
|
|
printf "%8s .. %-8s: %-8s |%-38s|\n", ">=(ms)", "<(ms)",
|
|
|
|
|
"I/O", "Distribution"
|
|
|
|
|
ms = 1
|
|
|
|
|
from = 0
|
|
|
|
|
for (i = 0; i <= max_i; i++) {
|
|
|
|
|
printf "%8d -> %-8d: %-8d |%-38s|\n", from, ms,
|
|
|
|
|
hist[i], star(hist[i], max_v, 38)
|
|
|
|
|
from = ms
|
|
|
|
|
ms *= 2
|
|
|
|
|
}
|
|
|
|
|
fflush()
|
|
|
|
|
delete hist
|
|
|
|
|
delete starts # invalid if events missed between snapshots
|
|
|
|
|
max_i = 0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" }
|
|
|
|
|
'
|
|
|
|
|
|
|
|
|
|
### end tracing
|
|
|
|
|
end
|