- 2056a782f8e7 [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23
- v2.6.17-rc1 include/linux/blktrace_api.h
- v2.6.17-rc1 block/blktrace.c
blktrace.c 实现了 message 的填充(包括 message header 的构造 - struct blk_io_trace)并通过 relayfs 传递 message 到 userspace;blktrace_api 封装对 __blk_add_trace 的调用,Block IO layer 通过调用这些 APIs 实时将 trace 信息发送到 userspace
struct blk_io_trace - 每条 message 的 header
一条 message 包含 一个 header struct blk_io_trace
+ pdu
#define BLK_IO_TRACE_MAGIC 0x65617400
#define BLK_IO_TRACE_VERSION 0x07
struct blk_io_trace {
u32 magic; /* MAGIC << 8 | version */
u32 sequence; /* event number */
u64 time; /* in microseconds */
u64 sector; /* disk offset */
u32 bytes; /* transfer length */
u32 action; /* what happened */
u32 pid; /* who did it */
u32 device; /* device number */
u32 cpu; /* on what cpu did it happen */
u16 error; /* completion error */
u16 pdu_len; /* length of data after this trace */
};
struct blk_trace - 记录在 request_queue->blk_trace
request_queue->blk_trace 为 NULL 表示没有 trace block IO;request_queue->blk_trace 不为 NULL 看 blk_trace->trace_state 的状态
enum {
Blktrace_setup = 1,
Blktrace_running,
Blktrace_stopped,
};
struct blk_trace {
int trace_state;
struct rchan *rchan; 使用 relay 消息传递机制
unsigned long *sequence;
u16 act_mask;
u64 start_lba;
u64 end_lba;
u32 pid;
u32 dev;
struct dentry *dir;
struct dentry *dropped_file;
atomic_t dropped;
};
Trace API
Trace targets: action against a request
、action against a bio
、unplug action with an integer payload
、remap/split bio against dm- or md- devices
、generic
blk_add_trace_rq - Add a trace for a request oriented action
@q: queue the io is for
@rq: the source request
@what: the action
Records an action against a request. Will log the bio offset + size.
blk_add_trace_bio - Add a trace for a bio oriented action
@q: queue the io is for
@bio: the source bio
@what: the action
Records an action against a bio. Will log the bio offset + size.
blk_add_trace_generic - Add a trace for a generic action
@q: queue the io is for
@bio: the source bio
@rw: the data direction
@what: the action
Records a simple trace
blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
@q: queue the io is for
@what: the action
@bio: the source bio
@pdu: the integer payload
Adds a trace with some integer payload. This might be an unplug
option given as the action, with the depth at unplug time given
as the payload
blk_add_trace_remap - Add a trace for a remap operation
@q: queue the io is for
@bio: the source bio
@dev: target device
@from: source sector
@to: target sector
Device mapper or raid target sometimes need to split a bio because
it spans a stripe (or similar). Add a trace for that action.
trace categories
enum blktrace_cat {
BLK_TC_READ = 1 << 0, /* reads */
BLK_TC_WRITE = 1 << 1, /* writes */
BLK_TC_BARRIER = 1 << 2, /* barrier */
BLK_TC_SYNC = 1 << 3, /* barrier */
BLK_TC_QUEUE = 1 << 4, /* queueing/merging */
BLK_TC_REQUEUE = 1 << 5, /* requeueing */
BLK_TC_ISSUE = 1 << 6, /* issue */
BLK_TC_COMPLETE = 1 << 7, /* completions */
BLK_TC_FS = 1 << 8, /* fs requests */
BLK_TC_PC = 1 << 9, /* pc requests */
BLK_TC_NOTIFY = 1 << 10, /* special message */
BLK_TC_END = 1 << 15, /* only 16-bits, reminder */
};
trace actions
#define BLK_TC_SHIFT (16)
#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT)
enum blktrace_act {
__BLK_TA_QUEUE = 1, /* queued */
__BLK_TA_BACKMERGE, /* back merged to existing rq */
__BLK_TA_FRONTMERGE, /* front merge to existing rq */
__BLK_TA_GETRQ, /* allocated new request */
__BLK_TA_SLEEPRQ, /* sleeping on rq allocation */
__BLK_TA_REQUEUE, /* request requeued */
__BLK_TA_ISSUE, /* sent to driver */
__BLK_TA_COMPLETE, /* completed by driver */
__BLK_TA_PLUG, /* queue was plugged */
__BLK_TA_UNPLUG_IO, /* queue was unplugged by io */
__BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */
__BLK_TA_INSERT, /* insert request */
__BLK_TA_SPLIT, /* bio was split */
__BLK_TA_BOUNCE, /* bio was bounced */
__BLK_TA_REMAP, /* bio was remapped */
};
/*
* Trace actions in full. Additionally, read or write is masked
*/
#define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
#define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
#define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
#define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_SPLIT (__BLK_TA_SPLIT)
#define BLK_TA_BOUNCE (__BLK_TA_BOUNCE)
#define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
struct blk_io_trace_remap - the remap event
struct blk_io_trace_remap {
u32 device;
u32 __pad;
u64 sector;
};
__blk_add_trace - 生成一条 message 描述 trace block IO 信息
/*
* The worker for the various blk_add_trace*() types. Fills out a
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
struct task_struct *tsk = current;
struct blk_io_trace *t;
unsigned long flags;
unsigned long *sequence;
pid_t pid;
int cpu;
if (unlikely(bt->trace_state != Blktrace_running))
return;
what |= ddir_act[rw & WRITE];
what |= bio_act[trace_barrier_bit(rw)];
what |= bio_act[trace_sync_bit(rw)];
pid = tsk->pid;
if (unlikely(act_log_check(bt, what, sector, pid)))
return;
/*
* A word about the locking here - we disable interrupts to reserve
* some space in the relay per-cpu buffer, to prevent an irq
* from coming in and stepping on our toes. Once reserved, it's
* enough to get preemption disabled to prevent read of this data
* before we are through filling it. get_cpu()/put_cpu() does this
* for us
*/
local_irq_save(flags);
if (unlikely(tsk->btrace_seq != blktrace_seq))
trace_note_tsk(bt, tsk);
t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
// 一条 message 包含
// 一个 header struct blk_io_trace
// pdu
if (t) {
cpu = smp_processor_id();
sequence = per_cpu_ptr(bt->sequence, cpu);
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->sequence = ++(*sequence);
t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
t->sector = sector;
t->bytes = bytes;
t->action = what;
t->pid = pid;
t->device = bt->dev;
t->cpu = cpu;
t->error = error;
t->pdu_len = pdu_len;
if (pdu_len)
memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
}
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__blk_add_trace);
IOCTL
struct blk_user_trace_setup - user setup structure passed with BLKTRACESTART
struct blk_user_trace_setup {
char name[BDEVNAME_SIZE]; /* output */
u16 act_mask; /* input */
u32 buf_size; /* input */
u32 buf_nr; /* input */
u64 start_lba;
u64 end_lba;
u32 pid;
};
blk_trace_ioctl
#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
#define BLKTRACESTART _IO(0x12,116)
#define BLKTRACESTOP _IO(0x12,117)
#define BLKTRACETEARDOWN _IO(0x12,118)
/**
* blk_trace_ioctl: - handle the ioctls associated with tracing
* @bdev: the block device
* @cmd: the ioctl cmd
* @arg: the argument data, if any
*
**/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
request_queue_t *q;
int ret, start = 0;
q = bdev_get_queue(bdev);
if (!q)
return -ENXIO;
mutex_lock(&bdev->bd_mutex);
switch (cmd) {
case BLKTRACESETUP:
ret = blk_trace_setup(q, bdev, arg);
break; // Setup everything required to start tracing
case BLKTRACESTART:
start = 1; // 注意这里会 fallthrough
case BLKTRACESTOP:
ret = blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
ret = blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
mutex_unlock(&bdev->bd_mutex);
return ret;
}