blktrace - early version of the kernel part

blktrace.c implements filling in the trace messages (including construction of the message header, struct blk_io_trace) and passes them to userspace via relayfs; blktrace_api wraps the calls to __blk_add_trace, and the block I/O layer calls these APIs to stream trace information to userspace in real time.

struct blk_io_trace - the header of every message

A message consists of one header (struct blk_io_trace) plus a pdu.

    #define BLK_IO_TRACE_MAGIC  0x65617400
    #define BLK_IO_TRACE_VERSION    0x07

    struct blk_io_trace {
        u32 magic;      /* MAGIC << 8 | version */
        u32 sequence;   /* event number */
        u64 time;       /* in microseconds */
        u64 sector;     /* disk offset */
        u32 bytes;      /* transfer length */
        u32 action;     /* what happened */
        u32 pid;        /* who did it */
        u32 device;     /* device number */
        u32 cpu;        /* on what cpu did it happen */
        u16 error;      /* completion error */
        u16 pdu_len;    /* length of data after this trace */
    };
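
As an aside, a minimal userspace sketch of how a consumer might walk one relay sub-buffer: each record is a struct blk_io_trace immediately followed by pdu_len bytes of payload. The mirrored struct layout and the parse_one() helper are assumptions for illustration, not part of the kernel code.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BLK_IO_TRACE_MAGIC  0x65617400

    /* userspace mirror of the kernel header above (assumed identical layout) */
    struct blk_io_trace_rec {
        uint32_t magic, sequence;
        uint64_t time, sector;
        uint32_t bytes, action, pid, device, cpu;
        uint16_t error, pdu_len;
    };

    /* parse one record at p, return bytes consumed (header + pdu), 0 on bad magic */
    static size_t parse_one(const void *p)
    {
        struct blk_io_trace_rec t;

        memcpy(&t, p, sizeof(t));
        if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC)
            return 0;

        printf("seq=%u sector=%llu bytes=%u action=0x%x pdu_len=%u\n",
               t.sequence, (unsigned long long)t.sector,
               t.bytes, t.action, t.pdu_len);

        return sizeof(t) + t.pdu_len;   /* the pdu follows the header directly */
    }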

struct blk_trace - stored in request_queue->blk_trace

A NULL request_queue->blk_trace means block I/O on this queue is not being traced; when it is non-NULL, blk_trace->trace_state gives the current tracing state.

    enum {
        Blktrace_setup = 1,
        Blktrace_running,
        Blktrace_stopped,
    };

    struct blk_trace {
        int trace_state;
        struct rchan *rchan;            /* relay channel used to pass messages to userspace */
        unsigned long *sequence;
        u16 act_mask;
        u64 start_lba;
        u64 end_lba;
        u32 pid;
        u32 dev;
        struct dentry *dir;
        struct dentry *dropped_file;
        atomic_t dropped;
    };
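
A hedged sketch of how a tracing hook can combine the two checks described above; the helper name blk_trace_active() is invented for illustration:

    /* illustration only: decide whether this queue should emit trace messages */
    static inline int blk_trace_active(struct request_queue *q)
    {
        struct blk_trace *bt = q->blk_trace;

        if (!bt)                /* no blk_trace attached: not traced at all */
            return 0;

        /* attached, but messages are emitted only while running */
        return bt->trace_state == Blktrace_running;
    }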

Trace API

Trace targets: an action against a request, an action against a bio, an unplug action with an integer payload, a remap/split of a bio on dm or md devices, and a generic action.

    blk_add_trace_rq - Add a trace for a request oriented action
            @q:     queue the io is for
            @rq:    the source request
            @what:  the action
        Records an action against a request. Will log the bio offset + size.


    blk_add_trace_bio - Add a trace for a bio oriented action
            @q:     queue the io is for
            @bio:   the source bio
            @what:  the action
        Records an action against a bio. Will log the bio offset + size.


    blk_add_trace_generic - Add a trace for a generic action
            @q:     queue the io is for
            @bio:   the source bio
            @rw:    the data direction
            @what:  the action
        Records a simple trace


    blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
            @q:     queue the io is for
            @what:  the action
            @bio:   the source bio
            @pdu:   the integer payload
        Adds a trace with some integer payload. This might be an unplug
        option given as the action, with the depth at unplug time given
        as the payload


    blk_add_trace_remap - Add a trace for a remap operation
            @q:     queue the io is for
            @bio:   the source bio
            @dev:   target device
            @from:  source sector
            @to:    target sector
        Device mapper or raid target sometimes need to split a bio because
        it spans a stripe (or similar). Add a trace for that action.
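
For example, a call site in the block layer would look roughly like the sketch below; the surrounding function is hypothetical, only the blk_add_trace_bio() call reflects the API described above:

    /* hypothetical hook: record a bio at the moment it is queued */
    static void example_queue_bio(struct request_queue *q, struct bio *bio)
    {
        /* logs the bio's offset + size with the QUEUE action */
        blk_add_trace_bio(q, bio, BLK_TA_QUEUE);

        /* ... actual queueing work would follow ... */
    }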

trace categories

    enum blktrace_cat {
        BLK_TC_READ     = 1 << 0,   /* reads */
        BLK_TC_WRITE    = 1 << 1,   /* writes */
        BLK_TC_BARRIER  = 1 << 2,   /* barrier */
        BLK_TC_SYNC     = 1 << 3,   /* sync */
        BLK_TC_QUEUE    = 1 << 4,   /* queueing/merging */
        BLK_TC_REQUEUE  = 1 << 5,   /* requeueing */
        BLK_TC_ISSUE    = 1 << 6,   /* issue */
        BLK_TC_COMPLETE = 1 << 7,   /* completions */
        BLK_TC_FS       = 1 << 8,   /* fs requests */
        BLK_TC_PC       = 1 << 9,   /* pc requests */
        BLK_TC_NOTIFY   = 1 << 10,  /* special message */

        BLK_TC_END      = 1 << 15,  /* only 16-bits, reminder */
    };
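
These bits form the act_mask filter in struct blk_trace (and in the ioctl setup structure below). A sketch of how such a mask is built and checked, in the spirit of the act_log_check() call seen later in __blk_add_trace(); the helper name is invented:

    /* an event passes the filter if any of its category bits (stored in the
     * upper 16 bits of the action word, see BLK_TC_ACT below) are enabled */
    static int event_passes(u32 what, u16 act_mask)
    {
        return (what >> BLK_TC_SHIFT) & act_mask;
    }

    /* example: trace only issue/completion events for reads and writes */
    u16 example_mask = BLK_TC_READ | BLK_TC_WRITE |
                       BLK_TC_ISSUE | BLK_TC_COMPLETE;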

trace actions

    #define BLK_TC_SHIFT        (16)
    #define BLK_TC_ACT(act)     ((act) << BLK_TC_SHIFT)

    enum blktrace_act {
        __BLK_TA_QUEUE = 1,     /* queued */
        __BLK_TA_BACKMERGE,     /* back merged to existing rq */
        __BLK_TA_FRONTMERGE,    /* front merge to existing rq */
        __BLK_TA_GETRQ,         /* allocated new request */
        __BLK_TA_SLEEPRQ,       /* sleeping on rq allocation */
        __BLK_TA_REQUEUE,       /* request requeued */
        __BLK_TA_ISSUE,         /* sent to driver */
        __BLK_TA_COMPLETE,      /* completed by driver */
        __BLK_TA_PLUG,          /* queue was plugged */
        __BLK_TA_UNPLUG_IO,     /* queue was unplugged by io */
        __BLK_TA_UNPLUG_TIMER,  /* queue was unplugged by timer */
        __BLK_TA_INSERT,        /* insert request */
        __BLK_TA_SPLIT,         /* bio was split */
        __BLK_TA_BOUNCE,        /* bio was bounced */
        __BLK_TA_REMAP,         /* bio was remapped */
    };

    /*
     * Trace actions in full. Additionally, read or write is masked
     */
    #define BLK_TA_QUEUE        (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_BACKMERGE    (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_FRONTMERGE   (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_GETRQ        (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_SLEEPRQ      (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_REQUEUE      (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
    #define BLK_TA_ISSUE        (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
    #define BLK_TA_COMPLETE     (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
    #define BLK_TA_PLUG         (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_UNPLUG_IO    (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_INSERT       (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_SPLIT        (__BLK_TA_SPLIT)
    #define BLK_TA_BOUNCE       (__BLK_TA_BOUNCE)
    #define BLK_TA_REMAP        (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
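
Given this encoding - action code in the low 16 bits, category bits in the high 16 bits - a consumer can split a record's action word as sketched below (decode_action() is an invented helper name):

    /* split the action word of a record into its two halves */
    static void decode_action(u32 what, u16 *act, u16 *cats)
    {
        *act  = what & ((1u << BLK_TC_SHIFT) - 1);  /* e.g. __BLK_TA_ISSUE */
        *cats = what >> BLK_TC_SHIFT;               /* e.g. BLK_TC_ISSUE plus the
                                                       read/write/barrier/sync bits
                                                       OR'd in by __blk_add_trace */
    }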

struct blk_io_trace_remap - the remap event

    struct blk_io_trace_remap {
        u32 device;
        u32 __pad;
        u64 sector;
    };
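
A sketch of how the remap wrapper presumably packs this structure as the pdu of a BLK_TA_REMAP event before calling __blk_add_trace(); the function name is invented and the endianness handling of the real kernel code is omitted:

    /* illustration: the target device/sector travel in the pdu, while the
     * original sector goes into the trace header itself */
    static void example_trace_remap(struct blk_trace *bt, struct bio *bio,
                                    u32 dev, u64 from, u64 to)
    {
        struct blk_io_trace_remap r = {
            .device = dev,      /* device the bio was remapped to */
            .sector = to,       /* sector on that device */
        };

        __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
                        0 /* error omitted for brevity */, sizeof(r), &r);
    }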

__blk_add_trace - builds one message describing a traced block I/O event

    /*
     * The worker for the various blk_add_trace*() types. Fills out a
     * blk_io_trace structure and places it in a per-cpu subbuffer.
     */
    void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
                 int rw, u32 what, int error, int pdu_len, void *pdu_data)
    {
        struct task_struct *tsk = current;
        struct blk_io_trace *t;
        unsigned long flags;
        unsigned long *sequence;
        pid_t pid;
        int cpu;

        if (unlikely(bt->trace_state != Blktrace_running))
            return;

        what |= ddir_act[rw & WRITE];
        what |= bio_act[trace_barrier_bit(rw)];
        what |= bio_act[trace_sync_bit(rw)];

        pid = tsk->pid;
        if (unlikely(act_log_check(bt, what, sector, pid)))
            return;

        /*
         * A word about the locking here - we disable interrupts to reserve
         * some space in the relay per-cpu buffer, to prevent an irq
         * from coming in and stepping on our toes. Once reserved, it's
         * enough to get preemption disabled to prevent read of this data
         * before we are through filling it. get_cpu()/put_cpu() does this
         * for us
         */
        local_irq_save(flags);

        if (unlikely(tsk->btrace_seq != blktrace_seq))
            trace_note_tsk(bt, tsk);

        t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
                    // a message consists of
                    //          a header (struct blk_io_trace)
                    //          plus the pdu
        if (t) {
            cpu = smp_processor_id();
            sequence = per_cpu_ptr(bt->sequence, cpu);

            t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
            t->sequence = ++(*sequence);
            t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
            t->sector = sector;
            t->bytes = bytes;
            t->action = what;
            t->pid = pid;
            t->device = bt->dev;
            t->cpu = cpu;
            t->error = error;
            t->pdu_len = pdu_len;

            if (pdu_len)
                memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
        }

        local_irq_restore(flags);
    }
    EXPORT_SYMBOL_GPL(__blk_add_trace);
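
The blk_add_trace_*() wrappers listed earlier all funnel into this worker. A hedged sketch of what the bio-oriented wrapper looks like; details such as how the error value is derived may differ in the real blktrace_api code:

    /* sketch: bail out if no blk_trace is attached, otherwise forward the
     * bio's offset, size and direction to __blk_add_trace() with no pdu */
    static inline void blk_add_trace_bio(struct request_queue *q,
                                         struct bio *bio, u32 what)
    {
        struct blk_trace *bt = q->blk_trace;

        if (likely(!bt))
            return;

        __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
                        what, 0 /* error derivation omitted */, 0, NULL);
    }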

IOCTL

struct blk_user_trace_setup - user setup structure passed with the BLKTRACESETUP ioctl

    struct blk_user_trace_setup {
        char name[BDEVNAME_SIZE];   /* output */
        u16 act_mask;               /* input */
        u32 buf_size;               /* input */
        u32 buf_nr;                 /* input */
        u64 start_lba;
        u64 end_lba;
        u32 pid;
    };

blk_trace_ioctl

    #define BLKTRACESETUP       _IOWR(0x12,115,struct blk_user_trace_setup)
    #define BLKTRACESTART       _IO(0x12,116)
    #define BLKTRACESTOP        _IO(0x12,117)
    #define BLKTRACETEARDOWN    _IO(0x12,118)

    /**
     * blk_trace_ioctl: - handle the ioctls associated with tracing
     * @bdev:   the block device
     * @cmd:    the ioctl cmd
     * @arg:    the argument data, if any
     *
     **/
    int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
    {
        request_queue_t *q;
        int ret, start = 0;

        q = bdev_get_queue(bdev);
        if (!q)
            return -ENXIO;

        mutex_lock(&bdev->bd_mutex);

        switch (cmd) {
        case BLKTRACESETUP:
            ret = blk_trace_setup(q, bdev, arg);    // set up everything required to start tracing
            break;
        case BLKTRACESTART:
            start = 1;          // note: falls through to BLKTRACESTOP
        case BLKTRACESTOP:
            ret = blk_trace_startstop(q, start);
            break;
        case BLKTRACETEARDOWN:
            ret = blk_trace_remove(q);
            break;
        default:
            ret = -ENOTTY;
            break;
        }

        mutex_unlock(&bdev->bd_mutex);
        return ret;
    }
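
Putting the ioctls together, a minimal userspace lifecycle sketch: it assumes struct blk_user_trace_setup and the BLKTRACE* ioctl numbers above are visible to userspace (e.g. copied from the kernel header), and the function name is invented:

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    /* setup -> start -> (read the per-cpu relay files named after buts.name)
     * -> stop -> teardown */
    int example_trace_device(const char *devpath)
    {
        struct blk_user_trace_setup buts;
        int fd, ret;

        fd = open(devpath, O_RDONLY);
        if (fd < 0)
            return -1;

        memset(&buts, 0, sizeof(buts));
        buts.act_mask = 0xffff;         /* all trace categories */
        buts.buf_size = 512 * 1024;     /* size of each relay sub-buffer */
        buts.buf_nr   = 4;              /* sub-buffers per cpu */

        ret = ioctl(fd, BLKTRACESETUP, &buts);  /* kernel fills in buts.name */
        if (!ret)
            ret = ioctl(fd, BLKTRACESTART);

        /* ... consume trace data here ... */

        ioctl(fd, BLKTRACESTOP);
        ioctl(fd, BLKTRACETEARDOWN);
        close(fd);
        return ret;
    }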