FIO是一款方便IO性能测试的工具,以统计全面、模式灵活深得用户欢心。当前支持libaio、sync等IO引擎。在存储系统开发中,如何快速全面评测系统IO性能?给FIO添加调用存储系统提供的IO的插件,是一个好办法。那么,该怎么添加一个插件呢?

首先,得熟悉几个基本的公共数据结构。

fio 插件的公共接口

单任务IO latency/IOPS/BW 统计信息

fio/fio.h:

struct thread_data {
    struct flist_head opt_list;
    unsigned long flags;
    struct thread_options o;
    void *eo;
    pthread_t thread;
    unsigned int thread_number;
    unsigned int subjob_number;
    unsigned int groupid;
    struct thread_stat ts __attribute__ ((aligned(8)));

    int client_type;

    struct io_log *slat_log;
    struct io_log *clat_log;
    struct io_log *clat_hist_log;
    struct io_log *lat_log;
    struct io_log *bw_log;
    struct io_log *iops_log;

    struct workqueue log_compress_wq;

    struct thread_data *parent;

    uint64_t stat_io_bytes[DDIR_RWDIR_CNT];
    struct timespec bw_sample_time;

    uint64_t stat_io_blocks[DDIR_RWDIR_CNT];
    struct timespec iops_sample_time;

    volatile int update_rusage;
    struct fio_sem *rusage_sem;
    struct rusage ru_start;
    struct rusage ru_end;

    struct fio_file **files;
    unsigned char *file_locks;
    unsigned int files_size;
    unsigned int files_index;
    unsigned int nr_open_files;
    unsigned int nr_done_files;
    union {
        unsigned int next_file;
        struct frand_state next_file_state;
.....
    int error;
    int sig;
    int done;
    int stop_io;
    pid_t pid;
    char *orig_buffer;
    size_t orig_buffer_size;
    volatile int runstate;
    volatile bool terminate;
    bool last_was_sync;
    enum fio_ddir last_ddir;

    int mmapfd;

    void *iolog_buf;
    FILE *iolog_f;

    unsigned long rand_seeds[FIO_RAND_NR_OFFS];

    struct frand_state bsrange_state[DDIR_RWDIR_CNT];
    struct frand_state verify_state;
    struct frand_state trim_state;
    struct frand_state delay_state;

    struct frand_state buf_state;
    struct frand_state buf_state_prev;
    struct frand_state dedupe_state;
    struct frand_state zone_state;

    struct zone_split_index **zone_state_index;

    unsigned int verify_batch;
    unsigned int trim_batch;

    struct thread_io_list *vstate;

    int shm_id;

    /*
     * IO engine hooks, contains everything needed to submit an io_u
     * to any of the available IO engines.
     */
    struct ioengine_ops *io_ops;
    int io_ops_init;

    /*
     * IO engine private data and dlhandle.
     */
    void *io_ops_data;
.....

基本的IO操作单元

接受fio 打出的IO请求,分发到具体的IO引擎。代码位置: fio/io_u.h

struct io_u {
    struct timespec start_time;
    struct timespec issue_time;

    **struct fio_file *file;**
    unsigned int flags;
    enum fio_ddir ddir;

    /*
     * For replay workloads, we may want to account as a different
     * IO type than what is being submitted.
     */
    enum fio_ddir acct_ddir;

    /*
     * Write generation
     */
    unsigned short numberio;

    /*
     * Allocated/set buffer and length
     */
    unsigned long buflen;
    unsigned long long offset;
    void *buf;
.......
 struct io_piece *ipo;

    unsigned int resid;
    unsigned int error;

    /*
     * io engine private data
     */
    union {
        unsigned int index;
        unsigned int seen;
        void *engine_data;
    };

    union {
        struct flist_head verify_list;
        struct workqueue_work work;
    };

    /*
     * Callback for io completion
     */
    int (*end_io)(struct thread_data *, struct io_u **);

    union {
#ifdef CONFIG_LIBAIO
    **    struct iocb iocb;**
#endif
#ifdef CONFIG_POSIXAIO
        os_aiocb_t aiocb;
#endif
#ifdef FIO_HAVE_SGIO
        struct sg_io_hdr hdr;
#endif
#ifdef CONFIG_GUASI
        guasi_req_t greq;
#endif
#ifdef CONFIG_SOLARISAIO
        aio_result_t resultp;
#endif
#ifdef FIO_HAVE_BINJECT
        struct b_user_cmd buc;
#endif
#ifdef CONFIG_RDMA
        struct ibv_mr *mr;
#endif
        void *mmap_data;
    };

io_u 提供的基本操作:

/*
 * Prototypes of the io_u helper routines shared by fio and its IO engines.
 * Of these, the libaio engine shown in this article calls do_io_u_sync()
 * and do_io_u_trim() from its queue hook and io_u_mark_submit() from its
 * commit hook.
 */
extern struct io_u *__get_io_u(struct thread_data *);
extern struct io_u *get_io_u(struct thread_data *);
extern void put_io_u(struct thread_data *, struct io_u *);
extern void clear_io_u(struct thread_data *, struct io_u *);
extern void requeue_io_u(struct thread_data *, struct io_u **);
extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *);
extern int __must_check io_u_queued_complete(struct thread_data *, int);
extern void io_u_queued(struct thread_data *, struct io_u *);
extern int io_u_quiesce(struct thread_data *);
extern void io_u_log_error(struct thread_data *, struct io_u *);
extern void io_u_mark_depth(struct thread_data *, unsigned int);
extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int);
extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int);
void io_u_mark_complete(struct thread_data *, unsigned int);
void io_u_mark_submit(struct thread_data *, unsigned int);
bool queue_full(const struct thread_data *);

/* Helpers for sync and trim requests; both take the io_u to act on. */
int do_io_u_sync(const struct thread_data *, struct io_u *);
int do_io_u_trim(const struct thread_data *, struct io_u *);

代码位置: fio/engines/engine.h

定义并实现ioengine ops

/*
 * Table of hooks an IO engine fills in. An engine defines one instance,
 * sets the callbacks it implements with designated initializers and hands
 * the instance to register_ioengine(). The libaio engine, for example,
 * sets name, version, init, prep, queue, commit, cancel, getevents, event,
 * cleanup, open_file, close_file, get_file_size, options and
 * option_struct_size, and leaves the remaining members unset.
 */
struct ioengine_ops {
    struct flist_head list;
    const char *name;
    int version;
    int flags;
    int (*setup)(struct thread_data *);
    int (*init)(struct thread_data *);
    int (*prep)(struct thread_data *, struct io_u *);
    int (*queue)(struct thread_data *, struct io_u *);
    int (*commit)(struct thread_data *);
    int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
    struct io_u *(*event)(struct thread_data *, int);
    char *(*errdetails)(struct io_u *);
    int (*cancel)(struct thread_data *, struct io_u *);
    void (*cleanup)(struct thread_data *);
    int (*open_file)(struct thread_data *, struct fio_file *);
    int (*close_file)(struct thread_data *, struct fio_file *);
    int (*invalidate)(struct thread_data *, struct fio_file *);
    int (*unlink_file)(struct thread_data *, struct fio_file *);
    int (*get_file_size)(struct thread_data *, struct fio_file *);
    void (*terminate)(struct thread_data *);
    int (*iomem_alloc)(struct thread_data *, size_t);
    void (*iomem_free)(struct thread_data *);
    int (*io_u_init)(struct thread_data *, struct io_u *);
    void (*io_u_free)(struct thread_data *, struct io_u *);
    /* Size of the engine's private option struct and its option table. */
    int option_struct_size;
    struct fio_option *options;
};

注册ioengine ops

/*
 * Engine management entry points. register_ioengine() and
 * unregister_ioengine() take the engine's ioengine_ops table; the other
 * three operate on a thread_data.
 */
extern struct ioengine_ops *load_ioengine(struct thread_data *);
extern void register_ioengine(struct ioengine_ops *);
extern void unregister_ioengine(struct ioengine_ops *);
extern void free_ioengine(struct thread_data *);
extern void close_ioengine(struct thread_data *);

fio libaio 插件的实现和分析

下面,以libaio的支持为例,进行分析:

代码位置: fio/engines/libaio.c

/*
 * Hook table of the libaio engine. The fio_libaio_* callbacks are the
 * engine's own functions; file open/close/size use the generic_* helpers.
 * This object is what gets passed to register_ioengine() and
 * unregister_ioengine().
 */
static struct ioengine_ops ioengine = {
    .name           = "libaio",
    .version        = FIO_IOOPS_VERSION,
    .init           = fio_libaio_init,
    .prep           = fio_libaio_prep,
    .queue          = fio_libaio_queue,
    .commit         = fio_libaio_commit,
    .cancel         = fio_libaio_cancel,
    .getevents      = fio_libaio_getevents,
    .event          = fio_libaio_event,
    .cleanup        = fio_libaio_cleanup,
    .open_file      = generic_open_file,
    .close_file     = generic_close_file,
    .get_file_size      = generic_get_file_size,
    /* Engine-specific option table and the size of the struct it fills. */
    .options        = options,
    .option_struct_size = sizeof(struct libaio_options),
};

name

定义支持fio的IO引擎的名称

version

定义当前该实现的版本

init

调用libaio的初始化函数;申请空间;倒腾对应的成员到thread data中去。

/*
 * Allocate and initialise the per-thread libaio state and attach it to
 * td->io_ops_data.
 *
 * The bookkeeping arrays (completion events, iocb pointers, io_u pointers)
 * are sized by td->o.iodepth. All allocations are checked before the AIO
 * context is created, so every failure path only has memory to release.
 *
 * Returns 0 on success and 1 on failure; on failure the error is recorded
 * on td through td_verror() and nothing is left allocated.
 */
static int fio_libaio_init(struct thread_data *td)
{
    struct libaio_options *o = td->eo;
    struct libaio_data *ld;
    int err = 0;

    ld = calloc(1, sizeof(*ld));
    if (!ld) {
        td_verror(td, ENOMEM, "calloc");
        return 1;
    }

    ld->entries = td->o.iodepth;
    ld->is_pow2 = is_power_of_2(ld->entries);
    ld->aio_events = calloc(ld->entries, sizeof(struct io_event));
    ld->iocbs = calloc(ld->entries, sizeof(struct iocb *));
    ld->io_us = calloc(ld->entries, sizeof(struct io_u *));

    /*
     * calloc() may legitimately return NULL for a zero-sized request, so
     * only treat NULL as an allocation failure when entries is non-zero.
     */
    if (ld->entries && (!ld->aio_events || !ld->iocbs || !ld->io_us)) {
        td_verror(td, ENOMEM, "calloc");
        goto err_free;
    }

    /*
     * First try passing in 0 for queue depth, since we don't
     * care about the user ring. If that fails, the kernel is too old
     * and we need the right depth.
     */
    if (!o->userspace_reap)
        err = io_queue_init(INT_MAX, &ld->aio_ctx);
    if (o->userspace_reap || err == -EINVAL)
        err = io_queue_init(td->o.iodepth, &ld->aio_ctx);
    if (err) {
        td_verror(td, -err, "io_queue_init");
        log_err("fio: check /proc/sys/fs/aio-max-nr\n");
        goto err_free;
    }

    td->io_ops_data = ld;
    return 0;

err_free:
    /* free(NULL) is a no-op, so partially allocated state is fine here. */
    free(ld->aio_events);
    free(ld->iocbs);
    free(ld->io_us);
    free(ld);
    return 1;
}

prep

处理从FIO传过来的IO请求,拿到xfer_buf/xfer_buflen/ io_u->offset,作为 io_prep_pread的参数, 用它来初始化libaio的IOrequest。

/*
 * Fill in the iocb embedded in the io_u according to the request direction:
 * reads and writes use the io_u's transfer buffer, transfer length and
 * offset; sync requests are prepared as an fsync on the file descriptor.
 * Any other direction leaves the iocb untouched. Always returns 0.
 */
static int fio_libaio_prep(struct thread_data fio_unused *td, struct io_u *io_u)
{
    struct fio_file *file = io_u->file;
    struct iocb *cb = &io_u->iocb;

    if (io_u->ddir == DDIR_READ) {
        io_prep_pread(cb, file->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
        return 0;
    }

    if (io_u->ddir == DDIR_WRITE) {
        io_prep_pwrite(cb, file->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
        return 0;
    }

    if (ddir_sync(io_u->ddir))
        io_prep_fsync(cb, file->fd);

    return 0;
}

请求入队

把fio套件打出来的请求加入libaio引擎内部的请求队列:

/*
 * Place one io_u on the engine's submission ring.
 *
 * Returns FIO_Q_BUSY when the ring already holds td->o.iodepth entries, or
 * when a sync/trim request arrives while other requests are still queued.
 * Sync and trim requests are executed inline and reported as
 * FIO_Q_COMPLETED. Everything else is stored at the ring head and reported
 * as FIO_Q_QUEUED; the commit hook submits it later.
 */
static int fio_libaio_queue(struct thread_data *td, struct io_u *io_u)
{
    struct libaio_data *ld = td->io_ops_data;

    fio_ro_check(td, io_u);

    if (ld->queued == td->o.iodepth)
        return FIO_Q_BUSY;

    /*
     * fsync is tricky, since it can fail and we need to do it
     * serialized with other io. the reason is that linux doesn't
     * support aio fsync yet. So return busy for the case where we
     * have pending io, to let fio complete those first.
     */
    if (ddir_sync(io_u->ddir)) {
        if (ld->queued)
            return FIO_Q_BUSY;

        do_io_u_sync(td, io_u);
        return FIO_Q_COMPLETED;
    }

    if (io_u->ddir == DDIR_TRIM) {
        if (ld->queued)
            return FIO_Q_BUSY;

        do_io_u_trim(td, io_u);
        return FIO_Q_COMPLETED;
    }

    ld->iocbs[ld->head] = &io_u->iocb;
    ld->io_us[ld->head] = io_u;
    ring_inc(ld, &ld->head, 1);

    /*
     * Count the new ring entry. The commit hook returns early when
     * ld->queued is zero and decrements it after submitting, so without
     * this increment nothing would be submitted and the busy checks above
     * could never trigger.
     */
    ld->queued++;
    return FIO_Q_QUEUED;
}

提交IO请求

把上面queue中的请求通过 libaio io_submit API 真正提交:

static int fio_libaio_commit(struct thread_data *td)
{
    struct libaio_data *ld = td->io_ops_data;
    struct iocb **iocbs;
    struct io_u **io_us;
    struct timespec ts;
    int ret, wait_start = 0;

    if (!ld->queued)
        return 0;

    do {
        long nr = ld->queued;

        nr = min((unsigned int) nr, ld->entries - ld->tail);
        io_us = ld->io_us + ld->tail;
        iocbs = ld->iocbs + ld->tail;

        ret = io_submit(ld->aio_ctx, nr, iocbs);
        if (ret > 0) {
            fio_libaio_queued(td, io_us, ret);
            io_u_mark_submit(td, ret);

            ld->queued -= ret;
            ring_inc(ld, &ld->tail, ret);
            ret = 0;
            wait_start = 0;

取消执行

取消已提交的IO请求。同样是从统一的thread_data 和 io_u 请求里拿相关参数,直接返回 io_cancel 的返回值(成功为0,失败为负的错误码)。

/*
 * Ask libaio to cancel the request backing this io_u. The iocb embedded
 * in the io_u identifies the request, and the engine's event array is
 * passed as the result slot. Returns whatever io_cancel() returns.
 */
static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u)
{
    struct libaio_data *state = td->io_ops_data;
    struct iocb *target = &io_u->iocb;

    return io_cancel(state->aio_ctx, target, state->aio_events);
}

抓取已经完成的IO的请求

上面提交的IO请求,多少已经完成,需要统计。同样调用libaio 对应的getevents接口:

/*
 * Collect completion events into ld->aio_events.
 *
 * Keeps polling until at least `min` events have been gathered or a
 * result that is neither a retry condition nor -EINTR is seen.
 *
 * Returns the last poll result if it was negative, otherwise the total
 * number of events collected.
 */
static int fio_libaio_getevents(struct thread_data *td, unsigned int min,
                unsigned int max, const struct timespec *t)
{
    struct libaio_data *ld = td->io_ops_data;
    struct libaio_options *o = td->eo;
    /* With iodepth_batch_complete_min == 0, never ask for a minimum. */
    unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
    struct timespec __lt, *lt = NULL;
    int r, events = 0;

    /* Work on a private copy of the caller's timeout, if one was given. */
    if (t) {
        __lt = *t;
        lt = &__lt;
    }

    do {
        /*
         * The user-space reap path is only taken when the option is set,
         * no minimum is required and the context carries the AIO ring
         * magic; otherwise fall back to io_getevents().
         */
        if (o->userspace_reap == 1
            && actual_min == 0
            && ((struct aio_ring *)(ld->aio_ctx))->magic
                == AIO_RING_MAGIC) {
            r = user_io_getevents(ld->aio_ctx, max,
                ld->aio_events + events);
        } else {
            r = io_getevents(ld->aio_ctx, actual_min,
                max, ld->aio_events + events, lt);
        }
        if (r > 0)
            events += r;
        else if ((min && r == 0) || r == -EAGAIN) {
            /*
             * Nothing reaped yet: push out anything still queued, back
             * off for 100 microseconds and poll again.
             */
            fio_libaio_commit(td);
            usleep(100);
        } else if (r != -EINTR)
            break;
        /* r == -EINTR falls through and simply retries. */
    } while (events < min);

    return r < 0 ? r : events;
}

从IO 完成的请求的event中找到对应的IO请求

统计时需要,因为libaio可能是一批发射请求的,需要知道哪些IO请求已经完成:

static struct io_u *fio_libaio_event(struct thread_data *td, int event)
{
    struct libaio_data *ld = td->io_ops_data;
    struct io_event *ev;
    struct io_u *io_u;

    ev = ld->aio_events + event;
    io_u = container_of(ev->obj, struct io_u, iocb);

    if (ev->res != io_u->xfer_buflen) {
        if (ev->res > io_u->xfer_buflen)
            io_u->error = -ev->res;
        else
            io_u->resid = io_u->xfer_buflen - ev->res;
    } else
        io_u->error = 0;

    return io_u;
}

清理环境

测试完成之后,需要释放之前申请的内存。

/*
 * Release the per-thread libaio state stored in td->io_ops_data: the
 * event array, the iocb and io_u pointer arrays, and the state struct
 * itself. Does nothing when no state is attached.
 */
static void fio_libaio_cleanup(struct thread_data *td)
{
    struct libaio_data *state = td->io_ops_data;

    if (!state)
        return;

    /*
     * Work-around for huge RCU stalls at exit time: tear the AIO context
     * down here (unless TD_F_CHILD is set) instead of leaving it to
     * exit_aio(), so the freeing can be parallelized and finishes a lot
     * faster.
     */
    if (!(td->flags & TD_F_CHILD))
        io_destroy(state->aio_ctx);

    free(state->aio_events);
    free(state->iocbs);
    free(state->io_us);
    free(state);
}

打开文件

测试前需要打开文件。对于通用的块设备、文件系统中的文件,直接调用generic_open_file就可以,它主要内容如下:

int generic_open_file(struct thread_data *td, struct fio_file *f)
{
    int is_std = 0;
    int flags = 0;
    int from_hash = 0;
....
 if (td_trim(td))
        goto skip_flags;
    if (td->o.odirect)
        flags |= OS_O_DIRECT;
    if (td->o.oatomic) {
        if (!FIO_O_ATOMIC) {
            td_verror(td, EINVAL, "OS does not support atomic IO");
            return 1;
        }
        flags |= OS_O_DIRECT | FIO_O_ATOMIC;
    }
    if (td->o.sync_io)
        flags |= O_SYNC;
    if (td->o.create_on_open && td->o.allow_create)
        flags |= O_CREAT;
skip_flags:
    if (f->filetype != FIO_TYPE_FILE)
        flags |= FIO_O_NOATIME;

open_again:
    if (td_write(td)) {
        if (!read_only)
            flags |= O_RDWR;

        if (f->filetype == FIO_TYPE_FILE && td->o.allow_create)
            flags |= O_CREAT;

        if (is_std)
            f->fd = dup(STDOUT_FILENO);
        else
            from_hash = file_lookup_open(f, flags);
    } else if (td_read(td)) {
        if (f->filetype == FIO_TYPE_CHAR && !read_only)
            flags |= O_RDWR;
        else
            flags |= O_RDONLY;

        if (is_std)
            f->fd = dup(STDIN_FILENO);
        else
            from_hash = file_lookup_open(f, flags);
    } else if (td_trim(td)) {
        assert(!td_rw(td)); /* should have matched above */
        flags |= O_RDWR;
        from_hash = file_lookup_open(f, flags);
    }
....
}

关闭文件

同上,关闭测试完成的文件。对于块设备和文件系统中的文件,调用close系统调用就可以,见generic_close_file:

/*
 * Generic close hook: drop the file from the file hash, close its
 * descriptor (and the shadow descriptor if one is open), mark both as -1
 * and reset the engine position.
 *
 * Returns 0 on success, or the errno left by a failed close() of the
 * main descriptor. A failure closing the shadow descriptor is ignored.
 */
int generic_close_file(struct thread_data fio_unused *td, struct fio_file *f)
{
    int rc;

    dprint(FD_FILE, "fd close %s\n", f->file_name);

    remove_file_hash(f);

    rc = (close(f->fd) < 0) ? errno : 0;
    f->fd = -1;

    if (f->shadow_fd != -1) {
        close(f->shadow_fd);
        f->shadow_fd = -1;
    }

    f->engine_pos = 0;
    return rc;
}

获取文件大小

对于libaio支持的块设备或文件,也是调用通用的接口去stat 文件大小:

/*
 * Default .get_file_size implementation used by the majority of I/O
 * engines (the libaio engine installs it in its ops table). It forwards
 * to get_file_size() and returns that function's result unchanged.
 */
int generic_get_file_size(struct thread_data *td, struct fio_file *f)
{
    return get_file_size(td, f);
}


选项描述

对于libaio的option,具体描述如下:

/*
 * Engine-specific options of the libaio engine. There is a single option,
 * "userspace_reap", whose value lands in the userspace_reap member of
 * struct libaio_options (located via offsetof). The init and getevents
 * hooks read that member.
 */
static struct fio_option options[] = {
    {
        .name   = "userspace_reap",
        .lname  = "Libaio userspace reaping",
        .type   = FIO_OPT_STR_SET,
        .off1   = offsetof(struct libaio_options, userspace_reap),
        .help   = "Use alternative user-space reap implementation",
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_LIBAIO,
    },
    /* Final entry with a NULL name; presumably the end-of-table marker. */
    {
        .name   = NULL,
    },
};

选项大小

告诉fio 测试套件,libaio引擎的option需占用的空间大小:

 .option_struct_size = sizeof(struct libaio_options),

libaio引擎的注册和注销

调用engine通用的注册和注销接口如下:

/*
 * Register the libaio ops table with fio. The fio_init marker presumably
 * makes this run automatically at program start; confirm against fio's
 * definition of the macro.
 */
static void fio_init fio_libaio_register(void)
{
    register_ioengine(&ioengine);
}

/*
 * Remove the libaio ops table from fio again. The fio_exit marker
 * presumably makes this run automatically at program exit; confirm
 * against fio's definition of the macro.
 */
static void fio_exit fio_libaio_unregister(void)
{
    unregister_ioengine(&ioengine);
}