Efficient IO with io_uring

0x00 Introduction

io_uring is the new asynchronous I/O mechanism on Linux. Before io_uring, Linux's native asynchronous I/O mechanism was aio, which suffers from quite a few problems. io_uring drew considerable attention as soon as it appeared, perhaps helped along by how low a bar aio had set ¯\_(ツ)_/¯. The core of io_uring is three system calls, but using them directly is cumbersome, so the author also provides a companion library, liburing, to make it easier to use.

0x01 liburing

liburing is a library that simplifies the use of io_uring; most code that adopts io_uring is likely to go through it. A core structure in liburing is struct io_uring:

struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	unsigned flags;
	int ring_fd;
};
// the two ring structures embedded above look like this
struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;
	struct io_uring_sqe *sqes;

	unsigned sqe_head;
	unsigned sqe_tail;

	size_t ring_sz;
	void *ring_ptr;
};

struct io_uring_cq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;
	void *ring_ptr;
};

Note that the first four fields and the last two fields of these two ring structures mirror each other. To initialize a struct io_uring, use the first function below; as the name suggests, it initializes the queues io_uring works with. Its implementation actually issues several mmap calls to set up memory, which is analyzed later. Once initialization is done, submitting an I/O request starts by grabbing a free entry from the queue with the second function. With a free entry in hand, the third and fourth functions prepare read and write requests; apart from taking a struct io_uring_sqe * as their first argument, they behave much like preadv and pwritev. After a request is prepared, the fifth function submits it. Once requests have been submitted, the sixth and seventh functions retrieve completions; the difference between the two is that one is non-blocking and the other blocks. By default, a completed request remains in the internal queue until it is marked as consumed with io_uring_cqe_seen. When everything is done, io_uring_queue_exit cleans up the resources. There is a simple example at http://git.kernel.dk/cgit/liburing/tree/examples/io_uring-test.c, and a minimal sketch follows the prototypes below.

extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags);
extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd,
				       const struct iovec *iovecs, unsigned nr_vecs, off_t offset);
void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd,
					const struct iovec *iovecs, unsigned nr_vecs, off_t offset);
extern int io_uring_submit(struct io_uring *ring);
extern int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr);
extern int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr);
/*
 * Must be called after io_uring_{peek,wait}_cqe() after the cqe has
 * been processed by the application.
 */
static inline void io_uring_cqe_seen(struct io_uring *ring, struct io_uring_cqe *cqe);
extern void io_uring_queue_exit(struct io_uring *ring);
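
Putting the pieces together, here is a minimal sketch of that workflow, in the spirit of the io_uring-test.c example linked above: open a file, read one block, wait for the completion, and tear down. Error handling is abbreviated.

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>
#include "liburing.h"

int main(int argc, char *argv[])
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	char buf[4096];
	int fd, ret;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* set up the rings: 8 entries, no flags */
	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0)
		return 1;

	/* grab a free sqe and prepare a readv, preadv-style */
	sqe = io_uring_get_sqe(&ring);
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	io_uring_prep_readv(sqe, fd, &iov, 1, 0);

	/* hand the request to the kernel */
	io_uring_submit(&ring);

	/* block until it completes, then mark the cqe consumed */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret == 0) {
		printf("read returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	close(fd);
	return 0;
}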

0x02 System Calls

The three io_uring syscalls and the accompanying structures are listed below; since they have no libc wrappers, a sketch of raw syscall wrappers follows the listing. The io_uring_queue_init function discussed earlier mostly boils down to io_uring_setup, after which it initializes the memory behind struct io_uring_sq sq and struct io_uring_cq cq and additionally maps an array of struct io_uring_sqe. The mmap calls pass the predefined offsets IORING_OFF_SQ_RING, IORING_OFF_SQES, and IORING_OFF_CQ_RING, which tell the kernel which piece of io_uring memory is being requested. The kernel allocates this memory during io_uring_setup, but for user space to use it, it has to be mapped into the process's address space; mmap with these predefined offsets is how io_uring implements that mapping, each offset identifying where one of the three structures lives.

int io_uring_setup(unsigned entries, struct io_uring_params *p);
int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig);
int io_uring_register(int fd, unsigned int opcode, const void *arg, unsigned int nr_args);

#define IORING_OFF_SQ_RING		0ULL
#define IORING_OFF_CQ_RING		0x8000000ULL
#define IORING_OFF_SQES			0x10000000ULL

struct io_uring_params {
	__u32 sq_entries;
	__u32 cq_entries;
	__u32 flags;
	__u32 sq_thread_cpu;
	__u32 sq_thread_idle;
	__u32 resv[5];
	struct io_sqring_offsets sq_off;
	struct io_cqring_offsets cq_off;
};

/*
 * Filled with the offset for mmap(2)
 */
struct io_sqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 flags;
	__u32 dropped;
	__u32 array;
	__u32 resv1;
	__u64 resv2;
};
struct io_cqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 overflow;
	__u32 cqes;
	__u64 resv[2];
};

/*
 * IO submission data structure (Submission Queue Entry)
 */
struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* IOSQE_ flags */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	__u64	off;		/* offset into file */
	__u64	addr;		/* pointer to buffer or iovecs */
	__u32	len;		/* buffer size or number of iovecs */
	union {
		__kernel_rwf_t	rw_flags;
		__u32		fsync_flags;
		__u16		poll_events;
		__u32		sync_range_flags;
		__u32		msg_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	union {
		__u16	buf_index;	/* index into fixed buffers, if used */
		__u64	__pad2[3];
	};
};
/*
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
	__u64	user_data;	/* sqe->data submission passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;
};

#define IORING_ENTER_GETEVENTS	(1U << 0)
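
None of these three syscalls has a glibc wrapper, so user space reaches them through syscall(2). A minimal sketch of the wrappers, modeled on liburing's src/syscall.c; note that the raw io_uring_enter takes a sixth argument, the size of the signal set:

#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>	/* struct io_uring_params etc. */

int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}

int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete,
		   unsigned flags, sigset_t *sig)
{
	/* the kernel entry point is SYSCALL_DEFINE6; sigsz is _NSIG / 8 */
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		       flags, sig, _NSIG / 8);
}

int io_uring_register(int fd, unsigned opcode, const void *arg,
		      unsigned nr_args)
{
	return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}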

Once initialization is complete, user space can use these queues to stage I/O requests. To hand them to the kernel for execution, it uses the syscall int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig). Two flags are relevant here. If IORING_ENTER_GETEVENTS is set, the call waits until at least min_complete requests have completed before returning. If the IORING_SETUP_SQPOLL flag was passed to io_uring_setup, the kernel runs an extra thread that polls for submissions; this thread can be pinned to a chosen CPU core, and the sq_thread_cpu and sq_thread_idle fields of io_uring_params configure the polling CPU and idle time. Because the kernel and the application operate on the same memory, effectively a special kind of shared memory, new I/O requests placed on the SQ are visible to the kernel directly, with no system call needed. The IORING_SQ_NEED_WAKEUP flag signals when the dozing poll thread has to be woken up, as sketched below. Likewise, with IORING_SETUP_SQPOLL set, completed requests can be reaped by reading the relevant queue directly, again without a system call. Beyond this, io_uring includes a number of other advanced features.
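
A hedged sketch of that SQPOLL fast path (sq_ring_flags stands for the mapped flags word of the SQ ring, and io_uring_enter for the wrapper sketched earlier): after writing new sqes and advancing the tail, a syscall is needed only when the poll thread has gone to sleep.

/* pending is the number of new sqes just published via the tail */
static void sqpoll_submit(int ring_fd, unsigned *sq_ring_flags, unsigned pending)
{
	/* full barrier between the tail update and reading the flags,
	 * as the kernel comment on io_sq_ring->flags requires */
	__sync_synchronize();
	if (*(volatile unsigned *)sq_ring_flags & IORING_SQ_NEED_WAKEUP)
		io_uring_enter(ring_fd, pending, 0, IORING_ENTER_SQ_WAKEUP, NULL);
	/* otherwise no syscall at all: the poll thread sees the new tail */
}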

0x03 Internals

In the kernel, an io_uring instance is represented by struct io_ring_ctx. Its two main parts are the SQ ring and the CQ ring, holding submitted requests and the results of completed requests respectively. Both io_sq_ring and io_cq_ring begin with control fields and other bookkeeping and end with a flexible array sized at allocation time. They differ in one respect: the tail of io_cq_ring is directly an array of struct io_uring_cqe, with the embedded struct io_uring tracking which entries are in use, whereas io_sq_ring holds an array of indices, and only those indices locate the actual struct io_uring_sqe entries, which live in a separate block of memory. Intuitively the SQ and CQ sides would be symmetric, yet the SQ side adds a level of indirection; some discussion can be found online, though the reason is not entirely clear to me. My guess is that since requests complete at different times, the free entries of the sqe array become non-contiguous after a while, and the index array lets them be consumed as though they were contiguous (a user-space sketch of this indirection follows the two ring definitions below). When memory was mapped with mmap earlier, these structures corresponded to the different offsets, namely the predefined values IORING_OFF_SQ_RING, IORING_OFF_CQ_RING, and IORING_OFF_SQES.

struct io_sq_ring {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head and the application controls tail.
	 */
	struct io_uring		r;
	/*
	 * Bitmask to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			ring_mask;
	/* Ring size (constant, power of 2) */
	u32			ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			dropped;
	/*
	 * Runtime flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			flags;
	/*
	 * Ring buffer of indices into array of io_uring_sqe, which is
	 * mmapped by the application using the IORING_OFF_SQES offset.
	 *
	 * This indirection could e.g. be used to assign fixed
	 * io_uring_sqe entries to operations and only submit them to
	 * the queue when needed.
	 *
	 * The kernel modifies neither the indices array nor the entries
	 * array.
	 */
	u32			array[];
};

/*
 * This data is shared with the application through the mmap at offset
 * IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_cqring_offsets when calling io_uring_setup.
 */
struct io_cq_ring {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The application controls head and the kernel tail.
	 */
	struct io_uring		r;
	/*
	 * Bitmask to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			ring_mask;
	/* Ring size (constant, power of 2) */
	u32			ring_entries;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[];
};
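
To make the array[] indirection concrete, here is a small user-space sketch of queueing one prepared sqe, loosely following the submission example in the io_uring paper [1]; khead, ktail, kring_mask, kring_entries, and array are the mapped pointers from liburing's struct io_uring_sq shown earlier.

/* queue one already-prepared sqe: publish its index, then move the tail */
static int sq_push(struct io_uring_sq *sq, unsigned sqe_index)
{
	unsigned tail = *sq->ktail;

	if (tail - *sq->khead == *sq->kring_entries)
		return -1;			/* ring is full */

	/* the ring slot holds an index into the separately mapped sqes[] */
	sq->array[tail & *sq->kring_mask] = sqe_index;

	/* make the sqe and its index visible before the new tail */
	__sync_synchronize();
	*sq->ktail = tail + 1;
	return 0;
}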


struct io_ring_ctx {
	struct {
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned int		flags;
		bool			compat;
		bool			account_mem;

		/* SQ ring */
		struct io_sq_ring	*sq_ring;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		struct io_uring_sqe	*sq_sqes;

		struct list_head	defer_list;
	} ____cacheline_aligned_in_smp;

	/* IO offload */
	struct workqueue_struct	*sqo_wq;
	struct task_struct	*sqo_thread;	/* if using sq thread polling */
	struct mm_struct	*sqo_mm;
	wait_queue_head_t	sqo_wait;
	struct completion	sqo_thread_started;

	struct {
		/* CQ ring */
		struct io_cq_ring	*cq_ring;
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		unsigned		cq_mask;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct file		**user_files;
	unsigned		nr_user_files;

	/* if used, fixed mapped user buffers */
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	*user_bufs;

	struct user_struct	*user;

	struct completion	ctx_done;

	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;
		bool			poll_multi_file;
		/*
		 * ->poll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	poll_list;
		struct list_head	cancel_list;
	} ____cacheline_aligned_in_smp;

	struct async_list	pending_async[2];

#if defined(CONFIG_UNIX)
	struct socket		*ring_sock;
#endif
};

The cqe side is written by the kernel and read by the application, while the sqe side is written by the application and read by the kernel. The io_uring_setup syscall initializes the relevant data structures; with the error checking, permission checks, and resource-quota accounting stripped away, the overall operation is fairly simple. One of liburing's initialization functions is shown below. In the io_uring_params argument passed to io_uring_setup, part is set by the caller and a large part is filled in by the kernel during io_uring creation; the populated struct io_uring_params is then copied back to user space, which uses it to allocate memory with mmap and initialize its data structures. See http://git.kernel.dk/cgit/liburing/tree/src/setup.c for details; a simplified sketch of the mapping step follows the function below.

int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) {
	struct io_uring_params p;
	int fd, ret;

	memset(&p, 0, sizeof(p));
	p.flags = flags;

	fd = io_uring_setup(entries, &p);
	if (fd < 0)
		return -errno;

	ret = io_uring_queue_mmap(fd, &p, ring);
	if (ret)
		close(fd);

	return ret;
}
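
The mapping step inside io_uring_queue_mmap looks roughly as follows, simplified from liburing's setup.c with error handling dropped; every size and field offset comes from the io_uring_params the kernel filled in.

static int io_uring_queue_mmap_sketch(int fd, struct io_uring_params *p,
				      struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	struct io_uring_cq *cq = &ring->cq;

	/* SQ ring: control fields plus the index array */
	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	sq->khead = sq->ring_ptr + p->sq_off.head;
	sq->ktail = sq->ring_ptr + p->sq_off.tail;
	sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
	sq->kring_entries = sq->ring_ptr + p->sq_off.ring_entries;
	sq->kflags = sq->ring_ptr + p->sq_off.flags;
	sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
	sq->array = sq->ring_ptr + p->sq_off.array;

	/* the sqe array is its own mapping at IORING_OFF_SQES */
	sq->sqes = mmap(0, p->sq_entries * sizeof(struct io_uring_sqe),
			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			fd, IORING_OFF_SQES);

	/* CQ ring: control fields plus the cqe array itself */
	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	cq->khead = cq->ring_ptr + p->cq_off.head;
	cq->ktail = cq->ring_ptr + p->cq_off.tail;
	cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
	cq->kring_entries = cq->ring_ptr + p->cq_off.ring_entries;
	cq->koverflow = cq->ring_ptr + p->cq_off.overflow;
	cq->cqes = cq->ring_ptr + p->cq_off.cqes;

	ring->ring_fd = fd;
	return 0;
}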

The kernel side of io_uring_enter is shown below, with some of the checking code omitted; the overall flow is quite clear. In the first case, when the IORING_SETUP_SQPOLL flag is set, the kernel uses a dedicated thread to poll for work, so the syscall merely wakes it if asked and reports the requested submit count. The polling ultimately goes through the iopoll operation in struct file_operations, an interface added there only recently; the new polling support in Linux native aio uses this iopoll as well. In fact io_uring's changes are essentially confined to the VFS layer, reusing existing machinery elsewhere, and several of its core pieces are the same as or similar to aio's. When that flag is not set, the call proceeds to io_ring_submit to submit the requests. And as mentioned earlier, when the IORING_ENTER_GETEVENTS flag is set, the call waits for min_complete requests to finish, branching into one of two paths depending on whether IORING_SETUP_IOPOLL is in use.

SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const sigset_t __user *, sig,
		size_t, sigsz)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	int submitted = 0;
	struct fd f;
  //...
	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sqo_wait);
		submitted = to_submit;
		goto out_ctx;
	}

	ret = 0;
	if (to_submit) {
		to_submit = min(to_submit, ctx->sq_entries);
		mutex_lock(&ctx->uring_lock);
		submitted = io_ring_submit(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);
	}
  
	if (flags & IORING_ENTER_GETEVENTS) {
		unsigned nr_events = 0;
		min_complete = min(min_complete, ctx->cq_entries);
		if (ctx->flags & IORING_SETUP_IOPOLL) {
			ret = io_iopoll_check(ctx, &nr_events, min_complete);
		} else {
			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
		}
	}

out_ctx:
	io_ring_drop_ctx_refs(ctx, 1);
out_fput:
	fdput(f);
	return submitted ? submitted : ret;
}

Taking io_iopoll_check as an example, the normal execution path is io_iopoll_check -> io_iopoll_getevents -> io_do_iopoll -> (kiocb->ki_filp->f_op->iopoll). After requests finish, the function below publishes the results into the cqe array so the application can see them. Here io_cqring_fill_event obtains a cqe slot that is currently writable and fills in the data (a simplified version of it is sketched after the code below); the slot ultimately comes from io_get_cqring which, as can be seen, simply returns the entry just past the current tail.

static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
			       struct list_head *done)
{
	void *reqs[IO_IOPOLL_BATCH];
	struct io_kiocb *req;
	int to_free;

	to_free = 0;
	while (!list_empty(done)) {
		req = list_first_entry(done, struct io_kiocb, list);
		list_del(&req->list);

		io_cqring_fill_event(ctx, req->user_data, req->result);
		(*nr_events)++;

		if (refcount_dec_and_test(&req->refs)) {
			/* If we're not using fixed files, we have to pair the
			 * completion part with the file put. Use regular
			 * completions for those, only batch free for fixed
			 * file and non-linked commands.
			 */
			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
			    REQ_F_FIXED_FILE) {
				reqs[to_free++] = req;
				if (to_free == ARRAY_SIZE(reqs))
					io_free_req_many(ctx, reqs, &to_free);
			} else {
				io_free_req(req);
			}
		}
	}

	io_commit_cqring(ctx);
	io_free_req_many(ctx, reqs, &to_free);
}

static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
	struct io_cq_ring *ring = ctx->cq_ring;
	unsigned tail;

	tail = ctx->cached_cq_tail;
	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
		return NULL;

	ctx->cached_cq_tail++;
	return &ring->cqes[tail & ctx->cq_mask];
}
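
For completeness, a lightly simplified io_cqring_fill_event from the kernel of this era: it takes the slot returned by io_get_cqring and fills it in with WRITE_ONCE, bumping the overflow counter when the CQ ring is full.

static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
				 long res)
{
	struct io_uring_cqe *cqe;

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission; record that in the overflow counter of the ring.
	 */
	cqe = io_get_cqring(ctx);
	if (cqe) {
		WRITE_ONCE(cqe->user_data, ki_user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, 0);
	} else {
		unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);

		WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
	}
}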

static int io_sq_thread(void *data) implements the kernel polling thread; it is started when the corresponding flag is set.

0x04 Evaluation

Some figures can be found in [1], and plenty of io_uring performance data is available online. In some scenarios io_uring can match or even exceed the performance of SPDK, which is an impressive showing.

... For peak performance, io_uring helps us get to 1.7M 4k IOPS with polling. aio reaches a performance cliff much lower than that, at 608K. The comparison here isn't quite fair, since aio doesn't support polled IO. If we disable polling, io_uring is able to drive about 1.2M IOPS for the (otherwise) same test case.

References

  1. Efficient IO with io_uring, http://kernel.dk/io_uring.pdf