Kyber IO Scheduler of Linux
0x00 引言
Kyber IO调度器是Linux上面针对高速存储设备设计的一个新的IO调度器,配和多队列的Block层使用。在Linux 4.12的时候和BFQ调度器一起成为内核中的一个可选项(emmmm,BFQ的系统复杂程度远高于这个Kyber)。另外,关于Kyber更方面都没有详细的信息,只能在一些[1]地方找到几句简单的介绍。这里完全是根据这几句话和内核中的源代码来推测它的实现,所以可能会存在不少的不准确的地方。
0x01 基本思路
Kyber调度器的基本思路是会为每一个的硬件的队列维护一个不同类型IO请求的队列,这些请求主要根据IO操作的方式来进行区分。Kyber按照读、同步写以及其它的(异步写等)将IO请求分为了3类。在Kyber的设计中,更加倾向于让读有些,这个策略也和其它的一些调度器的设计类似,
34 /* Scheduling domains. */
35 enum {
36 KYBER_READ,
37 KYBER_SYNC_WRITE,
38 KYBER_OTHER, /* Async writes, discard, etc. */
39 KYBER_NUM_DOMAINS,
40 };
Kyber在一个Kyber上下文中维护了关于这几类请求的队列。它通过限制每一个队列的长度来对在这里产生的请求的延迟进行控制。Kyber只有在这些队列里面的请求被处理了之后才会收集新的请求。这里限制的方式采用了基于Token的方式。另外这里的策略和一些交换机中控制内部缓冲区的思路相似。在下面的一些常量的定义中,可以看出对于读IO请求的偏好,
53 /*
54 * Initial device-wide depths for each scheduling domain.
55 *
56 * Even for fast devices with lots of tags like NVMe, you can saturate
57 * the device with only a fraction of the maximum possible queue depth.
58 * So, we cap these to a reasonable value.
59 */
60 static const unsigned int kyber_depth[] = {
61 [KYBER_READ] = 256,
62 [KYBER_SYNC_WRITE] = 128,
63 [KYBER_OTHER] = 64,
64 };
65
66 /*
67 * Scheduling domain batch sizes. We favor reads.
68 */
69 static const unsigned int kyber_batch_size[] = {
70 [KYBER_READ] = 16,
71 [KYBER_SYNC_WRITE] = 8,
72 [KYBER_OTHER] = 8,
73 };
另外,由于Kyber面向的是高速存储,这类设备一般是NVMe SSD、NVM之类的一些技术。采用类似CFQ中的一些对请求排序的方法可能有损于性能,所以在Kyber的代码中没有看到对请求排序的逻辑。Kyber会对一些IO请求进行合并操作,以及会尝试批量处理这些请求来提高性能。批量处理的批量的大小根据请求类型来决定。Kyber整体的逻辑不复杂,实际上,Kyber调度器实现文件只有1000来行,除去一些非核心的代码,实际Kyber的核心的代码是很少的。
0x02 代码
下面就是Kyber中几个核心的数据结构。一个kyber_ctx_queue中主要的就是里面对应不同IO种类请求的队列。另外这个数据结构主要被kyber_hctx_data使用,
75 /*
76 * There is a same mapping between ctx & hctx and kcq & khd,
77 * we use request->mq_ctx->index_hw to index the kcq in khd.
78 */
79 struct kyber_ctx_queue {
80 /*
81 * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
82 * Also protect the rqs on rq_list when merge.
83 */
84 spinlock_t lock;
85 struct list_head rq_list[KYBER_NUM_DOMAINS];
86 } ____cacheline_aligned_in_smp;
87
88 struct kyber_queue_data {
89 struct request_queue *q;
90
91 struct blk_stat_callback *cb;
92
93 /*
94 * The device is divided into multiple scheduling domains based on the
95 * request type. Each domain has a fixed number of in-flight requests of
96 * that type device-wide, limited by these tokens.
97 */
98 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
99
100 /*
101 * Async request percentage, converted to per-word depth for
102 * sbitmap_get_shallow().
103 */
104 unsigned int async_depth;
105
106 /* Target latencies in nanoseconds. */
107 u64 read_lat_nsec, write_lat_nsec;
108 };
109
110 struct kyber_hctx_data {
111 spinlock_t lock;
112 struct list_head rqs[KYBER_NUM_DOMAINS];
113 unsigned int cur_domain;
114 unsigned int batching;
115 struct kyber_ctx_queue *kcqs;
116 struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
117 wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
118 struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
119 atomic_t wait_index[KYBER_NUM_DOMAINS];
120 };
添加requests的操作就是根据request的类型添加到对应的队列,
531 static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
532 struct list_head *rq_list, bool at_head)
533 {
534 struct kyber_hctx_data *khd = hctx->sched_data;
535 struct request *rq, *next;
536
537 list_for_each_entry_safe(rq, next, rq_list, queuelist) {
538 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
539 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
540 struct list_head *head = &kcq->rq_list[sched_domain];
541
542 spin_lock(&kcq->lock);
543 if (at_head)
544 list_move(&rq->queuelist, head);
545 else
546 list_move_tail(&rq->queuelist, head);
547 sbitmap_set_bit(&khd->kcq_map[sched_domain],
548 rq->mq_ctx->index_hw);
549 blk_mq_sched_request_inserted(rq);
550 spin_unlock(&kcq->lock);
551 }
552 }
另外一个核心的函数就是request分发的逻辑。这部分的逻辑会尝试一些分发一个批量的requests。如果遇到没有请求 or 进行中的请求超过了Token表示的限制,会尝试去处理其它的Domain的请求。上面的数据结构中保存了目前处理的Domain(cur_domain)的信息。在函数kyber_dispatch_cur_domain主要就是队列的一些处理以及Token的处理。
733 static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
734 {
735 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
736 struct kyber_hctx_data *khd = hctx->sched_data;
737 struct request *rq;
738 int i;
739
740 spin_lock(&khd->lock);
741
742 /*
743 * First, if we are still entitled to batch, try to dispatch a request
744 * from the batch.
745 */
746 if (khd->batching < kyber_batch_size[khd->cur_domain]) {
747 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
748 if (rq)
749 goto out;
750 }
751
752 /*
753 * Either,
754 * 1. We were no longer entitled to a batch.
755 * 2. The domain we were batching didn't have any requests.
756 * 3. The domain we were batching was out of tokens.
757 *
758 * Start another batch. Note that this wraps back around to the original
759 * domain if no other domains have requests or tokens.
760 */
761 khd->batching = 0;
762 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
763 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
764 khd->cur_domain = 0;
765 else
766 khd->cur_domain++;
767
768 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
769 if (rq)
770 goto out;
771 }
772
773 rq = NULL;
774 out:
775 spin_unlock(&khd->lock);
776 return rq;
777 }
另外中Kyber两个设置的参数就是读、写的延迟,这里主要通过调整队列的长度实现。另外,在实际的运行中可以遇到读、写请求的延迟出现比较大的区别。Kyber将读、写延迟定义了Great、Good、Bad等的等级,根据实际测量到的延迟和目标的延迟确定。Kyber会尝试将不同Domain的请求保持同一个评价,即都为Good or 都为 Bad,来保证公平性,
135 enum {
136 NONE = 0,
137 GOOD = 1,
138 GREAT = 2,
139 BAD = -1,
140 AWFUL = -2,
141 };
142
143 #define IS_GOOD(status) ((status) > 0)
144 #define IS_BAD(status) ((status) < 0)
145
146 static int kyber_lat_status(struct blk_stat_callback *cb,
147 unsigned int sched_domain, u64 target)
148 {
149 u64 latency;
150
151 if (!cb->stat[sched_domain].nr_samples)
152 return NONE;
153
154 latency = cb->stat[sched_domain].mean;
155 if (latency >= 2 * target)
156 return AWFUL;
157 else if (latency > target)
158 return BAD;
159 else if (latency <= target / 2)
160 return GREAT;
161 else /* (latency <= target) */
162 return GOOD;
163 }
这里的队列深度的调整,Kyber使用了一些启发式的方法(启发式的方法很多时候就是指一些不知所以然,根据测试or经验而来,但是很多时候又用的一些方法,23333)。Kyber这里提交将读 or 同步写的范围一类,其它的分为一类,虽然调整的参数有所不同,但是基本的逻辑是一样的,
165 /*
166 * Adjust the read or synchronous write depth given the status of reads and
167 * writes. The goal is that the latencies of the two domains are fair (i.e., if
168 * one is good, then the other is good).
169 */
170 static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
171 unsigned int sched_domain, int this_status,
172 int other_status)
173 {
174 unsigned int orig_depth, depth;
175
176 /*
177 * If this domain had no samples, or reads and writes are both good or
178 * both bad, don't adjust the depth.
179 */
180 if (this_status == NONE ||
181 (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
182 (IS_BAD(this_status) && IS_BAD(other_status)))
183 return;
184
185 orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
186
187 if (other_status == NONE) {
188 depth++;
189 } else {
190 switch (this_status) {
191 case GOOD:
192 if (other_status == AWFUL)
193 depth -= max(depth / 4, 1U);
194 else
195 depth -= max(depth / 8, 1U);
196 break;
197 case GREAT:
198 if (other_status == AWFUL)
199 depth /= 2;
200 else
201 depth -= max(depth / 4, 1U);
202 break;
203 case BAD:
204 depth++;
205 break;
206 case AWFUL:
207 if (other_status == GREAT)
208 depth += 2;
209 else
210 depth++;
211 break;
212 }
213 }
214
215 depth = clamp(depth, 1U, kyber_depth[sched_domain]);
216 if (depth != orig_depth)
217 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
218 }
219
Kyber和其它的一些调度器相比,还是一种很简单的设计。但是在它适合的环境下,对延迟的降低非常有利。
0x03 评估
这里的信息在网络上面能够找到一些相关的信息。
参考
- https://patchwork.kernel.org/patch/9672023/, Introduce Kyber multiqueue I/O scheduler.
- https://code.woboq.org/linux/linux/block/kyber-iosched.c.html, Kyber Scheduler代码.