Kyber IO Scheduler of Linux


Kyber IO Scheduler of Linux

0x00 引言

Kyber IO调度器是Linux上面针对高速存储设备设计的一个新的IO调度器,配和多队列的Block层使用。在Linux 4.12的时候和BFQ调度器一起成为内核中的一个可选项(emmmm,BFQ的系统复杂程度远高于这个Kyber)。另外,关于Kyber更方面都没有详细的信息,只能在一些[1]地方找到几句简单的介绍。这里完全是根据这几句话和内核中的源代码来推测它的实现,所以可能会存在不少的不准确的地方。

0x01 基本思路

Kyber调度器的基本思路是会为每一个的硬件的队列维护一个不同类型IO请求的队列,这些请求主要根据IO操作的方式来进行区分。Kyber按照读、同步写以及其它的(异步写等)将IO请求分为了3类。在Kyber的设计中,更加倾向于让读有些,这个策略也和其它的一些调度器的设计类似,

34    /* Scheduling domains. */
35    enum {
36        KYBER_READ,
37        KYBER_SYNC_WRITE,
38        KYBER_OTHER, /* Async writes, discard, etc. */
39        KYBER_NUM_DOMAINS,
40    };

Kyber在一个Kyber上下文中维护了关于这几类请求的队列。它通过限制每一个队列的长度来对在这里产生的请求的延迟进行控制。Kyber只有在这些队列里面的请求被处理了之后才会收集新的请求。这里限制的方式采用了基于Token的方式。另外这里的策略和一些交换机中控制内部缓冲区的思路相似。在下面的一些常量的定义中,可以看出对于读IO请求的偏好,

53    /*
54     * Initial device-wide depths for each scheduling domain.
55     *
56     * Even for fast devices with lots of tags like NVMe, you can saturate
57     * the device with only a fraction of the maximum possible queue depth.
58     * So, we cap these to a reasonable value.
59     */
60    static const unsigned int kyber_depth[] = {
61        [KYBER_READ] = 256,
62        [KYBER_SYNC_WRITE] = 128,
63        [KYBER_OTHER] = 64,
64    };
65    
66    /*
67     * Scheduling domain batch sizes. We favor reads.
68     */
69    static const unsigned int kyber_batch_size[] = {
70        [KYBER_READ] = 16,
71        [KYBER_SYNC_WRITE] = 8,
72        [KYBER_OTHER] = 8,
73    };

另外,由于Kyber面向的是高速存储,这类设备一般是NVMe SSD、NVM之类的一些技术。采用类似CFQ中的一些对请求排序的方法可能有损于性能,所以在Kyber的代码中没有看到对请求排序的逻辑。Kyber会对一些IO请求进行合并操作,以及会尝试批量处理这些请求来提高性能。批量处理的批量的大小根据请求类型来决定。Kyber整体的逻辑不复杂,实际上,Kyber调度器实现文件只有1000来行,除去一些非核心的代码,实际Kyber的核心的代码是很少的。

0x02 代码

下面就是Kyber中几个核心的数据结构。一个kyber_ctx_queue中主要的就是里面对应不同IO种类请求的队列。另外这个数据结构主要被kyber_hctx_data使用,

75    /*
76     * There is a same mapping between ctx & hctx and kcq & khd,
77     * we use request->mq_ctx->index_hw to index the kcq in khd.
78     */
79    struct kyber_ctx_queue {
80        /*
81         * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
82         * Also protect the rqs on rq_list when merge.
83         */
84        spinlock_t lock;
85        struct list_head rq_list[KYBER_NUM_DOMAINS];
86    } ____cacheline_aligned_in_smp;
87    
88    struct kyber_queue_data {
89        struct request_queue *q;
90    
91        struct blk_stat_callback *cb;
92    
93        /*
94         * The device is divided into multiple scheduling domains based on the
95         * request type. Each domain has a fixed number of in-flight requests of
96         * that type device-wide, limited by these tokens.
97         */
98        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
99    
100        /*
101         * Async request percentage, converted to per-word depth for
102         * sbitmap_get_shallow().
103         */
104        unsigned int async_depth;
105    
106        /* Target latencies in nanoseconds. */
107        u64 read_lat_nsec, write_lat_nsec;
108    };
109    
110    struct kyber_hctx_data {
111        spinlock_t lock;
112        struct list_head rqs[KYBER_NUM_DOMAINS];
113        unsigned int cur_domain;
114        unsigned int batching;
115        struct kyber_ctx_queue *kcqs;
116        struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
117        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
118        struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
119        atomic_t wait_index[KYBER_NUM_DOMAINS];
120    };

添加requests的操作就是根据request的类型添加到对应的队列,

531    static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
532                      struct list_head *rq_list, bool at_head)
533    {
534        struct kyber_hctx_data *khd = hctx->sched_data;
535        struct request *rq, *next;
536    
537        list_for_each_entry_safe(rq, next, rq_list, queuelist) {
538            unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
539            struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
540            struct list_head *head = &kcq->rq_list[sched_domain];
541    
542            spin_lock(&kcq->lock);
543            if (at_head)
544                list_move(&rq->queuelist, head);
545            else
546                list_move_tail(&rq->queuelist, head);
547            sbitmap_set_bit(&khd->kcq_map[sched_domain],
548                    rq->mq_ctx->index_hw);
549            blk_mq_sched_request_inserted(rq);
550            spin_unlock(&kcq->lock);
551        }
552    }

另外一个核心的函数就是request分发的逻辑。这部分的逻辑会尝试一些分发一个批量的requests。如果遇到没有请求 or 进行中的请求超过了Token表示的限制,会尝试去处理其它的Domain的请求。上面的数据结构中保存了目前处理的Domain(cur_domain)的信息。在函数kyber_dispatch_cur_domain主要就是队列的一些处理以及Token的处理。

733    static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
734    {
735        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
736        struct kyber_hctx_data *khd = hctx->sched_data;
737        struct request *rq;
738        int i;
739    
740        spin_lock(&khd->lock);
741    
742        /*
743         * First, if we are still entitled to batch, try to dispatch a request
744         * from the batch.
745         */
746        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
747            rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
748            if (rq)
749                goto out;
750        }
751    
752        /*
753         * Either,
754         * 1. We were no longer entitled to a batch.
755         * 2. The domain we were batching didn't have any requests.
756         * 3. The domain we were batching was out of tokens.
757         *
758         * Start another batch. Note that this wraps back around to the original
759         * domain if no other domains have requests or tokens.
760         */
761        khd->batching = 0;
762        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
763            if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
764                khd->cur_domain = 0;
765            else
766                khd->cur_domain++;
767    
768            rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
769            if (rq)
770                goto out;
771        }
772    
773        rq = NULL;
774    out:
775        spin_unlock(&khd->lock);
776        return rq;
777    }

另外中Kyber两个设置的参数就是读、写的延迟,这里主要通过调整队列的长度实现。另外,在实际的运行中可以遇到读、写请求的延迟出现比较大的区别。Kyber将读、写延迟定义了Great、Good、Bad等的等级,根据实际测量到的延迟和目标的延迟确定。Kyber会尝试将不同Domain的请求保持同一个评价,即都为Good or 都为 Bad,来保证公平性,

135    enum {
136        NONE = 0,
137        GOOD = 1,
138        GREAT = 2,
139        BAD = -1,
140        AWFUL = -2,
141    };
142    
143    #define IS_GOOD(status) ((status) > 0)
144    #define IS_BAD(status) ((status) < 0)
145    
146    static int kyber_lat_status(struct blk_stat_callback *cb,
147                    unsigned int sched_domain, u64 target)
148    {
149        u64 latency;
150    
151        if (!cb->stat[sched_domain].nr_samples)
152            return NONE;
153    
154        latency = cb->stat[sched_domain].mean;
155        if (latency >= 2 * target)
156            return AWFUL;
157        else if (latency > target)
158            return BAD;
159        else if (latency <= target / 2)
160            return GREAT;
161        else /* (latency <= target) */
162            return GOOD;
163    }

这里的队列深度的调整,Kyber使用了一些启发式的方法(启发式的方法很多时候就是指一些不知所以然,根据测试or经验而来,但是很多时候又用的一些方法,23333)。Kyber这里提交将读 or 同步写的范围一类,其它的分为一类,虽然调整的参数有所不同,但是基本的逻辑是一样的,

165    /*
166     * Adjust the read or synchronous write depth given the status of reads and
167     * writes. The goal is that the latencies of the two domains are fair (i.e., if
168     * one is good, then the other is good).
169     */
170    static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
171                      unsigned int sched_domain, int this_status,
172                      int other_status)
173    {
174        unsigned int orig_depth, depth;
175    
176        /*
177         * If this domain had no samples, or reads and writes are both good or
178         * both bad, don't adjust the depth.
179         */
180        if (this_status == NONE ||
181            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
182            (IS_BAD(this_status) && IS_BAD(other_status)))
183            return;
184    
185        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
186    
187        if (other_status == NONE) {
188            depth++;
189        } else {
190            switch (this_status) {
191            case GOOD:
192                if (other_status == AWFUL)
193                    depth -= max(depth / 4, 1U);
194                else
195                    depth -= max(depth / 8, 1U);
196                break;
197            case GREAT:
198                if (other_status == AWFUL)
199                    depth /= 2;
200                else
201                    depth -= max(depth / 4, 1U);
202                break;
203            case BAD:
204                depth++;
205                break;
206            case AWFUL:
207                if (other_status == GREAT)
208                    depth += 2;
209                else
210                    depth++;
211                break;
212            }
213        }
214    
215        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
216        if (depth != orig_depth)
217            sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
218    }
219    

Kyber和其它的一些调度器相比,还是一种很简单的设计。但是在它适合的环境下,对延迟的降低非常有利。

0x03 评估

这里的信息在网络上面能够找到一些相关的信息。

参考

  1. https://patchwork.kernel.org/patch/9672023/, Introduce Kyber multiqueue I/O scheduler.
  2. https://code.woboq.org/linux/linux/block/kyber-iosched.c.html, Kyber Scheduler代码.