Kyber IO Scheduler of Linux

0x00 引言

Kyber IO调度器是Linux上面针对高速存储设备设计的一个新的IO调度器，配合多队列的Block层使用。在Linux 4.12的时候和BFQ调度器一起成为内核中的一个可选项(emmmm，BFQ的系统复杂程度远高于这个Kyber)。另外，关于Kyber各方面都没有详细的信息，只能在一些[1]地方找到几句简单的介绍。这里完全是根据这几句话和内核中的源代码来推测它的实现，所以可能会存在不少的不准确的地方。

0x01 基本思路

Kyber调度器的基本思路是为每一个硬件队列维护一组对应不同类型IO请求的队列，这些请求主要根据IO操作的方式来进行区分。Kyber按照读、同步写以及其它的(异步写等)将IO请求分为了3类。在Kyber的设计中，更加倾向于让读优先，这个策略也和其它的一些调度器的设计类似，

  /*
   * Scheduling domains. kyber_insert_requests() derives a request's domain
   * from its cmd_flags via kyber_sched_domain(), and the per-domain arrays
   * in the Kyber data structures are sized by KYBER_NUM_DOMAINS.
   */
  enum {
      KYBER_READ,
      KYBER_SYNC_WRITE,
      KYBER_OTHER, /* Async writes, discard, etc. */
      KYBER_NUM_DOMAINS, /* Count of the domains above; keep it last. */
  };

Kyber在一个Kyber上下文中维护了关于这几类请求的队列。它通过限制每一个队列的长度来对在这里产生的请求的延迟进行控制。Kyber只有在这些队列里面的请求被处理了之后才会收集新的请求。这里限制的方式采用了基于Token的方式。另外这里的策略和一些交换机中控制内部缓冲区的思路相似。在下面的一些常量的定义中，可以看出对于读IO请求的偏好，

  /*
   * Initial device-wide depths for each scheduling domain.
   *
   * Even for fast devices with lots of tags like NVMe, you can saturate
   * the device with only a fraction of the maximum possible queue depth.
   * So, we cap these to a reasonable value.
   *
   * These values also act as the upper bound when a domain's depth is
   * resized: kyber_adjust_rw_depth() clamps the new depth to
   * [1, kyber_depth[sched_domain]].
   */
  static const unsigned int kyber_depth[] = {
      [KYBER_READ] = 256,
      [KYBER_SYNC_WRITE] = 128,
      [KYBER_OTHER] = 64,
  };
  
  /*
   * Scheduling domain batch sizes. We favor reads.
   *
   * kyber_dispatch_request() keeps trying the current domain only while
   * khd->batching is below that domain's entry in this table.
   */
  static const unsigned int kyber_batch_size[] = {
      [KYBER_READ] = 16,
      [KYBER_SYNC_WRITE] = 8,
      [KYBER_OTHER] = 8,
  };

另外，由于Kyber面向的是高速存储，这类设备一般是NVMe SSD、NVM之类的一些技术。采用类似CFQ中的一些对请求排序的方法可能有损于性能，所以在Kyber的代码中没有看到对请求排序的逻辑。Kyber会对一些IO请求进行合并操作，以及会尝试批量处理这些请求来提高性能。批量处理的批量的大小根据请求类型来决定。Kyber整体的逻辑不复杂，实际上，Kyber调度器实现文件只有1000来行，除去一些非核心的代码，实际Kyber的核心的代码是很少的。

0x02 代码

下面就是Kyber中几个核心的数据结构。一个kyber_ctx_queue中主要的就是里面对应不同IO种类请求的队列。另外这个数据结构主要被kyber_hctx_data使用，

  /*
   * There is a same mapping between ctx & hctx and kcq & khd,
   * we use request->mq_ctx->index_hw to index the kcq in khd.
   */
  struct kyber_ctx_queue {
      /*
       * Used to ensure operations on rq_list and kcq_map to be an atomic one.
       * Also protect the rqs on rq_list when merge.
       */
      spinlock_t lock;
      /*
       * One list of queued requests per scheduling domain;
       * kyber_insert_requests() moves requests onto these lists.
       */
      struct list_head rq_list[KYBER_NUM_DOMAINS];
  } ____cacheline_aligned_in_smp;
  
  /*
   * Scheduler-wide Kyber state. kyber_dispatch_request() reaches it through
   * hctx->queue->elevator->elevator_data.
   */
  struct kyber_queue_data {
      /* NOTE(review): presumably the request queue this data belongs to — confirm. */
      struct request_queue *q;
  
      /*
       * Statistics callback. kyber_lat_status() reads the per-domain
       * nr_samples and mean from cb->stat[].
       */
      struct blk_stat_callback *cb;
  
      /*
       * The device is divided into multiple scheduling domains based on the
       * request type. Each domain has a fixed number of in-flight requests of
       * that type device-wide, limited by these tokens.
       */
      struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
  
      /*
       * Async request percentage, converted to per-word depth for
       * sbitmap_get_shallow().
       */
      unsigned int async_depth;
  
      /* Target latencies in nanoseconds. */
      u64 read_lat_nsec, write_lat_nsec;
  };
  
  /*
   * Per-hardware-queue Kyber state, stored in hctx->sched_data.
   */
  struct kyber_hctx_data {
      /* Held by kyber_dispatch_request() for the whole dispatch attempt. */
      spinlock_t lock;
      /* NOTE(review): presumably per-domain lists of requests ready to dispatch — confirm. */
      struct list_head rqs[KYBER_NUM_DOMAINS];
      /* Domain that kyber_dispatch_request() is currently dispatching from. */
      unsigned int cur_domain;
      /*
       * Compared against kyber_batch_size[cur_domain]; reset to 0 when
       * kyber_dispatch_request() moves on to another domain.
       */
      unsigned int batching;
      /* Array of per-ctx queues, indexed by rq->mq_ctx->index_hw. */
      struct kyber_ctx_queue *kcqs;
      /*
       * One bitmap per domain. kyber_insert_requests() sets bit index_hw
       * when it queues a request of that domain on the matching kcq.
       */
      struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
      /*
       * NOTE(review): the three arrays below look like per-domain wait
       * bookkeeping for domain tokens; they are not used in the code
       * shown here — confirm against the full source.
       */
      wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
      struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
      atomic_t wait_index[KYBER_NUM_DOMAINS];
  };

添加requests的操作就是根据request的类型添加到对应的队列，

  /*
   * Move every request on @rq_list onto the per-ctx, per-domain Kyber list
   * it belongs to.
   *
   * @hctx:    hardware queue whose sched_data holds the kyber_hctx_data.
   * @rq_list: list of requests to insert; each entry is moved off this list.
   * @at_head: if true, insert at the head of the target list, else at the tail.
   */
  static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
                    struct list_head *rq_list, bool at_head)
  {
      struct kyber_hctx_data *khd = hctx->sched_data;
      struct request *rq, *next;
  
      /* The _safe iterator is needed because each rq is unlinked from rq_list. */
      list_for_each_entry_safe(rq, next, rq_list, queuelist) {
          unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
          struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
          struct list_head *head = &kcq->rq_list[sched_domain];
  
          /*
           * The list move and the kcq_map update are done under kcq->lock
           * so that the two are seen together.
           */
          spin_lock(&kcq->lock);
          if (at_head)
              list_move(&rq->queuelist, head);
          else
              list_move_tail(&rq->queuelist, head);
          /* Flag this kcq as having a request queued for the domain. */
          sbitmap_set_bit(&khd->kcq_map[sched_domain],
                  rq->mq_ctx->index_hw);
          blk_mq_sched_request_inserted(rq);
          spin_unlock(&kcq->lock);
      }
  }

另外一个核心的函数就是request分发的逻辑。这部分的逻辑会尝试一次分发一个批量的requests。如果遇到没有请求 or 进行中的请求超过了Token表示的限制，会尝试去处理其它的Domain的请求。上面的数据结构中保存了目前处理的Domain(cur_domain)的信息。函数kyber_dispatch_cur_domain中主要就是队列的一些处理以及Token的处理。

  /*
   * Pick the next request to dispatch for @hctx.
   *
   * Returns the request handed back by kyber_dispatch_cur_domain(), or NULL
   * when neither the current domain nor any other domain yields one. The
   * whole attempt runs with khd->lock held.
   */
  static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
  {
      struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
      struct kyber_hctx_data *khd = hctx->sched_data;
      struct request *rq = NULL;
      int remaining;
  
      spin_lock(&khd->lock);
  
      /* Stay on the current domain while its batch budget is not used up. */
      if (khd->batching < kyber_batch_size[khd->cur_domain])
          rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
  
      if (!rq) {
          /*
           * We get here when the batch budget was exhausted, or when the
           * current domain had no requests or no tokens left. Begin a new
           * batch and rotate through the domains; after
           * KYBER_NUM_DOMAINS steps the rotation has come back to the
           * domain we started from, so it gets one more chance too.
           */
          khd->batching = 0;
          for (remaining = KYBER_NUM_DOMAINS; remaining > 0; remaining--) {
              if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
                  khd->cur_domain = 0;
              else
                  khd->cur_domain++;
  
              rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
              if (rq)
                  break;
          }
      }
  
      spin_unlock(&khd->lock);
      return rq;
  }

另外，Kyber中两个可以设置的参数就是读、写的目标延迟，这里主要通过调整队列的长度实现。另外，在实际的运行中可能遇到读、写请求的延迟出现比较大的区别。Kyber将读、写延迟定义了Great、Good、Bad等的等级，根据实际测量到的延迟和目标的延迟确定。Kyber会尝试将不同Domain的请求保持同一个评价，即都为Good or 都为 Bad，来保证公平性，

  /*
   * Latency verdict for one scheduling domain, listed from worst to best.
   * Negative values mean the target was missed, positive values mean it
   * was met, and NONE means there were no samples to judge.
   */
  enum {
      AWFUL = -2,
      BAD = -1,
      NONE = 0,
      GOOD = 1,
      GREAT = 2,
  };
  
  /* Sign tests on a verdict; NONE counts as neither good nor bad. */
  #define IS_GOOD(status) ((status) > 0)
  #define IS_BAD(status) ((status) < 0)
  
  /*
   * Grade the measured latency of @sched_domain against @target.
   *
   * Returns NONE when the domain has no samples. Otherwise the mean
   * latency from cb->stat[sched_domain] is compared with the target:
   *   mean >= 2 * target        -> AWFUL
   *   target < mean             -> BAD
   *   target / 2 < mean         -> GOOD
   *   mean <= target / 2        -> GREAT
   */
  static int kyber_lat_status(struct blk_stat_callback *cb,
                  unsigned int sched_domain, u64 target)
  {
      u64 mean;
  
      if (cb->stat[sched_domain].nr_samples == 0)
          return NONE;
  
      mean = cb->stat[sched_domain].mean;
  
      /* Checks run from worst to best; the first match wins. */
      if (mean >= 2 * target)
          return AWFUL;
      if (mean > target)
          return BAD;
      if (mean > target / 2)
          return GOOD;
      return GREAT;
  }

这里的队列深度的调整，Kyber使用了一些启发式的方法(启发式的方法很多时候就是指一些不知所以然，根据测试or经验而来，但是很多时候又有用的一些方法，23333)。Kyber这里将读 or 同步写的归为一类，其它的分为一类，虽然调整的参数有所不同，但是基本的逻辑是一样的，

  /*
   * Adjust the read or synchronous write depth given the status of reads and
   * writes. The goal is that the latencies of the two domains are fair (i.e., if
   * one is good, then the other is good).
   *
   * @kqd:          queue data holding the per-domain token bitmaps.
   * @sched_domain: domain whose depth is adjusted.
   * @this_status:  latency status of @sched_domain (NONE/GOOD/GREAT/BAD/AWFUL).
   * @other_status: latency status of the domain it is being balanced against.
   */
  static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
                    unsigned int sched_domain, int this_status,
                    int other_status)
  {
      unsigned int orig_depth, depth;
  
      /*
       * If this domain had no samples, or reads and writes are both good or
       * both bad, don't adjust the depth.
       */
      if (this_status == NONE ||
          (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
          (IS_BAD(this_status) && IS_BAD(other_status)))
          return;
  
      /* Start from the current depth of this domain's token bitmap. */
      orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
  
      if (other_status == NONE) {
          /* Nothing to compare against: grow this domain by one. */
          depth++;
      } else {
          /*
           * Past the early return, the two statuses are on opposite sides:
           * a good domain gives up depth, a bad domain gains depth. The
           * step is larger when the gap between the two is larger.
           */
          switch (this_status) {
          case GOOD:
              if (other_status == AWFUL)
                  depth -= max(depth / 4, 1U);
              else
                  depth -= max(depth / 8, 1U);
              break;
          case GREAT:
              if (other_status == AWFUL)
                  depth /= 2;
              else
                  depth -= max(depth / 4, 1U);
              break;
          case BAD:
              depth++;
              break;
          case AWFUL:
              if (other_status == GREAT)
                  depth += 2;
              else
                  depth++;
              break;
          }
      }
  
      /*
       * Keep the result within [1, kyber_depth[sched_domain]]; this also
       * brings a depth that was reduced to 0 above back up to 1.
       */
      depth = clamp(depth, 1U, kyber_depth[sched_domain]);
      /* Only resize the token bitmap when the depth actually changed. */
      if (depth != orig_depth)
          sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
  }
  

Kyber和其它的一些调度器相比，还是一种很简单的设计。但是在它适合的环境下，对延迟的降低非常有利。

0x03 评估

关于这部分，可以在网络上找到一些相关的信息。

参考

https://patchwork.kernel.org/patch/9672023/, Introduce Kyber multiqueue I/O scheduler.
https://code.woboq.org/linux/linux/block/kyber-iosched.c.html, Kyber Scheduler代码.