Linux-libre 5.4.48-gnu
librecmc/linux-libre.git: drivers/nvme/target/rdma.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics RDMA target.
4  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/atomic.h>
8 #include <linux/ctype.h>
9 #include <linux/delay.h>
10 #include <linux/err.h>
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/nvme.h>
14 #include <linux/slab.h>
15 #include <linux/string.h>
16 #include <linux/wait.h>
17 #include <linux/inet.h>
18 #include <asm/unaligned.h>
19
20 #include <rdma/ib_verbs.h>
21 #include <rdma/rdma_cm.h>
22 #include <rdma/rw.h>
23
24 #include <linux/nvme-rdma.h>
25 #include "nvmet.h"
26
27 /*
28  * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
29  */
30 #define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE     PAGE_SIZE
31 #define NVMET_RDMA_MAX_INLINE_SGE               4
32 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE         max_t(int, SZ_16K, PAGE_SIZE)
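/*
 * Worked example of the sizing above: with 4KB pages the limit is
 * max_t(int, SZ_16K, PAGE_SIZE) == 16KB, which fits exactly in the four
 * inline SGEs (4 * PAGE_SIZE).  With 64KB pages the PAGE_SIZE term wins
 * and a single inline page already covers the maximum.
 */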
33
34 struct nvmet_rdma_cmd {
35         struct ib_sge           sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
36         struct ib_cqe           cqe;
37         struct ib_recv_wr       wr;
38         struct scatterlist      inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
39         struct nvme_command     *nvme_cmd;
40         struct nvmet_rdma_queue *queue;
41 };
42
43 enum {
44         NVMET_RDMA_REQ_INLINE_DATA      = (1 << 0),
45         NVMET_RDMA_REQ_INVALIDATE_RKEY  = (1 << 1),
46 };
47
48 struct nvmet_rdma_rsp {
49         struct ib_sge           send_sge;
50         struct ib_cqe           send_cqe;
51         struct ib_send_wr       send_wr;
52
53         struct nvmet_rdma_cmd   *cmd;
54         struct nvmet_rdma_queue *queue;
55
56         struct ib_cqe           read_cqe;
57         struct rdma_rw_ctx      rw;
58
59         struct nvmet_req        req;
60
61         bool                    allocated;
62         u8                      n_rdma;
63         u32                     flags;
64         u32                     invalidate_rkey;
65
66         struct list_head        wait_list;
67         struct list_head        free_list;
68 };
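/*
 * Responses are normally taken from the queue's pre-allocated free_rsps
 * pool (sized at 2 * recv_queue_size in nvmet_rdma_alloc_rsps()).  If the
 * pool is empty, nvmet_rdma_get_rsp() falls back to a dynamic allocation
 * and sets ->allocated so that nvmet_rdma_put_rsp() frees it instead of
 * returning it to the pool.
 */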
69
70 enum nvmet_rdma_queue_state {
71         NVMET_RDMA_Q_CONNECTING,
72         NVMET_RDMA_Q_LIVE,
73         NVMET_RDMA_Q_DISCONNECTING,
74 };
75
76 struct nvmet_rdma_queue {
77         struct rdma_cm_id       *cm_id;
78         struct nvmet_port       *port;
79         struct ib_cq            *cq;
80         atomic_t                sq_wr_avail;
81         struct nvmet_rdma_device *dev;
82         spinlock_t              state_lock;
83         enum nvmet_rdma_queue_state state;
84         struct nvmet_cq         nvme_cq;
85         struct nvmet_sq         nvme_sq;
86
87         struct nvmet_rdma_rsp   *rsps;
88         struct list_head        free_rsps;
89         spinlock_t              rsps_lock;
90         struct nvmet_rdma_cmd   *cmds;
91
92         struct work_struct      release_work;
93         struct list_head        rsp_wait_list;
94         struct list_head        rsp_wr_wait_list;
95         spinlock_t              rsp_wr_wait_lock;
96
97         int                     idx;
98         int                     host_qid;
99         int                     recv_queue_size;
100         int                     send_queue_size;
101
102         struct list_head        queue_list;
103 };
104
105 struct nvmet_rdma_device {
106         struct ib_device        *device;
107         struct ib_pd            *pd;
108         struct ib_srq           *srq;
109         struct nvmet_rdma_cmd   *srq_cmds;
110         size_t                  srq_size;
111         struct kref             ref;
112         struct list_head        entry;
113         int                     inline_data_size;
114         int                     inline_page_count;
115 };
116
117 static bool nvmet_rdma_use_srq;
118 module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
119 MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
120
121 static DEFINE_IDA(nvmet_rdma_queue_ida);
122 static LIST_HEAD(nvmet_rdma_queue_list);
123 static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
124
125 static LIST_HEAD(device_list);
126 static DEFINE_MUTEX(device_list_mutex);
127
128 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
129 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
130 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
131 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
132 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
133 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
134 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
135                                 struct nvmet_rdma_rsp *r);
136 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
137                                 struct nvmet_rdma_rsp *r);
138
139 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
140
141 static int num_pages(int len)
142 {
143         return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
144 }
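/*
 * num_pages() is DIV_ROUND_UP(len, PAGE_SIZE) for len >= 1: with 4KB pages,
 * num_pages(1) == 1, num_pages(4096) == 1 and num_pages(4097) == 2.
 */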
145
146 /* XXX: really should move to a generic header sooner or later.. */
147 static inline u32 get_unaligned_le24(const u8 *p)
148 {
149         return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
150 }
151
152 static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
153 {
154         return nvme_is_write(rsp->req.cmd) &&
155                 rsp->req.transfer_len &&
156                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
157 }
158
159 static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
160 {
161         return !nvme_is_write(rsp->req.cmd) &&
162                 rsp->req.transfer_len &&
163                 !rsp->req.cqe->status &&
164                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
165 }
166
167 static inline struct nvmet_rdma_rsp *
168 nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
169 {
170         struct nvmet_rdma_rsp *rsp;
171         unsigned long flags;
172
173         spin_lock_irqsave(&queue->rsps_lock, flags);
174         rsp = list_first_entry_or_null(&queue->free_rsps,
175                                 struct nvmet_rdma_rsp, free_list);
176         if (likely(rsp))
177                 list_del(&rsp->free_list);
178         spin_unlock_irqrestore(&queue->rsps_lock, flags);
179
180         if (unlikely(!rsp)) {
181                 int ret;
182
183                 rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
184                 if (unlikely(!rsp))
185                         return NULL;
186                 ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
187                 if (unlikely(ret)) {
188                         kfree(rsp);
189                         return NULL;
190                 }
191
192                 rsp->allocated = true;
193         }
194
195         return rsp;
196 }
197
198 static inline void
199 nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
200 {
201         unsigned long flags;
202
203         if (unlikely(rsp->allocated)) {
204                 nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
205                 kfree(rsp);
206                 return;
207         }
208
209         spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
210         list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
211         spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
212 }
213
214 static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
215                                 struct nvmet_rdma_cmd *c)
216 {
217         struct scatterlist *sg;
218         struct ib_sge *sge;
219         int i;
220
221         if (!ndev->inline_data_size)
222                 return;
223
224         sg = c->inline_sg;
225         sge = &c->sge[1];
226
227         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
228                 if (sge->length)
229                         ib_dma_unmap_page(ndev->device, sge->addr,
230                                         sge->length, DMA_FROM_DEVICE);
231                 if (sg_page(sg))
232                         __free_page(sg_page(sg));
233         }
234 }
235
236 static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
237                                 struct nvmet_rdma_cmd *c)
238 {
239         struct scatterlist *sg;
240         struct ib_sge *sge;
241         struct page *pg;
242         int len;
243         int i;
244
245         if (!ndev->inline_data_size)
246                 return 0;
247
248         sg = c->inline_sg;
249         sg_init_table(sg, ndev->inline_page_count);
250         sge = &c->sge[1];
251         len = ndev->inline_data_size;
252
253         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
254                 pg = alloc_page(GFP_KERNEL);
255                 if (!pg)
256                         goto out_err;
257                 sg_assign_page(sg, pg);
258                 sge->addr = ib_dma_map_page(ndev->device,
259                         pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
260                 if (ib_dma_mapping_error(ndev->device, sge->addr))
261                         goto out_err;
262                 sge->length = min_t(int, len, PAGE_SIZE);
263                 sge->lkey = ndev->pd->local_dma_lkey;
264                 len -= sge->length;
265         }
266
267         return 0;
268 out_err:
269         for (; i >= 0; i--, sg--, sge--) {
270                 if (sge->length)
271                         ib_dma_unmap_page(ndev->device, sge->addr,
272                                         sge->length, DMA_FROM_DEVICE);
273                 if (sg_page(sg))
274                         __free_page(sg_page(sg));
275         }
276         return -ENOMEM;
277 }
278
279 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
280                         struct nvmet_rdma_cmd *c, bool admin)
281 {
282         /* NVMe command / RDMA RECV */
283         c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
284         if (!c->nvme_cmd)
285                 goto out;
286
287         c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
288                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
289         if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
290                 goto out_free_cmd;
291
292         c->sge[0].length = sizeof(*c->nvme_cmd);
293         c->sge[0].lkey = ndev->pd->local_dma_lkey;
294
295         if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
296                 goto out_unmap_cmd;
297
298         c->cqe.done = nvmet_rdma_recv_done;
299
300         c->wr.wr_cqe = &c->cqe;
301         c->wr.sg_list = c->sge;
302         c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
303
304         return 0;
305
306 out_unmap_cmd:
307         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
308                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
309 out_free_cmd:
310         kfree(c->nvme_cmd);
311
312 out:
313         return -ENOMEM;
314 }
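/*
 * Receive buffer layout set up above: sge[0] always maps the 64-byte NVMe
 * command capsule and sge[1..inline_page_count] map the per-command inline
 * data pages.  Admin queues carry no inline data, so they post only the
 * command SGE (wr.num_sge == 1).
 */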
315
316 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
317                 struct nvmet_rdma_cmd *c, bool admin)
318 {
319         if (!admin)
320                 nvmet_rdma_free_inline_pages(ndev, c);
321         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
322                                 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
323         kfree(c->nvme_cmd);
324 }
325
326 static struct nvmet_rdma_cmd *
327 nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
328                 int nr_cmds, bool admin)
329 {
330         struct nvmet_rdma_cmd *cmds;
331         int ret = -EINVAL, i;
332
333         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
334         if (!cmds)
335                 goto out;
336
337         for (i = 0; i < nr_cmds; i++) {
338                 ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
339                 if (ret)
340                         goto out_free;
341         }
342
343         return cmds;
344
345 out_free:
346         while (--i >= 0)
347                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
348         kfree(cmds);
349 out:
350         return ERR_PTR(ret);
351 }
352
353 static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
354                 struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
355 {
356         int i;
357
358         for (i = 0; i < nr_cmds; i++)
359                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
360         kfree(cmds);
361 }
362
363 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
364                 struct nvmet_rdma_rsp *r)
365 {
366         /* NVMe CQE / RDMA SEND */
367         r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
368         if (!r->req.cqe)
369                 goto out;
370
371         r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
372                         sizeof(*r->req.cqe), DMA_TO_DEVICE);
373         if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
374                 goto out_free_rsp;
375
376         r->req.p2p_client = &ndev->device->dev;
377         r->send_sge.length = sizeof(*r->req.cqe);
378         r->send_sge.lkey = ndev->pd->local_dma_lkey;
379
380         r->send_cqe.done = nvmet_rdma_send_done;
381
382         r->send_wr.wr_cqe = &r->send_cqe;
383         r->send_wr.sg_list = &r->send_sge;
384         r->send_wr.num_sge = 1;
385         r->send_wr.send_flags = IB_SEND_SIGNALED;
386
387         /* Data In / RDMA READ */
388         r->read_cqe.done = nvmet_rdma_read_data_done;
389         return 0;
390
391 out_free_rsp:
392         kfree(r->req.cqe);
393 out:
394         return -ENOMEM;
395 }
396
397 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
398                 struct nvmet_rdma_rsp *r)
399 {
400         ib_dma_unmap_single(ndev->device, r->send_sge.addr,
401                                 sizeof(*r->req.cqe), DMA_TO_DEVICE);
402         kfree(r->req.cqe);
403 }
404
405 static int
406 nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
407 {
408         struct nvmet_rdma_device *ndev = queue->dev;
409         int nr_rsps = queue->recv_queue_size * 2;
410         int ret = -EINVAL, i;
411
412         queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
413                         GFP_KERNEL);
414         if (!queue->rsps)
415                 goto out;
416
417         for (i = 0; i < nr_rsps; i++) {
418                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
419
420                 ret = nvmet_rdma_alloc_rsp(ndev, rsp);
421                 if (ret)
422                         goto out_free;
423
424                 list_add_tail(&rsp->free_list, &queue->free_rsps);
425         }
426
427         return 0;
428
429 out_free:
430         while (--i >= 0) {
431                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
432
433                 list_del(&rsp->free_list);
434                 nvmet_rdma_free_rsp(ndev, rsp);
435         }
436         kfree(queue->rsps);
437 out:
438         return ret;
439 }
440
441 static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
442 {
443         struct nvmet_rdma_device *ndev = queue->dev;
444         int i, nr_rsps = queue->recv_queue_size * 2;
445
446         for (i = 0; i < nr_rsps; i++) {
447                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
448
449                 list_del(&rsp->free_list);
450                 nvmet_rdma_free_rsp(ndev, rsp);
451         }
452         kfree(queue->rsps);
453 }
454
455 static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
456                 struct nvmet_rdma_cmd *cmd)
457 {
458         int ret;
459
460         ib_dma_sync_single_for_device(ndev->device,
461                 cmd->sge[0].addr, cmd->sge[0].length,
462                 DMA_FROM_DEVICE);
463
464         if (ndev->srq)
465                 ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
466         else
467                 ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
468
469         if (unlikely(ret))
470                 pr_err("post_recv cmd failed\n");
471
472         return ret;
473 }
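/*
 * When a shared receive queue is in use (see the use_srq module parameter),
 * receives are replenished on the device-wide SRQ; otherwise they go to the
 * per-queue QP.  In both cases the command buffer is synced for the device
 * before being re-posted.
 */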
474
475 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
476 {
477         spin_lock(&queue->rsp_wr_wait_lock);
478         while (!list_empty(&queue->rsp_wr_wait_list)) {
479                 struct nvmet_rdma_rsp *rsp;
480                 bool ret;
481
482                 rsp = list_entry(queue->rsp_wr_wait_list.next,
483                                 struct nvmet_rdma_rsp, wait_list);
484                 list_del(&rsp->wait_list);
485
486                 spin_unlock(&queue->rsp_wr_wait_lock);
487                 ret = nvmet_rdma_execute_command(rsp);
488                 spin_lock(&queue->rsp_wr_wait_lock);
489
490                 if (!ret) {
491                         list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
492                         break;
493                 }
494         }
495         spin_unlock(&queue->rsp_wr_wait_lock);
496 }
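/*
 * Send-queue credit handling: nvmet_rdma_execute_command() fails when
 * sq_wr_avail would go negative, and nvmet_rdma_handle_command() then parks
 * the command on rsp_wr_wait_list.  Once a completed response returns its
 * credits in nvmet_rdma_release_rsp(), the list is drained via
 * nvmet_rdma_process_wr_wait_list() above and the parked commands are
 * retried in order.
 */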
497
498
499 static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
500 {
501         struct nvmet_rdma_queue *queue = rsp->queue;
502
503         atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
504
505         if (rsp->n_rdma) {
506                 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
507                                 queue->cm_id->port_num, rsp->req.sg,
508                                 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
509         }
510
511         if (rsp->req.sg != rsp->cmd->inline_sg)
512                 nvmet_req_free_sgl(&rsp->req);
513
514         if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
515                 nvmet_rdma_process_wr_wait_list(queue);
516
517         nvmet_rdma_put_rsp(rsp);
518 }
519
520 static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
521 {
522         if (queue->nvme_sq.ctrl) {
523                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
524         } else {
525                 /*
526                  * The controller has not been set up yet (e.g. the
527                  * admin connect failed), so just disconnect and
528                  * clean up the queue.
529                  */
530                 nvmet_rdma_queue_disconnect(queue);
531         }
532 }
533
534 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
535 {
536         struct nvmet_rdma_rsp *rsp =
537                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
538         struct nvmet_rdma_queue *queue = cq->cq_context;
539
540         nvmet_rdma_release_rsp(rsp);
541
542         if (unlikely(wc->status != IB_WC_SUCCESS &&
543                      wc->status != IB_WC_WR_FLUSH_ERR)) {
544                 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
545                         wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
546                 nvmet_rdma_error_comp(queue);
547         }
548 }
549
550 static void nvmet_rdma_queue_response(struct nvmet_req *req)
551 {
552         struct nvmet_rdma_rsp *rsp =
553                 container_of(req, struct nvmet_rdma_rsp, req);
554         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
555         struct ib_send_wr *first_wr;
556
557         if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
558                 rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
559                 rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
560         } else {
561                 rsp->send_wr.opcode = IB_WR_SEND;
562         }
563
564         if (nvmet_rdma_need_data_out(rsp))
565                 first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
566                                 cm_id->port_num, NULL, &rsp->send_wr);
567         else
568                 first_wr = &rsp->send_wr;
569
570         nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
571
572         ib_dma_sync_single_for_device(rsp->queue->dev->device,
573                 rsp->send_sge.addr, rsp->send_sge.length,
574                 DMA_TO_DEVICE);
575
576         if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
577                 pr_err("sending cmd response failed\n");
578                 nvmet_rdma_release_rsp(rsp);
579         }
580 }
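/*
 * Response path above: a fresh RECV is posted to replace the one consumed by
 * this command before the response is sent, and for commands returning data
 * the RDMA WRITE WRs are chained in front of the SEND via rdma_rw_ctx_wrs(),
 * so a single ib_post_send() issues the whole chain.
 */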
581
582 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
583 {
584         struct nvmet_rdma_rsp *rsp =
585                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
586         struct nvmet_rdma_queue *queue = cq->cq_context;
587
588         WARN_ON(rsp->n_rdma <= 0);
589         atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
590         rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
591                         queue->cm_id->port_num, rsp->req.sg,
592                         rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
593         rsp->n_rdma = 0;
594
595         if (unlikely(wc->status != IB_WC_SUCCESS)) {
596                 nvmet_req_uninit(&rsp->req);
597                 nvmet_rdma_release_rsp(rsp);
598                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
599                         pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
600                                 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
601                         nvmet_rdma_error_comp(queue);
602                 }
603                 return;
604         }
605
606         nvmet_req_execute(&rsp->req);
607 }
608
609 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
610                 u64 off)
611 {
612         int sg_count = num_pages(len);
613         struct scatterlist *sg;
614         int i;
615
616         sg = rsp->cmd->inline_sg;
617         for (i = 0; i < sg_count; i++, sg++) {
618                 if (i < sg_count - 1)
619                         sg_unmark_end(sg);
620                 else
621                         sg_mark_end(sg);
622                 sg->offset = off;
623                 sg->length = min_t(int, len, PAGE_SIZE - off);
624                 len -= sg->length;
625                 if (!i)
626                         off = 0;
627         }
628
629         rsp->req.sg = rsp->cmd->inline_sg;
630         rsp->req.sg_cnt = sg_count;
631 }
632
633 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
634 {
635         struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
636         u64 off = le64_to_cpu(sgl->addr);
637         u32 len = le32_to_cpu(sgl->length);
638
639         if (!nvme_is_write(rsp->req.cmd)) {
640                 rsp->req.error_loc =
641                         offsetof(struct nvme_common_command, opcode);
642                 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
643         }
644
645         if (off + len > rsp->queue->dev->inline_data_size) {
646                 pr_err("invalid inline data offset!\n");
647                 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
648         }
649
650         /* no data command? */
651         if (!len)
652                 return 0;
653
654         nvmet_rdma_use_inline_sg(rsp, len, off);
655         rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
656         rsp->req.transfer_len += len;
657         return 0;
658 }
659
660 static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
661                 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
662 {
663         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
664         u64 addr = le64_to_cpu(sgl->addr);
665         u32 key = get_unaligned_le32(sgl->key);
666         int ret;
667
668         rsp->req.transfer_len = get_unaligned_le24(sgl->length);
669
670         /* no data command? */
671         if (!rsp->req.transfer_len)
672                 return 0;
673
674         ret = nvmet_req_alloc_sgl(&rsp->req);
675         if (ret < 0)
676                 goto error_out;
677
678         ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
679                         rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
680                         nvmet_data_dir(&rsp->req));
681         if (ret < 0)
682                 goto error_out;
683         rsp->n_rdma += ret;
684
685         if (invalidate) {
686                 rsp->invalidate_rkey = key;
687                 rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
688         }
689
690         return 0;
691
692 error_out:
693         rsp->req.transfer_len = 0;
694         return NVME_SC_INTERNAL;
695 }
696
697 static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
698 {
699         struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
700
701         switch (sgl->type >> 4) {
702         case NVME_SGL_FMT_DATA_DESC:
703                 switch (sgl->type & 0xf) {
704                 case NVME_SGL_FMT_OFFSET:
705                         return nvmet_rdma_map_sgl_inline(rsp);
706                 default:
707                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
708                         rsp->req.error_loc =
709                                 offsetof(struct nvme_common_command, dptr);
710                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
711                 }
712         case NVME_KEY_SGL_FMT_DATA_DESC:
713                 switch (sgl->type & 0xf) {
714                 case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
715                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
716                 case NVME_SGL_FMT_ADDRESS:
717                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
718                 default:
719                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
720                         rsp->req.error_loc =
721                                 offsetof(struct nvme_common_command, dptr);
722                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
723                 }
724         default:
725                 pr_err("invalid SGL type: %#x\n", sgl->type);
726                 rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
727                 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
728         }
729 }
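/*
 * SGL dispatch summary: inline (offset) data descriptors reuse the inline
 * pages that arrived with the command, while keyed SGLs set up an rdma_rw
 * context that will RDMA READ from or WRITE to the host memory described by
 * addr/key; when the host asked for it, the rkey is remembered so the
 * response can be sent with SEND_WITH_INV.
 */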
730
731 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
732 {
733         struct nvmet_rdma_queue *queue = rsp->queue;
734
735         if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
736                         &queue->sq_wr_avail) < 0)) {
737                 pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
738                                 1 + rsp->n_rdma, queue->idx,
739                                 queue->nvme_sq.ctrl->cntlid);
740                 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
741                 return false;
742         }
743
744         if (nvmet_rdma_need_data_in(rsp)) {
745                 if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
746                                 queue->cm_id->port_num, &rsp->read_cqe, NULL))
747                         nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
748         } else {
749                 nvmet_req_execute(&rsp->req);
750         }
751
752         return true;
753 }
754
755 static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
756                 struct nvmet_rdma_rsp *cmd)
757 {
758         u16 status;
759
760         ib_dma_sync_single_for_cpu(queue->dev->device,
761                 cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
762                 DMA_FROM_DEVICE);
763         ib_dma_sync_single_for_cpu(queue->dev->device,
764                 cmd->send_sge.addr, cmd->send_sge.length,
765                 DMA_TO_DEVICE);
766
767         if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
768                         &queue->nvme_sq, &nvmet_rdma_ops))
769                 return;
770
771         status = nvmet_rdma_map_sgl(cmd);
772         if (status)
773                 goto out_err;
774
775         if (unlikely(!nvmet_rdma_execute_command(cmd))) {
776                 spin_lock(&queue->rsp_wr_wait_lock);
777                 list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
778                 spin_unlock(&queue->rsp_wr_wait_lock);
779         }
780
781         return;
782
783 out_err:
784         nvmet_req_complete(&cmd->req, status);
785 }
786
787 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
788 {
789         struct nvmet_rdma_cmd *cmd =
790                 container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
791         struct nvmet_rdma_queue *queue = cq->cq_context;
792         struct nvmet_rdma_rsp *rsp;
793
794         if (unlikely(wc->status != IB_WC_SUCCESS)) {
795                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
796                         pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
797                                 wc->wr_cqe, ib_wc_status_msg(wc->status),
798                                 wc->status);
799                         nvmet_rdma_error_comp(queue);
800                 }
801                 return;
802         }
803
804         if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
805                 pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
806                 nvmet_rdma_error_comp(queue);
807                 return;
808         }
809
810         cmd->queue = queue;
811         rsp = nvmet_rdma_get_rsp(queue);
812         if (unlikely(!rsp)) {
813                 /*
814                  * We get here only under memory pressure;
815                  * silently drop and have the host retry,
816                  * as we can't even fail it.
817                  */
818                 nvmet_rdma_post_recv(queue->dev, cmd);
819                 return;
820         }
821         rsp->queue = queue;
822         rsp->cmd = cmd;
823         rsp->flags = 0;
824         rsp->req.cmd = cmd->nvme_cmd;
825         rsp->req.port = queue->port;
826         rsp->n_rdma = 0;
827
828         if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
829                 unsigned long flags;
830
831                 spin_lock_irqsave(&queue->state_lock, flags);
832                 if (queue->state == NVMET_RDMA_Q_CONNECTING)
833                         list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
834                 else
835                         nvmet_rdma_put_rsp(rsp);
836                 spin_unlock_irqrestore(&queue->state_lock, flags);
837                 return;
838         }
839
840         nvmet_rdma_handle_command(queue, rsp);
841 }
842
843 static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
844 {
845         if (!ndev->srq)
846                 return;
847
848         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
849         ib_destroy_srq(ndev->srq);
850 }
851
852 static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
853 {
854         struct ib_srq_init_attr srq_attr = { NULL, };
855         struct ib_srq *srq;
856         size_t srq_size;
857         int ret, i;
858
859         srq_size = 4095;        /* XXX: tune */
860
861         srq_attr.attr.max_wr = srq_size;
862         srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
863         srq_attr.attr.srq_limit = 0;
864         srq_attr.srq_type = IB_SRQT_BASIC;
865         srq = ib_create_srq(ndev->pd, &srq_attr);
866         if (IS_ERR(srq)) {
867                 /*
868                  * If SRQs aren't supported we just go ahead and use normal
869                  * non-shared receive queues.
870                  */
871                 pr_info("SRQ requested but not supported.\n");
872                 return 0;
873         }
874
875         ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
876         if (IS_ERR(ndev->srq_cmds)) {
877                 ret = PTR_ERR(ndev->srq_cmds);
878                 goto out_destroy_srq;
879         }
880
881         ndev->srq = srq;
882         ndev->srq_size = srq_size;
883
884         for (i = 0; i < srq_size; i++) {
885                 ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
886                 if (ret)
887                         goto out_free_cmds;
888         }
889
890         return 0;
891
892 out_free_cmds:
893         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
894 out_destroy_srq:
895         ib_destroy_srq(srq);
896         return ret;
897 }
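/*
 * Note that failing to create the SRQ is not fatal: the function returns 0
 * with ndev->srq left NULL, so queues fall back to ordinary per-queue
 * receive queues.  The SRQ depth is currently a fixed 4095 entries, as
 * flagged by the XXX above.
 */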
898
899 static void nvmet_rdma_free_dev(struct kref *ref)
900 {
901         struct nvmet_rdma_device *ndev =
902                 container_of(ref, struct nvmet_rdma_device, ref);
903
904         mutex_lock(&device_list_mutex);
905         list_del(&ndev->entry);
906         mutex_unlock(&device_list_mutex);
907
908         nvmet_rdma_destroy_srq(ndev);
909         ib_dealloc_pd(ndev->pd);
910
911         kfree(ndev);
912 }
913
914 static struct nvmet_rdma_device *
915 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
916 {
917         struct nvmet_port *port = cm_id->context;
918         struct nvmet_rdma_device *ndev;
919         int inline_page_count;
920         int inline_sge_count;
921         int ret;
922
923         mutex_lock(&device_list_mutex);
924         list_for_each_entry(ndev, &device_list, entry) {
925                 if (ndev->device->node_guid == cm_id->device->node_guid &&
926                     kref_get_unless_zero(&ndev->ref))
927                         goto out_unlock;
928         }
929
930         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
931         if (!ndev)
932                 goto out_err;
933
934         inline_page_count = num_pages(port->inline_data_size);
935         inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
936                                 cm_id->device->attrs.max_recv_sge) - 1;
937         if (inline_page_count > inline_sge_count) {
938                 pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
939                         port->inline_data_size, cm_id->device->name,
940                         inline_sge_count * PAGE_SIZE);
941                 port->inline_data_size = inline_sge_count * PAGE_SIZE;
942                 inline_page_count = inline_sge_count;
943         }
944         ndev->inline_data_size = port->inline_data_size;
945         ndev->inline_page_count = inline_page_count;
946         ndev->device = cm_id->device;
947         kref_init(&ndev->ref);
948
949         ndev->pd = ib_alloc_pd(ndev->device, 0);
950         if (IS_ERR(ndev->pd))
951                 goto out_free_dev;
952
953         if (nvmet_rdma_use_srq) {
954                 ret = nvmet_rdma_init_srq(ndev);
955                 if (ret)
956                         goto out_free_pd;
957         }
958
959         list_add(&ndev->entry, &device_list);
960 out_unlock:
961         mutex_unlock(&device_list_mutex);
962         pr_debug("added %s.\n", ndev->device->name);
963         return ndev;
964
965 out_free_pd:
966         ib_dealloc_pd(ndev->pd);
967 out_free_dev:
968         kfree(ndev);
969 out_err:
970         mutex_unlock(&device_list_mutex);
971         return NULL;
972 }
973
974 static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
975 {
976         struct ib_qp_init_attr qp_attr;
977         struct nvmet_rdma_device *ndev = queue->dev;
978         int comp_vector, nr_cqe, ret, i;
979
980         /*
981          * Spread the io queues across completion vectors,
982          * but still keep all admin queues on vector 0.
983          */
984         comp_vector = !queue->host_qid ? 0 :
985                 queue->idx % ndev->device->num_comp_vectors;
986
987         /*
988          * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
989          */
990         nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
991
992         queue->cq = ib_alloc_cq(ndev->device, queue,
993                         nr_cqe + 1, comp_vector,
994                         IB_POLL_WORKQUEUE);
995         if (IS_ERR(queue->cq)) {
996                 ret = PTR_ERR(queue->cq);
997                 pr_err("failed to create CQ cqe= %d ret= %d\n",
998                        nr_cqe + 1, ret);
999                 goto out;
1000         }
1001
1002         memset(&qp_attr, 0, sizeof(qp_attr));
1003         qp_attr.qp_context = queue;
1004         qp_attr.event_handler = nvmet_rdma_qp_event;
1005         qp_attr.send_cq = queue->cq;
1006         qp_attr.recv_cq = queue->cq;
1007         qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1008         qp_attr.qp_type = IB_QPT_RC;
1009         /* +1 for drain */
1010         qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
1011         qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
1012         qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
1013                                         ndev->device->attrs.max_send_sge);
1014
1015         if (ndev->srq) {
1016                 qp_attr.srq = ndev->srq;
1017         } else {
1018                 /* +1 for drain */
1019                 qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
1020                 qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
1021         }
1022
1023         ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
1024         if (ret) {
1025                 pr_err("failed to create_qp ret= %d\n", ret);
1026                 goto err_destroy_cq;
1027         }
1028
1029         atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
1030
1031         pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
1032                  __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
1033                  qp_attr.cap.max_send_wr, queue->cm_id);
1034
1035         if (!ndev->srq) {
1036                 for (i = 0; i < queue->recv_queue_size; i++) {
1037                         queue->cmds[i].queue = queue;
1038                         ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
1039                         if (ret)
1040                                 goto err_destroy_qp;
1041                 }
1042         }
1043
1044 out:
1045         return ret;
1046
1047 err_destroy_qp:
1048         rdma_destroy_qp(queue->cm_id);
1049 err_destroy_cq:
1050         ib_free_cq(queue->cq);
1051         goto out;
1052 }
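/*
 * Sizing recap for the code above: each command may need one RECV, one RDMA
 * READ or WRITE, and one SEND completion, hence nr_cqe = recv_queue_size +
 * 2 * send_queue_size, plus one extra CQE and the "+1 for drain" WRs used
 * when the QP is drained at teardown.
 */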
1053
1054 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
1055 {
1056         struct ib_qp *qp = queue->cm_id->qp;
1057
1058         ib_drain_qp(qp);
1059         rdma_destroy_id(queue->cm_id);
1060         ib_destroy_qp(qp);
1061         ib_free_cq(queue->cq);
1062 }
1063
1064 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
1065 {
1066         pr_debug("freeing queue %d\n", queue->idx);
1067
1068         nvmet_sq_destroy(&queue->nvme_sq);
1069
1070         nvmet_rdma_destroy_queue_ib(queue);
1071         if (!queue->dev->srq) {
1072                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1073                                 queue->recv_queue_size,
1074                                 !queue->host_qid);
1075         }
1076         nvmet_rdma_free_rsps(queue);
1077         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1078         kfree(queue);
1079 }
1080
1081 static void nvmet_rdma_release_queue_work(struct work_struct *w)
1082 {
1083         struct nvmet_rdma_queue *queue =
1084                 container_of(w, struct nvmet_rdma_queue, release_work);
1085         struct nvmet_rdma_device *dev = queue->dev;
1086
1087         nvmet_rdma_free_queue(queue);
1088
1089         kref_put(&dev->ref, nvmet_rdma_free_dev);
1090 }
1091
1092 static int
1093 nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
1094                                 struct nvmet_rdma_queue *queue)
1095 {
1096         struct nvme_rdma_cm_req *req;
1097
1098         req = (struct nvme_rdma_cm_req *)conn->private_data;
1099         if (!req || conn->private_data_len == 0)
1100                 return NVME_RDMA_CM_INVALID_LEN;
1101
1102         if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
1103                 return NVME_RDMA_CM_INVALID_RECFMT;
1104
1105         queue->host_qid = le16_to_cpu(req->qid);
1106
1107         /*
1108          * req->hsqsize is 0-based, so our recv queue size is hsqsize + 1;
1109          * req->hrqsize corresponds to our send queue size.
1110          */
1111         queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
1112         queue->send_queue_size = le16_to_cpu(req->hrqsize);
1113
1114         if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
1115                 return NVME_RDMA_CM_INVALID_HSQSIZE;
1116
1117         /* XXX: Should we enforce some kind of max for IO queues? */
1118
1119         return 0;
1120 }
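/*
 * Example, assuming the host uses the 0-based HSQSIZE encoding sent by the
 * Linux host driver: an I/O queue of depth 128 arrives as hsqsize = 127 and
 * becomes recv_queue_size = 128.  For the admin queue (host_qid == 0) a
 * resulting recv_queue_size above NVME_AQ_DEPTH is rejected with
 * NVME_RDMA_CM_INVALID_HSQSIZE.
 */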
1121
1122 static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
1123                                 enum nvme_rdma_cm_status status)
1124 {
1125         struct nvme_rdma_cm_rej rej;
1126
1127         pr_debug("rejecting connect request: status %d (%s)\n",
1128                  status, nvme_rdma_cm_msg(status));
1129
1130         rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1131         rej.sts = cpu_to_le16(status);
1132
1133         return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
1134 }
1135
1136 static struct nvmet_rdma_queue *
1137 nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
1138                 struct rdma_cm_id *cm_id,
1139                 struct rdma_cm_event *event)
1140 {
1141         struct nvmet_rdma_queue *queue;
1142         int ret;
1143
1144         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1145         if (!queue) {
1146                 ret = NVME_RDMA_CM_NO_RSC;
1147                 goto out_reject;
1148         }
1149
1150         ret = nvmet_sq_init(&queue->nvme_sq);
1151         if (ret) {
1152                 ret = NVME_RDMA_CM_NO_RSC;
1153                 goto out_free_queue;
1154         }
1155
1156         ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
1157         if (ret)
1158                 goto out_destroy_sq;
1159
1160         /*
1161          * Schedules the actual release because calling rdma_destroy_id from
1162          * inside a CM callback would trigger a deadlock. (great API design..)
1163          */
1164         INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
1165         queue->dev = ndev;
1166         queue->cm_id = cm_id;
1167
1168         spin_lock_init(&queue->state_lock);
1169         queue->state = NVMET_RDMA_Q_CONNECTING;
1170         INIT_LIST_HEAD(&queue->rsp_wait_list);
1171         INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
1172         spin_lock_init(&queue->rsp_wr_wait_lock);
1173         INIT_LIST_HEAD(&queue->free_rsps);
1174         spin_lock_init(&queue->rsps_lock);
1175         INIT_LIST_HEAD(&queue->queue_list);
1176
1177         queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
1178         if (queue->idx < 0) {
1179                 ret = NVME_RDMA_CM_NO_RSC;
1180                 goto out_destroy_sq;
1181         }
1182
1183         ret = nvmet_rdma_alloc_rsps(queue);
1184         if (ret) {
1185                 ret = NVME_RDMA_CM_NO_RSC;
1186                 goto out_ida_remove;
1187         }
1188
1189         if (!ndev->srq) {
1190                 queue->cmds = nvmet_rdma_alloc_cmds(ndev,
1191                                 queue->recv_queue_size,
1192                                 !queue->host_qid);
1193                 if (IS_ERR(queue->cmds)) {
1194                         ret = NVME_RDMA_CM_NO_RSC;
1195                         goto out_free_responses;
1196                 }
1197         }
1198
1199         ret = nvmet_rdma_create_queue_ib(queue);
1200         if (ret) {
1201                 pr_err("%s: creating RDMA queue failed (%d).\n",
1202                         __func__, ret);
1203                 ret = NVME_RDMA_CM_NO_RSC;
1204                 goto out_free_cmds;
1205         }
1206
1207         return queue;
1208
1209 out_free_cmds:
1210         if (!ndev->srq) {
1211                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1212                                 queue->recv_queue_size,
1213                                 !queue->host_qid);
1214         }
1215 out_free_responses:
1216         nvmet_rdma_free_rsps(queue);
1217 out_ida_remove:
1218         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1219 out_destroy_sq:
1220         nvmet_sq_destroy(&queue->nvme_sq);
1221 out_free_queue:
1222         kfree(queue);
1223 out_reject:
1224         nvmet_rdma_cm_reject(cm_id, ret);
1225         return NULL;
1226 }
1227
1228 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
1229 {
1230         struct nvmet_rdma_queue *queue = priv;
1231
1232         switch (event->event) {
1233         case IB_EVENT_COMM_EST:
1234                 rdma_notify(queue->cm_id, event->event);
1235                 break;
1236         default:
1237                 pr_err("received IB QP event: %s (%d)\n",
1238                        ib_event_msg(event->event), event->event);
1239                 break;
1240         }
1241 }
1242
1243 static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
1244                 struct nvmet_rdma_queue *queue,
1245                 struct rdma_conn_param *p)
1246 {
1247         struct rdma_conn_param  param = { };
1248         struct nvme_rdma_cm_rep priv = { };
1249         int ret = -ENOMEM;
1250
1251         param.rnr_retry_count = 7;
1252         param.flow_control = 1;
1253         param.initiator_depth = min_t(u8, p->initiator_depth,
1254                 queue->dev->device->attrs.max_qp_init_rd_atom);
1255         param.private_data = &priv;
1256         param.private_data_len = sizeof(priv);
1257         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1258         priv.crqsize = cpu_to_le16(queue->recv_queue_size);
1259
1260         ret = rdma_accept(cm_id, &param);
1261         if (ret)
1262                 pr_err("rdma_accept failed (error code = %d)\n", ret);
1263
1264         return ret;
1265 }
1266
1267 static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
1268                 struct rdma_cm_event *event)
1269 {
1270         struct nvmet_rdma_device *ndev;
1271         struct nvmet_rdma_queue *queue;
1272         int ret = -EINVAL;
1273
1274         ndev = nvmet_rdma_find_get_device(cm_id);
1275         if (!ndev) {
1276                 nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
1277                 return -ECONNREFUSED;
1278         }
1279
1280         queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
1281         if (!queue) {
1282                 ret = -ENOMEM;
1283                 goto put_device;
1284         }
1285         queue->port = cm_id->context;
1286
1287         if (queue->host_qid == 0) {
1288                 /* Let inflight controller teardown complete */
1289                 flush_scheduled_work();
1290         }
1291
1292         ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
1293         if (ret) {
1294                 schedule_work(&queue->release_work);
1295                 /* Destroying rdma_cm id is not needed here */
1296                 return 0;
1297         }
1298
1299         mutex_lock(&nvmet_rdma_queue_mutex);
1300         list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
1301         mutex_unlock(&nvmet_rdma_queue_mutex);
1302
1303         return 0;
1304
1305 put_device:
1306         kref_put(&ndev->ref, nvmet_rdma_free_dev);
1307
1308         return ret;
1309 }
1310
1311 static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
1312 {
1313         unsigned long flags;
1314
1315         spin_lock_irqsave(&queue->state_lock, flags);
1316         if (queue->state != NVMET_RDMA_Q_CONNECTING) {
1317                 pr_warn("trying to establish a connected queue\n");
1318                 goto out_unlock;
1319         }
1320         queue->state = NVMET_RDMA_Q_LIVE;
1321
1322         while (!list_empty(&queue->rsp_wait_list)) {
1323                 struct nvmet_rdma_rsp *cmd;
1324
1325                 cmd = list_first_entry(&queue->rsp_wait_list,
1326                                         struct nvmet_rdma_rsp, wait_list);
1327                 list_del(&cmd->wait_list);
1328
1329                 spin_unlock_irqrestore(&queue->state_lock, flags);
1330                 nvmet_rdma_handle_command(queue, cmd);
1331                 spin_lock_irqsave(&queue->state_lock, flags);
1332         }
1333
1334 out_unlock:
1335         spin_unlock_irqrestore(&queue->state_lock, flags);
1336 }
1337
1338 static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1339 {
1340         bool disconnect = false;
1341         unsigned long flags;
1342
1343         pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
1344
1345         spin_lock_irqsave(&queue->state_lock, flags);
1346         switch (queue->state) {
1347         case NVMET_RDMA_Q_CONNECTING:
1348         case NVMET_RDMA_Q_LIVE:
1349                 queue->state = NVMET_RDMA_Q_DISCONNECTING;
1350                 disconnect = true;
1351                 break;
1352         case NVMET_RDMA_Q_DISCONNECTING:
1353                 break;
1354         }
1355         spin_unlock_irqrestore(&queue->state_lock, flags);
1356
1357         if (disconnect) {
1358                 rdma_disconnect(queue->cm_id);
1359                 schedule_work(&queue->release_work);
1360         }
1361 }
1362
1363 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1364 {
1365         bool disconnect = false;
1366
1367         mutex_lock(&nvmet_rdma_queue_mutex);
1368         if (!list_empty(&queue->queue_list)) {
1369                 list_del_init(&queue->queue_list);
1370                 disconnect = true;
1371         }
1372         mutex_unlock(&nvmet_rdma_queue_mutex);
1373
1374         if (disconnect)
1375                 __nvmet_rdma_queue_disconnect(queue);
1376 }
1377
1378 static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
1379                 struct nvmet_rdma_queue *queue)
1380 {
1381         WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
1382
1383         mutex_lock(&nvmet_rdma_queue_mutex);
1384         if (!list_empty(&queue->queue_list))
1385                 list_del_init(&queue->queue_list);
1386         mutex_unlock(&nvmet_rdma_queue_mutex);
1387
1388         pr_err("failed to connect queue %d\n", queue->idx);
1389         schedule_work(&queue->release_work);
1390 }
1391
1392 /**
1393  * nvmet_rdma_device_removal() - Handle RDMA device removal
1394  * @cm_id:      rdma_cm id, used for nvmet port
1395  * @queue:      nvmet rdma queue (cm id qp_context)
1396  *
1397  * DEVICE_REMOVAL event notifies us that the RDMA device is about
1398  * to unplug. Note that this event can be generated on a normal
1399  * queue cm_id and/or a device bound listener cm_id (in which case
1400  * queue will be NULL).
1401  *
1402  * We registered an ib_client to handle device removal for queues,
1403  * so we only need to handle the listening port cm_ids. In that case
1404  * we nullify the priv to prevent double cm_id destruction and destroy
1405  * the cm_id implicitly by returning a non-zero rc to the callout.
1406  */
1407 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
1408                 struct nvmet_rdma_queue *queue)
1409 {
1410         struct nvmet_port *port;
1411
1412         if (queue) {
1413                 /*
1414                  * This is a queue cm_id. We have registered
1415                  * an ib_client to handle queue removal,
1416                  * so don't interfere and just return.
1417                  */
1418                 return 0;
1419         }
1420
1421         port = cm_id->context;
1422
1423         /*
1424          * This is a listener cm_id. Make sure that
1425          * future remove_port won't invoke a double
1426          * cm_id destroy. Use atomic xchg to make sure
1427          * we don't compete with remove_port.
1428          */
1429         if (xchg(&port->priv, NULL) != cm_id)
1430                 return 0;
1431
1432         /*
1433          * We need to return 1 so that the core will destroy
1434          * its own ID.  What a great API design..
1435          */
1436         return 1;
1437 }
1438
1439 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
1440                 struct rdma_cm_event *event)
1441 {
1442         struct nvmet_rdma_queue *queue = NULL;
1443         int ret = 0;
1444
1445         if (cm_id->qp)
1446                 queue = cm_id->qp->qp_context;
1447
1448         pr_debug("%s (%d): status %d id %p\n",
1449                 rdma_event_msg(event->event), event->event,
1450                 event->status, cm_id);
1451
1452         switch (event->event) {
1453         case RDMA_CM_EVENT_CONNECT_REQUEST:
1454                 ret = nvmet_rdma_queue_connect(cm_id, event);
1455                 break;
1456         case RDMA_CM_EVENT_ESTABLISHED:
1457                 nvmet_rdma_queue_established(queue);
1458                 break;
1459         case RDMA_CM_EVENT_ADDR_CHANGE:
1460         case RDMA_CM_EVENT_DISCONNECTED:
1461         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1462                 nvmet_rdma_queue_disconnect(queue);
1463                 break;
1464         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1465                 ret = nvmet_rdma_device_removal(cm_id, queue);
1466                 break;
1467         case RDMA_CM_EVENT_REJECTED:
1468                 pr_debug("Connection rejected: %s\n",
1469                          rdma_reject_msg(cm_id, event->status));
1470                 /* FALLTHROUGH */
1471         case RDMA_CM_EVENT_UNREACHABLE:
1472         case RDMA_CM_EVENT_CONNECT_ERROR:
1473                 nvmet_rdma_queue_connect_fail(cm_id, queue);
1474                 break;
1475         default:
1476                 pr_err("received unrecognized RDMA CM event %d\n",
1477                         event->event);
1478                 break;
1479         }
1480
1481         return ret;
1482 }
1483
1484 static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
1485 {
1486         struct nvmet_rdma_queue *queue;
1487
1488 restart:
1489         mutex_lock(&nvmet_rdma_queue_mutex);
1490         list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
1491                 if (queue->nvme_sq.ctrl == ctrl) {
1492                         list_del_init(&queue->queue_list);
1493                         mutex_unlock(&nvmet_rdma_queue_mutex);
1494
1495                         __nvmet_rdma_queue_disconnect(queue);
1496                         goto restart;
1497                 }
1498         }
1499         mutex_unlock(&nvmet_rdma_queue_mutex);
1500 }
1501
1502 static int nvmet_rdma_add_port(struct nvmet_port *port)
1503 {
1504         struct rdma_cm_id *cm_id;
1505         struct sockaddr_storage addr = { };
1506         __kernel_sa_family_t af;
1507         int ret;
1508
1509         switch (port->disc_addr.adrfam) {
1510         case NVMF_ADDR_FAMILY_IP4:
1511                 af = AF_INET;
1512                 break;
1513         case NVMF_ADDR_FAMILY_IP6:
1514                 af = AF_INET6;
1515                 break;
1516         default:
1517                 pr_err("address family %d not supported\n",
1518                                 port->disc_addr.adrfam);
1519                 return -EINVAL;
1520         }
1521
1522         if (port->inline_data_size < 0) {
1523                 port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
1524         } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
1525                 pr_warn("inline_data_size %u is too large, reducing to %u\n",
1526                         port->inline_data_size,
1527                         NVMET_RDMA_MAX_INLINE_DATA_SIZE);
1528                 port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
1529         }
1530
1531         ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
1532                         port->disc_addr.trsvcid, &addr);
1533         if (ret) {
1534                 pr_err("malformed ip/port passed: %s:%s\n",
1535                         port->disc_addr.traddr, port->disc_addr.trsvcid);
1536                 return ret;
1537         }
1538
1539         cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
1540                         RDMA_PS_TCP, IB_QPT_RC);
1541         if (IS_ERR(cm_id)) {
1542                 pr_err("CM ID creation failed\n");
1543                 return PTR_ERR(cm_id);
1544         }
1545
1546         /*
1547          * Allow both IPv4 and IPv6 sockets to bind a single port
1548          * at the same time.
1549          */
1550         ret = rdma_set_afonly(cm_id, 1);
1551         if (ret) {
1552                 pr_err("rdma_set_afonly failed (%d)\n", ret);
1553                 goto out_destroy_id;
1554         }
1555
1556         ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
1557         if (ret) {
1558                 pr_err("binding CM ID to %pISpcs failed (%d)\n",
1559                         (struct sockaddr *)&addr, ret);
1560                 goto out_destroy_id;
1561         }
1562
1563         ret = rdma_listen(cm_id, 128);
1564         if (ret) {
1565                 pr_err("listening to %pISpcs failed (%d)\n",
1566                         (struct sockaddr *)&addr, ret);
1567                 goto out_destroy_id;
1568         }
1569
1570         pr_info("enabling port %d (%pISpcs)\n",
1571                 le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
1572         port->priv = cm_id;
1573         return 0;
1574
1575 out_destroy_id:
1576         rdma_destroy_id(cm_id);
1577         return ret;
1578 }
1579
1580 static void nvmet_rdma_remove_port(struct nvmet_port *port)
1581 {
1582         struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
1583
1584         if (cm_id)
1585                 rdma_destroy_id(cm_id);
1586 }
1587
1588 static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
1589                 struct nvmet_port *port, char *traddr)
1590 {
1591         struct rdma_cm_id *cm_id = port->priv;
1592
1593         if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
1594                 struct nvmet_rdma_rsp *rsp =
1595                         container_of(req, struct nvmet_rdma_rsp, req);
1596                 struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
1597                 struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;
1598
1599                 sprintf(traddr, "%pISc", addr);
1600         } else {
1601                 memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
1602         }
1603 }
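/*
 * If the listening cm_id was bound to a wildcard address, the configured
 * traddr is not meaningful to report in the discovery log; the source
 * address of this connection's cm_id is formatted instead, so the host is
 * given an address it can actually connect to.
 */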
1604
1605 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
1606         .owner                  = THIS_MODULE,
1607         .type                   = NVMF_TRTYPE_RDMA,
1608         .msdbd                  = 1,
1609         .has_keyed_sgls         = 1,
1610         .add_port               = nvmet_rdma_add_port,
1611         .remove_port            = nvmet_rdma_remove_port,
1612         .queue_response         = nvmet_rdma_queue_response,
1613         .delete_ctrl            = nvmet_rdma_delete_ctrl,
1614         .disc_traddr            = nvmet_rdma_disc_port_addr,
1615 };
1616
1617 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
1618 {
1619         struct nvmet_rdma_queue *queue, *tmp;
1620         struct nvmet_rdma_device *ndev;
1621         bool found = false;
1622
1623         mutex_lock(&device_list_mutex);
1624         list_for_each_entry(ndev, &device_list, entry) {
1625                 if (ndev->device == ib_device) {
1626                         found = true;
1627                         break;
1628                 }
1629         }
1630         mutex_unlock(&device_list_mutex);
1631
1632         if (!found)
1633                 return;
1634
1635         /*
1636          * The IB device used by nvmet controllers is being removed;
1637          * delete all queues using this device.
1638          */
1639         mutex_lock(&nvmet_rdma_queue_mutex);
1640         list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
1641                                  queue_list) {
1642                 if (queue->dev->device != ib_device)
1643                         continue;
1644
1645                 pr_info("Removing queue %d\n", queue->idx);
1646                 list_del_init(&queue->queue_list);
1647                 __nvmet_rdma_queue_disconnect(queue);
1648         }
1649         mutex_unlock(&nvmet_rdma_queue_mutex);
1650
1651         flush_scheduled_work();
1652 }
1653
1654 static struct ib_client nvmet_rdma_ib_client = {
1655         .name   = "nvmet_rdma",
1656         .remove = nvmet_rdma_remove_one
1657 };
1658
1659 static int __init nvmet_rdma_init(void)
1660 {
1661         int ret;
1662
1663         ret = ib_register_client(&nvmet_rdma_ib_client);
1664         if (ret)
1665                 return ret;
1666
1667         ret = nvmet_register_transport(&nvmet_rdma_ops);
1668         if (ret)
1669                 goto err_ib_client;
1670
1671         return 0;
1672
1673 err_ib_client:
1674         ib_unregister_client(&nvmet_rdma_ib_client);
1675         return ret;
1676 }
1677
1678 static void __exit nvmet_rdma_exit(void)
1679 {
1680         nvmet_unregister_transport(&nvmet_rdma_ops);
1681         ib_unregister_client(&nvmet_rdma_ib_client);
1682         WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
1683         ida_destroy(&nvmet_rdma_queue_ida);
1684 }
1685
1686 module_init(nvmet_rdma_init);
1687 module_exit(nvmet_rdma_exit);
1688
1689 MODULE_LICENSE("GPL v2");
1690 MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */