Linux-libre 5.4.48-gnu
librecmc/linux-libre.git: drivers/nvme/target/rdma.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics RDMA target.
4  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/atomic.h>
8 #include <linux/ctype.h>
9 #include <linux/delay.h>
10 #include <linux/err.h>
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/nvme.h>
14 #include <linux/slab.h>
15 #include <linux/string.h>
16 #include <linux/wait.h>
17 #include <linux/inet.h>
18 #include <asm/unaligned.h>
19
20 #include <rdma/ib_verbs.h>
21 #include <rdma/rdma_cm.h>
22 #include <rdma/rw.h>
23
24 #include <linux/nvme-rdma.h>
25 #include "nvmet.h"
26
27 /*
28  * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
29  */
30 #define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE     PAGE_SIZE
31 #define NVMET_RDMA_MAX_INLINE_SGE               4
32 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE         max_t(int, SZ_16K, PAGE_SIZE)
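/*
 * Worked example of the sizing above: with 4KB pages the limit is
 * max_t(int, SZ_16K, PAGE_SIZE) == 16KB, which fits exactly in the four
 * inline SGEs (4 * PAGE_SIZE).  With 64KB pages the PAGE_SIZE term wins
 * and a single inline page already covers the maximum.
 */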
33
34 struct nvmet_rdma_cmd {
35         struct ib_sge           sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
36         struct ib_cqe           cqe;
37         struct ib_recv_wr       wr;
38         struct scatterlist      inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
39         struct nvme_command     *nvme_cmd;
40         struct nvmet_rdma_queue *queue;
41 };
42
43 enum {
44         NVMET_RDMA_REQ_INLINE_DATA      = (1 << 0),
45         NVMET_RDMA_REQ_INVALIDATE_RKEY  = (1 << 1),
46 };
47
48 struct nvmet_rdma_rsp {
49         struct ib_sge           send_sge;
50         struct ib_cqe           send_cqe;
51         struct ib_send_wr       send_wr;
52
53         struct nvmet_rdma_cmd   *cmd;
54         struct nvmet_rdma_queue *queue;
55
56         struct ib_cqe           read_cqe;
57         struct rdma_rw_ctx      rw;
58
59         struct nvmet_req        req;
60
61         bool                    allocated;
62         u8                      n_rdma;
63         u32                     flags;
64         u32                     invalidate_rkey;
65
66         struct list_head        wait_list;
67         struct list_head        free_list;
68 };
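/*
 * Responses are normally taken from the queue's pre-allocated free_rsps
 * pool (sized at 2 * recv_queue_size in nvmet_rdma_alloc_rsps()).  If the
 * pool is empty, nvmet_rdma_get_rsp() falls back to a dynamic allocation
 * and sets ->allocated so that nvmet_rdma_put_rsp() frees it instead of
 * returning it to the pool.
 */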
69
70 enum nvmet_rdma_queue_state {
71         NVMET_RDMA_Q_CONNECTING,
72         NVMET_RDMA_Q_LIVE,
73         NVMET_RDMA_Q_DISCONNECTING,
74 };
75
76 struct nvmet_rdma_queue {
77         struct rdma_cm_id       *cm_id;
78         struct nvmet_port       *port;
79         struct ib_cq            *cq;
80         atomic_t                sq_wr_avail;
81         struct nvmet_rdma_device *dev;
82         spinlock_t              state_lock;
83         enum nvmet_rdma_queue_state state;
84         struct nvmet_cq         nvme_cq;
85         struct nvmet_sq         nvme_sq;
86
87         struct nvmet_rdma_rsp   *rsps;
88         struct list_head        free_rsps;
89         spinlock_t              rsps_lock;
90         struct nvmet_rdma_cmd   *cmds;
91
92         struct work_struct      release_work;
93         struct list_head        rsp_wait_list;
94         struct list_head        rsp_wr_wait_list;
95         spinlock_t              rsp_wr_wait_lock;
96
97         int                     idx;
98         int                     host_qid;
99         int                     recv_queue_size;
100         int                     send_queue_size;
101
102         struct list_head        queue_list;
103 };
104
105 struct nvmet_rdma_device {
106         struct ib_device        *device;
107         struct ib_pd            *pd;
108         struct ib_srq           *srq;
109         struct nvmet_rdma_cmd   *srq_cmds;
110         size_t                  srq_size;
111         struct kref             ref;
112         struct list_head        entry;
113         int                     inline_data_size;
114         int                     inline_page_count;
115 };
116
117 static bool nvmet_rdma_use_srq;
118 module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
119 MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
120
121 static DEFINE_IDA(nvmet_rdma_queue_ida);
122 static LIST_HEAD(nvmet_rdma_queue_list);
123 static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
124
125 static LIST_HEAD(device_list);
126 static DEFINE_MUTEX(device_list_mutex);
127
128 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
129 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
130 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
131 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
132 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
133 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
134 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
135                                 struct nvmet_rdma_rsp *r);
136 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
137                                 struct nvmet_rdma_rsp *r);
138
139 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
140
141 static int num_pages(int len)
142 {
143         return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
144 }
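/*
 * num_pages() is DIV_ROUND_UP(len, PAGE_SIZE) for len >= 1: with 4KB pages,
 * num_pages(1) == 1, num_pages(4096) == 1 and num_pages(4097) == 2.
 */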
145
146 /* XXX: really should move to a generic header sooner or later.. */
147 static inline u32 get_unaligned_le24(const u8 *p)
148 {
149         return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
150 }
151
152 static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
153 {
154         return nvme_is_write(rsp->req.cmd) &&
155                 rsp->req.transfer_len &&
156                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
157 }
158
159 static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
160 {
161         return !nvme_is_write(rsp->req.cmd) &&
162                 rsp->req.transfer_len &&
163                 !rsp->req.cqe->status &&
164                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
165 }
166
167 static inline struct nvmet_rdma_rsp *
168 nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
169 {
170         struct nvmet_rdma_rsp *rsp;
171         unsigned long flags;
172
173         spin_lock_irqsave(&queue->rsps_lock, flags);
174         rsp = list_first_entry_or_null(&queue->free_rsps,
175                                 struct nvmet_rdma_rsp, free_list);
176         if (likely(rsp))
177                 list_del(&rsp->free_list);
178         spin_unlock_irqrestore(&queue->rsps_lock, flags);
179
180         if (unlikely(!rsp)) {
181                 int ret;
182
183                 rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
184                 if (unlikely(!rsp))
185                         return NULL;
186                 ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
187                 if (unlikely(ret)) {
188                         kfree(rsp);
189                         return NULL;
190                 }
191
192                 rsp->allocated = true;
193         }
194
195         return rsp;
196 }
197
198 static inline void
199 nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
200 {
201         unsigned long flags;
202
203         if (unlikely(rsp->allocated)) {
204                 nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
205                 kfree(rsp);
206                 return;
207         }
208
209         spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
210         list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
211         spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
212 }
213
214 static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
215                                 struct nvmet_rdma_cmd *c)
216 {
217         struct scatterlist *sg;
218         struct ib_sge *sge;
219         int i;
220
221         if (!ndev->inline_data_size)
222                 return;
223
224         sg = c->inline_sg;
225         sge = &c->sge[1];
226
227         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
228                 if (sge->length)
229                         ib_dma_unmap_page(ndev->device, sge->addr,
230                                         sge->length, DMA_FROM_DEVICE);
231                 if (sg_page(sg))
232                         __free_page(sg_page(sg));
233         }
234 }
235
236 static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
237                                 struct nvmet_rdma_cmd *c)
238 {
239         struct scatterlist *sg;
240         struct ib_sge *sge;
241         struct page *pg;
242         int len;
243         int i;
244
245         if (!ndev->inline_data_size)
246                 return 0;
247
248         sg = c->inline_sg;
249         sg_init_table(sg, ndev->inline_page_count);
250         sge = &c->sge[1];
251         len = ndev->inline_data_size;
252
253         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
254                 pg = alloc_page(GFP_KERNEL);
255                 if (!pg)
256                         goto out_err;
257                 sg_assign_page(sg, pg);
258                 sge->addr = ib_dma_map_page(ndev->device,
259                         pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
260                 if (ib_dma_mapping_error(ndev->device, sge->addr))
261                         goto out_err;
262                 sge->length = min_t(int, len, PAGE_SIZE);
263                 sge->lkey = ndev->pd->local_dma_lkey;
264                 len -= sge->length;
265         }
266
267         return 0;
268 out_err:
269         for (; i >= 0; i--, sg--, sge--) {
270                 if (sge->length)
271                         ib_dma_unmap_page(ndev->device, sge->addr,
272                                         sge->length, DMA_FROM_DEVICE);
273                 if (sg_page(sg))
274                         __free_page(sg_page(sg));
275         }
276         return -ENOMEM;
277 }
278
279 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
280                         struct nvmet_rdma_cmd *c, bool admin)
281 {
282         /* NVMe command / RDMA RECV */
283         c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
284         if (!c->nvme_cmd)
285                 goto out;
286
287         c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
288                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
289         if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
290                 goto out_free_cmd;
291
292         c->sge[0].length = sizeof(*c->nvme_cmd);
293         c->sge[0].lkey = ndev->pd->local_dma_lkey;
294
295         if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
296                 goto out_unmap_cmd;
297
298         c->cqe.done = nvmet_rdma_recv_done;
299
300         c->wr.wr_cqe = &c->cqe;
301         c->wr.sg_list = c->sge;
302         c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
303
304         return 0;
305
306 out_unmap_cmd:
307         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
308                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
309 out_free_cmd:
310         kfree(c->nvme_cmd);
311
312 out:
313         return -ENOMEM;
314 }
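/*
 * Receive buffer layout set up above: sge[0] always maps the 64-byte NVMe
 * command capsule and sge[1..inline_page_count] map the per-command inline
 * data pages.  Admin queues carry no inline data, so they post only the
 * command SGE (wr.num_sge == 1).
 */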
315
316 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
317                 struct nvmet_rdma_cmd *c, bool admin)
318 {
319         if (!admin)
320                 nvmet_rdma_free_inline_pages(ndev, c);
321         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
322                                 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
323         kfree(c->nvme_cmd);
324 }
325
326 static struct nvmet_rdma_cmd *
327 nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
328                 int nr_cmds, bool admin)
329 {
330         struct nvmet_rdma_cmd *cmds;
331         int ret = -EINVAL, i;
332
333         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
334         if (!cmds)
335                 goto out;
336
337         for (i = 0; i < nr_cmds; i++) {
338                 ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
339                 if (ret)
340                         goto out_free;
341         }
342
343         return cmds;
344
345 out_free:
346         while (--i >= 0)
347                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
348         kfree(cmds);
349 out:
350         return ERR_PTR(ret);
351 }
352
353 static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
354                 struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
355 {
356         int i;
357
358         for (i = 0; i < nr_cmds; i++)
359                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
360         kfree(cmds);
361 }
362
363 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
364                 struct nvmet_rdma_rsp *r)
365 {
366         /* NVMe CQE / RDMA SEND */
367         r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
368         if (!r->req.cqe)
369                 goto out;
370
371         r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
372                         sizeof(*r->req.cqe), DMA_TO_DEVICE);
373         if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
374                 goto out_free_rsp;
375
376         r->req.p2p_client = &ndev->device->dev;
377         r->send_sge.length = sizeof(*r->req.cqe);
378         r->send_sge.lkey = ndev->pd->local_dma_lkey;
379
380         r->send_cqe.done = nvmet_rdma_send_done;
381
382         r->send_wr.wr_cqe = &r->send_cqe;
383         r->send_wr.sg_list = &r->send_sge;
384         r->send_wr.num_sge = 1;
385         r->send_wr.send_flags = IB_SEND_SIGNALED;
386
387         /* Data In / RDMA READ */
388         r->read_cqe.done = nvmet_rdma_read_data_done;
389         return 0;
390
391 out_free_rsp:
392         kfree(r->req.cqe);
393 out:
394         return -ENOMEM;
395 }
396
397 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
398                 struct nvmet_rdma_rsp *r)
399 {
400         ib_dma_unmap_single(ndev->device, r->send_sge.addr,
401                                 sizeof(*r->req.cqe), DMA_TO_DEVICE);
402         kfree(r->req.cqe);
403 }
404
405 static int
406 nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
407 {
408         struct nvmet_rdma_device *ndev = queue->dev;
409         int nr_rsps = queue->recv_queue_size * 2;
410         int ret = -EINVAL, i;
411
412         queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
413                         GFP_KERNEL);
414         if (!queue->rsps)
415                 goto out;
416
417         for (i = 0; i < nr_rsps; i++) {
418                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
419
420                 ret = nvmet_rdma_alloc_rsp(ndev, rsp);
421                 if (ret)
422                         goto out_free;
423
424                 list_add_tail(&rsp->free_list, &queue->free_rsps);
425         }
426
427         return 0;
428
429 out_free:
430         while (--i >= 0) {
431                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
432
433                 list_del(&rsp->free_list);
434                 nvmet_rdma_free_rsp(ndev, rsp);
435         }
436         kfree(queue->rsps);
437 out:
438         return ret;
439 }
440
441 static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
442 {
443         struct nvmet_rdma_device *ndev = queue->dev;
444         int i, nr_rsps = queue->recv_queue_size * 2;
445
446         for (i = 0; i < nr_rsps; i++) {
447                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
448
449                 list_del(&rsp->free_list);
450                 nvmet_rdma_free_rsp(ndev, rsp);
451         }
452         kfree(queue->rsps);
453 }
454
455 static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
456                 struct nvmet_rdma_cmd *cmd)
457 {
458         int ret;
459
460         ib_dma_sync_single_for_device(ndev->device,
461                 cmd->sge[0].addr, cmd->sge[0].length,
462                 DMA_FROM_DEVICE);
463
464         if (ndev->srq)
465                 ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
466         else
467                 ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
468
469         if (unlikely(ret))
470                 pr_err("post_recv cmd failed\n");
471
472         return ret;
473 }
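/*
 * When a shared receive queue is in use (see the use_srq module parameter),
 * receives are replenished on the device-wide SRQ; otherwise they go to the
 * per-queue QP.  In both cases the command buffer is synced for the device
 * before being re-posted.
 */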
474
475 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
476 {
477         spin_lock(&queue->rsp_wr_wait_lock);
478         while (!list_empty(&queue->rsp_wr_wait_list)) {
479                 struct nvmet_rdma_rsp *rsp;
480                 bool ret;
481
482                 rsp = list_entry(queue->rsp_wr_wait_list.next,
483                                 struct nvmet_rdma_rsp, wait_list);
484                 list_del(&rsp->wait_list);
485
486                 spin_unlock(&queue->rsp_wr_wait_lock);
487                 ret = nvmet_rdma_execute_command(rsp);
488                 spin_lock(&queue->rsp_wr_wait_lock);
489
490                 if (!ret) {
491                         list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
492                         break;
493                 }
494         }
495         spin_unlock(&queue->rsp_wr_wait_lock);
496 }
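/*
 * Send-queue credit handling: nvmet_rdma_execute_command() fails when
 * sq_wr_avail would go negative, and nvmet_rdma_handle_command() then parks
 * the command on rsp_wr_wait_list.  Once a completed response returns its
 * credits in nvmet_rdma_release_rsp(), the list is drained via
 * nvmet_rdma_process_wr_wait_list() above and the parked commands are
 * retried in order.
 */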
497
498
499 static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
500 {
501         struct nvmet_rdma_queue *queue = rsp->queue;
502
503         atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
504
505         if (rsp->n_rdma) {
506                 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
507                                 queue->cm_id->port_num, rsp->req.sg,
508                                 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
509         }
510
511         if (rsp->req.sg != rsp->cmd->inline_sg)
512                 nvmet_req_free_sgl(&rsp->req);
513
514         if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
515                 nvmet_rdma_process_wr_wait_list(queue);
516
517         nvmet_rdma_put_rsp(rsp);
518 }
519
520 static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
521 {
522         if (queue->nvme_sq.ctrl) {
523                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
524         } else {
525                 /*
526                  * The controller has not been set up yet (e.g. the
527                  * admin connect failed), so just disconnect and
528                  * clean up the queue.
529                  */
530                 nvmet_rdma_queue_disconnect(queue);
531         }
532 }
533
534 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
535 {
536         struct nvmet_rdma_rsp *rsp =
537                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
538         struct nvmet_rdma_queue *queue = cq->cq_context;
539
540         nvmet_rdma_release_rsp(rsp);
541
542         if (unlikely(wc->status != IB_WC_SUCCESS &&
543                      wc->status != IB_WC_WR_FLUSH_ERR)) {
544                 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
545                         wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
546                 nvmet_rdma_error_comp(queue);
547         }
548 }
549
550 static void nvmet_rdma_queue_response(struct nvmet_req *req)
551 {
552         struct nvmet_rdma_rsp *rsp =
553                 container_of(req, struct nvmet_rdma_rsp, req);
554         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
555         struct ib_send_wr *first_wr;
556
557         if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
558                 rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
559                 rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
560         } else {
561                 rsp->send_wr.opcode = IB_WR_SEND;
562         }
563
564         if (nvmet_rdma_need_data_out(rsp))
565                 first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
566                                 cm_id->port_num, NULL, &rsp->send_wr);
567         else
568                 first_wr = &rsp->send_wr;
569
570         nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
571
572         ib_dma_sync_single_for_device(rsp->queue->dev->device,
573                 rsp->send_sge.addr, rsp->send_sge.length,
574                 DMA_TO_DEVICE);
575
576         if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
577                 pr_err("sending cmd response failed\n");
578                 nvmet_rdma_release_rsp(rsp);
579         }
580 }
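/*
 * Response path above: a fresh RECV is posted to replace the one consumed by
 * this command before the response is sent, and for commands returning data
 * the RDMA WRITE WRs are chained in front of the SEND via rdma_rw_ctx_wrs(),
 * so a single ib_post_send() issues the whole chain.
 */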
581
582 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
583 {
584         struct nvmet_rdma_rsp *rsp =
585                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
586         struct nvmet_rdma_queue *queue = cq->cq_context;
587
588         WARN_ON(rsp->n_rdma <= 0);
589         atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
590         rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
591                         queue->cm_id->port_num, rsp->req.sg,
592                         rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
593         rsp->n_rdma = 0;
594
595         if (unlikely(wc->status != IB_WC_SUCCESS)) {
596                 nvmet_req_uninit(&rsp->req);
597                 nvmet_rdma_release_rsp(rsp);
598                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
599                         pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
600                                 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
601                         nvmet_rdma_error_comp(queue);
602                 }
603                 return;
604         }
605
606         nvmet_req_execute(&rsp->req);
607 }
608
609 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
610                 u64 off)
611 {
612         int sg_count = num_pages(len);
613         struct scatterlist *sg;
614         int i;
615
616         sg = rsp->cmd->inline_sg;
617         for (i = 0; i < sg_count; i++, sg++) {
618                 if (i < sg_count - 1)
619                         sg_unmark_end(sg);
620                 else
621                         sg_mark_end(sg);
622                 sg->offset = off;
623                 sg->length = min_t(int, len, PAGE_SIZE - off);
624                 len -= sg->length;
625                 if (!i)
626                         off = 0;
627         }
628
629         rsp->req.sg = rsp->cmd->inline_sg;
630         rsp->req.sg_cnt = sg_count;
631 }
632
633 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
634 {
635         struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
636         u64 off = le64_to_cpu(sgl->addr);
637         u32 len = le32_to_cpu(sgl->length);
638
639         if (!nvme_is_write(rsp->req.cmd)) {
640                 rsp->req.error_loc =
641                         offsetof(struct nvme_common_command, opcode);
642                 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
643         }
644
645         if (off + len > rsp->queue->dev->inline_data_size) {
646                 pr_err("invalid inline data offset!\n");
647                 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
648         }
649
650         /* no data command? */
651         if (!len)
652                 return 0;
653
654         nvmet_rdma_use_inline_sg(rsp, len, off);
655         rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
656         rsp->req.transfer_len += len;
657         return 0;
658 }
659
660 static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
661                 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
662 {
663         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
664         u64 addr = le64_to_cpu(sgl->addr);
665         u32 key = get_unaligned_le32(sgl->key);
666         int ret;
667
668         rsp->req.transfer_len = get_unaligned_le24(sgl->length);
669
670         /* no data command? */
671         if (!rsp->req.transfer_len)
672                 return 0;
673
674         ret = nvmet_req_alloc_sgl(&rsp->req);
675         if (ret < 0)
676                 goto error_out;
677
678         ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
679                         rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
680                         nvmet_data_dir(&rsp->req));
681         if (ret < 0)
682                 goto error_out;
683         rsp->n_rdma += ret;
684
685         if (invalidate) {
686                 rsp->invalidate_rkey = key;
687                 rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
688         }
689
690         return 0;
691
692 error_out:
693         rsp->req.transfer_len = 0;
694         return NVME_SC_INTERNAL;
695 }
696
697 static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
698 {
699         struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
700
701         switch (sgl->type >> 4) {
702         case NVME_SGL_FMT_DATA_DESC:
703                 switch (sgl->type & 0xf) {
704                 case NVME_SGL_FMT_OFFSET:
705                         return nvmet_rdma_map_sgl_inline(rsp);
706                 default:
707                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
708                         rsp->req.error_loc =
709                                 offsetof(struct nvme_common_command, dptr);
710                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
711                 }
712         case NVME_KEY_SGL_FMT_DATA_DESC:
713                 switch (sgl->type & 0xf) {
714                 case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
715                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
716                 case NVME_SGL_FMT_ADDRESS:
717                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
718                 default:
719                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
720                         rsp->req.error_loc =
721                                 offsetof(struct nvme_common_command, dptr);
722                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
723                 }
724         default:
725                 pr_err("invalid SGL type: %#x\n", sgl->type);
726                 rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
727                 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
728         }
729 }
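/*
 * SGL dispatch summary: inline (offset) data descriptors reuse the inline
 * pages that arrived with the command, while keyed SGLs set up an rdma_rw
 * context that will RDMA READ from or WRITE to the host memory described by
 * addr/key; when the host asked for it, the rkey is remembered so the
 * response can be sent with SEND_WITH_INV.
 */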
730
731 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
732 {
733         struct nvmet_rdma_queue *queue = rsp->queue;
734
735         if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
736                         &queue->sq_wr_avail) < 0)) {
737                 pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
738                                 1 + rsp->n_rdma, queue->idx,
739                                 queue->nvme_sq.ctrl->cntlid);
740                 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
741                 return false;
742         }
743
744         if (nvmet_rdma_need_data_in(rsp)) {
745                 if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
746                                 queue->cm_id->port_num, &rsp->read_cqe, NULL))
747                         nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
748         } else {
749                 nvmet_req_execute(&rsp->req);
750         }
751
752         return true;
753 }
754
755 static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
756                 struct nvmet_rdma_rsp *cmd)
757 {
758         u16 status;
759
760         ib_dma_sync_single_for_cpu(queue->dev->device,
761                 cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
762                 DMA_FROM_DEVICE);
763         ib_dma_sync_single_for_cpu(queue->dev->device,
764                 cmd->send_sge.addr, cmd->send_sge.length,
765                 DMA_TO_DEVICE);
766
767         if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
768                         &queue->nvme_sq, &nvmet_rdma_ops))
769                 return;
770
771         status = nvmet_rdma_map_sgl(cmd);
772         if (status)
773                 goto out_err;
774
775         if (unlikely(!nvmet_rdma_execute_command(cmd))) {
776                 spin_lock(&queue->rsp_wr_wait_lock);
777                 list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
778                 spin_unlock(&queue->rsp_wr_wait_lock);
779         }
780
781         return;
782
783 out_err:
784         nvmet_req_complete(&cmd->req, status);
785 }
786
787 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
788 {
789         struct nvmet_rdma_cmd *cmd =
790                 container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
791         struct nvmet_rdma_queue *queue = cq->cq_context;
792         struct nvmet_rdma_rsp *rsp;
793
794         if (unlikely(wc->status != IB_WC_SUCCESS)) {
795                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
796                         pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
797                                 wc->wr_cqe, ib_wc_status_msg(wc->status),
798                                 wc->status);
799                         nvmet_rdma_error_comp(queue);
800                 }
801                 return;
802         }
803
804         if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
805                 pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
806                 nvmet_rdma_error_comp(queue);
807                 return;
808         }
809
810         cmd->queue = queue;
811         rsp = nvmet_rdma_get_rsp(queue);
812         if (unlikely(!rsp)) {
813                 /*
814                  * We get here only under memory pressure;
815                  * silently drop and have the host retry,
816                  * as we can't even fail it.
817                  */
818                 nvmet_rdma_post_recv(queue->dev, cmd);
819                 return;
820         }
821         rsp->queue = queue;
822         rsp->cmd = cmd;
823         rsp->flags = 0;
824         rsp->req.cmd = cmd->nvme_cmd;
825         rsp->req.port = queue->port;
826         rsp->n_rdma = 0;
827
828         if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
829                 unsigned long flags;
830
831                 spin_lock_irqsave(&queue->state_lock, flags);
832                 if (queue->state == NVMET_RDMA_Q_CONNECTING)
833                         list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
834                 else
835                         nvmet_rdma_put_rsp(rsp);
836                 spin_unlock_irqrestore(&queue->state_lock, flags);
837                 return;
838         }
839
840         nvmet_rdma_handle_command(queue, rsp);
841 }
842
843 static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
844 {
845         if (!ndev->srq)
846                 return;
847
848         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
849         ib_destroy_srq(ndev->srq);
850 }
851
852 static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
853 {
854         struct ib_srq_init_attr srq_attr = { NULL, };
855         struct ib_srq *srq;
856         size_t srq_size;
857         int ret, i;
858
859         srq_size = 4095;        /* XXX: tune */
860
861         srq_attr.attr.max_wr = srq_size;
862         srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
863         srq_attr.attr.srq_limit = 0;
864         srq_attr.srq_type = IB_SRQT_BASIC;
865         srq = ib_create_srq(ndev->pd, &srq_attr);
866         if (IS_ERR(srq)) {
867                 /*
868                  * If SRQs aren't supported we just go ahead and use normal
869                  * non-shared receive queues.
870                  */
871                 pr_info("SRQ requested but not supported.\n");
872                 return 0;
873         }
874
875         ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
876         if (IS_ERR(ndev->srq_cmds)) {
877                 ret = PTR_ERR(ndev->srq_cmds);
878                 goto out_destroy_srq;
879         }
880
881         ndev->srq = srq;
882         ndev->srq_size = srq_size;
883
884         for (i = 0; i < srq_size; i++) {
885                 ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
886                 if (ret)
887                         goto out_free_cmds;
888         }
889
890         return 0;
891
892 out_free_cmds:
893         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
894 out_destroy_srq:
895         ib_destroy_srq(srq);
896         return ret;
897 }
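/*
 * Note that failing to create the SRQ is not fatal: the function returns 0
 * with ndev->srq left NULL, so queues fall back to ordinary per-queue
 * receive queues.  The SRQ depth is currently a fixed 4095 entries, as
 * flagged by the XXX above.
 */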
898
899 static void nvmet_rdma_free_dev(struct kref *ref)
900 {
901         struct nvmet_rdma_device *ndev =
902                 container_of(ref, struct nvmet_rdma_device, ref);
903
904         mutex_lock(&device_list_mutex);
905         list_del(&ndev->entry);
906         mutex_unlock(&device_list_mutex);
907
908         nvmet_rdma_destroy_srq(ndev);
909         ib_dealloc_pd(ndev->pd);
910
911         kfree(ndev);
912 }
913
914 static struct nvmet_rdma_device *
915 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
916 {
917         struct nvmet_port *port = cm_id->context;
918         struct nvmet_rdma_device *ndev;
919         int inline_page_count;
920         int inline_sge_count;
921         int ret;
922
923         mutex_lock(&device_list_mutex);
924         list_for_each_entry(ndev, &device_list, entry) {
925                 if (ndev->device->node_guid == cm_id->device->node_guid &&
926                     kref_get_unless_zero(&ndev->ref))
927                         goto out_unlock;
928         }
929
930         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
931         if (!ndev)
932                 goto out_err;
933
934         inline_page_count = num_pages(port->inline_data_size);
935         inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
936                                 cm_id->device->attrs.max_recv_sge) - 1;
937         if (inline_page_count > inline_sge_count) {
938                 pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
939                         port->inline_data_size, cm_id->device->name,
940                         inline_sge_count * PAGE_SIZE);
941                 port->inline_data_size = inline_sge_count * PAGE_SIZE;
942                 inline_page_count = inline_sge_count;
943         }
944         ndev->inline_data_size = port->inline_data_size;
945         ndev->inline_page_count = inline_page_count;
946         ndev->device = cm_id->device;
947         kref_init(&ndev->ref);
948
949         ndev->pd = ib_alloc_pd(ndev->device, 0);
950         if (IS_ERR(ndev->pd))
951                 goto out_free_dev;
952
953         if (nvmet_rdma_use_srq) {
954                 ret = nvmet_rdma_init_srq(ndev);
955                 if (ret)
956                         goto out_free_pd;
957         }
958
959         list_add(&ndev->entry, &device_list);
960 out_unlock:
961         mutex_unlock(&device_list_mutex);
962         pr_debug("added %s.\n", ndev->device->name);
963         return ndev;
964
965 out_free_pd:
966         ib_dealloc_pd(ndev->pd);
967 out_free_dev:
968         kfree(ndev);
969 out_err:
970         mutex_unlock(&device_list_mutex);
971         return NULL;
972 }
973
974 static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
975 {
976         struct ib_qp_init_attr qp_attr;
977         struct nvmet_rdma_device *ndev = queue->dev;
978         int comp_vector, nr_cqe, ret, i;
979
980         /*
981          * Spread the io queues across completion vectors,
982          * but still keep all admin queues on vector 0.
983          */
984         comp_vector = !queue->host_qid ? 0 :
985                 queue->idx % ndev->device->num_comp_vectors;
986
987         /*
988          * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
989          */
990         nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
991
992         queue->cq = ib_alloc_cq(ndev->device, queue,
993                         nr_cqe + 1, comp_vector,
994                         IB_POLL_WORKQUEUE);
995         if (IS_ERR(queue->cq)) {
996                 ret = PTR_ERR(queue->cq);
997                 pr_err("failed to create CQ cqe= %d ret= %d\n",
998                        nr_cqe + 1, ret);
999                 goto out;
1000         }
1001
1002         memset(&qp_attr, 0, sizeof(qp_attr));
1003         qp_attr.qp_context = queue;
1004         qp_attr.event_handler = nvmet_rdma_qp_event;
1005         qp_attr.send_cq = queue->cq;
1006         qp_attr.recv_cq = queue->cq;
1007         qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1008         qp_attr.qp_type = IB_QPT_RC;
1009         /* +1 for drain */
1010         qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
1011         qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
1012         qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
1013                                         ndev->device->attrs.max_send_sge);
1014
1015         if (ndev->srq) {
1016                 qp_attr.srq = ndev->srq;
1017         } else {
1018                 /* +1 for drain */
1019                 qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
1020                 qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
1021         }
1022
1023         ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
1024         if (ret) {
1025                 pr_err("failed to create_qp ret= %d\n", ret);
1026                 goto err_destroy_cq;
1027         }
1028
1029         atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
1030
1031         pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
1032                  __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
1033                  qp_attr.cap.max_send_wr, queue->cm_id);
1034
1035         if (!ndev->srq) {
1036                 for (i = 0; i < queue->recv_queue_size; i++) {
1037                         queue->cmds[i].queue = queue;
1038                         ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
1039                         if (ret)
1040                                 goto err_destroy_qp;
1041                 }
1042         }
1043
1044 out:
1045         return ret;
1046
1047 err_destroy_qp:
1048         rdma_destroy_qp(queue->cm_id);
1049 err_destroy_cq:
1050         ib_free_cq(queue->cq);
1051         goto out;
1052 }
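/*
 * Sizing recap for the code above: each command may need one RECV, one RDMA
 * READ or WRITE, and one SEND completion, hence nr_cqe = recv_queue_size +
 * 2 * send_queue_size, plus one extra CQE and the "+1 for drain" WRs used
 * when the QP is drained at teardown.
 */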
1053
1054 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
1055 {
1056         struct ib_qp *qp = queue->cm_id->qp;
1057
1058         ib_drain_qp(qp);
1059         rdma_destroy_id(queue->cm_id);
1060         ib_destroy_qp(qp);
1061         ib_free_cq(queue->cq);
1062 }
1063
1064 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
1065 {
1066         pr_debug("freeing queue %d\n", queue->idx);
1067
1068         nvmet_sq_destroy(&queue->nvme_sq);
1069
1070         nvmet_rdma_destroy_queue_ib(queue);
1071         if (!queue->dev->srq) {
1072                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1073                                 queue->recv_queue_size,
1074                                 !queue->host_qid);
1075         }
1076         nvmet_rdma_free_rsps(queue);
1077         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1078         kfree(queue);
1079 }
1080
1081 static void nvmet_rdma_release_queue_work(struct work_struct *w)
1082 {
1083         struct nvmet_rdma_queue *queue =
1084                 container_of(w, struct nvmet_rdma_queue, release_work);
1085         struct nvmet_rdma_device *dev = queue->dev;
1086
1087         nvmet_rdma_free_queue(queue);
1088
1089         kref_put(&dev->ref, nvmet_rdma_free_dev);
1090 }
1091
1092 static int
1093 nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
1094                                 struct nvmet_rdma_queue *queue)
1095 {
1096         struct nvme_rdma_cm_req *req;
1097
1098         req = (struct nvme_rdma_cm_req *)conn->private_data;
1099         if (!req || conn->private_data_len == 0)
1100                 return NVME_RDMA_CM_INVALID_LEN;
1101
1102         if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
1103                 return NVME_RDMA_CM_INVALID_RECFMT;
1104
1105         queue->host_qid = le16_to_cpu(req->qid);
1106
1107         /*
1108          * req->hsqsize is 0-based, so our recv queue size is hsqsize + 1;
1109          * req->hrqsize corresponds to our send queue size.
1110          */
1111         queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
1112         queue->send_queue_size = le16_to_cpu(req->hrqsize);
1113
1114         if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
1115                 return NVME_RDMA_CM_INVALID_HSQSIZE;
1116
1117         /* XXX: Should we enforce some kind of max for IO queues? */
1118
1119         return 0;
1120 }
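/*
 * Example, assuming the host uses the 0-based HSQSIZE encoding sent by the
 * Linux host driver: an I/O queue of depth 128 arrives as hsqsize = 127 and
 * becomes recv_queue_size = 128.  For the admin queue (host_qid == 0) a
 * resulting recv_queue_size above NVME_AQ_DEPTH is rejected with
 * NVME_RDMA_CM_INVALID_HSQSIZE.
 */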
1121
1122 static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
1123                                 enum nvme_rdma_cm_status status)
1124 {
1125         struct nvme_rdma_cm_rej rej;
1126
1127         pr_debug("rejecting connect request: status %d (%s)\n",
1128                  status, nvme_rdma_cm_msg(status));
1129
1130         rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1131         rej.sts = cpu_to_le16(status);
1132
1133         return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
1134 }
1135
1136 static struct nvmet_rdma_queue *
1137 nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
1138                 struct rdma_cm_id *cm_id,
1139                 struct rdma_cm_event *event)
1140 {
1141         struct nvmet_rdma_queue *queue;
1142         int ret;
1143
1144         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1145         if (!queue) {
1146                 ret = NVME_RDMA_CM_NO_RSC;
1147                 goto out_reject;
1148         }
1149
1150         ret = nvmet_sq_init(&queue->nvme_sq);
1151         if (ret) {
1152                 ret = NVME_RDMA_CM_NO_RSC;
1153                 goto out_free_queue;
1154         }
1155
1156         ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
1157         if (ret)
1158                 goto out_destroy_sq;
1159
1160         /*
1161          * Schedules the actual release because calling rdma_destroy_id from
1162          * inside a CM callback would trigger a deadlock. (great API design..)
1163          */
1164         INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
1165         queue->dev = ndev;
1166         queue->cm_id = cm_id;
1167
1168         spin_lock_init(&queue->state_lock);
1169         queue->state = NVMET_RDMA_Q_CONNECTING;
1170         INIT_LIST_HEAD(&queue->rsp_wait_list);
1171         INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
1172         spin_lock_init(&queue->rsp_wr_wait_lock);
1173         INIT_LIST_HEAD(&queue->free_rsps);
1174         spin_lock_init(&queue->rsps_lock);
1175         INIT_LIST_HEAD(&queue->queue_list);
1176
1177         queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
1178         if (queue->idx < 0) {
1179                 ret = NVME_RDMA_CM_NO_RSC;
1180                 goto out_destroy_sq;
1181         }
1182
1183         ret = nvmet_rdma_alloc_rsps(queue);
1184         if (ret) {
1185                 ret = NVME_RDMA_CM_NO_RSC;
1186                 goto out_ida_remove;
1187         }
1188
1189         if (!ndev->srq) {
1190                 queue->cmds = nvmet_rdma_alloc_cmds(ndev,
1191                                 queue->recv_queue_size,
1192                                 !queue->host_qid);
1193                 if (IS_ERR(queue->cmds)) {
1194                         ret = NVME_RDMA_CM_NO_RSC;
1195                         goto out_free_responses;
1196                 }
1197         }
1198
1199         ret = nvmet_rdma_create_queue_ib(queue);
1200         if (ret) {
1201                 pr_err("%s: creating RDMA queue failed (%d).\n",
1202                         __func__, ret);
1203                 ret = NVME_RDMA_CM_NO_RSC;
1204                 goto out_free_cmds;
1205         }
1206
1207         return queue;
1208
1209 out_free_cmds:
1210         if (!ndev->srq) {
1211                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1212                                 queue->recv_queue_size,
1213                                 !queue->host_qid);
1214         }
1215 out_free_responses:
1216         nvmet_rdma_free_rsps(queue);
1217 out_ida_remove:
1218         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1219 out_destroy_sq:
1220         nvmet_sq_destroy(&queue->nvme_sq);
1221 out_free_queue:
1222         kfree(queue);
1223 out_reject:
1224         nvmet_rdma_cm_reject(cm_id, ret);
1225         return NULL;
1226 }
1227
1228 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
1229 {
1230         struct nvmet_rdma_queue *queue = priv;
1231
1232         switch (event->event) {
1233         case IB_EVENT_COMM_EST:
1234                 rdma_notify(queue->cm_id, event->event);
1235                 break;
1236         default:
1237                 pr_err("received IB QP event: %s (%d)\n",
1238                        ib_event_msg(event->event), event->event);
1239                 break;
1240         }
1241 }
1242
1243 static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
1244                 struct nvmet_rdma_queue *queue,
1245                 struct rdma_conn_param *p)
1246 {
1247         struct rdma_conn_param  param = { };
1248         struct nvme_rdma_cm_rep priv = { };
1249         int ret = -ENOMEM;
1250
1251         param.rnr_retry_count = 7;
1252         param.flow_control = 1;
1253         param.initiator_depth = min_t(u8, p->initiator_depth,
1254                 queue->dev->device->attrs.max_qp_init_rd_atom);
1255         param.private_data = &priv;
1256         param.private_data_len = sizeof(priv);
1257         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1258         priv.crqsize = cpu_to_le16(queue->recv_queue_size);
1259
1260         ret = rdma_accept(cm_id, &param);
1261         if (ret)
1262                 pr_err("rdma_accept failed (error code = %d)\n", ret);
1263
1264         return ret;
1265 }
1266
1267 static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
1268                 struct rdma_cm_event *event)
1269 {
1270         struct nvmet_rdma_device *ndev;
1271         struct nvmet_rdma_queue *queue;
1272         int ret = -EINVAL;
1273
1274         ndev = nvmet_rdma_find_get_device(cm_id);
1275         if (!ndev) {
1276                 nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
1277                 return -ECONNREFUSED;
1278         }
1279
1280         queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
1281         if (!queue) {
1282                 ret = -ENOMEM;
1283                 goto put_device;
1284         }
1285         queue->port = cm_id->context;
1286
1287         if (queue->host_qid == 0) {
1288                 /* Let inflight controller teardown complete */
1289                 flush_scheduled_work();
1290         }
1291
1292         ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
1293         if (ret) {
1294                 schedule_work(&queue->release_work);
1295                 /* Destroying rdma_cm id is not needed here */
1296                 return 0;
1297         }
1298
1299         mutex_lock(&nvmet_rdma_queue_mutex);
1300         list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
1301         mutex_unlock(&nvmet_rdma_queue_mutex);
1302
1303         return 0;
1304
1305 put_device:
1306         kref_put(&ndev->ref, nvmet_rdma_free_dev);
1307
1308         return ret;
1309 }
1310
1311 static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
1312 {
1313         unsigned long flags;
1314
1315         spin_lock_irqsave(&queue->state_lock, flags);
1316         if (queue->state != NVMET_RDMA_Q_CONNECTING) {
1317                 pr_warn("trying to establish a connected queue\n");
1318                 goto out_unlock;
1319         }
1320         queue->state = NVMET_RDMA_Q_LIVE;
1321
1322         while (!list_empty(&queue->rsp_wait_list)) {
1323                 struct nvmet_rdma_rsp *cmd;
1324
1325                 cmd = list_first_entry(&queue->rsp_wait_list,
1326                                         struct nvmet_rdma_rsp, wait_list);
1327                 list_del(&cmd->wait_list);
1328
1329                 spin_unlock_irqrestore(&queue->state_lock, flags);
1330                 nvmet_rdma_handle_command(queue, cmd);
1331                 spin_lock_irqsave(&queue->state_lock, flags);
1332         }
1333
1334 out_unlock:
1335         spin_unlock_irqrestore(&queue->state_lock, flags);
1336 }
1337
1338 static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1339 {
1340         bool disconnect = false;
1341         unsigned long flags;
1342
1343         pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
1344
1345         spin_lock_irqsave(&queue->state_lock, flags);
1346         switch (queue->state) {
1347         case NVMET_RDMA_Q_CONNECTING:
1348         case NVMET_RDMA_Q_LIVE:
1349                 queue->state = NVMET_RDMA_Q_DISCONNECTING;
1350                 disconnect = true;
1351                 break;
1352         case NVMET_RDMA_Q_DISCONNECTING:
1353                 break;
1354         }
1355         spin_unlock_irqrestore(&queue->state_lock, flags);
1356
1357         if (disconnect) {
1358                 rdma_disconnect(queue->cm_id);
1359                 schedule_work(&queue->release_work);
1360         }
1361 }
1362
1363 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1364 {
1365         bool disconnect = false;
1366
1367         mutex_lock(&nvmet_rdma_queue_mutex);
1368         if (!list_empty(&queue->queue_list)) {
1369                 list_del_init(&queue->queue_list);
1370                 disconnect = true;
1371         }
1372         mutex_unlock(&nvmet_rdma_queue_mutex);
1373
1374         if (disconnect)
1375                 __nvmet_rdma_queue_disconnect(queue);
1376 }
1377
1378 static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
1379                 struct nvmet_rdma_queue *queue)
1380 {
1381         WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
1382
1383         mutex_lock(&nvmet_rdma_queue_mutex);
1384         if (!list_empty(&queue->queue_list))
1385                 list_del_init(&queue->queue_list);
1386         mutex_unlock(&nvmet_rdma_queue_mutex);
1387
1388         pr_err("failed to connect queue %d\n", queue->idx);
1389         schedule_work(&queue->release_work);
1390 }
1391
1392 /**
1393  * nvmet_rdma_device_removal() - Handle RDMA device removal
1394  * @cm_id:      rdma_cm id, used for nvmet port
1395  * @queue:      nvmet rdma queue (cm id qp_context)
1396  *
1397  * DEVICE_REMOVAL event notifies us that the RDMA device is about
1398  * to unplug. Note that this event can be generated on a normal
1399  * queue cm_id and/or a device bound listener cm_id (in which case
1400  * queue will be NULL).
1401  *
1402  * We registered an ib_client to handle device removal for queues,
1403  * so we only need to handle the listening port cm_ids. In that case
1404  * we nullify the priv to prevent double cm_id destruction and destroy
1405  * the cm_id implicitly by returning a non-zero rc to the callout.
1406  */
1407 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
1408                 struct nvmet_rdma_queue *queue)
1409 {
1410         struct nvmet_port *port;
1411
1412         if (queue) {
1413                 /*
1414                  * This is a queue cm_id. We have registered
1415                  * an ib_client to handle queue removal,
1416                  * so don't interfere and just return.
1417                  */
1418                 return 0;
1419         }
1420
1421         port = cm_id->context;
1422
1423         /*
1424          * This is a listener cm_id. Make sure that
1425          * future remove_port won't invoke a double
1426          * cm_id destroy. Use atomic xchg to make sure
1427          * we don't compete with remove_port.
1428          */
1429         if (xchg(&port->priv, NULL) != cm_id)
1430                 return 0;
1431
1432         /*
1433          * We need to return 1 so that the core will destroy
1434          * its own ID.  What a great API design..
1435          */
1436         return 1;
1437 }
1438
1439 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
1440                 struct rdma_cm_event *event)
1441 {
1442         struct nvmet_rdma_queue *queue = NULL;
1443         int ret = 0;
1444
1445         if (cm_id->qp)
1446                 queue = cm_id->qp->qp_context;
1447
1448         pr_debug("%s (%d): status %d id %p\n",
1449                 rdma_event_msg(event->event), event->event,
1450                 event->status, cm_id);
1451
1452         switch (event->event) {
1453         case RDMA_CM_EVENT_CONNECT_REQUEST:
1454                 ret = nvmet_rdma_queue_connect(cm_id, event);
1455                 break;
1456         case RDMA_CM_EVENT_ESTABLISHED:
1457                 nvmet_rdma_queue_established(queue);
1458                 break;
1459         case RDMA_CM_EVENT_ADDR_CHANGE:
1460         case RDMA_CM_EVENT_DISCONNECTED:
1461         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1462                 nvmet_rdma_queue_disconnect(queue);
1463                 break;
1464         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1465                 ret = nvmet_rdma_device_removal(cm_id, queue);
1466                 break;
1467         case RDMA_CM_EVENT_REJECTED:
1468                 pr_debug("Connection rejected: %s\n",
1469                          rdma_reject_msg(cm_id, event->status));
1470                 /* FALLTHROUGH */
1471         case RDMA_CM_EVENT_UNREACHABLE:
1472         case RDMA_CM_EVENT_CONNECT_ERROR:
1473                 nvmet_rdma_queue_connect_fail(cm_id, queue);
1474                 break;
1475         default:
1476                 pr_err("received unrecognized RDMA CM event %d\n",
1477                         event->event);
1478                 break;
1479         }
1480
1481         return ret;
1482 }
1483
1484 static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
1485 {
1486         struct nvmet_rdma_queue *queue;
1487
1488 restart:
1489         mutex_lock(&nvmet_rdma_queue_mutex);
1490         list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
1491                 if (queue->nvme_sq.ctrl == ctrl) {
1492                         list_del_init(&queue->queue_list);
1493                         mutex_unlock(&nvmet_rdma_queue_mutex);
1494
1495                         __nvmet_rdma_queue_disconnect(queue);
1496                         goto restart;
1497                 }
1498         }
1499         mutex_unlock(&nvmet_rdma_queue_mutex);
1500 }
1501
1502 static int nvmet_rdma_add_port(struct nvmet_port *port)
1503 {
1504         struct rdma_cm_id *cm_id;
1505         struct sockaddr_storage addr = { };
1506         __kernel_sa_family_t af;
1507         int ret;
1508
1509         switch (port->disc_addr.adrfam) {
1510         case NVMF_ADDR_FAMILY_IP4:
1511                 af = AF_INET;
1512                 break;
1513         case NVMF_ADDR_FAMILY_IP6:
1514                 af = AF_INET6;
1515                 break;
1516         default:
1517                 pr_err("address family %d not supported\n",
1518                                 port->disc_addr.adrfam);
1519                 return -EINVAL;
1520         }
1521
1522         if (port->inline_data_size < 0) {
1523                 port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
1524         } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
1525                 pr_warn("inline_data_size %u is too large, reducing to %u\n",
1526                         port->inline_data_size,
1527                         NVMET_RDMA_MAX_INLINE_DATA_SIZE);
1528                 port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
1529         }
1530
1531         ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
1532                         port->disc_addr.trsvcid, &addr);
1533         if (ret) {
1534                 pr_err("malformed ip/port passed: %s:%s\n",
1535                         port->disc_addr.traddr, port->disc_addr.trsvcid);
1536                 return ret;
1537         }
1538
1539         cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
1540                         RDMA_PS_TCP, IB_QPT_RC);
1541         if (IS_ERR(cm_id)) {
1542                 pr_err("CM ID creation failed\n");
1543                 return PTR_ERR(cm_id);
1544         }
1545
1546         /*
1547          * Allow both IPv4 and IPv6 sockets to bind a single port
1548          * at the same time.
1549          */
1550         ret = rdma_set_afonly(cm_id, 1);
1551         if (ret) {
1552                 pr_err("rdma_set_afonly failed (%d)\n", ret);
1553                 goto out_destroy_id;
1554         }
1555
1556         ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
1557         if (ret) {
1558                 pr_err("binding CM ID to %pISpcs failed (%d)\n",
1559                         (struct sockaddr *)&addr, ret);
1560                 goto out_destroy_id;
1561         }
1562
1563         ret = rdma_listen(cm_id, 128);
1564         if (ret) {
1565                 pr_err("listening to %pISpcs failed (%d)\n",
1566                         (struct sockaddr *)&addr, ret);
1567                 goto out_destroy_id;
1568         }
1569
1570         pr_info("enabling port %d (%pISpcs)\n",
1571                 le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
1572         port->priv = cm_id;
1573         return 0;
1574
1575 out_destroy_id:
1576         rdma_destroy_id(cm_id);
1577         return ret;
1578 }
1579
1580 static void nvmet_rdma_remove_port(struct nvmet_port *port)
1581 {
1582         struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
1583
1584         if (cm_id)
1585                 rdma_destroy_id(cm_id);
1586 }
1587
1588 static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
1589                 struct nvmet_port *port, char *traddr)
1590 {
1591         struct rdma_cm_id *cm_id = port->priv;
1592
1593         if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
1594                 struct nvmet_rdma_rsp *rsp =
1595                         container_of(req, struct nvmet_rdma_rsp, req);
1596                 struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
1597                 struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;
1598
1599                 sprintf(traddr, "%pISc", addr);
1600         } else {
1601                 memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
1602         }
1603 }
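/*
 * If the listening cm_id was bound to a wildcard address, the configured
 * traddr is not meaningful to report in the discovery log; the source
 * address of this connection's cm_id is formatted instead, so the host is
 * given an address it can actually connect to.
 */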
1604
1605 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
1606         .owner                  = THIS_MODULE,
1607         .type                   = NVMF_TRTYPE_RDMA,
1608         .msdbd                  = 1,
1609         .has_keyed_sgls         = 1,
1610         .add_port               = nvmet_rdma_add_port,
1611         .remove_port            = nvmet_rdma_remove_port,
1612         .queue_response         = nvmet_rdma_queue_response,
1613         .delete_ctrl            = nvmet_rdma_delete_ctrl,
1614         .disc_traddr            = nvmet_rdma_disc_port_addr,
1615 };
1616
1617 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
1618 {
1619         struct nvmet_rdma_queue *queue, *tmp;
1620         struct nvmet_rdma_device *ndev;
1621         bool found = false;
1622
1623         mutex_lock(&device_list_mutex);
1624         list_for_each_entry(ndev, &device_list, entry) {
1625                 if (ndev->device == ib_device) {
1626                         found = true;
1627                         break;
1628                 }
1629         }
1630         mutex_unlock(&device_list_mutex);
1631
1632         if (!found)
1633                 return;
1634
1635         /*
1636          * The IB device used by nvmet controllers is being removed;
1637          * delete all queues using this device.
1638          */
1639         mutex_lock(&nvmet_rdma_queue_mutex);
1640         list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
1641                                  queue_list) {
1642                 if (queue->dev->device != ib_device)
1643                         continue;
1644
1645                 pr_info("Removing queue %d\n", queue->idx);
1646                 list_del_init(&queue->queue_list);
1647                 __nvmet_rdma_queue_disconnect(queue);
1648         }
1649         mutex_unlock(&nvmet_rdma_queue_mutex);
1650
1651         flush_scheduled_work();
1652 }
1653
1654 static struct ib_client nvmet_rdma_ib_client = {
1655         .name   = "nvmet_rdma",
1656         .remove = nvmet_rdma_remove_one
1657 };
1658
1659 static int __init nvmet_rdma_init(void)
1660 {
1661         int ret;
1662
1663         ret = ib_register_client(&nvmet_rdma_ib_client);
1664         if (ret)
1665                 return ret;
1666
1667         ret = nvmet_register_transport(&nvmet_rdma_ops);
1668         if (ret)
1669                 goto err_ib_client;
1670
1671         return 0;
1672
1673 err_ib_client:
1674         ib_unregister_client(&nvmet_rdma_ib_client);
1675         return ret;
1676 }
1677
1678 static void __exit nvmet_rdma_exit(void)
1679 {
1680         nvmet_unregister_transport(&nvmet_rdma_ops);
1681         ib_unregister_client(&nvmet_rdma_ib_client);
1682         WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
1683         ida_destroy(&nvmet_rdma_queue_ida);
1684 }
1685
1686 module_init(nvmet_rdma_init);
1687 module_exit(nvmet_rdma_exit);
1688
1689 MODULE_LICENSE("GPL v2");
1690 MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */