librecmc/linux-libre.git: net/sunrpc/xprtrdma/verbs.c (Linux-libre 3.16.41-gnu)
1 /*
2  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the BSD-type
8  * license below:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  *      Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *
17  *      Redistributions in binary form must reproduce the above
18  *      copyright notice, this list of conditions and the following
19  *      disclaimer in the documentation and/or other materials provided
20  *      with the distribution.
21  *
22  *      Neither the name of the Network Appliance, Inc. nor the names of
23  *      its contributors may be used to endorse or promote products
24  *      derived from this software without specific prior written
25  *      permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39
40 /*
41  * verbs.c
42  *
43  * Encapsulates the major functions managing:
44  *  o adapters
45  *  o endpoints
46  *  o connections
47  *  o buffer memory
48  */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <asm/bitops.h>
53
54 #include "xprt_rdma.h"
55
56 /*
57  * Globals/Macros
58  */
59
60 #ifdef RPC_DEBUG
61 # define RPCDBG_FACILITY        RPCDBG_TRANS
62 #endif
63
64 /*
65  * internal functions
66  */
67
68 /*
69  * handle replies in tasklet context, using a single, global list
70  * rdma tasklet function -- just turn around and call the func
71  * for all replies on the list
72  */
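/*
 * Producers: rpcrdma_schedule_tasklet() (called from the receive
 * completion path below) queues each rep on rpcrdma_tasklets_g.  The
 * tasklet drains the list under rpcrdma_tk_lock_g; a rep with no
 * rr_func is simply returned to the receive buffer pool.
 */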
73
74 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75 static LIST_HEAD(rpcrdma_tasklets_g);
76
77 static void
78 rpcrdma_run_tasklet(unsigned long data)
79 {
80         struct rpcrdma_rep *rep;
81         void (*func)(struct rpcrdma_rep *);
82         unsigned long flags;
83
84         data = data;    /* the tasklet data argument is unused */
85         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
86         while (!list_empty(&rpcrdma_tasklets_g)) {
87                 rep = list_entry(rpcrdma_tasklets_g.next,
88                                  struct rpcrdma_rep, rr_list);
89                 list_del(&rep->rr_list);
90                 func = rep->rr_func;
91                 rep->rr_func = NULL;
92                 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93
94                 if (func)
95                         func(rep);
96                 else
97                         rpcrdma_recv_buffer_put(rep);
98
99                 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100         }
101         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102 }
103
104 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105
106 static inline void
107 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
108 {
109         unsigned long flags;
110
111         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
112         list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
113         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
114         tasklet_schedule(&rpcrdma_tasklet_g);
115 }
116
117 static void
118 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119 {
120         struct rpcrdma_ep *ep = context;
121
122         dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
123                 __func__, event->event, event->device->name, context);
124         if (ep->rep_connected == 1) {
125                 ep->rep_connected = -EIO;
126                 ep->rep_func(ep);
127                 wake_up_all(&ep->rep_connect_wait);
128         }
129 }
130
131 static void
132 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
133 {
134         struct rpcrdma_ep *ep = context;
135
136         dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
137                 __func__, event->event, event->device->name, context);
138         if (ep->rep_connected == 1) {
139                 ep->rep_connected = -EIO;
140                 ep->rep_func(ep);
141                 wake_up_all(&ep->rep_connect_wait);
142         }
143 }
144
145 static void
146 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
147 {
148         struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
149
150         dprintk("RPC:       %s: frmr %p status %X opcode %d\n",
151                 __func__, frmr, wc->status, wc->opcode);
152
153         if (wc->wr_id == 0ULL)
154                 return;
155         if (wc->status != IB_WC_SUCCESS)
156                 return;
157
158         if (wc->opcode == IB_WC_FAST_REG_MR)
159                 frmr->r.frmr.state = FRMR_IS_VALID;
160         else if (wc->opcode == IB_WC_LOCAL_INV)
161                 frmr->r.frmr.state = FRMR_IS_INVALID;
162 }
163
164 static int
165 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
166 {
167         struct ib_wc *wcs;
168         int budget, count, rc;
169
170         budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
171         do {
172                 wcs = ep->rep_send_wcs;
173
174                 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
175                 if (rc <= 0)
176                         return rc;
177
178                 count = rc;
179                 while (count-- > 0)
180                         rpcrdma_sendcq_process_wc(wcs++);
181         } while (rc == RPCRDMA_POLLSIZE && --budget);
182         return 0;
183 }
184
185 /* Handle provider send completion upcalls.
186  */
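/*
 * The poll/re-arm loop relies on IB_CQ_REPORT_MISSED_EVENTS: if
 * ib_req_notify_cq() reports that completions may have arrived between
 * the last poll and re-arming, the CQ is polled again before returning,
 * so no completion is left unreaped.
 */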
187 static void
188 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
189 {
190         struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
191
192         do {
193                 rpcrdma_sendcq_poll(cq, ep);
194         } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
195                                   IB_CQ_REPORT_MISSED_EVENTS) > 0);
196 }
197
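/*
 * Completed receives are only partly processed in interrupt context:
 * the received length is recorded (or set to ~0U if the WC failed),
 * the credit count is refreshed from the RPC/RDMA header, and the rest
 * of reply handling is deferred to the tasklet above.
 */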
198 static void
199 rpcrdma_recvcq_process_wc(struct ib_wc *wc)
200 {
201         struct rpcrdma_rep *rep =
202                         (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
203
204         dprintk("RPC:       %s: rep %p status %X opcode %X length %u\n",
205                 __func__, rep, wc->status, wc->opcode, wc->byte_len);
206
207         if (wc->status != IB_WC_SUCCESS) {
208                 rep->rr_len = ~0U;
209                 goto out_schedule;
210         }
211         if (wc->opcode != IB_WC_RECV)
212                 return;
213
214         rep->rr_len = wc->byte_len;
215         ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
216                         rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
217
218         if (rep->rr_len >= 16) {
219                 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
220                 unsigned int credits = ntohl(p->rm_credit);
221
222                 if (credits == 0)
223                         credits = 1;    /* don't deadlock */
224                 else if (credits > rep->rr_buffer->rb_max_requests)
225                         credits = rep->rr_buffer->rb_max_requests;
226                 atomic_set(&rep->rr_buffer->rb_credits, credits);
227         }
228
229 out_schedule:
230         rpcrdma_schedule_tasklet(rep);
231 }
232
233 static int
234 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
235 {
236         struct ib_wc *wcs;
237         int budget, count, rc;
238
239         budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
240         do {
241                 wcs = ep->rep_recv_wcs;
242
243                 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
244                 if (rc <= 0)
245                         return rc;
246
247                 count = rc;
248                 while (count-- > 0)
249                         rpcrdma_recvcq_process_wc(wcs++);
250         } while (rc == RPCRDMA_POLLSIZE && --budget);
251         return 0;
252 }
253
254 /* Handle provider receive completion upcalls.
255  */
256 static void
257 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
258 {
259         struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
260
261         do {
262                 rpcrdma_recvcq_poll(cq, ep);
263         } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
264                                   IB_CQ_REPORT_MISSED_EVENTS) > 0);
265 }
266
267 #ifdef RPC_DEBUG
268 static const char * const conn[] = {
269         "address resolved",
270         "address error",
271         "route resolved",
272         "route error",
273         "connect request",
274         "connect response",
275         "connect error",
276         "unreachable",
277         "rejected",
278         "established",
279         "disconnected",
280         "device removal"
281 };
282 #endif
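/*
 * The conn[] strings above map RDMA_CM_EVENT_* codes 0-11, in enum
 * order; rpcrdma_conn_upcall() below indexes the table only for events
 * in that range.
 */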
283
284 static int
285 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
286 {
287         struct rpcrdma_xprt *xprt = id->context;
288         struct rpcrdma_ia *ia = &xprt->rx_ia;
289         struct rpcrdma_ep *ep = &xprt->rx_ep;
290 #ifdef RPC_DEBUG
291         struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
292 #endif
293         struct ib_qp_attr attr;
294         struct ib_qp_init_attr iattr;
295         int connstate = 0;
296
297         switch (event->event) {
298         case RDMA_CM_EVENT_ADDR_RESOLVED:
299         case RDMA_CM_EVENT_ROUTE_RESOLVED:
300                 ia->ri_async_rc = 0;
301                 complete(&ia->ri_done);
302                 break;
303         case RDMA_CM_EVENT_ADDR_ERROR:
304                 ia->ri_async_rc = -EHOSTUNREACH;
305                 dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
306                         __func__, ep);
307                 complete(&ia->ri_done);
308                 break;
309         case RDMA_CM_EVENT_ROUTE_ERROR:
310                 ia->ri_async_rc = -ENETUNREACH;
311                 dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
312                         __func__, ep);
313                 complete(&ia->ri_done);
314                 break;
315         case RDMA_CM_EVENT_ESTABLISHED:
316                 connstate = 1;
317                 ib_query_qp(ia->ri_id->qp, &attr,
318                         IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
319                         &iattr);
320                 dprintk("RPC:       %s: %d responder resources"
321                         " (%d initiator)\n",
322                         __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
323                 goto connected;
324         case RDMA_CM_EVENT_CONNECT_ERROR:
325                 connstate = -ENOTCONN;
326                 goto connected;
327         case RDMA_CM_EVENT_UNREACHABLE:
328                 connstate = -ENETDOWN;
329                 goto connected;
330         case RDMA_CM_EVENT_REJECTED:
331                 connstate = -ECONNREFUSED;
332                 goto connected;
333         case RDMA_CM_EVENT_DISCONNECTED:
334                 connstate = -ECONNABORTED;
335                 goto connected;
336         case RDMA_CM_EVENT_DEVICE_REMOVAL:
337                 connstate = -ENODEV;
338 connected:
339                 dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
340                         __func__,
341                         (event->event <= 11) ? conn[event->event] :
342                                                 "unknown connection error",
343                         &addr->sin_addr.s_addr,
344                         ntohs(addr->sin_port),
345                         ep, event->event);
346                 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
347                 dprintk("RPC:       %s: %sconnected\n",
348                                         __func__, connstate > 0 ? "" : "dis");
349                 ep->rep_connected = connstate;
350                 ep->rep_func(ep);
351                 wake_up_all(&ep->rep_connect_wait);
352                 break;
353         default:
354                 dprintk("RPC:       %s: unexpected CM event %d\n",
355                         __func__, event->event);
356                 break;
357         }
358
359 #ifdef RPC_DEBUG
360         if (connstate == 1) {
361                 int ird = attr.max_dest_rd_atomic;
362                 int tird = ep->rep_remote_cma.responder_resources;
363                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
364                         "on %s, memreg %d slots %d ird %d%s\n",
365                         &addr->sin_addr.s_addr,
366                         ntohs(addr->sin_port),
367                         ia->ri_id->device->name,
368                         ia->ri_memreg_strategy,
369                         xprt->rx_buf.rb_max_requests,
370                         ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
371         } else if (connstate < 0) {
372                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
373                         &addr->sin_addr.s_addr,
374                         ntohs(addr->sin_port),
375                         connstate);
376         }
377 #endif
378
379         return 0;
380 }
381
382 static struct rdma_cm_id *
383 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
384                         struct rpcrdma_ia *ia, struct sockaddr *addr)
385 {
386         struct rdma_cm_id *id;
387         int rc;
388
389         init_completion(&ia->ri_done);
390
391         id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
392         if (IS_ERR(id)) {
393                 rc = PTR_ERR(id);
394                 dprintk("RPC:       %s: rdma_create_id() failed %i\n",
395                         __func__, rc);
396                 return id;
397         }
398
399         ia->ri_async_rc = -ETIMEDOUT;
400         rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
401         if (rc) {
402                 dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
403                         __func__, rc);
404                 goto out;
405         }
406         wait_for_completion_interruptible_timeout(&ia->ri_done,
407                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
408         rc = ia->ri_async_rc;
409         if (rc)
410                 goto out;
411
412         ia->ri_async_rc = -ETIMEDOUT;
413         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
414         if (rc) {
415                 dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
416                         __func__, rc);
417                 goto out;
418         }
419         wait_for_completion_interruptible_timeout(&ia->ri_done,
420                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
421         rc = ia->ri_async_rc;
422         if (rc)
423                 goto out;
424
425         return id;
426
427 out:
428         rdma_destroy_id(id);
429         return ERR_PTR(rc);
430 }
431
432 /*
433  * Drain any cq, prior to teardown.
434  */
435 static void
436 rpcrdma_clean_cq(struct ib_cq *cq)
437 {
438         struct ib_wc wc;
439         int count = 0;
440
441         while (1 == ib_poll_cq(cq, 1, &wc))
442                 ++count;
443
444         if (count)
445                 dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
446                         __func__, count, wc.opcode);
447 }
448
449 /*
450  * Exported functions.
451  */
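/*
 * Typical calling sequence (a sketch; the transport code drives this):
 * rpcrdma_ia_open(), rpcrdma_ep_create() and rpcrdma_buffer_create() at
 * setup; rpcrdma_ep_connect() when (re)connecting; at teardown,
 * rpcrdma_buffer_destroy() before rpcrdma_ep_destroy(), since buffers
 * hold registrations, then rpcrdma_ia_close().
 */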
452
453 /*
454  * Open and initialize an Interface Adapter.
455  *  o initializes fields of struct rpcrdma_ia, including
456  *    interface and provider attributes and protection domain.
457  */
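/*
 * If the device cannot support the requested registration mode, a
 * fallback is selected below: FRMR falls back to MTHCAFMR, and
 * MTHCAFMR falls back to ALLPHYSICAL when persistent registration is
 * compiled in (otherwise the open fails with -ENOMEM).
 */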
458 int
459 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
460 {
461         int rc, mem_priv;
462         struct ib_device_attr devattr;
463         struct rpcrdma_ia *ia = &xprt->rx_ia;
464
465         ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
466         if (IS_ERR(ia->ri_id)) {
467                 rc = PTR_ERR(ia->ri_id);
468                 goto out1;
469         }
470
471         ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
472         if (IS_ERR(ia->ri_pd)) {
473                 rc = PTR_ERR(ia->ri_pd);
474                 dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
475                         __func__, rc);
476                 goto out2;
477         }
478
479         /*
480          * Query the device to determine if the requested memory
481          * registration strategy is supported. If it isn't, set the
482          * strategy to a globally supported model.
483          */
484         rc = ib_query_device(ia->ri_id->device, &devattr);
485         if (rc) {
486                 dprintk("RPC:       %s: ib_query_device failed %d\n",
487                         __func__, rc);
488                 goto out3;
489         }
490
491         if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
492                 ia->ri_have_dma_lkey = 1;
493                 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
494         }
495
496         if (memreg == RPCRDMA_FRMR) {
497                 /* Requires both frmr reg and local dma lkey */
498                 if (((devattr.device_cap_flags &
499                      (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
500                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
501                       (devattr.max_fast_reg_page_list_len == 0)) {
502                         dprintk("RPC:       %s: FRMR registration "
503                                 "not supported by HCA\n", __func__);
504                         memreg = RPCRDMA_MTHCAFMR;
505                 } else {
506                         /* Mind the ia limit on FRMR page list depth */
507                         ia->ri_max_frmr_depth = min_t(unsigned int,
508                                 RPCRDMA_MAX_DATA_SEGS,
509                                 devattr.max_fast_reg_page_list_len);
510                 }
511         }
512         if (memreg == RPCRDMA_MTHCAFMR) {
513                 if (!ia->ri_id->device->alloc_fmr) {
514                         dprintk("RPC:       %s: MTHCAFMR registration "
515                                 "not supported by HCA\n", __func__);
516 #if RPCRDMA_PERSISTENT_REGISTRATION
517                         memreg = RPCRDMA_ALLPHYSICAL;
518 #else
519                         rc = -ENOMEM;
520                         goto out2;
521 #endif
522                 }
523         }
524
525         /*
526          * Optionally obtain an underlying physical identity mapping in
527          * order to do a memory window-based bind. This base registration
528          * is protected from remote access - that is enabled only by binding
529          * for the specific bytes targeted during each RPC operation, and
530          * revoked after the corresponding completion similar to a storage
531  * revoked after the corresponding completion, similar to a storage
532          */
533         switch (memreg) {
534         case RPCRDMA_FRMR:
535                 break;
536 #if RPCRDMA_PERSISTENT_REGISTRATION
537         case RPCRDMA_ALLPHYSICAL:
538                 mem_priv = IB_ACCESS_LOCAL_WRITE |
539                                 IB_ACCESS_REMOTE_WRITE |
540                                 IB_ACCESS_REMOTE_READ;
541                 goto register_setup;
542 #endif
543         case RPCRDMA_MTHCAFMR:
544                 if (ia->ri_have_dma_lkey)
545                         break;
546                 mem_priv = IB_ACCESS_LOCAL_WRITE;
547 #if RPCRDMA_PERSISTENT_REGISTRATION
548         register_setup:
549 #endif
550                 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
551                 if (IS_ERR(ia->ri_bind_mem)) {
552                         printk(KERN_ALERT "%s: ib_get_dma_mr for "
553                                 "phys register failed with %lX\n",
554                                 __func__, PTR_ERR(ia->ri_bind_mem));
555                         rc = -ENOMEM;
556                         goto out3;
557                 }
558                 break;
559         default:
560                 printk(KERN_ERR "RPC: Unsupported memory "
561                                 "registration mode: %d\n", memreg);
562                 rc = -ENOMEM;
563                 goto out3;
564         }
565         dprintk("RPC:       %s: memory registration strategy is %d\n",
566                 __func__, memreg);
567
568         /* Else will do memory reg/dereg for each chunk */
569         ia->ri_memreg_strategy = memreg;
570
571         return 0;
572
573 out3:
574         ib_dealloc_pd(ia->ri_pd);
575         ia->ri_pd = NULL;
576 out2:
577         rdma_destroy_id(ia->ri_id);
578         ia->ri_id = NULL;
579 out1:
580         return rc;
581 }
582
583 /*
584  * Clean up/close an IA.
585  *   o if event handles and PD have been initialized, free them.
586  *   o close the IA
587  */
588 void
589 rpcrdma_ia_close(struct rpcrdma_ia *ia)
590 {
591         int rc;
592
593         dprintk("RPC:       %s: entering\n", __func__);
594         if (ia->ri_bind_mem != NULL) {
595                 rc = ib_dereg_mr(ia->ri_bind_mem);
596                 dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
597                         __func__, rc);
598         }
599         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
600                 if (ia->ri_id->qp)
601                         rdma_destroy_qp(ia->ri_id);
602                 rdma_destroy_id(ia->ri_id);
603                 ia->ri_id = NULL;
604         }
605         if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
606                 rc = ib_dealloc_pd(ia->ri_pd);
607                 dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
608                         __func__, rc);
609         }
610 }
611
612 /*
613  * Create unconnected endpoint.
614  */
615 int
616 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
617                                 struct rpcrdma_create_data_internal *cdata)
618 {
619         struct ib_device_attr devattr;
620         struct ib_cq *sendcq, *recvcq;
621         int rc, err;
622
623         rc = ib_query_device(ia->ri_id->device, &devattr);
624         if (rc) {
625                 dprintk("RPC:       %s: ib_query_device failed %d\n",
626                         __func__, rc);
627                 return rc;
628         }
629
630         /* check provider's send/recv wr limits */
631         if (cdata->max_requests > devattr.max_qp_wr)
632                 cdata->max_requests = devattr.max_qp_wr;
633
634         ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
635         ep->rep_attr.qp_context = ep;
636         /* send_cq and recv_cq initialized below */
637         ep->rep_attr.srq = NULL;
638         ep->rep_attr.cap.max_send_wr = cdata->max_requests;
639         switch (ia->ri_memreg_strategy) {
640         case RPCRDMA_FRMR: {
641                 int depth = 7;
642
643                 /* Add room for frmr register and invalidate WRs.
644                  * 1. FRMR reg WR for head
645                  * 2. FRMR invalidate WR for head
646                  * 3. N FRMR reg WRs for pagelist
647                  * 4. N FRMR invalidate WRs for pagelist
648                  * 5. FRMR reg WR for tail
649                  * 6. FRMR invalidate WR for tail
650                  * 7. The RDMA_SEND WR
651                  */
652
653                 /* Calculate N if the device max FRMR depth is smaller than
654                  * RPCRDMA_MAX_DATA_SEGS.
655                  */
656                 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
657                         int delta = RPCRDMA_MAX_DATA_SEGS -
658                                     ia->ri_max_frmr_depth;
659
660                         do {
661                                 depth += 2; /* FRMR reg + invalidate */
662                                 delta -= ia->ri_max_frmr_depth;
663                         } while (delta > 0);
664
665                 }
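                /* Illustration with hypothetical values: if
                 * RPCRDMA_MAX_DATA_SEGS were 64 and ri_max_frmr_depth 16,
                 * the loop above runs three times and depth becomes
                 * 7 + 2 * 3 = 13, i.e. four FRMR reg/invalidate pairs
                 * cover the pagelist.
                 */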
666                 ep->rep_attr.cap.max_send_wr *= depth;
667                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
668                         cdata->max_requests = devattr.max_qp_wr / depth;
669                         if (!cdata->max_requests)
670                                 return -EINVAL;
671                         ep->rep_attr.cap.max_send_wr = cdata->max_requests *
672                                                        depth;
673                 }
674                 break;
675         }
676         default:
677                 break;
678         }
679         ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
680         ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
681         ep->rep_attr.cap.max_recv_sge = 1;
682         ep->rep_attr.cap.max_inline_data = 0;
683         ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
684         ep->rep_attr.qp_type = IB_QPT_RC;
685         ep->rep_attr.port_num = ~0;
686
687         dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
688                 "iovs: send %d recv %d\n",
689                 __func__,
690                 ep->rep_attr.cap.max_send_wr,
691                 ep->rep_attr.cap.max_recv_wr,
692                 ep->rep_attr.cap.max_send_sge,
693                 ep->rep_attr.cap.max_recv_sge);
694
695         /* set trigger for requesting send completion */
696         ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
697         if (ep->rep_cqinit <= 2)
698                 ep->rep_cqinit = 0;
699         INIT_CQCOUNT(ep);
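        /* rep_cqinit seeds the endpoint's send-completion counter
         * (INIT_CQCOUNT/DECR_CQCOUNT); presumably it bounds how many
         * sends are posted before a signalled completion is requested,
         * and a value of 0 requests one for every send.
         */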
700         ep->rep_ia = ia;
701         init_waitqueue_head(&ep->rep_connect_wait);
702         INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
703
704         sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
705                                   rpcrdma_cq_async_error_upcall, ep,
706                                   ep->rep_attr.cap.max_send_wr + 1, 0);
707         if (IS_ERR(sendcq)) {
708                 rc = PTR_ERR(sendcq);
709                 dprintk("RPC:       %s: failed to create send CQ: %i\n",
710                         __func__, rc);
711                 goto out1;
712         }
713
714         rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
715         if (rc) {
716                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
717                         __func__, rc);
718                 goto out2;
719         }
720
721         recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
722                                   rpcrdma_cq_async_error_upcall, ep,
723                                   ep->rep_attr.cap.max_recv_wr + 1, 0);
724         if (IS_ERR(recvcq)) {
725                 rc = PTR_ERR(recvcq);
726                 dprintk("RPC:       %s: failed to create recv CQ: %i\n",
727                         __func__, rc);
728                 goto out2;
729         }
730
731         rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
732         if (rc) {
733                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
734                         __func__, rc);
735                 ib_destroy_cq(recvcq);
736                 goto out2;
737         }
738
739         ep->rep_attr.send_cq = sendcq;
740         ep->rep_attr.recv_cq = recvcq;
741
742         /* Initialize cma parameters */
743
744         /* RPC/RDMA does not use private data */
745         ep->rep_remote_cma.private_data = NULL;
746         ep->rep_remote_cma.private_data_len = 0;
747
748         /* Client offers RDMA Read but does not initiate */
749         ep->rep_remote_cma.initiator_depth = 0;
750         if (devattr.max_qp_rd_atom > 32)        /* arbitrary but <= 255 */
751                 ep->rep_remote_cma.responder_resources = 32;
752         else
753                 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
754
755         ep->rep_remote_cma.retry_count = 7;
756         ep->rep_remote_cma.flow_control = 0;
757         ep->rep_remote_cma.rnr_retry_count = 0;
758
759         return 0;
760
761 out2:
762         err = ib_destroy_cq(sendcq);
763         if (err)
764                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
765                         __func__, err);
766 out1:
767         return rc;
768 }
769
770 /*
771  * rpcrdma_ep_destroy
772  *
773  * Disconnect and destroy endpoint. After this, the only
774  * valid operations on the ep are to free it (if dynamically
775  * allocated) or re-create it.
776  */
777 void
778 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
779 {
780         int rc;
781
782         dprintk("RPC:       %s: entering, connected is %d\n",
783                 __func__, ep->rep_connected);
784
785         cancel_delayed_work_sync(&ep->rep_connect_worker);
786
787         if (ia->ri_id->qp) {
788                 rc = rpcrdma_ep_disconnect(ep, ia);
789                 if (rc)
790                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
791                                 " returned %i\n", __func__, rc);
792                 rdma_destroy_qp(ia->ri_id);
793                 ia->ri_id->qp = NULL;
794         }
795
796         /* padding - could be done in rpcrdma_buffer_destroy... */
797         if (ep->rep_pad_mr) {
798                 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
799                 ep->rep_pad_mr = NULL;
800         }
801
802         rpcrdma_clean_cq(ep->rep_attr.recv_cq);
803         rc = ib_destroy_cq(ep->rep_attr.recv_cq);
804         if (rc)
805                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
806                         __func__, rc);
807
808         rpcrdma_clean_cq(ep->rep_attr.send_cq);
809         rc = ib_destroy_cq(ep->rep_attr.send_cq);
810         if (rc)
811                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
812                         __func__, rc);
813 }
814
815 /*
816  * Connect unconnected endpoint.
817  */
818 int
819 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
820 {
821         struct rdma_cm_id *id;
822         int rc = 0;
823         int retry_count = 0;
824
825         if (ep->rep_connected != 0) {
826                 struct rpcrdma_xprt *xprt;
827 retry:
828                 dprintk("RPC:       %s: reconnecting...\n", __func__);
829                 rc = rpcrdma_ep_disconnect(ep, ia);
830                 if (rc && rc != -ENOTCONN)
831                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
832                                 " status %i\n", __func__, rc);
833
834                 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
835                 rpcrdma_clean_cq(ep->rep_attr.send_cq);
836
837                 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
838                 id = rpcrdma_create_id(xprt, ia,
839                                 (struct sockaddr *)&xprt->rx_data.addr);
840                 if (IS_ERR(id)) {
841                         rc = -EHOSTUNREACH;
842                         goto out;
843                 }
844                 /* TEMP TEMP TEMP - fail if new device:
845                  * Deregister/remarshal *all* requests!
846                  * Close and recreate adapter, pd, etc!
847                  * Re-determine all attributes still sane!
848                  * More stuff I haven't thought of!
849                  * Rrrgh!
850                  */
851                 if (ia->ri_id->device != id->device) {
852                         printk("RPC:       %s: can't reconnect on "
853                                 "different device!\n", __func__);
854                         rdma_destroy_id(id);
855                         rc = -ENETUNREACH;
856                         goto out;
857                 }
858                 /* END TEMP */
859                 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
860                 if (rc) {
861                         dprintk("RPC:       %s: rdma_create_qp failed %i\n",
862                                 __func__, rc);
863                         rdma_destroy_id(id);
864                         rc = -ENETUNREACH;
865                         goto out;
866                 }
867                 rdma_destroy_qp(ia->ri_id);
868                 rdma_destroy_id(ia->ri_id);
869                 ia->ri_id = id;
870         } else {
871                 dprintk("RPC:       %s: connecting...\n", __func__);
872                 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
873                 if (rc) {
874                         dprintk("RPC:       %s: rdma_create_qp failed %i\n",
875                                 __func__, rc);
876                         /* do not update ep->rep_connected */
877                         return -ENETUNREACH;
878                 }
879         }
880
881         ep->rep_connected = 0;
882
883         rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
884         if (rc) {
885                 dprintk("RPC:       %s: rdma_connect() failed with %i\n",
886                                 __func__, rc);
887                 goto out;
888         }
889
890         wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
891
892         /*
893          * Check state. A non-peer reject indicates no listener
894          * (ECONNREFUSED), which may be a transient state. All
895          * others indicate a transport condition for which a
896          * best-effort recovery has already been attempted.
897          */
898         if (ep->rep_connected == -ECONNREFUSED &&
899             ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
900                 dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
901                 goto retry;
902         }
903         if (ep->rep_connected <= 0) {
904                 /* Sometimes, the only way to reliably connect to remote
905                  * CMs is to use the same nonzero values for ORD and IRD. */
906                 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
907                     (ep->rep_remote_cma.responder_resources == 0 ||
908                      ep->rep_remote_cma.initiator_depth !=
909                                 ep->rep_remote_cma.responder_resources)) {
910                         if (ep->rep_remote_cma.responder_resources == 0)
911                                 ep->rep_remote_cma.responder_resources = 1;
912                         ep->rep_remote_cma.initiator_depth =
913                                 ep->rep_remote_cma.responder_resources;
914                         goto retry;
915                 }
916                 rc = ep->rep_connected;
917         } else {
918                 dprintk("RPC:       %s: connected\n", __func__);
919         }
920
921 out:
922         if (rc)
923                 ep->rep_connected = rc;
924         return rc;
925 }
926
927 /*
928  * rpcrdma_ep_disconnect
929  *
930  * This is separate from destroy to facilitate the ability
931  * to reconnect without recreating the endpoint.
932  *
933  * This call is not reentrant, and must not be made in parallel
934  * on the same endpoint.
935  */
936 int
937 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
938 {
939         int rc;
940
941         rpcrdma_clean_cq(ep->rep_attr.recv_cq);
942         rpcrdma_clean_cq(ep->rep_attr.send_cq);
943         rc = rdma_disconnect(ia->ri_id);
944         if (!rc) {
945                 /* returns without wait if not connected */
946                 wait_event_interruptible(ep->rep_connect_wait,
947                                                         ep->rep_connected != 1);
948                 dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
949                         (ep->rep_connected == 1) ? "still " : "dis");
950         } else {
951                 dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
952                 ep->rep_connected = rc;
953         }
954         return rc;
955 }
956
957 /*
958  * Initialize buffer memory
959  */
960 int
961 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
962         struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
963 {
964         char *p;
965         size_t len, rlen, wlen;
966         int i, rc;
967         struct rpcrdma_mw *r;
968
969         buf->rb_max_requests = cdata->max_requests;
970         spin_lock_init(&buf->rb_lock);
971         atomic_set(&buf->rb_credits, 1);
972
973         /* Need to allocate:
974          *   1.  arrays for send and recv pointers
975          *   2.  arrays of struct rpcrdma_req to fill in pointers
976          *   3.  array of struct rpcrdma_rep for replies
977          *   4.  padding, if any
978          *   5.  mw's, fmr's or frmr's, if any
979          * Send/recv buffers in req/rep need to be registered
980          */
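        /* The single rb_pool allocation below is consumed in this order:
         *   [req pointer array][rep pointer array][pad buffer][mw structs]
         * The req and rep buffers themselves are kmalloc'ed individually
         * further down.
         */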
981
982         len = buf->rb_max_requests *
983                 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
984         len += cdata->padding;
985         switch (ia->ri_memreg_strategy) {
986         case RPCRDMA_FRMR:
987                 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
988                                 sizeof(struct rpcrdma_mw);
989                 break;
990         case RPCRDMA_MTHCAFMR:
991                 /* TBD we are perhaps overallocating here */
992                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
993                                 sizeof(struct rpcrdma_mw);
994                 break;
995         default:
996                 break;
997         }
998
999         /* allocate 1, 4 and 5 in one shot */
1000         p = kzalloc(len, GFP_KERNEL);
1001         if (p == NULL) {
1002                 dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1003                         __func__, len);
1004                 rc = -ENOMEM;
1005                 goto out;
1006         }
1007         buf->rb_pool = p;       /* for freeing it later */
1008
1009         buf->rb_send_bufs = (struct rpcrdma_req **) p;
1010         p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1011         buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1012         p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1013
1014         /*
1015          * Register the zeroed pad buffer, if any.
1016          */
1017         if (cdata->padding) {
1018                 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1019                                             &ep->rep_pad_mr, &ep->rep_pad);
1020                 if (rc)
1021                         goto out;
1022         }
1023         p += cdata->padding;
1024
1025         INIT_LIST_HEAD(&buf->rb_mws);
1026         r = (struct rpcrdma_mw *)p;
1027         switch (ia->ri_memreg_strategy) {
1028         case RPCRDMA_FRMR:
1029                 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1030                         r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1031                                                 ia->ri_max_frmr_depth);
1032                         if (IS_ERR(r->r.frmr.fr_mr)) {
1033                                 rc = PTR_ERR(r->r.frmr.fr_mr);
1034                                 dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1035                                         " failed %i\n", __func__, rc);
1036                                 goto out;
1037                         }
1038                         r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1039                                                 ia->ri_id->device,
1040                                                 ia->ri_max_frmr_depth);
1041                         if (IS_ERR(r->r.frmr.fr_pgl)) {
1042                                 rc = PTR_ERR(r->r.frmr.fr_pgl);
1043                                 dprintk("RPC:       %s: "
1044                                         "ib_alloc_fast_reg_page_list "
1045                                         "failed %i\n", __func__, rc);
1046
1047                                 ib_dereg_mr(r->r.frmr.fr_mr);
1048                                 goto out;
1049                         }
1050                         list_add(&r->mw_list, &buf->rb_mws);
1051                         ++r;
1052                 }
1053                 break;
1054         case RPCRDMA_MTHCAFMR:
1055                 /* TBD we are perhaps overallocating here */
1056                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1057                         static struct ib_fmr_attr fa =
1058                                 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1059                         r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1060                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1061                                 &fa);
1062                         if (IS_ERR(r->r.fmr)) {
1063                                 rc = PTR_ERR(r->r.fmr);
1064                                 dprintk("RPC:       %s: ib_alloc_fmr"
1065                                         " failed %i\n", __func__, rc);
1066                                 goto out;
1067                         }
1068                         list_add(&r->mw_list, &buf->rb_mws);
1069                         ++r;
1070                 }
1071                 break;
1072         default:
1073                 break;
1074         }
1075
1076         /*
1077          * Allocate/init the request/reply buffers. Doing this
1078          * using kmalloc for now -- one for each buf.
1079          */
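        /* Each size is rounded up, via fls, to a power of two large
         * enough to hold the inline payload plus the rpcrdma_req/rep
         * structure that precedes it.
         */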
1080         wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1081         rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1082         dprintk("RPC:       %s: wlen = %zu, rlen = %zu\n",
1083                 __func__, wlen, rlen);
1084
1085         for (i = 0; i < buf->rb_max_requests; i++) {
1086                 struct rpcrdma_req *req;
1087                 struct rpcrdma_rep *rep;
1088
1089                 req = kmalloc(wlen, GFP_KERNEL);
1090                 if (req == NULL) {
1091                         dprintk("RPC:       %s: request buffer %d alloc"
1092                                 " failed\n", __func__, i);
1093                         rc = -ENOMEM;
1094                         goto out;
1095                 }
1096                 memset(req, 0, sizeof(struct rpcrdma_req));
1097                 buf->rb_send_bufs[i] = req;
1098                 buf->rb_send_bufs[i]->rl_buffer = buf;
1099
1100                 rc = rpcrdma_register_internal(ia, req->rl_base,
1101                                 wlen - offsetof(struct rpcrdma_req, rl_base),
1102                                 &buf->rb_send_bufs[i]->rl_handle,
1103                                 &buf->rb_send_bufs[i]->rl_iov);
1104                 if (rc)
1105                         goto out;
1106
1107                 buf->rb_send_bufs[i]->rl_size = wlen -
1108                                                 sizeof(struct rpcrdma_req);
1109
1110                 rep = kmalloc(rlen, GFP_KERNEL);
1111                 if (rep == NULL) {
1112                         dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1113                                 __func__, i);
1114                         rc = -ENOMEM;
1115                         goto out;
1116                 }
1117                 memset(rep, 0, sizeof(struct rpcrdma_rep));
1118                 buf->rb_recv_bufs[i] = rep;
1119                 buf->rb_recv_bufs[i]->rr_buffer = buf;
1120
1121                 rc = rpcrdma_register_internal(ia, rep->rr_base,
1122                                 rlen - offsetof(struct rpcrdma_rep, rr_base),
1123                                 &buf->rb_recv_bufs[i]->rr_handle,
1124                                 &buf->rb_recv_bufs[i]->rr_iov);
1125                 if (rc)
1126                         goto out;
1127
1128         }
1129         dprintk("RPC:       %s: max_requests %d\n",
1130                 __func__, buf->rb_max_requests);
1131         /* done */
1132         return 0;
1133 out:
1134         rpcrdma_buffer_destroy(buf);
1135         return rc;
1136 }
1137
1138 /*
1139  * Unregister and destroy buffer memory. Need to deal with
1140  * partial initialization, so it's callable from failed create.
1141  * Must be called before destroying endpoint, as registrations
1142  * reference it.
1143  */
1144 void
1145 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1146 {
1147         int rc, i;
1148         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1149         struct rpcrdma_mw *r;
1150
1151         /* clean up in reverse order from create
1152          *   1.  recv mr memory (mr free, then kfree)
1153          *   2.  send mr memory (mr free, then kfree)
1154          *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1155          *   4.  arrays
1156          */
1157         dprintk("RPC:       %s: entering\n", __func__);
1158
1159         for (i = 0; i < buf->rb_max_requests; i++) {
1160                 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1161                         rpcrdma_deregister_internal(ia,
1162                                         buf->rb_recv_bufs[i]->rr_handle,
1163                                         &buf->rb_recv_bufs[i]->rr_iov);
1164                         kfree(buf->rb_recv_bufs[i]);
1165                 }
1166                 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1167                         rpcrdma_deregister_internal(ia,
1168                                         buf->rb_send_bufs[i]->rl_handle,
1169                                         &buf->rb_send_bufs[i]->rl_iov);
1170                         kfree(buf->rb_send_bufs[i]);
1171                 }
1172         }
1173
1174         while (!list_empty(&buf->rb_mws)) {
1175                 r = list_entry(buf->rb_mws.next,
1176                         struct rpcrdma_mw, mw_list);
1177                 list_del(&r->mw_list);
1178                 switch (ia->ri_memreg_strategy) {
1179                 case RPCRDMA_FRMR:
1180                         rc = ib_dereg_mr(r->r.frmr.fr_mr);
1181                         if (rc)
1182                                 dprintk("RPC:       %s:"
1183                                         " ib_dereg_mr"
1184                                         " failed %i\n",
1185                                         __func__, rc);
1186                         ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1187                         break;
1188                 case RPCRDMA_MTHCAFMR:
1189                         rc = ib_dealloc_fmr(r->r.fmr);
1190                         if (rc)
1191                                 dprintk("RPC:       %s:"
1192                                         " ib_dealloc_fmr"
1193                                         " failed %i\n",
1194                                         __func__, rc);
1195                         break;
1196                 default:
1197                         break;
1198                 }
1199         }
1200
1201         kfree(buf->rb_pool);
1202 }
1203
1204 /*
1205  * Get a set of request/reply buffers.
1206  *
1207  * Reply buffer (if needed) is attached to send buffer upon return.
1208  * Rule:
1209  *    rb_send_index and rb_recv_index MUST always be pointing to the
1210  *    *next* available buffer (non-NULL). They are incremented after
1211  *    removing buffers, and decremented *before* returning them.
1212  */
1213 struct rpcrdma_req *
1214 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1215 {
1216         struct rpcrdma_req *req;
1217         unsigned long flags;
1218         int i;
1219         struct rpcrdma_mw *r;
1220
1221         spin_lock_irqsave(&buffers->rb_lock, flags);
1222         if (buffers->rb_send_index == buffers->rb_max_requests) {
1223                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1224                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1225                 return ((struct rpcrdma_req *)NULL);
1226         }
1227
1228         req = buffers->rb_send_bufs[buffers->rb_send_index];
1229         if (buffers->rb_send_index < buffers->rb_recv_index) {
1230                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1231                         __func__,
1232                         buffers->rb_recv_index - buffers->rb_send_index);
1233                 req->rl_reply = NULL;
1234         } else {
1235                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1236                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1237         }
1238         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1239         if (!list_empty(&buffers->rb_mws)) {
1240                 i = RPCRDMA_MAX_SEGS - 1;
1241                 do {
1242                         r = list_entry(buffers->rb_mws.next,
1243                                         struct rpcrdma_mw, mw_list);
1244                         list_del(&r->mw_list);
1245                         req->rl_segments[i].mr_chunk.rl_mw = r;
1246                 } while (--i >= 0);
1247         }
1248         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1249         return req;
1250 }
1251
1252 /*
1253  * Put request/reply buffers back into pool.
1254  * Pre-decrement counter/array index.
1255  */
1256 void
1257 rpcrdma_buffer_put(struct rpcrdma_req *req)
1258 {
1259         struct rpcrdma_buffer *buffers = req->rl_buffer;
1260         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1261         int i;
1262         unsigned long flags;
1263
1264         spin_lock_irqsave(&buffers->rb_lock, flags);
1265         buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1266         req->rl_niovs = 0;
1267         if (req->rl_reply) {
1268                 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1269                 req->rl_reply->rr_func = NULL;
1270                 req->rl_reply = NULL;
1271         }
1272         switch (ia->ri_memreg_strategy) {
1273         case RPCRDMA_FRMR:
1274         case RPCRDMA_MTHCAFMR:
1275                 /*
1276                  * Cycle mw's back in reverse order, and "spin" them.
1277                  * This delays and scrambles reuse as much as possible.
1278                  */
1279                 i = 1;
1280                 do {
1281                         struct rpcrdma_mw **mw;
1282                         mw = &req->rl_segments[i].mr_chunk.rl_mw;
1283                         list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1284                         *mw = NULL;
1285                 } while (++i < RPCRDMA_MAX_SEGS);
1286                 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1287                                         &buffers->rb_mws);
1288                 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1289                 break;
1290         default:
1291                 break;
1292         }
1293         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1294 }
1295
1296 /*
1297  * Recover reply buffers from pool.
1298  * This happens when recovering from error conditions.
1299  * Post-increment counter/array index.
1300  */
1301 void
1302 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1303 {
1304         struct rpcrdma_buffer *buffers = req->rl_buffer;
1305         unsigned long flags;
1306
1307         if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1308                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1309         spin_lock_irqsave(&buffers->rb_lock, flags);
1310         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1311                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1312                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1313         }
1314         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1315 }
1316
1317 /*
1318  * Put reply buffers back into pool when not attached to
1319  * request. This happens in error conditions.
1320  */
1321 void
1322 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1323 {
1324         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1325         unsigned long flags;
1326
1327         rep->rr_func = NULL;
1328         spin_lock_irqsave(&buffers->rb_lock, flags);
1329         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1330         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1331 }
1332
1333 /*
1334  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1335  */
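/*
 * rpcrdma_register_internal() chooses an lkey in this order: the
 * device's global DMA lkey if available, then the pre-registered
 * ri_bind_mem DMA MR, and only as a last resort a one-off
 * ib_reg_phys_mr() of the single kmalloc'ed buffer.
 */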
1336
1337 int
1338 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1339                                 struct ib_mr **mrp, struct ib_sge *iov)
1340 {
1341         struct ib_phys_buf ipb;
1342         struct ib_mr *mr;
1343         int rc;
1344
1345         /*
1346          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1347          */
1348         iov->addr = ib_dma_map_single(ia->ri_id->device,
1349                         va, len, DMA_BIDIRECTIONAL);
1350         iov->length = len;
1351
1352         if (ia->ri_have_dma_lkey) {
1353                 *mrp = NULL;
1354                 iov->lkey = ia->ri_dma_lkey;
1355                 return 0;
1356         } else if (ia->ri_bind_mem != NULL) {
1357                 *mrp = NULL;
1358                 iov->lkey = ia->ri_bind_mem->lkey;
1359                 return 0;
1360         }
1361
1362         ipb.addr = iov->addr;
1363         ipb.size = iov->length;
1364         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1365                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1366
1367         dprintk("RPC:       %s: phys convert: 0x%llx "
1368                         "registered 0x%llx length %d\n",
1369                         __func__, (unsigned long long)ipb.addr,
1370                         (unsigned long long)iov->addr, len);
1371
1372         if (IS_ERR(mr)) {
1373                 *mrp = NULL;
1374                 rc = PTR_ERR(mr);
1375                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1376         } else {
1377                 *mrp = mr;
1378                 iov->lkey = mr->lkey;
1379                 rc = 0;
1380         }
1381
1382         return rc;
1383 }
1384
1385 int
1386 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1387                                 struct ib_mr *mr, struct ib_sge *iov)
1388 {
1389         int rc;
1390
1391         ib_dma_unmap_single(ia->ri_id->device,
1392                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1393
1394         if (NULL == mr)
1395                 return 0;
1396
1397         rc = ib_dereg_mr(mr);
1398         if (rc)
1399                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1400         return rc;
1401 }
1402
1403 /*
1404  * Wrappers for chunk registration, shared by read/write chunk code.
1405  */
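/*
 * rpcrdma_map_one()/rpcrdma_unmap_one() DMA-map a single rpcrdma_mr_seg,
 * using ib_dma_map_page() for page-based segments and ib_dma_map_single()
 * for kva-based ones.  The direction follows the "writing" flag:
 * DMA_FROM_DEVICE when the memory will be written remotely,
 * DMA_TO_DEVICE otherwise.
 */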
1406
1407 static void
1408 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1409 {
1410         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1411         seg->mr_dmalen = seg->mr_len;
1412         if (seg->mr_page)
1413                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1414                                 seg->mr_page, offset_in_page(seg->mr_offset),
1415                                 seg->mr_dmalen, seg->mr_dir);
1416         else
1417                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1418                                 seg->mr_offset,
1419                                 seg->mr_dmalen, seg->mr_dir);
1420         if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1421                 dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1422                         __func__,
1423                         (unsigned long long)seg->mr_dma,
1424                         seg->mr_offset, seg->mr_dmalen);
1425         }
1426 }
1427
static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
        if (seg->mr_page)
                ib_dma_unmap_page(ia->ri_id->device,
                                seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
        else
                ib_dma_unmap_single(ia->ri_id->device,
                                seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

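/*
 * rpcrdma_register_frmr_external - register a chunk via a Fast
 * Registration Memory Region. DMA-map up to ri_max_frmr_depth
 * segments, load their page addresses into the FRMR's page list,
 * then post a FAST_REG_MR work request, preceded by a LOCAL_INV if
 * the FRMR was left valid by a previous request.
 */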
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                        int *nsegs, int writing, struct rpcrdma_ia *ia,
                        struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_mr_seg *seg1 = seg;
        struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;

        u8 key;
        int len, pageoff;
        int i, rc;
        int seg_len;
        u64 pa;
        int page_no;

        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
        seg1->mr_len += pageoff;
        len = -pageoff;
        if (*nsegs > ia->ri_max_frmr_depth)
                *nsegs = ia->ri_max_frmr_depth;
        for (page_no = i = 0; i < *nsegs;) {
                rpcrdma_map_one(ia, seg, writing);
                pa = seg->mr_dma;
                for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
                        seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
                                page_list[page_no++] = pa;
                        pa += PAGE_SIZE;
                }
                len += seg->mr_len;
                ++seg;
                ++i;
                /* Check for holes: stop early if the next segment does not
                 * begin on a page boundary, or this one does not end on one. */
                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
                        break;
        }
        dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
                __func__, seg1->mr_chunk.rl_mw, i);

        if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
                dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
                        __func__,
                        seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
                /* Invalidate before using. */
                memset(&invalidate_wr, 0, sizeof invalidate_wr);
                invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
                invalidate_wr.next = &frmr_wr;
                invalidate_wr.opcode = IB_WR_LOCAL_INV;
                invalidate_wr.send_flags = IB_SEND_SIGNALED;
                invalidate_wr.ex.invalidate_rkey =
                        seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
                DECR_CQCOUNT(&r_xprt->rx_ep);
                post_wr = &invalidate_wr;
        } else
                post_wr = &frmr_wr;

        /* Prepare FRMR WR */
        memset(&frmr_wr, 0, sizeof frmr_wr);
        frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
        frmr_wr.opcode = IB_WR_FAST_REG_MR;
        frmr_wr.send_flags = IB_SEND_SIGNALED;
        frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
        frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
        frmr_wr.wr.fast_reg.page_list_len = page_no;
        frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
        frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
        if (frmr_wr.wr.fast_reg.length < len) {
                /* Length overflow: unwind only the segments mapped above;
                 * seg1->mr_nsegs has not been set yet at this point. */
                while (i--)
                        rpcrdma_unmap_one(ia, --seg);
                return -EIO;
        }

        /* Bump the key */
        key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

        frmr_wr.wr.fast_reg.access_flags = (writing ?
                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
                                IB_ACCESS_REMOTE_READ);
        frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);

        rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);

        if (rc) {
                dprintk("RPC:       %s: failed ib_post_send for register,"
                        " status %i\n", __func__, rc);
                while (i--)
                        rpcrdma_unmap_one(ia, --seg);
        } else {
                seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
                seg1->mr_base = seg1->mr_dma + pageoff;
                seg1->mr_nsegs = i;
                seg1->mr_len = len;
        }
        *nsegs = i;
        return rc;
}

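/*
 * rpcrdma_deregister_frmr_external - unmap the segments registered
 * above and post a LOCAL_INV work request to invalidate the FRMR.
 */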
static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
                        struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_mr_seg *seg1 = seg;
        struct ib_send_wr invalidate_wr, *bad_wr;
        int rc;

        while (seg1->mr_nsegs--)
                rpcrdma_unmap_one(ia, seg++);

        memset(&invalidate_wr, 0, sizeof invalidate_wr);
        invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
        invalidate_wr.opcode = IB_WR_LOCAL_INV;
        invalidate_wr.send_flags = IB_SEND_SIGNALED;
        invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);

        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
        if (rc)
                dprintk("RPC:       %s: failed ib_post_send for invalidate,"
                        " status %i\n", __func__, rc);
        return rc;
}

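/*
 * rpcrdma_register_fmr_external - register a chunk using an FMR:
 * DMA-map up to RPCRDMA_MAX_DATA_SEGS segments and hand their
 * physical addresses to ib_map_phys_fmr().
 */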
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
                        int *nsegs, int writing, struct rpcrdma_ia *ia)
{
        struct rpcrdma_mr_seg *seg1 = seg;
        u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
        int len, pageoff, i, rc;

        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
        seg1->mr_len += pageoff;
        len = -pageoff;
        if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
                *nsegs = RPCRDMA_MAX_DATA_SEGS;
        for (i = 0; i < *nsegs;) {
                rpcrdma_map_one(ia, seg, writing);
                physaddrs[i] = seg->mr_dma;
                len += seg->mr_len;
                ++seg;
                ++i;
                /* Check for holes: stop early if the next segment does not
                 * begin on a page boundary, or this one does not end on one. */
                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
                        break;
        }
        rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
                                physaddrs, i, seg1->mr_dma);
        if (rc) {
                dprintk("RPC:       %s: failed ib_map_phys_fmr "
                        "%u@0x%llx+%i (%d)... status %i\n", __func__,
                        len, (unsigned long long)seg1->mr_dma,
                        pageoff, i, rc);
                while (i--)
                        rpcrdma_unmap_one(ia, --seg);
        } else {
                seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
                seg1->mr_base = seg1->mr_dma + pageoff;
                seg1->mr_nsegs = i;
                seg1->mr_len = len;
        }
        *nsegs = i;
        return rc;
}

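/*
 * rpcrdma_deregister_fmr_external - unmap the FMR, then unmap the
 * DMA-mapped segments it covered.
 */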
static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
                        struct rpcrdma_ia *ia)
{
        struct rpcrdma_mr_seg *seg1 = seg;
        LIST_HEAD(l);
        int rc;

        list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
        rc = ib_unmap_fmr(&l);
        while (seg1->mr_nsegs--)
                rpcrdma_unmap_one(ia, seg++);
        if (rc)
                dprintk("RPC:       %s: failed ib_unmap_fmr,"
                        " status %i\n", __func__, rc);
        return rc;
}

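/*
 * rpcrdma_register_external - register a chunk for RDMA, dispatching
 * on the memory registration strategy selected at transport setup.
 * Returns the number of segments actually registered, or -1 on error.
 */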
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
                        int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        int rc = 0;

        switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
                rpcrdma_map_one(ia, seg, writing);
                seg->mr_rkey = ia->ri_bind_mem->rkey;
                seg->mr_base = seg->mr_dma;
                seg->mr_nsegs = 1;
                nsegs = 1;
                break;
#endif

        /* Registration using fast registration memory regions (FRMR) */
        case RPCRDMA_FRMR:
                rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
                break;

        /* Registration using fast memory regions (FMR) */
        case RPCRDMA_MTHCAFMR:
                rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
                break;

        default:
                return -1;
        }
        if (rc)
                return -1;

        return nsegs;
}

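/*
 * rpcrdma_deregister_external - release a chunk registered above,
 * again dispatching on the memory registration strategy in use.
 */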
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
                struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        int nsegs = seg->mr_nsegs, rc;

        switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
                rpcrdma_unmap_one(ia, seg);
                break;
#endif

        case RPCRDMA_FRMR:
                rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
                break;

        case RPCRDMA_MTHCAFMR:
                rc = rpcrdma_deregister_fmr_external(seg, ia);
                break;

        default:
                break;
        }
        return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
                struct rpcrdma_ep *ep,
                struct rpcrdma_req *req)
{
        struct ib_send_wr send_wr, *send_wr_fail;
        struct rpcrdma_rep *rep = req->rl_reply;
        int rc;

        if (rep) {
                rc = rpcrdma_ep_post_recv(ia, ep, rep);
                if (rc)
                        goto out;
                req->rl_reply = NULL;
        }

        send_wr.next = NULL;
        send_wr.wr_id = 0ULL;   /* no send cookie */
        send_wr.sg_list = req->rl_send_iov;
        send_wr.num_sge = req->rl_niovs;
        send_wr.opcode = IB_WR_SEND;
        /* Sync the SGEs the CPU has written; when there are four SGEs,
         * sge[2] is the constant zero pad and needs no sync, but the
         * trailing data in sge[3] does. */
        if (send_wr.num_sge == 4)
                ib_dma_sync_single_for_device(ia->ri_id->device,
                        req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
                        DMA_TO_DEVICE);
        ib_dma_sync_single_for_device(ia->ri_id->device,
                req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
                DMA_TO_DEVICE);
        ib_dma_sync_single_for_device(ia->ri_id->device,
                req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
                DMA_TO_DEVICE);

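        /* Post sends unsignaled while CQ credits remain; DECR_CQCOUNT
         * tracks how many more sends may be posted before a completion
         * must be requested to keep the send CQ from overflowing. */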
        if (DECR_CQCOUNT(ep) > 0)
                send_wr.send_flags = 0;
        else { /* Provider must take a send completion every now and then */
                INIT_CQCOUNT(ep);
                send_wr.send_flags = IB_SEND_SIGNALED;
        }

        rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
        if (rc)
                dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
                        rc);
out:
        return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
                     struct rpcrdma_ep *ep,
                     struct rpcrdma_rep *rep)
{
        struct ib_recv_wr recv_wr, *recv_wr_fail;
        int rc;

        recv_wr.next = NULL;
        recv_wr.wr_id = (u64) (unsigned long) rep;
        recv_wr.sg_list = &rep->rr_iov;
        recv_wr.num_sge = 1;

        ib_dma_sync_single_for_cpu(ia->ri_id->device,
                rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

        rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

        if (rc)
                dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
                        rc);
        return rc;
}