ehci-hcd: Boost transfer speed
authorBenoît Thébaudeau <benoit.thebaudeau@advansee.com>
Fri, 10 Aug 2012 16:22:32 +0000 (18:22 +0200)
committerMarek Vasut <marex@denx.de>
Sat, 1 Sep 2012 14:21:52 +0000 (16:21 +0200)
This patch takes advantage of the hardware EHCI qTD queuing mechanism to avoid
software and transfer splitting overhead so as to make transfers as fast as
possible.

The only drawback is a call to memalign. However, this is fast compared to the
transfer timings, and the heap size to allocate is small, e.g. 128 kiB in the
worst case for a transfer length of 65535 packets of 512 bytes.

Tested on i.MX25, i.MX35 and i.MX51. In my test conditions, the speed gain was
very significant (several times faster), which is really appreciable when
accessing large files.

Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
Cc: Marek Vasut <marex@denx.de>
Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
drivers/usb/host/ehci-hcd.c

index 00a155b64dca57f0c899e78c14bc802298b9c59e..a0ef5db43fb619d70ab1b869e58430173ed1b01b 100644 (file)
@@ -208,8 +208,8 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
                   int length, struct devrequest *req)
 {
        ALLOC_ALIGN_BUFFER(struct QH, qh, 1, USB_DMA_MINALIGN);
-#define QTD_COUNT      3
-       ALLOC_ALIGN_BUFFER(struct qTD, qtd, QTD_COUNT, USB_DMA_MINALIGN);
+       struct qTD *qtd;
+       int qtd_count = 0;
        int qtd_counter = 0;
 
        volatile struct qTD *vtd;
@@ -230,8 +230,74 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
                      le16_to_cpu(req->value), le16_to_cpu(req->value),
                      le16_to_cpu(req->index));
 
+       /*
+        * The USB transfer is split into qTD transfers. Eeach qTD transfer is
+        * described by a transfer descriptor (the qTD). The qTDs form a linked
+        * list with a queue head (QH).
+        *
+        * Each qTD transfer starts with a new USB packet, i.e. a packet cannot
+        * have its beginning in a qTD transfer and its end in the following
+        * one, so the qTD transfer lengths have to be chosen accordingly.
+        *
+        * Each qTD transfer uses up to QT_BUFFER_CNT data buffers, mapped to
+        * single pages. The first data buffer can start at any offset within a
+        * page (not considering the cache-line alignment issues), while the
+        * following buffers must be page-aligned. There is no alignment
+        * constraint on the size of a qTD transfer.
+        */
+       if (req != NULL)
+               /* 1 qTD will be needed for SETUP, and 1 for ACK. */
+               qtd_count += 1 + 1;
+       if (length > 0 || req == NULL) {
+               /*
+                * Determine the qTD transfer size that will be used for the
+                * data payload (not considering the final qTD transfer, which
+                * may be shorter).
+                *
+                * In order to keep each packet within a qTD transfer, the qTD
+                * transfer size is aligned to EHCI_PAGE_SIZE, which is a
+                * multiple of wMaxPacketSize (except in some cases for
+                * interrupt transfers, see comment in submit_int_msg()).
+                *
+                * By default, i.e. if the input buffer is page-aligned,
+                * QT_BUFFER_CNT full pages will be used.
+                */
+               int xfr_sz = QT_BUFFER_CNT;
+               /*
+                * However, if the input buffer is not page-aligned, the qTD
+                * transfer size will be one page shorter, and the first qTD
+                * data buffer of each transfer will be page-unaligned.
+                */
+               if ((uint32_t)buffer & (EHCI_PAGE_SIZE - 1))
+                       xfr_sz--;
+               /* Convert the qTD transfer size to bytes. */
+               xfr_sz *= EHCI_PAGE_SIZE;
+               /*
+                * Determine the number of qTDs that will be required for the
+                * data payload. This value has to be rounded up since the final
+                * qTD transfer may be shorter than the regular qTD transfer
+                * size that has just been computed.
+                */
+               qtd_count += DIV_ROUND_UP(length, xfr_sz);
+               /* ZLPs also need a qTD. */
+               if (!qtd_count)
+                       qtd_count++;
+       }
+/*
+ * Threshold value based on the worst-case total size of the qTDs to allocate
+ * for a mass-storage transfer of 65535 blocks of 512 bytes.
+ */
+#if CONFIG_SYS_MALLOC_LEN <= 128 * 1024
+#warning CONFIG_SYS_MALLOC_LEN may be too small for EHCI
+#endif
+       qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
+       if (qtd == NULL) {
+               printf("unable to allocate TDs\n");
+               return -1;
+       }
+
        memset(qh, 0, sizeof(struct QH));
-       memset(qtd, 0, QTD_COUNT * sizeof(*qtd));
+       memset(qtd, 0, qtd_count * sizeof(*qtd));
 
        toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
 
@@ -290,30 +356,64 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
        }
 
        if (length > 0 || req == NULL) {
-               /*
-                * Setup request qTD (3.5 in ehci-r10.pdf)
-                *
-                *   qt_next ................ 03-00 H
-                *   qt_altnext ............. 07-04 H
-                *   qt_token ............... 0B-08 H
-                *
-                *   [ buffer, buffer_hi ] loaded with "buffer".
-                */
-               qtd[qtd_counter].qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
-               qtd[qtd_counter].qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
-               token = QT_TOKEN_DT(toggle) | QT_TOKEN_TOTALBYTES(length) |
-                       QT_TOKEN_IOC(req == NULL) | QT_TOKEN_CPAGE(0) |
-                       QT_TOKEN_CERR(3) | QT_TOKEN_PID(usb_pipein(pipe) ?
-                               QT_TOKEN_PID_IN : QT_TOKEN_PID_OUT) |
-                       QT_TOKEN_STATUS(QT_TOKEN_STATUS_ACTIVE);
-               qtd[qtd_counter].qt_token = cpu_to_hc32(token);
-               if (ehci_td_buffer(&qtd[qtd_counter], buffer, length)) {
-                       printf("unable to construct DATA TD\n");
-                       goto fail;
-               }
-               /* Update previous qTD! */
-               *tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
-               tdp = &qtd[qtd_counter++].qt_next;
+               uint8_t *buf_ptr = buffer;
+               int left_length = length;
+
+               do {
+                       /*
+                        * Determine the size of this qTD transfer. By default,
+                        * QT_BUFFER_CNT full pages can be used.
+                        */
+                       int xfr_bytes = QT_BUFFER_CNT * EHCI_PAGE_SIZE;
+                       /*
+                        * However, if the input buffer is not page-aligned, the
+                        * portion of the first page before the buffer start
+                        * offset within that page is unusable.
+                        */
+                       xfr_bytes -= (uint32_t)buf_ptr & (EHCI_PAGE_SIZE - 1);
+                       /*
+                        * In order to keep each packet within a qTD transfer,
+                        * align the qTD transfer size to EHCI_PAGE_SIZE.
+                        */
+                       xfr_bytes &= ~(EHCI_PAGE_SIZE - 1);
+                       /*
+                        * This transfer may be shorter than the available qTD
+                        * transfer size that has just been computed.
+                        */
+                       xfr_bytes = min(xfr_bytes, left_length);
+
+                       /*
+                        * Setup request qTD (3.5 in ehci-r10.pdf)
+                        *
+                        *   qt_next ................ 03-00 H
+                        *   qt_altnext ............. 07-04 H
+                        *   qt_token ............... 0B-08 H
+                        *
+                        *   [ buffer, buffer_hi ] loaded with "buffer".
+                        */
+                       qtd[qtd_counter].qt_next =
+                                       cpu_to_hc32(QT_NEXT_TERMINATE);
+                       qtd[qtd_counter].qt_altnext =
+                                       cpu_to_hc32(QT_NEXT_TERMINATE);
+                       token = QT_TOKEN_DT(toggle) |
+                               QT_TOKEN_TOTALBYTES(xfr_bytes) |
+                               QT_TOKEN_IOC(req == NULL) | QT_TOKEN_CPAGE(0) |
+                               QT_TOKEN_CERR(3) |
+                               QT_TOKEN_PID(usb_pipein(pipe) ?
+                                       QT_TOKEN_PID_IN : QT_TOKEN_PID_OUT) |
+                               QT_TOKEN_STATUS(QT_TOKEN_STATUS_ACTIVE);
+                       qtd[qtd_counter].qt_token = cpu_to_hc32(token);
+                       if (ehci_td_buffer(&qtd[qtd_counter], buf_ptr,
+                                               xfr_bytes)) {
+                               printf("unable to construct DATA TD\n");
+                               goto fail;
+                       }
+                       /* Update previous qTD! */
+                       *tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
+                       tdp = &qtd[qtd_counter++].qt_next;
+                       buf_ptr += xfr_bytes;
+                       left_length -= xfr_bytes;
+               } while (left_length > 0);
        }
 
        if (req != NULL) {
@@ -344,7 +444,7 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
                ALIGN_END_ADDR(struct QH, qh_list, 1));
        flush_dcache_range((uint32_t)qh, ALIGN_END_ADDR(struct QH, qh, 1));
        flush_dcache_range((uint32_t)qtd,
-                          ALIGN_END_ADDR(struct qTD, qtd, QTD_COUNT));
+                          ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
 
        /* Set async. queue head pointer. */
        ehci_writel(&hcor->or_asynclistaddr, (uint32_t)qh_list);
@@ -375,7 +475,7 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
                invalidate_dcache_range((uint32_t)qh,
                        ALIGN_END_ADDR(struct QH, qh, 1));
                invalidate_dcache_range((uint32_t)qtd,
-                       ALIGN_END_ADDR(struct qTD, qtd, QTD_COUNT));
+                       ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
 
                token = hc32_to_cpu(vtd->qt_token);
                if (!(QT_TOKEN_GET_STATUS(token) & QT_TOKEN_STATUS_ACTIVE))
@@ -448,9 +548,11 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
                      ehci_readl(&hcor->or_portsc[1]));
        }
 
+       free(qtd);
        return (dev->status != USB_ST_NOT_PROC) ? 0 : -1;
 
 fail:
+       free(qtd);
        return -1;
 }
 
@@ -827,6 +929,13 @@ submit_int_msg(struct usb_device *dev, unsigned long pipe, void *buffer,
        /*
         * Interrupt transfers requiring several transactions are not supported
         * because bInterval is ignored.
+        *
+        * Also, ehci_submit_async() relies on wMaxPacketSize being a power of 2
+        * if several qTDs are required, while the USB specification does not
+        * constrain this for interrupt transfers. That means that
+        * ehci_submit_async() would support interrupt transfers requiring
+        * several transactions only as long as the transfer size does not
+        * require more than a single qTD.
         */
        if (length > usb_maxpacket(dev, pipe)) {
                printf("%s: Interrupt transfers requiring several transactions "