kernel: refresh patches
[oweals/openwrt.git] / target / linux / brcm2708 / patches-3.14 / 0026-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
1 From c2731f282848af32425043a2df88c1289538983e Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
3 Date: Mon, 17 Jun 2013 16:00:25 +0300
4 Subject: [PATCH 26/54] bcm2708_fb: DMA acceleration for fb_copyarea
5
6 Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
7 Also used Simon's dmaer_master module as a reference for tweaking DMA
8 settings for better performance.
9
10 For now busylooping only. IRQ support might be added later.
11 With a non-overclocked Raspberry Pi, the performance is ~360 MB/s
12 for a simple copy or ~260 MB/s for a two-pass copy (used when dragging
13 windows to the right).
14
15 In the case of using DMA channel 0, the performance improves
16 to ~440 MB/s.
17
18 For comparison, VFP optimized CPU copy can only do ~114 MB/s in
19 the same conditions (hindered by reading uncached source buffer).
20
21 Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
22
23 bcm2708_fb: report number of dma copies
24
25 Add a counter (exported via debugfs) reporting the
26 number of dma copies that the framebuffer driver
27 has done, in order to help evaluate different
28 optimization strategies.
29
30 Signed-off-by: Luke Diamand <luked@broadcom.com>
31
32 bcm2708_fb: use IRQ for DMA copies
33
34 The copyarea ioctl() uses DMA to speed things along. This
35 was busy-waiting for completion. This change supports using
36 an interrupt instead for larger transfers. For small
37 transfers, busy-waiting is still likely to be faster.
38
39 Signed-off-by: Luke Diamand <luke@diamand.org>
40 ---
41  arch/arm/mach-bcm2708/dma.c              |   8 +
42  arch/arm/mach-bcm2708/include/mach/dma.h |   2 +
43  drivers/video/bcm2708_fb.c               | 273 ++++++++++++++++++++++++++++++-
44  3 files changed, 278 insertions(+), 5 deletions(-)
45
46 --- a/arch/arm/mach-bcm2708/dma.c
47 +++ b/arch/arm/mach-bcm2708/dma.c
48 @@ -83,6 +83,14 @@ extern void bcm_dma_wait_idle(void __iom
49  
50  EXPORT_SYMBOL_GPL(bcm_dma_start);
51  
52 +extern bool bcm_dma_is_busy(void __iomem *dma_chan_base)
53 +{
54 +       dsb(); /* barrier first, then report the ACTIVE bit of this channel's CS register */
55 +
56 +       return readl(dma_chan_base + BCM2708_DMA_CS) & BCM2708_DMA_ACTIVE;
57 +}
58 +EXPORT_SYMBOL_GPL(bcm_dma_is_busy);
59 +
60  /* Complete an ongoing DMA (assuming its results are to be ignored)
61     Does nothing if there is no DMA in progress.
62     This routine waits for the current AXI transfer to complete before
63 --- a/arch/arm/mach-bcm2708/include/mach/dma.h
64 +++ b/arch/arm/mach-bcm2708/include/mach/dma.h
65 @@ -62,11 +62,13 @@ struct bcm2708_dma_cb {
66         unsigned long next;
67         unsigned long pad[2];
68  };
69 +struct scatterlist;
70  
71  extern int bcm_sg_suitable_for_dma(struct scatterlist *sg_ptr, int sg_len);
72  extern void bcm_dma_start(void __iomem *dma_chan_base,
73                           dma_addr_t control_block);
74  extern void bcm_dma_wait_idle(void __iomem *dma_chan_base);
75 +extern bool bcm_dma_is_busy(void __iomem *dma_chan_base);
76  extern int /*rc*/ bcm_dma_abort(void __iomem *dma_chan_base);
77  
78  /* When listing features we can ask for when allocating DMA channels give
79 --- a/drivers/video/bcm2708_fb.c
80 +++ b/drivers/video/bcm2708_fb.c
81 @@ -21,13 +21,16 @@
82  #include <linux/mm.h>
83  #include <linux/fb.h>
84  #include <linux/init.h>
85 +#include <linux/interrupt.h>
86  #include <linux/ioport.h>
87  #include <linux/list.h>
88  #include <linux/platform_device.h>
89  #include <linux/clk.h>
90  #include <linux/printk.h>
91  #include <linux/console.h>
92 +#include <linux/debugfs.h>
93  
94 +#include <mach/dma.h>
95  #include <mach/platform.h>
96  #include <mach/vcio.h>
97  
98 @@ -51,6 +54,10 @@ static int fbheight = 480; /* module par
99  static int fbdepth = 16;   /* module parameter */
100  static int fbswap = 0;     /* module parameter */
101  
102 +static u32 dma_busy_wait_threshold = 1<<15; /* in pixels (width * height) */
103 +module_param(dma_busy_wait_threshold, uint, 0644);
104 +MODULE_PARM_DESC(dma_busy_wait_threshold, "Busy-wait for DMA completion below this area");
105 +
106  /* this data structure describes each frame buffer device we find */
107  
108  struct fbinfo_s {
109 @@ -62,16 +69,73 @@ struct fbinfo_s {
110         u16 cmap[256];
111  };
112  
113 +struct bcm2708_fb_stats {
114 +       struct debugfs_regset32 regset; /* backs the debugfs "stats" file */
115 +       u32 dma_copies;         /* copyarea calls completed by DMA */
116 +       u32 dma_irqs;           /* of those, copies that slept on the IRQ */
117 +};
118 +
119  struct bcm2708_fb {
120         struct fb_info fb;
121         struct platform_device *dev;
122         struct fbinfo_s *info;
123         dma_addr_t dma;
124         u32 cmap[16];
125 +       int dma_chan;           /* channel number from bcm_dma_chan_alloc() */
126 +       int dma_irq;            /* IRQ of that channel, see bcm2708_fb_dma_irq() */
127 +       void __iomem *dma_chan_base; /* register base of that channel */
128 +       void *cb_base;          /* 64K buffer: DMA control blocks + scratch area */
129 +       dma_addr_t cb_handle;   /* bus address of cb_base */
130 +       struct dentry *debugfs_dir;
131 +       wait_queue_head_t dma_waitq; /* copyarea sleeps here for large copies */
132 +       struct bcm2708_fb_stats stats;
133  };
134  
135  #define to_bcm2708(info)       container_of(info, struct bcm2708_fb, fb)
136  
137 +static void bcm2708_fb_debugfs_deinit(struct bcm2708_fb *fb)
138 +{
139 +       debugfs_remove_recursive(fb->debugfs_dir); /* directory plus the "stats" file in it */
140 +       fb->debugfs_dir = NULL;
141 +}
142 +
143 +static int bcm2708_fb_debugfs_init(struct bcm2708_fb *fb)
144 +{
145 +       static struct debugfs_reg32 stats_registers[] = { /* name, offset in bcm2708_fb_stats */
146 +               {
147 +                       "dma_copies",
148 +                       offsetof(struct bcm2708_fb_stats, dma_copies)
149 +               },
150 +               {
151 +                       "dma_irqs",
152 +                       offsetof(struct bcm2708_fb_stats, dma_irqs)
153 +               },
154 +       };
155 +
156 +       fb->debugfs_dir = debugfs_create_dir(DRIVER_NAME, NULL);
157 +       if (!fb->debugfs_dir) {
158 +               pr_warn("%s: could not create debugfs entry\n",
159 +                       __func__);
160 +               return -EFAULT;
161 +       }
162 +
163 +       fb->stats.regset.regs = stats_registers;
164 +       fb->stats.regset.nregs = ARRAY_SIZE(stats_registers);
165 +       fb->stats.regset.base = &fb->stats; /* the offsets above are relative to this */
166 +
167 +       if (!debugfs_create_regset32(
168 +               "stats", 0444, fb->debugfs_dir, &fb->stats.regset)) {
169 +               pr_warn("%s: could not create statistics registers\n",
170 +                       __func__);
171 +               goto fail;
172 +       }
173 +       return 0;
174 +
175 +fail:
176 +       bcm2708_fb_debugfs_deinit(fb); /* drop the directory created above */
177 +       return -EFAULT;
178 +}
179 +
180  static int bcm2708_fb_set_bitfields(struct fb_var_screeninfo *var)
181  {
182         int ret = 0;
183 @@ -322,11 +386,148 @@ static void bcm2708_fb_fillrect(struct f
184         cfb_fillrect(info, rect);
185  }
186  
187 +/* Fill in one DMA control block describing a 2D copy; the caller sets cb->next */
188 +static void set_dma_cb(struct bcm2708_dma_cb *cb,
189 +                      int        burst_size,
190 +                      dma_addr_t dst,
191 +                      int        dst_stride,
192 +                      dma_addr_t src,
193 +                      int        src_stride,
194 +                      int        w,
195 +                      int        h)
196 +{
197 +       cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
198 +                  BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
199 +                  BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
200 +       cb->dst = dst;
201 +       cb->src = src;
202 +       /*
203 +        * This is not really obvious from the DMA documentation,
204 +        * but the top 16 bits must be programmed to "height - 1"
205 +        * and not "height" in 2D mode.
206 +        */
207 +       cb->length = ((h - 1) << 16) | w;
208 +       cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
209 +       cb->pad[0] = 0;
210 +       cb->pad[1] = 0;
211 +}
211 +}
212 +
213  static void bcm2708_fb_copyarea(struct fb_info *info,
214                                 const struct fb_copyarea *region)
215  {
216 -       /*print_debug("bcm2708_fb_copyarea\n"); */
217 -       cfb_copyarea(info, region);
218 +       struct bcm2708_fb *fb = to_bcm2708(info);
219 +       struct bcm2708_dma_cb *cb = fb->cb_base;
220 +       int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
221 +       /* Channel 0 supports larger bursts and is a bit faster */
222 +       int burst_size = (fb->dma_chan == 0) ? 8 : 2;
223 +       int pixels = region->width * region->height;
224 +
225 +       /* Fall back to cfb_copyarea() if the mode or the rectangles are out of range */
226 +       if (bytes_per_pixel > 4 ||
227 +           info->var.xres * info->var.yres > 1920 * 1200 ||
228 +           region->width <= 0 || region->width > info->var.xres ||
229 +           region->height <= 0 || region->height > info->var.yres ||
230 +           region->sx < 0 || region->sx >= info->var.xres ||
231 +           region->sy < 0 || region->sy >= info->var.yres ||
232 +           region->dx < 0 || region->dx >= info->var.xres ||
233 +           region->dy < 0 || region->dy >= info->var.yres ||
234 +           region->sx + region->width > info->var.xres ||
235 +           region->dx + region->width > info->var.xres ||
236 +           region->sy + region->height > info->var.yres ||
237 +           region->dy + region->height > info->var.yres) {
238 +               cfb_copyarea(info, region);
239 +               return;
240 +       }
241 +
242 +       if (region->dy == region->sy && region->dx > region->sx) {
243 +               /*
244 +                * A difficult case of overlapped copy. Because DMA can't
245 +                * copy individual scanlines in backwards direction, we need
246 +                * two-pass processing. We do it by programming a chain of dma
247 +                * control blocks in the first 16K part of the buffer and use
248 +                * the remaining 48K as the intermediate temporary scratch
249 +                * buffer. The buffer size is sufficient to handle up to
250 +                * 1920x1200 resolution at 32bpp pixel depth.
251 +                */
252 +               int y;
253 +               dma_addr_t control_block_pa = fb->cb_handle;
254 +               dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
255 +               int scanline_size = bytes_per_pixel * region->width;
256 +               int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
257 +
258 +               for (y = 0; y < region->height; y += scanlines_per_cb) {
259 +                       dma_addr_t src =
260 +                               fb->fb.fix.smem_start +
261 +                               bytes_per_pixel * region->sx +
262 +                               (region->sy + y) * fb->fb.fix.line_length;
263 +                       dma_addr_t dst =
264 +                               fb->fb.fix.smem_start +
265 +                               bytes_per_pixel * region->dx +
266 +                               (region->dy + y) * fb->fb.fix.line_length;
267 +
268 +                       if (region->height - y < scanlines_per_cb)
269 +                               scanlines_per_cb = region->height - y;
270 +
271 +                       set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
272 +                                  src, fb->fb.fix.line_length,
273 +                                  scanline_size, scanlines_per_cb);
274 +                       control_block_pa += sizeof(struct bcm2708_dma_cb);
275 +                       cb->next = control_block_pa;
276 +                       cb++;
277 +
278 +                       set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
279 +                                  scratchbuf, scanline_size,
280 +                                  scanline_size, scanlines_per_cb);
281 +                       control_block_pa += sizeof(struct bcm2708_dma_cb);
282 +                       cb->next = control_block_pa;
283 +                       cb++;
284 +               }
285 +               /* move the pointer back to the last dma control block */
286 +               cb--;
287 +       } else {
288 +               /* A single dma control block is enough. */
289 +               int sy, dy, stride;
290 +               if (region->dy <= region->sy) {
291 +                       /* processing from top to bottom */
292 +                       dy = region->dy;
293 +                       sy = region->sy;
294 +                       stride = fb->fb.fix.line_length;
295 +               } else {
296 +                       /* processing from bottom to top */
297 +                       dy = region->dy + region->height - 1;
298 +                       sy = region->sy + region->height - 1;
299 +                       stride = -fb->fb.fix.line_length;
300 +               }
301 +               set_dma_cb(cb, burst_size,
302 +                          fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
303 +                                                  bytes_per_pixel * region->dx,
304 +                          stride,
305 +                          fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
306 +                                                  bytes_per_pixel * region->sx,
307 +                          stride,
308 +                          region->width * bytes_per_pixel,
309 +                          region->height);
310 +       }
311 +
312 +       /* end of dma control blocks chain */
313 +       cb->next = 0;
314 +
315 +
316 +       if (pixels < dma_busy_wait_threshold) { /* small area: busy-wait for completion */
317 +               bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
318 +               bcm_dma_wait_idle(fb->dma_chan_base);
319 +       } else {
320 +               void __iomem *dma_chan = fb->dma_chan_base;
321 +               cb->info |= BCM2708_DMA_INT_EN; /* request an interrupt from the last control block */
322 +               bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
323 +               while (bcm_dma_is_busy(dma_chan)) { /* re-check: the sleep below may end early */
324 +                       wait_event_interruptible(
325 +                               fb->dma_waitq,
326 +                               !bcm_dma_is_busy(dma_chan));
327 +               }
328 +               fb->stats.dma_irqs++;
329 +       }
330 +       fb->stats.dma_copies++;
331  }
332  
333  static void bcm2708_fb_imageblit(struct fb_info *info,
334 @@ -336,6 +537,24 @@ static void bcm2708_fb_imageblit(struct 
335         cfb_imageblit(info, image);
336  }
337  
338 +static irqreturn_t bcm2708_fb_dma_irq(int irq, void *cxt)
339 +{
340 +       struct bcm2708_fb *fb = cxt; /* the dev_id given to request_irq() */
341 +
342 +       /* FIXME: should read status register to check if this is
343 +        * actually interrupting us or not, in case this interrupt
344 +        * ever becomes shared amongst several DMA channels
345 +        *
346 +        * readl(fb->dma_chan_base + BCM2708_DMA_CS) & BCM2708_DMA_INT;
347 +        */
348 +
349 +       /* acknowledge the interrupt */
350 +       writel(BCM2708_DMA_INT, fb->dma_chan_base + BCM2708_DMA_CS);
351 +
352 +       wake_up(&fb->dma_waitq); /* wakes the sleeper in bcm2708_fb_copyarea() */
353 +       return IRQ_HANDLED;
354 +}
355 +
356  static struct fb_ops bcm2708_fb_ops = {
357         .owner = THIS_MODULE,
358         .fb_check_var = bcm2708_fb_check_var,
359 @@ -365,7 +584,7 @@ static int bcm2708_fb_register(struct bc
360                 fb->dma = dma;
361         }
362         fb->fb.fbops = &bcm2708_fb_ops;
363 -       fb->fb.flags = FBINFO_FLAG_DEFAULT;
364 +       fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
365         fb->fb.pseudo_palette = fb->cmap;
366  
367         strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
368 @@ -396,6 +615,7 @@ static int bcm2708_fb_register(struct bc
369         fb->fb.monspecs.dclkmax = 100000000;
370  
371         bcm2708_fb_set_bitfields(&fb->fb.var);
372 +       init_waitqueue_head(&fb->dma_waitq);
373  
374         /*
375          * Allocate colourmap.
376 @@ -421,14 +641,45 @@ static int bcm2708_fb_probe(struct platf
377         struct bcm2708_fb *fb;
378         int ret;
379  
380 -       fb = kmalloc(sizeof(struct bcm2708_fb), GFP_KERNEL);
381 +       fb = kzalloc(sizeof(struct bcm2708_fb), GFP_KERNEL);
382         if (!fb) {
383                 dev_err(&dev->dev,
384                         "could not allocate new bcm2708_fb struct\n");
385                 ret = -ENOMEM;
386                 goto free_region;
387         }
388 -       memset(fb, 0, sizeof(struct bcm2708_fb));
389 +
390 +       bcm2708_fb_debugfs_init(fb);
391 +
392 +       fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
393 +                                            &fb->cb_handle, GFP_KERNEL);
394 +       if (!fb->cb_base) {
395 +               dev_err(&dev->dev, "cannot allocate DMA CBs\n");
396 +               ret = -ENOMEM;
397 +               goto free_fb;
398 +       }
399 +
400 +       pr_info("BCM2708FB: allocated DMA memory %08x\n",
401 +              fb->cb_handle);
402 +
403 +       ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
404 +                                &fb->dma_chan_base, &fb->dma_irq);
405 +       if (ret < 0) {
406 +               dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
407 +               goto free_cb;
408 +       }
409 +       fb->dma_chan = ret;
410 +
411 +       ret = request_irq(fb->dma_irq, bcm2708_fb_dma_irq,
412 +                         0, "bcm2708_fb dma", fb);
413 +       if (ret) {
414 +               pr_err("%s: failed to request DMA irq\n", __func__);
415 +               goto free_dma_chan;
416 +       }
417 +
418 +
419 +       pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
420 +              fb->dma_chan, fb->dma_chan_base);
421  
422         fb->dev = dev;
423  
424 @@ -438,6 +689,11 @@ static int bcm2708_fb_probe(struct platf
425                 goto out;
426         }
427  
428 +free_dma_chan:
429 +       bcm_dma_chan_free(fb->dma_chan);
430 +free_cb:
431 +       dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
432 +free_fb:
433         kfree(fb);
434  free_region:
435         dev_err(&dev->dev, "probe failed, err %d\n", ret);
436 @@ -455,8 +711,15 @@ static int bcm2708_fb_remove(struct plat
437                 iounmap(fb->fb.screen_base);
438         unregister_framebuffer(&fb->fb);
439  
440 +       dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
441 +       bcm_dma_chan_free(fb->dma_chan);
442 +
443         dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,
444                           fb->dma);
445 +       bcm2708_fb_debugfs_deinit(fb);
446 +
447 +       free_irq(fb->dma_irq, fb);
448 +
449         kfree(fb);
450  
451         return 0;