kernel: bump 5.4 to 5.4.48
[oweals/openwrt.git] / target / linux / bcm27xx / patches-5.4 / 950-0698-bcm2835-dma-Add-proper-40-bit-DMA-support.patch
1 From a2f673d1aa39752608e5f4838ed9656b38cbc4b9 Mon Sep 17 00:00:00 2001
2 From: Phil Elwell <phil@raspberrypi.org>
3 Date: Thu, 4 Apr 2019 13:33:47 +0100
4 Subject: [PATCH] bcm2835-dma: Add proper 40-bit DMA support
5
6 BCM2711 has 4 DMA channels with a 40-bit address range, allowing them
7 to access the full 4GB of memory on a Pi 4.
8
9 Signed-off-by: Phil Elwell <phil@raspberrypi.org>
10 ---
11  drivers/dma/bcm2835-dma.c | 485 ++++++++++++++++++++++++++++++++------
12  1 file changed, 412 insertions(+), 73 deletions(-)
13
14 --- a/drivers/dma/bcm2835-dma.c
15 +++ b/drivers/dma/bcm2835-dma.c
16 @@ -38,6 +38,11 @@
17  #define BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED 14
18  #define BCM2835_DMA_CHAN_NAME_SIZE 8
19  #define BCM2835_DMA_BULK_MASK  BIT(0)
20 +#define BCM2711_DMA_MEMCPY_CHAN 14
21 +
22 +struct bcm2835_dma_cfg_data {
23 +       u32     chan_40bit_mask;
24 +};
25  
26  /**
27   * struct bcm2835_dmadev - BCM2835 DMA controller
28 @@ -52,6 +57,7 @@ struct bcm2835_dmadev {
29         void __iomem *base;
30         struct device_dma_parameters dma_parms;
31         dma_addr_t zero_page;
32 +       const struct bcm2835_dma_cfg_data *cfg_data;
33  };
34  
35  struct bcm2835_dma_cb {
36 @@ -64,6 +70,17 @@ struct bcm2835_dma_cb {
37         uint32_t pad[2];
38  };
39  
40 +struct bcm2711_dma40_scb {
41 +       uint32_t ti;
42 +       uint32_t src;
43 +       uint32_t srci;
44 +       uint32_t dst;
45 +       uint32_t dsti;
46 +       uint32_t len;
47 +       uint32_t next_cb;
48 +       uint32_t rsvd;
49 +};
50 +
51  struct bcm2835_cb_entry {
52         struct bcm2835_dma_cb *cb;
53         dma_addr_t paddr;
54 @@ -84,6 +101,7 @@ struct bcm2835_chan {
55         unsigned int irq_flags;
56  
57         bool is_lite_channel;
58 +       bool is_40bit_channel;
59  };
60  
61  struct bcm2835_desc {
62 @@ -173,13 +191,118 @@ struct bcm2835_desc {
63  #define BCM2835_DMA_DATA_TYPE_S128     16
64  
65  /* Valid only for channels 0 - 14, 15 has its own base address */
66 -#define BCM2835_DMA_CHAN(n)    ((n) << 8) /* Base address */
67 +#define BCM2835_DMA_CHAN_SIZE  0x100
68 +#define BCM2835_DMA_CHAN(n)    ((n) * BCM2835_DMA_CHAN_SIZE) /* Base address */
69  #define BCM2835_DMA_CHANIO(base, n) ((base) + BCM2835_DMA_CHAN(n))
70  
71  /* the max dma length for different channels */
72  #define MAX_DMA_LEN SZ_1G
73  #define MAX_LITE_DMA_LEN (SZ_64K - 4)
74  
75 +/* 40-bit DMA support */
76 +#define BCM2711_DMA40_CS       0x00
77 +#define BCM2711_DMA40_CB       0x04
78 +#define BCM2711_DMA40_DEBUG    0x0c
79 +#define BCM2711_DMA40_TI       0x10
80 +#define BCM2711_DMA40_SRC      0x14
81 +#define BCM2711_DMA40_SRCI     0x18
82 +#define BCM2711_DMA40_DEST     0x1c
83 +#define BCM2711_DMA40_DESTI    0x20
84 +#define BCM2711_DMA40_LEN      0x24
85 +#define BCM2711_DMA40_NEXT_CB  0x28
86 +#define BCM2711_DMA40_DEBUG2   0x2c
87 +
88 +#define BCM2711_DMA40_ACTIVE           BIT(0)
89 +#define BCM2711_DMA40_END              BIT(1)
90 +#define BCM2711_DMA40_INT              BIT(2)
91 +#define BCM2711_DMA40_DREQ             BIT(3)  /* DREQ state */
92 +#define BCM2711_DMA40_RD_PAUSED                BIT(4)  /* Reading is paused */
93 +#define BCM2711_DMA40_WR_PAUSED                BIT(5)  /* Writing is paused */
94 +#define BCM2711_DMA40_DREQ_PAUSED      BIT(6)  /* Is paused by DREQ flow control */
95 +#define BCM2711_DMA40_WAITING_FOR_WRITES BIT(7)  /* Waiting for last write */
96 +#define BCM2711_DMA40_ERR              BIT(10)
97 +#define BCM2711_DMA40_QOS(x)           (((x) & 0x1f) << 16)
98 +#define BCM2711_DMA40_PANIC_QOS(x)     (((x) & 0x1f) << 20)
99 +#define BCM2711_DMA40_WAIT_FOR_WRITES  BIT(28)
100 +#define BCM2711_DMA40_DISDEBUG         BIT(29)
101 +#define BCM2711_DMA40_ABORT            BIT(30)
102 +#define BCM2711_DMA40_HALT             BIT(31)
103 +#define BCM2711_DMA40_CS_FLAGS(x) ((x) & (BCM2711_DMA40_QOS(15) | \
104 +                                       BCM2711_DMA40_PANIC_QOS(15) | \
105 +                                       BCM2711_DMA40_WAIT_FOR_WRITES | \
106 +                                       BCM2711_DMA40_DISDEBUG))
107 +
108 +/* Transfer information bits */
109 +#define BCM2711_DMA40_INTEN            BIT(0)
110 +#define BCM2711_DMA40_TDMODE           BIT(1) /* 2D-Mode */
111 +#define BCM2711_DMA40_WAIT_RESP                BIT(2) /* wait for AXI write to be acked */
112 +#define BCM2711_DMA40_WAIT_RD_RESP     BIT(3) /* wait for AXI read to complete */
113 +#define BCM2711_DMA40_PER_MAP(x)       (((x) & 31) << 9) /* REQ source */
114 +#define BCM2711_DMA40_S_DREQ           BIT(14) /* enable SREQ for source */
115 +#define BCM2711_DMA40_D_DREQ           BIT(15) /* enable DREQ for destination */
116 +#define BCM2711_DMA40_S_WAIT(x)                (((x) & 0xff) << 16) /* add DMA read-wait cycles */
117 +#define BCM2711_DMA40_D_WAIT(x)                (((x) & 0xff) << 24) /* add DMA write-wait cycles */
118 +
119 +/* debug register bits */
120 +#define BCM2711_DMA40_DEBUG_WRITE_ERR          BIT(0)
121 +#define BCM2711_DMA40_DEBUG_FIFO_ERR           BIT(1)
122 +#define BCM2711_DMA40_DEBUG_READ_ERR           BIT(2)
123 +#define BCM2711_DMA40_DEBUG_READ_CB_ERR                BIT(3)
124 +#define BCM2711_DMA40_DEBUG_IN_ON_ERR          BIT(8)
125 +#define BCM2711_DMA40_DEBUG_ABORT_ON_ERR       BIT(9)
126 +#define BCM2711_DMA40_DEBUG_HALT_ON_ERR                BIT(10)
127 +#define BCM2711_DMA40_DEBUG_DISABLE_CLK_GATE   BIT(11)
128 +#define BCM2711_DMA40_DEBUG_RSTATE_SHIFT       14
129 +#define BCM2711_DMA40_DEBUG_RSTATE_BITS                4
130 +#define BCM2711_DMA40_DEBUG_WSTATE_SHIFT       18
131 +#define BCM2711_DMA40_DEBUG_WSTATE_BITS                4
132 +#define BCM2711_DMA40_DEBUG_RESET              BIT(23)
133 +#define BCM2711_DMA40_DEBUG_ID_SHIFT           24
134 +#define BCM2711_DMA40_DEBUG_ID_BITS            4
135 +#define BCM2711_DMA40_DEBUG_VERSION_SHIFT      28
136 +#define BCM2711_DMA40_DEBUG_VERSION_BITS       4
137 +
138 +/* Valid only for 40-bit channels 0 - 3 (controller channels 11 - 14) */
139 +#define BCM2711_DMA40_CHAN(n)  (((n) + 11) << 8) /* Base address */
140 +#define BCM2711_DMA40_CHANIO(base, n) ((base) + BCM2711_DMA40_CHAN(n))
141 +
142 +/* the max dma length for different channels */
143 +#define MAX_DMA40_LEN SZ_1G
144 +
145 +#define BCM2711_DMA40_BURST_LEN(x)     ((min(x,16) - 1) << 8)
146 +#define BCM2711_DMA40_INC              BIT(12)
147 +#define BCM2711_DMA40_SIZE_32          (0 << 13)
148 +#define BCM2711_DMA40_SIZE_64          (1 << 13)
149 +#define BCM2711_DMA40_SIZE_128         (2 << 13)
150 +#define BCM2711_DMA40_SIZE_256         (3 << 13)
151 +#define BCM2711_DMA40_IGNORE           BIT(15)
152 +#define BCM2711_DMA40_STRIDE(x)                ((x) << 16) /* For 2D mode */
153 +
154 +#define BCM2711_DMA40_MEMCPY_FLAGS \
155 +       (BCM2711_DMA40_QOS(0) | \
156 +        BCM2711_DMA40_PANIC_QOS(0) | \
157 +        BCM2711_DMA40_WAIT_FOR_WRITES | \
158 +        BCM2711_DMA40_DISDEBUG)
159 +
160 +#define BCM2711_DMA40_MEMCPY_XFER_INFO \
161 +       (BCM2711_DMA40_SIZE_128 | \
162 +        BCM2711_DMA40_INC | \
163 +        BCM2711_DMA40_BURST_LEN(16))
164 +
165 +struct bcm2835_dmadev *memcpy_parent;
166 +static void __iomem *memcpy_chan;
167 +static struct bcm2711_dma40_scb *memcpy_scb;
168 +static dma_addr_t memcpy_scb_dma;
169 +DEFINE_SPINLOCK(memcpy_lock);
170 +
171 +static const struct bcm2835_dma_cfg_data bcm2835_dma_cfg = {
172 +       .chan_40bit_mask = 0,
173 +};
174 +
175 +static const struct bcm2835_dma_cfg_data bcm2711_dma_cfg = {
176 +       .chan_40bit_mask = BIT(11) | BIT(12) | BIT(13) | BIT(14),
177 +};
178 +
179  static inline size_t bcm2835_dma_max_frame_length(struct bcm2835_chan *c)
180  {
181         /* lite and normal channels have different max frame length */
182 @@ -209,6 +332,32 @@ static inline struct bcm2835_desc *to_bc
183         return container_of(t, struct bcm2835_desc, vd.tx);
184  }
185  
186 +static inline uint32_t to_bcm2711_ti(uint32_t info)
187 +{
188 +       return ((info & BCM2835_DMA_INT_EN) ? BCM2711_DMA40_INTEN : 0) |
189 +               ((info & BCM2835_DMA_WAIT_RESP) ? BCM2711_DMA40_WAIT_RESP : 0) |
190 +               ((info & BCM2835_DMA_S_DREQ) ?
191 +                (BCM2711_DMA40_S_DREQ | BCM2711_DMA40_WAIT_RD_RESP) : 0) |
192 +               ((info & BCM2835_DMA_D_DREQ) ? BCM2711_DMA40_D_DREQ : 0) |
193 +               BCM2711_DMA40_PER_MAP((info >> 16) & 0x1f);
194 +}
195 +
196 +static inline uint32_t to_bcm2711_srci(uint32_t info)
197 +{
198 +       return ((info & BCM2835_DMA_S_INC) ? BCM2711_DMA40_INC : 0);
199 +}
200 +
201 +static inline uint32_t to_bcm2711_dsti(uint32_t info)
202 +{
203 +       return ((info & BCM2835_DMA_D_INC) ? BCM2711_DMA40_INC : 0);
204 +}
205 +
206 +static inline uint32_t to_bcm2711_cbaddr(dma_addr_t addr)
207 +{
208 +       BUG_ON(addr & 0x1f);
209 +       return (addr >> 5);
210 +}
211 +
212  static void bcm2835_dma_free_cb_chain(struct bcm2835_desc *desc)
213  {
214         size_t i;
215 @@ -227,45 +376,53 @@ static void bcm2835_dma_desc_free(struct
216  }
217  
218  static void bcm2835_dma_create_cb_set_length(
219 -       struct bcm2835_chan *chan,
220 +       struct bcm2835_chan *c,
221         struct bcm2835_dma_cb *control_block,
222         size_t len,
223         size_t period_len,
224         size_t *total_len,
225         u32 finalextrainfo)
226  {
227 -       size_t max_len = bcm2835_dma_max_frame_length(chan);
228 +       size_t max_len = bcm2835_dma_max_frame_length(c);
229 +       uint32_t cb_len;
230  
231         /* set the length taking lite-channel limitations into account */
232 -       control_block->length = min_t(u32, len, max_len);
233 +       cb_len = min_t(u32, len, max_len);
234  
235 -       /* finished if we have no period_length */
236 -       if (!period_len)
237 -               return;
238 +       if (period_len) {
239 +               /*
240 +                * period_len means: that we need to generate
241 +                * transfers that are terminating at every
242 +                * multiple of period_len - this is typically
243 +                * used to set the interrupt flag in info
244 +                * which is required during cyclic transfers
245 +                */
246  
247 -       /*
248 -        * period_len means: that we need to generate
249 -        * transfers that are terminating at every
250 -        * multiple of period_len - this is typically
251 -        * used to set the interrupt flag in info
252 -        * which is required during cyclic transfers
253 -        */
254 +               /* have we filled in period_length yet? */
255 +               if (*total_len + cb_len < period_len) {
256 +                       /* update number of bytes in this period so far */
257 +                       *total_len += cb_len;
258 +               } else {
259 +                       /* calculate the length that remains to reach period_len */
260 +                       cb_len = period_len - *total_len;
261  
262 -       /* have we filled in period_length yet? */
263 -       if (*total_len + control_block->length < period_len) {
264 -               /* update number of bytes in this period so far */
265 -               *total_len += control_block->length;
266 -               return;
267 +                       /* reset total_length for next period */
268 +                       *total_len = 0;
269 +               }
270         }
271  
272 -       /* calculate the length that remains to reach period_length */
273 -       control_block->length = period_len - *total_len;
274 -
275 -       /* reset total_length for next period */
276 -       *total_len = 0;
277 -
278 -       /* add extrainfo bits in info */
279 -       control_block->info |= finalextrainfo;
280 +       if (c->is_40bit_channel) {
281 +               struct bcm2711_dma40_scb *scb =
282 +                       (struct bcm2711_dma40_scb *)control_block;
283 +
284 +               scb->len = cb_len;
285 +               /* add extrainfo bits to ti */
286 +               scb->ti |= to_bcm2711_ti(finalextrainfo);
287 +       } else {
288 +               control_block->length = cb_len;
289 +               /* add extrainfo bits to info */
290 +               control_block->info |= finalextrainfo;
291 +       }
292  }
293  
294  static inline size_t bcm2835_dma_count_frames_for_sg(
295 @@ -288,7 +445,7 @@ static inline size_t bcm2835_dma_count_f
296  /**
297   * bcm2835_dma_create_cb_chain - create a control block and fills data in
298   *
299 - * @chan:           the @dma_chan for which we run this
300 + * @c:              the @bcm2835_chan for which we run this
301   * @direction:      the direction in which we transfer
302   * @cyclic:         it is a cyclic transfer
303   * @info:           the default info bits to apply per controlblock
304 @@ -306,12 +463,11 @@ static inline size_t bcm2835_dma_count_f
305   * @gfp:            the GFP flag to use for allocation
306   */
307  static struct bcm2835_desc *bcm2835_dma_create_cb_chain(
308 -       struct dma_chan *chan, enum dma_transfer_direction direction,
309 +       struct bcm2835_chan *c, enum dma_transfer_direction direction,
310         bool cyclic, u32 info, u32 finalextrainfo, size_t frames,
311         dma_addr_t src, dma_addr_t dst, size_t buf_len,
312         size_t period_len, gfp_t gfp)
313  {
314 -       struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
315         size_t len = buf_len, total_len;
316         size_t frame;
317         struct bcm2835_desc *d;
318 @@ -343,11 +499,23 @@ static struct bcm2835_desc *bcm2835_dma_
319  
320                 /* fill in the control block */
321                 control_block = cb_entry->cb;
322 -               control_block->info = info;
323 -               control_block->src = src;
324 -               control_block->dst = dst;
325 -               control_block->stride = 0;
326 -               control_block->next = 0;
327 +               if (c->is_40bit_channel) {
328 +                       struct bcm2711_dma40_scb *scb =
329 +                               (struct bcm2711_dma40_scb *)control_block;
330 +                       scb->ti = to_bcm2711_ti(info);
331 +                       scb->src = lower_32_bits(src);
332 +                       scb->srci= upper_32_bits(src) | to_bcm2711_srci(info);
333 +                       scb->dst = lower_32_bits(dst);
334 +                       scb->dsti = upper_32_bits(dst) | to_bcm2711_dsti(info);
335 +                       scb->next_cb = 0;
336 +               } else {
337 +                       control_block->info = info;
338 +                       control_block->src = src;
339 +                       control_block->dst = dst;
340 +                       control_block->stride = 0;
341 +                       control_block->next = 0;
342 +               }
343 +
344                 /* set up length in control_block if requested */
345                 if (buf_len) {
346                         /* calculate length honoring period_length */
347 @@ -361,7 +529,11 @@ static struct bcm2835_desc *bcm2835_dma_
348                 }
349  
350                 /* link this the last controlblock */
351 -               if (frame)
352 +               if (frame && c->is_40bit_channel)
353 +                       ((struct bcm2711_dma40_scb *)
354 +                        d->cb_list[frame - 1].cb)->next_cb =
355 +                               to_bcm2711_cbaddr(cb_entry->paddr);
356 +               if (frame && !c->is_40bit_channel)
357                         d->cb_list[frame - 1].cb->next = cb_entry->paddr;
358  
359                 /* update src and dst and length */
360 @@ -371,11 +543,21 @@ static struct bcm2835_desc *bcm2835_dma_
361                         dst += control_block->length;
362  
363                 /* Length of total transfer */
364 -               d->size += control_block->length;
365 +               if (c->is_40bit_channel)
366 +                       d->size += ((struct bcm2711_dma40_scb *)control_block)->len;
367 +               else
368 +                       d->size += control_block->length;
369         }
370  
371         /* the last frame requires extra flags */
372 -       d->cb_list[d->frames - 1].cb->info |= finalextrainfo;
373 +       if (c->is_40bit_channel) {
374 +               struct bcm2711_dma40_scb *scb =
375 +                       (struct bcm2711_dma40_scb *)d->cb_list[d->frames-1].cb;
376 +
377 +               scb->ti |= to_bcm2711_ti(finalextrainfo);
378 +       } else {
379 +               d->cb_list[d->frames - 1].cb->info |= finalextrainfo;
380 +       }
381  
382         /* detect a size missmatch */
383         if (buf_len && (d->size != buf_len))
384 @@ -389,13 +571,12 @@ error_cb:
385  }
386  
387  static void bcm2835_dma_fill_cb_chain_with_sg(
388 -       struct dma_chan *chan,
389 +       struct bcm2835_chan *c,
390         enum dma_transfer_direction direction,
391         struct bcm2835_cb_entry *cb,
392         struct scatterlist *sgl,
393         unsigned int sg_len)
394  {
395 -       struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
396         size_t len, max_len;
397         unsigned int i;
398         dma_addr_t addr;
399 @@ -403,14 +584,35 @@ static void bcm2835_dma_fill_cb_chain_wi
400  
401         max_len = bcm2835_dma_max_frame_length(c);
402         for_each_sg(sgl, sgent, sg_len, i) {
403 -               for (addr = sg_dma_address(sgent), len = sg_dma_len(sgent);
404 -                    len > 0;
405 -                    addr += cb->cb->length, len -= cb->cb->length, cb++) {
406 -                       if (direction == DMA_DEV_TO_MEM)
407 -                               cb->cb->dst = addr;
408 -                       else
409 -                               cb->cb->src = addr;
410 -                       cb->cb->length = min(len, max_len);
411 +               if (c->is_40bit_channel) {
412 +                       struct bcm2711_dma40_scb *scb;
413 +
414 +                       for (addr = sg_dma_address(sgent),
415 +                                    len = sg_dma_len(sgent);
416 +                                    len > 0;
417 +                            addr += scb->len, len -= scb->len, cb++) {
418 +                               scb = (struct bcm2711_dma40_scb *)cb->cb;
419 +                               if (direction == DMA_DEV_TO_MEM) {
420 +                                       scb->dst = lower_32_bits(addr);
421 +                                       scb->dsti = upper_32_bits(addr) | BCM2711_DMA40_INC;
422 +                               } else {
423 +                                       scb->src = lower_32_bits(addr);
424 +                                       scb->srci = upper_32_bits(addr) | BCM2711_DMA40_INC;
425 +                               }
426 +                               scb->len = min(len, max_len);
427 +                       }
428 +               } else {
429 +                       for (addr = sg_dma_address(sgent),
430 +                                    len = sg_dma_len(sgent);
431 +                            len > 0;
432 +                            addr += cb->cb->length, len -= cb->cb->length,
433 +                            cb++) {
434 +                               if (direction == DMA_DEV_TO_MEM)
435 +                                       cb->cb->dst = addr;
436 +                               else
437 +                                       cb->cb->src = addr;
438 +                               cb->cb->length = min(len, max_len);
439 +                       }
440                 }
441         }
442  }
443 @@ -419,6 +621,10 @@ static void bcm2835_dma_abort(struct bcm
444  {
445         void __iomem *chan_base = c->chan_base;
446         long int timeout = 10000;
447 +       u32 wait_mask = BCM2835_DMA_WAITING_FOR_WRITES;
448 +
449 +       if (c->is_40bit_channel)
450 +               wait_mask = BCM2711_DMA40_WAITING_FOR_WRITES;
451  
452         /*
453          * A zero control block address means the channel is idle.
454 @@ -431,8 +637,7 @@ static void bcm2835_dma_abort(struct bcm
455         writel(0, chan_base + BCM2835_DMA_CS);
456  
457         /* Wait for any current AXI transfer to complete */
458 -       while ((readl(chan_base + BCM2835_DMA_CS) &
459 -               BCM2835_DMA_WAITING_FOR_WRITES) && --timeout)
460 +       while ((readl(chan_base + BCM2835_DMA_CS) & wait_mask) && --timeout)
461                 cpu_relax();
462  
463         /* Peripheral might be stuck and fail to signal AXI write responses */
464 @@ -457,9 +662,16 @@ static void bcm2835_dma_start_desc(struc
465  
466         c->desc = d = to_bcm2835_dma_desc(&vd->tx);
467  
468 -       writel(d->cb_list[0].paddr, c->chan_base + BCM2835_DMA_ADDR);
469 -       writel(BCM2835_DMA_ACTIVE | BCM2835_DMA_CS_FLAGS(c->dreq),
470 -              c->chan_base + BCM2835_DMA_CS);
471 +       if (c->is_40bit_channel) {
472 +               writel(to_bcm2711_cbaddr(d->cb_list[0].paddr),
473 +                      c->chan_base + BCM2711_DMA40_CB);
474 +               writel(BCM2711_DMA40_ACTIVE | BCM2711_DMA40_CS_FLAGS(c->dreq),
475 +                      c->chan_base + BCM2711_DMA40_CS);
476 +       } else {
477 +               writel(d->cb_list[0].paddr, c->chan_base + BCM2835_DMA_ADDR);
478 +               writel(BCM2835_DMA_ACTIVE | BCM2835_DMA_CS_FLAGS(c->dreq),
479 +                      c->chan_base + BCM2835_DMA_CS);
480 +       }
481  }
482  
483  static irqreturn_t bcm2835_dma_callback(int irq, void *data)
484 @@ -486,8 +698,7 @@ static irqreturn_t bcm2835_dma_callback(
485          * if this IRQ handler is threaded.) If the channel is finished, it
486          * will remain idle despite the ACTIVE flag being set.
487          */
488 -       writel(BCM2835_DMA_INT | BCM2835_DMA_ACTIVE |
489 -              BCM2835_DMA_CS_FLAGS(c->dreq),
490 +       writel(BCM2835_DMA_INT | BCM2835_DMA_ACTIVE,
491                c->chan_base + BCM2835_DMA_CS);
492  
493         d = c->desc;
494 @@ -590,9 +801,17 @@ static enum dma_status bcm2835_dma_tx_st
495                 struct bcm2835_desc *d = c->desc;
496                 dma_addr_t pos;
497  
498 -               if (d->dir == DMA_MEM_TO_DEV)
499 +               if (d->dir == DMA_MEM_TO_DEV && c->is_40bit_channel)
500 +                       pos = readl(c->chan_base + BCM2711_DMA40_SRC) +
501 +                               ((u64)(readl(c->chan_base + BCM2711_DMA40_SRCI) &
502 +                                      0xff) << 32);
503 +               else if (d->dir == DMA_MEM_TO_DEV && !c->is_40bit_channel)
504                         pos = readl(c->chan_base + BCM2835_DMA_SOURCE_AD);
505 -               else if (d->dir == DMA_DEV_TO_MEM)
506 +               else if (d->dir == DMA_DEV_TO_MEM && c->is_40bit_channel)
507 +                       pos = readl(c->chan_base + BCM2711_DMA40_DEST) +
508 +                               ((u64)(readl(c->chan_base + BCM2711_DMA40_DESTI) &
509 +                                      0xff) << 32);
510 +               else if (d->dir == DMA_DEV_TO_MEM && !c->is_40bit_channel)
511                         pos = readl(c->chan_base + BCM2835_DMA_DEST_AD);
512                 else
513                         pos = 0;
514 @@ -638,7 +857,7 @@ static struct dma_async_tx_descriptor *b
515         frames = bcm2835_dma_frames_for_length(len, max_len);
516  
517         /* allocate the CB chain - this also fills in the pointers */
518 -       d = bcm2835_dma_create_cb_chain(chan, DMA_MEM_TO_MEM, false,
519 +       d = bcm2835_dma_create_cb_chain(c, DMA_MEM_TO_MEM, false,
520                                         info, extra, frames,
521                                         src, dst, len, 0, GFP_KERNEL);
522         if (!d)
523 @@ -673,11 +892,21 @@ static struct dma_async_tx_descriptor *b
524                 if (c->cfg.src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
525                         return NULL;
526                 src = c->cfg.src_addr;
527 +               /*
528 +                * One would think it ought to be possible to get the physical
529 +                * to dma address mapping information from the dma-ranges DT
530 +                * property, but I've not found a way yet that doesn't involve
531 +                * open-coding the whole thing.
532 +                */
533 +               if (c->is_40bit_channel)
534 +                   src |= 0x400000000ull;
535                 info |= BCM2835_DMA_S_DREQ | BCM2835_DMA_D_INC;
536         } else {
537                 if (c->cfg.dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
538                         return NULL;
539                 dst = c->cfg.dst_addr;
540 +               if (c->is_40bit_channel)
541 +                   dst |= 0x400000000ull;
542                 info |= BCM2835_DMA_D_DREQ | BCM2835_DMA_S_INC;
543         }
544  
545 @@ -685,7 +914,7 @@ static struct dma_async_tx_descriptor *b
546         frames = bcm2835_dma_count_frames_for_sg(c, sgl, sg_len);
547  
548         /* allocate the CB chain */
549 -       d = bcm2835_dma_create_cb_chain(chan, direction, false,
550 +       d = bcm2835_dma_create_cb_chain(c, direction, false,
551                                         info, extra,
552                                         frames, src, dst, 0, 0,
553                                         GFP_NOWAIT);
554 @@ -693,7 +922,7 @@ static struct dma_async_tx_descriptor *b
555                 return NULL;
556  
557         /* fill in frames with scatterlist pointers */
558 -       bcm2835_dma_fill_cb_chain_with_sg(chan, direction, d->cb_list,
559 +       bcm2835_dma_fill_cb_chain_with_sg(c, direction, d->cb_list,
560                                           sgl, sg_len);
561  
562         return vchan_tx_prep(&c->vc, &d->vd, flags);
563 @@ -747,12 +976,16 @@ static struct dma_async_tx_descriptor *b
564                 if (c->cfg.src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
565                         return NULL;
566                 src = c->cfg.src_addr;
567 +               if (c->is_40bit_channel)
568 +                   src |= 0x400000000ull;
569                 dst = buf_addr;
570                 info |= BCM2835_DMA_S_DREQ | BCM2835_DMA_D_INC;
571         } else {
572                 if (c->cfg.dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
573                         return NULL;
574                 dst = c->cfg.dst_addr;
575 +               if (c->is_40bit_channel)
576 +                   dst |= 0x400000000ull;
577                 src = buf_addr;
578                 info |= BCM2835_DMA_D_DREQ | BCM2835_DMA_S_INC;
579  
580 @@ -772,7 +1005,7 @@ static struct dma_async_tx_descriptor *b
581          * note that we need to use GFP_NOWAIT, as the ALSA i2s dmaengine
582          * implementation calls prep_dma_cyclic with interrupts disabled.
583          */
584 -       d = bcm2835_dma_create_cb_chain(chan, direction, true,
585 +       d = bcm2835_dma_create_cb_chain(c, direction, true,
586                                         info, extra,
587                                         frames, src, dst, buf_len,
588                                         period_len, GFP_NOWAIT);
589 @@ -780,7 +1013,12 @@ static struct dma_async_tx_descriptor *b
590                 return NULL;
591  
592         /* wrap around into a loop */
593 -       d->cb_list[d->frames - 1].cb->next = d->cb_list[0].paddr;
594 +       if (c->is_40bit_channel)
595 +               ((struct bcm2711_dma40_scb *)
596 +                d->cb_list[frames - 1].cb)->next_cb =
597 +                       to_bcm2711_cbaddr(d->cb_list[0].paddr);
598 +       else
599 +               d->cb_list[d->frames - 1].cb->next = d->cb_list[0].paddr;
600  
601         return vchan_tx_prep(&c->vc, &d->vd, flags);
602  }
603 @@ -844,9 +1082,11 @@ static int bcm2835_dma_chan_init(struct
604         c->irq_number = irq;
605         c->irq_flags = irq_flags;
606  
607 -       /* check in DEBUG register if this is a LITE channel */
608 -       if (readl(c->chan_base + BCM2835_DMA_DEBUG) &
609 -               BCM2835_DMA_DEBUG_LITE)
610 +       /* check for 40bit and lite channels */
611 +       if (d->cfg_data->chan_40bit_mask & BIT(chan_id))
612 +               c->is_40bit_channel = true;
613 +       else if (readl(c->chan_base + BCM2835_DMA_DEBUG) &
614 +                BCM2835_DMA_DEBUG_LITE)
615                 c->is_lite_channel = true;
616  
617         return 0;
618 @@ -866,8 +1106,58 @@ static void bcm2835_dma_free(struct bcm2
619                              DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
620  }
621  
622 +int bcm2711_dma40_memcpy_init(void)
623 +{
624 +       if (!memcpy_parent)
625 +               return -EPROBE_DEFER;
626 +
627 +       if (!memcpy_chan)
628 +               return -EINVAL;
629 +
630 +       if (!memcpy_scb)
631 +               return -ENOMEM;
632 +
633 +       return 0;
634 +}
635 +EXPORT_SYMBOL(bcm2711_dma40_memcpy_init);
636 +
637 +void bcm2711_dma40_memcpy(dma_addr_t dst, dma_addr_t src, size_t size)
638 +{
639 +       struct bcm2711_dma40_scb *scb = memcpy_scb;
640 +       unsigned long flags;
641 +
642 +       if (!scb) {
643 +               pr_err("bcm2711_dma40_memcpy not initialised!\n");
644 +               return;
645 +       }
646 +
647 +       spin_lock_irqsave(&memcpy_lock, flags);
648 +
649 +       scb->ti = 0;
650 +       scb->src = lower_32_bits(src);
651 +       scb->srci = upper_32_bits(src) | BCM2711_DMA40_MEMCPY_XFER_INFO;
652 +       scb->dst = lower_32_bits(dst);
653 +       scb->dsti = upper_32_bits(dst) | BCM2711_DMA40_MEMCPY_XFER_INFO;
654 +       scb->len = size;
655 +       scb->next_cb = 0;
656 +
657 +       writel((u32)(memcpy_scb_dma >> 5), memcpy_chan + BCM2711_DMA40_CB);
658 +       writel(BCM2711_DMA40_MEMCPY_FLAGS | BCM2711_DMA40_ACTIVE,
659 +              memcpy_chan + BCM2711_DMA40_CS);
660 +
661 +       /* Busy-wait, lock held and IRQs off, until the channel sets END */
662 +       while (!(readl(memcpy_chan + BCM2711_DMA40_CS) & BCM2711_DMA40_END))
663 +               cpu_relax();
664 +
665 +       writel(BCM2711_DMA40_END, memcpy_chan + BCM2711_DMA40_CS);
666 +
667 +       spin_unlock_irqrestore(&memcpy_lock, flags);
668 +}
669 +EXPORT_SYMBOL(bcm2711_dma40_memcpy);
670 +
671  static const struct of_device_id bcm2835_dma_of_match[] = {
672 -       { .compatible = "brcm,bcm2835-dma", },
673 +       { .compatible = "brcm,bcm2835-dma", .data = &bcm2835_dma_cfg },
674 +       { .compatible = "brcm,bcm2711-dma", .data = &bcm2711_dma_cfg },
675         {},
676  };
677  MODULE_DEVICE_TABLE(of, bcm2835_dma_of_match);
678 @@ -899,6 +1189,8 @@ static int bcm2835_dma_probe(struct plat
679         int irq_flags;
680         uint32_t chans_available;
681         char chan_name[BCM2835_DMA_CHAN_NAME_SIZE];
682 +       const struct of_device_id *of_id;
683 +       int chan_count, chan_start, chan_end;
684  
685         if (!pdev->dev.dma_mask)
686                 pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
687 @@ -920,9 +1212,13 @@ static int bcm2835_dma_probe(struct plat
688         base = devm_ioremap_resource(&pdev->dev, res);
689         if (IS_ERR(base))
690                 return PTR_ERR(base);
691 -       rc = bcm_dmaman_probe(pdev, base, BCM2835_DMA_BULK_MASK);
692 -       if (rc)
693 -               dev_err(&pdev->dev, "Failed to initialize the legacy API\n");
694 +
695 +       /* The set of channels can be split across multiple instances. */
696 +       chan_start = ((u32)(uintptr_t)base / BCM2835_DMA_CHAN_SIZE) & 0xf;
697 +       base -= BCM2835_DMA_CHAN(chan_start);
698 +       chan_count = resource_size(res) / BCM2835_DMA_CHAN_SIZE;
699 +       chan_end = min(chan_start + chan_count,
700 +                        BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED + 1);
701  
702         od->base = base;
703  
704 @@ -959,6 +1255,14 @@ static int bcm2835_dma_probe(struct plat
705                 return -ENOMEM;
706         }
707  
708 +       of_id = of_match_node(bcm2835_dma_of_match, pdev->dev.of_node);
709 +       if (!of_id) {
710 +               dev_err(&pdev->dev, "Failed to match compatible string\n");
711 +               return -EINVAL;
712 +       }
713 +
714 +       od->cfg_data = of_id->data;
715 +
716         /* Request DMA channel mask from device tree */
717         if (of_property_read_u32(pdev->dev.of_node,
718                         "brcm,dma-channel-mask",
719 @@ -968,11 +1272,34 @@ static int bcm2835_dma_probe(struct plat
720                 goto err_no_dma;
721         }
722  
723 -       /* Channel 0 is used by the legacy API */
724 -       chans_available &= ~BCM2835_DMA_BULK_MASK;
725 +       /* One channel is reserved for the legacy API */
726 +       if (chans_available & BCM2835_DMA_BULK_MASK) {
727 +               rc = bcm_dmaman_probe(pdev, base,
728 +                                     chans_available & BCM2835_DMA_BULK_MASK);
729 +               if (rc)
730 +                       dev_err(&pdev->dev,
731 +                               "Failed to initialize the legacy API\n");
732 +
733 +               chans_available &= ~BCM2835_DMA_BULK_MASK;
734 +       }
735 +
736 +       /* And possibly one for the 40-bit DMA memcpy API */
737 +       if (chans_available & od->cfg_data->chan_40bit_mask &
738 +           BIT(BCM2711_DMA_MEMCPY_CHAN)) {
739 +               memcpy_parent = od;
740 +               memcpy_chan = BCM2835_DMA_CHANIO(base, BCM2711_DMA_MEMCPY_CHAN);
741 +               memcpy_scb = dma_alloc_coherent(memcpy_parent->ddev.dev,
742 +                                               sizeof(*memcpy_scb),
743 +                                               &memcpy_scb_dma, GFP_KERNEL);
744 +               if (!memcpy_scb)
745 +                       dev_warn(&pdev->dev,
746 +                                "Failed to allocated memcpy scb\n");
747 +
748 +               chans_available &= ~BIT(BCM2711_DMA_MEMCPY_CHAN);
749 +       }
750  
751         /* get irqs for each channel that we support */
752 -       for (i = 0; i <= BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED; i++) {
753 +       for (i = chan_start; i < chan_end; i++) {
754                 /* skip masked out channels */
755                 if (!(chans_available & (1 << i))) {
756                         irq[i] = -1;
757 @@ -995,13 +1322,17 @@ static int bcm2835_dma_probe(struct plat
758                 irq[i] = platform_get_irq(pdev, i < 11 ? i : 11);
759         }
760  
761 +       chan_count = 0;
762 +
763         /* get irqs for each channel */
764 -       for (i = 0; i <= BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED; i++) {
765 +       for (i = chan_start; i < chan_end; i++) {
766                 /* skip channels without irq */
767                 if (irq[i] < 0)
768                         continue;
769  
770                 /* check if there are other channels that also use this irq */
771 +               /* FIXME: only this instance's IRQs are compared, so an IRQ
772 +                  shared with another instance is not detected as shared */
773                 irq_flags = 0;
774                 for (j = 0; j <= BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED; j++)
775                         if ((i != j) && (irq[j] == irq[i])) {
776 @@ -1013,9 +1344,10 @@ static int bcm2835_dma_probe(struct plat
777                 rc = bcm2835_dma_chan_init(od, i, irq[i], irq_flags);
778                 if (rc)
779                         goto err_no_dma;
780 +               chan_count++;
781         }
782  
783 -       dev_dbg(&pdev->dev, "Initialized %i DMA channels\n", i);
784 +       dev_dbg(&pdev->dev, "Initialized %i DMA channels\n", chan_count);
785  
786         /* Device-tree DMA controller registration */
787         rc = of_dma_controller_register(pdev->dev.of_node,
788 @@ -1047,6 +1379,13 @@ static int bcm2835_dma_remove(struct pla
789  
790         bcm_dmaman_remove(pdev);
791         dma_async_device_unregister(&od->ddev);
792 +       if (memcpy_parent == od) {
793 +               dma_free_coherent(&pdev->dev, sizeof(*memcpy_scb), memcpy_scb,
794 +                                 memcpy_scb_dma);
795 +               memcpy_parent = NULL;
796 +               memcpy_scb = NULL;
797 +               memcpy_chan = NULL;
798 +       }
799         bcm2835_dma_free(od);
800  
801         return 0;