arch/sh/lib/memcpy-sh4.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * "memcpy" implementation of SuperH
   4  *
   5  * Copyright (C) 1999  Niibe Yutaka
   6  * Copyright (c) 2002  STMicroelectronics Ltd
   7  *   Modified from memcpy.S and micro-optimised for SH4
   8  *   Stuart Menefy (stuart.menefy@st.com)
   9  *
  10  */
  11 #include <linux/linkage.h>
  12
  13 /*
  14  * void *memcpy(void *dst, const void *src, size_t n);
  15  *
  16  * It is assumed that there is no overlap between src and dst.
  17  * If there is an overlap, then the results are undefined.
  18  */
  19
   20         !
   21         !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
   22         !
   23
   24         ! Size is 16 or greater, and may have trailing bytes
              !
              ! .Lcase1: bulk copy selected by the dispatch code in memcpy when
              ! (src - dst) has bit 0 set and bit 1 clear.  Every destination
              ! long word is assembled from two adjacent source long words.
              !
              ! On entry:
              !   r0 = end of the destination area still to be written; the
              !        alignment loop in memcpy has made it long word aligned
              !   r4 = start of destination
              !   r5 = src - dst, so r0 + r5 is the matching source address
              ! Registers written here: r1, r2, r3, r6, r7 (plus r0 and r5).
              ! The copy runs from the end towards the start: every store
              ! pre-decrements r0.
   25
   26         .balign 32
   27 .Lcase1:
   28         ! Read a long word and write a long word at once
   29         ! At the start of each iteration, r7 contains last long load
   30         add     #-1,r5          !  79 EX
   31         mov     r4,r2           !   5 MT (0 cycles latency)
   32
   33         mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
   34         add     #-4,r5          !  50 EX
   35
   36         add     #7,r2           !  79 EX
   37         !
              ! r2 = r4 + 7 bounds the loop: cmp/hi r2,r0 is evaluated before
              ! the store in the branch delay slot moves r0 down by 4, so the
              ! loop repeats while at least 4 bytes remain after that store.
              ! Each pass keeps one byte of the previous load (r7) and three
              ! bytes of the new load (r1), as the byte lane comments show.
   38 #ifdef CONFIG_CPU_LITTLE_ENDIAN
   39         ! 6 cycles, 4 bytes per iteration
   40 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
   41         mov     r7, r3          !   5 MT (latency=0)    ! RQPO
   42
   43         cmp/hi  r2,r0           !  57 MT
   44         shll16  r3              ! 103 EX
   45
   46         mov     r1,r6           !   5 MT (latency=0)
   47         shll8   r3              ! 102 EX                ! Oxxx
   48
   49         shlr8   r6              ! 106 EX                ! xNML
   50         mov     r1, r7          !   5 MT (latency=0)
   51
   52         or      r6,r3           !  82 EX                ! ONML
   53         bt/s    3b              ! 109 BR
   54
   55          mov.l  r3,@-r0         !  30 LS
   56 #else
   57 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
   58         mov     r7,r3           !   5 MT (latency=0)    ! OPQR
   59
   60         cmp/hi  r2,r0           !  57 MT
   61         shlr16  r3              ! 107 EX
   62
   63         shlr8   r3              ! 106 EX                ! xxxO
   64         mov     r1,r6           !   5 MT (latency=0)
   65
   66         shll8   r6              ! 102 EX                ! LMNx
   67         mov     r1,r7           !   5 MT (latency=0)
   68
   69         or      r6,r3           !  82 EX                ! LMNO
   70         bt/s    3b              ! 109 BR
   71
   72          mov.l  r3,@-r0         !  30 LS
   73 #endif
   74         ! Finally, copy a byte at once, if necessary
              !
              ! r5 is moved back to (src - dst) - 1 so that r0 + r5 addresses
              ! the source byte matching destination byte r0 - 1.  r2 becomes
              ! r4 + 1, so the byte loop ends once r0 has come down to r4.
              ! When r0 already equals r4 there is nothing left to copy.
   75
   76         add     #4,r5           !  50 EX
   77         cmp/eq  r4,r0           !  54 MT
   78
   79         add     #-6,r2          !  50 EX
   80         bt      9f              ! 109 BR
   81
   82 8:      cmp/hi  r2,r0           !  57 MT
   83         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
   84
   85         bt/s    8b              ! 109 BR
   86
   87          mov.b  r1,@-r0         !  29 LS
   88
   89 9:      rts
   90          nop
  91
  92
   93         !
   94         !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
   95         !
   96
   97         ! Size is 16 or greater, and may have trailing bytes
              !
              ! .Lcase3: bulk copy selected by the dispatch code in memcpy when
              ! (src - dst) has both bit 0 and bit 1 set.  Every destination
              ! long word is assembled from two adjacent source long words.
              !
              ! On entry:
              !   r0 = end of the destination area still to be written; the
              !        alignment loop in memcpy has made it long word aligned
              !   r4 = start of destination
              !   r5 = src - dst, so r0 + r5 is the matching source address
              ! Registers written here: r1, r2, r3, r6, r7 (plus r0 and r5).
              ! The copy runs from the end towards the start: every store
              ! pre-decrements r0.
   98
   99         .balign 32
 100 .Lcase3:
 101         ! Read a long word and write a long word at once
 102         ! At the start of each iteration, r7 contains last long load
 103         add     #-3,r5          ! 79 EX
 104         mov     r4,r2           !  5 MT (0 cycles latency)
 105
 106         mov.l   @(r0,r5),r7     ! 21 LS (2 cycles latency)
 107         add     #-4,r5          ! 50 EX
 108
 109         add     #7,r2           !  79 EX
 110         !
             ! r2 = r4 + 7 bounds the loop: cmp/hi r2,r0 is evaluated before
             ! the store in the branch delay slot moves r0 down by 4, so the
             ! loop repeats while at least 4 bytes remain after that store.
             ! Each pass keeps three bytes of the previous load (r7) and one
             ! byte of the new load, as the byte lane comments show.
 111 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 112         ! 6 cycles, 4 bytes per iteration
 113 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
 114         mov     r7, r3          !   5 MT (latency=0)    ! RQPO
 115
 116         cmp/hi  r2,r0           !  57 MT
 117         shll8   r3              ! 102 EX                ! QPOx
 118
 119         mov     r1,r6           !   5 MT (latency=0)
 120         shlr16  r6              ! 107 EX
 121
 122         shlr8   r6              ! 106 EX                ! xxxN
 123         mov     r1, r7          !   5 MT (latency=0)
 124
 125         or      r6,r3           !  82 EX                ! QPON
 126         bt/s    3b              ! 109 BR
 127
 128          mov.l  r3,@-r0         !  30 LS
 129 #else
             ! Big endian variant: the new long word is loaded straight into
             ! r7 (after the old value has been copied to r3) and r6 is used
             ! to isolate its one contributing byte.
 130 3:      mov     r7,r3           ! OPQR
 131         shlr8   r3              ! xOPQ
 132         mov.l   @(r0,r5),r7     ! KLMN
 133         mov     r7,r6
 134         shll16  r6
 135         shll8   r6              ! Nxxx
 136         or      r6,r3           ! NOPQ
 137         cmp/hi  r2,r0
 138         bt/s    3b
 139          mov.l  r3,@-r0
 140 #endif
 141
 142         ! Finally, copy a byte at once, if necessary
             !
             ! r5 is moved back to (src - dst) - 1 so that r0 + r5 addresses
             ! the source byte matching destination byte r0 - 1.  r2 becomes
             ! r4 + 1, so the byte loop ends once r0 has come down to r4.
             ! When r0 already equals r4 there is nothing left to copy.
 143
 144         add     #6,r5           !  50 EX
 145         cmp/eq  r4,r0           !  54 MT
 146
 147         add     #-6,r2          !  50 EX
 148         bt      9f              ! 109 BR
 149
 150 8:      cmp/hi  r2,r0           !  57 MT
 151         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
 152
 153         bt/s    8b              ! 109 BR
 154
 155          mov.b  r1,@-r0         !  29 LS
 156
 157 9:      rts
 158          nop
 159
 160 ENTRY(memcpy)
             ! Register use on entry, as the code below reads them:
             !   r4 = dst, r5 = src, r6 = n
             ! After the set-up below r0 = dst + n and r5 = src - dst.  All
             ! copy loops store with @-r0, so the data is moved from the end
             ! of the buffers towards the start.
             ! Instructions indented by one extra space sit in the delay slot
             ! of the branch before them.
 161
 162         ! Calculate the invariants which will be used in the remainder
 163         ! of the code:
 164         !
 165         !      r4   -->  [ ...  ] DST             [ ...  ] SRC
 166         !                [ ...  ]                 [ ...  ]
 167         !                  :                        :
 168         !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
 169         !
 170         !
 171
 172         ! Short circuit the common case of src, dst and len being 32 bit aligned
 173         ! and test for zero length move
             !
             ! r0 = n | dst | src.  The bt/s to 99f acts on the T bit from
             ! tst r6,r6; the tst #3,r0 in its delay slot then sets T for the
             ! later bt/s to .Lcase00 (T set when the low two bits of all
             ! three values are clear).
 174
 175         mov     r6, r0          !   5 MT (0 cycle latency)
 176         or      r4, r0          !  82 EX
 177
 178         or      r5, r0          !  82 EX
 179         tst     r6, r6          !  86 MT
 180
 181         bt/s    99f             ! 111 BR                (zero len)
 182          tst    #3, r0          !  87 MT
 183
 184         mov     r4, r0          !   5 MT (0 cycle latency)
 185         add     r6, r0          !  49 EX
 186
 187         mov     #16, r1         !   6 EX
 188         bt/s    .Lcase00        ! 111 BR                (aligned)
 189
 190          sub    r4, r5          !  75 EX
 191
 192         ! Arguments are not nicely long word aligned or zero len.
 193         ! Check for small copies, and if so do a simple byte at a time copy.
 194         !
 195         ! Deciding on an exact value of 'small' is not easy, as the point at which
 196         ! using the optimised routines become worthwhile varies (these are the
 197         ! cycle counts for different sizes using byte-at-a-time vs. optimised):
 198         !       size    byte-at-time    long    word    byte
 199         !       16      42              39-40   46-50   50-55
 200         !       24      58              43-44   54-58   62-67
 201         !       36      82              49-50   66-70   80-85
 202         ! However the penalty for getting it 'wrong' is much higher for long word
 203         ! aligned data (and this is more common), so use a value of 16.
             !
             ! cmp/gt r6,r1 sets T when 16 > n (signed compare), i.e. for a
             ! small copy.  r5 is lowered by one so that r0 + r5 addresses the
             ! source byte matching destination byte r0 - 1.
             ! For the small copy, shlr r6 halves the count and leaves the old
             ! bit 0 in T: an even count enters the two-byte loop at 4:, an
             ! odd count stores one byte first (and returns at 98: when that
             ! was the only byte).
 204
 205         cmp/gt  r6,r1           !  56 MT
 206
 207         add     #-1,r5          !  50 EX
 208         bf/s    6f              ! 108 BR                (not small)
 209
 210          mov    r5, r3          !   5 MT (latency=0)
 211         shlr    r6              ! 104 EX
 212
 213         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
 214         bf/s    4f              ! 111 BR
 215
 216          add    #-1,r3          !  50 EX
 217         tst     r6, r6          !  86 MT
 218
 219         bt/s    98f             ! 110 BR
 220          mov.b  r1,@-r0         !  29 LS
 221
 222         ! 4 cycles, 2 bytes per iteration
 223 3:      mov.b   @(r0,r5),r1     !  20 LS (latency=2)
 224
 225 4:      mov.b   @(r0,r3),r2     !  20 LS (latency=2)
 226         dt      r6              !  67 EX
 227
 228         mov.b   r1,@-r0         !  29 LS
 229         bf/s    3b              ! 111 BR
 230
 231          mov.b  r2,@-r0         !  29 LS
 232 98:
 233         rts
 234          nop
 235
             ! Zero length: return with r0 = r4 (set in the rts delay slot).
 236 99:     rts
 237          mov    r4, r0
 238
 239         ! Size is not small, so it is worthwhile looking for optimisations.
 240         ! First align destination to a long word boundary.
 241         !
 242         ! r5 = normal value -1
             !
             ! r3 = r0 & 3 is the number of bytes to copy before r0 is long
             ! word aligned; r6 is decremented once per byte copied.  When
             ! r0 is already aligned the loop is skipped.
 243
 244 6:      tst     #3, r0          !  87 MT
 245         mov     #3, r3          !   6 EX
 246
 247         bt/s    2f              ! 111 BR
 248          and    r0,r3           !  78 EX
 249
 250         ! 3 cycles, 1 byte per iteration
 251 1:      dt      r3              !  67 EX
 252         mov.b   @(r0,r5),r1     !  19 LS (latency=2)
 253
 254         add     #-1, r6         !  79 EX
 255         bf/s    1b              ! 109 BR
 256
 257          mov.b  r1,@-r0         !  28 LS
 258
 259 2:      add     #1, r5          !  79 EX
 260
 261         ! Now select the appropriate bulk transfer code based on relative
 262         ! alignment of src and dst.
             !
             ! r5 is back to src - dst here.  r0 is parked in r3 while r0
             ! holds r5 for the tst #imm tests, and is restored by the
             ! mov r3,r0 in the delay slot of each bt/s to a case label.
             ! r7 = 64 is the threshold between the small and big variants
             ! (cmp/ge r7,r6 is a signed test of r6 >= 64).
             !
             !   bits 1:0 of (src - dst)    r6 < 64      r6 >= 64
             !   00                         .Lcase0      .Lcase0b
             !   10                         .Lcase2      .Lcase2b
             !   01                         .Lcase1      .Lcase1
             !   11                         .Lcase3      .Lcase3
 263
 264         mov     r0, r3          !   5 MT (latency=0)
 265
 266         mov     r5, r0          !   5 MT (latency=0)
 267         tst     #1, r0          !  87 MT
 268
 269         bf/s    1f              ! 111 BR
 270          mov    #64, r7         !   6 EX
 271
 272         ! bit 0 clear
 273
 274         cmp/ge  r7, r6          !  55 MT
 275
 276         bt/s    2f              ! 111 BR
 277          tst    #2, r0          !  87 MT
 278
 279         ! small
 280         bt/s    .Lcase0
 281          mov    r3, r0
 282
 283         bra     .Lcase2
 284          nop
 285
 286         ! big
 287 2:      bt/s    .Lcase0b
 288          mov    r3, r0
 289
 290         bra     .Lcase2b
 291          nop
 292
 293         ! bit 0 set
 294 1:      tst     #2, r0          ! 87 MT
 295
 296         bt/s    .Lcase1
 297          mov    r3, r0
 298
 299         bra     .Lcase3
 300          nop
 301
 302
 303         !
 304         !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
 305         !
 306
 307         ! src, dst and size are all long word aligned
 308         ! size is non-zero
             !
             ! .Lcase00: reached directly from the entry code of memcpy.
             ! On entry: r0 = dst + n, r4 = dst, r5 = src - dst, r6 = n.
             ! Sizes of 64 or more are passed on to .Lcase00b with r5 already
             ! lowered by 4.  Otherwise r6 is shifted right by three to count
             ! pairs of long words; the last shift leaves bit 2 of the size in
             ! T, so an odd number of long words stores one long word first
             ! (and returns at 5: when that was the only one), while an even
             ! number enters the loop at 4:.
             ! r5 and r3 are the offsets used for the two loads of a pair.
 309
 310         .balign 32
 311 .Lcase00:
 312         mov     #64, r1         !   6 EX
 313         mov     r5, r3          !   5 MT (latency=0)
 314
 315         cmp/gt  r6, r1          !  56 MT
 316         add     #-4, r5         !  50 EX
 317
 318         bf      .Lcase00b       ! 108 BR                (big loop)
 319         shlr2   r6              ! 105 EX
 320
 321         shlr    r6              ! 104 EX
 322         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 323
 324         bf/s    4f              ! 111 BR
 325          add    #-8, r3         !  50 EX
 326
 327         tst     r6, r6          !  86 MT
 328         bt/s    5f              ! 110 BR
 329
 330          mov.l  r1,@-r0         !  30 LS
 331
 332         ! 4 cycles, 2 long words per iteration
 333 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 334
 335 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
 336         dt      r6              !  67 EX
 337
 338         mov.l   r1, @-r0        !  30 LS
 339         bf/s    3b              ! 109 BR
 340
 341          mov.l  r2, @-r0        !  30 LS
 342
 343 5:      rts
 344          nop
 345
 346
 347         ! Size is 16 or greater and less than 64, but may have trailing bytes
             !
             ! .Lcase0: selected by the dispatch code in memcpy when the low
             ! two bits of (src - dst) are clear and r6 < 64.
             ! On entry: r0 = long word aligned end of the destination area
             ! still to be written, r4 = dst, r5 = src - dst, r6 = bytes left.
             ! tst r2,r6 tests bit 2 of r6: when it is set one long word is
             ! stored first and the loop is entered at 3:, otherwise the loop
             ! is entered at 4: with r1 already loaded.
             ! r7 = r4 + 11 bounds the loop: cmp/hi r7,r0 is evaluated before
             ! the two stores move r0 down by 8, so the loop repeats while at
             ! least 4 bytes remain after them.
 348
 349         .balign 32
 350 .Lcase0:
 351         add     #-4, r5         !  50 EX
 352         mov     r4, r7          !   5 MT (latency=0)
 353
 354         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 355         mov     #4, r2          !   6 EX
 356
 357         add     #11, r7         !  50 EX
 358         tst     r2, r6          !  86 MT
 359
 360         mov     r5, r3          !   5 MT (latency=0)
 361         bt/s    4f              ! 111 BR
 362
 363          add    #-4, r3         !  50 EX
 364         mov.l   r1,@-r0         !  30 LS
 365
 366         ! 4 cycles, 2 long words per iteration
 367 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 368
 369 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
 370         cmp/hi  r7, r0
 371
 372         mov.l   r1, @-r0        !  30 LS
 373         bt/s    3b              ! 109 BR
 374
 375          mov.l  r2, @-r0        !  30 LS
 376
 377         ! Copy the final 0-3 bytes
             !
             ! r5 becomes (src - dst) - 1 so that r0 + r5 addresses the source
             ! byte matching destination byte r0 - 1, and r7 becomes r4 + 1 so
             ! the byte loop ends once r0 has come down to r4.
 378
 379         add     #3,r5           !  50 EX
 380
 381         cmp/eq  r0, r4          !  54 MT
 382         add     #-10, r7        !  50 EX
 383
 384         bt      9f              ! 110 BR
 385
 386         ! 3 cycles, 1 byte per iteration
 387 1:      mov.b   @(r0,r5),r1     !  19 LS
 388         cmp/hi  r7,r0           !  57 MT
 389
 390         bt/s    1b              ! 111 BR
 391          mov.b  r1,@-r0         !  28 LS
 392
 393 9:      rts
 394          nop
 395
 396         ! Size is at least 64 bytes, so will be going round the big loop at least once.
 397         !
 398         !   r2 = rounded up r4
 399         !   r3 = rounded down r0
             !
             ! Two entry points share this code:
             !   .Lcase0b  - from the dispatch code in memcpy, with
             !               r5 = src - dst; it lowers r5 by 4 first
             !   .Lcase00b - from .Lcase00, with r5 already lowered by 4
             ! In both cases r0 = long word aligned end of the destination
             ! area still to be written and r4 = dst.
             ! The rounding uses the mask ~0x1f, i.e. 32 byte units.
 400
 401         .balign 32
 402 .Lcase0b:
 403         add     #-4, r5         !  50 EX
 404
 405 .Lcase00b:
 406         mov     r0, r3          !   5 MT (latency=0)
 407         mov     #(~0x1f), r1    !   6 EX
 408
 409         and     r1, r3          !  78 EX
 410         mov     r4, r2          !   5 MT (latency=0)
 411
 412         cmp/eq  r3, r0          !  54 MT
 413         add     #0x1f, r2       !  50 EX
 414
 415         bt/s    1f              ! 110 BR
 416          and    r1, r2          !  78 EX
 417
 418         ! copy initial words until cache line aligned
             !
             ! tst #4,r0: when bit 2 of r0 is clear an even number of long
             ! words is needed and the pair loop is entered at 4:.  Otherwise
             ! one long word is stored first; tst #0x18,r0 then tells whether
             ! that single store reaches the boundary, in which case the
             ! pair loop is skipped.
             ! r3 is raised by 8 because the loop compares r0 with r3 before
             ! its two stores move r0 down by 8.
 419
 420         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 421         tst     #4, r0          !  87 MT
 422
 423         mov     r5, r6          !   5 MT (latency=0)
 424         add     #-4, r6         !  50 EX
 425
 426         bt/s    4f              ! 111 BR
 427          add    #8, r3          !  50 EX
 428
 429         tst     #0x18, r0       !  87 MT
 430
 431         bt/s    1f              ! 109 BR
 432          mov.l  r1,@-r0         !  30 LS
 433
 434         ! 4 cycles, 2 long words per iteration
 435 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
 436
 437 4:      mov.l   @(r0, r6), r7   !  21 LS (latency=2)
 438         cmp/eq  r3, r0          !  54 MT
 439
 440         mov.l   r1, @-r0        !  30 LS
 441         bf/s    3b              ! 109 BR
 442
 443          mov.l  r7, @-r0        !  30 LS
 444
 445         ! Copy the cache line aligned blocks
 446         !
 447         ! In use: r0, r2, r4, r5
 448         ! Scratch: r1, r3, r6, r7
 449         !
 450         ! We could do this with the four scratch registers, but if src
 451         ! and dest hit the same cache line, this will thrash, so make
 452         ! use of additional registers.
 453         !
 454         ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
 455         !   r5:  src (was r0+r5)
 456         !   r1:  dest (was r0)
 457         ! this can be reversed at the end, so we don't need to save any extra
 458         ! state.
 459         !
             ! r8-r11 are pushed on the stack (r15) here and popped again
             ! after the loop.  After the two adjustments r5 points at the
             ! source data for the 32 byte block that ends at r1.
 460 1:      mov.l   r8, @-r15       !  30 LS
 461         add     r0, r5          !  49 EX
 462
 463         mov.l   r9, @-r15       !  30 LS
 464         mov     r0, r1          !   5 MT (latency=0)
 465
 466         mov.l   r10, @-r15      !  30 LS
 467         add     #-0x1c, r5      !  50 EX
 468
 469         mov.l   r11, @-r15      !  30 LS
 470
 471         ! 16 cycles, 32 bytes per iteration
             ! Eight long words are loaded before any is stored; r1 and r5
             ! both step down by 0x20 per pass and the loop ends when r1
             ! reaches r2.
 472 2:      mov.l   @(0x00,r5),r0   ! 18 LS (latency=2)
 473         add     #-0x20, r1      ! 50 EX
 474         mov.l   @(0x04,r5),r3   ! 18 LS (latency=2)
 475         mov.l   @(0x08,r5),r6   ! 18 LS (latency=2)
 476         mov.l   @(0x0c,r5),r7   ! 18 LS (latency=2)
 477         mov.l   @(0x10,r5),r8   ! 18 LS (latency=2)
 478         mov.l   @(0x14,r5),r9   ! 18 LS (latency=2)
 479         mov.l   @(0x18,r5),r10  ! 18 LS (latency=2)
 480         mov.l   @(0x1c,r5),r11  ! 18 LS (latency=2)
 481         movca.l r0,@r1          ! 40 LS (latency=3-7)
 482         mov.l   r3,@(0x04,r1)   ! 33 LS
 483         mov.l   r6,@(0x08,r1)   ! 33 LS
 484         mov.l   r7,@(0x0c,r1)   ! 33 LS
 485
 486         mov.l   r8,@(0x10,r1)   ! 33 LS
 487         add     #-0x20, r5      ! 50 EX
 488
 489         mov.l   r9,@(0x14,r1)   ! 33 LS
 490         cmp/eq  r2,r1           ! 54 MT
 491
 492         mov.l   r10,@(0x18,r1)  !  33 LS
 493         bf/s    2b              ! 109 BR
 494
 495          mov.l  r11,@(0x1c,r1)  !  33 LS
 496
             ! Restore the r0 / r5 invariant and pop r11-r8.  The pop of r8
             ! at label 1: is both the delay slot of the rts (taken when
             ! r0 == r4, nothing left) and the target of the bf/s 1f.
 497         mov     r1, r0          !   5 MT (latency=0)
 498
 499         mov.l   @r15+, r11      !  15 LS
 500         sub     r1, r5          !  75 EX
 501
 502         mov.l   @r15+, r10      !  15 LS
 503         cmp/eq  r4, r0          !  54 MT
 504
 505         bf/s    1f              ! 109 BR
 506          mov.l   @r15+, r9      !  15 LS
 507
 508         rts
 509 1:       mov.l  @r15+, r8       !  15 LS
 510         sub     r4, r1          !  75 EX                (len remaining)
 511
 512         ! number of trailing bytes is non-zero
 513         !
 514         ! invariants restored (r5 already decremented by 4)
 515         ! also r1=num bytes remaining
             !
             ! cmp/hs r2,r1 is an unsigned test of r1 >= 4: with fewer than
             ! 4 bytes left only the byte loop at 5: runs.  r7 = r4 + 11 is
             ! the bound for the long word pair loop below.
 516
 517         mov     #4, r2          !   6 EX
 518         mov     r4, r7          !   5 MT (latency=0)
 519
 520         add     #0x1c, r5       !  50 EX                (back to -4)
 521         cmp/hs  r2, r1          !  58 MT
 522
 523         bf/s    5f              ! 108 BR
 524          add     #11, r7        !  50 EX
 525
 526         mov.l   @(r0, r5), r6   !  21 LS (latency=2)
 527         tst     r2, r1          !  86 MT
 528
 529         mov     r5, r3          !   5 MT (latency=0)
 530         bt/s    4f              ! 111 BR
 531
 532          add    #-4, r3         !  50 EX
             ! NOTE(review): r1 and r2 are unchanged since the identical
             ! cmp/hs above, which had to be true to get here, so T looks
             ! always set and the bt/s 5f below always taken.  The result is
             ! still correct because the byte loop at 5: copies whatever is
             ! left, but the remainder then goes byte by byte - confirm
             ! whether a different test was intended.
 533         cmp/hs  r2, r1          !  58 MT
 534
 535         bt/s    5f              ! 111 BR
 536          mov.l  r6,@-r0         !  30 LS
 537
 538         ! 4 cycles, 2 long words per iteration
 539 3:      mov.l   @(r0, r5), r6   !  21 LS (latency=2)
 540
 541 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
 542         cmp/hi  r7, r0
 543
 544         mov.l   r6, @-r0        !  30 LS
 545         bt/s    3b              ! 109 BR
 546
 547          mov.l  r2, @-r0        !  30 LS
 548
 549         ! Copy the final 0-3 bytes
             !
             ! r7 becomes r4 + 1 so the byte loop ends once r0 has come down
             ! to r4; r5 becomes (src - dst) - 1 so that r0 + r5 addresses the
             ! source byte matching destination byte r0 - 1.
 550
 551 5:      cmp/eq  r0, r4          !  54 MT
 552         add     #-10, r7        !  50 EX
 553
 554         bt      9f              ! 110 BR
 555         add     #3,r5           !  50 EX
 556
 557         ! 3 cycles, 1 byte per iteration
 558 1:      mov.b   @(r0,r5),r1     !  19 LS
 559         cmp/hi  r7,r0           !  57 MT
 560
 561         bt/s    1b              ! 111 BR
 562          mov.b  r1,@-r0         !  28 LS
 563
 564 9:      rts
 565          nop
 566
 567         !
 568         !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
 569         !
             ! .Lcase2: selected by the dispatch code in memcpy when
             ! (src - dst) has bit 0 clear and bit 1 set, and r6 < 64.
             ! On entry: r0 = long word aligned end of the destination area
             ! still to be written, r4 = dst, r5 = src - dst.
             ! The data is moved as 16 bit words, two per loop pass, using
             ! the offsets r5 = (src - dst) - 2 and r6 = (src - dst) - 4.
             ! r2 = r4 + 7 bounds the loop: cmp/hi r2,r0 is evaluated before
             ! the two stores move r0 down by 4, so the loop repeats while at
             ! least 4 bytes remain after them.
             ! The remainder is handled at label 10:, which lies further down
             ! in the file and expects r5 to be lowered by 2 as it is here.
 570
 571         .balign 32
 572 .Lcase2:
 573         ! Size is 16 or greater and less than 64, but may have trailing bytes
 574
 575 2:      mov     r5, r6          !   5 MT (latency=0)
 576         add     #-2,r5          !  50 EX
 577
 578         mov     r4,r2           !   5 MT (latency=0)
 579         add     #-4,r6          !  50 EX
 580
 581         add     #7,r2           !  50 EX
 582 3:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
 583
 584         mov.w   @(r0,r6),r3     !  20 LS (latency=2)
 585         cmp/hi  r2,r0           !  57 MT
 586
 587         mov.w   r1,@-r0         !  29 LS
 588         bt/s    3b              ! 111 BR
 589
 590          mov.w  r3,@-r0         !  29 LS
 591
 592         bra     10f
 593          nop
 594
 595
 596         .balign 32
 597 .Lcase2b:
 598         ! Size is at least 64 bytes, so will be going round the big loop at least once.
 599         !
 600         !   r2 = rounded up r4
 601         !   r3 = rounded down r0
             !
             ! Selected by the dispatch code in memcpy when (src - dst) has
             ! bit 0 clear and bit 1 set, and r6 >= 64.
             ! On entry: r0 = long word aligned end of the destination area
             ! still to be written, r4 = dst, r5 = src - dst.
             ! The rounding uses the mask ~0x1f, i.e. 32 byte units.
             ! r5 is lowered by 2 for the 16 bit word copies that follow.
 602
 603         mov     r0, r3          !   5 MT (latency=0)
 604         mov     #(~0x1f), r1    !   6 EX
 605
 606         and     r1, r3          !  78 EX
 607         mov     r4, r2          !   5 MT (latency=0)
 608
 609         cmp/eq  r3, r0          !  54 MT
 610         add     #0x1f, r2       !  50 EX
 611
 612         add     #-2, r5         !  50 EX
 613         bt/s    1f              ! 110 BR
 614          and    r1, r2          !  78 EX
 615
 616         ! Copy a short word one at a time until we are cache line aligned
 617         !   Normal values: r0, r2, r3, r4
 618         !   Unused: r1, r6, r7
 619         !   Mod: r5 (=r5-2)
 620         !
             ! r3 is raised by 2 because the loop compares r0 with r3 before
             ! the store in the delay slot moves r0 down by 2.
 621         add     #2, r3          !  50 EX
 622
 623 2:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
 624         cmp/eq  r3,r0           !  54 MT
 625
 626         bf/s    2b              ! 111 BR
 627
 628          mov.w  r1,@-r0         !  29 LS
 629
 630         ! Copy the cache line aligned blocks
 631         !
 632         ! In use: r0, r2, r4, r5 (=r5-2)
 633         ! Scratch: r1, r3, r6, r7
 634         !
 635         ! We could do this with the four scratch registers, but if src
 636         ! and dest hit the same cache line, this will thrash, so make
 637         ! use of additional registers.
 638         !
 639         ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
 640         !   r5:  src (was r0+r5)
 641         !   r1:  dest (was r0)
 642         ! this can be reversed at the end, so we don't need to save any extra
 643         ! state.
 644         !
             ! r8-r12 are pushed on the stack (r15) here and popped again
             ! after the loop; each pass loads nine values (r0, r3, r6-r12).
             ! After the two adjustments r5 points at the source data for the
             ! 32 byte block that ends at r1.
 645 1:      mov.l   r8, @-r15       !  30 LS
 646         add     r0, r5          !  49 EX
 647
 648         mov.l   r9, @-r15       !  30 LS
 649         mov     r0, r1          !   5 MT (latency=0)
 650
 651         mov.l   r10, @-r15      !  30 LS
 652         add     #-0x1e, r5      !  50 EX
 653
 654         mov.l   r11, @-r15      !  30 LS
 655
 656         mov.l   r12, @-r15      !  30 LS
 657
 658         ! 17 cycles, 32 bytes per iteration
             ! Both variants read one 16 bit word, seven long words and one
             ! more word or long word, then use xtrct on neighbouring
             ! registers to form the eight long words that are stored.
             ! The loop ends when r1 reaches r2.
 659 #ifdef CONFIG_CPU_LITTLE_ENDIAN
             ! The post-increment loads advance r5 by 32 in total; the
             ! add #-0x40 below leaves it 0x20 lower for the next pass.
 660 2:      mov.w   @r5+, r0        !  14 LS (latency=2)            ..JI
 661         add     #-0x20, r1      !  50 EX
 662
 663         mov.l   @r5+, r3        !  15 LS (latency=2)            NMLK
 664
 665         mov.l   @r5+, r6        !  15 LS (latency=2)            RQPO
 666         shll16  r0              ! 103 EX                        JI..
 667
 668         mov.l   @r5+, r7        !  15 LS (latency=2)
 669         xtrct   r3, r0          !  48 EX                        LKJI
 670
 671         mov.l   @r5+, r8        !  15 LS (latency=2)
 672         xtrct   r6, r3          !  48 EX                        PONM
 673
 674         mov.l   @r5+, r9        !  15 LS (latency=2)
 675         xtrct   r7, r6          !  48 EX
 676
 677         mov.l   @r5+, r10       !  15 LS (latency=2)
 678         xtrct   r8, r7          !  48 EX
 679
 680         mov.l   @r5+, r11       !  15 LS (latency=2)
 681         xtrct   r9, r8          !  48 EX
 682
 683         mov.w   @r5+, r12       !  15 LS (latency=2)
 684         xtrct   r10, r9         !  48 EX
 685
 686         movca.l r0,@r1          !  40 LS (latency=3-7)
 687         xtrct   r11, r10        !  48 EX
 688
 689         mov.l   r3, @(0x04,r1)  !  33 LS
 690         xtrct   r12, r11        !  48 EX
 691
 692         mov.l   r6, @(0x08,r1)  !  33 LS
 693
 694         mov.l   r7, @(0x0c,r1)  !  33 LS
 695
 696         mov.l   r8, @(0x10,r1)  !  33 LS
 697         add     #-0x40, r5      !  50 EX
 698
 699         mov.l   r9, @(0x14,r1)  !  33 LS
 700         cmp/eq  r2,r1           !  54 MT
 701
 702         mov.l   r10, @(0x18,r1) !  33 LS
 703         bf/s    2b              ! 109 BR
 704
 705          mov.l  r11, @(0x1c,r1) !  33 LS
 706 #else
             ! Big endian variant: the block is read and written from its top
             ! end downwards.  r5 drops by 2 and then 0x1e and r1 drops by 4
             ! and then 0x1c, so both move down by 0x20 per pass.
 707 2:      mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
 708         add     #-2, r5         !  50 EX
 709
 710         mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
 711         add     #-4, r1         !  50 EX
 712
 713         mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
 714         shll16  r0              ! 103 EX
 715
 716         mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
 717         xtrct   r3, r0          !  48 EX
 718
 719         mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
 720         xtrct   r6, r3          !  48 EX
 721
 722         mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
 723         xtrct   r7, r6          !  48 EX
 724
 725         mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
 726         xtrct   r8, r7          !  48 EX
 727
 728         mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
 729         xtrct   r9, r8          !  48 EX
 730
 731         mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
 732         xtrct   r10, r9         !  48 EX
 733
 734         movca.l r0,@r1          !  40 LS (latency=3-7)
 735         add     #-0x1c, r1      !  50 EX
 736
 737         mov.l   r3, @(0x18,r1)  !  33 LS
 738         xtrct   r11, r10        !  48 EX
 739
 740         mov.l   r6, @(0x14,r1)  !  33 LS
 741         xtrct   r12, r11        !  48 EX
 742
 743         mov.l   r7, @(0x10,r1)  !  33 LS
 744
 745         mov.l   r8, @(0x0c,r1)  !  33 LS
 746         add     #-0x1e, r5      !  50 EX
 747
 748         mov.l   r9, @(0x08,r1)  !  33 LS
 749         cmp/eq  r2,r1           !  54 MT
 750
 751         mov.l   r10, @(0x04,r1) !  33 LS
 752         bf/s    2b              ! 109 BR
 753
 754          mov.l  r11, @(0x00,r1) !  33 LS
 755 #endif
 756
             ! Restore the r0 / r5 invariant and pop r12-r8.  The pop of r8
             ! at label 1: is both the delay slot of the rts (taken when
             ! r0 == r4, nothing left) and the target of the bf/s 1f.
 757         mov.l   @r15+, r12
 758         mov     r1, r0          !   5 MT (latency=0)
 759
 760         mov.l   @r15+, r11      !  15 LS
 761         sub     r1, r5          !  75 EX
 762
 763         mov.l   @r15+, r10      !  15 LS
 764         cmp/eq  r4, r0          !  54 MT
 765
 766         bf/s    1f              ! 109 BR
 767          mov.l   @r15+, r9      !  15 LS
 768
 769         rts
 770 1:       mov.l  @r15+, r8       !  15 LS
 771
             ! Bring r5 back to (src - dst) - 2 for the short word tail.
 772         add     #0x1e, r5       !  50 EX
 773
 774         ! Finish off a short word at a time
 775         ! r5 must be invariant - 2
             !
             ! Label 10: is also the target of the bra 10f in .Lcase2.
             ! cmp/hi r2,r0 with r2 = r4 + 1 is true while at least 2 bytes
             ! remain; otherwise the word loop is skipped.  In the loop
             ! r2 = r4 + 3, compared before the store moves r0 down by 2.
 776 10:     mov     r4,r2           !   5 MT (latency=0)
 777         add     #1,r2           !  50 EX
 778
 779         cmp/hi  r2, r0          !  57 MT
 780         bf/s    1f              ! 109 BR
 781
 782          add    #2, r2          !  50 EX
 783
 784 3:      mov.w   @(r0,r5),r1     !  20 LS
 785         cmp/hi  r2,r0           !  57 MT
 786
 787         bt/s    3b              ! 109 BR
 788
 789          mov.w  r1,@-r0         !  29 LS
 790 1:
 791
 792         !
 793         ! Finally, copy the last byte if necessary
             ! When r0 == r4 nothing is left and the bt/s goes to the nearest
             ! preceding 9: label, an rts / nop pair.  The add in the delay
             ! slot makes r5 = (src - dst) - 1 for the single byte copy,
             ! whose store sits in the delay slot of the final rts.
 794         cmp/eq  r4,r0           !  54 MT
 795         bt/s    9b
 796          add    #1,r5
 797         mov.b   @(r0,r5),r1
 798         rts
 799          mov.b  r1,@-r0
 800