crypto/rc4/asm/rc4-ia64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by David Mosberger <David.Mosberger@acm.org> based on the
  12 # Itanium optimized Crypto code which was released by HP Labs at
  13 # http://www.hpl.hp.com/research/linux/crypto/.
  14 #
  15 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
  16 #
  17 # Permission is hereby granted, free of charge, to any person obtaining
  18 # a copy of this software and associated documentation files (the
  19 # "Software"), to deal in the Software without restriction, including
  20 # without limitation the rights to use, copy, modify, merge, publish,
  21 # distribute, sublicense, and/or sell copies of the Software, and to
  22 # permit persons to whom the Software is furnished to do so, subject to
  23 # the following conditions:
  24 #
  25 # The above copyright notice and this permission notice shall be
  26 # included in all copies or substantial portions of the Software.
  27
  28 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  29 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  30 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  31 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  32 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  33 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  34 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
  35
  36
  37
   38 # This is a little helper program which generates a software-pipelined
   39 # loop for RC4 encryption.  The basic algorithm looks like this:
  40 #
  41 #   for (counter = 0; counter < len; ++counter)
  42 #     {
  43 #       in = inp[counter];
  44 #       SI = S[I];
  45 #       J = (SI + J) & 0xff;
  46 #       SJ = S[J];
  47 #       T = (SI + SJ) & 0xff;
  48 #       S[I] = SJ, S[J] = SI;
  49 #       ST = S[T];
  50 #       outp[counter] = in ^ ST;
  51 #       I = (I + 1) & 0xff;
  52 #     }
  53 #
  54 # Pipelining this loop isn't easy, because the stores to the S[] array
  55 # need to be observed in the right order.  The loop generated by the
  56 # code below has the following pipeline diagram:
  57 #
  58 #      cycle
  59 #     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
  60 # iter
  61 #   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  62 #   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  63 #   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  64 #
  65 #   where:
  66 #       LDI = load of S[I]
  67 #       LDJ = load of S[J]
  68 #       SWP = swap of S[I] and S[J]
  69 #       LDT = load of S[T]
  70 #
  71 # Note that in the above diagram, the major trouble-spot is that LDI
  72 # of the 2nd iteration is performed BEFORE the SWP of the first
  73 # iteration.  Fortunately, this is easy to detect (I of the 1st
  74 # iteration will be equal to J of the 2nd iteration) and when this
  75 # happens, we simply forward the proper value from the 1st iteration
  76 # to the 2nd one.  The proper value in this case is simply the value
  77 # of S[I] from the first iteration (thanks to the fact that SWP
  78 # simply swaps the contents of S[I] and S[J]).
  79 #
  80 # Another potential trouble-spot is in cycle 7, where SWP of the 1st
  81 # iteration issues at the same time as the LDI of the 3rd iteration.
  82 # However, thanks to IA-64 execution semantics, this can be taken
  83 # care of simply by placing LDI later in the instruction-group than
  84 # SWP.  IA-64 CPUs will automatically forward the value if they
  85 # detect that the SWP and LDI are accessing the same memory-location.
  86
  87 # The core-loop that can be pipelined then looks like this (annotated
  88 # with McKinley/Madison issue port & latency numbers, assuming L1
  89 # cache hits for the most part):
  90
  91 # operation:        instruction:                    issue-ports:  latency
  92 # ------------------  -----------------------------   ------------- -------
  93
  94 # Data = *inp++       ld1 data = [inp], 1             M0-M1         1 cyc     c0
  95 #                     shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1 1 cyc
  96 # I = (I + 1) & 0xff  padd1 nextI = I, one            M0-M3, I0, I1 3 cyc
  97 #                     ;;
  98 # SI = S[I]           ld8 SI = [Iptr]                 M0-M1         1 cyc     c1 * after SWAP!
  99 #                     ;;
 100 #                     cmp.eq.unc pBypass = I, J                                  * after J is valid!
 101 # J = SI + J          add J = J, SI                   M0-M3, I0, I1 1 cyc     c2
 102 #                     (pBypass) br.cond.spnt Bypass
 103 #                     ;;
 104 # ---------------------------------------------------------------------------------------
 105 # J = J & 0xff        zxt1 J = J                      I0, I1, 1 cyc           c3
 106 #                     ;;
 107 #                     shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1 1 cyc     c4
 108 #                     ;;
 109 # SJ = S[J]           ld8 SJ = [Jptr]                 M0-M1         1 cyc     c5
 110 #                     ;;
 111 # ---------------------------------------------------------------------------------------
 112 # T = (SI + SJ)       add T = SI, SJ                  M0-M3, I0, I1 1 cyc     c6
 113 #                     ;;
 114 # T = T & 0xff        zxt1 T = T                      I0, I1        1 cyc
 115 # S[I] = SJ           st8 [Iptr] = SJ                 M2-M3                   c7
 116 # S[J] = SI           st8 [Jptr] = SI                 M2-M3
 117 #                     ;;
 118 #                     shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1 1 cyc     c8
 119 #                     ;;
 120 # ---------------------------------------------------------------------------------------
 121 # T = S[T]            ld8 T = [Tptr]                  M0-M1         1 cyc     c9
 122 #                     ;;
 123 # data ^= T           xor data = data, T              M0-M3, I0, I1 1 cyc     c10
 124 #                     ;;
 125 # *out++ = Data ^ T   dep word = word, data, 8, POS   I0, I1        1 cyc     c11
 126 #                     ;;
 127 # ---------------------------------------------------------------------------------------
 128
 129 # There are several points worth making here:
 130
 131 #   - Note that due to the bypass/forwarding-path, the first two
  132 #     phases of the loop are strangely mingled together.  In
 133 #     particular, note that the first stage of the pipeline is
 134 #     using the value of "J", as calculated by the second stage.
 135 #   - Each bundle-pair will have exactly 6 instructions.
 136 #   - Pipelined, the loop can execute in 3 cycles/iteration and
 137 #     4 stages.  However, McKinley/Madison can issue "st1" to
 138 #     the same bank at a rate of at most one per 4 cycles.  Thus,
 139 #     instead of storing each byte, we accumulate them in a word
 140 #     and then write them back at once with a single "st8" (this
 141 #     implies that the setup code needs to ensure that the output
 142 #     buffer is properly aligned, if need be, by encoding the
 143 #     first few bytes separately).
 144 #   - There is no space for a "br.ctop" instruction.  For this
  145 #     reason we can't use modulo-loop support in IA-64 and have
 146 #     to do a traditional, purely software-pipelined loop.
 147 #   - We can't replace any of the remaining "add/zxt1" pairs with
 148 #     "padd1" because the latency for that instruction is too high
 149 #     and would push the loop to the point where more bypasses
 150 #     would be needed, which we don't have space for.
 151 #   - The above loop runs at around 3.26 cycles/byte, or roughly
 152 #     440 MByte/sec on a 1.5GHz Madison.  This is well below the
 153 #     system bus bandwidth and hence with judicious use of
 154 #     "lfetch" this loop can run at (almost) peak speed even when
 155 #     the input and output data reside in memory.  The
 156 #     max. latency that can be tolerated is (PREFETCH_DISTANCE *
 157 #     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
 158 #     least) 1-ahead prefetching of 128 byte cache-lines.  Note
 159 #     that we do NOT prefetch into L1, since that would only
 160 #     interfere with the S[] table values stored there.  This is
 161 #     acceptable because there is a 10 cycle latency between
 162 #     load and first use of the input data.
  163 #   - We use a branch to out-of-line bypass-code because of cycle-pressure:
 164 #     we calculate the next J, check for the need to activate the
 165 #     bypass path, and activate the bypass path ALL IN THE SAME
 166 #     CYCLE.  If we didn't have these constraints, we could do
 167 #     the bypass with a simple conditional move instruction.
 168 #     Fortunately, the bypass paths get activated relatively
 169 #     infrequently, so the extra branches don't cost all that much
 170 #     (about 0.04 cycles/byte, measured on a 16396 byte file with
 171 #     random input data).
 172 #
 173
 174 $output = pop;
 175 open STDOUT,">$output";
 176
 177 $phases = 4;            # number of stages/phases in the pipelined-loop
 178 $unroll_count = 6;      # number of times we unrolled it
 179 $pComI = (1 << 0);
 180 $pComJ = (1 << 1);
 181 $pComT = (1 << 2);
 182 $pOut  = (1 << 3);
 183
 184 $NData = 4;
 185 $NIP = 3;
 186 $NJP = 2;
 187 $NI = 2;
 188 $NSI = 3;
 189 $NSJ = 2;
 190 $NT = 2;
 191 $NOutWord = 2;
 192
 193 #
 194 # $threshold is the minimum length before we attempt to use the
 195 # big software-pipelined loop.  It MUST be greater-or-equal
 196 # to:
 197 #               PHASES * (UNROLL_COUNT + 1) + 7
 198 #
 199 # The "+ 7" comes from the fact we may have to encode up to
 200 #   7 bytes separately before the output pointer is aligned.
 201 #
 202 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
 203
 204 sub I {
 205     local *code = shift;
 206     local $format = shift;
 207     $code .= sprintf ("\t\t".$format."\n", @_);
 208 }
 209
 210 sub P {
 211     local *code = shift;
 212     local $format = shift;
 213     $code .= sprintf ($format."\n", @_);
 214 }
 215
 216 sub STOP {
 217     local *code = shift;
 218     $code .=<<___;
 219                 ;;
 220 ___
 221 }
 222
  223 sub emit_body {
          # emit_body(\$c, \$bypass, $iteration, $p)
          #
          # Appends the three-cycle code for one step of the unrolled,
          # software-pipelined loop to the scalar referenced by the first
          # argument and, when a bypass check is generated, the matching
          # out-of-line bypass stub to the scalar referenced by the second.
          #
          #   $iteration  running step number (0 for the first prologue step)
          #   $p          stage mask built from $pComI/$pComJ/$pComT/$pOut;
          #               an instruction is emitted only if its stage bit is set
  224     local *c = shift;
  225     local *bypass = shift;
  226     local ($iteration, $p) = @_;
  227
          # Step numbers of this step and the three before it.  They are
          # reduced modulo the $N* sizes below to select register-array
          # elements.
  228     local $i0 = $iteration;
  229     local $i1 = $iteration - 1;
  230     local $i2 = $iteration - 2;
  231     local $i3 = $iteration - 3;
          # Output-word numbers (eight bytes per word) and the byte position
          # inside the current word.  The divisions are not integer divisions;
          # $iw0 and $iw1 are only ever used as operands of "%".
  232     local $iw0 = ($iteration - 3) / 8;
  233     local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
  234     local $byte_num = ($iteration - 3) % 8;
  235     local $label = $iteration + 1;
          # Despite its name, $pAny is true only when all four stage bits are
          # set; it controls whether explicit bundle braces are emitted.
  236     local $pAny = ($p & 0xf) == 0xf;
          # A bypass check is generated whenever stage I is active, except in
          # the very first step.
  237     local $pByp = (($p & $pComI) && ($iteration > 0));
  238
  239     $c.=<<___;
  240 //////////////////////////////////////////////////
  241 ___
  242
          # No stage active: emit only a 4-byte store of the pending output
          # word (shifted right by 32 bits first on big-endian hosts).
  243     if (($p & 0xf) == 0) {
  244         $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
  245         &I(\$c,"shr.u   OutWord[%u] = OutWord[%u], 32;;",
  246                                 $iw1 % $NOutWord, $iw1 % $NOutWord);
  247         $c.="#endif\n";
  248         &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
  249         return;
  250     }
  251
  252     # Cycle 0
  253     &I(\$c, "{ .mmi")                                         if ($pAny);
  254     &I(\$c, "ld1    Data[%u] = [InPtr], 1", $i0 % $NData)     if ($p & $pComI);
  255     &I(\$c, "padd1  I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
  256     &I(\$c, "zxt1   J = J")                                   if ($p & $pComJ);
  257     &I(\$c, "}")                                              if ($pAny);
  258     &I(\$c, "{ .mmi")                                         if ($pAny);
  259     &I(\$c, "LKEY   T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)   if ($p & $pOut);
  260     &I(\$c, "add    T[%u] = SI[%u], SJ[%u]",
  261        $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)                     if ($p & $pComT);
  262     &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
  263     &I(\$c, "}")                                              if ($pAny);
  264     &STOP(\$c);
  265
  266     # Cycle 1
          # The two SKEY stores (the swap) are emitted before the LKEY of
          # SI; the header comment requires the load to come later in the
          # instruction group than the swap.
  267     &I(\$c, "{ .mmi")                                         if ($pAny);
  268     &I(\$c, "SKEY   [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
  269     &I(\$c, "SKEY   [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
  270     &I(\$c, "zxt1   T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)     if ($p & $pComT);
  271     &I(\$c, "}")                                              if ($pAny);
  272     &I(\$c, "{ .mmi")                                         if ($pAny);
  273     &I(\$c, "LKEY   SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
  274     &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)                 if ($p & $pComJ);
  275     &I(\$c, "xor    Data[%u] = Data[%u], T[%u]",
  276        $i3 % $NData, $i3 % $NData, $i1 % $NT)                 if ($p & $pOut);
  277     &I(\$c, "}")                                              if ($pAny);
  278     &STOP(\$c);
  279
  280     # Cycle 2
          # Compare the previous I with J, add the freshly loaded SI to J,
          # and branch to the out-of-line stub when the compare matched.
  281     &I(\$c, "{ .mmi")                                         if ($pAny);
  282     &I(\$c, "LKEY   SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
  283     &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)       if ($pByp);
  284     &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
  285        $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
  286     &I(\$c, "}")                                              if ($pAny);
  287     &I(\$c, "{ .mmb")                                         if ($pAny);
  288     &I(\$c, "add    J = J, SI[%u]", $i0 % $NSI)               if ($p & $pComI);
  289     &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)    if ($p & $pComT);
  290     &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
  291     &I(\$c, "}") if ($pAny);
  292     &STOP(\$c);
  293
          # Return point for the bypass stub of this step.
  294     &P(\$c, ".rc4Resume%u:", $label)                          if ($pByp);
          # At the start of each new output word (once $iteration has reached
          # $phases), store the previous word as 8 bytes.
  295     if ($byte_num == 0 && $iteration >= $phases) {
  296         &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
  297            $iw1 % $NOutWord)                                  if ($p & $pOut);
              # Last step of the unrolled loop body: issue the prefetches and
              # the br.cloop back to .rc4Loop.
  298         if ($iteration == (1 + $unroll_count) * $phases - 1) {
  299             if ($unroll_count == 6) {
  300                 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
  301                    $iw1 % $NOutWord, $iw0 % $NOutWord);
  302             }
  303             &I(\$c, "lfetch.nt1 [InPrefetch], %u",
  304                $unroll_count * $phases);
  305             &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
  306                $unroll_count * $phases);
  307             &I(\$c, "br.cloop.sptk.few .rc4Loop");
  308         }
  309     }
  310
          # Out-of-line bypass stub: undo the "add J = J, SI[i0]" from cycle 2,
          # add the previous step's SI instead, copy that SI into this step's
          # SI register, and branch back to the resume label.
  311     if ($pByp) {
  312         &P(\$bypass, ".rc4Bypass%u:", $label);
  313         &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
  314         &I(\$bypass, "nop 0");
  315         &I(\$bypass, "nop 0");
  316         &I(\$bypass, ";;");
  317         &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
  318         &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
  319         &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
  320         &I(\$bypass, ";;");
  321     }
  322 }
  323
      # Fixed leading part of the generated assembly: register and predicate
      # #defines, the LKEY/SKEY/KEYADDR macros selected by SZ, the
      # MODSCHED_RC4_PROLOGUE/MODSCHED_RC4_LOOP macros, and the RC4 entry code
      # up to the .rc4Prologue label.  The heredoc is interpolated by Perl:
      # $threshold, $unroll_count and $phases are substituted, and the
      # backslash-escaped quotes, "@" and line-continuation backslashes are
      # emitted as the plain characters.
  324 $code=<<___;
  325 .ident \"rc4-ia64.s, version 3.0\"
  326 .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
  327
  328 #define LCSave          r8
  329 #define PRSave          r9
  330
  331 /* Inputs become invalid once rotation begins!  */
  332
  333 #define StateTable      in0
  334 #define DataLen         in1
  335 #define InputBuffer     in2
  336 #define OutputBuffer    in3
  337
  338 #define KTable          r14
  339 #define J               r15
  340 #define InPtr           r16
  341 #define OutPtr          r17
  342 #define InPrefetch      r18
  343 #define OutPrefetch     r19
  344 #define One             r20
  345 #define LoopCount       r21
  346 #define Remainder       r22
  347 #define IFinal          r23
  348 #define EndPtr          r24
  349
  350 #define tmp0            r25
  351 #define tmp1            r26
  352
  353 #define pBypass         p6
  354 #define pDone           p7
  355 #define pSmall          p8
  356 #define pAligned        p9
  357 #define pUnaligned      p10
  358
  359 #define pComputeI       pPhase[0]
  360 #define pComputeJ       pPhase[1]
  361 #define pComputeT       pPhase[2]
  362 #define pOutput         pPhase[3]
  363
  364 #define RetVal          r8
  365 #define L_OK            p7
  366 #define L_NOK           p8
  367
  368 #define _NINPUTS        4
  369 #define _NOUTPUT        0
  370
  371 #define _NROTATE        24
  372 #define _NLOCALS        (_NROTATE - _NINPUTS - _NOUTPUT)
  373
  374 #ifndef SZ
  375 # define SZ     4       // this must be set to sizeof(RC4_INT)
  376 #endif
  377
  378 #if SZ == 1
  379 # define LKEY                   ld1
  380 # define SKEY                   st1
  381 # define KEYADDR(dst, i)        add dst = i, KTable
  382 #elif SZ == 2
  383 # define LKEY                   ld2
  384 # define SKEY                   st2
  385 # define KEYADDR(dst, i)        shladd dst = i, 1, KTable
  386 #elif SZ == 4
  387 # define LKEY                   ld4
  388 # define SKEY                   st4
  389 # define KEYADDR(dst, i)        shladd dst = i, 2, KTable
  390 #else
  391 # define LKEY                   ld8
  392 # define SKEY                   st8
  393 # define KEYADDR(dst, i)        shladd dst = i, 3, KTable
  394 #endif
  395
  396 #if defined(_HPUX_SOURCE) && !defined(_LP64)
  397 # define ADDP   addp4
  398 #else
  399 # define ADDP   add
  400 #endif
  401
  402 /* Define a macro for the bit number of the n-th byte: */
  403
  404 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
  405 # define HOST_IS_BIG_ENDIAN
  406 # define BYTE_POS(n)    (56 - (8 * (n)))
  407 #else
  408 # define BYTE_POS(n)    (8 * (n))
  409 #endif
  410
  411 /*
  412    We must perform the first phase of the pipeline explicitly since
  413    we will always load from the stable the first time. The br.cexit
  414    will never be taken since regardless of the number of bytes because
  415    the epilogue count is 4.
  416 */
  417 /* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
  418    assembler failed on original macro with syntax error. <appro> */
  419 #define MODSCHED_RC4_PROLOGUE                                              \\
  420         {                                                                  \\
  421                                 ld1             Data[0] = [InPtr], 1;      \\
  422                                 add             IFinal = 1, I[1];          \\
  423                                 KEYADDR(IPr[0], I[1]);                     \\
  424         } ;;                                                               \\
  425         {                                                                  \\
  426                                 LKEY            SI[0] = [IPr[0]];          \\
  427                                 mov             pr.rot = 0x10000;          \\
  428                                 mov             ar.ec = 4;                 \\
  429         } ;;                                                               \\
  430         {                                                                  \\
  431                                 add             J = J, SI[0];              \\
  432                                 zxt1            I[0] = IFinal;             \\
  433                                 br.cexit.spnt.few .+16; /* never taken */  \\
  434         } ;;
  435 #define MODSCHED_RC4_LOOP(label)                                           \\
  436 label:                                                                     \\
  437         {       .mmi;                                                      \\
  438                 (pComputeI)     ld1             Data[0] = [InPtr], 1;      \\
  439                 (pComputeI)     add             IFinal = 1, I[1];          \\
  440                 (pComputeJ)     zxt1            J = J;                     \\
  441         }{      .mmi;                                                      \\
  442                 (pOutput)       LKEY            T[1] = [T[1]];             \\
  443                 (pComputeT)     add             T[0] = SI[2], SJ[1];       \\
  444                 (pComputeI)     KEYADDR(IPr[0], I[1]);                     \\
  445         } ;;                                                               \\
  446         {       .mmi;                                                      \\
  447                 (pComputeT)     SKEY            [IPr[2]] = SJ[1];          \\
  448                 (pComputeT)     SKEY            [JP[1]] = SI[2];           \\
  449                 (pComputeT)     zxt1            T[0] = T[0];               \\
  450         }{      .mmi;                                                      \\
  451                 (pComputeI)     LKEY            SI[0] = [IPr[0]];          \\
  452                 (pComputeJ)     KEYADDR(JP[0], J);                         \\
  453                 (pComputeI)     cmp.eq.unc      pBypass, p0 = I[1], J;     \\
  454         } ;;                                                               \\
  455         {       .mmi;                                                      \\
  456                 (pComputeJ)     LKEY            SJ[0] = [JP[0]];           \\
  457                 (pOutput)       xor             Data[3] = Data[3], T[1];   \\
  458                                 nop             0x0;                       \\
  459         }{      .mmi;                                                      \\
  460                 (pComputeT)     KEYADDR(T[0], T[0]);                       \\
  461                 (pBypass)       mov             SI[0] = SI[1];             \\
  462                 (pComputeI)     zxt1            I[0] = IFinal;             \\
  463         } ;;                                                               \\
  464         {       .mmb;                                                      \\
  465                 (pOutput)       st1             [OutPtr] = Data[3], 1;     \\
  466                 (pComputeI)     add             J = J, SI[0];              \\
  467                                 br.ctop.sptk.few label;                    \\
  468         } ;;
  469
  470         .text
  471
  472         .align  32
  473
  474         .type   RC4, \@function
  475         .global RC4
  476
  477         .proc   RC4
  478         .prologue
  479
  480 RC4:
  481         {
  482                 .mmi
  483                 alloc   r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
  484
  485                 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
  486                       OutWord[2]
  487                 .rotp pPhase[4]
  488
  489                 ADDP            InPrefetch = 0, InputBuffer
  490                 ADDP            KTable = 0, StateTable
  491         }
  492         {
  493                 .mmi
  494                 ADDP            InPtr = 0, InputBuffer
  495                 ADDP            OutPtr = 0, OutputBuffer
  496                 mov             RetVal = r0
  497         }
  498         ;;
  499         {
  500                 .mmi
  501                 lfetch.nt1      [InPrefetch], 0x80
  502                 ADDP            OutPrefetch = 0, OutputBuffer
  503         }
  504         {               // Return 0 if the input length is nonsensical
  505                 .mib
  506                 ADDP            StateTable = 0, StateTable
  507                 cmp.ge.unc      L_NOK, L_OK = r0, DataLen
  508         (L_NOK) br.ret.sptk.few rp
  509         }
  510         ;;
  511         {
  512                 .mib
  513                 cmp.eq.or       L_NOK, L_OK = r0, InPtr
  514                 cmp.eq.or       L_NOK, L_OK = r0, OutPtr
  515                 nop             0x0
  516         }
  517         {
  518                 .mib
  519                 cmp.eq.or       L_NOK, L_OK = r0, StateTable
  520                 nop             0x0
  521         (L_NOK) br.ret.sptk.few rp
  522         }
  523         ;;
  524                 LKEY            I[1] = [KTable], SZ
  525 /* Prefetch the state-table. It contains 256 elements of size SZ */
  526
  527 #if SZ == 1
  528                 ADDP            tmp0 = 1*128, StateTable
  529 #elif SZ == 2
  530                 ADDP            tmp0 = 3*128, StateTable
  531                 ADDP            tmp1 = 2*128, StateTable
  532 #elif SZ == 4
  533                 ADDP            tmp0 = 7*128, StateTable
  534                 ADDP            tmp1 = 6*128, StateTable
  535 #elif SZ == 8
  536                 ADDP            tmp0 = 15*128, StateTable
  537                 ADDP            tmp1 = 14*128, StateTable
  538 #endif
  539                 ;;
  540 #if SZ >= 8
  541                 lfetch.fault.nt1                [tmp0], -256    // 15
  542                 lfetch.fault.nt1                [tmp1], -256;;
  543                 lfetch.fault.nt1                [tmp0], -256    // 13
  544                 lfetch.fault.nt1                [tmp1], -256;;
  545                 lfetch.fault.nt1                [tmp0], -256    // 11
  546                 lfetch.fault.nt1                [tmp1], -256;;
  547                 lfetch.fault.nt1                [tmp0], -256    //  9
  548                 lfetch.fault.nt1                [tmp1], -256;;
  549 #endif
  550 #if SZ >= 4
  551                 lfetch.fault.nt1                [tmp0], -256    //  7
  552                 lfetch.fault.nt1                [tmp1], -256;;
  553                 lfetch.fault.nt1                [tmp0], -256    //  5
  554                 lfetch.fault.nt1                [tmp1], -256;;
  555 #endif
  556 #if SZ >= 2
  557                 lfetch.fault.nt1                [tmp0], -256    //  3
  558                 lfetch.fault.nt1                [tmp1], -256;;
  559 #endif
  560         {
  561                 .mii
  562                 lfetch.fault.nt1                [tmp0]          //  1
  563                 add             I[1]=1,I[1];;
  564                 zxt1            I[1]=I[1]
  565         }
  566         {
  567                 .mmi
  568                 lfetch.nt1      [InPrefetch], 0x80
  569                 lfetch.excl.nt1 [OutPrefetch], 0x80
  570                 .save           pr, PRSave
  571                 mov             PRSave = pr
  572         } ;;
  573         {
  574                 .mmi
  575                 lfetch.excl.nt1 [OutPrefetch], 0x80
  576                 LKEY            J = [KTable], SZ
  577                 ADDP            EndPtr = DataLen, InPtr
  578         }  ;;
  579         {
  580                 .mmi
  581                 ADDP            EndPtr = -1, EndPtr     // Make it point to
  582                                                         // last data byte.
  583                 mov             One = 1
  584                 .save           ar.lc, LCSave
  585                 mov             LCSave = ar.lc
  586                 .body
  587         } ;;
  588         {
  589                 .mmb
  590                 sub             Remainder = 0, OutPtr
  591                 cmp.gtu         pSmall, p0 = $threshold, DataLen
  592 (pSmall)        br.cond.dpnt    .rc4Remainder           // Data too small for
  593                                                         // big loop.
  594         } ;;
  595         {
  596                 .mmi
  597                 and             Remainder = 0x7, Remainder
  598                 ;;
  599                 cmp.eq          pAligned, pUnaligned = Remainder, r0
  600                 nop             0x0
  601         } ;;
  602         {
  603                 .mmb
  604 .pred.rel       "mutex",pUnaligned,pAligned
  605 (pUnaligned)    add             Remainder = -1, Remainder
  606 (pAligned)      sub             Remainder = EndPtr, InPtr
  607 (pAligned)      br.cond.dptk.many .rc4Aligned
  608         } ;;
  609         {
  610                 .mmi
  611                 nop             0x0
  612                 nop             0x0
  613                 mov.i           ar.lc = Remainder
  614         }
  615
  616 /* Do the initial few bytes via the compact, modulo-scheduled loop
  617    until the output pointer is 8-byte-aligned.  */
  618
  619                 MODSCHED_RC4_PROLOGUE
  620                 MODSCHED_RC4_LOOP(.RC4AlignLoop)
  621
  622         {
  623                 .mib
  624                 sub             Remainder = EndPtr, InPtr
  625                 zxt1            IFinal = IFinal
  626                 clrrrb                          // Clear CFM.rrb.pr so
  627                 ;;                              // next "mov pr.rot = N"
  628                                                 // does the right thing.
  629         }
  630         {
  631                 .mmi
  632                 mov             I[1] = IFinal
  633                 nop             0x0
  634                 nop             0x0
  635         } ;;
  636
  637
  638 .rc4Aligned:
  639
  640 /*
  641    Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
  642  */
  643
  644         {
  645                 .mlx
  646                 add     LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
  647                 movl            Remainder = 0xaaaaaaaaaaaaaaab
  648         } ;;
  649         {
  650                 .mmi
  651                 setf.sig        f6 = LoopCount          // M2, M3       6 cyc
  652                 setf.sig        f7 = Remainder          // M2, M3       6 cyc
  653                 nop             0x0
  654         } ;;
  655         {
  656                 .mfb
  657                 nop             0x0
  658                 xmpy.hu         f6 = f6, f7
  659                 nop             0x0
  660         } ;;
  661         {
  662                 .mmi
  663                 getf.sig        LoopCount = f6;;        // M2           5 cyc
  664                 nop             0x0
  665                 shr.u           LoopCount = LoopCount, 4
  666         } ;;
  667         {
  668                 .mmi
  669                 nop             0x0
  670                 nop             0x0
  671                 mov.i           ar.lc = LoopCount
  672         } ;;
  673
  674 /* Now comes the unrolled loop: */
  675
  676 .rc4Prologue:
  677 ___
 678
 679 $iteration = 0;
 680
 681 # Generate the prologue:
 682 $predicates = 1;
 683 for ($i = 0; $i < $phases; ++$i) {
 684     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 685     $predicates = ($predicates << 1) | 1;
 686 }
 687
 688 $code.=<<___;
 689 .rc4Loop:
 690 ___
 691
 692 # Generate the body:
 693 for ($i = 0; $i < $unroll_count*$phases; ++$i) {
 694     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 695 }
 696
 697 $code.=<<___;
 698 .rc4Epilogue:
 699 ___
 700
 701 # Generate the epilogue:
 702 for ($i = 0; $i < $phases; ++$i) {
 703     $predicates <<= 1;
 704     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 705 }
 706
 707 $code.=<<___;
 708         {
 709                 .mmi
 710                 lfetch.nt1      [EndPtr]        // fetch line with last byte
 711                 mov             IFinal = I[1]
 712                 nop             0x0
 713         }
 714
 715 .rc4Remainder:
 716         {
 717                 .mmi
 718                 sub             Remainder = EndPtr, InPtr       // Calculate
 719                                                                 // # of bytes
 720                                                                 // left - 1
 721                 nop             0x0
 722                 nop             0x0
 723         } ;;
 724         {
 725                 .mib
 726                 cmp.eq          pDone, p0 = -1, Remainder // done already?
 727                 mov.i           ar.lc = Remainder
 728 (pDone)         br.cond.dptk.few .rc4Complete
 729         }
 730
 731 /* Do the remaining bytes via the compact, modulo-scheduled loop */
 732
 733                 MODSCHED_RC4_PROLOGUE
 734                 MODSCHED_RC4_LOOP(.RC4RestLoop)
 735
 736 .rc4Complete:
 737         {
 738                 .mmi
 739                 add             KTable = -SZ, KTable
 740                 add             IFinal = -1, IFinal
 741                 mov             ar.lc = LCSave
 742         } ;;
 743         {
 744                 .mii
 745                 SKEY            [KTable] = J,-SZ
 746                 zxt1            IFinal = IFinal
 747                 mov             pr = PRSave, 0x1FFFF
 748         } ;;
 749         {
 750                 .mib
 751                 SKEY            [KTable] = IFinal
 752                 add             RetVal = 1, r0
 753                 br.ret.sptk.few rp
 754         } ;;
 755 ___
 756
 757 # Last but not least, emit the code for the bypass-code of the unrolled loop:
 758
 759 $code.=$bypass;
 760
 761 $code.=<<___;
 762         .endp RC4
 763 ___
 764
 765 print $code;
 766
 767 close STDOUT;