X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;ds=sidebyside;f=crypto%2Faes%2Fasm%2Fvpaes-ppc.pl;h=74f4d9af28b7322609222b3b17323ac710c4e747;hb=32be631ca1f2b73c92e4f7f5d23f1c3aee80ec69;hp=122dfff0faf6c39442bb5d01707b424204af132f;hpb=a61e51220f9ceee8d5984677d5e2886ede674e0b;p=oweals%2Fopenssl.git diff --git a/crypto/aes/asm/vpaes-ppc.pl b/crypto/aes/asm/vpaes-ppc.pl index 122dfff0fa..74f4d9af28 100644 --- a/crypto/aes/asm/vpaes-ppc.pl +++ b/crypto/aes/asm/vpaes-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + ###################################################################### ## Constant-time SSSE3 AES core implementation. @@ -14,7 +21,8 @@ # 128-bit key. # # aes-ppc.pl this -# G4e 35.5/52.1/(23.8) 11.9(*)/15.4 +# PPC74x0/G4e 35.5/52.1/(23.8) 11.9(*)/15.4 +# PPC970/G5 37.9/55.0/(28.5) 22.2/28.5 # POWER6 42.7/54.3/(28.2) 63.0/92.8(**) # POWER7 32.3/42.9/(18.4) 18.5/23.3 # @@ -27,7 +35,10 @@ # (**) Inadequate POWER6 performance is due to astronomic AltiVec # latency, 9 cycles per simple logical operation. -$flavour = shift; +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; if ($flavour =~ /64/) { $SIZE_T =8; @@ -35,12 +46,14 @@ if ($flavour =~ /64/) { $STU ="stdu"; $POP ="ld"; $PUSH ="std"; + $UCMP ="cmpld"; } elsif ($flavour =~ /32/) { $SIZE_T =4; $LRSAVE =$SIZE_T; $STU ="stwu"; $POP ="lwz"; $PUSH ="stw"; + $UCMP ="cmplw"; } else { die "nonsense $flavour"; } $sp="r1"; @@ -51,7 +64,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or die "can't locate ppc-xlate.pl"; -open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; +open STDOUT,"| $^X $xlate $flavour \"$output\"" + || die "can't call $xlate: $!"; $code.=<<___; .machine "any" @@ -61,89 +75,89 @@ $code.=<<___; .align 7 # totally strategic alignment _vpaes_consts: Lk_mc_forward: # mc_forward - .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c - .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 - .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 - .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 + .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv + .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv + .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv + .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv Lk_mc_backward: # mc_backward - .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e - .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a - .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 - .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 + .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv + .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv + .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv + .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv Lk_sr: # sr - .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f - .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b - .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 - 
.long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv + .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv + .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv + .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv ## ## "Hot" constants ## Lk_inv: # inv, inva - .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 - .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 + .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev + .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev Lk_ipt: # input transform (lo, hi) - .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca - .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd + .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev + .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev Lk_sbo: # sbou, sbot - .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 - .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e + .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev + .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev Lk_sb1: # sb1u, sb1t - .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b - .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 + .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev + .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev Lk_sb2: # sb2u, sb2t - .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 - .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e + .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev + .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev ## ## Decryption stuff ## Lk_dipt: # decryption input transform - .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 - .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 + .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev + .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev Lk_dsbo: # decryption sbox final output - .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 - .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca + .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev + .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev Lk_dsb9: # decryption sbox output *9*u, *9*t - .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca - .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 + .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev + .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev Lk_dsbd: # decryption sbox output *D*u, *D*t - .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 - .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 + .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev + .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev Lk_dsbb: # decryption sbox output *B*u, *B*t - .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 - .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 + .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev + .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev Lk_dsbe: # decryption sbox output *E*u, *E*t - .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 - .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 + .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev + .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev ## ## Key schedule constants ## Lk_dksd: # decryption key schedule: invskew x*D - .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 - .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f + .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev + .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f 
?rev Lk_dksb: # decryption key schedule: invskew x*B - .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 - .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 + .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev + .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev Lk_dkse: # decryption key schedule: invskew x*E + 0x63 - .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 - .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd + .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev + .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev Lk_dks9: # decryption key schedule: invskew x*9 - .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a - .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b + .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev + .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev Lk_rcon: # rcon - .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 + .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis Lk_s63: - .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b + .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis Lk_opt: # output transform - .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 - .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 + .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev + .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev Lk_deskew: # deskew tables: inverts the sbox's "skew" - .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d - .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 + .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev + .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev .align 5 Lconsts: mflr r0 @@ -154,7 +168,7 @@ Lconsts: blr .long 0 .byte 0,12,0x14,0,0,0,0,0 -.asciz "Vector Permutaion AES for AltiVec, Mike Hamburg (Stanford University)" +.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)" .align 6 ___ @@ -227,7 +241,7 @@ _vpaes_encrypt_core: li r11, 0x10 lvx v6, r9, $key addi r9, r9, 16 - vperm v5, v5, v6, $keyperm # align round key + ?vperm v5, v5, v6, $keyperm # align round key addi r10, r11, 0x40 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1 @@ -275,7 +289,7 @@ Lenc_entry: vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak addi r9, r9, 16 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io - vperm v5, v5, v6, $keyperm # align round key + ?vperm v5, v5, v6, $keyperm # align round key vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo bdnz Lenc_loop @@ -302,87 +316,85 @@ Lenc_entry: mflr r6 mfspr r7, 256 # save vrsave stvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp - lwz r7,`$FRAME-4`($sp) # save vrsave + stw r7,`$FRAME-4`($sp) # save vrsave li r0, -1 $PUSH r6,`$FRAME+$LRSAVE`($sp) mtspr 256, r0 # preserve all AltiVec registers bl _vpaes_encrypt_preheat - neg r8, $inp # prepare for unaligned access - lvsl $keyperm, 0, $key - lvsr $outperm, 0, $out - lvsr $inpperm, 0, r8 # -$inp - vnor $outmask, v7, v7 # 0xff..ff - lvx $inptail, 0, 
$inp - vperm $outmask, v7, $outmask, $outperm + ?lvsl $inpperm, 0, $inp # prepare for unaligned access + lvx v0, 0, $inp addi $inp, $inp, 15 # 15 is not a typo - lvx $outhead, 0, $out - - ######## - vmr v0, $inptail + ?lvsr $outperm, 0, $out + ?lvsl $keyperm, 0, $key # prepare for unaligned access lvx $inptail, 0, $inp # redundant in aligned case - addi $inp, $inp, 16 - vperm v0, v0, $inptail, $inpperm + ?vperm v0, v0, $inptail, $inpperm bl _vpaes_encrypt_core - vperm v0, v0, v0, $outperm # rotate left - vsel v1, $outhead, v0, $outmask - vmr $outhead, v0 - stvx v1, 0, $out - addi $out, $out, 15 # 15 is not a typo - ######## + andi. r8, $out, 15 + li r9, 16 + beq Lenc_out_aligned - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out + vperm v0, v0, v0, $outperm # rotate right/left + mtctr r9 +Lenc_out_unaligned: + stvebx v0, 0, $out + addi $out, $out, 1 + bdnz Lenc_out_unaligned + b Lenc_done + +.align 4 +Lenc_out_aligned: + stvx v0, 0, $out +Lenc_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` mtlr r6 mtspr 256, r7 # restore vrsave lvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp addi $sp,$sp,$FRAME @@ -445,7 +457,7 @@ _vpaes_decrypt_core: li r11, 0x30 lvx v6, r9, $key addi r9, r9, 16 - vperm v5, v5, v6, $keyperm # align round key + ?vperm v5, v5, v6, $keyperm # align round key vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0 @@ -509,7 +521,7 @@ Ldec_entry: vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak addi r9, r9, 16 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io - vperm v5, v5, v6, $keyperm # align round key + ?vperm v5, v5, v6, $keyperm # align round key vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo bdnz Ldec_loop @@ -536,87 +548,85 @@ Ldec_entry: mflr r6 mfspr r7, 256 # save vrsave stvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp - lwz r7,`$FRAME-4`($sp) # save vrsave + stw r7,`$FRAME-4`($sp) # save vrsave li r0, -1 $PUSH r6,`$FRAME+$LRSAVE`($sp) mtspr 256, r0 # preserve all AltiVec registers bl _vpaes_decrypt_preheat - neg r8, $inp # prepare for unaligned access - lvsl $keyperm, 0, $key - lvsr $outperm, 0, $out - lvsr $inpperm, 0, r8 # -$inp - vnor $outmask, v7, v7 # 0xff..ff - lvx $inptail, 0, $inp - vperm $outmask, v7, $outmask, $outperm + ?lvsl $inpperm, 0, $inp # prepare for unaligned access + lvx v0, 0, $inp addi $inp, $inp, 15 # 15 is not a typo - lvx $outhead, 0, $out - - ######## - vmr 
v0, $inptail + ?lvsr $outperm, 0, $out + ?lvsl $keyperm, 0, $key lvx $inptail, 0, $inp # redundant in aligned case - addi $inp, $inp, 16 - vperm v0, v0, $inptail, $inpperm + ?vperm v0, v0, $inptail, $inpperm bl _vpaes_decrypt_core - vperm v0, v0, v0, $outperm # rotate left - vsel v1, $outhead, v0, $outmask - vmr $outhead, v0 - stvx v1, 0, $out - addi $out, $out, 15 # 15 is not a typo - ######## + andi. r8, $out, 15 + li r9, 16 + beq Ldec_out_aligned - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out + vperm v0, v0, v0, $outperm # rotate right/left + mtctr r9 +Ldec_out_unaligned: + stvebx v0, 0, $out + addi $out, $out, 1 + bdnz Ldec_out_unaligned + b Ldec_done + +.align 4 +Ldec_out_aligned: + stvx v0, 0, $out +Ldec_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` mtlr r6 mtspr 256, r7 # restore vrsave lvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp addi $sp,$sp,$FRAME @@ -629,87 +639,115 @@ Ldec_entry: .globl .vpaes_cbc_encrypt .align 5 .vpaes_cbc_encrypt: + ${UCMP}i r5,16 + bltlr- + $STU $sp,-`($FRAME+2*$SIZE_T)`($sp) mflr r0 li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` mfspr r12, 256 stvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp - lwz r12,`$FRAME-4`($sp) # save vrsave + stw r12,`$FRAME-4`($sp) # save vrsave $PUSH r30,`$FRAME+$SIZE_T*0`($sp) $PUSH r31,`$FRAME+$SIZE_T*1`($sp) - li r9, 16 + li r9, -16 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) - sub. r30, r5, r9 # copy length-16 + and r30, r5, r9 # copy length&-16 + andi. r9, $out, 15 # is $out aligned? 
mr r5, r6 # copy pointer to key mr r31, r7 # copy pointer to iv - blt Lcbc_abort - cmpwi r8, 0 # test direction li r6, -1 + mcrf cr1, cr0 # put aside $out alignment flag mr r7, r12 # copy vrsave mtspr 256, r6 # preserve all AltiVec registers lvx v24, 0, r31 # load [potentially unaligned] iv li r9, 15 - lvsl $inpperm, 0, r31 + ?lvsl $inpperm, 0, r31 lvx v25, r9, r31 - vperm v24, v24, v25, $inpperm + ?vperm v24, v24, v25, $inpperm + cmpwi r8, 0 # test direction neg r8, $inp # prepare for unaligned access vxor v7, v7, v7 - lvsl $keyperm, 0, $key - lvsr $outperm, 0, $out - lvsr $inpperm, 0, r8 # -$inp + ?lvsl $keyperm, 0, $key + ?lvsr $outperm, 0, $out + ?lvsr $inpperm, 0, r8 # -$inp vnor $outmask, v7, v7 # 0xff..ff lvx $inptail, 0, $inp - vperm $outmask, v7, $outmask, $outperm + ?vperm $outmask, v7, $outmask, $outperm addi $inp, $inp, 15 # 15 is not a typo - lvx $outhead, 0, $out beq Lcbc_decrypt bl _vpaes_encrypt_preheat li r0, 16 + beq cr1, Lcbc_enc_loop # $out is aligned + + vmr v0, $inptail + lvx $inptail, 0, $inp + addi $inp, $inp, 16 + ?vperm v0, v0, $inptail, $inpperm + vxor v0, v0, v24 # ^= iv + + bl _vpaes_encrypt_core + + andi. r8, $out, 15 + vmr v24, v0 # put aside iv + sub r9, $out, r8 + vperm $outhead, v0, v0, $outperm # rotate right/left + +Lcbc_enc_head: + stvebx $outhead, r8, r9 + cmpwi r8, 15 + addi r8, r8, 1 + bne Lcbc_enc_head + + sub. r30, r30, r0 # len -= 16 + addi $out, $out, 16 + beq Lcbc_unaligned_done + Lcbc_enc_loop: vmr v0, $inptail lvx $inptail, 0, $inp addi $inp, $inp, 16 - vperm v0, v0, $inptail, $inpperm + ?vperm v0, v0, $inptail, $inpperm vxor v0, v0, v24 # ^= iv bl _vpaes_encrypt_core vmr v24, v0 # put aside iv sub. r30, r30, r0 # len -= 16 - vperm v0, v0, v0, $outperm # rotate left + vperm v0, v0, v0, $outperm # rotate right/left vsel v1, $outhead, v0, $outmask vmr $outhead, v0 stvx v1, 0, $out addi $out, $out, 16 - bge Lcbc_enc_loop + bne Lcbc_enc_loop b Lcbc_done @@ -718,11 +756,37 @@ Lcbc_decrypt: bl _vpaes_decrypt_preheat li r0, 16 + beq cr1, Lcbc_dec_loop # $out is aligned + + vmr v0, $inptail + lvx $inptail, 0, $inp + addi $inp, $inp, 16 + ?vperm v0, v0, $inptail, $inpperm + vmr v25, v0 # put aside input + + bl _vpaes_decrypt_core + + andi. r8, $out, 15 + vxor v0, v0, v24 # ^= iv + vmr v24, v25 + sub r9, $out, r8 + vperm $outhead, v0, v0, $outperm # rotate right/left + +Lcbc_dec_head: + stvebx $outhead, r8, r9 + cmpwi r8, 15 + addi r8, r8, 1 + bne Lcbc_dec_head + + sub. r30, r30, r0 # len -= 16 + addi $out, $out, 16 + beq Lcbc_unaligned_done + Lcbc_dec_loop: vmr v0, $inptail lvx $inptail, 0, $inp addi $inp, $inp, 16 - vperm v0, v0, $inptail, $inpperm + ?vperm v0, v0, $inptail, $inpperm vmr v25, v0 # put aside input bl _vpaes_decrypt_core @@ -730,55 +794,61 @@ Lcbc_dec_loop: vxor v0, v0, v24 # ^= iv vmr v24, v25 sub. r30, r30, r0 # len -= 16 - vperm v0, v0, v0, $outperm # rotate left + vperm v0, v0, v0, $outperm # rotate right/left vsel v1, $outhead, v0, $outmask vmr $outhead, v0 stvx v1, 0, $out addi $out, $out, 16 - bge Lcbc_dec_loop + bne Lcbc_dec_loop Lcbc_done: - addi $out, $out, -1 - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out - + beq cr1, Lcbc_write_iv # $out is aligned + +Lcbc_unaligned_done: + andi. 
r8, $out, 15 + sub $out, $out, r8 + li r9, 0 +Lcbc_tail: + stvebx $outhead, r9, $out + addi r9, r9, 1 + cmpw r9, r8 + bne Lcbc_tail + +Lcbc_write_iv: neg r8, r31 # write [potentially unaligned] iv - lvsl $outperm, 0, r8 - li r6, 15 - vnor $outmask, v7, v7 # 0xff..ff - vperm $outmask, v7, $outmask, $outperm - lvx $outhead, 0, r31 - vperm v24, v24, v24, $outperm # rotate - vsel v0, $outhead, v24, $outmask - lvx v1, r6, r31 - stvx v0, 0, r31 - vsel v1, v24, v1, $outmask - stvx v1, r6, r31 + li r10, 4 + ?lvsl $outperm, 0, r8 + li r11, 8 + li r12, 12 + vperm v24, v24, v24, $outperm # rotate right/left + stvewx v24, 0, r31 # ivp is at least 32-bit aligned + stvewx v24, r10, r31 + stvewx v24, r11, r31 + stvewx v24, r12, r31 mtspr 256, r7 # restore vrsave li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` lvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp Lcbc_abort: @@ -863,10 +933,10 @@ _vpaes_schedule_core: neg r8, $inp # prepare for unaligned access lvx v0, 0, $inp addi $inp, $inp, 15 # 15 is not typo - lvsr $inpperm, 0, r8 # -$inp + ?lvsr $inpperm, 0, r8 # -$inp lvx v6, 0, $inp # v6 serves as inptail addi $inp, $inp, 8 - vperm v0, v0, v6, $inpperm + ?vperm v0, v0, v6, $inpperm # input transform vmr v3, v0 # vmovdqa %xmm0, %xmm3 @@ -877,18 +947,21 @@ _vpaes_schedule_core: # encrypting, output zeroth round key after transform li r8, 0x30 # mov \$0x30,%r8d - addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + li r9, 4 + li r10, 8 + li r11, 12 - lvsr $outperm, 0, $out # prepare for unaligned access - vspltisb $outmask, -1 # 0xff..ff - lvx $outhead, 0, $out - vperm $outmask, v9, $outmask, $outperm + ?lvsr $outperm, 0, $out # prepare for unaligned access + vnor $outmask, v9, v9 # 0xff..ff + ?vperm $outmask, v9, $outmask, $outperm #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) - vperm v1, v0, v0, $outperm # rotate left - vsel v2, $outhead, v1, $outmask - vmr $outhead, v1 - stvx v2, 0, $out + vperm $outhead, v0, v0, $outperm # rotate right/left + stvewx $outhead, 0, $out # some are superfluous + stvewx $outhead, r9, $out + stvewx $outhead, r10, $out + addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + stvewx $outhead, r11, $out b Lschedule_go Lschedule_am_decrypting: @@ -898,20 +971,24 @@ Lschedule_am_decrypting: addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 # decrypting, output zeroth round key after shiftrows lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 + li r9, 4 + li r10, 8 + li r11, 12 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 neg r0, $out # prepare for unaligned access - lvsl $outperm, 0, r0 - addi $out, $out, 15 # 15 is not typo - vspltisb $outmask, -1 # 0xff..ff - lvx $outhead, 0, $out - vperm $outmask, $outmask, v9, $outperm + ?lvsl $outperm, 0, r0 + vnor $outmask, v9, v9 # 0xff..ff + ?vperm $outmask, $outmask, v9, $outperm #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) - vperm v4, v4, v4, $outperm # rotate left - vsel v2, $outhead, v4, $outmask - vmr $outhead, v4 - stvx v2, 0, $out + vperm $outhead, v4, v4, $outperm # rotate right/left + stvewx $outhead, 0, $out # some are superfluous + stvewx $outhead, r9, $out + stvewx 
$outhead, r10, $out + addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + stvewx $outhead, r11, $out + addi $out, $out, 15 # 15 is not typo xori r8, r8, 0x30 # xor \$0x30, %r8 Lschedule_go: @@ -957,16 +1034,16 @@ Loop_schedule_128: Lschedule_192: li r0, 4 # mov \$4, %esi lvx v0, 0, $inp - vperm v0, v6, v0, $inpperm - vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + ?vperm v0, v6, v0, $inpperm + ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) bl _vpaes_schedule_transform # input transform - vsldoi v6, v0, v9, 8 - vsldoi v6, v9, v6, 8 # clobber "low" side with zeros + ?vsldoi v6, v0, v9, 8 + ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros mtctr r0 Loop_schedule_192: bl _vpaes_schedule_round - vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0 + ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle # save key n bl _vpaes_schedule_192_smear bl _vpaes_schedule_mangle # save key n+1 @@ -991,7 +1068,7 @@ Lschedule_256: li r0, 7 # mov \$7, %esi addi $inp, $inp, 8 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - vperm v0, v6, v0, $inpperm + ?vperm v0, v6, v0, $inpperm bl _vpaes_schedule_transform # input transform mtctr r0 @@ -1002,15 +1079,15 @@ Loop_schedule_256: # high round bl _vpaes_schedule_round bdz Lschedule_mangle_last # dec %esi - bl _vpaes_schedule_mangle + bl _vpaes_schedule_mangle # low round. swap xmm7 and xmm6 - vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 + ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 vmr v5, v7 # vmovdqa %xmm7, %xmm5 vmr v7, v6 # vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round vmr v7, v5 # vmovdqa %xmm5, %xmm7 - + b Loop_schedule_256 ## ## .aes_schedule_mangle_last @@ -1042,35 +1119,39 @@ Lschedule_mangle_last: bl _vpaes_schedule_transform # output transform #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key - vperm v0, v0, v0, $outperm # rotate left + vperm v0, v0, v0, $outperm # rotate right/left + li r10, 4 vsel v2, $outhead, v0, $outmask - vmr $outhead, v0 + li r11, 8 stvx v2, 0, $out - - addi $out, $out, 15 # 15 is not typo - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out + li r12, 12 + stvewx v0, 0, $out # some (or all) are redundant + stvewx v0, r10, $out + stvewx v0, r11, $out + stvewx v0, r12, $out b Lschedule_mangle_done .align 4 Lschedule_mangle_last_dec: lvx $iptlo, r11, r12 # reload $ipt lvx $ipthi, r9, r12 - addi $out, $out, -16 # add \$-16, %rdx + addi $out, $out, -16 # add \$-16, %rdx vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform # output transform #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key - vperm v0, v0, v0, $outperm # rotate left + addi r9, $out, -15 # -15 is not typo + vperm v0, v0, v0, $outperm # rotate right/left + li r10, 4 vsel v2, $outhead, v0, $outmask - vmr $outhead, v0 + li r11, 8 stvx v2, 0, $out + li r12, 12 + stvewx v0, 0, r9 # some (or all) are redundant + stvewx v0, r10, r9 + stvewx v0, r11, r9 + stvewx v0, r12, r9 - addi $out, $out, -15 # -15 is not typo - lvx v1, 0, $out # redundant in aligned case - vsel v1, $outhead, v1, $outmask - stvx v1, 0, $out Lschedule_mangle_done: mtlr r7 @@ -1104,14 +1185,14 @@ Lschedule_mangle_done: ## .align 4 _vpaes_schedule_192_smear: - vspltw v0, v7, 3 - vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + ?vspltw v0, v7, 3 + ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, 
%xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a vmr v0, v6 - vsldoi v6, v6, v9, 8 - vsldoi v6, v9, v6, 8 # clobber low side with zeros + ?vsldoi v6, v6, v9, 8 + ?vsldoi v6, v9, v6, 8 # clobber low side with zeros blr .long 0 .byte 0,12,0x14,0,0,0,0,0 @@ -1138,23 +1219,23 @@ _vpaes_schedule_192_smear: _vpaes_schedule_round: # extract rcon from xmm8 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 - vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1 - vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8 + ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1 + ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 # rotate - vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 - vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0 + ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 + ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0 # fall through... # low round: same as high round, but no rotation and no rcon. _vpaes_schedule_low_round: # smear xmm7 - vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1 + ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 vspltisb v1, 0x0f # 0x0f..0f - vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4 + ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4 # subbytes vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k @@ -1248,7 +1329,7 @@ _vpaes_schedule_mangle: andi. r8, r8, 0x30 # and \$0x30, %r8 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) - vperm v1, v3, v3, $outperm # rotate left + vperm v1, v3, v3, $outperm # rotate right/left vsel v2, $outhead, v1, $outmask vmr $outhead, v1 stvx v2, 0, $out @@ -1299,7 +1380,7 @@ Lschedule_mangle_dec: andi. 
r8, r8, 0x30 # and \$0x30, %r8 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) - vperm v1, v3, v3, $outperm # rotate left + vperm v1, v3, v3, $outperm # rotate right/left vsel v2, $outhead, v1, $outmask vmr $outhead, v1 stvx v2, 0, $out @@ -1316,28 +1397,28 @@ Lschedule_mangle_dec: mflr r0 mfspr r6, 256 # save vrsave stvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp - lwz r6,`$FRAME-4`($sp) # save vrsave + stw r6,`$FRAME-4`($sp) # save vrsave li r7, -1 $PUSH r0, `$FRAME+$LRSAVE`($sp) mtspr 256, r7 # preserve all AltiVec registers @@ -1346,7 +1427,7 @@ Lschedule_mangle_dec: addi r9, r9, 6 # add \$5,%eax stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - cmplw $dir, $bits, $bits + cmplw $dir, $bits, $bits # set encrypt direction li r8, 0x30 # mov \$0x30,%r8d bl _vpaes_schedule_core @@ -1357,31 +1438,31 @@ Lschedule_mangle_dec: mtlr r0 xor r3, r3, r3 lvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp addi $sp,$sp,$FRAME blr .long 0 - .byte 0,12,0x04,1,0x80,3,0 + .byte 0,12,0x04,1,0x80,0,3,0 .long 0 .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key @@ -1394,28 +1475,28 @@ Lschedule_mangle_dec: mflr r0 mfspr r6, 256 # save vrsave stvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 stvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp - lwz r6,`$FRAME-4`($sp) # save vrsave + stw r6,`$FRAME-4`($sp) # save vrsave li r7, -1 $PUSH r0, `$FRAME+$LRSAVE`($sp) mtspr 256, r7 # preserve all AltiVec registers @@ -1427,7 +1508,7 @@ Lschedule_mangle_dec: slwi r9, r9, 4 # shl \$4,%eax add $out, $out, r9 # lea (%rdx,%rax),%rdx - cmplwi $dir, $bits, 0 + cmplwi $dir, $bits, 0 # set decrypt direction srwi r8, $bits, 1 # shr \$1,%r8d andi. 
r8, r8, 32 # and \$32,%r8d xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 @@ -1440,38 +1521,78 @@ Lschedule_mangle_dec: mtlr r0 xor r3, r3, r3 lvx v20,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v21,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v22,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v23,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v24,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v25,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v26,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v27,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v28,r10,$sp - addi r10,r10,16 + addi r10,r10,32 lvx v29,r11,$sp - addi r11,r11,16 + addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp addi $sp,$sp,$FRAME blr .long 0 - .byte 0,12,0x04,1,0x80,3,0 + .byte 0,12,0x04,1,0x80,0,3,0 .long 0 .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key ___ } -$code =~ s/\`([^\`]*)\`/eval($1)/gem; - -print $code; +my $consts=1; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + # constants table endian-specific conversion + if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) { + my $conv=$2; + my @bytes=(); + + # convert to endian-agnostic format + foreach (split(/,\s+/,$1)) { + my $l = /^0/?oct:int; + push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; + } + + # little-endian conversion + if ($flavour =~ /le$/o) { + SWITCH: for($conv) { + /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; + /\?rev/ && do { @bytes=reverse(@bytes); last; }; + } + } + + #emit + print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; + next; + } + $consts=0 if (m/Lconsts:/o); # end of table + + # instructions prefixed with '?' are endian-specific and need + # to be adjusted accordingly... + if ($flavour =~ /le$/o) { # little-endian + s/\?lvsr/lvsl/o or + s/\?lvsl/lvsr/o or + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or + s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or + s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; + } else { # big-endian + s/\?([a-z]+)/$1/o; + } + + print $_,"\n"; +} -close STDOUT; +close STDOUT or die "error closing STDOUT";
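
The post-processing loop appended at the end of the script is what makes the little-endian flavours work: constants-table lines tagged "?rev"/"?inv"/"?asis" are expanded from .long words into .byte lists (byte-reversed or index-inverted on LE targets), and instructions prefixed with '?' (lvsl/lvsr, vperm, vsldoi, vspltw) are rewritten with their operands adjusted for the reversed byte order. Below is a minimal, standalone Perl sketch that mirrors those substitutions on a few representative lines; the sample strings, the default $flavour value and the small driver loop are illustrative assumptions only, not part of the real generator.

#!/usr/bin/env perl
# Sketch of the endian-aware post-processing pass from vpaes-ppc.pl.
# The regexes below mirror the ones in the diff; the sample input lines
# and the default $flavour are made up for illustration.
use strict;
use warnings;

my $flavour = @ARGV ? shift(@ARGV) : "linux64le";  # e.g. "linux32", "linux64", "linux64le"

# Representative generated lines: two constants-table entries (tagged
# "?rev" and "?inv") and one '?'-prefixed endian-sensitive instruction.
my @sample = (
    "\t.long\t0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca\t?rev",
    "\t.long\t0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c\t?inv",
    "\t?vperm\tv5, v5, v6, v7",
);

foreach (@sample) {
    # constants table: expand .long words into an endian-agnostic .byte list
    if (m/\.long\s+(.+)\s+(\?[a-z]*)$/) {
        my $conv  = $2;
        my @bytes = ();
        foreach my $word (split(/,\s+/, $1)) {
            my $l = $word =~ /^0/ ? oct($word) : int($word);
            push @bytes, ($l >> 24) & 0xff, ($l >> 16) & 0xff,
                         ($l >>  8) & 0xff,  $l        & 0xff;
        }
        if ($flavour =~ /le$/) {                       # little-endian target
            @bytes = map { $_ ^ 0xf } @bytes if $conv =~ /\?inv/;  # XOR each permutation index with 0xf
            @bytes = reverse(@bytes)         if $conv =~ /\?rev/;  # reverse the byte order of the table row
        }
        print ".byte\t", join(',', map { sprintf("0x%02x", $_) } @bytes), "\n";
        next;
    }

    # '?'-prefixed instructions: swap lvsl/lvsr, exchange the two vperm
    # inputs, and mirror vsldoi/vspltw arguments on little-endian targets
    if ($flavour =~ /le$/) {
        s/\?lvsr/lvsl/ or
        s/\?lvsl/lvsr/ or
        s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
        s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/ or
        s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/;
    } else {                                           # big-endian: just drop the '?'
        s/\?([a-z]+)/$1/;
    }
    print $_, "\n";
}

With the revised argument handling at the top of the diff, the real generator is driven like the other OpenSSL perlasm scripts, for example "perl vpaes-ppc.pl linux64le vpaes-ppc.s" (flavour first, output file last); the output file name here is only an example.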