From 1fb83a3bc28d4d179518c25c6f8294c9238cd94c Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Tue, 7 Jan 2014 16:46:25 +0100
Subject: [PATCH] aes/asm/vpaes-ppc.pl: add little-endian support.

---
 Configure                   |   2 +-
 TABLE                       |   2 +-
 crypto/aes/asm/vpaes-ppc.pl | 290 ++++++++++++++++++++----------
 3 files changed, 162 insertions(+), 132 deletions(-)

diff --git a/Configure b/Configure
index 4ae91992ed..2091b844c1 100755
--- a/Configure
+++ b/Configure
@@ -365,7 +365,7 @@ my %table=(
 ####
 "linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ppc64",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc64_asm}:linux64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
-"linux-ppc64le","gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:".eval{my $asm=$ppc64_asm;$asm=~s/vpaes\-ppc\.o//;$asm}.":linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
+"linux-ppc64le","gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:$ppc64_asm:linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
 "linux-ia64",	"gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall::-D_REENTRANT::-ldl -no_cpprt:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-x86_64",	"gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
diff --git a/TABLE b/TABLE
index b64b753932..e31e104ed0 100644
--- a/TABLE
+++ b/TABLE
@@ -4532,7 +4532,7 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
 $cpuid_obj    = ppccpuid.o ppccap.o
 $bn_obj       = bn-ppc.o ppc-mont.o ppc64-mont.o
 $des_obj      = 
-$aes_obj      = aes_core.o aes_cbc.o aes-ppc.o
+$aes_obj      = aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o
 $bf_obj       = 
 $md5_obj      = 
 $sha1_obj     = sha1-ppc.o sha256-ppc.o sha512-ppc.o
diff --git a/crypto/aes/asm/vpaes-ppc.pl b/crypto/aes/asm/vpaes-ppc.pl
index 122dfff0fa..f78e713f70 100644
--- a/crypto/aes/asm/vpaes-ppc.pl
+++ b/crypto/aes/asm/vpaes-ppc.pl
@@ -61,89 +61,89 @@ $code.=<<___;
 .align	7		# totally strategic alignment
 _vpaes_consts:
 Lk_mc_forward:	# mc_forward
-	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c
-	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300
-	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704
-	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08
+	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
+	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
+	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
+	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
 Lk_mc_backward:	# mc_backward
-	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e
-	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a
-	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506
-	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102
+	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
+	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
+	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
+	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
 Lk_sr:		# sr
-	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
-	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b
-	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07
-	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603
+	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
+	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
+	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
+	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv
 
 ##
 ## "Hot" constants
 ##
 Lk_inv:	# inv, inva
-	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704
-	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03
+	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
+	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
 Lk_ipt:	# input transform (lo, hi)
-	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca
-	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd
+	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
+	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
 Lk_sbo:	# sbou, sbot
-	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15
-	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e
+	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
+	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
 Lk_sb1:	# sb1u, sb1t
-	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b
-	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5
+	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
+	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
 Lk_sb2:	# sb2u, sb2t
-	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2
-	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e
+	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
+	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev
 
 ##
 ## Decryption stuff
 ##
 Lk_dipt:	# decryption input transform
-	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15
-	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712
+	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
+	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
 Lk_dsbo:	# decryption sbox final output
-	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7
-	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca
+	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
+	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
 Lk_dsb9:	# decryption sbox output *9*u, *9*t
-	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca
-	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72
+	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
+	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
 Lk_dsbd:	# decryption sbox output *D*u, *D*t
-	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5
-	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129
+	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
+	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
 Lk_dsbb:	# decryption sbox output *B*u, *B*t
-	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660
-	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3
+	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
+	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
 Lk_dsbe:	# decryption sbox output *E*u, *E*t
-	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222
-	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794
+	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
+	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev
 
 ##
 ## Key schedule constants
 ##
 Lk_dksd:	# decryption key schedule: invskew x*D
-	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007
-	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f
+	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
+	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
 Lk_dksb:	# decryption key schedule: invskew x*B
-	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603
-	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9
+	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
+	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
 Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
-	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553
-	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd
+	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
+	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
 Lk_dks9:	# decryption key schedule: invskew x*9
-	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a
-	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b
+	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
+	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev
 
 Lk_rcon:	# rcon
-	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70
+	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
 Lk_s63:
-	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b
+	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis
 
 Lk_opt:	# output transform
-	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7
-	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1
+	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
+	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
 Lk_deskew:	# deskew tables: inverts the sbox's "skew"
-	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d
-	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128
+	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
+	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
 .align	5
 Lconsts:
 	mflr	r0
@@ -227,7 +227,7 @@ _vpaes_encrypt_core:
 	li	r11, 0x10
 	lvx	v6, r9, $key
 	addi	r9, r9, 16
-	vperm	v5, v5, v6, $keyperm	# align round key
+	?vperm	v5, v5, v6, $keyperm	# align round key
 	addi	r10, r11, 0x40
 	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0
 	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm1
@@ -275,7 +275,7 @@ Lenc_entry:
 	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
 	addi	r9, r9, 16
 	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
-	vperm	v5, v5, v6, $keyperm	# align round key
+	?vperm	v5, v5, v6, $keyperm	# align round key
 	vxor	v3, v3, v1		# vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
 	bdnz	Lenc_loop
 
@@ -330,25 +330,20 @@ Lenc_entry:
 
 	bl	_vpaes_encrypt_preheat
 
-	neg	r8, $inp		# prepare for unaligned access
-	lvsl	$keyperm, 0, $key
-	lvsr	$outperm, 0, $out
-	lvsr	$inpperm, 0, r8		# -$inp
-	vnor	$outmask, v7, v7	# 0xff..ff
-	lvx	$inptail, 0, $inp
-	vperm	$outmask, v7, $outmask, $outperm
+	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
+	lvx	v0, 0, $inp
 	addi	$inp, $inp, 15		# 15 is not a typo
-	lvx	$outhead, 0, $out
-
-	########
-	vmr	v0, $inptail
+	?lvsr	$outperm, 0, $out
+	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
+	vnor	$outmask, v7, v7	# 0xff..ff
 	lvx	$inptail, 0, $inp	# redundant in aligned case
-	addi	$inp, $inp, 16
-	vperm	v0, v0, $inptail, $inpperm
+	?vperm	$outmask, v7, $outmask, $outperm
+	lvx	$outhead, 0, $out
+	?vperm	v0, v0, $inptail, $inpperm
 
 	bl	_vpaes_encrypt_core
 
-	vperm	v0, v0, v0, $outperm	# rotate left
+	vperm	v0, v0, v0, $outperm	# rotate right/left
 	vsel	v1, $outhead, v0, $outmask
 	vmr	$outhead, v0
 	stvx	v1, 0, $out
@@ -445,7 +445,7 @@ _vpaes_decrypt_core:
 	li	r11, 0x30
 	lvx	v6, r9, $key
 	addi	r9, r9, 16
-	vperm	v5, v5, v6, $keyperm	# align round key
+	?vperm	v5, v5, v6, $keyperm	# align round key
 	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0
 	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
 	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm1,	%xmm0
@@ -509,7 +509,7 @@ Ldec_entry:
 	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
 	addi	r9, r9, 16
 	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
-	vperm	v5, v5, v6, $keyperm	# align round key
+	?vperm	v5, v5, v6, $keyperm	# align round key
 	vxor	v3, v3, v1		# vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
 	bdnz	Ldec_loop
 
@@ -564,25 +559,20 @@ Ldec_entry:
 
 	bl	_vpaes_decrypt_preheat
 
-	neg	r8, $inp		# prepare for unaligned access
-	lvsl	$keyperm, 0, $key
-	lvsr	$outperm, 0, $out
-	lvsr	$inpperm, 0, r8		# -$inp
-	vnor	$outmask, v7, v7	# 0xff..ff
-	lvx	$inptail, 0, $inp
-	vperm	$outmask, v7, $outmask, $outperm
+	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
+	lvx	v0, 0, $inp
 	addi	$inp, $inp, 15		# 15 is not a typo
-	lvx	$outhead, 0, $out
-
-	########
-	vmr	v0, $inptail
+	?lvsr	$outperm, 0, $out
+	?lvsl	$keyperm, 0, $key
+	vnor	$outmask, v7, v7	# 0xff..ff
 	lvx	$inptail, 0, $inp	# redundant in aligned case
-	addi	$inp, $inp, 16
-	vperm	v0, v0, $inptail, $inpperm
+	?vperm	$outmask, v7, $outmask, $outperm
+	lvx	$outhead, 0, $out
+	?vperm	v0, v0, $inptail, $inpperm
 
 	bl	_vpaes_decrypt_core
 
-	vperm	v0, v0, v0, $outperm	# rotate left
+	vperm	v0, v0, v0, $outperm	# rotate right/left
 	vsel	v1, $outhead, v0, $outmask
 	vmr	$outhead, v0
 	stvx	v1, 0, $out
@@ -673,18 +663,18 @@ Ldec_entry:
 
 	lvx	v24, 0, r31		# load [potentially unaligned] iv
 	li	r9, 15
-	lvsl	$inpperm, 0, r31
+	?lvsl	$inpperm, 0, r31
 	lvx	v25, r9, r31
-	vperm	v24, v24, v25, $inpperm
+	?vperm	v24, v24, v25, $inpperm
 
 	neg	r8, $inp		# prepare for unaligned access
 	vxor	v7, v7, v7
-	lvsl	$keyperm, 0, $key
-	lvsr	$outperm, 0, $out
-	lvsr	$inpperm, 0, r8		# -$inp
+	?lvsl	$keyperm, 0, $key
+	?lvsr	$outperm, 0, $out
+	?lvsr	$inpperm, 0, r8		# -$inp
 	vnor	$outmask, v7, v7	# 0xff..ff
 	lvx	$inptail, 0, $inp
-	vperm	$outmask, v7, $outmask, $outperm
+	?vperm	$outmask, v7, $outmask, $outperm
 	addi	$inp, $inp, 15		# 15 is not a typo
 	lvx	$outhead, 0, $out
 
@@ -697,14 +687,14 @@ Lcbc_enc_loop:
 	vmr	v0, $inptail
 	lvx	$inptail, 0, $inp
 	addi	$inp, $inp, 16
-	vperm	v0, v0, $inptail, $inpperm
+	?vperm	v0, v0, $inptail, $inpperm
 	vxor	v0, v0, v24		# ^= iv
 
 	bl	_vpaes_encrypt_core
 
 	vmr	v24, v0			# put aside iv
 	sub.	r30, r30, r0		# len -= 16
-	vperm	v0, v0, v0, $outperm	# rotate left
+	vperm	v0, v0, v0, $outperm	# rotate right/left
 	vsel	v1, $outhead, v0, $outmask
 	vmr	$outhead, v0
 	stvx	v1, 0, $out
@@ -722,7 +712,7 @@ Lcbc_dec_loop:
 	vmr	v0, $inptail
 	lvx	$inptail, 0, $inp
 	addi	$inp, $inp, 16
-	vperm	v0, v0, $inptail, $inpperm
+	?vperm	v0, v0, $inptail, $inpperm
 	vmr	v25, v0			# put aside input
 
 	bl	_vpaes_decrypt_core
@@ -730,7 +720,7 @@ Lcbc_dec_loop:
 	vxor	v0, v0, v24		# ^= iv
 	vmr	v24, v25
 	sub.	r30, r30, r0		# len -= 16
-	vperm	v0, v0, v0, $outperm	# rotate left
+	vperm	v0, v0, v0, $outperm	# rotate right/left
 	vsel	v1, $outhead, v0, $outmask
 	vmr	$outhead, v0
 	stvx	v1, 0, $out
@@ -744,12 +734,12 @@ Lcbc_done:
 	stvx	v1, 0, $out
 
 	neg	r8, r31			# write [potentially unaligned] iv
-	lvsl	$outperm, 0, r8
+	?lvsl	$outperm, 0, r8
 	li	r6, 15
 	vnor	$outmask, v7, v7	# 0xff..ff
-	vperm	$outmask, v7, $outmask, $outperm
+	?vperm	$outmask, v7, $outmask, $outperm
 	lvx	$outhead, 0, r31
-	vperm	v24, v24, v24, $outperm	# rotate
+	vperm	v24, v24, v24, $outperm	# rotate right/left
 	vsel	v0, $outhead, v24, $outmask
 	lvx	v1, r6, r31
 	stvx	v0, 0, r31
@@ -863,10 +853,10 @@ _vpaes_schedule_core:
 	neg	r8, $inp		# prepare for unaligned access
 	lvx	v0, 0, $inp
 	addi	$inp, $inp, 15		# 15 is not typo
-	lvsr	$inpperm, 0, r8		# -$inp
+	?lvsr	$inpperm, 0, r8		# -$inp
 	lvx	v6, 0, $inp		# v6 serves as inptail
 	addi	$inp, $inp, 8
-	vperm	v0, v0, v6, $inpperm
+	?vperm	v0, v0, v6, $inpperm
 
 	# input transform
 	vmr	v3, v0			# vmovdqa	%xmm0,	%xmm3
@@ -879,13 +869,13 @@ _vpaes_schedule_core:
 
 	li	r8, 0x30		# mov	\$0x30,%r8d
 	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
-	lvsr	$outperm, 0, $out	# prepare for unaligned access
-	vspltisb	$outmask, -1	# 0xff..ff
+	?lvsr	$outperm, 0, $out	# prepare for unaligned access
+	vnor	$outmask, v9, v9	# 0xff..ff
 	lvx	$outhead, 0, $out
-	vperm	$outmask, v9, $outmask, $outperm
+	?vperm	$outmask, v9, $outmask, $outperm
 
 	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
-	vperm	v1, v0, v0, $outperm	# rotate left
+	vperm	v1, v0, v0, $outperm	# rotate right/left
 	vsel	v2, $outhead, v1, $outmask
 	vmr	$outhead, v1
 	stvx	v2, 0, $out
@@ -901,14 +891,14 @@ Lschedule_am_decrypting:
 
 	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
 	neg	r0, $out		# prepare for unaligned access
-	lvsl	$outperm, 0, r0
+	?lvsl	$outperm, 0, r0
 	addi	$out, $out, 15		# 15 is not typo
-	vspltisb	$outmask, -1	# 0xff..ff
+	vnor	$outmask, v9, v9	# 0xff..ff
 	lvx	$outhead, 0, $out
-	vperm	$outmask, $outmask, v9, $outperm
+	?vperm	$outmask, $outmask, v9, $outperm
 
 	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
-	vperm	v4, v4, v4, $outperm	# rotate left
+	vperm	v4, v4, v4, $outperm	# rotate right/left
 	vsel	v2, $outhead, v4, $outmask
 	vmr	$outhead, v4
 	stvx	v2, 0, $out
@@ -957,16 +947,16 @@ Loop_schedule_128:
 Lschedule_192:
 	li	r0, 4			# mov	\$4,	%esi
 	lvx	v0, 0, $inp
-	vperm	v0, v6, v0, $inpperm
-	vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	?vperm	v0, v6, v0, $inpperm
+	?vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
 	bl	_vpaes_schedule_transform	# input transform
-	vsldoi	v6, v0, v9, 8
-	vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
+	?vsldoi	v6, v0, v9, 8
+	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
 	mtctr	r0
 
Loop_schedule_192:
 	bl	_vpaes_schedule_round
-	vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
+	?vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
 	bl	_vpaes_schedule_mangle	# save key n
 	bl	_vpaes_schedule_192_smear
 	bl	_vpaes_schedule_mangle	# save key n+1
@@ -991,7 +981,7 @@ Lschedule_256:
 	li	r0, 7			# mov	\$7,	%esi
 	addi	$inp, $inp, 8
 	lvx	v0, 0, $inp		# vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
-	vperm	v0, v6, v0, $inpperm
+	?vperm	v0, v6, v0, $inpperm
 	bl	_vpaes_schedule_transform	# input transform
 	mtctr	r0
 
@@ -1005,7 +995,7 @@ Loop_schedule_256:
 	bl	_vpaes_schedule_mangle
 
 	# low round. swap xmm7 and xmm6
-	vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
+	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
 	vmr	v5, v7			# vmovdqa	%xmm7,	%xmm5
 	vmr	v7, v6			# vmovdqa	%xmm6,	%xmm7
 	bl	_vpaes_schedule_low_round
@@ -1042,7 +1032,7 @@ Lschedule_mangle_last:
 	bl	_vpaes_schedule_transform	# output transform
 
 	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
-	vperm	v0, v0, v0, $outperm	# rotate left
+	vperm	v0, v0, v0, $outperm	# rotate right/left
 	vsel	v2, $outhead, v0, $outmask
 	vmr	$outhead, v0
 	stvx	v2, 0, $out
@@ -1062,7 +1052,7 @@ Lschedule_mangle_last_dec:
 	bl	_vpaes_schedule_transform	# output transform
 
 	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
-	vperm	v0, v0, v0, $outperm	# rotate left
+	vperm	v0, v0, v0, $outperm	# rotate right/left
 	vsel	v2, $outhead, v0, $outmask
 	vmr	$outhead, v0
 	stvx	v2, 0, $out
@@ -1104,14 +1094,14 @@ Lschedule_mangle_done:
 ##
 .align	4
 _vpaes_schedule_192_smear:
-	vspltw	v0, v7, 3
-	vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
-	vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	?vspltw	v0, v7, 3
+	?vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	?vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
 	vxor	v6, v6, v1		# vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
 	vxor	v6, v6, v0		# vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
 	vmr	v0, v6
-	vsldoi	v6, v6, v9, 8
-	vsldoi	v6, v9, v6, 8		# clobber low side with zeros
+	?vsldoi	v6, v6, v9, 8
+	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
 	blr
 	.long	0
 	.byte	0,12,0x14,0,0,0,0,0
@@ -1138,23 +1128,23 @@ _vpaes_schedule_192_smear:
_vpaes_schedule_round:
 	# extract rcon from xmm8
 	#vxor	v4, v4, v4		# vpxor	%xmm4,	%xmm4,	%xmm4
-	vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
-	vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
+	?vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
+	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
 	vxor	v7, v7, v1		# vpxor	%xmm1,	%xmm7,	%xmm7
 
 	# rotate
-	vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
-	vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
+	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
+	?vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
 
 	# fall through...
 
 	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
 	# smear xmm7
-	vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
+	?vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
 	vxor	v7, v7, v1		# vpxor	%xmm1,	%xmm7,	%xmm7
 	vspltisb	v1, 0x0f	# 0x0f..0f
-	vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4
+	?vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4
 
 	# subbytes
 	vand	v1, v1, v0		# vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
@@ -1248,7 +1238,7 @@ _vpaes_schedule_mangle:
 	andi.	r8, r8, 0x30		# and	\$0x30,	%r8
 
 	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
-	vperm	v1, v3, v3, $outperm	# rotate left
+	vperm	v1, v3, v3, $outperm	# rotate right/left
 	vsel	v2, $outhead, v1, $outmask
 	vmr	$outhead, v1
 	stvx	v2, 0, $out
@@ -1299,7 +1289,7 @@ Lschedule_mangle_dec:
 	andi.	r8, r8, 0x30		# and	\$0x30,	%r8
 
 	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
-	vperm	v1, v3, v3, $outperm	# rotate left
+	vperm	v1, v3, v3, $outperm	# rotate right/left
 	vsel	v2, $outhead, v1, $outmask
 	vmr	$outhead, v1
 	stvx	v2, 0, $out
@@ -1346,7 +1336,7 @@ Lschedule_mangle_dec:
 	addi	r9, r9, 6		# add	\$5,%eax
 	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
 
-	cmplw	$dir, $bits, $bits
+	cmplw	$dir, $bits, $bits	# set encrypt direction
 	li	r8, 0x30		# mov	\$0x30,%r8d
 	bl	_vpaes_schedule_core
 
@@ -1427,7 +1417,7 @@ Lschedule_mangle_dec:
 	slwi	r9, r9, 4		# shl	\$4,%eax
 	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx
 
-	cmplwi	$dir, $bits, 0
+	cmplwi	$dir, $bits, 0		# set decrypt direction
 	srwi	r8, $bits, 1		# shr	\$1,%r8d
 	andi.	r8, r8, 32		# and	\$32,%r8d
 	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
@@ -1470,8 +1460,48 @@ Lschedule_mangle_dec:
 ___
 }
 
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
+my $consts=1;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	# constants table endian-specific conversion
+	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
+	    my $conv=$2;
+	    my @bytes=();
+
+	    # convert to endian-agnostic format
+	    foreach (split(/,\s+/,$1)) {
+		my $l = /^0/?oct:int;
+		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+	    }
+
+	    # little-endian conversion
+	    if ($flavour =~ /le$/o) {
+		SWITCH: for($conv) {
+		    /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+		    /\?rev/ && do { @bytes=reverse(@bytes); last; };
+		}
+	    }
+
+	    #emit
+	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+	    next;
+	}
+	$consts=0 if (m/Lconsts:/o);	# end of table
+
+	# instructions prefixed with '?' are endian-specific and need
+	# to be adjusted accordingly...
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/\?lvsr/lvsl/o or
+	    s/\?lvsl/lvsr/o or
+	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+	} else {			# big-endian
+	    s/\?([a-z]+)/$1/o;
+	}
+
+	print $_,"\n";
+}
 
 close STDOUT;
-- 
2.25.1
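
A note for readers tracing the endian logic in the post-processing loop this patch adds: constants tagged ?inv/?rev/?asis are re-emitted as explicit .byte strings (so the stored byte order no longer depends on how the assembler lays out .long), and instructions prefixed with ? are rewritten for little-endian flavours: lvsl/lvsr trade places because they generate mirror-image permute masks, vperm swaps its two source registers, vsldoi swaps sources and shifts by 16-N bytes, and vspltw selects word 3-N. The standalone Perl sketch below reproduces that pass in isolation, purely for illustration; the sample lines under __DATA__ are hypothetical stand-ins, not lines taken from vpaes-ppc.pl.

#!/usr/bin/env perl
# Standalone sketch of the endian-fixup pass in vpaes-ppc.pl above;
# illustrative only, with hypothetical __DATA__ input lines.
use strict;
use warnings;

my $flavour = $ARGV[0] // "linux64le";	# e.g. "linux64" or "linux64le"
my $le = ($flavour =~ /le$/);

while (my $line = <DATA>) {
    chomp $line;

    # Constants tagged ?inv/?rev/?asis become explicit .byte strings,
    # making the in-memory byte order independent of .long semantics.
    if ($line =~ /\.long\s+(.+)\s+(\?[a-z]*)$/) {
	my ($vals, $conv) = ($1, $2);
	my @bytes;
	foreach my $l (split /,\s+/, $vals) {
	    $l = oct($l) if $l =~ /^0/;		# accepts 0x... literals
	    push @bytes, ($l >> 24) & 0xff, ($l >> 16) & 0xff,
			 ($l >>  8) & 0xff,  $l         & 0xff;
	}
	if ($le) {
	    # ?inv: entries are permute indices; x^0xf re-targets them
	    #       at the byte-reversed register layout.
	    # ?rev: plain data; store the 16 bytes in reverse order.
	    # ?asis: already endian-neutral, left alone.
	    @bytes = map { $_ ^ 0xf } @bytes if $conv eq '?inv';
	    @bytes = reverse @bytes          if $conv eq '?rev';
	}
	print "\t.byte\t", join(',', map { sprintf "0x%02x", $_ } @bytes), "\n";
	next;
    }

    # Instructions prefixed with '?' are endian-sensitive.
    if ($le) {
	# lvsl/lvsr yield mirror-image masks, so they trade places;
	# vperm swaps its two source registers;
	# vsldoi swaps sources and shifts by 16-N (the assembler folds
	# "16-8" to a constant); vspltw picks word 3-N.
	$line =~ s/\?lvsr/lvsl/ or
	$line =~ s/\?lvsl/lvsr/ or
	$line =~ s/\?(vperm\s+v\d+,\s*)(v\d+,\s*)(v\d+,\s*)(v\d+)/$1$3$2$4/ or
	$line =~ s/\?(vsldoi\s+v\d+,\s*)(v\d+,)\s*(v\d+,\s*)(\d+)/$1$3$2 16-$4/ or
	$line =~ s/\?(vspltw\s+v\d+,\s*)(v\d+,)\s*(\d)/$1$2 3-$3/;
    } else {
	$line =~ s/\?([a-z]+)/$1/;	# big-endian: just drop the marker
    }
    print $line, "\n";
}

__DATA__
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	?lvsl	v1, 0, r3
	?vperm	v5, v5, v6, v7
	?vsldoi	v0, v6, v0, 8
	?vspltw	v0, v0, 3

Running the sketch with an argument of linux64 versus linux64le shows the two encodings that the single annotated assembly source expands to; the big-endian output is simply the input with the markers stripped.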