bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.
author    Andy Polyakov <appro@openssl.org>
Fri, 4 Mar 2016 10:39:11 +0000 (11:39 +0100)
committer Andy Polyakov <appro@openssl.org>
Mon, 7 Mar 2016 21:16:11 +0000 (22:16 +0100)
Some OSes, *cough*-dows, insist on the stack being "wired" to
physical memory in a strictly sequential manner, i.e. if a stack
allocation spans two pages, a reference to the farthest one can
be punished with SEGV. But page walking does good even on other
OSes, because it guarantees that a villain thread hits the guard
page before it can do damage to an innocent one...
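
For context: Windows grows a thread's stack through a single guard page
just past the committed region, so a reference that jumps over the guard
page faults instead of growing the stack. In C terms the countermeasure
amounts to something like the sketch below (illustrative only; the names
probe_stack and PAGE_SIZE are hypothetical, and the real fix is the
assembly in the diffs that follow):

    #include <stddef.h>

    #define PAGE_SIZE 4096   /* page size assumed by the patch */

    /* Touch one byte per page, starting at the page nearest the
     * already-committed stack and walking down to the new stack
     * pointer, so each access lands at most one page beyond memory
     * that is already committed. */
    static void probe_stack(volatile unsigned char *sp, size_t span)
    {
        /* highest page-aligned offset inside the fresh allocation,
         * mirroring the "and" by -4096 in the patch */
        ptrdiff_t off = (ptrdiff_t)(span & ~(size_t)(PAGE_SIZE - 1));

        for (; off >= 0; off -= PAGE_SIZE)
            (void)sp[off];
    }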

Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from commit adc4f1fc25b2cac90076f1e1695b05b7aeeae501)

Resolved conflicts:
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont5.pl

Reviewed-by: Richard Levitte <levitte@openssl.org>

Files changed:
crypto/bn/asm/x86-mont.pl
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont5.pl

diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index e8f6b050842e361df1b28d6e90f0e4a400b91dfb..89f4de61e8964fce2793804a78c313fde9e27a3c 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -85,6 +85,21 @@ $frame=32;                           # size of above frame rounded up to 16n
 
        &and    ("esp",-64);            # align to cache line
 
+       # Some OSes, *cough*-dows, insist on stack being "wired" to
+       # physical memory in strictly sequential manner, i.e. if stack
+       # allocation spans two pages, then reference to farmost one can
+       # be punishable by SEGV. But page walking can do good even on
+       # other OSes, because it guarantees that villain thread hits
+       # the guard page before it can make damage to innocent one...
+       &mov    ("eax","ebp");
+       &sub    ("eax","esp");
+       &and    ("eax",-4096);
+&set_label("page_walk");
+       &mov    ("edx",&DWP(0,"esp","eax"));
+       &sub    ("eax",4096);
+       &data_byte(0x2e);
+       &jnc    (&label("page_walk"));
+
        ################################# load argument block...
        &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
        &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
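
A note on the &data_byte(0x2e) above: it plants a CS segment-override
prefix in front of the conditional jump, which Pentium 4-class processors
interpret as a static "branch not taken" hint and every other processor
treats as a harmless no-op prefix. The 64-bit hunks below use the same
trick, in one spot with an extra 0x66 operand-size prefix, presumably as
padding.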
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 17fb94c84c9f1793268a26fa553cec8ad38d5579..c8ae019da1b903d5b5da3ef2b0e3fe35aff643f0 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -91,6 +91,20 @@ bn_mul_mont:
 
        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
 .Lmul_body:
+       # Some OSes, *cough*-dows, insist on stack being "wired" to
+       # physical memory in strictly sequential manner, i.e. if stack
+       # allocation spans two pages, then reference to farmost one can
+       # be punishable by SEGV. But page walking can do good even on
+       # other OSes, because it guarantees that villain thread hits
+       # the guard page before it can make damage to innocent one...
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lmul_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x66,0x2e               # predict non-taken
+       jnc     .Lmul_page_walk
+
        mov     $bp,%r12                # reassign $bp
 ___
                $bp="%r12";
@@ -296,6 +310,14 @@ bn_mul4x_mont:
 
        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
 .Lmul4x_body:
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lmul4x_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lmul4x_page_walk
+
        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        mov     %rdx,%r12               # reassign $bp
 ___
@@ -707,6 +729,7 @@ $code.=<<___;
 .align 16
 bn_sqr4x_mont:
 .Lsqr4x_enter:
+       mov     %rsp,%rax
        push    %rbx
        push    %rbp
        push    %r12
@@ -715,12 +738,23 @@ bn_sqr4x_mont:
        push    %r15
 
        shl     \$3,${num}d             # convert $num to bytes
-       xor     %r10,%r10
        mov     %rsp,%r11               # put aside %rsp
-       sub     $num,%r10               # -$num
+       neg     $num                    # -$num
        mov     ($n0),$n0               # *n0
-       lea     -72(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
+       lea     -72(%rsp,$num,2),%rsp   # alloca(frame+2*$num)
        and     \$-1024,%rsp            # minimize TLB usage
+
+       sub     %rsp,%r11
+       and     \$-4096,%r11
+.Lsqr4x_page_walk:
+       mov     (%rsp,%r11),%r10
+       sub     \$4096,%r11
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lsqr4x_page_walk
+
+       mov     $num,%r10
+       neg     $num                    # restore $num
+       lea     -48(%rax),%r11          # restore saved %rsp
        ##############################################################
        # Stack layout
        #
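
The bn_sqr4x_mont hunk needs extra bookkeeping compared with the other
paths: %rsp is now saved in %rax on entry because %r11, which used to be
the only copy, doubles as the page-walk counter; the xor/sub pair that
built -$num in %r10 is replaced by negating $num in place (the walk needs
%r10 as scratch); and the trailing mov/neg/lea triplet restores the state
the unchanged code below expects, namely %r10 = -$num, a positive $num,
and %r11 holding the pre-allocation stack pointer, i.e. %rax minus the
48 bytes of the six pushes.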
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 235979181f5452376b2a38f65798a3664693dce6..99243ae3ec2cd470f47801fc37c9ee363e5c1813 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -84,6 +84,20 @@ bn_mul_mont_gather5:
 
        mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
 .Lmul_body:
+       # Some OSes, *cough*-dows, insist on stack being "wired" to
+       # physical memory in strictly sequential manner, i.e. if stack
+       # allocation spans two pages, then reference to farmost one can
+       # be punishable by SEGV. But page walking can do good even on
+       # other OSes, because it guarantees that villain thread hits
+       # the guard page before it can make damage to innocent one...
+       sub     %rsp,%rax
+       and     \$-4096,%rax
+.Lmul_page_walk:
+       mov     (%rsp,%rax),%r11
+       sub     \$4096,%rax
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lmul_page_walk
+
        lea     128($bp),%r12           # reassign $bp (+size optimization)
 ___
                $bp="%r12";
@@ -407,6 +421,14 @@ bn_mul4x_mont_gather5:
 
        mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
 .Lmul4x_body:
+       sub     %rsp,%rax
+       and     \$-4096,%rax
+.Lmul4x_page_walk:
+       mov     (%rsp,%rax),%r11
+       sub     \$4096,%rax
+       .byte   0x2e                    # predict non-taken
+       jnc     .Lmul4x_page_walk
+
        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        lea     128(%rdx),%r12          # reassign $bp (+size optimization)
 ___
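
In the gather5 flavours the register roles are simply swapped relative to
x86_64-mont.pl (%rax counts down the pages, %r11 is the scratch load
target), but the walk itself is identical. Hypothetical usage of the C
sketch from the commit message, approximating what each prologue now does
(alloca stands in for the hand-rolled stack arithmetic; the 72-byte frame
mirrors the lea -72(%rsp,$num,2),%rsp allocation in bn_sqr4x_mont):

    #include <alloca.h>
    #include <stddef.h>

    void mont_like_prologue(size_t num_words)
    {
        /* frame plus room for 2*num 64-bit temporaries */
        size_t span = 72 + 2 * num_words * sizeof(unsigned long);
        unsigned char *tp = alloca(span);

        probe_stack(tp, span);  /* sketch from the commit message */
        /* ... tp may now be referenced in any order ... */
    }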