fmovs %f1,%f3
fmovs %f0,%f2
- add %fp,BIAS,%i0 ! return pointer to caller´s top of stack
+ add %fp,BIAS,%i0 ! return pointer to caller´s top of stack
ret
restore
.type _sparcv9_rdtick,#function
.size _sparcv9_rdtick,.-_sparcv9_rdtick
+.global _sparcv9_vis1_probe ! VIS1 probe: runs one ldda through ASI_FP16_P and one fxor, both raw-encoded; sets no return value
+.align 8
+_sparcv9_vis1_probe:
+ add %sp,BIAS+2,%o1 ! %o1 = %sp+BIAS+2, a stack address used only as the load source
+ .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
+ retl
+ .word 0x81b00d80 !fxor %f0,%f0,%f0 ! sits in the retl delay slot; NOTE(review): presumably the caller catches the trap on CPUs without VIS1 - not visible here
+.type _sparcv9_vis1_probe,#function
+.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
+
! Probe and instrument VIS1 instruction. Output is number of cycles it
! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
! is slow (documented to be 6 cycles on T2) and the core is in-order
! single-issue, it should be possible to distinguish Tx reliably...
! Observed return values are:
!
-! UltraSPARC IIi 7
+! UltraSPARC IIe 7
! UltraSPARC III 7
! UltraSPARC T1 24
+! SPARC T4 65(*)
+!
+! (*) result has less to do with VIS instruction latencies; rdtick
+! itself appears to be that slow, but it does the trick in the sense
+! that FP and VIS code paths are still slower than integer-only ones.
!
! Numbers for T2 and SPARC64 V-VII are more than welcomed.
!
! It would be possible to detect specifically US-T1 by instrumenting
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
! a lot of %tick-s, couple of thousand on Linux...
-.global _sparcv9_vis1_probe
+.global _sparcv9_vis1_instrument
.align 8
-_sparcv9_vis1_probe:
+_sparcv9_vis1_instrument:
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
.word 0x91410000 !rd %tick,%o0
.word 0x81b00d80 !fxor %f0,%f0,%f0
.word 0x85b08d82 !fxor %f2,%f2,%f2
.word 0x38680002 !bgu,a %xcc,.+8
mov %o3,%o0
- ! check for ASI_FP16_P is redundant...
- add %sp,BIAS+2,%o1
retl
- .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
-.type _sparcv9_vis1_probe,#function
-.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
+ nop
+.type _sparcv9_vis1_instrument,#function
+.size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
+
+.global _sparcv9_vis2_probe ! VIS2 probe: does nothing except execute one raw-encoded bshuffle; sets no return value
+.align 8
+_sparcv9_vis2_probe:
+ retl
+ .word 0x81b00980 !bshuffle %f0,%f0,%f0 ! the probed instruction, placed in the retl delay slot
+.type _sparcv9_vis2_probe,#function
+.size _sparcv9_vis2_probe,.-_sparcv9_vis2_probe
+
+.global _sparcv9_fmadd_probe ! fused multiply-add probe: executes one raw-encoded fmaddd on zeroed operands; sets no return value
+.align 8
+_sparcv9_fmadd_probe:
+ .word 0x81b00d80 !fxor %f0,%f0,%f0 ! %f0 xor %f0: clears the first operand register
+ .word 0x85b08d82 !fxor %f2,%f2,%f2 ! %f2 xor %f2: clears the second operand register
+ retl
+ .word 0x81b80440 !fmaddd %f0,%f0,%f2,%f0 ! the probed instruction, placed in the retl delay slot
+.type _sparcv9_fmadd_probe,#function
+.size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
+
+.global _sparcv9_rdcfr ! returns the contents of %asr26 in %o0 (NOTE(review): presumably the CFR register, going by the symbol name - confirm)
+.align 8
+_sparcv9_rdcfr:
+ retl
+ .word 0x91468000 !rd %asr26,%o0 ! raw-encoded read, placed in the retl delay slot; %o0 carries the result back
+.type _sparcv9_rdcfr,#function
+.size _sparcv9_rdcfr,.-_sparcv9_rdcfr
+
+.global _sparcv9_vis3_probe ! VIS3 probe: does nothing except execute one raw-encoded xmulx; sets no return value
+.align 8
+_sparcv9_vis3_probe:
+ retl
+ .word 0x81b022a0 !xmulx %g0,%g0,%g0 ! the probed instruction, in the retl delay slot; all operands are %g0, so nothing is modified
+.type _sparcv9_vis3_probe,#function
+.size _sparcv9_vis3_probe,.-_sparcv9_vis3_probe
+
+.global _sparcv9_random ! executes one raw-encoded random instruction and hands its result back in %o0
+.align 8
+_sparcv9_random:
+ retl
+ .word 0x91b002a0 !random %o0 ! placed in the retl delay slot; %o0 is the return register
+.type _sparcv9_random,#function
+.size _sparcv9_random,.-_sparcv9_random ! size is measured from the label of this function, not the preceding one
.global OPENSSL_cleanse
.align 32
.type OPENSSL_cleanse,#function
.size OPENSSL_cleanse,.-OPENSSL_cleanse
+.global _sparcv9_vis1_instrument_bus ! in: %o0 = out (array of 32-bit words), %o1 = cnt; adds one %tick delta per word; returns cnt
+.align 8
+_sparcv9_vis1_instrument_bus:
+ mov %o1,%o3 ! save cnt
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ mov %o4,%o5 ! lasttick = tick
+ set 0,%g4 ! diff
+
+ andn %o0,63,%g1 ! priming pass (diff is 0, out is not advanced): %g1 = out rounded down to a 64-byte boundary
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4 ! %o4 = current word at out
+ add %o4,%g4,%g4 ! %g4 = word + diff
+ .word 0xc9e2100c !cas [%o0],%o4,%g4 ! write the sum back with cas; the outcome is not checked
+
+.Loop: .word 0x99410000 !rd %tick,%o4
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+
+ andn %o0,63,%g1 ! %g1 = out rounded down to a 64-byte boundary
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4 ! %o4 = current word at out
+ add %o4,%g4,%g4 ! %g4 = word + diff
+ .word 0xc9e2100c !cas [%o0],%o4,%g4 ! write the sum back with cas; the outcome is not checked
+ subcc %o1,1,%o1 ! --$cnt
+ bnz .Loop
+ add %o0,4,%o0 ! ++$out
+
+ retl
+ mov %o3,%o0 ! delay slot: return the cnt saved on entry
+.type _sparcv9_vis1_instrument_bus,#function
+.size _sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus
+
+.global _sparcv9_vis1_instrument_bus2 ! in: %o0 = out (32-bit words), %o1 = cnt, %o2 = max; out advances only when the %tick delta changes; returns cnt minus the words left
+.align 8
+_sparcv9_vis1_instrument_bus2:
+ mov %o1,%o3 ! save cnt
+ sll %o1,2,%o1 ! cnt*=4
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ mov %o4,%o5 ! lasttick = tick
+ set 0,%g4 ! diff
+
+ andn %o0,63,%g1 ! priming pass (diff is 0): %g1 = out rounded down to a 64-byte boundary
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4 ! %o4 = current word at out
+ add %o4,%g4,%g4 ! %g4 = word + diff
+ .word 0xc9e2100c !cas [%o0],%o4,%g4 ! write the sum back with cas; the outcome is not checked
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+ mov %g4,%g5 ! lastdiff=diff
+.Loop2:
+ andn %o0,63,%g1 ! %g1 = out rounded down to a 64-byte boundary
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4 ! %o4 = current word at out
+ add %o4,%g4,%g4 ! %g4 = word + diff
+ .word 0xc9e2100c !cas [%o0],%o4,%g4 ! write the sum back with cas; the outcome is not checked
+
+ subcc %o2,1,%o2 ! --max
+ bz .Ldone2
+ nop
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+ cmp %g4,%g5 ! zero flag is set when diff equals lastdiff
+ mov %g4,%g5 ! lastdiff=diff
+
+ .word 0x83408000 !rd %ccr,%g1
+ and %g1,4,%g1 ! isolate zero flag
+ xor %g1,4,%g1 ! flip zero flag
+
+ subcc %o1,%g1,%o1 ! conditional --$cnt
+ bnz .Loop2
+ add %o0,%g1,%o0 ! conditional ++$out
+
+.Ldone2:
+ srl %o1,2,%o1 ! undo the cnt*=4 scaling: %o1 = words left
+ retl
+ sub %o3,%o1,%o0 ! delay slot: return saved cnt minus words left
+.type _sparcv9_vis1_instrument_bus2,#function
+.size _sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2
+
.section ".init",#alloc,#execinstr
call OPENSSL_cpuid_setup
nop