1 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
3 Permission is hereby granted, free of charge, to any person obtaining
4 a copy of this software and associated documentation files (the
5 "Software"), to deal in the Software without restriction, including
6 without limitation the rights to use, copy, modify, merge, publish,
7 distribute, sublicense, and/or sell copies of the Software, and to
8 permit persons to whom the Software is furnished to do so, subject to
9 the following conditions:
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
22 // Common registers are assigned as follows:
26 // t0 Const Tbl Ptr TPtr
27 // t1 Round Constant TRound
28 // t4 Block residual LenResid
29 // t5 Residual Data DTmp
31 // {in,out}0 Block 0 Cycle RotateM0
32 // {in,out}1 Block Value 12 M12
33 // {in,out}2 Block Value 8 M8
34 // {in,out}3 Block Value 4 M4
35 // {in,out}4 Block Value 0 M0
36 // {in,out}5 Block 1 Cycle RotateM1
37 // {in,out}6 Block Value 13 M13
38 // {in,out}7 Block Value 9 M9
39 // {in,out}8 Block Value 5 M5
40 // {in,out}9 Block Value 1 M1
41 // {in,out}10 Block 2 Cycle RotateM2
42 // {in,out}11 Block Value 14 M14
43 // {in,out}12 Block Value 10 M10
44 // {in,out}13 Block Value 6 M6
45 // {in,out}14 Block Value 2 M2
46 // {in,out}15 Block 3 Cycle RotateM3
47 // {in,out}16 Block Value 15 M15
48 // {in,out}17 Block Value 11 M11
49 // {in,out}18 Block Value 7 M7
50 // {in,out}19 Block Value 3 M3
51 // {in,out}20 Scratch Z
52 // {in,out}21 Scratch Y
53 // {in,out}22 Scratch X
54 // {in,out}23 Scratch W
55 // {in,out}24 Digest A A
56 // {in,out}25 Digest B B
57 // {in,out}26 Digest C C
58 // {in,out}27 Digest D D
59 // {in,out}28 Active Data Ptr DPtr
61 // out28 Dummy Value -
62 // bt0 Coroutine Link QUICK_RTN
64 /// These predicates are used for computing the padding block(s) and
65 /// are shared between the driver and digest co-routines
67 // pt0 Extra Pad Block pExtra
68 // pt1 Load next word pLoad
69 // pt2 Skip next word pSkip
70 // pt3 Search for Pad pNoPad
71 // pt4 Pad Word 0 pPad0
72 // pt5 Pad Word 1 pPad1
73 // pt6 Pad Word 2 pPad2
74 // pt7 Pad Word 3 pPad3
89 // These two below shall remain constant throughout the whole routine
// Each entry point sets exactly one of the pair with a single cmp.eq on r0,r0;
// the driver then uses them to pick the constant table and, on big-endian
// hosts, to decide whether to switch psr.be.
90 #define pDataOrder p14
91 #define pHostOrder p15
// Names with a trailing underscore are the out-register view of a slot
// (used by the driver before it calls a digest routine).
114 #define RotateM0_ out0
115 #define RotateM1_ out5
116 #define RotateM2_ out10
117 #define RotateM3_ out15
// Names without the underscore are the in-register view of the same slot
// number (used inside the digest routines).
146 #define RotateM2 in10
147 #define RotateM3 in15
153 /* register stack configuration for md5_block_asm_host_order(): */
159 /* register stack configuration for helpers: */
// The helpers receive as inputs exactly the registers the driver allocates
// as outputs, hence the input count is the driver's MD5_NOUT.
160 #define _NINPUTS MD5_NOUT
163 #define _NROTATE 24 /* this must be <= _NINPUTS */
// 32-bit (non-_LP64) HP-UX builds are special-cased here; the body of this
// conditional is outside this excerpt.
165 #if defined(_HPUX_SOURCE) && !defined(_LP64)
// HP-UX and B_ENDIAN builds are treated as big-endian hosts; the driver
// tests HOST_IS_BIG_ENDIAN before toggling psr.be.
171 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
172 #define HOST_IS_BIG_ENDIAN
175 // Macros for getting the left and right portions of little-endian words
// GETLW: dst = (low 8*align bits of src) placed at bit 32 - 8*align, all
// other bits zero (dep.z zeroes the rest of the destination).
177 #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align
// GETRW: dst = the 32 - 8*align bits of src starting at bit 8*align,
// zero-extended (extr.u), i.e. src shifted right by align bytes.
178 #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align
182 // Reads an input block, then calls the digest block
183 // subroutine and adds the results to the accumulated
184 // digest. It allocates 32 outs which the subroutine
185 // uses as its inputs and rotating
186 // registers. Initializes the round constant pointer and
187 // takes care of saving/restoring ar.lc
191 // in0 Context Ptr CtxPtr0
192 // in1 Input Data Ptr DPtrIn
193 // in2 Integral Blocks BlockCount
194 // rp Return Address -
198 // v2 Input Align InAlign
199 // t0 Shared w/digest -
200 // t1 Shared w/digest -
201 // t2 Shared w/digest -
202 // t3 Shared w/digest -
203 // t4 Shared w/digest -
204 // t5 Shared w/digest -
205 // t6 PFS Save PFSSave
206 // t7 ar.lc Save LCSave
207 // t8 Saved PR PRSave
208 // t9 2nd CtxPtr CtxPtr1
209 // t10 Table Base CTable
210 // t11 Table[0] CTable0
211 // t13 Accumulator A AccumA
212 // t14 Accumulator B AccumB
213 // t15 Accumulator C AccumC
214 // t16 Accumulator D AccumD
215 // pt0 Shared w/digest -
216 // pt1 Shared w/digest -
217 // pt2 Shared w/digest -
218 // pt3 Shared w/digest -
219 // pt4 Shared w/digest -
220 // pt5 Shared w/digest -
221 // pt6 Shared w/digest -
222 // pt7 Shared w/digest -
223 // pt8 Not Aligned pOff
224 // pt8 Blocks Left pAgain
// Third argument: number of blocks still to digest. The driver decrements it
// before each digest call and loops while it is non-zero.
235 #define BlockCount in2
245 /* md5_block_asm_host_order(MD5_CTX *c, const void *data, size_t num)
248 c: a pointer to a structure of this type:
250 typedef struct MD5state_st
254 MD5_LONG data[MD5_LBLOCK];
259 data: a pointer to the input data (may be misaligned)
260 num: the number of 16-word (64-byte) blocks to hash (i.e., the length
// md5_block_asm_data_order: entry point for input that is in "data order".
// It only records the mode in the pDataOrder/pHostOrder predicate pair and
// then branches to the .md5_block label, which is not part of this excerpt
// (presumably the shared driver body inside md5_block_asm_host_order).
265 .type md5_block_asm_data_order, @function
266 .global md5_block_asm_data_order
268 .proc md5_block_asm_data_order
269 md5_block_asm_data_order:
// r0 always equals itself, so the first predicate target (pDataOrder) is
// set and the second (pHostOrder) is cleared.
271 cmp.eq pDataOrder,pHostOrder = r0,r0
272 br.sptk.many .md5_block
274 .endp md5_block_asm_data_order
// md5_block_asm_host_order: entry point for host-order input, followed by the
// block driver itself. The driver loads the four digest words from the
// context, feeds BlockCount blocks to md5_digest_block<offset> (offset is the
// byte misalignment of the data pointer), adds each block's result into the
// accumulators, and finally stores the accumulators back to the context.
276 .type md5_block_asm_host_order, @function
277 .global md5_block_asm_host_order
279 .proc md5_block_asm_host_order
280 md5_block_asm_host_order:
// r0 == r0 is always true: pHostOrder is set, pDataOrder is cleared.
283 cmp.eq pHostOrder,pDataOrder = r0,r0
287 .save ar.pfs, PFSSave
// Allocate the driver's register frame; the previous ar.pfs is kept in PFSSave.
288 alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
// ADDP is defined outside this excerpt (NOTE(review): presumably add or
// addp4 depending on the pointer model - confirm). CtxPtr1 points 8 bytes
// into the context so two digest words can be loaded/stored in parallel.
289 ADDP CtxPtr1 = 8, CtxPtr0
293 ADDP DPtrIn = 0, DPtrIn
294 ADDP CtxPtr0 = 0, CtxPtr0
299 .pred.rel "mutex",pDataOrder,pHostOrder
// Pick the constant table matching the entry point. CTable must already hold
// the run-time address of .md5_block at this point (set outside this excerpt).
301 (pDataOrder) add CTable = .md5_tbl_data_order#-.md5_block#, CTable
302 (pHostOrder) add CTable = .md5_tbl_host_order#-.md5_block#, CTable
// InAlign = byte offset of the data pointer within its 4-byte word (0..3).
303 and InAlign = 0x3, DPtrIn
// Context words at offsets 0 and 8 go to A and C; both pointers then advance
// by 4 so the next pair of loads picks up offsets 4 and 12 (B and D).
307 ld4 AccumA = [CtxPtr0], 4
308 ld4 AccumC = [CtxPtr1], 4
315 ld4 AccumB = [CtxPtr0]
316 ld4 AccumD = [CtxPtr1]
// DPtr_ = DPtrIn with its two low bits cleared, i.e. rounded down to a word.
317 dep DPtr_ = 0, DPtrIn, 0, 2
319 #ifdef HOST_IS_BIG_ENDIAN
320 (pDataOrder) rum psr.be;; // switch to little-endian
// Preload the first round constant; CTable is left pointing at the second.
323 ld4 CTable0 = [CTable], 4
// Any non-zero InAlign takes the unaligned path.
324 cmp.ne pOff, p0 = 0, InAlign
325 (pOff) br.cond.spnt.many .md5_unaligned
328 // The FF load/compute loop rotates values three times, so that
329 // loading into M12 here produces the M0 value, M13 -> M1, etc.
333 ld4 M12_ = [DPtr_], 4
338 ld4 M13_ = [DPtr_], 4
343 ld4 M14_ = [DPtr_], 4
348 ld4 M15_ = [DPtr_], 4
// One block is consumed per call; the aligned path uses md5_digest_block0.
349 add BlockCount = -1, BlockCount
350 br.call.sptk.many QUICK_RTN = md5_digest_block0
353 // Now, we add the new digest values and do some clean-up
354 // before checking if there's another full block to process
357 add AccumA = AccumA, A_
358 add AccumB = AccumB, B_
359 cmp.ne pAgain, p0 = 0, BlockCount
362 add AccumC = AccumC, C_
363 add AccumD = AccumD, D_
364 (pAgain) br.cond.dptk.many .md5_block_loop0
368 #ifdef HOST_IS_BIG_ENDIAN
369 (pDataOrder) sum psr.be;; // switch back to big-endian mode
// CtxPtr0/CtxPtr1 still point at B and D (offsets 4 and 12); store those
// first with a -4 post-decrement, then A and C at offsets 0 and 8.
372 st4 [CtxPtr0] = AccumB, -4
373 st4 [CtxPtr1] = AccumD, -4
// Restore the caller's predicate registers from PRSave.
374 mov pr = PRSave, 0x1ffff ;;
377 st4 [CtxPtr0] = AccumA
378 st4 [CtxPtr1] = AccumC
// MD5UNALIGNED(offset): driver loop for data that starts `offset` bytes into
// a word. Each newly loaded aligned word (Y_) is split with GETLW/GETRW: its
// left part is ORed with the carry-over in DTmp to form the next message word
// (M12_), and its right part becomes the new carry-over. After the digest
// call the accumulators are updated exactly as in the aligned loop, and the
// macro ends by branching to .md5_exit.
386 #define MD5UNALIGNED(offset) \
387 .md5_process##offset: \
390 GETRW(DTmp, DTmp, offset) ; \
392 .md5_block_loop##offset: \
394 ld4 Y_ = [DPtr_], 4 ; \
395 mov TPtr = CTable ; \
396 mov TRound = CTable0 ; \
399 ld4 M13_ = [DPtr_], 4 ; \
404 ld4 M14_ = [DPtr_], 4 ; \
405 GETLW(W_, Y_, offset) ; \
410 or M12_ = W_, DTmp ; \
411 GETRW(DTmp, Y_, offset) ; \
414 ld4 M15_ = [DPtr_], 4 ; \
415 add BlockCount = -1, BlockCount ; \
416 br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
419 add AccumA = AccumA, A_ ; \
420 add AccumB = AccumB, B_ ; \
421 cmp.ne pAgain, p0 = 0, BlockCount ; \
424 add AccumC = AccumC, C_ ; \
425 add AccumD = AccumD, D_ ; \
426 (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
431 br.cond.sptk.many .md5_exit ; \
437 // Because variable shifts are expensive, we special case each of
438 // the four alignments. In practice, this won't hurt too much
439 // since only one working set of code will be loaded.
// Load the aligned word that contains the first data bytes, then dispatch on
// the misalignment. Only the tests for 1 and 2 are visible in this excerpt.
442 ld4 DTmp = [DPtr_], 4
443 cmp.eq pOff, p0 = 1, InAlign
444 (pOff) br.cond.dpnt.many .md5_process1
447 cmp.eq pOff, p0 = 2, InAlign
449 (pOff) br.cond.dpnt.many .md5_process2
455 .endp md5_block_asm_host_order
458 // MD5 Perform the F function and load
460 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
461 // computes the FF() round of functions, then branches to the common
462 // digest code to finish up with GG(), HH, and II().
466 // rp Return Address -
470 // v0 PFS bit bucket PFS
471 // v1 Loop Trip Count LTrip
472 // pt0 Load next word pMore
479 /* For GHI rounds: */
// COMPUTE(a, b, s, M, R): one step of a G/H/I round (only part of the body is
// visible in this excerpt). The visible lines fetch the next round constant
// into TRound, post-incrementing TPtr by 4, and rotate the low 32 bits of Z
// left by s: dep.z copies Z's low word into the upper half of Y, and shrp
// then shifts the 128-bit pair Z:Y right by 64 - s.
484 #define COMPUTE(a, b, s, M, R) \
487 ld4 TRound = [TPtr], 4 ; \
488 dep.z Y = Z, 32, 32 ;; \
489 shrp Z = Z, Y, 64 - s ; \
// LOOP(a, b, s, M, R, label): same visible constant fetch and rotate as
// COMPUTE, followed by a br.ctop back to `label` to close the round loop.
498 #define LOOP(a, b, s, M, R, label) \
500 ld4 TRound = [TPtr], 4 ; \
501 dep.z Y = Z, 32, 32 ;; \
502 shrp Z = Z, Y, 64 - s ; \
507 br.ctop.sptk.many label ; \
510 // G(B, C, D) = (B & D) | (C & ~D)
// The visible part of the macro starts the step by adding the message word
// M to the current round constant.
512 #define G(a, b, c, d, M) \
514 add Z = M, TRound ; \
524 // H(B, C, D) = B ^ C ^ D
// As for G, the visible line forms M + TRound in Z.
526 #define H(a, b, c, d, M) \
528 add Z = M, TRound ; \
538 // I(B, C, D) = C ^ (B | ~D)
540 // However, since we have an andcm operator, we use the fact that
544 // to rewrite the expression as
546 // I(B, C, D) = ~C ^ (~B & D)
// As for G and H, the visible line forms M + TRound in Z.
548 #define I(a, b, c, d, M) \
550 add Z = M, TRound ; \
562 COMPUTE(A, B, 5, M0, RotateM0) \
564 COMPUTE(D, A, 9, M1, RotateM1) \
566 COMPUTE(C, D, 14, M2, RotateM2) \
568 LOOP(B, C, 20, M3, RotateM3, label)
572 COMPUTE(A, B, 4, M0, RotateM0) \
574 COMPUTE(D, A, 11, M1, RotateM1) \
576 COMPUTE(C, D, 16, M2, RotateM2) \
578 LOOP(B, C, 23, M3, RotateM3, label)
582 COMPUTE(A, B, 6, M0, RotateM0) \
584 COMPUTE(D, A, 10, M1, RotateM1) \
586 COMPUTE(C, D, 15, M2, RotateM2) \
588 LOOP(B, C, 21, M3, RotateM3, label)
// FFLOAD(a, b, c, d, M, N, s): one step of the FF round that also streams in
// input. While pMore is set it loads the next 4-byte word from DPtr into N;
// it then forms M + TRound in Z, fetches the next round constant and rotates
// Z left by s with the same dep.z/shrp pair used by COMPUTE.
590 #define FFLOAD(a, b, c, d, M, N, s) \
592 (pMore) ld4 N = [DPtr], 4 ; \
593 add Z = M, TRound ; \
602 ld4 TRound = [TPtr], 4 ; \
604 dep.z Y = Z, 32, 32 ; \
608 shrp Z = Z, Y, 64 - s ;; \
// FFLOOP(a, b, c, d, M, N, s, dest): FFLOAD plus the loop tail. pMore is
// recomputed as (LTrip != 0) from the value LTrip had before this step's
// decrement, and br.ctop branches back to dest.
612 #define FFLOOP(a, b, c, d, M, N, s, dest) \
614 (pMore) ld4 N = [DPtr], 4 ; \
615 add Z = M, TRound ; \
624 ld4 TRound = [TPtr], 4 ; \
626 dep.z Y = Z, 32, 32 ; \
630 shrp Z = Z, Y, 64 - s ;; \
634 cmp.ne pMore, p0 = 0, LTrip ; \
635 add LTrip = -1, LTrip ; \
636 br.ctop.dptk.many dest ; \
// md5_digest_block0: FF round for word-aligned input; it is the routine the
// aligned driver loop calls. It runs the four FF steps in a br.ctop loop and
// then falls straight through into md5_digest_GHI for the remaining rounds.
639 .type md5_digest_block0, @function
642 .proc md5_digest_block0
// Helper register frame: _NINPUTS inputs, of which _NROTATE rotate.
648 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
// r0 == r0 is always true, so pMore starts out set and the first pass of
// FFLOAD/FFLOOP performs its predicated input loads.
653 cmp.eq pMore, p0 = r0, r0
// One pass = four FF steps with shift amounts 7, 12, 17, 22; FFLOOP branches
// back to .md5_FF_round0 (its label line is outside this excerpt).
659 FFLOAD(A, B, C, D, M12, RotateM0, 7)
660 FFLOAD(D, A, B, C, M13, RotateM1, 12)
661 FFLOAD(C, D, A, B, M14, RotateM2, 17)
662 FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
664 // !!! Fall through to md5_digest_GHI
666 .endp md5_digest_block0
// md5_digest_GHI: common tail of every digest routine. It is entered by
// fall-through from md5_digest_block0 and by a branch from each
// md5_digest_block<offset> variant, and returns to the driver via QUICK_RTN.
// Each table below shows the message-word order of one round (top row) and
// of the following round (bottom row).
668 .type md5_digest_GHI, @function
// No alloc is visible for this entry in the excerpt; .regstk describes the
// register frame already set up by the routine that transfers control here.
673 .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
678 // The following sequence shuffles the block constants round for the
681 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
682 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
731 // The following sequence shuffles the block constants round for the
734 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
735 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
784 // The following sequence shuffles the block constants round for the
787 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
788 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
840 br.ret.sptk.many QUICK_RTN
// FFLOADU(a, b, c, d, M, P, N, s, offset): FF step for misaligned input.
// Like the aligned step it loads the next aligned word into N while pMore is
// set, forms M + TRound in Z, fetches the next round constant and rotates Z
// left by s. In addition it re-assembles a message word: GETLW takes the left
// part of P into W and GETRW leaves the right part of P in DTmp as the
// carry-over for the following word.
845 #define FFLOADU(a, b, c, d, M, P, N, s, offset) \
847 (pMore) ld4 N = [DPtr], 4 ; \
848 add Z = M, TRound ; \
857 ld4 TRound = [TPtr], 4 ; \
858 GETLW(W, P, offset) ; \
863 dep.z Y = Z, 32, 32 ;; \
864 shrp Z = Z, Y, 64 - s ; \
868 GETRW(DTmp, P, offset) ; \
// FFLOOPU(a, b, c, d, M, P, N, s, offset): last FF step of a pass for
// misaligned input. The word re-assembly (GETLW, OR with DTmp, GETRW, and the
// copy of W back into P) is done only while pMore is set. pMore is then
// recomputed as (LTrip != 0) from LTrip's pre-decrement value, and br.ctop
// branches back to .md5_FF_round<offset>.
872 #define FFLOOPU(a, b, c, d, M, P, N, s, offset) \
874 (pMore) ld4 N = [DPtr], 4 ; \
875 add Z = M, TRound ; \
884 ld4 TRound = [TPtr], 4 ; \
885 (pMore) GETLW(W, P, offset) ; \
889 (pMore) or W = W, DTmp ; \
890 dep.z Y = Z, 32, 32 ;; \
891 shrp Z = Z, Y, 64 - s ; \
895 (pMore) GETRW(DTmp, P, offset) ; \
896 (pMore) mov P = W ; \
899 cmp.ne pMore, p0 = 0, LTrip ; \
900 add LTrip = -1, LTrip ; \
901 br.ctop.sptk.many .md5_FF_round##offset ; \
// MD5FBLOCK(offset): emits the procedure md5_digest_block<offset>, the FF
// round for input that starts `offset` bytes into a word. It allocates the
// helper register frame, sets pMore (r0 == r0 is always true) so the first
// pass performs its input loads, runs three FFLOADU steps and one FFLOOPU
// step per pass with shift amounts 7, 12, 17, 22, and finally branches to
// md5_digest_GHI for the remaining rounds. The closing .endp names the same
// symbol as the opening .proc.
904 #define MD5FBLOCK(offset) \
905 .type md5_digest_block##offset, @function ; \
908 .proc md5_digest_block##offset ; \
912 md5_digest_block##offset: \
914 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
919 cmp.eq pMore, p0 = r0, r0 ; \
924 .pred.rel "mutex", pLoad, pSkip ; \
925 .md5_FF_round##offset: \
926 FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
927 FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
928 FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
929 FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
934 br.cond.sptk.many md5_digest_GHI ; \
936 .endp md5_digest_block##offset
// md5_constants: two copies of the 64 MD5 round constants (64 * 4 bytes each,
// see the .size directive at the end). The driver selects
// .md5_tbl_data_order when pDataOrder is set and .md5_tbl_host_order when
// pHostOrder is set. The first copy spells every constant out byte by byte,
// least significant byte first, so its memory image does not depend on the
// assembler's byte order; the second copy uses data4 words. Row n of the
// first table is the little-endian byte sequence of word n of the second.
943 .type md5_constants, @object
945 .md5_tbl_data_order: // To ensure little-endian data
946 // order, code as bytes.
947 data1 0x78, 0xa4, 0x6a, 0xd7 // 0
948 data1 0x56, 0xb7, 0xc7, 0xe8 // 1
949 data1 0xdb, 0x70, 0x20, 0x24 // 2
950 data1 0xee, 0xce, 0xbd, 0xc1 // 3
951 data1 0xaf, 0x0f, 0x7c, 0xf5 // 4
952 data1 0x2a, 0xc6, 0x87, 0x47 // 5
953 data1 0x13, 0x46, 0x30, 0xa8 // 6
954 data1 0x01, 0x95, 0x46, 0xfd // 7
955 data1 0xd8, 0x98, 0x80, 0x69 // 8
956 data1 0xaf, 0xf7, 0x44, 0x8b // 9
957 data1 0xb1, 0x5b, 0xff, 0xff // 10
958 data1 0xbe, 0xd7, 0x5c, 0x89 // 11
959 data1 0x22, 0x11, 0x90, 0x6b // 12
960 data1 0x93, 0x71, 0x98, 0xfd // 13
961 data1 0x8e, 0x43, 0x79, 0xa6 // 14
962 data1 0x21, 0x08, 0xb4, 0x49 // 15
963 data1 0x62, 0x25, 0x1e, 0xf6 // 16
964 data1 0x40, 0xb3, 0x40, 0xc0 // 17
965 data1 0x51, 0x5a, 0x5e, 0x26 // 18
966 data1 0xaa, 0xc7, 0xb6, 0xe9 // 19
967 data1 0x5d, 0x10, 0x2f, 0xd6 // 20
968 data1 0x53, 0x14, 0x44, 0x02 // 21
969 data1 0x81, 0xe6, 0xa1, 0xd8 // 22
970 data1 0xc8, 0xfb, 0xd3, 0xe7 // 23
971 data1 0xe6, 0xcd, 0xe1, 0x21 // 24
972 data1 0xd6, 0x07, 0x37, 0xc3 // 25
973 data1 0x87, 0x0d, 0xd5, 0xf4 // 26
974 data1 0xed, 0x14, 0x5a, 0x45 // 27
975 data1 0x05, 0xe9, 0xe3, 0xa9 // 28
976 data1 0xf8, 0xa3, 0xef, 0xfc // 29
977 data1 0xd9, 0x02, 0x6f, 0x67 // 30
978 data1 0x8a, 0x4c, 0x2a, 0x8d // 31
979 data1 0x42, 0x39, 0xfa, 0xff // 32
980 data1 0x81, 0xf6, 0x71, 0x87 // 33
981 data1 0x22, 0x61, 0x9d, 0x6d // 34
982 data1 0x0c, 0x38, 0xe5, 0xfd // 35
983 data1 0x44, 0xea, 0xbe, 0xa4 // 36
984 data1 0xa9, 0xcf, 0xde, 0x4b // 37
985 data1 0x60, 0x4b, 0xbb, 0xf6 // 38
986 data1 0x70, 0xbc, 0xbf, 0xbe // 39
987 data1 0xc6, 0x7e, 0x9b, 0x28 // 40
988 data1 0xfa, 0x27, 0xa1, 0xea // 41
989 data1 0x85, 0x30, 0xef, 0xd4 // 42
990 data1 0x05, 0x1d, 0x88, 0x04 // 43
991 data1 0x39, 0xd0, 0xd4, 0xd9 // 44
992 data1 0xe5, 0x99, 0xdb, 0xe6 // 45
993 data1 0xf8, 0x7c, 0xa2, 0x1f // 46
994 data1 0x65, 0x56, 0xac, 0xc4 // 47
995 data1 0x44, 0x22, 0x29, 0xf4 // 48
996 data1 0x97, 0xff, 0x2a, 0x43 // 49
997 data1 0xa7, 0x23, 0x94, 0xab // 50
998 data1 0x39, 0xa0, 0x93, 0xfc // 51
999 data1 0xc3, 0x59, 0x5b, 0x65 // 52
1000 data1 0x92, 0xcc, 0x0c, 0x8f // 53
1001 data1 0x7d, 0xf4, 0xef, 0xff // 54
1002 data1 0xd1, 0x5d, 0x84, 0x85 // 55
1003 data1 0x4f, 0x7e, 0xa8, 0x6f // 56
1004 data1 0xe0, 0xe6, 0x2c, 0xfe // 57
1005 data1 0x14, 0x43, 0x01, 0xa3 // 58
1006 data1 0xa1, 0x11, 0x08, 0x4e // 59
1007 data1 0x82, 0x7e, 0x53, 0xf7 // 60
1008 data1 0x35, 0xf2, 0x3a, 0xbd // 61
1009 data1 0xbb, 0xd2, 0xd7, 0x2a // 62
1010 data1 0x91, 0xd3, 0x86, 0xeb // 63
1012 .md5_tbl_host_order: // OS data order, might as well
1013 // be little-endian.
1014 data4 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee // 0
1015 data4 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 // 4
1016 data4 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be // 8
1017 data4 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 // 12
1018 data4 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa // 16
1019 data4 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 // 20
1020 data4 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed // 24
1021 data4 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a // 28
1022 data4 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c // 32
1023 data4 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 // 36
1024 data4 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 // 40
1025 data4 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 // 44
1026 data4 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 // 48
1027 data4 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 // 52
1028 data4 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 // 56
1029 data4 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 // 60
1030 .size md5_constants#,64*4*2