--- /dev/null
+/*
+ * linux/arch/arm/lib/div64.S
+ *
+ * Optimized computation of 64-bit dividend / 32-bit divisor
+ *
+ * Author: Nicolas Pitre
+ * Created: Oct 5, 2003
+ * Copyright: Monta Vista Software, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#ifdef __UBOOT__
+#define UNWIND(x...)
+#endif
+
+#ifdef __ARMEB__
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#else
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#endif
+
+/*
+ * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
+ *
+ * Note: Calling convention is totally non standard for optimal code.
+ * This is meant to be used by do_div() from include/asm/div64.h only.
+ *
+ * Input parameters:
+ * xh-xl = dividend (clobbered)
+ * r4 = divisor (preserved)
+ *
+ * Output values:
+ * yh-yl = result
+ * xh = remainder
+ *
+ * Clobbered regs: xl, ip
+ */
+
+ENTRY(__do_div64)
+UNWIND(.fnstart)
+
+ @ Test for easy paths first.
+ subs ip, r4, #1
+ bls 9f @ divisor is 0 or 1
+ tst ip, r4
+ beq 8f @ divisor is power of 2
+
+ @ See if we need to handle upper 32-bit result.
+ cmp xh, r4
+ mov yh, #0
+ blo 3f
+
+ @ Align divisor with upper part of dividend.
+ @ The aligned divisor is stored in yl preserving the original.
+ @ The bit position is stored in ip.
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+ clz yl, r4
+ clz ip, xh
+ sub yl, yl, ip
+ mov ip, #1
+ mov ip, ip, lsl yl
+ mov yl, r4, lsl yl
+
+#else
+
+ mov yl, r4
+ mov ip, #1
+1: cmp yl, #0x80000000
+ cmpcc yl, xh
+ movcc yl, yl, lsl #1
+ movcc ip, ip, lsl #1
+ bcc 1b
+
+#endif
+
+ @ The division loop for needed upper bit positions.
+ @ Break out early if dividend reaches 0.
+2: cmp xh, yl
+ orrcs yh, yh, ip
+ subcss xh, xh, yl
+ movnes ip, ip, lsr #1
+ mov yl, yl, lsr #1
+ bne 2b
+
+ @ See if we need to handle lower 32-bit result.
+3: cmp xh, #0
+ mov yl, #0
+ cmpeq xl, r4
+ movlo xh, xl
+ retlo lr
+
+ @ The division loop for lower bit positions.
+ @ Here we shift remainer bits leftwards rather than moving the
+ @ divisor for comparisons, considering the carry-out bit as well.
+ mov ip, #0x80000000
+4: movs xl, xl, lsl #1
+ adcs xh, xh, xh
+ beq 6f
+ cmpcc xh, r4
+5: orrcs yl, yl, ip
+ subcs xh, xh, r4
+ movs ip, ip, lsr #1
+ bne 4b
+ ret lr
+
+ @ The top part of remainder became zero. If carry is set
+ @ (the 33th bit) this is a false positive so resume the loop.
+ @ Otherwise, if lower part is also null then we are done.
+6: bcs 5b
+ cmp xl, #0
+ reteq lr
+
+ @ We still have remainer bits in the low part. Bring them up.
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+ clz xh, xl @ we know xh is zero here so...
+ add xh, xh, #1
+ mov xl, xl, lsl xh
+ mov ip, ip, lsr xh
+
+#else
+
+7: movs xl, xl, lsl #1
+ mov ip, ip, lsr #1
+ bcc 7b
+
+#endif
+
+ @ Current remainder is now 1. It is worthless to compare with
+ @ divisor at this point since divisor can not be smaller than 3 here.
+ @ If possible, branch for another shift in the division loop.
+ @ If no bit position left then we are done.
+ movs ip, ip, lsr #1
+ mov xh, #1
+ bne 4b
+ ret lr
+
+8: @ Division by a power of 2: determine what that divisor order is
+ @ then simply shift values around
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+ clz ip, r4
+ rsb ip, ip, #31
+
+#else
+
+ mov yl, r4
+ cmp r4, #(1 << 16)
+ mov ip, #0
+ movhs yl, yl, lsr #16
+ movhs ip, #16
+
+ cmp yl, #(1 << 8)
+ movhs yl, yl, lsr #8
+ addhs ip, ip, #8
+
+ cmp yl, #(1 << 4)
+ movhs yl, yl, lsr #4
+ addhs ip, ip, #4
+
+ cmp yl, #(1 << 2)
+ addhi ip, ip, #3
+ addls ip, ip, yl, lsr #1
+
+#endif
+
+ mov yh, xh, lsr ip
+ mov yl, xl, lsr ip
+ rsb ip, ip, #32
+ ARM( orr yl, yl, xh, lsl ip )
+ THUMB( lsl xh, xh, ip )
+ THUMB( orr yl, yl, xh )
+ mov xh, xl, lsl ip
+ mov xh, xh, lsr ip
+ ret lr
+
+ @ eq -> division by 1: obvious enough...
+9: moveq yl, xl
+ moveq yh, xh
+ moveq xh, #0
+ reteq lr
+UNWIND(.fnend)
+
+UNWIND(.fnstart)
+UNWIND(.pad #4)
+UNWIND(.save {lr})
+Ldiv0_64:
+ @ Division by 0:
+ str lr, [sp, #-8]!
+ bl __div0
+
+ @ as wrong as it could be...
+ mov yl, #0
+ mov yh, #0
+ mov xh, #0
+ ldr pc, [sp], #8
+
+UNWIND(.fnend)
+ENDPROC(__do_div64)