summaryrefslogtreecommitdiff
path: root/lib/builtins/arm
diff options
context:
space:
mode:
author Weiming Zhao <weimingz@codeaurora.org> 2016-12-05 21:40:36 +0000
committer Weiming Zhao <weimingz@codeaurora.org> 2016-12-05 21:40:36 +0000
commit 92aa663d37027ec3ae85dc3750dad19beb5b5380 (patch)
tree c9d75b9a962ab176b86d8ba2f5326bec14c2bd57 /lib/builtins/arm
parent 2ad661c3f71c1810d9ea5aa1edae51fcee44bfcc (diff)
builtins: Add ARM Thumb1 implementation for uidiv and uidivmod
Summary:
The current uidiv supports archs without clz. However, the asm is for thumb2/arm.
For uidivmod, the existing code calls the C version of uidivmodsi4, which then calls uidiv. The extra push/pop/bl makes it less efficient.

Reviewers: jmolloy, jroelofs, joerg, compnerd, rengolin

Subscribers: llvm-commits, aemerson

Differential Revision: https://reviews.llvm.org/D27309

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@288710 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/builtins/arm')
-rw-r--r-- lib/builtins/arm/aeabi_uidivmod.S | 15
-rw-r--r-- lib/builtins/arm/udivsi3.S | 129
2 files changed, 122 insertions, 22 deletions
diff --git a/lib/builtins/arm/aeabi_uidivmod.S b/lib/builtins/arm/aeabi_uidivmod.S
index 4a8944908..7098bc6ff 100644
--- a/lib/builtins/arm/aeabi_uidivmod.S
+++ b/lib/builtins/arm/aeabi_uidivmod.S
@@ -23,6 +23,20 @@
.syntax unified
.p2align 2
DEFINE_COMPILERRT_FUNCTION(__aeabi_uidivmod)
+#if __ARM_ARCH_ISA_THUMB == 1
+ cmp r0, r1
+ bcc LOCAL_LABEL(case_denom_larger)
+ push {r0, r1, lr}
+ bl SYMBOL_NAME(__aeabi_uidiv)
+ pop {r1, r2, r3}
+ muls r2, r2, r0 // r2 = quot * denom
+ subs r1, r1, r2
+ JMP (r3)
+LOCAL_LABEL(case_denom_larger):
+ movs r1, r0
+ movs r0, #0
+ JMP (lr)
+#else
push { lr }
sub sp, sp, #4
mov r2, sp
@@ -35,6 +49,7 @@ DEFINE_COMPILERRT_FUNCTION(__aeabi_uidivmod)
ldr r1, [sp]
add sp, sp, #4
pop { pc }
+#endif
END_COMPILERRT_FUNCTION(__aeabi_uidivmod)
NO_EXEC_STACK_DIRECTIVE
diff --git a/lib/builtins/arm/udivsi3.S b/lib/builtins/arm/udivsi3.S
index 085f8fb9e..6739dc2ed 100644
--- a/lib/builtins/arm/udivsi3.S
+++ b/lib/builtins/arm/udivsi3.S
@@ -40,12 +40,26 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
#else
cmp r1, #1
bcc LOCAL_LABEL(divby0)
+#if __ARM_ARCH_ISA_THUMB == 1
+ bne LOCAL_LABEL(num_neq_denom)
+ JMP(lr)
+LOCAL_LABEL(num_neq_denom):
+#else
IT(eq)
JMPc(lr, eq)
+#endif
cmp r0, r1
+#if __ARM_ARCH_ISA_THUMB == 1
+ bhs LOCAL_LABEL(num_ge_denom)
+ movs r0, #0
+ JMP(lr)
+LOCAL_LABEL(num_ge_denom):
+#else
ITT(cc)
movcc r0, #0
JMPc(lr, cc)
+#endif
+
/*
* Implement division using binary long division algorithm.
*
@@ -62,7 +76,7 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
* that (r0 << shift) < 2 * r1. The quotient is stored in r3.
*/
-# ifdef __ARM_FEATURE_CLZ
+# if defined(__ARM_FEATURE_CLZ)
clz ip, r0
clz r3, r1
/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
@@ -77,49 +91,128 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
sub ip, ip, r3, lsl #3
mov r3, #0
bx ip
-# else
+# else /* No CLZ Feature */
# if __ARM_ARCH_ISA_THUMB == 2
# error THUMB mode requires CLZ or UDIV
# endif
+# if __ARM_ARCH_ISA_THUMB == 1
+# define BLOCK_SIZE 10
+# else
+# define BLOCK_SIZE 12
+# endif
+
mov r2, r0
+# if __ARM_ARCH_ISA_THUMB == 1
+ mov ip, r0
+ adr r0, LOCAL_LABEL(div0block)
+ adds r0, #1
+# else
adr ip, LOCAL_LABEL(div0block)
-
- lsr r3, r2, #16
+# endif
+ lsrs r3, r2, #16
cmp r3, r1
+# if __ARM_ARCH_ISA_THUMB == 1
+ blo LOCAL_LABEL(skip_16)
+ movs r2, r3
+ subs r0, r0, #(16 * BLOCK_SIZE)
+LOCAL_LABEL(skip_16):
+# else
movhs r2, r3
- subhs ip, ip, #(16 * 12)
+ subhs ip, ip, #(16 * BLOCK_SIZE)
+# endif
- lsr r3, r2, #8
+ lsrs r3, r2, #8
cmp r3, r1
+# if __ARM_ARCH_ISA_THUMB == 1
+ blo LOCAL_LABEL(skip_8)
+ movs r2, r3
+ subs r0, r0, #(8 * BLOCK_SIZE)
+LOCAL_LABEL(skip_8):
+# else
movhs r2, r3
- subhs ip, ip, #(8 * 12)
+ subhs ip, ip, #(8 * BLOCK_SIZE)
+# endif
- lsr r3, r2, #4
+ lsrs r3, r2, #4
cmp r3, r1
+# if __ARM_ARCH_ISA_THUMB == 1
+ blo LOCAL_LABEL(skip_4)
+ movs r2, r3
+ subs r0, r0, #(4 * BLOCK_SIZE)
+LOCAL_LABEL(skip_4):
+# else
movhs r2, r3
- subhs ip, #(4 * 12)
+ subhs ip, #(4 * BLOCK_SIZE)
+# endif
- lsr r3, r2, #2
+ lsrs r3, r2, #2
cmp r3, r1
+# if __ARM_ARCH_ISA_THUMB == 1
+ blo LOCAL_LABEL(skip_2)
+ movs r2, r3
+ subs r0, r0, #(2 * BLOCK_SIZE)
+LOCAL_LABEL(skip_2):
+# else
movhs r2, r3
- subhs ip, ip, #(2 * 12)
+ subhs ip, ip, #(2 * BLOCK_SIZE)
+# endif
/* Last block, no need to update r2 or r3. */
+# if __ARM_ARCH_ISA_THUMB == 1
+ lsrs r3, r2, #1
+ cmp r3, r1
+ blo LOCAL_LABEL(skip_1)
+ subs r0, r0, #(1 * BLOCK_SIZE)
+LOCAL_LABEL(skip_1):
+ movs r2, r0
+ mov r0, ip
+ movs r3, #0
+ JMP (r2)
+
+# else
cmp r1, r2, lsr #1
- subls ip, ip, #(1 * 12)
+ subls ip, ip, #(1 * BLOCK_SIZE)
- mov r3, #0
+ movs r3, #0
JMP(ip)
-# endif
+# endif
+# endif /* __ARM_FEATURE_CLZ */
+
#define IMM #
+ /* Due to the limited range of branches in Thumb1, the divby0 block has to
+ be placed closer to the code that branches to it. */
+LOCAL_LABEL(divby0):
+ movs r0, #0
+# if defined(__ARM_EABI__)
+ bl __aeabi_idiv0 // due to relocation limit, can't use b.
+# endif
+ JMP(lr)
+
+
+#if __ARM_ARCH_ISA_THUMB == 1
+#define block(shift) \
+ lsls r2, r1, IMM shift; \
+ cmp r0, r2; \
+ blo LOCAL_LABEL(block_skip_##shift); \
+ subs r0, r0, r2; \
+ LOCAL_LABEL(block_skip_##shift) :; \
+ adcs r3, r3 /* same as ((r3 << 1) | Carry). Carry is set if r0 >= r2. */
+
+ /* TODO: if current location counter is not word aligned, we don't
+ need the .p2align and nop */
+ /* Label div0block must be word-aligned. First align block 31 */
+ .p2align 2
+ nop /* Padding to align div0block as 31 blocks = 310 bytes */
+#else
#define block(shift) \
cmp r0, r1, lsl IMM shift; \
ITT(hs); \
WIDE(addhs) r3, r3, IMM (1 << shift); \
WIDE(subhs) r0, r0, r1, lsl IMM shift
+#endif
block(31)
block(30)
@@ -159,14 +252,6 @@ LOCAL_LABEL(div0block):
JMP(lr)
#endif /* __ARM_ARCH_EXT_IDIV__ */
-LOCAL_LABEL(divby0):
- mov r0, #0
-#ifdef __ARM_EABI__
- b __aeabi_idiv0
-#else
- JMP(lr)
-#endif
-
END_COMPILERRT_FUNCTION(__udivsi3)
NO_EXEC_STACK_DIRECTIVE