diff options
author | Weiming Zhao <weimingz@codeaurora.org> | 2016-12-05 21:40:36 +0000 |
---|---|---|
committer | Weiming Zhao <weimingz@codeaurora.org> | 2016-12-05 21:40:36 +0000 |
commit | 92aa663d37027ec3ae85dc3750dad19beb5b5380 (patch) | |
tree | c9d75b9a962ab176b86d8ba2f5326bec14c2bd57 /lib/builtins/arm | |
parent | 2ad661c3f71c1810d9ea5aa1edae51fcee44bfcc (diff) |
builtins: Add ARM Thumb1 implementation for uidiv and uidivmod
Summary:
The current uidiv implementation supports architectures without CLZ; however, its assembly is written for Thumb2/ARM mode, not Thumb1.
For uidivmod, the existing code calls the C version of uidivmodsi4, which in turn calls uidiv; the extra push/pop/bl sequence makes it less efficient.
Reviewers: jmolloy, jroelofs, joerg, compnerd, rengolin
Subscribers: llvm-commits, aemerson
Differential Revision: https://reviews.llvm.org/D27309
git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@288710 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/builtins/arm')
-rw-r--r-- | lib/builtins/arm/aeabi_uidivmod.S | 15 | ||||
-rw-r--r-- | lib/builtins/arm/udivsi3.S | 129 |
2 files changed, 122 insertions, 22 deletions
diff --git a/lib/builtins/arm/aeabi_uidivmod.S b/lib/builtins/arm/aeabi_uidivmod.S index 4a8944908..7098bc6ff 100644 --- a/lib/builtins/arm/aeabi_uidivmod.S +++ b/lib/builtins/arm/aeabi_uidivmod.S @@ -23,6 +23,20 @@ .syntax unified .p2align 2 DEFINE_COMPILERRT_FUNCTION(__aeabi_uidivmod) +#if __ARM_ARCH_ISA_THUMB == 1 + cmp r0, r1 + bcc LOCAL_LABEL(case_denom_larger) + push {r0, r1, lr} + bl SYMBOL_NAME(__aeabi_uidiv) + pop {r1, r2, r3} + muls r2, r2, r0 // r2 = quot * denom + subs r1, r1, r2 + JMP (r3) +LOCAL_LABEL(case_denom_larger): + movs r1, r0 + movs r0, #0 + JMP (lr) +#else push { lr } sub sp, sp, #4 mov r2, sp @@ -35,6 +49,7 @@ DEFINE_COMPILERRT_FUNCTION(__aeabi_uidivmod) ldr r1, [sp] add sp, sp, #4 pop { pc } +#endif END_COMPILERRT_FUNCTION(__aeabi_uidivmod) NO_EXEC_STACK_DIRECTIVE diff --git a/lib/builtins/arm/udivsi3.S b/lib/builtins/arm/udivsi3.S index 085f8fb9e..6739dc2ed 100644 --- a/lib/builtins/arm/udivsi3.S +++ b/lib/builtins/arm/udivsi3.S @@ -40,12 +40,26 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) #else cmp r1, #1 bcc LOCAL_LABEL(divby0) +#if __ARM_ARCH_ISA_THUMB == 1 + bne LOCAL_LABEL(num_neq_denom) + JMP(lr) +LOCAL_LABEL(num_neq_denom): +#else IT(eq) JMPc(lr, eq) +#endif cmp r0, r1 +#if __ARM_ARCH_ISA_THUMB == 1 + bhs LOCAL_LABEL(num_ge_denom) + movs r0, #0 + JMP(lr) +LOCAL_LABEL(num_ge_denom): +#else ITT(cc) movcc r0, #0 JMPc(lr, cc) +#endif + /* * Implement division using binary long division algorithm. * @@ -62,7 +76,7 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) * that (r0 << shift) < 2 * r1. The quotient is stored in r3. */ -# ifdef __ARM_FEATURE_CLZ +# if defined(__ARM_FEATURE_CLZ) clz ip, r0 clz r3, r1 /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. 
*/ @@ -77,49 +91,128 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) sub ip, ip, r3, lsl #3 mov r3, #0 bx ip -# else +# else /* No CLZ Feature */ # if __ARM_ARCH_ISA_THUMB == 2 # error THUMB mode requires CLZ or UDIV # endif +# if __ARM_ARCH_ISA_THUMB == 1 +# define BLOCK_SIZE 10 +# else +# define BLOCK_SIZE 12 +# endif + mov r2, r0 +# if __ARM_ARCH_ISA_THUMB == 1 + mov ip, r0 + adr r0, LOCAL_LABEL(div0block) + adds r0, #1 +# else adr ip, LOCAL_LABEL(div0block) - - lsr r3, r2, #16 +# endif + lsrs r3, r2, #16 cmp r3, r1 +# if __ARM_ARCH_ISA_THUMB == 1 + blo LOCAL_LABEL(skip_16) + movs r2, r3 + subs r0, r0, #(16 * BLOCK_SIZE) +LOCAL_LABEL(skip_16): +# else movhs r2, r3 - subhs ip, ip, #(16 * 12) + subhs ip, ip, #(16 * BLOCK_SIZE) +# endif - lsr r3, r2, #8 + lsrs r3, r2, #8 cmp r3, r1 +# if __ARM_ARCH_ISA_THUMB == 1 + blo LOCAL_LABEL(skip_8) + movs r2, r3 + subs r0, r0, #(8 * BLOCK_SIZE) +LOCAL_LABEL(skip_8): +# else movhs r2, r3 - subhs ip, ip, #(8 * 12) + subhs ip, ip, #(8 * BLOCK_SIZE) +# endif - lsr r3, r2, #4 + lsrs r3, r2, #4 cmp r3, r1 +# if __ARM_ARCH_ISA_THUMB == 1 + blo LOCAL_LABEL(skip_4) + movs r2, r3 + subs r0, r0, #(4 * BLOCK_SIZE) +LOCAL_LABEL(skip_4): +# else movhs r2, r3 - subhs ip, #(4 * 12) + subhs ip, #(4 * BLOCK_SIZE) +# endif - lsr r3, r2, #2 + lsrs r3, r2, #2 cmp r3, r1 +# if __ARM_ARCH_ISA_THUMB == 1 + blo LOCAL_LABEL(skip_2) + movs r2, r3 + subs r0, r0, #(2 * BLOCK_SIZE) +LOCAL_LABEL(skip_2): +# else movhs r2, r3 - subhs ip, ip, #(2 * 12) + subhs ip, ip, #(2 * BLOCK_SIZE) +# endif /* Last block, no need to update r2 or r3. 
*/ +# if __ARM_ARCH_ISA_THUMB == 1 + lsrs r3, r2, #1 + cmp r3, r1 + blo LOCAL_LABEL(skip_1) + subs r0, r0, #(1 * BLOCK_SIZE) +LOCAL_LABEL(skip_1): + movs r2, r0 + mov r0, ip + movs r3, #0 + JMP (r2) + +# else cmp r1, r2, lsr #1 - subls ip, ip, #(1 * 12) + subls ip, ip, #(1 * BLOCK_SIZE) - mov r3, #0 + movs r3, #0 JMP(ip) -# endif +# endif +# endif /* __ARM_FEATURE_CLZ */ + #define IMM # + /* due to the range limit of branch in Thumb1, we have to place the + block closer */ +LOCAL_LABEL(divby0): + movs r0, #0 +# if defined(__ARM_EABI__) + bl __aeabi_idiv0 // due to relocation limit, can't use b. +# endif + JMP(lr) + + +#if __ARM_ARCH_ISA_THUMB == 1 +#define block(shift) \ + lsls r2, r1, IMM shift; \ + cmp r0, r2; \ + blo LOCAL_LABEL(block_skip_##shift); \ + subs r0, r0, r2; \ + LOCAL_LABEL(block_skip_##shift) :; \ + adcs r3, r3 /* same as ((r3 << 1) | Carry). Carry is set if r0 >= r2. */ + + /* TODO: if current location counter is not not word aligned, we don't + need the .p2align and nop */ + /* Label div0block must be word-aligned. First align block 31 */ + .p2align 2 + nop /* Padding to align div0block as 31 blocks = 310 bytes */ +#else #define block(shift) \ cmp r0, r1, lsl IMM shift; \ ITT(hs); \ WIDE(addhs) r3, r3, IMM (1 << shift); \ WIDE(subhs) r0, r0, r1, lsl IMM shift +#endif block(31) block(30) @@ -159,14 +252,6 @@ LOCAL_LABEL(div0block): JMP(lr) #endif /* __ARM_ARCH_EXT_IDIV__ */ -LOCAL_LABEL(divby0): - mov r0, #0 -#ifdef __ARM_EABI__ - b __aeabi_idiv0 -#else - JMP(lr) -#endif - END_COMPILERRT_FUNCTION(__udivsi3) NO_EXEC_STACK_DIRECTIVE |