author     Sid Manning <sidneym@codeaurora.org>  2018-05-09 14:44:54 +0000
committer  Sid Manning <sidneym@codeaurora.org>  2018-05-09 14:44:54 +0000
commit     45688582b548029601dc801e9f92b0e36dd86823 (patch)
tree       1f6af105d0a0365cbd57847506f597fd8b198511
parent     65b4b157b2946ffe854538ce931a133faa89cdd9 (diff)
Add basic compiler-rt builtins support for hexagon.
Differential Revision: https://reviews.llvm.org/D46364

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@331881 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  cmake/builtin-config-ix.cmake                      |    3
-rw-r--r--  cmake/config-ix.cmake                              |    1
-rw-r--r--  lib/builtins/CMakeLists.txt                        |   35
-rw-r--r--  lib/builtins/hexagon/common_entry_exit_abi1.S      |  103
-rw-r--r--  lib/builtins/hexagon/common_entry_exit_abi2.S      |  268
-rw-r--r--  lib/builtins/hexagon/common_entry_exit_legacy.S    |  157
-rw-r--r--  lib/builtins/hexagon/dfaddsub.S                    |  398
-rw-r--r--  lib/builtins/hexagon/dfdiv.S                       |  492
-rw-r--r--  lib/builtins/hexagon/dffma.S                       |  705
-rw-r--r--  lib/builtins/hexagon/dfminmax.S                    |   79
-rw-r--r--  lib/builtins/hexagon/dfmul.S                       |  418
-rw-r--r--  lib/builtins/hexagon/dfsqrt.S                      |  406
-rw-r--r--  lib/builtins/hexagon/divdi3.S                      |   85
-rw-r--r--  lib/builtins/hexagon/divsi3.S                      |   84
-rw-r--r--  lib/builtins/hexagon/fabs_opt.S                    |   37
-rw-r--r--  lib/builtins/hexagon/fastmath2_dlib_asm.S          |  491
-rw-r--r--  lib/builtins/hexagon/fastmath2_ldlib_asm.S         |  345
-rw-r--r--  lib/builtins/hexagon/fastmath_dlib_asm.S           |  400
-rw-r--r--  lib/builtins/hexagon/fma_opt.S                     |   31
-rw-r--r--  lib/builtins/hexagon/fmax_opt.S                    |   30
-rw-r--r--  lib/builtins/hexagon/fmin_opt.S                    |   30
-rw-r--r--  lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S     |  125
-rw-r--r--  lib/builtins/hexagon/memcpy_likely_aligned.S       |   64
-rw-r--r--  lib/builtins/hexagon/moddi3.S                      |   83
-rw-r--r--  lib/builtins/hexagon/modsi3.S                      |   66
-rw-r--r--  lib/builtins/hexagon/sfdiv_opt.S                   |   66
-rw-r--r--  lib/builtins/hexagon/sfsqrt_opt.S                  |   82
-rw-r--r--  lib/builtins/hexagon/udivdi3.S                     |   71
-rw-r--r--  lib/builtins/hexagon/udivmoddi4.S                  |   71
-rw-r--r--  lib/builtins/hexagon/udivmodsi4.S                  |   60
-rw-r--r--  lib/builtins/hexagon/udivsi3.S                     |   56
-rw-r--r--  lib/builtins/hexagon/umoddi3.S                     |   74
-rw-r--r--  lib/builtins/hexagon/umodsi3.S                     |   55
33 files changed, 5470 insertions(+), 1 deletion(-)
diff --git a/cmake/builtin-config-ix.cmake b/cmake/builtin-config-ix.cmake
index eda5f4641..a5704e5fe 100644
--- a/cmake/builtin-config-ix.cmake
+++ b/cmake/builtin-config-ix.cmake
@@ -25,6 +25,7 @@ int foo(int x, int y) {
set(ARM64 aarch64)
set(ARM32 arm armhf armv6m armv7m armv7em armv7 armv7s armv7k)
+set(HEXAGON hexagon)
set(X86 i386)
set(X86_64 x86_64)
set(MIPS32 mips mipsel)
@@ -42,7 +43,7 @@ if(APPLE)
endif()
set(ALL_BUILTIN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
- ${MIPS32} ${MIPS64} ${PPC64} ${RISCV32} ${RISCV64} ${WASM32} ${WASM64})
+ ${HEXAGON} ${MIPS32} ${MIPS64} ${PPC64} ${RISCV32} ${RISCV64} ${WASM32} ${WASM64})
include(CompilerRTUtils)
include(CompilerRTDarwinUtils)
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 82c3d2859..ee387dc22 100644
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -174,6 +174,7 @@ endmacro()
set(ARM64 aarch64)
set(ARM32 arm armhf)
+set(HEXAGON hexagon)
set(X86 i386)
set(X86_64 x86_64)
set(MIPS32 mips mipsel)
diff --git a/lib/builtins/CMakeLists.txt b/lib/builtins/CMakeLists.txt
index 6c48a404a..5e7f22467 100644
--- a/lib/builtins/CMakeLists.txt
+++ b/lib/builtins/CMakeLists.txt
@@ -459,6 +459,41 @@ set(armv6m_SOURCES ${thumb1_SOURCES})
set(armv7m_SOURCES ${arm_SOURCES})
set(armv7em_SOURCES ${arm_SOURCES})
+# hexagon arch
+set(hexagon_SOURCES ${GENERIC_SOURCES} ${GENERIC_TF_SOURCES})
+list(APPEND hexagon_SOURCES
+ hexagon/common_entry_exit_abi1.S
+ hexagon/common_entry_exit_abi2.S
+ hexagon/common_entry_exit_legacy.S
+ hexagon/dfaddsub.S
+ hexagon/dfdiv.S
+ hexagon/dffma.S
+ hexagon/dfminmax.S
+ hexagon/dfmul.S
+ hexagon/dfsqrt.S
+ hexagon/divdi3.S
+ hexagon/divsi3.S
+ hexagon/fabs_opt.S
+ hexagon/fastmath2_dlib_asm.S
+ hexagon/fastmath2_ldlib_asm.S
+ hexagon/fastmath_dlib_asm.S
+ hexagon/fma_opt.S
+ hexagon/fmax_opt.S
+ hexagon/fmin_opt.S
+ hexagon/memcpy_forward_vp4cp4n2.S
+ hexagon/memcpy_likely_aligned.S
+ hexagon/moddi3.S
+ hexagon/modsi3.S
+ hexagon/sfdiv_opt.S
+ hexagon/sfsqrt_opt.S
+ hexagon/udivdi3.S
+ hexagon/udivmoddi4.S
+ hexagon/udivmodsi4.S
+ hexagon/udivsi3.S
+ hexagon/umoddi3.S
+ hexagon/umodsi3.S)
+
+
set(mips_SOURCES ${GENERIC_SOURCES})
set(mipsel_SOURCES ${mips_SOURCES})
set(mips64_SOURCES ${GENERIC_TF_SOURCES}
diff --git a/lib/builtins/hexagon/common_entry_exit_abi1.S b/lib/builtins/hexagon/common_entry_exit_abi1.S
new file mode 100644
index 000000000..d5479d2a5
--- /dev/null
+++ b/lib/builtins/hexagon/common_entry_exit_abi1.S
@@ -0,0 +1,103 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Functions that implement common sequences in function prologues and epilogues
+ used to save code size */
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .globl \name
+ .type \name, @function
+ .falign
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+ .macro FALLTHROUGH_TAIL_CALL name0 name1
+ .size \name0, . - \name0
+ .globl \name1
+ .type \name1, @function
+ .falign
+\name1:
+ .endm
+
+
+
+
+/* Save r25:24 at fp+#-8 and r27:26 at fp+#-16. */
+
+
+
+
+/* The compiler knows that the __save_* functions clobber LR. No other
+ registers should be used without informing the compiler. */
+
+/* Since we can only issue one store per packet, we don't hurt performance by
+ simply jumping to the right point in this sequence of stores. */
+
+FUNCTION_BEGIN __save_r24_through_r27
+ memd(fp+#-16) = r27:26
+FALLTHROUGH_TAIL_CALL __save_r24_through_r27 __save_r24_through_r25
+ {
+ memd(fp+#-8) = r25:24
+ jumpr lr
+ }
+FUNCTION_END __save_r24_through_r25
+
+
+
+
+/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel
+ with deallocframe. That way, the return gets the old value of lr, which is
+ where these functions need to return, and at the same time, lr gets the value
+ it needs going into the tail call. */
+
+FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe_before_tailcall
+ r27:26 = memd(fp+#-16)
+FALLTHROUGH_TAIL_CALL __restore_r24_through_r27_and_deallocframe_before_tailcall __restore_r24_through_r25_and_deallocframe_before_tailcall
+ {
+ r25:24 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r24_through_r25_and_deallocframe_before_tailcall
+
+
+
+
+/* Here we use the extra load bandwidth to restore LR early, allowing the return
+ to occur in parallel with the deallocframe. */
+
+FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe
+ {
+ lr = memw(fp+#4)
+ r27:26 = memd(fp+#-16)
+ }
+ {
+ r25:24 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r24_through_r27_and_deallocframe
+
+
+
+
+/* Here the load bandwidth is maximized. */
+
+FUNCTION_BEGIN __restore_r24_through_r25_and_deallocframe
+ {
+ r25:24 = memd(fp+#-8)
+ deallocframe
+ }
+ jumpr lr
+FUNCTION_END __restore_r24_through_r25_and_deallocframe
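These helpers pay off whenever a function keeps several values live across calls: the compiler assigns them to callee-saved registers r24-r27 and, at -Os, can emit a single "call __save_r24_through_r27" in the prologue and tail-jump to "__restore_r24_through_r27_and_deallocframe" in the epilogue instead of inlining the stores and loads. A minimal C sketch of such a function (illustrative only; the actual register assignment is up to the compiler):

    /* Illustrative only: with four accumulators live across each next()
       call, a size-optimizing compiler can share the save/restore
       sequences above instead of emitting four inline stores and loads. */
    int sum4(int (*next)(void), int n)
    {
        int a = 0, b = 0, c = 0, d = 0;   /* live across every call */
        while (n-- > 0) {
            int v = next();
            a += v; b += v ^ a; c += v & b; d += v | c;
        }
        return a + b + c + d;
    }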
diff --git a/lib/builtins/hexagon/common_entry_exit_abi2.S b/lib/builtins/hexagon/common_entry_exit_abi2.S
new file mode 100644
index 000000000..6f470343d
--- /dev/null
+++ b/lib/builtins/hexagon/common_entry_exit_abi2.S
@@ -0,0 +1,268 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Functions that implement common sequences in function prologues and epilogues
+ used to save code size */
+
+ .macro FUNCTION_BEGIN name
+ .p2align 2
+ .section .text.\name,"ax",@progbits
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+ .macro FALLTHROUGH_TAIL_CALL name0 name1
+ .p2align 2
+ .size \name0, . - \name0
+ .globl \name1
+ .type \name1, @function
+\name1:
+ .endm
+
+
+
+
+/* Save r17:16 at fp+#-8, r19:18 at fp+#-16, r21:20 at fp+#-24, r23:22 at
+ fp+#-32, r25:24 at fp+#-40, and r27:26 at fp+#-48.
+ The compiler knows that the __save_* functions clobber LR. No other
+ registers should be used without informing the compiler. */
+
+FUNCTION_BEGIN __save_r16_through_r27
+ {
+ memd(fp+#-48) = r27:26
+ memd(fp+#-40) = r25:24
+ }
+ {
+ memd(fp+#-32) = r23:22
+ memd(fp+#-24) = r21:20
+ }
+ {
+ memd(fp+#-16) = r19:18
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r27
+
+FUNCTION_BEGIN __save_r16_through_r25
+ {
+ memd(fp+#-40) = r25:24
+ memd(fp+#-32) = r23:22
+ }
+ {
+ memd(fp+#-24) = r21:20
+ memd(fp+#-16) = r19:18
+ }
+ {
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r25
+
+FUNCTION_BEGIN __save_r16_through_r23
+ {
+ memd(fp+#-32) = r23:22
+ memd(fp+#-24) = r21:20
+ }
+ {
+ memd(fp+#-16) = r19:18
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r23
+
+FUNCTION_BEGIN __save_r16_through_r21
+ {
+ memd(fp+#-24) = r21:20
+ memd(fp+#-16) = r19:18
+ }
+ {
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r21
+
+FUNCTION_BEGIN __save_r16_through_r19
+ {
+ memd(fp+#-16) = r19:18
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r19
+
+FUNCTION_BEGIN __save_r16_through_r17
+ {
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r17
+
+/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel
+ with deallocframe. That way, the return gets the old value of lr, which is
+ where these functions need to return, and at the same time, lr gets the value
+ it needs going into the tail call. */
+
+
+FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe_before_tailcall
+ r27:26 = memd(fp+#-48)
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r27_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe_before_tailcall
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r25_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe_before_tailcall
+ {
+ r23:22 = memd(fp+#-32)
+ r21:20 = memd(fp+#-24)
+ }
+ r19:18 = memd(fp+#-16)
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r23_and_deallocframe_before_tailcall
+
+
+FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe_before_tailcall
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r21_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe_before_tailcall
+ r19:18 = memd(fp+#-16)
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r19_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe_before_tailcall
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r17_and_deallocframe_before_tailcall
+
+
+FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe
+ r27:26 = memd(fp+#-48)
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r27_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r25_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe
+ {
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r23_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r21_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe
+ {
+ r19:18 = memd(fp+#-16)
+ r17:16 = memd(fp+#-8)
+ }
+ {
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r19_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r17_and_deallocframe
+
+FUNCTION_BEGIN __deallocframe
+ dealloc_return
+FUNCTION_END __deallocframe
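The memd() offsets above define a fixed save-area layout just below the frame pointer; allocframe itself keeps the caller's FP at fp+#0 and the return address at fp+#4. A hedged C model of that layout (the struct name and fields are illustrative, not a real ABI type):

    #include <stdint.h>

    /* Hedged model of the abi2 save area; the struct ends at the frame
       pointer, and fp+0/fp+4 hold the saved FP/LR stored by allocframe. */
    struct abi2_save_area {
        uint64_t r27_26;   /* fp - 48 */
        uint64_t r25_24;   /* fp - 40 */
        uint64_t r23_22;   /* fp - 32 */
        uint64_t r21_20;   /* fp - 24 */
        uint64_t r19_18;   /* fp - 16 */
        uint64_t r17_16;   /* fp -  8 */
    };
    /* A prologue that calls __save_r16_through_r21 populates only the
       last three fields; __restore_r16_through_r21_* reads them back. */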
diff --git a/lib/builtins/hexagon/common_entry_exit_legacy.S b/lib/builtins/hexagon/common_entry_exit_legacy.S
new file mode 100644
index 000000000..3258f15a3
--- /dev/null
+++ b/lib/builtins/hexagon/common_entry_exit_legacy.S
@@ -0,0 +1,157 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/* Functions that implement common sequences in function prologues and epilogues
+ used to save code size */
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .globl \name
+ .type \name, @function
+ .falign
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+ .macro FALLTHROUGH_TAIL_CALL name0 name1
+ .size \name0, . - \name0
+ .globl \name1
+ .type \name1, @function
+ .falign
+\name1:
+ .endm
+
+
+
+
+/* Save r27:26 at fp+#-8, r25:24 at fp+#-16, r23:22 at fp+#-24, r21:20 at
+ fp+#-32, r19:18 at fp+#-40, and r17:16 at fp+#-48. */
+
+
+
+
+/* The compiler knows that the __save_* functions clobber LR. No other
+ registers should be used without informing the compiler. */
+
+/* Since we can only issue one store per packet, we don't hurt performance by
+ simply jumping to the right point in this sequence of stores. */
+
+FUNCTION_BEGIN __save_r27_through_r16
+ memd(fp+#-48) = r17:16
+FALLTHROUGH_TAIL_CALL __save_r27_through_r16 __save_r27_through_r18
+ memd(fp+#-40) = r19:18
+FALLTHROUGH_TAIL_CALL __save_r27_through_r18 __save_r27_through_r20
+ memd(fp+#-32) = r21:20
+FALLTHROUGH_TAIL_CALL __save_r27_through_r20 __save_r27_through_r22
+ memd(fp+#-24) = r23:22
+FALLTHROUGH_TAIL_CALL __save_r27_through_r22 __save_r27_through_r24
+ memd(fp+#-16) = r25:24
+ {
+ memd(fp+#-8) = r27:26
+ jumpr lr
+ }
+FUNCTION_END __save_r27_through_r24
+
+
+
+
+/* For each of the *_before_sibcall functions, jumpr lr is executed in parallel
+ with deallocframe. That way, the return gets the old value of lr, which is
+ where these functions need to return, and at the same time, lr gets the value
+ it needs going into the sibcall. */
+
+FUNCTION_BEGIN __restore_r27_through_r20_and_deallocframe_before_sibcall
+ {
+ r21:20 = memd(fp+#-32)
+ r23:22 = memd(fp+#-24)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe_before_sibcall __restore_r27_through_r24_and_deallocframe_before_sibcall
+ {
+ r25:24 = memd(fp+#-16)
+ jump __restore_r27_through_r26_and_deallocframe_before_sibcall
+ }
+FUNCTION_END __restore_r27_through_r24_and_deallocframe_before_sibcall
+
+
+
+
+FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe_before_sibcall
+ r17:16 = memd(fp+#-48)
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe_before_sibcall __restore_r27_through_r18_and_deallocframe_before_sibcall
+ {
+ r19:18 = memd(fp+#-40)
+ r21:20 = memd(fp+#-32)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe_before_sibcall __restore_r27_through_r22_and_deallocframe_before_sibcall
+ {
+ r23:22 = memd(fp+#-24)
+ r25:24 = memd(fp+#-16)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe_before_sibcall __restore_r27_through_r26_and_deallocframe_before_sibcall
+ {
+ r27:26 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r27_through_r26_and_deallocframe_before_sibcall
+
+
+
+
+/* Here we use the extra load bandwidth to restore LR early, allowing the return
+ to occur in parallel with the deallocframe. */
+
+FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe
+ {
+ r17:16 = memd(fp+#-48)
+ r19:18 = memd(fp+#-40)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe __restore_r27_through_r20_and_deallocframe
+ {
+ r21:20 = memd(fp+#-32)
+ r23:22 = memd(fp+#-24)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe __restore_r27_through_r24_and_deallocframe
+ {
+ lr = memw(fp+#4)
+ r25:24 = memd(fp+#-16)
+ }
+ {
+ r27:26 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r27_through_r24_and_deallocframe
+
+
+
+
+/* Here the load bandwidth is maximized for all three functions. */
+
+FUNCTION_BEGIN __restore_r27_through_r18_and_deallocframe
+ {
+ r19:18 = memd(fp+#-40)
+ r21:20 = memd(fp+#-32)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe __restore_r27_through_r22_and_deallocframe
+ {
+ r23:22 = memd(fp+#-24)
+ r25:24 = memd(fp+#-16)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe __restore_r27_through_r26_and_deallocframe
+ {
+ r27:26 = memd(fp+#-8)
+ deallocframe
+ }
+ jumpr lr
+FUNCTION_END __restore_r27_through_r26_and_deallocframe
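The *_before_sibcall variants let an epilogue feed straight into a tail call: deallocframe reloads lr for the eventual callee while jumpr lr returns to the call site in the outgoing function. In C terms, the pattern being optimized is an ordinary tail call (a hedged sketch; whether the compiler actually emits a sibcall depends on optimization settings):

    int helper(int);

    /* Hedged sketch: a tail call like this can compile to
       "call __restore_r27_through_r16_and_deallocframe_before_sibcall"
       followed by "jump helper", so helper() returns directly to
       outer()'s caller. */
    int outer(int x)
    {
        /* ... body that uses callee-saved registers ... */
        return helper(x + 1);   /* tail position: nothing after the call */
    }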
diff --git a/lib/builtins/hexagon/dfaddsub.S b/lib/builtins/hexagon/dfaddsub.S
new file mode 100644
index 000000000..4173f86a4
--- /dev/null
+++ b/lib/builtins/hexagon/dfaddsub.S
@@ -0,0 +1,398 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision Add/Subtract */
+
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+
+#define EXPA r4
+#define EXPB r5
+#define EXPB_A r5:4
+
+#define ZTMP r7:6
+#define ZTMPH r7
+#define ZTMPL r6
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define BTMP r9:8
+#define BTMPH r9
+#define BTMPL r8
+
+#define ATMP2 r11:10
+#define ATMP2H r11
+#define ATMP2L r10
+
+#define EXPDIFF r15
+#define EXTRACTOFF r14
+#define EXTRACTAMT r15:14
+
+#define TMP r28
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1024
+#define MANTISSA_TO_INT_BIAS 52
+#define SR_BIT_INEXACT 5
+
+#ifndef SR_ROUND_OFF
+#define SR_ROUND_OFF 22
+#endif
+
+#define NORMAL p3
+#define BIGB p2
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+ .text
+ .global __hexagon_adddf3
+ .global __hexagon_subdf3
+ .type __hexagon_adddf3, @function
+ .type __hexagon_subdf3, @function
+
+Q6_ALIAS(adddf3)
+FAST_ALIAS(adddf3)
+FAST2_ALIAS(adddf3)
+Q6_ALIAS(subdf3)
+FAST_ALIAS(subdf3)
+FAST2_ALIAS(subdf3)
+
+ .p2align 5
+__hexagon_adddf3:
+ {
+ EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
+ ATMP = combine(##0x20000000,#0)
+ }
+ {
+ NORMAL = dfclass(A,#2)
+ NORMAL = dfclass(B,#2)
+ BTMP = ATMP
+ BIGB = cmp.gtu(EXPB,EXPA) // Is B's exponent greater than A's?
+ }
+ {
+ if (!NORMAL) jump .Ladd_abnormal // If abnormal, go to special code
+ if (BIGB) A = B // if B >> A, swap A and B
+ if (BIGB) B = A // If B >> A, swap A and B
+ if (BIGB) EXPB_A = combine(EXPA,EXPB) // swap exponents
+ }
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-2) // Q1.62
+ BTMP = insert(B,#MANTBITS,#EXPBITS-2) // Q1.62
+ EXPDIFF = sub(EXPA,EXPB)
+ ZTMP = combine(#62,#1)
+ }
+#undef BIGB
+#undef NORMAL
+#define B_POS p3
+#define A_POS p2
+#define NO_STICKIES p1
+.Ladd_continue:
+ {
+ EXPDIFF = min(EXPDIFF,ZTMPH) // If exponent difference >= ~60,
+ // will collapse to sticky bit
+ ATMP2 = neg(ATMP)
+ A_POS = cmp.gt(AH,#-1)
+ EXTRACTOFF = #0
+ }
+ {
+ if (!A_POS) ATMP = ATMP2
+ ATMP2 = extractu(BTMP,EXTRACTAMT)
+ BTMP = ASR(BTMP,EXPDIFF)
+#undef EXTRACTAMT
+#undef EXPDIFF
+#undef EXTRACTOFF
+#define ZERO r15:14
+ ZERO = #0
+ }
+ {
+ NO_STICKIES = cmp.eq(ATMP2,ZERO)
+ if (!NO_STICKIES.new) BTMPL = or(BTMPL,ZTMPL)
+ EXPB = add(EXPA,#-BIAS-60)
+ B_POS = cmp.gt(BH,#-1)
+ }
+ {
+ ATMP = add(ATMP,BTMP) // ADD!!!
+ ATMP2 = sub(ATMP,BTMP) // Negate and ADD --> SUB!!!
+ ZTMP = combine(#54,##2045)
+ }
+ {
+ p0 = cmp.gtu(EXPA,ZTMPH) // must be pretty high in case of large cancellation
+ p0 = !cmp.gtu(EXPA,ZTMPL)
+ if (!p0.new) jump:nt .Ladd_ovf_unf
+ if (!B_POS) ATMP = ATMP2 // if B neg, pick difference
+ }
+ {
+ A = convert_d2df(ATMP) // Convert to Double Precision, taking care of flags, etc. So nice!
+ p0 = cmp.eq(ATMPH,#0)
+ p0 = cmp.eq(ATMPL,#0)
+ if (p0.new) jump:nt .Ladd_zero // or maybe conversion handles zero case correctly?
+ }
+ {
+ AH += asl(EXPB,#HI_MANTBITS)
+ jumpr r31
+ }
+ .falign
+__hexagon_subdf3:
+ {
+ BH = togglebit(BH,#31)
+ jump __qdsp_adddf3
+ }
+
+
+ .falign
+.Ladd_zero:
+ // True zero, full cancellation
+ // +0 unless round towards negative infinity
+ {
+ TMP = USR
+ A = #0
+ BH = #1
+ }
+ {
+ TMP = extractu(TMP,#2,#22)
+ BH = asl(BH,#31)
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = xor(AH,BH)
+ jumpr r31
+ }
+ .falign
+.Ladd_ovf_unf:
+ // Overflow or Denormal is possible
+ // Good news: Underflow flag is not possible!
+ /*
+ * ATMP has 2's complement value
+ *
+ * EXPA has A's exponent, EXPB has EXPA-BIAS-60
+ *
+ * Convert, extract exponent, add adjustment.
+ * If > 2046, overflow
+ * If <= 0, denormal
+ *
+ * Note that we've not done our zero check yet, so do that too
+ *
+ */
+ {
+ A = convert_d2df(ATMP)
+ p0 = cmp.eq(ATMPH,#0)
+ p0 = cmp.eq(ATMPL,#0)
+ if (p0.new) jump:nt .Ladd_zero
+ }
+ {
+ TMP = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ AH += asl(EXPB,#HI_MANTBITS)
+ }
+ {
+ EXPB = add(EXPB,TMP)
+ B = combine(##0x00100000,#0)
+ }
+ {
+ p0 = cmp.gt(EXPB,##BIAS+BIAS-2)
+ if (p0.new) jump:nt .Ladd_ovf
+ }
+ {
+ p0 = cmp.gt(EXPB,#0)
+ if (p0.new) jumpr:t r31
+ TMP = sub(#1,EXPB)
+ }
+ {
+ B = insert(A,#MANTBITS,#0)
+ A = ATMP
+ }
+ {
+ B = lsr(B,TMP)
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+ .falign
+.Ladd_ovf:
+ // We get either max finite value or infinity. Either way, overflow+inexact
+ {
+ A = ATMP // 2's complement value
+ TMP = USR
+ ATMP = combine(##0x7fefffff,#-1) // positive max finite
+ }
+ {
+ EXPB = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
+ TMP = or(TMP,#0x28) // inexact + overflow
+ BTMP = combine(##0x7ff00000,#0) // positive infinity
+ }
+ {
+ USR = TMP
+ EXPB ^= lsr(AH,#31) // Does sign match rounding?
+ TMP = EXPB // unmodified rounding mode
+ }
+ {
+ p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
+ p0 = !cmp.eq(EXPB,#2) // Not rounding the other way,
+ if (p0.new) ATMP = BTMP // we should get infinity
+ }
+ {
+ A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ jumpr r31
+ }
+
+.Ladd_abnormal:
+ {
+ ATMP = extractu(A,#63,#0) // strip off sign
+ BTMP = extractu(B,#63,#0) // strip off sign
+ }
+ {
+ p3 = cmp.gtu(ATMP,BTMP)
+ if (!p3.new) A = B // sort values
+ if (!p3.new) B = A // sort values
+ }
+ {
+ // Any NaN --> NaN, possibly raise invalid if sNaN
+ p0 = dfclass(A,#0x0f) // A not NaN?
+ if (!p0.new) jump:nt .Linvalid_nan_add
+ if (!p3) ATMP = BTMP
+ if (!p3) BTMP = ATMP
+ }
+ {
+ // Infinity + non-infinity number is infinity
+ // Infinity + infinity --> inf or nan
+ p1 = dfclass(A,#0x08) // A is infinity
+ if (p1.new) jump:nt .Linf_add
+ }
+ {
+ p2 = dfclass(B,#0x01) // B is zero
+ if (p2.new) jump:nt .LB_zero // so return A or special 0+0
+ ATMP = #0
+ }
+ // We are left with adding one or more subnormals
+ {
+ p0 = dfclass(A,#4)
+ if (p0.new) jump:nt .Ladd_two_subnormal
+ ATMP = combine(##0x20000000,#0)
+ }
+ {
+ EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXPB = #1
+ // BTMP already ABS(B)
+ BTMP = asl(BTMP,#EXPBITS-2)
+ }
+#undef ZERO
+#define EXTRACTOFF r14
+#define EXPDIFF r15
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-2)
+ EXPDIFF = sub(EXPA,EXPB)
+ ZTMP = combine(#62,#1)
+ jump .Ladd_continue
+ }
+
+.Ladd_two_subnormal:
+ {
+ ATMP = extractu(A,#63,#0)
+ BTMP = extractu(B,#63,#0)
+ }
+ {
+ ATMP = neg(ATMP)
+ BTMP = neg(BTMP)
+ p0 = cmp.gt(AH,#-1)
+ p1 = cmp.gt(BH,#-1)
+ }
+ {
+ if (p0) ATMP = A
+ if (p1) BTMP = B
+ }
+ {
+ ATMP = add(ATMP,BTMP)
+ }
+ {
+ BTMP = neg(ATMP)
+ p0 = cmp.gt(ATMPH,#-1)
+ B = #0
+ }
+ {
+ if (!p0) A = BTMP
+ if (p0) A = ATMP
+ BH = ##0x80000000
+ }
+ {
+ if (!p0) AH = or(AH,BH)
+ p0 = dfcmp.eq(A,B)
+ if (p0.new) jump:nt .Lzero_plus_zero
+ }
+ {
+ jumpr r31
+ }
+
+.Linvalid_nan_add:
+ {
+ TMP = convert_df2sf(A) // will generate invalid if sNaN
+ p0 = dfclass(B,#0x0f) // if B is not NaN
+ if (p0.new) B = A // make it whatever A is
+ }
+ {
+ BL = convert_df2sf(B) // will generate invalid if sNaN
+ A = #-1
+ jumpr r31
+ }
+ .falign
+.LB_zero:
+ {
+ p0 = dfcmp.eq(ATMP,A) // is A also zero?
+ if (!p0.new) jumpr:t r31 // If not, just return A
+ }
+ // 0 + 0 is special
+ // if equal integral values, they have the same sign, which is fine for all rounding
+ // modes.
+ // If unequal in sign, we get +0 for all rounding modes except round down
+.Lzero_plus_zero:
+ {
+ p0 = cmp.eq(A,B)
+ if (p0.new) jumpr:t r31
+ }
+ {
+ TMP = USR
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ A = #0
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ jumpr r31
+ }
+.Linf_add:
+ // adding infinities is only OK if they are equal
+ {
+ p0 = !cmp.eq(AH,BH) // Do they have different signs
+ p0 = dfclass(B,#8) // And is B also infinite?
+ if (!p0.new) jumpr:t r31 // If not, just a normal inf
+ }
+ {
+ BL = ##0x7f800001 // sNAN
+ }
+ {
+ A = convert_sf2df(BL) // trigger invalid, set NaN
+ jumpr r31
+ }
+END(__hexagon_adddf3)
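Two ideas in this file translate directly to C: __hexagon_subdf3 is __hexagon_adddf3 with B's sign bit toggled, and operand alignment in .Ladd_continue folds any shifted-out bits into a sticky bit so rounding still sees inexactness. A hedged C sketch of both (names other than __hexagon_adddf3 are illustrative):

    #include <stdint.h>
    #include <string.h>

    double __hexagon_adddf3(double, double);   /* the routine above */

    /* subdf3 = adddf3 with b negated: BH = togglebit(BH,#31), then a
       fall-through jump to __qdsp_adddf3. */
    static double subdf3_model(double a, double b)
    {
        uint64_t bits;
        memcpy(&bits, &b, sizeof bits);
        bits ^= 1ULL << 63;
        memcpy(&b, &bits, sizeof bits);
        return __hexagon_adddf3(a, b);
    }

    /* Alignment shift with a sticky LSB, as in
       ATMP2 = extractu(BTMP,EXTRACTAMT); BTMPL = or(BTMPL,ZTMPL). */
    static uint64_t sticky_shift_right(uint64_t x, unsigned n)
    {
        if (n == 0) return x;
        if (n > 63) return x != 0;            /* everything becomes sticky */
        uint64_t lost = x & ((1ULL << n) - 1);
        return (x >> n) | (lost != 0);
    }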
diff --git a/lib/builtins/hexagon/dfdiv.S b/lib/builtins/hexagon/dfdiv.S
new file mode 100644
index 000000000..0c5dbe272
--- /dev/null
+++ b/lib/builtins/hexagon/dfdiv.S
@@ -0,0 +1,492 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision Divide */
+
+#define A r1:0
+#define AH r1
+#define AL r0
+
+#define B r3:2
+#define BH r3
+#define BL r2
+
+#define Q r5:4
+#define QH r5
+#define QL r4
+
+#define PROD r7:6
+#define PRODHI r7
+#define PRODLO r6
+
+#define SFONE r8
+#define SFDEN r9
+#define SFERROR r10
+#define SFRECIP r11
+
+#define EXPBA r13:12
+#define EXPB r13
+#define EXPA r12
+
+#define REMSUB2 r15:14
+
+
+
+#define SIGN r28
+
+#define Q_POSITIVE p3
+#define NORMAL p2
+#define NO_OVF_UNF p1
+#define P_TMP p0
+
+#define RECIPEST_SHIFT 3
+#define QADJ 61
+
+#define DFCLASS_NORMAL 0x02
+#define DFCLASS_NUMBER 0x0F
+#define DFCLASS_INFINITE 0x08
+#define DFCLASS_ZERO 0x01
+#define DFCLASS_NONZERO (DFCLASS_NUMBER ^ DFCLASS_ZERO)
+#define DFCLASS_NONINFINITE (DFCLASS_NUMBER ^ DFCLASS_INFINITE)
+
+#define DF_MANTBITS 52
+#define DF_EXPBITS 11
+#define SF_MANTBITS 23
+#define SF_EXPBITS 8
+#define DF_BIAS 0x3ff
+
+#define SR_ROUND_OFF 22
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+ .text
+ .global __hexagon_divdf3
+ .type __hexagon_divdf3,@function
+ Q6_ALIAS(divdf3)
+ FAST_ALIAS(divdf3)
+ FAST2_ALIAS(divdf3)
+ .p2align 5
+__hexagon_divdf3:
+ {
+ NORMAL = dfclass(A,#DFCLASS_NORMAL)
+ NORMAL = dfclass(B,#DFCLASS_NORMAL)
+ EXPBA = combine(BH,AH)
+ SIGN = xor(AH,BH)
+ }
+#undef A
+#undef AH
+#undef AL
+#undef B
+#undef BH
+#undef BL
+#define REM r1:0
+#define REMHI r1
+#define REMLO r0
+#define DENOM r3:2
+#define DENOMHI r3
+#define DENOMLO r2
+ {
+ if (!NORMAL) jump .Ldiv_abnormal
+ PROD = extractu(DENOM,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS)
+ SFONE = ##0x3f800001
+ }
+ {
+ SFDEN = or(SFONE,PRODLO)
+ EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32)
+ EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32)
+ Q_POSITIVE = cmp.gt(SIGN,#-1)
+ }
+#undef SIGN
+#define ONE r28
+.Ldenorm_continue:
+ {
+ SFRECIP,P_TMP = sfrecipa(SFONE,SFDEN)
+ SFERROR = and(SFONE,#-2)
+ ONE = #1
+ EXPA = sub(EXPA,EXPB)
+ }
+#undef EXPB
+#define RECIPEST r13
+ {
+ SFERROR -= sfmpy(SFRECIP,SFDEN):lib
+ REMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)
+ RECIPEST = ##0x00800000 << RECIPEST_SHIFT
+ }
+ {
+ SFRECIP += sfmpy(SFRECIP,SFERROR):lib
+ DENOMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)
+ SFERROR = and(SFONE,#-2)
+ }
+ {
+ SFERROR -= sfmpy(SFRECIP,SFDEN):lib
+ QH = #-DF_BIAS+1
+ QL = #DF_BIAS-1
+ }
+ {
+ SFRECIP += sfmpy(SFRECIP,SFERROR):lib
+ NO_OVF_UNF = cmp.gt(EXPA,QH)
+ NO_OVF_UNF = !cmp.gt(EXPA,QL)
+ }
+ {
+ RECIPEST = insert(SFRECIP,#SF_MANTBITS,#RECIPEST_SHIFT)
+ Q = #0
+ EXPA = add(EXPA,#-QADJ)
+ }
+#undef SFERROR
+#undef SFRECIP
+#define TMP r10
+#define TMP1 r11
+ {
+ RECIPEST = add(RECIPEST,#((-3) << RECIPEST_SHIFT))
+ }
+
+#define DIV_ITER1B(QSHIFTINSN,QSHIFT,REMSHIFT,EXTRA) \
+ { \
+ PROD = mpyu(RECIPEST,REMHI); \
+ REM = asl(REM,# ## ( REMSHIFT )); \
+ }; \
+ { \
+ PRODLO = # ## 0; \
+ REM -= mpyu(PRODHI,DENOMLO); \
+ REMSUB2 = mpyu(PRODHI,DENOMHI); \
+ }; \
+ { \
+ Q += QSHIFTINSN(PROD, # ## ( QSHIFT )); \
+ REM -= asl(REMSUB2, # ## 32); \
+ EXTRA \
+ }
+
+
+ DIV_ITER1B(ASL,14,15,)
+ DIV_ITER1B(ASR,1,15,)
+ DIV_ITER1B(ASR,16,15,)
+ DIV_ITER1B(ASR,31,15,PROD=# ( 0 );)
+
+#undef REMSUB2
+#define TMPPAIR r15:14
+#define TMPPAIRHI r15
+#define TMPPAIRLO r14
+#undef RECIPEST
+#define EXPB r13
+ {
+ // compare or sub with carry
+ TMPPAIR = sub(REM,DENOM)
+ P_TMP = cmp.gtu(DENOM,REM)
+ // set up amt to add to q
+ if (!P_TMP.new) PRODLO = #2
+ }
+ {
+ Q = add(Q,PROD)
+ if (!P_TMP) REM = TMPPAIR
+ TMPPAIR = #0
+ }
+ {
+ P_TMP = cmp.eq(REM,TMPPAIR)
+ if (!P_TMP.new) QL = or(QL,ONE)
+ }
+ {
+ PROD = neg(Q)
+ }
+ {
+ if (!Q_POSITIVE) Q = PROD
+ }
+#undef REM
+#undef REMHI
+#undef REMLO
+#undef DENOM
+#undef DENOMLO
+#undef DENOMHI
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+ {
+ A = convert_d2df(Q)
+ if (!NO_OVF_UNF) jump .Ldiv_ovf_unf
+ }
+ {
+ AH += asl(EXPA,#DF_MANTBITS-32)
+ jumpr r31
+ }
+
+.Ldiv_ovf_unf:
+ {
+ AH += asl(EXPA,#DF_MANTBITS-32)
+ EXPB = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
+ }
+ {
+ PROD = abs(Q)
+ EXPA = add(EXPA,EXPB)
+ }
+ {
+ P_TMP = cmp.gt(EXPA,##DF_BIAS+DF_BIAS) // overflow
+ if (P_TMP.new) jump:nt .Ldiv_ovf
+ }
+ {
+ P_TMP = cmp.gt(EXPA,#0)
+ if (P_TMP.new) jump:nt .Lpossible_unf // round up to normal possible...
+ }
+ /* Underflow */
+ /* We know what the infinite range exponent should be (EXPA) */
+ /* Q is 2's complement, PROD is abs(Q) */
+ /* Normalize Q, shift right, add a high bit, convert, change exponent */
+
+#define FUDGE1 7 // how much to shift right
+#define FUDGE2 4 // how many guard/round to keep at lsbs
+
+ {
+ EXPB = add(clb(PROD),#-1) // doesn't need to be added in since
+ EXPA = sub(#FUDGE1,EXPA) // we extract post-converted exponent
+ TMP = USR
+ TMP1 = #63
+ }
+ {
+ EXPB = min(EXPA,TMP1)
+ TMP1 = or(TMP,#0x030)
+ PROD = asl(PROD,EXPB)
+ EXPA = #0
+ }
+ {
+ TMPPAIR = extractu(PROD,EXPBA) // bits that will get shifted out
+ PROD = lsr(PROD,EXPB) // shift out bits
+ B = #1
+ }
+ {
+ P_TMP = cmp.gtu(B,TMPPAIR)
+ if (!P_TMP.new) PRODLO = or(BL,PRODLO)
+ PRODHI = setbit(PRODHI,#DF_MANTBITS-32+FUDGE2)
+ }
+ {
+ Q = neg(PROD)
+ P_TMP = bitsclr(PRODLO,#(1<<FUDGE2)-1)
+ if (!P_TMP.new) TMP = TMP1
+ }
+ {
+ USR = TMP
+ if (Q_POSITIVE) Q = PROD
+ TMP = #-DF_BIAS-(DF_MANTBITS+FUDGE2)
+ }
+ {
+ A = convert_d2df(Q)
+ }
+ {
+ AH += asl(TMP,#DF_MANTBITS-32)
+ jumpr r31
+ }
+
+
+.Lpossible_unf:
+ /* If upper parts of Q were all F's, but abs(A) == 0x00100000_00000000, we rounded up to min_normal */
+ /* The answer is correct, but we need to raise Underflow */
+ {
+ B = extractu(A,#63,#0)
+ TMPPAIR = combine(##0x00100000,#0) // min normal
+ TMP = #0x7FFF
+ }
+ {
+ P_TMP = dfcmp.eq(TMPPAIR,B) // Is everything zero in the rounded value...
+ P_TMP = bitsset(PRODHI,TMP) // but a bunch of bits set in the unrounded abs(quotient)?
+ }
+
+#if (__HEXAGON_ARCH__ == 60)
+ TMP = USR // If not, just return
+ if (!P_TMP) jumpr r31 // Else, we want to set Unf+Inexact
+ // Note that inexact is already set...
+#else
+ {
+ if (!P_TMP) jumpr r31 // If not, just return
+ TMP = USR // Else, we want to set Unf+Inexact
+ } // Note that inexact is already set...
+#endif
+ {
+ TMP = or(TMP,#0x30)
+ }
+ {
+ USR = TMP
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ jumpr r31
+ }
+
+.Ldiv_ovf:
+ /*
+ * Raise Overflow, and choose the correct overflow value (saturated normal or infinity)
+ */
+ {
+ TMP = USR
+ B = combine(##0x7fefffff,#-1)
+ AH = mux(Q_POSITIVE,#0,#-1)
+ }
+ {
+ PROD = combine(##0x7ff00000,#0)
+ QH = extractu(TMP,#2,#SR_ROUND_OFF)
+ TMP = or(TMP,#0x28)
+ }
+ {
+ USR = TMP
+ QH ^= lsr(AH,#31)
+ QL = QH
+ }
+ {
+ p0 = !cmp.eq(QL,#1) // if not round-to-zero
+ p0 = !cmp.eq(QH,#2) // and not rounding the other way
+ if (p0.new) B = PROD // go to inf
+ p0 = dfcmp.eq(B,B) // get exceptions
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+
+#undef ONE
+#define SIGN r28
+#undef NORMAL
+#undef NO_OVF_UNF
+#define P_INF p1
+#define P_ZERO p2
+.Ldiv_abnormal:
+ {
+ P_TMP = dfclass(A,#DFCLASS_NUMBER)
+ P_TMP = dfclass(B,#DFCLASS_NUMBER)
+ Q_POSITIVE = cmp.gt(SIGN,#-1)
+ }
+ {
+ P_INF = dfclass(A,#DFCLASS_INFINITE)
+ P_INF = dfclass(B,#DFCLASS_INFINITE)
+ }
+ {
+ P_ZERO = dfclass(A,#DFCLASS_ZERO)
+ P_ZERO = dfclass(B,#DFCLASS_ZERO)
+ }
+ {
+ if (!P_TMP) jump .Ldiv_nan
+ if (P_INF) jump .Ldiv_invalid
+ }
+ {
+ if (P_ZERO) jump .Ldiv_invalid
+ }
+ {
+ P_ZERO = dfclass(A,#DFCLASS_NONZERO) // nonzero
+ P_ZERO = dfclass(B,#DFCLASS_NONINFINITE) // non-infinite
+ }
+ {
+ P_INF = dfclass(A,#DFCLASS_NONINFINITE) // non-infinite
+ P_INF = dfclass(B,#DFCLASS_NONZERO) // nonzero
+ }
+ {
+ if (!P_ZERO) jump .Ldiv_zero_result
+ if (!P_INF) jump .Ldiv_inf_result
+ }
+ /* Now we've narrowed it down to (de)normal / (de)normal */
+ /* Set up A/EXPA B/EXPB and go back */
+#undef P_ZERO
+#undef P_INF
+#define P_TMP2 p1
+ {
+ P_TMP = dfclass(A,#DFCLASS_NORMAL)
+ P_TMP2 = dfclass(B,#DFCLASS_NORMAL)
+ TMP = ##0x00100000
+ }
+ {
+ EXPBA = combine(BH,AH)
+ AH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit
+ BH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit
+ }
+ {
+ if (P_TMP) AH = or(AH,TMP) // if normal, add back in hidden bit
+ if (P_TMP2) BH = or(BH,TMP) // if normal, add back in hidden bit
+ }
+ {
+ QH = add(clb(A),#-DF_EXPBITS)
+ QL = add(clb(B),#-DF_EXPBITS)
+ TMP = #1
+ }
+ {
+ EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32)
+ EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32)
+ }
+ {
+ A = asl(A,QH)
+ B = asl(B,QL)
+ if (!P_TMP) EXPA = sub(TMP,QH)
+ if (!P_TMP2) EXPB = sub(TMP,QL)
+ } // recreate values needed by the resumed code
+ {
+ PROD = extractu(B,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS)
+ }
+ {
+ SFDEN = or(SFONE,PRODLO)
+ jump .Ldenorm_continue
+ }
+
+.Ldiv_zero_result:
+ {
+ AH = xor(AH,BH)
+ B = #0
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+.Ldiv_inf_result:
+ {
+ p2 = dfclass(B,#DFCLASS_ZERO)
+ p2 = dfclass(A,#DFCLASS_NONINFINITE)
+ }
+ {
+ TMP = USR
+ if (!p2) jump 1f
+ AH = xor(AH,BH)
+ }
+ {
+ TMP = or(TMP,#0x04) // DBZ
+ }
+ {
+ USR = TMP
+ }
+1:
+ {
+ B = combine(##0x7ff00000,#0)
+ p0 = dfcmp.uo(B,B) // take possible exception
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+.Ldiv_nan:
+ {
+ p0 = dfclass(A,#0x10)
+ p1 = dfclass(B,#0x10)
+ if (!p0.new) A = B
+ if (!p1.new) B = A
+ }
+ {
+ QH = convert_df2sf(A) // get possible invalid exceptions
+ QL = convert_df2sf(B)
+ }
+ {
+ A = #-1
+ jumpr r31
+ }
+
+.Ldiv_invalid:
+ {
+ TMP = ##0x7f800001
+ }
+ {
+ A = convert_sf2df(TMP) // get invalid, get DF qNaN
+ jumpr r31
+ }
+END(__hexagon_divdf3)
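The division core seeds a single-precision reciprocal with sfrecipa and sharpens it with two multiply-accumulate refinement steps (the sfmpy ...:lib pairs in .Ldenorm_continue) before the fixed-point quotient iterations. A hedged C sketch of that refinement (recip_seed is a hypothetical stand-in for sfrecipa):

    /* Hedged sketch of the Newton-Raphson refinement: each round
       roughly doubles the number of correct reciprocal bits. */
    float recip_seed(float den);   /* hypothetical stand-in for sfrecipa */

    static float refine_reciprocal(float den)
    {
        float recip = recip_seed(den);
        for (int i = 0; i < 2; i++) {
            float err = 1.0f - den * recip;  /* SFERROR -= sfmpy(SFRECIP,SFDEN) */
            recip = recip + recip * err;     /* SFRECIP += sfmpy(SFRECIP,SFERROR) */
        }
        return recip;
    }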
diff --git a/lib/builtins/hexagon/dffma.S b/lib/builtins/hexagon/dffma.S
new file mode 100644
index 000000000..97b885a3b
--- /dev/null
+++ b/lib/builtins/hexagon/dffma.S
@@ -0,0 +1,705 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+/* Double Precision Fused Multiply-Add */
+
+
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+#define C r5:4
+#define CH r5
+#define CL r4
+
+
+
+#define BTMP r15:14
+#define BTMPH r15
+#define BTMPL r14
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define CTMP r11:10
+#define CTMPH r11
+#define CTMPL r10
+
+#define PP_LL r9:8
+#define PP_LL_H r9
+#define PP_LL_L r8
+
+#define PP_ODD r7:6
+#define PP_ODD_H r7
+#define PP_ODD_L r6
+
+
+#define PP_HH r17:16
+#define PP_HH_H r17
+#define PP_HH_L r16
+
+#define EXPA r18
+#define EXPB r19
+#define EXPBA r19:18
+
+#define TMP r28
+
+#define P_TMP p0
+#define PROD_NEG p3
+#define EXACT p2
+#define SWAP p1
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1023
+#define STACKSPACE 32
+
+#define ADJUST 4
+
+#define FUDGE 7
+#define FUDGE2 3
+
+#ifndef SR_ROUND_OFF
+#define SR_ROUND_OFF 22
+#endif
+
+ /*
+ * First, classify for normal values, and abort if abnormal
+ *
+ * Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
+ *
+ * Since we know that the 2 MSBs of the H registers are zero, we should never carry
+ * the partial products that involve the H registers
+ *
+ * Try to buy X slots, at the expense of latency if needed
+ *
+ * We will have PP_HH with the upper bits of the product, PP_LL with the lower
+ * PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
+ * PP_HH can have a minimum of 0x0100_0000_0000_0000
+ *
+ * 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
+ *
+ * We need to align CTMP.
+ * If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
+ * If CTMP << PP align CTMP and add 128 bits. Then compute sticky
+ * If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
+ *
+ * Convert partial product and CTMP to 2's complement prior to addition
+ *
+ * After we add, we need to normalize into upper 64 bits, then compute sticky.
+ *
+ *
+ */
+
+ .text
+ .global __hexagon_fmadf4
+ .type __hexagon_fmadf4,@function
+ .global __hexagon_fmadf5
+ .type __hexagon_fmadf5,@function
+ .global fma
+ .type fma,@function
+ Q6_ALIAS(fmadf5)
+ .p2align 5
+__hexagon_fmadf4:
+__hexagon_fmadf5:
+fma:
+ {
+ P_TMP = dfclass(A,#2)
+ P_TMP = dfclass(B,#2)
+ ATMP = #0
+ BTMP = #0
+ }
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-3)
+ BTMP = insert(B,#MANTBITS,#EXPBITS-3)
+ PP_ODD_H = ##0x10000000
+ allocframe(#STACKSPACE)
+ }
+ {
+ PP_LL = mpyu(ATMPL,BTMPL)
+ if (!P_TMP) jump .Lfma_abnormal_ab
+ ATMPH = or(ATMPH,PP_ODD_H)
+ BTMPH = or(BTMPH,PP_ODD_H)
+ }
+ {
+ P_TMP = dfclass(C,#2)
+ if (!P_TMP.new) jump:nt .Lfma_abnormal_c
+ CTMP = combine(PP_ODD_H,#0)
+ PP_ODD = combine(#0,PP_LL_H)
+ }
+.Lfma_abnormal_c_restart:
+ {
+ PP_ODD += mpyu(BTMPL,ATMPH)
+ CTMP = insert(C,#MANTBITS,#EXPBITS-3)
+ memd(r29+#0) = PP_HH
+ memd(r29+#8) = EXPBA
+ }
+ {
+ PP_ODD += mpyu(ATMPL,BTMPH)
+ EXPBA = neg(CTMP)
+ P_TMP = cmp.gt(CH,#-1)
+ TMP = xor(AH,BH)
+ }
+ {
+ EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
+ PP_HH = combine(#0,PP_ODD_H)
+ if (!P_TMP) CTMP = EXPBA
+ }
+ {
+ PP_HH += mpyu(ATMPH,BTMPH)
+ PP_LL = combine(PP_ODD_L,PP_LL_L)
+#undef PP_ODD
+#undef PP_ODD_H
+#undef PP_ODD_L
+#undef ATMP
+#undef ATMPL
+#undef ATMPH
+#undef BTMP
+#undef BTMPL
+#undef BTMPH
+#define RIGHTLEFTSHIFT r13:12
+#define RIGHTSHIFT r13
+#define LEFTSHIFT r12
+
+ EXPA = add(EXPA,EXPB)
+#undef EXPB
+#undef EXPBA
+#define EXPC r19
+#define EXPCA r19:18
+ EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
+ }
+ /* PP_HH:PP_LL now has product */
+ /* CTMP is negated */
+ /* EXPA,B,C are extracted */
+ /*
+ * We need to negate PP
+ * Since we will be adding with carry later, if we need to negate,
+ * just invert all bits now, which we can do conditionally and in parallel
+ */
+#define PP_HH_TMP r15:14
+#define PP_LL_TMP r7:6
+ {
+ EXPA = add(EXPA,#-BIAS+(ADJUST))
+ PROD_NEG = !cmp.gt(TMP,#-1)
+ PP_LL_TMP = #0
+ PP_HH_TMP = #0
+ }
+ {
+ PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
+ P_TMP = !cmp.gt(TMP,#-1)
+ SWAP = cmp.gt(EXPC,EXPA) // If C >> PP
+ if (SWAP.new) EXPCA = combine(EXPA,EXPC)
+ }
+ {
+ PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
+ if (P_TMP) PP_LL = PP_LL_TMP
+#undef PP_LL_TMP
+#define CTMP2 r7:6
+#define CTMP2H r7
+#define CTMP2L r6
+ CTMP2 = #0
+ EXPC = sub(EXPA,EXPC)
+ }
+ {
+ if (P_TMP) PP_HH = PP_HH_TMP
+ P_TMP = cmp.gt(EXPC,#63)
+ if (SWAP) PP_LL = CTMP2
+ if (SWAP) CTMP2 = PP_LL
+ }
+#undef PP_HH_TMP
+//#define ONE r15:14
+//#define S_ONE r14
+#define ZERO r15:14
+#define S_ZERO r15
+#undef PROD_NEG
+#define P_CARRY p3
+ {
+ if (SWAP) PP_HH = CTMP // Swap C and PP
+ if (SWAP) CTMP = PP_HH
+ if (P_TMP) EXPC = add(EXPC,#-64)
+ TMP = #63
+ }
+ {
+ // If diff > 63, pre-shift-right by 64...
+ if (P_TMP) CTMP2 = CTMP
+ TMP = asr(CTMPH,#31)
+ RIGHTSHIFT = min(EXPC,TMP)
+ LEFTSHIFT = #0
+ }
+#undef C
+#undef CH
+#undef CL
+#define STICKIES r5:4
+#define STICKIESH r5
+#define STICKIESL r4
+ {
+ if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64
+ STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
+ CTMP2 = lsr(CTMP2,RIGHTSHIFT)
+ LEFTSHIFT = sub(#64,RIGHTSHIFT)
+ }
+ {
+ ZERO = #0
+ TMP = #-2
+ CTMP2 |= lsl(CTMP,LEFTSHIFT)
+ CTMP = asr(CTMP,RIGHTSHIFT)
+ }
+ {
+ P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift
+ if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
+#undef ZERO
+#define ONE r15:14
+#define S_ONE r14
+ ONE = #1
+ STICKIES = #0
+ }
+ {
+ PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky
+ }
+ {
+ PP_HH = add(CTMP,PP_HH,P_CARRY):carry
+ TMP = #62
+ }
+ /*
+ * PP_HH:PP_LL now holds the sum
+ * We may need to normalize left, up to ??? bits.
+ *
+ * I think that if we have massive cancellation, the range we normalize by
+ * is still limited
+ */
+ {
+ LEFTSHIFT = add(clb(PP_HH),#-2)
+ if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
+ }
+ /* We had all sign bits, shift left by 62. */
+ {
+ CTMP = extractu(PP_LL,#62,#2)
+ PP_LL = asl(PP_LL,#62)
+ EXPA = add(EXPA,#-62) // And adjust exponent of result
+ }
+ {
+ PP_HH = insert(CTMP,#62,#0) // Then shift 63
+ }
+ {
+ LEFTSHIFT = add(clb(PP_HH),#-2)
+ }
+ .falign
+1:
+ {
+ CTMP = asl(PP_HH,LEFTSHIFT)
+ STICKIES |= asl(PP_LL,LEFTSHIFT)
+ RIGHTSHIFT = sub(#64,LEFTSHIFT)
+ EXPA = sub(EXPA,LEFTSHIFT)
+ }
+ {
+ CTMP |= lsr(PP_LL,RIGHTSHIFT)
+ EXACT = cmp.gtu(ONE,STICKIES)
+ TMP = #BIAS+BIAS-2
+ }
+ {
+ if (!EXACT) CTMPL = or(CTMPL,S_ONE)
+ // If EXPA is overflow/underflow, jump to ovf_unf
+ P_TMP = !cmp.gt(EXPA,TMP)
+ P_TMP = cmp.gt(EXPA,#1)
+ if (!P_TMP.new) jump:nt .Lfma_ovf_unf
+ }
+ {
+ // XXX: FIXME: should PP_HH for check of zero be CTMP?
+ P_TMP = cmp.gtu(ONE,CTMP) // is result true zero?
+ A = convert_d2df(CTMP)
+ EXPA = add(EXPA,#-BIAS-60)
+ PP_HH = memd(r29+#0)
+ }
+ {
+ AH += asl(EXPA,#HI_MANTBITS)
+ EXPCA = memd(r29+#8)
+ if (!P_TMP) dealloc_return // not zero, return
+ }
+.Ladd_yields_zero:
+ /* We had full cancellation. Return +/- zero (-0 when round-down) */
+ {
+ TMP = USR
+ A = #0
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ PP_HH = memd(r29+#0)
+ EXPCA = memd(r29+#8)
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ dealloc_return
+ }
+
+#undef RIGHTLEFTSHIFT
+#undef RIGHTSHIFT
+#undef LEFTSHIFT
+#undef CTMP2
+#undef CTMP2H
+#undef CTMP2L
+
+.Lfma_ovf_unf:
+ {
+ p0 = cmp.gtu(ONE,CTMP)
+ if (p0.new) jump:nt .Ladd_yields_zero
+ }
+ {
+ A = convert_d2df(CTMP)
+ EXPA = add(EXPA,#-BIAS-60)
+ TMP = EXPA
+ }
+#define NEW_EXPB r7
+#define NEW_EXPA r6
+ {
+ AH += asl(EXPA,#HI_MANTBITS)
+ NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ }
+ {
+ NEW_EXPA = add(EXPA,NEW_EXPB)
+ PP_HH = memd(r29+#0)
+ EXPCA = memd(r29+#8)
+#undef PP_HH
+#undef PP_HH_H
+#undef PP_HH_L
+#undef EXPCA
+#undef EXPC
+#undef EXPA
+#undef PP_LL
+#undef PP_LL_H
+#undef PP_LL_L
+#define EXPA r6
+#define EXPB r7
+#define EXPBA r7:6
+#define ATMP r9:8
+#define ATMPH r9
+#define ATMPL r8
+#undef NEW_EXPB
+#undef NEW_EXPA
+ ATMP = abs(CTMP)
+ }
+ {
+ p0 = cmp.gt(EXPA,##BIAS+BIAS)
+ if (p0.new) jump:nt .Lfma_ovf
+ }
+ {
+ p0 = cmp.gt(EXPA,#0)
+ if (p0.new) jump:nt .Lpossible_unf
+ }
+ {
+ // TMP has original EXPA.
+ // ATMP is corresponding value
+ // Normalize ATMP and shift right to correct location
+ EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize
+ EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize
+ p3 = cmp.gt(CTMPH,#-1)
+ }
+ /* Underflow */
+ /* We know that the infinite range exponent should be EXPA */
+ /* CTMP is 2's complement, ATMP is abs(CTMP) */
+ {
+ EXPA = add(EXPA,EXPB) // how much to shift back right
+ ATMP = asl(ATMP,EXPB) // shift left
+ AH = USR
+ TMP = #63
+ }
+ {
+ EXPB = min(EXPA,TMP)
+ EXPA = #0
+ AL = #0x0030
+ }
+ {
+ B = extractu(ATMP,EXPBA)
+ ATMP = asr(ATMP,EXPB)
+ }
+ {
+ p0 = cmp.gtu(ONE,B)
+ if (!p0.new) ATMPL = or(ATMPL,S_ONE)
+ ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
+ }
+ {
+ CTMP = neg(ATMP)
+ p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
+ if (!p1.new) AH = or(AH,AL)
+ B = #0
+ }
+ {
+ if (p3) CTMP = ATMP
+ USR = AH
+ TMP = #-BIAS-(MANTBITS+FUDGE2)
+ }
+ {
+ A = convert_d2df(CTMP)
+ }
+ {
+ AH += asl(TMP,#HI_MANTBITS)
+ dealloc_return
+ }
+.Lpossible_unf:
+ {
+ TMP = ##0x7fefffff
+ ATMP = abs(CTMP)
+ }
+ {
+ p0 = cmp.eq(AL,#0)
+ p0 = bitsclr(AH,TMP)
+ if (!p0.new) dealloc_return:t
+ TMP = #0x7fff
+ }
+ {
+ p0 = bitsset(ATMPH,TMP)
+ BH = USR
+ BL = #0x0030
+ }
+ {
+ if (p0) BH = or(BH,BL)
+ }
+ {
+ USR = BH
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ dealloc_return
+ }
+.Lfma_ovf:
+ {
+ TMP = USR
+ CTMP = combine(##0x7fefffff,#-1)
+ A = CTMP
+ }
+ {
+ ATMP = combine(##0x7ff00000,#0)
+ BH = extractu(TMP,#2,#SR_ROUND_OFF)
+ TMP = or(TMP,#0x28)
+ }
+ {
+ USR = TMP
+ BH ^= lsr(AH,#31)
+ BL = BH
+ }
+ {
+ p0 = !cmp.eq(BL,#1)
+ p0 = !cmp.eq(BH,#2)
+ }
+ {
+ p0 = dfcmp.eq(ATMP,ATMP)
+ if (p0.new) CTMP = ATMP
+ }
+ {
+ A = insert(CTMP,#63,#0)
+ dealloc_return
+ }
+#undef CTMP
+#undef CTMPH
+#undef CTMPL
+#define BTMP r11:10
+#define BTMPH r11
+#define BTMPL r10
+
+#undef STICKIES
+#undef STICKIESH
+#undef STICKIESL
+#define C r5:4
+#define CH r5
+#define CL r4
+
+.Lfma_abnormal_ab:
+ {
+ ATMP = extractu(A,#63,#0)
+ BTMP = extractu(B,#63,#0)
+ deallocframe
+ }
+ {
+ p3 = cmp.gtu(ATMP,BTMP)
+ if (!p3.new) A = B // sort values
+ if (!p3.new) B = A
+ }
+ {
+ p0 = dfclass(A,#0x0f) // Is A not NaN?
+ if (!p0.new) jump:nt .Lnan
+ if (!p3) ATMP = BTMP
+ if (!p3) BTMP = ATMP
+ }
+ {
+ p1 = dfclass(A,#0x08) // A is infinity
+ p1 = dfclass(B,#0x0e) // B is nonzero
+ }
+ {
+ p0 = dfclass(A,#0x08) // a is inf
+ p0 = dfclass(B,#0x01) // b is zero
+ }
+ {
+ if (p1) jump .Lab_inf
+ p2 = dfclass(B,#0x01)
+ }
+ {
+ if (p0) jump .Linvalid
+ if (p2) jump .Lab_true_zero
+ TMP = ##0x7c000000
+ }
+ // We are left with a normal or subnormal times a subnormal, A > B.
+ // If A and B are both very small, the product collapses to a single sticky
+ // bit; replace the lower 63 bits of A and B with 0x0010_0000_0000_0000,
+ // which yields equivalent results.
+ // If A and B might multiply to something bigger, decrease A's exponent and
+ // increase B's exponent, then start over.
+ {
+ p0 = bitsclr(AH,TMP)
+ if (p0.new) jump:nt .Lfma_ab_tiny
+ }
+ {
+ TMP = add(clb(BTMP),#-EXPBITS)
+ }
+ {
+ BTMP = asl(BTMP,TMP)
+ }
+ {
+ B = insert(BTMP,#63,#0)
+ AH -= asl(TMP,#HI_MANTBITS)
+ }
+ jump fma
+
+.Lfma_ab_tiny:
+ ATMP = combine(##0x00100000,#0)
+ {
+ A = insert(ATMP,#63,#0)
+ B = insert(ATMP,#63,#0)
+ }
+ jump fma
+
+.Lab_inf:
+ {
+ B = lsr(B,#63)
+ p0 = dfclass(C,#0x10)
+ }
+ {
+ A ^= asl(B,#63)
+ if (p0) jump .Lnan
+ }
+ {
+ p1 = dfclass(C,#0x08)
+ if (p1.new) jump:nt .Lfma_inf_plus_inf
+ }
+ /* A*B is +/- inf, C is finite. Return A */
+ {
+ jumpr r31
+ }
+ .falign
+.Lfma_inf_plus_inf:
+ { // adding infinities of different signs is invalid
+ p0 = dfcmp.eq(A,C)
+ if (!p0.new) jump:nt .Linvalid
+ }
+ {
+ jumpr r31
+ }
+
+.Lnan:
+ {
+ p0 = dfclass(B,#0x10)
+ p1 = dfclass(C,#0x10)
+ if (!p0.new) B = A
+ if (!p1.new) C = A
+ }
+ { // find sNaNs
+ BH = convert_df2sf(B)
+ BL = convert_df2sf(C)
+ }
+ {
+ BH = convert_df2sf(A)
+ A = #-1
+ jumpr r31
+ }
+
+.Linvalid:
+ {
+ TMP = ##0x7f800001 // sp snan
+ }
+ {
+ A = convert_sf2df(TMP)
+ jumpr r31
+ }
+
+.Lab_true_zero:
+ // B is zero, A is finite number
+ {
+ p0 = dfclass(C,#0x10)
+ if (p0.new) jump:nt .Lnan
+ if (p0.new) A = C
+ }
+ {
+ p0 = dfcmp.eq(B,C) // is C also zero?
+ AH = lsr(AH,#31) // get sign
+ }
+ {
+ BH ^= asl(AH,#31) // form correctly signed zero in B
+ if (!p0) A = C // If C is not zero, return C
+ if (!p0) jumpr r31
+ }
+ /* B has correctly signed zero, C is also zero */
+.Lzero_plus_zero:
+ {
+ p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0
+ if (p0.new) jumpr:t r31
+ A = B
+ }
+ {
+ TMP = USR
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ A = #0
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ jumpr r31
+ }
+#undef BTMP
+#undef BTMPH
+#undef BTMPL
+#define CTMP r11:10
+ .falign
+.Lfma_abnormal_c:
+ /* We know that AB is normal * normal */
+ /* C is not normal: zero, subnormal, inf, or NaN. */
+ {
+ p0 = dfclass(C,#0x10) // is C NaN?
+ if (p0.new) jump:nt .Lnan
+ if (p0.new) A = C // move NaN to A
+ deallocframe
+ }
+ {
+ p0 = dfclass(C,#0x08) // is C inf?
+ if (p0.new) A = C // return C
+ if (p0.new) jumpr:nt r31
+ }
+ // zero or subnormal
+ // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
+ {
+ p0 = dfclass(C,#0x01) // is C zero?
+ if (p0.new) jump:nt __hexagon_muldf3
+ TMP = #1
+ }
+ // Left with: subnormal
+ // Adjust C and jump back to restart
+ {
+ allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame
+ CTMP = #0
+ CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
+ jump .Lfma_abnormal_c_restart
+ }
+END(fma)
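The mantissa product here is a 64x64-to-128-bit multiply built from four 32x32 mpyu products; the cross terms accumulate in PP_ODD and spill into PP_HH/PP_LL, and the header comment's observation that the operands' top bits are clear is what guarantees those accumulations never carry out. A hedged C model of the decomposition:

    #include <stdint.h>

    /* Hedged C model of the PP_LL/PP_ODD/PP_HH partial-product scheme.
       No accumulation overflows because a_hi and b_hi have their top
       bits clear for the mantissa values used here. */
    static void mul64x64_128(uint64_t am, uint64_t bm,
                             uint64_t *hi, uint64_t *lo)
    {
        uint64_t a_lo = (uint32_t)am, a_hi = am >> 32;
        uint64_t b_lo = (uint32_t)bm, b_hi = bm >> 32;

        uint64_t pp_ll  = a_lo * b_lo;                   /* mpyu(ATMPL,BTMPL) */
        uint64_t pp_odd = (pp_ll >> 32)                  /* combine(#0,PP_LL_H) */
                        + a_hi * b_lo                    /* += mpyu(BTMPL,ATMPH) */
                        + a_lo * b_hi;                   /* += mpyu(ATMPL,BTMPH) */
        uint64_t pp_hh  = a_hi * b_hi + (pp_odd >> 32);  /* mpyu(ATMPH,BTMPH) */

        *hi = pp_hh;                                     /* PP_HH */
        *lo = (pp_odd << 32) | (uint32_t)pp_ll;          /* combine(PP_ODD_L,PP_LL_L) */
    }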
diff --git a/lib/builtins/hexagon/dfminmax.S b/lib/builtins/hexagon/dfminmax.S
new file mode 100644
index 000000000..41122911f
--- /dev/null
+++ b/lib/builtins/hexagon/dfminmax.S
@@ -0,0 +1,79 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define A r1:0
+#define B r3:2
+#define ATMP r5:4
+
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+/*
+ * Min and Max return A if B is NaN, or B if A is NaN
+ * Otherwise, they return the smaller or bigger value
+ *
+ * If values are equal, we want to favor -0.0 for min and +0.0 for max.
+ */
+
+/*
+ * Compares always return false for NaN, so the sequence
+ * "if (isnan(A)) A = B; if (A > B) A = B" triggers at most one of the two assignments.
+ */
+ .text
+ .global __hexagon_mindf3
+ .global __hexagon_maxdf3
+ .global fmin
+ .type fmin,@function
+ .global fmax
+ .type fmax,@function
+ .type __hexagon_mindf3,@function
+ .type __hexagon_maxdf3,@function
+ Q6_ALIAS(mindf3)
+ Q6_ALIAS(maxdf3)
+ .p2align 5
+__hexagon_mindf3:
+fmin:
+ {
+ p0 = dfclass(A,#0x10) // Is A a NaN?
+ p1 = dfcmp.gt(A,B) // or is A > B? Then use B (gt is false on NaN)
+ ATMP = A
+ }
+ {
+ if (p0) A = B // if A is NaN use B
+ if (p1) A = B // gt is always false if either is NaN
+ p2 = dfcmp.eq(A,B) // if A == B
+ if (!p2.new) jumpr:t r31
+ }
+ /* A == B, return A|B to select -0.0 over 0.0 */
+ {
+ A = or(ATMP,B)
+ jumpr r31
+ }
+END(__hexagon_mindf3)
+ .falign
+__hexagon_maxdf3:
+fmax:
+ {
+ p0 = dfclass(A,#0x10)
+ p1 = dfcmp.gt(B,A)
+ ATMP = A
+ }
+ {
+ if (p0) A = B
+ if (p1) A = B
+ p2 = dfcmp.eq(A,B)
+ if (!p2.new) jumpr:t r31
+ }
+ /* A == B, return A&B to select 0.0 over -0.0 */
+ {
+ A = and(ATMP,B)
+ jumpr r31
+ }
+END(__hexagon_maxdf3)
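The selection rules reduce to three cases visible from C: a NaN operand loses, strict ordering picks the smaller (or larger) value, and a tie merges the sign bits so the preferred zero wins. A hedged C model of fmin (fmax is symmetric, comparing the other way and AND-ing the representations):

    #include <stdint.h>
    #include <string.h>
    #include <math.h>

    /* Hedged C model of __hexagon_mindf3: NaN loses, otherwise take the
       smaller value; on a tie, OR the bit patterns so -0.0 beats +0.0. */
    static double mindf3_model(double a, double b)
    {
        if (isnan(a)) return b;       /* dfclass(A,#0x10) */
        if (isnan(b)) return a;       /* dfcmp.gt is false when B is NaN */
        if (a > b)    return b;
        if (a == b) {                 /* merge sign bits of equal values */
            uint64_t ua, ub;
            memcpy(&ua, &a, sizeof ua);
            memcpy(&ub, &b, sizeof ub);
            ua |= ub;
            memcpy(&a, &ua, sizeof ua);
        }
        return a;
    }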
diff --git a/lib/builtins/hexagon/dfmul.S b/lib/builtins/hexagon/dfmul.S
new file mode 100644
index 000000000..fde6d77bd
--- /dev/null
+++ b/lib/builtins/hexagon/dfmul.S
@@ -0,0 +1,418 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision Multiply */
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+
+#define BTMP r5:4
+#define BTMPH r5
+#define BTMPL r4
+
+#define PP_ODD r7:6
+#define PP_ODD_H r7
+#define PP_ODD_L r6
+
+#define ONE r9:8
+#define S_ONE r8
+#define S_ZERO r9
+
+#define PP_HH r11:10
+#define PP_HH_H r11
+#define PP_HH_L r10
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define PP_LL r15:14
+#define PP_LL_H r15
+#define PP_LL_L r14
+
+#define TMP r28
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1024
+#define MANTISSA_TO_INT_BIAS 52
+
+/* Some constant to adjust normalization amount in error code */
+/* Amount to right shift the partial product to get to a denorm */
+#define FUDGE 5
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+#define SR_ROUND_OFF 22
+ .text
+ .global __hexagon_muldf3
+ .type __hexagon_muldf3,@function
+ Q6_ALIAS(muldf3)
+ FAST_ALIAS(muldf3)
+ FAST2_ALIAS(muldf3)
+ .p2align 5
+__hexagon_muldf3:
+ {
+ p0 = dfclass(A,#2)
+ p0 = dfclass(B,#2)
+ ATMP = combine(##0x40000000,#0)
+ }
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-1)
+ BTMP = asl(B,#EXPBITS-1)
+ TMP = #-BIAS
+ ONE = #1
+ }
+ {
+ PP_ODD = mpyu(BTMPL,ATMPH)
+ BTMP = insert(ONE,#2,#62)
+ }
+ /* Since we know that the MSB of each H register is zero, we should never carry. */
+ /* H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1. */
+ /* Adding two HLs, we get 2^64-3*2^32+2 maximum. */
+ /* Therefore, we can add three 2^32-1 values safely without carry; we only need one. */
+ {
+ PP_LL = mpyu(ATMPL,BTMPL)
+ PP_ODD += mpyu(ATMPL,BTMPH)
+ }
+ {
+ PP_ODD += lsr(PP_LL,#32)
+ PP_HH = mpyu(ATMPH,BTMPH)
+ BTMP = combine(##BIAS+BIAS-4,#0)
+ }
+ {
+ PP_HH += lsr(PP_ODD,#32)
+ if (!p0) jump .Lmul_abnormal
+ p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
+ p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
+ }
+ /*
+ * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
+ * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
+ */
+#undef PP_ODD
+#undef PP_ODD_H
+#undef PP_ODD_L
+#define EXP10 r7:6
+#define EXP1 r7
+#define EXP0 r6
+ {
+ if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
+ EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
+ }
+ {
+ PP_LL = neg(PP_HH)
+ EXP0 += add(TMP,EXP1)
+ TMP = xor(AH,BH)
+ }
+ {
+ if (!p2.new) PP_HH = PP_LL
+ p2 = cmp.gt(TMP,#-1)
+ p0 = !cmp.gt(EXP0,BTMPH)
+ p0 = cmp.gt(EXP0,BTMPL)
+ if (!p0.new) jump:nt .Lmul_ovf_unf
+ }
+ {
+ A = convert_d2df(PP_HH)
+ EXP0 = add(EXP0,#-BIAS-58)
+ }
+ {
+ AH += asl(EXP0,#HI_MANTBITS)
+ jumpr r31
+ }
+
+ .falign
+.Lpossible_unf:
+ /* We end up with a positive exponent */
+ /* But we may have rounded up to an exponent of 1. */
+ /* If the exponent is 1 and we rounded up to it,
+ * we also need to raise underflow.
+ * Fortunately, this is pretty easy to detect: we must have +/- 0x0010_0000_0000_0000,
+ * and the PP should also have more than one bit set.
+ */
+ /* Note: ATMP should have abs(PP_HH) */
+ /* Note: BTMPL should have 0x7FEFFFFF */
+ {
+ p0 = cmp.eq(AL,#0)
+ p0 = bitsclr(AH,BTMPL)
+ if (!p0.new) jumpr:t r31
+ BTMPH = #0x7fff
+ }
+ {
+ p0 = bitsset(ATMPH,BTMPH)
+ BTMPL = USR
+ BTMPH = #0x030
+ }
+ {
+ if (p0) BTMPL = or(BTMPL,BTMPH)
+ }
+ {
+ USR = BTMPL
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ jumpr r31
+ }
+ .falign
+.Lmul_ovf_unf:
+ {
+ A = convert_d2df(PP_HH)
+ ATMP = abs(PP_HH) // take absolute value
+ EXP1 = add(EXP0,#-BIAS-58)
+ }
+ {
+ AH += asl(EXP1,#HI_MANTBITS)
+ EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ BTMPL = ##0x7FEFFFFF
+ }
+ {
+ EXP1 += add(EXP0,##-BIAS-58)
+ //BTMPH = add(clb(ATMP),#-2)
+ BTMPH = #0
+ }
+ {
+ p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
+ if (p0.new) jump:nt .Lmul_ovf
+ }
+ {
+ p0 = cmp.gt(EXP1,#0)
+ if (p0.new) jump:nt .Lpossible_unf
+ BTMPH = sub(EXP0,BTMPH)
+ TMP = #63 // max amount to shift
+ }
+ /* Underflow */
+	/*
+	 * PP_HH has the partial product with sticky LSB.
+	 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts,
+	 * and a minimum of 0x1000_0000_0000_0000 or so.
+	 * The exponent of PP_HH is in EXP1, which is non-positive (0 or negative).
+	 * That is the exponent that results after normalization.
+	 *
+	 * EXP0 has the exponent that, when added to the normalized value, is out of range.
+	 *
+	 * Strategy:
+	 *
+	 * * Shift the bits down, with sticky bit, so that they are aligned according
+	 *   to the LZ count and the appropriate exponent, but not all the way into
+	 *   the mantissa field; keep the last few bits around.
+	 * * Put a 1 near the MSB.
+	 * * Check the LSBs for inexact; if inexact, also set underflow.
+	 * * Convert with [u]d2df -- this rounds correctly according to the rounding mode.
+	 * * Replace the exponent field with zero.
+	 */
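+	/* A minimal C sketch of the shift-with-sticky step on the positive
+	 * partial product pp (illustrative names; sh is the right-shift
+	 * amount, clamped to 63):
+	 *
+	 *   uint64_t lost = pp & ((1ULL << sh) - 1);  // bits that fall off
+	 *   uint64_t kept = pp >> sh;
+	 *   if (lost) kept |= 1;                      // sticky bit => inexact
+	 *   // then set a guard bit near the MSB, convert with [u]d2df
+	 *   // (rounds per the mode), and overwrite the exponent with zero
+	 */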
+
+
+ {
+ BTMPL = #0 // offset for extract
+ BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
+ }
+ {
+ p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
+ BTMPH = min(BTMPH,TMP) // Don't shift more than 63
+ PP_HH = ATMP
+ }
+ {
+ TMP = USR
+ PP_LL = extractu(PP_HH,BTMP)
+ }
+ {
+ PP_HH = asr(PP_HH,BTMPH)
+ BTMPL = #0x0030 // underflow flag
+ AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
+ }
+ {
+ p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
+ if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
+ PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
+ }
+ {
+ PP_LL = neg(PP_HH)
+ p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
+ if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
+ }
+ {
+ if (!p3) PP_HH = PP_LL
+ USR = TMP
+ }
+ {
+ A = convert_d2df(PP_HH) // Do rounding
+ p0 = dfcmp.eq(A,A) // realize exception
+ }
+ {
+ AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
+ jumpr r31
+ }
+ .falign
+.Lmul_ovf:
+ // We get either max finite value or infinity. Either way, overflow+inexact
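+	// An overflow rounds to infinity except when the mode rounds toward
+	// zero for this sign (round-toward-zero always; round-down for a
+	// positive result; round-up for a negative result), in which case the
+	// result is the max finite value. The XOR with the sign bit below
+	// folds the two directed modes into one comparison.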
+ {
+ TMP = USR
+ ATMP = combine(##0x7fefffff,#-1) // positive max finite
+ A = PP_HH
+ }
+ {
+ PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
+ TMP = or(TMP,#0x28) // inexact + overflow
+ BTMP = combine(##0x7ff00000,#0) // positive infinity
+ }
+ {
+ USR = TMP
+ PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
+ TMP = PP_LL_L // unmodified rounding mode
+ }
+ {
+ p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
+ p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
+ if (p0.new) ATMP = BTMP // we should get infinity
+ p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
+ }
+ {
+ A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
+ jumpr r31
+ }
+
+.Lmul_abnormal:
+ {
+ ATMP = extractu(A,#63,#0) // strip off sign
+ BTMP = extractu(B,#63,#0) // strip off sign
+ }
+ {
+ p3 = cmp.gtu(ATMP,BTMP)
+ if (!p3.new) A = B // sort values
+ if (!p3.new) B = A // sort values
+ }
+ {
+ // Any NaN --> NaN, possibly raise invalid if sNaN
+ p0 = dfclass(A,#0x0f) // A not NaN?
+ if (!p0.new) jump:nt .Linvalid_nan
+ if (!p3) ATMP = BTMP
+ if (!p3) BTMP = ATMP
+ }
+ {
+ // Infinity * nonzero number is infinity
+ p1 = dfclass(A,#0x08) // A is infinity
+ p1 = dfclass(B,#0x0e) // B is nonzero
+ }
+ {
+ // Infinity * zero --> NaN, raise invalid
+ // Other zeros return zero
+ p0 = dfclass(A,#0x08) // A is infinity
+ p0 = dfclass(B,#0x01) // B is zero
+ }
+ {
+ if (p1) jump .Ltrue_inf
+ p2 = dfclass(B,#0x01)
+ }
+ {
+ if (p0) jump .Linvalid_zeroinf
+ if (p2) jump .Ltrue_zero // so return zero
+ TMP = ##0x7c000000
+ }
+	// We are left with a normal or subnormal times a subnormal, with A > B.
+	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
+	// the product collapses to a single sticky bit, which we can round easily.
+	// If A and B might multiply to something bigger, decrease A's exponent,
+	// increase B's exponent by the same amount, and try again.
+ {
+ p0 = bitsclr(AH,TMP)
+ if (p0.new) jump:nt .Lmul_tiny
+ }
+ {
+ TMP = cl0(BTMP)
+ }
+ {
+ TMP = add(TMP,#-EXPBITS)
+ }
+ {
+ BTMP = asl(BTMP,TMP)
+ }
+ {
+ B = insert(BTMP,#63,#0)
+ AH -= asl(TMP,#HI_MANTBITS)
+ }
+ jump __hexagon_muldf3
+.Lmul_tiny:
+ {
+ TMP = USR
+ A = xor(A,B) // get sign bit
+ }
+ {
+ TMP = or(TMP,#0x30) // Inexact + Underflow
+ A = insert(ONE,#63,#0) // put in rounded up value
+ BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
+ }
+ {
+ USR = TMP
+ p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
+ if (!p0.new) AL = #0 // If not, zero
+ BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
+ }
+ {
+ p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf
+ if (!p0.new) AL = #0 // don't go to zero
+ jumpr r31
+ }
+.Linvalid_zeroinf:
+ {
+ TMP = USR
+ }
+ {
+ A = #-1
+ TMP = or(TMP,#2)
+ }
+ {
+ USR = TMP
+ }
+ {
+ p0 = dfcmp.uo(A,A) // force exception if enabled
+ jumpr r31
+ }
+.Linvalid_nan:
+ {
+ p0 = dfclass(B,#0x0f) // if B is not NaN
+ TMP = convert_df2sf(A) // will generate invalid if sNaN
+ if (p0.new) B = A // make it whatever A is
+ }
+ {
+ BL = convert_df2sf(B) // will generate invalid if sNaN
+ A = #-1
+ jumpr r31
+ }
+ .falign
+.Ltrue_zero:
+ {
+ A = B
+ B = A
+ }
+.Ltrue_inf:
+ {
+ BH = extract(BH,#1,#31)
+ }
+ {
+ AH ^= asl(BH,#31)
+ jumpr r31
+ }
+END(__hexagon_muldf3)
+
+#undef ATMP
+#undef ATMPL
+#undef ATMPH
+#undef BTMP
+#undef BTMPL
+#undef BTMPH
diff --git a/lib/builtins/hexagon/dfsqrt.S b/lib/builtins/hexagon/dfsqrt.S
new file mode 100644
index 000000000..027d9e1fd
--- /dev/null
+++ b/lib/builtins/hexagon/dfsqrt.S
@@ -0,0 +1,406 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision square root */
+
+#define EXP r28
+
+#define A r1:0
+#define AH r1
+#define AL r0
+
+#define SFSH r3:2
+#define SF_S r3
+#define SF_H r2
+
+#define SFHALF_SONE r5:4
+#define S_ONE r4
+#define SFHALF r5
+#define SF_D r6
+#define SF_E r7
+#define RECIPEST r8
+#define SFRAD r9
+
+#define FRACRAD r11:10
+#define FRACRADH r11
+#define FRACRADL r10
+
+#define ROOT r13:12
+#define ROOTHI r13
+#define ROOTLO r12
+
+#define PROD r15:14
+#define PRODHI r15
+#define PRODLO r14
+
+#define P_TMP p0
+#define P_EXP1 p1
+#define NORMAL p2
+
+#define SF_EXPBITS 8
+#define SF_MANTBITS 23
+
+#define DF_EXPBITS 11
+#define DF_MANTBITS 52
+
+#define DF_BIAS 0x3ff
+
+#define DFCLASS_ZERO 0x01
+#define DFCLASS_NORMAL 0x02
+#define DFCLASS_DENORMAL 0x02
+#define DFCLASS_INFINITE 0x08
+#define DFCLASS_NAN 0x10
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
+#define END(TAG) .size TAG,.-TAG
+
+ .text
+ .global __hexagon_sqrtdf2
+ .type __hexagon_sqrtdf2,@function
+ .global __hexagon_sqrt
+ .type __hexagon_sqrt,@function
+ Q6_ALIAS(sqrtdf2)
+ Q6_ALIAS(sqrt)
+ FAST_ALIAS(sqrtdf2)
+ FAST_ALIAS(sqrt)
+ FAST2_ALIAS(sqrtdf2)
+ FAST2_ALIAS(sqrt)
+ .type sqrt,@function
+ .p2align 5
+__hexagon_sqrtdf2:
+__hexagon_sqrt:
+ {
+ PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
+ EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
+ SFHALF_SONE = combine(##0x3f000004,#1)
+ }
+ {
+ NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
+ NORMAL = cmp.gt(AH,#-1) // and positive?
+ if (!NORMAL.new) jump:nt .Lsqrt_abnormal
+ SFRAD = or(SFHALF,PRODLO)
+ }
+#undef NORMAL
+.Ldenormal_restart:
+ {
+ FRACRAD = A
+ SF_E,P_TMP = sfinvsqrta(SFRAD)
+ SFHALF = and(SFHALF,#-16)
+ SFSH = #0
+ }
+#undef A
+#undef AH
+#undef AL
+#define ERROR r1:0
+#define ERRORHI r1
+#define ERRORLO r0
+ // SF_E : reciprocal square root
+ // SF_H : half rsqrt
+	// SF_S : square root
+ // SF_D : error term
+ // SFHALF: 0.5
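+	// A rough C model of the refinement below (illustrative names; the
+	// :lib multiplies are library variants of sfmpy):
+	//   S = x * y0;                                 // initial sqrt estimate
+	//   H = 0.5f * y0;                              // half reciprocal root
+	//   D = 0.5f - S * H;  S += S * D;  H += H * D; // first Newton step
+	//   D = 0.5f - S * H;  H += H * D;              // second step (H only)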
+ {
+ SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
+ SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent...
+ SF_D = SFHALF
+#undef SFRAD
+#define SHIFTAMT r9
+ SHIFTAMT = and(EXP,#1)
+ }
+ {
+ SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1
+ FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden
+ P_EXP1 = cmp.gtu(SHIFTAMT,#0)
+ }
+ {
+ SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
+ SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
+ SF_D = SFHALF
+ SHIFTAMT = mux(P_EXP1,#8,#9)
+ }
+ {
+ SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
+ FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
+ SHIFTAMT = mux(P_EXP1,#3,#2)
+ }
+ {
+ SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
+		// cool trick: half of 1/sqrt(x) has the same mantissa as 1/sqrt(x).
+ PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
+ }
+ {
+ SF_H = and(SF_H,##0x007fffff)
+ }
+ {
+ SF_H = add(SF_H,##0x00800000 - 3)
+ SHIFTAMT = mux(P_EXP1,#7,#8)
+ }
+ {
+ RECIPEST = asl(SF_H,SHIFTAMT)
+ SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
+ }
+ {
+ ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
+ }
+
+#undef SFSH // r3:2
+#undef SF_H // r2
+#undef SF_S // r3
+#undef S_ONE // r4
+#undef SFHALF // r5
+#undef SFHALF_SONE // r5:4
+#undef SF_D // r6
+#undef SF_E // r7
+
+#define HL r3:2
+#define LL r5:4
+#define HH r7:6
+
+#undef P_EXP1
+#define P_CARRY0 p1
+#define P_CARRY1 p2
+#define P_CARRY2 p3
+
+ /* Iteration 0 */
+	/* Maybe we can save a cycle by starting with ERROR=asl(fracrad); then, */
+	/* as we multiply, we can shift and subtract instead of shift and add? */
+ {
+ ERROR = asl(FRACRAD,#15)
+ PROD = mpyu(ROOTHI,ROOTHI)
+ P_CARRY0 = cmp.eq(r0,r0)
+ }
+ {
+ ERROR -= asl(PROD,#15)
+ PROD = mpyu(ROOTHI,ROOTLO)
+ P_CARRY1 = cmp.eq(r0,r0)
+ }
+ {
+ ERROR -= lsr(PROD,#16)
+ P_CARRY2 = cmp.eq(r0,r0)
+ }
+ {
+ ERROR = mpyu(ERRORHI,RECIPEST)
+ }
+ {
+ ROOT += lsr(ERROR,SHIFTAMT)
+ SHIFTAMT = add(SHIFTAMT,#16)
+ ERROR = asl(FRACRAD,#31) // for next iter
+ }
+ /* Iteration 1 */
+ {
+ PROD = mpyu(ROOTHI,ROOTHI)
+ ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
+ }
+ {
+ ERROR -= asl(PROD,#31)
+ PROD = mpyu(ROOTLO,ROOTLO)
+ }
+ {
+ ERROR -= lsr(PROD,#33)
+ }
+ {
+ ERROR = mpyu(ERRORHI,RECIPEST)
+ }
+ {
+ ROOT += lsr(ERROR,SHIFTAMT)
+ SHIFTAMT = add(SHIFTAMT,#16)
+ ERROR = asl(FRACRAD,#47) // for next iter
+ }
+ /* Iteration 2 */
+ {
+ PROD = mpyu(ROOTHI,ROOTHI)
+ }
+ {
+ ERROR -= asl(PROD,#47)
+ PROD = mpyu(ROOTHI,ROOTLO)
+ }
+ {
+ ERROR -= asl(PROD,#16) // bidir shr 31-47
+ PROD = mpyu(ROOTLO,ROOTLO)
+ }
+ {
+ ERROR -= lsr(PROD,#17) // 64-47
+ }
+ {
+ ERROR = mpyu(ERRORHI,RECIPEST)
+ }
+ {
+ ROOT += lsr(ERROR,SHIFTAMT)
+ }
+#undef ERROR
+#undef PROD
+#undef PRODHI
+#undef PRODLO
+#define REM_HI r15:14
+#define REM_HI_HI r15
+#define REM_LO r1:0
+#undef RECIPEST
+#undef SHIFTAMT
+#define TWOROOT_LO r9:8
+ /* Adjust Root */
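+	/* A loose C model of this correction (illustrative; the asm keeps rem
+	 * in two 64-bit halves with explicit carry chains):
+	 *
+	 *   unsigned __int128 rem = frac - (unsigned __int128)root * root;
+	 *   unsigned __int128 inc = 2 * (unsigned __int128)root + 1;
+	 *   for (int i = 0; i < 2; ++i)            // two trial subtractions
+	 *     if (rem >= inc) { rem -= inc; root += 1; }
+	 *   if (rem != 0) root |= 1;               // sticky LSB for rounding
+	 */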
+ {
+ HL = mpyu(ROOTHI,ROOTLO)
+ LL = mpyu(ROOTLO,ROOTLO)
+ REM_HI = #0
+ REM_LO = #0
+ }
+ {
+ HL += lsr(LL,#33)
+ LL += asl(HL,#33)
+ P_CARRY0 = cmp.eq(r0,r0)
+ }
+ {
+ HH = mpyu(ROOTHI,ROOTHI)
+ REM_LO = sub(REM_LO,LL,P_CARRY0):carry
+ TWOROOT_LO = #1
+ }
+ {
+ HH += lsr(HL,#31)
+ TWOROOT_LO += asl(ROOT,#1)
+ }
+#undef HL
+#undef LL
+#define REM_HI_TMP r3:2
+#define REM_HI_TMP_HI r3
+#define REM_LO_TMP r5:4
+ {
+ REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
+ REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
+#undef FRACRAD
+#undef HH
+#define ZERO r11:10
+#define ONE r7:6
+ ONE = #1
+ ZERO = #0
+ }
+ {
+ REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
+ ONE = add(ROOT,ONE)
+ EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
+ }
+ {
+ // If carry set, no borrow: result was still positive
+ if (P_CARRY1) ROOT = ONE
+ if (P_CARRY1) REM_LO = REM_LO_TMP
+ if (P_CARRY1) REM_HI = REM_HI_TMP
+ }
+ {
+ REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
+ ONE = #1
+ EXP = asr(EXP,#1) // divide signed exp by 2
+ }
+ {
+ REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
+ ONE = add(ROOT,ONE)
+ }
+ {
+ if (P_CARRY2) ROOT = ONE
+ if (P_CARRY2) REM_LO = REM_LO_TMP
+ // since tworoot <= 2^32, remhi must be zero
+#undef REM_HI_TMP
+#undef REM_HI_TMP_HI
+#define S_ONE r2
+#define ADJ r3
+ S_ONE = #1
+ }
+ {
+ P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
+		if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if not, set the sticky bit (inexact)
+ ADJ = cl0(ROOT)
+ EXP = add(EXP,#-63)
+ }
+#undef REM_LO
+#define RET r1:0
+#define RETHI r1
+ {
+ RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
+ EXP = add(EXP,ADJ) // add back bias
+ }
+ {
+ RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust
+ jumpr r31
+ }
+#undef REM_LO_TMP
+#undef REM_HI_TMP
+#undef REM_HI_TMP_HI
+#undef REM_LO
+#undef REM_HI
+#undef TWOROOT_LO
+
+#undef RET
+#define A r1:0
+#define AH r1
+#define AL r0
+#undef S_ONE
+#define TMP r3:2
+#define TMPHI r3
+#define TMPLO r2
+#undef P_CARRY0
+#define P_NEG p1
+
+
+#define SFHALF r5
+#define SFRAD r9
+.Lsqrt_abnormal:
+ {
+ P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
+ if (P_TMP.new) jumpr:t r31
+ }
+ {
+ P_TMP = dfclass(A,#DFCLASS_NAN)
+ if (P_TMP.new) jump:nt .Lsqrt_nan
+ }
+ {
+ P_TMP = cmp.gt(AH,#-1)
+ if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
+ if (!P_TMP.new) EXP = ##0x7F800001 // sNaN
+ }
+ {
+ P_TMP = dfclass(A,#DFCLASS_INFINITE)
+ if (P_TMP.new) jumpr:nt r31
+ }
+ // If we got here, we're denormal
+ // prepare to restart
+ {
+ A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa
+ }
+ {
+ EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
+ }
+ {
+ A = asl(A,EXP) // Shift mantissa
+ EXP = sub(#1,EXP) // Form exponent
+ }
+ {
+ AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent
+ }
+ {
+ TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
+ SFHALF = ##0x3f000004 // form half constant
+ }
+ {
+ SFRAD = or(SFHALF,TMPLO) // form sf value
+ SFHALF = and(SFHALF,#-16)
+ jump .Ldenormal_restart // restart
+ }
+.Lsqrt_nan:
+ {
+ EXP = convert_df2sf(A) // if sNaN, get invalid
+ A = #-1 // qNaN
+ jumpr r31
+ }
+.Lsqrt_invalid_neg:
+ {
+		A = convert_sf2df(EXP) // converting the sNaN raises invalid, yields NaN
+ jumpr r31
+ }
+END(__hexagon_sqrt)
+END(__hexagon_sqrtdf2)
diff --git a/lib/builtins/hexagon/divdi3.S b/lib/builtins/hexagon/divdi3.S
new file mode 100644
index 000000000..49ee8104f
--- /dev/null
+++ b/lib/builtins/hexagon/divdi3.S
@@ -0,0 +1,85 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
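+/* A rough C model of the shift-and-subtract core below (illustrative;
+ * C99 with <stdint.h>; assumes n and d are nonzero -- the asm wraps this
+ * unsigned core with abs() on entry and a sign fix-up via vmux on exit):
+ *
+ *   static uint64_t udiv64(uint64_t n, uint64_t d, uint64_t *rem) {
+ *     int shift = __builtin_clzll(d) - __builtin_clzll(n);
+ *     uint64_t q = 0, bit = 1;
+ *     if (shift >= 0) { d <<= shift; bit <<= shift; }
+ *     for (int i = 0; i <= shift; ++i) {       // shift+1 iterations
+ *       if (n >= d) { n -= d; q += bit; }
+ *       d >>= 1; bit >>= 1;
+ *     }
+ *     *rem = n;
+ *     return q;
+ *   }
+ */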
+FUNCTION_BEGIN __hexagon_divdi3
+ {
+ p2 = tstbit(r1,#31)
+ p3 = tstbit(r3,#31)
+ }
+ {
+ r1:0 = abs(r1:0)
+ r3:2 = abs(r3:2)
+ }
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ p3 = xor(p2,p3)
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jump .hexagon_divdi3_return // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+
+.hexagon_divdi3_return:
+ {
+ r3:2 = neg(r1:0)
+ }
+ {
+ r1:0 = vmux(p3,r3:2,r1:0)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_divdi3
+
+ .globl __qdsp_divdi3
+ .set __qdsp_divdi3, __hexagon_divdi3
diff --git a/lib/builtins/hexagon/divsi3.S b/lib/builtins/hexagon/divsi3.S
new file mode 100644
index 000000000..8e159baa1
--- /dev/null
+++ b/lib/builtins/hexagon/divsi3.S
@@ -0,0 +1,84 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_divsi3
+ {
+ p0 = cmp.ge(r0,#0)
+ p1 = cmp.ge(r1,#0)
+ r1 = abs(r0)
+ r2 = abs(r1)
+ }
+ {
+ r3 = cl0(r1)
+ r4 = cl0(r2)
+ r5 = sub(r1,r2)
+ p2 = cmp.gtu(r2,r1)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ r0 = #0
+ p1 = xor(p0,p1)
+ p0 = cmp.gtu(r2,r5)
+ }
+ if (p2) jumpr r31
+#else
+ {
+ r0 = #0
+ p1 = xor(p0,p1)
+ p0 = cmp.gtu(r2,r5)
+ if (p2) jumpr r31
+ }
+#endif
+ {
+ r0 = mux(p1,#-1,#1)
+ if (p0) jumpr r31
+ r4 = sub(r4,r3)
+ r3 = #1
+ }
+ {
+ r0 = #0
+ r3:2 = vlslw(r3:2,r4)
+ loop0(1f,r4)
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r2,r1)
+ if (!p0.new) r1 = sub(r1,r2)
+ if (!p0.new) r0 = add(r0,r3)
+ r3:2 = vlsrw(r3:2,#1)
+ }:endloop0
+ {
+ p0 = cmp.gtu(r2,r1)
+ if (!p0.new) r0 = add(r0,r3)
+ if (!p1) jumpr r31
+ }
+ {
+ r0 = neg(r0)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_divsi3
+
+ .globl __qdsp_divsi3
+ .set __qdsp_divsi3, __hexagon_divsi3
diff --git a/lib/builtins/hexagon/fabs_opt.S b/lib/builtins/hexagon/fabs_opt.S
new file mode 100644
index 000000000..b09b00734
--- /dev/null
+++ b/lib/builtins/hexagon/fabs_opt.S
@@ -0,0 +1,37 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fabs
+ {
+ r1 = clrbit(r1, #31)
+ jumpr r31
+ }
+FUNCTION_END fabs
+
+FUNCTION_BEGIN fabsf
+ {
+ r0 = clrbit(r0, #31)
+ jumpr r31
+ }
+FUNCTION_END fabsf
+
+ .globl fabsl
+ .set fabsl, fabs
diff --git a/lib/builtins/hexagon/fastmath2_dlib_asm.S b/lib/builtins/hexagon/fastmath2_dlib_asm.S
new file mode 100644
index 000000000..9286df06c
--- /dev/null
+++ b/lib/builtins/hexagon/fastmath2_dlib_asm.S
@@ -0,0 +1,491 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/* ==================================================================== */
+/* FUNCTIONS Optimized double floating point operators */
+/* ==================================================================== */
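+/* Format note (inferred from the reference code and asm below; not an
+   authoritative spec): a fast2_QDOUBLE is a 64-bit word whose low halfword
+   holds a signed exponent and whose upper 48 bits hold a signed fixed-point
+   mantissa, i.e. roughly value = mant * 2^exp; exp = 0x8001 marks the
+   zero/denormal result. */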
+/* c = fast2_dadd_asm(a, b) */
+/* ==================================================================== *
+fast2_QDOUBLE fast2_dadd(fast2_QDOUBLE a,fast2_QDOUBLE b) {
+ fast2_QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) + (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = Q6_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_dadd_asm
+ .type fast2_dadd_asm, @function
+fast2_dadd_asm:
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define minmin R11:10 // exactly 0x0000000000008001LL
+#define minminl R10
+#define k R4
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ minmin = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = add(lmanta, lmantb)
+ minminl.L = #0x8001
+ } {
+ k = clb(lmant)
+ c63 = #58
+ } {
+ k = add(k, #-1)
+ p0 = cmp.gt(k, c63)
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ if(p0) jump .Ldenorma
+ } {
+ manta = insert(exp, #16, #0)
+ jumpr r31
+ }
+.Ldenorma:
+ {
+ mantexpa = minmin
+ jumpr r31
+ }
+/* =================================================================== *
+ fast2_QDOUBLE fast2_dsub(fast2_QDOUBLE a,fast2_QDOUBLE b) {
+ fast2_QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) - (mantb>>expb);
+ k = Q6_R_clb_P(mant)-1;
+ mant = (mant << k);
+ exp = exp - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_dsub_asm
+ .type fast2_dsub_asm, @function
+fast2_dsub_asm:
+
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define minmin R11:10 // exactly 0x0000000000008001LL
+#define minminl R10
+#define k R4
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ minmin = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = sub(lmanta, lmantb)
+ minminl.L = #0x8001
+ } {
+ k = clb(lmant)
+ c63 = #58
+ } {
+ k = add(k, #-1)
+ p0 = cmp.gt(k, c63)
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ if(p0) jump .Ldenorm
+ } {
+ manta = insert(exp, #16, #0)
+ jumpr r31
+ }
+.Ldenorm:
+ {
+ mantexpa = minmin
+ jumpr r31
+ }
+/* ==================================================================== *
+ fast2_QDOUBLE fast2_dmpy(fast2_QDOUBLE a,fast2_QDOUBLE b) {
+ fast2_QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, k;
+ lint mant;
+ int hia, hib, hi, lo;
+ unsigned int loa, lob;
+
+ hia = (int)(a >> 32);
+ loa = Q6_R_extractu_RII((int)manta, 31, 1);
+ hib = (int)(b >> 32);
+ lob = Q6_R_extractu_RII((int)mantb, 31, 1);
+
+ mant = Q6_P_mpy_RR(hia, lob);
+ mant = Q6_P_mpyacc_RR(mant,hib, loa);
+ mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1);
+
+ hi = (int) (mant>>32);
+
+ k = Q6_R_normamt_R(hi);
+ mant = mant << k;
+ exp = expa + expb - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_dmpy_asm
+ .type fast2_dmpy_asm, @function
+fast2_dmpy_asm:
+
+#define mantal R0
+#define mantah R1
+#define mantexpa R1:0
+#define mantbl R2
+#define mantbh R3
+#define mantexpb R3:2
+#define expa R4
+#define expb R5
+#define c8001 R12
+#define mantexpd R7:6
+#define mantdh R7
+#define exp R8
+#define lmantc R11:10
+#define kb R9
+#define guard R11
+#define mantal_ R12
+#define mantbl_ R13
+#define min R15:14
+#define minh R15
+
+ .falign
+ {
+ mantbl_= lsr(mantbl, #16)
+ expb = sxth(mantbl)
+ expa = sxth(mantal)
+ mantal_= lsr(mantal, #16)
+ }
+ {
+ lmantc = mpy(mantah, mantbh)
+ mantexpd = mpy(mantah, mantbl_)
+ mantal.L = #0x0
+ min = #0
+ }
+ {
+ lmantc = add(lmantc, lmantc)
+ mantexpd+= mpy(mantbh, mantal_)
+ mantbl.L = #0x0
+ minh.H = #0x8000
+ }
+ {
+ mantexpd = asr(mantexpd, #15)
+ c8001.L = #0x8001
+ p1 = cmp.eq(mantexpa, mantexpb)
+ }
+ {
+ mantexpd = add(mantexpd, lmantc)
+ exp = add(expa, expb)
+ p2 = cmp.eq(mantexpa, min)
+ }
+ {
+ kb = clb(mantexpd)
+ mantexpb = abs(mantexpd)
+ guard = #58
+ }
+ {
+ p1 = and(p1, p2)
+ exp = sub(exp, kb)
+ kb = add(kb, #-1)
+ p0 = cmp.gt(kb, guard)
+ }
+ {
+ exp = add(exp, #1)
+ mantexpa = asl(mantexpd, kb)
+ if(p1) jump .Lsat //rarely happens
+ }
+ {
+ mantal = insert(exp,#16, #0)
+ if(!p0) jumpr r31
+ }
+ {
+ mantal = insert(c8001,#16, #0)
+ jumpr r31
+ }
+.Lsat:
+ {
+ mantexpa = #-1
+ }
+ {
+ mantexpa = lsr(mantexpa, #1)
+ }
+ {
+ mantal = insert(exp,#16, #0)
+ jumpr r31
+ }
+
+/* ==================================================================== *
+ int fast2_qd2f(fast2_QDOUBLE a) {
+ int exp;
+ long long int manta;
+ int ic, rnd, mantb;
+
+ manta = a>>32;
+ exp = Q6_R_sxth_R(a) ;
+ ic = 0x80000000 & manta;
+ manta = Q6_R_abs_R_sat(manta);
+   rnd = 0x40;
+   if((manta & 0xff) == rnd) rnd = 0x00;
+   mantb = (manta + rnd)>>7;
+   exp = (exp + 126);
+ if((manta & 0x7fffffc0) == 0x7fffffc0) {
+ manta = 0x0; exp++;
+ } else {
+ manta= mantb & 0x007fffff;
+ }
+ exp = (exp << 23) & 0x7fffffc0;
+ ic = Q6_R_addacc_RR(ic, exp, manta);
+ return (ic);
+ }
+ * ==================================================================== */
+
+ .text
+ .global fast2_qd2f_asm
+ .type fast2_qd2f_asm, @function
+fast2_qd2f_asm:
+#define mantah R1
+#define mantal R0
+#define cff R0
+#define mant R3
+#define expo R4
+#define rnd R5
+#define mask R6
+#define c07f R7
+#define c80 R0
+#define mantb R2
+#define ic R0
+
+ .falign
+ {
+ mant = abs(mantah):sat
+ expo = sxth(mantal)
+ rnd = #0x40
+ mask.L = #0xffc0
+ }
+ {
+ cff = extractu(mant, #8, #0)
+ p2 = cmp.gt(expo, #126)
+ p3 = cmp.ge(expo, #-126)
+ mask.H = #0x7fff
+ }
+ {
+ p1 = cmp.eq(cff,#0x40)
+ if(p1.new) rnd = #0
+ expo = add(expo, #126)
+ if(!p3) jump .Lmin
+ }
+ {
+ p0 = bitsset(mant, mask)
+ c80.L = #0x0000
+ mantb = add(mant, rnd)
+ c07f = lsr(mask, #8)
+ }
+ {
+ if(p0) expo = add(expo, #1)
+ if(p0) mant = #0
+ mantb = lsr(mantb, #7)
+ c80.H = #0x8000
+ }
+ {
+ ic = and(c80, mantah)
+ mask &= asl(expo, #23)
+ if(!p0) mant = and(mantb, c07f)
+ if(p2) jump .Lmax
+ }
+ {
+ ic += add(mask, mant)
+ jumpr r31
+ }
+.Lmax:
+ {
+ ic.L = #0xffff;
+ }
+ {
+ ic.H = #0x7f7f;
+ jumpr r31
+ }
+.Lmin:
+ {
+ ic = #0x0
+ jumpr r31
+ }
+
+/* ==================================================================== *
+fast2_QDOUBLE fast2_f2qd(int ia) {
+ lint exp;
+ lint mant;
+ fast2_QDOUBLE c;
+
+ mant = ((ia << 7) | 0x40000000)&0x7fffff80 ;
+ if (ia & 0x80000000) mant = -mant;
+ exp = ((ia >> 23) & 0xFFLL) - 126;
+   c = (mant<<32) | Q6_R_zxth_R(exp);
+ return(c);
+}
+ * ==================================================================== */
+ .text
+ .global fast2_f2qd_asm
+ .type fast2_f2qd_asm, @function
+fast2_f2qd_asm:
+#define ia R0
+#define mag R3
+#define mantr R1
+#define expr R0
+#define zero R2
+#define maxneg R5:4
+#define maxnegl R4
+ .falign
+ {
+ mantr = asl(ia, #7)
+ p0 = tstbit(ia, #31)
+ maxneg = #0
+ mag = add(ia,ia)
+ }
+ {
+ mantr = setbit(mantr, #30)
+ expr= extractu(ia,#8,#23)
+ maxnegl.L = #0x8001
+ p1 = cmp.eq(mag, #0)
+ }
+ {
+ mantr= extractu(mantr, #31, #0)
+ expr= add(expr, #-126)
+ zero = #0
+ if(p1) jump .Lminqd
+ }
+ {
+ expr = zxth(expr)
+ if(p0) mantr= sub(zero, mantr)
+ jumpr r31
+ }
+.Lminqd:
+ {
+ R1:0 = maxneg
+ jumpr r31
+ }
diff --git a/lib/builtins/hexagon/fastmath2_ldlib_asm.S b/lib/builtins/hexagon/fastmath2_ldlib_asm.S
new file mode 100644
index 000000000..419255535
--- /dev/null
+++ b/lib/builtins/hexagon/fastmath2_ldlib_asm.S
@@ -0,0 +1,345 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/* ==================================================================== *
+
+fast2_QLDOUBLE fast2_ldadd(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
+ fast2_QLDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) + (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = Q6_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
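+/* Calling-convention note (inferred from the loads/stores below): the
+   long-double operands arrive on the stack -- mantissa double-words at
+   r29+#0 and r29+#16, exponent words at r29+#8 and r29+#24 -- and the
+   result is stored through the buffer pointer passed in r0. */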
+ .text
+ .global fast2_ldadd_asm
+ .type fast2_ldadd_asm, @function
+fast2_ldadd_asm:
+#define manta R1:0
+#define lmanta R1:0
+#define mantb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define k R4
+#define ce P0
+#define zero R3:2
+ .falign
+ {
+ expa = memw(r29+#8)
+ expb = memw(r29+#24)
+ r7 = r0
+ }
+ {
+ expd = sub(expa, expb):sat
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ expd = abs(expd):sat
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ c63 = #62
+ } {
+ expd = MIN(expd, c63)
+ manta = memd(r29+#0)
+ mantb = memd(r29+#16)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = add(lmanta, lmantb)
+ zero = #0
+ } {
+ k = clb(lmant)
+ c63.L =#0x0001
+ } {
+ exp -= add(k, #-1) //exp = exp - (k-1)
+ k = add(k, #-1)
+ p0 = cmp.gt(k, #58)
+ c63.H =#0x8000
+ } {
+ if(!p0)memw(r7+#8) = exp
+ lmant = ASL(lmant, k)
+ if(p0) jump .Ldenorma
+ } {
+ memd(r7+#0) = lmant
+ jumpr r31
+ }
+.Ldenorma:
+ memd(r7+#0) = zero
+ {
+ memw(r7+#8) = c63
+ jumpr r31
+ }
+/* =================================================================== *
+ fast2_QLDOUBLE fast2_ldsub(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
+ fast2_QLDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) - (mantb>>expb);
+ k = Q6_R_clb_P(mant)-1;
+ mant = (mant << k);
+ exp = exp - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_ldsub_asm
+ .type fast2_ldsub_asm, @function
+fast2_ldsub_asm:
+#define manta R1:0
+#define lmanta R1:0
+#define mantb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define k R4
+#define ce P0
+#define zero R3:2
+ .falign
+ {
+ expa = memw(r29+#8)
+ expb = memw(r29+#24)
+ r7 = r0
+ }
+ {
+ expd = sub(expa, expb):sat
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ expd = abs(expd):sat
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ c63 = #62
+ } {
+ expd = min(expd, c63)
+ manta = memd(r29+#0)
+ mantb = memd(r29+#16)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = sub(lmanta, lmantb)
+ zero = #0
+ } {
+ k = clb(lmant)
+ c63.L =#0x0001
+ } {
+    exp -= add(k, #-1) //exp = exp - (k-1)
+ k = add(k, #-1)
+ p0 = cmp.gt(k, #58)
+ c63.H =#0x8000
+ } {
+ if(!p0)memw(r7+#8) = exp
+ lmant = asl(lmant, k)
+ if(p0) jump .Ldenorma_s
+ } {
+ memd(r7+#0) = lmant
+ jumpr r31
+ }
+.Ldenorma_s:
+ memd(r7+#0) = zero
+ {
+ memw(r7+#8) = c63
+ jumpr r31
+ }
+
+/* ==================================================================== *
+ fast2_QLDOUBLE fast2_ldmpy(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
+ fast2_QLDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, k;
+ lint mant;
+ int hia, hib, hi, lo;
+ unsigned int loa, lob;
+
+ hia = (int)(a >> 32);
+ loa = Q6_R_extractu_RII((int)manta, 31, 1);
+ hib = (int)(b >> 32);
+ lob = Q6_R_extractu_RII((int)mantb, 31, 1);
+
+ mant = Q6_P_mpy_RR(hia, lob);
+ mant = Q6_P_mpyacc_RR(mant,hib, loa);
+ mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1);
+
+ hi = (int) (mant>>32);
+
+ k = Q6_R_normamt_R(hi);
+ mant = mant << k;
+ exp = expa + expb - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_ldmpy_asm
+ .type fast2_ldmpy_asm, @function
+fast2_ldmpy_asm:
+
+#define mantxl_ R9
+#define mantxl R14
+#define mantxh R15
+#define mantx R15:14
+#define mantbl R2
+#define mantbl_ R8
+#define mantbh R3
+#define mantb R3:2
+#define expa R4
+#define expb R5
+#define c8001 R8
+#define mantd R7:6
+#define lmantc R11:10
+#define kp R9
+#define min R13:12
+#define minh R13
+#define max R13:12
+#define maxh R13
+#define ret R0
+
+ .falign
+ {
+ mantx = memd(r29+#0)
+ mantb = memd(r29+#16)
+ min = #0
+ }
+ {
+ mantbl_= extractu(mantbl, #31, #1)
+ mantxl_= extractu(mantxl, #31, #1)
+ minh.H = #0x8000
+ }
+ {
+ lmantc = mpy(mantxh, mantbh)
+ mantd = mpy(mantxh, mantbl_)
+ expa = memw(r29+#8)
+ expb = memw(r29+#24)
+ }
+ {
+ lmantc = add(lmantc, lmantc)
+ mantd += mpy(mantbh, mantxl_)
+ }
+ {
+ mantd = asr(mantd, #30)
+ c8001.L = #0x0001
+ p1 = cmp.eq(mantx, mantb)
+ }
+ {
+ mantd = add(mantd, lmantc)
+ expa= add(expa, expb)
+ p2 = cmp.eq(mantb, min)
+ }
+ {
+ kp = clb(mantd)
+ c8001.H = #0x8000
+ p1 = and(p1, p2)
+ }
+ {
+ expa-= add(kp, #-1)
+ kp = add(kp, #-1)
+ if(p1) jump .Lsat
+ }
+ {
+ mantd = asl(mantd, kp)
+ memw(ret+#8) = expa
+ p0 = cmp.gt(kp, #58)
+ if(p0.new) jump:NT .Ldenorm //rarely happens
+ }
+ {
+ memd(ret+#0) = mantd
+ jumpr r31
+ }
+.Lsat:
+ {
+ max = #0
+ expa+= add(kp, #1)
+ }
+ {
+ maxh.H = #0x4000
+ memw(ret+#8) = expa
+ }
+ {
+ memd(ret+#0) = max
+ jumpr r31
+ }
+.Ldenorm:
+ {
+ memw(ret+#8) = c8001
+ mantx = #0
+ }
+ {
+ memd(ret+#0) = mantx
+ jumpr r31
+ }
diff --git a/lib/builtins/hexagon/fastmath_dlib_asm.S b/lib/builtins/hexagon/fastmath_dlib_asm.S
new file mode 100644
index 000000000..215936b78
--- /dev/null
+++ b/lib/builtins/hexagon/fastmath_dlib_asm.S
@@ -0,0 +1,400 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/* ==================================================================== */
+/* FUNCTIONS Optimized double floating point operators */
+/* ==================================================================== */
+/* c = dadd_asm(a, b) */
+/* ====================================================================
+
+QDOUBLE dadd(QDOUBLE a,QDOUBLE b) {
+ QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = HEXAGON_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = HEXAGON_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b);
+ expdiff = HEXAGON_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) + (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = HEXAGON_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global dadd_asm
+ .type dadd_asm, @function
+dadd_asm:
+
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define zero R7:6
+#define zerol R6
+#define minus R3:2
+#define minusl R2
+#define maxneg R9
+#define minmin R11:10 // exactly 0x8000000000000000LL
+#define minminh R11
+#define k R4
+#define kl R5
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ zero = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ minmin = #0
+ } {
+ lmant = add(lmanta, lmantb)
+ minus = #-1
+ minminh.H = #0x8000
+ } {
+ k = NORMAMT(manth)
+ kl = NORMAMT(mantl)
+ p0 = cmp.eq(manth, zerol)
+ p1 = cmp.eq(manth, minusl)
+ } {
+ p0 = OR(p0, p1)
+ if(p0.new) k = add(kl, #31)
+ maxneg.H = #0
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ maxneg.L = #0x8001
+ } {
+ p0 = cmp.eq(mantexpa, zero)
+ p1 = cmp.eq(mantexpa, minus)
+ manta.L = #0
+ exp = ZXTH(exp)
+ } {
+ p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0
+ if(p2.new) exp = add(exp, #1)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ }
+ jumpr r31
+#else
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ jumpr r31
+ }
+#endif
+/* =================================================================== *
+ QDOUBLE dsub(QDOUBLE a,QDOUBLE b) {
+ QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = HEXAGON_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = HEXAGON_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b);
+ expdiff = HEXAGON_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) - (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = HEXAGON_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global dsub_asm
+ .type dsub_asm, @function
+dsub_asm:
+
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define zero R7:6
+#define zerol R6
+#define minus R3:2
+#define minusl R2
+#define maxneg R9
+#define minmin R11:10 // exactly 0x8000000000000000LL
+#define minminh R11
+#define k R4
+#define kl R5
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ zero = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ minmin = #0
+ } {
+ lmant = sub(lmanta, lmantb)
+ minus = #-1
+ minminh.H = #0x8000
+ } {
+ k = NORMAMT(manth)
+ kl = NORMAMT(mantl)
+ p0 = cmp.eq(manth, zerol)
+ p1 = cmp.eq(manth, minusl)
+ } {
+ p0 = OR(p0, p1)
+ if(p0.new) k = add(kl, #31)
+ maxneg.H = #0
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ maxneg.L = #0x8001
+ } {
+ p0 = cmp.eq(mantexpa, zero)
+ p1 = cmp.eq(mantexpa, minus)
+ manta.L = #0
+ exp = ZXTH(exp)
+ } {
+ p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0
+ if(p2.new) exp = add(exp, #1)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ }
+ jumpr r31
+#else
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ jumpr r31
+ }
+#endif
+/* ==================================================================== *
+ QDOUBLE dmpy(QDOUBLE a,QDOUBLE b) {
+ QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = HEXAGON_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = HEXAGON_R_sxth_R(b) ;
+ int exp, k;
+ lint mant;
+ int hia, hib, hi, lo;
+ unsigned int loa, lob;
+
+ hia = (int)(a >> 32);
+ loa = HEXAGON_R_extractu_RII((int)manta, 31, 1);
+ hib = (int)(b >> 32);
+ lob = HEXAGON_R_extractu_RII((int)mantb, 31, 1);
+
+ mant = HEXAGON_P_mpy_RR(hia, lob);
+ mant = HEXAGON_P_mpyacc_RR(mant,hib, loa);
+ mant = (mant >> 30) + (HEXAGON_P_mpy_RR(hia, hib)<<1);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = HEXAGON_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
+ mant = mant << k;
+ exp = expa + expb - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global dmpy_asm
+ .type dmpy_asm, @function
+dmpy_asm:
+
+#define mantal R0
+#define mantah R1
+#define mantexpa R1:0
+#define mantbl R2
+#define mantbh R3
+#define mantexpb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define exp R8
+#define lmantc R11:10
+#define mantch R11
+#define mantcl R10
+#define zero0 R7:6
+#define zero0l R6
+#define minus1 R3:2
+#define minus1l R2
+#define maxneg R9
+#define k R4
+#define kl R5
+
+ .falign
+ {
+ mantbl = lsr(mantbl, #16)
+ mantal = lsr(mantal, #16)
+ expa = sxth(mantal)
+ expb = sxth(mantbl)
+ }
+ {
+ lmantc = mpy(mantah, mantbh)
+ mantexpd = mpy(mantah, mantbl)
+ }
+ {
+ lmantc = add(lmantc, lmantc) //<<1
+ mantexpd+= mpy(mantbh, mantal)
+ }
+ {
+ lmantc += asr(mantexpd, #15)
+ exp = add(expa, expb)
+ zero0 = #0
+ minus1 = #-1
+ }
+ {
+ k = normamt(mantch)
+ kl = normamt(mantcl)
+ p0 = cmp.eq(mantch, zero0l)
+ p1 = cmp.eq(mantch, minus1l)
+ }
+ {
+ p0 = or(p0, p1)
+ if(p0.new) k = add(kl, #31)
+ maxneg.H = #0
+ }
+ {
+ mantexpa = asl(lmantc, k)
+ exp = sub(exp, k)
+ maxneg.L = #0x8001
+ }
+ {
+ p0 = cmp.eq(mantexpa, zero0)
+ p1 = cmp.eq(mantexpa, minus1)
+ mantal.L = #0
+ exp = zxth(exp)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ p0 = or(p0, p1)
+ if( p0.new) mantal = or(mantal,maxneg)
+ if(!p0.new) mantal = or(mantal,exp)
+ }
+ jumpr r31
+#else
+ {
+ p0 = or(p0, p1)
+ if( p0.new) mantal = or(mantal,maxneg)
+ if(!p0.new) mantal = or(mantal,exp)
+ jumpr r31
+ }
+#endif
diff --git a/lib/builtins/hexagon/fma_opt.S b/lib/builtins/hexagon/fma_opt.S
new file mode 100644
index 000000000..12378f0da
--- /dev/null
+++ b/lib/builtins/hexagon/fma_opt.S
@@ -0,0 +1,31 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fmaf
+ r2 += sfmpy(r0, r1)
+ {
+ r0 = r2
+ jumpr r31
+ }
+FUNCTION_END fmaf
+
+ .globl fmal
+ .set fmal, fma
diff --git a/lib/builtins/hexagon/fmax_opt.S b/lib/builtins/hexagon/fmax_opt.S
new file mode 100644
index 000000000..f3a218c97
--- /dev/null
+++ b/lib/builtins/hexagon/fmax_opt.S
@@ -0,0 +1,30 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fmaxf
+ {
+ r0 = sfmax(r0, r1)
+ jumpr r31
+ }
+FUNCTION_END fmaxf
+
+ .globl fmaxl
+ .set fmaxl, fmax
diff --git a/lib/builtins/hexagon/fmin_opt.S b/lib/builtins/hexagon/fmin_opt.S
new file mode 100644
index 000000000..ef9b0ff85
--- /dev/null
+++ b/lib/builtins/hexagon/fmin_opt.S
@@ -0,0 +1,30 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fminf
+ {
+ r0 = sfmin(r0, r1)
+ jumpr r31
+ }
+FUNCTION_END fminf
+
+ .globl fminl
+ .set fminl, fmin
diff --git a/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S
new file mode 100644
index 000000000..fbe09086c
--- /dev/null
+++ b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S
@@ -0,0 +1,125 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// An optimized version of a memcpy which is equivalent to the following loop:
+//
+// volatile unsigned *dest;
+// unsigned *src;
+//
+// for (i = 0; i < num_words; ++i)
+// *dest++ = *src++;
+//
+// The corresponding C prototype for this function would be
+// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
+// const unsigned *src,
+// unsigned num_words);
+//
+// *** Both dest and src must be aligned to 32-bit boundaries. ***
+// The code does not perform any runtime checks for this, and will fail
+// in bad ways if this requirement is not met.
+//
+// The "forward" in the name refers to the fact that the function copies
+// the words going forward in memory. It is incorrect to use this function
+// for cases where the original code copied words in any other order.
+//
+// *** This function is only for use by the compiler. ***
+// The only intended use is for the LLVM compiler to generate calls to
+// this function when a mem-copy loop like the one above is detected.
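+//
+// A rough C model of the control structure below (4 KB pages; each chunk
+// is prefetched with l2fetch first, descriptors elided; names are
+// illustrative):
+//
+//   unsigned to_page = ((4096 - ((uintptr_t)src & 4095)) & 4095) / 4;
+//   unsigned n = num_words < to_page ? num_words : to_page;
+//   num_words -= n;
+//   for (; n; --n) *dest++ = *src++;            // prolog, up to page end
+//   while (num_words >= 1024) {                 // whole pages (1024 words)
+//     num_words -= 1024;
+//     for (int i = 0; i < 1024; ++i) *dest++ = *src++;
+//   }
+//   for (; num_words; --num_words) *dest++ = *src++;   // epilog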
+
+ .text
+
+// Inputs:
+// r0: dest
+// r1: src
+// r2: num_words
+
+ .globl hexagon_memcpy_forward_vp4cp4n2
+ .balign 32
+ .type hexagon_memcpy_forward_vp4cp4n2,@function
+hexagon_memcpy_forward_vp4cp4n2:
+
+ // Compute r3 to be the number of words remaining in the current page.
+ // At the same time, compute r4 to be the number of 32-byte blocks
+ // remaining in the page (for prefetch).
+ {
+ r3 = sub(##4096, r1)
+ r5 = lsr(r2, #3)
+ }
+ {
+      // The byte count before end-of-page is in the 12 lowest bits of r3;
+      // extract the word count from bits 2..11. (If the address in r1 was
+      // already page-aligned, those bits are 0.)
+ r3 = extractu(r3, #10, #2)
+ r4 = extractu(r3, #7, #5)
+ }
+ {
+ r3 = minu(r2, r3)
+ r4 = minu(r5, r4)
+ }
+ {
+ r4 = or(r4, ##2105344) // 2105344 = 0x202000
+ p0 = cmp.eq(r3, #0)
+ if (p0.new) jump:nt .Lskipprolog
+ }
+ l2fetch(r1, r4)
+ {
+ loop0(.Lprolog, r3)
+ r2 = sub(r2, r3) // r2 = number of words left after the prolog.
+ }
+ .falign
+.Lprolog:
+ {
+ r4 = memw(r1++#4)
+ memw(r0++#4) = r4.new
+ } :endloop0
+.Lskipprolog:
+ {
+ // Let r3 = number of whole pages left (page = 1024 words).
+ r3 = lsr(r2, #10)
+ if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
+ }
+ {
+ loop1(.Lout, r3)
+ r2 = extractu(r2, #10, #0) // r2 = r2 & 1023
+ r3 = ##2105472 // r3 = 0x202080 (prefetch info)
+ }
+ // Iterate over pages.
+ .falign
+.Lout:
+ // Prefetch each individual page.
+ l2fetch(r1, r3)
+ loop0(.Lpage, #512)
+ .falign
+.Lpage:
+ r5:4 = memd(r1++#8)
+ {
+ memw(r0++#8) = r4
+ memw(r0+#4) = r5
+ } :endloop0:endloop1
+.Lskipmain:
+ {
+ r3 = ##2105344 // r3 = 0x202000 (prefetch info)
+ r4 = lsr(r2, #3) // r4 = number of 32-byte blocks remaining.
+ p0 = cmp.eq(r2, #0)
+ if (p0.new) jumpr:nt r31
+ }
+ {
+ r3 = or(r3, r4)
+ loop0(.Lepilog, r2)
+ }
+ l2fetch(r1, r3)
+ .falign
+.Lepilog:
+ {
+ r4 = memw(r1++#4)
+ memw(r0++#4) = r4.new
+ } :endloop0
+
+ jumpr r31
+
+.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2
diff --git a/lib/builtins/hexagon/memcpy_likely_aligned.S b/lib/builtins/hexagon/memcpy_likely_aligned.S
new file mode 100644
index 000000000..bbc85c22d
--- /dev/null
+++ b/lib/builtins/hexagon/memcpy_likely_aligned.S
@@ -0,0 +1,64 @@
+//===------------------------- memcopy routines ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
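+/* A rough C model (illustrative): with len >= 32, len % 8 == 0, and both
+ * pointers 8-byte aligned, copy one double-word per iteration; anything
+ * not 8-aligned falls back to plain memcpy:
+ *
+ *   void *copy(void *dst, const void *src, size_t len) {
+ *     if (((uintptr_t)dst | (uintptr_t)src) & 7) return memcpy(dst, src, len);
+ *     uint64_t *d = dst; const uint64_t *s = src;
+ *     for (size_t i = 0; i < len / 8; ++i) d[i] = s[i];
+ *     return dst;
+ *   }
+ */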
+FUNCTION_BEGIN __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
+ {
+ p0 = bitsclr(r1,#7)
+ p0 = bitsclr(r0,#7)
+ if (p0.new) r5:4 = memd(r1)
+ r3 = #-3
+ }
+ {
+ if (!p0) jump .Lmemcpy_call
+ if (p0) memd(r0++#8) = r5:4
+ if (p0) r5:4 = memd(r1+#8)
+ r3 += lsr(r2,#3)
+ }
+ {
+ memd(r0++#8) = r5:4
+ r5:4 = memd(r1+#16)
+ r1 = add(r1,#24)
+ loop0(1f,r3)
+ }
+ .falign
+1:
+ {
+ memd(r0++#8) = r5:4
+ r5:4 = memd(r1++#8)
+ }:endloop0
+ {
+ memd(r0) = r5:4
+ r0 -= add(r2,#-8)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
+
+.Lmemcpy_call:
+#ifdef __PIC__
+ jump memcpy@PLT
+#else
+ jump memcpy
+#endif
+
+ .globl __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes
+ .set __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes, \
+ __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
diff --git a/lib/builtins/hexagon/moddi3.S b/lib/builtins/hexagon/moddi3.S
new file mode 100644
index 000000000..12a0595fe
--- /dev/null
+++ b/lib/builtins/hexagon/moddi3.S
@@ -0,0 +1,83 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_moddi3
+ {
+ p3 = tstbit(r1,#31)
+ }
+ {
+ r1:0 = abs(r1:0)
+ r3:2 = abs(r3:2)
+ }
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jump .hexagon_moddi3_return // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+
+.hexagon_moddi3_return:
+ {
+ r1:0 = neg(r3:2)
+ }
+ {
+ r1:0 = vmux(p3,r1:0,r3:2)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_moddi3
+
+ .globl __qdsp_moddi3
+ .set __qdsp_moddi3, __hexagon_moddi3
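+
+// Comment-only C model of the restoring shift-and-subtract loop above:
+// compute the remainder on magnitudes, then give it the sign of the
+// dividend, matching C's % semantics. Division by zero is undefined here,
+// as in the assembly.
+//
+//   long long moddi3_model(long long a, long long b) {  // illustrative name
+//     unsigned long long ua = a < 0 ? -(unsigned long long)a : a;
+//     unsigned long long ub = b < 0 ? -(unsigned long long)b : b;
+//     unsigned long long r = ua;
+//     if (ua >= ub) {
+//       int s = __builtin_clzll(ub) - __builtin_clzll(ua);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((ub << s) <= r) r -= ub << s;
+//     }
+//     return a < 0 ? -(long long)r : (long long)r;
+//   }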
diff --git a/lib/builtins/hexagon/modsi3.S b/lib/builtins/hexagon/modsi3.S
new file mode 100644
index 000000000..5afda9e29
--- /dev/null
+++ b/lib/builtins/hexagon/modsi3.S
@@ -0,0 +1,66 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_modsi3
+  {
+    p2 = cmp.ge(r0,#0)              // remember the sign of the dividend
+    r2 = abs(r0)                    // |dividend|
+    r1 = abs(r1)                    // |divisor|
+  }
+  {
+    r3 = cl0(r2)                    // count leading 0's of |dividend|
+    r4 = cl0(r1)                    // count leading 0's of |divisor|
+    p0 = cmp.gtu(r1,r2)             // divisor > dividend?
+  }
+  {
+    r3 = sub(r4,r3)                 // left shift count to align divisor
+    if (p0) jumpr r31               // divisor > dividend: dividend is remainder
+  }
+  {
+    p1 = cmp.eq(r3,#0)              // shift==0? then zero r1 for the last step
+    loop0(1f,r3)                    // one trial subtraction per bit
+    r0 = r2                         // working remainder
+    r2 = lsl(r1,r3)                 // divisor aligned under dividend msb
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r0)             // shifted divisor > remainder?
+    if (!p0.new) r0 = sub(r0,r2)    // if not, subtract it
+    r2 = lsr(r2,#1)                 // move to the next lower bit
+    if (p1) r1 = #0
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r0)             // final trial subtraction
+    if (!p0.new) r0 = sub(r0,r1)
+    if (p2) jumpr r31               // dividend was non-negative: done
+  }
+  {
+    r0 = neg(r0)                    // remainder takes the dividend's sign
+    jumpr r31
+  }
+FUNCTION_END __hexagon_modsi3
+
+ .globl __qdsp_modsi3
+ .set __qdsp_modsi3, __hexagon_modsi3
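+
+// Comment-only C model of the 32-bit signed remainder above; as in
+// __hexagon_moddi3, the sign of the result follows the dividend.
+//
+//   int modsi3_model(int a, int b) {                // illustrative name
+//     unsigned ua = a < 0 ? -(unsigned)a : a;
+//     unsigned ub = b < 0 ? -(unsigned)b : b;
+//     unsigned r = ua;
+//     if (ua >= ub) {
+//       int s = __builtin_clz(ub) - __builtin_clz(ua);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((ub << s) <= r) r -= ub << s;
+//     }
+//     return a < 0 ? -(int)r : (int)r;
+//   }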
diff --git a/lib/builtins/hexagon/sfdiv_opt.S b/lib/builtins/hexagon/sfdiv_opt.S
new file mode 100644
index 000000000..6bdd4808c
--- /dev/null
+++ b/lib/builtins/hexagon/sfdiv_opt.S
@@ -0,0 +1,66 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+
+FUNCTION_BEGIN __hexagon_divsf3
+  {
+    r2,p0 = sfrecipa(r0,r1)         // reciprocal seed for the denominator
+    r4 = sffixupd(r0,r1)            // fixed-up denominator d
+    r3 = ##0x3f800000               // 1.0
+  }
+  {
+    r5 = sffixupn(r0,r1)            // fixed-up numerator n
+    r3 -= sfmpy(r4,r2):lib          // e0 = 1.0 - d*recip (estimate error)
+    r6 = ##0x80000000
+    r7 = r3                         // save 1.0 (packet reads the old r3)
+  }
+  {
+    r2 += sfmpy(r3,r2):lib          // recip += e0*recip (Newton step 1)
+    r3 = r7                         // r3 = 1.0 again
+    r6 = r5                         // r6 = n (old r6 was the sign mask)
+    r0 = and(r6,r5)                 // r0 = sign bit of n (seed quotient)
+  }
+  {
+    r3 -= sfmpy(r4,r2):lib          // e1 = 1.0 - d*recip
+    r0 += sfmpy(r5,r2):lib          // w = n*recip (quotient estimate)
+  }
+  {
+    r2 += sfmpy(r3,r2):lib          // recip += e1*recip (Newton step 2)
+    r6 -= sfmpy(r0,r4):lib          // rho = n - w*d (residual)
+  }
+  {
+    r0 += sfmpy(r6,r2):lib          // w += rho*recip (correction 1)
+  }
+  {
+    r5 -= sfmpy(r0,r4):lib          // rho = n - w*d again
+  }
+  {
+    r0 += sfmpy(r5,r2,p0):scale     // final correction, rescale, fix specials
+    jumpr r31
+  }
+FUNCTION_END __hexagon_divsf3
+
+Q6_ALIAS(divsf3)
+FAST_ALIAS(divsf3)
+FAST2_ALIAS(divsf3)
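+
+// Comment-only sketch of the refinement strategy above, not the exact FP
+// semantics: the hardware's sfrecipa/sffixup ops and the :lib/:scale
+// multiplies also handle scaling and special cases that plain C cannot
+// show. approx_recip stands in for sfrecipa (illustrative name only).
+//
+//   float divsf3_model(float n, float d) {
+//     float r = approx_recip(d);       // hardware seed, a few bits accurate
+//     r = r + r * (1.0f - d * r);      // Newton step: error e = 1 - d*r
+//     r = r + r * (1.0f - d * r);      // second step doubles the accuracy
+//     float w = n * r;                 // initial quotient estimate
+//     w = w + r * (n - w * d);         // correct with the residual n - w*d
+//     w = w + r * (n - w * d);         // ...and once more for full precision
+//     return w;
+//   }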
diff --git a/lib/builtins/hexagon/sfsqrt_opt.S b/lib/builtins/hexagon/sfsqrt_opt.S
new file mode 100644
index 000000000..7f6190027
--- /dev/null
+++ b/lib/builtins/hexagon/sfsqrt_opt.S
@@ -0,0 +1,82 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+#define RIN r0
+#define S r0
+#define H r1
+#define D r2
+#define E r3
+#define HALF r4
+#define R r5
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+
+FUNCTION_BEGIN __hexagon_sqrtf
+ {
+ E,p0 = sfinvsqrta(RIN)
+ R = sffixupr(RIN)
+ HALF = ##0x3f000000 // 0.5
+ r1:0 = combine(#0,#0) // clear S/H
+ }
+ {
+ S += sfmpy(E,R):lib // S0
+ H += sfmpy(E,HALF):lib // H0
+ D = HALF
+ E = R
+ }
+ {
+ D -= sfmpy(S,H):lib // d0
+ p1 = sfclass(R,#1) // is zero?
+ //E -= sfmpy(S,S):lib // e0
+ }
+ {
+ S += sfmpy(S,D):lib // S1
+ H += sfmpy(H,D):lib // H1
+ D = HALF
+ E = R
+ }
+ {
+        D -= sfmpy(S,H):lib            // d1
+        E -= sfmpy(S,S):lib            // e1
+ }
+ {
+ S += sfmpy(H,E):lib // S2
+ H += sfmpy(H,D):lib // H2
+ D = HALF
+ E = R
+ }
+ {
+ //D -= sfmpy(S,H):lib // d2
+ E -= sfmpy(S,S):lib // e2
+ if (p1) r0 = or(r0,R) // sqrt(-0.0) = -0.0
+ }
+ {
+ S += sfmpy(H,E,p0):scale // S3
+ jumpr r31
+ }
+
+FUNCTION_END __hexagon_sqrtf
+
+Q6_ALIAS(sqrtf)
+FAST_ALIAS(sqrtf)
+FAST2_ALIAS(sqrtf)
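+
+// Comment-only sketch of the iteration above. S converges to sqrt(x) and H
+// to 1/(2*sqrt(x)); each step uses d = 0.5 - S*H and e = x - S*S as
+// correction terms. approx_rsqrt stands in for sfinvsqrta (illustrative
+// name only), and the :lib/:scale multiplies handle rounding and special
+// cases (zero, negative, infinity) that plain C does not show.
+//
+//   float sqrtf_model(float x) {
+//     float e = approx_rsqrt(x);       // seed for 1/sqrt(x)
+//     float S = x * e;                 // sqrt estimate
+//     float H = 0.5f * e;              // half-reciprocal estimate
+//     float d = 0.5f - S * H;
+//     S += S * d;  H += H * d;         // first refinement
+//     d = 0.5f - S * H;
+//     float r = x - S * S;
+//     S += H * r;  H += H * d;         // second refinement
+//     r = x - S * S;
+//     return S + H * r;                // final correction
+//   }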
diff --git a/lib/builtins/hexagon/udivdi3.S b/lib/builtins/hexagon/udivdi3.S
new file mode 100644
index 000000000..1ca326b75
--- /dev/null
+++ b/lib/builtins/hexagon/udivdi3.S
@@ -0,0 +1,71 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivdi3
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jumpr r31 // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+ {
+ jumpr r31 // return
+ }
+FUNCTION_END __hexagon_udivdi3
+
+ .globl __qdsp_udivdi3
+ .set __qdsp_udivdi3, __hexagon_udivdi3
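+
+// Comment-only C model of the loop above: align the divisor under the
+// dividend's msb, then do one trial subtraction per bit, accumulating
+// quotient bits. Division by zero is undefined, as in the assembly.
+//
+//   unsigned long long udivdi3_model(unsigned long long a,   // illustrative
+//                                    unsigned long long b) { // name
+//     unsigned long long q = 0, r = a;
+//     if (a >= b) {
+//       int s = __builtin_clzll(b) - __builtin_clzll(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= r) { r -= b << s; q |= 1ULL << s; }
+//     }
+//     return q;
+//   }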
diff --git a/lib/builtins/hexagon/udivmoddi4.S b/lib/builtins/hexagon/udivmoddi4.S
new file mode 100644
index 000000000..deb5aae09
--- /dev/null
+++ b/lib/builtins/hexagon/udivmoddi4.S
@@ -0,0 +1,71 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivmoddi4
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jumpr r31 // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+ {
+ jumpr r31 // return
+ }
+FUNCTION_END __hexagon_udivmoddi4
+
+ .globl __qdsp_udivmoddi4
+ .set __qdsp_udivmoddi4, __hexagon_udivmoddi4
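+
+// Comment-only model: the same loop as __hexagon_udivdi3, but both results
+// are kept; as written, the routine above leaves the quotient in r1:0 and
+// the remainder in r3:2.
+//
+//   void udivmoddi4_model(unsigned long long a, unsigned long long b,
+//                         unsigned long long *q,      // illustrative name
+//                         unsigned long long *r) {
+//     *q = 0; *r = a;
+//     if (a >= b) {
+//       int s = __builtin_clzll(b) - __builtin_clzll(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= *r) { *r -= b << s; *q |= 1ULL << s; }
+//     }
+//   }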
diff --git a/lib/builtins/hexagon/udivmodsi4.S b/lib/builtins/hexagon/udivmodsi4.S
new file mode 100644
index 000000000..25bbe7cd5
--- /dev/null
+++ b/lib/builtins/hexagon/udivmodsi4.S
@@ -0,0 +1,60 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivmodsi4
+  {
+    r2 = cl0(r0)                    // count leading 0's of dividend
+    r3 = cl0(r1)                    // count leading 0's of divisor
+    r5:4 = combine(#1,#0)           // bit = 1, scratch 0
+    p0 = cmp.gtu(r1,r0)             // divisor > dividend?
+  }
+  {
+    r6 = sub(r3,r2)                 // left shift count to align divisor
+    r4 = r1                         // keep a copy of the divisor
+    r1:0 = combine(r0,r4)           // remainder = dividend, quotient = 0
+    if (p0) jumpr r31               // divisor > dividend: q=0, r=dividend
+  }
+  {
+    r3:2 = vlslw(r5:4,r6)           // r3 = bit<<shift, r2 = divisor<<shift
+    loop0(1f,r6)                    // one trial subtraction per bit
+    p0 = cmp.eq(r6,#0)
+    if (p0.new) r4 = #0             // shift==0: make the final step a no-op
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r1)             // shifted divisor > remainder?
+    if (!p0.new) r1 = sub(r1,r2)    // if not, subtract it...
+    if (!p0.new) r0 = add(r0,r3)    // ...and set the quotient bit
+    r3:2 = vlsrw(r3:2,#1)           // move to the next lower bit
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r1)             // final trial subtraction
+    if (!p0.new) r1 = sub(r1,r4)
+    if (!p0.new) r0 = add(r0,r3)
+    jumpr r31                       // quotient in r0, remainder in r1
+  }
+FUNCTION_END __hexagon_udivmodsi4
+
+ .globl __qdsp_udivmodsi4
+ .set __qdsp_udivmodsi4, __hexagon_udivmodsi4
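+
+// Comment-only model: the 32-bit divide with both results; the routine
+// above returns the quotient in r0 and the remainder in r1.
+//
+//   void udivmodsi4_model(unsigned a, unsigned b,     // illustrative name
+//                         unsigned *q, unsigned *r) {
+//     *q = 0; *r = a;
+//     if (a >= b) {
+//       int s = __builtin_clz(b) - __builtin_clz(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= *r) { *r -= b << s; *q |= 1u << s; }
+//     }
+//   }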
diff --git a/lib/builtins/hexagon/udivsi3.S b/lib/builtins/hexagon/udivsi3.S
new file mode 100644
index 000000000..54f0aa409
--- /dev/null
+++ b/lib/builtins/hexagon/udivsi3.S
@@ -0,0 +1,56 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivsi3
+  {
+    r2 = cl0(r0)                    // count leading 0's of dividend
+    r3 = cl0(r1)                    // count leading 0's of divisor
+    r5:4 = combine(#1,#0)           // bit = 1, scratch 0
+    p0 = cmp.gtu(r1,r0)             // divisor > dividend?
+  }
+  {
+    r6 = sub(r3,r2)                 // left shift count to align divisor
+    r4 = r1                         // keep a copy of the divisor
+    r1:0 = combine(r0,r4)           // remainder = dividend, quotient = 0
+    if (p0) jumpr r31               // divisor > dividend: quotient is 0
+  }
+  {
+    r3:2 = vlslw(r5:4,r6)           // r3 = bit<<shift, r2 = divisor<<shift
+    loop0(1f,r6)                    // one trial subtraction per bit
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r1)             // shifted divisor > remainder?
+    if (!p0.new) r1 = sub(r1,r2)    // if not, subtract it...
+    if (!p0.new) r0 = add(r0,r3)    // ...and set the quotient bit
+    r3:2 = vlsrw(r3:2,#1)           // move to the next lower bit
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r1)             // final trial: set the last quotient bit
+    if (!p0.new) r0 = add(r0,r3)
+    jumpr r31                       // quotient in r0
+  }
+FUNCTION_END __hexagon_udivsi3
+
+ .globl __qdsp_udivsi3
+ .set __qdsp_udivsi3, __hexagon_udivsi3
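+
+// Comment-only C model of the quotient-only 32-bit loop above.
+//
+//   unsigned udivsi3_model(unsigned a, unsigned b) {  // illustrative name
+//     unsigned q = 0, r = a;
+//     if (a >= b) {
+//       int s = __builtin_clz(b) - __builtin_clz(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= r) { r -= b << s; q |= 1u << s; }
+//     }
+//     return q;
+//   }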
diff --git a/lib/builtins/hexagon/umoddi3.S b/lib/builtins/hexagon/umoddi3.S
new file mode 100644
index 000000000..f09152141
--- /dev/null
+++ b/lib/builtins/hexagon/umoddi3.S
@@ -0,0 +1,74 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_umoddi3
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jump .hexagon_umoddi3_return // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+
+.hexagon_umoddi3_return:
+ {
+ r1:0 = r3:2
+ jumpr r31
+ }
+FUNCTION_END __hexagon_umoddi3
+
+ .globl __qdsp_umoddi3
+ .set __qdsp_umoddi3, __hexagon_umoddi3
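+
+// Comment-only C model: the same loop as __hexagon_udivdi3, but the
+// remainder is returned instead of the quotient.
+//
+//   unsigned long long umoddi3_model(unsigned long long a,   // illustrative
+//                                    unsigned long long b) { // name
+//     unsigned long long r = a;
+//     if (a >= b) {
+//       int s = __builtin_clzll(b) - __builtin_clzll(a);
+//       for (; s >= 0; --s)                   // keep only the remainder
+//         if ((b << s) <= r) r -= b << s;
+//     }
+//     return r;
+//   }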
diff --git a/lib/builtins/hexagon/umodsi3.S b/lib/builtins/hexagon/umodsi3.S
new file mode 100644
index 000000000..a8270c203
--- /dev/null
+++ b/lib/builtins/hexagon/umodsi3.S
@@ -0,0 +1,55 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_umodsi3
+  {
+    r2 = cl0(r0)                    // count leading 0's of dividend
+    r3 = cl0(r1)                    // count leading 0's of divisor
+    p0 = cmp.gtu(r1,r0)             // divisor > dividend?
+  }
+  {
+    r2 = sub(r3,r2)                 // left shift count to align divisor
+    if (p0) jumpr r31               // divisor > dividend: dividend is remainder
+  }
+  {
+    loop0(1f,r2)                    // one trial subtraction per bit
+    p1 = cmp.eq(r2,#0)              // shift==0? then zero r1 for the last step
+    r2 = lsl(r1,r2)                 // divisor aligned under dividend msb
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r0)             // shifted divisor > remainder?
+    if (!p0.new) r0 = sub(r0,r2)    // if not, subtract it
+    r2 = lsr(r2,#1)                 // move to the next lower bit
+    if (p1) r1 = #0
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r0)             // final trial subtraction
+    if (!p0.new) r0 = sub(r0,r1)
+    jumpr r31                       // remainder in r0
+  }
+FUNCTION_END __hexagon_umodsi3
+
+ .globl __qdsp_umodsi3
+ .set __qdsp_umodsi3, __hexagon_umodsi3
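+
+// Comment-only C model of the 32-bit unsigned remainder above.
+//
+//   unsigned umodsi3_model(unsigned a, unsigned b) {  // illustrative name
+//     unsigned r = a;
+//     if (a >= b) {
+//       int s = __builtin_clz(b) - __builtin_clz(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= r) r -= b << s;
+//     }
+//     return r;
+//   }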