author     Sid Manning <sidneym@codeaurora.org>  2018-05-09 14:44:54 +0000
committer  Sid Manning <sidneym@codeaurora.org>  2018-05-09 14:44:54 +0000
commit     45688582b548029601dc801e9f92b0e36dd86823 (patch)
tree       1f6af105d0a0365cbd57847506f597fd8b198511
parent     65b4b157b2946ffe854538ce931a133faa89cdd9 (diff)
Add basic compiler-rt builtins support for hexagon.
Differential Revision: https://reviews.llvm.org/D46364

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@331881 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  cmake/builtin-config-ix.cmake                      |    3
-rw-r--r--  cmake/config-ix.cmake                              |    1
-rw-r--r--  lib/builtins/CMakeLists.txt                        |   35
-rw-r--r--  lib/builtins/hexagon/common_entry_exit_abi1.S      |  103
-rw-r--r--  lib/builtins/hexagon/common_entry_exit_abi2.S      |  268
-rw-r--r--  lib/builtins/hexagon/common_entry_exit_legacy.S    |  157
-rw-r--r--  lib/builtins/hexagon/dfaddsub.S                    |  398
-rw-r--r--  lib/builtins/hexagon/dfdiv.S                       |  492
-rw-r--r--  lib/builtins/hexagon/dffma.S                       |  705
-rw-r--r--  lib/builtins/hexagon/dfminmax.S                    |   79
-rw-r--r--  lib/builtins/hexagon/dfmul.S                       |  418
-rw-r--r--  lib/builtins/hexagon/dfsqrt.S                      |  406
-rw-r--r--  lib/builtins/hexagon/divdi3.S                      |   85
-rw-r--r--  lib/builtins/hexagon/divsi3.S                      |   84
-rw-r--r--  lib/builtins/hexagon/fabs_opt.S                    |   37
-rw-r--r--  lib/builtins/hexagon/fastmath2_dlib_asm.S          |  491
-rw-r--r--  lib/builtins/hexagon/fastmath2_ldlib_asm.S         |  345
-rw-r--r--  lib/builtins/hexagon/fastmath_dlib_asm.S           |  400
-rw-r--r--  lib/builtins/hexagon/fma_opt.S                     |   31
-rw-r--r--  lib/builtins/hexagon/fmax_opt.S                    |   30
-rw-r--r--  lib/builtins/hexagon/fmin_opt.S                    |   30
-rw-r--r--  lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S     |  125
-rw-r--r--  lib/builtins/hexagon/memcpy_likely_aligned.S       |   64
-rw-r--r--  lib/builtins/hexagon/moddi3.S                      |   83
-rw-r--r--  lib/builtins/hexagon/modsi3.S                      |   66
-rw-r--r--  lib/builtins/hexagon/sfdiv_opt.S                   |   66
-rw-r--r--  lib/builtins/hexagon/sfsqrt_opt.S                  |   82
-rw-r--r--  lib/builtins/hexagon/udivdi3.S                     |   71
-rw-r--r--  lib/builtins/hexagon/udivmoddi4.S                  |   71
-rw-r--r--  lib/builtins/hexagon/udivmodsi4.S                  |   60
-rw-r--r--  lib/builtins/hexagon/udivsi3.S                     |   56
-rw-r--r--  lib/builtins/hexagon/umoddi3.S                     |   74
-rw-r--r--  lib/builtins/hexagon/umodsi3.S                     |   55
33 files changed, 5470 insertions(+), 1 deletion(-)
diff --git a/cmake/builtin-config-ix.cmake b/cmake/builtin-config-ix.cmake
index eda5f4641..a5704e5fe 100644
--- a/cmake/builtin-config-ix.cmake
+++ b/cmake/builtin-config-ix.cmake
@@ -25,6 +25,7 @@ int foo(int x, int y) {
set(ARM64 aarch64)
set(ARM32 arm armhf armv6m armv7m armv7em armv7 armv7s armv7k)
+set(HEXAGON hexagon)
set(X86 i386)
set(X86_64 x86_64)
set(MIPS32 mips mipsel)
@@ -42,7 +43,7 @@ if(APPLE)
endif()
set(ALL_BUILTIN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
- ${MIPS32} ${MIPS64} ${PPC64} ${RISCV32} ${RISCV64} ${WASM32} ${WASM64})
+ ${HEXAGON} ${MIPS32} ${MIPS64} ${PPC64} ${RISCV32} ${RISCV64} ${WASM32} ${WASM64})
include(CompilerRTUtils)
include(CompilerRTDarwinUtils)
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 82c3d2859..ee387dc22 100644
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -174,6 +174,7 @@ endmacro()
set(ARM64 aarch64)
set(ARM32 arm armhf)
+set(HEXAGON hexagon)
set(X86 i386)
set(X86_64 x86_64)
set(MIPS32 mips mipsel)
diff --git a/lib/builtins/CMakeLists.txt b/lib/builtins/CMakeLists.txt
index 6c48a404a..5e7f22467 100644
--- a/lib/builtins/CMakeLists.txt
+++ b/lib/builtins/CMakeLists.txt
@@ -459,6 +459,41 @@ set(armv6m_SOURCES ${thumb1_SOURCES})
set(armv7m_SOURCES ${arm_SOURCES})
set(armv7em_SOURCES ${arm_SOURCES})
+# hexagon arch
+set(hexagon_SOURCES ${GENERIC_SOURCES} ${GENERIC_TF_SOURCES})
+list(APPEND hexagon_SOURCES
+ hexagon/common_entry_exit_abi1.S
+ hexagon/common_entry_exit_abi2.S
+ hexagon/common_entry_exit_legacy.S
+ hexagon/dfaddsub.S
+ hexagon/dfdiv.S
+ hexagon/dffma.S
+ hexagon/dfminmax.S
+ hexagon/dfmul.S
+ hexagon/dfsqrt.S
+ hexagon/divdi3.S
+ hexagon/divsi3.S
+ hexagon/fabs_opt.S
+ hexagon/fastmath2_dlib_asm.S
+ hexagon/fastmath2_ldlib_asm.S
+ hexagon/fastmath_dlib_asm.S
+ hexagon/fma_opt.S
+ hexagon/fmax_opt.S
+ hexagon/fmin_opt.S
+ hexagon/memcpy_forward_vp4cp4n2.S
+ hexagon/memcpy_likely_aligned.S
+ hexagon/moddi3.S
+ hexagon/modsi3.S
+ hexagon/sfdiv_opt.S
+ hexagon/sfsqrt_opt.S
+ hexagon/udivdi3.S
+ hexagon/udivmoddi4.S
+ hexagon/udivmodsi4.S
+ hexagon/udivsi3.S
+ hexagon/umoddi3.S
+ hexagon/umodsi3.S)
+
+
set(mips_SOURCES ${GENERIC_SOURCES})
set(mipsel_SOURCES ${mips_SOURCES})
set(mips64_SOURCES ${GENERIC_TF_SOURCES}
diff --git a/lib/builtins/hexagon/common_entry_exit_abi1.S b/lib/builtins/hexagon/common_entry_exit_abi1.S
new file mode 100644
index 000000000..d5479d2a5
--- /dev/null
+++ b/lib/builtins/hexagon/common_entry_exit_abi1.S
@@ -0,0 +1,103 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Functions that implement common sequences in function prologues and epilogues
+ used to save code size */
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .globl \name
+ .type \name, @function
+ .falign
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+ .macro FALLTHROUGH_TAIL_CALL name0 name1
+ .size \name0, . - \name0
+ .globl \name1
+ .type \name1, @function
+ .falign
+\name1:
+ .endm
+
+
+
+
+/* Save r25:24 at fp+#-8 and r27:26 at fp+#-16. */
+
+
+
+
+/* The compiler knows that the __save_* functions clobber LR. No other
+ registers should be used without informing the compiler. */
+
+/* Since we can only issue one store per packet, we don't hurt performance by
+ simply jumping to the right point in this sequence of stores. */
+
+FUNCTION_BEGIN __save_r24_through_r27
+ memd(fp+#-16) = r27:26
+FALLTHROUGH_TAIL_CALL __save_r24_through_r27 __save_r24_through_r25
+ {
+ memd(fp+#-8) = r25:24
+ jumpr lr
+ }
+FUNCTION_END __save_r24_through_r25
+
+
+
+
+/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel
+ with deallocframe. That way, the return gets the old value of lr, which is
+ where these functions need to return, and at the same time, lr gets the value
+ it needs going into the tail call. */
+
+FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe_before_tailcall
+ r27:26 = memd(fp+#-16)
+FALLTHROUGH_TAIL_CALL __restore_r24_through_r27_and_deallocframe_before_tailcall __restore_r24_through_r25_and_deallocframe_before_tailcall
+ {
+ r25:24 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r24_through_r25_and_deallocframe_before_tailcall
+
+
+
+
+/* Here we use the extra load bandwidth to restore LR early, allowing the return
+ to occur in parallel with the deallocframe. */
+
+FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe
+ {
+ lr = memw(fp+#4)
+ r27:26 = memd(fp+#-16)
+ }
+ {
+ r25:24 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r24_through_r27_and_deallocframe
+
+
+
+
+/* Here the load bandwidth is maximized. */
+
+FUNCTION_BEGIN __restore_r24_through_r25_and_deallocframe
+ {
+ r25:24 = memd(fp+#-8)
+ deallocframe
+ }
+ jumpr lr
+FUNCTION_END __restore_r24_through_r25_and_deallocframe
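These helpers pay off whenever a function keeps several values live across calls: the compiler assigns them to callee-saved registers r24-r27 and, at -Os, can emit a single "call __save_r24_through_r27" in the prologue and tail-jump to "__restore_r24_through_r27_and_deallocframe" in the epilogue instead of inlining the stores and loads. A minimal C sketch of such a function (illustrative only; the actual register assignment is up to the compiler):

    /* Illustrative only: with four accumulators live across each next()
       call, a size-optimizing compiler can share the save/restore
       sequences above instead of emitting four inline stores and loads. */
    int sum4(int (*next)(void), int n)
    {
        int a = 0, b = 0, c = 0, d = 0;   /* live across every call */
        while (n-- > 0) {
            int v = next();
            a += v; b += v ^ a; c += v & b; d += v | c;
        }
        return a + b + c + d;
    }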
diff --git a/lib/builtins/hexagon/common_entry_exit_abi2.S b/lib/builtins/hexagon/common_entry_exit_abi2.S
new file mode 100644
index 000000000..6f470343d
--- /dev/null
+++ b/lib/builtins/hexagon/common_entry_exit_abi2.S
@@ -0,0 +1,268 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Functions that implement common sequences in function prologues and epilogues
+ used to save code size */
+
+ .macro FUNCTION_BEGIN name
+ .p2align 2
+ .section .text.\name,"ax",@progbits
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+ .macro FALLTHROUGH_TAIL_CALL name0 name1
+ .p2align 2
+ .size \name0, . - \name0
+ .globl \name1
+ .type \name1, @function
+\name1:
+ .endm
+
+
+
+
+/* Save r17:16 at fp+#-8, r19:18 at fp+#-16, r21:20 at fp+#-24, r23:22 at
+ fp+#-32, r25:24 at fp+#-40, and r27:26 at fp+#-48.
+ The compiler knows that the __save_* functions clobber LR. No other
+ registers should be used without informing the compiler. */
+
+FUNCTION_BEGIN __save_r16_through_r27
+ {
+ memd(fp+#-48) = r27:26
+ memd(fp+#-40) = r25:24
+ }
+ {
+ memd(fp+#-32) = r23:22
+ memd(fp+#-24) = r21:20
+ }
+ {
+ memd(fp+#-16) = r19:18
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r27
+
+FUNCTION_BEGIN __save_r16_through_r25
+ {
+ memd(fp+#-40) = r25:24
+ memd(fp+#-32) = r23:22
+ }
+ {
+ memd(fp+#-24) = r21:20
+ memd(fp+#-16) = r19:18
+ }
+ {
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r25
+
+FUNCTION_BEGIN __save_r16_through_r23
+ {
+ memd(fp+#-32) = r23:22
+ memd(fp+#-24) = r21:20
+ }
+ {
+ memd(fp+#-16) = r19:18
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r23
+
+FUNCTION_BEGIN __save_r16_through_r21
+ {
+ memd(fp+#-24) = r21:20
+ memd(fp+#-16) = r19:18
+ }
+ {
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r21
+
+FUNCTION_BEGIN __save_r16_through_r19
+ {
+ memd(fp+#-16) = r19:18
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r19
+
+FUNCTION_BEGIN __save_r16_through_r17
+ {
+ memd(fp+#-8) = r17:16
+ jumpr lr
+ }
+FUNCTION_END __save_r16_through_r17
+
+/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel
+ with deallocframe. That way, the return gets the old value of lr, which is
+ where these functions need to return, and at the same time, lr gets the value
+ it needs going into the tail call. */
+
+
+FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe_before_tailcall
+ r27:26 = memd(fp+#-48)
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r27_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe_before_tailcall
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r25_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe_before_tailcall
+ {
+ r23:22 = memd(fp+#-32)
+ r21:20 = memd(fp+#-24)
+ }
+ r19:18 = memd(fp+#-16)
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r23_and_deallocframe_before_tailcall
+
+
+FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe_before_tailcall
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r21_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe_before_tailcall
+ r19:18 = memd(fp+#-16)
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r19_and_deallocframe_before_tailcall
+
+FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe_before_tailcall
+ {
+ r17:16 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r16_through_r17_and_deallocframe_before_tailcall
+
+
+FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe
+ r27:26 = memd(fp+#-48)
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r27_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe
+ {
+ r25:24 = memd(fp+#-40)
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r25_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe
+ {
+ r23:22 = memd(fp+#-32)
+ }
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r23_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe
+ {
+ r21:20 = memd(fp+#-24)
+ r19:18 = memd(fp+#-16)
+ }
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r21_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe
+ {
+ r19:18 = memd(fp+#-16)
+ r17:16 = memd(fp+#-8)
+ }
+ {
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r19_and_deallocframe
+
+FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe
+ {
+ r17:16 = memd(fp+#-8)
+ dealloc_return
+ }
+FUNCTION_END __restore_r16_through_r17_and_deallocframe
+
+FUNCTION_BEGIN __deallocframe
+ dealloc_return
+FUNCTION_END __deallocframe
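The memd() offsets above define a fixed save-area layout just below the frame pointer; allocframe itself keeps the caller's FP at fp+#0 and the return address at fp+#4. A hedged C model of that layout (the struct name and fields are illustrative, not a real ABI type):

    #include <stdint.h>

    /* Hedged model of the abi2 save area; the struct ends at the frame
       pointer, and fp+0/fp+4 hold the saved FP/LR stored by allocframe. */
    struct abi2_save_area {
        uint64_t r27_26;   /* fp - 48 */
        uint64_t r25_24;   /* fp - 40 */
        uint64_t r23_22;   /* fp - 32 */
        uint64_t r21_20;   /* fp - 24 */
        uint64_t r19_18;   /* fp - 16 */
        uint64_t r17_16;   /* fp -  8 */
    };
    /* A prologue that calls __save_r16_through_r21 populates only the
       last three fields; __restore_r16_through_r21_* reads them back. */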
diff --git a/lib/builtins/hexagon/common_entry_exit_legacy.S b/lib/builtins/hexagon/common_entry_exit_legacy.S
new file mode 100644
index 000000000..3258f15a3
--- /dev/null
+++ b/lib/builtins/hexagon/common_entry_exit_legacy.S
@@ -0,0 +1,157 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/* Functions that implement common sequences in function prologues and epilogues
+ used to save code size */
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .globl \name
+ .type \name, @function
+ .falign
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+ .macro FALLTHROUGH_TAIL_CALL name0 name1
+ .size \name0, . - \name0
+ .globl \name1
+ .type \name1, @function
+ .falign
+\name1:
+ .endm
+
+
+
+
+/* Save r27:26 at fp+#-8, r25:24 at fp+#-16, r23:22 at fp+#-24, r21:20 at
+ fp+#-32, r19:18 at fp+#-40, and r17:16 at fp+#-48. */
+
+
+
+
+/* The compiler knows that the __save_* functions clobber LR. No other
+ registers should be used without informing the compiler. */
+
+/* Since we can only issue one store per packet, we don't hurt performance by
+ simply jumping to the right point in this sequence of stores. */
+
+FUNCTION_BEGIN __save_r27_through_r16
+ memd(fp+#-48) = r17:16
+FALLTHROUGH_TAIL_CALL __save_r27_through_r16 __save_r27_through_r18
+ memd(fp+#-40) = r19:18
+FALLTHROUGH_TAIL_CALL __save_r27_through_r18 __save_r27_through_r20
+ memd(fp+#-32) = r21:20
+FALLTHROUGH_TAIL_CALL __save_r27_through_r20 __save_r27_through_r22
+ memd(fp+#-24) = r23:22
+FALLTHROUGH_TAIL_CALL __save_r27_through_r22 __save_r27_through_r24
+ memd(fp+#-16) = r25:24
+ {
+ memd(fp+#-8) = r27:26
+ jumpr lr
+ }
+FUNCTION_END __save_r27_through_r24
+
+
+
+
+/* For each of the *_before_sibcall functions, jumpr lr is executed in parallel
+ with deallocframe. That way, the return gets the old value of lr, which is
+ where these functions need to return, and at the same time, lr gets the value
+ it needs going into the sibcall. */
+
+FUNCTION_BEGIN __restore_r27_through_r20_and_deallocframe_before_sibcall
+ {
+ r21:20 = memd(fp+#-32)
+ r23:22 = memd(fp+#-24)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe_before_sibcall __restore_r27_through_r24_and_deallocframe_before_sibcall
+ {
+ r25:24 = memd(fp+#-16)
+ jump __restore_r27_through_r26_and_deallocframe_before_sibcall
+ }
+FUNCTION_END __restore_r27_through_r24_and_deallocframe_before_sibcall
+
+
+
+
+FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe_before_sibcall
+ r17:16 = memd(fp+#-48)
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe_before_sibcall __restore_r27_through_r18_and_deallocframe_before_sibcall
+ {
+ r19:18 = memd(fp+#-40)
+ r21:20 = memd(fp+#-32)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe_before_sibcall __restore_r27_through_r22_and_deallocframe_before_sibcall
+ {
+ r23:22 = memd(fp+#-24)
+ r25:24 = memd(fp+#-16)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe_before_sibcall __restore_r27_through_r26_and_deallocframe_before_sibcall
+ {
+ r27:26 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r27_through_r26_and_deallocframe_before_sibcall
+
+
+
+
+/* Here we use the extra load bandwidth to restore LR early, allowing the return
+ to occur in parallel with the deallocframe. */
+
+FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe
+ {
+ r17:16 = memd(fp+#-48)
+ r19:18 = memd(fp+#-40)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe __restore_r27_through_r20_and_deallocframe
+ {
+ r21:20 = memd(fp+#-32)
+ r23:22 = memd(fp+#-24)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe __restore_r27_through_r24_and_deallocframe
+ {
+ lr = memw(fp+#4)
+ r25:24 = memd(fp+#-16)
+ }
+ {
+ r27:26 = memd(fp+#-8)
+ deallocframe
+ jumpr lr
+ }
+FUNCTION_END __restore_r27_through_r24_and_deallocframe
+
+
+
+
+/* Here the load bandwidth is maximized for all three functions. */
+
+FUNCTION_BEGIN __restore_r27_through_r18_and_deallocframe
+ {
+ r19:18 = memd(fp+#-40)
+ r21:20 = memd(fp+#-32)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe __restore_r27_through_r22_and_deallocframe
+ {
+ r23:22 = memd(fp+#-24)
+ r25:24 = memd(fp+#-16)
+ }
+FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe __restore_r27_through_r26_and_deallocframe
+ {
+ r27:26 = memd(fp+#-8)
+ deallocframe
+ }
+ jumpr lr
+FUNCTION_END __restore_r27_through_r26_and_deallocframe
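The *_before_sibcall variants let an epilogue feed straight into a tail call: deallocframe reloads lr for the eventual callee while jumpr lr returns to the call site in the outgoing function. In C terms, the pattern being optimized is an ordinary tail call (a hedged sketch; whether the compiler actually emits a sibcall depends on optimization settings):

    int helper(int);

    /* Hedged sketch: a tail call like this can compile to
       "call __restore_r27_through_r16_and_deallocframe_before_sibcall"
       followed by "jump helper", so helper() returns directly to
       outer()'s caller. */
    int outer(int x)
    {
        /* ... body that uses callee-saved registers ... */
        return helper(x + 1);   /* tail position: nothing after the call */
    }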
diff --git a/lib/builtins/hexagon/dfaddsub.S b/lib/builtins/hexagon/dfaddsub.S
new file mode 100644
index 000000000..4173f86a4
--- /dev/null
+++ b/lib/builtins/hexagon/dfaddsub.S
@@ -0,0 +1,398 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision Add/Subtract */
+
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+
+#define EXPA r4
+#define EXPB r5
+#define EXPB_A r5:4
+
+#define ZTMP r7:6
+#define ZTMPH r7
+#define ZTMPL r6
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define BTMP r9:8
+#define BTMPH r9
+#define BTMPL r8
+
+#define ATMP2 r11:10
+#define ATMP2H r11
+#define ATMP2L r10
+
+#define EXPDIFF r15
+#define EXTRACTOFF r14
+#define EXTRACTAMT r15:14
+
+#define TMP r28
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1024
+#define MANTISSA_TO_INT_BIAS 52
+#define SR_BIT_INEXACT 5
+
+#ifndef SR_ROUND_OFF
+#define SR_ROUND_OFF 22
+#endif
+
+#define NORMAL p3
+#define BIGB p2
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+ .text
+ .global __hexagon_adddf3
+ .global __hexagon_subdf3
+ .type __hexagon_adddf3, @function
+ .type __hexagon_subdf3, @function
+
+Q6_ALIAS(adddf3)
+FAST_ALIAS(adddf3)
+FAST2_ALIAS(adddf3)
+Q6_ALIAS(subdf3)
+FAST_ALIAS(subdf3)
+FAST2_ALIAS(subdf3)
+
+ .p2align 5
+__hexagon_adddf3:
+ {
+ EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
+ ATMP = combine(##0x20000000,#0)
+ }
+ {
+ NORMAL = dfclass(A,#2)
+ NORMAL = dfclass(B,#2)
+ BTMP = ATMP
+ BIGB = cmp.gtu(EXPB,EXPA) // Is B's exponent greater than A's?
+ }
+ {
+ if (!NORMAL) jump .Ladd_abnormal // If abnormal, go to special code
+ if (BIGB) A = B // if B >> A, swap A and B
+ if (BIGB) B = A // If B >> A, swap A and B
+ if (BIGB) EXPB_A = combine(EXPA,EXPB) // swap exponents
+ }
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-2) // Q1.62
+ BTMP = insert(B,#MANTBITS,#EXPBITS-2) // Q1.62
+ EXPDIFF = sub(EXPA,EXPB)
+ ZTMP = combine(#62,#1)
+ }
+#undef BIGB
+#undef NORMAL
+#define B_POS p3
+#define A_POS p2
+#define NO_STICKIES p1
+.Ladd_continue:
+ {
+ EXPDIFF = min(EXPDIFF,ZTMPH) // If exponent difference >= ~60,
+ // will collapse to sticky bit
+ ATMP2 = neg(ATMP)
+ A_POS = cmp.gt(AH,#-1)
+ EXTRACTOFF = #0
+ }
+ {
+ if (!A_POS) ATMP = ATMP2
+ ATMP2 = extractu(BTMP,EXTRACTAMT)
+ BTMP = ASR(BTMP,EXPDIFF)
+#undef EXTRACTAMT
+#undef EXPDIFF
+#undef EXTRACTOFF
+#define ZERO r15:14
+ ZERO = #0
+ }
+ {
+ NO_STICKIES = cmp.eq(ATMP2,ZERO)
+ if (!NO_STICKIES.new) BTMPL = or(BTMPL,ZTMPL)
+ EXPB = add(EXPA,#-BIAS-60)
+ B_POS = cmp.gt(BH,#-1)
+ }
+ {
+ ATMP = add(ATMP,BTMP) // ADD!!!
+ ATMP2 = sub(ATMP,BTMP) // Negate and ADD --> SUB!!!
+ ZTMP = combine(#54,##2045)
+ }
+ {
+ p0 = cmp.gtu(EXPA,ZTMPH) // must be pretty high in case of large cancellation
+ p0 = !cmp.gtu(EXPA,ZTMPL)
+ if (!p0.new) jump:nt .Ladd_ovf_unf
+ if (!B_POS) ATMP = ATMP2 // if B neg, pick difference
+ }
+ {
+ A = convert_d2df(ATMP) // Convert to Double Precision, taking care of flags, etc. So nice!
+ p0 = cmp.eq(ATMPH,#0)
+ p0 = cmp.eq(ATMPL,#0)
+ if (p0.new) jump:nt .Ladd_zero // or maybe conversion handles zero case correctly?
+ }
+ {
+ AH += asl(EXPB,#HI_MANTBITS)
+ jumpr r31
+ }
+ .falign
+__hexagon_subdf3:
+ {
+ BH = togglebit(BH,#31)
+ jump __qdsp_adddf3
+ }
+
+
+ .falign
+.Ladd_zero:
+ // True zero, full cancellation
+ // +0 unless round towards negative infinity
+ {
+ TMP = USR
+ A = #0
+ BH = #1
+ }
+ {
+ TMP = extractu(TMP,#2,#22)
+ BH = asl(BH,#31)
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = xor(AH,BH)
+ jumpr r31
+ }
+ .falign
+.Ladd_ovf_unf:
+ // Overflow or Denormal is possible
+ // Good news: Underflow flag is not possible!
+ /*
+ * ATMP has 2's complement value
+ *
+ * EXPA has A's exponent, EXPB has EXPA-BIAS-60
+ *
+ * Convert, extract exponent, add adjustment.
+ * If > 2046, overflow
+ * If <= 0, denormal
+ *
+ * Note that we've not done our zero check yet, so do that too
+ *
+ */
+ {
+ A = convert_d2df(ATMP)
+ p0 = cmp.eq(ATMPH,#0)
+ p0 = cmp.eq(ATMPL,#0)
+ if (p0.new) jump:nt .Ladd_zero
+ }
+ {
+ TMP = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ AH += asl(EXPB,#HI_MANTBITS)
+ }
+ {
+ EXPB = add(EXPB,TMP)
+ B = combine(##0x00100000,#0)
+ }
+ {
+ p0 = cmp.gt(EXPB,##BIAS+BIAS-2)
+ if (p0.new) jump:nt .Ladd_ovf
+ }
+ {
+ p0 = cmp.gt(EXPB,#0)
+ if (p0.new) jumpr:t r31
+ TMP = sub(#1,EXPB)
+ }
+ {
+ B = insert(A,#MANTBITS,#0)
+ A = ATMP
+ }
+ {
+ B = lsr(B,TMP)
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+ .falign
+.Ladd_ovf:
+ // We get either max finite value or infinity. Either way, overflow+inexact
+ {
+ A = ATMP // 2's complement value
+ TMP = USR
+ ATMP = combine(##0x7fefffff,#-1) // positive max finite
+ }
+ {
+ EXPB = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
+ TMP = or(TMP,#0x28) // inexact + overflow
+ BTMP = combine(##0x7ff00000,#0) // positive infinity
+ }
+ {
+ USR = TMP
+ EXPB ^= lsr(AH,#31) // Does sign match rounding?
+ TMP = EXPB // unmodified rounding mode
+ }
+ {
+ p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
+ p0 = !cmp.eq(EXPB,#2) // Not rounding the other way,
+ if (p0.new) ATMP = BTMP // we should get infinity
+ }
+ {
+ A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ jumpr r31
+ }
+
+.Ladd_abnormal:
+ {
+ ATMP = extractu(A,#63,#0) // strip off sign
+ BTMP = extractu(B,#63,#0) // strip off sign
+ }
+ {
+ p3 = cmp.gtu(ATMP,BTMP)
+ if (!p3.new) A = B // sort values
+ if (!p3.new) B = A // sort values
+ }
+ {
+ // Any NaN --> NaN, possibly raise invalid if sNaN
+ p0 = dfclass(A,#0x0f) // A not NaN?
+ if (!p0.new) jump:nt .Linvalid_nan_add
+ if (!p3) ATMP = BTMP
+ if (!p3) BTMP = ATMP
+ }
+ {
+ // Infinity + non-infinity number is infinity
+ // Infinity + infinity --> inf or nan
+ p1 = dfclass(A,#0x08) // A is infinity
+ if (p1.new) jump:nt .Linf_add
+ }
+ {
+ p2 = dfclass(B,#0x01) // B is zero
+ if (p2.new) jump:nt .LB_zero // so return A or special 0+0
+ ATMP = #0
+ }
+ // We are left with adding one or more subnormals
+ {
+ p0 = dfclass(A,#4)
+ if (p0.new) jump:nt .Ladd_two_subnormal
+ ATMP = combine(##0x20000000,#0)
+ }
+ {
+ EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXPB = #1
+ // BTMP already ABS(B)
+ BTMP = asl(BTMP,#EXPBITS-2)
+ }
+#undef ZERO
+#define EXTRACTOFF r14
+#define EXPDIFF r15
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-2)
+ EXPDIFF = sub(EXPA,EXPB)
+ ZTMP = combine(#62,#1)
+ jump .Ladd_continue
+ }
+
+.Ladd_two_subnormal:
+ {
+ ATMP = extractu(A,#63,#0)
+ BTMP = extractu(B,#63,#0)
+ }
+ {
+ ATMP = neg(ATMP)
+ BTMP = neg(BTMP)
+ p0 = cmp.gt(AH,#-1)
+ p1 = cmp.gt(BH,#-1)
+ }
+ {
+ if (p0) ATMP = A
+ if (p1) BTMP = B
+ }
+ {
+ ATMP = add(ATMP,BTMP)
+ }
+ {
+ BTMP = neg(ATMP)
+ p0 = cmp.gt(ATMPH,#-1)
+ B = #0
+ }
+ {
+ if (!p0) A = BTMP
+ if (p0) A = ATMP
+ BH = ##0x80000000
+ }
+ {
+ if (!p0) AH = or(AH,BH)
+ p0 = dfcmp.eq(A,B)
+ if (p0.new) jump:nt .Lzero_plus_zero
+ }
+ {
+ jumpr r31
+ }
+
+.Linvalid_nan_add:
+ {
+ TMP = convert_df2sf(A) // will generate invalid if sNaN
+ p0 = dfclass(B,#0x0f) // if B is not NaN
+ if (p0.new) B = A // make it whatever A is
+ }
+ {
+ BL = convert_df2sf(B) // will generate invalid if sNaN
+ A = #-1
+ jumpr r31
+ }
+ .falign
+.LB_zero:
+ {
+ p0 = dfcmp.eq(ATMP,A) // is A also zero?
+ if (!p0.new) jumpr:t r31 // If not, just return A
+ }
+ // 0 + 0 is special
+ // if equal integral values, they have the same sign, which is fine for all rounding
+ // modes.
+ // If unequal in sign, we get +0 for all rounding modes except round down
+.Lzero_plus_zero:
+ {
+ p0 = cmp.eq(A,B)
+ if (p0.new) jumpr:t r31
+ }
+ {
+ TMP = USR
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ A = #0
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ jumpr r31
+ }
+.Linf_add:
+ // adding infinities is only OK if they are equal
+ {
+ p0 = !cmp.eq(AH,BH) // Do they have different signs
+ p0 = dfclass(B,#8) // And is B also infinite?
+ if (!p0.new) jumpr:t r31 // If not, just a normal inf
+ }
+ {
+ BL = ##0x7f800001 // sNAN
+ }
+ {
+ A = convert_sf2df(BL) // trigger invalid, set NaN
+ jumpr r31
+ }
+END(__hexagon_adddf3)
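Two ideas in this file translate directly to C: __hexagon_subdf3 is __hexagon_adddf3 with B's sign bit toggled, and operand alignment in .Ladd_continue folds any shifted-out bits into a sticky bit so rounding still sees inexactness. A hedged C sketch of both (names other than __hexagon_adddf3 are illustrative):

    #include <stdint.h>
    #include <string.h>

    double __hexagon_adddf3(double, double);   /* the routine above */

    /* subdf3 = adddf3 with b negated: BH = togglebit(BH,#31), then a
       fall-through jump to __qdsp_adddf3. */
    static double subdf3_model(double a, double b)
    {
        uint64_t bits;
        memcpy(&bits, &b, sizeof bits);
        bits ^= 1ULL << 63;
        memcpy(&b, &bits, sizeof bits);
        return __hexagon_adddf3(a, b);
    }

    /* Alignment shift with a sticky LSB, as in
       ATMP2 = extractu(BTMP,EXTRACTAMT); BTMPL = or(BTMPL,ZTMPL). */
    static uint64_t sticky_shift_right(uint64_t x, unsigned n)
    {
        if (n == 0) return x;
        if (n > 63) return x != 0;            /* everything becomes sticky */
        uint64_t lost = x & ((1ULL << n) - 1);
        return (x >> n) | (lost != 0);
    }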
diff --git a/lib/builtins/hexagon/dfdiv.S b/lib/builtins/hexagon/dfdiv.S
new file mode 100644
index 000000000..0c5dbe272
--- /dev/null
+++ b/lib/builtins/hexagon/dfdiv.S
@@ -0,0 +1,492 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision Divide */
+
+#define A r1:0
+#define AH r1
+#define AL r0
+
+#define B r3:2
+#define BH r3
+#define BL r2
+
+#define Q r5:4
+#define QH r5
+#define QL r4
+
+#define PROD r7:6
+#define PRODHI r7
+#define PRODLO r6
+
+#define SFONE r8
+#define SFDEN r9
+#define SFERROR r10
+#define SFRECIP r11
+
+#define EXPBA r13:12
+#define EXPB r13
+#define EXPA r12
+
+#define REMSUB2 r15:14
+
+
+
+#define SIGN r28
+
+#define Q_POSITIVE p3
+#define NORMAL p2
+#define NO_OVF_UNF p1
+#define P_TMP p0
+
+#define RECIPEST_SHIFT 3
+#define QADJ 61
+
+#define DFCLASS_NORMAL 0x02
+#define DFCLASS_NUMBER 0x0F
+#define DFCLASS_INFINITE 0x08
+#define DFCLASS_ZERO 0x01
+#define DFCLASS_NONZERO (DFCLASS_NUMBER ^ DFCLASS_ZERO)
+#define DFCLASS_NONINFINITE (DFCLASS_NUMBER ^ DFCLASS_INFINITE)
+
+#define DF_MANTBITS 52
+#define DF_EXPBITS 11
+#define SF_MANTBITS 23
+#define SF_EXPBITS 8
+#define DF_BIAS 0x3ff
+
+#define SR_ROUND_OFF 22
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+ .text
+ .global __hexagon_divdf3
+ .type __hexagon_divdf3,@function
+ Q6_ALIAS(divdf3)
+ FAST_ALIAS(divdf3)
+ FAST2_ALIAS(divdf3)
+ .p2align 5
+__hexagon_divdf3:
+ {
+ NORMAL = dfclass(A,#DFCLASS_NORMAL)
+ NORMAL = dfclass(B,#DFCLASS_NORMAL)
+ EXPBA = combine(BH,AH)
+ SIGN = xor(AH,BH)
+ }
+#undef A
+#undef AH
+#undef AL
+#undef B
+#undef BH
+#undef BL
+#define REM r1:0
+#define REMHI r1
+#define REMLO r0
+#define DENOM r3:2
+#define DENOMHI r3
+#define DENOMLO r2
+ {
+ if (!NORMAL) jump .Ldiv_abnormal
+ PROD = extractu(DENOM,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS)
+ SFONE = ##0x3f800001
+ }
+ {
+ SFDEN = or(SFONE,PRODLO)
+ EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32)
+ EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32)
+ Q_POSITIVE = cmp.gt(SIGN,#-1)
+ }
+#undef SIGN
+#define ONE r28
+.Ldenorm_continue:
+ {
+ SFRECIP,P_TMP = sfrecipa(SFONE,SFDEN)
+ SFERROR = and(SFONE,#-2)
+ ONE = #1
+ EXPA = sub(EXPA,EXPB)
+ }
+#undef EXPB
+#define RECIPEST r13
+ {
+ SFERROR -= sfmpy(SFRECIP,SFDEN):lib
+ REMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)
+ RECIPEST = ##0x00800000 << RECIPEST_SHIFT
+ }
+ {
+ SFRECIP += sfmpy(SFRECIP,SFERROR):lib
+ DENOMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)
+ SFERROR = and(SFONE,#-2)
+ }
+ {
+ SFERROR -= sfmpy(SFRECIP,SFDEN):lib
+ QH = #-DF_BIAS+1
+ QL = #DF_BIAS-1
+ }
+ {
+ SFRECIP += sfmpy(SFRECIP,SFERROR):lib
+ NO_OVF_UNF = cmp.gt(EXPA,QH)
+ NO_OVF_UNF = !cmp.gt(EXPA,QL)
+ }
+ {
+ RECIPEST = insert(SFRECIP,#SF_MANTBITS,#RECIPEST_SHIFT)
+ Q = #0
+ EXPA = add(EXPA,#-QADJ)
+ }
+#undef SFERROR
+#undef SFRECIP
+#define TMP r10
+#define TMP1 r11
+ {
+ RECIPEST = add(RECIPEST,#((-3) << RECIPEST_SHIFT))
+ }
+
+#define DIV_ITER1B(QSHIFTINSN,QSHIFT,REMSHIFT,EXTRA) \
+ { \
+ PROD = mpyu(RECIPEST,REMHI); \
+ REM = asl(REM,# ## ( REMSHIFT )); \
+ }; \
+ { \
+ PRODLO = # ## 0; \
+ REM -= mpyu(PRODHI,DENOMLO); \
+ REMSUB2 = mpyu(PRODHI,DENOMHI); \
+ }; \
+ { \
+ Q += QSHIFTINSN(PROD, # ## ( QSHIFT )); \
+ REM -= asl(REMSUB2, # ## 32); \
+ EXTRA \
+ }
+
+
+ DIV_ITER1B(ASL,14,15,)
+ DIV_ITER1B(ASR,1,15,)
+ DIV_ITER1B(ASR,16,15,)
+ DIV_ITER1B(ASR,31,15,PROD=# ( 0 );)
+
+#undef REMSUB2
+#define TMPPAIR r15:14
+#define TMPPAIRHI r15
+#define TMPPAIRLO r14
+#undef RECIPEST
+#define EXPB r13
+ {
+ // compare or sub with carry
+ TMPPAIR = sub(REM,DENOM)
+ P_TMP = cmp.gtu(DENOM,REM)
+ // set up amt to add to q
+ if (!P_TMP.new) PRODLO = #2
+ }
+ {
+ Q = add(Q,PROD)
+ if (!P_TMP) REM = TMPPAIR
+ TMPPAIR = #0
+ }
+ {
+ P_TMP = cmp.eq(REM,TMPPAIR)
+ if (!P_TMP.new) QL = or(QL,ONE)
+ }
+ {
+ PROD = neg(Q)
+ }
+ {
+ if (!Q_POSITIVE) Q = PROD
+ }
+#undef REM
+#undef REMHI
+#undef REMLO
+#undef DENOM
+#undef DENOMLO
+#undef DENOMHI
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+ {
+ A = convert_d2df(Q)
+ if (!NO_OVF_UNF) jump .Ldiv_ovf_unf
+ }
+ {
+ AH += asl(EXPA,#DF_MANTBITS-32)
+ jumpr r31
+ }
+
+.Ldiv_ovf_unf:
+ {
+ AH += asl(EXPA,#DF_MANTBITS-32)
+ EXPB = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
+ }
+ {
+ PROD = abs(Q)
+ EXPA = add(EXPA,EXPB)
+ }
+ {
+ P_TMP = cmp.gt(EXPA,##DF_BIAS+DF_BIAS) // overflow
+ if (P_TMP.new) jump:nt .Ldiv_ovf
+ }
+ {
+ P_TMP = cmp.gt(EXPA,#0)
+ if (P_TMP.new) jump:nt .Lpossible_unf // round up to normal possible...
+ }
+ /* Underflow */
+ /* We know what the infinite range exponent should be (EXPA) */
+ /* Q is 2's complement, PROD is abs(Q) */
+ /* Normalize Q, shift right, add a high bit, convert, change exponent */
+
+#define FUDGE1 7 // how much to shift right
+#define FUDGE2 4 // how many guard/round to keep at lsbs
+
+ {
+ EXPB = add(clb(PROD),#-1) // doesn't need to be added in since
+ EXPA = sub(#FUDGE1,EXPA) // we extract post-converted exponent
+ TMP = USR
+ TMP1 = #63
+ }
+ {
+ EXPB = min(EXPA,TMP1)
+ TMP1 = or(TMP,#0x030)
+ PROD = asl(PROD,EXPB)
+ EXPA = #0
+ }
+ {
+ TMPPAIR = extractu(PROD,EXPBA) // bits that will get shifted out
+ PROD = lsr(PROD,EXPB) // shift out bits
+ B = #1
+ }
+ {
+ P_TMP = cmp.gtu(B,TMPPAIR)
+ if (!P_TMP.new) PRODLO = or(BL,PRODLO)
+ PRODHI = setbit(PRODHI,#DF_MANTBITS-32+FUDGE2)
+ }
+ {
+ Q = neg(PROD)
+ P_TMP = bitsclr(PRODLO,#(1<<FUDGE2)-1)
+ if (!P_TMP.new) TMP = TMP1
+ }
+ {
+ USR = TMP
+ if (Q_POSITIVE) Q = PROD
+ TMP = #-DF_BIAS-(DF_MANTBITS+FUDGE2)
+ }
+ {
+ A = convert_d2df(Q)
+ }
+ {
+ AH += asl(TMP,#DF_MANTBITS-32)
+ jumpr r31
+ }
+
+
+.Lpossible_unf:
+ /* If upper parts of Q were all F's, but abs(A) == 0x00100000_00000000, we rounded up to min_normal */
+ /* The answer is correct, but we need to raise Underflow */
+ {
+ B = extractu(A,#63,#0)
+ TMPPAIR = combine(##0x00100000,#0) // min normal
+ TMP = #0x7FFF
+ }
+ {
+ P_TMP = dfcmp.eq(TMPPAIR,B) // Is everything zero in the rounded value...
+ P_TMP = bitsset(PRODHI,TMP) // but a bunch of bits set in the unrounded abs(quotient)?
+ }
+
+#if (__HEXAGON_ARCH__ == 60)
+ TMP = USR // If not, just return
+ if (!P_TMP) jumpr r31 // Else, we want to set Unf+Inexact
+ // Note that inexact is already set...
+#else
+ {
+ if (!P_TMP) jumpr r31 // If not, just return
+ TMP = USR // Else, we want to set Unf+Inexact
+ } // Note that inexact is already set...
+#endif
+ {
+ TMP = or(TMP,#0x30)
+ }
+ {
+ USR = TMP
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ jumpr r31
+ }
+
+.Ldiv_ovf:
+ /*
+ * Raise Overflow, and choose the correct overflow value (saturated normal or infinity)
+ */
+ {
+ TMP = USR
+ B = combine(##0x7fefffff,#-1)
+ AH = mux(Q_POSITIVE,#0,#-1)
+ }
+ {
+ PROD = combine(##0x7ff00000,#0)
+ QH = extractu(TMP,#2,#SR_ROUND_OFF)
+ TMP = or(TMP,#0x28)
+ }
+ {
+ USR = TMP
+ QH ^= lsr(AH,#31)
+ QL = QH
+ }
+ {
+ p0 = !cmp.eq(QL,#1) // if not round-to-zero
+ p0 = !cmp.eq(QH,#2) // and not rounding the other way
+ if (p0.new) B = PROD // go to inf
+ p0 = dfcmp.eq(B,B) // get exceptions
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+
+#undef ONE
+#define SIGN r28
+#undef NORMAL
+#undef NO_OVF_UNF
+#define P_INF p1
+#define P_ZERO p2
+.Ldiv_abnormal:
+ {
+ P_TMP = dfclass(A,#DFCLASS_NUMBER)
+ P_TMP = dfclass(B,#DFCLASS_NUMBER)
+ Q_POSITIVE = cmp.gt(SIGN,#-1)
+ }
+ {
+ P_INF = dfclass(A,#DFCLASS_INFINITE)
+ P_INF = dfclass(B,#DFCLASS_INFINITE)
+ }
+ {
+ P_ZERO = dfclass(A,#DFCLASS_ZERO)
+ P_ZERO = dfclass(B,#DFCLASS_ZERO)
+ }
+ {
+ if (!P_TMP) jump .Ldiv_nan
+ if (P_INF) jump .Ldiv_invalid
+ }
+ {
+ if (P_ZERO) jump .Ldiv_invalid
+ }
+ {
+ P_ZERO = dfclass(A,#DFCLASS_NONZERO) // nonzero
+ P_ZERO = dfclass(B,#DFCLASS_NONINFINITE) // non-infinite
+ }
+ {
+ P_INF = dfclass(A,#DFCLASS_NONINFINITE) // non-infinite
+ P_INF = dfclass(B,#DFCLASS_NONZERO) // nonzero
+ }
+ {
+ if (!P_ZERO) jump .Ldiv_zero_result
+ if (!P_INF) jump .Ldiv_inf_result
+ }
+ /* Now we've narrowed it down to (de)normal / (de)normal */
+ /* Set up A/EXPA B/EXPB and go back */
+#undef P_ZERO
+#undef P_INF
+#define P_TMP2 p1
+ {
+ P_TMP = dfclass(A,#DFCLASS_NORMAL)
+ P_TMP2 = dfclass(B,#DFCLASS_NORMAL)
+ TMP = ##0x00100000
+ }
+ {
+ EXPBA = combine(BH,AH)
+ AH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit
+ BH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit
+ }
+ {
+ if (P_TMP) AH = or(AH,TMP) // if normal, add back in hidden bit
+ if (P_TMP2) BH = or(BH,TMP) // if normal, add back in hidden bit
+ }
+ {
+ QH = add(clb(A),#-DF_EXPBITS)
+ QL = add(clb(B),#-DF_EXPBITS)
+ TMP = #1
+ }
+ {
+ EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32)
+ EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32)
+ }
+ {
+ A = asl(A,QH)
+ B = asl(B,QL)
+ if (!P_TMP) EXPA = sub(TMP,QH)
+ if (!P_TMP2) EXPB = sub(TMP,QL)
+ } // recreate values needed by the resumed code
+ {
+ PROD = extractu(B,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS)
+ }
+ {
+ SFDEN = or(SFONE,PRODLO)
+ jump .Ldenorm_continue
+ }
+
+.Ldiv_zero_result:
+ {
+ AH = xor(AH,BH)
+ B = #0
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+.Ldiv_inf_result:
+ {
+ p2 = dfclass(B,#DFCLASS_ZERO)
+ p2 = dfclass(A,#DFCLASS_NONINFINITE)
+ }
+ {
+ TMP = USR
+ if (!p2) jump 1f
+ AH = xor(AH,BH)
+ }
+ {
+ TMP = or(TMP,#0x04) // DBZ
+ }
+ {
+ USR = TMP
+ }
+1:
+ {
+ B = combine(##0x7ff00000,#0)
+ p0 = dfcmp.uo(B,B) // take possible exception
+ }
+ {
+ A = insert(B,#63,#0)
+ jumpr r31
+ }
+.Ldiv_nan:
+ {
+ p0 = dfclass(A,#0x10)
+ p1 = dfclass(B,#0x10)
+ if (!p0.new) A = B
+ if (!p1.new) B = A
+ }
+ {
+ QH = convert_df2sf(A) // get possible invalid exceptions
+ QL = convert_df2sf(B)
+ }
+ {
+ A = #-1
+ jumpr r31
+ }
+
+.Ldiv_invalid:
+ {
+ TMP = ##0x7f800001
+ }
+ {
+ A = convert_sf2df(TMP) // get invalid, get DF qNaN
+ jumpr r31
+ }
+END(__hexagon_divdf3)
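The division core seeds a single-precision reciprocal with sfrecipa and sharpens it with two multiply-accumulate refinement steps (the sfmpy ...:lib pairs in .Ldenorm_continue) before the fixed-point quotient iterations. A hedged C sketch of that refinement (recip_seed is a hypothetical stand-in for sfrecipa):

    /* Hedged sketch of the Newton-Raphson refinement: each round
       roughly doubles the number of correct reciprocal bits. */
    float recip_seed(float den);   /* hypothetical stand-in for sfrecipa */

    static float refine_reciprocal(float den)
    {
        float recip = recip_seed(den);
        for (int i = 0; i < 2; i++) {
            float err = 1.0f - den * recip;  /* SFERROR -= sfmpy(SFRECIP,SFDEN) */
            recip = recip + recip * err;     /* SFRECIP += sfmpy(SFRECIP,SFERROR) */
        }
        return recip;
    }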
diff --git a/lib/builtins/hexagon/dffma.S b/lib/builtins/hexagon/dffma.S
new file mode 100644
index 000000000..97b885a3b
--- /dev/null
+++ b/lib/builtins/hexagon/dffma.S
@@ -0,0 +1,705 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+/* Double Precision Fused Multiply-Add */
+
+
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+#define C r5:4
+#define CH r5
+#define CL r4
+
+
+
+#define BTMP r15:14
+#define BTMPH r15
+#define BTMPL r14
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define CTMP r11:10
+#define CTMPH r11
+#define CTMPL r10
+
+#define PP_LL r9:8
+#define PP_LL_H r9
+#define PP_LL_L r8
+
+#define PP_ODD r7:6
+#define PP_ODD_H r7
+#define PP_ODD_L r6
+
+
+#define PP_HH r17:16
+#define PP_HH_H r17
+#define PP_HH_L r16
+
+#define EXPA r18
+#define EXPB r19
+#define EXPBA r19:18
+
+#define TMP r28
+
+#define P_TMP p0
+#define PROD_NEG p3
+#define EXACT p2
+#define SWAP p1
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1023
+#define STACKSPACE 32
+
+#define ADJUST 4
+
+#define FUDGE 7
+#define FUDGE2 3
+
+#ifndef SR_ROUND_OFF
+#define SR_ROUND_OFF 22
+#endif
+
+ /*
+ * First, classify for normal values, and abort if abnormal
+ *
+ * Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
+ *
+ * Since we know that the 2 MSBs of the H registers are zero, we should never carry
+ * the partial products that involve the H registers
+ *
+ * Try to buy X slots, at the expense of latency if needed
+ *
+ * We will have PP_HH with the upper bits of the product, PP_LL with the lower
+ * PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
+ * PP_HH can have a minimum of 0x0100_0000_0000_0000
+ *
+ * 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
+ *
+ * We need to align CTMP.
+ * If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
+ * If CTMP << PP align CTMP and add 128 bits. Then compute sticky
+ * If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
+ *
+ * Convert partial product and CTMP to 2's complement prior to addition
+ *
+ * After we add, we need to normalize into upper 64 bits, then compute sticky.
+ *
+ *
+ */
+
+ .text
+ .global __hexagon_fmadf4
+ .type __hexagon_fmadf4,@function
+ .global __hexagon_fmadf5
+ .type __hexagon_fmadf5,@function
+ .global fma
+ .type fma,@function
+ Q6_ALIAS(fmadf5)
+ .p2align 5
+__hexagon_fmadf4:
+__hexagon_fmadf5:
+fma:
+ {
+ P_TMP = dfclass(A,#2)
+ P_TMP = dfclass(B,#2)
+ ATMP = #0
+ BTMP = #0
+ }
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-3)
+ BTMP = insert(B,#MANTBITS,#EXPBITS-3)
+ PP_ODD_H = ##0x10000000
+ allocframe(#STACKSPACE)
+ }
+ {
+ PP_LL = mpyu(ATMPL,BTMPL)
+ if (!P_TMP) jump .Lfma_abnormal_ab
+ ATMPH = or(ATMPH,PP_ODD_H)
+ BTMPH = or(BTMPH,PP_ODD_H)
+ }
+ {
+ P_TMP = dfclass(C,#2)
+ if (!P_TMP.new) jump:nt .Lfma_abnormal_c
+ CTMP = combine(PP_ODD_H,#0)
+ PP_ODD = combine(#0,PP_LL_H)
+ }
+.Lfma_abnormal_c_restart:
+ {
+ PP_ODD += mpyu(BTMPL,ATMPH)
+ CTMP = insert(C,#MANTBITS,#EXPBITS-3)
+ memd(r29+#0) = PP_HH
+ memd(r29+#8) = EXPBA
+ }
+ {
+ PP_ODD += mpyu(ATMPL,BTMPH)
+ EXPBA = neg(CTMP)
+ P_TMP = cmp.gt(CH,#-1)
+ TMP = xor(AH,BH)
+ }
+ {
+ EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
+ PP_HH = combine(#0,PP_ODD_H)
+ if (!P_TMP) CTMP = EXPBA
+ }
+ {
+ PP_HH += mpyu(ATMPH,BTMPH)
+ PP_LL = combine(PP_ODD_L,PP_LL_L)
+#undef PP_ODD
+#undef PP_ODD_H
+#undef PP_ODD_L
+#undef ATMP
+#undef ATMPL
+#undef ATMPH
+#undef BTMP
+#undef BTMPL
+#undef BTMPH
+#define RIGHTLEFTSHIFT r13:12
+#define RIGHTSHIFT r13
+#define LEFTSHIFT r12
+
+ EXPA = add(EXPA,EXPB)
+#undef EXPB
+#undef EXPBA
+#define EXPC r19
+#define EXPCA r19:18
+ EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
+ }
+ /* PP_HH:PP_LL now has product */
+ /* CTMP is negated */
+ /* EXPA,B,C are extracted */
+ /*
+ * We need to negate PP
+ * Since we will be adding with carry later, if we need to negate,
+ * just invert all bits now, which we can do conditionally and in parallel
+ */
+#define PP_HH_TMP r15:14
+#define PP_LL_TMP r7:6
+ {
+ EXPA = add(EXPA,#-BIAS+(ADJUST))
+ PROD_NEG = !cmp.gt(TMP,#-1)
+ PP_LL_TMP = #0
+ PP_HH_TMP = #0
+ }
+ {
+ PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
+ P_TMP = !cmp.gt(TMP,#-1)
+ SWAP = cmp.gt(EXPC,EXPA) // If C >> PP
+ if (SWAP.new) EXPCA = combine(EXPA,EXPC)
+ }
+ {
+ PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
+ if (P_TMP) PP_LL = PP_LL_TMP
+#undef PP_LL_TMP
+#define CTMP2 r7:6
+#define CTMP2H r7
+#define CTMP2L r6
+ CTMP2 = #0
+ EXPC = sub(EXPA,EXPC)
+ }
+ {
+ if (P_TMP) PP_HH = PP_HH_TMP
+ P_TMP = cmp.gt(EXPC,#63)
+ if (SWAP) PP_LL = CTMP2
+ if (SWAP) CTMP2 = PP_LL
+ }
+#undef PP_HH_TMP
+//#define ONE r15:14
+//#define S_ONE r14
+#define ZERO r15:14
+#define S_ZERO r15
+#undef PROD_NEG
+#define P_CARRY p3
+ {
+ if (SWAP) PP_HH = CTMP // Swap C and PP
+ if (SWAP) CTMP = PP_HH
+ if (P_TMP) EXPC = add(EXPC,#-64)
+ TMP = #63
+ }
+ {
+ // If diff > 63, pre-shift-right by 64...
+ if (P_TMP) CTMP2 = CTMP
+ TMP = asr(CTMPH,#31)
+ RIGHTSHIFT = min(EXPC,TMP)
+ LEFTSHIFT = #0
+ }
+#undef C
+#undef CH
+#undef CL
+#define STICKIES r5:4
+#define STICKIESH r5
+#define STICKIESL r4
+ {
+ if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64
+ STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
+ CTMP2 = lsr(CTMP2,RIGHTSHIFT)
+ LEFTSHIFT = sub(#64,RIGHTSHIFT)
+ }
+ {
+ ZERO = #0
+ TMP = #-2
+ CTMP2 |= lsl(CTMP,LEFTSHIFT)
+ CTMP = asr(CTMP,RIGHTSHIFT)
+ }
+ {
+ P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift
+ if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
+#undef ZERO
+#define ONE r15:14
+#define S_ONE r14
+ ONE = #1
+ STICKIES = #0
+ }
+ {
+ PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky
+ }
+ {
+ PP_HH = add(CTMP,PP_HH,P_CARRY):carry
+ TMP = #62
+ }
+ /*
+ * PP_HH:PP_LL now holds the sum
+ * We may need to normalize left, up to ??? bits.
+ *
+ * I think that if we have massive cancellation, the range we normalize by
+ * is still limited
+ */
+ {
+ LEFTSHIFT = add(clb(PP_HH),#-2)
+ if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
+ }
+ /* We had all sign bits, shift left by 62. */
+ {
+ CTMP = extractu(PP_LL,#62,#2)
+ PP_LL = asl(PP_LL,#62)
+ EXPA = add(EXPA,#-62) // And adjust exponent of result
+ }
+ {
+ PP_HH = insert(CTMP,#62,#0) // Then shift 63
+ }
+ {
+ LEFTSHIFT = add(clb(PP_HH),#-2)
+ }
+ .falign
+1:
+ {
+ CTMP = asl(PP_HH,LEFTSHIFT)
+ STICKIES |= asl(PP_LL,LEFTSHIFT)
+ RIGHTSHIFT = sub(#64,LEFTSHIFT)
+ EXPA = sub(EXPA,LEFTSHIFT)
+ }
+ {
+ CTMP |= lsr(PP_LL,RIGHTSHIFT)
+ EXACT = cmp.gtu(ONE,STICKIES)
+ TMP = #BIAS+BIAS-2
+ }
+ {
+ if (!EXACT) CTMPL = or(CTMPL,S_ONE)
+ // If EXPA is overflow/underflow, jump to ovf_unf
+ P_TMP = !cmp.gt(EXPA,TMP)
+ P_TMP = cmp.gt(EXPA,#1)
+ if (!P_TMP.new) jump:nt .Lfma_ovf_unf
+ }
+ {
+ // XXX: FIXME: should PP_HH for check of zero be CTMP?
+ P_TMP = cmp.gtu(ONE,CTMP) // is result true zero?
+ A = convert_d2df(CTMP)
+ EXPA = add(EXPA,#-BIAS-60)
+ PP_HH = memd(r29+#0)
+ }
+ {
+ AH += asl(EXPA,#HI_MANTBITS)
+ EXPCA = memd(r29+#8)
+ if (!P_TMP) dealloc_return // not zero, return
+ }
+.Ladd_yields_zero:
+ /* We had full cancellation. Return +/- zero (-0 when round-down) */
+ {
+ TMP = USR
+ A = #0
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ PP_HH = memd(r29+#0)
+ EXPCA = memd(r29+#8)
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ dealloc_return
+ }
+
+#undef RIGHTLEFTSHIFT
+#undef RIGHTSHIFT
+#undef LEFTSHIFT
+#undef CTMP2
+#undef CTMP2H
+#undef CTMP2L
+
+.Lfma_ovf_unf:
+ {
+ p0 = cmp.gtu(ONE,CTMP)
+ if (p0.new) jump:nt .Ladd_yields_zero
+ }
+ {
+ A = convert_d2df(CTMP)
+ EXPA = add(EXPA,#-BIAS-60)
+ TMP = EXPA
+ }
+#define NEW_EXPB r7
+#define NEW_EXPA r6
+ {
+ AH += asl(EXPA,#HI_MANTBITS)
+ NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ }
+ {
+ NEW_EXPA = add(EXPA,NEW_EXPB)
+ PP_HH = memd(r29+#0)
+ EXPCA = memd(r29+#8)
+#undef PP_HH
+#undef PP_HH_H
+#undef PP_HH_L
+#undef EXPCA
+#undef EXPC
+#undef EXPA
+#undef PP_LL
+#undef PP_LL_H
+#undef PP_LL_L
+#define EXPA r6
+#define EXPB r7
+#define EXPBA r7:6
+#define ATMP r9:8
+#define ATMPH r9
+#define ATMPL r8
+#undef NEW_EXPB
+#undef NEW_EXPA
+ ATMP = abs(CTMP)
+ }
+ {
+ p0 = cmp.gt(EXPA,##BIAS+BIAS)
+ if (p0.new) jump:nt .Lfma_ovf
+ }
+ {
+ p0 = cmp.gt(EXPA,#0)
+ if (p0.new) jump:nt .Lpossible_unf
+ }
+ {
+ // TMP has original EXPA.
+ // ATMP is corresponding value
+ // Normalize ATMP and shift right to correct location
+ EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize
+ EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize
+ p3 = cmp.gt(CTMPH,#-1)
+ }
+ /* Underflow */
+ /* We know that the infinite range exponent should be EXPA */
+ /* CTMP is 2's complement, ATMP is abs(CTMP) */
+ {
+ EXPA = add(EXPA,EXPB) // how much to shift back right
+ ATMP = asl(ATMP,EXPB) // shift left
+ AH = USR
+ TMP = #63
+ }
+ {
+ EXPB = min(EXPA,TMP)
+ EXPA = #0
+ AL = #0x0030
+ }
+ {
+ B = extractu(ATMP,EXPBA)
+ ATMP = asr(ATMP,EXPB)
+ }
+ {
+ p0 = cmp.gtu(ONE,B)
+ if (!p0.new) ATMPL = or(ATMPL,S_ONE)
+ ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
+ }
+ {
+ CTMP = neg(ATMP)
+ p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
+ if (!p1.new) AH = or(AH,AL)
+ B = #0
+ }
+ {
+ if (p3) CTMP = ATMP
+ USR = AH
+ TMP = #-BIAS-(MANTBITS+FUDGE2)
+ }
+ {
+ A = convert_d2df(CTMP)
+ }
+ {
+ AH += asl(TMP,#HI_MANTBITS)
+ dealloc_return
+ }
+.Lpossible_unf:
+ {
+ TMP = ##0x7fefffff
+ ATMP = abs(CTMP)
+ }
+ {
+ p0 = cmp.eq(AL,#0)
+ p0 = bitsclr(AH,TMP)
+ if (!p0.new) dealloc_return:t
+ TMP = #0x7fff
+ }
+ {
+ p0 = bitsset(ATMPH,TMP)
+ BH = USR
+ BL = #0x0030
+ }
+ {
+ if (p0) BH = or(BH,BL)
+ }
+ {
+ USR = BH
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ dealloc_return
+ }
+.Lfma_ovf:
+ {
+ TMP = USR
+ CTMP = combine(##0x7fefffff,#-1)
+ A = CTMP
+ }
+ {
+ ATMP = combine(##0x7ff00000,#0)
+ BH = extractu(TMP,#2,#SR_ROUND_OFF)
+ TMP = or(TMP,#0x28)
+ }
+ {
+ USR = TMP
+ BH ^= lsr(AH,#31)
+ BL = BH
+ }
+ {
+ p0 = !cmp.eq(BL,#1)
+ p0 = !cmp.eq(BH,#2)
+ }
+ {
+ p0 = dfcmp.eq(ATMP,ATMP)
+ if (p0.new) CTMP = ATMP
+ }
+ {
+ A = insert(CTMP,#63,#0)
+ dealloc_return
+ }
+#undef CTMP
+#undef CTMPH
+#undef CTMPL
+#define BTMP r11:10
+#define BTMPH r11
+#define BTMPL r10
+
+#undef STICKIES
+#undef STICKIESH
+#undef STICKIESL
+#define C r5:4
+#define CH r5
+#define CL r4
+
+.Lfma_abnormal_ab:
+ {
+ ATMP = extractu(A,#63,#0)
+ BTMP = extractu(B,#63,#0)
+ deallocframe
+ }
+ {
+ p3 = cmp.gtu(ATMP,BTMP)
+ if (!p3.new) A = B // sort values
+ if (!p3.new) B = A
+ }
+ {
+ p0 = dfclass(A,#0x0f) // Is A not NaN?
+ if (!p0.new) jump:nt .Lnan
+ if (!p3) ATMP = BTMP
+ if (!p3) BTMP = ATMP
+ }
+ {
+ p1 = dfclass(A,#0x08) // A is infinity
+ p1 = dfclass(B,#0x0e) // B is nonzero
+ }
+ {
+ p0 = dfclass(A,#0x08) // a is inf
+ p0 = dfclass(B,#0x01) // b is zero
+ }
+ {
+ if (p1) jump .Lab_inf
+ p2 = dfclass(B,#0x01)
+ }
+ {
+ if (p0) jump .Linvalid
+ if (p2) jump .Lab_true_zero
+ TMP = ##0x7c000000
+ }
+ // We are left with a normal or subnormal times a subnormal, A > B.
+ // If A and B are both very small, the product collapses to a single sticky
+ // bit; replace the lower 63 bits of A and B with 0x0010_0000_0000_0000,
+ // which yields equivalent results.
+ // If A and B might multiply to something bigger, decrease A's exponent and
+ // increase B's exponent, then start over.
+ {
+ p0 = bitsclr(AH,TMP)
+ if (p0.new) jump:nt .Lfma_ab_tiny
+ }
+ {
+ TMP = add(clb(BTMP),#-EXPBITS)
+ }
+ {
+ BTMP = asl(BTMP,TMP)
+ }
+ {
+ B = insert(BTMP,#63,#0)
+ AH -= asl(TMP,#HI_MANTBITS)
+ }
+ jump fma
+
+.Lfma_ab_tiny:
+ ATMP = combine(##0x00100000,#0)
+ {
+ A = insert(ATMP,#63,#0)
+ B = insert(ATMP,#63,#0)
+ }
+ jump fma
+
+.Lab_inf:
+ {
+ B = lsr(B,#63)
+ p0 = dfclass(C,#0x10)
+ }
+ {
+ A ^= asl(B,#63)
+ if (p0) jump .Lnan
+ }
+ {
+ p1 = dfclass(C,#0x08)
+ if (p1.new) jump:nt .Lfma_inf_plus_inf
+ }
+ /* A*B is +/- inf, C is finite. Return A */
+ {
+ jumpr r31
+ }
+ .falign
+.Lfma_inf_plus_inf:
+ { // adding infinities of different signs is invalid
+ p0 = dfcmp.eq(A,C)
+ if (!p0.new) jump:nt .Linvalid
+ }
+ {
+ jumpr r31
+ }
+
+.Lnan:
+ {
+ p0 = dfclass(B,#0x10)
+ p1 = dfclass(C,#0x10)
+ if (!p0.new) B = A
+ if (!p1.new) C = A
+ }
+ { // find sNaNs
+ BH = convert_df2sf(B)
+ BL = convert_df2sf(C)
+ }
+ {
+ BH = convert_df2sf(A)
+ A = #-1
+ jumpr r31
+ }
+
+.Linvalid:
+ {
+ TMP = ##0x7f800001 // sp snan
+ }
+ {
+ A = convert_sf2df(TMP)
+ jumpr r31
+ }
+
+.Lab_true_zero:
+ // B is zero, A is finite number
+ {
+ p0 = dfclass(C,#0x10)
+ if (p0.new) jump:nt .Lnan
+ if (p0.new) A = C
+ }
+ {
+ p0 = dfcmp.eq(B,C) // is C also zero?
+ AH = lsr(AH,#31) // get sign
+ }
+ {
+ BH ^= asl(AH,#31) // form correctly signed zero in B
+ if (!p0) A = C // If C is not zero, return C
+ if (!p0) jumpr r31
+ }
+ /* B has correctly signed zero, C is also zero */
+.Lzero_plus_zero:
+ {
+ p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0
+ if (p0.new) jumpr:t r31
+ A = B
+ }
+ {
+ TMP = USR
+ }
+ {
+ TMP = extractu(TMP,#2,#SR_ROUND_OFF)
+ A = #0
+ }
+ {
+ p0 = cmp.eq(TMP,#2)
+ if (p0.new) AH = ##0x80000000
+ jumpr r31
+ }
+#undef BTMP
+#undef BTMPH
+#undef BTMPL
+#define CTMP r11:10
+ .falign
+.Lfma_abnormal_c:
+ /* We know that AB is normal * normal */
+ /* C is not normal: zero, subnormal, inf, or NaN. */
+ {
+ p0 = dfclass(C,#0x10) // is C NaN?
+ if (p0.new) jump:nt .Lnan
+ if (p0.new) A = C // move NaN to A
+ deallocframe
+ }
+ {
+ p0 = dfclass(C,#0x08) // is C inf?
+ if (p0.new) A = C // return C
+ if (p0.new) jumpr:nt r31
+ }
+ // zero or subnormal
+ // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
+ {
+ p0 = dfclass(C,#0x01) // is C zero?
+ if (p0.new) jump:nt __hexagon_muldf3
+ TMP = #1
+ }
+ // Left with: subnormal
+ // Adjust C and jump back to restart
+ {
+ allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame
+ CTMP = #0
+ CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
+ jump .Lfma_abnormal_c_restart
+ }
+END(fma)
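The mantissa product here is a 64x64-to-128-bit multiply built from four 32x32 mpyu products; the cross terms accumulate in PP_ODD and spill into PP_HH/PP_LL, and the header comment's observation that the operands' top bits are clear is what guarantees those accumulations never carry out. A hedged C model of the decomposition:

    #include <stdint.h>

    /* Hedged C model of the PP_LL/PP_ODD/PP_HH partial-product scheme.
       No accumulation overflows because a_hi and b_hi have their top
       bits clear for the mantissa values used here. */
    static void mul64x64_128(uint64_t am, uint64_t bm,
                             uint64_t *hi, uint64_t *lo)
    {
        uint64_t a_lo = (uint32_t)am, a_hi = am >> 32;
        uint64_t b_lo = (uint32_t)bm, b_hi = bm >> 32;

        uint64_t pp_ll  = a_lo * b_lo;                   /* mpyu(ATMPL,BTMPL) */
        uint64_t pp_odd = (pp_ll >> 32)                  /* combine(#0,PP_LL_H) */
                        + a_hi * b_lo                    /* += mpyu(BTMPL,ATMPH) */
                        + a_lo * b_hi;                   /* += mpyu(ATMPL,BTMPH) */
        uint64_t pp_hh  = a_hi * b_hi + (pp_odd >> 32);  /* mpyu(ATMPH,BTMPH) */

        *hi = pp_hh;                                     /* PP_HH */
        *lo = (pp_odd << 32) | (uint32_t)pp_ll;          /* combine(PP_ODD_L,PP_LL_L) */
    }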
diff --git a/lib/builtins/hexagon/dfminmax.S b/lib/builtins/hexagon/dfminmax.S
new file mode 100644
index 000000000..41122911f
--- /dev/null
+++ b/lib/builtins/hexagon/dfminmax.S
@@ -0,0 +1,79 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define A r1:0
+#define B r3:2
+#define ATMP r5:4
+
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+/*
+ * Min and Max return A if B is NaN, or B if A is NaN
+ * Otherwise, they return the smaller or bigger value
+ *
+ * If values are equal, we want to favor -0.0 for min and +0.0 for max.
+ */
+
+/*
+ * Compares always return false for NaN, so the sequence
+ * "if (isnan(A)) A = B; if (A > B) A = B" triggers at most one of the two assignments.
+ */
+ .text
+ .global __hexagon_mindf3
+ .global __hexagon_maxdf3
+ .global fmin
+ .type fmin,@function
+ .global fmax
+ .type fmax,@function
+ .type __hexagon_mindf3,@function
+ .type __hexagon_maxdf3,@function
+ Q6_ALIAS(mindf3)
+ Q6_ALIAS(maxdf3)
+ .p2align 5
+__hexagon_mindf3:
+fmin:
+ {
+ p0 = dfclass(A,#0x10) // Is A a NaN?
+ p1 = dfcmp.gt(A,B) // or is A > B? Then use B (gt is false on NaN)
+ ATMP = A
+ }
+ {
+ if (p0) A = B // if A is NaN use B
+ if (p1) A = B // gt is always false if either is NaN
+ p2 = dfcmp.eq(A,B) // if A == B
+ if (!p2.new) jumpr:t r31
+ }
+ /* A == B, return A|B to select -0.0 over 0.0 */
+ {
+ A = or(ATMP,B)
+ jumpr r31
+ }
+END(__hexagon_mindf3)
+ .falign
+__hexagon_maxdf3:
+fmax:
+ {
+ p0 = dfclass(A,#0x10)
+ p1 = dfcmp.gt(B,A)
+ ATMP = A
+ }
+ {
+ if (p0) A = B
+ if (p1) A = B
+ p2 = dfcmp.eq(A,B)
+ if (!p2.new) jumpr:t r31
+ }
+ /* A == B, return A&B to select 0.0 over -0.0 */
+ {
+ A = and(ATMP,B)
+ jumpr r31
+ }
+END(__hexagon_maxdf3)
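The selection rules reduce to three cases visible from C: a NaN operand loses, strict ordering picks the smaller (or larger) value, and a tie merges the sign bits so the preferred zero wins. A hedged C model of fmin (fmax is symmetric, comparing the other way and AND-ing the representations):

    #include <stdint.h>
    #include <string.h>
    #include <math.h>

    /* Hedged C model of __hexagon_mindf3: NaN loses, otherwise take the
       smaller value; on a tie, OR the bit patterns so -0.0 beats +0.0. */
    static double mindf3_model(double a, double b)
    {
        if (isnan(a)) return b;       /* dfclass(A,#0x10) */
        if (isnan(b)) return a;       /* dfcmp.gt is false when B is NaN */
        if (a > b)    return b;
        if (a == b) {                 /* merge sign bits of equal values */
            uint64_t ua, ub;
            memcpy(&ua, &a, sizeof ua);
            memcpy(&ub, &b, sizeof ub);
            ua |= ub;
            memcpy(&a, &ua, sizeof ua);
        }
        return a;
    }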
diff --git a/lib/builtins/hexagon/dfmul.S b/lib/builtins/hexagon/dfmul.S
new file mode 100644
index 000000000..fde6d77bd
--- /dev/null
+++ b/lib/builtins/hexagon/dfmul.S
@@ -0,0 +1,418 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision Multiply */
+#define A r1:0
+#define AH r1
+#define AL r0
+#define B r3:2
+#define BH r3
+#define BL r2
+
+#define BTMP r5:4
+#define BTMPH r5
+#define BTMPL r4
+
+#define PP_ODD r7:6
+#define PP_ODD_H r7
+#define PP_ODD_L r6
+
+#define ONE r9:8
+#define S_ONE r8
+#define S_ZERO r9
+
+#define PP_HH r11:10
+#define PP_HH_H r11
+#define PP_HH_L r10
+
+#define ATMP r13:12
+#define ATMPH r13
+#define ATMPL r12
+
+#define PP_LL r15:14
+#define PP_LL_H r15
+#define PP_LL_L r14
+
+#define TMP r28
+
+#define MANTBITS 52
+#define HI_MANTBITS 20
+#define EXPBITS 11
+#define BIAS 1024
+#define MANTISSA_TO_INT_BIAS 52
+
+/* Some constant to adjust normalization amount in error code */
+/* Amount to right shift the partial product to get to a denorm */
+#define FUDGE 5
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+#define END(TAG) .size TAG,.-TAG
+
+#define SR_ROUND_OFF 22
+ .text
+ .global __hexagon_muldf3
+ .type __hexagon_muldf3,@function
+ Q6_ALIAS(muldf3)
+ FAST_ALIAS(muldf3)
+ FAST2_ALIAS(muldf3)
+ .p2align 5
+__hexagon_muldf3:
+ {
+ p0 = dfclass(A,#2)
+ p0 = dfclass(B,#2)
+ ATMP = combine(##0x40000000,#0)
+ }
+ {
+ ATMP = insert(A,#MANTBITS,#EXPBITS-1)
+ BTMP = asl(B,#EXPBITS-1)
+ TMP = #-BIAS
+ ONE = #1
+ }
+ {
+ PP_ODD = mpyu(BTMPL,ATMPH)
+ BTMP = insert(ONE,#2,#62)
+ }
+ /* Since we know that the MSB of each H register is zero, we should never carry. */
+ /* H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1. */
+ /* Adding two HLs, we get 2^64-3*2^32+2 maximum. */
+ /* Therefore, we can add three 2^32-1 values safely without carry; we only need one. */
+ {
+ PP_LL = mpyu(ATMPL,BTMPL)
+ PP_ODD += mpyu(ATMPL,BTMPH)
+ }
+ {
+ PP_ODD += lsr(PP_LL,#32)
+ PP_HH = mpyu(ATMPH,BTMPH)
+ BTMP = combine(##BIAS+BIAS-4,#0)
+ }
+ {
+ PP_HH += lsr(PP_ODD,#32)
+ if (!p0) jump .Lmul_abnormal
+ p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
+ p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
+ }
+ /*
+ * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
+ * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
+ */
+#undef PP_ODD
+#undef PP_ODD_H
+#undef PP_ODD_L
+#define EXP10 r7:6
+#define EXP1 r7
+#define EXP0 r6
+ {
+ if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
+ EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
+ }
+ {
+ PP_LL = neg(PP_HH)
+ EXP0 += add(TMP,EXP1)
+ TMP = xor(AH,BH)
+ }
+ {
+ if (!p2.new) PP_HH = PP_LL
+ p2 = cmp.gt(TMP,#-1)
+ p0 = !cmp.gt(EXP0,BTMPH)
+ p0 = cmp.gt(EXP0,BTMPL)
+ if (!p0.new) jump:nt .Lmul_ovf_unf
+ }
+ {
+ A = convert_d2df(PP_HH)
+ EXP0 = add(EXP0,#-BIAS-58)
+ }
+ {
+ AH += asl(EXP0,#HI_MANTBITS)
+ jumpr r31
+ }
+
+ .falign
+.Lpossible_unf:
+ /* We end up with a positive exponent */
+ /* But we may have rounded up to an exponent of 1. */
+ /* If the exponent is 1 and we rounded up to it,
+ * we also need to raise underflow.
+ * Fortunately, this is pretty easy to detect: we must have +/- 0x0010_0000_0000_0000,
+ * and the PP should also have more than one bit set.
+ */
+ /* Note: ATMP should have abs(PP_HH) */
+ /* Note: BTMPL should have 0x7FEFFFFF */
+ {
+ p0 = cmp.eq(AL,#0)
+ p0 = bitsclr(AH,BTMPL)
+ if (!p0.new) jumpr:t r31
+ BTMPH = #0x7fff
+ }
+ {
+ p0 = bitsset(ATMPH,BTMPH)
+ BTMPL = USR
+ BTMPH = #0x030
+ }
+ {
+ if (p0) BTMPL = or(BTMPL,BTMPH)
+ }
+ {
+ USR = BTMPL
+ }
+ {
+ p0 = dfcmp.eq(A,A)
+ jumpr r31
+ }
+ .falign
+.Lmul_ovf_unf:
+ {
+ A = convert_d2df(PP_HH)
+ ATMP = abs(PP_HH) // take absolute value
+ EXP1 = add(EXP0,#-BIAS-58)
+ }
+ {
+ AH += asl(EXP1,#HI_MANTBITS)
+ EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
+ BTMPL = ##0x7FEFFFFF
+ }
+ {
+ EXP1 += add(EXP0,##-BIAS-58)
+ //BTMPH = add(clb(ATMP),#-2)
+ BTMPH = #0
+ }
+ {
+ p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
+ if (p0.new) jump:nt .Lmul_ovf
+ }
+ {
+ p0 = cmp.gt(EXP1,#0)
+ if (p0.new) jump:nt .Lpossible_unf
+ BTMPH = sub(EXP0,BTMPH)
+ TMP = #63 // max amount to shift
+ }
+ /* Underflow */
+	/*
+	 * PP_HH has the partial product with sticky LSB.
+	 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts,
+	 * and a minimum of 0x1000_0000_0000_0000 or so.
+	 * The exponent of PP_HH is in EXP1, which is non-positive (0 or negative).
+	 * That is the exponent that results after normalization.
+	 *
+	 * EXP0 has the exponent that, when added to the normalized value, is out of range.
+	 *
+	 * Strategy:
+	 *
+	 * * Shift the bits down, with sticky bit, so that they are aligned according
+	 *   to the LZ count and the appropriate exponent, but not all the way into
+	 *   the mantissa field; keep the last few bits around.
+	 * * Put a 1 near the MSB.
+	 * * Check the LSBs for inexact; if inexact, also set underflow.
+	 * * Convert with [u]d2df -- this rounds correctly according to the rounding mode.
+	 * * Replace the exponent field with zero.
+	 */
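+	/* A minimal C sketch of the shift-with-sticky step on the positive
+	 * partial product pp (illustrative names; sh is the right-shift
+	 * amount, clamped to 63):
+	 *
+	 *   uint64_t lost = pp & ((1ULL << sh) - 1);  // bits that fall off
+	 *   uint64_t kept = pp >> sh;
+	 *   if (lost) kept |= 1;                      // sticky bit => inexact
+	 *   // then set a guard bit near the MSB, convert with [u]d2df
+	 *   // (rounds per the mode), and overwrite the exponent with zero
+	 */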
+
+
+ {
+ BTMPL = #0 // offset for extract
+ BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
+ }
+ {
+ p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
+ BTMPH = min(BTMPH,TMP) // Don't shift more than 63
+ PP_HH = ATMP
+ }
+ {
+ TMP = USR
+ PP_LL = extractu(PP_HH,BTMP)
+ }
+ {
+ PP_HH = asr(PP_HH,BTMPH)
+ BTMPL = #0x0030 // underflow flag
+ AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
+ }
+ {
+ p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
+ if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
+ PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
+ }
+ {
+ PP_LL = neg(PP_HH)
+ p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
+ if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
+ }
+ {
+ if (!p3) PP_HH = PP_LL
+ USR = TMP
+ }
+ {
+ A = convert_d2df(PP_HH) // Do rounding
+ p0 = dfcmp.eq(A,A) // realize exception
+ }
+ {
+ AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
+ jumpr r31
+ }
+ .falign
+.Lmul_ovf:
+ // We get either max finite value or infinity. Either way, overflow+inexact
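+	// An overflow rounds to infinity except when the mode rounds toward
+	// zero for this sign (round-toward-zero always; round-down for a
+	// positive result; round-up for a negative result), in which case the
+	// result is the max finite value. The XOR with the sign bit below
+	// folds the two directed modes into one comparison.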
+ {
+ TMP = USR
+ ATMP = combine(##0x7fefffff,#-1) // positive max finite
+ A = PP_HH
+ }
+ {
+ PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
+ TMP = or(TMP,#0x28) // inexact + overflow
+ BTMP = combine(##0x7ff00000,#0) // positive infinity
+ }
+ {
+ USR = TMP
+ PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
+ TMP = PP_LL_L // unmodified rounding mode
+ }
+ {
+ p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
+ p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
+ if (p0.new) ATMP = BTMP // we should get infinity
+ p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
+ }
+ {
+ A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
+ jumpr r31
+ }
+
+.Lmul_abnormal:
+ {
+ ATMP = extractu(A,#63,#0) // strip off sign
+ BTMP = extractu(B,#63,#0) // strip off sign
+ }
+ {
+ p3 = cmp.gtu(ATMP,BTMP)
+ if (!p3.new) A = B // sort values
+ if (!p3.new) B = A // sort values
+ }
+ {
+ // Any NaN --> NaN, possibly raise invalid if sNaN
+ p0 = dfclass(A,#0x0f) // A not NaN?
+ if (!p0.new) jump:nt .Linvalid_nan
+ if (!p3) ATMP = BTMP
+ if (!p3) BTMP = ATMP
+ }
+ {
+ // Infinity * nonzero number is infinity
+ p1 = dfclass(A,#0x08) // A is infinity
+ p1 = dfclass(B,#0x0e) // B is nonzero
+ }
+ {
+ // Infinity * zero --> NaN, raise invalid
+ // Other zeros return zero
+ p0 = dfclass(A,#0x08) // A is infinity
+ p0 = dfclass(B,#0x01) // B is zero
+ }
+ {
+ if (p1) jump .Ltrue_inf
+ p2 = dfclass(B,#0x01)
+ }
+ {
+ if (p0) jump .Linvalid_zeroinf
+ if (p2) jump .Ltrue_zero // so return zero
+ TMP = ##0x7c000000
+ }
+	// We are left with a normal or subnormal times a subnormal, with A > B.
+	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
+	// the product collapses to a single sticky bit, which we can round easily.
+	// If A and B might multiply to something bigger, decrease A's exponent,
+	// increase B's exponent by the same amount, and try again.
+ {
+ p0 = bitsclr(AH,TMP)
+ if (p0.new) jump:nt .Lmul_tiny
+ }
+ {
+ TMP = cl0(BTMP)
+ }
+ {
+ TMP = add(TMP,#-EXPBITS)
+ }
+ {
+ BTMP = asl(BTMP,TMP)
+ }
+ {
+ B = insert(BTMP,#63,#0)
+ AH -= asl(TMP,#HI_MANTBITS)
+ }
+ jump __hexagon_muldf3
+.Lmul_tiny:
+ {
+ TMP = USR
+ A = xor(A,B) // get sign bit
+ }
+ {
+ TMP = or(TMP,#0x30) // Inexact + Underflow
+ A = insert(ONE,#63,#0) // put in rounded up value
+ BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
+ }
+ {
+ USR = TMP
+ p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
+ if (!p0.new) AL = #0 // If not, zero
+ BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
+ }
+ {
+ p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf
+ if (!p0.new) AL = #0 // don't go to zero
+ jumpr r31
+ }
+.Linvalid_zeroinf:
+ {
+ TMP = USR
+ }
+ {
+ A = #-1
+ TMP = or(TMP,#2)
+ }
+ {
+ USR = TMP
+ }
+ {
+ p0 = dfcmp.uo(A,A) // force exception if enabled
+ jumpr r31
+ }
+.Linvalid_nan:
+ {
+ p0 = dfclass(B,#0x0f) // if B is not NaN
+ TMP = convert_df2sf(A) // will generate invalid if sNaN
+ if (p0.new) B = A // make it whatever A is
+ }
+ {
+ BL = convert_df2sf(B) // will generate invalid if sNaN
+ A = #-1
+ jumpr r31
+ }
+ .falign
+.Ltrue_zero:
+ {
+ A = B
+ B = A
+ }
+.Ltrue_inf:
+ {
+ BH = extract(BH,#1,#31)
+ }
+ {
+ AH ^= asl(BH,#31)
+ jumpr r31
+ }
+END(__hexagon_muldf3)
+
+#undef ATMP
+#undef ATMPL
+#undef ATMPH
+#undef BTMP
+#undef BTMPL
+#undef BTMPH
diff --git a/lib/builtins/hexagon/dfsqrt.S b/lib/builtins/hexagon/dfsqrt.S
new file mode 100644
index 000000000..027d9e1fd
--- /dev/null
+++ b/lib/builtins/hexagon/dfsqrt.S
@@ -0,0 +1,406 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+/* Double Precision square root */
+
+#define EXP r28
+
+#define A r1:0
+#define AH r1
+#define AL r0
+
+#define SFSH r3:2
+#define SF_S r3
+#define SF_H r2
+
+#define SFHALF_SONE r5:4
+#define S_ONE r4
+#define SFHALF r5
+#define SF_D r6
+#define SF_E r7
+#define RECIPEST r8
+#define SFRAD r9
+
+#define FRACRAD r11:10
+#define FRACRADH r11
+#define FRACRADL r10
+
+#define ROOT r13:12
+#define ROOTHI r13
+#define ROOTLO r12
+
+#define PROD r15:14
+#define PRODHI r15
+#define PRODLO r14
+
+#define P_TMP p0
+#define P_EXP1 p1
+#define NORMAL p2
+
+#define SF_EXPBITS 8
+#define SF_MANTBITS 23
+
+#define DF_EXPBITS 11
+#define DF_MANTBITS 52
+
+#define DF_BIAS 0x3ff
+
+#define DFCLASS_ZERO 0x01
+#define DFCLASS_NORMAL 0x02
+#define DFCLASS_DENORMAL 0x02
+#define DFCLASS_INFINITE 0x08
+#define DFCLASS_NAN 0x10
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
+#define END(TAG) .size TAG,.-TAG
+
+ .text
+ .global __hexagon_sqrtdf2
+ .type __hexagon_sqrtdf2,@function
+ .global __hexagon_sqrt
+ .type __hexagon_sqrt,@function
+ Q6_ALIAS(sqrtdf2)
+ Q6_ALIAS(sqrt)
+ FAST_ALIAS(sqrtdf2)
+ FAST_ALIAS(sqrt)
+ FAST2_ALIAS(sqrtdf2)
+ FAST2_ALIAS(sqrt)
+ .type sqrt,@function
+ .p2align 5
+__hexagon_sqrtdf2:
+__hexagon_sqrt:
+ {
+ PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
+ EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
+ SFHALF_SONE = combine(##0x3f000004,#1)
+ }
+ {
+ NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
+ NORMAL = cmp.gt(AH,#-1) // and positive?
+ if (!NORMAL.new) jump:nt .Lsqrt_abnormal
+ SFRAD = or(SFHALF,PRODLO)
+ }
+#undef NORMAL
+.Ldenormal_restart:
+ {
+ FRACRAD = A
+ SF_E,P_TMP = sfinvsqrta(SFRAD)
+ SFHALF = and(SFHALF,#-16)
+ SFSH = #0
+ }
+#undef A
+#undef AH
+#undef AL
+#define ERROR r1:0
+#define ERRORHI r1
+#define ERRORLO r0
+ // SF_E : reciprocal square root
+ // SF_H : half rsqrt
+	// SF_S : square root
+ // SF_D : error term
+ // SFHALF: 0.5
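+	// A rough C model of the refinement below (illustrative names; the
+	// :lib multiplies are library variants of sfmpy):
+	//   S = x * y0;                                 // initial sqrt estimate
+	//   H = 0.5f * y0;                              // half reciprocal root
+	//   D = 0.5f - S * H;  S += S * D;  H += H * D; // first Newton step
+	//   D = 0.5f - S * H;  H += H * D;              // second step (H only)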
+ {
+ SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
+ SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent...
+ SF_D = SFHALF
+#undef SFRAD
+#define SHIFTAMT r9
+ SHIFTAMT = and(EXP,#1)
+ }
+ {
+ SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1
+ FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden
+ P_EXP1 = cmp.gtu(SHIFTAMT,#0)
+ }
+ {
+ SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
+ SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
+ SF_D = SFHALF
+ SHIFTAMT = mux(P_EXP1,#8,#9)
+ }
+ {
+ SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
+ FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
+ SHIFTAMT = mux(P_EXP1,#3,#2)
+ }
+ {
+ SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
+		// cool trick: half of 1/sqrt(x) has the same mantissa as 1/sqrt(x).
+ PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
+ }
+ {
+ SF_H = and(SF_H,##0x007fffff)
+ }
+ {
+ SF_H = add(SF_H,##0x00800000 - 3)
+ SHIFTAMT = mux(P_EXP1,#7,#8)
+ }
+ {
+ RECIPEST = asl(SF_H,SHIFTAMT)
+ SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
+ }
+ {
+ ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
+ }
+
+#undef SFSH // r3:2
+#undef SF_H // r2
+#undef SF_S // r3
+#undef S_ONE // r4
+#undef SFHALF // r5
+#undef SFHALF_SONE // r5:4
+#undef SF_D // r6
+#undef SF_E // r7
+
+#define HL r3:2
+#define LL r5:4
+#define HH r7:6
+
+#undef P_EXP1
+#define P_CARRY0 p1
+#define P_CARRY1 p2
+#define P_CARRY2 p3
+
+ /* Iteration 0 */
+	/* Maybe we can save a cycle by starting with ERROR=asl(fracrad); then, */
+	/* as we multiply, we can shift and subtract instead of shift and add? */
+ {
+ ERROR = asl(FRACRAD,#15)
+ PROD = mpyu(ROOTHI,ROOTHI)
+ P_CARRY0 = cmp.eq(r0,r0)
+ }
+ {
+ ERROR -= asl(PROD,#15)
+ PROD = mpyu(ROOTHI,ROOTLO)
+ P_CARRY1 = cmp.eq(r0,r0)
+ }
+ {
+ ERROR -= lsr(PROD,#16)
+ P_CARRY2 = cmp.eq(r0,r0)
+ }
+ {
+ ERROR = mpyu(ERRORHI,RECIPEST)
+ }
+ {
+ ROOT += lsr(ERROR,SHIFTAMT)
+ SHIFTAMT = add(SHIFTAMT,#16)
+ ERROR = asl(FRACRAD,#31) // for next iter
+ }
+ /* Iteration 1 */
+ {
+ PROD = mpyu(ROOTHI,ROOTHI)
+ ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
+ }
+ {
+ ERROR -= asl(PROD,#31)
+ PROD = mpyu(ROOTLO,ROOTLO)
+ }
+ {
+ ERROR -= lsr(PROD,#33)
+ }
+ {
+ ERROR = mpyu(ERRORHI,RECIPEST)
+ }
+ {
+ ROOT += lsr(ERROR,SHIFTAMT)
+ SHIFTAMT = add(SHIFTAMT,#16)
+ ERROR = asl(FRACRAD,#47) // for next iter
+ }
+ /* Iteration 2 */
+ {
+ PROD = mpyu(ROOTHI,ROOTHI)
+ }
+ {
+ ERROR -= asl(PROD,#47)
+ PROD = mpyu(ROOTHI,ROOTLO)
+ }
+ {
+ ERROR -= asl(PROD,#16) // bidir shr 31-47
+ PROD = mpyu(ROOTLO,ROOTLO)
+ }
+ {
+ ERROR -= lsr(PROD,#17) // 64-47
+ }
+ {
+ ERROR = mpyu(ERRORHI,RECIPEST)
+ }
+ {
+ ROOT += lsr(ERROR,SHIFTAMT)
+ }
+#undef ERROR
+#undef PROD
+#undef PRODHI
+#undef PRODLO
+#define REM_HI r15:14
+#define REM_HI_HI r15
+#define REM_LO r1:0
+#undef RECIPEST
+#undef SHIFTAMT
+#define TWOROOT_LO r9:8
+ /* Adjust Root */
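+	/* A loose C model of this correction (illustrative; the asm keeps rem
+	 * in two 64-bit halves with explicit carry chains):
+	 *
+	 *   unsigned __int128 rem = frac - (unsigned __int128)root * root;
+	 *   unsigned __int128 inc = 2 * (unsigned __int128)root + 1;
+	 *   for (int i = 0; i < 2; ++i)            // two trial subtractions
+	 *     if (rem >= inc) { rem -= inc; root += 1; }
+	 *   if (rem != 0) root |= 1;               // sticky LSB for rounding
+	 */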
+ {
+ HL = mpyu(ROOTHI,ROOTLO)
+ LL = mpyu(ROOTLO,ROOTLO)
+ REM_HI = #0
+ REM_LO = #0
+ }
+ {
+ HL += lsr(LL,#33)
+ LL += asl(HL,#33)
+ P_CARRY0 = cmp.eq(r0,r0)
+ }
+ {
+ HH = mpyu(ROOTHI,ROOTHI)
+ REM_LO = sub(REM_LO,LL,P_CARRY0):carry
+ TWOROOT_LO = #1
+ }
+ {
+ HH += lsr(HL,#31)
+ TWOROOT_LO += asl(ROOT,#1)
+ }
+#undef HL
+#undef LL
+#define REM_HI_TMP r3:2
+#define REM_HI_TMP_HI r3
+#define REM_LO_TMP r5:4
+ {
+ REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
+ REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
+#undef FRACRAD
+#undef HH
+#define ZERO r11:10
+#define ONE r7:6
+ ONE = #1
+ ZERO = #0
+ }
+ {
+ REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
+ ONE = add(ROOT,ONE)
+ EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
+ }
+ {
+ // If carry set, no borrow: result was still positive
+ if (P_CARRY1) ROOT = ONE
+ if (P_CARRY1) REM_LO = REM_LO_TMP
+ if (P_CARRY1) REM_HI = REM_HI_TMP
+ }
+ {
+ REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
+ ONE = #1
+ EXP = asr(EXP,#1) // divide signed exp by 2
+ }
+ {
+ REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
+ ONE = add(ROOT,ONE)
+ }
+ {
+ if (P_CARRY2) ROOT = ONE
+ if (P_CARRY2) REM_LO = REM_LO_TMP
+ // since tworoot <= 2^32, remhi must be zero
+#undef REM_HI_TMP
+#undef REM_HI_TMP_HI
+#define S_ONE r2
+#define ADJ r3
+ S_ONE = #1
+ }
+ {
+ P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
+		if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if not, set the sticky bit (inexact)
+ ADJ = cl0(ROOT)
+ EXP = add(EXP,#-63)
+ }
+#undef REM_LO
+#define RET r1:0
+#define RETHI r1
+ {
+ RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
+ EXP = add(EXP,ADJ) // add back bias
+ }
+ {
+ RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust
+ jumpr r31
+ }
+#undef REM_LO_TMP
+#undef REM_HI_TMP
+#undef REM_HI_TMP_HI
+#undef REM_LO
+#undef REM_HI
+#undef TWOROOT_LO
+
+#undef RET
+#define A r1:0
+#define AH r1
+#define AL r0
+#undef S_ONE
+#define TMP r3:2
+#define TMPHI r3
+#define TMPLO r2
+#undef P_CARRY0
+#define P_NEG p1
+
+
+#define SFHALF r5
+#define SFRAD r9
+.Lsqrt_abnormal:
+ {
+ P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
+ if (P_TMP.new) jumpr:t r31
+ }
+ {
+ P_TMP = dfclass(A,#DFCLASS_NAN)
+ if (P_TMP.new) jump:nt .Lsqrt_nan
+ }
+ {
+ P_TMP = cmp.gt(AH,#-1)
+ if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
+ if (!P_TMP.new) EXP = ##0x7F800001 // sNaN
+ }
+ {
+ P_TMP = dfclass(A,#DFCLASS_INFINITE)
+ if (P_TMP.new) jumpr:nt r31
+ }
+ // If we got here, we're denormal
+ // prepare to restart
+ {
+ A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa
+ }
+ {
+ EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
+ }
+ {
+ A = asl(A,EXP) // Shift mantissa
+ EXP = sub(#1,EXP) // Form exponent
+ }
+ {
+ AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent
+ }
+ {
+ TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
+ SFHALF = ##0x3f000004 // form half constant
+ }
+ {
+ SFRAD = or(SFHALF,TMPLO) // form sf value
+ SFHALF = and(SFHALF,#-16)
+ jump .Ldenormal_restart // restart
+ }
+.Lsqrt_nan:
+ {
+ EXP = convert_df2sf(A) // if sNaN, get invalid
+ A = #-1 // qNaN
+ jumpr r31
+ }
+.Lsqrt_invalid_neg:
+ {
+		A = convert_sf2df(EXP) // converting the sNaN raises invalid, yields NaN
+ jumpr r31
+ }
+END(__hexagon_sqrt)
+END(__hexagon_sqrtdf2)
diff --git a/lib/builtins/hexagon/divdi3.S b/lib/builtins/hexagon/divdi3.S
new file mode 100644
index 000000000..49ee8104f
--- /dev/null
+++ b/lib/builtins/hexagon/divdi3.S
@@ -0,0 +1,85 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
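+/* A rough C model of the shift-and-subtract core below (illustrative;
+ * C99 with <stdint.h>; assumes n and d are nonzero -- the asm wraps this
+ * unsigned core with abs() on entry and a sign fix-up via vmux on exit):
+ *
+ *   static uint64_t udiv64(uint64_t n, uint64_t d, uint64_t *rem) {
+ *     int shift = __builtin_clzll(d) - __builtin_clzll(n);
+ *     uint64_t q = 0, bit = 1;
+ *     if (shift >= 0) { d <<= shift; bit <<= shift; }
+ *     for (int i = 0; i <= shift; ++i) {       // shift+1 iterations
+ *       if (n >= d) { n -= d; q += bit; }
+ *       d >>= 1; bit >>= 1;
+ *     }
+ *     *rem = n;
+ *     return q;
+ *   }
+ */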
+FUNCTION_BEGIN __hexagon_divdi3
+ {
+ p2 = tstbit(r1,#31)
+ p3 = tstbit(r3,#31)
+ }
+ {
+ r1:0 = abs(r1:0)
+ r3:2 = abs(r3:2)
+ }
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ p3 = xor(p2,p3)
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jump .hexagon_divdi3_return // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+
+.hexagon_divdi3_return:
+ {
+ r3:2 = neg(r1:0)
+ }
+ {
+ r1:0 = vmux(p3,r3:2,r1:0)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_divdi3
+
+ .globl __qdsp_divdi3
+ .set __qdsp_divdi3, __hexagon_divdi3
diff --git a/lib/builtins/hexagon/divsi3.S b/lib/builtins/hexagon/divsi3.S
new file mode 100644
index 000000000..8e159baa1
--- /dev/null
+++ b/lib/builtins/hexagon/divsi3.S
@@ -0,0 +1,84 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_divsi3
+ {
+ p0 = cmp.ge(r0,#0)
+ p1 = cmp.ge(r1,#0)
+ r1 = abs(r0)
+ r2 = abs(r1)
+ }
+ {
+ r3 = cl0(r1)
+ r4 = cl0(r2)
+ r5 = sub(r1,r2)
+ p2 = cmp.gtu(r2,r1)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ r0 = #0
+ p1 = xor(p0,p1)
+ p0 = cmp.gtu(r2,r5)
+ }
+ if (p2) jumpr r31
+#else
+ {
+ r0 = #0
+ p1 = xor(p0,p1)
+ p0 = cmp.gtu(r2,r5)
+ if (p2) jumpr r31
+ }
+#endif
+ {
+ r0 = mux(p1,#-1,#1)
+ if (p0) jumpr r31
+ r4 = sub(r4,r3)
+ r3 = #1
+ }
+ {
+ r0 = #0
+ r3:2 = vlslw(r3:2,r4)
+ loop0(1f,r4)
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r2,r1)
+ if (!p0.new) r1 = sub(r1,r2)
+ if (!p0.new) r0 = add(r0,r3)
+ r3:2 = vlsrw(r3:2,#1)
+ }:endloop0
+ {
+ p0 = cmp.gtu(r2,r1)
+ if (!p0.new) r0 = add(r0,r3)
+ if (!p1) jumpr r31
+ }
+ {
+ r0 = neg(r0)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_divsi3
+
+ .globl __qdsp_divsi3
+ .set __qdsp_divsi3, __hexagon_divsi3
diff --git a/lib/builtins/hexagon/fabs_opt.S b/lib/builtins/hexagon/fabs_opt.S
new file mode 100644
index 000000000..b09b00734
--- /dev/null
+++ b/lib/builtins/hexagon/fabs_opt.S
@@ -0,0 +1,37 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fabs
+ {
+ r1 = clrbit(r1, #31)
+ jumpr r31
+ }
+FUNCTION_END fabs
+
+FUNCTION_BEGIN fabsf
+ {
+ r0 = clrbit(r0, #31)
+ jumpr r31
+ }
+FUNCTION_END fabsf
+
+ .globl fabsl
+ .set fabsl, fabs
diff --git a/lib/builtins/hexagon/fastmath2_dlib_asm.S b/lib/builtins/hexagon/fastmath2_dlib_asm.S
new file mode 100644
index 000000000..9286df06c
--- /dev/null
+++ b/lib/builtins/hexagon/fastmath2_dlib_asm.S
@@ -0,0 +1,491 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/* ==================================================================== */
+/* FUNCTIONS Optimized double floating point operators */
+/* ==================================================================== */
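+/* Format note (inferred from the reference code and asm below; not an
+   authoritative spec): a fast2_QDOUBLE is a 64-bit word whose low halfword
+   holds a signed exponent and whose upper 48 bits hold a signed fixed-point
+   mantissa, i.e. roughly value = mant * 2^exp; exp = 0x8001 marks the
+   zero/denormal result. */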
+/* c = fast2_dadd_asm(a, b) */
+/* ==================================================================== *
+fast2_QDOUBLE fast2_dadd(fast2_QDOUBLE a,fast2_QDOUBLE b) {
+ fast2_QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) + (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = Q6_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_dadd_asm
+ .type fast2_dadd_asm, @function
+fast2_dadd_asm:
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define minmin R11:10 // exactly 0x0000000000008001LL
+#define minminl R10
+#define k R4
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ minmin = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = add(lmanta, lmantb)
+ minminl.L = #0x8001
+ } {
+ k = clb(lmant)
+ c63 = #58
+ } {
+ k = add(k, #-1)
+ p0 = cmp.gt(k, c63)
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ if(p0) jump .Ldenorma
+ } {
+ manta = insert(exp, #16, #0)
+ jumpr r31
+ }
+.Ldenorma:
+ {
+ mantexpa = minmin
+ jumpr r31
+ }
+/* =================================================================== *
+ fast2_QDOUBLE fast2_dsub(fast2_QDOUBLE a,fast2_QDOUBLE b) {
+ fast2_QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) - (mantb>>expb);
+ k = Q6_R_clb_P(mant)-1;
+ mant = (mant << k);
+ exp = exp - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_dsub_asm
+ .type fast2_dsub_asm, @function
+fast2_dsub_asm:
+
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define minmin R11:10 // exactly 0x0000000000008001LL
+#define minminl R10
+#define k R4
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ minmin = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = sub(lmanta, lmantb)
+ minminl.L = #0x8001
+ } {
+ k = clb(lmant)
+ c63 = #58
+ } {
+ k = add(k, #-1)
+ p0 = cmp.gt(k, c63)
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ if(p0) jump .Ldenorm
+ } {
+ manta = insert(exp, #16, #0)
+ jumpr r31
+ }
+.Ldenorm:
+ {
+ mantexpa = minmin
+ jumpr r31
+ }
+/* ==================================================================== *
+ fast2_QDOUBLE fast2_dmpy(fast2_QDOUBLE a,fast2_QDOUBLE b) {
+ fast2_QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, k;
+ lint mant;
+ int hia, hib, hi, lo;
+ unsigned int loa, lob;
+
+ hia = (int)(a >> 32);
+ loa = Q6_R_extractu_RII((int)manta, 31, 1);
+ hib = (int)(b >> 32);
+ lob = Q6_R_extractu_RII((int)mantb, 31, 1);
+
+ mant = Q6_P_mpy_RR(hia, lob);
+ mant = Q6_P_mpyacc_RR(mant,hib, loa);
+ mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1);
+
+ hi = (int) (mant>>32);
+
+ k = Q6_R_normamt_R(hi);
+ mant = mant << k;
+ exp = expa + expb - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_dmpy_asm
+ .type fast2_dmpy_asm, @function
+fast2_dmpy_asm:
+
+#define mantal R0
+#define mantah R1
+#define mantexpa R1:0
+#define mantbl R2
+#define mantbh R3
+#define mantexpb R3:2
+#define expa R4
+#define expb R5
+#define c8001 R12
+#define mantexpd R7:6
+#define mantdh R7
+#define exp R8
+#define lmantc R11:10
+#define kb R9
+#define guard R11
+#define mantal_ R12
+#define mantbl_ R13
+#define min R15:14
+#define minh R15
+
+ .falign
+ {
+ mantbl_= lsr(mantbl, #16)
+ expb = sxth(mantbl)
+ expa = sxth(mantal)
+ mantal_= lsr(mantal, #16)
+ }
+ {
+ lmantc = mpy(mantah, mantbh)
+ mantexpd = mpy(mantah, mantbl_)
+ mantal.L = #0x0
+ min = #0
+ }
+ {
+ lmantc = add(lmantc, lmantc)
+ mantexpd+= mpy(mantbh, mantal_)
+ mantbl.L = #0x0
+ minh.H = #0x8000
+ }
+ {
+ mantexpd = asr(mantexpd, #15)
+ c8001.L = #0x8001
+ p1 = cmp.eq(mantexpa, mantexpb)
+ }
+ {
+ mantexpd = add(mantexpd, lmantc)
+ exp = add(expa, expb)
+ p2 = cmp.eq(mantexpa, min)
+ }
+ {
+ kb = clb(mantexpd)
+ mantexpb = abs(mantexpd)
+ guard = #58
+ }
+ {
+ p1 = and(p1, p2)
+ exp = sub(exp, kb)
+ kb = add(kb, #-1)
+ p0 = cmp.gt(kb, guard)
+ }
+ {
+ exp = add(exp, #1)
+ mantexpa = asl(mantexpd, kb)
+ if(p1) jump .Lsat //rarely happens
+ }
+ {
+ mantal = insert(exp,#16, #0)
+ if(!p0) jumpr r31
+ }
+ {
+ mantal = insert(c8001,#16, #0)
+ jumpr r31
+ }
+.Lsat:
+ {
+ mantexpa = #-1
+ }
+ {
+ mantexpa = lsr(mantexpa, #1)
+ }
+ {
+ mantal = insert(exp,#16, #0)
+ jumpr r31
+ }
+
+/* ==================================================================== *
+ int fast2_qd2f(fast2_QDOUBLE a) {
+ int exp;
+ long long int manta;
+ int ic, rnd, mantb;
+
+ manta = a>>32;
+ exp = Q6_R_sxth_R(a) ;
+ ic = 0x80000000 & manta;
+ manta = Q6_R_abs_R_sat(manta);
+   rnd = 0x40;
+   if((manta & 0xff) == rnd) rnd = 0x00;
+   mantb = (manta + rnd)>>7;
+   exp = (exp + 126);
+ if((manta & 0x7fffffc0) == 0x7fffffc0) {
+ manta = 0x0; exp++;
+ } else {
+ manta= mantb & 0x007fffff;
+ }
+ exp = (exp << 23) & 0x7fffffc0;
+ ic = Q6_R_addacc_RR(ic, exp, manta);
+ return (ic);
+ }
+ * ==================================================================== */
+
+ .text
+ .global fast2_qd2f_asm
+ .type fast2_qd2f_asm, @function
+fast2_qd2f_asm:
+#define mantah R1
+#define mantal R0
+#define cff R0
+#define mant R3
+#define expo R4
+#define rnd R5
+#define mask R6
+#define c07f R7
+#define c80 R0
+#define mantb R2
+#define ic R0
+
+ .falign
+ {
+ mant = abs(mantah):sat
+ expo = sxth(mantal)
+ rnd = #0x40
+ mask.L = #0xffc0
+ }
+ {
+ cff = extractu(mant, #8, #0)
+ p2 = cmp.gt(expo, #126)
+ p3 = cmp.ge(expo, #-126)
+ mask.H = #0x7fff
+ }
+ {
+ p1 = cmp.eq(cff,#0x40)
+ if(p1.new) rnd = #0
+ expo = add(expo, #126)
+ if(!p3) jump .Lmin
+ }
+ {
+ p0 = bitsset(mant, mask)
+ c80.L = #0x0000
+ mantb = add(mant, rnd)
+ c07f = lsr(mask, #8)
+ }
+ {
+ if(p0) expo = add(expo, #1)
+ if(p0) mant = #0
+ mantb = lsr(mantb, #7)
+ c80.H = #0x8000
+ }
+ {
+ ic = and(c80, mantah)
+ mask &= asl(expo, #23)
+ if(!p0) mant = and(mantb, c07f)
+ if(p2) jump .Lmax
+ }
+ {
+ ic += add(mask, mant)
+ jumpr r31
+ }
+.Lmax:
+ {
+ ic.L = #0xffff;
+ }
+ {
+ ic.H = #0x7f7f;
+ jumpr r31
+ }
+.Lmin:
+ {
+ ic = #0x0
+ jumpr r31
+ }
+
+/* ==================================================================== *
+fast2_QDOUBLE fast2_f2qd(int ia) {
+ lint exp;
+ lint mant;
+ fast2_QDOUBLE c;
+
+ mant = ((ia << 7) | 0x40000000)&0x7fffff80 ;
+ if (ia & 0x80000000) mant = -mant;
+ exp = ((ia >> 23) & 0xFFLL) - 126;
+   c = (mant<<32) | Q6_R_zxth_R(exp);
+ return(c);
+}
+ * ==================================================================== */
+ .text
+ .global fast2_f2qd_asm
+ .type fast2_f2qd_asm, @function
+fast2_f2qd_asm:
+#define ia R0
+#define mag R3
+#define mantr R1
+#define expr R0
+#define zero R2
+#define maxneg R5:4
+#define maxnegl R4
+ .falign
+ {
+ mantr = asl(ia, #7)
+ p0 = tstbit(ia, #31)
+ maxneg = #0
+ mag = add(ia,ia)
+ }
+ {
+ mantr = setbit(mantr, #30)
+ expr= extractu(ia,#8,#23)
+ maxnegl.L = #0x8001
+ p1 = cmp.eq(mag, #0)
+ }
+ {
+ mantr= extractu(mantr, #31, #0)
+ expr= add(expr, #-126)
+ zero = #0
+ if(p1) jump .Lminqd
+ }
+ {
+ expr = zxth(expr)
+ if(p0) mantr= sub(zero, mantr)
+ jumpr r31
+ }
+.Lminqd:
+ {
+ R1:0 = maxneg
+ jumpr r31
+ }
diff --git a/lib/builtins/hexagon/fastmath2_ldlib_asm.S b/lib/builtins/hexagon/fastmath2_ldlib_asm.S
new file mode 100644
index 000000000..419255535
--- /dev/null
+++ b/lib/builtins/hexagon/fastmath2_ldlib_asm.S
@@ -0,0 +1,345 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/* ==================================================================== *
+
+fast2_QLDOUBLE fast2_ldadd(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
+ fast2_QLDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) + (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = Q6_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
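+/* Calling-convention note (inferred from the loads/stores below): the
+   long-double operands arrive on the stack -- mantissa double-words at
+   r29+#0 and r29+#16, exponent words at r29+#8 and r29+#24 -- and the
+   result is stored through the buffer pointer passed in r0. */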
+ .text
+ .global fast2_ldadd_asm
+ .type fast2_ldadd_asm, @function
+fast2_ldadd_asm:
+#define manta R1:0
+#define lmanta R1:0
+#define mantb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define k R4
+#define ce P0
+#define zero R3:2
+ .falign
+ {
+ expa = memw(r29+#8)
+ expb = memw(r29+#24)
+ r7 = r0
+ }
+ {
+ expd = sub(expa, expb):sat
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ expd = abs(expd):sat
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ c63 = #62
+ } {
+ expd = MIN(expd, c63)
+ manta = memd(r29+#0)
+ mantb = memd(r29+#16)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = add(lmanta, lmantb)
+ zero = #0
+ } {
+ k = clb(lmant)
+ c63.L =#0x0001
+ } {
+ exp -= add(k, #-1) //exp = exp - (k-1)
+ k = add(k, #-1)
+ p0 = cmp.gt(k, #58)
+ c63.H =#0x8000
+ } {
+ if(!p0)memw(r7+#8) = exp
+ lmant = ASL(lmant, k)
+ if(p0) jump .Ldenorma
+ } {
+ memd(r7+#0) = lmant
+ jumpr r31
+ }
+.Ldenorma:
+ memd(r7+#0) = zero
+ {
+ memw(r7+#8) = c63
+ jumpr r31
+ }
+/* =================================================================== *
+ fast2_QLDOUBLE fast2_ldsub(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
+ fast2_QLDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, expdiff, j, k;
+ lint mant;
+
+ expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
+ expdiff = Q6_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) - (mantb>>expb);
+ k = Q6_R_clb_P(mant)-1;
+ mant = (mant << k);
+ exp = exp - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_ldsub_asm
+ .type fast2_ldsub_asm, @function
+fast2_ldsub_asm:
+#define manta R1:0
+#define lmanta R1:0
+#define mantb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define k R4
+#define ce P0
+#define zero R3:2
+ .falign
+ {
+ expa = memw(r29+#8)
+ expb = memw(r29+#24)
+ r7 = r0
+ }
+ {
+ expd = sub(expa, expb):sat
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ expd = abs(expd):sat
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ c63 = #62
+ } {
+ expd = min(expd, c63)
+ manta = memd(r29+#0)
+ mantb = memd(r29+#16)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ } {
+ lmant = sub(lmanta, lmantb)
+ zero = #0
+ } {
+ k = clb(lmant)
+ c63.L =#0x0001
+ } {
+    exp -= add(k, #-1) //exp = exp - (k-1)
+ k = add(k, #-1)
+ p0 = cmp.gt(k, #58)
+ c63.H =#0x8000
+ } {
+ if(!p0)memw(r7+#8) = exp
+ lmant = asl(lmant, k)
+ if(p0) jump .Ldenorma_s
+ } {
+ memd(r7+#0) = lmant
+ jumpr r31
+ }
+.Ldenorma_s:
+ memd(r7+#0) = zero
+ {
+ memw(r7+#8) = c63
+ jumpr r31
+ }
+
+/* ==================================================================== *
+ fast2_QLDOUBLE fast2_ldmpy(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
+ fast2_QLDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = Q6_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = Q6_R_sxth_R(b) ;
+ int exp, k;
+ lint mant;
+ int hia, hib, hi, lo;
+ unsigned int loa, lob;
+
+ hia = (int)(a >> 32);
+ loa = Q6_R_extractu_RII((int)manta, 31, 1);
+ hib = (int)(b >> 32);
+ lob = Q6_R_extractu_RII((int)mantb, 31, 1);
+
+ mant = Q6_P_mpy_RR(hia, lob);
+ mant = Q6_P_mpyacc_RR(mant,hib, loa);
+ mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1);
+
+ hi = (int) (mant>>32);
+
+ k = Q6_R_normamt_R(hi);
+ mant = mant << k;
+ exp = expa + expb - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global fast2_ldmpy_asm
+ .type fast2_ldmpy_asm, @function
+fast2_ldmpy_asm:
+
+#define mantxl_ R9
+#define mantxl R14
+#define mantxh R15
+#define mantx R15:14
+#define mantbl R2
+#define mantbl_ R8
+#define mantbh R3
+#define mantb R3:2
+#define expa R4
+#define expb R5
+#define c8001 R8
+#define mantd R7:6
+#define lmantc R11:10
+#define kp R9
+#define min R13:12
+#define minh R13
+#define max R13:12
+#define maxh R13
+#define ret R0
+
+ .falign
+ {
+ mantx = memd(r29+#0)
+ mantb = memd(r29+#16)
+ min = #0
+ }
+ {
+ mantbl_= extractu(mantbl, #31, #1)
+ mantxl_= extractu(mantxl, #31, #1)
+ minh.H = #0x8000
+ }
+ {
+ lmantc = mpy(mantxh, mantbh)
+ mantd = mpy(mantxh, mantbl_)
+ expa = memw(r29+#8)
+ expb = memw(r29+#24)
+ }
+ {
+ lmantc = add(lmantc, lmantc)
+ mantd += mpy(mantbh, mantxl_)
+ }
+ {
+ mantd = asr(mantd, #30)
+ c8001.L = #0x0001
+ p1 = cmp.eq(mantx, mantb)
+ }
+ {
+ mantd = add(mantd, lmantc)
+ expa= add(expa, expb)
+ p2 = cmp.eq(mantb, min)
+ }
+ {
+ kp = clb(mantd)
+ c8001.H = #0x8000
+ p1 = and(p1, p2)
+ }
+ {
+ expa-= add(kp, #-1)
+ kp = add(kp, #-1)
+ if(p1) jump .Lsat
+ }
+ {
+ mantd = asl(mantd, kp)
+ memw(ret+#8) = expa
+ p0 = cmp.gt(kp, #58)
+ if(p0.new) jump:NT .Ldenorm //rarely happens
+ }
+ {
+ memd(ret+#0) = mantd
+ jumpr r31
+ }
+.Lsat:
+ {
+ max = #0
+ expa+= add(kp, #1)
+ }
+ {
+ maxh.H = #0x4000
+ memw(ret+#8) = expa
+ }
+ {
+ memd(ret+#0) = max
+ jumpr r31
+ }
+.Ldenorm:
+ {
+ memw(ret+#8) = c8001
+ mantx = #0
+ }
+ {
+ memd(ret+#0) = mantx
+ jumpr r31
+ }
diff --git a/lib/builtins/hexagon/fastmath_dlib_asm.S b/lib/builtins/hexagon/fastmath_dlib_asm.S
new file mode 100644
index 000000000..215936b78
--- /dev/null
+++ b/lib/builtins/hexagon/fastmath_dlib_asm.S
@@ -0,0 +1,400 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/* ==================================================================== */
+/* FUNCTIONS Optimized double floating point operators */
+/* ==================================================================== */
+/* c = dadd_asm(a, b) */
+/* ====================================================================
+
+QDOUBLE dadd(QDOUBLE a,QDOUBLE b) {
+ QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = HEXAGON_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = HEXAGON_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b);
+ expdiff = HEXAGON_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) + (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = HEXAGON_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global dadd_asm
+ .type dadd_asm, @function
+dadd_asm:
+
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define zero R7:6
+#define zerol R6
+#define minus R3:2
+#define minusl R2
+#define maxneg R9
+#define minmin R11:10 // exactly 0x8000000000000000LL
+#define minminh R11
+#define k R4
+#define kl R5
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ zero = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ minmin = #0
+ } {
+ lmant = add(lmanta, lmantb)
+ minus = #-1
+ minminh.H = #0x8000
+ } {
+ k = NORMAMT(manth)
+ kl = NORMAMT(mantl)
+ p0 = cmp.eq(manth, zerol)
+ p1 = cmp.eq(manth, minusl)
+ } {
+ p0 = OR(p0, p1)
+ if(p0.new) k = add(kl, #31)
+ maxneg.H = #0
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ maxneg.L = #0x8001
+ } {
+ p0 = cmp.eq(mantexpa, zero)
+ p1 = cmp.eq(mantexpa, minus)
+ manta.L = #0
+ exp = ZXTH(exp)
+ } {
+ p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0
+ if(p2.new) exp = add(exp, #1)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ }
+ jumpr r31
+#else
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ jumpr r31
+ }
+#endif
+/* =================================================================== *
+ QDOUBLE dsub(QDOUBLE a,QDOUBLE b) {
+ QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = HEXAGON_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = HEXAGON_R_sxth_R(b) ;
+ int exp, expdiff, j, k, hi, lo, cn;
+ lint mant;
+
+ expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b);
+ expdiff = HEXAGON_R_sxth_R(expdiff) ;
+ if (expdiff > 63) { expdiff = 62;}
+ if (expa > expb) {
+ exp = expa + 1;
+ expa = 1;
+ expb = expdiff + 1;
+ } else {
+ exp = expb + 1;
+ expb = 1;
+ expa = expdiff + 1;
+ }
+ mant = (manta>>expa) - (mantb>>expb);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = HEXAGON_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
+
+ mant = (mant << k);
+ cn = (mant == 0x8000000000000000LL);
+ exp = exp - k + cn;
+
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global dsub_asm
+ .type dsub_asm, @function
+dsub_asm:
+
+#define manta R0
+#define mantexpa R1:0
+#define lmanta R1:0
+#define mantb R2
+#define mantexpb R3:2
+#define lmantb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define expd R6
+#define exp R8
+#define c63 R9
+#define lmant R1:0
+#define manth R1
+#define mantl R0
+#define zero R7:6
+#define zerol R6
+#define minus R3:2
+#define minusl R2
+#define maxneg R9
+#define minmin R11:10 // exactly 0x8000000000000000LL
+#define minminh R11
+#define k R4
+#define kl R5
+#define ce P0
+ .falign
+ {
+ mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
+ c63 = #62
+ expa = SXTH(manta)
+ expb = SXTH(mantb)
+ } {
+ expd = SXTH(expd)
+ ce = CMP.GT(expa, expb);
+ if ( ce.new) exp = add(expa, #1)
+ if (!ce.new) exp = add(expb, #1)
+ } {
+ if ( ce) expa = #1
+ if (!ce) expb = #1
+ manta.L = #0
+ expd = MIN(expd, c63)
+ } {
+ if (!ce) expa = add(expd, #1)
+ if ( ce) expb = add(expd, #1)
+ mantb.L = #0
+ zero = #0
+ } {
+ lmanta = ASR(lmanta, expa)
+ lmantb = ASR(lmantb, expb)
+ minmin = #0
+ } {
+ lmant = sub(lmanta, lmantb)
+ minus = #-1
+ minminh.H = #0x8000
+ } {
+ k = NORMAMT(manth)
+ kl = NORMAMT(mantl)
+ p0 = cmp.eq(manth, zerol)
+ p1 = cmp.eq(manth, minusl)
+ } {
+ p0 = OR(p0, p1)
+ if(p0.new) k = add(kl, #31)
+ maxneg.H = #0
+ } {
+ mantexpa = ASL(lmant, k)
+ exp = SUB(exp, k)
+ maxneg.L = #0x8001
+ } {
+ p0 = cmp.eq(mantexpa, zero)
+ p1 = cmp.eq(mantexpa, minus)
+ manta.L = #0
+ exp = ZXTH(exp)
+ } {
+ p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0
+ if(p2.new) exp = add(exp, #1)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ }
+ jumpr r31
+#else
+ {
+ p0 = OR(p0, p1)
+ if( p0.new) manta = OR(manta,maxneg)
+ if(!p0.new) manta = OR(manta,exp)
+ jumpr r31
+ }
+#endif
+/* ==================================================================== *
+ QDOUBLE dmpy(QDOUBLE a,QDOUBLE b) {
+ QDOUBLE c;
+ lint manta = a & MANTMASK;
+ int expa = HEXAGON_R_sxth_R(a) ;
+ lint mantb = b & MANTMASK;
+ int expb = HEXAGON_R_sxth_R(b) ;
+ int exp, k;
+ lint mant;
+ int hia, hib, hi, lo;
+ unsigned int loa, lob;
+
+ hia = (int)(a >> 32);
+ loa = HEXAGON_R_extractu_RII((int)manta, 31, 1);
+ hib = (int)(b >> 32);
+ lob = HEXAGON_R_extractu_RII((int)mantb, 31, 1);
+
+ mant = HEXAGON_P_mpy_RR(hia, lob);
+ mant = HEXAGON_P_mpyacc_RR(mant,hib, loa);
+ mant = (mant >> 30) + (HEXAGON_P_mpy_RR(hia, hib)<<1);
+
+ hi = (int) (mant>>32);
+ lo = (int) (mant);
+
+ k = HEXAGON_R_normamt_R(hi);
+ if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
+ mant = mant << k;
+ exp = expa + expb - k;
+ if (mant == 0 || mant == -1) exp = 0x8001;
+ c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
+ return(c);
+ }
+ * ==================================================================== */
+ .text
+ .global dmpy_asm
+ .type dmpy_asm, @function
+dmpy_asm:
+
+#define mantal R0
+#define mantah R1
+#define mantexpa R1:0
+#define mantbl R2
+#define mantbh R3
+#define mantexpb R3:2
+#define expa R4
+#define expb R5
+#define mantexpd R7:6
+#define exp R8
+#define lmantc R11:10
+#define mantch R11
+#define mantcl R10
+#define zero0 R7:6
+#define zero0l R6
+#define minus1 R3:2
+#define minus1l R2
+#define maxneg R9
+#define k R4
+#define kl R5
+
+ .falign
+ {
+ mantbl = lsr(mantbl, #16)
+ mantal = lsr(mantal, #16)
+ expa = sxth(mantal)
+ expb = sxth(mantbl)
+ }
+ {
+ lmantc = mpy(mantah, mantbh)
+ mantexpd = mpy(mantah, mantbl)
+ }
+ {
+ lmantc = add(lmantc, lmantc) //<<1
+ mantexpd+= mpy(mantbh, mantal)
+ }
+ {
+ lmantc += asr(mantexpd, #15)
+ exp = add(expa, expb)
+ zero0 = #0
+ minus1 = #-1
+ }
+ {
+ k = normamt(mantch)
+ kl = normamt(mantcl)
+ p0 = cmp.eq(mantch, zero0l)
+ p1 = cmp.eq(mantch, minus1l)
+ }
+ {
+ p0 = or(p0, p1)
+ if(p0.new) k = add(kl, #31)
+ maxneg.H = #0
+ }
+ {
+ mantexpa = asl(lmantc, k)
+ exp = sub(exp, k)
+ maxneg.L = #0x8001
+ }
+ {
+ p0 = cmp.eq(mantexpa, zero0)
+ p1 = cmp.eq(mantexpa, minus1)
+ mantal.L = #0
+ exp = zxth(exp)
+ }
+#if (__HEXAGON_ARCH__ == 60)
+ {
+ p0 = or(p0, p1)
+ if( p0.new) mantal = or(mantal,maxneg)
+ if(!p0.new) mantal = or(mantal,exp)
+ }
+ jumpr r31
+#else
+ {
+ p0 = or(p0, p1)
+ if( p0.new) mantal = or(mantal,maxneg)
+ if(!p0.new) mantal = or(mantal,exp)
+ jumpr r31
+ }
+#endif
diff --git a/lib/builtins/hexagon/fma_opt.S b/lib/builtins/hexagon/fma_opt.S
new file mode 100644
index 000000000..12378f0da
--- /dev/null
+++ b/lib/builtins/hexagon/fma_opt.S
@@ -0,0 +1,31 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fmaf
+ r2 += sfmpy(r0, r1)
+ {
+ r0 = r2
+ jumpr r31
+ }
+FUNCTION_END fmaf
+
+ .globl fmal
+ .set fmal, fma
diff --git a/lib/builtins/hexagon/fmax_opt.S b/lib/builtins/hexagon/fmax_opt.S
new file mode 100644
index 000000000..f3a218c97
--- /dev/null
+++ b/lib/builtins/hexagon/fmax_opt.S
@@ -0,0 +1,30 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fmaxf
+ {
+ r0 = sfmax(r0, r1)
+ jumpr r31
+ }
+FUNCTION_END fmaxf
+
+ .globl fmaxl
+ .set fmaxl, fmax
diff --git a/lib/builtins/hexagon/fmin_opt.S b/lib/builtins/hexagon/fmin_opt.S
new file mode 100644
index 000000000..ef9b0ff85
--- /dev/null
+++ b/lib/builtins/hexagon/fmin_opt.S
@@ -0,0 +1,30 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+.macro FUNCTION_BEGIN name
+.text
+.p2align 5
+.globl \name
+.type \name, @function
+\name:
+.endm
+
+.macro FUNCTION_END name
+.size \name, . - \name
+.endm
+
+FUNCTION_BEGIN fminf
+ {
+ r0 = sfmin(r0, r1)
+ jumpr r31
+ }
+FUNCTION_END fminf
+
+ .globl fminl
+ .set fminl, fmin
diff --git a/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S
new file mode 100644
index 000000000..fbe09086c
--- /dev/null
+++ b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S
@@ -0,0 +1,125 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// An optimized version of a memcpy which is equivalent to the following loop:
+//
+// volatile unsigned *dest;
+// unsigned *src;
+//
+// for (i = 0; i < num_words; ++i)
+// *dest++ = *src++;
+//
+// The corresponding C prototype for this function would be
+// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
+// const unsigned *src,
+// unsigned num_words);
+//
+// *** Both dest and src must be aligned to 32-bit boundaries. ***
+// The code does not perform any runtime checks for this, and will fail
+// in bad ways if this requirement is not met.
+//
+// The "forward" in the name refers to the fact that the function copies
+// the words going forward in memory. It is incorrect to use this function
+// for cases where the original code copied words in any other order.
+//
+// *** This function is only for use by the compiler. ***
+// The only intended use is for the LLVM compiler to generate calls to
+// this function when a mem-copy loop like the one above is detected.
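+//
+// A rough C model of the control structure below (4 KB pages; each chunk
+// is prefetched with l2fetch first, descriptors elided; names are
+// illustrative):
+//
+//   unsigned to_page = ((4096 - ((uintptr_t)src & 4095)) & 4095) / 4;
+//   unsigned n = num_words < to_page ? num_words : to_page;
+//   num_words -= n;
+//   for (; n; --n) *dest++ = *src++;            // prolog, up to page end
+//   while (num_words >= 1024) {                 // whole pages (1024 words)
+//     num_words -= 1024;
+//     for (int i = 0; i < 1024; ++i) *dest++ = *src++;
+//   }
+//   for (; num_words; --num_words) *dest++ = *src++;   // epilog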
+
+ .text
+
+// Inputs:
+// r0: dest
+// r1: src
+// r2: num_words
+
+ .globl hexagon_memcpy_forward_vp4cp4n2
+ .balign 32
+ .type hexagon_memcpy_forward_vp4cp4n2,@function
+hexagon_memcpy_forward_vp4cp4n2:
+
+ // Compute r3 to be the number of words remaining in the current page.
+ // At the same time, compute r4 to be the number of 32-byte blocks
+ // remaining in the page (for prefetch).
+ {
+ r3 = sub(##4096, r1)
+ r5 = lsr(r2, #3)
+ }
+ {
+      // The byte count before end-of-page is in the 12 lowest bits of r3;
+      // extract the word count from bits 2..11. (If the address in r1 was
+      // already page-aligned, those bits are 0.)
+ r3 = extractu(r3, #10, #2)
+ r4 = extractu(r3, #7, #5)
+ }
+ {
+ r3 = minu(r2, r3)
+ r4 = minu(r5, r4)
+ }
+ {
+ r4 = or(r4, ##2105344) // 2105344 = 0x202000
+ p0 = cmp.eq(r3, #0)
+ if (p0.new) jump:nt .Lskipprolog
+ }
+ l2fetch(r1, r4)
+ {
+ loop0(.Lprolog, r3)
+ r2 = sub(r2, r3) // r2 = number of words left after the prolog.
+ }
+ .falign
+.Lprolog:
+ {
+ r4 = memw(r1++#4)
+ memw(r0++#4) = r4.new
+ } :endloop0
+.Lskipprolog:
+ {
+ // Let r3 = number of whole pages left (page = 1024 words).
+ r3 = lsr(r2, #10)
+ if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
+ }
+ {
+ loop1(.Lout, r3)
+ r2 = extractu(r2, #10, #0) // r2 = r2 & 1023
+ r3 = ##2105472 // r3 = 0x202080 (prefetch info)
+ }
+ // Iterate over pages.
+ .falign
+.Lout:
+ // Prefetch each individual page.
+ l2fetch(r1, r3)
+ loop0(.Lpage, #512)
+ .falign
+.Lpage:
+ r5:4 = memd(r1++#8)
+ {
+ memw(r0++#8) = r4
+ memw(r0+#4) = r5
+ } :endloop0:endloop1
+.Lskipmain:
+ {
+ r3 = ##2105344 // r3 = 0x202000 (prefetch info)
+ r4 = lsr(r2, #3) // r4 = number of 32-byte blocks remaining.
+ p0 = cmp.eq(r2, #0)
+ if (p0.new) jumpr:nt r31
+ }
+ {
+ r3 = or(r3, r4)
+ loop0(.Lepilog, r2)
+ }
+ l2fetch(r1, r3)
+ .falign
+.Lepilog:
+ {
+ r4 = memw(r1++#4)
+ memw(r0++#4) = r4.new
+ } :endloop0
+
+ jumpr r31
+
+.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2
diff --git a/lib/builtins/hexagon/memcpy_likely_aligned.S b/lib/builtins/hexagon/memcpy_likely_aligned.S
new file mode 100644
index 000000000..bbc85c22d
--- /dev/null
+++ b/lib/builtins/hexagon/memcpy_likely_aligned.S
@@ -0,0 +1,64 @@
+//===------------------------- memcopy routines ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
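+/* A rough C model (illustrative): with len >= 32, len % 8 == 0, and both
+ * pointers 8-byte aligned, copy one double-word per iteration; anything
+ * not 8-aligned falls back to plain memcpy:
+ *
+ *   void *copy(void *dst, const void *src, size_t len) {
+ *     if (((uintptr_t)dst | (uintptr_t)src) & 7) return memcpy(dst, src, len);
+ *     uint64_t *d = dst; const uint64_t *s = src;
+ *     for (size_t i = 0; i < len / 8; ++i) d[i] = s[i];
+ *     return dst;
+ *   }
+ */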
+FUNCTION_BEGIN __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
+ {
+ p0 = bitsclr(r1,#7)
+ p0 = bitsclr(r0,#7)
+ if (p0.new) r5:4 = memd(r1)
+ r3 = #-3
+ }
+ {
+ if (!p0) jump .Lmemcpy_call
+ if (p0) memd(r0++#8) = r5:4
+ if (p0) r5:4 = memd(r1+#8)
+ r3 += lsr(r2,#3)
+ }
+ {
+ memd(r0++#8) = r5:4
+ r5:4 = memd(r1+#16)
+ r1 = add(r1,#24)
+ loop0(1f,r3)
+ }
+ .falign
+1:
+ {
+ memd(r0++#8) = r5:4
+ r5:4 = memd(r1++#8)
+ }:endloop0
+ {
+ memd(r0) = r5:4
+ r0 -= add(r2,#-8)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
+
+.Lmemcpy_call:
+#ifdef __PIC__
+ jump memcpy@PLT
+#else
+ jump memcpy
+#endif
+
+ .globl __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes
+ .set __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes, \
+ __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
diff --git a/lib/builtins/hexagon/moddi3.S b/lib/builtins/hexagon/moddi3.S
new file mode 100644
index 000000000..12a0595fe
--- /dev/null
+++ b/lib/builtins/hexagon/moddi3.S
@@ -0,0 +1,83 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_moddi3
+ {
+ p3 = tstbit(r1,#31)
+ }
+ {
+ r1:0 = abs(r1:0)
+ r3:2 = abs(r3:2)
+ }
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jump .hexagon_moddi3_return // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+
+.hexagon_moddi3_return:
+ {
+ r1:0 = neg(r3:2)
+ }
+ {
+ r1:0 = vmux(p3,r1:0,r3:2)
+ jumpr r31
+ }
+FUNCTION_END __hexagon_moddi3
+
+ .globl __qdsp_moddi3
+ .set __qdsp_moddi3, __hexagon_moddi3
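+
+// Comment-only C model of the restoring shift-and-subtract loop above:
+// compute the remainder on magnitudes, then give it the sign of the
+// dividend, matching C's % semantics. Division by zero is undefined here,
+// as in the assembly.
+//
+//   long long moddi3_model(long long a, long long b) {  // illustrative name
+//     unsigned long long ua = a < 0 ? -(unsigned long long)a : a;
+//     unsigned long long ub = b < 0 ? -(unsigned long long)b : b;
+//     unsigned long long r = ua;
+//     if (ua >= ub) {
+//       int s = __builtin_clzll(ub) - __builtin_clzll(ua);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((ub << s) <= r) r -= ub << s;
+//     }
+//     return a < 0 ? -(long long)r : (long long)r;
+//   }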
diff --git a/lib/builtins/hexagon/modsi3.S b/lib/builtins/hexagon/modsi3.S
new file mode 100644
index 000000000..5afda9e29
--- /dev/null
+++ b/lib/builtins/hexagon/modsi3.S
@@ -0,0 +1,66 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_modsi3
+  {
+    p2 = cmp.ge(r0,#0)              // remember the sign of the dividend
+    r2 = abs(r0)                    // |dividend|
+    r1 = abs(r1)                    // |divisor|
+  }
+  {
+    r3 = cl0(r2)                    // count leading 0's of |dividend|
+    r4 = cl0(r1)                    // count leading 0's of |divisor|
+    p0 = cmp.gtu(r1,r2)             // divisor > dividend?
+  }
+  {
+    r3 = sub(r4,r3)                 // left shift count to align divisor
+    if (p0) jumpr r31               // divisor > dividend: dividend is remainder
+  }
+  {
+    p1 = cmp.eq(r3,#0)              // shift==0? then zero r1 for the last step
+    loop0(1f,r3)                    // one trial subtraction per bit
+    r0 = r2                         // working remainder
+    r2 = lsl(r1,r3)                 // divisor aligned under dividend msb
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r0)             // shifted divisor > remainder?
+    if (!p0.new) r0 = sub(r0,r2)    // if not, subtract it
+    r2 = lsr(r2,#1)                 // move to the next lower bit
+    if (p1) r1 = #0
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r0)             // final trial subtraction
+    if (!p0.new) r0 = sub(r0,r1)
+    if (p2) jumpr r31               // dividend was non-negative: done
+  }
+  {
+    r0 = neg(r0)                    // remainder takes the dividend's sign
+    jumpr r31
+  }
+FUNCTION_END __hexagon_modsi3
+
+ .globl __qdsp_modsi3
+ .set __qdsp_modsi3, __hexagon_modsi3
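+
+// Comment-only C model of the 32-bit signed remainder above; as in
+// __hexagon_moddi3, the sign of the result follows the dividend.
+//
+//   int modsi3_model(int a, int b) {                // illustrative name
+//     unsigned ua = a < 0 ? -(unsigned)a : a;
+//     unsigned ub = b < 0 ? -(unsigned)b : b;
+//     unsigned r = ua;
+//     if (ua >= ub) {
+//       int s = __builtin_clz(ub) - __builtin_clz(ua);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((ub << s) <= r) r -= ub << s;
+//     }
+//     return a < 0 ? -(int)r : (int)r;
+//   }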
diff --git a/lib/builtins/hexagon/sfdiv_opt.S b/lib/builtins/hexagon/sfdiv_opt.S
new file mode 100644
index 000000000..6bdd4808c
--- /dev/null
+++ b/lib/builtins/hexagon/sfdiv_opt.S
@@ -0,0 +1,66 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+
+FUNCTION_BEGIN __hexagon_divsf3
+  {
+    r2,p0 = sfrecipa(r0,r1)         // reciprocal seed for the denominator
+    r4 = sffixupd(r0,r1)            // fixed-up denominator d
+    r3 = ##0x3f800000               // 1.0
+  }
+  {
+    r5 = sffixupn(r0,r1)            // fixed-up numerator n
+    r3 -= sfmpy(r4,r2):lib          // e0 = 1.0 - d*recip (estimate error)
+    r6 = ##0x80000000
+    r7 = r3                         // save 1.0 (packet reads the old r3)
+  }
+  {
+    r2 += sfmpy(r3,r2):lib          // recip += e0*recip (Newton step 1)
+    r3 = r7                         // r3 = 1.0 again
+    r6 = r5                         // r6 = n (old r6 was the sign mask)
+    r0 = and(r6,r5)                 // r0 = sign bit of n (seed quotient)
+  }
+  {
+    r3 -= sfmpy(r4,r2):lib          // e1 = 1.0 - d*recip
+    r0 += sfmpy(r5,r2):lib          // w = n*recip (quotient estimate)
+  }
+  {
+    r2 += sfmpy(r3,r2):lib          // recip += e1*recip (Newton step 2)
+    r6 -= sfmpy(r0,r4):lib          // rho = n - w*d (residual)
+  }
+  {
+    r0 += sfmpy(r6,r2):lib          // w += rho*recip (correction 1)
+  }
+  {
+    r5 -= sfmpy(r0,r4):lib          // rho = n - w*d again
+  }
+  {
+    r0 += sfmpy(r5,r2,p0):scale     // final correction, rescale, fix specials
+    jumpr r31
+  }
+FUNCTION_END __hexagon_divsf3
+
+Q6_ALIAS(divsf3)
+FAST_ALIAS(divsf3)
+FAST2_ALIAS(divsf3)
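+
+// Comment-only sketch of the refinement strategy above, not the exact FP
+// semantics: the hardware's sfrecipa/sffixup ops and the :lib/:scale
+// multiplies also handle scaling and special cases that plain C cannot
+// show. approx_recip stands in for sfrecipa (illustrative name only).
+//
+//   float divsf3_model(float n, float d) {
+//     float r = approx_recip(d);       // hardware seed, a few bits accurate
+//     r = r + r * (1.0f - d * r);      // Newton step: error e = 1 - d*r
+//     r = r + r * (1.0f - d * r);      // second step doubles the accuracy
+//     float w = n * r;                 // initial quotient estimate
+//     w = w + r * (n - w * d);         // correct with the residual n - w*d
+//     w = w + r * (n - w * d);         // ...and once more for full precision
+//     return w;
+//   }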
diff --git a/lib/builtins/hexagon/sfsqrt_opt.S b/lib/builtins/hexagon/sfsqrt_opt.S
new file mode 100644
index 000000000..7f6190027
--- /dev/null
+++ b/lib/builtins/hexagon/sfsqrt_opt.S
@@ -0,0 +1,82 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+#define RIN r0
+#define S r0
+#define H r1
+#define D r2
+#define E r3
+#define HALF r4
+#define R r5
+
+#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
+#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
+#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
+
+FUNCTION_BEGIN __hexagon_sqrtf
+ {
+ E,p0 = sfinvsqrta(RIN)
+ R = sffixupr(RIN)
+ HALF = ##0x3f000000 // 0.5
+ r1:0 = combine(#0,#0) // clear S/H
+ }
+ {
+ S += sfmpy(E,R):lib // S0
+ H += sfmpy(E,HALF):lib // H0
+ D = HALF
+ E = R
+ }
+ {
+ D -= sfmpy(S,H):lib // d0
+ p1 = sfclass(R,#1) // is zero?
+ //E -= sfmpy(S,S):lib // e0
+ }
+ {
+ S += sfmpy(S,D):lib // S1
+ H += sfmpy(H,D):lib // H1
+ D = HALF
+ E = R
+ }
+ {
+        D -= sfmpy(S,H):lib            // d1
+        E -= sfmpy(S,S):lib            // e1
+ }
+ {
+ S += sfmpy(H,E):lib // S2
+ H += sfmpy(H,D):lib // H2
+ D = HALF
+ E = R
+ }
+ {
+ //D -= sfmpy(S,H):lib // d2
+ E -= sfmpy(S,S):lib // e2
+ if (p1) r0 = or(r0,R) // sqrt(-0.0) = -0.0
+ }
+ {
+ S += sfmpy(H,E,p0):scale // S3
+ jumpr r31
+ }
+
+FUNCTION_END __hexagon_sqrtf
+
+Q6_ALIAS(sqrtf)
+FAST_ALIAS(sqrtf)
+FAST2_ALIAS(sqrtf)
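+
+// Comment-only sketch of the iteration above. S converges to sqrt(x) and H
+// to 1/(2*sqrt(x)); each step uses d = 0.5 - S*H and e = x - S*S as
+// correction terms. approx_rsqrt stands in for sfinvsqrta (illustrative
+// name only), and the :lib/:scale multiplies handle rounding and special
+// cases (zero, negative, infinity) that plain C does not show.
+//
+//   float sqrtf_model(float x) {
+//     float e = approx_rsqrt(x);       // seed for 1/sqrt(x)
+//     float S = x * e;                 // sqrt estimate
+//     float H = 0.5f * e;              // half-reciprocal estimate
+//     float d = 0.5f - S * H;
+//     S += S * d;  H += H * d;         // first refinement
+//     d = 0.5f - S * H;
+//     float r = x - S * S;
+//     S += H * r;  H += H * d;         // second refinement
+//     r = x - S * S;
+//     return S + H * r;                // final correction
+//   }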
diff --git a/lib/builtins/hexagon/udivdi3.S b/lib/builtins/hexagon/udivdi3.S
new file mode 100644
index 000000000..1ca326b75
--- /dev/null
+++ b/lib/builtins/hexagon/udivdi3.S
@@ -0,0 +1,71 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivdi3
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jumpr r31 // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+ {
+ jumpr r31 // return
+ }
+FUNCTION_END __hexagon_udivdi3
+
+ .globl __qdsp_udivdi3
+ .set __qdsp_udivdi3, __hexagon_udivdi3
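+
+// Comment-only C model of the loop above: align the divisor under the
+// dividend's msb, then do one trial subtraction per bit, accumulating
+// quotient bits. Division by zero is undefined, as in the assembly.
+//
+//   unsigned long long udivdi3_model(unsigned long long a,   // illustrative
+//                                    unsigned long long b) { // name
+//     unsigned long long q = 0, r = a;
+//     if (a >= b) {
+//       int s = __builtin_clzll(b) - __builtin_clzll(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= r) { r -= b << s; q |= 1ULL << s; }
+//     }
+//     return q;
+//   }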
diff --git a/lib/builtins/hexagon/udivmoddi4.S b/lib/builtins/hexagon/udivmoddi4.S
new file mode 100644
index 000000000..deb5aae09
--- /dev/null
+++ b/lib/builtins/hexagon/udivmoddi4.S
@@ -0,0 +1,71 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivmoddi4
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jumpr r31 // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+ {
+ jumpr r31 // return
+ }
+FUNCTION_END __hexagon_udivmoddi4
+
+ .globl __qdsp_udivmoddi4
+ .set __qdsp_udivmoddi4, __hexagon_udivmoddi4
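+
+// Comment-only model: the same loop as __hexagon_udivdi3, but both results
+// are kept; as written, the routine above leaves the quotient in r1:0 and
+// the remainder in r3:2.
+//
+//   void udivmoddi4_model(unsigned long long a, unsigned long long b,
+//                         unsigned long long *q,      // illustrative name
+//                         unsigned long long *r) {
+//     *q = 0; *r = a;
+//     if (a >= b) {
+//       int s = __builtin_clzll(b) - __builtin_clzll(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= *r) { *r -= b << s; *q |= 1ULL << s; }
+//     }
+//   }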
diff --git a/lib/builtins/hexagon/udivmodsi4.S b/lib/builtins/hexagon/udivmodsi4.S
new file mode 100644
index 000000000..25bbe7cd5
--- /dev/null
+++ b/lib/builtins/hexagon/udivmodsi4.S
@@ -0,0 +1,60 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivmodsi4
+  {
+    r2 = cl0(r0)                    // count leading 0's of dividend
+    r3 = cl0(r1)                    // count leading 0's of divisor
+    r5:4 = combine(#1,#0)           // bit = 1, scratch 0
+    p0 = cmp.gtu(r1,r0)             // divisor > dividend?
+  }
+  {
+    r6 = sub(r3,r2)                 // left shift count to align divisor
+    r4 = r1                         // keep a copy of the divisor
+    r1:0 = combine(r0,r4)           // remainder = dividend, quotient = 0
+    if (p0) jumpr r31               // divisor > dividend: q=0, r=dividend
+  }
+  {
+    r3:2 = vlslw(r5:4,r6)           // r3 = bit<<shift, r2 = divisor<<shift
+    loop0(1f,r6)                    // one trial subtraction per bit
+    p0 = cmp.eq(r6,#0)
+    if (p0.new) r4 = #0             // shift==0: make the final step a no-op
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r1)             // shifted divisor > remainder?
+    if (!p0.new) r1 = sub(r1,r2)    // if not, subtract it...
+    if (!p0.new) r0 = add(r0,r3)    // ...and set the quotient bit
+    r3:2 = vlsrw(r3:2,#1)           // move to the next lower bit
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r1)             // final trial subtraction
+    if (!p0.new) r1 = sub(r1,r4)
+    if (!p0.new) r0 = add(r0,r3)
+    jumpr r31                       // quotient in r0, remainder in r1
+  }
+FUNCTION_END __hexagon_udivmodsi4
+
+ .globl __qdsp_udivmodsi4
+ .set __qdsp_udivmodsi4, __hexagon_udivmodsi4
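+
+// Comment-only model: the 32-bit divide with both results; the routine
+// above returns the quotient in r0 and the remainder in r1.
+//
+//   void udivmodsi4_model(unsigned a, unsigned b,     // illustrative name
+//                         unsigned *q, unsigned *r) {
+//     *q = 0; *r = a;
+//     if (a >= b) {
+//       int s = __builtin_clz(b) - __builtin_clz(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= *r) { *r -= b << s; *q |= 1u << s; }
+//     }
+//   }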
diff --git a/lib/builtins/hexagon/udivsi3.S b/lib/builtins/hexagon/udivsi3.S
new file mode 100644
index 000000000..54f0aa409
--- /dev/null
+++ b/lib/builtins/hexagon/udivsi3.S
@@ -0,0 +1,56 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_udivsi3
+  {
+    r2 = cl0(r0)                    // count leading 0's of dividend
+    r3 = cl0(r1)                    // count leading 0's of divisor
+    r5:4 = combine(#1,#0)           // bit = 1, scratch 0
+    p0 = cmp.gtu(r1,r0)             // divisor > dividend?
+  }
+  {
+    r6 = sub(r3,r2)                 // left shift count to align divisor
+    r4 = r1                         // keep a copy of the divisor
+    r1:0 = combine(r0,r4)           // remainder = dividend, quotient = 0
+    if (p0) jumpr r31               // divisor > dividend: quotient is 0
+  }
+  {
+    r3:2 = vlslw(r5:4,r6)           // r3 = bit<<shift, r2 = divisor<<shift
+    loop0(1f,r6)                    // one trial subtraction per bit
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r1)             // shifted divisor > remainder?
+    if (!p0.new) r1 = sub(r1,r2)    // if not, subtract it...
+    if (!p0.new) r0 = add(r0,r3)    // ...and set the quotient bit
+    r3:2 = vlsrw(r3:2,#1)           // move to the next lower bit
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r1)             // final trial: set the last quotient bit
+    if (!p0.new) r0 = add(r0,r3)
+    jumpr r31                       // quotient in r0
+  }
+FUNCTION_END __hexagon_udivsi3
+
+ .globl __qdsp_udivsi3
+ .set __qdsp_udivsi3, __hexagon_udivsi3
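+
+// Comment-only C model of the quotient-only 32-bit loop above.
+//
+//   unsigned udivsi3_model(unsigned a, unsigned b) {  // illustrative name
+//     unsigned q = 0, r = a;
+//     if (a >= b) {
+//       int s = __builtin_clz(b) - __builtin_clz(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= r) { r -= b << s; q |= 1u << s; }
+//     }
+//     return q;
+//   }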
diff --git a/lib/builtins/hexagon/umoddi3.S b/lib/builtins/hexagon/umoddi3.S
new file mode 100644
index 000000000..f09152141
--- /dev/null
+++ b/lib/builtins/hexagon/umoddi3.S
@@ -0,0 +1,74 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_umoddi3
+ {
+ r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
+ r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
+ r5:4 = r3:2 // divisor moved into working registers
+ r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
+ }
+ {
+ r10 = sub(r7,r6) // left shift count for bit & divisor
+ r1:0 = #0 // initialize quotient to 0
+ r15:14 = #1 // initialize bit to 1
+ }
+ {
+ r11 = add(r10,#1) // loop count is 1 more than shift count
+ r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
+ r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
+ }
+ {
+ p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
+ loop0(1f,r11) // register loop
+ }
+ {
+ if (p0) jump .hexagon_umoddi3_return // if divisor > dividend, we're done, so return
+ }
+ .falign
+1:
+ {
+ p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
+ }
+ {
+ r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
+ r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
+ }
+ {
+ r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
+ r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
+ }
+ {
+ r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
+ r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
+ }:endloop0
+
+.hexagon_umoddi3_return:
+ {
+ r1:0 = r3:2
+ jumpr r31
+ }
+FUNCTION_END __hexagon_umoddi3
+
+ .globl __qdsp_umoddi3
+ .set __qdsp_umoddi3, __hexagon_umoddi3
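+
+// Comment-only C model: the same loop as __hexagon_udivdi3, but the
+// remainder is returned instead of the quotient.
+//
+//   unsigned long long umoddi3_model(unsigned long long a,   // illustrative
+//                                    unsigned long long b) { // name
+//     unsigned long long r = a;
+//     if (a >= b) {
+//       int s = __builtin_clzll(b) - __builtin_clzll(a);
+//       for (; s >= 0; --s)                   // keep only the remainder
+//         if ((b << s) <= r) r -= b << s;
+//     }
+//     return r;
+//   }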
diff --git a/lib/builtins/hexagon/umodsi3.S b/lib/builtins/hexagon/umodsi3.S
new file mode 100644
index 000000000..a8270c203
--- /dev/null
+++ b/lib/builtins/hexagon/umodsi3.S
@@ -0,0 +1,55 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
+
+FUNCTION_BEGIN __hexagon_umodsi3
+  {
+    r2 = cl0(r0)                    // count leading 0's of dividend
+    r3 = cl0(r1)                    // count leading 0's of divisor
+    p0 = cmp.gtu(r1,r0)             // divisor > dividend?
+  }
+  {
+    r2 = sub(r3,r2)                 // left shift count to align divisor
+    if (p0) jumpr r31               // divisor > dividend: dividend is remainder
+  }
+  {
+    loop0(1f,r2)                    // one trial subtraction per bit
+    p1 = cmp.eq(r2,#0)              // shift==0? then zero r1 for the last step
+    r2 = lsl(r1,r2)                 // divisor aligned under dividend msb
+  }
+  .falign
+1:
+  {
+    p0 = cmp.gtu(r2,r0)             // shifted divisor > remainder?
+    if (!p0.new) r0 = sub(r0,r2)    // if not, subtract it
+    r2 = lsr(r2,#1)                 // move to the next lower bit
+    if (p1) r1 = #0
+  }:endloop0
+  {
+    p0 = cmp.gtu(r2,r0)             // final trial subtraction
+    if (!p0.new) r0 = sub(r0,r1)
+    jumpr r31                       // remainder in r0
+  }
+FUNCTION_END __hexagon_umodsi3
+
+ .globl __qdsp_umodsi3
+ .set __qdsp_umodsi3, __hexagon_umodsi3
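+
+// Comment-only C model of the 32-bit unsigned remainder above.
+//
+//   unsigned umodsi3_model(unsigned a, unsigned b) {  // illustrative name
+//     unsigned r = a;
+//     if (a >= b) {
+//       int s = __builtin_clz(b) - __builtin_clz(a);
+//       for (; s >= 0; --s)                   // one trial per bit, msb first
+//         if ((b << s) <= r) r -= b << s;
+//     }
+//     return r;
+//   }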