path: root/test/CodeGen/ARM/vmul.ll
author      Eli Friedman <efriedma@codeaurora.org>    2016-12-16 18:44:08 +0000
committer   Eli Friedman <efriedma@codeaurora.org>    2016-12-16 18:44:08 +0000
commit      59768451f253c5845b2ec5e42c94970285594ffa (patch)
tree        729f237faad188922345cf058a78dc7ab69c7a9a  /test/CodeGen/ARM/vmul.ll
parent      a07f27643c7e5334f2fd5bd6c84c7bde32a759d7 (diff)
[ARM] Add ARMISD::VLD1DUP to match vld1_dup more consistently.
Currently, there are substantial problems forming vld1_dup even if the VDUP survives legalization. The lack of an actual node leads to terrible results: not only can we not form post-increment vld1_dup instructions, but we form scalar pre-increment and post-increment loads which force the loaded value into a GPR. This patch fixes that by combining the vdup+load into an ARMISD node before DAGCombine messes it up.

Also includes a crash fix for vld2_dup (see testcase @vld2dupi8_postinc_variable).

Recommitting with a fix to avoid forming vld1dup if the type of the load doesn't match the type of the vdup (see https://llvm.org/bugs/show_bug.cgi?id=31404).

Differential Revision: https://reviews.llvm.org/D27694

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@289972 91177308-0d34-0410-b5e6-96231b3b80d8
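For illustration only (this IR is not part of the patch, and the function and value names are hypothetical), a minimal sketch of the load-then-splat pattern that the new ARMISD::VLD1DUP combine is meant to catch, so it can be selected as a to-all-lanes load (e.g. a vld1.32 {d16[], d17[]} form) rather than a scalar load into a GPR followed by a VDUP:

define <4 x float> @vld1dup_sketch(float* %p) nounwind {
  ; Load one float and splat it to all four lanes.
  %s = load float, float* %p, align 4
  %v = insertelement <4 x float> undef, float %s, i32 0
  %dup = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %dup
}

As the commit message notes, the combine has to fire before generic DAGCombine rewrites the pattern, and it is skipped when the type of the load does not match the type of the vdup.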
Diffstat (limited to 'test/CodeGen/ARM/vmul.ll')
-rw-r--r--   test/CodeGen/ARM/vmul.ll   25
1 file changed, 19 insertions, 6 deletions
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 0455190b4c9..fcffe175e2b 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -635,13 +635,26 @@ entry:
ret void
}
-define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
-; Look for doing a normal scalar FP load rather than an to-all-lanes load.
-; e.g., "ldr s0, [r2]" rathern than "vld1.32 {d18[], d19[]}, [r2:32]"
-; Then check that the vector multiply has folded the splat to all lanes
-; and used a vector * scalar instruction.
-; CHECK: vldr {{s[0-9]+}}, [r2]
+define void @fmul_splat(<4 x float> * %a, <4 x float>* nocapture %dst, float %tmp) nounwind {
+; Look for a scalar float rather than a splat, then a vector*scalar multiply.
+; CHECK: vmov s0, r2
; CHECK: vmul.f32 q8, q8, d0[0]
+ %tmp5 = load <4 x float>, <4 x float>* %a, align 4
+ %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+ %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+ %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+ %tmp10 = fmul <4 x float> %tmp9, %tmp5
+ store <4 x float> %tmp10, <4 x float>* %dst, align 4
+ ret void
+}
+
+define void @fmul_splat_load(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+; Look for doing a normal scalar FP load rather than a to-all-lanes load,
+; then a vector*scalar multiply.
+; FIXME: Temporarily broken due to splat representation changes.
+; CHECK: vld1.32 {d18[], d19[]}, [r2:32]
+; CHECK: vmul.f32 q8, q9, q8
%tmp = load float, float* %src, align 4
%tmp5 = load <4 x float>, <4 x float>* %a, align 4
%tmp6 = insertelement <4 x float> undef, float %tmp, i32 0