; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX

declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

; Tests showing replacement of variable rotates with immediate splat versions.

define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_rol_v16i32:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprold $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprold $5, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprold $5, %zmm0, %zmm0
; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_rol_v16i32:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprold $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprold $5, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprold $5, %zmm0, %zmm0
; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> zeroinitializer, i16 %x2)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}

define <8 x i64> @test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_rol_v8i64:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprolq $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprolq $5, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprolq $5, %zmm0, %zmm0
; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_rol_v8i64:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprolq $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprolq $5, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprolq $5, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> zeroinitializer, i8 %x2)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}

define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_ror_v16i32:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprord $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprord $5, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprord $5, %zmm0, %zmm0
; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_ror_v16i32:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprord $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprord $5, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprord $5, %zmm0, %zmm0
; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> zeroinitializer, i16 %x2)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}

define <8 x i64> @test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_ror_v8i64:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprorq $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprorq $5, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprorq $5, %zmm0, %zmm0
; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_ror_v8i64:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprorq $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprorq $5, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprorq $5, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> zeroinitializer, i8 %x2)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}

; Tests showing replacement of out-of-bounds variable rotates with in-bounds immediate splat versions.
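; The rotate amount is interpreted modulo the element width, so each splat below
; lowers to an in-range immediate: for i32, 33 -> $1, -1 -> $31 and 94 -> $30
; (mod 32); for i64, 126 -> $62, 65 -> $1 and 127 -> $63 (mod 64). The amount
; vectors are representative values chosen to match the checked immediates.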
define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_bounds_rol_v16i32:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprold $1, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprold $31, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprold $30, %zmm0, %zmm0
; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_bounds_rol_v16i32:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprold $1, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprold $31, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprold $30, %zmm0, %zmm0
; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> zeroinitializer, i16 %x2)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94>, <16 x i32> %x1, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}

define <8 x i64> @test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_bounds_rol_v8i64:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprolq $62, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprolq $1, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprolq $63, %zmm0, %zmm0
; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_bounds_rol_v8i64:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprolq $62, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprolq $1, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprolq $63, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 126, i64 126, i64 126, i64 126, i64 126, i64 126, i64 126, i64 126>, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65>, <8 x i64> zeroinitializer, i8 %x2)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>, <8 x i64> %x1, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}

define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_bounds_ror_v16i32:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprord $1, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprord $31, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprord $30, %zmm0, %zmm0
; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_bounds_ror_v16i32:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprord $1, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprord $31, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprord $30, %zmm0, %zmm0
; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> zeroinitializer, i16 %x2)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94>, <16 x i32> %x1, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}

define <8 x i64> @test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_bounds_ror_v8i64:
; KNL:       # %bb.0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vprorq $62, %zmm0, %zmm1 {%k1}
; KNL-NEXT:    vprorq $1, %zmm0, %zmm2 {%k1} {z}
; KNL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; KNL-NEXT:    vprorq $63, %zmm0, %zmm0
; KNL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: test_splat_bounds_ror_v8i64:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovd %edi, %k1
; SKX-NEXT:    vprorq $62, %zmm0, %zmm1 {%k1}
; SKX-NEXT:    vprorq $1, %zmm0, %zmm2 {%k1} {z}
; SKX-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; SKX-NEXT:    vprorq $63, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 126, i64 126, i64 126, i64 126, i64 126, i64 126, i64 126, i64 126>, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65>, <8 x i64> zeroinitializer, i8 %x2)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>, <8 x i64> %x1, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}

; Constant folding
; We also test with a target shuffle so that this can't be constant folded upon creation; it must
; wait until the target shuffle has been constant folded in combineX86ShufflesRecursively.
; NOTE: The shuffle indices and rotate amounts below are representative constants: any in-range
; indices selecting the splat-1 source and any non-uniform amount vector exercise the same paths.

define <8 x i64> @test_fold_rol_v8i64() {
; CHECK-LABEL: test_fold_rol_v8i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808]
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> <i64 0, i64 1, i64 2, i64 63, i64 65, i64 65534, i64 65535, i64 -1>, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <16 x i32> @test_fold_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_fold_rol_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT:    vprolvd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %res0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res1
}

define <8 x i64> @test_fold_ror_v8i64() {
; CHECK-LABEL: test_fold_ror_v8i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT:    vprorvq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> zeroinitializer, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %res0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res1
}

define <16 x i32> @test_fold_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_fold_ror_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT:    vprorvd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %res0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res1
}
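; For reference, the constant folded in test_fold_rol_v8i64 follows from rotating
; an all-ones <8 x i64> vector: rol(1, 0) == 1, rol(1, 1) == 2, rol(1, 2) == 4 and
; rol(1, 63) == 9223372036854775808 (1 << 63), with out-of-range amounts reduced
; modulo 64: 65 -> 1 gives 2, 65534 -> 62 gives 4611686018427387904 (1 << 62),
; and 65535 and -1 both reduce to 63.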