-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][GlobalISel] Fix legalization for <4 x i1> vector stores. #121185
base: users/aemerson/spr/main.aarch64globalisel-fix-legalization-for-4-x-i1-vector-stores
Are you sure you want to change the base?
Conversation
Created using spr 1.3.5
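@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64
Author: Amara Emerson (aemerson)
Changes
This case is different from the previously handled <8 x i1> case because it triggers a legalization failure in lowerStore() that's intended for scalar code.
It was also triggering incorrect bitcast actions in the AArch64 rules that weren't expecting truncating stores.
With these two fixed, more cases are handled. The generated code is still bad, including some missing load promotion in our combiners that results in dead stores hanging around at the end of codegen. Again, we can fix these in separate changes.
Patch is 23.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121185.diff
3 Files Affected: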
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 7ffd00bf4cd689..239d35c7b1d04f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4117,10 +4117,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
unsigned StoreWidth = MemTy.getSizeInBits();
unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
- if (StoreWidth != StoreSizeInBits) {
- if (SrcTy.isVector())
- return UnableToLegalize;
-
+ if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
// Promote to a byte-sized store with upper bits zero if not
// storing an integral number of bytes. For example, promote
// TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 062e7ace5e724d..80339b5228d23c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -467,7 +467,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(0, p0, 2)
.lowerIfMemSizeNotPow2()
// TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
- .bitcastIf(typeInSet(0, {v4s8}),
+ .bitcastIf(all(typeInSet(0, {v4s8}),
+ LegalityPredicate([=](const LegalityQuery &Query) {
+ return Query.Types[0].getSizeInBits() ==
+ Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ })),
[=](const LegalityQuery &Query) {
const LLT VecTy = Query.Types[0];
return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 1fa96979f45530..cbb90c52835df8 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -154,15 +154,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) {
}
define i4 @convert_to_bitmask4(<4 x i32> %vec) {
-; CHECK-LABEL: convert_to_bitmask4:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: adrp x8, lCPI2_0@PAGE
-; CHECK-NEXT: cmeq.4s v0, v0, #0
-; CHECK-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF]
-; CHECK-NEXT: bic.16b v0, v1, v0
-; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: adrp x8, lCPI2_0@PAGE
+; SDAG-NEXT: cmeq.4s v0, v0, #0
+; SDAG-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF]
+; SDAG-NEXT: bic.16b v0, v1, v0
+; SDAG-NEXT: addv.4s s0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: cmeq.4s v0, v0, #0
+; GISEL-NEXT: mvn.16b v0, v0
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -210,17 +231,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_no_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: adrp x8, lCPI5_0@PAGE
-; CHECK-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF]
-; CHECK-NEXT: shl.4s v0, v0, #31
-; CHECK-NEXT: cmlt.4s v0, v0, #0
-; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask_no_compare:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: and.16b v0, v0, v1
+; SDAG-NEXT: adrp x8, lCPI5_0@PAGE
+; SDAG-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF]
+; SDAG-NEXT: shl.4s v0, v0, #31
+; SDAG-NEXT: cmlt.4s v0, v0, #0
+; SDAG-NEXT: and.16b v0, v0, v1
+; SDAG-NEXT: addv.4s s0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask_no_compare:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: and.16b v0, v0, v1
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp = and <4 x i32> %vec1, %vec2
@@ -230,17 +271,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
}
define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_compare_chain:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: cmeq.4s v2, v0, #0
-; CHECK-NEXT: cmeq.4s v0, v0, v1
-; CHECK-NEXT: adrp x8, lCPI6_0@PAGE
-; CHECK-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF]
-; CHECK-NEXT: bic.16b v0, v0, v2
-; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask_with_compare_chain:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: cmeq.4s v2, v0, #0
+; SDAG-NEXT: cmeq.4s v0, v0, v1
+; SDAG-NEXT: adrp x8, lCPI6_0@PAGE
+; SDAG-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF]
+; SDAG-NEXT: bic.16b v0, v0, v2
+; SDAG-NEXT: and.16b v0, v0, v1
+; SDAG-NEXT: addv.4s s0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_compare_chain:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: cmeq.4s v2, v0, #0
+; GISEL-NEXT: cmeq.4s v0, v0, v1
+; GISEL-NEXT: bic.16b v0, v0, v2
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -251,18 +314,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec
}
define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: cmeq.4s v0, v0, #0
-; CHECK-NEXT: adrp x8, lCPI7_0@PAGE
-; CHECK-NEXT: bic.16b v0, v1, v0
-; CHECK-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF]
-; CHECK-NEXT: shl.4s v0, v0, #31
-; CHECK-NEXT: cmlt.4s v0, v0, #0
-; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: cmeq.4s v0, v0, #0
+; SDAG-NEXT: adrp x8, lCPI7_0@PAGE
+; SDAG-NEXT: bic.16b v0, v1, v0
+; SDAG-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF]
+; SDAG-NEXT: shl.4s v0, v0, #31
+; SDAG-NEXT: cmlt.4s v0, v0, #0
+; SDAG-NEXT: and.16b v0, v0, v1
+; SDAG-NEXT: addv.4s s0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: cmeq.4s v0, v0, #0
+; GISEL-NEXT: bic.16b v0, v1, v0
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -273,30 +357,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
}
define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: cmeq.4s v0, v0, #0
-; CHECK-NEXT: cmeq.4s v1, v1, #0
-; CHECK-NEXT: adrp x8, lCPI8_0@PAGE
-; CHECK-NEXT: movi d2, #0x000000ffffffff
-; CHECK-NEXT: movi d3, #0x00ffffffffffff
-; CHECK-NEXT: bic.16b v0, v1, v0
-; CHECK-NEXT: movi d1, #0xffff0000ffff0000
-; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: orr.8b v0, v0, v2
-; CHECK-NEXT: movi d2, #0x00ffffffff0000
-; CHECK-NEXT: eor.8b v1, v0, v1
-; CHECK-NEXT: eor.8b v0, v0, v2
-; CHECK-NEXT: mov.h v1[2], wzr
-; CHECK-NEXT: orr.8b v0, v0, v3
-; CHECK-NEXT: orr.8b v0, v1, v0
-; CHECK-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF]
-; CHECK-NEXT: shl.4h v0, v0, #15
-; CHECK-NEXT: cmlt.4h v0, v0, #0
-; CHECK-NEXT: and.8b v0, v0, v1
-; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: cmeq.4s v0, v0, #0
+; SDAG-NEXT: cmeq.4s v1, v1, #0
+; SDAG-NEXT: adrp x8, lCPI8_0@PAGE
+; SDAG-NEXT: movi d2, #0x000000ffffffff
+; SDAG-NEXT: movi d3, #0x00ffffffffffff
+; SDAG-NEXT: bic.16b v0, v1, v0
+; SDAG-NEXT: movi d1, #0xffff0000ffff0000
+; SDAG-NEXT: xtn.4h v0, v0
+; SDAG-NEXT: orr.8b v0, v0, v2
+; SDAG-NEXT: movi d2, #0x00ffffffff0000
+; SDAG-NEXT: eor.8b v1, v0, v1
+; SDAG-NEXT: eor.8b v0, v0, v2
+; SDAG-NEXT: mov.h v1[2], wzr
+; SDAG-NEXT: orr.8b v0, v0, v3
+; SDAG-NEXT: orr.8b v0, v1, v0
+; SDAG-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF]
+; SDAG-NEXT: shl.4h v0, v0, #15
+; SDAG-NEXT: cmlt.4h v0, v0, #0
+; SDAG-NEXT: and.8b v0, v0, v1
+; SDAG-NEXT: addv.4h h0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: mov w8, #1 ; =0x1
+; GISEL-NEXT: mov w9, #0 ; =0x0
+; GISEL-NEXT: cmeq.4s v5, v0, #0
+; GISEL-NEXT: fmov s2, w8
+; GISEL-NEXT: fmov s4, w9
+; GISEL-NEXT: cmeq.4s v1, v1, #0
+; GISEL-NEXT: mov.16b v3, v2
+; GISEL-NEXT: mov.16b v0, v4
+; GISEL-NEXT: mov.h v4[1], w8
+; GISEL-NEXT: bic.16b v1, v1, v5
+; GISEL-NEXT: mov.16b v5, v2
+; GISEL-NEXT: mov.h v2[1], w8
+; GISEL-NEXT: mov.h v3[1], w8
+; GISEL-NEXT: mov.h v0[1], w8
+; GISEL-NEXT: mov.h v5[1], w8
+; GISEL-NEXT: mov.h v4[2], w8
+; GISEL-NEXT: xtn.4h v1, v1
+; GISEL-NEXT: mov.h v2[2], w8
+; GISEL-NEXT: mov.h v3[2], w9
+; GISEL-NEXT: mov.h v0[2], w9
+; GISEL-NEXT: mov.h v5[2], w9
+; GISEL-NEXT: mov.h v4[3], w9
+; GISEL-NEXT: mov.h v2[3], w9
+; GISEL-NEXT: mov.h v3[3], w9
+; GISEL-NEXT: mov.h v0[3], w8
+; GISEL-NEXT: mov.h v5[3], w8
+; GISEL-NEXT: orr.8b v1, v1, v3
+; GISEL-NEXT: eor.8b v0, v1, v0
+; GISEL-NEXT: eor.8b v1, v4, v1
+; GISEL-NEXT: and.8b v0, v0, v5
+; GISEL-NEXT: orr.8b v1, v2, v1
+; GISEL-NEXT: orr.8b v0, v0, v1
+; GISEL-NEXT: ushll.4s v0, v0, #0
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -315,18 +451,42 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
}
define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_different_types_in_chain:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: cmeq.4s v1, v1, #0
-; CHECK-NEXT: cmeq.4h v0, v0, #0
-; CHECK-NEXT: adrp x8, lCPI9_0@PAGE
-; CHECK-NEXT: xtn.4h v1, v1
-; CHECK-NEXT: orn.8b v0, v1, v0
-; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF]
-; CHECK-NEXT: and.8b v0, v0, v1
-; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: cmeq.4s v1, v1, #0
+; SDAG-NEXT: cmeq.4h v0, v0, #0
+; SDAG-NEXT: adrp x8, lCPI9_0@PAGE
+; SDAG-NEXT: xtn.4h v1, v1
+; SDAG-NEXT: orn.8b v0, v1, v0
+; SDAG-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF]
+; SDAG-NEXT: and.8b v0, v0, v1
+; SDAG-NEXT: addv.4h h0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: cmeq.4s v1, v1, #0
+; GISEL-NEXT: cmeq.4h v0, v0, #0
+; GISEL-NEXT: xtn.4h v1, v1
+; GISEL-NEXT: orn.8b v0, v1, v0
+; GISEL-NEXT: ushll.4s v0, v0, #0
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer
@@ -426,16 +586,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
}
define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
-; CHECK-LABEL: convert_to_bitmask_4xi8:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: adrp x8, lCPI12_0@PAGE
-; CHECK-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF]
-; CHECK-NEXT: cmeq.4h v0, v0, #0
-; CHECK-NEXT: bic.8b v0, v1, v0
-; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask_4xi8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: bic.4h v0, #255, lsl #8
+; SDAG-NEXT: adrp x8, lCPI12_0@PAGE
+; SDAG-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF]
+; SDAG-NEXT: cmeq.4h v0, v0, #0
+; SDAG-NEXT: bic.8b v0, v1, v0
+; SDAG-NEXT: addv.4h h0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask_4xi8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: mov w8, #0 ; =0x0
+; GISEL-NEXT: uzp1.8b v0, v0, v0
+; GISEL-NEXT: fmov s1, w8
+; GISEL-NEXT: mov.b v1[1], w8
+; GISEL-NEXT: mov.b v1[2], w8
+; GISEL-NEXT: mov.b v1[3], w8
+; GISEL-NEXT: cmeq.8b v0, v0, v1
+; GISEL-NEXT: mvn.8b v0, v0
+; GISEL-NEXT: umov.b w8, v0[0]
+; GISEL-NEXT: umov.b w9, v0[1]
+; GISEL-NEXT: mov.s v1[0], w8
+; GISEL-NEXT: umov.b w8, v0[2]
+; GISEL-NEXT: mov.s v1[1], w9
+; GISEL-NEXT: umov.b w9, v0[3]
+; GISEL-NEXT: mov.s v1[2], w8
+; GISEL-NEXT: mov.s v1[3], w9
+; GISEL-NEXT: mov.s w8, v1[1]
+; GISEL-NEXT: mov.s w9, v1[2]
+; GISEL-NEXT: fmov w11, s1
+; GISEL-NEXT: mov.s w10, v1[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp_result = icmp ne <4 x i8> %vec, zeroinitializer
%bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -461,17 +656,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
}
define i4 @convert_to_bitmask_float(<4 x float> %vec) {
-; CHECK-LABEL: convert_to_bitmask_float:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: fcmgt.4s v1, v0, #0.0
-; CHECK-NEXT: fcmlt.4s v0, v0, #0.0
-; CHECK-NEXT: adrp x8, lCPI14_0@PAGE
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF]
-; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_to_bitmask_float:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: fcmgt.4s v1, v0, #0.0
+; SDAG-NEXT: fcmlt.4s v0, v0, #0.0
+; SDAG-NEXT: adrp x8, lCPI14_0@PAGE
+; SDAG-NEXT: orr.16b v0, v0, v1
+; SDAG-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF]
+; SDAG-NEXT: and.16b v0, v0, v1
+; SDAG-NEXT: addv.4s s0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_to_bitmask_float:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: fcmgt.4s v1, v0, #0.0
+; GISEL-NEXT: fcmlt.4s v0, v0, #0.0
+; GISEL-NEXT: orr.16b v0, v0, v1
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp_result = fcmp one <4 x float> %vec, zeroinitializer
@@ -542,17 +759,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) {
}
define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
-; CHECK-LABEL: convert_legalized_illegal_element_size:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.4s v1, #63, msl #16
-; CHECK-NEXT: adrp x8, lCPI16_0@PAGE
-; CHECK-NEXT: cmtst.4s v0, v0, v1
-; CHECK-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF]
-; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: and.8b v0, v0, v1
-; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: convert_legalized_illegal_element_size:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: movi.4s v1, #63, msl #16
+; SDAG-NEXT: adrp x8, lCPI16_0@PAGE
+; SDAG-NEXT: cmtst.4s v0, v0, v1
+; SDAG-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF]
+; SDAG-NEXT: xtn.4h v0, v0
+; SDAG-NEXT: and.8b v0, v0, v1
+; SDAG-NEXT: addv.4h h0, v0
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: convert_legalized_illegal_element_size:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: sub sp, sp, #16
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: movi.4s v1, #63, msl #16
+; GISEL-NEXT: and.16b v0, v0, v1
+; GISEL-NEXT: cmeq.4s v0, v0, #0
+; GISEL-NEXT: mvn.16b v0, v0
+; GISEL-NEXT: mov.s w8, v0[1]
+; GISEL-NEXT: mov.s w9, v0[2]
+; GISEL-NEXT: fmov w11, s0
+; GISEL-NEXT: mov.s w10, v0[3]
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: bfi w11, w8, #1, #31
+; GISEL-NEXT: and w8, w9, #0x1
+; GISEL-NEXT: and w9, w10, #0x1
+; GISEL-NEXT: orr w8, w11, w8, lsl #2
+; GISEL-NEXT: orr w8, w8, w9, lsl #3
+; GISEL-NEXT: strb w8, [sp, #15]
+; GISEL-NEXT: and w0, w8, #0xff
+; GISEL-NEXT: add sp, sp, #16
+; GISEL-NEXT: ret
%cmp_result = icmp ne <4 x i22> %vec, zeroinit...
[truncated]
|
Depends on #121169 |
Created using spr 1.3.5
Created using spr 1.3.5
This case is different from the previously handled <8 x i1> case because it triggers
a legalization failure in lowerStore() that's intended for scalar code.
It was also triggering incorrect bitcast actions in the AArch64 rules that weren't
expecting truncating stores.
With these two fixed, more cases are handled. The generated code is still bad, including
some missing load promotion in our combiners that results in dead stores hanging
around at the end of codegen. Again, we can fix these in separate changes.