
Commit c3f95e9

[X86] Refine AMX fast register allocation
1 parent: 3b8ec86

2 files changed (+38 -36 lines)


llvm/lib/Target/X86/X86PreAMXConfig.cpp (+3 -3)
@@ -161,7 +161,7 @@ bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
   Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
   Value *PalettePos =
       GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos);
-  new StoreInst(PaletteValue, PalettePos, "", Pos);
+  new StoreInst(PaletteValue, PalettePos, Pos);
 
   for (int I = 0, E = Shapes.size() / 2; I < E; I++) {
     Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I);
@@ -175,8 +175,8 @@ bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
     Value *Row = Shapes[I * 2];
     Value *Col = Shapes[I * 2 + 1];
     Row = new TruncInst(Row, I8Ty, "", Pos);
-    new StoreInst(Row, RowPos, "", Pos);
-    new StoreInst(Col, ColPos, "", Pos);
+    new StoreInst(Row, RowPos, Pos);
+    new StoreInst(Col, ColPos, Pos);
     Write = true;
   }
   return Write;

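Why dropping the "" argument matters: StoreInst has no name parameter (a store produces no value), so the string literal decays to const char* and then converts to bool, silently selecting the StoreInst(Value *, Value *, bool isVolatile, Instruction *) overload with isVolatile == true. Every tile-configuration store was therefore emitted as a volatile store; the three-argument form used after this change creates an ordinary one. A minimal standalone sketch of the pitfall (assuming LLVM headers and libraries are available; the module, function, and value names are illustrative, not from the commit):

// "" -> const char* -> bool, so the four-argument call selects the
// StoreInst constructor with isVolatile == true.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  Instruction *Ret = ReturnInst::Create(Ctx, BB);

  Value *Ptr = new AllocaInst(Type::getInt8Ty(Ctx), /*AddrSpace=*/0, "p", Ret);
  Value *One = ConstantInt::get(Type::getInt8Ty(Ctx), 1);

  new StoreInst(One, Ptr, "", Ret); // old form: prints "store volatile i8 1, ..."
  new StoreInst(One, Ptr, Ret);     // fixed form: prints "store i8 1, ..."

  M.print(outs(), nullptr); // dump the IR so the volatile marker is visible
  return 0;
}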
llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll (+35 -33)
@@ -25,117 +25,119 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) l
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
 ; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
-; CHECK: if.then: ; preds = %entry
+
+; CHECK: if.then:
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
 ; CHECK-NEXT: br label %if.end
-; CHECK: if.else:
+
+; CHECK: if.else:
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
 ; CHECK-NEXT: br label %if.end
-; CHECK: if.end: ; preds = %if.else, %if.then
+; CHECK: if.end: ; preds = %if.else, %if.then
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: %amx.tmm.1.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 49
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 18
 ; CHECK-NEXT: %amx.tmm.1.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.1.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 8, i16* %amx.tmm.1.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.1.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 8, i16* %amx.tmm.1.shape.col{{.*}}, align 2
 ; CHECK-NEXT: %amx.tmm.2.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 50
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 20
 ; CHECK-NEXT: %amx.tmm.2.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.2.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.2.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.2.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.2.shape.col{{.*}}, align 2
 ; CHECK-NEXT: %amx.tmm.3.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 51
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 22
 ; CHECK-NEXT: %amx.tmm.3.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.3.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.3.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.3.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.3.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64)
 ; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64)
@@ -145,13 +147,13 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) l
 ; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8*
 ; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0
-; CHECK-NEXT: store volatile i8 1, i8* %{{[0-9]+}}, align 1
+; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1
 ; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16
 ; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16*
 ; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8
-; CHECK-NEXT: store volatile i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
-; CHECK-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
+; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1
+; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2
 ; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}})
 ; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
 ; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}})

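Every CHECK update above is the same mechanical change: store volatile becomes a plain store, pinning down that the pass no longer marks tile-configuration stores volatile (and so no longer blocks later passes from moving or merging them). To assert the same property programmatically rather than through FileCheck, a small helper could scan a function for volatile stores; a hypothetical sketch, not part of this commit:

#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true if any store in F is volatile, the regression the updated
// CHECK lines guard against.
static bool hasVolatileStore(Function &F) {
  for (Instruction &I : instructions(F))
    if (auto *SI = dyn_cast<StoreInst>(&I))
      if (SI->isVolatile())
        return true;
  return false;
}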