[Bug] Fixed not cloberring YMM|ZMM registers in function calls that preserve only low 128-bits of vector registers

This commit is contained in:
kobalicek
2022-04-10 00:32:57 +02:00
parent 8fdee13aea
commit a4cb51b532
2 changed files with 147 additions and 0 deletions

View File

@@ -836,6 +836,34 @@ Error RALocalAllocator::allocInst(InstNode* node) noexcept {
// STEP 9
// ------
//
// Vector registers can be cloberred partially by invoke - find if that's the case and clobber when necessary.
if (node->isInvoke() && group == RegGroup::kVec) {
const InvokeNode* invokeNode = node->as<InvokeNode>();
RegMask maybeClobberedRegs = invokeNode->detail().callConv().preservedRegs(group) & _curAssignment.assigned(group);
if (maybeClobberedRegs) {
uint32_t saveRestoreVecSize = invokeNode->detail().callConv().saveRestoreRegSize(group);
Support::BitWordIterator<RegMask> it(maybeClobberedRegs);
do {
uint32_t physId = it.next();
uint32_t workId = _curAssignment.physToWorkId(group, physId);
RAWorkReg* workReg = workRegById(workId);
uint32_t virtSize = workReg->virtReg()->virtSize();
if (virtSize > saveRestoreVecSize) {
ASMJIT_PROPAGATE(onSpillReg(group, workId, physId));
}
} while (it.hasNext());
}
}
// STEP 10
// -------
//
// Assign OUT registers.
if (outPending) {

View File

@@ -3819,6 +3819,124 @@ public:
static uint32_t calledFunc(uint32_t x) { return x + 1; }
};
// x86::Compiler - X86Test_FuncCallAVXClobber
// ==========================================
class X86Test_FuncCallAVXClobber : public X86TestCase {
public:
X86Test_FuncCallAVXClobber() : X86TestCase("FuncCallAVXClobber") {}
static void add(TestApp& app) {
const CpuInfo& cpuInfo = CpuInfo::host();
if (cpuInfo.features().x86().hasAVX2() && sizeof(void*) == 8)
app.add(new X86Test_FuncCallAVXClobber());
}
virtual void compile(x86::Compiler& cc) {
FuncNode* mainFunc = cc.addFunc(FuncSignatureT<void, void*, const void*, const void*>(CallConvId::kHost));
mainFunc->frame().setAvxEnabled();
mainFunc->frame().setAvxCleanup();
// We need a Windows calling convention to test this properly also on a non-Windows machine.
FuncNode* helperFunc = cc.newFunc(FuncSignatureT<void, void*, const void*>(CallConvId::kX64Windows));
helperFunc->frame().setAvxEnabled();
helperFunc->frame().setAvxCleanup();
{
size_t i;
x86::Gp dPtr = cc.newIntPtr("dPtr");
x86::Gp aPtr = cc.newIntPtr("aPtr");
x86::Gp bPtr = cc.newIntPtr("bPtr");
x86::Gp tPtr = cc.newIntPtr("tPtr");
x86::Ymm acc[8];
x86::Mem stack = cc.newStack(32, 1, "stack");
mainFunc->setArg(0, dPtr);
mainFunc->setArg(1, aPtr);
mainFunc->setArg(2, bPtr);
cc.lea(tPtr, stack);
for (i = 0; i < 8; i++) {
acc[i] = cc.newYmm("acc%zu", i);
cc.vmovdqu(acc[i], x86::ptr(aPtr));
}
InvokeNode* invokeNode;
cc.invoke(&invokeNode,
helperFunc->label(),
FuncSignatureT<void, void*, const void*>(CallConvId::kX64Windows));
invokeNode->setArg(0, tPtr);
invokeNode->setArg(1, bPtr);
for (i = 1; i < 8; i++) {
cc.vpaddd(acc[0], acc[0], acc[i]);
}
cc.vpaddd(acc[0], acc[0], x86::ptr(tPtr));
cc.vmovdqu(x86::ptr(dPtr), acc[0]);
cc.endFunc();
}
{
cc.addFunc(helperFunc);
x86::Gp dPtr = cc.newIntPtr("dPtr");
x86::Gp aPtr = cc.newIntPtr("aPtr");
helperFunc->setArg(0, dPtr);
helperFunc->setArg(1, aPtr);
x86::Gp tmp = cc.newIntPtr("tmp");
x86::Ymm acc = cc.newYmm("acc");
cc.mov(tmp, 1);
cc.vmovd(acc.xmm(), tmp);
cc.vpbroadcastd(acc, acc.xmm());
cc.vpaddd(acc, acc, x86::ptr(aPtr));
cc.vmovdqu(x86::ptr(dPtr), acc);
cc.endFunc();
}
}
virtual bool run(void* _func, String& result, String& expect) {
typedef void (*Func)(void*, const void*, const void*);
Func func = ptr_as_func<Func>(_func);
size_t i;
static const uint32_t aData[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
static const uint32_t bData[8] = { 6, 3, 5, 9, 1, 8, 7, 2 };
uint32_t resultData[8];
uint32_t expectData[8];
for (i = 0; i < 8; i++)
expectData[i] = aData[i] * 8 + bData[i] + 1;
func(resultData, aData, bData);
result.assign("{");
expect.assign("{");
for (i = 0; i < 8; i++) {
result.appendFormat("%u", resultData[i]);
expect.appendFormat("%u", expectData[i]);
if (i != 7) result.append(", ");
if (i != 7) expect.append(", ");
}
result.append("}");
expect.append("}");
return result == expect;
}
};
// x86::Compiler - X86Test_MiscLocalConstPool
// ==========================================
@@ -4186,6 +4304,7 @@ void compiler_add_x86_tests(TestApp& app) {
app.addT<X86Test_FuncCallMisc4>();
app.addT<X86Test_FuncCallMisc5>();
app.addT<X86Test_FuncCallMisc6>();
app.addT<X86Test_FuncCallAVXClobber>();
// Miscellaneous tests.
app.addT<X86Test_MiscLocalConstPool>();