Refactored register allocator and Compiler (#249)

Refactored build system macros (ASMJIT_BUILD_STATIC -> ASMJIT_STATIC).
Refactored AVX512 broadcast {1toN} - moved from the instruction to the memory operand.
Refactored naming - renamed getters to drop the 'get' prefix (see the migration sketch below).
Refactored code structure - moved arch-specific code into the x86 namespace.
Refactored some compiler/arch-specific macros; respect the rel/abs option in mov REG, [ADDR].
Refactored StringBuilder (renamed to String, added small-string optimization).
Refactored LabelId<->LabelEntry mapping, forced label offsets to 64 bits on all architectures.
Renamed Runtime to Target (JitRuntime kept for now).
Renamed VirtMemManager to JitAllocator.
Renamed VirtMem to JitUtils.
Renamed FuncSignatureX to FuncSignatureBuilder.
Fixed xchg [mem], rex-lo encoding; refactored RelocEntry.
Fixed Logger to always show abs|rel when formatting a memory operand.
Fixed Logger to prefix hex numbers with 0x.
Fixed Support::ctzGeneric to always return uint32_t regardless of T.
Fixed LightCall to not save MMX and K registers.
Fixed CpuInfo constructor to propagate NoInit (#243).
Added VAES, AVX512_VBMI2, AVX512_VNNI, and AVX512_BITALG CPU features and instructions.
Added Emscripten support (asmjit can now be compiled with Emscripten).
Added asmjit.natvis for a better MSVC debugging experience.
Added x86::ptr_abs and x86::ptr_rel memory-operand helpers (see the sketch below).
Added support for multi-byte nop r/m (#135).
Added support for 32-bit to 64-bit zero-extended addresses, improved validation of memory addresses, and removed the wrt address type as it will be reworked.
Added support for multiple sections and reworked address-table support (previously known as trampolines).
Added the following x86 modifiers to x86::Emitter - xacquire(), xrelease(), and k(kreg) (see the sketch below).
Added the possibility to use the REP prefix with the RET instruction.
Added the possibility to relocate [rel addr] operands during relocate().
Added variadic function-call support (Compiler), argument duplication (Compiler), and better /dev/shm vs /tmp shared-memory handling (VirtMem).
Removed the imm_u and imm_ptr helpers; imm() can now accept any integer or pointer.
Changed instruction optimization to be disabled by default; it can now be enabled through kOptionOptimizedForSize.
Used default copy construction/assignment to prevent a new kind of warning introduced by GCC 9.
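
A minimal before/after sketch of the renames above (get-prefix removal, the x86 namespace, FuncSignatureT, and the Target/JitRuntime naming), assuming the post-refactor API; the sum() example itself is made up for illustration and is not part of this commit:

#include <asmjit/asmjit.h>

using namespace asmjit;

typedef int (*SumFunc)(int, int);

int main() {
  JitRuntime rt;                    // Runtime was renamed to Target; JitRuntime is kept.
  CodeHolder code;
  code.init(rt.codeInfo());         // was rt.getCodeInfo()

  x86::Compiler cc(&code);          // was asmjit::X86Compiler
  cc.addFunc(FuncSignatureT<int, int, int>(CallConv::kIdHost));

  x86::Gp a = cc.newInt32("a");     // was X86Gp
  x86::Gp b = cc.newInt32("b");
  cc.setArg(0, a);
  cc.setArg(1, b);
  cc.add(a, b);
  cc.ret(a);
  cc.endFunc();
  cc.finalize();

  SumFunc sum;
  if (rt.add(&sum, &code) != kErrorOk)  // add the generated code to the runtime
    return 1;

  int result = sum(20, 22);         // should return 42
  rt.release(sum);
  return result == 42 ? 0 : 1;
}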
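
The new emitter modifiers and the relaxed imm() helper, sketched on an x86::Assembler; emitExamples() and the specific instructions are illustrative only, and the HLE/AVX-512 forms obviously require a target that supports them:

#include <asmjit/asmjit.h>

using namespace asmjit;

static void emitExamples(x86::Assembler& a, void* someVar) {
  // HLE hints as instruction options on a locked RMW instruction.
  a.xacquire().lock().add(x86::dword_ptr(x86::rdi), 1);
  a.xrelease().lock().sub(x86::dword_ptr(x86::rdi), 1);

  // AVX-512 write-mask applied through the k(kreg) modifier.
  a.k(x86::k1).vaddpd(x86::zmm0, x86::zmm1, x86::zmm2);

  // imm() now accepts any integer or pointer, so imm_u/imm_ptr are gone.
  a.mov(x86::rax, imm(someVar));
  a.mov(x86::ecx, imm(42));
}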
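
A sketch of the new absolute vs. RIP-relative addressing helpers; the exact ptr_abs()/ptr_rel() signatures (base address plus optional offset) are assumed here rather than taken from this commit:

#include <asmjit/asmjit.h>
#include <stdint.h>

using namespace asmjit;

static void emitAddressing(x86::Assembler& a, uint64_t dataAddress) {
  // Force absolute addressing - formatted as [abs ...] by the Logger.
  a.mov(x86::eax, x86::ptr_abs(dataAddress));

  // Force RIP-relative addressing - formatted as [rel ...] and fixed up
  // during relocate() as mentioned above.
  a.mov(x86::ecx, x86::ptr_rel(dataAddress));
}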
Author: Petr Kobalicek (committed by GitHub)
Date:   2019-07-16 01:24:22 +02:00
Parent: 761130b1d8
Commit: 5d40561d14
196 changed files with 65058 additions and 56743 deletions


@@ -1,14 +1,12 @@
 // [AsmJit]
-// Complete x86/x64 JIT and Remote Assembler for C++.
+// Machine Code Generation for C++.
 //
 // [License]
 // Zlib - See LICENSE.md file in the package.
-// [Guard]
 #ifndef _ASMJIT_TEST_MISC_H
 #define _ASMJIT_TEST_MISC_H
-// [Dependencies]
 #include "./asmjit.h"
 namespace asmtest {
@@ -16,26 +14,20 @@ namespace asmtest {
 // Generate a typical alpha blend function using SSE2 instruction set. Used
 // for benchmarking and also in test86. The generated code should be stable
 // and fully functional.
-static void generateAlphaBlend(asmjit::X86Compiler& cc) {
+static void generateAlphaBlend(asmjit::x86::Compiler& cc) {
   using namespace asmjit;
   using namespace asmjit::x86;
-  X86Gp dst = cc.newIntPtr("dst");
-  X86Gp src = cc.newIntPtr("src");
+  Gp dst = cc.newIntPtr("dst");
+  Gp src = cc.newIntPtr("src");
-  X86Gp i = cc.newIntPtr("i");
-  X86Gp j = cc.newIntPtr("j");
-  X86Gp t = cc.newIntPtr("t");
+  Gp i = cc.newIntPtr("i");
+  Gp j = cc.newIntPtr("j");
+  Gp t = cc.newIntPtr("t");
-  X86Xmm x0 = cc.newXmm("x0");
-  X86Xmm x1 = cc.newXmm("x1");
-  X86Xmm y0 = cc.newXmm("y0");
-  X86Xmm a0 = cc.newXmm("a0");
-  X86Xmm a1 = cc.newXmm("a1");
-  X86Xmm cZero = cc.newXmm("cZero");
-  X86Xmm cMul255A = cc.newXmm("cMul255A");
-  X86Xmm cMul255M = cc.newXmm("cMul255M");
+  Xmm vzero = cc.newXmm("vzero");
+  Xmm v0080 = cc.newXmm("v0080");
+  Xmm v0101 = cc.newXmm("v0101");
   Label L_SmallLoop = cc.newLabel();
   Label L_SmallEnd = cc.newLabel();
@@ -43,73 +35,68 @@ static void generateAlphaBlend(asmjit::X86Compiler& cc) {
   Label L_LargeEnd = cc.newLabel();
   Label L_DataPool = cc.newLabel();
-  cc.addFunc(FuncSignature3<void, void*, const void*, size_t>(cc.getCodeInfo().getCdeclCallConv()));
+  cc.addFunc(FuncSignatureT<void, void*, const void*, size_t>(cc.codeInfo().cdeclCallConv()));
   cc.setArg(0, dst);
   cc.setArg(1, src);
   cc.setArg(2, i);
-  cc.alloc(dst);
-  cc.alloc(src);
-  cc.alloc(i);
   // How many pixels have to be processed to make the loop aligned.
-  cc.lea(t, ptr(L_DataPool));
+  cc.lea(t, x86::ptr(L_DataPool));
+  cc.xorps(vzero, vzero);
+  cc.movaps(v0080, x86::ptr(t, 0));
+  cc.movaps(v0101, x86::ptr(t, 16));
   cc.xor_(j, j);
-  cc.xorps(cZero, cZero);
   cc.sub(j, dst);
-  cc.movaps(cMul255A, ptr(t, 0));
   cc.and_(j, 15);
-  cc.movaps(cMul255M, ptr(t, 16));
   cc.shr(j, 2);
   cc.jz(L_SmallEnd);
-  // j = min(i, j).
   cc.cmp(j, i);
-  cc.cmovg(j, i);
-  // i -= j.
-  cc.sub(i, j);
+  cc.cmovg(j, i); // j = min(i, j).
+  cc.sub(i, j); // i -= j.
   // Small loop.
   cc.bind(L_SmallLoop);
+  {
+    Xmm x0 = cc.newXmm("x0");
+    Xmm y0 = cc.newXmm("y0");
+    Xmm a0 = cc.newXmm("a0");
-  cc.pcmpeqb(a0, a0);
-  cc.movd(y0, ptr(src));
+    cc.movd(y0, x86::ptr(src));
+    cc.movd(x0, x86::ptr(dst));
-  cc.pxor(a0, y0);
-  cc.movd(x0, ptr(dst));
+    cc.pcmpeqb(a0, a0);
+    cc.pxor(a0, y0);
-  cc.psrlw(a0, 8);
+    cc.punpcklbw(x0, vzero);
+    cc.psrlw(a0, 8);
-  cc.punpcklbw(x0, cZero);
+    cc.pshuflw(a0, a0, x86::Predicate::shuf(1, 1, 1, 1));
+    cc.punpcklbw(y0, vzero);
-  cc.pshuflw(a0, a0, x86::shufImm(1, 1, 1, 1));
-  cc.punpcklbw(y0, cZero);
+    cc.pmullw(x0, a0);
+    cc.paddsw(x0, v0080);
+    cc.pmulhuw(x0, v0101);
-  cc.pmullw(x0, a0);
-  cc.paddsw(x0, cMul255A);
-  cc.pmulhuw(x0, cMul255M);
-  cc.paddw(x0, y0);
-  cc.packuswb(x0, x0);
+    cc.paddw(x0, y0);
+    cc.packuswb(x0, x0);
+    cc.movd(x86::ptr(dst), x0);
-  cc.movd(ptr(dst), x0);
-  cc.add(dst, 4);
-  cc.add(src, 4);
+    cc.add(dst, 4);
+    cc.add(src, 4);
-  cc.dec(j);
-  cc.jnz(L_SmallLoop);
+    cc.dec(j);
+    cc.jnz(L_SmallLoop);
+  }
   // Second section, prepare for an aligned loop.
   cc.bind(L_SmallEnd);
   cc.test(i, i);
   cc.mov(j, i);
-  cc.jz(cc.getFunc()->getExitLabel());
+  cc.jz(cc.func()->exitLabel());
   cc.and_(j, 3);
   cc.shr(i, 2);
@@ -117,45 +104,52 @@ static void generateAlphaBlend(asmjit::X86Compiler& cc) {
   // Aligned loop.
   cc.bind(L_LargeLoop);
+  {
+    Xmm x0 = cc.newXmm("x0");
+    Xmm x1 = cc.newXmm("x1");
+    Xmm y0 = cc.newXmm("y0");
+    Xmm a0 = cc.newXmm("a0");
+    Xmm a1 = cc.newXmm("a1");
-  cc.movups(y0, ptr(src));
-  cc.pcmpeqb(a0, a0);
-  cc.movaps(x0, ptr(dst));
+    cc.movups(y0, x86::ptr(src));
+    cc.movaps(x0, x86::ptr(dst));
-  cc.xorps(a0, y0);
-  cc.movaps(x1, x0);
+    cc.pcmpeqb(a0, a0);
+    cc.xorps(a0, y0);
+    cc.movaps(x1, x0);
-  cc.psrlw(a0, 8);
-  cc.punpcklbw(x0, cZero);
+    cc.psrlw(a0, 8);
+    cc.punpcklbw(x0, vzero);
-  cc.movaps(a1, a0);
-  cc.punpcklwd(a0, a0);
+    cc.movaps(a1, a0);
+    cc.punpcklwd(a0, a0);
-  cc.punpckhbw(x1, cZero);
-  cc.punpckhwd(a1, a1);
+    cc.punpckhbw(x1, vzero);
+    cc.punpckhwd(a1, a1);
-  cc.pshufd(a0, a0, x86::shufImm(3, 3, 1, 1));
-  cc.pshufd(a1, a1, x86::shufImm(3, 3, 1, 1));
+    cc.pshufd(a0, a0, x86::Predicate::shuf(3, 3, 1, 1));
+    cc.pshufd(a1, a1, x86::Predicate::shuf(3, 3, 1, 1));
-  cc.pmullw(x0, a0);
-  cc.pmullw(x1, a1);
+    cc.pmullw(x0, a0);
+    cc.pmullw(x1, a1);
-  cc.paddsw(x0, cMul255A);
-  cc.paddsw(x1, cMul255A);
+    cc.paddsw(x0, v0080);
+    cc.paddsw(x1, v0080);
-  cc.pmulhuw(x0, cMul255M);
-  cc.pmulhuw(x1, cMul255M);
+    cc.pmulhuw(x0, v0101);
+    cc.pmulhuw(x1, v0101);
-  cc.add(src, 16);
-  cc.packuswb(x0, x1);
+    cc.add(src, 16);
+    cc.packuswb(x0, x1);
-  cc.paddw(x0, y0);
-  cc.movaps(ptr(dst), x0);
+    cc.paddw(x0, y0);
+    cc.movaps(x86::ptr(dst), x0);
-  cc.add(dst, 16);
+    cc.add(dst, 16);
-  cc.dec(i);
-  cc.jnz(L_LargeLoop);
+    cc.dec(i);
+    cc.jnz(L_LargeLoop);
+  }
   cc.bind(L_LargeEnd);
   cc.test(j, j);
@@ -170,7 +164,6 @@ static void generateAlphaBlend(asmjit::X86Compiler& cc) {
   cc.dxmm(Data128::fromI16(0x0101));
 }
-} // asmtest namespace
+} // {asmtest}
-// [Guard]
 #endif // _ASMJIT_TEST_MISC_H