Files
asmjit/src/test/genblend.h
kobalicek 3fcd65cf80 - Moved to cxxtool to generate build.h compiler and platform based definitions.
- Compiler no longer works on its own, it requires Assembler.
- Labels created by Assembler and Compiler now share their IDs, so they can be used nearly interchangeably without weird side-effects and hacks.
- Renamed getError() and setError() to getLastError() and setLastError().
- Renamed compiler nodes to "HL" nodes (preparation for HLStream).
- Renamed FuncConv to CallConv.
- Function calling convention is now part of FuncPrototype.
- Added a possibility to align by inserting zeros (kAlignZero).
- Fixed assertion in X86Compiler that didn't like unhandled function argument(s).
- Added Compiler::embedConstPool() helper, which can be handy if you use your own ConstPool.
- Code refactorization and other minor changes.
- CpuTicks::now() renamed to Utils::getTickCount().
- error.h merged with globals.h.
- Documentation updates related to recent API changes.
2015-12-07 07:34:25 +01:00

181 lines
3.2 KiB
C++

// [AsmJit]
// Complete x86/x64 JIT and Remote Assembler for C++.
//
// [License]
// Zlib - See LICENSE.md file in the package.
// [Guard]
#ifndef _TEST_GENBLEND_H
#define _TEST_GENBLEND_H
// [Dependencies]
#include "../asmjit/asmjit.h"
namespace asmgen {
// Not referenced anywhere in this header. Presumably the number of
// instructions blend() is expected to emit, checked by a test or benchmark
// elsewhere - TODO confirm against its users before changing blend().
enum { kGenBlendInstCount = 65 };
// Generate a typical alpha blend function using SSE2 instruction set. Used
// for benchmarking and also in test86. The generated code should be stable
// and fully functional.
//
// The function is emitted into `c` with the prototype
// `void(void* dst, const void* src, size_t i)` using the host calling
// convention. `i` counts 4-byte units (pixels): the small loop handles one
// unit per iteration (movd, +4), the large loop four units (16 bytes).
//
// Layout of the emitted code:
//   1. Prologue - loads the two 128-bit constants stored at L_Data and
//      computes how many units precede the next 16-byte boundary of `dst`.
//   2. Small loop - one unit per iteration; used both for the leading
//      unaligned part and for the tail (re-entered from L_LargeEnd).
//   3. Large loop - four units per iteration; `dst` is accessed with movaps,
//      `src` with movups.
//   4. Data - the constants, emitted after endFunc() and aligned to 16.
//
// NOTE(review): every call below appends to the compiler stream in order,
// and variables/labels are created in order as well - do not reorder
// statements when editing.
static void blend(asmjit::X86Compiler& c) {
using namespace asmjit;
using namespace asmjit::x86;
// Pointer-sized variables: destination, source, the main counter `i`, a
// secondary counter `j` and a temporary `t` holding the address of L_Data.
X86GpVar dst = c.newIntPtr("dst");
X86GpVar src = c.newIntPtr("src");
X86GpVar i = c.newIntPtr("i");
X86GpVar j = c.newIntPtr("j");
X86GpVar t = c.newIntPtr("t");
// XMM variables: a zero register used for byte->word unpacking and the two
// constants loaded from L_Data (offset 0 and offset 16).
X86XmmVar cZero = c.newXmm("cZero");
X86XmmVar cMul255A = c.newXmm("cMul255A");
X86XmmVar cMul255M = c.newXmm("cMul255M");
// Working registers: x0/x1 hold destination data, y0 source data, a0/a1 the
// per-pixel multiplier derived from the inverted source.
X86XmmVar x0 = c.newXmm("x0");
X86XmmVar x1 = c.newXmm("x1");
X86XmmVar y0 = c.newXmm("y0");
X86XmmVar a0 = c.newXmm("a0");
X86XmmVar a1 = c.newXmm("a1");
Label L_SmallLoop(c);
Label L_SmallEnd(c);
Label L_LargeLoop(c);
Label L_LargeEnd(c);
Label L_Data(c);
c.addFunc(FuncBuilder3<Void, void*, const void*, size_t>(kCallConvHost));
// Bind the three function arguments to dst, src and i.
c.setArg(0, dst);
c.setArg(1, src);
c.setArg(2, i);
// Presumably requests that the argument variables are allocated to
// registers right away - verify against X86Compiler::alloc().
c.alloc(dst);
c.alloc(src);
c.alloc(i);
// How many pixels have to be processed to make the loop aligned.
// j = ((0 - dst) & 15) >> 2, i.e. the distance of dst to the next 16-byte
// boundary expressed in 4-byte units. The constant loads are interleaved
// with this computation.
// NOTE(review): assumes dst is a multiple of 4; otherwise dst is still not
// 16-byte aligned when the movaps-based large loop starts - confirm callers.
c.lea(t, ptr(L_Data));
c.xor_(j, j);
c.xorps(cZero, cZero);
c.sub(j, dst);
c.movaps(cMul255A, ptr(t, 0));
c.and_(j, 15);
c.movaps(cMul255M, ptr(t, 16));
// The jz below consumes the flags produced by shr; dst is already aligned
// when j is zero, so the leading small loop is skipped.
c.shr(j, 2);
c.jz(L_SmallEnd);
// j = min(i, j).
// NOTE(review): cmovg is a signed comparison while i comes from a size_t
// argument. Also, if i is 0 here (j is non-zero on this path) j becomes 0
// and the dec/jnz pair at the end of the small loop decrements it past zero
// instead of terminating - confirm a zero count with unaligned dst cannot
// occur.
c.cmp(j, i);
c.cmovg(j, i);
// i -= j.
c.sub(i, j);
// Small loop.
// Processes one 4-byte unit per iteration, j iterations in total.
c.bind(L_SmallLoop);
// a0 = all ones, then a0 ^= y0 gives the bitwise NOT of the source pixel.
c.pcmpeqb(a0, a0);
c.movd(y0, ptr(src));
c.pxor(a0, y0);
c.movd(x0, ptr(dst));
// Move the high byte of every 16-bit lane of a0 into its low byte and widen
// the destination bytes to 16-bit lanes.
c.psrlw(a0, 8);
c.punpcklbw(x0, cZero);
// Broadcast word 1 of a0 (byte 3 of the inverted source pixel - presumably
// the inverted alpha) to the four low word lanes; widen the source bytes.
c.pshuflw(a0, a0, X86Util::shuffle(1, 1, 1, 1));
c.punpcklbw(y0, cZero);
// x0 = high16((x0 * a0 + cMul255A) * cMul255M); with the 0x0080 / 0x0101
// constants this is presumably the usual rounding approximation of a
// division by 255 - TODO confirm.
c.pmullw(x0, a0);
c.paddsw(x0, cMul255A);
c.pmulhuw(x0, cMul255M);
// Add the widened source and pack back to bytes with unsigned saturation.
c.paddw(x0, y0);
c.packuswb(x0, x0);
c.movd(ptr(dst), x0);
c.add(dst, 4);
c.add(src, 4);
c.dec(j);
c.jnz(L_SmallLoop);
// Second section, prepare for an aligned loop.
c.bind(L_SmallEnd);
// Exit when nothing is left. The mov sits between test and jz but does not
// modify flags, so jz still reflects test(i, i).
c.test(i, i);
c.mov(j, i);
c.jz(c.getFunc()->getExitLabel());
// j = i & 3 (tail units), i = i >> 2 (number of 16-byte iterations); skip
// the large loop when there is no full 16-byte group.
c.and_(j, 3);
c.shr(i, 2);
c.jz(L_LargeEnd);
// Aligned loop.
// Processes 16 bytes (four units) per iteration. x0/a0 carry the low eight
// bytes widened to words, x1/a1 the high eight bytes.
c.bind(L_LargeLoop);
c.movups(y0, ptr(src));
c.pcmpeqb(a0, a0);
c.movaps(x0, ptr(dst));
c.xorps(a0, y0);
c.movaps(x1, x0);
c.psrlw(a0, 8);
c.punpcklbw(x0, cZero);
c.movaps(a1, a0);
c.punpcklwd(a0, a0);
c.punpckhbw(x1, cZero);
c.punpckhwd(a1, a1);
// After the word duplication above, the shuffles presumably replicate the
// upper word of each pixel across that pixel's four word lanes (the same
// multiplier the small loop builds with pshuflw) - verify the argument
// order of X86Util::shuffle().
c.pshufd(a0, a0, X86Util::shuffle(3, 3, 1, 1));
c.pshufd(a1, a1, X86Util::shuffle(3, 3, 1, 1));
c.pmullw(x0, a0);
c.pmullw(x1, a1);
c.paddsw(x0, cMul255A);
c.paddsw(x1, cMul255A);
c.pmulhuw(x0, cMul255M);
c.pmulhuw(x1, cMul255M);
c.add(src, 16);
c.packuswb(x0, x1);
// NOTE(review): unlike the small loop, the source is added after packing,
// using a word-wise add on byte-packed data. This matches a byte-wise add
// only while no byte sum carries into its neighbour - confirm the inputs
// guarantee that (e.g. premultiplied pixels).
c.paddw(x0, y0);
c.movaps(ptr(dst), x0);
c.add(dst, 16);
c.dec(i);
c.jnz(L_LargeLoop);
// Tail: run the small loop again for the remaining j (< 4) units. i is zero
// at this point, so the test at L_SmallEnd then jumps to the exit label.
c.bind(L_LargeEnd);
c.test(j, j);
c.jnz(L_SmallLoop);
c.endFunc();
// Data.
// Two 128-bit constants placed after the function, aligned to 16 bytes so
// the prologue can load them with movaps: words of 0x0080 (cMul255A) and
// words of 0x0101 (cMul255M).
c.align(kAlignData, 16);
c.bind(L_Data);
c.dxmm(Vec128::fromSw(0x0080));
c.dxmm(Vec128::fromSw(0x0101));
}
} // asmgen namespace
// [Guard]
#endif // _TEST_GENBLEND_H