// This file is part of AsmJit project <https://asmjit.com>
//
// See <asmjit/core.h> or LICENSE.md for license and copyright information
// SPDX-License-Identifier: Zlib

#include <asmjit/core.h>

#if !defined(ASMJIT_NO_X86)
#include <asmjit/x86.h>

#include <limits>
#include <stdio.h>
#include <string.h>

#include "asmjit_bench_codegen.h"
#include "../tests/asmjit_test_misc.h"

using namespace asmjit;

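// Selects which operand form the generators emit: register-only operands
// (kReg) or forms that take a memory operand where available (kMem).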
enum class InstForm {
  kReg,
  kMem
};

// Generates a long sequence of GP instructions.
template<typename Emitter>
static void generate_gp_sequence_internal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& a, const x86::Gp& b, const x86::Gp& c, const x86::Gp& d) {

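  // Seed all four registers with known constants so the generated
  // instructions below always have initialized inputs.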
  cc.mov(a, 0xAAAAAAAA);
  cc.mov(b, 0xBBBBBBBB);
  cc.mov(c, 0xCCCCCCCC);
  cc.mov(d, 0xFFFFFFFF);

  if (form == InstForm::kReg) {
    cc.adc(a, b);
    cc.adc(b, c);
    cc.adc(c, d);
    cc.add(a, b);
    cc.add(b, c);
    cc.add(c, d);
    cc.and_(a, b);
    cc.and_(b, c);
    cc.and_(c, d);
    cc.bsf(a, b);
    cc.bsf(b, c);
    cc.bsf(c, d);
    cc.bsr(a, b);
    cc.bsr(b, c);
    cc.bsr(c, d);
    cc.bswap(a);
    cc.bswap(b);
    cc.bswap(c);
    cc.bt(a, b);
    cc.bt(b, c);
    cc.bt(c, d);
    cc.btc(a, b);
    cc.btc(b, c);
    cc.btc(c, d);
    cc.btr(a, b);
    cc.btr(b, c);
    cc.btr(c, d);
    cc.bts(a, b);
    cc.bts(b, c);
    cc.bts(c, d);
    cc.cmp(a, b);
    cc.cmovc(a, b);
    cc.cmp(b, c);
    cc.cmovc(b, c);
    cc.cmp(c, d);
    cc.cmovc(c, d);
    cc.dec(a);
    cc.dec(b);
    cc.dec(c);
    cc.imul(a, b);
    cc.imul(b, c);
    cc.imul(c, d);
    cc.movsx(a, b.r8_lo());
    cc.movsx(b, c.r8_lo());
    cc.movsx(c, d.r8_lo());
    cc.movzx(a, b.r8_lo());
    cc.movzx(b, c.r8_lo());
    cc.movzx(c, d.r8_lo());
    cc.neg(a);
    cc.neg(b);
    cc.neg(c);
    cc.not_(a);
    cc.not_(b);
    cc.not_(c);
    cc.or_(a, b);
    cc.or_(b, c);
    cc.or_(c, d);
    cc.sbb(a, b);
    cc.sbb(b, c);
    cc.sbb(c, d);
    cc.sub(a, b);
    cc.sub(b, c);
    cc.sub(c, d);
    cc.test(a, b);
    cc.test(b, c);
    cc.test(c, d);
    cc.xchg(a, b);
    cc.xchg(b, c);
    cc.xchg(c, d);
    cc.xor_(a, b);
    cc.xor_(b, c);
    cc.xor_(c, d);

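    // Non-immediate shift and rotate counts must be in CL, which is why `c`
    // serves as the count operand in the whole group below.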
    cc.rcl(a, c.r8_lo());
    cc.rcl(b, c.r8_lo());
    cc.rcl(d, c.r8_lo());
    cc.rcr(a, c.r8_lo());
    cc.rcr(b, c.r8_lo());
    cc.rcr(d, c.r8_lo());
    cc.rol(a, c.r8_lo());
    cc.rol(b, c.r8_lo());
    cc.rol(d, c.r8_lo());
    cc.ror(a, c.r8_lo());
    cc.ror(b, c.r8_lo());
    cc.ror(d, c.r8_lo());
    cc.shl(a, c.r8_lo());
    cc.shl(b, c.r8_lo());
    cc.shl(d, c.r8_lo());
    cc.shr(a, c.r8_lo());
    cc.shr(b, c.r8_lo());
    cc.shr(d, c.r8_lo());
    cc.sar(a, c.r8_lo());
    cc.sar(b, c.r8_lo());
    cc.sar(d, c.r8_lo());
    cc.shld(a, b, c.r8_lo());
    cc.shld(b, d, c.r8_lo());
    cc.shld(d, a, c.r8_lo());
    cc.shrd(a, b, c.r8_lo());
    cc.shrd(b, d, c.r8_lo());
    cc.shrd(d, a, c.r8_lo());

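    // ADX (adcx/adox), BMI1, BMI2, LZCNT, and POPCNT forms.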
    cc.adcx(a, b);
    cc.adox(a, b);
    cc.adcx(b, c);
    cc.adox(b, c);
    cc.adcx(c, d);
    cc.adox(c, d);
    cc.andn(a, b, c);
    cc.andn(b, c, d);
    cc.andn(c, d, a);
    cc.bextr(a, b, c);
    cc.bextr(b, c, d);
    cc.bextr(c, d, a);
    cc.blsi(a, b);
    cc.blsi(b, c);
    cc.blsi(c, d);
    cc.blsmsk(a, b);
    cc.blsmsk(b, c);
    cc.blsmsk(c, d);
    cc.blsr(a, b);
    cc.blsr(b, c);
    cc.blsr(c, d);
    cc.bzhi(a, b, c);
    cc.bzhi(b, c, d);
    cc.bzhi(c, d, a);
    cc.lzcnt(a, b);
    cc.lzcnt(b, c);
    cc.lzcnt(c, d);
    cc.pdep(a, b, c);
    cc.pdep(b, c, d);
    cc.pdep(c, d, a);
    cc.pext(a, b, c);
    cc.pext(b, c, d);
    cc.pext(c, d, a);
    cc.popcnt(a, b);
    cc.popcnt(b, c);
    cc.popcnt(c, d);
    cc.rorx(a, b, 8);
    cc.rorx(b, c, 8);
    cc.rorx(c, d, 8);
    cc.sarx(a, b, c);
    cc.sarx(b, c, d);
    cc.sarx(c, d, a);
    cc.shlx(a, b, c);
    cc.shlx(b, c, d);
    cc.shlx(c, d, a);
    cc.shrx(a, b, c);
    cc.shrx(b, c, d);
    cc.shrx(c, d, a);
    cc.tzcnt(a, b);
    cc.tzcnt(b, c);
    cc.tzcnt(c, d);
  }
  else {
    uint32_t register_size = cc.register_size();
    x86::Mem m = x86::ptr(c, 0, register_size);
    x86::Mem m8 = x86::byte_ptr(c);

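    // The same sequence again, now with one operand in memory.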
    cc.adc(a, m);
    cc.adc(b, m);
    cc.adc(c, m);
    cc.add(a, m);
    cc.add(b, m);
    cc.add(c, m);
    cc.and_(a, m);
    cc.and_(b, m);
    cc.and_(c, m);
    cc.bsf(a, m);
    cc.bsf(b, m);
    cc.bsf(c, m);
    cc.bsr(a, m);
    cc.bsr(b, m);
    cc.bsr(c, m);
    cc.bt(m, a);
    cc.bt(m, b);
    cc.bt(m, c);
    cc.btc(m, a);
    cc.btc(m, b);
    cc.btc(m, c);
    cc.btr(m, a);
    cc.btr(m, b);
    cc.btr(m, c);
    cc.bts(m, a);
    cc.bts(m, b);
    cc.bts(m, c);
    cc.cmp(a, m);
    cc.cmovc(a, m);
    cc.cmp(b, m);
    cc.cmovc(b, m);
    cc.cmp(c, m);
    cc.cmovc(c, m);
    cc.dec(m);
    cc.movsx(a, m8);
    cc.movsx(b, m8);
    cc.movsx(c, m8);
    cc.movzx(a, m8);
    cc.movzx(b, m8);
    cc.movzx(c, m8);
    cc.neg(m);
    cc.not_(m);
    cc.or_(a, m);
    cc.or_(b, m);
    cc.or_(c, m);
    cc.sbb(a, m);
    cc.sbb(b, m);
    cc.sbb(c, m);
    cc.sub(a, m);
    cc.sub(b, m);
    cc.sub(c, m);
    cc.test(m, a);
    cc.test(m, b);
    cc.test(m, c);
    cc.xchg(a, m);
    cc.xchg(b, m);
    cc.xchg(c, m);
    cc.xor_(a, m);
    cc.xor_(b, m);
    cc.xor_(c, m);

    cc.rcl(m, c.r8_lo());
    cc.rcr(m, c.r8_lo());
    cc.rol(m, c.r8_lo());
    cc.ror(m, c.r8_lo());
    cc.shl(m, c.r8_lo());
    cc.shr(m, c.r8_lo());
    cc.sar(m, c.r8_lo());
    cc.shld(m, b, c.r8_lo());
    cc.shld(m, d, c.r8_lo());
    cc.shld(m, a, c.r8_lo());
    cc.shrd(m, b, c.r8_lo());
    cc.shrd(m, d, c.r8_lo());
    cc.shrd(m, a, c.r8_lo());

    cc.adcx(a, m);
    cc.adox(a, m);
    cc.adcx(b, m);
    cc.adox(b, m);
    cc.adcx(c, m);
    cc.adox(c, m);
    cc.andn(a, b, m);
    cc.andn(b, c, m);
    cc.andn(c, d, m);
    cc.bextr(a, m, c);
    cc.bextr(b, m, d);
    cc.bextr(c, m, a);
    cc.blsi(a, m);
    cc.blsi(b, m);
    cc.blsi(c, m);
    cc.blsmsk(a, m);
    cc.blsmsk(b, m);
    cc.blsmsk(c, m);
    cc.blsr(a, m);
    cc.blsr(b, m);
    cc.blsr(c, m);
    cc.bzhi(a, m, c);
    cc.bzhi(b, m, d);
    cc.bzhi(c, m, a);
    cc.lzcnt(a, m);
    cc.lzcnt(b, m);
    cc.lzcnt(c, m);
    cc.pdep(a, b, m);
    cc.pdep(b, c, m);
    cc.pdep(c, d, m);
    cc.pext(a, b, m);
    cc.pext(b, c, m);
    cc.pext(c, d, m);
    cc.popcnt(a, m);
    cc.popcnt(b, m);
    cc.popcnt(c, m);
    cc.rorx(a, m, 8);
    cc.rorx(b, m, 8);
    cc.rorx(c, m, 8);
    cc.sarx(a, m, c);
    cc.sarx(b, m, d);
    cc.sarx(c, m, a);
    cc.shlx(a, m, c);
    cc.shlx(b, m, d);
    cc.shlx(c, m, a);
    cc.shrx(a, m, c);
    cc.shrx(b, m, d);
    cc.shrx(c, m, a);
    cc.tzcnt(a, m);
    cc.tzcnt(b, m);
    cc.tzcnt(c, m);
  }
}

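// Generates a minimal function that just returns zero.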
static void generate_empty_function(BaseEmitter& emitter, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp rv = cc.new_gp32("rv");

    cc.add_func(FuncSignature::build<uint32_t>());
    cc.mov(rv, 0);
    cc.ret(rv);
    cc.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    Gp rv = eax;

    if (emit_prolog_epilog) {
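      // FuncDetail derives calling-convention details from the signature and
      // target environment; FuncFrame then computes the stack frame layout so
      // an explicit prolog/epilog can be emitted (unlike the Compiler, the
      // Builder and Assembler do not create these automatically).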
      FuncDetail func;
      func.init(FuncSignature::build<uint32_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(rv);
      frame.finalize();

      cc.emit_prolog(frame);
      cc.mov(rv, 0);
      cc.emit_epilog(frame);
    }
    else {
      cc.mov(rv, 0);
      cc.ret();
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    Gp rv = eax;

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<uint32_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(rv);
      frame.finalize();

      cc.emit_prolog(frame);
      cc.mov(rv, 0);
      cc.emit_epilog(frame);
    }
    else {
      cc.mov(rv, 0);
      cc.ret();
    }

    return;
  }
}

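// Generates a chain of `ops` dependent ALU operations, emitted in groups of
// four, so the emitted count is `ops` rounded up to a multiple of four.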
static void generate_n_ops_sequence(BaseEmitter& emitter, uint32_t ops, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp ra = cc.new_gp32("ra");
    Gp rb = cc.new_gp32("rb");
    Gp rc = cc.new_gp32("rc");
    Gp rd = cc.new_gp32("rd");

    FuncNode* f = cc.add_func(FuncSignature::build<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>());
    f->set_arg(0, ra);
    f->set_arg(1, rb);
    f->set_arg(2, rc);
    f->set_arg(3, rd);

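    // Each iteration emits four ops that all depend on `ra`, forming one
    // serial dependency chain.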
    for (uint32_t i = 0; i < ops; i += 4) {
      cc.add(ra, rb);
      cc.imul(ra, rc);
      cc.sub(ra, rd);
      cc.imul(ra, rc);
    }

    cc.ret(ra);
    cc.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    Gp ra = eax;
    Gp rb = ebx;
    Gp rc = ecx;
    Gp rd = edx;

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(ra, rb, rc, rd);
      frame.finalize();

      cc.emit_prolog(frame);
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb);
        cc.imul(ra, rc);
        cc.sub(ra, rd);
        cc.imul(ra, rc);
      }
      cc.emit_epilog(frame);
    }
    else {
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb);
        cc.imul(ra, rc);
        cc.sub(ra, rd);
        cc.imul(ra, rc);
      }
      cc.ret();
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    Gp ra = eax;
    Gp rb = ebx;
    Gp rc = ecx;
    Gp rd = edx;

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(ra, rb, rc, rd);
      frame.finalize();

      cc.emit_prolog(frame);
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb);
        cc.imul(ra, rc);
        cc.sub(ra, rd);
        cc.imul(ra, rc);
      }
      cc.emit_epilog(frame);
    }
    else {
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb);
        cc.imul(ra, rc);
        cc.sub(ra, rd);
        cc.imul(ra, rc);
      }
      cc.ret();
    }

    return;
  }
}

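// Dispatches the GP sequence to the Compiler, Builder, or Assembler
// implementation, depending on the emitter's type.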
static void generate_gp_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp a = cc.new_gp_ptr("a");
    Gp b = cc.new_gp_ptr("b");
    Gp c = cc.new_gp_ptr("c");
    Gp d = cc.new_gp_ptr("d");

    cc.add_func(FuncSignature::build<void>());
    generate_gp_sequence_internal(cc, form, a, b, c, d);
    cc.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    x86::Gp a = cc.zax();
    x86::Gp b = cc.zbx();
    x86::Gp c = cc.zcx();
    x86::Gp d = cc.zdx();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(a, b, c, d);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_gp_sequence_internal(cc, form, a, b, c, d);
      cc.emit_epilog(frame);
    }
    else {
      generate_gp_sequence_internal(cc, form, a, b, c, d);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    x86::Gp a = cc.zax();
    x86::Gp b = cc.zbx();
    x86::Gp c = cc.zcx();
    x86::Gp d = cc.zdx();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(a, b, c, d);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_gp_sequence_internal(cc, form, a, b, c, d);
      cc.emit_epilog(frame);
    }
    else {
      generate_gp_sequence_internal(cc, form, a, b, c, d);
    }

    return;
  }
}

// Generates a long sequence of SSE instructions.
template<typename Emitter>
static void generate_sse_sequence_internal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::Vec& xmm_a, const x86::Vec& xmm_b, const x86::Vec& xmm_c, const x86::Vec& xmm_d) {

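  // Pick a pointer-sized register for addressing: the 32-bit view on x86,
  // the 64-bit view on x64.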
  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  cc.xor_(gpd, gpd);
  cc.xorps(xmm_a, xmm_a);
  cc.xorps(xmm_b, xmm_b);
  cc.xorps(xmm_c, xmm_c);
  cc.xorps(xmm_d, xmm_d);

  if (form == InstForm::kReg) {
    // SSE.
    cc.addps(xmm_a, xmm_b);
    cc.addss(xmm_a, xmm_b);
    cc.andnps(xmm_a, xmm_b);
    cc.andps(xmm_a, xmm_b);
    cc.cmpps(xmm_a, xmm_b, 0);
    cc.cmpss(xmm_a, xmm_b, 0);
    cc.comiss(xmm_a, xmm_b);
    cc.cvtsi2ss(xmm_a, gpd);
    cc.cvtsi2ss(xmm_a, gpz);
    cc.cvtss2si(gpd, xmm_b);
    cc.cvtss2si(gpz, xmm_b);
    cc.cvttss2si(gpd, xmm_b);
    cc.cvttss2si(gpz, xmm_b);
    cc.divps(xmm_a, xmm_b);
    cc.divss(xmm_a, xmm_b);
    cc.maxps(xmm_a, xmm_b);
    cc.maxss(xmm_a, xmm_b);
    cc.minps(xmm_a, xmm_b);
    cc.minss(xmm_a, xmm_b);
    cc.movaps(xmm_a, xmm_b);
    cc.movd(gpd, xmm_b);
    cc.movd(xmm_a, gpd);
    cc.movq(xmm_a, xmm_b);
    cc.movhlps(xmm_a, xmm_b);
    cc.movlhps(xmm_a, xmm_b);
    cc.movups(xmm_a, xmm_b);
    cc.mulps(xmm_a, xmm_b);
    cc.mulss(xmm_a, xmm_b);
    cc.orps(xmm_a, xmm_b);
    cc.rcpps(xmm_a, xmm_b);
    cc.rcpss(xmm_a, xmm_b);
    cc.psadbw(xmm_a, xmm_b);
    cc.rsqrtps(xmm_a, xmm_b);
    cc.rsqrtss(xmm_a, xmm_b);
    cc.sfence();
    cc.shufps(xmm_a, xmm_b, 0);
    cc.sqrtps(xmm_a, xmm_b);
    cc.sqrtss(xmm_a, xmm_b);
    cc.subps(xmm_a, xmm_b);
    cc.subss(xmm_a, xmm_b);
    cc.ucomiss(xmm_a, xmm_b);
    cc.unpckhps(xmm_a, xmm_b);
    cc.unpcklps(xmm_a, xmm_b);
    cc.xorps(xmm_a, xmm_b);

    // SSE2.
    cc.addpd(xmm_a, xmm_b);
    cc.addsd(xmm_a, xmm_b);
    cc.andnpd(xmm_a, xmm_b);
    cc.andpd(xmm_a, xmm_b);
    cc.cmppd(xmm_a, xmm_b, 0);
    cc.cmpsd(xmm_a, xmm_b, 0);
    cc.comisd(xmm_a, xmm_b);
    cc.cvtdq2pd(xmm_a, xmm_b);
    cc.cvtdq2ps(xmm_a, xmm_b);
    cc.cvtpd2dq(xmm_a, xmm_b);
    cc.cvtpd2ps(xmm_a, xmm_b);
    cc.cvtps2dq(xmm_a, xmm_b);
    cc.cvtps2pd(xmm_a, xmm_b);
    cc.cvtsd2si(gpd, xmm_b);
    cc.cvtsd2si(gpz, xmm_b);
    cc.cvtsd2ss(xmm_a, xmm_b);
    cc.cvtsi2sd(xmm_a, gpd);
    cc.cvtsi2sd(xmm_a, gpz);
    cc.cvtss2sd(xmm_a, xmm_b);
    cc.cvtss2si(gpd, xmm_b);
    cc.cvtss2si(gpz, xmm_b);
    cc.cvttpd2dq(xmm_a, xmm_b);
    cc.cvttps2dq(xmm_a, xmm_b);
    cc.cvttsd2si(gpd, xmm_b);
    cc.cvttsd2si(gpz, xmm_b);
    cc.divpd(xmm_a, xmm_b);
    cc.divsd(xmm_a, xmm_b);
    cc.maxpd(xmm_a, xmm_b);
    cc.maxsd(xmm_a, xmm_b);
    cc.minpd(xmm_a, xmm_b);
    cc.minsd(xmm_a, xmm_b);
    cc.movdqa(xmm_a, xmm_b);
    cc.movdqu(xmm_a, xmm_b);
    cc.movmskps(gpd, xmm_b);
    cc.movmskpd(gpd, xmm_b);
    cc.movsd(xmm_a, xmm_b);
    cc.mulpd(xmm_a, xmm_b);
    cc.mulsd(xmm_a, xmm_b);
    cc.orpd(xmm_a, xmm_b);
    cc.packsswb(xmm_a, xmm_b);
    cc.packssdw(xmm_a, xmm_b);
    cc.packuswb(xmm_a, xmm_b);
    cc.paddb(xmm_a, xmm_b);
    cc.paddw(xmm_a, xmm_b);
    cc.paddd(xmm_a, xmm_b);
    cc.paddq(xmm_a, xmm_b);
    cc.paddsb(xmm_a, xmm_b);
    cc.paddsw(xmm_a, xmm_b);
    cc.paddusb(xmm_a, xmm_b);
    cc.paddusw(xmm_a, xmm_b);
    cc.pand(xmm_a, xmm_b);
    cc.pandn(xmm_a, xmm_b);
    cc.pavgb(xmm_a, xmm_b);
    cc.pavgw(xmm_a, xmm_b);
    cc.pcmpeqb(xmm_a, xmm_b);
    cc.pcmpeqw(xmm_a, xmm_b);
    cc.pcmpeqd(xmm_a, xmm_b);
    cc.pcmpgtb(xmm_a, xmm_b);
    cc.pcmpgtw(xmm_a, xmm_b);
    cc.pcmpgtd(xmm_a, xmm_b);
    cc.pmaxsw(xmm_a, xmm_b);
    cc.pmaxub(xmm_a, xmm_b);
    cc.pminsw(xmm_a, xmm_b);
    cc.pminub(xmm_a, xmm_b);
    cc.pmovmskb(gpd, xmm_b);
    cc.pmulhw(xmm_a, xmm_b);
    cc.pmulhuw(xmm_a, xmm_b);
    cc.pmullw(xmm_a, xmm_b);
    cc.pmuludq(xmm_a, xmm_b);
    cc.por(xmm_a, xmm_b);
    cc.pslld(xmm_a, xmm_b);
    cc.pslld(xmm_a, 0);
    cc.psllq(xmm_a, xmm_b);
    cc.psllq(xmm_a, 0);
    cc.psllw(xmm_a, xmm_b);
    cc.psllw(xmm_a, 0);
    cc.pslldq(xmm_a, 0);
    cc.psrad(xmm_a, xmm_b);
    cc.psrad(xmm_a, 0);
    cc.psraw(xmm_a, xmm_b);
    cc.psraw(xmm_a, 0);
    cc.psubb(xmm_a, xmm_b);
    cc.psubw(xmm_a, xmm_b);
    cc.psubd(xmm_a, xmm_b);
    cc.psubq(xmm_a, xmm_b);
    cc.pmaddwd(xmm_a, xmm_b);
    cc.pshufd(xmm_a, xmm_b, 0);
    cc.pshufhw(xmm_a, xmm_b, 0);
    cc.pshuflw(xmm_a, xmm_b, 0);
    cc.psrld(xmm_a, xmm_b);
    cc.psrld(xmm_a, 0);
    cc.psrlq(xmm_a, xmm_b);
    cc.psrlq(xmm_a, 0);
    cc.psrldq(xmm_a, 0);
    cc.psrlw(xmm_a, xmm_b);
    cc.psrlw(xmm_a, 0);
    cc.psubsb(xmm_a, xmm_b);
    cc.psubsw(xmm_a, xmm_b);
    cc.psubusb(xmm_a, xmm_b);
    cc.psubusw(xmm_a, xmm_b);
    cc.punpckhbw(xmm_a, xmm_b);
    cc.punpckhwd(xmm_a, xmm_b);
    cc.punpckhdq(xmm_a, xmm_b);
    cc.punpckhqdq(xmm_a, xmm_b);
    cc.punpcklbw(xmm_a, xmm_b);
    cc.punpcklwd(xmm_a, xmm_b);
    cc.punpckldq(xmm_a, xmm_b);
    cc.punpcklqdq(xmm_a, xmm_b);
    cc.pxor(xmm_a, xmm_b);
    cc.sqrtpd(xmm_a, xmm_b);
    cc.sqrtsd(xmm_a, xmm_b);
    cc.subpd(xmm_a, xmm_b);
    cc.subsd(xmm_a, xmm_b);
    cc.ucomisd(xmm_a, xmm_b);
    cc.unpckhpd(xmm_a, xmm_b);
    cc.unpcklpd(xmm_a, xmm_b);
    cc.xorpd(xmm_a, xmm_b);

    // SSE3.
    cc.addsubpd(xmm_a, xmm_b);
    cc.addsubps(xmm_a, xmm_b);
    cc.haddpd(xmm_a, xmm_b);
    cc.haddps(xmm_a, xmm_b);
    cc.hsubpd(xmm_a, xmm_b);
    cc.hsubps(xmm_a, xmm_b);
    cc.movddup(xmm_a, xmm_b);
    cc.movshdup(xmm_a, xmm_b);
    cc.movsldup(xmm_a, xmm_b);

    // SSSE3.
    cc.psignb(xmm_a, xmm_b);
    cc.psignw(xmm_a, xmm_b);
    cc.psignd(xmm_a, xmm_b);
    cc.phaddw(xmm_a, xmm_b);
    cc.phaddd(xmm_a, xmm_b);
    cc.phaddsw(xmm_a, xmm_b);
    cc.phsubw(xmm_a, xmm_b);
    cc.phsubd(xmm_a, xmm_b);
    cc.phsubsw(xmm_a, xmm_b);
    cc.pmaddubsw(xmm_a, xmm_b);
    cc.pabsb(xmm_a, xmm_b);
    cc.pabsw(xmm_a, xmm_b);
    cc.pabsd(xmm_a, xmm_b);
    cc.pmulhrsw(xmm_a, xmm_b);
    cc.pshufb(xmm_a, xmm_b);
    cc.palignr(xmm_a, xmm_b, 0);

    // SSE4.1.
    cc.blendpd(xmm_a, xmm_b, 0);
    cc.blendps(xmm_a, xmm_b, 0);
    cc.blendvpd(xmm_a, xmm_b, xmm_a);
    cc.blendvps(xmm_a, xmm_b, xmm_a);

    cc.dppd(xmm_a, xmm_b, 0);
    cc.dpps(xmm_a, xmm_b, 0);
    cc.extractps(gpd, xmm_b, 0);
    cc.insertps(xmm_a, xmm_b, 0);
    cc.mpsadbw(xmm_a, xmm_b, 0);
    cc.packusdw(xmm_a, xmm_b);
    cc.pblendvb(xmm_a, xmm_b, xmm_a);
    cc.pblendw(xmm_a, xmm_b, 0);
    cc.pcmpeqq(xmm_a, xmm_b);
    cc.pextrb(gpd, xmm_b, 0);
    cc.pextrd(gpd, xmm_b, 0);
    if (cc.is_64bit()) cc.pextrq(gpq, xmm_b, 0);
    cc.pextrw(gpd, xmm_b, 0);
    cc.phminposuw(xmm_a, xmm_b);
    cc.pinsrb(xmm_a, gpd, 0);
    cc.pinsrd(xmm_a, gpd, 0);
    cc.pinsrw(xmm_a, gpd, 0);
    cc.pmaxuw(xmm_a, xmm_b);
    cc.pmaxsb(xmm_a, xmm_b);
    cc.pmaxsd(xmm_a, xmm_b);
    cc.pmaxud(xmm_a, xmm_b);
    cc.pminsb(xmm_a, xmm_b);
    cc.pminuw(xmm_a, xmm_b);
    cc.pminud(xmm_a, xmm_b);
    cc.pminsd(xmm_a, xmm_b);
    cc.pmovsxbw(xmm_a, xmm_b);
    cc.pmovsxbd(xmm_a, xmm_b);
    cc.pmovsxbq(xmm_a, xmm_b);
    cc.pmovsxwd(xmm_a, xmm_b);
    cc.pmovsxwq(xmm_a, xmm_b);
    cc.pmovsxdq(xmm_a, xmm_b);
    cc.pmovzxbw(xmm_a, xmm_b);
    cc.pmovzxbd(xmm_a, xmm_b);
    cc.pmovzxbq(xmm_a, xmm_b);
    cc.pmovzxwd(xmm_a, xmm_b);
    cc.pmovzxwq(xmm_a, xmm_b);
    cc.pmovzxdq(xmm_a, xmm_b);
    cc.pmuldq(xmm_a, xmm_b);
    cc.pmulld(xmm_a, xmm_b);
    cc.ptest(xmm_a, xmm_b);
    cc.roundps(xmm_a, xmm_b, 0);
    cc.roundss(xmm_a, xmm_b, 0);
    cc.roundpd(xmm_a, xmm_b, 0);
    cc.roundsd(xmm_a, xmm_b, 0);
  }
  else {
    x86::Mem m = x86::ptr(gpz);

    cc.addps(xmm_a, m);
    cc.addss(xmm_a, m);
    cc.andnps(xmm_a, m);
    cc.andps(xmm_a, m);
    cc.cmpps(xmm_a, m, 0);
    cc.cmpss(xmm_a, m, 0);
    cc.comiss(xmm_a, m);
    cc.cvtpi2ps(xmm_a, m);
    cc.cvtsi2ss(xmm_a, m);
    cc.cvtss2si(gpd, m);
    cc.cvtss2si(gpz, m);
    cc.cvttss2si(gpd, m);
    cc.cvttss2si(gpz, m);
    cc.divps(xmm_a, m);
    cc.divss(xmm_a, m);
    cc.maxps(xmm_a, m);
    cc.maxss(xmm_a, m);
    cc.minps(xmm_a, m);
    cc.minss(xmm_a, m);
    cc.movaps(xmm_a, m);
    cc.movaps(m, xmm_b);
    cc.movd(m, xmm_b);
    cc.movd(xmm_a, m);
    cc.movq(m, xmm_b);
    cc.movq(xmm_a, m);
    cc.movhps(xmm_a, m);
    cc.movhps(m, xmm_b);
    cc.movlps(xmm_a, m);
    cc.movlps(m, xmm_b);
    cc.movntps(m, xmm_b);
    cc.movss(xmm_a, m);
    cc.movss(m, xmm_b);
    cc.movups(xmm_a, m);
    cc.movups(m, xmm_b);
    cc.mulps(xmm_a, m);
    cc.mulss(xmm_a, m);
    cc.orps(xmm_a, m);
    cc.rcpps(xmm_a, m);
    cc.rcpss(xmm_a, m);
    cc.psadbw(xmm_a, m);
    cc.rsqrtps(xmm_a, m);
    cc.rsqrtss(xmm_a, m);
    cc.shufps(xmm_a, m, 0);
    cc.sqrtps(xmm_a, m);
    cc.sqrtss(xmm_a, m);
    cc.stmxcsr(m);
    cc.subps(xmm_a, m);
    cc.subss(xmm_a, m);
    cc.ucomiss(xmm_a, m);
    cc.unpckhps(xmm_a, m);
    cc.unpcklps(xmm_a, m);
    cc.xorps(xmm_a, m);

    // SSE2.
    cc.addpd(xmm_a, m);
    cc.addsd(xmm_a, m);
    cc.andnpd(xmm_a, m);
    cc.andpd(xmm_a, m);
    cc.cmppd(xmm_a, m, 0);
    cc.cmpsd(xmm_a, m, 0);
    cc.comisd(xmm_a, m);
    cc.cvtdq2pd(xmm_a, m);
    cc.cvtdq2ps(xmm_a, m);
    cc.cvtpd2dq(xmm_a, m);
    cc.cvtpd2ps(xmm_a, m);
    cc.cvtpi2pd(xmm_a, m);
    cc.cvtps2dq(xmm_a, m);
    cc.cvtps2pd(xmm_a, m);
    cc.cvtsd2si(gpd, m);
    cc.cvtsd2si(gpz, m);
    cc.cvtsd2ss(xmm_a, m);
    cc.cvtsi2sd(xmm_a, m);
    cc.cvtss2sd(xmm_a, m);
    cc.cvtss2si(gpd, m);
    cc.cvtss2si(gpz, m);
    cc.cvttpd2dq(xmm_a, m);
    cc.cvttps2dq(xmm_a, m);
    cc.cvttsd2si(gpd, m);
    cc.cvttsd2si(gpz, m);
    cc.divpd(xmm_a, m);
    cc.divsd(xmm_a, m);
    cc.maxpd(xmm_a, m);
    cc.maxsd(xmm_a, m);
    cc.minpd(xmm_a, m);
    cc.minsd(xmm_a, m);
    cc.movdqa(xmm_a, m);
    cc.movdqa(m, xmm_b);
    cc.movdqu(xmm_a, m);
    cc.movdqu(m, xmm_b);
    cc.movsd(xmm_a, m);
    cc.movsd(m, xmm_b);
    cc.movapd(xmm_a, m);
    cc.movapd(m, xmm_b);
    cc.movhpd(xmm_a, m);
    cc.movhpd(m, xmm_b);
    cc.movlpd(xmm_a, m);
    cc.movlpd(m, xmm_b);
    cc.movntdq(m, xmm_b);
    cc.movntpd(m, xmm_b);
    cc.movupd(xmm_a, m);
    cc.movupd(m, xmm_b);
    cc.mulpd(xmm_a, m);
    cc.mulsd(xmm_a, m);
    cc.orpd(xmm_a, m);
    cc.packsswb(xmm_a, m);
    cc.packssdw(xmm_a, m);
    cc.packuswb(xmm_a, m);
    cc.paddb(xmm_a, m);
    cc.paddw(xmm_a, m);
    cc.paddd(xmm_a, m);
    cc.paddq(xmm_a, m);
    cc.paddsb(xmm_a, m);
    cc.paddsw(xmm_a, m);
    cc.paddusb(xmm_a, m);
    cc.paddusw(xmm_a, m);
    cc.pand(xmm_a, m);
    cc.pandn(xmm_a, m);
    cc.pavgb(xmm_a, m);
    cc.pavgw(xmm_a, m);
    cc.pcmpeqb(xmm_a, m);
    cc.pcmpeqw(xmm_a, m);
    cc.pcmpeqd(xmm_a, m);
    cc.pcmpgtb(xmm_a, m);
    cc.pcmpgtw(xmm_a, m);
    cc.pcmpgtd(xmm_a, m);
    cc.pmaxsw(xmm_a, m);
    cc.pmaxub(xmm_a, m);
    cc.pminsw(xmm_a, m);
    cc.pminub(xmm_a, m);
    cc.pmulhw(xmm_a, m);
    cc.pmulhuw(xmm_a, m);
    cc.pmullw(xmm_a, m);
    cc.pmuludq(xmm_a, m);
    cc.por(xmm_a, m);
    cc.pslld(xmm_a, m);
    cc.psllq(xmm_a, m);
    cc.psllw(xmm_a, m);
    cc.psrad(xmm_a, m);
    cc.psraw(xmm_a, m);
    cc.psubb(xmm_a, m);
    cc.psubw(xmm_a, m);
    cc.psubd(xmm_a, m);
    cc.psubq(xmm_a, m);
    cc.pmaddwd(xmm_a, m);
    cc.pshufd(xmm_a, m, 0);
    cc.pshufhw(xmm_a, m, 0);
    cc.pshuflw(xmm_a, m, 0);
    cc.psrld(xmm_a, m);
    cc.psrlq(xmm_a, m);
    cc.psrlw(xmm_a, m);
    cc.psubsb(xmm_a, m);
    cc.psubsw(xmm_a, m);
    cc.psubusb(xmm_a, m);
    cc.psubusw(xmm_a, m);
    cc.punpckhbw(xmm_a, m);
    cc.punpckhwd(xmm_a, m);
    cc.punpckhdq(xmm_a, m);
    cc.punpckhqdq(xmm_a, m);
    cc.punpcklbw(xmm_a, m);
    cc.punpcklwd(xmm_a, m);
    cc.punpckldq(xmm_a, m);
    cc.punpcklqdq(xmm_a, m);
    cc.pxor(xmm_a, m);
    cc.sqrtpd(xmm_a, m);
    cc.sqrtsd(xmm_a, m);
    cc.subpd(xmm_a, m);
    cc.subsd(xmm_a, m);
    cc.ucomisd(xmm_a, m);
    cc.unpckhpd(xmm_a, m);
    cc.unpcklpd(xmm_a, m);
    cc.xorpd(xmm_a, m);

    // SSE3.
    cc.addsubpd(xmm_a, m);
    cc.addsubps(xmm_a, m);
    cc.haddpd(xmm_a, m);
    cc.haddps(xmm_a, m);
    cc.hsubpd(xmm_a, m);
    cc.hsubps(xmm_a, m);
    cc.lddqu(xmm_a, m);
    cc.movddup(xmm_a, m);
    cc.movshdup(xmm_a, m);
    cc.movsldup(xmm_a, m);

    // SSSE3.
    cc.psignb(xmm_a, m);
    cc.psignw(xmm_a, m);
    cc.psignd(xmm_a, m);
    cc.phaddw(xmm_a, m);
    cc.phaddd(xmm_a, m);
    cc.phaddsw(xmm_a, m);
    cc.phsubw(xmm_a, m);
    cc.phsubd(xmm_a, m);
    cc.phsubsw(xmm_a, m);
    cc.pmaddubsw(xmm_a, m);
    cc.pabsb(xmm_a, m);
    cc.pabsw(xmm_a, m);
    cc.pabsd(xmm_a, m);
    cc.pmulhrsw(xmm_a, m);
    cc.pshufb(xmm_a, m);
    cc.palignr(xmm_a, m, 0);

    // SSE4.1.
    cc.blendpd(xmm_a, m, 0);
    cc.blendps(xmm_a, m, 0);
    cc.blendvpd(xmm_a, m, xmm_a);
    cc.blendvps(xmm_a, m, xmm_a);

    cc.dppd(xmm_a, m, 0);
    cc.dpps(xmm_a, m, 0);
    cc.extractps(m, xmm_b, 0);
    cc.insertps(xmm_a, m, 0);
    cc.movntdqa(xmm_a, m);
    cc.mpsadbw(xmm_a, m, 0);
    cc.packusdw(xmm_a, m);
    cc.pblendvb(xmm_a, m, xmm_a);
    cc.pblendw(xmm_a, m, 0);
    cc.pcmpeqq(xmm_a, m);
    cc.pextrb(m, xmm_b, 0);
    cc.pextrd(m, xmm_b, 0);
    if (cc.is_64bit()) cc.pextrq(m, xmm_b, 0);
    cc.pextrw(m, xmm_b, 0);
    cc.phminposuw(xmm_a, m);
    cc.pinsrb(xmm_a, m, 0);
    cc.pinsrd(xmm_a, m, 0);
    cc.pinsrw(xmm_a, m, 0);
    cc.pmaxuw(xmm_a, m);
    cc.pmaxsb(xmm_a, m);
    cc.pmaxsd(xmm_a, m);
    cc.pmaxud(xmm_a, m);
    cc.pminsb(xmm_a, m);
    cc.pminuw(xmm_a, m);
    cc.pminud(xmm_a, m);
    cc.pminsd(xmm_a, m);
    cc.pmovsxbw(xmm_a, m);
    cc.pmovsxbd(xmm_a, m);
    cc.pmovsxbq(xmm_a, m);
    cc.pmovsxwd(xmm_a, m);
    cc.pmovsxwq(xmm_a, m);
    cc.pmovsxdq(xmm_a, m);
    cc.pmovzxbw(xmm_a, m);
    cc.pmovzxbd(xmm_a, m);
    cc.pmovzxbq(xmm_a, m);
    cc.pmovzxwd(xmm_a, m);
    cc.pmovzxwq(xmm_a, m);
    cc.pmovzxdq(xmm_a, m);
    cc.pmuldq(xmm_a, m);
    cc.pmulld(xmm_a, m);
    cc.ptest(xmm_a, m);
    cc.roundps(xmm_a, m, 0);
    cc.roundss(xmm_a, m, 0);
    cc.roundpd(xmm_a, m, 0);
    cc.roundsd(xmm_a, m, 0);

    // SSE4.2.
    cc.pcmpgtq(xmm_a, m);
  }
}

static void generate_sse_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.new_gpz("gp");
    Vec a = cc.new_xmm("a");
    Vec b = cc.new_xmm("b");
    Vec c = cc.new_xmm("c");
    Vec d = cc.new_xmm("d");

    cc.add_func(FuncSignature::build<void>());
    generate_sse_sequence_internal(cc, form, gp, a, b, c, d);
    cc.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, xmm0, xmm1, xmm2, xmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, xmm0, xmm1, xmm2, xmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
    }

    return;
  }
}

// Generates a long sequence of AVX instructions using only register operands.
template<typename Emitter>
static void generate_avx_sequence_internal_reg_only(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

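  // xmm() and ymm() return 128-bit and 256-bit views of the same underlying
  // vector registers.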
  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_c = vec_c.ymm();

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  cc.vaddpd(xmm_a, xmm_b, xmm_c);
  cc.vaddpd(ymm_a, ymm_b, ymm_c);
  cc.vaddps(xmm_a, xmm_b, xmm_c);
  cc.vaddps(ymm_a, ymm_b, ymm_c);
  cc.vaddsd(xmm_a, xmm_b, xmm_c);
  cc.vaddss(xmm_a, xmm_b, xmm_c);
  cc.vaddsubpd(xmm_a, xmm_b, xmm_c);
  cc.vaddsubpd(ymm_a, ymm_b, ymm_c);
  cc.vaddsubps(xmm_a, xmm_b, xmm_c);
  cc.vaddsubps(ymm_a, ymm_b, ymm_c);
  cc.vandpd(xmm_a, xmm_b, xmm_c);
  cc.vandpd(ymm_a, ymm_b, ymm_c);
  cc.vandps(xmm_a, xmm_b, xmm_c);
  cc.vandps(ymm_a, ymm_b, ymm_c);
  cc.vandnpd(xmm_a, xmm_b, xmm_c);
  cc.vandnpd(ymm_a, ymm_b, ymm_c);
  cc.vandnps(xmm_a, xmm_b, xmm_c);
  cc.vandnps(ymm_a, ymm_b, ymm_c);
  cc.vblendpd(xmm_a, xmm_b, xmm_c, 0);
  cc.vblendpd(ymm_a, ymm_b, ymm_c, 0);
  cc.vblendps(xmm_a, xmm_b, xmm_c, 0);
  cc.vblendps(ymm_a, ymm_b, ymm_c, 0);
  cc.vblendvpd(xmm_a, xmm_b, xmm_c, xmm_a);
  cc.vblendvpd(ymm_a, ymm_b, ymm_c, ymm_a);
  cc.vcmppd(xmm_a, xmm_b, xmm_c, 0);
  cc.vcmppd(ymm_a, ymm_b, ymm_c, 0);
  cc.vcmpps(xmm_a, xmm_b, xmm_c, 0);
  cc.vcmpps(ymm_a, ymm_b, ymm_c, 0);
  cc.vcmpsd(xmm_a, xmm_b, xmm_c, 0);
  cc.vcmpss(xmm_a, xmm_b, xmm_c, 0);
  cc.vcomisd(xmm_a, xmm_b);
  cc.vcomiss(xmm_a, xmm_b);
  cc.vcvtdq2pd(xmm_a, xmm_b);
  cc.vcvtdq2pd(ymm_a, xmm_b);
  cc.vcvtdq2ps(xmm_a, xmm_b);
  cc.vcvtdq2ps(ymm_a, ymm_b);
  cc.vcvtpd2dq(xmm_a, xmm_b);
  cc.vcvtpd2dq(xmm_a, ymm_b);
  cc.vcvtpd2ps(xmm_a, xmm_b);
  cc.vcvtpd2ps(xmm_a, ymm_b);
  cc.vcvtps2dq(xmm_a, xmm_b);
  cc.vcvtps2dq(ymm_a, ymm_b);
  cc.vcvtps2pd(xmm_a, xmm_b);
  cc.vcvtps2pd(ymm_a, xmm_b);
  cc.vcvtsd2si(gpd, xmm_b);
  cc.vcvtsd2si(gpz, xmm_b);
  cc.vcvtsd2ss(xmm_a, xmm_b, xmm_c);
  cc.vcvtsi2sd(xmm_a, xmm_b, gpd);
  cc.vcvtsi2sd(xmm_a, xmm_b, gpz);
  cc.vcvtsi2ss(xmm_a, xmm_b, gpd);
  cc.vcvtsi2ss(xmm_a, xmm_b, gpz);
  cc.vcvtss2sd(xmm_a, xmm_b, xmm_c);
  cc.vcvtss2si(gpd, xmm_b);
  cc.vcvttpd2dq(xmm_a, xmm_b);
  cc.vcvttpd2dq(xmm_a, ymm_b);
  cc.vcvttps2dq(xmm_a, xmm_b);
  cc.vcvttps2dq(ymm_a, ymm_b);
  cc.vcvttsd2si(gpd, xmm_b);
  cc.vcvttss2si(gpz, xmm_b);
  cc.vdivpd(xmm_a, xmm_b, xmm_c);
  cc.vdivpd(ymm_a, ymm_b, ymm_c);
  cc.vdivps(xmm_a, xmm_b, xmm_c);
  cc.vdivps(ymm_a, ymm_b, ymm_c);
  cc.vdivsd(xmm_a, xmm_b, xmm_c);
  cc.vdivss(xmm_a, xmm_b, xmm_c);
  cc.vdppd(xmm_a, xmm_b, xmm_c, 0);
  cc.vdpps(xmm_a, xmm_b, xmm_c, 0);
  cc.vdpps(ymm_a, ymm_b, ymm_c, 0);
  cc.vextractf128(xmm_a, ymm_b, 0);
  cc.vextractps(gpd, xmm_b, 0);
  cc.vhaddpd(xmm_a, xmm_b, xmm_c);
  cc.vhaddpd(ymm_a, ymm_b, ymm_c);
  cc.vhaddps(xmm_a, xmm_b, xmm_c);
  cc.vhaddps(ymm_a, ymm_b, ymm_c);
  cc.vhsubpd(xmm_a, xmm_b, xmm_c);
  cc.vhsubpd(ymm_a, ymm_b, ymm_c);
  cc.vhsubps(xmm_a, xmm_b, xmm_c);
  cc.vhsubps(ymm_a, ymm_b, ymm_c);
  cc.vinsertf128(ymm_a, ymm_b, xmm_c, 0);
  cc.vinsertps(xmm_a, xmm_b, xmm_c, 0);
  cc.vmaxpd(xmm_a, xmm_b, xmm_c);
  cc.vmaxpd(ymm_a, ymm_b, ymm_c);
  cc.vmaxps(xmm_a, xmm_b, xmm_c);
  cc.vmaxps(ymm_a, ymm_b, ymm_c);
  cc.vmaxsd(xmm_a, xmm_b, xmm_c);
  cc.vmaxss(xmm_a, xmm_b, xmm_c);
  cc.vminpd(xmm_a, xmm_b, xmm_c);
  cc.vminpd(ymm_a, ymm_b, ymm_c);
  cc.vminps(xmm_a, xmm_b, xmm_c);
  cc.vminps(ymm_a, ymm_b, ymm_c);
  cc.vminsd(xmm_a, xmm_b, xmm_c);
  cc.vminss(xmm_a, xmm_b, xmm_c);
  cc.vmovapd(xmm_a, xmm_b);
  cc.vmovapd(ymm_a, ymm_b);
  cc.vmovaps(xmm_a, xmm_b);
  cc.vmovaps(ymm_a, ymm_b);
  cc.vmovd(xmm_a, gpd);
  cc.vmovd(gpd, xmm_b);
  cc.vmovddup(xmm_a, xmm_b);
  cc.vmovddup(ymm_a, ymm_b);
  cc.vmovdqa(xmm_a, xmm_b);
  cc.vmovdqa(ymm_a, ymm_b);
  cc.vmovdqu(xmm_a, xmm_b);
  cc.vmovdqu(ymm_a, ymm_b);
  cc.vmovhlps(xmm_a, xmm_b, xmm_c);
  cc.vmovlhps(xmm_a, xmm_b, xmm_c);
  cc.vmovmskpd(gpd, xmm_b);
  cc.vmovmskpd(gpd, ymm_b);
  cc.vmovmskps(gpd, xmm_b);
  cc.vmovmskps(gpd, ymm_b);
  cc.vmovsd(xmm_a, xmm_b, xmm_c);
  cc.vmovshdup(xmm_a, xmm_b);
  cc.vmovshdup(ymm_a, ymm_b);
  cc.vmovsldup(xmm_a, xmm_b);
  cc.vmovsldup(ymm_a, ymm_b);
  cc.vmovss(xmm_a, xmm_b, xmm_c);
  cc.vmovupd(xmm_a, xmm_b);
  cc.vmovupd(ymm_a, ymm_b);
  cc.vmovups(xmm_a, xmm_b);
  cc.vmovups(ymm_a, ymm_b);
  cc.vmpsadbw(xmm_a, xmm_b, xmm_c, 0);
  cc.vmulpd(xmm_a, xmm_b, xmm_c);
  cc.vmulpd(ymm_a, ymm_b, ymm_c);
  cc.vmulps(xmm_a, xmm_b, xmm_c);
  cc.vmulps(ymm_a, ymm_b, ymm_c);
  cc.vmulsd(xmm_a, xmm_b, xmm_c);
  cc.vmulss(xmm_a, xmm_b, xmm_c);
  cc.vorpd(xmm_a, xmm_b, xmm_c);
  cc.vorpd(ymm_a, ymm_b, ymm_c);
  cc.vorps(xmm_a, xmm_b, xmm_c);
  cc.vorps(ymm_a, ymm_b, ymm_c);
  cc.vpabsb(xmm_a, xmm_b);
  cc.vpabsd(xmm_a, xmm_b);
  cc.vpabsw(xmm_a, xmm_b);
  cc.vpackssdw(xmm_a, xmm_b, xmm_c);
  cc.vpacksswb(xmm_a, xmm_b, xmm_c);
  cc.vpackusdw(xmm_a, xmm_b, xmm_c);
  cc.vpackuswb(xmm_a, xmm_b, xmm_c);
  cc.vpaddb(xmm_a, xmm_b, xmm_c);
  cc.vpaddd(xmm_a, xmm_b, xmm_c);
  cc.vpaddq(xmm_a, xmm_b, xmm_c);
  cc.vpaddw(xmm_a, xmm_b, xmm_c);
  cc.vpaddsb(xmm_a, xmm_b, xmm_c);
  cc.vpaddsw(xmm_a, xmm_b, xmm_c);
  cc.vpaddusb(xmm_a, xmm_b, xmm_c);
  cc.vpaddusw(xmm_a, xmm_b, xmm_c);
  cc.vpalignr(xmm_a, xmm_b, xmm_c, 0);
  cc.vpand(xmm_a, xmm_b, xmm_c);
  cc.vpandn(xmm_a, xmm_b, xmm_c);
  cc.vpavgb(xmm_a, xmm_b, xmm_c);
  cc.vpavgw(xmm_a, xmm_b, xmm_c);
  cc.vpblendvb(xmm_a, xmm_b, xmm_c, xmm_a);
  cc.vpblendw(xmm_a, xmm_b, xmm_c, 0);
  cc.vpcmpeqb(xmm_a, xmm_b, xmm_c);
  cc.vpcmpeqd(xmm_a, xmm_b, xmm_c);
  cc.vpcmpeqq(xmm_a, xmm_b, xmm_c);
  cc.vpcmpeqw(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtb(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtd(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtq(xmm_a, xmm_b, xmm_c);
  cc.vpcmpgtw(xmm_a, xmm_b, xmm_c);
  cc.vpermilpd(xmm_a, xmm_b, xmm_c);
  cc.vpermilpd(ymm_a, ymm_b, ymm_c);
  cc.vpermilpd(xmm_a, xmm_b, 0);
  cc.vpermilpd(ymm_a, ymm_b, 0);
  cc.vpermilps(xmm_a, xmm_b, xmm_c);
  cc.vpermilps(ymm_a, ymm_b, ymm_c);
  cc.vpermilps(xmm_a, xmm_b, 0);
  cc.vpermilps(ymm_a, ymm_b, 0);
  cc.vperm2f128(ymm_a, ymm_b, ymm_c, 0);
  cc.vpextrb(gpd, xmm_b, 0);
  cc.vpextrd(gpd, xmm_b, 0);
  if (cc.is_64bit()) cc.vpextrq(gpq, xmm_b, 0);
  cc.vpextrw(gpd, xmm_b, 0);
  cc.vphaddd(xmm_a, xmm_b, xmm_c);
  cc.vphaddsw(xmm_a, xmm_b, xmm_c);
  cc.vphaddw(xmm_a, xmm_b, xmm_c);
  cc.vphminposuw(xmm_a, xmm_b);
  cc.vphsubd(xmm_a, xmm_b, xmm_c);
  cc.vphsubsw(xmm_a, xmm_b, xmm_c);
  cc.vphsubw(xmm_a, xmm_b, xmm_c);
  cc.vpinsrb(xmm_a, xmm_b, gpd, 0);
  cc.vpinsrd(xmm_a, xmm_b, gpd, 0);
  cc.vpinsrw(xmm_a, xmm_b, gpd, 0);
  cc.vpmaddubsw(xmm_a, xmm_b, xmm_c);
  cc.vpmaddwd(xmm_a, xmm_b, xmm_c);
  cc.vpmaxsb(xmm_a, xmm_b, xmm_c);
  cc.vpmaxsd(xmm_a, xmm_b, xmm_c);
  cc.vpmaxsw(xmm_a, xmm_b, xmm_c);
  cc.vpmaxub(xmm_a, xmm_b, xmm_c);
  cc.vpmaxud(xmm_a, xmm_b, xmm_c);
  cc.vpmaxuw(xmm_a, xmm_b, xmm_c);
  cc.vpminsb(xmm_a, xmm_b, xmm_c);
  cc.vpminsd(xmm_a, xmm_b, xmm_c);
  cc.vpminsw(xmm_a, xmm_b, xmm_c);
  cc.vpminub(xmm_a, xmm_b, xmm_c);
  cc.vpminud(xmm_a, xmm_b, xmm_c);
  cc.vpminuw(xmm_a, xmm_b, xmm_c);
  cc.vpmovmskb(gpd, xmm_b);
  cc.vpmovsxbd(xmm_a, xmm_b);
  cc.vpmovsxbq(xmm_a, xmm_b);
  cc.vpmovsxbw(xmm_a, xmm_b);
  cc.vpmovsxdq(xmm_a, xmm_b);
  cc.vpmovsxwd(xmm_a, xmm_b);
  cc.vpmovsxwq(xmm_a, xmm_b);
  cc.vpmovzxbd(xmm_a, xmm_b);
  cc.vpmovzxbq(xmm_a, xmm_b);
  cc.vpmovzxbw(xmm_a, xmm_b);
  cc.vpmovzxdq(xmm_a, xmm_b);
  cc.vpmovzxwd(xmm_a, xmm_b);
  cc.vpmovzxwq(xmm_a, xmm_b);
  cc.vpmuldq(xmm_a, xmm_b, xmm_c);
  cc.vpmulhrsw(xmm_a, xmm_b, xmm_c);
  cc.vpmulhuw(xmm_a, xmm_b, xmm_c);
  cc.vpmulhw(xmm_a, xmm_b, xmm_c);
  cc.vpmulld(xmm_a, xmm_b, xmm_c);
  cc.vpmullw(xmm_a, xmm_b, xmm_c);
  cc.vpmuludq(xmm_a, xmm_b, xmm_c);
  cc.vpor(xmm_a, xmm_b, xmm_c);
  cc.vpsadbw(xmm_a, xmm_b, xmm_c);
  cc.vpshufb(xmm_a, xmm_b, xmm_c);
  cc.vpshufd(xmm_a, xmm_b, 0);
  cc.vpshufhw(xmm_a, xmm_b, 0);
  cc.vpshuflw(xmm_a, xmm_b, 0);
  cc.vpsignb(xmm_a, xmm_b, xmm_c);
  cc.vpsignd(xmm_a, xmm_b, xmm_c);
  cc.vpsignw(xmm_a, xmm_b, xmm_c);
  cc.vpslld(xmm_a, xmm_b, xmm_c);
  cc.vpslld(xmm_a, xmm_b, 0);
  cc.vpslldq(xmm_a, xmm_b, 0);
  cc.vpsllq(xmm_a, xmm_b, xmm_c);
  cc.vpsllq(xmm_a, xmm_b, 0);
  cc.vpsllw(xmm_a, xmm_b, xmm_c);
  cc.vpsllw(xmm_a, xmm_b, 0);
  cc.vpsrad(xmm_a, xmm_b, xmm_c);
  cc.vpsrad(xmm_a, xmm_b, 0);
  cc.vpsraw(xmm_a, xmm_b, xmm_c);
  cc.vpsraw(xmm_a, xmm_b, 0);
  cc.vpsrld(xmm_a, xmm_b, xmm_c);
  cc.vpsrld(xmm_a, xmm_b, 0);
  cc.vpsrldq(xmm_a, xmm_b, 0);
  cc.vpsrlq(xmm_a, xmm_b, xmm_c);
  cc.vpsrlq(xmm_a, xmm_b, 0);
  cc.vpsrlw(xmm_a, xmm_b, xmm_c);
  cc.vpsrlw(xmm_a, xmm_b, 0);
  cc.vpsubb(xmm_a, xmm_b, xmm_c);
  cc.vpsubd(xmm_a, xmm_b, xmm_c);
  cc.vpsubq(xmm_a, xmm_b, xmm_c);
  cc.vpsubw(xmm_a, xmm_b, xmm_c);
  cc.vpsubsb(xmm_a, xmm_b, xmm_c);
  cc.vpsubsw(xmm_a, xmm_b, xmm_c);
  cc.vpsubusb(xmm_a, xmm_b, xmm_c);
  cc.vpsubusw(xmm_a, xmm_b, xmm_c);
  cc.vptest(xmm_a, xmm_b);
  cc.vptest(ymm_a, ymm_b);
  cc.vpunpckhbw(xmm_a, xmm_b, xmm_c);
  cc.vpunpckhdq(xmm_a, xmm_b, xmm_c);
  cc.vpunpckhqdq(xmm_a, xmm_b, xmm_c);
  cc.vpunpckhwd(xmm_a, xmm_b, xmm_c);
  cc.vpunpcklbw(xmm_a, xmm_b, xmm_c);
  cc.vpunpckldq(xmm_a, xmm_b, xmm_c);
  cc.vpunpcklqdq(xmm_a, xmm_b, xmm_c);
  cc.vpunpcklwd(xmm_a, xmm_b, xmm_c);
  cc.vpxor(xmm_a, xmm_b, xmm_c);
  cc.vrcpps(xmm_a, xmm_b);
  cc.vrcpps(ymm_a, ymm_b);
  cc.vrcpss(xmm_a, xmm_b, xmm_c);
  cc.vrsqrtps(xmm_a, xmm_b);
  cc.vrsqrtps(ymm_a, ymm_b);
  cc.vrsqrtss(xmm_a, xmm_b, xmm_c);
  cc.vroundpd(xmm_a, xmm_b, 0);
  cc.vroundpd(ymm_a, ymm_b, 0);
  cc.vroundps(xmm_a, xmm_b, 0);
  cc.vroundps(ymm_a, ymm_b, 0);
  cc.vroundsd(xmm_a, xmm_b, xmm_c, 0);
  cc.vroundss(xmm_a, xmm_b, xmm_c, 0);
  cc.vshufpd(xmm_a, xmm_b, xmm_c, 0);
  cc.vshufpd(ymm_a, ymm_b, ymm_c, 0);
  cc.vshufps(xmm_a, xmm_b, xmm_c, 0);
  cc.vshufps(ymm_a, ymm_b, ymm_c, 0);
  cc.vsqrtpd(xmm_a, xmm_b);
  cc.vsqrtpd(ymm_a, ymm_b);
  cc.vsqrtps(xmm_a, xmm_b);
  cc.vsqrtps(ymm_a, ymm_b);
  cc.vsqrtsd(xmm_a, xmm_b, xmm_c);
  cc.vsqrtss(xmm_a, xmm_b, xmm_c);
  cc.vsubpd(xmm_a, xmm_b, xmm_c);
  cc.vsubpd(ymm_a, ymm_b, ymm_c);
  cc.vsubps(xmm_a, xmm_b, xmm_c);
  cc.vsubps(ymm_a, ymm_b, ymm_c);
  cc.vsubsd(xmm_a, xmm_b, xmm_c);
  cc.vsubss(xmm_a, xmm_b, xmm_c);
  cc.vtestps(xmm_a, xmm_b);
  cc.vtestps(ymm_a, ymm_b);
  cc.vtestpd(xmm_a, xmm_b);
  cc.vtestpd(ymm_a, ymm_b);
  cc.vucomisd(xmm_a, xmm_b);
  cc.vucomiss(xmm_a, xmm_b);
  cc.vunpckhpd(xmm_a, xmm_b, xmm_c);
  cc.vunpckhpd(ymm_a, ymm_b, ymm_c);
  cc.vunpckhps(xmm_a, xmm_b, xmm_c);
  cc.vunpckhps(ymm_a, ymm_b, ymm_c);
  cc.vunpcklpd(xmm_a, xmm_b, xmm_c);
  cc.vunpcklpd(ymm_a, ymm_b, ymm_c);
  cc.vunpcklps(xmm_a, xmm_b, xmm_c);
  cc.vunpcklps(ymm_a, ymm_b, ymm_c);
  cc.vxorpd(xmm_a, xmm_b, xmm_c);
  cc.vxorpd(ymm_a, ymm_b, ymm_c);
  cc.vxorps(xmm_a, xmm_b, xmm_c);
  cc.vxorps(ymm_a, ymm_b, ymm_c);

  // AVX+AESNI.
  cc.vaesdec(xmm_a, xmm_b, xmm_c);
  cc.vaesdeclast(xmm_a, xmm_b, xmm_c);
  cc.vaesenc(xmm_a, xmm_b, xmm_c);
  cc.vaesenclast(xmm_a, xmm_b, xmm_c);
  cc.vaesimc(xmm_a, xmm_b);
  cc.vaeskeygenassist(xmm_a, xmm_b, 0);

  // AVX+PCLMULQDQ.
  cc.vpclmulqdq(xmm_a, xmm_b, xmm_c, 0);

  // AVX2.
  cc.vbroadcastsd(ymm_a, xmm_b);
  cc.vbroadcastss(xmm_a, xmm_b);
  cc.vbroadcastss(ymm_a, xmm_b);
  cc.vextracti128(xmm_a, ymm_b, 0);
  cc.vinserti128(ymm_a, ymm_b, xmm_c, 0);
  cc.vmpsadbw(ymm_a, ymm_b, ymm_c, 0);
  cc.vpabsb(ymm_a, ymm_b);
  cc.vpabsd(ymm_a, ymm_b);
  cc.vpabsw(ymm_a, ymm_b);
  cc.vpackssdw(ymm_a, ymm_b, ymm_c);
  cc.vpacksswb(ymm_a, ymm_b, ymm_c);
  cc.vpackusdw(ymm_a, ymm_b, ymm_c);
  cc.vpackuswb(ymm_a, ymm_b, ymm_c);
  cc.vpaddb(ymm_a, ymm_b, ymm_c);
  cc.vpaddd(ymm_a, ymm_b, ymm_c);
  cc.vpaddq(ymm_a, ymm_b, ymm_c);
  cc.vpaddw(ymm_a, ymm_b, ymm_c);
  cc.vpaddsb(ymm_a, ymm_b, ymm_c);
  cc.vpaddsw(ymm_a, ymm_b, ymm_c);
  cc.vpaddusb(ymm_a, ymm_b, ymm_c);
  cc.vpaddusw(ymm_a, ymm_b, ymm_c);
  cc.vpalignr(ymm_a, ymm_b, ymm_c, 0);
  cc.vpand(ymm_a, ymm_b, ymm_c);
  cc.vpandn(ymm_a, ymm_b, ymm_c);
  cc.vpavgb(ymm_a, ymm_b, ymm_c);
  cc.vpavgw(ymm_a, ymm_b, ymm_c);
  cc.vpblendd(xmm_a, xmm_b, xmm_c, 0);
  cc.vpblendd(ymm_a, ymm_b, ymm_c, 0);
  cc.vpblendvb(ymm_a, ymm_b, ymm_c, ymm_a);
  cc.vpblendw(ymm_a, ymm_b, ymm_c, 0);
  cc.vpbroadcastb(xmm_a, xmm_b);
  cc.vpbroadcastb(ymm_a, xmm_b);
  cc.vpbroadcastd(xmm_a, xmm_b);
  cc.vpbroadcastd(ymm_a, xmm_b);
  cc.vpbroadcastq(xmm_a, xmm_b);
  cc.vpbroadcastq(ymm_a, xmm_b);
  cc.vpbroadcastw(xmm_a, xmm_b);
  cc.vpbroadcastw(ymm_a, xmm_b);
  cc.vpcmpeqb(ymm_a, ymm_b, ymm_c);
  cc.vpcmpeqd(ymm_a, ymm_b, ymm_c);
  cc.vpcmpeqq(ymm_a, ymm_b, ymm_c);
  cc.vpcmpeqw(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtb(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtd(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtq(ymm_a, ymm_b, ymm_c);
  cc.vpcmpgtw(ymm_a, ymm_b, ymm_c);
  cc.vperm2i128(ymm_a, ymm_b, ymm_c, 0);
  cc.vpermd(ymm_a, ymm_b, ymm_c);
  cc.vpermps(ymm_a, ymm_b, ymm_c);
  cc.vpermpd(ymm_a, ymm_b, 0);
  cc.vpermq(ymm_a, ymm_b, 0);
  cc.vpmovmskb(gpd, ymm_b);
  cc.vpmovsxbd(ymm_a, xmm_b);
  cc.vpmovsxbq(ymm_a, xmm_b);
  cc.vpmovsxbw(ymm_a, xmm_b);
  cc.vpmovsxdq(ymm_a, xmm_b);
  cc.vpmovsxwd(ymm_a, xmm_b);
  cc.vpmovsxwq(ymm_a, xmm_b);
  cc.vpmovzxbd(ymm_a, xmm_b);
  cc.vpmovzxbq(ymm_a, xmm_b);
  cc.vpmovzxbw(ymm_a, xmm_b);
  cc.vpmovzxdq(ymm_a, xmm_b);
  cc.vpmovzxwd(ymm_a, xmm_b);
  cc.vpmovzxwq(ymm_a, xmm_b);
  cc.vpshufd(ymm_a, ymm_b, 0);
  cc.vpshufhw(ymm_a, ymm_b, 0);
  cc.vpshuflw(ymm_a, ymm_b, 0);
  cc.vpslld(ymm_a, ymm_b, 0);
  cc.vpslldq(ymm_a, ymm_b, 0);
  cc.vpsllq(ymm_a, ymm_b, 0);
  cc.vpsllw(ymm_a, ymm_b, 0);
  cc.vpsrad(ymm_a, ymm_b, 0);
  cc.vpsraw(ymm_a, ymm_b, 0);
  cc.vpsrld(ymm_a, ymm_b, 0);
  cc.vpsrldq(ymm_a, ymm_b, 0);
  cc.vpsrlq(ymm_a, ymm_b, 0);
  cc.vpsrlw(ymm_a, ymm_b, 0);
  cc.vphaddd(ymm_a, ymm_b, ymm_c);
  cc.vphaddsw(ymm_a, ymm_b, ymm_c);
  cc.vphaddw(ymm_a, ymm_b, ymm_c);
  cc.vphsubd(ymm_a, ymm_b, ymm_c);
  cc.vphsubsw(ymm_a, ymm_b, ymm_c);
  cc.vphsubw(ymm_a, ymm_b, ymm_c);
  cc.vpmaddubsw(ymm_a, ymm_b, ymm_c);
  cc.vpmaddwd(ymm_a, ymm_b, ymm_c);
  cc.vpmaxsb(ymm_a, ymm_b, ymm_c);
  cc.vpmaxsd(ymm_a, ymm_b, ymm_c);
  cc.vpmaxsw(ymm_a, ymm_b, ymm_c);
  cc.vpmaxub(ymm_a, ymm_b, ymm_c);
  cc.vpmaxud(ymm_a, ymm_b, ymm_c);
  cc.vpmaxuw(ymm_a, ymm_b, ymm_c);
  cc.vpminsb(ymm_a, ymm_b, ymm_c);
  cc.vpminsd(ymm_a, ymm_b, ymm_c);
  cc.vpminsw(ymm_a, ymm_b, ymm_c);
  cc.vpminub(ymm_a, ymm_b, ymm_c);
  cc.vpminud(ymm_a, ymm_b, ymm_c);
  cc.vpminuw(ymm_a, ymm_b, ymm_c);
  cc.vpmuldq(ymm_a, ymm_b, ymm_c);
  cc.vpmulhrsw(ymm_a, ymm_b, ymm_c);
  cc.vpmulhuw(ymm_a, ymm_b, ymm_c);
  cc.vpmulhw(ymm_a, ymm_b, ymm_c);
  cc.vpmulld(ymm_a, ymm_b, ymm_c);
  cc.vpmullw(ymm_a, ymm_b, ymm_c);
  cc.vpmuludq(ymm_a, ymm_b, ymm_c);
  cc.vpor(ymm_a, ymm_b, ymm_c);
  cc.vpsadbw(ymm_a, ymm_b, ymm_c);
  cc.vpshufb(ymm_a, ymm_b, ymm_c);
  cc.vpsignb(ymm_a, ymm_b, ymm_c);
  cc.vpsignd(ymm_a, ymm_b, ymm_c);
  cc.vpsignw(ymm_a, ymm_b, ymm_c);
  cc.vpslld(ymm_a, ymm_b, xmm_c);
  cc.vpsllq(ymm_a, ymm_b, xmm_c);
  cc.vpsllvd(xmm_a, xmm_b, xmm_c);
  cc.vpsllvd(ymm_a, ymm_b, ymm_c);
  cc.vpsllvq(xmm_a, xmm_b, xmm_c);
  cc.vpsllvq(ymm_a, ymm_b, ymm_c);
  cc.vpsllw(ymm_a, ymm_b, xmm_c);
  cc.vpsrad(ymm_a, ymm_b, xmm_c);
  cc.vpsravd(xmm_a, xmm_b, xmm_c);
  cc.vpsravd(ymm_a, ymm_b, ymm_c);
  cc.vpsraw(ymm_a, ymm_b, xmm_c);
  cc.vpsrld(ymm_a, ymm_b, xmm_c);
  cc.vpsrlq(ymm_a, ymm_b, xmm_c);
  cc.vpsrlvd(xmm_a, xmm_b, xmm_c);
  cc.vpsrlvd(ymm_a, ymm_b, ymm_c);
  cc.vpsrlvq(xmm_a, xmm_b, xmm_c);
  cc.vpsrlvq(ymm_a, ymm_b, ymm_c);
  cc.vpsrlw(ymm_a, ymm_b, xmm_c);
  cc.vpsubb(ymm_a, ymm_b, ymm_c);
  cc.vpsubd(ymm_a, ymm_b, ymm_c);
  cc.vpsubq(ymm_a, ymm_b, ymm_c);
  cc.vpsubsb(ymm_a, ymm_b, ymm_c);
  cc.vpsubsw(ymm_a, ymm_b, ymm_c);
  cc.vpsubusb(ymm_a, ymm_b, ymm_c);
  cc.vpsubusw(ymm_a, ymm_b, ymm_c);
  cc.vpsubw(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhbw(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhdq(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhqdq(ymm_a, ymm_b, ymm_c);
  cc.vpunpckhwd(ymm_a, ymm_b, ymm_c);
  cc.vpunpcklbw(ymm_a, ymm_b, ymm_c);
  cc.vpunpckldq(ymm_a, ymm_b, ymm_c);
  cc.vpunpcklqdq(ymm_a, ymm_b, ymm_c);
  cc.vpunpcklwd(ymm_a, ymm_b, ymm_c);
  cc.vpxor(ymm_a, ymm_b, ymm_c);

  // FMA.
  cc.vfmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.vfmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.vfmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmaddsub231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmaddsub231ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.vfmsub231ss(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd132pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd132pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd132ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd132ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd213pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd213pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd213ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd213ps(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd231pd(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd231pd(ymm_a, ymm_b, ymm_c);
  cc.vfmsubadd231ps(xmm_a, xmm_b, xmm_c);
  cc.vfmsubadd231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231ss(xmm_a, xmm_b, xmm_c);
}

// Generates a long sequence of AVX instructions using memory operands.
template<typename Emitter>
static void generate_avx_sequence_internal_reg_mem(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_c = vec_c.ymm();
  x86::Vec ymm_d = vec_d.ymm();

  x86::Mem m = x86::ptr(gpz);
  x86::Mem m128 = x86::xmmword_ptr(gpz);
  x86::Mem m256 = x86::ymmword_ptr(gpz);
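  // Memory operands with a vector index (VSIB), as used by gather instructions.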
  x86::Mem vx_ptr = x86::ptr(gpz, xmm_d);
  x86::Mem vy_ptr = x86::ptr(gpz, ymm_d);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  cc.vaddpd(xmm_a, xmm_b, m);
  cc.vaddpd(ymm_a, ymm_b, m);
  cc.vaddps(xmm_a, xmm_b, m);
  cc.vaddps(ymm_a, ymm_b, m);
  cc.vaddsd(xmm_a, xmm_b, m);
  cc.vaddss(xmm_a, xmm_b, m);
  cc.vaddsubpd(xmm_a, xmm_b, m);
  cc.vaddsubpd(ymm_a, ymm_b, m);
  cc.vaddsubps(xmm_a, xmm_b, m);
  cc.vaddsubps(ymm_a, ymm_b, m);
  cc.vandpd(xmm_a, xmm_b, m);
  cc.vandpd(ymm_a, ymm_b, m);
  cc.vandps(xmm_a, xmm_b, m);
  cc.vandps(ymm_a, ymm_b, m);
  cc.vandnpd(xmm_a, xmm_b, m);
  cc.vandnpd(ymm_a, ymm_b, m);
  cc.vandnps(xmm_a, xmm_b, m);
  cc.vandnps(ymm_a, ymm_b, m);
  cc.vblendpd(xmm_a, xmm_b, m, 0);
  cc.vblendpd(ymm_a, ymm_b, m, 0);
  cc.vblendps(xmm_a, xmm_b, m, 0);
  cc.vblendps(ymm_a, ymm_b, m, 0);
  cc.vblendvpd(xmm_a, xmm_b, m, xmm_a);
  cc.vblendvpd(ymm_a, ymm_b, m, ymm_a);
  cc.vbroadcastf128(ymm_a, m);
  cc.vbroadcastsd(ymm_a, m);
  cc.vbroadcastss(xmm_a, m);
  cc.vbroadcastss(ymm_a, m);
  cc.vcmppd(xmm_a, xmm_b, m, 0);
  cc.vcmppd(ymm_a, ymm_b, m, 0);
  cc.vcmpps(xmm_a, xmm_b, m, 0);
  cc.vcmpps(ymm_a, ymm_b, m, 0);
  cc.vcmpsd(xmm_a, xmm_b, m, 0);
  cc.vcmpss(xmm_a, xmm_b, m, 0);
  cc.vcomisd(xmm_a, m);
  cc.vcomiss(xmm_a, m);
  cc.vcvtdq2pd(xmm_a, m);
  cc.vcvtdq2pd(ymm_a, m);
  cc.vcvtdq2ps(xmm_a, m);
  cc.vcvtdq2ps(ymm_a, m);
  cc.vcvtpd2dq(xmm_a, m128);
  cc.vcvtpd2dq(xmm_a, m256);
  cc.vcvtpd2ps(xmm_a, m128);
  cc.vcvtpd2ps(xmm_a, m256);
  cc.vcvtps2dq(xmm_a, m);
  cc.vcvtps2dq(ymm_a, m);
  cc.vcvtps2pd(xmm_a, m);
  cc.vcvtps2pd(ymm_a, m);
  cc.vcvtsd2si(gpd, m);
  cc.vcvtsd2ss(xmm_a, xmm_b, m);
  cc.vcvtsi2sd(xmm_a, xmm_b, m);
  cc.vcvtsi2ss(xmm_a, xmm_b, m);
  cc.vcvtss2sd(xmm_a, xmm_b, m);
  cc.vcvtss2si(gpd, m);
  cc.vcvttpd2dq(xmm_a, m128);
  cc.vcvttpd2dq(xmm_a, m256);
  cc.vcvttps2dq(xmm_a, m);
  cc.vcvttps2dq(ymm_a, m);
  cc.vcvttsd2si(gpd, m);
  cc.vcvttss2si(gpd, m);
  cc.vdivpd(xmm_a, xmm_b, m);
  cc.vdivpd(ymm_a, ymm_b, m);
  cc.vdivps(xmm_a, xmm_b, m);
  cc.vdivps(ymm_a, ymm_b, m);
  cc.vdivsd(xmm_a, xmm_b, m);
  cc.vdivss(xmm_a, xmm_b, m);
  cc.vdppd(xmm_a, xmm_b, m, 0);
  cc.vdpps(xmm_a, xmm_b, m, 0);
  cc.vdpps(ymm_a, ymm_b, m, 0);
  cc.vextractf128(m, ymm_b, 0);
  cc.vextractps(m, xmm_b, 0);
  cc.vhaddpd(xmm_a, xmm_b, m);
  cc.vhaddpd(ymm_a, ymm_b, m);
  cc.vhaddps(xmm_a, xmm_b, m);
  cc.vhaddps(ymm_a, ymm_b, m);
  cc.vhsubpd(xmm_a, xmm_b, m);
  cc.vhsubpd(ymm_a, ymm_b, m);
  cc.vhsubps(xmm_a, xmm_b, m);
  cc.vhsubps(ymm_a, ymm_b, m);
  cc.vinsertf128(ymm_a, ymm_b, m, 0);
  cc.vinsertps(xmm_a, xmm_b, m, 0);
  cc.vlddqu(xmm_a, m);
  cc.vlddqu(ymm_a, m);
  cc.vmaskmovps(xmm_a, xmm_b, m);
  cc.vmaskmovps(ymm_a, ymm_b, m);
  cc.vmaskmovps(m, xmm_b, xmm_c);
  cc.vmaskmovps(m, ymm_b, ymm_c);
  cc.vmaskmovpd(xmm_a, xmm_b, m);
  cc.vmaskmovpd(ymm_a, ymm_b, m);
  cc.vmaskmovpd(m, xmm_b, xmm_c);
  cc.vmaskmovpd(m, ymm_b, ymm_c);
  cc.vmaxpd(xmm_a, xmm_b, m);
  cc.vmaxpd(ymm_a, ymm_b, m);
  cc.vmaxps(xmm_a, xmm_b, m);
  cc.vmaxps(ymm_a, ymm_b, m);
  cc.vmaxsd(xmm_a, xmm_b, m);
  cc.vmaxss(xmm_a, xmm_b, m);
  cc.vminpd(xmm_a, xmm_b, m);
  cc.vminpd(ymm_a, ymm_b, m);
  cc.vminps(xmm_a, xmm_b, m);
  cc.vminps(ymm_a, ymm_b, m);
  cc.vminsd(xmm_a, xmm_b, m);
  cc.vminss(xmm_a, xmm_b, m);
  cc.vmovapd(xmm_a, m);
  cc.vmovapd(m, xmm_b);
  cc.vmovapd(ymm_a, m);
  cc.vmovapd(m, ymm_b);
  cc.vmovaps(xmm_a, m);
  cc.vmovaps(m, xmm_b);
  cc.vmovaps(ymm_a, m);
  cc.vmovaps(m, ymm_b);
  cc.vmovd(xmm_a, m);
  cc.vmovd(m, xmm_b);
  cc.vmovddup(xmm_a, m);
  cc.vmovddup(ymm_a, m);
  cc.vmovdqa(xmm_a, m);
  cc.vmovdqa(m, xmm_b);
  cc.vmovdqa(ymm_a, m);
  cc.vmovdqa(m, ymm_b);
  cc.vmovdqu(xmm_a, m);
  cc.vmovdqu(m, xmm_b);
  cc.vmovdqu(ymm_a, m);
  cc.vmovdqu(m, ymm_b);
  cc.vmovhpd(xmm_a, xmm_b, m);
  cc.vmovhps(xmm_a, xmm_b, m);
  cc.vmovhps(m, xmm_b);
  cc.vmovlpd(xmm_a, xmm_b, m);
  cc.vmovlpd(m, xmm_b);
  cc.vmovlps(xmm_a, xmm_b, m);
  cc.vmovlps(m, xmm_b);
  cc.vmovntdq(m, xmm_b);
  cc.vmovntdq(m, ymm_b);
  cc.vmovntdqa(xmm_a, m);
  cc.vmovntpd(m, xmm_b);
  cc.vmovntpd(m, ymm_b);
  cc.vmovntps(m, xmm_b);
  cc.vmovntps(m, ymm_b);
  cc.vmovsd(xmm_a, m);
  cc.vmovsd(m, xmm_b);
  cc.vmovshdup(xmm_a, m);
  cc.vmovshdup(ymm_a, m);
  cc.vmovsldup(xmm_a, m);
  cc.vmovsldup(ymm_a, m);
  cc.vmovss(xmm_a, m);
  cc.vmovss(m, xmm_b);
  cc.vmovupd(xmm_a, m);
  cc.vmovupd(m, xmm_b);
  cc.vmovupd(ymm_a, m);
  cc.vmovupd(m, ymm_b);
  cc.vmovups(xmm_a, m);
  cc.vmovups(m, xmm_b);
  cc.vmovups(ymm_a, m);
  cc.vmovups(m, ymm_b);
  cc.vmpsadbw(xmm_a, xmm_b, m, 0);
  cc.vmulpd(xmm_a, xmm_b, m);
  cc.vmulpd(ymm_a, ymm_b, m);
  cc.vmulps(xmm_a, xmm_b, m);
  cc.vmulps(ymm_a, ymm_b, m);
  cc.vmulsd(xmm_a, xmm_b, m);
  cc.vmulss(xmm_a, xmm_b, m);
  cc.vorpd(xmm_a, xmm_b, m);
  cc.vorpd(ymm_a, ymm_b, m);
  cc.vorps(xmm_a, xmm_b, m);
  cc.vorps(ymm_a, ymm_b, m);
  cc.vpabsb(xmm_a, m);
  cc.vpabsd(xmm_a, m);
  cc.vpabsw(xmm_a, m);
  cc.vpackssdw(xmm_a, xmm_b, m);
  cc.vpacksswb(xmm_a, xmm_b, m);
  cc.vpackusdw(xmm_a, xmm_b, m);
  cc.vpackuswb(xmm_a, xmm_b, m);
  cc.vpaddb(xmm_a, xmm_b, m);
  cc.vpaddd(xmm_a, xmm_b, m);
  cc.vpaddq(xmm_a, xmm_b, m);
  cc.vpaddw(xmm_a, xmm_b, m);
  cc.vpaddsb(xmm_a, xmm_b, m);
  cc.vpaddsw(xmm_a, xmm_b, m);
  cc.vpaddusb(xmm_a, xmm_b, m);
  cc.vpaddusw(xmm_a, xmm_b, m);
  cc.vpalignr(xmm_a, xmm_b, m, 0);
  cc.vpand(xmm_a, xmm_b, m);
  cc.vpandn(xmm_a, xmm_b, m);
  cc.vpavgb(xmm_a, xmm_b, m);
  cc.vpavgw(xmm_a, xmm_b, m);
  cc.vpblendvb(xmm_a, xmm_b, m, xmm_a);
  cc.vpblendw(xmm_a, xmm_b, m, 0);
  cc.vpcmpeqb(xmm_a, xmm_b, m);
  cc.vpcmpeqd(xmm_a, xmm_b, m);
  cc.vpcmpeqq(xmm_a, xmm_b, m);
  cc.vpcmpeqw(xmm_a, xmm_b, m);
  cc.vpcmpgtb(xmm_a, xmm_b, m);
  cc.vpcmpgtd(xmm_a, xmm_b, m);
  cc.vpcmpgtq(xmm_a, xmm_b, m);
  cc.vpcmpgtw(xmm_a, xmm_b, m);
  cc.vpermilpd(xmm_a, xmm_b, m);
  cc.vpermilpd(ymm_a, ymm_b, m);
  cc.vpermilpd(xmm_a, m, 0);
  cc.vpermilpd(ymm_a, m, 0);
  cc.vpermilps(xmm_a, xmm_b, m);
  cc.vpermilps(ymm_a, ymm_b, m);
  cc.vpermilps(xmm_a, m, 0);
  cc.vpermilps(ymm_a, m, 0);
  cc.vperm2f128(ymm_a, ymm_b, m, 0);
  cc.vpextrb(m, xmm_b, 0);
  cc.vpextrd(m, xmm_b, 0);
  if (cc.is_64bit()) cc.vpextrq(m, xmm_b, 0);
  cc.vpextrw(m, xmm_b, 0);
  cc.vphaddd(xmm_a, xmm_b, m);
  cc.vphaddsw(xmm_a, xmm_b, m);
  cc.vphaddw(xmm_a, xmm_b, m);
  cc.vphminposuw(xmm_a, m);
  cc.vphsubd(xmm_a, xmm_b, m);
  cc.vphsubsw(xmm_a, xmm_b, m);
  cc.vphsubw(xmm_a, xmm_b, m);
  cc.vpinsrb(xmm_a, xmm_b, m, 0);
  cc.vpinsrd(xmm_a, xmm_b, m, 0);
  cc.vpinsrw(xmm_a, xmm_b, m, 0);
  cc.vpmaddubsw(xmm_a, xmm_b, m);
  cc.vpmaddwd(xmm_a, xmm_b, m);
  cc.vpmaxsb(xmm_a, xmm_b, m);
  cc.vpmaxsd(xmm_a, xmm_b, m);
  cc.vpmaxsw(xmm_a, xmm_b, m);
  cc.vpmaxub(xmm_a, xmm_b, m);
  cc.vpmaxud(xmm_a, xmm_b, m);
  cc.vpmaxuw(xmm_a, xmm_b, m);
  cc.vpminsb(xmm_a, xmm_b, m);
  cc.vpminsd(xmm_a, xmm_b, m);
  cc.vpminsw(xmm_a, xmm_b, m);
  cc.vpminub(xmm_a, xmm_b, m);
  cc.vpminud(xmm_a, xmm_b, m);
  cc.vpminuw(xmm_a, xmm_b, m);
  cc.vpmovsxbd(xmm_a, m);
  cc.vpmovsxbq(xmm_a, m);
  cc.vpmovsxbw(xmm_a, m);
  cc.vpmovsxdq(xmm_a, m);
  cc.vpmovsxwd(xmm_a, m);
  cc.vpmovsxwq(xmm_a, m);
  cc.vpmovzxbd(xmm_a, m);
  cc.vpmovzxbq(xmm_a, m);
  cc.vpmovzxbw(xmm_a, m);
  cc.vpmovzxdq(xmm_a, m);
  cc.vpmovzxwd(xmm_a, m);
  cc.vpmovzxwq(xmm_a, m);
  cc.vpmuldq(xmm_a, xmm_b, m);
  cc.vpmulhrsw(xmm_a, xmm_b, m);
  cc.vpmulhuw(xmm_a, xmm_b, m);
  cc.vpmulhw(xmm_a, xmm_b, m);
  cc.vpmulld(xmm_a, xmm_b, m);
  cc.vpmullw(xmm_a, xmm_b, m);
  cc.vpmuludq(xmm_a, xmm_b, m);
  cc.vpor(xmm_a, xmm_b, m);
  cc.vpsadbw(xmm_a, xmm_b, m);
  cc.vpshufb(xmm_a, xmm_b, m);
  cc.vpshufd(xmm_a, m, 0);
  cc.vpshufhw(xmm_a, m, 0);
  cc.vpshuflw(xmm_a, m, 0);
  cc.vpsignb(xmm_a, xmm_b, m);
  cc.vpsignd(xmm_a, xmm_b, m);
  cc.vpsignw(xmm_a, xmm_b, m);
  cc.vpslld(xmm_a, xmm_b, m);
  cc.vpsllq(xmm_a, xmm_b, m);
  cc.vpsllw(xmm_a, xmm_b, m);
  cc.vpsrad(xmm_a, xmm_b, m);
  cc.vpsraw(xmm_a, xmm_b, m);
  cc.vpsrld(xmm_a, xmm_b, m);
  cc.vpsrlq(xmm_a, xmm_b, m);
  cc.vpsrlw(xmm_a, xmm_b, m);
  cc.vpsubb(xmm_a, xmm_b, m);
  cc.vpsubd(xmm_a, xmm_b, m);
  cc.vpsubq(xmm_a, xmm_b, m);
  cc.vpsubw(xmm_a, xmm_b, m);
  cc.vpsubsb(xmm_a, xmm_b, m);
  cc.vpsubsw(xmm_a, xmm_b, m);
  cc.vpsubusb(xmm_a, xmm_b, m);
  cc.vpsubusw(xmm_a, xmm_b, m);
  cc.vptest(xmm_a, m);
  cc.vptest(ymm_a, m);
  cc.vpunpckhbw(xmm_a, xmm_b, m);
  cc.vpunpckhdq(xmm_a, xmm_b, m);
  cc.vpunpckhqdq(xmm_a, xmm_b, m);
  cc.vpunpckhwd(xmm_a, xmm_b, m);
  cc.vpunpcklbw(xmm_a, xmm_b, m);
  cc.vpunpckldq(xmm_a, xmm_b, m);
  cc.vpunpcklqdq(xmm_a, xmm_b, m);
  cc.vpunpcklwd(xmm_a, xmm_b, m);
  cc.vpxor(xmm_a, xmm_b, m);
  cc.vrcpps(xmm_a, m);
  cc.vrcpps(ymm_a, m);
cc.vrcpss(xmm_a, xmm_b, m);
|
|
cc.vrsqrtps(xmm_a, m);
|
|
cc.vrsqrtps(ymm_a, m);
|
|
cc.vrsqrtss(xmm_a, xmm_b, m);
|
|
cc.vroundpd(xmm_a, m, 0);
|
|
cc.vroundpd(ymm_a, m, 0);
|
|
cc.vroundps(xmm_a, m, 0);
|
|
cc.vroundps(ymm_a, m, 0);
|
|
cc.vroundsd(xmm_a, xmm_b, m, 0);
|
|
cc.vroundss(xmm_a, xmm_b, m, 0);
|
|
cc.vshufpd(xmm_a, xmm_b, m, 0);
|
|
cc.vshufpd(ymm_a, ymm_b, m, 0);
|
|
cc.vshufps(xmm_a, xmm_b, m, 0);
|
|
cc.vshufps(ymm_a, ymm_b, m, 0);
|
|
cc.vsqrtpd(xmm_a, m);
|
|
cc.vsqrtpd(ymm_a, m);
|
|
cc.vsqrtps(xmm_a, m);
|
|
cc.vsqrtps(ymm_a, m);
|
|
cc.vsqrtsd(xmm_a, xmm_b, m);
|
|
cc.vsqrtss(xmm_a, xmm_b, m);
|
|
cc.vsubpd(xmm_a, xmm_b, m);
|
|
cc.vsubpd(ymm_a, ymm_b, m);
|
|
cc.vsubps(xmm_a, xmm_b, m);
|
|
cc.vsubps(ymm_a, ymm_b, m);
|
|
cc.vsubsd(xmm_a, xmm_b, m);
|
|
cc.vsubss(xmm_a, xmm_b, m);
|
|
cc.vtestps(xmm_a, m);
|
|
cc.vtestps(ymm_a, m);
|
|
cc.vtestpd(xmm_a, m);
|
|
cc.vtestpd(ymm_a, m);
|
|
cc.vucomisd(xmm_a, m);
|
|
cc.vucomiss(xmm_a, m);
|
|
cc.vunpckhpd(xmm_a, xmm_b, m);
|
|
cc.vunpckhpd(ymm_a, ymm_b, m);
|
|
cc.vunpckhps(xmm_a, xmm_b, m);
|
|
cc.vunpckhps(ymm_a, ymm_b, m);
|
|
cc.vunpcklpd(xmm_a, xmm_b, m);
|
|
cc.vunpcklpd(ymm_a, ymm_b, m);
|
|
cc.vunpcklps(xmm_a, xmm_b, m);
|
|
cc.vunpcklps(ymm_a, ymm_b, m);
|
|
cc.vxorpd(xmm_a, xmm_b, m);
|
|
cc.vxorpd(ymm_a, ymm_b, m);
|
|
cc.vxorps(xmm_a, xmm_b, m);
|
|
cc.vxorps(ymm_a, ymm_b, m);
|
|
|
|
  // AVX+AESNI.
  cc.vaesdec(xmm_a, xmm_b, m);
  cc.vaesdeclast(xmm_a, xmm_b, m);
  cc.vaesenc(xmm_a, xmm_b, m);
  cc.vaesenclast(xmm_a, xmm_b, m);
  cc.vaesimc(xmm_a, m);
  cc.vaeskeygenassist(xmm_a, m, 0);

  // AVX+PCLMULQDQ.
  cc.vpclmulqdq(xmm_a, xmm_b, m, 0);

  // AVX2.
  cc.vbroadcasti128(ymm_a, m);
  cc.vextracti128(m, ymm_b, 0);
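  // The gather forms below use VSIB addressing; vx_ptr and vy_ptr are the
  // xmm-indexed and ymm-indexed memory operands set up earlier for this
  // sequence.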
  cc.vgatherdpd(xmm_a, vx_ptr, xmm_c);
  cc.vgatherdpd(ymm_a, vx_ptr, ymm_c);
  cc.vgatherdps(xmm_a, vx_ptr, xmm_c);
  cc.vgatherdps(ymm_a, vy_ptr, ymm_c);
  cc.vgatherqpd(xmm_a, vx_ptr, xmm_c);
  cc.vgatherqpd(ymm_a, vy_ptr, ymm_c);
  cc.vgatherqps(xmm_a, vx_ptr, xmm_c);
  cc.vgatherqps(xmm_a, vy_ptr, xmm_c);
  cc.vinserti128(ymm_a, ymm_b, m, 0);
  cc.vmovntdqa(ymm_a, m);
  cc.vmpsadbw(ymm_a, ymm_b, m, 0);
  cc.vpabsb(ymm_a, m);
  cc.vpabsd(ymm_a, m);
  cc.vpabsw(ymm_a, m);
  cc.vpackssdw(ymm_a, ymm_b, m);
  cc.vpacksswb(ymm_a, ymm_b, m);
  cc.vpackusdw(ymm_a, ymm_b, m);
  cc.vpackuswb(ymm_a, ymm_b, m);
  cc.vpaddb(ymm_a, ymm_b, m);
  cc.vpaddd(ymm_a, ymm_b, m);
  cc.vpaddq(ymm_a, ymm_b, m);
  cc.vpaddw(ymm_a, ymm_b, m);
  cc.vpaddsb(ymm_a, ymm_b, m);
  cc.vpaddsw(ymm_a, ymm_b, m);
  cc.vpaddusb(ymm_a, ymm_b, m);
  cc.vpaddusw(ymm_a, ymm_b, m);
  cc.vpalignr(ymm_a, ymm_b, m, 0);
  cc.vpand(ymm_a, ymm_b, m);
  cc.vpandn(ymm_a, ymm_b, m);
  cc.vpavgb(ymm_a, ymm_b, m);
  cc.vpavgw(ymm_a, ymm_b, m);
  cc.vpblendd(xmm_a, xmm_b, m, 0);
  cc.vpblendd(ymm_a, ymm_b, m, 0);
  cc.vpblendvb(ymm_a, ymm_b, m, ymm_a);
  cc.vpblendw(ymm_a, ymm_b, m, 0);
  cc.vpbroadcastb(xmm_a, m);
  cc.vpbroadcastb(ymm_a, m);
  cc.vpbroadcastd(xmm_a, m);
  cc.vpbroadcastd(ymm_a, m);
  cc.vpbroadcastq(xmm_a, m);
  cc.vpbroadcastq(ymm_a, m);
  cc.vpbroadcastw(xmm_a, m);
  cc.vpbroadcastw(ymm_a, m);
  cc.vpcmpeqb(ymm_a, ymm_b, m);
  cc.vpcmpeqd(ymm_a, ymm_b, m);
  cc.vpcmpeqq(ymm_a, ymm_b, m);
  cc.vpcmpeqw(ymm_a, ymm_b, m);
  cc.vpcmpgtb(ymm_a, ymm_b, m);
  cc.vpcmpgtd(ymm_a, ymm_b, m);
  cc.vpcmpgtq(ymm_a, ymm_b, m);
  cc.vpcmpgtw(ymm_a, ymm_b, m);
  cc.vperm2i128(ymm_a, ymm_b, m, 0);
  cc.vpermd(ymm_a, ymm_b, m);
  cc.vpermps(ymm_a, ymm_b, m);
  cc.vpermpd(ymm_a, m, 0);
  cc.vpermq(ymm_a, m, 0);
  cc.vpgatherdd(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherdd(ymm_a, vy_ptr, ymm_c);
  cc.vpgatherdq(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherdq(ymm_a, vx_ptr, ymm_c);
  cc.vpgatherqd(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherqd(xmm_a, vy_ptr, xmm_c);
  cc.vpgatherqq(xmm_a, vx_ptr, xmm_c);
  cc.vpgatherqq(ymm_a, vy_ptr, ymm_c);
  cc.vpmovsxbd(ymm_a, m);
  cc.vpmovsxbq(ymm_a, m);
  cc.vpmovsxbw(ymm_a, m);
  cc.vpmovsxdq(ymm_a, m);
  cc.vpmovsxwd(ymm_a, m);
  cc.vpmovsxwq(ymm_a, m);
  cc.vpmovzxbd(ymm_a, m);
  cc.vpmovzxbq(ymm_a, m);
  cc.vpmovzxbw(ymm_a, m);
  cc.vpmovzxdq(ymm_a, m);
  cc.vpmovzxwd(ymm_a, m);
  cc.vpmovzxwq(ymm_a, m);
  cc.vpshufd(ymm_a, m, 0);
  cc.vpshufhw(ymm_a, m, 0);
  cc.vpshuflw(ymm_a, m, 0);
  cc.vphaddd(ymm_a, ymm_b, m);
  cc.vphaddsw(ymm_a, ymm_b, m);
  cc.vphaddw(ymm_a, ymm_b, m);
  cc.vphsubd(ymm_a, ymm_b, m);
  cc.vphsubsw(ymm_a, ymm_b, m);
  cc.vphsubw(ymm_a, ymm_b, m);
  cc.vpmaddubsw(ymm_a, ymm_b, m);
  cc.vpmaddwd(ymm_a, ymm_b, m);
  cc.vpmaskmovd(m, xmm_b, xmm_c);
  cc.vpmaskmovd(m, ymm_b, ymm_c);
  cc.vpmaskmovd(xmm_a, xmm_b, m);
  cc.vpmaskmovd(ymm_a, ymm_b, m);
  cc.vpmaskmovq(m, xmm_b, xmm_c);
  cc.vpmaskmovq(m, ymm_b, ymm_c);
  cc.vpmaskmovq(xmm_a, xmm_b, m);
  cc.vpmaskmovq(ymm_a, ymm_b, m);
  cc.vpmaxsb(ymm_a, ymm_b, m);
  cc.vpmaxsd(ymm_a, ymm_b, m);
  cc.vpmaxsw(ymm_a, ymm_b, m);
  cc.vpmaxub(ymm_a, ymm_b, m);
  cc.vpmaxud(ymm_a, ymm_b, m);
  cc.vpmaxuw(ymm_a, ymm_b, m);
  cc.vpminsb(ymm_a, ymm_b, m);
  cc.vpminsd(ymm_a, ymm_b, m);
  cc.vpminsw(ymm_a, ymm_b, m);
  cc.vpminub(ymm_a, ymm_b, m);
  cc.vpminud(ymm_a, ymm_b, m);
  cc.vpminuw(ymm_a, ymm_b, m);
  cc.vpmuldq(ymm_a, ymm_b, m);
  cc.vpmulhrsw(ymm_a, ymm_b, m);
  cc.vpmulhuw(ymm_a, ymm_b, m);
  cc.vpmulhw(ymm_a, ymm_b, m);
  cc.vpmulld(ymm_a, ymm_b, m);
  cc.vpmullw(ymm_a, ymm_b, m);
  cc.vpmuludq(ymm_a, ymm_b, m);
  cc.vpor(ymm_a, ymm_b, m);
  cc.vpsadbw(ymm_a, ymm_b, m);
  cc.vpshufb(ymm_a, ymm_b, m);
  cc.vpsignb(ymm_a, ymm_b, m);
  cc.vpsignd(ymm_a, ymm_b, m);
  cc.vpsignw(ymm_a, ymm_b, m);
  cc.vpslld(ymm_a, ymm_b, m);
  cc.vpsllq(ymm_a, ymm_b, m);
  cc.vpsllvd(xmm_a, xmm_b, m);
  cc.vpsllvd(ymm_a, ymm_b, m);
  cc.vpsllvq(xmm_a, xmm_b, m);
  cc.vpsllvq(ymm_a, ymm_b, m);
  cc.vpsllw(ymm_a, ymm_b, m);
  cc.vpsrad(ymm_a, ymm_b, m);
  cc.vpsravd(xmm_a, xmm_b, m);
  cc.vpsravd(ymm_a, ymm_b, m);
  cc.vpsraw(ymm_a, ymm_b, m);
  cc.vpsrld(ymm_a, ymm_b, m);
  cc.vpsrlq(ymm_a, ymm_b, m);
  cc.vpsrlvd(xmm_a, xmm_b, m);
  cc.vpsrlvd(ymm_a, ymm_b, m);
  cc.vpsrlvq(xmm_a, xmm_b, m);
  cc.vpsrlvq(ymm_a, ymm_b, m);
  cc.vpsrlw(ymm_a, ymm_b, m);
  cc.vpsubb(ymm_a, ymm_b, m);
  cc.vpsubd(ymm_a, ymm_b, m);
  cc.vpsubq(ymm_a, ymm_b, m);
  cc.vpsubsb(ymm_a, ymm_b, m);
  cc.vpsubsw(ymm_a, ymm_b, m);
  cc.vpsubusb(ymm_a, ymm_b, m);
  cc.vpsubusw(ymm_a, ymm_b, m);
  cc.vpsubw(ymm_a, ymm_b, m);
  cc.vpunpckhbw(ymm_a, ymm_b, m);
  cc.vpunpckhdq(ymm_a, ymm_b, m);
  cc.vpunpckhqdq(ymm_a, ymm_b, m);
  cc.vpunpckhwd(ymm_a, ymm_b, m);
  cc.vpunpcklbw(ymm_a, ymm_b, m);
  cc.vpunpckldq(ymm_a, ymm_b, m);
  cc.vpunpcklqdq(ymm_a, ymm_b, m);
  cc.vpunpcklwd(ymm_a, ymm_b, m);
  cc.vpxor(ymm_a, ymm_b, m);
}

// Generates a long sequence of AVX instructions.
template<typename Emitter>
static void generate_avx_sequenceInternal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  if (form == InstForm::kReg)
    generate_avx_sequenceInternalRegOnly(cc, gp, vec_a, vec_b, vec_c, vec_d);
  else
    generate_avx_sequenceInternalRegMem(cc, gp, vec_a, vec_b, vec_c, vec_d);
}

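// Emits the AVX sequence through whichever emitter interface is active:
// Compiler (virtual registers), or Builder/Assembler (physical registers,
// optionally wrapped in a prolog/epilog computed from FuncFrame).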
static void generate_avx_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.new_gpz("gp");
    x86::Vec a = cc.new_ymm("a");
    x86::Vec b = cc.new_ymm("b");
    x86::Vec c = cc.new_ymm("c");
    x86::Vec d = cc.new_ymm("d");

    cc.add_func(FuncSignature::build<void>());
    generate_avx_sequenceInternal(cc, form, gp, a, b, c, d);
    cc.end_func();

    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, ymm0, ymm1, ymm2, ymm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
    }

    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, ymm0, ymm1, ymm2, ymm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
    }

    return;
  }
}

// Generates a long sequence of AVX512 instructions.
template<typename Emitter>
static void generate_avx512_sequence_internal_reg_only(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_c = vec_c.ymm();

  x86::Vec zmm_a = vec_a.zmm();
  x86::Vec zmm_b = vec_b.zmm();
  x86::Vec zmm_c = vec_c.zmm();

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

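  // Mask (k) register arithmetic, logic, moves, shifts, and tests.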
  cc.kaddb(kA, kB, kC);
  cc.kaddd(kA, kB, kC);
  cc.kaddq(kA, kB, kC);
  cc.kaddw(kA, kB, kC);
  cc.kandb(kA, kB, kC);
  cc.kandd(kA, kB, kC);
  cc.kandnb(kA, kB, kC);
  cc.kandnd(kA, kB, kC);
  cc.kandnq(kA, kB, kC);
  cc.kandnw(kA, kB, kC);
  cc.kandq(kA, kB, kC);
  cc.kandw(kA, kB, kC);
  cc.kmovb(kA, kB);
  cc.kmovb(kA, gpd);
  cc.kmovb(gpd, kB);
  cc.kmovd(kA, kB);
  cc.kmovd(kA, gpd);
  cc.kmovd(gpd, kB);
  cc.kmovq(kA, kB);
  if (cc.is_64bit()) cc.kmovq(kA, gpq);
  if (cc.is_64bit()) cc.kmovq(gpq, kB);
  cc.kmovw(kA, kB);
  cc.kmovw(kA, gpd);
  cc.kmovw(gpd, kB);
  cc.knotb(kA, kB);
  cc.knotd(kA, kB);
  cc.knotq(kA, kB);
  cc.knotw(kA, kB);
  cc.korb(kA, kB, kC);
  cc.kord(kA, kB, kC);
  cc.korq(kA, kB, kC);
  cc.kortestb(kA, kB);
  cc.kortestd(kA, kB);
  cc.kortestq(kA, kB);
  cc.kortestw(kA, kB);
  cc.korw(kA, kB, kC);
  cc.kshiftlb(kA, kB, 0);
  cc.kshiftld(kA, kB, 0);
  cc.kshiftlq(kA, kB, 0);
  cc.kshiftlw(kA, kB, 0);
  cc.kshiftrb(kA, kB, 0);
  cc.kshiftrd(kA, kB, 0);
  cc.kshiftrq(kA, kB, 0);
  cc.kshiftrw(kA, kB, 0);
  cc.ktestb(kA, kB);
  cc.ktestd(kA, kB);
  cc.ktestq(kA, kB);
  cc.ktestw(kA, kB);
  cc.kunpckbw(kA, kB, kC);
  cc.kunpckdq(kA, kB, kC);
  cc.kunpckwd(kA, kB, kC);
  cc.kxnorb(kA, kB, kC);
  cc.kxnord(kA, kB, kC);
  cc.kxnorq(kA, kB, kC);
  cc.kxnorw(kA, kB, kC);
  cc.kxorb(kA, kB, kC);
  cc.kxord(kA, kB, kC);
  cc.kxorq(kA, kB, kC);
  cc.kxorw(kA, kB, kC);
  cc.nop();

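  // The evex() modifier forces EVEX encoding for instructions that would
  // otherwise be encodable with a shorter VEX prefix.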
  cc.evex().vaddpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vaddpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vaddpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vaddps(xmm_a, xmm_b, xmm_c);
  cc.evex().vaddps(ymm_a, ymm_b, ymm_c);
  cc.evex().vaddps(zmm_a, zmm_b, zmm_c);
  cc.evex().vaddsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vaddss(xmm_a, xmm_b, xmm_c);
  cc.evex().valignd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().valignd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().valignd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().valignq(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().valignq(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().valignq(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vandnpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vandnpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vandnpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vandnps(xmm_a, xmm_b, xmm_c);
  cc.evex().vandnps(ymm_a, ymm_b, ymm_c);
  cc.evex().vandnps(zmm_a, zmm_b, zmm_c);
  cc.evex().vandpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vandpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vandpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vandps(xmm_a, xmm_b, xmm_c);
  cc.evex().vandps(ymm_a, ymm_b, ymm_c);
  cc.evex().vandps(zmm_a, zmm_b, zmm_c);
  cc.evex().vblendmpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vblendmpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vblendmpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vblendmps(xmm_a, xmm_b, xmm_c);
  cc.evex().vblendmps(ymm_a, ymm_b, ymm_c);
  cc.evex().vblendmps(zmm_a, zmm_b, zmm_c);
  cc.evex().vbroadcastf32x2(ymm_a, xmm_b);
  cc.evex().vbroadcastf32x2(zmm_a, xmm_b);
  cc.evex().vbroadcasti32x2(xmm_a, xmm_b);
  cc.evex().vbroadcasti32x2(ymm_a, xmm_b);
  cc.evex().vbroadcasti32x2(zmm_a, xmm_b);
  cc.evex().vbroadcastsd(ymm_a, xmm_b);
  cc.evex().vbroadcastsd(zmm_a, xmm_b);
  cc.evex().vbroadcastss(xmm_a, xmm_b);
  cc.evex().vbroadcastss(ymm_a, xmm_b);
  cc.evex().vbroadcastss(zmm_a, xmm_b);
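  // Unlike their VEX forms, EVEX-encoded compares write to mask registers.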
  cc.evex().vcmppd(kA, xmm_b, xmm_c, 0);
  cc.evex().vcmppd(kA, ymm_b, ymm_c, 0);
  cc.evex().vcmppd(kA, zmm_b, zmm_c, 0);
  cc.evex().vcmpps(kA, xmm_b, xmm_c, 0);
  cc.evex().vcmpps(kA, ymm_b, ymm_c, 0);
  cc.evex().vcmpps(kA, zmm_b, zmm_c, 0);
  cc.evex().vcmpsd(kA, xmm_b, xmm_c, 0);
  cc.evex().vcmpss(kA, xmm_b, xmm_c, 0);
  cc.evex().vcomisd(xmm_a, xmm_b);
  cc.evex().vcomiss(xmm_a, xmm_b);
  cc.evex().vcompresspd(xmm_a, xmm_b);
  cc.evex().vcompresspd(ymm_a, ymm_b);
  cc.evex().vcompresspd(zmm_a, zmm_b);
  cc.evex().vcompressps(xmm_a, xmm_b);
  cc.evex().vcompressps(ymm_a, ymm_b);
  cc.evex().vcompressps(zmm_a, zmm_b);
  cc.evex().vcvtdq2pd(xmm_a, xmm_b);
  cc.evex().vcvtdq2pd(ymm_a, xmm_b);
  cc.evex().vcvtdq2pd(zmm_a, ymm_b);
  cc.evex().vcvtdq2ps(xmm_a, xmm_b);
  cc.evex().vcvtdq2ps(ymm_a, ymm_b);
  cc.evex().vcvtdq2ps(zmm_a, zmm_b);
  cc.evex().vcvtpd2dq(xmm_a, xmm_b);
  cc.evex().vcvtpd2dq(xmm_a, ymm_b);
  cc.evex().vcvtpd2dq(ymm_a, zmm_b);
  cc.evex().vcvtpd2qq(xmm_a, xmm_b);
  cc.evex().vcvtpd2qq(ymm_a, ymm_b);
  cc.evex().vcvtpd2qq(zmm_a, zmm_b);
  cc.evex().vcvtpd2udq(xmm_a, xmm_b);
  cc.evex().vcvtpd2udq(xmm_a, ymm_b);
  cc.evex().vcvtpd2udq(ymm_a, zmm_b);
  cc.evex().vcvtpd2uqq(xmm_a, xmm_b);
  cc.evex().vcvtpd2uqq(ymm_a, ymm_b);
  cc.evex().vcvtpd2uqq(zmm_a, zmm_b);
  cc.evex().vcvtph2ps(xmm_a, xmm_b);
  cc.evex().vcvtph2ps(ymm_a, xmm_b);
  cc.evex().vcvtph2ps(zmm_a, ymm_b);
  cc.evex().vcvtps2dq(xmm_a, xmm_b);
  cc.evex().vcvtps2dq(ymm_a, ymm_b);
  cc.evex().vcvtps2dq(zmm_a, zmm_b);
  cc.evex().vcvtps2pd(xmm_a, xmm_b);
  cc.evex().vcvtps2pd(ymm_a, xmm_b);
  cc.evex().vcvtps2pd(zmm_a, ymm_b);
  cc.evex().vcvtps2ph(xmm_a, xmm_b, 0);
  cc.evex().vcvtps2ph(xmm_a, ymm_b, 0);
  cc.evex().vcvtps2ph(ymm_a, zmm_b, 0);
  cc.evex().vcvtps2qq(xmm_a, xmm_b);
  cc.evex().vcvtps2qq(ymm_a, xmm_b);
  cc.evex().vcvtps2qq(zmm_a, ymm_b);
  cc.evex().vcvtps2udq(xmm_a, xmm_b);
  cc.evex().vcvtps2udq(ymm_a, ymm_b);
  cc.evex().vcvtps2udq(zmm_a, zmm_b);
  cc.evex().vcvtps2uqq(xmm_a, xmm_b);
  cc.evex().vcvtps2uqq(ymm_a, xmm_b);
  cc.evex().vcvtps2uqq(zmm_a, ymm_b);
  cc.evex().vcvtqq2pd(xmm_a, xmm_b);
  cc.evex().vcvtqq2pd(ymm_a, ymm_b);
  cc.evex().vcvtqq2pd(zmm_a, zmm_b);
  cc.evex().vcvtqq2ps(xmm_a, xmm_b);
  cc.evex().vcvtqq2ps(xmm_a, ymm_b);
  cc.evex().vcvtqq2ps(ymm_a, zmm_b);
  cc.evex().vcvtsd2si(gpd, xmm_b);
  cc.evex().vcvtsd2si(gpz, xmm_b);
  cc.evex().vcvtsd2ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vcvtsd2usi(gpd, xmm_b);
  cc.evex().vcvtsd2usi(gpz, xmm_b);
  cc.evex().vcvtsi2sd(xmm_a, xmm_b, gpd);
  cc.evex().vcvtsi2sd(xmm_a, xmm_b, gpz);
  cc.evex().vcvtsi2ss(xmm_a, xmm_b, gpd);
  cc.evex().vcvtsi2ss(xmm_a, xmm_b, gpz);
  cc.evex().vcvtss2sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vcvtss2si(gpd, xmm_b);
  cc.evex().vcvtss2si(gpz, xmm_b);
  cc.evex().vcvtss2usi(gpd, xmm_b);
  cc.evex().vcvtss2usi(gpz, xmm_b);
  cc.evex().vcvttpd2dq(xmm_a, xmm_b);
  cc.evex().vcvttpd2dq(xmm_a, ymm_b);
  cc.evex().vcvttpd2dq(ymm_a, zmm_b);
  cc.evex().vcvttpd2qq(xmm_a, xmm_b);
  cc.evex().vcvttpd2qq(ymm_a, ymm_b);
  cc.evex().vcvttpd2qq(zmm_a, zmm_b);
  cc.evex().vcvttpd2udq(xmm_a, xmm_b);
  cc.evex().vcvttpd2udq(xmm_a, ymm_b);
  cc.evex().vcvttpd2udq(ymm_a, zmm_b);
  cc.evex().vcvttpd2uqq(xmm_a, xmm_b);
  cc.evex().vcvttpd2uqq(ymm_a, ymm_b);
  cc.evex().vcvttpd2uqq(zmm_a, zmm_b);
  cc.evex().vcvttps2dq(xmm_a, xmm_b);
  cc.evex().vcvttps2dq(ymm_a, ymm_b);
  cc.evex().vcvttps2dq(zmm_a, zmm_b);
  cc.evex().vcvttps2qq(xmm_a, xmm_b);
  cc.evex().vcvttps2qq(ymm_a, xmm_b);
  cc.evex().vcvttps2qq(zmm_a, ymm_b);
  cc.evex().vcvttps2udq(xmm_a, xmm_b);
  cc.evex().vcvttps2udq(ymm_a, ymm_b);
  cc.evex().vcvttps2udq(zmm_a, zmm_b);
  cc.evex().vcvttps2uqq(xmm_a, xmm_b);
  cc.evex().vcvttps2uqq(ymm_a, xmm_b);
  cc.evex().vcvttps2uqq(zmm_a, ymm_b);
  cc.evex().vcvttsd2si(gpd, xmm_b);
  cc.evex().vcvttsd2si(gpz, xmm_b);
  cc.evex().vcvttsd2usi(gpd, xmm_b);
  cc.evex().vcvttsd2usi(gpz, xmm_b);
  cc.evex().vcvttss2si(gpd, xmm_b);
  cc.evex().vcvttss2si(gpz, xmm_b);
  cc.evex().vcvttss2usi(gpd, xmm_b);
  cc.evex().vcvttss2usi(gpz, xmm_b);
  cc.evex().vcvtudq2pd(xmm_a, xmm_b);
  cc.evex().vcvtudq2pd(ymm_a, xmm_b);
  cc.evex().vcvtudq2pd(zmm_a, ymm_b);
  cc.evex().vcvtudq2ps(xmm_a, xmm_b);
  cc.evex().vcvtudq2ps(ymm_a, ymm_b);
  cc.evex().vcvtudq2ps(zmm_a, zmm_b);
  cc.evex().vcvtuqq2pd(xmm_a, xmm_b);
  cc.evex().vcvtuqq2pd(ymm_a, ymm_b);
  cc.evex().vcvtuqq2pd(zmm_a, zmm_b);
  cc.evex().vcvtuqq2ps(xmm_a, xmm_b);
  cc.evex().vcvtuqq2ps(xmm_a, ymm_b);
  cc.evex().vcvtuqq2ps(ymm_a, zmm_b);
  cc.evex().vcvtusi2sd(xmm_a, xmm_b, gpd);
  cc.evex().vcvtusi2sd(xmm_a, xmm_b, gpz);
  cc.evex().vcvtusi2ss(xmm_a, xmm_b, gpd);
  cc.evex().vcvtusi2ss(xmm_a, xmm_b, gpz);
  cc.evex().vdbpsadbw(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vdbpsadbw(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vdbpsadbw(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vdivpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vdivpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vdivpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vdivps(xmm_a, xmm_b, xmm_c);
  cc.evex().vdivps(ymm_a, ymm_b, ymm_c);
  cc.evex().vdivps(zmm_a, zmm_b, zmm_c);
  cc.evex().vdivsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vdivss(xmm_a, xmm_b, xmm_c);
  cc.evex().vexpandpd(xmm_a, xmm_b);
  cc.evex().vexpandpd(ymm_a, ymm_b);
  cc.evex().vexpandpd(zmm_a, zmm_b);
  cc.evex().vexpandps(xmm_a, xmm_b);
  cc.evex().vexpandps(ymm_a, ymm_b);
  cc.evex().vexpandps(zmm_a, zmm_b);
  cc.evex().vextractf32x4(xmm_a, ymm_b, 0);
  cc.evex().vextractf32x4(xmm_a, zmm_b, 0);
  cc.evex().vextractf32x8(ymm_a, zmm_b, 0);
  cc.evex().vextractf64x2(xmm_a, ymm_b, 0);
  cc.evex().vextractf64x2(xmm_a, zmm_b, 0);
  cc.evex().vextractf64x4(ymm_a, zmm_b, 0);
  cc.evex().vextracti32x4(xmm_a, ymm_b, 0);
  cc.evex().vextracti32x4(xmm_a, zmm_b, 0);
  cc.evex().vextracti32x8(ymm_a, zmm_b, 0);
  cc.evex().vextracti64x2(xmm_a, ymm_b, 0);
  cc.evex().vextracti64x2(xmm_a, zmm_b, 0);
  cc.evex().vextracti64x4(ymm_a, zmm_b, 0);
  cc.evex().vextractps(gpd, xmm_b, 0);
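  // vfixupimm* rewrite special-case inputs (NaN, zero, infinity) according
  // to a lookup table supplied in the third source operand and the immediate.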
  cc.evex().vfixupimmpd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vfixupimmpd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vfixupimmpd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vfixupimmps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vfixupimmps(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vfixupimmps(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vfixupimmsd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vfixupimmss(xmm_a, xmm_b, xmm_c, 0);
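  // FMA: the 132/213/231 suffix selects which of the three operands are
  // multiplied and which one is accumulated.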
  cc.evex().vfmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmadd231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmaddsub231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmaddsub231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmaddsub231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsub231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsub231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfmsubadd231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfmsubadd231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfmsubadd231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmadd231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmadd231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub132pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub132ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub132ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub132sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub213pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub213ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub213ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub213sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub231pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub231ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vfnmsub231ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vfnmsub231sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vfnmsub231ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vfpclasspd(kA, xmm_b, 0);
  cc.evex().vfpclasspd(kA, ymm_b, 0);
  cc.evex().vfpclasspd(kA, zmm_b, 0);
  cc.evex().vfpclassps(kA, xmm_b, 0);
  cc.evex().vfpclassps(kA, ymm_b, 0);
  cc.evex().vfpclassps(kA, zmm_b, 0);
  cc.evex().vfpclasssd(kA, xmm_b, 0);
  cc.evex().vfpclassss(kA, xmm_b, 0);
  cc.evex().vgetexppd(xmm_a, xmm_b);
  cc.evex().vgetexppd(ymm_a, ymm_b);
  cc.evex().vgetexppd(zmm_a, zmm_b);
  cc.evex().vgetexpps(xmm_a, xmm_b);
  cc.evex().vgetexpps(ymm_a, ymm_b);
  cc.evex().vgetexpps(zmm_a, zmm_b);
  cc.evex().vgetexpsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vgetexpss(xmm_a, xmm_b, xmm_c);
  cc.evex().vgetmantpd(xmm_a, xmm_b, 0);
  cc.evex().vgetmantpd(ymm_a, ymm_b, 0);
  cc.evex().vgetmantpd(zmm_a, zmm_b, 0);
  cc.evex().vgetmantps(xmm_a, xmm_b, 0);
  cc.evex().vgetmantps(ymm_a, ymm_b, 0);
  cc.evex().vgetmantps(zmm_a, zmm_b, 0);
  cc.evex().vgetmantsd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vgetmantss(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vinsertf32x4(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinsertf32x4(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinsertf32x8(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinsertf64x2(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinsertf64x2(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinsertf64x4(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinserti32x4(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinserti32x4(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinserti32x8(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinserti64x2(ymm_a, ymm_b, xmm_c, 0);
  cc.evex().vinserti64x2(zmm_a, zmm_b, xmm_c, 0);
  cc.evex().vinserti64x4(zmm_a, zmm_b, ymm_c, 0);
  cc.evex().vinsertps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vmaxpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmaxpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vmaxpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vmaxps(xmm_a, xmm_b, xmm_c);
  cc.evex().vmaxps(ymm_a, ymm_b, ymm_c);
  cc.evex().vmaxps(zmm_a, zmm_b, zmm_c);
  cc.evex().vmaxsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmaxss(xmm_a, xmm_b, xmm_c);
  cc.evex().vminpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vminpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vminpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vminps(xmm_a, xmm_b, xmm_c);
  cc.evex().vminps(ymm_a, ymm_b, ymm_c);
  cc.evex().vminps(zmm_a, zmm_b, zmm_c);
  cc.evex().vminsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vminss(xmm_a, xmm_b, xmm_c);
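  // Plain register moves; each is emitted twice, presumably mirroring the
  // load/store pairs exercised by the memory-form generator.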
  cc.evex().vmovapd(xmm_a, xmm_b);
  cc.evex().vmovapd(xmm_a, xmm_b);
  cc.evex().vmovapd(ymm_a, ymm_b);
  cc.evex().vmovapd(ymm_a, ymm_b);
  cc.evex().vmovapd(zmm_a, zmm_b);
  cc.evex().vmovapd(zmm_a, zmm_b);
  cc.evex().vmovaps(xmm_a, xmm_b);
  cc.evex().vmovaps(xmm_a, xmm_b);
  cc.evex().vmovaps(ymm_a, ymm_b);
  cc.evex().vmovaps(ymm_a, ymm_b);
  cc.evex().vmovaps(zmm_a, zmm_b);
  cc.evex().vmovaps(zmm_a, zmm_b);
  cc.evex().vmovd(gpd, xmm_b);
  cc.evex().vmovd(xmm_a, gpd);
  cc.evex().vmovddup(xmm_a, xmm_b);
  cc.evex().vmovddup(ymm_a, ymm_b);
  cc.evex().vmovddup(zmm_a, zmm_b);
  cc.evex().vmovdqa32(xmm_a, xmm_b);
  cc.evex().vmovdqa32(xmm_a, xmm_b);
  cc.evex().vmovdqa32(ymm_a, ymm_b);
  cc.evex().vmovdqa32(ymm_a, ymm_b);
  cc.evex().vmovdqa32(zmm_a, zmm_b);
  cc.evex().vmovdqa32(zmm_a, zmm_b);
  cc.evex().vmovdqa64(xmm_a, xmm_b);
  cc.evex().vmovdqa64(xmm_a, xmm_b);
  cc.evex().vmovdqa64(ymm_a, ymm_b);
  cc.evex().vmovdqa64(ymm_a, ymm_b);
  cc.evex().vmovdqa64(zmm_a, zmm_b);
  cc.evex().vmovdqa64(zmm_a, zmm_b);
  cc.evex().vmovdqu16(xmm_a, xmm_b);
  cc.evex().vmovdqu16(xmm_a, xmm_b);
  cc.evex().vmovdqu16(ymm_a, ymm_b);
  cc.evex().vmovdqu16(ymm_a, ymm_b);
  cc.evex().vmovdqu16(zmm_a, zmm_b);
  cc.evex().vmovdqu16(zmm_a, zmm_b);
  cc.evex().vmovdqu32(xmm_a, xmm_b);
  cc.evex().vmovdqu32(xmm_a, xmm_b);
  cc.evex().vmovdqu32(ymm_a, ymm_b);
  cc.evex().vmovdqu32(ymm_a, ymm_b);
  cc.evex().vmovdqu32(zmm_a, zmm_b);
  cc.evex().vmovdqu32(zmm_a, zmm_b);
  cc.evex().vmovdqu64(xmm_a, xmm_b);
  cc.evex().vmovdqu64(xmm_a, xmm_b);
  cc.evex().vmovdqu64(ymm_a, ymm_b);
  cc.evex().vmovdqu64(ymm_a, ymm_b);
  cc.evex().vmovdqu64(zmm_a, zmm_b);
  cc.evex().vmovdqu64(zmm_a, zmm_b);
  cc.evex().vmovdqu8(xmm_a, xmm_b);
  cc.evex().vmovdqu8(xmm_a, xmm_b);
  cc.evex().vmovdqu8(ymm_a, ymm_b);
  cc.evex().vmovdqu8(ymm_a, ymm_b);
  cc.evex().vmovdqu8(zmm_a, zmm_b);
  cc.evex().vmovdqu8(zmm_a, zmm_b);
  cc.evex().vmovhlps(xmm_a, xmm_b, xmm_c);
  if (cc.is_64bit()) cc.evex().vmovq(gpq, xmm_b);
  if (cc.is_64bit()) cc.evex().vmovq(xmm_a, gpq);
  cc.evex().vmovq(xmm_a, xmm_b);
  cc.evex().vmovsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmovshdup(xmm_a, xmm_b);
  cc.evex().vmovshdup(ymm_a, ymm_b);
  cc.evex().vmovshdup(zmm_a, zmm_b);
  cc.evex().vmovsldup(xmm_a, xmm_b);
  cc.evex().vmovsldup(ymm_a, ymm_b);
  cc.evex().vmovsldup(zmm_a, zmm_b);
  cc.evex().vmovss(xmm_a, xmm_b, xmm_c);
  cc.evex().vmovupd(xmm_a, xmm_b);
  cc.evex().vmovupd(xmm_a, xmm_b);
  cc.evex().vmovupd(ymm_a, ymm_b);
  cc.evex().vmovupd(ymm_a, ymm_b);
  cc.evex().vmovupd(zmm_a, zmm_b);
  cc.evex().vmovupd(zmm_a, zmm_b);
  cc.evex().vmovups(xmm_a, xmm_b);
  cc.evex().vmovups(xmm_a, xmm_b);
  cc.evex().vmovups(ymm_a, ymm_b);
  cc.evex().vmovups(ymm_a, ymm_b);
  cc.evex().vmovups(zmm_a, zmm_b);
  cc.evex().vmovups(zmm_a, zmm_b);
  cc.evex().vmulpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmulpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vmulpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vmulps(xmm_a, xmm_b, xmm_c);
  cc.evex().vmulps(ymm_a, ymm_b, ymm_c);
  cc.evex().vmulps(zmm_a, zmm_b, zmm_c);
  cc.evex().vmulsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vmulss(xmm_a, xmm_b, xmm_c);
  cc.evex().vorpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vorpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vorpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vorps(xmm_a, xmm_b, xmm_c);
  cc.evex().vorps(ymm_a, ymm_b, ymm_c);
  cc.evex().vorps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpabsb(xmm_a, xmm_b);
  cc.evex().vpabsb(ymm_a, ymm_b);
  cc.evex().vpabsb(zmm_a, zmm_b);
  cc.evex().vpabsd(xmm_a, xmm_b);
  cc.evex().vpabsd(ymm_a, ymm_b);
  cc.evex().vpabsd(zmm_a, zmm_b);
  cc.evex().vpabsq(xmm_a, xmm_b);
  cc.evex().vpabsq(ymm_a, ymm_b);
  cc.evex().vpabsq(zmm_a, zmm_b);
  cc.evex().vpabsw(xmm_a, xmm_b);
  cc.evex().vpabsw(ymm_a, ymm_b);
  cc.evex().vpabsw(zmm_a, zmm_b);
  cc.evex().vpackssdw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpackssdw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpackssdw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpacksswb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpacksswb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpacksswb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpackusdw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpackusdw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpackusdw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpackuswb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpackuswb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpackuswb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddusb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddusb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddusb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddusw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddusw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddusw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpaddw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpaddw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpaddw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpalignr(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vpalignr(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vpalignr(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vpandd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpandnd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandnd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandnd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpandnq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandnq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandnq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpandq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpandq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpandq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpavgb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpavgb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpavgb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpavgw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpavgw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpavgw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpblendmw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpblendmw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpblendmw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpbroadcastb(xmm_a, gpd);
  cc.evex().vpbroadcastb(xmm_a, xmm_b);
  cc.evex().vpbroadcastb(ymm_a, gpd);
  cc.evex().vpbroadcastb(ymm_a, xmm_b);
  cc.evex().vpbroadcastb(zmm_a, gpd);
  cc.evex().vpbroadcastb(zmm_a, xmm_b);
  cc.evex().vpbroadcastd(xmm_a, gpd);
  cc.evex().vpbroadcastd(xmm_a, xmm_b);
  cc.evex().vpbroadcastd(ymm_a, gpd);
  cc.evex().vpbroadcastd(ymm_a, xmm_b);
  cc.evex().vpbroadcastd(zmm_a, gpd);
  cc.evex().vpbroadcastd(zmm_a, xmm_b);
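  // AVX-512 CD broadcasts of mask-register contents into vector elements.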
  cc.evex().vpbroadcastmb2q(xmm_a, kB);
  cc.evex().vpbroadcastmb2q(ymm_a, kB);
  cc.evex().vpbroadcastmb2q(zmm_a, kB);
  cc.evex().vpbroadcastmw2d(xmm_a, kB);
  cc.evex().vpbroadcastmw2d(ymm_a, kB);
  cc.evex().vpbroadcastmw2d(zmm_a, kB);
  if (cc.is_64bit()) cc.evex().vpbroadcastq(xmm_a, gpq);
  cc.evex().vpbroadcastq(xmm_a, xmm_b);
  if (cc.is_64bit()) cc.evex().vpbroadcastq(ymm_a, gpq);
  cc.evex().vpbroadcastq(ymm_a, xmm_b);
  if (cc.is_64bit()) cc.evex().vpbroadcastq(zmm_a, gpq);
  cc.evex().vpbroadcastq(zmm_a, xmm_b);
  cc.evex().vpbroadcastw(xmm_a, gpd);
  cc.evex().vpbroadcastw(xmm_a, xmm_b);
  cc.evex().vpbroadcastw(ymm_a, gpd);
  cc.evex().vpbroadcastw(ymm_a, xmm_b);
  cc.evex().vpbroadcastw(zmm_a, gpd);
  cc.evex().vpbroadcastw(zmm_a, xmm_b);
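  // EVEX integer compares: vpcmp[u]* take a predicate immediate and, like
  // the floating-point forms above, produce mask registers.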
  cc.evex().vpcmpb(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpb(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpb(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpd(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpd(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpd(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpeqb(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqb(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqb(kA, zmm_b, zmm_c);
  cc.evex().vpcmpeqd(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqd(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqd(kA, zmm_b, zmm_c);
  cc.evex().vpcmpeqq(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqq(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqq(kA, zmm_b, zmm_c);
  cc.evex().vpcmpeqw(kA, xmm_b, xmm_c);
  cc.evex().vpcmpeqw(kA, ymm_b, ymm_c);
  cc.evex().vpcmpeqw(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtb(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtb(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtb(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtd(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtd(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtd(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtq(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtq(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtq(kA, zmm_b, zmm_c);
  cc.evex().vpcmpgtw(kA, xmm_b, xmm_c);
  cc.evex().vpcmpgtw(kA, ymm_b, ymm_c);
  cc.evex().vpcmpgtw(kA, zmm_b, zmm_c);
  cc.evex().vpcmpq(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpq(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpq(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpub(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpub(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpub(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpud(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpud(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpud(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpuq(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpuq(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpuq(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpuw(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpuw(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpuw(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcmpw(kA, xmm_b, xmm_c, 0);
  cc.evex().vpcmpw(kA, ymm_b, ymm_c, 0);
  cc.evex().vpcmpw(kA, zmm_b, zmm_c, 0);
  cc.evex().vpcompressd(xmm_a, xmm_b);
  cc.evex().vpcompressd(ymm_a, ymm_b);
  cc.evex().vpcompressd(zmm_a, zmm_b);
  cc.evex().vpcompressq(xmm_a, xmm_b);
  cc.evex().vpcompressq(ymm_a, ymm_b);
  cc.evex().vpcompressq(zmm_a, zmm_b);
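  // AVX-512 CD conflict detection.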
  cc.evex().vpconflictd(xmm_a, xmm_b);
  cc.evex().vpconflictd(ymm_a, ymm_b);
  cc.evex().vpconflictd(zmm_a, zmm_b);
  cc.evex().vpconflictq(xmm_a, xmm_b);
  cc.evex().vpconflictq(ymm_a, ymm_b);
  cc.evex().vpconflictq(zmm_a, zmm_b);
  cc.evex().vpermb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermd(zmm_a, zmm_b, zmm_c);
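  // Two-table permutes: vpermi2* overwrite the index operand with the
  // result, vpermt2* overwrite the first table operand.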
  cc.evex().vpermi2b(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2b(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2b(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2d(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2d(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2d(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2q(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2q(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2q(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermi2w(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermi2w(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermi2w(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermilpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermilpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermilpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermilpd(xmm_a, xmm_b, 0);
  cc.evex().vpermilpd(ymm_a, ymm_b, 0);
  cc.evex().vpermilpd(zmm_a, zmm_b, 0);
  cc.evex().vpermilps(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermilps(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermilps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermilps(xmm_a, xmm_b, 0);
  cc.evex().vpermilps(ymm_a, ymm_b, 0);
  cc.evex().vpermilps(zmm_a, zmm_b, 0);
  cc.evex().vpermq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermq(ymm_a, ymm_b, 0);
  cc.evex().vpermq(zmm_a, zmm_b, 0);
  cc.evex().vpermt2b(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2b(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2b(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2d(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2d(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2d(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2pd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2pd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2pd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2ps(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2ps(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2ps(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2q(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2q(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2q(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermt2w(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermt2w(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermt2w(zmm_a, zmm_b, zmm_c);
  cc.evex().vpermw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpermw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpermw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpexpandd(xmm_a, xmm_b);
  cc.evex().vpexpandd(ymm_a, ymm_b);
  cc.evex().vpexpandd(zmm_a, zmm_b);
  cc.evex().vpexpandq(xmm_a, xmm_b);
  cc.evex().vpexpandq(ymm_a, ymm_b);
  cc.evex().vpexpandq(zmm_a, zmm_b);
  cc.evex().vpextrb(gpd, xmm_b, 0);
  cc.evex().vpextrd(gpd, xmm_b, 0);
  if (cc.is_64bit()) cc.evex().vpextrq(gpq, xmm_b, 0);
  cc.evex().vpextrw(gpd, xmm_b, 0);
  cc.evex().vpinsrb(xmm_a, xmm_b, gpd, 0);
  cc.evex().vpinsrd(xmm_a, xmm_b, gpd, 0);
  if (cc.is_64bit()) cc.evex().vpinsrq(xmm_a, xmm_b, gpq, 0);
  cc.evex().vpinsrw(xmm_a, xmm_b, gpd, 0);
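  // AVX-512 CD leading-zero counts.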
  cc.evex().vplzcntd(xmm_a, xmm_b);
  cc.evex().vplzcntd(ymm_a, ymm_b);
  cc.evex().vplzcntd(zmm_a, zmm_b);
  cc.evex().vplzcntq(xmm_a, xmm_b);
  cc.evex().vplzcntq(ymm_a, ymm_b);
  cc.evex().vplzcntq(zmm_a, zmm_b);
  cc.evex().vpmadd52huq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmadd52huq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmadd52huq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmadd52luq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmadd52luq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmadd52luq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaddubsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaddubsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaddubsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaddwd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaddwd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaddwd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxub(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxub(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxub(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxud(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxud(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxud(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxuq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxuq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxuq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmaxuw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmaxuw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmaxuw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminub(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminub(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminub(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminud(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminud(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminud(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminuq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminuq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminuq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpminuw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpminuw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpminuw(zmm_a, zmm_b, zmm_c);
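  // Narrowing vpmov* moves: truncating, saturating (vpmovs*), and
  // unsigned-saturating (vpmovus*), plus mask<->vector moves (vpmov*2m,
  // vpmovm2*).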
  cc.evex().vpmovb2m(kA, xmm_b);
  cc.evex().vpmovb2m(kA, ymm_b);
  cc.evex().vpmovb2m(kA, zmm_b);
  cc.evex().vpmovd2m(kA, xmm_b);
  cc.evex().vpmovd2m(kA, ymm_b);
  cc.evex().vpmovd2m(kA, zmm_b);
  cc.evex().vpmovdb(xmm_a, xmm_b);
  cc.evex().vpmovdb(xmm_a, ymm_b);
  cc.evex().vpmovdb(xmm_a, zmm_b);
  cc.evex().vpmovdw(xmm_a, xmm_b);
  cc.evex().vpmovdw(xmm_a, ymm_b);
  cc.evex().vpmovdw(ymm_a, zmm_b);
  cc.evex().vpmovm2b(xmm_a, kB);
  cc.evex().vpmovm2b(ymm_a, kB);
  cc.evex().vpmovm2b(zmm_a, kB);
  cc.evex().vpmovm2d(xmm_a, kB);
  cc.evex().vpmovm2d(ymm_a, kB);
  cc.evex().vpmovm2d(zmm_a, kB);
  cc.evex().vpmovm2q(xmm_a, kB);
  cc.evex().vpmovm2q(ymm_a, kB);
  cc.evex().vpmovm2q(zmm_a, kB);
  cc.evex().vpmovm2w(xmm_a, kB);
  cc.evex().vpmovm2w(ymm_a, kB);
  cc.evex().vpmovm2w(zmm_a, kB);
  cc.evex().vpmovq2m(kA, xmm_b);
  cc.evex().vpmovq2m(kA, ymm_b);
  cc.evex().vpmovq2m(kA, zmm_b);
  cc.evex().vpmovqb(xmm_a, xmm_b);
  cc.evex().vpmovqb(xmm_a, ymm_b);
  cc.evex().vpmovqb(xmm_a, zmm_b);
  cc.evex().vpmovqd(xmm_a, xmm_b);
  cc.evex().vpmovqd(xmm_a, ymm_b);
  cc.evex().vpmovqd(ymm_a, zmm_b);
  cc.evex().vpmovqw(xmm_a, xmm_b);
  cc.evex().vpmovqw(xmm_a, ymm_b);
  cc.evex().vpmovqw(xmm_a, zmm_b);
  cc.evex().vpmovsdb(xmm_a, xmm_b);
  cc.evex().vpmovsdb(xmm_a, ymm_b);
  cc.evex().vpmovsdb(xmm_a, zmm_b);
  cc.evex().vpmovsdw(xmm_a, xmm_b);
  cc.evex().vpmovsdw(xmm_a, ymm_b);
  cc.evex().vpmovsdw(ymm_a, zmm_b);
  cc.evex().vpmovsqb(xmm_a, xmm_b);
  cc.evex().vpmovsqb(xmm_a, ymm_b);
  cc.evex().vpmovsqb(xmm_a, zmm_b);
  cc.evex().vpmovsqd(xmm_a, xmm_b);
  cc.evex().vpmovsqd(xmm_a, ymm_b);
  cc.evex().vpmovsqd(ymm_a, zmm_b);
  cc.evex().vpmovsqw(xmm_a, xmm_b);
  cc.evex().vpmovsqw(xmm_a, ymm_b);
  cc.evex().vpmovsqw(xmm_a, zmm_b);
  cc.evex().vpmovswb(xmm_a, xmm_b);
  cc.evex().vpmovswb(xmm_a, ymm_b);
  cc.evex().vpmovswb(ymm_a, zmm_b);
  cc.evex().vpmovsxbd(xmm_a, xmm_b);
  cc.evex().vpmovsxbd(ymm_a, xmm_b);
  cc.evex().vpmovsxbd(zmm_a, xmm_b);
  cc.evex().vpmovsxbq(xmm_a, xmm_b);
  cc.evex().vpmovsxbq(ymm_a, xmm_b);
  cc.evex().vpmovsxbq(zmm_a, xmm_b);
  cc.evex().vpmovsxbw(xmm_a, xmm_b);
  cc.evex().vpmovsxbw(ymm_a, xmm_b);
  cc.evex().vpmovsxbw(zmm_a, ymm_b);
  cc.evex().vpmovsxdq(xmm_a, xmm_b);
  cc.evex().vpmovsxdq(ymm_a, xmm_b);
  cc.evex().vpmovsxdq(zmm_a, ymm_b);
  cc.evex().vpmovsxwd(xmm_a, xmm_b);
  cc.evex().vpmovsxwd(ymm_a, xmm_b);
  cc.evex().vpmovsxwd(zmm_a, ymm_b);
  cc.evex().vpmovsxwq(xmm_a, xmm_b);
  cc.evex().vpmovsxwq(ymm_a, xmm_b);
  cc.evex().vpmovsxwq(zmm_a, xmm_b);
  cc.evex().vpmovusdb(xmm_a, xmm_b);
  cc.evex().vpmovusdb(xmm_a, ymm_b);
  cc.evex().vpmovusdb(xmm_a, zmm_b);
  cc.evex().vpmovusdw(xmm_a, xmm_b);
  cc.evex().vpmovusdw(xmm_a, ymm_b);
  cc.evex().vpmovusdw(ymm_a, zmm_b);
  cc.evex().vpmovusqb(xmm_a, xmm_b);
  cc.evex().vpmovusqb(xmm_a, ymm_b);
  cc.evex().vpmovusqb(xmm_a, zmm_b);
  cc.evex().vpmovusqd(xmm_a, xmm_b);
  cc.evex().vpmovusqd(xmm_a, ymm_b);
  cc.evex().vpmovusqd(ymm_a, zmm_b);
  cc.evex().vpmovusqw(xmm_a, xmm_b);
  cc.evex().vpmovusqw(xmm_a, ymm_b);
  cc.evex().vpmovusqw(xmm_a, zmm_b);
  cc.evex().vpmovuswb(xmm_a, xmm_b);
  cc.evex().vpmovuswb(xmm_a, ymm_b);
  cc.evex().vpmovuswb(ymm_a, zmm_b);
  cc.evex().vpmovw2m(kA, xmm_b);
  cc.evex().vpmovw2m(kA, ymm_b);
  cc.evex().vpmovw2m(kA, zmm_b);
  cc.evex().vpmovwb(xmm_a, xmm_b);
  cc.evex().vpmovwb(xmm_a, ymm_b);
  cc.evex().vpmovwb(ymm_a, zmm_b);
  cc.evex().vpmovzxbd(xmm_a, xmm_b);
  cc.evex().vpmovzxbd(ymm_a, xmm_b);
  cc.evex().vpmovzxbd(zmm_a, xmm_b);
  cc.evex().vpmovzxbq(xmm_a, xmm_b);
  cc.evex().vpmovzxbq(ymm_a, xmm_b);
  cc.evex().vpmovzxbq(zmm_a, xmm_b);
  cc.evex().vpmovzxbw(xmm_a, xmm_b);
  cc.evex().vpmovzxbw(ymm_a, xmm_b);
  cc.evex().vpmovzxbw(zmm_a, ymm_b);
  cc.evex().vpmovzxdq(xmm_a, xmm_b);
  cc.evex().vpmovzxdq(ymm_a, xmm_b);
  cc.evex().vpmovzxdq(zmm_a, ymm_b);
  cc.evex().vpmovzxwd(xmm_a, xmm_b);
  cc.evex().vpmovzxwd(ymm_a, xmm_b);
  cc.evex().vpmovzxwd(zmm_a, ymm_b);
  cc.evex().vpmovzxwq(xmm_a, xmm_b);
  cc.evex().vpmovzxwq(ymm_a, xmm_b);
  cc.evex().vpmovzxwq(zmm_a, xmm_b);
  cc.evex().vpmuldq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmuldq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmuldq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulhrsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulhrsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulhrsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulhuw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulhuw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulhuw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulhw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulhw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulhw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmulld(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmulld(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmulld(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmullq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmullq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmullq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmullw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmullw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmullw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmultishiftqb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmultishiftqb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmultishiftqb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpmuludq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpmuludq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpmuludq(zmm_a, zmm_b, zmm_c);
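  // vpopcnt* is exercised in its zmm form only here.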
cc.evex().vpopcntd(zmm_a, zmm_b);
|
|
cc.evex().vpopcntq(zmm_a, zmm_b);
|
|
cc.evex().vpord(xmm_a, xmm_b, xmm_c);
|
|
cc.evex().vpord(ymm_a, ymm_b, ymm_c);
|
|
cc.evex().vpord(zmm_a, zmm_b, zmm_c);
|
|
cc.evex().vporq(xmm_a, xmm_b, xmm_c);
|
|
cc.evex().vporq(ymm_a, ymm_b, ymm_c);
|
|
cc.evex().vporq(zmm_a, zmm_b, zmm_c);
|
|
  cc.evex().vprold(xmm_a, xmm_b, 0);
  cc.evex().vprold(ymm_a, ymm_b, 0);
  cc.evex().vprold(zmm_a, zmm_b, 0);
  cc.evex().vprolq(xmm_a, xmm_b, 0);
  cc.evex().vprolq(ymm_a, ymm_b, 0);
  cc.evex().vprolq(zmm_a, zmm_b, 0);
  cc.evex().vprolvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vprolvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vprolvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vprolvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vprolvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vprolvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vprord(xmm_a, xmm_b, 0);
  cc.evex().vprord(ymm_a, ymm_b, 0);
  cc.evex().vprord(zmm_a, zmm_b, 0);
  cc.evex().vprorq(xmm_a, xmm_b, 0);
  cc.evex().vprorq(ymm_a, ymm_b, 0);
  cc.evex().vprorq(zmm_a, zmm_b, 0);
  cc.evex().vprorvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vprorvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vprorvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vprorvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vprorvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vprorvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsadbw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsadbw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsadbw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpshufb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpshufb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpshufb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpshufd(xmm_a, xmm_b, 0);
  cc.evex().vpshufd(ymm_a, ymm_b, 0);
  cc.evex().vpshufd(zmm_a, zmm_b, 0);
  cc.evex().vpshufhw(xmm_a, xmm_b, 0);
  cc.evex().vpshufhw(ymm_a, ymm_b, 0);
  cc.evex().vpshufhw(zmm_a, zmm_b, 0);
  cc.evex().vpshuflw(xmm_a, xmm_b, 0);
  cc.evex().vpshuflw(ymm_a, ymm_b, 0);
  cc.evex().vpshuflw(zmm_a, zmm_b, 0);
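  // Packed shifts: by XMM-held count, by immediate, and per-element variable; then the subtract family.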
  cc.evex().vpslld(xmm_a, xmm_b, xmm_c);
  cc.evex().vpslld(xmm_a, xmm_b, 0);
  cc.evex().vpslld(ymm_a, ymm_b, xmm_c);
  cc.evex().vpslld(ymm_a, ymm_b, 0);
  cc.evex().vpslld(zmm_a, zmm_b, xmm_c);
  cc.evex().vpslld(zmm_a, zmm_b, 0);
  cc.evex().vpslldq(xmm_a, xmm_b, 0);
  cc.evex().vpslldq(ymm_a, ymm_b, 0);
  cc.evex().vpslldq(zmm_a, zmm_b, 0);
  cc.evex().vpsllq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllq(xmm_a, xmm_b, 0);
  cc.evex().vpsllq(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsllq(ymm_a, ymm_b, 0);
  cc.evex().vpsllq(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsllq(zmm_a, zmm_b, 0);
  cc.evex().vpsllvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsllvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsllvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsllvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsllvw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllvw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsllvw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsllw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsllw(xmm_a, xmm_b, 0);
  cc.evex().vpsllw(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsllw(ymm_a, ymm_b, 0);
  cc.evex().vpsllw(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsllw(zmm_a, zmm_b, 0);
  cc.evex().vpsrad(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrad(xmm_a, xmm_b, 0);
  cc.evex().vpsrad(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrad(ymm_a, ymm_b, 0);
  cc.evex().vpsrad(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrad(zmm_a, zmm_b, 0);
  cc.evex().vpsraq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsraq(xmm_a, xmm_b, 0);
  cc.evex().vpsraq(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsraq(ymm_a, ymm_b, 0);
  cc.evex().vpsraq(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsraq(zmm_a, zmm_b, 0);
  cc.evex().vpsravd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsravd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsravd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsravq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsravq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsravq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsravw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsravw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsravw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsraw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsraw(xmm_a, xmm_b, 0);
  cc.evex().vpsraw(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsraw(ymm_a, ymm_b, 0);
  cc.evex().vpsraw(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsraw(zmm_a, zmm_b, 0);
  cc.evex().vpsrld(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrld(xmm_a, xmm_b, 0);
  cc.evex().vpsrld(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrld(ymm_a, ymm_b, 0);
  cc.evex().vpsrld(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrld(zmm_a, zmm_b, 0);
  cc.evex().vpsrldq(xmm_a, xmm_b, 0);
  cc.evex().vpsrldq(ymm_a, ymm_b, 0);
  cc.evex().vpsrldq(zmm_a, zmm_b, 0);
  cc.evex().vpsrlq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlq(xmm_a, xmm_b, 0);
  cc.evex().vpsrlq(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrlq(ymm_a, ymm_b, 0);
  cc.evex().vpsrlq(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrlq(zmm_a, zmm_b, 0);
  cc.evex().vpsrlvd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlvd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsrlvd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsrlvq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlvq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsrlvq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsrlvw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlvw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsrlvw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsrlw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsrlw(xmm_a, xmm_b, 0);
  cc.evex().vpsrlw(ymm_a, ymm_b, xmm_c);
  cc.evex().vpsrlw(ymm_a, ymm_b, 0);
  cc.evex().vpsrlw(zmm_a, zmm_b, xmm_c);
  cc.evex().vpsrlw(zmm_a, zmm_b, 0);
  cc.evex().vpsubb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubsb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubsb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubsb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubsw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubsw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubsw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubusb(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubusb(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubusb(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubusw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubusw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubusw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpsubw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpsubw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpsubw(zmm_a, zmm_b, zmm_c);
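  // Ternary logic and test instructions that write their result into a mask register.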
  cc.evex().vpternlogd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vpternlogd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vpternlogd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vpternlogq(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vpternlogq(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vpternlogq(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vptestmb(kA, xmm_b, xmm_c);
  cc.evex().vptestmb(kA, ymm_b, ymm_c);
  cc.evex().vptestmb(kA, zmm_b, zmm_c);
  cc.evex().vptestmd(kA, xmm_b, xmm_c);
  cc.evex().vptestmd(kA, ymm_b, ymm_c);
  cc.evex().vptestmd(kA, zmm_b, zmm_c);
  cc.evex().vptestmq(kA, xmm_b, xmm_c);
  cc.evex().vptestmq(kA, ymm_b, ymm_c);
  cc.evex().vptestmq(kA, zmm_b, zmm_c);
  cc.evex().vptestmw(kA, xmm_b, xmm_c);
  cc.evex().vptestmw(kA, ymm_b, ymm_c);
  cc.evex().vptestmw(kA, zmm_b, zmm_c);
  cc.evex().vptestnmb(kA, xmm_b, xmm_c);
  cc.evex().vptestnmb(kA, ymm_b, ymm_c);
  cc.evex().vptestnmb(kA, zmm_b, zmm_c);
  cc.evex().vptestnmd(kA, xmm_b, xmm_c);
  cc.evex().vptestnmd(kA, ymm_b, ymm_c);
  cc.evex().vptestnmd(kA, zmm_b, zmm_c);
  cc.evex().vptestnmq(kA, xmm_b, xmm_c);
  cc.evex().vptestnmq(kA, ymm_b, ymm_c);
  cc.evex().vptestnmq(kA, zmm_b, zmm_c);
  cc.evex().vptestnmw(kA, xmm_b, xmm_c);
  cc.evex().vptestnmw(kA, ymm_b, ymm_c);
  cc.evex().vptestnmw(kA, zmm_b, zmm_c);
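  // Integer unpack/interleave and bitwise XOR.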
  cc.evex().vpunpckhbw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhbw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhbw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckhdq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhdq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhdq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckhqdq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhqdq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhqdq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckhwd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckhwd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckhwd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpcklbw(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpcklbw(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpcklbw(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpckldq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpckldq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpckldq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpcklqdq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpcklqdq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpcklqdq(zmm_a, zmm_b, zmm_c);
  cc.evex().vpunpcklwd(xmm_a, xmm_b, xmm_c);
  cc.evex().vpunpcklwd(ymm_a, ymm_b, ymm_c);
  cc.evex().vpunpcklwd(zmm_a, zmm_b, zmm_c);
  cc.evex().vpxord(xmm_a, xmm_b, xmm_c);
  cc.evex().vpxord(ymm_a, ymm_b, ymm_c);
  cc.evex().vpxord(zmm_a, zmm_b, zmm_c);
  cc.evex().vpxorq(xmm_a, xmm_b, xmm_c);
  cc.evex().vpxorq(ymm_a, ymm_b, ymm_c);
  cc.evex().vpxorq(zmm_a, zmm_b, zmm_c);
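  // Floating-point range, reciprocal approximation (rcp14/rsqrt14), reduce, round-scale, and scale.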
  cc.evex().vrangepd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrangepd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vrangepd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vrangeps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrangeps(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vrangeps(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vrangesd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrangess(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrcp14pd(xmm_a, xmm_b);
  cc.evex().vrcp14pd(ymm_a, ymm_b);
  cc.evex().vrcp14pd(zmm_a, zmm_b);
  cc.evex().vrcp14ps(xmm_a, xmm_b);
  cc.evex().vrcp14ps(ymm_a, ymm_b);
  cc.evex().vrcp14ps(zmm_a, zmm_b);
  cc.evex().vrcp14sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vrcp14ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vreducepd(xmm_a, xmm_b, 0);
  cc.evex().vreducepd(ymm_a, ymm_b, 0);
  cc.evex().vreducepd(zmm_a, zmm_b, 0);
  cc.evex().vreduceps(xmm_a, xmm_b, 0);
  cc.evex().vreduceps(ymm_a, ymm_b, 0);
  cc.evex().vreduceps(zmm_a, zmm_b, 0);
  cc.evex().vreducesd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vreducess(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrndscalepd(xmm_a, xmm_b, 0);
  cc.evex().vrndscalepd(ymm_a, ymm_b, 0);
  cc.evex().vrndscalepd(zmm_a, zmm_b, 0);
  cc.evex().vrndscaleps(xmm_a, xmm_b, 0);
  cc.evex().vrndscaleps(ymm_a, ymm_b, 0);
  cc.evex().vrndscaleps(zmm_a, zmm_b, 0);
  cc.evex().vrndscalesd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrndscaless(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vrsqrt14pd(xmm_a, xmm_b);
  cc.evex().vrsqrt14pd(ymm_a, ymm_b);
  cc.evex().vrsqrt14pd(zmm_a, zmm_b);
  cc.evex().vrsqrt14ps(xmm_a, xmm_b);
  cc.evex().vrsqrt14ps(ymm_a, ymm_b);
  cc.evex().vrsqrt14ps(zmm_a, zmm_b);
  cc.evex().vrsqrt14sd(xmm_a, xmm_b, xmm_c);
  cc.evex().vrsqrt14ss(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vscalefpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vscalefps(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefps(ymm_a, ymm_b, ymm_c);
  cc.evex().vscalefps(zmm_a, zmm_b, zmm_c);
  cc.evex().vscalefsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vscalefss(xmm_a, xmm_b, xmm_c);
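  // 128-bit lane shuffles (YMM/ZMM forms only) and the remaining packed/scalar FP operations.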
  cc.evex().vshuff32x4(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshuff32x4(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshuff64x2(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshuff64x2(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufi32x4(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufi32x4(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufi64x2(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufi64x2(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufpd(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vshufpd(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufpd(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vshufps(xmm_a, xmm_b, xmm_c, 0);
  cc.evex().vshufps(ymm_a, ymm_b, ymm_c, 0);
  cc.evex().vshufps(zmm_a, zmm_b, zmm_c, 0);
  cc.evex().vsqrtpd(xmm_a, xmm_b);
  cc.evex().vsqrtpd(ymm_a, ymm_b);
  cc.evex().vsqrtpd(zmm_a, zmm_b);
  cc.evex().vsqrtps(xmm_a, xmm_b);
  cc.evex().vsqrtps(ymm_a, ymm_b);
  cc.evex().vsqrtps(zmm_a, zmm_b);
  cc.evex().vsqrtsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vsqrtss(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vsubpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vsubps(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubps(ymm_a, ymm_b, ymm_c);
  cc.evex().vsubps(zmm_a, zmm_b, zmm_c);
  cc.evex().vsubsd(xmm_a, xmm_b, xmm_c);
  cc.evex().vsubss(xmm_a, xmm_b, xmm_c);
  cc.evex().vucomisd(xmm_a, xmm_b);
  cc.evex().vucomiss(xmm_a, xmm_b);
  cc.evex().vunpckhpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpckhpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpckhpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpckhps(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpckhps(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpckhps(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpcklpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpcklpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpcklpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpcklps(xmm_a, xmm_b, xmm_c);
  cc.evex().vunpcklps(ymm_a, ymm_b, ymm_c);
  cc.evex().vunpcklps(zmm_a, zmm_b, zmm_c);
  cc.evex().vxorpd(xmm_a, xmm_b, xmm_c);
  cc.evex().vxorpd(ymm_a, ymm_b, ymm_c);
  cc.evex().vxorpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vxorps(xmm_a, xmm_b, xmm_c);
  cc.evex().vxorps(ymm_a, ymm_b, ymm_c);
  cc.evex().vxorps(zmm_a, zmm_b, zmm_c);
}

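// Generates a long sequence of AVX-512 instructions using memory operands.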
template<typename Emitter>
static void generate_avx512_sequence_internal_reg_mem(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  Support::maybe_unused(kC);

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_d = vec_d.ymm();

  x86::Vec zmm_a = vec_a.zmm();
  x86::Vec zmm_b = vec_b.zmm();
  x86::Vec zmm_d = vec_d.zmm();

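  // Memory operands: untyped and explicitly sized views, plus vector-indexed (VSIB)
  // forms used by the gathers and scatters below.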
  x86::Mem m = x86::ptr(gpz);
  x86::Mem m32 = x86::dword_ptr(gpz);
  x86::Mem m64 = x86::qword_ptr(gpz);
  x86::Mem m128 = x86::xmmword_ptr(gpz);
  x86::Mem m256 = x86::ymmword_ptr(gpz);
  x86::Mem m512 = x86::zmmword_ptr(gpz);
  x86::Mem vx_ptr = x86::ptr(gpz, xmm_d);
  x86::Mem vy_ptr = x86::ptr(gpz, ymm_d);
  x86::Mem vz_ptr = x86::ptr(gpz, zmm_d);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

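  // Mask register loads and stores through memory.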
  cc.kmovb(kA, m);
  cc.kmovb(m, kB);
  cc.kmovd(kA, m);
  cc.kmovd(m, kB);
  cc.kmovq(kA, m);
  cc.kmovq(m, kB);
  cc.kmovw(kA, m);
  cc.kmovw(m, kB);

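  // The same AVX-512 sequence as the register form, with the last source operand in memory.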
  cc.evex().vaddpd(xmm_a, xmm_b, m);
  cc.evex().vaddpd(ymm_a, ymm_b, m);
  cc.evex().vaddpd(zmm_a, zmm_b, m);
  cc.evex().vaddps(xmm_a, xmm_b, m);
  cc.evex().vaddps(ymm_a, ymm_b, m);
  cc.evex().vaddps(zmm_a, zmm_b, m);
  cc.evex().vaddsd(xmm_a, xmm_b, m);
  cc.evex().vaddss(xmm_a, xmm_b, m);
  cc.evex().valignd(xmm_a, xmm_b, m, 0);
  cc.evex().valignd(ymm_a, ymm_b, m, 0);
  cc.evex().valignd(zmm_a, zmm_b, m, 0);
  cc.evex().valignq(xmm_a, xmm_b, m, 0);
  cc.evex().valignq(ymm_a, ymm_b, m, 0);
  cc.evex().valignq(zmm_a, zmm_b, m, 0);
  cc.evex().vandnpd(xmm_a, xmm_b, m);
  cc.evex().vandnpd(ymm_a, ymm_b, m);
  cc.evex().vandnpd(zmm_a, zmm_b, m);
  cc.evex().vandnps(xmm_a, xmm_b, m);
  cc.evex().vandnps(ymm_a, ymm_b, m);
  cc.evex().vandnps(zmm_a, zmm_b, m);
  cc.evex().vandpd(xmm_a, xmm_b, m);
  cc.evex().vandpd(ymm_a, ymm_b, m);
  cc.evex().vandpd(zmm_a, zmm_b, m);
  cc.evex().vandps(xmm_a, xmm_b, m);
  cc.evex().vandps(ymm_a, ymm_b, m);
  cc.evex().vandps(zmm_a, zmm_b, m);
  cc.evex().vblendmpd(xmm_a, xmm_b, m);
  cc.evex().vblendmpd(ymm_a, ymm_b, m);
  cc.evex().vblendmpd(zmm_a, zmm_b, m);
  cc.evex().vblendmps(xmm_a, xmm_b, m);
  cc.evex().vblendmps(ymm_a, ymm_b, m);
  cc.evex().vblendmps(zmm_a, zmm_b, m);
  cc.evex().vbroadcastf32x2(ymm_a, m);
  cc.evex().vbroadcastf32x2(zmm_a, m);
  cc.evex().vbroadcastf32x4(ymm_a, m);
  cc.evex().vbroadcastf32x4(zmm_a, m);
  cc.evex().vbroadcastf32x8(zmm_a, m);
  cc.evex().vbroadcastf64x2(ymm_a, m);
  cc.evex().vbroadcastf64x2(zmm_a, m);
  cc.evex().vbroadcastf64x4(zmm_a, m);
  cc.evex().vbroadcasti32x2(xmm_a, m);
  cc.evex().vbroadcasti32x2(ymm_a, m);
  cc.evex().vbroadcasti32x2(zmm_a, m);
  cc.evex().vbroadcasti32x4(ymm_a, m);
  cc.evex().vbroadcasti32x4(zmm_a, m);
  cc.evex().vbroadcasti32x8(zmm_a, m);
  cc.evex().vbroadcasti64x2(ymm_a, m);
  cc.evex().vbroadcasti64x2(zmm_a, m);
  cc.evex().vbroadcasti64x4(zmm_a, m);
  cc.evex().vbroadcastsd(ymm_a, m);
  cc.evex().vbroadcastsd(zmm_a, m);
  cc.evex().vbroadcastss(xmm_a, m);
  cc.evex().vbroadcastss(ymm_a, m);
  cc.evex().vbroadcastss(zmm_a, m);
  cc.evex().vcmppd(kA, xmm_b, m, 0);
  cc.evex().vcmppd(kA, ymm_b, m, 0);
  cc.evex().vcmppd(kA, zmm_b, m, 0);
  cc.evex().vcmpps(kA, xmm_b, m, 0);
  cc.evex().vcmpps(kA, ymm_b, m, 0);
  cc.evex().vcmpps(kA, zmm_b, m, 0);
  cc.evex().vcmpsd(kA, xmm_b, m, 0);
  cc.evex().vcmpss(kA, xmm_b, m, 0);
  cc.evex().vcomisd(xmm_a, m);
  cc.evex().vcomiss(xmm_a, m);
  cc.evex().vcompresspd(m, xmm_b);
  cc.evex().vcompresspd(m, ymm_b);
  cc.evex().vcompresspd(m, zmm_b);
  cc.evex().vcompressps(m, xmm_b);
  cc.evex().vcompressps(m, ymm_b);
  cc.evex().vcompressps(m, zmm_b);
  cc.evex().vcvtdq2pd(xmm_a, m);
  cc.evex().vcvtdq2pd(ymm_a, m);
  cc.evex().vcvtdq2pd(zmm_a, m);
  cc.evex().vcvtdq2ps(xmm_a, m);
  cc.evex().vcvtdq2ps(ymm_a, m);
  cc.evex().vcvtdq2ps(zmm_a, m);
  cc.evex().vcvtpd2dq(xmm_a, m128);
  cc.evex().vcvtpd2dq(xmm_a, m256);
  cc.evex().vcvtpd2dq(ymm_a, m512);
  cc.evex().vcvtpd2qq(xmm_a, m);
  cc.evex().vcvtpd2qq(ymm_a, m);
  cc.evex().vcvtpd2qq(zmm_a, m);
  cc.evex().vcvtpd2udq(xmm_a, m128);
  cc.evex().vcvtpd2udq(xmm_a, m256);
  cc.evex().vcvtpd2udq(ymm_a, m512);
  cc.evex().vcvtpd2uqq(xmm_a, m);
  cc.evex().vcvtpd2uqq(ymm_a, m);
  cc.evex().vcvtpd2uqq(zmm_a, m);
  cc.evex().vcvtph2ps(xmm_a, m);
  cc.evex().vcvtph2ps(ymm_a, m);
  cc.evex().vcvtph2ps(zmm_a, m);
  cc.evex().vcvtps2dq(xmm_a, m);
  cc.evex().vcvtps2dq(ymm_a, m);
  cc.evex().vcvtps2dq(zmm_a, m);
  cc.evex().vcvtps2pd(xmm_a, m);
  cc.evex().vcvtps2pd(ymm_a, m);
  cc.evex().vcvtps2pd(zmm_a, m);
  cc.evex().vcvtps2ph(m, xmm_b, 0);
  cc.evex().vcvtps2ph(m, ymm_b, 0);
  cc.evex().vcvtps2ph(m, zmm_b, 0);
  cc.evex().vcvtps2qq(xmm_a, m);
  cc.evex().vcvtps2qq(ymm_a, m);
  cc.evex().vcvtps2qq(zmm_a, m);
  cc.evex().vcvtps2udq(xmm_a, m);
  cc.evex().vcvtps2udq(ymm_a, m);
  cc.evex().vcvtps2udq(zmm_a, m);
  cc.evex().vcvtps2uqq(xmm_a, m);
  cc.evex().vcvtps2uqq(ymm_a, m);
  cc.evex().vcvtps2uqq(zmm_a, m);
  cc.evex().vcvtqq2pd(xmm_a, m);
  cc.evex().vcvtqq2pd(ymm_a, m);
  cc.evex().vcvtqq2pd(zmm_a, m);
  cc.evex().vcvtqq2ps(xmm_a, m128);
  cc.evex().vcvtqq2ps(xmm_a, m256);
  cc.evex().vcvtqq2ps(ymm_a, m512);
  cc.evex().vcvtsd2si(gpd, m);
  cc.evex().vcvtsd2si(gpz, m);
  cc.evex().vcvtsd2ss(xmm_a, xmm_b, m);
  cc.evex().vcvtsd2usi(gpd, m);
  cc.evex().vcvtsd2usi(gpz, m);
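  // The m64 (qword) integer-source conversion forms are only encodable in 64-bit mode,
  // hence the is_64bit() guards.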
  cc.evex().vcvtsi2sd(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtsi2sd(xmm_a, xmm_b, m64);
  cc.evex().vcvtsi2ss(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtsi2ss(xmm_a, xmm_b, m64);
  cc.evex().vcvtss2sd(xmm_a, xmm_b, m);
  cc.evex().vcvtss2si(gpd, m);
  cc.evex().vcvtss2si(gpz, m);
  cc.evex().vcvtss2usi(gpd, m);
  cc.evex().vcvtss2usi(gpz, m);
  cc.evex().vcvttpd2dq(xmm_a, m128);
  cc.evex().vcvttpd2dq(xmm_a, m256);
  cc.evex().vcvttpd2dq(ymm_a, m512);
  cc.evex().vcvttpd2qq(xmm_a, m);
  cc.evex().vcvttpd2qq(ymm_a, m);
  cc.evex().vcvttpd2qq(zmm_a, m);
  cc.evex().vcvttpd2udq(xmm_a, m128);
  cc.evex().vcvttpd2udq(xmm_a, m256);
  cc.evex().vcvttpd2udq(ymm_a, m512);
  cc.evex().vcvttpd2uqq(xmm_a, m);
  cc.evex().vcvttpd2uqq(ymm_a, m);
  cc.evex().vcvttpd2uqq(zmm_a, m);
  cc.evex().vcvttps2dq(xmm_a, m);
  cc.evex().vcvttps2dq(ymm_a, m);
  cc.evex().vcvttps2dq(zmm_a, m);
  cc.evex().vcvttps2qq(xmm_a, m);
  cc.evex().vcvttps2qq(ymm_a, m);
  cc.evex().vcvttps2qq(zmm_a, m);
  cc.evex().vcvttps2udq(xmm_a, m);
  cc.evex().vcvttps2udq(ymm_a, m);
  cc.evex().vcvttps2udq(zmm_a, m);
  cc.evex().vcvttps2uqq(xmm_a, m);
  cc.evex().vcvttps2uqq(ymm_a, m);
  cc.evex().vcvttps2uqq(zmm_a, m);
  cc.evex().vcvttsd2si(gpd, m);
  cc.evex().vcvttsd2si(gpz, m);
  cc.evex().vcvttsd2usi(gpd, m);
  cc.evex().vcvttsd2usi(gpz, m);
  cc.evex().vcvttss2si(gpd, m);
  cc.evex().vcvttss2si(gpz, m);
  cc.evex().vcvttss2usi(gpd, m);
  cc.evex().vcvttss2usi(gpz, m);
  cc.evex().vcvtudq2pd(xmm_a, m);
  cc.evex().vcvtudq2pd(ymm_a, m);
  cc.evex().vcvtudq2pd(zmm_a, m);
  cc.evex().vcvtudq2ps(xmm_a, m);
  cc.evex().vcvtudq2ps(ymm_a, m);
  cc.evex().vcvtudq2ps(zmm_a, m);
  cc.evex().vcvtuqq2pd(xmm_a, m);
  cc.evex().vcvtuqq2pd(ymm_a, m);
  cc.evex().vcvtuqq2pd(zmm_a, m);
  cc.evex().vcvtuqq2ps(xmm_a, m128);
  cc.evex().vcvtuqq2ps(xmm_a, m256);
  cc.evex().vcvtuqq2ps(ymm_a, m512);
  cc.evex().vcvtusi2sd(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtusi2sd(xmm_a, xmm_b, m64);
  cc.evex().vcvtusi2ss(xmm_a, xmm_b, m32);
  if (cc.is_64bit()) cc.evex().vcvtusi2ss(xmm_a, xmm_b, m64);
  cc.evex().vdbpsadbw(xmm_a, xmm_b, m, 0);
  cc.evex().vdbpsadbw(ymm_a, ymm_b, m, 0);
  cc.evex().vdbpsadbw(zmm_a, zmm_b, m, 0);
  cc.evex().vdivpd(xmm_a, xmm_b, m);
  cc.evex().vdivpd(ymm_a, ymm_b, m);
  cc.evex().vdivpd(zmm_a, zmm_b, m);
  cc.evex().vdivps(xmm_a, xmm_b, m);
  cc.evex().vdivps(ymm_a, ymm_b, m);
  cc.evex().vdivps(zmm_a, zmm_b, m);
  cc.evex().vdivsd(xmm_a, xmm_b, m);
  cc.evex().vdivss(xmm_a, xmm_b, m);
  cc.evex().vexpandpd(xmm_a, m);
  cc.evex().vexpandpd(ymm_a, m);
  cc.evex().vexpandpd(zmm_a, m);
  cc.evex().vexpandps(xmm_a, m);
  cc.evex().vexpandps(ymm_a, m);
  cc.evex().vexpandps(zmm_a, m);
  cc.evex().vextractf32x4(m, ymm_b, 0);
  cc.evex().vextractf32x4(m, zmm_b, 0);
  cc.evex().vextractf32x8(m, zmm_b, 0);
  cc.evex().vextractf64x2(m, ymm_b, 0);
  cc.evex().vextractf64x2(m, zmm_b, 0);
  cc.evex().vextractf64x4(m, zmm_b, 0);
  cc.evex().vextracti32x4(m, ymm_b, 0);
  cc.evex().vextracti32x4(m, zmm_b, 0);
  cc.evex().vextracti32x8(m, zmm_b, 0);
  cc.evex().vextracti64x2(m, ymm_b, 0);
  cc.evex().vextracti64x2(m, zmm_b, 0);
  cc.evex().vextracti64x4(m, zmm_b, 0);
  cc.evex().vextractps(m, xmm_b, 0);
  cc.evex().vfixupimmpd(xmm_a, xmm_b, m, 0);
  cc.evex().vfixupimmpd(ymm_a, ymm_b, m, 0);
  cc.evex().vfixupimmpd(zmm_a, zmm_b, m, 0);
  cc.evex().vfixupimmps(xmm_a, xmm_b, m, 0);
  cc.evex().vfixupimmps(ymm_a, ymm_b, m, 0);
  cc.evex().vfixupimmps(zmm_a, zmm_b, m, 0);
  cc.evex().vfixupimmsd(xmm_a, xmm_b, m, 0);
  cc.evex().vfixupimmss(xmm_a, xmm_b, m, 0);
  cc.evex().vfmadd132pd(xmm_a, xmm_b, m);
  cc.evex().vfmadd132pd(ymm_a, ymm_b, m);
  cc.evex().vfmadd132pd(zmm_a, zmm_b, m);
  cc.evex().vfmadd132ps(xmm_a, xmm_b, m);
  cc.evex().vfmadd132ps(ymm_a, ymm_b, m);
  cc.evex().vfmadd132ps(zmm_a, zmm_b, m);
  cc.evex().vfmadd132sd(xmm_a, xmm_b, m);
  cc.evex().vfmadd132ss(xmm_a, xmm_b, m);
  cc.evex().vfmadd213pd(xmm_a, xmm_b, m);
  cc.evex().vfmadd213pd(ymm_a, ymm_b, m);
  cc.evex().vfmadd213pd(zmm_a, zmm_b, m);
  cc.evex().vfmadd213ps(xmm_a, xmm_b, m);
  cc.evex().vfmadd213ps(ymm_a, ymm_b, m);
  cc.evex().vfmadd213ps(zmm_a, zmm_b, m);
  cc.evex().vfmadd213sd(xmm_a, xmm_b, m);
  cc.evex().vfmadd213ss(xmm_a, xmm_b, m);
  cc.evex().vfmadd231pd(xmm_a, xmm_b, m);
  cc.evex().vfmadd231pd(ymm_a, ymm_b, m);
  cc.evex().vfmadd231pd(zmm_a, zmm_b, m);
  cc.evex().vfmadd231ps(xmm_a, xmm_b, m);
  cc.evex().vfmadd231ps(ymm_a, ymm_b, m);
  cc.evex().vfmadd231ps(zmm_a, zmm_b, m);
  cc.evex().vfmadd231sd(xmm_a, xmm_b, m);
  cc.evex().vfmadd231ss(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub132pd(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub132pd(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub132pd(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub132ps(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub132ps(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub132ps(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub213pd(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub213pd(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub213pd(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub213ps(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub213ps(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub213ps(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub231pd(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub231pd(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub231pd(zmm_a, zmm_b, m);
  cc.evex().vfmaddsub231ps(xmm_a, xmm_b, m);
  cc.evex().vfmaddsub231ps(ymm_a, ymm_b, m);
  cc.evex().vfmaddsub231ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub132pd(xmm_a, xmm_b, m);
  cc.evex().vfmsub132pd(ymm_a, ymm_b, m);
  cc.evex().vfmsub132pd(zmm_a, zmm_b, m);
  cc.evex().vfmsub132ps(xmm_a, xmm_b, m);
  cc.evex().vfmsub132ps(ymm_a, ymm_b, m);
  cc.evex().vfmsub132ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub132sd(xmm_a, xmm_b, m);
  cc.evex().vfmsub132ss(xmm_a, xmm_b, m);
  cc.evex().vfmsub213pd(xmm_a, xmm_b, m);
  cc.evex().vfmsub213pd(ymm_a, ymm_b, m);
  cc.evex().vfmsub213pd(zmm_a, zmm_b, m);
  cc.evex().vfmsub213ps(xmm_a, xmm_b, m);
  cc.evex().vfmsub213ps(ymm_a, ymm_b, m);
  cc.evex().vfmsub213ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub213sd(xmm_a, xmm_b, m);
  cc.evex().vfmsub213ss(xmm_a, xmm_b, m);
  cc.evex().vfmsub231pd(xmm_a, xmm_b, m);
  cc.evex().vfmsub231pd(ymm_a, ymm_b, m);
  cc.evex().vfmsub231pd(zmm_a, zmm_b, m);
  cc.evex().vfmsub231ps(xmm_a, xmm_b, m);
  cc.evex().vfmsub231ps(ymm_a, ymm_b, m);
  cc.evex().vfmsub231ps(zmm_a, zmm_b, m);
  cc.evex().vfmsub231sd(xmm_a, xmm_b, m);
  cc.evex().vfmsub231ss(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd132pd(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd132pd(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd132pd(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd132ps(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd132ps(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd132ps(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd213pd(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd213pd(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd213pd(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd213ps(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd213ps(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd213ps(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd231pd(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd231pd(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd231pd(zmm_a, zmm_b, m);
  cc.evex().vfmsubadd231ps(xmm_a, xmm_b, m);
  cc.evex().vfmsubadd231ps(ymm_a, ymm_b, m);
  cc.evex().vfmsubadd231ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd132pd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd132pd(ymm_a, ymm_b, m);
  cc.evex().vfnmadd132pd(zmm_a, zmm_b, m);
  cc.evex().vfnmadd132ps(xmm_a, xmm_b, m);
  cc.evex().vfnmadd132ps(ymm_a, ymm_b, m);
  cc.evex().vfnmadd132ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd132sd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd132ss(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213pd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213pd(ymm_a, ymm_b, m);
  cc.evex().vfnmadd213pd(zmm_a, zmm_b, m);
  cc.evex().vfnmadd213ps(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213ps(ymm_a, ymm_b, m);
  cc.evex().vfnmadd213ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd213sd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd213ss(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231pd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231pd(ymm_a, ymm_b, m);
  cc.evex().vfnmadd231pd(zmm_a, zmm_b, m);
  cc.evex().vfnmadd231ps(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231ps(ymm_a, ymm_b, m);
  cc.evex().vfnmadd231ps(zmm_a, zmm_b, m);
  cc.evex().vfnmadd231sd(xmm_a, xmm_b, m);
  cc.evex().vfnmadd231ss(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132pd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132pd(ymm_a, ymm_b, m);
  cc.evex().vfnmsub132pd(zmm_a, zmm_b, m);
  cc.evex().vfnmsub132ps(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132ps(ymm_a, ymm_b, m);
  cc.evex().vfnmsub132ps(zmm_a, zmm_b, m);
  cc.evex().vfnmsub132sd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub132ss(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213pd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213pd(ymm_a, ymm_b, m);
  cc.evex().vfnmsub213pd(zmm_a, zmm_b, m);
  cc.evex().vfnmsub213ps(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213ps(ymm_a, ymm_b, m);
  cc.evex().vfnmsub213ps(zmm_a, zmm_b, m);
  cc.evex().vfnmsub213sd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub213ss(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231pd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231pd(ymm_a, ymm_b, m);
  cc.evex().vfnmsub231pd(zmm_a, zmm_b, m);
  cc.evex().vfnmsub231ps(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231ps(ymm_a, ymm_b, m);
  cc.evex().vfnmsub231ps(zmm_a, zmm_b, m);
  cc.evex().vfnmsub231sd(xmm_a, xmm_b, m);
  cc.evex().vfnmsub231ss(xmm_a, xmm_b, m);
  cc.evex().vfpclasspd(kA, m128, 0);
  cc.evex().vfpclasspd(kA, m256, 0);
  cc.evex().vfpclasspd(kA, m512, 0);
  cc.evex().vfpclassps(kA, m128, 0);
  cc.evex().vfpclassps(kA, m256, 0);
  cc.evex().vfpclassps(kA, m512, 0);
  cc.evex().vfpclasssd(kA, m, 0);
  cc.evex().vfpclassss(kA, m, 0);
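  // Gathers use VSIB addressing and require a write-mask register, selected via k(kA).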
  cc.evex().k(kA).vgatherdpd(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherdpd(ymm_a, vx_ptr);
  cc.evex().k(kA).vgatherdpd(zmm_a, vy_ptr);
  cc.evex().k(kA).vgatherdps(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherdps(ymm_a, vy_ptr);
  cc.evex().k(kA).vgatherdps(zmm_a, vz_ptr);
  cc.evex().k(kA).vgatherqpd(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherqpd(ymm_a, vy_ptr);
  cc.evex().k(kA).vgatherqpd(zmm_a, vz_ptr);
  cc.evex().k(kA).vgatherqps(xmm_a, vx_ptr);
  cc.evex().k(kA).vgatherqps(xmm_a, vy_ptr);
  cc.evex().k(kA).vgatherqps(ymm_a, vz_ptr);
  cc.evex().vgetexppd(xmm_a, m);
  cc.evex().vgetexppd(ymm_a, m);
  cc.evex().vgetexppd(zmm_a, m);
  cc.evex().vgetexpps(xmm_a, m);
  cc.evex().vgetexpps(ymm_a, m);
  cc.evex().vgetexpps(zmm_a, m);
  cc.evex().vgetexpsd(xmm_a, xmm_b, m);
  cc.evex().vgetexpss(xmm_a, xmm_b, m);
  cc.evex().vgetmantpd(xmm_a, m, 0);
  cc.evex().vgetmantpd(ymm_a, m, 0);
  cc.evex().vgetmantpd(zmm_a, m, 0);
  cc.evex().vgetmantps(xmm_a, m, 0);
  cc.evex().vgetmantps(ymm_a, m, 0);
  cc.evex().vgetmantps(zmm_a, m, 0);
  cc.evex().vgetmantsd(xmm_a, xmm_b, m, 0);
  cc.evex().vgetmantss(xmm_a, xmm_b, m, 0);
  cc.evex().vinsertf32x4(ymm_a, ymm_b, m, 0);
  cc.evex().vinsertf32x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertf32x8(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertf64x2(ymm_a, ymm_b, m, 0);
  cc.evex().vinsertf64x2(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertf64x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti32x4(ymm_a, ymm_b, m, 0);
  cc.evex().vinserti32x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti32x8(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti64x2(ymm_a, ymm_b, m, 0);
  cc.evex().vinserti64x2(zmm_a, zmm_b, m, 0);
  cc.evex().vinserti64x4(zmm_a, zmm_b, m, 0);
  cc.evex().vinsertps(xmm_a, xmm_b, m, 0);
  cc.evex().vmaxpd(xmm_a, xmm_b, m);
  cc.evex().vmaxpd(ymm_a, ymm_b, m);
  cc.evex().vmaxpd(zmm_a, zmm_b, m);
  cc.evex().vmaxps(xmm_a, xmm_b, m);
  cc.evex().vmaxps(ymm_a, ymm_b, m);
  cc.evex().vmaxps(zmm_a, zmm_b, m);
  cc.evex().vmaxsd(xmm_a, xmm_b, m);
  cc.evex().vmaxss(xmm_a, xmm_b, m);
  cc.evex().vminpd(xmm_a, xmm_b, m);
  cc.evex().vminpd(ymm_a, ymm_b, m);
  cc.evex().vminpd(zmm_a, zmm_b, m);
  cc.evex().vminps(xmm_a, xmm_b, m);
  cc.evex().vminps(ymm_a, ymm_b, m);
  cc.evex().vminps(zmm_a, zmm_b, m);
  cc.evex().vminsd(xmm_a, xmm_b, m);
  cc.evex().vminss(xmm_a, xmm_b, m);
  cc.evex().vmovapd(xmm_a, m);
  cc.evex().vmovapd(m, xmm_b);
  cc.evex().vmovapd(ymm_a, m);
  cc.evex().vmovapd(m, ymm_b);
  cc.evex().vmovapd(zmm_a, m);
  cc.evex().vmovapd(m, zmm_b);
  cc.evex().vmovaps(xmm_a, m);
  cc.evex().vmovaps(m, xmm_b);
  cc.evex().vmovaps(ymm_a, m);
  cc.evex().vmovaps(m, ymm_b);
  cc.evex().vmovaps(zmm_a, m);
  cc.evex().vmovaps(m, zmm_b);
  cc.evex().vmovd(m, xmm_b);
  cc.evex().vmovd(xmm_a, m);
  cc.evex().vmovddup(xmm_a, m);
  cc.evex().vmovddup(ymm_a, m);
  cc.evex().vmovddup(zmm_a, m);
  cc.evex().vmovdqa32(xmm_a, m);
  cc.evex().vmovdqa32(m, xmm_b);
  cc.evex().vmovdqa32(ymm_a, m);
  cc.evex().vmovdqa32(m, ymm_b);
  cc.evex().vmovdqa32(zmm_a, m);
  cc.evex().vmovdqa32(m, zmm_b);
  cc.evex().vmovdqa64(xmm_a, m);
  cc.evex().vmovdqa64(m, xmm_b);
  cc.evex().vmovdqa64(ymm_a, m);
  cc.evex().vmovdqa64(m, ymm_b);
  cc.evex().vmovdqa64(zmm_a, m);
  cc.evex().vmovdqa64(m, zmm_b);
  cc.evex().vmovdqu16(xmm_a, m);
  cc.evex().vmovdqu16(m, xmm_b);
  cc.evex().vmovdqu16(ymm_a, m);
  cc.evex().vmovdqu16(m, ymm_b);
  cc.evex().vmovdqu16(zmm_a, m);
  cc.evex().vmovdqu16(m, zmm_b);
  cc.evex().vmovdqu32(xmm_a, m);
  cc.evex().vmovdqu32(m, xmm_b);
  cc.evex().vmovdqu32(ymm_a, m);
  cc.evex().vmovdqu32(m, ymm_b);
  cc.evex().vmovdqu32(zmm_a, m);
  cc.evex().vmovdqu32(m, zmm_b);
  cc.evex().vmovdqu64(xmm_a, m);
  cc.evex().vmovdqu64(m, xmm_b);
  cc.evex().vmovdqu64(ymm_a, m);
  cc.evex().vmovdqu64(m, ymm_b);
  cc.evex().vmovdqu64(zmm_a, m);
  cc.evex().vmovdqu64(m, zmm_b);
  cc.evex().vmovdqu8(xmm_a, m);
  cc.evex().vmovdqu8(m, xmm_b);
  cc.evex().vmovdqu8(ymm_a, m);
  cc.evex().vmovdqu8(m, ymm_b);
  cc.evex().vmovdqu8(zmm_a, m);
  cc.evex().vmovdqu8(m, zmm_b);
  cc.evex().vmovhpd(m, xmm_b);
  cc.evex().vmovhpd(xmm_a, xmm_b, m);
  cc.evex().vmovhps(m, xmm_b);
  cc.evex().vmovhps(xmm_a, xmm_b, m);
  cc.evex().vmovlpd(m, xmm_b);
  cc.evex().vmovlpd(xmm_a, xmm_b, m);
  cc.evex().vmovlps(m, xmm_b);
  cc.evex().vmovlps(xmm_a, xmm_b, m);
  cc.evex().vmovntdq(m, xmm_b);
  cc.evex().vmovntdq(m, ymm_b);
  cc.evex().vmovntdq(m, zmm_b);
  cc.evex().vmovntdqa(xmm_a, m);
  cc.evex().vmovntdqa(ymm_a, m);
  cc.evex().vmovntdqa(zmm_a, m);
  cc.evex().vmovntpd(m, xmm_b);
  cc.evex().vmovntpd(m, ymm_b);
  cc.evex().vmovntpd(m, zmm_b);
  cc.evex().vmovntps(m, xmm_b);
  cc.evex().vmovntps(m, ymm_b);
  cc.evex().vmovntps(m, zmm_b);
  cc.evex().vmovq(m, xmm_b);
  cc.evex().vmovq(xmm_a, m);
  cc.evex().vmovq(xmm_a, m);
  cc.evex().vmovq(m, xmm_b);
  cc.evex().vmovsd(m, xmm_b);
  cc.evex().vmovsd(xmm_a, m);
  cc.evex().vmovshdup(xmm_a, m);
  cc.evex().vmovshdup(ymm_a, m);
  cc.evex().vmovshdup(zmm_a, m);
  cc.evex().vmovsldup(xmm_a, m);
  cc.evex().vmovsldup(ymm_a, m);
  cc.evex().vmovsldup(zmm_a, m);
  cc.evex().vmovss(m, xmm_b);
  cc.evex().vmovss(xmm_a, m);
  cc.evex().vmovupd(xmm_a, m);
  cc.evex().vmovupd(m, xmm_b);
  cc.evex().vmovupd(ymm_a, m);
  cc.evex().vmovupd(m, ymm_b);
  cc.evex().vmovupd(zmm_a, m);
  cc.evex().vmovupd(m, zmm_b);
  cc.evex().vmovups(xmm_a, m);
  cc.evex().vmovups(m, xmm_b);
  cc.evex().vmovups(ymm_a, m);
  cc.evex().vmovups(m, ymm_b);
  cc.evex().vmovups(zmm_a, m);
  cc.evex().vmovups(m, zmm_b);
  cc.evex().vmulpd(xmm_a, xmm_b, m);
  cc.evex().vmulpd(ymm_a, ymm_b, m);
  cc.evex().vmulpd(zmm_a, zmm_b, m);
  cc.evex().vmulps(xmm_a, xmm_b, m);
  cc.evex().vmulps(ymm_a, ymm_b, m);
  cc.evex().vmulps(zmm_a, zmm_b, m);
  cc.evex().vmulsd(xmm_a, xmm_b, m);
  cc.evex().vmulss(xmm_a, xmm_b, m);
  cc.evex().vorpd(xmm_a, xmm_b, m);
  cc.evex().vorpd(ymm_a, ymm_b, m);
  cc.evex().vorpd(zmm_a, zmm_b, m);
  cc.evex().vorps(xmm_a, xmm_b, m);
  cc.evex().vorps(ymm_a, ymm_b, m);
  cc.evex().vorps(zmm_a, zmm_b, m);
  cc.evex().vpabsb(xmm_a, m);
  cc.evex().vpabsb(ymm_a, m);
  cc.evex().vpabsb(zmm_a, m);
  cc.evex().vpabsd(xmm_a, m);
  cc.evex().vpabsd(ymm_a, m);
  cc.evex().vpabsd(zmm_a, m);
  cc.evex().vpabsq(xmm_a, m);
  cc.evex().vpabsq(ymm_a, m);
  cc.evex().vpabsq(zmm_a, m);
  cc.evex().vpabsw(xmm_a, m);
  cc.evex().vpabsw(ymm_a, m);
  cc.evex().vpabsw(zmm_a, m);
  cc.evex().vpackssdw(xmm_a, xmm_b, m);
  cc.evex().vpackssdw(ymm_a, ymm_b, m);
  cc.evex().vpackssdw(zmm_a, zmm_b, m);
  cc.evex().vpacksswb(xmm_a, xmm_b, m);
  cc.evex().vpacksswb(ymm_a, ymm_b, m);
  cc.evex().vpacksswb(zmm_a, zmm_b, m);
  cc.evex().vpackusdw(xmm_a, xmm_b, m);
  cc.evex().vpackusdw(ymm_a, ymm_b, m);
  cc.evex().vpackusdw(zmm_a, zmm_b, m);
  cc.evex().vpackuswb(xmm_a, xmm_b, m);
  cc.evex().vpackuswb(ymm_a, ymm_b, m);
  cc.evex().vpackuswb(zmm_a, zmm_b, m);
  cc.evex().vpaddb(xmm_a, xmm_b, m);
  cc.evex().vpaddb(ymm_a, ymm_b, m);
  cc.evex().vpaddb(zmm_a, zmm_b, m);
  cc.evex().vpaddd(xmm_a, xmm_b, m);
  cc.evex().vpaddd(ymm_a, ymm_b, m);
  cc.evex().vpaddd(zmm_a, zmm_b, m);
  cc.evex().vpaddq(xmm_a, xmm_b, m);
  cc.evex().vpaddq(ymm_a, ymm_b, m);
  cc.evex().vpaddq(zmm_a, zmm_b, m);
  cc.evex().vpaddsb(xmm_a, xmm_b, m);
  cc.evex().vpaddsb(ymm_a, ymm_b, m);
  cc.evex().vpaddsb(zmm_a, zmm_b, m);
  cc.evex().vpaddsw(xmm_a, xmm_b, m);
  cc.evex().vpaddsw(ymm_a, ymm_b, m);
  cc.evex().vpaddsw(zmm_a, zmm_b, m);
  cc.evex().vpaddusb(xmm_a, xmm_b, m);
  cc.evex().vpaddusb(ymm_a, ymm_b, m);
  cc.evex().vpaddusb(zmm_a, zmm_b, m);
  cc.evex().vpaddusw(xmm_a, xmm_b, m);
  cc.evex().vpaddusw(ymm_a, ymm_b, m);
  cc.evex().vpaddusw(zmm_a, zmm_b, m);
  cc.evex().vpaddw(xmm_a, xmm_b, m);
  cc.evex().vpaddw(ymm_a, ymm_b, m);
  cc.evex().vpaddw(zmm_a, zmm_b, m);
  cc.evex().vpalignr(xmm_a, xmm_b, m, 0);
  cc.evex().vpalignr(ymm_a, ymm_b, m, 0);
  cc.evex().vpalignr(zmm_a, zmm_b, m, 0);
  cc.evex().vpandd(xmm_a, xmm_b, m);
  cc.evex().vpandd(ymm_a, ymm_b, m);
  cc.evex().vpandd(zmm_a, zmm_b, m);
  cc.evex().vpandnd(xmm_a, xmm_b, m);
  cc.evex().vpandnd(ymm_a, ymm_b, m);
  cc.evex().vpandnd(zmm_a, zmm_b, m);
  cc.evex().vpandnq(xmm_a, xmm_b, m);
  cc.evex().vpandnq(ymm_a, ymm_b, m);
  cc.evex().vpandnq(zmm_a, zmm_b, m);
  cc.evex().vpandq(xmm_a, xmm_b, m);
  cc.evex().vpandq(ymm_a, ymm_b, m);
  cc.evex().vpandq(zmm_a, zmm_b, m);
  cc.evex().vpavgb(xmm_a, xmm_b, m);
  cc.evex().vpavgb(ymm_a, ymm_b, m);
  cc.evex().vpavgb(zmm_a, zmm_b, m);
  cc.evex().vpavgw(xmm_a, xmm_b, m);
  cc.evex().vpavgw(ymm_a, ymm_b, m);
  cc.evex().vpavgw(zmm_a, zmm_b, m);
  cc.evex().vpblendmb(xmm_a, xmm_b, m);
  cc.evex().vpblendmb(ymm_a, ymm_b, m);
  cc.evex().vpblendmb(zmm_a, zmm_b, m);
  cc.evex().vpblendmd(xmm_a, xmm_b, m);
  cc.evex().vpblendmd(ymm_a, ymm_b, m);
  cc.evex().vpblendmd(zmm_a, zmm_b, m);
  cc.evex().vpblendmq(xmm_a, xmm_b, m);
  cc.evex().vpblendmq(ymm_a, ymm_b, m);
  cc.evex().vpblendmq(zmm_a, zmm_b, m);
  cc.evex().vpblendmw(xmm_a, xmm_b, m);
  cc.evex().vpblendmw(ymm_a, ymm_b, m);
  cc.evex().vpblendmw(zmm_a, zmm_b, m);
  cc.evex().vpbroadcastb(xmm_a, m);
  cc.evex().vpbroadcastb(ymm_a, m);
  cc.evex().vpbroadcastb(zmm_a, m);
  cc.evex().vpbroadcastd(xmm_a, m);
  cc.evex().vpbroadcastd(ymm_a, m);
  cc.evex().vpbroadcastd(zmm_a, m);
  cc.evex().vpbroadcastq(xmm_a, m);
  cc.evex().vpbroadcastq(ymm_a, m);
  cc.evex().vpbroadcastq(zmm_a, m);
  cc.evex().vpbroadcastw(xmm_a, m);
  cc.evex().vpbroadcastw(ymm_a, m);
  cc.evex().vpbroadcastw(zmm_a, m);
  cc.evex().vpcmpb(kA, xmm_b, m, 0);
  cc.evex().vpcmpb(kA, ymm_b, m, 0);
  cc.evex().vpcmpb(kA, zmm_b, m, 0);
  cc.evex().vpcmpd(kA, xmm_b, m, 0);
  cc.evex().vpcmpd(kA, ymm_b, m, 0);
  cc.evex().vpcmpd(kA, zmm_b, m, 0);
  cc.evex().vpcmpeqb(kA, xmm_b, m);
  cc.evex().vpcmpeqb(kA, ymm_b, m);
  cc.evex().vpcmpeqb(kA, zmm_b, m);
  cc.evex().vpcmpeqd(kA, xmm_b, m);
  cc.evex().vpcmpeqd(kA, ymm_b, m);
  cc.evex().vpcmpeqd(kA, zmm_b, m);
  cc.evex().vpcmpeqq(kA, xmm_b, m);
  cc.evex().vpcmpeqq(kA, ymm_b, m);
  cc.evex().vpcmpeqq(kA, zmm_b, m);
  cc.evex().vpcmpeqw(kA, xmm_b, m);
  cc.evex().vpcmpeqw(kA, ymm_b, m);
  cc.evex().vpcmpeqw(kA, zmm_b, m);
  cc.evex().vpcmpgtb(kA, xmm_b, m);
  cc.evex().vpcmpgtb(kA, ymm_b, m);
  cc.evex().vpcmpgtb(kA, zmm_b, m);
  cc.evex().vpcmpgtd(kA, xmm_b, m);
  cc.evex().vpcmpgtd(kA, ymm_b, m);
  cc.evex().vpcmpgtd(kA, zmm_b, m);
  cc.evex().vpcmpgtq(kA, xmm_b, m);
  cc.evex().vpcmpgtq(kA, ymm_b, m);
  cc.evex().vpcmpgtq(kA, zmm_b, m);
  cc.evex().vpcmpgtw(kA, xmm_b, m);
  cc.evex().vpcmpgtw(kA, ymm_b, m);
  cc.evex().vpcmpgtw(kA, zmm_b, m);
  cc.evex().vpcmpq(kA, xmm_b, m, 0);
  cc.evex().vpcmpq(kA, ymm_b, m, 0);
  cc.evex().vpcmpq(kA, zmm_b, m, 0);
  cc.evex().vpcmpub(kA, xmm_b, m, 0);
  cc.evex().vpcmpub(kA, ymm_b, m, 0);
  cc.evex().vpcmpub(kA, zmm_b, m, 0);
  cc.evex().vpcmpud(kA, xmm_b, m, 0);
  cc.evex().vpcmpud(kA, ymm_b, m, 0);
  cc.evex().vpcmpud(kA, zmm_b, m, 0);
  cc.evex().vpcmpuq(kA, xmm_b, m, 0);
  cc.evex().vpcmpuq(kA, ymm_b, m, 0);
  cc.evex().vpcmpuq(kA, zmm_b, m, 0);
  cc.evex().vpcmpuw(kA, xmm_b, m, 0);
  cc.evex().vpcmpuw(kA, ymm_b, m, 0);
  cc.evex().vpcmpuw(kA, zmm_b, m, 0);
  cc.evex().vpcmpw(kA, xmm_b, m, 0);
  cc.evex().vpcmpw(kA, ymm_b, m, 0);
  cc.evex().vpcmpw(kA, zmm_b, m, 0);
  cc.evex().vpcompressd(m, xmm_b);
  cc.evex().vpcompressd(m, ymm_b);
  cc.evex().vpcompressd(m, zmm_b);
  cc.evex().vpcompressq(m, xmm_b);
  cc.evex().vpcompressq(m, ymm_b);
  cc.evex().vpcompressq(m, zmm_b);
  cc.evex().vpconflictd(xmm_a, m);
  cc.evex().vpconflictd(ymm_a, m);
  cc.evex().vpconflictd(zmm_a, m);
  cc.evex().vpconflictq(xmm_a, m);
  cc.evex().vpconflictq(ymm_a, m);
  cc.evex().vpconflictq(zmm_a, m);
  cc.evex().vpermb(xmm_a, xmm_b, m);
  cc.evex().vpermb(ymm_a, ymm_b, m);
  cc.evex().vpermb(zmm_a, zmm_b, m);
  cc.evex().vpermd(ymm_a, ymm_b, m);
  cc.evex().vpermd(zmm_a, zmm_b, m);
  cc.evex().vpermi2b(xmm_a, xmm_b, m);
  cc.evex().vpermi2b(ymm_a, ymm_b, m);
  cc.evex().vpermi2b(zmm_a, zmm_b, m);
  cc.evex().vpermi2d(xmm_a, xmm_b, m);
  cc.evex().vpermi2d(ymm_a, ymm_b, m);
  cc.evex().vpermi2d(zmm_a, zmm_b, m);
  cc.evex().vpermi2pd(xmm_a, xmm_b, m);
  cc.evex().vpermi2pd(ymm_a, ymm_b, m);
  cc.evex().vpermi2pd(zmm_a, zmm_b, m);
  cc.evex().vpermi2ps(xmm_a, xmm_b, m);
  cc.evex().vpermi2ps(ymm_a, ymm_b, m);
  cc.evex().vpermi2ps(zmm_a, zmm_b, m);
  cc.evex().vpermi2q(xmm_a, xmm_b, m);
  cc.evex().vpermi2q(ymm_a, ymm_b, m);
  cc.evex().vpermi2q(zmm_a, zmm_b, m);
  cc.evex().vpermi2w(xmm_a, xmm_b, m);
  cc.evex().vpermi2w(ymm_a, ymm_b, m);
  cc.evex().vpermi2w(zmm_a, zmm_b, m);
  cc.evex().vpermilpd(xmm_a, xmm_b, m);
  cc.evex().vpermilpd(ymm_a, ymm_b, m);
  cc.evex().vpermilpd(zmm_a, zmm_b, m);
  cc.evex().vpermilpd(xmm_a, m, 0);
  cc.evex().vpermilpd(ymm_a, m, 0);
  cc.evex().vpermilpd(zmm_a, m, 0);
  cc.evex().vpermilps(xmm_a, xmm_b, m);
  cc.evex().vpermilps(ymm_a, ymm_b, m);
  cc.evex().vpermilps(zmm_a, zmm_b, m);
  cc.evex().vpermilps(xmm_a, m, 0);
  cc.evex().vpermilps(ymm_a, m, 0);
  cc.evex().vpermilps(zmm_a, m, 0);
  cc.evex().vpermq(ymm_a, ymm_b, m);
  cc.evex().vpermq(zmm_a, zmm_b, m);
  cc.evex().vpermq(ymm_a, m, 0);
  cc.evex().vpermq(zmm_a, m, 0);
  cc.evex().vpermt2b(xmm_a, xmm_b, m);
  cc.evex().vpermt2b(ymm_a, ymm_b, m);
  cc.evex().vpermt2b(zmm_a, zmm_b, m);
  cc.evex().vpermt2d(xmm_a, xmm_b, m);
  cc.evex().vpermt2d(ymm_a, ymm_b, m);
  cc.evex().vpermt2d(zmm_a, zmm_b, m);
  cc.evex().vpermt2pd(xmm_a, xmm_b, m);
  cc.evex().vpermt2pd(ymm_a, ymm_b, m);
  cc.evex().vpermt2pd(zmm_a, zmm_b, m);
  cc.evex().vpermt2ps(xmm_a, xmm_b, m);
  cc.evex().vpermt2ps(ymm_a, ymm_b, m);
  cc.evex().vpermt2ps(zmm_a, zmm_b, m);
  cc.evex().vpermt2q(xmm_a, xmm_b, m);
  cc.evex().vpermt2q(ymm_a, ymm_b, m);
  cc.evex().vpermt2q(zmm_a, zmm_b, m);
  cc.evex().vpermt2w(xmm_a, xmm_b, m);
  cc.evex().vpermt2w(ymm_a, ymm_b, m);
  cc.evex().vpermt2w(zmm_a, zmm_b, m);
  cc.evex().vpermw(xmm_a, xmm_b, m);
  cc.evex().vpermw(ymm_a, ymm_b, m);
  cc.evex().vpermw(zmm_a, zmm_b, m);
  cc.evex().vpexpandd(xmm_a, m);
  cc.evex().vpexpandd(ymm_a, m);
  cc.evex().vpexpandd(zmm_a, m);
  cc.evex().vpexpandq(xmm_a, m);
  cc.evex().vpexpandq(ymm_a, m);
  cc.evex().vpexpandq(zmm_a, m);
  cc.evex().vpextrb(m, xmm_b, 0);
  cc.evex().vpextrd(m, xmm_b, 0);
  if (cc.is_64bit()) cc.evex().vpextrq(m, xmm_b, 0);
  cc.evex().vpextrw(m, xmm_b, 0);
  cc.evex().k(kA).vpgatherdd(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherdd(ymm_a, vy_ptr);
  cc.evex().k(kA).vpgatherdd(zmm_a, vz_ptr);
  cc.evex().k(kA).vpgatherdq(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherdq(ymm_a, vx_ptr);
  cc.evex().k(kA).vpgatherdq(zmm_a, vy_ptr);
  cc.evex().k(kA).vpgatherqd(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherqd(xmm_a, vy_ptr);
  cc.evex().k(kA).vpgatherqd(ymm_a, vz_ptr);
  cc.evex().k(kA).vpgatherqq(xmm_a, vx_ptr);
  cc.evex().k(kA).vpgatherqq(ymm_a, vy_ptr);
  cc.evex().k(kA).vpgatherqq(zmm_a, vz_ptr);
  cc.evex().vpinsrb(xmm_a, xmm_b, m, 0);
  cc.evex().vpinsrd(xmm_a, xmm_b, m, 0);
  if (cc.is_64bit()) cc.evex().vpinsrq(xmm_a, xmm_b, m, 0);
  cc.evex().vpinsrw(xmm_a, xmm_b, m, 0);
  cc.evex().vplzcntd(xmm_a, m);
  cc.evex().vplzcntd(ymm_a, m);
  cc.evex().vplzcntd(zmm_a, m);
  cc.evex().vplzcntq(xmm_a, m);
  cc.evex().vplzcntq(ymm_a, m);
  cc.evex().vplzcntq(zmm_a, m);
  cc.evex().vpmadd52huq(xmm_a, xmm_b, m);
  cc.evex().vpmadd52huq(ymm_a, ymm_b, m);
  cc.evex().vpmadd52huq(zmm_a, zmm_b, m);
  cc.evex().vpmadd52luq(xmm_a, xmm_b, m);
  cc.evex().vpmadd52luq(ymm_a, ymm_b, m);
  cc.evex().vpmadd52luq(zmm_a, zmm_b, m);
  cc.evex().vpmaddubsw(xmm_a, xmm_b, m);
  cc.evex().vpmaddubsw(ymm_a, ymm_b, m);
  cc.evex().vpmaddubsw(zmm_a, zmm_b, m);
  cc.evex().vpmaddwd(xmm_a, xmm_b, m);
  cc.evex().vpmaddwd(ymm_a, ymm_b, m);
  cc.evex().vpmaddwd(zmm_a, zmm_b, m);
  cc.evex().vpmaxsb(xmm_a, xmm_b, m);
  cc.evex().vpmaxsb(ymm_a, ymm_b, m);
  cc.evex().vpmaxsb(zmm_a, zmm_b, m);
  cc.evex().vpmaxsd(xmm_a, xmm_b, m);
  cc.evex().vpmaxsd(ymm_a, ymm_b, m);
  cc.evex().vpmaxsd(zmm_a, zmm_b, m);
  cc.evex().vpmaxsq(xmm_a, xmm_b, m);
  cc.evex().vpmaxsq(ymm_a, ymm_b, m);
  cc.evex().vpmaxsq(zmm_a, zmm_b, m);
  cc.evex().vpmaxsw(xmm_a, xmm_b, m);
  cc.evex().vpmaxsw(ymm_a, ymm_b, m);
  cc.evex().vpmaxsw(zmm_a, zmm_b, m);
  cc.evex().vpmaxub(xmm_a, xmm_b, m);
  cc.evex().vpmaxub(ymm_a, ymm_b, m);
  cc.evex().vpmaxub(zmm_a, zmm_b, m);
  cc.evex().vpmaxud(xmm_a, xmm_b, m);
  cc.evex().vpmaxud(ymm_a, ymm_b, m);
  cc.evex().vpmaxud(zmm_a, zmm_b, m);
  cc.evex().vpmaxuq(xmm_a, xmm_b, m);
  cc.evex().vpmaxuq(ymm_a, ymm_b, m);
  cc.evex().vpmaxuq(zmm_a, zmm_b, m);
  cc.evex().vpmaxuw(xmm_a, xmm_b, m);
  cc.evex().vpmaxuw(ymm_a, ymm_b, m);
  cc.evex().vpmaxuw(zmm_a, zmm_b, m);
  cc.evex().vpminsb(xmm_a, xmm_b, m);
  cc.evex().vpminsb(ymm_a, ymm_b, m);
  cc.evex().vpminsb(zmm_a, zmm_b, m);
  cc.evex().vpminsd(xmm_a, xmm_b, m);
  cc.evex().vpminsd(ymm_a, ymm_b, m);
  cc.evex().vpminsd(zmm_a, zmm_b, m);
  cc.evex().vpminsq(xmm_a, xmm_b, m);
  cc.evex().vpminsq(ymm_a, ymm_b, m);
  cc.evex().vpminsq(zmm_a, zmm_b, m);
  cc.evex().vpminsw(xmm_a, xmm_b, m);
  cc.evex().vpminsw(ymm_a, ymm_b, m);
  cc.evex().vpminsw(zmm_a, zmm_b, m);
  cc.evex().vpminub(xmm_a, xmm_b, m);
  cc.evex().vpminub(ymm_a, ymm_b, m);
  cc.evex().vpminub(zmm_a, zmm_b, m);
  cc.evex().vpminud(xmm_a, xmm_b, m);
  cc.evex().vpminud(ymm_a, ymm_b, m);
  cc.evex().vpminud(zmm_a, zmm_b, m);
  cc.evex().vpminuq(xmm_a, xmm_b, m);
  cc.evex().vpminuq(ymm_a, ymm_b, m);
  cc.evex().vpminuq(zmm_a, zmm_b, m);
  cc.evex().vpminuw(xmm_a, xmm_b, m);
  cc.evex().vpminuw(ymm_a, ymm_b, m);
  cc.evex().vpminuw(zmm_a, zmm_b, m);
  cc.evex().vpmovdb(m, xmm_b);
  cc.evex().vpmovdb(m, ymm_b);
  cc.evex().vpmovdb(m, zmm_b);
  cc.evex().vpmovdw(m, xmm_b);
  cc.evex().vpmovdw(m, ymm_b);
  cc.evex().vpmovdw(m, zmm_b);
  cc.evex().vpmovqb(m, xmm_b);
  cc.evex().vpmovqb(m, ymm_b);
  cc.evex().vpmovqb(m, zmm_b);
  cc.evex().vpmovqd(m, xmm_b);
  cc.evex().vpmovqd(m, ymm_b);
  cc.evex().vpmovqd(m, zmm_b);
  cc.evex().vpmovqw(m, xmm_b);
  cc.evex().vpmovqw(m, ymm_b);
  cc.evex().vpmovqw(m, zmm_b);
  cc.evex().vpmovsdb(m, xmm_b);
  cc.evex().vpmovsdb(m, ymm_b);
  cc.evex().vpmovsdb(m, zmm_b);
  cc.evex().vpmovsdw(m, xmm_b);
  cc.evex().vpmovsdw(m, ymm_b);
  cc.evex().vpmovsdw(m, zmm_b);
  cc.evex().vpmovsqb(m, xmm_b);
  cc.evex().vpmovsqb(m, ymm_b);
  cc.evex().vpmovsqb(m, zmm_b);
  cc.evex().vpmovsqd(m, xmm_b);
  cc.evex().vpmovsqd(m, ymm_b);
  cc.evex().vpmovsqd(m, zmm_b);
  cc.evex().vpmovsqw(m, xmm_b);
  cc.evex().vpmovsqw(m, ymm_b);
  cc.evex().vpmovsqw(m, zmm_b);
  cc.evex().vpmovswb(m, xmm_b);
  cc.evex().vpmovswb(m, ymm_b);
  cc.evex().vpmovswb(m, zmm_b);
  cc.evex().vpmovsxbd(xmm_a, m);
  cc.evex().vpmovsxbd(ymm_a, m);
  cc.evex().vpmovsxbd(zmm_a, m);
  cc.evex().vpmovsxbq(xmm_a, m);
  cc.evex().vpmovsxbq(ymm_a, m);
  cc.evex().vpmovsxbq(zmm_a, m);
  cc.evex().vpmovsxbw(xmm_a, m);
  cc.evex().vpmovsxbw(ymm_a, m);
  cc.evex().vpmovsxbw(zmm_a, m);
  cc.evex().vpmovsxdq(xmm_a, m);
  cc.evex().vpmovsxdq(ymm_a, m);
  cc.evex().vpmovsxdq(zmm_a, m);
  cc.evex().vpmovsxwd(xmm_a, m);
  cc.evex().vpmovsxwd(ymm_a, m);
  cc.evex().vpmovsxwd(zmm_a, m);
  cc.evex().vpmovsxwq(xmm_a, m);
  cc.evex().vpmovsxwq(ymm_a, m);
  cc.evex().vpmovsxwq(zmm_a, m);
  cc.evex().vpmovusdb(m, xmm_b);
  cc.evex().vpmovusdb(m, ymm_b);
  cc.evex().vpmovusdb(m, zmm_b);
  cc.evex().vpmovusdw(m, xmm_b);
  cc.evex().vpmovusdw(m, ymm_b);
  cc.evex().vpmovusdw(m, zmm_b);
  cc.evex().vpmovusqb(m, xmm_b);
  cc.evex().vpmovusqb(m, ymm_b);
  cc.evex().vpmovusqb(m, zmm_b);
  cc.evex().vpmovusqd(m, xmm_b);
  cc.evex().vpmovusqd(m, ymm_b);
  cc.evex().vpmovusqd(m, zmm_b);
  cc.evex().vpmovusqw(m, xmm_b);
  cc.evex().vpmovusqw(m, ymm_b);
  cc.evex().vpmovusqw(m, zmm_b);
  cc.evex().vpmovuswb(m, xmm_b);
  cc.evex().vpmovuswb(m, ymm_b);
  cc.evex().vpmovuswb(m, zmm_b);
  cc.evex().vpmovwb(m, xmm_b);
  cc.evex().vpmovwb(m, ymm_b);
  cc.evex().vpmovwb(m, zmm_b);
  cc.evex().vpmovzxbd(xmm_a, m);
  cc.evex().vpmovzxbd(ymm_a, m);
  cc.evex().vpmovzxbd(zmm_a, m);
  cc.evex().vpmovzxbq(xmm_a, m);
  cc.evex().vpmovzxbq(ymm_a, m);
  cc.evex().vpmovzxbq(zmm_a, m);
  cc.evex().vpmovzxbw(xmm_a, m);
  cc.evex().vpmovzxbw(ymm_a, m);
  cc.evex().vpmovzxbw(zmm_a, m);
  cc.evex().vpmovzxdq(xmm_a, m);
  cc.evex().vpmovzxdq(ymm_a, m);
  cc.evex().vpmovzxdq(zmm_a, m);
  cc.evex().vpmovzxwd(xmm_a, m);
  cc.evex().vpmovzxwd(ymm_a, m);
  cc.evex().vpmovzxwd(zmm_a, m);
  cc.evex().vpmovzxwq(xmm_a, m);
  cc.evex().vpmovzxwq(ymm_a, m);
  cc.evex().vpmovzxwq(zmm_a, m);
  cc.evex().vpmuldq(xmm_a, xmm_b, m);
  cc.evex().vpmuldq(ymm_a, ymm_b, m);
  cc.evex().vpmuldq(zmm_a, zmm_b, m);
  cc.evex().vpmulhrsw(xmm_a, xmm_b, m);
  cc.evex().vpmulhrsw(ymm_a, ymm_b, m);
  cc.evex().vpmulhrsw(zmm_a, zmm_b, m);
  cc.evex().vpmulhuw(xmm_a, xmm_b, m);
  cc.evex().vpmulhuw(ymm_a, ymm_b, m);
  cc.evex().vpmulhuw(zmm_a, zmm_b, m);
  cc.evex().vpmulhw(xmm_a, xmm_b, m);
  cc.evex().vpmulhw(ymm_a, ymm_b, m);
  cc.evex().vpmulhw(zmm_a, zmm_b, m);
  cc.evex().vpmulld(xmm_a, xmm_b, m);
  cc.evex().vpmulld(ymm_a, ymm_b, m);
  cc.evex().vpmulld(zmm_a, zmm_b, m);
  cc.evex().vpmullq(xmm_a, xmm_b, m);
  cc.evex().vpmullq(ymm_a, ymm_b, m);
  cc.evex().vpmullq(zmm_a, zmm_b, m);
  cc.evex().vpmullw(xmm_a, xmm_b, m);
  cc.evex().vpmullw(ymm_a, ymm_b, m);
  cc.evex().vpmullw(zmm_a, zmm_b, m);
  cc.evex().vpmultishiftqb(xmm_a, xmm_b, m);
  cc.evex().vpmultishiftqb(ymm_a, ymm_b, m);
  cc.evex().vpmultishiftqb(zmm_a, zmm_b, m);
  cc.evex().vpmuludq(xmm_a, xmm_b, m);
  cc.evex().vpmuludq(ymm_a, ymm_b, m);
  cc.evex().vpmuludq(zmm_a, zmm_b, m);
  cc.evex().vpopcntd(zmm_a, m);
  cc.evex().vpopcntq(zmm_a, m);
  cc.evex().vpord(xmm_a, xmm_b, m);
  cc.evex().vpord(ymm_a, ymm_b, m);
  cc.evex().vpord(zmm_a, zmm_b, m);
  cc.evex().vporq(xmm_a, xmm_b, m);
  cc.evex().vporq(ymm_a, ymm_b, m);
  cc.evex().vporq(zmm_a, zmm_b, m);
  cc.evex().vprold(xmm_a, m, 0);
  cc.evex().vprold(ymm_a, m, 0);
  cc.evex().vprold(zmm_a, m, 0);
  cc.evex().vprolq(xmm_a, m, 0);
  cc.evex().vprolq(ymm_a, m, 0);
  cc.evex().vprolq(zmm_a, m, 0);
  cc.evex().vprolvd(xmm_a, xmm_b, m);
  cc.evex().vprolvd(ymm_a, ymm_b, m);
  cc.evex().vprolvd(zmm_a, zmm_b, m);
  cc.evex().vprolvq(xmm_a, xmm_b, m);
  cc.evex().vprolvq(ymm_a, ymm_b, m);
  cc.evex().vprolvq(zmm_a, zmm_b, m);
  cc.evex().vprord(xmm_a, m, 0);
  cc.evex().vprord(ymm_a, m, 0);
  cc.evex().vprord(zmm_a, m, 0);
  cc.evex().vprorq(xmm_a, m, 0);
  cc.evex().vprorq(ymm_a, m, 0);
  cc.evex().vprorq(zmm_a, m, 0);
  cc.evex().vprorvd(xmm_a, xmm_b, m);
  cc.evex().vprorvd(ymm_a, ymm_b, m);
  cc.evex().vprorvd(zmm_a, zmm_b, m);
  cc.evex().vprorvq(xmm_a, xmm_b, m);
  cc.evex().vprorvq(ymm_a, ymm_b, m);
  cc.evex().vprorvq(zmm_a, zmm_b, m);
  cc.evex().vpsadbw(xmm_a, xmm_b, m);
  cc.evex().vpsadbw(ymm_a, ymm_b, m);
  cc.evex().vpsadbw(zmm_a, zmm_b, m);
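  // Scatters, like gathers, use VSIB addressing and require a write-mask register.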
  cc.evex().k(kA).vpscatterdd(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterdd(vy_ptr, ymm_b);
  cc.evex().k(kA).vpscatterdd(vz_ptr, zmm_b);
  cc.evex().k(kA).vpscatterdq(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterdq(vx_ptr, ymm_b);
  cc.evex().k(kA).vpscatterdq(vy_ptr, zmm_b);
  cc.evex().k(kA).vpscatterqd(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterqd(vy_ptr, xmm_b);
  cc.evex().k(kA).vpscatterqd(vz_ptr, ymm_b);
  cc.evex().k(kA).vpscatterqq(vx_ptr, xmm_b);
  cc.evex().k(kA).vpscatterqq(vy_ptr, ymm_b);
  cc.evex().k(kA).vpscatterqq(vz_ptr, zmm_b);
  cc.evex().vpshufb(xmm_a, xmm_b, m);
  cc.evex().vpshufb(ymm_a, ymm_b, m);
  cc.evex().vpshufb(zmm_a, zmm_b, m);
  cc.evex().vpshufd(xmm_a, m, 0);
  cc.evex().vpshufd(ymm_a, m, 0);
  cc.evex().vpshufd(zmm_a, m, 0);
  cc.evex().vpshufhw(xmm_a, m, 0);
  cc.evex().vpshufhw(ymm_a, m, 0);
  cc.evex().vpshufhw(zmm_a, m, 0);
  cc.evex().vpshuflw(xmm_a, m, 0);
  cc.evex().vpshuflw(ymm_a, m, 0);
  cc.evex().vpshuflw(zmm_a, m, 0);
  cc.evex().vpslld(xmm_a, xmm_b, m);
  cc.evex().vpslld(xmm_a, m, 0);
  cc.evex().vpslld(ymm_a, ymm_b, m);
  cc.evex().vpslld(ymm_a, m, 0);
  cc.evex().vpslld(zmm_a, zmm_b, m);
  cc.evex().vpslld(zmm_a, m, 0);
  cc.evex().vpslldq(xmm_a, m, 0);
  cc.evex().vpslldq(ymm_a, m, 0);
  cc.evex().vpslldq(zmm_a, m, 0);
  cc.evex().vpsllq(xmm_a, xmm_b, m);
  cc.evex().vpsllq(xmm_a, m, 0);
  cc.evex().vpsllq(ymm_a, ymm_b, m);
  cc.evex().vpsllq(ymm_a, m, 0);
  cc.evex().vpsllq(zmm_a, zmm_b, m);
  cc.evex().vpsllq(zmm_a, m, 0);
  cc.evex().vpsllvd(xmm_a, xmm_b, m);
  cc.evex().vpsllvd(ymm_a, ymm_b, m);
  cc.evex().vpsllvd(zmm_a, zmm_b, m);
  cc.evex().vpsllvq(xmm_a, xmm_b, m);
  cc.evex().vpsllvq(ymm_a, ymm_b, m);
  cc.evex().vpsllvq(zmm_a, zmm_b, m);
  cc.evex().vpsllvw(xmm_a, xmm_b, m);
  cc.evex().vpsllvw(ymm_a, ymm_b, m);
  cc.evex().vpsllvw(zmm_a, zmm_b, m);
  cc.evex().vpsllw(xmm_a, xmm_b, m);
  cc.evex().vpsllw(xmm_a, m, 0);
  cc.evex().vpsllw(ymm_a, ymm_b, m);
  cc.evex().vpsllw(ymm_a, m, 0);
  cc.evex().vpsllw(zmm_a, zmm_b, m);
  cc.evex().vpsllw(zmm_a, m, 0);
  cc.evex().vpsrad(xmm_a, xmm_b, m);
  cc.evex().vpsrad(xmm_a, m, 0);
  cc.evex().vpsrad(ymm_a, ymm_b, m);
  cc.evex().vpsrad(ymm_a, m, 0);
  cc.evex().vpsrad(zmm_a, zmm_b, m);
  cc.evex().vpsrad(zmm_a, m, 0);
  cc.evex().vpsraq(xmm_a, xmm_b, m);
  cc.evex().vpsraq(xmm_a, m, 0);
  cc.evex().vpsraq(ymm_a, ymm_b, m);
  cc.evex().vpsraq(ymm_a, m, 0);
  cc.evex().vpsraq(zmm_a, zmm_b, m);
  cc.evex().vpsraq(zmm_a, m, 0);
  cc.evex().vpsravd(xmm_a, xmm_b, m);
  cc.evex().vpsravd(ymm_a, ymm_b, m);
  cc.evex().vpsravd(zmm_a, zmm_b, m);
  cc.evex().vpsravq(xmm_a, xmm_b, m);
  cc.evex().vpsravq(ymm_a, ymm_b, m);
  cc.evex().vpsravq(zmm_a, zmm_b, m);
  cc.evex().vpsravw(xmm_a, xmm_b, m);
  cc.evex().vpsravw(ymm_a, ymm_b, m);
  cc.evex().vpsravw(zmm_a, zmm_b, m);
  cc.evex().vpsraw(xmm_a, xmm_b, m);
  cc.evex().vpsraw(xmm_a, m, 0);
  cc.evex().vpsraw(ymm_a, ymm_b, m);
  cc.evex().vpsraw(ymm_a, m, 0);
  cc.evex().vpsraw(zmm_a, zmm_b, m);
  cc.evex().vpsraw(zmm_a, m, 0);
  cc.evex().vpsrld(xmm_a, xmm_b, m);
  cc.evex().vpsrld(xmm_a, m, 0);
  cc.evex().vpsrld(ymm_a, ymm_b, m);
  cc.evex().vpsrld(ymm_a, m, 0);
  cc.evex().vpsrld(zmm_a, zmm_b, m);
  cc.evex().vpsrld(zmm_a, m, 0);
  cc.evex().vpsrldq(xmm_a, m, 0);
  cc.evex().vpsrldq(ymm_a, m, 0);
  cc.evex().vpsrldq(zmm_a, m, 0);
  cc.evex().vpsrlq(xmm_a, xmm_b, m);
  cc.evex().vpsrlq(xmm_a, m, 0);
  cc.evex().vpsrlq(ymm_a, ymm_b, m);
  cc.evex().vpsrlq(ymm_a, m, 0);
  cc.evex().vpsrlq(zmm_a, zmm_b, m);
  cc.evex().vpsrlq(zmm_a, m, 0);
  cc.evex().vpsrlvd(xmm_a, xmm_b, m);
  cc.evex().vpsrlvd(ymm_a, ymm_b, m);
  cc.evex().vpsrlvd(zmm_a, zmm_b, m);
  cc.evex().vpsrlvq(xmm_a, xmm_b, m);
  cc.evex().vpsrlvq(ymm_a, ymm_b, m);
  cc.evex().vpsrlvq(zmm_a, zmm_b, m);
  cc.evex().vpsrlvw(xmm_a, xmm_b, m);
  cc.evex().vpsrlvw(ymm_a, ymm_b, m);
  cc.evex().vpsrlvw(zmm_a, zmm_b, m);
  cc.evex().vpsrlw(xmm_a, xmm_b, m);
  cc.evex().vpsrlw(xmm_a, m, 0);
  cc.evex().vpsrlw(ymm_a, ymm_b, m);
  cc.evex().vpsrlw(ymm_a, m, 0);
  cc.evex().vpsrlw(zmm_a, zmm_b, m);
  cc.evex().vpsrlw(zmm_a, m, 0);
  cc.evex().vpsubb(xmm_a, xmm_b, m);
  cc.evex().vpsubb(ymm_a, ymm_b, m);
  cc.evex().vpsubb(zmm_a, zmm_b, m);
  cc.evex().vpsubd(xmm_a, xmm_b, m);
  cc.evex().vpsubd(ymm_a, ymm_b, m);
  cc.evex().vpsubd(zmm_a, zmm_b, m);
  cc.evex().vpsubq(xmm_a, xmm_b, m);
  cc.evex().vpsubq(ymm_a, ymm_b, m);
  cc.evex().vpsubq(zmm_a, zmm_b, m);
  cc.evex().vpsubsb(xmm_a, xmm_b, m);
  cc.evex().vpsubsb(ymm_a, ymm_b, m);
  cc.evex().vpsubsb(zmm_a, zmm_b, m);
  cc.evex().vpsubsw(xmm_a, xmm_b, m);
  cc.evex().vpsubsw(ymm_a, ymm_b, m);
  cc.evex().vpsubsw(zmm_a, zmm_b, m);
  cc.evex().vpsubusb(xmm_a, xmm_b, m);
  cc.evex().vpsubusb(ymm_a, ymm_b, m);
  cc.evex().vpsubusb(zmm_a, zmm_b, m);
  cc.evex().vpsubusw(xmm_a, xmm_b, m);
  cc.evex().vpsubusw(ymm_a, ymm_b, m);
  cc.evex().vpsubusw(zmm_a, zmm_b, m);
  cc.evex().vpsubw(xmm_a, xmm_b, m);
  cc.evex().vpsubw(ymm_a, ymm_b, m);
  cc.evex().vpsubw(zmm_a, zmm_b, m);
  cc.evex().vpternlogd(xmm_a, xmm_b, m, 0);
  cc.evex().vpternlogd(ymm_a, ymm_b, m, 0);
  cc.evex().vpternlogd(zmm_a, zmm_b, m, 0);
  cc.evex().vpternlogq(xmm_a, xmm_b, m, 0);
  cc.evex().vpternlogq(ymm_a, ymm_b, m, 0);
  cc.evex().vpternlogq(zmm_a, zmm_b, m, 0);
  cc.evex().vptestmb(kA, xmm_b, m);
  cc.evex().vptestmb(kA, ymm_b, m);
  cc.evex().vptestmb(kA, zmm_b, m);
  cc.evex().vptestmd(kA, xmm_b, m);
  cc.evex().vptestmd(kA, ymm_b, m);
  cc.evex().vptestmd(kA, zmm_b, m);
  cc.evex().vptestmq(kA, xmm_b, m);
  cc.evex().vptestmq(kA, ymm_b, m);
  cc.evex().vptestmq(kA, zmm_b, m);
  cc.evex().vptestmw(kA, xmm_b, m);
  cc.evex().vptestmw(kA, ymm_b, m);
  cc.evex().vptestmw(kA, zmm_b, m);
  cc.evex().vptestnmb(kA, xmm_b, m);
  cc.evex().vptestnmb(kA, ymm_b, m);
  cc.evex().vptestnmb(kA, zmm_b, m);
  cc.evex().vptestnmd(kA, xmm_b, m);
  cc.evex().vptestnmd(kA, ymm_b, m);
  cc.evex().vptestnmd(kA, zmm_b, m);
  cc.evex().vptestnmq(kA, xmm_b, m);
  cc.evex().vptestnmq(kA, ymm_b, m);
  cc.evex().vptestnmq(kA, zmm_b, m);
  cc.evex().vptestnmw(kA, xmm_b, m);
  cc.evex().vptestnmw(kA, ymm_b, m);
  cc.evex().vptestnmw(kA, zmm_b, m);
  cc.evex().vpunpckhbw(xmm_a, xmm_b, m);
  cc.evex().vpunpckhbw(ymm_a, ymm_b, m);
  cc.evex().vpunpckhbw(zmm_a, zmm_b, m);
  cc.evex().vpunpckhdq(xmm_a, xmm_b, m);
  cc.evex().vpunpckhdq(ymm_a, ymm_b, m);
  cc.evex().vpunpckhdq(zmm_a, zmm_b, m);
  cc.evex().vpunpckhqdq(xmm_a, xmm_b, m);
  cc.evex().vpunpckhqdq(ymm_a, ymm_b, m);
  cc.evex().vpunpckhqdq(zmm_a, zmm_b, m);
  cc.evex().vpunpckhwd(xmm_a, xmm_b, m);
  cc.evex().vpunpckhwd(ymm_a, ymm_b, m);
  cc.evex().vpunpckhwd(zmm_a, zmm_b, m);
  cc.evex().vpunpcklbw(xmm_a, xmm_b, m);
  cc.evex().vpunpcklbw(ymm_a, ymm_b, m);
  cc.evex().vpunpcklbw(zmm_a, zmm_b, m);
  cc.evex().vpunpckldq(xmm_a, xmm_b, m);
  cc.evex().vpunpckldq(ymm_a, ymm_b, m);
  cc.evex().vpunpckldq(zmm_a, zmm_b, m);
  cc.evex().vpunpcklqdq(xmm_a, xmm_b, m);
  cc.evex().vpunpcklqdq(ymm_a, ymm_b, m);
  cc.evex().vpunpcklqdq(zmm_a, zmm_b, m);
  cc.evex().vpunpcklwd(xmm_a, xmm_b, m);
  cc.evex().vpunpcklwd(ymm_a, ymm_b, m);
  cc.evex().vpunpcklwd(zmm_a, zmm_b, m);
  cc.evex().vpxord(xmm_a, xmm_b, m);
  cc.evex().vpxord(ymm_a, ymm_b, m);
  cc.evex().vpxord(zmm_a, zmm_b, m);
  cc.evex().vpxorq(xmm_a, xmm_b, m);
  cc.evex().vpxorq(ymm_a, ymm_b, m);
  cc.evex().vpxorq(zmm_a, zmm_b, m);
  cc.evex().vrangepd(xmm_a, xmm_b, m, 0);
  cc.evex().vrangepd(ymm_a, ymm_b, m, 0);
  cc.evex().vrangepd(zmm_a, zmm_b, m, 0);
  cc.evex().vrangeps(xmm_a, xmm_b, m, 0);
  cc.evex().vrangeps(ymm_a, ymm_b, m, 0);
  cc.evex().vrangeps(zmm_a, zmm_b, m, 0);
  cc.evex().vrangesd(xmm_a, xmm_b, m, 0);
  cc.evex().vrangess(xmm_a, xmm_b, m, 0);
|
|
cc.evex().vrcp14pd(xmm_a, m);
|
|
cc.evex().vrcp14pd(ymm_a, m);
|
|
cc.evex().vrcp14pd(zmm_a, m);
|
|
cc.evex().vrcp14ps(xmm_a, m);
|
|
cc.evex().vrcp14ps(ymm_a, m);
|
|
cc.evex().vrcp14ps(zmm_a, m);
|
|
cc.evex().vrcp14sd(xmm_a, xmm_b, m);
|
|
cc.evex().vrcp14ss(xmm_a, xmm_b, m);
|
|
cc.evex().vreducepd(xmm_a, m, 0);
|
|
cc.evex().vreducepd(ymm_a, m, 0);
|
|
cc.evex().vreducepd(zmm_a, m, 0);
|
|
cc.evex().vreduceps(xmm_a, m, 0);
|
|
cc.evex().vreduceps(ymm_a, m, 0);
|
|
cc.evex().vreduceps(zmm_a, m, 0);
|
|
cc.evex().vreducesd(xmm_a, xmm_b, m, 0);
|
|
cc.evex().vreducess(xmm_a, xmm_b, m, 0);
|
|
cc.evex().vrndscalepd(xmm_a, m, 0);
|
|
cc.evex().vrndscalepd(ymm_a, m, 0);
|
|
cc.evex().vrndscalepd(zmm_a, m, 0);
|
|
cc.evex().vrndscaleps(xmm_a, m, 0);
|
|
cc.evex().vrndscaleps(ymm_a, m, 0);
|
|
cc.evex().vrndscaleps(zmm_a, m, 0);
|
|
cc.evex().vrndscalesd(xmm_a, xmm_b, m, 0);
|
|
cc.evex().vrndscaless(xmm_a, xmm_b, m, 0);
|
|
cc.evex().vrsqrt14pd(xmm_a, m);
|
|
cc.evex().vrsqrt14pd(ymm_a, m);
|
|
cc.evex().vrsqrt14pd(zmm_a, m);
|
|
cc.evex().vrsqrt14ps(xmm_a, m);
|
|
cc.evex().vrsqrt14ps(ymm_a, m);
|
|
cc.evex().vrsqrt14ps(zmm_a, m);
|
|
cc.evex().vrsqrt14sd(xmm_a, xmm_b, m);
|
|
cc.evex().vrsqrt14ss(xmm_a, xmm_b, m);
|
|
cc.evex().vscalefpd(xmm_a, xmm_b, m);
|
|
cc.evex().vscalefpd(ymm_a, ymm_b, m);
|
|
cc.evex().vscalefpd(zmm_a, zmm_b, m);
|
|
cc.evex().vscalefps(xmm_a, xmm_b, m);
|
|
cc.evex().vscalefps(ymm_a, ymm_b, m);
|
|
cc.evex().vscalefps(zmm_a, zmm_b, m);
|
|
cc.evex().vscalefsd(xmm_a, xmm_b, m);
|
|
cc.evex().vscalefss(xmm_a, xmm_b, m);
|
|
cc.evex().k(kA).vscatterdpd(vx_ptr, xmm_b);
|
|
cc.evex().k(kA).vscatterdpd(vx_ptr, ymm_b);
|
|
cc.evex().k(kA).vscatterdpd(vy_ptr, zmm_b);
|
|
cc.evex().k(kA).vscatterdps(vx_ptr, xmm_b);
|
|
cc.evex().k(kA).vscatterdps(vy_ptr, ymm_b);
|
|
cc.evex().k(kA).vscatterdps(vz_ptr, zmm_b);
|
|
cc.evex().k(kA).vscatterqpd(vx_ptr, xmm_b);
|
|
cc.evex().k(kA).vscatterqpd(vy_ptr, ymm_b);
|
|
cc.evex().k(kA).vscatterqpd(vz_ptr, zmm_b);
|
|
cc.evex().k(kA).vscatterqps(vx_ptr, xmm_b);
|
|
cc.evex().k(kA).vscatterqps(vy_ptr, xmm_b);
|
|
cc.evex().k(kA).vscatterqps(vz_ptr, ymm_b);
|
|
cc.evex().vshuff32x4(ymm_a, ymm_b, m, 0);
|
|
cc.evex().vshuff32x4(zmm_a, zmm_b, m, 0);
|
|
cc.evex().vshuff64x2(ymm_a, ymm_b, m, 0);
|
|
cc.evex().vshuff64x2(zmm_a, zmm_b, m, 0);
|
|
cc.evex().vshufi32x4(ymm_a, ymm_b, m, 0);
|
|
cc.evex().vshufi32x4(zmm_a, zmm_b, m, 0);
|
|
cc.evex().vshufi64x2(ymm_a, ymm_b, m, 0);
|
|
cc.evex().vshufi64x2(zmm_a, zmm_b, m, 0);
|
|
cc.evex().vshufpd(xmm_a, xmm_b, m, 0);
|
|
cc.evex().vshufpd(ymm_a, ymm_b, m, 0);
|
|
cc.evex().vshufpd(zmm_a, zmm_b, m, 0);
|
|
cc.evex().vshufps(xmm_a, xmm_b, m, 0);
|
|
cc.evex().vshufps(ymm_a, ymm_b, m, 0);
|
|
cc.evex().vshufps(zmm_a, zmm_b, m, 0);
|
|
cc.evex().vsqrtpd(xmm_a, m);
|
|
cc.evex().vsqrtpd(ymm_a, m);
|
|
cc.evex().vsqrtpd(zmm_a, m);
|
|
cc.evex().vsqrtps(xmm_a, m);
|
|
cc.evex().vsqrtps(ymm_a, m);
|
|
cc.evex().vsqrtps(zmm_a, m);
|
|
cc.evex().vsqrtsd(xmm_a, xmm_b, m);
|
|
cc.evex().vsqrtss(xmm_a, xmm_b, m);
|
|
cc.evex().vsubpd(xmm_a, xmm_b, m);
|
|
cc.evex().vsubpd(ymm_a, ymm_b, m);
|
|
cc.evex().vsubpd(zmm_a, zmm_b, m);
|
|
cc.evex().vsubps(xmm_a, xmm_b, m);
|
|
cc.evex().vsubps(ymm_a, ymm_b, m);
|
|
cc.evex().vsubps(zmm_a, zmm_b, m);
|
|
cc.evex().vsubsd(xmm_a, xmm_b, m);
|
|
cc.evex().vsubss(xmm_a, xmm_b, m);
|
|
cc.evex().vucomisd(xmm_a, m);
|
|
cc.evex().vucomiss(xmm_a, m);
|
|
cc.evex().vunpckhpd(xmm_a, xmm_b, m);
|
|
cc.evex().vunpckhpd(ymm_a, ymm_b, m);
|
|
cc.evex().vunpckhpd(zmm_a, zmm_b, m);
|
|
cc.evex().vunpckhps(xmm_a, xmm_b, m);
|
|
cc.evex().vunpckhps(ymm_a, ymm_b, m);
|
|
cc.evex().vunpckhps(zmm_a, zmm_b, m);
|
|
cc.evex().vunpcklpd(xmm_a, xmm_b, m);
|
|
cc.evex().vunpcklpd(ymm_a, ymm_b, m);
|
|
cc.evex().vunpcklpd(zmm_a, zmm_b, m);
|
|
cc.evex().vunpcklps(xmm_a, xmm_b, m);
|
|
cc.evex().vunpcklps(ymm_a, ymm_b, m);
|
|
cc.evex().vunpcklps(zmm_a, zmm_b, m);
|
|
cc.evex().vxorpd(xmm_a, xmm_b, m);
|
|
cc.evex().vxorpd(ymm_a, ymm_b, m);
|
|
cc.evex().vxorpd(zmm_a, zmm_b, m);
|
|
cc.evex().vxorps(xmm_a, xmm_b, m);
|
|
cc.evex().vxorps(ymm_a, ymm_b, m);
|
|
cc.evex().vxorps(zmm_a, zmm_b, m);
|
|
}

// Generates a long sequence of AVX512 instructions.
template<typename Emitter>
static void generate_avx512_sequence_internal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  if (form == InstForm::kReg)
    generate_avx512_sequence_internal_reg_only(cc, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
  else
    generate_avx512_sequence_internal_reg_mem(cc, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
}
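
// Emits the AVX512 sequence through whichever concrete emitter backs `emitter`:
// the Compiler path below uses virtual registers, while the Builder and
// Assembler paths use fixed physical registers (eax, k1..k3, zmm0..zmm3) and
// can optionally wrap the sequence in a prolog/epilog built from a FuncFrame.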
static void generate_avx512_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;
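
  // Compiler: operates on virtual registers; add_func()/end_func() delimit the
  // function and register allocation assigns physical registers at finalize().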
#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.new_gpz("gp");
    Vec vec_a = cc.new_zmm("vec_a");
    Vec vec_b = cc.new_zmm("vec_b");
    Vec vec_c = cc.new_zmm("vec_c");
    Vec vec_d = cc.new_zmm("vec_d");

    KReg kA = cc.new_kq("kA");
    KReg kB = cc.new_kq("kB");
    KReg kC = cc.new_kq("kC");

    cc.add_func(FuncSignature::build<void>());
    generate_avx512_sequence_internal(cc, form, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
    cc.end_func();

    return;
  }
#endif
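
  // Builder: records the instruction stream as nodes; physical registers are
  // used directly and a prolog/epilog is emitted explicitly when requested.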
#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }

    return;
  }
#endif
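
  // Assembler: encodes directly into the CodeHolder, again with physical
  // registers and an optional explicit prolog/epilog.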
  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void, void*, const void*, size_t>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }

    return;
  }
}
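
// Runs `emitter_fn` through every available emitter in several configurations:
// Assembler (raw, validated, prolog/epilog), Builder (recording only, recording
// plus finalize(), prolog/epilog plus finalize()), and Compiler (recording
// only, recording plus finalize()). An instruction count is first gathered via
// a Builder pass and handed to each run so results can be reported relative to
// the number of instructions emitted.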
template<typename EmitterFn>
static void benchmark_x86_function(Arch arch, uint32_t num_iterations, const char* description, const EmitterFn& emitter_fn) noexcept {
  CodeHolder code;
  printf("%s:\n", description);

  uint32_t instruction_count = 0;

#ifndef ASMJIT_NO_BUILDER
  instruction_count = asmjit_perf_utils::calculate_instruction_count<x86::Builder>(code, arch, [&](x86::Builder& cc) {
    emitter_fn(cc, false);
  });
#endif
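
  // Assembler: raw encoding, encoding with assembler-side validation enabled,
  // and encoding with an explicit prolog/epilog around the generated body.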
  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[raw]", instruction_count, [&](x86::Assembler& cc) {
    emitter_fn(cc, false);
  });

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[validated]", instruction_count, [&](x86::Assembler& cc) {
    cc.add_diagnostic_options(DiagnosticOptions::kValidateAssembler);
    emitter_fn(cc, false);
  });

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[prolog/epilog]", instruction_count, [&](x86::Assembler& cc) {
    emitter_fn(cc, true);
  });
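
  // Builder: recording the instruction stream only ([no-asm]) vs. recording
  // plus serializing it to machine code via finalize().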
#ifndef ASMJIT_NO_BUILDER
  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[no-asm]", instruction_count, [&](x86::Builder& cc) {
    emitter_fn(cc, false);
  });

  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[finalized]", instruction_count, [&](x86::Builder& cc) {
    emitter_fn(cc, false);
    cc.finalize();
  });

  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[prolog/epilog]", instruction_count, [&](x86::Builder& cc) {
    emitter_fn(cc, true);
    cc.finalize();
  });
#endif
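
  // Compiler: recording only vs. recording plus register allocation and
  // serialization via finalize().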
#ifndef ASMJIT_NO_COMPILER
  asmjit_perf_utils::bench<x86::Compiler>(code, arch, num_iterations, "[no-asm]", instruction_count, [&](x86::Compiler& cc) {
    emitter_fn(cc, true);
  });

  asmjit_perf_utils::bench<x86::Compiler>(code, arch, num_iterations, "[finalized]", instruction_count, [&](x86::Compiler& cc) {
    emitter_fn(cc, true);
    cc.finalize();
  });
#endif

  printf("\n");
}
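
// Entry point used by the benchmark driver. A minimal usage sketch (the real
// driver lives elsewhere and typically takes these values from the command
// line; the iteration count below is illustrative only):
//
//   benchmark_x86_emitters(/*num_iterations=*/10000, /*test_x86=*/true, /*test_x64=*/true);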
void benchmark_x86_emitters(uint32_t num_iterations, bool test_x86, bool test_x64) {
  uint32_t i = 0;
  uint32_t n = 0;

  Arch archs[2] {};

  if (test_x86) archs[n++] = Arch::kX86;
  if (test_x64) archs[n++] = Arch::kX64;

  for (i = 0; i < n; i++) {
    static const char description[] = "Empty function (mov + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_empty_function(emitter, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "4-Ops sequence (4 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 4, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "16-Ops sequence (16 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 16, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "32-Ops sequence (32 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 32, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "64-Ops sequence (64 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 64, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence<Reg> (sequence of GP instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_gp_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence<Mem> (sequence of GP instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_gp_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence<Reg> (sequence of SSE+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_sse_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence<Mem> (sequence of SSE+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_sse_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence<Reg> (sequence of AVX+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence<Mem> (sequence of AVX+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence<Reg> (sequence of AVX512+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx512_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence<Mem> (sequence of AVX512+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx512_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseAlphaBlend (alpha-blend function with labels and jumps)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      asmtest::generate_sse_alpha_blend(emitter, emit_prolog_epilog);
    });
  }
}

#endif // !ASMJIT_NO_X86