// This file is part of AsmJit project <https://asmjit.com>
//
// See <asmjit/core.h> or LICENSE.md for license and copyright information
// SPDX-License-Identifier: Zlib

#include <asmjit/core.h>
#if !defined(ASMJIT_NO_X86)

#include <asmjit/x86.h>

// NOTE: The exact header list was elided in the source; the standard includes
// below are an assumption.
#include <limits>
#include <memory>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

using namespace asmjit;

enum class InstForm {
  kReg,
  kMem
};

// Generates a long sequence of GP instructions.
template<typename Emitter>
static void generate_gp_sequence_internal(
  Emitter& cc, InstForm form,
  const x86::Gp& a, const x86::Gp& b, const x86::Gp& c, const x86::Gp& d) {

  cc.mov(a, 0xAAAAAAAA);
  cc.mov(b, 0xBBBBBBBB);
  cc.mov(c, 0xCCCCCCCC);
  cc.mov(d, 0xFFFFFFFF);

  if (form == InstForm::kReg) {
    cc.adc(a, b); cc.adc(b, c); cc.adc(c, d);
    cc.add(a, b); cc.add(b, c); cc.add(c, d);
    cc.and_(a, b); cc.and_(b, c); cc.and_(c, d);
    cc.bsf(a, b); cc.bsf(b, c); cc.bsf(c, d);
    cc.bsr(a, b); cc.bsr(b, c); cc.bsr(c, d);
    cc.bswap(a); cc.bswap(b); cc.bswap(c);
    cc.bt(a, b); cc.bt(b, c); cc.bt(c, d);
    cc.btc(a, b); cc.btc(b, c); cc.btc(c, d);
    cc.btr(a, b); cc.btr(b, c); cc.btr(c, d);
    cc.bts(a, b); cc.bts(b, c); cc.bts(c, d);
    cc.cmp(a, b); cc.cmovc(a, b); cc.cmp(b, c); cc.cmovc(b, c); cc.cmp(c, d); cc.cmovc(c, d);
    cc.dec(a); cc.dec(b); cc.dec(c);
    cc.imul(a, b); cc.imul(b, c); cc.imul(c, d);
    cc.movsx(a, b.r8_lo()); cc.movsx(b, c.r8_lo()); cc.movsx(c, d.r8_lo());
    cc.movzx(a, b.r8_lo()); cc.movzx(b, c.r8_lo()); cc.movzx(c, d.r8_lo());
    cc.neg(a); cc.neg(b); cc.neg(c);
    cc.not_(a); cc.not_(b); cc.not_(c);
    cc.or_(a, b); cc.or_(b, c); cc.or_(c, d);
    cc.sbb(a, b); cc.sbb(b, c); cc.sbb(c, d);
    cc.sub(a, b); cc.sub(b, c); cc.sub(c, d);
    cc.test(a, b); cc.test(b, c); cc.test(c, d);
    cc.xchg(a, b); cc.xchg(b, c); cc.xchg(c, d);
    cc.xor_(a, b); cc.xor_(b, c); cc.xor_(c, d);
    cc.rcl(a, c.r8_lo()); cc.rcl(b, c.r8_lo()); cc.rcl(d, c.r8_lo());
    cc.rcr(a, c.r8_lo()); cc.rcr(b, c.r8_lo()); cc.rcr(d, c.r8_lo());
    cc.rol(a, c.r8_lo()); cc.rol(b, c.r8_lo()); cc.rol(d, c.r8_lo());
    cc.ror(a, c.r8_lo()); cc.ror(b, c.r8_lo()); cc.ror(d, c.r8_lo());
    cc.shl(a, c.r8_lo()); cc.shl(b, c.r8_lo()); cc.shl(d, c.r8_lo());
    cc.shr(a, c.r8_lo()); cc.shr(b, c.r8_lo()); cc.shr(d, c.r8_lo());
    cc.sar(a, c.r8_lo()); cc.sar(b, c.r8_lo()); cc.sar(d, c.r8_lo());
    cc.shld(a, b, c.r8_lo()); cc.shld(b, d, c.r8_lo()); cc.shld(d, a, c.r8_lo());
    cc.shrd(a, b, c.r8_lo()); cc.shrd(b, d, c.r8_lo()); cc.shrd(d, a, c.r8_lo());
    cc.adcx(a, b); cc.adox(a, b); cc.adcx(b, c); cc.adox(b, c); cc.adcx(c, d); cc.adox(c, d);
    cc.andn(a, b, c); cc.andn(b, c, d); cc.andn(c, d, a);
    cc.bextr(a, b, c); cc.bextr(b, c, d); cc.bextr(c, d, a);
    cc.blsi(a, b); cc.blsi(b, c); cc.blsi(c, d);
    cc.blsmsk(a, b); cc.blsmsk(b, c); cc.blsmsk(c, d);
    cc.blsr(a, b); cc.blsr(b, c); cc.blsr(c, d);
    cc.bzhi(a, b, c); cc.bzhi(b, c, d); cc.bzhi(c, d, a);
    cc.lzcnt(a, b); cc.lzcnt(b, c); cc.lzcnt(c, d);
    cc.pdep(a, b, c); cc.pdep(b, c, d); cc.pdep(c, d, a);
    cc.pext(a, b, c); cc.pext(b, c, d); cc.pext(c, d, a);
    cc.popcnt(a, b); cc.popcnt(b, c); cc.popcnt(c, d);
    cc.rorx(a, b, 8); cc.rorx(b, c, 8); cc.rorx(c, d, 8);
    cc.sarx(a, b, c); cc.sarx(b, c, d); cc.sarx(c, d, a);
    cc.shlx(a, b, c); cc.shlx(b, c, d); cc.shlx(c, d, a);
    cc.shrx(a, b, c); cc.shrx(b, c, d); cc.shrx(c, d, a);
    cc.tzcnt(a, b); cc.tzcnt(b, c); cc.tzcnt(c, d);
  }
  else {
    uint32_t register_size = cc.register_size();
    x86::Mem m = x86::ptr(c, 0, register_size);
    x86::Mem m8 = x86::byte_ptr(c);

    cc.adc(a, m); cc.adc(b, m); cc.adc(c, m);
    cc.add(a, m); cc.add(b, m); cc.add(c, m);
    cc.and_(a, m); cc.and_(b, m); cc.and_(c, m);
    cc.bsf(a, m); cc.bsf(b, m); cc.bsf(c, m);
    cc.bsr(a, m); cc.bsr(b, m); cc.bsr(c, m);
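    // The bt/btc/btr/bts family below takes the memory operand as the first
    // (destination) operand: x86 encodes bit tests with a `m, reg` form, so
    // the operand roles flip relative to the register form above.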
    cc.bt(m, a); cc.bt(m, b); cc.bt(m, c);
    cc.btc(m, a); cc.btc(m, b); cc.btc(m, c);
    cc.btr(m, a); cc.btr(m, b); cc.btr(m, c);
    cc.bts(m, a); cc.bts(m, b); cc.bts(m, c);
    cc.cmp(a, m); cc.cmovc(a, m); cc.cmp(b, m); cc.cmovc(b, m); cc.cmp(c, m); cc.cmovc(c, m);
    cc.dec(m);
    cc.movsx(a, m8); cc.movsx(b, m8); cc.movsx(c, m8);
    cc.movzx(a, m8); cc.movzx(b, m8); cc.movzx(c, m8);
    cc.neg(m);
    cc.not_(m);
    cc.or_(a, m); cc.or_(b, m); cc.or_(c, m);
    cc.sbb(a, m); cc.sbb(b, m); cc.sbb(c, m);
    cc.sub(a, m); cc.sub(b, m); cc.sub(c, m);
    cc.test(m, a); cc.test(m, b); cc.test(m, c);
    cc.xchg(a, m); cc.xchg(b, m); cc.xchg(c, m);
    cc.xor_(a, m); cc.xor_(b, m); cc.xor_(c, m);
    cc.rcl(m, c.r8_lo()); cc.rcr(m, c.r8_lo());
    cc.rol(m, c.r8_lo()); cc.ror(m, c.r8_lo());
    cc.shl(m, c.r8_lo()); cc.shr(m, c.r8_lo()); cc.sar(m, c.r8_lo());
    cc.shld(m, b, c.r8_lo()); cc.shld(m, d, c.r8_lo()); cc.shld(m, a, c.r8_lo());
    cc.shrd(m, b, c.r8_lo()); cc.shrd(m, d, c.r8_lo()); cc.shrd(m, a, c.r8_lo());
    cc.adcx(a, m); cc.adox(a, m); cc.adcx(b, m); cc.adox(b, m); cc.adcx(c, m); cc.adox(c, m);
    cc.andn(a, b, m); cc.andn(b, c, m); cc.andn(c, d, m);
    cc.bextr(a, m, c); cc.bextr(b, m, d); cc.bextr(c, m, a);
    cc.blsi(a, m); cc.blsi(b, m); cc.blsi(c, m);
    cc.blsmsk(a, m); cc.blsmsk(b, m); cc.blsmsk(c, m);
    cc.blsr(a, m); cc.blsr(b, m); cc.blsr(c, m);
    cc.bzhi(a, m, c); cc.bzhi(b, m, d); cc.bzhi(c, m, a);
    cc.lzcnt(a, m); cc.lzcnt(b, m); cc.lzcnt(c, m);
    cc.pdep(a, b, m); cc.pdep(b, c, m); cc.pdep(c, d, m);
    cc.pext(a, b, m); cc.pext(b, c, m); cc.pext(c, d, m);
    cc.popcnt(a, m); cc.popcnt(b, m); cc.popcnt(c, m);
    cc.rorx(a, m, 8); cc.rorx(b, m, 8); cc.rorx(c, m, 8);
    cc.sarx(a, m, c); cc.sarx(b, m, d); cc.sarx(c, m, a);
    cc.shlx(a, m, c); cc.shlx(b, m, d); cc.shlx(c, m, a);
    cc.shrx(a, m, c); cc.shrx(b, m, d); cc.shrx(c, m, a);
    cc.tzcnt(a, m); cc.tzcnt(b, m); cc.tzcnt(c, m);
  }
}

static void generate_empty_function(BaseEmitter& emitter, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();
    Gp rv = cc.new_gp32("rv");
    cc.add_func(FuncSignature::build<int>());
    cc.mov(rv, 0);
    cc.ret(rv);
    cc.end_func();
    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();
    Gp rv = eax;
    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<int>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(rv);
      frame.finalize();

      cc.emit_prolog(frame);
      cc.mov(rv, 0);
      cc.emit_epilog(frame);
    }
    else {
      cc.mov(rv, 0);
      cc.ret();
    }
    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();
    Gp rv = eax;
    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<int>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(rv);
      frame.finalize();

      cc.emit_prolog(frame);
      cc.mov(rv, 0);
      cc.emit_epilog(frame);
    }
    else {
      cc.mov(rv, 0);
      cc.ret();
    }
    return;
  }
}

static void generate_n_ops_sequence(BaseEmitter& emitter, uint32_t ops, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp ra = cc.new_gp32("ra");
    Gp rb = cc.new_gp32("rb");
    Gp rc = cc.new_gp32("rc");
    Gp rd = cc.new_gp32("rd");

    FuncNode* f = cc.add_func(FuncSignature::build<int, int, int, int, int>());
    f->set_arg(0, ra);
    f->set_arg(1, rb);
    f->set_arg(2, rc);
    f->set_arg(3, rd);

    for (uint32_t i = 0; i < ops; i += 4) {
      cc.add(ra, rb); cc.imul(ra, rc); cc.sub(ra, rd); cc.imul(ra, rc);
    }

    cc.ret(ra);
    cc.end_func();
    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    Gp ra = eax;
    Gp rb = ebx;
    Gp rc = ecx;
    Gp rd = edx;

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<int, int, int, int, int>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(ra, rb, rc, rd);
      frame.finalize();

      cc.emit_prolog(frame);
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb); cc.imul(ra, rc); cc.sub(ra, rd); cc.imul(ra, rc);
      }
      cc.emit_epilog(frame);
    }
    else {
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb); cc.imul(ra, rc); cc.sub(ra, rd); cc.imul(ra, rc);
      }
      cc.ret();
    }
    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    Gp ra = eax;
    Gp rb = ebx;
    Gp rc = ecx;
    Gp rd = edx;

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<int, int, int, int, int>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(ra, rb, rc, rd);
      frame.finalize();

      cc.emit_prolog(frame);
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb); cc.imul(ra, rc); cc.sub(ra, rd); cc.imul(ra, rc);
      }
      cc.emit_epilog(frame);
    }
    else {
      for (uint32_t i = 0; i < ops; i += 4) {
        cc.add(ra, rb); cc.imul(ra, rc); cc.sub(ra, rd); cc.imul(ra, rc);
      }
      cc.ret();
    }
    return;
  }
}

static void generate_gp_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp a = cc.new_gp_ptr("a");
    Gp b = cc.new_gp_ptr("b");
    Gp c = cc.new_gp_ptr("c");
    Gp d = cc.new_gp_ptr("d");

    cc.add_func(FuncSignature::build<void>());
    generate_gp_sequence_internal(cc, form, a, b, c, d);
    cc.end_func();
    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    x86::Gp a = cc.zax();
    x86::Gp b = cc.zbx();
    x86::Gp c = cc.zcx();
    x86::Gp d = cc.zdx();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(a, b, c, d);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_gp_sequence_internal(cc, form, a, b, c, d);
      cc.emit_epilog(frame);
    }
    else {
      generate_gp_sequence_internal(cc, form, a, b, c, d);
    }
    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    x86::Gp a = cc.zax();
    x86::Gp b = cc.zbx();
    x86::Gp c = cc.zcx();
    x86::Gp d = cc.zdx();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(a, b, c, d);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_gp_sequence_internal(cc, form, a, b, c, d);
      cc.emit_epilog(frame);
    }
    else {
      generate_gp_sequence_internal(cc, form, a, b, c, d);
    }
    return;
  }
}

// Generates a long sequence of SSE instructions.
template<typename Emitter>
static void generate_sse_sequence_internal(
  Emitter& cc, InstForm form,
  const x86::Gp& gp,
  const x86::Vec& xmm_a, const x86::Vec& xmm_b, const x86::Vec& xmm_c, const x86::Vec& xmm_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  // gpz is the native-width register: 32-bit on x86, 64-bit on x64.
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  cc.xor_(gpd, gpd);
  cc.xorps(xmm_a, xmm_a);
  cc.xorps(xmm_b, xmm_b);
  cc.xorps(xmm_c, xmm_c);
  cc.xorps(xmm_d, xmm_d);

  if (form == InstForm::kReg) {
    // SSE.
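    // Scalar int<->float conversions are emitted through both gpd and gpz so
    // that 64-bit targets exercise the 32-bit and the REX.W encodings of
    // cvtsi2ss/cvtss2si and friends.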
cc.addps(xmm_a, xmm_b); cc.addss(xmm_a, xmm_b); cc.andnps(xmm_a, xmm_b); cc.andps(xmm_a, xmm_b); cc.cmpps(xmm_a, xmm_b, 0); cc.cmpss(xmm_a, xmm_b, 0); cc.comiss(xmm_a, xmm_b); cc.cvtsi2ss(xmm_a, gpd); cc.cvtsi2ss(xmm_a, gpz); cc.cvtss2si(gpd, xmm_b); cc.cvtss2si(gpz, xmm_b); cc.cvttss2si(gpd, xmm_b); cc.cvttss2si(gpz, xmm_b); cc.divps(xmm_a, xmm_b); cc.divss(xmm_a, xmm_b); cc.maxps(xmm_a, xmm_b); cc.maxss(xmm_a, xmm_b); cc.minps(xmm_a, xmm_b); cc.minss(xmm_a, xmm_b); cc.movaps(xmm_a, xmm_b); cc.movd(gpd, xmm_b); cc.movd(xmm_a, gpd); cc.movq(xmm_a, xmm_b); cc.movhlps(xmm_a, xmm_b); cc.movlhps(xmm_a, xmm_b); cc.movups(xmm_a, xmm_b); cc.mulps(xmm_a, xmm_b); cc.mulss(xmm_a, xmm_b); cc.orps(xmm_a, xmm_b); cc.rcpps(xmm_a, xmm_b); cc.rcpss(xmm_a, xmm_b); cc.psadbw(xmm_a, xmm_b); cc.rsqrtps(xmm_a, xmm_b); cc.rsqrtss(xmm_a, xmm_b); cc.sfence(); cc.shufps(xmm_a, xmm_b, 0); cc.sqrtps(xmm_a, xmm_b); cc.sqrtss(xmm_a, xmm_b); cc.subps(xmm_a, xmm_b); cc.subss(xmm_a, xmm_b); cc.ucomiss(xmm_a, xmm_b); cc.unpckhps(xmm_a, xmm_b); cc.unpcklps(xmm_a, xmm_b); cc.xorps(xmm_a, xmm_b); // SSE2. cc.addpd(xmm_a, xmm_b); cc.addsd(xmm_a, xmm_b); cc.andnpd(xmm_a, xmm_b); cc.andpd(xmm_a, xmm_b); cc.cmppd(xmm_a, xmm_b, 0); cc.cmpsd(xmm_a, xmm_b, 0); cc.comisd(xmm_a, xmm_b); cc.cvtdq2pd(xmm_a, xmm_b); cc.cvtdq2ps(xmm_a, xmm_b); cc.cvtpd2dq(xmm_a, xmm_b); cc.cvtpd2ps(xmm_a, xmm_b); cc.cvtps2dq(xmm_a, xmm_b); cc.cvtps2pd(xmm_a, xmm_b); cc.cvtsd2si(gpd, xmm_b); cc.cvtsd2si(gpz, xmm_b); cc.cvtsd2ss(xmm_a, xmm_b); cc.cvtsi2sd(xmm_a, gpd); cc.cvtsi2sd(xmm_a, gpz); cc.cvtss2sd(xmm_a, xmm_b); cc.cvtss2si(gpd, xmm_b); cc.cvtss2si(gpz, xmm_b); cc.cvttpd2dq(xmm_a, xmm_b); cc.cvttps2dq(xmm_a, xmm_b); cc.cvttsd2si(gpd, xmm_b); cc.cvttsd2si(gpz, xmm_b); cc.divpd(xmm_a, xmm_b); cc.divsd(xmm_a, xmm_b); cc.maxpd(xmm_a, xmm_b); cc.maxsd(xmm_a, xmm_b); cc.minpd(xmm_a, xmm_b); cc.minsd(xmm_a, xmm_b); cc.movdqa(xmm_a, xmm_b); cc.movdqu(xmm_a, xmm_b); cc.movmskps(gpd, xmm_b); cc.movmskpd(gpd, xmm_b); cc.movsd(xmm_a, xmm_b); cc.mulpd(xmm_a, xmm_b); cc.mulsd(xmm_a, xmm_b); cc.orpd(xmm_a, xmm_b); cc.packsswb(xmm_a, xmm_b); cc.packssdw(xmm_a, xmm_b); cc.packuswb(xmm_a, xmm_b); cc.paddb(xmm_a, xmm_b); cc.paddw(xmm_a, xmm_b); cc.paddd(xmm_a, xmm_b); cc.paddq(xmm_a, xmm_b); cc.paddsb(xmm_a, xmm_b); cc.paddsw(xmm_a, xmm_b); cc.paddusb(xmm_a, xmm_b); cc.paddusw(xmm_a, xmm_b); cc.pand(xmm_a, xmm_b); cc.pandn(xmm_a, xmm_b); cc.pavgb(xmm_a, xmm_b); cc.pavgw(xmm_a, xmm_b); cc.pcmpeqb(xmm_a, xmm_b); cc.pcmpeqw(xmm_a, xmm_b); cc.pcmpeqd(xmm_a, xmm_b); cc.pcmpgtb(xmm_a, xmm_b); cc.pcmpgtw(xmm_a, xmm_b); cc.pcmpgtd(xmm_a, xmm_b); cc.pmaxsw(xmm_a, xmm_b); cc.pmaxub(xmm_a, xmm_b); cc.pminsw(xmm_a, xmm_b); cc.pminub(xmm_a, xmm_b); cc.pmovmskb(gpd, xmm_b); cc.pmulhw(xmm_a, xmm_b); cc.pmulhuw(xmm_a, xmm_b); cc.pmullw(xmm_a, xmm_b); cc.pmuludq(xmm_a, xmm_b); cc.por(xmm_a, xmm_b); cc.pslld(xmm_a, xmm_b); cc.pslld(xmm_a, 0); cc.psllq(xmm_a, xmm_b); cc.psllq(xmm_a, 0); cc.psllw(xmm_a, xmm_b); cc.psllw(xmm_a, 0); cc.pslldq(xmm_a, 0); cc.psrad(xmm_a, xmm_b); cc.psrad(xmm_a, 0); cc.psraw(xmm_a, xmm_b); cc.psraw(xmm_a, 0); cc.psubb(xmm_a, xmm_b); cc.psubw(xmm_a, xmm_b); cc.psubd(xmm_a, xmm_b); cc.psubq(xmm_a, xmm_b); cc.pmaddwd(xmm_a, xmm_b); cc.pshufd(xmm_a, xmm_b, 0); cc.pshufhw(xmm_a, xmm_b, 0); cc.pshuflw(xmm_a, xmm_b, 0); cc.psrld(xmm_a, xmm_b); cc.psrld(xmm_a, 0); cc.psrlq(xmm_a, xmm_b); cc.psrlq(xmm_a, 0); cc.psrldq(xmm_a, 0); cc.psrlw(xmm_a, xmm_b); cc.psrlw(xmm_a, 0); cc.psubsb(xmm_a, xmm_b); cc.psubsw(xmm_a, xmm_b); cc.psubusb(xmm_a, xmm_b); cc.psubusw(xmm_a, 
xmm_b); cc.punpckhbw(xmm_a, xmm_b); cc.punpckhwd(xmm_a, xmm_b); cc.punpckhdq(xmm_a, xmm_b); cc.punpckhqdq(xmm_a, xmm_b); cc.punpcklbw(xmm_a, xmm_b); cc.punpcklwd(xmm_a, xmm_b); cc.punpckldq(xmm_a, xmm_b); cc.punpcklqdq(xmm_a, xmm_b); cc.pxor(xmm_a, xmm_b); cc.sqrtpd(xmm_a, xmm_b); cc.sqrtsd(xmm_a, xmm_b); cc.subpd(xmm_a, xmm_b); cc.subsd(xmm_a, xmm_b); cc.ucomisd(xmm_a, xmm_b); cc.unpckhpd(xmm_a, xmm_b); cc.unpcklpd(xmm_a, xmm_b); cc.xorpd(xmm_a, xmm_b); // SSE3. cc.addsubpd(xmm_a, xmm_b); cc.addsubps(xmm_a, xmm_b); cc.haddpd(xmm_a, xmm_b); cc.haddps(xmm_a, xmm_b); cc.hsubpd(xmm_a, xmm_b); cc.hsubps(xmm_a, xmm_b); cc.movddup(xmm_a, xmm_b); cc.movshdup(xmm_a, xmm_b); cc.movsldup(xmm_a, xmm_b); // SSSE3. cc.psignb(xmm_a, xmm_b); cc.psignw(xmm_a, xmm_b); cc.psignd(xmm_a, xmm_b); cc.phaddw(xmm_a, xmm_b); cc.phaddd(xmm_a, xmm_b); cc.phaddsw(xmm_a, xmm_b); cc.phsubw(xmm_a, xmm_b); cc.phsubd(xmm_a, xmm_b); cc.phsubsw(xmm_a, xmm_b); cc.pmaddubsw(xmm_a, xmm_b); cc.pabsb(xmm_a, xmm_b); cc.pabsw(xmm_a, xmm_b); cc.pabsd(xmm_a, xmm_b); cc.pmulhrsw(xmm_a, xmm_b); cc.pshufb(xmm_a, xmm_b); cc.palignr(xmm_a, xmm_b, 0); // SSE4.1. cc.blendpd(xmm_a, xmm_b, 0); cc.blendps(xmm_a, xmm_b, 0); cc.blendvpd(xmm_a, xmm_b, xmm_a); cc.blendvps(xmm_a, xmm_b, xmm_a); cc.dppd(xmm_a, xmm_b, 0); cc.dpps(xmm_a, xmm_b, 0); cc.extractps(gpd, xmm_b, 0); cc.insertps(xmm_a, xmm_b, 0); cc.mpsadbw(xmm_a, xmm_b, 0); cc.packusdw(xmm_a, xmm_b); cc.pblendvb(xmm_a, xmm_b, xmm_a); cc.pblendw(xmm_a, xmm_b, 0); cc.pcmpeqq(xmm_a, xmm_b); cc.pextrb(gpd, xmm_b, 0); cc.pextrd(gpd, xmm_b, 0); if (cc.is_64bit()) cc.pextrq(gpq, xmm_b, 0); cc.pextrw(gpd, xmm_b, 0); cc.phminposuw(xmm_a, xmm_b); cc.pinsrb(xmm_a, gpd, 0); cc.pinsrd(xmm_a, gpd, 0); cc.pinsrw(xmm_a, gpd, 0); cc.pmaxuw(xmm_a, xmm_b); cc.pmaxsb(xmm_a, xmm_b); cc.pmaxsd(xmm_a, xmm_b); cc.pmaxud(xmm_a, xmm_b); cc.pminsb(xmm_a, xmm_b); cc.pminuw(xmm_a, xmm_b); cc.pminud(xmm_a, xmm_b); cc.pminsd(xmm_a, xmm_b); cc.pmovsxbw(xmm_a, xmm_b); cc.pmovsxbd(xmm_a, xmm_b); cc.pmovsxbq(xmm_a, xmm_b); cc.pmovsxwd(xmm_a, xmm_b); cc.pmovsxwq(xmm_a, xmm_b); cc.pmovsxdq(xmm_a, xmm_b); cc.pmovzxbw(xmm_a, xmm_b); cc.pmovzxbd(xmm_a, xmm_b); cc.pmovzxbq(xmm_a, xmm_b); cc.pmovzxwd(xmm_a, xmm_b); cc.pmovzxwq(xmm_a, xmm_b); cc.pmovzxdq(xmm_a, xmm_b); cc.pmuldq(xmm_a, xmm_b); cc.pmulld(xmm_a, xmm_b); cc.ptest(xmm_a, xmm_b); cc.roundps(xmm_a, xmm_b, 0); cc.roundss(xmm_a, xmm_b, 0); cc.roundpd(xmm_a, xmm_b, 0); cc.roundsd(xmm_a, xmm_b, 0); } else { x86::Mem m = x86::ptr(gpz); cc.addps(xmm_a, m); cc.addss(xmm_a, m); cc.andnps(xmm_a, m); cc.andps(xmm_a, m); cc.cmpps(xmm_a, m, 0); cc.cmpss(xmm_a, m, 0); cc.comiss(xmm_a, m); cc.cvtpi2ps(xmm_a, m); cc.cvtsi2ss(xmm_a, m); cc.cvtss2si(gpd, m); cc.cvtss2si(gpz, m); cc.cvttss2si(gpd, m); cc.cvttss2si(gpz, m); cc.divps(xmm_a, m); cc.divss(xmm_a, m); cc.maxps(xmm_a, m); cc.maxss(xmm_a, m); cc.minps(xmm_a, m); cc.minss(xmm_a, m); cc.movaps(xmm_a, m); cc.movaps(m, xmm_b); cc.movd(m, xmm_b); cc.movd(xmm_a, m); cc.movq(m, xmm_b); cc.movq(xmm_a, m); cc.movhps(xmm_a, m); cc.movhps(m, xmm_b); cc.movlps(xmm_a, m); cc.movlps(m, xmm_b); cc.movntps(m, xmm_b); cc.movss(xmm_a, m); cc.movss(m, xmm_b); cc.movups(xmm_a, m); cc.movups(m, xmm_b); cc.mulps(xmm_a, m); cc.mulss(xmm_a, m); cc.orps(xmm_a, m); cc.rcpps(xmm_a, m); cc.rcpss(xmm_a, m); cc.psadbw(xmm_a, m); cc.rsqrtps(xmm_a, m); cc.rsqrtss(xmm_a, m); cc.shufps(xmm_a, m, 0); cc.sqrtps(xmm_a, m); cc.sqrtss(xmm_a, m); cc.stmxcsr(m); cc.subps(xmm_a, m); cc.subss(xmm_a, m); cc.ucomiss(xmm_a, m); cc.unpckhps(xmm_a, m); cc.unpcklps(xmm_a, m); 
cc.xorps(xmm_a, m); // SSE2. cc.addpd(xmm_a, m); cc.addsd(xmm_a, m); cc.andnpd(xmm_a, m); cc.andpd(xmm_a, m); cc.cmppd(xmm_a, m, 0); cc.cmpsd(xmm_a, m, 0); cc.comisd(xmm_a, m); cc.cvtdq2pd(xmm_a, m); cc.cvtdq2ps(xmm_a, m); cc.cvtpd2dq(xmm_a, m); cc.cvtpd2ps(xmm_a, m); cc.cvtpi2pd(xmm_a, m); cc.cvtps2dq(xmm_a, m); cc.cvtps2pd(xmm_a, m); cc.cvtsd2si(gpd, m); cc.cvtsd2si(gpz, m); cc.cvtsd2ss(xmm_a, m); cc.cvtsi2sd(xmm_a, m); cc.cvtss2sd(xmm_a, m); cc.cvtss2si(gpd, m); cc.cvtss2si(gpz, m); cc.cvttpd2dq(xmm_a, m); cc.cvttps2dq(xmm_a, m); cc.cvttsd2si(gpd, m); cc.cvttsd2si(gpz, m); cc.divpd(xmm_a, m); cc.divsd(xmm_a, m); cc.maxpd(xmm_a, m); cc.maxsd(xmm_a, m); cc.minpd(xmm_a, m); cc.minsd(xmm_a, m); cc.movdqa(xmm_a, m); cc.movdqa(m, xmm_b); cc.movdqu(xmm_a, m); cc.movdqu(m, xmm_b); cc.movsd(xmm_a, m); cc.movsd(m, xmm_b); cc.movapd(xmm_a, m); cc.movapd(m, xmm_b); cc.movhpd(xmm_a, m); cc.movhpd(m, xmm_b); cc.movlpd(xmm_a, m); cc.movlpd(m, xmm_b); cc.movntdq(m, xmm_b); cc.movntpd(m, xmm_b); cc.movupd(xmm_a, m); cc.movupd(m, xmm_b); cc.mulpd(xmm_a, m); cc.mulsd(xmm_a, m); cc.orpd(xmm_a, m); cc.packsswb(xmm_a, m); cc.packssdw(xmm_a, m); cc.packuswb(xmm_a, m); cc.paddb(xmm_a, m); cc.paddw(xmm_a, m); cc.paddd(xmm_a, m); cc.paddq(xmm_a, m); cc.paddsb(xmm_a, m); cc.paddsw(xmm_a, m); cc.paddusb(xmm_a, m); cc.paddusw(xmm_a, m); cc.pand(xmm_a, m); cc.pandn(xmm_a, m); cc.pavgb(xmm_a, m); cc.pavgw(xmm_a, m); cc.pcmpeqb(xmm_a, m); cc.pcmpeqw(xmm_a, m); cc.pcmpeqd(xmm_a, m); cc.pcmpgtb(xmm_a, m); cc.pcmpgtw(xmm_a, m); cc.pcmpgtd(xmm_a, m); cc.pmaxsw(xmm_a, m); cc.pmaxub(xmm_a, m); cc.pminsw(xmm_a, m); cc.pminub(xmm_a, m); cc.pmulhw(xmm_a, m); cc.pmulhuw(xmm_a, m); cc.pmullw(xmm_a, m); cc.pmuludq(xmm_a, m); cc.por(xmm_a, m); cc.pslld(xmm_a, m); cc.psllq(xmm_a, m); cc.psllw(xmm_a, m); cc.psrad(xmm_a, m); cc.psraw(xmm_a, m); cc.psubb(xmm_a, m); cc.psubw(xmm_a, m); cc.psubd(xmm_a, m); cc.psubq(xmm_a, m); cc.pmaddwd(xmm_a, m); cc.pshufd(xmm_a, m, 0); cc.pshufhw(xmm_a, m, 0); cc.pshuflw(xmm_a, m, 0); cc.psrld(xmm_a, m); cc.psrlq(xmm_a, m); cc.psrlw(xmm_a, m); cc.psubsb(xmm_a, m); cc.psubsw(xmm_a, m); cc.psubusb(xmm_a, m); cc.psubusw(xmm_a, m); cc.punpckhbw(xmm_a, m); cc.punpckhwd(xmm_a, m); cc.punpckhdq(xmm_a, m); cc.punpckhqdq(xmm_a, m); cc.punpcklbw(xmm_a, m); cc.punpcklwd(xmm_a, m); cc.punpckldq(xmm_a, m); cc.punpcklqdq(xmm_a, m); cc.pxor(xmm_a, m); cc.sqrtpd(xmm_a, m); cc.sqrtsd(xmm_a, m); cc.subpd(xmm_a, m); cc.subsd(xmm_a, m); cc.ucomisd(xmm_a, m); cc.unpckhpd(xmm_a, m); cc.unpcklpd(xmm_a, m); cc.xorpd(xmm_a, m); // SSE3. cc.addsubpd(xmm_a, m); cc.addsubps(xmm_a, m); cc.haddpd(xmm_a, m); cc.haddps(xmm_a, m); cc.hsubpd(xmm_a, m); cc.hsubps(xmm_a, m); cc.lddqu(xmm_a, m); cc.movddup(xmm_a, m); cc.movshdup(xmm_a, m); cc.movsldup(xmm_a, m); // SSSE3. cc.psignb(xmm_a, m); cc.psignw(xmm_a, m); cc.psignd(xmm_a, m); cc.phaddw(xmm_a, m); cc.phaddd(xmm_a, m); cc.phaddsw(xmm_a, m); cc.phsubw(xmm_a, m); cc.phsubd(xmm_a, m); cc.phsubsw(xmm_a, m); cc.pmaddubsw(xmm_a, m); cc.pabsb(xmm_a, m); cc.pabsw(xmm_a, m); cc.pabsd(xmm_a, m); cc.pmulhrsw(xmm_a, m); cc.pshufb(xmm_a, m); cc.palignr(xmm_a, m, 0); // SSE4.1. 
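    // The memory forms here include the store variants of extractps and
    // pextrb/pextrd/pextrw (pextrq only on 64-bit targets), plus movntdqa,
    // the streaming (non-temporal hint) load added by SSE4.1.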
    cc.blendpd(xmm_a, m, 0); cc.blendps(xmm_a, m, 0);
    cc.blendvpd(xmm_a, m, xmm_a); cc.blendvps(xmm_a, m, xmm_a);
    cc.dppd(xmm_a, m, 0); cc.dpps(xmm_a, m, 0);
    cc.extractps(m, xmm_b, 0);
    cc.insertps(xmm_a, m, 0);
    cc.movntdqa(xmm_a, m);
    cc.mpsadbw(xmm_a, m, 0);
    cc.packusdw(xmm_a, m);
    cc.pblendvb(xmm_a, m, xmm_a); cc.pblendw(xmm_a, m, 0);
    cc.pcmpeqq(xmm_a, m);
    cc.pextrb(m, xmm_b, 0); cc.pextrd(m, xmm_b, 0);
    if (cc.is_64bit()) cc.pextrq(m, xmm_b, 0);
    cc.pextrw(m, xmm_b, 0);
    cc.phminposuw(xmm_a, m);
    cc.pinsrb(xmm_a, m, 0); cc.pinsrd(xmm_a, m, 0); cc.pinsrw(xmm_a, m, 0);
    cc.pmaxuw(xmm_a, m); cc.pmaxsb(xmm_a, m); cc.pmaxsd(xmm_a, m); cc.pmaxud(xmm_a, m);
    cc.pminsb(xmm_a, m); cc.pminuw(xmm_a, m); cc.pminud(xmm_a, m); cc.pminsd(xmm_a, m);
    cc.pmovsxbw(xmm_a, m); cc.pmovsxbd(xmm_a, m); cc.pmovsxbq(xmm_a, m);
    cc.pmovsxwd(xmm_a, m); cc.pmovsxwq(xmm_a, m); cc.pmovsxdq(xmm_a, m);
    cc.pmovzxbw(xmm_a, m); cc.pmovzxbd(xmm_a, m); cc.pmovzxbq(xmm_a, m);
    cc.pmovzxwd(xmm_a, m); cc.pmovzxwq(xmm_a, m); cc.pmovzxdq(xmm_a, m);
    cc.pmuldq(xmm_a, m); cc.pmulld(xmm_a, m);
    cc.ptest(xmm_a, m);
    cc.roundps(xmm_a, m, 0); cc.roundss(xmm_a, m, 0);
    cc.roundpd(xmm_a, m, 0); cc.roundsd(xmm_a, m, 0);

    // SSE4.2.
    cc.pcmpgtq(xmm_a, m);
  }
}

static void generate_sse_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.new_gpz("gp");
    Vec a = cc.new_xmm("a");
    Vec b = cc.new_xmm("b");
    Vec c = cc.new_xmm("c");
    Vec d = cc.new_xmm("d");

    cc.add_func(FuncSignature::build<void>());
    generate_sse_sequence_internal(cc, form, gp, a, b, c, d);
    cc.end_func();
    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();
    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, xmm0, xmm1, xmm2, xmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
    }
    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();
    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, xmm0, xmm1, xmm2, xmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_sse_sequence_internal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
    }
    return;
  }
}

// Generates a long sequence of AVX instructions.
template<typename Emitter>
static void generate_avx_sequence_internal_reg_only(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ?
gpd : gpq; x86::Vec xmm_a = vec_a.xmm(); x86::Vec xmm_b = vec_b.xmm(); x86::Vec xmm_c = vec_c.xmm(); x86::Vec xmm_d = vec_d.xmm(); x86::Vec ymm_a = vec_a.ymm(); x86::Vec ymm_b = vec_b.ymm(); x86::Vec ymm_c = vec_c.ymm(); cc.xor_(gpd, gpd); cc.vxorps(xmm_a, xmm_a, xmm_a); cc.vxorps(xmm_b, xmm_b, xmm_b); cc.vxorps(xmm_c, xmm_c, xmm_c); cc.vxorps(xmm_d, xmm_d, xmm_d); cc.vaddpd(xmm_a, xmm_b, xmm_c); cc.vaddpd(ymm_a, ymm_b, ymm_c); cc.vaddps(xmm_a, xmm_b, xmm_c); cc.vaddps(ymm_a, ymm_b, ymm_c); cc.vaddsd(xmm_a, xmm_b, xmm_c); cc.vaddss(xmm_a, xmm_b, xmm_c); cc.vaddsubpd(xmm_a, xmm_b, xmm_c); cc.vaddsubpd(ymm_a, ymm_b, ymm_c); cc.vaddsubps(xmm_a, xmm_b, xmm_c); cc.vaddsubps(ymm_a, ymm_b, ymm_c); cc.vandpd(xmm_a, xmm_b, xmm_c); cc.vandpd(ymm_a, ymm_b, ymm_c); cc.vandps(xmm_a, xmm_b, xmm_c); cc.vandps(ymm_a, ymm_b, ymm_c); cc.vandnpd(xmm_a, xmm_b, xmm_c); cc.vandnpd(ymm_a, ymm_b, ymm_c); cc.vandnps(xmm_a, xmm_b, xmm_c); cc.vandnps(ymm_a, ymm_b, ymm_c); cc.vblendpd(xmm_a, xmm_b, xmm_c, 0); cc.vblendpd(ymm_a, ymm_b, ymm_c, 0); cc.vblendps(xmm_a, xmm_b, xmm_c, 0); cc.vblendps(ymm_a, ymm_b, ymm_c, 0); cc.vblendvpd(xmm_a, xmm_b, xmm_c, xmm_a); cc.vblendvpd(ymm_a, ymm_b, ymm_c, ymm_a); cc.vcmppd(xmm_a, xmm_b, xmm_c, 0); cc.vcmppd(ymm_a, ymm_b, ymm_c, 0); cc.vcmpps(xmm_a, xmm_b, xmm_c, 0); cc.vcmpps(ymm_a, ymm_b, ymm_c, 0); cc.vcmpsd(xmm_a, xmm_b, xmm_c, 0); cc.vcmpss(xmm_a, xmm_b, xmm_c, 0); cc.vcomisd(xmm_a, xmm_b); cc.vcomiss(xmm_a, xmm_b); cc.vcvtdq2pd(xmm_a, xmm_b); cc.vcvtdq2pd(ymm_a, xmm_b); cc.vcvtdq2ps(xmm_a, xmm_b); cc.vcvtdq2ps(ymm_a, ymm_b); cc.vcvtpd2dq(xmm_a, xmm_b); cc.vcvtpd2dq(xmm_a, ymm_b); cc.vcvtpd2ps(xmm_a, xmm_b); cc.vcvtpd2ps(xmm_a, ymm_b); cc.vcvtps2dq(xmm_a, xmm_b); cc.vcvtps2dq(ymm_a, ymm_b); cc.vcvtps2pd(xmm_a, xmm_b); cc.vcvtps2pd(ymm_a, xmm_b); cc.vcvtsd2si(gpd, xmm_b); cc.vcvtsd2si(gpz, xmm_b); cc.vcvtsd2ss(xmm_a, xmm_b, xmm_c); cc.vcvtsi2sd(xmm_a, xmm_b, gpd); cc.vcvtsi2sd(xmm_a, xmm_b, gpz); cc.vcvtsi2ss(xmm_a, xmm_b, gpd); cc.vcvtsi2ss(xmm_a, xmm_b, gpz); cc.vcvtss2sd(xmm_a, xmm_b, xmm_c); cc.vcvtss2si(gpd, xmm_b); cc.vcvttpd2dq(xmm_a, xmm_b); cc.vcvttpd2dq(xmm_a, ymm_b); cc.vcvttps2dq(xmm_a, xmm_b); cc.vcvttps2dq(ymm_a, ymm_b); cc.vcvttsd2si(gpd, xmm_b); cc.vcvttss2si(gpz, xmm_b); cc.vdivpd(xmm_a, xmm_b, xmm_c); cc.vdivpd(ymm_a, ymm_b, ymm_c); cc.vdivps(xmm_a, xmm_b, xmm_c); cc.vdivps(ymm_a, ymm_b, ymm_c); cc.vdivsd(xmm_a, xmm_b, xmm_c); cc.vdivss(xmm_a, xmm_b, xmm_c); cc.vdppd(xmm_a, xmm_b, xmm_c, 0); cc.vdpps(xmm_a, xmm_b, xmm_c, 0); cc.vdpps(ymm_a, ymm_b, ymm_c, 0); cc.vextractf128(xmm_a, ymm_b, 0); cc.vextractps(gpd, xmm_b, 0); cc.vhaddpd(xmm_a, xmm_b, xmm_c); cc.vhaddpd(ymm_a, ymm_b, ymm_c); cc.vhaddps(xmm_a, xmm_b, xmm_c); cc.vhaddps(ymm_a, ymm_b, ymm_c); cc.vhsubpd(xmm_a, xmm_b, xmm_c); cc.vhsubpd(ymm_a, ymm_b, ymm_c); cc.vhsubps(xmm_a, xmm_b, xmm_c); cc.vhsubps(ymm_a, ymm_b, ymm_c); cc.vinsertf128(ymm_a, ymm_b, xmm_c, 0); cc.vinsertps(xmm_a, xmm_b, xmm_c, 0); cc.vmaxpd(xmm_a, xmm_b, xmm_c); cc.vmaxpd(ymm_a, ymm_b, ymm_c); cc.vmaxps(xmm_a, xmm_b, xmm_c); cc.vmaxps(ymm_a, ymm_b, ymm_c); cc.vmaxsd(xmm_a, xmm_b, xmm_c); cc.vmaxss(xmm_a, xmm_b, xmm_c); cc.vminpd(xmm_a, xmm_b, xmm_c); cc.vminpd(ymm_a, ymm_b, ymm_c); cc.vminps(xmm_a, xmm_b, xmm_c); cc.vminps(ymm_a, ymm_b, ymm_c); cc.vminsd(xmm_a, xmm_b, xmm_c); cc.vminss(xmm_a, xmm_b, xmm_c); cc.vmovapd(xmm_a, xmm_b); cc.vmovapd(ymm_a, ymm_b); cc.vmovaps(xmm_a, xmm_b); cc.vmovaps(ymm_a, ymm_b); cc.vmovd(xmm_a, gpd); cc.vmovd(gpd, xmm_b); cc.vmovddup(xmm_a, xmm_b); cc.vmovddup(ymm_a, ymm_b); cc.vmovdqa(xmm_a, xmm_b); 
cc.vmovdqa(ymm_a, ymm_b); cc.vmovdqu(xmm_a, xmm_b); cc.vmovdqu(ymm_a, ymm_b); cc.vmovhlps(xmm_a, xmm_b, xmm_c); cc.vmovlhps(xmm_a, xmm_b, xmm_c); cc.vmovmskpd(gpd, xmm_b); cc.vmovmskpd(gpd, ymm_b); cc.vmovmskps(gpd, xmm_b); cc.vmovmskps(gpd, ymm_b); cc.vmovsd(xmm_a, xmm_b, xmm_c); cc.vmovshdup(xmm_a, xmm_b); cc.vmovshdup(ymm_a, ymm_b); cc.vmovsldup(xmm_a, xmm_b); cc.vmovsldup(ymm_a, ymm_b); cc.vmovss(xmm_a, xmm_b, xmm_c); cc.vmovupd(xmm_a, xmm_b); cc.vmovupd(ymm_a, ymm_b); cc.vmovups(xmm_a, xmm_b); cc.vmovups(ymm_a, ymm_b); cc.vmpsadbw(xmm_a, xmm_b, xmm_c, 0); cc.vmulpd(xmm_a, xmm_b, xmm_c); cc.vmulpd(ymm_a, ymm_b, ymm_c); cc.vmulps(xmm_a, xmm_b, xmm_c); cc.vmulps(ymm_a, ymm_b, ymm_c); cc.vmulsd(xmm_a, xmm_b, xmm_c); cc.vmulss(xmm_a, xmm_b, xmm_c); cc.vorpd(xmm_a, xmm_b, xmm_c); cc.vorpd(ymm_a, ymm_b, ymm_c); cc.vorps(xmm_a, xmm_b, xmm_c); cc.vorps(ymm_a, ymm_b, ymm_c); cc.vpabsb(xmm_a, xmm_b); cc.vpabsd(xmm_a, xmm_b); cc.vpabsw(xmm_a, xmm_b); cc.vpackssdw(xmm_a, xmm_b, xmm_c); cc.vpacksswb(xmm_a, xmm_b, xmm_c); cc.vpackusdw(xmm_a, xmm_b, xmm_c); cc.vpackuswb(xmm_a, xmm_b, xmm_c); cc.vpaddb(xmm_a, xmm_b, xmm_c); cc.vpaddd(xmm_a, xmm_b, xmm_c); cc.vpaddq(xmm_a, xmm_b, xmm_c); cc.vpaddw(xmm_a, xmm_b, xmm_c); cc.vpaddsb(xmm_a, xmm_b, xmm_c); cc.vpaddsw(xmm_a, xmm_b, xmm_c); cc.vpaddusb(xmm_a, xmm_b, xmm_c); cc.vpaddusw(xmm_a, xmm_b, xmm_c); cc.vpalignr(xmm_a, xmm_b, xmm_c, 0); cc.vpand(xmm_a, xmm_b, xmm_c); cc.vpandn(xmm_a, xmm_b, xmm_c); cc.vpavgb(xmm_a, xmm_b, xmm_c); cc.vpavgw(xmm_a, xmm_b, xmm_c); cc.vpblendvb(xmm_a, xmm_b, xmm_c, xmm_a); cc.vpblendw(xmm_a, xmm_b, xmm_c, 0); cc.vpcmpeqb(xmm_a, xmm_b, xmm_c); cc.vpcmpeqd(xmm_a, xmm_b, xmm_c); cc.vpcmpeqq(xmm_a, xmm_b, xmm_c); cc.vpcmpeqw(xmm_a, xmm_b, xmm_c); cc.vpcmpgtb(xmm_a, xmm_b, xmm_c); cc.vpcmpgtd(xmm_a, xmm_b, xmm_c); cc.vpcmpgtq(xmm_a, xmm_b, xmm_c); cc.vpcmpgtw(xmm_a, xmm_b, xmm_c); cc.vpermilpd(xmm_a, xmm_b, xmm_c); cc.vpermilpd(ymm_a, ymm_b, ymm_c); cc.vpermilpd(xmm_a, xmm_b, 0); cc.vpermilpd(ymm_a, ymm_b, 0); cc.vpermilps(xmm_a, xmm_b, xmm_c); cc.vpermilps(ymm_a, ymm_b, ymm_c); cc.vpermilps(xmm_a, xmm_b, 0); cc.vpermilps(ymm_a, ymm_b, 0); cc.vperm2f128(ymm_a, ymm_b, ymm_c, 0); cc.vpextrb(gpd, xmm_b, 0); cc.vpextrd(gpd, xmm_b, 0); if (cc.is_64bit()) cc.vpextrq(gpq, xmm_b, 0); cc.vpextrw(gpd, xmm_b, 0); cc.vphaddd(xmm_a, xmm_b, xmm_c); cc.vphaddsw(xmm_a, xmm_b, xmm_c); cc.vphaddw(xmm_a, xmm_b, xmm_c); cc.vphminposuw(xmm_a, xmm_b); cc.vphsubd(xmm_a, xmm_b, xmm_c); cc.vphsubsw(xmm_a, xmm_b, xmm_c); cc.vphsubw(xmm_a, xmm_b, xmm_c); cc.vpinsrb(xmm_a, xmm_b, gpd, 0); cc.vpinsrd(xmm_a, xmm_b, gpd, 0); cc.vpinsrw(xmm_a, xmm_b, gpd, 0); cc.vpmaddubsw(xmm_a, xmm_b, xmm_c); cc.vpmaddwd(xmm_a, xmm_b, xmm_c); cc.vpmaxsb(xmm_a, xmm_b, xmm_c); cc.vpmaxsd(xmm_a, xmm_b, xmm_c); cc.vpmaxsw(xmm_a, xmm_b, xmm_c); cc.vpmaxub(xmm_a, xmm_b, xmm_c); cc.vpmaxud(xmm_a, xmm_b, xmm_c); cc.vpmaxuw(xmm_a, xmm_b, xmm_c); cc.vpminsb(xmm_a, xmm_b, xmm_c); cc.vpminsd(xmm_a, xmm_b, xmm_c); cc.vpminsw(xmm_a, xmm_b, xmm_c); cc.vpminub(xmm_a, xmm_b, xmm_c); cc.vpminud(xmm_a, xmm_b, xmm_c); cc.vpminuw(xmm_a, xmm_b, xmm_c); cc.vpmovmskb(gpd, xmm_b); cc.vpmovsxbd(xmm_a, xmm_b); cc.vpmovsxbq(xmm_a, xmm_b); cc.vpmovsxbw(xmm_a, xmm_b); cc.vpmovsxdq(xmm_a, xmm_b); cc.vpmovsxwd(xmm_a, xmm_b); cc.vpmovsxwq(xmm_a, xmm_b); cc.vpmovzxbd(xmm_a, xmm_b); cc.vpmovzxbq(xmm_a, xmm_b); cc.vpmovzxbw(xmm_a, xmm_b); cc.vpmovzxdq(xmm_a, xmm_b); cc.vpmovzxwd(xmm_a, xmm_b); cc.vpmovzxwq(xmm_a, xmm_b); cc.vpmuldq(xmm_a, xmm_b, xmm_c); cc.vpmulhrsw(xmm_a, xmm_b, xmm_c); cc.vpmulhuw(xmm_a, 
xmm_b, xmm_c); cc.vpmulhw(xmm_a, xmm_b, xmm_c); cc.vpmulld(xmm_a, xmm_b, xmm_c); cc.vpmullw(xmm_a, xmm_b, xmm_c); cc.vpmuludq(xmm_a, xmm_b, xmm_c); cc.vpor(xmm_a, xmm_b, xmm_c); cc.vpsadbw(xmm_a, xmm_b, xmm_c); cc.vpshufb(xmm_a, xmm_b, xmm_c); cc.vpshufd(xmm_a, xmm_b, 0); cc.vpshufhw(xmm_a, xmm_b, 0); cc.vpshuflw(xmm_a, xmm_b, 0); cc.vpsignb(xmm_a, xmm_b, xmm_c); cc.vpsignd(xmm_a, xmm_b, xmm_c); cc.vpsignw(xmm_a, xmm_b, xmm_c); cc.vpslld(xmm_a, xmm_b, xmm_c); cc.vpslld(xmm_a, xmm_b, 0); cc.vpslldq(xmm_a, xmm_b, 0); cc.vpsllq(xmm_a, xmm_b, xmm_c); cc.vpsllq(xmm_a, xmm_b, 0); cc.vpsllw(xmm_a, xmm_b, xmm_c); cc.vpsllw(xmm_a, xmm_b, 0); cc.vpsrad(xmm_a, xmm_b, xmm_c); cc.vpsrad(xmm_a, xmm_b, 0); cc.vpsraw(xmm_a, xmm_b, xmm_c); cc.vpsraw(xmm_a, xmm_b, 0); cc.vpsrld(xmm_a, xmm_b, xmm_c); cc.vpsrld(xmm_a, xmm_b, 0); cc.vpsrldq(xmm_a, xmm_b, 0); cc.vpsrlq(xmm_a, xmm_b, xmm_c); cc.vpsrlq(xmm_a, xmm_b, 0); cc.vpsrlw(xmm_a, xmm_b, xmm_c); cc.vpsrlw(xmm_a, xmm_b, 0); cc.vpsubb(xmm_a, xmm_b, xmm_c); cc.vpsubd(xmm_a, xmm_b, xmm_c); cc.vpsubq(xmm_a, xmm_b, xmm_c); cc.vpsubw(xmm_a, xmm_b, xmm_c); cc.vpsubsb(xmm_a, xmm_b, xmm_c); cc.vpsubsw(xmm_a, xmm_b, xmm_c); cc.vpsubusb(xmm_a, xmm_b, xmm_c); cc.vpsubusw(xmm_a, xmm_b, xmm_c); cc.vptest(xmm_a, xmm_b); cc.vptest(ymm_a, ymm_b); cc.vpunpckhbw(xmm_a, xmm_b, xmm_c); cc.vpunpckhdq(xmm_a, xmm_b, xmm_c); cc.vpunpckhqdq(xmm_a, xmm_b, xmm_c); cc.vpunpckhwd(xmm_a, xmm_b, xmm_c); cc.vpunpcklbw(xmm_a, xmm_b, xmm_c); cc.vpunpckldq(xmm_a, xmm_b, xmm_c); cc.vpunpcklqdq(xmm_a, xmm_b, xmm_c); cc.vpunpcklwd(xmm_a, xmm_b, xmm_c); cc.vpxor(xmm_a, xmm_b, xmm_c); cc.vrcpps(xmm_a, xmm_b); cc.vrcpps(ymm_a, ymm_b); cc.vrcpss(xmm_a, xmm_b, xmm_c); cc.vrsqrtps(xmm_a, xmm_b); cc.vrsqrtps(ymm_a, ymm_b); cc.vrsqrtss(xmm_a, xmm_b, xmm_c); cc.vroundpd(xmm_a, xmm_b, 0); cc.vroundpd(ymm_a, ymm_b, 0); cc.vroundps(xmm_a, xmm_b, 0); cc.vroundps(ymm_a, ymm_b, 0); cc.vroundsd(xmm_a, xmm_b, xmm_c, 0); cc.vroundss(xmm_a, xmm_b, xmm_c, 0); cc.vshufpd(xmm_a, xmm_b, xmm_c, 0); cc.vshufpd(ymm_a, ymm_b, ymm_c, 0); cc.vshufps(xmm_a, xmm_b, xmm_c, 0); cc.vshufps(ymm_a, ymm_b, ymm_c, 0); cc.vsqrtpd(xmm_a, xmm_b); cc.vsqrtpd(ymm_a, ymm_b); cc.vsqrtps(xmm_a, xmm_b); cc.vsqrtps(ymm_a, ymm_b); cc.vsqrtsd(xmm_a, xmm_b, xmm_c); cc.vsqrtss(xmm_a, xmm_b, xmm_c); cc.vsubpd(xmm_a, xmm_b, xmm_c); cc.vsubpd(ymm_a, ymm_b, ymm_c); cc.vsubps(xmm_a, xmm_b, xmm_c); cc.vsubps(ymm_a, ymm_b, ymm_c); cc.vsubsd(xmm_a, xmm_b, xmm_c); cc.vsubss(xmm_a, xmm_b, xmm_c); cc.vtestps(xmm_a, xmm_b); cc.vtestps(ymm_a, ymm_b); cc.vtestpd(xmm_a, xmm_b); cc.vtestpd(ymm_a, ymm_b); cc.vucomisd(xmm_a, xmm_b); cc.vucomiss(xmm_a, xmm_b); cc.vunpckhpd(xmm_a, xmm_b, xmm_c); cc.vunpckhpd(ymm_a, ymm_b, ymm_c); cc.vunpckhps(xmm_a, xmm_b, xmm_c); cc.vunpckhps(ymm_a, ymm_b, ymm_c); cc.vunpcklpd(xmm_a, xmm_b, xmm_c); cc.vunpcklpd(ymm_a, ymm_b, ymm_c); cc.vunpcklps(xmm_a, xmm_b, xmm_c); cc.vunpcklps(ymm_a, ymm_b, ymm_c); cc.vxorpd(xmm_a, xmm_b, xmm_c); cc.vxorpd(ymm_a, ymm_b, ymm_c); cc.vxorps(xmm_a, xmm_b, xmm_c); cc.vxorps(ymm_a, ymm_b, ymm_c); // AVX+AESNI. cc.vaesdec(xmm_a, xmm_b, xmm_c); cc.vaesdeclast(xmm_a, xmm_b, xmm_c); cc.vaesenc(xmm_a, xmm_b, xmm_c); cc.vaesenclast(xmm_a, xmm_b, xmm_c); cc.vaesimc(xmm_a, xmm_b); cc.vaeskeygenassist(xmm_a, xmm_b, 0); // AVX+PCLMULQDQ. cc.vpclmulqdq(xmm_a, xmm_b, xmm_c, 0); // AVX2. 
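    // AVX2: the 128-bit integer ops above gain 256-bit forms, plus register
    // broadcasts (vpbroadcast*), cross-lane permutes (vperm*), and
    // per-element variable shifts further below.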
cc.vbroadcastsd(ymm_a, xmm_b); cc.vbroadcastss(xmm_a, xmm_b); cc.vbroadcastss(ymm_a, xmm_b); cc.vextracti128(xmm_a, ymm_b, 0); cc.vinserti128(ymm_a, ymm_b, xmm_c, 0); cc.vmpsadbw(ymm_a, ymm_b, ymm_c, 0); cc.vpabsb(ymm_a, ymm_b); cc.vpabsd(ymm_a, ymm_b); cc.vpabsw(ymm_a, ymm_b); cc.vpackssdw(ymm_a, ymm_b, ymm_c); cc.vpacksswb(ymm_a, ymm_b, ymm_c); cc.vpackusdw(ymm_a, ymm_b, ymm_c); cc.vpackuswb(ymm_a, ymm_b, ymm_c); cc.vpaddb(ymm_a, ymm_b, ymm_c); cc.vpaddd(ymm_a, ymm_b, ymm_c); cc.vpaddq(ymm_a, ymm_b, ymm_c); cc.vpaddw(ymm_a, ymm_b, ymm_c); cc.vpaddsb(ymm_a, ymm_b, ymm_c); cc.vpaddsw(ymm_a, ymm_b, ymm_c); cc.vpaddusb(ymm_a, ymm_b, ymm_c); cc.vpaddusw(ymm_a, ymm_b, ymm_c); cc.vpalignr(ymm_a, ymm_b, ymm_c, 0); cc.vpand(ymm_a, ymm_b, ymm_c); cc.vpandn(ymm_a, ymm_b, ymm_c); cc.vpavgb(ymm_a, ymm_b, ymm_c); cc.vpavgw(ymm_a, ymm_b, ymm_c); cc.vpblendd(xmm_a, xmm_b, xmm_c, 0); cc.vpblendd(ymm_a, ymm_b, ymm_c, 0); cc.vpblendvb(ymm_a, ymm_b, ymm_c, ymm_a); cc.vpblendw(ymm_a, ymm_b, ymm_c, 0); cc.vpbroadcastb(xmm_a, xmm_b); cc.vpbroadcastb(ymm_a, xmm_b); cc.vpbroadcastd(xmm_a, xmm_b); cc.vpbroadcastd(ymm_a, xmm_b); cc.vpbroadcastq(xmm_a, xmm_b); cc.vpbroadcastq(ymm_a, xmm_b); cc.vpbroadcastw(xmm_a, xmm_b); cc.vpbroadcastw(ymm_a, xmm_b); cc.vpcmpeqb(ymm_a, ymm_b, ymm_c); cc.vpcmpeqd(ymm_a, ymm_b, ymm_c); cc.vpcmpeqq(ymm_a, ymm_b, ymm_c); cc.vpcmpeqw(ymm_a, ymm_b, ymm_c); cc.vpcmpgtb(ymm_a, ymm_b, ymm_c); cc.vpcmpgtd(ymm_a, ymm_b, ymm_c); cc.vpcmpgtq(ymm_a, ymm_b, ymm_c); cc.vpcmpgtw(ymm_a, ymm_b, ymm_c); cc.vperm2i128(ymm_a, ymm_b, ymm_c, 0); cc.vpermd(ymm_a, ymm_b, ymm_c); cc.vpermps(ymm_a, ymm_b, ymm_c); cc.vpermpd(ymm_a, ymm_b, 0); cc.vpermq(ymm_a, ymm_b, 0); cc.vpmovmskb(gpd, ymm_b); cc.vpmovsxbd(ymm_a, xmm_b); cc.vpmovsxbq(ymm_a, xmm_b); cc.vpmovsxbw(ymm_a, xmm_b); cc.vpmovsxdq(ymm_a, xmm_b); cc.vpmovsxwd(ymm_a, xmm_b); cc.vpmovsxwq(ymm_a, xmm_b); cc.vpmovzxbd(ymm_a, xmm_b); cc.vpmovzxbq(ymm_a, xmm_b); cc.vpmovzxbw(ymm_a, xmm_b); cc.vpmovzxdq(ymm_a, xmm_b); cc.vpmovzxwd(ymm_a, xmm_b); cc.vpmovzxwq(ymm_a, xmm_b); cc.vpshufd(ymm_a, ymm_b, 0); cc.vpshufhw(ymm_a, ymm_b, 0); cc.vpshuflw(ymm_a, ymm_b, 0); cc.vpslld(ymm_a, ymm_b, 0); cc.vpslldq(ymm_a, ymm_b, 0); cc.vpsllq(ymm_a, ymm_b, 0); cc.vpsllw(ymm_a, ymm_b, 0); cc.vpsrad(ymm_a, ymm_b, 0); cc.vpsraw(ymm_a, ymm_b, 0); cc.vpsrld(ymm_a, ymm_b, 0); cc.vpsrldq(ymm_a, ymm_b, 0); cc.vpsrlq(ymm_a, ymm_b, 0); cc.vpsrlw(ymm_a, ymm_b, 0); cc.vphaddd(ymm_a, ymm_b, ymm_c); cc.vphaddsw(ymm_a, ymm_b, ymm_c); cc.vphaddw(ymm_a, ymm_b, ymm_c); cc.vphsubd(ymm_a, ymm_b, ymm_c); cc.vphsubsw(ymm_a, ymm_b, ymm_c); cc.vphsubw(ymm_a, ymm_b, ymm_c); cc.vpmaddubsw(ymm_a, ymm_b, ymm_c); cc.vpmaddwd(ymm_a, ymm_b, ymm_c); cc.vpmaxsb(ymm_a, ymm_b, ymm_c); cc.vpmaxsd(ymm_a, ymm_b, ymm_c); cc.vpmaxsw(ymm_a, ymm_b, ymm_c); cc.vpmaxub(ymm_a, ymm_b, ymm_c); cc.vpmaxud(ymm_a, ymm_b, ymm_c); cc.vpmaxuw(ymm_a, ymm_b, ymm_c); cc.vpminsb(ymm_a, ymm_b, ymm_c); cc.vpminsd(ymm_a, ymm_b, ymm_c); cc.vpminsw(ymm_a, ymm_b, ymm_c); cc.vpminub(ymm_a, ymm_b, ymm_c); cc.vpminud(ymm_a, ymm_b, ymm_c); cc.vpminuw(ymm_a, ymm_b, ymm_c); cc.vpmuldq(ymm_a, ymm_b, ymm_c); cc.vpmulhrsw(ymm_a, ymm_b, ymm_c); cc.vpmulhuw(ymm_a, ymm_b, ymm_c); cc.vpmulhw(ymm_a, ymm_b, ymm_c); cc.vpmulld(ymm_a, ymm_b, ymm_c); cc.vpmullw(ymm_a, ymm_b, ymm_c); cc.vpmuludq(ymm_a, ymm_b, ymm_c); cc.vpor(ymm_a, ymm_b, ymm_c); cc.vpsadbw(ymm_a, ymm_b, ymm_c); cc.vpshufb(ymm_a, ymm_b, ymm_c); cc.vpsignb(ymm_a, ymm_b, ymm_c); cc.vpsignd(ymm_a, ymm_b, ymm_c); cc.vpsignw(ymm_a, ymm_b, ymm_c); cc.vpslld(ymm_a, ymm_b, xmm_c); 
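    // Even for 256-bit vectors the uniform shift count comes from an XMM
    // register (its low 64 bits); the vpsllv*/vpsrav*/vpsrlv* forms shift
    // each element by its own per-lane count instead.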
cc.vpsllq(ymm_a, ymm_b, xmm_c); cc.vpsllvd(xmm_a, xmm_b, xmm_c); cc.vpsllvd(ymm_a, ymm_b, ymm_c); cc.vpsllvq(xmm_a, xmm_b, xmm_c); cc.vpsllvq(ymm_a, ymm_b, ymm_c); cc.vpsllw(ymm_a, ymm_b, xmm_c); cc.vpsrad(ymm_a, ymm_b, xmm_c); cc.vpsravd(xmm_a, xmm_b, xmm_c); cc.vpsravd(ymm_a, ymm_b, ymm_c); cc.vpsraw(ymm_a, ymm_b, xmm_c); cc.vpsrld(ymm_a, ymm_b, xmm_c); cc.vpsrlq(ymm_a, ymm_b, xmm_c); cc.vpsrlvd(xmm_a, xmm_b, xmm_c); cc.vpsrlvd(ymm_a, ymm_b, ymm_c); cc.vpsrlvq(xmm_a, xmm_b, xmm_c); cc.vpsrlvq(ymm_a, ymm_b, ymm_c); cc.vpsrlw(ymm_a, ymm_b, xmm_c); cc.vpsubb(ymm_a, ymm_b, ymm_c); cc.vpsubd(ymm_a, ymm_b, ymm_c); cc.vpsubq(ymm_a, ymm_b, ymm_c); cc.vpsubsb(ymm_a, ymm_b, ymm_c); cc.vpsubsw(ymm_a, ymm_b, ymm_c); cc.vpsubusb(ymm_a, ymm_b, ymm_c); cc.vpsubusw(ymm_a, ymm_b, ymm_c); cc.vpsubw(ymm_a, ymm_b, ymm_c); cc.vpunpckhbw(ymm_a, ymm_b, ymm_c); cc.vpunpckhdq(ymm_a, ymm_b, ymm_c); cc.vpunpckhqdq(ymm_a, ymm_b, ymm_c); cc.vpunpckhwd(ymm_a, ymm_b, ymm_c); cc.vpunpcklbw(ymm_a, ymm_b, ymm_c); cc.vpunpckldq(ymm_a, ymm_b, ymm_c); cc.vpunpcklqdq(ymm_a, ymm_b, ymm_c); cc.vpunpcklwd(ymm_a, ymm_b, ymm_c); cc.vpxor(ymm_a, ymm_b, ymm_c); // FMA. cc.vfmadd132pd(xmm_a, xmm_b, xmm_c); cc.vfmadd132pd(ymm_a, ymm_b, ymm_c); cc.vfmadd132ps(xmm_a, xmm_b, xmm_c); cc.vfmadd132ps(ymm_a, ymm_b, ymm_c); cc.vfmadd132sd(xmm_a, xmm_b, xmm_c); cc.vfmadd132ss(xmm_a, xmm_b, xmm_c); cc.vfmadd213pd(xmm_a, xmm_b, xmm_c); cc.vfmadd213pd(ymm_a, ymm_b, ymm_c); cc.vfmadd213ps(xmm_a, xmm_b, xmm_c); cc.vfmadd213ps(ymm_a, ymm_b, ymm_c); cc.vfmadd213sd(xmm_a, xmm_b, xmm_c); cc.vfmadd213ss(xmm_a, xmm_b, xmm_c); cc.vfmadd231pd(xmm_a, xmm_b, xmm_c); cc.vfmadd231pd(ymm_a, ymm_b, ymm_c); cc.vfmadd231ps(xmm_a, xmm_b, xmm_c); cc.vfmadd231ps(ymm_a, ymm_b, ymm_c); cc.vfmadd231sd(xmm_a, xmm_b, xmm_c); cc.vfmadd231ss(xmm_a, xmm_b, xmm_c); cc.vfmaddsub132pd(xmm_a, xmm_b, xmm_c); cc.vfmaddsub132pd(ymm_a, ymm_b, ymm_c); cc.vfmaddsub132ps(xmm_a, xmm_b, xmm_c); cc.vfmaddsub132ps(ymm_a, ymm_b, ymm_c); cc.vfmaddsub213pd(xmm_a, xmm_b, xmm_c); cc.vfmaddsub213pd(ymm_a, ymm_b, ymm_c); cc.vfmaddsub213ps(xmm_a, xmm_b, xmm_c); cc.vfmaddsub213ps(ymm_a, ymm_b, ymm_c); cc.vfmaddsub231pd(xmm_a, xmm_b, xmm_c); cc.vfmaddsub231pd(ymm_a, ymm_b, ymm_c); cc.vfmaddsub231ps(xmm_a, xmm_b, xmm_c); cc.vfmaddsub231ps(ymm_a, ymm_b, ymm_c); cc.vfmsub132pd(xmm_a, xmm_b, xmm_c); cc.vfmsub132pd(ymm_a, ymm_b, ymm_c); cc.vfmsub132ps(xmm_a, xmm_b, xmm_c); cc.vfmsub132ps(ymm_a, ymm_b, ymm_c); cc.vfmsub132sd(xmm_a, xmm_b, xmm_c); cc.vfmsub132ss(xmm_a, xmm_b, xmm_c); cc.vfmsub213pd(xmm_a, xmm_b, xmm_c); cc.vfmsub213pd(ymm_a, ymm_b, ymm_c); cc.vfmsub213ps(xmm_a, xmm_b, xmm_c); cc.vfmsub213ps(ymm_a, ymm_b, ymm_c); cc.vfmsub213sd(xmm_a, xmm_b, xmm_c); cc.vfmsub213ss(xmm_a, xmm_b, xmm_c); cc.vfmsub231pd(xmm_a, xmm_b, xmm_c); cc.vfmsub231pd(ymm_a, ymm_b, ymm_c); cc.vfmsub231ps(xmm_a, xmm_b, xmm_c); cc.vfmsub231ps(ymm_a, ymm_b, ymm_c); cc.vfmsub231sd(xmm_a, xmm_b, xmm_c); cc.vfmsub231ss(xmm_a, xmm_b, xmm_c); cc.vfmsubadd132pd(xmm_a, xmm_b, xmm_c); cc.vfmsubadd132pd(ymm_a, ymm_b, ymm_c); cc.vfmsubadd132ps(xmm_a, xmm_b, xmm_c); cc.vfmsubadd132ps(ymm_a, ymm_b, ymm_c); cc.vfmsubadd213pd(xmm_a, xmm_b, xmm_c); cc.vfmsubadd213pd(ymm_a, ymm_b, ymm_c); cc.vfmsubadd213ps(xmm_a, xmm_b, xmm_c); cc.vfmsubadd213ps(ymm_a, ymm_b, ymm_c); cc.vfmsubadd231pd(xmm_a, xmm_b, xmm_c); cc.vfmsubadd231pd(ymm_a, ymm_b, ymm_c); cc.vfmsubadd231ps(xmm_a, xmm_b, xmm_c); cc.vfmsubadd231ps(ymm_a, ymm_b, ymm_c); cc.vfnmadd132pd(xmm_a, xmm_b, xmm_c); cc.vfnmadd132pd(ymm_a, ymm_b, ymm_c); cc.vfnmadd132ps(xmm_a, xmm_b, xmm_c); 
  cc.vfnmadd132ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd132sd(xmm_a, xmm_b, xmm_c); cc.vfnmadd132ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd213pd(xmm_a, xmm_b, xmm_c); cc.vfnmadd213pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd213ps(xmm_a, xmm_b, xmm_c); cc.vfnmadd213ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd213sd(xmm_a, xmm_b, xmm_c); cc.vfnmadd213ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmadd231pd(xmm_a, xmm_b, xmm_c); cc.vfnmadd231pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd231ps(xmm_a, xmm_b, xmm_c); cc.vfnmadd231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmadd231sd(xmm_a, xmm_b, xmm_c); cc.vfnmadd231ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub132pd(xmm_a, xmm_b, xmm_c); cc.vfnmsub132pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub132ps(xmm_a, xmm_b, xmm_c); cc.vfnmsub132ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub132sd(xmm_a, xmm_b, xmm_c); cc.vfnmsub132ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub213pd(xmm_a, xmm_b, xmm_c); cc.vfnmsub213pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub213ps(xmm_a, xmm_b, xmm_c); cc.vfnmsub213ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub213sd(xmm_a, xmm_b, xmm_c); cc.vfnmsub213ss(xmm_a, xmm_b, xmm_c);
  cc.vfnmsub231pd(xmm_a, xmm_b, xmm_c); cc.vfnmsub231pd(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub231ps(xmm_a, xmm_b, xmm_c); cc.vfnmsub231ps(ymm_a, ymm_b, ymm_c);
  cc.vfnmsub231sd(xmm_a, xmm_b, xmm_c); cc.vfnmsub231ss(xmm_a, xmm_b, xmm_c);
}

// Generates a long sequence of AVX instructions.
template<typename Emitter>
static void generate_avx_sequence_internal_reg_mem(
  Emitter& cc,
  const x86::Gp& gp,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_c = vec_c.ymm();
  x86::Vec ymm_d = vec_d.ymm();

  x86::Mem m = x86::ptr(gpz);
  x86::Mem m128 = x86::xmmword_ptr(gpz);  // 16-byte view.
  x86::Mem m256 = x86::ymmword_ptr(gpz);  // 32-byte view.
  x86::Mem vx_ptr = x86::ptr(gpz, xmm_d);
  x86::Mem vy_ptr = x86::ptr(gpz, ymm_d);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  cc.vaddpd(xmm_a, xmm_b, m); cc.vaddpd(ymm_a, ymm_b, m);
  cc.vaddps(xmm_a, xmm_b, m); cc.vaddps(ymm_a, ymm_b, m);
  cc.vaddsd(xmm_a, xmm_b, m); cc.vaddss(xmm_a, xmm_b, m);
  cc.vaddsubpd(xmm_a, xmm_b, m); cc.vaddsubpd(ymm_a, ymm_b, m);
  cc.vaddsubps(xmm_a, xmm_b, m); cc.vaddsubps(ymm_a, ymm_b, m);
  cc.vandpd(xmm_a, xmm_b, m); cc.vandpd(ymm_a, ymm_b, m);
  cc.vandps(xmm_a, xmm_b, m); cc.vandps(ymm_a, ymm_b, m);
  cc.vandnpd(xmm_a, xmm_b, m); cc.vandnpd(ymm_a, ymm_b, m);
  cc.vandnps(xmm_a, xmm_b, m); cc.vandnps(ymm_a, ymm_b, m);
  cc.vblendpd(xmm_a, xmm_b, m, 0); cc.vblendpd(ymm_a, ymm_b, m, 0);
  cc.vblendps(xmm_a, xmm_b, m, 0); cc.vblendps(ymm_a, ymm_b, m, 0);
  cc.vblendvpd(xmm_a, xmm_b, m, xmm_a); cc.vblendvpd(ymm_a, ymm_b, m, ymm_a);
  cc.vbroadcastf128(ymm_a, m);
  cc.vbroadcastsd(ymm_a, m);
  cc.vbroadcastss(xmm_a, m); cc.vbroadcastss(ymm_a, m);
  cc.vcmppd(xmm_a, xmm_b, m, 0); cc.vcmppd(ymm_a, ymm_b, m, 0);
  cc.vcmpps(xmm_a, xmm_b, m, 0); cc.vcmpps(ymm_a, ymm_b, m, 0);
  cc.vcmpsd(xmm_a, xmm_b, m, 0); cc.vcmpss(xmm_a, xmm_b, m, 0);
  cc.vcomisd(xmm_a, m); cc.vcomiss(xmm_a, m);
  cc.vcvtdq2pd(xmm_a, m); cc.vcvtdq2pd(ymm_a, m);
  cc.vcvtdq2ps(xmm_a, m); cc.vcvtdq2ps(ymm_a, m);
  cc.vcvtpd2dq(xmm_a, m128); cc.vcvtpd2dq(xmm_a, m256);
  cc.vcvtpd2ps(xmm_a, m128); cc.vcvtpd2ps(xmm_a, m256);
  cc.vcvtps2dq(xmm_a, m); cc.vcvtps2dq(ymm_a, m);
  cc.vcvtps2pd(xmm_a, m); cc.vcvtps2pd(ymm_a, m);
  cc.vcvtsd2si(gpd, m);
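  // The VEX scalar conversions below (vcvtsd2ss, vcvtsi2sd, vcvtss2sd, ...)
  // take a second XMM source operand that supplies the untouched upper
  // elements of the destination register.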
cc.vcvtsd2ss(xmm_a, xmm_b, m); cc.vcvtsi2sd(xmm_a, xmm_b, m); cc.vcvtsi2ss(xmm_a, xmm_b, m); cc.vcvtss2sd(xmm_a, xmm_b, m); cc.vcvtss2si(gpd, m); cc.vcvttpd2dq(xmm_a, m128); cc.vcvttpd2dq(xmm_a, m256); cc.vcvttps2dq(xmm_a, m); cc.vcvttps2dq(ymm_a, m); cc.vcvttsd2si(gpd, m); cc.vcvttss2si(gpd, m); cc.vdivpd(xmm_a, xmm_b, m); cc.vdivpd(ymm_a, ymm_b, m); cc.vdivps(xmm_a, xmm_b, m); cc.vdivps(ymm_a, ymm_b, m); cc.vdivsd(xmm_a, xmm_b, m); cc.vdivss(xmm_a, xmm_b, m); cc.vdppd(xmm_a, xmm_b, m, 0); cc.vdpps(xmm_a, xmm_b, m, 0); cc.vdpps(ymm_a, ymm_b, m, 0); cc.vextractf128(m, ymm_b, 0); cc.vextractps(m, xmm_b, 0); cc.vhaddpd(xmm_a, xmm_b, m); cc.vhaddpd(ymm_a, ymm_b, m); cc.vhaddps(xmm_a, xmm_b, m); cc.vhaddps(ymm_a, ymm_b, m); cc.vhsubpd(xmm_a, xmm_b, m); cc.vhsubpd(ymm_a, ymm_b, m); cc.vhsubps(xmm_a, xmm_b, m); cc.vhsubps(ymm_a, ymm_b, m); cc.vinsertf128(ymm_a, ymm_b, m, 0); cc.vinsertps(xmm_a, xmm_b, m, 0); cc.vlddqu(xmm_a, m); cc.vlddqu(ymm_a, m); cc.vmaskmovps(xmm_a, xmm_b, m); cc.vmaskmovps(ymm_a, ymm_b, m); cc.vmaskmovps(m, xmm_b, xmm_c); cc.vmaskmovps(m, ymm_b, ymm_c); cc.vmaskmovpd(xmm_a, xmm_b, m); cc.vmaskmovpd(ymm_a, ymm_b, m); cc.vmaskmovpd(m, xmm_b, xmm_c); cc.vmaskmovpd(m, ymm_b, ymm_c); cc.vmaxpd(xmm_a, xmm_b, m); cc.vmaxpd(ymm_a, ymm_b, m); cc.vmaxps(xmm_a, xmm_b, m); cc.vmaxps(ymm_a, ymm_b, m); cc.vmaxsd(xmm_a, xmm_b, m); cc.vmaxss(xmm_a, xmm_b, m); cc.vminpd(xmm_a, xmm_b, m); cc.vminpd(ymm_a, ymm_b, m); cc.vminps(xmm_a, xmm_b, m); cc.vminps(ymm_a, ymm_b, m); cc.vminsd(xmm_a, xmm_b, m); cc.vminss(xmm_a, xmm_b, m); cc.vmovapd(xmm_a, m); cc.vmovapd(m, xmm_b); cc.vmovapd(ymm_a, m); cc.vmovapd(m, ymm_b); cc.vmovaps(xmm_a, m); cc.vmovaps(m, xmm_b); cc.vmovaps(ymm_a, m); cc.vmovaps(m, ymm_b); cc.vmovd(xmm_a, m); cc.vmovd(m, xmm_b); cc.vmovddup(xmm_a, m); cc.vmovddup(ymm_a, m); cc.vmovdqa(xmm_a, m); cc.vmovdqa(m, xmm_b); cc.vmovdqa(ymm_a, m); cc.vmovdqa(m, ymm_b); cc.vmovdqu(xmm_a, m); cc.vmovdqu(m, xmm_b); cc.vmovdqu(ymm_a, m); cc.vmovdqu(m, ymm_b); cc.vmovhpd(xmm_a, xmm_b, m); cc.vmovhps(xmm_a, xmm_b, m); cc.vmovhps(m, xmm_b); cc.vmovlpd(xmm_a, xmm_b, m); cc.vmovlpd(m, xmm_b); cc.vmovlps(xmm_a, xmm_b, m); cc.vmovlps(m, xmm_b); cc.vmovntdq(m, xmm_b); cc.vmovntdq(m, ymm_b); cc.vmovntdqa(xmm_a, m); cc.vmovntpd(m, xmm_b); cc.vmovntpd(m, ymm_b); cc.vmovntps(m, xmm_b); cc.vmovntps(m, ymm_b); cc.vmovsd(xmm_a, m); cc.vmovsd(m, xmm_b); cc.vmovshdup(xmm_a, m); cc.vmovshdup(ymm_a, m); cc.vmovsldup(xmm_a, m); cc.vmovsldup(ymm_a, m); cc.vmovss(xmm_a, m); cc.vmovss(m, xmm_b); cc.vmovupd(xmm_a, m); cc.vmovupd(m, xmm_b); cc.vmovupd(ymm_a, m); cc.vmovupd(m, ymm_b); cc.vmovups(xmm_a, m); cc.vmovups(m, xmm_b); cc.vmovups(ymm_a, m); cc.vmovups(m, ymm_b); cc.vmpsadbw(xmm_a, xmm_b, m, 0); cc.vmulpd(xmm_a, xmm_b, m); cc.vmulpd(ymm_a, ymm_b, m); cc.vmulps(xmm_a, xmm_b, m); cc.vmulps(ymm_a, ymm_b, m); cc.vmulsd(xmm_a, xmm_b, m); cc.vmulss(xmm_a, xmm_b, m); cc.vorpd(xmm_a, xmm_b, m); cc.vorpd(ymm_a, ymm_b, m); cc.vorps(xmm_a, xmm_b, m); cc.vorps(ymm_a, ymm_b, m); cc.vpabsb(xmm_a, m); cc.vpabsd(xmm_a, m); cc.vpabsw(xmm_a, m); cc.vpackssdw(xmm_a, xmm_b, m); cc.vpacksswb(xmm_a, xmm_b, m); cc.vpackusdw(xmm_a, xmm_b, m); cc.vpackuswb(xmm_a, xmm_b, m); cc.vpaddb(xmm_a, xmm_b, m); cc.vpaddd(xmm_a, xmm_b, m); cc.vpaddq(xmm_a, xmm_b, m); cc.vpaddw(xmm_a, xmm_b, m); cc.vpaddsb(xmm_a, xmm_b, m); cc.vpaddsw(xmm_a, xmm_b, m); cc.vpaddusb(xmm_a, xmm_b, m); cc.vpaddusw(xmm_a, xmm_b, m); cc.vpalignr(xmm_a, xmm_b, m, 0); cc.vpand(xmm_a, xmm_b, m); cc.vpandn(xmm_a, xmm_b, m); cc.vpavgb(xmm_a, xmm_b, m); cc.vpavgw(xmm_a, xmm_b, 
m); cc.vpblendvb(xmm_a, xmm_b, m, xmm_a); cc.vpblendw(xmm_a, xmm_b, m, 0); cc.vpcmpeqb(xmm_a, xmm_b, m); cc.vpcmpeqd(xmm_a, xmm_b, m); cc.vpcmpeqq(xmm_a, xmm_b, m); cc.vpcmpeqw(xmm_a, xmm_b, m); cc.vpcmpgtb(xmm_a, xmm_b, m); cc.vpcmpgtd(xmm_a, xmm_b, m); cc.vpcmpgtq(xmm_a, xmm_b, m); cc.vpcmpgtw(xmm_a, xmm_b, m); cc.vpermilpd(xmm_a, xmm_b, m); cc.vpermilpd(ymm_a, ymm_b, m); cc.vpermilpd(xmm_a, m, 0); cc.vpermilpd(ymm_a, m, 0); cc.vpermilps(xmm_a, xmm_b, m); cc.vpermilps(ymm_a, ymm_b, m); cc.vpermilps(xmm_a, m, 0); cc.vpermilps(ymm_a, m, 0); cc.vperm2f128(ymm_a, ymm_b, m, 0); cc.vpextrb(m, xmm_b, 0); cc.vpextrd(m, xmm_b, 0); if (cc.is_64bit()) cc.vpextrq(m, xmm_b, 0); cc.vpextrw(m, xmm_b, 0); cc.vphaddd(xmm_a, xmm_b, m); cc.vphaddsw(xmm_a, xmm_b, m); cc.vphaddw(xmm_a, xmm_b, m); cc.vphminposuw(xmm_a, m); cc.vphsubd(xmm_a, xmm_b, m); cc.vphsubsw(xmm_a, xmm_b, m); cc.vphsubw(xmm_a, xmm_b, m); cc.vpinsrb(xmm_a, xmm_b, m, 0); cc.vpinsrd(xmm_a, xmm_b, m, 0); cc.vpinsrw(xmm_a, xmm_b, m, 0); cc.vpmaddubsw(xmm_a, xmm_b, m); cc.vpmaddwd(xmm_a, xmm_b, m); cc.vpmaxsb(xmm_a, xmm_b, m); cc.vpmaxsd(xmm_a, xmm_b, m); cc.vpmaxsw(xmm_a, xmm_b, m); cc.vpmaxub(xmm_a, xmm_b, m); cc.vpmaxud(xmm_a, xmm_b, m); cc.vpmaxuw(xmm_a, xmm_b, m); cc.vpminsb(xmm_a, xmm_b, m); cc.vpminsd(xmm_a, xmm_b, m); cc.vpminsw(xmm_a, xmm_b, m); cc.vpminub(xmm_a, xmm_b, m); cc.vpminud(xmm_a, xmm_b, m); cc.vpminuw(xmm_a, xmm_b, m); cc.vpmovsxbd(xmm_a, m); cc.vpmovsxbq(xmm_a, m); cc.vpmovsxbw(xmm_a, m); cc.vpmovsxdq(xmm_a, m); cc.vpmovsxwd(xmm_a, m); cc.vpmovsxwq(xmm_a, m); cc.vpmovzxbd(xmm_a, m); cc.vpmovzxbq(xmm_a, m); cc.vpmovzxbw(xmm_a, m); cc.vpmovzxdq(xmm_a, m); cc.vpmovzxwd(xmm_a, m); cc.vpmovzxwq(xmm_a, m); cc.vpmuldq(xmm_a, xmm_b, m); cc.vpmulhrsw(xmm_a, xmm_b, m); cc.vpmulhuw(xmm_a, xmm_b, m); cc.vpmulhw(xmm_a, xmm_b, m); cc.vpmulld(xmm_a, xmm_b, m); cc.vpmullw(xmm_a, xmm_b, m); cc.vpmuludq(xmm_a, xmm_b, m); cc.vpor(xmm_a, xmm_b, m); cc.vpsadbw(xmm_a, xmm_b, m); cc.vpshufb(xmm_a, xmm_b, m); cc.vpshufd(xmm_a, m, 0); cc.vpshufhw(xmm_a, m, 0); cc.vpshuflw(xmm_a, m, 0); cc.vpsignb(xmm_a, xmm_b, m); cc.vpsignd(xmm_a, xmm_b, m); cc.vpsignw(xmm_a, xmm_b, m); cc.vpslld(xmm_a, xmm_b, m); cc.vpsllq(xmm_a, xmm_b, m); cc.vpsllw(xmm_a, xmm_b, m); cc.vpsrad(xmm_a, xmm_b, m); cc.vpsraw(xmm_a, xmm_b, m); cc.vpsrld(xmm_a, xmm_b, m); cc.vpsrlq(xmm_a, xmm_b, m); cc.vpsrlw(xmm_a, xmm_b, m); cc.vpsubb(xmm_a, xmm_b, m); cc.vpsubd(xmm_a, xmm_b, m); cc.vpsubq(xmm_a, xmm_b, m); cc.vpsubw(xmm_a, xmm_b, m); cc.vpsubsb(xmm_a, xmm_b, m); cc.vpsubsw(xmm_a, xmm_b, m); cc.vpsubusb(xmm_a, xmm_b, m); cc.vpsubusw(xmm_a, xmm_b, m); cc.vptest(xmm_a, m); cc.vptest(ymm_a, m); cc.vpunpckhbw(xmm_a, xmm_b, m); cc.vpunpckhdq(xmm_a, xmm_b, m); cc.vpunpckhqdq(xmm_a, xmm_b, m); cc.vpunpckhwd(xmm_a, xmm_b, m); cc.vpunpcklbw(xmm_a, xmm_b, m); cc.vpunpckldq(xmm_a, xmm_b, m); cc.vpunpcklqdq(xmm_a, xmm_b, m); cc.vpunpcklwd(xmm_a, xmm_b, m); cc.vpxor(xmm_a, xmm_b, m); cc.vrcpps(xmm_a, m); cc.vrcpps(ymm_a, m); cc.vrcpss(xmm_a, xmm_b, m); cc.vrsqrtps(xmm_a, m); cc.vrsqrtps(ymm_a, m); cc.vrsqrtss(xmm_a, xmm_b, m); cc.vroundpd(xmm_a, m, 0); cc.vroundpd(ymm_a, m, 0); cc.vroundps(xmm_a, m, 0); cc.vroundps(ymm_a, m, 0); cc.vroundsd(xmm_a, xmm_b, m, 0); cc.vroundss(xmm_a, xmm_b, m, 0); cc.vshufpd(xmm_a, xmm_b, m, 0); cc.vshufpd(ymm_a, ymm_b, m, 0); cc.vshufps(xmm_a, xmm_b, m, 0); cc.vshufps(ymm_a, ymm_b, m, 0); cc.vsqrtpd(xmm_a, m); cc.vsqrtpd(ymm_a, m); cc.vsqrtps(xmm_a, m); cc.vsqrtps(ymm_a, m); cc.vsqrtsd(xmm_a, xmm_b, m); cc.vsqrtss(xmm_a, xmm_b, m); cc.vsubpd(xmm_a, xmm_b, m); 
cc.vsubpd(ymm_a, ymm_b, m); cc.vsubps(xmm_a, xmm_b, m); cc.vsubps(ymm_a, ymm_b, m); cc.vsubsd(xmm_a, xmm_b, m); cc.vsubss(xmm_a, xmm_b, m); cc.vtestps(xmm_a, m); cc.vtestps(ymm_a, m); cc.vtestpd(xmm_a, m); cc.vtestpd(ymm_a, m); cc.vucomisd(xmm_a, m); cc.vucomiss(xmm_a, m); cc.vunpckhpd(xmm_a, xmm_b, m); cc.vunpckhpd(ymm_a, ymm_b, m); cc.vunpckhps(xmm_a, xmm_b, m); cc.vunpckhps(ymm_a, ymm_b, m); cc.vunpcklpd(xmm_a, xmm_b, m); cc.vunpcklpd(ymm_a, ymm_b, m); cc.vunpcklps(xmm_a, xmm_b, m); cc.vunpcklps(ymm_a, ymm_b, m); cc.vxorpd(xmm_a, xmm_b, m); cc.vxorpd(ymm_a, ymm_b, m); cc.vxorps(xmm_a, xmm_b, m); cc.vxorps(ymm_a, ymm_b, m); // AVX+AESNI. cc.vaesdec(xmm_a, xmm_b, m); cc.vaesdeclast(xmm_a, xmm_b, m); cc.vaesenc(xmm_a, xmm_b, m); cc.vaesenclast(xmm_a, xmm_b, m); cc.vaesimc(xmm_a, m); cc.vaeskeygenassist(xmm_a, m, 0); // AVX+PCLMULQDQ. cc.vpclmulqdq(xmm_a, xmm_b, m, 0); // AVX2. cc.vbroadcasti128(ymm_a, m); cc.vextracti128(m, ymm_b, 0); cc.vgatherdpd(xmm_a, vx_ptr, xmm_c); cc.vgatherdpd(ymm_a, vx_ptr, ymm_c); cc.vgatherdps(xmm_a, vx_ptr, xmm_c); cc.vgatherdps(ymm_a, vy_ptr, ymm_c); cc.vgatherqpd(xmm_a, vx_ptr, xmm_c); cc.vgatherqpd(ymm_a, vy_ptr, ymm_c); cc.vgatherqps(xmm_a, vx_ptr, xmm_c); cc.vgatherqps(xmm_a, vy_ptr, xmm_c); cc.vinserti128(ymm_a, ymm_b, m, 0); cc.vmovntdqa(ymm_a, m); cc.vmpsadbw(ymm_a, ymm_b, m, 0); cc.vpabsb(ymm_a, m); cc.vpabsd(ymm_a, m); cc.vpabsw(ymm_a, m); cc.vpackssdw(ymm_a, ymm_b, m); cc.vpacksswb(ymm_a, ymm_b, m); cc.vpackusdw(ymm_a, ymm_b, m); cc.vpackuswb(ymm_a, ymm_b, m); cc.vpaddb(ymm_a, ymm_b, m); cc.vpaddd(ymm_a, ymm_b, m); cc.vpaddq(ymm_a, ymm_b, m); cc.vpaddw(ymm_a, ymm_b, m); cc.vpaddsb(ymm_a, ymm_b, m); cc.vpaddsw(ymm_a, ymm_b, m); cc.vpaddusb(ymm_a, ymm_b, m); cc.vpaddusw(ymm_a, ymm_b, m); cc.vpalignr(ymm_a, ymm_b, m, 0); cc.vpand(ymm_a, ymm_b, m); cc.vpandn(ymm_a, ymm_b, m); cc.vpavgb(ymm_a, ymm_b, m); cc.vpavgw(ymm_a, ymm_b, m); cc.vpblendd(xmm_a, xmm_b, m, 0); cc.vpblendd(ymm_a, ymm_b, m, 0); cc.vpblendvb(ymm_a, ymm_b, m, ymm_a); cc.vpblendw(ymm_a, ymm_b, m, 0); cc.vpbroadcastb(xmm_a, m); cc.vpbroadcastb(ymm_a, m); cc.vpbroadcastd(xmm_a, m); cc.vpbroadcastd(ymm_a, m); cc.vpbroadcastq(xmm_a, m); cc.vpbroadcastq(ymm_a, m); cc.vpbroadcastw(xmm_a, m); cc.vpbroadcastw(ymm_a, m); cc.vpcmpeqb(ymm_a, ymm_b, m); cc.vpcmpeqd(ymm_a, ymm_b, m); cc.vpcmpeqq(ymm_a, ymm_b, m); cc.vpcmpeqw(ymm_a, ymm_b, m); cc.vpcmpgtb(ymm_a, ymm_b, m); cc.vpcmpgtd(ymm_a, ymm_b, m); cc.vpcmpgtq(ymm_a, ymm_b, m); cc.vpcmpgtw(ymm_a, ymm_b, m); cc.vperm2i128(ymm_a, ymm_b, m, 0); cc.vpermd(ymm_a, ymm_b, m); cc.vpermps(ymm_a, ymm_b, m); cc.vpermpd(ymm_a, m, 0); cc.vpermq(ymm_a, m, 0); cc.vpgatherdd(xmm_a, vx_ptr, xmm_c); cc.vpgatherdd(ymm_a, vy_ptr, ymm_c); cc.vpgatherdq(xmm_a, vx_ptr, xmm_c); cc.vpgatherdq(ymm_a, vx_ptr, ymm_c); cc.vpgatherqd(xmm_a, vx_ptr, xmm_c); cc.vpgatherqd(xmm_a, vy_ptr, xmm_c); cc.vpgatherqq(xmm_a, vx_ptr, xmm_c); cc.vpgatherqq(ymm_a, vy_ptr, ymm_c); cc.vpmovsxbd(ymm_a, m); cc.vpmovsxbq(ymm_a, m); cc.vpmovsxbw(ymm_a, m); cc.vpmovsxdq(ymm_a, m); cc.vpmovsxwd(ymm_a, m); cc.vpmovsxwq(ymm_a, m); cc.vpmovzxbd(ymm_a, m); cc.vpmovzxbq(ymm_a, m); cc.vpmovzxbw(ymm_a, m); cc.vpmovzxdq(ymm_a, m); cc.vpmovzxwd(ymm_a, m); cc.vpmovzxwq(ymm_a, m); cc.vpshufd(ymm_a, m, 0); cc.vpshufhw(ymm_a, m, 0); cc.vpshuflw(ymm_a, m, 0); cc.vphaddd(ymm_a, ymm_b, m); cc.vphaddsw(ymm_a, ymm_b, m); cc.vphaddw(ymm_a, ymm_b, m); cc.vphsubd(ymm_a, ymm_b, m); cc.vphsubsw(ymm_a, ymm_b, m); cc.vphsubw(ymm_a, ymm_b, m); cc.vpmaddubsw(ymm_a, ymm_b, m); cc.vpmaddwd(ymm_a, ymm_b, m); cc.vpmaskmovd(m, 
xmm_b, xmm_c); cc.vpmaskmovd(m, ymm_b, ymm_c);
  cc.vpmaskmovd(xmm_a, xmm_b, m); cc.vpmaskmovd(ymm_a, ymm_b, m);
  cc.vpmaskmovq(m, xmm_b, xmm_c); cc.vpmaskmovq(m, ymm_b, ymm_c);
  cc.vpmaskmovq(xmm_a, xmm_b, m); cc.vpmaskmovq(ymm_a, ymm_b, m);
  cc.vpmaxsb(ymm_a, ymm_b, m); cc.vpmaxsd(ymm_a, ymm_b, m); cc.vpmaxsw(ymm_a, ymm_b, m);
  cc.vpmaxub(ymm_a, ymm_b, m); cc.vpmaxud(ymm_a, ymm_b, m); cc.vpmaxuw(ymm_a, ymm_b, m);
  cc.vpminsb(ymm_a, ymm_b, m); cc.vpminsd(ymm_a, ymm_b, m); cc.vpminsw(ymm_a, ymm_b, m);
  cc.vpminub(ymm_a, ymm_b, m); cc.vpminud(ymm_a, ymm_b, m); cc.vpminuw(ymm_a, ymm_b, m);
  cc.vpmuldq(ymm_a, ymm_b, m); cc.vpmulhrsw(ymm_a, ymm_b, m);
  cc.vpmulhuw(ymm_a, ymm_b, m); cc.vpmulhw(ymm_a, ymm_b, m);
  cc.vpmulld(ymm_a, ymm_b, m); cc.vpmullw(ymm_a, ymm_b, m); cc.vpmuludq(ymm_a, ymm_b, m);
  cc.vpor(ymm_a, ymm_b, m); cc.vpsadbw(ymm_a, ymm_b, m); cc.vpshufb(ymm_a, ymm_b, m);
  cc.vpsignb(ymm_a, ymm_b, m); cc.vpsignd(ymm_a, ymm_b, m); cc.vpsignw(ymm_a, ymm_b, m);
  cc.vpslld(ymm_a, ymm_b, m); cc.vpsllq(ymm_a, ymm_b, m);
  cc.vpsllvd(xmm_a, xmm_b, m); cc.vpsllvd(ymm_a, ymm_b, m);
  cc.vpsllvq(xmm_a, xmm_b, m); cc.vpsllvq(ymm_a, ymm_b, m);
  cc.vpsllw(ymm_a, ymm_b, m);
  cc.vpsrad(ymm_a, ymm_b, m);
  cc.vpsravd(xmm_a, xmm_b, m); cc.vpsravd(ymm_a, ymm_b, m);
  cc.vpsraw(ymm_a, ymm_b, m);
  cc.vpsrld(ymm_a, ymm_b, m); cc.vpsrlq(ymm_a, ymm_b, m);
  cc.vpsrlvd(xmm_a, xmm_b, m); cc.vpsrlvd(ymm_a, ymm_b, m);
  cc.vpsrlvq(xmm_a, xmm_b, m); cc.vpsrlvq(ymm_a, ymm_b, m);
  cc.vpsrlw(ymm_a, ymm_b, m);
  cc.vpsubb(ymm_a, ymm_b, m); cc.vpsubd(ymm_a, ymm_b, m); cc.vpsubq(ymm_a, ymm_b, m);
  cc.vpsubsb(ymm_a, ymm_b, m); cc.vpsubsw(ymm_a, ymm_b, m);
  cc.vpsubusb(ymm_a, ymm_b, m); cc.vpsubusw(ymm_a, ymm_b, m); cc.vpsubw(ymm_a, ymm_b, m);
  cc.vpunpckhbw(ymm_a, ymm_b, m); cc.vpunpckhdq(ymm_a, ymm_b, m);
  cc.vpunpckhqdq(ymm_a, ymm_b, m); cc.vpunpckhwd(ymm_a, ymm_b, m);
  cc.vpunpcklbw(ymm_a, ymm_b, m); cc.vpunpckldq(ymm_a, ymm_b, m);
  cc.vpunpcklqdq(ymm_a, ymm_b, m); cc.vpunpcklwd(ymm_a, ymm_b, m);
  cc.vpxor(ymm_a, ymm_b, m);
}
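// A minimal usage sketch, not part of the original generators: it shows how a
// sequence above could be serialized through a plain x86::Assembler into a
// CodeHolder. Environment, CodeHolder, and x86::Assembler are standard AsmJit
// types; the helper name and the choice of generator here are illustrative.
static inline void example_emit_sse_sequence() {
  Environment env = Environment::host();  // Assumes emitting for the host CPU.
  CodeHolder code;
  code.init(env);

  x86::Assembler a(&code);
  // Emit the SSE sequence in memory-operand form, with a prolog/epilog.
  generate_sse_sequence(a, InstForm::kMem, true);
}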
// Generates a long sequence of AVX instructions.
template<typename Emitter>
static void generate_avx_sequenceInternal(
  Emitter& cc, InstForm form,
  const x86::Gp& gp, const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  if (form == InstForm::kReg)
    generate_avx_sequenceInternalRegOnly(cc, gp, vec_a, vec_b, vec_c, vec_d);
  else
    generate_avx_sequenceInternalRegMem(cc, gp, vec_a, vec_b, vec_c, vec_d);
}

static void generate_avx_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.new_gpz("gp");
    x86::Vec a = cc.new_ymm("a");
    x86::Vec b = cc.new_ymm("b");
    x86::Vec c = cc.new_ymm("c");
    x86::Vec d = cc.new_ymm("d");

    cc.add_func(FuncSignature::build<void>());
    generate_avx_sequenceInternal(cc, form, gp, a, b, c, d);
    cc.end_func();
    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();
    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, ymm0, ymm1, ymm2, ymm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
    }
    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();
    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, ymm0, ymm1, ymm2, ymm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx_sequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
    }
    return;
  }
}

// Generates a long sequence of AVX512 instructions.
template<typename Emitter>
static void generate_avx512_sequence_internal_reg_only(
  Emitter& cc,
  const x86::Gp& gp, const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ?
gpd : gpq; x86::Vec xmm_a = vec_a.xmm(); x86::Vec xmm_b = vec_b.xmm(); x86::Vec xmm_c = vec_c.xmm(); x86::Vec xmm_d = vec_d.xmm(); x86::Vec ymm_a = vec_a.ymm(); x86::Vec ymm_b = vec_b.ymm(); x86::Vec ymm_c = vec_c.ymm(); x86::Vec zmm_a = vec_a.zmm(); x86::Vec zmm_b = vec_b.zmm(); x86::Vec zmm_c = vec_c.zmm(); cc.xor_(gpd, gpd); cc.vxorps(xmm_a, xmm_a, xmm_a); cc.vxorps(xmm_b, xmm_b, xmm_b); cc.vxorps(xmm_c, xmm_c, xmm_c); cc.vxorps(xmm_d, xmm_d, xmm_d); cc.kaddb(kA, kB, kC); cc.kaddd(kA, kB, kC); cc.kaddq(kA, kB, kC); cc.kaddw(kA, kB, kC); cc.kandb(kA, kB, kC); cc.kandd(kA, kB, kC); cc.kandnb(kA, kB, kC); cc.kandnd(kA, kB, kC); cc.kandnq(kA, kB, kC); cc.kandnw(kA, kB, kC); cc.kandq(kA, kB, kC); cc.kandw(kA, kB, kC); cc.kmovb(kA, kB); cc.kmovb(kA, gpd); cc.kmovb(gpd, kB); cc.kmovd(kA, kB); cc.kmovd(kA, gpd); cc.kmovd(gpd, kB); cc.kmovq(kA, kB); if (cc.is_64bit()) cc.kmovq(kA, gpq); if (cc.is_64bit()) cc.kmovq(gpq, kB); cc.kmovw(kA, kB); cc.kmovw(kA, gpd); cc.kmovw(gpd, kB); cc.knotb(kA, kB); cc.knotd(kA, kB); cc.knotq(kA, kB); cc.knotw(kA, kB); cc.korb(kA, kB, kC); cc.kord(kA, kB, kC); cc.korq(kA, kB, kC); cc.kortestb(kA, kB); cc.kortestd(kA, kB); cc.kortestq(kA, kB); cc.kortestw(kA, kB); cc.korw(kA, kB, kC); cc.kshiftlb(kA, kB, 0); cc.kshiftld(kA, kB, 0); cc.kshiftlq(kA, kB, 0); cc.kshiftlw(kA, kB, 0); cc.kshiftrb(kA, kB, 0); cc.kshiftrd(kA, kB, 0); cc.kshiftrq(kA, kB, 0); cc.kshiftrw(kA, kB, 0); cc.ktestb(kA, kB); cc.ktestd(kA, kB); cc.ktestq(kA, kB); cc.ktestw(kA, kB); cc.kunpckbw(kA, kB, kC); cc.kunpckdq(kA, kB, kC); cc.kunpckwd(kA, kB, kC); cc.kxnorb(kA, kB, kC); cc.kxnord(kA, kB, kC); cc.kxnorq(kA, kB, kC); cc.kxnorw(kA, kB, kC); cc.kxorb(kA, kB, kC); cc.kxord(kA, kB, kC); cc.kxorq(kA, kB, kC); cc.kxorw(kA, kB, kC); cc.nop(); cc.evex().vaddpd(xmm_a, xmm_b, xmm_c); cc.evex().vaddpd(ymm_a, ymm_b, ymm_c); cc.evex().vaddpd(zmm_a, zmm_b, zmm_c); cc.evex().vaddps(xmm_a, xmm_b, xmm_c); cc.evex().vaddps(ymm_a, ymm_b, ymm_c); cc.evex().vaddps(zmm_a, zmm_b, zmm_c); cc.evex().vaddsd(xmm_a, xmm_b, xmm_c); cc.evex().vaddss(xmm_a, xmm_b, xmm_c); cc.evex().valignd(xmm_a, xmm_b, xmm_c, 0); cc.evex().valignd(ymm_a, ymm_b, ymm_c, 0); cc.evex().valignd(zmm_a, zmm_b, zmm_c, 0); cc.evex().valignq(xmm_a, xmm_b, xmm_c, 0); cc.evex().valignq(ymm_a, ymm_b, ymm_c, 0); cc.evex().valignq(zmm_a, zmm_b, zmm_c, 0); cc.evex().vandnpd(xmm_a, xmm_b, xmm_c); cc.evex().vandnpd(ymm_a, ymm_b, ymm_c); cc.evex().vandnpd(zmm_a, zmm_b, zmm_c); cc.evex().vandnps(xmm_a, xmm_b, xmm_c); cc.evex().vandnps(ymm_a, ymm_b, ymm_c); cc.evex().vandnps(zmm_a, zmm_b, zmm_c); cc.evex().vandpd(xmm_a, xmm_b, xmm_c); cc.evex().vandpd(ymm_a, ymm_b, ymm_c); cc.evex().vandpd(zmm_a, zmm_b, zmm_c); cc.evex().vandps(xmm_a, xmm_b, xmm_c); cc.evex().vandps(ymm_a, ymm_b, ymm_c); cc.evex().vandps(zmm_a, zmm_b, zmm_c); cc.evex().vblendmpd(xmm_a, xmm_b, xmm_c); cc.evex().vblendmpd(ymm_a, ymm_b, ymm_c); cc.evex().vblendmpd(zmm_a, zmm_b, zmm_c); cc.evex().vblendmps(xmm_a, xmm_b, xmm_c); cc.evex().vblendmps(ymm_a, ymm_b, ymm_c); cc.evex().vblendmps(zmm_a, zmm_b, zmm_c); cc.evex().vbroadcastf32x2(ymm_a, xmm_b); cc.evex().vbroadcastf32x2(zmm_a, xmm_b); cc.evex().vbroadcasti32x2(xmm_a, xmm_b); cc.evex().vbroadcasti32x2(ymm_a, xmm_b); cc.evex().vbroadcasti32x2(zmm_a, xmm_b); cc.evex().vbroadcastsd(ymm_a, xmm_b); cc.evex().vbroadcastsd(zmm_a, xmm_b); cc.evex().vbroadcastss(xmm_a, xmm_b); cc.evex().vbroadcastss(ymm_a, xmm_b); cc.evex().vbroadcastss(zmm_a, xmm_b); cc.evex().vcmppd(kA, xmm_b, xmm_c, 0); cc.evex().vcmppd(kA, ymm_b, ymm_c, 0); cc.evex().vcmppd(kA, 
zmm_b, zmm_c, 0); cc.evex().vcmpps(kA, xmm_b, xmm_c, 0); cc.evex().vcmpps(kA, ymm_b, ymm_c, 0); cc.evex().vcmpps(kA, zmm_b, zmm_c, 0); cc.evex().vcmpsd(kA, xmm_b, xmm_c, 0); cc.evex().vcmpss(kA, xmm_b, xmm_c, 0); cc.evex().vcomisd(xmm_a, xmm_b); cc.evex().vcomiss(xmm_a, xmm_b); cc.evex().vcompresspd(xmm_a, xmm_b); cc.evex().vcompresspd(ymm_a, ymm_b); cc.evex().vcompresspd(zmm_a, zmm_b); cc.evex().vcompressps(xmm_a, xmm_b); cc.evex().vcompressps(ymm_a, ymm_b); cc.evex().vcompressps(zmm_a, zmm_b); cc.evex().vcvtdq2pd(xmm_a, xmm_b); cc.evex().vcvtdq2pd(ymm_a, xmm_b); cc.evex().vcvtdq2pd(zmm_a, ymm_b); cc.evex().vcvtdq2ps(xmm_a, xmm_b); cc.evex().vcvtdq2ps(ymm_a, ymm_b); cc.evex().vcvtdq2ps(zmm_a, zmm_b); cc.evex().vcvtpd2dq(xmm_a, xmm_b); cc.evex().vcvtpd2dq(xmm_a, ymm_b); cc.evex().vcvtpd2dq(ymm_a, zmm_b); cc.evex().vcvtpd2qq(xmm_a, xmm_b); cc.evex().vcvtpd2qq(ymm_a, ymm_b); cc.evex().vcvtpd2qq(zmm_a, zmm_b); cc.evex().vcvtpd2udq(xmm_a, xmm_b); cc.evex().vcvtpd2udq(xmm_a, ymm_b); cc.evex().vcvtpd2udq(ymm_a, zmm_b); cc.evex().vcvtpd2uqq(xmm_a, xmm_b); cc.evex().vcvtpd2uqq(ymm_a, ymm_b); cc.evex().vcvtpd2uqq(zmm_a, zmm_b); cc.evex().vcvtph2ps(xmm_a, xmm_b); cc.evex().vcvtph2ps(ymm_a, xmm_b); cc.evex().vcvtph2ps(zmm_a, ymm_b); cc.evex().vcvtps2dq(xmm_a, xmm_b); cc.evex().vcvtps2dq(ymm_a, ymm_b); cc.evex().vcvtps2dq(zmm_a, zmm_b); cc.evex().vcvtps2pd(xmm_a, xmm_b); cc.evex().vcvtps2pd(ymm_a, xmm_b); cc.evex().vcvtps2pd(zmm_a, ymm_b); cc.evex().vcvtps2ph(xmm_a, xmm_b, 0); cc.evex().vcvtps2ph(xmm_a, ymm_b, 0); cc.evex().vcvtps2ph(ymm_a, zmm_b, 0); cc.evex().vcvtps2qq(xmm_a, xmm_b); cc.evex().vcvtps2qq(ymm_a, xmm_b); cc.evex().vcvtps2qq(zmm_a, ymm_b); cc.evex().vcvtps2udq(xmm_a, xmm_b); cc.evex().vcvtps2udq(ymm_a, ymm_b); cc.evex().vcvtps2udq(zmm_a, zmm_b); cc.evex().vcvtps2uqq(xmm_a, xmm_b); cc.evex().vcvtps2uqq(ymm_a, xmm_b); cc.evex().vcvtps2uqq(zmm_a, ymm_b); cc.evex().vcvtqq2pd(xmm_a, xmm_b); cc.evex().vcvtqq2pd(ymm_a, ymm_b); cc.evex().vcvtqq2pd(zmm_a, zmm_b); cc.evex().vcvtqq2ps(xmm_a, xmm_b); cc.evex().vcvtqq2ps(xmm_a, ymm_b); cc.evex().vcvtqq2ps(ymm_a, zmm_b); cc.evex().vcvtsd2si(gpd, xmm_b); cc.evex().vcvtsd2si(gpz, xmm_b); cc.evex().vcvtsd2ss(xmm_a, xmm_b, xmm_c); cc.evex().vcvtsd2usi(gpd, xmm_b); cc.evex().vcvtsd2usi(gpz, xmm_b); cc.evex().vcvtsi2sd(xmm_a, xmm_b, gpd); cc.evex().vcvtsi2sd(xmm_a, xmm_b, gpz); cc.evex().vcvtsi2ss(xmm_a, xmm_b, gpd); cc.evex().vcvtsi2ss(xmm_a, xmm_b, gpz); cc.evex().vcvtss2sd(xmm_a, xmm_b, xmm_c); cc.evex().vcvtss2si(gpd, xmm_b); cc.evex().vcvtss2si(gpz, xmm_b); cc.evex().vcvtss2usi(gpd, xmm_b); cc.evex().vcvtss2usi(gpz, xmm_b); cc.evex().vcvttpd2dq(xmm_a, xmm_b); cc.evex().vcvttpd2dq(xmm_a, ymm_b); cc.evex().vcvttpd2dq(ymm_a, zmm_b); cc.evex().vcvttpd2qq(xmm_a, xmm_b); cc.evex().vcvttpd2qq(ymm_a, ymm_b); cc.evex().vcvttpd2qq(zmm_a, zmm_b); cc.evex().vcvttpd2udq(xmm_a, xmm_b); cc.evex().vcvttpd2udq(xmm_a, ymm_b); cc.evex().vcvttpd2udq(ymm_a, zmm_b); cc.evex().vcvttpd2uqq(xmm_a, xmm_b); cc.evex().vcvttpd2uqq(ymm_a, ymm_b); cc.evex().vcvttpd2uqq(zmm_a, zmm_b); cc.evex().vcvttps2dq(xmm_a, xmm_b); cc.evex().vcvttps2dq(ymm_a, ymm_b); cc.evex().vcvttps2dq(zmm_a, zmm_b); cc.evex().vcvttps2qq(xmm_a, xmm_b); cc.evex().vcvttps2qq(ymm_a, xmm_b); cc.evex().vcvttps2qq(zmm_a, ymm_b); cc.evex().vcvttps2udq(xmm_a, xmm_b); cc.evex().vcvttps2udq(ymm_a, ymm_b); cc.evex().vcvttps2udq(zmm_a, zmm_b); cc.evex().vcvttps2uqq(xmm_a, xmm_b); cc.evex().vcvttps2uqq(ymm_a, xmm_b); cc.evex().vcvttps2uqq(zmm_a, ymm_b); cc.evex().vcvttsd2si(gpd, xmm_b); cc.evex().vcvttsd2si(gpz, xmm_b); 
cc.evex().vcvttsd2usi(gpd, xmm_b); cc.evex().vcvttsd2usi(gpz, xmm_b); cc.evex().vcvttss2si(gpd, xmm_b); cc.evex().vcvttss2si(gpz, xmm_b); cc.evex().vcvttss2usi(gpd, xmm_b); cc.evex().vcvttss2usi(gpz, xmm_b); cc.evex().vcvtudq2pd(xmm_a, xmm_b); cc.evex().vcvtudq2pd(ymm_a, xmm_b); cc.evex().vcvtudq2pd(zmm_a, ymm_b); cc.evex().vcvtudq2ps(xmm_a, xmm_b); cc.evex().vcvtudq2ps(ymm_a, ymm_b); cc.evex().vcvtudq2ps(zmm_a, zmm_b); cc.evex().vcvtuqq2pd(xmm_a, xmm_b); cc.evex().vcvtuqq2pd(ymm_a, ymm_b); cc.evex().vcvtuqq2pd(zmm_a, zmm_b); cc.evex().vcvtuqq2ps(xmm_a, xmm_b); cc.evex().vcvtuqq2ps(xmm_a, ymm_b); cc.evex().vcvtuqq2ps(ymm_a, zmm_b); cc.evex().vcvtusi2sd(xmm_a, xmm_b, gpd); cc.evex().vcvtusi2sd(xmm_a, xmm_b, gpz); cc.evex().vcvtusi2ss(xmm_a, xmm_b, gpd); cc.evex().vcvtusi2ss(xmm_a, xmm_b, gpz); cc.evex().vdbpsadbw(xmm_a, xmm_b, xmm_c, 0); cc.evex().vdbpsadbw(ymm_a, ymm_b, ymm_c, 0); cc.evex().vdbpsadbw(zmm_a, zmm_b, zmm_c, 0); cc.evex().vdivpd(xmm_a, xmm_b, xmm_c); cc.evex().vdivpd(ymm_a, ymm_b, ymm_c); cc.evex().vdivpd(zmm_a, zmm_b, zmm_c); cc.evex().vdivps(xmm_a, xmm_b, xmm_c); cc.evex().vdivps(ymm_a, ymm_b, ymm_c); cc.evex().vdivps(zmm_a, zmm_b, zmm_c); cc.evex().vdivsd(xmm_a, xmm_b, xmm_c); cc.evex().vdivss(xmm_a, xmm_b, xmm_c); cc.evex().vexpandpd(xmm_a, xmm_b); cc.evex().vexpandpd(ymm_a, ymm_b); cc.evex().vexpandpd(zmm_a, zmm_b); cc.evex().vexpandps(xmm_a, xmm_b); cc.evex().vexpandps(ymm_a, ymm_b); cc.evex().vexpandps(zmm_a, zmm_b); cc.evex().vextractf32x4(xmm_a, ymm_b, 0); cc.evex().vextractf32x4(xmm_a, zmm_b, 0); cc.evex().vextractf32x8(ymm_a, zmm_b, 0); cc.evex().vextractf64x2(xmm_a, ymm_b, 0); cc.evex().vextractf64x2(xmm_a, zmm_b, 0); cc.evex().vextractf64x4(ymm_a, zmm_b, 0); cc.evex().vextracti32x4(xmm_a, ymm_b, 0); cc.evex().vextracti32x4(xmm_a, zmm_b, 0); cc.evex().vextracti32x8(ymm_a, zmm_b, 0); cc.evex().vextracti64x2(xmm_a, ymm_b, 0); cc.evex().vextracti64x2(xmm_a, zmm_b, 0); cc.evex().vextracti64x4(ymm_a, zmm_b, 0); cc.evex().vextractps(gpd, xmm_b, 0); cc.evex().vfixupimmpd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vfixupimmpd(ymm_a, ymm_b, ymm_c, 0); cc.evex().vfixupimmpd(zmm_a, zmm_b, zmm_c, 0); cc.evex().vfixupimmps(xmm_a, xmm_b, xmm_c, 0); cc.evex().vfixupimmps(ymm_a, ymm_b, ymm_c, 0); cc.evex().vfixupimmps(zmm_a, zmm_b, zmm_c, 0); cc.evex().vfixupimmsd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vfixupimmss(xmm_a, xmm_b, xmm_c, 0); cc.evex().vfmadd132pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd132pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmadd132pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmadd132ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd132ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmadd132ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmadd132sd(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd132ss(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd213pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd213pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmadd213pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmadd213ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd213ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmadd213ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmadd213sd(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd213ss(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd231pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd231pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmadd231pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmadd231ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd231ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmadd231ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmadd231sd(xmm_a, xmm_b, xmm_c); cc.evex().vfmadd231ss(xmm_a, xmm_b, xmm_c); cc.evex().vfmaddsub132pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmaddsub132pd(ymm_a, ymm_b, ymm_c); 
cc.evex().vfmaddsub132pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmaddsub132ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmaddsub132ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmaddsub132ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmaddsub213pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmaddsub213pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmaddsub213pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmaddsub213ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmaddsub213ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmaddsub213ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmaddsub231pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmaddsub231pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmaddsub231pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmaddsub231ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmaddsub231ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmaddsub231ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmsub132pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub132pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmsub132pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmsub132ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub132ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmsub132ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmsub132sd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub132ss(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub213pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub213pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmsub213pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmsub213ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub213ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmsub213ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmsub213sd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub213ss(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub231pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub231pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmsub231pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmsub231ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub231ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmsub231ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmsub231sd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsub231ss(xmm_a, xmm_b, xmm_c); cc.evex().vfmsubadd132pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsubadd132pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmsubadd132pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmsubadd132ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmsubadd132ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmsubadd132ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmsubadd213pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsubadd213pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmsubadd213pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmsubadd213ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmsubadd213ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmsubadd213ps(zmm_a, zmm_b, zmm_c); cc.evex().vfmsubadd231pd(xmm_a, xmm_b, xmm_c); cc.evex().vfmsubadd231pd(ymm_a, ymm_b, ymm_c); cc.evex().vfmsubadd231pd(zmm_a, zmm_b, zmm_c); cc.evex().vfmsubadd231ps(xmm_a, xmm_b, xmm_c); cc.evex().vfmsubadd231ps(ymm_a, ymm_b, ymm_c); cc.evex().vfmsubadd231ps(zmm_a, zmm_b, zmm_c); cc.evex().vfnmadd132pd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd132pd(ymm_a, ymm_b, ymm_c); cc.evex().vfnmadd132pd(zmm_a, zmm_b, zmm_c); cc.evex().vfnmadd132ps(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd132ps(ymm_a, ymm_b, ymm_c); cc.evex().vfnmadd132ps(zmm_a, zmm_b, zmm_c); cc.evex().vfnmadd132sd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd132ss(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd213pd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd213pd(ymm_a, ymm_b, ymm_c); cc.evex().vfnmadd213pd(zmm_a, zmm_b, zmm_c); cc.evex().vfnmadd213ps(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd213ps(ymm_a, ymm_b, ymm_c); cc.evex().vfnmadd213ps(zmm_a, zmm_b, zmm_c); cc.evex().vfnmadd213sd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd213ss(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd231pd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd231pd(ymm_a, ymm_b, ymm_c); cc.evex().vfnmadd231pd(zmm_a, zmm_b, zmm_c); cc.evex().vfnmadd231ps(xmm_a, xmm_b, xmm_c); 
cc.evex().vfnmadd231ps(ymm_a, ymm_b, ymm_c); cc.evex().vfnmadd231ps(zmm_a, zmm_b, zmm_c); cc.evex().vfnmadd231sd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmadd231ss(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub132pd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub132pd(ymm_a, ymm_b, ymm_c); cc.evex().vfnmsub132pd(zmm_a, zmm_b, zmm_c); cc.evex().vfnmsub132ps(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub132ps(ymm_a, ymm_b, ymm_c); cc.evex().vfnmsub132ps(zmm_a, zmm_b, zmm_c); cc.evex().vfnmsub132sd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub132ss(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub213pd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub213pd(ymm_a, ymm_b, ymm_c); cc.evex().vfnmsub213pd(zmm_a, zmm_b, zmm_c); cc.evex().vfnmsub213ps(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub213ps(ymm_a, ymm_b, ymm_c); cc.evex().vfnmsub213ps(zmm_a, zmm_b, zmm_c); cc.evex().vfnmsub213sd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub213ss(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub231pd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub231pd(ymm_a, ymm_b, ymm_c); cc.evex().vfnmsub231pd(zmm_a, zmm_b, zmm_c); cc.evex().vfnmsub231ps(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub231ps(ymm_a, ymm_b, ymm_c); cc.evex().vfnmsub231ps(zmm_a, zmm_b, zmm_c); cc.evex().vfnmsub231sd(xmm_a, xmm_b, xmm_c); cc.evex().vfnmsub231ss(xmm_a, xmm_b, xmm_c); cc.evex().vfpclasspd(kA, xmm_b, 0); cc.evex().vfpclasspd(kA, ymm_b, 0); cc.evex().vfpclasspd(kA, zmm_b, 0); cc.evex().vfpclassps(kA, xmm_b, 0); cc.evex().vfpclassps(kA, ymm_b, 0); cc.evex().vfpclassps(kA, zmm_b, 0); cc.evex().vfpclasssd(kA, xmm_b, 0); cc.evex().vfpclassss(kA, xmm_b, 0); cc.evex().vgetexppd(xmm_a, xmm_b); cc.evex().vgetexppd(ymm_a, ymm_b); cc.evex().vgetexppd(zmm_a, zmm_b); cc.evex().vgetexpps(xmm_a, xmm_b); cc.evex().vgetexpps(ymm_a, ymm_b); cc.evex().vgetexpps(zmm_a, zmm_b); cc.evex().vgetexpsd(xmm_a, xmm_b, xmm_c); cc.evex().vgetexpss(xmm_a, xmm_b, xmm_c); cc.evex().vgetmantpd(xmm_a, xmm_b, 0); cc.evex().vgetmantpd(ymm_a, ymm_b, 0); cc.evex().vgetmantpd(zmm_a, zmm_b, 0); cc.evex().vgetmantps(xmm_a, xmm_b, 0); cc.evex().vgetmantps(ymm_a, ymm_b, 0); cc.evex().vgetmantps(zmm_a, zmm_b, 0); cc.evex().vgetmantsd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vgetmantss(xmm_a, xmm_b, xmm_c, 0); cc.evex().vinsertf32x4(ymm_a, ymm_b, xmm_c, 0); cc.evex().vinsertf32x4(zmm_a, zmm_b, xmm_c, 0); cc.evex().vinsertf32x8(zmm_a, zmm_b, ymm_c, 0); cc.evex().vinsertf64x2(ymm_a, ymm_b, xmm_c, 0); cc.evex().vinsertf64x2(zmm_a, zmm_b, xmm_c, 0); cc.evex().vinsertf64x4(zmm_a, zmm_b, ymm_c, 0); cc.evex().vinserti32x4(ymm_a, ymm_b, xmm_c, 0); cc.evex().vinserti32x4(zmm_a, zmm_b, xmm_c, 0); cc.evex().vinserti32x8(zmm_a, zmm_b, ymm_c, 0); cc.evex().vinserti64x2(ymm_a, ymm_b, xmm_c, 0); cc.evex().vinserti64x2(zmm_a, zmm_b, xmm_c, 0); cc.evex().vinserti64x4(zmm_a, zmm_b, ymm_c, 0); cc.evex().vinsertps(xmm_a, xmm_b, xmm_c, 0); cc.evex().vmaxpd(xmm_a, xmm_b, xmm_c); cc.evex().vmaxpd(ymm_a, ymm_b, ymm_c); cc.evex().vmaxpd(zmm_a, zmm_b, zmm_c); cc.evex().vmaxps(xmm_a, xmm_b, xmm_c); cc.evex().vmaxps(ymm_a, ymm_b, ymm_c); cc.evex().vmaxps(zmm_a, zmm_b, zmm_c); cc.evex().vmaxsd(xmm_a, xmm_b, xmm_c); cc.evex().vmaxss(xmm_a, xmm_b, xmm_c); cc.evex().vminpd(xmm_a, xmm_b, xmm_c); cc.evex().vminpd(ymm_a, ymm_b, ymm_c); cc.evex().vminpd(zmm_a, zmm_b, zmm_c); cc.evex().vminps(xmm_a, xmm_b, xmm_c); cc.evex().vminps(ymm_a, ymm_b, ymm_c); cc.evex().vminps(zmm_a, zmm_b, zmm_c); cc.evex().vminsd(xmm_a, xmm_b, xmm_c); cc.evex().vminss(xmm_a, xmm_b, xmm_c); cc.evex().vmovapd(xmm_a, xmm_b); cc.evex().vmovapd(xmm_a, xmm_b); cc.evex().vmovapd(ymm_a, ymm_b); cc.evex().vmovapd(ymm_a, ymm_b); 
cc.evex().vmovapd(zmm_a, zmm_b); cc.evex().vmovapd(zmm_a, zmm_b); cc.evex().vmovaps(xmm_a, xmm_b); cc.evex().vmovaps(xmm_a, xmm_b); cc.evex().vmovaps(ymm_a, ymm_b); cc.evex().vmovaps(ymm_a, ymm_b); cc.evex().vmovaps(zmm_a, zmm_b); cc.evex().vmovaps(zmm_a, zmm_b); cc.evex().vmovd(gpd, xmm_b); cc.evex().vmovd(xmm_a, gpd); cc.evex().vmovddup(xmm_a, xmm_b); cc.evex().vmovddup(ymm_a, ymm_b); cc.evex().vmovddup(zmm_a, zmm_b); cc.evex().vmovdqa32(xmm_a, xmm_b); cc.evex().vmovdqa32(xmm_a, xmm_b); cc.evex().vmovdqa32(ymm_a, ymm_b); cc.evex().vmovdqa32(ymm_a, ymm_b); cc.evex().vmovdqa32(zmm_a, zmm_b); cc.evex().vmovdqa32(zmm_a, zmm_b); cc.evex().vmovdqa64(xmm_a, xmm_b); cc.evex().vmovdqa64(xmm_a, xmm_b); cc.evex().vmovdqa64(ymm_a, ymm_b); cc.evex().vmovdqa64(ymm_a, ymm_b); cc.evex().vmovdqa64(zmm_a, zmm_b); cc.evex().vmovdqa64(zmm_a, zmm_b); cc.evex().vmovdqu16(xmm_a, xmm_b); cc.evex().vmovdqu16(xmm_a, xmm_b); cc.evex().vmovdqu16(ymm_a, ymm_b); cc.evex().vmovdqu16(ymm_a, ymm_b); cc.evex().vmovdqu16(zmm_a, zmm_b); cc.evex().vmovdqu16(zmm_a, zmm_b); cc.evex().vmovdqu32(xmm_a, xmm_b); cc.evex().vmovdqu32(xmm_a, xmm_b); cc.evex().vmovdqu32(ymm_a, ymm_b); cc.evex().vmovdqu32(ymm_a, ymm_b); cc.evex().vmovdqu32(zmm_a, zmm_b); cc.evex().vmovdqu32(zmm_a, zmm_b); cc.evex().vmovdqu64(xmm_a, xmm_b); cc.evex().vmovdqu64(xmm_a, xmm_b); cc.evex().vmovdqu64(ymm_a, ymm_b); cc.evex().vmovdqu64(ymm_a, ymm_b); cc.evex().vmovdqu64(zmm_a, zmm_b); cc.evex().vmovdqu64(zmm_a, zmm_b); cc.evex().vmovdqu8(xmm_a, xmm_b); cc.evex().vmovdqu8(xmm_a, xmm_b); cc.evex().vmovdqu8(ymm_a, ymm_b); cc.evex().vmovdqu8(ymm_a, ymm_b); cc.evex().vmovdqu8(zmm_a, zmm_b); cc.evex().vmovdqu8(zmm_a, zmm_b); cc.evex().vmovhlps(xmm_a, xmm_b, xmm_c); if (cc.is_64bit()) cc.evex().vmovq(gpq, xmm_b); if (cc.is_64bit()) cc.evex().vmovq(xmm_a, gpq); cc.evex().vmovq(xmm_a, xmm_b); cc.evex().vmovsd(xmm_a, xmm_b, xmm_c); cc.evex().vmovshdup(xmm_a, xmm_b); cc.evex().vmovshdup(ymm_a, ymm_b); cc.evex().vmovshdup(zmm_a, zmm_b); cc.evex().vmovsldup(xmm_a, xmm_b); cc.evex().vmovsldup(ymm_a, ymm_b); cc.evex().vmovsldup(zmm_a, zmm_b); cc.evex().vmovss(xmm_a, xmm_b, xmm_c); cc.evex().vmovupd(xmm_a, xmm_b); cc.evex().vmovupd(xmm_a, xmm_b); cc.evex().vmovupd(ymm_a, ymm_b); cc.evex().vmovupd(ymm_a, ymm_b); cc.evex().vmovupd(zmm_a, zmm_b); cc.evex().vmovupd(zmm_a, zmm_b); cc.evex().vmovups(xmm_a, xmm_b); cc.evex().vmovups(xmm_a, xmm_b); cc.evex().vmovups(ymm_a, ymm_b); cc.evex().vmovups(ymm_a, ymm_b); cc.evex().vmovups(zmm_a, zmm_b); cc.evex().vmovups(zmm_a, zmm_b); cc.evex().vmulpd(xmm_a, xmm_b, xmm_c); cc.evex().vmulpd(ymm_a, ymm_b, ymm_c); cc.evex().vmulpd(zmm_a, zmm_b, zmm_c); cc.evex().vmulps(xmm_a, xmm_b, xmm_c); cc.evex().vmulps(ymm_a, ymm_b, ymm_c); cc.evex().vmulps(zmm_a, zmm_b, zmm_c); cc.evex().vmulsd(xmm_a, xmm_b, xmm_c); cc.evex().vmulss(xmm_a, xmm_b, xmm_c); cc.evex().vorpd(xmm_a, xmm_b, xmm_c); cc.evex().vorpd(ymm_a, ymm_b, ymm_c); cc.evex().vorpd(zmm_a, zmm_b, zmm_c); cc.evex().vorps(xmm_a, xmm_b, xmm_c); cc.evex().vorps(ymm_a, ymm_b, ymm_c); cc.evex().vorps(zmm_a, zmm_b, zmm_c); cc.evex().vpabsb(xmm_a, xmm_b); cc.evex().vpabsb(ymm_a, ymm_b); cc.evex().vpabsb(zmm_a, zmm_b); cc.evex().vpabsd(xmm_a, xmm_b); cc.evex().vpabsd(ymm_a, ymm_b); cc.evex().vpabsd(zmm_a, zmm_b); cc.evex().vpabsq(xmm_a, xmm_b); cc.evex().vpabsq(ymm_a, ymm_b); cc.evex().vpabsq(zmm_a, zmm_b); cc.evex().vpabsw(xmm_a, xmm_b); cc.evex().vpabsw(ymm_a, ymm_b); cc.evex().vpabsw(zmm_a, zmm_b); cc.evex().vpackssdw(xmm_a, xmm_b, xmm_c); cc.evex().vpackssdw(ymm_a, ymm_b, ymm_c); 
cc.evex().vpackssdw(zmm_a, zmm_b, zmm_c); cc.evex().vpacksswb(xmm_a, xmm_b, xmm_c); cc.evex().vpacksswb(ymm_a, ymm_b, ymm_c); cc.evex().vpacksswb(zmm_a, zmm_b, zmm_c); cc.evex().vpackusdw(xmm_a, xmm_b, xmm_c); cc.evex().vpackusdw(ymm_a, ymm_b, ymm_c); cc.evex().vpackusdw(zmm_a, zmm_b, zmm_c); cc.evex().vpackuswb(xmm_a, xmm_b, xmm_c); cc.evex().vpackuswb(ymm_a, ymm_b, ymm_c); cc.evex().vpackuswb(zmm_a, zmm_b, zmm_c); cc.evex().vpaddb(xmm_a, xmm_b, xmm_c); cc.evex().vpaddb(ymm_a, ymm_b, ymm_c); cc.evex().vpaddb(zmm_a, zmm_b, zmm_c); cc.evex().vpaddd(xmm_a, xmm_b, xmm_c); cc.evex().vpaddd(ymm_a, ymm_b, ymm_c); cc.evex().vpaddd(zmm_a, zmm_b, zmm_c); cc.evex().vpaddq(xmm_a, xmm_b, xmm_c); cc.evex().vpaddq(ymm_a, ymm_b, ymm_c); cc.evex().vpaddq(zmm_a, zmm_b, zmm_c); cc.evex().vpaddsb(xmm_a, xmm_b, xmm_c); cc.evex().vpaddsb(ymm_a, ymm_b, ymm_c); cc.evex().vpaddsb(zmm_a, zmm_b, zmm_c); cc.evex().vpaddsw(xmm_a, xmm_b, xmm_c); cc.evex().vpaddsw(ymm_a, ymm_b, ymm_c); cc.evex().vpaddsw(zmm_a, zmm_b, zmm_c); cc.evex().vpaddusb(xmm_a, xmm_b, xmm_c); cc.evex().vpaddusb(ymm_a, ymm_b, ymm_c); cc.evex().vpaddusb(zmm_a, zmm_b, zmm_c); cc.evex().vpaddusw(xmm_a, xmm_b, xmm_c); cc.evex().vpaddusw(ymm_a, ymm_b, ymm_c); cc.evex().vpaddusw(zmm_a, zmm_b, zmm_c); cc.evex().vpaddw(xmm_a, xmm_b, xmm_c); cc.evex().vpaddw(ymm_a, ymm_b, ymm_c); cc.evex().vpaddw(zmm_a, zmm_b, zmm_c); cc.evex().vpalignr(xmm_a, xmm_b, xmm_c, 0); cc.evex().vpalignr(ymm_a, ymm_b, ymm_c, 0); cc.evex().vpalignr(zmm_a, zmm_b, zmm_c, 0); cc.evex().vpandd(xmm_a, xmm_b, xmm_c); cc.evex().vpandd(ymm_a, ymm_b, ymm_c); cc.evex().vpandd(zmm_a, zmm_b, zmm_c); cc.evex().vpandnd(xmm_a, xmm_b, xmm_c); cc.evex().vpandnd(ymm_a, ymm_b, ymm_c); cc.evex().vpandnd(zmm_a, zmm_b, zmm_c); cc.evex().vpandnq(xmm_a, xmm_b, xmm_c); cc.evex().vpandnq(ymm_a, ymm_b, ymm_c); cc.evex().vpandnq(zmm_a, zmm_b, zmm_c); cc.evex().vpandq(xmm_a, xmm_b, xmm_c); cc.evex().vpandq(ymm_a, ymm_b, ymm_c); cc.evex().vpandq(zmm_a, zmm_b, zmm_c); cc.evex().vpavgb(xmm_a, xmm_b, xmm_c); cc.evex().vpavgb(ymm_a, ymm_b, ymm_c); cc.evex().vpavgb(zmm_a, zmm_b, zmm_c); cc.evex().vpavgw(xmm_a, xmm_b, xmm_c); cc.evex().vpavgw(ymm_a, ymm_b, ymm_c); cc.evex().vpavgw(zmm_a, zmm_b, zmm_c); cc.evex().vpblendmb(xmm_a, xmm_b, xmm_c); cc.evex().vpblendmb(ymm_a, ymm_b, ymm_c); cc.evex().vpblendmb(zmm_a, zmm_b, zmm_c); cc.evex().vpblendmd(xmm_a, xmm_b, xmm_c); cc.evex().vpblendmd(ymm_a, ymm_b, ymm_c); cc.evex().vpblendmd(zmm_a, zmm_b, zmm_c); cc.evex().vpblendmq(xmm_a, xmm_b, xmm_c); cc.evex().vpblendmq(ymm_a, ymm_b, ymm_c); cc.evex().vpblendmq(zmm_a, zmm_b, zmm_c); cc.evex().vpblendmw(xmm_a, xmm_b, xmm_c); cc.evex().vpblendmw(ymm_a, ymm_b, ymm_c); cc.evex().vpblendmw(zmm_a, zmm_b, zmm_c); cc.evex().vpbroadcastb(xmm_a, gpd); cc.evex().vpbroadcastb(xmm_a, xmm_b); cc.evex().vpbroadcastb(ymm_a, gpd); cc.evex().vpbroadcastb(ymm_a, xmm_b); cc.evex().vpbroadcastb(zmm_a, gpd); cc.evex().vpbroadcastb(zmm_a, xmm_b); cc.evex().vpbroadcastd(xmm_a, gpd); cc.evex().vpbroadcastd(xmm_a, xmm_b); cc.evex().vpbroadcastd(ymm_a, gpd); cc.evex().vpbroadcastd(ymm_a, xmm_b); cc.evex().vpbroadcastd(zmm_a, gpd); cc.evex().vpbroadcastd(zmm_a, xmm_b); cc.evex().vpbroadcastmb2q(xmm_a, kB); cc.evex().vpbroadcastmb2q(ymm_a, kB); cc.evex().vpbroadcastmb2q(zmm_a, kB); cc.evex().vpbroadcastmw2d(xmm_a, kB); cc.evex().vpbroadcastmw2d(ymm_a, kB); cc.evex().vpbroadcastmw2d(zmm_a, kB); if (cc.is_64bit()) cc.evex().vpbroadcastq(xmm_a, gpq); cc.evex().vpbroadcastq(xmm_a, xmm_b); if (cc.is_64bit()) cc.evex().vpbroadcastq(ymm_a, gpq); 
cc.evex().vpbroadcastq(ymm_a, xmm_b); if (cc.is_64bit()) cc.evex().vpbroadcastq(zmm_a, gpq); cc.evex().vpbroadcastq(zmm_a, xmm_b); cc.evex().vpbroadcastw(xmm_a, gpd); cc.evex().vpbroadcastw(xmm_a, xmm_b); cc.evex().vpbroadcastw(ymm_a, gpd); cc.evex().vpbroadcastw(ymm_a, xmm_b); cc.evex().vpbroadcastw(zmm_a, gpd); cc.evex().vpbroadcastw(zmm_a, xmm_b); cc.evex().vpcmpb(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpb(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpb(kA, zmm_b, zmm_c, 0); cc.evex().vpcmpd(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpd(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpd(kA, zmm_b, zmm_c, 0); cc.evex().vpcmpeqb(kA, xmm_b, xmm_c); cc.evex().vpcmpeqb(kA, ymm_b, ymm_c); cc.evex().vpcmpeqb(kA, zmm_b, zmm_c); cc.evex().vpcmpeqd(kA, xmm_b, xmm_c); cc.evex().vpcmpeqd(kA, ymm_b, ymm_c); cc.evex().vpcmpeqd(kA, zmm_b, zmm_c); cc.evex().vpcmpeqq(kA, xmm_b, xmm_c); cc.evex().vpcmpeqq(kA, ymm_b, ymm_c); cc.evex().vpcmpeqq(kA, zmm_b, zmm_c); cc.evex().vpcmpeqw(kA, xmm_b, xmm_c); cc.evex().vpcmpeqw(kA, ymm_b, ymm_c); cc.evex().vpcmpeqw(kA, zmm_b, zmm_c); cc.evex().vpcmpgtb(kA, xmm_b, xmm_c); cc.evex().vpcmpgtb(kA, ymm_b, ymm_c); cc.evex().vpcmpgtb(kA, zmm_b, zmm_c); cc.evex().vpcmpgtd(kA, xmm_b, xmm_c); cc.evex().vpcmpgtd(kA, ymm_b, ymm_c); cc.evex().vpcmpgtd(kA, zmm_b, zmm_c); cc.evex().vpcmpgtq(kA, xmm_b, xmm_c); cc.evex().vpcmpgtq(kA, ymm_b, ymm_c); cc.evex().vpcmpgtq(kA, zmm_b, zmm_c); cc.evex().vpcmpgtw(kA, xmm_b, xmm_c); cc.evex().vpcmpgtw(kA, ymm_b, ymm_c); cc.evex().vpcmpgtw(kA, zmm_b, zmm_c); cc.evex().vpcmpq(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpq(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpq(kA, zmm_b, zmm_c, 0); cc.evex().vpcmpub(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpub(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpub(kA, zmm_b, zmm_c, 0); cc.evex().vpcmpud(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpud(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpud(kA, zmm_b, zmm_c, 0); cc.evex().vpcmpuq(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpuq(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpuq(kA, zmm_b, zmm_c, 0); cc.evex().vpcmpuw(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpuw(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpuw(kA, zmm_b, zmm_c, 0); cc.evex().vpcmpw(kA, xmm_b, xmm_c, 0); cc.evex().vpcmpw(kA, ymm_b, ymm_c, 0); cc.evex().vpcmpw(kA, zmm_b, zmm_c, 0); cc.evex().vpcompressd(xmm_a, xmm_b); cc.evex().vpcompressd(ymm_a, ymm_b); cc.evex().vpcompressd(zmm_a, zmm_b); cc.evex().vpcompressq(xmm_a, xmm_b); cc.evex().vpcompressq(ymm_a, ymm_b); cc.evex().vpcompressq(zmm_a, zmm_b); cc.evex().vpconflictd(xmm_a, xmm_b); cc.evex().vpconflictd(ymm_a, ymm_b); cc.evex().vpconflictd(zmm_a, zmm_b); cc.evex().vpconflictq(xmm_a, xmm_b); cc.evex().vpconflictq(ymm_a, ymm_b); cc.evex().vpconflictq(zmm_a, zmm_b); cc.evex().vpermb(xmm_a, xmm_b, xmm_c); cc.evex().vpermb(ymm_a, ymm_b, ymm_c); cc.evex().vpermb(zmm_a, zmm_b, zmm_c); cc.evex().vpermd(ymm_a, ymm_b, ymm_c); cc.evex().vpermd(zmm_a, zmm_b, zmm_c); cc.evex().vpermi2b(xmm_a, xmm_b, xmm_c); cc.evex().vpermi2b(ymm_a, ymm_b, ymm_c); cc.evex().vpermi2b(zmm_a, zmm_b, zmm_c); cc.evex().vpermi2d(xmm_a, xmm_b, xmm_c); cc.evex().vpermi2d(ymm_a, ymm_b, ymm_c); cc.evex().vpermi2d(zmm_a, zmm_b, zmm_c); cc.evex().vpermi2pd(xmm_a, xmm_b, xmm_c); cc.evex().vpermi2pd(ymm_a, ymm_b, ymm_c); cc.evex().vpermi2pd(zmm_a, zmm_b, zmm_c); cc.evex().vpermi2ps(xmm_a, xmm_b, xmm_c); cc.evex().vpermi2ps(ymm_a, ymm_b, ymm_c); cc.evex().vpermi2ps(zmm_a, zmm_b, zmm_c); cc.evex().vpermi2q(xmm_a, xmm_b, xmm_c); cc.evex().vpermi2q(ymm_a, ymm_b, ymm_c); cc.evex().vpermi2q(zmm_a, zmm_b, zmm_c); cc.evex().vpermi2w(xmm_a, xmm_b, xmm_c); cc.evex().vpermi2w(ymm_a, ymm_b, ymm_c); 
cc.evex().vpermi2w(zmm_a, zmm_b, zmm_c); cc.evex().vpermilpd(xmm_a, xmm_b, xmm_c); cc.evex().vpermilpd(ymm_a, ymm_b, ymm_c); cc.evex().vpermilpd(zmm_a, zmm_b, zmm_c); cc.evex().vpermilpd(xmm_a, xmm_b, 0); cc.evex().vpermilpd(ymm_a, ymm_b, 0); cc.evex().vpermilpd(zmm_a, zmm_b, 0); cc.evex().vpermilps(xmm_a, xmm_b, xmm_c); cc.evex().vpermilps(ymm_a, ymm_b, ymm_c); cc.evex().vpermilps(zmm_a, zmm_b, zmm_c); cc.evex().vpermilps(xmm_a, xmm_b, 0); cc.evex().vpermilps(ymm_a, ymm_b, 0); cc.evex().vpermilps(zmm_a, zmm_b, 0); cc.evex().vpermq(ymm_a, ymm_b, ymm_c); cc.evex().vpermq(zmm_a, zmm_b, zmm_c); cc.evex().vpermq(ymm_a, ymm_b, 0); cc.evex().vpermq(zmm_a, zmm_b, 0); cc.evex().vpermt2b(xmm_a, xmm_b, xmm_c); cc.evex().vpermt2b(ymm_a, ymm_b, ymm_c); cc.evex().vpermt2b(zmm_a, zmm_b, zmm_c); cc.evex().vpermt2d(xmm_a, xmm_b, xmm_c); cc.evex().vpermt2d(ymm_a, ymm_b, ymm_c); cc.evex().vpermt2d(zmm_a, zmm_b, zmm_c); cc.evex().vpermt2pd(xmm_a, xmm_b, xmm_c); cc.evex().vpermt2pd(ymm_a, ymm_b, ymm_c); cc.evex().vpermt2pd(zmm_a, zmm_b, zmm_c); cc.evex().vpermt2ps(xmm_a, xmm_b, xmm_c); cc.evex().vpermt2ps(ymm_a, ymm_b, ymm_c); cc.evex().vpermt2ps(zmm_a, zmm_b, zmm_c); cc.evex().vpermt2q(xmm_a, xmm_b, xmm_c); cc.evex().vpermt2q(ymm_a, ymm_b, ymm_c); cc.evex().vpermt2q(zmm_a, zmm_b, zmm_c); cc.evex().vpermt2w(xmm_a, xmm_b, xmm_c); cc.evex().vpermt2w(ymm_a, ymm_b, ymm_c); cc.evex().vpermt2w(zmm_a, zmm_b, zmm_c); cc.evex().vpermw(xmm_a, xmm_b, xmm_c); cc.evex().vpermw(ymm_a, ymm_b, ymm_c); cc.evex().vpermw(zmm_a, zmm_b, zmm_c); cc.evex().vpexpandd(xmm_a, xmm_b); cc.evex().vpexpandd(ymm_a, ymm_b); cc.evex().vpexpandd(zmm_a, zmm_b); cc.evex().vpexpandq(xmm_a, xmm_b); cc.evex().vpexpandq(ymm_a, ymm_b); cc.evex().vpexpandq(zmm_a, zmm_b); cc.evex().vpextrb(gpd, xmm_b, 0); cc.evex().vpextrd(gpd, xmm_b, 0); if (cc.is_64bit()) cc.evex().vpextrq(gpq, xmm_b, 0); cc.evex().vpextrw(gpd, xmm_b, 0); cc.evex().vpinsrb(xmm_a, xmm_b, gpd, 0); cc.evex().vpinsrd(xmm_a, xmm_b, gpd, 0); if (cc.is_64bit()) cc.evex().vpinsrq(xmm_a, xmm_b, gpq, 0); cc.evex().vpinsrw(xmm_a, xmm_b, gpd, 0); cc.evex().vplzcntd(xmm_a, xmm_b); cc.evex().vplzcntd(ymm_a, ymm_b); cc.evex().vplzcntd(zmm_a, zmm_b); cc.evex().vplzcntq(xmm_a, xmm_b); cc.evex().vplzcntq(ymm_a, ymm_b); cc.evex().vplzcntq(zmm_a, zmm_b); cc.evex().vpmadd52huq(xmm_a, xmm_b, xmm_c); cc.evex().vpmadd52huq(ymm_a, ymm_b, ymm_c); cc.evex().vpmadd52huq(zmm_a, zmm_b, zmm_c); cc.evex().vpmadd52luq(xmm_a, xmm_b, xmm_c); cc.evex().vpmadd52luq(ymm_a, ymm_b, ymm_c); cc.evex().vpmadd52luq(zmm_a, zmm_b, zmm_c); cc.evex().vpmaddubsw(xmm_a, xmm_b, xmm_c); cc.evex().vpmaddubsw(ymm_a, ymm_b, ymm_c); cc.evex().vpmaddubsw(zmm_a, zmm_b, zmm_c); cc.evex().vpmaddwd(xmm_a, xmm_b, xmm_c); cc.evex().vpmaddwd(ymm_a, ymm_b, ymm_c); cc.evex().vpmaddwd(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxsb(xmm_a, xmm_b, xmm_c); cc.evex().vpmaxsb(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxsb(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxsd(xmm_a, xmm_b, xmm_c); cc.evex().vpmaxsd(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxsd(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxsq(xmm_a, xmm_b, xmm_c); cc.evex().vpmaxsq(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxsq(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxsw(xmm_a, xmm_b, xmm_c); cc.evex().vpmaxsw(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxsw(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxub(xmm_a, xmm_b, xmm_c); cc.evex().vpmaxub(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxub(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxud(xmm_a, xmm_b, xmm_c); cc.evex().vpmaxud(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxud(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxuq(xmm_a, xmm_b, 
xmm_c); cc.evex().vpmaxuq(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxuq(zmm_a, zmm_b, zmm_c); cc.evex().vpmaxuw(xmm_a, xmm_b, xmm_c); cc.evex().vpmaxuw(ymm_a, ymm_b, ymm_c); cc.evex().vpmaxuw(zmm_a, zmm_b, zmm_c); cc.evex().vpminsb(xmm_a, xmm_b, xmm_c); cc.evex().vpminsb(ymm_a, ymm_b, ymm_c); cc.evex().vpminsb(zmm_a, zmm_b, zmm_c); cc.evex().vpminsd(xmm_a, xmm_b, xmm_c); cc.evex().vpminsd(ymm_a, ymm_b, ymm_c); cc.evex().vpminsd(zmm_a, zmm_b, zmm_c); cc.evex().vpminsq(xmm_a, xmm_b, xmm_c); cc.evex().vpminsq(ymm_a, ymm_b, ymm_c); cc.evex().vpminsq(zmm_a, zmm_b, zmm_c); cc.evex().vpminsw(xmm_a, xmm_b, xmm_c); cc.evex().vpminsw(ymm_a, ymm_b, ymm_c); cc.evex().vpminsw(zmm_a, zmm_b, zmm_c); cc.evex().vpminub(xmm_a, xmm_b, xmm_c); cc.evex().vpminub(ymm_a, ymm_b, ymm_c); cc.evex().vpminub(zmm_a, zmm_b, zmm_c); cc.evex().vpminud(xmm_a, xmm_b, xmm_c); cc.evex().vpminud(ymm_a, ymm_b, ymm_c); cc.evex().vpminud(zmm_a, zmm_b, zmm_c); cc.evex().vpminuq(xmm_a, xmm_b, xmm_c); cc.evex().vpminuq(ymm_a, ymm_b, ymm_c); cc.evex().vpminuq(zmm_a, zmm_b, zmm_c); cc.evex().vpminuw(xmm_a, xmm_b, xmm_c); cc.evex().vpminuw(ymm_a, ymm_b, ymm_c); cc.evex().vpminuw(zmm_a, zmm_b, zmm_c); cc.evex().vpmovb2m(kA, xmm_b); cc.evex().vpmovb2m(kA, ymm_b); cc.evex().vpmovb2m(kA, zmm_b); cc.evex().vpmovd2m(kA, xmm_b); cc.evex().vpmovd2m(kA, ymm_b); cc.evex().vpmovd2m(kA, zmm_b); cc.evex().vpmovdb(xmm_a, xmm_b); cc.evex().vpmovdb(xmm_a, ymm_b); cc.evex().vpmovdb(xmm_a, zmm_b); cc.evex().vpmovdw(xmm_a, xmm_b); cc.evex().vpmovdw(xmm_a, ymm_b); cc.evex().vpmovdw(ymm_a, zmm_b); cc.evex().vpmovm2b(xmm_a, kB); cc.evex().vpmovm2b(ymm_a, kB); cc.evex().vpmovm2b(zmm_a, kB); cc.evex().vpmovm2d(xmm_a, kB); cc.evex().vpmovm2d(ymm_a, kB); cc.evex().vpmovm2d(zmm_a, kB); cc.evex().vpmovm2q(xmm_a, kB); cc.evex().vpmovm2q(ymm_a, kB); cc.evex().vpmovm2q(zmm_a, kB); cc.evex().vpmovm2w(xmm_a, kB); cc.evex().vpmovm2w(ymm_a, kB); cc.evex().vpmovm2w(zmm_a, kB); cc.evex().vpmovq2m(kA, xmm_b); cc.evex().vpmovq2m(kA, ymm_b); cc.evex().vpmovq2m(kA, zmm_b); cc.evex().vpmovqb(xmm_a, xmm_b); cc.evex().vpmovqb(xmm_a, ymm_b); cc.evex().vpmovqb(xmm_a, zmm_b); cc.evex().vpmovqd(xmm_a, xmm_b); cc.evex().vpmovqd(xmm_a, ymm_b); cc.evex().vpmovqd(ymm_a, zmm_b); cc.evex().vpmovqw(xmm_a, xmm_b); cc.evex().vpmovqw(xmm_a, ymm_b); cc.evex().vpmovqw(xmm_a, zmm_b); cc.evex().vpmovsdb(xmm_a, xmm_b); cc.evex().vpmovsdb(xmm_a, ymm_b); cc.evex().vpmovsdb(xmm_a, zmm_b); cc.evex().vpmovsdw(xmm_a, xmm_b); cc.evex().vpmovsdw(xmm_a, ymm_b); cc.evex().vpmovsdw(ymm_a, zmm_b); cc.evex().vpmovsqb(xmm_a, xmm_b); cc.evex().vpmovsqb(xmm_a, ymm_b); cc.evex().vpmovsqb(xmm_a, zmm_b); cc.evex().vpmovsqd(xmm_a, xmm_b); cc.evex().vpmovsqd(xmm_a, ymm_b); cc.evex().vpmovsqd(ymm_a, zmm_b); cc.evex().vpmovsqw(xmm_a, xmm_b); cc.evex().vpmovsqw(xmm_a, ymm_b); cc.evex().vpmovsqw(xmm_a, zmm_b); cc.evex().vpmovswb(xmm_a, xmm_b); cc.evex().vpmovswb(xmm_a, ymm_b); cc.evex().vpmovswb(ymm_a, zmm_b); cc.evex().vpmovsxbd(xmm_a, xmm_b); cc.evex().vpmovsxbd(ymm_a, xmm_b); cc.evex().vpmovsxbd(zmm_a, xmm_b); cc.evex().vpmovsxbq(xmm_a, xmm_b); cc.evex().vpmovsxbq(ymm_a, xmm_b); cc.evex().vpmovsxbq(zmm_a, xmm_b); cc.evex().vpmovsxbw(xmm_a, xmm_b); cc.evex().vpmovsxbw(ymm_a, xmm_b); cc.evex().vpmovsxbw(zmm_a, ymm_b); cc.evex().vpmovsxdq(xmm_a, xmm_b); cc.evex().vpmovsxdq(ymm_a, xmm_b); cc.evex().vpmovsxdq(zmm_a, ymm_b); cc.evex().vpmovsxwd(xmm_a, xmm_b); cc.evex().vpmovsxwd(ymm_a, xmm_b); cc.evex().vpmovsxwd(zmm_a, ymm_b); cc.evex().vpmovsxwq(xmm_a, xmm_b); cc.evex().vpmovsxwq(ymm_a, xmm_b); cc.evex().vpmovsxwq(zmm_a, 
xmm_b); cc.evex().vpmovusdb(xmm_a, xmm_b); cc.evex().vpmovusdb(xmm_a, ymm_b); cc.evex().vpmovusdb(xmm_a, zmm_b); cc.evex().vpmovusdw(xmm_a, xmm_b); cc.evex().vpmovusdw(xmm_a, ymm_b); cc.evex().vpmovusdw(ymm_a, zmm_b); cc.evex().vpmovusqb(xmm_a, xmm_b); cc.evex().vpmovusqb(xmm_a, ymm_b); cc.evex().vpmovusqb(xmm_a, zmm_b); cc.evex().vpmovusqd(xmm_a, xmm_b); cc.evex().vpmovusqd(xmm_a, ymm_b); cc.evex().vpmovusqd(ymm_a, zmm_b); cc.evex().vpmovusqw(xmm_a, xmm_b); cc.evex().vpmovusqw(xmm_a, ymm_b); cc.evex().vpmovusqw(xmm_a, zmm_b); cc.evex().vpmovuswb(xmm_a, xmm_b); cc.evex().vpmovuswb(xmm_a, ymm_b); cc.evex().vpmovuswb(ymm_a, zmm_b); cc.evex().vpmovw2m(kA, xmm_b); cc.evex().vpmovw2m(kA, ymm_b); cc.evex().vpmovw2m(kA, zmm_b); cc.evex().vpmovwb(xmm_a, xmm_b); cc.evex().vpmovwb(xmm_a, ymm_b); cc.evex().vpmovwb(ymm_a, zmm_b); cc.evex().vpmovzxbd(xmm_a, xmm_b); cc.evex().vpmovzxbd(ymm_a, xmm_b); cc.evex().vpmovzxbd(zmm_a, xmm_b); cc.evex().vpmovzxbq(xmm_a, xmm_b); cc.evex().vpmovzxbq(ymm_a, xmm_b); cc.evex().vpmovzxbq(zmm_a, xmm_b); cc.evex().vpmovzxbw(xmm_a, xmm_b); cc.evex().vpmovzxbw(ymm_a, xmm_b); cc.evex().vpmovzxbw(zmm_a, ymm_b); cc.evex().vpmovzxdq(xmm_a, xmm_b); cc.evex().vpmovzxdq(ymm_a, xmm_b); cc.evex().vpmovzxdq(zmm_a, ymm_b); cc.evex().vpmovzxwd(xmm_a, xmm_b); cc.evex().vpmovzxwd(ymm_a, xmm_b); cc.evex().vpmovzxwd(zmm_a, ymm_b); cc.evex().vpmovzxwq(xmm_a, xmm_b); cc.evex().vpmovzxwq(ymm_a, xmm_b); cc.evex().vpmovzxwq(zmm_a, xmm_b); cc.evex().vpmuldq(xmm_a, xmm_b, xmm_c); cc.evex().vpmuldq(ymm_a, ymm_b, ymm_c); cc.evex().vpmuldq(zmm_a, zmm_b, zmm_c); cc.evex().vpmulhrsw(xmm_a, xmm_b, xmm_c); cc.evex().vpmulhrsw(ymm_a, ymm_b, ymm_c); cc.evex().vpmulhrsw(zmm_a, zmm_b, zmm_c); cc.evex().vpmulhuw(xmm_a, xmm_b, xmm_c); cc.evex().vpmulhuw(ymm_a, ymm_b, ymm_c); cc.evex().vpmulhuw(zmm_a, zmm_b, zmm_c); cc.evex().vpmulhw(xmm_a, xmm_b, xmm_c); cc.evex().vpmulhw(ymm_a, ymm_b, ymm_c); cc.evex().vpmulhw(zmm_a, zmm_b, zmm_c); cc.evex().vpmulld(xmm_a, xmm_b, xmm_c); cc.evex().vpmulld(ymm_a, ymm_b, ymm_c); cc.evex().vpmulld(zmm_a, zmm_b, zmm_c); cc.evex().vpmullq(xmm_a, xmm_b, xmm_c); cc.evex().vpmullq(ymm_a, ymm_b, ymm_c); cc.evex().vpmullq(zmm_a, zmm_b, zmm_c); cc.evex().vpmullw(xmm_a, xmm_b, xmm_c); cc.evex().vpmullw(ymm_a, ymm_b, ymm_c); cc.evex().vpmullw(zmm_a, zmm_b, zmm_c); cc.evex().vpmultishiftqb(xmm_a, xmm_b, xmm_c); cc.evex().vpmultishiftqb(ymm_a, ymm_b, ymm_c); cc.evex().vpmultishiftqb(zmm_a, zmm_b, zmm_c); cc.evex().vpmuludq(xmm_a, xmm_b, xmm_c); cc.evex().vpmuludq(ymm_a, ymm_b, ymm_c); cc.evex().vpmuludq(zmm_a, zmm_b, zmm_c); cc.evex().vpopcntd(zmm_a, zmm_b); cc.evex().vpopcntq(zmm_a, zmm_b); cc.evex().vpord(xmm_a, xmm_b, xmm_c); cc.evex().vpord(ymm_a, ymm_b, ymm_c); cc.evex().vpord(zmm_a, zmm_b, zmm_c); cc.evex().vporq(xmm_a, xmm_b, xmm_c); cc.evex().vporq(ymm_a, ymm_b, ymm_c); cc.evex().vporq(zmm_a, zmm_b, zmm_c); cc.evex().vprold(xmm_a, xmm_b, 0); cc.evex().vprold(ymm_a, ymm_b, 0); cc.evex().vprold(zmm_a, zmm_b, 0); cc.evex().vprolq(xmm_a, xmm_b, 0); cc.evex().vprolq(ymm_a, ymm_b, 0); cc.evex().vprolq(zmm_a, zmm_b, 0); cc.evex().vprolvd(xmm_a, xmm_b, xmm_c); cc.evex().vprolvd(ymm_a, ymm_b, ymm_c); cc.evex().vprolvd(zmm_a, zmm_b, zmm_c); cc.evex().vprolvq(xmm_a, xmm_b, xmm_c); cc.evex().vprolvq(ymm_a, ymm_b, ymm_c); cc.evex().vprolvq(zmm_a, zmm_b, zmm_c); cc.evex().vprord(xmm_a, xmm_b, 0); cc.evex().vprord(ymm_a, ymm_b, 0); cc.evex().vprord(zmm_a, zmm_b, 0); cc.evex().vprorq(xmm_a, xmm_b, 0); cc.evex().vprorq(ymm_a, ymm_b, 0); cc.evex().vprorq(zmm_a, zmm_b, 0); cc.evex().vprorvd(xmm_a, 
xmm_b, xmm_c); cc.evex().vprorvd(ymm_a, ymm_b, ymm_c); cc.evex().vprorvd(zmm_a, zmm_b, zmm_c); cc.evex().vprorvq(xmm_a, xmm_b, xmm_c); cc.evex().vprorvq(ymm_a, ymm_b, ymm_c); cc.evex().vprorvq(zmm_a, zmm_b, zmm_c); cc.evex().vpsadbw(xmm_a, xmm_b, xmm_c); cc.evex().vpsadbw(ymm_a, ymm_b, ymm_c); cc.evex().vpsadbw(zmm_a, zmm_b, zmm_c); cc.evex().vpshufb(xmm_a, xmm_b, xmm_c); cc.evex().vpshufb(ymm_a, ymm_b, ymm_c); cc.evex().vpshufb(zmm_a, zmm_b, zmm_c); cc.evex().vpshufd(xmm_a, xmm_b, 0); cc.evex().vpshufd(ymm_a, ymm_b, 0); cc.evex().vpshufd(zmm_a, zmm_b, 0); cc.evex().vpshufhw(xmm_a, xmm_b, 0); cc.evex().vpshufhw(ymm_a, ymm_b, 0); cc.evex().vpshufhw(zmm_a, zmm_b, 0); cc.evex().vpshuflw(xmm_a, xmm_b, 0); cc.evex().vpshuflw(ymm_a, ymm_b, 0); cc.evex().vpshuflw(zmm_a, zmm_b, 0); cc.evex().vpslld(xmm_a, xmm_b, xmm_c); cc.evex().vpslld(xmm_a, xmm_b, 0); cc.evex().vpslld(ymm_a, ymm_b, xmm_c); cc.evex().vpslld(ymm_a, ymm_b, 0); cc.evex().vpslld(zmm_a, zmm_b, xmm_c); cc.evex().vpslld(zmm_a, zmm_b, 0); cc.evex().vpslldq(xmm_a, xmm_b, 0); cc.evex().vpslldq(ymm_a, ymm_b, 0); cc.evex().vpslldq(zmm_a, zmm_b, 0); cc.evex().vpsllq(xmm_a, xmm_b, xmm_c); cc.evex().vpsllq(xmm_a, xmm_b, 0); cc.evex().vpsllq(ymm_a, ymm_b, xmm_c); cc.evex().vpsllq(ymm_a, ymm_b, 0); cc.evex().vpsllq(zmm_a, zmm_b, xmm_c); cc.evex().vpsllq(zmm_a, zmm_b, 0); cc.evex().vpsllvd(xmm_a, xmm_b, xmm_c); cc.evex().vpsllvd(ymm_a, ymm_b, ymm_c); cc.evex().vpsllvd(zmm_a, zmm_b, zmm_c); cc.evex().vpsllvq(xmm_a, xmm_b, xmm_c); cc.evex().vpsllvq(ymm_a, ymm_b, ymm_c); cc.evex().vpsllvq(zmm_a, zmm_b, zmm_c); cc.evex().vpsllvw(xmm_a, xmm_b, xmm_c); cc.evex().vpsllvw(ymm_a, ymm_b, ymm_c); cc.evex().vpsllvw(zmm_a, zmm_b, zmm_c); cc.evex().vpsllw(xmm_a, xmm_b, xmm_c); cc.evex().vpsllw(xmm_a, xmm_b, 0); cc.evex().vpsllw(ymm_a, ymm_b, xmm_c); cc.evex().vpsllw(ymm_a, ymm_b, 0); cc.evex().vpsllw(zmm_a, zmm_b, xmm_c); cc.evex().vpsllw(zmm_a, zmm_b, 0); cc.evex().vpsrad(xmm_a, xmm_b, xmm_c); cc.evex().vpsrad(xmm_a, xmm_b, 0); cc.evex().vpsrad(ymm_a, ymm_b, xmm_c); cc.evex().vpsrad(ymm_a, ymm_b, 0); cc.evex().vpsrad(zmm_a, zmm_b, xmm_c); cc.evex().vpsrad(zmm_a, zmm_b, 0); cc.evex().vpsraq(xmm_a, xmm_b, xmm_c); cc.evex().vpsraq(xmm_a, xmm_b, 0); cc.evex().vpsraq(ymm_a, ymm_b, xmm_c); cc.evex().vpsraq(ymm_a, ymm_b, 0); cc.evex().vpsraq(zmm_a, zmm_b, xmm_c); cc.evex().vpsraq(zmm_a, zmm_b, 0); cc.evex().vpsravd(xmm_a, xmm_b, xmm_c); cc.evex().vpsravd(ymm_a, ymm_b, ymm_c); cc.evex().vpsravd(zmm_a, zmm_b, zmm_c); cc.evex().vpsravq(xmm_a, xmm_b, xmm_c); cc.evex().vpsravq(ymm_a, ymm_b, ymm_c); cc.evex().vpsravq(zmm_a, zmm_b, zmm_c); cc.evex().vpsravw(xmm_a, xmm_b, xmm_c); cc.evex().vpsravw(ymm_a, ymm_b, ymm_c); cc.evex().vpsravw(zmm_a, zmm_b, zmm_c); cc.evex().vpsraw(xmm_a, xmm_b, xmm_c); cc.evex().vpsraw(xmm_a, xmm_b, 0); cc.evex().vpsraw(ymm_a, ymm_b, xmm_c); cc.evex().vpsraw(ymm_a, ymm_b, 0); cc.evex().vpsraw(zmm_a, zmm_b, xmm_c); cc.evex().vpsraw(zmm_a, zmm_b, 0); cc.evex().vpsrld(xmm_a, xmm_b, xmm_c); cc.evex().vpsrld(xmm_a, xmm_b, 0); cc.evex().vpsrld(ymm_a, ymm_b, xmm_c); cc.evex().vpsrld(ymm_a, ymm_b, 0); cc.evex().vpsrld(zmm_a, zmm_b, xmm_c); cc.evex().vpsrld(zmm_a, zmm_b, 0); cc.evex().vpsrldq(xmm_a, xmm_b, 0); cc.evex().vpsrldq(ymm_a, ymm_b, 0); cc.evex().vpsrldq(zmm_a, zmm_b, 0); cc.evex().vpsrlq(xmm_a, xmm_b, xmm_c); cc.evex().vpsrlq(xmm_a, xmm_b, 0); cc.evex().vpsrlq(ymm_a, ymm_b, xmm_c); cc.evex().vpsrlq(ymm_a, ymm_b, 0); cc.evex().vpsrlq(zmm_a, zmm_b, xmm_c); cc.evex().vpsrlq(zmm_a, zmm_b, 0); cc.evex().vpsrlvd(xmm_a, xmm_b, xmm_c); 
cc.evex().vpsrlvd(ymm_a, ymm_b, ymm_c); cc.evex().vpsrlvd(zmm_a, zmm_b, zmm_c); cc.evex().vpsrlvq(xmm_a, xmm_b, xmm_c); cc.evex().vpsrlvq(ymm_a, ymm_b, ymm_c); cc.evex().vpsrlvq(zmm_a, zmm_b, zmm_c); cc.evex().vpsrlvw(xmm_a, xmm_b, xmm_c); cc.evex().vpsrlvw(ymm_a, ymm_b, ymm_c); cc.evex().vpsrlvw(zmm_a, zmm_b, zmm_c); cc.evex().vpsrlw(xmm_a, xmm_b, xmm_c); cc.evex().vpsrlw(xmm_a, xmm_b, 0); cc.evex().vpsrlw(ymm_a, ymm_b, xmm_c); cc.evex().vpsrlw(ymm_a, ymm_b, 0); cc.evex().vpsrlw(zmm_a, zmm_b, xmm_c); cc.evex().vpsrlw(zmm_a, zmm_b, 0); cc.evex().vpsubb(xmm_a, xmm_b, xmm_c); cc.evex().vpsubb(ymm_a, ymm_b, ymm_c); cc.evex().vpsubb(zmm_a, zmm_b, zmm_c); cc.evex().vpsubd(xmm_a, xmm_b, xmm_c); cc.evex().vpsubd(ymm_a, ymm_b, ymm_c); cc.evex().vpsubd(zmm_a, zmm_b, zmm_c); cc.evex().vpsubq(xmm_a, xmm_b, xmm_c); cc.evex().vpsubq(ymm_a, ymm_b, ymm_c); cc.evex().vpsubq(zmm_a, zmm_b, zmm_c); cc.evex().vpsubsb(xmm_a, xmm_b, xmm_c); cc.evex().vpsubsb(ymm_a, ymm_b, ymm_c); cc.evex().vpsubsb(zmm_a, zmm_b, zmm_c); cc.evex().vpsubsw(xmm_a, xmm_b, xmm_c); cc.evex().vpsubsw(ymm_a, ymm_b, ymm_c); cc.evex().vpsubsw(zmm_a, zmm_b, zmm_c); cc.evex().vpsubusb(xmm_a, xmm_b, xmm_c); cc.evex().vpsubusb(ymm_a, ymm_b, ymm_c); cc.evex().vpsubusb(zmm_a, zmm_b, zmm_c); cc.evex().vpsubusw(xmm_a, xmm_b, xmm_c); cc.evex().vpsubusw(ymm_a, ymm_b, ymm_c); cc.evex().vpsubusw(zmm_a, zmm_b, zmm_c); cc.evex().vpsubw(xmm_a, xmm_b, xmm_c); cc.evex().vpsubw(ymm_a, ymm_b, ymm_c); cc.evex().vpsubw(zmm_a, zmm_b, zmm_c); cc.evex().vpternlogd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vpternlogd(ymm_a, ymm_b, ymm_c, 0); cc.evex().vpternlogd(zmm_a, zmm_b, zmm_c, 0); cc.evex().vpternlogq(xmm_a, xmm_b, xmm_c, 0); cc.evex().vpternlogq(ymm_a, ymm_b, ymm_c, 0); cc.evex().vpternlogq(zmm_a, zmm_b, zmm_c, 0); cc.evex().vptestmb(kA, xmm_b, xmm_c); cc.evex().vptestmb(kA, ymm_b, ymm_c); cc.evex().vptestmb(kA, zmm_b, zmm_c); cc.evex().vptestmd(kA, xmm_b, xmm_c); cc.evex().vptestmd(kA, ymm_b, ymm_c); cc.evex().vptestmd(kA, zmm_b, zmm_c); cc.evex().vptestmq(kA, xmm_b, xmm_c); cc.evex().vptestmq(kA, ymm_b, ymm_c); cc.evex().vptestmq(kA, zmm_b, zmm_c); cc.evex().vptestmw(kA, xmm_b, xmm_c); cc.evex().vptestmw(kA, ymm_b, ymm_c); cc.evex().vptestmw(kA, zmm_b, zmm_c); cc.evex().vptestnmb(kA, xmm_b, xmm_c); cc.evex().vptestnmb(kA, ymm_b, ymm_c); cc.evex().vptestnmb(kA, zmm_b, zmm_c); cc.evex().vptestnmd(kA, xmm_b, xmm_c); cc.evex().vptestnmd(kA, ymm_b, ymm_c); cc.evex().vptestnmd(kA, zmm_b, zmm_c); cc.evex().vptestnmq(kA, xmm_b, xmm_c); cc.evex().vptestnmq(kA, ymm_b, ymm_c); cc.evex().vptestnmq(kA, zmm_b, zmm_c); cc.evex().vptestnmw(kA, xmm_b, xmm_c); cc.evex().vptestnmw(kA, ymm_b, ymm_c); cc.evex().vptestnmw(kA, zmm_b, zmm_c); cc.evex().vpunpckhbw(xmm_a, xmm_b, xmm_c); cc.evex().vpunpckhbw(ymm_a, ymm_b, ymm_c); cc.evex().vpunpckhbw(zmm_a, zmm_b, zmm_c); cc.evex().vpunpckhdq(xmm_a, xmm_b, xmm_c); cc.evex().vpunpckhdq(ymm_a, ymm_b, ymm_c); cc.evex().vpunpckhdq(zmm_a, zmm_b, zmm_c); cc.evex().vpunpckhqdq(xmm_a, xmm_b, xmm_c); cc.evex().vpunpckhqdq(ymm_a, ymm_b, ymm_c); cc.evex().vpunpckhqdq(zmm_a, zmm_b, zmm_c); cc.evex().vpunpckhwd(xmm_a, xmm_b, xmm_c); cc.evex().vpunpckhwd(ymm_a, ymm_b, ymm_c); cc.evex().vpunpckhwd(zmm_a, zmm_b, zmm_c); cc.evex().vpunpcklbw(xmm_a, xmm_b, xmm_c); cc.evex().vpunpcklbw(ymm_a, ymm_b, ymm_c); cc.evex().vpunpcklbw(zmm_a, zmm_b, zmm_c); cc.evex().vpunpckldq(xmm_a, xmm_b, xmm_c); cc.evex().vpunpckldq(ymm_a, ymm_b, ymm_c); cc.evex().vpunpckldq(zmm_a, zmm_b, zmm_c); cc.evex().vpunpcklqdq(xmm_a, xmm_b, xmm_c); cc.evex().vpunpcklqdq(ymm_a, ymm_b, 
ymm_c); cc.evex().vpunpcklqdq(zmm_a, zmm_b, zmm_c); cc.evex().vpunpcklwd(xmm_a, xmm_b, xmm_c); cc.evex().vpunpcklwd(ymm_a, ymm_b, ymm_c); cc.evex().vpunpcklwd(zmm_a, zmm_b, zmm_c); cc.evex().vpxord(xmm_a, xmm_b, xmm_c); cc.evex().vpxord(ymm_a, ymm_b, ymm_c); cc.evex().vpxord(zmm_a, zmm_b, zmm_c); cc.evex().vpxorq(xmm_a, xmm_b, xmm_c); cc.evex().vpxorq(ymm_a, ymm_b, ymm_c); cc.evex().vpxorq(zmm_a, zmm_b, zmm_c); cc.evex().vrangepd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vrangepd(ymm_a, ymm_b, ymm_c, 0); cc.evex().vrangepd(zmm_a, zmm_b, zmm_c, 0); cc.evex().vrangeps(xmm_a, xmm_b, xmm_c, 0); cc.evex().vrangeps(ymm_a, ymm_b, ymm_c, 0); cc.evex().vrangeps(zmm_a, zmm_b, zmm_c, 0); cc.evex().vrangesd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vrangess(xmm_a, xmm_b, xmm_c, 0); cc.evex().vrcp14pd(xmm_a, xmm_b); cc.evex().vrcp14pd(ymm_a, ymm_b); cc.evex().vrcp14pd(zmm_a, zmm_b); cc.evex().vrcp14ps(xmm_a, xmm_b); cc.evex().vrcp14ps(ymm_a, ymm_b); cc.evex().vrcp14ps(zmm_a, zmm_b); cc.evex().vrcp14sd(xmm_a, xmm_b, xmm_c); cc.evex().vrcp14ss(xmm_a, xmm_b, xmm_c); cc.evex().vreducepd(xmm_a, xmm_b, 0); cc.evex().vreducepd(ymm_a, ymm_b, 0); cc.evex().vreducepd(zmm_a, zmm_b, 0); cc.evex().vreduceps(xmm_a, xmm_b, 0); cc.evex().vreduceps(ymm_a, ymm_b, 0); cc.evex().vreduceps(zmm_a, zmm_b, 0); cc.evex().vreducesd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vreducess(xmm_a, xmm_b, xmm_c, 0); cc.evex().vrndscalepd(xmm_a, xmm_b, 0); cc.evex().vrndscalepd(ymm_a, ymm_b, 0); cc.evex().vrndscalepd(zmm_a, zmm_b, 0); cc.evex().vrndscaleps(xmm_a, xmm_b, 0); cc.evex().vrndscaleps(ymm_a, ymm_b, 0); cc.evex().vrndscaleps(zmm_a, zmm_b, 0); cc.evex().vrndscalesd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vrndscaless(xmm_a, xmm_b, xmm_c, 0); cc.evex().vrsqrt14pd(xmm_a, xmm_b); cc.evex().vrsqrt14pd(ymm_a, ymm_b); cc.evex().vrsqrt14pd(zmm_a, zmm_b); cc.evex().vrsqrt14ps(xmm_a, xmm_b); cc.evex().vrsqrt14ps(ymm_a, ymm_b); cc.evex().vrsqrt14ps(zmm_a, zmm_b); cc.evex().vrsqrt14sd(xmm_a, xmm_b, xmm_c); cc.evex().vrsqrt14ss(xmm_a, xmm_b, xmm_c); cc.evex().vscalefpd(xmm_a, xmm_b, xmm_c); cc.evex().vscalefpd(ymm_a, ymm_b, ymm_c); cc.evex().vscalefpd(zmm_a, zmm_b, zmm_c); cc.evex().vscalefps(xmm_a, xmm_b, xmm_c); cc.evex().vscalefps(ymm_a, ymm_b, ymm_c); cc.evex().vscalefps(zmm_a, zmm_b, zmm_c); cc.evex().vscalefsd(xmm_a, xmm_b, xmm_c); cc.evex().vscalefss(xmm_a, xmm_b, xmm_c); cc.evex().vshuff32x4(ymm_a, ymm_b, ymm_c, 0); cc.evex().vshuff32x4(zmm_a, zmm_b, zmm_c, 0); cc.evex().vshuff64x2(ymm_a, ymm_b, ymm_c, 0); cc.evex().vshuff64x2(zmm_a, zmm_b, zmm_c, 0); cc.evex().vshufi32x4(ymm_a, ymm_b, ymm_c, 0); cc.evex().vshufi32x4(zmm_a, zmm_b, zmm_c, 0); cc.evex().vshufi64x2(ymm_a, ymm_b, ymm_c, 0); cc.evex().vshufi64x2(zmm_a, zmm_b, zmm_c, 0); cc.evex().vshufpd(xmm_a, xmm_b, xmm_c, 0); cc.evex().vshufpd(ymm_a, ymm_b, ymm_c, 0); cc.evex().vshufpd(zmm_a, zmm_b, zmm_c, 0); cc.evex().vshufps(xmm_a, xmm_b, xmm_c, 0); cc.evex().vshufps(ymm_a, ymm_b, ymm_c, 0); cc.evex().vshufps(zmm_a, zmm_b, zmm_c, 0); cc.evex().vsqrtpd(xmm_a, xmm_b); cc.evex().vsqrtpd(ymm_a, ymm_b); cc.evex().vsqrtpd(zmm_a, zmm_b); cc.evex().vsqrtps(xmm_a, xmm_b); cc.evex().vsqrtps(ymm_a, ymm_b); cc.evex().vsqrtps(zmm_a, zmm_b); cc.evex().vsqrtsd(xmm_a, xmm_b, xmm_c); cc.evex().vsqrtss(xmm_a, xmm_b, xmm_c); cc.evex().vsubpd(xmm_a, xmm_b, xmm_c); cc.evex().vsubpd(ymm_a, ymm_b, ymm_c); cc.evex().vsubpd(zmm_a, zmm_b, zmm_c); cc.evex().vsubps(xmm_a, xmm_b, xmm_c); cc.evex().vsubps(ymm_a, ymm_b, ymm_c); cc.evex().vsubps(zmm_a, zmm_b, zmm_c); cc.evex().vsubsd(xmm_a, xmm_b, xmm_c); cc.evex().vsubss(xmm_a, xmm_b, 
xmm_c);
  cc.evex().vucomisd(xmm_a, xmm_b); cc.evex().vucomiss(xmm_a, xmm_b);
  cc.evex().vunpckhpd(xmm_a, xmm_b, xmm_c); cc.evex().vunpckhpd(ymm_a, ymm_b, ymm_c); cc.evex().vunpckhpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpckhps(xmm_a, xmm_b, xmm_c); cc.evex().vunpckhps(ymm_a, ymm_b, ymm_c); cc.evex().vunpckhps(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpcklpd(xmm_a, xmm_b, xmm_c); cc.evex().vunpcklpd(ymm_a, ymm_b, ymm_c); cc.evex().vunpcklpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vunpcklps(xmm_a, xmm_b, xmm_c); cc.evex().vunpcklps(ymm_a, ymm_b, ymm_c); cc.evex().vunpcklps(zmm_a, zmm_b, zmm_c);
  cc.evex().vxorpd(xmm_a, xmm_b, xmm_c); cc.evex().vxorpd(ymm_a, ymm_b, ymm_c); cc.evex().vxorpd(zmm_a, zmm_b, zmm_c);
  cc.evex().vxorps(xmm_a, xmm_b, xmm_c); cc.evex().vxorps(ymm_a, ymm_b, ymm_c); cc.evex().vxorps(zmm_a, zmm_b, zmm_c);
}

template<typename Emitter>
static void generate_avx512_sequence_internal_reg_mem(
  Emitter& cc,
  const x86::Gp& gp, const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  Support::maybe_unused(kC);

  x86::Gp gpd = gp.r32();
  x86::Gp gpq = gp.r64();
  x86::Gp gpz = cc.is_32bit() ? gpd : gpq;

  x86::Vec xmm_a = vec_a.xmm();
  x86::Vec xmm_b = vec_b.xmm();
  x86::Vec xmm_c = vec_c.xmm();
  x86::Vec xmm_d = vec_d.xmm();

  x86::Vec ymm_a = vec_a.ymm();
  x86::Vec ymm_b = vec_b.ymm();
  x86::Vec ymm_d = vec_d.ymm();

  x86::Vec zmm_a = vec_a.zmm();
  x86::Vec zmm_b = vec_b.zmm();
  x86::Vec zmm_d = vec_d.zmm();

  x86::Mem m = x86::ptr(gpz);
  x86::Mem m32 = x86::dword_ptr(gpz);
  x86::Mem m64 = x86::qword_ptr(gpz);
  x86::Mem m128 = x86::xmmword_ptr(gpz);
  x86::Mem m256 = x86::ymmword_ptr(gpz);
  x86::Mem m512 = x86::zmmword_ptr(gpz);
  x86::Mem vx_ptr = x86::ptr(gpz, xmm_d);
  x86::Mem vy_ptr = x86::ptr(gpz, ymm_d);
  x86::Mem vz_ptr = x86::ptr(gpz, zmm_d);

  cc.xor_(gpd, gpd);
  cc.vxorps(xmm_a, xmm_a, xmm_a);
  cc.vxorps(xmm_b, xmm_b, xmm_b);
  cc.vxorps(xmm_c, xmm_c, xmm_c);
  cc.vxorps(xmm_d, xmm_d, xmm_d);

  cc.kmovb(kA, m); cc.kmovb(m, kB);
  cc.kmovd(kA, m); cc.kmovd(m, kB);
  cc.kmovq(kA, m); cc.kmovq(m, kB);
  cc.kmovw(kA, m); cc.kmovw(m, kB);

  cc.evex().vaddpd(xmm_a, xmm_b, m); cc.evex().vaddpd(ymm_a, ymm_b, m); cc.evex().vaddpd(zmm_a, zmm_b, m);
  cc.evex().vaddps(xmm_a, xmm_b, m); cc.evex().vaddps(ymm_a, ymm_b, m); cc.evex().vaddps(zmm_a, zmm_b, m);
  cc.evex().vaddsd(xmm_a, xmm_b, m); cc.evex().vaddss(xmm_a, xmm_b, m);
  cc.evex().valignd(xmm_a, xmm_b, m, 0); cc.evex().valignd(ymm_a, ymm_b, m, 0); cc.evex().valignd(zmm_a, zmm_b, m, 0);
  cc.evex().valignq(xmm_a, xmm_b, m, 0); cc.evex().valignq(ymm_a, ymm_b, m, 0); cc.evex().valignq(zmm_a, zmm_b, m, 0);
  cc.evex().vandnpd(xmm_a, xmm_b, m); cc.evex().vandnpd(ymm_a, ymm_b, m); cc.evex().vandnpd(zmm_a, zmm_b, m);
  cc.evex().vandnps(xmm_a, xmm_b, m); cc.evex().vandnps(ymm_a, ymm_b, m); cc.evex().vandnps(zmm_a, zmm_b, m);
  cc.evex().vandpd(xmm_a, xmm_b, m); cc.evex().vandpd(ymm_a, ymm_b, m); cc.evex().vandpd(zmm_a, zmm_b, m);
  cc.evex().vandps(xmm_a, xmm_b, m); cc.evex().vandps(ymm_a, ymm_b, m); cc.evex().vandps(zmm_a, zmm_b, m);
  cc.evex().vblendmpd(xmm_a, xmm_b, m); cc.evex().vblendmpd(ymm_a, ymm_b, m); cc.evex().vblendmpd(zmm_a, zmm_b, m);
  cc.evex().vblendmps(xmm_a, xmm_b, m); cc.evex().vblendmps(ymm_a, ymm_b, m); cc.evex().vblendmps(zmm_a, zmm_b, m);
  cc.evex().vbroadcastf32x2(ymm_a, m); cc.evex().vbroadcastf32x2(zmm_a, m);
  cc.evex().vbroadcastf32x4(ymm_a, m); cc.evex().vbroadcastf32x4(zmm_a, m);
  cc.evex().vbroadcastf32x8(zmm_a, m);
  cc.evex().vbroadcastf64x2(ymm_a, m); cc.evex().vbroadcastf64x2(zmm_a, m);
cc.evex().vbroadcastf64x4(zmm_a, m); cc.evex().vbroadcasti32x2(xmm_a, m); cc.evex().vbroadcasti32x2(ymm_a, m); cc.evex().vbroadcasti32x2(zmm_a, m); cc.evex().vbroadcasti32x4(ymm_a, m); cc.evex().vbroadcasti32x4(zmm_a, m); cc.evex().vbroadcasti32x8(zmm_a, m); cc.evex().vbroadcasti64x2(ymm_a, m); cc.evex().vbroadcasti64x2(zmm_a, m); cc.evex().vbroadcasti64x4(zmm_a, m); cc.evex().vbroadcastsd(ymm_a, m); cc.evex().vbroadcastsd(zmm_a, m); cc.evex().vbroadcastss(xmm_a, m); cc.evex().vbroadcastss(ymm_a, m); cc.evex().vbroadcastss(zmm_a, m); cc.evex().vcmppd(kA, xmm_b, m, 0); cc.evex().vcmppd(kA, ymm_b, m, 0); cc.evex().vcmppd(kA, zmm_b, m, 0); cc.evex().vcmpps(kA, xmm_b, m, 0); cc.evex().vcmpps(kA, ymm_b, m, 0); cc.evex().vcmpps(kA, zmm_b, m, 0); cc.evex().vcmpsd(kA, xmm_b, m, 0); cc.evex().vcmpss(kA, xmm_b, m, 0); cc.evex().vcomisd(xmm_a, m); cc.evex().vcomiss(xmm_a, m); cc.evex().vcompresspd(m, xmm_b); cc.evex().vcompresspd(m, ymm_b); cc.evex().vcompresspd(m, zmm_b); cc.evex().vcompressps(m, xmm_b); cc.evex().vcompressps(m, ymm_b); cc.evex().vcompressps(m, zmm_b); cc.evex().vcvtdq2pd(xmm_a, m); cc.evex().vcvtdq2pd(ymm_a, m); cc.evex().vcvtdq2pd(zmm_a, m); cc.evex().vcvtdq2ps(xmm_a, m); cc.evex().vcvtdq2ps(ymm_a, m); cc.evex().vcvtdq2ps(zmm_a, m); cc.evex().vcvtpd2dq(xmm_a, m128); cc.evex().vcvtpd2dq(xmm_a, m256); cc.evex().vcvtpd2dq(ymm_a, m512); cc.evex().vcvtpd2qq(xmm_a, m); cc.evex().vcvtpd2qq(ymm_a, m); cc.evex().vcvtpd2qq(zmm_a, m); cc.evex().vcvtpd2udq(xmm_a, m128); cc.evex().vcvtpd2udq(xmm_a, m256); cc.evex().vcvtpd2udq(ymm_a, m512); cc.evex().vcvtpd2uqq(xmm_a, m); cc.evex().vcvtpd2uqq(ymm_a, m); cc.evex().vcvtpd2uqq(zmm_a, m); cc.evex().vcvtph2ps(xmm_a, m); cc.evex().vcvtph2ps(ymm_a, m); cc.evex().vcvtph2ps(zmm_a, m); cc.evex().vcvtps2dq(xmm_a, m); cc.evex().vcvtps2dq(ymm_a, m); cc.evex().vcvtps2dq(zmm_a, m); cc.evex().vcvtps2pd(xmm_a, m); cc.evex().vcvtps2pd(ymm_a, m); cc.evex().vcvtps2pd(zmm_a, m); cc.evex().vcvtps2ph(m, xmm_b, 0); cc.evex().vcvtps2ph(m, ymm_b, 0); cc.evex().vcvtps2ph(m, zmm_b, 0); cc.evex().vcvtps2qq(xmm_a, m); cc.evex().vcvtps2qq(ymm_a, m); cc.evex().vcvtps2qq(zmm_a, m); cc.evex().vcvtps2udq(xmm_a, m); cc.evex().vcvtps2udq(ymm_a, m); cc.evex().vcvtps2udq(zmm_a, m); cc.evex().vcvtps2uqq(xmm_a, m); cc.evex().vcvtps2uqq(ymm_a, m); cc.evex().vcvtps2uqq(zmm_a, m); cc.evex().vcvtqq2pd(xmm_a, m); cc.evex().vcvtqq2pd(ymm_a, m); cc.evex().vcvtqq2pd(zmm_a, m); cc.evex().vcvtqq2ps(xmm_a, m128); cc.evex().vcvtqq2ps(xmm_a, m256); cc.evex().vcvtqq2ps(ymm_a, m512); cc.evex().vcvtsd2si(gpd, m); cc.evex().vcvtsd2si(gpz, m); cc.evex().vcvtsd2ss(xmm_a, xmm_b, m); cc.evex().vcvtsd2usi(gpd, m); cc.evex().vcvtsd2usi(gpz, m); cc.evex().vcvtsi2sd(xmm_a, xmm_b, m32); if (cc.is_64bit()) cc.evex().vcvtsi2sd(xmm_a, xmm_b, m64); cc.evex().vcvtsi2ss(xmm_a, xmm_b, m32); if (cc.is_64bit()) cc.evex().vcvtsi2ss(xmm_a, xmm_b, m64); cc.evex().vcvtss2sd(xmm_a, xmm_b, m); cc.evex().vcvtss2si(gpd, m); cc.evex().vcvtss2si(gpz, m); cc.evex().vcvtss2usi(gpd, m); cc.evex().vcvtss2usi(gpz, m); cc.evex().vcvttpd2dq(xmm_a, m128); cc.evex().vcvttpd2dq(xmm_a, m256); cc.evex().vcvttpd2dq(ymm_a, m512); cc.evex().vcvttpd2qq(xmm_a, m); cc.evex().vcvttpd2qq(ymm_a, m); cc.evex().vcvttpd2qq(zmm_a, m); cc.evex().vcvttpd2udq(xmm_a, m128); cc.evex().vcvttpd2udq(xmm_a, m256); cc.evex().vcvttpd2udq(ymm_a, m512); cc.evex().vcvttpd2uqq(xmm_a, m); cc.evex().vcvttpd2uqq(ymm_a, m); cc.evex().vcvttpd2uqq(zmm_a, m); cc.evex().vcvttps2dq(xmm_a, m); cc.evex().vcvttps2dq(ymm_a, m); cc.evex().vcvttps2dq(zmm_a, m); 
cc.evex().vcvttps2qq(xmm_a, m); cc.evex().vcvttps2qq(ymm_a, m); cc.evex().vcvttps2qq(zmm_a, m); cc.evex().vcvttps2udq(xmm_a, m); cc.evex().vcvttps2udq(ymm_a, m); cc.evex().vcvttps2udq(zmm_a, m); cc.evex().vcvttps2uqq(xmm_a, m); cc.evex().vcvttps2uqq(ymm_a, m); cc.evex().vcvttps2uqq(zmm_a, m); cc.evex().vcvttsd2si(gpd, m); cc.evex().vcvttsd2si(gpz, m); cc.evex().vcvttsd2usi(gpd, m); cc.evex().vcvttsd2usi(gpz, m); cc.evex().vcvttss2si(gpd, m); cc.evex().vcvttss2si(gpz, m); cc.evex().vcvttss2usi(gpd, m); cc.evex().vcvttss2usi(gpz, m); cc.evex().vcvtudq2pd(xmm_a, m); cc.evex().vcvtudq2pd(ymm_a, m); cc.evex().vcvtudq2pd(zmm_a, m); cc.evex().vcvtudq2ps(xmm_a, m); cc.evex().vcvtudq2ps(ymm_a, m); cc.evex().vcvtudq2ps(zmm_a, m); cc.evex().vcvtuqq2pd(xmm_a, m); cc.evex().vcvtuqq2pd(ymm_a, m); cc.evex().vcvtuqq2pd(zmm_a, m); cc.evex().vcvtuqq2ps(xmm_a, m128); cc.evex().vcvtuqq2ps(xmm_a, m256); cc.evex().vcvtuqq2ps(ymm_a, m512); cc.evex().vcvtusi2sd(xmm_a, xmm_b, m32); if (cc.is_64bit()) cc.evex().vcvtusi2sd(xmm_a, xmm_b, m64); cc.evex().vcvtusi2ss(xmm_a, xmm_b, m32); if (cc.is_64bit()) cc.evex().vcvtusi2ss(xmm_a, xmm_b, m64); cc.evex().vdbpsadbw(xmm_a, xmm_b, m, 0); cc.evex().vdbpsadbw(ymm_a, ymm_b, m, 0); cc.evex().vdbpsadbw(zmm_a, zmm_b, m, 0); cc.evex().vdivpd(xmm_a, xmm_b, m); cc.evex().vdivpd(ymm_a, ymm_b, m); cc.evex().vdivpd(zmm_a, zmm_b, m); cc.evex().vdivps(xmm_a, xmm_b, m); cc.evex().vdivps(ymm_a, ymm_b, m); cc.evex().vdivps(zmm_a, zmm_b, m); cc.evex().vdivsd(xmm_a, xmm_b, m); cc.evex().vdivss(xmm_a, xmm_b, m); cc.evex().vexpandpd(xmm_a, m); cc.evex().vexpandpd(ymm_a, m); cc.evex().vexpandpd(zmm_a, m); cc.evex().vexpandps(xmm_a, m); cc.evex().vexpandps(ymm_a, m); cc.evex().vexpandps(zmm_a, m); cc.evex().vextractf32x4(m, ymm_b, 0); cc.evex().vextractf32x4(m, zmm_b, 0); cc.evex().vextractf32x8(m, zmm_b, 0); cc.evex().vextractf64x2(m, ymm_b, 0); cc.evex().vextractf64x2(m, zmm_b, 0); cc.evex().vextractf64x4(m, zmm_b, 0); cc.evex().vextracti32x4(m, ymm_b, 0); cc.evex().vextracti32x4(m, zmm_b, 0); cc.evex().vextracti32x8(m, zmm_b, 0); cc.evex().vextracti64x2(m, ymm_b, 0); cc.evex().vextracti64x2(m, zmm_b, 0); cc.evex().vextracti64x4(m, zmm_b, 0); cc.evex().vextractps(m, xmm_b, 0); cc.evex().vfixupimmpd(xmm_a, xmm_b, m, 0); cc.evex().vfixupimmpd(ymm_a, ymm_b, m, 0); cc.evex().vfixupimmpd(zmm_a, zmm_b, m, 0); cc.evex().vfixupimmps(xmm_a, xmm_b, m, 0); cc.evex().vfixupimmps(ymm_a, ymm_b, m, 0); cc.evex().vfixupimmps(zmm_a, zmm_b, m, 0); cc.evex().vfixupimmsd(xmm_a, xmm_b, m, 0); cc.evex().vfixupimmss(xmm_a, xmm_b, m, 0); cc.evex().vfmadd132pd(xmm_a, xmm_b, m); cc.evex().vfmadd132pd(ymm_a, ymm_b, m); cc.evex().vfmadd132pd(zmm_a, zmm_b, m); cc.evex().vfmadd132ps(xmm_a, xmm_b, m); cc.evex().vfmadd132ps(ymm_a, ymm_b, m); cc.evex().vfmadd132ps(zmm_a, zmm_b, m); cc.evex().vfmadd132sd(xmm_a, xmm_b, m); cc.evex().vfmadd132ss(xmm_a, xmm_b, m); cc.evex().vfmadd213pd(xmm_a, xmm_b, m); cc.evex().vfmadd213pd(ymm_a, ymm_b, m); cc.evex().vfmadd213pd(zmm_a, zmm_b, m); cc.evex().vfmadd213ps(xmm_a, xmm_b, m); cc.evex().vfmadd213ps(ymm_a, ymm_b, m); cc.evex().vfmadd213ps(zmm_a, zmm_b, m); cc.evex().vfmadd213sd(xmm_a, xmm_b, m); cc.evex().vfmadd213ss(xmm_a, xmm_b, m); cc.evex().vfmadd231pd(xmm_a, xmm_b, m); cc.evex().vfmadd231pd(ymm_a, ymm_b, m); cc.evex().vfmadd231pd(zmm_a, zmm_b, m); cc.evex().vfmadd231ps(xmm_a, xmm_b, m); cc.evex().vfmadd231ps(ymm_a, ymm_b, m); cc.evex().vfmadd231ps(zmm_a, zmm_b, m); cc.evex().vfmadd231sd(xmm_a, xmm_b, m); cc.evex().vfmadd231ss(xmm_a, xmm_b, m); cc.evex().vfmaddsub132pd(xmm_a, 
xmm_b, m); cc.evex().vfmaddsub132pd(ymm_a, ymm_b, m); cc.evex().vfmaddsub132pd(zmm_a, zmm_b, m); cc.evex().vfmaddsub132ps(xmm_a, xmm_b, m); cc.evex().vfmaddsub132ps(ymm_a, ymm_b, m); cc.evex().vfmaddsub132ps(zmm_a, zmm_b, m); cc.evex().vfmaddsub213pd(xmm_a, xmm_b, m); cc.evex().vfmaddsub213pd(ymm_a, ymm_b, m); cc.evex().vfmaddsub213pd(zmm_a, zmm_b, m); cc.evex().vfmaddsub213ps(xmm_a, xmm_b, m); cc.evex().vfmaddsub213ps(ymm_a, ymm_b, m); cc.evex().vfmaddsub213ps(zmm_a, zmm_b, m); cc.evex().vfmaddsub231pd(xmm_a, xmm_b, m); cc.evex().vfmaddsub231pd(ymm_a, ymm_b, m); cc.evex().vfmaddsub231pd(zmm_a, zmm_b, m); cc.evex().vfmaddsub231ps(xmm_a, xmm_b, m); cc.evex().vfmaddsub231ps(ymm_a, ymm_b, m); cc.evex().vfmaddsub231ps(zmm_a, zmm_b, m); cc.evex().vfmsub132pd(xmm_a, xmm_b, m); cc.evex().vfmsub132pd(ymm_a, ymm_b, m); cc.evex().vfmsub132pd(zmm_a, zmm_b, m); cc.evex().vfmsub132ps(xmm_a, xmm_b, m); cc.evex().vfmsub132ps(ymm_a, ymm_b, m); cc.evex().vfmsub132ps(zmm_a, zmm_b, m); cc.evex().vfmsub132sd(xmm_a, xmm_b, m); cc.evex().vfmsub132ss(xmm_a, xmm_b, m); cc.evex().vfmsub213pd(xmm_a, xmm_b, m); cc.evex().vfmsub213pd(ymm_a, ymm_b, m); cc.evex().vfmsub213pd(zmm_a, zmm_b, m); cc.evex().vfmsub213ps(xmm_a, xmm_b, m); cc.evex().vfmsub213ps(ymm_a, ymm_b, m); cc.evex().vfmsub213ps(zmm_a, zmm_b, m); cc.evex().vfmsub213sd(xmm_a, xmm_b, m); cc.evex().vfmsub213ss(xmm_a, xmm_b, m); cc.evex().vfmsub231pd(xmm_a, xmm_b, m); cc.evex().vfmsub231pd(ymm_a, ymm_b, m); cc.evex().vfmsub231pd(zmm_a, zmm_b, m); cc.evex().vfmsub231ps(xmm_a, xmm_b, m); cc.evex().vfmsub231ps(ymm_a, ymm_b, m); cc.evex().vfmsub231ps(zmm_a, zmm_b, m); cc.evex().vfmsub231sd(xmm_a, xmm_b, m); cc.evex().vfmsub231ss(xmm_a, xmm_b, m); cc.evex().vfmsubadd132pd(xmm_a, xmm_b, m); cc.evex().vfmsubadd132pd(ymm_a, ymm_b, m); cc.evex().vfmsubadd132pd(zmm_a, zmm_b, m); cc.evex().vfmsubadd132ps(xmm_a, xmm_b, m); cc.evex().vfmsubadd132ps(ymm_a, ymm_b, m); cc.evex().vfmsubadd132ps(zmm_a, zmm_b, m); cc.evex().vfmsubadd213pd(xmm_a, xmm_b, m); cc.evex().vfmsubadd213pd(ymm_a, ymm_b, m); cc.evex().vfmsubadd213pd(zmm_a, zmm_b, m); cc.evex().vfmsubadd213ps(xmm_a, xmm_b, m); cc.evex().vfmsubadd213ps(ymm_a, ymm_b, m); cc.evex().vfmsubadd213ps(zmm_a, zmm_b, m); cc.evex().vfmsubadd231pd(xmm_a, xmm_b, m); cc.evex().vfmsubadd231pd(ymm_a, ymm_b, m); cc.evex().vfmsubadd231pd(zmm_a, zmm_b, m); cc.evex().vfmsubadd231ps(xmm_a, xmm_b, m); cc.evex().vfmsubadd231ps(ymm_a, ymm_b, m); cc.evex().vfmsubadd231ps(zmm_a, zmm_b, m); cc.evex().vfnmadd132pd(xmm_a, xmm_b, m); cc.evex().vfnmadd132pd(ymm_a, ymm_b, m); cc.evex().vfnmadd132pd(zmm_a, zmm_b, m); cc.evex().vfnmadd132ps(xmm_a, xmm_b, m); cc.evex().vfnmadd132ps(ymm_a, ymm_b, m); cc.evex().vfnmadd132ps(zmm_a, zmm_b, m); cc.evex().vfnmadd132sd(xmm_a, xmm_b, m); cc.evex().vfnmadd132ss(xmm_a, xmm_b, m); cc.evex().vfnmadd213pd(xmm_a, xmm_b, m); cc.evex().vfnmadd213pd(ymm_a, ymm_b, m); cc.evex().vfnmadd213pd(zmm_a, zmm_b, m); cc.evex().vfnmadd213ps(xmm_a, xmm_b, m); cc.evex().vfnmadd213ps(ymm_a, ymm_b, m); cc.evex().vfnmadd213ps(zmm_a, zmm_b, m); cc.evex().vfnmadd213sd(xmm_a, xmm_b, m); cc.evex().vfnmadd213ss(xmm_a, xmm_b, m); cc.evex().vfnmadd231pd(xmm_a, xmm_b, m); cc.evex().vfnmadd231pd(ymm_a, ymm_b, m); cc.evex().vfnmadd231pd(zmm_a, zmm_b, m); cc.evex().vfnmadd231ps(xmm_a, xmm_b, m); cc.evex().vfnmadd231ps(ymm_a, ymm_b, m); cc.evex().vfnmadd231ps(zmm_a, zmm_b, m); cc.evex().vfnmadd231sd(xmm_a, xmm_b, m); cc.evex().vfnmadd231ss(xmm_a, xmm_b, m); cc.evex().vfnmsub132pd(xmm_a, xmm_b, m); cc.evex().vfnmsub132pd(ymm_a, ymm_b, m); 
cc.evex().vfnmsub132pd(zmm_a, zmm_b, m); cc.evex().vfnmsub132ps(xmm_a, xmm_b, m); cc.evex().vfnmsub132ps(ymm_a, ymm_b, m); cc.evex().vfnmsub132ps(zmm_a, zmm_b, m); cc.evex().vfnmsub132sd(xmm_a, xmm_b, m); cc.evex().vfnmsub132ss(xmm_a, xmm_b, m); cc.evex().vfnmsub213pd(xmm_a, xmm_b, m); cc.evex().vfnmsub213pd(ymm_a, ymm_b, m); cc.evex().vfnmsub213pd(zmm_a, zmm_b, m); cc.evex().vfnmsub213ps(xmm_a, xmm_b, m); cc.evex().vfnmsub213ps(ymm_a, ymm_b, m); cc.evex().vfnmsub213ps(zmm_a, zmm_b, m); cc.evex().vfnmsub213sd(xmm_a, xmm_b, m); cc.evex().vfnmsub213ss(xmm_a, xmm_b, m); cc.evex().vfnmsub231pd(xmm_a, xmm_b, m); cc.evex().vfnmsub231pd(ymm_a, ymm_b, m); cc.evex().vfnmsub231pd(zmm_a, zmm_b, m); cc.evex().vfnmsub231ps(xmm_a, xmm_b, m); cc.evex().vfnmsub231ps(ymm_a, ymm_b, m); cc.evex().vfnmsub231ps(zmm_a, zmm_b, m); cc.evex().vfnmsub231sd(xmm_a, xmm_b, m); cc.evex().vfnmsub231ss(xmm_a, xmm_b, m); cc.evex().vfpclasspd(kA, m128, 0); cc.evex().vfpclasspd(kA, m256, 0); cc.evex().vfpclasspd(kA, m512, 0); cc.evex().vfpclassps(kA, m128, 0); cc.evex().vfpclassps(kA, m256, 0); cc.evex().vfpclassps(kA, m512, 0); cc.evex().vfpclasssd(kA, m, 0); cc.evex().vfpclassss(kA, m, 0); cc.evex().k(kA).vgatherdpd(xmm_a, vx_ptr); cc.evex().k(kA).vgatherdpd(ymm_a, vx_ptr); cc.evex().k(kA).vgatherdpd(zmm_a, vy_ptr); cc.evex().k(kA).vgatherdps(xmm_a, vx_ptr); cc.evex().k(kA).vgatherdps(ymm_a, vy_ptr); cc.evex().k(kA).vgatherdps(zmm_a, vz_ptr); cc.evex().k(kA).vgatherqpd(xmm_a, vx_ptr); cc.evex().k(kA).vgatherqpd(ymm_a, vy_ptr); cc.evex().k(kA).vgatherqpd(zmm_a, vz_ptr); cc.evex().k(kA).vgatherqps(xmm_a, vx_ptr); cc.evex().k(kA).vgatherqps(xmm_a, vy_ptr); cc.evex().k(kA).vgatherqps(ymm_a, vz_ptr); cc.evex().vgetexppd(xmm_a, m); cc.evex().vgetexppd(ymm_a, m); cc.evex().vgetexppd(zmm_a, m); cc.evex().vgetexpps(xmm_a, m); cc.evex().vgetexpps(ymm_a, m); cc.evex().vgetexpps(zmm_a, m); cc.evex().vgetexpsd(xmm_a, xmm_b, m); cc.evex().vgetexpss(xmm_a, xmm_b, m); cc.evex().vgetmantpd(xmm_a, m, 0); cc.evex().vgetmantpd(ymm_a, m, 0); cc.evex().vgetmantpd(zmm_a, m, 0); cc.evex().vgetmantps(xmm_a, m, 0); cc.evex().vgetmantps(ymm_a, m, 0); cc.evex().vgetmantps(zmm_a, m, 0); cc.evex().vgetmantsd(xmm_a, xmm_b, m, 0); cc.evex().vgetmantss(xmm_a, xmm_b, m, 0); cc.evex().vinsertf32x4(ymm_a, ymm_b, m, 0); cc.evex().vinsertf32x4(zmm_a, zmm_b, m, 0); cc.evex().vinsertf32x8(zmm_a, zmm_b, m, 0); cc.evex().vinsertf64x2(ymm_a, ymm_b, m, 0); cc.evex().vinsertf64x2(zmm_a, zmm_b, m, 0); cc.evex().vinsertf64x4(zmm_a, zmm_b, m, 0); cc.evex().vinserti32x4(ymm_a, ymm_b, m, 0); cc.evex().vinserti32x4(zmm_a, zmm_b, m, 0); cc.evex().vinserti32x8(zmm_a, zmm_b, m, 0); cc.evex().vinserti64x2(ymm_a, ymm_b, m, 0); cc.evex().vinserti64x2(zmm_a, zmm_b, m, 0); cc.evex().vinserti64x4(zmm_a, zmm_b, m, 0); cc.evex().vinsertps(xmm_a, xmm_b, m, 0); cc.evex().vmaxpd(xmm_a, xmm_b, m); cc.evex().vmaxpd(ymm_a, ymm_b, m); cc.evex().vmaxpd(zmm_a, zmm_b, m); cc.evex().vmaxps(xmm_a, xmm_b, m); cc.evex().vmaxps(ymm_a, ymm_b, m); cc.evex().vmaxps(zmm_a, zmm_b, m); cc.evex().vmaxsd(xmm_a, xmm_b, m); cc.evex().vmaxss(xmm_a, xmm_b, m); cc.evex().vminpd(xmm_a, xmm_b, m); cc.evex().vminpd(ymm_a, ymm_b, m); cc.evex().vminpd(zmm_a, zmm_b, m); cc.evex().vminps(xmm_a, xmm_b, m); cc.evex().vminps(ymm_a, ymm_b, m); cc.evex().vminps(zmm_a, zmm_b, m); cc.evex().vminsd(xmm_a, xmm_b, m); cc.evex().vminss(xmm_a, xmm_b, m); cc.evex().vmovapd(xmm_a, m); cc.evex().vmovapd(m, xmm_b); cc.evex().vmovapd(ymm_a, m); cc.evex().vmovapd(m, ymm_b); cc.evex().vmovapd(zmm_a, m); cc.evex().vmovapd(m, zmm_b); 
cc.evex().vmovaps(xmm_a, m); cc.evex().vmovaps(m, xmm_b); cc.evex().vmovaps(ymm_a, m); cc.evex().vmovaps(m, ymm_b); cc.evex().vmovaps(zmm_a, m); cc.evex().vmovaps(m, zmm_b); cc.evex().vmovd(m, xmm_b); cc.evex().vmovd(xmm_a, m); cc.evex().vmovddup(xmm_a, m); cc.evex().vmovddup(ymm_a, m); cc.evex().vmovddup(zmm_a, m); cc.evex().vmovdqa32(xmm_a, m); cc.evex().vmovdqa32(m, xmm_b); cc.evex().vmovdqa32(ymm_a, m); cc.evex().vmovdqa32(m, ymm_b); cc.evex().vmovdqa32(zmm_a, m); cc.evex().vmovdqa32(m, zmm_b); cc.evex().vmovdqa64(xmm_a, m); cc.evex().vmovdqa64(m, xmm_b); cc.evex().vmovdqa64(ymm_a, m); cc.evex().vmovdqa64(m, ymm_b); cc.evex().vmovdqa64(zmm_a, m); cc.evex().vmovdqa64(m, zmm_b); cc.evex().vmovdqu16(xmm_a, m); cc.evex().vmovdqu16(m, xmm_b); cc.evex().vmovdqu16(ymm_a, m); cc.evex().vmovdqu16(m, ymm_b); cc.evex().vmovdqu16(zmm_a, m); cc.evex().vmovdqu16(m, zmm_b); cc.evex().vmovdqu32(xmm_a, m); cc.evex().vmovdqu32(m, xmm_b); cc.evex().vmovdqu32(ymm_a, m); cc.evex().vmovdqu32(m, ymm_b); cc.evex().vmovdqu32(zmm_a, m); cc.evex().vmovdqu32(m, zmm_b); cc.evex().vmovdqu64(xmm_a, m); cc.evex().vmovdqu64(m, xmm_b); cc.evex().vmovdqu64(ymm_a, m); cc.evex().vmovdqu64(m, ymm_b); cc.evex().vmovdqu64(zmm_a, m); cc.evex().vmovdqu64(m, zmm_b); cc.evex().vmovdqu8(xmm_a, m); cc.evex().vmovdqu8(m, xmm_b); cc.evex().vmovdqu8(ymm_a, m); cc.evex().vmovdqu8(m, ymm_b); cc.evex().vmovdqu8(zmm_a, m); cc.evex().vmovdqu8(m, zmm_b); cc.evex().vmovhpd(m, xmm_b); cc.evex().vmovhpd(xmm_a, xmm_b, m); cc.evex().vmovhps(m, xmm_b); cc.evex().vmovhps(xmm_a, xmm_b, m); cc.evex().vmovlpd(m, xmm_b); cc.evex().vmovlpd(xmm_a, xmm_b, m); cc.evex().vmovlps(m, xmm_b); cc.evex().vmovlps(xmm_a, xmm_b, m); cc.evex().vmovntdq(m, xmm_b); cc.evex().vmovntdq(m, ymm_b); cc.evex().vmovntdq(m, zmm_b); cc.evex().vmovntdqa(xmm_a, m); cc.evex().vmovntdqa(ymm_a, m); cc.evex().vmovntdqa(zmm_a, m); cc.evex().vmovntpd(m, xmm_b); cc.evex().vmovntpd(m, ymm_b); cc.evex().vmovntpd(m, zmm_b); cc.evex().vmovntps(m, xmm_b); cc.evex().vmovntps(m, ymm_b); cc.evex().vmovntps(m, zmm_b); cc.evex().vmovq(m, xmm_b); cc.evex().vmovq(xmm_a, m); cc.evex().vmovq(xmm_a, m); cc.evex().vmovq(m, xmm_b); cc.evex().vmovsd(m, xmm_b); cc.evex().vmovsd(xmm_a, m); cc.evex().vmovshdup(xmm_a, m); cc.evex().vmovshdup(ymm_a, m); cc.evex().vmovshdup(zmm_a, m); cc.evex().vmovsldup(xmm_a, m); cc.evex().vmovsldup(ymm_a, m); cc.evex().vmovsldup(zmm_a, m); cc.evex().vmovss(m, xmm_b); cc.evex().vmovss(xmm_a, m); cc.evex().vmovupd(xmm_a, m); cc.evex().vmovupd(m, xmm_b); cc.evex().vmovupd(ymm_a, m); cc.evex().vmovupd(m, ymm_b); cc.evex().vmovupd(zmm_a, m); cc.evex().vmovupd(m, zmm_b); cc.evex().vmovups(xmm_a, m); cc.evex().vmovups(m, xmm_b); cc.evex().vmovups(ymm_a, m); cc.evex().vmovups(m, ymm_b); cc.evex().vmovups(zmm_a, m); cc.evex().vmovups(m, zmm_b); cc.evex().vmulpd(xmm_a, xmm_b, m); cc.evex().vmulpd(ymm_a, ymm_b, m); cc.evex().vmulpd(zmm_a, zmm_b, m); cc.evex().vmulps(xmm_a, xmm_b, m); cc.evex().vmulps(ymm_a, ymm_b, m); cc.evex().vmulps(zmm_a, zmm_b, m); cc.evex().vmulsd(xmm_a, xmm_b, m); cc.evex().vmulss(xmm_a, xmm_b, m); cc.evex().vorpd(xmm_a, xmm_b, m); cc.evex().vorpd(ymm_a, ymm_b, m); cc.evex().vorpd(zmm_a, zmm_b, m); cc.evex().vorps(xmm_a, xmm_b, m); cc.evex().vorps(ymm_a, ymm_b, m); cc.evex().vorps(zmm_a, zmm_b, m); cc.evex().vpabsb(xmm_a, m); cc.evex().vpabsb(ymm_a, m); cc.evex().vpabsb(zmm_a, m); cc.evex().vpabsd(xmm_a, m); cc.evex().vpabsd(ymm_a, m); cc.evex().vpabsd(zmm_a, m); cc.evex().vpabsq(xmm_a, m); cc.evex().vpabsq(ymm_a, m); cc.evex().vpabsq(zmm_a, m); 
cc.evex().vpabsw(xmm_a, m); cc.evex().vpabsw(ymm_a, m); cc.evex().vpabsw(zmm_a, m); cc.evex().vpackssdw(xmm_a, xmm_b, m); cc.evex().vpackssdw(ymm_a, ymm_b, m); cc.evex().vpackssdw(zmm_a, zmm_b, m); cc.evex().vpacksswb(xmm_a, xmm_b, m); cc.evex().vpacksswb(ymm_a, ymm_b, m); cc.evex().vpacksswb(zmm_a, zmm_b, m); cc.evex().vpackusdw(xmm_a, xmm_b, m); cc.evex().vpackusdw(ymm_a, ymm_b, m); cc.evex().vpackusdw(zmm_a, zmm_b, m); cc.evex().vpackuswb(xmm_a, xmm_b, m); cc.evex().vpackuswb(ymm_a, ymm_b, m); cc.evex().vpackuswb(zmm_a, zmm_b, m); cc.evex().vpaddb(xmm_a, xmm_b, m); cc.evex().vpaddb(ymm_a, ymm_b, m); cc.evex().vpaddb(zmm_a, zmm_b, m); cc.evex().vpaddd(xmm_a, xmm_b, m); cc.evex().vpaddd(ymm_a, ymm_b, m); cc.evex().vpaddd(zmm_a, zmm_b, m); cc.evex().vpaddq(xmm_a, xmm_b, m); cc.evex().vpaddq(ymm_a, ymm_b, m); cc.evex().vpaddq(zmm_a, zmm_b, m); cc.evex().vpaddsb(xmm_a, xmm_b, m); cc.evex().vpaddsb(ymm_a, ymm_b, m); cc.evex().vpaddsb(zmm_a, zmm_b, m); cc.evex().vpaddsw(xmm_a, xmm_b, m); cc.evex().vpaddsw(ymm_a, ymm_b, m); cc.evex().vpaddsw(zmm_a, zmm_b, m); cc.evex().vpaddusb(xmm_a, xmm_b, m); cc.evex().vpaddusb(ymm_a, ymm_b, m); cc.evex().vpaddusb(zmm_a, zmm_b, m); cc.evex().vpaddusw(xmm_a, xmm_b, m); cc.evex().vpaddusw(ymm_a, ymm_b, m); cc.evex().vpaddusw(zmm_a, zmm_b, m); cc.evex().vpaddw(xmm_a, xmm_b, m); cc.evex().vpaddw(ymm_a, ymm_b, m); cc.evex().vpaddw(zmm_a, zmm_b, m); cc.evex().vpalignr(xmm_a, xmm_b, m, 0); cc.evex().vpalignr(ymm_a, ymm_b, m, 0); cc.evex().vpalignr(zmm_a, zmm_b, m, 0); cc.evex().vpandd(xmm_a, xmm_b, m); cc.evex().vpandd(ymm_a, ymm_b, m); cc.evex().vpandd(zmm_a, zmm_b, m); cc.evex().vpandnd(xmm_a, xmm_b, m); cc.evex().vpandnd(ymm_a, ymm_b, m); cc.evex().vpandnd(zmm_a, zmm_b, m); cc.evex().vpandnq(xmm_a, xmm_b, m); cc.evex().vpandnq(ymm_a, ymm_b, m); cc.evex().vpandnq(zmm_a, zmm_b, m); cc.evex().vpandq(xmm_a, xmm_b, m); cc.evex().vpandq(ymm_a, ymm_b, m); cc.evex().vpandq(zmm_a, zmm_b, m); cc.evex().vpavgb(xmm_a, xmm_b, m); cc.evex().vpavgb(ymm_a, ymm_b, m); cc.evex().vpavgb(zmm_a, zmm_b, m); cc.evex().vpavgw(xmm_a, xmm_b, m); cc.evex().vpavgw(ymm_a, ymm_b, m); cc.evex().vpavgw(zmm_a, zmm_b, m); cc.evex().vpblendmb(xmm_a, xmm_b, m); cc.evex().vpblendmb(ymm_a, ymm_b, m); cc.evex().vpblendmb(zmm_a, zmm_b, m); cc.evex().vpblendmd(xmm_a, xmm_b, m); cc.evex().vpblendmd(ymm_a, ymm_b, m); cc.evex().vpblendmd(zmm_a, zmm_b, m); cc.evex().vpblendmq(xmm_a, xmm_b, m); cc.evex().vpblendmq(ymm_a, ymm_b, m); cc.evex().vpblendmq(zmm_a, zmm_b, m); cc.evex().vpblendmw(xmm_a, xmm_b, m); cc.evex().vpblendmw(ymm_a, ymm_b, m); cc.evex().vpblendmw(zmm_a, zmm_b, m); cc.evex().vpbroadcastb(xmm_a, m); cc.evex().vpbroadcastb(ymm_a, m); cc.evex().vpbroadcastb(zmm_a, m); cc.evex().vpbroadcastd(xmm_a, m); cc.evex().vpbroadcastd(ymm_a, m); cc.evex().vpbroadcastd(zmm_a, m); cc.evex().vpbroadcastq(xmm_a, m); cc.evex().vpbroadcastq(ymm_a, m); cc.evex().vpbroadcastq(zmm_a, m); cc.evex().vpbroadcastw(xmm_a, m); cc.evex().vpbroadcastw(ymm_a, m); cc.evex().vpbroadcastw(zmm_a, m); cc.evex().vpcmpb(kA, xmm_b, m, 0); cc.evex().vpcmpb(kA, ymm_b, m, 0); cc.evex().vpcmpb(kA, zmm_b, m, 0); cc.evex().vpcmpd(kA, xmm_b, m, 0); cc.evex().vpcmpd(kA, ymm_b, m, 0); cc.evex().vpcmpd(kA, zmm_b, m, 0); cc.evex().vpcmpeqb(kA, xmm_b, m); cc.evex().vpcmpeqb(kA, ymm_b, m); cc.evex().vpcmpeqb(kA, zmm_b, m); cc.evex().vpcmpeqd(kA, xmm_b, m); cc.evex().vpcmpeqd(kA, ymm_b, m); cc.evex().vpcmpeqd(kA, zmm_b, m); cc.evex().vpcmpeqq(kA, xmm_b, m); cc.evex().vpcmpeqq(kA, ymm_b, m); cc.evex().vpcmpeqq(kA, zmm_b, m); 
cc.evex().vpcmpeqw(kA, xmm_b, m); cc.evex().vpcmpeqw(kA, ymm_b, m); cc.evex().vpcmpeqw(kA, zmm_b, m); cc.evex().vpcmpgtb(kA, xmm_b, m); cc.evex().vpcmpgtb(kA, ymm_b, m); cc.evex().vpcmpgtb(kA, zmm_b, m); cc.evex().vpcmpgtd(kA, xmm_b, m); cc.evex().vpcmpgtd(kA, ymm_b, m); cc.evex().vpcmpgtd(kA, zmm_b, m); cc.evex().vpcmpgtq(kA, xmm_b, m); cc.evex().vpcmpgtq(kA, ymm_b, m); cc.evex().vpcmpgtq(kA, zmm_b, m); cc.evex().vpcmpgtw(kA, xmm_b, m); cc.evex().vpcmpgtw(kA, ymm_b, m); cc.evex().vpcmpgtw(kA, zmm_b, m); cc.evex().vpcmpq(kA, xmm_b, m, 0); cc.evex().vpcmpq(kA, ymm_b, m, 0); cc.evex().vpcmpq(kA, zmm_b, m, 0); cc.evex().vpcmpub(kA, xmm_b, m, 0); cc.evex().vpcmpub(kA, ymm_b, m, 0); cc.evex().vpcmpub(kA, zmm_b, m, 0); cc.evex().vpcmpud(kA, xmm_b, m, 0); cc.evex().vpcmpud(kA, ymm_b, m, 0); cc.evex().vpcmpud(kA, zmm_b, m, 0); cc.evex().vpcmpuq(kA, xmm_b, m, 0); cc.evex().vpcmpuq(kA, ymm_b, m, 0); cc.evex().vpcmpuq(kA, zmm_b, m, 0); cc.evex().vpcmpuw(kA, xmm_b, m, 0); cc.evex().vpcmpuw(kA, ymm_b, m, 0); cc.evex().vpcmpuw(kA, zmm_b, m, 0); cc.evex().vpcmpw(kA, xmm_b, m, 0); cc.evex().vpcmpw(kA, ymm_b, m, 0); cc.evex().vpcmpw(kA, zmm_b, m, 0); cc.evex().vpcompressd(m, xmm_b); cc.evex().vpcompressd(m, ymm_b); cc.evex().vpcompressd(m, zmm_b); cc.evex().vpcompressq(m, xmm_b); cc.evex().vpcompressq(m, ymm_b); cc.evex().vpcompressq(m, zmm_b); cc.evex().vpconflictd(xmm_a, m); cc.evex().vpconflictd(ymm_a, m); cc.evex().vpconflictd(zmm_a, m); cc.evex().vpconflictq(xmm_a, m); cc.evex().vpconflictq(ymm_a, m); cc.evex().vpconflictq(zmm_a, m); cc.evex().vpermb(xmm_a, xmm_b, m); cc.evex().vpermb(ymm_a, ymm_b, m); cc.evex().vpermb(zmm_a, zmm_b, m); cc.evex().vpermd(ymm_a, ymm_b, m); cc.evex().vpermd(zmm_a, zmm_b, m); cc.evex().vpermi2b(xmm_a, xmm_b, m); cc.evex().vpermi2b(ymm_a, ymm_b, m); cc.evex().vpermi2b(zmm_a, zmm_b, m); cc.evex().vpermi2d(xmm_a, xmm_b, m); cc.evex().vpermi2d(ymm_a, ymm_b, m); cc.evex().vpermi2d(zmm_a, zmm_b, m); cc.evex().vpermi2pd(xmm_a, xmm_b, m); cc.evex().vpermi2pd(ymm_a, ymm_b, m); cc.evex().vpermi2pd(zmm_a, zmm_b, m); cc.evex().vpermi2ps(xmm_a, xmm_b, m); cc.evex().vpermi2ps(ymm_a, ymm_b, m); cc.evex().vpermi2ps(zmm_a, zmm_b, m); cc.evex().vpermi2q(xmm_a, xmm_b, m); cc.evex().vpermi2q(ymm_a, ymm_b, m); cc.evex().vpermi2q(zmm_a, zmm_b, m); cc.evex().vpermi2w(xmm_a, xmm_b, m); cc.evex().vpermi2w(ymm_a, ymm_b, m); cc.evex().vpermi2w(zmm_a, zmm_b, m); cc.evex().vpermilpd(xmm_a, xmm_b, m); cc.evex().vpermilpd(ymm_a, ymm_b, m); cc.evex().vpermilpd(zmm_a, zmm_b, m); cc.evex().vpermilpd(xmm_a, m, 0); cc.evex().vpermilpd(ymm_a, m, 0); cc.evex().vpermilpd(zmm_a, m, 0); cc.evex().vpermilps(xmm_a, xmm_b, m); cc.evex().vpermilps(ymm_a, ymm_b, m); cc.evex().vpermilps(zmm_a, zmm_b, m); cc.evex().vpermilps(xmm_a, m, 0); cc.evex().vpermilps(ymm_a, m, 0); cc.evex().vpermilps(zmm_a, m, 0); cc.evex().vpermq(ymm_a, ymm_b, m); cc.evex().vpermq(zmm_a, zmm_b, m); cc.evex().vpermq(ymm_a, m, 0); cc.evex().vpermq(zmm_a, m, 0); cc.evex().vpermt2b(xmm_a, xmm_b, m); cc.evex().vpermt2b(ymm_a, ymm_b, m); cc.evex().vpermt2b(zmm_a, zmm_b, m); cc.evex().vpermt2d(xmm_a, xmm_b, m); cc.evex().vpermt2d(ymm_a, ymm_b, m); cc.evex().vpermt2d(zmm_a, zmm_b, m); cc.evex().vpermt2pd(xmm_a, xmm_b, m); cc.evex().vpermt2pd(ymm_a, ymm_b, m); cc.evex().vpermt2pd(zmm_a, zmm_b, m); cc.evex().vpermt2ps(xmm_a, xmm_b, m); cc.evex().vpermt2ps(ymm_a, ymm_b, m); cc.evex().vpermt2ps(zmm_a, zmm_b, m); cc.evex().vpermt2q(xmm_a, xmm_b, m); cc.evex().vpermt2q(ymm_a, ymm_b, m); cc.evex().vpermt2q(zmm_a, zmm_b, m); cc.evex().vpermt2w(xmm_a, xmm_b, 
m); cc.evex().vpermt2w(ymm_a, ymm_b, m); cc.evex().vpermt2w(zmm_a, zmm_b, m); cc.evex().vpermw(xmm_a, xmm_b, m); cc.evex().vpermw(ymm_a, ymm_b, m); cc.evex().vpermw(zmm_a, zmm_b, m); cc.evex().vpexpandd(xmm_a, m); cc.evex().vpexpandd(ymm_a, m); cc.evex().vpexpandd(zmm_a, m); cc.evex().vpexpandq(xmm_a, m); cc.evex().vpexpandq(ymm_a, m); cc.evex().vpexpandq(zmm_a, m); cc.evex().vpextrb(m, xmm_b, 0); cc.evex().vpextrd(m, xmm_b, 0); if (cc.is_64bit()) cc.evex().vpextrq(m, xmm_b, 0); cc.evex().vpextrw(m, xmm_b, 0); cc.evex().k(kA).vpgatherdd(xmm_a, vx_ptr); cc.evex().k(kA).vpgatherdd(ymm_a, vy_ptr); cc.evex().k(kA).vpgatherdd(zmm_a, vz_ptr); cc.evex().k(kA).vpgatherdq(xmm_a, vx_ptr); cc.evex().k(kA).vpgatherdq(ymm_a, vx_ptr); cc.evex().k(kA).vpgatherdq(zmm_a, vy_ptr); cc.evex().k(kA).vpgatherqd(xmm_a, vx_ptr); cc.evex().k(kA).vpgatherqd(xmm_a, vy_ptr); cc.evex().k(kA).vpgatherqd(ymm_a, vz_ptr); cc.evex().k(kA).vpgatherqq(xmm_a, vx_ptr); cc.evex().k(kA).vpgatherqq(ymm_a, vy_ptr); cc.evex().k(kA).vpgatherqq(zmm_a, vz_ptr); cc.evex().vpinsrb(xmm_a, xmm_b, m, 0); cc.evex().vpinsrd(xmm_a, xmm_b, m, 0); if (cc.is_64bit()) cc.evex().vpinsrq(xmm_a, xmm_b, m, 0); cc.evex().vpinsrw(xmm_a, xmm_b, m, 0); cc.evex().vplzcntd(xmm_a, m); cc.evex().vplzcntd(ymm_a, m); cc.evex().vplzcntd(zmm_a, m); cc.evex().vplzcntq(xmm_a, m); cc.evex().vplzcntq(ymm_a, m); cc.evex().vplzcntq(zmm_a, m); cc.evex().vpmadd52huq(xmm_a, xmm_b, m); cc.evex().vpmadd52huq(ymm_a, ymm_b, m); cc.evex().vpmadd52huq(zmm_a, zmm_b, m); cc.evex().vpmadd52luq(xmm_a, xmm_b, m); cc.evex().vpmadd52luq(ymm_a, ymm_b, m); cc.evex().vpmadd52luq(zmm_a, zmm_b, m); cc.evex().vpmaddubsw(xmm_a, xmm_b, m); cc.evex().vpmaddubsw(ymm_a, ymm_b, m); cc.evex().vpmaddubsw(zmm_a, zmm_b, m); cc.evex().vpmaddwd(xmm_a, xmm_b, m); cc.evex().vpmaddwd(ymm_a, ymm_b, m); cc.evex().vpmaddwd(zmm_a, zmm_b, m); cc.evex().vpmaxsb(xmm_a, xmm_b, m); cc.evex().vpmaxsb(ymm_a, ymm_b, m); cc.evex().vpmaxsb(zmm_a, zmm_b, m); cc.evex().vpmaxsd(xmm_a, xmm_b, m); cc.evex().vpmaxsd(ymm_a, ymm_b, m); cc.evex().vpmaxsd(zmm_a, zmm_b, m); cc.evex().vpmaxsq(xmm_a, xmm_b, m); cc.evex().vpmaxsq(ymm_a, ymm_b, m); cc.evex().vpmaxsq(zmm_a, zmm_b, m); cc.evex().vpmaxsw(xmm_a, xmm_b, m); cc.evex().vpmaxsw(ymm_a, ymm_b, m); cc.evex().vpmaxsw(zmm_a, zmm_b, m); cc.evex().vpmaxub(xmm_a, xmm_b, m); cc.evex().vpmaxub(ymm_a, ymm_b, m); cc.evex().vpmaxub(zmm_a, zmm_b, m); cc.evex().vpmaxud(xmm_a, xmm_b, m); cc.evex().vpmaxud(ymm_a, ymm_b, m); cc.evex().vpmaxud(zmm_a, zmm_b, m); cc.evex().vpmaxuq(xmm_a, xmm_b, m); cc.evex().vpmaxuq(ymm_a, ymm_b, m); cc.evex().vpmaxuq(zmm_a, zmm_b, m); cc.evex().vpmaxuw(xmm_a, xmm_b, m); cc.evex().vpmaxuw(ymm_a, ymm_b, m); cc.evex().vpmaxuw(zmm_a, zmm_b, m); cc.evex().vpminsb(xmm_a, xmm_b, m); cc.evex().vpminsb(ymm_a, ymm_b, m); cc.evex().vpminsb(zmm_a, zmm_b, m); cc.evex().vpminsd(xmm_a, xmm_b, m); cc.evex().vpminsd(ymm_a, ymm_b, m); cc.evex().vpminsd(zmm_a, zmm_b, m); cc.evex().vpminsq(xmm_a, xmm_b, m); cc.evex().vpminsq(ymm_a, ymm_b, m); cc.evex().vpminsq(zmm_a, zmm_b, m); cc.evex().vpminsw(xmm_a, xmm_b, m); cc.evex().vpminsw(ymm_a, ymm_b, m); cc.evex().vpminsw(zmm_a, zmm_b, m); cc.evex().vpminub(xmm_a, xmm_b, m); cc.evex().vpminub(ymm_a, ymm_b, m); cc.evex().vpminub(zmm_a, zmm_b, m); cc.evex().vpminud(xmm_a, xmm_b, m); cc.evex().vpminud(ymm_a, ymm_b, m); cc.evex().vpminud(zmm_a, zmm_b, m); cc.evex().vpminuq(xmm_a, xmm_b, m); cc.evex().vpminuq(ymm_a, ymm_b, m); cc.evex().vpminuq(zmm_a, zmm_b, m); cc.evex().vpminuw(xmm_a, xmm_b, m); cc.evex().vpminuw(ymm_a, ymm_b, m); 
cc.evex().vpminuw(zmm_a, zmm_b, m); cc.evex().vpmovdb(m, xmm_b); cc.evex().vpmovdb(m, ymm_b); cc.evex().vpmovdb(m, zmm_b); cc.evex().vpmovdw(m, xmm_b); cc.evex().vpmovdw(m, ymm_b); cc.evex().vpmovdw(m, zmm_b); cc.evex().vpmovqb(m, xmm_b); cc.evex().vpmovqb(m, ymm_b); cc.evex().vpmovqb(m, zmm_b); cc.evex().vpmovqd(m, xmm_b); cc.evex().vpmovqd(m, ymm_b); cc.evex().vpmovqd(m, zmm_b); cc.evex().vpmovqw(m, xmm_b); cc.evex().vpmovqw(m, ymm_b); cc.evex().vpmovqw(m, zmm_b); cc.evex().vpmovsdb(m, xmm_b); cc.evex().vpmovsdb(m, ymm_b); cc.evex().vpmovsdb(m, zmm_b); cc.evex().vpmovsdw(m, xmm_b); cc.evex().vpmovsdw(m, ymm_b); cc.evex().vpmovsdw(m, zmm_b); cc.evex().vpmovsqb(m, xmm_b); cc.evex().vpmovsqb(m, ymm_b); cc.evex().vpmovsqb(m, zmm_b); cc.evex().vpmovsqd(m, xmm_b); cc.evex().vpmovsqd(m, ymm_b); cc.evex().vpmovsqd(m, zmm_b); cc.evex().vpmovsqw(m, xmm_b); cc.evex().vpmovsqw(m, ymm_b); cc.evex().vpmovsqw(m, zmm_b); cc.evex().vpmovswb(m, xmm_b); cc.evex().vpmovswb(m, ymm_b); cc.evex().vpmovswb(m, zmm_b); cc.evex().vpmovsxbd(xmm_a, m); cc.evex().vpmovsxbd(ymm_a, m); cc.evex().vpmovsxbd(zmm_a, m); cc.evex().vpmovsxbq(xmm_a, m); cc.evex().vpmovsxbq(ymm_a, m); cc.evex().vpmovsxbq(zmm_a, m); cc.evex().vpmovsxbw(xmm_a, m); cc.evex().vpmovsxbw(ymm_a, m); cc.evex().vpmovsxbw(zmm_a, m); cc.evex().vpmovsxdq(xmm_a, m); cc.evex().vpmovsxdq(ymm_a, m); cc.evex().vpmovsxdq(zmm_a, m); cc.evex().vpmovsxwd(xmm_a, m); cc.evex().vpmovsxwd(ymm_a, m); cc.evex().vpmovsxwd(zmm_a, m); cc.evex().vpmovsxwq(xmm_a, m); cc.evex().vpmovsxwq(ymm_a, m); cc.evex().vpmovsxwq(zmm_a, m); cc.evex().vpmovusdb(m, xmm_b); cc.evex().vpmovusdb(m, ymm_b); cc.evex().vpmovusdb(m, zmm_b); cc.evex().vpmovusdw(m, xmm_b); cc.evex().vpmovusdw(m, ymm_b); cc.evex().vpmovusdw(m, zmm_b); cc.evex().vpmovusqb(m, xmm_b); cc.evex().vpmovusqb(m, ymm_b); cc.evex().vpmovusqb(m, zmm_b); cc.evex().vpmovusqd(m, xmm_b); cc.evex().vpmovusqd(m, ymm_b); cc.evex().vpmovusqd(m, zmm_b); cc.evex().vpmovusqw(m, xmm_b); cc.evex().vpmovusqw(m, ymm_b); cc.evex().vpmovusqw(m, zmm_b); cc.evex().vpmovuswb(m, xmm_b); cc.evex().vpmovuswb(m, ymm_b); cc.evex().vpmovuswb(m, zmm_b); cc.evex().vpmovwb(m, xmm_b); cc.evex().vpmovwb(m, ymm_b); cc.evex().vpmovwb(m, zmm_b); cc.evex().vpmovzxbd(xmm_a, m); cc.evex().vpmovzxbd(ymm_a, m); cc.evex().vpmovzxbd(zmm_a, m); cc.evex().vpmovzxbq(xmm_a, m); cc.evex().vpmovzxbq(ymm_a, m); cc.evex().vpmovzxbq(zmm_a, m); cc.evex().vpmovzxbw(xmm_a, m); cc.evex().vpmovzxbw(ymm_a, m); cc.evex().vpmovzxbw(zmm_a, m); cc.evex().vpmovzxdq(xmm_a, m); cc.evex().vpmovzxdq(ymm_a, m); cc.evex().vpmovzxdq(zmm_a, m); cc.evex().vpmovzxwd(xmm_a, m); cc.evex().vpmovzxwd(ymm_a, m); cc.evex().vpmovzxwd(zmm_a, m); cc.evex().vpmovzxwq(xmm_a, m); cc.evex().vpmovzxwq(ymm_a, m); cc.evex().vpmovzxwq(zmm_a, m); cc.evex().vpmuldq(xmm_a, xmm_b, m); cc.evex().vpmuldq(ymm_a, ymm_b, m); cc.evex().vpmuldq(zmm_a, zmm_b, m); cc.evex().vpmulhrsw(xmm_a, xmm_b, m); cc.evex().vpmulhrsw(ymm_a, ymm_b, m); cc.evex().vpmulhrsw(zmm_a, zmm_b, m); cc.evex().vpmulhuw(xmm_a, xmm_b, m); cc.evex().vpmulhuw(ymm_a, ymm_b, m); cc.evex().vpmulhuw(zmm_a, zmm_b, m); cc.evex().vpmulhw(xmm_a, xmm_b, m); cc.evex().vpmulhw(ymm_a, ymm_b, m); cc.evex().vpmulhw(zmm_a, zmm_b, m); cc.evex().vpmulld(xmm_a, xmm_b, m); cc.evex().vpmulld(ymm_a, ymm_b, m); cc.evex().vpmulld(zmm_a, zmm_b, m); cc.evex().vpmullq(xmm_a, xmm_b, m); cc.evex().vpmullq(ymm_a, ymm_b, m); cc.evex().vpmullq(zmm_a, zmm_b, m); cc.evex().vpmullw(xmm_a, xmm_b, m); cc.evex().vpmullw(ymm_a, ymm_b, m); cc.evex().vpmullw(zmm_a, zmm_b, m); 
cc.evex().vpmultishiftqb(xmm_a, xmm_b, m); cc.evex().vpmultishiftqb(ymm_a, ymm_b, m); cc.evex().vpmultishiftqb(zmm_a, zmm_b, m); cc.evex().vpmuludq(xmm_a, xmm_b, m); cc.evex().vpmuludq(ymm_a, ymm_b, m); cc.evex().vpmuludq(zmm_a, zmm_b, m); cc.evex().vpopcntd(zmm_a, m); cc.evex().vpopcntq(zmm_a, m); cc.evex().vpord(xmm_a, xmm_b, m); cc.evex().vpord(ymm_a, ymm_b, m); cc.evex().vpord(zmm_a, zmm_b, m); cc.evex().vporq(xmm_a, xmm_b, m); cc.evex().vporq(ymm_a, ymm_b, m); cc.evex().vporq(zmm_a, zmm_b, m); cc.evex().vprold(xmm_a, m, 0); cc.evex().vprold(ymm_a, m, 0); cc.evex().vprold(zmm_a, m, 0); cc.evex().vprolq(xmm_a, m, 0); cc.evex().vprolq(ymm_a, m, 0); cc.evex().vprolq(zmm_a, m, 0); cc.evex().vprolvd(xmm_a, xmm_b, m); cc.evex().vprolvd(ymm_a, ymm_b, m); cc.evex().vprolvd(zmm_a, zmm_b, m); cc.evex().vprolvq(xmm_a, xmm_b, m); cc.evex().vprolvq(ymm_a, ymm_b, m); cc.evex().vprolvq(zmm_a, zmm_b, m); cc.evex().vprord(xmm_a, m, 0); cc.evex().vprord(ymm_a, m, 0); cc.evex().vprord(zmm_a, m, 0); cc.evex().vprorq(xmm_a, m, 0); cc.evex().vprorq(ymm_a, m, 0); cc.evex().vprorq(zmm_a, m, 0); cc.evex().vprorvd(xmm_a, xmm_b, m); cc.evex().vprorvd(ymm_a, ymm_b, m); cc.evex().vprorvd(zmm_a, zmm_b, m); cc.evex().vprorvq(xmm_a, xmm_b, m); cc.evex().vprorvq(ymm_a, ymm_b, m); cc.evex().vprorvq(zmm_a, zmm_b, m); cc.evex().vpsadbw(xmm_a, xmm_b, m); cc.evex().vpsadbw(ymm_a, ymm_b, m); cc.evex().vpsadbw(zmm_a, zmm_b, m); cc.evex().k(kA).vpscatterdd(vx_ptr, xmm_b); cc.evex().k(kA).vpscatterdd(vy_ptr, ymm_b); cc.evex().k(kA).vpscatterdd(vz_ptr, zmm_b); cc.evex().k(kA).vpscatterdq(vx_ptr, xmm_b); cc.evex().k(kA).vpscatterdq(vx_ptr, ymm_b); cc.evex().k(kA).vpscatterdq(vy_ptr, zmm_b); cc.evex().k(kA).vpscatterqd(vx_ptr, xmm_b); cc.evex().k(kA).vpscatterqd(vy_ptr, xmm_b); cc.evex().k(kA).vpscatterqd(vz_ptr, ymm_b); cc.evex().k(kA).vpscatterqq(vx_ptr, xmm_b); cc.evex().k(kA).vpscatterqq(vy_ptr, ymm_b); cc.evex().k(kA).vpscatterqq(vz_ptr, zmm_b); cc.evex().vpshufb(xmm_a, xmm_b, m); cc.evex().vpshufb(ymm_a, ymm_b, m); cc.evex().vpshufb(zmm_a, zmm_b, m); cc.evex().vpshufd(xmm_a, m, 0); cc.evex().vpshufd(ymm_a, m, 0); cc.evex().vpshufd(zmm_a, m, 0); cc.evex().vpshufhw(xmm_a, m, 0); cc.evex().vpshufhw(ymm_a, m, 0); cc.evex().vpshufhw(zmm_a, m, 0); cc.evex().vpshuflw(xmm_a, m, 0); cc.evex().vpshuflw(ymm_a, m, 0); cc.evex().vpshuflw(zmm_a, m, 0); cc.evex().vpslld(xmm_a, xmm_b, m); cc.evex().vpslld(xmm_a, m, 0); cc.evex().vpslld(ymm_a, ymm_b, m); cc.evex().vpslld(ymm_a, m, 0); cc.evex().vpslld(zmm_a, zmm_b, m); cc.evex().vpslld(zmm_a, m, 0); cc.evex().vpslldq(xmm_a, m, 0); cc.evex().vpslldq(ymm_a, m, 0); cc.evex().vpslldq(zmm_a, m, 0); cc.evex().vpsllq(xmm_a, xmm_b, m); cc.evex().vpsllq(xmm_a, m, 0); cc.evex().vpsllq(ymm_a, ymm_b, m); cc.evex().vpsllq(ymm_a, m, 0); cc.evex().vpsllq(zmm_a, zmm_b, m); cc.evex().vpsllq(zmm_a, m, 0); cc.evex().vpsllvd(xmm_a, xmm_b, m); cc.evex().vpsllvd(ymm_a, ymm_b, m); cc.evex().vpsllvd(zmm_a, zmm_b, m); cc.evex().vpsllvq(xmm_a, xmm_b, m); cc.evex().vpsllvq(ymm_a, ymm_b, m); cc.evex().vpsllvq(zmm_a, zmm_b, m); cc.evex().vpsllvw(xmm_a, xmm_b, m); cc.evex().vpsllvw(ymm_a, ymm_b, m); cc.evex().vpsllvw(zmm_a, zmm_b, m); cc.evex().vpsllw(xmm_a, xmm_b, m); cc.evex().vpsllw(xmm_a, m, 0); cc.evex().vpsllw(ymm_a, ymm_b, m); cc.evex().vpsllw(ymm_a, m, 0); cc.evex().vpsllw(zmm_a, zmm_b, m); cc.evex().vpsllw(zmm_a, m, 0); cc.evex().vpsrad(xmm_a, xmm_b, m); cc.evex().vpsrad(xmm_a, m, 0); cc.evex().vpsrad(ymm_a, ymm_b, m); cc.evex().vpsrad(ymm_a, m, 0); cc.evex().vpsrad(zmm_a, zmm_b, m); cc.evex().vpsrad(zmm_a, m, 
0); cc.evex().vpsraq(xmm_a, xmm_b, m); cc.evex().vpsraq(xmm_a, m, 0); cc.evex().vpsraq(ymm_a, ymm_b, m); cc.evex().vpsraq(ymm_a, m, 0); cc.evex().vpsraq(zmm_a, zmm_b, m); cc.evex().vpsraq(zmm_a, m, 0); cc.evex().vpsravd(xmm_a, xmm_b, m); cc.evex().vpsravd(ymm_a, ymm_b, m); cc.evex().vpsravd(zmm_a, zmm_b, m); cc.evex().vpsravq(xmm_a, xmm_b, m); cc.evex().vpsravq(ymm_a, ymm_b, m); cc.evex().vpsravq(zmm_a, zmm_b, m); cc.evex().vpsravw(xmm_a, xmm_b, m); cc.evex().vpsravw(ymm_a, ymm_b, m); cc.evex().vpsravw(zmm_a, zmm_b, m); cc.evex().vpsraw(xmm_a, xmm_b, m); cc.evex().vpsraw(xmm_a, m, 0); cc.evex().vpsraw(ymm_a, ymm_b, m); cc.evex().vpsraw(ymm_a, m, 0); cc.evex().vpsraw(zmm_a, zmm_b, m); cc.evex().vpsraw(zmm_a, m, 0); cc.evex().vpsrld(xmm_a, xmm_b, m); cc.evex().vpsrld(xmm_a, m, 0); cc.evex().vpsrld(ymm_a, ymm_b, m); cc.evex().vpsrld(ymm_a, m, 0); cc.evex().vpsrld(zmm_a, zmm_b, m); cc.evex().vpsrld(zmm_a, m, 0); cc.evex().vpsrldq(xmm_a, m, 0); cc.evex().vpsrldq(ymm_a, m, 0); cc.evex().vpsrldq(zmm_a, m, 0); cc.evex().vpsrlq(xmm_a, xmm_b, m); cc.evex().vpsrlq(xmm_a, m, 0); cc.evex().vpsrlq(ymm_a, ymm_b, m); cc.evex().vpsrlq(ymm_a, m, 0); cc.evex().vpsrlq(zmm_a, zmm_b, m); cc.evex().vpsrlq(zmm_a, m, 0); cc.evex().vpsrlvd(xmm_a, xmm_b, m); cc.evex().vpsrlvd(ymm_a, ymm_b, m); cc.evex().vpsrlvd(zmm_a, zmm_b, m); cc.evex().vpsrlvq(xmm_a, xmm_b, m); cc.evex().vpsrlvq(ymm_a, ymm_b, m); cc.evex().vpsrlvq(zmm_a, zmm_b, m); cc.evex().vpsrlvw(xmm_a, xmm_b, m); cc.evex().vpsrlvw(ymm_a, ymm_b, m); cc.evex().vpsrlvw(zmm_a, zmm_b, m); cc.evex().vpsrlw(xmm_a, xmm_b, m); cc.evex().vpsrlw(xmm_a, m, 0); cc.evex().vpsrlw(ymm_a, ymm_b, m); cc.evex().vpsrlw(ymm_a, m, 0); cc.evex().vpsrlw(zmm_a, zmm_b, m); cc.evex().vpsrlw(zmm_a, m, 0); cc.evex().vpsubb(xmm_a, xmm_b, m); cc.evex().vpsubb(ymm_a, ymm_b, m); cc.evex().vpsubb(zmm_a, zmm_b, m); cc.evex().vpsubd(xmm_a, xmm_b, m); cc.evex().vpsubd(ymm_a, ymm_b, m); cc.evex().vpsubd(zmm_a, zmm_b, m); cc.evex().vpsubq(xmm_a, xmm_b, m); cc.evex().vpsubq(ymm_a, ymm_b, m); cc.evex().vpsubq(zmm_a, zmm_b, m); cc.evex().vpsubsb(xmm_a, xmm_b, m); cc.evex().vpsubsb(ymm_a, ymm_b, m); cc.evex().vpsubsb(zmm_a, zmm_b, m); cc.evex().vpsubsw(xmm_a, xmm_b, m); cc.evex().vpsubsw(ymm_a, ymm_b, m); cc.evex().vpsubsw(zmm_a, zmm_b, m); cc.evex().vpsubusb(xmm_a, xmm_b, m); cc.evex().vpsubusb(ymm_a, ymm_b, m); cc.evex().vpsubusb(zmm_a, zmm_b, m); cc.evex().vpsubusw(xmm_a, xmm_b, m); cc.evex().vpsubusw(ymm_a, ymm_b, m); cc.evex().vpsubusw(zmm_a, zmm_b, m); cc.evex().vpsubw(xmm_a, xmm_b, m); cc.evex().vpsubw(ymm_a, ymm_b, m); cc.evex().vpsubw(zmm_a, zmm_b, m); cc.evex().vpternlogd(xmm_a, xmm_b, m, 0); cc.evex().vpternlogd(ymm_a, ymm_b, m, 0); cc.evex().vpternlogd(zmm_a, zmm_b, m, 0); cc.evex().vpternlogq(xmm_a, xmm_b, m, 0); cc.evex().vpternlogq(ymm_a, ymm_b, m, 0); cc.evex().vpternlogq(zmm_a, zmm_b, m, 0); cc.evex().vptestmb(kA, xmm_b, m); cc.evex().vptestmb(kA, ymm_b, m); cc.evex().vptestmb(kA, zmm_b, m); cc.evex().vptestmd(kA, xmm_b, m); cc.evex().vptestmd(kA, ymm_b, m); cc.evex().vptestmd(kA, zmm_b, m); cc.evex().vptestmq(kA, xmm_b, m); cc.evex().vptestmq(kA, ymm_b, m); cc.evex().vptestmq(kA, zmm_b, m); cc.evex().vptestmw(kA, xmm_b, m); cc.evex().vptestmw(kA, ymm_b, m); cc.evex().vptestmw(kA, zmm_b, m); cc.evex().vptestnmb(kA, xmm_b, m); cc.evex().vptestnmb(kA, ymm_b, m); cc.evex().vptestnmb(kA, zmm_b, m); cc.evex().vptestnmd(kA, xmm_b, m); cc.evex().vptestnmd(kA, ymm_b, m); cc.evex().vptestnmd(kA, zmm_b, m); cc.evex().vptestnmq(kA, xmm_b, m); cc.evex().vptestnmq(kA, ymm_b, m); 
cc.evex().vptestnmq(kA, zmm_b, m); cc.evex().vptestnmw(kA, xmm_b, m); cc.evex().vptestnmw(kA, ymm_b, m); cc.evex().vptestnmw(kA, zmm_b, m); cc.evex().vpunpckhbw(xmm_a, xmm_b, m); cc.evex().vpunpckhbw(ymm_a, ymm_b, m); cc.evex().vpunpckhbw(zmm_a, zmm_b, m); cc.evex().vpunpckhdq(xmm_a, xmm_b, m); cc.evex().vpunpckhdq(ymm_a, ymm_b, m); cc.evex().vpunpckhdq(zmm_a, zmm_b, m); cc.evex().vpunpckhqdq(xmm_a, xmm_b, m); cc.evex().vpunpckhqdq(ymm_a, ymm_b, m); cc.evex().vpunpckhqdq(zmm_a, zmm_b, m); cc.evex().vpunpckhwd(xmm_a, xmm_b, m); cc.evex().vpunpckhwd(ymm_a, ymm_b, m); cc.evex().vpunpckhwd(zmm_a, zmm_b, m); cc.evex().vpunpcklbw(xmm_a, xmm_b, m); cc.evex().vpunpcklbw(ymm_a, ymm_b, m); cc.evex().vpunpcklbw(zmm_a, zmm_b, m); cc.evex().vpunpckldq(xmm_a, xmm_b, m); cc.evex().vpunpckldq(ymm_a, ymm_b, m); cc.evex().vpunpckldq(zmm_a, zmm_b, m); cc.evex().vpunpcklqdq(xmm_a, xmm_b, m); cc.evex().vpunpcklqdq(ymm_a, ymm_b, m); cc.evex().vpunpcklqdq(zmm_a, zmm_b, m); cc.evex().vpunpcklwd(xmm_a, xmm_b, m); cc.evex().vpunpcklwd(ymm_a, ymm_b, m); cc.evex().vpunpcklwd(zmm_a, zmm_b, m); cc.evex().vpxord(xmm_a, xmm_b, m); cc.evex().vpxord(ymm_a, ymm_b, m); cc.evex().vpxord(zmm_a, zmm_b, m); cc.evex().vpxorq(xmm_a, xmm_b, m); cc.evex().vpxorq(ymm_a, ymm_b, m); cc.evex().vpxorq(zmm_a, zmm_b, m); cc.evex().vrangepd(xmm_a, xmm_b, m, 0); cc.evex().vrangepd(ymm_a, ymm_b, m, 0); cc.evex().vrangepd(zmm_a, zmm_b, m, 0); cc.evex().vrangeps(xmm_a, xmm_b, m, 0); cc.evex().vrangeps(ymm_a, ymm_b, m, 0); cc.evex().vrangeps(zmm_a, zmm_b, m, 0); cc.evex().vrangesd(xmm_a, xmm_b, m, 0); cc.evex().vrangess(xmm_a, xmm_b, m, 0); cc.evex().vrcp14pd(xmm_a, m); cc.evex().vrcp14pd(ymm_a, m); cc.evex().vrcp14pd(zmm_a, m); cc.evex().vrcp14ps(xmm_a, m); cc.evex().vrcp14ps(ymm_a, m); cc.evex().vrcp14ps(zmm_a, m); cc.evex().vrcp14sd(xmm_a, xmm_b, m); cc.evex().vrcp14ss(xmm_a, xmm_b, m); cc.evex().vreducepd(xmm_a, m, 0); cc.evex().vreducepd(ymm_a, m, 0); cc.evex().vreducepd(zmm_a, m, 0); cc.evex().vreduceps(xmm_a, m, 0); cc.evex().vreduceps(ymm_a, m, 0); cc.evex().vreduceps(zmm_a, m, 0); cc.evex().vreducesd(xmm_a, xmm_b, m, 0); cc.evex().vreducess(xmm_a, xmm_b, m, 0); cc.evex().vrndscalepd(xmm_a, m, 0); cc.evex().vrndscalepd(ymm_a, m, 0); cc.evex().vrndscalepd(zmm_a, m, 0); cc.evex().vrndscaleps(xmm_a, m, 0); cc.evex().vrndscaleps(ymm_a, m, 0); cc.evex().vrndscaleps(zmm_a, m, 0); cc.evex().vrndscalesd(xmm_a, xmm_b, m, 0); cc.evex().vrndscaless(xmm_a, xmm_b, m, 0); cc.evex().vrsqrt14pd(xmm_a, m); cc.evex().vrsqrt14pd(ymm_a, m); cc.evex().vrsqrt14pd(zmm_a, m); cc.evex().vrsqrt14ps(xmm_a, m); cc.evex().vrsqrt14ps(ymm_a, m); cc.evex().vrsqrt14ps(zmm_a, m); cc.evex().vrsqrt14sd(xmm_a, xmm_b, m); cc.evex().vrsqrt14ss(xmm_a, xmm_b, m); cc.evex().vscalefpd(xmm_a, xmm_b, m); cc.evex().vscalefpd(ymm_a, ymm_b, m); cc.evex().vscalefpd(zmm_a, zmm_b, m); cc.evex().vscalefps(xmm_a, xmm_b, m); cc.evex().vscalefps(ymm_a, ymm_b, m); cc.evex().vscalefps(zmm_a, zmm_b, m); cc.evex().vscalefsd(xmm_a, xmm_b, m); cc.evex().vscalefss(xmm_a, xmm_b, m); cc.evex().k(kA).vscatterdpd(vx_ptr, xmm_b); cc.evex().k(kA).vscatterdpd(vx_ptr, ymm_b); cc.evex().k(kA).vscatterdpd(vy_ptr, zmm_b); cc.evex().k(kA).vscatterdps(vx_ptr, xmm_b); cc.evex().k(kA).vscatterdps(vy_ptr, ymm_b); cc.evex().k(kA).vscatterdps(vz_ptr, zmm_b); cc.evex().k(kA).vscatterqpd(vx_ptr, xmm_b); cc.evex().k(kA).vscatterqpd(vy_ptr, ymm_b); cc.evex().k(kA).vscatterqpd(vz_ptr, zmm_b); cc.evex().k(kA).vscatterqps(vx_ptr, xmm_b); cc.evex().k(kA).vscatterqps(vy_ptr, xmm_b); cc.evex().k(kA).vscatterqps(vz_ptr, 
ymm_b); cc.evex().vshuff32x4(ymm_a, ymm_b, m, 0); cc.evex().vshuff32x4(zmm_a, zmm_b, m, 0); cc.evex().vshuff64x2(ymm_a, ymm_b, m, 0); cc.evex().vshuff64x2(zmm_a, zmm_b, m, 0); cc.evex().vshufi32x4(ymm_a, ymm_b, m, 0); cc.evex().vshufi32x4(zmm_a, zmm_b, m, 0); cc.evex().vshufi64x2(ymm_a, ymm_b, m, 0); cc.evex().vshufi64x2(zmm_a, zmm_b, m, 0); cc.evex().vshufpd(xmm_a, xmm_b, m, 0); cc.evex().vshufpd(ymm_a, ymm_b, m, 0); cc.evex().vshufpd(zmm_a, zmm_b, m, 0); cc.evex().vshufps(xmm_a, xmm_b, m, 0); cc.evex().vshufps(ymm_a, ymm_b, m, 0); cc.evex().vshufps(zmm_a, zmm_b, m, 0); cc.evex().vsqrtpd(xmm_a, m); cc.evex().vsqrtpd(ymm_a, m); cc.evex().vsqrtpd(zmm_a, m); cc.evex().vsqrtps(xmm_a, m); cc.evex().vsqrtps(ymm_a, m); cc.evex().vsqrtps(zmm_a, m); cc.evex().vsqrtsd(xmm_a, xmm_b, m); cc.evex().vsqrtss(xmm_a, xmm_b, m); cc.evex().vsubpd(xmm_a, xmm_b, m); cc.evex().vsubpd(ymm_a, ymm_b, m); cc.evex().vsubpd(zmm_a, zmm_b, m); cc.evex().vsubps(xmm_a, xmm_b, m); cc.evex().vsubps(ymm_a, ymm_b, m); cc.evex().vsubps(zmm_a, zmm_b, m); cc.evex().vsubsd(xmm_a, xmm_b, m); cc.evex().vsubss(xmm_a, xmm_b, m); cc.evex().vucomisd(xmm_a, m); cc.evex().vucomiss(xmm_a, m); cc.evex().vunpckhpd(xmm_a, xmm_b, m); cc.evex().vunpckhpd(ymm_a, ymm_b, m); cc.evex().vunpckhpd(zmm_a, zmm_b, m); cc.evex().vunpckhps(xmm_a, xmm_b, m); cc.evex().vunpckhps(ymm_a, ymm_b, m); cc.evex().vunpckhps(zmm_a, zmm_b, m); cc.evex().vunpcklpd(xmm_a, xmm_b, m); cc.evex().vunpcklpd(ymm_a, ymm_b, m); cc.evex().vunpcklpd(zmm_a, zmm_b, m); cc.evex().vunpcklps(xmm_a, xmm_b, m); cc.evex().vunpcklps(ymm_a, ymm_b, m); cc.evex().vunpcklps(zmm_a, zmm_b, m); cc.evex().vxorpd(xmm_a, xmm_b, m); cc.evex().vxorpd(ymm_a, ymm_b, m); cc.evex().vxorpd(zmm_a, zmm_b, m); cc.evex().vxorps(xmm_a, xmm_b, m); cc.evex().vxorps(ymm_a, ymm_b, m); cc.evex().vxorps(zmm_a, zmm_b, m); } // Generates a long sequence of AVX512 instructions. 
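// The InstForm argument selects between the reg-only and the reg/mem variant generated above.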
template<typename Emitter>
static void generate_avx512_sequence_internal(
  Emitter& cc,
  InstForm form,
  const x86::Gp& gp,
  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
  const x86::Vec& vec_a, const x86::Vec& vec_b, const x86::Vec& vec_c, const x86::Vec& vec_d) {

  if (form == InstForm::kReg)
    generate_avx512_sequence_internal_reg_only(cc, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
  else
    generate_avx512_sequence_internal_reg_mem(cc, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
}

static void generate_avx512_sequence(BaseEmitter& emitter, InstForm form, bool emit_prolog_epilog) {
  using namespace asmjit::x86;

#ifndef ASMJIT_NO_COMPILER
  if (emitter.is_compiler()) {
    Compiler& cc = *emitter.as<Compiler>();

    Gp gp = cc.new_gpz("gp");
    Vec vec_a = cc.new_zmm("vec_a");
    Vec vec_b = cc.new_zmm("vec_b");
    Vec vec_c = cc.new_zmm("vec_c");
    Vec vec_d = cc.new_zmm("vec_d");
    KReg kA = cc.new_kq("kA");
    KReg kB = cc.new_kq("kB");
    KReg kC = cc.new_kq("kC");

    cc.add_func(FuncSignature::build<void>());
    generate_avx512_sequence_internal(cc, form, gp, kA, kB, kC, vec_a, vec_b, vec_c, vec_d);
    cc.end_func();
    return;
  }
#endif

#ifndef ASMJIT_NO_BUILDER
  if (emitter.is_builder()) {
    Builder& cc = *emitter.as<Builder>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }
    return;
  }
#endif

  if (emitter.is_assembler()) {
    Assembler& cc = *emitter.as<Assembler>();

    if (emit_prolog_epilog) {
      FuncDetail func;
      func.init(FuncSignature::build<void>(), cc.environment());

      FuncFrame frame;
      frame.init(func);
      frame.add_dirty_regs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      frame.finalize();

      cc.emit_prolog(frame);
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
      cc.emit_epilog(frame);
    }
    else {
      generate_avx512_sequence_internal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
    }
    return;
  }
}

template<typename EmitterFn>
static void benchmark_x86_function(Arch arch, uint32_t num_iterations, const char* description, const EmitterFn& emitter_fn) noexcept {
  CodeHolder code;
  printf("%s:\n", description);

  uint32_t instruction_count = 0;

#ifndef ASMJIT_NO_BUILDER
  instruction_count = asmjit_perf_utils::calculate_instruction_count<x86::Builder>(code, arch, [&](x86::Builder& cc) {
    emitter_fn(cc, false);
  });
#endif

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[raw]", instruction_count, [&](x86::Assembler& cc) {
    emitter_fn(cc, false);
  });

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[validated]", instruction_count, [&](x86::Assembler& cc) {
    cc.add_diagnostic_options(DiagnosticOptions::kValidateAssembler);
    emitter_fn(cc, false);
  });

  asmjit_perf_utils::bench<x86::Assembler>(code, arch, num_iterations, "[prolog/epilog]", instruction_count, [&](x86::Assembler& cc) {
    emitter_fn(cc, true);
  });

#ifndef ASMJIT_NO_BUILDER
  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[no-asm]", instruction_count, [&](x86::Builder& cc) {
    emitter_fn(cc, false);
  });

  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[finalized]", instruction_count, [&](x86::Builder& cc) {
    emitter_fn(cc, false);
    cc.finalize();
  });

  asmjit_perf_utils::bench<x86::Builder>(code, arch, num_iterations, "[prolog/epilog]", instruction_count, [&](x86::Builder& cc) {
    emitter_fn(cc, true);
    cc.finalize();
  });
#endif

#ifndef ASMJIT_NO_COMPILER
  asmjit_perf_utils::bench<x86::Compiler>(code, arch, num_iterations, "[no-asm]", instruction_count, [&](x86::Compiler& cc) {
    emitter_fn(cc, true);
  });
  asmjit_perf_utils::bench<x86::Compiler>(code, arch, num_iterations, "[finalized]", instruction_count, [&](x86::Compiler& cc) {
    emitter_fn(cc, true);
    cc.finalize();
  });
#endif

  printf("\n");
}

void benchmark_x86_emitters(uint32_t num_iterations, bool test_x86, bool test_x64) {
  uint32_t i = 0;
  uint32_t n = 0;
  Arch archs[2] {};

  if (test_x86) archs[n++] = Arch::kX86;
  if (test_x64) archs[n++] = Arch::kX64;

  for (i = 0; i < n; i++) {
    static const char description[] = "Empty function (mov + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_empty_function(emitter, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "4-Ops sequence (4 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 4, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "16-Ops sequence (16 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 16, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "32-Ops sequence (32 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 32, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "64-Ops sequence (64 ops + return from function)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_n_ops_sequence(emitter, 64, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence (sequence of GP instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_gp_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "GpSequence (sequence of GP instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_gp_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence (sequence of SSE+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_sse_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseSequence (sequence of SSE+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_sse_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence (sequence of AVX+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "AvxSequence (sequence of AVX+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence (sequence of AVX512+ instructions - reg-only)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx512_sequence(emitter, InstForm::kReg, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "Avx512Sequence (sequence of AVX512+ instructions - reg/mem)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      generate_avx512_sequence(emitter, InstForm::kMem, emit_prolog_epilog);
    });
  }

  for (i = 0; i < n; i++) {
    static const char description[] = "SseAlphaBlend (alpha-blend function with labels and jumps)";
    benchmark_x86_function(archs[i], num_iterations, description, [](BaseEmitter& emitter, bool emit_prolog_epilog) {
      asmtest::generate_sse_alpha_blend(emitter, emit_prolog_epilog);
    });
  }
}

#endif // !ASMJIT_NO_X86