diff --git a/src/asmjit/base/func.h b/src/asmjit/base/func.h index a75d16b..5c29a6c 100644 --- a/src/asmjit/base/func.h +++ b/src/asmjit/base/func.h @@ -45,56 +45,89 @@ struct CallConv { //! None or invalid (can't be used). kIdNone = 0, + // ------------------------------------------------------------------------ + // [Universal] + // ------------------------------------------------------------------------ + + // TODO: To make this possible we need to know target ARCH and ABI. + + /* + + // Universal calling conventions are applicable to any target and are + // converted to target dependent conventions at runtime. The purpose of + // these conventions is to make using functions less target dependent. + + kIdCDecl = 1, + kIdStdCall = 2, + kIdFastCall = 3, + + //! AsmJit specific calling convention designed for calling functions + //! inside a multimedia code like that don't use many registers internally, + //! but are long enough to be called and not inlined. These functions are + //! usually used to calculate trigonometric functions, logarithms, etc... + kIdFastEval2 = 10, + kIdFastEval3 = 11, + kIdFastEval4 = 12, + */ + // ------------------------------------------------------------------------ // [X86] // ------------------------------------------------------------------------ //! X86 `__cdecl` calling convention (used by C runtime and libraries). - kIdX86CDecl = 1, + kIdX86CDecl = 16, //! X86 `__stdcall` calling convention (used mostly by WinAPI). - kIdX86StdCall = 2, + kIdX86StdCall = 17, //! X86 `__thiscall` calling convention (MSVC/Intel). - kIdX86MsThisCall = 3, + kIdX86MsThisCall = 18, //! X86 `__fastcall` convention (MSVC/Intel). - kIdX86MsFastCall = 4, + kIdX86MsFastCall = 19, //! X86 `__fastcall` convention (GCC and Clang). - kIdX86GccFastCall = 5, + kIdX86GccFastCall = 20, //! X86 `regparm(1)` convention (GCC and Clang). - kIdX86GccRegParm1 = 6, + kIdX86GccRegParm1 = 21, //! X86 `regparm(2)` convention (GCC and Clang). 
- kIdX86GccRegParm2 = 7, + kIdX86GccRegParm2 = 22, //! X86 `regparm(3)` convention (GCC and Clang). - kIdX86GccRegParm3 = 8, + kIdX86GccRegParm3 = 23, + + kIdX86FastEval2 = 29, + kIdX86FastEval3 = 30, + kIdX86FastEval4 = 31, //! X64 calling convention defined by WIN64-ABI. //! //! Links: //! * . - kIdX86Win64 = 16, + kIdX86Win64 = 32, //! X64 calling convention used by Unix platforms (SYSV/AMD64-ABI). - kIdX86SysV64 = 17, + kIdX86SysV64 = 33, + + kIdX64FastEval2 = 45, + kIdX64FastEval3 = 46, + kIdX64FastEval4 = 47, // ------------------------------------------------------------------------ // [ARM] // ------------------------------------------------------------------------ //! Legacy calling convention, floating point arguments are passed via GP registers. - kIdArm32SoftFP = 32, + kIdArm32SoftFP = 48, //! Modern calling convention, uses VFP registers to pass floating point arguments. - kIdArm32HardFP = 33, + kIdArm32HardFP = 49, // ------------------------------------------------------------------------ // [Internal] // ------------------------------------------------------------------------ - _kIdX86Start = 1, //!< \internal - _kIdX86End = 8, //!< \internal + _kIdX86Start = 16, //!< \internal + _kIdX86End = 31, //!< \internal - _kIdX64Start = 16, //!< \internal - _kIdX64End = 17, //!< \internal + _kIdX64Start = 32, //!< \internal + _kIdX64End = 47, //!< \internal - _kIdArmStart = 32, //!< \internal - _kIdArmEnd = 33, //!< \internal + _kIdArmStart = 48, //!< \internal + _kIdArmEnd = 49, //!< \internal // ------------------------------------------------------------------------ // [Host] @@ -106,42 +139,48 @@ struct CallConv { //! NOTE: This should be always the same as `kIdHostCDecl`, but some //! compilers allow to override the default calling convention. Overriding //! is not detected at the moment. - kIdHost = DETECTED_AT_COMPILE_TIME, + kIdHost = DETECTED_AT_COMPILE_TIME, //! Default CDECL calling convention based on the current C++ compiler's settings. 
- kIdHostCDecl = DETECTED_AT_COMPILE_TIME, + kIdHostCDecl = DETECTED_AT_COMPILE_TIME, //! Default STDCALL calling convention based on the current C++ compiler's settings. //! //! NOTE: If not defined by the host then it's the same as `kIdHostCDecl`. - kIdHostStdCall = DETECTED_AT_COMPILE_TIME, + kIdHostStdCall = DETECTED_AT_COMPILE_TIME, //! Compatibility for `__fastcall` calling convention. //! //! NOTE: If not defined by the host then it's the same as `kIdHostCDecl`. - kIdHostFastCall = DETECTED_AT_COMPILE_TIME + kIdHostFastCall = DETECTED_AT_COMPILE_TIME #elif ASMJIT_ARCH_X86 - kIdHost = kIdX86CDecl, - kIdHostCDecl = kIdX86CDecl, - kIdHostStdCall = kIdX86StdCall, - kIdHostFastCall = ASMJIT_CC_MSC ? kIdX86MsFastCall : - ASMJIT_CC_GCC ? kIdX86GccFastCall : - ASMJIT_CC_CLANG ? kIdX86GccFastCall : kIdNone + kIdHost = kIdX86CDecl, + kIdHostCDecl = kIdX86CDecl, + kIdHostStdCall = kIdX86StdCall, + kIdHostFastCall = ASMJIT_CC_MSC ? kIdX86MsFastCall : + ASMJIT_CC_GCC ? kIdX86GccFastCall : + ASMJIT_CC_CLANG ? kIdX86GccFastCall : kIdNone, + kIdHostFastEval2 = kIdX86FastEval2, + kIdHostFastEval3 = kIdX86FastEval3, + kIdHostFastEval4 = kIdX86FastEval4 #elif ASMJIT_ARCH_X64 - kIdHost = ASMJIT_OS_WINDOWS ? kIdX86Win64 : kIdX86SysV64, - kIdHostCDecl = kIdHost, // Doesn't exist, redirected to host. - kIdHostStdCall = kIdHost, // Doesn't exist, redirected to host. - kIdHostFastCall = kIdHost // Doesn't exist, redirected to host. + kIdHost = ASMJIT_OS_WINDOWS ? kIdX86Win64 : kIdX86SysV64, + kIdHostCDecl = kIdHost, // Doesn't exist, redirected to host. + kIdHostStdCall = kIdHost, // Doesn't exist, redirected to host. + kIdHostFastCall = kIdHost, // Doesn't exist, redirected to host. 
+ kIdHostFastEval2 = kIdX64FastEval2, + kIdHostFastEval3 = kIdX64FastEval3, + kIdHostFastEval4 = kIdX64FastEval4 #elif ASMJIT_ARCH_ARM32 # if defined(__SOFTFP__) - kIdHost = kIdArm32SoftFP, + kIdHost = kIdArm32SoftFP, # else - kIdHost = kIdArm32HardFP, + kIdHost = kIdArm32HardFP, # endif // These don't exist on ARM. - kIdHostCDecl = kIdHost, // Doesn't exist, redirected to host. - kIdHostStdCall = kIdHost, // Doesn't exist, redirected to host. - kIdHostFastCall = kIdHost // Doesn't exist, redirected to host. + kIdHostCDecl = kIdHost, // Doesn't exist, redirected to host. + kIdHostStdCall = kIdHost, // Doesn't exist, redirected to host. + kIdHostFastCall = kIdHost // Doesn't exist, redirected to host. #else # error "[asmjit] Couldn't determine the target's calling convention." #endif diff --git a/src/asmjit/base/string.h b/src/asmjit/base/string.h index 3621a99..8d1ef16 100644 --- a/src/asmjit/base/string.h +++ b/src/asmjit/base/string.h @@ -224,7 +224,7 @@ public: //! Check for equality with other `str` of length `len`. ASMJIT_API bool eq(const char* str, size_t len = Globals::kInvalidIndex) const noexcept; //! Check for equality with `other`. - ASMJIT_INLINE bool eq(const StringBuilder& other) const noexcept { return eq(other._data); } + ASMJIT_INLINE bool eq(const StringBuilder& other) const noexcept { return eq(other._data, other._length); } // -------------------------------------------------------------------------- // [Operator Overload] diff --git a/src/asmjit/base/utils.h b/src/asmjit/base/utils.h index 4568e92..350db89 100644 --- a/src/asmjit/base/utils.h +++ b/src/asmjit/base/utils.h @@ -345,7 +345,7 @@ struct Utils { // [Bits] // -------------------------------------------------------------------------- - //! Generate a bit-mask that has `x` most significant bits set. + //! Generate a bit-mask that has `x` least significant bits set. 
static ASMJIT_INLINE uint32_t bits(uint32_t x) noexcept { // Shifting more bits than the type has results in undefined behavior. In // such case asmjit trashes the result by ORing with `overflow` mask, which diff --git a/src/asmjit/x86/x86internal.cpp b/src/asmjit/x86/x86internal.cpp index 4af3349..2e46f37 100644 --- a/src/asmjit/x86/x86internal.cpp +++ b/src/asmjit/x86/x86internal.cpp @@ -275,15 +275,17 @@ ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markStackArgsReg(FuncFrameInfo& ffi) ASMJIT_FAVOR_SIZE Error X86Internal::initCallConv(CallConv& cc, uint32_t ccId) noexcept { const uint32_t kKindGp = X86Reg::kKindGp; const uint32_t kKindVec = X86Reg::kKindVec; + const uint32_t kKindMm = X86Reg::kKindMm; + const uint32_t kKindK = X86Reg::kKindK; - const uint32_t kAx = X86Gp::kIdAx; - const uint32_t kBx = X86Gp::kIdBx; - const uint32_t kCx = X86Gp::kIdCx; - const uint32_t kDx = X86Gp::kIdDx; - const uint32_t kSp = X86Gp::kIdSp; - const uint32_t kBp = X86Gp::kIdBp; - const uint32_t kSi = X86Gp::kIdSi; - const uint32_t kDi = X86Gp::kIdDi; + const uint32_t kZax = X86Gp::kIdAx; + const uint32_t kZbx = X86Gp::kIdBx; + const uint32_t kZcx = X86Gp::kIdCx; + const uint32_t kZdx = X86Gp::kIdDx; + const uint32_t kZsp = X86Gp::kIdSp; + const uint32_t kZbp = X86Gp::kIdBp; + const uint32_t kZsi = X86Gp::kIdSi; + const uint32_t kZdi = X86Gp::kIdDi; switch (ccId) { case CallConv::kIdX86StdCall: @@ -292,32 +294,32 @@ ASMJIT_FAVOR_SIZE Error X86Internal::initCallConv(CallConv& cc, uint32_t ccId) n case CallConv::kIdX86MsThisCall: cc.setFlags(CallConv::kFlagCalleePopsStack); - cc.setPassedOrder(kKindGp, kCx); + cc.setPassedOrder(kKindGp, kZcx); goto X86CallConv; case CallConv::kIdX86MsFastCall: case CallConv::kIdX86GccFastCall: cc.setFlags(CallConv::kFlagCalleePopsStack); - cc.setPassedOrder(kKindGp, kCx, kDx); + cc.setPassedOrder(kKindGp, kZcx, kZdx); goto X86CallConv; case CallConv::kIdX86GccRegParm1: - cc.setPassedOrder(kKindGp, kAx); + cc.setPassedOrder(kKindGp, kZax); goto 
X86CallConv; case CallConv::kIdX86GccRegParm2: - cc.setPassedOrder(kKindGp, kAx, kDx); + cc.setPassedOrder(kKindGp, kZax, kZdx); goto X86CallConv; case CallConv::kIdX86GccRegParm3: - cc.setPassedOrder(kKindGp, kAx, kDx, kCx); + cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx); goto X86CallConv; case CallConv::kIdX86CDecl: X86CallConv: cc.setNaturalStackAlignment(4); cc.setArchType(ArchInfo::kTypeX86); - cc.setPreservedRegs(kKindGp, Utils::mask(kBx, kSp, kBp, kSi, kDi)); + cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi)); break; case CallConv::kIdX86Win64: @@ -326,9 +328,9 @@ X86CallConv: cc.setFlags(CallConv::kFlagPassFloatsByVec | CallConv::kFlagIndirectVecArgs); cc.setNaturalStackAlignment(16); cc.setSpillZoneSize(32); - cc.setPassedOrder(kKindGp, kCx, kDx, 8, 9); + cc.setPassedOrder(kKindGp, kZcx, kZdx, 8, 9); cc.setPassedOrder(kKindVec, 0, 1, 2, 3); - cc.setPreservedRegs(kKindGp, Utils::mask(kBx, kSp, kBp, kSi, kDi, 12, 13, 14, 15)); + cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi, 12, 13, 14, 15)); cc.setPreservedRegs(kKindVec, Utils::mask(6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); break; @@ -337,11 +339,49 @@ X86CallConv: cc.setFlags(CallConv::kFlagPassFloatsByVec); cc.setNaturalStackAlignment(16); cc.setRedZoneSize(128); - cc.setPassedOrder(kKindGp, kDi, kSi, kDx, kCx, 8, 9); + cc.setPassedOrder(kKindGp, kZdi, kZsi, kZdx, kZcx, 8, 9); cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7); - cc.setPreservedRegs(kKindGp, Utils::mask(kBx, kSp, kBp, 12, 13, 14, 15)); + cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, 12, 13, 14, 15)); break; + case CallConv::kIdX86FastEval2: + case CallConv::kIdX86FastEval3: + case CallConv::kIdX86FastEval4: { + uint32_t n = (ccId - CallConv::kIdX86FastEval2) + 2; // FastEval2/3/4 -> n = 2/3/4 low vector registers excluded from the preserved mask below. + + cc.setArchType(ArchInfo::kTypeX86); + cc.setFlags(CallConv::kFlagPassFloatsByVec); + cc.setNaturalStackAlignment(16); + cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi); + cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 
6, 7); + cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7); + + cc.setPreservedRegs(kKindGp , Utils::bits(8)); + cc.setPreservedRegs(kKindVec, Utils::bits(8) & ~Utils::bits(n)); + cc.setPreservedRegs(kKindMm , Utils::bits(8)); + cc.setPreservedRegs(kKindK , Utils::bits(8)); + break; + } + + case CallConv::kIdX64FastEval2: + case CallConv::kIdX64FastEval3: + case CallConv::kIdX64FastEval4: { + uint32_t n = (ccId - CallConv::kIdX64FastEval2) + 2; // FastEval2/3/4 -> n = 2/3/4 low vector registers excluded from the preserved mask below. + + cc.setArchType(ArchInfo::kTypeX64); + cc.setFlags(CallConv::kFlagPassFloatsByVec); + cc.setNaturalStackAlignment(16); + cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi); + cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7); + cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7); + + cc.setPreservedRegs(kKindGp , Utils::bits(16)); + cc.setPreservedRegs(kKindVec,~Utils::bits(n)); + cc.setPreservedRegs(kKindMm , Utils::bits(8)); + cc.setPreservedRegs(kKindK , Utils::bits(8)); + break; + } + default: return DebugUtils::errored(kErrorInvalidArgument); } @@ -532,7 +572,7 @@ ASMJIT_FAVOR_SIZE Error X86Internal::initFrameLayout(FuncFrameLayout& layout, co // Calculate a bit-mask of all registers that must be saved & restored. for (kind = 0; kind < Globals::kMaxVRegKinds; kind++) - layout._savedRegs[kind] = ffi.getDirtyRegs(kind) & func.getPreservedRegs(kind); + layout._savedRegs[kind] = (ffi.getDirtyRegs(kind) & ~func.getPassedRegs(kind)) & func.getPreservedRegs(kind); // Include EBP|RBP if the function preserves the frame-pointer. if (ffi.hasPreservedFP()) { diff --git a/src/asmjit/x86/x86regalloc.cpp b/src/asmjit/x86/x86regalloc.cpp index 95cb429..cb2df44 100644 --- a/src/asmjit/x86/x86regalloc.cpp +++ b/src/asmjit/x86/x86regalloc.cpp @@ -2050,10 +2050,10 @@ _NextGroup: } // Init clobbered. 
- clobberedRegs.set(X86Reg::kKindGp , Utils::bits(_regCount.getGp()) & (~fd.getPreservedRegs(X86Reg::kKindGp ))); - clobberedRegs.set(X86Reg::kKindMm , Utils::bits(_regCount.getMm()) & (~fd.getPreservedRegs(X86Reg::kKindMm ))); - clobberedRegs.set(X86Reg::kKindK , Utils::bits(_regCount.getK()) & (~fd.getPreservedRegs(X86Reg::kKindK ))); - clobberedRegs.set(X86Reg::kKindVec, Utils::bits(_regCount.getVec()) & (~fd.getPreservedRegs(X86Reg::kKindVec))); + clobberedRegs.set(X86Reg::kKindGp , Utils::bits(_regCount.getGp()) & (fd.getPassedRegs(X86Reg::kKindGp ) | ~fd.getPreservedRegs(X86Reg::kKindGp ))); + clobberedRegs.set(X86Reg::kKindMm , Utils::bits(_regCount.getMm()) & (fd.getPassedRegs(X86Reg::kKindMm ) | ~fd.getPreservedRegs(X86Reg::kKindMm ))); + clobberedRegs.set(X86Reg::kKindK , Utils::bits(_regCount.getK()) & (fd.getPassedRegs(X86Reg::kKindK ) | ~fd.getPreservedRegs(X86Reg::kKindK ))); + clobberedRegs.set(X86Reg::kKindVec, Utils::bits(_regCount.getVec()) & (fd.getPassedRegs(X86Reg::kKindVec) | ~fd.getPreservedRegs(X86Reg::kKindVec))); RA_FINALIZE(node_); break; diff --git a/test/asmjit_test_x86_cc.cpp b/test/asmjit_test_x86_cc.cpp index 4aaee06..973af47 100644 --- a/test/asmjit_test_x86_cc.cpp +++ b/test/asmjit_test_x86_cc.cpp @@ -3260,6 +3260,104 @@ public: } }; +// ============================================================================ +// [X86Test_MiscFastEval] +// ============================================================================ + +class X86Test_MiscFastEval : public X86Test { +public: + X86Test_MiscFastEval() : X86Test("[Misc] FastEval (CConv)") {} + + static void add(X86TestManager& mgr) { + mgr.add(new X86Test_MiscFastEval()); + } + + virtual void compile(X86Compiler& cc) { + FuncSignature5 funcSig(CallConv::kIdHostCDecl); + FuncSignature2 fastSig(CallConv::kIdHostFastEval2); + + CCFunc* func = cc.newFunc(funcSig); + CCFunc* fast = cc.newFunc(fastSig); + + { + X86Gp aPtr = cc.newIntPtr("aPtr"); + X86Gp bPtr = cc.newIntPtr("bPtr"); + X86Gp 
cPtr = cc.newIntPtr("cPtr"); + X86Gp dPtr = cc.newIntPtr("dPtr"); + X86Gp pOut = cc.newIntPtr("pOut"); + + X86Xmm aXmm = cc.newXmm("aXmm"); + X86Xmm bXmm = cc.newXmm("bXmm"); + X86Xmm cXmm = cc.newXmm("cXmm"); + X86Xmm dXmm = cc.newXmm("dXmm"); + + cc.addFunc(func); + + cc.setArg(0, aPtr); + cc.setArg(1, bPtr); + cc.setArg(2, cPtr); + cc.setArg(3, dPtr); + cc.setArg(4, pOut); + + cc.movups(aXmm, x86::ptr(aPtr)); + cc.movups(bXmm, x86::ptr(bPtr)); + cc.movups(cXmm, x86::ptr(cPtr)); + cc.movups(dXmm, x86::ptr(dPtr)); + + X86Xmm xXmm = cc.newXmm("xXmm"); + X86Xmm yXmm = cc.newXmm("yXmm"); + + CCFuncCall* call1 = cc.call(fast->getLabel(), fastSig); + call1->setArg(0, aXmm); + call1->setArg(1, bXmm); + call1->setRet(0, xXmm); + + CCFuncCall* call2 = cc.call(fast->getLabel(), fastSig); + call2->setArg(0, cXmm); + call2->setArg(1, dXmm); + call2->setRet(0, yXmm); + + cc.pmullw(xXmm, yXmm); + cc.movups(x86::ptr(pOut), xXmm); + + cc.endFunc(); + } + + { + X86Xmm aXmm = cc.newXmm("aXmm"); + X86Xmm bXmm = cc.newXmm("bXmm"); + + cc.addFunc(fast); + cc.setArg(0, aXmm); + cc.setArg(1, bXmm); + cc.paddw(aXmm, bXmm); + cc.ret(aXmm); + cc.endFunc(); + } + } + + virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) { + typedef void (*Func)(const void*, const void*, const void*, const void*, void*); + + Func func = ptr_as_func(_func); + + int16_t a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + int16_t b[8] = { 7, 6, 5, 4, 3, 2, 1, 0 }; + int16_t c[8] = { 1, 3, 9, 7, 5, 4, 2, 1 }; + int16_t d[8] = { 2, 0,-6,-4,-2,-1, 1, 2 }; + + int16_t o[8]; + int oExp = 7 * 3; + + func(a, b, c, d, o); + + result.setFormat("ret={%02X %02X %02X %02X %02X %02X %02X %02X}", o[0], o[1], o[2], o[3], o[4], o[5], o[6], o[7]); + expect.setFormat("ret={%02X %02X %02X %02X %02X %02X %02X %02X}", oExp, oExp, oExp, oExp, oExp, oExp, oExp, oExp); + + return result == expect; + } +}; + // ============================================================================ // [X86Test_MiscUnfollow] // 
============================================================================ @@ -3423,6 +3521,7 @@ int main(int argc, char* argv[]) { ADD_TEST(X86Test_MiscConstPool); ADD_TEST(X86Test_MiscMultiRet); ADD_TEST(X86Test_MiscMultiFunc); + ADD_TEST(X86Test_MiscFastEval); ADD_TEST(X86Test_MiscUnfollow); return testMgr.run();