From f5df7a2b1bd942fc1169d0f563dacbe34e6a11ac Mon Sep 17 00:00:00 2001
From: kobalicek
Date: Thu, 13 Jun 2024 13:17:50 +0200
Subject: [PATCH] Improved the performance of bin-packing (Compiler) (fixes #440)

During bin-packing, a single function, nonOverlappingUnionOf(), is called many
times to calculate the union of one set of live ranges with another. Before this
change it used ZoneVector::reserve() to make sure there is enough space for the
union. However, this is not ideal when the union grows every time the function
is called, because the vector is then reallocated many times, which hurts
performance.

Instead of calling reserve(), the new function growingReserve() is now used,
which tells the vector to use its growing strategy when it needs to reallocate.

In addition, this change fixes the documentation regarding the use of
JitAllocator (Explicit Code Relocation section in core.h).
---
 src/asmjit/core.h              |  31 +++++----
 src/asmjit/core/radefs_p.h     |   4 +-
 src/asmjit/core/zonevector.cpp | 118 ++++++++++++++++++++++++---------
 src/asmjit/core/zonevector.h   |  17 ++++-
 4 files changed, 123 insertions(+), 47 deletions(-)

diff --git a/src/asmjit/core.h b/src/asmjit/core.h
index eb99c39..cb19333 100644
--- a/src/asmjit/core.h
+++ b/src/asmjit/core.h
@@ -739,15 +739,17 @@ namespace asmjit {
 //! JitAllocator allocator;
 //!
 //! // Allocate an executable virtual memory and handle a possible failure.
-//! void* p = allocator.alloc(estimatedSize);
-//! if (!p)
+//! JitAllocator::Span span;
+//! Error err = allocator.alloc(span, estimatedSize);
+//!
+//! if (err != kErrorOk) // <- NOTE: This must be checked, always!
 //!   return 0;
 //!
 //! // Now relocate the code to the address provided by the memory allocator.
-//! // Please note that this DOESN'T COPY anything to `p`. This function will
-//! // store the address in CodeHolder and use relocation entries to patch the
-//! // existing code in all sections to respect the base address provided.
-//! code.relocateToBase((uint64_t)p);
+//! // Please note that this DOESN'T COPY anything to it. This function will
+//! // store the address in CodeHolder and use relocation entries to patch
+//! // the existing code in all sections to respect the base address provided.
+//! code.relocateToBase((uint64_t)span.rx());
 //!
 //! // This is purely optional. There are cases in which the relocation can omit
 //! // unneeded data, which would shrink the size of address table. If that
@@ -760,12 +762,17 @@ namespace asmjit {
 //! // additional options that can be used to also zero pad sections' virtual
 //! // size, etc.
 //! //
-//! // With some additional features, copyFlattenData() does roughly this:
-//! //   for (Section* section : code.sections())
-//! //     memcpy((uint8_t*)p + section->offset(),
-//! //            section->data(),
-//! //            section->bufferSize());
-//! code.copyFlattenedData(p, codeSize, CopySectionFlags::kPadSectionBuffer);
+//! // With some additional features, copyFlattenedData() does roughly the following:
+//! //
+//! //   allocator.write([&](JitAllocator::Span& span) {
+//! //     for (Section* section : code.sections()) {
+//! //       uint8_t* p = (uint8_t*)span.rw() + section->offset();
+//! //       memcpy(p, section->data(), section->bufferSize());
+//! //     }
+//! //   });
+//! allocator.write([&](JitAllocator::Span& span) {
+//!   code.copyFlattenedData(span.rw(), codeSize, CopySectionFlags::kPadSectionBuffer);
+//! });
 //!
 //! // Execute the generated function.
 //! int inA[4] = { 4, 3, 2, 1 };
diff --git a/src/asmjit/core/radefs_p.h b/src/asmjit/core/radefs_p.h
index d61a9cc..3250396 100644
--- a/src/asmjit/core/radefs_p.h
+++ b/src/asmjit/core/radefs_p.h
@@ -559,7 +559,7 @@ public:
   ASMJIT_FORCE_INLINE Error nonOverlappingUnionOf(ZoneAllocator* allocator, const RALiveSpans& x, const RALiveSpans& y, const DataType& yData) noexcept {
     uint32_t finalSize = x.size() + y.size();
-    ASMJIT_PROPAGATE(_data.reserve(allocator, finalSize));
+    ASMJIT_PROPAGATE(_data.growingReserve(allocator, finalSize));
 
     T* dstPtr = _data.data();
     const T* xSpan = x.data();
@@ -694,7 +694,7 @@ typedef RALiveSpans<LiveRegData> LiveRegSpans;
 //!   - LEA x{ W|Out}, [x{R|Use} + y{R|Out}] -> {x:R|W|Use|Out y:R|Use}
 //!
 //! It should be obvious from the example above how these flags get created. Each operand contains READ/WRITE
-//! information, which is then merged to RATiedReg's flags. However, we also need to represent the possitility
+//! information, which is then merged to RATiedReg's flags. However, we also need to represent the possibility
 //! to view the operation as two independent operations - USE and OUT, because the register allocator first
 //! allocates USE registers, and then assigns OUT registers independently of USE registers.
 enum class RATiedFlags : uint32_t {
diff --git a/src/asmjit/core/zonevector.cpp b/src/asmjit/core/zonevector.cpp
index 2486021..b68e25a 100644
--- a/src/asmjit/core/zonevector.cpp
+++ b/src/asmjit/core/zonevector.cpp
@@ -13,8 +13,63 @@ ASMJIT_BEGIN_NAMESPACE
 // ZoneVectorBase - Helpers
 // ========================
 
+// ZoneVector is used as an array to hold short-lived data structures used during code generation. The growing
+// strategy is simple - use a small capacity at the beginning (very good for ZoneAllocator) and then grow more
+// quickly to prevent successive reallocations.
+static ASMJIT_FORCE_INLINE uint32_t ZoneVector_growCapacity(uint32_t current, uint32_t growMinimum, uint32_t sizeOfT) noexcept {
+  static constexpr size_t kGrowThreshold = Globals::kGrowThreshold;
+
+  size_t byteSize = size_t(current) * sizeOfT;
+  size_t minimumByteSize = size_t(growMinimum) * sizeOfT;
+
+  // This is more than exponential growth at the beginning.
+  if (byteSize < 32) {
+    byteSize = 32;
+  }
+  else if (byteSize < 128) {
+    byteSize = 128;
+  }
+  else if (byteSize < 512) {
+    byteSize = 512;
+  }
+
+  if (byteSize < minimumByteSize) {
+    // Exponential growth before we reach `kGrowThreshold`.
+    byteSize = Support::alignUpPowerOf2(minimumByteSize);
+
+    // Bail to `growMinimum` in case of overflow - most likely whatever happens afterwards would fail anyway.
+    if (byteSize < minimumByteSize) {
+      return growMinimum;
+    }
+
+    // Chunked growth that advances by `kGrowThreshold` once we exceed it.
+    // This should not be a common case, so we don't really have to optimize for it.
+    if (byteSize > kGrowThreshold) {
+      // Align to kGrowThreshold.
+      size_t remainder = minimumByteSize % kGrowThreshold;
+
+      byteSize = minimumByteSize + remainder;
+
+      // Bail to `growMinimum` in case of overflow. This should never happen: a consecutive allocation close to
+      // 4GiB is not possible on a 32-bit machine, and on a 64-bit machine we use a 32-bit size & capacity, so a
+      // 64-bit byte size cannot overflow. The check is kept purely as an extra safety measure.
+      if (byteSize < minimumByteSize) {
+        return growMinimum;
+      }
+    }
+  }
+
+  size_t n = byteSize / sizeOfT;
+  return uint32_t(Support::min<size_t>(n, 0xFFFFFFFFu));
+}
+
+static ASMJIT_FORCE_INLINE bool ZoneVector_byteSizeIsSafe(size_t nBytes, uint32_t n) noexcept {
+  if (sizeof(uint32_t) < sizeof(size_t))
+    return true; // there is no problem when running on a 64-bit machine.
+  else
+    return nBytes >= size_t(n);
+};
+
 Error ZoneVectorBase::_grow(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
-  uint32_t threshold = Globals::kGrowThreshold / sizeOfT;
   uint32_t capacity = _capacity;
   uint32_t after = _size;
 
@@ -25,29 +80,7 @@ Error ZoneVectorBase::_grow(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t
   if (capacity >= after)
     return kErrorOk;
 
-  // ZoneVector is used as an array to hold short-lived data structures used
-  // during code generation. The growing strategy is simple - use small capacity
-  // at the beginning (very good for ZoneAllocator) and then grow quicker to
-  // prevent successive reallocations.
-  if (capacity < 4)
-    capacity = 4;
-  else if (capacity < 8)
-    capacity = 8;
-  else if (capacity < 16)
-    capacity = 16;
-  else if (capacity < 64)
-    capacity = 64;
-  else if (capacity < 256)
-    capacity = 256;
-
-  while (capacity < after) {
-    if (capacity < threshold)
-      capacity *= 2;
-    else
-      capacity += threshold;
-  }
-
-  return _reserve(allocator, sizeOfT, capacity);
+  return _reserve(allocator, sizeOfT, ZoneVector_growCapacity(capacity, after, sizeOfT));
 }
 
 Error ZoneVectorBase::_reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
@@ -55,8 +88,8 @@ Error ZoneVectorBase::_reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint3
   if (oldCapacity >= n)
     return kErrorOk;
 
-  uint32_t nBytes = n * sizeOfT;
-  if (ASMJIT_UNLIKELY(nBytes < n))
+  size_t nBytes = size_t(n) * sizeOfT;
+  if (ASMJIT_UNLIKELY(!ZoneVector_byteSizeIsSafe(nBytes, n)))
     return DebugUtils::errored(kErrorOutOfMemory);
 
   size_t allocatedBytes;
@@ -65,19 +98,28 @@ Error ZoneVectorBase::_reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint3
   if (ASMJIT_UNLIKELY(!newData))
     return DebugUtils::errored(kErrorOutOfMemory);
 
+  uint32_t newCapacity = uint32_t(allocatedBytes / sizeOfT);
+  ASMJIT_ASSERT(newCapacity >= n);
+
   void* oldData = _data;
   if (oldData && _size) {
     memcpy(newData, oldData, size_t(_size) * sizeOfT);
     allocator->release(oldData, size_t(oldCapacity) * sizeOfT);
   }
 
-  _capacity = uint32_t(allocatedBytes / sizeOfT);
-  ASMJIT_ASSERT(_capacity >= n);
-
   _data = newData;
+  _capacity = newCapacity;
+
   return kErrorOk;
 }
 
+Error ZoneVectorBase::_growingReserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
+  uint32_t capacity = _capacity;
+  if (capacity >= n)
+    return kErrorOk;
+  return _reserve(allocator, sizeOfT, ZoneVector_growCapacity(capacity, n, sizeOfT));
+}
+
 Error ZoneVectorBase::_resize(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
   uint32_t size = _size;
 
@@ -266,6 +308,8 @@ Error ZoneBitVector::_append(ZoneAllocator* allocator, bool value) noexcept {
 #if defined(ASMJIT_TEST)
 template<typename T>
 static void test_zone_vector(ZoneAllocator* allocator, const char* typeName) {
+  constexpr uint32_t kMiB = 1024 * 1024;
+
   int i;
   int kMax = 100000;
 
@@ -301,12 +345,22 @@ static void test_zone_vector(ZoneAllocator* allocator, const char* typeName) {
   int64_t fsum = 0;
   int64_t rsum = 0;
 
-  for (const T& item : vec) { fsum += item; }
-  for (auto it = vec.rbegin(); it != vec.rend(); ++it) { rsum += *it; }
+  for (const T& item : vec) {
+    fsum += item;
+  }
+
+  for (auto it = vec.rbegin(); it != vec.rend(); ++it) {
+    rsum += *it;
+  }
 
   EXPECT_EQ(fsum, rsum);
-  vec.release(allocator);
+
+  INFO("ZoneVector::growingReserve()");
+  for (uint32_t j = 0; j < 40 / sizeof(T); j += 8) {
+    EXPECT_EQ(vec.growingReserve(allocator, j * kMiB), kErrorOk);
+    EXPECT_GE(vec.capacity(), j * kMiB);
+  }
 }
 
 static void test_zone_bitvector(ZoneAllocator* allocator) {
diff --git a/src/asmjit/core/zonevector.h b/src/asmjit/core/zonevector.h
index 13d28bb..f38dca5 100644
--- a/src/asmjit/core/zonevector.h
+++ b/src/asmjit/core/zonevector.h
@@ -58,6 +58,7 @@ protected:
   ASMJIT_API Error _grow(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
   ASMJIT_API Error _resize(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
   ASMJIT_API Error _reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
+  ASMJIT_API Error _growingReserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
 
   inline void _swap(ZoneVectorBase& other) noexcept {
     std::swap(_data, other._data);
@@ -414,7 +415,21 @@ public:
 
   //! Reallocates the internal array to fit at least `n` items.
   inline Error reserve(ZoneAllocator* allocator, uint32_t n) noexcept {
-    return n > _capacity ? ZoneVectorBase::_reserve(allocator, sizeof(T), n) : Error(kErrorOk);
+    if (ASMJIT_UNLIKELY(n > _capacity))
+      return ZoneVectorBase::_reserve(allocator, sizeof(T), n);
+    else
+      return Error(kErrorOk);
+  }
+
+  //! Reallocates the internal array to fit at least `n` items, using growing semantics.
+  //!
+  //! If the capacity of the vector is lower than `n`, the same growth calculation is applied as if `n` items were
+  //! appended to an empty vector, which means reserving additional space for append operations that may follow.
+  inline Error growingReserve(ZoneAllocator* allocator, uint32_t n) noexcept {
+    if (ASMJIT_UNLIKELY(n > _capacity))
+      return ZoneVectorBase::_growingReserve(allocator, sizeof(T), n);
+    else
+      return Error(kErrorOk);
   }
 
   inline Error willGrow(ZoneAllocator* allocator, uint32_t n = 1) noexcept {
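
For reference, the effect of the new growth policy can be illustrated with a small standalone program. The following is a minimal sketch, not asmjit code: the 8 MiB value used for kGrowThreshold is an assumption (the real value comes from Globals::kGrowThreshold and is not shown in this patch), and std::min plus a local alignUpPowerOf2() stand in for asmjit's Support helpers. main() simulates the nonOverlappingUnionOf() pattern, i.e. a buffer whose required size grows on every call.

// Standalone sketch of the growing strategy behind growingReserve() (not asmjit code).
// Assumption: kGrowThreshold is set to 8 MiB here purely for illustration.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

static constexpr size_t kGrowThreshold = 8u * 1024u * 1024u;

// Rounds `x` up to the next power of two (stand-in for Support::alignUpPowerOf2).
static size_t alignUpPowerOf2(size_t x) {
  size_t p = 1;
  while (p < x)
    p <<= 1;
  return p;
}

// Mirrors the shape of ZoneVector_growCapacity(): small fixed steps first,
// then exponential growth, then chunked growth once kGrowThreshold is exceeded.
static uint32_t growCapacity(uint32_t current, uint32_t growMinimum, uint32_t sizeOfT) {
  size_t byteSize = size_t(current) * sizeOfT;
  size_t minimumByteSize = size_t(growMinimum) * sizeOfT;

  if (byteSize < 32)
    byteSize = 32;
  else if (byteSize < 128)
    byteSize = 128;
  else if (byteSize < 512)
    byteSize = 512;

  if (byteSize < minimumByteSize) {
    byteSize = alignUpPowerOf2(minimumByteSize);
    if (byteSize > kGrowThreshold) {
      // Chunked growth past the threshold, as in the patch.
      size_t remainder = minimumByteSize % kGrowThreshold;
      byteSize = minimumByteSize + remainder;
    }
  }

  size_t n = byteSize / sizeOfT;
  return uint32_t(std::min<size_t>(n, 0xFFFFFFFFu));
}

int main() {
  // Simulate a union that grows by 100 elements on every call. An exact reserve()
  // would reallocate on every iteration; the growing policy reallocates only when
  // the requested size exceeds the current capacity.
  uint32_t capacity = 0;
  uint32_t size = 0;
  uint32_t reallocations = 0;

  for (int i = 0; i < 10000; i++) {
    size += 100;
    if (size > capacity) {
      capacity = growCapacity(capacity, size, uint32_t(sizeof(uint32_t)));
      reallocations++;
    }
  }

  std::printf("final size=%u, capacity=%u, reallocations=%u\n",
              unsigned(size), unsigned(capacity), unsigned(reallocations));
  return 0;
}

Under these assumptions the 10000 simulated calls trigger only about a dozen reallocations instead of one per call, which is the point of switching nonOverlappingUnionOf() from reserve() to growingReserve().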