Improved the performance of bin-packing (Compiler) (fixes #440)

During bin-packing, a single function, nonOverlappingUnionOf(), is
called many times to calculate the union of one set of live ranges
with another. Before this change it used ZoneVector::reserve() to
make sure there was enough space for the union. However, this is not
ideal when the union grows every time the function is called, because
the vector then gets reallocated many times, which hurts performance.

Instead of calling reserve(), a new function growingReserve() was
added, which tells the vector to apply its growth strategy when it
has to reallocate, so capacity stays ahead of subsequent requests.
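
A minimal sketch of the difference (hypothetical sizes, error
handling elided; `allocator` is assumed to be a valid ZoneAllocator*):

  ZoneVector<uint32_t> a;
  (void)a.reserve(allocator, 1000);        // capacity close to the request
  (void)a.reserve(allocator, 1003);        // may reallocate again

  ZoneVector<uint32_t> b;
  (void)b.growingReserve(allocator, 1000); // capacity rounded up past 1000
  (void)b.growingReserve(allocator, 1003); // no reallocation expected

  a.release(allocator);
  b.release(allocator);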

In addition, this change fixes the documentation regarding the use of
JitAllocator (the Explicit Code Relocation section in core.h), which
still showed the old allocation API.
Author: kobalicek
Date:   2024-06-13 13:17:50 +02:00
Parent: 4a61c23ab6
Commit: f5df7a2b1b

4 changed files with 123 additions and 47 deletions

src/asmjit/core.h

@@ -739,15 +739,17 @@ namespace asmjit {
 //! JitAllocator allocator;
 //!
 //! // Allocate an executable virtual memory and handle a possible failure.
-//! void* p = allocator.alloc(estimatedSize);
-//! if (!p)
+//! JitAllocator::Span span;
+//! Error err = allocator.alloc(span, estimatedSize);
+//!
+//! if (err != kErrorOk) // <- NOTE: This must be checked, always!
 //!   return 0;
 //!
 //! // Now relocate the code to the address provided by the memory allocator.
-//! // Please note that this DOESN'T COPY anything to `p`. This function will
-//! // store the address in CodeHolder and use relocation entries to patch the
-//! // existing code in all sections to respect the base address provided.
-//! code.relocateToBase((uint64_t)p);
+//! // Please note that this DOESN'T COPY anything to it. This function will
+//! // store the address in CodeHolder and use relocation entries to patch
+//! // the existing code in all sections to respect the base address provided.
+//! code.relocateToBase((uint64_t)span.rx());
 //!
 //! // This is purely optional. There are cases in which the relocation can omit
 //! // unneeded data, which would shrink the size of address table. If that
@@ -760,12 +762,17 @@ namespace asmjit {
 //! // additional options that can be used to also zero pad sections' virtual
 //! // size, etc.
 //! //
-//! // With some additional features, copyFlattenData() does roughly this:
-//! //   for (Section* section : code.sections())
-//! //     memcpy((uint8_t*)p + section->offset(),
-//! //            section->data(),
-//! //            section->bufferSize());
-//! code.copyFlattenedData(p, codeSize, CopySectionFlags::kPadSectionBuffer);
+//! // With some additional features, copyFlattenData() does roughly the following:
+//! //
+//! //   allocator.write([&](JitAllocator::Span& span) {
+//! //     for (Section* section : code.sections()) {
+//! //       uint8_t* p = (uint8_t*)span.rw() + section->offset();
+//! //       memcpy(p, section->data(), section->bufferSize());
+//! //     }
+//! //   });
+//! allocator.write([&](JitAllocator::Span& span) {
+//!   code.copyFlattenedData(span.rw(), codeSize, CopySectionFlags::kPadSectionBuffer);
+//! });
 //!
 //! // Execute the generated function.
 //! int inA[4] = { 4, 3, 2, 1 };
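
Putting the two updated fragments together, the documented flow now
reads as follows (a condensed sketch, assuming a CodeHolder `code`
that has already been assembled, and `estimatedSize` as the caller's
upper bound for the allocation):

  JitAllocator allocator;
  JitAllocator::Span span;

  // Allocate executable memory and always check the returned Error.
  if (allocator.alloc(span, estimatedSize) != kErrorOk)
    return 0;

  // Patch the code to the target base address - nothing is copied yet.
  code.relocateToBase((uint64_t)span.rx());

  // Copy all sections into the RW mapping of the span under a write scope.
  size_t codeSize = code.codeSize();
  allocator.write([&](JitAllocator::Span& span) {
    code.copyFlattenedData(span.rw(), codeSize, CopySectionFlags::kPadSectionBuffer);
  });

  // span.rx() is the executable address of the relocated code.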

src/asmjit/core/radefs_p.h

@@ -559,7 +559,7 @@ public:
   ASMJIT_FORCE_INLINE Error nonOverlappingUnionOf(ZoneAllocator* allocator, const RALiveSpans<T>& x, const RALiveSpans<T>& y, const DataType& yData) noexcept {
     uint32_t finalSize = x.size() + y.size();
 
-    ASMJIT_PROPAGATE(_data.reserve(allocator, finalSize));
+    ASMJIT_PROPAGATE(_data.growingReserve(allocator, finalSize));
 
     T* dstPtr = _data.data();
     const T* xSpan = x.data();
@@ -694,7 +694,7 @@ typedef RALiveSpans<LiveRegSpan> LiveRegSpans;
 //!   - LEA x{ W|Out}, [x{R|Use} + y{R|Out}] -> {x:R|W|Use|Out y:R|Use}
 //!
 //! It should be obvious from the example above how these flags get created. Each operand contains READ/WRITE
-//! information, which is then merged to RATiedReg's flags. However, we also need to represent the possitility
+//! information, which is then merged to RATiedReg's flags. However, we also need to represent the possibility
 //! to view the operation as two independent operations - USE and OUT, because the register allocator first
 //! allocates USE registers, and then assigns OUT registers independently of USE registers.
 enum class RATiedFlags : uint32_t {

src/asmjit/core/zonevector.cpp

@@ -13,8 +13,63 @@ ASMJIT_BEGIN_NAMESPACE
 // ZoneVectorBase - Helpers
 // ========================
 
+// ZoneVector is used as an array to hold short-lived data structures used during code generation. The growing
+// strategy is simple - use small capacity at the beginning (very good for ZoneAllocator) and then grow quicker
+// to prevent successive reallocations.
+static ASMJIT_FORCE_INLINE uint32_t ZoneVector_growCapacity(uint32_t current, uint32_t growMinimum, uint32_t sizeOfT) noexcept {
+  static constexpr size_t kGrowThreshold = Globals::kGrowThreshold;
+
+  size_t byteSize = size_t(current) * sizeOfT;
+  size_t minimumByteSize = size_t(growMinimum) * sizeOfT;
+
+  // This is more than exponential growth at the beginning.
+  if (byteSize < 32) {
+    byteSize = 32;
+  }
+  else if (byteSize < 128) {
+    byteSize = 128;
+  }
+  else if (byteSize < 512) {
+    byteSize = 512;
+  }
+
+  if (byteSize < minimumByteSize) {
+    // Exponential growth before we reach `kGrowThreshold`.
+    byteSize = Support::alignUpPowerOf2(minimumByteSize);
+
+    // Bail to `growMinimum` in case of overflow - most likely whatever is happening afterwards would just fail.
+    if (byteSize < minimumByteSize) {
+      return growMinimum;
+    }
+
+    // Pretty much chunked growth advancing by `kGrowThreshold` after we exceed it.
+    // This should not be a common case, so we don't really have to optimize for it.
+    if (byteSize > kGrowThreshold) {
+      // Align to kGrowThreshold.
+      size_t remainder = minimumByteSize % kGrowThreshold;
+      byteSize = minimumByteSize + remainder;
+
+      // Bail to `growMinimum` in case of overflow - this should never happen, as it's unlikely we would hit it
+      // on a 32-bit machine (a consecutive near-4GiB allocation is impossible), and it cannot happen on a 64-bit
+      // machine as we use 32-bit size & capacity, so a 64-bit integer cannot overflow. Added as an extreme measure.
+      if (byteSize < minimumByteSize)
+        return growMinimum;
+    }
+  }
+
+  size_t n = byteSize / sizeOfT;
+  return uint32_t(Support::min<size_t>(n, 0xFFFFFFFFu));
+}
+
+static ASMJIT_FORCE_INLINE bool ZoneVector_byteSizeIsSafe(size_t nBytes, uint32_t n) noexcept {
+  if (sizeof(uint32_t) < sizeof(size_t))
+    return true; // there is no problem when running on a 64-bit machine.
+  else
+    return nBytes >= size_t(n);
+}
+
 Error ZoneVectorBase::_grow(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
-  uint32_t threshold = Globals::kGrowThreshold / sizeOfT;
   uint32_t capacity = _capacity;
   uint32_t after = _size;
@@ -25,29 +80,7 @@ Error ZoneVectorBase::_grow(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t
   if (capacity >= after)
     return kErrorOk;
 
-  // ZoneVector is used as an array to hold short-lived data structures used
-  // during code generation. The growing strategy is simple - use small capacity
-  // at the beginning (very good for ZoneAllocator) and then grow quicker to
-  // prevent successive reallocations.
-  if (capacity < 4)
-    capacity = 4;
-  else if (capacity < 8)
-    capacity = 8;
-  else if (capacity < 16)
-    capacity = 16;
-  else if (capacity < 64)
-    capacity = 64;
-  else if (capacity < 256)
-    capacity = 256;
-
-  while (capacity < after) {
-    if (capacity < threshold)
-      capacity *= 2;
-    else
-      capacity += threshold;
-  }
-
-  return _reserve(allocator, sizeOfT, capacity);
+  return _reserve(allocator, sizeOfT, ZoneVector_growCapacity(capacity, after, sizeOfT));
 }
 
 Error ZoneVectorBase::_reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
@@ -55,8 +88,8 @@ Error ZoneVectorBase::_reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint3
   if (oldCapacity >= n)
     return kErrorOk;
 
-  uint32_t nBytes = n * sizeOfT;
-  if (ASMJIT_UNLIKELY(nBytes < n))
+  size_t nBytes = size_t(n) * sizeOfT;
+  if (ASMJIT_UNLIKELY(!ZoneVector_byteSizeIsSafe(nBytes, n)))
     return DebugUtils::errored(kErrorOutOfMemory);
 
   size_t allocatedBytes;
@@ -65,19 +98,28 @@ Error ZoneVectorBase::_reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint3
   if (ASMJIT_UNLIKELY(!newData))
     return DebugUtils::errored(kErrorOutOfMemory);
 
+  uint32_t newCapacity = uint32_t(allocatedBytes / sizeOfT);
+  ASMJIT_ASSERT(newCapacity >= n);
+
   void* oldData = _data;
   if (oldData && _size) {
     memcpy(newData, oldData, size_t(_size) * sizeOfT);
     allocator->release(oldData, size_t(oldCapacity) * sizeOfT);
   }
 
-  _capacity = uint32_t(allocatedBytes / sizeOfT);
-  ASMJIT_ASSERT(_capacity >= n);
-
   _data = newData;
+  _capacity = newCapacity;
+
   return kErrorOk;
 }
 
+Error ZoneVectorBase::_growingReserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
+  uint32_t capacity = _capacity;
+
+  if (capacity >= n)
+    return kErrorOk;
+
+  return _reserve(allocator, sizeOfT, ZoneVector_growCapacity(capacity, n, sizeOfT));
+}
+
 Error ZoneVectorBase::_resize(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept {
   uint32_t size = _size;
@@ -266,6 +308,8 @@ Error ZoneBitVector::_append(ZoneAllocator* allocator, bool value) noexcept {
 #if defined(ASMJIT_TEST)
 template<typename T>
 static void test_zone_vector(ZoneAllocator* allocator, const char* typeName) {
+  constexpr uint32_t kMiB = 1024 * 1024;
+
   int i;
   int kMax = 100000;
@@ -301,12 +345,22 @@ static void test_zone_vector(ZoneAllocator* allocator, const char* typeName) {
   int64_t fsum = 0;
   int64_t rsum = 0;
 
-  for (const T& item : vec) { fsum += item; }
-  for (auto it = vec.rbegin(); it != vec.rend(); ++it) { rsum += *it; }
+  for (const T& item : vec) {
+    fsum += item;
+  }
+
+  for (auto it = vec.rbegin(); it != vec.rend(); ++it) {
+    rsum += *it;
+  }
 
   EXPECT_EQ(fsum, rsum);
   vec.release(allocator);
+
+  INFO("ZoneVector::growingReserve()");
+  for (uint32_t j = 0; j < 40 / sizeof(T); j += 8) {
+    EXPECT_EQ(vec.growingReserve(allocator, j * kMiB), kErrorOk);
+    EXPECT_GE(vec.capacity(), j * kMiB);
+  }
 }
 
 static void test_zone_bitvector(ZoneAllocator* allocator) {
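
To make the new strategy concrete, here is a hand-derived worked
example of ZoneVector_growCapacity() for 4-byte items (illustrative
values, computed from the code above):

  // ZoneVector_growCapacity(current, growMinimum, 4) -> new capacity:
  //
  //   (0,   1)   -> 8    // 32-byte floor
  //   (8,   9)   -> 32   // 128-byte bucket
  //   (32,  33)  -> 128  // 512-byte bucket
  //   (128, 129) -> 256  // alignUpPowerOf2(516 bytes) == 1024 bytes
  //
  // Growth stays exponential until the byte size crosses
  // Globals::kGrowThreshold; past that it advances in roughly
  // kGrowThreshold-sized chunks, bounding worst-case over-allocation.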

src/asmjit/core/zonevector.h

@@ -58,6 +58,7 @@ protected:
   ASMJIT_API Error _grow(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
   ASMJIT_API Error _resize(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
   ASMJIT_API Error _reserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
+  ASMJIT_API Error _growingReserve(ZoneAllocator* allocator, uint32_t sizeOfT, uint32_t n) noexcept;
 
   inline void _swap(ZoneVectorBase& other) noexcept {
     std::swap(_data, other._data);
@@ -414,7 +415,21 @@ public:
   //! Reallocates the internal array to fit at least `n` items.
   inline Error reserve(ZoneAllocator* allocator, uint32_t n) noexcept {
-    return n > _capacity ? ZoneVectorBase::_reserve(allocator, sizeof(T), n) : Error(kErrorOk);
+    if (ASMJIT_UNLIKELY(n > _capacity))
+      return ZoneVectorBase::_reserve(allocator, sizeof(T), n);
+    else
+      return Error(kErrorOk);
+  }
+
+  //! Reallocates the internal array to fit at least `n` items with growing semantics.
+  //!
+  //! If the vector is smaller than `n` the same growing calculations will be used as if `n` items were appended
+  //! to an empty vector, which means reserving additional space for more append operations that could follow.
+  inline Error growingReserve(ZoneAllocator* allocator, uint32_t n) noexcept {
+    if (ASMJIT_UNLIKELY(n > _capacity))
+      return ZoneVectorBase::_growingReserve(allocator, sizeof(T), n);
+    else
+      return Error(kErrorOk);
   }
 
   inline Error willGrow(ZoneAllocator* allocator, uint32_t n = 1) noexcept {
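
As a usage sketch of the growing semantics described above
(hypothetical loop, mirroring how nonOverlappingUnionOf() calls it):

  ZoneVector<uint32_t> vec;
  uint32_t unionSize = 0;

  for (uint32_t pass = 0; pass < 1000; pass++) {
    unionSize += 3; // the union typically grows a little on every pass.

    // reserve(unionSize) could reallocate on many passes; growingReserve()
    // reserves as if `unionSize` items were appended to an empty vector,
    // so the number of reallocations stays logarithmic.
    ASMJIT_PROPAGATE(vec.growingReserve(allocator, unionSize));
  }

  vec.release(allocator);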