//! An allocator that is intended to be used in Debug mode.
//!
//! ## Features
//!
//! * Captures stack traces on allocation, free, and optionally resize.
//! * Double free detection, which prints all three traces (first alloc, first
//!   free, second free).
//! * Leak detection, with stack traces.
//! * Never reuses memory addresses, making it easier for Zig to detect branch
//!   on undefined values in case of dangling pointers. This relies on
//!   the backing allocator to also not reuse addresses.
//! * Uses a minimum backing allocation size to avoid operating system errors
//!   from having too many active memory mappings.
//! * When a page of memory is no longer needed, it is given back to the
//!   operating system as soon as possible, removing it from resident memory,
//!   so that any later use causes a page fault.
//! * Cross platform. Operates based on a backing allocator which makes it work
//!   everywhere, even freestanding.
//! * Compile-time configuration.
//!
//! These features require the allocator to be quite slow and wasteful. For
//! example, when allocating a single byte, the efficiency is less than 1%;
//! it requires more than 100 bytes of overhead to manage the allocation for
//! one byte. The efficiency gets better with larger allocations.
//!
//! ## Basic Design
//!
//! Allocations are divided into two categories, small and large.
//!
//! Small allocations are divided into buckets based on `page_size`:
//!
//! ```
//! index obj_size
//! 0     1
//! 1     2
//! 2     4
//! 3     8
//! 4     16
//! 5     32
//! 6     64
//! 7     128
//! 8     256
//! 9     512
//! 10    1024
//! 11    2048
//! ...
//! ```
//!
//! This goes on for `small_bucket_count` indexes.
//!
//! Allocations are grouped into an object size based on max(len, alignment),
//! rounded up to the next power of two.
//!
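//! For example, a 13-byte allocation requested with 8-byte alignment is served
//! from the 16-byte size class (index 4 in the table above). A sketch of the
//! mapping, illustrative rather than the exact internal computation:
//!
//! ```
//! const obj_size = std.math.ceilPowerOfTwoAssert(usize, @max(len, alignment));
//! const size_class_index = std.math.log2(obj_size);
//! ```
//!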
//! The main allocator state has an array of all the "current" buckets for each
//! size class. Each slot in the array can be null, meaning the bucket for that
//! size class is not allocated. When the first object is allocated for a given
//! size class, it makes one `page_size` allocation from the backing allocator.
//! This allocation is divided into "slots" - one per allocated object, leaving
//! room for the allocation metadata (starting with `BucketHeader`), which is
//! located at the very end of the "page".
//!
//! The allocation metadata includes "used bits" - 1 bit per slot representing
//! whether the slot is used. Allocations always take the next available slot
//! from the current bucket, setting the corresponding used bit, as well as
//! incrementing `allocated_count`.
//!
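//! Conceptually, checking whether slot `i` of a bucket is currently live looks
//! like this (a sketch in terms of the `usedBits` helper defined further down):
//!
//! ```
//! const word = bucket.usedBits(i / @bitSizeOf(usize)).*;
//! const is_used = (word >> @intCast(i % @bitSizeOf(usize))) & 1 != 0;
//! ```
//!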
//! Frees recover the allocation metadata based on the address, length, and
//! alignment, relying on the backing allocation's large alignment, combined
//! with the fact that allocations are never moved from small to large, or vice
//! versa.
//!
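//! In particular, because each bucket's backing allocation is aligned to
//! `page_size`, the page that owns a small allocation (and therefore the
//! `BucketHeader` stored at its end) can be recovered from the freed pointer
//! alone. A sketch of the idea, where `freed_ptr` is the pointer being freed:
//!
//! ```
//! const page_addr = @intFromPtr(freed_ptr) & ~(page_size - 1);
//! ```
//!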
//! When a bucket is full, a new one is allocated, containing a pointer to the
//! previous one. This doubly-linked list is iterated during leak detection.
//!
//! Resizing and remapping work the same on small allocations: if the size
//! class would not change, then the operation succeeds, and the address is
//! unchanged. Otherwise, the request is rejected.
//!
//! Large objects are allocated directly using the backing allocator. Metadata
//! is stored separately in a `std.HashMap` using the backing allocator.
//!
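//! The table is keyed by address, so freeing a large allocation conceptually
//! begins with a lookup along these lines (a sketch; `freed_ptr` is the pointer
//! being freed):
//!
//! ```
//! const entry = self.large_allocations.getEntry(@intFromPtr(freed_ptr));
//! ```
//!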
//! Resizing and remapping are forwarded directly to the backing allocator,
//! except where such operations would change the category from large to small.
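//!
//! ## Usage
//!
//! A minimal usage sketch (assuming this type is exposed as
//! `std.heap.DebugAllocator` and that, as with the other std allocators,
//! `deinit` reports whether anything was leaked):
//!
//! ```
//! var debug_state: std.heap.DebugAllocator(.{}) = .init;
//! defer _ = debug_state.deinit();
//! const gpa = debug_state.allocator();
//!
//! const slice = try gpa.alloc(u8, 100);
//! defer gpa.free(slice);
//! ```
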
const builtin = @import("builtin");
const StackTrace = std.builtin.StackTrace;

const std = @import("std");
const log = std.log.scoped(.gpa);
const math = std.math;
const assert = std.debug.assert;
const mem = std.mem;
const Allocator = std.mem.Allocator;

const default_page_size: usize = switch (builtin.os.tag) {
    // Makes `std.heap.PageAllocator` take the happy path.
    .windows => 64 * 1024,
    else => switch (builtin.cpu.arch) {
        // Max alignment supported by `std.heap.WasmAllocator`.
        .wasm32, .wasm64 => 64 * 1024,
        // Avoids too many active mappings when `page_size_max` is low.
        else => @max(std.heap.page_size_max, 128 * 1024),
    },
};

const Log2USize = std.math.Log2Int(usize);

const default_sys_stack_trace_frames: usize = if (std.debug.sys_can_stack_trace) 6 else 0;
const default_stack_trace_frames: usize = switch (builtin.mode) {
    .Debug => default_sys_stack_trace_frames,
    else => 0,
};
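
/// Compile-time configuration for `DebugAllocator`. A sketch of overriding a
/// couple of the fields declared below (anything not set keeps its default):
///
/// ```
/// const Gpa = DebugAllocator(.{
///     .stack_trace_frames = 8,
///     .retain_metadata = true,
/// });
/// ```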
pub const Config = struct {
    /// Number of stack frames to capture.
    stack_trace_frames: usize = default_stack_trace_frames,

    /// If true, the allocator will have two fields:
    /// * `total_requested_bytes` which tracks the total allocated bytes of memory requested.
    /// * `requested_memory_limit` which causes allocations to return `error.OutOfMemory`
    ///   when the `total_requested_bytes` exceeds this limit.
    /// If false, these fields will be `void`.
    enable_memory_limit: bool = false,

    /// Whether to enable safety checks.
    safety: bool = std.debug.runtime_safety,

    /// Whether the allocator may be used simultaneously from multiple threads.
    thread_safe: bool = !builtin.single_threaded,

    /// What type of mutex you'd like to use, for thread safety.
    /// When specified, the mutex type must have the same shape as `std.Thread.Mutex` and
    /// `DummyMutex`, and have no required fields. Specifying this field causes
    /// the `thread_safe` field to be ignored.
    ///
    /// When null (default):
    /// * the mutex type defaults to `std.Thread.Mutex` when thread_safe is enabled.
    /// * the mutex type defaults to `DummyMutex` otherwise.
    MutexType: ?type = null,

    /// This is a temporary debugging trick you can use to turn segfaults into more helpful
    /// logged error messages with stack trace details. The downside is that every allocation
    /// will be leaked, unless used with retain_metadata!
    never_unmap: bool = false,

    /// This is a temporary debugging aid that retains metadata about allocations indefinitely.
    /// This allows a greater range of double frees to be reported. All metadata is freed when
    /// deinit is called. When used with never_unmap, deliberately leaked memory is also freed
    /// during deinit. Currently should be used with never_unmap to avoid segfaults.
    /// TODO https://github.com/ziglang/zig/issues/4298 will allow use without never_unmap
    retain_metadata: bool = false,

    /// Enables emitting info messages with the size and address of every allocation.
    verbose_log: bool = false,

    /// Whether the backing allocator returns already-zeroed memory.
    backing_allocator_zeroes: bool = true,

    /// When resizing an allocation, refresh the stack trace with the resize
    /// callsite. Comes with a performance penalty.
    resize_stack_traces: bool = false,

    /// Magic value that distinguishes allocations owned by this allocator from
    /// other regions of memory.
    canary: usize = @truncate(0x9232a6ff85dff10f),

    /// The size of allocations requested from the backing allocator for
    /// subdividing into slots for small allocations.
    ///
    /// Must be a power of two.
    page_size: usize = default_page_size,
};

/// Default initialization of this struct is deprecated; use `.init` instead.
pub fn DebugAllocator(comptime config: Config) type {
    return struct {
        backing_allocator: Allocator = std.heap.page_allocator,
        /// Tracks the active bucket, which is the one that has free slots in it.
        buckets: [small_bucket_count]?*BucketHeader = [1]?*BucketHeader{null} ** small_bucket_count,
        large_allocations: LargeAllocTable = .empty,
        total_requested_bytes: @TypeOf(total_requested_bytes_init) = total_requested_bytes_init,
        requested_memory_limit: @TypeOf(requested_memory_limit_init) = requested_memory_limit_init,
        mutex: @TypeOf(mutex_init) = mutex_init,

        const Self = @This();

        pub const init: Self = .{};

        /// These can be derived from size_class_index but the calculation is nontrivial.
        const slot_counts: [small_bucket_count]SlotIndex = init: {
            @setEvalBranchQuota(10000);
            var result: [small_bucket_count]SlotIndex = undefined;
            for (&result, 0..) |*elem, i| elem.* = calculateSlotCount(i);
            break :init result;
        };

        comptime {
            assert(math.isPowerOfTwo(page_size));
        }

        const page_size = config.page_size;
        const page_align: mem.Alignment = .fromByteUnits(page_size);
        /// Integer type for pointing to slots in a small allocation
        const SlotIndex = std.meta.Int(.unsigned, math.log2(page_size) + 1);

        const total_requested_bytes_init = if (config.enable_memory_limit) @as(usize, 0) else {};
        const requested_memory_limit_init = if (config.enable_memory_limit) @as(usize, math.maxInt(usize)) else {};

        const mutex_init = if (config.MutexType) |T|
            T{}
        else if (config.thread_safe)
            std.Thread.Mutex{}
        else
            DummyMutex{};

        const DummyMutex = struct {
            inline fn lock(_: DummyMutex) void {}
            inline fn unlock(_: DummyMutex) void {}
        };

        const stack_n = config.stack_trace_frames;
        const one_trace_size = @sizeOf(usize) * stack_n;
        const traces_per_slot = 2;

        pub const Error = mem.Allocator.Error;

        /// Avoids creating buckets that would only be able to store a small
        /// number of slots. Value of 1 means 2 is the minimum slot count.
        const minimum_slots_per_bucket_log2 = 1;
        const small_bucket_count = math.log2(page_size) - minimum_slots_per_bucket_log2;
        const largest_bucket_object_size = 1 << (small_bucket_count - 1);
        const LargestSizeClassInt = std.math.IntFittingRange(0, largest_bucket_object_size);

        const bucketCompare = struct {
            fn compare(a: *BucketHeader, b: *BucketHeader) std.math.Order {
                return std.math.order(@intFromPtr(a.page), @intFromPtr(b.page));
            }
        }.compare;

        const LargeAlloc = struct {
            bytes: []u8,
            requested_size: if (config.enable_memory_limit) usize else void,
            stack_addresses: [trace_n][stack_n]usize,
            freed: if (config.retain_metadata) bool else void,
            alignment: if (config.never_unmap and config.retain_metadata) mem.Alignment else void,

            const trace_n = if (config.retain_metadata) traces_per_slot else 1;

            fn dumpStackTrace(self: *LargeAlloc, trace_kind: TraceKind) void {
                std.debug.dumpStackTrace(self.getStackTrace(trace_kind));
            }

            fn getStackTrace(self: *LargeAlloc, trace_kind: TraceKind) std.builtin.StackTrace {
                assert(@intFromEnum(trace_kind) < trace_n);
                const stack_addresses = &self.stack_addresses[@intFromEnum(trace_kind)];
                var len: usize = 0;
                while (len < stack_n and stack_addresses[len] != 0) {
                    len += 1;
                }
                return .{
                    .instruction_addresses = stack_addresses,
                    .index = len,
                };
            }

            fn captureStackTrace(self: *LargeAlloc, ret_addr: usize, trace_kind: TraceKind) void {
                assert(@intFromEnum(trace_kind) < trace_n);
                const stack_addresses = &self.stack_addresses[@intFromEnum(trace_kind)];
                collectStackTrace(ret_addr, stack_addresses);
            }
        };
        const LargeAllocTable = std.AutoHashMapUnmanaged(usize, LargeAlloc);

        /// Bucket: In memory, in order:
        /// * BucketHeader
        /// * bucket_used_bits: [N]usize, // 1 bit for every slot
        /// -- below only exists when config.safety is true --
        /// * requested_sizes: [N]LargestSizeClassInt // 1 int for every slot
        /// * log2_ptr_aligns: [N]u8 // 1 byte for every slot
        /// -- above only exists when config.safety is true --
        /// * stack_trace_addresses: [N]usize, // traces_per_slot for every allocation
        const BucketHeader = struct {
            allocated_count: SlotIndex,
            freed_count: SlotIndex,
            prev: ?*BucketHeader,
            next: ?*BucketHeader,
            canary: usize = config.canary,
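
            /// Recovers the `BucketHeader` of the bucket whose page starts at
            /// `page_addr`. The header and the metadata that follows it occupy
            /// the end of the page (see the layout comment above).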
            fn fromPage(page_addr: usize, slot_count: usize) *BucketHeader {
                const unaligned = page_addr + page_size - bucketSize(slot_count);
                return @ptrFromInt(unaligned & ~(@as(usize, @alignOf(BucketHeader)) - 1));
            }
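
            /// `index` selects one `usize` word of the used-bits array stored
            /// directly after the header; it is a word index, not a slot index.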
            fn usedBits(bucket: *BucketHeader, index: usize) *usize {
                const ptr: [*]u8 = @ptrCast(bucket);
                const bits: [*]usize = @ptrCast(@alignCast(ptr + @sizeOf(BucketHeader)));
                return &bits[index];
            }

            fn requestedSizes(bucket: *BucketHeader, slot_count: usize) []LargestSizeClassInt {
                if (!config.safety) @compileError("requested size is only stored when safety is enabled");
                const start_ptr = @as([*]u8, @ptrCast(bucket)) + bucketRequestedSizesStart(slot_count);
                const sizes = @as([*]LargestSizeClassInt, @ptrCast(@alignCast(start_ptr)));
                return sizes[0..slot_count];
            }

            fn log2PtrAligns(bucket: *BucketHeader, slot_count: usize) []mem.Alignment {
                if (!config.safety) @compileError("alignments are only stored when safety is enabled");
                const aligns_ptr = @as([*]u8, @ptrCast(bucket)) + bucketAlignsStart(slot_count);
                return @ptrCast(aligns_ptr[0..slot_count]);
            }

            fn stackTracePtr(
                bucket: *BucketHeader,
                slot_count: usize,
                slot_index: SlotIndex,
                trace_kind: TraceKind,
            ) *[stack_n]usize {
                const start_ptr = @as([*]u8, @ptrCast(bucket)) + bucketStackFramesStart(slot_count);
                const addr = start_ptr + one_trace_size * traces_per_slot * slot_index +
                    @intFromEnum(trace_kind) * @as(usize, one_trace_size);
                return @ptrCast(@alignCast(addr));
            }

            fn captureStackTrace(
                bucket: *BucketHeader,
                ret_addr: usize,
                slot_count: usize,
                slot_index: SlotIndex,
                trace_kind: TraceKind,
            ) void {
                // Initialize them to 0. When determining the count we must look
                // for non-zero addresses.
                const stack_addresses = bucket.stackTracePtr(slot_count, slot_index, trace_kind);
                collectStackTrace(ret_addr, stack_addresses);
            }
        };
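
        /// Returns an `Allocator` whose `ptr` field points at `self`, so `self`
        /// must remain at a stable address for as long as the returned allocator
        /// is in use.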
        pub fn allocator(self: *Self) Allocator {
            return .{
                .ptr = self,
                .vtable = &.{
                    .alloc = alloc,
                    .resize = resize,
                    .remap = remap,
                    .free = free,
                },
            };
        }

        fn bucketStackTrace(
            bucket: *BucketHeader,
            slot_count: usize,
            slot_index: SlotIndex,
            trace_kind: TraceKind,
        ) StackTrace {
            const stack_addresses = bucket.stackTracePtr(slot_count, slot_index, trace_kind);
            var len: usize = 0;
            while (len < stack_n and stack_addresses[len] != 0) {
                len += 1;
            }
            return .{
                .instruction_addresses = stack_addresses,
                .index = len,
            };
        }
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
fn bucketRequestedSizesStart(slot_count: usize) usize {
|
GeneralPurposeAllocator: Considerably improve worst case performance
Before this commit, GeneralPurposeAllocator could run into incredibly degraded performance in scenarios where the bucket count for a particular size class grew to be large. For example, if exactly `slot_count` allocations of a single size class were performed and then all of them were freed except one, then the bucket for those allocations would have to be kept around indefinitely. If that pattern of allocation were done over and over, then the bucket list for that size class could grow incredibly large.
This allocation pattern has been seen in the wild: https://github.com/Vexu/arocc/issues/508#issuecomment-1738275688
In that case, the length of the bucket list for the `128` size class would grow to tens of thousands of buckets and cause Debug runtime to balloon to ~8 minutes whereas with the c_allocator the Debug runtime would be ~3 seconds.
To address this, there are three different changes happening here:
1. std.Treap is used instead of a doubly linked list for the lists of buckets. This takes the time complexity of searchBucket [used in resize and free] from O(n) to O(log n), but increases the time complexity of insert from O(1) to O(log n) [before, all new buckets would get added to the head of the list]. Note: Any data structure with O(log n) or better search/insert/delete would also work for this use-case.
2. If the 'current' bucket for a size class is full, the list of buckets is never traversed and instead a new bucket is allocated. Previously, traversing the bucket list could only find a non-full bucket in specific circumstances, and only because of a separate optimization that is no longer needed (before, after any resize/free, the affected bucket would be moved to the head of the bucket list to allow searchBucket to perform better on average). Now, the current_bucket for each size class only changes when either (1) the current bucket is emptied/freed, or (2) a new bucket is allocated (due to the current bucket being full or null). Because each bucket's alloc_cursor only moves forward (i.e. slots within a bucket are never re-used), we can therefore always know that any bucket besides the current_bucket will be full, so traversing the list in the hopes of finding an existing non-full bucket is entirely pointless.
3. Size + alignment information for small allocations has been moved into the Bucket data instead of keeping it in a separate HashMap. This offers an improvement over the HashMap since whenever we need to get/modify the length/alignment of an allocation it's extremely likely we will already have calculated any bucket-related information necessary to get the data.
The first change is the most relevant and accounts for most of the benefit here. Also note that the overall functionality of GeneralPurposeAllocator is unchanged.
In the degraded `arocc` case, these changes bring Debug performance from ~8 minutes to ~20 seconds.
Benchmark 1: test-master.bat
Time (mean ± σ): 481.263 s ± 5.440 s [User: 479.159 s, System: 1.937 s]
Range (min … max): 477.416 s … 485.109 s 2 runs
Benchmark 2: test-optim-treap.bat
Time (mean ± σ): 19.639 s ± 0.037 s [User: 18.183 s, System: 1.452 s]
Range (min … max): 19.613 s … 19.665 s 2 runs
Summary
'test-optim-treap.bat' ran
24.51 ± 0.28 times faster than 'test-master.bat'
Note: Much of the time taken on Windows in this particular case is related to gathering stack traces. With `.stack_trace_frames = 0` the runtime goes down to 6.7 seconds, which is a little more than 2.5x slower compared to when the c_allocator is used.
These changes may or mat not introduce a slight performance regression in the average case:
Here's the standard library tests on Windows in Debug mode:
Benchmark 1 (10 runs): std-tests-master.exe
measurement mean ± σ min … max outliers delta
wall_time 16.0s ± 30.8ms 15.9s … 16.1s 1 (10%) 0%
peak_rss 42.8MB ± 8.24KB 42.8MB … 42.8MB 0 ( 0%) 0%
Benchmark 2 (10 runs): std-tests-optim-treap.exe
measurement mean ± σ min … max outliers delta
wall_time 16.2s ± 37.6ms 16.1s … 16.3s 0 ( 0%) 💩+ 1.3% ± 0.2%
peak_rss 42.8MB ± 5.18KB 42.8MB … 42.8MB 0 ( 0%) + 0.1% ± 0.0%
And on Linux:
Benchmark 1: ./test-master
Time (mean ± σ): 16.091 s ± 0.088 s [User: 15.856 s, System: 0.453 s]
Range (min … max): 15.870 s … 16.166 s 10 runs
Benchmark 2: ./test-optim-treap
Time (mean ± σ): 16.028 s ± 0.325 s [User: 15.755 s, System: 0.492 s]
Range (min … max): 15.735 s … 16.709 s 10 runs
Summary
'./test-optim-treap' ran
1.00 ± 0.02 times faster than './test-master'
2023-10-03 01:19:38 -07:00
|
|
|
if (!config.safety) @compileError("requested sizes are not stored unless safety is enabled");
|
2020-08-08 12:04:19 -07:00
|
|
|
return mem.alignForward(
|
2023-06-09 16:02:18 -07:00
|
|
|
usize,
|
2025-02-05 13:31:01 -08:00
|
|
|
@sizeOf(BucketHeader) + usedBitsSize(slot_count),
|
GeneralPurposeAllocator: Considerably improve worst case performance
Before this commit, GeneralPurposeAllocator could run into incredibly degraded performance in scenarios where the bucket count for a particular size class grew to be large. For example, if exactly `slot_count` allocations of a single size class were performed and then all of them were freed except one, then the bucket for those allocations would have to be kept around indefinitely. If that pattern of allocation were done over and over, then the bucket list for that size class could grow incredibly large.
This allocation pattern has been seen in the wild: https://github.com/Vexu/arocc/issues/508#issuecomment-1738275688
In that case, the length of the bucket list for the `128` size class would grow to tens of thousands of buckets and cause Debug runtime to balloon to ~8 minutes whereas with the c_allocator the Debug runtime would be ~3 seconds.
To address this, there are three different changes happening here:
1. std.Treap is used instead of a doubly linked list for the lists of buckets. This takes the time complexity of searchBucket [used in resize and free] from O(n) to O(log n), but increases the time complexity of insert from O(1) to O(log n) [before, all new buckets would get added to the head of the list]. Note: Any data structure with O(log n) or better search/insert/delete would also work for this use-case.
2. If the 'current' bucket for a size class is full, the list of buckets is never traversed and instead a new bucket is allocated. Previously, traversing the bucket list could only find a non-full bucket in specific circumstances, and only because of a separate optimization that is no longer needed (before, after any resize/free, the affected bucket would be moved to the head of the bucket list to allow searchBucket to perform better on average). Now, the current_bucket for each size class only changes when either (1) the current bucket is emptied/freed, or (2) a new bucket is allocated (due to the current bucket being full or null). Because each bucket's alloc_cursor only moves forward (i.e. slots within a bucket are never re-used), we can therefore always know that any bucket besides the current_bucket will be full, so traversing the list in the hopes of finding an existing non-full bucket is entirely pointless.
3. Size + alignment information for small allocations has been moved into the Bucket data instead of keeping it in a separate HashMap. This offers an improvement over the HashMap since whenever we need to get/modify the length/alignment of an allocation it's extremely likely we will already have calculated any bucket-related information necessary to get the data.
The first change is the most relevant and accounts for most of the benefit here. Also note that the overall functionality of GeneralPurposeAllocator is unchanged.
In the degraded `arocc` case, these changes bring Debug performance from ~8 minutes to ~20 seconds.
Benchmark 1: test-master.bat
Time (mean ± σ): 481.263 s ± 5.440 s [User: 479.159 s, System: 1.937 s]
Range (min … max): 477.416 s … 485.109 s 2 runs
Benchmark 2: test-optim-treap.bat
Time (mean ± σ): 19.639 s ± 0.037 s [User: 18.183 s, System: 1.452 s]
Range (min … max): 19.613 s … 19.665 s 2 runs
Summary
'test-optim-treap.bat' ran
24.51 ± 0.28 times faster than 'test-master.bat'
Note: Much of the time taken on Windows in this particular case is related to gathering stack traces. With `.stack_trace_frames = 0` the runtime goes down to 6.7 seconds, which is a little more than 2.5x slower compared to when the c_allocator is used.
These changes may or may not introduce a slight performance regression in the average case:
Here's the standard library tests on Windows in Debug mode:
Benchmark 1 (10 runs): std-tests-master.exe
measurement mean ± σ min … max outliers delta
wall_time 16.0s ± 30.8ms 15.9s … 16.1s 1 (10%) 0%
peak_rss 42.8MB ± 8.24KB 42.8MB … 42.8MB 0 ( 0%) 0%
Benchmark 2 (10 runs): std-tests-optim-treap.exe
measurement mean ± σ min … max outliers delta
wall_time 16.2s ± 37.6ms 16.1s … 16.3s 0 ( 0%) 💩+ 1.3% ± 0.2%
peak_rss 42.8MB ± 5.18KB 42.8MB … 42.8MB 0 ( 0%) + 0.1% ± 0.0%
And on Linux:
Benchmark 1: ./test-master
Time (mean ± σ): 16.091 s ± 0.088 s [User: 15.856 s, System: 0.453 s]
Range (min … max): 15.870 s … 16.166 s 10 runs
Benchmark 2: ./test-optim-treap
Time (mean ± σ): 16.028 s ± 0.325 s [User: 15.755 s, System: 0.492 s]
Range (min … max): 15.735 s … 16.709 s 10 runs
Summary
'./test-optim-treap' ran
1.00 ± 0.02 times faster than './test-master'
2023-10-03 01:19:38 -07:00
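For illustration, here is a minimal, self-contained sketch of the pathological pattern described above (the round and allocation counts are hypothetical stand-ins): each round frees all but one allocation, so the surviving slot pins an otherwise-empty bucket and the bucket list for that size class keeps growing.

```zig
const std = @import("std");

pub fn main() !void {
    var gpa: std.heap.GeneralPurposeAllocator(.{}) = .{};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const rounds = 1000; // hypothetical; the arocc case reached tens of thousands of buckets
    const allocs_per_round = 32; // stands in for the bucket's slot_count; matching it pins a fresh bucket every round

    // One surviving allocation per round keeps its bucket alive indefinitely.
    const survivors = try allocator.alloc([]u8, rounds);
    defer {
        for (survivors) |s| allocator.free(s);
        allocator.free(survivors);
    }

    for (survivors) |*survivor| {
        var batch: [allocs_per_round][]u8 = undefined;
        for (&batch) |*p| p.* = try allocator.alloc(u8, 128);
        // Free all but one; before this commit, each such free had to linearly
        // search the ever-growing bucket list for this size class.
        for (batch[1..]) |p| allocator.free(p);
        survivor.* = batch[0];
    }
}
```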
|
|
|
@alignOf(LargestSizeClassInt),
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
fn bucketAlignsStart(slot_count: usize) usize {
|
2023-10-03 01:19:38 -07:00
|
|
|
if (!config.safety) @compileError("requested sizes are not stored unless safety is enabled");
|
2025-02-04 23:05:06 -08:00
|
|
|
return bucketRequestedSizesStart(slot_count) + (@sizeOf(LargestSizeClassInt) * slot_count);
|
2023-10-03 01:19:38 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
fn bucketStackFramesStart(slot_count: usize) usize {
|
|
|
|
|
const unaligned_start = if (config.safety)
|
|
|
|
|
bucketAlignsStart(slot_count) + slot_count
|
|
|
|
|
else
|
2025-02-05 13:31:01 -08:00
|
|
|
@sizeOf(BucketHeader) + usedBitsSize(slot_count);
|
2025-02-04 23:05:06 -08:00
|
|
|
return mem.alignForward(usize, unaligned_start, @alignOf(usize));
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
fn bucketSize(slot_count: usize) usize {
|
|
|
|
|
return bucketStackFramesStart(slot_count) + one_trace_size * traces_per_slot * slot_count;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
/// This is executed only at compile-time to prepopulate a lookup table.
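/// It binary searches for the largest slot count whose slots, plus the trailing
/// `BucketHeader` metadata, still fit within a single `page_size` allocation.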
|
|
|
|
|
fn calculateSlotCount(size_class_index: usize) SlotIndex {
|
2025-02-05 01:04:44 -08:00
|
|
|
const size_class = @as(usize, 1) << @as(Log2USize, @intCast(size_class_index));
|
2025-02-05 14:25:29 -08:00
|
|
|
var lower: usize = 1 << minimum_slots_per_bucket_log2;
|
2025-02-04 23:05:06 -08:00
|
|
|
var upper: usize = (page_size - bucketSize(lower)) / size_class;
|
|
|
|
|
while (upper > lower) {
|
|
|
|
|
const proposed: usize = lower + (upper - lower) / 2;
|
|
|
|
|
if (proposed == lower) return lower;
|
|
|
|
|
const slots_end = proposed * size_class;
|
|
|
|
|
const header_begin = mem.alignForward(usize, slots_end, @alignOf(BucketHeader));
|
2025-02-05 14:25:29 -08:00
|
|
|
const end = header_begin + bucketSize(proposed);
|
2025-02-04 23:05:06 -08:00
|
|
|
if (end > page_size) {
|
|
|
|
|
upper = proposed - 1;
|
|
|
|
|
} else {
|
|
|
|
|
lower = proposed;
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-02-05 14:25:29 -08:00
|
|
|
const slots_end = lower * size_class;
|
|
|
|
|
const header_begin = mem.alignForward(usize, slots_end, @alignOf(BucketHeader));
|
|
|
|
|
const end = header_begin + bucketSize(lower);
|
|
|
|
|
assert(end <= page_size);
|
2025-02-04 23:05:06 -08:00
|
|
|
return lower;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
fn usedBitsCount(slot_count: usize) usize {
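// Ceiling division: the number of `usize` words needed to hold one used-bit per slot.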
|
2025-02-05 13:31:01 -08:00
|
|
|
return (slot_count + (@bitSizeOf(usize) - 1)) / @bitSizeOf(usize);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn usedBitsSize(slot_count: usize) usize {
|
|
|
|
|
return usedBitsCount(slot_count) * @sizeOf(usize);
|
2025-02-04 23:05:06 -08:00
|
|
|
}
|
|
|
|
|
|
2025-10-19 14:08:21 -07:00
|
|
|
fn detectLeaksInBucket(
|
|
|
|
|
bucket: *BucketHeader,
|
|
|
|
|
size_class_index: usize,
|
|
|
|
|
used_bits_count: usize,
|
|
|
|
|
tty_config: std.Io.tty.Config,
|
|
|
|
|
) usize {
|
2025-02-05 01:04:44 -08:00
|
|
|
const size_class = @as(usize, 1) << @as(Log2USize, @intCast(size_class_index));
|
2025-02-04 23:05:06 -08:00
|
|
|
const slot_count = slot_counts[size_class_index];
|
std.Build.Step.Run: many enhancements
This is a major refactor to `Step.Run` which adds new functionality,
primarily to the execution of Zig tests.
* All tests are run, even if a test crashes. This happens through the
same mechanism as timeouts, where the test process is repeatedly
respawned as needed.
* The build status output is more precise. For each unit test, it
differentiates pass, skip, fail, crash, and timeout. Memory leaks are
reported separately, as they do not indicate a test's "status", but
are rather an additional property (a test with leaks may still pass!).
* The number of memory leaks is tracked and reported, both per-test and
for a whole `Run` step.
* Reporting is made clearer when a step fails solely due to error
  logs (`std.log.err`) even though every unit test passed.
2025-08-26 15:34:53 +01:00
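As a rough illustration of the leak-versus-status distinction (a hypothetical test, not part of this change): the test below passes its assertion while `std.testing.allocator` records a leak, which is now reported alongside, rather than instead of, the test's status.

```zig
const std = @import("std");

test "passes its assertion but leaks memory" {
    // Intentionally never freed: std.testing.allocator tracks this allocation,
    // and the leak is reported separately from the pass/skip/fail/crash status.
    const leaked = try std.testing.allocator.alloc(u8, 16);
    _ = leaked;
    try std.testing.expect(true);
}
```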
|
|
|
var leaks: usize = 0;
|
2025-02-05 13:21:15 -08:00
|
|
|
for (0..used_bits_count) |used_bits_byte| {
|
2025-02-05 13:31:01 -08:00
|
|
|
const used_int = bucket.usedBits(used_bits_byte).*;
|
|
|
|
|
if (used_int != 0) {
|
|
|
|
|
for (0..@bitSizeOf(usize)) |bit_index_usize| {
|
|
|
|
|
const bit_index: Log2USize = @intCast(bit_index_usize);
|
|
|
|
|
const is_used = @as(u1, @truncate(used_int >> bit_index)) != 0;
|
2020-08-07 22:35:15 -07:00
|
|
|
if (is_used) {
|
2025-02-05 13:31:01 -08:00
|
|
|
const slot_index: SlotIndex = @intCast(used_bits_byte * @bitSizeOf(usize) + bit_index);
|
2025-02-04 23:05:06 -08:00
|
|
|
const stack_trace = bucketStackTrace(bucket, slot_count, slot_index, .alloc);
|
|
|
|
|
const page_addr = @intFromPtr(bucket) & ~(page_size - 1);
|
|
|
|
|
const addr = page_addr + slot_index * size_class;
|
2025-10-19 14:08:21 -07:00
|
|
|
log.err("memory address 0x{x} leaked: {f}", .{
|
|
|
|
|
addr,
|
|
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = stack_trace,
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
|
|
|
|
});
|
2025-08-26 15:34:53 +01:00
|
|
|
leaks += 1;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-08-07 23:26:58 -07:00
|
|
|
return leaks;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2025-08-26 15:34:53 +01:00
|
|
|
/// Emits log messages for leaks and then returns the number of detected leaks (0 if no leaks were detected).
|
|
|
|
|
pub fn detectLeaks(self: *Self) usize {
|
|
|
|
|
var leaks: usize = 0;
|
2023-10-03 01:19:38 -07:00
|
|
|
|
2025-10-19 14:08:21 -07:00
|
|
|
const tty_config = std.Io.tty.detectConfig(.stderr());
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
for (self.buckets, 0..) |init_optional_bucket, size_class_index| {
|
|
|
|
|
var optional_bucket = init_optional_bucket;
|
|
|
|
|
const slot_count = slot_counts[size_class_index];
|
|
|
|
|
const used_bits_count = usedBitsCount(slot_count);
|
|
|
|
|
while (optional_bucket) |bucket| {
|
2025-10-19 14:08:21 -07:00
|
|
|
leaks += detectLeaksInBucket(bucket, size_class_index, used_bits_count, tty_config);
|
2025-02-04 23:05:06 -08:00
|
|
|
optional_bucket = bucket.prev;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
|
2021-06-03 15:39:26 -05:00
|
|
|
var it = self.large_allocations.valueIterator();
|
2020-08-02 23:24:03 +02:00
|
|
|
while (it.next()) |large_alloc| {
|
2021-05-11 23:54:11 -04:00
|
|
|
if (config.retain_metadata and large_alloc.freed) continue;
|
2022-02-28 10:20:29 +02:00
|
|
|
const stack_trace = large_alloc.getStackTrace(.alloc);
|
2025-06-27 20:05:22 -07:00
|
|
|
log.err("memory address 0x{x} leaked: {f}", .{
|
2025-10-19 14:08:21 -07:00
|
|
|
@intFromPtr(large_alloc.bytes.ptr),
|
|
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = stack_trace,
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
2022-03-19 15:13:27 +02:00
|
|
|
});
|
2025-08-26 15:34:53 +01:00
|
|
|
leaks += 1;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
2020-08-10 19:34:27 -07:00
|
|
|
return leaks;
|
|
|
|
|
}
|
|
|
|
|
|
2021-05-11 23:54:11 -04:00
|
|
|
fn freeRetainedMetadata(self: *Self) void {
|
2025-02-04 23:05:06 -08:00
|
|
|
comptime assert(config.retain_metadata);
|
|
|
|
|
if (config.never_unmap) {
|
|
|
|
|
// free large allocations that were intentionally leaked by never_unmap
|
|
|
|
|
var it = self.large_allocations.iterator();
|
|
|
|
|
while (it.next()) |large| {
|
|
|
|
|
if (large.value_ptr.freed) {
|
|
|
|
|
self.backing_allocator.rawFree(large.value_ptr.bytes, large.value_ptr.alignment, @returnAddress());
|
2021-05-11 23:54:11 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-01 14:44:44 +00:00
|
|
|
pub fn flushRetainedMetadata(self: *Self) void {
|
2025-02-04 23:05:06 -08:00
|
|
|
comptime assert(config.retain_metadata);
|
2024-02-01 14:44:44 +00:00
|
|
|
self.freeRetainedMetadata();
|
|
|
|
|
// also remove entries from large_allocations
|
|
|
|
|
var it = self.large_allocations.iterator();
|
|
|
|
|
while (it.next()) |large| {
|
|
|
|
|
if (large.value_ptr.freed) {
|
|
|
|
|
_ = self.large_allocations.remove(@intFromPtr(large.value_ptr.bytes.ptr));
|
2021-05-11 23:54:11 -04:00
|
|
|
}
|
|
|
|
|
}
|
2024-02-01 14:44:44 +00:00
|
|
|
}
|
2021-05-11 23:54:11 -04:00
|
|
|
|
2025-02-05 19:18:22 -08:00
|
|
|
/// Returns `std.heap.Check.leak` if there were leaks; `std.heap.Check.ok` otherwise.
|
|
|
|
|
pub fn deinit(self: *Self) std.heap.Check {
|
2025-08-26 15:34:53 +01:00
|
|
|
const leaks: usize = if (config.safety) self.detectLeaks() else 0;
|
|
|
|
|
self.deinitWithoutLeakChecks();
|
|
|
|
|
return if (leaks == 0) .ok else .leak;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Like `deinit`, but does not check for memory leaks. This is useful if leaks have already
|
|
|
|
|
/// been detected manually with `detectLeaks` to avoid reporting them for a second time.
|
|
|
|
|
pub fn deinitWithoutLeakChecks(self: *Self) void {
|
2025-02-04 23:05:06 -08:00
|
|
|
if (config.retain_metadata) self.freeRetainedMetadata();
|
2020-08-07 22:35:15 -07:00
|
|
|
self.large_allocations.deinit(self.backing_allocator);
|
|
|
|
|
self.* = undefined;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-05 20:40:11 +01:00
|
|
|
fn collectStackTrace(first_trace_addr: usize, addr_buf: *[stack_n]usize) void {
|
|
|
|
|
const st = std.debug.captureCurrentStackTrace(.{ .first_address = first_trace_addr }, addr_buf);
|
|
|
|
|
@memset(addr_buf[@min(st.index, addr_buf.len)..], 0);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2021-05-11 23:54:11 -04:00
|
|
|
fn reportDoubleFree(ret_addr: usize, alloc_stack_trace: StackTrace, free_stack_trace: StackTrace) void {
|
2025-09-05 20:40:11 +01:00
|
|
|
var addr_buf: [stack_n]usize = undefined;
|
|
|
|
|
const second_free_stack_trace = std.debug.captureCurrentStackTrace(.{ .first_address = ret_addr }, &addr_buf);
|
2025-10-19 14:08:21 -07:00
|
|
|
const tty_config = std.Io.tty.detectConfig(.stderr());
|
2025-06-27 20:05:22 -07:00
|
|
|
log.err("Double free detected. Allocation: {f} First free: {f} Second free: {f}", .{
|
2025-10-19 14:08:21 -07:00
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = alloc_stack_trace,
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
|
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = free_stack_trace,
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
|
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = second_free_stack_trace,
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
2021-05-11 23:54:11 -04:00
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-07 22:35:15 -07:00
|
|
|
/// This function assumes the object is in the large object storage regardless
|
|
|
|
|
/// of the parameters.
|
|
|
|
|
fn resizeLarge(
|
|
|
|
|
self: *Self,
|
|
|
|
|
old_mem: []u8,
|
2025-02-03 19:55:09 -08:00
|
|
|
alignment: mem.Alignment,
|
2020-08-07 22:35:15 -07:00
|
|
|
new_size: usize,
|
2020-08-08 00:34:13 -07:00
|
|
|
ret_addr: usize,
|
2025-02-03 19:55:09 -08:00
|
|
|
may_move: bool,
|
|
|
|
|
) ?[*]u8 {
|
2025-02-05 16:30:46 -08:00
|
|
|
if (config.retain_metadata and may_move) {
|
|
|
|
|
// Before looking up the entry (since this could invalidate
|
|
|
|
|
// it), we must reserve space for the new entry in case the
|
|
|
|
|
// allocation is relocated.
|
|
|
|
|
self.large_allocations.ensureUnusedCapacity(self.backing_allocator, 1) catch return null;
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-15 13:14:16 +06:00
|
|
|
const entry = self.large_allocations.getEntry(@intFromPtr(old_mem.ptr)) orelse {
|
2020-08-07 22:35:15 -07:00
|
|
|
if (config.safety) {
|
|
|
|
|
@panic("Invalid free");
|
|
|
|
|
} else {
|
|
|
|
|
unreachable;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
2021-05-11 23:54:11 -04:00
|
|
|
if (config.retain_metadata and entry.value_ptr.freed) {
|
|
|
|
|
if (config.safety) {
|
2021-06-10 20:13:43 -07:00
|
|
|
reportDoubleFree(ret_addr, entry.value_ptr.getStackTrace(.alloc), entry.value_ptr.getStackTrace(.free));
|
2021-05-11 23:54:11 -04:00
|
|
|
@panic("Unrecoverable double free");
|
|
|
|
|
} else {
|
|
|
|
|
unreachable;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-03 15:39:26 -05:00
|
|
|
if (config.safety and old_mem.len != entry.value_ptr.bytes.len) {
|
2025-09-05 20:40:11 +01:00
|
|
|
var addr_buf: [stack_n]usize = undefined;
|
|
|
|
|
const free_stack_trace = std.debug.captureCurrentStackTrace(.{ .first_address = ret_addr }, &addr_buf);
|
2025-10-19 14:08:21 -07:00
|
|
|
const tty_config = std.Io.tty.detectConfig(.stderr());
|
2025-06-27 20:05:22 -07:00
|
|
|
log.err("Allocation size {d} bytes does not match free size {d}. Allocation: {f} Free: {f}", .{
|
2021-06-03 15:39:26 -05:00
|
|
|
entry.value_ptr.bytes.len,
|
2020-08-07 22:35:15 -07:00
|
|
|
old_mem.len,
|
2025-10-19 14:08:21 -07:00
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = entry.value_ptr.getStackTrace(.alloc),
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
|
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = free_stack_trace,
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
2020-08-07 22:35:15 -07:00
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-05 00:18:43 -08:00
|
|
|
// If this would move the allocation into a small size class,
|
|
|
|
|
// refuse the request, because it would require creating small
|
|
|
|
|
// allocation metadata.
|
|
|
|
|
const new_size_class_index: usize = @max(@bitSizeOf(usize) - @clz(new_size - 1), @intFromEnum(alignment));
|
|
|
|
|
if (new_size_class_index < self.buckets.len) return null;
|
|
|
|
|
|
2022-11-27 01:07:35 -07:00
|
|
|
// Do memory limit accounting with requested sizes rather than what
|
|
|
|
|
// backing_allocator returns because if we want to return
|
|
|
|
|
// error.OutOfMemory, we have to leave allocation untouched, and
|
|
|
|
|
// that is impossible to guarantee after calling
|
|
|
|
|
// backing_allocator.rawResize.
|
2021-10-27 19:51:05 -04:00
|
|
|
const prev_req_bytes = self.total_requested_bytes;
|
|
|
|
|
if (config.enable_memory_limit) {
|
|
|
|
|
const new_req_bytes = prev_req_bytes + new_size - entry.value_ptr.requested_size;
|
|
|
|
|
if (new_req_bytes > prev_req_bytes and new_req_bytes > self.requested_memory_limit) {
|
2025-02-03 19:55:09 -08:00
|
|
|
return null;
|
2021-10-27 19:51:05 -04:00
|
|
|
}
|
|
|
|
|
self.total_requested_bytes = new_req_bytes;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-05 16:30:46 -08:00
|
|
|
const opt_resized_ptr = if (may_move)
|
|
|
|
|
self.backing_allocator.rawRemap(old_mem, alignment, new_size, ret_addr)
|
|
|
|
|
else if (self.backing_allocator.rawResize(old_mem, alignment, new_size, ret_addr))
|
2025-02-03 19:55:09 -08:00
|
|
|
old_mem.ptr
|
|
|
|
|
else
|
|
|
|
|
null;
|
|
|
|
|
|
|
|
|
|
const resized_ptr = opt_resized_ptr orelse {
|
2021-12-01 03:56:44 -05:00
|
|
|
if (config.enable_memory_limit) {
|
|
|
|
|
self.total_requested_bytes = prev_req_bytes;
|
|
|
|
|
}
|
2025-02-03 19:55:09 -08:00
|
|
|
return null;
|
|
|
|
|
};
|
2021-10-27 19:51:05 -04:00
|
|
|
|
|
|
|
|
if (config.enable_memory_limit) {
|
|
|
|
|
entry.value_ptr.requested_size = new_size;
|
|
|
|
|
}
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2021-01-30 20:15:26 -07:00
|
|
|
if (config.verbose_log) {
|
2025-02-03 19:55:09 -08:00
|
|
|
log.info("large resize {d} bytes at {*} to {d} at {*}", .{
|
|
|
|
|
old_mem.len, old_mem.ptr, new_size, resized_ptr,
|
2021-01-30 20:15:26 -07:00
|
|
|
});
|
|
|
|
|
}
|
2025-02-03 19:55:09 -08:00
|
|
|
entry.value_ptr.bytes = resized_ptr[0..new_size];
|
2025-02-05 00:18:43 -08:00
|
|
|
if (config.resize_stack_traces)
|
|
|
|
|
entry.value_ptr.captureStackTrace(ret_addr, .alloc);
|
2025-02-03 21:38:08 -08:00
|
|
|
|
|
|
|
|
// Update the key of the hash map if the memory was relocated.
|
|
|
|
|
if (resized_ptr != old_mem.ptr) {
|
2025-02-05 16:30:46 -08:00
|
|
|
const large_alloc = entry.value_ptr.*;
|
|
|
|
|
if (config.retain_metadata) {
|
|
|
|
|
entry.value_ptr.freed = true;
|
|
|
|
|
entry.value_ptr.captureStackTrace(ret_addr, .free);
|
|
|
|
|
} else {
|
|
|
|
|
self.large_allocations.removeByPtr(entry.key_ptr);
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-03 21:38:08 -08:00
|
|
|
const gop = self.large_allocations.getOrPutAssumeCapacity(@intFromPtr(resized_ptr));
|
|
|
|
|
if (config.retain_metadata and !config.never_unmap) {
|
|
|
|
|
// Backing allocator may be reusing memory that we're retaining metadata for
|
|
|
|
|
assert(!gop.found_existing or gop.value_ptr.freed);
|
|
|
|
|
} else {
|
|
|
|
|
assert(!gop.found_existing); // This would mean the kernel double-mapped pages.
|
|
|
|
|
}
|
2025-02-05 16:30:46 -08:00
|
|
|
gop.value_ptr.* = large_alloc;
|
2025-02-03 21:38:08 -08:00
|
|
|
}
|
|
|
|
|
|
2025-02-03 19:55:09 -08:00
|
|
|
return resized_ptr;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2021-11-06 00:54:35 +00:00
|
|
|
/// This function assumes the object is in the large object storage regardless
|
|
|
|
|
/// of the parameters.
|
|
|
|
|
fn freeLarge(
|
|
|
|
|
self: *Self,
|
|
|
|
|
old_mem: []u8,
|
2025-02-03 19:55:09 -08:00
|
|
|
alignment: mem.Alignment,
|
2021-11-06 00:54:35 +00:00
|
|
|
ret_addr: usize,
|
|
|
|
|
) void {
|
2023-06-15 13:14:16 +06:00
|
|
|
const entry = self.large_allocations.getEntry(@intFromPtr(old_mem.ptr)) orelse {
|
2021-11-06 00:54:35 +00:00
|
|
|
if (config.safety) {
|
|
|
|
|
@panic("Invalid free");
|
|
|
|
|
} else {
|
|
|
|
|
unreachable;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if (config.retain_metadata and entry.value_ptr.freed) {
|
|
|
|
|
if (config.safety) {
|
|
|
|
|
reportDoubleFree(ret_addr, entry.value_ptr.getStackTrace(.alloc), entry.value_ptr.getStackTrace(.free));
|
|
|
|
|
return;
|
|
|
|
|
} else {
|
|
|
|
|
unreachable;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (config.safety and old_mem.len != entry.value_ptr.bytes.len) {
|
2025-09-05 20:40:11 +01:00
|
|
|
var addr_buf: [stack_n]usize = undefined;
|
|
|
|
|
const free_stack_trace = std.debug.captureCurrentStackTrace(.{ .first_address = ret_addr }, &addr_buf);
|
2025-10-19 14:08:21 -07:00
|
|
|
const tty_config = std.Io.tty.detectConfig(.stderr());
|
2025-06-27 20:05:22 -07:00
|
|
|
log.err("Allocation size {d} bytes does not match free size {d}. Allocation: {f} Free: {f}", .{
|
2021-11-06 00:54:35 +00:00
|
|
|
entry.value_ptr.bytes.len,
|
|
|
|
|
old_mem.len,
|
2025-10-19 14:08:21 -07:00
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = entry.value_ptr.getStackTrace(.alloc),
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
|
|
|
|
std.debug.FormatStackTrace{
|
|
|
|
|
.stack_trace = free_stack_trace,
|
|
|
|
|
.tty_config = tty_config,
|
|
|
|
|
},
|
2021-11-06 00:54:35 +00:00
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-01 03:56:44 -05:00
|
|
|
if (!config.never_unmap) {
|
2025-02-03 19:55:09 -08:00
|
|
|
self.backing_allocator.rawFree(old_mem, alignment, ret_addr);
|
2021-12-01 03:56:44 -05:00
|
|
|
}
|
|
|
|
|
|
2021-11-06 00:54:35 +00:00
|
|
|
if (config.enable_memory_limit) {
|
|
|
|
|
self.total_requested_bytes -= entry.value_ptr.requested_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (config.verbose_log) {
|
|
|
|
|
log.info("large free {d} bytes at {*}", .{ old_mem.len, old_mem.ptr });
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!config.retain_metadata) {
|
2023-06-15 13:14:16 +06:00
|
|
|
assert(self.large_allocations.remove(@intFromPtr(old_mem.ptr)));
|
2021-11-06 00:54:35 +00:00
|
|
|
} else {
|
|
|
|
|
entry.value_ptr.freed = true;
|
|
|
|
|
entry.value_ptr.captureStackTrace(ret_addr, .free);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ret_addr: usize) ?[*]u8 {
|
2025-02-03 19:55:09 -08:00
|
|
|
const self: *Self = @ptrCast(@alignCast(context));
|
2021-11-09 18:27:12 -07:00
|
|
|
self.mutex.lock();
|
|
|
|
|
defer self.mutex.unlock();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
if (config.enable_memory_limit) {
|
|
|
|
|
const new_req_bytes = self.total_requested_bytes + len;
|
|
|
|
|
if (new_req_bytes > self.requested_memory_limit) return null;
|
|
|
|
|
self.total_requested_bytes = new_req_bytes;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(len - 1), @intFromEnum(alignment));
|
|
|
|
|
if (size_class_index >= self.buckets.len) {
|
|
|
|
|
@branchHint(.unlikely);
|
|
|
|
|
self.large_allocations.ensureUnusedCapacity(self.backing_allocator, 1) catch return null;
|
|
|
|
|
const ptr = self.backing_allocator.rawAlloc(len, alignment, ret_addr) orelse return null;
|
|
|
|
|
const slice = ptr[0..len];
|
|
|
|
|
|
|
|
|
|
const gop = self.large_allocations.getOrPutAssumeCapacity(@intFromPtr(slice.ptr));
|
|
|
|
|
if (config.retain_metadata and !config.never_unmap) {
|
|
|
|
|
// Backing allocator may be reusing memory that we're retaining metadata for
|
|
|
|
|
assert(!gop.found_existing or gop.value_ptr.freed);
|
|
|
|
|
} else {
|
|
|
|
|
assert(!gop.found_existing); // This would mean the kernel double-mapped pages.
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
gop.value_ptr.bytes = slice;
|
|
|
|
|
if (config.enable_memory_limit)
|
|
|
|
|
gop.value_ptr.requested_size = len;
|
|
|
|
|
gop.value_ptr.captureStackTrace(ret_addr, .alloc);
|
2021-05-11 23:54:11 -04:00
|
|
|
if (config.retain_metadata) {
|
2025-02-04 23:05:06 -08:00
|
|
|
gop.value_ptr.freed = false;
|
|
|
|
|
if (config.never_unmap) {
|
|
|
|
|
gop.value_ptr.alignment = alignment;
|
2021-05-11 23:54:11 -04:00
|
|
|
}
|
|
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
|
|
|
|
|
if (config.verbose_log) {
|
|
|
|
|
log.info("large alloc {d} bytes at {*}", .{ slice.len, slice.ptr });
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
return slice.ptr;
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
2021-10-27 19:51:05 -04:00
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
const slot_count = slot_counts[size_class_index];
|
|
|
|
|
|
|
|
|
|
if (self.buckets[size_class_index]) |bucket| {
|
|
|
|
|
@branchHint(.likely);
|
|
|
|
|
const slot_index = bucket.allocated_count;
|
|
|
|
|
if (slot_index < slot_count) {
|
|
|
|
|
@branchHint(.likely);
|
|
|
|
|
bucket.allocated_count = slot_index + 1;
|
2025-02-05 13:31:01 -08:00
|
|
|
const used_bits_byte = bucket.usedBits(slot_index / @bitSizeOf(usize));
|
|
|
|
|
const used_bit_index: Log2USize = @intCast(slot_index % @bitSizeOf(usize));
|
|
|
|
|
used_bits_byte.* |= (@as(usize, 1) << used_bit_index);
|
2025-02-05 01:04:44 -08:00
|
|
|
const size_class = @as(usize, 1) << @as(Log2USize, @intCast(size_class_index));
|
2025-02-04 23:05:06 -08:00
|
|
|
if (config.stack_trace_frames > 0) {
|
|
|
|
|
bucket.captureStackTrace(ret_addr, slot_count, slot_index, .alloc);
|
2023-04-04 15:41:25 +05:30
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
if (config.safety) {
|
|
|
|
|
bucket.requestedSizes(slot_count)[slot_index] = @intCast(len);
|
|
|
|
|
bucket.log2PtrAligns(slot_count)[slot_index] = alignment;
|
2023-04-04 15:41:25 +05:30
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
const page_addr = @intFromPtr(bucket) & ~(page_size - 1);
|
|
|
|
|
const addr = page_addr + slot_index * size_class;
|
|
|
|
|
if (config.verbose_log) {
|
|
|
|
|
log.info("small alloc {d} bytes at 0x{x}", .{ len, addr });
|
|
|
|
|
}
|
|
|
|
|
return @ptrFromInt(addr);
|
2023-04-04 15:41:25 +05:30
|
|
|
}
|
|
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
|
|
|
|
|
const page = self.backing_allocator.rawAlloc(page_size, page_align, @returnAddress()) orelse
|
|
|
|
|
return null;
|
|
|
|
|
const bucket: *BucketHeader = .fromPage(@intFromPtr(page), slot_count);
|
|
|
|
|
bucket.* = .{
|
|
|
|
|
.allocated_count = 1,
|
|
|
|
|
.freed_count = 0,
|
|
|
|
|
.prev = self.buckets[size_class_index],
|
2025-04-02 08:15:04 -04:00
|
|
|
.next = null,
|
2025-02-04 23:05:06 -08:00
|
|
|
};
|
2025-04-02 08:15:04 -04:00
|
|
|
if (self.buckets[size_class_index]) |old_head| {
|
|
|
|
|
old_head.next = bucket;
|
|
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
self.buckets[size_class_index] = bucket;
|
|
|
|
|
|
|
|
|
|
if (!config.backing_allocator_zeroes) {
|
2025-02-05 13:31:01 -08:00
|
|
|
@memset(@as([*]usize, @as(*[1]usize, bucket.usedBits(0)))[0..usedBitsCount(slot_count)], 0);
|
2025-02-04 23:05:06 -08:00
|
|
|
if (config.safety) @memset(bucket.requestedSizes(slot_count), 0);
|
2021-10-27 19:51:05 -04:00
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
bucket.usedBits(0).* = 0b1;
|
|
|
|
|
|
|
|
|
|
if (config.stack_trace_frames > 0) {
|
|
|
|
|
bucket.captureStackTrace(ret_addr, slot_count, 0, .alloc);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
2021-12-01 03:56:44 -05:00
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
if (config.safety) {
|
|
|
|
|
bucket.requestedSizes(slot_count)[0] = @intCast(len);
|
|
|
|
|
bucket.log2PtrAligns(slot_count)[0] = alignment;
|
2021-12-01 03:56:44 -05:00
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
|
|
|
|
|
if (config.verbose_log) {
|
|
|
|
|
log.info("small alloc {d} bytes at 0x{x}", .{ len, @intFromPtr(page) });
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return page;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn resize(
|
|
|
|
|
context: *anyopaque,
|
|
|
|
|
memory: []u8,
|
|
|
|
|
alignment: mem.Alignment,
|
|
|
|
|
new_len: usize,
|
|
|
|
|
return_address: usize,
|
|
|
|
|
) bool {
|
2025-02-05 00:18:43 -08:00
|
|
|
const self: *Self = @ptrCast(@alignCast(context));
|
|
|
|
|
self.mutex.lock();
|
|
|
|
|
defer self.mutex.unlock();
|
|
|
|
|
|
|
|
|
|
const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(memory.len - 1), @intFromEnum(alignment));
|
|
|
|
|
if (size_class_index >= self.buckets.len) {
|
|
|
|
|
return self.resizeLarge(memory, alignment, new_len, return_address, false) != null;
|
|
|
|
|
} else {
|
|
|
|
|
return resizeSmall(self, memory, alignment, new_len, return_address, size_class_index);
|
|
|
|
|
}
|
2025-02-04 23:05:06 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn remap(
|
|
|
|
|
context: *anyopaque,
|
|
|
|
|
memory: []u8,
|
|
|
|
|
alignment: mem.Alignment,
|
|
|
|
|
new_len: usize,
|
|
|
|
|
return_address: usize,
|
|
|
|
|
) ?[*]u8 {
|
2025-02-05 00:18:43 -08:00
|
|
|
const self: *Self = @ptrCast(@alignCast(context));
|
|
|
|
|
self.mutex.lock();
|
|
|
|
|
defer self.mutex.unlock();
|
|
|
|
|
|
|
|
|
|
const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(memory.len - 1), @intFromEnum(alignment));
|
|
|
|
|
if (size_class_index >= self.buckets.len) {
|
|
|
|
|
return self.resizeLarge(memory, alignment, new_len, return_address, true);
|
|
|
|
|
} else {
|
|
|
|
|
return if (resizeSmall(self, memory, alignment, new_len, return_address, size_class_index)) memory.ptr else null;
|
|
|
|
|
}
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2021-11-06 00:54:35 +00:00
|
|
|
fn free(
|
2025-02-04 23:05:06 -08:00
|
|
|
context: *anyopaque,
|
|
|
|
|
old_memory: []u8,
|
2025-02-03 19:55:09 -08:00
|
|
|
alignment: mem.Alignment,
|
2025-02-04 23:05:06 -08:00
|
|
|
return_address: usize,
|
2021-11-06 00:54:35 +00:00
|
|
|
) void {
|
2025-02-04 23:05:06 -08:00
|
|
|
const self: *Self = @ptrCast(@alignCast(context));
|
2021-11-07 01:40:06 +00:00
|
|
|
self.mutex.lock();
|
|
|
|
|
defer self.mutex.unlock();
|
2021-11-06 00:54:35 +00:00
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(old_memory.len - 1), @intFromEnum(alignment));
|
|
|
|
|
if (size_class_index >= self.buckets.len) {
|
|
|
|
|
@branchHint(.unlikely);
|
|
|
|
|
self.freeLarge(old_memory, alignment, return_address);
|
2021-11-06 00:54:35 +00:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
const slot_count = slot_counts[size_class_index];
|
|
|
|
|
const freed_addr = @intFromPtr(old_memory.ptr);
|
|
|
|
|
const page_addr = freed_addr & ~(page_size - 1);
|
|
|
|
|
const bucket: *BucketHeader = .fromPage(page_addr, slot_count);
|
2025-02-04 23:12:55 -08:00
|
|
|
if (bucket.canary != config.canary) @panic("Invalid free");
|
2025-02-04 23:05:06 -08:00
|
|
|
const page_offset = freed_addr - page_addr;
|
2025-02-05 01:04:44 -08:00
|
|
|
const size_class = @as(usize, 1) << @as(Log2USize, @intCast(size_class_index));
|
2025-02-04 23:05:06 -08:00
|
|
|
const slot_index: SlotIndex = @intCast(page_offset / size_class);
|
2025-02-05 13:31:01 -08:00
|
|
|
const used_byte_index = slot_index / @bitSizeOf(usize);
|
|
|
|
|
const used_bit_index: Log2USize = @intCast(slot_index % @bitSizeOf(usize));
|
2021-11-06 00:54:35 +00:00
|
|
|
const used_byte = bucket.usedBits(used_byte_index);
|
2023-06-22 18:46:56 +01:00
|
|
|
const is_used = @as(u1, @truncate(used_byte.* >> used_bit_index)) != 0;
|
2021-11-06 00:54:35 +00:00
|
|
|
if (!is_used) {
|
|
|
|
|
if (config.safety) {
|
2025-02-04 23:05:06 -08:00
|
|
|
reportDoubleFree(
|
|
|
|
|
return_address,
|
|
|
|
|
bucketStackTrace(bucket, slot_count, slot_index, .alloc),
|
|
|
|
|
bucketStackTrace(bucket, slot_count, slot_index, .free),
|
|
|
|
|
);
|
|
|
|
|
// Recoverable since this is a free.
|
2021-11-06 00:54:35 +00:00
|
|
|
return;
|
|
|
|
|
} else {
|
|
|
|
|
unreachable;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Definitely an in-use small alloc now.
|
2023-04-04 15:41:25 +05:30
|
|
|
if (config.safety) {
|
2025-02-04 23:05:06 -08:00
|
|
|
const requested_size = bucket.requestedSizes(slot_count)[slot_index];
|
2023-10-03 01:19:38 -07:00
|
|
|
if (requested_size == 0) @panic("Invalid free");
|
2025-02-04 23:05:06 -08:00
|
|
|
const slot_alignment = bucket.log2PtrAligns(slot_count)[slot_index];
|
|
|
|
|
if (old_memory.len != requested_size or alignment != slot_alignment) {
|
2025-09-05 20:40:11 +01:00
|
|
|
var addr_buf: [stack_n]usize = undefined;
|
|
|
|
|
const free_stack_trace = std.debug.captureCurrentStackTrace(.{ .first_address = return_address }, &addr_buf);
|
2025-02-04 23:05:06 -08:00
|
|
|
if (old_memory.len != requested_size) {
|
2025-10-19 14:08:21 -07:00
|
|
|
const tty_config = std.Io.tty.detectConfig(.stderr());
|
2025-06-27 20:05:22 -07:00
|
|
|
log.err("Allocation size {d} bytes does not match free size {d}. Allocation: {f} Free: {f}", .{
|
                            requested_size,
                            old_memory.len,
                            std.debug.FormatStackTrace{
                                .stack_trace = bucketStackTrace(bucket, slot_count, slot_index, .alloc),
                                .tty_config = tty_config,
                            },
                            std.debug.FormatStackTrace{
                                .stack_trace = free_stack_trace,
                                .tty_config = tty_config,
                            },
                        });
                    }
                    if (alignment != slot_alignment) {
                        const tty_config = std.Io.tty.detectConfig(.stderr());
                        log.err("Allocation alignment {d} does not match free alignment {d}. Allocation: {f} Free: {f}", .{
                            slot_alignment.toByteUnits(),
                            alignment.toByteUnits(),
                            std.debug.FormatStackTrace{
                                .stack_trace = bucketStackTrace(bucket, slot_count, slot_index, .alloc),
                                .tty_config = tty_config,
                            },
                            std.debug.FormatStackTrace{
                                .stack_trace = free_stack_trace,
                                .tty_config = tty_config,
                            },
                        });
                    }
                }
            }

            if (config.enable_memory_limit) {
                self.total_requested_bytes -= old_memory.len;
            }

            if (config.stack_trace_frames > 0) {
                // Capture stack trace to be the "first free", in case a double free happens.
                bucket.captureStackTrace(return_address, slot_count, slot_index, .free);
            }

            used_byte.* &= ~(@as(usize, 1) << used_bit_index);
            if (config.safety) {
                bucket.requestedSizes(slot_count)[slot_index] = 0;
            }
            bucket.freed_count += 1;
            if (bucket.freed_count == bucket.allocated_count) {
                if (bucket.prev) |prev| {
                    prev.next = bucket.next;
                }

                if (bucket.next) |next| {
                    assert(self.buckets[size_class_index] != bucket);
                    next.prev = bucket.prev;
                } else {
                    assert(self.buckets[size_class_index] == bucket);
                    self.buckets[size_class_index] = bucket.prev;
                }

                if (!config.never_unmap) {
                    const page: [*]align(page_size) u8 = @ptrFromInt(page_addr);
                    self.backing_allocator.rawFree(page[0..page_size], page_align, @returnAddress());
                }
            }
            if (config.verbose_log) {
                log.info("small free {d} bytes at {*}", .{ old_memory.len, old_memory.ptr });
            }
        }

        fn resizeSmall(
            self: *Self,
            memory: []u8,
            alignment: mem.Alignment,
            new_len: usize,
            return_address: usize,
            size_class_index: usize,
        ) bool {
            const new_size_class_index: usize = @max(@bitSizeOf(usize) - @clz(new_len - 1), @intFromEnum(alignment));
            if (!config.safety) return new_size_class_index == size_class_index;
            const slot_count = slot_counts[size_class_index];
            const memory_addr = @intFromPtr(memory.ptr);
            const page_addr = memory_addr & ~(page_size - 1);
            const bucket: *BucketHeader = .fromPage(page_addr, slot_count);
            if (bucket.canary != config.canary) @panic("Invalid free");
            const page_offset = memory_addr - page_addr;
            const size_class = @as(usize, 1) << @as(Log2USize, @intCast(size_class_index));
            const slot_index: SlotIndex = @intCast(page_offset / size_class);
            const used_byte_index = slot_index / @bitSizeOf(usize);
            const used_bit_index: Log2USize = @intCast(slot_index % @bitSizeOf(usize));
            const used_byte = bucket.usedBits(used_byte_index);
            const is_used = @as(u1, @truncate(used_byte.* >> used_bit_index)) != 0;
            if (!is_used) {
                reportDoubleFree(
                    return_address,
                    bucketStackTrace(bucket, slot_count, slot_index, .alloc),
                    bucketStackTrace(bucket, slot_count, slot_index, .free),
                );
                // Recoverable since this is a free.
                return false;
            }

            // Definitely an in-use small alloc now.
            const requested_size = bucket.requestedSizes(slot_count)[slot_index];
            if (requested_size == 0) @panic("Invalid free");
            const slot_alignment = bucket.log2PtrAligns(slot_count)[slot_index];
            if (memory.len != requested_size or alignment != slot_alignment) {
                var addr_buf: [stack_n]usize = undefined;
                const free_stack_trace = std.debug.captureCurrentStackTrace(.{ .first_address = return_address }, &addr_buf);
                if (memory.len != requested_size) {
                    const tty_config = std.Io.tty.detectConfig(.stderr());
                    log.err("Allocation size {d} bytes does not match free size {d}. Allocation: {f} Free: {f}", .{
                        requested_size,
                        memory.len,
                        std.debug.FormatStackTrace{
                            .stack_trace = bucketStackTrace(bucket, slot_count, slot_index, .alloc),
                            .tty_config = tty_config,
                        },
                        std.debug.FormatStackTrace{
                            .stack_trace = free_stack_trace,
                            .tty_config = tty_config,
                        },
                    });
                }
                if (alignment != slot_alignment) {
                    const tty_config = std.Io.tty.detectConfig(.stderr());
                    log.err("Allocation alignment {d} does not match free alignment {d}. Allocation: {f} Free: {f}", .{
                        slot_alignment.toByteUnits(),
                        alignment.toByteUnits(),
                        std.debug.FormatStackTrace{
                            .stack_trace = bucketStackTrace(bucket, slot_count, slot_index, .alloc),
                            .tty_config = tty_config,
                        },
                        std.debug.FormatStackTrace{
                            .stack_trace = free_stack_trace,
                            .tty_config = tty_config,
                        },
                    });
                }
            }

            if (new_size_class_index != size_class_index) return false;

            const prev_req_bytes = self.total_requested_bytes;
            if (config.enable_memory_limit) {
                const new_req_bytes = prev_req_bytes - memory.len + new_len;
                if (new_req_bytes > prev_req_bytes and new_req_bytes > self.requested_memory_limit) {
                    return false;
                }
                self.total_requested_bytes = new_req_bytes;
            }

            if (memory.len > new_len) @memset(memory[new_len..], undefined);
            if (config.verbose_log)
                log.info("small resize {d} bytes at {*} to {d}", .{ memory.len, memory.ptr, new_len });

            if (config.safety)
                bucket.requestedSizes(slot_count)[slot_index] = @intCast(new_len);

            if (config.resize_stack_traces)
                bucket.captureStackTrace(return_address, slot_count, slot_index, .alloc);

            return true;
        }
    };
}

const TraceKind = enum {
    alloc,
    free,
};

const test_config: Config = .{};
test "small allocations - free in same order" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-07-31 21:54:07 -07:00
|
|
|
var list = std.array_list.Managed(*u64).init(std.testing.allocator);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer list.deinit();
|
|
|
|
|
|
|
|
|
|
var i: usize = 0;
|
|
|
|
|
while (i < 513) : (i += 1) {
|
|
|
|
|
const ptr = try allocator.create(u64);
|
|
|
|
|
try list.append(ptr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (list.items) |ptr| {
|
|
|
|
|
allocator.destroy(ptr);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "small allocations - free in reverse order" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-07-31 21:54:07 -07:00
|
|
|
var list = std.array_list.Managed(*u64).init(std.testing.allocator);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer list.deinit();
|
|
|
|
|
|
|
|
|
|
var i: usize = 0;
|
|
|
|
|
while (i < 513) : (i += 1) {
|
|
|
|
|
const ptr = try allocator.create(u64);
|
|
|
|
|
try list.append(ptr);
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-09 20:21:31 -08:00
|
|
|
while (list.pop()) |ptr| {
|
2020-08-07 22:35:15 -07:00
|
|
|
allocator.destroy(ptr);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "large allocations" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
const ptr1 = try allocator.alloc(u64, 42768);
|
|
|
|
|
const ptr2 = try allocator.alloc(u64, 52768);
|
|
|
|
|
allocator.free(ptr1);
|
|
|
|
|
const ptr3 = try allocator.alloc(u64, 62768);
|
|
|
|
|
allocator.free(ptr3);
|
|
|
|
|
allocator.free(ptr2);
|
|
|
|
|
}
|
|
|
|
|
|
2022-10-30 16:10:20 -07:00
|
|
|
test "very large allocation" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2022-10-30 16:10:20 -07:00
|
|
|
const allocator = gpa.allocator();
|
|
|
|
|
|
|
|
|
|
try std.testing.expectError(error.OutOfMemory, allocator.alloc(u8, math.maxInt(usize)));
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-07 22:35:15 -07:00
|
|
|
test "realloc" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-04-11 17:55:25 -07:00
|
|
|
var slice = try allocator.alignedAlloc(u8, .of(u32), 1);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer allocator.free(slice);
|
|
|
|
|
slice[0] = 0x12;
|
|
|
|
|
|
|
|
|
|
// This reallocation should keep its pointer address.
|
|
|
|
|
const old_slice = slice;
|
|
|
|
|
slice = try allocator.realloc(slice, 2);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(old_slice.ptr == slice.ptr);
|
|
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
2020-08-07 22:35:15 -07:00
|
|
|
slice[1] = 0x34;
|
|
|
|
|
|
|
|
|
|
// This requires upgrading to a larger size class
|
|
|
|
|
slice = try allocator.realloc(slice, 17);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[1] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "shrink" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa: DebugAllocator(test_config) = .{};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
var slice = try allocator.alloc(u8, 20);
|
|
|
|
|
defer allocator.free(slice);
|
|
|
|
|
|
2023-04-26 13:57:08 -07:00
|
|
|
@memset(slice, 0x11);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2022-11-27 01:07:35 -07:00
|
|
|
try std.testing.expect(allocator.resize(slice, 17));
|
|
|
|
|
slice = slice[0..17];
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
for (slice) |b| {
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(b == 0x11);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-05 00:18:43 -08:00
|
|
|
// Does not cross size class boundaries when shrinking.
|
|
|
|
|
try std.testing.expect(!allocator.resize(slice, 16));
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "large object - grow" {
|
2025-01-24 03:45:38 +01:00
|
|
|
if (builtin.target.cpu.arch.isWasm()) {
|
2025-01-29 15:24:24 -08:00
|
|
|
// Not expected to pass on targets that do not have memory mapping.
|
|
|
|
|
return error.SkipZigTest;
|
|
|
|
|
}
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa: DebugAllocator(test_config) = .{};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
var slice1 = try allocator.alloc(u8, default_page_size * 2 - 20);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer allocator.free(slice1);
|
|
|
|
|
|
2020-08-08 13:05:04 -07:00
|
|
|
const old = slice1;
|
2025-02-05 19:28:48 -08:00
|
|
|
slice1 = try allocator.realloc(slice1, default_page_size * 2 - 10);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice1.ptr == old.ptr);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
slice1 = try allocator.realloc(slice1, default_page_size * 2);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice1.ptr == old.ptr);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
slice1 = try allocator.realloc(slice1, default_page_size * 2 + 1);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "realloc small object to large object" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
var slice = try allocator.alloc(u8, 70);
|
|
|
|
|
defer allocator.free(slice);
|
|
|
|
|
slice[0] = 0x12;
|
|
|
|
|
slice[60] = 0x34;
|
|
|
|
|
|
|
|
|
|
// This requires upgrading to a large object
|
2025-02-05 19:28:48 -08:00
|
|
|
const large_object_size = default_page_size * 2 + 50;
|
2020-08-07 22:35:15 -07:00
|
|
|
slice = try allocator.realloc(slice, large_object_size);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[60] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "shrink large object to large object" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa: DebugAllocator(test_config) = .{};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
var slice = try allocator.alloc(u8, default_page_size * 2 + 50);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer allocator.free(slice);
|
|
|
|
|
slice[0] = 0x12;
|
|
|
|
|
slice[60] = 0x34;
|
|
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
if (!allocator.resize(slice, default_page_size * 2 + 1)) return;
|
|
|
|
|
slice = slice.ptr[0 .. default_page_size * 2 + 1];
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[60] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
try std.testing.expect(allocator.resize(slice, default_page_size * 2 + 1));
|
|
|
|
|
slice = slice[0 .. default_page_size * 2 + 1];
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[60] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
slice = try allocator.realloc(slice, default_page_size * 2);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[60] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "shrink large object to large object with larger alignment" {
|
2025-02-03 09:12:20 +00:00
|
|
|
if (!builtin.link_libc and builtin.os.tag == .wasi) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/22731
|
|
|
|
|
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
var debug_buffer: [1000]u8 = undefined;
|
2022-03-16 13:31:16 -07:00
|
|
|
var fba = std.heap.FixedBufferAllocator.init(&debug_buffer);
|
|
|
|
|
const debug_allocator = fba.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
const alloc_size = default_page_size * 2 + 50;
|
2025-04-11 17:55:25 -07:00
|
|
|
var slice = try allocator.alignedAlloc(u8, .@"16", alloc_size);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer allocator.free(slice);
|
|
|
|
|
|
2025-02-05 20:23:21 -08:00
|
|
|
const big_alignment: usize = default_page_size * 2;
|
2020-08-08 13:46:18 -07:00
|
|
|
// This loop allocates until we find a page that is not aligned to the big
|
|
|
|
|
// alignment. Then we shrink the allocation after the loop, but increase the
|
|
|
|
|
// alignment to the higher one, that we know will force it to realloc.
|
2025-07-31 21:54:07 -07:00
|
|
|
var stuff_to_free = std.array_list.Managed([]align(16) u8).init(debug_allocator);
|
2023-06-15 13:14:16 +06:00
|
|
|
while (mem.isAligned(@intFromPtr(slice.ptr), big_alignment)) {
|
2020-08-07 22:35:15 -07:00
|
|
|
try stuff_to_free.append(slice);
|
2025-04-11 17:55:25 -07:00
|
|
|
slice = try allocator.alignedAlloc(u8, .@"16", alloc_size);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
2025-02-09 20:21:31 -08:00
|
|
|
while (stuff_to_free.pop()) |item| {
|
2020-08-07 22:35:15 -07:00
|
|
|
allocator.free(item);
|
|
|
|
|
}
|
|
|
|
|
slice[0] = 0x12;
|
|
|
|
|
slice[60] = 0x34;
|
|
|
|
|
|
2022-11-27 01:07:35 -07:00
|
|
|
slice = try allocator.reallocAdvanced(slice, big_alignment, alloc_size / 2);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[60] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "realloc large object to small object" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
var slice = try allocator.alloc(u8, default_page_size * 2 + 50);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer allocator.free(slice);
|
|
|
|
|
slice[0] = 0x12;
|
|
|
|
|
slice[16] = 0x34;
|
|
|
|
|
|
|
|
|
|
slice = try allocator.realloc(slice, 19);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[16] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2023-04-30 18:02:08 +01:00
|
|
|
test "overridable mutexes" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(.{ .MutexType = std.Thread.Mutex }){
|
2020-12-03 12:49:35 -08:00
|
|
|
.backing_allocator = std.testing.allocator,
|
2021-01-14 20:41:37 -07:00
|
|
|
.mutex = std.Thread.Mutex{},
|
2020-12-03 12:49:35 -08:00
|
|
|
};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-12-03 12:49:35 -08:00
|
|
|
|
|
|
|
|
const ptr = try allocator.create(i32);
|
|
|
|
|
defer allocator.destroy(ptr);
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-07 22:35:15 -07:00
|
|
|
test "non-page-allocator backing allocator" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa: DebugAllocator(.{
|
|
|
|
|
.backing_allocator_zeroes = false,
|
|
|
|
|
}) = .{
|
|
|
|
|
.backing_allocator = std.testing.allocator,
|
|
|
|
|
};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
const ptr = try allocator.create(i32);
|
|
|
|
|
defer allocator.destroy(ptr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "realloc large object to larger alignment" {
|
2025-02-03 09:12:20 +00:00
|
|
|
if (!builtin.link_libc and builtin.os.tag == .wasi) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/22731
|
|
|
|
|
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
var debug_buffer: [1000]u8 = undefined;
|
2022-03-16 13:31:16 -07:00
|
|
|
var fba = std.heap.FixedBufferAllocator.init(&debug_buffer);
|
|
|
|
|
const debug_allocator = fba.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-04-11 17:55:25 -07:00
|
|
|
var slice = try allocator.alignedAlloc(u8, .@"16", default_page_size * 2 + 50);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer allocator.free(slice);
|
|
|
|
|
|
2025-02-05 20:23:21 -08:00
|
|
|
const big_alignment: usize = default_page_size * 2;
|
2020-08-08 13:46:18 -07:00
|
|
|
// This loop allocates until we find a page that is not aligned to the big alignment.
|
2025-07-31 21:54:07 -07:00
|
|
|
var stuff_to_free = std.array_list.Managed([]align(16) u8).init(debug_allocator);
|
2023-06-15 13:14:16 +06:00
|
|
|
while (mem.isAligned(@intFromPtr(slice.ptr), big_alignment)) {
|
2020-08-07 22:35:15 -07:00
|
|
|
try stuff_to_free.append(slice);
|
2025-04-11 17:55:25 -07:00
|
|
|
slice = try allocator.alignedAlloc(u8, .@"16", default_page_size * 2 + 50);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
2025-02-09 20:21:31 -08:00
|
|
|
while (stuff_to_free.pop()) |item| {
|
2020-08-07 22:35:15 -07:00
|
|
|
allocator.free(item);
|
|
|
|
|
}
|
|
|
|
|
slice[0] = 0x12;
|
|
|
|
|
slice[16] = 0x34;
|
|
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
slice = try allocator.reallocAdvanced(slice, 32, default_page_size * 2 + 100);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[16] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
slice = try allocator.reallocAdvanced(slice, 32, default_page_size * 2 + 25);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[16] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
slice = try allocator.reallocAdvanced(slice, big_alignment, default_page_size * 2 + 100);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[16] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-05 00:18:43 -08:00
|
|
|
test "large object rejects shrinking to small" {
|
2025-01-24 03:45:38 +01:00
|
|
|
if (builtin.target.cpu.arch.isWasm()) {
|
2025-01-29 15:24:24 -08:00
|
|
|
// Not expected to pass on targets that do not have memory mapping.
|
|
|
|
|
return error.SkipZigTest;
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-28 20:25:05 -05:00
|
|
|
var failing_allocator = std.testing.FailingAllocator.init(std.heap.page_allocator, .{ .fail_index = 3 });
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa: DebugAllocator(.{}) = .{
|
|
|
|
|
.backing_allocator = failing_allocator.allocator(),
|
|
|
|
|
};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-05 19:28:48 -08:00
|
|
|
var slice = try allocator.alloc(u8, default_page_size * 2 + 50);
|
2020-08-07 22:35:15 -07:00
|
|
|
defer allocator.free(slice);
|
|
|
|
|
slice[0] = 0x12;
|
|
|
|
|
slice[3] = 0x34;
|
|
|
|
|
|
2025-02-05 00:18:43 -08:00
|
|
|
try std.testing.expect(!allocator.resize(slice, 4));
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(slice[0] == 0x12);
|
|
|
|
|
try std.testing.expect(slice[3] == 0x34);
|
2020-08-07 22:35:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "objects of size 1024 and 2048" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(test_config){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
const slice = try allocator.alloc(u8, 1025);
|
|
|
|
|
const slice2 = try allocator.alloc(u8, 3000);
|
|
|
|
|
|
|
|
|
|
allocator.free(slice);
|
|
|
|
|
allocator.free(slice2);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
test "setting a memory cap" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = DebugAllocator(.{ .enable_memory_limit = true }){};
|
2023-04-22 13:09:44 +02:00
|
|
|
defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
|
2021-10-29 02:08:41 +01:00
|
|
|
const allocator = gpa.allocator();
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2025-02-04 23:05:06 -08:00
|
|
|
gpa.requested_memory_limit = 1010;
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
const small = try allocator.create(i32);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(gpa.total_requested_bytes == 4);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
const big = try allocator.alloc(u8, 1000);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(gpa.total_requested_bytes == 1004);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expectError(error.OutOfMemory, allocator.create(u64));
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
allocator.destroy(small);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(gpa.total_requested_bytes == 1000);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
allocator.free(big);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(gpa.total_requested_bytes == 0);
|
2020-08-07 22:35:15 -07:00
|
|
|
|
|
|
|
|
const exact = try allocator.alloc(u8, 1010);
|
2021-05-04 20:47:26 +03:00
|
|
|
try std.testing.expect(gpa.total_requested_bytes == 1010);
|
2020-08-07 22:35:15 -07:00
|
|
|
allocator.free(exact);
|
|
|
|
|
}
|
2021-05-11 23:54:11 -04:00
|
|
|
|
2025-02-05 00:18:43 -08:00
|
|
|
test "large allocations count requested size not backing size" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa: DebugAllocator(.{ .enable_memory_limit = true }) = .{};
|
2021-10-31 21:45:27 +00:00
|
|
|
const allocator = gpa.allocator();
|
2021-11-03 12:49:31 +00:00
|
|
|
|
2025-04-11 17:55:25 -07:00
|
|
|
var buf = try allocator.alignedAlloc(u8, .@"1", default_page_size + 1);
|
2025-02-05 19:28:48 -08:00
|
|
|
try std.testing.expectEqual(default_page_size + 1, gpa.total_requested_bytes);
|
2022-11-27 01:07:35 -07:00
|
|
|
buf = try allocator.realloc(buf, 1);
|
2025-02-05 00:18:43 -08:00
|
|
|
try std.testing.expectEqual(1, gpa.total_requested_bytes);
|
2022-11-27 01:07:35 -07:00
|
|
|
buf = try allocator.realloc(buf, 2);
|
2025-02-05 00:18:43 -08:00
|
|
|
try std.testing.expectEqual(2, gpa.total_requested_bytes);
|
2021-10-27 19:51:05 -04:00
|
|
|
}
|
2024-05-20 18:29:02 +02:00
|
|
|
|
|
|
|
|
test "retain metadata and never unmap" {
|
2025-02-05 19:18:22 -08:00
|
|
|
var gpa = std.heap.DebugAllocator(.{
|
2024-05-20 18:29:02 +02:00
|
|
|
.safety = true,
|
|
|
|
|
.never_unmap = true,
|
|
|
|
|
.retain_metadata = true,
|
|
|
|
|
}){};
|
|
|
|
|
defer std.debug.assert(gpa.deinit() == .ok);
|
|
|
|
|
const allocator = gpa.allocator();
|
|
|
|
|
|
|
|
|
|
const alloc = try allocator.alloc(u8, 8);
|
|
|
|
|
allocator.free(alloc);
|
|
|
|
|
|
|
|
|
|
const alloc2 = try allocator.alloc(u8, 8);
|
|
|
|
|
allocator.free(alloc2);
|
|
|
|
|
}
|