@Delta-in-hub
Last active April 5, 2022 06:25
alloc.cc
// #define NDEBUG
#include <bits/stdc++.h>
#define DEBUGPRINTF
#undef DEBUGPRINTF
using namespace std;
int cpuid = 0;
mutex _map_mutex;
#define ROUNDUP(a, sz) ((((uintptr_t)a) + (sz)-1) & ~((sz)-1))
// #define ROUNDDOWN(a, sz) ((((uintptr_t)a)) & ~((sz)-1))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define ThreadNums 3
int cpu_count()
{
return ThreadNums + 1;
}
unordered_map<thread::id, int> _map;
int cpu_current()
{
lock_guard<mutex> lk(_map_mutex);
return _map[this_thread::get_id()];
}
typedef struct mutex_t
{
mutex t;
    int flag; // 1 for available, 0 for locked
} mutex_t;
static inline void lock_init(mutex_t *lock)
{
lock->flag = 1;
assert(lock->flag == 1);
}
static inline void lock(mutex_t *lock)
{
lock->t.lock();
lock->flag = 0;
}
static inline void unlock(mutex_t *lock)
{
lock->flag = 1;
lock->t.unlock();
}
static inline bool try_lock(mutex_t *lock)
{
if (lock->t.try_lock())
{
lock->flag = 0;
return true;
}
return false;
}
#define LOG2(X) ((unsigned)(8 * sizeof(unsigned long long) - __builtin_clzll((X)) - 1))
// return the smallest power of 2 that is >= x
static inline size_t next_pow2(size_t x)
{
if (sizeof(size_t) == 8)
return x == 1 ? 1 : (size_t)1 << (size_t)(64 - __builtin_clzl(x - 1));
else
return x == 1 ? 1 : (size_t)1 << (size_t)(32 - __builtin_clz(x - 1));
}
// return the largest power of 2 that is <= s
static inline size_t pre_pow2(size_t s)
{
assert(s >= 1ul);
size_t x = next_pow2(s);
if (s == x)
return s;
else
return x >> 1;
}
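// A small self-check sketch for the power-of-2 helpers above. It is not part
// of the allocator and is not called anywhere; it only documents the expected
// rounding behavior with concrete values.
static inline void pow2_selfcheck()
{
    assert(next_pow2(1) == 1);
    assert(next_pow2(5) == 8);       // 5 rounds up to the next power of 2
    assert(next_pow2(4096) == 4096); // powers of 2 map to themselves
    assert(pre_pow2(5) == 4);        // 5 rounds down to the previous power of 2
    assert(pre_pow2(4096) == 4096);
}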
#define B (1ul)
#define KB (1024 * B)
#define MB (1024 * KB)
#define GB (1024 * MB)
#define TB (1024 * GB)
#define LCHILD(x) (x * 2 + 1)
#define RCHILD(x) (x * 2 + 2)
#define PARENT(x) ((x - 1) / 2)
#define BUDDY(x) (((x - 1) ^ 1) + 1)
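/*
 * The buddy block tree is stored implicitly by index, heap-style: the root
 * block is index 0 and the two halves of block x live at 2x+1 and 2x+2.
 * A few compile-time sanity checks of the index arithmetic above:
 */
static_assert(LCHILD(5) == 11, "left half of block 5");
static_assert(RCHILD(5) == 12, "right half of block 5");
static_assert(PARENT(11) == 5 && PARENT(12) == 5, "both halves merge back into block 5");
static_assert(BUDDY(11) == 12 && BUDDY(12) == 11, "the two halves are each other's buddy");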
typedef struct list_t
{
struct list_t *prev, *next;
} list_t;
#define MIN_BUDDY_ALLOC_SIZE (4 * KB)
// Please forgive my poor English
struct buddy_allocator
{
/*
 * MIN_BUDDY_ALLOC_SIZE is (4 * KB)
 * MAX_BUDDY_ALLOC_SIZE is calculated by the following formula:
 * pre_pow2(heap-size): the largest power of 2 that is <= heap-size,
 * and it must be larger than MIN_BUDDY_ALLOC_SIZE
 */
size_t MAX_BUDDY_ALLOC_SIZE;
uintptr_t buddy_start; // buddy allocator start address
size_t buddy_size; // buddy allocator size(in bytes)
uintptr_t reserved_start; // reserved memory for management start address
size_t reserved_size; // reserved memory for management size(in bytes)
uintptr_t unused_start; // unused memory start address
size_t unused_size; // unused memory size(in bytes)
uint8_t bucket_num; // the number of bucket(free list)
list_t *buckets; // buckets[BUCKET_COUNT] , each bucket is a free list for a series of specific size block.
    uint8_t *bitmap; // 0 for available, 1 for used
size_t bitmap_size;
mutex_t _mutex;
    /*
     * bitmap and buckets must be modified together; lock both of them at the same time
     */
};
bool get_bitmap(uint8_t *base, size_t index)
{
size_t i = index / 8;
size_t j = index % 8;
return (base[i] >> j) & 1;
}
void set_bitmap(uint8_t *base, size_t index)
{
size_t i = index / 8;
size_t j = index % 8;
base[i] |= (uint8_t)(1 << j);
}
void unset_bitmap(uint8_t *base, size_t index)
{
size_t i = index / 8;
size_t j = index % 8;
base[i] &= ~(uint8_t)(1 << j);
}
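// A small usage sketch of the bitmap helpers (not called anywhere): bits are
// packed 8 per byte, least-significant bit first within each byte.
static inline void bitmap_selfcheck()
{
    uint8_t bits[2] = {0, 0};
    set_bitmap(bits, 9); // byte 1, bit 1
    assert(get_bitmap(bits, 9) == true);
    assert(bits[1] == 0x02);
    unset_bitmap(bits, 9);
    assert(get_bitmap(bits, 9) == false);
}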
/*
* Initialize a list to empty. Because these are circular lists, an "empty"
* list is an entry where both links point to itself. This makes insertion
* and removal simpler because they don't need any branches.
*/
static inline void list_init(list_t *list)
{
list->prev = list;
list->next = list;
}
/*
* Append the provided entry to the end of the list. This assumes the entry
* isn't in a list already because it overwrites the linked list pointers.
*/
static inline void list_push(list_t *list, list_t *entry)
{
list_t *prev = list->prev;
entry->prev = list->prev;
entry->next = list;
prev->next = entry;
list->prev = entry;
}
/*
* Remove the provided entry from whichever list it's currently in. This
* assumes that the entry is in a list. You don't need to provide the list
* because the lists are circular, so the list's pointers will automatically
* be updated if the first or last entries are removed.
*/
static inline void list_remove(list_t *entry)
{
list_t *prev = entry->prev;
list_t *next = entry->next;
prev->next = next;
next->prev = prev;
}
static inline bool list_empty(list_t *list)
{
if (!list || (list->next == list && list->prev == list))
return true;
return false;
}
/*
* Remove and return the first entry in the list or NULL if the list is empty.
*/
static inline list_t *list_pop(list_t *list)
{
if (list_empty(list))
return NULL;
list_t *back = list->prev;
list_remove(back);
return back;
}
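// A small usage sketch of the list primitives (not called anywhere). Note that
// list_push appends at the tail and list_pop also removes from the tail, so a
// free-list bucket behaves as a LIFO stack of blocks.
static inline void list_selfcheck()
{
    list_t head, a, b;
    list_init(&head);
    assert(list_empty(&head));
    list_push(&head, &a);
    list_push(&head, &b);
    assert(list_pop(&head) == &b); // most recently pushed comes back first
    assert(list_pop(&head) == &a);
    assert(list_pop(&head) == NULL);
    assert(list_empty(&head));
}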
/**
* @brief convert a block pointer into the index of that block in bitmap
*
* @param allocator which allocator the block belongs to
* @param _ptr the block pointer
* @param bucket which bucket the block belongs to , a bucket stands for a specific size
* @return size_t the index of the block in bitmap
*/
size_t ptr_to_index(struct buddy_allocator *allocator, void *_ptr, size_t bucket)
{
uintptr_t ptr = (uintptr_t)_ptr;
size_t start_index = (1 << bucket) - 1;
size_t piece_size = (allocator->MAX_BUDDY_ALLOC_SIZE) / (1 << bucket);
size_t cnt = (ptr - allocator->buddy_start) / piece_size;
assert(cnt < (1 << bucket));
return start_index + cnt;
}
/** @brief convert a block index into the block pointer
 *
 * @param allocator which allocator the block belongs to
 * @param index the index of the block in bitmap (the bucket is derived from the index)
 * @return void* the block pointer
 */
void *index_to_ptr(struct buddy_allocator *allocator, size_t index)
{
size_t bucket = LOG2(index + 1);
size_t start_index = (1 << bucket) - 1;
size_t piece_size = (allocator->MAX_BUDDY_ALLOC_SIZE) / (1 << bucket);
size_t cnt = index - start_index;
return (void *)(allocator->buddy_start + (cnt * piece_size));
}
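// A worked example of the index mapping above (not called anywhere). Bucket b
// splits MAX_BUDDY_ALLOC_SIZE into 2^b pieces whose bitmap indices start at
// 2^b - 1. The buddy_start value below is an arbitrary address for the sketch.
static inline void index_mapping_selfcheck()
{
    struct buddy_allocator a;
    a.MAX_BUDDY_ALLOC_SIZE = 64 * KB;
    a.buddy_start = 0x100000;
    // bucket 2 has 4 pieces of 16KB each, at bitmap indices 3..6;
    // the piece at offset 16KB is the 2nd piece, so its index is 3 + 1 = 4
    assert(ptr_to_index(&a, (void *)(a.buddy_start + 16 * KB), 2) == 4);
    assert(index_to_ptr(&a, 4) == (void *)(a.buddy_start + 16 * KB));
}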
/**
 * @brief initialize a buddy allocator: calculate the fields of the buddy allocator
* @param allocator which allocator to be initialized
* @param start the start address of heap
* @param end the end address of heap
* @return bool true if success , false if failed
*/
bool buddy_init_variable(struct buddy_allocator *allocator, uintptr_t start, uintptr_t end)
{
size_t total_size = end - start; // total size of heap
    // calculate the max possible size of the buddy allocator.
    // we have not yet reserved memory for management,
    // so there is no place to store the bitmap and buckets yet
allocator->MAX_BUDDY_ALLOC_SIZE = pre_pow2(total_size);
// calculate the reserved size of buddy allocator
// for bitmap and buckets
{
allocator->bucket_num = LOG2(allocator->MAX_BUDDY_ALLOC_SIZE) - LOG2(MIN_BUDDY_ALLOC_SIZE) + 1;
allocator->bitmap_size = MAX(1, sizeof(uint8_t) * ((1 << (allocator->bucket_num)) / 8));
allocator->reserved_size = (sizeof(list_t) * (allocator->bucket_num) + sizeof(uint8_t) * (allocator->bitmap_size));
allocator->reserved_start = end - allocator->reserved_size;
}
    // now we have reserved memory for management;
    // re-calculate the max size of the buddy allocator
allocator->MAX_BUDDY_ALLOC_SIZE = pre_pow2(allocator->reserved_start - start);
if (allocator->MAX_BUDDY_ALLOC_SIZE < MIN_BUDDY_ALLOC_SIZE)
{
memset(allocator, 0, sizeof(struct buddy_allocator));
return false;
}
    /*
     * re-calculate the fields of the buddy allocator.
     * now MAX_BUDDY_ALLOC_SIZE is less than or equal to its previous value,
     * so reserved_size is enough to store the bitmap and buckets for the
     * current MAX_BUDDY_ALLOC_SIZE
     */
allocator->bucket_num = LOG2(allocator->MAX_BUDDY_ALLOC_SIZE) - LOG2(MIN_BUDDY_ALLOC_SIZE) + 1;
allocator->bitmap_size = MAX(1, sizeof(uint8_t) * ((1 << (allocator->bucket_num)) / 8));
// the size of memory managed by buddy allocator
allocator->buddy_size = allocator->MAX_BUDDY_ALLOC_SIZE;
allocator->buddy_start = start;
    /*
     * mark the leftover memory as unused.
     * up to about half of the heap can end up here, because the buddy
     * region must be a power of 2 and we also reserve memory for the
     * bitmap and buckets
     */
allocator->unused_start = start + allocator->MAX_BUDDY_ALLOC_SIZE;
allocator->unused_size = allocator->reserved_start - (allocator->buddy_start + allocator->buddy_size);
#ifdef DEBUGPRINTF
printf("\n\nTotal heap size: %zuMB\n", total_size / MB);
printf("Total Heap address:\t%p-%p\t%zuKB\n", start, end, (end - start) / KB);
printf("-------------------------------------------------------\n");
printf("Buddy allocator:\t%p-%p\t%zuKB\n", allocator->buddy_start, allocator->buddy_start + allocator->buddy_size, allocator->buddy_size / KB);
printf("Unused Heap address:\t%p-%p\t%zuKB\n", allocator->unused_start, allocator->unused_start + allocator->unused_size, allocator->unused_size / KB);
printf("Reserved for Buddy:\t%p-%p\t%zuB\n", allocator->reserved_start, allocator->reserved_start + allocator->reserved_size, allocator->reserved_size);
printf("-------------------------------------------------------\n");
printf("BUCKET_COUNT:\t%zu\n", allocator->bucket_num);
printf("MIN_BUDDY_ALLOC_SIZE:\t%zuKB\n", MIN_BUDDY_ALLOC_SIZE / KB);
printf("MAX_BUDDY_ALLOC_SIZE:\t%zuKB\n", allocator->MAX_BUDDY_ALLOC_SIZE / KB);
printf("-------------------------------------------------------\n");
#endif
return true;
}
/**
* @brief initialize a buddy allocator
* @param allocator which allocator to be initialized
* @param start the start address of heap
* @param end the end address of heap
* @return bool true if success , false if failed
*/
bool buddy_init(struct buddy_allocator *allocator, uintptr_t start, uintptr_t end)
{
    // just to set @buddy_start and @buddy_size to 0
memset(allocator, 0, sizeof(struct buddy_allocator));
#ifdef DEBUGPRINTF
printf("Waste for alignment: %zuB\n", ROUNDUP(start, MIN_BUDDY_ALLOC_SIZE) - start);
#endif
// round up the start address to MIN_BUDDY_ALLOC_SIZE
    start = ROUNDUP(start, MIN_BUDDY_ALLOC_SIZE); // commenting out this line also works
if (end < start + MIN_BUDDY_ALLOC_SIZE)
return false;
bool flag = buddy_init_variable(allocator, start, end);
if (!flag)
return false;
// initialize the pointer of bitmap and buckets
allocator->buckets = (list_t *)(allocator->reserved_start);
allocator->bitmap = (uint8_t *)(allocator->reserved_start + sizeof(list_t) * allocator->bucket_num);
// initialize the buckets(each of them is a list)
for (int i = 0; i < allocator->bucket_num; i++)
{
list_init(&(allocator->buckets[i]));
}
memset(allocator->bitmap, 0, allocator->bitmap_size);
    // after initialization, buckets[0] has one block of size MAX_BUDDY_ALLOC_SIZE
    // and the other buckets are empty
list_push(&(allocator->buckets[0]), (list_t *)(allocator->buddy_start));
return true;
}
/**
* @brief use a buddy allocator to allocate memory
*
* @param allocator which allocator to be used
 * @param size the size of memory to be allocated; must be a power of 2, at least MIN_BUDDY_ALLOC_SIZE and at most MAX_BUDDY_ALLOC_SIZE
* @return void* the start address of allocated memory
*/
void *buddy_alloc(struct buddy_allocator *allocator, size_t size)
{
if (size == 0 || size > allocator->MAX_BUDDY_ALLOC_SIZE || size < MIN_BUDDY_ALLOC_SIZE || size != next_pow2(size))
return NULL;
// which bucket stands for the size of memory to be allocated
ssize_t target_bucket = allocator->bucket_num - 1 - LOG2(size / MIN_BUDDY_ALLOC_SIZE);
// each bucket is a list to store the free blocks
list_t *l = &((allocator->buckets)[target_bucket]);
// A BIG LOCK SAVES THE PEACE
lock(&(allocator->_mutex));
list_t *ret = list_pop(l);
if (!ret)
{
        /*
         * target bucket is empty.
         * we need to split the nearest available block larger than the
         * requested size to get a block of the size we want
         */
ssize_t cur_bucket = target_bucket - 1;
while (cur_bucket > -1)
{
ret = list_pop(&((allocator->buckets)[cur_bucket]));
if (!ret)
{
                // this bucket is empty too;
                // continue to search the next bucket
cur_bucket--;
continue;
}
            /*
             * now we got a proper block (cur_bucket);
             * we need to split it until we get a block of the size we want
             */
size_t cur_index;
            // once cur_bucket == target_bucket, we have the block we want
while (cur_bucket != target_bucket)
{
// get the index of the current block
cur_index = ptr_to_index(allocator, ret, cur_bucket);
assert(get_bitmap(allocator->bitmap, cur_index) == false);
                // set it to not-available in bitmap
set_bitmap(allocator->bitmap, cur_index);
assert(get_bitmap(allocator->bitmap, cur_index) == true);
assert(cur_bucket + 1 < allocator->bucket_num && cur_bucket + 1 >= 0);
                /*
                 * split the current block into two blocks:
                 * the right half is pushed to the free list (buckets[cur_bucket+1]),
                 * the left half keeps being split recursively
                 */
list_push(&((allocator->buckets)[cur_bucket + 1]), (list_t *)(index_to_ptr(allocator, RCHILD(cur_index))));
// the left one
ret = (list_t *)(index_to_ptr(allocator, LCHILD(cur_index)));
cur_bucket++;
}
// now we get the block we want
cur_index = ptr_to_index(allocator, ret, cur_bucket);
assert(get_bitmap(allocator->bitmap, cur_index) == false);
set_bitmap(allocator->bitmap, cur_index);
assert(get_bitmap(allocator->bitmap, cur_index) == true);
assert(get_bitmap(allocator->bitmap, PARENT(cur_index)) == true);
break;
}
}
else
{
/*
* target bucket is not empty
* ret is the block we want
*/
size_t cur_index = ptr_to_index(allocator, ret, target_bucket);
assert(get_bitmap(allocator->bitmap, cur_index) == false);
set_bitmap(allocator->bitmap, cur_index);
assert(get_bitmap(allocator->bitmap, cur_index) == true);
}
unlock(&(allocator->_mutex));
return (void *)ret;
}
/**
* @brief use a buddy allocator to free memory
*
* @param allocator which allocator to be used
* @param _ptr the start address of memory to be freed
* @param size the size of memory to be freed
*/
void buddy_free(struct buddy_allocator *allocator, void *_ptr, size_t size)
{
if (!_ptr)
return;
if (size == 0 || size > allocator->MAX_BUDDY_ALLOC_SIZE || size < MIN_BUDDY_ALLOC_SIZE || size != next_pow2(size))
{
assert(0);
return;
}
size_t cur_bucket = allocator->bucket_num - 1 - LOG2(size / MIN_BUDDY_ALLOC_SIZE);
size_t index = ptr_to_index(allocator, _ptr, cur_bucket);
lock(&(allocator->_mutex));
assert(get_bitmap(allocator->bitmap, index) == true);
    // set it to available in bitmap
unset_bitmap(allocator->bitmap, index);
    /*
     * if a block and its buddy are both available, merge them;
     * recursively merge until we reach the root (index = 0) or the buddy is not available
     */
while (index && get_bitmap(allocator->bitmap, BUDDY(index)) == false)
{
// if (index == 0 || get_bitmap(allocator->bitmap, BUDDY(index)))
// break;
list_t *pptr = (list_t *)index_to_ptr(allocator, BUDDY(index));
assert(get_bitmap(allocator->bitmap, BUDDY(index)) == false);
// remove the buddy block from the free list
list_remove(pptr);
assert(get_bitmap(allocator->bitmap, PARENT(index)) == true);
// now we merge the block and its buddy
        // so the parent block is available
unset_bitmap(allocator->bitmap, PARENT(index));
index = PARENT(index);
}
assert(get_bitmap(allocator->bitmap, index) == false);
cur_bucket = LOG2(index + 1);
assert(cur_bucket < allocator->bucket_num && cur_bucket >= 0);
list_t *p1 = (list_t *)index_to_ptr(allocator, index);
// p1 is the merged block,push it to the free list
list_push(&((allocator->buckets)[cur_bucket]), p1);
unlock(&(allocator->_mutex));
}
/**
 * @brief check whether _ptr is in the range of an allocator's memory
*
* @param allocator which allocator to be used
* @param _ptr the start address of memory to be checked
 * @return true if _ptr is in the range of an allocator's memory
*/
bool is_in_buddy_allocator_range(struct buddy_allocator *allocator, void *_ptr)
{
uintptr_t ptr = (uintptr_t)_ptr;
return ptr >= allocator->buddy_start && ptr < allocator->buddy_start + allocator->buddy_size;
}
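// A minimal end-to-end sketch of a single buddy allocator (not called
// anywhere). The 1 * MB heap size and the 8 * KB request are arbitrary example
// values; buddy_init zeroes the allocator itself, mirroring how
// buddy_init_for_heap uses it below.
static inline void buddy_usage_sketch()
{
    size_t heap_size = 1 * MB;
    void *heap = malloc(heap_size);
    struct buddy_allocator a;
    if (heap && buddy_init(&a, (uintptr_t)heap, (uintptr_t)heap + heap_size))
    {
        void *p = buddy_alloc(&a, 8 * KB); // size must be a power of 2
        assert(p && is_in_buddy_allocator_range(&a, p));
        buddy_free(&a, p, 8 * KB);
    }
    free(heap);
}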
/*
 * About half of the buddy allocator's memory may be left unused.
 * That unused memory can be used to construct a new buddy allocator;
 * do this recursively until the unused memory is less than MIN_BUDDY_ALLOC_SIZE.
 *
 * The larger MAX_BUDDY_ALLOCATOR_NUMS is, the smaller the unused memory:
 * roughly 1/(2^n) of the total memory stays unused, where n is
 * MAX_BUDDY_ALLOCATOR_NUMS. MAX_BUDDY_ALLOCATOR_NUMS must be less than UINT16_MAX.
 */
#define MAX_BUDDY_ALLOCATOR_NUMS (20)
struct buddy_allocator allocators[MAX_BUDDY_ALLOCATOR_NUMS];
uint8_t allocator_inited_num = 0;
void buddy_init_for_heap(uintptr_t start, uintptr_t end)
{
if (buddy_init(&allocators[0], start, end))
allocator_inited_num++;
    // initialize the remaining buddy allocators
for (int i = 1; i < MAX_BUDDY_ALLOCATOR_NUMS; i++)
{
        // use the previous allocator's unused space to construct the new allocator
if (buddy_init(&allocators[i], allocators[i - 1].unused_start, allocators[i - 1].unused_start + allocators[i - 1].unused_size))
allocator_inited_num++;
else
break;
}
#ifdef DEBUGPRINTF
printf("allocator_inited_num : %d\n", allocator_inited_num);
#endif
}
/**
* @brief Get a block from buddy allocator ! CONCURRENCY SAFE !
*
 * @param size the size of memory to be allocated; must be a power of 2, at least MIN_BUDDY_ALLOC_SIZE and at most MAX_BUDDY_ALLOC_SIZE
* @return void* the start address of the allocated memory
*/
void *get_block_from_buddy(size_t size)
{
void *ret = NULL;
    // start from the smallest buddy allocator
for (int i = allocator_inited_num - 1; i >= 0; i--)
{
ret = buddy_alloc(&allocators[i], size);
if (ret)
break;
}
return ret;
}
/** @brief free a block to buddy allocator ! CONCURRENCY SAFE !
*
* @param _ptr the start address of memory to be freed
* @param size the size of memory to be freed
*/
void free_block_to_buddy(void *_ptr, size_t size)
{
for (int i = allocator_inited_num - 1; i >= 0; i--)
{
if (is_in_buddy_allocator_range(&allocators[i], _ptr))
{
buddy_free(&allocators[i], _ptr, size);
return;
}
}
// NEVER REACH HERE
assert(0);
}
/*
 * This slab allocator is not a real slab allocator;
 * it just borrows the basic idea: a per-cpu cache
 * for specific sizes of memory.
 */
/*
 * each slab_allocator is designed to hold a fixed size of memory.
 * each cpu has several slab_allocators, whose sizes range over
 * [MIN_SLAB_BLOCK_SIZE, MAX_SLAB_BLOCK_SIZE].
 *
 * a slab_allocator has three lists: the @ free / partial / full lists.
 * each @list links to some @slab_header.
 * each @slab_header links to some @slab_block_header.
 * each @slab_block_header stands for one block of memory.
 */
struct slab_block_header
{
list_t to_block;
size_t cnt;
};
struct slab_header
{
list_t to_slab; // link to free/full/partial _slab
list_t to_block; // link to blocks
};
#define MAX_SLAB_BLOCK_SIZE (32 * KB)
#define MIN_SLAB_BLOCK_SIZE ((2 * sizeof(uintptr_t)) * 2) // must be >= 3 * sizeof(uintptr_t)
// each slab_allocator allocates fixed-size memory
// each slab_allocator gets 2 * MIN_BUDDY_ALLOC_SIZE of memory for initialization
struct slab_allocator
{
size_t blocks_per_slab;
    size_t slab_block_size; // in the range [MIN_SLAB_BLOCK_SIZE, MAX_SLAB_BLOCK_SIZE]
list_t free_slab; // totally not used
list_t full_slab; // totally used
list_t partial_slab; // partially used
};
size_t get_request_size_slab(struct slab_allocator *allocator)
{
size_t request_size = allocator->slab_block_size + sizeof(struct slab_header);
request_size = MAX(next_pow2(request_size), MIN_BUDDY_ALLOC_SIZE);
size_t cnt2 = (request_size - sizeof(struct slab_header)) / allocator->slab_block_size;
int waste_percent = (allocator->slab_block_size - sizeof(struct slab_header)) * 100 /
(cnt2 * allocator->slab_block_size);
if (waste_percent > 13)
{
request_size = allocator->slab_block_size * 8;
request_size += sizeof(struct slab_header);
request_size = MAX(next_pow2(request_size), MIN_BUDDY_ALLOC_SIZE);
cnt2 = (request_size - sizeof(struct slab_header)) / allocator->slab_block_size;
waste_percent = (allocator->slab_block_size - sizeof(struct slab_header)) * 100 /
(cnt2 * allocator->slab_block_size);
assert(waste_percent <= 13);
/*
waste_percent: 0%
waste_percent: 2%
waste_percent: 5%
waste_percent: 6%
waste_percent: 13%
*/
}
#ifdef DEBUGPRINTF
printf("waste_percent: %d%%\n", waste_percent);
#endif
return request_size;
}
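// A worked example of the request-size math above (not called anywhere),
// assuming a 64-bit build where sizeof(struct slab_header) == 32:
static inline void request_size_selfcheck()
{
    struct slab_allocator a;
    a.slab_block_size = 32;
    // 32 + 32 rounds up to MIN_BUDDY_ALLOC_SIZE (4KB), which holds
    // (4096 - 32) / 32 = 127 blocks with zero percent waste
    assert(get_request_size_slab(&a) == 4 * KB);
}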
/**
 * @brief get a block from the buddy allocator, construct a slab_header and some slab blocks from it, and add it to the slab allocator
*/
bool add_slab_to_allocator(struct slab_allocator *allocator)
{
    size_t request_size = get_request_size_slab(allocator);
void *ptr = get_block_from_buddy(request_size);
if (!ptr)
{
return false;
}
struct slab_header *header = (struct slab_header *)ptr;
list_init(&(header->to_slab));
list_init(&(header->to_block));
list_push(&(allocator->free_slab), (list_t *)(header));
/*
* got a block from buddy allocator
* / block /
* split it into
* / slab_header / slab_block_header / slab_block_header / slab_block_header / ...../
*
*/
uintptr_t start = (uintptr_t)ptr + sizeof(struct slab_header);
size_t cnt = 0;
while (start + allocator->slab_block_size <= (uintptr_t)ptr + request_size)
{
// sizeof(struct slab_block_header);
assert(allocator->slab_block_size >= sizeof(struct slab_block_header));
struct slab_block_header *kh = (struct slab_block_header *)(start);
        // the slab_block_header's position within the slab, used in slab_free
kh->cnt = cnt;
list_push(&(header->to_block), (&(kh->to_block)));
start += allocator->slab_block_size;
cnt++;
}
    // because we will put @cnt into kalloc_header and store it in a uint16_t
assert(cnt <= UINT16_MAX);
allocator->blocks_per_slab = cnt;
#ifdef DEBUGPRINTF
printf("waste_percent: %zd%%\n", waste_percent);
size_t waste_size = (uintptr_t)ptr + request_size - start;
printf("Requset: %zuKB \tWaste size: %zu, alloc %zu * %zuB\n", request_size / KB, waste_size, cnt, allocator->slab_alloc_size);
#endif
return true;
}
/**
* @brief init a slab allocator
*
* @param allocator the slab allocator to be init
 * @param size the size of memory to be allocated; must be a power of 2, at least MIN_SLAB_BLOCK_SIZE and at most MAX_SLAB_BLOCK_SIZE
*/
bool slab_allocator_init(struct slab_allocator *allocator, size_t size)
{
if (size < MIN_SLAB_BLOCK_SIZE || size > MAX_SLAB_BLOCK_SIZE || size != next_pow2(size))
return false;
allocator->slab_block_size = size;
list_init(&(allocator->free_slab));
list_init(&(allocator->full_slab));
list_init(&(allocator->partial_slab));
bool flag = add_slab_to_allocator(allocator);
int cnt = MAX_SLAB_BLOCK_SIZE / allocator->slab_block_size;
cnt -= 1;
while (cnt > 0)
{
add_slab_to_allocator(allocator);
cnt--;
}
return flag;
}
/** @brief init a cpu's slab allocator
*/
void slab_init(struct slab_allocator *allocators)
{
    // the number of slab_allocators each cpu has
size_t len = LOG2(MAX_SLAB_BLOCK_SIZE) - LOG2(MIN_SLAB_BLOCK_SIZE) + 1;
size_t size = MIN_SLAB_BLOCK_SIZE;
for (size_t i = 0; i < len; i++, size *= 2)
{
if (!slab_allocator_init(&allocators[i], size))
{
allocators[i].slab_block_size = 0;
            assert(0); // not enough space to init the slab allocator for this cpu
}
}
}
// the start memory address of the slab_allocator array;
// this array contains all cpus' slab_allocators,
// and each cpu's slab_allocators occupy contiguous memory
uintptr_t slab_allocators_for_all_cpu;
/** @brief initialize all cpus' slab allocators
 */
void slab_init_for_all_cpu()
{
int cpu_num = cpu_count();
size_t len = LOG2(MAX_SLAB_BLOCK_SIZE) - LOG2(MIN_SLAB_BLOCK_SIZE) + 1;
// get a block from buddy allocator to store slab_allocator array
size_t size = MAX(MIN_BUDDY_ALLOC_SIZE, sizeof(struct slab_allocator) * len * cpu_num + sizeof(uint16_t) * cpu_num);
size = next_pow2(size);
slab_allocators_for_all_cpu = (uintptr_t)get_block_from_buddy(size);
if (!slab_allocators_for_all_cpu)
{
assert(0);
return;
}
memset((void *)slab_allocators_for_all_cpu, 0, size);
#ifdef DEBUGPRINTF
size_t wasted_size = size - sizeof(struct slab_allocator) * cpu_num * len - sizeof(uint16_t) * cpu_num;
printf("slab_init_for_all_cpu Waste size: %zuB\n", wasted_size);
#endif
size_t offset = 0;
for (int i = 0; i < cpu_num; i++)
{
        // initialize the slab_allocators for cpu i
slab_init((struct slab_allocator *)(slab_allocators_for_all_cpu + offset));
offset += sizeof(struct slab_allocator) * len;
}
}
void slab_shrink()
{
int cur_cpu = cpu_current();
size_t len = LOG2(MAX_SLAB_BLOCK_SIZE) - LOG2(MIN_SLAB_BLOCK_SIZE) + 1;
uintptr_t cur_slab_allocator = slab_allocators_for_all_cpu + sizeof(struct slab_allocator) * len * cur_cpu;
for (size_t index = 0; index < len; index++)
{
struct slab_allocator *allocator = (struct slab_allocator *)(cur_slab_allocator + sizeof(struct slab_allocator) * index);
list_t *cur = allocator->partial_slab.next;
while (cur != &(allocator->partial_slab))
{
list_t *next = cur->next;
list_t *block_cur = ((struct slab_header *)cur)->to_block.next;
size_t cnt = 0;
while (block_cur != &(((struct slab_header *)cur)->to_block))
{
list_t *block_next = block_cur->next;
cnt++;
block_cur = block_next;
}
assert(cnt <= allocator->blocks_per_slab);
if (cnt == allocator->blocks_per_slab)
{
list_remove(cur);
list_push(&(allocator->free_slab), cur);
}
cur = next;
}
cur = allocator->free_slab.next;
while (cur != &(allocator->free_slab))
{
list_t *next = cur->next;
if (next == &(allocator->free_slab))
break;
list_remove(cur);
        free_block_to_buddy(cur, get_request_size_slab(allocator));
cur = next;
}
}
}
/**
 * @brief get some memory from the slab allocator
 *
 * @param size the size of memory to be allocated; must be a power of 2, at least MIN_SLAB_BLOCK_SIZE and at most MAX_SLAB_BLOCK_SIZE
 * @return the pointer to the start memory address; NULL if failed
*/
void *slab_alloc(size_t size)
{
if (size < MIN_SLAB_BLOCK_SIZE || size > MAX_SLAB_BLOCK_SIZE || size != next_pow2(size))
return NULL;
    // each cpu uses its own slab_allocators
int cur_cpu = cpu_current();
uintptr_t cur_slab_allocator = slab_allocators_for_all_cpu + sizeof(struct slab_allocator) * (LOG2(MAX_SLAB_BLOCK_SIZE) - LOG2(MIN_SLAB_BLOCK_SIZE) + 1) * cur_cpu;
// cur_slab_allocator is the start address of the slab_allocator of current cpu
size_t index = LOG2(size) - LOG2(MIN_SLAB_BLOCK_SIZE);
// get the slab_allocator of target size
struct slab_allocator *alloc = (struct slab_allocator *)(cur_slab_allocator + sizeof(struct slab_allocator) * index);
if (alloc->slab_block_size != size)
return NULL; // should not happen
void *ret = NULL;
if (!list_empty(&(alloc->partial_slab)))
{
list_t *node = list_pop(&(alloc->partial_slab));
struct slab_header *header = (struct slab_header *)node;
ret = list_pop(&(header->to_block));
if (list_empty(&(header->to_block)))
{
list_push(&(alloc->full_slab), node);
}
else
{
list_push(&(alloc->partial_slab), node);
}
}
else if (!list_empty(&(alloc->free_slab)))
{
list_t *node = list_pop(&(alloc->free_slab));
// sizeof(struct slab_block_header);
struct slab_header *header = (struct slab_header *)node;
ret = list_pop(&(header->to_block));
if (list_empty(&(header->to_block)))
{
list_push(&(alloc->full_slab), node);
}
else
{
list_push(&(alloc->partial_slab), node);
}
}
else
{
        // if both the partial and free slab lists are empty, add a new slab,
        // i.e. get a block from the buddy allocator for the current cpu to use
        if (!add_slab_to_allocator(alloc))
            return NULL;
        // Maybe at some point we should shrink the slab allocator and give blocks back
        // to the buddy allocator; otherwise some cpus end up with little memory while
        // other cpus hold a lot.
        return slab_alloc(size);
}
return ret;
}
// used for kalloc_header
size_t MAGIC;
struct kalloc_header
{
size_t size;
union
{
uint16_t cnt; // only used for slab allocator
size_t canary; // for error detection
};
};
void slab_free(void *_ptr, size_t size)
{
if (!_ptr || size < MIN_SLAB_BLOCK_SIZE || size > MAX_SLAB_BLOCK_SIZE || size != next_pow2(size))
return;
int cur_cpu = cpu_current();
uintptr_t cur_slab_allocator = slab_allocators_for_all_cpu + sizeof(struct slab_allocator) * (LOG2(MAX_SLAB_BLOCK_SIZE) - LOG2(MIN_SLAB_BLOCK_SIZE) + 1) * cur_cpu;
size_t index = LOG2(size) - LOG2(MIN_SLAB_BLOCK_SIZE);
struct slab_allocator *alloc = (struct slab_allocator *)(cur_slab_allocator + sizeof(struct slab_allocator) * index);
if (alloc->slab_block_size != size)
return;
size_t request_size = size + sizeof(struct slab_header);
request_size = MAX(next_pow2(request_size), MIN_BUDDY_ALLOC_SIZE);
struct kalloc_header *kh = (struct kalloc_header *)(_ptr);
size_t cnt = kh->cnt;
/*
* / slab_header / slab_block_header / slab_block_header / ....
* | |
*target _ptr
*/
void *target = (void *)((uintptr_t)(_ptr) - sizeof(struct slab_header) - cnt * size);
struct slab_header *header = (struct slab_header *)(target);
struct slab_block_header *sbh = (struct slab_block_header *)(_ptr);
list_push(&(header->to_block), &(sbh->to_block));
sbh->cnt = cnt; // <--- CAUTION: this is important
list_remove(&(header->to_slab)); // equal to list_remove(header);
    /*
     * The slab may now belong in free_slab or partial_slab;
     * for simplicity we always push it to partial_slab rather than
     * traversing the whole block list to decide
     */
list_push(&(alloc->partial_slab), &(header->to_slab));
int cpu_num = cpu_count();
size_t len = LOG2(MAX_SLAB_BLOCK_SIZE) - LOG2(MIN_SLAB_BLOCK_SIZE) + 1;
uint16_t *free_cnt = (uint16_t *)(slab_allocators_for_all_cpu + sizeof(struct slab_allocator) * len * cpu_num + sizeof(uint16_t) * cur_cpu);
(*free_cnt)++;
if (*free_cnt == UINT16_MAX)
{
slab_shrink();
#ifdef DEBUGPRINTF
printf("CPU %d: slab_shrink\n", cur_cpu);
#endif
}
}
/*
 * @brief initialize the kallocator,
 * including the slab allocator and the buddy allocator.
 * Done by the boot cpu only
 */
static void kalloc_init(uintptr_t start, uintptr_t end)
{
srand(time(NULL));
MAGIC = ((size_t)(rand()) << 32) + rand();
buddy_init_for_heap(start, end);
slab_init_for_all_cpu();
}
/**
* @brief make canary for error detection
*/
size_t make_canary(size_t canary, uint16_t cnt)
{
/*
* |31/63............. 15.....0| bits
* | canary | cnt |
*/
size_t ret = cnt;
size_t mask = (SIZE_MAX) << (sizeof(cnt) * 8);
mask &= canary;
return ret | mask;
}
bool check_canary(size_t num, size_t canary, uint16_t cnt)
{
size_t ret = cnt;
size_t mask = (SIZE_MAX) << (sizeof(cnt) * 8);
mask &= canary;
return (ret | mask) == num;
}
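// A small usage sketch of the canary scheme (not called anywhere): the low
// 16 bits carry the slab block index @cnt, the high bits come from the
// per-boot canary value. The constants below are arbitrary example values,
// assuming a 64-bit size_t as the rest of the file does.
static inline void canary_selfcheck()
{
    size_t canary = 0x1234567890ABCDEFull;
    uint16_t cnt = 7;
    size_t num = make_canary(canary, cnt);
    assert((num & 0xFFFF) == cnt);
    assert(check_canary(num, canary, cnt));
    assert(!check_canary(num, canary, (uint16_t)(cnt + 1))); // a corrupted cnt is caught
}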
/**
* @brief alloc memory of size
*
* @param _size the size of memory to be allocated
* @return void* the start address of the allocated memory , NULL if failed
*/
static void *
kalloc(size_t _size)
{
if (_size == 0)
return NULL;
void *ret = NULL;
size_t size = MAX(next_pow2(_size + sizeof(struct kalloc_header)), MIN_SLAB_BLOCK_SIZE);
    // for small sizes use the slab allocator; each cpu has its own slab allocators
if (size >= MIN_SLAB_BLOCK_SIZE && size <= MAX_SLAB_BLOCK_SIZE)
{
// slab alloc
void *receive = slab_alloc(size); // bug for size==32768
if (!receive)
return NULL;
struct slab_block_header *block_header = (struct slab_block_header *)receive;
uint16_t cnt = block_header->cnt;
struct kalloc_header *ptr = (struct kalloc_header *)(receive);
ptr->size = size;
ptr->canary = make_canary(MAGIC ^ size, cnt);
ret = ptr + 1;
}
    else // otherwise use the buddy allocator; all cpus share the buddy allocators
{
struct kalloc_header *ptr = (struct kalloc_header *)get_block_from_buddy(size);
if (!ptr)
return NULL;
ptr->size = size;
ptr->canary = make_canary(MAGIC ^ size, (uint16_t)(rand())); // cnt is useless in buddy allocator
ret = ptr + 1;
}
return ret;
}
/**
* @brief free memory allocated by kalloc
*
* @param ptr the start address of the memory to be freed
*/
static void kfree(void *ptr)
{
if (!ptr)
return;
struct kalloc_header *header = (struct kalloc_header *)(ptr)-1;
if (!check_canary(header->canary, MAGIC ^ header->size, header->cnt) || header->size < MIN_SLAB_BLOCK_SIZE)
{
assert(0);
return;
}
if (header->size >= MIN_SLAB_BLOCK_SIZE && header->size <= MAX_SLAB_BLOCK_SIZE)
{
slab_free(header, header->size);
}
else
{
free_block_to_buddy(header, header->size);
}
}
signed main(int argc, char **argv)
{
setbuf(stdout, 0);
bool use_buddy = true;
if (argc >= 2)
{
use_buddy = atoi(argv[1]);
}
if (use_buddy)
printf("use buddy\n");
else
printf("use malloc\n");
const size_t size = 125 * MB;
uintptr_t ptr = (uintptr_t)malloc(size);
clock_t s = clock();
kalloc_init(ptr, ptr + size);
printf("kalloc_init: %ld clocks\n", (clock() - s));
auto test = [&]()
{
_map_mutex.lock();
_map[this_thread::get_id()] = cpuid++;
printf("thread %d\n", _map[this_thread::get_id()]);
_map_mutex.unlock();
clock_t start_time = clock();
int cnt = 1;
size_t total_size = 0;
vector<void *> parr;
#define BASE_SIZE (32 * B)
for (int i = 0; i < 5e3; i++)
{
void *p;
if (use_buddy)
p = kalloc(BASE_SIZE);
else
p = malloc(BASE_SIZE);
if (p)
{
total_size += BASE_SIZE;
cnt++;
// printf("%d thread : %zuB\n", _map[this_thread::get_id()], BASE_SIZE * (i + 1));
memset(p, 0xff, BASE_SIZE);
parr.push_back(p);
// kfree(p);
}
}
for (auto &&i : parr)
{
if (use_buddy)
kfree(i);
else
free(i);
}
printf("%d thread :use %ld clocks us alloc %d times , total %zuKB \n", _map[this_thread::get_id()], clock() - start_time, cnt, total_size / KB);
};
vector<thread> threads;
for (int i = 0; i < ThreadNums; i++)
{
thread t(test);
threads.push_back(move(t));
}
test();
for (auto &&i : threads)
{
i.join();
}
return 0;
}