tiny-cuda-nn-debug-fork
/* | |
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without modification, are permitted | |
* provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright notice, this list of | |
* conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright notice, this list of | |
* conditions and the following disclaimer in the documentation and/or other materials | |
* provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used | |
* to endorse or promote products derived from this software without specific prior written | |
* permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR | |
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE | |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
*/ | |
/** @file gpu_matrix.h | |
* @author Thomas Müller, NVIDIA | |
* @brief Matrix whose data resides in GPU (CUDA) memory | |
*/ | |
#pragma once | |
#include <tiny-cuda-nn/common.h> | |
#include <tiny-cuda-nn/gpu_memory.h> | |
#include <tiny-cuda-nn/matrix_layout.h> | |
#include <pcg32/pcg32.h> | |
#include <stdexcept> | |
#include <stdint.h> | |
#include <string> | |
#include <vector> | |
TCNN_NAMESPACE_BEGIN | |
template<typename T> | |
class GPUMatrixDynamic; | |
template<typename T, MatrixLayout _layout> | |
class GPUMatrix; | |
class GPUMatrixBase { | |
public: | |
virtual ~GPUMatrixBase() {} | |
virtual size_t n_bytes() const = 0; | |
virtual void set_data_unsafe(void* data) = 0; | |
static void allocate_shared_memory(GPUMemory<char>& memory, const std::vector<GPUMatrixBase*>& matrices) { | |
size_t total_n_bytes = 0; | |
for (auto* matrix : matrices) { | |
total_n_bytes += matrix->n_bytes(); | |
} | |
if (memory.bytes() < total_n_bytes) { | |
#ifdef TCNN_VERBOSE_MEMORY_ALLOCS | |
std::cout << "GPUMatrix: Allocating " << bytes_to_string(total_n_bytes) << " shared among " << matrices.size() << " matrices." << std::endl; | |
#endif | |
memory.resize(total_n_bytes); | |
} | |
size_t offset = 0; | |
for (auto* matrix : matrices) { | |
matrix->set_data_unsafe(memory.data() + offset); | |
offset += matrix->n_bytes(); | |
} | |
} | |
template <typename T> | |
static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices); | |
template <typename T, MatrixLayout layout> | |
static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices); | |
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, const std::vector<GPUMatrixBase*>& matrices) { | |
size_t total_n_bytes = 0; | |
for (auto* matrix : matrices) { | |
total_n_bytes += matrix->n_bytes(); | |
} | |
auto alloc = allocate_workspace(stream, total_n_bytes); | |
size_t offset = 0; | |
for (auto* matrix : matrices) { | |
matrix->set_data_unsafe(alloc.data() + offset); | |
offset += matrix->n_bytes(); | |
} | |
return alloc; | |
} | |
template <typename T> | |
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices); | |
template <typename T, MatrixLayout layout> | |
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices); | |
}; | |
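// Usage sketch (illustrative; not part of the original header). The helpers above let several
// matrices share one backing buffer: each matrix is created with its shape only, then pointed
// into consecutive offsets of the buffer. Variable names and shapes below are hypothetical.
//
//   GPUMemory<char> backing;
//   GPUMatrix<float> weights{nullptr, 64, 64};
//   GPUMatrix<float> biases{nullptr, 64, 1};
//   std::vector<GPUMatrixBase*> mats = {&weights, &biases};
//   GPUMatrixBase::allocate_shared_memory(backing, mats);
//   // weights.data() and biases.data() now point at consecutive offsets inside `backing`.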
template <typename T> | |
struct MatrixView { | |
TCNN_HOST_DEVICE MatrixView() : data{nullptr}, stride_i{0}, stride_j{0} {} | |
TCNN_HOST_DEVICE MatrixView(T* data, uint32_t stride_i, uint32_t stride_j) : data{data}, stride_i{stride_i}, stride_j{stride_j} {} | |
TCNN_HOST_DEVICE MatrixView(const MatrixView<std::remove_const_t<T>>& other) : data{other.data}, stride_i{other.stride_i}, stride_j{other.stride_j} {} | |
TCNN_HOST_DEVICE T& operator()(uint32_t i, uint32_t j = 0) const { | |
return data[i * stride_i + j * stride_j]; | |
} | |
TCNN_HOST_DEVICE void advance(uint32_t m, uint32_t n) { | |
data = &(*this)(m, n); | |
} | |
TCNN_HOST_DEVICE void advance_rows(uint32_t m) { | |
advance(m, 0); | |
} | |
TCNN_HOST_DEVICE void advance_cols(uint32_t n) { | |
advance(0, n); | |
} | |
TCNN_HOST_DEVICE explicit operator bool() const { | |
return data; | |
} | |
T* data; | |
uint32_t stride_i, stride_j; | |
}; | |
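// Usage sketch (illustrative): MatrixView decouples element indexing from the storage order,
// so device code can address element (i, j) without knowing whether the underlying matrix is
// row- or column-major. The kernel below is hypothetical.
//
//   __global__ void add_one(MatrixView<float> v, uint32_t n_rows, uint32_t n_cols) {
//       uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
//       uint32_t j = blockIdx.y * blockDim.y + threadIdx.y;
//       if (i < n_rows && j < n_cols) {
//           v(i, j) += 1.0f;
//       }
//   }
//
//   // Host side: GPUMatrix<float> m{128, 256};
//   //            add_one<<<blocks, threads>>>(m.view(), m.rows(), m.cols());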
template <typename T> | |
class GPUMatrixDynamic : public GPUMatrixBase { | |
public: | |
using Type = T; | |
// Owning its memory as a GPUMemory<T> | |
GPUMatrixDynamic(uint32_t m, uint32_t n, MatrixLayout layout = CM) | |
: m_rows{m}, m_cols{n}, m_layout{layout} { | |
m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(m * n * sizeof(T)); | |
m_data = (T*)m_malloc_allocation->data(); | |
set_stride_contiguous(); | |
} | |
// Owning its memory as an allocation from a stream's memory arena | |
GPUMatrixDynamic(uint32_t m, uint32_t n, cudaStream_t stream, MatrixLayout layout = CM) | |
: m_rows{m}, m_cols{n}, m_layout{layout} { | |
m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, m * n * sizeof(T))); | |
m_data = (T*)m_arena_allocation->data(); | |
set_stride_contiguous(); | |
} | |
// Pointing to external memory | |
explicit GPUMatrixDynamic(T* data, uint32_t m, uint32_t n, MatrixLayout layout = CM, uint32_t stride = 0, std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr, std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr) | |
: m_data{data}, m_layout{layout}, m_malloc_allocation{malloc_allocation}, m_arena_allocation{arena_allocation} { | |
set(data, m, n, stride); | |
} | |
GPUMatrixDynamic() : GPUMatrixDynamic{nullptr, 0, 0} {} | |
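// Construction modes, sketched (illustrative; `stream` is assumed to be a valid cudaStream_t
// owned by the caller):
//
//   GPUMatrixDynamic<float> a{128, 256};              // owns a fresh cudaMalloc-backed GPUMemory, column-major
//   GPUMatrixDynamic<float> b{128, 256, stream, RM};  // owns an arena allocation on `stream`, row-major
//   GPUMatrixDynamic<float> c{a.data(), 128, 256};    // non-owning view of a's memory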
GPUMatrixDynamic<T>& operator=(GPUMatrixDynamic<T>&& other) { | |
std::swap(m_data, other.m_data); | |
std::swap(m_rows, other.m_rows); | |
std::swap(m_cols, other.m_cols); | |
std::swap(m_stride, other.m_stride); | |
std::swap(m_layout, other.m_layout); | |
std::swap(m_malloc_allocation, other.m_malloc_allocation); | |
std::swap(m_arena_allocation, other.m_arena_allocation); | |
return *this; | |
} | |
GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) { | |
*this = std::move(other); | |
} | |
GPUMatrixDynamic(const GPUMatrixDynamic<T>& other) = delete; | |
virtual ~GPUMatrixDynamic() {} | |
void set_data_unsafe(void* data) override { m_data = (T*)data; } | |
void set_size_unsafe(uint32_t rows, uint32_t cols, uint32_t stride = 0) { | |
m_rows = rows; | |
m_cols = cols; | |
if (stride == 0) { | |
set_stride_contiguous(); | |
} else { | |
m_stride = stride; | |
} | |
} | |
void set(T* data, uint32_t rows, uint32_t cols, uint32_t stride = 0) { | |
set_data_unsafe(data); | |
set_size_unsafe(rows, cols, stride); | |
} | |
void resize(uint32_t rows, uint32_t cols) { | |
if (m_arena_allocation) { | |
cudaStream_t stream = m_arena_allocation->stream(); | |
m_arena_allocation.reset(); | |
m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, rows * cols * sizeof(T))); | |
} else if (m_malloc_allocation) { | |
m_malloc_allocation.reset(); | |
m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(rows * cols * sizeof(T)); | |
} else { | |
throw std::runtime_error{"GPUMatrix::resize is not permitted when the underlying memory is not owned. Use GPUMatrix::set instead."}; | |
} | |
set_size_unsafe(rows, cols); | |
} | |
uint32_t stride_contiguous() const { | |
return m_layout == CM ? m() : n(); | |
} | |
bool is_contiguous() const { | |
return m_stride == stride_contiguous(); | |
} | |
void set_stride_contiguous() { | |
m_stride = stride_contiguous(); | |
} | |
GPUMatrixDynamic<T> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const { | |
return GPUMatrixDynamic<T>{ | |
data() + (layout() == CM ? (offset_rows + offset_cols * stride()) : (offset_cols + offset_rows * stride())), | |
new_rows, | |
new_cols, | |
layout(), | |
stride(), | |
m_malloc_allocation, | |
m_arena_allocation, | |
}; | |
} | |
GPUMatrixDynamic<T> slice_rows(uint32_t offset, uint32_t size) const { | |
return slice(offset, size, 0, cols()); | |
} | |
GPUMatrixDynamic<T> slice_cols(uint32_t offset, uint32_t size) const { | |
return slice(0, rows(), offset, size); | |
} | |
GPUMatrixDynamic<T> alias() const { | |
return slice(0, rows(), 0, cols()); | |
} | |
MatrixView<T> view() const { | |
return {data(), layout() == CM ? 1u : stride(), layout() == CM ? stride() : 1u}; | |
} | |
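// Slicing sketch (illustrative): slices alias the parent's memory and hold a reference to the
// owning allocation, so they remain valid even if the parent object is destroyed first.
//
//   GPUMatrixDynamic<float> batch{64, 4096};   // column-major, 64 rows x 4096 columns
//   auto left  = batch.slice_cols(0, 2048);     // columns [0, 2048), still contiguous
//   auto inner = batch.slice_rows(16, 32);      // rows [16, 48), non-contiguous (stride stays 64)
//   MatrixView<float> v = batch.view();         // (i, j) indexing for kernels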
uint32_t rows() const { return m_rows; } | |
uint32_t fan_out() const { return m_rows; } | |
uint32_t m() const { return m_rows; } | |
uint32_t cols() const { return m_cols; } | |
uint32_t fan_in() const { return m_cols; } | |
uint32_t n() const { return m_cols; } | |
uint32_t stride() const { return m_stride; } | |
PitchedPtr<T> pitched_ptr() { return {data(), stride()}; } | |
PitchedPtr<const T> pitched_ptr() const { return {data(), stride()}; } | |
uint32_t n_elements() const { return m_rows * m_cols; } | |
size_t n_bytes() const override { return n_elements() * sizeof(T); } | |
MatrixLayout layout() const { return m_layout; } | |
MatrixLayout transposed_layout() const { return m_layout == RM ? CM : RM; } | |
T* data() const { return m_data; } | |
void memset(int value) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
CUDA_CHECK_THROW(cudaMemset(data(), value, n_bytes())); | |
} | |
void memset_async(cudaStream_t stream, int value) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
CUDA_CHECK_THROW(cudaMemsetAsync(data(), value, n_bytes(), stream)); | |
} | |
// Various initializations | |
void initialize_xavier_uniform(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(6.0f / (float)(fan_in() + fan_out())); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_fa_uniform_forward(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(1.0f / (float)fan_in()); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_fa_uniform_backward(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(1.0f / (float)fan_out()); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_siren_uniform(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(6.0f / (float)fan_in()); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_siren_uniform_first(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
// The 30 in the first layer comes from https://vsitzmann.github.io/siren/ | |
scale *= 30.0f / (float)fan_in(); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_constant(float val) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
std::vector<T> new_data(n_elements(), (T)val); | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_diagonal(float val = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
CHECK_THROW(n() == m()); // Must be square for diagonal init to make sense | |
std::vector<T> new_data(n_elements(), (T)0); | |
for (uint32_t i = 0; i < n(); ++i) { | |
new_data[i + i*n()] = (T)val; | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
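// Initialization sketch (illustrative): each initializer fills a host-side staging vector and
// copies it to the device in one cudaMemcpy, so these are meant for setup time rather than
// per-iteration use.
//
//   pcg32 rng{1337};
//   GPUMatrix<float, RM> w{256, 256};
//   w.initialize_xavier_uniform(rng);   // uniform in [-s, s] with s = sqrt(6 / (fan_in + fan_out))
//   GPUMatrix<float> eye{16, 16};
//   eye.initialize_diagonal();          // identity: ones on the diagonal, zeros elsewhere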
GPUMatrixDynamic<T> transposed() const { | |
return GPUMatrixDynamic<T>(data(), n(), m(), transposed_layout(), stride(), m_malloc_allocation, m_arena_allocation); | |
} | |
GPUMatrix<T, RM> rm() const { | |
CHECK_THROW(m_layout == RM); | |
return GPUMatrix<T, RM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation); | |
} | |
GPUMatrix<T, CM> cm() const { | |
CHECK_THROW(m_layout == CM); | |
return GPUMatrix<T, CM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation); | |
} | |
private: | |
T* m_data; | |
uint32_t m_rows, m_cols, m_stride; | |
MatrixLayout m_layout; | |
// References to corresponding memory allocations. These ensure that | |
// m_data does not accidentally become dangling. | |
std::shared_ptr<GPUMemory<uint8_t>> m_malloc_allocation; | |
std::shared_ptr<GPUMemoryArena::Allocation> m_arena_allocation; | |
}; | |
template <typename T, MatrixLayout _layout = MatrixLayout::ColumnMajor> | |
class GPUMatrix : public GPUMatrixDynamic<T> { | |
public: | |
static const MatrixLayout static_layout = _layout; | |
static const MatrixLayout static_transposed_layout = _layout == RM ? CM : RM; | |
// Owning its memory as a GPUMemory<T> | |
GPUMatrix(uint32_t m, uint32_t n) | |
: GPUMatrixDynamic<T>{m, n, static_layout} { } | |
// Owning its memory as an allocation from a stream's memory arena | |
GPUMatrix(uint32_t m, uint32_t n, cudaStream_t stream) | |
: GPUMatrixDynamic<T>{m, n, stream, static_layout} { } | |
// Pointing to external memory | |
explicit GPUMatrix(T* data, uint32_t m, uint32_t n, uint32_t stride = 0, std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr, std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr) | |
: GPUMatrixDynamic<T>{data, m, n, static_layout, stride, malloc_allocation, arena_allocation} { } | |
GPUMatrix() : GPUMatrix{nullptr, 0, 0} {} | |
GPUMatrix<T, static_layout>& operator=(GPUMatrixDynamic<T>&& other) { | |
*((GPUMatrixDynamic<T>*)this) = std::move(other); | |
if (static_layout != this->layout()) { | |
throw std::runtime_error{"GPUMatrix must be constructed from a GPUMatrixDynamic with matching layout."}; | |
} | |
return *this; | |
} | |
GPUMatrix(GPUMatrixDynamic<T>&& other) noexcept { | |
*this = std::move(other); | |
} | |
GPUMatrix<T, static_layout>& operator=(GPUMatrix<T, static_layout>&& other) noexcept { | |
*((GPUMatrixDynamic<T>*)this) = std::move(other); | |
return *this; | |
} | |
GPUMatrix(GPUMatrix<T, static_layout>&& other) noexcept { | |
*this = std::move(other); | |
} | |
GPUMatrix(const GPUMatrixDynamic<T>& other) = delete; | |
virtual ~GPUMatrix() {} | |
GPUMatrix<T, static_layout> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const { | |
return ((GPUMatrixDynamic<T>*)this)->slice(offset_rows, new_rows, offset_cols, new_cols); | |
} | |
GPUMatrix<T, static_layout> slice_rows(uint32_t offset, uint32_t size) const { | |
return ((GPUMatrixDynamic<T>*)this)->slice_rows(offset, size); | |
} | |
GPUMatrix<T, static_layout> slice_cols(uint32_t offset, uint32_t size) const { | |
return ((GPUMatrixDynamic<T>*)this)->slice_cols(offset, size); | |
} | |
GPUMatrix<T, static_layout> alias() const { | |
return ((GPUMatrixDynamic<T>*)this)->alias(); | |
} | |
GPUMatrix<T, static_transposed_layout> transposed() const { | |
return ((GPUMatrixDynamic<T>*)this)->transposed(); | |
} | |
}; | |
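// Layout sketch (illustrative): GPUMatrix fixes the layout at compile time, while
// GPUMatrixDynamic carries it at runtime; rm()/cm() convert back when the runtime layout
// matches, and transposed() reinterprets the same memory.
//
//   GPUMatrix<float, RM> w{16, 32};              // row-major, 16 x 32
//   GPUMatrix<float, CM> wt = w.transposed();    // same memory viewed as 32 x 16, column-major
//   GPUMatrixDynamic<float> d{64, 64, RM};       // layout chosen at runtime
//   GPUMatrix<float, RM> checked = d.rm();       // CHECK_THROWs if d were not row-major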
template <typename T> | |
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
allocate_shared_memory(memory, matrix_pointers); | |
} | |
template <typename T, MatrixLayout layout> | |
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
allocate_shared_memory(memory, matrix_pointers); | |
} | |
template <typename T> | |
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
return allocate_shared_memory(stream, matrix_pointers); | |
} | |
template <typename T, MatrixLayout layout> | |
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
return allocate_shared_memory(stream, matrix_pointers); | |
} | |
TCNN_NAMESPACE_END |
/* | |
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without modification, are permitted | |
* provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright notice, this list of | |
* conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright notice, this list of | |
* conditions and the following disclaimer in the documentation and/or other materials | |
* provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used | |
* to endorse or promote products derived from this software without specific prior written | |
* permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR | |
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE | |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
*/ | |
/** @file gpu_memory.h | |
* @author Thomas Müller and Nikolaus Binder, NVIDIA | |
* @brief Managed memory on the GPU. Like a std::vector, memory is allocated either explicitly (resize/enlarge) | |
* or implicitly (resize_and_copy_from_host etc). Memory is always and automatically released in the destructor. | |
* Also contains a GPU memory arena for light-weight stream-ordered allocations of temporary memory. The | |
* memory arena makes use of virtual memory when available to avoid re-allocations during progressive growing. | |
*/ | |
#pragma once | |
#include <tiny-cuda-nn/common.h> | |
#include <tiny-cuda-nn/cuda_graph.h> | |
#include <cuda.h> | |
#include <algorithm> | |
#include <atomic> | |
#include <stdexcept> | |
#include <stdint.h> | |
#include <string> | |
#include <tuple> | |
#include <unordered_map> | |
#include <vector> | |
TCNN_NAMESPACE_BEGIN | |
#define DEBUG_GUARD_SIZE 0 | |
inline std::atomic<size_t>& total_n_bytes_allocated() { | |
static std::atomic<size_t> s_total_n_bytes_allocated{0}; | |
return s_total_n_bytes_allocated; | |
} | |
/// Managed memory on the Device | |
template<class T> | |
class GPUMemory { | |
private: | |
T* m_data = nullptr; | |
size_t m_size = 0; // Number of elements | |
public: | |
GPUMemory() {} | |
GPUMemory<T>& operator=(GPUMemory<T>&& other) { | |
std::swap(m_data, other.m_data); | |
std::swap(m_size, other.m_size); | |
return *this; | |
} | |
GPUMemory(GPUMemory<T>&& other) { | |
*this = std::move(other); | |
} | |
explicit GPUMemory(const GPUMemory<T>& other) { | |
copy_from_device(other); | |
} | |
void check_guards() const { | |
#if DEBUG_GUARD_SIZE > 0 | |
if (!m_data) | |
return; | |
uint8_t buf[DEBUG_GUARD_SIZE]; | |
const uint8_t *rawptr=(const uint8_t *)m_data; | |
cudaMemcpy(buf, rawptr-DEBUG_GUARD_SIZE, DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost); | |
for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xff) { | |
printf("TRASH BEFORE BLOCK offset %d data %p, read 0x%02x expected 0xff!\n", i, m_data, buf[i] ); | |
break; | |
} | |
cudaMemcpy(buf, rawptr+m_size*sizeof(T), DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost); | |
for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xfe) { | |
printf("TRASH AFTER BLOCK offset %d data %p, read 0x%02x expected 0xfe!\n", i, m_data, buf[i] ); | |
break; | |
} | |
#endif | |
} | |
void allocate_memory(size_t n_bytes) { | |
if (n_bytes == 0) { | |
return; | |
} | |
#ifdef TCNN_VERBOSE_MEMORY_ALLOCS | |
std::cout << "GPUMemory: Allocating " << bytes_to_string(n_bytes) << "." << std::endl; | |
#endif | |
uint8_t *rawptr = nullptr; | |
CUDA_CHECK_THROW(cudaMalloc(&rawptr, n_bytes+DEBUG_GUARD_SIZE*2)); | |
#if DEBUG_GUARD_SIZE > 0 | |
CUDA_CHECK_THROW(cudaMemset(rawptr , 0xff, DEBUG_GUARD_SIZE)); | |
CUDA_CHECK_THROW(cudaMemset(rawptr+n_bytes+DEBUG_GUARD_SIZE , 0xfe, DEBUG_GUARD_SIZE)); | |
#endif | |
if (rawptr) rawptr+=DEBUG_GUARD_SIZE; | |
m_data=(T*)(rawptr); | |
printf("GPUMemory::allocate_memory(): cnt[%d] += [%d]\n", total_n_bytes_allocated().load(), n_bytes); | |
total_n_bytes_allocated() += n_bytes; | |
} | |
void free_memory() { | |
if (!m_data) { | |
return; | |
} | |
uint8_t *rawptr = (uint8_t*)m_data; | |
if (rawptr) rawptr-=DEBUG_GUARD_SIZE; | |
CUDA_CHECK_THROW(cudaFree(rawptr)); | |
printf("GPUMemory()::free_memory(); cnt[%d] -= [%d]\n", total_n_bytes_allocated().load(), get_bytes()); | |
total_n_bytes_allocated() -= get_bytes(); | |
m_data = nullptr; | |
} | |
/// Allocates memory for size items of type T | |
GPUMemory(const size_t size) { | |
resize(size); | |
} | |
/// Frees memory again | |
TCNN_HOST_DEVICE ~GPUMemory() { | |
#ifndef __CUDA_ARCH__ | |
try { | |
if (m_data) { | |
free_memory(); | |
m_size = 0; | |
} | |
} catch (std::runtime_error error) { | |
// Don't need to report on memory-free problems when the driver is shutting down. | |
if (std::string{error.what()}.find("driver shutting down") == std::string::npos) { | |
fprintf(stderr, "Could not free memory: %s\n", error.what()); | |
} | |
} | |
#endif | |
} | |
/** @name Resizing/enlargement | |
* @{ | |
*/ | |
/// Resizes the array to the exact new size, even if it is already larger | |
void resize(const size_t size) { | |
if (m_size != size) { | |
if (m_size) { | |
try { | |
free_memory(); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not free memory: ") + error.what()); | |
} | |
} | |
if (size > 0) { | |
try { | |
allocate_memory(size * sizeof(T)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not allocate memory: ") + error.what()); | |
} | |
} | |
m_size = size; | |
} | |
} | |
/// Enlarges the array if its size is smaller | |
void enlarge(const size_t size) { | |
if (size > m_size) { | |
resize(size); | |
} | |
} | |
/** @} */ | |
/** @name Memset | |
* @{ | |
*/ | |
/// Sets the memory of the first num_elements to value | |
void memset(const int value, const size_t num_elements, const size_t offset = 0) { | |
if (num_elements + offset > m_size) { | |
throw std::runtime_error("Could not set memory: Number of elements larger than allocated memory"); | |
} | |
try { | |
CUDA_CHECK_THROW(cudaMemset(m_data + offset, value, num_elements * sizeof(T))); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not set memory: ") + error.what()); | |
} | |
} | |
/// Sets the memory of all elements to value | |
void memset(const int value) { | |
memset(value, m_size); | |
} | |
/** @} */ | |
/** @name Copy operations | |
* @{ | |
*/ | |
/// Copies num_elements of data from the raw pointer on the host | |
void copy_from_host(const T* host_data, const size_t num_elements) { | |
try { | |
CUDA_CHECK_THROW(cudaMemcpy(data(), host_data, num_elements * sizeof(T), cudaMemcpyHostToDevice)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not copy from host: ") + error.what()); | |
} | |
} | |
/// Copy num_elements from the host vector | |
void copy_from_host(const std::vector<T>& data, const size_t num_elements) { | |
if (data.size() < num_elements) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_from_host(data.data(), num_elements); | |
} | |
/// Copies data from the raw host pointer to fill the entire array | |
void copy_from_host(const T* data) { | |
copy_from_host(data, m_size); | |
} | |
/// Copies num_elements of data from the raw host pointer after enlarging the array so that everything fits in | |
void enlarge_and_copy_from_host(const T* data, const size_t num_elements) { | |
enlarge(num_elements); | |
copy_from_host(data, num_elements); | |
} | |
/// Copies num_elements from the host vector after enlarging the array so that everything fits in | |
void enlarge_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) { | |
enlarge_and_copy_from_host(data.data(), num_elements); | |
} | |
/// Copies the entire host vector after enlarging the array so that everything fits in | |
void enlarge_and_copy_from_host(const std::vector<T>& data) { | |
enlarge_and_copy_from_host(data.data(), data.size()); | |
} | |
/// Copies num_elements of data from the raw host pointer after resizing the array | |
void resize_and_copy_from_host(const T* data, const size_t num_elements) { | |
resize(num_elements); | |
copy_from_host(data, num_elements); | |
} | |
/// Copies num_elements from the host vector after resizing the array | |
void resize_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) { | |
resize_and_copy_from_host(data.data(), num_elements); | |
} | |
/// Copies the entire host vector after resizing the array | |
void resize_and_copy_from_host(const std::vector<T>& data) { | |
resize_and_copy_from_host(data.data(), data.size()); | |
} | |
/// Copies the entire host vector to the device. Fails if there is not enough space available. | |
void copy_from_host(const std::vector<T>& data) { | |
if (data.size() < m_size) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_from_host(data.data(), m_size); | |
} | |
/// Copies num_elements of data from the device to the raw host pointer. Fails if the array holds fewer than num_elements elements. | |
void copy_to_host(T* host_data, const size_t num_elements) const { | |
if (num_elements > m_size) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(m_size)); | |
} | |
try { | |
CUDA_CHECK_THROW(cudaMemcpy(host_data, data(), num_elements * sizeof(T), cudaMemcpyDeviceToHost)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not copy to host: ") + error.what()); | |
} | |
} | |
/// Copies num_elements from the device to a vector on the host | |
void copy_to_host(std::vector<T>& data, const size_t num_elements) const { | |
if (data.size() < num_elements) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_to_host(data.data(), num_elements); | |
} | |
/// Copies all elements from the device to a raw pointer on the host | |
void copy_to_host(T* data) const { | |
copy_to_host(data, m_size); | |
} | |
/// Copies all elements from the device to a vector on the host | |
void copy_to_host(std::vector<T>& data) const { | |
if (data.size() < m_size) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_to_host(data.data(), m_size); | |
} | |
/// Copies size elements from another device array to this one, automatically resizing it | |
void copy_from_device(const GPUMemory<T> &other, const size_t size) { | |
if (size == 0) { | |
return; | |
} | |
if (m_size < size) { | |
resize(size); | |
} | |
try { | |
CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, size * sizeof(T), cudaMemcpyDeviceToDevice)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not copy from device: ") + error.what()); | |
} | |
} | |
/// Copies data from another device array to this one, automatically resizing it | |
void copy_from_device(const GPUMemory<T> &other) { | |
copy_from_device(other, other.m_size); | |
} | |
// Creates an (owned) copy of the data | |
GPUMemory<T> copy(size_t size) const { | |
GPUMemory<T> result{size}; | |
result.copy_from_device(*this); | |
return result; | |
} | |
GPUMemory<T> copy() const { | |
return copy(m_size); | |
} | |
T* data() const { | |
check_guards(); | |
return m_data; | |
} | |
TCNN_HOST_DEVICE T& operator[](size_t idx) const { | |
#ifdef DEBUG_BUFFER_OVERRUN | |
if (idx > m_size) { | |
printf("WARNING: buffer overrun of %p at idx %zu\n", idx); | |
} | |
#endif | |
return m_data[idx]; | |
} | |
TCNN_HOST_DEVICE T& operator[](uint32_t idx) const { | |
#ifdef DEBUG_BUFFER_OVERRUN | |
if (idx > m_size) { | |
printf("WARNING: buffer overrun of %p at idx %u\n", idx); | |
} | |
#endif | |
return m_data[idx]; | |
} | |
size_t get_num_elements() const { | |
return m_size; | |
} | |
size_t size() const { | |
return get_num_elements(); | |
} | |
size_t get_bytes() const { | |
return m_size * sizeof(T); | |
} | |
size_t bytes() const { | |
return get_bytes(); | |
} | |
}; | |
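// Round-trip sketch (illustrative): GPUMemory behaves like a GPU-side std::vector with
// explicit host <-> device copies.
//
//   std::vector<float> host(1024, 1.0f);
//   GPUMemory<float> device;
//   device.resize_and_copy_from_host(host);   // allocates 1024 floats and uploads
//   device.memset(0);                          // zeroes every byte on the device
//   std::vector<float> readback(device.size());
//   device.copy_to_host(readback);             // downloads all elements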
struct Interval { | |
// Inclusive start, exclusive end | |
size_t start, end; | |
bool operator<(const Interval& other) const { | |
return end < other.end; | |
} | |
bool overlaps(const Interval& other) const { | |
return !intersect(other).empty(); | |
} | |
Interval intersect(const Interval& other) const { | |
return {std::max(start, other.start), std::min(end, other.end)}; | |
} | |
bool valid() const { | |
return end >= start; | |
} | |
bool empty() const { | |
return end <= start; | |
} | |
size_t size() const { | |
return end - start; | |
} | |
}; | |
class GPUMemoryArena { | |
public: | |
GPUMemoryArena() { | |
// Align memory at least by a cache line (128 bytes). | |
m_alignment = (size_t)128; | |
m_max_size = next_multiple(cuda_memory_info().total, cuda_memory_granularity()); | |
m_free_intervals = {{0, m_max_size}}; | |
if (!cuda_supports_virtual_memory()) { | |
// Use regular memory as fallback | |
m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>(); | |
static bool printed_warning = false; | |
if (!printed_warning) { | |
printed_warning = true; | |
std::cout | |
<< "GPUMemoryArena: Warning: GPU " << cuda_device() << " does not support virtual memory. " | |
<< "Falling back to regular allocations, which will be larger and can cause occasional stutter." | |
<< std::endl; | |
} | |
return; | |
} | |
// Reserve an address range that would be sufficient for housing the entire | |
// available GPU RAM (if nothing else was using the GPU). This is unlikely | |
// to exhaust all available addresses (even if multiple GPUMemoryArenas are | |
// used simultaneously), while also ensuring that we never exhaust the | |
// reserved address range without running out of physical memory beforehand. | |
CU_CHECK_THROW(cuMemAddressReserve(&m_base_address, m_max_size, 0, 0, 0)); | |
} | |
GPUMemoryArena(GPUMemoryArena&& other) = default; | |
GPUMemoryArena(const GPUMemoryArena& other) = delete; | |
~GPUMemoryArena() { | |
try { | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
if (m_base_address) { | |
printf("~GPUMemoryArena(): cnt[%d] -= [%d]\n", total_n_bytes_allocated().load(), m_size); | |
total_n_bytes_allocated() -= m_size; | |
CU_CHECK_THROW(cuMemUnmap(m_base_address, m_size)); | |
for (const auto& handle : m_handles) { | |
CU_CHECK_THROW(cuMemRelease(handle)); | |
} | |
CU_CHECK_THROW(cuMemAddressFree(m_base_address, m_max_size)); | |
} | |
} catch (std::runtime_error error) { | |
// Don't need to report on memory-free problems when the driver is shutting down. | |
if (std::string{error.what()}.find("driver shutting down") == std::string::npos) { | |
fprintf(stderr, "Could not free memory: %s\n", error.what()); | |
} | |
} | |
} | |
uint8_t* data() { | |
return m_fallback_memory ? m_fallback_memory->data() : (uint8_t*)m_base_address; | |
} | |
std::shared_ptr<GPUMemory<uint8_t>> backing_memory() { | |
return m_fallback_memory; | |
} | |
// Finds the smallest interval of free memory in the GPUMemoryArena that's | |
// large enough to hold the requested number of bytes. Then allocates | |
// that memory. | |
size_t allocate(size_t n_bytes) { | |
// Permitting zero-sized allocations is error prone | |
if (n_bytes == 0) { | |
n_bytes = m_alignment; | |
} | |
// Align allocations with the nearest cache line (at least the granularity of the memory allocations) | |
n_bytes = next_multiple(n_bytes, m_alignment); | |
Interval* best_candidate = &m_free_intervals.back(); | |
for (auto& f : m_free_intervals) { | |
if (f.size() >= n_bytes && f.size() < best_candidate->size()) { | |
best_candidate = &f; | |
} | |
} | |
size_t start = best_candidate->start; | |
m_allocated_intervals[start] = best_candidate->start += n_bytes; | |
printf("GPUMmeoryArena::allocate(): start=[%8x], size=[%d]\n", start, n_bytes); | |
enlarge(size()); | |
return start; | |
} | |
void free(size_t start) { | |
if (m_allocated_intervals.count(start) == 0) { | |
throw std::runtime_error{"Attempted to free arena memory that was not allocated."}; | |
} | |
Interval interval = {start, m_allocated_intervals[start]}; | |
m_allocated_intervals.erase(start); | |
m_free_intervals.insert( | |
std::upper_bound(std::begin(m_free_intervals), std::end(m_free_intervals), interval), | |
interval | |
); | |
merge_adjacent_intervals(); | |
} | |
void enlarge(size_t n_bytes) { | |
if (n_bytes <= m_size) { | |
return; | |
} | |
if (m_fallback_memory) { | |
static const double GROWTH_FACTOR = 1.5; | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
m_size = next_multiple((size_t)(n_bytes * GROWTH_FACTOR), cuda_memory_granularity()); | |
m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>(m_fallback_memory->copy(m_size)); | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
return; | |
} | |
size_t n_bytes_to_allocate = n_bytes - m_size; | |
n_bytes_to_allocate = next_multiple(n_bytes_to_allocate, cuda_memory_granularity()); | |
CUmemAllocationProp prop = {}; | |
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; | |
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; | |
prop.location.id = cuda_device(); | |
m_handles.emplace_back(); | |
CU_CHECK_THROW(cuMemCreate(&m_handles.back(), n_bytes_to_allocate, &prop, 0)); | |
CUmemAccessDesc access_desc = {}; | |
access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; | |
access_desc.location.id = prop.location.id; | |
access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; | |
CU_CHECK_THROW(cuMemMap(m_base_address + m_size, n_bytes_to_allocate, 0, m_handles.back(), 0)); | |
CU_CHECK_THROW(cuMemSetAccess(m_base_address + m_size, n_bytes_to_allocate, &access_desc, 1)); | |
m_size += n_bytes_to_allocate; | |
printf("GPUMemoryArena::enlarge(): cnt[%d] += [%d]\n", total_n_bytes_allocated().load(), n_bytes_to_allocate); | |
total_n_bytes_allocated() += n_bytes_to_allocate; | |
// Need to synchronize the device to make sure memory is available to all streams. | |
if (current_capture()) { | |
current_capture()->schedule_synchronize(); | |
} else { | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
} | |
} | |
size_t size() const { | |
return m_free_intervals.back().start; | |
} | |
std::unordered_map<size_t, size_t> get_allocated_intervals() const { | |
return m_allocated_intervals; | |
} | |
class Allocation { | |
public: | |
Allocation() = default; | |
Allocation(cudaStream_t stream, size_t offset, GPUMemoryArena* workspace) | |
: m_stream{stream}, m_data{workspace->data() + offset}, m_offset{offset}, m_workspace{workspace}, m_backing_memory{workspace->backing_memory()} | |
{ | |
printf("Allocation: m_workspace=[%8x], m_offset=[%8x], m_data=[%8x]\n", m_workspace, m_offset, m_data); | |
} | |
~Allocation() { | |
if (m_workspace) { | |
printf("~Allocation: m_workspace=[%8x], free(m_offset=[%8x], m_data=[%8x]), cnt=[%llu]\n", (void*)m_workspace, (void*)m_offset, (void*)m_data, total_n_bytes_allocated().load()); | |
m_workspace->free(m_offset); | |
} | |
else { | |
printf("~Allocation: m_workspace=[%8x], cnt=[%llu]\n", (void*)m_workspace, total_n_bytes_allocated().load()); | |
} | |
} | |
Allocation(const Allocation& other) = delete; | |
Allocation& operator=(Allocation&& other) { | |
std::swap(m_stream, other.m_stream); | |
std::swap(m_data, other.m_data); | |
std::swap(m_offset, other.m_offset); | |
std::swap(m_workspace, other.m_workspace); | |
std::swap(m_backing_memory, other.m_backing_memory); | |
return *this; | |
} | |
Allocation(Allocation&& other) { | |
*this = std::move(other); | |
} | |
uint8_t* data() { | |
return m_data; | |
} | |
size_t offset() { | |
return m_offset; | |
} | |
const uint8_t* data() const { | |
return m_data; | |
} | |
cudaStream_t stream() const { | |
return m_stream; | |
} | |
private: | |
cudaStream_t m_stream = nullptr; | |
uint8_t* m_data = nullptr; | |
size_t m_offset = 0; | |
GPUMemoryArena* m_workspace = nullptr; | |
// Backing GPUMemory (if backed by a GPUMemory). Ensures that | |
// the backing memory is only freed once all allocations that | |
// use it have been destroyed. | |
std::shared_ptr<GPUMemory<uint8_t>> m_backing_memory = nullptr; | |
}; | |
private: | |
void merge_adjacent_intervals() { | |
size_t j = 0; | |
for (size_t i = 1; i < m_free_intervals.size(); ++i) { | |
Interval& prev = m_free_intervals[j]; | |
Interval& cur = m_free_intervals[i]; | |
if (prev.end == cur.start) { | |
prev.end = cur.end; | |
} else { | |
++j; | |
m_free_intervals[j] = m_free_intervals[i]; | |
} | |
} | |
m_free_intervals.resize(j+1); | |
} | |
std::vector<Interval> m_free_intervals; | |
std::unordered_map<size_t, size_t> m_allocated_intervals; | |
CUdeviceptr m_base_address = {}; | |
size_t m_size = 0; | |
std::vector<CUmemGenericAllocationHandle> m_handles; | |
// Used when virtual memory isn't supported. | |
// Requires more storage + memcpy, but is more portable. | |
std::shared_ptr<GPUMemory<uint8_t>> m_fallback_memory = nullptr; | |
size_t m_alignment; | |
size_t m_max_size; | |
}; | |
inline std::unordered_map<cudaStream_t, GPUMemoryArena>& gpu_memory_arenas() { | |
static std::unordered_map<cudaStream_t, GPUMemoryArena> s_gpu_memory_arenas; | |
return s_gpu_memory_arenas; | |
} | |
inline GPUMemoryArena::Allocation allocate_workspace(cudaStream_t stream, size_t n_bytes) { | |
if (n_bytes == 0) { | |
// Return a null allocation if no bytes were requested. | |
return {}; | |
} | |
auto& arena = gpu_memory_arenas()[stream]; | |
return GPUMemoryArena::Allocation{stream, arena.allocate(n_bytes), &arena}; | |
} | |
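// Workspace sketch (illustrative): allocate_workspace hands out stream-ordered scratch memory
// from the per-stream arena; the returned Allocation is an RAII handle that gives its interval
// back to the arena on destruction. `stream` is assumed to be a valid cudaStream_t owned by
// the caller.
//
//   {
//       auto scratch = allocate_workspace(stream, 1024 * 1024);   // 1 MiB of temporary memory
//       // ... enqueue kernels on `stream` that read/write scratch.data() ...
//   }   // interval is returned to the arena here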
static size_t align_to_cacheline(size_t bytes) { | |
return next_multiple(bytes, (size_t)128); | |
} | |
template <typename First, typename FirstSize> | |
std::tuple<First*> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size) { | |
*alloc = allocate_workspace(stream, offset + align_to_cacheline(first_size * sizeof(First))); | |
return std::make_tuple<First*>((First*)(alloc->data() + offset)); | |
} | |
template <typename First, typename ...Types, typename FirstSize, typename ...Sizes, std::enable_if_t<sizeof...(Types) != 0 && sizeof...(Types) == sizeof...(Sizes), int> = 0> | |
std::tuple<First*, Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size, Sizes... sizes) { | |
auto nested = allocate_workspace_and_distribute<Types...>(stream, alloc, offset + align_to_cacheline(first_size * sizeof(First)), sizes...); | |
return std::tuple_cat(std::make_tuple<First*>((First*)(alloc->data() + offset)), nested); | |
} | |
template <typename ...Types, typename ...Sizes, std::enable_if_t<sizeof...(Types) == sizeof...(Sizes), int> = 0> | |
std::tuple<Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, Sizes... sizes) { | |
return allocate_workspace_and_distribute<Types...>(stream, alloc, (size_t)0, sizes...); | |
} | |
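// Distribution sketch (illustrative): carve a single workspace allocation into several
// cacheline-aligned, typed sub-buffers in one call. Types and element counts are paired
// positionally; `stream` and `n_elements` below are hypothetical.
//
//   GPUMemoryArena::Allocation alloc;
//   float* positions; uint32_t* labels; float* losses;
//   std::tie(positions, labels, losses) = allocate_workspace_and_distribute<float, uint32_t, float>(
//       stream, &alloc,
//       3 * n_elements,   // positions: 3 floats per element
//       n_elements,       // labels
//       n_elements        // losses
//   );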
inline void free_gpu_memory_arena(cudaStream_t stream) { | |
gpu_memory_arenas().erase(stream); | |
} | |
inline void free_all_gpu_memory_arenas() { | |
gpu_memory_arenas().clear(); | |
} | |
TCNN_NAMESPACE_END |