tiny-cuda-nn-debug-fork
/* | |
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without modification, are permitted | |
* provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright notice, this list of | |
* conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright notice, this list of | |
* conditions and the following disclaimer in the documentation and/or other materials | |
* provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used | |
* to endorse or promote products derived from this software without specific prior written | |
* permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR | |
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE | |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
*/ | |
/** @file gpu_matrix.h | |
* @author Thomas Müller, NVIDIA | |
* @brief Matrix whose data resides in GPU (CUDA) memory | |
*/ | |
#pragma once | |
#include <tiny-cuda-nn/common.h> | |
#include <tiny-cuda-nn/gpu_memory.h> | |
#include <tiny-cuda-nn/matrix_layout.h> | |
#include <pcg32/pcg32.h> | |
#include <stdexcept> | |
#include <stdint.h> | |
#include <string> | |
#include <vector> | |
TCNN_NAMESPACE_BEGIN | |
template<typename T> | |
class GPUMatrixDynamic; | |
template<typename T, MatrixLayout _layout> | |
class GPUMatrix; | |
class GPUMatrixBase { | |
public: | |
virtual ~GPUMatrixBase() {} | |
virtual size_t n_bytes() const = 0; | |
virtual void set_data_unsafe(void* data) = 0; | |
static void allocate_shared_memory(GPUMemory<char>& memory, const std::vector<GPUMatrixBase*>& matrices) { | |
size_t total_n_bytes = 0; | |
for (auto* matrix : matrices) { | |
total_n_bytes += matrix->n_bytes(); | |
} | |
if (memory.bytes() < total_n_bytes) { | |
#ifdef TCNN_VERBOSE_MEMORY_ALLOCS | |
std::cout << "GPUMatrix: Allocating " << bytes_to_string(total_n_bytes) << " shared among " << matrices.size() << " matrices." << std::endl; | |
#endif | |
memory.resize(total_n_bytes); | |
} | |
size_t offset = 0; | |
for (auto* matrix : matrices) { | |
matrix->set_data_unsafe(memory.data() + offset); | |
offset += matrix->n_bytes(); | |
} | |
} | |
template <typename T> | |
static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices); | |
template <typename T, MatrixLayout layout> | |
static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices); | |
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, const std::vector<GPUMatrixBase*>& matrices) { | |
size_t total_n_bytes = 0; | |
for (auto* matrix : matrices) { | |
total_n_bytes += matrix->n_bytes(); | |
} | |
auto alloc = allocate_workspace(stream, total_n_bytes); | |
size_t offset = 0; | |
for (auto* matrix : matrices) { | |
matrix->set_data_unsafe(alloc.data() + offset); | |
offset += matrix->n_bytes(); | |
} | |
return alloc; | |
} | |
template <typename T> | |
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices); | |
template <typename T, MatrixLayout layout> | |
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices); | |
}; | |
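// Usage sketch (illustrative; not part of the original header). The helpers above let several
// matrices share one backing buffer: each matrix is created with its shape only, then pointed
// into consecutive offsets of the buffer. Variable names and shapes below are hypothetical.
//
//   GPUMemory<char> backing;
//   GPUMatrix<float> weights{nullptr, 64, 64};
//   GPUMatrix<float> biases{nullptr, 64, 1};
//   std::vector<GPUMatrixBase*> mats = {&weights, &biases};
//   GPUMatrixBase::allocate_shared_memory(backing, mats);
//   // weights.data() and biases.data() now point at consecutive offsets inside `backing`.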
template <typename T> | |
struct MatrixView { | |
TCNN_HOST_DEVICE MatrixView() : data{nullptr}, stride_i{0}, stride_j{0} {} | |
TCNN_HOST_DEVICE MatrixView(T* data, uint32_t stride_i, uint32_t stride_j) : data{data}, stride_i{stride_i}, stride_j{stride_j} {} | |
TCNN_HOST_DEVICE MatrixView(const MatrixView<std::remove_const_t<T>>& other) : data{other.data}, stride_i{other.stride_i}, stride_j{other.stride_j} {} | |
TCNN_HOST_DEVICE T& operator()(uint32_t i, uint32_t j = 0) const { | |
return data[i * stride_i + j * stride_j]; | |
} | |
TCNN_HOST_DEVICE void advance(uint32_t m, uint32_t n) { | |
data = &(*this)(m, n); | |
} | |
TCNN_HOST_DEVICE void advance_rows(uint32_t m) { | |
advance(m, 0); | |
} | |
TCNN_HOST_DEVICE void advance_cols(uint32_t n) { | |
advance(0, n); | |
} | |
TCNN_HOST_DEVICE explicit operator bool() const { | |
return data; | |
} | |
T* data; | |
uint32_t stride_i, stride_j; | |
}; | |
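// Usage sketch (illustrative): MatrixView decouples element indexing from the storage order,
// so device code can address element (i, j) without knowing whether the underlying matrix is
// row- or column-major. The kernel below is hypothetical.
//
//   __global__ void add_one(MatrixView<float> v, uint32_t n_rows, uint32_t n_cols) {
//       uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
//       uint32_t j = blockIdx.y * blockDim.y + threadIdx.y;
//       if (i < n_rows && j < n_cols) {
//           v(i, j) += 1.0f;
//       }
//   }
//
//   // Host side: GPUMatrix<float> m{128, 256};
//   //            add_one<<<blocks, threads>>>(m.view(), m.rows(), m.cols());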
template <typename T> | |
class GPUMatrixDynamic : public GPUMatrixBase { | |
public: | |
using Type = T; | |
// Owning its memory as a GPUMemory<T> | |
GPUMatrixDynamic(uint32_t m, uint32_t n, MatrixLayout layout = CM) | |
: m_rows{m}, m_cols{n}, m_layout{layout} { | |
m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(m * n * sizeof(T)); | |
m_data = (T*)m_malloc_allocation->data(); | |
set_stride_contiguous(); | |
} | |
// Owning its memory as an allocation from a stream's memory arena | |
GPUMatrixDynamic(uint32_t m, uint32_t n, cudaStream_t stream, MatrixLayout layout = CM) | |
: m_rows{m}, m_cols{n}, m_layout{layout} { | |
m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, m * n * sizeof(T))); | |
m_data = (T*)m_arena_allocation->data(); | |
set_stride_contiguous(); | |
} | |
// Pointing to external memory | |
explicit GPUMatrixDynamic(T* data, uint32_t m, uint32_t n, MatrixLayout layout = CM, uint32_t stride = 0, std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr, std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr) | |
: m_data{data}, m_layout{layout}, m_malloc_allocation{malloc_allocation}, m_arena_allocation{arena_allocation} { | |
set(data, m, n, stride); | |
} | |
GPUMatrixDynamic() : GPUMatrixDynamic{nullptr, 0, 0} {} | |
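// Construction modes, sketched (illustrative; `stream` is assumed to be a valid cudaStream_t
// owned by the caller):
//
//   GPUMatrixDynamic<float> a{128, 256};              // owns a fresh cudaMalloc-backed GPUMemory, column-major
//   GPUMatrixDynamic<float> b{128, 256, stream, RM};  // owns an arena allocation on `stream`, row-major
//   GPUMatrixDynamic<float> c{a.data(), 128, 256};    // non-owning view of a's memory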
GPUMatrixDynamic<T>& operator=(GPUMatrixDynamic<T>&& other) { | |
std::swap(m_data, other.m_data); | |
std::swap(m_rows, other.m_rows); | |
std::swap(m_cols, other.m_cols); | |
std::swap(m_stride, other.m_stride); | |
std::swap(m_layout, other.m_layout); | |
std::swap(m_malloc_allocation, other.m_malloc_allocation); | |
std::swap(m_arena_allocation, other.m_arena_allocation); | |
return *this; | |
} | |
GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) { | |
*this = std::move(other); | |
} | |
GPUMatrixDynamic(const GPUMatrixDynamic<T>& other) = delete; | |
virtual ~GPUMatrixDynamic() {} | |
void set_data_unsafe(void* data) override { m_data = (T*)data; } | |
void set_size_unsafe(uint32_t rows, uint32_t cols, uint32_t stride = 0) { | |
m_rows = rows; | |
m_cols = cols; | |
if (stride == 0) { | |
set_stride_contiguous(); | |
} else { | |
m_stride = stride; | |
} | |
} | |
void set(T* data, uint32_t rows, uint32_t cols, uint32_t stride = 0) { | |
set_data_unsafe(data); | |
set_size_unsafe(rows, cols, stride); | |
} | |
void resize(uint32_t rows, uint32_t cols) { | |
if (m_arena_allocation) { | |
cudaStream_t stream = m_arena_allocation->stream(); | |
m_arena_allocation.reset(); | |
m_arena_allocation = std::make_shared<GPUMemoryArena::Allocation>(allocate_workspace(stream, rows * cols * sizeof(T))); | |
} else if (m_malloc_allocation) { | |
m_malloc_allocation.reset(); | |
m_malloc_allocation = std::make_shared<GPUMemory<uint8_t>>(rows * cols * sizeof(T)); | |
} else { | |
throw std::runtime_error{"GPUMatrix::resize is not permitted when the underlying memory is not owned. Use GPUMatrix::set instead."}; | |
} | |
set_size_unsafe(rows, cols); | |
} | |
uint32_t stride_contiguous() const { | |
return m_layout == CM ? m() : n(); | |
} | |
bool is_contiguous() const { | |
return m_stride == stride_contiguous(); | |
} | |
void set_stride_contiguous() { | |
m_stride = stride_contiguous(); | |
} | |
GPUMatrixDynamic<T> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const { | |
return GPUMatrixDynamic<T>{ | |
data() + (layout() == CM ? (offset_rows + offset_cols * stride()) : (offset_cols + offset_rows * stride())), | |
new_rows, | |
new_cols, | |
layout(), | |
stride(), | |
m_malloc_allocation, | |
m_arena_allocation, | |
}; | |
} | |
GPUMatrixDynamic<T> slice_rows(uint32_t offset, uint32_t size) const { | |
return slice(offset, size, 0, cols()); | |
} | |
GPUMatrixDynamic<T> slice_cols(uint32_t offset, uint32_t size) const { | |
return slice(0, rows(), offset, size); | |
} | |
GPUMatrixDynamic<T> alias() const { | |
return slice(0, rows(), 0, cols()); | |
} | |
MatrixView<T> view() const { | |
return {data(), layout() == CM ? 1u : stride(), layout() == CM ? stride() : 1u}; | |
} | |
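// Slicing sketch (illustrative): slices alias the parent's memory and hold a reference to the
// owning allocation, so they remain valid even if the parent object is destroyed first.
//
//   GPUMatrixDynamic<float> batch{64, 4096};   // column-major, 64 rows x 4096 columns
//   auto left  = batch.slice_cols(0, 2048);     // columns [0, 2048), still contiguous
//   auto inner = batch.slice_rows(16, 32);      // rows [16, 48), non-contiguous (stride stays 64)
//   MatrixView<float> v = batch.view();         // (i, j) indexing for kernels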
uint32_t rows() const { return m_rows; } | |
uint32_t fan_out() const { return m_rows; } | |
uint32_t m() const { return m_rows; } | |
uint32_t cols() const { return m_cols; } | |
uint32_t fan_in() const { return m_cols; } | |
uint32_t n() const { return m_cols; } | |
uint32_t stride() const { return m_stride; } | |
PitchedPtr<T> pitched_ptr() { return {data(), stride()}; } | |
PitchedPtr<const T> pitched_ptr() const { return {data(), stride()}; } | |
uint32_t n_elements() const { return m_rows * m_cols; } | |
size_t n_bytes() const override { return n_elements() * sizeof(T); } | |
MatrixLayout layout() const { return m_layout; } | |
MatrixLayout transposed_layout() const { return m_layout == RM ? CM : RM; } | |
T* data() const { return m_data; } | |
void memset(int value) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
CUDA_CHECK_THROW(cudaMemset(data(), value, n_bytes())); | |
} | |
void memset_async(cudaStream_t stream, int value) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
CUDA_CHECK_THROW(cudaMemsetAsync(data(), value, n_bytes(), stream)); | |
} | |
// Various initializations | |
void initialize_xavier_uniform(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(6.0f / (float)(fan_in() + fan_out())); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_fa_uniform_forward(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(1.0f / (float)fan_in()); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_fa_uniform_backward(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(1.0f / (float)fan_out()); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_siren_uniform(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
scale *= std::sqrt(6.0f / (float)fan_in()); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_siren_uniform_first(pcg32& rnd, float scale = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
// Define probability distribution | |
// The 30 in the first layer comes from https://vsitzmann.github.io/siren/ | |
scale *= 30.0f / (float)fan_in(); | |
// Sample initialized values | |
std::vector<T> new_data(n_elements()); | |
for (size_t i = 0; i < new_data.size(); ++i) { | |
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale); | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_constant(float val) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
std::vector<T> new_data(n_elements(), (T)val); | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
void initialize_diagonal(float val = 1) { | |
CHECK_THROW(data()); | |
CHECK_THROW(is_contiguous()); | |
CHECK_THROW(n() == m()); // Must be square for diagonal init to make sense | |
std::vector<T> new_data(n_elements(), (T)0); | |
for (uint32_t i = 0; i < n(); ++i) { | |
new_data[i + i*n()] = (T)val; | |
} | |
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice)); | |
} | |
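// Initialization sketch (illustrative): each initializer fills a host-side staging vector and
// copies it to the device in one cudaMemcpy, so these are meant for setup time rather than
// per-iteration use.
//
//   pcg32 rng{1337};
//   GPUMatrix<float, RM> w{256, 256};
//   w.initialize_xavier_uniform(rng);   // uniform in [-s, s] with s = sqrt(6 / (fan_in + fan_out))
//   GPUMatrix<float> eye{16, 16};
//   eye.initialize_diagonal();          // identity: ones on the diagonal, zeros elsewhere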
GPUMatrixDynamic<T> transposed() const { | |
return GPUMatrixDynamic<T>(data(), n(), m(), transposed_layout(), stride(), m_malloc_allocation, m_arena_allocation); | |
} | |
GPUMatrix<T, RM> rm() const { | |
CHECK_THROW(m_layout == RM); | |
return GPUMatrix<T, RM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation); | |
} | |
GPUMatrix<T, CM> cm() const { | |
CHECK_THROW(m_layout == CM); | |
return GPUMatrix<T, CM>(data(), m(), n(), stride(), m_malloc_allocation, m_arena_allocation); | |
} | |
private: | |
T* m_data; | |
uint32_t m_rows, m_cols, m_stride; | |
MatrixLayout m_layout; | |
// References to corresponding memory allocations. These ensure that | |
// m_data does not accidentally become dangling. | |
std::shared_ptr<GPUMemory<uint8_t>> m_malloc_allocation; | |
std::shared_ptr<GPUMemoryArena::Allocation> m_arena_allocation; | |
}; | |
template <typename T, MatrixLayout _layout = MatrixLayout::ColumnMajor> | |
class GPUMatrix : public GPUMatrixDynamic<T> { | |
public: | |
static const MatrixLayout static_layout = _layout; | |
static const MatrixLayout static_transposed_layout = _layout == RM ? CM : RM; | |
// Owning its memory as a GPUMemory<T> | |
GPUMatrix(uint32_t m, uint32_t n) | |
: GPUMatrixDynamic<T>{m, n, static_layout} { } | |
// Owning its memory as an allocation from a stream's memory arena | |
GPUMatrix(uint32_t m, uint32_t n, cudaStream_t stream) | |
: GPUMatrixDynamic<T>{m, n, stream, static_layout} { } | |
// Pointing to external memory | |
explicit GPUMatrix(T* data, uint32_t m, uint32_t n, uint32_t stride = 0, std::shared_ptr<GPUMemory<uint8_t>> malloc_allocation = nullptr, std::shared_ptr<GPUMemoryArena::Allocation> arena_allocation = nullptr) | |
: GPUMatrixDynamic<T>{data, m, n, static_layout, stride, malloc_allocation, arena_allocation} { } | |
GPUMatrix() : GPUMatrix{nullptr, 0, 0} {} | |
GPUMatrix<T, static_layout>& operator=(GPUMatrixDynamic<T>&& other) { | |
*((GPUMatrixDynamic<T>*)this) = std::move(other); | |
if (static_layout != this->layout()) { | |
throw std::runtime_error{"GPUMatrix must be constructed from a GPUMatrixDynamic with matching layout."}; | |
} | |
return *this; | |
} | |
GPUMatrix(GPUMatrixDynamic<T>&& other) noexcept { | |
*this = std::move(other); | |
} | |
GPUMatrix<T, static_layout>& operator=(GPUMatrix<T, static_layout>&& other) noexcept { | |
*((GPUMatrixDynamic<T>*)this) = std::move(other); | |
return *this; | |
} | |
GPUMatrix(GPUMatrix<T, static_layout>&& other) noexcept { | |
*this = std::move(other); | |
} | |
GPUMatrix(const GPUMatrixDynamic<T>& other) = delete; | |
virtual ~GPUMatrix() {} | |
GPUMatrix<T, static_layout> slice(uint32_t offset_rows, uint32_t new_rows, uint32_t offset_cols, uint32_t new_cols) const { | |
return ((GPUMatrixDynamic<T>*)this)->slice(offset_rows, new_rows, offset_cols, new_cols); | |
} | |
GPUMatrix<T, static_layout> slice_rows(uint32_t offset, uint32_t size) const { | |
return ((GPUMatrixDynamic<T>*)this)->slice_rows(offset, size); | |
} | |
GPUMatrix<T, static_layout> slice_cols(uint32_t offset, uint32_t size) const { | |
return ((GPUMatrixDynamic<T>*)this)->slice_cols(offset, size); | |
} | |
GPUMatrix<T, static_layout> alias() const { | |
return ((GPUMatrixDynamic<T>*)this)->alias(); | |
} | |
GPUMatrix<T, static_transposed_layout> transposed() const { | |
return ((GPUMatrixDynamic<T>*)this)->transposed(); | |
} | |
}; | |
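// Layout sketch (illustrative): GPUMatrix fixes the layout at compile time, while
// GPUMatrixDynamic carries it at runtime; rm()/cm() convert back when the runtime layout
// matches, and transposed() reinterprets the same memory.
//
//   GPUMatrix<float, RM> w{16, 32};              // row-major, 16 x 32
//   GPUMatrix<float, CM> wt = w.transposed();    // same memory viewed as 32 x 16, column-major
//   GPUMatrixDynamic<float> d{64, 64, RM};       // layout chosen at runtime
//   GPUMatrix<float, RM> checked = d.rm();       // CHECK_THROWs if d were not row-major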
template <typename T> | |
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrixDynamic<T>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
allocate_shared_memory(memory, matrix_pointers); | |
} | |
template <typename T, MatrixLayout layout> | |
void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
allocate_shared_memory(memory, matrix_pointers); | |
} | |
template <typename T> | |
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
return allocate_shared_memory(stream, matrix_pointers); | |
} | |
template <typename T, MatrixLayout layout> | |
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices) { | |
std::vector<GPUMatrixBase*> matrix_pointers; | |
for (auto& matrix : matrices) { | |
matrix_pointers.emplace_back(&matrix); | |
} | |
return allocate_shared_memory(stream, matrix_pointers); | |
} | |
TCNN_NAMESPACE_END |
/* | |
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without modification, are permitted | |
* provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright notice, this list of | |
* conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright notice, this list of | |
* conditions and the following disclaimer in the documentation and/or other materials | |
* provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used | |
* to endorse or promote products derived from this software without specific prior written | |
* permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR | |
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE | |
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
*/ | |
/** @file gpu_memory.h | |
* @author Thomas Müller and Nikolaus Binder, NVIDIA | |
* @brief Managed memory on the GPU. Like a std::vector, memory is allocated either explicitly (resize/enlarge) | |
* or implicitly (resize_and_copy_from_host etc). Memory is always and automatically released in the destructor. | |
* Also contains a GPU memory arena for light-weight stream-ordered allocations of temporary memory. The | |
* memory arena makes use of virtual memory when available to avoid re-allocations during progressive growing. | |
*/ | |
#pragma once | |
#include <tiny-cuda-nn/common.h> | |
#include <tiny-cuda-nn/cuda_graph.h> | |
#include <cuda.h> | |
#include <algorithm> | |
#include <atomic> | |
#include <stdexcept> | |
#include <stdint.h> | |
#include <string> | |
#include <tuple> | |
#include <unordered_map> | |
#include <vector> | |
TCNN_NAMESPACE_BEGIN | |
#define DEBUG_GUARD_SIZE 0 | |
inline std::atomic<size_t>& total_n_bytes_allocated() { | |
static std::atomic<size_t> s_total_n_bytes_allocated{0}; | |
return s_total_n_bytes_allocated; | |
} | |
/// Managed memory on the Device | |
template<class T> | |
class GPUMemory { | |
private: | |
T* m_data = nullptr; | |
size_t m_size = 0; // Number of elements | |
public: | |
GPUMemory() {} | |
GPUMemory<T>& operator=(GPUMemory<T>&& other) { | |
std::swap(m_data, other.m_data); | |
std::swap(m_size, other.m_size); | |
return *this; | |
} | |
GPUMemory(GPUMemory<T>&& other) { | |
*this = std::move(other); | |
} | |
explicit GPUMemory(const GPUMemory<T>& other) { | |
copy_from_device(other); | |
} | |
void check_guards() const { | |
#if DEBUG_GUARD_SIZE > 0 | |
if (!m_data) | |
return; | |
uint8_t buf[DEBUG_GUARD_SIZE]; | |
const uint8_t *rawptr=(const uint8_t *)m_data; | |
cudaMemcpy(buf, rawptr-DEBUG_GUARD_SIZE, DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost); | |
for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xff) { | |
printf("TRASH BEFORE BLOCK offset %d data %p, read 0x%02x expected 0xff!\n", i, m_data, buf[i] ); | |
break; | |
} | |
cudaMemcpy(buf, rawptr+m_size*sizeof(T), DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost); | |
for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xfe) { | |
printf("TRASH AFTER BLOCK offset %d data %p, read 0x%02x expected 0xfe!\n", i, m_data, buf[i] ); | |
break; | |
} | |
#endif | |
} | |
void allocate_memory(size_t n_bytes) { | |
if (n_bytes == 0) { | |
return; | |
} | |
#ifdef TCNN_VERBOSE_MEMORY_ALLOCS | |
std::cout << "GPUMemory: Allocating " << bytes_to_string(n_bytes) << "." << std::endl; | |
#endif | |
uint8_t *rawptr = nullptr; | |
CUDA_CHECK_THROW(cudaMalloc(&rawptr, n_bytes+DEBUG_GUARD_SIZE*2)); | |
#if DEBUG_GUARD_SIZE > 0 | |
CUDA_CHECK_THROW(cudaMemset(rawptr , 0xff, DEBUG_GUARD_SIZE)); | |
CUDA_CHECK_THROW(cudaMemset(rawptr+n_bytes+DEBUG_GUARD_SIZE , 0xfe, DEBUG_GUARD_SIZE)); | |
#endif | |
if (rawptr) rawptr+=DEBUG_GUARD_SIZE; | |
m_data=(T*)(rawptr); | |
printf("GPUMemory::allocate_memory(): cnt[%d] += [%d]\n", total_n_bytes_allocated().load(), n_bytes); | |
total_n_bytes_allocated() += n_bytes; | |
} | |
void free_memory() { | |
if (!m_data) { | |
return; | |
} | |
uint8_t *rawptr = (uint8_t*)m_data; | |
if (rawptr) rawptr-=DEBUG_GUARD_SIZE; | |
CUDA_CHECK_THROW(cudaFree(rawptr)); | |
printf("GPUMemory()::free_memory(); cnt[%d] -= [%d]\n", total_n_bytes_allocated().load(), get_bytes()); | |
total_n_bytes_allocated() -= get_bytes(); | |
m_data = nullptr; | |
} | |
/// Allocates memory for size items of type T | |
GPUMemory(const size_t size) { | |
resize(size); | |
} | |
/// Frees memory again | |
TCNN_HOST_DEVICE ~GPUMemory() { | |
#ifndef __CUDA_ARCH__ | |
try { | |
if (m_data) { | |
free_memory(); | |
m_size = 0; | |
} | |
} catch (std::runtime_error error) { | |
// Don't need to report on memory-free problems when the driver is shutting down. | |
if (std::string{error.what()}.find("driver shutting down") == std::string::npos) { | |
fprintf(stderr, "Could not free memory: %s\n", error.what()); | |
} | |
} | |
#endif | |
} | |
/** @name Resizing/enlargement | |
* @{ | |
*/ | |
/// Resizes the array to the exact new size, even if it is already larger | |
void resize(const size_t size) { | |
if (m_size != size) { | |
if (m_size) { | |
try { | |
free_memory(); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not free memory: ") + error.what()); | |
} | |
} | |
if (size > 0) { | |
try { | |
allocate_memory(size * sizeof(T)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not allocate memory: ") + error.what()); | |
} | |
} | |
m_size = size; | |
} | |
} | |
/// Enlarges the array if its size is smaller | |
void enlarge(const size_t size) { | |
if (size > m_size) { | |
resize(size); | |
} | |
} | |
/** @} */ | |
/** @name Memset | |
* @{ | |
*/ | |
/// Sets the memory of the first num_elements to value | |
void memset(const int value, const size_t num_elements, const size_t offset = 0) { | |
if (num_elements + offset > m_size) { | |
throw std::runtime_error("Could not set memory: Number of elements larger than allocated memory"); | |
} | |
try { | |
CUDA_CHECK_THROW(cudaMemset(m_data + offset, value, num_elements * sizeof(T))); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not set memory: ") + error.what()); | |
} | |
} | |
/// Sets the memory of all elements to value | |
void memset(const int value) { | |
memset(value, m_size); | |
} | |
/** @} */ | |
/** @name Copy operations | |
* @{ | |
*/ | |
/// Copies num_elements of data from the raw pointer on the host | |
void copy_from_host(const T* host_data, const size_t num_elements) { | |
try { | |
CUDA_CHECK_THROW(cudaMemcpy(data(), host_data, num_elements * sizeof(T), cudaMemcpyHostToDevice)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not copy from host: ") + error.what()); | |
} | |
} | |
/// Copy num_elements from the host vector | |
void copy_from_host(const std::vector<T>& data, const size_t num_elements) { | |
if (data.size() < num_elements) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_from_host(data.data(), num_elements); | |
} | |
/// Copies data from the raw host pointer to fill the entire array | |
void copy_from_host(const T* data) { | |
copy_from_host(data, m_size); | |
} | |
/// Copies num_elements of data from the raw host pointer after enlarging the array so that everything fits in | |
void enlarge_and_copy_from_host(const T* data, const size_t num_elements) { | |
enlarge(num_elements); | |
copy_from_host(data, num_elements); | |
} | |
/// Copies num_elements from the host vector after enlarging the array so that everything fits in | |
void enlarge_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) { | |
enlarge_and_copy_from_host(data.data(), num_elements); | |
} | |
/// Copies the entire host vector after enlarging the array so that everything fits in | |
void enlarge_and_copy_from_host(const std::vector<T>& data) { | |
enlarge_and_copy_from_host(data.data(), data.size()); | |
} | |
/// Copies num_elements of data from the raw host pointer after resizing the array | |
void resize_and_copy_from_host(const T* data, const size_t num_elements) { | |
resize(num_elements); | |
copy_from_host(data, num_elements); | |
} | |
/// Copies num_elements from the host vector after resizing the array | |
void resize_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) { | |
resize_and_copy_from_host(data.data(), num_elements); | |
} | |
/// Copies the entire host vector after resizing the array | |
void resize_and_copy_from_host(const std::vector<T>& data) { | |
resize_and_copy_from_host(data.data(), data.size()); | |
} | |
/// Copies the entire host vector to the device. Fails if there is not enough space available. | |
void copy_from_host(const std::vector<T>& data) { | |
if (data.size() < m_size) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_from_host(data.data(), m_size); | |
} | |
/// Copies num_elements of data from the device to the raw host pointer. Fails if the array holds fewer than num_elements elements. | |
void copy_to_host(T* host_data, const size_t num_elements) const { | |
if (num_elements > m_size) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(m_size)); | |
} | |
try { | |
CUDA_CHECK_THROW(cudaMemcpy(host_data, data(), num_elements * sizeof(T), cudaMemcpyDeviceToHost)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not copy to host: ") + error.what()); | |
} | |
} | |
/// Copies num_elements from the device to a vector on the host | |
void copy_to_host(std::vector<T>& data, const size_t num_elements) const { | |
if (data.size() < num_elements) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_to_host(data.data(), num_elements); | |
} | |
/// Copies all elements from the device to a raw pointer on the host | |
void copy_to_host(T* data) const { | |
copy_to_host(data, m_size); | |
} | |
/// Copies all elements from the device to a vector on the host | |
void copy_to_host(std::vector<T>& data) const { | |
if (data.size() < m_size) { | |
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); | |
} | |
copy_to_host(data.data(), m_size); | |
} | |
/// Copies size elements from another device array to this one, automatically resizing it | |
void copy_from_device(const GPUMemory<T> &other, const size_t size) { | |
if (size == 0) { | |
return; | |
} | |
if (m_size < size) { | |
resize(size); | |
} | |
try { | |
CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, size * sizeof(T), cudaMemcpyDeviceToDevice)); | |
} catch (std::runtime_error error) { | |
throw std::runtime_error(std::string("Could not copy from device: ") + error.what()); | |
} | |
} | |
/// Copies data from another device array to this one, automatically resizing it | |
void copy_from_device(const GPUMemory<T> &other) { | |
copy_from_device(other, other.m_size); | |
} | |
// Creates an (owned) copy of the data | |
GPUMemory<T> copy(size_t size) const { | |
GPUMemory<T> result{size}; | |
result.copy_from_device(*this); | |
return result; | |
} | |
GPUMemory<T> copy() const { | |
return copy(m_size); | |
} | |
T* data() const { | |
check_guards(); | |
return m_data; | |
} | |
TCNN_HOST_DEVICE T& operator[](size_t idx) const { | |
#ifdef DEBUG_BUFFER_OVERRUN | |
if (idx > m_size) { | |
printf("WARNING: buffer overrun of %p at idx %zu\n", idx); | |
} | |
#endif | |
return m_data[idx]; | |
} | |
TCNN_HOST_DEVICE T& operator[](uint32_t idx) const { | |
#ifdef DEBUG_BUFFER_OVERRUN | |
if (idx > m_size) { | |
printf("WARNING: buffer overrun of %p at idx %u\n", idx); | |
} | |
#endif | |
return m_data[idx]; | |
} | |
size_t get_num_elements() const { | |
return m_size; | |
} | |
size_t size() const { | |
return get_num_elements(); | |
} | |
size_t get_bytes() const { | |
return m_size * sizeof(T); | |
} | |
size_t bytes() const { | |
return get_bytes(); | |
} | |
}; | |
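// Round-trip sketch (illustrative): GPUMemory behaves like a GPU-side std::vector with
// explicit host <-> device copies.
//
//   std::vector<float> host(1024, 1.0f);
//   GPUMemory<float> device;
//   device.resize_and_copy_from_host(host);   // allocates 1024 floats and uploads
//   device.memset(0);                          // zeroes every byte on the device
//   std::vector<float> readback(device.size());
//   device.copy_to_host(readback);             // downloads all elements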
struct Interval { | |
// Inclusive start, exclusive end | |
size_t start, end; | |
bool operator<(const Interval& other) const { | |
return end < other.end; | |
} | |
bool overlaps(const Interval& other) const { | |
return !intersect(other).empty(); | |
} | |
Interval intersect(const Interval& other) const { | |
return {std::max(start, other.start), std::min(end, other.end)}; | |
} | |
bool valid() const { | |
return end >= start; | |
} | |
bool empty() const { | |
return end <= start; | |
} | |
size_t size() const { | |
return end - start; | |
} | |
}; | |
class GPUMemoryArena { | |
public: | |
GPUMemoryArena() { | |
// Align memory at least by a cache line (128 bytes). | |
m_alignment = (size_t)128; | |
m_max_size = next_multiple(cuda_memory_info().total, cuda_memory_granularity()); | |
m_free_intervals = {{0, m_max_size}}; | |
if (!cuda_supports_virtual_memory()) { | |
// Use regular memory as fallback | |
m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>(); | |
static bool printed_warning = false; | |
if (!printed_warning) { | |
printed_warning = true; | |
std::cout | |
<< "GPUMemoryArena: Warning: GPU " << cuda_device() << " does not support virtual memory. " | |
<< "Falling back to regular allocations, which will be larger and can cause occasional stutter." | |
<< std::endl; | |
} | |
return; | |
} | |
// Reserve an address range that would be sufficient for housing the entire | |
// available GPU RAM (if nothing else was using the GPU). This is unlikely | |
// to exhaust all available addresses (even if multiple GPUMemoryArenas are | |
// used simultaneously), while also ensuring that we never exhaust the | |
// reserved address range without running out of physical memory beforehand. | |
CU_CHECK_THROW(cuMemAddressReserve(&m_base_address, m_max_size, 0, 0, 0)); | |
} | |
GPUMemoryArena(GPUMemoryArena&& other) = default; | |
GPUMemoryArena(const GPUMemoryArena& other) = delete; | |
~GPUMemoryArena() { | |
try { | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
if (m_base_address) { | |
printf("~GPUMemoryArena(): cnt[%d] -= [%d]\n", total_n_bytes_allocated().load(), m_size); | |
total_n_bytes_allocated() -= m_size; | |
CU_CHECK_THROW(cuMemUnmap(m_base_address, m_size)); | |
for (const auto& handle : m_handles) { | |
CU_CHECK_THROW(cuMemRelease(handle)); | |
} | |
CU_CHECK_THROW(cuMemAddressFree(m_base_address, m_max_size)); | |
} | |
} catch (std::runtime_error error) { | |
// Don't need to report on memory-free problems when the driver is shutting down. | |
if (std::string{error.what()}.find("driver shutting down") == std::string::npos) { | |
fprintf(stderr, "Could not free memory: %s\n", error.what()); | |
} | |
} | |
} | |
uint8_t* data() { | |
return m_fallback_memory ? m_fallback_memory->data() : (uint8_t*)m_base_address; | |
} | |
std::shared_ptr<GPUMemory<uint8_t>> backing_memory() { | |
return m_fallback_memory; | |
} | |
// Finds the smallest interval of free memory in the GPUMemoryArena that's | |
// large enough to hold the requested number of bytes. Then allocates | |
// that memory. | |
size_t allocate(size_t n_bytes) { | |
// Permitting zero-sized allocations is error prone | |
if (n_bytes == 0) { | |
n_bytes = m_alignment; | |
} | |
// Align allocations with the nearest cache line (at least the granularity of the memory allocations) | |
n_bytes = next_multiple(n_bytes, m_alignment); | |
Interval* best_candidate = &m_free_intervals.back(); | |
for (auto& f : m_free_intervals) { | |
if (f.size() >= n_bytes && f.size() < best_candidate->size()) { | |
best_candidate = &f; | |
} | |
} | |
size_t start = best_candidate->start; | |
m_allocated_intervals[start] = best_candidate->start += n_bytes; | |
printf("GPUMmeoryArena::allocate(): start=[%8x], size=[%d]\n", start, n_bytes); | |
enlarge(size()); | |
return start; | |
} | |
void free(size_t start) { | |
if (m_allocated_intervals.count(start) == 0) { | |
throw std::runtime_error{"Attempted to free arena memory that was not allocated."}; | |
} | |
Interval interval = {start, m_allocated_intervals[start]}; | |
m_allocated_intervals.erase(start); | |
m_free_intervals.insert( | |
std::upper_bound(std::begin(m_free_intervals), std::end(m_free_intervals), interval), | |
interval | |
); | |
merge_adjacent_intervals(); | |
} | |
void enlarge(size_t n_bytes) { | |
if (n_bytes <= m_size) { | |
return; | |
} | |
if (m_fallback_memory) { | |
static const double GROWTH_FACTOR = 1.5; | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
m_size = next_multiple((size_t)(n_bytes * GROWTH_FACTOR), cuda_memory_granularity()); | |
m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>(m_fallback_memory->copy(m_size)); | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
return; | |
} | |
size_t n_bytes_to_allocate = n_bytes - m_size; | |
n_bytes_to_allocate = next_multiple(n_bytes_to_allocate, cuda_memory_granularity()); | |
CUmemAllocationProp prop = {}; | |
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; | |
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; | |
prop.location.id = cuda_device(); | |
m_handles.emplace_back(); | |
CU_CHECK_THROW(cuMemCreate(&m_handles.back(), n_bytes_to_allocate, &prop, 0)); | |
CUmemAccessDesc access_desc = {}; | |
access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; | |
access_desc.location.id = prop.location.id; | |
access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; | |
CU_CHECK_THROW(cuMemMap(m_base_address + m_size, n_bytes_to_allocate, 0, m_handles.back(), 0)); | |
CU_CHECK_THROW(cuMemSetAccess(m_base_address + m_size, n_bytes_to_allocate, &access_desc, 1)); | |
m_size += n_bytes_to_allocate; | |
printf("GPUMemoryArena::enlarge(): cnt[%d] += [%d]\n", total_n_bytes_allocated().load(), n_bytes_to_allocate); | |
total_n_bytes_allocated() += n_bytes_to_allocate; | |
// Need to synchronize the device to make sure memory is available to all streams. | |
if (current_capture()) { | |
current_capture()->schedule_synchronize(); | |
} else { | |
CUDA_CHECK_THROW(cudaDeviceSynchronize()); | |
} | |
} | |
size_t size() const { | |
return m_free_intervals.back().start; | |
} | |
std::unordered_map<size_t, size_t> get_allocated_intervals() const { | |
return m_allocated_intervals; | |
} | |
class Allocation { | |
public: | |
Allocation() = default; | |
Allocation(cudaStream_t stream, size_t offset, GPUMemoryArena* workspace) | |
: m_stream{stream}, m_data{workspace->data() + offset}, m_offset{offset}, m_workspace{workspace}, m_backing_memory{workspace->backing_memory()} | |
{ | |
printf("Allocation: m_workspace=[%8x], m_offset=[%8x], m_data=[%8x]\n", m_workspace, m_offset, m_data); | |
} | |
~Allocation() { | |
if (m_workspace) { | |
printf("~Allocation: m_workspace=[%8x], free(m_offset=[%8x], m_data=[%8x]), cnt=[%llu]\n", (void*)m_workspace, (void*)m_offset, (void*)m_data, total_n_bytes_allocated().load()); | |
m_workspace->free(m_offset); | |
} | |
else { | |
printf("~Allocation: m_workspace=[%8x], cnt=[%llu]\n", (void*)m_workspace, total_n_bytes_allocated().load()); | |
} | |
} | |
Allocation(const Allocation& other) = delete; | |
Allocation& operator=(Allocation&& other) { | |
std::swap(m_stream, other.m_stream); | |
std::swap(m_data, other.m_data); | |
std::swap(m_offset, other.m_offset); | |
std::swap(m_workspace, other.m_workspace); | |
std::swap(m_backing_memory, other.m_backing_memory); | |
return *this; | |
} | |
Allocation(Allocation&& other) { | |
*this = std::move(other); | |
} | |
uint8_t* data() { | |
return m_data; | |
} | |
size_t offset() { | |
return m_offset; | |
} | |
const uint8_t* data() const { | |
return m_data; | |
} | |
cudaStream_t stream() const { | |
return m_stream; | |
} | |
private: | |
cudaStream_t m_stream = nullptr; | |
uint8_t* m_data = nullptr; | |
size_t m_offset = 0; | |
GPUMemoryArena* m_workspace = nullptr; | |
// Backing GPUMemory (if backed by a GPUMemory). Ensures that | |
// the backing memory is only freed once all allocations that | |
// use it have been destroyed. | |
std::shared_ptr<GPUMemory<uint8_t>> m_backing_memory = nullptr; | |
}; | |
private: | |
void merge_adjacent_intervals() { | |
size_t j = 0; | |
for (size_t i = 1; i < m_free_intervals.size(); ++i) { | |
Interval& prev = m_free_intervals[j]; | |
Interval& cur = m_free_intervals[i]; | |
if (prev.end == cur.start) { | |
prev.end = cur.end; | |
} else { | |
++j; | |
m_free_intervals[j] = m_free_intervals[i]; | |
} | |
} | |
m_free_intervals.resize(j+1); | |
} | |
std::vector<Interval> m_free_intervals; | |
std::unordered_map<size_t, size_t> m_allocated_intervals; | |
CUdeviceptr m_base_address = {}; | |
size_t m_size = 0; | |
std::vector<CUmemGenericAllocationHandle> m_handles; | |
// Used when virtual memory isn't supported. | |
// Requires more storage + memcpy, but is more portable. | |
std::shared_ptr<GPUMemory<uint8_t>> m_fallback_memory = nullptr; | |
size_t m_alignment; | |
size_t m_max_size; | |
}; | |
inline std::unordered_map<cudaStream_t, GPUMemoryArena>& gpu_memory_arenas() { | |
static std::unordered_map<cudaStream_t, GPUMemoryArena> s_gpu_memory_arenas; | |
return s_gpu_memory_arenas; | |
} | |
inline GPUMemoryArena::Allocation allocate_workspace(cudaStream_t stream, size_t n_bytes) { | |
if (n_bytes == 0) { | |
// Return a null allocation if no bytes were requested. | |
return {}; | |
} | |
auto& arena = gpu_memory_arenas()[stream]; | |
return GPUMemoryArena::Allocation{stream, arena.allocate(n_bytes), &arena}; | |
} | |
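// Workspace sketch (illustrative): allocate_workspace hands out stream-ordered scratch memory
// from the per-stream arena; the returned Allocation is an RAII handle that gives its interval
// back to the arena on destruction. `stream` is assumed to be a valid cudaStream_t owned by
// the caller.
//
//   {
//       auto scratch = allocate_workspace(stream, 1024 * 1024);   // 1 MiB of temporary memory
//       // ... enqueue kernels on `stream` that read/write scratch.data() ...
//   }   // interval is returned to the arena here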
static size_t align_to_cacheline(size_t bytes) { | |
return next_multiple(bytes, (size_t)128); | |
} | |
template <typename First, typename FirstSize> | |
std::tuple<First*> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size) { | |
*alloc = allocate_workspace(stream, offset + align_to_cacheline(first_size * sizeof(First))); | |
return std::make_tuple<First*>((First*)(alloc->data() + offset)); | |
} | |
template <typename First, typename ...Types, typename FirstSize, typename ...Sizes, std::enable_if_t<sizeof...(Types) != 0 && sizeof...(Types) == sizeof...(Sizes), int> = 0> | |
std::tuple<First*, Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size, Sizes... sizes) { | |
auto nested = allocate_workspace_and_distribute<Types...>(stream, alloc, offset + align_to_cacheline(first_size * sizeof(First)), sizes...); | |
return std::tuple_cat(std::make_tuple<First*>((First*)(alloc->data() + offset)), nested); | |
} | |
template <typename ...Types, typename ...Sizes, std::enable_if_t<sizeof...(Types) == sizeof...(Sizes), int> = 0> | |
std::tuple<Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, Sizes... sizes) { | |
return allocate_workspace_and_distribute<Types...>(stream, alloc, (size_t)0, sizes...); | |
} | |
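// Distribution sketch (illustrative): carve a single workspace allocation into several
// cacheline-aligned, typed sub-buffers in one call. Types and element counts are paired
// positionally; `stream` and `n_elements` below are hypothetical.
//
//   GPUMemoryArena::Allocation alloc;
//   float* positions; uint32_t* labels; float* losses;
//   std::tie(positions, labels, losses) = allocate_workspace_and_distribute<float, uint32_t, float>(
//       stream, &alloc,
//       3 * n_elements,   // positions: 3 floats per element
//       n_elements,       // labels
//       n_elements        // losses
//   );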
inline void free_gpu_memory_arena(cudaStream_t stream) { | |
gpu_memory_arenas().erase(stream); | |
} | |
inline void free_all_gpu_memory_arenas() { | |
gpu_memory_arenas().clear(); | |
} | |
TCNN_NAMESPACE_END |