Created
May 23, 2025 17:49
-
-
Save philipturner/f86852ca6990e2d33d0bf41f7e8704d2 to your computer and use it in GitHub Desktop.
Third set of files saved for easy reference, while cleaning up an iteration of the Windows port of Molecular Renderer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Next steps: | |
// - Access the GPU. | |
// - Modify it to get Metal rendering. [DONE] | |
// - Clean up and simplify the code as much as possible. [DONE] | |
// - Get timestamps synchronizing properly (moving rainbow banner | |
// scene). [DONE] | |
// - Repeat the same process with COM / D3D12 on Windows. | |
// - Get some general experience with C++ DirectX sample code. | |
// - Modify the files one-by-one to support Windows. | |
import MolecularRenderer | |
#if os(macOS) | |
import Metal | |
// Builds the `Application` used by the macOS demo.
//
// The display is described first (1920 render target size, fastest screen),
// then the GPU context (fastest device), and finally both are handed to the
// application descriptor.
@MainActor
func createApplication() -> Application {
  // Display: fixed-size render target on the fastest available screen.
  var displayDescriptor = DisplayDescriptor()
  displayDescriptor.renderTargetSize = 1920
  displayDescriptor.screenID = Display.fastestScreenID
  let display = Display(descriptor: displayDescriptor)

  // GPU context: bound to the fastest available device.
  var contextDescriptor = GPUContextDescriptor()
  contextDescriptor.deviceID = GPUContext.fastestDeviceID
  let gpuContext = GPUContext(descriptor: contextDescriptor)

  // Application: ties the display and the GPU context together.
  var descriptor = ApplicationDescriptor()
  descriptor.display = display
  descriptor.gpuContext = gpuContext
  return Application(descriptor: descriptor)
}
// Returns the Metal Shading Language source for the demo.
//
// The source defines:
// - `convertToChannel`, which maps hue/saturation/lightness to one color
//   channel selected by `n`.
// - the `renderImage` kernel, which writes a constant color for rows above
//   1600 and a hue sweep for the remaining rows. The sweep is split into three
//   bands, offset by `time0`, `time1` and `time2` respectively.
func createShaderSource() -> String {
  let source = """
  #include <metal_stdlib>
  using namespace metal;
  half convertToChannel(
  half hue,
  half saturation,
  half lightness,
  ushort n
  ) {
  half k = half(n) + hue / 30;
  k -= 12 * floor(k / 12);
  half a = saturation;
  a *= min(lightness, 1 - lightness);
  half output = min(k - 3, 9 - k);
  output = max(output, half(-1));
  output = min(output, half(1));
  output = lightness - a * output;
  return output;
  }
  kernel void renderImage(
  constant float *time0 [[buffer(0)]],
  constant float *time1 [[buffer(1)]],
  constant float *time2 [[buffer(2)]],
  texture2d<half, access::write> drawableTexture [[texture(0)]],
  ushort2 tid [[thread_position_in_grid]]
  ) {
  half4 color;
  if (tid.y < 1600) {
  color = half4(0.707, 0.707, 0.00, 1.00);
  } else {
  float progress = float(tid.x) / 1920;
  if (tid.y < 1600 + 107) {
  progress += *time0;
  } else if (tid.y < 1600 + 213) {
  progress += *time1;
  } else {
  progress += *time2;
  }
  half hue = half(progress) * 360;
  half saturation = 1.0;
  half lightness = 0.5;
  half red = convertToChannel(hue, saturation, lightness, 0);
  half green = convertToChannel(hue, saturation, lightness, 8);
  half blue = convertToChannel(hue, saturation, lightness, 4);
  color = half4(red, green, blue, 1.00);
  }
  drawableTexture.write(color, tid);
  }
  """
  return source
}
// Compiles Metal shader source at runtime and builds the compute pipeline for
// its `renderImage` kernel.
//
// Parameters:
// - application: Supplies the Metal device through `gpuContext.device`.
// - shaderSource: Metal Shading Language source that defines `renderImage`.
//
// Returns: The compute pipeline state for `renderImage`.
//
// Crashes if the source fails to compile, if `renderImage` is missing from the
// compiled library, or if pipeline creation fails.
func createRenderPipeline(
  application: Application,
  shaderSource: String
) -> MTLComputePipelineState {
  let device = application.gpuContext.device

  // Compile the source passed in by the caller. The previous implementation
  // shadowed this parameter with a fresh `createShaderSource()` call, which
  // silently ignored whatever the caller supplied.
  let library = try! device.makeLibrary(source: shaderSource, options: nil)

  guard let function = library.makeFunction(name: "renderImage") else {
    fatalError("Could not make function.")
  }
  return try! device.makeComputePipelineState(function: function)
}
// Set up the resources.
let application = createApplication()
let shaderSource = createShaderSource()
let renderPipeline = createRenderPipeline(
  application: application,
  shaderSource: shaderSource)

// Mach continuous-time tick count captured on the first frame. Stays `nil`
// until the run loop has executed once.
var startTime: UInt64?
var frameID: Int = .zero

// Number of Mach time ticks per second on this machine.
//
// `mach_timebase_info` reports the ticks -> nanoseconds ratio as
// numer / denom. A ratio of 125 / 3 gives exactly 24 MHz, the value that was
// previously hard-coded; other machines may report a different ratio.
let machTicksPerSecond: Double = {
  var timebase = mach_timebase_info_data_t()
  guard mach_timebase_info(&timebase) == KERN_SUCCESS,
        timebase.numer != 0,
        timebase.denom != 0 else {
    fatalError("Could not query the Mach timebase.")
  }
  return 1e9 * Double(timebase.denom) / Double(timebase.numer)
}()

// Enter the run loop.
application.run { renderTarget in
  frameID += 1

  // Start the command encoder.
  let commandQueue = application.gpuContext.commandQueue
  let commandBuffer = commandQueue.makeCommandBuffer()!
  let encoder = commandBuffer.makeComputeCommandEncoder()!

  // Bind the buffers.
  do {
    // Uploads the fractional part of `time` (in seconds) as a 32-bit float
    // at the given buffer index.
    func setTime(_ time: Double, index: Int) {
      let fractionalTime = time - floor(time)
      var time32 = Float(fractionalTime)
      encoder.setBytes(
        &time32, length: MemoryLayout<Float>.stride, index: index)
    }

    // Buffer 0: wall-clock time elapsed since the first frame.
    if let startTime {
      let elapsedTicks = mach_continuous_time() - startTime
      let timeSeconds = Double(elapsedTicks) / machTicksPerSecond
      setTime(timeSeconds, index: 0)
    } else {
      startTime = mach_continuous_time()
      setTime(Double.zero, index: 0)
    }

    // Buffer 1: time derived from the application's frame counter and the
    // display's frame rate.
    let clock = application.clock
    let timeInFrames = clock.frames
    let framesPerSecond = application.display.frameRate
    let timeInSeconds = Double(timeInFrames) / Double(framesPerSecond)
    setTime(timeInSeconds, index: 1)

    // Buffer 2: constant zero.
    setTime(Double.zero, index: 2)
  }

  // Bind the textures.
  encoder.setTexture(renderTarget, index: 0)

  // Dispatch one thread per pixel of the render target.
  do {
    encoder.setComputePipelineState(renderPipeline)
    let width = Int(renderTarget.width)
    let height = Int(renderTarget.height)
    encoder.dispatchThreads(
      MTLSize(width: width, height: height, depth: 1),
      threadsPerThreadgroup: MTLSize(width: 8, height: 8, depth: 1))
  }

  // End the command encoder.
  encoder.endEncoding()
  commandBuffer.commit()
}
#endif | |
#if os(Windows) | |
// I want to achieve 'hello world' for vector addition. | |
// | |
// Issues: | |
// - How to create resources | |
// - How to create pipelines | |
// - What the heck is going on with descriptors | |
// - What the heck is going on with root signatures | |
// - How to bind resources to commands | |
// - How to dispatch GPU threads | |
// - What the heck is going on with resource state (transitions) | |
// - How to test the results of GPU execution | |
// | |
// The above notes are the start of a brainstorming session about how to | |
// approach this goal in practice. Next, I must elaborate on and refactor the | |
// ideas. I've started by purging this repository of the C++ translations. | |
// ## Initiation | |
// | |
// My goal is to execute a proof of concept compute-only workflow on the GPU. | |
// I will create three UAV buffers of type FP32. They will be read and written | |
// from GPU-native memory during a compute shader. The CPU will supply input | |
// data and test the output data. | |
// | |
// Additional requirements: | |
// - Correct/conventional usage of the DirectX 12 API | |
// - No memory leaks from mishandling COM objects | |
// - Shader is compiled entirely at runtime | |
// - Root signature is specified in HLSL, not on the CPU | |
// - Resources are bound in separate 'root descriptor' entries | |
// - Resources are not sub-resources of another resource | |
// - All objects for encoding commands are regenerated for each command list | |
// | |
// Resource states: | |
// - 'COPY_DEST' while moving from CPU -> GPU | |
// - 'UAV' while executing the compute shader | |
// - 'COPY_SRC' while moving from GPU -> CPU | |
// - There are no constant buffers or inlined 32-bit constants. | |
// | |
// Additional small details: | |
// - Compile the shader with the SM 6.5 target. | |
// - Dispatch 128 threads per group. | |
// - Each buffer is 1024 elements. | |
// - First input is 0 to 1023, in ascending order. | |
// - Second input is 1024 to 2047, in ascending order. | |
// - Report the results for the first 10 entries explicitly. | |
// - To cover the remaining entries, count the number that did/didn't match | |
// results of an analytical formula. | |
// - In root signature v1.1, the UAV's flag is 'DATA_VOLATILE' by default. | |
// Fetch the D3D12 debug interface and switch on the debug layer. The explicit
// type annotation selects which interface is requested.
let debugInterface: SwiftCOM.ID3D12Debug = try! D3D12GetDebugInterface()
try! debugInterface.EnableDebugLayer()
// ## First Step | |
// | |
// Author the HLSL shader. Then, modify the DXCWrapper utility to provide the | |
// compiled blob. | |
// Create the device that the compiler (and everything after it) works with.
let device = DirectXDevice()

// HLSL source for the vector addition kernel. Three UAV buffers are bound as
// separate root descriptors (u0, u1, u2) through a root signature declared in
// the shader itself. Each thread adds one element of buffer0 and buffer1 and
// stores the sum in buffer2.
let shaderSource: String = """
RWStructuredBuffer<float> buffer0 : register(u0);
RWStructuredBuffer<float> buffer1 : register(u1);
RWStructuredBuffer<float> buffer2 : register(u2);
#define mainRS "UAV(u0), " \\
"UAV(u1), " \\
"UAV(u2)"
[numthreads(128, 1, 1)]
[RootSignature(mainRS)]
void main(
uint3 tid : SV_DispatchThreadID
) {
uint slotID = tid.x;
float input0 = buffer0[slotID];
float input1 = buffer1[slotID];
float output = input0 + input1;
buffer2[slotID] = output;
}
"""

// Compile the shader at runtime.
let compiler = Compiler(device: device)
let shaderBytecode = compiler.compile(source: shaderSource)
// Retrieve the info queue from the device by querying the device object for
// the `ID3D12InfoQueue` interface.
var infoQueue: SwiftCOM.ID3D12InfoQueue
do {
  let interface = try! device.d3d12Device.QueryInterface(
    iid: SwiftCOM.ID3D12InfoQueue.IID)
  infoQueue = .init(pUnk: interface)
}

// Break on every message severity, from most to least severe.
let breakSeverities: [D3D12_MESSAGE_SEVERITY] = [
  D3D12_MESSAGE_SEVERITY_CORRUPTION,
  D3D12_MESSAGE_SEVERITY_ERROR,
  D3D12_MESSAGE_SEVERITY_WARNING,
  D3D12_MESSAGE_SEVERITY_INFO,
  D3D12_MESSAGE_SEVERITY_MESSAGE,
]
for severity in breakSeverities {
  try! infoQueue.SetBreakOnSeverity(severity, true)
}
// ## Second Step | |
// | |
// See whether I can jump directly to creating a PSO and root signature object. | |
import SwiftCOM | |
import WinSDK | |
// Create the root signature from the serialized blob that the compiler
// produced alongside the shader object.
var rootSignature: SwiftCOM.ID3D12RootSignature?
shaderBytecode.rootSignature.withUnsafeBytes { serializedBlob in
  // Node mask 0, followed by the blob's address and its length in bytes.
  rootSignature = try! device.d3d12Device.CreateRootSignature(
    0,
    serializedBlob.baseAddress,
    UInt64(serializedBlob.count))
}
guard let rootSignature else {
  fatalError("Could not create root signature.")
}
// Create the pipeline state.
var pipelineState: SwiftCOM.ID3D12PipelineState?
shaderBytecode.object.withUnsafeBytes { objectBytes in
  // Point the descriptor at the compiled compute shader.
  var shader = D3D12_SHADER_BYTECODE()
  shader.pShaderBytecode = objectBytes.baseAddress
  shader.BytecodeLength = UInt64(objectBytes.count)

  // No cached pipeline blob is supplied.
  var cache = D3D12_CACHED_PIPELINE_STATE()
  cache.pCachedBlob = nil
  cache.CachedBlobSizeInBytes = 0

  // Fill the compute pipeline descriptor.
  var desc = D3D12_COMPUTE_PIPELINE_STATE_DESC()
  try! rootSignature.perform(
    as: WinSDK.ID3D12RootSignature.self
  ) { rootSignaturePointer in
    desc.pRootSignature = rootSignaturePointer
  }
  desc.CS = shader
  desc.NodeMask = 0
  desc.CachedPSO = cache
  desc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE

  // Ask the device for the pipeline state and wrap the returned pointer.
  var iid = SwiftCOM.ID3D12PipelineState.IID
  let pUnk = try! device.d3d12Device.CreateComputePipelineState(
    &desc, &iid)
  pipelineState = SwiftCOM.ID3D12PipelineState(pUnk: pUnk)
}
guard let pipelineState else {
  fatalError("Could not create pipeline state.")
}
// ## Third Step | |
// | |
// Create buffer objects and test the API for accessing mapped pointers. | |
// Fill the descriptor properties common to all buffers. The size matches
// 1024 four-byte elements.
var bufferDesc = BufferDescriptor()
bufferDesc.device = device
bufferDesc.size = 1024 * 4

// Create the input buffers.
bufferDesc.type = .input
let (inputBuffer0, inputBuffer1) = (
  Buffer(descriptor: bufferDesc),
  Buffer(descriptor: bufferDesc))

// Create the native buffers.
bufferDesc.type = .native
let (nativeBuffer0, nativeBuffer1, nativeBuffer2) = (
  Buffer(descriptor: bufferDesc),
  Buffer(descriptor: bufferDesc),
  Buffer(descriptor: bufferDesc))

// Create the output buffers.
bufferDesc.type = .output
let outputBuffer2 = Buffer(descriptor: bufferDesc)
// Generate the input data for the shader.
do {
  // First input: 0 to 1023. Second input: 1024 to 2047.
  let inputData0 = (0..<1024).map { Float($0) }
  let inputData1 = (0..<1024).map { 1024 + Float($0) }

  // Hand the raw bytes of each array to the matching input buffer.
  inputData0.withUnsafeBytes { bytes in
    inputBuffer0.write(input: bytes.baseAddress!)
  }
  inputData1.withUnsafeBytes { bytes in
    inputBuffer1.write(input: bytes.baseAddress!)
  }
}
// Read the initial contents of the output buffer.
do {
  // Seed the CPU-side array with 0, 1, 2, ... so that the zero check below
  // can only pass if the read actually overwrote the array.
  var readback = (0..<1024).map { Float($0) }
  readback.withUnsafeMutableBytes { bytes in
    outputBuffer2.read(output: bytes.baseAddress!)
  }

  // Only the first 10 slots are inspected.
  for value in readback.prefix(10) {
    guard value == 0 else {
      fatalError("Output buffer was initialized to nonzero value.")
    }
  }
}
// Next steps: | |
// - Test the code for reading/writing mapped pointers. Set the input data to | |
// an increasing list of floating point numbers. Study the results of reading | |
// from the output buffer. It should at least overwrite the previous contents | |
// of the CPU memory allocation. | |
// - Redefine the "3rd step" and "4th step". The third step is shortened to just | |
// summarize what we've done above. The fourth step is to create a command | |
// queue, command list, and set up the resources for copying. | |
// - "Hello world" will come from shifting the data through various buffers. | |
// The output will match either buffer0 or buffer1 (of my choosing). The copy | |
// commands must pass through a GPU private buffer(s) as an intermediate. | |
// - After that is done, proceed with the descriptors necessary to bind UAVs to | |
// a compute command. | |
// ## Fourth Step | |
// | |
// Set up the command queue, command list, and anything else needed for copying | |
// buffers. | |
// List all the components needed to make this happen: | |
// - ID3D12CommandQueue | |
// - ID3D12CommandList | |
// - ID3D12CommandAllocator | |
// - ID3D12Fence | |
// - Windows OS event | |
// | |
// References to get started: | |
// - My worked examples of C++ translations | |
// - DirectX tutorials | |
// - Microsoft's online documentation | |
// | |
// Describe everything that will happen in the procedure, qualitatively, in | |
// chronological order. At the moment, we don't actually know the correct | |
// chronological order. | |
// | |
// - Create the command queue | |
// - Create the command allocator | |
// - Create the command list from the command allocator | |
// - Close the command allocator and command list | |
// | |
// - Bind the buffers | |
// - Encode the copy commands | |
// - Encode the fence signaling | |
// - Wait on the fence on the CPU | |
// - Read the contents of the output buffer | |
// | |
// - Copy commands: | |
// - Copy inputBuffer0 to nativeBuffer0 | |
// - Copy inputBuffer1 to nativeBuffer1 | |
// - Copy either [nativeBuffer0, nativeBuffer1] to nativeBuffer2 | |
// - Copy nativeBuffer2 to outputBuffer2 | |
// | |
// Regarding the command queue/list/fence, what object creates what? | |
// - ID3D12Device | |
// - ID3D12CommandQueue | |
// - ID3D12CommandAllocator | |
// - ID3D12CommandList | |
// - ID3D12Fence | |
// - CreateEventA | |
// - HANDLE | |
// Let's start with a simple deliverable: | |
// - Create the above objects, without using utility classes. | |
// - Dispatch an empty command buffer. | |
// - Close or clean up the objects. | |
// `ID3D12CommandQueue.ExecuteCommandLists` is like `MTLCommandBuffer.commit` in | |
// Metal applications. It sends commands to the GPU. | |
// | |
// Fences are similar to `MTLCommandBuffer.waitUntilCompleted` and | |
// `DispatchSemaphore` in Metal applications. They wait until a specific command | |
// buffer has completed. In Metal, one of the functions can facilitate triple- | |
// buffering without retaining a reference to the command buffer. | |
// | |
// API for quickly freezing the queue until all commands have finished, and it | |
// is safe to read contents from the CPU: | |
// - Metal: commandBuffer.waitUntilCompleted() | |
// - DirectX: immediately create, signal, and wait on a fence | |
// | |
// API for triple buffering: | |
// - Metal: DispatchSemaphore and commandBuffer.setCompletedHandler() | |
// - DirectX: increment a fence counter after an entire frame, remember the | |
// counter's value until a future frame that needs a resource | |
// | |
// Both APIs require an entire command list to be committed before waiting on | |
// a chunk of GPU work. It's not clear at what granularity you can gather | |
// execution latency data. | |
// | |
// MTLSharedEvent has similarities to ID3D12Fence. Especially the method | |
// `MTLSharedEvent.wait(untilSignaledValue:timeoutMS:)`. It is virtually | |
// identical to `WaitForSingleObject` on Windows. | |
// | |
// I don't know whether there's a Windows API for callbacks, similar to the Mac | |
// paradigm of using semaphores. | |
// | |
// I don't know whether using MTLSharedEvent causes performance issues on Mac. | |
// For the time being, we don't actually need to worry about triple-buffering | |
// of resources. The Mac side of the new codebase hasn't gotten that far yet. | |
// So just use fences as a means to immediately stall until a command buffer | |
// has completed. | |
// | |
// Option 1: | |
// - Every command buffer gets a unique ID, monotonically increasing from when | |
// the command queue was first created. | |
// - You can wait for GPU work at the granularity of previous command buffers. | |
// So, asynchronous compute. | |
// | |
// Option 2: | |
// - The command queue has an internal fence + event object created once at | |
// initialization. | |
// - Every instance of CPU-side stalling blocks at the latest command dispatched | |
// on that specific queue. | |
// | |
// Choose option 2. | |
// Another concern is the ability to profile GPU command execution time. In | |
// DirectX 12, ID3D12GraphicsCommandList.BeginQuery cannot be called on a | |
// timestamp query. Instead, call `EndQuery`. | |
// | |
// Source: https://pavelsmejkal.net/Posts/ | |
// | |
// The DX12 'Query' paradigm for measuring time looks similar to the Metal | |
// 'MTLCounterSampleBuffer' paradigm. There is an additional step, where one | |
// must store timestamps in a special buffer. Not as easy as the Metal API for | |
// retrieving the '.gpuStartTime' and '.gpuEndTime' of a command buffer. | |
// | |
// In both APIs, you must be careful about the step size of timestamp counters. | |
// On Mac, it could be Mach absolute time (24 MHz) instead of nanoseconds. | |
// On Windows, you must call `ID3D12CommandQueue.GetTimestampFrequency`.
// | |
// One difference might be that Windows allows finer granularity of timestamp | |
// sampling. On Mac, `.gpuStartTime` and `.gpuEndTime` are scoped to the entire | |
// command buffer. The counter sample buffers API looks scoped to an entire | |
// compute command encoder, which has just as much latency as creating a new | |
// command buffer. Windows might allow finer granularity, because you can | |
// inject timestamps at any point within the command list. Including between | |
// subsequent compute commands, without a severe latency penalty. | |
// | |
// For the time being, neglect the ability to profile GPU-side execution time. | |
// ## Fourth Step (2nd Iteration) | |
// | |
// Create an ergonomic API for generating and waiting on empty GPU command | |
// buffers. Design the API with the intent to wrap a Metal backend in the | |
// future. | |
// - CommandQueue utility class | |
// - 'flush' member function | |
// - Windows: increment the fence counter, use a fence created when the | |
// command queue initializes | |
// - Mac: store a reference to the latest command buffer submitted to the | |
// command queue | |
// - CommandBuffer utility class | |
// - 'commit' member function | |
// - Sends the command list to the command queue (perhaps this member | |
// belongs in CommandQueue, and there is no utility class for | |
// CommandBuffer). | |
// - Closes the command list. | |
// - No analogue to Metal 'waitUntilCompleted' | |
// | |
// To ease the prototyping process, just create an API for 'CommandQueue'. It | |
// creates and commits instances of 'ID3D12GraphicsCommandList'. The creation | |
// method abstracts away the 'ID3D12CommandAllocator'. The commit method | |
// abstracts away both 'commandList.Close()' and 'commandQueue. | |
// ExecuteCommandLists()'. The flush method works as described above. | |
// Create the queue and flush it twice while it is still empty.
let commandQueue = CommandQueue(device: device)
for _ in 0..<2 {
  commandQueue.flush()
}

// Encapsulate the command buffer in a scope, so it can deinitialize itself.
//
// There is currently a problem with naming things both 'command buffer' and
// 'command list'. I have no solution at the moment.
do {
  let emptyCommandList = commandQueue.createCommandList()
  commandQueue.commit(emptyCommandList)
}
commandQueue.flush()

// ## Fifth Step
//
// Encode the copy commands into the command list.
let commandList = commandQueue.createCommandList()
// To start, copy inputBuffer0 to nativeBuffer0. | |
// | |
// Components of this task: | |
// - Identify methods of 'ID3D12(Graphics)CommandList' that bind the buffers | |
// to the 'src' or 'dst' slots of a copy operation. | |
// - Identify the DirectX APIs for changing resource states. | |
// - Acknowledge the state of each buffer prior to the transition (or don't). | |
// | |
// Where to start: the 3DGEP tutorial series. | |
// - D3D12_RESOURCE_BARRIER | |
// - D3D12_RESOURCE_BARRIER_TYPE | |
// - Don't want to use '_TRANSITION', because we're not splitting individual | |
// resources into subresources. | |
// - Don't think '_UAV' applies to copy commands. | |
// - D3D12_RESOURCE_BARRIER_FLAGS
// - The '_BEGIN_ONLY' and '_END_ONLY' flags seem strange. | |
// - union of the 3 possible types | |
// - '_UAV_BARRIER' has the simplest data structure, just a pointer to the | |
// resource. | |
// - '_TRANSITION_BARRIER' also makes sense. I would use the 0xFFFFFFFF | |
// flag because we don't have subresources (?). | |
// - D3D12_RESOURCE_STATES | |
// - Many members of 'ResourceStateTracker' just invoke the same method, | |
// 'ResourceBarrier'. | |
// - Microsoft documentation encourages batching multiple resource barriers | |
// into a single call. If both transitions are scoped to one utility
// function for copying buffers, I can write the boilerplate code for this. | |
// No need to design a general API for easing the creation of barrier objects. | |
// | |
// Call: d3d12CommandList.ResourceBarrier(numBarriers, resourceBarriers.data()) | |
// - 'D3D12_RESOURCE_BARRIER' is a value type, not a COM reference type. This | |
// fact makes barriers easier to aggregate and send through a C interface. | |
// | |
// Based on the 3DGEP tutorial series, we might have to know the resource's | |
// prior/current state to encode a barrier. This makes things more complicated; | |
// we must implement state tracking and carry it around everywhere. | |
// | |
// For the 'hello world' demonstration, we can ignore the state tracking. We | |
// know every resource's specific state ahead of time. It becomes an issue | |
// when we create an API that generalizes to arbitrary code. Something to | |
// possibly defer to after the 'hello world' demonstration. | |
// I may have figured out resource state transitions. Next, figure out the | |
// DirectX API function that encodes the copy command. And whether it requires | |
// additional calls to bind buffers to slots. | |
// ## Copy command(s) in the DirectX 12 API | |
// | |
// Object that calls the member functions: ID3D12GraphicsCommandList | |
// | |
// Member: CopyBufferRegion(ID3D12Resource *pDstBuffer, | |
// UINT64 DstOffset, | |
// ID3D12Resource *pSrcBuffer, | |
// UINT64 SrcOffset, | |
// UINT64 NumBytes) | |
// | |
// Member: CopyResource(ID3D12Resource *pDstResource, | |
// ID3D12Resource *pSrcResource) | |
// | |
// Member: CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION *pDst, | |
// UINT DstX, | |
// UINT DstY, | |
// UINT DstZ, | |
// const D3D12_TEXTURE_COPY_LOCATION *pSrc, | |
// const D3D12_BOX *pSrcBox) | |
// | |
// Member: CopyTiles(ID3D12Resource *pTiledResource,
// const D3D12_TILED_RESOURCE_COORDINATE *pTileRegionStartCoordinate,
// const D3D12_TILE_REGION_SIZE *pTileRegionSize,
// ID3D12Resource *pBuffer,
// UINT64 BufferStartOffsetInBytes,
// D3D12_TILE_COPY_FLAGS Flags)
// I think I have figured out the copy command. Next, document the DirectX 12 | |
// API functions associated with 'TransitionBarrier' and 'TrackResource'. | |
// ## ResourceStateTracker::FlushResourceBarriers | |
// | |
// Takes a 'CommandList' helper class as an argument. References a list of | |
// resource barrier objects. These objects are in fact value types, making the | |
// code easier to implement. Invokes the 'ResourceBarrier' method of | |
// 'ID3D12GraphicsCommandList' with the barrier count and barrier pointer. | |
// Deletes all entries in the barrier list. | |
// | |
// ## CommandList::TransitionBarrier | |
// | |
// Takes a reference to the 'ID3D12Resource'. Pretends the initial state is | |
// 'COMMON'. Sets the final state to the specified state. Appends the newly | |
// created 'D3D12_RESOURCE_BARRIER' value type to the list. | |
// | |
// ## CommandList::TrackResource | |
// | |
// Takes an 'ID3D12Resource' as an argument. Casts it to 'ID3D12Object' and | |
// appends it to an internal list. | |
// Finally, the whole source code snippet where 3DGEP performed a copy | |
// operation. This is a high-level guide for how to proceed with coding a copy | |
// operation in DirectX. | |
// | |
// void CommandList::CopyResource( Resource& dstRes, const Resource& srcRes ) | |
// { | |
// TransitionBarrier( dstRes, D3D12_RESOURCE_STATE_COPY_DEST ); | |
// TransitionBarrier( srcRes, D3D12_RESOURCE_STATE_COPY_SOURCE ); | |
// | |
// FlushResourceBarriers(); | |
// | |
// m_d3d12CommandList->CopyResource( dstRes.GetD3D12Resource().Get(), srcRes.GetD3D12Resource().Get() ); | |
// | |
// TrackResource(dstRes); | |
// TrackResource(srcRes); | |
// } | |
// The task has now been specified in enough detail that I can do it. | |
// | |
// Or not. What's going on with COMMON? | |
// - Does the 3DGEP tutorial correct the pending commands, replacing the | |
// 'COMMON' placeholder with the true value? | |
// - If performance is not a concern, is it ideal to post-transition every | |
// single resource back to 'COMMON' after every command? | |
// | |
// Ignore the common state. I think it skips calls to 'ResourceBarrier' | |
// entirely. Instead, start by specifying all the transitions that ought to | |
// occur throughout all 3 buffers. | |
// Initial states: | |
// inputBuffer0 - GENERIC_READ | |
// inputBuffer1 - GENERIC_READ | |
// nativeBuffer0 - COMMON | |
// nativeBuffer1 - COMMON | |
// nativeBuffer2 - COMMON | |
// outputBuffer2 - COPY_DEST | |
// | |
// Ideal states for a copy command: | |
// input - GENERIC_READ | |
// output - COPY_DEST | |
// | |
// Copy commands: | |
// - inputBuffer0 -> nativeBuffer0 | |
// - inputBuffer0: GENERIC_READ -> GENERIC_READ [omitted] | |
// - nativeBuffer0: COMMON -> COPY_DEST | |
// - inputBuffer1 -> nativeBuffer1 | |
// - inputBuffer1: GENERIC_READ -> GENERIC_READ [omitted] | |
// - nativeBuffer1: COMMON -> COPY_DEST | |
// - nativeBuffer0 -> nativeBuffer2
// - nativeBuffer0: COPY_DEST -> COPY_SOURCE
// - nativeBuffer2: COMMON -> COPY_DEST
// - nativeBuffer2 -> outputBuffer2
// - nativeBuffer2: COPY_DEST -> COPY_SOURCE
// - outputBuffer2: COPY_DEST -> COPY_DEST [omitted] | |
// Hand-written transition barrier (COMMON -> COPY_DEST, all subresources).
// The resource pointer is left nil here; it is meant to be nativeBuffer0.
var barrier00: D3D12_RESOURCE_BARRIER = {
  var barrier = D3D12_RESOURCE_BARRIER()
  barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION
  barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE
  barrier.Transition.pResource = nil // nativeBuffer0
  barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES
  barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON
  barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST
  return barrier
}()
// Next, create a utility function to minimize the boilerplate of creating | |
// barrier structs multiple times. | |
// Helper function for creating barriers.
//
// Builds a transition barrier that covers all subresources of `resource`.
//
// Parameters:
// - resource: The resource whose state changes.
// - stateBefore: The state the resource is in before the barrier.
// - stateAfter: The state the resource is in after the barrier.
//
// Returns: A `D3D12_RESOURCE_BARRIER` of type TRANSITION with no flags.
func createBarrier(
  resource: SwiftCOM.ID3D12Resource,
  stateBefore: D3D12_RESOURCE_STATES,
  stateAfter: D3D12_RESOURCE_STATES
) -> D3D12_RESOURCE_BARRIER {
  // Describe the transition itself.
  var transition = D3D12_RESOURCE_TRANSITION_BARRIER()
  try! resource.perform(
    as: WinSDK.ID3D12Resource.self
  ) { resourcePointer in
    transition.pResource = resourcePointer
  }
  transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES
  transition.StateBefore = stateBefore
  transition.StateAfter = stateAfter

  // Wrap the transition in a barrier of the matching type.
  var barrier = D3D12_RESOURCE_BARRIER()
  barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION
  barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE
  barrier.Transition = transition
  return barrier
}
// Test out the utility function. The result is discarded; this only checks
// that a barrier can be built for nativeBuffer0.
do {
  _ = createBarrier(
    resource: nativeBuffer0.d3d12Resource,
    stateBefore: D3D12_RESOURCE_STATE_COMMON,
    stateAfter: D3D12_RESOURCE_STATE_COPY_DEST)
}
// Next, encode a full copy command onto the command list. Commit the command | |
// list onto the command queue, then wait until it has completed. Verify that | |
// the code doesn't crash. | |
// Copy command: inputBuffer0 -> nativeBuffer0
do {
  // Move the destination from COMMON to COPY_DEST before copying into it.
  let transitions: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer0.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COMMON,
      stateAfter: D3D12_RESOURCE_STATE_COPY_DEST)
  ]
  try! commandList.ResourceBarrier(
    UInt32(transitions.count),
    transitions)

  // Encode the copy: destination first, then source.
  try! commandList.CopyResource(
    nativeBuffer0.d3d12Resource,
    inputBuffer0.d3d12Resource)
}
// Copy command: inputBuffer1 -> nativeBuffer1
do {
  // Move the destination from COMMON to COPY_DEST before copying into it.
  let transitions: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer1.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COMMON,
      stateAfter: D3D12_RESOURCE_STATE_COPY_DEST)
  ]
  try! commandList.ResourceBarrier(
    UInt32(transitions.count),
    transitions)

  // Encode the copy: destination first, then source.
  try! commandList.CopyResource(
    nativeBuffer1.d3d12Resource,
    inputBuffer1.d3d12Resource)
}
// Copy command: nativeBuffer0 -> nativeBuffer2
do {
  // Both transitions are submitted in a single ResourceBarrier call:
  // the source leaves COPY_DEST for COPY_SOURCE, and the destination leaves
  // COMMON for COPY_DEST.
  let transitions: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer0.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COPY_DEST,
      stateAfter: D3D12_RESOURCE_STATE_COPY_SOURCE),
    createBarrier(
      resource: nativeBuffer2.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COMMON,
      stateAfter: D3D12_RESOURCE_STATE_COPY_DEST),
  ]

  // Encode the barriers, logging the info queue's message count around it.
  print("start:", try! infoQueue.GetNumStoredMessages())
  try! commandList.ResourceBarrier(
    UInt32(transitions.count),
    transitions)
  print("end:", try! infoQueue.GetNumStoredMessages())

  // Encode the copy command, logging the message count around it as well.
  print("start:", try! infoQueue.GetNumStoredMessages())
  try! commandList.CopyResource(
    nativeBuffer2.d3d12Resource,
    nativeBuffer0.d3d12Resource)
  print("end:", try! infoQueue.GetNumStoredMessages())
}
// Copy command: nativeBuffer2 -> outputBuffer2
do {
  // The source leaves COPY_DEST for COPY_SOURCE. No barrier is encoded for
  // the output buffer.
  let transitions: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer2.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COPY_DEST,
      stateAfter: D3D12_RESOURCE_STATE_COPY_SOURCE)
  ]

  // Encode the barriers, logging the info queue's message count around it.
  print("start:", try! infoQueue.GetNumStoredMessages())
  try! commandList.ResourceBarrier(
    UInt32(transitions.count),
    transitions)
  print("end:", try! infoQueue.GetNumStoredMessages())

  // Encode the copy: destination first, then source.
  try! commandList.CopyResource(
    outputBuffer2.d3d12Resource,
    nativeBuffer2.d3d12Resource)
}
// Run the commands on the GPU.
commandQueue.commit(commandList)
commandQueue.flush()

// Check the data in the output buffer.
do {
  // Start from 1024 floats holding their own indices (0, 1, 2, ...).
  // NOTE(review): presumably 'read(output:)' overwrites this storage with the
  // contents of outputBuffer2 -- confirm against the 'Buffer' API.
  var readback = (0..<1024).map { Float($0) }
  readback.withUnsafeMutableBytes { rawBuffer in
    let destination = rawBuffer.baseAddress!
    outputBuffer2.read(output: destination)
  }

  // Show the first ten elements.
  for (slotID, element) in readback.prefix(10).enumerated() {
    print("outputBuffer[\(slotID)] = \(element)")
  }
}
// ## Sixth Step | |
// | |
// Activate the debug layer in DirectX. Try omitting all resource barriers. | |
// Mess with the commands by using 'COMMON' as the source state. Either the | |
// code should crash, or the output buffer should have incorrect data. | |
// | |
// After this step is complete, I should purge the 'main' file to a gist. | |
// Incorporate resource state tracking into the 'Buffer' API, and write new | |
// tests for this functionality from scratch. Activate the debug layer by | |
// default, possibly storing an info queue inside the 'DirectXDevice'. | |
// | |
// After that task, I can return to focusing on a 'hello world' compute shader. | |
// ## Known Information about the Debug Layer | |
// | |
// Most commands in the DirectX spec have specific errors related to the debug | |
// layer. Without the debug layer, these errors don't trigger. One particularly | |
// relevant error involves resource barriers. | |
// | |
// The first 3DGEP tutorial mentioned and invoked the debug layer. | |
// | |
// The DirectX spec doesn't reference the 'ID3D12Debug' object directly. | |
// Microsoft's online documentation doesn't describe it in much detail, either. | |
// This makes it difficult to understand the purpose and/or how to use the | |
// debug layer. | |
// | |
// There are 7 iterations of the 'ID3D12Debug' interface. Some other interfaces, | |
// such as 'ID3D12DebugCommandList' and 'ID3D12DebugDevice', do not inherit | |
// from the non-debug versions. This fact differs from the Metal API design, | |
// where debug versions of API objects conform to the same protocol as the | |
// vanilla objects. | |
// | |
// The DirectX debug layer is different from the Metal validation layer. I am | |
// unfamiliar with the purpose of all the objects. Therefore, I am going to read | |
// up on how to instantiate each one. | |
// ## ID3D12Debug | |
// | |
// This looks like the only component of the API represented in SwiftCOM. | |
// Therefore, I probably won't actually use any of the other interfaces. | |
// | |
// Initializer: D3D12GetDebugInterface() | |
// | |
// Instance method: EnableDebugLayer() | |
// | |
// ## ID3D12InfoQueue | |
// | |
// Most instance members are ported in SwiftCOM. It is created by calling | |
// 'IUnknown::QueryInterface' on 'ID3D12Device', which sounds like a strange | |
// way to create an object. Have I seen this before? | |
// | |
// ## IDXGIDebug | |
// | |
// Not part of the 'direct3d-12-sdklayers-interfaces' document, but potentially | |
// relevant. | |
// | |
// Initializer: DXGIGetDebugInterface1() | |
// | |
// Instance method: ReportLiveObjects(GUID, DXGI_DEBUG_RLO_FLAGS) | |
// Useful advice about how to initialize several debug interfaces: | |
// http://gamedev.net/forums/topic/672268-d3d12-debug-layers-how-to-get-id3d12debugdevice/5255763/ | |
// | |
// Steps: | |
// 1) Enable the debug layer | |
// 2) Create the regular device, command queue, command list | |
// 3) Extract the debug versions of each object through 'QueryInterface' | |
// | |
// The QueryInterface path might mean a COM object conforms to multiple | |
// interfaces. It doesn't look like ID3D12DebugDevice inherits from | |
// ID3D12Device. It makes sense that you'd need a special technique to cast | |
// something between these two types. | |
// One potential reason the debug layer isn't working: need to use dxcpl | |
// https://nvidia.custhelp.com/app/answers/detail/a_id/5604/~/how-to-capture-d3d-debug-layer-logs-to-detect-application-or-runtime-bugs | |
// | |
// Another potential reason: debug logging occurs through ID3D12InfoQueue.
// I'm going to try the latter before resorting to the former. | |
// Dump the info queue's configuration and counters, one value per line, in
// this order: message count limit, mute-debug-output flag, messages allowed
// by the storage filter, messages denied by the storage filter, messages
// discarded by the message count limit, stored messages, and stored messages
// allowed by the retrieval filter.
let messageCountLimit = try! infoQueue.GetMessageCountLimit()
print(messageCountLimit)
let muteDebugOutput = try! infoQueue.GetMuteDebugOutput()
print(muteDebugOutput)
let allowedByStorageFilter =
  try! infoQueue.GetNumMessagesAllowedByStorageFilter()
print(allowedByStorageFilter)
let deniedByStorageFilter =
  try! infoQueue.GetNumMessagesDeniedByStorageFilter()
print(deniedByStorageFilter)
let discardedByCountLimit =
  try! infoQueue.GetNumMessagesDiscardedByMessageCountLimit()
print(discardedByCountLimit)
let storedMessages = try! infoQueue.GetNumStoredMessages()
print(storedMessages)
let storedAllowedByRetrievalFilter =
  try! infoQueue.GetNumStoredMessagesAllowedByRetrievalFilter()
print(storedAllowedByRetrievalFilter)
// Display the stored messages.
//
// WARNING: calling 'GetMessage' was observed to crash at runtime, so this
// loop only completes when the info queue holds no messages.
do {
  let storedCount = try! infoQueue.GetNumStoredMessages()
  for index in 0..<storedCount {
    print("Working on message \(index)")

    // The wrapper returns a (message, size) pair.
    let entry = try! infoQueue.GetMessage(index)
    print("messages[\(index)] = (\(entry.0), \(entry.1))")
  }
}
// I cannot retrieve the contents of a message: calling 'GetMessage' crashes
// at runtime, apparently because the C interface is bridged incorrectly.
// As a workaround, just inspect 'GetNumStoredMessages()' after every API
// command that might trigger a debug layer error.
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment