Skip to content

Instantly share code, notes, and snippets.

@philipturner
Created May 23, 2025 17:49
Show Gist options
  • Save philipturner/f86852ca6990e2d33d0bf41f7e8704d2 to your computer and use it in GitHub Desktop.
Save philipturner/f86852ca6990e2d33d0bf41f7e8704d2 to your computer and use it in GitHub Desktop.
Third set of files saved for easy reference, while cleaning up an iteration of the Windows port of Molecular Renderer
// Next steps:
// - Access the GPU.
// - Modify it to get Metal rendering. [DONE]
// - Clean up and simplify the code as much as possible. [DONE]
// - Get timestamps synchronizing properly (moving rainbow banner
// scene). [DONE]
// - Repeat the same process with COM / D3D12 on Windows.
// - Get some general experience with C++ DirectX sample code.
// - Modify the files one-by-one to support Windows.
import MolecularRenderer
#if os(macOS)
import Metal
/// Builds the application object used by the macOS rendering demo.
///
/// A display and a GPU context are each created from their own descriptor,
/// then both are handed to the application through an
/// `ApplicationDescriptor`.
///
/// - Returns: The fully configured application.
@MainActor
func createApplication() -> Application {
  // Describe and create the display.
  var displayDescriptor = DisplayDescriptor()
  displayDescriptor.renderTargetSize = 1920
  displayDescriptor.screenID = Display.fastestScreenID
  let display = Display(descriptor: displayDescriptor)

  // Describe and create the GPU context.
  var contextDescriptor = GPUContextDescriptor()
  contextDescriptor.deviceID = GPUContext.fastestDeviceID
  let gpuContext = GPUContext(descriptor: contextDescriptor)

  // Tie the display and GPU context together.
  var descriptor = ApplicationDescriptor()
  descriptor.display = display
  descriptor.gpuContext = gpuContext
  return Application(descriptor: descriptor)
}
/// Returns the Metal Shading Language source for the demo's compute shader.
///
/// The source defines two functions:
/// - `convertToChannel`: computes one color channel from a hue, saturation,
///   and lightness triple. The `n` argument selects the channel; the kernel
///   passes 0 for red, 8 for green, and 4 for blue.
/// - `renderImage`: the compute kernel. Rows with `tid.y < 1600` are filled
///   with a constant color. The remaining rows are split into three bands
///   (boundaries at 1600 + 107 and 1600 + 213) that draw a hue gradient along
///   x, offset by `time0`, `time1`, and `time2` respectively.
///
/// The kernel hard-codes a width of 1920 when normalizing `tid.x`, matching the
/// `renderTargetSize` assigned in `createApplication()`.
func createShaderSource() -> String {
"""
#include <metal_stdlib>
using namespace metal;
half convertToChannel(
half hue,
half saturation,
half lightness,
ushort n
) {
half k = half(n) + hue / 30;
k -= 12 * floor(k / 12);
half a = saturation;
a *= min(lightness, 1 - lightness);
half output = min(k - 3, 9 - k);
output = max(output, half(-1));
output = min(output, half(1));
output = lightness - a * output;
return output;
}
kernel void renderImage(
constant float *time0 [[buffer(0)]],
constant float *time1 [[buffer(1)]],
constant float *time2 [[buffer(2)]],
texture2d<half, access::write> drawableTexture [[texture(0)]],
ushort2 tid [[thread_position_in_grid]]
) {
half4 color;
if (tid.y < 1600) {
color = half4(0.707, 0.707, 0.00, 1.00);
} else {
float progress = float(tid.x) / 1920;
if (tid.y < 1600 + 107) {
progress += *time0;
} else if (tid.y < 1600 + 213) {
progress += *time1;
} else {
progress += *time2;
}
half hue = half(progress) * 360;
half saturation = 1.0;
half lightness = 0.5;
half red = convertToChannel(hue, saturation, lightness, 0);
half green = convertToChannel(hue, saturation, lightness, 8);
half blue = convertToChannel(hue, saturation, lightness, 4);
color = half4(red, green, blue, 1.00);
}
drawableTexture.write(color, tid);
}
"""
}
/// Compiles shader source at runtime and builds the compute pipeline for the
/// `renderImage` kernel.
///
/// - Parameters:
///   - application: Supplies the Metal device through `gpuContext.device`.
///   - shaderSource: Metal Shading Language source. It must define a kernel
///     function named `renderImage`.
/// - Returns: The compute pipeline state for the `renderImage` kernel.
///
/// The process is terminated if the source fails to compile, if the kernel
/// function cannot be found, or if the pipeline cannot be created.
func createRenderPipeline(
  application: Application,
  shaderSource: String
) -> MTLComputePipelineState {
  let device = application.gpuContext.device

  // Compile the source supplied by the caller. Previously a local constant
  // shadowed the parameter with a fresh call to 'createShaderSource()', so the
  // argument was silently ignored.
  let library = try! device.makeLibrary(source: shaderSource, options: nil)
  guard let function = library.makeFunction(name: "renderImage") else {
    fatalError("Could not make function.")
  }
  return try! device.makeComputePipelineState(function: function)
}
// Set up the resources.
let application = createApplication()
let shaderSource = createShaderSource()
let renderPipeline = createRenderPipeline(
  application: application,
  shaderSource: shaderSource)

// Duration of one 'mach_continuous_time()' tick, in seconds.
//
// The tick rate depends on the host (24 MHz is only one possible value), so
// it is queried from the kernel instead of being hard-coded.
let secondsPerMachTick: Double = {
  var timebase = mach_timebase_info_data_t()
  guard mach_timebase_info(&timebase) == KERN_SUCCESS,
        timebase.denom != 0 else {
    fatalError("Could not query the Mach timebase.")
  }
  // 'numer / denom' converts ticks to nanoseconds.
  return Double(timebase.numer) / Double(timebase.denom) / 1_000_000_000
}()

// Timestamp of the first rendered frame, in Mach ticks. Stays 'nil' until the
// run loop has executed once.
var startTime: UInt64?
var frameID: Int = .zero

// Enter the run loop.
application.run { renderTarget in
  frameID += 1

  // Start the command encoder.
  let commandQueue = application.gpuContext.commandQueue
  let commandBuffer = commandQueue.makeCommandBuffer()!
  let encoder = commandBuffer.makeComputeCommandEncoder()!

  // Bind the buffers.
  do {
    // Uploads the fractional part of 'time' as a 32-bit float. Only the
    // fraction is kept, so the value stays in [0, 1) no matter how long the
    // application has been running.
    func setTime(_ time: Double, index: Int) {
      let fractionalTime = time - floor(time)
      var time32 = Float(fractionalTime)
      encoder.setBytes(
        &time32, length: MemoryLayout<Float>.size, index: index)
    }

    // time0: wall-clock time elapsed since the first frame.
    if let startTime {
      let currentTime = mach_continuous_time()
      let elapsedTicks = currentTime - startTime
      let timeSeconds = Double(elapsedTicks) * secondsPerMachTick
      setTime(timeSeconds, index: 0)
    } else {
      startTime = mach_continuous_time()
      setTime(Double.zero, index: 0)
    }

    // time1: time derived from the application's frame counter and the
    // display's frame rate.
    let clock = application.clock
    let timeInFrames = clock.frames
    let framesPerSecond = application.display.frameRate
    let timeInSeconds = Double(timeInFrames) / Double(framesPerSecond)
    setTime(timeInSeconds, index: 1)

    // time2: always zero, so the third band never moves.
    setTime(Double.zero, index: 2)
  }

  // Bind the textures.
  encoder.setTexture(renderTarget, index: 0)

  // Dispatch one thread per pixel of the render target.
  do {
    encoder.setComputePipelineState(renderPipeline)
    let width = Int(renderTarget.width)
    let height = Int(renderTarget.height)
    encoder.dispatchThreads(
      MTLSize(width: width, height: height, depth: 1),
      threadsPerThreadgroup: MTLSize(width: 8, height: 8, depth: 1))
  }

  // End the command encoder.
  encoder.endEncoding()
  commandBuffer.commit()
}
#endif
#if os(Windows)
// I want to achieve 'hello world' for vector addition.
//
// Issues:
// - How to create resources
// - How to create pipelines
// - What the heck is going on with descriptors
// - What the heck is going on with root signatures
// - How to bind resources to commands
// - How to dispatch GPU threads
// - What the heck is going on with resource state (transitions)
// - How to test the results of GPU execution
//
// The above notes are the start of a brainstorming session about how to
// approach this goal in practice. Next, I must elaborate on and refactor the
// ideas. I've started by purging this repository of the C++ translations.
// ## Initiation
//
// My goal is to execute a proof of concept compute-only workflow on the GPU.
// I will create three UAV buffers of type FP32. They will be read and written
// from GPU-native memory during a compute shader. The CPU will supply input
// data and test the output data.
//
// Additional requirements:
// - Correct/conventional usage of the DirectX 12 API
// - No memory leaks from mishandling COM objects
// - Shader is compiled entirely at runtime
// - Root signature is specified in HLSL, not on the CPU
// - Resources are bound in separate 'root descriptor' entries
// - Resources are not sub-resources of another resource
// - All objects for encoding commands are regenerated for each command list
//
// Resource states:
// - 'COPY_DEST' while moving from CPU -> GPU
// - 'UAV' while executing the compute shader
// - 'COPY_SRC' while moving from GPU -> CPU
// - There are no constant buffers or inlined 32-bit constants.
//
// Additional small details:
// - Compile the shader with the SM 6.5 target.
// - Dispatch 128 threads per group.
// - Each buffer is 1024 elements.
// - First input is 0 to 1023, in ascending order.
// - Second input is 1024 to 2047, in ascending order.
// - Report the results for the first 10 entries explicitly.
// - To cover the remaining entries, count the number that did/didn't match
// results of an analytical formula.
// - In root signature v1.1, the UAV's flag is 'DATA_VOLATILE' by default.
// Obtain the D3D12 debug interface and enable the debug layer. This happens
// before 'DirectXDevice()' is constructed further down in this file, matching
// the "enable the debug layer, then create the device" order described in the
// notes near the end of the file.
let debugInterface: SwiftCOM.ID3D12Debug =
try! D3D12GetDebugInterface()
try! debugInterface.EnableDebugLayer()
// ## First Step
//
// Author the HLSL shader. Then, modify the DXCWrapper utility to provide the
// compiled blob.
// HLSL source for the vector addition kernel.
//
// - Three 'RWStructuredBuffer<float>' resources are bound at registers u0, u1,
//   and u2.
// - The root signature is declared inside the shader as the 'mainRS' macro,
//   with one 'UAV(...)' entry per buffer.
// - Each thread group has 128 threads. A thread reads one element from buffer0
//   and buffer1, and writes their sum to the same slot of buffer2.
//
// The '\\' sequences are Swift escapes for a single backslash, so the HLSL
// compiler sees a line continuation at the end of each '#define' line.
let shaderSource: String = """
RWStructuredBuffer<float> buffer0 : register(u0);
RWStructuredBuffer<float> buffer1 : register(u1);
RWStructuredBuffer<float> buffer2 : register(u2);
#define mainRS "UAV(u0), " \\
"UAV(u1), " \\
"UAV(u2)"
[numthreads(128, 1, 1)]
[RootSignature(mainRS)]
void main(
uint3 tid : SV_DispatchThreadID
) {
uint slotID = tid.x;
float input0 = buffer0[slotID];
float input1 = buffer1[slotID];
float output = input0 + input1;
buffer2[slotID] = output;
}
"""
// Create the device and compile the shader at runtime. The result is used
// below through its 'rootSignature' and 'object' members.
let device = DirectXDevice()
let compiler = Compiler(device: device)
let shaderBytecode = compiler.compile(source: shaderSource)
// Retrieve the info queue from the device. The queue is not created directly;
// it is obtained by querying the device for the 'ID3D12InfoQueue' interface.
var infoQueue: SwiftCOM.ID3D12InfoQueue
do {
  let interface = try! device.d3d12Device.QueryInterface(
    iid: SwiftCOM.ID3D12InfoQueue.IID)
  infoQueue = .init(pUnk: interface)
}

// Request a break on every message severity, from most to least severe.
for severity in [
  D3D12_MESSAGE_SEVERITY_CORRUPTION,
  D3D12_MESSAGE_SEVERITY_ERROR,
  D3D12_MESSAGE_SEVERITY_WARNING,
  D3D12_MESSAGE_SEVERITY_INFO,
  D3D12_MESSAGE_SEVERITY_MESSAGE,
] {
  try! infoQueue.SetBreakOnSeverity(severity, true)
}
// ## Second Step
//
// See whether I can jump directly to creating a PSO and root signature object.
import SwiftCOM
import WinSDK
// Create the root signature.
//
// The serialized root signature comes from the compiled shader
// ('shaderBytecode.rootSignature'), not from a CPU-side description. The
// optional is assigned inside the closure because the raw bytes are only
// accessible within 'withUnsafeBytes'.
var rootSignature: SwiftCOM.ID3D12RootSignature?
shaderBytecode.rootSignature.withUnsafeBytes { bufferPointer in
let d3d12Device = device.d3d12Device
// NOTE(review): the leading '0' is presumably the node mask - confirm against
// the SwiftCOM signature of 'CreateRootSignature'.
rootSignature = try! d3d12Device.CreateRootSignature(
0,
bufferPointer.baseAddress,
UInt64(bufferPointer.count))
}
// Unwrap the optional so that later code can use a non-optional value.
guard let rootSignature else {
fatalError("Could not create root signature.")
}
// Create the pipeline state.
//
// The compute pipeline descriptor is filled with the compiled shader object,
// the root signature created above, and an empty cached PSO. The optional is
// assigned inside the closure because the shader bytes are only accessible
// within 'withUnsafeBytes'.
var pipelineState: SwiftCOM.ID3D12PipelineState?
shaderBytecode.object.withUnsafeBytes { bufferPointer in
// Point the descriptor at the compiled shader bytes.
var computeShader = D3D12_SHADER_BYTECODE()
computeShader.pShaderBytecode = bufferPointer.baseAddress
computeShader.BytecodeLength = UInt64(bufferPointer.count)
// No cached pipeline blob is supplied.
var cachedPipelineState = D3D12_CACHED_PIPELINE_STATE()
cachedPipelineState.pCachedBlob = nil
cachedPipelineState.CachedBlobSizeInBytes = 0
var computePipelineStateDesc = D3D12_COMPUTE_PIPELINE_STATE_DESC()
// Extract the raw COM pointer of the root signature for the C struct.
// NOTE(review): the pointer is stored beyond the 'perform' closure; this
// presumably stays valid because 'rootSignature' is still alive - confirm.
try! rootSignature.perform(
as: WinSDK.ID3D12RootSignature.self
) { pUnk in
computePipelineStateDesc.pRootSignature = pUnk
}
computePipelineStateDesc.CS = computeShader
computePipelineStateDesc.NodeMask = 0
computePipelineStateDesc.CachedPSO = cachedPipelineState
computePipelineStateDesc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE
// Create the pipeline and wrap the returned pointer in the SwiftCOM type.
let d3d12Device = device.d3d12Device
var iid = SwiftCOM.ID3D12PipelineState.IID
let pUnk = try! d3d12Device.CreateComputePipelineState(
&computePipelineStateDesc, &iid)
pipelineState = SwiftCOM.ID3D12PipelineState(
pUnk: pUnk)
}
// Unwrap the optional so that later code can use a non-optional value.
guard let pipelineState else {
fatalError("Could not create pipeline state.")
}
// ## Third Step
//
// Create buffer objects and test the API for accessing mapped pointers.
// Fill the descriptor properties common to all buffers.
// One descriptor is reused for every buffer. Only the 'type' field changes
// between groups, so the order of the statements below matters.
var bufferDesc = BufferDescriptor()
bufferDesc.device = device
// 1024 elements of 4 bytes each (FP32).
bufferDesc.size = 1024 * 4
// Create the input buffers.
// These are written by the CPU through 'write(input:)' further down.
bufferDesc.type = .input
let inputBuffer0 = Buffer(descriptor: bufferDesc)
let inputBuffer1 = Buffer(descriptor: bufferDesc)
// Create the native buffers.
// These are the source/destination of the GPU-side copy commands.
bufferDesc.type = .native
let nativeBuffer0 = Buffer(descriptor: bufferDesc)
let nativeBuffer1 = Buffer(descriptor: bufferDesc)
let nativeBuffer2 = Buffer(descriptor: bufferDesc)
// Create the output buffers.
// This one is read by the CPU through 'read(output:)' further down.
bufferDesc.type = .output
let outputBuffer2 = Buffer(descriptor: bufferDesc)
// Generate the input data for the shader.
//
// The first input holds 0...1023 and the second holds 1024...2047, both in
// ascending order.
do {
  let inputData0: [Float] = (0..<1024).map { Float($0) }
  let inputData1: [Float] = (0..<1024).map { 1024 + Float($0) }

  // Upload each array into its input buffer.
  inputData0.withUnsafeBytes { rawBuffer in
    inputBuffer0.write(input: rawBuffer.baseAddress!)
  }
  inputData1.withUnsafeBytes { rawBuffer in
    inputBuffer1.write(input: rawBuffer.baseAddress!)
  }
}
// Read the initial contents of the output buffer.
do {
  // Pre-fill the CPU-side array with an ascending ramp, so a successful read
  // visibly overwrites the previous contents of the allocation.
  var outputData2: [Float] = (0..<1024).map { Float($0) }
  outputData2.withUnsafeMutableBytes { rawBuffer in
    outputBuffer2.read(output: rawBuffer.baseAddress!)
  }

  // Only the first 10 entries are inspected.
  for value2 in outputData2.prefix(10) {
    guard value2 == 0 else {
      fatalError("Output buffer was initialized to nonzero value.")
    }
  }
}
// Next steps:
// - Test the code for reading/writing mapped pointers. Set the input data to
// an increasing list of floating point numbers. Study the results of reading
// from the output buffer. It should at least overwrite the previous contents
// of the CPU memory allocation.
// - Redefine the "3rd step" and "4th step". The third step is shortened to just
// summarize what we've done above. The fourth step is to create a command
// queue, command list, and set up the resources for copying.
// - "Hello world" will come from shifting the data through various buffers.
// The output will match either buffer0 or buffer1 (of my choosing). The copy
// commands must pass through a GPU private buffer(s) as an intermediate.
// - After that is done, proceed with the descriptors necessary to bind UAVs to
// a compute command.
// ## Fourth Step
//
// Set up the command queue, command list, and anything else needed for copying
// buffers.
// List all the components needed to make this happen:
// - ID3D12CommandQueue
// - ID3D12CommandList
// - ID3D12CommandAllocator
// - ID3D12Fence
// - Windows OS event
//
// References to get started:
// - My worked examples of C++ translations
// - DirectX tutorials
// - Microsoft's online documentation
//
// Describe everything that will happen in the procedure, qualitatively, in
// chronological order. At the moment, we don't actually know the correct
// chronological order.
//
// - Create the command queue
// - Create the command allocator
// - Create the command list from the command allocator
// - Close the command allocator and command list
//
// - Bind the buffers
// - Encode the copy commands
// - Encode the fence signaling
// - Wait on the fence on the CPU
// - Read the contents of the output buffer
//
// - Copy commands:
// - Copy inputBuffer0 to nativeBuffer0
// - Copy inputBuffer1 to nativeBuffer1
// - Copy either [nativeBuffer0, nativeBuffer1] to nativeBuffer2
// - Copy nativeBuffer2 to outputBuffer2
//
// Regarding the command queue/list/fence, what object creates what?
// - ID3D12Device
// - ID3D12CommandQueue
// - ID3D12CommandAllocator
// - ID3D12CommandList
// - ID3D12Fence
// - CreateEventA
// - HANDLE
// Let's start with a simple deliverable:
// - Create the above objects, without using utility classes.
// - Dispatch an empty command buffer.
// - Close or clean up the objects.
// `ID3D12CommandQueue.ExecuteCommandLists` is like `MTLCommandBuffer.commit` in
// Metal applications. It sends commands to the GPU.
//
// Fences are similar to `MTLCommandBuffer.waitUntilCompleted` and
// `DispatchSemaphore` in Metal applications. They wait until a specific command
// buffer has completed. In Metal, one of the functions can facilitate triple-
// buffering without retaining a reference to the command buffer.
//
// API for quickly freezing the queue until all commands have finished, and it
// is safe to read contents from the CPU:
// - Metal: commandBuffer.waitUntilCompleted()
// - DirectX: immediately create, signal, and wait on a fence
//
// API for triple buffering:
// - Metal: DispatchSemaphore and commandBuffer.setCompletedHandler()
// - DirectX: increment a fence counter after an entire frame, remember the
// counter's value until a future frame that needs a resource
//
// Both APIs require an entire command list to be committed before waiting on
// a chunk of GPU work. It's not clear at what granularity you can gather
// execution latency data.
//
// MTLSharedEvent has similarities to ID3D12Fence. Especially the method
// `MTLSharedEvent.wait(untilSignaledValue:timeoutMS:)`. It is virtually
// identical to `WaitForSingleObject` on Windows.
//
// I don't know whether there's a Windows API for callbacks, similar to the Mac
// paradigm of using semaphores.
//
// I don't know whether using MTLSharedEvent causes performance issues on Mac.
// For the time being, we don't actually need to worry about triple-buffering
// of resources. The Mac side of the new codebase hasn't gotten that far yet.
// So just use fences as a means to immediately stall until a command buffer
// has completed.
//
// Option 1:
// - Every command buffer gets a unique ID, monotonically increasing from when
// the command queue was first created.
// - You can wait for GPU work at the granularity of previous command buffers.
// So, asynchronous compute.
//
// Option 2:
// - The command queue has an internal fence + event object created once at
// initialization.
// - Every instance of CPU-side stalling blocks at the latest command dispatched
// on that specific queue.
//
// Choose option 2.
// Another concern is the ability to profile GPU command execution time. In
// DirectX 12, ID3D12GraphicsCommandList.BeginQuery cannot be called on a
// timestamp query. Instead, call `EndQuery`.
//
// Source: https://pavelsmejkal.net/Posts/
//
// The DX12 'Query' paradigm for measuring time looks similar to the Metal
// 'MTLCounterSampleBuffer' paradigm. There is an additional step, where one
// must store timestamps in a special buffer. Not as easy as the Metal API for
// retrieving the '.gpuStartTime' and '.gpuEndTime' of a command buffer.
//
// In both APIs, you must be careful about the step size of timestamp counters.
// On Mac, it could be Mach absolute time (24 MHz) instead of nanoseconds.
// On Windows, you must call `ID3D12CommandQueue.GetTimestampFrequency`.
//
// One difference might be that Windows allows finer granularity of timestamp
// sampling. On Mac, `.gpuStartTime` and `.gpuEndTime` are scoped to the entire
// command buffer. The counter sample buffers API looks scoped to an entire
// compute command encoder, which has just as much latency as creating a new
// command buffer. Windows might allow finer granularity, because you can
// inject timestamps at any point within the command list. Including between
// subsequent compute commands, without a severe latency penalty.
//
// For the time being, neglect the ability to profile GPU-side execution time.
// ## Fourth Step (2nd Iteration)
//
// Create an ergonomic API for generating and waiting on empty GPU command
// buffers. Design the API with the intent to wrap a Metal backend in the
// future.
// - CommandQueue utility class
// - 'flush' member function
// - Windows: increment the fence counter, use a fence created when the
// command queue initializes
// - Mac: store a reference to the latest command buffer submitted to the
// command queue
// - CommandBuffer utility class
// - 'commit' member function
// - Sends the command list to the command queue (perhaps this member
// belongs in CommandQueue, and there is no utility class for
// CommandBuffer).
// - Closes the command list.
// - No analogue to Metal 'waitUntilCompleted'
//
// To ease the prototyping process, just create an API for 'CommandQueue'. It
// creates and commits instances of 'ID3D12GraphicsCommandList'. The creation
// method abstracts away the 'ID3D12CommandAllocator'. The commit method
// abstracts away both 'commandList.Close()' and 'commandQueue.
// ExecuteCommandLists()'. The flush method works as described above.
// Smoke-test the 'CommandQueue' wrapper: flush twice with nothing submitted,
// then commit an empty command list and flush again.
let commandQueue = CommandQueue(device: device)
commandQueue.flush()
commandQueue.flush()
// Encapsulate the command buffer in a scope, so it can deinitialize itself.
//
// There is currently a problem with naming things both 'command buffer' and
// 'command list'. I have no solution at the moment.
do {
let commandList = commandQueue.createCommandList()
commandQueue.commit(commandList)
}
commandQueue.flush()
// ## Fifth Step
//
// Encode the copy commands into the command list.
//
// This command list records every copy command below and is committed once,
// after the last copy has been encoded.
let commandList = commandQueue.createCommandList()
// To start, copy inputBuffer0 to nativeBuffer0.
//
// Components of this task:
// - Identify methods of 'ID3D12(Graphics)CommandList' that bind the buffers
// to the 'src' or 'dst' slots of a copy operation.
// - Identify the DirectX APIs for changing resource states.
// - Acknowledge the state of each buffer prior to the transition (or don't).
//
// Where to start: the 3DGEP tutorial series.
// - D3D12_RESOURCE_BARRIER
// - D3D12_RESOURCE_BARRIER_TYPE
// - Don't want to use '_TRANSITION', because we're not splitting individual
// resources into subresources.
// - Don't think '_UAV' applies to copy commands.
// - D3D12_RESOURCE_BARRIER_FLAGS
// - The '_BEGIN_ONLY' and '_END_ONLY' flags seem strange.
// - union of the 3 possible types
// - '_UAV_BARRIER' has the simplest data structure, just a pointer to the
// resource.
// - '_TRANSITION_BARRIER' also makes sense. I would use the 0xFFFFFFFF
// flag because we don't have subresources (?).
// - D3D12_RESOURCE_STATES
// - Many members of 'ResourceStateTracker' just invoke the same method,
// 'ResourceBarrier'.
// - Microsoft documentation encourages batching multiple resource barriers
// into a single call. If both transitions are scoped to one utility
// function for copying buffers, I can write the boilerplate code for this.
// No need to design a general API for easing the creation of barrier objects.
//
// Call: d3d12CommandList.ResourceBarrier(numBarriers, resourceBarriers.data())
// - 'D3D12_RESOURCE_BARRIER' is a value type, not a COM reference type. This
// fact makes barriers easier to aggregate and send through a C interface.
//
// Based on the 3DGEP tutorial series, we might have to know the resource's
// prior/current state to encode a barrier. This makes things more complicated;
// we must implement state tracking and carry it around everywhere.
//
// For the 'hello world' demonstration, we can ignore the state tracking. We
// know every resource's specific state ahead of time. It becomes an issue
// when we create an API that generalizes to arbitrary code. Something to
// possibly defer to after the 'hello world' demonstration.
// I may have figured out resource state transitions. Next, figure out the
// DirectX API function that encodes the copy command. And whether it requires
// additional calls to bind buffers to slots.
// ## Copy command(s) in the DirectX 12 API
//
// Object that calls the member functions: ID3D12GraphicsCommandList
//
// Member: CopyBufferRegion(ID3D12Resource *pDstBuffer,
// UINT64 DstOffset,
// ID3D12Resource *pSrcBuffer,
// UINT64 SrcOffset,
// UINT64 NumBytes)
//
// Member: CopyResource(ID3D12Resource *pDstResource,
// ID3D12Resource *pSrcResource)
//
// Member: CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION *pDst,
// UINT DstX,
// UINT DstY,
// UINT DstZ,
// const D3D12_TEXTURE_COPY_LOCATION *pSrc,
// const D3D12_BOX *pSrcBox)
//
// Member: CopyTiles(ID3D12Resource *pTiledResource,
// const D3D12_TILED_RESOURCE_COORDINATE *pTileRegionStartCoordinate,
// const D3D12_TILE_REGION_SIZE *pTileRegionSize,
// ID3D12Resource *pBuffer,
// UINT64 BufferStartOffsetInBytes,
// D3D12_TILE_COPY_FLAGS Flags)
// I think I have figured out the copy command. Next, document the DirectX 12
// API functions associated with 'TransitionBarrier' and 'TrackResource'.
// ## ResourceStateTracker::FlushResourceBarriers
//
// Takes a 'CommandList' helper class as an argument. References a list of
// resource barrier objects. These objects are in fact value types, making the
// code easier to implement. Invokes the 'ResourceBarrier' method of
// 'ID3D12GraphicsCommandList' with the barrier count and barrier pointer.
// Deletes all entries in the barrier list.
//
// ## CommandList::TransitionBarrier
//
// Takes a reference to the 'ID3D12Resource'. Pretends the initial state is
// 'COMMON'. Sets the final state to the specified state. Appends the newly
// created 'D3D12_RESOURCE_BARRIER' value type to the list.
//
// ## CommandList::TrackResource
//
// Takes an 'ID3D12Resource' as an argument. Casts it to 'ID3D12Object' and
// appends it to an internal list.
// Finally, the whole source code snippet where 3DGEP performed a copy
// operation. This is a high-level guide for how to proceed with coding a copy
// operation in DirectX.
//
// void CommandList::CopyResource( Resource& dstRes, const Resource& srcRes )
// {
// TransitionBarrier( dstRes, D3D12_RESOURCE_STATE_COPY_DEST );
// TransitionBarrier( srcRes, D3D12_RESOURCE_STATE_COPY_SOURCE );
//
// FlushResourceBarriers();
//
// m_d3d12CommandList->CopyResource( dstRes.GetD3D12Resource().Get(), srcRes.GetD3D12Resource().Get() );
//
// TrackResource(dstRes);
// TrackResource(srcRes);
// }
// The task has now been specified in enough detail that I can do it.
//
// Or not. What's going on with COMMON?
// - Does the 3DGEP tutorial correct the pending commands, replacing the
// 'COMMON' placeholder with the true value?
// - If performance is not a concern, is it ideal to post-transition every
// single resource back to 'COMMON' after every command?
//
// Ignore the common state. I think it skips calls to 'ResourceBarrier'
// entirely. Instead, start by specifying all the transitions that ought to
// occur throughout all 3 buffers.
// Initial states:
// inputBuffer0 - GENERIC_READ
// inputBuffer1 - GENERIC_READ
// nativeBuffer0 - COMMON
// nativeBuffer1 - COMMON
// nativeBuffer2 - COMMON
// outputBuffer2 - COPY_DEST
//
// Ideal states for a copy command:
// input - GENERIC_READ
// output - COPY_DEST
//
// Copy commands:
// - inputBuffer0 -> nativeBuffer0
// - inputBuffer0: GENERIC_READ -> GENERIC_READ [omitted]
// - nativeBuffer0: COMMON -> COPY_DEST
// - inputBuffer1 -> nativeBuffer1
// - inputBuffer1: GENERIC_READ -> GENERIC_READ [omitted]
// - nativeBuffer1: COMMON -> COPY_DEST
// - nativeBuffer0 -> nativeBuffer2
// - nativeBuffer0: COPY_DEST -> COPY_SOURCE
// - nativeBuffer2: COMMON -> COPY_DEST
// - nativeBuffer2 -> outputBuffer2
// - nativeBuffer2: COPY_DEST -> COPY_SOURCE
// - outputBuffer2: COPY_DEST -> COPY_DEST [omitted]
// Hand-written prototype of a transition barrier (COMMON -> COPY_DEST) for
// nativeBuffer0. The resource pointer is left as 'nil', and the variable is
// not referenced again in the code shown in this file; the barriers that are
// actually encoded come from 'createBarrier' below.
var barrier00 = D3D12_RESOURCE_BARRIER()
barrier00.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION
barrier00.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE
barrier00.Transition.pResource = nil // nativeBuffer0
barrier00.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES
barrier00.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON
barrier00.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST
// Next, create a utility function to minimize the boilerplate of creating
// barrier structs multiple times.
// Helper function for creating barriers.
/// Creates a transition barrier that covers every subresource of a resource.
///
/// - Parameters:
///   - resource: The resource whose state changes.
///   - stateBefore: The state the resource is in before the barrier.
///   - stateAfter: The state the resource is in after the barrier.
/// - Returns: A barrier value of type `TRANSITION` with no flags set.
func createBarrier(
  resource: SwiftCOM.ID3D12Resource,
  stateBefore: D3D12_RESOURCE_STATES,
  stateAfter: D3D12_RESOURCE_STATES
) -> D3D12_RESOURCE_BARRIER {
  var barrier = D3D12_RESOURCE_BARRIER()
  barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION
  barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE

  // Fill in the states and the subresource selector.
  barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES
  barrier.Transition.StateBefore = stateBefore
  barrier.Transition.StateAfter = stateAfter

  // The C struct needs the raw COM pointer, which is only reachable through
  // 'perform(as:)'.
  try! resource.perform(as: WinSDK.ID3D12Resource.self) { rawResource in
    barrier.Transition.pResource = rawResource
  }
  return barrier
}
// Test out the utility function. The barrier is built and then discarded; it
// is never encoded into a command list.
do {
  _ = createBarrier(
    resource: nativeBuffer0.d3d12Resource,
    stateBefore: D3D12_RESOURCE_STATE_COMMON,
    stateAfter: D3D12_RESOURCE_STATE_COPY_DEST)
}
// Next, encode a full copy command onto the command list. Commit the command
// list onto the command queue, then wait until it has completed. Verify that
// the code doesn't crash.
// Copy command: inputBuffer0 -> nativeBuffer0
do {
  // The destination starts in COMMON and must be in COPY_DEST to receive the
  // copy. No barrier is encoded for the source.
  let barriers: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer0.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COMMON,
      stateAfter: D3D12_RESOURCE_STATE_COPY_DEST)
  ]
  try! commandList.ResourceBarrier(UInt32(barriers.count), barriers)

  // Arguments are (destination, source).
  try! commandList.CopyResource(
    nativeBuffer0.d3d12Resource,
    inputBuffer0.d3d12Resource)
}
// Copy command: inputBuffer1 -> nativeBuffer1
do {
  // The destination starts in COMMON and must be in COPY_DEST to receive the
  // copy. No barrier is encoded for the source.
  let barriers: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer1.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COMMON,
      stateAfter: D3D12_RESOURCE_STATE_COPY_DEST)
  ]
  try! commandList.ResourceBarrier(UInt32(barriers.count), barriers)

  // Arguments are (destination, source).
  try! commandList.CopyResource(
    nativeBuffer1.d3d12Resource,
    inputBuffer1.d3d12Resource)
}
// Copy command: nativeBuffer0 -> nativeBuffer2
do {
  // nativeBuffer0 was a copy destination and now becomes a copy source.
  // nativeBuffer2 is used for the first time, so it leaves COMMON. Both
  // transitions are submitted in a single 'ResourceBarrier' call.
  let barriers: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer0.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COPY_DEST,
      stateAfter: D3D12_RESOURCE_STATE_COPY_SOURCE),
    createBarrier(
      resource: nativeBuffer2.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COMMON,
      stateAfter: D3D12_RESOURCE_STATE_COPY_DEST),
  ]

  // Print the stored message count around each call, to see whether the
  // debug layer reported anything.
  print("start:", try! infoQueue.GetNumStoredMessages())
  try! commandList.ResourceBarrier(UInt32(barriers.count), barriers)
  print("end:", try! infoQueue.GetNumStoredMessages())

  print("start:", try! infoQueue.GetNumStoredMessages())
  // Arguments are (destination, source).
  try! commandList.CopyResource(
    nativeBuffer2.d3d12Resource,
    nativeBuffer0.d3d12Resource)
  print("end:", try! infoQueue.GetNumStoredMessages())
}
// Copy command: nativeBuffer2 -> outputBuffer2
do {
  // nativeBuffer2 was a copy destination and now becomes a copy source. No
  // barrier is encoded for the output buffer.
  let barriers: [D3D12_RESOURCE_BARRIER] = [
    createBarrier(
      resource: nativeBuffer2.d3d12Resource,
      stateBefore: D3D12_RESOURCE_STATE_COPY_DEST,
      stateAfter: D3D12_RESOURCE_STATE_COPY_SOURCE)
  ]

  // Print the stored message count around the barrier call, to see whether
  // the debug layer reported anything.
  print("start:", try! infoQueue.GetNumStoredMessages())
  try! commandList.ResourceBarrier(UInt32(barriers.count), barriers)
  print("end:", try! infoQueue.GetNumStoredMessages())

  // Arguments are (destination, source).
  try! commandList.CopyResource(
    outputBuffer2.d3d12Resource,
    nativeBuffer2.d3d12Resource)
}
// Run the commands on the GPU, then block until the queue has been flushed.
commandQueue.commit(commandList)
commandQueue.flush()

// Check the data in the output buffer.
do {
  // Pre-fill the CPU-side array with an ascending ramp before reading back.
  var outputData2: [Float] = (0..<1024).map { Float($0) }
  outputData2.withUnsafeMutableBytes { rawBuffer in
    outputBuffer2.read(output: rawBuffer.baseAddress!)
  }

  // Report the first 10 entries.
  for (slotID, value2) in outputData2.prefix(10).enumerated() {
    print("outputBuffer[\(slotID)] = \(value2)")
  }
}
// ## Sixth Step
//
// Activate the debug layer in DirectX. Try omitting all resource barriers.
// Mess with the commands by using 'COMMON' as the source state. Either the
// code should crash, or the output buffer should have incorrect data.
//
// After this step is complete, I should purge the 'main' file to a gist.
// Incorporate resource state tracking into the 'Buffer' API, and write new
// tests for this functionality from scratch. Activate the debug layer by
// default, possibly storing an info queue inside the 'DirectXDevice'.
//
// After that task, I can return to focusing on a 'hello world' compute shader.
// ## Known Information about the Debug Layer
//
// Most commands in the DirectX spec have specific errors related to the debug
// layer. Without the debug layer, these errors don't trigger. One particularly
// relevant error involves resource barriers.
//
// The first 3DGEP tutorial mentioned and invoked the debug layer.
//
// The DirectX spec doesn't reference the 'ID3D12Debug' object directly.
// Microsoft's online documentation doesn't describe it in much detail, either.
// This makes it difficult to understand the purpose and/or how to use the
// debug layer.
//
// There are 7 iterations of the 'ID3D12Debug' interface. Some other interfaces,
// such as 'ID3D12DebugCommandList' and 'ID3D12DebugDevice', do not inherit
// from the non-debug versions. This fact differs from the Metal API design,
// where debug versions of API objects conform to the same protocol as the
// vanilla objects.
//
// The DirectX debug layer is different from the Metal validation layer. I am
// unfamiliar with the purpose of all the objects. Therefore, I am going to read
// up on how to instantiate each one.
// ## ID3D12Debug
//
// This looks like the only component of the API represented in SwiftCOM.
// Therefore, I probably won't actually use any of the other interfaces.
//
// Initializer: D3D12GetDebugInterface()
//
// Instance method: EnableDebugLayer()
//
// ## ID3D12InfoQueue
//
// Most instance members are ported in SwiftCOM. It is created by calling
// 'IUnknown::QueryInterface' on 'ID3D12Device', which sounds like a strange
// way to create an object. Have I seen this before?
//
// ## IDXGIDebug
//
// Not part of the 'direct3d-12-sdklayers-interfaces' document, but potentially
// relevant.
//
// Initializer: DXGIGetDebugInterface1()
//
// Instance method: ReportLiveObjects(GUID, DXGI_DEBUG_RLO_FLAGS)
// Useful advice about how to initialize several debug interfaces:
// http://gamedev.net/forums/topic/672268-d3d12-debug-layers-how-to-get-id3d12debugdevice/5255763/
//
// Steps:
// 1) Enable the debug layer
// 2) Create the regular device, command queue, command list
// 3) Extract the debug versions of each object through 'QueryInterface'
//
// The QueryInterface path might mean a COM object conforms to multiple
// interfaces. It doesn't look like ID3D12DebugDevice inherits from
// ID3D12Device. It makes sense that you'd need a special technique to cast
// something between these two types.
// One potential reason the debug layer isn't working: need to use dxcpl
// https://nvidia.custhelp.com/app/answers/detail/a_id/5604/~/how-to-capture-d3d-debug-layer-logs-to-detect-application-or-runtime-bugs
//
// Another potential reason: debug logging occurs through D3D12InfoQueue.
// I'm going to try the latter before resorting to the former.
// Print the info queue's configuration and counters, one value per line, in
// the order of the calls below.
print(try! infoQueue.GetMessageCountLimit())
print(try! infoQueue.GetMuteDebugOutput())
print(try! infoQueue.GetNumMessagesAllowedByStorageFilter())
print(try! infoQueue.GetNumMessagesDeniedByStorageFilter())
print(try! infoQueue.GetNumMessagesDiscardedByMessageCountLimit())
print(try! infoQueue.GetNumStoredMessages())
print(try! infoQueue.GetNumStoredMessagesAllowedByRetrievalFilter())
// Display the stored messages.
//
// WARNING: per the author's note directly after this block, calling
// 'GetMessage' crashes at runtime. The loop body only executes when at least
// one message is stored, so the crash appears only when the debug layer has
// reported something.
do {
let messageCount = try! infoQueue.GetNumStoredMessages()
for messageID in 0..<messageCount {
// Printed before the call, so the log shows which message was being fetched
// if the process crashes.
print("Working on message \(messageID)")
let (message, size) =
try! infoQueue.GetMessage(messageID)
print("messages[\(messageID)] = (\(message), \(size))")
}
}
// I cannot retrieve the contents of a message. That causes a runtime crash,
// due to the 'GetMessage' C interface being messed up. Instead, just inspect
// 'GetNumStoredMessages()' after every API command that might trigger a debug
// layer error.
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment