Hello everyone. I am developing a volumetric visualization library which involves various GP-GPU operations. The GP-GPU operations including parallel scanning/reduction, parallel sorting are not well implemented in UE yet, thus I want to call CUDA for help.
I design a RDG pipeline as:
- In Post Opaque Render Delegate, allocate FRHITexture with ETextureCreateFlags::Shared and ETextureCreateFlags::UAV. Register it as a RDG External Resource.
bool bNeedRegister = false;
if (!VolumeColorTexture.IsValid()
|| VolumeColorTexture->GetDesc().Extent.X != VDBRendererParams.RenderResolutionX
|| VolumeColorTexture->GetDesc().Extent.Y != VDBRendererParams.RenderResolutionY)
{
VDBRenderer->Unregister();
auto Desc =
FRHITextureCreateDesc::Create2D(UE_SOURCE_LOCATION, VolumeRenderResolution, EPixelFormat::PF_R8G8B8A8);
Desc.AddFlags(NeededTextureCreateFlags);
VolumeColorTexture = RHICmdList.CreateTexture(Desc);
bNeedRegister = true;
}
auto VolumeColorTextureRDG = RegisterExternalTexture(*GraphBuilder, VolumeColorTexture, UE_SOURCE_LOCATION);
- Register FRHITexture to CUDA through DX12 Native Resource. Here I follow the instructions in https://gamedev.stackexchange.com/questions/176812/how-can-i-write-a-d3d12-texture-in-cuda, but ignore using Semaphore and Fence, since I don’t know how to use them with UE.
// UE
OutVolumeColorTextureNative = VolumeColorTexture->GetNativeResource();
if (bNeedRegister)
{
VDBRenderer->Register({ .Device = GDynamicRHI->RHIGetNativeDevice(),
.InDepthTexture = InDepthTextureNative,
.OutColorTexture = OutVolumeColorTextureNative });
}
// CUDA
TextureDesc = Texture->GetDesc();
{
HANDLE SharedHandle;
ThrowIfFailed(
Device->CreateSharedHandle(Texture, NULL, GENERIC_ALL, NULL, &SharedHandle));
D3D12_RESOURCE_ALLOCATION_INFO D3D12ResourceAllocationInfo;
D3D12ResourceAllocationInfo =
Device->GetResourceAllocationInfo(NodeMask, 1, &TextureDesc);
cudaExternalMemoryHandleDesc ExternalMemoryHandleDesc{};
ExternalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap;
ExternalMemoryHandleDesc.handle.win32.handle = SharedHandle;
ExternalMemoryHandleDesc.size = D3D12ResourceAllocationInfo.SizeInBytes;
ExternalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
CUDAVOLLIB_CHECK(cudaImportExternalMemory(&ExternalMemory, &ExternalMemoryHandleDesc));
CloseHandle(SharedHandle);
}
cudaExternalMemoryMipmappedArrayDesc ExternalMmeoryMipmappedArrayDesc{};
ExternalMmeoryMipmappedArrayDesc.extent =
make_cudaExtent(TextureDesc.Width, TextureDesc.Height, 0);
switch (TextureDesc.Format)
{
case DXGI_FORMAT_R32_FLOAT:
ExternalMmeoryMipmappedArrayDesc.formatDesc = cudaCreateChannelDesc<float>();
break;
case DXGI_FORMAT_R8G8B8A8_UNORM:
ExternalMmeoryMipmappedArrayDesc.formatDesc = cudaCreateChannelDesc<uchar4>();
break;
default:
assert(false, "Illegal texture format");
}
ExternalMmeoryMipmappedArrayDesc.numLevels = 1;
ExternalMmeoryMipmappedArrayDesc.flags = cudaArraySurfaceLoadStore;
cudaMipmappedArray_t MipmappedArray = nullptr;
CUDAVOLLIB_CHECK(cudaExternalMemoryGetMappedMipmappedArray(
&MipmappedArray, ExternalMemory, &ExternalMmeoryMipmappedArrayDesc));
cudaArray_t Array = nullptr;
CUDAVOLLIB_CHECK(cudaGetMipmappedArrayLevel(&Array, MipmappedArray, 0));
cudaResourceDesc ResDesc{};
ResDesc.resType = cudaResourceTypeArray;
ResDesc.res.array.array = Array;
CUDAVOLLIB_CHECK(cudaCreateSurfaceObject(&SurfaceObject, &ResDesc));
- Create 2 RDG passes, pass 1 identifies VolumeColorTexture as an UAV, pass 2 identifies it as a texture. According to the source code of RDG, I notice that in this way RDG will execute pass 1 before pass 2.
// Pass 1 shader parameters (no shader here)
BEGIN_SHADER_PARAMETER_STRUCT(FBarrierShaderParameters, VOLRENDERER_API)
SHADER_PARAMETER_RDG_TEXTURE_UAV(RWTexture2D, OutVolumeColorTexture)
END_SHADER_PARAMETER_STRUCT()
... ...
// Pass 2 shader and parameters
class VOLRENDERER_API FCompositionCS : public FGlobalShader
{
public:
... ...
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, VOLRENDERER_API)
SHADER_PARAMETER(FIntPoint, RenderResolution)
SHADER_PARAMETER_SAMPLER(SamplerState, ColorSamplerState)
SHADER_PARAMETER_RDG_TEXTURE(Texture2D, InColorTexture)
... ...
};
- In pass 1, I launch a CUDA Kernel to write the VolumeColorTexture. In pass 2, I blend the color in VolumeColorTexture to FPostOpaqueRenderParameters.SceneColorTexture.
// pass 1
auto VolumeColorTextureRDGUAV = GraphBuilder->CreateUAV(FRDGTextureUAVDesc(VolumeColorTextureRDG));
auto ShaderParams = GraphBuilder->AllocParameters<FBarrierShaderParameters>();
ShaderParams->InDepthTexture = DepthTextureRDG;
ShaderParams->OutVolumeColorTexture = VolumeColorTextureRDGUAV;
auto ShaderParametersMetadata = FBarrierShaderParameters::FTypeInfo::GetStructMetadata();
GraphBuilder->AddPass(RDG_EVENT_NAME("Volume Rendering"), ShaderParametersMetadata, ShaderParams,
ERDGPassFlags::AsyncCompute | ERDGPassFlags::NeverCull,
[DepthTexture = DepthTexture, VolumeColorTexture = VolumeColorTexture, VDBRenderer = VDBRenderer.get()](
FRHICommandListImmediate& RHICmdList) { VDBRenderer->Render({ .bUseDepthBox = true }); });
// pass 1 in CUDA
if (!OutColorTexture)
return;
dim3 ThreadPerBlock(16, 16, 1);
dim3 BlockPerGrid((RenderResolution.x + ThreadPerBlock.x - 1) / ThreadPerBlock.x,
(RenderResolution.y + ThreadPerBlock.y - 1) / ThreadPerBlock.y);
ParallelFor(
BlockPerGrid, ThreadPerBlock,
[RenderResolution = RenderResolution, OutColorSurface = OutColorTexture->SurfaceObject] __device__(const glm::uvec3& DispatchThreadID) {
if (DispatchThreadID.x >= RenderResolution.x
&& DispatchThreadID.y >= RenderResolution.y)
return;
glm::vec4 Color(static_cast<float>(DispatchThreadID.x) / (RenderResolution.x - 1),
static_cast<float>(DispatchThreadID.y) / (RenderResolution.y - 1), .5f, 5f);
Color = glm::clamp(Color * 255.f, 0.f, 255.f);
uchar4 ColorUCh4{ Color.r, Color.g, Color.b, Color.a };
surf2Dwrite(ColorUCh4, OutColorSurface, sizeof(uchar4) * DispatchThreadID.x,
DispatchThreadID.y);
},
Stream);
CUDAVOLLIB_CHECK(cudaStreamSynchronize(Stream));
By checking CUDA error, the first execution of CUDA Kernel suceesss while the second one failed with “cudaErrorLaunchFailure: unspecified launch failure”.
The reason of CUDA Error might come from no Synchronization being used between DX12 and CUDA, but I don’t know how to do that in a RDG style pipeline.