How to synchronize CUDA with shared resource in RDG?

Hello everyone. I am developing a volumetric visualization library which involves various GP-GPU operations. The GP-GPU operations including parallel scanning/reduction, parallel sorting are not well implemented in UE yet, thus I want to call CUDA for help.
I design a RDG pipeline as:

  1. In Post Opaque Render Delegate, allocate FRHITexture with ETextureCreateFlags::Shared and ETextureCreateFlags::UAV. Register it as a RDG External Resource.
bool bNeedRegister = false;
if (!VolumeColorTexture.IsValid()
	|| VolumeColorTexture->GetDesc().Extent.X != VDBRendererParams.RenderResolutionX
	|| VolumeColorTexture->GetDesc().Extent.Y != VDBRendererParams.RenderResolutionY)
{
	VDBRenderer->Unregister();

	auto Desc =
		FRHITextureCreateDesc::Create2D(UE_SOURCE_LOCATION, VolumeRenderResolution, EPixelFormat::PF_R8G8B8A8);
	Desc.AddFlags(NeededTextureCreateFlags);
	VolumeColorTexture = RHICmdList.CreateTexture(Desc);
	bNeedRegister = true;
}

auto VolumeColorTextureRDG = RegisterExternalTexture(*GraphBuilder, VolumeColorTexture, UE_SOURCE_LOCATION);
  1. Register FRHITexture to CUDA through DX12 Native Resource. Here I follow the instructions in https://gamedev.stackexchange.com/questions/176812/how-can-i-write-a-d3d12-texture-in-cuda, but ignore using Semaphore and Fence, since I don’t know how to use them with UE.
// UE
OutVolumeColorTextureNative = VolumeColorTexture->GetNativeResource();
if (bNeedRegister)
{
	VDBRenderer->Register({ .Device = GDynamicRHI->RHIGetNativeDevice(),
		.InDepthTexture = InDepthTextureNative,
		.OutColorTexture = OutVolumeColorTextureNative });
}


// CUDA
TextureDesc = Texture->GetDesc();

{
	HANDLE SharedHandle;
	ThrowIfFailed(
		Device->CreateSharedHandle(Texture, NULL, GENERIC_ALL, NULL, &SharedHandle));

	D3D12_RESOURCE_ALLOCATION_INFO D3D12ResourceAllocationInfo;
	D3D12ResourceAllocationInfo =
		Device->GetResourceAllocationInfo(NodeMask, 1, &TextureDesc);

	cudaExternalMemoryHandleDesc ExternalMemoryHandleDesc{};
	ExternalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap;
	ExternalMemoryHandleDesc.handle.win32.handle = SharedHandle;
	ExternalMemoryHandleDesc.size = D3D12ResourceAllocationInfo.SizeInBytes;
	ExternalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
	CUDAVOLLIB_CHECK(cudaImportExternalMemory(&ExternalMemory, &ExternalMemoryHandleDesc));
	CloseHandle(SharedHandle);
}

cudaExternalMemoryMipmappedArrayDesc ExternalMmeoryMipmappedArrayDesc{};
ExternalMmeoryMipmappedArrayDesc.extent =
	make_cudaExtent(TextureDesc.Width, TextureDesc.Height, 0);
switch (TextureDesc.Format)
{
	case DXGI_FORMAT_R32_FLOAT:
		ExternalMmeoryMipmappedArrayDesc.formatDesc = cudaCreateChannelDesc<float>();
		break;
	case DXGI_FORMAT_R8G8B8A8_UNORM:
		ExternalMmeoryMipmappedArrayDesc.formatDesc = cudaCreateChannelDesc<uchar4>();
		break;
	default:
		assert(false, "Illegal texture format");
}
ExternalMmeoryMipmappedArrayDesc.numLevels = 1;
ExternalMmeoryMipmappedArrayDesc.flags = cudaArraySurfaceLoadStore;

cudaMipmappedArray_t MipmappedArray = nullptr;
CUDAVOLLIB_CHECK(cudaExternalMemoryGetMappedMipmappedArray(
	&MipmappedArray, ExternalMemory, &ExternalMmeoryMipmappedArrayDesc));

cudaArray_t Array = nullptr;
CUDAVOLLIB_CHECK(cudaGetMipmappedArrayLevel(&Array, MipmappedArray, 0));

cudaResourceDesc ResDesc{};
ResDesc.resType = cudaResourceTypeArray;
ResDesc.res.array.array = Array;
CUDAVOLLIB_CHECK(cudaCreateSurfaceObject(&SurfaceObject, &ResDesc));
  1. Create 2 RDG passes, pass 1 identifies VolumeColorTexture as an UAV, pass 2 identifies it as a texture. According to the source code of RDG, I notice that in this way RDG will execute pass 1 before pass 2.
// Pass 1 shader parameters (no shader here)
BEGIN_SHADER_PARAMETER_STRUCT(FBarrierShaderParameters, VOLRENDERER_API)
SHADER_PARAMETER_RDG_TEXTURE_UAV(RWTexture2D, OutVolumeColorTexture)
END_SHADER_PARAMETER_STRUCT()
... ...
// Pass 2 shader and parameters
class VOLRENDERER_API FCompositionCS : public FGlobalShader
{
public:
	... ...
	BEGIN_SHADER_PARAMETER_STRUCT(FParameters, VOLRENDERER_API)
	SHADER_PARAMETER(FIntPoint, RenderResolution)
	SHADER_PARAMETER_SAMPLER(SamplerState, ColorSamplerState)
	SHADER_PARAMETER_RDG_TEXTURE(Texture2D, InColorTexture)
	... ...
};
  1. In pass 1, I launch a CUDA Kernel to write the VolumeColorTexture. In pass 2, I blend the color in VolumeColorTexture to FPostOpaqueRenderParameters.SceneColorTexture.
// pass 1
auto VolumeColorTextureRDGUAV = GraphBuilder->CreateUAV(FRDGTextureUAVDesc(VolumeColorTextureRDG));
auto ShaderParams = GraphBuilder->AllocParameters<FBarrierShaderParameters>();
ShaderParams->InDepthTexture = DepthTextureRDG;
ShaderParams->OutVolumeColorTexture = VolumeColorTextureRDGUAV;

auto ShaderParametersMetadata = FBarrierShaderParameters::FTypeInfo::GetStructMetadata();
GraphBuilder->AddPass(RDG_EVENT_NAME("Volume Rendering"), ShaderParametersMetadata, ShaderParams,
	ERDGPassFlags::AsyncCompute | ERDGPassFlags::NeverCull,
	[DepthTexture = DepthTexture, VolumeColorTexture = VolumeColorTexture, VDBRenderer = VDBRenderer.get()](
		FRHICommandListImmediate& RHICmdList) { VDBRenderer->Render({ .bUseDepthBox = true }); });


// pass 1 in CUDA
if (!OutColorTexture)
	return;

dim3 ThreadPerBlock(16, 16, 1);
dim3 BlockPerGrid((RenderResolution.x + ThreadPerBlock.x - 1) / ThreadPerBlock.x,
	(RenderResolution.y + ThreadPerBlock.y - 1) / ThreadPerBlock.y);
ParallelFor(
	BlockPerGrid, ThreadPerBlock,
	[RenderResolution = RenderResolution, OutColorSurface = OutColorTexture->SurfaceObject] __device__(const glm::uvec3& DispatchThreadID) {
		if (DispatchThreadID.x >= RenderResolution.x
			&& DispatchThreadID.y >= RenderResolution.y)
			return;

		glm::vec4 Color(static_cast<float>(DispatchThreadID.x) / (RenderResolution.x - 1),
			static_cast<float>(DispatchThreadID.y) / (RenderResolution.y - 1), .5f, 5f);
		Color = glm::clamp(Color * 255.f, 0.f, 255.f);
		uchar4 ColorUCh4{ Color.r, Color.g, Color.b, Color.a };

		surf2Dwrite(ColorUCh4, OutColorSurface, sizeof(uchar4) * DispatchThreadID.x,
			DispatchThreadID.y);
	},
	Stream);

CUDAVOLLIB_CHECK(cudaStreamSynchronize(Stream));

By checking CUDA error, the first execution of CUDA Kernel suceesss while the second one failed with “cudaErrorLaunchFailure: unspecified launch failure”.

The reason of CUDA Error might come from no Synchronization being used between DX12 and CUDA, but I don’t know how to do that in a RDG style pipeline.