Why Javascript faster then C++? XPBD Spatial Hash Grid from Mathias Müller

I’m following Mathias Müllers 10 minute physics Youtube video 11 - Finding collisions among thousands of objects blazing fast.
He’s a researcher at Nvidia, have published a lot of ground breaking articles. Extended Position Based Dynamics (XPBD) is one of them. He writes the code in Javascript so you can test it in yourself in your web browser. He has 13824 particles moving on the screen at the same time; colliding with the enviroment and each other. Thanks to spatial hashing he does so with 19ms per frame.

I’m writing the same code in C++, but not getting the same performance. What am I doing wrong?
If I calculate each particle movement and collision, but skip over updating their location, I get about 28ms. If I call the update location function for the instances I get 100ms (and the particles are quite blurry)?).


#pragma once

#include "CoreMinimal.h"
#include "GameFramework/Actor.h"
#include "XPBDBalls.generated.h"

class UXPBDMath;
class UXPBDHash;

UCLASS()
class XPBD_API AXPBDBalls : public AActor
{
    GENERATED_BODY()

public:
    AXPBDBalls();

    virtual void OnConstruction(const FTransform& Transform) override;

protected:
    virtual void BeginPlay() override;

public:
    virtual void Tick(float DeltaTime) override;

private:
    UXPBDMath* XMath;
    UPROPERTY() UXPBDHash* XHash;
    float SphereRadius;
    float Spacing;
    float VelRand;
    FColor BallColor;
    FColor ColisionColor;

    UPROPERTY(VisibleAnywhere) int32 NumBalls;
    UPROPERTY(EditAnywhere) bool bShowCollisions;
    UPROPERTY(EditAnywhere) int32 SubSteps;

    TArray<float> Gravity;
    TArray<int32> WorldBounds;

    TArray<float> Positions;
    TArray<float> Velocities;
    TArray<float> PreviousPositions;
    TArray<float> Normals;

    UPROPERTY(EditAnywhere, Category = MyLittleComp) UInstancedStaticMeshComponent* InstancedMeshComponent;

    void InitializeBalls();
    void UpdateMesh();
    void Simulate(float sdt);
    void InitializePhysics();
};


#include "XPBDBalls.h"
#include "Components/InstancedStaticMeshComponent.h"
#include "Engine/World.h"
#include "XPBDMath.h"
#include "XPBDHash.h"

AXPBDBalls::AXPBDBalls()
{
    PrimaryActorTick.bCanEverTick = true;

    InstancedMeshComponent = CreateDefaultSubobject<UInstancedStaticMeshComponent>(TEXT("InstancedMeshComponent"));
    RootComponent = InstancedMeshComponent;
    InstancedMeshComponent->SetMobility(EComponentMobility::Movable);
    InstancedMeshComponent->SetCollisionProfileName(UCollisionProfile::NoCollision_ProfileName);
    InstancedMeshComponent->SetGenerateOverlapEvents(false);
    InstancedMeshComponent->NumCustomDataFloats = 3;

    XHash = CreateDefaultSubobject<UXPBDHash>(TEXT("XPBDHash"));

    Gravity.Init(0.f, 3);
    //Gravity[2] = -980.f;
    WorldBounds = { -1000, 0, -1000, 1000, 2000, 1000 };

    SphereRadius = 25.f;
    Spacing = SphereRadius * 3;
    VelRand = 0.2;

    BallColor = FColor::White;
    ColisionColor = FColor::Red;
    bShowCollisions = true;
    SubSteps = 1;

    Normals.SetNumZeroed(3);

    InitializePhysics();
}

void AXPBDBalls::OnConstruction(const FTransform& Transform)
{
    Super::OnConstruction(Transform);
    InstancedMeshComponent->ClearInstances();
    InitializeBalls();
}

void AXPBDBalls::InitializeBalls()
{
    NumBalls = FMath::Floor(Positions.Num() / 3);
    XHash->InitializeHash(2.f * SphereRadius, NumBalls);
    if (InstancedMeshComponent->GetInstanceCount() == 0)
    {
        TArray<FTransform> InstanceTransform;
        InstanceTransform.SetNum(NumBalls);
        for (int32 i = 0; i < NumBalls; i++)
        {
            InstanceTransform[i].SetLocation(FVector(Positions[3 * i], Positions[3 * i + 1], Positions[3 * i + 2]));
        }
        InstancedMeshComponent->AddInstances(InstanceTransform, false, false, false);
        InstancedMeshComponent->MarkRenderInstancesDirty();
    }
    UE_LOG(LogTemp, Warning, TEXT("InitializeBalls: InstancedMeshComponent->GetInstanceCount() %d"), InstancedMeshComponent->GetInstanceCount());

    //UpdateMesh();
}

void AXPBDBalls::UpdateMesh()
{
    TArray<FTransform> InstanceTransform;
    InstanceTransform.SetNum(NumBalls);
    for (int32 i = 0; i < NumBalls; i++)
    {
        InstanceTransform[i].SetLocation(FVector(Positions[3 * i], Positions[3 * i + 1], Positions[3 * i + 2]));
    }
    InstancedMeshComponent->BatchUpdateInstancesTransforms(0, InstanceTransform, false, true, false);
    InstanceTransform.Empty();
}

void AXPBDBalls::Simulate(float sdt)
{
    float MinDist = 2.0f * SphereRadius;

    // Check if arrays are initialized
    if (Positions.Num() == 0 || Velocities.Num() == 0 || PreviousPositions.Num() == 0)
    {
        UE_LOG(LogTemp, Error, TEXT("Simulate: Arrays are not properly initialized."));
        UE_LOG(LogTemp, Error, TEXT("Simulate: Velocities.Num(): %d, Positions.Num(): %d, PreviousPositions.Num(): %d"), Velocities.Num(), Positions.Num(), PreviousPositions.Num());
        return;
    }
    //Integrate
    ParallelFor(NumBalls, [&](int32 i)
        {
            XMath->VecAdd(Velocities, i, Gravity, 0, sdt);
            XMath->VecCopy(PreviousPositions, i, Positions, i);
            XMath->VecAdd(Positions, i, Velocities, i, sdt);
        });
    //for (int i = 0; i < NumBalls; i++)
    //{
    //    XMath->VecAdd(Velocities, i, Gravity, 0, sdt);
    //    XMath->VecCopy(PreviousPositions, i, Positions, i);
    //    XMath->VecAdd(Positions, i, Velocities, i, sdt);
    //}
    XHash->Create(Positions);

    //Handle collision
    for (int32 i = 0; i < NumBalls; i++)
    {
        // Set colors if you want
        InstancedMeshComponent->SetCustomDataValue(i, 0, BallColor.R, false);
        InstancedMeshComponent->SetCustomDataValue(i, 1, BallColor.B, false);
        InstancedMeshComponent->SetCustomDataValue(i, 2, BallColor.G, true);

        //Does bellow loop never happen?
        // World Collision
        for (int32 Dim = 0; Dim < 3; Dim++)
        {
            int32 Nr = 3 * i + Dim;
            if (Positions[Nr] < (WorldBounds[Dim] + SphereRadius))
            {
                Positions[Nr] = WorldBounds[Dim] + SphereRadius;
                Velocities[Nr] = -Velocities[Nr];
                if (bShowCollisions)
                {
                    InstancedMeshComponent->SetCustomDataValue(i, 0, ColisionColor.R, false);
                    InstancedMeshComponent->SetCustomDataValue(i, 1, ColisionColor.B, false);
                    InstancedMeshComponent->SetCustomDataValue(i, 2, ColisionColor.G, true);
                }
            }
            else if (Positions[Nr] > (WorldBounds[Dim + 3] - SphereRadius))
            {
                Positions[Nr] = WorldBounds[Dim + 3] - SphereRadius;
                Velocities[Nr] = -Velocities[Nr];
                if (bShowCollisions)
                {
                    InstancedMeshComponent->SetCustomDataValue(i, 0, ColisionColor.R, false);
                    InstancedMeshComponent->SetCustomDataValue(i, 1, ColisionColor.B, false);
                    InstancedMeshComponent->SetCustomDataValue(i, 2, ColisionColor.G, true);
                }
            }
        }

        // Interball collision
        XHash->Query(Positions, i, SphereRadius * 2.f);

        for (int32 Nr = 0; Nr < XHash->QuerySize; Nr++)
        {
            int32 j = XHash->QueryIds[Nr];

            XMath->VecSetDiff(Normals, 0, Positions, i, Positions, j);
            float D2 = XMath->VecLengthSquared(Normals, 0);

            // Are the balls overlapping?
            if (D2 > 0.f && D2 < MinDist * MinDist)
            {
                float D = FMath::Sqrt(D2);
                XMath->VecScale(Normals, 0, 1.f / D);

                // Seperate the balls
                float Corr = (MinDist - D) * 0.5f;

                XMath->VecAdd(Positions, i, Normals, 0, Corr);
                XMath->VecAdd(Positions, j, Normals, 0, -Corr);

                // Reflect velocities along normal
                float Vi = XMath->VecDot(Velocities, i, Normals, 0);
                float Vj = XMath->VecDot(Velocities, j, Normals, 0);

                XMath->VecAdd(Velocities, i, Normals, 0, Vj - Vi);
                XMath->VecAdd(Velocities, j, Normals, 0, Vi - Vj);

                if (bShowCollisions)
                {
                    InstancedMeshComponent->SetCustomDataValue(i, 0, ColisionColor.R, false);
                    InstancedMeshComponent->SetCustomDataValue(i, 1, ColisionColor.B, false);
                    InstancedMeshComponent->SetCustomDataValue(i, 2, ColisionColor.G, true);
                }
            }
        }
    }
    UpdateMesh();
}

void AXPBDBalls::InitializePhysics()
{
    TArray<int32> S = WorldBounds;

    int32 NumX = FMath::Floor((S[3] - S[0] - 2 * Spacing) / Spacing);
    int32 NumY = FMath::Floor((S[4] - S[1] - 2 * Spacing) / Spacing);
    int32 NumZ = FMath::Floor((S[5] - S[2] - 2 * Spacing) / Spacing);

    Positions.SetNum(3 * NumX * NumY * NumZ);
    PreviousPositions.SetNum(3 * NumX * NumY * NumZ);
    Velocities.SetNum(3 * NumX * NumY * NumZ);
    // Fill velocity with zero?

    for (int Xi = 0; Xi < NumX; Xi++)
    {
        for (int Yi = 0; Yi < NumY; Yi++)
        {
            for (int Zi = 0; Zi < NumZ; Zi++)
            {
                int32 x = 3 * ((Xi * NumY + Yi) * NumZ + Zi);
                int32 y = x + 1;
                int32 z = x + 2;

                Positions[x] = S[0] + Spacing + Xi * Spacing;
                Positions[y] = S[1] + Spacing + Yi * Spacing;
                Positions[z] = S[2] + Spacing + Zi * Spacing;

                Velocities[x] = -VelRand + 2.f * VelRand * FMath::RandRange(100.f, 300.f);
                Velocities[y] = -VelRand + 2.f * VelRand * FMath::RandRange(100.f, 300.f);
                Velocities[z] = -VelRand + 2.f * VelRand * FMath::RandRange(100.f, 300.f);
            }
        }
    }

    InitializeBalls();
}
void AXPBDBalls::BeginPlay()
{
    Super::BeginPlay();
}

void AXPBDBalls::Tick(float DeltaTime)
{
    Super::Tick(DeltaTime);

    for (int i = 0; i < SubSteps; i++)
    {
        Simulate(DeltaTime / SubSteps);
    }
}
#pragma once

#include "CoreMinimal.h"
#include "Components/ActorComponent.h"
#include "XPBDHash.generated.h"


UCLASS()
class XPBD_API UXPBDHash : public UActorComponent
{
    GENERATED_BODY()

public:
    UXPBDHash();
    void InitializeHash(const float& inSpacing, const int32& inMaxNumObjects);
    void Create(const TArray<float>& inPos);
    void Query(const TArray<float>& inPositions, const int nr, const float MaxDist);
    int32 QuerySize;
    TArray<int32> QueryIds;

protected:
    virtual void BeginPlay() override;

private:
    float Spacing;
    int32 TableSize;
    TArray<int32> CellStart;
    TArray<int32> CellEntries;

    int32 HashCoords(const int32 Xi, const int32 Yi, const int32 Zi) const;
    int32 IntCoord(float Coord) const;
    int32 HashPos(const TArray<float>& inPositions, const int32 Index) const;
};


#include "XPBDHash.h"

UXPBDHash::UXPBDHash()
{
    PrimaryComponentTick.bCanEverTick = false;

}

void UXPBDHash::InitializeHash(const float& inSpacing, const int32& inMaxNumObjects)
{
    TableSize = 2 * inMaxNumObjects;
    Spacing = inSpacing;
    CellStart.SetNumZeroed(TableSize + 1);
    CellEntries.SetNumZeroed(inMaxNumObjects);
    QueryIds.SetNumZeroed(inMaxNumObjects);
    QuerySize = 0;
}

int32 UXPBDHash::HashCoords(const int32 Xi, const int32 Yi, const int32 Zi) const
{
    int32 H = (Xi * 92837111) ^ (Yi * 689287499) ^ (Zi * 283923481); // fantasy function?
    return FMath::Abs(H) % TableSize;
}

int32 UXPBDHash::IntCoord(float Coord) const
{
    return FMath::FloorToInt(Coord / Spacing);
}

int32 UXPBDHash::HashPos(const TArray<float>& inPositions, const int32 Index) const
{
    return HashCoords(IntCoord(inPositions[3 * Index]), IntCoord(inPositions[3 * Index + 1]), IntCoord(inPositions[3 * Index + 2]));
}

void UXPBDHash::Create(const TArray<float>& inPositions)
{
    int32 NumObjects = FMath::Min(inPositions.Num() / 3, CellEntries.Num());

    // Determine cell size

    CellStart.Init(0, CellStart.Num());
    CellEntries.Init(0, CellEntries.Num());

    for (int32 i = 0; i < NumObjects; i++)
    {
        int32 Hash = HashPos(inPositions, i);
        CellStart[Hash]++; // isntead of adding one every loop. I could SetNum(NumObjects * 2) and have a j = NumObjects as index start?
    }

    // Determine cells starts

    int32 Start = 0;
    for (int32 i = 0; i < TableSize; i++)
    {
        Start += CellStart[i];
        CellStart[i] = Start;
    }
    CellStart[TableSize] = Start;

    // Fill in objects ids

    for (int32 i = 0; i < NumObjects; i++)
    {
        int32 Hash = HashPos(inPositions, i);
        CellStart[Hash]--;
        CellEntries[CellStart[Hash]] = i;
    }
}

void UXPBDHash::Query(const TArray<float>& inPositions, const int nr, const float MaxDist)
{

    int32 X0 = IntCoord(inPositions[3 * nr] - MaxDist);
    int32 Y0 = IntCoord(inPositions[3 * nr + 1] - MaxDist);
    int32 Z0 = IntCoord(inPositions[3 * nr + 2] - MaxDist);

    int32 X1 = IntCoord(inPositions[3 * nr] + MaxDist);
    int32 Y1 = IntCoord(inPositions[3 * nr + 1] + MaxDist);
    int32 Z1 = IntCoord(inPositions[3 * nr + 2] + MaxDist);

    QuerySize = 0;

    for (int32 Xi = X0; Xi <= X1; Xi++)
    {
        for (int32 Yi = Y0; Yi <= Y1; Yi++)
        {
            for (int32 Zi = Z0; Zi <= Z1; Zi++)
            {
                int32 Hash = HashCoords(Xi, Yi, Zi);
                int32 Start = CellStart[Hash];
                int32 End = CellStart[Hash + 1];

                for (int32 i = Start; i < End; i++)
                {
                    QueryIds[QuerySize] = CellEntries[i];
                    QuerySize++;
                }
            }
        }
    }
}


void UXPBDHash::BeginPlay()
{
    Super::BeginPlay();
}
#pragma once

#include "CoreMinimal.h"
#include "UObject/NoExportTypes.h"
#include "XPBDMath.generated.h"

UCLASS()
class XPBD_API UXPBDMath : public UObject
{
	GENERATED_BODY()
public:

UFUNCTION() void VecSetZero(TArray<float>& A, int32 Anr);
UFUNCTION() void VecScale(TArray<float>& A, int32 Anr, float Scale = 1.0f);
UFUNCTION() void VecCopy(TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr);
UFUNCTION() void VecAdd(TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr, float Scale = 1.0f);
UFUNCTION() void VecSetDiff(TArray<float>& Dst, int32 Dnr, const TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr, float Scale = 1.0f);
UFUNCTION() float VecLengthSquared(const TArray<float>& A, int32 Anr);
UFUNCTION() float VecDistSquared(const TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr);
UFUNCTION() float VecDot(const TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr);
UFUNCTION() void VecSetCross(TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr, const TArray<float>& C, int32 Cnr);

};

#include "XPBDMath.h"

void UXPBDMath::VecSetZero(TArray<float>& A, int32 Anr)
{
    Anr *= 3;
    A[Anr++] = 0.0f;
    A[Anr++] = 0.0f;
    A[Anr] = 0.0f;
}

void UXPBDMath::VecScale(TArray<float>& A, int32 Anr, float Scale)
{
    Anr *= 3;
    A[Anr++] *= Scale;
    A[Anr++] *= Scale;
    A[Anr] *= Scale;
}

void UXPBDMath::VecCopy(TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr)
{
    Anr *= 3;
    Bnr *= 3;
    A[Anr++] = B[Bnr++];
    A[Anr++] = B[Bnr++];
    A[Anr] = B[Bnr];
}

void UXPBDMath::VecAdd(TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr, float Scale)
{
    Anr *= 3;
    Bnr *= 3;
    A[Anr++] += B[Bnr++] * Scale;
    A[Anr++] += B[Bnr++] * Scale;
    A[Anr] += B[Bnr] * Scale;
}

void UXPBDMath::VecSetDiff(TArray<float>& Dst, int32 Dnr, const TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr, float Scale)
{
    Dnr *= 3;
    Anr *= 3;
    Bnr *= 3;
    Dst[Dnr++] = (A[Anr++] - B[Bnr++]) * Scale;
    Dst[Dnr++] = (A[Anr++] - B[Bnr++]) * Scale;
    Dst[Dnr] = (A[Anr] - B[Bnr]) * Scale;
}

float UXPBDMath::VecLengthSquared(const TArray<float>& A, int32 Anr)
{
    Anr *= 3;
    float A0 = A[Anr];
    float A1 = A[Anr + 1];
    float A2 = A[Anr + 2];
    return A0 * A0 + A1 * A1 + A2 * A2;
}

float UXPBDMath::VecDistSquared(const TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr)
{
    Anr *= 3;
    Bnr *= 3;
    float A0 = A[Anr] - B[Bnr];
    float A1 = A[Anr + 1] - B[Bnr + 1];
    float A2 = A[Anr + 2] - B[Bnr + 2];
    return A0 * A0 + A1 * A1 + A2 * A2;
}


float UXPBDMath::VecDot(const TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr)
{
    Anr *= 3;
    Bnr *= 3;
    return A[Anr] * B[Bnr] + A[Anr + 1] * B[Bnr + 1] + A[Anr + 2] * B[Bnr + 2];
}

void UXPBDMath::VecSetCross(TArray<float>& A, int32 Anr, const TArray<float>& B, int32 Bnr, const TArray<float>& C, int32 Cnr)
{
    Anr *= 3;
    Bnr *= 3;
    Cnr *= 3;
    A[Anr++] = B[Bnr + 1] * C[Cnr + 2] - B[Bnr + 2] * C[Cnr + 1];
    A[Anr++] = B[Bnr + 2] * C[Cnr] - B[Bnr] * C[Cnr + 2];
    A[Anr] = B[Bnr] * C[Cnr + 1] - B[Bnr + 1] * C[Cnr];
}