fix: make breakpad use miniz
Some checks failed
sm-rpc / build (Debug, arm-linux-gnueabihf) (push) Successful in 1m34s
sm-rpc / build (Debug, aarch64-linux-gnu) (push) Successful in 2m46s
sm-rpc / build (Debug, host.gcc) (push) Failing after 1m28s
sm-rpc / build (Release, aarch64-linux-gnu) (push) Successful in 2m14s
sm-rpc / build (Release, arm-linux-gnueabihf) (push) Successful in 2m8s
sm-rpc / build (Debug, mipsel-linux-gnu) (push) Successful in 5m35s
sm-rpc / build (Release, host.gcc) (push) Failing after 1m55s
sm-rpc / build (Release, mipsel-linux-gnu) (push) Successful in 7m21s

tqcq
2025-08-25 15:24:22 +08:00
parent a58517497b
commit 68b2e7f763
728 changed files with 489652 additions and 1211 deletions


@@ -0,0 +1,33 @@
#if defined(__APPLE__) && !defined(__METAL_VERSION__)
#include <TargetConditionals.h>
#endif
#define kBackbufferWidth 1280
#define kBackbufferHeight 720
#if defined(__EMSCRIPTEN__)
#define CPU_CAN_DO_SIMD 0
#define CPU_CAN_DO_THREADS 0
#else
#define CPU_CAN_DO_SIMD 1
#define CPU_CAN_DO_THREADS 1
#endif
#define DO_SAMPLES_PER_PIXEL 4
#define DO_ANIMATE_SMOOTHING 0.9f
#define DO_LIGHT_SAMPLING 1
#define DO_MITSUBA_COMPARE 0
// Should path tracing be done on the GPU with a compute shader?
#define DO_COMPUTE_GPU 0
#define kCSGroupSizeX 8
#define kCSGroupSizeY 8
#define kCSMaxObjects 64
// Should float3 struct use SSE/NEON?
#define DO_FLOAT3_WITH_SIMD (!(DO_COMPUTE_GPU) && CPU_CAN_DO_SIMD && 1)
// Should HitSpheres function use SSE/NEON?
#define DO_HIT_SPHERES_SIMD (CPU_CAN_DO_SIMD && 1)


@@ -0,0 +1,192 @@
#pragma once
#if defined(_MSC_VER)
#define VM_INLINE __forceinline
#else
#define VM_INLINE __attribute__((unused, always_inline, nodebug)) inline
#endif
#define kSimdWidth 4
#if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__) && !defined(__EMSCRIPTEN__)
// ---- SSE implementation
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#define SHUFFLE4(V, X,Y,Z,W) float4(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(W,Z,Y,X)))
struct float4
{
VM_INLINE float4() {}
VM_INLINE explicit float4(const float *p) { m = _mm_loadu_ps(p); }
VM_INLINE explicit float4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
VM_INLINE explicit float4(float v) { m = _mm_set_ps1(v); }
VM_INLINE explicit float4(__m128 v) { m = v; }
VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }
VM_INLINE float getW() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 3, 3, 3))); }
__m128 m;
};
typedef float4 bool4;
VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a, float4 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
VM_INLINE float4 operator* (float4 a, float4 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = _mm_and_ps(a.m, b.m); return a; }
VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = _mm_or_ps(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a) { a.m = _mm_xor_ps(a.m, _mm_set1_ps(-0.0f)); return a; }
VM_INLINE float4 min(float4 a, float4 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
VM_INLINE float4 max(float4 a, float4 b) { a.m = _mm_max_ps(a.m, b.m); return a; }
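// horizontal min: taking min with two lane-rotated copies leaves min(x,y,z,w) in lane X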
VM_INLINE float hmin(float4 v)
{
v = min(v, SHUFFLE4(v, 2, 3, 0, 0));
v = min(v, SHUFFLE4(v, 1, 0, 0, 0));
return v.getX();
}
// Returns a 4-bit code where bit0..bit3 is X..W
VM_INLINE unsigned mask(float4 v) { return _mm_movemask_ps(v.m); }
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
VM_INLINE bool all(bool4 v) { return mask(v) == 15; }
// "select", i.e. hibit(cond) ? b : a
// on SSE4.1 and up this can be done easily via "blend" instruction;
// on older SSEs has to do a bunch of hoops, see
// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
{
#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
a.m = _mm_blendv_ps(a.m, b.m, cond.m);
#else
__m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
a.m = _mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m));
#endif
return a;
}
VM_INLINE __m128i select(__m128i a, __m128i b, bool4 cond)
{
#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
return _mm_blendv_epi8(a, b, _mm_castps_si128(cond.m));
#else
__m128i d = _mm_srai_epi32(_mm_castps_si128(cond.m), 31);
return _mm_or_si128(_mm_and_si128(d, b), _mm_andnot_si128(d, a));
#endif
}
VM_INLINE float4 sqrtf(float4 v) { return float4(_mm_sqrt_ps(v.m)); }
#elif !defined(__EMSCRIPTEN__)
// ---- NEON implementation
#define USE_NEON 1
#include <arm_neon.h>
struct float4
{
VM_INLINE float4() {}
VM_INLINE explicit float4(const float *p) { m = vld1q_f32(p); }
VM_INLINE explicit float4(float x, float y, float z, float w) { float v[4] = {x, y, z, w}; m = vld1q_f32(v); }
VM_INLINE explicit float4(float v) { m = vdupq_n_f32(v); }
VM_INLINE explicit float4(float32x4_t v) { m = v; }
VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
VM_INLINE float getW() const { return vgetq_lane_f32(m, 3); }
float32x4_t m;
};
typedef float4 bool4;
VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a, float4 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float4 operator* (float4 a, float4 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = vceqq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = vcltq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = vcleq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = vandq_u32(a.m, b.m); return a; }
VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = vorrq_u32(a.m, b.m); return a; }
VM_INLINE float4 operator- (float4 a) { a.m = vnegq_f32(a.m); return a; }
VM_INLINE float4 min(float4 a, float4 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float4 max(float4 a, float4 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
VM_INLINE float hmin(float4 v)
{
float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
return vget_lane_f32(minOfMinOfHalfs, 0);
}
// Returns a 4-bit code where bit0..bit3 is X..W
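// (vtstq turns each lane into all-ones where the sign bit is set; AND-ing with
// {1,2,4,8} and OR-reducing the lanes emulates SSE's _mm_movemask_ps)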
VM_INLINE unsigned mask(float4 v)
{
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
uint32x4_t t1 = vtstq_u32(t0, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
VM_INLINE bool all(bool4 v) { return mask(v) == 15; }
// "select", i.e. hibit(cond) ? b : a
// on SSE4.1 and up this can be done easily via "blend" instruction;
// on older SSEs has to do a bunch of hoops, see
// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
{
a.m = vbslq_f32(cond.m, b.m, a.m);
return a;
}
VM_INLINE int32x4_t select(int32x4_t a, int32x4_t b, bool4 cond)
{
return vbslq_s32(vreinterpretq_u32_f32(cond.m), b, a);
}
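// sqrt via reciprocal-sqrt estimate: vrsqrteq gives a coarse 1/sqrt(V), each
// vrsqrtsq multiply is one Newton-Raphson refinement, and the final multiply
// by V turns 1/sqrt(V) into sqrt(V)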
VM_INLINE float4 sqrtf(float4 v)
{
float32x4_t V = v.m;
float32x4_t S0 = vrsqrteq_f32(V);
float32x4_t P0 = vmulq_f32( V, S0 );
float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
float32x4_t S1 = vmulq_f32( S0, R0 );
float32x4_t P1 = vmulq_f32( V, S1 );
float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
float32x4_t S2 = vmulq_f32( S1, R1 );
float32x4_t P2 = vmulq_f32( V, S2 );
float32x4_t R2 = vrsqrtsq_f32( P2, S2 );
float32x4_t S3 = vmulq_f32( S2, R2 );
return float4(vmulq_f32(V, S3));
}
VM_INLINE float4 splatX(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 0)); }
VM_INLINE float4 splatY(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 1)); }
VM_INLINE float4 splatZ(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 0)); }
VM_INLINE float4 splatW(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 1)); }
#endif


@@ -0,0 +1,203 @@
#include "Maths.h"
#include <stdlib.h>
#include <stdint.h>
static uint32_t XorShift32(uint32_t& state)
{
uint32_t x = state;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 15;
state = x;
return x;
}
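// take the low 24 bits and divide by 2^24: floats have a 24-bit significand,
// so every result in [0,1) is exactly representable and 1.0f is never reached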
float RandomFloat01(uint32_t& state)
{
return (XorShift32(state) & 0xFFFFFF) / 16777216.0f;
}
float3 RandomInUnitDisk(uint32_t& state)
{
float3 p;
do
{
p = 2.0 * float3(RandomFloat01(state),RandomFloat01(state),0) - float3(1,1,0);
} while (dot(p,p) >= 1.0);
return p;
}
float3 RandomInUnitSphere(uint32_t& state)
{
float3 p;
do {
p = 2.0*float3(RandomFloat01(state),RandomFloat01(state),RandomFloat01(state)) - float3(1,1,1);
} while (sqLength(p) >= 1.0);
return p;
}
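// uniform direction on the unit sphere: a uniform z in [-1,1] plus a uniform
// azimuth is area-uniform (Archimedes' hat-box theorem)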
float3 RandomUnitVector(uint32_t& state)
{
float z = RandomFloat01(state) * 2.0f - 1.0f;
float a = RandomFloat01(state) * 2.0f * kPI;
float r = sqrtf(1.0f - z * z);
float x = r * cosf(a);
float y = r * sinf(a);
return float3(x, y, z);
}
int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit)
{
#if DO_HIT_SPHERES_SIMD
float4 hitT = float4(tMax);
#if USE_NEON
int32x4_t id = vdupq_n_s32(-1);
#else
__m128i id = _mm_set1_epi32(-1);
#endif
#if DO_FLOAT3_WITH_SIMD && !USE_NEON
float4 rOrigX = SHUFFLE4(r.orig, 0, 0, 0, 0);
float4 rOrigY = SHUFFLE4(r.orig, 1, 1, 1, 1);
float4 rOrigZ = SHUFFLE4(r.orig, 2, 2, 2, 2);
float4 rDirX = SHUFFLE4(r.dir, 0, 0, 0, 0);
float4 rDirY = SHUFFLE4(r.dir, 1, 1, 1, 1);
float4 rDirZ = SHUFFLE4(r.dir, 2, 2, 2, 2);
#elif DO_FLOAT3_WITH_SIMD
float4 rOrigX = splatX(r.orig.m);
float4 rOrigY = splatY(r.orig.m);
float4 rOrigZ = splatZ(r.orig.m);
float4 rDirX = splatX(r.dir.m);
float4 rDirY = splatY(r.dir.m);
float4 rDirZ = splatZ(r.dir.m);
#else
float4 rOrigX = float4(r.orig.x);
float4 rOrigY = float4(r.orig.y);
float4 rOrigZ = float4(r.orig.z);
float4 rDirX = float4(r.dir.x);
float4 rDirY = float4(r.dir.y);
float4 rDirZ = float4(r.dir.z);
#endif
float4 tMin4 = float4(tMin);
#if USE_NEON
int32x4_t curId = vreinterpretq_s32_u32(vcombine_u32(vcreate_u32(0ULL | (1ULL<<32)), vcreate_u32(2ULL | (3ULL<<32))));
#else
__m128i curId = _mm_set_epi32(3, 2, 1, 0);
#endif
// process 4 spheres at once
for (int i = 0; i < spheres.simdCount; i += kSimdWidth)
{
// load data for 4 spheres
float4 sCenterX = float4(spheres.centerX + i);
float4 sCenterY = float4(spheres.centerY + i);
float4 sCenterZ = float4(spheres.centerZ + i);
float4 sSqRadius = float4(spheres.sqRadius + i);
// note: we flip this vector and calculate -b (nb) since that happens to be slightly preferable computationally
float4 coX = sCenterX - rOrigX;
float4 coY = sCenterY - rOrigY;
float4 coZ = sCenterZ - rOrigZ;
float4 nb = coX * rDirX + coY * rDirY + coZ * rDirZ;
float4 c = coX * coX + coY * coY + coZ * coZ - sSqRadius;
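// with a unit-length ray direction, |co - t*dir|^2 = r^2 reduces to
// t^2 - 2*nb*t + c = 0, i.e. t = nb +/- sqrt(nb*nb - c); a negative discriminant means a miss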
float4 discr = nb * nb - c;
bool4 discrPos = discr > float4(0.0f);
// if ray hits any of the 4 spheres
if (any(discrPos))
{
float4 discrSq = sqrtf(discr);
// ray could hit spheres at t0 & t1
float4 t0 = nb - discrSq;
float4 t1 = nb + discrSq;
float4 t = select(t1, t0, t0 > tMin4); // if t0 is above min, take it (since it's the earlier hit); else try t1.
bool4 msk = discrPos & (t > tMin4) & (t < hitT);
// if hit, take it
id = select(id, curId, msk);
hitT = select(hitT, t, msk);
}
#if USE_NEON
curId = vaddq_s32(curId, vdupq_n_s32(kSimdWidth));
#else
curId = _mm_add_epi32(curId, _mm_set1_epi32(kSimdWidth));
#endif
}
// now we have up to 4 hits, find and return closest one
float minT = hmin(hitT);
if (minT < tMax) // any actual hits?
{
int minMask = mask(hitT == float4(minT));
if (minMask != 0)
{
int id_scalar[4];
float hitT_scalar[4];
#if USE_NEON
vst1q_s32(id_scalar, id);
vst1q_f32(hitT_scalar, hitT.m);
#else
_mm_storeu_si128((__m128i *)id_scalar, id);
_mm_storeu_ps(hitT_scalar, hitT.m);
#endif
// In general, you would do this with a bit scan (first set/trailing zero count).
// But who cares, it's only 16 options.
static const int laneId[16] =
{
0, 0, 1, 0, // 00xx
2, 0, 1, 0, // 01xx
3, 0, 1, 0, // 10xx
2, 0, 1, 0, // 11xx
};
int lane = laneId[minMask];
int hitId = id_scalar[lane];
float finalHitT = hitT_scalar[lane];
outHit.pos = r.pointAt(finalHitT);
outHit.normal = (outHit.pos - float3(spheres.centerX[hitId], spheres.centerY[hitId], spheres.centerZ[hitId])) * spheres.invRadius[hitId];
outHit.t = finalHitT;
return hitId;
}
}
return -1;
#else // #if DO_HIT_SPHERES_SIMD
float hitT = tMax;
int id = -1;
for (int i = 0; i < spheres.count; ++i)
{
float coX = spheres.centerX[i] - r.orig.getX();
float coY = spheres.centerY[i] - r.orig.getY();
float coZ = spheres.centerZ[i] - r.orig.getZ();
float nb = coX * r.dir.getX() + coY * r.dir.getY() + coZ * r.dir.getZ();
float c = coX * coX + coY * coY + coZ * coZ - spheres.sqRadius[i];
float discr = nb * nb - c;
if (discr > 0)
{
float discrSq = sqrtf(discr);
// Try earlier t
float t = nb - discrSq;
if (t <= tMin) // before min, try later t!
t = nb + discrSq;
if (t > tMin && t < hitT)
{
id = i;
hitT = t;
}
}
}
if (id != -1)
{
outHit.pos = r.pointAt(hitT);
outHit.normal = (outHit.pos - float3(spheres.centerX[id], spheres.centerY[id], spheres.centerZ[id])) * spheres.invRadius[id];
outHit.t = hitT;
return id;
}
else
return -1;
#endif // #else of #if DO_HIT_SPHERES_SIMD
}


@@ -0,0 +1,436 @@
#pragma once
#include <math.h>
#include <assert.h>
#include <stdint.h>
#include "Config.h"
#include "MathSimd.h"
#define kPI 3.1415926f
// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/
#if DO_FLOAT3_WITH_SIMD
#if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__)
// ---- SSE implementation
// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).
// SHUFFLE3(v, 0,0,0) splats the X (v.xxx).
#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))
struct float3
{
VM_INLINE float3() {}
VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); }
VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); }
VM_INLINE explicit float3(float v) { m = _mm_set1_ps(v); }
VM_INLINE explicit float3(__m128 v) { m = v; }
VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }
VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); }
VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); }
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
void setX(float x)
{
m = _mm_move_ss(m, _mm_set_ss(x));
}
void setY(float y)
{
__m128 t = _mm_move_ss(m, _mm_set_ss(y));
t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0));
m = _mm_move_ss(t, m);
}
void setZ(float z)
{
__m128 t = _mm_move_ss(m, _mm_set_ss(z));
t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0));
m = _mm_move_ss(t, m);
}
__m128 m;
};
typedef float3 bool3;
VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; }
VM_INLINE float hmin(float3 v)
{
v = min(v, SHUFFLE3(v, 1, 0, 2));
return min(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float hmax(float3 v)
{
v = max(v, SHUFFLE3(v, 1, 0, 2));
return max(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float3 cross(float3 a, float3 b)
{
// x <- a.y*b.z - a.z*b.y
// y <- a.z*b.x - a.x*b.z
// z <- a.x*b.y - a.y*b.x
// We can save a shuffle by grouping it in this wacky order:
return (a.zxy()*b - a*b.zxy()).zxy();
}
// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; }
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }
#else // #if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__)
// ---- NEON implementation
#include <arm_neon.h>
struct float3
{
VM_INLINE float3() {}
VM_INLINE explicit float3(const float *p) { float v[4] = {p[0], p[1], p[2], 0}; m = vld1q_f32(v); }
VM_INLINE explicit float3(float x, float y, float z) { float v[4] = {x, y, z, 0}; m = vld1q_f32(v); }
VM_INLINE explicit float3(float v) { m = vdupq_n_f32(v); }
VM_INLINE explicit float3(float32x4_t v) { m = v; }
VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
VM_INLINE float3 yzx() const
{
float32x2_t low = vget_low_f32(m);
float32x4_t yzx = vcombine_f32(vext_f32(low, vget_high_f32(m), 1), low);
return float3(yzx);
}
VM_INLINE float3 zxy() const
{
float32x4_t p = m;
p = vuzpq_f32(vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(p), vreinterpretq_s32_f32(p), 1)), p).val[1];
return float3(p);
}
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
void setX(float x)
{
m = vsetq_lane_f32(x, m, 0);
}
void setY(float y)
{
m = vsetq_lane_f32(y, m, 1);
}
void setZ(float z)
{
m = vsetq_lane_f32(z, m, 2);
}
float32x4_t m;
};
typedef float3 bool3;
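// reciprocal via estimate + two Newton-Raphson steps (ARMv7 NEON has no float
// division instruction, so a/b below is computed as a * rcp_2(b))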
VM_INLINE float32x4_t rcp_2(float32x4_t v)
{
float32x4_t e = vrecpeq_f32(v);
e = vmulq_f32(vrecpsq_f32(e, v), e);
e = vmulq_f32(vrecpsq_f32(e, v), e);
return e;
}
VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vceqq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vcltq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vcleq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; }
VM_INLINE float hmin(float3 v)
{
float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
return vget_lane_f32(minOfMinOfHalfs, 0);
}
VM_INLINE float hmax(float3 v)
{
float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(v.m), vget_high_f32(v.m));
float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs);
return vget_lane_f32(maxOfMaxOfHalfs, 0);
}
VM_INLINE float3 cross(float3 a, float3 b)
{
// x <- a.y*b.z - a.z*b.y
// y <- a.z*b.x - a.x*b.z
// z <- a.x*b.y - a.y*b.x
// We can save a shuffle by grouping it in this wacky order:
return (a.zxy()*b - a*b.zxy()).zxy();
}
// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v)
{
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
uint32x4_t t1 = vtstq_u32(t0, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }
#endif // #else of #if !defined(__arm__) && !defined(__arm64__) && !defined(__aarch64__)
#else // #if DO_FLOAT3_WITH_SIMD
// ---- Simple scalar C implementation
struct float3
{
float3() : x(0), y(0), z(0) {}
float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
float3 operator-() const { return float3(-x, -y, -z); }
float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; }
float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; }
float3& operator*=(const float3& o) { x*=o.x; y*=o.y; z*=o.z; return *this; }
float3& operator*=(float o) { x*=o; y*=o; z*=o; return *this; }
VM_INLINE float getX() const { return x; }
VM_INLINE float getY() const { return y; }
VM_INLINE float getZ() const { return z; }
VM_INLINE void setX(float x_) { x = x_; }
VM_INLINE void setY(float y_) { y = y_; }
VM_INLINE void setZ(float z_) { z = z_; }
VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
float x, y, z;
};
VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }
VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }
VM_INLINE float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); }
VM_INLINE float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); }
VM_INLINE float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); }
VM_INLINE float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; }
VM_INLINE float3 cross(const float3& a, const float3& b)
{
return float3(
a.y*b.z - a.z*b.y,
-(a.x*b.z - a.z*b.x),
a.x*b.y - a.y*b.x
);
}
#endif // #else of #if DO_FLOAT3_WITH_SIMD
VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); }
VM_INLINE float sqLength(float3 v) { return dot(v, v); }
VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); }
VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; }
inline void AssertUnit(float3 v)
{
assert(fabsf(sqLength(v) - 1.0f) < 0.01f);
}
inline float3 reflect(float3 v, float3 n)
{
return v - 2*dot(v,n)*n;
}
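// Snell's law refraction; nint is the ratio of refraction indices n1/n2, and a
// negative discriminant means total internal reflection (no refracted ray)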
inline bool refract(float3 v, float3 n, float nint, float3& outRefracted)
{
AssertUnit(v);
float dt = dot(v, n);
float discr = 1.0f - nint*nint*(1-dt*dt);
if (discr > 0)
{
outRefracted = nint * (v - n*dt) - n*sqrtf(discr);
return true;
}
return false;
}
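// Schlick's approximation to the Fresnel reflectance:
// R(cos) ~ R0 + (1-R0)*(1-cos)^5 with R0 = ((1-ri)/(1+ri))^2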
inline float schlick(float cosine, float ri)
{
float r0 = (1-ri) / (1+ri);
r0 = r0*r0;
return r0 + (1-r0)*powf(1-cosine, 5);
}
struct Ray
{
Ray() {}
Ray(float3 orig_, float3 dir_) : orig(orig_), dir(dir_) { AssertUnit(dir); }
float3 pointAt(float t) const { return orig + dir * t; }
float3 orig;
float3 dir;
};
struct Hit
{
float3 pos;
float3 normal;
float t;
};
struct Sphere
{
Sphere() : radius(1.0f), invRadius(0.0f) {}
Sphere(float3 center_, float radius_) : center(center_), radius(radius_), invRadius(0.0f) {}
void UpdateDerivedData() { invRadius = 1.0f/radius; }
float3 center;
float radius;
float invRadius;
};
// data for all spheres in a "structure of arrays" layout
struct SpheresSoA
{
SpheresSoA(int c)
{
count = c;
// we'll be processing spheres in kSimdWidth chunks, so make sure to allocate
// enough space
simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth;
centerX = new float[simdCount];
centerY = new float[simdCount];
centerZ = new float[simdCount];
sqRadius = new float[simdCount];
invRadius = new float[simdCount];
// set all data to "impossible sphere" state
for (int i = count; i < simdCount; ++i)
{
centerX[i] = centerY[i] = centerZ[i] = 10000.0f;
sqRadius[i] = 0.0f;
invRadius[i] = 0.0f;
}
}
~SpheresSoA()
{
delete[] centerX;
delete[] centerY;
delete[] centerZ;
delete[] sqRadius;
delete[] invRadius;
}
float* centerX;
float* centerY;
float* centerZ;
float* sqRadius;
float* invRadius;
int simdCount;
int count;
};
int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit);
float RandomFloat01(uint32_t& state);
float3 RandomInUnitDisk(uint32_t& state);
float3 RandomInUnitSphere(uint32_t& state);
float3 RandomUnitVector(uint32_t& state);
struct Camera
{
Camera() {}
// vfov is top to bottom in degrees
Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist)
{
lensRadius = aperture / 2;
float theta = vfov*kPI/180;
float halfHeight = tanf(theta/2);
float halfWidth = aspect * halfHeight;
origin = lookFrom;
w = normalize(lookFrom - lookAt);
u = normalize(cross(vup, w));
v = cross(w, u);
lowerLeftCorner = origin - halfWidth*focusDist*u - halfHeight*focusDist*v - focusDist*w;
horizontal = 2*halfWidth*focusDist*u;
vertical = 2*halfHeight*focusDist*v;
}
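// GetRay jitters the ray origin over a lens disk of radius lensRadius for
// depth of field; only points at focusDist project back to the same pixel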
Ray GetRay(float s, float t, uint32_t& state) const
{
float3 rd = lensRadius * RandomInUnitDisk(state);
float3 offset = u * rd.getX() + v * rd.getY();
return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset));
}
float3 origin;
float3 lowerLeftCorner;
float3 horizontal;
float3 vertical;
float3 u, v, w;
float lensRadius;
};


@@ -0,0 +1,392 @@
#include "Config.h"
#include "Test.h"
#include "Maths.h"
#include <algorithm>
#include <string.h> // memcpy in GetSceneDesc
#if CPU_CAN_DO_THREADS
#include "enkiTS/TaskScheduler_c.h"
#include <thread>
#endif
#include <atomic>
#include "../../../public/tracy/Tracy.hpp"
// 46 spheres (2 emissive) when enabled; 9 spheres (1 emissive) when disabled
#define DO_BIG_SCENE 1
static Sphere s_Spheres[] =
{
{float3(0,-100.5,-1), 100},
{float3(2,0,-1), 0.5f},
{float3(0,0,-1), 0.5f},
{float3(-2,0,-1), 0.5f},
{float3(2,0,1), 0.5f},
{float3(0,0,1), 0.5f},
{float3(-2,0,1), 0.5f},
{float3(0.5f,1,0.5f), 0.5f},
{float3(-1.5f,1.5f,0.f), 0.3f},
#if DO_BIG_SCENE
{float3(4,0,-3), 0.5f}, {float3(3,0,-3), 0.5f}, {float3(2,0,-3), 0.5f}, {float3(1,0,-3), 0.5f}, {float3(0,0,-3), 0.5f}, {float3(-1,0,-3), 0.5f}, {float3(-2,0,-3), 0.5f}, {float3(-3,0,-3), 0.5f}, {float3(-4,0,-3), 0.5f},
{float3(4,0,-4), 0.5f}, {float3(3,0,-4), 0.5f}, {float3(2,0,-4), 0.5f}, {float3(1,0,-4), 0.5f}, {float3(0,0,-4), 0.5f}, {float3(-1,0,-4), 0.5f}, {float3(-2,0,-4), 0.5f}, {float3(-3,0,-4), 0.5f}, {float3(-4,0,-4), 0.5f},
{float3(4,0,-5), 0.5f}, {float3(3,0,-5), 0.5f}, {float3(2,0,-5), 0.5f}, {float3(1,0,-5), 0.5f}, {float3(0,0,-5), 0.5f}, {float3(-1,0,-5), 0.5f}, {float3(-2,0,-5), 0.5f}, {float3(-3,0,-5), 0.5f}, {float3(-4,0,-5), 0.5f},
{float3(4,0,-6), 0.5f}, {float3(3,0,-6), 0.5f}, {float3(2,0,-6), 0.5f}, {float3(1,0,-6), 0.5f}, {float3(0,0,-6), 0.5f}, {float3(-1,0,-6), 0.5f}, {float3(-2,0,-6), 0.5f}, {float3(-3,0,-6), 0.5f}, {float3(-4,0,-6), 0.5f},
{float3(1.5f,1.5f,-2), 0.3f},
#endif // #if DO_BIG_SCENE
};
const int kSphereCount = sizeof(s_Spheres) / sizeof(s_Spheres[0]);
static SpheresSoA s_SpheresSoA(kSphereCount);
struct Material
{
enum Type { Lambert, Metal, Dielectric };
Type type;
float3 albedo;
float3 emissive;
float roughness;
float ri;
};
static Material s_SphereMats[kSphereCount] =
{
{ Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.8f, 0.4f, 0.4f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0, },
{ Material::Metal, float3(0.4f, 0.4f, 0.8f), float3(0,0,0), 0, 0 },
{ Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0 },
{ Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.2f, 0 },
{ Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.6f, 0 },
{ Material::Dielectric, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 1.5f },
{ Material::Lambert, float3(0.8f, 0.6f, 0.2f), float3(30,25,15), 0, 0 },
#if DO_BIG_SCENE
{ Material::Lambert, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
{ Material::Metal, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
{ Material::Metal, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
{ Material::Lambert, float3(0.1f, 0.2f, 0.5f), float3(3,10,20), 0, 0 },
#endif
};
static int s_EmissiveSpheres[kSphereCount];
static int s_EmissiveSphereCount;
static Camera s_Cam;
const float kMinT = 0.001f;
const float kMaxT = 1.0e7f;
const int kMaxDepth = 10;
bool HitWorld(const Ray& r, float tMin, float tMax, Hit& outHit, int& outID)
{
outID = HitSpheres(r, s_SpheresSoA, tMin, tMax, outHit);
return outID != -1;
}
static bool Scatter(const Material& mat, const Ray& r_in, const Hit& rec, float3& attenuation, Ray& scattered, float3& outLightE, int& inoutRayCount, uint32_t& state)
{
ZoneScoped;
outLightE = float3(0,0,0);
if (mat.type == Material::Lambert)
{
// random point on unit sphere that is tangent to the hit point
float3 target = rec.pos + rec.normal + RandomUnitVector(state);
scattered = Ray(rec.pos, normalize(target - rec.pos));
attenuation = mat.albedo;
// sample lights
#if DO_LIGHT_SAMPLING
for (int j = 0; j < s_EmissiveSphereCount; ++j)
{
int i = s_EmissiveSpheres[j];
const Material& smat = s_SphereMats[i];
if (&mat == &smat)
continue; // skip self
const Sphere& s = s_Spheres[i];
// create a random direction towards sphere
// coord system for sampling: sw, su, sv
float3 sw = normalize(s.center - rec.pos);
float3 su = normalize(cross(fabs(sw.getX())>0.01f ? float3(0,1,0):float3(1,0,0), sw));
float3 sv = cross(sw, su);
// sample sphere by solid angle
float cosAMax = sqrtf(1.0f - s.radius*s.radius / sqLength(rec.pos-s.center));
float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state);
float cosA = 1.0f - eps1 + eps1 * cosAMax;
float sinA = sqrtf(1.0f - cosA*cosA);
float phi = 2 * kPI * eps2;
float3 l = su * (cosf(phi) * sinA) + sv * (sinf(phi) * sinA) + sw * cosA;
//l = normalize(l); // NOTE(fg): This is already normalized, by construction.
// shoot shadow ray
Hit lightHit;
int hitID;
++inoutRayCount;
if (HitWorld(Ray(rec.pos, l), kMinT, kMaxT, lightHit, hitID) && hitID == i)
{
float omega = 2 * kPI * (1-cosAMax);
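// (omega is the solid angle subtended by the sphere from the hit point:
// the spherical cap 2*pi*(1 - cosAMax))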
float3 rdir = r_in.dir;
AssertUnit(rdir);
float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal;
outLightE += (mat.albedo * smat.emissive) * (std::max(0.0f, dot(l, nl)) * omega / kPI);
}
}
#endif
return true;
}
else if (mat.type == Material::Metal)
{
AssertUnit(r_in.dir); AssertUnit(rec.normal);
float3 refl = reflect(r_in.dir, rec.normal);
// reflected ray, and random inside of sphere based on roughness
float roughness = mat.roughness;
#if DO_MITSUBA_COMPARE
roughness = 0; // until we get better BRDF for metals
#endif
scattered = Ray(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state)));
attenuation = mat.albedo;
return dot(scattered.dir, rec.normal) > 0;
}
else if (mat.type == Material::Dielectric)
{
AssertUnit(r_in.dir); AssertUnit(rec.normal);
float3 outwardN;
float3 rdir = r_in.dir;
float3 refl = reflect(rdir, rec.normal);
float nint;
attenuation = float3(1,1,1);
float3 refr;
float reflProb;
float cosine;
if (dot(rdir, rec.normal) > 0)
{
outwardN = -rec.normal;
nint = mat.ri;
cosine = mat.ri * dot(rdir, rec.normal);
}
else
{
outwardN = rec.normal;
nint = 1.0f / mat.ri;
cosine = -dot(rdir, rec.normal);
}
if (refract(rdir, outwardN, nint, refr))
{
reflProb = schlick(cosine, mat.ri);
}
else
{
reflProb = 1;
}
if (RandomFloat01(state) < reflProb)
scattered = Ray(rec.pos, normalize(refl));
else
scattered = Ray(rec.pos, normalize(refr));
}
else
{
attenuation = float3(1,0,1);
return false;
}
return true;
}
static float3 Trace(const Ray& r, int depth, int& inoutRayCount, uint32_t& state, bool doMaterialE = true)
{
ZoneScoped;
Hit rec;
int id = 0;
++inoutRayCount;
if (HitWorld(r, kMinT, kMaxT, rec, id))
{
Ray scattered;
float3 attenuation;
float3 lightE;
const Material& mat = s_SphereMats[id];
float3 matE = mat.emissive;
if (depth < kMaxDepth && Scatter(mat, r, rec, attenuation, scattered, lightE, inoutRayCount, state))
{
#if DO_LIGHT_SAMPLING
if (!doMaterialE) matE = float3(0,0,0); // don't add material emission if told so
// for Lambert materials, we just did explicit light (emissive) sampling and
// already accounted for their contribution, so if the next ray bounce hits the
// light again, don't add emission
doMaterialE = (mat.type != Material::Lambert);
#endif
return matE + lightE + attenuation * Trace(scattered, depth+1, inoutRayCount, state, doMaterialE);
}
else
{
return matE;
}
}
else
{
// sky
#if DO_MITSUBA_COMPARE
return float3(0.15f,0.21f,0.3f); // easier compare with Mitsuba's constant environment light
#else
float3 unitDir = r.dir;
float t = 0.5f*(unitDir.getY() + 1.0f);
return ((1.0f-t)*float3(1.0f, 1.0f, 1.0f) + t*float3(0.5f, 0.7f, 1.0f)) * 0.3f;
#endif
}
}
#if CPU_CAN_DO_THREADS
static enkiTaskScheduler* g_TS;
#endif
void InitializeTest()
{
ZoneScoped;
#if CPU_CAN_DO_THREADS
g_TS = enkiNewTaskScheduler();
enkiInitTaskSchedulerNumThreads(g_TS, std::max<int>( 2, std::thread::hardware_concurrency() - 2));
#endif
}
void ShutdownTest()
{
ZoneScoped;
#if CPU_CAN_DO_THREADS
enkiDeleteTaskScheduler(g_TS);
#endif
}
struct JobData
{
float time;
int frameCount;
int screenWidth, screenHeight;
float* backbuffer;
Camera* cam;
std::atomic<int> rayCount;
unsigned testFlags;
};
static void TraceRowJob(uint32_t start, uint32_t end, uint32_t threadnum, void* data_)
{
ZoneScoped;
JobData& data = *(JobData*)data_;
float* backbuffer = data.backbuffer + start * data.screenWidth * 4;
float invWidth = 1.0f / data.screenWidth;
float invHeight = 1.0f / data.screenHeight;
float lerpFac = float(data.frameCount) / float(data.frameCount+1);
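// N/(N+1) blending keeps a running average over frames:
// prev*N/(N+1) + col/(N+1) equals the mean of N+1 frames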
if (data.testFlags & kFlagAnimate)
lerpFac *= DO_ANIMATE_SMOOTHING;
if (!(data.testFlags & kFlagProgressive))
lerpFac = 0;
int rayCount = 0;
for (uint32_t y = start; y < end; ++y)
{
uint32_t state = (y * 9781 + data.frameCount * 6271) | 1;
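// ("| 1" keeps the per-row seed non-zero; xorshift is stuck at zero forever otherwise)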
for (int x = 0; x < data.screenWidth; ++x)
{
float3 col(0, 0, 0);
for (int s = 0; s < DO_SAMPLES_PER_PIXEL; s++)
{
float u = float(x + RandomFloat01(state)) * invWidth;
float v = float(y + RandomFloat01(state)) * invHeight;
Ray r = data.cam->GetRay(u, v, state);
col += Trace(r, 0, rayCount, state);
}
col *= 1.0f / float(DO_SAMPLES_PER_PIXEL);
float3 prev(backbuffer[0], backbuffer[1], backbuffer[2]);
col = prev * lerpFac + col * (1-lerpFac);
col.store(backbuffer);
backbuffer += 4;
}
}
data.rayCount += rayCount;
}
void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags)
{
ZoneScoped;
if (testFlags & kFlagAnimate)
{
s_Spheres[1].center.setY(cosf(time) + 1.0f);
s_Spheres[8].center.setZ(sinf(time)*0.3f);
}
float3 lookfrom(0, 2, 3);
float3 lookat(0, 0, 0);
float distToFocus = 3;
#if DO_MITSUBA_COMPARE
float aperture = 0.0f;
#else
float aperture = 0.1f;
#endif
#if DO_BIG_SCENE
aperture *= 0.2f;
#endif
s_EmissiveSphereCount = 0;
for (int i = 0; i < kSphereCount; ++i)
{
Sphere& s = s_Spheres[i];
s.UpdateDerivedData();
s_SpheresSoA.centerX[i] = s.center.getX();
s_SpheresSoA.centerY[i] = s.center.getY();
s_SpheresSoA.centerZ[i] = s.center.getZ();
s_SpheresSoA.sqRadius[i] = s.radius * s.radius;
s_SpheresSoA.invRadius[i] = s.invRadius;
// Remember IDs of emissive spheres (light sources)
const Material& smat = s_SphereMats[i];
if (smat.emissive.getX() > 0 || smat.emissive.getY() > 0 || smat.emissive.getZ() > 0)
{
s_EmissiveSpheres[s_EmissiveSphereCount] = i;
s_EmissiveSphereCount++;
}
}
s_Cam = Camera(lookfrom, lookat, float3(0, 1, 0), 60, float(screenWidth) / float(screenHeight), aperture, distToFocus);
}
void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags)
{
ZoneScoped;
JobData args;
args.time = time;
args.frameCount = frameCount;
args.screenWidth = screenWidth;
args.screenHeight = screenHeight;
args.backbuffer = backbuffer;
args.cam = &s_Cam;
args.testFlags = testFlags;
args.rayCount = 0;
#if CPU_CAN_DO_THREADS
enkiTaskSet* task = enkiCreateTaskSet(g_TS, TraceRowJob);
bool threaded = true;
enkiAddTaskSetToPipeMinRange(g_TS, task, &args, screenHeight, threaded ? 4 : screenHeight);
enkiWaitForTaskSet(g_TS, task);
enkiDeleteTaskSet(task);
#else
TraceRowJob(0, screenHeight, 0, &args);
#endif
outRayCount = args.rayCount;
}
void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize)
{
ZoneScoped;
outCount = kSphereCount;
outObjectSize = sizeof(Sphere);
outMaterialSize = sizeof(Material);
outCamSize = sizeof(Camera);
}
void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount)
{
ZoneScoped;
memcpy(outObjects, s_Spheres, kSphereCount * sizeof(s_Spheres[0]));
memcpy(outMaterials, s_SphereMats, kSphereCount * sizeof(s_SphereMats[0]));
memcpy(outCam, &s_Cam, sizeof(s_Cam));
memcpy(outEmissives, s_EmissiveSpheres, s_EmissiveSphereCount * sizeof(s_EmissiveSpheres[0]));
*outEmissiveCount = s_EmissiveSphereCount;
}


@@ -0,0 +1,17 @@
#pragma once
#include <stdint.h>
enum TestFlags
{
kFlagAnimate = (1 << 0),
kFlagProgressive = (1 << 1),
};
void InitializeTest();
void ShutdownTest();
void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags);
void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags);
void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize);
void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount);


@@ -0,0 +1,79 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#undef GetObject
#include <intrin.h>
extern "C" void _ReadWriteBarrier();
#pragma intrinsic(_ReadWriteBarrier)
#pragma intrinsic(_InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd)
// Memory Barriers to prevent CPU and Compiler re-ordering
#define BASE_MEMORYBARRIER_ACQUIRE() _ReadWriteBarrier()
#define BASE_MEMORYBARRIER_RELEASE() _ReadWriteBarrier()
#define BASE_ALIGN(x) __declspec( align( x ) )
#else
#define BASE_MEMORYBARRIER_ACQUIRE() __asm__ __volatile__("": : :"memory")
#define BASE_MEMORYBARRIER_RELEASE() __asm__ __volatile__("": : :"memory")
#define BASE_ALIGN(x) __attribute__ ((aligned( x )))
#endif
namespace enki
{
// Atomically performs: if( *pDest == compareWith ) { *pDest = swapTo; }
// returns old *pDest (so if successful, returns compareWith)
inline uint32_t AtomicCompareAndSwap( volatile uint32_t* pDest, uint32_t swapTo, uint32_t compareWith )
{
#ifdef _WIN32
// assumes two's complement - unsigned / signed conversion leads to same bit pattern
return _InterlockedCompareExchange( (volatile long*)pDest,swapTo, compareWith );
#else
return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
#endif
}
inline uint64_t AtomicCompareAndSwap( volatile uint64_t* pDest, uint64_t swapTo, uint64_t compareWith )
{
#ifdef _WIN32
// assumes two's complement - unsigned / signed conversion leads to same bit pattern
return _InterlockedCompareExchange64( (__int64 volatile*)pDest, swapTo, compareWith );
#else
return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
#endif
}
// Atomically performs: tmp = *pDest; *pDest += value; return tmp;
inline int32_t AtomicAdd( volatile int32_t* pDest, int32_t value )
{
#ifdef _WIN32
return _InterlockedExchangeAdd( (long*)pDest, value );
#else
return __sync_fetch_and_add( pDest, value );
#endif
}
}


@@ -0,0 +1,240 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#include <assert.h>
#include "Atomics.h"
#include <string.h>
namespace enki
{
// LockLessMultiReadPipe - Single writer, multiple reader thread safe pipe using (semi) lockless programming
// Readers can only read from the back of the pipe
// The single writer can write to the front of the pipe, and read from both ends (a writer can be a reader)
// for many of the principles used here, see http://msdn.microsoft.com/en-us/library/windows/desktop/ee418650(v=vs.85).aspx
// Note: using log2 sizes so we do not need to clamp (multi-operation)
// T is the contained type
// Note: this is not truly lockless, as it uses the flags as a form of lock state.
template<uint8_t cSizeLog2, typename T> class LockLessMultiReadPipe
{
public:
LockLessMultiReadPipe();
~LockLessMultiReadPipe() {}
// ReaderTryReadBack returns false if we were unable to read
// This is thread safe for both multiple readers and the writer
bool ReaderTryReadBack( T* pOut );
// WriterTryReadFront returns false if we were unable to read
// This is thread safe for the single writer, but should not be called by readers
bool WriterTryReadFront( T* pOut );
// WriterTryWriteFront returns false if we were unable to write
// This is thread safe for the single writer, but should not be called by readers
bool WriterTryWriteFront( const T& in );
// IsPipeEmpty() is a utility function, not intended for general use
// Should only be used very prudently.
bool IsPipeEmpty() const
{
return 0 == m_WriteIndex - m_ReadCount;
}
void Clear()
{
m_WriteIndex = 0;
m_ReadIndex = 0;
m_ReadCount = 0;
memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
}
private:
const static uint32_t ms_cSize = ( 1 << cSizeLog2 );
const static uint32_t ms_cIndexMask = ms_cSize - 1;
const static uint32_t FLAG_INVALID = 0xFFFFFFFF; // 32bit for CAS
const static uint32_t FLAG_CAN_WRITE = 0x00000000; // 32bit for CAS
const static uint32_t FLAG_CAN_READ = 0x11111111; // 32bit for CAS
T m_Buffer[ ms_cSize ];
// read and write indexes allow fast access to the pipe, but actual access
// is controlled by the access flags.
volatile uint32_t BASE_ALIGN(4) m_WriteIndex;
volatile uint32_t BASE_ALIGN(4) m_ReadCount;
volatile uint32_t m_Flags[ ms_cSize ];
volatile uint32_t BASE_ALIGN(4) m_ReadIndex;
};
template<uint8_t cSizeLog2, typename T> inline
LockLessMultiReadPipe<cSizeLog2,T>::LockLessMultiReadPipe()
: m_WriteIndex(0)
, m_ReadIndex(0)
, m_ReadCount(0)
{
assert( cSizeLog2 < 32 );
memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
}
template<uint8_t cSizeLog2, typename T> inline
bool LockLessMultiReadPipe<cSizeLog2,T>::ReaderTryReadBack( T* pOut )
{
uint32_t actualReadIndex;
uint32_t readCount = m_ReadCount;
// We get hold of read index for consistency,
// and do first pass starting at read count
uint32_t readIndexToUse = readCount;
while(true)
{
uint32_t writeIndex = m_WriteIndex;
// power of two sizes ensures we can use a simple calc without modulus
uint32_t numInPipe = writeIndex - readCount;
if( 0 == numInPipe )
{
return false;
}
if( readIndexToUse >= writeIndex )
{
// move back to start
readIndexToUse = m_ReadIndex;
}
// power of two sizes ensures we can perform AND for a modulus
actualReadIndex = readIndexToUse & ms_cIndexMask;
// Multiple potential readers mean we should check if the data is valid,
// using an atomic compare exchange
uint32_t previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
if( FLAG_CAN_READ == previous )
{
break;
}
++readIndexToUse;
// update known read count
readCount = m_ReadCount;
}
// we update the read index using an atomic add, as we've only read one piece of data.
// this ensures consistency of the read index, and the above loop ensures readers
// only read from unread data
AtomicAdd( (volatile int32_t*)&m_ReadCount, 1 );
BASE_MEMORYBARRIER_ACQUIRE();
// now read data, ensuring we do so after above reads & CAS
*pOut = m_Buffer[ actualReadIndex ];
m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE;
return true;
}
template<uint8_t cSizeLog2, typename T> inline
bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryReadFront( T* pOut )
{
uint32_t writeIndex = m_WriteIndex;
uint32_t frontReadIndex = writeIndex;
// Multiple potential readers mean we should check if the data is valid,
// using an atomic compare exchange - which acts as a form of lock (so not quite lockless really).
uint32_t previous = FLAG_INVALID;
uint32_t actualReadIndex = 0;
while( true )
{
// power of two sizes ensures we can use a simple calc without modulus
uint32_t readCount = m_ReadCount;
uint32_t numInPipe = writeIndex - readCount;
if( 0 == numInPipe || 0 == frontReadIndex )
{
// frontReadIndex can get to 0 here if that item was just being read by another thread.
m_ReadIndex = readCount;
return false;
}
--frontReadIndex;
actualReadIndex = frontReadIndex & ms_cIndexMask;
previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
if( FLAG_CAN_READ == previous )
{
break;
}
else if( m_ReadIndex >= frontReadIndex )
{
return false;
}
}
// now read data, ensuring we do so after above reads & CAS
*pOut = m_Buffer[ actualReadIndex ];
m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE;
BASE_MEMORYBARRIER_RELEASE();
// 32-bit aligned stores are atomic, and writer owns the write index
// we only move one back as this is as many as we have read, not where we have read from.
--m_WriteIndex;
return true;
}
template<uint8_t cSizeLog2, typename T> inline
bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryWriteFront( const T& in )
{
// The writer 'owns' the write index, and readers can only reduce
// the amount of data in the pipe.
// We get hold of both values for consistency and to reduce false sharing
// impacting more than one access
uint32_t writeIndex = m_WriteIndex;
// power of two sizes ensures we can perform AND for a modulus
uint32_t actualWriteIndex = writeIndex & ms_cIndexMask;
// a reader may still be reading this item, as there are multiple readers
if( m_Flags[ actualWriteIndex ] != FLAG_CAN_WRITE )
{
return false; // still being read, so have caught up with tail.
}
// as we are the only writer we can update the data without atomics
// whilst the write index has not been updated
m_Buffer[ actualWriteIndex ] = in;
m_Flags[ actualWriteIndex ] = FLAG_CAN_READ;
// We need to ensure the above writes occur prior to updating the write index,
// otherwise another thread might read before it's finished
BASE_MEMORYBARRIER_RELEASE();
// 32-bit aligned stores are atomic, and the writer controls the write index
++writeIndex;
m_WriteIndex = writeIndex;
return true;
}
}


@@ -0,0 +1,437 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#include <assert.h>
#include "TaskScheduler.h"
#include "LockLessMultiReadPipe.h"
using namespace enki;
static const uint32_t PIPESIZE_LOG2 = 8;
static const uint32_t SPIN_COUNT = 100;
static const uint32_t SPIN_BACKOFF_MULTIPLIER = 10;
static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8;
// each software thread gets its own copy of gtl_threadNum, so this is safe to use as a static variable
static THREAD_LOCAL uint32_t gtl_threadNum = 0;
namespace enki
{
struct SubTaskSet
{
ITaskSet* pTask;
TaskSetPartition partition;
};
// we derive class TaskPipe rather than typedef to get forward declaration working easily
class TaskPipe : public LockLessMultiReadPipe<PIPESIZE_LOG2,enki::SubTaskSet> {};
struct ThreadArgs
{
uint32_t threadNum;
TaskScheduler* pTaskScheduler;
};
}
namespace
{
SubTaskSet SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ )
{
SubTaskSet splitTask = subTask_;
uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start;
if( rangeToSplit_ > rangeLeft )
{
rangeToSplit_ = rangeLeft;
}
splitTask.partition.end = subTask_.partition.start + rangeToSplit_;
subTask_.partition.start = splitTask.partition.end;
return splitTask;
}
#if defined _WIN32
#if defined _M_IX86 || defined _M_X64
#pragma intrinsic(_mm_pause)
inline void Pause() { _mm_pause(); }
#endif
#elif defined __i386__ || defined __x86_64__
inline void Pause() { __asm__ __volatile__("pause;"); }
#else
inline void Pause() { } // other architectures may have a NOP or yield equivalent
#endif
}
static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_)
{
if( func_ )
{
func_(threadnum_);
}
}
ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks()
{
return &m_ProfilerCallbacks;
}
THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs )
{
ThreadArgs args = *(ThreadArgs*)pArgs;
uint32_t threadNum = args.threadNum;
TaskScheduler* pTS = args.pTaskScheduler;
gtl_threadNum = threadNum;
SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum );
uint32_t spinCount = 0;
uint32_t hintPipeToCheck_io = threadNum + 1; // does not need to be clamped.
while( pTS->m_bRunning )
{
if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) )
{
// no tasks, will spin then wait
++spinCount;
if( spinCount > SPIN_COUNT )
{
pTS->WaitForTasks( threadNum );
spinCount = 0;
}
else
{
uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER;
while( spinBackoffCount )
{
Pause();
--spinBackoffCount;
}
}
}
else
{
spinCount = 0;
}
}
AtomicAdd( &pTS->m_NumThreadsRunning, -1 );
SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum );
return 0;
}
void TaskScheduler::StartThreads()
{
if( m_bHaveThreads )
{
return;
}
m_bRunning = true;
SemaphoreCreate( m_NewTaskSemaphore );
// we create one less thread than m_NumThreads as the main thread counts as one
m_pThreadNumStore = new ThreadArgs[m_NumThreads];
m_pThreadIDs = new threadid_t[m_NumThreads];
m_pThreadNumStore[0].threadNum = 0;
m_pThreadNumStore[0].pTaskScheduler = this;
m_pThreadIDs[0] = 0;
m_NumThreadsWaiting = 0;
m_NumThreadsRunning = 1; // account for the main thread
for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
{
m_pThreadNumStore[thread].threadNum = thread;
m_pThreadNumStore[thread].pTaskScheduler = this;
ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadNumStore[thread] );
++m_NumThreadsRunning;
}
// ensure we have sufficient partitions to equally fill either all threads including main
// or just the threads we've launched; this is outside the first-init check as we want to be
// able to change it at runtime
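// e.g. with m_NumThreads == 4 this yields m_NumPartitions == 12 and
// m_NumInitialPartitions == 3, so initial partitions can be re-split for stealing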
if( 1 == m_NumThreads )
{
m_NumPartitions = 1;
m_NumInitialPartitions = 1;
}
else
{
m_NumPartitions = m_NumThreads * (m_NumThreads - 1);
m_NumInitialPartitions = m_NumThreads - 1;
if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS )
{
m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS;
}
}
m_bHaveThreads = true;
}
void TaskScheduler::StopThreads( bool bWait_ )
{
if( m_bHaveThreads )
{
// wait for the threads to quit before deleting data
m_bRunning = false;
while( bWait_ && m_NumThreadsRunning > 1 )
{
// keep firing event to ensure all threads pick up state of m_bRunning
SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning );
}
for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
{
ThreadTerminate( m_pThreadIDs[thread] );
}
m_NumThreads = 0;
delete[] m_pThreadNumStore;
delete[] m_pThreadIDs;
m_pThreadNumStore = 0;
m_pThreadIDs = 0;
SemaphoreClose( m_NewTaskSemaphore );
m_bHaveThreads = false;
m_NumThreadsWaiting = 0;
m_NumThreadsRunning = 0;
}
}
bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ )
{
// check for tasks
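// the owning thread reads from the front of its own pipe; stealing (below)
// takes from the back of another thread's pipe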
SubTaskSet subTask;
bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask );
uint32_t threadToCheck = hintPipeToCheck_io_;
uint32_t checkCount = 0;
while( !bHaveTask && checkCount < m_NumThreads )
{
threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads;
if( threadToCheck != threadNum )
{
bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask );
}
++checkCount;
}
if( bHaveTask )
{
// update hint, will preserve value unless actually got task from another thread.
hintPipeToCheck_io_ = threadToCheck;
uint32_t partitionSize = subTask.partition.end - subTask.partition.start;
if( subTask.pTask->m_RangeToRun < partitionSize )
{
SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun );
SplitAndAddTask( gtl_threadNum, subTask, subTask.pTask->m_RangeToRun, 0 );
taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum );
AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 );
}
else
{
// the task has already been divided up by AddTaskSetToPipe, so just run it
subTask.pTask->ExecuteRange( subTask.partition, threadNum );
AtomicAdd( &subTask.pTask->m_RunningCount, -1 );
}
}
return bHaveTask;
}
void TaskScheduler::WaitForTasks( uint32_t threadNum )
{
// We increment the number of threads waiting here in order
// to ensure that the check for tasks occurs after the increment
// to prevent a task being added after a check, then the thread waiting.
// This will occasionally result in threads being mistakenly awoken,
// but they will then go back to sleep.
AtomicAdd( &m_NumThreadsWaiting, 1 );
bool bHaveTasks = false;
for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
{
if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
{
bHaveTasks = true;
break;
}
}
if( !bHaveTasks )
{
SafeCallback( m_ProfilerCallbacks.waitStart, threadNum );
SemaphoreWait( m_NewTaskSemaphore );
SafeCallback( m_ProfilerCallbacks.waitStop, threadNum );
}
int32_t prev = AtomicAdd( &m_NumThreadsWaiting, -1 );
assert( prev != 0 );
}
void TaskScheduler::WakeThreads()
{
SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsWaiting );
}
void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
uint32_t rangeToSplit_, int32_t runningCountOffset_ )
{
int32_t numAdded = 0;
while( subTask_.partition.start != subTask_.partition.end )
{
SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ );
// add the partition to the pipe
++numAdded;
if( !m_pPipesPerThread[ gtl_threadNum ].WriterTryWriteFront( taskToAdd ) )
{
if( numAdded > 1 )
{
WakeThreads();
}
// alter range to run the appropriate fraction
if( taskToAdd.pTask->m_RangeToRun < rangeToSplit_ )
{
taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun;
subTask_.partition.start = taskToAdd.partition.end;
}
taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ );
--numAdded;
}
}
// increment running count by number added
AtomicAdd( &subTask_.pTask->m_RunningCount, numAdded + runningCountOffset_ );
WakeThreads();
}
void TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet )
{
// set running count to -1 to guarantee it won't be found complete until all subtasks added
pTaskSet->m_RunningCount = -1;
// divide task up and add to pipe
pTaskSet->m_RangeToRun = pTaskSet->m_SetSize / m_NumPartitions;
if( pTaskSet->m_RangeToRun < pTaskSet->m_MinRange ) { pTaskSet->m_RangeToRun = pTaskSet->m_MinRange; }
uint32_t rangeToSplit = pTaskSet->m_SetSize / m_NumInitialPartitions;
if( rangeToSplit < pTaskSet->m_MinRange ) { rangeToSplit = pTaskSet->m_MinRange; }
SubTaskSet subTask;
subTask.pTask = pTaskSet;
subTask.partition.start = 0;
subTask.partition.end = pTaskSet->m_SetSize;
SplitAndAddTask( gtl_threadNum, subTask, rangeToSplit, 1 );
}
void TaskScheduler::WaitforTaskSet( const ITaskSet* pTaskSet )
{
uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped.
if( pTaskSet )
{
while( pTaskSet->m_RunningCount )
{
TryRunTask( gtl_threadNum, hintPipeToCheck_io );
// should add a spin then wait for task completion event.
}
}
else
{
TryRunTask( gtl_threadNum, hintPipeToCheck_io );
}
}
void TaskScheduler::WaitforAll()
{
bool bHaveTasks = true;
uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped.
int32_t threadsRunning = m_NumThreadsRunning - 1;
while( bHaveTasks || m_NumThreadsWaiting < threadsRunning )
{
TryRunTask( gtl_threadNum, hintPipeToCheck_io );
bHaveTasks = false;
for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
{
if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
{
bHaveTasks = true;
break;
}
}
}
}
void TaskScheduler::WaitforAllAndShutdown()
{
WaitforAll();
StopThreads(true);
delete[] m_pPipesPerThread;
m_pPipesPerThread = 0;
}
uint32_t TaskScheduler::GetNumTaskThreads() const
{
return m_NumThreads;
}
TaskScheduler::TaskScheduler()
: m_pPipesPerThread(NULL)
, m_NumThreads(0)
, m_pThreadNumStore(NULL)
, m_pThreadIDs(NULL)
, m_bRunning(false)
, m_NumThreadsRunning(0)
, m_NumThreadsWaiting(0)
, m_NumPartitions(0)
, m_bHaveThreads(false)
{
memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks));
}
TaskScheduler::~TaskScheduler()
{
StopThreads( true ); // Stops threads, waiting for them.
delete[] m_pPipesPerThread;
m_pPipesPerThread = 0;
}
void TaskScheduler::Initialize( uint32_t numThreads_ )
{
assert( numThreads_ );
StopThreads( true ); // Stops threads, waiting for them.
delete[] m_pPipesPerThread;
m_NumThreads = numThreads_;
m_pPipesPerThread = new TaskPipe[ m_NumThreads ];
StartThreads();
}
void TaskScheduler::Initialize()
{
Initialize( GetNumHardwareThreads() );
}
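// Usage sketch (compiled out): subclass ITaskSet, override ExecuteRange, add the
// set to the pipe, and wait on it. ExampleTask and ExampleMain are hypothetical.
#if 0
struct ExampleTask : enki::ITaskSet
{
    ExampleTask( uint32_t size_ ) : ITaskSet( size_ ) {}
    virtual void ExecuteRange( enki::TaskSetPartition range, uint32_t threadnum )
    {
        for( uint32_t i = range.start; i < range.end; ++i ) { /* process item i */ }
        (void)threadnum;
    }
};
int ExampleMain()
{
    enki::TaskScheduler ts;
    ts.Initialize();               // creates GetNumHardwareThreads()-1 worker threads
    ExampleTask task( 1024 );      // set size: 1024 units of work
    ts.AddTaskSetToPipe( &task );
    ts.WaitforTaskSet( &task );    // run tasks on this thread until task completes
    ts.WaitforAllAndShutdown();
    return 0;
}
#endif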

View File

@@ -0,0 +1,177 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#include "Threads.h"
namespace enki
{
struct TaskSetPartition
{
uint32_t start;
uint32_t end;
};
class TaskScheduler;
class TaskPipe;
struct ThreadArgs;
struct SubTaskSet;
// Subclass ITaskSet to create tasks.
// TaskSets can be re-used, but check completion first with GetIsComplete().
class ITaskSet
{
public:
ITaskSet()
: m_SetSize(1)
, m_MinRange(1)
, m_RunningCount(0)
, m_RangeToRun(1)
{}
ITaskSet( uint32_t setSize_ )
: m_SetSize( setSize_ )
, m_MinRange(1)
, m_RunningCount(0)
, m_RangeToRun(1)
{}
ITaskSet( uint32_t setSize_, uint32_t minRange_ )
: m_SetSize( setSize_ )
, m_MinRange( minRange_ )
, m_RunningCount(0)
, m_RangeToRun(minRange_)
{}
// ExecuteRange should be overridden to process tasks. It will be called with a
// range_ where range.start >= 0, range.start < range.end, and range.end <= m_SetSize.
// The range values should be mapped so that linearly processing them in order is cache
// friendly, i.e. neighbouring values should be close together.
// threadnum should not be used for changing processing of data; its intended purpose
// is to allow per-thread data buckets for output.
virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum ) = 0;
// Size of set - usually the number of data items to be processed, see ExecuteRange. Defaults to 1
uint32_t m_SetSize;
// Minimum size of TaskSetPartition range when splitting a task set into partitions.
// This should be set to a value which results in computation effort of at least 10k
// clock cycles to minimize task scheduler overhead.
// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
// of m_MinRange.
// Also known as grain size in literature.
uint32_t m_MinRange;
bool GetIsComplete()
{
return 0 == m_RunningCount;
}
private:
friend class TaskScheduler;
volatile int32_t m_RunningCount;
uint32_t m_RangeToRun;
};
// TaskScheduler implements several callbacks intended for profilers
typedef void (*ProfilerCallbackFunc)( uint32_t threadnum_ );
struct ProfilerCallbacks
{
ProfilerCallbackFunc threadStart;
ProfilerCallbackFunc threadStop;
ProfilerCallbackFunc waitStart;
ProfilerCallbackFunc waitStop;
};
class TaskScheduler
{
public:
TaskScheduler();
~TaskScheduler();
// Call either Initialize() or Initialize( numThreads_ ) before adding tasks.
// Initialize() will create GetNumHardwareThreads()-1 threads, which is
// sufficient to fill the system when including the main thread.
// Initialize can be called multiple times - it will wait for completion
// before re-initializing.
void Initialize();
// Initialize( numThreads_ ) - numThreads_ (must be > 0)
// will create numThreads_-1 threads, as thread 0 is
// the thread on which the initialize was called.
void Initialize( uint32_t numThreads_ );
// Adds the TaskSet to the pipe; returns once all partitions have been added.
// If the pipe fills up, partitions of pTaskSet are run on the calling thread instead.
// Should only be called from the main thread, or from within a task.
void AddTaskSetToPipe( ITaskSet* pTaskSet );
// Runs the TaskSets in pipe until true == pTaskSet->GetIsComplete();
// Should only be called from the thread which created the task scheduler, or from within a task.
// If called with 0 it will try to run a single task, then return.
void WaitforTaskSet( const ITaskSet* pTaskSet );
// Waits for all task sets to complete - not guaranteed to work unless we know we
// are in a situation where tasks aren't being continuously added.
void WaitforAll();
// Waits for all task sets to complete and shuts down threads - not guaranteed to work unless we know we
// are in a situation where tasks aren't being continuously added.
void WaitforAllAndShutdown();
// Returns the number of threads created for running tasks + 1
// to account for the main thread.
uint32_t GetNumTaskThreads() const;
// Returns the ProfilerCallbacks structure so that it can be modified to
// set the callbacks.
ProfilerCallbacks* GetProfilerCallbacks();
private:
static THREADFUNC_DECL TaskingThreadFunction( void* pArgs );
void WaitForTasks( uint32_t threadNum );
bool TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ );
void StartThreads();
void StopThreads( bool bWait_ );
void SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
uint32_t rangeToSplit_, int32_t runningCountOffset_ );
void WakeThreads();
TaskPipe* m_pPipesPerThread;
uint32_t m_NumThreads;
ThreadArgs* m_pThreadNumStore;
threadid_t* m_pThreadIDs;
volatile bool m_bRunning;
volatile int32_t m_NumThreadsRunning;
volatile int32_t m_NumThreadsWaiting;
uint32_t m_NumPartitions;
uint32_t m_NumInitialPartitions;
semaphoreid_t m_NewTaskSemaphore;
bool m_bHaveThreads;
ProfilerCallbacks m_ProfilerCallbacks;
TaskScheduler( const TaskScheduler& nocopy );
TaskScheduler& operator=( const TaskScheduler& nocopy );
};
}
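// Illustrative sketch (compiled out): registering the profiler callbacks declared
// above. OnWaitStart/OnWaitStop are hypothetical functions matching the
// ProfilerCallbackFunc signature.
#if 0
#include <stdio.h>
static void OnWaitStart( uint32_t threadnum_ ) { printf( "thread %u waiting\n", threadnum_ ); }
static void OnWaitStop ( uint32_t threadnum_ ) { printf( "thread %u resumed\n", threadnum_ ); }
inline void ExampleRegisterCallbacks( enki::TaskScheduler& ts )
{
    enki::ProfilerCallbacks* pCallbacks = ts.GetProfilerCallbacks();
    pCallbacks->waitStart = OnWaitStart;  // invoked before a thread sleeps on the semaphore
    pCallbacks->waitStop  = OnWaitStop;   // invoked when the thread wakes again
}
#endif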

View File

@@ -0,0 +1,122 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#include "TaskScheduler_c.h"
#include "TaskScheduler.h"
#include <assert.h>
using namespace enki;
struct enkiTaskScheduler : TaskScheduler
{
};
struct enkiTaskSet : ITaskSet
{
enkiTaskSet( enkiTaskExecuteRange taskFun_ ) : taskFun(taskFun_), pArgs(NULL) {}
virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum )
{
taskFun( range.start, range.end, threadnum, pArgs );
}
enkiTaskExecuteRange taskFun;
void* pArgs;
};
enkiTaskScheduler* enkiNewTaskScheduler()
{
enkiTaskScheduler* pETS = new enkiTaskScheduler();
return pETS;
}
void enkiInitTaskScheduler( enkiTaskScheduler* pETS_ )
{
pETS_->Initialize();
}
void enkiInitTaskSchedulerNumThreads( enkiTaskScheduler* pETS_, uint32_t numThreads_ )
{
pETS_->Initialize( numThreads_ );
}
void enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ )
{
delete pETS_;
}
enkiTaskSet* enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_ )
{
return new enkiTaskSet( taskFunc_ );
}
void enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ )
{
delete pTaskSet_;
}
void enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_, void* pArgs_, uint32_t setSize_ )
{
assert( pTaskSet_ );
assert( pTaskSet_->taskFun );
pTaskSet_->m_SetSize = setSize_;
pTaskSet_->pArgs = pArgs_;
pETS_->AddTaskSetToPipe( pTaskSet_ );
}
void enkiAddTaskSetToPipeMinRange(enkiTaskScheduler * pETS_, enkiTaskSet * pTaskSet_, void * pArgs_, uint32_t setSize_, uint32_t minRange_)
{
assert( pTaskSet_ );
assert( pTaskSet_->taskFun );
pTaskSet_->m_SetSize = setSize_;
pTaskSet_->m_MinRange = minRange_;
pTaskSet_->pArgs = pArgs_;
pETS_->AddTaskSetToPipe( pTaskSet_ );
}
int enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
{
assert( pTaskSet_ );
return ( pTaskSet_->GetIsComplete() ) ? 1 : 0;
}
void enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
{
pETS_->WaitforTaskSet( pTaskSet_ );
}
void enkiWaitForAll( enkiTaskScheduler* pETS_ )
{
pETS_->WaitforAll();
}
uint32_t enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ )
{
return pETS_->GetNumTaskThreads();
}
enkiProfilerCallbacks* enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ )
{
assert( sizeof(enkiProfilerCallbacks) == sizeof(enki::ProfilerCallbacks) );
return (enkiProfilerCallbacks*)pETS_->GetProfilerCallbacks();
}

View File

@@ -0,0 +1,104 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
typedef struct enkiTaskScheduler enkiTaskScheduler;
typedef struct enkiTaskSet enkiTaskSet;
typedef void (* enkiTaskExecuteRange)( uint32_t start_, uint32_t end_, uint32_t threadnum_, void* pArgs_ );
// Create a new task scheduler
enkiTaskScheduler* enkiNewTaskScheduler();
// Initialize task scheduler - will create GetNumHardwareThreads()-1 threads, which is
// sufficient to fill the system when including the main thread.
// Initialize can be called multiple times - it will wait for completion
// before re-initializing.
void enkiInitTaskScheduler( enkiTaskScheduler* pETS_ );
// Initialize a task scheduler with numThreads_ (must be > 0)
// will create numThreads_-1 threads, as thread 0 is
// the thread on which the initialize was called.
void enkiInitTaskSchedulerNumThreads( enkiTaskScheduler* pETS_, uint32_t numThreads_ );
// Delete a task scheduler
void enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ );
// Create a task set.
enkiTaskSet* enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_ );
// Delete a task set.
void enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ );
// Schedule the task
void enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
void* pArgs_, uint32_t setSize_ );
// Schedule the task with a minimum range.
// This should be set to a value which results in computation effort of at least 10k
// clock cycles to minimize task scheduler overhead.
// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
// of m_MinRange.
// Also known as grain size in literature.
void enkiAddTaskSetToPipeMinRange( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
void* pArgs_, uint32_t setSize_, uint32_t minRange_ );
// Check if TaskSet is complete. Doesn't wait. Returns 1 if complete, 0 if not.
int enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
// Wait for a given task.
// Should only be called from the thread which created the task scheduler, or from within a task.
// If called with 0 it will try to run a single task, then return.
void enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
// Waits for all task sets to complete - not guaranteed to work unless we know we
// are in a situation where tasks aren't being continuously added.
void enkiWaitForAll( enkiTaskScheduler* pETS_ );
// Get the number of threads used for running tasks, including the main thread.
uint32_t enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ );
// TaskScheduler implements several callbacks intended for profilers
typedef void (*enkiProfilerCallbackFunc)( uint32_t threadnum_ );
struct enkiProfilerCallbacks
{
enkiProfilerCallbackFunc threadStart;
enkiProfilerCallbackFunc threadStop;
enkiProfilerCallbackFunc waitStart;
enkiProfilerCallbackFunc waitStop;
};
// Get the callback structure so it can be set
struct enkiProfilerCallbacks* enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ );
#ifdef __cplusplus
}
#endif
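// Usage sketch (compiled out; MyRangeFunc is a hypothetical example function):
// create a scheduler, run a task set over 1024 items, then clean up.
#if 0
static void MyRangeFunc( uint32_t start_, uint32_t end_, uint32_t threadnum_, void* pArgs_ )
{
    uint32_t i;
    (void)threadnum_; (void)pArgs_;
    for( i = start_; i < end_; ++i ) { /* process item i */ }
}
int main( void )
{
    enkiTaskScheduler* pETS = enkiNewTaskScheduler();
    enkiTaskSet* pTask;
    enkiInitTaskScheduler( pETS );
    pTask = enkiCreateTaskSet( pETS, MyRangeFunc );
    enkiAddTaskSetToPipe( pETS, pTask, 0, 1024 ); /* pArgs_ == 0, setSize_ == 1024 */
    enkiWaitForTaskSet( pETS, pTask );
    enkiDeleteTaskSet( pTask );
    enkiDeleteTaskScheduler( pETS );
    return 0;
}
#endif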

View File

@@ -0,0 +1,210 @@
// Copyright (c) 2013 Doug Binks
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#pragma once
#include <stdint.h>
#include <assert.h>
#ifdef _WIN32
#include "Atomics.h"
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#define THREADFUNC_DECL DWORD WINAPI
#define THREAD_LOCAL __declspec( thread )
namespace enki
{
typedef HANDLE threadid_t;
// declare the thread start function as:
// THREADFUNC_DECL MyThreadStart( void* pArg );
inline bool ThreadCreate( threadid_t* returnid, DWORD ( WINAPI *StartFunc) (void* ), void* pArg )
{
// posix equiv pthread_create
DWORD threadid;
*returnid = CreateThread( 0, 0, StartFunc, pArg, 0, &threadid );
return *returnid != NULL;
}
inline bool ThreadTerminate( threadid_t threadid )
{
// posix equiv pthread_cancel; note this only closes the handle, it does not stop the thread
return CloseHandle( threadid ) != 0; // CloseHandle returns non-zero on success
}
inline uint32_t GetNumHardwareThreads()
{
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
return sysInfo.dwNumberOfProcessors;
}
}
#else // posix
#include <pthread.h>
#include <unistd.h>
#define THREADFUNC_DECL void*
#define THREAD_LOCAL __thread
namespace enki
{
typedef pthread_t threadid_t;
// declare the thread start function as:
// THREADFUNC_DECL MyThreadStart( void* pArg );
inline bool ThreadCreate( threadid_t* returnid, void* ( *StartFunc) (void* ), void* pArg )
{
// posix equiv pthread_create
int32_t retval = pthread_create( returnid, NULL, StartFunc, pArg );
return retval == 0;
}
inline bool ThreadTerminate( threadid_t threadid )
{
// posix equiv pthread_cancel
return pthread_cancel( threadid ) == 0;
}
inline uint32_t GetNumHardwareThreads()
{
return (uint32_t)sysconf( _SC_NPROCESSORS_ONLN );
}
}
#endif // posix
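// Illustrative sketch (compiled out): launching a thread with the wrappers above.
// MyThreadStart is a hypothetical example function; returning 0 is valid for
// both the Windows (DWORD) and posix (void*) declarations.
#if 0
THREADFUNC_DECL MyThreadStart( void* pArg )
{
    (void)pArg;  // per-thread work goes here
    return 0;
}
inline void ExampleThreadLaunch()
{
    enki::threadid_t id;
    if( enki::ThreadCreate( &id, MyThreadStart, 0 ) )
    {
        enki::ThreadTerminate( id );  // on Windows this only closes the handle
    }
}
#endif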
// Semaphore implementation
#ifdef _WIN32
namespace enki
{
struct semaphoreid_t
{
HANDLE sem;
};
inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
{
semaphoreid.sem = CreateSemaphore(NULL, 0, MAXLONG, NULL );
}
inline void SemaphoreClose( semaphoreid_t& semaphoreid )
{
CloseHandle( semaphoreid.sem );
}
inline void SemaphoreWait( semaphoreid_t& semaphoreid )
{
DWORD retval = WaitForSingleObject( semaphoreid.sem, INFINITE );
assert( retval != WAIT_FAILED );
}
inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
{
if( countWaiting )
{
ReleaseSemaphore( semaphoreid.sem, countWaiting, NULL );
}
}
}
#elif defined(__MACH__)
// OS X does not support unnamed POSIX semaphores (sem_init fails), so use Mach semaphores
// see https://developer.apple.com/library/content/documentation/Darwin/Conceptual/KernelProgramming/synchronization/synchronization.html
#include <mach/mach.h>
namespace enki
{
struct semaphoreid_t
{
semaphore_t sem;
};
inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
{
semaphore_create( mach_task_self(), &semaphoreid.sem, SYNC_POLICY_FIFO, 0 );
}
inline void SemaphoreClose( semaphoreid_t& semaphoreid )
{
semaphore_destroy( mach_task_self(), semaphoreid.sem );
}
inline void SemaphoreWait( semaphoreid_t& semaphoreid )
{
semaphore_wait( semaphoreid.sem );
}
inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
{
while( countWaiting-- > 0 )
{
semaphore_signal( semaphoreid.sem );
}
}
}
#else // POSIX
#include <semaphore.h>
namespace enki
{
struct semaphoreid_t
{
sem_t sem;
};
inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
{
int err = sem_init( &semaphoreid.sem, 0, 0 );
assert( err == 0 );
}
inline void SemaphoreClose( semaphoreid_t& semaphoreid )
{
sem_destroy( &semaphoreid.sem );
}
inline void SemaphoreWait( semaphoreid_t& semaphoreid )
{
int err = sem_wait( &semaphoreid.sem );
assert( err == 0 );
}
inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
{
while( countWaiting-- > 0 )
{
sem_post( &semaphoreid.sem );
}
}
}
#endif
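// Illustrative sketch (compiled out): the semaphore lifecycle as the scheduler
// uses it - waiters block in SemaphoreWait until a signaller passes the number
// of waiting threads to SemaphoreSignal. ExampleSemaphoreUse is hypothetical.
#if 0
inline void ExampleSemaphoreUse()
{
    enki::semaphoreid_t sem;
    enki::SemaphoreCreate( sem );
    enki::SemaphoreSignal( sem, 1 );  // countWaiting == 1: release one waiter
    enki::SemaphoreWait( sem );       // returns immediately, count is already 1
    enki::SemaphoreClose( sem );
}
#endif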